From 0495fc54359b7fc38fc4f950d791022efd6a91a5 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 1 Dec 2025 09:13:30 +0000
Subject: [PATCH 001/753] Bump ubuntu in
 /tensorflow/tools/tf_sig_build_dockerfiles

Bumps ubuntu from `0950623` to `104ae83`.

---
updated-dependencies:
- dependency-name: ubuntu
  dependency-version: '22.04'
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile
index 8d35977d14a987..b9d06f956f6d2a 100644
--- a/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile
+++ b/tensorflow/tools/tf_sig_build_dockerfiles/Dockerfile
@@ -1,5 +1,5 @@
 ################################################################################
-FROM ubuntu:22.04@sha256:09506232a8004baa32c47d68f1e5c307d648fdd59f5e7eaa42aaf87914100db3 as builder
+FROM ubuntu:22.04@sha256:104ae83764a5119017b8e8d6218fa0832b09df65aae7d5a6de29a85d813da2fb as builder
 ################################################################################
 
 # Install devtoolset build dependencies

From 800ff3bf850590f12a3940b33a1a196bb98b5d6f Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 1 Dec 2025 09:57:00 +0000
Subject: [PATCH 002/753] Bump the github-actions group with 5 updates

Bumps the github-actions group with 5 updates:

| Package | From | To |
| --- | --- | --- |
| [actions/checkout](https://github.com/actions/checkout) | `5.0.0` | `6.0.0` |
| [google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml](https://github.com/google/osv-scanner-action) | `2.2.4` | `2.3.0` |
| [actions/setup-python](https://github.com/actions/setup-python) | `6.0.0` | `6.1.0` |
| [peter-evans/create-pull-request](https://github.com/peter-evans/create-pull-request) | `7.0.8` | `7.0.9` |
| [github/codeql-action](https://github.com/github/codeql-action) | `4.31.2` | `4.31.6` |


Updates `actions/checkout` from 5.0.0 to 6.0.0
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/08c6903cd8c0fde910a37f88322edcfb5dd907a8...1af3b93b6815bc44a9784bd300feb67ff0d1eeb3)

Updates `google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml` from 2.2.4 to 2.3.0
- [Release notes](https://github.com/google/osv-scanner-action/releases)
- [Commits](https://github.com/google/osv-scanner-action/compare/v2.2.4...v2.3.0)

Updates `actions/setup-python` from 6.0.0 to 6.1.0
- [Release notes](https://github.com/actions/setup-python/releases)
- [Commits](https://github.com/actions/setup-python/compare/e797f83bcb11b83ae66e0230d6156d7c80228e7c...83679a892e2d95755f2dac6acb0bfd1e9ac5d548)

Updates `peter-evans/create-pull-request` from 7.0.8 to 7.0.9
- [Release notes](https://github.com/peter-evans/create-pull-request/releases)
- [Commits](https://github.com/peter-evans/create-pull-request/compare/271a8d0340265f705b14b6d32b9829c1cb33d45e...84ae59a2cdc2258d6fa0732dd66352dddae2a412)

Updates `github/codeql-action` from 4.31.2 to 4.31.6
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/0499de31b99561a6d14a36a5f662c2a54f91beee...fe4161a26a8629af62121b670040955b330f9af2)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 6.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
  dependency-group: github-actions
- dependency-name: google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml
  dependency-version: 2.3.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: github-actions
- dependency-name: actions/setup-python
  dependency-version: 6.1.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
  dependency-group: github-actions
- dependency-name: peter-evans/create-pull-request
  dependency-version: 7.0.9
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: github-actions
- dependency-name: github/codeql-action
  dependency-version: 4.31.6
  dependency-type: direct:production
  update-type: version-update:semver-patch
  dependency-group: github-actions
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .github/workflows/arm-cd.yml                    | 4 ++--
 .github/workflows/arm-ci-extended-cpp.yml       | 4 ++--
 .github/workflows/arm-ci-extended.yml           | 4 ++--
 .github/workflows/arm-ci.yml                    | 2 +-
 .github/workflows/cffconvert.yml                | 2 +-
 .github/workflows/issue-on-pr-rollback.yml      | 2 +-
 .github/workflows/osv-scanner-scheduled.yml     | 2 +-
 .github/workflows/pylint-presubmit.yml          | 4 ++--
 .github/workflows/release-branch-cherrypick.yml | 4 ++--
 .github/workflows/scorecards-analysis.yml       | 4 ++--
 .github/workflows/update-rbe.yml                | 4 ++--
 11 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/arm-cd.yml b/.github/workflows/arm-cd.yml
index 2e3912041d9cf2..5430fc1c8151e8 100644
--- a/.github/workflows/arm-cd.yml
+++ b/.github/workflows/arm-cd.yml
@@ -52,12 +52,12 @@ jobs:
         run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
       - name: Checkout repository for nightly (skipped for releases)
         if: ${{ github.event_name == 'schedule' }}
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: 'nightly'
       - name: Checkout repository for releases (skipped for nightly)
         if: ${{ github.event_name == 'push' }}
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Build and test pip wheel
         shell: bash
         run: |
diff --git a/.github/workflows/arm-ci-extended-cpp.yml b/.github/workflows/arm-ci-extended-cpp.yml
index 54903a6998b090..09085e814daba1 100644
--- a/.github/workflows/arm-ci-extended-cpp.yml
+++ b/.github/workflows/arm-ci-extended-cpp.yml
@@ -50,12 +50,12 @@ jobs:
         run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
       - name: Checkout repository for nightly (skipped for releases)
         if: ${{ github.event_name == 'schedule' }}
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: 'nightly'
       - name: Checkout repository
         if: ${{ github.event_name == 'push' }}
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Build binary and run C++ tests
         shell: bash
         run: |
diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml
index 2235cfc2d986da..94237fcaa6cca5 100644
--- a/.github/workflows/arm-ci-extended.yml
+++ b/.github/workflows/arm-ci-extended.yml
@@ -51,12 +51,12 @@ jobs:
         run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
       - name: Checkout repository for nightly (skipped for releases)
         if: ${{ github.event_name == 'schedule' }}
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: 'nightly'
       - name: Checkout repository
         if: ${{ github.event_name == 'push' }}
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Build binary and run python tests on nightly for all python versions
         shell: bash
         run: |
diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml
index a141bdd4676852..12d8ab4a2cf719 100644
--- a/.github/workflows/arm-ci.yml
+++ b/.github/workflows/arm-ci.yml
@@ -47,7 +47,7 @@ jobs:
         shell: bash
         run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true
       - name: Checkout repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Build binary and run python tests
         shell: bash
         run: |
diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml
index 6421e08ccf0839..de578ffec96327 100644
--- a/.github/workflows/cffconvert.yml
+++ b/.github/workflows/cffconvert.yml
@@ -30,7 +30,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Check out a copy of the repository
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Check whether the citation metadata from CITATION.cff is valid
         uses: citation-file-format/cffconvert-github-action@4cf11baa70a673bfdf9dad0acc7ee33b3f4b6084 # v2.0.0
diff --git a/.github/workflows/issue-on-pr-rollback.yml b/.github/workflows/issue-on-pr-rollback.yml
index d5e0661a5f356b..1d548e9204e563 100644
--- a/.github/workflows/issue-on-pr-rollback.yml
+++ b/.github/workflows/issue-on-pr-rollback.yml
@@ -33,7 +33,7 @@ jobs:
       startsWith(github.event.head_commit.message, 'Rollback of PR #')
     steps:
       - name: Checkout repo
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Create a new Github Issue
         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
         with:
diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml
index 07896a48470753..984dead9db7388 100644
--- a/.github/workflows/osv-scanner-scheduled.yml
+++ b/.github/workflows/osv-scanner-scheduled.yml
@@ -28,7 +28,7 @@ permissions:
 jobs:
   scan-scheduled:
     if: github.repository == 'tensorflow/tensorflow'
-    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.2.4"
+    uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v2.3.0"
     with:
       scan-args: |-
         --lockfile=requirements.txt:./requirements_lock_3_9.txt
diff --git a/.github/workflows/pylint-presubmit.yml b/.github/workflows/pylint-presubmit.yml
index 59068d9d86f45d..483cf5bfc0addf 100644
--- a/.github/workflows/pylint-presubmit.yml
+++ b/.github/workflows/pylint-presubmit.yml
@@ -28,7 +28,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Checkout code
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
     - name: Get file changes
       id: get_file_changes
       uses: trilom/file-changes-action@a6ca26c14274c33b15e6499323aac178af06ad4b # v1.2.4
@@ -38,7 +38,7 @@ jobs:
       run: |
         echo Changed files: ${{ steps.get_file_changes.outputs.files }}
     - name: Set up Python 3.9
-      uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
+      uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0
       with:
         python-version: "3.9"
     - name: Install Python dependencies
diff --git a/.github/workflows/release-branch-cherrypick.yml b/.github/workflows/release-branch-cherrypick.yml
index 69e03a040ae1a2..fc643c92d304d1 100644
--- a/.github/workflows/release-branch-cherrypick.yml
+++ b/.github/workflows/release-branch-cherrypick.yml
@@ -45,7 +45,7 @@ jobs:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
     steps:
     - name: Checkout code
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       with:
         ref: ${{ github.event.inputs.release_branch }}
     - name: Get some helpful info for formatting
@@ -58,7 +58,7 @@ jobs:
           echo "SHORTSHA=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%h")" >> "$GITHUB_OUTPUT"
           echo "TITLE=$(git log -1 ${{ github.event.inputs.git_commit }} --format="%s")" >> "$GITHUB_OUTPUT"
     - name: Create Pull Request with changes
-      uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+      uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
       with:
         title: '${{ github.event.inputs.release_branch }} cherry-pick: ${{ steps.cherrypick.outputs.SHORTSHA }} "${{ steps.cherrypick.outputs.TITLE }}"'
         committer: TensorFlow Release Automation <jenkins@tensorflow.org>
diff --git a/.github/workflows/scorecards-analysis.yml b/.github/workflows/scorecards-analysis.yml
index e635c4cd8ccc88..ce2d7075019b5d 100644
--- a/.github/workflows/scorecards-analysis.yml
+++ b/.github/workflows/scorecards-analysis.yml
@@ -41,7 +41,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           persist-credentials: false
 
@@ -64,6 +64,6 @@ jobs:
       # Upload the results to GitHub's code scanning dashboard (optional).
       # Commenting out will disable upload of results to your repo's Code Scanning dashboard
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@0499de31b99561a6d14a36a5f662c2a54f91beee # v3.29.5
+        uses: github/codeql-action/upload-sarif@fe4161a26a8629af62121b670040955b330f9af2 # v3.29.5
         with:
           sarif_file: results.sarif
diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml
index a8dba883f5ff14..d2cc83b7f5c2c2 100644
--- a/.github/workflows/update-rbe.yml
+++ b/.github/workflows/update-rbe.yml
@@ -30,7 +30,7 @@ jobs:
     if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks
     steps:
     - name: Checkout code
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
     - name: Update the RBE Configs
       run: |
         function map() {
@@ -130,7 +130,7 @@ jobs:
         map sigbuild-r2.17-clang-python3.11 2.17-python3.11
         map sigbuild-r2.17-clang-python3.12 2.17-python3.12
     - name: Create Pull Request with changes
-      uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # v7.0.8
+      uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
       with:
         title: Update the RBE images to the latest container versions
         committer: TensorFlow Release Automation <jenkins@tensorflow.org>

From 37562c3d83b7366276341bfdbd83b8a7ed5d97ce Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 1 Dec 2025 10:12:39 +0000
Subject: [PATCH 003/753] Bump ubuntu from `66460d5` to `c35e29c` in
 /tensorflow/tools/gcs_test

Bumps ubuntu from `66460d5` to `c35e29c`.

---
updated-dependencies:
- dependency-name: ubuntu
  dependency-version: '24.04'
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 tensorflow/tools/gcs_test/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/gcs_test/Dockerfile b/tensorflow/tools/gcs_test/Dockerfile
index b5fbef19051f8a..19958cb6478765 100644
--- a/tensorflow/tools/gcs_test/Dockerfile
+++ b/tensorflow/tools/gcs_test/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:24.04@sha256:66460d557b25769b102175144d538d88219c077c678a49af4afca6fbfc1b5252
+FROM ubuntu:24.04@sha256:c35e29c9450151419d9448b0fd75374fec4fff364a27f176fb458d472dfc9e54
 
 LABEL maintainer="Shanqing Cai <cais@google.com>"
 

From 515af6fc2e13085c5d71dd75426f881ae7418a20 Mon Sep 17 00:00:00 2001
From: 1ndig0 <1090891928@qq.com>
Date: Tue, 2 Dec 2025 15:52:07 +0800
Subject: [PATCH 004/753] Change begin and size types to include int16

Updated the type annotations for begin and size parameters to include int16.
---
 tensorflow/python/ops/array_ops.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 9f6644b4342ada..94dadf91a0e18d 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -982,8 +982,8 @@ def slice(input_, begin, size, name=None):
 
   Args:
     input_: A `Tensor`.
-    begin: An `int32` or `int64` `Tensor`.
-    size: An `int32` or `int64` `Tensor`.
+    begin: An `int16`, `int32` or `int64` `Tensor`.
+    size: An `int16`, `int32` or `int64` `Tensor`.
     name: A name for the operation (optional).
 
   Returns:

From 2fb9073f2205e48bf54263e60d846a3e4ab8d39d Mon Sep 17 00:00:00 2001
From: "guozhong.zhuang" <guozhong.zhuang@intel.com>
Date: Tue, 2 Dec 2025 13:04:41 -0800
Subject: [PATCH 005/753] [oneDNN] Improve oneDNN primitive caching performance

---
 tensorflow/core/util/BUILD      | 1 +
 tensorflow/core/util/mkl_util.h | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD
index 3acd07c02fadf8..72cd0b7751e2cc 100644
--- a/tensorflow/core/util/BUILD
+++ b/tensorflow/core/util/BUILD
@@ -298,6 +298,7 @@ filegroup(
         "mkl_heuristics.h",
         "mkl_util.h",
         "onednn_env_vars.h",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@local_xla//xla/tsl/util:onednn_util_hdrs",
     ],
     visibility = ["//tensorflow/core:__pkg__"],
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index d15ec3034a93c9..a3a5381583a196 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "oneapi/dnnl/dnnl.hpp"
 #include "oneapi/dnnl/dnnl_threadpool.hpp"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -1963,7 +1964,7 @@ class LRUCache {
   size_t capacity_;
 
   // The cache, a map from string key to a LRU entry.
-  std::unordered_map<string, Entry> cache_;
+  absl::flat_hash_map<string, Entry> cache_;
 
   // The LRU list of entries.
   // The front of the list contains the key of the most recently accessed

From ce6f59526c772e345faf09d3d2f60a3078e2e331 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Sat, 6 Dec 2025 04:52:02 +0000
Subject: [PATCH 006/753] Bump urllib3 in
 /ci/official/requirements_updater/numpy1_requirements

Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.5.0 to 2.6.0.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.5.0...2.6.0)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-version: 2.6.0
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .../numpy1_requirements/requirements_lock_3_10.txt          | 6 +++---
 .../numpy1_requirements/requirements_lock_3_11.txt          | 6 +++---
 .../numpy1_requirements/requirements_lock_3_12.txt          | 6 +++---
 .../numpy1_requirements/requirements_lock_3_9.txt           | 6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt
index 1bef2b2f7903df..898ea6c0418532 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt
@@ -729,9 +729,9 @@ typing-extensions==4.14.1 \
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
     #   rich
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt
index 7bc734c2624710..eae965757fad3b 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt
@@ -728,9 +728,9 @@ typing-extensions==4.14.1 \
     # via
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt
index 8d9d9dc47fc5d7..ca6904da19ebbe 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt
@@ -728,9 +728,9 @@ typing-extensions==4.14.1 \
     # via
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt
index 41eb61f5557d7f..e34567660cc5f7 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt
@@ -725,9 +725,9 @@ typing-extensions==4.14.1 \
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
     #   rich
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \

From a027f53ab0dfa6e0e3ee86aa71165e809c990ff3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 00:26:49 -0800
Subject: [PATCH 007/753] Automated Code Change

PiperOrigin-RevId: 841630902
---
 third_party/xla/xla/tools/ptx_opt/BUILD      | 1 +
 third_party/xla/xla/tools/ptx_opt/ptx_opt.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/tools/ptx_opt/BUILD b/third_party/xla/xla/tools/ptx_opt/BUILD
index 262a448e9e2f9e..dcdbb9cc1162b4 100644
--- a/third_party/xla/xla/tools/ptx_opt/BUILD
+++ b/third_party/xla/xla/tools/ptx_opt/BUILD
@@ -22,6 +22,7 @@ xla_cc_binary(
     ],
     deps = [
         "//xla:debug_options_flags",
+        "//xla:xla_proto_cc",
         "//xla/service/gpu/llvm_gpu_backend:load_ir_module",
         "//xla/service/gpu/llvm_gpu_backend:nvptx_backend",
         "//xla/stream_executor:device_description",
diff --git a/third_party/xla/xla/tools/ptx_opt/ptx_opt.cc b/third_party/xla/xla/tools/ptx_opt/ptx_opt.cc
index df00cb8039c253..64114733254f84 100644
--- a/third_party/xla/xla/tools/ptx_opt/ptx_opt.cc
+++ b/third_party/xla/xla/tools/ptx_opt/ptx_opt.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/tsl/util/command_line_flags.h"
+#include "xla/xla.pb.h"
 #include "tsl/platform/init_main.h"
 
 namespace xla::gpu::nvptx {

From bfa9f8bf5e3557d5544b606f9d730162db448b8c Mon Sep 17 00:00:00 2001
From: Shaogang Wang <shawnw@nvidia.com>
Date: Mon, 8 Dec 2025 00:42:02 -0800
Subject: [PATCH 008/753] PR #34802: [XLA:GPU] Add buffer type information for
 GpuExecutable memory allocation profile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34802

📝 Summary of Changes
Add buffer type information for GpuExecutable memory allocation profile

🎯 Justification
Give us some insights on which allocation is frequently changed, and useful for command buffer optimization.

🚀 Kind of Contribution
 📚 Documentation

Copybara import of the project:

--
8930cae81077e508dc69c3bf367f03d0439c6205 by Shawn Wang <shawnw@nvidia.com>:

Update stable address profile to include allocation type

Merging this change closes #34802

PiperOrigin-RevId: 841634857
---
 third_party/xla/xla/service/gpu/gpu_executable.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index 96d1b4ca13c2d9..d72fdeddab4fe3 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -1062,7 +1062,14 @@ absl::Status GpuExecutable::ExecuteThunks(
         }
         module_allocations_[executor][i] =
             buffer_allocations.GetDeviceAddress(i);
-        VLOG(5) << "Gpu address changed for module " << module_name_;
+        const BufferAllocation& allocation =
+            buffer_assignment_->GetAllocation(i);
+        const char* allocation_type =
+            allocation.is_entry_computation_parameter() ? "parameter"
+            : allocation.maybe_live_out()               ? "live-out"
+                                                        : "temp";
+        VLOG(5) << "Gpu address changed for module " << module_name_
+                << ", allocation " << i << " (" << allocation_type << ")";
       }
     }
   }

From ea054af52030b0d5af9e5b5dfd66c5d853affa7f Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Mon, 8 Dec 2025 00:53:54 -0800
Subject: [PATCH 009/753] Remove HloProfilePrinterData and HloProfileIndexMap
 from Executable.

This change removes the HloProfilePrinterData and HloProfileIndexMap members from the Executable base class and updates all derived classes and call sites to reflect this change. The profiling data is no longer stored within the Executable.

PiperOrigin-RevId: 841638347
---
 .../backends/interpreter/executable_base.cc   |  3 +-
 third_party/xla/xla/service/BUILD             |  1 -
 .../service/cpu/cpu_aot_compilation_result.cc |  4 +--
 .../service/cpu/cpu_aot_compilation_result.h  | 18 ----------
 .../xla/xla/service/cpu/cpu_compiler.cc       | 23 +++---------
 .../xla/xla/service/cpu/cpu_executable.cc     | 21 ++++-------
 .../xla/xla/service/cpu/cpu_executable.h      |  4 ---
 third_party/xla/xla/service/executable.h      | 35 -------------------
 8 files changed, 14 insertions(+), 95 deletions(-)

diff --git a/third_party/xla/xla/backends/interpreter/executable_base.cc b/third_party/xla/xla/backends/interpreter/executable_base.cc
index d8a9ac91c7d39f..7ba92f41d87701 100644
--- a/third_party/xla/xla/backends/interpreter/executable_base.cc
+++ b/third_party/xla/xla/backends/interpreter/executable_base.cc
@@ -55,8 +55,7 @@ namespace interpreter {
 
 InterpreterExecutableBase::InterpreterExecutableBase(
     std::unique_ptr<HloModule> hlo_module)
-    : Executable(std::move(hlo_module), /*hlo_profile_printer_data=*/nullptr,
-                 /*hlo_profile_index_map=*/nullptr) {}
+    : Executable(std::move(hlo_module)) {}
 
 absl::StatusOr<ExecutionOutput> InterpreterExecutableBase::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 252abf3c4d8f98..fb4b60aedea3f1 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1526,7 +1526,6 @@ cc_library(
     deps = [
         ":buffer_assignment",
         ":computation_layout",
-        ":hlo_execution_profile",
         ":hlo_module_config",
         ":hlo_proto_cc",
         ":maybe_owning_device_memory",
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc
index e422891c24ec34..31ca1d590cd292 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc
@@ -196,8 +196,8 @@ CpuAotCompilationResult::LoadExecutable(
       cpu_executable,
       CpuExecutable::Create(std::move(function_library_),
                             std::move(buffer_assignment), std::move(module),
-                            std::move(*thunks), std::move(constants), nullptr,
-                            nullptr, target_machine_options));
+                            std::move(*thunks), std::move(constants),
+                            target_machine_options));
 
   // Dump computation proto state and buffer assignment for
   // GetCompiledMemoryStats results.
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index e6589fb1787da5..4817200999814f 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -110,24 +110,6 @@ class CpuAotCompilationResult : public AotCompilationResult {
       TargetMachineOptionsProto target_machine_options =
           TargetMachineOptionsProto());
 
-  [[deprecated(
-      "HloProfilePrinterData is not used anymore. Use the other Create "
-      "method instead.")]] static absl::
-      StatusOr<std::unique_ptr<CpuAotCompilationResult>>
-      Create(const HloModule* hlo_module,
-             const BufferAssignment* buffer_assignment,
-             absl::string_view function_name,
-             std::vector<ObjFileProto> obj_files,
-             std::vector<SymbolProto> symbols, const ThunkSequence& thunks,
-             std::unique_ptr<FunctionLibrary> function_library,
-             std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-             TargetMachineOptionsProto target_machine_options =
-                 TargetMachineOptionsProto()) {
-    return Create(hlo_module, buffer_assignment, function_name,
-                  std::move(obj_files), std::move(symbols), thunks,
-                  std::move(function_library), target_machine_options);
-  }
-
   ~CpuAotCompilationResult() override = default;
 
   absl::StatusOr<std::string> SerializeAsString() const override {
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 895657789c891f..a6117a8169ddc3 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -2019,11 +2019,10 @@ CpuCompiler::CompileCpuExecutable(
 
   TF_ASSIGN_OR_RETURN(
       auto cpu_executable,
-      CpuExecutable::Create(
-          std::move(function_library), std::move(assignment), std::move(module),
-          std::move(thunks), std::move(constants),
-          std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map),
-          std::move(target_machine_options)));
+      CpuExecutable::Create(std::move(function_library), std::move(assignment),
+                            std::move(module), std::move(thunks),
+                            std::move(constants),
+                            std::move(target_machine_options)));
 
   // Save object files to be able to export them to AOT compilation
   // result.
@@ -2243,12 +2242,6 @@ CpuCompiler::CompileAheadOfTimeThunks(
   const ThunkSequence& thunk_sequence =
       cpu_executable->thunks().thunk_sequence();
 
-  std::unique_ptr<HloProfilePrinterData> executable_hlo_profile_printer_data =
-      cpu_executable->module().config().hlo_profiling_enabled()
-          ? std::make_unique<HloProfilePrinterData>(
-                cpu_executable->hlo_profile_printer_data())
-          : nullptr;
-
   if (cpu_executable->obj_files().size() > 1) {
     return Internal(
         "Expected at most one object file for AOT compilation, but got %d",
@@ -2266,7 +2259,6 @@ CpuCompiler::CompileAheadOfTimeThunks(
       cpu_executable->module_name(), std::move(obj_files),
       cpu_executable->get_compiled_symbols_proto(), thunk_sequence,
       std::move(*cpu_executable).consume_function_library(),
-      std::move(executable_hlo_profile_printer_data),
       cpu_executable->target_machine_options().ToProto());
 }
 
@@ -2299,12 +2291,6 @@ absl::StatusOr<std::unique_ptr<AotCompilationResult>> CpuCompiler::Export(
   std::vector<SymbolProto> compiled_symbols_proto =
       cpu_executable->get_compiled_symbols_proto();
 
-  std::unique_ptr<HloProfilePrinterData> executable_hlo_profile_printer_data =
-      cpu_executable->module().config().hlo_profiling_enabled()
-          ? std::make_unique<HloProfilePrinterData>(
-                cpu_executable->hlo_profile_printer_data())
-          : nullptr;
-
   TF_ASSIGN_OR_RETURN(auto compiled_symbols,
                       GetCompiledSymbolsFromProto(compiled_symbols_proto));
 
@@ -2319,7 +2305,6 @@ absl::StatusOr<std::unique_ptr<AotCompilationResult>> CpuCompiler::Export(
       cpu_executable->module_name(), std::move(obj_files),
       std::move(compiled_symbols_proto), *thunk_sequence,
       std::move(function_library),
-      std::move(executable_hlo_profile_printer_data),
       cpu_executable->target_machine_options().ToProto());
 }
 
diff --git a/third_party/xla/xla/service/cpu/cpu_executable.cc b/third_party/xla/xla/service/cpu/cpu_executable.cc
index 6e0cf855e34f97..6bb3a695e9523e 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.cc
+++ b/third_party/xla/xla/service/cpu/cpu_executable.cc
@@ -88,16 +88,13 @@ absl::StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
     std::unique_ptr<BufferAssignment> assignment,
     std::unique_ptr<HloModule> hlo_module, ThunkSequence thunks,
     std::vector<ConstantAllocation> constants,
-    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
     TargetMachineOptions target_machine_options) {
   VLOG(2) << "Create CpuExecutable from a thunk sequence; module="
           << hlo_module->name() << ", constants=" << constants.size();
 
-  std::unique_ptr<CpuExecutable> executable(new CpuExecutable(
-      std::move(hlo_module), std::move(hlo_profile_printer_data),
-      std::move(hlo_profile_index_map), std::move(assignment),
-      std::move(target_machine_options)));
+  std::unique_ptr<CpuExecutable> executable(
+      new CpuExecutable(std::move(hlo_module), std::move(assignment),
+                        std::move(target_machine_options)));
   executable->function_library_ = std::move(function_library);
 
   ThunkExecutor::Options thunk_executor_options;
@@ -129,14 +126,10 @@ absl::StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
   return executable;
 }
 
-CpuExecutable::CpuExecutable(
-    std::unique_ptr<HloModule> hlo_module,
-    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
-    std::unique_ptr<BufferAssignment> assignment,
-    TargetMachineOptions target_machine_options)
-    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
-                 std::move(hlo_profile_index_map)),
+CpuExecutable::CpuExecutable(std::unique_ptr<HloModule> hlo_module,
+                             std::unique_ptr<BufferAssignment> assignment,
+                             TargetMachineOptions target_machine_options)
+    : Executable(std::move(hlo_module)),
       assignment_(std::move(assignment)),
       target_machine_options_(std::move(target_machine_options)) {
   if (assignment_ && has_module()) {
diff --git a/third_party/xla/xla/service/cpu/cpu_executable.h b/third_party/xla/xla/service/cpu/cpu_executable.h
index ee590e472dbf83..ebb97baf217e47 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.h
+++ b/third_party/xla/xla/service/cpu/cpu_executable.h
@@ -62,8 +62,6 @@ class CpuExecutable : public Executable {
       std::unique_ptr<BufferAssignment> assignment,
       std::unique_ptr<HloModule> hlo_module, ThunkSequence thunks,
       std::vector<ConstantAllocation> constants,
-      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
       TargetMachineOptions target_machine_options);
 
   ~CpuExecutable() override;
@@ -246,8 +244,6 @@ class CpuExecutable : public Executable {
   std::string entry_function_name_;
 
   CpuExecutable(std::unique_ptr<HloModule> hlo_module,
-                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map,
                 std::unique_ptr<BufferAssignment> assignment,
                 TargetMachineOptions target_machine_options);
   CpuExecutable(const CpuExecutable&) = delete;
diff --git a/third_party/xla/xla/service/executable.h b/third_party/xla/xla/service/executable.h
index db444230abe342..e59ac39a932d44 100644
--- a/third_party/xla/xla/service/executable.h
+++ b/third_party/xla/xla/service/executable.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include <cstdint>
 #include <memory>
-#include <string>
 #include <utility>
 #include <vector>
 
@@ -37,7 +36,6 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/computation_layout.h"
 #include "xla/service/hlo.pb.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/maybe_owning_device_memory.h"
 #include "xla/service/service_executable_run_options.h"
@@ -265,20 +263,6 @@ class Executable {
   // doesn't need it for execution.
   explicit Executable(std::shared_ptr<HloModule> hlo_module)
       : hlo_module_(std::move(hlo_module)) {}
-
-  // TODO(b/172012028): Remove this constructor.
-  // The hlo_module parameter may be nullptr, if the given executable type
-  // doesn't need it for execution.
-  explicit Executable(
-      std::shared_ptr<HloModule> hlo_module,
-      std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
-      std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
-      : hlo_module_(std::move(hlo_module)),
-        hlo_profile_printer_data_(std::move(hlo_profile_printer_data)),
-        hlo_profile_index_map_(std::move(hlo_profile_index_map)) {
-    CHECK_EQ(hlo_profile_printer_data_.get() == nullptr,
-             hlo_profile_index_map_.get() == nullptr);
-  }
   virtual ~Executable() = default;
 
   // Enqueues the compilation result on the provided stream, passing the given
@@ -344,22 +328,6 @@ class Executable {
       const ServiceExecutableRunOptions* run_options,
       std::vector<ExecutionInput> arguments);
 
-  const HloProfilePrinterData& hlo_profile_printer_data() const {
-    CHECK(hlo_profiling_enabled());
-    return *hlo_profile_printer_data_;
-  }
-
-  const HloProfileIndexMap& hlo_profile_index_map() const {
-    CHECK(hlo_profiling_enabled());
-    return *hlo_profile_index_map_;
-  }
-
-  // Returns whether this executable was compiled with HLO profilings support
-  // enabled. If not, the caller should not expect an hlo_execution_profile
-  // passed to ExecuteOnStream above to be populated during execution.
-  bool hlo_profiling_enabled() const {
-    return hlo_profile_printer_data_ != nullptr;
-  }
 
   HloModule& module() const {
     CHECK(hlo_module_ != nullptr);
@@ -477,9 +445,6 @@ class Executable {
   // execution.
   int64_t execution_count_ = 0;
 
-  std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data_;
-  std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map_;
-
   // A map from kernel name to relevant kernel stats.
   ModuleStats module_stats_;
 

From a6e0e6fea1a7910728a32763f3e8a4722d5491d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eusebio=20Dur=C3=A1n=20Monta=C3=B1a?= <eusebiodm@google.com>
Date: Mon, 8 Dec 2025 00:58:04 -0800
Subject: [PATCH 010/753] Add missing BUILD dependencies, and remove unused
 ones

PiperOrigin-RevId: 841639400
---
 third_party/xla/xla/backends/autotuner/BUILD  |  1 -
 third_party/xla/xla/backends/cpu/BUILD        |  3 --
 .../xla/xla/backends/cpu/autotuner/BUILD      |  4 ---
 .../xla/xla/backends/cpu/codegen/BUILD        |  6 ----
 .../xla/xla/backends/cpu/codegen/dot/BUILD    |  2 --
 .../xla/backends/cpu/codegen/elemental/BUILD  |  2 --
 .../xla/backends/cpu/codegen/emitters/BUILD   |  1 -
 .../cpu/codegen/tiled/transforms/BUILD        |  1 -
 .../xla/xla/backends/cpu/collectives/BUILD    |  5 ---
 .../xla/xla/backends/cpu/runtime/BUILD        | 35 ++++---------------
 .../xla/backends/cpu/runtime/xnnpack/BUILD    |  2 --
 .../xla/backends/cpu/runtime/ynnpack/BUILD    |  8 -----
 .../xla/xla/backends/cpu/testlib/BUILD        |  3 --
 third_party/xla/xla/backends/cpu/tests/BUILD  |  8 -----
 .../xla/xla/backends/gpu/autotuner/BUILD      |  4 ---
 .../xla/xla/backends/gpu/codegen/BUILD        |  1 -
 .../backends/gpu/codegen/emitters/ir/BUILD    |  1 -
 .../xla/xla/backends/gpu/codegen/tools/BUILD  |  2 --
 .../xla/xla/backends/gpu/codegen/triton/BUILD | 17 ---------
 .../xla/xla/backends/gpu/runtime/BUILD        | 12 -------
 .../xla/xla/backends/interpreter/BUILD        |  1 -
 .../xla/xla/backends/profiler/gpu/BUILD       |  8 -----
 third_party/xla/xla/codegen/BUILD             |  8 -----
 third_party/xla/xla/codegen/emitters/BUILD    |  6 ----
 third_party/xla/xla/codegen/tiling/BUILD      |  6 ----
 25 files changed, 7 insertions(+), 140 deletions(-)

diff --git a/third_party/xla/xla/backends/autotuner/BUILD b/third_party/xla/xla/backends/autotuner/BUILD
index 87fa53104ca953..b9c1637e90d582 100644
--- a/third_party/xla/xla/backends/autotuner/BUILD
+++ b/third_party/xla/xla/backends/autotuner/BUILD
@@ -181,7 +181,6 @@ xla_cc_test(
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:statusor",
-        "//xla/tsl/util/proto:proto_matchers",
         "@com_google_absl//absl/status",
         "@com_google_googletest//:gtest_main",
         "@com_google_protobuf//:any_cc_proto",
diff --git a/third_party/xla/xla/backends/cpu/BUILD b/third_party/xla/xla/backends/cpu/BUILD
index b124f4f72962ea..7d9e7ad114aba0 100644
--- a/third_party/xla/xla/backends/cpu/BUILD
+++ b/third_party/xla/xla/backends/cpu/BUILD
@@ -54,7 +54,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/types:span",
     ],
@@ -182,7 +181,6 @@ cc_library(
     srcs = ["xnn_gemm_config.cc"],
     hdrs = ["xnn_gemm_config.h"],
     deps = [
-        "//xla:shape_util",
         "//xla:xla_data_proto_cc",
         "//xla/backends/cpu/codegen:target_machine_features",
         "//xla/backends/cpu/runtime:dot_dims",
@@ -208,7 +206,6 @@ cc_library(
         "//xla/service:pattern_matcher",
         "//xla/tsl/platform:statusor",
         "@XNNPACK",
-        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:no_destructor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
diff --git a/third_party/xla/xla/backends/cpu/autotuner/BUILD b/third_party/xla/xla/backends/cpu/autotuner/BUILD
index 89b7f87c5cdce7..e5bc7335a1a082 100644
--- a/third_party/xla/xla/backends/cpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/cpu/autotuner/BUILD
@@ -88,7 +88,6 @@ cc_library(
     hdrs = ["xnnpack_backend.h"],
     deps = [
         ":cpu_codegen_backend",
-        "//xla:status_macros",
         "//xla:util",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/backends/cpu:xnn_fusion_options_proto_cc",
@@ -102,7 +101,6 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
-        "@local_tsl//tsl/platform:casts",
     ],
 )
 
@@ -203,7 +201,5 @@ xla_cc_test(
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:status_matchers",
-        "@local_tsl//tsl/platform:test",
     ],
 )
diff --git a/third_party/xla/xla/backends/cpu/codegen/BUILD b/third_party/xla/xla/backends/cpu/codegen/BUILD
index 4753a80c341de5..db9b0a2cfbd267 100644
--- a/third_party/xla/xla/backends/cpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/BUILD
@@ -94,7 +94,6 @@ cc_library(
     srcs = ["ir_compiler.cc"],
     hdrs = ["ir_compiler.h"],
     deps = [
-        ":cpu_features",
         ":kernel_api_ir_builder",
         ":polynomial_approximations",
         "//xla:util",
@@ -133,7 +132,6 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Target",
         "@llvm-project//llvm:TargetParser",
-        "@local_tsl//tsl/platform:platform_port",
     ],
 )
 
@@ -351,7 +349,6 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:no_destructor",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/synchronization",
@@ -557,9 +554,7 @@ cc_library(
     hdrs = ["object_loader.h"],
     deps = [
         ":compiled_function_library",
-        ":contiguous_section_memory_manager",
         ":execution_engine",
-        ":jit_memory_mapper",
         "//xla/backends/cpu/runtime:function_library",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -776,7 +771,6 @@ py_strict_test(
         "//third_party/py/numpy",
         "//xla/backends/cpu/testlib",
         "//xla/codegen/testlib",
-        "//xla/python:xla_extension",
         "@absl_py//absl/testing:absltest",
         "@absl_py//absl/testing:parameterized",
     ],
diff --git a/third_party/xla/xla/backends/cpu/codegen/dot/BUILD b/third_party/xla/xla/backends/cpu/codegen/dot/BUILD
index 067d7d21b6df1d..97db03ab0cfcaa 100644
--- a/third_party/xla/xla/backends/cpu/codegen/dot/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/dot/BUILD
@@ -15,7 +15,6 @@ cc_library(
         "//xla:util",
         "//xla/backends/cpu/codegen:kernel_api_ir_builder",
         "//xla/backends/cpu/codegen:target_machine_features",
-        "//xla/codegen:kernel_definition",
         "//xla/codegen:kernel_emitter",
         "//xla/codegen:kernel_spec",
         "//xla/codegen:llvm_kernel_source",
@@ -25,7 +24,6 @@ cc_library(
         "//xla/service:hlo_module_config",
         "//xla/service/cpu:dot_op_emitter",
         "//xla/service/llvm_ir:ir_array",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/backends/cpu/codegen/elemental/BUILD b/third_party/xla/xla/backends/cpu/codegen/elemental/BUILD
index bdce35a2907d7e..f31b161c01e632 100644
--- a/third_party/xla/xla/backends/cpu/codegen/elemental/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/elemental/BUILD
@@ -35,7 +35,6 @@ cc_library(
         "//xla/service/cpu:backend_config_proto_cc",
         "//xla/service/cpu:ir_emitter",
         "//xla/service/llvm_ir:ir_array",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
@@ -104,7 +103,6 @@ xla_cc_test(
         ":elemental_kernel_emitter",
         "//xla:xla_data_proto_cc",
         "//xla/codegen:kernel_definition",
-        "//xla/codegen:kernel_emitter",
         "//xla/codegen:llvm_kernel_source",
         "//xla/hlo/analysis:alias_info",
         "//xla/hlo/analysis:hlo_ordering",
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD b/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
index d0ac6c73095272..a7f171019e92cf 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
@@ -124,6 +124,5 @@ xla_cc_test(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
-        "@local_tsl//tsl/platform:casts",
     ],
 )
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/BUILD b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/BUILD
index d81c0557967508..ee920c56af1ac2 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/BUILD
@@ -114,7 +114,6 @@ cc_library(
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:Support",
-        "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:VectorDialect",
     ],
 )
diff --git a/third_party/xla/xla/backends/cpu/collectives/BUILD b/third_party/xla/xla/backends/cpu/collectives/BUILD
index abec1d790f9b90..c1103b1457180a 100644
--- a/third_party/xla/xla/backends/cpu/collectives/BUILD
+++ b/third_party/xla/xla/backends/cpu/collectives/BUILD
@@ -76,13 +76,11 @@ xla_cc_test(
         ":cpu_clique_key",
         ":cpu_cliques",
         ":in_process_collectives",
-        "//xla:util",
         "//xla/core/collectives:rank_id",
         "//xla/runtime:device_id",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_main",
-        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -144,7 +142,6 @@ cc_library(
         "//xla/service:collective_ops_utils",
         "//xla/service:rendezvous",
         "//xla/stream_executor:device_address",
-        "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/math:math_util",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
@@ -272,7 +269,6 @@ cc_library(
         "//xla/core/collectives:rank_id",
         "//xla/service:collective_ops_utils",
         "//xla/stream_executor:device_address",
-        "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:inlined_vector",
@@ -338,7 +334,6 @@ cc_library(
         "//xla/core/collectives:rank_id",
         "//xla/service:collective_ops_utils",
         "//xla/stream_executor:device_address",
-        "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/backends/cpu/runtime/BUILD b/third_party/xla/xla/backends/cpu/runtime/BUILD
index 1ee4f81b7db5f6..8d68edd2d6e12e 100644
--- a/third_party/xla/xla/backends/cpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/BUILD
@@ -1,6 +1,6 @@
 load("//xla:xla.default.bzl", "xla_cc_test", "xla_internal")
 load("//xla/service/cpu:build_defs.bzl", "runtime_copts")
-load("//xla/tsl:tsl.bzl", "if_windows", "internal_visibility")
+load("//xla/tsl:tsl.bzl", "if_google", "if_windows", "internal_visibility")
 load("//xla/tsl:tsl.default.bzl", "filegroup")
 load("//xla/tsl/platform:build_config.bzl", "tf_proto_library")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
@@ -105,14 +105,11 @@ cc_library(
         "//xla:util",
         "//xla/runtime:work_group",
         "//xla/stream_executor:device_address",
-        "//xla/stream_executor:launch_dim",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:logging",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@eigen_archive//:eigen3",
     ],
@@ -370,13 +367,10 @@ cc_library(
         "//xla/core/collectives:communicator",
         "//xla/service:collective_ops_utils",
         "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -496,14 +490,11 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla/service:collective_ops_utils",
         "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -524,12 +515,10 @@ cc_library(
         "//xla/service:buffer_assignment",
         "//xla/service:collective_ops_utils",
         "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -550,13 +539,10 @@ cc_library(
         "//xla/core/collectives:communicator",
         "//xla/service:collective_ops_utils",
         "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
     ],
@@ -580,13 +566,10 @@ cc_library(
         "//xla/service:collective_ops_utils",
         "//xla/service:computation_placer",
         "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -752,6 +735,11 @@ cc_library(
         "dot_lib_s8.cc",
     ],
     hdrs = ["dot_lib.h"],
+    tags = if_google([
+        #  Prevent build_cleaner from adding a dependency on eigen_contraction_kernel.h, see comment
+        # on `:dot_lib_onednn` below.
+        "ignore_for_dep=third_party/tensorflow/compiler/xla/tsl/framework/contraction/eigen_contraction_kernel.h",
+    ]),
     deps = [
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/functional:any_invocable",
@@ -799,14 +787,11 @@ cc_library(
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -967,7 +952,6 @@ cc_library(
         "//xla/runtime:work_group",
         "//xla/service:buffer_assignment",
         "//xla/stream_executor:device_address",
-        "//xla/stream_executor:launch_dim",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
@@ -995,7 +979,6 @@ xla_cc_test(
     deps = [
         ":buffer_allocations",
         ":function_library",
-        ":kernel",
         ":kernel_c_api",
         ":kernel_thunk",
         ":thunk",
@@ -1003,7 +986,7 @@ xla_cc_test(
         "//xla:literal_util",
         "//xla/runtime:work_group",
         "//xla/service:buffer_assignment",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:launch_dim",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:statusor",
@@ -1099,7 +1082,6 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -1227,7 +1209,6 @@ cc_library(
         "//xla/stream_executor:device_address",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base:dynamic_annotations",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status:statusor",
     ],
@@ -1424,8 +1405,6 @@ xla_cc_test(
         "//xla/tests:xla_internal_test_main",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
-        "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:test",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_googletest//:gtest",
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD b/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD
index 9cd96a5e90b63b..dc32dac687585b 100644
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD
@@ -160,7 +160,6 @@ cc_library(
     deps = [
         ":xnn_interop",
         "//xla:shape_util",
-        "//xla:util",
         "//xla/backends/cpu/runtime:thunk",
         "//xla/runtime:buffer_use",
         "//xla/runtime:object_pool",
@@ -172,7 +171,6 @@ cc_library(
         "//xla/tsl/platform:statusor",
         "@XNNPACK",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:no_destructor",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/functional:any_invocable",
diff --git a/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD b/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD
index 5172563e23ca0c..f5b22b22c19d8c 100644
--- a/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD
@@ -22,10 +22,7 @@ cc_library(
         "//xla/backends/cpu/runtime:work_queue",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/container:fixed_array",
-        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/synchronization",
         "@eigen_archive//:eigen3",
         "@local_tsl//tsl/profiler/lib:traceme",
@@ -72,15 +69,10 @@ cc_library(
         ":slinky_threadpool",
         ":ynn_interop",
         "@XNNPACK//ynnpack:ynnpack_h",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/container:fixed_array",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/synchronization",
         "@eigen_archive//:eigen3",
-        "@slinky//slinky/base:thread_pool",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/cpu/testlib/BUILD b/third_party/xla/xla/backends/cpu/testlib/BUILD
index e4d57e372c0ef0..e926426a322d37 100644
--- a/third_party/xla/xla/backends/cpu/testlib/BUILD
+++ b/third_party/xla/xla/backends/cpu/testlib/BUILD
@@ -159,7 +159,6 @@ tsl_pybind_extension(
         "//xla/codegen:llvm_kernel_source",
         "//xla/codegen:mlir_kernel_source",
         "//xla/codegen/testlib:kernel_runner",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:work_group",
         "//xla/service:buffer_assignment",
@@ -192,7 +191,6 @@ xla_cc_test(
         "//xla/codegen:llvm_kernel_source",
         "//xla/runtime:buffer_use",
         "//xla/service:buffer_assignment",
-        "//xla/stream_executor:launch_dim",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_main",
@@ -200,7 +198,6 @@ xla_cc_test(
         "@com_google_googletest//:gtest",
         "@llvm-project//llvm:JITLink",
         "@llvm-project//llvm:ir_headers",
-        "@local_tsl//tsl/platform:casts",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/cpu/tests/BUILD b/third_party/xla/xla/backends/cpu/tests/BUILD
index 0e63cc568a6459..435200a13e65d2 100644
--- a/third_party/xla/xla/backends/cpu/tests/BUILD
+++ b/third_party/xla/xla/backends/cpu/tests/BUILD
@@ -20,14 +20,9 @@ xla_test(
     tags = ["test_migrated_to_hlo_runner_pjrt"],
     deps = [
         "//xla:error_spec",
-        "//xla:literal",
-        "//xla:literal_util",
         "//xla/hlo/parser:hlo_parser",
-        "//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client",
-        "//xla/tests:client_library_test_runner_mixin",
         "//xla/tests:hlo_pjrt_interpreter_reference_mixin",
         "//xla/tests:hlo_pjrt_test_base",
-        "//xla/tests:literal_test_util",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
@@ -41,14 +36,11 @@ xla_test(
     tags = ["test_migrated_to_hlo_runner_pjrt"],
     deps = [
         "//xla:error_spec",
-        "//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client",
         "//xla/tests:hlo_pjrt_interpreter_reference_mixin",
         "//xla/tests:hlo_pjrt_test_base",
         "//xla/tsl/platform:test",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:platform_port",
     ],
 )
diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
index d9eb72d2b71296..3d1f9c93001508 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -40,7 +40,6 @@ xla_cc_test(
     srcs = ["gpu_codegen_backend_test.cc"],
     deps = [
         ":gpu_codegen_backend",
-        "//xla:xla_proto_cc",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -225,7 +224,6 @@ cc_library(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/stream_executor/gpu:gpu_blas_lt",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
@@ -573,7 +571,6 @@ xla_test(
         ":native_emitter",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/ir:hlo",
-        "//xla/hlo/ir:hlo_module_group",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/service:compiler",
         "//xla/service:executable",
@@ -607,7 +604,6 @@ cc_library(
         ":fission_backend",
         ":triton",
         "//xla/backends/autotuner:codegen_backend",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/service:compiler",
         "//xla/service/gpu/transforms:dot_algorithm_rewriter",
diff --git a/third_party/xla/xla/backends/gpu/codegen/BUILD b/third_party/xla/xla/backends/gpu/codegen/BUILD
index 2be8a0247fbe27..62f99605665307 100644
--- a/third_party/xla/xla/backends/gpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/BUILD
@@ -295,7 +295,6 @@ cc_library(
         "//xla/backends/gpu/codegen/emitters:transpose",
         "//xla/backends/gpu/codegen/triton:fusion",
         "//xla/codegen:ir_emission_utils",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service:buffer_assignment",
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/ir/BUILD b/third_party/xla/xla/backends/gpu/codegen/emitters/ir/BUILD
index 20b5dec1c59a74..4c9961033ebc17 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/ir/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/ir/BUILD
@@ -116,7 +116,6 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:BytecodeOpInterface",
-        "@llvm-project//mlir:CallOpInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
diff --git a/third_party/xla/xla/backends/gpu/codegen/tools/BUILD b/third_party/xla/xla/backends/gpu/codegen/tools/BUILD
index dfe8f7f9a8100f..df59b5ebed0cec 100644
--- a/third_party/xla/xla/backends/gpu/codegen/tools/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/tools/BUILD
@@ -32,7 +32,6 @@ cc_library(
         "//xla:status_macros",
         "//xla/backends/gpu/codegen:fusions",
         "//xla/backends/gpu/codegen/emitters:emitter_base",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/service/gpu:hlo_fusion_analysis",
@@ -55,7 +54,6 @@ xla_cc_binary(
     deps = [
         ":test_lib",
         "//xla/codegen/tools:test_lib",
-        "//xla/hlo/analysis:symbolic_expr",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@llvm-project//llvm:Support",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index 8e365aba6a0b59..19587ed10c4d4a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -259,7 +259,6 @@ cc_library(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
-        "//xla/backends/gpu/codegen/triton/ir:triton_xla",
         "//xla/codegen/emitters:elemental_hlo_to_mlir",
         "//xla/codegen/emitters/ir:xla",
         "//xla/codegen/tiling:symbolic_tile_analysis",
@@ -559,7 +558,6 @@ xla_test(
         "//xla:error_spec",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:pattern_matcher_gmock",
@@ -612,19 +610,13 @@ xla_test(
         "//xla:autotuning_proto_cc",
         "//xla:error_spec",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu/tests:gpu_codegen_test",
-        "//xla/service/gpu/transforms:nest_gemm_fusion",
         "//xla/stream_executor:device_description",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
-        "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest",
         "@llvm-project//mlir:IR",
@@ -711,7 +703,6 @@ xla_test(
         "no_mac",
     ],
     deps = [
-        ":fusion_emitter",
         ":support",
         ":test_utils",
         ":xtile_compiler",
@@ -724,12 +715,6 @@ xla_test(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
-        "//xla/codegen/tiling:symbolic_tile_analysis",
-        "//xla/codegen/tiling:tiled_hlo_computation",
-        "//xla/codegen/tiling:tiled_hlo_instruction",
-        "//xla/codegen/tiling:tiled_hlo_schedule",
-        "//xla/codegen/tiling:tiling_specification",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:verified_hlo_module",
@@ -787,7 +772,6 @@ cc_library(
         "//xla/service/gpu:gpu_float_support",
         "//xla/service/gpu:ir_emission_utils",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:target_constants",
         "//xla/service/gpu/model:block_level_parameters",
         "//xla/service/gpu/model:triton_emitter_constraints",
         "//xla/stream_executor:device_description",
@@ -1010,7 +994,6 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:LLVMDialect",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 1ba07e7410f2c2..192c99de9902a6 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1667,10 +1667,8 @@ cc_library(
     hdrs = ["collective_params.h"],
     compatible_with = get_compatible_with_portable(),
     deps = [
-        ":collective_clique_requests",
         "//xla:executable_run_options",
         "//xla:util",
-        "//xla/backends/gpu/collectives:gpu_clique_key",
         "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/runtime:device_id",
         "//xla/service:computation_placer",
@@ -1681,9 +1679,7 @@ cc_library(
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -1700,7 +1696,6 @@ cc_library(
         "//xla/backends/gpu/collectives:gpu_clique",
         "//xla/backends/gpu/collectives:gpu_clique_key",
         "//xla/backends/gpu/collectives:gpu_cliques",
-        "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
         "//xla/runtime:device_id",
@@ -1710,9 +1705,7 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:no_destructor",
         "@com_google_absl//absl/log",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/profiler/lib:traceme",
     ],
@@ -2153,11 +2146,8 @@ cc_library(
         "//xla:executable_run_options",
         "//xla:status_macros",
         "//xla:util",
-        "//xla/backends/gpu/collectives:gpu_clique_key",
-        "//xla/backends/gpu/collectives:gpu_cliques",
         "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/core/collectives:communicator",
-        "//xla/core/collectives:rank_id",
         "//xla/ffi:execution_context",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:buffer_use",
@@ -2171,7 +2161,6 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/lib/gtl:int_type",
-        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
@@ -2179,7 +2168,6 @@ cc_library(
         "@com_google_absl//absl/functional:function_ref",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/backends/interpreter/BUILD b/third_party/xla/xla/backends/interpreter/BUILD
index 6b9515523cf0c1..6bf5957323a49c 100644
--- a/third_party/xla/xla/backends/interpreter/BUILD
+++ b/third_party/xla/xla/backends/interpreter/BUILD
@@ -168,7 +168,6 @@ cc_library(
         "//xla/stream_executor:stream_executor_common",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/host:host_stream",
-        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD
index 30e6a1cfa72cb4..fefd3b9b992862 100644
--- a/third_party/xla/xla/backends/profiler/gpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/gpu/BUILD
@@ -68,7 +68,6 @@ cc_library(
         ":rocm_tracer_utils",
         "//xla/stream_executor/rocm:roctracer_wrapper",
         "//xla/tsl/platform:env_time",
-        "//xla/tsl/platform:errors",
         "//xla/tsl/profiler/backends/cpu:annotation_stack",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
@@ -201,9 +200,7 @@ xla_test(
         ":cupti_wrapper",
         ":mock_cupti",
         "//xla/tsl/profiler/utils:time_utils",
-        "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:test",
     ],
 )
 
@@ -362,7 +359,6 @@ cc_library(
     ],
     deps = [
         ":cupti_collector",
-        ":cupti_interface",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/time",
     ],
@@ -756,11 +752,7 @@ xla_cc_test(
         ":ondevice_event_exporter",
         "//xla/tsl/profiler/backends/gpu:ondevice_event_receiver",
         "//xla/tsl/profiler/backends/gpu:ondevice_trace_event",
-        "//xla/tsl/profiler/utils:xplane_builder",
-        "//xla/tsl/profiler/utils:xplane_schema",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:test",
-        "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
     ],
 )
 
diff --git a/third_party/xla/xla/codegen/BUILD b/third_party/xla/xla/codegen/BUILD
index f58bf888d460bb..ddcc8810df0a6f 100644
--- a/third_party/xla/xla/codegen/BUILD
+++ b/third_party/xla/xla/codegen/BUILD
@@ -57,8 +57,6 @@ cc_library(
     srcs = ["llvm_kernel_source.cc"],
     hdrs = ["llvm_kernel_source.h"],
     deps = [
-        ":kernel_definition",
-        ":kernel_emitter",
         ":kernel_source",
         "//xla/service/llvm_ir:llvm_util",
         "@llvm-project//llvm:Core",
@@ -77,7 +75,6 @@ cc_library(
     deps = [
         ":kernel_source",
         ":kernel_spec",
-        "//xla/tsl/platform:logging",
     ],
 )
 
@@ -86,8 +83,6 @@ cc_library(
     srcs = ["mlir_kernel_source.cc"],
     hdrs = ["mlir_kernel_source.h"],
     deps = [
-        ":kernel_definition",
-        ":kernel_emitter",
         ":kernel_source",
         "//xla:util",
         "//xla/hlo/analysis:symbolic_expr",
@@ -117,7 +112,6 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/functional:function_ref",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/types:span",
     ],
@@ -136,8 +130,6 @@ xla_cc_test(
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:status_matchers",
-        "@local_tsl//tsl/platform:test",
     ],
 )
 
diff --git a/third_party/xla/xla/codegen/emitters/BUILD b/third_party/xla/xla/codegen/emitters/BUILD
index 3bf04998441f0b..5fc4861574b9ed 100644
--- a/third_party/xla/xla/codegen/emitters/BUILD
+++ b/third_party/xla/xla/codegen/emitters/BUILD
@@ -257,7 +257,6 @@ cc_library(
         "//xla/codegen:kernel_spec",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:work_dimensions",
         "//xla/runtime:work_group",
@@ -287,7 +286,6 @@ xla_cc_test(
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/runtime:work_cluster",
         "//xla/runtime:work_dimensions",
         "//xla/runtime:work_group",
@@ -311,7 +309,6 @@ cc_library(
         "//xla:shape_util",
         "//xla:util",
         "//xla/codegen:hlo_fusion_spec",
-        "//xla/codegen:kernel_definition",
         "//xla/codegen:kernel_emitter",
         "//xla/codegen:kernel_spec",
         "//xla/codegen:mlir_kernel_source",
@@ -327,7 +324,6 @@ cc_library(
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -365,7 +361,6 @@ cc_library(
         "//xla:shape_util",
         "//xla:util",
         "//xla/codegen:hlo_fusion_spec",
-        "//xla/codegen:kernel_definition",
         "//xla/codegen:kernel_emitter",
         "//xla/codegen:kernel_spec",
         "//xla/codegen:mlir_kernel_source",
@@ -425,7 +420,6 @@ cc_library(
         "//xla:util",
         "//xla/codegen:hlo_fusion_spec",
         "//xla/codegen:ir_emission_utils",
-        "//xla/codegen:kernel_definition",
         "//xla/codegen:kernel_emitter",
         "//xla/codegen:kernel_spec",
         "//xla/codegen:mlir_kernel_source",
diff --git a/third_party/xla/xla/codegen/tiling/BUILD b/third_party/xla/xla/codegen/tiling/BUILD
index a373ed7c1ced60..07ebadad917774 100644
--- a/third_party/xla/xla/codegen/tiling/BUILD
+++ b/third_party/xla/xla/codegen/tiling/BUILD
@@ -306,12 +306,7 @@ cc_library(
     hdrs = ["tiling_specification.h"],
     deps = [
         ":constraint_expression",
-        ":symbolic_tiled_hlo_instruction",
-        ":tiled_hlo_computation",
-        "//xla/hlo/analysis:indexing_analysis",
         "//xla/hlo/ir:hlo",
-        "//xla/hlo/utils:hlo_traversal",
-        "//xla/service:instruction_fusion",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
@@ -321,7 +316,6 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
-        "@llvm-project//mlir:IR",
     ],
 )
 

From 21f33aec5ab61b897851145109f476c17bb64e94 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 01:05:38 -0800
Subject: [PATCH 011/753] Update GraphDef version to 2435.

PiperOrigin-RevId: 841641925
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 97ae6af69c56ae..5448bf12c3dcfe 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2434  // Updated: 2025/12/7
+#define TF_GRAPH_DEF_VERSION 2435  // Updated: 2025/12/8
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From e5a79a127258429ea35a27e3b4c16f511887cdfc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 01:06:32 -0800
Subject: [PATCH 012/753] compat: Update forward compatibility horizon to
 2025-12-08

PiperOrigin-RevId: 841642257
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 77eb63a7551ed6..019f2360af662e 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 7)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 8)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From a79c3d3a39db173699271673c6f33eb6d4209cff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 01:12:17 -0800
Subject: [PATCH 013/753] Automated Code Change

PiperOrigin-RevId: 841644074
---
 .../xla/xla/backends/gpu/codegen/emitters/emitter_base.cc        | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
index 91b52d4d011ee6..f171a5cb6b4f33 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"

From 59e985cfa334b8bb32d07b0f74d0503eb0c03bf6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 01:14:14 -0800
Subject: [PATCH 014/753] [XLA:GPU] Filter out multi-gpu tests since current
 GPU L4 action has only single GPU.

PiperOrigin-RevId: 841644586
---
 third_party/xla/build_tools/ci/build.py       | 130 ++++--------------
 .../xla/build_tools/ci/golden_commands.txt    |  28 ++--
 .../xla/xla/stream_executor/cuda/BUILD        |   1 -
 third_party/xla/xla/tests/BUILD               |  15 --
 .../xla/xla/tests/collective_ops_e2e_test.cc  |   8 ++
 .../xla/tests/collective_ops_e2e_test_base.h  |   5 +
 6 files changed, 53 insertions(+), 134 deletions(-)

diff --git a/third_party/xla/build_tools/ci/build.py b/third_party/xla/build_tools/ci/build.py
index 20d77da3bee540..0b6e4dbcbf8822 100755
--- a/third_party/xla/build_tools/ci/build.py
+++ b/third_party/xla/build_tools/ci/build.py
@@ -273,6 +273,17 @@ def _tag_filters_for_compute_capability(
   return tag_filters
 
 
+nvidia_gpu_filters = (
+    "-no_oss",
+    "requires-gpu-nvidia",
+    "gpu",
+    "-rocm-only",
+    "-oneapi-only",
+)
+
+single_nvidia_gpu_filters = nvidia_gpu_filters + ("-multi_gpu",)
+
+
 def nvidia_gpu_build_with_compute_capability(
     *,
     type_: BuildType,
@@ -285,21 +296,8 @@ def nvidia_gpu_build_with_compute_capability(
       repo="openxla/xla",
       target_patterns=_XLA_DEFAULT_TARGET_PATTERNS,
       configs=configs,
-      test_tag_filters=(
-          "-no_oss",
-          "requires-gpu-nvidia",
-          "gpu",
-          "-rocm-only",
-          "-oneapi-only",
-      )
-      + extra_gpu_tags,
-      build_tag_filters=(
-          "-no_oss",
-          "requires-gpu-nvidia",
-          "gpu",
-          "-rocm-only",
-          "-oneapi-only",
-      ),
+      test_tag_filters=single_nvidia_gpu_filters + extra_gpu_tags,
+      build_tag_filters=single_nvidia_gpu_filters,
       options={
           "run_under": "//build_tools/ci:parallel_gpu_execute",
           "//xla/tsl:ci_build": True,
@@ -510,21 +508,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
-    test_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    )
+    test_tag_filters=single_nvidia_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    ),
+    build_tag_filters=single_nvidia_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -542,21 +528,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
-    test_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    )
+    test_tag_filters=single_nvidia_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    ),
+    build_tag_filters=single_nvidia_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -575,21 +549,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    )
+    test_tag_filters=single_nvidia_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    ),
+    build_tag_filters=single_nvidia_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -607,21 +569,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    )
+    test_tag_filters=single_nvidia_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    ),
+    build_tag_filters=single_nvidia_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -640,21 +590,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=(),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    )
+    test_tag_filters=single_nvidia_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=100),
-    build_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    ),
+    build_tag_filters=single_nvidia_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         # Use User Mode and Kernel Mode Drivers pre-installed on the system.
@@ -675,21 +613,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=(),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    )
+    test_tag_filters=single_nvidia_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=100),
-    build_tag_filters=(
-        "-no_oss",
-        "requires-gpu-nvidia",
-        "gpu",
-        "-rocm-only",
-        "-oneapi-only",
-    ),
+    build_tag_filters=single_nvidia_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         # Use User Mode and Kernel Mode Drivers pre-installed on the system.
@@ -932,11 +858,7 @@ def nvidia_gpu_build_with_compute_capability(
 Build(
     type_=BuildType.TENSORFLOW_LINUX_X86_GPU_L4_GITHUB_ACTIONS,
     repo="tensorflow/tensorflow",
-    configs=(
-        "release_gpu_linux",
-        "rbe_linux_cuda",
-        "hermetic_cuda_umd"
-    ),
+    configs=("release_gpu_linux", "rbe_linux_cuda", "hermetic_cuda_umd"),
     target_patterns=(
         "//tensorflow/compiler/...",
         "-//tensorflow/compiler/tf2tensorrt/...",
diff --git a/third_party/xla/build_tools/ci/golden_commands.txt b/third_party/xla/build_tools/ci/golden_commands.txt
index f5e914157ec888..e067ee9ecc80dd 100644
--- a/third_party/xla/build_tools/ci/golden_commands.txt
+++ b/third_party/xla/build_tools/ci/golden_commands.txt
@@ -55,44 +55,44 @@ bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_CPU_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_L4_16_VCPU_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
-bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
+bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_L4_48_VCPU_PRESUBMIT_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS
 nvidia-smi
-parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @local_tsl//tsl/...
-bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @local_tsl//tsl/...
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @local_tsl//tsl/...
+bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm75-only,requires-gpu-sm60,requires-gpu-sm70,-requires-gpu-sm80,-requires-gpu-sm80-only,-requires-gpu-sm90,-requires-gpu-sm90-only,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=7.5 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @local_tsl//tsl/...
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_ONEAPI_GITHUB_ACTIONS
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 24a305d31a76ce..f173369799956f 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1248,7 +1248,6 @@ xla_test(
     backend_tags = {
         "gpu": [
             "multi_gpu",
-            "no_oss",
         ],
     },
     backends = ["gpu"],
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index b8f0ea10cad89b..aff7b7e1abfcdd 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -2916,10 +2916,6 @@ xla_test(
         "gpu": [
             "multi_gpu",
         ],
-        "nvgpu_any": [
-            "broken",
-            "no_oss",
-        ],
     },
     backends = [
         "gpu",
@@ -2971,9 +2967,6 @@ xla_test(
         "gpu": [
             "multi_gpu",
         ],
-        "nvgpu_any": [
-            "no_oss",
-        ],
     },
     backends = ["gpu"],
     deps = [
@@ -3016,10 +3009,6 @@ xla_test(
         "gpu": [
             "multi_gpu",
         ],
-        "nvgpu_any": [
-            "broken",
-            "no_oss",
-        ],
     },
     backends = [
         "gpu",
@@ -3043,10 +3032,6 @@ xla_test(
         "gpu": [
             "multi_gpu",
         ],
-        "nvgpu_any": [
-            "broken",
-            "no_oss",
-        ],
     },
     backends = [
         "gpu",
diff --git a/third_party/xla/xla/tests/collective_ops_e2e_test.cc b/third_party/xla/xla/tests/collective_ops_e2e_test.cc
index 1191df40032c41..872492ffecbdeb 100644
--- a/third_party/xla/xla/tests/collective_ops_e2e_test.cc
+++ b/third_party/xla/xla/tests/collective_ops_e2e_test.cc
@@ -2396,6 +2396,14 @@ class AllReduceTest
                                    /*memory_size=*/32 * kMB,
                                    /*collectives_memory_size=*/0) {}
 
+  void SetUp() override {
+    CollectiveOpsE2ETestBase::SetUp();
+    if (!IsAmpereAndHigher()) {
+      GTEST_SKIP() << "Test requires Ampere or newer architecture since it's "
+                      "using triton.";
+    }
+  }
+
  protected:
   DebugOptions GetDebugOptionsForTest() const override {
     DebugOptions opts = CollectiveOpsWithFlagsBase::GetDebugOptionsForTest();
diff --git a/third_party/xla/xla/tests/collective_ops_e2e_test_base.h b/third_party/xla/xla/tests/collective_ops_e2e_test_base.h
index 93190cc0e7c85c..8cf62249adca78 100644
--- a/third_party/xla/xla/tests/collective_ops_e2e_test_base.h
+++ b/third_party/xla/xla/tests/collective_ops_e2e_test_base.h
@@ -80,6 +80,11 @@ class CollectiveOpsE2ETestBase : public HloHardwareIndependentTestBase {
            Capability().cuda_compute_capability()->IsAtLeastHopper();
   }
 
+  bool IsAmpereAndHigher() {
+    return Capability().IsCuda() &&
+           Capability().cuda_compute_capability()->IsAtLeastAmpere();
+  }
+
  protected:
   std::unique_ptr<HloRunner> hlo_runner_;
   std::unique_ptr<HloRunner> reference_hlo_runner_;

From a380e6c5e155b4ae40c28a4c4bdc13cb4b74b941 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Mon, 8 Dec 2025 01:38:49 -0800
Subject: [PATCH 015/753] Remove accidental const keyword.

The alias_info_ member should not be const.

PiperOrigin-RevId: 841652983
---
 third_party/xla/xla/service/buffer_assignment_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/buffer_assignment_test.cc b/third_party/xla/xla/service/buffer_assignment_test.cc
index 31a0cc1c4d5e32..7627b0afd76bab 100644
--- a/third_party/xla/xla/service/buffer_assignment_test.cc
+++ b/third_party/xla/xla/service/buffer_assignment_test.cc
@@ -399,7 +399,7 @@ class BufferAssignmentTest : public HloHardwareIndependentTestBase {
   Shape f32a100x10_ = ShapeUtil::MakeShape(F32, {100, 10});
   Shape t_s32_f32v4_ = ShapeUtil::MakeTupleShape({s32_, f32vec4_});
   Shape t_s32_f32v10_ = ShapeUtil::MakeTupleShape({s32_, f32vec10_});
-  const AliasInfo alias_info_;
+  AliasInfo alias_info_;
 };
 
 // Returns true if the buffers assigned to instructions in "a" are distinct

From 325a5b2649bcdf4522a1a58ea354268399f14488 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Mon, 8 Dec 2025 02:09:09 -0800
Subject: [PATCH 016/753] Store ThunkProto in GpuExecutable.

This change modifies GpuExecutable to generate and store the ThunkProto during creation, before running thunk passes. The stored proto is then used when serializing the GpuExecutable to a proto, instead of generating it on demand after thunk passes ran.

This is a temporary measure to make debug dumping of GPU executables possible. Long term we want to split GpuExecutable into 2 entities - one that is being produced by the compiler and doesn't depend on runtime facilities, and a second one which gets generated from the first one and has all the execution code. But this is unfortunately a bigger refactoring, therefore we need a quicker way.

PiperOrigin-RevId: 841662771
---
 .../xla/xla/service/gpu/gpu_executable.cc     | 18 +++--
 .../xla/xla/service/gpu/gpu_executable.h      |  7 +-
 .../xla/service/gpu/gpu_executable_test.cc    | 72 +++++++++++++++++++
 3 files changed, 92 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index d72fdeddab4fe3..df3767982ce5f9 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -88,6 +88,7 @@ limitations under the License.
 #include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/event_based_timer.h"
+#include "xla/stream_executor/kernel_stats.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
@@ -238,6 +239,10 @@ absl::StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(
 
   GpuExecutableThunkPassBufferAllocator allocator(next_idx);
 
+  // TODO(b/461380690): Remove this once we have a better way to distinguish
+  // between compiler-generated and runtime-loaded GPU executables.
+  absl::StatusOr<ThunkProto> thunk_proto = params.executable->ToProto();
+
   TF_RETURN_IF_ERROR(RunThunkPasses(
       params.debug_options, params.device_description, params.executable.get(),
       params.debug_module.get(), allocator));
@@ -251,7 +256,7 @@ absl::StatusOr<std::unique_ptr<GpuExecutable>> GpuExecutable::Create(
       std::move(allocator.MutableAllocations()), std::move(params.alias_info),
       std::move(params.debug_options), std::move(params.constants),
       std::move(params.output_info), params.enable_debug_info_manager,
-      std::move(params.module_stats)));
+      std::move(params.module_stats), std::move(thunk_proto)));
 }
 
 // Implementation note: HLO profiling is always enabled for GPU executables,
@@ -268,7 +273,8 @@ GpuExecutable::GpuExecutable(
     std::unique_ptr<GpuAliasInfo> alias_info, DebugOptions debug_options,
     std::vector<ConstantInfo> constants,
     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info,
-    bool enable_debug_info_manager, ModuleStats module_stats)
+    bool enable_debug_info_manager, ModuleStats module_stats,
+    absl::StatusOr<ThunkProto> thunk_proto)
     : Executable(std::move(debug_module)),
       text_(std::move(asm_text)),
       binary_(std::move(binary)),
@@ -288,7 +294,8 @@ GpuExecutable::GpuExecutable(
           debug_options.xla_debug_buffer_assignment_show_max()),
       constants_(std::move(constants)),
       output_info_(std::move(output_info)),
-      enable_debug_info_manager_(enable_debug_info_manager) {
+      enable_debug_info_manager_(enable_debug_info_manager),
+      thunk_proto_(std::move(thunk_proto)) {
   if (gpu_version_.IsRocm()) {
     // ROCm uses hsaco hashes to distinguish between modules.
     // Bad things happen if multiple modules with identical code are loaded.
@@ -1230,7 +1237,10 @@ absl::StatusOr<GpuExecutableProto> GpuExecutable::ToProto() const {
 
   *proto.mutable_gpu_compute_capability() = gpu_version_.ToProto();
 
-  TF_ASSIGN_OR_RETURN(*proto.mutable_thunk(), thunks_->ToProto());
+  // TODO(b/461380690): Generate the proto on-the-fly once we have a better way
+  // to distinguish between compiler-generated and runtime-loaded GPU
+  // executables.
+  TF_ASSIGN_OR_RETURN(*proto.mutable_thunk(), thunk_proto_);
 
   proto.set_module_name(module_name_);
   *proto.mutable_program_shape() = program_shape_.ToProto();
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.h b/third_party/xla/xla/service/gpu/gpu_executable.h
index ce1a5eff0bb591..867dbf2275fb4c 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.h
+++ b/third_party/xla/xla/service/gpu/gpu_executable.h
@@ -247,7 +247,8 @@ class GpuExecutable : public Executable {
       std::unique_ptr<GpuAliasInfo> alias_info, DebugOptions debug_options,
       std::vector<ConstantInfo> constants,
       absl::flat_hash_map<ShapeIndex, OutputInfo> output_info,
-      bool enable_debug_info_manager, ModuleStats module_stats);
+      bool enable_debug_info_manager, ModuleStats module_stats,
+      absl::StatusOr<ThunkProto> thunk_proto);
 
   // GpuExecutable check with either AMD's ISA version, or Nvidia's major minor
   // version for compute capability, depending on the hardware.
@@ -369,6 +370,10 @@ class GpuExecutable : public Executable {
 
   GpuExecutable(const GpuExecutable&) = delete;
   GpuExecutable& operator=(const GpuExecutable&) = delete;
+
+  // Stores the thunk graph as a proto from before running the thunk pass.
+  // Might contain an error if the given thunk graph is not serializable.
+  absl::StatusOr<ThunkProto> thunk_proto_;
 };
 
 absl::StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
diff --git a/third_party/xla/xla/service/gpu/gpu_executable_test.cc b/third_party/xla/xla/service/gpu/gpu_executable_test.cc
index 1d5d68823b970f..33483843b616cc 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable_test.cc
@@ -649,5 +649,77 @@ TEST(GpuExecutableTest, FromProtoWithSymbolResolver) {
   EXPECT_EQ(symbol_resolver_invocations, 1);
 }
 
+TEST(GpuExecutableTest, ToProtoReturnsUnchangedThunkGraph) {
+  DebugOptions debug_options;
+  debug_options.set_xla_gpu_graph_min_graph_size(1);
+  debug_options.add_xla_gpu_enable_command_buffer(DebugOptions::FUSION);
+
+  auto create_executable = [&]() {
+    ThunkSequence thunk_sequence;
+    thunk_sequence.push_back(std::make_unique<KernelThunk>(
+        ThunkInfoWithId(1),
+        /*kernel_name=*/"test_kernel_0",
+        /*kernel_arguments=*/emitters::KernelArguments({}),
+        /*launch_dimensions=*/LaunchDimensions(),
+        /*cluster_dim=*/std::nullopt,
+        /*shmem_bytes=*/0,
+        /*tma_metadata=*/se::gpu::TmaMetadata()));
+    thunk_sequence.push_back(std::make_unique<KernelThunk>(
+        ThunkInfoWithId(2),
+        /*kernel_name=*/"test_kernel_1",
+        /*kernel_arguments=*/emitters::KernelArguments({}),
+        /*launch_dimensions=*/LaunchDimensions(),
+        /*cluster_dim=*/std::nullopt,
+        /*shmem_bytes=*/0,
+        /*tma_metadata=*/se::gpu::TmaMetadata()));
+    thunk_sequence.push_back(std::make_unique<KernelThunk>(
+        ThunkInfoWithId(3),
+        /*kernel_name=*/"test_kernel_2",
+        /*kernel_arguments=*/emitters::KernelArguments({}),
+        /*launch_dimensions=*/LaunchDimensions(),
+        /*cluster_dim=*/std::nullopt,
+        /*shmem_bytes=*/0,
+        /*tma_metadata=*/se::gpu::TmaMetadata()));
+    thunk_sequence.push_back(std::make_unique<KernelThunk>(
+        ThunkInfoWithId(4),
+        /*kernel_name=*/"test_kernel_3",
+        /*kernel_arguments=*/emitters::KernelArguments({}),
+        /*launch_dimensions=*/LaunchDimensions(),
+        /*cluster_dim=*/std::nullopt,
+        /*shmem_bytes=*/0,
+        /*tma_metadata=*/se::gpu::TmaMetadata()));
+    thunk_sequence.push_back(std::make_unique<KernelThunk>(
+        ThunkInfoWithId(5),
+        /*kernel_name=*/"test_kernel_4",
+        /*kernel_arguments=*/emitters::KernelArguments({}),
+        /*launch_dimensions=*/LaunchDimensions(),
+        /*cluster_dim=*/std::nullopt,
+        /*shmem_bytes=*/0,
+        /*tma_metadata=*/se::gpu::TmaMetadata()));
+
+    GpuExecutable::Params params;
+    params.executable = std::make_unique<SequentialThunk>(
+        ThunkInfoWithId(20), std::move(thunk_sequence));
+    params.debug_options = debug_options;
+
+    params.module_name = "test_module";
+    return GpuExecutable::Create(std::move(params));
+  };
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<GpuExecutable> executable,
+                          create_executable());
+
+  // We expect our 5 kernel launches got wrapped in a command buffer thunk.
+  // If this assertion fails, you might need to either adjust the thunk graph or
+  // the debug options such that we do some kind of thunk graph transformation
+  // that we can test for.
+  ASSERT_THAT(executable->GetThunk().thunks(), SizeIs(1));
+
+  // The proto should be a straight dump of the thunk graph, without any
+  // transformation.
+  TF_ASSERT_OK_AND_ASSIGN(GpuExecutableProto proto, executable->ToProto());
+  ASSERT_TRUE(proto.thunk().has_sequential_thunk());
+  EXPECT_THAT(proto.thunk().sequential_thunk().thunks(), SizeIs(5));
+}
+
 }  // namespace
 }  // namespace xla::gpu

From 8ee821c813471aaea6a80f33a8bfd319aa90cee2 Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Mon, 8 Dec 2025 03:19:12 -0800
Subject: [PATCH 017/753] [XLA:CPU] Use loop emitter rather than copy thunk for
 sub-byte types.

Confirmed that the test fails before this change.

PiperOrigin-RevId: 841683617
---
 third_party/xla/xla/service/cpu/tests/BUILD   | 16 ++++++
 .../xla/service/cpu/tests/cpu_copy_test.cc    | 54 +++++++++++++++++++
 .../xla/xla/service/cpu/thunk_emitter.cc      |  5 +-
 3 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 third_party/xla/xla/service/cpu/tests/cpu_copy_test.cc

diff --git a/third_party/xla/xla/service/cpu/tests/BUILD b/third_party/xla/xla/service/cpu/tests/BUILD
index 3ea7a53b3b6206..dd27f5eb8a38e9 100644
--- a/third_party/xla/xla/service/cpu/tests/BUILD
+++ b/third_party/xla/xla/service/cpu/tests/BUILD
@@ -442,3 +442,19 @@ xla_cc_test(
         "@local_tsl//tsl/platform:platform_port",
     ],
 )
+
+xla_cc_test(
+    name = "cpu_copy_test",
+    srcs = ["cpu_copy_test.cc"],
+    deps = [
+        ":cpu_codegen_test_main",
+        "//xla:literal",
+        "//xla:literal_util",
+        "//xla/hlo/ir:hlo",
+        "//xla/tests:literal_test_util",
+        "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+    ],
+)
diff --git a/third_party/xla/xla/service/cpu/tests/cpu_copy_test.cc b/third_party/xla/xla/service/cpu/tests/cpu_copy_test.cc
new file mode 100644
index 00000000000000..20de31fc7be8fd
--- /dev/null
+++ b/third_party/xla/xla/service/cpu/tests/cpu_copy_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "absl/types/span.h"
+#include "xla/literal.h"
+#include "xla/service/cpu/tests/cpu_codegen_test.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla::cpu {
+namespace {
+
+TEST_F(CpuCodegenTest, SubByteCopy) {
+  const std::string hlo_text = R"hlo(
+HloModule module
+
+ENTRY entry {
+  in = u2[20,20]{1,0:E(2)} iota(), iota_dimension=1
+  transpose = u2[20,20]{0,1:E(2)} transpose(in), dimensions={1,0}
+  copy = u2[20,20]{1,0:E(2)} copy(transpose)
+  ROOT out = u8[20,20]{1,0} convert(copy)
+}
+)hlo";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(
+      const Literal result,
+      Execute(std::move(module), {}, /*run_hlo_passes=*/false));
+
+  absl::Span<const uint8_t> result_data = result.data<uint8_t>();
+  for (int64_t row = 0; row < 20; ++row) {
+    for (int64_t col = 0; col < 20; ++col) {
+      EXPECT_EQ(result_data[row * 20 + col], row % 4);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace xla::cpu
diff --git a/third_party/xla/xla/service/cpu/thunk_emitter.cc b/third_party/xla/xla/service/cpu/thunk_emitter.cc
index 3f9932fc78bc43..0506620c3bb7a6 100644
--- a/third_party/xla/xla/service/cpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/cpu/thunk_emitter.cc
@@ -486,7 +486,10 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitHloInstruction(
       return EmitConvolutionThunk(instruction);
 
     case HloOpcode::kCopy: {
-      if (options_.compile_copy_as_llvm_kernel) {
+      // The copy thunk does not support sub-byte data types.
+      bool has_byte_strides =
+          ShapeUtil::ByteStrides(instruction->shape()).has_value();
+      if (!has_byte_strides || options_.compile_copy_as_llvm_kernel) {
         return EmitElementalKernelThunk(instruction);
       }
       return EmitCopyThunk(instruction);

From 7c95198b02af894ca3177991c90ea42fe904ed02 Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Mon, 8 Dec 2025 03:35:20 -0800
Subject: [PATCH 018/753] [XLA:GPU] Return early in
 `CalculateBitcastOfTransposeImpl` if indices are empty.

This avoids hitting an assert later. This is a stop gap solution until support for size-1 dims in bitcasts has been added.

PiperOrigin-RevId: 841687992
---
 .../xla/xla/service/gpu/transforms/nest_gemm_fusion.cc      | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
index 0a8633fd5e7b4d..3383d936a0c931 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
@@ -679,6 +679,12 @@ absl::StatusOr<BitcastParams> CalculateBitcastOfTransposeImpl(
       indices.push_back(index);
     };
 
+    if (indices.empty()) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Cannot hoist bitcast across ", transpose->ToString(),
+                       " because size-1 dims in bitcasts are not yet supported "
+                       "(b/466065483)."));
+    }
     if (indices.back() - indices.front() >= transpose_to - transpose_from ||
         !absl::c_is_sorted(indices)) {
       return absl::InvalidArgumentError(

From 144a6cb45fcdcc39f743825ea3ebcbff2a24d7c5 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Mon, 8 Dec 2025 03:45:30 -0800
Subject: [PATCH 019/753] Make use of inserted_window_dims attribute of
 scatter.

This makes the logic a bit easier. Right now, ScatterSimplifier will turn this
into the original expanded update shape with 1-sized update window dimensions.
But once our scatter emitter can handle it, we might avoid the reshape.

PiperOrigin-RevId: 841690525
---
 .../expanders/permutation_sort_expander.cc          | 11 ++---------
 .../expanders/permutation_sort_expander_test.cc     | 13 ++++++++-----
 2 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc b/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc
index 5af2a3e0056679..cfb796aef6c898 100644
--- a/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc
+++ b/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc
@@ -127,13 +127,6 @@ absl::StatusOr<HloInstruction*> PermutationSortExpander::ExpandInstruction(
       instruction->AddInstruction(HloInstruction::CreateBroadcast(
           update_shape, zero, /*broadcast_dimensions=*/{}));
 
-  // Construct the updates operand of scatter.
-  for (int64_t i = 0; i < rank; ++i) {
-    ShapeUtil::AppendMinorDimension(1, &update_shape);
-  }
-  HloInstruction* scatter_updates = instruction->AddInstruction(
-      HloInstruction::CreateReshape(update_shape, values));
-
   // Construct the updates computation, which simply replaces the operand
   // values with the update values.
   HloComputation::Builder b("update_replace_computation");
@@ -149,12 +142,12 @@ absl::StatusOr<HloInstruction*> PermutationSortExpander::ExpandInstruction(
   ScatterDimensionNumbers dim_numbers;
   dim_numbers.set_index_vector_dim(rank);
   for (int64_t i = 0; i < rank; ++i) {
-    dim_numbers.add_update_window_dims(rank + i);
+    dim_numbers.add_inserted_window_dims(i);
     dim_numbers.add_scatter_dims_to_operand_dims(i);
   }
   HloInstruction* scatter =
       instruction->AddInstruction(HloInstruction::CreateScatter(
-          values->shape(), scatter_operand, scatter_indices, scatter_updates,
+          update_shape, scatter_operand, scatter_indices, values,
           update_replace_computation, dim_numbers,
           /*indices_are_sorted=*/false, /*unique_indices=*/true));
   return instruction->AddInstruction(HloInstruction::CreateTuple(
diff --git a/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander_test.cc b/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander_test.cc
index 0351221da62b0d..329df8c95e9bd6 100644
--- a/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander_test.cc
+++ b/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander_test.cc
@@ -63,11 +63,14 @@ TEST_F(PermutationSortExpanderTest, ReplacePermutationSortWithScatter) {
 
   EXPECT_THAT(PermutationSortExpander().Run(module.get()), IsOkAndHolds(true));
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root,
-              op::Tuple(op::Iota(),
-                        op::Scatter(op::Broadcast(op::Constant()),
-                                    op::Concatenate(op::Iota(), op::Reshape()),
-                                    op::Reshape())));
+  EXPECT_THAT(
+      root, op::Tuple(op::Iota(),
+                      op::Scatter(
+                          op::Broadcast(op::Constant()),
+                          op::Concatenate(op::Iota(),
+                                          op::Reshape(op::GetTupleElement(
+                                              op::Sort(), /*tuple_index=*/1))),
+                          op::Iota())));
 }
 
 TEST_F(PermutationSortExpanderTest, DontReplaceIfWrongComparisonDirection) {

From a693e6d76f42ee719cc623b60d6b2e9fb17c9f1e Mon Sep 17 00:00:00 2001
From: Shanbin Ke <ske@nvidia.com>
Date: Mon, 8 Dec 2025 04:05:54 -0800
Subject: [PATCH 020/753] PR #34789: [XLA:GPU] Fix cuDNN SDPA test to use 0 as
 workspace size to work universally on all archs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34789

📝 Summary of Changes
use 0 as default workspace size and query later so it works universally on all archs, cuDNN paged attention reference doesn't do this like other cuDNN sdpa tests, it fails on B200 in NV internal CI. Therefore the fix.

🎯 Justification
use 0 as default workspace size and query later so it works universally on all archs, cuDNN paged attention reference doesn't do this like other cuDNN sdpa tests, it fails on B200 in NV internal CI. Therefore the fix.

🚀 Kind of Contribution
🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
None

🧪 Unit Tests:
None

🧪 Execution Tests:
None

Copybara import of the project:

--
7c53e935fcb424970da1ffed4c18a95e08835d57 by Cjkkkk <ske@nvidia.com>:

use 0 as workspace to work universally on all arch

Merging this change closes #34789

PiperOrigin-RevId: 841696508
---
 third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
index 3e984387001e6f..8a0329b6a1fbf6 100644
--- a/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gpu_fused_mha_test.cc
@@ -1372,7 +1372,7 @@ class FlashAttentionPagedAttention : public MultiHeadedAttentionTest {
     ENTRY %main.7 (Arg_0.1: bf16[1,128,2,128], Arg_1.2: bf16[1,128,2,128]) -> bf16[1,128,2,128] {
       %Arg_1.2 = bf16[1,128,2,128]{3,2,1,0} parameter(1)
       %Arg_0.1 = bf16[1,128,2,128]{3,2,1,0} parameter(0)
-      %custom-call.3 = (bf16[1,2,128,128]{3,1,2,0}, u8[256]{0}) custom-call(%Arg_0.1, %Arg_1.2, %Arg_1.2), custom_call_target="__cudnn$fmhaSoftmax", operand_layout_constraints={bf16[1,128,2,128]{3,2,1,0}, bf16[1,128,2,128]{3,2,1,0}, bf16[1,128,2,128]{3,2,1,0}}, api_version=API_VERSION_STATUS_RETURNING, backend_config={"cudnn_fmha_backend_config": {"algorithm": {"algo_id": "0", "math_type": "TENSOR_OP_MATH", "tuning_knobs": {"17": "1", "24": "0"}, "workspace_size": "0"}, "fmha_scale": 1.0, "intermediate_tensor_shape": {"element_type": "BF16", "dimensions": ["1", "2", "128", "128"], "tuple_shapes": [], "layout": {"dim_level_types": [], "dim_unique": [], "dim_ordered": [], "minor_to_major": ["3", "2", "1", "0"], "tiles": [], "element_size_in_bits": "0", "memory_space": "0", "index_primitive_type": "PRIMITIVE_TYPE_INVALID", "pointer_primitive_type": "PRIMITIVE_TYPE_INVALID", "dynamic_shape_metadata_prefix_bytes": "0"}, "is_dynamic_dimension": [false, false, false, false]}, "is_flash_attention": true, "mask_type": "NO_MASK", "bmm1_dot_dimension_numbers": {"lhs_contracting_dimensions": ["3"], "rhs_contracting_dimensions": ["3"], "lhs_batch_dimensions": ["0", "2"], "rhs_batch_dimensions": ["0", "2"]}, "bmm2_dot_dimension_numbers": {"lhs_contracting_dimensions": ["3"], "rhs_contracting_dimensions": ["1"], "lhs_batch_dimensions": ["0", "1"], "rhs_batch_dimensions": ["0", "2"]}, "dropout_rate": 0, "seed": 42, "sliding_window_length": 0, "max_seg_per_batch": 1, "is_paged_attention": false}}
+      %custom-call.3 = (bf16[1,2,128,128]{3,1,2,0}, u8[0]{0}) custom-call(%Arg_0.1, %Arg_1.2, %Arg_1.2), custom_call_target="__cudnn$fmhaSoftmax", operand_layout_constraints={bf16[1,128,2,128]{3,2,1,0}, bf16[1,128,2,128]{3,2,1,0}, bf16[1,128,2,128]{3,2,1,0}}, api_version=API_VERSION_STATUS_RETURNING, backend_config={"cudnn_fmha_backend_config": {"algorithm": {"algo_id": "0", "math_type": "TENSOR_OP_MATH", "tuning_knobs": {"17": "1", "24": "0"}, "workspace_size": "0"}, "fmha_scale": 1.0, "intermediate_tensor_shape": {"element_type": "BF16", "dimensions": ["1", "2", "128", "128"], "tuple_shapes": [], "layout": {"dim_level_types": [], "dim_unique": [], "dim_ordered": [], "minor_to_major": ["3", "2", "1", "0"], "tiles": [], "element_size_in_bits": "0", "memory_space": "0", "index_primitive_type": "PRIMITIVE_TYPE_INVALID", "pointer_primitive_type": "PRIMITIVE_TYPE_INVALID", "dynamic_shape_metadata_prefix_bytes": "0"}, "is_dynamic_dimension": [false, false, false, false]}, "is_flash_attention": true, "mask_type": "NO_MASK", "bmm1_dot_dimension_numbers": {"lhs_contracting_dimensions": ["3"], "rhs_contracting_dimensions": ["3"], "lhs_batch_dimensions": ["0", "2"], "rhs_batch_dimensions": ["0", "2"]}, "bmm2_dot_dimension_numbers": {"lhs_contracting_dimensions": ["3"], "rhs_contracting_dimensions": ["1"], "lhs_batch_dimensions": ["0", "1"], "rhs_batch_dimensions": ["0", "2"]}, "dropout_rate": 0, "seed": 42, "sliding_window_length": 0, "max_seg_per_batch": 1, "is_paged_attention": false}}
       %get-tuple-element.4.0 = bf16[1,2,128,128]{3,1,2,0} get-tuple-element(%custom-call.3), index=0
       ROOT %bitcast.6.0 = bf16[1,128,2,128]{3,2,1,0} bitcast(%get-tuple-element.4.0)
     }

From f745573a52d8348648048d42817c66579536338c Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Mon, 8 Dec 2025 04:07:20 -0800
Subject: [PATCH 021/753] [XLA:CPU/GPU][XTile] Fix not instruction for non-pred
 types.

PiperOrigin-RevId: 841696951
---
 .../xla/xla/backends/gpu/codegen/triton/BUILD |  1 +
 .../gpu/codegen/triton/emitter_helpers.cc     | 14 +++++++++-
 .../gpu/codegen/triton/emitter_helpers.h      |  5 +---
 .../triton/fusion_emitter_device_test.cc      | 26 +++++++++++++++++++
 4 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index 19587ed10c4d4a..ce2212b64621e9 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -132,6 +132,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:ArithUtils",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MathDialect",
         "@llvm-project//mlir:Support",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc b/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc
index c81bfdb35696c5..f73c72bcf7873a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/MathExtras.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Builders.h"
@@ -156,6 +157,17 @@ absl::StatusOr<TensorValue> EmitNestedFusion(
 
   return EmitScope(b, to_emit, region_values);
 }
+
+// Get a constant with all high bits of the same type as provided.
+mlir::Value OnesLike(mlir::ImplicitLocOpBuilder& b, mlir::Type type) {
+  mlir::Type element_type = mlir::getElementTypeOrSelf(type);
+  CHECK(element_type.isInteger()) << "OnesLike only supports integer types.";
+
+  int64_t width = element_type.getIntOrFloatBitWidth();
+  mlir::APInt all_ones = mlir::APInt::getAllOnes(width);
+  return mlir::createScalarOrSplatConstant(b, b.getLoc(), type, all_ones);
+}
+
 }  // namespace
 
 SmallVector<int64_t> GetPaddedTileSizes(ArrayRef<int64_t> tile_sizes) {
@@ -425,7 +437,7 @@ absl::StatusOr<Value> EmitElementwise(mlir::ImplicitLocOpBuilder& b,
     case HloOpcode::kFloor:
       return mm::FloorOp::create(b, inputs[0]);
     case HloOpcode::kNot:
-      return ma::XOrIOp::create(b, inputs[0], OnesLike(b, inputs[0]));
+      return ma::XOrIOp::create(b, inputs[0], OnesLike(b, inputs[0].getType()));
     case HloOpcode::kNegate:
       // NegFOp is not supported by Triton.
       return Subtract(b, {ZerosLike(b, inputs[0]), inputs[0]});
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.h b/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.h
index 89ed1ef978bb52..5d1dfee338123f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.h
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Types.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -197,10 +198,6 @@ inline mlir::Value ZerosLike(mlir::ImplicitLocOpBuilder& b, mlir::Value x) {
   return ConstLike(b, x, 0);
 }
 
-inline mlir::Value OnesLike(mlir::ImplicitLocOpBuilder& b, mlir::Value x) {
-  return ConstLike(b, x, 1);
-}
-
 bool IsFp8Type(mlir::Type t);
 
 // Triton type conversions.
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
index 07605fb6d34987..99112cd3bf51b3 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
@@ -420,6 +420,32 @@ CHECK: arith.divsi {{.*}} : i32
   EXPECT_TRUE(RunAndCompareNoHloPasses(kHloText, kExactMatch));
 }
 
+TEST_F(TritonEmitterTest, BitwiseNotIsEmittedCorrectly) {
+  constexpr absl::string_view kHloText = R"(
+HloModule m
+
+fused_not {
+  param_0 = s32[100] parameter(0)
+  ROOT not = s32[100] not(param_0)
+}
+
+ENTRY main {
+  p0 = s32[100] parameter(0)
+  ROOT not = s32[100] fusion(p0), kind=kCustom, calls=fused_not,
+    backend_config={"fusion_backend_config":{
+      "kind":"__triton",
+      "block_level_fusion_config":{
+        "num_warps":"1","output_tiles":[{"sizes":[100]}],
+        "num_ctas":1,"num_stages":1,"is_tma_allowed":false}}}
+}
+)";
+  TF_EXPECT_OK(CreateTritonIrAndFileCheck(this, kHloText, "fused_not", R"(
+CHECK: arith.constant dense<-1>
+CHECK: arith.xori
+)"));
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloText, kExactMatch));
+}
+
 TEST_F(TritonEmitterTest, ReductionOnMinormostAxisIsEmittedCorrectly) {
   constexpr absl::string_view kHloText = R"(
 HloModule m

From de1e81c3c327409d551af8973ff5eaadf4acf9a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eusebio=20Dur=C3=A1n=20Monta=C3=B1a?= <eusebiodm@google.com>
Date: Mon, 8 Dec 2025 04:08:12 -0800
Subject: [PATCH 022/753] Add missing BUILD dependencies, and remove unused
 ones

PiperOrigin-RevId: 841697193
---
 third_party/xla/xla/backends/cpu/BUILD              |  1 -
 third_party/xla/xla/backends/cpu/autotuner/BUILD    |  3 ---
 .../backends/cpu/codegen/emitters/transforms/BUILD  |  2 --
 third_party/xla/xla/backends/cpu/runtime/BUILD      |  2 --
 third_party/xla/xla/backends/gpu/codegen/llvm/BUILD |  1 -
 third_party/xla/xla/backends/gpu/collectives/BUILD  |  3 ---
 third_party/xla/xla/backends/gpu/runtime/BUILD      |  9 ---------
 .../xla/xla/backends/profiler/subprocess/BUILD      |  4 ----
 .../xla/xla/codegen/emitters/transforms/BUILD       |  2 --
 third_party/xla/xla/codegen/xtile/ir/BUILD          |  1 -
 third_party/xla/xla/pjrt/distributed/BUILD          | 13 -------------
 third_party/xla/xla/service/BUILD                   |  1 -
 third_party/xla/xla/service/gpu/BUILD               |  1 -
 third_party/xla/xla/stream_executor/BUILD           |  3 ---
 third_party/xla/xla/stream_executor/cuda/BUILD      |  1 -
 third_party/xla/xla/stream_executor/gpu/BUILD       |  1 -
 third_party/xla/xla/tsl/framework/BUILD             |  1 -
 third_party/xla/xla/tsl/profiler/rpc/BUILD          |  6 ------
 third_party/xla/xla/util/BUILD                      |  1 -
 19 files changed, 56 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/BUILD b/third_party/xla/xla/backends/cpu/BUILD
index 7d9e7ad114aba0..05df8d4e5fd66d 100644
--- a/third_party/xla/xla/backends/cpu/BUILD
+++ b/third_party/xla/xla/backends/cpu/BUILD
@@ -67,7 +67,6 @@ cc_library(
         "//xla/ffi",
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
-        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/cpu/autotuner/BUILD b/third_party/xla/xla/backends/cpu/autotuner/BUILD
index e5bc7335a1a082..16640e22a3a8f5 100644
--- a/third_party/xla/xla/backends/cpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/cpu/autotuner/BUILD
@@ -24,13 +24,10 @@ cc_library(
         "//xla/service:compiler",
         "//xla/service:executable",
         "//xla/stream_executor:platform_manager",
-        "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/host:host_platform",
         "//xla/tools:hlo_decomposer_lib",
         "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:status",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
     ],
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/BUILD b/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/BUILD
index 1f74e57abe45a5..123e0394dcd297 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/BUILD
@@ -61,9 +61,7 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TransformUtils",
-        "@llvm-project//mlir:UBDialect",
         "@llvm-project//mlir:VectorDialect",
-        "@llvm-project//mlir:VectorUtils",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/cpu/runtime/BUILD b/third_party/xla/xla/backends/cpu/runtime/BUILD
index 8d68edd2d6e12e..e1e85a5cd4675a 100644
--- a/third_party/xla/xla/backends/cpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/BUILD
@@ -986,8 +986,6 @@ xla_cc_test(
         "//xla:literal_util",
         "//xla/runtime:work_group",
         "//xla/service:buffer_assignment",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:launch_dim",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
index 43cd45f395a9cc..4f7e246607e1c0 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
@@ -86,7 +86,6 @@ cc_library(
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/llvm_ir:loop_emitter",
         "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:status",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/backends/gpu/collectives/BUILD b/third_party/xla/xla/backends/gpu/collectives/BUILD
index 975166a9b64b34..6859a56e88c0fa 100644
--- a/third_party/xla/xla/backends/gpu/collectives/BUILD
+++ b/third_party/xla/xla/backends/gpu/collectives/BUILD
@@ -150,7 +150,6 @@ cc_library(
         "//xla/service:rendezvous",
         "@com_google_absl//absl/container:btree",
         "@com_google_absl//absl/log",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -166,11 +165,9 @@ xla_cc_test(
         ":gpu_clique_rendezvous",
         "//xla/core/collectives:rank_id",
         "//xla/runtime:device_id",
-        "//xla/service:rendezvous",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_main",
-        "@com_google_absl//absl/status:statusor",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 192c99de9902a6..2c79ce55668f95 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -87,7 +87,6 @@ cc_library(
         "//xla:types",
         "//xla:util",
         "//xla/backends/gpu/collectives:gpu_clique_key",
-        "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/core/collectives:communicator",
         "//xla/ffi:attribute_map",
         "//xla/ffi:call_frame",
@@ -1723,7 +1722,6 @@ cc_library(
         "//xla:util",
         "//xla/backends/gpu/collectives:gpu_clique_key",
         "//xla/core/collectives:communicator",
-        "//xla/core/collectives:rank_id",
         "//xla/runtime:device_id",
         "//xla/service:collective_ops_utils",
         "//xla/tsl/platform:statusor",
@@ -1739,7 +1737,6 @@ cc_library(
     srcs = ["collective_thunk.cc"],
     hdrs = ["collective_thunk.h"],
     deps = [
-        ":collective_cliques",
         ":collective_execution",
         ":collective_params",
         ":thunk",
@@ -1751,14 +1748,12 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/gpu/collectives:gpu_clique_key",
-        "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
         "//xla/hlo/ir:collective_op_group_mode",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
-        "//xla/service:collective_ops_utils",
         "//xla/service:computation_placer",
         "//xla/service:rendezvous",
         "//xla/service/gpu:buffer_allocations",
@@ -2025,8 +2020,6 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/gpu:collective_kernel_metadata",
-        "//xla/stream_executor/gpu:gpu_executor_header",
-        "//xla/stream_executor/gpu:multicast_memory",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
@@ -2151,7 +2144,6 @@ cc_library(
         "//xla/ffi:execution_context",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:buffer_use",
-        "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
         "//xla/service/gpu:backend_configs_cc",
@@ -3608,7 +3600,6 @@ cc_library(
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:kernel",
         "//xla/stream_executor:kernel_args",
-        "//xla/stream_executor:kernel_argument_packing_spec",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/backends/profiler/subprocess/BUILD b/third_party/xla/xla/backends/profiler/subprocess/BUILD
index 1a62071beafea5..4ca882ae468219 100644
--- a/third_party/xla/xla/backends/profiler/subprocess/BUILD
+++ b/third_party/xla/xla/backends/profiler/subprocess/BUILD
@@ -59,7 +59,6 @@ cc_library(
         "@com_github_grpc_grpc//:grpc++",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/log",
-        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -87,7 +86,6 @@ xla_cc_test(
         ":subprocess_registry",
         "//xla/backends/profiler:profiler_backends",  # buildcleaner: keep
         "//xla/tsl/platform:env",
-        "//xla/tsl/platform:resource_loader",
         "//xla/tsl/platform:subprocess",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_main",
@@ -100,8 +98,6 @@ xla_cc_test(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/platform",
-        "@local_tsl//tsl/platform:env",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/profiler/lib:profiler_session",
         "@local_tsl//tsl/profiler/lib:traceme",
diff --git a/third_party/xla/xla/codegen/emitters/transforms/BUILD b/third_party/xla/xla/codegen/emitters/transforms/BUILD
index 429feecc9eb982..4f910f30935a01 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/BUILD
+++ b/third_party/xla/xla/codegen/emitters/transforms/BUILD
@@ -82,7 +82,6 @@ cc_library(
         "//xla/codegen/intrinsic:tanh",
         "//xla/codegen/intrinsic:type",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/mlir_hlo",
         "//xla/mlir_hlo:map_mhlo_to_scalar_op",
         "//xla/service/gpu:ir_emission_utils",
@@ -91,7 +90,6 @@ cc_library(
         "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/stream_executor/rocm:rocm_compute_capability",
         "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:status",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log:check",
diff --git a/third_party/xla/xla/codegen/xtile/ir/BUILD b/third_party/xla/xla/codegen/xtile/ir/BUILD
index de986615da6c0e..58f8c97439da40 100644
--- a/third_party/xla/xla/codegen/xtile/ir/BUILD
+++ b/third_party/xla/xla/codegen/xtile/ir/BUILD
@@ -105,7 +105,6 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:InferTypeOpInterface",
         "@llvm-project//mlir:InliningUtils",
-        "@llvm-project//mlir:LinalgDialect",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:SideEffectInterfaces",
         "@llvm-project//mlir:Support",
diff --git a/third_party/xla/xla/pjrt/distributed/BUILD b/third_party/xla/xla/pjrt/distributed/BUILD
index 710dfc9f0208a5..4f5e5356f6a343 100644
--- a/third_party/xla/xla/pjrt/distributed/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/BUILD
@@ -20,8 +20,6 @@ cc_library(
     srcs = ["service.cc"],
     hdrs = ["service.h"],
     deps = [
-        ":topology_util",
-        ":util",
         "//xla:types",
         "//xla:util",
         "//xla/tsl/distributed_runtime/coordination:coordination_service",
@@ -31,14 +29,11 @@ cc_library(
         "@com_github_grpc_grpc//:grpc++",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log",
-        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
         "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:random",
     ],
 )
 
@@ -70,7 +65,6 @@ cc_library(
     ],
     deps = [
         ":key_value_store_interface",
-        ":util",
         "//xla/runtime:device_id",
         "//xla/tsl/distributed_runtime/coordination:coordination_client",
         "//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
@@ -87,8 +81,6 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -153,7 +145,6 @@ xla_cc_test(
         ":topology_util",
         "//xla:status_macros",
         "//xla/runtime:device_id",
-        "//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
@@ -170,10 +161,6 @@ xla_cc_test(
         "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test",
     ],
 )
 
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index fb4b60aedea3f1..ddf008d75b3c80 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -3973,7 +3973,6 @@ cc_library(
         "//xla:literal",
         "//xla:shape_util",
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/strings:string_view",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 038abc6f3c5dd6..98e1675e56ab9a 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -3265,7 +3265,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:collective_ops_utils",
-        "//xla/service:collective_permute_decomposer",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:latency_hiding_scheduler",
         "//xla/service:profile_guided_latency_estimator",
diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index f00f783257bce6..195ec82c4d2777 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -77,7 +77,6 @@ cc_library(
     name = "device_address",
     hdrs = ["device_address.h"],
     deps = [
-        "//xla/tsl/platform:logging",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log:check",
     ],
@@ -716,8 +715,6 @@ cc_library(
     name = "kernel",
     hdrs = ["kernel.h"],
     deps = [
-        ":device_address",
-        ":device_memory",
         ":kernel_args",
         ":kernel_metadata",
         ":launch_dim",
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index f173369799956f..ba0403fbed0832 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1035,7 +1035,6 @@ xla_cc_test(
         "notsan",
     ],
     deps = [
-        ":compilation_provider",
         ":cuda_compute_capability",
         ":nvjitlink",
         ":nvjitlink_support",
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index ddabc85bce9692..52e926f2befb72 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -171,7 +171,6 @@ cc_library(
     hdrs = ["gpu_executor.h"],
     deps = [
         ":multicast_memory",
-        "//xla/stream_executor:device_address",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_common",
         "//xla/stream_executor:stream_executor_h",
diff --git a/third_party/xla/xla/tsl/framework/BUILD b/third_party/xla/xla/tsl/framework/BUILD
index 6983cd1250f1ba..2c058abf55f2dd 100644
--- a/third_party/xla/xla/tsl/framework/BUILD
+++ b/third_party/xla/xla/tsl/framework/BUILD
@@ -413,7 +413,6 @@ tsl_cc_test(
         ":cancellation",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:env_impl",  # buildcleaner: keep
-        "//xla/tsl/platform:status",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/tsl/profiler/rpc/BUILD b/third_party/xla/xla/tsl/profiler/rpc/BUILD
index 523db019d51d6f..fa081c98557a7e 100644
--- a/third_party/xla/xla/tsl/profiler/rpc/BUILD
+++ b/third_party/xla/xla/tsl/profiler/rpc/BUILD
@@ -35,21 +35,15 @@ cc_library(
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:macros",
-        "//xla/tsl/platform:status",
-        "//xla/tsl/platform:types",
         "//xla/tsl/profiler/rpc/client:save_profile",
-        "//xla/tsl/profiler/utils:file_system_utils",
         "//xla/tsl/profiler/utils:math_utils",
         "//xla/tsl/profiler/utils:profiler_options_util",
         "//xla/tsl/profiler/utils:time_utils",
         "//xla/tsl/profiler/utils:xplane_utils",
         "@com_github_grpc_grpc//:grpc++",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
-        "@local_tsl//tsl/platform:platform_port",
         "@local_tsl//tsl/profiler/lib:profiler_session",
         "@local_tsl//tsl/profiler/protobuf:profiler_service_cc_grpc_proto",
         "@local_tsl//tsl/profiler/protobuf:profiler_service_proto_cc",
diff --git a/third_party/xla/xla/util/BUILD b/third_party/xla/xla/util/BUILD
index 3e503bd2dded2e..a83c965f8c16af 100644
--- a/third_party/xla/xla/util/BUILD
+++ b/third_party/xla/xla/util/BUILD
@@ -28,7 +28,6 @@ cc_library(
     deps = [
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status:statusor",
         "@dlpack",
     ],

From 0dc10cd4e296a41f229a8b9196e127c7576ed30f Mon Sep 17 00:00:00 2001
From: Chenhao Jiang <chenhaoj@nvidia.com>
Date: Mon, 8 Dec 2025 04:10:46 -0800
Subject: [PATCH 023/753] PR #34917: Turn on the scatter determinism expander
 by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34917

📝 Summary of Changes
Enable xla_gpu_enable_scatter_determinism_expander flag by default (change from false to true).
🎯 Justification
The scatter determinism expander provides significant performance improvements for deterministic scatter operations (up to 9000x speedup for certain input sizes compared to the sequential while-loop approach).
With recent fixes for batched scatter support and proper handling of scatter_dims_to_operand_dims, the pass is now robust enough to be enabled by default.
Users who experience issues can still disable it with --xla_gpu_enable_scatter_determinism_expander=false.
🚀 Kind of Contribution
⚡️ Performance Improvement
🧪 Unit Tests
All existing scatter tests pass with the flag enabled by default:
//xla/service:scatter_determinism_expander_test
//xla/tests:scatter_test
Copybara import of the project:

--
0bb296398991b3de5a4d15f45fd4e80f52880852 by Chenhao Jiang <chenhaoj@nvidia.com>:

Turn on the scatter determinism expander by default

Merging this change closes #34917

PiperOrigin-RevId: 841697954
---
 third_party/xla/xla/debug_options_flags.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index c435337dda368a..fe8c14f18dbd8c 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -441,7 +441,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_enable_fast_math(false);
   opts.set_xla_gpu_experimental_parallel_collective_overlap_limit(1);
   opts.set_xla_pjrt_allow_auto_layout_in_hlo(false);
-  opts.set_xla_gpu_enable_scatter_determinism_expander(false);
+  opts.set_xla_gpu_enable_scatter_determinism_expander(true);
   opts.set_xla_gpu_unsupported_enable_ragged_all_to_all_decomposer(false);
   opts.set_xla_gpu_unsupported_use_all_reduce_one_shot_kernel(false);
   opts.set_xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel(true);

From bb8f2561f9e137b93a405525b888687971748611 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eetu=20Sj=C3=B6blom?= <eetu.sjoblom@amd.com>
Date: Mon, 8 Dec 2025 04:12:13 -0800
Subject: [PATCH 024/753] PR #34956:  [ROCm] flush rocprofiler buffer when
 disabling RocmTracer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34956

Fixes a bug in RocmTracer where events not reaching the rocprofiler watermark are not captured when disabling the tracer. Happens e.g. when the workload is very small.

Added an explicit buffer flush in and a relevant test that fails without the flush: `//xla/backends/profiler/gpu:rocm_tracer_test`

🚀 Kind of Contribution
🐛 Bug Fix
Copybara import of the project:

--
7d27ae5615c5dd1ba244e6b55b16200ff7f45d2c by Eetu Sjöblom <eetu.sjoblom@amd.com>:

flush rocprofiler buffer when disabling RocmTracer

Merging this change closes #34956

PiperOrigin-RevId: 841698424
---
 .../xla/xla/backends/profiler/gpu/BUILD       |  1 +
 .../xla/backends/profiler/gpu/rocm_tracer.cc  |  4 ++
 .../backends/profiler/gpu/rocm_tracer_test.cc | 69 +++++++++++++++++++
 3 files changed, 74 insertions(+)

diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD
index fefd3b9b992862..1a559898f65e3e 100644
--- a/third_party/xla/xla/backends/profiler/gpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/gpu/BUILD
@@ -533,6 +533,7 @@ xla_cc_test(
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
+        "@local_config_rocm//rocm:hip",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
     ],
 )
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc
index 40f0e0e96cfbe9..a15f2e4bb690d1 100644
--- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc
@@ -499,6 +499,10 @@ void RocmTracer::toolFinalize(void* tool_data) {
 }
 
 void RocmTracer::Disable() {
+  rocprofiler_status_t status = rocprofiler_flush_buffer(buffer_);
+  if (status != ROCPROFILER_STATUS_SUCCESS) {
+    LOG(WARNING) << "rocprofiler_flush_buffer failed with error " << status;
+  }
   absl::MutexLock lock(collector_mutex_);
   collector_->Flush();
   collector_ = nullptr;
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc
index d8ad1392738d20..d03bb15dc80527 100644
--- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc
@@ -18,10 +18,12 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <vector>
 
 #include <gtest/gtest.h>
 #include "absl/log/log.h"
 #include "absl/strings/string_view.h"
+#include "rocm/include/hip/hip_runtime.h"
 #include "xla/backends/profiler/gpu/rocm_collector.h"
 #include "xla/backends/profiler/gpu/rocm_tracer_utils.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
@@ -124,6 +126,73 @@ TEST(RocmTracerTest, AnnotationMapWorks) {
   EXPECT_EQ(result, annotation);
 }
 
+// Simple collector that tracks received events for verification.
+class EventCapturingCollector : public RocmTraceCollector {
+ public:
+  EventCapturingCollector() : RocmTraceCollector(MakeCollectorOptions()) {}
+
+  void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) override {
+    event_count_++;
+  }
+
+  void OnEventsDropped(const std::string& reason,
+                       uint32_t num_events) override {}
+  void Flush() override {}
+  void Export(tsl::profiler::XSpace* space) override {}
+
+  int event_count() const { return event_count_; }
+
+ private:
+  static RocmTraceCollectorOptions MakeCollectorOptions() {
+    RocmTraceCollectorOptions options;
+    options.max_callback_api_events = 2 * 1024 * 1024;
+    options.max_activity_api_events = 2 * 1024 * 1024;
+    options.max_annotation_strings = 1024 * 1024;
+    options.num_gpus = RocmTracer::GetRocmTracerSingleton().NumGpus();
+    return options;
+  }
+  int event_count_ = 0;
+};
+
+std::unique_ptr<EventCapturingCollector> CreateEventCapturingCollector() {
+  return std::make_unique<EventCapturingCollector>();
+}
+
+TEST(RocmTracerTest, CapturesHipEvents) {
+#define HIP_ASSERT_OK(expr) ASSERT_EQ((expr), hipSuccess) << #expr " failed"
+
+  int device_count = 0;
+  HIP_ASSERT_OK(hipGetDeviceCount(&device_count));
+  ASSERT_GT(device_count, 0) << "No HIP devices available";
+
+  auto collector = CreateEventCapturingCollector();
+  EventCapturingCollector* collector_ptr = collector.get();
+
+  RocmTracer& tracer = RocmTracer::GetRocmTracerSingleton();
+  RocmTracerOptions tracer_options{/*max_annotation_strings=*/1024 * 1024};
+  tracer.Enable(tracer_options, collector.get());
+
+  constexpr size_t kNumFloats = 1024;
+  constexpr size_t kSize = kNumFloats * sizeof(float);
+  std::vector<float> host_data(kNumFloats, 1.0f);
+  void* device_data = nullptr;
+
+  HIP_ASSERT_OK(hipMalloc(&device_data, kSize));
+  HIP_ASSERT_OK(
+      hipMemcpy(device_data, host_data.data(), kSize, hipMemcpyHostToDevice));
+  HIP_ASSERT_OK(
+      hipMemcpy(host_data.data(), device_data, kSize, hipMemcpyDeviceToHost));
+  HIP_ASSERT_OK(hipDeviceSynchronize());
+
+  tracer.Disable();
+  hipFree(device_data);
+
+#undef HIP_ASSERT_OK
+
+  EXPECT_GT(collector_ptr->event_count(), 0)
+      << "Expected to capture at least one trace event";
+}
+
 }  // namespace
 }  // namespace profiler
 }  // namespace xla

From c768540f431880e29c481a2609b31c2a70c69b1e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 06:28:49 -0800
Subject: [PATCH 025/753] Automated Code Change

PiperOrigin-RevId: 841737217
---
 third_party/xla/xla/hlo/pass/BUILD                 | 2 ++
 third_party/xla/xla/hlo/pass/hlo_pass_interface.cc | 2 ++
 third_party/xla/xla/hlo/pass/hlo_pass_interface.h  | 1 +
 3 files changed, 5 insertions(+)

diff --git a/third_party/xla/xla/hlo/pass/BUILD b/third_party/xla/xla/hlo/pass/BUILD
index 5a64c36e4596ad..4dff3438a1cd05 100644
--- a/third_party/xla/xla/hlo/pass/BUILD
+++ b/third_party/xla/xla/hlo/pass/BUILD
@@ -38,6 +38,7 @@ cc_library(
         "//xla/hlo/ir:hlo_module_group",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/hash",
         "@com_google_absl//absl/log",
@@ -118,6 +119,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/ir:hlo_module_group",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/hlo/pass/hlo_pass_interface.cc b/third_party/xla/xla/hlo/pass/hlo_pass_interface.cc
index bec1de8aaaa219..be0eb44c285037 100644
--- a/third_party/xla/xla/hlo/pass/hlo_pass_interface.cc
+++ b/third_party/xla/xla/hlo/pass/hlo_pass_interface.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "xla/hlo/pass/hlo_pass_interface.h"
 
+#include <memory>
+
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
diff --git a/third_party/xla/xla/hlo/pass/hlo_pass_interface.h b/third_party/xla/xla/hlo/pass/hlo_pass_interface.h
index cfbe9723201e1f..fb43ac39280e8a 100644
--- a/third_party/xla/xla/hlo/pass/hlo_pass_interface.h
+++ b/third_party/xla/xla/hlo/pass/hlo_pass_interface.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <memory>
 #include <string>
 
+#include "absl/base/attributes.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"

From 181d057de8d1327e88f3531a99bf08fd0e0f6373 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eusebio=20Dur=C3=A1n=20Monta=C3=B1a?= <eusebiodm@google.com>
Date: Mon, 8 Dec 2025 06:30:50 -0800
Subject: [PATCH 026/753] Set up internal presubmit for unused/extra
 dependencies.

PiperOrigin-RevId: 841737824
---
 third_party/xla/xla/BUILD | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index 151c46c8df3408..d07d329e7c2dca 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -1,6 +1,7 @@
 load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("//third_party/compute_library:build_defs.bzl", "if_enable_acl")
 
+# copybara:uncomment load("//devtools/build_cleaner/skylark:action_config_test.bzl", "action_config_test")
 # copybara:uncomment load("@rules_python//python:proto.bzl", "py_proto_library")
 load("//xla:package_groups.bzl", "xla_package_groups")
 load("//xla:xla.default.bzl", "xla_bzl_library", "xla_cc_test", "xla_py_proto_library")
@@ -1286,7 +1287,6 @@ xla_cc_test(
             "//xla/tsl/platform:env",
             "//xla/tsl/platform:test",
             "//xla/tsl/util:command_line_flags",
-            "@com_google_absl//absl/base:nullability",
             "@com_google_absl//absl/container:flat_hash_map",
             "@com_google_absl//absl/status",
             "@com_google_absl//absl/status:status_matchers",
@@ -1408,6 +1408,11 @@ cc_library(
 #     visibility = internal_visibility([":friends"]),
 #     deps = [":xla_proto"],
 # )
+#
+# action_config_test(
+#     name = "build_cleaner_spec_test",
+#     src = "build_cleaner_spec.textproto",
+# )
 # copybara:uncomment_end
 
 cc_library(

From 9981c805dcf2114836c8ea5eae4dc589ff4855fc Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Mon, 8 Dec 2025 06:36:09 -0800
Subject: [PATCH 027/753] [XLA:GPU] Improve error messages when we fail to tile
 a fusion.

PiperOrigin-RevId: 841739377
---
 .../xla/xla/codegen/tiling/symbolic_tile_analysis.cc   |  4 ++--
 .../xla/xla/service/gpu/transforms/nest_gemm_fusion.cc | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc
index af166ae57df5cf..0ac831f03114a3 100644
--- a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc
+++ b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc
@@ -459,8 +459,8 @@ FusionDecision ShouldProceedWithSymbolicTileDerivation(
         SymbolicTile::FromIndexingMap(reshape_indexing_map);
 
     if (!reshape_symbolic_tile.has_value()) {
-      return FusionDecision::Forbid("Bailing out on reshape ")
-             << hlo->ToString() << " with indexing map "
+      return FusionDecision::Forbid("Bailing out on reshape")
+             << " " << hlo->ToString() << " with indexing map "
              << ToString(reshape_indexing_map);
     }
   }
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
index 3383d936a0c931..dc972406913fc3 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
@@ -1248,13 +1248,13 @@ absl::StatusOr<BlockLevelParameters> FindBlockLevelParameters(
       SymbolicTileAnalysis::AnalyzeComputation(
           *computation, ctx,
           TritonEmitterConstraints::GetBuilder(device_description));
-  if (std::holds_alternative<FusionDecision>(analysis_or)) {
+
+  if (const auto* fusion_decision = std::get_if<FusionDecision>(&analysis_or)) {
     std::unique_ptr<HloModule> extracted_computation_module =
         ExtractInstructionIntoNewModule(*computation->FusionInstruction());
-    return absl::InternalError(
-        absl::StrCat("Failed to analyze the computation (",
-                     std::get<FusionDecision>(analysis_or).Explain(),
-                     "): ", extracted_computation_module->ToString()));
+    return absl::InternalError(absl::StrCat(
+        "Failed to analyze the computation (", fusion_decision->Explain(),
+        "):\n", extracted_computation_module->ToString()));
   }
 
   auto& analysis = std::get<SymbolicTileAnalysis>(analysis_or);

From 9baba425e7bfdd4b20ff35a8526abdf9488fdbba Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Mon, 8 Dec 2025 07:14:46 -0800
Subject: [PATCH 028/753] [XLA] Print backend config in
 HloPrintOptions::ShortParsable()

The backend config carries semantic information. While "ShortParsable" is intended to be compact, it should be semantically equivalent to the default print style.

PiperOrigin-RevId: 841750733
---
 third_party/xla/xla/hlo/ir/hlo_print_options.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_print_options.h b/third_party/xla/xla/hlo/ir/hlo_print_options.h
index cd4a72bb176dd8..87eca03d396113 100644
--- a/third_party/xla/xla/hlo/ir/hlo_print_options.h
+++ b/third_party/xla/xla/hlo/ir/hlo_print_options.h
@@ -96,7 +96,6 @@ class HloPrintOptions {
         .set_print_large_constants(true)
         .set_print_subcomputation_mode(PrintSubcomputationMode::kNameOnly)
         .set_print_metadata(false)
-        .set_print_backend_config(false)
         .set_print_operand_shape(false)
         .set_print_operand_index_annotation_interval(0)
         .set_print_program_shape(false)

From 0df3891980edf719f6ff2b3b6f806db84aa3812f Mon Sep 17 00:00:00 2001
From: Raffi Khatchadourian <khatchad@hunter.cuny.edu>
Date: Mon, 8 Dec 2025 10:53:08 -0500
Subject: [PATCH 029/753] Update default `dtype` description in
 `ragged_factory_ops.py`

Clarified default `dtype` for `RaggedTensor` when `pylist` is empty. Fixes https://github.com/tensorflow/tensorflow/issues/105858.
---
 tensorflow/python/ops/ragged/ragged_factory_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
index 55505df533d447..a21d85eca16fb5 100644
--- a/tensorflow/python/ops/ragged/ragged_factory_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -61,7 +61,8 @@ def constant(
       compatible with `dtype`.
     dtype: The type of elements for the returned `RaggedTensor`.  If not
       specified, then a default is chosen based on the scalar values in
-      `pylist`.
+      `pylist`. If there are no scalar values in `pylist`, then the default
+      is `tf.float32`.
     ragged_rank: An integer specifying the ragged rank of the returned
       `RaggedTensor`.  Must be nonnegative and less than `K`. Defaults to
       `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to

From cc26e9a554f13dc82985f52ea5f41320389884ca Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Mon, 8 Dec 2025 07:55:34 -0800
Subject: [PATCH 030/753] [XLA:GPU] Keep explanation and location separate in
 FusionDecision.

Current logic didn't work nicely for streaming operator (<<), because it was concatenating a new location at every call and resulted in unreadable error message.

This change also adds a SourceLocationHolder to limit `#if defined(PLATFORM_GOOGLE)` usage.

PiperOrigin-RevId: 841764270
---
 third_party/xla/xla/service/BUILD             |  1 +
 .../xla/xla/service/instruction_fusion.cc     | 12 ---
 .../xla/xla/service/instruction_fusion.h      | 90 +++++++++----------
 3 files changed, 44 insertions(+), 59 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index ddf008d75b3c80..b8b6a62ec06674 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1977,6 +1977,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:macros",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
diff --git a/third_party/xla/xla/service/instruction_fusion.cc b/third_party/xla/xla/service/instruction_fusion.cc
index 1d9b568f88f9f0..f4e84b7e585969 100644
--- a/third_party/xla/xla/service/instruction_fusion.cc
+++ b/third_party/xla/xla/service/instruction_fusion.cc
@@ -56,18 +56,6 @@ limitations under the License.
 #include "xla/util.h"
 
 namespace xla {
-
-#if defined(PLATFORM_GOOGLE)
-FusionDecision::FusionDecision(bool decision,
-                               absl::SourceLocation source_location) {
-  if (!decision) {
-    explanation_ =
-        absl::StrCat("Not fusing: due to ", source_location.file_name(), ":",
-                     source_location.line());
-  }
-}
-#endif  // PLATFORM_GOOGLE
-
 namespace {
 
 // These nodes can always be duplicated into consumers, even if
diff --git a/third_party/xla/xla/service/instruction_fusion.h b/third_party/xla/xla/service/instruction_fusion.h
index 85ff4dde04035c..d5ad1b7c17e1a6 100644
--- a/third_party/xla/xla/service/instruction_fusion.h
+++ b/third_party/xla/xla/service/instruction_fusion.h
@@ -21,18 +21,14 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
-#include <utility>
 #include <vector>
 
-#include "absl/container/flat_hash_map.h"
-#include "absl/container/flat_hash_set.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/service/hlo_module_config.h"
-#include "tsl/platform/macros.h"
 // The source_location.h is not available in open source.
 #if defined(PLATFORM_GOOGLE)
 #include "absl/types/source_location.h"
@@ -54,6 +50,29 @@ struct InPlaceFusionOptions {
   bool relax_multiple_non_elementwise_ops = false;
 };
 
+// A holder for the source location. absl::SourceLocation is not available in
+// open source, so we have a stub implementation to limit
+// #if define(PLATFORM_GOOGLE).
+class SourceLocationHolder {
+ public:
+#if defined(PLATFORM_GOOGLE)
+  explicit constexpr SourceLocationHolder(
+      absl::SourceLocation source_location = absl::SourceLocation::current())
+      : source_location_(source_location) {}
+
+  std::string ToString() const {
+    return absl::StrCat(" at: ", source_location_.file_name(), ":",
+                        source_location_.line());
+  }
+
+ private:
+  absl::SourceLocation source_location_;
+#else
+  SourceLocationHolder() = default;
+  std::string ToString() const { return ""; }
+#endif  // PLATFORM_GOOGLE
+};
+
 // Propagating explanation of fusion decisions: if something could not be fused,
 // explain the reason.
 class FusionDecision {
@@ -61,34 +80,29 @@ class FusionDecision {
   static FusionDecision Allow() { return FusionDecision(); }
   FusionDecision(const FusionDecision& decision) = default;
 
-#if defined(PLATFORM_GOOGLE)
-  static std::string LocToString(absl::SourceLocation source_location) {
-    return absl::StrCat(" at: ", source_location.file_name(), ":",
-                        source_location.line());
-  }
   static FusionDecision Forbid(
       absl::string_view explanation,
-      absl::SourceLocation source_location = absl::SourceLocation::current()) {
-    return FusionDecision(
-        absl::StrCat(explanation, LocToString(source_location)));
+      SourceLocationHolder source_location = SourceLocationHolder()) {
+    return FusionDecision(false, explanation, source_location);
   }
 
   // If condition is `true` means that we CAN fuse. In that case, explanation is
   // discarded.
   FusionDecision(
       bool condition, absl::string_view explanation,
-      absl::SourceLocation source_location = absl::SourceLocation::current()) {
+      SourceLocationHolder source_location = SourceLocationHolder()) {
     if (!condition) {
-      explanation_ = absl::StrCat(explanation, LocToString(source_location));
+      explanation_ = explanation;
+      source_location_ = source_location;
     }
   }
 
   explicit FusionDecision(
       absl::Status status,
-      absl::SourceLocation source_location = absl::SourceLocation::current()) {
+      SourceLocationHolder source_location = SourceLocationHolder()) {
     if (!status.ok()) {
-      explanation_ =
-          absl::StrCat(status.message(), LocToString(source_location));
+      explanation_ = status.message();
+      source_location_ = source_location;
     }
   }
 
@@ -97,25 +111,8 @@ class FusionDecision {
   // provide explicit explanation.
   FusionDecision(  // NOLINT
       bool decision,
-      absl::SourceLocation source_location = absl::SourceLocation::current());
-#else
-  // If condition is `true` means that we CAN fuse. In that case, explanation is
-  // discarded.
-  FusionDecision(bool condition, absl::string_view explanation) {
-    if (!condition) {
-      explanation_ = std::string(explanation);
-    }
-  }
-  static FusionDecision Forbid(absl::string_view explanation) {
-    return FusionDecision(explanation);
-  }
-  explicit FusionDecision(absl::Status status) {
-    if (!status.ok()) {
-      explanation_ = status.message();
-    }
-  }
-
-#endif  // PLATFORM_GOOGLE
+      SourceLocationHolder source_location = SourceLocationHolder())
+      : FusionDecision(decision, "Not fusing", source_location) {}
 
   // Returns whether it can be fused.
   explicit operator bool() const { return CanFuse(); }
@@ -130,8 +127,7 @@ class FusionDecision {
     if (CanFuse() || decision.CanFuse()) {
       return Allow();
     }
-    return Forbid(
-        absl::StrCat(explanation_.value_or(""), " ; ", decision.Explain()));
+    return Forbid(absl::StrCat(Explain(), " ; ", decision.Explain()));
   }
 
   // Connects two fusion decision with a conjunction. Unlike disjunction,
@@ -150,30 +146,30 @@ class FusionDecision {
 
   // Appends to explanation, or turns the decision negative.
   FusionDecision operator<<(absl::string_view explanation) const {
-    return Forbid(absl::StrCat(explanation_.value_or(""), explanation));
+    return Forbid(absl::StrCat(explanation_.value_or(""), explanation),
+                  source_location_);
   }
 
   // Appends to explanation, or turns the decision negative.
   FusionDecision operator<<(int64_t explanation) const {
-    return Forbid(absl::StrCat(explanation_.value_or(""), explanation));
+    return Forbid(absl::StrCat(explanation_.value_or(""), explanation),
+                  source_location_);
   }
 
   // Explains why the fusion could not be performed, or that it can be.
   std::string Explain() const {
-    return explanation_.value_or("Actually, we can fuse it.");
+    if (explanation_.has_value()) {
+      return absl::StrCat(explanation_.value(), source_location_.ToString());
+    }
+    return "Actually, we can fuse it.";
   }
 
  private:
   // Empty IFF fusion is possible (explanation provided for negative cases).
   std::optional<std::string> explanation_;
+  SourceLocationHolder source_location_;
 
   FusionDecision() = default;
-
-  explicit FusionDecision(absl::string_view explanation)
-      : explanation_(explanation) {}
-
-  explicit FusionDecision(const char* explanation)
-      : explanation_(explanation) {}
 };
 
 #define RETURN_IF_NOT_FUSIBLE(...)                   \

From 83a265ddfe3edcf5fc8d5f84900543d7fca4c2e4 Mon Sep 17 00:00:00 2001
From: Yurii Topin <yuriit@google.com>
Date: Mon, 8 Dec 2025 08:35:06 -0800
Subject: [PATCH 031/753] Reverts 31228b49f1c3af6f784556a1845782a3969358d6

PiperOrigin-RevId: 841777859
---
 third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD b/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD
index 83cca313adf4f5..3a079c87ab9dd6 100644
--- a/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD
+++ b/third_party/xla/third_party/mkl_dnn/mkldnn_acl.BUILD
@@ -156,13 +156,5 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         "@compute_library//:arm_compute",
-    ] + select({
-        # When using MKL-DNN on the AArch64 architecture, OpenMP is required
-        # for parallelization. Because the Hermetic C++ build environment uses
-        # the -nodefaultlibs flag, simply passing -fopenmp is insufficient.
-        # OpenMP's dependencies must be explicitly linked to ensure correct
-        # inclusion, as automatic linking is disabled.
-        "@rules_ml_toolchain//common:is_hermetic_cc_enabled": ["@rules_ml_toolchain//cc/sysroots:openmp"],
-        "//conditions:default": [],
-    }),
+    ],
 )

From 79cd71f875f583563fcbfeb04ad0660ed5f997fa Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 8 Dec 2025 08:51:28 -0800
Subject: [PATCH 032/753] [xla:gpu] Switch CollectiveMetadataThunk to
 GpuCliqueRendezvous

PiperOrigin-RevId: 841783861
---
 .../gpu/collectives/gpu_clique_rendezvous.cc  |   4 +-
 .../gpu/collectives/gpu_clique_rendezvous.h   |  16 +--
 .../collectives/gpu_clique_rendezvous_test.cc |   4 +-
 .../xla/xla/backends/gpu/runtime/BUILD        |   1 +
 .../runtime/collective_kernel_thunk_test.cc   |   2 +-
 .../gpu/runtime/collective_metadata_thunk.cc  | 105 ++++--------------
 6 files changed, 37 insertions(+), 95 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.cc b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.cc
index 277cea6198ae80..fcb87afc21d28e 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.cc
@@ -65,8 +65,8 @@ struct RankFormatter {
 }  // namespace
 
 GpuCliqueRendezvous::GpuCliqueRendezvous(
-    GpuCliqueKey clique_key, absl::btree_map<RankId, std::any> state)
-    : clique_key_(std::move(clique_key)), state_(std::move(state)) {}
+    GpuCliqueKey clique_key, absl::btree_map<RankId, std::any> values)
+    : clique_key_(std::move(clique_key)), values_(std::move(values)) {}
 
 absl::StatusOr<std::shared_ptr<GpuCliqueRendezvous>> GpuCliqueRendezvous::Join(
     const GpuCliqueKey& clique_key, RankId rank, std::any data) {
diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.h b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.h
index a3220996f214c5..623cd7d8513fd7 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.h
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous.h
@@ -42,15 +42,15 @@ class GpuCliqueRendezvous {
   static absl::StatusOr<std::shared_ptr<GpuCliqueRendezvous>> Join(
       const GpuCliqueKey& clique_key, RankId rank, std::any data);
 
-  // Returns the clique key associated with this data.
+  // Returns the clique key associated with this rendezvous object.
   const GpuCliqueKey& clique_key() const { return clique_key_; }
 
-  // Returns the state associated with the given rank. If state type is not
-  // the same as `T`, returns an error.
+  // Returns the value at the given rank. If value type is not the same as `T`,
+  // returns an error.
   template <typename T>
-  absl::StatusOr<std::reference_wrapper<const T>> state(RankId rank) const {
-    auto it = state_.find(rank);
-    if (it == state_.end()) {
+  absl::StatusOr<std::reference_wrapper<const T>> at(RankId rank) const {
+    auto it = values_.find(rank);
+    if (it == values_.end()) {
       return NotFound("Data not found for rank %d", rank.value());
     }
 
@@ -64,10 +64,10 @@ class GpuCliqueRendezvous {
 
  private:
   GpuCliqueRendezvous(GpuCliqueKey clique_key,
-                      absl::btree_map<RankId, std::any> state);
+                      absl::btree_map<RankId, std::any> values);
 
   GpuCliqueKey clique_key_;
-  absl::btree_map<RankId, std::any> state_;
+  absl::btree_map<RankId, std::any> values_;
 };
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous_test.cc b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous_test.cc
index 58d0bd7b2c402a..ef8a2ce09383c0 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous_test.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_rendezvous_test.cc
@@ -48,8 +48,8 @@ TEST(GpuCliqueRendezvousTest, TwoParticipants) {
 
       GpuCliqueRendezvous& data = **rendezvous;
       ASSERT_EQ(data.clique_key(), key);
-      ASSERT_EQ(*data.state<int32_t>(RankId(0)), 0);
-      ASSERT_EQ(*data.state<int32_t>(RankId(1)), 1);
+      ASSERT_EQ(*data.at<int32_t>(RankId(0)), 0);
+      ASSERT_EQ(*data.at<int32_t>(RankId(1)), 1);
     };
   };
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 2c79ce55668f95..c40f279f9212dd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -2010,6 +2010,7 @@ cc_library(
         "//xla:status_macros",
         "//xla:util",
         "//xla/backends/gpu/collectives:gpu_clique_key",
+        "//xla/backends/gpu/collectives:gpu_clique_rendezvous",
         "//xla/core/collectives:rank_id",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:device_id",
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
index 993b4d0cc06b0d..e65d7760e8981f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
@@ -394,7 +394,7 @@ TEST(CollectiveKernelThunkTest, MultiprocessTest) {
   for (absl::StatusOr<se::DeviceAddressBase> result :
        RunCollectiveKernelThunkOnDevices(metadata,
                                          /*emulate_multiprocess=*/true)) {
-    EXPECT_THAT(result, StatusIs(absl::StatusCode::kUnimplemented));
+    EXPECT_THAT(result, StatusIs(absl::StatusCode::kInvalidArgument));
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
index ae4757dec337fa..a5e44c890f34fe 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
@@ -15,23 +15,20 @@ limitations under the License.
 
 #include "xla/backends/gpu/runtime/collective_metadata_thunk.h"
 
+#include <any>
 #include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <optional>
-#include <string>
 #include <utility>
 #include <vector>
 
-#include "absl/algorithm/container.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
-#include "absl/types/span.h"
 #include "google/protobuf/repeated_ptr_field.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
+#include "xla/backends/gpu/collectives/gpu_clique_rendezvous.h"
 #include "xla/backends/gpu/runtime/collective_multimem.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/core/collectives/rank_id.h"
@@ -39,7 +36,6 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/runtime/device_id.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/rendezvous.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/gpu/collective_kernel_metadata.h"
@@ -80,98 +76,43 @@ CollectiveConfig CollectiveMetadataThunk::GetCollectiveConfig(
   return config;
 }
 
-struct DeviceParameters {
-  RankId rank;
-  std::vector<se::DeviceAddressBase> parameters;
-
-  bool operator<(const DeviceParameters& other) const {
-    return rank < other.rank;
-  }
-};
-
-absl::StatusOr<std::vector<DeviceParameters>> SyncLocalDeviceParameters(
-    const GpuCliqueKey& clique_key, RankId rank,
-    std::vector<se::DeviceAddressBase> parameters) {
-  std::vector<DeviceParameters> device_parameters;
-  auto rendezvous_fn = [](absl::Span<const DeviceParameters* const> values) {
-    std::vector<DeviceParameters> values_copy;
-    for (const auto& value : values) {
-      values_copy.push_back(*value);
-    }
-    // Sort to make sure that values are in the same order as the
-    // devices are ordered in the communicator.
-    absl::c_sort(values_copy);
-    return values_copy;
-  };
-
-  std::string start_rendezvous_key = absl::StrFormat(
-      "[rank=%d] Initializing collective metadata for clique %s", rank.value(),
-      clique_key.ToString());
-
-  DeviceParameters params;
-  params.rank = rank;
-  params.parameters = std::move(parameters);
-
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<std::vector<DeviceParameters>> local_ranks_parameters,
-      Rendezvous<std::vector<DeviceParameters>>(
-          /*name=*/start_rendezvous_key, /*key=*/clique_key,
-          /*value=*/params,
-          /*num_threads=*/clique_key.num_local_participants(), rendezvous_fn));
-  return std::vector<DeviceParameters>(local_ranks_parameters->begin(),
-                                       local_ranks_parameters->end());
-}
-
-absl::StatusOr<std::vector<DeviceParameters>> SyncGlobalDeviceParameters(
-    const GpuCliqueKey& clique_key, RankId rank,
-    std::vector<se::DeviceAddressBase> parameters) {
-  if (!clique_key.is_local()) {
-    return Unimplemented(
-        "[rank=%d] Multiprocess collective metadata is not supported yet in "
-        "clique %s",
-        rank.value(), clique_key.ToString());
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      std::vector<DeviceParameters> local_ranks_parameters,
-      SyncLocalDeviceParameters(clique_key, rank, std::move(parameters)));
-
-  return local_ranks_parameters;
-}
-
 absl::Status CollectiveMetadataThunk::ConstructCollectiveMetadata(
     const GpuCliqueKey& clique_key, RankId rank, se::Stream* stream,
     std::vector<se::DeviceAddressBase> parameters,
     std::shared_ptr<CollectiveMultimem> multimem,
     se::DeviceAddressBase destination) {
-  CollectiveKernelMetadata metadata;
-  metadata.rank = rank.value();
-  metadata.multicast_buffer_ptr =
-      multimem ? multimem->mapped_ptr(rank) : nullptr;
+  size_t num_parameters = parameters.size();
+
+  using DeviceParameters = std::vector<se::DeviceAddressBase>;
+
+  // Exchange device parameters with all ranks in the clique.
   TF_ASSIGN_OR_RETURN(
-      std::vector<DeviceParameters> device_parameters,
-      SyncGlobalDeviceParameters(clique_key, rank, std::move(parameters)));
-  TF_RET_CHECK(!device_parameters.empty())
-      << "Not enough devices in the clique.";
-  const size_t num_parameters = device_parameters[0].parameters.size();
-  for (const auto& value : device_parameters) {
-    TF_RET_CHECK(value.parameters.size() == num_parameters);
-  }
+      auto device_parameters,
+      GpuCliqueRendezvous::Join(clique_key, rank, std::move(parameters)));
 
+  // Collect pointers to device buffers from all participating ranks.
   std::vector<void*> param_to_peers_ptrs;
-  param_to_peers_ptrs.reserve(device_parameters.size() * num_parameters);
-  for (int peer = 0; peer < device_parameters.size(); ++peer) {
-    for (int param = 0; param < num_parameters; ++param) {
-      param_to_peers_ptrs.push_back(
-          device_parameters[peer].parameters[param].opaque());
+  for (auto peer = RankId(0); peer < RankId(clique_key.num_devices()); ++peer) {
+    TF_ASSIGN_OR_RETURN(const DeviceParameters& peer_parameters,
+                        device_parameters->at<DeviceParameters>(peer));
+    for (se::DeviceAddressBase peer_parameter : peer_parameters) {
+      param_to_peers_ptrs.push_back(peer_parameter.opaque());
     }
   }
 
+  // Check that all participants have the same number of parameters.
+  TF_RET_CHECK(param_to_peers_ptrs.size() ==
+               num_parameters * clique_key.num_local_participants());
+
   const int64_t param_to_peers_ptrs_size =
       param_to_peers_ptrs.size() * sizeof(void*);
   se::DeviceAddressBase param_to_peers_ptrs_buffer = destination.GetByteSlice(
       sizeof(CollectiveKernelMetadata), param_to_peers_ptrs_size);
 
+  CollectiveKernelMetadata metadata;
+  metadata.rank = rank.value();
+  metadata.multicast_buffer_ptr =
+      multimem ? multimem->mapped_ptr(rank) : nullptr;
   metadata.param_to_peers =
       reinterpret_cast<void**>(param_to_peers_ptrs_buffer.opaque());
 

From eed57fa340139e372413b2695de32db1ae75fe8a Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 8 Dec 2025 08:53:20 -0800
Subject: [PATCH 033/753] [xla] Prepare to MaybeOwningDeviceAddress migration

PiperOrigin-RevId: 841784507
---
 third_party/xla/xla/service/BUILD             | 27 ++++--
 ...mory.cc => maybe_owning_device_address.cc} | 30 +++----
 .../xla/service/maybe_owning_device_address.h | 88 +++++++++++++++++++
 ...cc => maybe_owning_device_address_test.cc} | 15 ++--
 .../xla/service/maybe_owning_device_memory.h  | 72 ++-------------
 5 files changed, 134 insertions(+), 98 deletions(-)
 rename third_party/xla/xla/service/{maybe_owning_device_memory.cc => maybe_owning_device_address.cc} (53%)
 create mode 100644 third_party/xla/xla/service/maybe_owning_device_address.h
 rename third_party/xla/xla/service/{maybe_owning_device_memory_test.cc => maybe_owning_device_address_test.cc} (77%)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index b8b6a62ec06674..8973a3631eccdf 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4080,28 +4080,39 @@ xla_cc_test(
 )
 
 cc_library(
-    name = "maybe_owning_device_memory",
-    srcs = ["maybe_owning_device_memory.cc"],
-    hdrs = ["maybe_owning_device_memory.h"],
+    name = "maybe_owning_device_address",
+    srcs = ["maybe_owning_device_address.cc"],
+    hdrs = ["maybe_owning_device_address.h"],
     deps = [
+        "//xla:types",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "@com_google_absl//absl/base:core_headers",
     ],
 )
 
 xla_cc_test(
-    name = "maybe_owning_device_memory_test",
-    srcs = ["maybe_owning_device_memory_test.cc"],
+    name = "maybe_owning_device_address_test",
+    srcs = ["maybe_owning_device_address_test.cc"],
     deps = [
-        ":maybe_owning_device_memory",
+        ":maybe_owning_device_address",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_benchmark",
         "//xla/tsl/platform:test_main",
     ],
 )
 
+cc_library(
+    name = "maybe_owning_device_memory",
+    hdrs = ["maybe_owning_device_memory.h"],
+    deps = [
+        ":maybe_owning_device_address",
+        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_memory_allocator",
+        "@com_google_absl//absl/base:core_headers",
+    ],
+)
+
 cc_library(
     name = "float8_fnuz_ir_emitter",
     srcs = [
diff --git a/third_party/xla/xla/service/maybe_owning_device_memory.cc b/third_party/xla/xla/service/maybe_owning_device_address.cc
similarity index 53%
rename from third_party/xla/xla/service/maybe_owning_device_memory.cc
rename to third_party/xla/xla/service/maybe_owning_device_address.cc
index a7b3aa5e4b641c..6f8e252ebac99d 100644
--- a/third_party/xla/xla/service/maybe_owning_device_memory.cc
+++ b/third_party/xla/xla/service/maybe_owning_device_address.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 
 #include <cstdint>
 #include <optional>
@@ -25,33 +25,29 @@ limitations under the License.
 
 namespace xla {
 
-stream_executor::DeviceAddressBase MaybeOwningDeviceMemory::AsDeviceMemoryBase()
-    const {
+se::DeviceAddressBase MaybeOwningDeviceAddress::AsDeviceAddress() const {
   if (HasOwnership()) {
-    return *std::get<stream_executor::ScopedDeviceAddress<uint8_t>>(mem_);
+    return *std::get<se::ScopedDeviceAddress<uint8_t>>(mem_);
   }
-  return std::get<stream_executor::DeviceAddressBase>(mem_);
+  return std::get<se::DeviceAddressBase>(mem_);
 }
 
-bool MaybeOwningDeviceMemory::HasOwnership() const {
-  return std::holds_alternative<stream_executor::ScopedDeviceAddress<uint8_t>>(
-      mem_);
+bool MaybeOwningDeviceAddress::HasOwnership() const {
+  return std::holds_alternative<se::ScopedDeviceAddress<uint8_t>>(mem_);
 }
 
-std::optional<stream_executor::ScopedDeviceAddress<uint8_t>>
-MaybeOwningDeviceMemory::Release() {
+std::optional<se::ScopedDeviceAddress<uint8_t>>
+MaybeOwningDeviceAddress::Release() {
   if (!HasOwnership()) {
     return {};
   }
-  return std::move(
-      std::get<stream_executor::ScopedDeviceAddress<uint8_t>>(mem_));
+  return std::move(std::get<se::ScopedDeviceAddress<uint8_t>>(mem_));
 }
 
-const stream_executor::ScopedDeviceAddress<uint8_t>*
-MaybeOwningDeviceMemory::AsOwningDeviceMemory() const {
-  return HasOwnership()
-             ? &std::get<stream_executor::ScopedDeviceAddress<uint8_t>>(mem_)
-             : nullptr;
+const se::ScopedDeviceAddress<uint8_t>*
+MaybeOwningDeviceAddress::AsScopedDeviceAddress() const {
+  return HasOwnership() ? &std::get<se::ScopedDeviceAddress<uint8_t>>(mem_)
+                        : nullptr;
 }
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/maybe_owning_device_address.h b/third_party/xla/xla/service/maybe_owning_device_address.h
new file mode 100644
index 00000000000000..8a6f52e15adcaf
--- /dev/null
+++ b/third_party/xla/xla/service/maybe_owning_device_address.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_MAYBE_OWNING_DEVICE_ADDRESS_H_
+#define XLA_SERVICE_MAYBE_OWNING_DEVICE_ADDRESS_H_
+
+#include <cstdint>
+#include <optional>
+#include <utility>
+#include <variant>
+
+#include "absl/base/macros.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/types.h"  // IWYU pragma: keep
+
+namespace xla {
+
+// MaybeOwningDeviceAddress represents either an owned or unowned device
+// address. Like std::variant<se::ScopedDeviceAddress<uint8_t>, DeviceMemory>.
+// When the object goes output of scope, it will free the underlying device
+// address if it owns it.
+class MaybeOwningDeviceAddress {
+ public:
+  MaybeOwningDeviceAddress() = default;
+  MaybeOwningDeviceAddress(MaybeOwningDeviceAddress&&) = default;
+  MaybeOwningDeviceAddress& operator=(MaybeOwningDeviceAddress&&) = default;
+
+  explicit MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t> owned)
+      : mem_(std::move(owned)) {}
+
+  explicit MaybeOwningDeviceAddress(se::DeviceAddressBase unowned)
+      : mem_(unowned) {}
+
+  MaybeOwningDeviceAddress& operator=(se::DeviceAddressBase unowned) {
+    mem_ = unowned;
+    return *this;
+  }
+
+  MaybeOwningDeviceAddress& operator=(se::ScopedDeviceAddress<uint8_t> owned) {
+    mem_ = std::move(owned);
+    return *this;
+  }
+
+  // Fetches the underlying DeviceAddressBase. The caller of this function is
+  // *not* responsible for freeing the address.
+  se::DeviceAddressBase AsDeviceAddress() const;
+
+  // Release the se::ScopedDeviceAddress<uint8_t> without freeing
+  // it, and moves the ownership of the address from the object to the caller.
+  //
+  // A nullopt is returned if the HasOwnership() == false;
+  std::optional<se::ScopedDeviceAddress<uint8_t>> Release();
+
+  // If the device address is owned, returns a pointer to the internal
+  // ScopedDeviceAddress, otherwise nullptr is returned.
+  const se::ScopedDeviceAddress<uint8_t>* AsScopedDeviceAddress() const;
+
+  ABSL_DEPRECATE_AND_INLINE()
+  se::DeviceAddressBase AsDeviceMemoryBase() const { return AsDeviceAddress(); }
+
+  ABSL_DEPRECATE_AND_INLINE()
+  const se::ScopedDeviceAddress<uint8_t>* AsOwningDeviceMemory() const {
+    return AsScopedDeviceAddress();
+  }
+
+  // Returns true if has ownership over underlying address.
+  bool HasOwnership() const;
+
+ private:
+  std::variant<se::DeviceAddressBase, se::ScopedDeviceAddress<uint8_t>> mem_;
+};
+
+}  // namespace xla
+
+#endif  // XLA_SERVICE_MAYBE_OWNING_DEVICE_ADDRESS_H_
diff --git a/third_party/xla/xla/service/maybe_owning_device_memory_test.cc b/third_party/xla/xla/service/maybe_owning_device_address_test.cc
similarity index 77%
rename from third_party/xla/xla/service/maybe_owning_device_memory_test.cc
rename to third_party/xla/xla/service/maybe_owning_device_address_test.cc
index 2d3a5a8cf38708..d2dbcc46aad3ca 100644
--- a/third_party/xla/xla/service/maybe_owning_device_memory_test.cc
+++ b/third_party/xla/xla/service/maybe_owning_device_address_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/test_benchmark.h"
@@ -21,13 +21,14 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using MaybeOwningDeviceMemoryTest = ::testing::Test;
+using MaybeOwningDeviceAddressTest = ::testing::Test;
 
-TEST(MaybeOwningDeviceMemoryTest, DefaultConstructed) {
-  MaybeOwningDeviceMemory memory;
+TEST(MaybeOwningDeviceAddressTest, DefaultConstructed) {
+  MaybeOwningDeviceAddress memory;
   EXPECT_FALSE(memory.HasOwnership());
-  EXPECT_EQ(memory.AsDeviceMemoryBase().opaque(), nullptr);
-  EXPECT_EQ(memory.AsDeviceMemoryBase().size(), 0);
+
+  EXPECT_EQ(memory.AsDeviceAddress().opaque(), nullptr);
+  EXPECT_EQ(memory.AsDeviceAddress().size(), 0);
 }
 
 //===-----------------------------------------------------------------------===/
@@ -36,7 +37,7 @@ TEST(MaybeOwningDeviceMemoryTest, DefaultConstructed) {
 
 void BM_DefaultConstructed(benchmark::State& state) {
   for (auto s : state) {
-    MaybeOwningDeviceMemory memory;
+    MaybeOwningDeviceAddress memory;
     benchmark::DoNotOptimize(memory);
   }
 }
diff --git a/third_party/xla/xla/service/maybe_owning_device_memory.h b/third_party/xla/xla/service/maybe_owning_device_memory.h
index 8f7b33b4d2d66e..897003ffb17429 100644
--- a/third_party/xla/xla/service/maybe_owning_device_memory.h
+++ b/third_party/xla/xla/service/maybe_owning_device_memory.h
@@ -1,4 +1,4 @@
-/* Copyright 2018 The OpenXLA Authors.
+/* Copyright 2025 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,76 +16,16 @@ limitations under the License.
 #ifndef XLA_SERVICE_MAYBE_OWNING_DEVICE_MEMORY_H_
 #define XLA_SERVICE_MAYBE_OWNING_DEVICE_MEMORY_H_
 
-#include <cstdint>
-#include <optional>
-#include <utility>
-#include <variant>
-
-#include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "absl/base/macros.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/stream_executor/device_memory.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory_allocator.h"  // IWYU pragma: keep
 
 namespace xla {
 
-// MaybeOwningDeviceMemory represents either an owned or unowned
-// device memory. Like std::variant<se::ScopedDeviceAddress<uint8_t>,
-// DeviceMemory>. When the object goes output of scope, it will free the
-// underlying memory if it owns it.
-class MaybeOwningDeviceMemory {
- public:
-  MaybeOwningDeviceMemory() = default;
-  ~MaybeOwningDeviceMemory() = default;
-
-  explicit MaybeOwningDeviceMemory(
-      stream_executor::ScopedDeviceAddress<uint8_t> owned)
-      : mem_(std::move(owned)) {}
-
-  explicit MaybeOwningDeviceMemory(stream_executor::DeviceAddressBase unowned)
-      : mem_(unowned) {}
-
-  MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default;
-
-  MaybeOwningDeviceMemory& operator=(
-      stream_executor::DeviceAddressBase unowned) {
-    mem_ = unowned;
-    return *this;
-  }
-
-  MaybeOwningDeviceMemory& operator=(
-      stream_executor::ScopedDeviceAddress<uint8_t> owned) {
-    mem_ = std::move(owned);
-    return *this;
-  }
-
-  MaybeOwningDeviceMemory& operator=(MaybeOwningDeviceMemory&&) = default;
-
-  // Fetches the underlying DeviceAddressBase from a
-  // MaybeOwningDeviceMemory. The caller of this function is *not*
-  // responsible for freeing the memory.
-  stream_executor::DeviceAddressBase AsDeviceMemoryBase() const;
-
-  // Release the stream_executor::ScopedDeviceAddress<uint8_t> without freeing
-  // it, and moves the ownership of the memory buffer from the object to the
-  // caller.
-  //
-  // A nullopt is returned if the HasOwnership() == false;
-  std::optional<stream_executor::ScopedDeviceAddress<uint8_t>> Release();
-
-  // If the device memory is owned, returns a pointer to the internal
-  // OwningDeviceMemory, otherwise nullptr is returned.
-  const stream_executor::ScopedDeviceAddress<uint8_t>* AsOwningDeviceMemory()
-      const;
-
-  // Returns true if the device_memory has ownership over underlying memory.
-  bool HasOwnership() const;
-
- private:
-  std::variant<stream_executor::DeviceAddressBase,
-               stream_executor::ScopedDeviceAddress<uint8_t>>
-      mem_;
-};
+using MaybeOwningDeviceMemory ABSL_DEPRECATE_AND_INLINE() =
+    MaybeOwningDeviceAddress;
 
-}  // namespace xla
+}
 
 #endif  // XLA_SERVICE_MAYBE_OWNING_DEVICE_MEMORY_H_

From bacbd6b4513f2e019a287a89cf5da5e521ff6fe0 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 8 Dec 2025 09:26:48 -0800
Subject: [PATCH 034/753] PR #105775: Bump urllib3 from 2.5.0 to 2.6.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/105775

Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.5.0 to 2.6.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/urllib3/urllib3/releases">urllib3's releases</a>.</em></p>
<blockquote>
<h2>2.6.0</h2>
<h2>🚀 urllib3 is fundraising for HTTP/2 support</h2>
<p><a href="https://sethmlarson.dev/urllib3-is-fundraising-for-http2-support">urllib3 is raising ~$40,000 USD</a> to release HTTP/2 support and ensure long-term sustainable maintenance of the project after a sharp decline in financial support. If your company or organization uses Python and would benefit from HTTP/2 support in Requests, pip, cloud SDKs, and thousands of other projects <a href="https://opencollective.com/urllib3">please consider contributing financially</a> to ensure HTTP/2 support is developed sustainably and maintained for the long-haul.</p>
<p>Thank you for your support.</p>
<h2>Security</h2>
<ul>
<li>Fixed a security issue where streaming API could improperly handle highly compressed HTTP content (&quot;decompression bombs&quot;) leading to excessive resource consumption even when a small amount of data was requested. Reading small chunks of compressed data is safer and much more efficient now. (CVE-2025-66471 reported by <a href="https://github.com/Cycloctane"><code>@​Cycloctane</code></a>, 8.9 High, GHSA-2xpw-w6gg-jr37)</li>
<li>Fixed a security issue where an attacker could compose an HTTP response with virtually unlimited links in the <code>Content-Encoding</code> header, potentially leading to a denial of service (DoS) attack by exhausting system resources during decoding. The number of allowed chained encodings is now limited to 5. (CVE-2025-66418 reported by <a href="https://github.com/illia-v"><code>@​illia-v</code></a>, 8.9 High, GHSA-gm62-xv2j-4w53)</li>
</ul>
<blockquote>
<p>[!IMPORTANT]</p>
<ul>
<li>If urllib3 is not installed with the optional <code>urllib3[brotli]</code> extra, but your environment contains a Brotli/brotlicffi/brotlipy package anyway, make sure to upgrade it to at least Brotli 1.2.0 or brotlicffi 1.2.0.0 to  benefit from the security fixes and avoid warnings. Prefer using  <code>urllib3[brotli]</code> to install a compatible Brotli package automatically.</li>
<li>If you use custom decompressors, please make sure to update them to  respect the changed API of <code>urllib3.response.ContentDecoder</code>.</li>
</ul>
</blockquote>
<h2>Features</h2>
<ul>
<li>Enabled retrieval, deletion, and membership testing in <code>HTTPHeaderDict</code> using bytes keys. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3653">#3653</a>)</li>
<li>Added host and port information to string representations of <code>HTTPConnection</code>. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3666">#3666</a>)</li>
<li>Added support for Python 3.14 free-threading builds explicitly. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3696">#3696</a>)</li>
</ul>
<h2>Removals</h2>
<ul>
<li>Removed the <code>HTTPResponse.getheaders()</code> method in favor of <code>HTTPResponse.headers</code>. Removed the <code>HTTPResponse.getheader(name, default)</code> method in favor of <code>HTTPResponse.headers.get(name, default)</code>. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3622">#3622</a>)</li>
</ul>
<h2>Bugfixes</h2>
<ul>
<li>Fixed redirect handling in <code>urllib3.PoolManager</code> when an integer is passed for the retries parameter. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3649">#3649</a>)</li>
<li>Fixed <code>HTTPConnectionPool</code> when used in Emscripten with no explicit port. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3664">#3664</a>)</li>
<li>Fixed handling of <code>SSLKEYLOGFILE</code> with expandable variables. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3700">#3700</a>)</li>
</ul>
<h2>Misc</h2>
<ul>
<li>Changed the <code>zstd</code> extra to install <code>backports.zstd</code> instead of <code>zstandard</code> on Python 3.13 and before. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3693">#3693</a>)</li>
<li>Improved the performance of content decoding by optimizing <code>BytesQueueBuffer</code> class. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3710">#3710</a>)</li>
<li>Allowed building the urllib3 package with newer setuptools-scm v9.x. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3652">#3652</a>)</li>
<li>Ensured successful urllib3 builds by setting Hatchling requirement to ≥ 1.27.0. (<a href="https://redirect.github.com/urllib3/urllib3/issues/3638">#3638</a>)</li>
</ul>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/urllib3/urllib3/blob/main/CHANGES.rst">urllib3's changelog</a>.</em></p>
<blockquote>
<h1>2.6.0 (2025-12-05)</h1>
<h2>Security</h2>
<ul>
<li>Fixed a security issue where streaming API could improperly handle highly
compressed HTTP content (&quot;decompression bombs&quot;) leading to excessive resource
consumption even when a small amount of data was requested. Reading small
chunks of compressed data is safer and much more efficient now.
(<code>GHSA-2xpw-w6gg-jr37 &lt;https://github.com/urllib3/urllib3/security/advisories/GHSA-2xpw-w6gg-jr37&gt;</code>__)</li>
<li>Fixed a security issue where an attacker could compose an HTTP response with
virtually unlimited links in the <code>Content-Encoding</code> header, potentially
leading to a denial of service (DoS) attack by exhausting system resources
during decoding. The number of allowed chained encodings is now limited to 5.
(<code>GHSA-gm62-xv2j-4w53 &lt;https://github.com/urllib3/urllib3/security/advisories/GHSA-gm62-xv2j-4w53&gt;</code>__)</li>
</ul>
<p>.. caution::</p>
<ul>
<li>
<p>If urllib3 is not installed with the optional <code>urllib3[brotli]</code> extra, but
your environment contains a Brotli/brotlicffi/brotlipy package anyway, make
sure to upgrade it to at least Brotli 1.2.0 or brotlicffi 1.2.0.0 to
benefit from the security fixes and avoid warnings. Prefer using
<code>urllib3[brotli]</code> to install a compatible Brotli package automatically.</p>
</li>
<li>
<p>If you use custom decompressors, please make sure to update them to
respect the changed API of <code>urllib3.response.ContentDecoder</code>.</p>
</li>
</ul>
<h2>Features</h2>
<ul>
<li>Enabled retrieval, deletion, and membership testing in <code>HTTPHeaderDict</code> using bytes keys. (<code>[#3653](https://github.com/urllib3/urllib3/issues/3653) &lt;https://github.com/urllib3/urllib3/issues/3653&gt;</code>__)</li>
<li>Added host and port information to string representations of <code>HTTPConnection</code>. (<code>[#3666](https://github.com/urllib3/urllib3/issues/3666) &lt;https://github.com/urllib3/urllib3/issues/3666&gt;</code>__)</li>
<li>Added support for Python 3.14 free-threading builds explicitly. (<code>[#3696](https://github.com/urllib3/urllib3/issues/3696) &lt;https://github.com/urllib3/urllib3/issues/3696&gt;</code>__)</li>
</ul>
<h2>Removals</h2>
<ul>
<li>Removed the <code>HTTPResponse.getheaders()</code> method in favor of <code>HTTPResponse.headers</code>.
Removed the <code>HTTPResponse.getheader(name, default)</code> method in favor of <code>HTTPResponse.headers.get(name, default)</code>. (<code>[#3622](https://github.com/urllib3/urllib3/issues/3622) &lt;https://github.com/urllib3/urllib3/issues/3622&gt;</code>__)</li>
</ul>
<h2>Bugfixes</h2>
<ul>
<li>Fixed redirect handling in <code>urllib3.PoolManager</code> when an integer is passed
for the retries parameter. (<code>[#3649](https://github.com/urllib3/urllib3/issues/3649) &lt;https://github.com/urllib3/urllib3/issues/3649&gt;</code>__)</li>
<li>Fixed <code>HTTPConnectionPool</code> when used in Emscripten with no explicit port. (<code>[#3664](https://github.com/urllib3/urllib3/issues/3664) &lt;https://github.com/urllib3/urllib3/issues/3664&gt;</code>__)</li>
<li>Fixed handling of <code>SSLKEYLOGFILE</code> with expandable variables. (<code>[#3700](https://github.com/urllib3/urllib3/issues/3700) &lt;https://github.com/urllib3/urllib3/issues/3700&gt;</code>__)</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="https://github.com/urllib3/urllib3/commit/720f484b605f18887a48eef448d0084e2b76902d"><code>720f484</code></a> Release 2.6.0</li>
<li><a href="https://github.com/urllib3/urllib3/commit/24d7b67eac89f94e11003424bcf0d8f7b72222a8"><code>24d7b67</code></a> Merge commit from fork</li>
<li><a href="https://github.com/urllib3/urllib3/commit/c19571de34c47de3a766541b041637ba5f716ed7"><code>c19571d</code></a> Merge commit from fork</li>
<li><a href="https://github.com/urllib3/urllib3/commit/816fcf04528bc0f89672e13398eb813dcc892490"><code>816fcf0</code></a> Bump actions/setup-python from 6.0.0 to 6.1.0 (<a href="https://redirect.github.com/urllib3/urllib3/issues/3725">#3725</a>)</li>
<li><a href="https://github.com/urllib3/urllib3/commit/18af0a10efc4c99dd028f7ad5a461470b9a8b0fd"><code>18af0a1</code></a> Improve speed of <code>BytesQueueBuffer.get()</code> by using memoryview (<a href="https://redirect.github.com/urllib3/urllib3/issues/3711">#3711</a>)</li>
<li><a href="https://github.com/urllib3/urllib3/commit/1f6abac3e6d426c3939b8a17cf4afa099e691ab2"><code>1f6abac</code></a> Bump versions of pre-commit hooks (<a href="https://redirect.github.com/urllib3/urllib3/issues/3716">#3716</a>)</li>
<li><a href="https://github.com/urllib3/urllib3/commit/1c8fbf787b8e6ed151842c5d6874c9d5bdbf1d0b"><code>1c8fbf7</code></a> Bump actions/checkout from 5.0.0 to 6.0.0 (<a href="https://redirect.github.com/urllib3/urllib3/issues/3722">#3722</a>)</li>
<li><a href="https://github.com/urllib3/urllib3/commit/7784b9eee95b7c90802c02b111e98df70259ae4f"><code>7784b9e</code></a> Add Python 3.15 to CI (<a href="https://redirect.github.com/urllib3/urllib3/issues/3717">#3717</a>)</li>
<li><a href="https://github.com/urllib3/urllib3/commit/0241c9e7286d3008e3cce18effc13b40dc633385"><code>0241c9e</code></a> Updated docs to reflect change in optional zstd dependency from <code>zstandard</code> t...</li>
<li><a href="https://github.com/urllib3/urllib3/commit/7afcabb6489d9a8ea95a40e5afcb46463af17351"><code>7afcabb</code></a> Expand environment variable of SSLKEYLOGFILE (<a href="https://redirect.github.com/urllib3/urllib3/issues/3705">#3705</a>)</li>
<li>Additional commits viewable in <a href="https://github.com/urllib3/urllib3/compare/2.5.0...2.6.0">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=urllib3&package-manager=pip&previous-version=2.5.0&new-version=2.6.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/tensorflow/tensorflow/network/alerts).

</details>
Copybara import of the project:

--
8e0e52510295ad7244fb8fa46e001f6544d94122 by dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>:

Bump urllib3 from 2.5.0 to 2.6.0

Bumps [urllib3](https://github.com/urllib3/urllib3) from 2.5.0 to 2.6.0.
- [Release notes](https://github.com/urllib3/urllib3/releases)
- [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst)
- [Commits](https://github.com/urllib3/urllib3/compare/2.5.0...2.6.0)

---
updated-dependencies:
- dependency-name: urllib3
  dependency-version: 2.6.0
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Merging this change closes #105775

COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/tensorflow/pull/105775 from tensorflow:dependabot/pip/urllib3-2.6.0 8e0e52510295ad7244fb8fa46e001f6544d94122
PiperOrigin-RevId: 841796827
---
 requirements_lock_3_10.txt | 6 +++---
 requirements_lock_3_11.txt | 6 +++---
 requirements_lock_3_12.txt | 6 +++---
 requirements_lock_3_13.txt | 6 +++---
 requirements_lock_3_9.txt  | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt
index 36a6e6b78b5604..486c66c2fdb52f 100644
--- a/requirements_lock_3_10.txt
+++ b/requirements_lock_3_10.txt
@@ -748,9 +748,9 @@ typing-extensions==4.14.1 \
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
     #   rich
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt
index 6238e70c957632..80a1a2e834b3c2 100644
--- a/requirements_lock_3_11.txt
+++ b/requirements_lock_3_11.txt
@@ -747,9 +747,9 @@ typing-extensions==4.14.1 \
     # via
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/requirements_lock_3_12.txt b/requirements_lock_3_12.txt
index 2d655921b2f9d8..ac1fb6ff141e7d 100644
--- a/requirements_lock_3_12.txt
+++ b/requirements_lock_3_12.txt
@@ -747,9 +747,9 @@ typing-extensions==4.14.1 \
     # via
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/requirements_lock_3_13.txt b/requirements_lock_3_13.txt
index 45461447246243..4e0988a88aff92 100644
--- a/requirements_lock_3_13.txt
+++ b/requirements_lock_3_13.txt
@@ -729,9 +729,9 @@ typing-extensions==4.14.1 \
     # via
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
diff --git a/requirements_lock_3_9.txt b/requirements_lock_3_9.txt
index 26e2d0ae19171b..6a52f5d70bdcae 100644
--- a/requirements_lock_3_9.txt
+++ b/requirements_lock_3_9.txt
@@ -734,9 +734,9 @@ typing-extensions==4.14.1 \
     #   -r ci/official/requirements_updater/requirements.in
     #   optree
     #   rich
-urllib3==2.5.0 \
-    --hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
-    --hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc
+urllib3==2.6.0 \
+    --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
+    --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
 werkzeug==3.1.3 \
     --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \

From 7785057a2ac37f900181eb7720bcfe68a1d44b1c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 8 Dec 2025 17:43:36 +0000
Subject: [PATCH 035/753] Bump werkzeug from 3.1.3 to 3.1.4

Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.1.3 to 3.1.4.
- [Release notes](https://github.com/pallets/werkzeug/releases)
- [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/werkzeug/compare/3.1.3...3.1.4)

---
updated-dependencies:
- dependency-name: werkzeug
  dependency-version: 3.1.4
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 requirements_lock_3_10.txt | 6 +++---
 requirements_lock_3_11.txt | 6 +++---
 requirements_lock_3_12.txt | 6 +++---
 requirements_lock_3_13.txt | 6 +++---
 requirements_lock_3_9.txt  | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt
index 486c66c2fdb52f..a2645ee5ddbdb4 100644
--- a/requirements_lock_3_10.txt
+++ b/requirements_lock_3_10.txt
@@ -752,9 +752,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt
index 80a1a2e834b3c2..cd51c5e0c0c338 100644
--- a/requirements_lock_3_11.txt
+++ b/requirements_lock_3_11.txt
@@ -751,9 +751,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_12.txt b/requirements_lock_3_12.txt
index ac1fb6ff141e7d..1b8d63c9d75147 100644
--- a/requirements_lock_3_12.txt
+++ b/requirements_lock_3_12.txt
@@ -751,9 +751,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_13.txt b/requirements_lock_3_13.txt
index 4e0988a88aff92..ded80d5230a8c9 100644
--- a/requirements_lock_3_13.txt
+++ b/requirements_lock_3_13.txt
@@ -733,9 +733,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/requirements_lock_3_9.txt b/requirements_lock_3_9.txt
index 6a52f5d70bdcae..6e68ddf6f79595 100644
--- a/requirements_lock_3_9.txt
+++ b/requirements_lock_3_9.txt
@@ -738,9 +738,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \

From 1f0f883a4c3eec8c320189a212c43444817cb298 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Mon, 8 Dec 2025 09:30:20 -0800
Subject: [PATCH 036/753] Gracefully handle missing parameter/output shardings
 from single-device atom executables

`PjRtExecutable::GetParameterShardings()` and `PjRtExecutable::GetOutputShardings()` may return `std::nullopt` for single-device executables, but their shardings are trivial and we can infer them from the device count.

PiperOrigin-RevId: 841798186
---
 .../xla/xla/python/ifrt/ir/tests/ifrt-opt.cc  | 11 ++++-
 .../ifrt_compile_and_propagate_shardings.mlir | 30 ++++++++++++
 ...rt_compile_and_propagate_shardings_pass.cc | 29 +++++++----
 ...y_bound_external_loaded_executable_pass.cc | 48 +++++++++----------
 4 files changed, 84 insertions(+), 34 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/tests/ifrt-opt.cc b/third_party/xla/xla/python/ifrt/ir/tests/ifrt-opt.cc
index 596767a9dc3a1d..097136f5b1c631 100644
--- a/third_party/xla/xla/python/ifrt/ir/tests/ifrt-opt.cc
+++ b/third_party/xla/xla/python/ifrt/ir/tests/ifrt-opt.cc
@@ -67,11 +67,18 @@ class TestChildExecutableCompiler : public AtomProgramCompiler {
            "invalidated some method string_views.";
     auto mock_executable =
         std::make_unique<testing::NiceMock<MockLoadedExecutable>>();
+    int num_devices;
+    if (options.executable_build_options.has_device_assignment()) {
+      num_devices =
+          options.executable_build_options.device_assignment().num_elements();
+    } else {
+      num_devices = 1;
+    }
     int num_parameters_to_propagate =
         options.executable_build_options
             .allow_spmd_sharding_propagation_to_parameters()
             .size();
-    if (num_parameters_to_propagate > 0) {
+    if (num_devices > 1 && num_parameters_to_propagate > 0) {
       xla::OpSharding op_sharding;
       op_sharding.set_type(xla::OpSharding::REPLICATED);
       std::vector<xla::OpSharding> parameter_shardings(
@@ -83,7 +90,7 @@ class TestChildExecutableCompiler : public AtomProgramCompiler {
         options.executable_build_options
             .allow_spmd_sharding_propagation_to_output()
             .size();
-    if (num_outputs_to_propagate > 0) {
+    if (num_devices > 1 && num_outputs_to_propagate > 0) {
       // Always infer output shardings to be replicated for the lit tests.
       xla::OpSharding op_sharding;
       op_sharding.set_type(xla::OpSharding::REPLICATED);
diff --git a/third_party/xla/xla/python/ifrt/ir/tests/ifrt_compile_and_propagate_shardings.mlir b/third_party/xla/xla/python/ifrt/ir/tests/ifrt_compile_and_propagate_shardings.mlir
index 4021496168cb8c..e8c49c453b6853 100644
--- a/third_party/xla/xla/python/ifrt/ir/tests/ifrt_compile_and_propagate_shardings.mlir
+++ b/third_party/xla/xla/python/ifrt/ir/tests/ifrt_compile_and_propagate_shardings.mlir
@@ -286,6 +286,36 @@ module @propagate_to_inputs {
 
 // -----
 
+!array_unspecified = !ifrt.array<tensor<2x2xi32>,
+                                 #ifrt.sharding_unspecified, [0]>
+// CHECK-LABEL: @propagate_single_device
+module @propagate_single_device {
+  func.func @main(%arg0: !array_unspecified)
+      -> !array_unspecified attributes {ifrt.function} {
+    // CHECK: %[[OUT:.+]], %{{.+}} = ifrt.CallLoadedExecutable @[[CALLEE:.+]](%arg0)
+    // CHECK-SAME: !ifrt.array<tensor<2x2xi32>, #ifrt.sharding_param<1x1 to [0] on 1>, [0]>
+    // CHECK-SAME: -> !ifrt.array<tensor<2x2xi32>, #ifrt.sharding_param<1x1 to [0] on 1>, [0]>
+    %0, %ctrl_0 = ifrt.Call @add_one_0::@main(%arg0) on devices [0]
+        {ifrt.module_type = "xla"} : (!array_unspecified) -> !array_unspecified
+    return %0 : !array_unspecified
+  }
+
+  // CHECK: ifrt.LoadedExecutable @[[CALLEE]]
+  // CHECK-SAME: on devices [0]
+  // CHECK-SAME: !ifrt.array<tensor<2x2xi32>, #ifrt.sharding_param<1x1 to [0] on 1>, [0]>
+  // CHECK-SAME: -> !ifrt.array<tensor<2x2xi32>, #ifrt.sharding_param<1x1 to [0] on 1>, [0]>
+  module @add_one_0 attributes {sym_visibility = "private"} {
+    func.func @main(%arg0: tensor<2x2xi32>) -> (tensor<2x2xi32>) {
+      %0 = mhlo.constant dense<1> : tensor<2x2xi32>
+      %1 = mhlo.add %arg0, %0 : tensor<2x2xi32>
+      return %1 : tensor<2x2xi32>
+    }
+  }
+
+}
+
+// -----
+
 !array = !ifrt.array<tensor<2x2xi32>,
                      #ifrt.sharding_param<2x1 to [0] on 2>, [0, 1]>
 !array_unspecified = !ifrt.array<tensor<2x2xi32>,
diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_and_propagate_shardings_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_and_propagate_shardings_pass.cc
index c48128130c59f7..a828664243a673 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_and_propagate_shardings_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_and_propagate_shardings_pass.cc
@@ -62,6 +62,7 @@ limitations under the License.
 #include "xla/python/ifrt/ir/transforms/utils.h"
 #include "xla/python/ifrt/support/sharding_conversions.h"
 #include "xla/service/hlo.pb.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace ifrt {
@@ -388,10 +389,16 @@ IfrtCompileAndPropagateShardingsPass::GetInputShardingParams(
     if (llvm::isa<IfrtUnspecifiedShardingAttr>(
             in_array_type.getShardingAttr())) {
       if (!in_shardings.has_value()) {
-        in_shardings = compile_result.executable->GetParameterShardings();
-        if (!in_shardings.has_value()) {
-          return call_op.emitError()
-                 << "executable does not have input shardings";
+        if (call_op.getDevices().size() == 1) {
+          // Use replicated sharding for single-device inputs without calling
+          // `GetParameterShardings` since it may return `std::nullopt`.
+          in_shardings.emplace(call_op.getOutputs().size());
+        } else {
+          in_shardings = compile_result.executable->GetParameterShardings();
+          if (!in_shardings.has_value()) {
+            return call_op.emitError()
+                   << "executable does not have input shardings";
+          }
         }
         if (in_shardings->size() != call_op.getOutputs().size()) {
           return call_op.emitError()
@@ -443,10 +450,16 @@ IfrtCompileAndPropagateShardingsPass::GetOutputShardingParams(
     if (llvm::isa<IfrtUnspecifiedShardingAttr>(
             out_array_type.getShardingAttr())) {
       if (!out_shardings.has_value()) {
-        out_shardings = compile_result.executable->GetOutputShardings();
-        if (!out_shardings.has_value()) {
-          return call_op.emitError()
-                 << "executable does not have output shardings";
+        if (call_op.getDevices().size() == 1) {
+          // Use replicated sharding for single-device inputs without calling
+          // `GetParameterShardings` since it may return `std::nullopt`.
+          out_shardings.emplace(call_op.getOutputs().size());
+        } else {
+          out_shardings = compile_result.executable->GetOutputShardings();
+          if (!out_shardings.has_value()) {
+            return call_op.emitError()
+                   << "executable does not have output shardings";
+          }
         }
         if (out_shardings->size() != call_op.getOutputs().size()) {
           return call_op.emitError()
diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_verify_bound_external_loaded_executable_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_verify_bound_external_loaded_executable_pass.cc
index c62dc0cc98b897..fe0f68ccbea178 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_verify_bound_external_loaded_executable_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_verify_bound_external_loaded_executable_pass.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <memory>
+#include <optional>
 #include <utility>
 #include <vector>
 
@@ -130,50 +131,49 @@ void IfrtVerifyBoundExternalLoadedExecutablePass::runOnOperation() {
       }
 
       auto func_type = loaded_exec_op.getFunctionType();
-      if (!exec_it->second->GetParameterShardings().has_value()) {
+      std::optional<std::vector<xla::OpSharding>> parameter_shardings;
+      if (loaded_exec_op.getDevices().size() == 1) {
+        parameter_shardings.emplace(func_type.getNumInputs());
+      } else {
+        parameter_shardings = exec_it->second->GetParameterShardings();
+      }
+      if (!parameter_shardings.has_value()) {
         return loaded_exec_op.emitOpError()
                << "cannot be bound to an executable without parameter "
                   "shardings";
       }
-      if (!exec_it->second->GetOutputShardings().has_value()) {
+      std::optional<std::vector<xla::OpSharding>> output_shardings;
+      if (loaded_exec_op.getDevices().size() == 1) {
+        output_shardings.emplace(func_type.getNumResults());
+      } else {
+        output_shardings = exec_it->second->GetOutputShardings();
+      }
+      if (!output_shardings.has_value()) {
         return loaded_exec_op.emitOpError()
-               << "cannot be bound to an executable without output shardings";
+               << "cannot be bound to a multi-device executable without output "
+                  "shardings";
       }
-      if (func_type.getNumInputs() !=
-          exec_it->second->GetParameterShardings()->size()) {
+      if (func_type.getNumInputs() != parameter_shardings->size()) {
         return loaded_exec_op.emitOpError()
                << "expects an executable with " << func_type.getNumInputs()
                << " inputs, but was bound to an executable with "
-               << exec_it->second->GetParameterShardings()->size() << " inputs";
+               << parameter_shardings->size() << " inputs";
       }
-      if (func_type.getNumResults() !=
-          exec_it->second->GetOutputShardings()->size()) {
+      if (func_type.getNumResults() != output_shardings->size()) {
         return loaded_exec_op.emitOpError()
                << "expects an executable with " << func_type.getNumResults()
                << " results, but was bound to an executable with "
-               << exec_it->second->GetOutputShardings()->size() << " results";
+               << output_shardings->size() << " results";
       }
       // Verify that the input and output shardings of the LoadedExecutableOp
       // are the same as the shardings of the bound executable.
-      if (!exec_it->second->GetParameterShardings().has_value()) {
-        return loaded_exec_op.emitOpError()
-               << "cannot be bound to an executable without parameter "
-                  "shardings";
-      }
-      if (!exec_it->second->GetOutputShardings().has_value()) {
-        return loaded_exec_op.emitOpError()
-               << "cannot be bound to an executable without output "
-                  "shardings";
-      }
       auto sharding_equal_status = VerifyShardingsEqual(
-          func_type.getInputs(), *exec_it->second->GetParameterShardings(),
-          "input");
+          func_type.getInputs(), *parameter_shardings, "input");
       if (!sharding_equal_status.ok()) {
         return loaded_exec_op.emitOpError() << sharding_equal_status.message();
       }
-      sharding_equal_status = VerifyShardingsEqual(
-          func_type.getResults(), *exec_it->second->GetOutputShardings(),
-          "output");
+      sharding_equal_status = VerifyShardingsEqual(func_type.getResults(),
+                                                   *output_shardings, "output");
       if (!sharding_equal_status.ok()) {
         return loaded_exec_op.emitOpError() << sharding_equal_status.message();
       }

From 5d9394b3fca77e1a85361b7b6177f855b9b96e36 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 09:53:58 -0800
Subject: [PATCH 037/753] [XLA:TPU] SPMD Partitioner should not change layout
 if no partition changes done

PiperOrigin-RevId: 841806676
---
 .../xla/xla/service/spmd/spmd_partitioner.cc  | 115 +++++++++++-------
 1 file changed, 71 insertions(+), 44 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index 3efed6ad73375a..052fe73912d8ba 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -5488,6 +5488,26 @@ int64_t SpmdPartitioner::CommunicationCostInBytes(HloInstruction* hlo) {
   module->set_spmd_output_sharding(entry_root->sharding());
 }
 
+namespace {
+
+// Returns true if the old and the new entry layout shapes differ.
+// NOTE: that we explicitly ignore the layout, since it is either defined
+// beforehand or during layout assignment.
+bool ShapeChangesBetween(const ComputationLayout& old_entry_layout,
+                         const ProgramShape& new_program_shape) {
+  for (int64_t i = 0; i < new_program_shape.parameters_size(); ++i) {
+    if (!Shape::Equal().IgnoreLayout()(old_entry_layout.parameter_shape(i),
+                                       new_program_shape.parameters(i))) {
+      return true;
+    }
+  }
+
+  return !Shape::Equal().IgnoreLayout()(old_entry_layout.result_shape(),
+                                        new_program_shape.result());
+}
+
+}  // namespace
+
 absl::StatusOr<bool> SpmdPartitioner::RunImpl(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
@@ -5582,57 +5602,64 @@ absl::StatusOr<bool> SpmdPartitioner::RunImpl(
       }));
 
   // For the entry computation, make sure that the root instruction and the
-  // parameters preserve their signatures.
+  // parameters preserve their signatures if there are any partitioning changes.
   auto new_program_shape = module->entry_computation()->ComputeProgramShape();
-  if (!options_.allow_module_signature_change) {
-    if (!Shape::Equal()(program_shape.result(), new_program_shape.result())) {
-      return absl::InvalidArgumentError(
-          "Result shape changed for the entry computation from: " +
-          program_shape.result().ToString() +
-          " to: " + new_program_shape.result().ToString());
-    }
-    if (program_shape.parameters_size() !=
-        new_program_shape.parameters_size()) {
-      return absl::InvalidArgumentError(
-          "Parameter count changed for the entry computation from: " +
-          std::to_string(program_shape.parameters_size()) +
-          " to: " + std::to_string(new_program_shape.parameters_size()));
-    }
-    for (int64_t i = 0; i < program_shape.parameters_size(); ++i) {
-      if (!Shape::Equal()(program_shape.parameters(i),
-                          new_program_shape.parameters(i))) {
+  const ComputationLayout& old_entry_layout =
+      module->entry_computation_layout();
+  if (ShapeChangesBetween(old_entry_layout, new_program_shape)) {
+    if (!options_.allow_module_signature_change) {
+      if (!Shape::Equal()(program_shape.result(), new_program_shape.result())) {
         return absl::InvalidArgumentError(
-            "Parameter shape changed for the entry computation parameter " +
-            std::to_string(i) +
-            " from: " + program_shape.parameters(i).ToString() +
-            " to: " + new_program_shape.parameters(i).ToString());
+            "Result shape changed for the entry computation from: " +
+            program_shape.result().ToString() +
+            " to: " + new_program_shape.result().ToString());
       }
-    }
-  } else {
-    // Fix up some bad tiling in entry computation layout.
-    auto update_shape = [this](Shape* subshape, const xla::ShapeIndex& index) {
-      if (subshape->IsArray() && subshape->has_layout()) {
-        UpdateLayout(subshape);
+      if (program_shape.parameters_size() !=
+          new_program_shape.parameters_size()) {
+        return absl::InvalidArgumentError(
+            "Parameter count changed for the entry computation from: " +
+            std::to_string(program_shape.parameters_size()) +
+            " to: " + std::to_string(new_program_shape.parameters_size()));
       }
-    };
-    const auto& old_entry_layout = module->entry_computation_layout();
-    // Shapes can change but the layout should still remain the same.
-    for (int64_t i = 0; i < new_program_shape.parameters_size(); ++i) {
+      for (int64_t i = 0; i < program_shape.parameters_size(); ++i) {
+        if (!Shape::Equal()(program_shape.parameters(i),
+                            new_program_shape.parameters(i))) {
+          return absl::InvalidArgumentError(
+              "Parameter shape changed for the entry computation parameter " +
+              std::to_string(i) +
+              " from: " + program_shape.parameters(i).ToString() +
+              " to: " + new_program_shape.parameters(i).ToString());
+        }
+      }
+    } else {
+      // For the cases where we update the shape, also fix up some bad tiling in
+      // entry computation layout.
+      auto update_shape = [this](Shape* subshape,
+                                 const xla::ShapeIndex& index) {
+        if (subshape->IsArray() && subshape->has_layout()) {
+          UpdateLayout(subshape);
+        }
+      };
+      // Shapes can change but the layout should still remain the same.
+      // If the shapes do not change, we shouldn't change the layout if pre-set.
+      for (int64_t i = 0; i < new_program_shape.parameters_size(); ++i) {
+        TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+            old_entry_layout.parameter_shape(i),
+            new_program_shape.mutable_parameters(i)));
+        ShapeUtil::ForEachMutableSubshape(
+            new_program_shape.mutable_parameters(i), update_shape);
+      }
+
       TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-          old_entry_layout.parameter_shape(i),
-          new_program_shape.mutable_parameters(i)));
-      ShapeUtil::ForEachMutableSubshape(new_program_shape.mutable_parameters(i),
+          old_entry_layout.result_shape(), new_program_shape.mutable_result()));
+      ShapeUtil::ForEachMutableSubshape(new_program_shape.mutable_result(),
                                         update_shape);
+
+      HloModuleConfig config = module->config();
+      *config.mutable_entry_computation_layout() =
+          ComputationLayout(new_program_shape, /*ignore_layouts=*/false);
+      module->set_config(config);
     }
-    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
-        old_entry_layout.result_shape(), new_program_shape.mutable_result()));
-    ShapeUtil::ForEachMutableSubshape(new_program_shape.mutable_result(),
-                                      update_shape);
-
-    HloModuleConfig config = module->config();
-    *config.mutable_entry_computation_layout() =
-        ComputationLayout(new_program_shape, /*ignore_layouts=*/false);
-    module->set_config(config);
   }
 
   XLA_VLOG_LINES(1, SpmdLogger::ReportAfterPartition(

From 3764d7a4a80b4f8d4a97c4beeaa11883465acc35 Mon Sep 17 00:00:00 2001
From: Yin Zhang <yinzz@google.com>
Date: Mon, 8 Dec 2025 09:55:37 -0800
Subject: [PATCH 038/753] Change OpSourceInfo::source_file from
 absl::string_view to std::string to prevent dangling references, as we will
 switch to prioritize parsing the file_name from stack_frame. (Since
 stack_frame is a string, so parsed file_name will be a temp string.)

PiperOrigin-RevId: 841807283
---
 third_party/xla/xla/tsl/profiler/convert/xla_op_utils.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/tsl/profiler/convert/xla_op_utils.h b/third_party/xla/xla/tsl/profiler/convert/xla_op_utils.h
index 601a43b564a3c6..bc074d9bbbd3a2 100644
--- a/third_party/xla/xla/tsl/profiler/convert/xla_op_utils.h
+++ b/third_party/xla/xla/tsl/profiler/convert/xla_op_utils.h
@@ -173,7 +173,7 @@ inline bool IsOffDutyOp(absl::string_view category) {
 // to in a user's program; e.g. it could be the file and line of user code that
 // generated the op.
 struct OpSourceInfo {
-  absl::string_view source_file;
+  std::string source_file;
   int32_t source_line = -1;
   std::string stack_frame;
 

From bf24158520d842ad494969edf868a3671f7f6257 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Mon, 8 Dec 2025 10:56:46 -0800
Subject: [PATCH 039/753] Implement a new two-stage `ProgramInterpreter` design
 for better efficiency and safety

This CL implements a new design for `ProgramInterpreter` with a goal of separating work that can be done once per program vs. work that needs to be done per program invocation.

The current `ProgramInterpreter` design iterates over the compiled IFRT IR program and invokes IFRT APIs on demand. In this design, the interpreter needs to convert MLIR types into IFRT types, perform validation, etc. during every execution, which is wasteful since such information does not change.

The new design avoids the aforementioned problem by splitting the program interpreter into two stages. First, `ProgramInterpreter::BuildExecuteFn()` now traverses the program and *returns a function that can be invoked to run the program*. The execute function is built only once during compilation and can perform any work that only needs static information, e.g., building `xla::ifrt::RemapPlan` from a `RemapArraysOp` MLIR op. Once that is complete, each program invocation just needs to call the execute function produced by the program interpreter.

`CompiledIfrtIRProgram` now carries this "compiled" execute functions so that this can be invoked by executables. This makes `CompiledIfrtIRProgram::program` optional since we no longer need to carry the MLIR module to execute an IFRT IR program. This can save host memory if there are a large number of IFRT IR programs.

PiperOrigin-RevId: 841834380
---
 third_party/xla/xla/python/ifrt/ir/BUILD      |   6 +-
 .../ifrt/ir/compiled_ifrt_ir_program.cc       |  14 +-
 .../python/ifrt/ir/compiled_ifrt_ir_program.h |  14 +
 .../xla/python/ifrt/ir/program_interpreter.cc | 927 +++++++++++-------
 .../xla/python/ifrt/ir/program_interpreter.h  |  85 +-
 5 files changed, 633 insertions(+), 413 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/BUILD b/third_party/xla/xla/python/ifrt/ir/BUILD
index c48becc712bcb4..8f35b34468e5b8 100644
--- a/third_party/xla/xla/python/ifrt/ir/BUILD
+++ b/third_party/xla/xla/python/ifrt/ir/BUILD
@@ -507,6 +507,7 @@ cc_library(
         ":atom_program_compiler",
         ":ifrt_ir_program",
         ":ir",
+        ":program_interpreter",
         "//xla:status_macros",
         "//xla:xla_data_proto_cc",
         "//xla/pjrt:pjrt_layout",
@@ -520,6 +521,7 @@ cc_library(
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
@@ -542,7 +544,7 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     visibility = ["//xla/python/ifrt:users"],
     deps = [
-        ":compiled_ifrt_ir_program",
+        ":atom_program_compiler",
         ":ir",
         "//xla:status_macros",
         "//xla/python/ifrt",
@@ -555,6 +557,8 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/functional:bind_front",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
index b3ec80c3d6f3e9..67c912b86cafc2 100644
--- a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
+++ b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
@@ -55,6 +55,7 @@ limitations under the License.
 #include "xla/python/ifrt/ir/ifrt_dialect.h"
 #include "xla/python/ifrt/ir/ifrt_ir_program.h"
 #include "xla/python/ifrt/ir/ifrt_ops.h"
+#include "xla/python/ifrt/ir/program_interpreter.h"
 #include "xla/python/ifrt/ir/transforms/debug.h"
 #include "xla/python/ifrt/ir/transforms/passes.h"
 #include "xla/python/ifrt/ir/transforms/utils.h"
@@ -327,6 +328,8 @@ absl::StatusOr<CompiledIfrtIrProgram> CompiledIfrtIrProgram::Create(
   mlir::MLIRContext* context = mlir_module.getContext();
   xla::ifrt::support::RegisterMlirDialects(*context);
 
+  std::string program_name = mlir_module.getName().value_or("unknown").str();
+
   // Add the bounded executables to the atom program executable map so that
   // they can be used by the interpreter
   std::shared_ptr<xla::ifrt::AtomExecutableMap> atom_executable_map =
@@ -434,8 +437,16 @@ absl::StatusOr<CompiledIfrtIrProgram> CompiledIfrtIrProgram::Create(
     }
   }
 
+  TF_ASSIGN_OR_RETURN(DeviceListRef device_list,
+                      client->MakeDeviceList(devices));
+  TF_ASSIGN_OR_RETURN(
+      auto interpreter,
+      ProgramInterpreter::Create(client, program_name, mlir_module,
+                                 atom_executable_map, std::move(device_list)));
+  TF_ASSIGN_OR_RETURN(auto execute_fn, interpreter->BuildExecuteFn());
+
   return CompiledIfrtIrProgram{
-      /*program_name=*/mlir_module.getName().value_or("unknown").str(),
+      /*program_name=*/std::move(program_name),
       /*atom_program_executables=*/std::move(atom_executable_map),
       /*in_specs=*/std::move(in_specs),
       /*out_specs=*/std::move(out_specs),
@@ -444,6 +455,7 @@ absl::StatusOr<CompiledIfrtIrProgram> CompiledIfrtIrProgram::Create(
       /*program=*/std::move(ifrt_ir_program),
       /*device_assignments=*/std::move(device_assignments),
       /*compile_options=*/compile_options,
+      /*execute_fn=*/std::move(execute_fn),
   };
 }
 
diff --git a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.h b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.h
index 509750627489e1..c9baf2b35b5b8c 100644
--- a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.h
+++ b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.h
@@ -16,14 +16,20 @@ limitations under the License.
 #ifndef XLA_PYTHON_IFRT_IR_COMPILED_IFRT_IR_PROGRAM_H_
 #define XLA_PYTHON_IFRT_IR_COMPILED_IFRT_IR_PROGRAM_H_
 #include <memory>
+#include <optional>
 #include <string>
 #include <vector>
 
+#include "absl/functional/any_invocable.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/types/span.h"
+#include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/array_spec.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/device.h"
+#include "xla/python/ifrt/device_list.h"
+#include "xla/python/ifrt/executable.h"
 #include "xla/python/ifrt/ir/atom_program_compiler.h"
 #include "xla/python/ifrt/ir/ifrt_ir_program.h"
 
@@ -63,6 +69,14 @@ struct CompiledIfrtIrProgram {
   // The compile options used to compile the program.
   std::shared_ptr<xla::ifrt::IfrtIRCompileOptions> compile_options;
 
+  // Precompiled execute function that interprets the IFRT IR program. The
+  // signature matches that of `xla::ifrt::LoadedExecutable::Execute()`.
+  absl::AnyInvocable<absl::StatusOr<xla::ifrt::LoadedExecutable::ExecuteResult>(
+      absl::Span<xla::ifrt::ArrayRef> arrays,
+      const xla::ifrt::LoadedExecutable::ExecuteOptions& options,
+      std::optional<xla::ifrt::DeviceListRef> devices)>
+      execute_fn;
+
   // Compiles an IFRT IR program.
   static absl::StatusOr<CompiledIfrtIrProgram> Create(
       std::unique_ptr<xla::ifrt::IfrtIRProgram> ifrt_ir_program,
diff --git a/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc b/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc
index 7e8612f830303f..deddd328048a7d 100644
--- a/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc
+++ b/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/python/ifrt/ir/program_interpreter.h"
 
+#include <cstdint>
 #include <memory>
 #include <optional>
 #include <string>
@@ -24,18 +25,21 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/functional/any_invocable.h"
+#include "absl/functional/bind_front.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Casting.h"
 #include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Support/DebugStringHelper.h"
@@ -46,7 +50,7 @@ limitations under the License.
 #include "xla/python/ifrt/device_list.h"
 #include "xla/python/ifrt/dtype.h"
 #include "xla/python/ifrt/executable.h"
-#include "xla/python/ifrt/ir/compiled_ifrt_ir_program.h"
+#include "xla/python/ifrt/ir/atom_program_compiler.h"
 #include "xla/python/ifrt/ir/constants.h"
 #include "xla/python/ifrt/ir/ifrt_dialect.h"
 #include "xla/python/ifrt/ir/ifrt_ops.h"
@@ -74,12 +78,20 @@ using ExecuteResult = ::xla::ifrt::LoadedExecutable::ExecuteResult;
 
 namespace {
 
+// Opaque handle that represents an array. Zero is reserved for null.
+using ArrayHandle = uintptr_t;
+
 // Array with additional metadata (e.g., if it can be donated).
 struct ArrayState {
   ArrayRef array;
   bool can_be_donated;
 };
 
+// Assigns a unique handle to the given MLIR value.
+ArrayHandle ToArrayHandle(mlir::Value value) {
+  return reinterpret_cast<ArrayHandle>(value.getAsOpaquePointer());
+}
+
 // Returns an xla::ifrt::Sharding for the given IFRT array type.
 absl::StatusOr<xla::ifrt::ShardingRef> GetSharding(
     xla::ifrt::IfrtArrayType array_type, xla::ifrt::Client* client,
@@ -110,65 +122,23 @@ std::string PrettyPrintGeneric(mlir::Operation* op) {
                       GetPrettyLocation(op->getLoc()));
 }
 
-// Populates the cache storing a Sharding for each IfrtArrayType.
-//
-// This cache exists to avoid traversing and creating large device lists at
-// execution time.
-//
-// Note that the cache is only populated for array types returned by CopyArrays
-// and RemapArrays ops because they are the only ops that need shardings.
-absl::StatusOr<llvm::DenseMap<xla::ifrt::IfrtArrayType, xla::ifrt::ShardingRef>>
-PopulateShardingCache(mlir::func::FuncOp main_func, xla::ifrt::Client* client,
-                      const xla::ifrt::DeviceListRef& devices) {
-  llvm::DenseMap<xla::ifrt::IfrtArrayType, xla::ifrt::ShardingRef>
-      array_type_to_sharding;
-  for (const mlir::Operation& op : main_func.getOps()) {
-    if (auto copy_arrays_op = llvm::dyn_cast<xla::ifrt::CopyArraysOp>(&op);
-        copy_arrays_op != nullptr) {
-      for (const auto [idx, output] :
-           llvm::enumerate(copy_arrays_op.getOutputs())) {
-        const auto array_type =
-            llvm::cast<xla::ifrt::IfrtArrayType>(output.getType());
-        TF_RET_CHECK(array_type != nullptr)
-            << "Output array #" << idx << " is not of type `IfrtArrayType`. "
-            << PrettyPrintGeneric(copy_arrays_op);
-        if (array_type_to_sharding.find(array_type) ==
-            array_type_to_sharding.end()) {
-          TF_ASSIGN_OR_RETURN(auto sharding,
-                              GetSharding(array_type, client, devices));
-          array_type_to_sharding[array_type] = std::move(sharding);
-        }
-      }
-    } else if (auto remap_op = llvm::dyn_cast<xla::ifrt::RemapArraysOp>(&op);
-               remap_op != nullptr) {
-      for (const auto [idx, output] : llvm::enumerate(remap_op.getOutputs())) {
-        const auto array_type =
-            llvm::cast<xla::ifrt::IfrtArrayType>(output.getType());
-        TF_RET_CHECK(array_type != nullptr)
-            << "Output array #" << idx << " is not of type `IfrtArrayType`. "
-            << PrettyPrintGeneric(remap_op);
-        if (array_type_to_sharding.find(array_type) ==
-            array_type_to_sharding.end()) {
-          TF_ASSIGN_OR_RETURN(auto sharding,
-                              GetSharding(array_type, client, devices));
-          array_type_to_sharding[array_type] = std::move(sharding);
-        }
-      }
-    }
-  }
-  return array_type_to_sharding;
-}
-
 }  // namespace
 
 struct Environment {
-  // Associates array with an MLIR value.
-  void AssociateArray(mlir::Value value, ArrayState array) {
-    CHECK(value_to_array.try_emplace(value, array).second);
+  // Associates array with an opaque handle.
+  void AssociateArray(ArrayHandle handle, ArrayState array) {
+    CHECK(handle_to_array.try_emplace(handle, array).second);
   }
 
-  // Map from MLIR value to IFRT array corresponding to the value.
-  llvm::DenseMap<mlir::Value, ArrayState> value_to_array;
+  // IFRT client for execution.
+  xla::ifrt::Client* client;
+  // Name of the program.
+  std::string program_name;
+  // Set of donated program arguments, which can be deleted after their last
+  // use. Entries are removed upon deletion or if they are aliased.
+  absl::flat_hash_set<ArrayHandle> deletable_program_arguments;
+  // Map from an opaque handle to IFRT array corresponding to the value.
+  absl::flat_hash_map<ArrayHandle, ArrayState> handle_to_array;
   // Outputs of the program.
   std::vector<ArrayRef> outputs;
   // `ExecuteOptions.fill_status` passed to Execute().
@@ -179,213 +149,401 @@ struct Environment {
 };
 
 absl::StatusOr<std::unique_ptr<ProgramInterpreter>> ProgramInterpreter::Create(
-    xla::ifrt::Client* client, std::shared_ptr<CompiledIfrtIrProgram> program,
+    xla::ifrt::Client* client, absl::string_view program_name,
+    mlir::ModuleOp mlir_module,
+    std::shared_ptr<xla::ifrt::AtomExecutableMap> atom_program_executables,
     xla::ifrt::DeviceListRef devices) {
-  mlir::func::FuncOp main_func =
-      xla::ifrt::GetMainFunction(program->program->mlir_module);
+  mlir::func::FuncOp main_func = xla::ifrt::GetMainFunction(mlir_module);
   if (!main_func->hasAttr(xla::ifrt::kIfrtFunctionAttrName)) {
-    return absl::InvalidArgumentError(absl::StrCat(
-        "`main` function of IFRT IR program: ", program->program_name,
-        " is not an IFRT function."));
+    return absl::InvalidArgumentError(
+        absl::StrCat("`main` function of IFRT IR program: ", program_name,
+                     " is not an IFRT function."));
   }
-  TF_ASSIGN_OR_RETURN(auto array_type_to_sharding,
-                      PopulateShardingCache(main_func, client, devices));
   return std::unique_ptr<ProgramInterpreter>(new ProgramInterpreter(
-      client, std::move(program), std::move(devices), mlir::Liveness(main_func),
-      std::move(array_type_to_sharding)));
+      client, program_name, mlir_module, std::move(atom_program_executables),
+      std::move(devices), mlir::Liveness(main_func)));
 }
 
-absl::StatusOr<ExecuteResult> ProgramInterpreter::Execute(
-    absl::Span<ArrayRef> arrays, const ExecuteOptions& options,
-    std::optional<xla::ifrt::DeviceListRef> devices) {
-  TraceMe traceme([&]() {
-    return TraceMeEncode("DispatchProgram",
-                         {
-                             {"ifrt_ir_program", program_->program_name},
-                         });
-  });
-  VLOG(2) << "Started interpreting program: " << program_->program_name;
-  mlir::func::FuncOp main_func =
-      xla::ifrt::GetMainFunction(program_->program->mlir_module);
-  if (arrays.size() != main_func.getNumArguments()) {
-    return absl::InvalidArgumentError(absl::StrCat(
-        "`main` function of IFRT IR program: ", program_->program_name,
-        " invoked with ", arrays.size(), " arguments, but it expects ",
-        main_func.getNumArguments(), " arguments."));
-  }
+namespace {
+
+struct ProgramInterpreterState {
+  xla::ifrt::Client* client;
+  std::string program_name;
 
-  for (const auto& [idx, array] : llvm::enumerate(arrays)) {
-    if (array->IsDeleted()) {
+  std::vector<ArrayHandle> input_handles;
+  absl::flat_hash_set<int> donated_input_indices;
+
+  std::vector<absl::AnyInvocable<absl::Status(Environment& env) const>> op_fns;
+
+  absl::StatusOr<xla::ifrt::LoadedExecutable::ExecuteResult> Run(
+      absl::Span<xla::ifrt::ArrayRef> arrays,
+      const xla::ifrt::LoadedExecutable::ExecuteOptions& options,
+      std::optional<xla::ifrt::DeviceListRef> devices) const {
+    TraceMe traceme([&]() {
+      return TraceMeEncode("DispatchProgram",
+                           {{"ifrt_ir_program", program_name}});
+    });
+    VLOG(2) << "Started interpreting program: " << program_name;
+
+    if (arrays.size() != input_handles.size()) {
       return absl::InvalidArgumentError(absl::StrCat(
-          "Input array #", idx, " of program ", program_->program_name,
-          " has already been deleted or donated."));
+          "`main` function of IFRT IR program: ", program_name,
+          " invoked with ", arrays.size(), " arguments, but it expects ",
+          input_handles.size(), " arguments."));
     }
-  }
 
-  Environment env;
-  env.fill_status = options.fill_status;
+    for (int idx = 0; idx < arrays.size(); ++idx) {
+      const xla::ifrt::ArrayRef& array = arrays[idx];
+      if (array->IsDeleted()) {
+        return absl::InvalidArgumentError(
+            absl::StrCat("Input array #", idx, " of program ", program_name,
+                         " has already been deleted or donated."));
+      }
+    }
+
+    Environment env;
+    env.client = client;
+    env.fill_status = options.fill_status;
+    for (int idx = 0; idx < input_handles.size(); ++idx) {
+      // Add to the environment the arrays that are used.
+      bool is_donated = donated_input_indices.contains(idx) &&
+                        !options.non_donatable_input_indices.contains(idx);
+      const ArrayHandle handle = input_handles[idx];
+      if (handle != 0) {
+        env.AssociateArray(handle, ArrayState{
+                                       /*array=*/arrays[idx],
+                                       /*can_be_donated=*/is_donated,
+                                   });
+        if (is_donated) {
+          env.deletable_program_arguments.insert(handle);
+        }
+      } else if (is_donated) {
+        // If the argument is donated but not used, it can be deleted.
+        arrays[idx]->Delete();
+      }
+    }
+
+    for (const auto& op_fn : op_fns) {
+      TF_RETURN_IF_ERROR(op_fn(env));
+    }
+
+    VLOG(2) << "Finished interpreting program: " << program_name;
+    ExecuteResult result;
+    if (env.fill_status) {
+      result.status =
+          tsl::JoinFutures(absl::MakeSpan(env.leaf_call_op_futures));
+    }
+    result.outputs = std::move(env.outputs);
+    return result;
+  };
+};
+
+}  // namespace
+
+absl::StatusOr<ProgramInterpreter::ExecuteFn>
+ProgramInterpreter::BuildExecuteFn() {
+  ProgramInterpreterState state;
+  state.client = client_;
+  state.program_name = program_name_;
+
+  mlir::func::FuncOp main_func = xla::ifrt::GetMainFunction(mlir_module_);
+
   for (const auto [idx, arg] : llvm::enumerate(main_func.getArguments())) {
     // Add to the environment the arrays that are used.
-    bool is_donated = main_func.getArgAttr(
-                          idx, xla::ifrt::kIfrtDonatedArgAttrName) != nullptr &&
-                      !options.non_donatable_input_indices.contains(idx);
-    if (!arg.use_empty()) {
-      env.AssociateArray(arg, ArrayState{/*array=*/arrays[idx],
-                                         /*can_be_donated=*/is_donated});
-      if (is_donated) {
-        deletable_program_arguments_.insert(arg);
-      }
-    } else if (is_donated) {
-      // If the argument is donated but not used, it can be deleted.
-      arrays[idx]->Delete();
+    const ArrayHandle handle = arg.use_empty() ? 0 : ToArrayHandle(arg);
+    state.input_handles.push_back(handle);
+    if (main_func.getArgAttr(idx, xla::ifrt::kIfrtDonatedArgAttrName) !=
+        nullptr) {
+      state.donated_input_indices.insert(idx);
     }
   }
 
-  // Walk ops one-by-one in program order, and dispatch atom program and
-  // copy arrays.
+  // Walk ops one-by-one in program order and create functions that execute each
+  // op on a given environment.
   for (mlir::Operation& op : main_func.getOps()) {
-    auto exec_op_status =
-        llvm::TypeSwitch<const mlir::Operation&, absl::Status>(op)
+    auto op_fn =
+        llvm::TypeSwitch<const mlir::Operation&, absl::StatusOr<OpFn>>(op)
             .Case<xla::ifrt::CallLoadedExecutableOp, xla::ifrt::RemapArraysOp,
                   xla::ifrt::CopyArraysOp, mlir::func::ReturnOp>(
-                [&](const auto& op) { return ExecuteOp(op, env); })
-            .Default([&](const auto& op) {
+                [this](const auto& op) { return HandleOp(op); })
+            .Default([](const mlir::Operation& op) {
               return absl::InvalidArgumentError(absl::StrCat(
                   "Interpreter found unexpected op: ", mlir::debugString(op)));
             });
-    if (!exec_op_status.ok()) {
-      tsl::errors::AppendToMessage(&exec_op_status, PrettyPrint(&op));
-      return exec_op_status;
+    if (!op_fn.ok()) {
+      absl::Status status = op_fn.status();
+      tsl::errors::AppendToMessage(&status, PrettyPrint(&op));
+      return status;
     }
+    state.op_fns.push_back(
+        [op_fn = *std::move(op_fn),
+         pretty_print = PrettyPrint(&op)](Environment& env) -> absl::Status {
+          absl::Status status = op_fn(env);
+          tsl::errors::AppendToMessage(&status, pretty_print);
+          return status;
+        });
   }
 
-  VLOG(2) << "Finished interpreting program: " << program_->program_name;
-  ExecuteResult result;
-  if (env.fill_status) {
-    result.status = tsl::JoinFutures(absl::MakeSpan(env.leaf_call_op_futures));
-  }
-  result.outputs = std::move(env.outputs);
-  return result;
+  return absl::bind_front(&ProgramInterpreterState::Run, std::move(state));
 }
 
-absl::Status ProgramInterpreter::ExecuteOp(
-    xla::ifrt::CallLoadedExecutableOp call_loaded_op, Environment& env) {
+namespace {
+
+struct CallLoadedExecutableOpState {
+  std::string pretty_print;
+  std::string atom_program_name;
+
+  std::vector<ArrayHandle> input_handles;
+  absl::flat_hash_set<int> donated_arg_idxs;
+  absl::flat_hash_set<ArrayHandle> dead_inputs;
+
+  xla::ifrt::LoadedExecutable::ExecuteOptions execute_options;
+  std::shared_ptr<xla::ifrt::LoadedExecutable> executable;
+
+  std::vector<ArrayHandle> output_handles;
+  bool is_leaf_op;
+
+  absl::Status Run(Environment& env) const {
+    TraceMe traceme([&]() {
+      return TraceMeEncode("DispatchLoadedExecutableOp",
+                           {
+                               {"ifrt_ir_program", env.program_name},
+                               {"atom_program", atom_program_name},
+                           });
+    });
+    VLOG(3) << pretty_print;
+
+    xla::ifrt::LoadedExecutable::ExecuteOptions options = execute_options;
+    options.fill_status = env.fill_status;
+
+    // Get the inputs of the loaded executable.
+    std::vector<ArrayRef> inputs;
+    std::vector<ArrayHandle> arrays_to_remove;
+    for (int idx = 0; idx < input_handles.size(); ++idx) {
+      const ArrayHandle handle = input_handles[idx];
+
+      auto array_it = env.handle_to_array.find(handle);
+      TF_RET_CHECK(array_it != env.handle_to_array.end())
+          << "Input array #" << idx << " not found. " << pretty_print;
+      if (array_it->second.array->IsDeleted()) {
+        // We explicitly check here for deletion in order to provide a more
+        // informative error message.
+        return absl::InvalidArgumentError(absl::StrCat(
+            "Input array #", idx, "` has already been deleted or donated. ",
+            pretty_print));
+      }
+      inputs.push_back(array_it->second.array);
+
+      bool is_donated = donated_arg_idxs.contains(idx);
+      if (is_donated && !array_it->second.can_be_donated) {
+        VLOG(2) << "Atom program donates input #" << idx
+                << ", but it has not been donated to the IFRT IR program. "
+                   "Input will not be donated. \n"
+                << pretty_print;
+        is_donated = false;
+      }
+      if (is_donated || dead_inputs.contains(handle)) {
+        arrays_to_remove.push_back(handle);
+      }
+      if (!is_donated) {
+        options.non_donatable_input_indices.insert(idx);
+      }
+    }
+
+    TF_ASSIGN_OR_RETURN(xla::ifrt::LoadedExecutable::ExecuteResult result,
+                        executable->Execute(absl::MakeSpan(inputs), options,
+                                            /*devices=*/std::nullopt));
+    TF_RET_CHECK(result.outputs.size() == output_handles.size())
+        << "Got " << result.outputs.size() << " results, but atom program has "
+        << output_handles.size() << ". " << pretty_print;
+
+    // Remove the arrays from the environment after the inputs vector is
+    // created. This is because in situations such as `ifrt.Call(%0, %0)` the
+    // liveness analysis will return that %0 is dead, but it's used for the
+    // second argument.
+    for (const auto handle : arrays_to_remove) {
+      if (env.deletable_program_arguments.erase(handle)) {
+        // Explicitly delete donated program arguments that are not used later.
+        env.handle_to_array[handle].array->Delete();
+      }
+      env.handle_to_array.erase(handle);
+    }
+
+    for (int i = 0; i < output_handles.size(); ++i) {
+      const ArrayHandle handle = output_handles[i];
+      if (handle != 0) {
+        // The output array is kept only if it used later. This can happen if an
+        // executable has multiple output arrays, but only some of them are
+        // used.
+        env.AssociateArray(handle, ArrayState{
+                                       /*array=*/std::move(result.outputs[i]),
+                                       /*can_be_donated=*/true,
+                                   });
+      }
+    }
+    if (is_leaf_op && env.fill_status) {
+      env.leaf_call_op_futures.push_back(std::move(result.status));
+    }
+    return absl::OkStatus();
+  }
+};
+
+}  // namespace
+
+absl::StatusOr<ProgramInterpreter::OpFn> ProgramInterpreter::HandleOp(
+    xla::ifrt::CallLoadedExecutableOp call_loaded_op) {
+  CallLoadedExecutableOpState state;
+  state.pretty_print = PrettyPrint(call_loaded_op);
+
   xla::ifrt::LoadedExecutableOp loaded_exec_op =
       call_loaded_op.getCalleeOp(symbol_table_);
-  std::string atom_program_name = loaded_exec_op.getSymName().str();
-  TraceMe traceme([&]() {
-    return TraceMeEncode("DispatchLoadedExecutableOp",
-                         {
-                             {"ifrt_ir_program", program_->program_name},
-                             {"atom_program", atom_program_name},
-                         });
-  });
-  std::string op_name = call_loaded_op->getName().getStringRef().str();
-  VLOG(3) << PrettyPrint(call_loaded_op);
+  state.atom_program_name = loaded_exec_op.getSymName().str();
+
   // Get the loaded executable for the atom program.
-  auto exec_it = program_->atom_program_executables->find(atom_program_name);
-  TF_RET_CHECK(exec_it != program_->atom_program_executables->end())
-      << "Could not find executable. " << PrettyPrint(call_loaded_op);
+  auto exec_it = atom_program_executables_->find(state.atom_program_name);
+  TF_RET_CHECK(exec_it != atom_program_executables_->end())
+      << "Could not find executable. " << state.pretty_print;
+  state.executable = exec_it->second;
 
-  absl::flat_hash_set<int> donated_arg_idxs(
-      call_loaded_op.getDonatedInputIndices().begin(),
-      call_loaded_op.getDonatedInputIndices().end());
+  state.donated_arg_idxs.insert(call_loaded_op.getDonatedInputIndices().begin(),
+                                call_loaded_op.getDonatedInputIndices().end());
   for (const auto& io_alias :
        call_loaded_op.getIoAliases().getAsRange<mlir::DenseI32ArrayAttr>()) {
     // Insert the aliased input to the set.
-    donated_arg_idxs.insert(io_alias.asArrayRef()[0]);
+    state.donated_arg_idxs.insert(io_alias.asArrayRef()[0]);
   }
-  // Get the inputs of the loaded executable.
-  std::vector<ArrayRef> inputs;
-  xla::ifrt::LoadedExecutable::ExecuteOptions execute_options;
-  execute_options.fill_status = env.fill_status;
-  llvm::DenseSet<mlir::Value> array_values_to_gc_from_env;
-  for (const auto [idx, input] : llvm::enumerate(call_loaded_op.getInputs())) {
-    auto array_it = env.value_to_array.find(input);
-    TF_RET_CHECK(array_it != env.value_to_array.end())
-        << "Input array #" << idx << " not found. "
-        << PrettyPrint(call_loaded_op);
-    if (array_it->second.array->IsDeleted()) {
-      // We explicitly check here for deletion in order to provide a more
-      // informative error message.
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Input array #", idx, "` has already been deleted or donated. ",
-          PrettyPrint(call_loaded_op)));
-    }
-    inputs.push_back(array_it->second.array);
-
-    bool is_donated = donated_arg_idxs.contains(idx);
-    if (is_donated && !array_it->second.can_be_donated) {
-      VLOG(2) << "Atom program donates input #" << idx
-              << ", but it has not been donated to the IFRT IR program. "
-                 "Input will not be donated. \n"
-              << PrettyPrint(call_loaded_op);
-      is_donated = false;
-    }
-    if (is_donated || liveness_.isDeadAfter(input, call_loaded_op)) {
-      array_values_to_gc_from_env.insert(input);
-    }
-    if (!is_donated) {
-      execute_options.non_donatable_input_indices.insert(idx);
+  for (const auto input : call_loaded_op.getInputs()) {
+    state.input_handles.push_back(ToArrayHandle(input));
+    if (liveness_.isDeadAfter(input, call_loaded_op)) {
+      state.dead_inputs.insert(ToArrayHandle(input));
     }
   }
 
-  TF_ASSIGN_OR_RETURN(
-      xla::ifrt::LoadedExecutable::ExecuteResult result,
-      exec_it->second->Execute(absl::MakeSpan(inputs), execute_options,
-                               /*devices=*/std::nullopt));
-  TF_RET_CHECK(result.outputs.size() == call_loaded_op.getOutputs().size())
-      << "Got " << result.outputs.size() << " results, but atom program has "
-      << call_loaded_op.getOutputs().size() << ". "
-      << PrettyPrint(call_loaded_op);
-
-  // Remove the arrays from the environment after the inputs vector is created.
-  // This is because in situations such as `ifrt.Call(%0, %0)` the liveness
-  // analysis will return that %0 is dead, but it's used for the second
-  // argument.
-  for (const auto& array_value : array_values_to_gc_from_env) {
-    if (deletable_program_arguments_.erase(array_value)) {
-      // Explicitly delete donated program arguments that are not used later.
-      env.value_to_array[array_value].array->Delete();
-    }
-    env.value_to_array.erase(array_value);
-  }
+  state.is_leaf_op = true;
+  for (const auto output : call_loaded_op.getOutputs()) {
+    const ArrayHandle handle = output.use_empty() ? 0 : ToArrayHandle(output);
+    state.output_handles.push_back(handle);
 
-  bool is_leaf_op = true;
-  for (const auto [output_array, output] :
-       llvm::zip(result.outputs, call_loaded_op.getOutputs())) {
-    if (!output.use_empty()) {
-      // The output array is kept only if it used later. This can happen if
-      // an executable has multiple output arrays, but only some of them are
-      // used.
-      env.AssociateArray(output, ArrayState{/*array=*/std::move(output_array),
-                                            /*can_be_donated=*/true});
-    }
-    if (is_leaf_op) {
+    if (state.is_leaf_op) {
       for (mlir::OpOperand& use : output.getUses()) {
         // An ifrt.CallOp is not a leaf if any of its outputs are not returned.
         if (llvm::dyn_cast<mlir::func::ReturnOp>(use.getOwner()) == nullptr) {
-          is_leaf_op = false;
+          state.is_leaf_op = false;
           break;
         }
       }
     }
   }
-  if (is_leaf_op && env.fill_status) {
-    env.leaf_call_op_futures.push_back(std::move(result.status));
-  }
 
-  return absl::OkStatus();
+  return absl::bind_front(&CallLoadedExecutableOpState::Run, std::move(state));
 }
 
-absl::Status ProgramInterpreter::ExecuteOp(xla::ifrt::RemapArraysOp remap_op,
-                                           Environment& env) {
-  TraceMe traceme([&]() {
-    return TraceMeEncode("DispatchRemapArraysOp",
-                         {{"ifrt_ir_program", program_->program_name}});
-  });
-  std::string op_name = remap_op->getName().getStringRef().str();
-  VLOG(3) << PrettyPrint(remap_op);
+namespace {
+
+struct RemapArraysOpState {
+  std::string pretty_print;
+
+  xla::ifrt::RemapPlan remap_plan;
+  std::vector<ArrayHandle> input_handles;
+  absl::flat_hash_set<ArrayHandle> dead_inputs;
+  bool remap_is_donated;
+
+  std::vector<ArrayHandle> output_handles;
+
+  absl::Status Run(Environment& env) const {
+    TraceMe traceme([&]() {
+      return TraceMeEncode("DispatchRemapArraysOp",
+                           {{"ifrt_ir_program", env.program_name}});
+    });
+    VLOG(3) << pretty_print;
+
+    std::vector<ArrayRef> inputs;
+    inputs.reserve(remap_plan.input_specs.size());
+
+    std::optional<bool> is_donated;
+    std::vector<ArrayHandle> arrays_to_remove;
+
+    for (int idx = 0; idx < input_handles.size(); ++idx) {
+      const ArrayHandle handle = input_handles[idx];
+
+      auto array_it = env.handle_to_array.find(handle);
+      TF_RET_CHECK(array_it != env.handle_to_array.end())
+          << "Input array #" << idx << " not found. " << pretty_print;
+      if (array_it->second.array->IsDeleted()) {
+        // We explicitly check here for deletion in order to provide a more
+        // informative error message.
+        return absl::InvalidArgumentError(absl::StrCat(
+            "Input array #", idx, "` has already been deleted or donated. ",
+            pretty_print));
+      }
+      inputs.push_back(array_it->second.array);
+
+      // The default buffer donation semantic is finalized at compilation time.
+      // Users can override the donation semantic at runtime. In the meantime,
+      // the IFRT client RemapArrays API requires all input arrays have the same
+      // donation semantic.
+      if (!is_donated.has_value()) {
+        is_donated = remap_is_donated && array_it->second.can_be_donated;
+      }
+      if (*is_donated && !array_it->second.can_be_donated) {
+        return absl::InvalidArgumentError(absl::StrCat(
+            "Donation semantic must be consistent across all input arrays of "
+            "RemapArraysOp. Input array #",
+            idx,
+            " cannot be donated, but previous input arrays can be donated. "
+            "It's likely due to a MPMD program argument is marked as "
+            "non-donatable. ",
+            pretty_print));
+      }
+      if (*is_donated || dead_inputs.contains(handle)) {
+        arrays_to_remove.push_back(handle);
+      }
+    }
+    TF_RET_CHECK(is_donated.has_value())
+        << "Unable to determine the donation semantic of the remap op. The "
+           "remap op has no inputs. "
+        << pretty_print;
+
+    // Apply the remap arrays operation.
+    xla::ifrt::ArrayCopySemantics copy_semantics =
+        *is_donated ? xla::ifrt::ArrayCopySemantics::kDonateInput
+                    : xla::ifrt::ArrayCopySemantics::kReuseInput;
+    TF_ASSIGN_OR_RETURN(auto out_arrays, env.client->RemapArrays(
+                                             remap_plan, absl::MakeSpan(inputs),
+                                             copy_semantics));
+
+    for (const auto handle : arrays_to_remove) {
+      // Donated remapped arrays are pro-actively deleted, and aliased arrays
+      // cannot be deleted later. Thus, remove the arrays from the deletable
+      // program arguments set.
+      env.deletable_program_arguments.erase(handle);
+      env.handle_to_array.erase(handle);
+    }
+
+    // Store the result arrays in the environment.
+    TF_RET_CHECK(out_arrays.size() == remap_plan.output_specs.size())
+        << "Got " << out_arrays.size() << " results, but op has "
+        << remap_plan.output_specs.size() << ". " << pretty_print;
+    for (int i = 0; i < output_handles.size(); ++i) {
+      const ArrayHandle handle = output_handles[i];
+      if (handle != 0) {
+        env.AssociateArray(handle, ArrayState{
+                                       /*array=*/std::move(out_arrays[i]),
+                                       /*can_be_donated=*/true,
+                                   });
+      }
+    }
+
+    return absl::OkStatus();
+  }
+};
+
+}  // namespace
+
+absl::StatusOr<ProgramInterpreter::OpFn> ProgramInterpreter::HandleOp(
+    xla::ifrt::RemapArraysOp remap_op) {
+  RemapArraysOpState state;
+  state.pretty_print = PrettyPrint(remap_op);
 
   // Construct the mappings of the remap plan.
   auto mappings =
@@ -410,54 +568,28 @@ absl::Status ProgramInterpreter::ExecuteOp(xla::ifrt::RemapArraysOp remap_op,
     }
   };
 
-  std::vector<ArrayRef> inputs;
-  std::vector<xla::ifrt::ArraySpec> input_specs;
-  inputs.reserve(remap_op.getInputs().size());
-  input_specs.reserve(remap_op.getInputs().size());
   // Get the input specs of the remap plan and the input arrays.
-  llvm::DenseSet<mlir::Value> array_values_to_gc_from_env;
-  std::optional<bool> is_donated;
+  std::vector<xla::ifrt::ArraySpec> input_specs;
+  input_specs.reserve(remap_op.getOutputs().size());
   for (const auto [idx, input] : llvm::enumerate(remap_op.getInputs())) {
-    auto array_it = env.value_to_array.find(input);
-    TF_RET_CHECK(array_it != env.value_to_array.end())
-        << "Input array #" << idx << " not found. " << PrettyPrint(remap_op);
-    if (array_it->second.array->IsDeleted()) {
-      // We explicitly check here for deletion in order to provide a more
-      // informative error message.
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Input array #", idx, " has already been deleted or donated. ",
-          PrettyPrint(remap_op)));
-    }
-    inputs.push_back(array_it->second.array);
+    state.input_handles.push_back(ToArrayHandle(input));
+
+    const auto array_type =
+        llvm::cast<xla::ifrt::IfrtArrayType>(input.getType());
+    TF_ASSIGN_OR_RETURN(
+        xla::ifrt::DType dtype,
+        xla::ifrt::ToIfrtDType(array_type.getShape().getElementType()));
+    TF_ASSIGN_OR_RETURN(xla::ifrt::ShardingRef sharding,
+                        GetSharding(array_type, client_, devices_));
     input_specs.push_back(xla::ifrt::ArraySpec{
-        /*dtype=*/array_it->second.array->dtype(),
-        /*shape=*/array_it->second.array->shape(),
-        /*sharding=*/array_it->second.array->shared_ptr_sharding()});
-
-    // The default buffer donation semantic is finalized at compilation time.
-    // Users can override the donation semantic at runtime. In the meantime, the
-    // IFRT client RemapArrays API requires all input arrays have the same
-    // donation semantic.
-    if (!is_donated.has_value()) {
-      is_donated = remap_op.getDonated() && array_it->second.can_be_donated;
-    }
-    if (*is_donated && !array_it->second.can_be_donated) {
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Donation semantic must be consistent across all input arrays of "
-          "RemapArraysOp. Input array #",
-          idx,
-          " cannot be donated, but previous input arrays can be donated. It's "
-          "likely due to a MPMD program argument is marked as non-donatable. ",
-          PrettyPrint(remap_op)));
-    }
-    if (*is_donated || liveness_.isDeadAfter(input, remap_op)) {
-      array_values_to_gc_from_env.insert(input);
+        /*dtype=*/dtype,
+        /*shape=*/xla::ifrt::Shape(array_type.getShape().getShape()),
+        /*sharding=*/std::move(sharding)});
+
+    if (liveness_.isDeadAfter(input, remap_op)) {
+      state.dead_inputs.insert(ToArrayHandle(input));
     }
   }
-  TF_RET_CHECK(is_donated.has_value())
-      << "Unable to determine the donation semantic of the remap op. The remap "
-         "op has no inputs. "
-      << PrettyPrint(remap_op);
 
   // Get the output specs of the remap plan.
   std::vector<xla::ifrt::ArraySpec> output_specs;
@@ -468,153 +600,196 @@ absl::Status ProgramInterpreter::ExecuteOp(xla::ifrt::RemapArraysOp remap_op,
     TF_ASSIGN_OR_RETURN(
         xla::ifrt::DType dtype,
         xla::ifrt::ToIfrtDType(array_type.getShape().getElementType()));
+    TF_ASSIGN_OR_RETURN(xla::ifrt::ShardingRef sharding,
+                        GetSharding(array_type, client_, devices_));
     output_specs.push_back(xla::ifrt::ArraySpec{
         /*dtype=*/dtype,
         /*shape=*/xla::ifrt::Shape(array_type.getShape().getShape()),
-        /*sharding=*/array_type_to_sharding_.at(array_type)});
+        /*sharding=*/std::move(sharding)});
   }
 
-  // Apply the remap arrays operation.
-  xla::ifrt::ArrayCopySemantics copy_semantics =
-      *is_donated ? xla::ifrt::ArrayCopySemantics::kDonateInput
-                  : xla::ifrt::ArrayCopySemantics::kReuseInput;
-  TF_ASSIGN_OR_RETURN(
-      auto out_arrays,
-      client_->RemapArrays({
-                               /*input_specs=*/std::move(input_specs),
-                               /*output_specs=*/std::move(output_specs),
-                               /*mappings=*/std::move(mappings),
-                           },
-                           absl::MakeSpan(inputs), copy_semantics));
-
-  for (const auto& array_value : array_values_to_gc_from_env) {
-    // Donated remapped arrays are pro-actively deleted, and aliased arrays
-    // cannot be deleted later. Thus, remove the arrays from the deletable
-    // program arguments set.
-    deletable_program_arguments_.erase(array_value);
-    env.value_to_array.erase(array_value);
-  }
+  state.remap_plan = xla::ifrt::RemapPlan{
+      /*input_specs=*/std::move(input_specs),
+      /*output_specs=*/std::move(output_specs),
+      /*mappings=*/std::move(mappings),
+  };
+  state.remap_is_donated = remap_op.getDonated();
 
-  // Store the result arrays in the environment.
-  TF_RET_CHECK(out_arrays.size() == remap_op.getOutputs().size())
-      << "Got " << out_arrays.size() << " results, but op has "
-      << remap_op.getOutputs().size() << ". " << PrettyPrint(remap_op);
-  for (const auto [output_array, output] :
-       llvm::zip(out_arrays, remap_op.getOutputs())) {
-    if (!output.use_empty()) {
-      env.AssociateArray(output, ArrayState{/*array=*/std::move(output_array),
-                                            /*can_be_donated=*/true});
-    }
+  for (const auto output : remap_op.getOutputs()) {
+    const ArrayHandle handle = output.use_empty() ? 0 : ToArrayHandle(output);
+    state.output_handles.push_back(handle);
   }
-  return absl::OkStatus();
+
+  return absl::bind_front(&RemapArraysOpState::Run, std::move(state));
 }
 
-absl::Status ProgramInterpreter::ExecuteOp(
-    xla::ifrt::CopyArraysOp copy_arrays_op, Environment& env) {
-  TraceMe traceme([&]() {
-    return TraceMeEncode("DispatchCopyArraysOp",
-                         {{"ifrt_ir_program", program_->program_name}});
-  });
-  std::string op_name = copy_arrays_op->getName().getStringRef().str();
-  VLOG(3) << PrettyPrint(copy_arrays_op);
-
-  std::vector<ArrayRef> inputs;
-  inputs.reserve(copy_arrays_op.getInputs().size());
-  llvm::DenseSet<mlir::Value> array_values_to_gc_from_env;
-  std::optional<bool> is_donated;
-  for (const auto [idx, input] : llvm::enumerate(copy_arrays_op.getInputs())) {
-    auto array_it = env.value_to_array.find(input);
-    TF_RET_CHECK(array_it != env.value_to_array.end())
-        << "Input array #" << idx << " not found. "
-        << PrettyPrint(copy_arrays_op);
-    if (array_it->second.array->IsDeleted()) {
-      // We explicitly check here for deletion in order to provide a more
-      // informative error message.
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Input array #", idx, " has already been deleted or donated. ",
-          PrettyPrint(copy_arrays_op)));
+namespace {
+
+struct CopyArraysOpState {
+  std::string pretty_print;
+
+  std::vector<ArrayHandle> input_handles;
+  absl::flat_hash_set<ArrayHandle> dead_inputs;
+  bool copy_is_donated;
+
+  std::vector<ArrayHandle> output_handles;
+  xla::ifrt::ShardingRef new_sharding;
+
+  absl::Status Run(Environment& env) const {
+    TraceMe traceme([&]() {
+      return TraceMeEncode("DispatchCopyArraysOp",
+                           {{"ifrt_ir_program", env.program_name}});
+    });
+    VLOG(3) << pretty_print;
+
+    std::vector<ArrayRef> inputs;
+    inputs.reserve(input_handles.size());
+
+    std::optional<bool> is_donated;
+    std::vector<ArrayHandle> arrays_to_remove;
+
+    for (int idx = 0; idx < input_handles.size(); ++idx) {
+      const ArrayHandle handle = input_handles[idx];
+
+      auto array_it = env.handle_to_array.find(handle);
+      TF_RET_CHECK(array_it != env.handle_to_array.end())
+          << "Input array #" << idx << " not found. " << pretty_print;
+      if (array_it->second.array->IsDeleted()) {
+        // We explicitly check here for deletion in order to provide a more
+        // informative error message.
+        return absl::InvalidArgumentError(absl::StrCat(
+            "Input array #", idx, " has already been deleted or donated. ",
+            pretty_print));
+      }
+      inputs.push_back(array_it->second.array);
+
+      // The default buffer donation semantic is finalized at compilation time.
+      // Users can override the donation semantic at runtime. In the meantime,
+      // the IFRT client CopyArrays API requires all input arrays have the same
+      // donation semantic.
+      if (!is_donated.has_value()) {
+        is_donated = copy_is_donated && array_it->second.can_be_donated;
+      }
+      if (*is_donated && !array_it->second.can_be_donated) {
+        return absl::InvalidArgumentError(absl::StrCat(
+            "Donation semantic must be consistent across all input arrays of "
+            "CopyArraysOp. Input array #",
+            idx,
+            " cannot be donated, but previous input arrays can be donated. "
+            "It's likely due to a MPMD program argument is marked as "
+            "non-donatable. ",
+            pretty_print));
+      }
+      if (*is_donated || dead_inputs.contains(handle)) {
+        arrays_to_remove.push_back(handle);
+      }
     }
-    inputs.push_back(array_it->second.array);
-
-    // The default buffer donation semantic is finalized at compilation time.
-    // Users can override the donation semantic at runtime. In the meantime, the
-    // IFRT client CopyArrays API requires all input arrays have the same
-    // donation semantic.
-    if (!is_donated.has_value()) {
-      is_donated =
-          copy_arrays_op.getDonated() && array_it->second.can_be_donated;
+    TF_RET_CHECK(is_donated.has_value())
+        << "Unable to determine the donation semantic of the copy arrays op. "
+           "The copy arrays op has no inputs. "
+        << pretty_print;
+
+    auto array_copy_semantics =
+        *is_donated ? xla::ifrt::ArrayCopySemantics::kDonateInput
+                    : xla::ifrt::ArrayCopySemantics::kAlwaysCopy;
+    // It is safe to get the devices and memory kind from the first output
+    // because all outputs use the same devices and have the same memory kind.
+    TF_ASSIGN_OR_RETURN(auto copied_arrays,
+                        env.client->CopyArrays(
+                            absl::MakeSpan(inputs), new_sharding->devices(),
+                            new_sharding->memory_kind(), array_copy_semantics));
+
+    for (const auto handle : arrays_to_remove) {
+      if (env.deletable_program_arguments.erase(handle)) {
+        // Explicitly delete donated program arguments that are not used later.
+        env.handle_to_array[handle].array->Delete();
+      }
+      env.handle_to_array.erase(handle);
     }
-    if (*is_donated && !array_it->second.can_be_donated) {
-      return absl::InvalidArgumentError(absl::StrCat(
-          "Donation semantic must be consistent across all input arrays of "
-          "CopyArraysOp. Input array #",
-          idx,
-          " cannot be donated, but previous input arrays can be donated. It's "
-          "likely due to a MPMD program argument is marked as non-donatable. ",
-          PrettyPrint(copy_arrays_op)));
+
+    TF_RET_CHECK(copied_arrays.size() == inputs.size())
+        << "Got " << copied_arrays.size() << " results, but op has "
+        << inputs.size() << ". " << pretty_print;
+    for (int i = 0; i < output_handles.size(); ++i) {
+      const ArrayHandle handle = output_handles[i];
+      if (handle != 0) {
+        env.AssociateArray(handle, ArrayState{
+                                       /*array=*/std::move(copied_arrays[i]),
+                                       /*can_be_donated=*/true,
+                                   });
+      }
     }
-    if (*is_donated || liveness_.isDeadAfter(input, copy_arrays_op)) {
-      array_values_to_gc_from_env.insert(input);
+
+    return absl::OkStatus();
+  }
+};
+
+}  // namespace
+
+absl::StatusOr<ProgramInterpreter::OpFn> ProgramInterpreter::HandleOp(
+    xla::ifrt::CopyArraysOp copy_arrays_op) {
+  CopyArraysOpState state;
+  state.pretty_print = PrettyPrint(copy_arrays_op);
+
+  for (const auto [idx, input] : llvm::enumerate(copy_arrays_op.getInputs())) {
+    state.input_handles.push_back(ToArrayHandle(input));
+    if (liveness_.isDeadAfter(input, copy_arrays_op)) {
+      state.dead_inputs.insert(ToArrayHandle(input));
     }
   }
-  TF_RET_CHECK(is_donated.has_value())
-      << "Unable to determine the donation semantic of the copy arrays op. The "
-         "copy arrays op has no inputs. "
-      << PrettyPrint(copy_arrays_op);
+  state.copy_is_donated = copy_arrays_op.getDonated();
 
   const auto out_array_type = llvm::cast<xla::ifrt::IfrtArrayType>(
       copy_arrays_op.getOutputs().front().getType());
   TF_RET_CHECK(out_array_type != nullptr)
       << "Output array #0 is not of type `IfrtArrayType`. "
-      << PrettyPrint(copy_arrays_op);
-  auto new_sharding = array_type_to_sharding_.at(out_array_type);
-  auto array_copy_semantics = *is_donated
-                                  ? xla::ifrt::ArrayCopySemantics::kDonateInput
-                                  : xla::ifrt::ArrayCopySemantics::kAlwaysCopy;
-  // It is safe to get the devices and memory kind from the first output
-  // because all outputs use the same devices and have the same memory kind.
-  TF_ASSIGN_OR_RETURN(
-      auto copied_arrays,
-      client_->CopyArrays(absl::MakeSpan(inputs), new_sharding->devices(),
-                          new_sharding->memory_kind(), array_copy_semantics));
-
-  for (const auto& array_value : array_values_to_gc_from_env) {
-    if (deletable_program_arguments_.erase(array_value)) {
-      // Explicitly delete donated program arguments that are not used later.
-      env.value_to_array[array_value].array->Delete();
-    }
-    env.value_to_array.erase(array_value);
+      << state.pretty_print;
+  TF_ASSIGN_OR_RETURN(state.new_sharding,
+                      GetSharding(out_array_type, client_, devices_));
+
+  for (const auto output : copy_arrays_op.getOutputs()) {
+    const ArrayHandle handle = output.use_empty() ? 0 : ToArrayHandle(output);
+    state.output_handles.push_back(handle);
   }
 
-  // Store the result arrays in the environment.
-  TF_RET_CHECK(copied_arrays.size() == copy_arrays_op.getOutputs().size())
-      << "Got " << copied_arrays.size() << " results, but op has "
-      << copy_arrays_op.getOutputs().size() << ". "
-      << PrettyPrint(copy_arrays_op);
-  for (const auto [output_array, output] :
-       llvm::zip(copied_arrays, copy_arrays_op.getOutputs())) {
-    if (!output.use_empty()) {
-      env.AssociateArray(output, ArrayState{/*array=*/std::move(output_array),
-                                            /*can_be_donated=*/true});
+  return absl::bind_front(&CopyArraysOpState::Run, std::move(state));
+}
+
+namespace {
+
+struct ReturnOpState {
+  std::string pretty_print;
+  std::vector<ArrayHandle> output_handles;
+
+  absl::Status Run(Environment& env) const {
+    VLOG(3) << "func.return of `main` function";
+    env.outputs.reserve(output_handles.size());
+    for (int idx = 0; idx < output_handles.size(); ++idx) {
+      auto array_it = env.handle_to_array.find(output_handles[idx]);
+      TF_RET_CHECK(array_it != env.handle_to_array.end())
+          << "Input array #" << idx << " not found. " << pretty_print;
+      env.outputs.push_back(std::move(array_it->second.array));
     }
+    env.handle_to_array.clear();
+    return absl::OkStatus();
   }
-  return absl::OkStatus();
-}
+};
+
+}  // namespace
+
+absl::StatusOr<ProgramInterpreter::OpFn> ProgramInterpreter::HandleOp(
+    mlir::func::ReturnOp return_op) {
+  ReturnOpState state;
+  state.pretty_print = PrettyPrint(return_op);
 
-absl::Status ProgramInterpreter::ExecuteOp(mlir::func::ReturnOp return_op,
-                                           Environment& env) {
   auto func_op = return_op->getParentOfType<mlir::func::FuncOp>();
   CHECK_EQ(func_op.getSymName().str(), "main");
-  VLOG(3) << return_op->getName().getStringRef().str() << " of `main` function";
-  env.outputs.reserve(return_op->getNumOperands());
+  state.output_handles.reserve(return_op->getNumOperands());
   for (const auto& [idx, result] : llvm::enumerate(return_op.getOperands())) {
-    auto array_it = env.value_to_array.find(result);
-    TF_RET_CHECK(array_it != env.value_to_array.end())
-        << "Input array #" << idx << " not found. " << PrettyPrint(return_op);
-    env.outputs.push_back(std::move(array_it->second.array));
+    state.output_handles.push_back(ToArrayHandle(result));
   }
-  env.value_to_array.clear();
-  return absl::OkStatus();
+
+  return absl::bind_front(&ReturnOpState::Run, std::move(state));
 }
 
 std::string ProgramInterpreter::PrettyPrint(mlir::Operation* op) {
diff --git a/third_party/xla/xla/python/ifrt/ir/program_interpreter.h b/third_party/xla/xla/python/ifrt/ir/program_interpreter.h
index 3f8e8075404185..35158ac1305124 100644
--- a/third_party/xla/xla/python/ifrt/ir/program_interpreter.h
+++ b/third_party/xla/xla/python/ifrt/ir/program_interpreter.h
@@ -21,23 +21,22 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/functional/any_invocable.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
 #include "mlir/Analysis/Liveness.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/SymbolTable.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/client.h"
 #include "xla/python/ifrt/device.h"
 #include "xla/python/ifrt/device_list.h"
 #include "xla/python/ifrt/executable.h"
-#include "xla/python/ifrt/ir/compiled_ifrt_ir_program.h"
-#include "xla/python/ifrt/ir/ifrt_dialect.h"
+#include "xla/python/ifrt/ir/atom_program_compiler.h"
 #include "xla/python/ifrt/ir/ifrt_ops.h"
-#include "xla/python/ifrt/sharding.h"
 
 namespace xla {
 namespace ifrt {
@@ -46,59 +45,75 @@ namespace ifrt {
 struct Environment;
 
 // Interpreter for an IFRT IR program.
+//
+// The program interpreter is responsible for executing an IFRT IR program. The
+// interpreter works in two stages. First, when `BuildExecuteFn` is called, it
+// traverses the program and builds a function that can be invoked to execute
+// the program, which happens only once during compilation. Second, the returned
+// execute function can be called multiple times to interpret the IFRT IR
+// program.
+//
+// This two-stage design has two primary purposes:
+//
+// 1. It allows us to leverage the static information available in the program
+//    as much as possible. For example, `RemapArraysOp` builds its remap plan
+//    during the first stage and the plan is reused for all executions.
+//
+// 2. It avoids running any LLVM/MLIR code during execution. This is
+//    particularly useful in environments where the use of LLVM/MLIR
+//    synchronization primitives may cause deadlocks, e.g., cooperatively
+//    scheduled fibers.
 class ProgramInterpreter {
  public:
+  using ExecuteFn = absl::AnyInvocable<
+      absl::StatusOr<xla::ifrt::LoadedExecutable::ExecuteResult>(
+          absl::Span<xla::ifrt::ArrayRef> arrays,
+          const xla::ifrt::LoadedExecutable::ExecuteOptions& options,
+          std::optional<xla::ifrt::DeviceListRef> devices)>;
+
   static absl::StatusOr<std::unique_ptr<ProgramInterpreter>> Create(
-      xla::ifrt::Client* client, std::shared_ptr<CompiledIfrtIrProgram> program,
+      xla::ifrt::Client* client, absl::string_view program_name,
+      mlir::ModuleOp mlir_module,
+      std::shared_ptr<xla::ifrt::AtomExecutableMap> atom_program_executables,
       xla::ifrt::DeviceListRef devices);
 
-  // Executes the IFRT IR program.
-  absl::StatusOr<xla::ifrt::LoadedExecutable::ExecuteResult> Execute(
-      absl::Span<xla::ifrt::ArrayRef> arrays,
-      const xla::ifrt::LoadedExecutable::ExecuteOptions& options,
-      std::optional<xla::ifrt::DeviceListRef> devices);
+  absl::StatusOr<ExecuteFn> BuildExecuteFn();
 
  private:
+  using OpFn = absl::AnyInvocable<absl::Status(Environment& env) const>;
+
   ProgramInterpreter(
-      xla::ifrt::Client* client, std::shared_ptr<CompiledIfrtIrProgram> program,
-      xla::ifrt::DeviceListRef devices, mlir::Liveness liveness,
-      llvm::DenseMap<xla::ifrt::IfrtArrayType, xla::ifrt::ShardingRef>
-          array_type_to_sharding)
+      xla::ifrt::Client* client, absl::string_view program_name,
+      mlir::ModuleOp mlir_module,
+      std::shared_ptr<xla::ifrt::AtomExecutableMap> atom_program_executables,
+      xla::ifrt::DeviceListRef devices, mlir::Liveness liveness)
       : client_(client),
-        program_(std::move(program)),
+        program_name_(program_name),
+        mlir_module_(mlir_module),
+        atom_program_executables_(std::move(atom_program_executables)),
         devices_(std::move(devices)),
-        liveness_(std::move(liveness)),
-        array_type_to_sharding_(std::move(array_type_to_sharding)) {}
+        liveness_(std::move(liveness)) {}
 
-  absl::Status ExecuteOp(xla::ifrt::CallLoadedExecutableOp call_loaded_op,
-                         Environment& env);
-  absl::Status ExecuteOp(xla::ifrt::RemapArraysOp remap_op, Environment& env);
-  absl::Status ExecuteOp(xla::ifrt::CopyArraysOp copy_arrays_op,
-                         Environment& env);
-  absl::Status ExecuteOp(mlir::func::ReturnOp return_op, Environment& env);
+  absl::StatusOr<OpFn> HandleOp(
+      xla::ifrt::CallLoadedExecutableOp call_loaded_op);
+  absl::StatusOr<OpFn> HandleOp(xla::ifrt::RemapArraysOp remap_op);
+  absl::StatusOr<OpFn> HandleOp(xla::ifrt::CopyArraysOp copy_arrays_op);
+  absl::StatusOr<OpFn> HandleOp(mlir::func::ReturnOp return_op);
 
   // Returns a pretty string representation of the op.
   std::string PrettyPrint(mlir::Operation* op);
 
   xla::ifrt::Client* client_;
   mlir::SymbolTableCollection symbol_table_;
-  std::shared_ptr<CompiledIfrtIrProgram> program_;
+  std::string program_name_;
+  mlir::ModuleOp mlir_module_;
+  std::shared_ptr<xla::ifrt::AtomExecutableMap> atom_program_executables_;
 
   // All the devices the program uses.
   xla::ifrt::DeviceListRef devices_;
 
   // Cached liveness analysis of the IFRT IR program.
   mlir::Liveness liveness_;
-
-  // Mapping between IfrtArrayType and Sharding. This map is used to cache
-  // the Shardings at IFRT IR program compilation time in order to avoid
-  // overheads at execution time.
-  llvm::DenseMap<xla::ifrt::IfrtArrayType, xla::ifrt::ShardingRef>
-      array_type_to_sharding_;
-
-  // Set of donated program arguments, which can be deleted after their last
-  // use. Entries are removed upon deletion or if they are aliased.
-  llvm::DenseSet<mlir::Value> deletable_program_arguments_;
 };
 
 }  // namespace ifrt

From 0de1b0f2ad88d1c14bb825102aa0d10f909a319b Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Mon, 8 Dec 2025 11:24:53 -0800
Subject: [PATCH 040/753] Only show warning about nvml symbol when relevant

PiperOrigin-RevId: 841846078
---
 third_party/xla/xla/stream_executor/cuda/cuda_executor.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 488e0f465f594a..cc487db3345103 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -1837,8 +1837,10 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {
       info.cluster_uuid = fabric_info->cluster_uuid;
       info.clique_id = fabric_info->clique_id;
     } else {
-      LOG(WARNING) << "GPU interconnect information not available: "
-                   << fabric_info.status();
+      if (cc.IsAtLeastHopper() && p2p_link_count.ok() && *p2p_link_count) {
+        LOG(WARNING) << "GPU interconnect information not available: "
+                     << fabric_info.status();
+      }
     }
     desc.set_device_interconnect_info(info);
   }

From d70ec822d2566d485e52c8f52cb0a4b75f75c775 Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Mon, 8 Dec 2025 11:35:17 -0800
Subject: [PATCH 041/753] Add `DCNTopology` and `EndpointAddresses`

PiperOrigin-RevId: 841850899
---
 third_party/xla/opensource_only.files         |  1 +
 third_party/xla/xla/megascale/BUILD           | 28 ++++++++
 third_party/xla/xla/megascale/addresses.proto | 28 ++++++++
 .../xla/xla/megascale/dcn_topology.proto      | 64 +++++++++++++++++++
 .../xla/xla/megascale/package_groups.bzl      |  7 ++
 5 files changed, 128 insertions(+)
 create mode 100644 third_party/xla/xla/megascale/BUILD
 create mode 100644 third_party/xla/xla/megascale/addresses.proto
 create mode 100644 third_party/xla/xla/megascale/dcn_topology.proto
 create mode 100644 third_party/xla/xla/megascale/package_groups.bzl

diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files
index 4a78380bc9dd7d..888a0978aac8f0 100644
--- a/third_party/xla/opensource_only.files
+++ b/third_party/xla/opensource_only.files
@@ -1,6 +1,7 @@
 tensorflow/compiler/xla/backends/cpu/nanort/package_groups.bzl:
 tensorflow/compiler/xla/backends/cpu/package_groups.bzl:
 tensorflow/compiler/xla/internal/package_groups.bzl:
+tensorflow/compiler/xla/megascale/package_groups.bzl:
 tensorflow/compiler/xla/mlir_hlo/WORKSPACE:
 tensorflow/compiler/xla/package_groups.bzl:
 tensorflow/compiler/xla/pjrt/cpu/package_groups.bzl:
diff --git a/third_party/xla/xla/megascale/BUILD b/third_party/xla/xla/megascale/BUILD
new file mode 100644
index 00000000000000..4aa6b4e8f5498e
--- /dev/null
+++ b/third_party/xla/xla/megascale/BUILD
@@ -0,0 +1,28 @@
+load("//xla/megascale:package_groups.bzl", "megascale_package_groups")
+load("//xla/tsl:tsl.bzl", "internal_visibility")
+load(
+    "//xla/tsl/platform:build_config.bzl",
+    "tf_proto_library",
+)
+
+package(
+    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+    default_visibility = internal_visibility([":internal"]),
+    licenses = ["notice"],
+)
+
+megascale_package_groups()
+
+tf_proto_library(
+    name = "dcn_topology_proto",
+    srcs = ["dcn_topology.proto"],
+    create_grpc_library = True,
+    make_default_target_header_only = True,
+)
+
+tf_proto_library(
+    name = "addresses_proto",
+    srcs = ["addresses.proto"],
+    create_grpc_library = True,
+    make_default_target_header_only = True,
+)
diff --git a/third_party/xla/xla/megascale/addresses.proto b/third_party/xla/xla/megascale/addresses.proto
new file mode 100644
index 00000000000000..ad6611335bbc3c
--- /dev/null
+++ b/third_party/xla/xla/megascale/addresses.proto
@@ -0,0 +1,28 @@
+syntax = "proto3";
+
+package xla.megascale.runtime;
+
+option java_multiple_files = true;
+option java_outer_classname = "Runtime";
+
+message HostNetworkAddress {
+  string address = 1;
+  string interface_name = 2;
+  // The host name used for debugging only, and is supplied by pathways or MXLA
+  // coordinator. Do not use this for creating connection to other peers, use
+  // the address above.
+  string host_name_for_debugging = 3;
+}
+
+// NetworkAddressMapping provides mapping between a unique endpoint (slice_id,
+// host_id) and the network address it is reachable at.
+message NetworkAddressMapping {
+  int32 slice_id = 1;
+  int32 host_id = 2;
+  repeated HostNetworkAddress addresses = 3;
+}
+
+// Holds the network address mapping of all endpoints (slice_id, host_id).
+message EndpointAddresses {
+  repeated NetworkAddressMapping address_mappings = 1;
+}
diff --git a/third_party/xla/xla/megascale/dcn_topology.proto b/third_party/xla/xla/megascale/dcn_topology.proto
new file mode 100644
index 00000000000000..de87573195e36b
--- /dev/null
+++ b/third_party/xla/xla/megascale/dcn_topology.proto
@@ -0,0 +1,64 @@
+syntax = "proto3";
+
+package xla.megascale.runtime;
+
+option java_multiple_files = true;
+option java_outer_classname = "Runtime";
+
+message DCNTopology {
+  // SymmetricTree represents a simple network topology with symmetric
+  // splitting at each level.
+  message SymmetricTree {
+    // The length of branching_per_layer is the depth (number of distinct
+    // layers) of the network topology. The values give the branching factor at
+    // each layer. Index 0 holds the uppermost level in the topology. For
+    // example: a 24 slice topology, in three groups of two subgroups of four
+    // slices would be represented as: branching_per_layer =
+    // [3, 2, 4] slice_ids are not explicitly specified and are assumed to be
+    // contiguously assigned. i.e. slice_id = branching_per_layer[0] * 8  +
+    // branching_per_layer[1] * 4 + branching_per_layer[2]
+    repeated int32 branching_per_layer = 1;
+  }
+
+  // Node recursively defines a fully specified tree. The tree is expected to
+  // be balanced but allowed to be asymmetric.
+  message TreeNode {
+    // Contiguous range of slices in half-open interval [slice_id_start,
+    // slice_id_end). The contiguous nature has no special signficance beyond
+    // compactly represent large number of slices. e.g. SliceRange{0, 10} and
+    // SliceRange{20, 30} all have the same connectivity between them.
+    message SliceRange {
+      int32 slice_id_start = 1;
+      // Ignored when slice_id_end <= slice_id_start.
+      int32 slice_id_end = 2;
+    }
+
+    // Optional label for readability.
+    optional string label = 1;
+
+    // We expect the Topology to be a balanced asymmetric tree. This implies
+    // that at any level we should either have nodes OR slice_ranges.
+    repeated TreeNode nodes = 2;
+    repeated SliceRange slice_ranges = 3;
+
+    // Specifies the degree to which egress from this node to higher layers in
+    // topology is constrained. Valid range [0.0, 1.0]. 0.0 -> no
+    // constraint, 1.0 -> never use. When egress_constraint for a node is higher
+    // than other nodes with which it performas a reduction, it is assigned
+    // shards for reduction with less probability. This will result in fewer
+    // transfers out of these nodes to higher layers in topology.
+    optional float egress_constraint = 4;
+
+    // Whether to perform the ring algorithm instead of the shuffle algorithm
+    // between the children. The ring order is the order of the children.
+    bool ring_transfers = 5;
+  }
+
+  oneof representation {
+    // Simple representation of a symmetric hierarchical network.
+    SymmetricTree symmetric_tree = 1;
+    // Fully specified tree with no assumptions on symmetry and slice id
+    // mappings.
+    TreeNode tree = 2;
+  }
+}
diff --git a/third_party/xla/xla/megascale/package_groups.bzl b/third_party/xla/xla/megascale/package_groups.bzl
new file mode 100644
index 00000000000000..9d3f8d701a735b
--- /dev/null
+++ b/third_party/xla/xla/megascale/package_groups.bzl
@@ -0,0 +1,7 @@
+"""Megascale package_group definitions"""
+
+def megascale_package_groups(name = "megascale_package_groups"):
+    native.package_group(
+        name = "internal",
+        packages = ["//..."],
+    )

From cc327345d12c6e82a0e0c8345b66cfa30844ea16 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Mon, 8 Dec 2025 11:50:09 -0800
Subject: [PATCH 042/753] Optimize remap ops in IFRT IR by performing more work
 at compile time

With two-stage program interpreter, we can now calculate `input_devices_for_output_map` and verify the remap plan at compile time without having to worry about their overheads. The former may improve the performance for runtime implementations that leverage the additional information.

PiperOrigin-RevId: 841856704
---
 third_party/xla/xla/python/ifrt/ir/program_interpreter.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc b/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc
index deddd328048a7d..313977f13d1030 100644
--- a/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc
+++ b/third_party/xla/xla/python/ifrt/ir/program_interpreter.cc
@@ -615,6 +615,9 @@ absl::StatusOr<ProgramInterpreter::OpFn> ProgramInterpreter::HandleOp(
   };
   state.remap_is_donated = remap_op.getDonated();
 
+  TF_RETURN_IF_ERROR(state.remap_plan.ComputeInputDevicesForOutputMap(client_));
+  TF_RETURN_IF_ERROR(state.remap_plan.Validate());
+
   for (const auto output : remap_op.getOutputs()) {
     const ArrayHandle handle = output.use_empty() ? 0 : ToArrayHandle(output);
     state.output_handles.push_back(handle);

From a3f49eccd20479d5f910b481e15c076c741737db Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 8 Dec 2025 12:04:35 -0800
Subject: [PATCH 043/753] [xla] Migrate XLA to MaybeOwningDeviceMemory

PiperOrigin-RevId: 841862688
---
 tensorflow/compiler/jit/xla_launch_util.h     |  1 +
 .../xla/xla/backends/cpu/autotuner/BUILD      |  2 +-
 .../backends/cpu/autotuner/cpu_profiler.cc    |  4 +--
 .../xla/backends/cpu/autotuner/cpu_profiler.h |  6 ++--
 .../xla/xla/backends/cpu/runtime/BUILD        |  2 +-
 .../backends/cpu/runtime/buffer_allocations.h |  7 ++--
 .../xla/xla/backends/gpu/autotuner/BUILD      |  2 +-
 .../backends/gpu/autotuner/gpu_profiler.cc    |  4 +--
 .../xla/xla/backends/interpreter/BUILD        |  2 +-
 .../backends/interpreter/executable_base.cc   |  8 ++---
 third_party/xla/xla/client/BUILD              |  2 +-
 third_party/xla/xla/client/local_client.cc    |  4 +--
 third_party/xla/xla/client/local_client.h     |  2 +-
 third_party/xla/xla/pjrt/BUILD                |  2 +-
 third_party/xla/xla/pjrt/cpu/BUILD            |  2 +-
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    |  6 ++--
 third_party/xla/xla/pjrt/gpu/tfrt/BUILD       |  2 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc  | 14 ++++----
 .../pjrt/gpu/tfrt/tracked_gpu_device_buffer.h |  2 +-
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  6 ++--
 .../xla/xla/pjrt/tracked_device_buffer.h      | 10 +++---
 third_party/xla/xla/service/BUILD             |  6 ++--
 third_party/xla/xla/service/cpu/BUILD         |  2 +-
 .../xla/xla/service/cpu/cpu_executable.cc     | 32 +++++++++----------
 .../xla/xla/service/cpu/cpu_executable.h      | 11 ++++---
 third_party/xla/xla/service/executable.cc     | 10 +++---
 third_party/xla/xla/service/executable.h      | 28 ++++++++--------
 third_party/xla/xla/service/gpu/BUILD         |  2 +-
 .../xla/xla/service/gpu/autotuning/BUILD      |  2 +-
 .../gpu/autotuning/autotuner_compile_util.cc  |  4 +--
 .../xla/xla/service/gpu/gpu_executable.cc     |  6 ++--
 third_party/xla/xla/service/hlo_runner.cc     |  6 ++--
 .../xla/xla/service/transfer_manager.cc       |  5 +--
 .../xla/xla/service/transfer_manager.h        |  4 +--
 third_party/xla/xla/stream_executor/tpu/BUILD |  4 +--
 .../stream_executor/tpu/c_api_conversions.cc  | 16 +++++-----
 .../stream_executor/tpu/c_api_conversions.h   |  7 ++--
 .../xla/xla/stream_executor/tpu/c_api_decl.h  |  6 ++--
 .../tpu/tpu_executable_interface.cc           |  6 ++--
 third_party/xla/xla/tests/BUILD               |  2 +-
 .../xla/xla/tests/buffer_donation_test.cc     |  7 ++--
 41 files changed, 132 insertions(+), 124 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 2876b3a7b96373..401f15587fcf39 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "xla/client/local_client.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/service/maybe_owning_device_memory.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/stream_executor/device_memory_allocator.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
diff --git a/third_party/xla/xla/backends/cpu/autotuner/BUILD b/third_party/xla/xla/backends/cpu/autotuner/BUILD
index 16640e22a3a8f5..81247efe7e118a 100644
--- a/third_party/xla/xla/backends/cpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/cpu/autotuner/BUILD
@@ -46,7 +46,7 @@ cc_library(
         "//xla/backends/autotuner:profiler",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service/cpu:cpu_executable",
         "//xla/tsl/platform:errors",
diff --git a/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.cc b/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.cc
index 8d29f0afdb15ef..6e841511d97c18 100644
--- a/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.cc
+++ b/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/cpu/cpu_executable.h"
 #include "xla/service/executable.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/xla_data.pb.h"
@@ -94,7 +94,7 @@ absl::StatusOr<ProfileResult> CpuProfiler::Profile(
 }
 
 absl::Status CpuProfiler::Execute(
-    Executable* executable, absl::Span<const MaybeOwningDeviceMemory> buffers,
+    Executable* executable, absl::Span<const MaybeOwningDeviceAddress> buffers,
     ExecutionProfile* profile) {
   ExecutableRunOptions run_options;
   run_options.set_execution_profile(profile);
diff --git a/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.h b/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.h
index cb62437957c187..5d5f32c780cd20 100644
--- a/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.h
+++ b/third_party/xla/xla/backends/cpu/autotuner/cpu_profiler.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "xla/backends/autotuner/profiler.h"
 #include "xla/literal.h"
 #include "xla/service/executable.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/xla_data.pb.h"
 
@@ -33,7 +33,7 @@ namespace xla::cpu {
 
 struct LiteralBackedCpuBuffers : public InputBuffers {
   std::vector<Literal> backing_literals;
-  std::vector<MaybeOwningDeviceMemory> buffers;
+  std::vector<MaybeOwningDeviceAddress> buffers;
 };
 
 class CpuProfiler : public Profiler {
@@ -60,7 +60,7 @@ class CpuProfiler : public Profiler {
   explicit CpuProfiler(ProfileOptions options) : options_(options) {}
 
   absl::Status Execute(Executable* executable,
-                       absl::Span<const MaybeOwningDeviceMemory> buffers,
+                       absl::Span<const MaybeOwningDeviceAddress> buffers,
                        ExecutionProfile* profile);
 
  private:
diff --git a/third_party/xla/xla/backends/cpu/runtime/BUILD b/third_party/xla/xla/backends/cpu/runtime/BUILD
index e1e85a5cd4675a..026a0476d92786 100644
--- a/third_party/xla/xla/backends/cpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/BUILD
@@ -56,7 +56,7 @@ cc_library(
     deps = [
         "//xla:util",
         "//xla/service:buffer_assignment",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/stream_executor:device_address",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:inlined_vector",
diff --git a/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h b/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h
index b1f7e8142d2939..d91f41dcec389a 100644
--- a/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h
+++ b/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/util.h"
 
@@ -40,7 +40,8 @@ class BufferAllocations {
 
   explicit BufferAllocations(Buffers buffers);
   explicit BufferAllocations(absl::Span<const se::DeviceAddressBase> buffers);
-  explicit BufferAllocations(absl::Span<const MaybeOwningDeviceMemory> buffers);
+  explicit BufferAllocations(
+      absl::Span<const MaybeOwningDeviceAddress> buffers);
 
   // Returns the device address of buffer at the given index. Returns an error
   // if the index is out of range.
@@ -80,7 +81,7 @@ inline BufferAllocations::BufferAllocations(
       num_buffers_(buffers_.size()) {}
 
 inline BufferAllocations::BufferAllocations(
-    absl::Span<const MaybeOwningDeviceMemory> buffers)
+    absl::Span<const MaybeOwningDeviceAddress> buffers)
     : buffers_(buffers.size()),
       buffers_data_(buffers_.data()),
       num_buffers_(buffers_.size()) {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
index 3d1f9c93001508..662e3f7e03ddf5 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -507,7 +507,7 @@ cc_library(
         "//xla/backends/gpu/runtime:buffer_comparator",
         "//xla/hlo/ir:hlo",
         "//xla/service:executable",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/service/gpu/autotuning:redzone_buffers",
diff --git a/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc b/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc
index 81b5135600507c..82c8405af97e3d 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "xla/service/executable.h"
 #include "xla/service/gpu/autotuning/redzone_buffers.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
@@ -62,7 +62,7 @@ std::vector<ExecutionInput> CreateExecutionInputsFromBuffers(
     // Our executable doesn't have input-output aliasing, so we can pass
     // unowned input buffers.
     inputs.back().SetUnownedBuffer(
-        /*index=*/{}, MaybeOwningDeviceMemory(/*unowned=*/buffers.at(i)));
+        /*index=*/{}, MaybeOwningDeviceAddress(/*unowned=*/buffers.at(i)));
   }
   return inputs;
 }
diff --git a/third_party/xla/xla/backends/interpreter/BUILD b/third_party/xla/xla/backends/interpreter/BUILD
index 6bf5957323a49c..b8af7e32ada24e 100644
--- a/third_party/xla/xla/backends/interpreter/BUILD
+++ b/third_party/xla/xla/backends/interpreter/BUILD
@@ -84,7 +84,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:dynamic_dimension_inference",
         "//xla/service:executable",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/stream_executor:device_address",
diff --git a/third_party/xla/xla/backends/interpreter/executable_base.cc b/third_party/xla/xla/backends/interpreter/executable_base.cc
index 7ba92f41d87701..eb7fa5d4c07832 100644
--- a/third_party/xla/xla/backends/interpreter/executable_base.cc
+++ b/third_party/xla/xla/backends/interpreter/executable_base.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/layout_util.h"
 #include "xla/literal.h"
 #include "xla/service/executable.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
@@ -73,7 +73,7 @@ absl::StatusOr<ExecutionOutput> InterpreterExecutableBase::ExecuteAsyncOnStream(
     device_ordinal = 0;
   }
   for (auto& argument : arguments) {
-    const ShapeTree<MaybeOwningDeviceMemory>& buffers = argument.Buffers();
+    const ShapeTree<MaybeOwningDeviceAddress>& buffers = argument.Buffers();
     argument_buffers.push_back(ShapedBuffer(buffers.shape(),
                                             /*device_ordinal=*/device_ordinal));
     auto in_it = buffers.begin();
@@ -179,7 +179,7 @@ InterpreterExecutableBase::AllocateOutputMemoryWithInputReuse(
           -> absl::Status {
         if (alias && alias->must_alias()) {
           VLOG(1) << alias->ToString();
-          const MaybeOwningDeviceMemory& original_input =
+          const MaybeOwningDeviceAddress& original_input =
               (*arguments)[alias->parameter_number].Buffers().element(
                   alias->parameter_index);
           if (!original_input.HasOwnership()) {
@@ -214,7 +214,7 @@ InterpreterExecutableBase::AllocateOutputMemoryWithInputReuse(
     if (alias) {
       TF_RET_CHECK(alias->parameter_number < arguments->size());
       ExecutionInput& input = (*arguments)[alias->parameter_number];
-      MaybeOwningDeviceMemory* device_memory =
+      MaybeOwningDeviceAddress* device_memory =
           input.MutableBuffer(alias->parameter_index);
       if (auto owning = device_memory->Release()) {
         se::DeviceAddressBase device_memory_base = owning->Release();
diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD
index 8fa2963bc27550..c2801fa3fa8410 100644
--- a/third_party/xla/xla/client/BUILD
+++ b/third_party/xla/xla/client/BUILD
@@ -124,7 +124,7 @@ cc_library(
         "//xla/service:dump",
         "//xla/service:executable",
         "//xla/service:local_service",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service:source_map_util",
         "//xla/service:stream_pool",
diff --git a/third_party/xla/xla/client/local_client.cc b/third_party/xla/xla/client/local_client.cc
index 3c865b508f5700..cc383a9aa81b34 100644
--- a/third_party/xla/xla/client/local_client.cc
+++ b/third_party/xla/xla/client/local_client.cc
@@ -37,7 +37,7 @@ limitations under the License.
 #include "xla/service/computation_layout.h"
 #include "xla/service/dump.h"
 #include "xla/service/executable.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/source_map_util.h"
@@ -319,7 +319,7 @@ absl::StatusOr<ScopedShapedBuffer> LocalExecutable::RunAsync(
 }
 
 static ShapedBuffer MaybeOwningShapeTreeToShapedBuffer(
-    const ShapeTree<MaybeOwningDeviceMemory>& tree, int device_ordinal) {
+    const ShapeTree<MaybeOwningDeviceAddress>& tree, int device_ordinal) {
   ShapedBuffer result(tree.shape(), device_ordinal);
   auto it = tree.begin();
   auto out_it = result.buffers().begin();
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 3f06595a88500d..3ccda5d43f6794 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/service/compiler.h"
 #include "xla/service/executable.h"
 #include "xla/service/local_service.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/stream_pool.h"
diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 11fa3d24c990c8..6b382142dbb42f 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -710,7 +710,7 @@ cc_library(
         "//xla/service:generic_transfer_manager",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_proto_cc",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/service/gpu:gpu_executable_run_options",
diff --git a/third_party/xla/xla/pjrt/cpu/BUILD b/third_party/xla/xla/pjrt/cpu/BUILD
index 59bcdbffdc173f..6e7d2fad54a6dc 100644
--- a/third_party/xla/xla/pjrt/cpu/BUILD
+++ b/third_party/xla/xla/pjrt/cpu/BUILD
@@ -202,7 +202,7 @@ cc_library(
         "//xla/service:hlo_module_util",
         "//xla/service:hlo_proto_cc",
         "//xla/service:hlo_value",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service/cpu:cpu_compiler",
         "//xla/service/cpu:cpu_executable",
         "//xla/service/cpu:cpu_executable_run_options",
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 9a2a4dcb42d319..5e2f7aa65df9ef 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -114,7 +114,7 @@ limitations under the License.
 #include "xla/service/hlo_module_util.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/llvm_ir/llvm_command_line_options.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/device_memory.h"
@@ -1620,7 +1620,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
 
     if (cpu_executable->has_thunks()) {
       // Call interpreted thunk sequence implementing XLA executable.
-      absl::InlinedVector<MaybeOwningDeviceMemory, 8> buffer_device_mem;
+      absl::InlinedVector<MaybeOwningDeviceAddress, 8> buffer_device_mem;
       buffer_device_mem.reserve(buffer_table.size());
       for (const auto& buffer_info : buffer_table) {
         buffer_device_mem.emplace_back(
@@ -1764,7 +1764,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
           absl::Status status;
           if (cpu_executable->has_thunks()) {
             // Call interpreted thunk sequence implementing XLA executable.
-            absl::InlinedVector<MaybeOwningDeviceMemory, 8> buffer_device_mem;
+            absl::InlinedVector<MaybeOwningDeviceAddress, 8> buffer_device_mem;
             buffer_device_mem.reserve(buffer_table.size());
             for (const auto& buffer_info : buffer_table) {
               buffer_device_mem.emplace_back(
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
index 92f3be08e16c1a..b6d28c5e744e7f 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
@@ -100,7 +100,7 @@ cc_library(
         "//xla/service:generic_transfer_manager",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_proto_cc",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/service/gpu:gpu_executable_run_options",
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
index 7dacb707060297..1c97ab898cbd21 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
@@ -69,7 +69,7 @@ limitations under the License.
 #include "xla/service/executable.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
 #include "xla/service/hlo.pb.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
@@ -902,19 +902,19 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
         std::vector<ExecutionInput> inputs;
         if (parameter_is_tupled_arguments) {
           inputs.emplace_back(
-              ShapeTree<MaybeOwningDeviceMemory>(&parameter_shapes->front()));
+              ShapeTree<MaybeOwningDeviceAddress>(&parameter_shapes->front()));
           ExecutionInput& input = inputs.back();
           for (int i = 0; i < tracked_buffers.size(); ++i) {
             VLOG(4) << "tupled input[" << i
                     << "]: " << tracked_buffers[i]->buffer()->buffer().opaque();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {i}, MaybeOwningDeviceMemory(se::OwningDeviceMemory(
+                  {i}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
                            tracked_buffers[i]->buffer()->buffer(),
                            device->local_hardware_id().value(),
                            client->allocator())));
             } else {
-              input.SetBuffer({i}, MaybeOwningDeviceMemory(
+              input.SetBuffer({i}, MaybeOwningDeviceAddress(
                                        tracked_buffers[i]->buffer()->buffer()));
             }
           }
@@ -924,16 +924,16 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
             VLOG(4) << "untupled input[" << i
                     << "]: " << tracked_buffers[i]->buffer()->buffer().opaque();
             inputs.emplace_back(
-                ShapeTree<MaybeOwningDeviceMemory>(&(*parameter_shapes)[i]));
+                ShapeTree<MaybeOwningDeviceAddress>(&(*parameter_shapes)[i]));
             ExecutionInput& input = inputs.back();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {}, MaybeOwningDeviceMemory(se::OwningDeviceMemory(
+                  {}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
                           tracked_buffers[i]->buffer()->buffer(),
                           device->local_hardware_id().value(),
                           client->allocator())));
             } else {
-              input.SetBuffer({}, MaybeOwningDeviceMemory(
+              input.SetBuffer({}, MaybeOwningDeviceAddress(
                                       tracked_buffers[i]->buffer()->buffer()));
             }
           }
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h
index 3a1b1bc186f1e9..19c949075f320d 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h
@@ -35,7 +35,7 @@ limitations under the License.
 #include "xla/tsl/concurrency/async_value_ref.h"
 
 namespace xla {
-// TODO(b/400541410): Refactor and Merge this with MaybeOwningDeviceMemory.
+// TODO(b/400541410): Refactor and Merge this with MaybeOwningDeviceAddress.
 
 // GpuDeviceMemory represents either an owned or unowned GPU memory. It
 // owns GPU memory if an allocator is provided. When the object goes output of
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index 69635421f80399..4c175b7390e14c 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -141,7 +141,7 @@ limitations under the License.
 #include "xla/service/executable.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_cost_analysis.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
@@ -1673,12 +1673,12 @@ PjRtStreamExecutorClient::RunAsync(
     auto it = tmp.MutableBuffers()->begin();
     for (auto& v : input) {
       if (v.second.is_donated) {
-        it->second = MaybeOwningDeviceMemory(se::OwningDeviceMemory(
+        it->second = MaybeOwningDeviceAddress(se::OwningDeviceMemory(
             v.second.buf->mem(), device->local_device_id().value(),
             run_options.allocator()));
         tmp.SetUnownedIndex(it->first);
       } else {
-        it->second = MaybeOwningDeviceMemory(v.second.buf->mem());
+        it->second = MaybeOwningDeviceAddress(v.second.buf->mem());
       }
       ++it;
     }
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer.h b/third_party/xla/xla/pjrt/tracked_device_buffer.h
index ecc4a64dc73c45..62b36de4923881 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer.h
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer.h
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/service/executable.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
@@ -109,8 +109,8 @@ class TrackedDeviceBuffer : public AbstractTrackedDeviceBuffer {
   // on_device_shape matches that of the TrackedDeviceBuffer. 'end' is used to
   // check that 'iterator' doesn't run out of bounds.
   void AddToInputAsImmutable(
-      ShapeTree<MaybeOwningDeviceMemory>::iterator* iterator,
-      const ShapeTree<MaybeOwningDeviceMemory>::iterator& end) const;
+      ShapeTree<MaybeOwningDeviceAddress>::iterator* iterator,
+      const ShapeTree<MaybeOwningDeviceAddress>::iterator& end) const;
 
   // Adds the owned device buffers in order to 'iterator', marking them as
   // available to be donated. If donation succeeds, i.e., execution_input is
@@ -121,8 +121,8 @@ class TrackedDeviceBuffer : public AbstractTrackedDeviceBuffer {
   // that of the TrackedDeviceBuffer. 'end' is used to check that 'iterator'
   // doesn't run out of bounds.
   void AddToInputAsDonated(
-      ShapeTree<MaybeOwningDeviceMemory>::iterator* iterator,
-      const ShapeTree<MaybeOwningDeviceMemory>::iterator& end,
+      ShapeTree<MaybeOwningDeviceAddress>::iterator* iterator,
+      const ShapeTree<MaybeOwningDeviceAddress>::iterator& end,
       ExecutionInput* execution_input,
       se::DeviceMemoryAllocator* allocator) const;
 
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 8973a3631eccdf..b87bc885903e10 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1528,7 +1528,7 @@ cc_library(
         ":computation_layout",
         ":hlo_module_config",
         ":hlo_proto_cc",
-        ":maybe_owning_device_memory",
+        ":maybe_owning_device_address",
         ":shaped_buffer",
         ":stream_pool",
         "//xla:executable_run_options",
@@ -1666,7 +1666,7 @@ cc_library(
     hdrs = ["transfer_manager.h"],
     deps = [
         ":compiler",
-        ":maybe_owning_device_memory",
+        ":maybe_owning_device_address",
         ":shaped_buffer",
         "//xla:literal",
         "//xla:shape_tree",
@@ -4460,7 +4460,7 @@ cc_library(
         ":executable",
         ":hlo_module_util",
         ":hlo_runner_interface",
-        ":maybe_owning_device_memory",
+        ":maybe_owning_device_address",
         ":shaped_buffer",
         ":transfer_manager",
         "//xla:executable_run_options",
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index 986fe761476f80..f9d93965489130 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -550,7 +550,7 @@ cc_library(
         "//xla/service:hlo_execution_profile",
         "//xla/service:hlo_profile_printer_data_cc",
         "//xla/service:hlo_value",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service:xla_debug_info_manager",
         "//xla/stream_executor:device_address",
diff --git a/third_party/xla/xla/service/cpu/cpu_executable.cc b/third_party/xla/xla/service/cpu/cpu_executable.cc
index 6bb3a695e9523e..c0c1e6446220fa 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.cc
+++ b/third_party/xla/xla/service/cpu/cpu_executable.cc
@@ -58,7 +58,7 @@ limitations under the License.
 #include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_profile_printer_data.pb.h"
 #include "xla/service/hlo_value.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/xla_debug_info_manager.h"
@@ -156,7 +156,7 @@ CpuExecutable::~CpuExecutable() {
   }
 }
 
-static absl::StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
+static absl::StatusOr<MaybeOwningDeviceAddress> MemoryForAllocation(
     const BufferAllocation& allocation,
     absl::Span<const ExecutionInput> arguments,
     absl::Span<const ConstantAllocation> constants,
@@ -170,17 +170,17 @@ static absl::StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
         << "Size mismatch on param " << allocation.parameter_number()
         << " at shape index " << allocation.param_shape_index().ToString();
     VLOG(3) << "allocation is a parameter";
-    return MaybeOwningDeviceMemory{out};
+    return MaybeOwningDeviceAddress{out};
   } else if (allocation.is_constant()) {
     VLOG(3) << "allocation is a constant";
     if (allocation.index() < constants.size()) {
-      return MaybeOwningDeviceMemory(
+      return MaybeOwningDeviceAddress(
           constants[allocation.index()].AsDeviceMemoryBase());
     }
-    return MaybeOwningDeviceMemory{se::DeviceAddressBase{}};
+    return MaybeOwningDeviceAddress{se::DeviceAddressBase{}};
   } else if (allocation.is_thread_local()) {
     VLOG(3) << "buffer is thread-local";
-    return MaybeOwningDeviceMemory{se::DeviceAddressBase{}};
+    return MaybeOwningDeviceAddress{se::DeviceAddressBase{}};
   }
 
   int64_t buffer_size = allocation.size();
@@ -194,14 +194,14 @@ static absl::StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
   // initialized. Mark them initialized so that memory sanitizer doesn't flag
   // loads from these buffers.
   ABSL_ANNOTATE_MEMORY_IS_INITIALIZED(out->opaque(), buffer_size);
-  return MaybeOwningDeviceMemory{std::move(out)};
+  return MaybeOwningDeviceAddress{std::move(out)};
 }
 
-absl::StatusOr<std::vector<MaybeOwningDeviceMemory>>
+absl::StatusOr<std::vector<MaybeOwningDeviceAddress>>
 CpuExecutable::CreateBufferTable(se::DeviceAddressAllocator* memory_allocator,
                                  int device_ordinal,
                                  absl::Span<ExecutionInput const> arguments) {
-  std::vector<MaybeOwningDeviceMemory> buffers(
+  std::vector<MaybeOwningDeviceAddress> buffers(
       assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -233,7 +233,7 @@ static int32_t GetDeviceOrdinal(const ExecutableRunOptions* run_options) {
 
 absl::Status CpuExecutable::ExecuteThunks(
     const ExecutableRunOptions* run_options,
-    absl::Span<MaybeOwningDeviceMemory const> buffers) {
+    absl::Span<MaybeOwningDeviceAddress const> buffers) {
   uint64_t start_ns = tsl::Env::Default()->NowNanos();
 
   size_t profile_counters_size = 0;
@@ -244,7 +244,7 @@ absl::Status CpuExecutable::ExecuteThunks(
   VLOG(3) << "Executing XLA:CPU thunks:";
   VLOG(3) << absl::StrFormat("  Number of buffer allocations: %u",
                              buffers.size());
-  auto mem_printer = [](std::string* out, const MaybeOwningDeviceMemory& mem) {
+  auto mem_printer = [](std::string* out, const MaybeOwningDeviceAddress& mem) {
     absl::StrAppend(out,
                     absl::StrFormat("%p", mem.AsDeviceMemoryBase().opaque()));
   };
@@ -308,7 +308,7 @@ absl::Status CpuExecutable::ExecuteThunks(
 
 absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    absl::Span<MaybeOwningDeviceMemory> buffers,
+    absl::Span<MaybeOwningDeviceAddress> buffers,
     absl::Span<ExecutionInput> arguments) {
   se::Stream* stream = run_options->stream();
   ExecutionOutput result(/*on_device_shape=*/result_shape(),
@@ -345,7 +345,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
     if (alias) {
       CHECK_LT(alias->parameter_number, arguments.size());
       ExecutionInput& input = arguments[alias->parameter_number];
-      MaybeOwningDeviceMemory* maybe_owning_memory =
+      MaybeOwningDeviceAddress* maybe_owning_memory =
           input.MutableBuffer(alias->parameter_index);
       if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) {
         return InvalidArgument(
@@ -381,7 +381,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
             run_options->allocator()->Allocate(
                 stream->parent()->device_ordinal(), allocation_size));
         result_buffer = allocated_buffer.Release();
-        MaybeOwningDeviceMemory& registered_buffer = buffers[buffer_index];
+        MaybeOwningDeviceAddress& registered_buffer = buffers[buffer_index];
         CHECK_EQ(result_buffer.size(),
                  registered_buffer.AsDeviceMemoryBase().size());
         std::memcpy(/*dest=*/result_buffer.opaque(),
@@ -392,7 +392,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
     }
 
     if (result_buffer.is_null()) {
-      MaybeOwningDeviceMemory& buffer = buffers[buffer_index];
+      MaybeOwningDeviceAddress& buffer = buffers[buffer_index];
       if (std::optional<se::ScopedDeviceAddress<uint8_t>> owned_buffer =
               buffer.Release()) {
         result_buffer = owned_buffer->Release();
@@ -437,7 +437,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
   se::Stream* stream = run_options->stream();
   se::DeviceAddressAllocator* memory_allocator = run_options->allocator();
   TF_ASSIGN_OR_RETURN(
-      std::vector<MaybeOwningDeviceMemory> buffers,
+      std::vector<MaybeOwningDeviceAddress> buffers,
       CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(),
                         arguments));
 
diff --git a/third_party/xla/xla/service/cpu/cpu_executable.h b/third_party/xla/xla/service/cpu/cpu_executable.h
index ebb97baf217e47..3db37885900445 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.h
+++ b/third_party/xla/xla/service/cpu/cpu_executable.h
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_profile_printer_data.pb.h"
 #include "xla/service/hlo_value.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/stream_executor/device_address_allocator.h"
 
@@ -72,8 +72,9 @@ class CpuExecutable : public Executable {
 
   // Calls emitted thunk sequence with the given arguments using the supplied
   // buffers.
-  absl::Status ExecuteThunks(const ExecutableRunOptions* run_options,
-                             absl::Span<MaybeOwningDeviceMemory const> buffers);
+  absl::Status ExecuteThunks(
+      const ExecutableRunOptions* run_options,
+      absl::Span<MaybeOwningDeviceAddress const> buffers);
 
   absl::Span<const ObjFileProto> obj_files() const { return obj_files_; }
 
@@ -172,7 +173,7 @@ class CpuExecutable : public Executable {
   //
   //  - buffers_to_free: buffers whose ownership was donated by the caller that
   //    are to be freed by the caller.
-  absl::StatusOr<std::vector<MaybeOwningDeviceMemory>> CreateBufferTable(
+  absl::StatusOr<std::vector<MaybeOwningDeviceAddress>> CreateBufferTable(
       se::DeviceAddressAllocator* memory_allocator, int device_ordinal,
       absl::Span<ExecutionInput const> arguments);
 
@@ -182,7 +183,7 @@ class CpuExecutable : public Executable {
   // assignment.
   absl::StatusOr<ExecutionOutput> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      absl::Span<MaybeOwningDeviceMemory> buffers,
+      absl::Span<MaybeOwningDeviceAddress> buffers,
       absl::Span<ExecutionInput> arguments);
 
   // Returns the instruction value set of the root instruction of the entry
diff --git a/third_party/xla/xla/service/executable.cc b/third_party/xla/xla/service/executable.cc
index b52166c243dea1..a9f8da25d12d1c 100644
--- a/third_party/xla/xla/service/executable.cc
+++ b/third_party/xla/xla/service/executable.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
@@ -70,7 +70,7 @@ absl::Status ExecutionInput::SetDynamicShape(Shape dynamic_shape) {
 }
 
 void ExecutionInput::SetUnownedBuffer(const ShapeIndex& index,
-                                      MaybeOwningDeviceMemory buffer) {
+                                      MaybeOwningDeviceAddress buffer) {
   *buffers_.mutable_element(index) = std::move(buffer);
   unowned_indices_.insert(index);
 }
@@ -86,12 +86,12 @@ absl::StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStream(
   return result;
 }
 
-static ExecutionInput MakeMaybeOwningDeviceMemoryTree(
+static ExecutionInput MakeMaybeOwningDeviceAddressTree(
     const ShapedBuffer& shaped_buffer) {
   ExecutionInput result(shaped_buffer.on_device_shape());
   shaped_buffer.buffers().ForEachElement(
       [&](const ShapeIndex& index, const se::DeviceAddressBase& mem) {
-        result.SetBuffer(index, MaybeOwningDeviceMemory(mem));
+        result.SetBuffer(index, MaybeOwningDeviceAddress(mem));
       });
   return result;
 }
@@ -102,7 +102,7 @@ absl::StatusOr<ScopedShapedBuffer> Executable::ExecuteAsyncOnStream(
   std::vector<ExecutionInput> args;
   args.reserve(arguments.size());
   for (const ShapedBuffer* arg : arguments) {
-    args.emplace_back(MakeMaybeOwningDeviceMemoryTree(*arg));
+    args.emplace_back(MakeMaybeOwningDeviceAddressTree(*arg));
   }
   TF_ASSIGN_OR_RETURN(ExecutionOutput out,
                       ExecuteAsyncOnStream(run_options, std::move(args)));
diff --git a/third_party/xla/xla/service/executable.h b/third_party/xla/xla/service/executable.h
index e59ac39a932d44..e76038f8a95f9a 100644
--- a/third_party/xla/xla/service/executable.h
+++ b/third_party/xla/xla/service/executable.h
@@ -37,7 +37,7 @@ limitations under the License.
 #include "xla/service/computation_layout.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_module_config.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
@@ -61,11 +61,11 @@ namespace xla {
 // 3) Donated by the caller and freed on error.
 //
 // Case (1) buffers are stored as
-// MaybeOwningDeviceMemory(DeviceAddressBase). Case (2) buffers are
-// stored as MaybeOwningDeviceMemory(ScopedDeviceAddress<uint8_t>),
+// MaybeOwningDeviceAddress(DeviceAddressBase). Case (2) buffers are
+// stored as MaybeOwningDeviceAddress(ScopedDeviceAddress<uint8_t>),
 //   with their indices present in unowned_indices_.
 // Case (3) buffers are stored as
-// MaybeOwningDeviceMemory(ScopedDeviceAddress<uint8_t>),
+// MaybeOwningDeviceAddress(ScopedDeviceAddress<uint8_t>),
 //   with their indices absent from unowned_indices_.
 class ExecutionInput {
  public:
@@ -88,14 +88,14 @@ class ExecutionInput {
     }
   }
 
-  explicit ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers)
+  explicit ExecutionInput(ShapeTree<MaybeOwningDeviceAddress> buffers)
       : buffers_(std::move(buffers)) {
     if (!ShapeUtil::DeviceShapeIsHostShape(buffers_.shape())) {
       SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape()));
     }
   }
   // TODO(b/170310047): remove this overload.
-  ExecutionInput(ShapeTree<MaybeOwningDeviceMemory> buffers,
+  ExecutionInput(ShapeTree<MaybeOwningDeviceAddress> buffers,
                  xla::Shape host_shape)
       : buffers_(std::move(buffers)) {
     if (!ShapeUtil::DeviceShapeIsHostShape(buffers_.shape())) {
@@ -119,12 +119,12 @@ class ExecutionInput {
 
   absl::Status SetDynamicShape(Shape dynamic_shape);
 
-  void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceMemory buffer) {
+  void SetBuffer(const ShapeIndex& index, MaybeOwningDeviceAddress buffer) {
     *buffers_.mutable_element(index) = std::move(buffer);
   }
 
   void SetUnownedBuffer(const ShapeIndex& index,
-                        MaybeOwningDeviceMemory buffer);
+                        MaybeOwningDeviceAddress buffer);
 
   void SetUnownedIndex(const ShapeIndex& index) {
     unowned_indices_.insert(index);
@@ -138,15 +138,17 @@ class ExecutionInput {
     return unowned_indices_;
   }
 
-  const ShapeTree<MaybeOwningDeviceMemory>& Buffers() const { return buffers_; }
+  const ShapeTree<MaybeOwningDeviceAddress>& Buffers() const {
+    return buffers_;
+  }
 
-  ShapeTree<MaybeOwningDeviceMemory>* MutableBuffers() { return &buffers_; }
+  ShapeTree<MaybeOwningDeviceAddress>* MutableBuffers() { return &buffers_; }
 
-  MaybeOwningDeviceMemory* MutableBuffer(const ShapeIndex& index) {
+  MaybeOwningDeviceAddress* MutableBuffer(const ShapeIndex& index) {
     return buffers_.mutable_element(index);
   }
 
-  const MaybeOwningDeviceMemory& Buffer(const ShapeIndex& index) const {
+  const MaybeOwningDeviceAddress& Buffer(const ShapeIndex& index) const {
     return buffers_.element(index);
   }
 
@@ -157,7 +159,7 @@ class ExecutionInput {
     }
   }
 
-  ShapeTree<MaybeOwningDeviceMemory> buffers_;
+  ShapeTree<MaybeOwningDeviceAddress> buffers_;
 
   // Set of indices of buffers that should be returned to the caller if an error
   // occurs when enqueuing the computation.
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 98e1675e56ab9a..b9a929cdf282fe 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -712,7 +712,7 @@ cc_library(
         "//xla/service:dump",
         "//xla/service:executable",
         "//xla/service:hlo_value",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:rendezvous",
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index f619be71cd9d86..56d8c4cb2f64ee 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -496,7 +496,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:compiler",
         "//xla/service:executable",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/service/gpu:ir_emission_utils",
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util.cc
index 14ab52352ad047..290187c503292d 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_compile_util.cc
@@ -36,7 +36,7 @@ limitations under the License.
 #include "xla/service/gpu/autotuning/autotuner_util.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
 #include "xla/service/gpu/ir_emission_utils.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
@@ -63,7 +63,7 @@ std::vector<ExecutionInput> ExecutionInputsFromBuffers(
     // Our executable doesn't have input-output aliasing, so we can pass
     // unowned input buffers.
     inputs.back().SetUnownedBuffer(
-        /*index=*/{}, MaybeOwningDeviceMemory(/*unowned=*/buffers.at(i)));
+        /*index=*/{}, MaybeOwningDeviceAddress(/*unowned=*/buffers.at(i)));
   }
   return inputs;
 }
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index df3767982ce5f9..905cac3061925c 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -72,7 +72,7 @@ limitations under the License.
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/service/hlo_value.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/rendezvous.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
@@ -937,8 +937,8 @@ absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
             << " @ index: " << index.ToString();
 
     if (output_info.alias_config) {
-      MaybeOwningDeviceMemory* maybe_owning_memory =
-          [&]() -> xla::MaybeOwningDeviceMemory* {
+      MaybeOwningDeviceAddress* maybe_owning_memory =
+          [&]() -> xla::MaybeOwningDeviceAddress* {
         // ScopedBuffer is never an owned buffer.
         if (std::holds_alternative<absl::Span<const ShapedBuffer* const>>(
                 arguments)) {
diff --git a/third_party/xla/xla/service/hlo_runner.cc b/third_party/xla/xla/service/hlo_runner.cc
index d0e58a65b97d98..077c92bf517de3 100644
--- a/third_party/xla/xla/service/hlo_runner.cc
+++ b/third_party/xla/xla/service/hlo_runner.cc
@@ -41,7 +41,7 @@ limitations under the License.
 #include "xla/service/gpu/gpu_executable_run_options.h"
 #include "xla/service/hlo_module_util.h"
 #include "xla/service/hlo_runner_interface.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
@@ -294,7 +294,7 @@ static std::vector<ExecutionInput> ExecutionInputsFromScopedShapedBuffers(
 
   for (int param_num = 0; param_num < inputs.size(); param_num++) {
     const ScopedShapedBuffer& input_buffer = inputs[param_num];
-    ShapeTree<MaybeOwningDeviceMemory> buffer_tree(
+    ShapeTree<MaybeOwningDeviceAddress> buffer_tree(
         input_buffer.on_device_shape());
 
     input_buffer.buffers().ForEachElement(
@@ -329,7 +329,7 @@ static void ExecutionInputsFromMovedScopedShapedBuffers(
   for (int param_num = 0; param_num < inputs.size(); param_num++) {
     ShapedBuffer input_buffer = inputs[param_num].release();
 
-    ShapeTree<MaybeOwningDeviceMemory> buffer_tree(
+    ShapeTree<MaybeOwningDeviceAddress> buffer_tree(
         input_buffer.on_device_shape());
 
     input_buffer.buffers().ForEachElement(
diff --git a/third_party/xla/xla/service/transfer_manager.cc b/third_party/xla/xla/service/transfer_manager.cc
index da4264b9cb302e..4fbdcdc58ce116 100644
--- a/third_party/xla/xla/service/transfer_manager.cc
+++ b/third_party/xla/xla/service/transfer_manager.cc
@@ -32,7 +32,7 @@ limitations under the License.
 #include "absl/synchronization/notification.h"
 #include "xla/literal.h"
 #include "xla/service/compiler.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
@@ -288,7 +288,8 @@ absl::Status TransferManager::WriteRootTupleIndexTable(
 }
 
 absl::Status TransferManager::WriteRootTupleIndexTable(
-    se::Stream* stream, const ShapeTree<MaybeOwningDeviceMemory>& buffer_tree) {
+    se::Stream* stream,
+    const ShapeTree<MaybeOwningDeviceAddress>& buffer_tree) {
   TF_RET_CHECK(buffer_tree.shape().IsTuple());
   if (ShapeUtil::TupleElementCount(buffer_tree.shape()) == 0) {
     return absl::OkStatus();
diff --git a/third_party/xla/xla/service/transfer_manager.h b/third_party/xla/xla/service/transfer_manager.h
index 978bcfc523bfc1..811138ba23f905 100644
--- a/third_party/xla/xla/service/transfer_manager.h
+++ b/third_party/xla/xla/service/transfer_manager.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/literal.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
@@ -215,7 +215,7 @@ class TransferManager {
                                         const ShapedBuffer& device_buffer);
   absl::Status WriteRootTupleIndexTable(
       se::Stream* stream,
-      const ShapeTree<MaybeOwningDeviceMemory>& buffer_tree);
+      const ShapeTree<MaybeOwningDeviceAddress>& buffer_tree);
 
   // Determines the byte size requirement for the given shape on the underlying
   // architecture. This will be used to allocate an appropriately sized memory
diff --git a/third_party/xla/xla/stream_executor/tpu/BUILD b/third_party/xla/xla/stream_executor/tpu/BUILD
index c0e4af83f39ddc..1bc9dc62a099ed 100644
--- a/third_party/xla/xla/stream_executor/tpu/BUILD
+++ b/third_party/xla/xla/stream_executor/tpu/BUILD
@@ -72,7 +72,7 @@ cc_library(
         "//xla/service:computation_placer_hdr",
         "//xla/service:hlo_module_config",
         "//xla/service:hlo_proto_cc",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
@@ -608,7 +608,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:compiler",
         "//xla/service:executable",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/stream_executor:device_address",
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
index 4f06ee508fd8fb..58eb6c2c3033f9 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_module_config.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_layout.h"
@@ -158,14 +158,14 @@ xla::ShapedBuffer FromC(XLA_ShapedBuffer* c_buffer) {
   return xla_shaped_buffer;
 }
 
-SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceMemory& mem,
+SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceAddress& mem,
                                bool aliased) {
   SE_MaybeOwningDeviceMemory se_mem;
   se_mem.owned = mem.HasOwnership();
-  se_mem.memory = ApiConverter::ToC(mem.AsDeviceMemoryBase());
+  se_mem.memory = ApiConverter::ToC(mem.AsDeviceAddress());
   if (mem.HasOwnership()) {
-    const stream_executor::OwningDeviceAddress* owned =
-        mem.AsOwningDeviceMemory();
+    const stream_executor::ScopedDeviceAddress<uint8_t>* owned =
+        mem.AsScopedDeviceAddress();
     se_mem.device_ordinal = owned->device_ordinal();
     se_mem.allocator = ApiConverter::ToC(owned->allocator());
     if (!aliased) {
@@ -180,15 +180,15 @@ SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceMemory& mem,
   return se_mem;
 }
 
-xla::MaybeOwningDeviceMemory FromC(
+xla::MaybeOwningDeviceAddress FromC(
     SE_MaybeOwningDeviceMemory* se_mem,
     stream_executor::DeviceAddressAllocator* allocator) {
   if (se_mem->owned) {
-    return xla::MaybeOwningDeviceMemory(stream_executor::OwningDeviceAddress(
+    return xla::MaybeOwningDeviceAddress(stream_executor::OwningDeviceAddress(
         ApiConverter::FromC(se_mem->memory), se_mem->device_ordinal,
         allocator));
   } else {
-    return xla::MaybeOwningDeviceMemory(ApiConverter::FromC(se_mem->memory));
+    return xla::MaybeOwningDeviceAddress(ApiConverter::FromC(se_mem->memory));
   }
 }
 
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
index a3b7c716996b34..da3db36c17a1d2 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/literal.h"
 #include "xla/service/hlo_module_config.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
@@ -117,7 +117,7 @@ struct TpuEmbeddingEngineParametersData {
 
 std::unique_ptr<TpuEmbeddingEngineParametersData> Create(int num_tables);
 
-xla::MaybeOwningDeviceMemory FromC(
+xla::MaybeOwningDeviceAddress FromC(
     SE_MaybeOwningDeviceMemory* se_mem,
     stream_executor::DeviceAddressAllocator* allocator);
 
@@ -132,7 +132,8 @@ SE_MaybeOwningDeviceMemory ToC(stream_executor::OwningDeviceAddress* mem);
 // mem.HasOwnership() may be true if the buffer is aliased and shouldn't be
 // released. 'aliased' should be true in this case. 'aliased' has no effect if
 // 'mem' is unowned.
-SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceMemory& mem, bool aliased);
+SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceAddress& mem,
+                               bool aliased);
 
 // HloModule
 XLA_HloModule ToC(const xla::HloModule& module);
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_decl.h b/third_party/xla/xla/stream_executor/tpu/c_api_decl.h
index 096f265acaec79..834a3da9f4ed0d 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_decl.h
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_decl.h
@@ -277,10 +277,10 @@ typedef struct XLA_Literal {
   XLA_Shape shape;
 } XLA_Literal;
 
-typedef struct XLA_MaybeOwningDeviceMemoryShapeTree {
+typedef struct XLA_MaybeOwningDeviceAddressShapeTree {
   XLA_Shape shape;
   SE_MaybeOwningDeviceMemory* buffers;
-} XLA_MaybeOwningDeviceMemoryShapeTree;
+} XLA_MaybeOwningDeviceAddressShapeTree;
 
 typedef struct XLA_ShapeIndex {
   int64_t indices[8];
@@ -288,7 +288,7 @@ typedef struct XLA_ShapeIndex {
 } XLA_ShapeIndex;
 
 typedef struct SE_ExecutionInput {
-  XLA_MaybeOwningDeviceMemoryShapeTree shape_tree;
+  XLA_MaybeOwningDeviceAddressShapeTree shape_tree;
   XLA_ShapeIndex* unowned_indices;
   int unowned_indices_size;
   XLA_Shape dynamic_shape;
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc b/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc
index 0b4c4db98728d2..ab8616ddc8ecc4 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/layout_util.h"
 #include "xla/service/compiler.h"
 #include "xla/service/executable.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
@@ -106,7 +106,7 @@ TpuExecutableInterface::AllocateOutputMemoryWithInputReuse(
           -> absl::Status {
         if (alias && alias->must_alias()) {
           VLOG(1) << alias->ToString();
-          const MaybeOwningDeviceMemory& original_input =
+          const MaybeOwningDeviceAddress& original_input =
               (*arguments)[alias->parameter_number].Buffers().element(
                   alias->parameter_index);
           if (!original_input.HasOwnership()) {
@@ -152,7 +152,7 @@ TpuExecutableInterface::AllocateOutputMemoryWithInputReuse(
     if (alias) {
       TF_RET_CHECK(alias->parameter_number < arguments->size());
       ExecutionInput& input = (*arguments)[alias->parameter_number];
-      MaybeOwningDeviceMemory* device_memory =
+      MaybeOwningDeviceAddress* device_memory =
           input.MutableBuffer(alias->parameter_index);
       if (auto owning = device_memory->Release()) {
         // If the caller passes the ownership of the device memory, reuse it
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index aff7b7e1abfcdd..9f617478a6ea7b 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -520,7 +520,7 @@ xla_test(
         "//xla/service:backend",
         "//xla/service:executable",
         "//xla/service:hlo_module_config",
-        "//xla/service:maybe_owning_device_memory",
+        "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
diff --git a/third_party/xla/xla/tests/buffer_donation_test.cc b/third_party/xla/xla/tests/buffer_donation_test.cc
index 150e6c769ace79..324917cbd57df6 100644
--- a/third_party/xla/xla/tests/buffer_donation_test.cc
+++ b/third_party/xla/xla/tests/buffer_donation_test.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/backend.h"
 #include "xla/service/executable.h"
 #include "xla/service/hlo_module_config.h"
-#include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
@@ -132,10 +132,11 @@ class BufferDonationTest : public HloTestBase {
           stream.get(), argument_literal, shaped_buffer));
       ShapeTree<se::DeviceMemoryBase> input_buffers = shaped_buffer.buffers();
       inputs_buffers.push_back(input_buffers);
-      ShapeTree<MaybeOwningDeviceMemory> owned_buffers(
+      ShapeTree<MaybeOwningDeviceAddress> owned_buffers(
           argument_literal.shape());
       owned_buffers.ForEachMutableElement(
-          [&](const ShapeIndex& index, MaybeOwningDeviceMemory* device_memory) {
+          [&](const ShapeIndex& index,
+              MaybeOwningDeviceAddress* device_memory) {
             if (donate_argument) {
               *device_memory = se::OwningDeviceMemory(
                   input_buffers.element(index), executor_->device_ordinal(),

From 232d54ce017655b3a085fccb98fcf195b5fcf739 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 12:56:24 -0800
Subject: [PATCH 044/753] Reverts a243fdc6b63024c7d71a4cef1b841003a8f408c2

PiperOrigin-RevId: 841881595
---
 .../simplifiers/algebraic_simplifier.cc       | 79 -------------------
 .../simplifiers/algebraic_simplifier_test.cc  | 60 --------------
 2 files changed, 139 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc
index f2368abca9f8a0..413520c0f2ab48 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc
@@ -6899,85 +6899,6 @@ absl::Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
     }
   }
 
-  // Simplify:
-  //    Txx[...,1]   slice(Txx[..., K] reshape(Txx[...,N*K])) // N > 1
-  // To:
-  //    Txx[...,1] reshape(Txx[..., N]   slice(Txx[...,N*K], stride(-1)=K)
-  //
-  // Maintaining data-parallelism to improve throughput on some architectures.
-  HloInstruction* reshape;
-  if (Match(slice, m::Slice(m::Reshape(&reshape, m::Op())))) {
-    HloInstruction* input = reshape->mutable_operand(0);
-    const Shape& input_shape = input->shape();
-    const Shape& reshape_shape = reshape->shape();
-
-    const int64_t input_rank = input_shape.dimensions().size();
-    const int64_t reshape_rank = reshape_shape.dimensions().size();
-    const int64_t slice_rank = slice->shape().dimensions().size();
-
-    // Reshape must have at least 2 dimensions and same number of
-    // dimensions as slice.
-    if (reshape_rank >= 2 && reshape_rank == slice_rank) {
-      bool is_valid_reshape_slice = true;
-      for (int64_t i = 0; i < slice_rank; ++i) {
-        if (i == slice_rank - 1) {
-          // Continue if we are slicing exactly one element from the last
-          // dimension.
-          if (slice->slice_limits(i) - slice->slice_starts(i) == 1) {
-            continue;
-          }
-        } else {
-          // Continue if we are not slicing any other dimension.
-          if (slice->slice_starts(i) == 0 &&
-              slice->slice_limits(i) == reshape_shape.dimensions(i) &&
-              slice->slice_strides(i) == 1) {
-            continue;
-          }
-        }
-        // If the rules above are not met, prevent a match.
-        is_valid_reshape_slice = false;
-        break;
-      }
-
-      // Check if slice is selecting a single element from the last dimension.
-      if (is_valid_reshape_slice) {
-        int64_t slice_index = slice->slice_starts()[slice_rank - 1];
-        int64_t K = reshape_shape.dimensions(reshape_rank - 1);
-
-        // Check if input shape can be viewed as [..., N*K], where N is two or
-        // more, e.g. Input [1, 2024, 4, 128], Reshape [518144, 2].
-        // Last dim of input 128 is multiple of 2.
-        if (!input_shape.dimensions().empty()) {
-          int64_t last_dim = input_shape.dimensions(input_rank - 1);
-          if (last_dim % K == 0 && last_dim / K > 1) {
-            // It matches!
-            DimensionVector starts(input_rank, 0);
-            DimensionVector limits(input_shape.dimensions().begin(),
-                                   input_shape.dimensions().end());
-            DimensionVector strides(input_rank, 1);
-
-            starts[input_rank - 1] = slice_index;
-            limits[input_rank - 1] = last_dim;
-            strides[input_rank - 1] = K;
-
-            Shape new_slice_shape = input_shape;
-            new_slice_shape.set_dimensions(
-                input_rank - 1, input_shape.dimensions(input_rank - 1) / K);
-            simplifier_->UpdateLayout(&new_slice_shape);
-
-            HloInstruction* new_slice =
-                slice->parent()->AddInstruction(HloInstruction::CreateSlice(
-                    new_slice_shape, input, starts, limits, strides));
-            HloInstruction* new_reshape = slice->parent()->AddInstruction(
-                HloInstruction::CreateReshape(slice->shape(), new_slice));
-
-            return ReplaceInstruction(slice, new_reshape);
-          }
-        }
-      }
-    }
-  }
-
   if (slice->operand(0)->opcode() == HloOpcode::kSlice &&
       hlo_instruction_utils::IsUnstridedSlice(slice) &&
       hlo_instruction_utils::IsUnstridedSlice(slice->operand(0))) {
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc
index cecf7762332bbd..34bea124e05379 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc
@@ -5299,66 +5299,6 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
   EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4);
 }
 
-TEST_F(AlgebraicSimplifierTest, SliceWithReshape) {
-  const absl::string_view hlo_string = R"hlo(
-  HloModule SliceWithReshape
-
-  ENTRY main {
-    %arg = f32[1,2024,4,128]{3,2,1,0} parameter(0)
-    %reshape.1 = f32[2,259072,2]{2,1,0} reshape(%arg)
-    %slice = f32[2,259072,1]{2,1,0} slice(%reshape.1), slice={[0:2], [0:259072], [1:2]}
-    ROOT %reshape.2 = f32[518144]{0} reshape(%slice)
-  }
-)hlo";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(module.get()).value());
-
-  auto* root = module->entry_computation()->root_instruction();
-  VLOG(2) << module->ToString();
-
-  // Expected: Reshape(Slice(Arg))
-  // AlgebraicSimplifier merges the two reshapes.
-  const HloInstruction* slice;
-  EXPECT_THAT(root, GmockMatch(m::Reshape(
-                        m::Slice(&slice, m::Parameter(0)))));
-
-  EXPECT_EQ(slice->slice_strides(3), 2);
-  EXPECT_EQ(slice->slice_starts(3), 1);
-  EXPECT_EQ(slice->slice_limits(3), 128);
-  EXPECT_EQ(slice->shape().dimensions(3), 64);
-}
-
-TEST_F(AlgebraicSimplifierTest, SmallSliceWithReshape) {
-  const absl::string_view hlo_string = R"hlo(
-  HloModule SliceWithReshape
-
-  ENTRY main {
-    %arg = f32[2]{0} parameter(0)
-    %reshape.1 = f32[2,1]{1,0} reshape(%arg)
-    %slice = f32[1,1]{1,0} slice(%reshape.1), slice={[0:1], [0:1]}
-    ROOT %reshape.2 = f32[1]{0} reshape(%slice)
-  }
-)hlo";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(module.get()).value());
-
-  auto* root = module->entry_computation()->root_instruction();
-  LOG(INFO) << module->ToString();
-
-  // Expected: Reshape(Slice(Arg))
-  // AlgebraicSimplifier merges the two reshapes.
-  const HloInstruction* slice;
-  EXPECT_THAT(root, GmockMatch(m::Reshape(
-                        m::Slice(&slice, m::Parameter(0)))));
-
-  EXPECT_EQ(slice->slice_strides(0), 1);
-  EXPECT_EQ(slice->slice_starts(0), 0);
-  EXPECT_EQ(slice->slice_limits(0), 1);
-  EXPECT_EQ(slice->shape().dimensions(0), 1);
-}
-
 TEST_F(AlgebraicSimplifierTest, SliceOfBroadcastToBroadcast) {
   HloComputation::Builder builder(TestName());
   const int64_t dim0 = 11;

From b198f87cb214c8e52c39a57b374e5ba320c9804c Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Mon, 8 Dec 2025 13:19:09 -0800
Subject: [PATCH 045/753] [XLA:CPU/GPU] Emit Arith::NegFOp in tiled emitter.

This gives better numerical stability on CPU which does support the instruction, we simply rewrite it back to it's original form to ensure the triton lowering works.

PiperOrigin-RevId: 841890088
---
 .../codegen/triton/compilation_pipeline.cc    |  1 +
 .../gpu/codegen/triton/emitter_helpers.cc     |  6 +-
 .../triton/fusion_emitter_device_test.cc      |  8 +-
 .../backends/gpu/codegen/triton/support.cc    |  3 +-
 .../gpu/codegen/triton/support_test.cc        |  3 +-
 .../gpu/codegen/triton/transforms/BUILD       |  2 +
 .../gpu/codegen/triton/transforms/passes.h    |  1 +
 .../gpu/codegen/triton/transforms/passes.td   |  8 ++
 ...nsupported_elementwise_to_triton_pass.mlir | 20 +++++
 .../unsupported_elementwise_to_triton_pass.cc | 83 +++++++++++++++++++
 10 files changed, 127 insertions(+), 8 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/codegen/triton/transforms/tests/unsupported_elementwise_to_triton_pass.mlir
 create mode 100644 third_party/xla/xla/backends/gpu/codegen/triton/transforms/unsupported_elementwise_to_triton_pass.cc

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
index b23a7dfe498e48..67c1eb9b94d7a7 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
@@ -40,6 +40,7 @@ void CreateTritonXlaPipeline(
   pm->addPass(mlir::triton::xla::CreateStableHLOLowerToTritonPass());
 
   pm->addPass(emitters::CreateSafeIntegerArithmeticPass());
+  pm->addPass(mlir::triton::xla::CreateUnsupportedElementwiseToTritonPass());
 
   auto* cuda_cc = gpu_cc.cuda_compute_capability();
   bool is_at_least_hopper = cuda_cc != nullptr && cuda_cc->IsAtLeastHopper();
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc b/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc
index f73c72bcf7873a..19c2bed34cf55c 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/emitter_helpers.cc
@@ -439,8 +439,10 @@ absl::StatusOr<Value> EmitElementwise(mlir::ImplicitLocOpBuilder& b,
     case HloOpcode::kNot:
       return ma::XOrIOp::create(b, inputs[0], OnesLike(b, inputs[0].getType()));
     case HloOpcode::kNegate:
-      // NegFOp is not supported by Triton.
-      return Subtract(b, {ZerosLike(b, inputs[0]), inputs[0]});
+      if (is_integer) {
+        return Subtract(b, {ZerosLike(b, inputs[0]), inputs[0]});
+      }
+      return ma::NegFOp::create(b, inputs[0]);
     case HloOpcode::kConvert: {
       TF_ASSIGN_OR_RETURN(
           Type dst_ty, PrimitiveTypeToMlirType(b, hlo.shape().element_type()));
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
index 99112cd3bf51b3..d93e3016930055 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
@@ -3109,10 +3109,10 @@ ENTRY entry_computation {
       CreateTritonIrAndFileCheck(this, kHloText, "triton_computation", R"(
 CHECK:     xtile.extract {{.*}} -> tensor<f32>
 CHECK:     tt.extern_elementwise {{.*}} (f32) -> f32
-CHECK:     arith.subf {{.*}} f32
+CHECK:     arith.negf {{.*}} f32
 CHECK:     xtile.extract {{.*}} -> tensor<f32>
 CHECK:     tt.extern_elementwise {{.*}} (f32) -> f32
-CHECK:     arith.subf {{.*}} f32
+CHECK:     arith.negf {{.*}} f32
 CHECK:     arith.addf {{.*}} f32
 CHECK:     arith.mulf {{.*}} f32
 CHECK:     arith.divf {{.*}} f32
@@ -3622,7 +3622,7 @@ CHECK:      {{.*}} = scf.for %{{.*}} = %[[C0]] to %[[C4]] step %[[C1]]
 CHECK-SAME: iter_args({{.*}}) -> (tensor<16x64xf32>) {
 CHECK-DAG:  xtile.extract %[[ARG0]]
 CHECK-DAG:  xtile.extract %[[ARG1]]
-CHECK-DAG:  arith.subf {{.*}} : tensor<16x32xf32>
+CHECK-DAG:  arith.negf {{.*}} : tensor<16x32xf32>
 CHECK-DAG:  math.absf {{.*}} : tensor<32x64xf32>
 CHECK:      stablehlo.dot_general {{.*}} (tensor<16x32xf32>, tensor<32x64xf32>) -> tensor<16x64xf32>
 CHECK:      arith.addf {{.*}}
@@ -3643,7 +3643,7 @@ CHECK:      {{.*}} = scf.for %{{.*}} = %[[C0]] to %[[C4]] step %[[C1]]
 CHECK-SAME: iter_args({{.*}}) -> (tensor<16x64xf32>) {
 CHECK-DAG:  xtile.extract %[[ARG0]]
 CHECK-DAG:  xtile.extract %[[ARG1]]
-CHECK-DAG:  arith.subf {{.*}} : tensor<16x32xf32>
+CHECK-DAG:  arith.negf {{.*}} : tensor<16x32xf32>
 CHECK-DAG:  math.absf {{.*}} : tensor<32x64xf32>
 CHECK:      tt.dot {{.*}} tensor<16x32xf32> * tensor<32x64xf32> -> tensor<16x64xf32>
 CHECK:      scf.yield {{.*}} : tensor<16x64xf32>
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/support.cc b/third_party/xla/xla/backends/gpu/codegen/triton/support.cc
index de63f5dd669128..2f044f7f98afa7 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/support.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/support.cc
@@ -92,7 +92,8 @@ absl::flat_hash_set<HloOpcode> TritonSupportedUnaryElementwiseOps(
   absl::flat_hash_set<HloOpcode> ret{HloOpcode::kAbs, HloOpcode::kCopy};
 
   if (element_type != PrimitiveType::F8E5M2 &&
-      element_type != PrimitiveType::F8E4M3FN) {
+      element_type != PrimitiveType::F8E4M3FN &&
+      element_type != PrimitiveType::F8E8M0FNU) {
     ret.insert(HloOpcode::kNegate);
   }
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc
index bbb7ac9a3e931a..7cda77ed4e673a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc
@@ -123,9 +123,10 @@ bool DoesOpSupportType(HloOpcode opcode, PrimitiveType type) {
     case HloOpcode::kDivide:
     case HloOpcode::kRemainder:
     case HloOpcode::kSubtract:
-    case HloOpcode::kNegate:
     case HloOpcode::kIota:
       return type != PRED;
+    case HloOpcode::kNegate:
+      return type != PRED && type != F8E8M0FNU;
     case HloOpcode::kRng:
       return !pu::IsComplexType(type);
     case HloOpcode::kComplex:
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/BUILD
index 5cc3e4f9c12505..9a722714a8357f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/BUILD
@@ -49,6 +49,7 @@ cc_library(
         "triton_xla_math_to_libdevice.cc",
         "triton_xla_squeeze_dims_pass.cc",
         "triton_xla_unswitch_loops_pass.cc",
+        "unsupported_elementwise_to_triton_pass.cc",
         "xtile_lower_to_triton.cc",
     ],
     hdrs = ["passes.h"],
@@ -82,6 +83,7 @@ cc_library(
         "@llvm-project//llvm:TargetParser",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:ArithUtils",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FunctionInterfaces",
         "@llvm-project//mlir:IR",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.h b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.h
index 2ab433be265408..75007131ffdb13 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.h
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.h
@@ -54,6 +54,7 @@ std::unique_ptr<mlir::Pass> CreateTritonXLAMathToLibdevicePass(
     absl::string_view libdevice_path, absl::string_view triple);
 std::unique_ptr<mlir::Pass> CreateXTileLowerToTritonPass();
 std::unique_ptr<mlir::Pass> CreateArithFP8ConversionToTritonPass();
+std::unique_ptr<mlir::Pass> CreateUnsupportedElementwiseToTritonPass();
 
 // Returns true if the `op` contains an operation in it's regions that satisfies
 // the `fn`.
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.td b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.td
index 1c4d71feb98e1b..d8779d2ba0f4ce 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.td
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/passes.td
@@ -264,5 +264,13 @@ def ArithFP8ConversionToTritonPass
     "::mlir::triton::TritonDialect",
   ];
 }
+def UnsupportedElementwiseToTritonPass
+    : Pass<"unsupported-elementwise-to-triton"> {
+  let summary =
+    "Converts unsupported elementwise operations to their Triton equivalent.";
+  let dependentDialects = [
+    "::mlir::arith::ArithDialect",
+  ];
+}
 
 #endif  // XLA_BACKENDS_GPU_CODEGEN_TRITON_PASSES_TD_
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/tests/unsupported_elementwise_to_triton_pass.mlir b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/tests/unsupported_elementwise_to_triton_pass.mlir
new file mode 100644
index 00000000000000..8313bbc324ae06
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/tests/unsupported_elementwise_to_triton_pass.mlir
@@ -0,0 +1,20 @@
+// RUN: xla-opt %s -split-input-file -unsupported-elementwise-to-triton \
+// RUN: | FileCheck %s
+
+func.func @converts_tensor_negf_to_subf(%arg0: tensor<10xf32>) -> tensor<10xf32> {
+  // CHECK: %[[ZERO:.*]] = arith.constant dense<0.000000e+00> : tensor<10xf32>
+  // CHECK: %[[SUB:.*]] = arith.subf %[[ZERO]], %arg0 : tensor<10xf32>
+  %0 = arith.negf %arg0 : tensor<10xf32>
+  // CHECK: return %[[SUB]] : tensor<10xf32>
+  func.return %0 : tensor<10xf32>
+}
+
+//-----
+
+func.func @converts_scalar_negf_to_subf(%arg0: f32) -> f32 {
+  // CHECK: %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
+  // CHECK: %[[SUB:.*]] = arith.subf %[[ZERO]], %arg0 : f32
+  %0 = arith.negf %arg0 : f32
+  // CHECK: return %[[SUB]] : f32
+  func.return %0 : f32
+}
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/unsupported_elementwise_to_triton_pass.cc b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/unsupported_elementwise_to_triton_pass.cc
new file mode 100644
index 00000000000000..fabfc8d9815c3b
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/unsupported_elementwise_to_triton_pass.cc
@@ -0,0 +1,83 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "llvm/ADT/APFloat.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/TypeUtilities.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "xla/backends/gpu/codegen/triton/transforms/passes.h"
+
+namespace mlir::triton::xla {
+
+#define GEN_PASS_DEF_UNSUPPORTEDELEMENTWISETOTRITONPASS
+#include "xla/backends/gpu/codegen/triton/transforms/passes.h.inc"
+
+namespace {
+
+class RewriteNegFToSubtract : public OpRewritePattern<mlir::arith::NegFOp> {
+ public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(mlir::arith::NegFOp op,
+                                PatternRewriter& rewriter) const override {
+    mlir::Type element_type = getElementTypeOrSelf(op.getType());
+    auto type = mlir::dyn_cast<mlir::FloatType>(element_type);
+
+    if (!type) {
+      return rewriter.notifyMatchFailure(op, "expected float type");
+    }
+
+    const llvm::fltSemantics& semantics = type.getFloatSemantics();
+    mlir::Value zero_value =
+        mlir::createScalarOrSplatConstant(rewriter, op->getLoc(), op.getType(),
+                                          mlir::APFloat::getZero(semantics));
+
+    rewriter.replaceOpWithNewOp<mlir::arith::SubFOp>(op, zero_value,
+                                                     op.getOperand());
+    return success();
+  }
+};
+
+struct UnsupportedElementwiseToTritonPass
+    : public impl::UnsupportedElementwiseToTritonPassBase<
+          UnsupportedElementwiseToTritonPass> {
+  void runOnOperation() override {
+    auto module = getOperation();
+    mlir::RewritePatternSet patterns(
+        &getContext(), std::make_unique<RewriteNegFToSubtract>(&getContext()));
+    if (failed(applyPatternsGreedily(module, std::move(patterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<mlir::Pass> CreateUnsupportedElementwiseToTritonPass() {
+  return std::make_unique<UnsupportedElementwiseToTritonPass>();
+}
+
+}  // namespace mlir::triton::xla

From 5380ac8c2e7b8f20046d1115eee73d0cc4e464e3 Mon Sep 17 00:00:00 2001
From: Alex Pivovarov <upwind@google.com>
Date: Mon, 8 Dec 2025 13:42:05 -0800
Subject: [PATCH 046/753] Use absl::StrAppend for string building.

This change

- Replaces string concatenation with absl::StrAppend for efficiency

- Updates usages of se::DeviceMemoryBase to se::DeviceAddressBase, reflecting a type renaming.

PiperOrigin-RevId: 841899098
---
 .../gpu/runtime/dynamic_slice_thunk.cc        | 32 ++++++++++---------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc
index 5628f682b31981..23227934cd3775 100644
--- a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc
@@ -118,15 +118,17 @@ std::string DynamicSliceThunk::SliceDef::ToString() const {
 
   // embedded_thunk_argument
   if (embedded_thunk_argument.has_value()) {
-    result += "embedded_thunk_argument:" + embedded_thunk_argument->ToString();
+    absl::StrAppend(&result, "embedded_thunk_argument:",
+                    embedded_thunk_argument->ToString());
   } else {
-    result += "embedded_thunk_argument:null";
+    absl::StrAppend(&result, "embedded_thunk_argument:null");
   }
 
   // offsets
   if (offsets.has_value()) {
-    result += ", offsets:[";
-    result +=
+    absl::StrAppend(&result, ", offsets:[");
+    absl::StrAppend(
+        &result,
         absl::StrJoin(*offsets, ", ", [](std::string* out, const auto& offset) {
           std::visit(
               [out](const auto& value) {
@@ -141,34 +143,34 @@ std::string DynamicSliceThunk::SliceDef::ToString() const {
                 }
               },
               offset);
-        });
-    result += "]";
+        }));
+    absl::StrAppend(&result, "]");
   } else {
-    result += ", offsets:null";
+    absl::StrAppend(&result, ", offsets:null");
   }
 
   // orig_shape
   if (orig_shape.has_value()) {
-    result += ", orig_shape:" + orig_shape->ToString();
+    absl::StrAppend(&result, ", orig_shape:", orig_shape->ToString());
   } else {
-    result += ", orig_shape:null";
+    absl::StrAppend(&result, ", orig_shape:null");
   }
 
   // sliced_shape
   if (sliced_shape.has_value()) {
-    result += ", sliced_shape:" + sliced_shape->ToString();
+    absl::StrAppend(&result, ", sliced_shape:", sliced_shape->ToString());
   } else {
-    result += ", sliced_shape:null";
+    absl::StrAppend(&result, ", sliced_shape:null");
   }
 
   // offset_byte_size
   if (offset_byte_size.has_value()) {
-    result += ", offset_byte_size:" + absl::StrCat(*offset_byte_size);
+    absl::StrAppend(&result, ", offset_byte_size:", *offset_byte_size);
   } else {
-    result += ", offset_byte_size:null";
+    absl::StrAppend(&result, ", offset_byte_size:null");
   }
 
-  result += "}";
+  absl::StrAppend(&result, "}");
   return result;
 }
 
@@ -243,7 +245,7 @@ absl::Status DynamicSliceThunk::Prepare(const PrepareParams& params) {
         HloEvaluator()
             .Evaluate(
                 /*module=*/*offset_as_function_of_indvar_metadata_->indvar_init,
-                /*arg_literals=*/{})
+                /*args=*/{})
             .value();
     VLOG(2) << "Indvar init module: "
             << offset_as_function_of_indvar_metadata_->indvar_init->ToString();

From 72b1ad96d11b5cb8278667a57d5e8255ff82cfe5 Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Mon, 8 Dec 2025 13:45:56 -0800
Subject: [PATCH 047/753] [XLA:CPU][XTile] Wrap copy operation if tiling
 enabled.

This is significantly quicker than the existing implementation when tiled emitter is enabled (due to it being multithreaded)

PiperOrigin-RevId: 841900416
---
 third_party/xla/xla/backends/cpu/codegen/tiled/BUILD       | 5 ++++-
 .../xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.cc | 4 ++--
 .../xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h  | 2 ++
 .../cpu/codegen/tiled/tiled_fusion_emitter_stub.cc         | 2 ++
 .../xla/backends/cpu/testlib/kernel_runner_extension.cc    | 3 ++-
 third_party/xla/xla/service/cpu/BUILD                      | 1 +
 third_party/xla/xla/service/cpu/cpu_compiler.cc            | 4 +++-
 third_party/xla/xla/service/cpu/fusion_wrapper.cc          | 7 +++++++
 third_party/xla/xla/service/cpu/fusion_wrapper.h           | 6 ++++--
 third_party/xla/xla/service/cpu/fusion_wrapper_test.cc     | 2 +-
 10 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/BUILD b/third_party/xla/xla/backends/cpu/codegen/tiled/BUILD
index 0e0243f5b6dbd5..1ae35c6a56c63d 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/BUILD
@@ -34,7 +34,10 @@ cc_library(
         ["tiled_fusion_emitter_stub.cc"],
     ),
     hdrs = ["tiled_fusion_emitter.h"],
-    visibility = ["//xla/backends/cpu/codegen:__pkg__"],
+    visibility = [
+        "//xla/backends/cpu/codegen:__pkg__",
+        "//xla/service/cpu:__pkg__",
+    ],
     deps = [
         "//xla:shape_util",
         "//xla:util",
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.cc
index ca968b3df04a1d..74f951b2ef9d3b 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.cc
@@ -113,7 +113,7 @@ absl::StatusOr<std::vector<FlatTiling>> GetTiling(
 }
 
 // We don't currently support sub-byte types in the tiled CPU emitter.
-static bool IsSupportedType(PrimitiveType type) {
+bool IsSupportedTilingType(PrimitiveType type) {
   if (type == PRED) {
     return true;
   }
@@ -144,7 +144,7 @@ static bool IsSupportedShape(const Shape& shape) {
   ShapeUtil::ForEachSubshape(
       shape, [&](const Shape& subshape, const ShapeIndex& index) {
         if (subshape.IsArray()) {
-          if (!IsSupportedType(subshape.element_type())) {
+          if (!IsSupportedTilingType(subshape.element_type())) {
             is_supported = false;
           }
         }
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h b/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h
index d2f88d17d85b74..6a8eaaf96b7ac2 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h
@@ -32,6 +32,8 @@ limitations under the License.
 
 namespace xla::cpu {
 
+bool IsSupportedTilingType(PrimitiveType type);
+
 absl::StatusOr<std::vector<FlatTiling>> GetTilingIfSupported(
     mlir::MLIRContext& context, const HloFusionInstruction& fusion);
 
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter_stub.cc b/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter_stub.cc
index 37f2abadb37ce4..87e14516c1d740 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter_stub.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/tiled_fusion_emitter_stub.cc
@@ -30,6 +30,8 @@ limitations under the License.
 
 namespace xla::cpu {
 
+bool IsSupportedTilingType(PrimitiveType type) { return false; }
+
 absl::StatusOr<std::vector<FlatTiling>> GetTilingIfSupported(
     mlir::MLIRContext& context, const HloFusionInstruction& fusion) {
   return absl::UnimplementedError("not supported for this build configuration");
diff --git a/third_party/xla/xla/backends/cpu/testlib/kernel_runner_extension.cc b/third_party/xla/xla/backends/cpu/testlib/kernel_runner_extension.cc
index 20385e34237e0d..e3646f2fd624b2 100644
--- a/third_party/xla/xla/backends/cpu/testlib/kernel_runner_extension.cc
+++ b/third_party/xla/xla/backends/cpu/testlib/kernel_runner_extension.cc
@@ -271,7 +271,8 @@ NB_MODULE(_extension, kernel_runner_module) {
   kernel_runner_module.def(
       "run_fusion_wrapper_pass",
       [](std::unique_ptr<HloModule, nb::deleter<HloModule>> hlo_module) {
-        FusionWrapper fusion_wrapper(true);
+        FusionWrapper fusion_wrapper(/*using_new_fusion_emitter=*/true,
+                                     /*use_tiled_emitter=*/true);
         absl::StatusOr<bool> result = fusion_wrapper.Run(hlo_module.get());
         if (!result.ok()) {
           throw std::runtime_error(std::string(result.status().message()));
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index f9d93965489130..7154ec3e7ff9c5 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -1097,6 +1097,7 @@ cc_library(
     srcs = ["fusion_wrapper.cc"],
     hdrs = ["fusion_wrapper.h"],
     deps = [
+        "//xla/backends/cpu/codegen/tiled:tiled_fusion_emitter",
         "//xla/codegen/emitters:fusion_wrapper_base",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/strings:string_view",
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index a6117a8169ddc3..12b1e38459b79e 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -1017,7 +1017,9 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   if (is_fusion_emitters) {
     bool use_experimental_loop_fusion =
         options::UseExperimentalLoopFusion(module->config());
-    pipeline.AddPass<FusionWrapper>(use_experimental_loop_fusion);
+    bool use_tiled_emitter = options::EnableTiledEmitter(module->config());
+    pipeline.AddPass<FusionWrapper>(use_experimental_loop_fusion,
+                                    use_tiled_emitter);
   }
 
   AliasInfo alias_info;
diff --git a/third_party/xla/xla/service/cpu/fusion_wrapper.cc b/third_party/xla/xla/service/cpu/fusion_wrapper.cc
index af4cf569643a95..1bef382eff71fe 100644
--- a/third_party/xla/xla/service/cpu/fusion_wrapper.cc
+++ b/third_party/xla/xla/service/cpu/fusion_wrapper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/service/cpu/fusion_wrapper.h"
 
+#include "xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 
@@ -85,6 +86,12 @@ bool FusionWrapper::MustWrapInstruction(const HloInstruction& instruction) {
     case HloOpcode::kTanh:
     case HloOpcode::kXor:
       return using_new_fusion_emitter_;
+    case HloOpcode::kCopy:
+      if (use_tiled_emitter_) {
+        PrimitiveType type = instruction.shape().element_type();
+        return IsSupportedTilingType(type);
+      }
+      return false;
     // The following ops are supported but the performance is not as good as the
     // non-fusion path.
     // TODO(willfroom): Remove this once the performance is improved.
diff --git a/third_party/xla/xla/service/cpu/fusion_wrapper.h b/third_party/xla/xla/service/cpu/fusion_wrapper.h
index 5f430f93afa8c7..5da07c2f3efc3f 100644
--- a/third_party/xla/xla/service/cpu/fusion_wrapper.h
+++ b/third_party/xla/xla/service/cpu/fusion_wrapper.h
@@ -28,8 +28,9 @@ namespace cpu {
 // kick in.
 class FusionWrapper : public emitters::FusionWrapperBase {
  public:
-  explicit FusionWrapper(bool using_new_fusion_emitter)
-      : using_new_fusion_emitter_(using_new_fusion_emitter) {}
+  explicit FusionWrapper(bool using_new_fusion_emitter, bool use_tiled_emitter)
+      : using_new_fusion_emitter_(using_new_fusion_emitter),
+        use_tiled_emitter_(use_tiled_emitter) {}
   ~FusionWrapper() override = default;
 
   absl::string_view name() const override { return "fusion-wrapper"; }
@@ -38,6 +39,7 @@ class FusionWrapper : public emitters::FusionWrapperBase {
 
  private:
   bool using_new_fusion_emitter_;
+  bool use_tiled_emitter_;
 };
 
 }  // namespace cpu
diff --git a/third_party/xla/xla/service/cpu/fusion_wrapper_test.cc b/third_party/xla/xla/service/cpu/fusion_wrapper_test.cc
index b8e1438ef1dc34..c81369604eb756 100644
--- a/third_party/xla/xla/service/cpu/fusion_wrapper_test.cc
+++ b/third_party/xla/xla/service/cpu/fusion_wrapper_test.cc
@@ -56,7 +56,7 @@ TEST_F(FusionWrapperTest, Scatter) {
   )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
                           ParseAndReturnVerifiedModule(hlo_string));
-  FusionWrapper wrapper(false);
+  FusionWrapper wrapper(false, false);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, wrapper.Run(m.get()));
   EXPECT_TRUE(changed);
 

From 5854d191dc17b477b4efc7228160f3febcfd72a6 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Mon, 8 Dec 2025 13:53:25 -0800
Subject: [PATCH 048/753] [XLA] Add methods to permute the operands of a fusion
 op.

PiperOrigin-RevId: 841903126
---
 third_party/xla/xla/hlo/ir/hlo_computation.cc | 30 ++++++++++++++++
 third_party/xla/xla/hlo/ir/hlo_computation.h  |  4 +++
 .../xla/xla/hlo/ir/hlo_instructions.cc        | 27 ++++++++++++++
 third_party/xla/xla/hlo/ir/hlo_instructions.h |  4 +++
 third_party/xla/xla/service/BUILD             |  1 +
 .../xla/xla/service/hlo_instruction_test.cc   | 35 +++++++++++++++++++
 6 files changed, 101 insertions(+)

diff --git a/third_party/xla/xla/hlo/ir/hlo_computation.cc b/third_party/xla/xla/hlo/ir/hlo_computation.cc
index 57989017c4e43a..50d5655aeab3ab 100644
--- a/third_party/xla/xla/hlo/ir/hlo_computation.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_computation.cc
@@ -593,6 +593,36 @@ absl::Status HloComputation::RemoveUnusedParametersImpl(bool allow_non_fusion) {
   return absl::OkStatus();
 }
 
+absl::Status HloComputation::PermuteParameters(
+    absl::Span<const int64_t> permutation) {
+  if (permutation.size() != num_parameters()) {
+    return absl::InvalidArgumentError(
+        "Permutation size must match the number of parameters.");
+  }
+  if (permutation.size() == 1) {
+    return absl::OkStatus();
+  }
+
+  std::vector<std::unique_ptr<HloInstruction>> new_param_instructions(
+      num_parameters());
+  for (int64_t i = 0; i < num_parameters(); ++i) {
+    int64_t new_param_number = permutation[i];
+    new_param_instructions[new_param_number] = HloInstruction::CreateParameter(
+        new_param_number, param_instructions_[i]->shape(),
+        param_instructions_[i]->name());
+  }
+
+  for (int64_t i = 0; i < num_parameters(); ++i) {
+    ReplaceParameter(i, std::move(new_param_instructions[permutation[i]]));
+  }
+
+  absl::c_sort(param_instructions_,
+               [](const HloInstruction* a, const HloInstruction* b) {
+                 return a->parameter_number() < b->parameter_number();
+               });
+  return absl::OkStatus();
+}
+
 bool HloComputation::IsSafelyRemovable(
     const HloInstruction* instruction, bool ignore_control_dependency,
     std::optional<
diff --git a/third_party/xla/xla/hlo/ir/hlo_computation.h b/third_party/xla/xla/hlo/ir/hlo_computation.h
index f13f1a8a937aa0..64f3ca34d84a98 100644
--- a/third_party/xla/xla/hlo/ir/hlo_computation.h
+++ b/third_party/xla/xla/hlo/ir/hlo_computation.h
@@ -984,6 +984,10 @@ class HloComputation {
 
   void ClearCalledComputations();
 
+  // Permutes the parameter numbers of this computation according to the
+  // provided permutation.
+  absl::Status PermuteParameters(absl::Span<const int64_t> permutation);
+
  private:
   friend class HloModule;
 
diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.cc b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
index 55c3b05fe0d10f..526165754c9794 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
@@ -2597,6 +2597,33 @@ absl::Status HloFusionInstruction::DeduplicateFusionOperands() {
   return absl::OkStatus();
 }
 
+absl::Status HloFusionInstruction::PermuteFusionOperands(
+    absl::Span<const int64_t> permutation) {
+  if (permutation.size() != operand_count()) {
+    return absl::InvalidArgumentError(
+        "Permutation size must match the number of operands.");
+  }
+  std::vector<bool> seen(permutation.size(), false);
+  for (int64_t i = 0; i < permutation.size(); ++i) {
+    if (permutation[i] < 0 || permutation[i] >= operand_count() ||
+        seen[permutation[i]]) {
+      return absl::InvalidArgumentError(
+          "Argument is not a permutation of operand indices.");
+    }
+    seen[permutation[i]] = true;
+  }
+
+  TF_RETURN_IF_ERROR(
+      fused_instructions_computation()->PermuteParameters(permutation));
+  InstructionVector new_operands(operand_count());
+  for (int64_t i = 0; i < operand_count(); ++i) {
+    new_operands[permutation[i]] = mutable_operand(i);
+  }
+  RemoveAllOperands();
+  AppendOperands(new_operands);
+  return absl::OkStatus();
+}
+
 HloCallInstruction::HloCallInstruction(const Shape& shape,
                                        HloInstruction* called_computation_root)
     : HloCallableInstruction(HloOpcode::kCall, shape) {
diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.h b/third_party/xla/xla/hlo/ir/hlo_instructions.h
index 7c6753c80071a6..b73f54e0b830f9 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.h
@@ -1588,6 +1588,10 @@ class HloFusionInstruction : public HloCallableInstruction {
   // If multiple operands are the same instruction, keeps only one of them.
   absl::Status DeduplicateFusionOperands();
 
+  // Permutes the operands computation according to the provided permutation.
+  // The fusion computation is also adjusted accordingly.
+  absl::Status PermuteFusionOperands(absl::Span<const int64_t> permutation);
+
   static bool ClassOf(const HloInstruction* hlo) {
     return hlo->opcode() == HloOpcode::kFusion;
   }
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index b87bc885903e10..52e39430e85f9d 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -925,6 +925,7 @@ xla_cc_test(
         "//xla/service/gpu:backend_configs_cc",
         "//xla/tests:xla_internal_test_main",
         "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
         "//xla/tsl/util/proto:proto_matchers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/service/hlo_instruction_test.cc b/third_party/xla/xla/service/hlo_instruction_test.cc
index fddfda34a81e12..aa90adc01f6e8a 100644
--- a/third_party/xla/xla/service/hlo_instruction_test.cc
+++ b/third_party/xla/xla/service/hlo_instruction_test.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
 #include "xla/util.h"
 #include "xla/window_util.h"
@@ -3381,5 +3382,39 @@ TEST_F(HloInstructionTest, DifferentResultAccuracy) {
   EXPECT_FALSE(exp1->equal_result_accuracy(exp2));
 }
 
+TEST_F(HloInstructionTest, FusionPermuteOperandsTest) {
+  constexpr char kHloString[] = R"(
+  HloModule test_module
+  fusion_computation {
+    p0 = f32[] parameter(0)
+    p1 = f32[32] parameter(1)
+    p2 = f32[32,32] parameter(2)
+    bcast0 = f32[32,32] broadcast(p0), dimensions={}
+    bcast1 = f32[32,32] broadcast(p1), dimensions={0}
+    sub = f32[32,32] subtract(bcast0, bcast1)
+    ROOT add = f32[32,32] add(sub, p2)
+  }
+
+  ENTRY reduce {
+    p0 = f32[] parameter(0)
+    p1 = f32[32] parameter(1)
+    p2 = f32[32,32] parameter(2)
+    ROOT root = f32[32,32] fusion(p0, p1, p2), kind=kLoop, calls=fusion_computation
+  })";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kHloString));
+  HloFusionInstruction* fusion = Cast<HloFusionInstruction>(
+      module->entry_computation()->root_instruction());
+  EXPECT_OK(fusion->PermuteFusionOperands({1, 2, 0}));
+
+  EXPECT_THAT(fusion, GmockMatch(m::Fusion(m::Parameter(2), m::Parameter(0),
+                                           m::Parameter(1))));
+  HloComputation* fusion_computation = fusion->fused_instructions_computation();
+  EXPECT_THAT(fusion_computation->root_instruction(),
+              GmockMatch(m::Add(m::Subtract(m::Broadcast(m::Parameter(1)),
+                                            m::Broadcast(m::Parameter(2))),
+                                m::Parameter(0))));
+}
+
 }  // namespace
 }  // namespace xla

From c78a6691190e8a5428f54515888123fc2e63c50b Mon Sep 17 00:00:00 2001
From: Alex Pivovarov <upwind@google.com>
Date: Mon, 8 Dec 2025 14:16:03 -0800
Subject: [PATCH 049/753] Move helper functions to an anonymous namespace.

This change moves `getScalarLimitOfFloatType` and `getScalarLimitOfIntegerType` into an anonymous namespace to limit their visibility to the current translation unit.

PiperOrigin-RevId: 841912322
---
 .../xla/xla/mlir_hlo/utils/hlo_utils.cc       | 94 ++++++++++---------
 1 file changed, 50 insertions(+), 44 deletions(-)

diff --git a/third_party/xla/xla/mlir_hlo/utils/hlo_utils.cc b/third_party/xla/xla/mlir_hlo/utils/hlo_utils.cc
index af292de860e3f0..4e2a8c6358a16f 100644
--- a/third_party/xla/xla/mlir_hlo/utils/hlo_utils.cc
+++ b/third_party/xla/xla/mlir_hlo/utils/hlo_utils.cc
@@ -17,21 +17,71 @@ limitations under the License.
 
 #include <algorithm>
 #include <cassert>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
 #include <numeric>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include "llvm/Support/ErrorHandling.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/IR/Value.h"
 #include "mlir/Support/LLVM.h"
 
 namespace mlir {
 namespace hlo {
+namespace {
+APFloat getScalarLimitOfFloatType(FloatType floatTy, ScalarLimit limit) {
+  auto& semantics = floatTy.getFloatSemantics();
+  switch (limit) {
+    case kLowest:
+      return APFloat::getLargest(semantics, /*negative=*/true);
+    case kInfinityLowest:
+      return APFloat::getInf(semantics, /*negative=*/true);
+    case kMax:
+      return APFloat::getLargest(semantics, /*negative=*/false);
+    case kInfinityMax:
+      return APFloat::getInf(semantics, /*negative=*/false);
+  }
+  llvm_unreachable("invalid limit");
+}
+
+// Returns a scalar value for the given integer type.
+//
+// The argument 'scalar' describes which scalar value to return. `integer_value`
+// is used to specify the integer value for kInteger. For any other scalar,
+// integer_value is ignored.
+APInt getScalarLimitOfIntegerType(IntegerType integerTy, ScalarLimit limit) {
+  unsigned width = integerTy.getWidth();
+  bool isBool = (width == 1);
+  switch (limit) {
+    case kLowest:
+    case kInfinityLowest:
+      if (integerTy.isUnsigned() || isBool) {
+        return APInt::getMinValue(width);
+      } else {
+        return APInt::getSignedMinValue(width);
+      }
+
+    case kMax:
+    case kInfinityMax:
+      if (integerTy.isUnsigned() || isBool) {
+        return APInt::getMaxValue(width);
+      } else {
+        return APInt::getSignedMaxValue(width);
+      }
+  }
+  llvm_unreachable("invalid limit");
+}
+}  // namespace
 
 static constexpr size_t kPaddingSize = 64;
 
@@ -110,50 +160,6 @@ DenseElementsAttr getScalarNegZeroOfType(Type ty) {
   llvm_unreachable("unsupported type");
 }
 
-static APFloat getScalarLimitOfFloatType(FloatType floatTy, ScalarLimit limit) {
-  auto& semantics = floatTy.getFloatSemantics();
-  switch (limit) {
-    case kLowest:
-      return APFloat::getLargest(semantics, /*negative=*/true);
-    case kInfinityLowest:
-      return APFloat::getInf(semantics, /*negative=*/true);
-    case kMax:
-      return APFloat::getLargest(semantics, /*negative=*/false);
-    case kInfinityMax:
-      return APFloat::getInf(semantics, /*negative=*/false);
-  }
-  llvm_unreachable("invalid limit");
-}
-
-// Returns a scalar value for the given integer type.
-//
-// The argument 'scalar' describes which scalar value to return. `integer_value`
-// is used to specify the integer value for kInteger. For any other scalar,
-// integer_value is ignored.
-static APInt getScalarLimitOfIntegerType(IntegerType integerTy,
-                                         ScalarLimit limit) {
-  unsigned width = integerTy.getWidth();
-  bool isBool = (width == 1);
-  switch (limit) {
-    case kLowest:
-    case kInfinityLowest:
-      if (integerTy.isUnsigned() || isBool) {
-        return APInt::getMinValue(width);
-      } else {
-        return APInt::getSignedMinValue(width);
-      }
-
-    case kMax:
-    case kInfinityMax:
-      if (integerTy.isUnsigned() || isBool) {
-        return APInt::getMaxValue(width);
-      } else {
-        return APInt::getSignedMaxValue(width);
-      }
-  }
-  llvm_unreachable("invalid limit");
-}
-
 DenseElementsAttr getScalarLimitOfType(Type ty, ScalarLimit limit) {
   RankedTensorType scalarTy = RankedTensorType::get({}, ty);
   if (auto floatTy = mlir::dyn_cast<FloatType>(ty)) {

From f1d8c83302c361ce4d3bd15a0b4112e844654634 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Mon, 8 Dec 2025 14:17:50 -0800
Subject: [PATCH 050/753] Generalize CommonPjRtClient::PrepareArguments for
 processing all the input argument handles. This requires introducing a new
 EventSet concept for collecting the definition and device events before
 passing all of this to the internal Execute() function.

PiperOrigin-RevId: 841913034
---
 third_party/xla/xla/pjrt/BUILD                |   3 +
 .../xla/pjrt/abstract_tracked_device_buffer.h |  12 ++
 .../xla/xla/pjrt/common_pjrt_client.cc        | 136 ++++++++++++++++++
 third_party/xla/xla/pjrt/common_pjrt_client.h |  11 ++
 third_party/xla/xla/pjrt/device_event.h       |   7 +
 5 files changed, 169 insertions(+)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 6b382142dbb42f..0f895ff6047f2e 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -143,7 +143,9 @@ cc_library(
         ":device_event",
         ":host_callback",
         ":pjrt_client",
+        ":pjrt_executable",
         ":raw_buffer",
+        ":utils",
         "//xla:future",
         "//xla:literal",
         "//xla:shape_util",
@@ -155,6 +157,7 @@ cc_library(
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
diff --git a/third_party/xla/xla/pjrt/abstract_tracked_device_buffer.h b/third_party/xla/xla/pjrt/abstract_tracked_device_buffer.h
index 85975e95db0a28..03436e55390afb 100644
--- a/third_party/xla/xla/pjrt/abstract_tracked_device_buffer.h
+++ b/third_party/xla/xla/pjrt/abstract_tracked_device_buffer.h
@@ -98,6 +98,18 @@ class AbstractTrackedDeviceBuffer {
         "WaitUntilBufferReadyOnStream is only implemented for GPU.");
   }
 
+  // TODO(parkers): definition events are fixed, so we should just store them
+  // directly.
+  // Returns true if there is an error in any of the events.
+  virtual bool AddDefinitionEventsToSet(PjRtDeviceEventSet& events) {
+    LOG(FATAL) << "TODO IMPLEMENT: AddDefinitionEventsToSet.";
+    return false;
+  }
+
+  virtual void AddUsageEventsToSet(PjRtDeviceEventSet& events) {
+    LOG(FATAL) << "TODO IMPLEMENT: AddUsageEventsToSet.";
+  }
+
  protected:
   void ReleaseDeviceMemory() {
     raw_buffer_ = tsl::RCReference<CommonPjRtRawBuffer>();
diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.cc b/third_party/xla/xla/pjrt/common_pjrt_client.cc
index e737e773ff85f6..d3756104d08943 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/log/check.h"
@@ -47,7 +48,9 @@ limitations under the License.
 #include "xla/pjrt/device_event.h"
 #include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/pjrt_client.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/raw_buffer.h"
+#include "xla/pjrt/utils.h"
 #include "xla/primitive_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
@@ -426,6 +429,139 @@ void CommonPjRtClient::ScheduleRemoteSend(
   usage_event_promise->SetError(error);
 }
 
+absl::Status CommonPjRtClient::PrepareArguments(
+    const ExecuteOptions& options,
+    absl::Span<PjRtBuffer* const> argument_handles,
+    absl::Span<int const> donated_params, PjRtDeviceEventSet& extra_deps,
+    PjRtDeviceEventSet& control_deps,
+    absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>&
+        input_buffers,
+    absl::InlinedVector<CommonPjRtBuffer::ScopedHold, 4>& device_buffers,
+    PjRtDevice* device, int replica, int partition,
+    absl::Span<const Shape> parameter_device_shapes, bool& is_error) {
+  input_buffers.reserve(argument_handles.size());
+  device_buffers.reserve(argument_handles.size());
+  auto donate_it = donated_params.begin();
+  {
+    tsl::profiler::TraceMe t2("Handle inputs");
+    // State for `TestBufferDonationClashes`.
+    absl::flat_hash_map<const void*, std::pair<bool, int>> donation_clashes;
+    donation_clashes.reserve(argument_handles.size());
+    // The first element is the argument index of the donated buffer, and the
+    // second element is the size in bytes of the donated buffer.
+    std::vector<std::pair<int, size_t>> donated_buffer_stats;
+    for (int i = 0; i < argument_handles.size(); ++i) {
+      PjRtBuffer* handle = argument_handles[i];
+      auto* tfrt_buffer = tensorflow::down_cast<CommonPjRtBufferImpl*>(handle);
+      if (tfrt_buffer->device() != device) {
+        return InvalidArgument(
+            "Buffer passed to Execute() as argument %d to replica %d is on "
+            "device %s, but replica is assigned to device %s.",
+            i, replica, tfrt_buffer->device()->DebugString(),
+            device->DebugString());
+      }
+      const bool donated_param =
+          donate_it != donated_params.end() && *donate_it == i;
+      const bool donation_denied_at_runtime =
+          options.non_donatable_input_indices.contains(i);
+      if (donated_param && donation_denied_at_runtime &&
+          tfrt_buffer->on_device_shape().has_layout() &&
+          tfrt_buffer->on_device_shape().layout().memory_space() ==
+              Layout::kHostMemorySpace) {
+        return absl::UnimplementedError(
+            "pinned_host buffers do not support donation denial at runtime via "
+            "`ExecuteOptions::non_donatable_input_indices`");
+      }
+      bool must_donate = donated_param && !donation_denied_at_runtime;
+      if (must_donate) {
+        ++donate_it;
+        if (VLOG_IS_ON(1)) {
+          TF_ASSIGN_OR_RETURN(size_t on_device_size,
+                              tfrt_buffer->GetOnDeviceSizeInBytes());
+          donated_buffer_stats.emplace_back(std::make_pair(i, on_device_size));
+        }
+      }
+      TF_RETURN_IF_ERROR(TestBufferDonationClashes(
+          tfrt_buffer, donation_clashes, must_donate, i, replica, partition));
+      device_buffers.emplace_back(tfrt_buffer->GetBufferWithHold(
+          must_donate ? CommonPjRtBuffer::ScopedHold::kDonation
+                      : CommonPjRtBuffer::ScopedHold::kUsage));
+      CommonPjRtBuffer::ScopedHold& hold = device_buffers.back();
+      if (!hold.ok()) {
+        return InvalidArgument(
+            "Invalid buffer passed to Execute() as argument %d to replica %d: "
+            "%s",
+            i, replica, hold.status().ToString());
+      }
+      auto* device_buffer = hold.buffer();
+
+      const bool is_handle_dynamic_shape =
+          handle->on_device_shape().is_dynamic();
+
+      const Shape& expected_shape = parameter_device_shapes[i];
+      if (device_buffer->raw_buffer()) {
+        tsl::RCReference<CommonPjRtRawBuffer> actual_buffer =
+            device_buffer->raw_buffer();
+        if (is_handle_dynamic_shape && !expected_shape.is_dynamic()) {
+          TF_ASSIGN_OR_RETURN(auto handle_logical_device_shape,
+                              handle->logical_on_device_shape());
+          auto status_or_buffer =
+              actual_buffer->RemoveDynamicShapeMetadataIfPresent(
+                  handle_logical_device_shape);
+
+          if (!status_or_buffer.ok()) {
+            absl::Status status = status_or_buffer.status();
+            tsl::errors::AppendToMessage(
+                &status, absl::StrCat("; Error when preparing the input buffer "
+                                      "to Execute() as argument ",
+                                      i, " to replica ", replica));
+            return status;
+          }
+          actual_buffer = std::move(status_or_buffer).value();
+        }
+        input_buffers.push_back(std::move(actual_buffer));
+      } else {
+        is_error = true;
+      }
+
+      // Definition events are never modified after buffer construction.
+      is_error |= device_buffer->AddDefinitionEventsToSet(extra_deps);
+      // If we are trying to donate this buffer, we must wait on its usage
+      // events as well as its definition events to ensure that all reads on
+      // this buffer (e.g., d2h transfer) have been completed before it can be
+      // mutated. Usage holds on this buffer are excluded during a donation hold
+      // so we know that its usage events won't be modified while we are
+      // enqueueing, but we ignore any errors from usage events.
+      if (must_donate) {
+        device_buffer->AddUsageEventsToSet(control_deps);
+      }
+    }
+    // Debug logging of buffer donation and input buffer shapes and size.
+    if (VLOG_IS_ON(1)) {
+      // Buffer donation information.
+      if (!argument_handles.empty()) {
+        LOG(INFO) << donated_buffer_stats.size() << " arguments out of total "
+                  << argument_handles.size() << " arguments will be donated.";
+        for (auto [index, buffer_size] : donated_buffer_stats) {
+          LOG(INFO) << "Argument " << index << " with size " << buffer_size
+                    << " will be donated.";
+        }
+      }
+      // Input buffers shape and size.
+      for (int i = 0; i < input_buffers.size(); ++i) {
+        size_t buffer_size = input_buffers[i]->GetOnDeviceSizeInBytes();
+        TF_ASSIGN_OR_RETURN(Shape actual_input_shape,
+                            argument_handles[i]->logical_on_device_shape());
+        VLOG(2) << "input buffer with index " << i
+                << " has shape: " << actual_input_shape.ToString()
+                << " and size: " << buffer_size;
+      }
+    }
+  }
+
+  return absl::OkStatus();
+}
+
 absl::StatusOr<absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>>
 CommonPjRtClient::AllocateOutputBuffersWithInputReuse(
     const Shape& output_device_shape,
diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.h b/third_party/xla/xla/pjrt/common_pjrt_client.h
index 43cc61bc024696..5470a087376444 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.h
@@ -236,6 +236,17 @@ class CommonPjRtClient : public PjRtClient {
       Future<std::string> serialized_descriptor,
       PjRtBuffer::RemoteSendCallback on_done);
 
+  static absl::Status PrepareArguments(
+      const ExecuteOptions& options,
+      absl::Span<PjRtBuffer* const> argument_handles,
+      absl::Span<int const> donated_params, PjRtDeviceEventSet& extra_deps,
+      PjRtDeviceEventSet& control_deps,
+      absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>&
+          input_buffers,
+      absl::InlinedVector<CommonPjRtBuffer::ScopedHold, 4>& device_buffers,
+      PjRtDevice* device, int replica, int partition,
+      absl::Span<const Shape> parameter_device_shapes, bool& is_error);
+
   absl::StatusOr<absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>>
   AllocateOutputBuffersWithInputReuse(
       const Shape& output_device_shape,
diff --git a/third_party/xla/xla/pjrt/device_event.h b/third_party/xla/xla/pjrt/device_event.h
index 9aa231ebca926f..5e307e0dcc5cc9 100644
--- a/third_party/xla/xla/pjrt/device_event.h
+++ b/third_party/xla/xla/pjrt/device_event.h
@@ -106,6 +106,13 @@ class PjRtDeviceEventPromise : public PjRtDeviceEventOrPromise {
   virtual void SetReady() = 0;
 };
 
+// A collection of events. This is not an event itself because we may want to
+// add events in the future.
+class PjRtDeviceEventSet {
+ public:
+  virtual ~PjRtDeviceEventSet() = default;
+};
+
 }  // namespace xla
 
 #endif  // XLA_PJRT_DEVICE_EVENT_H_

From cb1ad8a22b10a96c3308b16749dd6737b634b929 Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Mon, 8 Dec 2025 14:18:14 -0800
Subject: [PATCH 051/753] [XLA:CPU][XTile] Rewrite llvm vectorized log to
 polynomial approximations.

This is needed to get the same numerics from the tiled & scalar emitter.

PiperOrigin-RevId: 841913170
---
 .../xla/xla/backends/cpu/codegen/polynomial_approximations.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc b/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc
index 232bfe0488ba26..0c8084568e41c5 100644
--- a/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc
@@ -541,6 +541,10 @@ void RewriteToPolynomialApproximations(llvm::Module* module,
 
   rewrite_calls("logf", GenerateVF32Log, /*vector_width=*/1);
   rewrite_calls("llvm.log.f32", GenerateVF32Log, /*vector_width=*/1);
+  rewrite_calls("llvm.log.v2f32", GenerateVF32Log, /*vector_width=*/2);
+  rewrite_calls("llvm.log.v4f32", GenerateVF32Log, /*vector_width=*/4);
+  rewrite_calls("llvm.log.v8f32", GenerateVF32Log, /*vector_width=*/8);
+  rewrite_calls("llvm.log.v16f32", GenerateVF32Log, /*vector_width=*/16);
   rewrite_calls(kLogV4F32Sym, GenerateVF32Log, /*vector_width=*/4);
   rewrite_calls(kLogV8F32Sym, GenerateVF32Log, /*vector_width=*/8);
   rewrite_calls(kLogV16F32Sym, GenerateVF32Log, /*vector_width=*/16);

From b59325607d1ce30065a0a6552e7feb8d7d3dbadc Mon Sep 17 00:00:00 2001
From: Krishna Haridasan <krishnahari@google.com>
Date: Mon, 8 Dec 2025 15:18:41 -0800
Subject: [PATCH 052/753] Make a variant of Get in AttributeMap that returns a
 variant type.

PiperOrigin-RevId: 841934935
---
 third_party/xla/xla/python/ifrt/BUILD                |  1 -
 third_party/xla/xla/python/ifrt/attribute_map.h      | 12 ++++++++----
 .../xla/xla/python/ifrt/attribute_map_test.cc        |  6 +-----
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt/BUILD b/third_party/xla/xla/python/ifrt/BUILD
index 852c0d29384626..1efa12d993c918 100644
--- a/third_party/xla/xla/python/ifrt/BUILD
+++ b/third_party/xla/xla/python/ifrt/BUILD
@@ -179,7 +179,6 @@ xla_cc_test(
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_matchers",
-        "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
     ],
diff --git a/third_party/xla/xla/python/ifrt/attribute_map.h b/third_party/xla/xla/python/ifrt/attribute_map.h
index 714b437c8f092b..b1823d2d88b562 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map.h
+++ b/third_party/xla/xla/python/ifrt/attribute_map.h
@@ -95,15 +95,19 @@ class AttributeMap {
 
   template <typename T>
   absl::StatusOr<T> Get(const std::string& key) const {
-    if constexpr (std::is_same_v<T, std::string> ||
-                  std::is_same_v<T, absl::string_view>) {
+    if constexpr (std::is_same_v<T, Value>) {
+      auto it = map_.find(key);
+      if (it == map_.end()) {
+        return absl::NotFoundError(absl::StrCat("Key not found: ", key));
+      }
+      return it->second;
+    } else if constexpr (std::is_same_v<T, std::string>) {
       return Get<T, StringValue>(key);
     } else if constexpr (std::is_same_v<T, bool>) {
       return Get<T, BoolValue>(key);
     } else if constexpr (std::is_same_v<T, int64_t>) {
       return Get<T, Int64Value>(key);
-    } else if constexpr (std::is_same_v<T, std::vector<int64_t>> ||
-                         std::is_same_v<T, absl::Span<const int64_t>>) {
+    } else if constexpr (std::is_same_v<T, std::vector<int64_t>>) {
       return Get<T, Int64ListValue>(key);
     } else if constexpr (std::is_same_v<T, float>) {
       return Get<T, FloatValue>(key);
diff --git a/third_party/xla/xla/python/ifrt/attribute_map_test.cc b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
index 96069fdfb6ee74..c8425ec79038a4 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map_test.cc
+++ b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
 #include "absl/status/status_matchers.h"
-#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/python/ifrt/serdes_test_util.h"
 #include "xla/python/ifrt/serdes_version.h"
@@ -67,19 +66,16 @@ TEST(AttributeMapTest, Get) {
   });
 
   EXPECT_THAT(map.Get<std::string>("string"), IsOkAndHolds("value"));
-  EXPECT_THAT(map.Get<absl::string_view>("string"), IsOkAndHolds("value"));
   EXPECT_THAT(map.Get<bool>("bool"), IsOkAndHolds(true));
   EXPECT_THAT(map.Get<int64_t>("int64"), IsOkAndHolds(123));
   EXPECT_THAT(map.Get<std::vector<int64_t>>("int64_list"),
               IsOkAndHolds(std::vector<int64_t>{1, 2}));
-  EXPECT_THAT(map.Get<absl::Span<const int64_t>>("int64_list"),
-              IsOkAndHolds(std::vector<int64_t>{1, 2}));
   EXPECT_THAT(map.Get<float>("float"), IsOkAndHolds(1.23f));
 
   EXPECT_THAT(map.Get<std::string>("float"),
               StatusIs(absl::StatusCode::kInvalidArgument,
                        HasSubstr("Value type mismatch for key: float")));
-  EXPECT_THAT(map.Get<absl::Span<const int64_t>>("string"),
+  EXPECT_THAT(map.Get<std::vector<int64_t>>("string"),
               StatusIs(absl::StatusCode::kInvalidArgument,
                        HasSubstr("Value type mismatch for key: string")));
 }

From 4cd611b520678b3b7d39094a6278acaf8b9b3370 Mon Sep 17 00:00:00 2001
From: Krishna Haridasan <krishnahari@google.com>
Date: Mon, 8 Dec 2025 17:01:13 -0800
Subject: [PATCH 053/753] Add ForEach method to IFRT AttributeMap

PiperOrigin-RevId: 841969101
---
 third_party/xla/xla/python/ifrt/BUILD          |  2 +-
 .../xla/xla/python/ifrt/attribute_map.h        | 17 +++++++++++++++--
 .../xla/xla/python/ifrt/attribute_map_test.cc  | 18 +++++++++---------
 .../pjrt_ifrt/pjrt_attribute_map_util.cc       | 10 +++++-----
 .../pjrt_ifrt/pjrt_attribute_map_util_test.cc  |  4 ++--
 .../pjrt_ifrt/xla_executable_impl_test_lib.cc  |  2 +-
 6 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt/BUILD b/third_party/xla/xla/python/ifrt/BUILD
index 1efa12d993c918..c411c99ab65d9c 100644
--- a/third_party/xla/xla/python/ifrt/BUILD
+++ b/third_party/xla/xla/python/ifrt/BUILD
@@ -160,11 +160,11 @@ cc_library(
         ":serdes_version",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/functional:function_ref",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
     ],
 )
 
diff --git a/third_party/xla/xla/python/ifrt/attribute_map.h b/third_party/xla/xla/python/ifrt/attribute_map.h
index b1823d2d88b562..e53be413eb1c4a 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map.h
+++ b/third_party/xla/xla/python/ifrt/attribute_map.h
@@ -26,11 +26,10 @@ limitations under the License.
 
 #include "absl/base/attributes.h"
 #include "absl/container/flat_hash_map.h"
+#include "absl/functional/function_ref.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
-#include "absl/strings/string_view.h"
-#include "absl/types/span.h"
 #include "xla/python/ifrt/attribute_map.pb.h"
 #include "xla/python/ifrt/serdes_default_version_accessor.h"
 #include "xla/python/ifrt/serdes_version.h"
@@ -141,6 +140,20 @@ class AttributeMap {
 
   bool IsEmpty() const { return map_.empty(); }
 
+  // Invokes `f` for each key-value pair in the attribute map.
+  void ForEach(
+      absl::FunctionRef<void(const std::string&, const Value&)> f) const {
+    for (const auto& [key, value] : map_) {
+      f(key, value);
+    }
+  }
+
+  bool operator==(const AttributeMap& other) const {
+    return map_ == other.map_;
+  }
+
+  size_t size() const { return map_.size(); }
+
  private:
   template <typename T, typename V>
   absl::StatusOr<T> Get(const std::string& key) const {
diff --git a/third_party/xla/xla/python/ifrt/attribute_map_test.cc b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
index c8425ec79038a4..c658838e2af1c3 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map_test.cc
+++ b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
@@ -45,14 +45,14 @@ TEST(AttributeMapTest, MapElements) {
       {"float", AttributeMap::FloatValue(1.23f)},
   });
 
-  EXPECT_EQ(map.map(), AttributeMap::Map({
-                           {"string", AttributeMap::StringValue("value")},
-                           {"bool", AttributeMap::BoolValue(true)},
-                           {"int64", AttributeMap::Int64Value(123)},
-                           {"int64_list", AttributeMap::Int64ListValue(
-                                              {int64_t{1}, int64_t{2}})},
-                           {"float", AttributeMap::FloatValue(1.23f)},
-                       }))
+  EXPECT_EQ(map, AttributeMap({
+                     {"string", AttributeMap::StringValue("value")},
+                     {"bool", AttributeMap::BoolValue(true)},
+                     {"int64", AttributeMap::Int64Value(123)},
+                     {"int64_list",
+                      AttributeMap::Int64ListValue({int64_t{1}, int64_t{2}})},
+                     {"float", AttributeMap::FloatValue(1.23f)},
+                 }))
       << map.DebugString();
 }
 
@@ -101,7 +101,7 @@ TEST_P(AttributeMapSerDesTest, ToFromProto) {
 
   TF_ASSERT_OK_AND_ASSIGN(auto map_copy,
                           AttributeMap::FromProto(map.ToProto(version())));
-  EXPECT_EQ(map_copy.map(), map.map()) << map_copy.DebugString();
+  EXPECT_EQ(map_copy, map) << map_copy.DebugString();
 }
 
 INSTANTIATE_TEST_SUITE_P(
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util.cc
index af2a07cb85d92f..a28a1cfa8cf481 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util.cc
@@ -59,12 +59,12 @@ AttributeMap FromPjRtAttributeMap(
 absl::flat_hash_map<std::string, xla::PjRtValueType> ToPjRtAttributeMap(
     AttributeMap attributes) {
   absl::flat_hash_map<std::string, xla::PjRtValueType> result;
-  result.reserve(attributes.map().size());
-  for (auto& item : attributes.map()) {
+  result.reserve(attributes.size());
+  attributes.ForEach([&](const std::string& key,
+                         const AttributeMap::Value& value) {
     std::visit(
         [&](auto& value) {
           using T = std::decay_t<decltype(value)>;
-          const auto& key = item.first;
           if constexpr (std::is_same_v<T, AttributeMap::StringValue>) {
             result.insert({key, std::move(value.value)});
           } else if constexpr (std::is_same_v<T, AttributeMap::BoolValue>) {
@@ -78,8 +78,8 @@ absl::flat_hash_map<std::string, xla::PjRtValueType> ToPjRtAttributeMap(
             result.insert({key, value.value});
           }
         },
-        item.second);
-  }
+        value);
+  });
   return result;
 }
 
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util_test.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util_test.cc
index afee66155aa4ad..dd8742d16610ad 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util_test.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_attribute_map_util_test.cc
@@ -38,8 +38,8 @@ TEST(PjRtAttributeMapUtilTest, FromPjRtAttributeMap) {
       {"float", xla::PjRtValueType(1.23f)},
   });
 
-  EXPECT_EQ(FromPjRtAttributeMap(pjrt_map).map(),
-            AttributeMap::Map({
+  EXPECT_EQ(FromPjRtAttributeMap(pjrt_map),
+            AttributeMap({
                 {"string", AttributeMap::StringValue("value")},
                 {"bool", AttributeMap::BoolValue(true)},
                 {"int64", AttributeMap::Int64Value(123)},
diff --git a/third_party/xla/xla/python/pjrt_ifrt/xla_executable_impl_test_lib.cc b/third_party/xla/xla/python/pjrt_ifrt/xla_executable_impl_test_lib.cc
index 31cc3645eaead6..54007917478279 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/xla_executable_impl_test_lib.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/xla_executable_impl_test_lib.cc
@@ -294,7 +294,7 @@ TEST_P(LoadedExecutableImplTest, Analysis) {
 
   TF_ASSERT_OK_AND_ASSIGN(const auto cost_analysis,
                           executable->GetCostAnalysis());
-  EXPECT_THAT(cost_analysis.map(), Not(IsEmpty()));
+  EXPECT_FALSE(cost_analysis.IsEmpty());
 }
 
 TEST_P(LoadedExecutableImplTest, GetDonatableInputIndices) {

From b8a2067837dab89276b5f0d4f2f104b4ccf3e975 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Mon, 8 Dec 2025 18:32:31 -0800
Subject: [PATCH 054/753] Add Shape to MemzeroThunk/MemzeroCmd buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 841995197
---
 third_party/xla/xla/backends/gpu/runtime/BUILD    |  8 +++++++-
 .../backends/gpu/runtime/command_buffer_cmd.cc    | 15 ++++++++-------
 .../xla/backends/gpu/runtime/command_buffer_cmd.h |  6 +++---
 .../gpu/runtime/command_buffer_cmd_test.cc        |  7 ++++++-
 .../gpu/runtime/command_buffer_thunk_test.cc      |  5 ++++-
 .../xla/xla/backends/gpu/runtime/memset_thunk.cc  |  9 +++++----
 .../xla/xla/backends/gpu/runtime/memset_thunk.h   | 10 +++++-----
 .../xla/backends/gpu/runtime/memset_thunk_test.cc |  9 ++++++++-
 .../xla/xla/backends/gpu/runtime/thunk.proto      |  2 +-
 9 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index c40f279f9212dd..856ea1bc586623 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -88,6 +88,7 @@ cc_library(
         "//xla:util",
         "//xla/backends/gpu/collectives:gpu_clique_key",
         "//xla/core/collectives:communicator",
+        "//xla/core/collectives:reduction_kind",
         "//xla/ffi:attribute_map",
         "//xla/ffi:call_frame",
         "//xla/ffi:execution_state",
@@ -101,7 +102,6 @@ cc_library(
         "//xla/runtime:object_pool",
         "//xla/runtime:resource_use",
         "//xla/service:buffer_assignment",
-        "//xla/service:collective_ops_utils",
         "//xla/service:computation_placer",
         "//xla/service:custom_call_status_internal",
         "//xla/service:custom_call_status_public_headers",
@@ -119,6 +119,7 @@ cc_library(
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor:tensor_map",
         "//xla/stream_executor:trace_command_buffer_factory",
         "//xla/stream_executor/gpu:tma_metadata",
         "//xla/tsl/lib/gtl:int_type",
@@ -152,7 +153,9 @@ xla_test(
     deps = [
         ":command_buffer_cmd",
         ":copy_thunk",
+        ":shaped_slice",
         ":thunk",
+        "//xla:shape_util",
         "//xla/runtime:buffer_use",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
@@ -359,6 +362,7 @@ xla_test(
         ":gpublas_lt_matmul_thunk",
         ":memset_thunk",
         ":sequential_thunk",
+        ":shaped_slice",
         ":thunk",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
@@ -377,6 +381,7 @@ xla_test(
         "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:kernel",
+        "//xla/stream_executor:kernel_args",
         "//xla/stream_executor:kernel_spec",
         "//xla/stream_executor:launch_dim",
         "//xla/stream_executor:platform",
@@ -1190,6 +1195,7 @@ cc_library(
     srcs = ["memset_thunk.cc"],
     hdrs = ["memset_thunk.h"],
     deps = [
+        ":shaped_slice",
         ":thunk",
         "//xla/runtime:buffer_use",
         "//xla/service:buffer_assignment",
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
index 37428162a4b910..866e9a9f0b870b 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
@@ -62,6 +62,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/while_thunk.h"
 #include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/debug_options_flags.h"
 #include "xla/executable_run_options.h"
 #include "xla/ffi/call_frame.h"
@@ -74,7 +75,6 @@ limitations under the License.
 #include "xla/runtime/execution_graph.h"
 #include "xla/runtime/resource_use.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_status_internal.h"
@@ -95,6 +95,7 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/stream_executor/tensor_map.h"
 #include "xla/stream_executor/trace_command_buffer_factory.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
@@ -1369,7 +1370,7 @@ CommandBufferCmd::BufferUseVector MemcpyDeviceToDeviceCmd::buffers() const {
 // MemzeroCmd
 //===----------------------------------------------------------------------===//
 
-MemzeroCmd::MemzeroCmd(BufferAllocation::Slice dst)
+MemzeroCmd::MemzeroCmd(ShapedSlice dst)
     : CommandBufferCmd(CommandBufferCmdType::kMemzeroCmd), dst_(dst) {}
 
 absl::StatusOr<const se::CommandBuffer::Command*> MemzeroCmd::Record(
@@ -1377,12 +1378,12 @@ absl::StatusOr<const se::CommandBuffer::Command*> MemzeroCmd::Record(
     const RecordParams& record_params, RecordAction record_action,
     se::CommandBuffer* command_buffer) {
   se::DeviceAddressBase dst =
-      execute_params.buffer_allocations->GetDeviceAddress(dst_);
+      execute_params.buffer_allocations->GetDeviceAddress(dst_.slice);
 
   VLOG(5) << "MemzeroCmd:";
   VLOG(5) << "  Dst: " << dst_ << " (" << dst.opaque() << ")";
 
-  if (dst_.size() == 0) {
+  if (dst_.slice.size() == 0) {
     VLOG(5) << "Skip recording MemzeroCmd command of 0 bytes";
     return nullptr;
   }
@@ -1391,17 +1392,17 @@ absl::StatusOr<const se::CommandBuffer::Command*> MemzeroCmd::Record(
       std::move(record_action),
       [&](absl::Span<const se::CommandBuffer::Command* const> dependencies) {
         return command_buffer->CreateMemset(&dst, uint8_t{0},
-                                            /*num_elements=*/dst_.size(),
+                                            /*num_elements=*/dst_.slice.size(),
                                             dependencies);
       },
       [&](const se::CommandBuffer::Command* command) {
         return command_buffer->UpdateMemset(command, &dst, uint8_t{0},
-                                            /*num_elements=*/dst_.size());
+                                            /*num_elements=*/dst_.slice.size());
       });
 }
 
 CommandBufferCmd::BufferUseVector MemzeroCmd::buffers() const {
-  return {BufferUse::Write(dst_)};
+  return {BufferUse::Write(dst_.slice, dst_.shape)};
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
index 40a5b9cff1a7c1..70114c4b8a2e00 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
@@ -46,6 +46,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/p2p_thunk_common.h"
 #include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
 #include "xla/ffi/call_frame.h"
@@ -56,7 +57,6 @@ limitations under the License.
 #include "xla/runtime/object_pool.h"
 #include "xla/runtime/resource_use.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/kernels/custom_kernel.h"
 #include "xla/service/gpu/launch_dimensions.h"
@@ -767,7 +767,7 @@ class MemcpyDeviceToDeviceCmd : public CommandBufferCmd {
 
 class MemzeroCmd : public CommandBufferCmd {
  public:
-  explicit MemzeroCmd(BufferAllocation::Slice dst);
+  explicit MemzeroCmd(ShapedSlice dst);
 
   absl::StatusOr<const se::CommandBuffer::Command*> Record(
       const Thunk::ExecuteParams& execute_params,
@@ -777,7 +777,7 @@ class MemzeroCmd : public CommandBufferCmd {
   BufferUseVector buffers() const override;
 
  private:
-  BufferAllocation::Slice dst_;
+  ShapedSlice dst_;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc
index 5dffa09d49e184..c8e59ba1093233 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/copy_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
@@ -36,6 +37,8 @@ limitations under the License.
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/platform_util.h"
 #include "xla/service/service_executable_run_options.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/gpu/gpu_test_kernels_fatbin.h"
@@ -721,6 +724,8 @@ TEST(CommandBufferCmdTest, NestedChildCmdCreateAndUpdate) {
   // Prepare device memory for three buffers.
   int64_t length = 4;
   int64_t byte_length = sizeof(int32_t) * length;
+  Shape shape = ShapeUtil::MakeShape(S32, {length});
+
   se::DeviceAddress<int32_t> a =
       stream_executor->AllocateArray<int32_t>(length);
   se::DeviceAddress<int32_t> b =
@@ -763,7 +768,7 @@ TEST(CommandBufferCmdTest, NestedChildCmdCreateAndUpdate) {
   CommandBufferCmdSequence outer_seq;
   outer_seq.Emplace<ChildCmd>(std::move(middle_executor));
   // Add a couple more commands at the outer level that still don't affect `c`.
-  outer_seq.Emplace<MemzeroCmd>(slice_b);
+  outer_seq.Emplace<MemzeroCmd>(ShapedSlice{slice_b, shape});
   outer_seq.Emplace<EmptyCmd>();
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor outer_executor,
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
index d4a472d5c542e5..fff417ba5dacbe 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h"
 #include "xla/backends/gpu/runtime/memset_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
@@ -54,6 +55,7 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_test_kernels.h"
 #include "xla/stream_executor/gpu/gpu_test_kernels_fatbin.h"
 #include "xla/stream_executor/kernel.h"
+#include "xla/stream_executor/kernel_args.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/platform.h"
@@ -222,6 +224,7 @@ TEST(CommandBufferThunkTest, MemzeroCmd) {
 
   int64_t length = 4;
   int64_t byte_length = sizeof(int32_t) * length;
+  Shape shape = ShapeUtil::MakeShape(S32, {length});
 
   // Prepare arguments: a=42
   se::DeviceAddress<int32_t> a =
@@ -234,7 +237,7 @@ TEST(CommandBufferThunkTest, MemzeroCmd) {
 
   // Prepare commands sequence for constructing command buffer.
   CommandBufferCmdSequence commands;
-  commands.Emplace<MemzeroCmd>(slice_a);
+  commands.Emplace<MemzeroCmd>(ShapedSlice{slice_a, shape});
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor executor,
       CommandBufferCmdExecutor::Create(std::move(commands), serialize));
diff --git a/third_party/xla/xla/backends/gpu/runtime/memset_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/memset_thunk.cc
index a370a04753620b..75508e2365b0b8 100644
--- a/third_party/xla/xla/backends/gpu/runtime/memset_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/memset_thunk.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/tsl/platform/statusor.h"
@@ -30,16 +31,16 @@ namespace gpu {
 
 absl::Status MemzeroThunk::ExecuteOnStream(const ExecuteParams& params) {
   se::DeviceAddressBase dest_data =
-      params.buffer_allocations->GetDeviceAddress(dest_);
+      params.buffer_allocations->GetDeviceAddress(dest_.slice);
   return params.stream->MemZero(&dest_data, dest_data.size());
 }
 
 absl::StatusOr<std::unique_ptr<MemzeroThunk>> MemzeroThunk::FromProto(
     ThunkInfo thunk_info, const MemzeroThunkProto& thunk_proto,
     absl::Span<const BufferAllocation> buffer_allocations) {
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dest,
-                      BufferAllocation::Slice::FromProto(
-                          thunk_proto.dest_buffer(), buffer_allocations));
+  TF_ASSIGN_OR_RETURN(
+      ShapedSlice dest,
+      ShapedSlice::FromProto(thunk_proto.dest_buffer(), buffer_allocations));
   return std::make_unique<MemzeroThunk>(std::move(thunk_info), dest);
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/memset_thunk.h b/third_party/xla/xla/backends/gpu/runtime/memset_thunk.h
index 6b627180c9f5af..aec432fa785d82 100644
--- a/third_party/xla/xla/backends/gpu/runtime/memset_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/memset_thunk.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
@@ -35,17 +36,16 @@ namespace gpu {
 // Thunk that zeroes out a given chunk of memory.
 class MemzeroThunk : public Thunk {
  public:
-  explicit MemzeroThunk(ThunkInfo thunk_info,
-                        const BufferAllocation::Slice& dest)
+  explicit MemzeroThunk(ThunkInfo thunk_info, const ShapedSlice& dest)
       : Thunk(Kind::kMemzero, thunk_info), dest_(dest) {}
 
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
 
-  const BufferAllocation::Slice& destination() const { return dest_; }
+  const ShapedSlice& destination() const { return dest_; }
 
   BufferUses buffer_uses() const override {
     return {
-        BufferUse::Write(dest_),
+        BufferUse::Write(dest_.slice, dest_.shape),
     };
   }
 
@@ -56,7 +56,7 @@ class MemzeroThunk : public Thunk {
   absl::StatusOr<ThunkProto> ToProto() const override;
 
  private:
-  const BufferAllocation::Slice dest_;
+  const ShapedSlice dest_;
 };
 
 // Thunk that sets a given chunk of memory to a particular 32-bit value.  The
diff --git a/third_party/xla/xla/backends/gpu/runtime/memset_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/memset_thunk_test.cc
index 0eb1bc60ff2cb3..67a6d8044ab5a4 100644
--- a/third_party/xla/xla/backends/gpu/runtime/memset_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/memset_thunk_test.cc
@@ -41,7 +41,14 @@ TEST(MemzeroThunkTest, ProtoRoundTrip) {
           execution_stream_id: 2
         }
         memzero_thunk {
-          dest_buffer { offset: 0 size: 4 buffer_allocation_index: 0 }
+          dest_buffer {
+            slice { offset: 0 size: 4 buffer_allocation_index: 0 }
+            shape {
+              dimensions: 1
+              element_type: F32
+              is_dynamic_dimension: false
+            }
+          }
         }
       )pb");
   std::vector<BufferAllocation> buffer_allocations = {
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index c34eabae9e45f4..7b9bbf093b6863 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -191,7 +191,7 @@ message DynamicSliceThunkProto {
 }
 
 message MemzeroThunkProto {
-  xla.buffer_assignment.BufferAllocationSliceProto dest_buffer = 1;
+  ShapedSliceProto dest_buffer = 1;
 }
 
 message Memset32BitValueThunkProto {

From eb43894883764e9732a3118912e454c520969659 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 21:20:16 -0800
Subject: [PATCH 055/753] Automated Code Change

PiperOrigin-RevId: 842047558
---
 third_party/xla/xla/runtime/buffer_use.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/runtime/buffer_use.cc b/third_party/xla/xla/runtime/buffer_use.cc
index 23030c71cffac6..10aeab882b9e27 100644
--- a/third_party/xla/xla/runtime/buffer_use.cc
+++ b/third_party/xla/xla/runtime/buffer_use.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
-#include "absl/container/flat_hash_set.h"
 #include "absl/types/span.h"
 #include "xla/service/buffer_assignment.h"
 

From 8828f2a418fa1f9c2dedc32294529c135f050251 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 22:34:14 -0800
Subject: [PATCH 056/753] Automated Code Change

PiperOrigin-RevId: 842069505
---
 third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
index c6318aaa7f02d1..cfc07e10bfb631 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
@@ -22,7 +22,6 @@
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/algorithm/container.h"

From 9e24441327c16113bb1e1f578616209dc8ae1bf7 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 8 Dec 2025 22:50:48 -0800
Subject: [PATCH 057/753] [stream_executor] Switch SE TPU backend to
 se::DeviceMemoryAddress

Renaming types in preparation for introducing physical memory allocation concept to SE.

PiperOrigin-RevId: 842074373
---
 tensorflow/core/tpu/tpu_execute.cc                 |  4 ++--
 .../xla/stream_executor/tpu/c_api_conversions.cc   | 12 ++++++------
 .../xla/stream_executor/tpu/c_api_conversions.h    |  8 ++++----
 .../stream_executor/tpu/c_api_conversions_test.cc  |  2 +-
 .../xla/xla/stream_executor/tpu/c_api_decl.h       | 14 ++++----------
 .../xla/xla/stream_executor/tpu/tpu_executable.cc  |  6 +++---
 .../tpu/tpu_executable_interface.cc                |  8 ++++----
 .../xla/xla/stream_executor/tpu/tpu_executor.h     |  4 ++--
 .../xla/stream_executor/tpu/tpu_executor_c_api.h   |  8 ++++----
 .../stream_executor/tpu/tpu_executor_init_fns.inc  |  2 +-
 .../stream_executor/tpu/tpu_executor_interface.h   |  9 +++------
 11 files changed, 34 insertions(+), 43 deletions(-)

diff --git a/tensorflow/core/tpu/tpu_execute.cc b/tensorflow/core/tpu/tpu_execute.cc
index 865683dcb430cf..a8edf650bc1718 100644
--- a/tensorflow/core/tpu/tpu_execute.cc
+++ b/tensorflow/core/tpu/tpu_execute.cc
@@ -474,7 +474,7 @@ absl::StatusOr<xla::ExecutionOutput> TPUExecute(
   VLOG(1) << "TPUExecute: Updating TPUEmbedding memory addresses on "
           << device_ordinal;
 
-  SE_DeviceMemoryBase* device_memory_addrs = nullptr;
+  SE_DeviceAddressBase* device_memory_addrs = nullptr;
   size_t device_memory_addrs_count;
   auto device_memory_cleanup =
       absl::MakeCleanup([device_memory_addrs, device_ordinal]() {
@@ -501,7 +501,7 @@ absl::StatusOr<xla::ExecutionOutput> TPUExecute(
   for (int i = 0; i < device_memory_addrs_count; ++i) {
     xla::ShapeTree<xla::MaybeOwningDeviceMemory> tree(
         xla::ShapeUtil::MakeOpaqueShape());
-    const SE_DeviceMemoryBase& addr = device_memory_addrs[i];
+    const SE_DeviceAddressBase& addr = device_memory_addrs[i];
     VLOG(2) << absl::StrFormat("Device memory addr[%i] = {%p, %llu, %llu}", i,
                                addr.opaque, addr.size, addr.payload);
     *tree.mutable_element({}) = ApiConverter::FromC(addr);
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
index 58eb6c2c3033f9..cb53f7c79336dc 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
@@ -158,9 +158,9 @@ xla::ShapedBuffer FromC(XLA_ShapedBuffer* c_buffer) {
   return xla_shaped_buffer;
 }
 
-SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceAddress& mem,
-                               bool aliased) {
-  SE_MaybeOwningDeviceMemory se_mem;
+SE_MaybeOwningDeviceAddress ToC(xla::MaybeOwningDeviceAddress& mem,
+                                bool aliased) {
+  SE_MaybeOwningDeviceAddress se_mem;
   se_mem.owned = mem.HasOwnership();
   se_mem.memory = ApiConverter::ToC(mem.AsDeviceAddress());
   if (mem.HasOwnership()) {
@@ -181,7 +181,7 @@ SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceAddress& mem,
 }
 
 xla::MaybeOwningDeviceAddress FromC(
-    SE_MaybeOwningDeviceMemory* se_mem,
+    SE_MaybeOwningDeviceAddress* se_mem,
     stream_executor::DeviceAddressAllocator* allocator) {
   if (se_mem->owned) {
     return xla::MaybeOwningDeviceAddress(stream_executor::OwningDeviceAddress(
@@ -244,8 +244,8 @@ stream_executor::DeviceAddressAllocator* FromC(
       c_allocator.ctx);
 }
 
-SE_MaybeOwningDeviceMemory ToC(stream_executor::OwningDeviceAddress* mem) {
-  SE_MaybeOwningDeviceMemory se_mem;
+SE_MaybeOwningDeviceAddress ToC(stream_executor::OwningDeviceAddress* mem) {
+  SE_MaybeOwningDeviceAddress se_mem;
   se_mem.device_ordinal = mem->device_ordinal();
   se_mem.memory = ApiConverter::ToC(mem->Release());
   se_mem.allocator = ApiConverter::ToC(mem->allocator());
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
index da3db36c17a1d2..cdfcab80fabb69 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
@@ -118,7 +118,7 @@ struct TpuEmbeddingEngineParametersData {
 std::unique_ptr<TpuEmbeddingEngineParametersData> Create(int num_tables);
 
 xla::MaybeOwningDeviceAddress FromC(
-    SE_MaybeOwningDeviceMemory* se_mem,
+    SE_MaybeOwningDeviceAddress* se_mem,
     stream_executor::DeviceAddressAllocator* allocator);
 
 // DeviceAddressAllocator
@@ -128,12 +128,12 @@ stream_executor::DeviceAddressAllocator* FromC(
     const SE_DeviceAddressAllocator& c_allocator);
 
 // OwningDeviceAddress
-SE_MaybeOwningDeviceMemory ToC(stream_executor::OwningDeviceAddress* mem);
+SE_MaybeOwningDeviceAddress ToC(stream_executor::OwningDeviceAddress* mem);
 // mem.HasOwnership() may be true if the buffer is aliased and shouldn't be
 // released. 'aliased' should be true in this case. 'aliased' has no effect if
 // 'mem' is unowned.
-SE_MaybeOwningDeviceMemory ToC(xla::MaybeOwningDeviceAddress& mem,
-                               bool aliased);
+SE_MaybeOwningDeviceAddress ToC(xla::MaybeOwningDeviceAddress& mem,
+                                bool aliased);
 
 // HloModule
 XLA_HloModule ToC(const xla::HloModule& module);
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions_test.cc b/third_party/xla/xla/stream_executor/tpu/c_api_conversions_test.cc
index 05ec51c5e79ea8..c96e6be263d884 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions_test.cc
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions_test.cc
@@ -308,7 +308,7 @@ TEST(XlaHloModule, ToAndFromC) {
 }
 
 // TODO(b/290654348): SE_DeviceAddressBase, SE_DeviceAddressAllocator,
-// SE_MaybeOwningDeviceMemory
+// SE_MaybeOwningDeviceAddress
 
 }  // namespace
 
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_decl.h b/third_party/xla/xla/stream_executor/tpu/c_api_decl.h
index 834a3da9f4ed0d..a42221294fa16c 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_decl.h
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_decl.h
@@ -72,15 +72,11 @@ typedef struct SE_DeviceAddressBase {
   uint64_t payload;
 } SE_DeviceAddressBase;
 
-typedef SE_DeviceAddressBase SE_DeviceMemoryBase;
-
 typedef struct SE_ScopedDeviceAddress {
   SE_DeviceAddressBase wrapped;
   int device_ordinal;
 } SE_ScopedDeviceAddress;
 
-typedef SE_ScopedDeviceAddress SE_ScopedDeviceMemory;
-
 typedef struct SE_AllocatorStats {
   int64_t num_allocs;
   int64_t bytes_in_use;
@@ -117,8 +113,6 @@ typedef struct SE_DeviceAddressAllocator {
   SE_DeallocateFn deallocate;
 } SE_DeviceAddressAllocator;
 
-typedef SE_DeviceAddressAllocator SE_DeviceMemoryAllocator;
-
 typedef struct SE_DeviceDescription {
   char* device_vendor;
   char* platform_version;
@@ -175,14 +169,14 @@ typedef struct SE_ExecutableRunOptions {
 typedef struct SE_ExecutableSerializationHandle
     SE_ExecutableSerializationHandle;
 
-typedef struct SE_MaybeOwningDeviceMemory {
+typedef struct SE_MaybeOwningDeviceAddress {
   SE_DeviceAddressBase memory;
   bool owned;
 
   // Set if owned
   int device_ordinal;
   SE_DeviceAddressAllocator allocator;
-} SE_MaybeOwningDeviceMemory;
+} SE_MaybeOwningDeviceAddress;
 
 typedef struct IntList {
   union {
@@ -279,7 +273,7 @@ typedef struct XLA_Literal {
 
 typedef struct XLA_MaybeOwningDeviceAddressShapeTree {
   XLA_Shape shape;
-  SE_MaybeOwningDeviceMemory* buffers;
+  SE_MaybeOwningDeviceAddress* buffers;
 } XLA_MaybeOwningDeviceAddressShapeTree;
 
 typedef struct XLA_ShapeIndex {
@@ -296,7 +290,7 @@ typedef struct SE_ExecutionInput {
 
 typedef struct SE_ExecutionOutput {
   XLA_ShapedBuffer result;
-  SE_MaybeOwningDeviceMemory* to_be_released;
+  SE_MaybeOwningDeviceAddress* to_be_released;
   int to_be_released_size;
   XLA_ShapeIndex* aliased_indices;
   int aliased_indices_size;
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executable.cc b/third_party/xla/xla/stream_executor/tpu/tpu_executable.cc
index 3ef4c531a066fd..b5f5c6d80017ab 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executable.cc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executable.cc
@@ -102,13 +102,13 @@ absl::StatusOr<ExecutionOutput> TpuExecutable::ExecuteAsyncOnStream(
 
     ApiConverter::ToC(arg.shape(), &se_args[i]->shape_tree.shape);
     auto* arg_buffers = arg.MutableBuffers();
-    absl::InlinedVector<SE_MaybeOwningDeviceMemory, 2> se_buffers;
+    absl::InlinedVector<SE_MaybeOwningDeviceAddress, 2> se_buffers;
     for (auto& pair : *arg_buffers) {
       bool aliased = arg.unowned_indices().count(pair.first) > 0;
       se_buffers.push_back(ApiConverter::ToC(pair.second, aliased));
     }
     se_args[i]->shape_tree.buffers =
-        new SE_MaybeOwningDeviceMemory[se_buffers.size()];
+        new SE_MaybeOwningDeviceAddress[se_buffers.size()];
     for (int j = 0; j < se_buffers.size(); ++j) {
       se_args[i]->shape_tree.buffers[j] = se_buffers[j];
     }
@@ -166,7 +166,7 @@ absl::StatusOr<ExecutionOutput> TpuExecutable::ExecuteAsyncOnStream(
             .Release()
             .value());
   }
-  ExecutorApiFn()->TpuExecutable_FreeMaybeOwningDeviceMemoryArrayFn(
+  ExecutorApiFn()->TpuExecutable_FreeMaybeOwningDeviceAddressArrayFn(
       se_execution_output.to_be_released);
 
   return output;
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc b/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc
index ab8616ddc8ecc4..f8080a29d01fb3 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executable_interface.cc
@@ -212,7 +212,7 @@ absl::StatusOr<ExecutionOutput> TpuExecutableInterface::ExecuteAsyncOnStream(
   std::vector<se::DeviceAddressBase> memory_bases;
   memory_bases.reserve(arguments.size());
   for (auto& argument : arguments) {
-    memory_bases.push_back(argument.Buffer({}).AsDeviceMemoryBase());
+    memory_bases.push_back(argument.Buffer({}).AsDeviceAddress());
   }
   se::Stream* stream = run_options->stream();
 
@@ -240,16 +240,16 @@ absl::StatusOr<ExecutionOutput> TpuExecutableInterface::ExecuteAsyncOnStream(
       // data from fast memory instead of fresh data in large memory.
       auto it = arguments[parameter].MutableBuffers()->find({index});
       CHECK(it != arguments[parameter].MutableBuffers()->end());
-      CHECK(!it->second.AsDeviceMemoryBase().is_null());
+      CHECK(!it->second.AsDeviceAddress().is_null());
       CHECK(offset);
       bool is_prefetch_output_alias =
           absl::c_any_of(result.Result().buffers(), [&](auto index_addr_pair) {
             return index_addr_pair.second.IsSameAs(
-                it->second.AsDeviceMemoryBase());
+                it->second.AsDeviceAddress());
           });
       cross_program_prefetch_addrs.emplace_back(
           is_prefetch_output_alias ? stream_executor::DeviceAddressBase()
-                                   : it->second.AsDeviceMemoryBase());
+                                   : it->second.AsDeviceAddress());
       cross_program_prefetch_offsets.emplace_back(
           is_prefetch_output_alias ? std::numeric_limits<uint32_t>::max()
                                    : *offset);
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executor.h b/third_party/xla/xla/stream_executor/tpu/tpu_executor.h
index 8209ec55e0b12c..7edaba7a8e9fdd 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executor.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executor.h
@@ -118,8 +118,8 @@ class TpuExecutor : public tensorflow::tpu::TpuExecutorInterface {
   // to TpuExecutorInterface.
   absl::StatusOr<std::unique_ptr<
       tensorflow::tpu::TpuExecutorInterface::TemporaryDeviceAddress>>
-  CreateTemporaryDeviceMemory(int64_t memory_space, int64_t byte_offset,
-                              int64_t size) override {
+  CreateTemporaryDeviceAddress(int64_t memory_space, int64_t byte_offset,
+                               int64_t size) override {
     LOG(FATAL) << "Unimplemented.";
   }
 
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executor_c_api.h b/third_party/xla/xla/stream_executor/tpu/tpu_executor_c_api.h
index ce57d254450d4e..7f2c1b02e094b2 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executor_c_api.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executor_c_api.h
@@ -298,12 +298,12 @@ TFTPU_CAPI_EXPORT void TpuExecutable_ExecuteAsyncOnStream(
 TFTPU_CAPI_EXPORT void TpuExecutable_FreeXlaShapeIndexArray(
     XLA_ShapeIndex* array);
 
-// This frees the SE_MaybeOwningDeviceMemory* array allocated when se_output is
+// This frees the SE_MaybeOwningDeviceAddress* array allocated when se_output is
 // returned by TpuExecutable_ExecuteAsyncOnStream.
 // Note that this only frees the heap-allocated array itself, and does not
 // free any of the underlying device memory.
-TFTPU_CAPI_EXPORT void TpuExecutable_FreeMaybeOwningDeviceMemoryArray(
-    SE_MaybeOwningDeviceMemory* array);
+TFTPU_CAPI_EXPORT void TpuExecutable_FreeMaybeOwningDeviceAddressArray(
+    SE_MaybeOwningDeviceAddress* array);
 
 TFTPU_CAPI_EXPORT void TpuExecutable_Fingerprint(SE_Executable* executable,
                                                  const char** fingerprint,
@@ -479,7 +479,7 @@ struct TfTpu_ExecutorApiFn {
 
   TFTPU_ADD_FN_IN_STRUCT(TpuExecutable_ExecuteAsyncOnStream);
   TFTPU_ADD_FN_IN_STRUCT(TpuExecutable_FreeXlaShapeIndexArray);
-  TFTPU_ADD_FN_IN_STRUCT(TpuExecutable_FreeMaybeOwningDeviceMemoryArray);
+  TFTPU_ADD_FN_IN_STRUCT(TpuExecutable_FreeMaybeOwningDeviceAddressArray);
   TFTPU_ADD_FN_IN_STRUCT(TpuExecutable_Fingerprint);
   TFTPU_ADD_FN_IN_STRUCT(TpuExecutable_Serialize);
   TFTPU_ADD_FN_IN_STRUCT(TpuExecutableSerialize_GetByteSize);
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executor_init_fns.inc b/third_party/xla/xla/stream_executor/tpu/tpu_executor_init_fns.inc
index 5bc6a8ac9c4086..ee02abad1bf401 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executor_init_fns.inc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executor_init_fns.inc
@@ -125,7 +125,7 @@ absl::Status SetExecutorStructFn(
   TFTPU_SET_FN(executor_fn, TpuCompiler_DefaultDeviceShapeRepresentation);
   TFTPU_SET_FN(executor_fn, TpuExecutable_ExecuteAsyncOnStream);
   TFTPU_SET_FN(executor_fn, TpuExecutable_FreeXlaShapeIndexArray);
-  TFTPU_SET_FN(executor_fn, TpuExecutable_FreeMaybeOwningDeviceMemoryArray);
+  TFTPU_SET_FN(executor_fn, TpuExecutable_FreeMaybeOwningDeviceAddressArray);
   TFTPU_SET_FN(executor_fn, TpuExecutable_Fingerprint);
   TFTPU_SET_FN(executor_fn, TpuExecutable_Serialize);
   TFTPU_SET_FN(executor_fn, TpuExecutableSerialize_GetByteSize);
diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_executor_interface.h b/third_party/xla/xla/stream_executor/tpu/tpu_executor_interface.h
index 6012bb3752dd4f..db95ca86242a95 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_executor_interface.h
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_executor_interface.h
@@ -42,15 +42,12 @@ class TpuExecutorInterface : public stream_executor::StreamExecutorCommon {
   class TemporaryDeviceAddress {
    public:
     virtual ~TemporaryDeviceAddress() {}
-    virtual stream_executor::DeviceAddressBase AsDeviceMemoryBase() const = 0;
+    virtual stream_executor::DeviceAddressBase AsDeviceAddress() const = 0;
   };
 
-  using TemporaryDeviceMemory ABSL_DEPRECATE_AND_INLINE() =
-      TemporaryDeviceAddress;
-
   virtual absl::StatusOr<std::unique_ptr<TemporaryDeviceAddress>>
-  CreateTemporaryDeviceMemory(int64_t memory_space, int64_t byte_offset,
-                              int64_t size) {
+  CreateTemporaryDeviceAddress(int64_t memory_space, int64_t byte_offset,
+                               int64_t size) {
     LOG(FATAL) << "Unimplemented.";
   }
 

From 7b476a6047be6de8899eb1cae660ac84ea3bc3db Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 8 Dec 2025 23:04:22 -0800
Subject: [PATCH 058/753] Automated Code Change

PiperOrigin-RevId: 842078054
---
 third_party/xla/xla/service/spmd/BUILD                           | 1 +
 third_party/xla/xla/service/spmd/stateful_rng_spmd_partitioner.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/service/spmd/BUILD b/third_party/xla/xla/service/spmd/BUILD
index ff5fdaf4ea5b0b..30ab20ab0b5cc6 100644
--- a/third_party/xla/xla/service/spmd/BUILD
+++ b/third_party/xla/xla/service/spmd/BUILD
@@ -279,6 +279,7 @@ cc_library(
         "//xla/hlo/pass:hlo_pass",
         "//xla/service:call_graph",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:string_view",
     ],
diff --git a/third_party/xla/xla/service/spmd/stateful_rng_spmd_partitioner.h b/third_party/xla/xla/service/spmd/stateful_rng_spmd_partitioner.h
index fc0e1269962e53..3c39561468aa99 100644
--- a/third_party/xla/xla/service/spmd/stateful_rng_spmd_partitioner.h
+++ b/third_party/xla/xla/service/spmd/stateful_rng_spmd_partitioner.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"

From 419991f7da1549cadeecb5a1ee3166075fb57726 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Mon, 8 Dec 2025 23:42:08 -0800
Subject: [PATCH 059/753] [xla:gpu] Document CommandBufferCmd statelessness and
 StateManager

PiperOrigin-RevId: 842088918
---
 .../backends/gpu/runtime/command_buffer_cmd.h | 42 +++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
index 70114c4b8a2e00..31dc41b43596d0 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
@@ -118,6 +118,8 @@ std::string CommandBufferCmdString(CommandBufferCmdType type);
 // CommandBufferCmd
 //===----------------------------------------------------------------------===//
 
+using ResourceUseVector = absl::InlinedVector<ResourceUse, 1>;
+
 // Command is a Thunk counterpart that instead of launching operations directly
 // on the underlying device records them into command buffers.
 //
@@ -127,9 +129,41 @@ std::string CommandBufferCmdString(CommandBufferCmdType type);
 //
 // Commands must be thread safe as they can be recorded into multiple command
 // buffers concurrently on different stream executors.
-
-using ResourceUseVector = absl::InlinedVector<ResourceUse, 1>;
-
+//
+// IMPORTANT: In contrast to GPU thunks, commands MUST be stateless. Thunk state
+// typically belongs to the Thunk instance itself, and tends to be kept in
+// synchronized hash maps keyed by `se::StreamExecutor*` pointer. Commands on
+// the other hand should attach state to the underlying command buffer, and
+// because the number of command buffers that can be instantiated from a command
+// sequence is unbounded (as we have an eviction policy for command buffers),
+// keeping a state in a map inside the command will lead to memory leaks.
+//
+// Commands have an external state manager, which is responsible for managing
+// the lifetime of command state. See `State` and `StateManager` classes below.
+//
+// To make command stateful, it needs a `params.state` indirection:
+//
+//   class MyCommand : public CommandBufferCmd {
+//     public:
+//
+//     // Container for mutable state required for command execution.
+//     struct MyState : CommandBufferCmd::State {
+//       ...
+//     };
+//
+//     absl::StatusOr<Command*> Record(...) override {
+//       // Attach a new instance of `MyState` to the `command_buffer`. When
+//       // command buffer will be destroyed, the state will be destroyed as
+//       // well automatically by XLA runtime. If this command will be recorded
+//       // into another command buffer, the state will be re-created
+//       // automatically using the provided callback.
+//       MyState* my_state = record_params.state.GetOrCreate<MyState>(this,
+//         command_buffer, [&] { // create MyState for a `command_buffer` });
+//       ...
+//     }
+//
+//   };
+//
 class CommandBufferCmd {
  public:
   explicit CommandBufferCmd(
@@ -156,6 +190,8 @@ class CommandBufferCmd {
   // Externally managed state (owned and synchronized by CommandBufferThunk)
   // allows commands to attach a piece of information to command buffer in a
   // safe and performant way.
+  //
+  // See example above next to `CommandBufferCmd` definition.
   class State {
    public:
     virtual ~State() = default;

From d4e51d88fb7334d79da06cdb40f5006478c44157 Mon Sep 17 00:00:00 2001
From: Alex Pivovarov <upwind@google.com>
Date: Tue, 9 Dec 2025 00:25:12 -0800
Subject: [PATCH 060/753] Replace std::copy with absl::c_copy for readability.

This change refactors usages of std::copy(container.begin(), container.end(), ...) to the more compact absl::c_copy(container, ...). This improves readability and reduces verbosity. Necessary includes for absl/algorithm/container.h have been added where required.

PiperOrigin-RevId: 842102140
---
 .../gpu/runtime/select_k_exec_raft_test.cc        |  2 +-
 third_party/xla/xla/service/BUILD                 | 11 ++++++++---
 .../xla/xla/service/cpu/onednn_memory_util.cc     |  3 +--
 third_party/xla/xla/service/gpu/transforms/BUILD  |  1 +
 .../xla/service/gpu/transforms/async_wrapper.cc   |  7 +++----
 .../xla/service/gpu/triton_tiling_propagation.cc  | 10 +++++-----
 third_party/xla/xla/service/hlo_sharding_test.cc  | 10 ++++++++--
 third_party/xla/xla/service/shape_inference.cc    | 15 ++++++---------
 .../xla/xla/service/triangular_solve_expander.cc  |  7 +++----
 third_party/xla/xla/stream_executor/dnn.cc        |  6 ++----
 .../xla/stream_executor/tpu/c_api_conversions.cc  |  5 ++---
 11 files changed, 40 insertions(+), 37 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/select_k_exec_raft_test.cc b/third_party/xla/xla/backends/gpu/runtime/select_k_exec_raft_test.cc
index 623a0dd23c61f9..5cb759a96f3f01 100644
--- a/third_party/xla/xla/backends/gpu/runtime/select_k_exec_raft_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/select_k_exec_raft_test.cc
@@ -106,7 +106,7 @@ void RunSelectKTest() {
   std::vector<T> h_data_in(batch * n);
   for (int j = 0; j < batch; ++j) {
     std::shuffle(topk.begin(), topk.end(), gen);
-    std::copy(topk.begin(), topk.end(), h_data_in.begin() + j * n);
+    absl::c_copy(topk, h_data_in.begin() + j * n);
   }
 
   // Compute golden Top-K values for verification
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 52e39430e85f9d..b5d097d79b4715 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -940,6 +940,9 @@ xla_cc_test(
     name = "hlo_sharding_test",
     srcs = ["hlo_sharding_test.cc"],
     deps = [
+        "//xla:array",
+        "//xla:array3d",
+        "//xla:array4d",
         "//xla:shape_tree",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
@@ -950,8 +953,11 @@ xla_cc_test(
         "//xla/hlo/testlib:test_helpers",
         "//xla/tests:xla_internal_test_main",
         "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
         "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/hash",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -2280,7 +2286,6 @@ cc_library(
     hdrs = ["triangular_solve_expander.h"],
     deps = [
         ":hlo_creation_utils",
-        ":hlo_module_config",
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
@@ -2292,14 +2297,14 @@ cc_library(
         "//xla/hlo/builder/lib:slicing",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/transforms/expanders:op_expander_pass",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:logging",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/cpu/onednn_memory_util.cc b/third_party/xla/xla/service/cpu/onednn_memory_util.cc
index 2233c84d814d38..94cf1d16e8cfac 100644
--- a/third_party/xla/xla/service/cpu/onednn_memory_util.cc
+++ b/third_party/xla/xla/service/cpu/onednn_memory_util.cc
@@ -59,8 +59,7 @@ MemrefInfoHandler CreateMemrefFromShape(const Shape& shape, const void* buf) {
   result->dtype = shape.element_type();
   result->rank = shape.dimensions().size();
   auto dimensions = shape.dimensions();
-  std::copy(dimensions.begin(), dimensions.end(),
-            absl::MakeSpan(result->dims).begin());
+  absl::c_copy(dimensions, absl::MakeSpan(result->dims).begin());
 
   int64_t stride = 1;
   for (int i : shape.layout().minor_to_major()) {
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index e76b5f9ee12da0..e4ed891c6391b9 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -160,6 +160,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/tsl/platform:errors",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/service/gpu/transforms/async_wrapper.cc b/third_party/xla/xla/service/gpu/transforms/async_wrapper.cc
index a96615edca6eb3..7529a1c4be4d45 100644
--- a/third_party/xla/xla/service/gpu/transforms/async_wrapper.cc
+++ b/third_party/xla/xla/service/gpu/transforms/async_wrapper.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/async_wrapper.h"
 
-#include <algorithm>
 #include <deque>
 #include <iterator>
 
+#include "absl/algorithm/container.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
@@ -70,9 +70,8 @@ absl::StatusOr<bool> AsyncWrapper::RunImpl(
       // instructions that can potentially be made async.
       if (HloPredicateIsOp<HloOpcode::kCall, HloOpcode::kConditional,
                            HloOpcode::kWhile>(instruction)) {
-        std::copy(instruction->called_computations().begin(),
-                  instruction->called_computations().end(),
-                  std::back_inserter(computations));
+        absl::c_copy(instruction->called_computations(),
+                     std::back_inserter(computations));
       }
     }
   }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
index 437a2269739cf7..926307d6f0c0e7 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/service/gpu/triton_tiling_propagation.h"
 
-#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
@@ -260,7 +259,9 @@ TensorIterationSpec DimensionOrder::ToTensorIterationSpec() const {
 
     // We should not remove the only fragment in a dimension, because if it is
     // removed, the dimension will be removed from the TensorIterationSpec.
-    if (dim_spec.size() <= 1) continue;
+    if (dim_spec.size() <= 1) {
+      continue;
+    }
 
     TensorIterationSpec::DimIterationSpec filtered_dim_spec;
     absl::c_copy_if(dim_spec, std::back_inserter(filtered_dim_spec),
@@ -575,9 +576,8 @@ DimOrderMapOrError GetPropagatedDimOrdersForBitcast(
     std::vector<int>& dst = dst_dim_fragment_orders[dim_index];
     dst.reserve(dim_sequence.size());
     for (const int src : dim_sequence) {
-      std::copy(src_to_dst[&src_fragments_order[src]].cbegin(),
-                src_to_dst[&src_fragments_order[src]].cend(),
-                std::back_inserter(dst));
+      absl::c_copy(src_to_dst[&src_fragments_order[src]],
+                   std::back_inserter(dst));
     }
   }
 
diff --git a/third_party/xla/xla/service/hlo_sharding_test.cc b/third_party/xla/xla/service/hlo_sharding_test.cc
index ee87360d9c2c2c..bb97ca0578e0a7 100644
--- a/third_party/xla/xla/service/hlo_sharding_test.cc
+++ b/third_party/xla/xla/service/hlo_sharding_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <algorithm>
 #include <cstdint>
 #include <sstream>
 #include <string>
@@ -21,16 +20,23 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/hash/hash.h"
+#include "absl/status/status.h"
 #include "absl/types/span.h"
+#include "xla/array.h"
+#include "xla/array3d.h"
+#include "xla/array4d.h"
 #include "xla/hlo/ir/tile_assignment.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/test.h"
 #include "xla/hlo/testlib/test_helpers.h"
+#include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
 #include "xla/xla_data.pb.h"
 
@@ -42,7 +48,7 @@ using ::tsl::proto_testing::EqualsProto;
 Array<int64_t> MakeArray(absl::Span<const int64_t> dimensions,
                          absl::Span<const int64_t> contents) {
   Array<int64_t> a(dimensions);
-  std::copy(contents.begin(), contents.end(), a.begin());
+  absl::c_copy(contents, a.begin());
   return a;
 }
 
diff --git a/third_party/xla/xla/service/shape_inference.cc b/third_party/xla/xla/service/shape_inference.cc
index b5dc87cbf44ee7..8bc3ccb4987186 100644
--- a/third_party/xla/xla/service/shape_inference.cc
+++ b/third_party/xla/xla/service/shape_inference.cc
@@ -2189,22 +2189,19 @@ ShapeInference::InferScalarBroadcastShape(absl::Span<const Shape> shapes) {
   std::vector<int64_t> input_dnums(num_dims);
   input_dnums[0] = dnums.input_batch_dimension();
   input_dnums[1] = dnums.input_feature_dimension();
-  std::copy(dnums.input_spatial_dimensions().begin(),
-            dnums.input_spatial_dimensions().end(), input_dnums.begin() + 2);
+  absl::c_copy(dnums.input_spatial_dimensions(), input_dnums.begin() + 2);
   absl::c_sort(input_dnums);
 
   std::vector<int64_t> window_dnums(num_dims);
   window_dnums[0] = dnums.kernel_input_feature_dimension();
   window_dnums[1] = dnums.kernel_output_feature_dimension();
-  std::copy(dnums.kernel_spatial_dimensions().begin(),
-            dnums.kernel_spatial_dimensions().end(), window_dnums.begin() + 2);
+  absl::c_copy(dnums.kernel_spatial_dimensions(), window_dnums.begin() + 2);
   absl::c_sort(window_dnums);
 
   std::vector<int64_t> output_dnums(num_dims);
   output_dnums[0] = dnums.output_batch_dimension();
   output_dnums[1] = dnums.output_feature_dimension();
-  std::copy(dnums.output_spatial_dimensions().begin(),
-            dnums.output_spatial_dimensions().end(), output_dnums.begin() + 2);
+  absl::c_copy(dnums.output_spatial_dimensions(), output_dnums.begin() + 2);
   absl::c_sort(output_dnums);
 
   std::vector<int64_t> expected_dnums(num_dims);
@@ -3590,9 +3587,9 @@ ShapeInference::InferCollectivePermuteDoneShape(const Shape& operand_shape) {
 
   std::vector<int64_t> dimensions(operand.dimensions().size() +
                                   broadcast_sizes.size());
-  std::copy(broadcast_sizes.begin(), broadcast_sizes.end(), dimensions.begin());
-  std::copy(operand.dimensions().begin(), operand.dimensions().end(),
-            dimensions.begin() + broadcast_sizes.size());
+  absl::c_copy(broadcast_sizes, dimensions.begin());
+  absl::c_copy(operand.dimensions(),
+               dimensions.begin() + broadcast_sizes.size());
 
   TF_ASSIGN_OR_RETURN(Shape result, ShapeUtil::MakeValidatedShape(
                                         operand.element_type(), dimensions));
diff --git a/third_party/xla/xla/service/triangular_solve_expander.cc b/third_party/xla/xla/service/triangular_solve_expander.cc
index 049249aa5b0481..5c8577a47eca98 100644
--- a/third_party/xla/xla/service/triangular_solve_expander.cc
+++ b/third_party/xla/xla/service/triangular_solve_expander.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/log/check.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
@@ -36,13 +37,11 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/hlo_creation_utils.h"
-#include "xla/service/hlo_module_config.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/platform/logging.h"
-#include "tsl/platform/statusor.h"
 
 namespace xla {
 
@@ -120,7 +119,7 @@ XlaOp DiagonalBlocks(XlaOp a, int64_t block_size) {
       TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(last_blocks));
       auto shape_dims = blocks_shape.dimensions();
       auto last_blocks_dims = std::vector<int64_t>(ndims);
-      std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
+      absl::c_copy(shape_dims, last_blocks_dims.begin());
       last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
       last_blocks = Reshape(last_blocks, last_blocks_dims);
 
diff --git a/third_party/xla/xla/stream_executor/dnn.cc b/third_party/xla/xla/stream_executor/dnn.cc
index f38a2597972d75..d837220e6f4fe4 100644
--- a/third_party/xla/xla/stream_executor/dnn.cc
+++ b/third_party/xla/xla/stream_executor/dnn.cc
@@ -691,8 +691,7 @@ std::vector<int64_t> BatchDescriptor::full_dims(
   std::vector<int64_t> bdyx_dims(ndims() + 2);
   bdyx_dims[0] = count();
   bdyx_dims[1] = feature_map_count();
-  std::copy(spatial_size().begin(), spatial_size().end(),
-            bdyx_dims.begin() + 2);
+  absl::c_copy(spatial_size(), bdyx_dims.begin() + 2);
   return ReorderDims(bdyx_dims, DataLayout::kBatchDepthYX, layout);
 }
 
@@ -831,8 +830,7 @@ std::vector<int64_t> FilterDescriptor::full_dims(
   std::vector<int64_t> oiyx_dims(ndims() + 2);
   oiyx_dims[0] = output_feature_map_count();
   oiyx_dims[1] = input_feature_map_count();
-  std::copy(input_filter_dims().begin(), input_filter_dims().end(),
-            oiyx_dims.begin() + 2);
+  absl::c_copy(input_filter_dims(), oiyx_dims.begin() + 2);
   return ReorderDims(oiyx_dims, FilterLayout::kOutputInputYX, layout);
 }
 
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
index cb53f7c79336dc..b4aefe96cc6995 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/stream_executor/tpu/c_api_conversions.h"
 
-#include <algorithm>
 #include <cstddef>
 #include <cstdint>
 #include <memory>
@@ -66,9 +65,9 @@ static void CreateVectorBase(const absl::Span<Src> src, DstList* dst) {
   dst->size = src.size();
   if (dst->size > TPU_C_API_MAX_INLINED) {
     dst->heap = new Dst[dst->size];
-    std::copy(src.begin(), src.end(), dst->heap);
+    absl::c_copy(src, dst->heap);
   } else {
-    std::copy(src.begin(), src.end(), dst->inlined);
+    absl::c_copy(src, dst->inlined);
   }
 }
 

From 06a58e00b8afa55222f4efaeadd4770b3c712b9e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 00:29:19 -0800
Subject: [PATCH 061/753] Automated Code Change

PiperOrigin-RevId: 842103155
---
 tensorflow/lite/tools/utils.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/lite/tools/utils.cc b/tensorflow/lite/tools/utils.cc
index 6173ec1b112203..96b8bf8689e610 100644
--- a/tensorflow/lite/tools/utils.cc
+++ b/tensorflow/lite/tools/utils.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <cstdint>
 #include <random>
 #include <string>
+#include <type_traits>
 
 #include "absl/types/span.h"
 #include "Eigen/Core"  // from @eigen_archive

From 51058956fec5e3eda3d2e81338b26244378f2bcf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 01:03:12 -0800
Subject: [PATCH 062/753] compat: Update forward compatibility horizon to
 2025-12-09

PiperOrigin-RevId: 842113241
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 019f2360af662e..949094ad18d927 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 8)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 9)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From e534b1654b447e26e2e22e78afe236067cc46d62 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 01:04:04 -0800
Subject: [PATCH 063/753] Update GraphDef version to 2436.

PiperOrigin-RevId: 842113538
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 5448bf12c3dcfe..b483429b89ccd9 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2435  // Updated: 2025/12/8
+#define TF_GRAPH_DEF_VERSION 2436  // Updated: 2025/12/9
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 6cfe46b1e50d9be83db107c9fac159775e74b6a7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 01:17:28 -0800
Subject: [PATCH 064/753] Reverts dbd604a06c501bb6dcfe9448a4582ef586539855

PiperOrigin-RevId: 842117724
---
 third_party/xla/xla/debug_options_flags.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index fe8c14f18dbd8c..e7473940048f0a 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -416,7 +416,9 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
 
   opts.set_xla_gpu_autotune_gemm_rtol(0.1f);
 
-  opts.set_xla_enable_command_buffers_during_profiling(true);
+  // TODO(b/355487968): Remove this flag once all data will be presented in
+  // xprof with command buffers.
+  opts.set_xla_enable_command_buffers_during_profiling(false);
 
   opts.set_xla_gpu_cudnn_gemm_max_plans(5);
 

From b722a687c6faa1aff94584b6cb3cdefc69d2256f Mon Sep 17 00:00:00 2001
From: Alex Pivovarov <upwind@google.com>
Date: Tue, 9 Dec 2025 01:24:24 -0800
Subject: [PATCH 065/753] Add nullptr comparison and boolean conversion to
 MaybeOwning.

This change introduces `operator==`, `operator!=` for comparing `MaybeOwning` with `nullptr_t`, and an `explicit operator bool()` to `MaybeOwning`. These allow for more idiomatic checks against null. Updated several call sites to use these new operators, simplifying expressions like `obj.get() == nullptr` to `obj == nullptr` or `obj.get() != nullptr` to `obj`.

PiperOrigin-RevId: 842119942
---
 .../xla/backends/cpu/nanort/ifrt_client.cc    |  2 +-
 third_party/xla/xla/hlo/ir/hlo_module.cc      |  2 +-
 third_party/xla/xla/maybe_owning.h            | 26 +++++++++++++++++++
 .../xla/xla/service/gpu/gpu_compiler.cc       |  6 ++---
 .../xla/xla/tsl/concurrency/async_value_ref.h |  2 +-
 5 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc b/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
index 733dd00f6eb2f7..f3731f64821230 100644
--- a/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
+++ b/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
@@ -511,7 +511,7 @@ class NanoArray final : public NanoValue<NanoArray, ifrt::Array> {
     OwnedDataPtr owned_data(
         tsl::port::AlignedMalloc(std::max<size_t>(size, Align()), Align()),
         [](void* ptr) { tsl::port::AlignedFree(ptr); });
-    if (ABSL_PREDICT_FALSE(owned_data.get() == nullptr)) {
+    if (ABSL_PREDICT_FALSE(owned_data == nullptr)) {
       return Internal("Failed to allocate memory for NanoArray. Errno: %s",
                       strerror(errno));
     }
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.cc b/third_party/xla/xla/hlo/ir/hlo_module.cc
index 4656f2a442ca91..fe9502af64f15c 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_module.cc
@@ -1751,7 +1751,7 @@ void HloModule::OriginalValueRecoveryTable::AddRecoveryComputation(
     std::optional<OriginalArray>* new_original_array =
         new_inst->original_value()->mutable_original_array(shape_index);
     if (!*new_original_array) {
-      if (recovery_computation->get() == nullptr) {
+      if (*recovery_computation == nullptr) {
         // If the recovery computation is a nullptr, it means this is an
         // identity computation and we can just pass through the original array.
         new_original_array->emplace(*old_original_array);
diff --git a/third_party/xla/xla/maybe_owning.h b/third_party/xla/xla/maybe_owning.h
index 2b63a45543375d..04bd39a670bea3 100644
--- a/third_party/xla/xla/maybe_owning.h
+++ b/third_party/xla/xla/maybe_owning.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef XLA_MAYBE_OWNING_H_
 #define XLA_MAYBE_OWNING_H_
 
+#include <cstddef>
 #include <cstdint>
 #include <memory>
 
@@ -76,6 +77,31 @@ class MaybeOwning final {
 
   bool OwnsPtr() const { return kOwningBitMask & ptr_and_owning_bit_; }
 
+  friend bool operator==(const MaybeOwning& mo, std::nullptr_t) {
+    // A MaybeOwning is considered null if its internal pointer is null.
+    // The get() method correctly removes the mask and returns the raw pointer.
+    return mo.get() == nullptr;
+  }
+
+  friend bool operator==(std::nullptr_t, const MaybeOwning& mo) {
+    // Maintain symmetry for the comparison order
+    return mo.get() == nullptr;
+  }
+
+  friend bool operator!=(const MaybeOwning& mo, std::nullptr_t) {
+    return mo.get() != nullptr;
+  }
+
+  friend bool operator!=(std::nullptr_t, const MaybeOwning& mo) {
+    return mo.get() != nullptr;
+  }
+
+  explicit operator bool() const {
+    // The class is considered 'true' if the underlying pointer is not null.
+    // We use the existing get() method, which correctly handles the mask.
+    return get() != nullptr;
+  }
+
  private:
   enum : uint64_t {
     kOwningBitMask = 1UL,
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 55c74ba56d3ecb..a98c18b53adc89 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -56,6 +56,7 @@ limitations under the License.
 #include "llvm/TargetParser/Triple.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/SplitModule.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
 #include "google/protobuf/text_format.h"
 #include "xla/backends/cpu/nanort/nanort_client.h"
@@ -69,7 +70,6 @@ limitations under the License.
 #include "xla/core/host_offloading/hlo_host_device_type_call_wrapper.h"
 #include "xla/core/host_offloading/host_compute_asyncifier.h"
 #include "xla/hlo/analysis/alias_info.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/builder/xla_computation.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -2224,7 +2224,7 @@ absl::StatusOr<GpuCompiler::BackendCompileResult> GpuCompiler::CompileAndLink(
   // function per module. If caching is not used limit the number of modules to
   // the number of threads.
   int num_modules = CountFunctions(*llvm_module);
-  if (thread_pool.get() != nullptr && !use_cache) {
+  if (thread_pool && !use_cache) {
     num_modules = std::max(1, std::min(thread_pool->NumThreads(), num_modules));
   }
   if (compile_module_results.llvm_module_constants != nullptr) {
@@ -2262,7 +2262,7 @@ absl::StatusOr<GpuCompiler::BackendCompileResult> GpuCompiler::CompileAndLink(
     absl::StatusOr<BackendCompileResult> result;
   };
   std::vector<NamedCompileResult> compile_results(llvm_modules.size());
-  if (thread_pool.get() != nullptr) {
+  if (thread_pool) {
     absl::BlockingCounter counter(llvm_modules.size());
     for (int i = 0; i < llvm_modules.size(); ++i) {
       thread_pool.get_mutable()->Schedule(
diff --git a/third_party/xla/xla/tsl/concurrency/async_value_ref.h b/third_party/xla/xla/tsl/concurrency/async_value_ref.h
index 83825c973a4e5f..1d72dbb4cd05b0 100644
--- a/third_party/xla/xla/tsl/concurrency/async_value_ref.h
+++ b/third_party/xla/xla/tsl/concurrency/async_value_ref.h
@@ -357,7 +357,7 @@ class AsyncValueRef {
     SetError(absl::InternalError(message_view));
   }
 
-  explicit operator bool() const { return value_.get() != nullptr; }
+  explicit operator bool() const { return value_ != nullptr; }
   bool operator==(const AsyncValueRef& r) const { return value_ == r.value_; }
   bool operator!=(const AsyncValueRef& r) const { return value_ != r.value_; }
 

From 7ccf06445fa083e296d135a6ca37753ffbf15e06 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 01:28:36 -0800
Subject: [PATCH 066/753] Automated Code Change

PiperOrigin-RevId: 842121699
---
 tensorflow/cc/BUILD                   | 1 +
 tensorflow/cc/gradients/array_grad.cc | 2 ++
 tensorflow/cc/gradients/image_grad.cc | 1 +
 tensorflow/cc/gradients/math_grad.cc  | 1 +
 tensorflow/cc/gradients/nn_grad.cc    | 1 +
 5 files changed, 6 insertions(+)

diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index bfa665a09f7588..3131284b4802bd 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -359,6 +359,7 @@ cc_library(
         "//tensorflow/core:lib_proto_parsing",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc
index f3c3fd045a3d6f..f0189c60c714e1 100644
--- a/tensorflow/cc/gradients/array_grad.cc
+++ b/tensorflow/cc/gradients/array_grad.cc
@@ -14,9 +14,11 @@ limitations under the License.
 ==============================================================================*/
 
 #include <cstdint>
+#include <string>
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/cc/framework/grad_op_registry.h"
 #include "tensorflow/cc/framework/gradients.h"
 #include "tensorflow/cc/ops/array_ops_internal.h"
diff --git a/tensorflow/cc/gradients/image_grad.cc b/tensorflow/cc/gradients/image_grad.cc
index deb90eec264ee7..bb37c90b3f32a8 100644
--- a/tensorflow/cc/gradients/image_grad.cc
+++ b/tensorflow/cc/gradients/image_grad.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
 #include <vector>
 
 #include "absl/status/status.h"
diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index c785af15f95447..af39009ad3f2a5 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstdint>
 #include <initializer_list>
 #include <iterator>
+#include <string>
 #include <vector>
 
 #include "absl/status/status.h"
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index 6309080492c1da..9b980bd9e8321d 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cstdint>
 #include <functional>
 #include <string>
 #include <vector>

From 25ed31c1a6c0a135239618b6d885385a6d79701b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 01:49:00 -0800
Subject: [PATCH 067/753] Automated Code Change

PiperOrigin-RevId: 842128029
---
 .../xla/backends/gpu/codegen/triton/xtile_compiler_stub_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler_stub_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler_stub_test.cc
index 5b47611387f732..12216068683e2d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler_stub_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler_stub_test.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+
 #include <gtest/gtest.h>
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"

From 41fddcf21863db7f7dbb855a5867858f8726d90a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 01:57:02 -0800
Subject: [PATCH 068/753] Automated Code Change

PiperOrigin-RevId: 842130465
---
 .../xla/xla/backends/autotuner/file_based_autotuner_cache.cc     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/backends/autotuner/file_based_autotuner_cache.cc b/third_party/xla/xla/backends/autotuner/file_based_autotuner_cache.cc
index 969286250aa1e9..d9aee5dcd69dd8 100644
--- a/third_party/xla/xla/backends/autotuner/file_based_autotuner_cache.cc
+++ b/third_party/xla/xla/backends/autotuner/file_based_autotuner_cache.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
-#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"

From f90cdb544e97accf09780a2c8810cab6a3a305fb Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <isergachev@nvidia.com>
Date: Tue, 9 Dec 2025 02:01:31 -0800
Subject: [PATCH 069/753] PR #34898: [GPU] Do not float-normalize bf16 negation
 and abs.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34898

📝 Summary of Changes
Avoid unnecessary type casts - bf16 negation and abs are supported in PTX.

🚀 Kind of Contribution
♻️ Cleanup

🧪 Unit Tests:
yes

🧪 Execution Tests:
no

Copybara import of the project:

--
867f131cccba2df2cbc61d584ebc238cb0aceeae by Ilia Sergachev <isergachev@nvidia.com>:

[GPU] Do not float-normalize bf16 negation and abs.

Merging this change closes #34898

PiperOrigin-RevId: 842132075
---
 .../xla/xla/service/gpu/gpu_float_support.cc  |  2 ++
 .../xla/service/gpu/gpu_float_support_test.cc | 29 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/gpu_float_support.cc b/third_party/xla/xla/service/gpu/gpu_float_support.cc
index cb0477bf19b9a9..6aa7e4b1ec1f68 100644
--- a/third_party/xla/xla/service/gpu/gpu_float_support.cc
+++ b/third_party/xla/xla/service/gpu/gpu_float_support.cc
@@ -131,8 +131,10 @@ bool GpuFloatSupport::IsSupported(const HloInstruction& hlo) const {
         return compute_capability_.IsCuda();
       }
       return false;
+    case HloOpcode::kAbs:
     case HloOpcode::kMaximum:
     case HloOpcode::kMinimum:
+    case HloOpcode::kNegate:
       if (LowPrecisionType() == BF16) {
         auto* cuda_compute_capability =
             compute_capability_.cuda_compute_capability();
diff --git a/third_party/xla/xla/service/gpu/gpu_float_support_test.cc b/third_party/xla/xla/service/gpu/gpu_float_support_test.cc
index f464b670a57701..bd88890113d1a5 100644
--- a/third_party/xla/xla/service/gpu/gpu_float_support_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_float_support_test.cc
@@ -432,6 +432,35 @@ ENTRY main {
       se::GpuComputeCapability{se::CudaComputeCapability::Volta()}, BF16, F32));
 }
 
+class Bf16UnaryOpTest : public FloatSupportTest,
+                        public ::testing::WithParamInterface<HloOpcode> {};
+
+TEST_P(Bf16UnaryOpTest, IsOnlyNormalizedPreAmpere) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(
+                              absl::Substitute(R"(
+entry {
+  a = bf16[] parameter(0)
+  r = bf16[] $0(a)
+})",
+                                               HloOpcodeString(GetParam()))));
+  EXPECT_FALSE(
+      Normalize(module.get(),
+                se::GpuComputeCapability{se::CudaComputeCapability::Hopper()},
+                BF16, F32));
+  EXPECT_FALSE(
+      Normalize(module.get(),
+                se::GpuComputeCapability{se::CudaComputeCapability::Ampere()},
+                BF16, F32));
+  EXPECT_TRUE(Normalize(
+      module.get(),
+      se::GpuComputeCapability{se::CudaComputeCapability::Volta()}, BF16, F32));
+}
+
+INSTANTIATE_TEST_SUITE_P(Bf16UnaryOps, Bf16UnaryOpTest,
+                         ::testing::Values(HloOpcode::kNegate,
+                                           HloOpcode::kAbs));
+
 TEST_F(FloatSupportTest,
        BF16ReductionOnHopperIsOnlyNormalizedIfReducerIsUnsupported) {
   auto cc = se::CudaComputeCapability::Hopper();

From a337f310754bf04bb0d69706cec11380a80d8485 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 9 Dec 2025 02:19:24 -0800
Subject: [PATCH 070/753] PR #34964: Bump github/codeql-action from 4.31.6 to
 4.31.7

Imported from GitHub PR https://github.com/openxla/xla/pull/34964

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.31.6 to 4.31.7.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/github/codeql-action/releases">github/codeql-action's releases</a>.</em></p>
<blockquote>
<h2>v4.31.7</h2>
<h1>CodeQL Action Changelog</h1>
<p>See the <a href="https://github.com/github/codeql-action/releases">releases page</a> for the relevant changes to the CodeQL CLI and language packs.</p>
<h2>4.31.7 - 05 Dec 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.7. <a href="https://redirect.github.com/github/codeql-action/pull/3343">#3343</a></li>
</ul>
<p>See the full <a href="https://github.com/github/codeql-action/blob/v4.31.7/CHANGELOG.md">CHANGELOG.md</a> for more information.</p>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/github/codeql-action/blob/main/CHANGELOG.md">github/codeql-action's changelog</a>.</em></p>
<blockquote>
<h1>CodeQL Action Changelog</h1>
<p>See the <a href="https://github.com/github/codeql-action/releases">releases page</a> for the relevant changes to the CodeQL CLI and language packs.</p>
<h2>[UNRELEASED]</h2>
<p>No user facing changes.</p>
<h2>4.31.7 - 05 Dec 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.7. <a href="https://redirect.github.com/github/codeql-action/pull/3343">#3343</a></li>
</ul>
<h2>4.31.6 - 01 Dec 2025</h2>
<p>No user facing changes.</p>
<h2>4.31.5 - 24 Nov 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.6. <a href="https://redirect.github.com/github/codeql-action/pull/3321">#3321</a></li>
</ul>
<h2>4.31.4 - 18 Nov 2025</h2>
<p>No user facing changes.</p>
<h2>4.31.3 - 13 Nov 2025</h2>
<ul>
<li>CodeQL Action v3 will be deprecated in December 2026.  The Action now logs a warning for customers who are running v3 but could be running v4. For more information, see <a href="https://github.blog/changelog/2025-10-28-upcoming-deprecation-of-codeql-action-v3/">Upcoming deprecation of CodeQL Action v3</a>.</li>
<li>Update default CodeQL bundle version to 2.23.5. <a href="https://redirect.github.com/github/codeql-action/pull/3288">#3288</a></li>
</ul>
<h2>4.31.2 - 30 Oct 2025</h2>
<p>No user facing changes.</p>
<h2>4.31.1 - 30 Oct 2025</h2>
<ul>
<li>The <code>add-snippets</code> input has been removed from the <code>analyze</code> action. This input has been deprecated since CodeQL Action 3.26.4 in August 2024 when this removal was announced.</li>
</ul>
<h2>4.31.0 - 24 Oct 2025</h2>
<ul>
<li>Bump minimum CodeQL bundle version to 2.17.6. <a href="https://redirect.github.com/github/codeql-action/pull/3223">#3223</a></li>
<li>When SARIF files are uploaded by the <code>analyze</code> or <code>upload-sarif</code> actions, the CodeQL Action automatically performs post-processing steps to prepare the data for the upload. Previously, these post-processing steps were only performed before an upload took place. We are now changing this so that the post-processing steps will always be performed, even when the SARIF files are not uploaded. This does not change anything for the <code>upload-sarif</code> action. For <code>analyze</code>, this may affect Advanced Setup for CodeQL users who specify a value other than <code>always</code> for the <code>upload</code> input. <a href="https://redirect.github.com/github/codeql-action/pull/3222">#3222</a></li>
</ul>
<h2>4.30.9 - 17 Oct 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.3. <a href="https://redirect.github.com/github/codeql-action/pull/3205">#3205</a></li>
<li>Experimental: A new <code>setup-codeql</code> action has been added which is similar to <code>init</code>, except it only installs the CodeQL CLI and does not initialize a database. Do not use this in production as it is part of an internal experiment and subject to change at any time. <a href="https://redirect.github.com/github/codeql-action/pull/3204">#3204</a></li>
</ul>
<h2>4.30.8 - 10 Oct 2025</h2>
<p>No user facing changes.</p>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="https://github.com/github/codeql-action/commit/cf1bb45a277cb3c205638b2cd5c984db1c46a412"><code>cf1bb45</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3344">#3344</a> from github/update-v4.31.7-f5c63fadd</li>
<li><a href="https://github.com/github/codeql-action/commit/f4ebe95061f10e93e9d301f51ed59c37fc67acde"><code>f4ebe95</code></a> Update changelog for v4.31.7</li>
<li><a href="https://github.com/github/codeql-action/commit/f5c63fadd50734aadb36128b8fd75caabc02a3dc"><code>f5c63fa</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3343">#3343</a> from github/update-bundle/codeql-bundle-v2.23.7</li>
<li><a href="https://github.com/github/codeql-action/commit/a2c01e776e434421d4f8cba239abb06ec9713e92"><code>a2c01e7</code></a> Add changelog note</li>
<li><a href="https://github.com/github/codeql-action/commit/ac34c1383489d3ac7641a26c5fbbf8ec5112f4fc"><code>ac34c13</code></a> Update default bundle to codeql-bundle-v2.23.7</li>
<li><a href="https://github.com/github/codeql-action/commit/267c4672a565967e4531438f2498370de5e8a98d"><code>267c467</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3339">#3339</a> from github/dependabot/npm_and_yarn/npm-minor-77d264...</li>
<li><a href="https://github.com/github/codeql-action/commit/aeabef7b69ed0dc14688dbc848e5f1edaeae59f1"><code>aeabef7</code></a> Merge branch 'main' into dependabot/npm_and_yarn/npm-minor-77d26487b0</li>
<li><a href="https://github.com/github/codeql-action/commit/78357d3fc9e24912713f993f791b2aef1b04bf6d"><code>78357d3</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3341">#3341</a> from github/mbg/ci/update-cs-config-cli-tests</li>
<li><a href="https://github.com/github/codeql-action/commit/d61a6fa793c84c98e08555552b4b9c6374665d24"><code>d61a6fa</code></a> Update CLI config test to account for overlay db changes on PRs</li>
<li><a href="https://github.com/github/codeql-action/commit/ce27e95f791dfda287706648ff69d9226c4526c2"><code>ce27e95</code></a> Rebuild</li>
<li>Additional commits viewable in <a href="https://github.com/github/codeql-action/compare/fe4161a26a8629af62121b670040955b330f9af2...cf1bb45a277cb3c205638b2cd5c984db1c46a412">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github/codeql-action&package-manager=github_actions&previous-version=4.31.6&new-version=4.31.7)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

</details>
Copybara import of the project:

--
0321b497362c7e4020514d78607ca2d0069f6c89 by dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>:

Bump github/codeql-action from 4.31.6 to 4.31.7

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.31.6 to 4.31.7.
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/fe4161a26a8629af62121b670040955b330f9af2...cf1bb45a277cb3c205638b2cd5c984db1c46a412)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-version: 4.31.7
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

Merging this change closes #34964

PiperOrigin-RevId: 842137219
---
 third_party/xla/.github/workflows/scorecards-analysis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/.github/workflows/scorecards-analysis.yml b/third_party/xla/.github/workflows/scorecards-analysis.yml
index f781a8bcb93b8a..d2bf9a77ef7ab6 100644
--- a/third_party/xla/.github/workflows/scorecards-analysis.yml
+++ b/third_party/xla/.github/workflows/scorecards-analysis.yml
@@ -67,6 +67,6 @@ jobs:
       # Upload the results to GitHub's code scanning dashboard (optional).
       # Commenting out will disable upload of results to your repo's Code Scanning dashboard
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@fe4161a26a8629af62121b670040955b330f9af2 # v4.31.6
+        uses: github/codeql-action/upload-sarif@cf1bb45a277cb3c205638b2cd5c984db1c46a412 # v4.31.7
         with:
           sarif_file: results.sarif

From aa5af21c0e39c1a91ed074f846aa8160fd5521af Mon Sep 17 00:00:00 2001
From: Terry Sun <tesun@nvidia.com>
Date: Tue, 9 Dec 2025 02:46:25 -0800
Subject: [PATCH 071/753] PR #34864: Update link in GPU flag guidance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34864

📝 Summary of Changes
Update link in GPU flag guidance.

🎯 Justification
The original linked page is moved, need update.

🚀 Kind of Contribution
📚 Documentation

📊 Benchmark (for Performance Improvements)
N/A.

🧪 Unit Tests:
N/A.

🧪 Execution Tests:
N/A.

Copybara import of the project:

--
27abae70ae87bc005166ddbedfb2c1a0bd15f3f8 by Terry Sun <tesun@nvidia.com>:

update link

Merging this change closes #34864

PiperOrigin-RevId: 842145979
---
 third_party/xla/docs/flags_guidance.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/third_party/xla/docs/flags_guidance.md b/third_party/xla/docs/flags_guidance.md
index f887b66c4fbc50..c973a1a665ca7c 100644
--- a/third_party/xla/docs/flags_guidance.md
+++ b/third_party/xla/docs/flags_guidance.md
@@ -79,8 +79,7 @@ data-parallel collectives (`xla_gpu_enable_pipelined_all_gather`,
 (`xla_gpu_enable_while_loop_double_buffering`), latency hiding scheduling
 (`xla_gpu_enable_latency_hiding_scheduler`), and SOL latency estimator on
 Hopper/Blackwell (`xla_gpu_enable_analytical_sol_latency_estimator`). See
-[GPU Optimization Levels](https://openxla.org/xla/gpu_optimization_levels) for
-details.
+[GPU Effort Levels](https://openxla.org/xla/effort_levels) for details.
 
 | Flag | Type | Notes |
 | :---- | :---- | :----- |

From 2a5ffea6a109dbb85951060a83187aa8251c3097 Mon Sep 17 00:00:00 2001
From: spiao <Songlin.Piao@amd.com>
Date: Tue, 9 Dec 2025 02:48:20 -0800
Subject: [PATCH 072/753] PR #34806: [ROCm] fix the calling convention for AMD
 GPU

Imported from GitHub PR https://github.com/openxla/xla/pull/34806

Bugfix: PR #34230 ("argument removal without building prototype") removed the call to **BuildKernelPrototypeFromUniqueName** which internally called **AnnotateFunctionAsGpuKernel** to set the correct calling convention based on the target GPU. Without this, Triton's **PTX_Kernel** calling convention was copied directly, which doesn't work on AMD GPUs and lead to "LLVM ERROR: unsupported calling convention".

Fix: Added a call to **AnnotateFunctionAsGpuKernel** in **RemoveUnusedTritonAbiArguments** to properly set:

PTX_Kernel (71) for NVIDIA
AMDGPU_KERNEL (91) for AMD
SPIR_KERNEL (76) for SPIR

@xla-rotation could you review my PR, please?
Copybara import of the project:

--
ebd6e1fa03033bc9f6913351323fce26e1a8e4d2 by Songlin Piao <Songlin.Piao@amd.com>:

replace the manual calling convention fix with AnnotateFunctionAsGpuKernel

--
4f16d9579b11c2984c8ebe58041b0d2b9ea5ba3f by Songlin Piao <Songlin.Piao@amd.com>:

added a filecheck test

Merging this change closes #34806

PiperOrigin-RevId: 842146580
---
 .../backends/gpu/codegen/fusion_emitter.cc    |  7 ++++-
 third_party/xla/xla/service/gpu/tests/BUILD   |  3 +++
 .../gpu/tests/triton_calling_convention.hlo   | 26 +++++++++++++++++++
 3 files changed, 35 insertions(+), 1 deletion(-)
 create mode 100644 third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo

diff --git a/third_party/xla/xla/backends/gpu/codegen/fusion_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/fusion_emitter.cc
index ad12a2d0923948..d051a6daf3e778 100644
--- a/third_party/xla/xla/backends/gpu/codegen/fusion_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/fusion_emitter.cc
@@ -266,10 +266,15 @@ absl::StatusOr<llvm::Function*> RemoveUnusedTritonAbiArguments(
           .getCallee();
   llvm::Function* new_function = static_cast<llvm::Function*>(inserted);
 
-  new_function->setCallingConv(impl_fn->getCallingConv());
   new_function->copyMetadata(impl_fn, 0);
   new_function->setAttributes(impl_fn->getAttributes());
 
+  // Set the correct calling convention for the target GPU.
+  // Triton generates PTX_Kernel CC even for AMD, so we need to use
+  // AnnotateFunctionAsGpuKernel to set the correct CC based on target triple.
+  llvm::IRBuilder<> builder(llvm_module->getContext());
+  AnnotateFunctionAsGpuKernel(llvm_module, new_function, &builder);
+
   new_function->splice(new_function->begin(), impl_fn);
 
   for (const auto& [impl_fn_arg, kernel_arg] :
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index 55c8c2316833fd..284f4090f3fe0a 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -661,6 +661,7 @@ lit_test_suite_for_gpus(
             "slice_to_dynamic.hlo",
             "sorting.hlo",
             "sub_byte_collectives.hlo",
+            "triton_calling_convention.hlo",
             "triton_naming.hlo",
             "zero_clamp_abs_index.hlo",
         ],
@@ -673,10 +674,12 @@ lit_test_suite_for_gpus(
     disabled_on_gpus = {
         "v100": [
             "kernel_reuse.hlo",
+            "triton_calling_convention.hlo",
             "triton_naming.hlo",
         ],
         "p100": [
             "kernel_reuse.hlo",
+            "triton_calling_convention.hlo",
             "triton_naming.hlo",
         ],
         "mi200": [
diff --git a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
new file mode 100644
index 00000000000000..6a83c444793d47
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
@@ -0,0 +1,26 @@
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+
+// Verify that Triton kernels have the correct calling convention:
+// - PTX_KERNEL (71) for NVIDIA targets
+// - AMDGPU_KERNEL (91) for AMD targets
+// CHECK-PTX: define ptx_kernel void @triton_
+// CHECK-GCN: define amdgpu_kernel void @triton_
+
+HloModule TritonCallingConvention, is_scheduled=true
+
+triton_softmax {
+  param_0 = f32[4,4]{1,0} parameter(0)
+  ROOT exp = f32[4,4]{1,0} exponential(param_0)
+}
+
+ENTRY main {
+  param_0 = f32[4,4]{1,0} parameter(0)
+  ROOT triton_softmax = f32[4,4]{1,0} fusion(param_0), kind=kCustom,
+    calls=triton_softmax,
+    backend_config={"fusion_backend_config":{
+      "kind":"__triton",
+      "block_level_fusion_config":{"output_tiles":[{"sizes":["4","4"]}],
+                                   "num_warps":"1",
+                                   "num_ctas":"1",
+                                   "num_stages":"1"}}}
+}

From aa202263bdaa236f07f01c6a95aafb72a0c65251 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Tue, 9 Dec 2025 02:54:53 -0800
Subject: [PATCH 073/753] Add constructor to `NamedSharding` accepting axis
 names.

This provides a more intuitive way to create `NamedSharding` objects, especially in tests, as it's easier to work with human-readable axis names than with `AxisRef` indices.

PiperOrigin-RevId: 842148670
---
 third_party/xla/xla/hlo/ir/BUILD              |  2 +
 third_party/xla/xla/hlo/ir/named_sharding.cc  | 81 +++++++++++++++++++
 third_party/xla/xla/hlo/ir/named_sharding.h   | 19 ++++-
 .../xla/xla/hlo/ir/named_sharding_test.cc     | 26 ++++++
 4 files changed, 126 insertions(+), 2 deletions(-)
 create mode 100644 third_party/xla/xla/hlo/ir/named_sharding.cc

diff --git a/third_party/xla/xla/hlo/ir/BUILD b/third_party/xla/xla/hlo/ir/BUILD
index 6d4c270f5b856f..07b2e615d62479 100644
--- a/third_party/xla/xla/hlo/ir/BUILD
+++ b/third_party/xla/xla/hlo/ir/BUILD
@@ -203,12 +203,14 @@ cc_library(
 
 cc_library(
     name = "named_sharding",
+    srcs = ["named_sharding.cc"],
     hdrs = ["named_sharding.h"],
     deps = [
         ":mesh_and_axis",
         ":tile_assignment",
         "//xla:xla_data_proto_cc",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/third_party/xla/xla/hlo/ir/named_sharding.cc b/third_party/xla/xla/hlo/ir/named_sharding.cc
new file mode 100644
index 00000000000000..0db5d8e916aa53
--- /dev/null
+++ b/third_party/xla/xla/hlo/ir/named_sharding.cc
@@ -0,0 +1,81 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/ir/named_sharding.h"
+
+#include <cstdint>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/types/span.h"
+#include "xla/hlo/ir/mesh_and_axis.h"
+
+namespace xla {
+
+namespace test_utils {
+// Construct sharding with given mesh. 'dim_shardings', 'replicated_axes',
+// 'unreduced_axes' refer to axis names in the mesh.
+// This is a test only helper function.
+NamedSharding FromAxisNames(
+    Mesh mesh, absl::Span<const std::vector<std::string>> dim_shardings,
+    absl::Span<const std::string> replicated_axes,
+    absl::Span<const std::string> unreduced_axes,
+    absl::Span<const OpMetadata> metadata) {
+  std::map<std::string, int64_t> mesh_axis_to_index;
+  for (int64_t i = 0; i < mesh.axis_names().size(); ++i) {
+    mesh_axis_to_index[mesh.axis_names()[i]] = i;
+  }
+
+  std::vector<NamedSharding::DimensionSharding> dim_shardings_;
+  dim_shardings_.reserve(dim_shardings.size());
+  for (const auto& axes_for_dim : dim_shardings) {
+    std::vector<AxisRef> axis_refs;
+    axis_refs.reserve(axes_for_dim.size());
+    for (const std::string& axis_name : axes_for_dim) {
+      auto it = mesh_axis_to_index.find(axis_name);
+      CHECK(it != mesh_axis_to_index.end())
+          << "Axis " << axis_name << " not found in mesh " << mesh.ToString();
+      axis_refs.push_back(AxisRef(it->second));
+    }
+    dim_shardings_.push_back(NamedSharding::DimensionSharding(
+        std::move(axis_refs), /*is_closed=*/true));
+  }
+
+  std::vector<AxisRef> replicated_axes_;
+  replicated_axes_.reserve(replicated_axes.size());
+  for (const std::string& axis_name : replicated_axes) {
+    auto it = mesh_axis_to_index.find(axis_name);
+    CHECK(it != mesh_axis_to_index.end())
+        << "Axis " << axis_name << " not found in mesh " << mesh.ToString();
+    replicated_axes_.push_back(AxisRef(it->second));
+  }
+
+  std::vector<AxisRef> unreduced_axes_;
+  unreduced_axes_.reserve(unreduced_axes.size());
+  for (const std::string& axis_name : unreduced_axes) {
+    auto it = mesh_axis_to_index.find(axis_name);
+    CHECK(it != mesh_axis_to_index.end())
+        << "Axis " << axis_name << " not found in mesh " << mesh.ToString();
+    unreduced_axes_.push_back(AxisRef(it->second));
+  }
+
+  return NamedSharding(mesh, dim_shardings_, replicated_axes_, unreduced_axes_,
+                       metadata);
+}
+}  // namespace test_utils
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/ir/named_sharding.h b/third_party/xla/xla/hlo/ir/named_sharding.h
index 6c93bed8d40c74..bfdc9966c0b15d 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.h
+++ b/third_party/xla/xla/hlo/ir/named_sharding.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define XLA_HLO_IR_NAMED_SHARDING_H_
 
 #include <cstdint>
+#include <string>
 #include <utility>
 #include <vector>
 
@@ -38,8 +39,11 @@ class NamedSharding {
       return axes_ == other.axes_ && is_closed_ == other.is_closed_;
     }
 
-    explicit DimensionSharding(std::vector<AxisRef> axes, bool is_closed)
-        : axes_(std::move(axes)), is_closed_(is_closed) {}
+    // Note that by default we assume closed sharding.
+    explicit DimensionSharding() : is_closed_(true) {};
+
+    explicit DimensionSharding(absl::Span<const AxisRef> axes, bool is_closed)
+        : axes_(axes.begin(), axes.end()), is_closed_(is_closed) {}
 
     absl::Span<const AxisRef> axes() const { return axes_; }
 
@@ -118,6 +122,17 @@ class NamedSharding {
   std::vector<OpMetadata> metadata_;
 };
 
+// Contains test only helper functions.
+namespace test_utils {
+// Construct sharding with given mesh. 'dim_shardings', 'replicated_axes',
+// 'unreduced_axes' refer to axis names in the mesh.
+NamedSharding FromAxisNames(
+    Mesh mesh, absl::Span<const std::vector<std::string>> dim_shardings,
+    absl::Span<const std::string> replicated_axes = {},
+    absl::Span<const std::string> unreduced_axes = {},
+    absl::Span<const OpMetadata> metadata = {});
+}  // namespace test_utils
+
 }  // namespace xla
 
 #endif  // XLA_HLO_IR_NAMED_SHARDING_H_
diff --git a/third_party/xla/xla/hlo/ir/named_sharding_test.cc b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
index 36e9cfbbba67bb..78e3b3e3b08095 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding_test.cc
+++ b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
@@ -24,6 +24,32 @@ namespace {
 
 using DimensionSharding = NamedSharding::DimensionSharding;
 
+TEST(NamedShardingTest, AxisNameCtor) {
+  Mesh mesh_abcd({2, 4, 3, 8}, {"a", "b", "c", "d"});
+  AxisRef axis_a(0);
+  AxisRef axis_b(1);
+  AxisRef axis_c(2);
+  AxisRef axis_d(3);
+
+  NamedSharding sharding =
+      test_utils::FromAxisNames(mesh_abcd, /*dim_shardings=*/{{"c"}, {"b"}},
+                                /*replicated_axes=*/{"a"},
+                                /*unreduced_axes=*/{"d"});
+  DimensionSharding ds_c({axis_c}, /*is_closed=*/true);
+  DimensionSharding ds_b({axis_b}, /*is_closed=*/true);
+  EXPECT_EQ(sharding,
+            NamedSharding(mesh_abcd, {ds_c, ds_b}, {axis_a}, {axis_d}));
+
+  NamedSharding sharding2 = test_utils::FromAxisNames(
+      mesh_abcd,
+      /*dim_shardings=*/{{"c", "a"}, {}, {"b"}},
+      /*replicated_axes=*/{"d"}, /*unreduced_axes=*/{});
+  DimensionSharding ds_ca({axis_c, axis_a}, /*is_closed=*/true);
+  EXPECT_EQ(sharding2,
+            NamedSharding(mesh_abcd, {ds_ca, DimensionSharding(), ds_b},
+                          {axis_d}, {}));
+}
+
 TEST(NamedShardingTest, Equality) {
   Mesh mesh_abcd({2, 4, 3, 8}, {"a", "b", "c", "d"});
 

From a05e35e330f70fe1920a07573e709247b09ddb15 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 03:05:52 -0800
Subject: [PATCH 074/753] Reverts 0752a12d8a06aaefc942eaf1f5255a6eea23ca14

PiperOrigin-RevId: 842151521
---
 .../xla/hlo/analysis/hlo_dataflow_analysis.cc | 28 ++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc b/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc
index ca00349f4c25d5..893c233f9bd1c2 100644
--- a/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc
+++ b/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc
@@ -44,6 +44,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/layout.h"
 #include "xla/map_util.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/hlo_value.h"
@@ -1616,6 +1617,26 @@ HloDataflowAnalysis::GetInPlaceInputOutputPairs(
   return alias_info->GetInPlaceInputOutputPairs(instruction);
 }
 
+// Returns true if the instruction is a fusion consisting of a single copy which
+// changes tiling. This is handled by the emitters and effectively are no-ops.
+static bool IsChangeTilingCopyFusion(HloInstruction* instr) {
+  if (!instr->parent()->IsFusionComputation() ||
+      instr->opcode() != HloOpcode::kFusion ||
+      instr->called_computations().size() != 1 || instr->operand_count() != 1) {
+    return false;
+  }
+  // These copy fusions should only change tiling (and sometimes memory space).
+  HloInstruction* fusion_root = instr->fused_expression_root();
+  const Layout& operand_layout = fusion_root->operand(0)->shape().layout();
+  const Layout& output_layout = fusion_root->shape().layout();
+  absl::Span<const Tile> operand_tiles = operand_layout.tiles();
+  absl::Span<const Tile> output_tiles = output_layout.tiles();
+  return fusion_root->opcode() == HloOpcode::kCopy &&
+         Layout::Equal().IgnoreTiles().IgnoreMemorySpace()(operand_layout,
+                                                           output_layout) &&
+         operand_tiles != output_tiles;
+}
+
 bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
     HloInstruction* user, const ShapeIndex& user_index,
@@ -1631,7 +1652,12 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
   const Shape& user_subshape =
       ShapeUtil::GetSubshape(user->shape(), user_index);
 
-  auto shapes_equal = ShapeUtil::Equal(operand_subshape, user_subshape);
+  // During tiling assignment, we can add no-op instructions which appear to
+  // change tiling (and memory space) of the operand, but don't.
+  if (IsChangeTilingCopyFusion(user) || IsChangeTilingCopyFusion(operand)) {
+    return true;
+  }
+  const bool shapes_equal = ShapeUtil::Equal(operand_subshape, user_subshape);
   // Check that operand and user emit the same shape and layout.
   if (shapes_equal) {
     // Must-alias relationship returns true for in-place operations (DUS and DUS

From 10344d0c57913c6abffe86c6dbc5bac8322b19f2 Mon Sep 17 00:00:00 2001
From: Alexander Lyashuk <crem@google.com>
Date: Tue, 9 Dec 2025 04:32:33 -0800
Subject: [PATCH 075/753] [XLA:GPU] Move default Triton configs to text proto
 format.

This is to make default configuration consistent to what new `--xla_gpu_gemm_autotuner_override_file` flag takes.

PiperOrigin-RevId: 842176141
---
 .../xla/xla/backends/gpu/autotuner/triton.cc  |   8 +-
 .../xla/xla/service/gpu/autotuning/BUILD      |  12 +-
 .../autotuning/gemm_fusion_autotuner_cuda.cc  |   6 +-
 .../autotuning/gemm_fusion_autotuner_rocm.cc  |   2 +-
 .../service/gpu/autotuning/triton_configs.cc  | 207 ++++++++++++++++++
 .../service/gpu/autotuning/triton_configs.h   |  77 +------
 6 files changed, 235 insertions(+), 77 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/autotuning/triton_configs.cc

diff --git a/third_party/xla/xla/backends/gpu/autotuner/triton.cc b/third_party/xla/xla/backends/gpu/autotuner/triton.cc
index a6cc696a80ef91..6a6246ee386be9 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/triton.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/triton.cc
@@ -60,7 +60,7 @@ namespace {
 std::vector<TritonGemmConfig> GetDefaultTritonConfigs(
     se::GpuComputeCapability compute_capability, bool autotune_tma) {
   if (compute_capability.IsRocm()) {
-    return *kDefaultRocmConfigs;
+    return GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultRocm);
   }
 
   CHECK(compute_capability.IsCuda());
@@ -68,12 +68,12 @@ std::vector<TritonGemmConfig> GetDefaultTritonConfigs(
   std::vector<TritonGemmConfig> configs;
 
   if (cuda_compute_capability->IsAtLeastBlackwell()) {
-    configs = *kBlackwellConfigs;
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kBlackwell);
   } else if (cuda_compute_capability->IsHopper() ||
              cuda_compute_capability->IsAmpere()) {
-    configs = *kHopperAmpereConfigs;
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopperAmpere);
   } else {
-    configs = *kDefaultCudaConfigs;
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultCuda);
   }
 
   if (!autotune_tma) {
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 56d8c4cb2f64ee..5ca2354d355add 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -758,6 +758,16 @@ cc_library(
 
 cc_library(
     name = "triton_configs",
+    srcs = ["triton_configs.cc"],
     hdrs = ["triton_configs.h"],
-    deps = ["//xla/service/gpu:matmul_utils"],
+    deps = [
+        "//xla:autotuning_proto_cc",
+        "//xla/service/gpu:matmul_utils",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:no_destructor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_protobuf//:protobuf",
+    ],
 )
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc
index 7dc86e8a9c2fde..336b668d4b3160 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc
@@ -118,11 +118,11 @@ std::vector<TritonGemmConfig> GemmFusionAutotunerImpl::GetDefaultTritonConfigs()
   std::vector<TritonGemmConfig> configs;
 
   if (compute_capability.IsAtLeastBlackwell()) {
-    configs = *kBlackwellConfigs;
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kBlackwell);
   } else if (compute_capability.IsHopper() || compute_capability.IsAmpere()) {
-    configs = *kHopperAmpereConfigs;
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopperAmpere);
   } else {
-    configs = *kDefaultCudaConfigs;
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultCuda);
   }
 
   if (!debug_options_.xla_gpu_experimental_enable_triton_tma() ||
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_rocm.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_rocm.cc
index 83232e68d4e126..e7d072f1f0d96e 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_rocm.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_rocm.cc
@@ -49,7 +49,7 @@ GemmFusionAutotuner::GetPlatformCodegenBackends(
 
 std::vector<TritonGemmConfig> GemmFusionAutotunerImpl::GetDefaultTritonConfigs()
     const {
-  return *kDefaultRocmConfigs;
+  return GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultRocm);
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc b/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc
new file mode 100644
index 00000000000000..e57bb34bf71e97
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc
@@ -0,0 +1,207 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/autotuning/triton_configs.h"
+
+#include <initializer_list>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/base/no_destructor.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/log/check.h"
+#include "absl/strings/string_view.h"
+#include "google/protobuf/text_format.h"
+#include "xla/autotuning.pb.h"
+#include "xla/service/gpu/matmul_utils.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+// TODO(b/467265599): Replace string constants with cc_embed_data when
+// https://github.com/bazelbuild/rules_cc/issues/41 is fixed.
+
+constexpr absl::string_view kBlackwellTritonConfigs = R"(
+config { block_m: 128 block_n: 128 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 64 split_k: 1 num_stages: 1 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 64 split_k: 8 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 16 split_k: 512 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 32 split_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 1 num_stages: 5 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 64 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 2 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 4 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 64 block_k: 64 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 64 block_k: 64 split_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 64 block_k: 64 split_k: 8 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 64 split_k: 8 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 32 block_k: 64 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 256 block_n: 128 block_k: 64 split_k: 1 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 256 block_n: 16 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 256 block_n: 32 block_k: 32 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 16 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 32 block_n: 16 block_k: 512 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 16 block_k: 64 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 32 block_n: 16 block_k: 64 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 16 split_k: 1 num_stages: 1 num_warps: 16 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 16 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 1 num_stages: 2 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 64 split_k: 64 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 128 split_k: 8 num_stages: 1 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 16 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+)";
+
+constexpr absl::string_view kDefaultCudaTritonConfigs = R"(
+config { block_m: 32 block_n: 32 block_k: 256 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 64 block_k: 64 split_k: 4 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 64 split_k: 4 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 256 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 128 block_k: 32 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 64 block_k: 128 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 128 block_k: 32 split_k: 8 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 512 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 16 block_k: 512 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 64 split_k: 1 num_stages: 2 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 32 split_k: 1 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 256 block_n: 128 block_k: 32 split_k: 1 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 256 block_n: 64 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 64 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 256 block_n: 128 block_k: 128 split_k: 1 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 256 block_n: 64 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 256 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 64 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 32 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 128 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 256 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 64 split_k: 2 num_stages: 1 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 64 split_k: 1 num_stages: 2 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 64 block_k: 256 split_k: 8 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 256 block_n: 256 block_k: 128 split_k: 1 num_stages: 3 num_warps: 8 num_ctas: 1 }
+)";
+
+constexpr absl::string_view kDefaultRocmTritonConfigs = R"(
+config { block_m: 32 block_n: 32 block_k: 256 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 64 block_k: 64 split_k: 4 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 64 split_k: 4 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 256 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 128 block_k: 32 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+)";
+
+constexpr absl::string_view kHopperAmpereTritonConfigs = R"(
+config { block_m: 16 block_n: 16 block_k: 64 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 256 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 32 block_n: 32 block_k: 128 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 256 block_k: 32 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 256 block_k: 32 split_k: 16 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 32 split_k: 16 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 4 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 16 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 128 split_k: 16 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 64 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 128 num_stages: 2 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 64 split_k: 4 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 128 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 256 split_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 16 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 64 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 128 split_k: 8 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 32 split_k: 8 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 32 block_k: 32 split_k: 8 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 32 split_k: 8 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 8 block_k: 128 split_k: 2 num_stages: 3 num_warps: 4 num_ctas: 1 }
+)";
+
+absl::flat_hash_map<TritonConfigsPlatform, std::vector<TritonGemmConfig>>
+LoadTritonConfigs() {
+  absl::flat_hash_map<TritonConfigsPlatform, std::vector<TritonGemmConfig>>
+      result;
+
+  auto parse_config =
+      [](absl::string_view config_str) -> std::vector<TritonGemmConfig> {
+    TritonGemmConfigsProto proto;
+    CHECK(tsl::protobuf::TextFormat::ParseFromString(config_str, &proto))
+        << config_str;
+    std::vector<TritonGemmConfig> configs;
+    absl::c_transform(proto.config(), std::back_inserter(configs),
+                      [](const AutotuneResult::TritonGemmKey& config_proto) {
+                        absl::StatusOr<TritonGemmConfig> config =
+                            TritonGemmConfig::FromProto(config_proto);
+                        CHECK_OK(config);
+                        return *config;
+                      });
+    return configs;
+  };
+
+  const std::initializer_list<
+      std::pair<TritonConfigsPlatform, absl::string_view>>
+      kConfigsMap = {
+          {TritonConfigsPlatform::kBlackwell, kBlackwellTritonConfigs},
+          {TritonConfigsPlatform::kDefaultCuda, kDefaultCudaTritonConfigs},
+          {TritonConfigsPlatform::kDefaultRocm, kDefaultRocmTritonConfigs},
+          {TritonConfigsPlatform::kHopperAmpere, kHopperAmpereTritonConfigs},
+      };
+  for (const auto& [platform, config_str] : kConfigsMap) {
+    result[platform] = parse_config(config_str);
+  }
+
+  return result;
+}
+
+}  // namespace
+
+const std::vector<TritonGemmConfig>& GetTritonConfigsForPlatform(
+    TritonConfigsPlatform platform) {
+  static const absl::NoDestructor<
+      absl::flat_hash_map<TritonConfigsPlatform, std::vector<TritonGemmConfig>>>
+      kConfigs(LoadTritonConfigs());
+  return kConfigs->at(platform);
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/autotuning/triton_configs.h b/third_party/xla/xla/service/gpu/autotuning/triton_configs.h
index 7cb0896477b419..252b4be2b1b692 100644
--- a/third_party/xla/xla/service/gpu/autotuning/triton_configs.h
+++ b/third_party/xla/xla/service/gpu/autotuning/triton_configs.h
@@ -23,74 +23,15 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-using Config = TritonGemmConfig;
-
-static const std::vector<TritonGemmConfig>* const kBlackwellConfigs =
-    new std::vector<TritonGemmConfig>(
-        {Config(128, 128, 32, 1, 4, 4), Config(128, 128, 64, 1, 1, 8),
-         Config(128, 128, 64, 8, 3, 4), Config(128, 16, 16, 512, 4, 2),
-         Config(128, 16, 32, 16, 3, 2), Config(128, 16, 64, 1, 5, 4),
-         Config(128, 16, 64, 16, 3, 4), Config(128, 16, 64, 64, 1, 2),
-         Config(128, 256, 64, 1, 4, 8), Config(128, 256, 64, 2, 4, 8),
-         Config(128, 256, 64, 4, 3, 8), Config(128, 64, 64, 1, 3, 4),
-         Config(128, 64, 64, 16, 4, 8), Config(128, 64, 64, 8, 4, 4),
-         Config(16, 16, 128, 1, 3, 2),  Config(16, 16, 16, 1, 1, 2),
-         Config(16, 16, 64, 8, 3, 2),   Config(16, 32, 64, 1, 3, 2),
-         Config(256, 128, 64, 1, 3, 8), Config(256, 16, 16, 1, 1, 2),
-         Config(256, 32, 32, 16, 3, 4), Config(32, 16, 32, 1, 4, 2),
-         Config(32, 16, 512, 1, 1, 4),  Config(32, 16, 64, 1, 1, 2),
-         Config(32, 16, 64, 1, 4, 2),   Config(64, 128, 16, 1, 1, 16),
-         Config(64, 128, 16, 1, 3, 2),  Config(64, 128, 64, 1, 4, 4),
-         Config(64, 16, 64, 1, 2, 2),   Config(64, 32, 128, 1, 3, 2),
-         Config(64, 32, 32, 1, 4, 2),   Config(64, 32, 64, 64, 3, 2),
-         Config(64, 64, 128, 8, 1, 8),  Config(64, 64, 16, 1, 1, 2),
-         Config(64, 64, 16, 1, 3, 2)});
-
-static const std::vector<TritonGemmConfig>* const kHopperAmpereConfigs =
-    new std::vector<TritonGemmConfig>(
-        {Config(16, 16, 64, 1, 4, 2),    Config(16, 16, 128, 1, 4, 4),
-         Config(16, 16, 128, 128, 4, 2), Config(16, 16, 128, 16, 1, 2),
-         Config(16, 256, 16, 1, 1, 2),   Config(32, 32, 128, 16, 1, 4),
-         Config(32, 256, 32, 1, 3, 4),   Config(32, 256, 32, 16, 3, 8),
-         Config(64, 16, 32, 1, 4, 2),    Config(64, 16, 32, 16, 4, 2),
-         Config(64, 16, 64, 1, 1, 4),    Config(64, 16, 64, 4, 3, 2),
-         Config(64, 16, 64, 16, 4, 4),   Config(64, 16, 128, 1, 4, 2),
-         Config(64, 16, 128, 16, 4, 4),  Config(64, 32, 32, 1, 4, 4),
-         Config(64, 32, 64, 16, 3, 4),   Config(64, 32, 128, 1, 3, 2),
-         Config(64, 32, 128, 128, 2, 4), Config(64, 64, 32, 1, 4, 4),
-         Config(64, 64, 64, 1, 4, 4),    Config(64, 64, 64, 4, 4, 4),
-         Config(64, 64, 128, 16, 3, 4),  Config(64, 64, 256, 16, 4, 8),
-         Config(64, 128, 16, 1, 4, 2),   Config(64, 128, 64, 1, 3, 4),
-         Config(64, 128, 128, 8, 1, 4),  Config(64, 256, 32, 1, 4, 4),
-         Config(128, 16, 32, 8, 4, 2),   Config(128, 16, 64, 16, 3, 2),
-         Config(128, 16, 64, 16, 1, 4),  Config(128, 32, 32, 8, 4, 2),
-         Config(128, 128, 32, 8, 4, 8),  Config(128, 256, 32, 1, 4, 8),
-         Config(128, 256, 64, 1, 4, 8),  Config(64, 8, 128, 2, 3, 4, 1)});
-
-static const std::vector<TritonGemmConfig>* const kDefaultCudaConfigs =
-    new std::vector<TritonGemmConfig>(
-        {Config(32, 32, 256, 1, 1, 4),   Config(64, 32, 32, 16, 1, 4),
-         Config(32, 64, 64, 4, 1, 4),    Config(128, 128, 64, 4, 1, 4),
-         Config(16, 16, 256, 1, 1, 4),   Config(16, 128, 32, 16, 1, 4),
-         Config(16, 64, 128, 1, 1, 4),   Config(16, 128, 32, 8, 1, 4),
-         Config(16, 16, 512, 1, 1, 4),   Config(32, 16, 512, 1, 1, 4),
-         Config(64, 32, 64, 1, 2, 8),    Config(128, 256, 32, 1, 3, 8),
-         Config(256, 128, 32, 1, 3, 8),  Config(256, 64, 32, 1, 4, 4),
-         Config(64, 256, 32, 1, 4, 4),   Config(128, 64, 32, 1, 4, 4),
-         Config(64, 128, 32, 1, 4, 4),   Config(256, 128, 128, 1, 3, 8),
-         Config(256, 64, 128, 1, 4, 4),  Config(64, 256, 128, 1, 4, 4),
-         Config(128, 128, 128, 1, 4, 4), Config(128, 64, 64, 1, 4, 4),
-         Config(64, 128, 64, 1, 4, 4),   Config(128, 32, 64, 1, 4, 4),
-         Config(64, 32, 64, 1, 4, 4),    Config(32, 128, 32, 1, 4, 4),
-         Config(128, 128, 32, 1, 4, 4),  Config(16, 16, 256, 1, 3, 4),
-         Config(128, 128, 64, 2, 1, 8),  Config(64, 64, 64, 1, 2, 4),
-         Config(16, 64, 256, 8, 1, 4),   Config(256, 256, 128, 1, 3, 8)});
-
-static const std::vector<TritonGemmConfig>* const kDefaultRocmConfigs =
-    new std::vector<TritonGemmConfig>(
-        {Config(32, 32, 256, 1, 1, 4), Config(64, 32, 32, 16, 1, 4),
-         Config(32, 64, 64, 4, 1, 4), Config(128, 128, 64, 4, 1, 4),
-         Config(16, 16, 256, 1, 1, 4), Config(16, 128, 32, 16, 1, 4)});
+enum class TritonConfigsPlatform {
+  kBlackwell,
+  kDefaultCuda,
+  kDefaultRocm,
+  kHopperAmpere,
+};
+
+const std::vector<TritonGemmConfig>& GetTritonConfigsForPlatform(
+    TritonConfigsPlatform);
 
 }  // namespace gpu
 }  // namespace xla

From 5d3686aeb72d6b9db120f516419663d62cd28e1c Mon Sep 17 00:00:00 2001
From: spiao <Songlin.Piao@amd.com>
Date: Tue, 9 Dec 2025 05:00:28 -0800
Subject: [PATCH 076/753] PR #34812: [ROCm] Add register spilling detection
 support AMD
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34812

✨ New Feature

Added register spilling detection support.

🧪 Execution Test

./bazel-7.4.1-linux-x86_64 build //xla/service/gpu/transforms:triton_fusion_numerics_verifier_test
bazel-bin/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test_amdgpu_any --gtest_filter=TritonFusionNumericsVerifierTest.CompilationSucceedsEvenIfKernelWillSpillRegisters

```
I0000 00:00:1764849271.079538 2923925 amdgpu_backend.cc:447] ====== REGISTER SPILLING DETECTED ======
I0000 00:00:1764849271.079561 2923925 amdgpu_backend.cc:448] Module: triton_softmax_consts
I0000 00:00:1764849271.079565 2923925 amdgpu_backend.cc:449] SGPR spill count: 0
I0000 00:00:1764849271.079569 2923925 amdgpu_backend.cc:450] VGPR spill count: 194
I0000 00:00:1764849271.079572 2923925 amdgpu_backend.cc:451] Private segment size: 780 bytes
I0000 00:00:1764849271.079574 2923925 amdgpu_backend.cc:452] Performance may be degraded due to register pressure
I0000 00:00:1764849271.079576 2923925 amdgpu_backend.cc:453] ========================================
I0000 00:00:1764849271.390972 2923925 amdgpu_backend.cc:447] ====== REGISTER SPILLING DETECTED ======
I0000 00:00:1764849271.390996 2923925 amdgpu_backend.cc:448] Module: triton_softmax_consts
I0000 00:00:1764849271.391000 2923925 amdgpu_backend.cc:449] SGPR spill count: 0
I0000 00:00:1764849271.391005 2923925 amdgpu_backend.cc:450] VGPR spill count: 194
I0000 00:00:1764849271.391007 2923925 amdgpu_backend.cc:451] Private segment size: 780 bytes
I0000 00:00:1764849271.391009 2923925 amdgpu_backend.cc:452] Performance may be degraded due to register pressure
I0000 00:00:1764849271.391012 2923925 amdgpu_backend.cc:453] ========================================
I0000 00:00:1764849271.397868 2923925 tfrt_gpu_client.cc:197] TfrtGpuClient destroyed.
[       OK ] TritonFusionNumericsVerifierTest.CompilationSucceedsEvenIfKernelWillSpillRegisters (8019 ms)
[----------] 1 test from TritonFusionNumericsVerifierTest (8019 ms total)

[----------] Global test environment tear-down
[==========] 1 test from 1 test suite ran. (8019 ms total)
[  PASSED  ] 1 test.
```

This PR is on top of another bugfix PR (https://github.com/openxla/xla/pull/34806).

@xla-rotation could you review my PR, please?

Copybara import of the project:

--
ebd6e1fa03033bc9f6913351323fce26e1a8e4d2 by Songlin Piao <Songlin.Piao@amd.com>:

replace the manual calling convention fix with AnnotateFunctionAsGpuKernel

--
fafc7f1f6ad5a47204a32d433eab2bc5ec44dbd3 by Songlin Piao <Songlin.Piao@amd.com>:

register spilling by disassembling object file

--
f6b86f6fc96fd3398608c0078233db2efa74fce7 by Songlin Piao <Songlin.Piao@amd.com>:

added time measurement to the spilling check

--
8e5ea8455fc730b73b3768cbdde07079c8c53c29 by Songlin Piao <Songlin.Piao@amd.com>:

adapt the num_warps so that the hlo could be compiled on both amd and nvidia

--
22ef808416e6d339356c3a901ce1f5d03a396a60 by Songlin Piao <Songlin.Piao@amd.com>:

pass though is_autotuning_compilation flag to the function CompileToHsaco

--
b1d5e976c8051332ca1fc45e5f3b91fcd15a3da8 by Songlin Piao <Songlin.Piao@amd.com>:

implementation of register spilling by reading meta data of hasco file using llvm-readobj

--
d74ae83731a0a56a7285c1ac57689678d21e42d4 by Songlin Piao <Songlin.Piao@amd.com>:

adapted functiona calls as is_autotuning_compilation is removed in upstream

--
07ed74d49361fb1945092cac459a3bb70262265b by Songlin Piao <Songlin.Piao@amd.com>:

utilize amd code object manager library for parsing HSACO metadata

--
11e83bcb502ee341ddf7db9044b05b4b757ca5e9 by Songlin Piao <Songlin.Piao@amd.com>:

Revert "replace the manual calling convention fix with AnnotateFunctionAsGpuKernel"

This reverts commit ebd6e1fa03033bc9f6913351323fce26e1a8e4d2.

Merging this change closes #34812

PiperOrigin-RevId: 842183737
---
 .../xla/third_party/gpus/rocm/BUILD.tpl       |   9 +-
 .../xla/xla/backends/gpu/codegen/triton/BUILD |   1 -
 .../xla/service/gpu/llvm_gpu_backend/BUILD    |   4 +
 .../gpu/llvm_gpu_backend/amdgpu_backend.cc    | 175 +++++++++++++++++-
 .../triton_fusion_numerics_verifier_test.cc   |   2 +-
 5 files changed, 185 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/third_party/gpus/rocm/BUILD.tpl b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
index c95f9a95933fbc..4eba66c971da72 100644
--- a/third_party/xla/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
@@ -150,9 +150,11 @@ cc_library(
         ],
         ":multiple_rocm_paths": [
             "-Wl,-rpath=%{rocm_lib_paths}",
+            "-Lexternal/local_config_rocm/rocm/%{rocm_root}/lib",
         ],
         "//conditions:default": [
             "-Wl,-rpath,/opt/rocm/lib",
+            "-Lexternal/local_config_rocm/rocm/%{rocm_root}/lib",
         ],
     }),
     visibility = ["//visibility:public"],
@@ -535,7 +537,7 @@ cc_library(
 cc_library(
     name = "amd_comgr",
     hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]),
-    data = glob([
+    srcs = glob([
         "%{rocm_root}/lib/libamd_comgr_loader.so*",
         "%{rocm_root}/lib/libamd_comgr.so*",
         "%{rocm_root}/lib/llvm/lib/libLLVM.so*",
@@ -548,9 +550,12 @@ cc_library(
         ":build_hermetic": [
             "-lamd_comgr_loader",
         ],
-        "//conditions:default": [],
+        "//conditions:default": [
+            "-lamd_comgr",
+	],
     }),
     strip_include_prefix = "%{rocm_root}",
+    visibility = ["//visibility:public"],
     deps = [
         ":rocm_config",
         ":rocm_rpath",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index ce2212b64621e9..edd7570f1ed64c 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -413,7 +413,6 @@ cc_library(
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:ir_emission_utils",
         "//xla/service/gpu:launch_dimensions",
-        "//xla/service/gpu/llvm_gpu_backend:amdgpu_backend",
         "//xla/service/gpu/llvm_gpu_backend:nvptx_libdevice_path",
         "//xla/service/gpu:matmul_utils",
         "//xla/service/gpu:triton_fusion_analysis",
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
index 668b495f7626ac..f70483614dd8c9 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -171,7 +171,9 @@ cc_library(
         "HAS_SUPPORT_FOR_EMBEDDED_LIB_DEVICE=1",
     ]),
     tags = [
+        "gpu",
         "nofixdeps",  # This target crashes build_cleaner ¯\_(ツ)_/¯
+        "rocm-only",
     ],
     deps = [
         ":llvm_gpu_backend",
@@ -210,6 +212,8 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Target",
         "@llvm-project//llvm:TargetParser",
+        "@local_config_rocm//rocm:amd_comgr",
+        "@local_config_rocm//rocm:rocm_headers",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:random",
         "@local_tsl//tsl/profiler/lib:traceme",
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc
index c927c1eb627731..536216735c9adf 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h"
 
+#include <algorithm>
 #include <cstdint>
 #include <cstdlib>
 #include <fstream>
@@ -39,6 +40,7 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "amd_comgr/amd_comgr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
@@ -318,14 +320,151 @@ absl::StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
     }
   }
 
-  // Read HSACO.
+  // Read HSACO file into memory (used for both metadata extraction and return)
   std::ifstream hsaco_file(hsaco_path, std::ios::binary | std::ios::ate);
+  if (!hsaco_file) {
+    return xla::Internal("Failed to open HSACO file: %s", hsaco_path);
+  }
   std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
-
   std::vector<uint8_t> hsaco(hsaco_file_size);
   hsaco_file.seekg(0, std::ios::beg);
   hsaco_file.read(reinterpret_cast<char*>(hsaco.data()), hsaco_file_size);
   hsaco_file.close();
+
+  // Check for register spilling using HSACO metadata
+  // Use amd_comgr library for fast in-process metadata extraction
+  VLOG(2) << "Checking for register spilling in: "
+          << module->getModuleIdentifier();
+
+  bool has_spilling = false;
+  int sgpr_spill_count = 0;
+  int vgpr_spill_count = 0;
+  int private_segment_size = 0;
+
+  // Use already-loaded HSACO data for amd_comgr parsing
+  {
+    // Create amd_comgr data object from HSACO
+    amd_comgr_data_t comgr_data;
+    amd_comgr_status_t status =
+        amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &comgr_data);
+
+    if (status == AMD_COMGR_STATUS_SUCCESS) {
+      status = amd_comgr_set_data(comgr_data, hsaco.size(),
+                                  reinterpret_cast<const char*>(hsaco.data()));
+
+      if (status == AMD_COMGR_STATUS_SUCCESS) {
+        // Get metadata from the executable
+        amd_comgr_metadata_node_t metadata;
+        status = amd_comgr_get_data_metadata(comgr_data, &metadata);
+
+        if (status == AMD_COMGR_STATUS_SUCCESS) {
+          // Helper lambda to lookup integer value from metadata map
+          auto lookup_int_value = [](amd_comgr_metadata_node_t root,
+                                     const char* key) -> int {
+            amd_comgr_metadata_node_t value_node;
+            amd_comgr_status_t s =
+                amd_comgr_metadata_lookup(root, key, &value_node);
+            if (s != AMD_COMGR_STATUS_SUCCESS) {
+              return 0;
+            }
+
+            size_t size = 0;
+            s = amd_comgr_get_metadata_string(value_node, &size, nullptr);
+            if (s != AMD_COMGR_STATUS_SUCCESS || size == 0) {
+              amd_comgr_destroy_metadata(value_node);
+              return 0;
+            }
+
+            std::string str_value(size, '\0');
+            s = amd_comgr_get_metadata_string(value_node, &size,
+                                              str_value.data());
+            amd_comgr_destroy_metadata(value_node);
+
+            if (s != AMD_COMGR_STATUS_SUCCESS) {
+              return 0;
+            }
+
+            // Parse the integer value
+            try {
+              return std::stoi(str_value);
+            } catch (...) {
+              return 0;
+            }
+          };
+
+          // Navigate to amdhsa.kernels array and check each kernel
+          amd_comgr_metadata_node_t kernels_node;
+          if (amd_comgr_metadata_lookup(metadata, "amdhsa.kernels",
+                                        &kernels_node) ==
+              AMD_COMGR_STATUS_SUCCESS) {
+            size_t kernel_count = 0;
+            amd_comgr_get_metadata_list_size(kernels_node, &kernel_count);
+
+            for (size_t i = 0; i < kernel_count; ++i) {
+              amd_comgr_metadata_node_t kernel_node;
+              if (amd_comgr_index_list_metadata(kernels_node, i,
+                                                &kernel_node) ==
+                  AMD_COMGR_STATUS_SUCCESS) {
+                // Get spill counts for this kernel
+                int kernel_sgpr_spill =
+                    lookup_int_value(kernel_node, ".sgpr_spill_count");
+                int kernel_vgpr_spill =
+                    lookup_int_value(kernel_node, ".vgpr_spill_count");
+                int kernel_private_size = lookup_int_value(
+                    kernel_node, ".private_segment_fixed_size");
+
+                // Aggregate max values across all kernels
+                sgpr_spill_count =
+                    std::max(sgpr_spill_count, kernel_sgpr_spill);
+                vgpr_spill_count =
+                    std::max(vgpr_spill_count, kernel_vgpr_spill);
+                private_segment_size =
+                    std::max(private_segment_size, kernel_private_size);
+
+                amd_comgr_destroy_metadata(kernel_node);
+              }
+            }
+            amd_comgr_destroy_metadata(kernels_node);
+          }
+
+          amd_comgr_destroy_metadata(metadata);
+        } else {
+          VLOG(2) << "Could not get HSACO metadata via amd_comgr";
+        }
+      }
+      amd_comgr_release_data(comgr_data);
+    } else {
+      VLOG(2) << "Could not create amd_comgr data object";
+    }
+
+    if (sgpr_spill_count > 0 || vgpr_spill_count > 0 ||
+        private_segment_size > 0) {
+      has_spilling = true;
+    }
+  }
+
+  if (has_spilling) {
+    VLOG(0) << "====== REGISTER SPILLING DETECTED ======";
+    VLOG(0) << "Module: " << module->getModuleIdentifier();
+    VLOG(0) << "SGPR spill count: " << sgpr_spill_count;
+    VLOG(0) << "VGPR spill count: " << vgpr_spill_count;
+    VLOG(0) << "Private segment size: " << private_segment_size << " bytes";
+    VLOG(0) << "Performance may be degraded due to register pressure";
+    VLOG(0) << "========================================";
+
+    // Filter out kernels with register spilling during autotuning
+    // This matches NVIDIA's behavior in ptx_compiler_impl.cc
+    // TODO: remove ptx from xla_gpu_fail_ptx_compilation_on_register_spilling
+    // to make the flag more general
+    if (debug_options.xla_gpu_fail_ptx_compilation_on_register_spilling()) {
+      return xla::Cancelled(
+          "Compilation result discarded due to register spilling");
+    }
+  } else {
+    VLOG(2) << "No register spilling detected";
+  }
+
+  // Clean up temp files
   if (!keep_tempfiles) {
     remove(ir_path.c_str());
     remove(isabin_path.c_str());
@@ -562,6 +701,34 @@ std::vector<std::string> GetAMDGPUBackendOptions(
                            backend_extra_llvm_opts.cbegin(),
                            backend_extra_llvm_opts.cend());
 
+  // Manually add LLVM debug options for register usage analysis
+  // Note: The disassembly-based spilling detection is now the primary method.
+  // These options are mainly useful for debugging the compiler itself.
+
+  // Uncomment if you want to see LLVM compilation details:
+
+  // Option 1: Enable LLVM statistics (aggregate stats, not per-kernel)
+  // backend_llvm_opts.push_back("-stats");
+
+  // Option 2: Print final machine code (very verbose)
+  // backend_llvm_opts.push_back("-print-after-all");
+
+  // Option 3: Print after register allocation (shows register assignments)
+  // backend_llvm_opts.push_back("-print-after=regallocfast");
+  // backend_llvm_opts.push_back("-print-after=regallocgreedy");
+
+  // Option 4: Enable pass timing (shows compilation time breakdown)
+  // backend_llvm_opts.push_back("-time-passes");
+
+  // Log the final LLVM options
+  if (!backend_llvm_opts.empty()) {
+    LOG(INFO) << "AMDGPU backend LLVM options (" << backend_llvm_opts.size()
+              << "):";
+    for (const auto& opt : backend_llvm_opts) {
+      LOG(INFO) << "  " << opt;
+    }
+  }
+
   return backend_llvm_opts;
 }
 
@@ -576,6 +743,10 @@ absl::StatusOr<std::vector<uint8_t>> CompileToHsaco(
   absl::call_once(backend_init_flag, AMDGPUBackendInit, debug_options,
                   rocdl_dir_path);
   auto llvm_opts = GetAMDGPUBackendOptions(debug_options);
+
+  VLOG(2) << "CompileToHsaco called for module: "
+          << module->getModuleIdentifier();
+
   llvm_ir::LLVMCommandLineOptionsLock llvm_lock(llvm_opts);
 
   std::vector<uint8_t> hsaco;
diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
index a2494c5d7d6b60..dae6b73eba50a6 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
@@ -393,7 +393,7 @@ ENTRY main {
         "kind":"__triton",
         "block_level_fusion_config":{
           "output_tiles":[{"sizes":["1","256000"]}],
-          "num_warps":"32",
+          "num_warps":"16",
           "num_ctas":"1",
           "num_stages":"1"}}}
 })",

From 29c25e0d6bbcec9c3067054680e962dd4a0765ca Mon Sep 17 00:00:00 2001
From: Marcin Radomski <dextero@google.com>
Date: Tue, 9 Dec 2025 05:36:53 -0800
Subject: [PATCH 077/753] [XLA] Use absl::StrCat instead of strings::StrCat

PiperOrigin-RevId: 842193625
---
 third_party/xla/xla/tsl/platform/cloud/BUILD  |  1 -
 .../platform/cloud/gcs_file_system_test.cc    | 36 +++++++++----------
 2 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/tsl/platform/cloud/BUILD b/third_party/xla/xla/tsl/platform/cloud/BUILD
index 4fbc7b0633da6f..cd3ca465817554 100644
--- a/third_party/xla/xla/tsl/platform/cloud/BUILD
+++ b/third_party/xla/xla/tsl/platform/cloud/BUILD
@@ -431,7 +431,6 @@ tsl_cc_test(
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:retrying_utils",
-        "@local_tsl//tsl/platform:strcat",
     ],
 )
 
diff --git a/third_party/xla/xla/tsl/platform/cloud/gcs_file_system_test.cc b/third_party/xla/xla/tsl/platform/cloud/gcs_file_system_test.cc
index 58db30d178d149..646ee9f1a8a68b 100644
--- a/third_party/xla/xla/tsl/platform/cloud/gcs_file_system_test.cc
+++ b/third_party/xla/xla/tsl/platform/cloud/gcs_file_system_test.cc
@@ -44,7 +44,6 @@ limitations under the License.
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/types.h"
 #include "tsl/platform/retrying_utils.h"
-#include "tsl/platform/strcat.h"
 
 // Undef DeleteFile macro defined in wndows.h.
 #ifdef PLATFORM_WINDOWS
@@ -1497,9 +1496,9 @@ TEST(GcsFileSystemTest, NewReadOnlyMemoryRegionFromFile) {
            "path%2Frandom_access.txt?fields=size%2Cgeneration%2Cupdated\n"
            "Auth Token: fake_token\n"
            "Timeouts: 5 1 10\n",
-           strings::StrCat("{\"size\": \"", content.size(), "\"",
-                           ", \"generation\": \"1\"",
-                           ", \"updated\": \"2016-04-29T23:15:24.896Z\"}")),
+           absl::StrCat("{\"size\": \"", content.size(), "\"",
+                        ", \"generation\": \"1\"",
+                        ", \"updated\": \"2016-04-29T23:15:24.896Z\"}")),
        new FakeHttpRequest(
            absl::StrCat("Uri: https://storage.googleapis.com/bucket/"
                         "path%2Frandom_access.txt\n"
@@ -4383,12 +4382,12 @@ TEST(GcsFileSystemTest, NewAppendableFile_MultipleFlushesWithoutCompose) {
                             "location"}}),
       // Uploads entire file again.
       new FakeHttpRequest(
-          strings::StrCat("Uri: https://custom/upload/location\n"
-                          "Auth Token: fake_token\n"
-                          "Header Content-Range: bytes 0-26/27\n"
-                          "Timeouts: 5 1 30\n"
-                          "Put body: ",
-                          contents[0], contents[1], contents[2], "\n"),
+          absl::StrCat("Uri: https://custom/upload/location\n"
+                       "Auth Token: fake_token\n"
+                       "Header Content-Range: bytes 0-26/27\n"
+                       "Timeouts: 5 1 30\n"
+                       "Put body: ",
+                       contents[0], contents[1], contents[2], "\n"),
           ""),
       new FakeHttpRequest(
           "Uri: https://www.googleapis.com/upload/storage/v1/b/bucket/o?"
@@ -4399,15 +4398,14 @@ TEST(GcsFileSystemTest, NewAppendableFile_MultipleFlushesWithoutCompose) {
           "Timeouts: 5 1 10\n",
           "", {{"Location", "https://custom/upload/location"}}),
       // Uploads entire file again.
-      new FakeHttpRequest(
-          strings::StrCat("Uri: https://custom/upload/location\n"
-                          "Auth Token: fake_token\n"
-                          "Header Content-Range: bytes 0-35/36\n"
-                          "Timeouts: 5 1 30\n"
-                          "Put body: ",
-                          contents[0], contents[1], contents[2], contents[3],
-                          "\n"),
-          ""),
+      new FakeHttpRequest(absl::StrCat("Uri: https://custom/upload/location\n"
+                                       "Auth Token: fake_token\n"
+                                       "Header Content-Range: bytes 0-35/36\n"
+                                       "Timeouts: 5 1 30\n"
+                                       "Put body: ",
+                                       contents[0], contents[1], contents[2],
+                                       contents[3], "\n"),
+                          ""),
   });
   GcsFileSystem fs(
       std::unique_ptr<AuthProvider>(new FakeAuthProvider),

From e3a3c04334fc280d096246f12e074404eb419376 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 06:29:51 -0800
Subject: [PATCH 078/753] Automated Code Change

PiperOrigin-RevId: 842209201
---
 third_party/xla/xla/tsl/concurrency/async_value_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/tsl/concurrency/async_value_test.cc b/third_party/xla/xla/tsl/concurrency/async_value_test.cc
index 57f968d6824057..75005391c3ef4b 100644
--- a/third_party/xla/xla/tsl/concurrency/async_value_test.cc
+++ b/third_party/xla/xla/tsl/concurrency/async_value_test.cc
@@ -178,7 +178,8 @@ TEST(AsyncValueTest, StackAllocatedAsyncValue) {
   EXPECT_TRUE(ptr.IsAvailable());
 
   // Check that when owner is destructed it calls the payload destructor.
-  std::make_unique<AsyncValueOwningRef<Payload>>(std::move(owner));
+  static_cast<void>(
+      std::make_unique<AsyncValueOwningRef<Payload>>(std::move(owner)));
   EXPECT_EQ(2, counter);
 }
 

From 80246cf0975f03be9b5f315fcfebe7029d8ff207 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Tue, 9 Dec 2025 07:57:12 -0800
Subject: [PATCH 079/753] [Autotuner] Log the per backend supported config
 count.

PiperOrigin-RevId: 842238520
---
 third_party/xla/xla/backends/autotuner/autotuner.cc | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/autotuner/autotuner.cc b/third_party/xla/xla/backends/autotuner/autotuner.cc
index 2c2dcb14750ef7..2d98a517270eb0 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner.cc
@@ -314,7 +314,8 @@ absl::StatusOr<Autotuner::Config> Autotuner::TuneBestConfig(
         absl::StrCat("Autotuner could not find any supported configs for HLO: ",
                      instr->ToString()));
   }
-  VLOG(1) << "Found " << supported_configs.size() << " supported configs.";
+  VLOG(1) << "Found total of " << supported_configs.size()
+          << " supported configs.";
 
   std::vector<absl::StatusOr<std::unique_ptr<Executable>>> executables =
       CompileAll(instr, supported_configs);
@@ -411,8 +412,13 @@ absl::StatusOr<std::vector<Autotuner::Config>> Autotuner::GetSupportedConfigs(
     absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
         per_backend_configs = codegen_backend->GetSupportedConfigs(*instr);
     if (!per_backend_configs.ok()) {
+      VLOG(3) << "Failed to get supported configs for backend "
+              << codegen_backend->name() << ": "
+              << per_backend_configs.status();
       continue;
     }
+    VLOG(3) << "Found of " << per_backend_configs->size()
+            << " supported configs for backend " << codegen_backend->name();
     for (auto& config : *per_backend_configs) {
       configs.push_back({codegen_backend.get(), std::move(config)});
     }

From 6f72793d606a8d8d680d9c286b198b6c533b5ade Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Tue, 9 Dec 2025 08:28:35 -0800
Subject: [PATCH 080/753] [XLA:GPU]Extract transpose normalization logic to
 utils

Pre-requisite to performing normalization OTF and remove the pass

No-op in terms of behavior

PiperOrigin-RevId: 842250914
---
 third_party/xla/xla/BUILD                     |   1 +
 .../transforms/transpose_dimension_grouper.cc | 121 +----------------
 third_party/xla/xla/shape_util.cc             |  95 ++++++++++++++
 third_party/xla/xla/shape_util.h              |  32 +++++
 third_party/xla/xla/shape_util_test.cc        | 124 ++++++++++++++++++
 5 files changed, 254 insertions(+), 119 deletions(-)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index d07d329e7c2dca..e653fc2cc8b7e3 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -565,6 +565,7 @@ xla_cc_test(
         "//xla/tsl/platform:test_benchmark",
         "//xla/tsl/platform:test_main",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc b/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
index d6dfc08863b5a2..26d9a8f87049c0 100644
--- a/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
+++ b/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <functional>
-#include <numeric>
 #include <vector>
 
 #include "absl/container/flat_hash_set.h"
@@ -42,126 +41,10 @@ namespace xla {
 namespace gpu {
 
 namespace {
-// Returns the indices of the first elements of all consecutive subarrays of the
-// given array. For example:
-// ConsecutiveSegments({m, m+1, m+2, n, k, k+1}) = {0, 3, 4}
-absl::InlinedVector<size_t, 3> ConsecutiveSegments(
-    absl::Span<const int64_t> xs) {
-  absl::InlinedVector<size_t, 3> is = {0};
-  for (size_t i = 1; i < xs.size(); ++i) {
-    if (1 != xs[i] - xs[i - 1]) {
-      is.push_back(i);
-    }
-  }
-  return is;
-}
-
-// Merges the sequences of dimensions of the given shape which start at the
-// given indices `segs`.
-Shape MergeDimensions(absl::Span<const size_t> segs, const Shape &shape) {
-  std::vector<int64_t> dimensions;
-  const auto size = segs.size();
-  dimensions.reserve(size);
-  for (size_t i = 1; i <= size; ++i) {
-    dimensions.push_back(std::accumulate(
-        shape.dimensions().begin() + segs[i - 1],
-        shape.dimensions().begin() +
-            (segs.size() == i ? shape.dimensions().size() : segs[i]),
-        int64_t{1}, std::multiplies<int64_t>()));
-  }
-  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
-                                                  dimensions);
-}
-
-absl::InlinedVector<int64_t, 3> GetNormalizedTransposeShapeHelper(
-    const Shape &output_shape, absl::Span<int64_t const> output_to_input,
-    absl::InlinedVector<int64_t, 3> &permutation) {
-  absl::InlinedVector<size_t, 3> segments =
-      ConsecutiveSegments(output_to_input);
-  Shape normalized_shape = MergeDimensions(segments, output_shape);
-  absl::InlinedVector<int64_t, 3> normalized_dims(
-      normalized_shape.dimensions().begin(),
-      normalized_shape.dimensions().end());
-  if (segments.size() == 1) {
-    return normalized_dims;
-  }
-  // Derive the permutation from the segments.
-  std::vector<int64_t> segment_to_normalized_dim(
-      output_shape.dimensions().size(), -1);
-  for (size_t segment : segments) {
-    segment_to_normalized_dim[output_to_input[segment]] = 0;
-  }
-  int64_t normalized_dim = 0;
-  for (int64_t i = 0; i < segment_to_normalized_dim.size(); ++i) {
-    if (segment_to_normalized_dim[i] >= 0) {
-      segment_to_normalized_dim[i] = normalized_dim++;
-    }
-  }
-  permutation.reserve(segments.size());
-  for (int64_t i = 0; i < segments.size(); ++i) {
-    permutation.push_back(
-        segment_to_normalized_dim[output_to_input[segments[i]]]);
-  }
-  return normalized_dims;
-}
-
-// In this case, we care about transposes that permute dimensions of a shape
-// that can be viewed as several logical components in the order of major to
-// minor. As an example, let's consider a 0-2-1 transpose:
-//
-// If a shape can be viewed as three logical components 0-1-2 in the order of
-// major to minor, a 0-2-1-transpose changes the order of such logical
-// components to 0-2-1. We call the shape being transposed the input shape and
-// the transposed shape the output shape. The logical view of the input/output
-// shapes for the transpose are called the 0-1-2/0-2-1 shapes or the normalized
-// shapes. The original input/output shapes are called unnormalized shapes.
-//
-// 'output_shape' should have the default layout (enforced by the caller).
-//
-// 'dimensions' specifies the kind of the unnormalized transpose and defines the
-// permutation of the input shape that will result in the provided output shape.
-// So to compute the input shape, we need to apply the inverse permutation of
-// 'dimensions'.
-//
-// 'permutation' is an output parameter and specifies the kind of the normalized
-// transpose.
-//
-// The method returns the dimensions for the normalized transpose shape.
-//
-// Example: Suppose the unnormalized output shape is [32, 1, 10, 11], and
-// 'dimensions' is set to {3, 1, 0, 2}. This means the corresponding input shape
-// is [10, 1, 11, 32]. The normalized output shape is [32, 110] with
-// 'permutation' set to {1,0}.
-absl::InlinedVector<int64_t, 3> GetNormalizedLogicalTransposeShape(
-    const Shape &output_shape, absl::Span<int64_t const> dimensions,
-    absl::InlinedVector<int64_t, 3> &permutation) {
-  permutation.clear();
-  // Drop degenerate dimensions.
-  absl::InlinedVector<int64_t, 3> delta(output_shape.dimensions().size() + 1,
-                                        0);
-  auto input_dimensions =
-      Permute(output_shape.dimensions(), InversePermutation(dimensions));
-  for (int i = 0; i < output_shape.dimensions().size(); ++i) {
-    delta[i + 1] = delta[i];
-    if (input_dimensions[i] == static_cast<int64_t>(1)) {
-      ++delta[i + 1];
-    }
-  }
-  absl::InlinedVector<int64_t, 3> new_dimensions;
-  for (int i = 0; i < dimensions.size(); i++) {
-    if (output_shape.dimensions(i) != 1) {
-      new_dimensions.push_back(dimensions[i] - delta[dimensions[i]]);
-    }
-  }
-
-  return GetNormalizedTransposeShapeHelper(
-      ShapeUtil::DropDegenerateDimensions(output_shape), new_dimensions,
-      permutation);
-}
 
 class TransposeDimensionGroupVisitor : public DfsHloRewriteVisitor {
  public:
-  absl::Status HandleTranspose(HloInstruction *transpose) override {
+  absl::Status HandleTranspose(HloInstruction* transpose) override {
     VLOG(4) << "Input: " << transpose->ToString();
     if (!LayoutUtil::IsMonotonicWithDim0Major(transpose->shape().layout()) ||
         !LayoutUtil::IsMonotonicWithDim0Major(
@@ -174,7 +57,7 @@ class TransposeDimensionGroupVisitor : public DfsHloRewriteVisitor {
           "transpose and its operand");
     }
     absl::InlinedVector<int64_t, 3> permutation;
-    auto normalized_dims = GetNormalizedLogicalTransposeShape(
+    auto normalized_dims = ShapeUtil::GetNormalizedLogicalTransposeShape(
         transpose->shape(), transpose->dimensions(), permutation);
     if (normalized_dims.size() == 1 ||
         normalized_dims == transpose->shape().dimensions()) {
diff --git a/third_party/xla/xla/shape_util.cc b/third_party/xla/xla/shape_util.cc
index 321f9dfffab4cd..1fbea10079413b 100644
--- a/third_party/xla/xla/shape_util.cc
+++ b/third_party/xla/xla/shape_util.cc
@@ -2336,6 +2336,101 @@ int64_t ShapeUtil::ForEachState::CalculateNumSteps() const {
   });
 }
 
+namespace {
+
+// Returns the indices of the first elements of all consecutive subarrays of the
+// given array. For example:
+// ConsecutiveSegments({m, m+1, m+2, n, k, k+1}) = {0, 3, 4}
+absl::InlinedVector<size_t, 3> ConsecutiveSegments(
+    absl::Span<const int64_t> xs) {
+  absl::InlinedVector<size_t, 3> is = {0};
+  for (size_t i = 1; i < xs.size(); ++i) {
+    if (1 != xs[i] - xs[i - 1]) {
+      is.push_back(i);
+    }
+  }
+  return is;
+}
+
+// Merges the sequences of dimensions of the given shape which start at the
+// given indices `segs`.
+Shape MergeDimensions(absl::Span<const size_t> segs, const Shape& shape) {
+  std::vector<int64_t> dimensions;
+  const auto size = segs.size();
+  dimensions.reserve(size);
+  for (size_t i = 1; i <= size; ++i) {
+    dimensions.push_back(std::accumulate(
+        shape.dimensions().begin() + segs[i - 1],
+        shape.dimensions().begin() +
+            (segs.size() == i ? shape.dimensions().size() : segs[i]),
+        int64_t{1}, std::multiplies<int64_t>()));
+  }
+  return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
+                                                  dimensions);
+}
+
+absl::InlinedVector<int64_t, 3> GetNormalizedTransposeShapeHelper(
+    const Shape& output_shape, absl::Span<int64_t const> output_to_input,
+    absl::InlinedVector<int64_t, 3>& permutation) {
+  absl::InlinedVector<size_t, 3> segments =
+      ConsecutiveSegments(output_to_input);
+  Shape normalized_shape = MergeDimensions(segments, output_shape);
+  absl::InlinedVector<int64_t, 3> normalized_dims(
+      normalized_shape.dimensions().begin(),
+      normalized_shape.dimensions().end());
+  if (segments.size() == 1) {
+    return normalized_dims;
+  }
+  // Derive the permutation from the segments.
+  std::vector<int64_t> segment_to_normalized_dim(
+      output_shape.dimensions().size(), -1);
+  for (size_t segment : segments) {
+    segment_to_normalized_dim[output_to_input[segment]] = 0;
+  }
+  int64_t normalized_dim = 0;
+  for (int64_t i = 0; i < segment_to_normalized_dim.size(); ++i) {
+    if (segment_to_normalized_dim[i] >= 0) {
+      segment_to_normalized_dim[i] = normalized_dim++;
+    }
+  }
+  permutation.reserve(segments.size());
+  for (int64_t i = 0; i < segments.size(); ++i) {
+    permutation.push_back(
+        segment_to_normalized_dim[output_to_input[segments[i]]]);
+  }
+  return normalized_dims;
+}
+
+}  // namespace
+
+/*static*/ absl::InlinedVector<int64_t, 3>
+ShapeUtil::GetNormalizedLogicalTransposeShape(
+    const Shape& output_shape, absl::Span<int64_t const> dimensions,
+    absl::InlinedVector<int64_t, 3>& permutation) {
+  permutation.clear();
+  // Drop degenerate dimensions.
+  absl::InlinedVector<int64_t, 3> delta(output_shape.dimensions().size() + 1,
+                                        0);
+  auto input_dimensions =
+      Permute(output_shape.dimensions(), InversePermutation(dimensions));
+  for (int i = 0; i < output_shape.dimensions().size(); ++i) {
+    delta[i + 1] = delta[i];
+    if (input_dimensions[i] == static_cast<int64_t>(1)) {
+      ++delta[i + 1];
+    }
+  }
+  absl::InlinedVector<int64_t, 3> new_dimensions;
+  for (int i = 0; i < dimensions.size(); i++) {
+    if (output_shape.dimensions(i) != 1) {
+      new_dimensions.push_back(dimensions[i] - delta[dimensions[i]]);
+    }
+  }
+
+  return GetNormalizedTransposeShapeHelper(
+      ShapeUtil::DropDegenerateDimensions(output_shape), new_dimensions,
+      permutation);
+}
+
 /*static*/ void ShapeUtil::FlattenTupleShape(
     const Shape& shape, std::vector<const Shape*>& flattened) {
   if (shape.IsTuple()) {
diff --git a/third_party/xla/xla/shape_util.h b/third_party/xla/xla/shape_util.h
index fde70d0dd22ef5..12cd8e59bd58c7 100644
--- a/third_party/xla/xla/shape_util.h
+++ b/third_party/xla/xla/shape_util.h
@@ -435,6 +435,38 @@ class ShapeUtil {
   static bool IsEffectivelyMostMajorDimension(const Shape& shape,
                                               int64_t dimension);
 
+  // In this case, we care about transposes that permute dimensions of a shape
+  // that can be viewed as several logical components in the order of major to
+  // minor. As an example, let's consider a 0-2-1 transpose:
+  //
+  // If a shape can be viewed as three logical components 0-1-2 in the order of
+  // major to minor, a 0-2-1-transpose changes the order of such logical
+  // components to 0-2-1. We call the shape being transposed the input shape and
+  // the transposed shape the output shape. The logical view of the input/output
+  // shapes for the transpose are called the 0-1-2/0-2-1 shapes or the
+  // normalized shapes. The original input/output shapes are called unnormalized
+  // shapes.
+  //
+  // 'output_shape' should have the default layout (enforced by the caller).
+  //
+  // 'dimensions' specifies the kind of the unnormalized transpose and defines
+  // the permutation of the input shape that will result in the provided output
+  // shape. So to compute the input shape, we need to apply the inverse
+  // permutation of 'dimensions'.
+  //
+  // 'permutation' is an output parameter and specifies the kind of the
+  // normalized transpose.
+  //
+  // The method returns the dimensions for the normalized transpose shape.
+  //
+  // Example: Suppose the unnormalized output shape is [32, 1, 10, 11], and
+  // 'dimensions' is set to {3, 1, 0, 2}. This means the corresponding input
+  // shape is [10, 1, 11, 32]. The normalized output shape is [32, 110] with
+  // 'permutation' set to {1,0}.
+  static absl::InlinedVector<int64_t, 3> GetNormalizedLogicalTransposeShape(
+      const Shape& output_shape, absl::Span<int64_t const> dimensions,
+      absl::InlinedVector<int64_t, 3>& permutation);
+
   // Returns an empty tuple shape. Can be used as a sentinel Shape value.
   static Shape MakeNil() { return Shape(std::vector<Shape>{}); }
 
diff --git a/third_party/xla/xla/shape_util_test.cc b/third_party/xla/xla/shape_util_test.cc
index e1015ca2dc7778..265e36b839b289 100644
--- a/third_party/xla/xla/shape_util_test.cc
+++ b/third_party/xla/xla/shape_util_test.cc
@@ -22,8 +22,10 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/algorithm/container.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
@@ -45,6 +47,7 @@ namespace xla {
 namespace {
 
 using ::testing::ElementsAre;
+using ::testing::IsEmpty;
 
 TEST(ShapeUtilTest, GetDimensionHelperCanNegativeIndex) {
   Shape matrix = ShapeUtil::MakeShape(F32, {2, 3});
@@ -1776,5 +1779,126 @@ void BM_ForEachIndexNoStatus(::testing::benchmark::State& state) {
 
 BENCHMARK(BM_ForEachIndexNoStatus)->Arg(0)->Arg(1)->Arg(2);
 
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {32, 1, 10, 11});
+  absl::InlinedVector<int64_t, 3> dimensions = {3, 1, 0, 2};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(32, 110));
+  EXPECT_THAT(permutation, ElementsAre(1, 0));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape2) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {20, 30, 50});
+  absl::InlinedVector<int64_t, 3> dimensions = {1, 2, 0};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(600, 50));
+  EXPECT_THAT(permutation, ElementsAre(1, 0));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NoTranspose) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {64, 1, 128});
+  absl::InlinedVector<int64_t, 3> dimensions = {0, 2, 1};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(8192));
+  EXPECT_THAT(permutation, IsEmpty());
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple2D) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {64, 128});
+  absl::InlinedVector<int64_t, 3> dimensions = {1, 0};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(64, 128));
+  EXPECT_THAT(permutation, ElementsAre(1, 0));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple3D_021) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 32768});
+  absl::InlinedVector<int64_t, 3> dimensions = {0, 2, 1};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(8, 16, 32768));
+  EXPECT_THAT(permutation, ElementsAre(0, 2, 1));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple3D_210) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {16, 32768, 8});
+  absl::InlinedVector<int64_t, 3> dimensions = {2, 1, 0};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(16, 32768, 8));
+  EXPECT_THAT(permutation, ElementsAre(2, 1, 0));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple4D) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {16, 32768, 8, 4});
+  absl::InlinedVector<int64_t, 3> dimensions = {2, 0, 3, 1};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(16, 32768, 8, 4));
+  EXPECT_THAT(permutation, ElementsAre(2, 0, 3, 1));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NormalizeTo3D) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 32, 32, 32});
+  absl::InlinedVector<int64_t, 3> dimensions = {0, 4, 1, 2, 3};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(8, 16, 32768));
+  EXPECT_THAT(permutation, ElementsAre(0, 2, 1));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_LargeShapeSizeOverflow) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {16, 4096, 4096, 128});
+  absl::InlinedVector<int64_t, 3> dimensions = {3, 0, 1, 2};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(16, 2147483648));
+  EXPECT_THAT(permutation, ElementsAre(1, 0));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_DegenerateDims) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {1, 32, 1, 64, 1, 3, 1});
+  absl::InlinedVector<int64_t, 3> dimensions = {6, 1, 4, 5, 2, 3, 0};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(32, 64, 3));
+  EXPECT_THAT(permutation, ElementsAre(0, 2, 1));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_TransposeWithGrouping) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {10, 1, 32, 100, 2});
+  absl::InlinedVector<int64_t, 3> dimensions = {2, 1, 3, 0, 4};
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      output_shape, dimensions, permutation);
+
+  EXPECT_THAT(normalized_shape, ElementsAre(320, 100, 2));
+  EXPECT_THAT(permutation, ElementsAre(1, 0, 2));
+}
+
 }  // namespace
 }  // namespace xla

From 37b13b82702bc818c1f0c83e9b45515890eed9d1 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Tue, 9 Dec 2025 08:33:30 -0800
Subject: [PATCH 081/753] (2/N) Add support for `NamedSharding` in existing
 `HloShardingUtil` methods. Remaining methods will be updated in follow up
 cl's.

PiperOrigin-RevId: 842252683
---
 third_party/xla/xla/hlo/ir/hlo_sharding.h     | 27 +++----
 third_party/xla/xla/hlo/ir/named_sharding.h   | 10 ++-
 third_party/xla/xla/hlo/utils/BUILD           |  4 ++
 .../xla/xla/hlo/utils/hlo_sharding_util.cc    | 70 +++++++++++++++++++
 .../xla/xla/hlo/utils/hlo_sharding_util.h     | 26 +++++--
 .../xla/hlo/utils/hlo_sharding_util_test.cc   | 55 +++++++++++++++
 .../xla/service/spmd/spmd_partitioner_util.cc |  2 +-
 7 files changed, 171 insertions(+), 23 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_sharding.h b/third_party/xla/xla/hlo/ir/hlo_sharding.h
index d4a0515e931146..488dfdb2793421 100644
--- a/third_party/xla/xla/hlo/ir/hlo_sharding.h
+++ b/third_party/xla/xla/hlo/ir/hlo_sharding.h
@@ -130,16 +130,6 @@ class HloSharding {
                        metadata);
   }
 
-  explicit HloSharding(NamedSharding named_sharding)
-      : replicated_(false),
-        maximal_(false),
-        tuple_(false),
-        manual_(false),
-        unknown_(false),
-        unreduced_(false),
-        replicate_on_last_tile_dim_(false),
-        named_sharding_(std::move(named_sharding)) {}
-
   // Creates a subgroup sharding with device-level tile assignment, the
   // sharding type of each subgroup is defined by subgroup_types. When creating
   // the HloSharding, subgroup dims of the same type will be merged.
@@ -493,6 +483,11 @@ class HloSharding {
   // REQUIRES: !IsReplicated() && !IsTuple()
   const TileAssignment& tile_assignment() const { return tile_assignment_; }
 
+  const NamedSharding& named_sharding() const {
+    CHECK(UseNamedShardingLeaf());
+    return named_sharding_.value();
+  }
+
   // Returns the number of dimensions.
   int64_t num_dimensions() const { return tile_assignment().num_dimensions(); }
 
@@ -668,9 +663,15 @@ class HloSharding {
 
   const ShardGroup& GetShardGroup() const { return shard_group_; }
 
-  std::optional<NamedSharding> named_sharding() const {
-    return named_sharding_;
-  }
+  explicit HloSharding(NamedSharding named_sharding)
+      : replicated_(false),
+        maximal_(false),
+        tuple_(false),
+        manual_(false),
+        unknown_(false),
+        unreduced_(false),
+        replicate_on_last_tile_dim_(false),
+        named_sharding_(std::move(named_sharding)) {}
 
  private:
   explicit HloSharding(bool manual, bool replicated, bool unknown,
diff --git a/third_party/xla/xla/hlo/ir/named_sharding.h b/third_party/xla/xla/hlo/ir/named_sharding.h
index bfdc9966c0b15d..01ab052d24a22b 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.h
+++ b/third_party/xla/xla/hlo/ir/named_sharding.h
@@ -64,8 +64,6 @@ class NamedSharding {
     return !(*this == other);
   }
 
-  const Mesh& mesh() const { return mesh_; }
-
   // TODO(b/456212087): Add validation checks
   explicit NamedSharding(Mesh mesh,
                          absl::Span<const DimensionSharding> dim_shardings = {},
@@ -78,6 +76,14 @@ class NamedSharding {
         unreduced_axes_(unreduced_axes.begin(), unreduced_axes.end()),
         metadata_(metadata.begin(), metadata.end()) {}
 
+  const Mesh& mesh() const { return mesh_; }
+  absl::Span<const DimensionSharding> dim_shardings() const {
+    return dim_shardings_;
+  }
+  absl::Span<const AxisRef> replicated_axes() const { return replicated_axes_; }
+  absl::Span<const AxisRef> unreduced_axes() const { return unreduced_axes_; }
+  absl::Span<const OpMetadata> metadata() const { return metadata_; }
+
  private:
   friend class HloSharding;
 
diff --git a/third_party/xla/xla/hlo/utils/BUILD b/third_party/xla/xla/hlo/utils/BUILD
index 534fdbcafb1d58..66f7bf731b8dad 100644
--- a/third_party/xla/xla/hlo/utils/BUILD
+++ b/third_party/xla/xla/hlo/utils/BUILD
@@ -151,6 +151,8 @@ cc_library(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/ir:mesh_and_axis",
+        "//xla/hlo/ir:named_sharding",
         "//xla/hlo/ir:tile_assignment",
         "//xla/service:call_graph",
         "//xla/service:dot_as_convolution_util",
@@ -184,6 +186,8 @@ xla_cc_test(
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/ir:mesh_and_axis",
+        "//xla/hlo/ir:named_sharding",
         "//xla/hlo/ir:tile_assignment",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:test",
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index 1644a384510159..c8ac759cf20973 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -48,6 +48,8 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/hlo/ir/mesh_and_axis.h"
+#include "xla/hlo/ir/named_sharding.h"
 #include "xla/hlo/ir/tile_assignment.h"
 #include "xla/hlo/utils/hlo_container_util.h"
 #include "xla/layout.h"
@@ -1080,6 +1082,21 @@ HloSharding PropagateShardingAlongDimsAndReplicateOthers(
     return source_sharding;
   }
 
+  if (source_sharding.UseNamedShardingLeaf()) {
+    std::vector<NamedSharding::DimensionSharding> target_dim_shardings(
+        target_shape_rank);
+    for (int i = 0; i < source_dims.size(); ++i) {
+      target_dim_shardings[target_dims[i]] =
+          source_sharding.named_sharding().dim_shardings()[source_dims[i]];
+    }
+
+    return HloSharding(NamedSharding(
+        source_sharding.named_sharding().mesh(), target_dim_shardings,
+        source_sharding.named_sharding().replicated_axes(),
+        source_sharding.named_sharding().unreduced_axes(),
+        source_sharding.named_sharding().metadata()));
+  }
+
   HloSharding replicate_other_dims =
       PartiallyReplicateTiledShardingOnAllDimsExcept(source_sharding,
                                                      source_dims);
@@ -1493,6 +1510,22 @@ HloSharding PartiallyReplicateTiledShardingOnDims(
   if (sharding.IsTileMaximal() || sharding.IsManual()) {
     return sharding;
   }
+
+  if (sharding.UseNamedShardingLeaf()) {
+    std::vector<NamedSharding::DimensionSharding> dim_shardings(
+        sharding.named_sharding().dim_shardings().begin(),
+        sharding.named_sharding().dim_shardings().end());
+    for (int64_t dim : dims_to_replicate) {
+      if (dim < dim_shardings.size()) {
+        dim_shardings[dim] = NamedSharding::DimensionSharding();
+      }
+    }
+    return HloSharding(NamedSharding(
+        sharding.named_sharding().mesh(), dim_shardings,
+        sharding.named_sharding().replicated_axes(),
+        sharding.named_sharding().unreduced_axes(), sharding.metadata()));
+  }
+
   int64_t group_count = 1;
   DimensionVector valid_dims_to_replicate;
   for (int64_t dim : dims_to_replicate) {
@@ -1555,6 +1588,15 @@ HloSharding PartiallyReplicateTiledShardingOnAllDimsExcept(
 
 HloSharding ReplicateAllDataDims(const HloSharding& sharding,
                                  int64_t data_rank) {
+  if (sharding.UseNamedShardingLeaf()) {
+    std::vector<NamedSharding::DimensionSharding> dim_shardings(
+        data_rank >= 0 ? data_rank : sharding.num_dimensions());
+    return HloSharding(NamedSharding(
+        sharding.named_sharding().mesh(), dim_shardings,
+        sharding.named_sharding().replicated_axes(),
+        sharding.named_sharding().unreduced_axes(), sharding.metadata()));
+  }
+
   if (sharding.IsManual()) {
     return sharding;
   }
@@ -1580,6 +1622,34 @@ HloSharding RemoveShapeDimensions(const HloSharding& sharding,
   if (sharding.IsTileMaximal() || dims_to_remove.empty()) {
     return sharding;
   }
+
+  if (sharding.UseNamedShardingLeaf()) {
+    // Check to ensure subgroup dimensions are not passed in dims_to_remove as
+    // named sharding doesn't handle them as part of dim_shardings but separate
+    // replicated, unreduced axes as opposed to tile hlo sharding format which
+    // uses tile dimensions to represent subgroup dimensions as well.
+    DCHECK(
+        std::all_of(dims_to_remove.begin(), dims_to_remove.end(),
+                    [&](int64_t i) { return i < sharding.num_dimensions(); }));
+
+    std::vector<NamedSharding::DimensionSharding> new_dim_shardings;
+    new_dim_shardings.reserve(sharding.num_dimensions() -
+                              dims_to_remove.size());
+    for (int64_t i = 0; i < sharding.num_dimensions(); ++i) {
+      if (absl::c_linear_search(dims_to_remove, i)) {
+        CHECK_EQ(sharding.dimension(i), 1);
+      } else {
+        new_dim_shardings.push_back(
+            sharding.named_sharding().dim_shardings()[i]);
+      }
+    }
+
+    return HloSharding(NamedSharding(
+        sharding.named_sharding().mesh(), new_dim_shardings,
+        sharding.named_sharding().replicated_axes(),
+        sharding.named_sharding().unreduced_axes(), sharding.metadata()));
+  }
+
   DimensionVector new_tile_shape;
   new_tile_shape.reserve(sharding.num_dimensions() - dims_to_remove.size());
   for (int64_t i = 0; i < sharding.num_dimensions(); ++i) {
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
index c5164f0be6e26f..1f521eedaa8006 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
@@ -251,9 +251,10 @@ HloSharding PartiallyReplicateTiledShardingOnAllDimsExcept(
 HloSharding ReplicateAllDataDims(const HloSharding& sharding,
                                  int64_t data_rank = -1);
 
-// Returns a sharding the removes given tile dimensions.
+// Returns a sharding that removes given sharding dimensions.
 //
-// Precondition: if not tile maximal, the size of each tile dimension must be 1.
+// Precondition: if not tile maximal, the size of each sharding dimension must
+// be 1.
 HloSharding RemoveShapeDimensions(const HloSharding& sharding,
                                   absl::Span<const int64_t> dims_to_remove);
 
@@ -264,12 +265,13 @@ std::optional<HloSharding> TransposeShardingWithCollapsedDims(
     const HloSharding& source, absl::Span<int64_t const> src_to_tgt,
     absl::Span<int64_t const> tgt_to_src);
 
-// Given a `source_sharding`, preserve the tiles along the `source_dims` and
-// replicate the rest. The `target_dims` are used to determine the order of the
-// dimensions in the resulting sharding. If `source_dims` and `target_dims` are
-// in the different order (i.e., different ArgSort results), we need to
-// transpose the tile assignment.
+// Given a `source_sharding`, preserve the dimensions along the `source_dims`
+// and replicate the rest. The `target_dims` are used to determine the order of
+// the dimensions in the resulting sharding.
 //
+// [For tiled sharding format] If `source_dims` and `target_dims` are in the
+// different order (i.e., different ArgSort results), we need to transpose the
+// tile assignment.
 // Given the following input,
 //   * source_sharding = {devices=[2,3,5,7,11]<=[2310]}
 //   * source_dims = [2, 4, 1]
@@ -277,6 +279,16 @@ std::optional<HloSharding> TransposeShardingWithCollapsedDims(
 //   * target_shape_rank = 5
 // The result shoule be {devices=[1,11,5,3,1,14]<=[2,3,5,7,11]T(4,2,1,0,3)
 // last_tile_dim_replicate}.
+//
+// [For named sharding format]
+// Given the following input,
+//   * mesh = Mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+//   * source_sharding = NamedSharding(mesh, {{"a"}, {"b"}, {"c"}, {"d"},
+//   {"e"}})
+//   * source_dims = [2, 4, 1]
+//   * target_dims = [2, 1, 3]
+//   * target_shape_rank = 5
+// The result shoule be NamedSharding(mesh, {{}, {"e"}, {"c"}, {"b"}, {}})
 HloSharding PropagateShardingAlongDimsAndReplicateOthers(
     const HloSharding& source_sharding, absl::Span<const int64_t> source_dims,
     absl::Span<const int64_t> target_dims, int64_t target_shape_rank);
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
index ecaad635b7a440..ab7a203e0d2ae1 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/array.h"
 #include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/hlo/ir/mesh_and_axis.h"
+#include "xla/hlo/ir/named_sharding.h"
 #include "xla/hlo/ir/tile_assignment.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/test.h"
@@ -566,6 +568,18 @@ TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers1) {
   HloSharding expected = HloSharding::PartialTile(
       TileAssignment({1, 11, 5, 3, 1, 14}, {2, 3, 5, 7, 11}, {4, 2, 1, 0, 3}));
   EXPECT_EQ(target_sharding, expected);
+
+  {
+    Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+    NamedSharding source_sharding =
+        test_utils::FromAxisNames(mesh, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}});
+    HloSharding target_sharding = PropagateShardingAlongDimsAndReplicateOthers(
+        HloSharding(source_sharding), source_dims, target_dims,
+        target_shape_rank);
+    NamedSharding expected =
+        test_utils::FromAxisNames(mesh, {{}, {"e"}, {"c"}, {"b"}, {}});
+    EXPECT_EQ(target_sharding.named_sharding(), expected);
+  }
 }
 
 TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers2) {
@@ -578,6 +592,18 @@ TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers2) {
   HloSharding expected = HloSharding::PartialTile(
       TileAssignment({2, 5, 11, 21}, {2, 3, 5, 7, 11}, {0, 2, 4, 1, 3}));
   EXPECT_EQ(target_sharding, expected);
+
+  {
+    Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+    NamedSharding source_sharding =
+        test_utils::FromAxisNames(mesh, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}});
+    HloSharding target_sharding = PropagateShardingAlongDimsAndReplicateOthers(
+        HloSharding(source_sharding), source_dims, target_dims,
+        target_shape_rank);
+    NamedSharding expected =
+        test_utils::FromAxisNames(mesh, {{"a"}, {"c"}, {"e"}});
+    EXPECT_EQ(target_sharding.named_sharding(), expected);
+  }
 }
 
 TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers3) {
@@ -590,6 +616,35 @@ TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers3) {
   HloSharding expected = HloSharding::PartialTile(
       TileAssignment({11, 7, 1, 3, 10}, {2, 3, 5, 7, 11}, {4, 3, 1, 0, 2}));
   EXPECT_EQ(target_sharding, expected);
+
+  {
+    Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+    NamedSharding source_sharding =
+        test_utils::FromAxisNames(mesh, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}});
+    HloSharding target_sharding = PropagateShardingAlongDimsAndReplicateOthers(
+        HloSharding(source_sharding), source_dims, target_dims,
+        target_shape_rank);
+    NamedSharding expected =
+        test_utils::FromAxisNames(mesh, {{"e"}, {"d"}, {}, {"b"}});
+    EXPECT_EQ(target_sharding.named_sharding(), expected);
+  }
+}
+
+TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers4) {
+  Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+  NamedSharding source_sharding =
+      test_utils::FromAxisNames(mesh, {{"a"}, {"c", "b"}, {}, {"d"}, {}}, {},
+                                /*unreduced_axes=*/{"e"});
+  std::vector<int64_t> source_dims = {2, 1, 3};
+  std::vector<int64_t> target_dims = {0, 3, 1};
+  int64_t target_shape_rank = 4;
+  HloSharding target_sharding = PropagateShardingAlongDimsAndReplicateOthers(
+      HloSharding(source_sharding), source_dims, target_dims,
+      target_shape_rank);
+  NamedSharding expected =
+      test_utils::FromAxisNames(mesh, {{}, {"d"}, {}, {"c", "b"}}, {},
+                                /*unreduced_axes=*/{"e"});
+  EXPECT_EQ(target_sharding.named_sharding(), expected);
 }
 
 TEST(HloShardingUtilTest, MergeManualSubgroupSharding) {
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
index d87ef705b8185c..8e79e7c16d2e84 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
@@ -3053,7 +3053,7 @@ std::optional<IotaReplicaGroupList> GetIotaPartitionGroupsForReplication(
 std::optional<Mesh> GetMeshFromSharding(const HloSharding& sharding) {
   // For V3 shardings, use the mesh associated with the named sharding.
   if (sharding.UseNamedShardingLeaf()) {
-    return sharding.named_sharding()->mesh();
+    return sharding.named_sharding().mesh();
   }
 
   // For V2 shardings, create the mesh from the tile assignment.

From 2009f3930d65c32cf010cd685ce9a988b831ef67 Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Tue, 9 Dec 2025 08:33:49 -0800
Subject: [PATCH 082/753] [XLA:CPU] Add missing vectorization sizes from tanh
 and exp approximation.

PiperOrigin-RevId: 842252816
---
 .../cpu/codegen/polynomial_approximations.cc        |  7 ++++---
 third_party/xla/xla/codegen/intrinsic/tanh.h        | 13 +++++++------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc b/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc
index 0c8084568e41c5..947566d3de8715 100644
--- a/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/polynomial_approximations.cc
@@ -521,12 +521,13 @@ void RewriteToPolynomialApproximations(llvm::Module* module,
 
   rewrite_calls("expf", GenerateVF32Exp, /*vector_width=*/1);
   rewrite_calls("llvm.exp.f32", GenerateVF32Exp, /*vector_width=*/1);
-  rewrite_calls(kExpV4F32Sym, GenerateVF32Exp, /*vector_width=*/4);
+  rewrite_calls("llvm.exp.v2f32", GenerateVF32Exp, /*vector_width=*/2);
   rewrite_calls("llvm.exp.v4f32", GenerateVF32Exp, /*vector_width=*/4);
-  rewrite_calls(kExpV8F32Sym, GenerateVF32Exp, /*vector_width=*/8);
   rewrite_calls("llvm.exp.v8f32", GenerateVF32Exp, /*vector_width=*/8);
-  rewrite_calls(kExpV16F32Sym, GenerateVF32Exp, /*vector_width=*/16);
   rewrite_calls("llvm.exp.v16f32", GenerateVF32Exp, /*vector_width=*/16);
+  rewrite_calls(kExpV4F32Sym, GenerateVF32Exp, /*vector_width=*/4);
+  rewrite_calls(kExpV8F32Sym, GenerateVF32Exp, /*vector_width=*/8);
+  rewrite_calls(kExpV16F32Sym, GenerateVF32Exp, /*vector_width=*/16);
 
   rewrite_calls("llvm.exp.f16", UpcastF16ToF32<GenerateVF32Exp>,
                 /*vector_width=*/1);
diff --git a/third_party/xla/xla/codegen/intrinsic/tanh.h b/third_party/xla/xla/codegen/intrinsic/tanh.h
index 34d60229c29026..022a09951f3d3a 100644
--- a/third_party/xla/xla/codegen/intrinsic/tanh.h
+++ b/third_party/xla/xla/codegen/intrinsic/tanh.h
@@ -33,12 +33,13 @@ class Tanh : public Intrinsic<Tanh> {
   static std::vector<std::vector<Type>> SupportedVectorTypes() {
     // F16 via upcast to F32.
     return {
-        {Type::S(xla::F16)},    {Type::V(xla::F16, 8)}, {Type::V(xla::F16, 16)},
-        {Type::S(xla::F32)},
-
-        {Type::V(xla::F32, 4)}, {Type::V(xla::F32, 8)}, {Type::V(xla::F32, 16)},
-        {Type::S(xla::F64)},    {Type::V(xla::F64, 2)}, {Type::V(xla::F64, 4)},
-        {Type::V(xla::F64, 8)},
+        {Type::S(xla::F16)},     {Type::V(xla::F16, 2)},
+        {Type::V(xla::F16, 4)},  {Type::V(xla::F16, 8)},
+        {Type::V(xla::F16, 16)}, {Type::S(xla::F32)},
+        {Type::V(xla::F32, 2)},  {Type::V(xla::F32, 4)},
+        {Type::V(xla::F32, 8)},  {Type::V(xla::F32, 16)},
+        {Type::S(xla::F64)},     {Type::V(xla::F64, 2)},
+        {Type::V(xla::F64, 4)},  {Type::V(xla::F64, 8)},
     };
   }
   static absl::StatusOr<llvm::Function*> CreateDefinition(llvm::Module* module,

From 13a525b5831ff70f781fe3dd361440c059017464 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 9 Dec 2025 08:36:32 -0800
Subject: [PATCH 083/753] [xla:pjrt] Migrate to se::DeviceMemoryAddress

PiperOrigin-RevId: 842253793
---
 third_party/xla/xla/pjrt/BUILD                |  8 ++---
 third_party/xla/xla/pjrt/cpu/BUILD            |  4 +--
 .../xla/xla/pjrt/cpu/abstract_cpu_buffer.cc   |  2 +-
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    | 12 ++++----
 third_party/xla/xla/pjrt/gpu/BUILD            |  6 ++--
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    | 24 +++++++--------
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  4 +--
 .../xla/pjrt/gpu/se_gpu_pjrt_client_test.cc   |  6 ++--
 third_party/xla/xla/pjrt/gpu/tfrt/BUILD       | 12 ++++----
 ...u_async_host_to_device_transfer_manager.cc |  6 ++--
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc  | 10 +++----
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc  | 10 +++----
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h   |  8 ++---
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc |  6 ++--
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc  |  2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.h   |  2 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc  | 16 +++++-----
 .../gpu/tfrt/tracked_gpu_device_buffer.cc     | 14 ++++-----
 .../pjrt/gpu/tfrt/tracked_gpu_device_buffer.h | 18 +++++------
 .../tfrt/tracked_gpu_device_buffer_test.cc    |  6 ++--
 third_party/xla/xla/pjrt/gpu/tfrt/utils.cc    | 24 +++++++--------
 third_party/xla/xla/pjrt/gpu/tfrt/utils.h     |  4 +--
 .../xla/xla/pjrt/local_device_state.cc        |  4 +--
 third_party/xla/xla/pjrt/local_device_state.h |  2 +-
 .../xla/pjrt/pjrt_stream_executor_client.cc   | 30 +++++++++----------
 .../xla/pjrt/pjrt_stream_executor_client.h    | 10 +++----
 third_party/xla/xla/pjrt/se_raw_buffer.cc     |  8 ++---
 .../xla/xla/pjrt/tracked_device_buffer.cc     | 20 ++++++-------
 .../xla/xla/pjrt/tracked_device_buffer.h      | 18 +++++------
 .../xla/pjrt/tracked_device_buffer_test.cc    |  4 +--
 30 files changed, 150 insertions(+), 150 deletions(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 0f895ff6047f2e..954225f6a10310 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -194,7 +194,7 @@ xla_cc_test(
         "//xla/client:local_client",
         "//xla/hlo/testlib:test",
         "//xla/service:cpu_plugin",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/platform:statusor",
@@ -225,7 +225,7 @@ cc_library(
         ":worker_thread",
         "//xla:util",
         "//xla/client:local_client",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:event",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -717,8 +717,8 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/service/gpu:gpu_executable_run_options",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/concurrency:ref_count",
diff --git a/third_party/xla/xla/pjrt/cpu/BUILD b/third_party/xla/xla/pjrt/cpu/BUILD
index 6e7d2fad54a6dc..0756d7beac73bb 100644
--- a/third_party/xla/xla/pjrt/cpu/BUILD
+++ b/third_party/xla/xla/pjrt/cpu/BUILD
@@ -73,7 +73,7 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service/cpu:cpu_executable",
         "//xla/service/cpu:cpu_xfeed",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/platform:env",
@@ -208,7 +208,7 @@ cc_library(
         "//xla/service/cpu:cpu_executable_run_options",
         "//xla/service/cpu:executable_proto_cc",
         "//xla/service/llvm_ir:llvm_command_line_options",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc b/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc
index 3a98e4200f496a..f330c75ca62e13 100644
--- a/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc
@@ -53,7 +53,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 5e2f7aa65df9ef..55690711a5fc40 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -117,7 +117,7 @@ limitations under the License.
 #include "xla/service/maybe_owning_device_address.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
@@ -1275,7 +1275,7 @@ static absl::StatusOr<BufferInfo> MemoryForAllocation(
 
   } else if (allocation.is_constant() &&
              allocation.index() < constants.size()) {
-    se::DeviceMemoryBase constant =
+    se::DeviceAddressBase constant =
         constants[allocation.index()].AsDeviceMemoryBase();
     buffer_info.buffer = CpuDeviceMemory::CreateConstantMemory(
         constant.opaque(), constant.size());
@@ -1624,8 +1624,8 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
       buffer_device_mem.reserve(buffer_table.size());
       for (const auto& buffer_info : buffer_table) {
         buffer_device_mem.emplace_back(
-            se::DeviceMemoryBase(buffer_info.buffer->untyped_data(),
-                                 buffer_info.buffer->size_bytes()));
+            se::DeviceAddressBase(buffer_info.buffer->untyped_data(),
+                                  buffer_info.buffer->size_bytes()));
       }
 
       cpu::BufferAllocations allocations(buffer_device_mem);
@@ -1768,8 +1768,8 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
             buffer_device_mem.reserve(buffer_table.size());
             for (const auto& buffer_info : buffer_table) {
               buffer_device_mem.emplace_back(
-                  se::DeviceMemoryBase(buffer_info.buffer->untyped_data(),
-                                       buffer_info.buffer->size_bytes()));
+                  se::DeviceAddressBase(buffer_info.buffer->untyped_data(),
+                                        buffer_info.buffer->size_bytes()));
             }
 
             cpu::BufferAllocations allocations(buffer_device_mem);
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 96c8ebbe3c39f3..1b082cc7a37f5f 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -116,9 +116,9 @@ cc_library(
         "//xla/service:transfer_manager",
         "//xla/service/gpu:gpu_executable_run_options",
         "//xla/service/gpu:gpu_memory_space_assignment",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -246,7 +246,7 @@ xla_test(
         "//xla/pjrt/proto:compile_options_proto_cc",
         "//xla/service:platform_util",
         "//xla/service/gpu:gpu_memory_space_assignment",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/tests:literal_test_util",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 82cbcf932e6dda..e210a480bc74dd 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -102,9 +102,9 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
@@ -206,7 +206,7 @@ static absl::flat_hash_map<std::string, PjRtDeviceAttribute> GetAttrsForDevices(
 StreamExecutorGpuClient::StreamExecutorGpuClient(
     std::string platform_name, LocalClient* client,
     std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
-    int process_index, std::unique_ptr<se::DeviceMemoryAllocator> allocator,
+    int process_index, std::unique_ptr<se::DeviceAddressAllocator> allocator,
     std::unique_ptr<tsl::Allocator> host_memory_allocator,
     bool should_stage_host_to_device_transfers,
     std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
@@ -1414,7 +1414,7 @@ BuildLocalDeviceStates(LocalClient* xla_client) {
 
 // Constructs a GPU device memory allocator to use, according to the allocator
 // configuration the client requested.
-absl::StatusOr<std::unique_ptr<se::DeviceMemoryAllocator>>
+absl::StatusOr<std::unique_ptr<se::DeviceAddressAllocator>>
 GetStreamExecutorGpuDeviceAllocator(
     se::Platform* platform, const GpuAllocatorConfig& allocator_config,
     const std::map<int, std::unique_ptr<LocalDeviceState>>&
@@ -1849,7 +1849,7 @@ std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> BuildLocalDevices(
 
 #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
 static absl::Status CheckAlignment(const BufferAllocation& allocation,
-                                   se::DeviceMemoryBase buffer, int arg_idx) {
+                                   se::DeviceAddressBase buffer, int arg_idx) {
   const int64_t expected_alignment = [&] {
     if (allocation.is_entry_computation_parameter()) {
       return gpu::kEntryParameterAlignBytes;
@@ -1887,7 +1887,7 @@ StreamExecutorGpuClient::RunAsync(
   auto* gpu_exec =
       tensorflow::down_cast<xla::gpu::GpuExecutable*>(exec.executable());
   const ServiceExecutableRunOptions* run_options = &options_and_stream.first;
-  se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
+  se::DeviceAddressAllocator* const memory_allocator = run_options->allocator();
 
   se::StreamExecutor* executor = run_options->stream()->parent();
 
@@ -1932,7 +1932,7 @@ StreamExecutorGpuClient::RunAsync(
   absl::Span<const BufferAllocation* const> allocations =
       gpu_exec->GetAllocations();
 
-  std::vector<se::DeviceMemoryBase> buffers(allocations.size());
+  std::vector<se::DeviceAddressBase> buffers(allocations.size());
   {
     tsl::profiler::TraceMe hlo_module_activity(
         [&] { return std::string("Build buffer allocations"); },
@@ -1940,9 +1940,9 @@ StreamExecutorGpuClient::RunAsync(
     const int64_t num_buffers = allocations.size();
     for (int64_t i = 0; i < num_buffers; ++i) {
       const BufferAllocation& allocation = *allocations[i];
-      se::DeviceMemoryBase& buffer = buffers[i];
+      se::DeviceAddressBase& buffer = buffers[i];
       if (allocation.is_thread_local()) {
-        // buffer = se::DeviceMemoryBase{};
+        // buffer = se::DeviceAddressBase{};
       } else if (allocation.is_entry_computation_parameter()) {
         int64_t param_no = allocation.parameter_number();
         buffer = [&] {
@@ -1985,7 +1985,7 @@ StreamExecutorGpuClient::RunAsync(
   XLA_VLOG_DEVICE(3, device_ordinal)
       << "Buffer allocations: " << buffer_allocations.ToString();
 
-  std::set<se::DeviceMemoryBase> buffers_in_result;
+  std::set<se::DeviceAddressBase> buffers_in_result;
 
   xla::ShapeTree<tsl::AsyncValueRef<RawSEDeviceMemory>> results(
       gpu_exec->result_shape());
@@ -1999,7 +1999,7 @@ StreamExecutorGpuClient::RunAsync(
         gpu_exec->output_info().at(index);
     const BufferAllocation* allocation =
         allocations[output_info.allocation_index];
-    se::DeviceMemoryBase result_buffer;
+    se::DeviceAddressBase result_buffer;
 
     XLA_VLOG_DEVICE(4, device_ordinal)
         << "Looking at: allocation " << output_info.allocation_index
@@ -2043,7 +2043,7 @@ StreamExecutorGpuClient::RunAsync(
           return gpu_exec->VerboseAllocationError(allocated_buffer.status());
         }
         result_buffer = allocated_buffer->Release();
-        se::DeviceMemoryBase& aliased_buffer =
+        se::DeviceAddressBase& aliased_buffer =
             buffer_allocations.GetMutableDeviceAddress(
                 output_info.allocation_index);
         CHECK_EQ(aliased_buffer.size(), result_buffer.size());
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index b43592589b9cf3..c56d5757a3c929 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -58,8 +58,8 @@ limitations under the License.
 #include "xla/service/gpu/gpu_executable_run_options.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/protobuf/coordination_service.pb.h"
@@ -109,7 +109,7 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
   StreamExecutorGpuClient(
       std::string platform_name, LocalClient* client,
       std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
-      int process_index, std::unique_ptr<se::DeviceMemoryAllocator> allocator,
+      int process_index, std::unique_ptr<se::DeviceAddressAllocator> allocator,
       std::unique_ptr<tsl::Allocator> host_memory_allocator,
       bool should_stage_host_to_device_transfers,
       std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
index d6283dda89b5fe..787f43b0691a21 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -92,7 +92,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/literal_test_util.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
@@ -510,7 +510,7 @@ static absl::Status MemsetFromValue(
   uint32_t pattern;
   std::memcpy(&pattern, &memset_value->value, sizeof(pattern));
 
-  se::DeviceMemoryBase base = result->device_memory();
+  se::DeviceAddressBase base = result->device_memory();
   return stream->Memset32(&base, pattern, base.size());
 }
 
@@ -559,7 +559,7 @@ static absl::Status MemsetFromAttr(
   uint32_t pattern;
   std::memcpy(&pattern, &attr, sizeof(pattern));
 
-  se::DeviceMemoryBase base = result->device_memory();
+  se::DeviceAddressBase base = result->device_memory();
   return stream->Memset32(&base, pattern, base.size());
 }
 
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
index b6d28c5e744e7f..7cb5b0892be377 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
@@ -104,10 +104,10 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
         "//xla/service/gpu:gpu_executable_run_options",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -217,8 +217,8 @@ xla_test(
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
         "//xla/pjrt/proto:compile_options_proto_cc",
         "//xla/service:platform_util",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_description",
-        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor/cuda:cuda_compute_capability",
@@ -272,8 +272,8 @@ cc_library(
         "//xla:util",
         "//xla/pjrt:pjrt_client",
         "//xla/service:shaped_buffer",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:event",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/concurrency:async_value",
@@ -317,8 +317,8 @@ xla_cc_test(
         "//xla/pjrt:pjrt_common",
         "//xla/service:gpu_plugin",
         "//xla/service:shaped_buffer",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
-        "//xla/stream_executor:device_memory",
         "//xla/tsl/concurrency:async_value",
         # copybara:uncomment "//xla/tsl/framework:allocator",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
index 7ec8d2dc198fee..44031d7249faa5 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
@@ -54,9 +54,9 @@ limitations under the License.
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/platform/logging.h"
 #include "xla/tsl/platform/statusor.h"
@@ -275,7 +275,7 @@ TfrtGpuAsyncHostToDeviceTransferManager::TransferRawDataToSubBuffer(
     staging_buffer = host_memory_allocator->Allocate(transfer_size);
   }
 
-  se::DeviceMemoryBase sub_buffer;
+  se::DeviceAddressBase sub_buffer;
   {
     absl::MutexLock l(mu_);
     DCHECK_LT(buffer_index, buffer_ptrs_.size());
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
index 65e7a6ace81d0b..6c043547ed5e60 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
@@ -57,9 +57,9 @@ limitations under the License.
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/framework/allocator.h"
@@ -583,7 +583,7 @@ Future<> TfrtGpuBuffer::CopyRawToHostFuture(Future<void*> dst_future,
       promise.Set(device_buffer->definition_event().GetError());
       return;
     }
-    se::DeviceMemoryBase device_memory = device_buffer->buffer()->buffer();
+    se::DeviceAddressBase device_memory = device_buffer->buffer()->buffer();
     if (offset < 0 || offset > device_memory.size() ||
         device_memory.size() - offset < transfer_size) {
       LOG(ERROR) << "Copy raw buffer called on buffer size "
@@ -596,7 +596,7 @@ Future<> TfrtGpuBuffer::CopyRawToHostFuture(Future<void*> dst_future,
       return;
     }
 
-    se::DeviceMemoryBase sub_buffer;
+    se::DeviceAddressBase sub_buffer;
     if (transfer_size < device_memory.size()) {
       sub_buffer = device_memory.GetByteSlice(offset, transfer_size);
     } else {
@@ -824,7 +824,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtGpuBuffer::CopyToMemorySpace(
 
         auto stream = dst_device->stream();
 
-        se::DeviceMemoryBase dst(allocated_dst_buffer->buffer());
+        se::DeviceAddressBase dst(allocated_dst_buffer->buffer());
         VLOG(3) << "D2D copy: " << src_buffer->buffer().opaque() << " -> "
                 << dst.opaque() << " (" << src_buffer->buffer().size()
                 << " bytes)";
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
index eec0d99679e068..f63c05f5aa8b77 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
@@ -94,9 +94,9 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
@@ -148,7 +148,7 @@ TfrtGpuClient::TfrtGpuClient(
     std::vector<std::unique_ptr<TfrtGpuDevice>> devices,
     bool should_stage_host_to_device_transfers,
     bool abort_collectives_on_failure,
-    MaybeOwning<se::DeviceMemoryAllocator> allocator,
+    MaybeOwning<se::DeviceAddressAllocator> allocator,
     std::unique_ptr<tsl::Allocator> host_memory_allocator,
     std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
     std::shared_ptr<KeyValueStoreInterface> kv_store,
@@ -437,7 +437,7 @@ TfrtGpuClient::CreateViewOfDeviceBuffer(
   CHECK_EQ(memory_space->devices().size(), 1);
   auto* device = memory_space->devices().front();
   size_t byte_size = ShapeUtil::ByteSizeOf(shape);
-  se::DeviceMemoryBase device_memory(device_ptr, byte_size);
+  se::DeviceAddressBase device_memory(device_ptr, byte_size);
   auto non_owning_buffer = GpuDeviceMemory(device_memory);
   auto buffer_async_value_ref =
       tsl::MakeAvailableAsyncValueRef<GpuDeviceMemory>(
@@ -972,7 +972,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtGpuClient::BufferFromHostBuffer(
     });
     auto stream = device->stream();
 
-    se::DeviceMemoryBase dest = gpu_buffer->buffer();
+    se::DeviceAddressBase dest = gpu_buffer->buffer();
     VLOG(3) << "H2D copy: " << src_buf << " -> " << dest.opaque() << " ("
             << packed_size << " bytes) on device " << device->DebugString();
 
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
index 41e95484b084a7..88bed1881f355f 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
@@ -63,7 +63,7 @@ limitations under the License.
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/shape.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/platform/threadpool.h"
 #include "xla/xla.pb.h"
@@ -119,7 +119,7 @@ class TfrtGpuClient final : public PjRtClient {
                 std::vector<std::unique_ptr<TfrtGpuDevice>> devices,
                 bool should_stage_host_to_device_transfers,
                 bool abort_collectives_on_failure,
-                MaybeOwning<se::DeviceMemoryAllocator> allocator,
+                MaybeOwning<se::DeviceAddressAllocator> allocator,
                 std::unique_ptr<tsl::Allocator> host_memory_allocator,
                 std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
                 std::shared_ptr<KeyValueStoreInterface> kv_store,
@@ -156,7 +156,7 @@ class TfrtGpuClient final : public PjRtClient {
 
   xla::LocalClient* xla_client() const { return xla_client_; }
 
-  se::DeviceMemoryAllocator* allocator() { return allocator_.get_mutable(); }
+  se::DeviceAddressAllocator* allocator() { return allocator_.get_mutable(); }
 
   bool should_stage_host_to_device_transfers() const {
     return should_stage_host_to_device_transfers_;
@@ -337,7 +337,7 @@ class TfrtGpuClient final : public PjRtClient {
   // Device memory allocator. If owned, the allocator must outlive the devices,
   // because it is the device destructor that waits for any outstanding work to
   // complete.
-  MaybeOwning<se::DeviceMemoryAllocator> allocator_;
+  MaybeOwning<se::DeviceAddressAllocator> allocator_;
   // Allocator to be used for staging memory transfers to devices.
   std::unique_ptr<HostMemoryAllocator> host_memory_allocator_;
 
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
index 3e5fcc20deb231..c078751882b00c 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
@@ -77,8 +77,8 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/literal_test_util.h"
@@ -386,7 +386,7 @@ static absl::Status MemsetFromValue(
   uint32_t pattern;
   std::memcpy(&pattern, &memset_value->value, sizeof(pattern));
 
-  se::DeviceMemoryBase base = result->device_memory();
+  se::DeviceAddressBase base = result->device_memory();
   return stream->Memset32(&base, pattern, base.size());
 }
 
@@ -434,7 +434,7 @@ static absl::Status MemsetFromAttr(
   uint32_t pattern;
   std::memcpy(&pattern, &attr, sizeof(pattern));
 
-  se::DeviceMemoryBase base = result->device_memory();
+  se::DeviceAddressBase base = result->device_memory();
   return stream->Memset32(&base, pattern, base.size());
 }
 
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc
index 588227432b216a..96204308bc2384 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc
@@ -55,8 +55,8 @@ limitations under the License.
 #include "xla/service/hlo.pb.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/status_macros.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/integrations/tf_allocator_adapter.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.h b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.h
index 95961906b1ace8..97707c3690fb06 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.h
@@ -46,7 +46,7 @@ limitations under the License.
 #include "xla/pjrt/semaphore.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/transfer_manager.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
index 1c97ab898cbd21..5e84506057c524 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
@@ -76,9 +76,9 @@ limitations under the License.
 #include "xla/shape_layout.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
@@ -110,7 +110,7 @@ namespace xla {
 class TfrtGpuCopyToDeviceStream : public CopyToDeviceStream {
  public:
   TfrtGpuCopyToDeviceStream(int64_t channel_id, se::Stream* stream,
-                            se::DeviceMemoryBase dst,
+                            se::DeviceAddressBase dst,
                             tsl::AsyncValueRef<std::unique_ptr<se::Event>> done)
       : CopyToDeviceStream(dst.size(), /*granule_bytes=*/1),
         channel_id_(channel_id),
@@ -146,7 +146,7 @@ class TfrtGpuCopyToDeviceStream : public CopyToDeviceStream {
       return Future<>(done_.GetError());
     }
 
-    se::DeviceMemoryBase dst(
+    se::DeviceAddressBase dst(
         reinterpret_cast<std::byte*>(dst_.opaque()) + current_bytes_,
         dst_.size() - current_bytes_);
 
@@ -190,7 +190,7 @@ class TfrtGpuCopyToDeviceStream : public CopyToDeviceStream {
  private:
   int64_t channel_id_;
   se::Stream* stream_;
-  se::DeviceMemoryBase dst_;
+  se::DeviceAddressBase dst_;
 
   // Async value will become available after we'll submit the last memcpy
   // operation, and the event will be recorded on the stream.
@@ -771,13 +771,13 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
         if (result_is_tuple) {
           for (int i = 0; i < output_buffers.size(); ++i) {
             ScopedShapedBuffer tuple_buffer = output.TakeSubTree({i});
-            stream_executor::DeviceMemoryBase* elem =
+            stream_executor::DeviceAddressBase* elem =
                 tuple_buffer.buffers().mutable_element({});
             VLOG(3) << "untuple: output_buffers[" << i
                     << "].emplace: " << elem->opaque();
             output_buffers[i].emplace(stream_executor::OwningDeviceMemory(
                 *elem, device->local_device_id().value(), client->allocator()));
-            *elem = se::DeviceMemoryBase();
+            *elem = se::DeviceAddressBase();
           }
         } else {
           CHECK_EQ(output_buffers.size(), 1);
@@ -785,7 +785,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
           VLOG(3) << "output_buffers[0].emplace: " << elem->opaque();
           output_buffers.front().emplace(stream_executor::OwningDeviceMemory(
               *elem, device->local_device_id().value(), client->allocator()));
-          *elem = se::DeviceMemoryBase();
+          *elem = se::DeviceAddressBase();
         }
 
         // Set the scheduled event to concrete to indicate that the scheduling
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.cc
index 32543f080947b1..3d49f9d7a16823 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.cc
@@ -28,8 +28,8 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
@@ -44,7 +44,7 @@ ShapedBuffer GpuDeviceMemory::AsShapedBuffer(const Shape& on_device_shape,
                                              const PjRtDevice* device) const {
   ShapedBuffer shaped_buffer(on_device_shape, device->local_device_id().value(),
                              device->local_hardware_id().value());
-  ShapeTree<se::DeviceMemoryBase>::iterator iterator =
+  ShapeTree<se::DeviceAddressBase>::iterator iterator =
       shaped_buffer.buffers().begin();
   CHECK(iterator != shaped_buffer.buffers().end());
   iterator->second = buffer_;
@@ -60,19 +60,19 @@ void GpuDeviceMemory::SetUnOwned() {
 }
 
 absl::StatusOr<GpuDeviceMemory> GpuDeviceMemory::Allocate(
-    se::DeviceMemoryAllocator* allocator, int device_ordinal, size_t size) {
+    se::DeviceAddressAllocator* allocator, int device_ordinal, size_t size) {
   return Allocate(allocator, device_ordinal, size,
                   static_cast<int>(se::MemoryType::kDevice));
 }
 
 absl::StatusOr<GpuDeviceMemory> GpuDeviceMemory::Allocate(
-    se::DeviceMemoryAllocator* allocator, int device_ordinal, size_t size,
+    se::DeviceAddressAllocator* allocator, int device_ordinal, size_t size,
     int64_t memory_space) {
   if (size == 0) {
-    return GpuDeviceMemory(se::DeviceMemoryBase());
+    return GpuDeviceMemory(se::DeviceAddressBase());
   }
   TF_ASSIGN_OR_RETURN(
-      stream_executor::OwningDeviceMemory memory,
+      stream_executor::ScopedDeviceAddress<uint8_t> memory,
       allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/true,
                           memory_space));
   return GpuDeviceMemory(std::move(memory));
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h
index 19c949075f320d..71abf7139016dd 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h
@@ -29,8 +29,8 @@ limitations under the License.
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/event.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 
@@ -47,11 +47,11 @@ class GpuDeviceMemory {
   GpuDeviceMemory& operator=(GpuDeviceMemory&& other) = default;
 
   // Creates non-owning GPU device memory from a raw data pointer.
-  explicit GpuDeviceMemory(stream_executor::DeviceMemoryBase buffer)
+  explicit GpuDeviceMemory(stream_executor::DeviceAddressBase buffer)
       : buffer_(buffer) {}
 
   // Creates owning GPU device memory from an owned data pointer.
-  explicit GpuDeviceMemory(stream_executor::OwningDeviceMemory buffer)
+  explicit GpuDeviceMemory(stream_executor::ScopedDeviceAddress<uint8_t> buffer)
       : owning_buffer_(std::move(buffer)), buffer_(*owning_buffer_) {}
 
   ShapedBuffer AsShapedBuffer(const Shape& on_device_shape,
@@ -62,19 +62,19 @@ class GpuDeviceMemory {
 
   // Allocates raw owning memory.
   static absl::StatusOr<GpuDeviceMemory> Allocate(
-      se::DeviceMemoryAllocator* allocator, int device_ordinal, size_t size);
+      se::DeviceAddressAllocator* allocator, int device_ordinal, size_t size);
 
   static absl::StatusOr<GpuDeviceMemory> Allocate(
-      se::DeviceMemoryAllocator* allocator, int device_ordinal, size_t size,
+      se::DeviceAddressAllocator* allocator, int device_ordinal, size_t size,
       int64_t memory_space);
 
-  stream_executor::DeviceMemoryBase buffer() const { return buffer_; }
+  stream_executor::DeviceAddressBase buffer() const { return buffer_; }
   size_t size_bytes() const { return buffer_.size(); }
   bool owns_data() const { return !owning_buffer_.is_null(); }
 
  private:
-  stream_executor::OwningDeviceMemory owning_buffer_;
-  se::DeviceMemoryBase buffer_;
+  stream_executor::ScopedDeviceAddress<uint8_t> owning_buffer_;
+  se::DeviceAddressBase buffer_;
 };
 
 // Class that represents a GPU buffer. It optionally owns the buffer. It also
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer_test.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer_test.cc
index 4c0020c87b2329..7961f01d17b439 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer_test.cc
@@ -39,8 +39,8 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_allocator.h"
-#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/platform/env.h"
@@ -65,11 +65,11 @@ class TestAllocator : public se::DeviceAddressAllocator {
   absl::StatusOr<stream_executor::ScopedDeviceAddress<uint8_t>> Allocate(
       int device_ordinal, uint64_t size, bool retry_on_failure,
       int64_t memory_space) override {
-    const se::DeviceMemoryBase base(kOpaque, size);
+    const se::DeviceAddressBase base(kOpaque, size);
     return stream_executor::ScopedDeviceAddress<uint8_t>(base, 0, this);
   }
   absl::Status Deallocate(int device_ordinal,
-                          se::DeviceMemoryBase mem) override {
+                          se::DeviceAddressBase mem) override {
     return absl::OkStatus();
   }
   absl::StatusOr<se::Stream*> GetStream(int device_ordinal) override {
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc b/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc
index d6a067722f3c85..a59dd22155ddb6 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc
@@ -83,10 +83,10 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/integrations/tf_allocator_adapter.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
@@ -307,7 +307,7 @@ absl::flat_hash_map<std::string, PjRtDeviceAttribute> GetAttrsForDevices(
 class TfrtGpuCopyToDeviceStream : public CopyToDeviceStream {
  public:
   TfrtGpuCopyToDeviceStream(int64_t channel_id, se::Stream* stream,
-                            se::DeviceMemoryBase dst,
+                            se::DeviceAddressBase dst,
                             tsl::AsyncValueRef<std::unique_ptr<se::Event>> done)
       : CopyToDeviceStream(dst.size(), /*granule_bytes=*/1),
         channel_id_(channel_id),
@@ -343,7 +343,7 @@ class TfrtGpuCopyToDeviceStream : public CopyToDeviceStream {
       return Future<>(done_.GetError());
     }
 
-    se::DeviceMemoryBase dst(
+    se::DeviceAddressBase dst(
         reinterpret_cast<std::byte*>(dst_.opaque()) + current_bytes_,
         dst_.size() - current_bytes_);
 
@@ -387,7 +387,7 @@ class TfrtGpuCopyToDeviceStream : public CopyToDeviceStream {
  private:
   int64_t channel_id_;
   se::Stream* stream_;
-  se::DeviceMemoryBase dst_;
+  se::DeviceAddressBase dst_;
 
   // Async value will become available after we'll submit the last memcpy
   // operation, and the event will be recorded on the stream.
@@ -401,7 +401,7 @@ SendDeviceMemoryFunction ConvertSendCallbacksToSendFunction(
   // Check if we have callbacks registered for the given replica.
   if (replica >= options.send_callbacks.size()) {
     return [replica](int64_t channel_id, se::Stream*, const Shape&,
-                     const se::DeviceMemoryBase&,
+                     const se::DeviceAddressBase&,
                      const absl::flat_hash_map<std::string, std::string>&) {
       return Internal(
           "Don't send a buffer to the channel_id=%d, there was no send "
@@ -415,7 +415,7 @@ SendDeviceMemoryFunction ConvertSendCallbacksToSendFunction(
 
   return [callbacks, thread_pool](
              int64_t channel_id, se::Stream* stream, const Shape& shape,
-             const se::DeviceMemoryBase& src,
+             const se::DeviceAddressBase& src,
              const absl::flat_hash_map<std::string, std::string>&)
              -> absl::StatusOr<tsl::AsyncValueRef<std::unique_ptr<se::Event>>> {
     VLOG(4) << "Send " << src.size() << " bytes to channel #" << channel_id
@@ -490,7 +490,7 @@ RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
   // Check if we have callbacks registered for the given replica.
   if (replica >= options.send_callbacks.size()) {
     return [replica](int64_t channel_id, se::Stream*, const Shape&,
-                     se::DeviceMemoryBase*,
+                     se::DeviceAddressBase*,
                      const absl::flat_hash_map<std::string, std::string>&) {
       return InvalidArgument(
           "Failed to receive a buffer from the channel_id=%d, there was no "
@@ -503,7 +503,7 @@ RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
   absl::Span<const RecvCallback> callbacks = options.recv_callbacks[replica];
 
   return [callbacks](int64_t channel_id, se::Stream* stream, const Shape& shape,
-                     se::DeviceMemoryBase* dst,
+                     se::DeviceAddressBase* dst,
                      const absl::flat_hash_map<std::string, std::string>&)
              -> absl::StatusOr<tsl::AsyncValueRef<std::unique_ptr<se::Event>>> {
     VLOG(4) << "Recv from channel #" << channel_id
@@ -650,7 +650,7 @@ absl::StatusOr<std::unique_ptr<tsl::Allocator>> CreateAllocatorForDevice(
   }
 }
 
-absl::StatusOr<MaybeOwning<se::DeviceMemoryAllocator>> CreateDeviceAllocator(
+absl::StatusOr<MaybeOwning<se::DeviceAddressAllocator>> CreateDeviceAllocator(
     LocalClient* xla_client, const GpuAllocatorConfig& allocator_config,
     const std::vector<std::unique_ptr<TfrtGpuDevice>>& devices) {
   if (allocator_config.kind == GpuAllocatorConfig::Kind::kPlatform) {
@@ -660,7 +660,7 @@ absl::StatusOr<MaybeOwning<se::DeviceMemoryAllocator>> CreateDeviceAllocator(
           << "collective_memory_size is non-zero, but allocator kind is set "
              "to \"platform\". Collective memory will not be allocated.";
     }
-    return MaybeOwning<se::DeviceMemoryAllocator>(
+    return MaybeOwning<se::DeviceAddressAllocator>(
         xla_client->backend().memory_allocator());
   }
 
@@ -697,7 +697,7 @@ absl::StatusOr<MaybeOwning<se::DeviceMemoryAllocator>> CreateDeviceAllocator(
         /*memory_space=*/static_cast<int>(se::MemoryType::kHost),
         executor->device_ordinal(), executor->GetPlatform());
   }
-  return MaybeOwning<se::DeviceMemoryAllocator>(
+  return MaybeOwning<se::DeviceAddressAllocator>(
       std::make_unique<se::MultiDeviceAdapter>(xla_client->platform(),
                                                std::move(allocators)));
 }
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/utils.h b/third_party/xla/xla/pjrt/gpu/tfrt/utils.h
index 9fdf52226cecba..c7599bd4967d97 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/utils.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/utils.h
@@ -53,7 +53,7 @@ limitations under the License.
 #include "xla/service/gpu/gpu_executable_run_options.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/shape.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
@@ -154,7 +154,7 @@ std::vector<std::unique_ptr<PjRtMemorySpace>> InitializeMemorySpaces(
 absl::StatusOr<std::unique_ptr<tsl::Allocator>> CreateAllocatorForDevice(
     se::StreamExecutor* executor, const GpuAllocatorConfig& allocator_config);
 
-absl::StatusOr<MaybeOwning<se::DeviceMemoryAllocator>> CreateDeviceAllocator(
+absl::StatusOr<MaybeOwning<se::DeviceAddressAllocator>> CreateDeviceAllocator(
     LocalClient* xla_client, const GpuAllocatorConfig& allocator_config,
     const std::vector<std::unique_ptr<TfrtGpuDevice>>& devices);
 
diff --git a/third_party/xla/xla/pjrt/local_device_state.cc b/third_party/xla/xla/pjrt/local_device_state.cc
index b3e16c8e8f20ab..a1812c63ec19a0 100644
--- a/third_party/xla/xla/pjrt/local_device_state.cc
+++ b/third_party/xla/xla/pjrt/local_device_state.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/client/local_client.h"
 #include "xla/pjrt/buffer_sequencing_event.h"
 #include "xla/pjrt/worker_thread.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
@@ -177,7 +177,7 @@ absl::Status LocalDeviceState::SynchronizeAllActivity() {
 
 absl::Status LocalDeviceState::ThenMemcpyDeviceToDevice(
     se::Stream* transfer_stream, se::Stream* dst_stream,
-    se::DeviceMemoryBase src_buffer, se::DeviceMemoryBase dst_buffer) {
+    se::DeviceAddressBase src_buffer, se::DeviceAddressBase dst_buffer) {
   // The default implementation simply calls MemcpyD2D, and assumes that
   // the buffer addresses identify the devices. This does not work
   // on all platforms; this method is virtual so it can be overridden.
diff --git a/third_party/xla/xla/pjrt/local_device_state.h b/third_party/xla/xla/pjrt/local_device_state.h
index 675b6b81459f05..38ca812e589c74 100644
--- a/third_party/xla/xla/pjrt/local_device_state.h
+++ b/third_party/xla/xla/pjrt/local_device_state.h
@@ -168,7 +168,7 @@ class LocalDeviceState {
   // Enqueues a copy of `src_buffer` to `dst_buffer` onto `transfer_stream`.
   virtual absl::Status ThenMemcpyDeviceToDevice(
       se::Stream* transfer_stream, se::Stream* dst_stream,
-      se::DeviceMemoryBase src_buffer, se::DeviceMemoryBase dst_buffer);
+      se::DeviceAddressBase src_buffer, se::DeviceAddressBase dst_buffer);
 
   WorkerThread* execute_thread() const { return execute_thread_.get(); }
 
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index 4c175b7390e14c..e342a586863001 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -147,8 +147,8 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
@@ -275,7 +275,7 @@ PjRtStreamExecutorClient::PjRtStreamExecutorClient(
     std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
     int process_index,
     std::vector<std::unique_ptr<PjRtMemorySpace>> memory_spaces,
-    std::unique_ptr<se::DeviceMemoryAllocator> allocator,
+    std::unique_ptr<se::DeviceAddressAllocator> allocator,
     std::unique_ptr<tsl::Allocator> host_memory_allocator,
     bool should_stage_host_to_device_transfers,
     std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options)
@@ -730,7 +730,7 @@ PjRtStreamExecutorClient::LinearizeHostBufferInto(
         // memory that has already been allocated, and a possible Event
         // allocation.
 
-        se::DeviceMemoryBase device_memory =
+        se::DeviceAddressBase device_memory =
             tensorflow::down_cast<PjRtStreamExecutorRawBuffer*>(
                 raw_buffer.get())
                 ->device_buffer()
@@ -904,7 +904,7 @@ PjRtStreamExecutorClient::CreateViewOfDeviceBuffer(
   auto* device = memory_space->devices().front();
 
   auto buffer = RawSEDeviceMemory::CreateForeign(
-      se::DeviceMemoryBase(device_ptr, ShapeUtil::ByteSizeOf(shape)),
+      se::DeviceAddressBase(device_ptr, ShapeUtil::ByteSizeOf(shape)),
       std::move(on_delete_callback));
 
   TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device,
@@ -1139,7 +1139,7 @@ MakeTupleHelper(PjRtStreamExecutorClient* client,
                 absl::Span<PjRtBuffer* const> py_buffers,
                 absl::Span<const CommonPjRtBuffer::ScopedHold> device_buffers,
                 int device_ordinal) {
-  se::DeviceMemoryAllocator* allocator = client->allocator();
+  se::DeviceAddressAllocator* allocator = client->allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
 
@@ -1190,7 +1190,7 @@ MakeTupleHelper(PjRtStreamExecutorClient* client,
   }
   CHECK(input_iterator == iterator_end);
 
-  std::vector<se::DeviceMemoryBase> elements;
+  std::vector<se::DeviceAddressBase> elements;
   size_t num_elements = ShapeUtil::TupleElementCount(tupled_parameter_shape);
   elements.reserve(num_elements);
   for (int64_t i = 0; i < num_elements; ++i) {
@@ -1442,7 +1442,7 @@ static SendDeviceMemoryFunction ConvertSendCallbacksToSendFunction(
   // Check if we have callbacks registered for the given replica.
   if (replica >= options.send_callbacks.size()) {
     return [replica](int64_t channel_id, se::Stream*, const Shape&,
-                     const se::DeviceMemoryBase&,
+                     const se::DeviceAddressBase&,
                      const absl::flat_hash_map<std::string, std::string>&) {
       return Internal(
           "Don't send a buffer to the channel_id=%d, there was no send "
@@ -1456,7 +1456,7 @@ static SendDeviceMemoryFunction ConvertSendCallbacksToSendFunction(
 
   return [callbacks, thread_pool](
              int64_t channel_id, se::Stream* stream, const Shape& shape,
-             const se::DeviceMemoryBase& src,
+             const se::DeviceAddressBase& src,
              const absl::flat_hash_map<std::string, std::string>&)
              -> absl::StatusOr<AsyncValueRef<std::unique_ptr<se::Event>>> {
     VLOG(3) << "Send " << src.size() << " bytes to channel #" << channel_id
@@ -1525,7 +1525,7 @@ namespace {
 class StreamExecutorCopyToDeviceStream : public CopyToDeviceStream {
  public:
   StreamExecutorCopyToDeviceStream(
-      int64_t channel_id, se::Stream* stream, se::DeviceMemoryBase dst,
+      int64_t channel_id, se::Stream* stream, se::DeviceAddressBase dst,
       AsyncValueRef<std::unique_ptr<se::Event>> done)
       : CopyToDeviceStream(dst.size(), /*granule_bytes=*/1),
         channel_id_(channel_id),
@@ -1562,7 +1562,7 @@ class StreamExecutorCopyToDeviceStream : public CopyToDeviceStream {
       return Future<>(done_.GetError());
     }
 
-    se::DeviceMemoryBase dst(
+    se::DeviceAddressBase dst(
         reinterpret_cast<std::byte*>(dst_.opaque()) + current_bytes_,
         dst_.size() - current_bytes_);
 
@@ -1602,7 +1602,7 @@ class StreamExecutorCopyToDeviceStream : public CopyToDeviceStream {
  private:
   int64_t channel_id_;
   se::Stream* stream_;
-  se::DeviceMemoryBase dst_;
+  se::DeviceAddressBase dst_;
 
   // Async value will become available after we'll submit the last memcpy
   // operation, and the event will be recorded on the stream.
@@ -1615,7 +1615,7 @@ static RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
   // Check if we have callbacks registered for the given replica.
   if (replica >= options.send_callbacks.size()) {
     return [replica](int64_t channel_id, se::Stream*, const Shape&,
-                     se::DeviceMemoryBase*,
+                     se::DeviceAddressBase*,
                      const absl::flat_hash_map<std::string, std::string>&) {
       return InvalidArgument(
           "Failed to receive a buffer from the channel_id=%d, there was no "
@@ -1628,7 +1628,7 @@ static RecvDeviceMemoryFunction ConvertRecvCallbacksToRecvFunction(
   absl::Span<const RecvCallback> callbacks = options.recv_callbacks[replica];
 
   return [callbacks](int64_t channel_id, se::Stream* stream, const Shape& shape,
-                     se::DeviceMemoryBase* dst,
+                     se::DeviceAddressBase* dst,
                      const absl::flat_hash_map<std::string, std::string>&)
              -> absl::StatusOr<AsyncValueRef<std::unique_ptr<se::Event>>> {
     VLOG(3) << "Recv from channel #" << channel_id
@@ -1691,7 +1691,7 @@ PjRtStreamExecutorClient::RunAsync(
   xla::ShapeTree<tsl::AsyncValueRef<RawSEDeviceMemory>> results(
       ssb.on_device_shape());
   auto it = results.begin();
-  se::DeviceMemoryAllocator* allocator = ssb.memory_allocator();
+  se::DeviceAddressAllocator* allocator = ssb.memory_allocator();
   ShapedBuffer released_ssb = ssb.release();
   for (auto& buf : released_ssb.buffers()) {
     CHECK(it != results.end());
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 3e543724c182aa..4b656c48fc2517 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -66,7 +66,7 @@ limitations under the License.
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/framework/allocator.h"
@@ -237,7 +237,7 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
       std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
       int process_index,
       std::vector<std::unique_ptr<PjRtMemorySpace>> memory_spaces,
-      std::unique_ptr<se::DeviceMemoryAllocator> allocator,
+      std::unique_ptr<se::DeviceAddressAllocator> allocator,
       std::unique_ptr<tsl::Allocator> host_memory_allocator,
       bool should_stage_host_to_device_transfers,
       std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options);
@@ -340,7 +340,7 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
                 ->local_device_state();
   }
   LocalClient* client() const { return client_; }
-  se::DeviceMemoryAllocator* allocator() const { return allocator_; }
+  se::DeviceAddressAllocator* allocator() const { return allocator_; }
   tsl::Allocator* host_memory_allocator() const {
     return host_memory_allocator_.get();
   }
@@ -488,8 +488,8 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
   // Device memory allocator. If owned, the allocator must outlive the devices,
   // because it is the device destructor that waits for any outstanding work to
   // complete.
-  se::DeviceMemoryAllocator* allocator_;
-  std::unique_ptr<se::DeviceMemoryAllocator> owned_allocator_;
+  se::DeviceAddressAllocator* allocator_;
+  std::unique_ptr<se::DeviceAddressAllocator> owned_allocator_;
 
   // Includes all devices, including non-local devices on multi-host platforms.
   std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> owned_devices_;
diff --git a/third_party/xla/xla/pjrt/se_raw_buffer.cc b/third_party/xla/xla/pjrt/se_raw_buffer.cc
index 1d5fc0516f7e10..4ba31cb16cb1d9 100644
--- a/third_party/xla/xla/pjrt/se_raw_buffer.cc
+++ b/third_party/xla/xla/pjrt/se_raw_buffer.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/primitive_util.h"
 #include "xla/service/generic_transfer_manager.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
@@ -136,7 +136,7 @@ PjRtStreamExecutorRawBuffer::CopyRawHostToDeviceAndReturnEvent(
                                     local_device = local_device_, stream, src,
                                     offset, transfer_size,
                                     buf = tsl::FormRef(this)]() mutable {
-    se::DeviceMemoryBase sub_buffer = buf->device_buffer_->mem();
+    se::DeviceAddressBase sub_buffer = buf->device_buffer_->mem();
     if (transfer_size < sub_buffer.size()) {
       sub_buffer = sub_buffer.GetByteSlice(offset, transfer_size);
     }
@@ -196,7 +196,7 @@ PjRtStreamExecutorRawBuffer::CopyRawDeviceToHostAndReturnEvent(
                                     local_device = local_device_, stream, dst,
                                     offset, transfer_size,
                                     buf = tsl::FormRef(this)]() mutable {
-    se::DeviceMemoryBase sub_buffer = buf->device_buffer_->mem();
+    se::DeviceAddressBase sub_buffer = buf->device_buffer_->mem();
     if (transfer_size < sub_buffer.size()) {
       sub_buffer = sub_buffer.GetByteSlice(offset, transfer_size);
     }
@@ -248,7 +248,7 @@ ShapedBuffer PjRtStreamExecutorRawBuffer::AsShapedBuffer(
   auto* device = memory_space()->devices()[0];
   ShapedBuffer shaped_buffer(shape, device->local_device_id().value(),
                              device->local_hardware_id().value());
-  ShapeTree<se::DeviceMemoryBase>::iterator iterator =
+  ShapeTree<se::DeviceAddressBase>::iterator iterator =
       shaped_buffer.buffers().begin();
   if (device_buffer_) {
     CHECK(iterator != shaped_buffer.buffers().end());
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer.cc b/third_party/xla/xla/pjrt/tracked_device_buffer.cc
index f5595d2ea39040..31f668f7baf51d 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer.cc
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer.cc
@@ -42,8 +42,8 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
@@ -57,7 +57,7 @@ ShapedBuffer RawSEDeviceMemory::AsShapedBuffer(
     PjRtDevice* device, const Shape& on_device_shape) const {
   ShapedBuffer shaped_buffer(on_device_shape, device->local_device_id().value(),
                              device->local_hardware_id().value());
-  ShapeTree<se::DeviceMemoryBase>::iterator iterator =
+  ShapeTree<se::DeviceAddressBase>::iterator iterator =
       shaped_buffer.buffers().begin();
   CHECK(iterator != shaped_buffer.buffers().end());
   iterator->second = mem();
@@ -68,9 +68,9 @@ ShapedBuffer RawSEDeviceMemory::AsShapedBuffer(
 
 class AllocatedRawSEDeviceMemory : public RawSEDeviceMemory {
  public:
-  AllocatedRawSEDeviceMemory(se::DeviceMemoryBase value,
+  AllocatedRawSEDeviceMemory(se::DeviceAddressBase value,
                              LocalDeviceState* local_device,
-                             se::DeviceMemoryAllocator* allocator)
+                             se::DeviceAddressAllocator* allocator)
       : RawSEDeviceMemory(value),
         allocator_(allocator),
         local_device_(local_device) {
@@ -103,21 +103,21 @@ class AllocatedRawSEDeviceMemory : public RawSEDeviceMemory {
   }
 
  private:
-  se::DeviceMemoryAllocator* allocator_;
+  se::DeviceAddressAllocator* allocator_;
   LocalDeviceState* local_device_;
   size_t sync_point_ = std::numeric_limits<size_t>::max();
 };
 
 tsl::AsyncValueRef<RawSEDeviceMemory> RawSEDeviceMemory::Create(
-    se::DeviceMemoryBase value, LocalDeviceState* local_device,
-    se::DeviceMemoryAllocator* allocator) {
+    se::DeviceAddressBase value, LocalDeviceState* local_device,
+    se::DeviceAddressAllocator* allocator) {
   return tsl::MakeAvailableAsyncValueRef<AllocatedRawSEDeviceMemory>(
       value, local_device, allocator);
 }
 
 class ForeignRawSEDeviceMemory : public RawSEDeviceMemory {
  public:
-  ForeignRawSEDeviceMemory(se::DeviceMemoryBase value,
+  ForeignRawSEDeviceMemory(se::DeviceAddressBase value,
                            absl::AnyInvocable<void() &&> on_delete_callback)
       : RawSEDeviceMemory(value),
         on_delete_callback_(std::move(on_delete_callback)) {}
@@ -133,7 +133,7 @@ class ForeignRawSEDeviceMemory : public RawSEDeviceMemory {
 };
 
 tsl::AsyncValueRef<RawSEDeviceMemory> RawSEDeviceMemory::CreateForeign(
-    se::DeviceMemoryBase value,
+    se::DeviceAddressBase value,
     absl::AnyInvocable<void() &&> on_delete_callback) {
   return tsl::MakeAvailableAsyncValueRef<ForeignRawSEDeviceMemory>(
       value, std::move(on_delete_callback));
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer.h b/third_party/xla/xla/pjrt/tracked_device_buffer.h
index 62b36de4923881..7bce98bf6fa0a8 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer.h
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer.h
@@ -43,8 +43,8 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/platform/threadpool.h"
@@ -53,11 +53,11 @@ namespace xla {
 
 class RawSEDeviceMemory {
  public:
-  explicit RawSEDeviceMemory(se::DeviceMemoryBase value) : value_(value) {}
+  explicit RawSEDeviceMemory(se::DeviceAddressBase value) : value_(value) {}
 
   virtual ~RawSEDeviceMemory() = default;
 
-  const se::DeviceMemoryBase& mem() const { return value_; }
+  const se::DeviceAddressBase& mem() const { return value_; }
 
   void* opaque() const { return value_.opaque(); }
 
@@ -70,10 +70,10 @@ class RawSEDeviceMemory {
                               const Shape& on_device_shape) const;
 
   static tsl::AsyncValueRef<RawSEDeviceMemory> Create(
-      se::DeviceMemoryBase value, LocalDeviceState* local_device,
-      se::DeviceMemoryAllocator* allocator);
+      se::DeviceAddressBase value, LocalDeviceState* local_device,
+      se::DeviceAddressAllocator* allocator);
   static tsl::AsyncValueRef<RawSEDeviceMemory> CreateForeign(
-      se::DeviceMemoryBase value,
+      se::DeviceAddressBase value,
       absl::AnyInvocable<void() &&> on_delete_callback);
 
   // Returns a definition event (or nullptr if the definition is known to be in
@@ -84,7 +84,7 @@ class RawSEDeviceMemory {
   }
 
  private:
-  se::DeviceMemoryBase value_;
+  se::DeviceAddressBase value_;
 };
 
 // Class that represents a tuple of device buffers. Like a ScopedShapedBuffer it
@@ -124,7 +124,7 @@ class TrackedDeviceBuffer : public AbstractTrackedDeviceBuffer {
       ShapeTree<MaybeOwningDeviceAddress>::iterator* iterator,
       const ShapeTree<MaybeOwningDeviceAddress>::iterator& end,
       ExecutionInput* execution_input,
-      se::DeviceMemoryAllocator* allocator) const;
+      se::DeviceAddressAllocator* allocator) const;
 
   const absl::InlinedVector<BufferSequencingEventRef, 2>& definition_events()
       const {
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
index f4d2b8664df143..d5bec6ba286977 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/platform/statusor.h"
@@ -114,7 +114,7 @@ TEST(TrackedDeviceBufferTest, AsShapedBuffer) {
   TF_ASSERT_OK_AND_ASSIGN(auto b_buffer, MakeArray(b_shape, client));
   TF_ASSERT_OK_AND_ASSIGN(auto c_buffer, MakeArray(c_shape, client));
 
-  std::vector<se::DeviceMemoryBase> expected_buffer_sequence = {
+  std::vector<se::DeviceAddressBase> expected_buffer_sequence = {
       a_buffer->mem(), b_buffer->mem(), c_buffer->mem()};
   ShapedBuffer shaped_a = a_buffer->AsShapedBuffer(
       &device,

From 33c6f22e73c7290bc43d013033bfd539d98b56ba Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Tue, 9 Dec 2025 09:22:36 -0800
Subject: [PATCH 084/753] Fix use-after-free in PjRtCApiClient

`BufferMemoryLayoutData` and `device_layout_list` should be alive until the return of the api call.

PiperOrigin-RevId: 842271572
---
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 3d579a45509817..a173d3c87e0674 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -1381,30 +1381,33 @@ PjRtCApiClient::CreateBuffersForAsyncHostToDevice(
       PJRT_Client_CreateBuffersForAsyncHostToDevice_Args_STRUCT_SIZE;
   args.extension_start = nullptr;
   args.client = c_client_.get();
+
   args.num_shape_specs = shape_specs.size();
-  args.shape_specs = new PJRT_ShapeSpec[shape_specs.size()];
-  absl::Cleanup cleanup =
-      absl::MakeCleanup([&args] { delete[] args.shape_specs; });
-  const ShapeSpec* iterator = shape_specs.begin();
-  for (int i = 0; i < shape_specs.size(); ++i) {
-    args.shape_specs[i] = pjrt::ConvertToPjRtShapeSpec(*(iterator++));
+  absl::InlinedVector<PJRT_ShapeSpec, 4> c_shape_specs;
+  c_shape_specs.reserve(shape_specs.size());
+  for (const ShapeSpec& shape_spec : shape_specs) {
+    c_shape_specs.push_back(pjrt::ConvertToPjRtShapeSpec(shape_spec));
   }
+  args.shape_specs = c_shape_specs.data();
+
+  absl::InlinedVector<pjrt::BufferMemoryLayoutData, 4> layout_data_list;
+  absl::InlinedVector<PJRT_Buffer_MemoryLayout*, 4> device_layout_list;
   if (device_layouts.has_value()) {
     args.num_device_layouts = device_layouts->size();
-    auto device_layout_list =
-        std::make_unique<std::vector<PJRT_Buffer_MemoryLayout*>>(
-            device_layouts->size());
+    device_layout_list.reserve(device_layouts->size());
+    layout_data_list.reserve(device_layouts->size());
     for (int i = 0; i < device_layouts->size(); ++i) {
       if (device_layouts.has_value() && (*device_layouts)[i].has_value()) {
         const Layout& layout = (*device_layouts)[i].value();
         TF_ASSIGN_OR_RETURN(pjrt::BufferMemoryLayoutData c_layout_data,
                             pjrt::ConvertToBufferMemoryLayoutData(layout));
-        device_layout_list->emplace_back(&(c_layout_data.c_layout));
+        layout_data_list.push_back(std::move(c_layout_data));
+        device_layout_list.emplace_back(&(layout_data_list.back().c_layout));
       } else {
-        device_layout_list->emplace_back(nullptr);
+        device_layout_list.emplace_back(nullptr);
       }
     }
-    args.device_layouts = device_layout_list->data();
+    args.device_layouts = device_layout_list.data();
   } else {
     args.num_device_layouts = 0;
     args.device_layouts = nullptr;

From 8d5f52bc3a386e1acc9f1d2753a291cf7199aade Mon Sep 17 00:00:00 2001
From: Mikhail Goncharov <goncharov@google.com>
Date: Tue, 9 Dec 2025 09:53:15 -0800
Subject: [PATCH 085/753] [XLA:GPU] split dimensions greedly including ones in
 CalculateBitcastOfTransposeImpl

We have not hanlded the case when bitcast introcduces a new 1-size dimension in
case like

a = f32[6,7] transpose(f32[7,6]), dims={1,0}
b = f32[6,1,7] bitcast(a)

as this 1-size dimension could teoretically be moved anywhere in the
hosted expression

c = f32[1,7,6] bitcase(f32[7,6])
# or c = f32[7,1,6] bitcase(f32[7,6])
# or c = f32[7,6,1] bitcase(f32[7,6])
b = f32[6,1,7] transpose(c), dims=...

by using a "greedy" version of CommonFactors that does not produce
grouping like [] -> [1] or [1] -> [] we now handle this case (picking
group [6,1] -> [6] mapping).

PiperOrigin-RevId: 842282725
---
 .../xla/xla/service/gpu/transforms/BUILD      |   5 +-
 .../gpu/transforms/nest_gemm_fusion.cc        |  35 ++-
 .../service/gpu/transforms/nest_gemm_fusion.h |  19 +-
 .../gpu/transforms/nest_gemm_fusion_test.cc   | 249 ++++++++++++------
 4 files changed, 218 insertions(+), 90 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index e4ed891c6391b9..1acb680d9e73d7 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1857,7 +1857,6 @@ cc_library(
         "//xla/codegen/tiling:symbolic_tile_analysis",
         "//xla/codegen/tiling:symbolic_tiled_hlo_instruction",
         "//xla/codegen/tiling:tiling_specification",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/transforms/simplifiers:hlo_dce",
@@ -1898,18 +1897,18 @@ xla_cc_test(
     deps = [
         ":nest_gemm_fusion",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:pattern_matcher_gmock",
+        "//xla/hlo/testlib:verified_hlo_module",
         "//xla/service:pattern_matcher",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
index dc972406913fc3..f9812d96ca0747 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 
+#include <cstddef>
 #include <cstdint>
 #include <deque>
 #include <memory>
@@ -658,8 +659,11 @@ absl::StatusOr<BitcastParams> CalculateBitcastOfTransposeImpl(
   // Maps logical operand dimension index to the physical dimension index.
   llvm::SmallVector<int64_t> operand_inv_layout =
       GetInversePermutation(operand_shape.layout().minor_to_major());
-  auto factors = CommonFactors(GetPhysicalDimensions(result_shape),
-                               GetPhysicalDimensions(transpose_shape));
+
+  const absl::InlinedVector<std::pair<int64_t, int64_t>, 8> factors =
+      ::xla::gpu::detail::CommonFactorsMergingTrivialRanges(
+          GetPhysicalDimensions(result_shape),
+          GetPhysicalDimensions(transpose_shape));
   for (int64_t i = 1; i < factors.size(); ++i) {
     auto [result_from, transpose_from] = factors[i - 1];
     auto [result_to, transpose_to] = factors[i];
@@ -1348,5 +1352,32 @@ absl::StatusOr<BlockLevelParameters> FindBlockLevelParameters(
       "Couldn't find output tile sizes that satisfy ", tiled_dot.ToString()));
 }
 
+absl::InlinedVector<std::pair<int64_t, int64_t>, 8>
+CommonFactorsMergingTrivialRanges(absl::Span<const int64_t> a,
+                                  absl::Span<const int64_t> b) {
+  // CommonFactors does what we need but it also creates empty groups with
+  // product of 1, e.g. `[1] -> []` or `[] -> [1]`. We remove the bounds of
+  // such ranges to merge them with neighbors. There are many different ways
+  // to do this, here we continously append ranges to the start of the next
+  // group unless it is the very last range.
+  absl::InlinedVector<std::pair<int64_t, int64_t>, 8> bounds =
+      CommonFactors(a, b);
+  for (size_t i = 0; i + 1 < bounds.size() && bounds.size() > 2;) {
+    auto [a_start, b_start] = bounds[i];
+    auto [a_end, b_end] = bounds[i + 1];
+    if (a_start != a_end && b_start != b_end) {
+      i++;
+      continue;
+    }
+    if (i + 2 == bounds.size()) {
+      // Very last range - append it to the previous one.
+      bounds.erase(bounds.begin() + i);
+    } else {
+      bounds.erase(bounds.begin() + i + 1);
+    }
+  }
+  return bounds;
+}
+
 }  // namespace detail
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h
index 2d94ad1c4417f6..bc1a54cfadd09a 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h
@@ -16,11 +16,16 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_TRANSFORMS_NEST_GEMM_FUSION_H_
 #define XLA_SERVICE_GPU_TRANSFORMS_NEST_GEMM_FUSION_H_
 
+#include <cstdint>
+#include <utility>
+
 #include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
-#include "xla/hlo/ir/hlo_instructions.h"
+#include "absl/types/span.h"
+#include "mlir/IR/MLIRContext.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
 #include "xla/service/gpu/matmul_utils.h"
@@ -81,6 +86,16 @@ absl::StatusOr<BlockLevelParameters> FindBlockLevelParameters(
     mlir::MLIRContext* mlir_context,
     const se::DeviceDescription& device_description);
 
+// Returns the start indices of consecutive non-overlapping subsequences of `a`
+// and `b` with the same product (see `CommonFactors` from `util.h`) grouping
+// ranges having product of 1 with neighbors.
+//
+// For example, if a=[2, 5, 1, 3] and b=[1, 10, 3, 1], the result will be
+// {{0, 0}, {2, 2}, {4, 4}}, grouping [2,5] with [1,10] and [1,3] with [3,1].
+absl::InlinedVector<std::pair<int64_t, int64_t>, 8>
+CommonFactorsMergingTrivialRanges(absl::Span<const int64_t> a,
+                                  absl::Span<const int64_t> b);
+
 }  // namespace detail
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
index b353c68c8b2d29..fd81e8ef39895b 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
@@ -15,31 +15,38 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 
+#include <cstdint>
+#include <memory>
 #include <string>
+#include <utility>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/container/inlined_vector.h"
 #include "absl/log/log.h"
 #include "absl/status/status_matchers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/hlo/testlib/filecheck.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/pattern_matcher_gmock.h"
+#include "xla/hlo/testlib/verified_hlo_module.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/device_description.h"
-#include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/xla.pb.h"
 
+using ::absl_testing::IsOkAndHolds;
 using ::testing::ElementsAre;
 
 namespace xla {
@@ -105,7 +112,7 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
   ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 
   const HloInstruction* fusion = nullptr;
@@ -162,9 +169,9 @@ ENTRY e {
                          "num_ctas":1}}}
 })";
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   HloComputation* fusion_computation = module->entry_computation()
                                            ->root_instruction()
@@ -283,7 +290,7 @@ ENTRY entry {
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
   ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 
   const HloInstruction* fusion = nullptr;
@@ -330,7 +337,7 @@ ENTRY entry {
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
   ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 
   const HloInstruction* fusion = nullptr;
@@ -375,9 +382,9 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -410,9 +417,9 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -444,15 +451,15 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: f16[3,11]{1,0} convert(
 CHECK: f16[3,11]{1,0} fusion(
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 
   ASSERT_OK(verifier().Run(module.get()).status());
 }
@@ -490,9 +497,9 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -526,9 +533,9 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
   CHECK: ENTRY
@@ -536,7 +543,7 @@ ENTRY entry {
   CHECK: [[fusion:[^ ]+]] = s8[3,11]{1,0:E(4)} fusion({{.*}})
   CHECK: ROOT {{.*}} = s8[33]{0:E(4)} bitcast([[fusion]])
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -570,9 +577,9 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -609,9 +616,9 @@ ENTRY entry_computation {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -645,9 +652,9 @@ ENTRY entry_computation {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -680,9 +687,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -712,9 +719,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
@@ -727,7 +734,7 @@ CHECK: ENTRY {{.*}} {
 CHECK: [[entry_p0:[^ ]+]] = f32[11,1,24,1]{3,2,1,0} parameter(0)
 CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -758,7 +765,7 @@ ENTRY e {
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
   // We can nest the fusion including the broadcast.
-  EXPECT_TRUE(NestGemmFusion(device_description_, &mlir_context_)
+  ASSERT_TRUE(NestGemmFusion(device_description_, &mlir_context_)
                   .Run(module.get())
                   .ok());
   ASSERT_OK(verifier().Run(module.get()).status());
@@ -769,7 +776,7 @@ CHECK:      f32[3,4,16]{2,1,0} broadcast
 CHECK-NEXT: f32[3,64]{1,0} $0
 )",
                                             HloOpcodeString(opcode))),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -800,7 +807,7 @@ ENTRY e {
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
   // We can nest the fusion including the broadcast.
-  EXPECT_TRUE(NestGemmFusion(device_description_, &mlir_context_)
+  ASSERT_TRUE(NestGemmFusion(device_description_, &mlir_context_)
                   .Run(module.get())
                   .ok());
   ASSERT_OK(verifier().Run(module.get()).status());
@@ -811,7 +818,7 @@ CHECK:      f32[2,3,5]{2,1,0} $0
 CHECK-NEXT: f32[2,4,3,5]{3,2,1,0} broadcast
 )",
                                             HloOpcodeString(opcode))),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -843,9 +850,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
@@ -854,7 +861,7 @@ CHECK-DAG: {{.*}} = f32[15,77]{1,0} broadcast([[p0]]), dimensions={0}
 CHECK-DAG: [[br:[^ ]+]] = f32[15]{0} broadcast([[p0]]), dimensions={0}
 CHECK-DAG: {{.*}} = f32[15,77]{1,0} broadcast([[br]]), dimensions={0}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedOverBroadcasts) {
@@ -883,9 +890,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            R"(
@@ -899,7 +906,7 @@ CHECK: [[entry_p0:[^ ]+]] = f32[11,1,24,1]{3,2,1,0} parameter(0)
 CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
 )"),
 
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsLayoutIsPreserved) {
@@ -934,9 +941,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -953,7 +960,7 @@ CHECK: ENTRY {{.*}} {
 CHECK: {{.*}} = pred[122,5]{0,1} bitcast({{.*}})
 )",
                                             HloOpcodeString(opcode))),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -985,16 +992,16 @@ ENTRY entry {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={0,3}
 CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={0,3}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedUpThroughTransposes) {
@@ -1021,9 +1028,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
@@ -1031,7 +1038,45 @@ CHECK:      ROOT transpose
 CHECK-SAME: f32[2,3,7]{2,1,0} transpose
 CHECK-SAME: dimensions={1,2,0}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
+}
+
+TEST_P(NestGemmFusionReshapeTest,
+       BitcastsWithSize1DimensionsAreHoistedUpThroughTransposes) {
+  const HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[7,6] parameter(0)
+  transpose = f32[6,7] transpose(p0), dimensions={1,0}
+  bitcast = f32[1,6,7] $0(transpose)
+  p1 = f32[1,5,7] parameter(1)
+  ROOT result = f32[1,6,5] dot(bitcast, p1),
+    lhs_contracting_dims={2}, lhs_batch_dims={0},
+    rhs_contracting_dims={2}, rhs_batch_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[7,6] parameter(0)
+  p1 = f32[1,5,7] parameter(1)
+  ROOT result = f32[1,6,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(
+                              absl::Substitute(hlo, HloOpcodeString(opcode))));
+  ASSERT_THAT(
+      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
+      IsOkAndHolds(true));
+  ASSERT_OK(verifier().Run(module.get()).status());
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      ROOT transpose
+CHECK-SAME: f32[1,6,7]{2,1,0} transpose
+CHECK-SAME: dimensions={1,2,0}
+)"),
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -1058,9 +1103,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
@@ -1068,7 +1113,7 @@ CHECK:      transpose
 CHECK-SAME: f32[3,2,7]{2,1,0} transpose
 CHECK-SAME: dimensions={2,0,1}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -1095,9 +1140,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -1105,7 +1150,7 @@ CHECK:      f32[2,3,5]{2,1,0} $0
 CHECK-NEXT: f32[2,5,3]{2,1,0} transpose
 )",
                                             HloOpcodeString(opcode))),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -1134,9 +1179,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   // Checks that transpose is on rank 3 tensor from hoisting bitcast1, not rank
   // 4 tensor from hoisting bitcast0 first and then failing to hoist bitcast1.
@@ -1146,7 +1191,7 @@ CHECK:      transpose
 CHECK-SAME: f16[3,1152,122]{2,1,0} transpose
 CHECK-SAME: dimensions={0,2,1}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedDownThroughTransposes) {
@@ -1173,9 +1218,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
@@ -1183,7 +1228,7 @@ CHECK:      ROOT transpose
 CHECK-SAME: f32[5,2,3]{2,1,0} transpose
 CHECK-SAME: dimensions={2,0,1}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedDownThroughBroadcasts) {
@@ -1209,9 +1254,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
@@ -1219,7 +1264,7 @@ CHECK:      ROOT broadcast
 CHECK-SAME: f32[3,5,6,2]{2,1,0,3} broadcast
 CHECK-SAME: dimensions={0,1}
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -1246,9 +1291,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -1256,7 +1301,7 @@ CHECK:      f32[2,3,5]{2,1,0} $0(dot)
 CHECK-NEXT: f32[2,3,5]{2,0,1} broadcast
 )",
                                             HloOpcodeString(opcode))),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastRootsAreHoistedDown) {
@@ -1281,15 +1326,15 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: ROOT dot
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -1318,15 +1363,15 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: ROOT add = f32[3,5]{1,0} add
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -1359,9 +1404,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK-NOT: bitcast
@@ -1376,7 +1421,7 @@ CHECK: f32[2,7]{1,0} bitcast(p0
 CHECK: result = f32[2,7,15,11]{2,1,0,3} fusion
 CHECK: ROOT {{.*}} = f32[15,11,14]{0,2,1} bitcast(result)
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -1410,9 +1455,9 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK-NOT: bitcast
@@ -1427,7 +1472,7 @@ CHECK: f32[7,3,2]{2,0,1} bitcast(p0
 CHECK: result = f32[3,5,2]{2,1,0} fusion
 CHECK: ROOT {{.*}} = f32[2,3,5]{0,2,1} bitcast(result)
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -1458,16 +1503,16 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(
                               absl::Substitute(hlo, HloOpcodeString(opcode))));
-  EXPECT_THAT(
+  ASSERT_THAT(
       NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK-NOT: bitcast
 CHECK-NOT: reshape
 CHECK: ENTRY
 )"),
-      absl_testing::IsOkAndHolds(true));
+      IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -1509,7 +1554,7 @@ CHECK-NOT: bitcast
 CHECK-NOT: reshape
         )",
                                             HloOpcodeString(opcode))),
-              absl_testing::IsOkAndHolds(true));
+              IsOkAndHolds(true));
   ASSERT_OK(verifier().Run(module.get()).status());
 }
 
@@ -1521,6 +1566,44 @@ INSTANTIATE_TEST_SUITE_P(NestGemmFusionReshapeTestSuite,
                            return std::string(HloOpcodeString(info.param));
                          });
 
+struct CommonFactorsTestCase {
+  std::vector<int64_t> from, to;
+  absl::InlinedVector<std::pair<int64_t, int64_t>, 8> expected;
+};
+
+class CommonFactorsMergingTrivialRangesTest
+    : public ::testing::TestWithParam<CommonFactorsTestCase> {};
+
+TEST_P(CommonFactorsMergingTrivialRangesTest, Example) {
+  const CommonFactorsTestCase& test_case = GetParam();
+  EXPECT_EQ(test_case.expected, detail::CommonFactorsMergingTrivialRanges(
+                                    test_case.from, test_case.to));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    CommonFactorsMergingTrivialRangesTestSuite,
+    CommonFactorsMergingTrivialRangesTest,
+    ::testing::Values(
+        CommonFactorsTestCase{{1}, {}, {{0, 0}, {1, 0}}},
+        CommonFactorsTestCase{{}, {1}, {{0, 0}, {0, 1}}},
+        CommonFactorsTestCase{{}, {}, {{0, 0}}},
+        CommonFactorsTestCase{{1, 2, 0}, {2, 0, 3}, {{0, 0}, {3, 3}}},
+        CommonFactorsTestCase{{2, 3, 0}, {1, 0, 1000}, {{0, 0}, {3, 3}}},
+        CommonFactorsTestCase{{1, 1, 1}, {1, 1}, {{0, 0}, {1, 1}, {3, 2}}},
+        CommonFactorsTestCase{{1, 1, 3}, {3, 1, 1}, {{0, 0}, {3, 3}}},
+        CommonFactorsTestCase{{2, 6}, {4, 3}, {{0, 0}, {2, 2}}},
+        CommonFactorsTestCase{{1, 2, 6}, {4, 1, 3, 1}, {{0, 0}, {3, 4}}},
+        CommonFactorsTestCase{{2, 3, 4, 5}, {6, 20}, {{0, 0}, {2, 1}, {4, 2}}},
+        CommonFactorsTestCase{
+            {2, 3, 4, 5, 6}, {6, 20, 6}, {{0, 0}, {2, 1}, {4, 2}, {5, 3}}},
+        CommonFactorsTestCase{{2, 2, 2, 2}, {4, 4}, {{0, 0}, {2, 1}, {4, 2}}},
+        CommonFactorsTestCase{
+            {2, 5, 1, 3}, {1, 10, 3, 1}, {{0, 0}, {2, 2}, {4, 4}}}),
+    [](const ::testing::TestParamInfo<CommonFactorsTestCase>& info) {
+      return absl::StrCat(absl::StrJoin(info.param.from, "_"), "_to_",
+                          absl::StrJoin(info.param.to, "_"));
+    });
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla

From 19a899a5b8475fa9d72fbc16b09c5a90381d7c18 Mon Sep 17 00:00:00 2001
From: Marcin Radomski <dextero@google.com>
Date: Tue, 9 Dec 2025 10:07:23 -0800
Subject: [PATCH 086/753] [XLA:GPU] Make float check more parallel

The goal is to count NaNs/infs/zeros in a buffer of floats, and append
the results to a BufferDebugLog stored in device memory. This used to be
done on a single thread block with poor performance.

This CL changes it to a 2-step process:
1. Do partial accumulation into a temporary buffer.
2. Use a second kernel to reduce partial results down into scalars and
   append them to the log.

This also includes some optimizations suggested by gflegar:
* Use array-of-structs over struct-of-arrays for __shared__ memory in
  step 1
* Always use 1024 threads per block to avoid switching at kernel runtime
* Read global memory 128bits a time

PiperOrigin-RevId: 842289188
---
 .../xla/xla/backends/gpu/runtime/BUILD        |   7 +
 .../gpu/runtime/buffer_debug_log_structs.h    |  19 +
 .../gpu/runtime/buffers_float_check_thunk.cc  |  65 +++-
 .../gpu/runtime/buffers_float_check_thunk.h   |  13 +-
 .../runtime/buffers_float_check_thunk_test.cc |  29 +-
 .../runtime/thunk_buffer_debug_float_check.cc |  56 ++-
 .../runtime/thunk_buffer_debug_pass_test.cc   |  22 +-
 .../xla/xla/stream_executor/cuda/BUILD        |   3 +
 ...buffer_debug_float_check_kernel_cuda.cu.cc | 342 ++++++++++--------
 ...ffer_debug_float_check_kernel_cuda_test.cc | 135 +++++--
 .../gpu/buffer_debug_float_check_kernel.h     |  29 +-
 11 files changed, 495 insertions(+), 225 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 856ea1bc586623..27b4149b9e25bd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -3259,6 +3259,7 @@ cc_library(
         ":thunk_id",
         ":thunk_pass_pipeline",
         "//xla:shape_util",
+        "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/gpu:ffi",
@@ -3287,6 +3288,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@com_googlesource_code_re2//:re2",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -3463,9 +3465,13 @@ cc_library(
         ":buffer_debug_log_structs",
         ":thunk",
         "//xla:types",
+        "//xla:util",
         "//xla/service:buffer_assignment",
         "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:launch_dim",
+        "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/stream_executor/cuda:cuda_platform_id",
@@ -3477,6 +3483,7 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
diff --git a/third_party/xla/xla/backends/gpu/runtime/buffer_debug_log_structs.h b/third_party/xla/xla/backends/gpu/runtime/buffer_debug_log_structs.h
index 9ff067c00b633d..9d3492ae964f33 100644
--- a/third_party/xla/xla/backends/gpu/runtime/buffer_debug_log_structs.h
+++ b/third_party/xla/xla/backends/gpu/runtime/buffer_debug_log_structs.h
@@ -54,6 +54,25 @@ static_assert(sizeof(BufferDebugLogEntry) == sizeof(uint32_t) * 2);
 static_assert(offsetof(BufferDebugLogEntry, entry_id) == 0);
 static_assert(offsetof(BufferDebugLogEntry, value) == sizeof(uint32_t));
 
+struct FloatCheckResult {
+  uint32_t nan_count;
+  uint32_t inf_count;
+  uint32_t zero_count;
+
+  template <typename Sink>
+  friend void AbslStringify(Sink& sink, const FloatCheckResult& result) {
+    absl::Format(&sink, "{nan_count: %u, inf_count: %u, zero_count: %u}",
+                 result.nan_count, result.inf_count, result.zero_count);
+  }
+};
+
+// The struct layout must match on both host and device.
+static_assert(_Alignof(FloatCheckResult) == _Alignof(uint32_t));
+static_assert(sizeof(FloatCheckResult) == sizeof(uint32_t) * 3);
+static_assert(offsetof(FloatCheckResult, nan_count) == 0);
+static_assert(offsetof(FloatCheckResult, inf_count) == sizeof(uint32_t));
+static_assert(offsetof(FloatCheckResult, zero_count) == sizeof(uint32_t) * 2);
+
 struct BufferDebugFloatCheckEntry {
   // An ID that uniquely identifies a log entry within a HLO module execution.
   BufferDebugLogEntryId entry_id;
diff --git a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.cc
index 6ff174e2a418d2..d6b8b04c70c47a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.cc
@@ -15,11 +15,15 @@ limitations under the License.
 
 #include "xla/backends/gpu/runtime/buffers_float_check_thunk.h"
 
+#include <algorithm>
+#include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <utility>
 
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
@@ -30,14 +34,18 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/gpu/buffer_debug_float_check_kernel.h"
 #include "xla/stream_executor/gpu/buffer_debug_log.h"
 #include "xla/stream_executor/gpu/gpu_kernel_registry.h"
 #include "xla/stream_executor/launch_dim.h"
+#include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/types.h"
+#include "xla/util.h"
 
 namespace xla::gpu {
 
@@ -73,15 +81,33 @@ absl::Status BuffersDebugFloatCheckThunk::Initialize(
           auto kernel_bf16,
           registry.LoadKernel<se::gpu::BufferDebugFloatCheckBf16Kernel>(
               params.executor));
+      TF_ASSIGN_OR_RETURN(
+          auto kernel_reduce,
+          registry.LoadKernel<
+              se::gpu::BufferDebugAppendReducedFloatCheckResultsKernel>(
+              params.executor));
       kernels_[params.executor] = std::make_unique<Kernels>(
-          Kernels{std::move(kernel_f32), std::move(kernel_bf16)});
+          Kernels{std::move(kernel_f32), std::move(kernel_bf16),
+                  std::move(kernel_reduce)});
+      VLOG(1) << "NanCount kernels loaded";
     }
   }
 
-  VLOG(1) << "FloatCheck kernel loaded";
   return absl::OkStatus();
 }
 
+template <typename T>
+se::BlockDim GetBlockDimForBuffer(se::Stream* stream,
+                                  se::DeviceMemory<T> buffer,
+                                  int64_t max_blocks) {
+  const int64_t num_elements = buffer.size() / sizeof(T);
+  const se::DeviceDescription& desc = stream->parent()->GetDeviceDescription();
+  const int64_t num_blocks =
+      std::min(xla::CeilOfRatio(num_elements, desc.threads_per_block_limit()),
+               max_blocks);
+  return se::BlockDim(num_blocks);
+}
+
 absl::Status BuffersDebugFloatCheckThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   se::StreamExecutor* executor = params.stream->parent();
@@ -102,8 +128,13 @@ absl::Status BuffersDebugFloatCheckThunk::ExecuteOnStream(
 
   VLOG(1) << "BuffersDebugFloatCheckThunk::ExecuteOnStream";
 
-  const se::ThreadDim thread_dim(
-      executor->GetDeviceDescription().threads_per_block_limit(), 1, 1);
+  se::DeviceAddress<xla::gpu::FloatCheckResult> tmp_ptr(
+      params.buffer_allocations->GetDeviceAddress(tmp_slice_));
+  const size_t tmp_size_elements =
+      tmp_slice_.size() / sizeof(xla::gpu::FloatCheckResult);
+  CHECK_GT(tmp_size_elements, 0)
+      << "tmp_slice_ is too small to hold any results, this should have been "
+         "caught during initialization";
 
   se::DeviceAddress<uint8_t> log_ptr(
       params.buffer_allocations->GetDeviceAddress(log_slice_));
@@ -111,6 +142,8 @@ absl::Status BuffersDebugFloatCheckThunk::ExecuteOnStream(
       se::gpu::BufferDebugLog<
           BufferDebugFloatCheckEntry>::FromDeviceAddressUnchecked(log_ptr);
   const uint32_t execution_id = execution_count_.fetch_add(1);
+  // The kernel assumes 1024 threads per block.
+  const se::ThreadDim thread_dim(1024);
 
   for (const auto& [buffer_idx, buffer] : checked_thunk_buffers_) {
     BufferDebugLogEntryMetadataStore::Metadata metadata{
@@ -130,22 +163,32 @@ absl::Status BuffersDebugFloatCheckThunk::ExecuteOnStream(
       VLOG(1) << "F32 buffer detected with id: " << entry_id
               << " and size: " << device_buffer.size();
       se::DeviceAddress<float> f32_buffer(device_buffer);
-      TF_RETURN_IF_ERROR(kernels->f32.Launch(
-          thread_dim, se::BlockDim(1, 1, 1), params.stream, entry_id,
-          f32_buffer, f32_buffer.size(), buffer_debug_log.GetDeviceHeader(),
-          buffer_debug_log.GetDeviceEntries()));
+      const se::BlockDim block_dim = GetBlockDimForBuffer<float>(
+          params.stream, f32_buffer, tmp_size_elements);
+      TF_RETURN_IF_ERROR(
+          kernels->f32.Launch(thread_dim, block_dim, params.stream, f32_buffer,
+                              f32_buffer.size(), tmp_ptr, tmp_size_elements));
     } else if (buffer_type == PrimitiveType::BF16) {
       VLOG(1) << "BF16 buffer detected with id: " << entry_id
               << " and size: " << device_buffer.size();
       se::DeviceAddress<Eigen::bfloat16> bf16_buffer(device_buffer);
+      const se::BlockDim block_dim = GetBlockDimForBuffer<Eigen::bfloat16>(
+          params.stream, bf16_buffer, tmp_size_elements);
       TF_RETURN_IF_ERROR(kernels->bf16.Launch(
-          thread_dim, se::BlockDim(1, 1, 1), params.stream, entry_id,
-          bf16_buffer, bf16_buffer.size(), buffer_debug_log.GetDeviceHeader(),
-          buffer_debug_log.GetDeviceEntries()));
+          thread_dim, block_dim, params.stream, bf16_buffer, bf16_buffer.size(),
+          tmp_ptr, tmp_size_elements));
     } else {
       VLOG(1) << "Unsupported primitive type for float checking: "
               << PrimitiveType_Name(buffer_type);
+      continue;
     }
+
+    // Operations on the same stream perform in sequence, so at this point the
+    // results of the previous FloatCheck operation are available.
+    TF_RETURN_IF_ERROR(kernels->reduce.Launch(
+        thread_dim, se::BlockDim(1, 1, 1), params.stream, tmp_ptr,
+        tmp_size_elements, entry_id, buffer_debug_log.GetDeviceHeader(),
+        buffer_debug_log.GetDeviceEntries()));
   }
 
   return absl::OkStatus();
diff --git a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.h b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.h
index 5d2f78e80edb99..f73c9ef305fde6 100644
--- a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <atomic>
 #include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <utility>
@@ -38,18 +39,21 @@ class BuffersDebugFloatCheckThunk : public Thunk {
  public:
   explicit BuffersDebugFloatCheckThunk(
       ThunkInfo info, const ThunkInfo& checked_thunk_info,
-      BufferAllocation::Slice log_slice,
+      BufferAllocation::Slice log_slice, BufferAllocation::Slice tmp_slice,
       absl::flat_hash_map<size_t, BufferAllocation::Slice>
           checked_thunk_buffers,
       std::shared_ptr<BufferDebugLogEntryMetadataStore> metadata_store)
       : Thunk(Thunk::Kind::kBuffersDebugFloatCheck, std::move(info)),
         log_slice_(log_slice),
+        tmp_slice_(tmp_slice),
         checked_thunk_info_(checked_thunk_info),
         checked_thunk_buffers_(std::move(checked_thunk_buffers)),
         metadata_store_(std::move(metadata_store)) {}
 
-  absl::Status Initialize(const InitializeParams& params) override;
-  absl::Status ExecuteOnStream(const ExecuteParams& params) override;
+  absl::Status Initialize(const InitializeParams& params) override
+      ABSL_LOCKS_EXCLUDED(kernels_mutex_);
+  absl::Status ExecuteOnStream(const ExecuteParams& params) override
+      ABSL_LOCKS_EXCLUDED(kernels_mutex_);
 
   std::string ToString(int indent) const override;
 
@@ -67,6 +71,8 @@ class BuffersDebugFloatCheckThunk : public Thunk {
   struct Kernels {
     stream_executor::gpu::BufferDebugFloatCheckF32Kernel::KernelType f32;
     stream_executor::gpu::BufferDebugFloatCheckBf16Kernel::KernelType bf16;
+    stream_executor::gpu::BufferDebugAppendReducedFloatCheckResultsKernel::
+        KernelType reduce;
   };
   absl::Mutex kernels_mutex_;
   // Each loaded kernel is associated with a specific device (represented by its
@@ -79,6 +85,7 @@ class BuffersDebugFloatCheckThunk : public Thunk {
       kernels_ ABSL_GUARDED_BY(kernels_mutex_);
 
   BufferAllocation::Slice log_slice_;
+  BufferAllocation::Slice tmp_slice_;
   ThunkInfo checked_thunk_info_;
   absl::flat_hash_map<size_t, BufferAllocation::Slice> checked_thunk_buffers_;
   std::shared_ptr<BufferDebugLogEntryMetadataStore> metadata_store_;
diff --git a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc
index dfb933bce2a4ee..c56538c15a6e39 100644
--- a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc
@@ -101,17 +101,20 @@ class BuffersDebugFloatCheckThunkTest : public ::testing::Test {
 TEST_F(BuffersDebugFloatCheckThunkTest, CalculatesNanCounts) {
   static constexpr size_t kLogSize =
       BufferDebugLog<BufferDebugFloatCheckEntry>::RequiredSizeForEntries(10);
+  static constexpr size_t kTmpSizeElems = 1024;
+  static constexpr size_t kTmpSizeBytes = kTmpSizeElems * sizeof(uint32_t);
   static constexpr size_t kInputElems = 1024;
   static constexpr size_t kInputSizeInBytes = kInputElems * sizeof(float);
   static constexpr size_t kTotalDeviceMemoryBytes =
-      kLogSize + kInputSizeInBytes * 2;
+      kLogSize + kTmpSizeBytes + kInputSizeInBytes * 2;
   // Setup memory allocations for the log and inputs
   BufferAllocation alloc(/*index=*/0,
                          /*size=*/kTotalDeviceMemoryBytes,
                          /*color=*/0);
   int64_t input_offset = kLogSize;
   BufferAllocation::Slice log_slice(&alloc, /*offset=*/0, kLogSize);
-  input_offset += kLogSize;
+  BufferAllocation::Slice tmp_slice(&alloc, /*offset=*/kLogSize, kTmpSizeBytes);
+  input_offset += kLogSize + kTmpSizeBytes;
 
   BufferAllocation::Slice inputs[2];
   int64_t input_size_bf16 = kInputElems * sizeof(Eigen::bfloat16);
@@ -159,7 +162,7 @@ TEST_F(BuffersDebugFloatCheckThunkTest, CalculatesNanCounts) {
   Thunk::ThunkInfo checked_thunk_info;
   checked_thunk_info.thunk_id = ThunkId(123);
   BuffersDebugFloatCheckThunk thunk(
-      Thunk::ThunkInfo(), checked_thunk_info, log_slice,
+      Thunk::ThunkInfo(), checked_thunk_info, log_slice, tmp_slice,
       {{/*buffer_idx=*/0, inputs[0]}, {/*buffer_idx=*/1, inputs[1]}},
       metadata_store);
   TF_ASSERT_OK(thunk.Initialize(init_params));
@@ -202,8 +205,13 @@ TEST_F(BuffersDebugFloatCheckThunkTest,
     GTEST_SKIP() << "need at least 2 devices for this test";
   }
 
+  static constexpr size_t kLogOffset = 0;
   static constexpr size_t kLogSizeBytes = 1024;
+  static constexpr size_t kTmpOffset = kLogOffset + kLogSizeBytes;
+  static constexpr size_t kTmpSizeBytes = 1024 * sizeof(uint32_t);
+  static constexpr size_t kInputOffset = kTmpOffset + kTmpSizeBytes;
   static constexpr size_t kInputSizeBytes = 1024;
+  static constexpr size_t kTotalDeviceMemory = kInputOffset + kInputSizeBytes;
 
   struct TestDevice {
     se::StreamExecutor* executor;
@@ -219,7 +227,7 @@ TEST_F(BuffersDebugFloatCheckThunkTest,
     auto allocator =
         std::make_unique<se::StreamExecutorMemoryAllocator>(executor);
     BufferAllocations allocations(
-        {executor->AllocateArray<uint8_t>(kLogSizeBytes + kInputSizeBytes)},
+        {executor->AllocateArray<uint8_t>(kTotalDeviceMemory)},
         executor->device_ordinal(), allocator.get());
 
     return TestDevice{std::move(executor), std::move(stream),
@@ -227,16 +235,17 @@ TEST_F(BuffersDebugFloatCheckThunkTest,
   };
   TF_ASSERT_OK_AND_ASSIGN(TestDevice device0, setup_device(0));
   TF_ASSERT_OK_AND_ASSIGN(TestDevice device1, setup_device(1));
-  BufferAllocation allocation(0, kLogSizeBytes + kInputSizeBytes, 0);
-  BufferAllocation::Slice log_slice(&allocation, 0, kLogSizeBytes);
-  BufferAllocation::Slice f32_slice(&allocation, kLogSizeBytes, kInputSizeBytes,
+  BufferAllocation allocation(/*index=*/0, kTotalDeviceMemory, /*color=*/0);
+  BufferAllocation::Slice log_slice(&allocation, kLogOffset, kLogSizeBytes);
+  BufferAllocation::Slice tmp_slice(&allocation, kTmpOffset, kTmpSizeBytes);
+  BufferAllocation::Slice f32_slice(&allocation, kInputOffset, kInputSizeBytes,
                                     PrimitiveType::F32);
-  BufferAllocation::Slice bf16_slice(&allocation, kLogSizeBytes,
-                                     kInputSizeBytes, PrimitiveType::BF16);
+  BufferAllocation::Slice bf16_slice(&allocation, kInputOffset, kInputSizeBytes,
+                                     PrimitiveType::BF16);
   Thunk::ThunkInfo checked_thunk_info;
   checked_thunk_info.thunk_id = ThunkId(123);
   BuffersDebugFloatCheckThunk thunk(
-      Thunk::ThunkInfo(), checked_thunk_info, log_slice,
+      Thunk::ThunkInfo(), checked_thunk_info, log_slice, tmp_slice,
       {{/*buffer_idx=*/0, f32_slice}, {/*buffer_idx=*/1, bf16_slice}},
       std::make_shared<BufferDebugLogEntryMetadataStore>());
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_float_check.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_float_check.cc
index 8241084f52831c..808142a0e2a45c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_float_check.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_float_check.cc
@@ -15,7 +15,10 @@ limitations under the License.
 
 #include "xla/backends/gpu/runtime/thunk_buffer_debug_float_check.h"
 
+#include <algorithm>
+#include <cmath>
 #include <cstddef>
+#include <cstdint>
 #include <cstring>
 #include <memory>
 #include <optional>
@@ -32,6 +35,7 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "Eigen/Core"
 #include "xla/backends/gpu/ffi.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log.pb.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log_entry_metadata_store.h"
@@ -59,6 +63,7 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
 
@@ -72,10 +77,41 @@ constexpr size_t kLogSizeBytes = 64 * 1024;
 
 namespace {
 
-std::unique_ptr<Thunk> WrapWithFloatCheckThunk(
+size_t CalculateTempBufferSize(const Thunk& thunk) {
+  size_t max_buffer_size_bytes = 0;
+  for (const BufferUse& use : thunk.buffer_uses()) {
+    if (use.HasDefinedContentsOnInput() || use.HasDefinedContentsOnOutput()) {
+      max_buffer_size_bytes =
+          std::max<size_t>(max_buffer_size_bytes, use.slice().size());
+    }
+  }
+
+  // We're doing the float checks in 2 steps:
+  // - parallel aggregation: one thread block writes partial result into the
+  //   temp buffer. The number of thread blocks used will be limtied by the size
+  //   calculated here.
+  // - reduction of the temp buffer on a single thread block
+  // To optimize for time, we want to do as much computation in parallel as we
+  // can, but also consider the overhead of single-block reduction step.
+
+  // Avoid making the reduction step use less than a block's worth of data. We
+  // can't go any faster than that anyway.
+  static constexpr size_t kMinElements = 1024;
+  // Arbitrary limit of 1Mi elements. This should be enough to accomodate the
+  // max number of thread blocks available on any supported GPU.
+  static constexpr size_t kMaxElements = 1024 * 1024;
+  const size_t size_elems =
+      xla::CeilOfRatio(max_buffer_size_bytes, sizeof(uint32_t));
+  const size_t sqrt_size_elems = std::sqrt(size_elems);
+  return std::clamp(xla::CeilOfRatio(size_elems, sqrt_size_elems), kMinElements,
+                    kMaxElements);
+}
+
+absl::StatusOr<std::unique_ptr<Thunk>> WrapWithFloatCheckThunk(
     std::unique_ptr<Thunk> thunk, BufferAllocation::Slice log_slice,
     const Thunk& predecessor_thunk, Thunk& successor_thunk,
-    std::shared_ptr<BufferDebugLogEntryMetadataStore> metadata_store) {
+    std::shared_ptr<BufferDebugLogEntryMetadataStore> metadata_store,
+    ThunkPassBufferAllocator& allocator) {
   const auto& thunk_buffers = thunk->buffer_uses();
   if (thunk_buffers.empty()) {
     VLOG(1) << "No buffers in thunk " << thunk->thunk_info().thunk_id
@@ -120,6 +156,12 @@ std::unique_ptr<Thunk> WrapWithFloatCheckThunk(
     return thunk;
   }
 
+  const size_t temp_buffer_size_bytes =
+      CalculateTempBufferSize(*thunk) * sizeof(xla::gpu::FloatCheckResult);
+  TF_ASSIGN_OR_RETURN(BufferAllocation * tmp_alloc,
+                      allocator.NewEmptyAllocation(temp_buffer_size_bytes));
+  BufferAllocation::Slice tmp_slice(tmp_alloc, 0, tmp_alloc->size());
+
   VLOG(1) << "Wrapping thunk " << thunk->thunk_info().thunk_id
           << " with float check thunk due to presence of buffers: "
           << buffers_to_check.size();
@@ -128,7 +170,7 @@ std::unique_ptr<Thunk> WrapWithFloatCheckThunk(
   thunk_and_checks.push_back(std::move(thunk));
   auto buffer_debug_float_check_thunk =
       std::make_unique<BuffersDebugFloatCheckThunk>(
-          Thunk::ThunkInfo(), thunk_ptr->thunk_info(), log_slice,
+          Thunk::ThunkInfo(), thunk_ptr->thunk_info(), log_slice, tmp_slice,
           std::move(buffers_to_check), std::move(metadata_store));
   buffer_debug_float_check_thunk->add_control_predecessor(thunk_ptr);
   thunk_and_checks.push_back(std::move(buffer_debug_float_check_thunk));
@@ -329,8 +371,9 @@ absl::Status RunFloatCheckPassInternal(SequentialThunk* root_thunk,
       CreateBufferDebugFloatCheckThunk(metadata_store, log_slice, hlo_module));
 
   ThunkFilter thunk_filter = CreateThunkFilter(debug_options);
-  TF_RETURN_IF_ERROR(
-      root_thunk->TransformAllNestedThunks([&](std::unique_ptr<Thunk> thunk) {
+  TF_RETURN_IF_ERROR(root_thunk->TransformAllNestedThunks(
+      [&](std::unique_ptr<Thunk> thunk)
+          -> absl::StatusOr<std::unique_ptr<Thunk>> {
         if (thunk_filter(*thunk) == InstrumentAction::kSkip) {
           return thunk;
         }
@@ -338,7 +381,8 @@ absl::Status RunFloatCheckPassInternal(SequentialThunk* root_thunk,
         return WrapWithFloatCheckThunk(
             std::move(thunk), log_slice,
             /*predecessor_thunk=*/*buffer_debug_init_thunk,
-            /*successor_thunk=*/*buffer_debug_dump_thunk, metadata_store);
+            /*successor_thunk=*/*buffer_debug_dump_thunk, metadata_store,
+            allocator);
       }));
 
   ThunkSequence& thunks = root_thunk->thunks();
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc
index cb4e449e4a8bb1..62bc737997450a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc
@@ -58,6 +58,7 @@ namespace {
 
 using testing::ElementsAre;
 using testing::Eq;
+using testing::IsEmpty;
 using testing::Pair;
 using testing::Pointer;
 using testing::SizeIs;
@@ -102,17 +103,16 @@ using SliceList =
 class FakeThunkPassBufferAllocator : public ThunkPassBufferAllocator {
  public:
   absl::StatusOr<BufferAllocation*> NewEmptyAllocation(int64_t size) override {
-    if (CreatedAlloc()) {
-      return absl::InvalidArgumentError("Expected only one allocation");
-    }
-    alloc_ = std::make_unique<BufferAllocation>(0, size, 0);
-    return alloc_.get();
+    allocs_.push_back(std::make_unique<BufferAllocation>(0, size, 0));
+    return allocs_.back().get();
   }
 
-  bool CreatedAlloc() { return alloc_ != nullptr; }
+  const std::vector<std::unique_ptr<BufferAllocation>>& allocs() const {
+    return allocs_;
+  }
 
  private:
-  std::unique_ptr<BufferAllocation> alloc_;
+  std::vector<std::unique_ptr<BufferAllocation>> allocs_;
 };
 
 class FakeThunk : public Thunk {
@@ -188,6 +188,7 @@ TEST_F(ThunkBufferDebugPassTest, IsNoOpWhenHloModuleIsNull) {
                              /*hlo_module=*/nullptr, device_info, allocator));
   EXPECT_FALSE(changed);
   EXPECT_THAT(root_thunk->thunks(), ElementsAre(Pointer(fake_thunk_ptr)));
+  EXPECT_THAT(allocator.allocs(), IsEmpty());
 }
 
 TEST_F(ThunkBufferDebugPassTest, InsertsBuffersDebugChecksumThunks) {
@@ -256,6 +257,8 @@ TEST_F(ThunkBufferDebugPassTest, InsertsBuffersDebugChecksumThunks) {
                                                 {2, slice_io},
                                             }))),
           IsCustomCallThunkWithTargetName("xla_gpu_buffer_debug_log_dump")));
+
+  EXPECT_THAT(allocator.allocs(), SizeIs(1));
 }
 
 TEST_F(ThunkBufferDebugPassTest, RecursivelyInsertsBuffersDebugChecksumThunks) {
@@ -461,6 +464,8 @@ TEST_F(ThunkBufferDebugPassTest, RecursivelyInsertsBuffersDebugChecksumThunks) {
                     Pointer(branch1_thunk_ptr),
                     IsChecksumThunkChecking(SliceList{{0, slice_branch1}})));
   }
+
+  EXPECT_THAT(allocator.allocs(), SizeIs(1));
 }
 
 TEST_F(ThunkBufferDebugPassTest, InsertsBuffersDebugFloatCheckThunks) {
@@ -544,6 +549,9 @@ TEST_F(ThunkBufferDebugPassTest, InsertsBuffersDebugFloatCheckThunks) {
       static_cast<const BuffersDebugFloatCheckThunk&>(*sub_thunks[1]);
   EXPECT_THAT(buffer_debug_after_fake_thunk.buffer_slices(),
               UnorderedElementsAre(Pair(1, slice_o), Pair(2, slice_io)));
+
+  // 1 for the log buffer, 1 per wrapped thunk for the temp buffer
+  EXPECT_THAT(allocator.allocs(), SizeIs(2));
 }
 
 TEST_F(ThunkBufferDebugPassTest, BufferSaverInserter) {
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index ba0403fbed0832..1342a13898784c 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -453,7 +453,9 @@ cuda_library(
         "gpu",
     ],
     deps = [
+        ":cuda_platform",
         ":cuda_platform_id",
+        "//xla:util",
         "//xla/backends/gpu/runtime:buffer_debug_log_structs",
         "//xla/stream_executor:kernel_spec",
         "//xla/stream_executor/gpu:buffer_debug_float_check_kernel",
@@ -475,6 +477,7 @@ xla_test(
         "//xla/backends/gpu/runtime:buffer_debug_log_structs",
         "//xla/backends/gpu/runtime:thunk_id",
         "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_description",
         "//xla/stream_executor:launch_dim",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
diff --git a/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda.cu.cc b/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda.cu.cc
index 2325478c1256fb..4f6e94ab7ccce2 100644
--- a/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda.cu.cc
+++ b/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda.cu.cc
@@ -13,9 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <algorithm>
+#include <array>
 #include <cassert>
 #include <cmath>
+#include <cstddef>
 #include <cstdint>
+#include <optional>
+#include <tuple>
 
 #include "absl/base/casts.h"
 #include "third_party/gpus/cuda/include/cuda/atomic"
@@ -24,11 +29,29 @@ limitations under the License.
 #include "xla/stream_executor/gpu/buffer_debug_float_check_kernel.h"
 #include "xla/stream_executor/gpu/gpu_kernel_registry.h"
 #include "xla/stream_executor/kernel_spec.h"
+#include "xla/util.h"
 
 namespace se = stream_executor;
 
 namespace {
 
+using xla::gpu::FloatCheckResult;
+
+// https://developer.nvidia.com/blog/cuda-refresher-cuda-programming-model/:
+// > CUDA architecture limits the numbers of threads per block (1024 threads
+// > per block limit).
+static constexpr uint64_t kBlockSize = 1024;
+// warpSize is not a compile time constant on all OSS CI builds, but we need it
+// to be one for static array initialization. We assert this value matches
+// warpSize at runtime.
+static constexpr uint64_t kWarpSize = 32;
+static constexpr uint64_t kMaxWarpsPerBlock = kBlockSize / kWarpSize;
+template <typename T>
+static constexpr uint64_t kElementsPerMemoryAccess =
+    std::max<uint64_t>(16 / sizeof(T), 1);
+template <typename T>
+using Chunk = std::array<T, kElementsPerMemoryAccess<T>>;
+
 __device__ unsigned int ThreadIdx() {
   return threadIdx.z * blockDim.y * blockDim.x + threadIdx.y * blockDim.x +
          threadIdx.x;
@@ -39,16 +62,57 @@ __device__ unsigned int BlockIdx() {
          blockIdx.x;
 }
 
-// Based on
-// https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
-template <unsigned int BLOCK_SIZE>
-__device__ void WarpReduceSum(unsigned int tid, volatile uint32_t* data) {
-  if (BLOCK_SIZE >= 64) data[tid] += data[tid + 32];
-  if (BLOCK_SIZE >= 32) data[tid] += data[tid + 16];
-  if (BLOCK_SIZE >= 16) data[tid] += data[tid + 8];
-  if (BLOCK_SIZE >= 8) data[tid] += data[tid + 4];
-  if (BLOCK_SIZE >= 4) data[tid] += data[tid + 2];
-  if (BLOCK_SIZE >= 2) data[tid] += data[tid + 1];
+// Reduce a warp worth of values into a single one and have the 0th thread in
+// the warp return it.
+__device__ uint32_t WarpReduceSum(uint32_t value) {
+  static constexpr uint32_t kFullMask = ~0;
+  for (unsigned int offset = 1; offset < kWarpSize; offset <<= 1) {
+    value += __shfl_down_sync(kFullMask, value, offset);
+  }
+  return value;
+}
+
+// Sum up a block worth of FloatCheckResults into a single one and have the 0th
+// thread in the block return it.
+__device__ FloatCheckResult BlockReduceSum(uint32_t tid,
+                                           FloatCheckResult value) {
+  assert(kWarpSize == warpSize);
+  static_assert(kBlockSize == kWarpSize * kMaxWarpsPerBlock);
+  // Required to do the second warp reduction.
+  static_assert(kMaxWarpsPerBlock == kWarpSize);
+
+  const size_t warp_idx = tid / kWarpSize;
+  const size_t lane_idx = tid % kWarpSize;
+
+  value.nan_count = WarpReduceSum(value.nan_count);
+  value.inf_count = WarpReduceSum(value.inf_count);
+  value.zero_count = WarpReduceSum(value.zero_count);
+
+  __shared__ uint32_t scratch_nan[kMaxWarpsPerBlock];
+  __shared__ uint32_t scratch_inf[kMaxWarpsPerBlock];
+  __shared__ uint32_t scratch_zero[kMaxWarpsPerBlock];
+  if (lane_idx == 0) {
+    scratch_nan[warp_idx] = value.nan_count;
+    scratch_inf[warp_idx] = value.inf_count;
+    scratch_zero[warp_idx] = value.zero_count;
+  }
+
+  __syncthreads();
+  // The first warp reduces the results from all warps.
+  if (warp_idx == 0) {
+    value.nan_count = scratch_nan[lane_idx];
+    value.inf_count = scratch_inf[lane_idx];
+    value.zero_count = scratch_zero[lane_idx];
+    value.nan_count = WarpReduceSum(value.nan_count);
+    value.inf_count = WarpReduceSum(value.inf_count);
+    value.zero_count = WarpReduceSum(value.zero_count);
+  } else {
+    value.nan_count = 0;
+    value.inf_count = 0;
+    value.zero_count = 0;
+  }
+
+  return value;
 }
 
 __device__ inline bool IsNan(float v) { return isnan(v); }
@@ -60,173 +124,126 @@ __device__ inline bool IsZero(__nv_bfloat16 v) {
   return v == __nv_bfloat16(0.0f);
 }
 
-// Calculates count of NaNs of all elements of `input` and puts result in
-// `output`.
-//
-// Optimized implementation based on
-// https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
-// that takes advantage of `BLOCK_SIZE` threads.
-//
-// `BLOCK_SIZE` must be a power of 2 no larger than 1024.
-template <typename T, unsigned int BLOCK_SIZE>
-__device__ void ReduceSum(const T* input, uint64_t input_size,
-                          uint32_t* nan_counter, uint32_t* inf_counter,
-                          uint32_t* zero_counter) {
-  __shared__ uint32_t nan_count[BLOCK_SIZE];
-  __shared__ uint32_t inf_count[BLOCK_SIZE];
-  __shared__ uint32_t zero_count[BLOCK_SIZE];
+// Get a part of the input buffer current thread block is responsible for
+// processing, assuming the load is spread up to max_blocks across the entire
+// grid. If max_blocks is not provided, the entire grid is used.
+template <typename T>
+__device__ inline std::tuple<const T*, uint64_t> GetBlockInput(
+    const T* input, uint64_t input_size,
+    std::optional<uint64_t> max_blocks = std::nullopt) {
+  size_t grid_size = gridDim.x * gridDim.y * gridDim.z;
+  if (max_blocks.has_value()) {
+    grid_size = std::min<size_t>(grid_size, *max_blocks);
+  }
+  const uint64_t max_block_input_size = xla::RoundUpTo(
+      xla::CeilOfRatio(input_size, grid_size), kElementsPerMemoryAccess<T>);
+  const uint64_t block_input_offset = BlockIdx() * max_block_input_size;
+  const uint64_t block_input_size =
+      std::min(max_block_input_size, input_size - block_input_offset);
+  return {input + block_input_offset, block_input_size};
+}
 
-  assert(BlockIdx() == 0);
+template <typename T>
+__device__ FloatCheckResult CheckFloats(const T* input, uint64_t input_size,
+                                        uint64_t max_blocks) {
   const unsigned int tid = ThreadIdx();
+  const auto [block_input, block_input_size] =
+      GetBlockInput(input, input_size, max_blocks);
 
-  nan_count[tid] = 0;
-  inf_count[tid] = 0;
-  zero_count[tid] = 0;
-  for (unsigned int i = tid; i < input_size; i += BLOCK_SIZE) {
-    if (IsNan(input[i])) {
-      nan_count[tid]++;
-    }
-    if (IsInf(input[i])) {
-      inf_count[tid]++;
-    }
-    if (IsZero(input[i])) {
-      zero_count[tid]++;
-    }
-  }
-
-  __syncthreads();
+  const Chunk<T>* chunked_input =
+      reinterpret_cast<const Chunk<T>*>(block_input);
+  const uint64_t input_chunks =
+      xla::FloorOfRatio(block_input_size, kElementsPerMemoryAccess<T>);
+  // This may be less than block_input_size only for the last block.
+  const uint64_t chunked_input_size =
+      xla::RoundDownTo(block_input_size, kElementsPerMemoryAccess<T>);
 
-  if (BLOCK_SIZE >= 1024) {
-    if (tid < 512) {
-      nan_count[tid] += nan_count[tid + 512];
-      inf_count[tid] += inf_count[tid + 512];
-      zero_count[tid] += zero_count[tid + 512];
+  FloatCheckResult result{};
+  for (uint64_t i = tid; i < input_chunks; i += kBlockSize) {
+    Chunk<T> values = chunked_input[i];
+    for (const T value : values) {
+      result.nan_count += IsNan(value);
+      result.inf_count += IsInf(value);
+      result.zero_count += IsZero(value);
     }
-    __syncthreads();
   }
-  if (BLOCK_SIZE >= 512) {
-    if (tid < 256) {
-      nan_count[tid] += nan_count[tid + 256];
-      inf_count[tid] += inf_count[tid + 256];
-      zero_count[tid] += zero_count[tid + 256];
-    }
-    __syncthreads();
-  }
-  if (BLOCK_SIZE >= 256) {
-    if (tid < 128) {
-      nan_count[tid] += nan_count[tid + 128];
-      inf_count[tid] += inf_count[tid + 128];
-      zero_count[tid] += zero_count[tid + 128];
-    }
-    __syncthreads();
-  }
-  if (BLOCK_SIZE >= 128) {
-    if (tid < 64) {
-      nan_count[tid] += nan_count[tid + 64];
-      inf_count[tid] += inf_count[tid + 64];
-      zero_count[tid] += zero_count[tid + 64];
+
+  if (tid == 0 && chunked_input_size < block_input_size) {
+    const size_t rest = block_input_size - chunked_input_size;
+    for (uint64_t j = 0; j < rest; ++j) {
+      const T value = block_input[input_chunks + j];
+      result.nan_count += IsNan(value);
+      result.inf_count += IsInf(value);
+      result.zero_count += IsZero(value);
     }
-    __syncthreads();
-  }
-  if (tid < 32) {
-    WarpReduceSum<BLOCK_SIZE>(tid, nan_count);
-    WarpReduceSum<BLOCK_SIZE>(tid, inf_count);
-    WarpReduceSum<BLOCK_SIZE>(tid, zero_count);
   }
-  if (tid == 0) {
-    *nan_counter = nan_count[0];
-    *inf_counter = inf_count[0];
-    *zero_counter = zero_count[0];
+
+  return BlockReduceSum(tid, result);
+}
+
+__device__ FloatCheckResult ReduceResults(const FloatCheckResult* input,
+                                          uint64_t input_size) {
+  const unsigned int tid = ThreadIdx();
+  const auto [block_input, block_input_size] = GetBlockInput(input, input_size);
+
+  FloatCheckResult result{};
+  for (uint64_t i = tid; i < input_size; i += kBlockSize) {
+    const FloatCheckResult value = block_input[i];
+    result.nan_count += value.nan_count;
+    result.inf_count += value.inf_count;
+    result.zero_count += value.zero_count;
   }
+
+  // Now reduce a block worth of values into a single one.
+  return BlockReduceSum(tid, result);
 }
 
-// Attempts to append the NaN count of the `input` buffer to the
-// `float_check_entries`, using `log_header` to track available capacity and
-// used space.
-//
-// The log entry is tagged with `entry_id`. The NaN count is parallelized as
-// much as block dimensions allow it.
-//
-// If the log does not have enough space for the new entry, the entry is
-// discarded.
-//
-// `input_size_in_bytes` is the size of the input buffer in bytes.
-//
-// LIMITATIONS:
-// - Only a single thread block is supported.
-// - Block dimensions must be a power of 2.
+// Count the number of floats for NaNs, Infs and zeros in input buffer and store
+// partially accumulated results in the tmp array.
 template <typename T>
-__global__ void AppendFloatCheck(
-    xla::gpu::BufferDebugLogEntryId entry_id, const T* input,
-    uint64_t input_size_in_bytes, xla::gpu::BufferDebugLogHeader* log_header,
-    xla::gpu::BufferDebugFloatCheckEntry* float_check_entries) {
-  const uint32_t block_size = blockDim.x * blockDim.y * blockDim.z;
-  const uint64_t input_size = input_size_in_bytes / sizeof(T);
-  uint32_t nan_count = 0;
-  uint32_t inf_count = 0;
-  uint32_t zero_count = 0;
-
-  assert(gridDim.x == 1 && gridDim.y == 1 && gridDim.z == 1);
-  if (BlockIdx() != 0) {
+__global__ void FloatCheck(const T* input, uint64_t input_size,
+                           xla::gpu::FloatCheckResult* tmp, uint64_t tmp_size) {
+  assert(blockDim.x * blockDim.y * blockDim.z == kBlockSize);
+  assert(BlockIdx() < tmp_size);
+  if (BlockIdx() >= tmp_size) {
     return;
   }
 
-  // https://developer.nvidia.com/blog/cuda-refresher-cuda-programming-model/:
-  // > CUDA architecture limits the numbers of threads per block (1024 threads
-  // > per block limit).
-  switch (block_size) {
-    case 1024:
-      ReduceSum<T, 1024>(input, input_size, &nan_count, &inf_count,
-                         &zero_count);
-      break;
-    case 512:
-      ReduceSum<T, 512>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 256:
-      ReduceSum<T, 256>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 128:
-      ReduceSum<T, 128>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 64:
-      ReduceSum<T, 64>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 32:
-      ReduceSum<T, 32>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 16:
-      ReduceSum<T, 16>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 8:
-      ReduceSum<T, 8>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 4:
-      ReduceSum<T, 4>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 2:
-      ReduceSum<T, 2>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    case 1:
-      ReduceSum<T, 1>(input, input_size, &nan_count, &inf_count, &zero_count);
-      break;
-    default:
-      // Unsupported block size.
-      assert(false);
-      return;
+  const FloatCheckResult result = CheckFloats(input, input_size, tmp_size);
+  if (ThreadIdx() == 0) {
+    tmp[BlockIdx()] = result;
   }
+}
 
-  if (ThreadIdx() == 0) {
-    cuda::atomic_ref<uint32_t, cuda::thread_scope_system>
-        nan_count_log_write_idx(log_header->write_idx);
+// Reduce the partially accumulated results from `FloatCheck` invocations and
+// append the result to the buffer debug log.
+__global__ void ReduceFloatCheckResults(
+    xla::gpu::FloatCheckResult* tmp, uint64_t tmp_size,
+    xla::gpu::BufferDebugLogEntryId entry_id,
+    xla::gpu::BufferDebugLogHeader* log_header,
+    xla::gpu::BufferDebugFloatCheckEntry* log_entries) {
+  assert(blockDim.x * blockDim.y * blockDim.z == kBlockSize);
+  assert(BlockIdx() == 0);
+  if (BlockIdx() >= 1) {
+    return;
+  }
+
+  assert(tmp_size > 0);
+  FloatCheckResult total = ReduceResults(tmp, tmp_size);
+
+  if (BlockIdx() == 0 && ThreadIdx() == 0) {
+    cuda::atomic_ref<uint32_t, cuda::thread_scope_system> log_write_idx(
+        log_header->write_idx);
 #if __CUDA_ARCH__ >= 600
-    const uint32_t write_idx = nan_count_log_write_idx.fetch_add(1);
-    if (nan_count_log_write_idx.load() < log_header->capacity) {
-      float_check_entries[write_idx] = xla::gpu::BufferDebugFloatCheckEntry{
-          entry_id, nan_count, inf_count, zero_count};
+    const uint32_t write_idx = log_write_idx.fetch_add(1);
+    if (write_idx < log_header->capacity) {
+      log_entries[write_idx] = xla::gpu::BufferDebugFloatCheckEntry{
+          entry_id, total.nan_count, total.inf_count, total.zero_count};
     }
 #else
     // Our toolchains generate a fetch_add PTX instructions with system scope,
     // which is not supported on pre-Pascal architectures.
+    (void)total;
     assert(false);
 #endif
   }
@@ -234,16 +251,22 @@ __global__ void AppendFloatCheck(
 
 se::KernelLoaderSpec GetFloatCheckF32KernelSpec(int arity) {
   return se::KernelLoaderSpec::CreateInProcessSymbolSpec(
-      absl::bit_cast<void*>(&AppendFloatCheck<float>),
+      absl::bit_cast<void*>(&FloatCheck<float>),
       "BufferDebugFloatCheckF32Kernel", arity);
 }
 
 se::KernelLoaderSpec GetFloatCheckBf16KernelSpec(int arity) {
   return se::KernelLoaderSpec::CreateInProcessSymbolSpec(
-      absl::bit_cast<void*>(&AppendFloatCheck<__nv_bfloat16>),
+      absl::bit_cast<void*>(&FloatCheck<__nv_bfloat16>),
       "BufferDebugFloatCheckBf16Kernel", arity);
 }
 
+se::KernelLoaderSpec GetReduceFloatCheckResultsKernelSpec(int arity) {
+  return se::KernelLoaderSpec::CreateInProcessSymbolSpec(
+      absl::bit_cast<void*>(&ReduceFloatCheckResults),
+      "BufferDebugReduceFloatCheckResultsKernel", arity);
+}
+
 }  // namespace
 
 GPU_KERNEL_REGISTRY_REGISTER_KERNEL_STATICALLY(
@@ -253,3 +276,8 @@ GPU_KERNEL_REGISTRY_REGISTER_KERNEL_STATICALLY(
 GPU_KERNEL_REGISTRY_REGISTER_KERNEL_STATICALLY(
     BufferDebugFloatCheckBf16Kernel, se::gpu::BufferDebugFloatCheckBf16Kernel,
     se::cuda::kCudaPlatformId, GetFloatCheckBf16KernelSpec);
+
+GPU_KERNEL_REGISTRY_REGISTER_KERNEL_STATICALLY(
+    BufferDebugReduceFloatCheckResultsKernel,
+    se::gpu::BufferDebugAppendReducedFloatCheckResultsKernel,
+    se::cuda::kCudaPlatformId, GetReduceFloatCheckResultsKernelSpec);
diff --git a/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc b/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc
index 56ec5d18289bed..a1ab9cbb610482 100644
--- a/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <algorithm>
 #include <cstdint>
+#include <cstdlib>
 #include <limits>
 #include <memory>
 #include <optional>
@@ -29,6 +31,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/buffer_debug_log_structs.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
 #include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/buffer_debug_float_check_kernel.h"
 #include "xla/stream_executor/gpu/buffer_debug_log.h"
 #include "xla/stream_executor/gpu/gpu_kernel_registry.h"
@@ -86,11 +89,17 @@ class FloatCheckKernelTest : public ::testing::Test {
   absl::Status AppendFloatCheckOnDevice(
       BufferDebugLogEntryId entry_id, const std::vector<InputType>& input,
       se::gpu::BufferDebugLog<BufferType>& buffer_debug_log,
-      stream_executor::ThreadDim dim = stream_executor::ThreadDim(1, 1, 1)) {
+      stream_executor::BlockDim block_dim = stream_executor::BlockDim(1, 1, 1),
+      size_t temp_buffer_size_elements = 1024) {
     // Load kernel
     gpu::GpuKernelRegistry registry =
         gpu::GpuKernelRegistry::GetGlobalRegistry();
     TF_ASSIGN_OR_RETURN(auto kernel, registry.LoadKernel<Kernel>(executor_));
+    TF_ASSIGN_OR_RETURN(
+        auto reduce_kernel,
+        registry
+            .LoadKernel<gpu::BufferDebugAppendReducedFloatCheckResultsKernel>(
+                executor_));
 
     // Setup device buffers
     TF_ASSIGN_OR_RETURN(
@@ -100,13 +109,27 @@ class FloatCheckKernelTest : public ::testing::Test {
     auto cleanup_input =
         absl::MakeCleanup([&]() { executor_->Deallocate(&device_input); });
 
+    TF_ASSIGN_OR_RETURN(
+        se::DeviceAddress<xla::gpu::FloatCheckResult> device_tmp,
+        CheckNotNull(executor_->AllocateArray<xla::gpu::FloatCheckResult>(
+                         temp_buffer_size_elements),
+                     "tmp"));
+    auto cleanup_tmp =
+        absl::MakeCleanup([&]() { executor_->Deallocate(&device_tmp); });
+
+    const se::ThreadDim thread_dim(1024, 1, 1);
+
     // Call kernel
     TF_RETURN_IF_ERROR(stream_->Memcpy(&device_input, input.data(),
                                        input.size() * sizeof(input[0])));
-    TF_RETURN_IF_ERROR(kernel.Launch(
-        dim, stream_executor::BlockDim(1, 1, 1), stream_.get(), entry_id,
-        device_input, device_input.ElementCount() * sizeof(InputType),
-        buffer_debug_log.GetDeviceHeader(),
+    TF_RETURN_IF_ERROR(kernel.Launch(thread_dim, block_dim, stream_.get(),
+                                     device_input, device_input.ElementCount(),
+                                     device_tmp, device_tmp.ElementCount()));
+    TF_RETURN_IF_ERROR(reduce_kernel.Launch(
+        thread_dim, se::BlockDim(1, 1, 1), stream_.get(), device_tmp,
+        std::min(device_tmp.ElementCount(),
+                 block_dim.x * block_dim.y * block_dim.z),
+        entry_id, buffer_debug_log.GetDeviceHeader(),
         buffer_debug_log.GetDeviceEntries()));
     TF_RETURN_IF_ERROR(stream_->BlockHostUntilDone());
 
@@ -170,33 +193,101 @@ TEST_F(FloatCheckKernelTest, ChecksFloatsForBf16) {
 }
 
 TEST_F(FloatCheckKernelTest, ChecksFloatsInParallel) {
-  se::DeviceAddress<uint8_t> mem = executor_->AllocateArray<uint8_t>(1024);
-  std::vector<float> input(1024, 1.0f);
-  input[100] = std::numeric_limits<float>::quiet_NaN();
-  input[200] = std::numeric_limits<float>::quiet_NaN();
-  input[300] = std::numeric_limits<float>::quiet_NaN();
-  input[400] = 0.0f;
-  input[600] = std::numeric_limits<float>::infinity();
-  input[700] = std::numeric_limits<float>::infinity();
+  static constexpr size_t kNumNaNs = 100;
+  static constexpr size_t kNumInfs = 200;
+  static constexpr size_t kNumZeros = 300;
+  static constexpr size_t kMaxTestValues =
+      std::max(std::max(kNumNaNs, kNumInfs), kNumZeros);
+
+  const se::DeviceDescription& device_desc = executor_->GetDeviceDescription();
+  const size_t threads_per_core = device_desc.threads_per_core_limit();
+  const size_t num_cores = device_desc.core_count();
+  const size_t input_size = num_cores * threads_per_core * 3 / 2;
+  const size_t test_value_stride = input_size / (kMaxTestValues + 1);
+  ASSERT_GT(input_size, kMaxTestValues);
+  ASSERT_GT(test_value_stride, 2);
+
+  std::vector<float> input(input_size, 1.0f);
+  for (size_t i = 0; i < kNumNaNs; ++i) {
+    input[i * test_value_stride] = std::numeric_limits<float>::quiet_NaN();
+  }
+  for (size_t i = 0; i < kNumInfs; ++i) {
+    input[i * test_value_stride + 1] = std::numeric_limits<float>::infinity();
+  }
+  for (size_t i = 0; i < kNumZeros; ++i) {
+    input[i * test_value_stride + 2] = 0.0f;
+  }
 
+  se::DeviceAddress<uint8_t> log_mem = executor_->AllocateArray<uint8_t>(1024);
   TF_ASSERT_OK_AND_ASSIGN(
       auto device_log,
       se::gpu::BufferDebugLog<BufferDebugFloatCheckEntry>::CreateOnDevice(
-          *stream_, mem));
+          *stream_, log_mem));
 
+  int64_t threads_per_block;
+  int64_t num_blocks;
+  CalculateDimensionality(executor_->GetDeviceDescription(), input.size(),
+                          &threads_per_block, &num_blocks);
+  const se::BlockDim block_dim(num_blocks);
   TF_EXPECT_OK(AppendFloatCheckOnDevice<gpu::BufferDebugFloatCheckF32Kernel>(
-      BufferDebugLogEntryId{0}, input, device_log, se::ThreadDim(2, 4, 8)));
+      BufferDebugLogEntryId{0}, input, device_log, block_dim));
   TF_EXPECT_OK(AppendFloatCheckOnDevice<gpu::BufferDebugFloatCheckF32Kernel>(
-      BufferDebugLogEntryId{0}, input, device_log, se::ThreadDim(2, 4, 8)));
+      BufferDebugLogEntryId{0}, input, device_log, block_dim));
 
   TF_ASSERT_OK_AND_ASSIGN(auto host_log, device_log.ReadFromDevice(*stream_));
   ASSERT_GE(host_log.size(), 2);
-  EXPECT_EQ(host_log[0].nan_count, 3);
-  EXPECT_EQ(host_log[0].inf_count, 2);
-  EXPECT_EQ(host_log[0].zero_count, 1);
-  EXPECT_EQ(host_log[1].nan_count, 3);
-  EXPECT_EQ(host_log[1].inf_count, 2);
-  EXPECT_EQ(host_log[1].zero_count, 1);
+  EXPECT_EQ(host_log[0].nan_count, kNumNaNs);
+  EXPECT_EQ(host_log[0].inf_count, kNumInfs);
+  EXPECT_EQ(host_log[0].zero_count, kNumZeros);
+  EXPECT_EQ(host_log[1].nan_count, kNumNaNs);
+  EXPECT_EQ(host_log[1].inf_count, kNumInfs);
+  EXPECT_EQ(host_log[1].zero_count, kNumZeros);
+}
+
+TEST_F(FloatCheckKernelTest, ReduceFloatCheckResults) {
+  static constexpr size_t kNumNaNs = 100;
+  static constexpr size_t kNumInfs = 200;
+  static constexpr size_t kNumZeros = 300;
+  static constexpr size_t kIntermediateResults = 16 * 1024;
+
+  std::vector<xla::gpu::FloatCheckResult> results(kIntermediateResults);
+  for (size_t i = 0; i < kIntermediateResults; ++i) {
+    results[i].nan_count = i < kNumNaNs ? 1 : 0;
+    results[i].inf_count = i < kNumInfs ? 1 : 0;
+    results[i].zero_count = i < kNumZeros ? 1 : 0;
+  }
+
+  gpu::GpuKernelRegistry registry = gpu::GpuKernelRegistry::GetGlobalRegistry();
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto reduce_kernel,
+      registry.LoadKernel<gpu::BufferDebugAppendReducedFloatCheckResultsKernel>(
+          executor_));
+
+  se::DeviceAddress<uint8_t> log_mem = executor_->AllocateArray<uint8_t>(1024);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto device_log,
+      se::gpu::BufferDebugLog<BufferDebugFloatCheckEntry>::CreateOnDevice(
+          *stream_, log_mem));
+  TF_ASSERT_OK_AND_ASSIGN(
+      se::DeviceAddress<xla::gpu::FloatCheckResult> device_results,
+      CheckNotNull(executor_->AllocateArray<xla::gpu::FloatCheckResult>(
+                       kIntermediateResults),
+                   "results"));
+  auto cleanup_results =
+      absl::MakeCleanup([&]() { executor_->Deallocate(&device_results); });
+
+  TF_ASSERT_OK(stream_->Memcpy(&device_results, results.data(),
+                               results.size() * sizeof(results[0])));
+  TF_ASSERT_OK(reduce_kernel.Launch(
+      se::ThreadDim(1024, 1, 1), se::BlockDim(1, 1, 1), stream_.get(),
+      device_results, device_results.ElementCount(), BufferDebugLogEntryId{0},
+      device_log.GetDeviceHeader(), device_log.GetDeviceEntries()));
+  TF_ASSERT_OK_AND_ASSIGN(auto host_log, device_log.ReadFromDevice(*stream_));
+
+  ASSERT_GE(host_log.size(), 1);
+  EXPECT_EQ(host_log[0].nan_count, kNumNaNs);
+  EXPECT_EQ(host_log[0].inf_count, kNumInfs);
+  EXPECT_EQ(host_log[0].zero_count, kNumZeros);
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/stream_executor/gpu/buffer_debug_float_check_kernel.h b/third_party/xla/xla/stream_executor/gpu/buffer_debug_float_check_kernel.h
index 421a1a08b7d547..af0b687d6f9578 100644
--- a/third_party/xla/xla/stream_executor/gpu/buffer_debug_float_check_kernel.h
+++ b/third_party/xla/xla/stream_executor/gpu/buffer_debug_float_check_kernel.h
@@ -25,21 +25,32 @@ limitations under the License.
 
 namespace stream_executor::gpu {
 
-// Trait for a kernel that computes the NaN count of given input buffer and
-// appends it to the buffer debug log.
-//
-// This kernel MUST execute on a single thread block.
+// Counts the number of NaNs, Infs and zeros in a buffer of floats in parallel,
+// and stores partially accumulated results in the FloatCheckResult array.
 struct BufferDebugFloatCheckF32Kernel {
   using KernelType =
-      TypedKernel<xla::gpu::BufferDebugLogEntryId, DeviceAddress<float>,
-                  uint64_t, DeviceAddress<xla::gpu::BufferDebugLogHeader>,
-                  DeviceAddress<xla::gpu::BufferDebugFloatCheckEntry>>;
+      TypedKernel<DeviceAddress<float>, uint64_t,
+                  DeviceAddress<xla::gpu::FloatCheckResult>, uint64_t>;
 };
 
+// Counts the number of NaNs, Infs and zeros in a buffer of bfloat16s in
+// parallel, and stores partially accumulated results in the FloatCheckResult
+// array.
 struct BufferDebugFloatCheckBf16Kernel {
   using KernelType =
-      TypedKernel<xla::gpu::BufferDebugLogEntryId,
-                  DeviceAddress<Eigen::bfloat16>, uint64_t,
+      TypedKernel<DeviceAddress<Eigen::bfloat16>, uint64_t,
+                  DeviceAddress<xla::gpu::FloatCheckResult>, uint64_t>;
+};
+
+// Trait for a kernel that reduces the partially accumulated results from
+// `BufferDebugFloatCheckF32Kernel` or `BufferDebugFloatCheckBf16Kernel`
+// invocations and appends the result to the buffer debug log.
+//
+// This kernel MUST execute on a single thread block.
+struct BufferDebugAppendReducedFloatCheckResultsKernel {
+  using KernelType =
+      TypedKernel<DeviceAddress<xla::gpu::FloatCheckResult>, uint64_t,
+                  xla::gpu::BufferDebugLogEntryId,
                   DeviceAddress<xla::gpu::BufferDebugLogHeader>,
                   DeviceAddress<xla::gpu::BufferDebugFloatCheckEntry>>;
 };

From 9f90786e64afa218470d36ce3394756970042895 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 9 Dec 2025 10:14:58 -0800
Subject: [PATCH 087/753] [stream_executor] Make sure that DeviceAddress
 behaves like a pointer wrt comparison to nullptr_t and casting to bool

PiperOrigin-RevId: 842292205
---
 third_party/xla/xla/stream_executor/BUILD     |  9 ++++
 .../xla/xla/stream_executor/device_address.h  | 50 +++++++++----------
 .../stream_executor/device_address_test.cc    | 40 +++++++++++++++
 3 files changed, 74 insertions(+), 25 deletions(-)
 create mode 100644 third_party/xla/xla/stream_executor/device_address_test.cc

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 195ec82c4d2777..5c1481cc227da1 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -82,6 +82,15 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "device_address_test",
+    srcs = ["device_address_test.cc"],
+    deps = [
+        ":device_address",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "device_address_handle",
     srcs = ["device_address_handle.cc"],
diff --git a/third_party/xla/xla/stream_executor/device_address.h b/third_party/xla/xla/stream_executor/device_address.h
index a2ac3d8ac02095..9884ade4430b7d 100644
--- a/third_party/xla/xla/stream_executor/device_address.h
+++ b/third_party/xla/xla/stream_executor/device_address.h
@@ -41,7 +41,7 @@ namespace stream_executor {
 // check for `opaque` being null to determine if the device address is null.
 class DeviceAddressBase {
  public:
-  // Default constructor instantiates a null-pointed, zero-sized device memory
+  // Default constructor instantiates a null-pointed, zero-sized device address
   // region. An opaque pointer may be provided -- see header for details on the
   // opacity of that pointer.
   explicit DeviceAddressBase(void* opaque = nullptr, uint64_t size = 0)
@@ -53,10 +53,12 @@ class DeviceAddressBase {
     //  explicit DeviceAddressBase(void *opaque) = delete;
   }
 
-  // Returns whether the backing memory is the null pointer.
+  // Returns whether the backing address is the null pointer.
   // A `== nullptr` convenience method is also provided.
   bool is_null() const { return opaque_ == nullptr; }
 
+  explicit operator bool() const { return !is_null(); }
+
   bool operator==(std::nullptr_t other) const { return is_null(); }
   bool operator!=(std::nullptr_t other) const { return !is_null(); }
 
@@ -64,7 +66,7 @@ class DeviceAddressBase {
     return opaque_ == other.opaque_ && size_ == other.size_;
   }
 
-  // Provides a partial order between device memory values.
+  // Provides a partial order between device address values.
   //
   // This operator is provided so that this object can be used as a key in an
   // ordered map.
@@ -72,14 +74,14 @@ class DeviceAddressBase {
     return std::tie(opaque_, size_) < std::tie(other.opaque_, other.size_);
   }
 
-  // Returns the size, in bytes, for the backing memory.
+  // Returns the size, in bytes, for the backing address range.
   uint64_t size() const { return size_; }
 
   // Warning: note that the pointer returned is not necessarily directly to
   // device virtual address space, but is platform-dependent.
   void* opaque() const { return opaque_; }
 
-  // Returns the payload of this memory region.
+  // Returns the payload of this address range.
   uint64_t payload() const { return payload_; }
 
   // Sets payload to given value.
@@ -91,60 +93,58 @@ class DeviceAddressBase {
     return opaque() == other.opaque() && size() == other.size();
   }
 
-  // Creates a memory region (slice) inside another allocated memory region.
-  // Offset and size are in bytes.
+  // Creates and address range slice at the given offset and size. Offset and
+  // size are in bytes.
   ABSL_ATTRIBUTE_ALWAYS_INLINE DeviceAddressBase
   GetByteSlice(uint64_t offset_bytes, uint64_t size_bytes) const {
     DCHECK(offset_bytes + size_bytes <= size_)
-        << "requested slice allocation (offset + size) is greater "
-        << "than parent allocation size: (" << offset_bytes << " + "
-        << size_bytes << ") vs. (" << size_ << ")";
+        << "requested address slice (offset + size) is out of bounds "
+        << "of parent address: (" << offset_bytes << " + " << size_bytes
+        << ") vs. (" << size_ << ")";
 
     return DeviceAddressBase(
         reinterpret_cast<std::byte*>(opaque_) + offset_bytes, size_bytes);
   }
 
  private:
-  void* opaque_;   // Platform-dependent value representing addressable memory.
-  uint64_t size_;  // Size in bytes of this allocation.
-  uint64_t payload_ = 0;  // Payload data associated with this allocation.
+  void* opaque_;          // Platform-dependent value representing base address.
+  uint64_t size_;         // Size in bytes of this address range.
+  uint64_t payload_ = 0;  // Payload data associated with this address.
 };
 
 // Typed wrapper around "void *"-like DeviceAddressBase.
 //
 // For example, DeviceAddress<int32_t> is a simple wrapper around
-// DeviceAddressBase that represents one or more integers in Device memory.
+// DeviceAddressBase that represents one or more integers on Device.
 template <typename T>
 class DeviceAddress final : public DeviceAddressBase {
  public:
-  // Default constructor instantiates a null-pointed, zero-sized memory region.
+  // Default constructor instantiates a null-pointed, zero-sized addess range.
   DeviceAddress() : DeviceAddressBase(nullptr, 0) {}
   explicit DeviceAddress(std::nullptr_t) : DeviceAddress() {}
 
-  // Typed device memory regions may be constructed from untyped device memory
-  // regions, this effectively amounts to a cast from a void*.
+  // Typed device address range may be constructed from untyped device address
+  // range, this effectively amounts to a cast from a void*.
   explicit DeviceAddress(const DeviceAddressBase& other)
-      : DeviceAddressBase(const_cast<DeviceAddressBase&>(other).opaque(),
-                          other.size()) {
+      : DeviceAddressBase(other.opaque(), other.size()) {
     SetPayload(other.payload());
   }
 
-  // Returns the number of elements of type T that constitute this
-  // allocation.
+  // Returns the number of elements of type T that constitute this address.
   uint64_t ElementCount() const { return size() / sizeof(T); }
 
-  // Returns pointer to the allocated data
+  // Returns a base pointer to the data.
   T* base() const { return reinterpret_cast<T*>(opaque()); }
 
   // Creates a typed area of DeviceAddress with a given opaque pointer and the
-  // quantity of bytes in the allocation. This function is broken out to
+  // quantity of bytes in the address range. This function is broken out to
   // distinguish bytes from an element count.
   static DeviceAddress<T> MakeFromByteSize(void* opaque, uint64_t bytes) {
     return DeviceAddress<T>(opaque, bytes);
   }
 
-  // Creates a memory region (slice) inside another allocated memory region.
-  // Offset and size are specified in terms of T elements.
+  // Creates and address range slice at the given offset and count. Offset and
+  // count are specified in terms of T elements.
   DeviceAddress<T> GetSlice(uint64_t element_offset, uint64_t element_count) {
     return DeviceAddress<T>(
         GetByteSlice(sizeof(T) * element_offset, sizeof(T) * element_count));
diff --git a/third_party/xla/xla/stream_executor/device_address_test.cc b/third_party/xla/xla/stream_executor/device_address_test.cc
new file mode 100644
index 00000000000000..71acd21672215e
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/device_address_test.cc
@@ -0,0 +1,40 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/device_address.h"
+
+#include <cstdint>
+
+#include <gtest/gtest.h>
+
+namespace stream_executor {
+namespace {
+
+TEST(DeviceAddressTest, NullptrComparisons) {
+  {
+    DeviceAddressBase null_ptr;
+    EXPECT_FALSE(null_ptr);
+    EXPECT_TRUE(null_ptr == nullptr);
+  }
+
+  {
+    DeviceAddress<int32_t> null_ptr;
+    EXPECT_FALSE(null_ptr);
+    EXPECT_TRUE(null_ptr == nullptr);
+  }
+}
+
+}  // namespace
+}  // namespace stream_executor

From 21040b5b652e63d81f77509a5ce8ab27a5b1d16c Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 9 Dec 2025 10:17:43 -0800
Subject: [PATCH 088/753] [xla] Migrate to se::DeviceMemoryAddress

PiperOrigin-RevId: 842293237
---
 tensorflow/compiler/jit/xla_tensor.cc         |  2 +-
 .../runtime/gpublas_lt_matmul_thunk_test.cc   |  3 +-
 third_party/xla/xla/client/BUILD              |  4 +-
 third_party/xla/xla/client/client_library.h   |  2 +-
 third_party/xla/xla/client/local_client.cc    |  4 +-
 third_party/xla/xla/client/local_client.h     |  4 +-
 third_party/xla/xla/core/collectives/BUILD    |  2 +-
 .../xla/xla/core/collectives/communicator.h   | 40 ++++++++--------
 third_party/xla/xla/ffi/BUILD                 | 10 ++--
 third_party/xla/xla/ffi/api/BUILD             |  4 +-
 third_party/xla/xla/ffi/api/c_api_internal.h  |  2 +-
 third_party/xla/xla/ffi/api/ffi_test.cc       | 48 ++++++++++---------
 third_party/xla/xla/ffi/call_frame.cc         | 20 ++++----
 third_party/xla/xla/ffi/call_frame.h          | 14 +++---
 third_party/xla/xla/ffi/call_frame_test.cc    | 20 ++++----
 third_party/xla/xla/ffi/ffi.h                 | 12 ++---
 third_party/xla/xla/ffi/ffi_api.cc            |  4 +-
 third_party/xla/xla/ffi/ffi_test.cc           | 32 ++++++-------
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  4 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc  | 23 +++++----
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  4 +-
 .../xla/pjrt/pjrt_stream_executor_client.h    |  4 +-
 .../xla/pjrt/tracked_device_buffer_test.cc    |  3 +-
 third_party/xla/xla/service/BUILD             |  2 +
 .../xla/service/maybe_owning_device_memory.h  |  2 +
 third_party/xla/xla/tests/BUILD               | 16 +++----
 .../xla/xla/tests/buffer_donation_test.cc     | 12 ++---
 .../xla/xla/tests/collective_ops_ffi_test.cc  |  2 +-
 third_party/xla/xla/tests/hlo_test_base.cc    |  4 +-
 third_party/xla/xla/tests/hlo_test_base.h     |  6 +--
 .../xla/tests/local_client_execute_test.cc    |  2 +-
 .../xla/xla/tests/local_client_test_base.cc   |  8 ++--
 .../xla/xla/tests/local_client_test_base.h    |  8 ++--
 .../xla/xla/tests/transfer_manager_test.cc    |  2 +-
 third_party/xla/xla/tools/BUILD               |  2 +-
 35 files changed, 173 insertions(+), 158 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index e9cdad219dd28d..d6792cd7802d96 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -55,7 +55,7 @@ absl::Status XlaTensor::AllocateShapedBuffer(DataType dtype,
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
+    TF_ASSIGN_OR_RETURN(se::ScopedDeviceAddress<uint8_t> buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false,
                             subshape.layout().memory_space()));
diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
index ccdf653ca1862e..77a6ac88f8ff70 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h"
 
 #include <cstddef>
+#include <cstdint>
 #include <deque>
 #include <memory>
 #include <optional>
@@ -182,7 +183,7 @@ class GpuBlasLtThunkBuilder {
   se::StreamExecutorMemoryAllocator allocator_;
   se::GpuComputeCapability gpu_comp_;
   std::deque<BufferAllocation> allocs_;
-  std::vector<se::OwningDeviceMemory> mem_buffers_;
+  std::vector<se::ScopedDeviceAddress<uint8_t>> mem_buffers_;
 };
 
 void GpuBlasLtMatmulThunkTest::CreateExecuteThunksFromHLO(
diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD
index c2801fa3fa8410..fac2d9343ff1d0 100644
--- a/third_party/xla/xla/client/BUILD
+++ b/third_party/xla/xla/client/BUILD
@@ -128,7 +128,7 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:source_map_util",
         "//xla/service:stream_pool",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -181,7 +181,7 @@ cc_library(
         "//xla/service:compile_only_service",
         "//xla/service:local_service",
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:logging",
diff --git a/third_party/xla/xla/client/client_library.h b/third_party/xla/xla/client/client_library.h
index 0e4f3a9a24dd22..42d0f34202e092 100644
--- a/third_party/xla/xla/client/client_library.h
+++ b/third_party/xla/xla/client/client_library.h
@@ -36,7 +36,7 @@ limitations under the License.
 #include "xla/client/local_client.h"
 #include "xla/service/compile_only_service.h"
 #include "xla/service/local_service.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/types.h"
diff --git a/third_party/xla/xla/client/local_client.cc b/third_party/xla/xla/client/local_client.cc
index cc383a9aa81b34..e1f348a755521d 100644
--- a/third_party/xla/xla/client/local_client.cc
+++ b/third_party/xla/xla/client/local_client.cc
@@ -45,7 +45,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
@@ -512,7 +512,7 @@ absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::LoadInternal(
 
 absl::StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    se::DeviceMemoryAllocator* allocator) {
+    se::DeviceAddressAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 3ccda5d43f6794..3c237ef37a1973 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/stream_pool.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -183,7 +183,7 @@ class LocalClient : public Client {
   // device is used.
   absl::StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
-      se::DeviceMemoryAllocator* allocator = nullptr);
+      se::DeviceAddressAllocator* allocator = nullptr);
 
   // Transfer the BorrowingLiteral to the device with the given ordinal.
   absl::StatusOr<GlobalDataHandle> TransferToLocalServer(
diff --git a/third_party/xla/xla/core/collectives/BUILD b/third_party/xla/xla/core/collectives/BUILD
index 1b0398aaaf4801..06d3ef7f6c9aed 100644
--- a/third_party/xla/xla/core/collectives/BUILD
+++ b/third_party/xla/xla/core/collectives/BUILD
@@ -73,7 +73,7 @@ cc_library(
         "//xla:future",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/core/collectives/communicator.h b/third_party/xla/xla/core/collectives/communicator.h
index 0f60a859db854d..4be35fb52163f7 100644
--- a/third_party/xla/xla/core/collectives/communicator.h
+++ b/third_party/xla/xla/core/collectives/communicator.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "xla/core/collectives/rank_id.h"
 #include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 
@@ -65,7 +65,7 @@ class Communicator {
   // Register `buffer_range` once for efficient collective operations (i.e. on
   // NCCL backend it registers the buffer for zero-copy collective operations).
   //
-  virtual absl::Status RegisterBufferOnce(se::DeviceMemoryBase buffer_range,
+  virtual absl::Status RegisterBufferOnce(se::DeviceAddressBase buffer_range,
                                           int device_ordinal,
                                           bool use_symmetric_buffer) {
     return Unimplemented("User-managed buffer registration is not supported");
@@ -91,40 +91,40 @@ class Communicator {
 
   // Reduce buffers of length `count` in `send_buff` using `reduction_kind`
   // reduction and leaves identical copies of the result on each `recv_buff`.
-  virtual Future<> AllReduce(stream_executor::DeviceMemoryBase send_buffer,
-                             stream_executor::DeviceMemoryBase recv_buffer,
+  virtual Future<> AllReduce(stream_executor::DeviceAddressBase send_buffer,
+                             stream_executor::DeviceAddressBase recv_buffer,
                              PrimitiveType dtype, size_t count,
                              ReductionKind reduction_kind,
                              const Executor& executor) = 0;
 
   // Copy data in `send_buff` from the root device to the `recv_buff` on
   // all other devices.
-  virtual Future<> Broadcast(se::DeviceMemoryBase send_buffer,
-                             se::DeviceMemoryBase recv_buffer,
+  virtual Future<> Broadcast(se::DeviceAddressBase send_buffer,
+                             se::DeviceAddressBase recv_buffer,
                              PrimitiveType dtype, size_t count, RankId root,
                              const Executor& executor) = 0;
 
   // Reduce data in `send_buff` from all devices using the `reduction_kind`
   // operation and leave the reduced result scattered over the devices so that
   // the `recv_buff` on rank `i` will contain the i-th block of the result.
-  virtual Future<> ReduceScatter(se::DeviceMemoryBase send_buffer,
-                                 se::DeviceMemoryBase recv_buffer,
+  virtual Future<> ReduceScatter(se::DeviceAddressBase send_buffer,
+                                 se::DeviceAddressBase recv_buffer,
                                  PrimitiveType dtype, size_t count,
                                  ReductionKind reduction_kind,
                                  const Executor& executor) = 0;
 
   // Gather `count` values from all devices into `recv_buffer`, receiving data
   // from rank `i` at offset `i * sendcount`.
-  virtual Future<> AllGather(se::DeviceMemoryBase send_buffer,
-                             se::DeviceMemoryBase recv_buffer,
+  virtual Future<> AllGather(se::DeviceAddressBase send_buffer,
+                             se::DeviceAddressBase recv_buffer,
                              PrimitiveType dtype, size_t count,
                              const Executor& executor) = 0;
 
   // Sends data from `send_buffer` to `target_ranks` and receives data from
   // `source_rank` into `recv_buffer`. If `source_rank` is not specified, the
   // output is filled with zeros.
-  virtual Future<> CollectivePermute(se::DeviceMemoryBase send_buffer,
-                                     se::DeviceMemoryBase recv_buffer,
+  virtual Future<> CollectivePermute(se::DeviceAddressBase send_buffer,
+                                     se::DeviceAddressBase recv_buffer,
                                      PrimitiveType dtype, size_t count,
                                      std::optional<RankId> source_rank,
                                      absl::Span<const RankId> target_ranks,
@@ -133,30 +133,30 @@ class Communicator {
   // Sends `count` values from `send_buffers` to other ranks and receives data
   // from other ranks into `recv_buffers`.
   virtual Future<> AllToAll(
-      absl::InlinedVector<se::DeviceMemoryBase, 4> send_buffers,
-      absl::InlinedVector<se::DeviceMemoryBase, 4> recv_buffers,
+      absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
+      absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
       PrimitiveType dtype, size_t count, const Executor& executor) = 0;
 
   // Send data from `send_buff` to rank `peer`.
-  virtual Future<> Send(se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Send(se::DeviceAddressBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer,
                         const Executor& executor) = 0;
 
   // Receive data from rank `peer` into `recv_buff`.
-  virtual Future<> Recv(se::DeviceMemoryBase recv_buffer, PrimitiveType dtype,
+  virtual Future<> Recv(se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
                         size_t count, RankId peer,
                         const Executor& executor) = 0;
 
   // Send data from `send_buff` to rank `recv_buff` (one-way send).
-  virtual Future<> Send(se::DeviceMemoryBase recv_buffer,
-                        se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Send(se::DeviceAddressBase recv_buffer,
+                        se::DeviceAddressBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer, const Executor& executor) {
     return Unimplemented("One-way send is not implemented");
   }
 
   // Receive data from rank `peer` into `recv_buff` (one-way recv).
-  virtual Future<> Recv(se::DeviceMemoryBase recv_buffer,
-                        se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Recv(se::DeviceAddressBase recv_buffer,
+                        se::DeviceAddressBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer, const Executor& executor) {
     return Unimplemented("One-way recv is not implemented");
   }
diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD
index 41c825e3599ea2..f14764091594bc 100644
--- a/third_party/xla/xla/ffi/BUILD
+++ b/third_party/xla/xla/ffi/BUILD
@@ -37,7 +37,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/platform:errors",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -58,7 +58,7 @@ xla_cc_test(
         ":call_frame",
         "//xla:xla_data_proto_cc",
         "//xla/ffi/api:c_api",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_benchmark",
@@ -149,7 +149,7 @@ cc_library(
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
         "//xla/hlo/ir:hlo",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/concurrency:async_value",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -182,8 +182,8 @@ cc_library(
         "//xla/ffi/api:c_api_internal",
         "//xla/hlo/ir:hlo",
         "//xla/service:platform_util",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
-        "//xla/stream_executor:device_memory",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
@@ -299,7 +299,7 @@ xla_cc_test(
         "//xla/backends/cpu:ffi",
         "//xla/backends/gpu:ffi",
         "//xla/ffi/api:c_api",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/core:status_test_util",
diff --git a/third_party/xla/xla/ffi/api/BUILD b/third_party/xla/xla/ffi/api/BUILD
index 41889027b9ddd3..dc4551d8e2fecc 100644
--- a/third_party/xla/xla/ffi/api/BUILD
+++ b/third_party/xla/xla/ffi/api/BUILD
@@ -91,8 +91,8 @@ xla_cc_test(
         "//xla/ffi:execution_state",
         "//xla/ffi:ffi_api",
         "//xla/ffi:type_registry",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/ffi/api/c_api_internal.h b/third_party/xla/xla/ffi/api/c_api_internal.h
index d0baf4fc3b7bb0..d9070080f3a4a6 100644
--- a/third_party/xla/xla/ffi/api/c_api_internal.h
+++ b/third_party/xla/xla/ffi/api/c_api_internal.h
@@ -93,7 +93,7 @@ typedef XLA_FFI_Error* XLA_FFI_INTERNAL_IntraOpThreadPool_Get(
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_Stream_Get(
     XLA_FFI_ExecutionContext* ctx, void** stream);
 
-// Returns a pointer to device memory allocator (`se::DeviceMemoryAllocator`
+// Returns a pointer to device memory allocator (`se::DeviceAddressAllocator`
 // pointer) which allows to allocate memory inside a custom call from the same
 // allocator as XLA (i.e. it allows to construct scratch memory allocator).
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get(
diff --git a/third_party/xla/xla/ffi/api/ffi_test.cc b/third_party/xla/xla/ffi/api/ffi_test.cc
index e3345ebe915146..81578f564956fd 100644
--- a/third_party/xla/xla/ffi/api/ffi_test.cc
+++ b/third_party/xla/xla/ffi/api/ffi_test.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "xla/ffi/ffi_api.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/primitive_util.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/lib/core/status_test_util.h"
@@ -522,7 +522,7 @@ TEST(FfiTest, DeviceOrdinal) {
 
 TEST(FfiTest, AnyBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -544,7 +544,7 @@ TEST(FfiTest, AnyBufferArgument) {
 
 TEST(FfiTest, BufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -562,7 +562,7 @@ TEST(FfiTest, BufferArgument) {
 
 TEST(FfiTest, AnyBufferResult) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -594,7 +594,7 @@ TEST(FfiTest, MissingBufferArgument) {
 
 TEST(FfiTest, WrongRankBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -611,7 +611,7 @@ TEST(FfiTest, WrongRankBufferArgument) {
 
 TEST(FfiTest, WrongTypeBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::S32, /*dims=*/{2, 2});
@@ -648,7 +648,7 @@ TEST(FfiTest, WrongNumberOfArguments) {
 
 TEST(FfiTest, TokenArgument) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceMemoryBase(), PrimitiveType::TOKEN,
+  builder.AddBufferArg(se::DeviceAddressBase(), PrimitiveType::TOKEN,
                        /*dims=*/{});
   auto call_frame = builder.Build();
 
@@ -665,7 +665,7 @@ TEST(FfiTest, TokenArgument) {
 
 TEST(FfiTest, RemainingArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -694,7 +694,7 @@ TEST(FfiTest, RemainingArgs) {
 
 TEST(FfiTest, RemainingRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/2);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -724,7 +724,7 @@ TEST(FfiTest, RemainingRets) {
 
 TEST(FfiTest, OptionalArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -785,7 +785,7 @@ TEST(FfiTest, OptionalArgs) {
 
 TEST(FfiTest, OptionalRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -854,7 +854,7 @@ TEST(FfiTest, AutoBinding) {
   });
 
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder::AttributesBuilder attrs;
   attrs.Insert(kI32, 42);
@@ -873,7 +873,8 @@ TEST(FfiTest, AutoBindingResult) {
       Ffi::BindTo(+[](Result<AnyBuffer> buffer) { return Error::Success(); });
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
-  builder.AddBufferRet(se::DeviceMemoryBase(), PrimitiveType::F32, /*dims=*/{});
+  builder.AddBufferRet(se::DeviceAddressBase(), PrimitiveType::F32,
+                       /*dims=*/{});
   auto call_frame = builder.Build();
 
   auto status = Call(*handler, call_frame);
@@ -1409,19 +1410,22 @@ TEST(FfiTest, ScratchAllocator) {
   static void* kAddr = reinterpret_cast<void*>(0xDEADBEEF);
 
   // A test only memory allocator that returns a fixed memory address.
-  struct TestDeviceMemoryAllocator final : public se::DeviceMemoryAllocator {
+  struct TestDeviceMemoryAllocator final : public se::DeviceAddressAllocator {
     size_t count;
 
     TestDeviceMemoryAllocator()
-        : se::DeviceMemoryAllocator(nullptr), count(0) {}
+        : se::DeviceAddressAllocator(nullptr), count(0) {}
 
-    absl::StatusOr<se::OwningDeviceMemory> Allocate(int, uint64_t size, bool,
-                                                    int64_t) final {
+    absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(int,
+                                                              uint64_t size,
+                                                              bool,
+                                                              int64_t) final {
       count++;
-      return se::OwningDeviceMemory(se::DeviceMemoryBase(kAddr, size), 0, this);
+      return se::ScopedDeviceAddress<uint8_t>(
+          se::DeviceAddressBase(kAddr, size), 0, this);
     }
 
-    absl::Status Deallocate(int, se::DeviceMemoryBase mem) final {
+    absl::Status Deallocate(int, se::DeviceAddressBase mem) final {
       count--;
       EXPECT_EQ(mem.opaque(), kAddr);
       return absl::OkStatus();
@@ -1588,7 +1592,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(BufferR2F32Handler, BufferR2F32Function);
 
 TEST(FfiTest, DefineAutoSymbol) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -1604,7 +1608,7 @@ TEST(FfiTest, DefineAutoSymbol) {
 //===----------------------------------------------------------------------===//
 
 static CallFrameBuilder WithBufferArgs(size_t num_args, size_t rank = 4) {
-  se::DeviceMemoryBase memory;
+  se::DeviceAddressBase memory;
   std::vector<int64_t> dims(4, 1);
 
   CallFrameBuilder builder(/*num_args=*/num_args, /*num_rets=*/0);
diff --git a/third_party/xla/xla/ffi/call_frame.cc b/third_party/xla/xla/ffi/call_frame.cc
index ad7c71c98f8cd6..f0c17215c2dafd 100644
--- a/third_party/xla/xla/ffi/call_frame.cc
+++ b/third_party/xla/xla/ffi/call_frame.cc
@@ -35,7 +35,7 @@ limitations under the License.
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/api/c_api_internal.h"  // IWYU pragma: keep
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -47,7 +47,7 @@ namespace xla::ffi {
 //===----------------------------------------------------------------------===//
 
 struct CallFrameBuilder::Buffer {
-  se::DeviceMemoryBase memory;
+  se::DeviceAddressBase memory;
   PrimitiveType type;
   absl::InlinedVector<int64_t, 4> dims;
 };
@@ -84,7 +84,7 @@ CallFrameBuilder::CallFrameBuilder(size_t num_args, size_t num_rets) {
 
 CallFrameBuilder::~CallFrameBuilder() = default;
 
-void CallFrameBuilder::AddBufferArg(se::DeviceMemoryBase memory,
+void CallFrameBuilder::AddBufferArg(se::DeviceAddressBase memory,
                                     PrimitiveType type,
                                     absl::Span<const int64_t> dims) {
   DCHECK(args_.capacity() > args_.size())
@@ -95,10 +95,10 @@ void CallFrameBuilder::AddBufferArg(se::DeviceMemoryBase memory,
 void CallFrameBuilder::AddTokenArg() {
   DCHECK(args_.capacity() > args_.size())
       << "CallFrame builder `num_args` argument was too small";
-  args_.push_back(Buffer{se::DeviceMemoryBase(), PrimitiveType::TOKEN, {}});
+  args_.push_back(Buffer{se::DeviceAddressBase(), PrimitiveType::TOKEN, {}});
 }
 
-void CallFrameBuilder::AddBufferRet(se::DeviceMemoryBase memory,
+void CallFrameBuilder::AddBufferRet(se::DeviceAddressBase memory,
                                     PrimitiveType type,
                                     absl::Span<const int64_t> dims) {
   DCHECK(rets_.capacity() > rets_.size())
@@ -109,7 +109,7 @@ void CallFrameBuilder::AddBufferRet(se::DeviceMemoryBase memory,
 void CallFrameBuilder::AddTokenRet() {
   DCHECK(rets_.capacity() > rets_.size())
       << "CallFrame builder `num_rets` argument was too small";
-  rets_.push_back(Buffer{se::DeviceMemoryBase(), PrimitiveType::TOKEN, {}});
+  rets_.push_back(Buffer{se::DeviceAddressBase(), PrimitiveType::TOKEN, {}});
 }
 
 void CallFrameBuilder::AddAttributes(AttributesMap attrs) {
@@ -557,8 +557,8 @@ std::unique_ptr<CallFrame::Attributes> CallFrame::FixUpAttrs(
 //===----------------------------------------------------------------------===//
 
 absl::Status CallFrame::UpdateWithBuffers(
-    absl::Span<const se::DeviceMemoryBase> args,
-    absl::Span<const se::DeviceMemoryBase> rets) {
+    absl::Span<const se::DeviceAddressBase> args,
+    absl::Span<const se::DeviceAddressBase> rets) {
   if (ABSL_PREDICT_FALSE(args.size() != arguments_->args.size())) {
     return InvalidArgument("Invalid number of updated arguments: %d vs %d",
                            args.size(), arguments_->args.size());
@@ -587,8 +587,8 @@ CallFrame CallFrame::Copy() const {
 }
 
 absl::StatusOr<CallFrame> CallFrame::CopyWithBuffers(
-    absl::Span<const se::DeviceMemoryBase> args,
-    absl::Span<const se::DeviceMemoryBase> rets) const {
+    absl::Span<const se::DeviceAddressBase> args,
+    absl::Span<const se::DeviceAddressBase> rets) const {
   CallFrame clone(CopyArgs(*arguments_), CopyRets(*results_), attributes_);
   TF_RETURN_IF_ERROR(clone.UpdateWithBuffers(args, rets));
   return clone;
diff --git a/third_party/xla/xla/ffi/call_frame.h b/third_party/xla/xla/ffi/call_frame.h
index 32dceead1d9b4b..5433d4be990d42 100644
--- a/third_party/xla/xla/ffi/call_frame.h
+++ b/third_party/xla/xla/ffi/call_frame.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/types.h"  // IWYU pragma: keep
 #include "xla/xla_data.pb.h"
 
@@ -76,12 +76,12 @@ class CallFrameBuilder {
 
   CallFrame Build();
 
-  void AddBufferArg(se::DeviceMemoryBase memory, PrimitiveType type,
+  void AddBufferArg(se::DeviceAddressBase memory, PrimitiveType type,
                     absl::Span<const int64_t> dims);
 
   void AddTokenArg();
 
-  void AddBufferRet(se::DeviceMemoryBase memory, PrimitiveType type,
+  void AddBufferRet(se::DeviceAddressBase memory, PrimitiveType type,
                     absl::Span<const int64_t> dims);
 
   void AddTokenRet();
@@ -117,16 +117,16 @@ class CallFrame {
   // array (buffer) arguments and results are known at compile time. Instead of
   // rebuilding the call frame from scratch on every execution, we can just
   // update the arguments and results with new pointers to device memory.
-  absl::Status UpdateWithBuffers(absl::Span<const se::DeviceMemoryBase> args,
-                                 absl::Span<const se::DeviceMemoryBase> rets);
+  absl::Status UpdateWithBuffers(absl::Span<const se::DeviceAddressBase> args,
+                                 absl::Span<const se::DeviceAddressBase> rets);
 
   // Creates a copy of the call frame.
   CallFrame Copy() const;
 
   // Creates a copy of the call frame with updated arguments and results.
   absl::StatusOr<CallFrame> CopyWithBuffers(
-      absl::Span<const se::DeviceMemoryBase> args,
-      absl::Span<const se::DeviceMemoryBase> rets) const;
+      absl::Span<const se::DeviceAddressBase> args,
+      absl::Span<const se::DeviceAddressBase> rets) const;
 
   // Builds an XLA_FFI_CallFrame from owned arguments and attributes.
   XLA_FFI_CallFrame Build(
diff --git a/third_party/xla/xla/ffi/call_frame_test.cc b/third_party/xla/xla/ffi/call_frame_test.cc
index f73461fc7d297f..b58e2d9a2537b6 100644
--- a/third_party/xla/xla/ffi/call_frame_test.cc
+++ b/third_party/xla/xla/ffi/call_frame_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/test_benchmark.h"
@@ -34,8 +34,8 @@ limitations under the License.
 namespace xla::ffi {
 
 TEST(CallFrameTest, UpdateCallFrame) {
-  se::DeviceMemoryBase mem0(reinterpret_cast<void*>(0x12345678), 1024);
-  se::DeviceMemoryBase mem1(reinterpret_cast<void*>(0x87654321), 1024);
+  se::DeviceAddressBase mem0(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase mem1(reinterpret_cast<void*>(0x87654321), 1024);
 
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
@@ -116,7 +116,7 @@ TEST(CallFrameTest, UpdateCallFrame) {
 void BM_AddBufferArg(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   for (auto _ : state) {
@@ -151,17 +151,17 @@ void BM_AddAttributes(benchmark::State& state) {
 void BM_UpdateCallFrame(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   CallFrameBuilder builder(num_args, /*num_rets=*/0);
   for (size_t i = 0; i < num_args; ++i) {
-    builder.AddBufferArg(se::DeviceMemoryBase(nullptr, 1024),
+    builder.AddBufferArg(se::DeviceAddressBase(nullptr, 1024),
                          PrimitiveType::F32, dims);
   }
   CallFrame call_frame = builder.Build();
 
-  std::vector<se::DeviceMemoryBase> updated_args(num_args, memory);
+  std::vector<se::DeviceAddressBase> updated_args(num_args, memory);
 
   for (auto _ : state) {
     auto updated_call_frame =
@@ -173,17 +173,17 @@ void BM_UpdateCallFrame(benchmark::State& state) {
 void BM_UpdateCallFrameInPlace(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   CallFrameBuilder builder(num_args, /*num_rets=*/0);
   for (size_t i = 0; i < num_args; ++i) {
-    builder.AddBufferArg(se::DeviceMemoryBase(nullptr, 1024),
+    builder.AddBufferArg(se::DeviceAddressBase(nullptr, 1024),
                          PrimitiveType::F32, dims);
   }
   CallFrame call_frame = builder.Build();
 
-  std::vector<se::DeviceMemoryBase> updated_args(num_args, memory);
+  std::vector<se::DeviceAddressBase> updated_args(num_args, memory);
 
   for (auto _ : state) {
     benchmark::DoNotOptimize(
diff --git a/third_party/xla/xla/ffi/ffi.h b/third_party/xla/xla/ffi/ffi.h
index da6303e14faef7..4e1849a190d327 100644
--- a/third_party/xla/xla/ffi/ffi.h
+++ b/third_party/xla/xla/ffi/ffi.h
@@ -50,7 +50,7 @@ limitations under the License.
 #include "xla/ffi/type_registry.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/primitive_util.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/types.h"  // IWYU pragma: keep
@@ -137,8 +137,8 @@ class AnyBuffer {
     return reinterpret_cast<T*>(buf_->data);
   }
 
-  se::DeviceMemoryBase device_memory() const {
-    return se::DeviceMemoryBase(untyped_data(), size_bytes());
+  se::DeviceAddressBase device_memory() const {
+    return se::DeviceAddressBase(untyped_data(), size_bytes());
   }
 
  private:
@@ -182,9 +182,9 @@ class Buffer {
     return reinterpret_cast<internal::NativeType<dtype>*>(untyped_data());
   }
 
-  se::DeviceMemory<internal::NativeType<dtype>> device_memory() const {
-    return se::DeviceMemory<internal::NativeType<dtype>>(
-        se::DeviceMemoryBase(untyped_data(), size_bytes()));
+  se::DeviceAddress<internal::NativeType<dtype>> device_memory() const {
+    return se::DeviceAddress<internal::NativeType<dtype>>(
+        se::DeviceAddressBase(untyped_data(), size_bytes()));
   }
 
  private:
diff --git a/third_party/xla/xla/ffi/ffi_api.cc b/third_party/xla/xla/ffi/ffi_api.cc
index 31287ac7587ef4..3f0de64033061e 100644
--- a/third_party/xla/xla/ffi/ffi_api.cc
+++ b/third_party/xla/xla/ffi/ffi_api.cc
@@ -47,8 +47,8 @@ limitations under the License.
 #include "xla/ffi/ffi_structs.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/service/platform_util.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_allocator.h"
-#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/platform/logging.h"
@@ -795,7 +795,7 @@ static XLA_FFI_Error* XLA_FFI_DeviceMemory_Free(
 
   absl::Status status = gpu->allocator->Deallocate(
       args->ctx->device_ordinal,
-      stream_executor::DeviceMemoryBase(args->data, args->size));
+      stream_executor::DeviceAddressBase(args->data, args->size));
   if (!status.ok()) {
     return new XLA_FFI_Error{std::move(status)};
   }
diff --git a/third_party/xla/xla/ffi/ffi_test.cc b/third_party/xla/xla/ffi/ffi_test.cc
index 8f0b00244c0a93..0369c8cc1946e5 100644
--- a/third_party/xla/xla/ffi/ffi_test.cc
+++ b/third_party/xla/xla/ffi/ffi_test.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/ffi/execution_state.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/ffi/type_registry.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
@@ -179,7 +179,7 @@ TEST(FfiTest, CatchExceptionExplicit) {
 
 TEST(FfiTest, WrongNumArgs) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceMemoryBase(nullptr), PrimitiveType::F32, {});
+  builder.AddBufferArg(se::DeviceAddressBase(nullptr), PrimitiveType::F32, {});
   auto call_frame = builder.Build();
 
   auto handler = Ffi::Bind().Arg<AnyBuffer>().Arg<AnyBuffer>().To(
@@ -579,7 +579,7 @@ TEST(FfiTest, DecodingErrors) {
 
 TEST(FfiTest, AnyBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -614,7 +614,7 @@ TEST(FfiTest, AnyBufferArgument) {
 
 TEST(FfiTest, TypedAndRankedBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), storage.size() * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), storage.size() * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -642,8 +642,8 @@ TEST(FfiTest, TypedAndRankedBufferArgument) {
 
 TEST(FfiTest, ComplexBufferArgument) {
   std::vector<std::complex<float>> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(),
-                              storage.size() * sizeof(std::complex<float>));
+  se::DeviceAddressBase memory(storage.data(),
+                               storage.size() * sizeof(std::complex<float>));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::C64, /*dims=*/{2, 2});
@@ -662,7 +662,7 @@ TEST(FfiTest, ComplexBufferArgument) {
 
 TEST(FfiTest, TokenArgument) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceMemoryBase(), PrimitiveType::TOKEN,
+  builder.AddBufferArg(se::DeviceAddressBase(), PrimitiveType::TOKEN,
                        /*dims=*/{});
   auto call_frame = builder.Build();
 
@@ -679,7 +679,7 @@ TEST(FfiTest, TokenArgument) {
 
 TEST(FfiTest, WrongRankBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -697,7 +697,7 @@ TEST(FfiTest, WrongRankBufferArgument) {
 
 TEST(FfiTest, WrongTypeBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::S32, /*dims=*/{2, 2});
@@ -715,7 +715,7 @@ TEST(FfiTest, WrongTypeBufferArgument) {
 
 TEST(FfiTest, RemainingArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -743,7 +743,7 @@ TEST(FfiTest, RemainingArgs) {
 
 TEST(FfiTest, RemainingRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/2);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -772,7 +772,7 @@ TEST(FfiTest, RemainingRets) {
 
 TEST(FfiTest, OptionalArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -833,7 +833,7 @@ TEST(FfiTest, OptionalArgs) {
 
 TEST(FfiTest, OptionalRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -975,8 +975,8 @@ TEST(FfiTest, UpdateBufferArgumentsAndResults) {
   std::vector<float> storage0(4, 0.0f);
   std::vector<float> storage1(4, 0.0f);
 
-  se::DeviceMemoryBase memory0(storage0.data(), 4 * sizeof(float));
-  se::DeviceMemoryBase memory1(storage1.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory0(storage0.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory1(storage1.data(), 4 * sizeof(float));
 
   std::vector<int64_t> dims = {2, 2};
 
@@ -1169,7 +1169,7 @@ TEST(FfiTest, PlatformStream) {
 //===----------------------------------------------------------------------===//
 
 static CallFrameBuilder WithBufferArgs(size_t num_args, size_t rank = 4) {
-  se::DeviceMemoryBase memory;
+  se::DeviceAddressBase memory;
   std::vector<int64_t> dims(4, 1);
 
   CallFrameBuilder builder(/*num_args=*/num_args, /*num_rets=*/0);
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index e210a480bc74dd..205fc66b41fcc8 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -1970,7 +1970,7 @@ StreamExecutorGpuClient::RunAsync(
         const int64_t buffer_size = allocation.size();
         if (buffer_size > 0) {
           TF_ASSIGN_OR_RETURN(
-              se::OwningDeviceMemory owning_buffer,
+              se::ScopedDeviceAddress<uint8_t> owning_buffer,
               memory_allocator->Allocate(device_ordinal, buffer_size,
                                          /*retry_on_failure=*/true,
                                          /*memory_space=*/allocation.color()));
@@ -2035,7 +2035,7 @@ StreamExecutorGpuClient::RunAsync(
                "buffer is not donated; allocating a fresh buffer";
         int64_t allocation_size = ShapeUtil::ByteSizeOf(
             ShapeUtil::GetSubshape(gpu_exec->result_shape(), index));
-        absl::StatusOr<se::OwningDeviceMemory> allocated_buffer =
+        absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> allocated_buffer =
             memory_allocator->Allocate(device_ordinal, allocation_size,
                                        /*retry_on_failure=*/true,
                                        /*memory_space=*/allocation->color());
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
index 5e84506057c524..88fce7477ce884 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
@@ -775,16 +775,20 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
                 tuple_buffer.buffers().mutable_element({});
             VLOG(3) << "untuple: output_buffers[" << i
                     << "].emplace: " << elem->opaque();
-            output_buffers[i].emplace(stream_executor::OwningDeviceMemory(
-                *elem, device->local_device_id().value(), client->allocator()));
+            output_buffers[i].emplace(
+                stream_executor::ScopedDeviceAddress<uint8_t>(
+                    *elem, device->local_device_id().value(),
+                    client->allocator()));
             *elem = se::DeviceAddressBase();
           }
         } else {
           CHECK_EQ(output_buffers.size(), 1);
           auto* elem = output.buffers().mutable_element({});
           VLOG(3) << "output_buffers[0].emplace: " << elem->opaque();
-          output_buffers.front().emplace(stream_executor::OwningDeviceMemory(
-              *elem, device->local_device_id().value(), client->allocator()));
+          output_buffers.front().emplace(
+              stream_executor::ScopedDeviceAddress<uint8_t>(
+                  *elem, device->local_device_id().value(),
+                  client->allocator()));
           *elem = se::DeviceAddressBase();
         }
 
@@ -909,10 +913,11 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
                     << "]: " << tracked_buffers[i]->buffer()->buffer().opaque();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {i}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
-                           tracked_buffers[i]->buffer()->buffer(),
-                           device->local_hardware_id().value(),
-                           client->allocator())));
+                  {i},
+                  MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
+                      tracked_buffers[i]->buffer()->buffer(),
+                      device->local_hardware_id().value(),
+                      client->allocator())));
             } else {
               input.SetBuffer({i}, MaybeOwningDeviceAddress(
                                        tracked_buffers[i]->buffer()->buffer()));
@@ -928,7 +933,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
             ExecutionInput& input = inputs.back();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
+                  {}, MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
                           tracked_buffers[i]->buffer()->buffer(),
                           device->local_hardware_id().value(),
                           client->allocator())));
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index e342a586863001..d11f6e966f5ec2 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -1156,7 +1156,7 @@ MakeTupleHelper(PjRtStreamExecutorClient* client,
 
   se::Stream* stream = local_device->host_to_device_stream();
   TF_ASSIGN_OR_RETURN(
-      se::OwningDeviceMemory owned_root_table_memory,
+      se::ScopedDeviceAddress<uint8_t> owned_root_table_memory,
       allocator->Allocate(
           device_ordinal,
           transfer_manager->GetByteSizeRequirement(tupled_parameter_shape)));
@@ -1673,7 +1673,7 @@ PjRtStreamExecutorClient::RunAsync(
     auto it = tmp.MutableBuffers()->begin();
     for (auto& v : input) {
       if (v.second.is_donated) {
-        it->second = MaybeOwningDeviceAddress(se::OwningDeviceMemory(
+        it->second = MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
             v.second.buf->mem(), device->local_device_id().value(),
             run_options.allocator()));
         tmp.SetUnownedIndex(it->first);
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 4b656c48fc2517..4220db893cb1dc 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -91,8 +91,8 @@ struct PjRtStreamExecutorExecutionOutput {
   // Donated inputs which must be freed.
   std::vector<tsl::AsyncValueRef<RawSEDeviceMemory>> to_be_released;
   // For PjRtStreamExecutorClient implementations that
-  // use OwningDeviceMemory for donated inputs.
-  std::vector<se::OwningDeviceMemory> se_to_be_released;
+  // use ScopedDeviceAddress for donated inputs.
+  std::vector<se::ScopedDeviceAddress<uint8_t>> se_to_be_released;
 };
 
 class PjRtStreamExecutorDevice : public PjRtDevice {
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
index d5bec6ba286977..2c1b89083b477d 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/pjrt/tracked_device_buffer.h"
 
+#include <cstdint>
 #include <memory>
 #include <utility>
 #include <vector>
@@ -90,7 +91,7 @@ absl::StatusOr<tsl::AsyncValueRef<RawSEDeviceMemory>> MakeArray(
       client->backend().transfer_manager()->HostShapeToDeviceShape(shape),
       [&](const Shape& subshape, const ShapeIndex&) -> absl::Status {
         TF_ASSIGN_OR_RETURN(
-            se::OwningDeviceMemory device_memory,
+            se::ScopedDeviceAddress<uint8_t> device_memory,
             client->backend().memory_allocator()->Allocate(
                 /*device_ordinal=*/0,
                 client->backend().transfer_manager()->GetByteSizeRequirement(
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index b5d097d79b4715..e5e8114809599e 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4113,6 +4113,8 @@ cc_library(
     hdrs = ["maybe_owning_device_memory.h"],
     deps = [
         ":maybe_owning_device_address",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/service/maybe_owning_device_memory.h b/third_party/xla/xla/service/maybe_owning_device_memory.h
index 897003ffb17429..40d05599971dcd 100644
--- a/third_party/xla/xla/service/maybe_owning_device_memory.h
+++ b/third_party/xla/xla/service/maybe_owning_device_memory.h
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include "absl/base/macros.h"
 #include "xla/service/maybe_owning_device_address.h"
+#include "xla/stream_executor/device_address.h"  // IWYU pragma: keep
+#include "xla/stream_executor/device_address_allocator.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory_allocator.h"  // IWYU pragma: keep
 
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 9f617478a6ea7b..4466fb094ab53d 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -184,7 +184,7 @@ cc_library(
         "//xla/service:hlo_runner_pjrt",
         "//xla/service:interpreter_plugin",  # reference backend
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/tsl/lib/core:status_test_util",
@@ -451,8 +451,8 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
         "//xla/service:transfer_manager",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -522,8 +522,8 @@ xla_test(
         "//xla/service:hlo_module_config",
         "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
@@ -2989,7 +2989,7 @@ xla_test(
         "//xla/ffi:ffi_api",
         "//xla/ffi/api:c_api",
         "//xla/service:collective_ops_utils",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:errors",
@@ -3520,7 +3520,7 @@ xla_test(
         "//xla/service:platform_util",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
@@ -3660,7 +3660,7 @@ xla_test(
         "//xla/service:generic_transfer_manager",
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/tests:xla_test_backend_predicates",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test_benchmark",
diff --git a/third_party/xla/xla/tests/buffer_donation_test.cc b/third_party/xla/xla/tests/buffer_donation_test.cc
index 324917cbd57df6..870a7b659bcb27 100644
--- a/third_party/xla/xla/tests/buffer_donation_test.cc
+++ b/third_party/xla/xla/tests/buffer_donation_test.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -113,7 +113,7 @@ class BufferDonationTest : public HloTestBase {
         run_options, backend_->StreamBorrowerWithPriority());
 
     std::vector<ExecutionInput> args;
-    std::vector<ShapeTree<se::DeviceMemoryBase>> inputs_buffers;
+    std::vector<ShapeTree<se::DeviceAddressBase>> inputs_buffers;
 
     CHECK_EQ(argument_literals.size(), donate_arguments.size());
 
@@ -130,7 +130,7 @@ class BufferDonationTest : public HloTestBase {
       ShapedBuffer shaped_buffer = scoped_shaped_buffer.release();
       CHECK_OK(backend_->transfer_manager()->TransferLiteralToDevice(
           stream.get(), argument_literal, shaped_buffer));
-      ShapeTree<se::DeviceMemoryBase> input_buffers = shaped_buffer.buffers();
+      ShapeTree<se::DeviceAddressBase> input_buffers = shaped_buffer.buffers();
       inputs_buffers.push_back(input_buffers);
       ShapeTree<MaybeOwningDeviceAddress> owned_buffers(
           argument_literal.shape());
@@ -138,7 +138,7 @@ class BufferDonationTest : public HloTestBase {
           [&](const ShapeIndex& index,
               MaybeOwningDeviceAddress* device_memory) {
             if (donate_argument) {
-              *device_memory = se::OwningDeviceMemory(
+              *device_memory = se::ScopedDeviceAddress<uint8_t>(
                   input_buffers.element(index), executor_->device_ordinal(),
                   &memory_allocator);
             } else {
@@ -162,7 +162,7 @@ class BufferDonationTest : public HloTestBase {
     }
     ExecutionOutput output = std::move(output_status).value();
 
-    se::DeviceMemoryBase result_root_buffer = output.Result().root_buffer();
+    se::DeviceAddressBase result_root_buffer = output.Result().root_buffer();
     LOG(INFO) << "result allocation = " << result_root_buffer.opaque()
               << "             size = " << result_root_buffer.size();
 
diff --git a/third_party/xla/xla/tests/collective_ops_ffi_test.cc b/third_party/xla/xla/tests/collective_ops_ffi_test.cc
index f56ef7045eca7b..21d423965efc0e 100644
--- a/third_party/xla/xla/tests/collective_ops_ffi_test.cc
+++ b/third_party/xla/xla/tests/collective_ops_ffi_test.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/collective_ops_e2e_test_base.h"
 #include "xla/tests/literal_test_util.h"
diff --git a/third_party/xla/xla/tests/hlo_test_base.cc b/third_party/xla/xla/tests/hlo_test_base.cc
index 6421e9badcbec7..dce925c25e28d0 100644
--- a/third_party/xla/xla/tests/hlo_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_test_base.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/hlo_runner_pjrt.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
 #include "xla/tests/hlo_runner_agnostic_reference_mixin.h"
@@ -174,7 +174,7 @@ ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
                                   reference_preprocessor);
 }
 
-se::DeviceMemoryAllocator* HloTestBase::GetAllocator() {
+se::DeviceAddressAllocator* HloTestBase::GetAllocator() {
   if (allocator_ == nullptr) {
     allocator_ = std::make_unique<se::StreamExecutorMemoryAllocator>(
         backend().default_stream_executor());
diff --git a/third_party/xla/xla/tests/hlo_test_base.h b/third_party/xla/xla/tests/hlo_test_base.h
index 31efd1fc5ff2bb..c378860ec85a40 100644
--- a/third_party/xla/xla/tests/hlo_test_base.h
+++ b/third_party/xla/xla/tests/hlo_test_base.h
@@ -48,7 +48,7 @@ static_assert(false,
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo_runner.h"
 #include "xla/service/hlo_runner_interface.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/tests/hlo_runner_agnostic_reference_mixin.h"
 #include "xla/tests/hlo_runner_agnostic_test_base.h"
@@ -210,7 +210,7 @@ class ABSL_DEPRECATED(
   static se::Platform* GetTestPlatform();
 
   // Creates or retrieves the allocator.
-  se::DeviceMemoryAllocator* GetAllocator();
+  se::DeviceAddressAllocator* GetAllocator();
 
   ErrorSpec error_spec_{0.0001};
 
@@ -224,7 +224,7 @@ class ABSL_DEPRECATED(
               bool allow_mixed_precision_in_hlo_verifier,
               HloPredicate instruction_can_change_layout_func);
 
-  std::unique_ptr<se::DeviceMemoryAllocator> allocator_;
+  std::unique_ptr<se::DeviceAddressAllocator> allocator_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/tests/local_client_execute_test.cc b/third_party/xla/xla/tests/local_client_execute_test.cc
index ac4aec28517450..cb0675c889c052 100644
--- a/third_party/xla/xla/tests/local_client_execute_test.cc
+++ b/third_party/xla/xla/tests/local_client_execute_test.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/host/host_platform_id.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/tests/local_client_test_base.cc b/third_party/xla/xla/tests/local_client_test_base.cc
index 29563c202f26a2..957b24fc150f8e 100644
--- a/third_party/xla/xla/tests/local_client_test_base.cc
+++ b/third_party/xla/xla/tests/local_client_test_base.cc
@@ -43,8 +43,8 @@ limitations under the License.
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -57,7 +57,7 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-absl::StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
+absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> TestAllocator::Allocate(
     int device_ordinal, uint64_t size, bool retry_on_failure,
     int64_t memory_space) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
@@ -71,7 +71,7 @@ absl::StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
 }
 
 absl::Status TestAllocator::Deallocate(int device_ordinal,
-                                       se::DeviceMemoryBase mem) {
+                                       se::DeviceAddressBase mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     absl::MutexLock lock(count_mutex_);
diff --git a/third_party/xla/xla/tests/local_client_test_base.h b/third_party/xla/xla/tests/local_client_test_base.h
index cb7de54135e8db..3afeae8c003d8c 100644
--- a/third_party/xla/xla/tests/local_client_test_base.h
+++ b/third_party/xla/xla/tests/local_client_test_base.h
@@ -37,8 +37,8 @@ limitations under the License.
 #include "xla/service/platform_util.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -53,11 +53,11 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator {
       : se::StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).value()) {}
 
-  absl::StatusOr<se::OwningDeviceMemory> Allocate(
+  absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(
       int device_ordinal, uint64_t size, bool retry_on_failure,
       int64_t memory_space) override;
   absl::Status Deallocate(int device_ordinal,
-                          se::DeviceMemoryBase mem) override;
+                          se::DeviceAddressBase mem) override;
 
   // Return the number of allocations that have been performed.
   int64_t allocation_count() const;
diff --git a/third_party/xla/xla/tests/transfer_manager_test.cc b/third_party/xla/xla/tests/transfer_manager_test.cc
index 6a4a188afd94fa..66d84eebb73fb7 100644
--- a/third_party/xla/xla/tests/transfer_manager_test.cc
+++ b/third_party/xla/xla/tests/transfer_manager_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/service/stream_pool.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tests/literal_test_util.h"
 #include "xla/tests/local_client_test_base.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 5f422444fd55e9..60993b0f7d19ab 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -1007,8 +1007,8 @@ tsl_gpu_library(
         "//xla/service/cpu:cpu_executable",
         "//xla/service/gpu:gpu_symbol_repository",
         "//xla/service/gpu/autotuning:autotuner_util",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description_proto_cc",
-        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",

From 5b7e6e0d94c58afd3756c2b69e28a4d32e9ba9fa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 12:11:08 -0800
Subject: [PATCH 089/753] Reverts 21040b5b652e63d81f77509a5ce8ab27a5b1d16c

PiperOrigin-RevId: 842342466
---
 tensorflow/compiler/jit/xla_tensor.cc         |  2 +-
 .../runtime/gpublas_lt_matmul_thunk_test.cc   |  3 +-
 third_party/xla/xla/client/BUILD              |  4 +-
 third_party/xla/xla/client/client_library.h   |  2 +-
 third_party/xla/xla/client/local_client.cc    |  4 +-
 third_party/xla/xla/client/local_client.h     |  4 +-
 third_party/xla/xla/core/collectives/BUILD    |  2 +-
 .../xla/xla/core/collectives/communicator.h   | 40 ++++++++--------
 third_party/xla/xla/ffi/BUILD                 | 10 ++--
 third_party/xla/xla/ffi/api/BUILD             |  4 +-
 third_party/xla/xla/ffi/api/c_api_internal.h  |  2 +-
 third_party/xla/xla/ffi/api/ffi_test.cc       | 48 +++++++++----------
 third_party/xla/xla/ffi/call_frame.cc         | 20 ++++----
 third_party/xla/xla/ffi/call_frame.h          | 14 +++---
 third_party/xla/xla/ffi/call_frame_test.cc    | 20 ++++----
 third_party/xla/xla/ffi/ffi.h                 | 12 ++---
 third_party/xla/xla/ffi/ffi_api.cc            |  4 +-
 third_party/xla/xla/ffi/ffi_test.cc           | 32 ++++++-------
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  4 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc  | 23 ++++-----
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  4 +-
 .../xla/pjrt/pjrt_stream_executor_client.h    |  4 +-
 .../xla/pjrt/tracked_device_buffer_test.cc    |  3 +-
 third_party/xla/xla/service/BUILD             |  2 -
 .../xla/service/maybe_owning_device_memory.h  |  2 -
 third_party/xla/xla/tests/BUILD               | 16 +++----
 .../xla/xla/tests/buffer_donation_test.cc     | 12 ++---
 .../xla/xla/tests/collective_ops_ffi_test.cc  |  2 +-
 third_party/xla/xla/tests/hlo_test_base.cc    |  4 +-
 third_party/xla/xla/tests/hlo_test_base.h     |  6 +--
 .../xla/tests/local_client_execute_test.cc    |  2 +-
 .../xla/xla/tests/local_client_test_base.cc   |  8 ++--
 .../xla/xla/tests/local_client_test_base.h    |  8 ++--
 .../xla/xla/tests/transfer_manager_test.cc    |  2 +-
 third_party/xla/xla/tools/BUILD               |  2 +-
 35 files changed, 158 insertions(+), 173 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index d6792cd7802d96..e9cdad219dd28d 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -55,7 +55,7 @@ absl::Status XlaTensor::AllocateShapedBuffer(DataType dtype,
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(se::ScopedDeviceAddress<uint8_t> buffer,
+    TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false,
                             subshape.layout().memory_space()));
diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
index 77a6ac88f8ff70..ccdf653ca1862e 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h"
 
 #include <cstddef>
-#include <cstdint>
 #include <deque>
 #include <memory>
 #include <optional>
@@ -183,7 +182,7 @@ class GpuBlasLtThunkBuilder {
   se::StreamExecutorMemoryAllocator allocator_;
   se::GpuComputeCapability gpu_comp_;
   std::deque<BufferAllocation> allocs_;
-  std::vector<se::ScopedDeviceAddress<uint8_t>> mem_buffers_;
+  std::vector<se::OwningDeviceMemory> mem_buffers_;
 };
 
 void GpuBlasLtMatmulThunkTest::CreateExecuteThunksFromHLO(
diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD
index fac2d9343ff1d0..c2801fa3fa8410 100644
--- a/third_party/xla/xla/client/BUILD
+++ b/third_party/xla/xla/client/BUILD
@@ -128,7 +128,7 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:source_map_util",
         "//xla/service:stream_pool",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -181,7 +181,7 @@ cc_library(
         "//xla/service:compile_only_service",
         "//xla/service:local_service",
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:logging",
diff --git a/third_party/xla/xla/client/client_library.h b/third_party/xla/xla/client/client_library.h
index 42d0f34202e092..0e4f3a9a24dd22 100644
--- a/third_party/xla/xla/client/client_library.h
+++ b/third_party/xla/xla/client/client_library.h
@@ -36,7 +36,7 @@ limitations under the License.
 #include "xla/client/local_client.h"
 #include "xla/service/compile_only_service.h"
 #include "xla/service/local_service.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/types.h"
diff --git a/third_party/xla/xla/client/local_client.cc b/third_party/xla/xla/client/local_client.cc
index e1f348a755521d..cc383a9aa81b34 100644
--- a/third_party/xla/xla/client/local_client.cc
+++ b/third_party/xla/xla/client/local_client.cc
@@ -45,7 +45,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
@@ -512,7 +512,7 @@ absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::LoadInternal(
 
 absl::StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    se::DeviceAddressAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 3c237ef37a1973..3ccda5d43f6794 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/stream_pool.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -183,7 +183,7 @@ class LocalClient : public Client {
   // device is used.
   absl::StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
-      se::DeviceAddressAllocator* allocator = nullptr);
+      se::DeviceMemoryAllocator* allocator = nullptr);
 
   // Transfer the BorrowingLiteral to the device with the given ordinal.
   absl::StatusOr<GlobalDataHandle> TransferToLocalServer(
diff --git a/third_party/xla/xla/core/collectives/BUILD b/third_party/xla/xla/core/collectives/BUILD
index 06d3ef7f6c9aed..1b0398aaaf4801 100644
--- a/third_party/xla/xla/core/collectives/BUILD
+++ b/third_party/xla/xla/core/collectives/BUILD
@@ -73,7 +73,7 @@ cc_library(
         "//xla:future",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/core/collectives/communicator.h b/third_party/xla/xla/core/collectives/communicator.h
index 4be35fb52163f7..0f60a859db854d 100644
--- a/third_party/xla/xla/core/collectives/communicator.h
+++ b/third_party/xla/xla/core/collectives/communicator.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "xla/core/collectives/rank_id.h"
 #include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 
@@ -65,7 +65,7 @@ class Communicator {
   // Register `buffer_range` once for efficient collective operations (i.e. on
   // NCCL backend it registers the buffer for zero-copy collective operations).
   //
-  virtual absl::Status RegisterBufferOnce(se::DeviceAddressBase buffer_range,
+  virtual absl::Status RegisterBufferOnce(se::DeviceMemoryBase buffer_range,
                                           int device_ordinal,
                                           bool use_symmetric_buffer) {
     return Unimplemented("User-managed buffer registration is not supported");
@@ -91,40 +91,40 @@ class Communicator {
 
   // Reduce buffers of length `count` in `send_buff` using `reduction_kind`
   // reduction and leaves identical copies of the result on each `recv_buff`.
-  virtual Future<> AllReduce(stream_executor::DeviceAddressBase send_buffer,
-                             stream_executor::DeviceAddressBase recv_buffer,
+  virtual Future<> AllReduce(stream_executor::DeviceMemoryBase send_buffer,
+                             stream_executor::DeviceMemoryBase recv_buffer,
                              PrimitiveType dtype, size_t count,
                              ReductionKind reduction_kind,
                              const Executor& executor) = 0;
 
   // Copy data in `send_buff` from the root device to the `recv_buff` on
   // all other devices.
-  virtual Future<> Broadcast(se::DeviceAddressBase send_buffer,
-                             se::DeviceAddressBase recv_buffer,
+  virtual Future<> Broadcast(se::DeviceMemoryBase send_buffer,
+                             se::DeviceMemoryBase recv_buffer,
                              PrimitiveType dtype, size_t count, RankId root,
                              const Executor& executor) = 0;
 
   // Reduce data in `send_buff` from all devices using the `reduction_kind`
   // operation and leave the reduced result scattered over the devices so that
   // the `recv_buff` on rank `i` will contain the i-th block of the result.
-  virtual Future<> ReduceScatter(se::DeviceAddressBase send_buffer,
-                                 se::DeviceAddressBase recv_buffer,
+  virtual Future<> ReduceScatter(se::DeviceMemoryBase send_buffer,
+                                 se::DeviceMemoryBase recv_buffer,
                                  PrimitiveType dtype, size_t count,
                                  ReductionKind reduction_kind,
                                  const Executor& executor) = 0;
 
   // Gather `count` values from all devices into `recv_buffer`, receiving data
   // from rank `i` at offset `i * sendcount`.
-  virtual Future<> AllGather(se::DeviceAddressBase send_buffer,
-                             se::DeviceAddressBase recv_buffer,
+  virtual Future<> AllGather(se::DeviceMemoryBase send_buffer,
+                             se::DeviceMemoryBase recv_buffer,
                              PrimitiveType dtype, size_t count,
                              const Executor& executor) = 0;
 
   // Sends data from `send_buffer` to `target_ranks` and receives data from
   // `source_rank` into `recv_buffer`. If `source_rank` is not specified, the
   // output is filled with zeros.
-  virtual Future<> CollectivePermute(se::DeviceAddressBase send_buffer,
-                                     se::DeviceAddressBase recv_buffer,
+  virtual Future<> CollectivePermute(se::DeviceMemoryBase send_buffer,
+                                     se::DeviceMemoryBase recv_buffer,
                                      PrimitiveType dtype, size_t count,
                                      std::optional<RankId> source_rank,
                                      absl::Span<const RankId> target_ranks,
@@ -133,30 +133,30 @@ class Communicator {
   // Sends `count` values from `send_buffers` to other ranks and receives data
   // from other ranks into `recv_buffers`.
   virtual Future<> AllToAll(
-      absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
-      absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
+      absl::InlinedVector<se::DeviceMemoryBase, 4> send_buffers,
+      absl::InlinedVector<se::DeviceMemoryBase, 4> recv_buffers,
       PrimitiveType dtype, size_t count, const Executor& executor) = 0;
 
   // Send data from `send_buff` to rank `peer`.
-  virtual Future<> Send(se::DeviceAddressBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Send(se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer,
                         const Executor& executor) = 0;
 
   // Receive data from rank `peer` into `recv_buff`.
-  virtual Future<> Recv(se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
+  virtual Future<> Recv(se::DeviceMemoryBase recv_buffer, PrimitiveType dtype,
                         size_t count, RankId peer,
                         const Executor& executor) = 0;
 
   // Send data from `send_buff` to rank `recv_buff` (one-way send).
-  virtual Future<> Send(se::DeviceAddressBase recv_buffer,
-                        se::DeviceAddressBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Send(se::DeviceMemoryBase recv_buffer,
+                        se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer, const Executor& executor) {
     return Unimplemented("One-way send is not implemented");
   }
 
   // Receive data from rank `peer` into `recv_buff` (one-way recv).
-  virtual Future<> Recv(se::DeviceAddressBase recv_buffer,
-                        se::DeviceAddressBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Recv(se::DeviceMemoryBase recv_buffer,
+                        se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer, const Executor& executor) {
     return Unimplemented("One-way recv is not implemented");
   }
diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD
index f14764091594bc..41c825e3599ea2 100644
--- a/third_party/xla/xla/ffi/BUILD
+++ b/third_party/xla/xla/ffi/BUILD
@@ -37,7 +37,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "//xla/tsl/platform:errors",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -58,7 +58,7 @@ xla_cc_test(
         ":call_frame",
         "//xla:xla_data_proto_cc",
         "//xla/ffi/api:c_api",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_benchmark",
@@ -149,7 +149,7 @@ cc_library(
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
         "//xla/hlo/ir:hlo",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "//xla/tsl/concurrency:async_value",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -182,8 +182,8 @@ cc_library(
         "//xla/ffi/api:c_api_internal",
         "//xla/hlo/ir:hlo",
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
@@ -299,7 +299,7 @@ xla_cc_test(
         "//xla/backends/cpu:ffi",
         "//xla/backends/gpu:ffi",
         "//xla/ffi/api:c_api",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/core:status_test_util",
diff --git a/third_party/xla/xla/ffi/api/BUILD b/third_party/xla/xla/ffi/api/BUILD
index dc4551d8e2fecc..41889027b9ddd3 100644
--- a/third_party/xla/xla/ffi/api/BUILD
+++ b/third_party/xla/xla/ffi/api/BUILD
@@ -91,8 +91,8 @@ xla_cc_test(
         "//xla/ffi:execution_state",
         "//xla/ffi:ffi_api",
         "//xla/ffi:type_registry",
-        "//xla/stream_executor:device_address",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/ffi/api/c_api_internal.h b/third_party/xla/xla/ffi/api/c_api_internal.h
index d9070080f3a4a6..d0baf4fc3b7bb0 100644
--- a/third_party/xla/xla/ffi/api/c_api_internal.h
+++ b/third_party/xla/xla/ffi/api/c_api_internal.h
@@ -93,7 +93,7 @@ typedef XLA_FFI_Error* XLA_FFI_INTERNAL_IntraOpThreadPool_Get(
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_Stream_Get(
     XLA_FFI_ExecutionContext* ctx, void** stream);
 
-// Returns a pointer to device memory allocator (`se::DeviceAddressAllocator`
+// Returns a pointer to device memory allocator (`se::DeviceMemoryAllocator`
 // pointer) which allows to allocate memory inside a custom call from the same
 // allocator as XLA (i.e. it allows to construct scratch memory allocator).
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get(
diff --git a/third_party/xla/xla/ffi/api/ffi_test.cc b/third_party/xla/xla/ffi/api/ffi_test.cc
index 81578f564956fd..e3345ebe915146 100644
--- a/third_party/xla/xla/ffi/api/ffi_test.cc
+++ b/third_party/xla/xla/ffi/api/ffi_test.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "xla/ffi/ffi_api.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/primitive_util.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/lib/core/status_test_util.h"
@@ -522,7 +522,7 @@ TEST(FfiTest, DeviceOrdinal) {
 
 TEST(FfiTest, AnyBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -544,7 +544,7 @@ TEST(FfiTest, AnyBufferArgument) {
 
 TEST(FfiTest, BufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -562,7 +562,7 @@ TEST(FfiTest, BufferArgument) {
 
 TEST(FfiTest, AnyBufferResult) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -594,7 +594,7 @@ TEST(FfiTest, MissingBufferArgument) {
 
 TEST(FfiTest, WrongRankBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -611,7 +611,7 @@ TEST(FfiTest, WrongRankBufferArgument) {
 
 TEST(FfiTest, WrongTypeBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::S32, /*dims=*/{2, 2});
@@ -648,7 +648,7 @@ TEST(FfiTest, WrongNumberOfArguments) {
 
 TEST(FfiTest, TokenArgument) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceAddressBase(), PrimitiveType::TOKEN,
+  builder.AddBufferArg(se::DeviceMemoryBase(), PrimitiveType::TOKEN,
                        /*dims=*/{});
   auto call_frame = builder.Build();
 
@@ -665,7 +665,7 @@ TEST(FfiTest, TokenArgument) {
 
 TEST(FfiTest, RemainingArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -694,7 +694,7 @@ TEST(FfiTest, RemainingArgs) {
 
 TEST(FfiTest, RemainingRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/2);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -724,7 +724,7 @@ TEST(FfiTest, RemainingRets) {
 
 TEST(FfiTest, OptionalArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -785,7 +785,7 @@ TEST(FfiTest, OptionalArgs) {
 
 TEST(FfiTest, OptionalRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -854,7 +854,7 @@ TEST(FfiTest, AutoBinding) {
   });
 
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder::AttributesBuilder attrs;
   attrs.Insert(kI32, 42);
@@ -873,8 +873,7 @@ TEST(FfiTest, AutoBindingResult) {
       Ffi::BindTo(+[](Result<AnyBuffer> buffer) { return Error::Success(); });
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
-  builder.AddBufferRet(se::DeviceAddressBase(), PrimitiveType::F32,
-                       /*dims=*/{});
+  builder.AddBufferRet(se::DeviceMemoryBase(), PrimitiveType::F32, /*dims=*/{});
   auto call_frame = builder.Build();
 
   auto status = Call(*handler, call_frame);
@@ -1410,22 +1409,19 @@ TEST(FfiTest, ScratchAllocator) {
   static void* kAddr = reinterpret_cast<void*>(0xDEADBEEF);
 
   // A test only memory allocator that returns a fixed memory address.
-  struct TestDeviceMemoryAllocator final : public se::DeviceAddressAllocator {
+  struct TestDeviceMemoryAllocator final : public se::DeviceMemoryAllocator {
     size_t count;
 
     TestDeviceMemoryAllocator()
-        : se::DeviceAddressAllocator(nullptr), count(0) {}
+        : se::DeviceMemoryAllocator(nullptr), count(0) {}
 
-    absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(int,
-                                                              uint64_t size,
-                                                              bool,
-                                                              int64_t) final {
+    absl::StatusOr<se::OwningDeviceMemory> Allocate(int, uint64_t size, bool,
+                                                    int64_t) final {
       count++;
-      return se::ScopedDeviceAddress<uint8_t>(
-          se::DeviceAddressBase(kAddr, size), 0, this);
+      return se::OwningDeviceMemory(se::DeviceMemoryBase(kAddr, size), 0, this);
     }
 
-    absl::Status Deallocate(int, se::DeviceAddressBase mem) final {
+    absl::Status Deallocate(int, se::DeviceMemoryBase mem) final {
       count--;
       EXPECT_EQ(mem.opaque(), kAddr);
       return absl::OkStatus();
@@ -1592,7 +1588,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(BufferR2F32Handler, BufferR2F32Function);
 
 TEST(FfiTest, DefineAutoSymbol) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -1608,7 +1604,7 @@ TEST(FfiTest, DefineAutoSymbol) {
 //===----------------------------------------------------------------------===//
 
 static CallFrameBuilder WithBufferArgs(size_t num_args, size_t rank = 4) {
-  se::DeviceAddressBase memory;
+  se::DeviceMemoryBase memory;
   std::vector<int64_t> dims(4, 1);
 
   CallFrameBuilder builder(/*num_args=*/num_args, /*num_rets=*/0);
diff --git a/third_party/xla/xla/ffi/call_frame.cc b/third_party/xla/xla/ffi/call_frame.cc
index f0c17215c2dafd..ad7c71c98f8cd6 100644
--- a/third_party/xla/xla/ffi/call_frame.cc
+++ b/third_party/xla/xla/ffi/call_frame.cc
@@ -35,7 +35,7 @@ limitations under the License.
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/api/c_api_internal.h"  // IWYU pragma: keep
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -47,7 +47,7 @@ namespace xla::ffi {
 //===----------------------------------------------------------------------===//
 
 struct CallFrameBuilder::Buffer {
-  se::DeviceAddressBase memory;
+  se::DeviceMemoryBase memory;
   PrimitiveType type;
   absl::InlinedVector<int64_t, 4> dims;
 };
@@ -84,7 +84,7 @@ CallFrameBuilder::CallFrameBuilder(size_t num_args, size_t num_rets) {
 
 CallFrameBuilder::~CallFrameBuilder() = default;
 
-void CallFrameBuilder::AddBufferArg(se::DeviceAddressBase memory,
+void CallFrameBuilder::AddBufferArg(se::DeviceMemoryBase memory,
                                     PrimitiveType type,
                                     absl::Span<const int64_t> dims) {
   DCHECK(args_.capacity() > args_.size())
@@ -95,10 +95,10 @@ void CallFrameBuilder::AddBufferArg(se::DeviceAddressBase memory,
 void CallFrameBuilder::AddTokenArg() {
   DCHECK(args_.capacity() > args_.size())
       << "CallFrame builder `num_args` argument was too small";
-  args_.push_back(Buffer{se::DeviceAddressBase(), PrimitiveType::TOKEN, {}});
+  args_.push_back(Buffer{se::DeviceMemoryBase(), PrimitiveType::TOKEN, {}});
 }
 
-void CallFrameBuilder::AddBufferRet(se::DeviceAddressBase memory,
+void CallFrameBuilder::AddBufferRet(se::DeviceMemoryBase memory,
                                     PrimitiveType type,
                                     absl::Span<const int64_t> dims) {
   DCHECK(rets_.capacity() > rets_.size())
@@ -109,7 +109,7 @@ void CallFrameBuilder::AddBufferRet(se::DeviceAddressBase memory,
 void CallFrameBuilder::AddTokenRet() {
   DCHECK(rets_.capacity() > rets_.size())
       << "CallFrame builder `num_rets` argument was too small";
-  rets_.push_back(Buffer{se::DeviceAddressBase(), PrimitiveType::TOKEN, {}});
+  rets_.push_back(Buffer{se::DeviceMemoryBase(), PrimitiveType::TOKEN, {}});
 }
 
 void CallFrameBuilder::AddAttributes(AttributesMap attrs) {
@@ -557,8 +557,8 @@ std::unique_ptr<CallFrame::Attributes> CallFrame::FixUpAttrs(
 //===----------------------------------------------------------------------===//
 
 absl::Status CallFrame::UpdateWithBuffers(
-    absl::Span<const se::DeviceAddressBase> args,
-    absl::Span<const se::DeviceAddressBase> rets) {
+    absl::Span<const se::DeviceMemoryBase> args,
+    absl::Span<const se::DeviceMemoryBase> rets) {
   if (ABSL_PREDICT_FALSE(args.size() != arguments_->args.size())) {
     return InvalidArgument("Invalid number of updated arguments: %d vs %d",
                            args.size(), arguments_->args.size());
@@ -587,8 +587,8 @@ CallFrame CallFrame::Copy() const {
 }
 
 absl::StatusOr<CallFrame> CallFrame::CopyWithBuffers(
-    absl::Span<const se::DeviceAddressBase> args,
-    absl::Span<const se::DeviceAddressBase> rets) const {
+    absl::Span<const se::DeviceMemoryBase> args,
+    absl::Span<const se::DeviceMemoryBase> rets) const {
   CallFrame clone(CopyArgs(*arguments_), CopyRets(*results_), attributes_);
   TF_RETURN_IF_ERROR(clone.UpdateWithBuffers(args, rets));
   return clone;
diff --git a/third_party/xla/xla/ffi/call_frame.h b/third_party/xla/xla/ffi/call_frame.h
index 5433d4be990d42..32dceead1d9b4b 100644
--- a/third_party/xla/xla/ffi/call_frame.h
+++ b/third_party/xla/xla/ffi/call_frame.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/types.h"  // IWYU pragma: keep
 #include "xla/xla_data.pb.h"
 
@@ -76,12 +76,12 @@ class CallFrameBuilder {
 
   CallFrame Build();
 
-  void AddBufferArg(se::DeviceAddressBase memory, PrimitiveType type,
+  void AddBufferArg(se::DeviceMemoryBase memory, PrimitiveType type,
                     absl::Span<const int64_t> dims);
 
   void AddTokenArg();
 
-  void AddBufferRet(se::DeviceAddressBase memory, PrimitiveType type,
+  void AddBufferRet(se::DeviceMemoryBase memory, PrimitiveType type,
                     absl::Span<const int64_t> dims);
 
   void AddTokenRet();
@@ -117,16 +117,16 @@ class CallFrame {
   // array (buffer) arguments and results are known at compile time. Instead of
   // rebuilding the call frame from scratch on every execution, we can just
   // update the arguments and results with new pointers to device memory.
-  absl::Status UpdateWithBuffers(absl::Span<const se::DeviceAddressBase> args,
-                                 absl::Span<const se::DeviceAddressBase> rets);
+  absl::Status UpdateWithBuffers(absl::Span<const se::DeviceMemoryBase> args,
+                                 absl::Span<const se::DeviceMemoryBase> rets);
 
   // Creates a copy of the call frame.
   CallFrame Copy() const;
 
   // Creates a copy of the call frame with updated arguments and results.
   absl::StatusOr<CallFrame> CopyWithBuffers(
-      absl::Span<const se::DeviceAddressBase> args,
-      absl::Span<const se::DeviceAddressBase> rets) const;
+      absl::Span<const se::DeviceMemoryBase> args,
+      absl::Span<const se::DeviceMemoryBase> rets) const;
 
   // Builds an XLA_FFI_CallFrame from owned arguments and attributes.
   XLA_FFI_CallFrame Build(
diff --git a/third_party/xla/xla/ffi/call_frame_test.cc b/third_party/xla/xla/ffi/call_frame_test.cc
index b58e2d9a2537b6..f73461fc7d297f 100644
--- a/third_party/xla/xla/ffi/call_frame_test.cc
+++ b/third_party/xla/xla/ffi/call_frame_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/test_benchmark.h"
@@ -34,8 +34,8 @@ limitations under the License.
 namespace xla::ffi {
 
 TEST(CallFrameTest, UpdateCallFrame) {
-  se::DeviceAddressBase mem0(reinterpret_cast<void*>(0x12345678), 1024);
-  se::DeviceAddressBase mem1(reinterpret_cast<void*>(0x87654321), 1024);
+  se::DeviceMemoryBase mem0(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceMemoryBase mem1(reinterpret_cast<void*>(0x87654321), 1024);
 
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
@@ -116,7 +116,7 @@ TEST(CallFrameTest, UpdateCallFrame) {
 void BM_AddBufferArg(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   for (auto _ : state) {
@@ -151,17 +151,17 @@ void BM_AddAttributes(benchmark::State& state) {
 void BM_UpdateCallFrame(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   CallFrameBuilder builder(num_args, /*num_rets=*/0);
   for (size_t i = 0; i < num_args; ++i) {
-    builder.AddBufferArg(se::DeviceAddressBase(nullptr, 1024),
+    builder.AddBufferArg(se::DeviceMemoryBase(nullptr, 1024),
                          PrimitiveType::F32, dims);
   }
   CallFrame call_frame = builder.Build();
 
-  std::vector<se::DeviceAddressBase> updated_args(num_args, memory);
+  std::vector<se::DeviceMemoryBase> updated_args(num_args, memory);
 
   for (auto _ : state) {
     auto updated_call_frame =
@@ -173,17 +173,17 @@ void BM_UpdateCallFrame(benchmark::State& state) {
 void BM_UpdateCallFrameInPlace(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   CallFrameBuilder builder(num_args, /*num_rets=*/0);
   for (size_t i = 0; i < num_args; ++i) {
-    builder.AddBufferArg(se::DeviceAddressBase(nullptr, 1024),
+    builder.AddBufferArg(se::DeviceMemoryBase(nullptr, 1024),
                          PrimitiveType::F32, dims);
   }
   CallFrame call_frame = builder.Build();
 
-  std::vector<se::DeviceAddressBase> updated_args(num_args, memory);
+  std::vector<se::DeviceMemoryBase> updated_args(num_args, memory);
 
   for (auto _ : state) {
     benchmark::DoNotOptimize(
diff --git a/third_party/xla/xla/ffi/ffi.h b/third_party/xla/xla/ffi/ffi.h
index 4e1849a190d327..da6303e14faef7 100644
--- a/third_party/xla/xla/ffi/ffi.h
+++ b/third_party/xla/xla/ffi/ffi.h
@@ -50,7 +50,7 @@ limitations under the License.
 #include "xla/ffi/type_registry.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/primitive_util.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/types.h"  // IWYU pragma: keep
@@ -137,8 +137,8 @@ class AnyBuffer {
     return reinterpret_cast<T*>(buf_->data);
   }
 
-  se::DeviceAddressBase device_memory() const {
-    return se::DeviceAddressBase(untyped_data(), size_bytes());
+  se::DeviceMemoryBase device_memory() const {
+    return se::DeviceMemoryBase(untyped_data(), size_bytes());
   }
 
  private:
@@ -182,9 +182,9 @@ class Buffer {
     return reinterpret_cast<internal::NativeType<dtype>*>(untyped_data());
   }
 
-  se::DeviceAddress<internal::NativeType<dtype>> device_memory() const {
-    return se::DeviceAddress<internal::NativeType<dtype>>(
-        se::DeviceAddressBase(untyped_data(), size_bytes()));
+  se::DeviceMemory<internal::NativeType<dtype>> device_memory() const {
+    return se::DeviceMemory<internal::NativeType<dtype>>(
+        se::DeviceMemoryBase(untyped_data(), size_bytes()));
   }
 
  private:
diff --git a/third_party/xla/xla/ffi/ffi_api.cc b/third_party/xla/xla/ffi/ffi_api.cc
index 3f0de64033061e..31287ac7587ef4 100644
--- a/third_party/xla/xla/ffi/ffi_api.cc
+++ b/third_party/xla/xla/ffi/ffi_api.cc
@@ -47,8 +47,8 @@ limitations under the License.
 #include "xla/ffi/ffi_structs.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/service/platform_util.h"
-#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/platform/logging.h"
@@ -795,7 +795,7 @@ static XLA_FFI_Error* XLA_FFI_DeviceMemory_Free(
 
   absl::Status status = gpu->allocator->Deallocate(
       args->ctx->device_ordinal,
-      stream_executor::DeviceAddressBase(args->data, args->size));
+      stream_executor::DeviceMemoryBase(args->data, args->size));
   if (!status.ok()) {
     return new XLA_FFI_Error{std::move(status)};
   }
diff --git a/third_party/xla/xla/ffi/ffi_test.cc b/third_party/xla/xla/ffi/ffi_test.cc
index 0369c8cc1946e5..8f0b00244c0a93 100644
--- a/third_party/xla/xla/ffi/ffi_test.cc
+++ b/third_party/xla/xla/ffi/ffi_test.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/ffi/execution_state.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/ffi/type_registry.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
@@ -179,7 +179,7 @@ TEST(FfiTest, CatchExceptionExplicit) {
 
 TEST(FfiTest, WrongNumArgs) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceAddressBase(nullptr), PrimitiveType::F32, {});
+  builder.AddBufferArg(se::DeviceMemoryBase(nullptr), PrimitiveType::F32, {});
   auto call_frame = builder.Build();
 
   auto handler = Ffi::Bind().Arg<AnyBuffer>().Arg<AnyBuffer>().To(
@@ -579,7 +579,7 @@ TEST(FfiTest, DecodingErrors) {
 
 TEST(FfiTest, AnyBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -614,7 +614,7 @@ TEST(FfiTest, AnyBufferArgument) {
 
 TEST(FfiTest, TypedAndRankedBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), storage.size() * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), storage.size() * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -642,8 +642,8 @@ TEST(FfiTest, TypedAndRankedBufferArgument) {
 
 TEST(FfiTest, ComplexBufferArgument) {
   std::vector<std::complex<float>> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(),
-                               storage.size() * sizeof(std::complex<float>));
+  se::DeviceMemoryBase memory(storage.data(),
+                              storage.size() * sizeof(std::complex<float>));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::C64, /*dims=*/{2, 2});
@@ -662,7 +662,7 @@ TEST(FfiTest, ComplexBufferArgument) {
 
 TEST(FfiTest, TokenArgument) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceAddressBase(), PrimitiveType::TOKEN,
+  builder.AddBufferArg(se::DeviceMemoryBase(), PrimitiveType::TOKEN,
                        /*dims=*/{});
   auto call_frame = builder.Build();
 
@@ -679,7 +679,7 @@ TEST(FfiTest, TokenArgument) {
 
 TEST(FfiTest, WrongRankBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -697,7 +697,7 @@ TEST(FfiTest, WrongRankBufferArgument) {
 
 TEST(FfiTest, WrongTypeBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::S32, /*dims=*/{2, 2});
@@ -715,7 +715,7 @@ TEST(FfiTest, WrongTypeBufferArgument) {
 
 TEST(FfiTest, RemainingArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -743,7 +743,7 @@ TEST(FfiTest, RemainingArgs) {
 
 TEST(FfiTest, RemainingRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/2);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -772,7 +772,7 @@ TEST(FfiTest, RemainingRets) {
 
 TEST(FfiTest, OptionalArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -833,7 +833,7 @@ TEST(FfiTest, OptionalArgs) {
 
 TEST(FfiTest, OptionalRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -975,8 +975,8 @@ TEST(FfiTest, UpdateBufferArgumentsAndResults) {
   std::vector<float> storage0(4, 0.0f);
   std::vector<float> storage1(4, 0.0f);
 
-  se::DeviceAddressBase memory0(storage0.data(), 4 * sizeof(float));
-  se::DeviceAddressBase memory1(storage1.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory0(storage0.data(), 4 * sizeof(float));
+  se::DeviceMemoryBase memory1(storage1.data(), 4 * sizeof(float));
 
   std::vector<int64_t> dims = {2, 2};
 
@@ -1169,7 +1169,7 @@ TEST(FfiTest, PlatformStream) {
 //===----------------------------------------------------------------------===//
 
 static CallFrameBuilder WithBufferArgs(size_t num_args, size_t rank = 4) {
-  se::DeviceAddressBase memory;
+  se::DeviceMemoryBase memory;
   std::vector<int64_t> dims(4, 1);
 
   CallFrameBuilder builder(/*num_args=*/num_args, /*num_rets=*/0);
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 205fc66b41fcc8..e210a480bc74dd 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -1970,7 +1970,7 @@ StreamExecutorGpuClient::RunAsync(
         const int64_t buffer_size = allocation.size();
         if (buffer_size > 0) {
           TF_ASSIGN_OR_RETURN(
-              se::ScopedDeviceAddress<uint8_t> owning_buffer,
+              se::OwningDeviceMemory owning_buffer,
               memory_allocator->Allocate(device_ordinal, buffer_size,
                                          /*retry_on_failure=*/true,
                                          /*memory_space=*/allocation.color()));
@@ -2035,7 +2035,7 @@ StreamExecutorGpuClient::RunAsync(
                "buffer is not donated; allocating a fresh buffer";
         int64_t allocation_size = ShapeUtil::ByteSizeOf(
             ShapeUtil::GetSubshape(gpu_exec->result_shape(), index));
-        absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> allocated_buffer =
+        absl::StatusOr<se::OwningDeviceMemory> allocated_buffer =
             memory_allocator->Allocate(device_ordinal, allocation_size,
                                        /*retry_on_failure=*/true,
                                        /*memory_space=*/allocation->color());
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
index 88fce7477ce884..5e84506057c524 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
@@ -775,20 +775,16 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
                 tuple_buffer.buffers().mutable_element({});
             VLOG(3) << "untuple: output_buffers[" << i
                     << "].emplace: " << elem->opaque();
-            output_buffers[i].emplace(
-                stream_executor::ScopedDeviceAddress<uint8_t>(
-                    *elem, device->local_device_id().value(),
-                    client->allocator()));
+            output_buffers[i].emplace(stream_executor::OwningDeviceMemory(
+                *elem, device->local_device_id().value(), client->allocator()));
             *elem = se::DeviceAddressBase();
           }
         } else {
           CHECK_EQ(output_buffers.size(), 1);
           auto* elem = output.buffers().mutable_element({});
           VLOG(3) << "output_buffers[0].emplace: " << elem->opaque();
-          output_buffers.front().emplace(
-              stream_executor::ScopedDeviceAddress<uint8_t>(
-                  *elem, device->local_device_id().value(),
-                  client->allocator()));
+          output_buffers.front().emplace(stream_executor::OwningDeviceMemory(
+              *elem, device->local_device_id().value(), client->allocator()));
           *elem = se::DeviceAddressBase();
         }
 
@@ -913,11 +909,10 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
                     << "]: " << tracked_buffers[i]->buffer()->buffer().opaque();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {i},
-                  MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
-                      tracked_buffers[i]->buffer()->buffer(),
-                      device->local_hardware_id().value(),
-                      client->allocator())));
+                  {i}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
+                           tracked_buffers[i]->buffer()->buffer(),
+                           device->local_hardware_id().value(),
+                           client->allocator())));
             } else {
               input.SetBuffer({i}, MaybeOwningDeviceAddress(
                                        tracked_buffers[i]->buffer()->buffer()));
@@ -933,7 +928,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
             ExecutionInput& input = inputs.back();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {}, MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
+                  {}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
                           tracked_buffers[i]->buffer()->buffer(),
                           device->local_hardware_id().value(),
                           client->allocator())));
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index d11f6e966f5ec2..e342a586863001 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -1156,7 +1156,7 @@ MakeTupleHelper(PjRtStreamExecutorClient* client,
 
   se::Stream* stream = local_device->host_to_device_stream();
   TF_ASSIGN_OR_RETURN(
-      se::ScopedDeviceAddress<uint8_t> owned_root_table_memory,
+      se::OwningDeviceMemory owned_root_table_memory,
       allocator->Allocate(
           device_ordinal,
           transfer_manager->GetByteSizeRequirement(tupled_parameter_shape)));
@@ -1673,7 +1673,7 @@ PjRtStreamExecutorClient::RunAsync(
     auto it = tmp.MutableBuffers()->begin();
     for (auto& v : input) {
       if (v.second.is_donated) {
-        it->second = MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
+        it->second = MaybeOwningDeviceAddress(se::OwningDeviceMemory(
             v.second.buf->mem(), device->local_device_id().value(),
             run_options.allocator()));
         tmp.SetUnownedIndex(it->first);
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 4220db893cb1dc..4b656c48fc2517 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -91,8 +91,8 @@ struct PjRtStreamExecutorExecutionOutput {
   // Donated inputs which must be freed.
   std::vector<tsl::AsyncValueRef<RawSEDeviceMemory>> to_be_released;
   // For PjRtStreamExecutorClient implementations that
-  // use ScopedDeviceAddress for donated inputs.
-  std::vector<se::ScopedDeviceAddress<uint8_t>> se_to_be_released;
+  // use OwningDeviceMemory for donated inputs.
+  std::vector<se::OwningDeviceMemory> se_to_be_released;
 };
 
 class PjRtStreamExecutorDevice : public PjRtDevice {
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
index 2c1b89083b477d..d5bec6ba286977 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/pjrt/tracked_device_buffer.h"
 
-#include <cstdint>
 #include <memory>
 #include <utility>
 #include <vector>
@@ -91,7 +90,7 @@ absl::StatusOr<tsl::AsyncValueRef<RawSEDeviceMemory>> MakeArray(
       client->backend().transfer_manager()->HostShapeToDeviceShape(shape),
       [&](const Shape& subshape, const ShapeIndex&) -> absl::Status {
         TF_ASSIGN_OR_RETURN(
-            se::ScopedDeviceAddress<uint8_t> device_memory,
+            se::OwningDeviceMemory device_memory,
             client->backend().memory_allocator()->Allocate(
                 /*device_ordinal=*/0,
                 client->backend().transfer_manager()->GetByteSizeRequirement(
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index e5e8114809599e..b5d097d79b4715 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4113,8 +4113,6 @@ cc_library(
     hdrs = ["maybe_owning_device_memory.h"],
     deps = [
         ":maybe_owning_device_address",
-        "//xla/stream_executor:device_address",
-        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/service/maybe_owning_device_memory.h b/third_party/xla/xla/service/maybe_owning_device_memory.h
index 40d05599971dcd..897003ffb17429 100644
--- a/third_party/xla/xla/service/maybe_owning_device_memory.h
+++ b/third_party/xla/xla/service/maybe_owning_device_memory.h
@@ -18,8 +18,6 @@ limitations under the License.
 
 #include "absl/base/macros.h"
 #include "xla/service/maybe_owning_device_address.h"
-#include "xla/stream_executor/device_address.h"  // IWYU pragma: keep
-#include "xla/stream_executor/device_address_allocator.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory_allocator.h"  // IWYU pragma: keep
 
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 4466fb094ab53d..9f617478a6ea7b 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -184,7 +184,7 @@ cc_library(
         "//xla/service:hlo_runner_pjrt",
         "//xla/service:interpreter_plugin",  # reference backend
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/tsl/lib/core:status_test_util",
@@ -451,8 +451,8 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
         "//xla/service:transfer_manager",
-        "//xla/stream_executor:device_address",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -522,8 +522,8 @@ xla_test(
         "//xla/service:hlo_module_config",
         "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
-        "//xla/stream_executor:device_address",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
@@ -2989,7 +2989,7 @@ xla_test(
         "//xla/ffi:ffi_api",
         "//xla/ffi/api:c_api",
         "//xla/service:collective_ops_utils",
-        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_memory",
         "//xla/stream_executor:stream",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:errors",
@@ -3520,7 +3520,7 @@ xla_test(
         "//xla/service:platform_util",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
@@ -3660,7 +3660,7 @@ xla_test(
         "//xla/service:generic_transfer_manager",
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
-        "//xla/stream_executor:device_address_allocator",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/tests:xla_test_backend_predicates",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test_benchmark",
diff --git a/third_party/xla/xla/tests/buffer_donation_test.cc b/third_party/xla/xla/tests/buffer_donation_test.cc
index 870a7b659bcb27..324917cbd57df6 100644
--- a/third_party/xla/xla/tests/buffer_donation_test.cc
+++ b/third_party/xla/xla/tests/buffer_donation_test.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -113,7 +113,7 @@ class BufferDonationTest : public HloTestBase {
         run_options, backend_->StreamBorrowerWithPriority());
 
     std::vector<ExecutionInput> args;
-    std::vector<ShapeTree<se::DeviceAddressBase>> inputs_buffers;
+    std::vector<ShapeTree<se::DeviceMemoryBase>> inputs_buffers;
 
     CHECK_EQ(argument_literals.size(), donate_arguments.size());
 
@@ -130,7 +130,7 @@ class BufferDonationTest : public HloTestBase {
       ShapedBuffer shaped_buffer = scoped_shaped_buffer.release();
       CHECK_OK(backend_->transfer_manager()->TransferLiteralToDevice(
           stream.get(), argument_literal, shaped_buffer));
-      ShapeTree<se::DeviceAddressBase> input_buffers = shaped_buffer.buffers();
+      ShapeTree<se::DeviceMemoryBase> input_buffers = shaped_buffer.buffers();
       inputs_buffers.push_back(input_buffers);
       ShapeTree<MaybeOwningDeviceAddress> owned_buffers(
           argument_literal.shape());
@@ -138,7 +138,7 @@ class BufferDonationTest : public HloTestBase {
           [&](const ShapeIndex& index,
               MaybeOwningDeviceAddress* device_memory) {
             if (donate_argument) {
-              *device_memory = se::ScopedDeviceAddress<uint8_t>(
+              *device_memory = se::OwningDeviceMemory(
                   input_buffers.element(index), executor_->device_ordinal(),
                   &memory_allocator);
             } else {
@@ -162,7 +162,7 @@ class BufferDonationTest : public HloTestBase {
     }
     ExecutionOutput output = std::move(output_status).value();
 
-    se::DeviceAddressBase result_root_buffer = output.Result().root_buffer();
+    se::DeviceMemoryBase result_root_buffer = output.Result().root_buffer();
     LOG(INFO) << "result allocation = " << result_root_buffer.opaque()
               << "             size = " << result_root_buffer.size();
 
diff --git a/third_party/xla/xla/tests/collective_ops_ffi_test.cc b/third_party/xla/xla/tests/collective_ops_ffi_test.cc
index 21d423965efc0e..f56ef7045eca7b 100644
--- a/third_party/xla/xla/tests/collective_ops_ffi_test.cc
+++ b/third_party/xla/xla/tests/collective_ops_ffi_test.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/collective_ops_e2e_test_base.h"
 #include "xla/tests/literal_test_util.h"
diff --git a/third_party/xla/xla/tests/hlo_test_base.cc b/third_party/xla/xla/tests/hlo_test_base.cc
index dce925c25e28d0..6421e9badcbec7 100644
--- a/third_party/xla/xla/tests/hlo_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_test_base.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/hlo_runner_pjrt.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
 #include "xla/tests/hlo_runner_agnostic_reference_mixin.h"
@@ -174,7 +174,7 @@ ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
                                   reference_preprocessor);
 }
 
-se::DeviceAddressAllocator* HloTestBase::GetAllocator() {
+se::DeviceMemoryAllocator* HloTestBase::GetAllocator() {
   if (allocator_ == nullptr) {
     allocator_ = std::make_unique<se::StreamExecutorMemoryAllocator>(
         backend().default_stream_executor());
diff --git a/third_party/xla/xla/tests/hlo_test_base.h b/third_party/xla/xla/tests/hlo_test_base.h
index c378860ec85a40..31efd1fc5ff2bb 100644
--- a/third_party/xla/xla/tests/hlo_test_base.h
+++ b/third_party/xla/xla/tests/hlo_test_base.h
@@ -48,7 +48,7 @@ static_assert(false,
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo_runner.h"
 #include "xla/service/hlo_runner_interface.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/tests/hlo_runner_agnostic_reference_mixin.h"
 #include "xla/tests/hlo_runner_agnostic_test_base.h"
@@ -210,7 +210,7 @@ class ABSL_DEPRECATED(
   static se::Platform* GetTestPlatform();
 
   // Creates or retrieves the allocator.
-  se::DeviceAddressAllocator* GetAllocator();
+  se::DeviceMemoryAllocator* GetAllocator();
 
   ErrorSpec error_spec_{0.0001};
 
@@ -224,7 +224,7 @@ class ABSL_DEPRECATED(
               bool allow_mixed_precision_in_hlo_verifier,
               HloPredicate instruction_can_change_layout_func);
 
-  std::unique_ptr<se::DeviceAddressAllocator> allocator_;
+  std::unique_ptr<se::DeviceMemoryAllocator> allocator_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/tests/local_client_execute_test.cc b/third_party/xla/xla/tests/local_client_execute_test.cc
index cb0675c889c052..ac4aec28517450 100644
--- a/third_party/xla/xla/tests/local_client_execute_test.cc
+++ b/third_party/xla/xla/tests/local_client_execute_test.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/host/host_platform_id.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/tests/local_client_test_base.cc b/third_party/xla/xla/tests/local_client_test_base.cc
index 957b24fc150f8e..29563c202f26a2 100644
--- a/third_party/xla/xla/tests/local_client_test_base.cc
+++ b/third_party/xla/xla/tests/local_client_test_base.cc
@@ -43,8 +43,8 @@ limitations under the License.
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -57,7 +57,7 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> TestAllocator::Allocate(
+absl::StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
     int device_ordinal, uint64_t size, bool retry_on_failure,
     int64_t memory_space) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
@@ -71,7 +71,7 @@ absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> TestAllocator::Allocate(
 }
 
 absl::Status TestAllocator::Deallocate(int device_ordinal,
-                                       se::DeviceAddressBase mem) {
+                                       se::DeviceMemoryBase mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     absl::MutexLock lock(count_mutex_);
diff --git a/third_party/xla/xla/tests/local_client_test_base.h b/third_party/xla/xla/tests/local_client_test_base.h
index 3afeae8c003d8c..cb7de54135e8db 100644
--- a/third_party/xla/xla/tests/local_client_test_base.h
+++ b/third_party/xla/xla/tests/local_client_test_base.h
@@ -37,8 +37,8 @@ limitations under the License.
 #include "xla/service/platform_util.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -53,11 +53,11 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator {
       : se::StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).value()) {}
 
-  absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(
+  absl::StatusOr<se::OwningDeviceMemory> Allocate(
       int device_ordinal, uint64_t size, bool retry_on_failure,
       int64_t memory_space) override;
   absl::Status Deallocate(int device_ordinal,
-                          se::DeviceAddressBase mem) override;
+                          se::DeviceMemoryBase mem) override;
 
   // Return the number of allocations that have been performed.
   int64_t allocation_count() const;
diff --git a/third_party/xla/xla/tests/transfer_manager_test.cc b/third_party/xla/xla/tests/transfer_manager_test.cc
index 66d84eebb73fb7..6a4a188afd94fa 100644
--- a/third_party/xla/xla/tests/transfer_manager_test.cc
+++ b/third_party/xla/xla/tests/transfer_manager_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/service/stream_pool.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_address_allocator.h"
+#include "xla/stream_executor/device_memory_allocator.h"
 #include "xla/tests/literal_test_util.h"
 #include "xla/tests/local_client_test_base.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 60993b0f7d19ab..5f422444fd55e9 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -1007,8 +1007,8 @@ tsl_gpu_library(
         "//xla/service/cpu:cpu_executable",
         "//xla/service/gpu:gpu_symbol_repository",
         "//xla/service/gpu/autotuning:autotuner_util",
-        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description_proto_cc",
+        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",

From 27c6b3b944f7e5e9847d9597d8d6ec51e25c7472 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 12:33:17 -0800
Subject: [PATCH 090/753] Integrate LLVM at llvm/llvm-project@c6e23ab80753

Updates LLVM usage to match
[c6e23ab80753](https://github.com/llvm/llvm-project/commit/c6e23ab80753)

PiperOrigin-RevId: 842350626
---
 .../xla/third_party/llvm/workspace.bzl        |    4 +-
 .../xla/third_party/shardy/temporary.patch    | 1152 +----------------
 .../xla/third_party/shardy/workspace.bzl      |    4 +-
 3 files changed, 9 insertions(+), 1151 deletions(-)

diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index 5e3d8f2100a1be..dd3d4e4de4509d 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "8dee997a8558b460b82b23fb43b197d68258baac"
-    LLVM_SHA256 = "6a26975000c2cb45787813317bfeeadeafa0cba762e9434fb7940481ec4b27de"
+    LLVM_COMMIT = "c6e23ab80753a01dce270f5f8a133fbec942315d"
+    LLVM_SHA256 = "5a6b8aacd2d87ce9c4456843a76d0a54fd7cd0ae788ed3f19e7487ecd2ce4326"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 1f51d21f432dd8..13d339429b0101 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,1157 +1,15 @@
-diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
-index f04aa96..509398d 100644
---- a/third_party/llvm/generated.patch
-+++ b/third_party/llvm/generated.patch
-@@ -1,1137 +1 @@
- Auto generated patch. Do not edit or delete it, even if empty.
--diff -ruN --strip-trailing-cr a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
----- a/clang/docs/LanguageExtensions.rst
--+++ b/clang/docs/LanguageExtensions.rst
--@@ -1833,23 +1833,6 @@
-- 
-- Clang provides a few builtin aliases to improve the throughput of certain metaprogramming facilities.
-- 
---__builtin_common_reference
-----------------------------
---
---.. code-block:: c++
---
---  template <template <class, class, template <class> class, template <class> class> class BasicCommonReferenceT,
---            template <class... Args> CommonTypeT,
---            template <class> HasTypeMember,
---            class HasNoTypeMember,
---            class... Ts>
---  using __builtin_common_reference = ...;
---
---This alias is used for implementing ``std::common_reference``. If ``std::common_reference`` should contain a ``type``
---member, it is an alias to ``HasTypeMember<TheCommonReference>``. Otherwse it is an alias to ``HasNoTypeMember``. The
---``CommonTypeT`` is usually ``std::common_type_t``. ``BasicCommonReferenceT`` is usually an alias template to
---``basic_common_reference<T, U, TX, UX>::type``.
---
-- __builtin_common_type
-- ---------------------
-- 
--diff -ruN --strip-trailing-cr a/clang/include/clang/Basic/BuiltinTemplates.td b/clang/include/clang/Basic/BuiltinTemplates.td
----- a/clang/include/clang/Basic/BuiltinTemplates.td
--+++ b/clang/include/clang/Basic/BuiltinTemplates.td
--@@ -10,11 +10,11 @@
--   string Name = name;
-- }
-- 
---class Template<list<TemplateArg> args, string name = ""> : TemplateArg<name> {
--+class Template<list<TemplateArg> args, string name> : TemplateArg<name> {
--   list<TemplateArg> Args = args;
-- }
-- 
---class Class<string name = "", bit is_variadic = 0> : TemplateArg<name> {
--+class Class<string name, bit is_variadic = 0> : TemplateArg<name> {
--   bit IsVariadic = is_variadic;
-- }
-- 
--@@ -56,32 +56,6 @@
--    Class<"HasNoTypeMember">,
--    Class<"Ts", /*is_variadic=*/1>]>;
-- 
---// template <template <class,"
---//                     class,"
---//                     template <class> class,"
---//                     template <class> class> class BasicCommonReferenceT,"
---//           template <class... Args> class CommonTypeT,"
---//           template <class> class HasTypeMember,"
---//           class HasNoTypeMember,"
---//           class... Ts>"
---def __builtin_common_reference : CPlusPlusBuiltinTemplate<
---            [Template<[Class<>,
---                       Class<>,
---                       Template<[Class<>]>,
---                       Template<[Class<>]>], "BasicCommonReferenceT">,
---             Template<[Class<"Args", /*is_variadic=*/1>], "CommonTypeT">,
---             Template<[Class<>], "HasTypeMember">,
---             Class<"HasNoTypeMember">,
---             Class<"Ts", /*is_variadic=*/1>]>;
---
---foreach Ref = ["", "lvalue", "rvalue"] in {
---  foreach Const = ["", "const"] in {
---    foreach Volatile = ["", "volatile"] in {
---      def __clang_internal_xref_#Ref#Const#Volatile : CPlusPlusBuiltinTemplate<[Class<>]>;
---    }
---  }
---}
---
-- // template <uint32_t Opcode,
-- //           uint32_t Size,
-- //           uint32_t Alignment,
--diff -ruN --strip-trailing-cr a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
----- a/clang/include/clang/Sema/Sema.h
--+++ b/clang/include/clang/Sema/Sema.h
--@@ -15322,17 +15322,6 @@
--   QualType BuiltinDecay(QualType BaseType, SourceLocation Loc);
--   QualType BuiltinAddReference(QualType BaseType, UTTKind UKind,
--                                SourceLocation Loc);
---
---  QualType BuiltinAddRValueReference(QualType BaseType, SourceLocation Loc) {
---    return BuiltinAddReference(BaseType, UnaryTransformType::AddRvalueReference,
---                               Loc);
---  }
---
---  QualType BuiltinAddLValueReference(QualType BaseType, SourceLocation Loc) {
---    return BuiltinAddReference(BaseType, UnaryTransformType::AddLvalueReference,
---                               Loc);
---  }
---
--   QualType BuiltinRemoveExtent(QualType BaseType, UTTKind UKind,
--                                SourceLocation Loc);
--   QualType BuiltinRemoveReference(QualType BaseType, UTTKind UKind,
--@@ -15347,9 +15336,6 @@
--   QualType BuiltinChangeSignedness(QualType BaseType, UTTKind UKind,
--                                    SourceLocation Loc);
-- 
---  bool BuiltinIsConvertible(QualType From, QualType To, SourceLocation Loc,
---                            bool CheckNothrow = false);
---
--   bool BuiltinIsBaseOf(SourceLocation RhsTLoc, QualType LhsT, QualType RhsT);
-- 
--   /// Ensure that the type T is a literal type.
--diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
----- a/clang/lib/Sema/SemaTemplate.cpp
--+++ b/clang/lib/Sema/SemaTemplate.cpp
--@@ -3212,36 +3212,6 @@
--   }
-- }
-- 
---static QualType InstantiateTemplate(Sema &S, ElaboratedTypeKeyword Keyword,
---                                    TemplateName Template,
---                                    ArrayRef<TemplateArgument> Args,
---                                    SourceLocation Loc) {
---  TemplateArgumentListInfo ArgList;
---  for (auto Arg : Args) {
---    if (Arg.getKind() == TemplateArgument::Type) {
---      ArgList.addArgument(TemplateArgumentLoc(
---          Arg, S.Context.getTrivialTypeSourceInfo(Arg.getAsType())));
---    } else {
---      ArgList.addArgument(
---          S.getTrivialTemplateArgumentLoc(Arg, QualType(), Loc));
---    }
---  }
---
---  EnterExpressionEvaluationContext UnevaluatedContext(
---      S, Sema::ExpressionEvaluationContext::Unevaluated);
---  Sema::SFINAETrap SFINAE(S, /*AccessCheckingSFINAE=*/true);
---  Sema::ContextRAII TUContext(S, S.Context.getTranslationUnitDecl());
---
---  QualType Instantiation =
---      S.CheckTemplateIdType(Keyword, Template, Loc, ArgList, /*Scope=*/nullptr,
---                            /*ForNestedNameSpecifier=*/false);
---
---  if (SFINAE.hasErrorOccurred())
---    return QualType();
---
---  return Instantiation;
---}
---
-- static QualType builtinCommonTypeImpl(Sema &S, ElaboratedTypeKeyword Keyword,
--                                       TemplateName BaseTemplate,
--                                       SourceLocation TemplateLoc,
--@@ -3254,7 +3224,25 @@
--       return builtinCommonTypeImpl(S, Keyword, BaseTemplate, TemplateLoc,
--                                    {T1, T2});
-- 
---    return InstantiateTemplate(S, Keyword, BaseTemplate, {T1, T2}, TemplateLoc);
--+    TemplateArgumentListInfo Args;
--+    Args.addArgument(TemplateArgumentLoc(
--+        T1, S.Context.getTrivialTypeSourceInfo(T1.getAsType())));
--+    Args.addArgument(TemplateArgumentLoc(
--+        T2, S.Context.getTrivialTypeSourceInfo(T2.getAsType())));
--+
--+    EnterExpressionEvaluationContext UnevaluatedContext(
--+        S, Sema::ExpressionEvaluationContext::Unevaluated);
--+    Sema::SFINAETrap SFINAE(S, /*ForValidityCheck=*/true);
--+    Sema::ContextRAII TUContext(S, S.Context.getTranslationUnitDecl());
--+
--+    QualType BaseTemplateInst = S.CheckTemplateIdType(
--+        Keyword, BaseTemplate, TemplateLoc, Args,
--+        /*Scope=*/nullptr, /*ForNestedNameSpecifier=*/false);
--+
--+    if (SFINAE.hasErrorOccurred())
--+      return QualType();
--+
--+    return BaseTemplateInst;
--   };
-- 
--   // Note A: For the common_type trait applied to a template parameter pack T of
--@@ -3361,233 +3349,6 @@
--   }
-- }
-- 
---static QualType CopyCV(QualType From, QualType To) {
---  if (From.isConstQualified())
---    To.addConst();
---  if (From.isVolatileQualified())
---    To.addVolatile();
---  return To;
---}
---
---// Let COND-RES(X, Y) be
---//  decltype(false ? declval<X(&)()>()() : declval<Y(&)()>()())
---static QualType CondRes(Sema &S, QualType X, QualType Y, SourceLocation Loc) {
---  EnterExpressionEvaluationContext UnevaluatedContext(
---      S, Sema::ExpressionEvaluationContext::Unevaluated);
---  Sema::SFINAETrap SFINAE(S, /*AccessCheckingSFINAE=*/true);
---  Sema::ContextRAII TUContext(S, S.Context.getTranslationUnitDecl());
---
---  // false
---  OpaqueValueExpr CondExpr(SourceLocation(), S.Context.BoolTy, VK_PRValue);
---  ExprResult Cond = &CondExpr;
---
---  // declval<X(&)()>()()
---  OpaqueValueExpr LHSExpr(Loc, X.getNonLValueExprType(S.Context),
---                          Expr::getValueKindForType(X));
---  ExprResult LHS = &LHSExpr;
---
---  // declval<Y(&)()>()()
---  OpaqueValueExpr RHSExpr(Loc, Y.getNonLValueExprType(S.Context),
---                          Expr::getValueKindForType(Y));
---  ExprResult RHS = &RHSExpr;
---
---  ExprValueKind VK = VK_PRValue;
---  ExprObjectKind OK = OK_Ordinary;
---
---  // decltype(false ? declval<X(&)()>()() : declval<Y(&)()>()())
---  QualType Result = S.CheckConditionalOperands(Cond, LHS, RHS, VK, OK, Loc);
---
---  if (SFINAE.hasErrorOccurred())
---    return QualType();
---  if (VK == VK_LValue)
---    return S.BuiltinAddLValueReference(Result, Loc);
---  if (VK == VK_XValue)
---    return S.BuiltinAddRValueReference(Result, Loc);
---  return Result;
---}
---
---static QualType CommonRef(Sema &S, QualType A, QualType B, SourceLocation Loc) {
---  // Given types A and B, let X be remove_reference_t<A>, let Y be
---  // remove_reference_t<B>, and let COMMON-​REF(A, B) be:
---  assert(A->isReferenceType() && B->isReferenceType() &&
---         "A and B have to be ref qualified for a COMMON-REF");
---  auto X = A.getNonReferenceType();
---  auto Y = B.getNonReferenceType();
---
---  // If A and B are both lvalue reference types, COMMON-REF(A, B) is
---  // COND-RES(COPYCV(X, Y) &, COPYCV(​Y, X) &) if that type exists and is a
---  // reference type.
---  if (A->isLValueReferenceType() && B->isLValueReferenceType()) {
---    auto CR = CondRes(S, S.BuiltinAddLValueReference(CopyCV(X, Y), Loc),
---                      S.BuiltinAddLValueReference(CopyCV(Y, X), Loc), Loc);
---    if (CR.isNull() || !CR->isReferenceType())
---      return QualType();
---    return CR;
---  }
---
---  // Otherwise, let C be remove_reference_t<COMMON-REF(X&, Y&)>&&. If A and B
---  // are both rvalue reference types, C is well-formed, and
---  // is_convertible_v<A, C> && is_convertible_v<B, C> is true, then
---  // COMMON-REF(A, B) is C.
---  if (A->isRValueReferenceType() && B->isRValueReferenceType()) {
---    auto C = CommonRef(S, S.BuiltinAddLValueReference(X, Loc),
---                       S.BuiltinAddLValueReference(Y, Loc), Loc);
---    if (C.isNull())
---      return QualType();
---
---    C = C.getNonReferenceType();
---
---    if (S.BuiltinIsConvertible(A, C, Loc) && S.BuiltinIsConvertible(B, C, Loc))
---      return S.BuiltinAddRValueReference(C, Loc);
---    return QualType();
---  }
---
---  // Otherwise, if A is an lvalue reference and B is an rvalue reference, then
---  // COMMON-REF(A, B) is COMMON-REF(B, A).
---  if (A->isLValueReferenceType() && B->isRValueReferenceType())
---    std::swap(A, B);
---
---  // Otherwise, let D be COMMON-REF(const X&, Y&). If A is an rvalue reference
---  // and B is an lvalue reference and D is well-formed and
---  // is_convertible_v<A, D> is true, then COMMON-REF(A, B) is D.
---  if (A->isRValueReferenceType() && B->isLValueReferenceType()) {
---    auto X2 = X;
---    X2.addConst();
---    auto D = CommonRef(S, S.BuiltinAddLValueReference(X2, Loc),
---                       S.BuiltinAddLValueReference(Y, Loc), Loc);
---    if (!D.isNull() && S.BuiltinIsConvertible(A, D, Loc))
---      return D;
---    return QualType();
---  }
---
---  // Otherwise, COMMON-REF(A, B) is ill-formed.
---  // This is implemented by returning from the individual branches above.
---
---  llvm_unreachable("The above cases should be exhaustive");
---}
---
---static QualType builtinCommonReferenceImpl(Sema &S,
---                                           ElaboratedTypeKeyword Keyword,
---                                           TemplateName CommonReference,
---                                           TemplateName CommonType,
---                                           SourceLocation TemplateLoc,
---                                           ArrayRef<TemplateArgument> Ts) {
---  switch (Ts.size()) {
---  // If sizeof...(T) is zero, there shall be no member type.
---  case 0:
---    return QualType();
---
---  // Otherwise, if sizeof...(T) is one, let T0 denote the sole type in the
---  // pack T. The member typedef type shall denote the same type as T0.
---  case 1:
---    return Ts[0].getAsType();
---
---  // Otherwise, if sizeof...(T) is two, let T1 and T2 denote the two types in
---  // the pack T. Then
---  case 2: {
---    auto T1 = Ts[0].getAsType();
---    auto T2 = Ts[1].getAsType();
---
---    // Let R be COMMON-REF(T1, T2). If T1 and T2 are reference types, R is
---    // well-formed, and is_convertible_v<add_pointer_t<T1>, add_pointer_t<R>> &&
---    // is_convertible_v<add_pointer_t<T2>, add_pointer_t<R>> is true, then the
---    // member typedef type denotes R.
---    if (T1->isReferenceType() && T2->isReferenceType()) {
---      QualType R = CommonRef(S, T1, T2, TemplateLoc);
---      if (!R.isNull()) {
---        if (S.BuiltinIsConvertible(S.BuiltinAddPointer(T1, TemplateLoc),
---                                   S.BuiltinAddPointer(R, TemplateLoc),
---                                   TemplateLoc) &&
---            S.BuiltinIsConvertible(S.BuiltinAddPointer(T2, TemplateLoc),
---                                   S.BuiltinAddPointer(R, TemplateLoc),
---                                   TemplateLoc)) {
---          return R;
---        }
---      }
---    }
---
---    // Otherwise, if basic_common_reference<remove_cvref_t<T1>,
---    // remove_cvref_t<T2>, ​XREF(​T1), XREF(T2)>​::​type is well-formed,
---    // then the member typedef type denotes that type.
---    {
---      auto getXRef = [&](QualType T) {
---        BuiltinTemplateDecl *Quals[12] = {
---            S.Context.get__clang_internal_xref_Decl(),
---            S.Context.get__clang_internal_xref_constDecl(),
---            S.Context.get__clang_internal_xref_volatileDecl(),
---            S.Context.get__clang_internal_xref_constvolatileDecl(),
---            S.Context.get__clang_internal_xref_lvalueDecl(),
---            S.Context.get__clang_internal_xref_lvalueconstDecl(),
---            S.Context.get__clang_internal_xref_lvaluevolatileDecl(),
---            S.Context.get__clang_internal_xref_lvalueconstvolatileDecl(),
---            S.Context.get__clang_internal_xref_rvalueDecl(),
---            S.Context.get__clang_internal_xref_rvalueconstDecl(),
---            S.Context.get__clang_internal_xref_rvaluevolatileDecl(),
---            S.Context.get__clang_internal_xref_rvalueconstvolatileDecl(),
---        };
---        size_t Index = 0;
---        if (T->isLValueReferenceType()) {
---          T = T.getNonReferenceType();
---          Index += 4;
---        } else if (T->isRValueReferenceType()) {
---          T = T.getNonReferenceType();
---          Index += 8;
---        }
---        if (T.isConstQualified())
---          Index += 1;
---
---        if (T.isVolatileQualified())
---          Index += 2;
---
---        return Quals[Index];
---      };
---
---      auto BCR = InstantiateTemplate(S, Keyword, CommonReference,
---                                     {S.BuiltinRemoveCVRef(T1, TemplateLoc),
---                                      S.BuiltinRemoveCVRef(T2, TemplateLoc),
---                                      TemplateName{getXRef(T1)},
---                                      TemplateName{getXRef(T2)}},
---                                     TemplateLoc);
---      if (!BCR.isNull())
---        return BCR;
---    }
---
---    // Otherwise, if COND-RES(T1, T2) is well-formed, then the member typedef
---    // type denotes that type.
---    if (auto CR = CondRes(S, T1, T2, TemplateLoc); !CR.isNull())
---      return CR;
---
---    // Otherwise, if common_type_t<T1, T2> is well-formed, then the member
---    // typedef type denotes that type.
---    if (auto CT =
---            InstantiateTemplate(S, Keyword, CommonType, {T1, T2}, TemplateLoc);
---        !CT.isNull())
---      return CT;
---
---    // Otherwise, there shall be no member type.
---    return QualType();
---  }
---
---  // Otherwise, if sizeof...(T) is greater than two, let T1, T2, and Rest,
---  // respectively, denote the first, second, and (pack of) remaining types
---  // comprising T. Let C be the type common_reference_t<T1, T2>. Then:
---  default: {
---    auto T1 = Ts[0];
---    auto T2 = Ts[1];
---    auto Rest = Ts.drop_front(2);
---    auto C = builtinCommonReferenceImpl(S, Keyword, CommonReference, CommonType,
---                                        TemplateLoc, {T1, T2});
---    if (C.isNull())
---      return QualType();
---    llvm::SmallVector<TemplateArgument, 4> Args;
---    Args.emplace_back(C);
---    Args.append(Rest.begin(), Rest.end());
---    return builtinCommonReferenceImpl(S, Keyword, CommonReference, CommonType,
---                                      TemplateLoc, Args);
---  }
---  }
---}
---
-- static bool isInVkNamespace(const RecordType *RT) {
--   DeclContext *DC = RT->getDecl()->getDeclContext();
--   if (!DC)
--@@ -3746,89 +3507,6 @@
--     return HasNoTypeMember;
--   }
-- 
---  case BTK__builtin_common_reference: {
---    assert(Converted.size() == 5);
---    if (llvm::any_of(Converted, [](auto &C) { return C.isDependent(); }))
---      return QualType();
---
---    TemplateName BasicCommonReference = Converted[0].getAsTemplate();
---    TemplateName CommonType = Converted[1].getAsTemplate();
---    TemplateName HasTypeMember = Converted[2].getAsTemplate();
---    QualType HasNoTypeMember = Converted[3].getAsType();
---    ArrayRef<TemplateArgument> Ts = Converted[4].getPackAsArray();
---    if (auto CR =
---            builtinCommonReferenceImpl(SemaRef, Keyword, BasicCommonReference,
---                                       CommonType, TemplateLoc, Ts);
---        !CR.isNull()) {
---      TemplateArgumentListInfo TAs;
---      TAs.addArgument(TemplateArgumentLoc(
---          TemplateArgument(CR), SemaRef.Context.getTrivialTypeSourceInfo(
---                                    CR, TemplateArgs[1].getLocation())));
---      return SemaRef.CheckTemplateIdType(Keyword, HasTypeMember, TemplateLoc,
---                                         TAs, /*Scope=*/nullptr,
---                                         /*ForNestedNameSpecifier=*/false);
---    }
---    return HasNoTypeMember;
---  }
---
---  case BTK__clang_internal_xref_:
---  case BTK__clang_internal_xref_const:
---  case BTK__clang_internal_xref_volatile:
---  case BTK__clang_internal_xref_constvolatile:
---  case BTK__clang_internal_xref_lvalue:
---  case BTK__clang_internal_xref_lvalueconst:
---  case BTK__clang_internal_xref_lvaluevolatile:
---  case BTK__clang_internal_xref_lvalueconstvolatile:
---  case BTK__clang_internal_xref_rvalue:
---  case BTK__clang_internal_xref_rvalueconst:
---  case BTK__clang_internal_xref_rvaluevolatile:
---  case BTK__clang_internal_xref_rvalueconstvolatile: {
---    if (llvm::any_of(Converted, [](auto &C) { return C.isDependent(); }))
---      return QualType();
---
---    auto BTK = BTD->getBuiltinTemplateKind();
---    auto anyOf = [&](auto... Vals) { return ((BTK == Vals) || ...); };
---
---    bool AddCV = anyOf(BTK__clang_internal_xref_constvolatile,
---                       BTK__clang_internal_xref_lvalueconstvolatile,
---                       BTK__clang_internal_xref_rvalueconstvolatile);
---
---    bool AddConst = AddCV || anyOf(BTK__clang_internal_xref_const,
---                                   BTK__clang_internal_xref_lvalueconst,
---                                   BTK__clang_internal_xref_rvalueconst);
---
---    bool AddVolatile = AddCV || anyOf(BTK__clang_internal_xref_volatile,
---                                      BTK__clang_internal_xref_lvaluevolatile,
---                                      BTK__clang_internal_xref_rvaluevolatile);
---
---    bool AddLValue = anyOf(BTK__clang_internal_xref_lvalue,
---                           BTK__clang_internal_xref_lvalueconst,
---                           BTK__clang_internal_xref_lvaluevolatile,
---                           BTK__clang_internal_xref_lvalueconstvolatile);
---
---    bool AddRValue = anyOf(BTK__clang_internal_xref_rvalue,
---                           BTK__clang_internal_xref_rvalueconst,
---                           BTK__clang_internal_xref_rvaluevolatile,
---                           BTK__clang_internal_xref_rvalueconstvolatile);
---
---    assert(Converted.size() == 1);
---
---    QualType T = Converted[0].getAsType();
---
---    if (AddConst)
---      T.addConst();
---
---    if (AddVolatile)
---      T.addVolatile();
---
---    if (AddLValue)
---      T = SemaRef.BuiltinAddLValueReference(T, TemplateLoc);
---    else if (AddRValue)
---      T = SemaRef.BuiltinAddRValueReference(T, TemplateLoc);
---
---    return T;
---  }
---
--   case BTK__hlsl_spirv_type: {
--     assert(Converted.size() == 4);
-- 
--diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
----- a/clang/lib/Sema/SemaType.cpp
--+++ b/clang/lib/Sema/SemaType.cpp
--@@ -32,8 +32,6 @@
-- #include "clang/Lex/Preprocessor.h"
-- #include "clang/Sema/DeclSpec.h"
-- #include "clang/Sema/DelayedDiagnostic.h"
---#include "clang/Sema/EnterExpressionEvaluationContext.h"
---#include "clang/Sema/Initialization.h"
-- #include "clang/Sema/Lookup.h"
-- #include "clang/Sema/ParsedAttr.h"
-- #include "clang/Sema/ParsedTemplate.h"
--@@ -10074,81 +10072,6 @@
--   return Context.getQualifiedType(Underlying, BaseType.getQualifiers());
-- }
-- 
---bool Sema::BuiltinIsConvertible(QualType From, QualType To, SourceLocation Loc,
---                                bool CheckNothrow) {
---  if (To->isVoidType())
---    return From->isVoidType();
---
---  // [meta.rel]
---  // From and To shall be complete types, cv void, or arrays of unknown bound.
---  if ((!From->isIncompleteArrayType() && !From->isVoidType() &&
---       RequireCompleteType(
---           Loc, From, diag::err_incomplete_type_used_in_type_trait_expr)) ||
---      (!To->isIncompleteArrayType() && !To->isVoidType() &&
---       RequireCompleteType(Loc, To,
---                           diag::err_incomplete_type_used_in_type_trait_expr)))
---    return false;
---
---  // C++11 [meta.rel]p4:
---  //   Given the following function prototype:
---  //
---  //     template <class T>
---  //       typename add_rvalue_reference<T>::type create();
---  //
---  //   the predicate condition for a template specialization
---  //   is_convertible<From, To> shall be satisfied if and only if
---  //   the return expression in the following code would be
---  //   well-formed, including any implicit conversions to the return
---  //   type of the function:
---  //
---  //     To test() {
---  //       return create<From>();
---  //     }
---  //
---  //   Access checking is performed as if in a context unrelated to To and
---  //   From. Only the validity of the immediate context of the expression
---  //   of the return-statement (including conversions to the return type)
---  //   is considered.
---  //
---  // We model the initialization as a copy-initialization of a temporary
---  // of the appropriate type, which for this expression is identical to the
---  // return statement (since NRVO doesn't apply).
---
---  // Functions aren't allowed to return function or array types.
---  if (To->isFunctionType() || To->isArrayType())
---    return false;
---
---  // A function definition requires a non-abstract return type.
---  if (isAbstractType(Loc, To))
---    return false;
---
---  From = BuiltinAddRValueReference(From, Loc);
---
---  // Build a fake source and destination for initialization.
---  InitializedEntity ToEntity(InitializedEntity::InitializeTemporary(To));
---  OpaqueValueExpr FromExpr(Loc, From.getNonLValueExprType(Context),
---                           Expr::getValueKindForType(From));
---  InitializationKind Kind =
---      InitializationKind::CreateCopy(Loc, SourceLocation());
---
---  // Perform the initialization in an unevaluated context within a SFINAE
---  // trap at translation unit scope.
---  EnterExpressionEvaluationContext Unevaluated(
---      *this, Sema::ExpressionEvaluationContext::Unevaluated);
---  Sema::SFINAETrap SFINAE(*this, /*AccessCheckingSFINAE=*/true);
---  Sema::ContextRAII TUContext(*this, Context.getTranslationUnitDecl());
---  Expr *FromExprPtr = &FromExpr;
---  InitializationSequence Init(*this, ToEntity, Kind, FromExprPtr);
---  if (Init.Failed())
---    return false;
---
---  ExprResult Result = Init.Perform(*this, ToEntity, Kind, FromExprPtr);
---  if (Result.isInvalid() || SFINAE.hasErrorOccurred())
---    return false;
---
---  return !CheckNothrow || canThrow(Result.get()) == CT_Cannot;
---}
---
-- QualType Sema::BuildUnaryTransformType(QualType BaseType, UTTKind UKind,
--                                        SourceLocation Loc) {
--   if (BaseType->isDependentType())
--diff -ruN --strip-trailing-cr a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
----- a/clang/lib/Sema/SemaTypeTraits.cpp
--+++ b/clang/lib/Sema/SemaTypeTraits.cpp
--@@ -1212,6 +1212,76 @@
--                                     const TypeSourceInfo *Rhs,
--                                     SourceLocation KeyLoc);
-- 
--+static ExprResult CheckConvertibilityForTypeTraits(
--+    Sema &Self, const TypeSourceInfo *Lhs, const TypeSourceInfo *Rhs,
--+    SourceLocation KeyLoc, llvm::BumpPtrAllocator &OpaqueExprAllocator) {
--+
--+  QualType LhsT = Lhs->getType();
--+  QualType RhsT = Rhs->getType();
--+
--+  // C++0x [meta.rel]p4:
--+  //   Given the following function prototype:
--+  //
--+  //     template <class T>
--+  //       typename add_rvalue_reference<T>::type create();
--+  //
--+  //   the predicate condition for a template specialization
--+  //   is_convertible<From, To> shall be satisfied if and only if
--+  //   the return expression in the following code would be
--+  //   well-formed, including any implicit conversions to the return
--+  //   type of the function:
--+  //
--+  //     To test() {
--+  //       return create<From>();
--+  //     }
--+  //
--+  //   Access checking is performed as if in a context unrelated to To and
--+  //   From. Only the validity of the immediate context of the expression
--+  //   of the return-statement (including conversions to the return type)
--+  //   is considered.
--+  //
--+  // We model the initialization as a copy-initialization of a temporary
--+  // of the appropriate type, which for this expression is identical to the
--+  // return statement (since NRVO doesn't apply).
--+
--+  // Functions aren't allowed to return function or array types.
--+  if (RhsT->isFunctionType() || RhsT->isArrayType())
--+    return ExprError();
--+
--+  // A function definition requires a complete, non-abstract return type.
--+  if (!Self.isCompleteType(Rhs->getTypeLoc().getBeginLoc(), RhsT) ||
--+      Self.isAbstractType(Rhs->getTypeLoc().getBeginLoc(), RhsT))
--+    return ExprError();
--+
--+  // Compute the result of add_rvalue_reference.
--+  if (LhsT->isObjectType() || LhsT->isFunctionType())
--+    LhsT = Self.Context.getRValueReferenceType(LhsT);
--+
--+  // Build a fake source and destination for initialization.
--+  InitializedEntity To(InitializedEntity::InitializeTemporary(RhsT));
--+  Expr *From = new (OpaqueExprAllocator.Allocate<OpaqueValueExpr>())
--+      OpaqueValueExpr(KeyLoc, LhsT.getNonLValueExprType(Self.Context),
--+                      Expr::getValueKindForType(LhsT));
--+  InitializationKind Kind =
--+      InitializationKind::CreateCopy(KeyLoc, SourceLocation());
--+
--+  // Perform the initialization in an unevaluated context within a SFINAE
--+  // trap at translation unit scope.
--+  EnterExpressionEvaluationContext Unevaluated(
--+      Self, Sema::ExpressionEvaluationContext::Unevaluated);
--+  Sema::SFINAETrap SFINAE(Self, /*ForValidityCheck=*/true);
--+  Sema::ContextRAII TUContext(Self, Self.Context.getTranslationUnitDecl());
--+  InitializationSequence Init(Self, To, Kind, From);
--+  if (Init.Failed())
--+    return ExprError();
--+
--+  ExprResult Result = Init.Perform(Self, To, Kind, From);
--+  if (Result.isInvalid() || SFINAE.hasErrorOccurred())
--+    return ExprError();
--+
--+  return Result;
--+}
--+
-- static APValue EvaluateSizeTTypeTrait(Sema &S, TypeTrait Kind,
--                                       SourceLocation KWLoc,
--                                       ArrayRef<TypeSourceInfo *> Args,
--@@ -1372,8 +1442,9 @@
--           S.Context.getPointerType(T.getNonReferenceType()));
--       TypeSourceInfo *UPtr = S.Context.CreateTypeSourceInfo(
--           S.Context.getPointerType(U.getNonReferenceType()));
---      return S.BuiltinIsConvertible(UPtr->getType(), TPtr->getType(),
---                                    RParenLoc);
--+      return !CheckConvertibilityForTypeTraits(S, UPtr, TPtr, RParenLoc,
--+                                               OpaqueExprAllocator)
--+                  .isInvalid();
--     }
-- 
--     if (Kind == clang::TT_IsNothrowConstructible)
--@@ -1624,9 +1695,20 @@
--   }
--   case BTT_IsConvertible:
--   case BTT_IsConvertibleTo:
---  case BTT_IsNothrowConvertible:
---    return Self.BuiltinIsConvertible(LhsT, RhsT, KeyLoc,
---                                     BTT == BTT_IsNothrowConvertible);
--+  case BTT_IsNothrowConvertible: {
--+    if (RhsT->isVoidType())
--+      return LhsT->isVoidType();
--+    llvm::BumpPtrAllocator OpaqueExprAllocator;
--+    ExprResult Result = CheckConvertibilityForTypeTraits(Self, Lhs, Rhs, KeyLoc,
--+                                                         OpaqueExprAllocator);
--+    if (Result.isInvalid())
--+      return false;
--+
--+    if (BTT != BTT_IsNothrowConvertible)
--+      return true;
--+
--+    return Self.canThrow(Result.get()) == CT_Cannot;
--+  }
-- 
--   case BTT_IsAssignable:
--   case BTT_IsNothrowAssignable:
--diff -ruN --strip-trailing-cr a/clang/test/SemaCXX/type-trait-common-reference.cpp b/clang/test/SemaCXX/type-trait-common-reference.cpp
----- a/clang/test/SemaCXX/type-trait-common-reference.cpp
--+++ b/clang/test/SemaCXX/type-trait-common-reference.cpp
--@@ -1,136 +0,0 @@
---// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=c++17 -Wno-vla-cxx-extension %s
---// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify -std=c++20 -Wno-vla-cxx-extension %s
---
---#if !__has_builtin(__builtin_common_reference)
---#  error
---#endif
---
---// expected-note@*:* {{template <template <class, class, template <class> class, template <class> class> class, template <class ...> class, template <class> class, class, class ...>}}
---
---void test() {
---  __builtin_common_reference<> a; // expected-error {{too few template arguments for template '__builtin_common_reference'}}
---  __builtin_common_reference<1> b; // expected-error {{template argument for template template parameter must be a class template or type alias template}}
---  __builtin_common_reference<int, 1> c; // expected-error {{template argument for template template parameter must be a class template or type alias template}}
---}
---
---struct empty_type {};
---
---template <class T>
---struct type_identity {
---  using type = T;
---};
---
---template <class...>
---struct common_type;
---
---template <class... Args>
---using common_type_t = typename common_type<Args...>::type;
---
---template <class, class, template <class> class, template <class> class>
---struct basic_common_reference {};
---
---template <class T, class U, template <class> class TX, template <class> class UX>
---using basic_common_reference_t = typename basic_common_reference<T, U, TX, UX>::type;
---
---void test_vla() {
---  int i = 4;
---  int VLA[i];
---  __builtin_common_reference<basic_common_reference_t, common_type_t, type_identity, empty_type, decltype(VLA)> d; // expected-error {{variably modified type 'decltype(VLA)' (aka 'int[i]') cannot be used as a template argument}}
---}
---
---template <class... Args>
---using common_reference_base = __builtin_common_reference<basic_common_reference_t, common_type_t, type_identity, empty_type, Args...>;
---
---template <class... Args>
---struct common_reference : common_reference_base<Args...> {};
---
---template <class... Args>
---using common_reference_t = typename __builtin_common_reference<basic_common_reference_t, common_type_t, type_identity, empty_type, Args...>::type;
---
---struct Incomplete;
---
---template<>
---struct common_type<Incomplete, Incomplete>;
---
---static_assert(__is_same(common_reference_base<>, empty_type));
---
---static_assert(__is_same(common_reference_base<Incomplete>, type_identity<Incomplete>));
---static_assert(__is_same(common_reference_base<char>, type_identity<char>));
---static_assert(__is_same(common_reference_base<int>, type_identity<int>));
---static_assert(__is_same(common_reference_base<const int>, type_identity<const int>));
---static_assert(__is_same(common_reference_base<volatile int>, type_identity<volatile int>));
---static_assert(__is_same(common_reference_base<const volatile int>, type_identity<const volatile int>));
---static_assert(__is_same(common_reference_base<int[]>, type_identity<int[]>));
---static_assert(__is_same(common_reference_base<const int[]>, type_identity<const int[]>));
---static_assert(__is_same(common_reference_base<void(&)()>, type_identity<void(&)()>));
---
---static_assert(__is_same(common_reference_base<int[], int[]>, type_identity<int*>));
---static_assert(__is_same(common_reference_base<int, int>, type_identity<int>));
---static_assert(__is_same(common_reference_base<int, long>, type_identity<long>));
---static_assert(__is_same(common_reference_base<long, int>, type_identity<long>));
---static_assert(__is_same(common_reference_base<long, long>, type_identity<long>));
---
---static_assert(__is_same(common_reference_base<const int, long>, type_identity<long>));
---static_assert(__is_same(common_reference_base<const volatile int, long>, type_identity<long>));
---static_assert(__is_same(common_reference_base<int, const long>, type_identity<long>));
---static_assert(__is_same(common_reference_base<int, const volatile long>, type_identity<long>));
---
---static_assert(__is_same(common_reference_base<int*, long*>, empty_type));
---static_assert(__is_same(common_reference_base<const unsigned int *const &, const unsigned int *const &>, type_identity<const unsigned int *const &>));
---
---static_assert(__is_same(common_reference_base<int, long, float>, type_identity<float>));
---static_assert(__is_same(common_reference_base<unsigned, char, long>, type_identity<long>));
---static_assert(__is_same(common_reference_base<long long, long long, long>, type_identity<long long>));
---
---static_assert(__is_same(common_reference_base<int [[clang::address_space(1)]]>, type_identity<int [[clang::address_space(1)]]>));
---static_assert(__is_same(common_reference_base<int [[clang::address_space(1)]], int>, type_identity<int>));
---static_assert(__is_same(common_reference_base<long [[clang::address_space(1)]], int>, type_identity<long>));
---static_assert(__is_same(common_reference_base<long [[clang::address_space(1)]], int [[clang::address_space(1)]]>, type_identity<long>));
---static_assert(__is_same(common_reference_base<long [[clang::address_space(1)]], long [[clang::address_space(1)]]>, type_identity<long>));
---static_assert(__is_same(common_reference_base<long [[clang::address_space(1)]], long [[clang::address_space(2)]]>, type_identity<long>));
---
---struct S {};
---struct T : S {};
---struct U {};
---
---static_assert(__is_same(common_reference_base<S&&, T&&>, type_identity<S&&>));
---
---static_assert(__is_same(common_reference_base<int S::*, int S::*>, type_identity<int S::*>));
---static_assert(__is_same(common_reference_base<int S::*, int T::*>, type_identity<int T::*>));
---static_assert(__is_same(common_reference_base<int S::*, long S::*>, empty_type));
---
---static_assert(__is_same(common_reference_base<int (S::*)(), int (S::*)()>, type_identity<int (S::*)()>));
---static_assert(__is_same(common_reference_base<int (S::*)(), int (T::*)()>, type_identity<int (T::*)()>));
---static_assert(__is_same(common_reference_base<int (S::*)(), long (S::*)()>, empty_type));
---
---static_assert(__is_same(common_reference_base<int&, int&>, type_identity<int&>));
---static_assert(__is_same(common_reference_base<int&, const int&>, type_identity<const int&>));
---static_assert(__is_same(common_reference_base<volatile int&, const int&>, type_identity<const volatile int&>));
---
---template <class T, class U>
---struct my_pair;
---
---template <class T1, class U1, class T2, class U2, template <class> class TX, template <class> class UX>
---struct basic_common_reference<my_pair<T1, U1>, my_pair<T2, U2>, TX, UX> {
---  using type = my_pair<common_reference_t<TX<T1>, UX<T2>>, common_reference_t<TX<U1>, UX<U2>>>;
---};
---
---static_assert(__is_same(common_reference_base<my_pair<const int&, int&>, my_pair<int&, volatile int&>>, type_identity<my_pair<const int&, volatile int&>>));
---static_assert(__is_same(common_reference_base<const my_pair<int, int>&, my_pair<int&, volatile int&>>, type_identity<my_pair<const int&, const volatile int&>>));
---static_assert(__is_same(common_reference_base<const int&, const volatile int&>, type_identity<const volatile int&>));
---static_assert(__is_same(common_reference_base<int&&, const volatile int&>, type_identity<int>));
---static_assert(__is_same(common_reference_base<my_pair<int, int>&&, my_pair<int&, volatile int&>>, type_identity<my_pair<const int&, int>>));
---static_assert(__is_same(common_reference_base<my_pair<int, int>&&, my_pair<int&, int>&&>, type_identity<my_pair<const int&, int&&>>));
---
---struct conversion_operator {
---  operator volatile int&&() volatile;
---};
---
---static_assert(__is_same(common_reference_base<volatile conversion_operator&&, volatile int&&>, type_identity<volatile int&&>));
---
---struct reference_wrapper {
---  reference_wrapper(int&);
---  operator int&() const;
---};
---
---static_assert(__is_same(common_reference_base<const reference_wrapper&, int&>, empty_type));
--diff -ruN --strip-trailing-cr a/libcxx/include/__locale_dir/num.h b/libcxx/include/__locale_dir/num.h
----- a/libcxx/include/__locale_dir/num.h
--+++ b/libcxx/include/__locale_dir/num.h
--@@ -436,6 +436,7 @@
--         ++__first;
--         if (__first == __last) {
--           __err |= ios_base::eofbit;
--+          __v = 0;
--           return __first;
--         }
--         // __c2 == 'x' || __c2 == 'X'
--@@ -444,6 +445,7 @@
--           ++__first;
--         } else {
--           __base = 8;
--+          __parsed_num = true; // We only swallowed '0', so we've started to parse a number
--         }
--       } else {
--         __base = 10;
--diff -ruN --strip-trailing-cr a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
----- a/libcxx/include/module.modulemap.in
--+++ b/libcxx/include/module.modulemap.in
--@@ -1517,7 +1517,6 @@
--       header "__iterator/iterator_traits.h"
--       export std_core.type_traits.integral_constant
--       export std_core.type_traits.is_convertible
---      export std_core.type_traits.nat
--     }
--     module iterator_with_data         { header "__iterator/iterator_with_data.h" }
--     module iterator                   { header "__iterator/iterator.h" }
--diff -ruN --strip-trailing-cr a/libcxx/include/__type_traits/common_reference.h b/libcxx/include/__type_traits/common_reference.h
----- a/libcxx/include/__type_traits/common_reference.h
--+++ b/libcxx/include/__type_traits/common_reference.h
--@@ -18,37 +18,16 @@
-- #include <__type_traits/is_reference.h>
-- #include <__type_traits/remove_cvref.h>
-- #include <__type_traits/remove_reference.h>
---#include <__type_traits/type_identity.h>
-- #include <__utility/declval.h>
-- 
-- #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
-- #  pragma GCC system_header
-- #endif
-- 
---#if _LIBCPP_STD_VER >= 20
---
-- _LIBCPP_BEGIN_NAMESPACE_STD
-- 
---template <class...>
---struct _LIBCPP_NO_SPECIALIZATIONS common_reference;
---
---template <class... _Types>
---using common_reference_t = typename common_reference<_Types...>::type;
---
---template <class, class, template <class> class, template <class> class>
---struct basic_common_reference {};
---
---#  if __has_builtin(__builtin_common_reference)
---
---template <class _Tp, class _Up, template <class> class _Tx, template <class> class _Ux>
---using __basic_common_reference_t = basic_common_reference<_Tp, _Up, _Tx, _Ux>::type;
---
---template <class... _Args>
---struct _LIBCPP_NO_SPECIALIZATIONS common_reference
---    : __builtin_common_reference<__basic_common_reference_t, common_type_t, type_identity, __empty, _Args...> {};
---
---#  else
---
--+// common_reference
--+#if _LIBCPP_STD_VER >= 20
-- // Let COND_RES(X, Y) be:
-- template <class _Xp, class _Yp>
-- using __cond_res _LIBCPP_NODEBUG = decltype(false ? std::declval<_Xp (&)()>()() : std::declval<_Yp (&)()>()());
--@@ -130,10 +109,19 @@
-- 
-- // Note C: For the common_reference trait applied to a parameter pack [...]
-- 
--+template <class...>
--+struct _LIBCPP_NO_SPECIALIZATIONS common_reference;
--+
--+template <class... _Types>
--+using common_reference_t = typename common_reference<_Types...>::type;
--+
--+template <class, class, template <class> class, template <class> class>
--+struct basic_common_reference {};
--+
-- _LIBCPP_DIAGNOSTIC_PUSH
---#    if __has_warning("-Winvalid-specialization")
--+#  if __has_warning("-Winvalid-specialization")
-- _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization")
---#    endif
--+#  endif
-- // bullet 1 - sizeof...(T) == 0
-- template <>
-- struct common_reference<> {};
--@@ -207,10 +195,8 @@
-- template <class...>
-- struct _LIBCPP_NO_SPECIALIZATIONS common_reference {};
-- 
---#  endif // __has_builtin(__builtin_common_reference)
--+#endif // _LIBCPP_STD_VER >= 20
-- 
-- _LIBCPP_END_NAMESPACE_STD
-- 
---#endif // _LIBCPP_STD_VER >= 20
---
-- #endif // _LIBCPP___TYPE_TRAITS_COMMON_REFERENCE_H
--diff -ruN --strip-trailing-cr a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp
----- a/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp
--+++ b/libcxx/test/std/localization/locale.categories/category.numeric/locale.num.get/facet.num.get.members/get_long.pass.cpp
--@@ -670,5 +670,101 @@
--       assert(v == std::numeric_limits<long>::min());
--     }
-- 
--+  { // Check that auto-detection of the base works properly
--+    ios.flags(ios.flags() & ~std::ios::basefield);
--+    { // zeroes
--+      {
--+        v                          = -1;
--+        const char str[]           = "0";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 1), ios, err, v);
--+        assert(base(iter) == str + 1);
--+        assert(err == ios.eofbit);
--+        assert(v == 0);
--+      }
--+      {
--+        v                          = -1;
--+        const char str[]           = "00";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 2), ios, err, v);
--+        assert(base(iter) == str + 2);
--+        assert(err == ios.eofbit);
--+        assert(v == 0);
--+      }
--+      {
--+        v                          = -1;
--+        const char str[]           = "0x0";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 3), ios, err, v);
--+        assert(base(iter) == str + 3);
--+        assert(err == ios.eofbit);
--+        assert(v == 0);
--+      }
--+      {
--+        v                          = -1;
--+        const char str[]           = "0X0";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 3), ios, err, v);
--+        assert(base(iter) == str + 3);
--+        assert(err == ios.eofbit);
--+        assert(v == 0);
--+      }
--+    }
--+    { // first character after base is out of range
--+      {
--+        v                          = -1;
--+        const char str[]           = "08";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 2), ios, err, v);
--+        assert(base(iter) == str + 1);
--+        assert(err == ios.goodbit);
--+        assert(v == 0);
--+      }
--+      {
--+        v                          = -1;
--+        const char str[]           = "1a";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 2), ios, err, v);
--+        assert(base(iter) == str + 1);
--+        assert(err == ios.goodbit);
--+        assert(v == 1);
--+      }
--+      {
--+        v                          = -1;
--+        const char str[]           = "0xg";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 3), ios, err, v);
--+        assert(base(iter) == str + 2);
--+        assert(err == ios.failbit);
--+        assert(v == 0);
--+      }
--+      {
--+        v                          = -1;
--+        const char str[]           = "0Xg";
--+        std::ios_base::iostate err = ios.goodbit;
--+
--+        cpp17_input_iterator<const char*> iter =
--+            f.get(cpp17_input_iterator<const char*>(str), cpp17_input_iterator<const char*>(str + 3), ios, err, v);
--+        assert(base(iter) == str + 2);
--+        assert(err == ios.failbit);
--+        assert(v == 0);
--+      }
--+    }
--+  }
--+
--   return 0;
-- }
--diff -ruN --strip-trailing-cr a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
----- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
--+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
--@@ -948,8 +948,7 @@
--     auto [id, args] = NVVM::MBarrierArriveExpectTxOp::getIntrinsicIDAndArgs(
--                       *op, moduleTranslation, builder);
-- 
---    int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
---    if (addrSpace != NVVM::NVVMMemorySpace::SharedCluster)
--+    if (op.getNumResults() > 0)
--       $res = createIntrinsicCall(builder, id, args);
--     else
--       createIntrinsicCall(builder, id, args);
--@@ -985,9 +984,7 @@
--   string llvmBuilder = [{
--     auto [id, args] = NVVM::MBarrierArriveDropExpectTxOp::getIntrinsicIDAndArgs(
--                       *op, moduleTranslation, builder);
---
---    int addrSpace = llvm::cast<LLVMPointerType>(op.getAddr().getType()).getAddressSpace();
---    if (addrSpace != NVVM::NVVMMemorySpace::SharedCluster)
--+    if (op.getNumResults() > 0)
--       $res = createIntrinsicCall(builder, id, args);
--     else
--       createIntrinsicCall(builder, id, args);
--diff -ruN --strip-trailing-cr a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir
----- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir
--+++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/wmma-matmul-f64.mlir
--@@ -44,6 +44,7 @@
--   gpu.host_register %2 : memref<*xf64>
--   gpu.host_register %20 : memref<*xf64>
--   gpu.host_register %33 : memref<*xf64>
--+  gpu.host_register %34 : memref<*xf64>
-- 
--   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
--              threads(%tx, %ty, %tz) in (%block_x = %c32, %block_y = %c1, %block_z = %c1) {
--diff -ruN --strip-trailing-cr a/utils/bazel/configure.bzl b/utils/bazel/configure.bzl
----- a/utils/bazel/configure.bzl
--+++ b/utils/bazel/configure.bzl
--@@ -27,7 +27,6 @@
--     "XCore",
-- ]
-- 
---
-- MAX_TRAVERSAL_STEPS = 1000000  # "big number" upper bound on total visited dirs
-- 
-- def _overlay_directories(repository_ctx):
--@@ -44,7 +43,9 @@
--     for _ in range(MAX_TRAVERSAL_STEPS):
--         rel_dir = stack.pop()
-- 
---        overlay_dirs = set()
--+        # TODO: `set()` is only available in bazel 8.1.
--+        # Use `set()` after downstream users are on more recent versions.
--+        overlay_dirs = {}
-- 
--         # Symlink overlay files, overlay dirs will be handled in future iterations.
--         for entry in overlay_root.get_child(rel_dir).readdir():
--@@ -53,7 +54,7 @@
-- 
--             if entry.is_dir:
--                 stack.append(full_rel_path)
---                overlay_dirs.add(name)
--+                overlay_dirs[name] = None
--             else:
--                 src_path = overlay_root.get_child(full_rel_path)
--                 dst_path = target_root.get_child(full_rel_path)
--@@ -62,7 +63,7 @@
--         # Symlink source dirs (if not themselves overlaid) and files.
--         for src_entry in src_root.get_child(rel_dir).readdir():
--             name = src_entry.basename
---            if name in overlay_dirs:
--+            if name in overlay_dirs.keys():
--                 # Skip: overlay has a directory with this name
--                 continue
-- 
 diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index f8f483c..5e3d8f2 100644
+index 5e3d8f2..dd3d4e4 100644
 --- a/third_party/llvm/workspace.bzl
 +++ b/third_party/llvm/workspace.bzl
 @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
  
  def repo(name):
      """Imports LLVM."""
--    LLVM_COMMIT = "ac66ae45cd22a7958ace645a035831000bfcbf51"
--    LLVM_SHA256 = "3bb51316595bbe99da8bee121f1fc39993176afc5b55f72a5d5010214dcd24a8"
-+    LLVM_COMMIT = "8dee997a8558b460b82b23fb43b197d68258baac"
-+    LLVM_SHA256 = "6a26975000c2cb45787813317bfeeadeafa0cba762e9434fb7940481ec4b27de"
+-    LLVM_COMMIT = "8dee997a8558b460b82b23fb43b197d68258baac"
+-    LLVM_SHA256 = "6a26975000c2cb45787813317bfeeadeafa0cba762e9434fb7940481ec4b27de"
++    LLVM_COMMIT = "c6e23ab80753a01dce270f5f8a133fbec942315d"
++    LLVM_SHA256 = "5a6b8aacd2d87ce9c4456843a76d0a54fd7cd0ae788ed3f19e7487ecd2ce4326"
  
      tf_http_archive(
          name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index be133debb9e029..22625dcbb1fff3 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "e0131b277694a847e26d0ce4ce489423f399e26c"
-    SHARDY_SHA256 = "54341c51d72c773217023a77db3b53253bcf49faa5a3d59d088e8b3e75d976be"
+    SHARDY_COMMIT = "bda4ef8940c146fed6477d03b375c06d04475003"
+    SHARDY_SHA256 = "4d54264a91c6ae7977ea072eef92a4df0ffa4b5cc97bdf814095b0f8f7f6d5ce"
 
     tf_http_archive(
         name = "shardy",

From 4fbf3ad045cfe734a7603354f2cdde3481c0b571 Mon Sep 17 00:00:00 2001
From: Marcin Radomski <dextero@google.com>
Date: Tue, 9 Dec 2025 12:41:56 -0800
Subject: [PATCH 091/753] [XLA:GPU] Fix the ASSERT_OK_AND_ASSIGN macro

I did an oopsie and it didn't work because it referred to wrong _IMPL
macro.

PiperOrigin-RevId: 842354212
---
 .../third_party/xla_googletest_wrapper/include/gmock/gmock.h    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h b/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
index 1d8aa31e110661..27cfbf1160bd38 100644
--- a/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
+++ b/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
@@ -62,7 +62,7 @@ limitations under the License.
   ASSERT_THAT(expression, ::xla_testing::internal::IsOk())
 
 #define ASSERT_OK_AND_ASSIGN(lhs, rexpr)                            \
-  TF_ASSERT_OK_AND_ASSIGN_IMPL(                                     \
+  ASSERT_OK_AND_ASSIGN_IMPL(                                        \
       XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \
       lhs, rexpr);
 

From 9f2ccbf20305bb9987de2eced4cfdfacbd8c9914 Mon Sep 17 00:00:00 2001
From: Matthias Guenther <mrguenther@google.com>
Date: Tue, 9 Dec 2025 12:55:28 -0800
Subject: [PATCH 092/753] Legalize MLIR to StableHLO instead of MHLO before
 lowering

PiperOrigin-RevId: 842359586
---
 .../tf2xla/api/v1/compile_mlir_util_test.cc   |  2 +-
 .../xla/third_party/stablehlo/temporary.patch | 12 +++
 .../mhlo_to_hlo/attribute_exporter.cc         |  2 +-
 .../mhlo_to_hlo/attribute_exporter_test.cc    |  8 +-
 .../translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc  | 83 ++++++++++---------
 .../tests/opaque_elements_attr.mlir           |  4 +-
 .../hlo_legalize_to_stablehlo_pass.cc         | 29 +++++--
 .../stablehlo_prepare_for_hlo_export.cpp      |  2 +-
 8 files changed, 88 insertions(+), 54 deletions(-)

diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
index b13e099fde3557..475bd79849e80e 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util_test.cc
@@ -85,7 +85,7 @@ TEST(LegalizeMlirTest, LegalizesModule) {
       /*shape_determination_fns=*/{}, &compilation_result);
 
   EXPECT_TRUE(status.ok());
-  EXPECT_THAT(status.value(), HasSubstr("mhlo.const"));
+  EXPECT_THAT(status.value(), HasSubstr("stablehlo.constant"));
 }
 
 TEST(LegalizeMlirTest, FailsLegalizesModule) {
diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index adc1a07d777e1f..5e49416bc43a26 100755
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -982,6 +982,18 @@ diff --ruN a/stablehlo/stablehlo/tests/ops_broadcasting.mlir b/stablehlo/stableh
 +  return %0 : !stablehlo.token
 +}
 +
+diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir b/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
+--- stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
++++ stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
+@@ -316,7 +316,7 @@
+ // Serialized string:
+ //   "\08\03\1A\02\01\02\22\02\00\01"
+ func.func @test_custom_call2(%arg0: tensor<16x16xf32>) -> tensor<16x16xf32> {
+-  %0 = "stablehlo.custom_call"(%arg0) {backend_config = "", call_target_name = "Sharding", stablehlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"} : (tensor<16x16xf32>) -> tensor<16x16xf32>
++  %0 = "stablehlo.custom_call"(%arg0) {backend_config = "", call_target_name = "Sharding", mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"} : (tensor<16x16xf32>) -> tensor<16x16xf32>
+   func.return %0 : tensor<16x16xf32>
+ }
+ 
 diff --ruN a/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir b/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
 --- stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
 +++ stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
diff --git a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter.cc b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter.cc
index 326ec30ba20572..d2a31df67d3b4b 100644
--- a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter.cc
+++ b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter.cc
@@ -490,7 +490,7 @@ std::optional<xla::OpSharding> ExtractShardyResultShardingFromFrontendAttrs(
   mlir::Operation* defining_op =
       mlir::sdy::getBodyTerminatorOperand(function, res_num).getDefiningOp();
   auto custom_call_op =
-      mlir::dyn_cast_or_null<mlir::mhlo::CustomCallOp>(defining_op);
+      mlir::dyn_cast_or_null<mlir::stablehlo::CustomCallOp>(defining_op);
 
   if (custom_call_op == nullptr ||
       custom_call_op.getCallTargetName() !=
diff --git a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter_test.cc b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter_test.cc
index e9a37c9fc24c65..230922533025f0 100644
--- a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter_test.cc
+++ b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/attribute_exporter_test.cc
@@ -184,8 +184,8 @@ TEST_F(AttributeExporterTest, ExtractShardyResultShardingFromFrontendAttrs) {
       "{mesh = #sdy.mesh<[\"x\"=2, \"y\"=4, \"z\"=4]>}"
     }} {
       func.func @main(%arg0: tensor<8x8xf32>) -> (tensor<8x8xf32>, tensor<8x8xf32>) {
-        %0 = mhlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<@mesh, [{\"x\", \"y\", ?}, {\"z\"}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
-        %1 = mhlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<@mesh, [{}, {}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
+        %0 = stablehlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<@mesh, [{\"x\", \"y\", ?}, {\"z\"}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
+        %1 = stablehlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<@mesh, [{}, {}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
         return %0, %1 : tensor<8x8xf32>, tensor<8x8xf32>
       }
     }
@@ -222,8 +222,8 @@ TEST_F(AttributeExporterTest,
   constexpr absl::string_view mlir_source = R"mlir(
     module @test {
       func.func @main(%arg0: tensor<8x8xf32>) -> (tensor<8x8xf32>, tensor<8x8xf32>) {
-        %0 = mhlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<mesh<[\"x\"=2, \"y\"=4, \"z\"=4]>, [{\"x\", \"y\", ?}, {\"z\"}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
-        %1 = mhlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<mesh<[\"x\"=2, \"y\"=4, \"z\"=4]>, [{}, {}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
+        %0 = stablehlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<mesh<[\"x\"=2, \"y\"=4, \"z\"=4]>, [{\"x\", \"y\", ?}, {\"z\"}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
+        %1 = stablehlo.custom_call @local_xla.sdy.FuncResultSharding(%arg0) {has_side_effect = true, mhlo.frontend_attributes = {xla.sdy.sharding = "#sdy.sharding_per_value<[<mesh<[\"x\"=2, \"y\"=4, \"z\"=4]>, [{}, {}]>]>"}} : (tensor<8x8xf32>) -> tensor<8x8xf32>
         return %0, %1 : tensor<8x8xf32>, tensor<8x8xf32>
       }
     }
diff --git a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
index a33d5fb6872457..88fc6bc4f2da29 100644
--- a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
+++ b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
@@ -3039,8 +3039,8 @@ mlir::LogicalResult ExportXlaOp(SetDimensionSizeOp op, OpLoweringContext ctx) {
     return op.emitError(shape_or.status().ToString());
   }
   xla::XlaOp xla_result;
-  if (auto constant =
-          llvm::dyn_cast_or_null<ConstantOp>(op.getSize().getDefiningOp());
+  if (auto constant = llvm::dyn_cast_or_null<stablehlo::ConstantOp>(
+          op.getSize().getDefiningOp());
       constant != nullptr) {
     auto value = constant.getValue();
     auto values = value.getValues<mlir::IntegerAttr>();
@@ -3340,9 +3340,9 @@ LogicalResult ExportXlaOp(ReduceScatterOp op, OpLoweringContext ctx) {
   return success();
 }
 
-LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
+LogicalResult ExportXlaOp(mhlo::AsyncStartOp op, OpLoweringContext ctx) {
   for (auto* user : op.getResult().getUsers()) {
-    if (!isa<AsyncUpdateOp, AsyncDoneOp>(user)) {
+    if (!isa<mhlo::AsyncUpdateOp, mhlo::AsyncDoneOp>(user)) {
       return op.emitOpError() << "Users of AsyncStart's return value must be "
                               << "async_update or async_done";
     }
@@ -3357,8 +3357,8 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
   mlir::func::FuncOp callee = ctx.converter->LookUpSymbol(
       FlatSymbolRefAttr::get(op->getContext(), op.getCalledComputation()));
 
-  auto all_gather_op =
-      dyn_cast_or_null<AllGatherOp>(callee.getBody().front().front());
+  auto all_gather_op = dyn_cast_or_null<stablehlo::AllGatherOp>(
+      callee.getBody().front().front());
   if (all_gather_op && SimplyReturnedOp(all_gather_op)) {
     TensorType operand_type =
         mlir::cast<TensorType>(all_gather_op.getOperand(0).getType());
@@ -3378,8 +3378,8 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
         Convert_use_global_device_ids(all_gather_op.getUseGlobalDeviceIds()));
     return success();
   }
-  auto all_reduce_op =
-      dyn_cast_or_null<AllReduceOp>(callee.getBody().front().front());
+  auto all_reduce_op = dyn_cast_or_null<stablehlo::AllReduceOp>(
+      callee.getBody().front().front());
   if (all_reduce_op && SimplyReturnedOp(all_reduce_op)) {
     xla::XlaComputationId computation;
     if (failed(ctx.converter->LowerRegionAsComputation(
@@ -3394,8 +3394,8 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
         Convert_use_global_device_ids(all_reduce_op.getUseGlobalDeviceIds()));
     return success();
   }
-  auto collective_permute_op =
-      dyn_cast_or_null<CollectivePermuteOp>(callee.getBody().front().front());
+  auto collective_permute_op = dyn_cast_or_null<stablehlo::CollectivePermuteOp>(
+      callee.getBody().front().front());
   if (collective_permute_op && SimplyReturnedOp(collective_permute_op)) {
     value_map[result] =
         xla::internal::XlaBuilderFriend::BuildCollectivePermuteStart(
@@ -3405,7 +3405,8 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
             Convert_channel_handle(collective_permute_op.getChannelHandle()));
     return mlir::success();
   }
-  auto copy_op = dyn_cast_or_null<CopyOp>(callee.getBody().front().front());
+  auto copy_op =
+      dyn_cast_or_null<mhlo::CopyOp>(callee.getBody().front().front());
   if (copy_op && SimplyReturnedOp(copy_op)) {
     std::optional<int> cross_program_prefetch_index =
         copy_op.getCrossProgramPrefetchIndex()
@@ -3415,7 +3416,8 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
         ctx.builder, operands[0], cross_program_prefetch_index);
     return mlir::success();
   }
-  auto send_op = dyn_cast_or_null<SendOp>(callee.getBody().front().front());
+  auto send_op =
+      dyn_cast_or_null<stablehlo::SendOp>(callee.getBody().front().front());
   if (send_op && SimplyReturnedOp(send_op)) {
     xla::XlaOp operand;
     if (operands.size() == 2)
@@ -3432,7 +3434,8 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
         send_op.getIsHostTransfer());
     return mlir::success();
   }
-  auto recv_op = dyn_cast_or_null<RecvOp>(callee.getBody().front().front());
+  auto recv_op =
+      dyn_cast_or_null<stablehlo::RecvOp>(callee.getBody().front().front());
   if (recv_op && SimplyReturnedOp(recv_op)) {
     auto result_types =
         mlir::cast<AsyncBundleType>(result.getType()).getTypes()[1];
@@ -3466,8 +3469,9 @@ LogicalResult ExportXlaOp(AsyncStartOp op, OpLoweringContext ctx) {
   return success();
 }
 
-LogicalResult ExportXlaOp(AsyncUpdateOp op, OpLoweringContext ctx) {
-  if (!isa<AsyncStartOp, AsyncUpdateOp>(op.getBundle().getDefiningOp())) {
+LogicalResult ExportXlaOp(mhlo::AsyncUpdateOp op, OpLoweringContext ctx) {
+  if (!isa<mhlo::AsyncStartOp, mhlo::AsyncUpdateOp>(
+          op.getBundle().getDefiningOp())) {
     auto theerror = op.emitError()
                     << "Defining op of AsyncUpdate's operand must be "
                     << "async_start or async_update";
@@ -3480,7 +3484,7 @@ LogicalResult ExportXlaOp(AsyncUpdateOp op, OpLoweringContext ctx) {
   }
 
   for (auto* user : op.getResult().getUsers()) {
-    if (!isa<AsyncUpdateOp, AsyncDoneOp>(user)) {
+    if (!isa<mhlo::AsyncUpdateOp, mhlo::AsyncDoneOp>(user)) {
       return op.emitOpError() << "Users of AsyncUpdate's return value must be "
                               << "async_update or async_done";
     }
@@ -3497,8 +3501,9 @@ LogicalResult ExportXlaOp(AsyncUpdateOp op, OpLoweringContext ctx) {
   return success();
 }
 
-LogicalResult ExportXlaOp(AsyncDoneOp op, OpLoweringContext ctx) {
-  if (!isa<AsyncStartOp, AsyncUpdateOp>(op.getBundle().getDefiningOp())) {
+LogicalResult ExportXlaOp(mhlo::AsyncDoneOp op, OpLoweringContext ctx) {
+  if (!isa<mhlo::AsyncStartOp, mhlo::AsyncUpdateOp>(
+          op.getBundle().getDefiningOp())) {
     auto theerror = op.emitError()
                     << "Defining op of AsyncDone's operand must be "
                     << "async_start or async_update";
@@ -3514,42 +3519,44 @@ LogicalResult ExportXlaOp(AsyncDoneOp op, OpLoweringContext ctx) {
   if (failed(GetXlaOp(op.getBundle(), value_map, &operand, op)))
     return failure();
 
-  // Find the AsyncStartOp that starts the async chain.
+  // Find the mhlo::AsyncStartOp that starts the async chain.
   Operation* start = op;
-  while (start != nullptr && !isa<AsyncStartOp>(start)) {
+  while (start != nullptr && !isa<mhlo::AsyncStartOp>(start)) {
     start = start->getOperand(0).getDefiningOp();
-    if (start == nullptr || !isa<AsyncStartOp, AsyncUpdateOp>(start)) {
+    if (start == nullptr ||
+        !isa<mhlo::AsyncStartOp, mhlo::AsyncUpdateOp>(start)) {
       return op.emitError() << "Defining op of AsyncDone's operand must be "
                             << "async_start or async_update";
     }
   }
 
-  if (!isa<AsyncStartOp>(start)) {
+  if (!isa<mhlo::AsyncStartOp>(start)) {
     return op.emitError() << "Could not find async chain start";
   }
 
   mlir::func::FuncOp callee =
       ctx.converter->LookUpSymbol(FlatSymbolRefAttr::get(
-          op->getContext(), cast<AsyncStartOp>(start).getCalledComputation()));
+          op->getContext(),
+          cast<mhlo::AsyncStartOp>(start).getCalledComputation()));
 
-  auto all_gather_op =
-      dyn_cast_or_null<AllGatherOp>(callee.getBody().front().front());
+  auto all_gather_op = dyn_cast_or_null<stablehlo::AllGatherOp>(
+      callee.getBody().front().front());
   if (all_gather_op && SimplyReturnedOp(all_gather_op)) {
     value_map[op.getResult(0)] =
         xla::internal::XlaBuilderFriend::BuildAllGatherDone(
             ctx.builder, operand, xla::TypeToShape(all_gather_op.getType(0)));
     return success();
   }
-  auto all_reduce_op =
-      dyn_cast_or_null<AllReduceOp>(callee.getBody().front().front());
+  auto all_reduce_op = dyn_cast_or_null<stablehlo::AllReduceOp>(
+      callee.getBody().front().front());
   if (all_reduce_op && SimplyReturnedOp(all_reduce_op)) {
     value_map[op.getResult(0)] =
         xla::internal::XlaBuilderFriend::BuildAllReduceDone(
             ctx.builder, operand, xla::TypeToShape(all_reduce_op.getType(0)));
     return success();
   }
-  auto collective_permute_op =
-      dyn_cast_or_null<CollectivePermuteOp>(callee.getBody().front().front());
+  auto collective_permute_op = dyn_cast_or_null<stablehlo::CollectivePermuteOp>(
+      callee.getBody().front().front());
   if (collective_permute_op && SimplyReturnedOp(collective_permute_op)) {
     value_map[op.getResult(0)] =
         xla::internal::XlaBuilderFriend::BuildCollectivePermuteDone(
@@ -3557,13 +3564,15 @@ LogicalResult ExportXlaOp(AsyncDoneOp op, OpLoweringContext ctx) {
             xla::TypeToShape(collective_permute_op.getType()));
     return success();
   }
-  auto copy_op = dyn_cast_or_null<CopyOp>(callee.getBody().front().front());
+  auto copy_op =
+      dyn_cast_or_null<mhlo::CopyOp>(callee.getBody().front().front());
   if (copy_op && SimplyReturnedOp(copy_op)) {
     value_map[op.getResult(0)] = xla::internal::XlaBuilderFriend::BuildCopyDone(
         ctx.builder, operand, xla::TypeToShape(copy_op.getType()));
     return success();
   }
-  auto send_op = dyn_cast_or_null<SendOp>(callee.getBody().front().front());
+  auto send_op =
+      dyn_cast_or_null<stablehlo::SendOp>(callee.getBody().front().front());
   if (send_op && SimplyReturnedOp(send_op)) {
     value_map[op.getResult(0)] = xla::internal::XlaBuilderFriend::BuildSendDone(
         ctx.builder, operand,
@@ -3571,7 +3580,8 @@ LogicalResult ExportXlaOp(AsyncDoneOp op, OpLoweringContext ctx) {
         send_op.getIsHostTransfer());
     return success();
   }
-  auto recv_op = dyn_cast_or_null<RecvOp>(callee.getBody().front().front());
+  auto recv_op =
+      dyn_cast_or_null<stablehlo::RecvOp>(callee.getBody().front().front());
   if (recv_op && SimplyReturnedOp(recv_op)) {
     auto result_types =
         mlir::cast<AsyncBundleType>(op.getBundle().getType()).getTypes()[1];
@@ -6427,12 +6437,11 @@ absl::Status ConvertMlirHloToHlo(mlir::ModuleOp module,
 #endif
   pm.enableVerifier(enableVerifier);
 
-  mhlo::StablehloLegalizeToHloPassOptions shlo_pass_opts;
-  shlo_pass_opts.convert_xla_supported_stablehlo_ =
-      !options.direct_stablehlo_to_hlo;
-  pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass(shlo_pass_opts));
+  mhlo::HloLegalizeToStablehloPassOptions shlo_pass_opts;
+  shlo_pass_opts.allow_xla_features_ = true;
+  pm.addPass(mlir::mhlo::createHloLegalizeToStablehloPass(shlo_pass_opts));
   if (failed(pm.run(module))) {
-    return absl::InternalError("Unable to convert StableHLO to MHLO");
+    return absl::InternalError("Unable to convert MHLO to StableHLO");
   }
 
   TF_RETURN_IF_ERROR(PrepareForExport(module));
diff --git a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/tests/opaque_elements_attr.mlir b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/tests/opaque_elements_attr.mlir
index 852525dc317da5..461307a0b558f8 100644
--- a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/tests/opaque_elements_attr.mlir
+++ b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/tests/opaque_elements_attr.mlir
@@ -10,7 +10,7 @@ func.func @main() {
 
 // Tests dynamic result shape
 
-// CHECK: 'mhlo.all_gather' op can't be translated to XLA HLO
+// CHECK: 'stablehlo.all_gather' op can't be translated to XLA HLO
 func.func @main(%arg0: tensor<128x32xf32>) -> tensor<128x?xf32> {
   %0 = "mhlo.all_gather"(%arg0) {
     all_gather_dim = 1 : i64,
@@ -24,7 +24,7 @@ func.func @main(%arg0: tensor<128x32xf32>) -> tensor<128x?xf32> {
 
 // Tests dynamic operand shape
 
-// CHECK: 'mhlo.all_gather' op can't be translated to XLA HLO
+// CHECK: 'stablehlo.all_gather' op can't be translated to XLA HLO
 func.func @main(%arg0: tensor<128x32xf32>) -> tensor<128x?xf32> {
   %0 = "mhlo.all_gather"(%arg0) {
     all_gather_dim = 1 : i64,
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo_pass.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo_pass.cc
index e4b7a1a91f2b31..e677e65de672e9 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo_pass.cc
@@ -106,13 +106,23 @@ struct HloLegalizeToStablehloPass
     stablehlo::registerFuncOpsForTypeConversion(target, patterns, converter);
 
     if (allow_xla_features_) {
-      // These ops do not exist in StableHLO.
-      target.addLegalOp<mhlo::AsyncDoneOp, mhlo::AsyncStartOp,
-                        mhlo::AsyncUpdateOp, mhlo::BitcastOp, mhlo::CopyOp,
-                        mhlo::DomainOp, mhlo::ErfOp, mhlo::FusionOp,
-                        mhlo::MinimumBroadcastShapesOp, mhlo::RaggedDotOp,
-                        mhlo::StochasticConvertOp, mhlo::TopKOp, mhlo::TraceOp,
-                        mhlo::XlaRngGetAndUpdateStateOp>();
+      // These ops do not exist in StableHLO. (They do exist in CHLO, a slightly
+      // higher-level dialect wrapping StableHLO, but we leave them as MHLO here
+      // since we're specifically legalizing to StableHLO, not to CHLO.)
+      target.addLegalOp<  //
+          mhlo::AcosOp, mhlo::AcoshOp, mhlo::AsinOp, mhlo::AsinhOp,
+          mhlo::AtanhOp, mhlo::CoshOp, mhlo::ErfOp, mhlo::RaggedDotOp,
+          mhlo::SinhOp, mhlo::TopKOp>();
+
+      // These ops do not exist in StableHLO. (They don't exist in CHLO, either;
+      // MHLO is the appropriate dialect for expressing XLA-specific features
+      // such as these.)
+      target.addLegalOp<
+          mhlo::AsyncDoneOp, mhlo::AsyncStartOp, mhlo::AsyncUpdateOp,
+          mhlo::BitcastOp, mhlo::CopyOp, mhlo::DomainOp, mhlo::FusionOp,
+          mhlo::MinimumBroadcastShapesOp, mhlo::StochasticConvertOp,
+          mhlo::TraceOp, mhlo::XlaRngGetAndUpdateStateOp>();
+
       target.addDynamicallyLegalOp<mhlo::AddDependencyOp>(
           [](mhlo::AddDependencyOp op) {
             return !hasMhloTypes(op->getOperandTypes());
@@ -142,8 +152,11 @@ struct HloLegalizeToStablehloPass
         [](Operation* op) { return !hasMhloTypes(op->getOperandTypes()); });
     patterns.add<UpdateOperandsInUnknownOp>(converter, &getContext());
 
+    ConversionConfig config;
+    config.foldingMode = DialectConversionFoldingMode::Never;
+
     if (failed(applyPartialConversion(getOperation(), target,
-                                      std::move(patterns))))
+                                      std::move(patterns), config)))
       return signalPassFailure();
   }
 };
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp
index 3e793b4210e1bb..5712ad045279b4 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp
@@ -44,7 +44,7 @@ limitations under the License.
 namespace mlir {
 namespace stablehlo_ext {
 
-constexpr char kShardingAttr[] = "stablehlo.sharding";
+constexpr char kShardingAttr[] = "mhlo.sharding";
 
 #define GEN_PASS_DEF_STABLEHLOPREPAREFORHLOEXPORTPASS
 #include "stablehlo_ext/transforms/passes.h.inc"

From 0faedecfc71ed6ff680ac0cca92db4d925bcaf46 Mon Sep 17 00:00:00 2001
From: Marissa Ikonomidis <marissaw@google.com>
Date: Tue, 9 Dec 2025 13:04:02 -0800
Subject: [PATCH 093/753] Support Windows file paths in serialization.cc

Windows uses '\' instead of '/' so update the library to support
both.

PiperOrigin-RevId: 842363091
---
 tensorflow/lite/delegates/serialization.cc | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/lite/delegates/serialization.cc b/tensorflow/lite/delegates/serialization.cc
index 0c26589e19bf96..fec19eb6ac34a2 100644
--- a/tensorflow/lite/delegates/serialization.cc
+++ b/tensorflow/lite/delegates/serialization.cc
@@ -47,6 +47,12 @@ namespace {
 
 static const char kDelegatedNodesSuffix[] = "_dnodes";
 
+#if defined(_WIN32)
+static const char kPathSeparator = '\\';
+#else
+static const char kPathSeparator = '/';
+#endif  // defined(_WIN32)
+
 // Farmhash Fingerprint
 inline uint64_t CombineFingerprints(uint64_t l, uint64_t h) {
   // Murmur-inspired hashing.
@@ -63,7 +69,8 @@ inline uint64_t CombineFingerprints(uint64_t l, uint64_t h) {
 
 inline std::string JoinPath(const std::string& path1,
                             const std::string& path2) {
-  return (path1.back() == '/') ? (path1 + path2) : (path1 + "/" + path2);
+  return (path1.back() == kPathSeparator) ? (path1 + path2)
+                                          : (path1 + kPathSeparator + path2);
 }
 
 inline std::string GetFilePath(const std::string& cache_dir,

From d411e84d94174546f2d7b6a9e1a80a208bd4a554 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Tue, 9 Dec 2025 13:09:54 -0800
Subject: [PATCH 094/753] Change PjRt to use new copy of coordination service.

PiperOrigin-RevId: 842365974
---
 third_party/xla/xla/pjrt/distributed/BUILD    | 11 +++++-----
 .../xla/xla/pjrt/distributed/client.cc        | 22 +++++++++----------
 third_party/xla/xla/pjrt/distributed/client.h |  6 ++---
 .../xla/xla/pjrt/distributed/service.cc       | 14 ++++++------
 .../xla/xla/pjrt/distributed/service.h        |  4 ++--
 third_party/xla/xla/pjrt/gpu/BUILD            |  1 -
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  1 -
 third_party/xla/xla/python/pjrt_ifrt/BUILD    |  2 +-
 .../xla/xla/python/pjrt_ifrt/pjrt_client.cc   |  8 +++----
 .../xla/xla/python/pjrt_ifrt/pjrt_client.h    |  2 +-
 third_party/xla/xla/python/version.h          |  4 +---
 11 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/BUILD b/third_party/xla/xla/pjrt/distributed/BUILD
index 4f5e5356f6a343..fb1bdb6f7800f2 100644
--- a/third_party/xla/xla/pjrt/distributed/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/BUILD
@@ -22,9 +22,9 @@ cc_library(
     deps = [
         "//xla:types",
         "//xla:util",
-        "//xla/tsl/distributed_runtime/coordination:coordination_service",
+        "//xla/pjrt/distributed/coordination:coordination_service",
+        "//xla/pjrt/distributed/coordination:grpc_coordination_service_impl",
         "//xla/tsl/distributed_runtime/rpc:async_service_interface",
-        "//xla/tsl/distributed_runtime/rpc/coordination:grpc_coordination_service_impl",
         "//xla/tsl/protobuf:coordination_config_proto_cc",
         "@com_github_grpc_grpc//:grpc++",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -65,10 +65,10 @@ cc_library(
     ],
     deps = [
         ":key_value_store_interface",
+        "//xla/pjrt/distributed/coordination:coordination_client",
+        "//xla/pjrt/distributed/coordination:coordination_service_agent",
+        "//xla/pjrt/distributed/coordination:grpc_coordination_client",
         "//xla/runtime:device_id",
-        "//xla/tsl/distributed_runtime/coordination:coordination_client",
-        "//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
-        "//xla/tsl/distributed_runtime/rpc/coordination:grpc_coordination_client",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/protobuf:coordination_config_proto_cc",
@@ -144,6 +144,7 @@ xla_cc_test(
         ":service",
         ":topology_util",
         "//xla:status_macros",
+        "//xla/pjrt/distributed/coordination:coordination_service_agent",
         "//xla/runtime:device_id",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/pjrt/distributed/client.cc b/third_party/xla/xla/pjrt/distributed/client.cc
index e1230a81e614ba..b6c3087a56f808 100644
--- a/third_party/xla/xla/pjrt/distributed/client.cc
+++ b/third_party/xla/xla/pjrt/distributed/client.cc
@@ -31,11 +31,11 @@ limitations under the License.
 #include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "grpcpp/channel.h"
+#include "xla/pjrt/distributed/coordination/coordination_client.h"
+#include "xla/pjrt/distributed/coordination/coordination_service_agent.h"
+#include "xla/pjrt/distributed/coordination/grpc_coordination_client.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/runtime/device_id.h"
-#include "xla/tsl/distributed_runtime/coordination/coordination_client.h"
-#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
-#include "xla/tsl/distributed_runtime/rpc/coordination/grpc_coordination_client.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/protobuf/coordination_config.pb.h"
 #include "xla/tsl/protobuf/coordination_service.pb.h"
@@ -73,11 +73,11 @@ class DistributedRuntimeCoordinationServiceClient
   GetLiveNodesWithIncarnations(absl::Span<const int32_t> nodes) override;
   absl::StatusOr<std::vector<int32_t>> GetLiveNodes(
       absl::Span<const int32_t> nodes) override;
-  absl::StatusOr<tsl::CoordinationServiceAgent*> GetCoordinationServiceAgent()
+  absl::StatusOr<CoordinationServiceAgent*> GetCoordinationServiceAgent()
       override;
 
  private:
-  std::unique_ptr<tsl::CoordinationServiceAgent> coord_agent_;
+  std::unique_ptr<CoordinationServiceAgent> coord_agent_;
   tensorflow::CoordinationServiceConfig config_;
   absl::Duration min_connect_barrier_timeout_;
   int task_id_;
@@ -102,9 +102,9 @@ DistributedRuntimeCoordinationServiceClient::
   config.set_poll_for_error_from_service_at_startup(
       options.poll_for_error_from_service_at_startup);
 
-  std::unique_ptr<tsl::CoordinationClient> leader_client;
-  leader_client.reset(tsl::NewGrpcCoordinationClient(channel));
-  coord_agent_ = tsl::CreateCoordinationServiceAgent();
+  std::unique_ptr<CoordinationClient> leader_client;
+  leader_client.reset(NewGrpcCoordinationClient(channel));
+  coord_agent_ = CreateCoordinationServiceAgent();
   const absl::Status status = coord_agent_->Initialize(
       options.env, "jax_worker", options.node_id, config,
       std::move(leader_client), options.missed_heartbeat_callback,
@@ -232,12 +232,12 @@ DistributedRuntimeCoordinationServiceClient::GetLiveNodesWithIncarnations(
 
   // Get the set of live tasks.
   TF_ASSIGN_OR_RETURN(
-      const std::vector<tsl::CoordinationServiceAgent::AliveTask> live_tasks,
+      const std::vector<CoordinationServiceAgent::AliveTask> live_tasks,
       coord_agent_->GetAliveTasks(tasks));
 
   // Extract the node ids from the live tasks.
   absl::flat_hash_map<int32_t, IncarnationId> live_nodes;
-  for (const tsl::CoordinationServiceAgent::AliveTask& task : live_tasks) {
+  for (const CoordinationServiceAgent::AliveTask& task : live_tasks) {
     live_nodes[task.task_id] = task.incarnation_id;
   }
   return live_nodes;
@@ -258,7 +258,7 @@ DistributedRuntimeCoordinationServiceClient::GetLiveNodes(
   return live_nodes;
 }
 
-absl::StatusOr<tsl::CoordinationServiceAgent*>
+absl::StatusOr<CoordinationServiceAgent*>
 DistributedRuntimeCoordinationServiceClient::GetCoordinationServiceAgent() {
   return coord_agent_.get();
 }
diff --git a/third_party/xla/xla/pjrt/distributed/client.h b/third_party/xla/xla/pjrt/distributed/client.h
index 50355fd8c0b182..01bab0d6f700c6 100644
--- a/third_party/xla/xla/pjrt/distributed/client.h
+++ b/third_party/xla/xla/pjrt/distributed/client.h
@@ -36,9 +36,9 @@ limitations under the License.
 #include "xla/runtime/device_id.h"
 #include "xla/tsl/platform/env.h"
 
-namespace tsl {
+namespace xla {
 class CoordinationServiceAgent;
-}  // namespace tsl
+}  // namespace xla
 
 namespace xla {
 
@@ -165,7 +165,7 @@ class DistributedRuntimeClient {
 
   // Returns pointer to coordination service agent, or InternalError if the
   // client does not use coordination service.
-  virtual absl::StatusOr<tsl::CoordinationServiceAgent*>
+  virtual absl::StatusOr<CoordinationServiceAgent*>
   GetCoordinationServiceAgent() = 0;
 };
 
diff --git a/third_party/xla/xla/pjrt/distributed/service.cc b/third_party/xla/xla/pjrt/distributed/service.cc
index 7c7288e79f967a..1f1a58f326c21b 100644
--- a/third_party/xla/xla/pjrt/distributed/service.cc
+++ b/third_party/xla/xla/pjrt/distributed/service.cc
@@ -24,9 +24,9 @@ limitations under the License.
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "grpcpp/server_builder.h"
-#include "xla/tsl/distributed_runtime/coordination/coordination_service.h"
+#include "xla/pjrt/distributed/coordination/coordination_service.h"
+#include "xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h"
 #include "xla/tsl/distributed_runtime/rpc/async_service_interface.h"
-#include "xla/tsl/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h"
 #include "xla/tsl/protobuf/coordination_config.pb.h"
 #include "xla/util.h"
 #include "tsl/platform/env.h"
@@ -34,7 +34,7 @@ limitations under the License.
 
 namespace {
 
-std::unique_ptr<tsl::CoordinationService> EnableCoordinationService(
+std::unique_ptr<xla::CoordinationService> EnableCoordinationService(
     const xla::CoordinationServiceImpl::Options& options) {
   const std::string job_name = "jax_worker";
   tensorflow::CoordinationServiceConfig config;
@@ -52,7 +52,7 @@ std::unique_ptr<tsl::CoordinationService> EnableCoordinationService(
   job->set_name(job_name);
   job->set_num_tasks(options.num_nodes);
   auto service =
-      tsl::CoordinationService::Create(options.env, config, /*cache=*/nullptr);
+      xla::CoordinationService::Create(options.env, config, /*cache=*/nullptr);
   return service;
 }
 }  // namespace
@@ -67,10 +67,10 @@ CoordinationServiceImpl::CoordinationServiceImpl(
   coord_compute_pool_ = std::make_unique<tsl::thread::ThreadPool>(
       options.env, "CoordinationServiceRpcHandler",
       /*num_threads=*/4);
-  coord_rpc_service_ = std::make_unique<tsl::GrpcCoordinationServiceImpl>(
+  coord_rpc_service_ = std::make_unique<GrpcCoordinationServiceImpl>(
       coord_compute_pool_.get(), builder);
   auto* grpc_coord_service =
-      static_cast<tsl::GrpcCoordinationServiceImpl*>(coord_rpc_service_.get());
+      static_cast<GrpcCoordinationServiceImpl*>(coord_rpc_service_.get());
   grpc_coord_service->SetCoordinationServiceInstance(coord_service_.get());
   LOG(INFO) << "Coordination service is enabled.";
 }
@@ -79,7 +79,7 @@ CoordinationServiceImpl::~CoordinationServiceImpl() {
   // Service object must be destroyed to clear all pending RPCs before shutting
   // down the RPC service.
   coord_service_ = nullptr;
-  static_cast<tsl::GrpcCoordinationServiceImpl*>(coord_rpc_service_.get())
+  static_cast<GrpcCoordinationServiceImpl*>(coord_rpc_service_.get())
       ->SetCoordinationServiceInstance(nullptr);
   coord_rpc_service_->Shutdown();
 }
diff --git a/third_party/xla/xla/pjrt/distributed/service.h b/third_party/xla/xla/pjrt/distributed/service.h
index 46ee00e7cf3efa..ef875e099559a0 100644
--- a/third_party/xla/xla/pjrt/distributed/service.h
+++ b/third_party/xla/xla/pjrt/distributed/service.h
@@ -27,7 +27,7 @@ limitations under the License.
 #include "grpcpp/grpcpp.h"
 #include "grpcpp/security/server_credentials.h"
 #include "grpcpp/server_builder.h"
-#include "xla/tsl/distributed_runtime/coordination/coordination_service.h"
+#include "xla/pjrt/distributed/coordination/coordination_service.h"
 #include "xla/tsl/distributed_runtime/rpc/async_service_interface.h"
 #include "xla/types.h"
 #include "tsl/platform/env.h"
@@ -72,7 +72,7 @@ class CoordinationServiceImpl {
 
  private:
   tsl::Env* env_ = nullptr;  // Not owned.
-  std::unique_ptr<tsl::CoordinationService> coord_service_;
+  std::unique_ptr<CoordinationService> coord_service_;
   std::unique_ptr<tsl::thread::ThreadPool> coord_compute_pool_;
   std::unique_ptr<tsl::AsyncServiceInterface> coord_rpc_service_;
   std::unique_ptr<tsl::Thread> coord_rpc_thread_;
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 1b082cc7a37f5f..ddecf07a49901f 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -128,7 +128,6 @@ cc_library(
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/distributed_runtime/coordination:coordination_service",
-        "//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
         "//xla/tsl/framework:allocator",
         "//xla/tsl/framework:bfc_allocator",
         "//xla/tsl/framework:device_id",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index e210a480bc74dd..adbe4c805fb546 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -111,7 +111,6 @@ limitations under the License.
 #include "xla/tsl/concurrency/async_value.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/ref_count.h"
-#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/status.h"
diff --git a/third_party/xla/xla/python/pjrt_ifrt/BUILD b/third_party/xla/xla/python/pjrt_ifrt/BUILD
index 46451a959629d5..afc3b0ba21baa4 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/BUILD
+++ b/third_party/xla/xla/python/pjrt_ifrt/BUILD
@@ -342,6 +342,7 @@ cc_library(
         "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/pjrt/distributed:protocol_proto_cc",
         "//xla/pjrt/distributed:topology_util",
+        "//xla/pjrt/distributed/coordination:coordination_service_agent",
         "//xla/python/ifrt",
         "//xla/python/ifrt:attribute_map",
         "//xla/python/ifrt:basic_device_list",
@@ -354,7 +355,6 @@ cc_library(
         "//xla/tsl/concurrency:future",
         "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/distributed_runtime:call_options",
-        "//xla/tsl/distributed_runtime/coordination:coordination_service_agent",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc
index c2d4458ba0663f..d906dd44f5bf78 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc
@@ -48,6 +48,7 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/layout_util.h"
 #include "xla/literal.h"
+#include "xla/pjrt/distributed/coordination/coordination_service_agent.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/distributed/topology_util.h"
 #include "xla/pjrt/host_memory_spaces.h"
@@ -90,7 +91,6 @@ limitations under the License.
 #include "xla/tsl/concurrency/future.h"
 #include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/distributed_runtime/call_options.h"
-#include "xla/tsl/distributed_runtime/coordination/coordination_service_agent.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/logging.h"
@@ -869,7 +869,7 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> PjRtClient::Create(
 
   // Start a background thread to monitor the status of all processes.
   if (client->distributed_client_) {
-    absl::StatusOr<tsl::CoordinationServiceAgent*> agent =
+    absl::StatusOr<xla::CoordinationServiceAgent*> agent =
         client->distributed_client_->GetCoordinationServiceAgent();
     if (agent.ok()) {
       client->global_process_info_thread_.reset(
@@ -1497,7 +1497,7 @@ CrossHostTransferKey PjRtClient::CreateNewTransferKey() {
 }
 
 absl::Status PjRtClient::WatchGlobalProcessInfo(
-    tsl::CoordinationServiceAgent& agent) {
+    xla::CoordinationServiceAgent& agent) {
   TF_ASSIGN_OR_RETURN(tensorflow::CoordinatedTask task, agent.GetOwnTask());
   VLOG(3) << "Watching global process info for task "
           << task.ShortDebugString();
@@ -1763,7 +1763,7 @@ PjRtClient::Incarnations() const {
   if (!distributed_client_) {
     return absl::FailedPreconditionError("missing distributed client");
   }
-  TF_ASSIGN_OR_RETURN(tsl::CoordinationServiceAgent * agent,
+  TF_ASSIGN_OR_RETURN(xla::CoordinationServiceAgent * agent,
                       distributed_client_->GetCoordinationServiceAgent());
   return agent->Incarnations();
 }
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.h
index 040775d7a98dd1..7aa3436584ab75 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.h
@@ -418,7 +418,7 @@ class PjRtClient final
   // If true, the backend implements the cross-host transfer APIs.
   bool pjrt_supports_cross_host_transfers_ = false;
 
-  absl::Status WatchGlobalProcessInfo(tsl::CoordinationServiceAgent& agent);
+  absl::Status WatchGlobalProcessInfo(xla::CoordinationServiceAgent& agent);
 
   std::atomic<int64_t> next_transfer_key_ = 0;
   std::shared_ptr<xla::DistributedRuntimeClient> distributed_client_;
diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 86f9d46b359be0..46d996d59c1a9b 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,8 +18,6 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER \
-  38  // `xla::ifrt::Executable::Create()` can directly take an MLIR module and
-      // compile it.
+#define JAX_IFRT_VERSION_NUMBER 39  // New coordination service implementation.
 
 #endif  // XLA_PYTHON_VERSION_H_

From a785c3d841b01de588c738469112ee883a69d3c0 Mon Sep 17 00:00:00 2001
From: Changhui Lin <changhuilin@google.com>
Date: Tue, 9 Dec 2025 13:47:26 -0800
Subject: [PATCH 095/753] Temporarily disable the registration of GPU
 compilation environment processing.

PiperOrigin-RevId: 842380990
---
 .../xla/xla/service/gpu_compilation_environment.cc        | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/gpu_compilation_environment.cc b/third_party/xla/xla/service/gpu_compilation_environment.cc
index 6fd49f3a55e1b2..574f6c735b98c6 100644
--- a/third_party/xla/xla/service/gpu_compilation_environment.cc
+++ b/third_party/xla/xla/service/gpu_compilation_environment.cc
@@ -132,9 +132,11 @@ ProcessNewGpuCompilationEnvironment(
 }  // namespace xla
 
 static bool InitModule() {
-  xla::CompilationEnvironments::RegisterProcessNewEnvFn(
-      xla::GpuCompilationEnvironment::descriptor(),
-      xla::ProcessNewGpuCompilationEnvironment);
+  // TODO(b/284274097): Enable the registration once GPU compilation environment
+  // is well supported.
+  // xla::CompilationEnvironments::RegisterProcessNewEnvFn(
+  //     xla::GpuCompilationEnvironment::descriptor(),
+  //     xla::ProcessNewGpuCompilationEnvironment);
   return true;
 }
 static bool module_initialized = InitModule();

From 5419d8d12281448dff692c2f13047e31df5085cc Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 9 Dec 2025 14:12:20 -0800
Subject: [PATCH 096/753] [xla] Migrate to se::DeviceMemoryAddress

Reverts 5b7e6e0d94c58afd3756c2b69e28a4d32e9ba9fa

PiperOrigin-RevId: 842391837
---
 tensorflow/compiler/jit/xla_tensor.cc         |  2 +-
 .../runtime/gpublas_lt_matmul_thunk_test.cc   |  3 +-
 third_party/xla/xla/client/BUILD              |  4 +-
 third_party/xla/xla/client/client_library.h   |  2 +-
 third_party/xla/xla/client/local_client.cc    |  4 +-
 third_party/xla/xla/client/local_client.h     |  4 +-
 third_party/xla/xla/core/collectives/BUILD    |  2 +-
 .../xla/xla/core/collectives/communicator.h   | 40 ++++++++--------
 third_party/xla/xla/ffi/BUILD                 | 10 ++--
 third_party/xla/xla/ffi/api/BUILD             |  4 +-
 third_party/xla/xla/ffi/api/c_api_internal.h  |  2 +-
 third_party/xla/xla/ffi/api/ffi_test.cc       | 48 ++++++++++---------
 third_party/xla/xla/ffi/call_frame.cc         | 20 ++++----
 third_party/xla/xla/ffi/call_frame.h          | 14 +++---
 third_party/xla/xla/ffi/call_frame_test.cc    | 20 ++++----
 third_party/xla/xla/ffi/ffi.h                 | 12 ++---
 third_party/xla/xla/ffi/ffi_api.cc            |  4 +-
 third_party/xla/xla/ffi/ffi_test.cc           | 32 ++++++-------
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  4 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc  | 23 +++++----
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  4 +-
 .../xla/pjrt/pjrt_stream_executor_client.h    |  4 +-
 .../xla/pjrt/tracked_device_buffer_test.cc    |  3 +-
 third_party/xla/xla/service/BUILD             |  2 +
 .../xla/service/maybe_owning_device_memory.h  |  2 +
 third_party/xla/xla/tests/BUILD               | 16 +++----
 .../xla/xla/tests/buffer_donation_test.cc     | 12 ++---
 .../xla/xla/tests/collective_ops_ffi_test.cc  |  2 +-
 third_party/xla/xla/tests/hlo_test_base.cc    |  4 +-
 third_party/xla/xla/tests/hlo_test_base.h     |  6 +--
 .../xla/tests/local_client_execute_test.cc    |  2 +-
 .../xla/xla/tests/local_client_test_base.cc   |  8 ++--
 .../xla/xla/tests/local_client_test_base.h    |  8 ++--
 .../xla/xla/tests/transfer_manager_test.cc    |  2 +-
 third_party/xla/xla/tools/BUILD               |  2 +-
 35 files changed, 173 insertions(+), 158 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index e9cdad219dd28d..d6792cd7802d96 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -55,7 +55,7 @@ absl::Status XlaTensor::AllocateShapedBuffer(DataType dtype,
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
+    TF_ASSIGN_OR_RETURN(se::ScopedDeviceAddress<uint8_t> buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false,
                             subshape.layout().memory_space()));
diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
index ccdf653ca1862e..77a6ac88f8ff70 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h"
 
 #include <cstddef>
+#include <cstdint>
 #include <deque>
 #include <memory>
 #include <optional>
@@ -182,7 +183,7 @@ class GpuBlasLtThunkBuilder {
   se::StreamExecutorMemoryAllocator allocator_;
   se::GpuComputeCapability gpu_comp_;
   std::deque<BufferAllocation> allocs_;
-  std::vector<se::OwningDeviceMemory> mem_buffers_;
+  std::vector<se::ScopedDeviceAddress<uint8_t>> mem_buffers_;
 };
 
 void GpuBlasLtMatmulThunkTest::CreateExecuteThunksFromHLO(
diff --git a/third_party/xla/xla/client/BUILD b/third_party/xla/xla/client/BUILD
index c2801fa3fa8410..fac2d9343ff1d0 100644
--- a/third_party/xla/xla/client/BUILD
+++ b/third_party/xla/xla/client/BUILD
@@ -128,7 +128,7 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:source_map_util",
         "//xla/service:stream_pool",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -181,7 +181,7 @@ cc_library(
         "//xla/service:compile_only_service",
         "//xla/service:local_service",
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:logging",
diff --git a/third_party/xla/xla/client/client_library.h b/third_party/xla/xla/client/client_library.h
index 0e4f3a9a24dd22..42d0f34202e092 100644
--- a/third_party/xla/xla/client/client_library.h
+++ b/third_party/xla/xla/client/client_library.h
@@ -36,7 +36,7 @@ limitations under the License.
 #include "xla/client/local_client.h"
 #include "xla/service/compile_only_service.h"
 #include "xla/service/local_service.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/types.h"
diff --git a/third_party/xla/xla/client/local_client.cc b/third_party/xla/xla/client/local_client.cc
index cc383a9aa81b34..e1f348a755521d 100644
--- a/third_party/xla/xla/client/local_client.cc
+++ b/third_party/xla/xla/client/local_client.cc
@@ -45,7 +45,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
@@ -512,7 +512,7 @@ absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::LoadInternal(
 
 absl::StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    se::DeviceMemoryAllocator* allocator) {
+    se::DeviceAddressAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 3ccda5d43f6794..3c237ef37a1973 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/stream_pool.h"
 #include "xla/shape_tree.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -183,7 +183,7 @@ class LocalClient : public Client {
   // device is used.
   absl::StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
-      se::DeviceMemoryAllocator* allocator = nullptr);
+      se::DeviceAddressAllocator* allocator = nullptr);
 
   // Transfer the BorrowingLiteral to the device with the given ordinal.
   absl::StatusOr<GlobalDataHandle> TransferToLocalServer(
diff --git a/third_party/xla/xla/core/collectives/BUILD b/third_party/xla/xla/core/collectives/BUILD
index 1b0398aaaf4801..06d3ef7f6c9aed 100644
--- a/third_party/xla/xla/core/collectives/BUILD
+++ b/third_party/xla/xla/core/collectives/BUILD
@@ -73,7 +73,7 @@ cc_library(
         "//xla:future",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/core/collectives/communicator.h b/third_party/xla/xla/core/collectives/communicator.h
index 0f60a859db854d..4be35fb52163f7 100644
--- a/third_party/xla/xla/core/collectives/communicator.h
+++ b/third_party/xla/xla/core/collectives/communicator.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include "xla/core/collectives/rank_id.h"
 #include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 
@@ -65,7 +65,7 @@ class Communicator {
   // Register `buffer_range` once for efficient collective operations (i.e. on
   // NCCL backend it registers the buffer for zero-copy collective operations).
   //
-  virtual absl::Status RegisterBufferOnce(se::DeviceMemoryBase buffer_range,
+  virtual absl::Status RegisterBufferOnce(se::DeviceAddressBase buffer_range,
                                           int device_ordinal,
                                           bool use_symmetric_buffer) {
     return Unimplemented("User-managed buffer registration is not supported");
@@ -91,40 +91,40 @@ class Communicator {
 
   // Reduce buffers of length `count` in `send_buff` using `reduction_kind`
   // reduction and leaves identical copies of the result on each `recv_buff`.
-  virtual Future<> AllReduce(stream_executor::DeviceMemoryBase send_buffer,
-                             stream_executor::DeviceMemoryBase recv_buffer,
+  virtual Future<> AllReduce(stream_executor::DeviceAddressBase send_buffer,
+                             stream_executor::DeviceAddressBase recv_buffer,
                              PrimitiveType dtype, size_t count,
                              ReductionKind reduction_kind,
                              const Executor& executor) = 0;
 
   // Copy data in `send_buff` from the root device to the `recv_buff` on
   // all other devices.
-  virtual Future<> Broadcast(se::DeviceMemoryBase send_buffer,
-                             se::DeviceMemoryBase recv_buffer,
+  virtual Future<> Broadcast(se::DeviceAddressBase send_buffer,
+                             se::DeviceAddressBase recv_buffer,
                              PrimitiveType dtype, size_t count, RankId root,
                              const Executor& executor) = 0;
 
   // Reduce data in `send_buff` from all devices using the `reduction_kind`
   // operation and leave the reduced result scattered over the devices so that
   // the `recv_buff` on rank `i` will contain the i-th block of the result.
-  virtual Future<> ReduceScatter(se::DeviceMemoryBase send_buffer,
-                                 se::DeviceMemoryBase recv_buffer,
+  virtual Future<> ReduceScatter(se::DeviceAddressBase send_buffer,
+                                 se::DeviceAddressBase recv_buffer,
                                  PrimitiveType dtype, size_t count,
                                  ReductionKind reduction_kind,
                                  const Executor& executor) = 0;
 
   // Gather `count` values from all devices into `recv_buffer`, receiving data
   // from rank `i` at offset `i * sendcount`.
-  virtual Future<> AllGather(se::DeviceMemoryBase send_buffer,
-                             se::DeviceMemoryBase recv_buffer,
+  virtual Future<> AllGather(se::DeviceAddressBase send_buffer,
+                             se::DeviceAddressBase recv_buffer,
                              PrimitiveType dtype, size_t count,
                              const Executor& executor) = 0;
 
   // Sends data from `send_buffer` to `target_ranks` and receives data from
   // `source_rank` into `recv_buffer`. If `source_rank` is not specified, the
   // output is filled with zeros.
-  virtual Future<> CollectivePermute(se::DeviceMemoryBase send_buffer,
-                                     se::DeviceMemoryBase recv_buffer,
+  virtual Future<> CollectivePermute(se::DeviceAddressBase send_buffer,
+                                     se::DeviceAddressBase recv_buffer,
                                      PrimitiveType dtype, size_t count,
                                      std::optional<RankId> source_rank,
                                      absl::Span<const RankId> target_ranks,
@@ -133,30 +133,30 @@ class Communicator {
   // Sends `count` values from `send_buffers` to other ranks and receives data
   // from other ranks into `recv_buffers`.
   virtual Future<> AllToAll(
-      absl::InlinedVector<se::DeviceMemoryBase, 4> send_buffers,
-      absl::InlinedVector<se::DeviceMemoryBase, 4> recv_buffers,
+      absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
+      absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
       PrimitiveType dtype, size_t count, const Executor& executor) = 0;
 
   // Send data from `send_buff` to rank `peer`.
-  virtual Future<> Send(se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Send(se::DeviceAddressBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer,
                         const Executor& executor) = 0;
 
   // Receive data from rank `peer` into `recv_buff`.
-  virtual Future<> Recv(se::DeviceMemoryBase recv_buffer, PrimitiveType dtype,
+  virtual Future<> Recv(se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
                         size_t count, RankId peer,
                         const Executor& executor) = 0;
 
   // Send data from `send_buff` to rank `recv_buff` (one-way send).
-  virtual Future<> Send(se::DeviceMemoryBase recv_buffer,
-                        se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Send(se::DeviceAddressBase recv_buffer,
+                        se::DeviceAddressBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer, const Executor& executor) {
     return Unimplemented("One-way send is not implemented");
   }
 
   // Receive data from rank `peer` into `recv_buff` (one-way recv).
-  virtual Future<> Recv(se::DeviceMemoryBase recv_buffer,
-                        se::DeviceMemoryBase send_buffer, PrimitiveType dtype,
+  virtual Future<> Recv(se::DeviceAddressBase recv_buffer,
+                        se::DeviceAddressBase send_buffer, PrimitiveType dtype,
                         size_t count, RankId peer, const Executor& executor) {
     return Unimplemented("One-way recv is not implemented");
   }
diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD
index 41c825e3599ea2..f14764091594bc 100644
--- a/third_party/xla/xla/ffi/BUILD
+++ b/third_party/xla/xla/ffi/BUILD
@@ -37,7 +37,7 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/platform:errors",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -58,7 +58,7 @@ xla_cc_test(
         ":call_frame",
         "//xla:xla_data_proto_cc",
         "//xla/ffi/api:c_api",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:test_benchmark",
@@ -149,7 +149,7 @@ cc_library(
         "//xla/ffi/api:c_api",
         "//xla/ffi/api:c_api_internal",
         "//xla/hlo/ir:hlo",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/tsl/concurrency:async_value",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -182,8 +182,8 @@ cc_library(
         "//xla/ffi/api:c_api_internal",
         "//xla/hlo/ir:hlo",
         "//xla/service:platform_util",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_address_allocator",
-        "//xla/stream_executor:device_memory",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
@@ -299,7 +299,7 @@ xla_cc_test(
         "//xla/backends/cpu:ffi",
         "//xla/backends/gpu:ffi",
         "//xla/ffi/api:c_api",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/core:status_test_util",
diff --git a/third_party/xla/xla/ffi/api/BUILD b/third_party/xla/xla/ffi/api/BUILD
index 41889027b9ddd3..dc4551d8e2fecc 100644
--- a/third_party/xla/xla/ffi/api/BUILD
+++ b/third_party/xla/xla/ffi/api/BUILD
@@ -91,8 +91,8 @@ xla_cc_test(
         "//xla/ffi:execution_state",
         "//xla/ffi:ffi_api",
         "//xla/ffi:type_registry",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/ffi/api/c_api_internal.h b/third_party/xla/xla/ffi/api/c_api_internal.h
index d0baf4fc3b7bb0..d9070080f3a4a6 100644
--- a/third_party/xla/xla/ffi/api/c_api_internal.h
+++ b/third_party/xla/xla/ffi/api/c_api_internal.h
@@ -93,7 +93,7 @@ typedef XLA_FFI_Error* XLA_FFI_INTERNAL_IntraOpThreadPool_Get(
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_Stream_Get(
     XLA_FFI_ExecutionContext* ctx, void** stream);
 
-// Returns a pointer to device memory allocator (`se::DeviceMemoryAllocator`
+// Returns a pointer to device memory allocator (`se::DeviceAddressAllocator`
 // pointer) which allows to allocate memory inside a custom call from the same
 // allocator as XLA (i.e. it allows to construct scratch memory allocator).
 typedef XLA_FFI_Error* XLA_FFI_INTERNAL_DeviceMemoryAllocator_Get(
diff --git a/third_party/xla/xla/ffi/api/ffi_test.cc b/third_party/xla/xla/ffi/api/ffi_test.cc
index e3345ebe915146..81578f564956fd 100644
--- a/third_party/xla/xla/ffi/api/ffi_test.cc
+++ b/third_party/xla/xla/ffi/api/ffi_test.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "xla/ffi/ffi_api.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/primitive_util.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/lib/core/status_test_util.h"
@@ -522,7 +522,7 @@ TEST(FfiTest, DeviceOrdinal) {
 
 TEST(FfiTest, AnyBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -544,7 +544,7 @@ TEST(FfiTest, AnyBufferArgument) {
 
 TEST(FfiTest, BufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -562,7 +562,7 @@ TEST(FfiTest, BufferArgument) {
 
 TEST(FfiTest, AnyBufferResult) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -594,7 +594,7 @@ TEST(FfiTest, MissingBufferArgument) {
 
 TEST(FfiTest, WrongRankBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -611,7 +611,7 @@ TEST(FfiTest, WrongRankBufferArgument) {
 
 TEST(FfiTest, WrongTypeBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::S32, /*dims=*/{2, 2});
@@ -648,7 +648,7 @@ TEST(FfiTest, WrongNumberOfArguments) {
 
 TEST(FfiTest, TokenArgument) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceMemoryBase(), PrimitiveType::TOKEN,
+  builder.AddBufferArg(se::DeviceAddressBase(), PrimitiveType::TOKEN,
                        /*dims=*/{});
   auto call_frame = builder.Build();
 
@@ -665,7 +665,7 @@ TEST(FfiTest, TokenArgument) {
 
 TEST(FfiTest, RemainingArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -694,7 +694,7 @@ TEST(FfiTest, RemainingArgs) {
 
 TEST(FfiTest, RemainingRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/2);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -724,7 +724,7 @@ TEST(FfiTest, RemainingRets) {
 
 TEST(FfiTest, OptionalArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -785,7 +785,7 @@ TEST(FfiTest, OptionalArgs) {
 
 TEST(FfiTest, OptionalRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -854,7 +854,7 @@ TEST(FfiTest, AutoBinding) {
   });
 
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder::AttributesBuilder attrs;
   attrs.Insert(kI32, 42);
@@ -873,7 +873,8 @@ TEST(FfiTest, AutoBindingResult) {
       Ffi::BindTo(+[](Result<AnyBuffer> buffer) { return Error::Success(); });
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
-  builder.AddBufferRet(se::DeviceMemoryBase(), PrimitiveType::F32, /*dims=*/{});
+  builder.AddBufferRet(se::DeviceAddressBase(), PrimitiveType::F32,
+                       /*dims=*/{});
   auto call_frame = builder.Build();
 
   auto status = Call(*handler, call_frame);
@@ -1409,19 +1410,22 @@ TEST(FfiTest, ScratchAllocator) {
   static void* kAddr = reinterpret_cast<void*>(0xDEADBEEF);
 
   // A test only memory allocator that returns a fixed memory address.
-  struct TestDeviceMemoryAllocator final : public se::DeviceMemoryAllocator {
+  struct TestDeviceMemoryAllocator final : public se::DeviceAddressAllocator {
     size_t count;
 
     TestDeviceMemoryAllocator()
-        : se::DeviceMemoryAllocator(nullptr), count(0) {}
+        : se::DeviceAddressAllocator(nullptr), count(0) {}
 
-    absl::StatusOr<se::OwningDeviceMemory> Allocate(int, uint64_t size, bool,
-                                                    int64_t) final {
+    absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(int,
+                                                              uint64_t size,
+                                                              bool,
+                                                              int64_t) final {
       count++;
-      return se::OwningDeviceMemory(se::DeviceMemoryBase(kAddr, size), 0, this);
+      return se::ScopedDeviceAddress<uint8_t>(
+          se::DeviceAddressBase(kAddr, size), 0, this);
     }
 
-    absl::Status Deallocate(int, se::DeviceMemoryBase mem) final {
+    absl::Status Deallocate(int, se::DeviceAddressBase mem) final {
       count--;
       EXPECT_EQ(mem.opaque(), kAddr);
       return absl::OkStatus();
@@ -1588,7 +1592,7 @@ XLA_FFI_DEFINE_HANDLER_SYMBOL(BufferR2F32Handler, BufferR2F32Function);
 
 TEST(FfiTest, DefineAutoSymbol) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -1604,7 +1608,7 @@ TEST(FfiTest, DefineAutoSymbol) {
 //===----------------------------------------------------------------------===//
 
 static CallFrameBuilder WithBufferArgs(size_t num_args, size_t rank = 4) {
-  se::DeviceMemoryBase memory;
+  se::DeviceAddressBase memory;
   std::vector<int64_t> dims(4, 1);
 
   CallFrameBuilder builder(/*num_args=*/num_args, /*num_rets=*/0);
diff --git a/third_party/xla/xla/ffi/call_frame.cc b/third_party/xla/xla/ffi/call_frame.cc
index ad7c71c98f8cd6..f0c17215c2dafd 100644
--- a/third_party/xla/xla/ffi/call_frame.cc
+++ b/third_party/xla/xla/ffi/call_frame.cc
@@ -35,7 +35,7 @@ limitations under the License.
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/api/c_api_internal.h"  // IWYU pragma: keep
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
@@ -47,7 +47,7 @@ namespace xla::ffi {
 //===----------------------------------------------------------------------===//
 
 struct CallFrameBuilder::Buffer {
-  se::DeviceMemoryBase memory;
+  se::DeviceAddressBase memory;
   PrimitiveType type;
   absl::InlinedVector<int64_t, 4> dims;
 };
@@ -84,7 +84,7 @@ CallFrameBuilder::CallFrameBuilder(size_t num_args, size_t num_rets) {
 
 CallFrameBuilder::~CallFrameBuilder() = default;
 
-void CallFrameBuilder::AddBufferArg(se::DeviceMemoryBase memory,
+void CallFrameBuilder::AddBufferArg(se::DeviceAddressBase memory,
                                     PrimitiveType type,
                                     absl::Span<const int64_t> dims) {
   DCHECK(args_.capacity() > args_.size())
@@ -95,10 +95,10 @@ void CallFrameBuilder::AddBufferArg(se::DeviceMemoryBase memory,
 void CallFrameBuilder::AddTokenArg() {
   DCHECK(args_.capacity() > args_.size())
       << "CallFrame builder `num_args` argument was too small";
-  args_.push_back(Buffer{se::DeviceMemoryBase(), PrimitiveType::TOKEN, {}});
+  args_.push_back(Buffer{se::DeviceAddressBase(), PrimitiveType::TOKEN, {}});
 }
 
-void CallFrameBuilder::AddBufferRet(se::DeviceMemoryBase memory,
+void CallFrameBuilder::AddBufferRet(se::DeviceAddressBase memory,
                                     PrimitiveType type,
                                     absl::Span<const int64_t> dims) {
   DCHECK(rets_.capacity() > rets_.size())
@@ -109,7 +109,7 @@ void CallFrameBuilder::AddBufferRet(se::DeviceMemoryBase memory,
 void CallFrameBuilder::AddTokenRet() {
   DCHECK(rets_.capacity() > rets_.size())
       << "CallFrame builder `num_rets` argument was too small";
-  rets_.push_back(Buffer{se::DeviceMemoryBase(), PrimitiveType::TOKEN, {}});
+  rets_.push_back(Buffer{se::DeviceAddressBase(), PrimitiveType::TOKEN, {}});
 }
 
 void CallFrameBuilder::AddAttributes(AttributesMap attrs) {
@@ -557,8 +557,8 @@ std::unique_ptr<CallFrame::Attributes> CallFrame::FixUpAttrs(
 //===----------------------------------------------------------------------===//
 
 absl::Status CallFrame::UpdateWithBuffers(
-    absl::Span<const se::DeviceMemoryBase> args,
-    absl::Span<const se::DeviceMemoryBase> rets) {
+    absl::Span<const se::DeviceAddressBase> args,
+    absl::Span<const se::DeviceAddressBase> rets) {
   if (ABSL_PREDICT_FALSE(args.size() != arguments_->args.size())) {
     return InvalidArgument("Invalid number of updated arguments: %d vs %d",
                            args.size(), arguments_->args.size());
@@ -587,8 +587,8 @@ CallFrame CallFrame::Copy() const {
 }
 
 absl::StatusOr<CallFrame> CallFrame::CopyWithBuffers(
-    absl::Span<const se::DeviceMemoryBase> args,
-    absl::Span<const se::DeviceMemoryBase> rets) const {
+    absl::Span<const se::DeviceAddressBase> args,
+    absl::Span<const se::DeviceAddressBase> rets) const {
   CallFrame clone(CopyArgs(*arguments_), CopyRets(*results_), attributes_);
   TF_RETURN_IF_ERROR(clone.UpdateWithBuffers(args, rets));
   return clone;
diff --git a/third_party/xla/xla/ffi/call_frame.h b/third_party/xla/xla/ffi/call_frame.h
index 32dceead1d9b4b..5433d4be990d42 100644
--- a/third_party/xla/xla/ffi/call_frame.h
+++ b/third_party/xla/xla/ffi/call_frame.h
@@ -30,7 +30,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/types.h"  // IWYU pragma: keep
 #include "xla/xla_data.pb.h"
 
@@ -76,12 +76,12 @@ class CallFrameBuilder {
 
   CallFrame Build();
 
-  void AddBufferArg(se::DeviceMemoryBase memory, PrimitiveType type,
+  void AddBufferArg(se::DeviceAddressBase memory, PrimitiveType type,
                     absl::Span<const int64_t> dims);
 
   void AddTokenArg();
 
-  void AddBufferRet(se::DeviceMemoryBase memory, PrimitiveType type,
+  void AddBufferRet(se::DeviceAddressBase memory, PrimitiveType type,
                     absl::Span<const int64_t> dims);
 
   void AddTokenRet();
@@ -117,16 +117,16 @@ class CallFrame {
   // array (buffer) arguments and results are known at compile time. Instead of
   // rebuilding the call frame from scratch on every execution, we can just
   // update the arguments and results with new pointers to device memory.
-  absl::Status UpdateWithBuffers(absl::Span<const se::DeviceMemoryBase> args,
-                                 absl::Span<const se::DeviceMemoryBase> rets);
+  absl::Status UpdateWithBuffers(absl::Span<const se::DeviceAddressBase> args,
+                                 absl::Span<const se::DeviceAddressBase> rets);
 
   // Creates a copy of the call frame.
   CallFrame Copy() const;
 
   // Creates a copy of the call frame with updated arguments and results.
   absl::StatusOr<CallFrame> CopyWithBuffers(
-      absl::Span<const se::DeviceMemoryBase> args,
-      absl::Span<const se::DeviceMemoryBase> rets) const;
+      absl::Span<const se::DeviceAddressBase> args,
+      absl::Span<const se::DeviceAddressBase> rets) const;
 
   // Builds an XLA_FFI_CallFrame from owned arguments and attributes.
   XLA_FFI_CallFrame Build(
diff --git a/third_party/xla/xla/ffi/call_frame_test.cc b/third_party/xla/xla/ffi/call_frame_test.cc
index f73461fc7d297f..b58e2d9a2537b6 100644
--- a/third_party/xla/xla/ffi/call_frame_test.cc
+++ b/third_party/xla/xla/ffi/call_frame_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "xla/ffi/api/c_api.h"
 #include "xla/ffi/attribute_map.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/test_benchmark.h"
@@ -34,8 +34,8 @@ limitations under the License.
 namespace xla::ffi {
 
 TEST(CallFrameTest, UpdateCallFrame) {
-  se::DeviceMemoryBase mem0(reinterpret_cast<void*>(0x12345678), 1024);
-  se::DeviceMemoryBase mem1(reinterpret_cast<void*>(0x87654321), 1024);
+  se::DeviceAddressBase mem0(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase mem1(reinterpret_cast<void*>(0x87654321), 1024);
 
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
@@ -116,7 +116,7 @@ TEST(CallFrameTest, UpdateCallFrame) {
 void BM_AddBufferArg(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   for (auto _ : state) {
@@ -151,17 +151,17 @@ void BM_AddAttributes(benchmark::State& state) {
 void BM_UpdateCallFrame(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   CallFrameBuilder builder(num_args, /*num_rets=*/0);
   for (size_t i = 0; i < num_args; ++i) {
-    builder.AddBufferArg(se::DeviceMemoryBase(nullptr, 1024),
+    builder.AddBufferArg(se::DeviceAddressBase(nullptr, 1024),
                          PrimitiveType::F32, dims);
   }
   CallFrame call_frame = builder.Build();
 
-  std::vector<se::DeviceMemoryBase> updated_args(num_args, memory);
+  std::vector<se::DeviceAddressBase> updated_args(num_args, memory);
 
   for (auto _ : state) {
     auto updated_call_frame =
@@ -173,17 +173,17 @@ void BM_UpdateCallFrame(benchmark::State& state) {
 void BM_UpdateCallFrameInPlace(benchmark::State& state) {
   size_t num_args = state.range(0);
 
-  se::DeviceMemoryBase memory(reinterpret_cast<void*>(0x12345678), 1024);
+  se::DeviceAddressBase memory(reinterpret_cast<void*>(0x12345678), 1024);
   std::vector<int64_t> dims = {1, 2, 3, 4};
 
   CallFrameBuilder builder(num_args, /*num_rets=*/0);
   for (size_t i = 0; i < num_args; ++i) {
-    builder.AddBufferArg(se::DeviceMemoryBase(nullptr, 1024),
+    builder.AddBufferArg(se::DeviceAddressBase(nullptr, 1024),
                          PrimitiveType::F32, dims);
   }
   CallFrame call_frame = builder.Build();
 
-  std::vector<se::DeviceMemoryBase> updated_args(num_args, memory);
+  std::vector<se::DeviceAddressBase> updated_args(num_args, memory);
 
   for (auto _ : state) {
     benchmark::DoNotOptimize(
diff --git a/third_party/xla/xla/ffi/ffi.h b/third_party/xla/xla/ffi/ffi.h
index da6303e14faef7..4e1849a190d327 100644
--- a/third_party/xla/xla/ffi/ffi.h
+++ b/third_party/xla/xla/ffi/ffi.h
@@ -50,7 +50,7 @@ limitations under the License.
 #include "xla/ffi/type_registry.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/primitive_util.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/types.h"  // IWYU pragma: keep
@@ -137,8 +137,8 @@ class AnyBuffer {
     return reinterpret_cast<T*>(buf_->data);
   }
 
-  se::DeviceMemoryBase device_memory() const {
-    return se::DeviceMemoryBase(untyped_data(), size_bytes());
+  se::DeviceAddressBase device_memory() const {
+    return se::DeviceAddressBase(untyped_data(), size_bytes());
   }
 
  private:
@@ -182,9 +182,9 @@ class Buffer {
     return reinterpret_cast<internal::NativeType<dtype>*>(untyped_data());
   }
 
-  se::DeviceMemory<internal::NativeType<dtype>> device_memory() const {
-    return se::DeviceMemory<internal::NativeType<dtype>>(
-        se::DeviceMemoryBase(untyped_data(), size_bytes()));
+  se::DeviceAddress<internal::NativeType<dtype>> device_memory() const {
+    return se::DeviceAddress<internal::NativeType<dtype>>(
+        se::DeviceAddressBase(untyped_data(), size_bytes()));
   }
 
  private:
diff --git a/third_party/xla/xla/ffi/ffi_api.cc b/third_party/xla/xla/ffi/ffi_api.cc
index 31287ac7587ef4..3f0de64033061e 100644
--- a/third_party/xla/xla/ffi/ffi_api.cc
+++ b/third_party/xla/xla/ffi/ffi_api.cc
@@ -47,8 +47,8 @@ limitations under the License.
 #include "xla/ffi/ffi_structs.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/service/platform_util.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_allocator.h"
-#include "xla/stream_executor/device_memory.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
 #include "xla/tsl/platform/logging.h"
@@ -795,7 +795,7 @@ static XLA_FFI_Error* XLA_FFI_DeviceMemory_Free(
 
   absl::Status status = gpu->allocator->Deallocate(
       args->ctx->device_ordinal,
-      stream_executor::DeviceMemoryBase(args->data, args->size));
+      stream_executor::DeviceAddressBase(args->data, args->size));
   if (!status.ok()) {
     return new XLA_FFI_Error{std::move(status)};
   }
diff --git a/third_party/xla/xla/ffi/ffi_test.cc b/third_party/xla/xla/ffi/ffi_test.cc
index 8f0b00244c0a93..0369c8cc1946e5 100644
--- a/third_party/xla/xla/ffi/ffi_test.cc
+++ b/third_party/xla/xla/ffi/ffi_test.cc
@@ -43,7 +43,7 @@ limitations under the License.
 #include "xla/ffi/execution_state.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/ffi/type_registry.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/chain.h"
@@ -179,7 +179,7 @@ TEST(FfiTest, CatchExceptionExplicit) {
 
 TEST(FfiTest, WrongNumArgs) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceMemoryBase(nullptr), PrimitiveType::F32, {});
+  builder.AddBufferArg(se::DeviceAddressBase(nullptr), PrimitiveType::F32, {});
   auto call_frame = builder.Build();
 
   auto handler = Ffi::Bind().Arg<AnyBuffer>().Arg<AnyBuffer>().To(
@@ -579,7 +579,7 @@ TEST(FfiTest, DecodingErrors) {
 
 TEST(FfiTest, AnyBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -614,7 +614,7 @@ TEST(FfiTest, AnyBufferArgument) {
 
 TEST(FfiTest, TypedAndRankedBufferArgument) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), storage.size() * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), storage.size() * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -642,8 +642,8 @@ TEST(FfiTest, TypedAndRankedBufferArgument) {
 
 TEST(FfiTest, ComplexBufferArgument) {
   std::vector<std::complex<float>> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(),
-                              storage.size() * sizeof(std::complex<float>));
+  se::DeviceAddressBase memory(storage.data(),
+                               storage.size() * sizeof(std::complex<float>));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::C64, /*dims=*/{2, 2});
@@ -662,7 +662,7 @@ TEST(FfiTest, ComplexBufferArgument) {
 
 TEST(FfiTest, TokenArgument) {
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
-  builder.AddBufferArg(se::DeviceMemoryBase(), PrimitiveType::TOKEN,
+  builder.AddBufferArg(se::DeviceAddressBase(), PrimitiveType::TOKEN,
                        /*dims=*/{});
   auto call_frame = builder.Build();
 
@@ -679,7 +679,7 @@ TEST(FfiTest, TokenArgument) {
 
 TEST(FfiTest, WrongRankBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -697,7 +697,7 @@ TEST(FfiTest, WrongRankBufferArgument) {
 
 TEST(FfiTest, WrongTypeBufferArgument) {
   std::vector<int32_t> storage(4, 0.0);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(int32_t));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(int32_t));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::S32, /*dims=*/{2, 2});
@@ -715,7 +715,7 @@ TEST(FfiTest, WrongTypeBufferArgument) {
 
 TEST(FfiTest, RemainingArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -743,7 +743,7 @@ TEST(FfiTest, RemainingArgs) {
 
 TEST(FfiTest, RemainingRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/2);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -772,7 +772,7 @@ TEST(FfiTest, RemainingRets) {
 
 TEST(FfiTest, OptionalArgs) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/1, /*num_rets=*/0);
   builder.AddBufferArg(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -833,7 +833,7 @@ TEST(FfiTest, OptionalArgs) {
 
 TEST(FfiTest, OptionalRets) {
   std::vector<float> storage(4, 0.0f);
-  se::DeviceMemoryBase memory(storage.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory(storage.data(), 4 * sizeof(float));
 
   CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/1);
   builder.AddBufferRet(memory, PrimitiveType::F32, /*dims=*/{2, 2});
@@ -975,8 +975,8 @@ TEST(FfiTest, UpdateBufferArgumentsAndResults) {
   std::vector<float> storage0(4, 0.0f);
   std::vector<float> storage1(4, 0.0f);
 
-  se::DeviceMemoryBase memory0(storage0.data(), 4 * sizeof(float));
-  se::DeviceMemoryBase memory1(storage1.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory0(storage0.data(), 4 * sizeof(float));
+  se::DeviceAddressBase memory1(storage1.data(), 4 * sizeof(float));
 
   std::vector<int64_t> dims = {2, 2};
 
@@ -1169,7 +1169,7 @@ TEST(FfiTest, PlatformStream) {
 //===----------------------------------------------------------------------===//
 
 static CallFrameBuilder WithBufferArgs(size_t num_args, size_t rank = 4) {
-  se::DeviceMemoryBase memory;
+  se::DeviceAddressBase memory;
   std::vector<int64_t> dims(4, 1);
 
   CallFrameBuilder builder(/*num_args=*/num_args, /*num_rets=*/0);
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index adbe4c805fb546..c2e23e6cd3a796 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -1969,7 +1969,7 @@ StreamExecutorGpuClient::RunAsync(
         const int64_t buffer_size = allocation.size();
         if (buffer_size > 0) {
           TF_ASSIGN_OR_RETURN(
-              se::OwningDeviceMemory owning_buffer,
+              se::ScopedDeviceAddress<uint8_t> owning_buffer,
               memory_allocator->Allocate(device_ordinal, buffer_size,
                                          /*retry_on_failure=*/true,
                                          /*memory_space=*/allocation.color()));
@@ -2034,7 +2034,7 @@ StreamExecutorGpuClient::RunAsync(
                "buffer is not donated; allocating a fresh buffer";
         int64_t allocation_size = ShapeUtil::ByteSizeOf(
             ShapeUtil::GetSubshape(gpu_exec->result_shape(), index));
-        absl::StatusOr<se::OwningDeviceMemory> allocated_buffer =
+        absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> allocated_buffer =
             memory_allocator->Allocate(device_ordinal, allocation_size,
                                        /*retry_on_failure=*/true,
                                        /*memory_space=*/allocation->color());
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
index 5e84506057c524..88fce7477ce884 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
@@ -775,16 +775,20 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
                 tuple_buffer.buffers().mutable_element({});
             VLOG(3) << "untuple: output_buffers[" << i
                     << "].emplace: " << elem->opaque();
-            output_buffers[i].emplace(stream_executor::OwningDeviceMemory(
-                *elem, device->local_device_id().value(), client->allocator()));
+            output_buffers[i].emplace(
+                stream_executor::ScopedDeviceAddress<uint8_t>(
+                    *elem, device->local_device_id().value(),
+                    client->allocator()));
             *elem = se::DeviceAddressBase();
           }
         } else {
           CHECK_EQ(output_buffers.size(), 1);
           auto* elem = output.buffers().mutable_element({});
           VLOG(3) << "output_buffers[0].emplace: " << elem->opaque();
-          output_buffers.front().emplace(stream_executor::OwningDeviceMemory(
-              *elem, device->local_device_id().value(), client->allocator()));
+          output_buffers.front().emplace(
+              stream_executor::ScopedDeviceAddress<uint8_t>(
+                  *elem, device->local_device_id().value(),
+                  client->allocator()));
           *elem = se::DeviceAddressBase();
         }
 
@@ -909,10 +913,11 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
                     << "]: " << tracked_buffers[i]->buffer()->buffer().opaque();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {i}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
-                           tracked_buffers[i]->buffer()->buffer(),
-                           device->local_hardware_id().value(),
-                           client->allocator())));
+                  {i},
+                  MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
+                      tracked_buffers[i]->buffer()->buffer(),
+                      device->local_hardware_id().value(),
+                      client->allocator())));
             } else {
               input.SetBuffer({i}, MaybeOwningDeviceAddress(
                                        tracked_buffers[i]->buffer()->buffer()));
@@ -928,7 +933,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtGpuExecutable::ExecuteHelper(
             ExecutionInput& input = inputs.back();
             if (buffer_is_donated[i]) {
               input.SetUnownedBuffer(
-                  {}, MaybeOwningDeviceAddress(se::OwningDeviceMemory(
+                  {}, MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
                           tracked_buffers[i]->buffer()->buffer(),
                           device->local_hardware_id().value(),
                           client->allocator())));
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index e342a586863001..d11f6e966f5ec2 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -1156,7 +1156,7 @@ MakeTupleHelper(PjRtStreamExecutorClient* client,
 
   se::Stream* stream = local_device->host_to_device_stream();
   TF_ASSIGN_OR_RETURN(
-      se::OwningDeviceMemory owned_root_table_memory,
+      se::ScopedDeviceAddress<uint8_t> owned_root_table_memory,
       allocator->Allocate(
           device_ordinal,
           transfer_manager->GetByteSizeRequirement(tupled_parameter_shape)));
@@ -1673,7 +1673,7 @@ PjRtStreamExecutorClient::RunAsync(
     auto it = tmp.MutableBuffers()->begin();
     for (auto& v : input) {
       if (v.second.is_donated) {
-        it->second = MaybeOwningDeviceAddress(se::OwningDeviceMemory(
+        it->second = MaybeOwningDeviceAddress(se::ScopedDeviceAddress<uint8_t>(
             v.second.buf->mem(), device->local_device_id().value(),
             run_options.allocator()));
         tmp.SetUnownedIndex(it->first);
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 4b656c48fc2517..4220db893cb1dc 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -91,8 +91,8 @@ struct PjRtStreamExecutorExecutionOutput {
   // Donated inputs which must be freed.
   std::vector<tsl::AsyncValueRef<RawSEDeviceMemory>> to_be_released;
   // For PjRtStreamExecutorClient implementations that
-  // use OwningDeviceMemory for donated inputs.
-  std::vector<se::OwningDeviceMemory> se_to_be_released;
+  // use ScopedDeviceAddress for donated inputs.
+  std::vector<se::ScopedDeviceAddress<uint8_t>> se_to_be_released;
 };
 
 class PjRtStreamExecutorDevice : public PjRtDevice {
diff --git a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
index d5bec6ba286977..2c1b89083b477d 100644
--- a/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/tracked_device_buffer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/pjrt/tracked_device_buffer.h"
 
+#include <cstdint>
 #include <memory>
 #include <utility>
 #include <vector>
@@ -90,7 +91,7 @@ absl::StatusOr<tsl::AsyncValueRef<RawSEDeviceMemory>> MakeArray(
       client->backend().transfer_manager()->HostShapeToDeviceShape(shape),
       [&](const Shape& subshape, const ShapeIndex&) -> absl::Status {
         TF_ASSIGN_OR_RETURN(
-            se::OwningDeviceMemory device_memory,
+            se::ScopedDeviceAddress<uint8_t> device_memory,
             client->backend().memory_allocator()->Allocate(
                 /*device_ordinal=*/0,
                 client->backend().transfer_manager()->GetByteSizeRequirement(
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index b5d097d79b4715..e5e8114809599e 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4113,6 +4113,8 @@ cc_library(
     hdrs = ["maybe_owning_device_memory.h"],
     deps = [
         ":maybe_owning_device_address",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/service/maybe_owning_device_memory.h b/third_party/xla/xla/service/maybe_owning_device_memory.h
index 897003ffb17429..40d05599971dcd 100644
--- a/third_party/xla/xla/service/maybe_owning_device_memory.h
+++ b/third_party/xla/xla/service/maybe_owning_device_memory.h
@@ -18,6 +18,8 @@ limitations under the License.
 
 #include "absl/base/macros.h"
 #include "xla/service/maybe_owning_device_address.h"
+#include "xla/stream_executor/device_address.h"  // IWYU pragma: keep
+#include "xla/stream_executor/device_address_allocator.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory.h"  // IWYU pragma: keep
 #include "xla/stream_executor/device_memory_allocator.h"  // IWYU pragma: keep
 
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 9f617478a6ea7b..4466fb094ab53d 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -184,7 +184,7 @@ cc_library(
         "//xla/service:hlo_runner_pjrt",
         "//xla/service:interpreter_plugin",  # reference backend
         "//xla/service:platform_util",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/tsl/lib/core:status_test_util",
@@ -451,8 +451,8 @@ cc_library(
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
         "//xla/service:transfer_manager",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
@@ -522,8 +522,8 @@ xla_test(
         "//xla/service:hlo_module_config",
         "//xla/service:maybe_owning_device_address",
         "//xla/service:shaped_buffer",
-        "//xla/stream_executor:device_memory",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
@@ -2989,7 +2989,7 @@ xla_test(
         "//xla/ffi:ffi_api",
         "//xla/ffi/api:c_api",
         "//xla/service:collective_ops_utils",
-        "//xla/stream_executor:device_memory",
+        "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:errors",
@@ -3520,7 +3520,7 @@ xla_test(
         "//xla/service:platform_util",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",
@@ -3660,7 +3660,7 @@ xla_test(
         "//xla/service:generic_transfer_manager",
         "//xla/service:shaped_buffer",
         "//xla/service:stream_pool",
-        "//xla/stream_executor:device_memory_allocator",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/tests:xla_test_backend_predicates",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test_benchmark",
diff --git a/third_party/xla/xla/tests/buffer_donation_test.cc b/third_party/xla/xla/tests/buffer_donation_test.cc
index 324917cbd57df6..870a7b659bcb27 100644
--- a/third_party/xla/xla/tests/buffer_donation_test.cc
+++ b/third_party/xla/xla/tests/buffer_donation_test.cc
@@ -45,8 +45,8 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -113,7 +113,7 @@ class BufferDonationTest : public HloTestBase {
         run_options, backend_->StreamBorrowerWithPriority());
 
     std::vector<ExecutionInput> args;
-    std::vector<ShapeTree<se::DeviceMemoryBase>> inputs_buffers;
+    std::vector<ShapeTree<se::DeviceAddressBase>> inputs_buffers;
 
     CHECK_EQ(argument_literals.size(), donate_arguments.size());
 
@@ -130,7 +130,7 @@ class BufferDonationTest : public HloTestBase {
       ShapedBuffer shaped_buffer = scoped_shaped_buffer.release();
       CHECK_OK(backend_->transfer_manager()->TransferLiteralToDevice(
           stream.get(), argument_literal, shaped_buffer));
-      ShapeTree<se::DeviceMemoryBase> input_buffers = shaped_buffer.buffers();
+      ShapeTree<se::DeviceAddressBase> input_buffers = shaped_buffer.buffers();
       inputs_buffers.push_back(input_buffers);
       ShapeTree<MaybeOwningDeviceAddress> owned_buffers(
           argument_literal.shape());
@@ -138,7 +138,7 @@ class BufferDonationTest : public HloTestBase {
           [&](const ShapeIndex& index,
               MaybeOwningDeviceAddress* device_memory) {
             if (donate_argument) {
-              *device_memory = se::OwningDeviceMemory(
+              *device_memory = se::ScopedDeviceAddress<uint8_t>(
                   input_buffers.element(index), executor_->device_ordinal(),
                   &memory_allocator);
             } else {
@@ -162,7 +162,7 @@ class BufferDonationTest : public HloTestBase {
     }
     ExecutionOutput output = std::move(output_status).value();
 
-    se::DeviceMemoryBase result_root_buffer = output.Result().root_buffer();
+    se::DeviceAddressBase result_root_buffer = output.Result().root_buffer();
     LOG(INFO) << "result allocation = " << result_root_buffer.opaque()
               << "             size = " << result_root_buffer.size();
 
diff --git a/third_party/xla/xla/tests/collective_ops_ffi_test.cc b/third_party/xla/xla/tests/collective_ops_ffi_test.cc
index f56ef7045eca7b..21d423965efc0e 100644
--- a/third_party/xla/xla/tests/collective_ops_ffi_test.cc
+++ b/third_party/xla/xla/tests/collective_ops_ffi_test.cc
@@ -33,7 +33,7 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_memory.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/collective_ops_e2e_test_base.h"
 #include "xla/tests/literal_test_util.h"
diff --git a/third_party/xla/xla/tests/hlo_test_base.cc b/third_party/xla/xla/tests/hlo_test_base.cc
index 6421e9badcbec7..dce925c25e28d0 100644
--- a/third_party/xla/xla/tests/hlo_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_test_base.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "xla/service/hlo_runner_pjrt.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
 #include "xla/tests/hlo_runner_agnostic_reference_mixin.h"
@@ -174,7 +174,7 @@ ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
                                   reference_preprocessor);
 }
 
-se::DeviceMemoryAllocator* HloTestBase::GetAllocator() {
+se::DeviceAddressAllocator* HloTestBase::GetAllocator() {
   if (allocator_ == nullptr) {
     allocator_ = std::make_unique<se::StreamExecutorMemoryAllocator>(
         backend().default_stream_executor());
diff --git a/third_party/xla/xla/tests/hlo_test_base.h b/third_party/xla/xla/tests/hlo_test_base.h
index 31efd1fc5ff2bb..c378860ec85a40 100644
--- a/third_party/xla/xla/tests/hlo_test_base.h
+++ b/third_party/xla/xla/tests/hlo_test_base.h
@@ -48,7 +48,7 @@ static_assert(false,
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo_runner.h"
 #include "xla/service/hlo_runner_interface.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/tests/hlo_runner_agnostic_reference_mixin.h"
 #include "xla/tests/hlo_runner_agnostic_test_base.h"
@@ -210,7 +210,7 @@ class ABSL_DEPRECATED(
   static se::Platform* GetTestPlatform();
 
   // Creates or retrieves the allocator.
-  se::DeviceMemoryAllocator* GetAllocator();
+  se::DeviceAddressAllocator* GetAllocator();
 
   ErrorSpec error_spec_{0.0001};
 
@@ -224,7 +224,7 @@ class ABSL_DEPRECATED(
               bool allow_mixed_precision_in_hlo_verifier,
               HloPredicate instruction_can_change_layout_func);
 
-  std::unique_ptr<se::DeviceMemoryAllocator> allocator_;
+  std::unique_ptr<se::DeviceAddressAllocator> allocator_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/tests/local_client_execute_test.cc b/third_party/xla/xla/tests/local_client_execute_test.cc
index ac4aec28517450..cb0675c889c052 100644
--- a/third_party/xla/xla/tests/local_client_execute_test.cc
+++ b/third_party/xla/xla/tests/local_client_execute_test.cc
@@ -31,7 +31,7 @@ limitations under the License.
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/host/host_platform_id.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"
diff --git a/third_party/xla/xla/tests/local_client_test_base.cc b/third_party/xla/xla/tests/local_client_test_base.cc
index 29563c202f26a2..957b24fc150f8e 100644
--- a/third_party/xla/xla/tests/local_client_test_base.cc
+++ b/third_party/xla/xla/tests/local_client_test_base.cc
@@ -43,8 +43,8 @@ limitations under the License.
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
 #include "xla/status_macros.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -57,7 +57,7 @@ namespace xla {
 
 /* static */ TestAllocator* LocalClientTestBase::allocator_;
 
-absl::StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
+absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> TestAllocator::Allocate(
     int device_ordinal, uint64_t size, bool retry_on_failure,
     int64_t memory_space) {
   VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
@@ -71,7 +71,7 @@ absl::StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
 }
 
 absl::Status TestAllocator::Deallocate(int device_ordinal,
-                                       se::DeviceMemoryBase mem) {
+                                       se::DeviceAddressBase mem) {
   VLOG(2) << "Deallocate(" << device_ordinal << ")";
   {
     absl::MutexLock lock(count_mutex_);
diff --git a/third_party/xla/xla/tests/local_client_test_base.h b/third_party/xla/xla/tests/local_client_test_base.h
index cb7de54135e8db..3afeae8c003d8c 100644
--- a/third_party/xla/xla/tests/local_client_test_base.h
+++ b/third_party/xla/xla/tests/local_client_test_base.h
@@ -37,8 +37,8 @@ limitations under the License.
 #include "xla/service/platform_util.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
-#include "xla/stream_executor/device_memory.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
@@ -53,11 +53,11 @@ class TestAllocator : public se::StreamExecutorMemoryAllocator {
       : se::StreamExecutorMemoryAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).value()) {}
 
-  absl::StatusOr<se::OwningDeviceMemory> Allocate(
+  absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(
       int device_ordinal, uint64_t size, bool retry_on_failure,
       int64_t memory_space) override;
   absl::Status Deallocate(int device_ordinal,
-                          se::DeviceMemoryBase mem) override;
+                          se::DeviceAddressBase mem) override;
 
   // Return the number of allocations that have been performed.
   int64_t allocation_count() const;
diff --git a/third_party/xla/xla/tests/transfer_manager_test.cc b/third_party/xla/xla/tests/transfer_manager_test.cc
index 6a4a188afd94fa..66d84eebb73fb7 100644
--- a/third_party/xla/xla/tests/transfer_manager_test.cc
+++ b/third_party/xla/xla/tests/transfer_manager_test.cc
@@ -34,7 +34,7 @@ limitations under the License.
 #include "xla/service/stream_pool.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/device_memory_allocator.h"
+#include "xla/stream_executor/device_address_allocator.h"
 #include "xla/tests/literal_test_util.h"
 #include "xla/tests/local_client_test_base.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 5f422444fd55e9..60993b0f7d19ab 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -1007,8 +1007,8 @@ tsl_gpu_library(
         "//xla/service/cpu:cpu_executable",
         "//xla/service/gpu:gpu_symbol_repository",
         "//xla/service/gpu/autotuning:autotuner_util",
+        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:device_description_proto_cc",
-        "//xla/stream_executor:device_memory_allocator",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor:stream_executor_memory_allocator",

From 7c723d06ce9f08be8823e2d5aedd80a90fac7dac Mon Sep 17 00:00:00 2001
From: Siqiao Wu <siqiaowu@google.com>
Date: Tue, 9 Dec 2025 15:54:41 -0800
Subject: [PATCH 097/753] Support rewriting PartitionCall to XlaLaunchV2 so
 that function call on tf.PartitionCall with `_XlaMustCompile` attribute will
 be launched on XLA-CPU.

PiperOrigin-RevId: 842430158
---
 .../compiler/mlir/tfrt/tests/xla_rewrite.mlir |  9 ++++++++-
 .../mlir/tfrt/transforms/xla_rewrite_pass.cc  | 19 ++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir b/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir
index 2118183569e6ed..24e195dbc9dc42 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/xla_rewrite.mlir
@@ -27,7 +27,14 @@ func.func @xla_launch(%arg: tensor<i32>, %v0: tensor<*x!tf_type.resource>, %v1:
       device = "/device:GPU:0", executor_type = "", f = @callee}
       : (tensor<i32>, tensor<i32>, tensor<*x!tf_type.resource>, tensor<i32>, tensor<*x!tf_type.resource>) -> tensor<i32>
 
-  func.return %r2 : tensor<i32>
+  // CHECK: tf.XlaLaunchV2
+  // CHECK-SAME: constants = [0, 3]
+  // CHECK-SAME: resources = [2, 4]
+  %r3 = "tf.PartitionedCall"(%c0, %r2, %v0, %c1, %v1) {_XlaMustCompile = true, config = "", config_proto = "",
+      device = "/device:CPU:0", executor_type = "", f = @callee}
+      : (tensor<i32>, tensor<i32>, tensor<*x!tf_type.resource>, tensor<i32>, tensor<*x!tf_type.resource>) -> tensor<i32>
+
+  func.return %r3 : tensor<i32>
 }
 
 func.func @callee(%c0: tensor<i32>, %arg: tensor<i32>, %v0: tensor<*x!tf_type.resource>, %c1: tensor<i32>, %v1: tensor<*x!tf_type.resource>) -> (tensor<i32>) {
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc b/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc
index 0ed5a6ac1b6a8a..fea7a988bd40d7 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/xla_rewrite_pass.cc
@@ -38,15 +38,16 @@ namespace tensorflow {
 namespace tfrt_compiler {
 namespace {
 
-struct RewriteStatefulPartitionedCallToXlaLaunchOnCpu
-    : public mlir::OpRewritePattern<mlir::TF::StatefulPartitionedCallOp> {
-  using OpRewritePattern::OpRewritePattern;
+template <typename OpType>
+struct RewriteFunctionCallToXlaLaunchOnCpu
+    : public mlir::OpRewritePattern<OpType> {
+ public:
+  using mlir::OpRewritePattern<OpType>::OpRewritePattern;
 
   mlir::LogicalResult matchAndRewrite(
-      mlir::TF::StatefulPartitionedCallOp op,
-      mlir::PatternRewriter& rewriter) const override {
+      OpType op, mlir::PatternRewriter& rewriter) const override {
     if (auto xla_must_compile =
-            op->getAttrOfType<mlir::BoolAttr>("_XlaMustCompile");
+            op->template getAttrOfType<mlir::BoolAttr>("_XlaMustCompile");
         !xla_must_compile || !xla_must_compile.getValue()) {
       return mlir::failure();
     }
@@ -92,7 +93,11 @@ struct TfrtXlaRewritePass
   void runOnOperation() override {
     mlir::RewritePatternSet patterns(&getContext());
 
-    patterns.add<RewriteStatefulPartitionedCallToXlaLaunchOnCpu>(&getContext());
+    patterns
+        .add<RewriteFunctionCallToXlaLaunchOnCpu<mlir::TF::PartitionedCallOp>>(
+            &getContext());
+    patterns.add<RewriteFunctionCallToXlaLaunchOnCpu<
+        mlir::TF::StatefulPartitionedCallOp>>(&getContext());
 
     if (mlir::failed(
             mlir::applyPatternsGreedily(getOperation(), std::move(patterns)))) {

From 0eebb72ae29b2349e8e30fe1c6de464bd5badb8a Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 9 Dec 2025 16:18:23 -0800
Subject: [PATCH 098/753] [xla:ffi] Add support for internal type serialization
 and deserialization

PiperOrigin-RevId: 842438931
---
 third_party/xla/xla/ffi/BUILD                 |  3 +
 third_party/xla/xla/ffi/execution_state.cc    |  6 +-
 .../xla/xla/ffi/execution_state_test.cc       | 34 ++++----
 third_party/xla/xla/ffi/type_registry.h       | 78 ++++++++++++++++++-
 third_party/xla/xla/ffi/type_registry_test.cc | 38 ++++++++-
 5 files changed, 136 insertions(+), 23 deletions(-)

diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD
index f14764091594bc..b7b2892e016db3 100644
--- a/third_party/xla/xla/ffi/BUILD
+++ b/third_party/xla/xla/ffi/BUILD
@@ -128,6 +128,7 @@ xla_cc_test(
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest",
         "@com_google_googletest//:gtest_main",
     ],
@@ -325,6 +326,7 @@ cc_library(
     deps = [
         "//xla:util",
         "//xla/tsl/lib/gtl:int_type",
+        "//xla/tsl/platform:statusor",
         "//xla/tsl/util:safe_reinterpret_cast",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -348,6 +350,7 @@ xla_cc_test(
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest",
         "@com_google_googletest//:gtest_main",
diff --git a/third_party/xla/xla/ffi/execution_state.cc b/third_party/xla/xla/ffi/execution_state.cc
index 8f5f5fd902b8b1..966c195d05fd7f 100644
--- a/third_party/xla/xla/ffi/execution_state.cc
+++ b/third_party/xla/xla/ffi/execution_state.cc
@@ -116,10 +116,8 @@ absl::StatusOr<ExecutionState> ExecutionState::FromProto(
         proto.type_name());
   }
 
-  TF_ASSIGN_OR_RETURN(void* opaque_state,
-                      type_info.deserializer(proto.state()));
-
-  TF_RETURN_IF_ERROR(state.Set(type_id, type_info, opaque_state));
+  TF_ASSIGN_OR_RETURN(auto opaque_state, type_info.deserializer(proto.state()));
+  TF_RETURN_IF_ERROR(state.Set(type_id, type_info, opaque_state.release()));
   return state;
 }
 
diff --git a/third_party/xla/xla/ffi/execution_state_test.cc b/third_party/xla/xla/ffi/execution_state_test.cc
index e5ca8c3bb108e7..8c05e4caf9098e 100644
--- a/third_party/xla/xla/ffi/execution_state_test.cc
+++ b/third_party/xla/xla/ffi/execution_state_test.cc
@@ -18,10 +18,12 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <type_traits>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
@@ -85,22 +87,26 @@ TEST(ExecutionStateTest, SetAndGetForExternalType) {
   EXPECT_EQ(data, value);
 }
 
+struct MyState {
+  std::string value;
+};
+
+template <>
+struct TypeRegistry::SerDes<MyState> : public std::true_type {
+  static absl::StatusOr<std::string> Serialize(const MyState& value) {
+    return value.value;
+  }
+  static absl::StatusOr<std::unique_ptr<MyState>> Deserialize(
+      absl::string_view data) {
+    auto state = std::make_unique<MyState>();
+    state->value = data;
+    return state;
+  }
+};
+
 TEST(ExecutionStateTest, Serialization) {
-  struct MyState {
-    std::string value;
-  };
+  TypeRegistry::TypeInfo type_info = TypeRegistry::GetTypeInfo<MyState>();
 
-  TypeRegistry::TypeInfo type_info = {
-      /*deleter=*/
-      [](void* ptr) { delete static_cast<MyState*>(ptr); },
-      /*serializer=*/
-      [](const void* ptr) -> absl::StatusOr<std::string> {
-        return static_cast<const MyState*>(ptr)->value;
-      },
-      /*deserializer=*/
-      [](absl::string_view state) -> absl::StatusOr<void*> {
-        return new MyState{std::string(state)};
-      }};
   TF_ASSERT_OK_AND_ASSIGN(
       TypeRegistry::TypeId type_id,
       TypeRegistry::AssignExternalTypeId("my_state_type", type_info));
diff --git a/third_party/xla/xla/ffi/type_registry.h b/third_party/xla/xla/ffi/type_registry.h
index d08ad80f015f59..244fd6c8653a9c 100644
--- a/third_party/xla/xla/ffi/type_registry.h
+++ b/third_party/xla/xla/ffi/type_registry.h
@@ -17,14 +17,18 @@ limitations under the License.
 #define XLA_FFI_TYPE_REGISTRY_H_
 
 #include <cstdint>
+#include <memory>
 #include <string>
+#include <type_traits>
 
 #include "absl/base/no_destructor.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "xla/tsl/lib/gtl/int_type.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/safe_reinterpret_cast.h"
+#include "xla/util.h"
 
 namespace xla::ffi {
 
@@ -63,13 +67,27 @@ class TypeRegistry {
   struct TypeInfo {
     using Deleter = void (*)(void*);
     using Serializer = absl::StatusOr<std::string> (*)(const void*);
-    using Deserializer = absl::StatusOr<void*> (*)(absl::string_view);
+    using Deserializer =
+        absl::StatusOr<std::unique_ptr<void, Deleter>> (*)(absl::string_view);
 
     Deleter deleter = nullptr;
     Serializer serializer = nullptr;
     Deserializer deserializer = nullptr;
   };
 
+  // To declare a type `T` as serializable and deserializable, define a
+  // specialization of `TypeSerDes<T>` with `Serialize` and `Deserialize` apis.
+  //
+  //   template <>
+  //   struct TypeSerDes<T> : public std::true_type {
+  //     static absl::StatusOr<std::string> Serialize(const T& value);
+  //     static absl::StatusOr<std::unique_ptr<T>> Deserialize(
+  //       absl::string_view data);
+  // };
+  //
+  template <typename T>
+  struct SerDes : public std::false_type {};
+
   // Returns type name for a given type id. Returns an error if type id is not
   // registered. Works for both external and internal type ids.
   static absl::StatusOr<absl::string_view> GetTypeName(TypeId type_id);
@@ -106,6 +124,14 @@ class TypeRegistry {
   template <typename T>
   static TypeInfo GetTypeInfo();
 
+  // Serializes a value of a given type. For internal type ids only.
+  template <typename T>
+  static absl::StatusOr<std::string> Serialize(const T& value);
+
+  // Deserializes a value of a given type. For internal type ids only.
+  template <typename T>
+  static absl::StatusOr<std::unique_ptr<T>> Deserialize(absl::string_view data);
+
  private:
   static TypeId GetNextTypeId();
 };
@@ -126,9 +152,53 @@ TypeRegistry::TypeId TypeRegistry::GetTypeId() {
 
 template <typename T>
 TypeRegistry::TypeInfo TypeRegistry::GetTypeInfo() {
-  return TypeInfo{
-      [](void* state) { delete tsl::safe_reinterpret_cast<T*>(state); },
-  };
+  // Define deleter as a static member, because it's always available for the
+  // internal types.
+  static TypeInfo::Deleter deleter =
+      +[](void* state) { delete tsl::safe_reinterpret_cast<T*>(state); };
+
+  // Serializer and deserializer are defined only if `T` opts in to the
+  // serializable via the `SerDes` specialization.
+  TypeInfo::Serializer serializer = nullptr;
+  TypeInfo::Deserializer deserializer = nullptr;
+
+  if constexpr (SerDes<T>::value) {
+    serializer = +[](const void* value) {
+      return SerDes<T>::Serialize(*tsl::safe_reinterpret_cast<const T*>(value));
+    };
+
+    deserializer = +[](absl::string_view data)
+        -> absl::StatusOr<std::unique_ptr<void, TypeInfo::Deleter>> {
+      TF_ASSIGN_OR_RETURN(auto value, SerDes<T>::Deserialize(data));
+      return std::unique_ptr<void, TypeInfo::Deleter>(value.release(), deleter);
+    };
+  }
+
+  return TypeInfo{deleter, serializer, deserializer};
+}
+
+template <typename T>
+absl::StatusOr<std::string> TypeRegistry::Serialize(const T& value) {
+  TypeInfo type_info = GetTypeInfo<T>();
+  if (type_info.serializer == nullptr) {
+    return FailedPrecondition(
+        "Type is not serializable. Did you forget to specialize "
+        "TypeRegistry::SerDes<T>?");
+  }
+  return type_info.serializer(&value);
+}
+
+template <typename T>
+absl::StatusOr<std::unique_ptr<T>> TypeRegistry::Deserialize(
+    absl::string_view data) {
+  TypeInfo type_info = GetTypeInfo<T>();
+  if (type_info.deserializer == nullptr) {
+    return FailedPrecondition(
+        "Type is not deserializable. Did you forget to specialize "
+        "TypeRegistry::SerDes<T>?");
+  }
+  TF_ASSIGN_OR_RETURN(auto ptr, type_info.deserializer(data));
+  return std::unique_ptr<T>(tsl::safe_reinterpret_cast<T*>(ptr.release()));
 }
 
 }  // namespace xla::ffi
diff --git a/third_party/xla/xla/ffi/type_registry_test.cc b/third_party/xla/xla/ffi/type_registry_test.cc
index 936b4e40231b93..15d7c9fd0f4196 100644
--- a/third_party/xla/xla/ffi/type_registry_test.cc
+++ b/third_party/xla/xla/ffi/type_registry_test.cc
@@ -17,18 +17,41 @@ limitations under the License.
 
 #include <cstdint>
 #include <limits>
+#include <memory>
+#include <string>
+#include <type_traits>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
 
 namespace xla::ffi {
-namespace {
 
+// Define a custom type with `TypeSerDes` specialization to test that TypeInfo
+// is properly generated for such types.
+struct MyString {
+  std::string data;
+};
+
+template <>
+struct TypeRegistry::SerDes<MyString> : public std::true_type {
+  static absl::StatusOr<std::string> Serialize(const MyString& type) {
+    return type.data;
+  }
+  static absl::StatusOr<std::unique_ptr<MyString>> Deserialize(
+      absl::string_view data) {
+    auto type = std::make_unique<MyString>();
+    type->data = std::string(data);
+    return type;
+  }
+};
+
+namespace {
 using ::testing::HasSubstr;
 
 TEST(TypeRegistryTest, RegisterExternalTypeId) {
@@ -87,5 +110,18 @@ TEST(TypeRegistryTest, InternalTypeInfo) {
   type_info.deleter(ptr);
 }
 
+TEST(TypeRegistryTest, SerializableType) {
+  MyString str = {"foo"};
+
+  TypeRegistry::TypeInfo type_info = TypeRegistry::GetTypeInfo<MyString>();
+  ASSERT_NE(type_info.serializer, nullptr);
+  ASSERT_NE(type_info.deserializer, nullptr);
+
+  TF_ASSERT_OK_AND_ASSIGN(std::string serialized, TypeRegistry::Serialize(str));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MyString> deserialized,
+                          TypeRegistry::Deserialize<MyString>(serialized));
+  EXPECT_EQ(deserialized->data, "foo");
+}
+
 }  // namespace
 }  // namespace xla::ffi

From 742bf92eb32be524332fc60ba6630b593b1ea6d8 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Tue, 9 Dec 2025 17:45:31 -0800
Subject: [PATCH 099/753] Update XNNPACK in XLA

PiperOrigin-RevId: 842466897
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake |  2 +-
 tensorflow/workspace2.bzl                         | 12 ++++++------
 third_party/xla/third_party/xnnpack/workspace.bzl |  6 +++---
 third_party/xla/workspace2.bzl                    |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index c781a2fc18d86a..4199e85da146ff 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG 1b918df9d1744ae40725254f4baa592ed05c912e
+  GIT_TAG 085272364b1e8168a82994296994d9b02444e82a
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index db5a02b6e7345f..c8bea5feba54fb 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,18 +168,18 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "f644ad3ac88b3b0208a82742938bca35235865d6ca64950dac58b166877eb2a5",
-        strip_prefix = "XNNPACK-1b918df9d1744ae40725254f4baa592ed05c912e",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/1b918df9d1744ae40725254f4baa592ed05c912e.zip"),
+        sha256 = "9ff5f0631970f3393522e2fb0b882c7cabc44c76f957d257b507f47611e2df47",
+        strip_prefix = "XNNPACK-085272364b1e8168a82994296994d9b02444e82a",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/085272364b1e8168a82994296994d9b02444e82a.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
     # XNNPack dependency.
     tf_http_archive(
         name = "KleidiAI",
-        sha256 = "fb4f8180171d035a08432b086194121f627d00a76d58cebaad57d7a87ad40dbd",
-        strip_prefix = "kleidiai-7a3a609a3278106df7157bdd27b8f0e75ab00b60",
-        urls = tf_mirror_urls("https://github.com/ARM-software/kleidiai/archive/7a3a609a3278106df7157bdd27b8f0e75ab00b60.zip"),
+        sha256 = "5e922c9afb7a0c881fc4359b58488f3faa840e8435de1a2207a6525935ed83c2",
+        strip_prefix = "kleidiai-63205aa90afa6803d8f58bc3081b69288e9f1906",
+        urls = tf_mirror_urls("https://github.com/ARM-software/kleidiai/archive/63205aa90afa6803d8f58bc3081b69288e9f1906.zip"),
     )
 
     tf_http_archive(
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index 5251d630a034e0..f7dc6428146b0a 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "f644ad3ac88b3b0208a82742938bca35235865d6ca64950dac58b166877eb2a5",
-        strip_prefix = "XNNPACK-1b918df9d1744ae40725254f4baa592ed05c912e",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/1b918df9d1744ae40725254f4baa592ed05c912e.zip"),
+        sha256 = "9ff5f0631970f3393522e2fb0b882c7cabc44c76f957d257b507f47611e2df47",
+        strip_prefix = "XNNPACK-085272364b1e8168a82994296994d9b02444e82a",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/085272364b1e8168a82994296994d9b02444e82a.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 43d5f037bc7fb0..43537bf6cfabe7 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -163,9 +163,9 @@ def _tf_repositories():
 
     tf_http_archive(
         name = "KleidiAI",
-        sha256 = "fb4f8180171d035a08432b086194121f627d00a76d58cebaad57d7a87ad40dbd",
-        strip_prefix = "kleidiai-7a3a609a3278106df7157bdd27b8f0e75ab00b60",
-        urls = tf_mirror_urls("https://github.com/ARM-software/kleidiai/archive/7a3a609a3278106df7157bdd27b8f0e75ab00b60.zip"),
+        sha256 = "5e922c9afb7a0c881fc4359b58488f3faa840e8435de1a2207a6525935ed83c2",
+        strip_prefix = "kleidiai-63205aa90afa6803d8f58bc3081b69288e9f1906",
+        urls = tf_mirror_urls("https://github.com/ARM-software/kleidiai/archive/63205aa90afa6803d8f58bc3081b69288e9f1906.zip"),
     )
 
     tf_http_archive(

From ebc07e02b443fb1c4e5650ebbf4e38d47779ed16 Mon Sep 17 00:00:00 2001
From: Shawn Lu <xiaoxlu@google.com>
Date: Tue, 9 Dec 2025 17:47:58 -0800
Subject: [PATCH 100/753] Skip async variable loading when XLA compilation is
 disabled.

PiperOrigin-RevId: 842467625
---
 tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
index d8c2064e5f6f81..2976f38f71e352 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
+++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
@@ -694,7 +694,7 @@ absl::StatusOr<std::vector<tensorflow::Tensor>> IfrtServingExecutable::Execute(
   {
     tsl::profiler::TraceMe traceme("AsyncRestoreVariables");
     absl::ReaderMutexLock lock(mutex_);
-    if (!is_frozen_) {
+    if (!is_frozen_ && !tf_to_hlo_compiler_->IsXlaCompilationDisabled()) {
       // Asynchronously load the restored variable tensors to Ifrt array.
       TF_RETURN_IF_ERROR(AsyncLoadIfrtArray(inputs, variable_arg_indices,
                                             *executable_bundle, device_list));

From 1cf8997874a1264f12fedad9601ec4c57691f757 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 19:36:58 -0800
Subject: [PATCH 101/753] Added unit test for the type change of
 OpSourceInfo::source_file from string_view to std::string which was done to
 avoid dangling references potentially avoiding use-after-free crashes.

PiperOrigin-RevId: 842498608
---
 .../xla/xla/tsl/profiler/convert/BUILD        |  1 +
 .../tsl/profiler/convert/xla_op_utils_test.cc | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/third_party/xla/xla/tsl/profiler/convert/BUILD b/third_party/xla/xla/tsl/profiler/convert/BUILD
index 987d2b8d1367a8..e4c3349f5bc75c 100644
--- a/third_party/xla/xla/tsl/profiler/convert/BUILD
+++ b/third_party/xla/xla/tsl/profiler/convert/BUILD
@@ -58,6 +58,7 @@ tsl_cc_test(
     deps = [
         ":xla_op_utils",
         "//xla/tsl/platform:test",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/third_party/xla/xla/tsl/profiler/convert/xla_op_utils_test.cc b/third_party/xla/xla/tsl/profiler/convert/xla_op_utils_test.cc
index 3e9244c1d4a92a..c3415879c313c4 100644
--- a/third_party/xla/xla/tsl/profiler/convert/xla_op_utils_test.cc
+++ b/third_party/xla/xla/tsl/profiler/convert/xla_op_utils_test.cc
@@ -15,13 +15,19 @@ limitations under the License.
 
 #include "xla/tsl/profiler/convert/xla_op_utils.h"
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/strings/str_cat.h"
 #include "xla/tsl/platform/test.h"
 
 namespace tsl {
 namespace profiler {
 namespace {
 
+using ::testing::AllOf;
+using ::testing::Field;
+using ::testing::Property;
+
 TEST(XlaOpUtilsTest, HloModuleNameWithProgramId) {
   EXPECT_EQ("module(123)", HloModuleNameWithProgramId("module", 123));
 }
@@ -74,6 +80,22 @@ TEST(XlaOpUtilsTest, IsXlaArgsOrRetvals) {
   EXPECT_FALSE(IsXlaArgsOrRetvals("op_name"));
 }
 
+// Tests that OpSourceInfo members are std::string and capable of owning
+// the string data. If the members were absl::string_view, this test would
+// fail due to dangling references.
+TEST(XlaOpUtilsTest, OpSourceInfo) {
+  OpSourceInfo op_source_info = {
+      .source_file = absl::StrCat("file", ".cc"),
+      .source_line = 10,
+      .stack_frame = absl::StrCat("frame1", "\n", "frame2"),
+  };
+  EXPECT_THAT(op_source_info,
+              AllOf(Field(&OpSourceInfo::source_file, "file.cc"),
+                    Field(&OpSourceInfo::stack_frame, "frame1\nframe2"),
+                    Property(&OpSourceInfo::GetSourceTopLine, "file.cc:10"),
+                    Property(&OpSourceInfo::GetSourceStack, "frame1\nframe2")));
+}
+
 }  // namespace
 }  // namespace profiler
 }  // namespace tsl

From 457322c8bf181c4d42df3ca08f07061f4cc916cc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 19:37:23 -0800
Subject: [PATCH 102/753] Automated Code Change

PiperOrigin-RevId: 842498682
---
 .../xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc    | 1 -
 .../xla/xla/python/ifrt/ir/transforms/ifrt_to_dot_pass.cc        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc
index 803cc7a5ddee8f..635987f32077e8 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include <tuple>
-#include <vector>
 
 #include "absl/log/check.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_to_dot_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_to_dot_pass.cc
index c390916b320389..000c916852aea9 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_to_dot_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_to_dot_pass.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"

From 2a13f3ce1e04e07916ccdadfb5c4042d71a1a658 Mon Sep 17 00:00:00 2001
From: Marcin Radomski <dextero@google.com>
Date: Tue, 9 Dec 2025 20:29:47 -0800
Subject: [PATCH 103/753] [XLA] Avoid duplicated include paths in XLA build

The change that introduces googletest wrapper does not work reliably:
depending on the order of deps in BUILD file, googletest include paths
may end up before the wrapper ones. When this happens, the
ASSERT_OK/EXPECT_OK macros are not available.

This showed up in a bzlmod build once I started migrating the TF_
macros.

I tried multiple ways to fix it, described in
README.add-status-macros.md for anyone trying to make this nice in the
future. In the end, adding the macros directly to googletest in a patch
seems like the least intrusive option.

PiperOrigin-RevId: 842513243
---
 third_party/xla/MODULE.bazel                  |  12 +-
 third_party/xla/opensource_only.files         |   1 -
 ...XPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch | 164 +++++++++++
 ...dependencies-for-workspace.bzl-build.patch |  30 ++
 .../googletest/README.add-status-macros.md    |  41 +++
 .../xla_googletest_wrapper/BUILD.bazel        |  38 ---
 .../xla_googletest_wrapper/MODULE.bazel       |   4 -
 .../xla_googletest_wrapper/README.md          |   5 -
 .../xla_googletest_wrapper/REPO.bazel         |   1 -
 .../googletest_deps.bzl                       |   8 -
 .../include/gmock/gmock.h                     | 263 +++++++++---------
 .../include/gtest/gtest.h                     |  21 --
 third_party/xla/workspace2.bzl                |  29 +-
 13 files changed, 383 insertions(+), 234 deletions(-)
 create mode 100644 third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch
 create mode 100644 third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch
 create mode 100644 third_party/xla/third_party/googletest/README.add-status-macros.md
 delete mode 100644 third_party/xla/third_party/xla_googletest_wrapper/BUILD.bazel
 delete mode 100644 third_party/xla/third_party/xla_googletest_wrapper/MODULE.bazel
 delete mode 100644 third_party/xla/third_party/xla_googletest_wrapper/README.md
 delete mode 100644 third_party/xla/third_party/xla_googletest_wrapper/REPO.bazel
 delete mode 100644 third_party/xla/third_party/xla_googletest_wrapper/googletest_deps.bzl
 delete mode 100644 third_party/xla/third_party/xla_googletest_wrapper/include/gtest/gtest.h

diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel
index d54ae9978b494d..b51181b412ef6c 100644
--- a/third_party/xla/MODULE.bazel
+++ b/third_party/xla/MODULE.bazel
@@ -10,8 +10,7 @@ bazel_dep(name = "bazel_skylib", version = "1.8.1")
 bazel_dep(name = "boringssl", version = "0.20250818.0")
 bazel_dep(name = "curl", version = "8.11.0")
 bazel_dep(name = "google_benchmark", version = "1.8.5", repo_name = "com_google_benchmark")
-bazel_dep(name = "googletest", version = "1.17.0", repo_name = "com_google_googletest_upstream")
-bazel_dep(name = "xla_googletest_wrapper", version = "1.0", repo_name = "com_google_googletest")
+bazel_dep(name = "googletest", version = "1.17.0", repo_name = "com_google_googletest")
 bazel_dep(name = "grpc", version = "1.74.1", repo_name = "com_github_grpc_grpc")
 bazel_dep(name = "gutil", version = "20250502.0", repo_name = "com_google_gutil")
 bazel_dep(name = "jsoncpp", version = "1.9.6", repo_name = "jsoncpp_git")
@@ -73,11 +72,10 @@ archive_override(
     module_name = "googletest",
     strip_prefix = "googletest-28e9d1f26771c6517c3b4be10254887673c94018",
     urls = ["https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c94018.zip"],
-)
-
-local_path_override(
-    module_name = "xla_googletest_wrapper",
-    path = "third_party/xla_googletest_wrapper",
+    patch_strip = 1,
+    patches = [
+        "//third_party/googletest:0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch",
+    ],
 )
 
 ##############################################################
diff --git a/third_party/xla/opensource_only.files b/third_party/xla/opensource_only.files
index 888a0978aac8f0..a9bf5dcaee9f9f 100644
--- a/third_party/xla/opensource_only.files
+++ b/third_party/xla/opensource_only.files
@@ -157,5 +157,4 @@ xla/third_party/tensorrt/tensorrt/tensorrt_config.py.tpl:
 xla/third_party/tensorrt/tensorrt_configure.bzl:
 xla/third_party/tensorrt/workspace.bzl:
 xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h:
-xla/third_party/xla_googletest_wrapper/include/gtest/gtest.h:
 xla/third_party/zlib.BUILD:
diff --git a/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch b/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch
new file mode 100644
index 00000000000000..5b16fd63bb94a2
--- /dev/null
+++ b/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch
@@ -0,0 +1,164 @@
+From 925be4390f717899f3e825abe6e9a0548f0630e4 Mon Sep 17 00:00:00 2001
+From: Marcin Radomski <dextero@google.com>
+Date: Thu, 4 Dec 2025 15:37:45 +0000
+Subject: [PATCH 1/2] Add ASSERT_OK/EXPECT_OK/ASSERT_OK_AND_ASSIGN macros
+
+Create this patch with git diff -U2 / git format-patch -U2 to avoid
+mismatches with googletest.patch.
+---
+ BUILD.bazel                                   |   5 +-
+ googlemock/include/gmock/gmock.h              |   1 +
+ .../include/gmock/internal/xla-gmock-macros.h | 118 ++++++++++++++++++
+ 3 files changed, 123 insertions(+), 1 deletion(-)
+ create mode 100644 googlemock/include/gmock/internal/xla-gmock-macros.h
+
+diff --git a/BUILD.bazel b/BUILD.bazel
+index 008af6a1..32d2a22c 100644
+--- a/BUILD.bazel
++++ b/BUILD.bazel
+@@ -163,5 +163,8 @@ cc_library(
+         ],
+         "//conditions:default": [],
+-    }),
++    }) + [
++        "@abseil-cpp//absl/status",
++        "@abseil-cpp//absl/status:statusor",
++    ],
+ )
+ 
+diff --git a/googlemock/include/gmock/gmock.h b/googlemock/include/gmock/gmock.h
+index c78fb8ee..69b33572 100644
+--- a/googlemock/include/gmock/gmock.h
++++ b/googlemock/include/gmock/gmock.h
+@@ -95,3 +95,4 @@ GTEST_API_ void InitGoogleMock();
+ }  // namespace testing
+ 
++#include "gmock/internal/xla-gmock-macros.h"
+ #endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
+diff --git a/googlemock/include/gmock/internal/xla-gmock-macros.h b/googlemock/include/gmock/internal/xla-gmock-macros.h
+new file mode 100644
+index 00000000..fd48a21a
+--- /dev/null
++++ b/googlemock/include/gmock/internal/xla-gmock-macros.h
+@@ -0,0 +1,118 @@
++/* Copyright 2025 The Abseil Authors & TensorFlow Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#ifndef GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_XLA_GMOCK_MACROS_H_
++#define GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_XLA_GMOCK_MACROS_H_
++
++// gmock/gmock.h wrapper that also provides assert macros.
++//
++// These already exist in internal version of gmock, but upstream version
++// doesn't have them. We use this wrapper to make dependency translation when
++// exporting to OSS easier.
++//
++// - We want to use standard internal header and ASSERT_OK, EXPECT_OK macros
++//   when developing internally.
++// - We want the same macros to work externally, rather than having to add or
++//   strip TF_ prefix.
++// - We want the OSS export to still work after the export and header
++//   translation.
++// - We want to minimize the amount of patching third party projects to reduce
++//   maintenance overhead.
++// - To ensure the OSS patches cleanly apply onto internal repo, we need the
++//   header translation to be reversible, which requires 1:1 header mapping.
++//
++// To achieve this, we add those macros to gmock for all XLA code, which
++// should (TM) make ASSERT_OK/EXPECT_OK "just work" in all XLA tests.
++//
++// absl/status/status_matchers.h depends on gmock.h, so we can't simply add it
++// here. This causes a circular dependency between this and absl - which bazel
++// doesn't allow.
++
++#include "absl/status/status.h"
++#include "absl/status/statusor.h"
++
++// Macros for testing the results of functions that return absl::Status or
++// absl::StatusOr<T> (for any type T).
++#define EXPECT_OK(expression) \
++  EXPECT_THAT(expression, ::xla_testing::internal::IsOk())
++#define ASSERT_OK(expression) \
++  ASSERT_THAT(expression, ::xla_testing::internal::IsOk())
++
++#define ASSERT_OK_AND_ASSIGN(lhs, rexpr)                            \
++  TF_ASSERT_OK_AND_ASSIGN_IMPL(                                     \
++      XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \
++      lhs, rexpr);
++
++#define ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
++  auto statusor = (rexpr);                              \
++  ASSERT_OK(statusor.status());                         \
++  lhs = std::move(statusor).value()
++
++#define XLA_STATUS_MACROS_CONCAT_NAME(x, y) XLA_STATUS_MACROS_CONCAT_IMPL(x, y)
++#define XLA_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
++
++namespace xla_testing {
++namespace internal {
++
++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
++inline const absl::Status& GetStatus(const absl::Status& status) {
++  return status;
++}
++
++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
++template <typename T>
++inline const absl::Status& GetStatus(const absl::StatusOr<T>& status) {
++  return status.status();
++}
++
++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
++//
++// Monomorphic implementation of matcher IsOk() for a given type T.
++// T can be Status, StatusOr<>, or a reference to either of them.
++template <typename T>
++class MonoIsOkMatcherImpl : public ::testing::MatcherInterface<T> {
++ public:
++  void DescribeTo(std::ostream* os) const override { *os << "is OK"; }
++  void DescribeNegationTo(std::ostream* os) const override {
++    *os << "is not OK";
++  }
++  bool MatchAndExplain(T actual_value,
++                       ::testing::MatchResultListener*) const override {
++    return GetStatus(actual_value).ok();
++  }
++};
++
++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
++//
++// Implements IsOk() as a polymorphic matcher.
++class IsOkMatcher {
++ public:
++  template <typename T>
++  /*implicit*/ operator ::testing::Matcher<T>() const {  // NOLINT
++    return ::testing::Matcher<T>(new MonoIsOkMatcherImpl<const T&>());
++  }
++};
++
++// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
++//
++// Returns a gMock matcher that matches a Status or StatusOr<> which is OK.
++inline ::xla_testing::internal::IsOkMatcher IsOk() {
++  return ::xla_testing::internal::IsOkMatcher();
++}
++
++}  // namespace internal
++}  // namespace xla_testing
++
++#endif  // GOOGLEMOCK_INCLUDE_GMOCK_INTERNAL_XLA_GMOCK_MACROS_H_
+-- 
+2.52.0.223.gf5cc29aaa4-goog
+
diff --git a/third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch b/third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch
new file mode 100644
index 00000000000000..93fa7f98c1b156
--- /dev/null
+++ b/third_party/xla/third_party/googletest/0002-Rename-dependencies-for-workspace.bzl-build.patch
@@ -0,0 +1,30 @@
+From 21affdb9aaa50767264c13d607d47cb2104c4e4a Mon Sep 17 00:00:00 2001
+From: Marcin Radomski <dextero@google.com>
+Date: Tue, 9 Dec 2025 18:23:26 +0000
+Subject: [PATCH 2/2] Rename dependencies for workspace.bzl build
+
+Must be separate from googletest.patch, because:
+- Tensorflow applies googletest.patch only
+- XLA bzlmod build applies patch that adds assert macros only, and
+  needs different repository name in deps
+- XLA workspace.bzl build applies everything
+---
+ BUILD.bazel | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/BUILD.bazel b/BUILD.bazel
+index 32d2a22c..a122fa28 100644
+--- a/BUILD.bazel
++++ b/BUILD.bazel
+@@ -164,6 +164,6 @@ cc_library(
+         "//conditions:default": [],
+     }) + [
+-        "@abseil-cpp//absl/status",
+-        "@abseil-cpp//absl/status:statusor",
++        "@com_google_absl//absl/status",
++        "@com_google_absl//absl/status:statusor",
+     ],
+ )
+-- 
+2.52.0.223.gf5cc29aaa4-goog
+
diff --git a/third_party/xla/third_party/googletest/README.add-status-macros.md b/third_party/xla/third_party/googletest/README.add-status-macros.md
new file mode 100644
index 00000000000000..8a169b6f6031d5
--- /dev/null
+++ b/third_party/xla/third_party/googletest/README.add-status-macros.md
@@ -0,0 +1,41 @@
+add-status-macros.patch adds `ASSERT_OK`, `EXPECT_OK`, `ASSERT_OK_AND_ASSIGN`
+to gmock.h so that the header's provided functionality matches internal gmock.
+
+What other things have we tried?
+
+1. Introducing a custom header to be used in OSS instead of `gmock/gmock.h`.
+
+   The export-to-OSS process imposes a few restrictions. Notably, header
+   rewrite has to be reversible, so we need a 1:1 mapping between headers used
+   internally and in OSS.
+
+   If we introduced a custom header to be used in OSS instead of gmock, it
+   would have to take the place of current rewrite of internal gmock to
+   `gmock/gmock.h`. This means, any use of `gmock/gmock.h` in OSS XLA code can
+   no longer map to internal gmock. We'd have to ban the header.
+
+   Therefore, updating OSS `gmock/gmock.h` seems necessary.
+
+2. Patching in the extra macros to `gmock/gmock.h` by including
+   `absl/status/status_macros.h`.
+
+   This introduces a circular dependency between absl and gmock which makes
+   bazel strongly opposed to the idea.
+
+3. Introducing a googletest bazel module wrapper.
+
+   This would be a module would proxy all `gmock/gmock.h` within XLA without
+   additional patching of googletest. However, having multiple sources of the
+   same gmock/gmock.h header path only works *sometimes*. The order of include
+   paths emitted by bazel depends on the target definition and ordering of
+   dependencies, so it ends up working in some case and not in others.
+
+4. Expanding 3. by renaming googletest's `gmock.h` to `gmock.upstream.h` to
+   avoid header name conflicts.
+
+   `gmock/gmock.h` is also included by googletest itself, so redirecting it to
+   `gmock/gmock.upstream.h` is needed. That boils down to even more brittle
+   patching.
+
+Overall, the add-status-macros.patch change is the least invasive one that
+works.
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/BUILD.bazel b/third_party/xla/third_party/xla_googletest_wrapper/BUILD.bazel
deleted file mode 100644
index 09ae2f3c729c6e..00000000000000
--- a/third_party/xla/third_party/xla_googletest_wrapper/BUILD.bazel
+++ /dev/null
@@ -1,38 +0,0 @@
-package(default_visibility = ["//visibility:public"])
-
-cc_library(
-    name = "gtest",
-    # Upstream gtest is *not* marked testonly for some reason, but
-    # gutil:status_matchers is, and non-testonly targets can't depend on
-    # testonly ones.
-    #
-    # XLA doesn't use gtest in non-testonly targets though so making this
-    # testonly should (TM) be fine.
-    testonly = True,
-    hdrs = [
-        "include/gmock/gmock.h",
-        "include/gtest/gtest.h",
-    ],
-    includes = [
-        "include",
-    ],
-    deps = [
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_googletest_upstream//:gtest",
-    ],
-)
-
-cc_library(
-    name = "gtest_main",
-    testonly = True,
-    deps = [
-        ":gtest",
-        "@com_google_googletest_upstream//:gtest_main",
-    ],
-)
-
-alias(
-    name = "gtest_for_library",
-    actual = ":gtest",
-)
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/MODULE.bazel b/third_party/xla/third_party/xla_googletest_wrapper/MODULE.bazel
deleted file mode 100644
index 24583fb0d3406a..00000000000000
--- a/third_party/xla/third_party/xla_googletest_wrapper/MODULE.bazel
+++ /dev/null
@@ -1,4 +0,0 @@
-module(name = "xla_googletest_wrapper", version = "1.0")
-
-bazel_dep(name = "abseil-cpp", version = "20250814.0", repo_name = "com_google_absl")
-bazel_dep(name = "googletest", version = "1.17.0", repo_name = "com_google_googletest_upstream")
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/README.md b/third_party/xla/third_party/xla_googletest_wrapper/README.md
deleted file mode 100644
index f2065e18995bd4..00000000000000
--- a/third_party/xla/third_party/xla_googletest_wrapper/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-A gtest wrapper that adds ASSERT_OK, EXPECT_OK, ASSERT_OK_AND_ASSIGN to gmock.h
-so that the header's provided functionality matches internal gmock.
-
-The repo contains a minimal set of reexports necessary to build XLA with this as
-a drop-in replacement for googletest.
\ No newline at end of file
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/REPO.bazel b/third_party/xla/third_party/xla_googletest_wrapper/REPO.bazel
deleted file mode 100644
index 1bab3de8da4637..00000000000000
--- a/third_party/xla/third_party/xla_googletest_wrapper/REPO.bazel
+++ /dev/null
@@ -1 +0,0 @@
-# Mark the repo as a bazel repo.
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/googletest_deps.bzl b/third_party/xla/third_party/xla_googletest_wrapper/googletest_deps.bzl
deleted file mode 100644
index 0f980ef53dd946..00000000000000
--- a/third_party/xla/third_party/xla_googletest_wrapper/googletest_deps.bzl
+++ /dev/null
@@ -1,8 +0,0 @@
-"""Reexports googletest_deps from upstream googletest."""
-
-# protobuf loads for @com_google_googletest//:googletest_deps.bzl so we need to
-# provide one in the wrapper.
-load("@com_google_googletest_upstream//:googletest_deps.bzl", upstream_deps = "googletest_deps")
-
-def googletest_deps():
-    upstream_deps()
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h b/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
index 27cfbf1160bd38..e5a79543245c08 100644
--- a/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
+++ b/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
@@ -1,129 +1,134 @@
-/* Copyright 2025 The Abseil Authors & TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
-#define GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
-
-// gmock/gmock.h wrapper that also provides assert macros.
-//
-// These already exist in internal version of gmock, but upstream version
-// doesn't have them. We use this wrapper to make dependency translation when
-// exporting to OSS easier.
-//
-// - We want to use standard internal header and ASSERT_OK, EXPECT_OK macros
-//   when developing internally.
-// - We want the same macros to work externally, rather than having to add or
-//   strip TF_ prefix.
-// - We want the OSS export to still work after the export and header
-//   translation.
-// - We want to minimize the amount of patching third party projects to reduce
-//   maintenance overhead.
-// - To ensure the OSS patches cleanly apply onto internal repo, we need the
-//   header translation to be reversible, which requires 1:1 header mapping.
-//
-// To achieve this, we swap out gmock.h for this wrapper in all XLA code, which
-// should (TM) make ASSERT_OK/EXPECT_OK "just work" in all XLA tests.
-//
-// The only way to make this work without patching googletest and/or absl is to
-// make XLA *always* use this wrapper, and *never* directly depend on upstream
-// googletest.
-//
-// absl/status/status_matchers.h depends on gmock.h, so we can't simply add it
-// here. This causes either:
-//
-// - A circular dependency between this and absl - which bazel doesn't allow,
-// - absl dependency on the upstream gmock - which depending on the dependency
-//   graph structure may introduce upstream gmock include path *before* one
-//   defined in here, so we end up with *sometimes* including the wrong one and
-//   the entire idea of drop-in replacing gmock.h goes out of the window.
-
-#include_next "gmock/gmock.h"
-
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-
-// Macros for testing the results of functions that return absl::Status or
-// absl::StatusOr<T> (for any type T).
-#define EXPECT_OK(expression) \
-  EXPECT_THAT(expression, ::xla_testing::internal::IsOk())
-#define ASSERT_OK(expression) \
-  ASSERT_THAT(expression, ::xla_testing::internal::IsOk())
-
-#define ASSERT_OK_AND_ASSIGN(lhs, rexpr)                            \
-  ASSERT_OK_AND_ASSIGN_IMPL(                                        \
-      XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \
-      lhs, rexpr);
-
-#define ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
-  auto statusor = (rexpr);                              \
-  ASSERT_OK(statusor.status());                         \
-  lhs = std::move(statusor).value()
-
-#define XLA_STATUS_MACROS_CONCAT_NAME(x, y) XLA_STATUS_MACROS_CONCAT_IMPL(x, y)
-#define XLA_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
-
-namespace xla_testing {
-namespace internal {
-
-// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
-inline const absl::Status& GetStatus(const absl::Status& status) {
-  return status;
-}
-
-// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
-template <typename T>
-inline const absl::Status& GetStatus(const absl::StatusOr<T>& status) {
-  return status.status();
-}
-
-// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
-//
-// Monomorphic implementation of matcher IsOk() for a given type T.
-// T can be Status, StatusOr<>, or a reference to either of them.
-template <typename T>
-class MonoIsOkMatcherImpl : public ::testing::MatcherInterface<T> {
- public:
-  void DescribeTo(std::ostream* os) const override { *os << "is OK"; }
-  void DescribeNegationTo(std::ostream* os) const override {
-    *os << "is not OK";
-  }
-  bool MatchAndExplain(T actual_value,
-                       ::testing::MatchResultListener*) const override {
-    return GetStatus(actual_value).ok();
-  }
-};
-
-// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
-//
-// Implements IsOk() as a polymorphic matcher.
-class IsOkMatcher {
- public:
-  template <typename T>
-  /*implicit*/ operator ::testing::Matcher<T>() const {  // NOLINT
-    return ::testing::Matcher<T>(new MonoIsOkMatcherImpl<const T&>());
-  }
-};
-
-// DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
-//
-// Returns a gMock matcher that matches a Status or StatusOr<> which is OK.
-inline ::xla_testing::internal::IsOkMatcher IsOk() {
-  return ::xla_testing::internal::IsOkMatcher();
-}
-
-}  // namespace internal
-}  // namespace xla_testing
-
-#endif  // GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
+<<<<<<< Conflict 1 of 1
+%%%%%%% Changes from base to side #1
+ /* Copyright 2025 The Abseil Authors & TensorFlow Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ ==============================================================================*/
+ 
+ #ifndef GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
+ #define GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
+ 
+ // gmock/gmock.h wrapper that also provides assert macros.
+ //
+ // These already exist in internal version of gmock, but upstream version
+ // doesn't have them. We use this wrapper to make dependency translation when
+ // exporting to OSS easier.
+ //
+ // - We want to use standard internal header and ASSERT_OK, EXPECT_OK macros
+ //   when developing internally.
+ // - We want the same macros to work externally, rather than having to add or
+ //   strip TF_ prefix.
+ // - We want the OSS export to still work after the export and header
+ //   translation.
+ // - We want to minimize the amount of patching third party projects to reduce
+ //   maintenance overhead.
+ // - To ensure the OSS patches cleanly apply onto internal repo, we need the
+ //   header translation to be reversible, which requires 1:1 header mapping.
+ //
+ // To achieve this, we swap out gmock.h for this wrapper in all XLA code, which
+ // should (TM) make ASSERT_OK/EXPECT_OK "just work" in all XLA tests.
+ //
+ // The only way to make this work without patching googletest and/or absl is to
+ // make XLA *always* use this wrapper, and *never* directly depend on upstream
+ // googletest.
+ //
+ // absl/status/status_matchers.h depends on gmock.h, so we can't simply add it
+ // here. This causes either:
+ //
+ // - A circular dependency between this and absl - which bazel doesn't allow,
+ // - absl dependency on the upstream gmock - which depending on the dependency
+ //   graph structure may introduce upstream gmock include path *before* one
+ //   defined in here, so we end up with *sometimes* including the wrong one and
+ //   the entire idea of drop-in replacing gmock.h goes out of the window.
+ 
+ #include_next "gmock/gmock.h"
+ 
+ #include "absl/status/status.h"
+ #include "absl/status/statusor.h"
+ 
+ // Macros for testing the results of functions that return absl::Status or
+ // absl::StatusOr<T> (for any type T).
+ #define EXPECT_OK(expression) \
+   EXPECT_THAT(expression, ::xla_testing::internal::IsOk())
+ #define ASSERT_OK(expression) \
+   ASSERT_THAT(expression, ::xla_testing::internal::IsOk())
+ 
+ #define ASSERT_OK_AND_ASSIGN(lhs, rexpr)                            \
+-  TF_ASSERT_OK_AND_ASSIGN_IMPL(                                     \
++  ASSERT_OK_AND_ASSIGN_IMPL(                                        \
+       XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \
+       lhs, rexpr);
+ 
+ #define ASSERT_OK_AND_ASSIGN_IMPL(statusor, lhs, rexpr) \
+   auto statusor = (rexpr);                              \
+   ASSERT_OK(statusor.status());                         \
+   lhs = std::move(statusor).value()
+ 
+ #define XLA_STATUS_MACROS_CONCAT_NAME(x, y) XLA_STATUS_MACROS_CONCAT_IMPL(x, y)
+ #define XLA_STATUS_MACROS_CONCAT_IMPL(x, y) x##y
+ 
+ namespace xla_testing {
+ namespace internal {
+ 
+ // DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
+ inline const absl::Status& GetStatus(const absl::Status& status) {
+   return status;
+ }
+ 
+ // DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
+ template <typename T>
+ inline const absl::Status& GetStatus(const absl::StatusOr<T>& status) {
+   return status.status();
+ }
+ 
+ // DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
+ //
+ // Monomorphic implementation of matcher IsOk() for a given type T.
+ // T can be Status, StatusOr<>, or a reference to either of them.
+ template <typename T>
+ class MonoIsOkMatcherImpl : public ::testing::MatcherInterface<T> {
+  public:
+   void DescribeTo(std::ostream* os) const override { *os << "is OK"; }
+   void DescribeNegationTo(std::ostream* os) const override {
+     *os << "is not OK";
+   }
+   bool MatchAndExplain(T actual_value,
+                        ::testing::MatchResultListener*) const override {
+     return GetStatus(actual_value).ok();
+   }
+ };
+ 
+ // DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
+ //
+ // Implements IsOk() as a polymorphic matcher.
+ class IsOkMatcher {
+  public:
+   template <typename T>
+   /*implicit*/ operator ::testing::Matcher<T>() const {  // NOLINT
+     return ::testing::Matcher<T>(new MonoIsOkMatcherImpl<const T&>());
+   }
+ };
+ 
+ // DO NOT USE DIRECTLY. Use absl/status/status_matchers.h instead.
+ //
+ // Returns a gMock matcher that matches a Status or StatusOr<> which is OK.
+ inline ::xla_testing::internal::IsOkMatcher IsOk() {
+   return ::xla_testing::internal::IsOkMatcher();
+ }
+ 
+ }  // namespace internal
+ }  // namespace xla_testing
+ 
+ #endif  // GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
++++++++ Contents of side #2
+>>>>>>> Conflict 1 of 1 ends
diff --git a/third_party/xla/third_party/xla_googletest_wrapper/include/gtest/gtest.h b/third_party/xla/third_party/xla_googletest_wrapper/include/gtest/gtest.h
deleted file mode 100644
index c903698baf0922..00000000000000
--- a/third_party/xla/third_party/xla_googletest_wrapper/include/gtest/gtest.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef GOOGLETEST_WRAPPER_INCLUDE_GTEST_GTEST_H_
-#define GOOGLETEST_WRAPPER_INCLUDE_GTEST_GTEST_H_
-
-#include_next "gtest/gtest.h"
-
-#endif  // GOOGLETEST_WRAPPER_INCLUDE_GTEST_GTEST_H_
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 43537bf6cfabe7..d81988a636a1f2 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -7,7 +7,7 @@ load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 load("@rules_ml_toolchain//gpu/sycl:sycl_configure.bzl", "sycl_configure")
 load("@rules_ml_toolchain//gpu/sycl:sycl_init_repository.bzl", "sycl_init_repository")
-load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls", "tf_vendored")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 load("//third_party/absl:workspace.bzl", absl = "repo")
 load("//third_party/benchmark:workspace.bzl", benchmark = "repo")
 load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
@@ -305,25 +305,8 @@ def _tf_repositories():
         },
     )
 
-    # We use a vendored wrapper over googletest to provide
-    # ASSERT_OK/EXPECT_OK/ASSERT_OK_AND_ASSIGN macros through gmock/gmock.h.
-    #
-    # Internal gmock includes those macros, but the external one doesn't. This
-    # caused issues where internal builds succeed, but the copybara export to
-    # github doesn't compile because those macros are not defined. The
-    # workaround was to use custom TF_-prefixed variants of those macros.
-    #
-    # This wrapper lets us have the same code work in both by just swapping the
-    # internal header with gmock/gmock.h. This applies to XLA only, not to TF,
-    # so the TF_ macros that are still in use there must stay, and can't just
-    # expand to non-TF_ variants as.
-    tf_vendored(
-        name = "com_google_googletest",
-        path = "third_party/xla_googletest_wrapper",
-    )
-
     tf_http_archive(
-        name = "com_google_googletest_upstream",
+        name = "com_google_googletest",
         # Use the commit on 2025/6/09:
         # https://github.com/google/googletest/commit/28e9d1f26771c6517c3b4be10254887673c94018
         sha256 = "f253ca1a07262f8efde8328e4b2c68979e40ddfcfc001f70d1d5f612c7de2974",
@@ -332,6 +315,8 @@ def _tf_repositories():
         #   - avoid dependencies on @fuchsia_sdk,
         #   - refer to re2 as @com_googlesource_code_re2,
         #   - refer to abseil as @com_google_absl.
+        #   - add status assert macros for consistency with internal gmock (see
+        #     README.add-status-macros.md).
         #
         # To update the patch, run:
         # $ cd ~
@@ -344,7 +329,11 @@ def _tf_repositories():
         # $ git diff > <client-root>/third_party/tensorflow/third_party/googletest/googletest.patch
         #
         # The patch path is relative to third_party/xla.
-        patch_file = ["//third_party/googletest:googletest.patch"],
+        patch_file = [
+            "//third_party/googletest:googletest.patch",
+            "//third_party/googletest:0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch",
+            "//third_party/googletest:0002-Rename-dependencies-for-workspace.bzl-build.patch",
+        ],
         urls = tf_mirror_urls("https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c940189.zip"),
     )
 

From 60787a921bd7cb72eaa44fc674566af7f1c2a62b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 22:53:10 -0800
Subject: [PATCH 104/753] Automated Code Change

PiperOrigin-RevId: 842552120
---
 tensorflow/core/transforms/region_to_functional/impl.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/core/transforms/region_to_functional/impl.cc b/tensorflow/core/transforms/region_to_functional/impl.cc
index 65c37b8b468825..6a494c083394ee 100644
--- a/tensorflow/core/transforms/region_to_functional/impl.cc
+++ b/tensorflow/core/transforms/region_to_functional/impl.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/core/transforms/region_to_functional/impl.h"
 
 #include <cassert>
-#include <cctype>
 #include <optional>
 #include <string>
 #include <tuple>

From 7b4b2a8def12b560a5a570bea0da12f40609ec99 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 9 Dec 2025 23:53:16 -0800
Subject: [PATCH 105/753] Add GitHub Actions job for 8xH100 GPUs.

PiperOrigin-RevId: 842571530
---
 .../xla/.github/workflows/ci_multi_device.yml | 64 +++++++++++++++++++
 third_party/xla/build_tools/ci/build.py       | 45 ++++++++-----
 .../xla/build_tools/ci/golden_commands.txt    |  6 ++
 3 files changed, 100 insertions(+), 15 deletions(-)
 create mode 100644 third_party/xla/.github/workflows/ci_multi_device.yml

diff --git a/third_party/xla/.github/workflows/ci_multi_device.yml b/third_party/xla/.github/workflows/ci_multi_device.yml
new file mode 100644
index 00000000000000..4171626436f600
--- /dev/null
+++ b/third_party/xla/.github/workflows/ci_multi_device.yml
@@ -0,0 +1,64 @@
+# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+name: Multi-Device CI
+permissions:
+  contents: read
+on:
+  workflow_dispatch:  # Allows manual triggering
+
+jobs:
+  Tests:
+    strategy:
+      # Don't fail fast - want to see results for all builds even if one fails.
+      fail-fast: false
+      matrix:
+        job_info: [
+          {
+            pool: "linux-x86-a3-8g-h100-8gpu",
+            container: "us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/ml-build:latest",
+            name: "XLA Linux x86 GPU 8xH100",
+            repo: "openxla/xla",
+          },
+        ]
+    name: ${{ matrix.job_info.name }}
+    runs-on: ${{ matrix.job_info.pool }}
+    container: ${{ matrix.job_info.container }}
+    defaults:
+      run:
+        shell: bash
+    timeout-minutes: 60
+    steps:
+      - name: "Checking out openxla/xla"
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        with:
+          path: "openxla/xla"
+      - name: Checking out ${{ matrix.job_info.repo }}
+        if: ${{ matrix.job_info.repo != 'openxla/xla' }}
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        with:
+          repository: ${{ matrix.job_info.repo }}
+          path: ${{ matrix.job_info.repo }}
+      - name: "Wait For Connection"
+        uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
+        with:
+          halt-dispatch-input: ${{ inputs.halt-for-connection }}
+      - name: "Run build.py"
+        working-directory: ${{ matrix.job_info.repo }}
+        run: |
+          if [[ "${{ matrix.job_info.pool }}" == *windows* ]]; then
+            python $GITHUB_WORKSPACE\\openxla\\xla\\build_tools\\ci\\build.py --build="${{ matrix.job_info.name }}_github_actions"
+          else
+            $GITHUB_WORKSPACE/openxla/xla/build_tools/ci/build.py --build="${{ matrix.job_info.name }}_github_actions"
+          fi
diff --git a/third_party/xla/build_tools/ci/build.py b/third_party/xla/build_tools/ci/build.py
index 0b6e4dbcbf8822..bf4c2071dce9ce 100755
--- a/third_party/xla/build_tools/ci/build.py
+++ b/third_party/xla/build_tools/ci/build.py
@@ -111,6 +111,7 @@ class BuildType(enum.Enum):
   XLA_LINUX_X86_CPU_BZLMOD_GITHUB_ACTIONS = enum.auto()
   XLA_LINUX_ARM64_CPU_GITHUB_ACTIONS = enum.auto()
   XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS = enum.auto()
+  XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS = enum.auto()
   XLA_LINUX_X86_GPU_ONEAPI_GITHUB_ACTIONS = enum.auto()
 
   # Presubmit builds for regression testing.
@@ -281,7 +282,9 @@ def _tag_filters_for_compute_capability(
     "-oneapi-only",
 )
 
-single_nvidia_gpu_filters = nvidia_gpu_filters + ("-multi_gpu",)
+nvidia_single_gpu_filters = nvidia_gpu_filters + ("-multi_gpu",)
+
+nvidia_only_multi_gpu_filters = nvidia_gpu_filters + ("multi_gpu",)
 
 
 def nvidia_gpu_build_with_compute_capability(
@@ -289,15 +292,19 @@ def nvidia_gpu_build_with_compute_capability(
     type_: BuildType,
     configs: Tuple[str, ...],
     compute_capability: int,
+    multi_gpu: bool = False,
 ) -> Build:
   extra_gpu_tags = _tag_filters_for_compute_capability(compute_capability)
+  filter_tags = (
+      nvidia_only_multi_gpu_filters if multi_gpu else nvidia_single_gpu_filters
+  )
   return Build(
       type_=type_,
       repo="openxla/xla",
       target_patterns=_XLA_DEFAULT_TARGET_PATTERNS,
       configs=configs,
-      test_tag_filters=single_nvidia_gpu_filters + extra_gpu_tags,
-      build_tag_filters=single_nvidia_gpu_filters,
+      test_tag_filters=filter_tags + extra_gpu_tags,
+      build_tag_filters=filter_tags,
       options={
           "run_under": "//build_tools/ci:parallel_gpu_execute",
           "//xla/tsl:ci_build": True,
@@ -434,6 +441,14 @@ def nvidia_gpu_build_with_compute_capability(
     type_=BuildType.XLA_LINUX_X86_GPU_L4_GITHUB_ACTIONS,
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
     compute_capability=75,
+    multi_gpu=False,
+)
+
+nvidia_gpu_build_with_compute_capability(
+    type_=BuildType.XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS,
+    configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
+    compute_capability=90,
+    multi_gpu=True,
 )
 
 oneapi_build_tag_filter = (
@@ -508,9 +523,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
-    test_tag_filters=single_nvidia_gpu_filters
+    test_tag_filters=nvidia_single_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=single_nvidia_gpu_filters,
+    build_tag_filters=nvidia_single_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -528,9 +543,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
-    test_tag_filters=single_nvidia_gpu_filters
+    test_tag_filters=nvidia_single_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=single_nvidia_gpu_filters,
+    build_tag_filters=nvidia_single_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -549,9 +564,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=single_nvidia_gpu_filters
+    test_tag_filters=nvidia_single_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=single_nvidia_gpu_filters,
+    build_tag_filters=nvidia_single_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -569,9 +584,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=("warnings", "rbe_linux_cuda_nvcc", "hermetic_cuda_umd"),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=single_nvidia_gpu_filters
+    test_tag_filters=nvidia_single_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=75),
-    build_tag_filters=single_nvidia_gpu_filters,
+    build_tag_filters=nvidia_single_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         "//xla/tsl:ci_build": True,
@@ -590,9 +605,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=(),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=single_nvidia_gpu_filters
+    test_tag_filters=nvidia_single_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=100),
-    build_tag_filters=single_nvidia_gpu_filters,
+    build_tag_filters=nvidia_single_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         # Use User Mode and Kernel Mode Drivers pre-installed on the system.
@@ -613,9 +628,9 @@ def nvidia_gpu_build_with_compute_capability(
     repo="openxla/xla",
     configs=(),
     target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS,
-    test_tag_filters=single_nvidia_gpu_filters
+    test_tag_filters=nvidia_single_gpu_filters
     + _tag_filters_for_compute_capability(compute_capability=100),
-    build_tag_filters=single_nvidia_gpu_filters,
+    build_tag_filters=nvidia_single_gpu_filters,
     options={
         "run_under": "//build_tools/ci:parallel_gpu_execute",
         # Use User Mode and Kernel Mode Drivers pre-installed on the system.
diff --git a/third_party/xla/build_tools/ci/golden_commands.txt b/third_party/xla/build_tools/ci/golden_commands.txt
index e067ee9ecc80dd..3510dd02708fe8 100644
--- a/third_party/xla/build_tools/ci/golden_commands.txt
+++ b/third_party/xla/build_tools/ci/golden_commands.txt
@@ -53,6 +53,12 @@ parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_fi
 bazel test --build_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=nonccl --config=rbe_linux_cpu --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --//xla/tsl:ci_build -- //xla/... //build_tools/... @local_tsl//tsl/...
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_LINUX_X86_CPU_GITHUB_ACTIONS
+# BEGIN BuildType.XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS
+nvidia-smi
+parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu,requires-gpu-sm90-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=9.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/... //build_tools/... @local_tsl//tsl/...
+bazel test --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,multi_gpu,requires-gpu-sm90-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,-requires-gpu-sm100,-requires-gpu-sm100-only,-requires-gpu-amd,-requires-gpu-intel --config=warnings --config=rbe_linux_cuda_nvcc --config=hermetic_cuda_umd --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=9.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async -- //xla/... //build_tools/... @local_tsl//tsl/...
+bazel analyze-profile profile.json.gz
+# END BuildType.XLA_LINUX_X86_GPU_8X_H100_GITHUB_ACTIONS
 # BEGIN BuildType.XLA_LINUX_X86_GPU_A4_224_VCPU_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS
 nvidia-smi
 parallel --ungroup --retries 3 --delay 15 --nonall -- bazel build --build_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu --test_tag_filters=-no_oss,requires-gpu-nvidia,gpu,-rocm-only,-oneapi-only,-multi_gpu,requires-gpu-sm100-only,requires-gpu-sm60,requires-gpu-sm70,requires-gpu-sm80,requires-gpu-sm90,requires-gpu-sm100,-requires-gpu-amd,-requires-gpu-intel --repo_env=TF_CUDA_COMPUTE_CAPABILITIES=10 --repo_env=HERMETIC_CUDA_VERSION=12.8.0 --repo_env=HERMETIC_CUDNN_VERSION=9.8.0 --run_under=//build_tools/ci:parallel_gpu_execute --//xla/tsl:ci_build --@local_config_cuda//cuda:include_cuda_libs=False --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --nobuild -- //xla/tools/multihost_hlo_runner:hlo_runner_main_gpu //xla/tools:compute_xspace_stats_main_gpu

From 12a199ba96c18a55b813e4f5130bcbb780585913 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Tue, 9 Dec 2025 23:54:59 -0800
Subject: [PATCH 106/753] Update XNNPACK in XLA

PiperOrigin-RevId: 842571958
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake | 2 +-
 tensorflow/workspace2.bzl                         | 6 +++---
 third_party/xla/third_party/xnnpack/workspace.bzl | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index 4199e85da146ff..b28d5c5b01c1ed 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG 085272364b1e8168a82994296994d9b02444e82a
+  GIT_TAG dc05a09f076534ce56c6f5b82a0327850c66bf3c
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index c8bea5feba54fb..13be76236cb855 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,9 +168,9 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "9ff5f0631970f3393522e2fb0b882c7cabc44c76f957d257b507f47611e2df47",
-        strip_prefix = "XNNPACK-085272364b1e8168a82994296994d9b02444e82a",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/085272364b1e8168a82994296994d9b02444e82a.zip"),
+        sha256 = "7480edcb300368d5516b583d6312b596cd8c23395c214bb786ec2a1e09eb6b4b",
+        strip_prefix = "XNNPACK-dc05a09f076534ce56c6f5b82a0327850c66bf3c",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/dc05a09f076534ce56c6f5b82a0327850c66bf3c.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index f7dc6428146b0a..d4696680d3f47b 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "9ff5f0631970f3393522e2fb0b882c7cabc44c76f957d257b507f47611e2df47",
-        strip_prefix = "XNNPACK-085272364b1e8168a82994296994d9b02444e82a",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/085272364b1e8168a82994296994d9b02444e82a.zip"),
+        sha256 = "7480edcb300368d5516b583d6312b596cd8c23395c214bb786ec2a1e09eb6b4b",
+        strip_prefix = "XNNPACK-dc05a09f076534ce56c6f5b82a0327850c66bf3c",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/dc05a09f076534ce56c6f5b82a0327850c66bf3c.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)

From c2dc0eef19e82b78d1894fc28a5296517bacb080 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 00:27:48 -0800
Subject: [PATCH 107/753] Automated Code Change

PiperOrigin-RevId: 842584191
---
 third_party/xla/xla/backends/gpu/autotuner/BUILD     | 12 ++++++++++++
 third_party/xla/xla/backends/gpu/autotuner/cudnn.cc  |  1 +
 .../xla/xla/backends/gpu/autotuner/factory_cuda.cc   |  1 +
 .../xla/xla/backends/gpu/autotuner/factory_rocm.cc   |  1 +
 .../xla/xla/backends/gpu/autotuner/fission_backend.h |  1 +
 .../backends/gpu/autotuner/fission_backend_test.cc   |  1 +
 .../gpu/autotuner/gpu_codegen_backend_test.cc        |  1 +
 .../xla/backends/gpu/autotuner/gpu_profiler_test.cc  |  1 +
 .../xla/xla/backends/gpu/autotuner/legacy_cache.cc   |  2 ++
 .../xla/backends/gpu/autotuner/legacy_cache_test.cc  |  2 ++
 third_party/xla/xla/backends/gpu/autotuner/miopen.cc |  1 +
 .../backends/gpu/autotuner/native_emitter_test.cc    |  1 +
 12 files changed, 25 insertions(+)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
index 662e3f7e03ddf5..eb0a0060f8125e 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -40,6 +40,7 @@ xla_cc_test(
     srcs = ["gpu_codegen_backend_test.cc"],
     deps = [
         ":gpu_codegen_backend",
+        "//xla:xla_proto_cc",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -284,6 +285,7 @@ cc_library(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:util",
+        "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/ir:hlo",
@@ -569,6 +571,7 @@ xla_test(
     ],
     deps = [
         ":native_emitter",
+        "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
@@ -603,6 +606,7 @@ cc_library(
         ":factory",
         ":fission_backend",
         ":triton",
+        "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/service:compiler",
@@ -627,6 +631,7 @@ cc_library(
         ":cublas",
         ":factory",
         ":triton",
+        "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/analysis:symbolic_expr",
         "//xla/service:compiler",
@@ -659,6 +664,7 @@ xla_test(
         "//xla:literal",
         "//xla:literal_util",
         "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
         "//xla/backends/autotuner:profiler",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
@@ -691,6 +697,7 @@ cc_library(
     srcs = ["legacy_cache.cc"],
     hdrs = ["legacy_cache.h"],
     deps = [
+        "//xla:autotune_results_proto_cc",
         "//xla:autotuning_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:autotuner_cache_interface",
@@ -715,6 +722,7 @@ cc_library(
     hdrs = ["fission_backend.h"],
     deps = [
         ":gpu_codegen_backend",
+        "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
@@ -744,6 +752,7 @@ xla_test(
         ":custom_kernel",
         ":fission_backend",
         ":gpu_codegen_backend",
+        "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
@@ -774,6 +783,7 @@ cc_library(
         "//xla:autotuning_proto_cc",
         "//xla:literal_util",
         "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/hlo/ir:hlo",
@@ -794,7 +804,9 @@ xla_cc_test(
     srcs = ["legacy_cache_test.cc"],
     deps = [
         ":legacy_cache",
+        "//xla:autotuning_proto_cc",
         "//xla:literal_util",
+        "//xla:xla_proto_cc",
         "//xla/backends/autotuner:autotuner_cache_interface",
         "//xla/backends/autotuner:autotuner_cache_proto_cc",
         "//xla/hlo/ir:hlo",
diff --git a/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc b/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc
index c5c9111d0eac80..f2d79b13a17413 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc
@@ -53,6 +53,7 @@ limitations under the License.
 #include "xla/tsl/protobuf/dnn.pb.h"
 #include "xla/util.h"
 #include "xla/xla.pb.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
index 1b4f73b9a7c2ec..d9e9130f8f96e3 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/platform/platform_object_registry.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
index 602aa66217ca2b..e327f6abdde0e0 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "xla/stream_executor/platform/platform_object_registry.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/fission_backend.h b/third_party/xla/xla/backends/gpu/autotuner/fission_backend.h
index 01f42446ec07d6..edc5c814af89b2 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/fission_backend.h
+++ b/third_party/xla/xla/backends/gpu/autotuner/fission_backend.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/service/compiler.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/xla.pb.h"
 
 namespace xla::gpu {
 
diff --git a/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc b/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc
index f6ec44d6aaf755..a9b0ecaaca050b 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc
@@ -44,6 +44,7 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/gpu_codegen_backend_test.cc b/third_party/xla/xla/backends/gpu/autotuner/gpu_codegen_backend_test.cc
index 7b08c0f0637cbf..9e9ddf541aa602 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/gpu_codegen_backend_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/gpu_codegen_backend_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/backends/gpu/autotuner/gpu_codegen_backend.h"
 
 #include <gtest/gtest.h>
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc b/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc
index 2bd462e10074d9..5c3025ba233048 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/gpu_profiler_test.cc
@@ -50,6 +50,7 @@ limitations under the License.
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/legacy_cache.cc b/third_party/xla/xla/backends/gpu/autotuner/legacy_cache.cc
index 79242cf983550c..18323dc1ceef2b 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/legacy_cache.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/legacy_cache.cc
@@ -24,6 +24,8 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "xla/autotune_results.pb.h"
+#include "xla/autotuning.pb.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/service/gpu/autotuning/autotune_cache_key.h"
diff --git a/third_party/xla/xla/backends/gpu/autotuner/legacy_cache_test.cc b/third_party/xla/xla/backends/gpu/autotuner/legacy_cache_test.cc
index 6c2744e0bd026f..13c0e93baa1313 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/legacy_cache_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/legacy_cache_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
+#include "xla/autotuning.pb.h"
 #include "xla/backends/autotuner/autotuner_cache.pb.h"
 #include "xla/backends/autotuner/autotuner_cache_interface.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -37,6 +38,7 @@ limitations under the License.
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/protobuf/dnn.pb.h"
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/miopen.cc b/third_party/xla/xla/backends/gpu/autotuner/miopen.cc
index c4e0872244c092..636be11815a4d0 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/miopen.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/miopen.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/protobuf/dnn.pb.h"
 #include "xla/xla.pb.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc b/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc
index 94b0d741c01ad5..bedf1bb18be870 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla.pb.h"
 
 namespace xla {
 namespace gpu {

From 1144cc69a2b8caa8f2a5f99cd29cc22882fc73ff Mon Sep 17 00:00:00 2001
From: TJ Xu <tjx@nvidia.com>
Date: Wed, 10 Dec 2025 00:42:16 -0800
Subject: [PATCH 108/753] PR #26196: force delay scheduling start to extend
 overlap interval
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/26196

📝 Summary of Changes
This pr introduces a new heuristics to delay scheduling start based on the op type when the overlap limit is larger than 1 and default cost model is used.

🎯 Justification
The goal is to extend the overlap interval as much as possible to include more computes and provide the best out of the box scheduling.
🚀 Kind of Contribution
 ⚡️ Performance Improvement

📊 Benchmark (for Performance Improvements)
hlo_llama31_8b_bf16_1x8.hlo -> 1359ms -> 1318.3ms
hlo_llama31_8b_bf16_2x8.hlo -> 1366ms -> 1321.4ms
hlo_llama31_8b_fp8_1x8.hlo -> 1123ms -> 1089ms
hlo_llama31_8b_fp8_2x8.hlo -> 1141ms -> 1.95.5ms
🧪 Unit Tests:
Added unit tests

🧪 Execution Tests:
included

Copybara import of the project:

--
28d5f58a06b1725fafd981f22fe4b72680c34b0c by TJ Xu <tjx@nvidia.com>:

Add a new LHS config to priotitize computes nodes over collective starts
when overlap limit is larger than 1 nad LHS is using default cost model

--
f40aed823cea4bf190fe96860ea59db24e6953a7 by TJ Xu <tjx@nvidia.com>:

refactor the local lambda in readysetlt to be a proper function

--
4ec19f629ce116a92b36e06505e1c0df51369d13 by TJ Xu <tjx@nvidia.com>:

updated the unit test to be more specific

Merging this change closes #26196

PiperOrigin-RevId: 842588981
---
 .../xla/xla/service/gpu/gpu_hlo_schedule.cc   |  7 +-
 .../gpu/gpu_latency_hiding_scheduler_test.cc  | 86 +++++++++++++++++++
 .../xla/service/latency_hiding_scheduler.cc   | 33 +++++++
 .../xla/service/latency_hiding_scheduler.h    |  3 +
 4 files changed, 128 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
index 2afc4787298d11..304da6a2627c0f 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
@@ -582,7 +582,12 @@ absl::Status RunLatencyHidingSchedulerPasses(
     pipeline.AddPass<PGLEAccuracyChecker>(
         dynamic_cast<ProfileGuidedLatencyEstimator&>(*estimator));
   }
-
+  // If overlap limit is set to be greater than 1 and the default t-short size
+  // estimator is used we will tell LHS to extend async-done intervals as much
+  // as possible to start collectives as early as possible.
+  if (config.parallel_collective_overlap_limit > 1) {
+    config.prioritize_compute_over_async_start = true;
+  }
   auto async_tracker = std::make_unique<GpuAsyncTracker>(config);
 
   std::shared_ptr<const SchedulingContext> scheduling_context =
diff --git a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
index 28eabcb1cd7680..ac1185896e2ead 100644
--- a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
@@ -75,6 +75,8 @@ class GpuLatencyHidingSchedulerBaseTest
     DebugOptions& options = module->mutable_config().mutable_debug_options();
     options.set_xla_gpu_experimental_parallel_collective_overlap_limit(
         num_parallel_resources);
+    options.set_xla_gpu_enable_analytical_sol_latency_estimator(false);
+
     options.set_xla_gpu_pgle_accuracy_checker(strictness);
 
     TF_RETURN_IF_ERROR(ScheduleGpuModule(module, /*pointer_size=*/8,
@@ -1041,5 +1043,89 @@ TEST_F(GpuLatencyHidingSchedulerBaseTest, ParallelThreadsShouldBeScheduled) {
   TF_EXPECT_OK(ScheduleModule(module.get()));
 }
 
+TEST_F(GpuLatencyHidingSchedulerBaseTest,
+       MultipleParallelAsyncsExtendedOverAllComputes) {
+  absl::string_view kHloModule = R"(
+HloModule m
+reduce {
+x = f32[] parameter(0)
+y = f32[] parameter(1)
+ROOT _ = f32[] add(x, y)
+}
+ENTRY main {
+p0 = f32[] parameter(0)
+p1 = f32[2] parameter(1)
+p2 = f32[2] parameter(2)
+p3 = f32[2] parameter(3)
+p4 = f32[2] parameter(4)
+p5 = f32[2] parameter(5)
+p6 = f32[2] parameter(6)
+ar_0 = f32[] all-reduce-start(p0), to_apply=reduce
+ar_1 = f32[] all-reduce-done(ar_0)
+add_2 = f32[2] add(p1, p6)
+
+ar_2 = f32[2] all-reduce-start(add_2), to_apply=reduce
+ar_3 = f32[2] all-reduce-done(ar_2)
+add_3 = f32[2] add(p1, p3)
+
+rs_0 = ((f32[2]), f32[1]) reduce-scatter-start(add_3), to_apply=reduce,
+dimensions={0}
+rs_1 = f32[1] reduce-scatter-done(rs_0)
+add_0 = f32[2] add(p1, p2)
+div_0 = f32[2] divide(p3, p4)
+mul_0 = f32[2] multiply(p4, p5)
+ROOT _ = (f32[], f32[2], f32[1], f32[2], f32[2], f32[2]) tuple(ar_1, ar_3, rs_1, add_0, div_0, mul_0)
+}
+)";
+  absl::string_view kFdoProfile = "";
+
+  auto config = GetModuleConfig(kFdoProfile);
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kHloModule, config));
+
+  TF_EXPECT_OK(ScheduleModule(module.get(), /*num_parallel_resources=*/16));
+  auto schedule = module->schedule();
+  std::vector<HloInstruction*> instruction_sequence =
+      schedule.sequence(module->entry_computation()).instructions();
+  // With a lot of parallel resources and default latency estimator,
+  // LHS will try to extend all asyncs as much as possible.
+  // We expect all computes to be wrapped within all async start-done
+  // intervals.
+  EXPECT_TRUE(GetIndexByName(instruction_sequence, "add_2") >
+                  GetIndexByName(instruction_sequence, "ar_0") &&
+              GetIndexByName(instruction_sequence, "add_3") >
+                  GetIndexByName(instruction_sequence, "ar_0") &&
+              GetIndexByName(instruction_sequence, "add_2") <
+                  GetIndexByName(instruction_sequence, "ar_1") &&
+              GetIndexByName(instruction_sequence, "add_3") <
+                  GetIndexByName(instruction_sequence, "ar_1"));
+
+  EXPECT_TRUE(GetIndexByName(instruction_sequence, "add_0") >
+                  GetIndexByName(instruction_sequence, "ar_0") &&
+              GetIndexByName(instruction_sequence, "add_0") >
+                  GetIndexByName(instruction_sequence, "rs_0") &&
+              GetIndexByName(instruction_sequence, "add_0") <
+                  GetIndexByName(instruction_sequence, "ar_1") &&
+              GetIndexByName(instruction_sequence, "add_0") <
+                  GetIndexByName(instruction_sequence, "rs_1"));
+
+  EXPECT_TRUE(GetIndexByName(instruction_sequence, "div_0") >
+                  GetIndexByName(instruction_sequence, "ar_0") &&
+              GetIndexByName(instruction_sequence, "div_0") >
+                  GetIndexByName(instruction_sequence, "rs_0") &&
+              GetIndexByName(instruction_sequence, "div_0") <
+                  GetIndexByName(instruction_sequence, "ar_1") &&
+              GetIndexByName(instruction_sequence, "div_0") <
+                  GetIndexByName(instruction_sequence, "rs_1"));
+  EXPECT_TRUE(GetIndexByName(instruction_sequence, "mul_0") >
+                  GetIndexByName(instruction_sequence, "ar_0") &&
+              GetIndexByName(instruction_sequence, "mul_0") >
+                  GetIndexByName(instruction_sequence, "rs_0") &&
+              GetIndexByName(instruction_sequence, "mul_0") <
+                  GetIndexByName(instruction_sequence, "ar_1") &&
+              GetIndexByName(instruction_sequence, "mul_0") <
+                  GetIndexByName(instruction_sequence, "rs_1"));
+}
+
 }  // namespace
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.cc b/third_party/xla/xla/service/latency_hiding_scheduler.cc
index 62995d048976f9..f7500d9a46267e 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.cc
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.cc
@@ -1262,6 +1262,29 @@ class ReadySetLt {
     return std::nullopt;
   }
 
+  inline std::optional<bool> DelayAsyncStartCandidateCondition(
+      DefaultSchedulerCore::ScheduleCandidate& a,
+      DefaultSchedulerCore::ScheduleCandidate& b, const HloGraphNode* a_node,
+      const HloGraphNode* b_node, const char** reason) const {
+    bool a_has_async_resource =
+        a_node->DoesReleaseAnyResource() && !IsResourceConstrained(a, a_node);
+    bool b_has_async_resource =
+        b_node->DoesReleaseAnyResource() && !IsResourceConstrained(b, b_node);
+
+    CMP_EXPLICIT(!a_has_async_resource, !b_has_async_resource,
+                 "kDelayAsyncStartForCompute");
+    if (a_has_async_resource && b_has_async_resource) {
+      // If 2 nodes are both async nodes, we prioritize the one
+      // with more depth to free up more computes to overlap
+      // with the one with less depth which can be launched
+      // early
+      CMP_EXPLICIT(a_node->GetDepth() > b_node->GetDepth(),
+                   b_node->GetDepth() > a_node->GetDepth(),
+                   "kDelayAsyncStartForDepth");
+    }
+    return std::nullopt;
+  }
+
   // The comparison here implements the priority for the nodes in the ready
   // set. The function compares a and b in a series of prioritized
   // comparisons. As soon as it finds one that is not equal, it stops.  If
@@ -1371,6 +1394,16 @@ class ReadySetLt {
                    AsyncDepth0CandidateCondition(b, bn), "kStartAtZeroDepth");
     }
 
+    if (sched_state_.config.aggressive_scheduling_policies &&
+        sched_state_.config.prioritize_compute_over_async_start) {
+      // If an instruction releasing a resource is not resource constrained,
+      // delay it as much as possible.
+      if (auto value =
+              DelayAsyncStartCandidateCondition(a, b, an, bn, reason)) {
+        return *value;
+      }
+    }
+
     auto a_readytime = an->GetReadyTime();
     auto b_readytime = bn->GetReadyTime();
     if (a_readytime != b_readytime) {  // Quick test to avoid lots of work
diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.h b/third_party/xla/xla/service/latency_hiding_scheduler.h
index 01630bbaa5bf5e..2930dfd2277e3d 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.h
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.h
@@ -149,6 +149,9 @@ struct SchedulerConfig {
   bool use_real_cost_model = false;
   bool aggressive_scheduling_policies = false;
   bool prioritize_async_depth_over_stall = false;
+
+  bool prioritize_compute_over_async_start = false;
+
   bool enable_release_start_policy = false;
   bool resource_sharing = false;
   bool resource_serializing = false;

From de465e9c971a1252b418d82b6e979a84766098f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 01:04:07 -0800
Subject: [PATCH 109/753] Update GraphDef version to 2437.

PiperOrigin-RevId: 842597261
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index b483429b89ccd9..71f77a1df57898 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2436  // Updated: 2025/12/9
+#define TF_GRAPH_DEF_VERSION 2437  // Updated: 2025/12/10
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From f7d6fd10889fda2c84e57d4bb92aa5728cf04b40 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 01:04:11 -0800
Subject: [PATCH 110/753] compat: Update forward compatibility horizon to
 2025-12-10

PiperOrigin-RevId: 842597290
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 949094ad18d927..8f5d395630c3fe 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 9)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 10)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From b6ff99a1fd28a12a93e9243cf0df318a24401c1b Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 10 Dec 2025 01:51:40 -0800
Subject: [PATCH 111/753] Reverts 0dc10cd4e296a41f229a8b9196e127c7576ed30f

PiperOrigin-RevId: 842614793
---
 third_party/xla/xla/debug_options_flags.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index e7473940048f0a..fc1f41ea91152e 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -443,7 +443,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_enable_fast_math(false);
   opts.set_xla_gpu_experimental_parallel_collective_overlap_limit(1);
   opts.set_xla_pjrt_allow_auto_layout_in_hlo(false);
-  opts.set_xla_gpu_enable_scatter_determinism_expander(true);
+  opts.set_xla_gpu_enable_scatter_determinism_expander(false);
   opts.set_xla_gpu_unsupported_enable_ragged_all_to_all_decomposer(false);
   opts.set_xla_gpu_unsupported_use_all_reduce_one_shot_kernel(false);
   opts.set_xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel(true);

From 30f2159b4beaf0d4f4d2f8c9fb99e4d5d71ca426 Mon Sep 17 00:00:00 2001
From: Will Froom <willfroom@google.com>
Date: Wed, 10 Dec 2025 02:15:23 -0800
Subject: [PATCH 112/753] [XLA:CPU] Lower unsupported vector lengths via scalar
 approximations.

This is needed to get the same numerics as the scalar loop emitters as some vector lengths are not supported. It isn't optimal but it works.

PiperOrigin-RevId: 842623172
---
 .../transforms/lower_xla_intrinsic_lib.cc     | 43 +++++++++++++++----
 .../tests/lower_xla_intrinsic_lib.mlir        | 18 +++++++-
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc
index 95b6fe5737f916..a42af497a0d317 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc
@@ -213,27 +213,52 @@ class LowerIntrinsicPattern : public mlir::OpRewritePattern<Op> {
 
   mlir::LogicalResult matchAndRewrite(
       Op op, mlir::PatternRewriter& rewriter) const override {
-    if (auto vec_type = mlir::dyn_cast<mlir::VectorType>(op.getType());
-        vec_type && vec_type.getRank() != 1) {
+    auto vec_type = mlir::dyn_cast<mlir::VectorType>(op.getType());
+    if (vec_type && vec_type.getRank() != 1) {
       // These will later be converted to loops of 1D vectors but will then miss
       // the XLA intrinsic lowering.
       op->emitWarning() << "Missed XLA intrinsic lowering as vector rank != 1.";
       return rewriter.notifyMatchFailure(op, "Vector rank is not 1.");
     }
     Type type = Type::TypeFromIrType(op.getType());
+    Type scalar_type =
+        Type::TypeFromIrType(mlir::getElementTypeOrSelf(op.getType()));
     mlir::StringAttr features =
         module_op_->getAttrOfType<mlir::StringAttr>("mhlo.cpu_features");
     const std::string features_str = !features ? "" : features.getValue().str();
-    if (!Intrinsic::IsSupported(features_str, type)) {
+    bool is_supported = Intrinsic::IsSupported(features_str, type);
+    bool scalar_supported = Intrinsic::IsSupported(features_str, scalar_type);
+    if (!is_supported && !scalar_supported) {
       return rewriter.notifyMatchFailure(op, "unsupported type");
     }
-    mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
 
-    auto intrinsic_decl =
-        Intrinsic::GetOrInsertDeclaration(rewriter, module_op_, type);
-    auto call_op =
-        b.create<mlir::func::CallOp>(intrinsic_decl, op.getOperand());
-    rewriter.replaceOp(op, call_op->getResults());
+    if (is_supported) {
+      auto intrinsic_decl =
+          Intrinsic::GetOrInsertDeclaration(rewriter, module_op_, type);
+      rewriter.replaceOpWithNewOp<mlir::func::CallOp>(op, intrinsic_decl,
+                                                      op.getOperand());
+    } else {
+      // If the element type is supported but not the vector type, then we
+      // decompose the vector op into a sequence of scalar ops. This is not
+      // optimal in that we could split into the largest possible supported
+      // vectorized ops, but it works for now.
+      auto intrinsic_decl =
+          Intrinsic::GetOrInsertDeclaration(rewriter, module_op_, scalar_type);
+
+      llvm::SmallVector<mlir::Value> scalar_results;
+      scalar_results.reserve(vec_type.getNumElements());
+      for (int64_t idx = 0; idx != vec_type.getNumElements(); ++idx) {
+        mlir::Value scalar_value = mlir::vector::ExtractOp::create(
+            rewriter, op.getLoc(), op.getOperand(), idx);
+        mlir::Value scalar_result =
+            mlir::func::CallOp::create(rewriter, op.getLoc(), intrinsic_decl,
+                                       scalar_value)
+                .getResult(0);
+        scalar_results.push_back(scalar_result);
+      }
+      rewriter.replaceOpWithNewOp<mlir::vector::FromElementsOp>(op, vec_type,
+                                                                scalar_results);
+    }
     return mlir::success();
   }
 
diff --git a/third_party/xla/xla/codegen/emitters/transforms/tests/lower_xla_intrinsic_lib.mlir b/third_party/xla/xla/codegen/emitters/transforms/tests/lower_xla_intrinsic_lib.mlir
index 24a772c5580f12..937ac8206ddde1 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/tests/lower_xla_intrinsic_lib.mlir
+++ b/third_party/xla/xla/codegen/emitters/transforms/tests/lower_xla_intrinsic_lib.mlir
@@ -146,4 +146,20 @@ module {
 // CHECK-LABEL: @local_xla.rsqrt.f64
 // CHECK-NOT: math.rsqrt
 // CHECK: %[[RSQRT_CALL:.*]] = call @local_xla.rsqrt.f64
-// CHECK: return %[[RSQRT_CALL]]
\ No newline at end of file
+// CHECK: return %[[RSQRT_CALL]]
+
+// -----
+
+// Use a vector length of 3 as we know that will never be supported.
+func.func @rsqrt_unsupported_vector_size(%arg0: vector<3xf32>) -> vector<3xf32> {
+  // CHECK: %[[IN0:.*]] = vector.extract %arg0[0]
+  // CHECK: %[[RSQRT0:.*]] = call @local_xla.rsqrt.f32(%[[IN0]])
+  // CHECK: %[[IN1:.*]] = vector.extract %arg0[1]
+  // CHECK: %[[RSQRT1:.*]] = call @local_xla.rsqrt.f32(%[[IN1]])
+  // CHECK: %[[IN2:.*]] = vector.extract %arg0[2]
+  // CHECK: %[[RSQRT2:.*]] = call @local_xla.rsqrt.f32(%[[IN2]])
+  // CHECK: %[[RESULT:.*]] = vector.from_elements %[[RSQRT0]], %[[RSQRT1]], %[[RSQRT2]]
+  %ret = math.rsqrt %arg0 : vector<3xf32>
+  // CHECK: return %[[RESULT]]
+  return %ret : vector<3xf32>
+}

From 312d57fd897dfd02c90b767552c1403b5313ab94 Mon Sep 17 00:00:00 2001
From: Sai Ganesh Muthuraman <saiganeshm@google.com>
Date: Wed, 10 Dec 2025 02:36:19 -0800
Subject: [PATCH 113/753] Follow replacement chains when setting deduplicated
 names for HLO OpMetadata

PiperOrigin-RevId: 842629932
---
 third_party/xla/xla/hlo/ir/hlo_module.cc      | 33 +++++++-----
 .../hlo_computation_deduplicator_test.cc      | 50 ++++++++++++++++++-
 2 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_module.cc b/third_party/xla/xla/hlo/ir/hlo_module.cc
index fe9502af64f15c..d67349ea44e393 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_module.cc
@@ -267,20 +267,27 @@ void HloModule::MarkFusionDuplications(
     const {
   for (const HloComputation* computation : computations()) {
     for (auto* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kFusion) {
-        auto rep =
-            replacements.find(instruction->fused_instructions_computation());
-        if (rep != replacements.end()) {
-          xla::HloComputation* new_comp = rep->second;
-          if (new_comp->IsFusionComputation()) {
-            auto dedup_name = new_comp->FusionInstruction()->name();
-            new_comp->FusionInstruction()->set_metadata_deduplicated_name(
-                std::string(dedup_name));
-            instruction->set_metadata_deduplicated_name(
-                std::string(dedup_name));
-          }
-        }
+      if (instruction->opcode() != HloOpcode::kFusion) {
+        continue;
+      }
+      auto it =
+          replacements.find(instruction->fused_instructions_computation());
+      if (it == replacements.end()) {
+        continue;
+      }
+      HloComputation* representative = it->second;
+      // Follow chain to find the root representative.
+      for (auto it2 = replacements.find(representative);
+           it2 != replacements.end(); it2 = replacements.find(representative)) {
+        representative = it2->second;
+      }
+      if (!representative->IsFusionComputation()) {
+        continue;
       }
+      std::string dedup_name(representative->FusionInstruction()->name());
+      representative->FusionInstruction()->set_metadata_deduplicated_name(
+          dedup_name);
+      instruction->set_metadata_deduplicated_name(dedup_name);
     }
   }
 }
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/hlo_computation_deduplicator_test.cc b/third_party/xla/xla/hlo/transforms/simplifiers/hlo_computation_deduplicator_test.cc
index 17a074406e1bf0..9e0b3c860400f9 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/hlo_computation_deduplicator_test.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/hlo_computation_deduplicator_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/literal_util.h"
@@ -663,5 +664,52 @@ TEST_F(HloComputationDeduplicatorTest, DontDeduplicateReduceAllReduce) {
   EXPECT_EQ(computation_names.size(), 3);
 }
 
-}  //  namespace
+TEST_F(HloComputationDeduplicatorTest, DeduplicateChain) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+HloModule module
+
+fusion0 {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  ROOT add = f32[] add(p0, p1)
+}
+
+fusion1 {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  ROOT add = f32[] add(p0, p1)
+}
+
+fusion2 {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  ROOT add = f32[] add(p0, p1)
+}
+
+ENTRY entry {
+  p0 = f32[] parameter(0)
+  p1 = f32[] parameter(1)
+  fusion.2 = f32[] fusion(p0, p1), kind=kLoop, calls=fusion2
+  fusion.1 = f32[] fusion(fusion.2, p1), kind=kLoop, calls=fusion1
+  fusion.0 = f32[] fusion(fusion.1, p1), kind=kLoop, calls=fusion0
+  ROOT add = f32[] add(fusion.0, p1)
+}
+)"));
+  HloComputationDeduplicator dedup(/*mark_fusion_duplications=*/true);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, dedup.Run(module.get()));
+  EXPECT_TRUE(changed);
+  EXPECT_EQ(module->computation_count(), 4);
+  HloInstruction* fusion0 =
+      module->entry_computation()->GetInstructionWithName("fusion.0");
+  HloInstruction* fusion1 =
+      module->entry_computation()->GetInstructionWithName("fusion.1");
+  HloInstruction* fusion2 =
+      module->entry_computation()->GetInstructionWithName("fusion.2");
+  EXPECT_EQ(fusion0->metadata().deduplicated_name(), "fusion.0");
+  EXPECT_EQ(fusion1->metadata().deduplicated_name(), "fusion.0");
+  EXPECT_EQ(fusion2->metadata().deduplicated_name(), "fusion.0");
+}
+
+}  // namespace
 }  //  namespace xla

From c8d46ebb0d75b83233fdb5905158ad726210d626 Mon Sep 17 00:00:00 2001
From: Ville Vesilehto <ville@vesilehto.fi>
Date: Wed, 10 Dec 2025 03:12:48 -0800
Subject: [PATCH 114/753] PR #35048: fix: cap shape size in TextLiteralReader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35048

📝 Summary of Changes

Add size check in TextLiteralReader before allocation to avoid null-dereference crash when parsing shapes with extremely large dimensions.

Cap at `int32_t` maximum (around 2 GB), which should be sufficient for this text-based format.

Fixes OSS-Fuzz finding [#428771909](https://issues.oss-fuzz.com/issues/428771909)and probably [#429123948](https://issues.oss-fuzz.com/issues/429123948) from the Tensorflow fuzzer.

🎯 Justification

This change prevents crashes from unbounded memory allocation triggered by malformed input files.

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
Not relevant.

🧪 Unit Tests:

Added a unit test from the OSS-Fuzz reproducer test case. Test command:

```
docker exec xla bazel test \
  --spawn_strategy=sandboxed \                                                                                                                                                           --test_output=all \
  //xla:text_literal_reader_test
```

Output:

```
==================== Test output for //xla:text_literal_reader_test:
Note: Randomizing tests' orders with a seed of 8787 .
[==========] Running 4 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 4 tests from TextLiteralReaderTest
[ RUN      ] TextLiteralReaderTest.MissingColonReturnsInvalidArgument
[       OK ] TextLiteralReaderTest.MissingColonReturnsInvalidArgument (32 ms)
[ RUN      ] TextLiteralReaderTest.ShapeTooLargeReturnsResourceExhausted
[       OK ] TextLiteralReaderTest.ShapeTooLargeReturnsResourceExhausted (1 ms)
[ RUN      ] TextLiteralReaderTest.WhitespaceOnlyLineAfterShapeDoesNotCrashAndYieldsScalarNaN
[       OK ] TextLiteralReaderTest.WhitespaceOnlyLineAfterShapeDoesNotCrashAndYieldsScalarNaN (1 ms)
[ RUN      ] TextLiteralReaderTest.ReadsR3File
[       OK ] TextLiteralReaderTest.ReadsR3File (0 ms)
[----------] 4 tests from TextLiteralReaderTest (36 ms total)

[----------] Global test environment tear-down
[==========] 4 tests from 1 test suite ran. (39 ms total)
[  PASSED  ] 4 tests.
================================================================================
//xla:text_literal_reader_test                                  (cached) PASSED in 0.4s
```

🧪 Execution Tests:
Not relevant.

Copybara import of the project:

--
89453e789c7713423429070ad460670f0a6d7039 by Ville Vesilehto <ville@vesilehto.fi>:

fix: cap shape size in TextLiteralReader

Add size check before allocation to avoid null-dereference crash when
parsing shapes with extremely large dimensions. Cap at int32_t
maximum (around 2 GB), which should be sufficient for this
text-based format.

Fixes OSS-Fuzz finding 428771909.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

Merging this change closes #35048

PiperOrigin-RevId: 842640535
---
 third_party/xla/xla/text_literal_reader.cc      | 11 +++++++++++
 third_party/xla/xla/text_literal_reader_test.cc | 11 +++++++++++
 2 files changed, 22 insertions(+)

diff --git a/third_party/xla/xla/text_literal_reader.cc b/third_party/xla/xla/text_literal_reader.cc
index c1641b2239b482..8acf1307c678eb 100644
--- a/third_party/xla/xla/text_literal_reader.cc
+++ b/third_party/xla/xla/text_literal_reader.cc
@@ -73,6 +73,17 @@ absl::StatusOr<Literal> TextLiteralReader::ReadAllLines() {
 
   absl::StripAsciiWhitespace(&shape_string);
   TF_ASSIGN_OR_RETURN(Shape shape, ParseShape(shape_string));
+
+  // Sanity check to reject shapes that are obviously too large. This doesn't
+  // guarantee allocation will succeed, but prevents crashes from absurdly
+  // large sizes (e.g., from fuzz testing).
+  constexpr int64_t kMaxSupportedBytes = std::numeric_limits<int32_t>::max();
+  int64_t byte_size = ShapeUtil::ByteSizeOf(shape);
+  if (byte_size < 0 || byte_size > kMaxSupportedBytes) {
+    return ResourceExhausted("Shape %s requires too much memory (%d bytes)",
+                             ShapeUtil::HumanString(shape), byte_size);
+  }
+
   if (shape.element_type() != F32) {
     return Unimplemented(
         "unsupported element type for text literal reading: %s",
diff --git a/third_party/xla/xla/text_literal_reader_test.cc b/third_party/xla/xla/text_literal_reader_test.cc
index b7414c87d734bd..d57b9a06c2c842 100644
--- a/third_party/xla/xla/text_literal_reader_test.cc
+++ b/third_party/xla/xla/text_literal_reader_test.cc
@@ -89,5 +89,16 @@ TEST(TextLiteralReaderTest, MissingColonReturnsInvalidArgument) {
   EXPECT_THAT(literal, StatusIs(absl::StatusCode::kInvalidArgument));
 }
 
+TEST(TextLiteralReaderTest, ShapeTooLargeReturnsResourceExhausted) {
+  // Shape requires too much memory, should fail gracefully rather than crash.
+  std::string contents = "f32[272222222222222222]\n";
+
+  std::string fname = tsl::testing::TmpDir() + "/ShapeTooLarge.data.txt";
+  ASSERT_THAT(tsl::WriteStringToFile(tsl::Env::Default(), fname, contents),
+              IsOk());
+  absl::StatusOr<Literal> literal = TextLiteralReader::ReadPath(fname);
+  EXPECT_THAT(literal, StatusIs(absl::StatusCode::kResourceExhausted));
+}
+
 }  // namespace
 }  // namespace xla

From 9b67795c0b2314d361beaca7fed86671f5ce832c Mon Sep 17 00:00:00 2001
From: Alex <alexandros.theodoridis@amd.com>
Date: Wed, 10 Dec 2025 03:38:41 -0800
Subject: [PATCH 115/753] PR #35077: [ROCm] Fix hermetic build rocm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35077

📝 Summary of Changes
Move comgr into the data directory, fixing hermetic build

🎯 Justification
Hermetic build with rocm config is broken due to invalid
dependency management. Invalid merge of this PR: https://github.com/openxla/xla/pull/34812

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
Not relevant

🧪 Unit Tests:
Not relevant

🧪 Execution Tests:
Not relevant

Copybara import of the project:

--
66c08137948a92ac98ccaa1785ce27ebd4c489ca by Alexandros Theodoridis <alexandros.theodoridis@amd.com>:

Fix hermetic build rocm

Merging this change closes #35077

PiperOrigin-RevId: 842648079
---
 third_party/xla/third_party/gpus/rocm/BUILD.tpl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/third_party/gpus/rocm/BUILD.tpl b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
index 4eba66c971da72..a9f510d7e0c7aa 100644
--- a/third_party/xla/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
@@ -537,7 +537,7 @@ cc_library(
 cc_library(
     name = "amd_comgr",
     hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]),
-    srcs = glob([
+    data = glob([
         "%{rocm_root}/lib/libamd_comgr_loader.so*",
         "%{rocm_root}/lib/libamd_comgr.so*",
         "%{rocm_root}/lib/llvm/lib/libLLVM.so*",
@@ -549,10 +549,11 @@ cc_library(
     linkopts = select({
         ":build_hermetic": [
             "-lamd_comgr_loader",
+            "-lamd_comgr",
         ],
         "//conditions:default": [
             "-lamd_comgr",
-	],
+        ],
     }),
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],

From f891ef355996762254d7825d72ed653341fd8d6e Mon Sep 17 00:00:00 2001
From: Terry Sun <tesun@nvidia.com>
Date: Wed, 10 Dec 2025 03:46:36 -0800
Subject: [PATCH 116/753] PR #34868: [GPU] Update fabric detection failure
 warning message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34868

📝 Summary of Changes
Updated fabric detection failure warning message to make it more clear and actionable, and only keep it as a VLOG.

🎯 Justification
Fabric detection warning message confuses non-MNNVL users. See https://github.com/jax-ml/jax/issues/33466.

🚀 Kind of Contribution
♻️ Cleanup

📊 Benchmark (for Performance Improvements)
N/A.

🧪 Unit Tests:
N/A.

🧪 Execution Tests:
N/A.

Copybara import of the project:

--
a9546a2da11fb1e4d72a860ea1fbe80820191985 by Terry Sun <tesun@nvidia.com>:

update warning message

--
9e7531844f3a722e586f6665363bbd1c2aa07742 by Terry Sun <tesun@nvidia.com>:

only throw warning for Blackwell+

--
01c6a9c29a9b8d60b3fb5ca24ed3ee7bc8c81657 by Terry Sun <tesun@nvidia.com>:

only keep vlog

Merging this change closes #34868

PiperOrigin-RevId: 842650532
---
 .../xla/xla/stream_executor/cuda/cuda_executor.cc      | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index cc487db3345103..b2e7027ce1a567 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -783,8 +783,9 @@ absl::StatusOr<FabricInfo> GetDeviceFabricInfo(nvmlDevice_t device) {
 
   if (fabricInfo.state == NVML_GPU_FABRIC_STATE_NOT_SUPPORTED) {
     std::string error_message =
-        "NVML doesn't support extracting fabric info or NVLink is not used by "
-        "the device.";
+        "[Ignore this message unless multi-node NVLink is used] "
+        "CUDA driver version is too low for extracting fabric info (550+ "
+        "required), or multi-node NVLink is not available.";
     VLOG(2) << error_message;
     return absl::InternalError(error_message);
   }
@@ -1836,11 +1837,6 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {
     if (fabric_info.ok()) {
       info.cluster_uuid = fabric_info->cluster_uuid;
       info.clique_id = fabric_info->clique_id;
-    } else {
-      if (cc.IsAtLeastHopper() && p2p_link_count.ok() && *p2p_link_count) {
-        LOG(WARNING) << "GPU interconnect information not available: "
-                     << fabric_info.status();
-      }
     }
     desc.set_device_interconnect_info(info);
   }

From 3625c2be440dbcdd3deafca7773e28fa4a5f262b Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 10 Dec 2025 03:51:35 -0800
Subject: [PATCH 117/753] [XLA:GPU] Relax layout check in SortThunk.

A difference in element size is ok.
The added test needs to be disabled because it does not work with PjrtTestBase
yet (it does work with HloTestBase).

PiperOrigin-RevId: 842651908
---
 .../xla/xla/service/gpu/thunk_emitter.cc      |  8 +++----
 third_party/xla/xla/tests/sort_test.cc        | 23 +++++++++++++++++++
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 7ed1d443480120..4f38a16b7232d0 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -1564,12 +1564,12 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitSort(
         sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
     // We assume that the layout of all involved operands and
     // outputs is the same.
-    TF_RET_CHECK(
-        LayoutUtil::LayoutsInShapesEqual(keys_shape, sort->operand(i)->shape(),
-                                         Layout::Equal().IgnoreMemorySpace()));
+    TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+        keys_shape, sort->operand(i)->shape(),
+        Layout::Equal().IgnoreMemorySpace().IgnoreElementSize()));
     TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
         keys_shape, ShapeUtil::GetSubshape(sort->shape(), shape_index),
-        Layout::Equal().IgnoreMemorySpace()));
+        Layout::Equal().IgnoreMemorySpace().IgnoreElementSize()));
 
     BufferAllocation::Slice destination_buffer;
     BufferAllocation::Slice source_address;
diff --git a/third_party/xla/xla/tests/sort_test.cc b/third_party/xla/xla/tests/sort_test.cc
index ae00ac94b1c4e7..c065c718707719 100644
--- a/third_party/xla/xla/tests/sort_test.cc
+++ b/third_party/xla/xla/tests/sort_test.cc
@@ -91,6 +91,29 @@ TEST_F(SortTest, SortTwiceWithSameComparator) {
   EXPECT_TRUE(RunAndCompare(hlo_text_module, ErrorSpec{0.0, 0.0}));
 }
 
+// TODO(b/456833594): Enable this test once PJRT packs int4 types.
+TEST_F(SortTest, DISABLED_SortTuple) {
+  absl::string_view hlo_text_module = R"(
+    HloModule sort
+
+    compare {
+      p.0.lhs = s4[] parameter(0)
+      p.0.rhs = s4[] parameter(1)
+      p.1.lhs = s32[] parameter(2)
+      p.1.rhs = s32[] parameter(3)
+      ROOT lt = pred[] compare(p.0.lhs, p.0.rhs), direction=LT
+    }
+
+    ENTRY main {
+      p0 = s4[2,1452]{1,0} parameter(0)
+      p1 = s32[2,1452]{1,0} iota(), iota_dimension=1
+      ROOT sort = (s4[2,1452]{1,0}, s32[2,1452]{1,0}) sort(p0, p1), dimensions={1}, is_stable=true, to_apply=compare
+    }
+  )";
+
+  EXPECT_TRUE(RunAndCompare(hlo_text_module, ErrorSpec{0.0, 0.0}));
+}
+
 class SortManyInputsTest : public SortTest,
                            public ::testing::WithParamInterface<int> {
  public:

From 57e90959c858d2430c5bcfd52a072c7c74098bc7 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 10 Dec 2025 04:14:37 -0800
Subject: [PATCH 118/753] PR #34965: Bump actions/checkout from 6.0.0 to 6.0.1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34965

Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.0 to 6.0.1.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/actions/checkout/releases">actions/checkout's releases</a>.</em></p>
<blockquote>
<h2>v6.0.1</h2>
<h2>What's Changed</h2>
<ul>
<li>Update all references from v5 and v4 to v6 by <a href="https://github.com/ericsciple"><code>@​ericsciple</code></a> in <a href="https://redirect.github.com/actions/checkout/pull/2314">actions/checkout#2314</a></li>
<li>Add worktree support for persist-credentials includeIf by <a href="https://github.com/ericsciple"><code>@​ericsciple</code></a> in <a href="https://redirect.github.com/actions/checkout/pull/2327">actions/checkout#2327</a></li>
<li>Clarify v6 README by <a href="https://github.com/ericsciple"><code>@​ericsciple</code></a> in <a href="https://redirect.github.com/actions/checkout/pull/2328">actions/checkout#2328</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/actions/checkout/compare/v6...v6.0.1">https://github.com/actions/checkout/compare/v6...v6.0.1</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li>See full diff in <a href="https://github.com/actions/checkout/compare/v6...v6.0.1">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/checkout&package-manager=github_actions&previous-version=6.0.0&new-version=6.0.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

</details>
Copybara import of the project:

--
35b1dad4109f825bb3c6b8cb235cb7c052b80bbe by dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>:

Bump actions/checkout from 6.0.0 to 6.0.1

Bumps [actions/checkout](https://github.com/actions/checkout) from 6.0.0 to 6.0.1.
- [Release notes](https://github.com/actions/checkout/releases)
- [Commits](https://github.com/actions/checkout/compare/v6...v6.0.1)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-version: 6.0.1
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

Merging this change closes #34965

PiperOrigin-RevId: 842658886
---
 .../xla/.github/workflows/bazel_dependency_violations.yml     | 2 +-
 third_party/xla/.github/workflows/bazel_query.yml             | 2 +-
 third_party/xla/.github/workflows/bazel_tags.yml              | 2 +-
 third_party/xla/.github/workflows/benchmark_postsubmit.yml    | 2 +-
 third_party/xla/.github/workflows/benchmark_presubmit.yml     | 2 +-
 third_party/xla/.github/workflows/buildifier.yml              | 2 +-
 third_party/xla/.github/workflows/check_contents.yml          | 2 +-
 third_party/xla/.github/workflows/ci.yml                      | 4 ++--
 third_party/xla/.github/workflows/clang_format.yml            | 2 +-
 third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml  | 2 +-
 .../xla/.github/workflows/generate_benchmark_matrix.yml       | 2 +-
 third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml  | 2 +-
 third_party/xla/.github/workflows/nightly_benchmarks.yml      | 2 +-
 third_party/xla/.github/workflows/postsubmit_benchmark.yml    | 2 +-
 third_party/xla/.github/workflows/presubmit_benchmark.yml     | 2 +-
 third_party/xla/.github/workflows/rollback_notification.yml   | 2 +-
 third_party/xla/.github/workflows/scorecards-analysis.yml     | 2 +-
 17 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/third_party/xla/.github/workflows/bazel_dependency_violations.yml b/third_party/xla/.github/workflows/bazel_dependency_violations.yml
index 0588447392e993..e3fbfbab9bee81 100644
--- a/third_party/xla/.github/workflows/bazel_dependency_violations.yml
+++ b/third_party/xla/.github/workflows/bazel_dependency_violations.yml
@@ -39,7 +39,7 @@ jobs:
     continue-on-error: true
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: "Install bazelisk"
         run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/bazelisk@24651ab # v1.20.0
       - name: "Run bazel build --nobuild //xla/... with retries"
diff --git a/third_party/xla/.github/workflows/bazel_query.yml b/third_party/xla/.github/workflows/bazel_query.yml
index 052309ef806012..8888c7b0f3267e 100644
--- a/third_party/xla/.github/workflows/bazel_query.yml
+++ b/third_party/xla/.github/workflows/bazel_query.yml
@@ -34,7 +34,7 @@ jobs:
     timeout-minutes: 10
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: "Install bazelisk"
         run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/bazelisk@24651ab # v1.20.0
       - name: "Run bazel build --nobuild //xla/... with retries"
diff --git a/third_party/xla/.github/workflows/bazel_tags.yml b/third_party/xla/.github/workflows/bazel_tags.yml
index 00ed95c8e6f0e0..09ecd6f00603ef 100644
--- a/third_party/xla/.github/workflows/bazel_tags.yml
+++ b/third_party/xla/.github/workflows/bazel_tags.yml
@@ -34,7 +34,7 @@ jobs:
     timeout-minutes: 10
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: "Install bazelisk"
         run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/bazelisk@24651ab # v1.20.0
       - name: "Run bazel build --nobuild //xla/... with retries"
diff --git a/third_party/xla/.github/workflows/benchmark_postsubmit.yml b/third_party/xla/.github/workflows/benchmark_postsubmit.yml
index bab85cb699bd02..c7ecb2bbdb6075 100644
--- a/third_party/xla/.github/workflows/benchmark_postsubmit.yml
+++ b/third_party/xla/.github/workflows/benchmark_postsubmit.yml
@@ -110,7 +110,7 @@ jobs:
           PR: ${{ steps.find_pr.outputs.pr }}
 
       - name: Checkout OpenXLA
-        uses: actions/checkout@v6.0.0
+        uses: actions/checkout@v6.0.1
       - name: Wait For Connection
         uses: google-ml-infra/actions/ci_connection@7f5ca0c263a81ed09ea276524c1b9192f1304e3c
         with:
diff --git a/third_party/xla/.github/workflows/benchmark_presubmit.yml b/third_party/xla/.github/workflows/benchmark_presubmit.yml
index 33f65f9eead53d..33dc31bd6a64d6 100644
--- a/third_party/xla/.github/workflows/benchmark_presubmit.yml
+++ b/third_party/xla/.github/workflows/benchmark_presubmit.yml
@@ -86,7 +86,7 @@ jobs:
           fi
 
       - name: Checkout OpenXLA
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
 
       - name: Configure GPU backend
         if: ${{ matrix.job_info.platform == 'GPU' }}
diff --git a/third_party/xla/.github/workflows/buildifier.yml b/third_party/xla/.github/workflows/buildifier.yml
index d61728b29b4716..079a608acc26e0 100644
--- a/third_party/xla/.github/workflows/buildifier.yml
+++ b/third_party/xla/.github/workflows/buildifier.yml
@@ -34,7 +34,7 @@ jobs:
     timeout-minutes: 6
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: "Install buildifier"
         run: parallel --ungroup --retries 3 --delay 15 --nonall -- go install github.com/bazelbuild/buildtools/buildifier@433ea85 # 6.4.0
       - name: "Run buildifier"
diff --git a/third_party/xla/.github/workflows/check_contents.yml b/third_party/xla/.github/workflows/check_contents.yml
index 820a99675525ca..afc6f9c7780e14 100644
--- a/third_party/xla/.github/workflows/check_contents.yml
+++ b/third_party/xla/.github/workflows/check_contents.yml
@@ -46,7 +46,7 @@ jobs:
       contains(github.event.pull_request.body, 'FORCE_TEST_ACTIONS')
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: "Fetch HEAD of main branch"
         run: git fetch origin main --depth=1
 
diff --git a/third_party/xla/.github/workflows/ci.yml b/third_party/xla/.github/workflows/ci.yml
index daafda6979df76..69509b5a71321a 100644
--- a/third_party/xla/.github/workflows/ci.yml
+++ b/third_party/xla/.github/workflows/ci.yml
@@ -118,12 +118,12 @@ jobs:
     timeout-minutes: 60
     steps:
       - name: "Checking out openxla/xla"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           path: "openxla/xla"
       - name: Checking out ${{ matrix.job_info.repo }}
         if: ${{ matrix.job_info.repo != 'openxla/xla' }}
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           repository: ${{ matrix.job_info.repo }}
           path: ${{ matrix.job_info.repo }}
diff --git a/third_party/xla/.github/workflows/clang_format.yml b/third_party/xla/.github/workflows/clang_format.yml
index 198d0dd5df3a83..f0de7043ebb15b 100644
--- a/third_party/xla/.github/workflows/clang_format.yml
+++ b/third_party/xla/.github/workflows/clang_format.yml
@@ -34,7 +34,7 @@ jobs:
       contains(github.event.pull_request.body, 'FORCE_TEST_ACTIONS')
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           fetch-depth: '0'
       - name: "Fetch HEAD of main branch"
diff --git a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
index 39225d1aeb05d9..ae0f471ecfcbfb 100644
--- a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
+++ b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
@@ -75,7 +75,7 @@ jobs:
           fi
 
       - name: Checkout OpenXLA
-        uses: actions/checkout@v6.0.0
+        uses: actions/checkout@v6.0.1
 
       - name: Create results directory
         run:
diff --git a/third_party/xla/.github/workflows/generate_benchmark_matrix.yml b/third_party/xla/.github/workflows/generate_benchmark_matrix.yml
index 51f97449ee6b41..e96e3d44ecaab1 100644
--- a/third_party/xla/.github/workflows/generate_benchmark_matrix.yml
+++ b/third_party/xla/.github/workflows/generate_benchmark_matrix.yml
@@ -54,7 +54,7 @@ jobs:
 
     steps:
       - name: Checkout OpenXLA
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           # Use inputs.checkout_ref if provided, otherwise default to the event's ref
           # (e.g., PR's HEAD SHA or caller's commit SHA)
diff --git a/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml
index 748226655d8b9d..55ffd9ad5c1efe 100644
--- a/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml
+++ b/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml
@@ -56,7 +56,7 @@ jobs:
       OUTPUT_DIR: ${{ github.workspace }}/output
     steps:
       - name: Checkout XLA
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
 
       - name: Download Gemma Hlo Files
         run: |
diff --git a/third_party/xla/.github/workflows/nightly_benchmarks.yml b/third_party/xla/.github/workflows/nightly_benchmarks.yml
index 23a82d9350624d..5f33b283f13a52 100644
--- a/third_party/xla/.github/workflows/nightly_benchmarks.yml
+++ b/third_party/xla/.github/workflows/nightly_benchmarks.yml
@@ -110,7 +110,7 @@ jobs:
             exit 1
           fi
       - name: Checkout OpenXLA Repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           ref: ${{ env.CHECKOUT_REF }}
       - name: Build Binaries
diff --git a/third_party/xla/.github/workflows/postsubmit_benchmark.yml b/third_party/xla/.github/workflows/postsubmit_benchmark.yml
index 346c17bfaec5c4..2d899ae24284cd 100644
--- a/third_party/xla/.github/workflows/postsubmit_benchmark.yml
+++ b/third_party/xla/.github/workflows/postsubmit_benchmark.yml
@@ -145,7 +145,7 @@ jobs:
           fi
 
       - name: Checkout OpenXLA Repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           ref: ${{ env.CHECKOUT_REF }}
 
diff --git a/third_party/xla/.github/workflows/presubmit_benchmark.yml b/third_party/xla/.github/workflows/presubmit_benchmark.yml
index e3efa8d429bf5b..02483e6158091b 100644
--- a/third_party/xla/.github/workflows/presubmit_benchmark.yml
+++ b/third_party/xla/.github/workflows/presubmit_benchmark.yml
@@ -139,7 +139,7 @@ jobs:
           fi
 
       - name: Checkout OpenXLA Repository
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           ref: ${{ env.CHECKOUT_REF }}
 
diff --git a/third_party/xla/.github/workflows/rollback_notification.yml b/third_party/xla/.github/workflows/rollback_notification.yml
index 8978fe8d9984e5..7a3c21fdd21b9f 100644
--- a/third_party/xla/.github/workflows/rollback_notification.yml
+++ b/third_party/xla/.github/workflows/rollback_notification.yml
@@ -33,7 +33,7 @@ jobs:
     timeout-minutes: 6
     steps:
       - name: "Checking out repository"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: "Check if PR was rolled back"
         run: python3 .github/workflows/rollback_notification.py
 
diff --git a/third_party/xla/.github/workflows/scorecards-analysis.yml b/third_party/xla/.github/workflows/scorecards-analysis.yml
index d2bf9a77ef7ab6..0e410de5bd29bf 100644
--- a/third_party/xla/.github/workflows/scorecards-analysis.yml
+++ b/third_party/xla/.github/workflows/scorecards-analysis.yml
@@ -44,7 +44,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
+        uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
         with:
           persist-credentials: false
 

From e31d16f73a90e439b92bb3e6ab3ddc7ee6d588f2 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Wed, 10 Dec 2025 04:15:03 -0800
Subject: [PATCH 119/753] [Autotuner] Add expected mock calls to
 MockStreamExecutor. Remove Cublas and CublasLt distinction. These changes are
 needed for the new Autotuner, which does not distinguish between Cublas and
 CublasLt and autotunes for both backends.

PiperOrigin-RevId: 842658982
---
 third_party/xla/xla/service/gpu/determinism_test.cc | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/determinism_test.cc b/third_party/xla/xla/service/gpu/determinism_test.cc
index aad712107a7e1f..2dc6cb2e0fcdb3 100644
--- a/third_party/xla/xla/service/gpu/determinism_test.cc
+++ b/third_party/xla/xla/service/gpu/determinism_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include <gmock/gmock.h>
@@ -134,6 +133,12 @@ class DeterminismTest : public GpuCodegenTest {
     EXPECT_CALL(executor, SynchronizeAllActivity).WillRepeatedly([&]() -> bool {
       return true;
     });
+    EXPECT_CALL(executor, CreateStream).WillRepeatedly([&] {
+      return backend().default_stream_executor()->CreateStream();
+    });
+    EXPECT_CALL(executor, AsBlas).WillRepeatedly([&] {
+      return backend().default_stream_executor()->AsBlas();
+    });
 
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                             ParseAndReturnVerifiedModule(hlo_string));
@@ -183,12 +188,6 @@ ENTRY e {
   MatchOptimizedHlo(kHloText, R"(; CHECK: custom_call_target="__cublas$gemm")",
                     TimerCreation::kForbidden);
   AssertDeterminism(kHloText);
-
-  debug_options_.set_xla_gpu_enable_cublaslt(true);
-  MatchOptimizedHlo(kHloText,
-                    R"(; CHECK: custom_call_target="__cublas$lt$matmul")",
-                    TimerCreation::kForbidden);
-  AssertDeterminism(kHloText);
 }
 
 TEST_F(DeterminismTest, DeterministicTritonGemmUsesDefaultConfig) {

From b29ba0638e3d994956981379630a9a88e423ff36 Mon Sep 17 00:00:00 2001
From: Mohammed Anany <manany@google.com>
Date: Wed, 10 Dec 2025 04:43:52 -0800
Subject: [PATCH 120/753] [XLA:GPU/TMA] Deprecate TMA flag since it is enabled
 by default and is stable. Also explicitly state TMA configurations in the
 default set for the current GEMM autotuner. This makes it easier to maintain
 an explicit list with a known size.

PiperOrigin-RevId: 842666877
---
 .../gpu/autotuner/block_level_emitter.cc      |  9 +-
 .../gpu/autotuner/block_level_emitter_test.cc |  4 +-
 .../xla/xla/backends/gpu/autotuner/triton.cc  | 37 ++------
 .../xla/backends/gpu/autotuner/triton_test.cc |  1 -
 .../triton/fusion_emitter_device_test.cc      | 20 +---
 .../triton/fusion_emitter_deviceless_test.cc  |  1 -
 .../backends/gpu/codegen/triton/tma_utils.cc  |  6 +-
 third_party/xla/xla/debug_options_flags.cc    |  7 --
 .../xla/xla/service/gpu/autotuning/BUILD      |  3 +-
 .../gpu/autotuning/dot_search_space.cc        | 23 ++---
 .../service/gpu/autotuning/dot_search_space.h |  8 +-
 .../gpu/autotuning/gemm_fusion_autotuner.cc   | 12 ---
 .../autotuning/gemm_fusion_autotuner_cuda.cc  | 28 ++----
 .../service/gpu/autotuning/triton_configs.cc  | 93 ++++++++++++++++++-
 .../service/gpu/autotuning/triton_configs.h   |  3 +-
 third_party/xla/xla/xla.proto                 |  8 +-
 16 files changed, 129 insertions(+), 134 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter.cc b/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter.cc
index bfda4e0c804bcc..253c2373c087a7 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter.cc
@@ -290,12 +290,9 @@ BlockLevelEmitterBackend::GetSupportedConfigs(const HloInstruction& instr) {
     configs.push_back(std::move(any));
   }
 
-  // Allow TMA tuning for Hopper+ devices when TMA flag is passed.
-  bool autotune_tma =
-      debug_options().xla_gpu_experimental_enable_triton_tma() &&
-      stream_executor::gpu::IsTmaAvailableForDevice(
-          target_config().device_description);
-  if (autotune_tma) {
+  // Allow TMA tuning for Hopper+ devices.
+  if (stream_executor::gpu::IsTmaAvailableForDevice(
+          target_config().device_description)) {
     ExtendConfigsWithTma(configs);
   }
 
diff --git a/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter_test.cc b/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter_test.cc
index 096e9080256933..603a22cbb991cc 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/block_level_emitter_test.cc
@@ -72,9 +72,7 @@ class TritonBlockLevelFusionEmitterBackendTest
                              .value()),
         target_config_(stream_executor_),
         backend_(&debug_options_, &compiler_,
-                 compiler_.ShapeSizeBytesFunction(), &target_config_) {
-    debug_options_.set_xla_gpu_experimental_enable_triton_tma(true);
-  }
+                 compiler_.ShapeSizeBytesFunction(), &target_config_) {}
 
   DebugOptions debug_options_;
   NVPTXCompiler compiler_;
diff --git a/third_party/xla/xla/backends/gpu/autotuner/triton.cc b/third_party/xla/xla/backends/gpu/autotuner/triton.cc
index 6a6246ee386be9..0ee4e09dc60572 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/triton.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/triton.cc
@@ -58,7 +58,7 @@ namespace gpu {
 
 namespace {
 std::vector<TritonGemmConfig> GetDefaultTritonConfigs(
-    se::GpuComputeCapability compute_capability, bool autotune_tma) {
+    se::GpuComputeCapability compute_capability) {
   if (compute_capability.IsRocm()) {
     return GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultRocm);
   }
@@ -69,29 +69,15 @@ std::vector<TritonGemmConfig> GetDefaultTritonConfigs(
 
   if (cuda_compute_capability->IsAtLeastBlackwell()) {
     configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kBlackwell);
-  } else if (cuda_compute_capability->IsHopper() ||
-             cuda_compute_capability->IsAmpere()) {
-    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopperAmpere);
+  } else if (cuda_compute_capability->IsHopper()) {
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopper);
+  } else if (cuda_compute_capability->IsAmpere()) {
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kAmpere);
   } else {
     configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultCuda);
   }
 
-  if (!autotune_tma) {
-    return configs;
-  }
-
-  // Hopper+ devices support TMA. Add TMA parameterized configs.
-  std::vector<TritonGemmConfig> tma_parameterized_configs;
-  for (auto& config : configs) {
-    config.is_tma_allowed = false;
-    tma_parameterized_configs.push_back(config);
-
-    if (IsTmaRecommended(config)) {
-      config.is_tma_allowed = true;
-      tma_parameterized_configs.push_back(config);
-    }
-  }
-  return tma_parameterized_configs;
+  return configs;
 }
 
 }  // namespace
@@ -128,11 +114,6 @@ TritonBackend::GetSupportedConfigsForDot(const HloInstruction* instr) {
       supports_contracting_split &&
       debug_options().xla_gpu_enable_split_k_autotuning();
 
-  // Allow TMA tuning for Hopper+ devices when TMA flag is passed.
-  bool autotune_tma =
-      debug_options().xla_gpu_experimental_enable_triton_tma() &&
-      stream_executor::gpu::IsTmaAvailableForDevice(
-          target_config().device_description);
   std::vector<std::unique_ptr<BackendConfig>> configs;
   VLOG(1) << "Generating configs from search space: "
           << search_space.ToString();
@@ -141,15 +122,13 @@ TritonBackend::GetSupportedConfigsForDot(const HloInstruction* instr) {
   std::vector<TritonGemmConfig> gemm_configs = search_space.GenerateConfigs(
       /*force_contracting_split=*/autotune_contracting_split
           ? std::nullopt
-          : std::make_optional(1),
-      /*autotune_tma=*/autotune_tma);
+          : std::make_optional(1));
 
   if (!debug_options().xla_gpu_exhaustive_tiling_search()) {
     VLOG(1) << "Restricting configs to the default set.";
     gemm_configs = search_space.OptimizeConfigSet(
         gemm_configs, /*hints=*/GetDefaultTritonConfigs(
-            target_config().device_description.gpu_compute_capability(),
-            autotune_tma));
+            target_config().device_description.gpu_compute_capability()));
   }
   configs.reserve(gemm_configs.size());
   for (const auto& config : gemm_configs) {
diff --git a/third_party/xla/xla/backends/gpu/autotuner/triton_test.cc b/third_party/xla/xla/backends/gpu/autotuner/triton_test.cc
index 595599700c6f3b..0c029caa2a5b88 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/triton_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/triton_test.cc
@@ -96,7 +96,6 @@ class TritonBackendTest : public HloHardwareIndependentTestBase {
                              .value()),
         target_config_(stream_executor_),
         backend_(&debug_options_, &compiler_, &target_config_, &mlir_context_) {
-    debug_options_.set_xla_gpu_experimental_enable_triton_tma(true);
   }
 
   DebugOptions debug_options_;
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
index d93e3016930055..39c8a8afe129ee 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
@@ -104,14 +104,7 @@ class TritonEmitterTest : public GpuCodegenTest {
 
 class TmaParameterizedTritonEmitterTest
     : public TritonEmitterTest,
-      public ::testing::WithParamInterface<bool> {
- public:
-  DebugOptions GetDebugOptionsForTest() const override {
-    DebugOptions debug_options = TritonEmitterTest::GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_experimental_enable_triton_tma(GetParam());
-    return debug_options;
-  }
-};
+      public ::testing::WithParamInterface<bool> {};
 
 INSTANTIATE_TEST_SUITE_P(TmaParameterizedTritonEmitterTestSuite,
                          TmaParameterizedTritonEmitterTest, ::testing::Bool(),
@@ -123,7 +116,6 @@ class WarpSpecializationTritonEmitterTest : public TritonEmitterTest {
  public:
   DebugOptions GetDebugOptionsForTest() const override {
     DebugOptions debug_options = TritonEmitterTest::GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_experimental_enable_triton_tma(true);
     debug_options.set_xla_gpu_experimental_enable_triton_warp_specialization(
         true);
     return debug_options;
@@ -139,15 +131,7 @@ struct TmaAndDotLayoutTestParams {
 
 class TmaAndLayoutParameterizedTritonEmitterTest
     : public TritonEmitterTest,
-      public ::testing::WithParamInterface<TmaAndDotLayoutTestParams> {
- public:
-  DebugOptions GetDebugOptionsForTest() const override {
-    DebugOptions debug_options = TritonEmitterTest::GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_experimental_enable_triton_tma(
-        GetParam().enable_tma);
-    return debug_options;
-  }
-};
+      public ::testing::WithParamInterface<TmaAndDotLayoutTestParams> {};
 
 std::string TmaAndDotLayoutTestParamsToString(
     const ::testing::TestParamInfo<TmaAndDotLayoutTestParams>& data) {
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc
index 0e750a6e600c19..fa77b022d841a4 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc
@@ -48,7 +48,6 @@ class WarpSpecializationTritonEmitterTest : public TritonEmitterDevicelessTest {
   DebugOptions GetDebugOptionsForTest() const override {
     DebugOptions debug_options =
         TritonEmitterDevicelessTest::GetDebugOptionsForTest();
-    debug_options.set_xla_gpu_experimental_enable_triton_tma(true);
     debug_options.set_xla_gpu_experimental_enable_triton_warp_specialization(
         true);
     return debug_options;
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/tma_utils.cc b/third_party/xla/xla/backends/gpu/codegen/triton/tma_utils.cc
index b7a6eb1b6ce397..0f53dd18a323c8 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/tma_utils.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/tma_utils.cc
@@ -139,10 +139,10 @@ absl::StatusOr<TmaDescriptor> CreateTmaDescriptor(
                              GetTmaSwizzleMode(swizzle_mode));
 }
 
+// The current recommendation is based on analyzing the E2E "Nucleo" group
+// data. It might make sense to re-evaluate this recommendation later if we
+// believe there are missed opportunities.
 bool IsTmaRecommended(const TritonGemmConfig& config) {
-  // The current recommendation is based on analyzing the E2E "Nucleo" group
-  // data. It might make sense to re-evaluate this recommendation later if we
-  // believe there are missed opportunities.
   return (config.split_k == 1 || config.split_k == 16) &&
          config.num_warps <= 8 &&
          (config.num_stages == 1 || config.num_stages == 3 ||
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index fc1f41ea91152e..abfb81977ccdb0 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -460,7 +460,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_enable_scoped_logging_timers(true);
   opts.set_xla_unsupported_crash_on_hlo_pass_noop_change(false);
   opts.set_xla_gpu_experimental_enable_split_k_rewrite(false);
-  opts.set_xla_gpu_experimental_enable_triton_tma(true);
   opts.set_xla_gpu_experimental_enable_triton_warp_specialization(false);
   opts.set_xla_detect_unstable_reductions(DebugOptions::DETECTION_MODE_NONE);
   opts.set_xla_detect_unstable_reductions_post_optimizations(
@@ -2623,12 +2622,6 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       debug_options->xla_gpu_experimental_enable_split_k_rewrite(),
       "Enable the pass that splits GEMMs that underutilize the GPU load by "
       "splitting the K dimension using a heuristic."));
-  flag_list->push_back(tsl::Flag(
-      "xla_gpu_experimental_enable_triton_tma",
-      bool_setter_for(
-          &DebugOptions::set_xla_gpu_experimental_enable_triton_tma),
-      debug_options->xla_gpu_experimental_enable_triton_tma(),
-      "Enable Triton's TMA loads/stores for arguments where applicable."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_experimental_enable_triton_warp_specialization",
       bool_setter_for(
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 5ca2354d355add..77b0d2daf56629 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -50,7 +50,6 @@ cc_library(
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/backends/gpu/autotuner:cudnn",
-        "//xla/backends/gpu/codegen/triton:tma_utils",
         "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
@@ -70,7 +69,6 @@ cc_library(
         "//xla/stream_executor:semantic_version",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/stream_executor/gpu:tma_metadata",
         "//xla/tsl/platform:env",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -317,6 +315,7 @@ cc_library(
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service/gpu:matmul_utils",
         "//xla/stream_executor:device_description",
+        "//xla/stream_executor/gpu:tma_metadata",
         "//xla/tsl/lib/core:bits",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
diff --git a/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc b/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc
index 43a2806d38e1f9..4446900f5569a6 100644
--- a/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/gpu/tma_metadata.h"
 #include "xla/tsl/lib/core/bits.h"
 #include "xla/util.h"
 #include "tsl/platform/protobuf.h"
@@ -124,7 +125,7 @@ TritonDotFusionSearchSpace::TritonDotFusionSearchSpace(
 }
 
 std::vector<TritonGemmConfig> TritonDotFusionSearchSpace::GenerateConfigs(
-    std::optional<int64_t> force_contracting_split, bool autotune_tma,
+    std::optional<int64_t> force_contracting_split,
     bool autotune_warp_specialization) const {
   std::vector<ConfigWithNotes> configs;
   if (force_contracting_split.has_value()) {
@@ -153,22 +154,12 @@ std::vector<TritonGemmConfig> TritonDotFusionSearchSpace::GenerateConfigs(
   ExtendConfigs(configs, &TritonDotFusionSearchSpace::AddCtaSizeParameter);
   ExtendConfigs(configs, &TritonDotFusionSearchSpace::AddContractingTiling);
   ExtendConfigs(configs, &TritonDotFusionSearchSpace::AddPipeliningParameter);
-
-  if (autotune_warp_specialization && !autotune_tma) {
-    LOG(WARNING)
-        << "Warp specialization is requested, but TMA is not enabled, hence "
-           "warp specialization will be ignored. Set both "
-           "`is_warp_specialization_allowed` and `is_tma_allowed` "
-           "to true on the configuration to enable warp specialization.";
-  }
-  if (autotune_tma) {
-    VLOG(10) << "Parameterizing all currently constructed configs with "
-                "TMA.";
+  if (stream_executor::gpu::IsTmaAvailableForDevice(device_description_)) {
     ExtendConfigs(configs, &TritonDotFusionSearchSpace::AddTmaParameter);
-    if (autotune_warp_specialization) {
-      ExtendConfigs(
-          configs, &TritonDotFusionSearchSpace::AddWarpSpecializationParameter);
-    }
+  }
+  if (autotune_warp_specialization) {
+    ExtendConfigs(configs,
+                  &TritonDotFusionSearchSpace::AddWarpSpecializationParameter);
   }
 
   std::vector<TritonGemmConfig> result;
diff --git a/third_party/xla/xla/service/gpu/autotuning/dot_search_space.h b/third_party/xla/xla/service/gpu/autotuning/dot_search_space.h
index 3cbae90aa0a9c7..7d6e68ccb0bd8c 100644
--- a/third_party/xla/xla/service/gpu/autotuning/dot_search_space.h
+++ b/third_party/xla/xla/service/gpu/autotuning/dot_search_space.h
@@ -46,14 +46,10 @@ class TritonDotFusionSearchSpace {
   // If `force_contracting_split` is set, the search space
   // will be restricted to only include configs with the given split_k factor.
   //
-  // If true, `autotune_tma` and `autotune_warp_specialization` extend the
-  // search space with TMA parameterization and warp specialization
-  // respectively. Setting 'autotune_warp_specialization' to true also requires
-  // `autotune_tma` to be true, given that warp specialization is probably not
-  // useful without TMA.
+  // If true, `autotune_warp_specialization` extends the search space with warp
+  // specialization support.
   std::vector<TritonGemmConfig> GenerateConfigs(
       std::optional<int64_t> force_contracting_split = std::nullopt,
-      bool autotune_tma = false,
       bool autotune_warp_specialization = false) const;
 
   // Restrict the set of configs to the ones compatible with the hints list.
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
index 6dfb5b9a5c7966..17c836bf78839b 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
@@ -1003,20 +1003,9 @@ GemmFusionAutotunerImpl::GenerateTritonConfigs(const HloDotInstruction& dot) {
       supports_contracting_split &&
       debug_options_.xla_gpu_enable_split_k_autotuning();
 
-  // Allow TMA tuning for Hopper+ devices when TMA flag is passed.
-  bool autotune_tma = debug_options_.xla_gpu_experimental_enable_triton_tma() &&
-                      stream_executor::gpu::IsTmaAvailableForDevice(
-                          config_.GetDeviceDescription());
   bool autotune_warp_specialization =
       debug_options_.xla_gpu_experimental_enable_triton_warp_specialization() &&
       IsWarpSpecializationAvailable();
-  if (autotune_warp_specialization && !autotune_tma) {
-    return absl::InvalidArgumentError(
-        "Warp specialization is requested, but TMA is not enabled. If you wish "
-        "to enable warp specialization, set both "
-        "`xla_gpu_experimental_enable_triton_tma` and "
-        "`xla_gpu_experimental_enable_triton_warp_specialization` to true.");
-  }
   TritonDotFusionSearchSpace search_space(config_.GetDeviceDescription(), &dot);
   VLOG(1) << "Generating configs from search space: "
           << search_space.ToString();
@@ -1026,7 +1015,6 @@ GemmFusionAutotunerImpl::GenerateTritonConfigs(const HloDotInstruction& dot) {
       /*force_contracting_split=*/autotune_contracting_split
           ? std::nullopt
           : std::make_optional(1),
-      /*autotune_tma=*/autotune_tma,
       /*autotune_warp_specialization=*/autotune_warp_specialization);
 
   if (auto overrides = config_.gemm_config_overrides(); overrides.has_value()) {
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc
index 336b668d4b3160..f086503f3ec795 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_cuda.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cublas_v2.h"
 #include "xla/backends/autotuner/codegen_backend.h"
 #include "xla/backends/gpu/autotuner/cudnn.h"
-#include "xla/backends/gpu/codegen/triton/tma_utils.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
@@ -39,7 +38,6 @@ limitations under the License.
 #include "xla/service/gpu/transforms/block_scaling_rewriter.h"
 #include "xla/service/gpu/transforms/cudnn_fusion_compiler.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
-#include "xla/stream_executor/gpu/tma_metadata.h"
 #include "xla/stream_executor/stream_executor.h"
 
 namespace xla {
@@ -119,37 +117,23 @@ std::vector<TritonGemmConfig> GemmFusionAutotunerImpl::GetDefaultTritonConfigs()
 
   if (compute_capability.IsAtLeastBlackwell()) {
     configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kBlackwell);
-  } else if (compute_capability.IsHopper() || compute_capability.IsAmpere()) {
-    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopperAmpere);
+  } else if (compute_capability.IsHopper()) {
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopper);
+  } else if (compute_capability.IsAmpere()) {
+    configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kAmpere);
   } else {
     configs = GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultCuda);
   }
 
-  if (!debug_options_.xla_gpu_experimental_enable_triton_tma() ||
-      !stream_executor::gpu::IsTmaAvailableForDevice(
-          config_.GetDeviceDescription())) {
-    return configs;
-  }
-  std::vector<TritonGemmConfig> tma_parameterized_configs;
-  for (auto& config : configs) {
-    config.is_tma_allowed = false;
-    tma_parameterized_configs.push_back(config);
-
-    if (IsTmaRecommended(config)) {
-      config.is_tma_allowed = true;
-      tma_parameterized_configs.push_back(config);
-    }
-  }
-
   // TODO(b/449668102): Currently only supporting warp specialization on
   // Blackwell+. Potentially extend support to Hopper.
   if (!debug_options_
            .xla_gpu_experimental_enable_triton_warp_specialization() ||
       !compute_capability.IsAtLeastBlackwell()) {
-    return tma_parameterized_configs;
+    return configs;
   }
   std::vector<TritonGemmConfig> warp_specialized_configs;
-  for (auto& config : tma_parameterized_configs) {
+  for (auto& config : configs) {
     config.is_warp_specialization_allowed = false;
     warp_specialized_configs.push_back(config);
 
diff --git a/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc b/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc
index e57bb34bf71e97..a540e6a2ede81d 100644
--- a/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/triton_configs.cc
@@ -72,6 +72,28 @@ config { block_m: 64 block_n: 32 block_k: 64 split_k: 64 num_stages: 3 num_warps
 config { block_m: 64 block_n: 64 block_k: 128 split_k: 8 num_stages: 1 num_warps: 8 num_ctas: 1 }
 config { block_m: 64 block_n: 64 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
 config { block_m: 64 block_n: 64 block_k: 16 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 128 block_k: 64 split_k: 1 num_stages: 1 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 16 block_k: 32 split_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 64 block_k: 64 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 64 block_k: 64 split_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 16 block_n: 16 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 16 block_n: 32 block_k: 64 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 256 block_n: 128 block_k: 64 split_k: 1 num_stages: 3 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 256 block_n: 16 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 256 block_n: 32 block_k: 32 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 32 block_n: 16 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 32 block_n: 16 block_k: 64 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 32 block_n: 16 block_k: 64 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 128 block_k: 16 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 128 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 64 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 64 block_k: 16 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
 )";
 
 constexpr absl::string_view kDefaultCudaTritonConfigs = R"(
@@ -118,7 +140,7 @@ config { block_m: 16 block_n: 16 block_k: 256 split_k: 1 num_stages: 1 num_warps
 config { block_m: 16 block_n: 128 block_k: 32 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
 )";
 
-constexpr absl::string_view kHopperAmpereTritonConfigs = R"(
+constexpr absl::string_view kAmpereTritonConfigs = R"(
 config { block_m: 16 block_n: 16 block_k: 64 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
 config { block_m: 16 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
 config { block_m: 16 block_n: 16 block_k: 128 split_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 }
@@ -157,6 +179,72 @@ config { block_m: 128 block_n: 256 block_k: 64 split_k: 1 num_stages: 4 num_warp
 config { block_m: 64 block_n: 8 block_k: 128 split_k: 2 num_stages: 3 num_warps: 4 num_ctas: 1 }
 )";
 
+constexpr absl::string_view kHopperTritonConfigs = R"(
+config { block_m: 16 block_n: 16 block_k: 64 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 16 block_n: 256 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 }
+config { block_m: 32 block_n: 32 block_k: 128 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 256 block_k: 32 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 32 block_n: 256 block_k: 32 split_k: 16 num_stages: 3 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 32 split_k: 16 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 4 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 16 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 16 block_k: 128 split_k: 16 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 64 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 128 num_stages: 2 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 64 split_k: 4 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 128 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 64 block_k: 256 split_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 16 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 64 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 128 block_k: 128 split_k: 8 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 64 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 32 split_k: 8 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 }
+config { block_m: 128 block_n: 32 block_k: 32 split_k: 8 num_stages: 4 num_warps: 2 num_ctas: 1 }
+config { block_m: 128 block_n: 128 block_k: 32 split_k: 8 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 }
+config { block_m: 64 block_n: 8 block_k: 128 split_k: 2 num_stages: 3 num_warps: 4 num_ctas: 1 }
+config { block_m: 16 block_n: 16 block_k: 64 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 16 block_n: 16 block_k: 128 split_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 16 block_n: 256 block_k: 16 split_k: 1 num_stages: 1 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 32 block_n: 32 block_k: 128 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 32 block_n: 256 block_k: 32 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 32 block_n: 256 block_k: 32 split_k: 16 num_stages: 3 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 16 block_k: 32 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 16 block_k: 32 split_k: 16 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 1 num_stages: 1 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 16 block_k: 64 split_k: 16 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 16 block_k: 128 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 16 block_k: 128 split_k: 16 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 32 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 32 block_k: 64 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 32 block_k: 128 split_k: 1 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 64 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 64 block_k: 64 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 64 block_k: 128 split_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 64 block_k: 256 split_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 128 block_k: 16 split_k: 1 num_stages: 4 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 128 block_k: 64 split_k: 1 num_stages: 3 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 64 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 16 block_k: 64 split_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 256 block_k: 32 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+config { block_m: 128 block_n: 256 block_k: 64 split_k: 1 num_stages: 4 num_warps: 8 num_ctas: 1 is_tma_allowed: true }
+)";
+
 absl::flat_hash_map<TritonConfigsPlatform, std::vector<TritonGemmConfig>>
 LoadTritonConfigs() {
   absl::flat_hash_map<TritonConfigsPlatform, std::vector<TritonGemmConfig>>
@@ -181,10 +269,11 @@ LoadTritonConfigs() {
   const std::initializer_list<
       std::pair<TritonConfigsPlatform, absl::string_view>>
       kConfigsMap = {
+          {TritonConfigsPlatform::kAmpere, kAmpereTritonConfigs},
           {TritonConfigsPlatform::kBlackwell, kBlackwellTritonConfigs},
           {TritonConfigsPlatform::kDefaultCuda, kDefaultCudaTritonConfigs},
           {TritonConfigsPlatform::kDefaultRocm, kDefaultRocmTritonConfigs},
-          {TritonConfigsPlatform::kHopperAmpere, kHopperAmpereTritonConfigs},
+          {TritonConfigsPlatform::kHopper, kHopperTritonConfigs},
       };
   for (const auto& [platform, config_str] : kConfigsMap) {
     result[platform] = parse_config(config_str);
diff --git a/third_party/xla/xla/service/gpu/autotuning/triton_configs.h b/third_party/xla/xla/service/gpu/autotuning/triton_configs.h
index 252b4be2b1b692..b37950a6e1d7b0 100644
--- a/third_party/xla/xla/service/gpu/autotuning/triton_configs.h
+++ b/third_party/xla/xla/service/gpu/autotuning/triton_configs.h
@@ -24,10 +24,11 @@ namespace xla {
 namespace gpu {
 
 enum class TritonConfigsPlatform {
+  kAmpere,
   kBlackwell,
   kDefaultCuda,
   kDefaultRocm,
-  kHopperAmpere,
+  kHopper,
 };
 
 const std::vector<TritonGemmConfig>& GetTritonConfigsForPlatform(
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 0982c10ad42298..f3b61c30ae07a6 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -697,9 +697,6 @@ message DebugOptions {
   optional bool xla_gpu_experimental_enable_triton_heroless_priority_fusion =
       340;
 
-  // When possible, XLA will use Triton's TMA loads/stores.
-  optional bool xla_gpu_experimental_enable_triton_tma = 355;
-
   // When possible, XLA will use Triton's auto warp specialization feature.
   optional bool xla_gpu_experimental_enable_triton_warp_specialization = 421;
 
@@ -1426,14 +1423,15 @@ message DebugOptions {
   reserved "xla_use_shardy";
   reserved "xla_gpu_unsupported_annotate_with_emitter_loc";
   reserved "xla_gpu_experimental_enable_command_buffer_on_thunks";
+  reserved "xla_gpu_experimental_enable_triton_tma";
 
   reserved 5, 63, 80, 93, 94, 98, 117, 130, 133, 134, 139, 141, 143, 152, 158,
       160, 161, 162, 167, 168, 169, 171, 172, 173, 176, 177, 178, 179, 180, 183,
       184, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 204, 206,
       207, 211, 214, 218, 220, 221, 226, 229, 230, 233, 234, 238, 242, 249, 263,
       264, 266, 270, 271, 275, 276, 278, 279, 281, 282, 286, 298, 299, 302, 303,
-      309, 313, 314, 319, 320, 325, 326, 332, 346, 352, 358, 361, 367, 369, 371,
-      385, 394, 398, 402, 423;
+      309, 313, 314, 319, 320, 325, 326, 332, 346, 352, 355, 358, 361, 367, 369,
+      371, 385, 394, 398, 402, 423;
 }
 
 // Contains flags which affects the GPU compilation result.

From f53e5493031c29ec2da271ba39aedca182ecfecc Mon Sep 17 00:00:00 2001
From: Shaogang Wang <shawnw@nvidia.com>
Date: Wed, 10 Dec 2025 05:01:19 -0800
Subject: [PATCH 121/753] PR #35011: Use human readable string to display
 memory allocator sizes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35011

📝 Summary of Changes
This pull request updates the allocator statistics reporting across both `stream_executor` and `tsl/framework` to improve readability and consistency.
Copybara import of the project:

--
29373821e0a897a93acf3faa24defcd444c1518f by Shawn Wang <shawnw@nvidia.com>:

Use human readable string to display memory allocator sizes

--
3b496cd030c7fa450cb72bfc2513e654d32ef15e by Shawn Wang <shawnw@nvidia.com>:

fix typos

--
66b17f17fbcc3a05e004f3c091abce0e7a8d96e4 by Shawn Wang <shawnw@nvidia.com>:

typos

Merging this change closes #35011

PiperOrigin-RevId: 842672995
---
 third_party/xla/xla/stream_executor/BUILD     |  5 ++-
 .../xla/stream_executor/allocator_stats.cc    | 28 ++++++++------
 third_party/xla/xla/tsl/framework/BUILD       |  2 +
 .../xla/xla/tsl/framework/allocator.cc        | 38 ++++++++++---------
 .../xla/xla/tsl/framework/bfc_allocator.cc    | 12 ++++--
 5 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 5c1481cc227da1..5e9db471bbb232 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -824,7 +824,10 @@ cc_library(
     name = "allocator_stats",
     srcs = ["allocator_stats.cc"],
     hdrs = ["allocator_stats.h"],
-    deps = ["@com_google_absl//absl/strings:str_format"],
+    deps = [
+        "@com_google_absl//absl/strings:str_format",
+        "@local_tsl//tsl/platform:numbers",
+    ],
 )
 
 cc_library(
diff --git a/third_party/xla/xla/stream_executor/allocator_stats.cc b/third_party/xla/xla/stream_executor/allocator_stats.cc
index de6432b29d7bf2..1d10eba776da7b 100644
--- a/third_party/xla/xla/stream_executor/allocator_stats.cc
+++ b/third_party/xla/xla/stream_executor/allocator_stats.cc
@@ -18,23 +18,29 @@ limitations under the License.
 #include <string>
 
 #include "absl/strings/str_format.h"
+#include "tsl/platform/numbers.h"
 
 namespace stream_executor {
 
 std::string AllocatorStats::DebugString() const {
   return absl::StrFormat(
-      "Limit:            %20lld\n"
-      "InUse:            %20lld\n"
-      "MaxInUse:         %20lld\n"
+      "Limit:            %20s\n"
+      "InUse:            %20s\n"
+      "MaxInUse:         %20s\n"
       "NumAllocs:        %20lld\n"
-      "MaxAllocSize:     %20lld\n"
-      "Reserved:         %20lld\n"
-      "PeakReserved:     %20lld\n"
-      "LargestFreeBlock: %20lld\n",
-      this->bytes_limit ? *this->bytes_limit : 0, this->bytes_in_use,
-      this->peak_bytes_in_use, this->num_allocs, this->largest_alloc_size,
-      this->bytes_reserved, this->peak_bytes_reserved,
-      this->largest_free_block_bytes);
+      "MaxAllocSize:     %20s\n"
+      "Reserved:         %20s\n"
+      "PeakReserved:     %20s\n"
+      "LargestFreeBlock: %20s\n",
+      tsl::strings::HumanReadableNumBytes(this->bytes_limit ? *this->bytes_limit
+                                                            : 0),
+      tsl::strings::HumanReadableNumBytes(this->bytes_in_use),
+      tsl::strings::HumanReadableNumBytes(this->peak_bytes_in_use),
+      this->num_allocs,
+      tsl::strings::HumanReadableNumBytes(this->largest_alloc_size),
+      tsl::strings::HumanReadableNumBytes(this->bytes_reserved),
+      tsl::strings::HumanReadableNumBytes(this->peak_bytes_reserved),
+      tsl::strings::HumanReadableNumBytes(this->largest_free_block_bytes));
 }
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/tsl/framework/BUILD b/third_party/xla/xla/tsl/framework/BUILD
index 2c058abf55f2dd..141d67cf93d5f8 100644
--- a/third_party/xla/xla/tsl/framework/BUILD
+++ b/third_party/xla/xla/tsl/framework/BUILD
@@ -120,6 +120,7 @@ cc_library(
             ":allocator_registry_impl",
             "@com_google_absl//absl/synchronization",
             "//xla/tsl/lib/gtl:inlined_vector",
+            "@local_tsl//tsl/platform:numbers",
             "@local_tsl//tsl/platform:strcat",
             "//xla/tsl/platform:env",
             "//xla/tsl/platform:env_impl",
@@ -132,6 +133,7 @@ cc_library(
         otherwise = [
             "//xla/tsl/lib/gtl:inlined_vector",
             "//xla/tsl/platform:logging",
+            "@local_tsl//tsl/platform:numbers",
             "@local_tsl//tsl/platform:platform_port",
             "@local_tsl//tsl/platform:strcat",
             "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/tsl/framework/allocator.cc b/third_party/xla/xla/tsl/framework/allocator.cc
index 0c496045c93e56..112f6555ef5902 100644
--- a/third_party/xla/xla/tsl/framework/allocator.cc
+++ b/third_party/xla/xla/tsl/framework/allocator.cc
@@ -19,33 +19,35 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "xla/tsl/framework/allocator_registry.h"
 #include "xla/tsl/framework/tracking_allocator.h"
 #include "xla/tsl/platform/types.h"
 #include "tsl/platform/mem.h"
-#include "tsl/platform/strcat.h"
+#include "tsl/platform/numbers.h"
 
 namespace tsl {
 
 std::string AllocatorStats::DebugString() const {
   return absl::StrFormat(
-      "Limit:            %20lld\n"
-      "InUse:            %20lld\n"
-      "MaxInUse:         %20lld\n"
+      "Limit:            %20s\n"
+      "InUse:            %20s\n"
+      "MaxInUse:         %20s\n"
       "NumAllocs:        %20lld\n"
-      "MaxAllocSize:     %20lld\n"
-      "Reserved:         %20lld\n"
-      "PeakReserved:     %20lld\n"
-      "LargestFreeBlock: %20lld\n",
-      static_cast<long long>(this->bytes_limit ? *this->bytes_limit : 0),
-      static_cast<long long>(this->bytes_in_use),
-      static_cast<long long>(this->peak_bytes_in_use),
+      "MaxAllocSize:     %20s\n"
+      "Reserved:         %20s\n"
+      "PeakReserved:     %20s\n"
+      "LargestFreeBlock: %20s\n",
+      strings::HumanReadableNumBytes(this->bytes_limit ? *this->bytes_limit
+                                                       : 0),
+      strings::HumanReadableNumBytes(this->bytes_in_use),
+      strings::HumanReadableNumBytes(this->peak_bytes_in_use),
       static_cast<long long>(this->num_allocs),
-      static_cast<long long>(this->largest_alloc_size),
-      static_cast<long long>(this->bytes_reserved),
-      static_cast<long long>(this->peak_bytes_reserved),
-      static_cast<long long>(this->largest_free_block_bytes));
+      strings::HumanReadableNumBytes(this->largest_alloc_size),
+      strings::HumanReadableNumBytes(this->bytes_reserved),
+      strings::HumanReadableNumBytes(this->peak_bytes_reserved),
+      strings::HumanReadableNumBytes(this->largest_free_block_bytes));
 }
 
 constexpr size_t Allocator::kAllocatorAlignment;
@@ -59,9 +61,9 @@ void EnableCPUAllocatorFullStats() { cpu_allocator_collect_full_stats = true; }
 bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; }
 
 std::string AllocatorAttributes::DebugString() const {
-  return strings::StrCat("AllocatorAttributes(on_host=", on_host(),
-                         " nic_compatible=", nic_compatible(),
-                         " gpu_compatible=", gpu_compatible(), ")");
+  return absl::StrCat("AllocatorAttributes(on_host=", on_host(),
+                      " nic_compatible=", nic_compatible(),
+                      " gpu_compatible=", gpu_compatible(), ")");
 }
 
 Allocator* cpu_allocator_base() {
diff --git a/third_party/xla/xla/tsl/framework/bfc_allocator.cc b/third_party/xla/xla/tsl/framework/bfc_allocator.cc
index 4891503a0314be..283df25cc17306 100644
--- a/third_party/xla/xla/tsl/framework/bfc_allocator.cc
+++ b/third_party/xla/xla/tsl/framework/bfc_allocator.cc
@@ -1116,11 +1116,15 @@ void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
   }
   LOG(INFO) << "Sum Total of in-use chunks: "
             << strings::HumanReadableNumBytes(total_bytes);
-  LOG(INFO) << "Total bytes in pool: " << *stats_.pool_bytes
-            << " memory_limit_: " << memory_limit_
-            << " available bytes: " << (memory_limit_ - *stats_.pool_bytes)
+  LOG(INFO) << "Total size in pool: "
+            << strings::HumanReadableNumBytes(*stats_.pool_bytes)
+            << " memory_limit_: "
+            << strings::HumanReadableNumBytes(memory_limit_)
+            << " available size: "
+            << strings::HumanReadableNumBytes(memory_limit_ -
+                                              *stats_.pool_bytes)
             << " curr_region_allocation_bytes_: "
-            << curr_region_allocation_bytes_;
+            << strings::HumanReadableNumBytes(curr_region_allocation_bytes_);
   LOG(INFO) << "Stats: \n" << stats_.DebugString();
 }
 

From 0a7d67300a46efcd75bb03729891be05911f4396 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Wed, 10 Dec 2025 06:01:02 -0800
Subject: [PATCH 122/753] [Autotuner] Add logging to FissionBackend.

PiperOrigin-RevId: 842689931
---
 third_party/xla/xla/backends/gpu/autotuner/fission_backend.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/fission_backend.cc b/third_party/xla/xla/backends/gpu/autotuner/fission_backend.cc
index 2cddcba025c932..5452a82f39656b 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/fission_backend.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/fission_backend.cc
@@ -83,6 +83,8 @@ absl::Status InlineFissionedComputation(HloInstruction* fusion_instr,
 absl::StatusOr<std::vector<std::unique_ptr<BackendConfig>>>
 FissionBackend::GetSupportedConfigs(const HloInstruction& instr) {
   if (!IsSupported(instr)) {
+    VLOG(3) << "Instruction not supported by " << name() << ": "
+            << instr.ToString();
     return std::vector<std::unique_ptr<BackendConfig>>();
   }
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
@@ -90,6 +92,8 @@ FissionBackend::GetSupportedConfigs(const HloInstruction& instr) {
   absl::StatusOr<HloInstruction*> supported_instr =
       FindFirstSupportedInstruction(hlo_module.get());
   if (supported_instr.status().code() == absl::StatusCode::kNotFound) {
+    VLOG(3) << "No supported instructions found by " << name() << ": "
+            << instr.ToString();
     return std::vector<std::unique_ptr<BackendConfig>>();
   }
   TF_RETURN_IF_ERROR(supported_instr.status());

From 4b753fd347d5c1822a8e804440ef369a5391c6df Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 10 Dec 2025 06:01:28 -0800
Subject: [PATCH 123/753] [XLA:GPU] Handle packed element types in sort.

We need to pass the address when calling the comparison computation. However
the unpacking happens when loading the element values. Therefore for packed
types we need to copy to a temporary buffer and pass the address of the buffer.

PiperOrigin-RevId: 842690104
---
 .../backends/gpu/codegen/llvm/sort_util.cc    | 15 ++++++++-
 .../xla/xla/service/gpu/tests/sorting_test.cc | 33 ++++++++++++++++---
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
index e7634a27615e37..9332d9fa5ce937 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "llvm/Support/Casting.h"
 #include "xla/backends/gpu/codegen/llvm/parallel_loop_emitter.h"
 #include "xla/layout_util.h"
+#include "xla/primitive_util.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/target_util.h"
@@ -469,7 +470,19 @@ absl::Status EmitSortInPlace(
         IrArray::Index keys_index(keys_multi_index,
                                   values_arrays[operand].GetShape(),
                                   tiles_index.GetType());
-        return values_arrays[operand].EmitArrayElementAddress(keys_index, b);
+        PrimitiveType element_type =
+            values_arrays[operand].GetShape().element_type();
+        if (!primitive_util::IsSubByteNonPredType(element_type)) {
+          return values_arrays[operand].EmitArrayElementAddress(keys_index, b);
+        }
+        auto element =
+            values_arrays[operand].EmitReadArrayElement(keys_index, b);
+        auto llvm_element_type =
+            llvm_ir::PrimitiveTypeToIrType(element_type, b->getContext());
+        llvm::Value* element_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
+            llvm_element_type, "element_buffer", b);
+        b->CreateStore(element, element_buffer);
+        return element_buffer;
       };
       auto element_address_pointee_type = [&](int64_t operand, llvm::Value*) {
         return values_arrays[operand].GetElementLlvmType();
diff --git a/third_party/xla/xla/service/gpu/tests/sorting_test.cc b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
index 7e9e5dea71ed67..a36504161cae5e 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
@@ -72,7 +72,8 @@ to_apply=compare
       kHloTemplate, primitive_util::LowercasePrimitiveTypeName(GetParam()));
   // We expect that all types except PRED and F8 types are rewritten to a custom
   // call.
-  bool rewrite = GetParam() != PRED && !primitive_util::IsF8Type(GetParam());
+  bool rewrite = GetParam() != PRED && GetParam() != S4 && GetParam() != U4 &&
+                 !primitive_util::IsF8Type(GetParam());
   std::string check = rewrite ? "CHECK: custom-call" : "CHECK-NOT: custom-call";
   MatchOptimizedHlo(hlo, check);
   EXPECT_TRUE(RunAndCompare(hlo, ErrorSpec{0, 0}));
@@ -83,9 +84,9 @@ INSTANTIATE_TEST_SUITE_P(
     // 4bit types like U4, S4, or F4E2M1FN are currently not supported.
     // F8E8M0FNU cannot represent NaNs and fails the test below.
     ::testing::ValuesIn({
-        PRED,                               // boolean
-        S8,         S16,    S32,      S64,  // signed
-        U8,         U16,    U32,      U64,  // unsigned
+        PRED,                                              // boolean
+        S4,         S8,     S16,      S32,           S64,  // signed
+        U4,         U8,     U16,      U32,           U64,  // unsigned
         F8E5M2,     F8E4M3, F8E4M3FN, F8E4M3B11FNUZ, F8E3M4, F8E5M2FNUZ,
         F8E4M3FNUZ, F16,    BF16,     F32,           F64  // floating point
     }),
@@ -121,6 +122,30 @@ ENTRY TestComputation {
   EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5}));
 }
 
+TEST_F(SortingTest, PackedElementType) {
+  const char* hlo_text = R"(
+    HloModule module
+
+    sorting_computation {
+      %lhs_update_0 = s4[] parameter(2)
+      %rhs_update_0 = s4[] parameter(3)
+      %lhs_permutation = s32[] parameter(4)
+      %rhs_permutation = s32[] parameter(5)
+      %lhs_key = s32[] parameter(0)
+      %rhs_key = s32[] parameter(1)
+      ROOT %compare.2 = pred[] compare(%lhs_key, %rhs_key), direction=LT
+    }
+
+    ENTRY main {
+      p0 = s32[16384]{0} parameter(0)
+      p1 = s4[16384]{0} parameter(1)
+      iota = s32[16384]{0} iota(), iota_dimension=0
+      ROOT sort = (s32[16384]{0}, s4[16384]{0}, s32[16384]{0}) sort(p0, p1, iota), dimensions={0}, is_stable=true, to_apply=sorting_computation
+    }
+  )";
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
+}
+
 // Test that verifies the IgnoreMemorySpace option works correctly
 TEST_F(SortingTest, LayoutsInShapesEqualWithIgnoreMemorySpace) {
   const char* hlo_text = R"(

From 7d78fdae98ed04c2b629d7331d63b5fd16ebdf03 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <basioli@google.com>
Date: Wed, 10 Dec 2025 06:13:07 -0800
Subject: [PATCH 124/753] [XLA:GPU][codegen] Add AllReduce checks to triton
 support checks

PiperOrigin-RevId: 842693983
---
 .../gpu/codegen/triton/collective_emitter.cc  | 14 ++++++--
 .../gpu/codegen/triton/fusion_emitter.cc      |  4 +++
 .../backends/gpu/codegen/triton/support.cc    | 31 ++++++++++++++++++
 .../gpu/codegen/triton/support_test.cc        | 32 ++++++++++++++++++-
 .../backends/gpu/codegen/triton/test_utils.cc |  8 ++++-
 .../xla/xla/hlo/analysis/indexing_analysis.cc |  6 ++--
 6 files changed, 89 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc
index 25b6c1d8f3834c..66d7d22fd27cc9 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc
@@ -190,8 +190,11 @@ absl::StatusOr<TensorValue> EmitAllReduce(
     const BlockLevelParameters& block_level_parameters,
     mlir::FunctionOpInterface fn, mlir::Value pid,
     absl::flat_hash_map<const TiledHloInstruction*, TensorValue>& values) {
-  const int64_t num_elements =
-      ShapeUtil::ElementsIn(computation->root_instruction()->shape());
+  const HloInstruction* root_instruction = computation->root_instruction();
+  if (root_instruction->opcode() == HloOpcode::kAllReduceDone) {
+    root_instruction = root_instruction->operand(0);
+  }
+  const int64_t num_elements = ShapeUtil::ElementsIn(root_instruction->shape());
   const TiledHloInstruction* tiled_input_hlo = tiled_hlo_reduce.operand(0);
   TensorValue input_tile = values[tiled_input_hlo];
 
@@ -279,6 +282,10 @@ absl::StatusOr<TensorValue> EmitAllReduce(
   }
 
   // 2. Synchronization phase: Wait for all ranks to complete the scatter.
+  if (all_reduce.device_list().replica_groups().empty()) {
+    return Internal(
+        "Triton emitting AllReduce without replica groups is not supported.");
+  }
   int64_t world_size = all_reduce.device_list().num_devices_per_group();
   mtx::BlockBarrierOp::create(b, signal_buffers, device_rank, signal_value,
                               b.getI32IntegerAttr(world_size));
@@ -451,6 +458,9 @@ absl::StatusOr<TensorValue> EmitCollective(
     absl::flat_hash_map<const TiledHloInstruction*, TensorValue>& values) {
   const HloComputation* computation = fusion->fused_instructions_computation();
   const HloInstruction* root = computation->root_instruction();
+  if (root->opcode() == HloOpcode::kAllReduceDone) {
+    root = root->operand(0);
+  }
   switch (root->opcode()) {
     case HloOpcode::kAllReduceStart:
       return EmitAllReduce(
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter.cc
index 17168e1b7fb251..a344c472fe143a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter.cc
@@ -1191,6 +1191,10 @@ absl::StatusOr<TensorValue> EmitTiledHloInstruction(
                           values);
   }
 
+  if (hlo->opcode() == HloOpcode::kAllReduceDone) {
+    return values[tiled_hlo.operand(0)];
+  }
+
   if (hlo->IsElementwise()) {
     std::vector<Value> operands;
     operands.reserve(hlo->operands().size());
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/support.cc b/third_party/xla/xla/backends/gpu/codegen/triton/support.cc
index 2f044f7f98afa7..2a0a85b0916a2e 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/support.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/support.cc
@@ -292,6 +292,31 @@ CodegenDecision CanTritonHandleReduce(
       "Reduction is not a row-reduction of a single operand.");
 }
 
+CodegenDecision IsTritonSupportedAllReduce(
+    const HloAllReduceInstruction& all_reduce,
+    const se::GpuComputeCapability& gpu_version) {
+  if (all_reduce.replica_groups().empty()) {
+    return CodegenDecision::Forbid("All-reduce does not have replica groups.");
+  }
+  if (all_reduce.shape().element_type() == PrimitiveType::F8E4M3FN ||
+      all_reduce.shape().element_type() == PrimitiveType::F8E5M2 ||
+      all_reduce.shape().element_type() == PrimitiveType::S4) {
+    return CodegenDecision::Forbid(
+        "S4, F8E4M3FN and F8E5M2 are not supported for all-reduces.");
+  }
+
+  bool is_triton_supported_all_reduce_computation = absl::c_all_of(
+      all_reduce.to_apply()->instructions(), [&](const HloInstruction* instr) {
+        return IsTritonSupportedInstructionImpl(*instr, gpu_version).CanFuse();
+      });
+  if (!is_triton_supported_all_reduce_computation) {
+    return CodegenDecision::Forbid(
+        "Unsupported all-reduce computation by Triton.");
+  }
+
+  return CodegenDecision::Allow();
+}
+
 bool IsInTritonNestedGemmFusion(const HloInstruction& hlo) {
   if (!hlo.parent()->IsFusionComputation()) {
     return false;
@@ -682,6 +707,12 @@ CodegenDecision IsTritonSupportedInstructionImpl(
     case HloOpcode::kFusion:
       return IsTritonSupportedFusion(*Cast<HloFusionInstruction>(&instr),
                                      gpu_version);
+    case HloOpcode::kAllReduceStart:
+      return IsTritonSupportedAllReduce(*Cast<HloAllReduceInstruction>(&instr),
+                                        gpu_version);
+    case HloOpcode::kAllReduceDone:
+      return IsTritonSupportedAllReduce(
+          *Cast<HloAllReduceInstruction>(instr.operand(0)), gpu_version);
     default:
       // Not all instructions have a special handling.
       break;
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc
index 7cda77ed4e673a..7ffb158965fd41 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/support_test.cc
@@ -1172,7 +1172,7 @@ ENTRY triton_computation {
 }
 
 TEST_P(CollectiveTest,
-       UnsupportedAllReduceStartAndDoneFailGracefullyWithTriton) {
+       IsTritonSupportedAllReduceStartAndDoneWithNoReplicaGroups) {
   // 'all-reduce-start' and 'all-reduce-done' need to be tested together, since
   // the HLO verifier relies on one directly consuming the other.
   auto [data_type, cc] = GetParam();
@@ -1201,6 +1201,36 @@ ENTRY triton_computation {
   RunSupportTest(std::move(ti_done), /*output_tile_sizes=*/{2, 2}, cc);
 }
 
+TEST_P(CollectiveTest,
+       IsTritonSupportedAllReduceStartAndDoneWithReplicaGroups) {
+  // 'all-reduce-start' and 'all-reduce-done' need to be tested together, since
+  // the HLO verifier relies on one directly consuming the other.
+  auto [data_type, cc] = GetParam();
+  const std::string kHloTestTemplate = R"(
+apply_op {
+  x = $0[] parameter(0)
+  y = $0[] parameter(1)
+  ROOT apply_op = $0[] add(x, y)
+}
+
+ENTRY triton_computation {
+  input = $0[128,32] parameter(0)
+  all-reduce-start = $0[128,32] all-reduce-start(input), replica_groups={{0,1}},
+      to_apply=apply_op
+  ROOT all-reduce-done = $0[128,32] all-reduce-done(all-reduce-start)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(
+      TestedInstruction ti_start,
+      ParseTemplateAndGetInstruction(kHloTestTemplate, data_type,
+                                     HloOpcode::kAllReduceStart));
+  TF_ASSERT_OK_AND_ASSIGN(
+      TestedInstruction ti_done,
+      ParseTemplateAndGetInstruction(kHloTestTemplate, data_type,
+                                     HloOpcode::kAllReduceDone));
+  RunSupportTest(std::move(ti_start), /*output_tile_sizes=*/{2, 2}, cc);
+  RunSupportTest(std::move(ti_done), /*output_tile_sizes=*/{2, 2}, cc);
+}
+
 TEST_P(CollectiveTest, UnsupportedAllToAllFailsGracefullyWithTriton) {
   auto [data_type, cc] = GetParam();
   const std::string kHloTestTemplate = R"(
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.cc b/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.cc
index c02705a9e07ff2..e95c64815e783f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.cc
@@ -300,6 +300,10 @@ std::string TritonSupportTestDeviceToString(
 
 namespace {
 
+bool IsCollectiveFusion(const HloFusionInstruction& fusion) {
+  return fusion.fused_expression_root()->opcode() == HloOpcode::kAllReduceDone;
+}
+
 // This function does nothing if the input module already has an entry
 // computation whose root is a fusion. Otherwise, creates a new entry
 // computation whose root is a fusion instruction that calls the original entry
@@ -327,7 +331,9 @@ absl::Status ConvertEntryToTritonFusion(HloModule* module) {
 
   gpu::GpuBackendConfig gpu_config;
   gpu_config.mutable_fusion_backend_config()->set_kind(
-      kTritonNestedGemmFusionKind);
+      IsCollectiveFusion(*xla::Cast<HloFusionInstruction>(fusion))
+          ? kTritonCollectiveFusionKind
+          : kTritonNestedGemmFusionKind);
   TF_RETURN_IF_ERROR(fusion->set_backend_config(gpu_config));
 
   auto new_entry =
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis.cc b/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
index f4fc26d98e0932..8f4e2ec440ae3f 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
@@ -1729,7 +1729,8 @@ HloInstructionIndexing ComputeOutputToInputIndexing(const HloInstruction* instr,
       // b/65689298.
       instr->opcode() == HloOpcode::kMap ||
       // For a single device, all-reduce is an elementwise op.
-      instr->opcode() == HloOpcode::kAllReduceStart) {
+      instr->opcode() == HloOpcode::kAllReduceStart ||
+      instr->opcode() == HloOpcode::kAllReduceDone) {
     return ComputeOutputToInputCwiseOpIndexing(instr, mlir_context);
   }
   if (instr->opcode() == HloOpcode::kBitcast) {
@@ -1815,7 +1816,8 @@ HloInstructionIndexing ComputeInputToOutputIndexing(const HloInstruction* instr,
       // b/65689298.
       instr->opcode() == HloOpcode::kMap ||
       // For a single device, all-reduce has 1:1 output to input mapping.
-      instr->opcode() == HloOpcode::kAllReduceStart) {
+      instr->opcode() == HloOpcode::kAllReduceStart ||
+      instr->opcode() == HloOpcode::kAllReduceDone) {
     return ComputeInputToOutputCwiseOpIndexing(instr, mlir_context);
   }
   if (instr->opcode() == HloOpcode::kBitcast) {

From 62925a3d0e7a6c18c10644b39d143d47e049ecdd Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 10 Dec 2025 06:16:05 -0800
Subject: [PATCH 125/753] [stream_executor] Document se::MemoryAllocation as a
 future direction for unifying physcal memory allocations in SE

PiperOrigin-RevId: 842694845
---
 third_party/xla/xla/stream_executor/BUILD     |  5 ++++
 .../generic_memory_allocation.h               |  6 ++--
 .../xla/stream_executor/memory_allocation.h   | 29 +++++++++++++++----
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index 5e9db471bbb232..b94670ea68a49c 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -344,6 +344,10 @@ xla_cc_test(
 cc_library(
     name = "memory_allocation",
     hdrs = ["memory_allocation.h"],
+    deps = [
+        ":device_address",
+        "@com_google_absl//absl/base:core_headers",
+    ],
 )
 
 cc_library(
@@ -529,6 +533,7 @@ cc_library(
     name = "generic_memory_allocation",
     hdrs = ["generic_memory_allocation.h"],
     deps = [
+        ":device_address",
         ":memory_allocation",
         "@com_google_absl//absl/functional:any_invocable",
     ],
diff --git a/third_party/xla/xla/stream_executor/generic_memory_allocation.h b/third_party/xla/xla/stream_executor/generic_memory_allocation.h
index c443df3408eacf..69ec0d9361ec1a 100644
--- a/third_party/xla/xla/stream_executor/generic_memory_allocation.h
+++ b/third_party/xla/xla/stream_executor/generic_memory_allocation.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <utility>
 
 #include "absl/functional/any_invocable.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/memory_allocation.h"
 
 namespace stream_executor {
@@ -39,8 +40,9 @@ class GenericMemoryAllocation final : public MemoryAllocation {
     }
   }
 
-  void* opaque() const final { return ptr_; }
-  uint64_t size() const final { return size_; }
+  DeviceAddressBase address() const final {
+    return DeviceAddressBase(ptr_, size_);
+  }
 
  private:
   void* ptr_ = nullptr;
diff --git a/third_party/xla/xla/stream_executor/memory_allocation.h b/third_party/xla/xla/stream_executor/memory_allocation.h
index 0e0df2442001e0..2a75a1069b648b 100644
--- a/third_party/xla/xla/stream_executor/memory_allocation.h
+++ b/third_party/xla/xla/stream_executor/memory_allocation.h
@@ -18,11 +18,22 @@ limitations under the License.
 
 #include <cstdint>
 
+#include "absl/base/macros.h"
+#include "xla/stream_executor/device_address.h"
+
 namespace stream_executor {
 
-// An RAII handle for a memory allocated for a device. It can be pinned host
-// memory, unified memory, device memory, etc. depending on what kinds of
-// memories are supported by underlying device.
+// A MemoryAllocation is a block of physical memory allocated on the
+// StreamExecutor device.
+//
+// MemoryAllocation is not necessarily a physical memory on the physical device
+// (i.e. GPU), it can be a memory on the host pre-mapped for the host to device
+// communication. It can be pinned host memory, unified memory, device memory,
+// etc. depending on what kinds of memories are supported by underlying device.
+//
+// MemoryAllocation can be mapped to a DeviceAddress, which can be used to
+// access the memory from device or host. Multiple device address ranges can be
+// mapped to the same MemoryAllocation.
 class MemoryAllocation {
  public:
   MemoryAllocation() = default;
@@ -31,8 +42,16 @@ class MemoryAllocation {
   MemoryAllocation(MemoryAllocation&&) = delete;
   MemoryAllocation& operator=(MemoryAllocation&&) = delete;
 
-  virtual void* opaque() const = 0;
-  virtual uint64_t size() const = 0;
+  // A device address which gives access to the memory allocation. Can be
+  // nullptr if memory allocation is not adressable, i.e. physical allocation
+  // might not be mapped to any virtual address by default.
+  virtual DeviceAddressBase address() const = 0;
+
+  ABSL_DEPRECATE_AND_INLINE()
+  void* opaque() const { return address().opaque(); }
+
+  ABSL_DEPRECATE_AND_INLINE()
+  uint64_t size() const { return address().size(); }
 };
 
 }  // namespace stream_executor

From dbcd8d5545cb813a255af5bc5b481781e7d7ef73 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Wed, 10 Dec 2025 06:53:05 -0800
Subject: [PATCH 126/753] Canonicalize `dim_shardings` in case of all empty
 vectors

PiperOrigin-RevId: 842706792
---
 third_party/xla/xla/hlo/ir/named_sharding.h       | 15 ++++++++++++++-
 third_party/xla/xla/hlo/ir/named_sharding_test.cc | 12 ++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/ir/named_sharding.h b/third_party/xla/xla/hlo/ir/named_sharding.h
index 01ab052d24a22b..53134af857dc5d 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.h
+++ b/third_party/xla/xla/hlo/ir/named_sharding.h
@@ -71,7 +71,7 @@ class NamedSharding {
                          absl::Span<const AxisRef> unreduced_axes = {},
                          absl::Span<const OpMetadata> metadata = {})
       : mesh_(std::move(mesh)),
-        dim_shardings_(dim_shardings.begin(), dim_shardings.end()),
+        dim_shardings_(CanonicalizedDimShardings(dim_shardings)),
         replicated_axes_(replicated_axes.begin(), replicated_axes.end()),
         unreduced_axes_(unreduced_axes.begin(), unreduced_axes.end()),
         metadata_(metadata.begin(), metadata.end()) {}
@@ -87,6 +87,19 @@ class NamedSharding {
  private:
   friend class HloSharding;
 
+  std::vector<DimensionSharding> CanonicalizedDimShardings(
+      absl::Span<const DimensionSharding> dim_shardings) const {
+    bool all_dims_empty = absl::c_all_of(
+        dim_shardings,
+        [](const DimensionSharding& ds) { return ds.axes().empty(); });
+
+    if (all_dims_empty) {
+      return {};
+    }
+    return std::vector<DimensionSharding>(dim_shardings.begin(),
+                                          dim_shardings.end());
+  }
+
   // Creates a sharding with empty mesh and no sharding axes depicting it is
   // replicated across all devices.
   static NamedSharding Replicate(absl::Span<const OpMetadata> metadata = {}) {
diff --git a/third_party/xla/xla/hlo/ir/named_sharding_test.cc b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
index 78e3b3e3b08095..611ade960943f3 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding_test.cc
+++ b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
@@ -24,6 +24,18 @@ namespace {
 
 using DimensionSharding = NamedSharding::DimensionSharding;
 
+TEST(NamedShardingTest, CanonicalizedDimShardings) {
+  Mesh mesh_abcd({2, 4}, {"a", "b"});
+
+  DimensionSharding empty_ds;
+  NamedSharding sharding1(mesh_abcd, {empty_ds, empty_ds});
+  EXPECT_TRUE(sharding1.dim_shardings().empty());
+
+  DimensionSharding ds_a({AxisRef(0)}, /*is_closed=*/true);
+  NamedSharding sharding2(mesh_abcd, {ds_a, empty_ds});
+  EXPECT_FALSE(sharding2.dim_shardings().empty());
+}
+
 TEST(NamedShardingTest, AxisNameCtor) {
   Mesh mesh_abcd({2, 4, 3, 8}, {"a", "b", "c", "d"});
   AxisRef axis_a(0);

From 26eb24445c7ab6ca5ba397246f222c2671156d2a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 07:07:09 -0800
Subject: [PATCH 127/753] Automated Code Change

PiperOrigin-RevId: 842711894
---
 third_party/xla/xla/ffi/BUILD                   | 1 +
 third_party/xla/xla/ffi/execution_state_test.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/ffi/BUILD b/third_party/xla/xla/ffi/BUILD
index b7b2892e016db3..eb9e58e2a050d0 100644
--- a/third_party/xla/xla/ffi/BUILD
+++ b/third_party/xla/xla/ffi/BUILD
@@ -123,6 +123,7 @@ xla_cc_test(
     srcs = ["execution_state_test.cc"],
     deps = [
         ":execution_state",
+        ":execution_state_proto_cc",
         ":type_registry",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/ffi/execution_state_test.cc b/third_party/xla/xla/ffi/execution_state_test.cc
index 8c05e4caf9098e..254cc1500e2113 100644
--- a/third_party/xla/xla/ffi/execution_state_test.cc
+++ b/third_party/xla/xla/ffi/execution_state_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "xla/ffi/execution_state.pb.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"

From 173d71aa5dd26095c7a5516d33734dbc29fde9eb Mon Sep 17 00:00:00 2001
From: spiao <Songlin.Piao@amd.com>
Date: Wed, 10 Dec 2025 07:22:39 -0800
Subject: [PATCH 128/753] PR #34250: [ROCm] bugfix - consider the situation
 where the best time is infinite
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34250

🐛 Bug Fix

it occasionally happens in [TryFindBestTilingForFusion](https://github.com/openxla/xla/blob/b111957fcb2e2e91839c5e70c6b23216416ca99f/xla/service/gpu/model/gpu_indexing_performance_model.cc#L637C42-L637C68), best_tiled_run_time_data contains Infinite. Such situation leads to an unexpected computed_fusion with tile size[1,1] as below, which leads to performance degression due to register spilling (Tested both on MI300x and H100). This bugfix skips tiles with infinite runtime.

```
HloModule m
%region_2.260.clone.19 (Arg_0.55: f32[], Arg_1.55: f32[]) -> f32[] {
  %Arg_0.55 = f32[] parameter(0)
  %Arg_1.55 = f32[] parameter(1)
  ROOT %add.492.0 = f32[] add(%Arg_0.55, %Arg_1.55)
}

%region_2.260.clone.8 (Arg_0.44: f32[], Arg_1.44: f32[]) -> f32[] {
  %Arg_0.44 = f32[] parameter(0)
  %Arg_1.44 = f32[] parameter(1)
  ROOT %add.481.0 = f32[] add(%Arg_0.44, %Arg_1.44)
}

%region_2.260.clone.7 (Arg_0.43: f32[], Arg_1.43: f32[]) -> f32[] {
  %Arg_0.43 = f32[] parameter(0)
  %Arg_1.43 = f32[] parameter(1)
  ROOT %add.480.0 = f32[] add(%Arg_0.43, %Arg_1.43)
}

%fused_computation.337 (param_0.1730: bf16[1,16384,4096], param_1.1795: bf16[16384,4096]) -> f32[128,4096] {
  %param_0.1730 = bf16[1,16384,4096]{2,1,0} parameter(0)
  %convert.113.32 = f32[1,16384,4096]{2,1,0} convert(%param_0.1730)
  %bitcast.1893 = f32[16384,4096]{1,0} bitcast(%convert.113.32)
  %constant_2184 = f32[] constant(0)
  %reduce.310 = f32[16384]{0} reduce(%bitcast.1893, %constant_2184), dimensions={1}, to_apply=%region_2.260.clone.7
  %bitcast.1892 = f32[1,16384]{1,0} bitcast(%reduce.310)
  %constant_2183 = f32[] constant(0.000244140625)
  %broadcast.1035 = f32[1,16384]{1,0} broadcast(%constant_2183), dimensions={}
  %multiply.520 = f32[1,16384]{1,0} multiply(%bitcast.1892, %broadcast.1035)
  %bitcast.1891 = f32[16384]{0} bitcast(%multiply.520)
  %broadcast.1034 = f32[1,16384,4096]{2,1,0} broadcast(%bitcast.1891), dimensions={1}
  %subtract.183 = f32[1,16384,4096]{2,1,0} subtract(%convert.113.32, %broadcast.1034)
  %multiply.261.15 = f32[1,16384,4096]{2,1,0} multiply(%subtract.183, %subtract.183)
  %bitcast.1136.15 = f32[16384,4096]{1,0} bitcast(%multiply.261.15)
  %reduce.127.15 = f32[16384]{0} reduce(%bitcast.1136.15, %constant_2184), dimensions={1}, to_apply=%region_2.260.clone.8
  %bitcast.1137.13 = f32[1,16384]{1,0} bitcast(%reduce.127.15)
  %multiply.262.13 = f32[1,16384]{1,0} multiply(%bitcast.1137.13, %broadcast.1035)
  %constant_1233_1 = f32[] constant(1e-05)
  %broadcast.449.11 = f32[1,16384]{1,0} broadcast(%constant_1233_1), dimensions={}
  %add.350.11 = f32[1,16384]{1,0} add(%multiply.262.13, %broadcast.449.11)
  %bitcast.213.16 = f32[1,16384,1]{2,1,0} bitcast(%add.350.11)
  %rsqrt.14.5 = f32[1,16384,1]{2,1,0} rsqrt(%bitcast.213.16)
  %bitcast.215.7 = f32[16384]{0} bitcast(%rsqrt.14.5)
  %broadcast.472.7 = f32[1,16384,4096]{2,1,0} broadcast(%bitcast.215.7), dimensions={1}
  %param_1.1795 = bf16[16384,4096]{1,0} parameter(1)
  %bitcast.211.19 = bf16[1,16384,4096]{2,1,0} bitcast(%param_1.1795)
  %convert.201.19 = f32[1,16384,4096]{2,1,0} convert(%bitcast.211.19)
  %multiply.267.5 = f32[1,16384,4096]{2,1,0} multiply(%subtract.183, %convert.201.19)
  %multiply.282.3 = f32[1,16384,4096]{2,1,0} multiply(%broadcast.472.7, %multiply.267.5)
  %bitcast.1210.1 = f32[128,128,4096]{2,1,0} bitcast(%multiply.282.3)
  ROOT %reduce.180.1 = f32[128,4096]{1,0} reduce(%bitcast.1210.1, %constant_2184), dimensions={1}, to_apply=%region_2.260.clone.19
}
ENTRY main {
  p0 = bf16[1,16384,4096] parameter(0)
  p1 = bf16[16384,4096] parameter(1)
  ROOT fusion = f32[128,4096] fusion(p0, p1), kind=kCustom,
    calls=%fused_computation.337, backend_config={
      "fusion_backend_config":{
      "kind":"__triton",
      "block_level_fusion_config":{
        "output_tiles":[{"sizes":["1","1"]}],
        "num_warps":"8",
        "num_ctas":"1",
        "num_stages":"1"}}}
}
```

relevant PR https://github.com/openxla/xla/pull/33777#pullrequestreview-3487884860

@xla-rotation could you review my PR, please?

Copybara import of the project:

--
af2b30aeadb71e1b058d7c9e56267b19d8e02f54 by Songlin Piao <Songlin.Piao@amd.com>:

bugfix - consider the situation where the best time is infinite

--
dd3417221bb00e1df4fd83c8b0145e0c7e6b2205 by Songlin Piao <Songlin.Piao@amd.com>:

added an unit test where best_tiled_run_time_data contains Infinite.

Merging this change closes #34250

PiperOrigin-RevId: 842716246
---
 .../model/gpu_indexing_performance_model.cc   |   5 +
 .../gpu/transforms/priority_fusion_test.cc    | 117 ++++++++++++++++++
 .../softmax_rewriter_triton_test.cc           |  14 +--
 3 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
index f6d737b033aa14..8d0ae07cc7f7af 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.cc
@@ -680,6 +680,11 @@ GpuPerformanceModelWithIndexingAnalysis::TryFindBestTilingForFusion(
         EstimateRunTimeForTiledHloComputation(
             fusion_adaptor, tiled_hlo_computation, launch_dimensions));
 
+    // Skip tilings with infinite runtime (e.g., due to register spilling).
+    if (estimate_run_time_data.exec_time == absl::InfiniteDuration()) {
+      continue;
+    }
+
     if (!best_tiled_run_time_data.has_value() ||
         estimate_run_time_data.exec_time <
             best_tiled_run_time_data->runtime_data.exec_time) {
diff --git a/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc
index fd559471f03ed4..f3253d1a1257b7 100644
--- a/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc
@@ -1292,6 +1292,123 @@ TEST_F(PriorityFusionTest, DoNotFuseInsideReducer) {
               absl_testing::IsOkAndHolds(false));
 }
 
+TEST_F(PriorityFusionTest, SkipsTilingsWithInfiniteRuntime) {
+  // This test verifies the fix in TryFindBestTilingForFusion that skips
+  // tilings with infinite runtime estimates.
+  //
+  // The fix: After estimating runtime for each tiling candidate, check if
+  // exec_time == absl::InfiniteDuration() and skip those tilings.
+  //
+  // Background: DoesComputationFitInRegisters() returns false when tiles are
+  // too large to fit in registers (tile_size > 0.4 * registers_per_block).
+  // When this happens, EstimateRunTimeForTiledHloComputation() returns
+  // EstimateRunTimeData::Infinite() with exec_time = absl::InfiniteDuration().
+  //
+  // Without the fix: If all tilings have infinite runtime, the first one
+  // would be selected as "best" by default, leading to certain register
+  // spilling and poor performance.
+  //
+  // With the fix: Infinite-runtime tilings are skipped during evaluation,
+  // allowing:
+  // 1. Selection of tilings that actually fit in registers, OR
+  // 2. Return FusionDecision::Forbid("No valid tilings found") if all fail
+  //
+  // Test structure: LayerNorm-like computation with reductions that can
+  // trigger problematic tile sizes on certain input shapes.
+  const std::string kHloText = R"(
+HloModule m
+%region_2.260.clone.19 (Arg_0.55: f32[], Arg_1.55: f32[]) -> f32[] {
+  %Arg_0.55 = f32[] parameter(0)
+  %Arg_1.55 = f32[] parameter(1)
+  ROOT %add.492.0 = f32[] add(%Arg_0.55, %Arg_1.55)
+}
+
+%region_2.260.clone.8 (Arg_0.44: f32[], Arg_1.44: f32[]) -> f32[] {
+  %Arg_0.44 = f32[] parameter(0)
+  %Arg_1.44 = f32[] parameter(1)
+  ROOT %add.481.0 = f32[] add(%Arg_0.44, %Arg_1.44)
+}
+
+%region_2.260.clone.7 (Arg_0.43: f32[], Arg_1.43: f32[]) -> f32[] {
+  %Arg_0.43 = f32[] parameter(0)
+  %Arg_1.43 = f32[] parameter(1)
+  ROOT %add.480.0 = f32[] add(%Arg_0.43, %Arg_1.43)
+}
+
+%producer_computation (param_0: bf16[16384,4096]) -> bf16[16384,4096] {
+  %param_0 = bf16[16384,4096]{1,0} parameter(0)
+  %constant_0 = bf16[] constant(1e-03)
+  %broadcast_0 = bf16[16384,4096]{1,0} broadcast(%constant_0), dimensions={}
+  ROOT %add_0 = bf16[16384,4096]{1,0} add(%param_0, %broadcast_0)
+}
+
+%fused_computation.337 (param_0.1730: bf16[1,16384,4096], param_1.1795: bf16[16384,4096]) -> f32[128,4096] {
+  %param_0.1730 = bf16[1,16384,4096]{2,1,0} parameter(0)
+  %convert.113.32 = f32[1,16384,4096]{2,1,0} convert(%param_0.1730)
+  %bitcast.1893 = f32[16384,4096]{1,0} bitcast(%convert.113.32)
+  %constant_2184 = f32[] constant(0)
+  %reduce.310 = f32[16384]{0} reduce(%bitcast.1893, %constant_2184), dimensions={1}, to_apply=%region_2.260.clone.7
+  %bitcast.1892 = f32[1,16384]{1,0} bitcast(%reduce.310)
+  %constant_2183 = f32[] constant(0.000244140625)
+  %broadcast.1035 = f32[1,16384]{1,0} broadcast(%constant_2183), dimensions={}
+  %multiply.520 = f32[1,16384]{1,0} multiply(%bitcast.1892, %broadcast.1035)
+  %bitcast.1891 = f32[16384]{0} bitcast(%multiply.520)
+  %broadcast.1034 = f32[1,16384,4096]{2,1,0} broadcast(%bitcast.1891), dimensions={1}
+  %subtract.183 = f32[1,16384,4096]{2,1,0} subtract(%convert.113.32, %broadcast.1034)
+  %multiply.261.15 = f32[1,16384,4096]{2,1,0} multiply(%subtract.183, %subtract.183)
+  %bitcast.1136.15 = f32[16384,4096]{1,0} bitcast(%multiply.261.15)
+  %reduce.127.15 = f32[16384]{0} reduce(%bitcast.1136.15, %constant_2184), dimensions={1}, to_apply=%region_2.260.clone.8
+  %bitcast.1137.13 = f32[1,16384]{1,0} bitcast(%reduce.127.15)
+  %multiply.262.13 = f32[1,16384]{1,0} multiply(%bitcast.1137.13, %broadcast.1035)
+  %constant_1233_1 = f32[] constant(1e-05)
+  %broadcast.449.11 = f32[1,16384]{1,0} broadcast(%constant_1233_1), dimensions={}
+  %add.350.11 = f32[1,16384]{1,0} add(%multiply.262.13, %broadcast.449.11)
+  %bitcast.213.16 = f32[1,16384,1]{2,1,0} bitcast(%add.350.11)
+  %rsqrt.14.5 = f32[1,16384,1]{2,1,0} rsqrt(%bitcast.213.16)
+  %bitcast.215.7 = f32[16384]{0} bitcast(%rsqrt.14.5)
+  %broadcast.472.7 = f32[1,16384,4096]{2,1,0} broadcast(%bitcast.215.7), dimensions={1}
+  %param_1.1795 = bf16[16384,4096]{1,0} parameter(1)
+  %bitcast.211.19 = bf16[1,16384,4096]{2,1,0} bitcast(%param_1.1795)
+  %convert.201.19 = f32[1,16384,4096]{2,1,0} convert(%bitcast.211.19)
+  %multiply.267.5 = f32[1,16384,4096]{2,1,0} multiply(%subtract.183, %convert.201.19)
+  %multiply.282.3 = f32[1,16384,4096]{2,1,0} multiply(%broadcast.472.7, %multiply.267.5)
+  %bitcast.1210.1 = f32[128,128,4096]{2,1,0} bitcast(%multiply.282.3)
+  ROOT %reduce.180.1 = f32[128,4096]{1,0} reduce(%bitcast.1210.1, %constant_2184), dimensions={1}, to_apply=%region_2.260.clone.19
+}
+ENTRY main {
+  p0 = bf16[1,16384,4096] parameter(0)
+  p1 = bf16[16384,4096] parameter(1)
+  producer_fusion = bf16[16384,4096]{1,0} fusion(p1), kind=kLoop, calls=%producer_computation
+
+  ROOT fusion = f32[128,4096] fusion(p0, producer_fusion), kind=kCustom,
+    calls=%fused_computation.337, backend_config={
+      "fusion_backend_config":{
+      "kind":"__triton",
+      "block_level_fusion_config":{
+        "output_tiles":[{"sizes":["1","1"]}],
+        "num_warps":"8",
+        "num_ctas":"1",
+        "num_stages":"1"}}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHloText));
+
+  module->mutable_config()
+      .mutable_debug_options()
+      .set_xla_gpu_enable_triton_gemm(false);
+
+  module->mutable_config().mutable_debug_options().set_xla_gpu_autotune_level(
+      0);
+
+  // VLOG(2) << module->ToString() << std::endl;
+
+  // Run priority fusion - it should not fuse producer into
+  // %fused_computation.337.
+  EXPECT_THAT(priority_fusion_.Run(module.get()),
+              absl_testing::IsOkAndHolds(false));
+}
+
 class PriorityFusionWithTritonEnabledTest : public PriorityFusionTest {
  public:
   DebugOptions GetDebugOptionsForTest() const override {
diff --git a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc
index f50392dc9d6385..8f3da454abe41d 100644
--- a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc
@@ -1066,8 +1066,7 @@ ENTRY main {
   EXPECT_FALSE(fusion_rewriter_.Run(module.get()).value());
 }
 
-TEST_F(SoftmaxRewriterTritonTest,
-       DoNotFuseNormalizationWithVeryLongRowsIfProfitabilityCheckIsEnabled) {
+TEST_F(SoftmaxRewriterTritonTest, DoesNotFuseNormalizationWithVeryLongRows) {
   const std::string hlo_string = R"(
 HloModule softmax
 max_computation {
@@ -1084,19 +1083,16 @@ ENTRY main {
 })";
 
   {
-    // Verify that SoftmaxRewriterTriton without Cost Model will fuse the
-    // normalization diamond.
+    // Verify that SoftmaxRewriterTriton without Cost Model will not fuse the
+    // normalization diamond, because the row size is too large to fit in
+    // registers.
     SoftmaxRewriterTriton fusion_rewriter_without_cost_model{
         device_info_, HloCostAnalysis::DefaultShapeSize, &alias_info_,
         &mlir_context_,
         /*only_fuse_if_profitable=*/false};
 
     auto module = ParseAndReturnVerifiedModule(hlo_string).value();
-    EXPECT_TRUE(fusion_rewriter_without_cost_model.Run(module.get()).value());
-    EXPECT_TRUE(verifier().Run(module.get()).status().ok());
-    EXPECT_THAT(module->entry_computation()->root_instruction(),
-                GmockMatch(m::Fusion(m::Parameter())
-                               .WithPredicate(HasBlockLevelFusionConfig)));
+    EXPECT_FALSE(fusion_rewriter_without_cost_model.Run(module.get()).value());
   }
 
   {

From 67ba4d5c767afa1747cb0da3611f3cc7d9997742 Mon Sep 17 00:00:00 2001
From: Corentin Kerisit <corentin.kerisit@gmail.com>
Date: Wed, 10 Dec 2025 08:44:17 -0800
Subject: [PATCH 129/753] PR #34320: Migrate from native built-ins to Starlark
 rule definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34320

📝 Summary of Changes

- Replaces all occurrences of native.rule by their starlark version.
- Add missing load statements where native rules were implicit.

🎯 Justification

This is a preparation PR for Bazel 9 where implicits must be loaded from starlark.

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix, ⚡️ Performance Improvement,
✨ New Feature, ♻️ Cleanup, 📚 Documentation, 🧪 Tests

Copybara import of the project:

--
08d48174ab45858d7a16df9cfe43fea13d6366f7 by Corentin Kerisit <corentin.kerisit@gmail.com>:

Move all remaining native.* rules to their loaded equivalent

--
3dc798cfe9bed6c0094c8a618b8004cb497ee5e1 by Corentin Kerisit <corentin.kerisit@gmail.com>:

Use rules_java that uses same abseil-cpp version

in latest rules_java, abseil-cpp is bumped and local patches
do not apply anymore.

Merging this change closes #34320

PiperOrigin-RevId: 842743151
---
 third_party/xla/MODULE.bazel                          |  1 +
 third_party/xla/build_tools/ci/BUILD                  |  1 +
 third_party/xla/build_tools/pjrt_wheels/BUILD.bazel   |  2 ++
 .../gpus/crosstool/cc_toolchain_config.bzl.tpl        |  1 +
 .../gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl  |  1 +
 .../gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl   |  1 +
 .../xla/third_party/gpus/cuda/build_defs.bzl.tpl      |  8 +++++---
 .../xla/third_party/gpus/rocm/build_defs.bzl.tpl      |  4 +++-
 .../xla/third_party/gpus/sycl/build_defs.bzl.tpl      |  4 +++-
 third_party/xla/third_party/implib_so/BUILD.bazel     |  2 ++
 third_party/xla/third_party/llvm_openmp/BUILD.bazel   |  1 +
 third_party/xla/third_party/nccl/build_defs.bzl.tpl   |  7 ++++---
 third_party/xla/third_party/nvtx/BUILD.bazel          |  2 ++
 .../xla/third_party/tensorrt/plugin/BUILD.bazel       |  1 +
 third_party/xla/xla/service/gpu/build_defs.bzl        |  3 ++-
 third_party/xla/xla/tsl/mkl/BUILD.bazel               |  1 +
 .../xla/xla/tsl/platform/default/build_config.bzl     |  3 ++-
 third_party/xla/xla/tsl/platform/default/rules_cc.bzl | 11 +++++------
 18 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel
index b51181b412ef6c..0aa63a6b4b78f5 100644
--- a/third_party/xla/MODULE.bazel
+++ b/third_party/xla/MODULE.bazel
@@ -22,6 +22,7 @@ bazel_dep(name = "pybind11_bazel", version = "2.13.6")
 bazel_dep(name = "pybind11_protobuf", version = "0.0.0-20250210-f02a2b7")
 bazel_dep(name = "re2", version = "2024-07-02.bcr.1", repo_name = "com_googlesource_code_re2")
 bazel_dep(name = "rules_cc", version = "0.2.0")
+bazel_dep(name = "rules_java", version = "8.16.1")
 bazel_dep(name = "rules_license", version = "1.0.0")
 bazel_dep(name = "rules_python", version = "1.6.0")
 bazel_dep(name = "rules_shell", version = "0.6.1")
diff --git a/third_party/xla/build_tools/ci/BUILD b/third_party/xla/build_tools/ci/BUILD
index 3d37ca202dd82b..79a571ba22ace1 100644
--- a/third_party/xla/build_tools/ci/BUILD
+++ b/third_party/xla/build_tools/ci/BUILD
@@ -14,6 +14,7 @@
 # ============================================================================
 
 load("@bazel_skylib//rules:diff_test.bzl", "diff_test")
+load("@rules_shell//shell:sh_binary.bzl", "sh_binary")
 load("//xla:pytype.bzl", "pytype_strict_binary")
 
 package(
diff --git a/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel b/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel
index 5848bb71b1c20c..b9adc77a1690e8 100644
--- a/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel
+++ b/third_party/xla/build_tools/pjrt_wheels/BUILD.bazel
@@ -1,6 +1,8 @@
 load("@cuda_cudart//:version.bzl", cuda_major_version = "VERSION")
 load("@nightly_timestamp//:timestamp.bzl", "XLA_NIGHTLY_TIMESTAMP")
 load("@rc_number//:rc_number.bzl", "XLA_RC_NUMBER")
+load("@rules_cc//cc:cc_binary.bzl", "cc_binary")
+load("@rules_cc//cc:cc_test.bzl", "cc_test")
 load("@rules_python//python:packaging.bzl", "py_wheel")
 
 # This ensures we can only build plugins for selected CUDA versions.
diff --git a/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl
index ffa305c772e881..e9da7383842473 100644
--- a/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl
+++ b/third_party/xla/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl
@@ -16,6 +16,7 @@ load(
     "with_feature_set",
 )
 load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
+load("@rules_cc//cc/toolchains:cc_toolchain_config_info.bzl", "CcToolchainConfigInfo")
 
 def all_assembly_actions():
     return [
diff --git a/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl b/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl
index e5a942b66c17fc..a97202d8e9fb61 100644
--- a/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl
+++ b/third_party/xla/third_party/gpus/crosstool/hipcc_cc_toolchain_config.bzl.tpl
@@ -11,6 +11,7 @@ load(
     "with_feature_set",
 )
 load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
+load("@rules_cc//cc/toolchains:cc_toolchain_config_info.bzl", "CcToolchainConfigInfo")
 
 all_compile_actions = [
     ACTION_NAMES.c_compile,
diff --git a/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl b/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl
index 5d0295a6ee448b..e754300e3dbc9d 100644
--- a/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl
+++ b/third_party/xla/third_party/gpus/crosstool/sycl_cc_toolchain_config.bzl.tpl
@@ -16,6 +16,7 @@ load(
     "with_feature_set",
 )
 load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
+load("@rules_cc//cc/toolchains:cc_toolchain_config_info.bzl", "CcToolchainConfigInfo")
 
 def all_assembly_actions():
     return [
diff --git a/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl
index 40ca4a62607cda..3ee6d2d348b2fc 100644
--- a/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl
+++ b/third_party/xla/third_party/gpus/cuda/build_defs.bzl.tpl
@@ -1,3 +1,5 @@
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
 # Macros for building CUDA code.
 def if_cuda(if_true, if_false = []):
     """Shorthand for select()'ing on whether we're building with CUDA.
@@ -137,7 +139,7 @@ def cuda_header_library(
     target without virtual includes. This works around the fact that bazel can't
     mix 'includes' and 'include_prefix' in the same target."""
 
-    native.cc_library(
+    cc_library(
         name = name + "_virtual",
         hdrs = hdrs,
         include_prefix = include_prefix,
@@ -146,7 +148,7 @@ def cuda_header_library(
         visibility = ["//visibility:private"],
     )
 
-    native.cc_library(
+    cc_library(
         name = name,
         textual_hdrs = hdrs,
         deps = deps + [":%s_virtual" % name],
@@ -160,7 +162,7 @@ def cuda_library(copts = [], tags = [], deps = [], **kwargs):
     # "use of the "register" storage class specifier is not allowed" error.
     # This can and should be removed once we migrate on glibc-2.27 or newer.
     local_defines = kwargs.pop("local_defines", []) + ["register="]
-    native.cc_library(
+    cc_library(
         copts = cuda_default_copts() + copts,
         tags = tags + [
             "gpu",
diff --git a/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl b/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl
index a690f767d8dbd5..d04a045907f274 100644
--- a/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl
+++ b/third_party/xla/third_party/gpus/rocm/build_defs.bzl.tpl
@@ -1,3 +1,5 @@
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
 # Macros for building ROCm code.
 def if_rocm(if_true, if_false = []):
     """Shorthand for select()'ing on whether we're building with ROCm.
@@ -80,7 +82,7 @@ def rocm_library(copts = [], deps = [], **kwargs):
     """Wrapper over cc_library which adds default ROCm options."""
     if "@local_config_rocm//rocm:rocm_headers" not in deps:
       deps.append("@local_config_rocm//rocm:rocm_headers")
-    native.cc_library(copts = rocm_default_copts() + copts, deps = deps, **kwargs)
+    cc_library(copts = rocm_default_copts() + copts, deps = deps, **kwargs)
 
 def get_rbe_amdgpu_pool(is_single_gpu = False):
     return "%{single_gpu_rbe_pool}" if is_single_gpu else "%{multi_gpu_rbe_pool}"
diff --git a/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl b/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl
index 8b4324dcc8c9da..debfd5d27639f7 100644
--- a/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl
+++ b/third_party/xla/third_party/gpus/sycl/build_defs.bzl.tpl
@@ -1,3 +1,5 @@
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
 # Macros for building SYCL code.
 def if_sycl(if_true, if_false = []):
     """Shorthand for select()'ing on whether we're building with SYCL.
@@ -40,7 +42,7 @@ def if_sycl_build_is_configured(x, y):
 
 def sycl_library(copts = [], linkopts = [], tags = [], deps = [], **kwargs):
     """Wrapper over cc_library which adds default SYCL options."""
-    native.cc_library(copts = sycl_default_copts() + copts,
+    cc_library(copts = sycl_default_copts() + copts,
                       linkopts = sycl_default_linkopts() + linkopts,
                       tags = tags + ["gpu"],
                       deps = deps + if_sycl_is_configured([
diff --git a/third_party/xla/third_party/implib_so/BUILD.bazel b/third_party/xla/third_party/implib_so/BUILD.bazel
index ca6976cd8d3425..1cb7282ea89d71 100644
--- a/third_party/xla/third_party/implib_so/BUILD.bazel
+++ b/third_party/xla/third_party/implib_so/BUILD.bazel
@@ -1,3 +1,5 @@
+load("@rules_python//python:defs.bzl", "py_binary")
+
 package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # MIT
diff --git a/third_party/xla/third_party/llvm_openmp/BUILD.bazel b/third_party/xla/third_party/llvm_openmp/BUILD.bazel
index fbde2733a2a302..15f0218bf2f6a2 100644
--- a/third_party/xla/third_party/llvm_openmp/BUILD.bazel
+++ b/third_party/xla/third_party/llvm_openmp/BUILD.bazel
@@ -17,6 +17,7 @@ load(
     "if_macos",
     "if_windows",
 )
+load("@rules_python//python:defs.bzl", "py_binary")
 
 package(
     default_visibility = [
diff --git a/third_party/xla/third_party/nccl/build_defs.bzl.tpl b/third_party/xla/third_party/nccl/build_defs.bzl.tpl
index ac7f3bc92cff33..ad447657d907a2 100644
--- a/third_party/xla/third_party/nccl/build_defs.bzl.tpl
+++ b/third_party/xla/third_party/nccl/build_defs.bzl.tpl
@@ -2,6 +2,7 @@
 
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_gpu_architectures")
 load("@bazel_tools//tools/cpp:toolchain_utils.bzl", "find_cpp_toolchain")
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
 
 # CUDA toolkit version as tuple (e.g. '(11, 1)').
 _cuda_version = %{cuda_version}
@@ -311,7 +312,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
 
     # Compile host and device code into library.
     lib = name + "_lib"
-    native.cc_library(
+    cc_library(
         name = lib,
         hdrs = hdrs,
         copts = _rdc_copts() + copts,
@@ -336,7 +337,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
 
     # Compile the source file into a library.
     dlink = name + "_dlink"
-    native.cc_library(
+    cc_library(
         name = dlink,
         srcs = [dlink_cc],
         textual_hdrs = [dlink_hdrs],
@@ -371,7 +372,7 @@ def cuda_rdc_library(name, hdrs = None, copts = None, linkstatic = True, **kwarg
     )
 
     # Create cc target from archive.
-    native.cc_library(
+    cc_library(
         name = name,
         srcs = [merged],
         hdrs = hdrs,
diff --git a/third_party/xla/third_party/nvtx/BUILD.bazel b/third_party/xla/third_party/nvtx/BUILD.bazel
index af6de99cb8fcf7..a8e181e57b1932 100644
--- a/third_party/xla/third_party/nvtx/BUILD.bazel
+++ b/third_party/xla/third_party/nvtx/BUILD.bazel
@@ -1,5 +1,7 @@
 # NVIDIA NVTX 3
 
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
 licenses(["notice"])
 
 exports_files(["LICENSE.txt"])
diff --git a/third_party/xla/third_party/tensorrt/plugin/BUILD.bazel b/third_party/xla/third_party/tensorrt/plugin/BUILD.bazel
index 56e26d779de155..45a8d0a78aa78c 100644
--- a/third_party/xla/third_party/tensorrt/plugin/BUILD.bazel
+++ b/third_party/xla/third_party/tensorrt/plugin/BUILD.bazel
@@ -2,6 +2,7 @@
 # This package contains build targets for select TensorRT plugins included in the
 # TensorRT open source repository.
 load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "cuda_library")
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
 
 exports_files(["LICENSE"])
 
diff --git a/third_party/xla/xla/service/gpu/build_defs.bzl b/third_party/xla/xla/service/gpu/build_defs.bzl
index 849b8d21dc94aa..4b8fb550b7de97 100644
--- a/third_party/xla/xla/service/gpu/build_defs.bzl
+++ b/third_party/xla/xla/service/gpu/build_defs.bzl
@@ -2,6 +2,7 @@
 """
 
 load("@bazel_skylib//lib:paths.bzl", "paths")
+load("@rules_shell//shell:sh_test.bzl", "sh_test")
 load("//xla/tests:build_defs.bzl", "prepare_gpu_backend_data")
 load("//xla/tsl:package_groups.bzl", "DEFAULT_LOAD_VISIBILITY")
 
@@ -147,7 +148,7 @@ def gen_gpu_hlo_compile_tests(
         ]
 
         for backend in backends:
-            native.sh_test(
+            sh_test(
                 name = "gpu_compile_%s_%s_hlo_test" % (filename, backend),
                 srcs = [name + "_gensh"],
                 args = [
diff --git a/third_party/xla/xla/tsl/mkl/BUILD.bazel b/third_party/xla/xla/tsl/mkl/BUILD.bazel
index fdb5bb30887803..36b349fc0afbb2 100644
--- a/third_party/xla/xla/tsl/mkl/BUILD.bazel
+++ b/third_party/xla/xla/tsl/mkl/BUILD.bazel
@@ -1,6 +1,7 @@
 load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
 load("@local_xla//xla/tsl:tsl.bzl", "clean_dep")
 load("@local_xla//xla/tsl/mkl:build_defs.bzl", "mkl_dep")
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
 
 licenses(["notice"])  # 3-Clause BSD
 
diff --git a/third_party/xla/xla/tsl/platform/default/build_config.bzl b/third_party/xla/xla/tsl/platform/default/build_config.bzl
index 14d8f1bba7a487..a9b14ee1123671 100644
--- a/third_party/xla/xla/tsl/platform/default/build_config.bzl
+++ b/third_party/xla/xla/tsl/platform/default/build_config.bzl
@@ -6,6 +6,7 @@
 load("@com_github_grpc_grpc//bazel:cc_grpc_library.bzl", "cc_grpc_library")
 load("@com_github_grpc_grpc//bazel:python_rules.bzl", "py_grpc_library")
 load("@com_google_protobuf//bazel:cc_proto_library.bzl", "cc_proto_library")
+load("@com_google_protobuf//bazel:proto_library.bzl", "proto_library")
 load("@com_google_protobuf//bazel:py_proto_library.bzl", "py_proto_library")
 load("@rules_cc//cc:cc_binary.bzl", "cc_binary")
 load("@rules_cc//cc:cc_test.bzl", _cc_test = "cc_test")
@@ -228,7 +229,7 @@ def tf_proto_library(
         name_sans_proto = name[:-6]
     else:
         name_sans_proto = name
-    native.proto_library(
+    proto_library(
         name = name,
         srcs = srcs,
         deps = deps + protodeps + [
diff --git a/third_party/xla/xla/tsl/platform/default/rules_cc.bzl b/third_party/xla/xla/tsl/platform/default/rules_cc.bzl
index 12c127bac63727..51dd87ab1c9920 100644
--- a/third_party/xla/xla/tsl/platform/default/rules_cc.bzl
+++ b/third_party/xla/xla/tsl/platform/default/rules_cc.bzl
@@ -3,7 +3,11 @@
 # This file is used in OSS only. It is not transformed by copybara. Therefore all paths in this
 # file are OSS paths.
 
+load("@rules_cc//cc:cc_binary.bzl", _cc_binary = "cc_binary")
+load("@rules_cc//cc:cc_import.bzl", _cc_import = "cc_import")
 load("@rules_cc//cc:cc_library.bzl", _cc_library = "cc_library")
+load("@rules_cc//cc:cc_shared_library.bzl", _cc_shared_library = "cc_shared_library")
+load("@rules_cc//cc:cc_test.bzl", _cc_test = "cc_test")
 
 # IMPORTANT: Do not remove this load statement. We rely on that //xla/tsl doesn't exist in g3
 # to prevent g3 .bzl files from loading this file.
@@ -11,11 +15,6 @@ load("//xla/tsl:package_groups.bzl", "DEFAULT_LOAD_VISIBILITY")
 
 visibility(DEFAULT_LOAD_VISIBILITY)
 
-_cc_binary = native.cc_binary
-_cc_import = native.cc_import
-_cc_shared_library = native.cc_shared_library
-_cc_test = native.cc_test
-
 cc_binary = _cc_binary
 cc_import = _cc_import
 cc_shared_library = _cc_shared_library
@@ -27,7 +26,7 @@ def cc_library(name, deps = None, **kwargs):
     Args:
       name: name of target.
       deps: deps with `xla/tsl:bazel_issue_21519` added.
-      **kwargs: passed to native.cc_library.
+      **kwargs: passed to cc_library.
     """
 
     if deps == None:

From 28cd67b54e279fa9f8c9352284c477b93a18531a Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Wed, 10 Dec 2025 08:54:05 -0800
Subject: [PATCH 130/753] [Autotuner] Add cublas support for scaled dot fusion.

PiperOrigin-RevId: 842746529
---
 .../xla/xla/backends/gpu/autotuner/BUILD      |  2 ++
 .../backends/gpu/autotuner/factory_cuda.cc    |  2 ++
 .../gpu/autotuner/fission_backend_test.cc     | 31 +++++++++++++++++++
 .../gpu/autotuning/gemm_fusion_autotuner.cc   |  1 +
 4 files changed, 36 insertions(+)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
index eb0a0060f8125e..5b4eea2a700a9a 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -612,6 +612,7 @@ cc_library(
         "//xla/service:compiler",
         "//xla/service/gpu/transforms:dot_algorithm_rewriter",
         "//xla/service/gpu/transforms:gemm_rewriter",
+        "//xla/service/gpu/transforms:scaled_dot_rewriter",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/cuda:cuda_platform_id",
@@ -764,6 +765,7 @@ xla_test(
         "//xla/service/gpu/transforms:custom_kernel_fusion_rewriter",
         "//xla/service/gpu/transforms:dot_algorithm_rewriter",
         "//xla/service/gpu/transforms:gemm_rewriter",
+        "//xla/service/gpu/transforms:scaled_dot_rewriter",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
index d9e9130f8f96e3..28b5786357d1a8 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/factory_cuda.cc
@@ -1,3 +1,4 @@
+#include "xla/service/gpu/transforms/scaled_dot_rewriter.h"
 /* Copyright 2025 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -46,6 +47,7 @@ using ::mlir::MLIRContext;
 std::unique_ptr<HloPassPipeline> GetCublasRewriterPipeline(
     const se::DeviceDescription& device_description) {
   auto pipeline = std::make_unique<HloPassPipeline>("cublas_rewriter_pipeline");
+  pipeline->AddPass(std::make_unique<ScaledDotRewriter>());
   pipeline->AddPass(std::make_unique<DotAlgorithmRewriter>());
   for (GemmRewriterOptions::DType dtype :
        {GemmRewriterOptions::DType::kFp8Only,
diff --git a/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc b/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc
index a9b0ecaaca050b..0400562457db38 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/fission_backend_test.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/service/gpu/transforms/custom_kernel_fusion_rewriter.h"
 #include "xla/service/gpu/transforms/dot_algorithm_rewriter.h"
 #include "xla/service/gpu/transforms/gemm_rewriter.h"
+#include "xla/service/gpu/transforms/scaled_dot_rewriter.h"
 #include "xla/service/platform_util.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/stream_executor.h"
@@ -90,6 +91,27 @@ ENTRY main {
   ROOT %dot.0 = f32[64,64]{1,0} fusion(p0, p1), kind=kCustom, calls=gemm_fusion, backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"fusion_backend_config":{"kind":"__triton_gemm"},"force_earliest_schedule":false}
 })";
 
+const char kScaledDotFusionHlo[] = R"(
+HloModule module
+
+fusion_computation {
+  p0 = f32[1024,1024] parameter(0)
+  p1 = f32[1024,1024] parameter(1)
+  p0_scale = f32[1024,8] parameter(2)
+  p1_scale = f32[8,1024] parameter(3)
+  ROOT r = f32[1024,1024] scaled-dot(p0, p1, p0_scale, p1_scale),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[1024,1024] parameter(0)
+  p1 = f32[1024,1024] parameter(1)
+  p0_scale = f32[1024,8] parameter(2)
+  p1_scale = f32[8,1024] parameter(3)
+  ROOT r = f32[1024,1024] fusion(p0, p1, p0_scale, p1_scale),
+    kind=kCustom, calls=fusion_computation
+})";
+
 const char kUnsupportedFusionHlo[] = R"(
   HloModule module
   computation {
@@ -131,6 +153,7 @@ class FissionTest : public HloHardwareIndependentTestBase,
   static std::unique_ptr<HloPassPipeline> GetCublasRewriterPipeline(
       const se::DeviceDescription& device_description) {
     auto pipeline = std::make_unique<HloPassPipeline>("fission_pipeline");
+    pipeline->AddPass(std::make_unique<ScaledDotRewriter>());
     pipeline->AddPass(std::make_unique<DotAlgorithmRewriter>());
     for (GemmRewriterOptions::DType dtype :
          {GemmRewriterOptions::DType::kFp8Only,
@@ -287,6 +310,14 @@ INSTANTIATE_TEST_SUITE_P(
              "\"kind\":\"__custom_fusion\"",
          },
          /*expected_backend_name=*/"CustomKernel_fission"},
+        {"ScaledDotFusion_Cublas",
+         kScaledDotFusionHlo,
+         &FissionTest::GetCublasRewriterPipeline,
+         &FissionTest::CreateCublasBackend,
+         /*expected_module_substrings=*/
+         {"custom_call_target=\"__cublas$gemm\"",
+          "\"selected_algorithm\":\"-1\""},
+         /*expected_backend_name=*/"Cublas_fission"},
     }),
     [](const ::testing::TestParamInfo<FissionTest::ParamType>& info) {
       return info.param.test_name;
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
index 17c836bf78839b..548c9281bda625 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
@@ -145,6 +145,7 @@ namespace {
 std::unique_ptr<HloPassPipeline> GetCublasRewriterPipeline(
     const se::DeviceDescription* device_description) {
   auto pipeline = std::make_unique<HloPassPipeline>("cublas_rewriter_pipeline");
+  pipeline->AddPass(std::make_unique<ScaledDotRewriter>());
   pipeline->AddPass(std::make_unique<DotAlgorithmRewriter>());
   for (GemmRewriterOptions::DType dtype :
        {GemmRewriterOptions::DType::kFp8Only,

From f76d332b684e2543598cf71066dea4a35a596072 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 09:08:54 -0800
Subject: [PATCH 131/753] Introduce a flag to allow h2h copies

This flag is to be used to xla_disable_automatic_host_compute_offload, so as to prevent any host compute that might cause numeric concerns.

PiperOrigin-RevId: 842752710
---
 third_party/xla/xla/debug_options_flags.cc    | 12 +++++
 .../xla/xla/hlo/transforms/host_offloader.cc  |  9 +++-
 .../xla/hlo/transforms/host_offloader_test.cc | 51 +++++++++++++++++++
 third_party/xla/xla/xla.proto                 |  6 ++-
 4 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index abfb81977ccdb0..2023cdbcba5ca1 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -457,6 +457,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_experimental_enable_heuristic_collective_combining(true);
   opts.set_xla_unsupported_crash_on_hlo_pass_silent_hlo_change(false);
   opts.set_xla_disable_automatic_host_compute_offload(false);
+  opts.set_xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled(
+      false);
   opts.set_xla_enable_scoped_logging_timers(true);
   opts.set_xla_unsupported_crash_on_hlo_pass_noop_change(false);
   opts.set_xla_gpu_experimental_enable_split_k_rewrite(false);
@@ -2602,6 +2604,16 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       debug_options->xla_disable_automatic_host_compute_offload(),
       "Return an error if HostOffloader would have automatically offloaded some"
       " compute to the host."));
+  flag_list->push_back(tsl::Flag(
+      "xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled",
+      bool_setter_for(
+          &DebugOptions::
+              set_xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled),  // NOLINT
+      debug_options
+          ->xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled(),
+      "Allow host-to-host copy when automatic host compute offload is "
+      "disabled, i.e. when xla_disable_automatic_host_compute_offload is "
+      "set."));
   flag_list->push_back(tsl::Flag(
       "xla_enable_scoped_logging_timers",
       bool_setter_for(&DebugOptions::set_xla_enable_scoped_logging_timers),
diff --git a/third_party/xla/xla/hlo/transforms/host_offloader.cc b/third_party/xla/xla/hlo/transforms/host_offloader.cc
index 1fdd3360d98b82..f72145b2e6392a 100644
--- a/third_party/xla/xla/hlo/transforms/host_offloader.cc
+++ b/third_party/xla/xla/hlo/transforms/host_offloader.cc
@@ -298,10 +298,17 @@ absl::StatusOr<bool> HostOffloader::WalkDownHostMemoryOffloadPaths(
           "to move the inputs to the device so that computation happens on the "
           "device.",
           instruction->name());
+      bool h2h_copy_allowed =
+          instruction->opcode() == HloOpcode::kCopy &&
+          instruction->GetModule()
+              ->config()
+              .debug_options()
+              .xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled();  // NOLINT
       if (instruction->GetModule()
               ->config()
               .debug_options()
-              .xla_disable_automatic_host_compute_offload()) {
+              .xla_disable_automatic_host_compute_offload() &&
+          !h2h_copy_allowed) {
         return absl::InvalidArgumentError(
             "Automatic host compute offloading is disabled.");
       }
diff --git a/third_party/xla/xla/hlo/transforms/host_offloader_test.cc b/third_party/xla/xla/hlo/transforms/host_offloader_test.cc
index c49891e9b03026..44f3f0fe4516a1 100644
--- a/third_party/xla/xla/hlo/transforms/host_offloader_test.cc
+++ b/third_party/xla/xla/hlo/transforms/host_offloader_test.cc
@@ -95,6 +95,14 @@ class HostOffloaderTest : public HloHardwareIndependentTestBase {
         .set_xla_disable_automatic_host_compute_offload(true);
   }
 
+  static void AllowH2hCopyWhenAutomaticHostComputeOffloadDisabled(
+      HloModule* module) {
+    module->mutable_config()
+        .mutable_debug_options()
+        .set_xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled(
+            true);
+  }
+
   AliasInfo alias_info_;
 };
 
@@ -4650,6 +4658,49 @@ TEST_F(HostOffloaderTest, AutomaticHostComputeOffloadDisabled) {
               absl_testing::StatusIs(absl::StatusCode::kInvalidArgument));
 }
 
+TEST_F(HostOffloaderTest,
+       H2hCopyDisallowedWhenAutomaticHostComputeOffloadDisabled) {
+  const absl::string_view hlo_string = R"(
+    HloModule module, entry_computation_layout={(f32[1024]{0:T(128)S(5)})->f32[1024]{0:T(128)S(5)}}
+
+    ENTRY main {
+      param = f32[1024]{0} parameter(0)
+      ROOT a_copy = f32[1024]{0} copy(param)
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  DisableAutomaticHostComputeOffload(module.get());
+  // A copy on host memory exists, but we have disabled automatic host compute
+  // offloading and we haven't allowed H2H copies, so we expect an error.
+  absl::StatusOr<bool> changed = RunHostOffloader(module.get());
+  EXPECT_THAT(changed,
+              absl_testing::StatusIs(absl::StatusCode::kInvalidArgument));
+}
+
+TEST_F(HostOffloaderTest,
+       H2hCopyAllowedWhenAutomaticHostComputeOffloadDisabled) {  // NOLINT
+  const absl::string_view hlo_string = R"(
+    HloModule module, entry_computation_layout={(f32[1024]{0:T(128)S(5)})->f32[1024]{0:T(128)S(5)}}
+
+    ENTRY main {
+      param = f32[1024]{0} parameter(0)
+      ROOT a_copy = f32[1024]{0} copy(param)
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  DisableAutomaticHostComputeOffload(module.get());
+  AllowH2hCopyWhenAutomaticHostComputeOffloadDisabled(module.get());
+  // A copy on host memory exists, and we have disabled automatic host compute
+  // offloading, but we have allowed H2H copies, so we expect success.
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHostOffloader(module.get()));
+  EXPECT_TRUE(changed);
+  VLOG(1) << module->ToString();
+  HloInstruction* a_copy = FindInstruction(module.get(), "a_copy");
+  EXPECT_TRUE(host_offload_utils::ComputeTypeIsHost(a_copy));
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index f3b61c30ae07a6..a6ae112f4ef0e7 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -145,6 +145,10 @@ message DebugOptions {
   //--------------------------------------------------------------------------//
   // go/keep-sorted start
 
+  // Allow host-to-host copy even when automatic host compute offload is
+  // disabled, i.e. when xla_disable_automatic_host_compute_offload is set.
+  optional bool
+      xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = 439;
   // Return an error if HostOffloader would have automatically offloaded some
   // compute to the host.
   optional bool xla_disable_automatic_host_compute_offload = 408;
@@ -1323,7 +1327,7 @@ message DebugOptions {
   // Note: when adding a new flag, please add it to one of the hardware-specific
   // or hardware-agnostic sections at the top of this proto message.
 
-  // Next id: 439
+  // Next id: 440
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From c4fe87f80f26de5b6c9b959229d1a0a686f8802b Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Wed, 10 Dec 2025 10:08:39 -0800
Subject: [PATCH 132/753] Plumb initial YNNPACK support of convolution
 operator.

This is currently limited to F32, VALID, no dilation convolutions.

PiperOrigin-RevId: 842774858
---
 .../backends/cpu/transforms/library_matcher.h |   1 +
 .../xla/backends/cpu/transforms/ynn_matcher.h |   6 +-
 .../xla/xla/backends/cpu/ynn_emitter.cc       | 121 ++++++++++++++++++
 .../xla/xla/backends/cpu/ynn_emitter.h        |   4 +
 .../xla/xla/backends/cpu/ynn_support.cc       |  73 +++++++++++
 .../xla/xla/backends/cpu/ynn_support.h        |   3 +
 .../xla/xla/service/cpu/thunk_emitter.cc      |  16 +++
 third_party/xla/xla/xla.proto                 |   1 +
 8 files changed, 224 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/cpu/transforms/library_matcher.h b/third_party/xla/xla/backends/cpu/transforms/library_matcher.h
index 23c5874c652fb0..381ea0c970b503 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_matcher.h
+++ b/third_party/xla/xla/backends/cpu/transforms/library_matcher.h
@@ -45,6 +45,7 @@ class LibraryMatcher {
           break;
         // Not intended to be used by LibraryMatcher.
         case DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_DOT:
+        case DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION:
           break;
         case DebugOptions::LIBRARY_FUSION_TYPE_REDUCE:
           fuse_reduce_ = true;
diff --git a/third_party/xla/xla/backends/cpu/transforms/ynn_matcher.h b/third_party/xla/xla/backends/cpu/transforms/ynn_matcher.h
index 38dc8f6f820cf7..c8d05b933591e3 100644
--- a/third_party/xla/xla/backends/cpu/transforms/ynn_matcher.h
+++ b/third_party/xla/xla/backends/cpu/transforms/ynn_matcher.h
@@ -43,7 +43,8 @@ class YnnMatcher : public LibraryMatcher {
     static const absl::NoDestructor<absl::flat_hash_set<HloOpcode>>
         kSupportedOps{[]() {
           absl::flat_hash_set<HloOpcode> supported_ops{
-              HloOpcode::kDot, HloOpcode::kReduce, HloOpcode::kConstant};
+              HloOpcode::kDot, HloOpcode::kReduce, HloOpcode::kConstant,
+              HloOpcode::kConvolution};
           for (const auto& [op, _] : GetYnnUnaryOpMap()) {
             supported_ops.insert(op);
           }
@@ -65,6 +66,9 @@ class YnnMatcher : public LibraryMatcher {
     if (instr->opcode() == HloOpcode::kReduce) {
       return IsReduceOpOffloadedToYnn(instr);
     }
+    if (instr->opcode() == HloOpcode::kConvolution) {
+      return IsConvolutionOpSupportedByYnn(instr);
+    }
     if (instr->IsConstant()) {
       return IsConstantSupportedByYnn(instr);
     }
diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.cc b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
index 145fd16e1ac1d8..790e9f62c610e5 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <memory>
+#include <numeric>
 #include <utility>
 #include <vector>
 
@@ -368,6 +369,30 @@ static ynn_status DefineBatchMatrixMultiply(ynn_subgraph_t subgraph,
                         YNN_INVALID_VALUE_ID, &output_id, /*flags=*/0);
 }
 
+static ynn_status DefineConvolution(
+    ynn_subgraph_t subgraph, uint32_t input1_id, uint32_t input2_id,
+    uint32_t output_id, const std::vector<int32_t>& stencil_axes,
+    const std::vector<int32_t> new_axes,
+    const std::vector<size_t>& stencil_dims,
+    const std::vector<size_t>& stencil_strides,
+    const std::vector<size_t>& stencil_dilations) {
+  uint32_t padding_id = YNN_INVALID_VALUE_ID;
+  uint32_t stencil_id = YNN_INVALID_VALUE_ID;
+
+  // Make a stenciled view of the input [n, h, w, ci] -> [n, h, w, kh, kw, ci].
+  ynn_status status = ynn_define_stencil_copy(
+      subgraph, /*num_stencils=*/stencil_dims.size(), stencil_axes.data(),
+      new_axes.data(), stencil_dims.data(), stencil_strides.data(),
+      stencil_dilations.data(), input1_id, padding_id, &stencil_id,
+      /*flags=*/0);
+  if (status != ynn_status_success) {
+    return status;
+  }
+  return ynn_define_dot(subgraph, /*num_k_dims=*/stencil_dims.size() + 1,
+                        stencil_id, input2_id, YNN_INVALID_VALUE_ID, &output_id,
+                        /*flags=*/0);
+}
+
 static absl::StatusOr<YnnSubgraph> EmitYnnDotSubgraph(
     const HloDotInstruction* dot,
     std::vector<std::unique_ptr<Literal>>& literals,
@@ -442,6 +467,92 @@ static absl::StatusOr<YnnSubgraph> EmitYnnDotSubgraph(
   return subgraph;
 }
 
+static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
+    const HloConvolutionInstruction* conv,
+    std::vector<std::unique_ptr<Literal>>& literals,
+    absl::Span<const se::DeviceAddressBase> arguments_buffers) {
+  TF_ASSIGN_OR_RETURN(
+      YnnSubgraph subgraph, CreateYnnSubgraph([&](ynn_subgraph_t* subgraph) {
+        return ynn_create_subgraph(
+            /*external_value_ids=*/3,
+            YnnFlags(conv->GetModule()->config().debug_options()), subgraph);
+      }));
+
+  uint32_t lhs_id = 0;
+  uint32_t rhs_id = 1;
+  uint32_t out_id = 2;
+
+  const HloInstruction* lhs = conv->operand(0);
+  const HloInstruction* rhs = conv->operand(1);
+
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+  const Shape& out_shape = conv->shape();
+
+  auto dims = [](absl::Span<const int64_t> dims) -> std::vector<size_t> {
+    return {dims.begin(), dims.end()};
+  };
+
+  std::vector<size_t> lhs_dims = dims(lhs_shape.dimensions());
+  std::vector<size_t> rhs_dims = dims(rhs_shape.dimensions());
+  std::vector<size_t> out_dims = dims(out_shape.dimensions());
+
+  TF_ASSIGN_OR_RETURN(ynn_type ynn_lhs_type, YnnType(lhs_shape.element_type()));
+  TF_ASSIGN_OR_RETURN(ynn_type ynn_rhs_type, YnnType(rhs_shape.element_type()));
+  TF_ASSIGN_OR_RETURN(ynn_type ynn_out_type, YnnType(out_shape.element_type()));
+
+  const uint32_t input_tensor_flags = YNN_VALUE_FLAG_EXTERNAL_INPUT;
+  YNN_RETURN_IF_ERROR(ynn_define_tensor_value(
+      subgraph.get(), ynn_lhs_type, lhs_dims.size(), lhs_dims.data(),
+      /*data=*/nullptr,
+      /*zero_point_id=*/YNN_INVALID_VALUE_ID,
+      /*scale_id=*/YNN_INVALID_VALUE_ID, input_tensor_flags, &lhs_id));
+
+  YNN_RETURN_IF_ERROR(ynn_define_tensor_value(
+      subgraph.get(), ynn_rhs_type, rhs_dims.size(), rhs_dims.data(),
+      /*data=*/nullptr,
+      /*zero_point_id=*/YNN_INVALID_VALUE_ID,
+      /*scale_id=*/YNN_INVALID_VALUE_ID, input_tensor_flags, &rhs_id));
+
+  const uint32_t output_tensor_flags = YNN_VALUE_FLAG_EXTERNAL_OUTPUT;
+  YNN_RETURN_IF_ERROR(ynn_define_tensor_value(
+      subgraph.get(), ynn_out_type, out_dims.size(), out_dims.data(),
+      /*data=*/nullptr,
+      /*zero_point_id=*/YNN_INVALID_VALUE_ID,
+      /*scale_id=*/YNN_INVALID_VALUE_ID, output_tensor_flags, &out_id));
+
+  Window conv_window = conv->window();
+  int conv_window_dims_size = conv_window.dimensions_size();
+
+  ConvolutionDimensionNumbers conv_dimensions =
+      conv->convolution_dimension_numbers();
+
+  std::vector<int32_t> stencil_axes(conv_window_dims_size);
+  std::vector<int32_t> new_axes(conv_window_dims_size);
+  std::vector<size_t> stencil_dims(conv_window_dims_size);
+  std::vector<size_t> stencil_strides(conv_window_dims_size);
+  std::vector<size_t> stencil_dilations(conv_window_dims_size);
+
+  for (size_t i = 0; i < conv_window.dimensions_size(); ++i) {
+    stencil_axes[i] = conv_dimensions.input_spatial_dimensions(i);
+    stencil_dims[i] = conv_window.dimensions(i).size();
+    stencil_strides[i] = conv_window.dimensions(i).stride();
+    stencil_dilations[i] = 1;
+  }
+
+  std::iota(new_axes.begin(), new_axes.end(), lhs_dims.size() - 1);
+
+  YNN_RETURN_IF_ERROR(DefineConvolution(subgraph.get(), lhs_id, rhs_id, out_id,
+                                        stencil_axes, new_axes, stencil_dims,
+                                        stencil_strides, stencil_dilations));
+
+  ynn_status status = ynn_optimize_subgraph(
+      subgraph.get(), /*threadpool=*/nullptr, /*flags=*/0);
+  TF_RETURN_IF_ERROR(YnnStatusToStatus(status));
+
+  return subgraph;
+}
+
 absl::StatusOr<absl::AnyInvocable<absl::StatusOr<YnnSubgraph>(
     absl::Span<const se::DeviceAddressBase> arguments_buffers)>>
 EmitYnnFusionBuilder(const HloComputation* computation) {
@@ -478,4 +589,14 @@ EmitYnnDotBuilder(const HloDotInstruction* dot, bool capture_rhs) {
       };
 }
 
+absl::StatusOr<absl::AnyInvocable<absl::StatusOr<YnnSubgraph>(
+    absl::Span<const se::DeviceAddressBase> arguments_buffers)>>
+EmitYnnConvolutionBuilder(const HloConvolutionInstruction* conv) {
+  return
+      [conv, literals = std::vector<std::unique_ptr<Literal>>()](
+          absl::Span<const se::DeviceAddressBase> arguments_buffers) mutable {
+        return EmitYnnConvolutionSubgraph(conv, literals, arguments_buffers);
+      };
+}
+
 }  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.h b/third_party/xla/xla/backends/cpu/ynn_emitter.h
index ff8a2949926979..b1b767e51a44e9 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.h
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.h
@@ -33,6 +33,10 @@ absl::StatusOr<absl::AnyInvocable<absl::StatusOr<YnnSubgraph>(
     absl::Span<const se::DeviceAddressBase> arguments_buffers)>>
 EmitYnnDotBuilder(const HloDotInstruction* dot, bool capture_rhs);
 
+absl::StatusOr<absl::AnyInvocable<absl::StatusOr<YnnSubgraph>(
+    absl::Span<const se::DeviceAddressBase> arguments_buffers)>>
+EmitYnnConvolutionBuilder(const HloConvolutionInstruction* conv);
+
 }  // namespace xla::cpu
 
 #endif  // XLA_BACKENDS_CPU_YNN_EMITTER_H_
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index 4eda97a255f752..43cbf1a4749c81 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -273,6 +273,79 @@ bool IsReduceOpOffloadedToYnn(const HloInstruction* hlo) {
   }
 }
 
+bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
+  CHECK_EQ(instr->opcode(), HloOpcode::kConvolution);
+  const HloConvolutionInstruction* conv =
+      Cast<HloConvolutionInstruction>(instr);
+  // Stores tuple of allowed (input, output) dtypes.
+  static const absl::NoDestructor<absl::flat_hash_set<
+      std::tuple<PrimitiveType, PrimitiveType, PrimitiveType>>>
+      kAllowedTypes({
+          {F32, F32, F32},
+      });
+
+  PrimitiveType lhs_dtype = conv->operand(0)->shape().element_type();
+  PrimitiveType rhs_dtype = conv->operand(1)->shape().element_type();
+  PrimitiveType out_dtype = conv->shape().element_type();
+  if (!kAllowedTypes->contains({lhs_dtype, rhs_dtype, out_dtype})) {
+    return false;
+  }
+
+  ConvolutionDimensionNumbers conv_dimensions =
+      conv->convolution_dimension_numbers();
+
+  // Make sure that this layout is supported.
+  if (conv_dimensions.input_feature_dimension() != 3 ||
+      conv_dimensions.output_feature_dimension() != 3) {
+    return false;
+  }
+
+  if (conv_dimensions.kernel_input_feature_dimension() != 2 ||
+      conv_dimensions.kernel_output_feature_dimension() != 3) {
+    return false;
+  }
+
+  if (conv_dimensions.input_spatial_dimensions_size() != 2 ||
+      conv_dimensions.kernel_spatial_dimensions_size() != 2 ||
+      conv_dimensions.output_spatial_dimensions_size() != 2) {
+    return false;
+  }
+
+  if (conv_dimensions.input_spatial_dimensions(0) != 1 ||
+      conv_dimensions.input_spatial_dimensions(1) != 2 ||
+      conv_dimensions.kernel_spatial_dimensions(0) != 0 ||
+      conv_dimensions.kernel_spatial_dimensions(1) != 1 ||
+      conv_dimensions.output_spatial_dimensions(0) != 1 ||
+      conv_dimensions.output_spatial_dimensions(1) != 2) {
+    return false;
+  }
+
+  Window window = conv->window();
+
+  // Only support 2D convolution.
+  if (window.dimensions_size() != 2) {
+    return false;
+  }
+
+  // Only VALID padding for now.
+  if ((window.dimensions(0).padding_low() != 0) ||
+      (window.dimensions(0).padding_high() != 0) ||
+      (window.dimensions(1).padding_low() != 0) ||
+      (window.dimensions(1).padding_high() != 0)) {
+    return false;
+  }
+
+  // No dilation for now.
+  if ((window.dimensions(0).window_dilation() != 1) ||
+      (window.dimensions(1).window_dilation() != 1) ||
+      (window.dimensions(0).base_dilation() != 1) ||
+      (window.dimensions(1).base_dilation() != 1)) {
+    return false;
+  }
+
+  return true;
+}
+
 uint32_t YnnFlags(const DebugOptions& debug_options) {
   uint32_t flags = 0;
   if (!debug_options.xla_cpu_enable_platform_dependent_math()) {
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.h b/third_party/xla/xla/backends/cpu/ynn_support.h
index 7025010715dad9..f7352adfe4164b 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.h
+++ b/third_party/xla/xla/backends/cpu/ynn_support.h
@@ -71,6 +71,9 @@ bool IsReduceOpSupportedByYnn(const HloInstruction* hlo);
 // Returns true if the reduce op will be offloaded to YNNPACK.
 bool IsReduceOpOffloadedToYnn(const HloInstruction* hlo);
 
+// Returns true if the convolution op is supported by YNNPACK.
+bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr);
+
 // Convert XLA options to YNNPACK flags.
 uint32_t YnnFlags(const DebugOptions& debug_options);
 
diff --git a/third_party/xla/xla/service/cpu/thunk_emitter.cc b/third_party/xla/xla/service/cpu/thunk_emitter.cc
index 0506620c3bb7a6..5b91a6c097c6d8 100644
--- a/third_party/xla/xla/service/cpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/cpu/thunk_emitter.cc
@@ -770,6 +770,17 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitConvolutionThunk(
       /*supported_types=*/
       {PRED, S8, U8, S16, U16, S32, U32, S64, U64, F16, F32, F64, C64, C128}));
 
+#ifdef XLA_YNNPACK
+  const bool use_ynn = absl::c_linear_search(
+      hlo_module_config_.debug_options().xla_cpu_experimental_ynn_fusion_type(),
+      DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION);
+  if (use_ynn) {
+    if (IsConvolutionOpSupportedByYnn(instruction)) {
+      return EmitYnnFusionThunk(instruction);
+    }
+  }
+#endif  // XLA_YNNPACK
+
   // TODO(tonywy): Add PotentiallyImplementedAsMKLConvolution to support
   // different data layouts.
   if (PotentiallyImplementedAsEigenConvolution(*instruction,
@@ -1564,6 +1575,11 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitYnnFusionThunk(
     if (capture_rhs) {
       captured_arguments_ids = kCapturedIds;
     }
+  } else if (instruction->opcode() == HloOpcode::kConvolution) {
+    const HloConvolutionInstruction* conv =
+        Cast<HloConvolutionInstruction>(instruction);
+    // Construct YNNPACK subgraph builder from the convolution instruction.
+    TF_ASSIGN_OR_RETURN(builder, EmitYnnConvolutionBuilder(conv));
   } else {
     auto* fusion = Cast<HloFusionInstruction>(instruction);
     const HloComputation* computation =
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index a6ae112f4ef0e7..bb3123dfef0137 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -180,6 +180,7 @@ message DebugOptions {
     LIBRARY_FUSION_TYPE_ELTWISE = 2;
     LIBRARY_FUSION_TYPE_REDUCE = 3;
     LIBRARY_FUSION_TYPE_INDIVIDUAL_DOT = 4;
+    LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION = 5;
   }
 
   enum XnnGraphFusionMode {

From 98a7e7342715ef7412fe4733ed9f034ecba62a05 Mon Sep 17 00:00:00 2001
From: Seher Ellis <sacer@google.com>
Date: Wed, 10 Dec 2025 10:40:42 -0800
Subject: [PATCH 133/753] Rollback of PR #26196 Not reviewed by the scheduling
 team. Sorry for the trouble.

Reverts 1144cc69a2b8caa8f2a5f99cd29cc22882fc73ff

PiperOrigin-RevId: 842788005
---
 .../xla/xla/service/gpu/gpu_hlo_schedule.cc   |  7 +-
 .../gpu/gpu_latency_hiding_scheduler_test.cc  | 86 -------------------
 .../xla/service/latency_hiding_scheduler.cc   | 33 -------
 .../xla/service/latency_hiding_scheduler.h    |  3 -
 4 files changed, 1 insertion(+), 128 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
index 304da6a2627c0f..2afc4787298d11 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
@@ -582,12 +582,7 @@ absl::Status RunLatencyHidingSchedulerPasses(
     pipeline.AddPass<PGLEAccuracyChecker>(
         dynamic_cast<ProfileGuidedLatencyEstimator&>(*estimator));
   }
-  // If overlap limit is set to be greater than 1 and the default t-short size
-  // estimator is used we will tell LHS to extend async-done intervals as much
-  // as possible to start collectives as early as possible.
-  if (config.parallel_collective_overlap_limit > 1) {
-    config.prioritize_compute_over_async_start = true;
-  }
+
   auto async_tracker = std::make_unique<GpuAsyncTracker>(config);
 
   std::shared_ptr<const SchedulingContext> scheduling_context =
diff --git a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
index ac1185896e2ead..28eabcb1cd7680 100644
--- a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
@@ -75,8 +75,6 @@ class GpuLatencyHidingSchedulerBaseTest
     DebugOptions& options = module->mutable_config().mutable_debug_options();
     options.set_xla_gpu_experimental_parallel_collective_overlap_limit(
         num_parallel_resources);
-    options.set_xla_gpu_enable_analytical_sol_latency_estimator(false);
-
     options.set_xla_gpu_pgle_accuracy_checker(strictness);
 
     TF_RETURN_IF_ERROR(ScheduleGpuModule(module, /*pointer_size=*/8,
@@ -1043,89 +1041,5 @@ TEST_F(GpuLatencyHidingSchedulerBaseTest, ParallelThreadsShouldBeScheduled) {
   TF_EXPECT_OK(ScheduleModule(module.get()));
 }
 
-TEST_F(GpuLatencyHidingSchedulerBaseTest,
-       MultipleParallelAsyncsExtendedOverAllComputes) {
-  absl::string_view kHloModule = R"(
-HloModule m
-reduce {
-x = f32[] parameter(0)
-y = f32[] parameter(1)
-ROOT _ = f32[] add(x, y)
-}
-ENTRY main {
-p0 = f32[] parameter(0)
-p1 = f32[2] parameter(1)
-p2 = f32[2] parameter(2)
-p3 = f32[2] parameter(3)
-p4 = f32[2] parameter(4)
-p5 = f32[2] parameter(5)
-p6 = f32[2] parameter(6)
-ar_0 = f32[] all-reduce-start(p0), to_apply=reduce
-ar_1 = f32[] all-reduce-done(ar_0)
-add_2 = f32[2] add(p1, p6)
-
-ar_2 = f32[2] all-reduce-start(add_2), to_apply=reduce
-ar_3 = f32[2] all-reduce-done(ar_2)
-add_3 = f32[2] add(p1, p3)
-
-rs_0 = ((f32[2]), f32[1]) reduce-scatter-start(add_3), to_apply=reduce,
-dimensions={0}
-rs_1 = f32[1] reduce-scatter-done(rs_0)
-add_0 = f32[2] add(p1, p2)
-div_0 = f32[2] divide(p3, p4)
-mul_0 = f32[2] multiply(p4, p5)
-ROOT _ = (f32[], f32[2], f32[1], f32[2], f32[2], f32[2]) tuple(ar_1, ar_3, rs_1, add_0, div_0, mul_0)
-}
-)";
-  absl::string_view kFdoProfile = "";
-
-  auto config = GetModuleConfig(kFdoProfile);
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(kHloModule, config));
-
-  TF_EXPECT_OK(ScheduleModule(module.get(), /*num_parallel_resources=*/16));
-  auto schedule = module->schedule();
-  std::vector<HloInstruction*> instruction_sequence =
-      schedule.sequence(module->entry_computation()).instructions();
-  // With a lot of parallel resources and default latency estimator,
-  // LHS will try to extend all asyncs as much as possible.
-  // We expect all computes to be wrapped within all async start-done
-  // intervals.
-  EXPECT_TRUE(GetIndexByName(instruction_sequence, "add_2") >
-                  GetIndexByName(instruction_sequence, "ar_0") &&
-              GetIndexByName(instruction_sequence, "add_3") >
-                  GetIndexByName(instruction_sequence, "ar_0") &&
-              GetIndexByName(instruction_sequence, "add_2") <
-                  GetIndexByName(instruction_sequence, "ar_1") &&
-              GetIndexByName(instruction_sequence, "add_3") <
-                  GetIndexByName(instruction_sequence, "ar_1"));
-
-  EXPECT_TRUE(GetIndexByName(instruction_sequence, "add_0") >
-                  GetIndexByName(instruction_sequence, "ar_0") &&
-              GetIndexByName(instruction_sequence, "add_0") >
-                  GetIndexByName(instruction_sequence, "rs_0") &&
-              GetIndexByName(instruction_sequence, "add_0") <
-                  GetIndexByName(instruction_sequence, "ar_1") &&
-              GetIndexByName(instruction_sequence, "add_0") <
-                  GetIndexByName(instruction_sequence, "rs_1"));
-
-  EXPECT_TRUE(GetIndexByName(instruction_sequence, "div_0") >
-                  GetIndexByName(instruction_sequence, "ar_0") &&
-              GetIndexByName(instruction_sequence, "div_0") >
-                  GetIndexByName(instruction_sequence, "rs_0") &&
-              GetIndexByName(instruction_sequence, "div_0") <
-                  GetIndexByName(instruction_sequence, "ar_1") &&
-              GetIndexByName(instruction_sequence, "div_0") <
-                  GetIndexByName(instruction_sequence, "rs_1"));
-  EXPECT_TRUE(GetIndexByName(instruction_sequence, "mul_0") >
-                  GetIndexByName(instruction_sequence, "ar_0") &&
-              GetIndexByName(instruction_sequence, "mul_0") >
-                  GetIndexByName(instruction_sequence, "rs_0") &&
-              GetIndexByName(instruction_sequence, "mul_0") <
-                  GetIndexByName(instruction_sequence, "ar_1") &&
-              GetIndexByName(instruction_sequence, "mul_0") <
-                  GetIndexByName(instruction_sequence, "rs_1"));
-}
-
 }  // namespace
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.cc b/third_party/xla/xla/service/latency_hiding_scheduler.cc
index f7500d9a46267e..62995d048976f9 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.cc
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.cc
@@ -1262,29 +1262,6 @@ class ReadySetLt {
     return std::nullopt;
   }
 
-  inline std::optional<bool> DelayAsyncStartCandidateCondition(
-      DefaultSchedulerCore::ScheduleCandidate& a,
-      DefaultSchedulerCore::ScheduleCandidate& b, const HloGraphNode* a_node,
-      const HloGraphNode* b_node, const char** reason) const {
-    bool a_has_async_resource =
-        a_node->DoesReleaseAnyResource() && !IsResourceConstrained(a, a_node);
-    bool b_has_async_resource =
-        b_node->DoesReleaseAnyResource() && !IsResourceConstrained(b, b_node);
-
-    CMP_EXPLICIT(!a_has_async_resource, !b_has_async_resource,
-                 "kDelayAsyncStartForCompute");
-    if (a_has_async_resource && b_has_async_resource) {
-      // If 2 nodes are both async nodes, we prioritize the one
-      // with more depth to free up more computes to overlap
-      // with the one with less depth which can be launched
-      // early
-      CMP_EXPLICIT(a_node->GetDepth() > b_node->GetDepth(),
-                   b_node->GetDepth() > a_node->GetDepth(),
-                   "kDelayAsyncStartForDepth");
-    }
-    return std::nullopt;
-  }
-
   // The comparison here implements the priority for the nodes in the ready
   // set. The function compares a and b in a series of prioritized
   // comparisons. As soon as it finds one that is not equal, it stops.  If
@@ -1394,16 +1371,6 @@ class ReadySetLt {
                    AsyncDepth0CandidateCondition(b, bn), "kStartAtZeroDepth");
     }
 
-    if (sched_state_.config.aggressive_scheduling_policies &&
-        sched_state_.config.prioritize_compute_over_async_start) {
-      // If an instruction releasing a resource is not resource constrained,
-      // delay it as much as possible.
-      if (auto value =
-              DelayAsyncStartCandidateCondition(a, b, an, bn, reason)) {
-        return *value;
-      }
-    }
-
     auto a_readytime = an->GetReadyTime();
     auto b_readytime = bn->GetReadyTime();
     if (a_readytime != b_readytime) {  // Quick test to avoid lots of work
diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.h b/third_party/xla/xla/service/latency_hiding_scheduler.h
index 2930dfd2277e3d..01630bbaa5bf5e 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.h
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.h
@@ -149,9 +149,6 @@ struct SchedulerConfig {
   bool use_real_cost_model = false;
   bool aggressive_scheduling_policies = false;
   bool prioritize_async_depth_over_stall = false;
-
-  bool prioritize_compute_over_async_start = false;
-
   bool enable_release_start_policy = false;
   bool resource_sharing = false;
   bool resource_serializing = false;

From 6c2d0e79349528e9f51a1cdb1e5f169e9969f2f0 Mon Sep 17 00:00:00 2001
From: Krishna Haridasan <krishnahari@google.com>
Date: Wed, 10 Dec 2025 10:52:38 -0800
Subject: [PATCH 134/753] Make xla::ifrt::AttributeMap thread-safe

PiperOrigin-RevId: 842793185
---
 third_party/xla/xla/python/ifrt/BUILD         |  2 +
 .../xla/xla/python/ifrt/attribute_map.cc      |  3 +
 .../xla/xla/python/ifrt/attribute_map.h       | 80 +++++++++++++++++--
 .../xla/xla/python/ifrt/attribute_map_test.cc | 19 +++++
 4 files changed, 96 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt/BUILD b/third_party/xla/xla/python/ifrt/BUILD
index c411c99ab65d9c..59ee58576058f8 100644
--- a/third_party/xla/xla/python/ifrt/BUILD
+++ b/third_party/xla/xla/python/ifrt/BUILD
@@ -165,6 +165,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
     ],
 )
 
@@ -176,6 +177,7 @@ xla_cc_test(
         ":attribute_map",
         ":serdes_test_util",
         ":serdes_version",
+        "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_matchers",
diff --git a/third_party/xla/xla/python/ifrt/attribute_map.cc b/third_party/xla/xla/python/ifrt/attribute_map.cc
index 18e827c78194e3..dd727de8b9eab5 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map.cc
+++ b/third_party/xla/xla/python/ifrt/attribute_map.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
+#include "absl/synchronization/mutex.h"
 #include "xla/python/ifrt/attribute_map.pb.h"
 #include "xla/python/ifrt/serdes_version.h"
 
@@ -82,6 +83,7 @@ void AttributeMap::ToProto(AttributeMapProto& proto,
   proto.Clear();
   proto.set_version_number(SerDesVersionNumber(0).value());
 
+  absl::ReaderMutexLock lock(mu_);
   for (const auto& [key, value] : map_) {
     AttributeMapProto::Value value_proto;
     std::visit(
@@ -110,6 +112,7 @@ void AttributeMap::ToProto(AttributeMapProto& proto,
 
 std::string AttributeMap::DebugString(size_t max_string_length,
                                       size_t max_int64_list_size) const {
+  absl::ReaderMutexLock lock(mu_);
   auto formatter = [=](std::string* out,
                        const AttributeMap::Map::value_type& key_value) {
     absl::StrAppend(out, key_value.first, "=");
diff --git a/third_party/xla/xla/python/ifrt/attribute_map.h b/third_party/xla/xla/python/ifrt/attribute_map.h
index e53be413eb1c4a..964969b718bf1b 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map.h
+++ b/third_party/xla/xla/python/ifrt/attribute_map.h
@@ -24,12 +24,13 @@ limitations under the License.
 #include <variant>
 #include <vector>
 
-#include "absl/base/attributes.h"
+#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/functional/function_ref.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
+#include "absl/synchronization/mutex.h"
 #include "xla/python/ifrt/attribute_map.pb.h"
 #include "xla/python/ifrt/serdes_default_version_accessor.h"
 #include "xla/python/ifrt/serdes_version.h"
@@ -38,6 +39,8 @@ namespace xla {
 namespace ifrt {
 
 // Attribute map that contains UTF-8 keys and variant values.
+//
+// This class is thread-safe.
 class AttributeMap {
  public:
   // Supported value types for `AttributeMap`. Modeled after
@@ -89,11 +92,9 @@ class AttributeMap {
 
   explicit AttributeMap(Map map) : map_(std::move(map)) {}
 
-  ABSL_DEPRECATED("map() is not thread-safe. Use Get() function instead.")
-  const Map& map() const { return map_; }
-
   template <typename T>
   absl::StatusOr<T> Get(const std::string& key) const {
+    absl::ReaderMutexLock lock(mu_);
     if constexpr (std::is_same_v<T, Value>) {
       auto it = map_.find(key);
       if (it == map_.end()) {
@@ -115,6 +116,28 @@ class AttributeMap {
     }
   }
 
+  template <typename T>
+  absl::Status Set(const std::string& key, T value) {
+    absl::MutexLock lock(mu_);
+    using ValueType = std::decay_t<T>;
+    if constexpr (std::is_same_v<ValueType, std::string> ||
+                  std::is_same_v<ValueType, const char*> ||
+                  std::is_convertible_v<ValueType, std::string>) {
+      map_.insert_or_assign(key, StringValue(std::move(value)));
+    } else if constexpr (std::is_same_v<ValueType, bool>) {
+      map_.insert_or_assign(key, BoolValue(std::move(value)));
+    } else if constexpr (std::is_same_v<ValueType, int64_t>) {
+      map_.insert_or_assign(key, Int64Value(std::move(value)));
+    } else if constexpr (std::is_same_v<ValueType, std::vector<int64_t>>) {
+      map_.insert_or_assign(key, Int64ListValue(std::move(value)));
+    } else if constexpr (std::is_same_v<ValueType, float>) {
+      map_.insert_or_assign(key, FloatValue(std::move(value)));
+    } else {
+      static_assert(false, "Unsupported type for AttributeMap::Set");
+    }
+    return absl::OkStatus();
+  }
+
   // Deserializes `AttributeMapProto` into `AttributeMap`.
   static absl::StatusOr<AttributeMap> FromProto(const AttributeMapProto& proto);
 
@@ -138,25 +161,65 @@ class AttributeMap {
     sink.Append(attribute_map.DebugString());
   }
 
-  bool IsEmpty() const { return map_.empty(); }
+  bool IsEmpty() const {
+    absl::ReaderMutexLock lock(mu_);
+    return map_.empty();
+  }
 
   // Invokes `f` for each key-value pair in the attribute map.
   void ForEach(
       absl::FunctionRef<void(const std::string&, const Value&)> f) const {
+    absl::ReaderMutexLock lock(mu_);
     for (const auto& [key, value] : map_) {
       f(key, value);
     }
   }
 
   bool operator==(const AttributeMap& other) const {
+    absl::ReaderMutexLock lock1(mu_);
+    absl::ReaderMutexLock lock2(other.mu_);
     return map_ == other.map_;
   }
 
-  size_t size() const { return map_.size(); }
+  size_t size() const {
+    absl::ReaderMutexLock lock(mu_);
+    return map_.size();
+  }
+
+  // Copyable and movable.
+  AttributeMap(const AttributeMap& other) {
+    absl::ReaderMutexLock lock(other.mu_);
+    map_ = other.map_;
+  }
+  AttributeMap& operator=(const AttributeMap& other) {
+    Map map;
+    {
+      absl::ReaderMutexLock lock(other.mu_);
+      map = other.map_;
+    }
+    absl::MutexLock lock(mu_);
+    map_ = std::move(map);
+    return *this;
+  }
+  AttributeMap(AttributeMap&& other) {
+    absl::MutexLock lock(other.mu_);
+    map_ = std::move(other.map_);
+  }
+  AttributeMap& operator=(AttributeMap&& other) {
+    Map map;
+    {
+      absl::MutexLock lock(other.mu_);
+      map = std::move(other.map_);
+    }
+    absl::MutexLock lock(mu_);
+    map_ = std::move(map);
+    return *this;
+  }
 
  private:
   template <typename T, typename V>
-  absl::StatusOr<T> Get(const std::string& key) const {
+  absl::StatusOr<T> Get(const std::string& key) const
+      ABSL_SHARED_LOCKS_REQUIRED(mu_) {
     auto it = map_.find(key);
     if (it == map_.end()) {
       return absl::NotFoundError(absl::StrCat("Key not found: ", key));
@@ -169,7 +232,8 @@ class AttributeMap {
     return value->value;
   }
 
-  Map map_;
+  mutable absl::Mutex mu_;
+  Map map_ ABSL_GUARDED_BY(mu_);
 };
 
 }  // namespace ifrt
diff --git a/third_party/xla/xla/python/ifrt/attribute_map_test.cc b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
index c658838e2af1c3..e81a3b8448a735 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map_test.cc
+++ b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/python/ifrt/serdes_test_util.h"
 #include "xla/python/ifrt/serdes_version.h"
+#include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
 
 namespace xla {
@@ -80,6 +81,24 @@ TEST(AttributeMapTest, Get) {
                        HasSubstr("Value type mismatch for key: string")));
 }
 
+TEST(AttributeMapTest, Set) {
+  AttributeMap map({});
+  TF_ASSERT_OK(map.Set("string", "value"));
+  TF_ASSERT_OK(map.Set("bool", true));
+  TF_ASSERT_OK(map.Set("int64", int64_t{123}));
+  TF_ASSERT_OK(map.Set("int64_list", std::vector<int64_t>{1, 2}));
+  TF_ASSERT_OK(map.Set("float", 1.23f));
+  EXPECT_EQ(map, AttributeMap({
+                     {"string", AttributeMap::StringValue("value")},
+                     {"bool", AttributeMap::BoolValue(true)},
+                     {"int64", AttributeMap::Int64Value(123)},
+                     {"int64_list",
+                      AttributeMap::Int64ListValue({int64_t{1}, int64_t{2}})},
+                     {"float", AttributeMap::FloatValue(1.23f)},
+                 }))
+      << map.DebugString();
+}
+
 class AttributeMapSerDesTest : public testing::TestWithParam<SerDesVersion> {
  public:
   AttributeMapSerDesTest() : version_(GetParam()) {}

From d1f10818d7497bb615fd7b1b3e5cc96ff47ee827 Mon Sep 17 00:00:00 2001
From: Hyeontaek Lim <hyeontaek@google.com>
Date: Wed, 10 Dec 2025 11:26:40 -0800
Subject: [PATCH 135/753] [PjRt-IFRT] Rewrite `PjRtLoadedExecutable`
 construction

This change rewrites the construction of `PjRtLoadedExecutable`. The main goal is to prepare for IFRT layout changes and to remove legacy boilerplate code.

It cleans up the output spec (dtype, shape, shading, layout) calculation logic in `xla::ifrt::PjRtLoadedExecutable` that used to be done in two stages (`::Create()` and `::CreateInternal()`) into one stage instead, where `PjRtLoadedExecutable` built from compilation and `PjRtLoadedExecutable` built from deserialization share most of the logic.

Several utility functions are changed to take a return value from `xla::PjRtLoadedExecuable` methods instead of requiring `xla::PjRtLoadedExecuable*` directly. This makes these utility functions reusable for `xla::PjRtExecuable*` in `xla::ifrt::PjRtExecutable` (not part of this change; soon to follow)

The output shape calculation logic using per-shard shapes embedded in HLO has been removed. All output shapes are computed by applying output shardings on full output shapes.

All outputs from a compiled executable use `xla::ifrt::HloSharding` (including for token types). `xla::ifrt::ConcreteEvenSharding` are still used for an `xla::ifrt::PjRtLoadedExecutable` created from `xla::PjRtLoadedExecutable` (for deserialization) because there is no correct sharding information available. The use of `xla::ifrt::ConcreteEvenSharding` will also be removed once serialization & deserialization roundtrip preserves IFRT-level metadata. Following an existing convention, non-shardable types like tokens are still considered fully replicated.

PiperOrigin-RevId: 842807805
---
 .../xla/python/pjrt_ifrt/pjrt_executable.cc   | 817 ++++++++----------
 .../xla/python/pjrt_ifrt/pjrt_executable.h    |  29 +-
 2 files changed, 381 insertions(+), 465 deletions(-)

diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
index 772bafc939652b..2ea62a8d42fda4 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
@@ -47,7 +46,6 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
 #include "xla/pjrt/utils.h"
-#include "xla/primitive_util.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/attribute_map.h"
 #include "xla/python/ifrt/basic_device_list.h"
@@ -66,12 +64,10 @@ limitations under the License.
 #include "xla/python/pjrt_ifrt/pjrt_dtype.h"
 #include "xla/python/pjrt_ifrt/pjrt_host_callback.h"
 #include "xla/python/pjrt_ifrt/pjrt_memory.h"
-#include "xla/python/pjrt_ifrt/xla_compiler.h"
 #include "xla/python/pjrt_ifrt/xla_sharding.h"
 #include "xla/runtime/device_id.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/shape.h"
-#include "xla/shape_util.h"
 #include "xla/status_macros.h"
 #include "xla/tsl/concurrency/future.h"
 #include "xla/tsl/concurrency/ref_count.h"
@@ -85,145 +81,220 @@ namespace ifrt {
 
 namespace {
 
-// Returns the op sharding of the root instruction in the entry computation.
-absl::StatusOr<const xla::HloInstructionProto*> FindRootInstruction(
-    const HloModuleProto& proto) {
-  for (const auto& computation : proto.computations()) {
-    if (computation.id() == proto.entry_computation_id()) {
-      for (const auto& instruction : computation.instructions()) {
-        if (instruction.id() == computation.root_id()) {
-          return &instruction;
-        }
-      }
-    }
-  }
-  return InvalidArgument("Entry computation not found");
+constexpr absl::string_view kDefaultMemoryKind = "device";
+
+// Returns a flat list of IFRT dtypes from element type information that a PjRt
+// executable returns (per-module lists of primitive of element types).
+// PjRt-IFRT always uses the first module's information.
+absl::StatusOr<std::vector<DType>> GetDTypes(
+    const absl::StatusOr<std::vector<std::vector<xla::PrimitiveType>>>&
+        pjrt_executable_element_types) {
+  TF_RETURN_IF_ERROR(pjrt_executable_element_types.status());
+  if (pjrt_executable_element_types->empty()) {
+    return FailedPrecondition("No module found");
+  }
+  std::vector<DType> dtypes;
+  dtypes.reserve(pjrt_executable_element_types->front().size());
+  for (xla::PrimitiveType element_type :
+       pjrt_executable_element_types->front()) {
+    TF_ASSIGN_OR_RETURN(DType dtype, ToDType(element_type));
+    dtypes.push_back(dtype);
+  }
+  return dtypes;
 }
 
-// Returns the output element types of the first module in a
-// `PjRtLoadedExecutable`.
-absl::StatusOr<std::vector<xla::PrimitiveType>>
-GetFirstModuleOutputElementTypes(
-    xla::PjRtLoadedExecutable* pjrt_loaded_executable) {
-  auto element_types = pjrt_loaded_executable->GetOutputElementTypes();
-  TF_RETURN_IF_ERROR(element_types.status());
-  if (element_types->empty()) {
-    return FailedPrecondition("No output element types found");
-  }
-  return element_types->front();
-}
-
-// Returns the output dimensions of the first module in a
-// `PjRtLoadedExecutable`.
-absl::StatusOr<std::vector<xla::DimensionVector>>
-GetFirstModuleOutputDimensions(
-    xla::PjRtLoadedExecutable* pjrt_loaded_executable) {
-  auto dimensions = pjrt_loaded_executable->GetOutputDimensions();
-  TF_RETURN_IF_ERROR(dimensions.status());
-  if (dimensions->empty()) {
-    return FailedPrecondition("No output dimensions found");
-  }
-  return dimensions->front();
+// Returns a flat list of IFRT shapes from the dimension information that a PjRt
+// executable returns (per-module lists of dimension vectors).
+// PjRt-IFRT always uses the first module's information.
+absl::StatusOr<std::vector<Shape>> GetShapes(
+    const absl::StatusOr<std::vector<std::vector<xla::DimensionVector>>>&
+        pjrt_executable_dimensions,
+    absl::Span<const DType> dtypes) {
+  TF_RETURN_IF_ERROR(pjrt_executable_dimensions.status());
+  if (pjrt_executable_dimensions->empty()) {
+    return FailedPrecondition("No module found");
+  }
+  if (pjrt_executable_dimensions->front().size() != dtypes.size()) {
+    return FailedPrecondition(
+        "Output dimensions and dtypes have different sizes: %d vs. %d",
+        pjrt_executable_dimensions->front().size(), dtypes.size());
+  }
+  std::vector<Shape> shapes;
+  shapes.reserve(pjrt_executable_dimensions->front().size());
+  for (int i = 0; i < pjrt_executable_dimensions->front().size(); ++i) {
+    if (dtypes[i].kind() == DType::kToken) {
+      // Token uses a scalar shape by convention.
+      shapes.push_back(Shape({}));
+    } else {
+      shapes.push_back(Shape(pjrt_executable_dimensions->front()[i]));
+    }
+  }
+  return shapes;
 }
 
-// Returns the output shardings of the first module in a
-// `PjRtLoadedExecutable`.
-absl::StatusOr<std::optional<xla::HloSharding>> GetFirstModuleOutputSharding(
-    xla::PjRtLoadedExecutable* pjrt_loaded_executable,
-    const xla::Shape& shape) {
-  auto output_shardings = pjrt_loaded_executable->GetOutputShardings();
-  std::optional<xla::HloSharding> result_hlo_sharding;
-  if (output_shardings.has_value()) {
-    std::vector<xla::HloSharding> hlo_shardings;
-    hlo_shardings.reserve(output_shardings->size());
-    for (const auto& sharding : *output_shardings) {
-      TF_ASSIGN_OR_RETURN(auto hlo_sharding,
-                          xla::HloSharding::FromProto(sharding));
-      hlo_shardings.push_back(hlo_sharding);
-    }
-    if (shape.IsTuple()) {
-      return xla::HloSharding::Tuple(shape, hlo_shardings);
+// Returns a pair of flat lists of IFRT dtypes and shapes from XLA shapes
+// extracted from an MLIR module's signature.
+absl::StatusOr<std::pair<std::vector<DType>, std::vector<Shape>>>
+GetDTypesAndShapes(absl::Span<const xla::Shape> mlir_module_xla_shapes) {
+  std::vector<DType> dtypes;
+  dtypes.reserve(mlir_module_xla_shapes.size());
+  std::vector<Shape> shapes;
+  shapes.reserve(mlir_module_xla_shapes.size());
+  for (const xla::Shape& xla_shape : mlir_module_xla_shapes) {
+    TF_ASSIGN_OR_RETURN(DType dtype, ToDType(xla_shape.element_type()));
+    dtypes.push_back(dtype);
+    if (dtype.kind() == DType::kToken) {
+      // Token uses a scalar shape by convention.
+      shapes.push_back(Shape({}));
     } else {
-      return hlo_shardings.front();
+      shapes.push_back(Shape(xla_shape.dimensions()));
     }
   }
-  return std::nullopt;
+  return std::make_pair(std::move(dtypes), std::move(shapes));
 }
 
-// Returns the flattened output memory_kinds of the first module in a
-// `PjRtLoadedExecutable`.
-// `UnimplementedError` will be converted into `std::nullopt`.
-absl::StatusOr<std::optional<std::vector<absl::string_view>>>
-GetFirstModuleOutputMemoryKinds(
-    xla::PjRtLoadedExecutable* pjrt_loaded_executable) {
-  auto output_memory_kinds = pjrt_loaded_executable->GetOutputMemoryKinds();
-  // Gracefully handle an unimplemented error.
-  if (absl::IsUnimplemented(output_memory_kinds.status())) {
+// Returns a flat list of HLO shardings from the sharding information that a
+// PjRt executable returns (a flat list of `OpSharding`s, with some special
+// cases). Returns `std::nullopt` if the executable does not have sharding
+// information.
+absl::StatusOr<std::optional<std::vector<xla::HloSharding>>> GetHloShardings(
+    const std::optional<std::vector<xla::OpSharding>>&
+        pjrt_executable_op_shardings,
+    absl::Span<const DType> dtypes, bool is_output) {
+  if (!pjrt_executable_op_shardings.has_value()) {
     return std::nullopt;
   }
-  TF_RETURN_IF_ERROR(output_memory_kinds.status());
-  // Expect `xla::PjRtExecutable::GetOutputMemoryKinds()` to return at least
-  // one module's output memory_kinds if it returns any non-error result.
-  if (output_memory_kinds->empty()) {
-    return FailedPrecondition("No output memory kinds found");
+  std::vector<xla::HloSharding> hlo_shardings;
+  if (is_output && dtypes.empty()) {
+    // If the HLO module output is an empty tuple, the output sharding will have
+    // a single element for the tuple as a special case. We allow this condition
+    // by checking this condition specifically.
+    if (pjrt_executable_op_shardings->size() != 1) {
+      return FailedPrecondition(
+          "HLO module output is an empty tuple, but the output sharding has "
+          "%d elements",
+          pjrt_executable_op_shardings->size());
+    }
+    return std::vector<xla::HloSharding>();
+  }
+  if (pjrt_executable_op_shardings->size() != dtypes.size()) {
+    return FailedPrecondition(
+        "Output shardings and dtypes have different sizes: %d vs. %d",
+        pjrt_executable_op_shardings->size(), dtypes.size());
+  }
+  hlo_shardings.reserve(pjrt_executable_op_shardings->size());
+  for (int i = 0; i < pjrt_executable_op_shardings->size(); ++i) {
+    if (dtypes[i].kind() == DType::kToken) {
+      // Token uses a fully replicated sharding by convention.
+      hlo_shardings.push_back(xla::HloSharding::Replicate());
+    } else {
+      TF_ASSIGN_OR_RETURN(
+          auto hlo_sharding,
+          xla::HloSharding::FromProto((*pjrt_executable_op_shardings)[i]));
+      hlo_shardings.push_back(hlo_sharding);
+    }
   }
-  return std::move(output_memory_kinds)->front();
+  return hlo_shardings;
 }
 
-// Returns the flattened output layouts of the first module in a
-// `PjRtLoadedExecutable`.
-// `UnimplementedError` will be converted into a vector of `nullptr`.
-absl::StatusOr<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-GetFirstModuleOutputLayouts(
-    xla::PjRtLoadedExecutable* pjrt_loaded_executable,
-    absl::Span<const xla::LayoutMode> output_layout_modes) {
-  absl::StatusOr<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-      executable_output_layouts = pjrt_loaded_executable->GetOutputLayouts();
-  // An unimplemented error is converted into all-default layouts.
-  if (absl::IsUnimplemented(executable_output_layouts.status())) {
-    return std::vector<std::shared_ptr<const xla::PjRtLayout>>(
-        /*size=*/output_layout_modes.size(), /*value=*/nullptr);
-  }
-  TF_RETURN_IF_ERROR(executable_output_layouts.status());
-  std::vector<std::shared_ptr<const xla::PjRtLayout>> output_layouts;
-  if (executable_output_layouts->size() != output_layout_modes.size()) {
+// Returns a flat list of IFRT memory kinds from the memory kind information
+// that a PjRt executable returns (per-module lists of memory kind strings).
+// PjRt-IFRT always uses the first module's information.
+absl::StatusOr<std::vector<absl::string_view>> GetMemoryKinds(
+    const absl::StatusOr<std::vector<std::vector<absl::string_view>>>&
+        pjrt_executable_memory_kinds,
+    absl::Span<const DType> dtypes) {
+  std::vector<absl::string_view> memory_kinds;
+  // An unimplemented error is converted into all-default memory kinds.
+  if (absl::IsUnimplemented(pjrt_executable_memory_kinds.status())) {
+    memory_kinds.resize(/*size=*/dtypes.size(), /*value=*/kDefaultMemoryKind);
+    return memory_kinds;
+  }
+  TF_RETURN_IF_ERROR(pjrt_executable_memory_kinds.status());
+  if (pjrt_executable_memory_kinds->empty()) {
+    return FailedPrecondition("No module found");
+  }
+  if (pjrt_executable_memory_kinds->front().size() != dtypes.size()) {
     return FailedPrecondition(
-        "Output memory kinds and output layout modes have different sizes: %d "
-        "vs. %d",
-        executable_output_layouts->size(), output_layout_modes.size());
-  }
-  output_layouts.reserve(executable_output_layouts->size());
-  for (int i = 0; i < executable_output_layouts->size(); ++i) {
-    if (output_layout_modes[i].mode == xla::LayoutMode::Mode::kDefault) {
-      output_layouts.push_back(nullptr);
+        "Memory kinds and dtypes have different sizes: %d vs. %d",
+        pjrt_executable_memory_kinds->front().size(), dtypes.size());
+  }
+  memory_kinds.reserve(pjrt_executable_memory_kinds->front().size());
+  for (int i = 0; i < pjrt_executable_memory_kinds->front().size(); ++i) {
+    if (dtypes[i].kind() == DType::kToken) {
+      // Token uses a device memory kind by convention.
+      memory_kinds.push_back(kDefaultMemoryKind);
     } else {
-      output_layouts.push_back(std::move((*executable_output_layouts)[i]));
+      memory_kinds.push_back(pjrt_executable_memory_kinds->front()[i]);
     }
   }
-  return output_layouts;
+  return memory_kinds;
 }
 
-struct ShapePartialInfo {
-  std::vector<xla::PrimitiveType> element_types;
-  std::vector<xla::DimensionVector> dimensions;
-};
-
-absl::StatusOr<ShapePartialInfo> CreateShapePartialInfo(
-    absl::Span<const xla::Shape> shapes) {
-  ShapePartialInfo partial_info;
-  partial_info.element_types.reserve(shapes.size());
-  partial_info.dimensions.reserve(shapes.size());
-  for (const auto& shape : shapes) {
-    if (shape.IsTuple()) {
-      return FailedPrecondition(
-          "Tupled shape is not supported in `CreateShapePartialInfo`.");
+// Makes IFRT shardings created from HLO shardings and memory kinds.
+std::vector<ShardingRef> MakeShardings(
+    absl::Span<const Shape> shapes,
+    const std::optional<std::vector<xla::HloSharding>>& hlo_shardings,
+    absl::Span<const absl::string_view> memory_kinds,
+    const DeviceListRef& executable_devices) {
+  std::vector<ShardingRef> shardings;
+  shardings.reserve(memory_kinds.size());
+  if (hlo_shardings.has_value()) {
+    for (int i = 0; i < memory_kinds.size(); ++i) {
+      shardings.push_back(ifrt::HloSharding::Create(executable_devices,
+                                                    MemoryKind(memory_kinds[i]),
+                                                    (*hlo_shardings)[i]));
+    }
+  } else {
+    // Assume a traditional replication computation where tile shapes are the
+    // same as global shapes.
+    for (int i = 0; i < memory_kinds.size(); ++i) {
+      shardings.push_back(ifrt::ConcreteEvenSharding::Create(
+          executable_devices, MemoryKind(memory_kinds[i]),
+          /*shape=*/shapes[i],
+          /*shard_shape=*/shapes[i]));
     }
-    partial_info.element_types.push_back(shape.element_type());
-    partial_info.dimensions.push_back(
-        xla::ShapeUtil::CreateDimensionVectorFromShape(shape));
   }
+  return shardings;
+}
 
-  return partial_info;
+// Returns a flat list of layouts by combining layout modes and PjRt executable
+// layouts.
+// If any error other than an unimplemented error happens, returns
+// `std::nullopt`. The layout will be determined at execute time.
+//
+// TODO(hyeontaek): Remove the nullopt path once obtaining layout modes and
+// concrete layouts avoids HLO module serialization/deserialization and always
+// succeeds.
+absl::StatusOr<
+    std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>>
+GetLayouts(
+    const absl::StatusOr<std::vector<std::shared_ptr<const xla::PjRtLayout>>>&
+        pjrt_executable_layouts,
+    absl::Span<const xla::LayoutMode> layout_modes) {
+  // An unimplemented error is converted into all-default layouts.
+  if (absl::IsUnimplemented(pjrt_executable_layouts.status())) {
+    return std::vector<std::shared_ptr<const xla::PjRtLayout>>(
+        /*size=*/layout_modes.size(), /*value=*/nullptr);
+  }
+  if (!pjrt_executable_layouts.ok()) {
+    return std::nullopt;
+  }
+  std::vector<std::shared_ptr<const xla::PjRtLayout>> layouts;
+  if (pjrt_executable_layouts->size() != layout_modes.size()) {
+    return FailedPrecondition(
+        "Layouts and layout modes have different sizes: %d vs. %d",
+        pjrt_executable_layouts->size(), layout_modes.size());
+  }
+  layouts.reserve(pjrt_executable_layouts->size());
+  for (int i = 0; i < pjrt_executable_layouts->size(); ++i) {
+    if (layout_modes[i].mode == xla::LayoutMode::Mode::kDefault) {
+      layouts.push_back(nullptr);
+    } else {
+      layouts.push_back(std::move((*pjrt_executable_layouts)[i]));
+    }
+  }
+  return layouts;
 }
 
 // Special `xla::GetLayoutModes()` implementation for obtaining layout modes
@@ -237,9 +308,10 @@ static absl::StatusOr<std::vector<LayoutMode>> GetLayoutModesFromFrontendAttr(
   std::vector<std::string> str_modes =
       absl::StrSplit(attr, kDelimiter, absl::SkipEmpty());
   std::vector<LayoutMode> result;
+  result.reserve(str_modes.size());
   for (const std::string& str_mode : str_modes) {
     TF_ASSIGN_OR_RETURN(LayoutMode mode, LayoutMode::FromString(str_mode));
-    result.emplace_back(std::move(mode));
+    result.push_back(std::move(mode));
   }
   return result;
 }
@@ -256,6 +328,89 @@ static absl::StatusOr<std::vector<LayoutMode>> GetLayoutModes(
   return GetLayoutModesFromFrontendAttr(iter->second);
 }
 
+// Returns a flat list of output layout modes by examining the HLO modules.
+//
+// TODO(hyeontaek): Remove this layout mode discovery method once
+// deserialization loads layout information from the serialization metadata
+// instead of from `xla::PjRtExecutable` or `xla::PjRtLoadedExecutable`.
+absl::StatusOr<std::vector<xla::LayoutMode>> GetOutputLayoutModesFromHloModules(
+    const absl::StatusOr<std::vector<std::shared_ptr<xla::HloModule>>>&
+        hlo_modules,
+    absl::Span<const DType> output_dtypes) {
+  TF_RETURN_IF_ERROR(hlo_modules.status());
+  if (hlo_modules->empty()) {
+    return FailedPrecondition("No module found");
+  }
+  return GetLayoutModes(*hlo_modules->front(), "out_layout_modes",
+                        output_dtypes.size());
+}
+
+// Returns a new `DeviceListRef` that contains the addressable devices of the
+// PjRt executable if the supplied `executable_devices` has an incomplete set of
+// devices.
+absl::StatusOr<DeviceListRef> AdjustExecutableDevicesForPmap(
+    PjRtClient* client, const xla::PjRtLoadedExecutable* pjrt_loaded_executable,
+    DeviceListRef executable_devices) {
+  // For jit(pmap(...)), the device assignment (passed as `executable_devices`)
+  // may contain a single device while the PjRt executable has multiple
+  // addressable devices. We check for this condition and replace
+  // `executable_devices` with the executable's addressable devices if
+  // necessary.
+  if (pjrt_loaded_executable->num_replicas() > 1 &&
+      executable_devices->devices().size() == 1) {
+    if (pjrt_loaded_executable->addressable_devices().size() > 1) {
+      BasicDeviceList::Devices ds;
+      ds.reserve(pjrt_loaded_executable->addressable_devices().size());
+      for (xla::PjRtDevice* device :
+           pjrt_loaded_executable->addressable_devices()) {
+        TF_ASSIGN_OR_RETURN(Device * ifrt_device,
+                            client->LookupPjRtDevice(device));
+        ds.push_back(ifrt_device);
+      }
+      executable_devices = BasicDeviceList::Create(std::move(ds));
+    } else if (pjrt_loaded_executable->addressable_devices().size() == 1) {
+      TF_ASSIGN_OR_RETURN(
+          Device * ifrt_device,
+          client->LookupPjRtDevice(
+              pjrt_loaded_executable->addressable_devices().front()));
+      if (ifrt_device != executable_devices->devices().front()) {
+        return FailedPrecondition(
+            "Addressable device does not match sharding device");
+      }
+    }
+  }
+  if (executable_devices->devices().size() <
+      pjrt_loaded_executable->addressable_devices().size()) {
+    return FailedPrecondition(
+        "Sharding devices must be at least as many as addressable devices");
+  }
+  return executable_devices;
+}
+
+// Gathers all `PjRtHostSendAndRecvLoadedHostCallback` from the given list of
+// loaded host callbacks.
+std::vector<PjRtHostSendAndRecvLoadedHostCallback*>
+GatherHostSendAndRecvCallbacks(
+    absl::Span<const tsl::RCReference<LoadedHostCallback>>
+        loaded_host_callbacks) {
+  std::vector<PjRtHostSendAndRecvLoadedHostCallback*>
+      host_send_and_recv_callbacks;
+  host_send_and_recv_callbacks.reserve(loaded_host_callbacks.size());
+  // Gather all `PjRtLoadedHostCallback` separately, as each execution will
+  // register `PjRtLoadedHostCallback` for host send and recv. All host
+  // callbacks will be referenced by the executable and any pending execution to
+  // guarantee the liveliness of host callbacks during executions.
+  for (auto& loaded_host_callback : loaded_host_callbacks) {
+    auto* host_send_and_recv_callback =
+        llvm::dyn_cast<PjRtHostSendAndRecvLoadedHostCallback>(
+            loaded_host_callback.get());
+    if (host_send_and_recv_callback != nullptr) {
+      host_send_and_recv_callbacks.push_back(host_send_and_recv_callback);
+    }
+  }
+  return host_send_and_recv_callbacks;
+}
+
 }  // namespace
 
 char PjRtCompatibleExecutable::ID = 0;
@@ -292,52 +447,69 @@ absl::StatusOr<LoadedExecutableRef> PjRtLoadedExecutable::Create(
     std::shared_ptr<xla::PjRtLoadedExecutable> pjrt_loaded_executable,
     std::vector<tsl::RCReference<LoadedHostCallback>> loaded_host_callbacks,
     DeviceListRef executable_devices) {
-  // TODO(hyeontaek): Use a full shape and a sharding rather than a per-shard
-  // shape.
   VLOG(3) << "PjRtLoadedExecutable::Create";
-  VLOG(3) << "Using per-shard shape";
+
+  TF_ASSIGN_OR_RETURN(
+      executable_devices,
+      AdjustExecutableDevicesForPmap(client, pjrt_loaded_executable.get(),
+                                     std::move(executable_devices)));
+
   TF_ASSIGN_OR_RETURN(
-      auto result_element_types,
-      GetFirstModuleOutputElementTypes(pjrt_loaded_executable.get()));
+      std::vector<DType> output_dtypes,
+      GetDTypes(pjrt_loaded_executable->GetOutputElementTypes()));
   TF_ASSIGN_OR_RETURN(
-      auto result_dimensions,
-      GetFirstModuleOutputDimensions(pjrt_loaded_executable.get()));
+      std::vector<Shape> output_shapes,
+      GetShapes(pjrt_loaded_executable->GetOutputDimensions(), output_dtypes));
+  // When creating `xla::ifrt::PjRtLoadedExecutable` from an already compiled
+  // and loaded `xla::PjRtLoadedExecutable`, we do not have a full shape
+  // (`xla::PjRtLoadedExecutable::GetOutputDimensions()` returns shard shapes).
+  // This prevents us from using
+  // `xla::PjRtLoadedExecutable::GetOutputShardings()` for constructing IFRT
+  // shardings; otherwise, we would try to apply the shardings to already
+  // sharded shapes, which will result in incorrect sharded shapes (and layouts
+  // computed from these shard shapes). Thus, we ignore HLO shardings and use
+  // `xla::ifrt::ConcreteEvenSharding` that will take the already sharded shapes
+  // as shard shapes.
+  //
+  // TODO(hyeontaek): Remove this special handling once we can preserve full
+  // output shapes and layouts from the original compilation during
+  // serialization/deserialization, and remove this `PjRtLoadedExecutable`
+  // construction path.
+  std::optional<std::vector<xla::HloSharding>> output_hlo_shardings =
+      std::nullopt;
   TF_ASSIGN_OR_RETURN(
-      auto result_memory_kinds,
-      GetFirstModuleOutputMemoryKinds(pjrt_loaded_executable.get()));
-  // Obtaining output layout modes and output layouts directly from
-  // `PjRtLoadedExecutable` may fail because the currently PjRt implementations
-  // often fetch and serialize the optimized HLO. For now, we gracefully
-  // handle it by omitting output layouts at creation time and using output
-  // `PjRtBuffer`'s concrete layouts.
-  // TODO(hyeontaek): Add a way to obtain output layout modes and
-  // `PjRtLoadedExecutable::GetOutputLayouts()` without causing the optimized
-  // HLO to be serialized and fetched.
+      std::vector<absl::string_view> output_memory_kinds,
+      GetMemoryKinds(pjrt_loaded_executable->GetOutputMemoryKinds(),
+                     output_dtypes));
+  std::vector<ShardingRef> output_shardings =
+      MakeShardings(output_shapes, output_hlo_shardings, output_memory_kinds,
+                    executable_devices);
+
+  // Obtaining output layout modes and output layouts directly may fail because
+  // PjRt implementations often fetch and serialize/deserialize the optimized
+  // HLO to provide the layout information. For now, we gracefully handle it by
+  // omitting output layouts at creation time and using output `PjRtBuffer`'s
+  // concrete layouts.
+  //
+  // TODO(hyeontaek): Remove this layout mode discovery method once
+  // deserialization loads layout information from the serialization metadata
+  // instead of from `xla::PjRtExecutable` or `xla::PjRtLoadedExecutable`.
   std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
       output_layouts;
-  absl::StatusOr<std::vector<std::shared_ptr<HloModule>>> hlo_modules =
-      pjrt_loaded_executable->GetHloModules();
-  if (hlo_modules.ok()) {
-    if (hlo_modules->empty()) {
-      return FailedPrecondition("Requires at least one HloModule.");
-    }
-    absl::StatusOr<std::vector<xla::LayoutMode>> output_layout_modes =
-        GetLayoutModes(*hlo_modules->front(), "out_layout_modes",
-                       result_element_types.size());
-    if (output_layout_modes.ok()) {
-      absl::StatusOr<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-          first_module_output_layouts = GetFirstModuleOutputLayouts(
-              pjrt_loaded_executable.get(), *output_layout_modes);
-      if (first_module_output_layouts.ok()) {
-        output_layouts = *std::move(first_module_output_layouts);
-      }
-    }
+  absl::StatusOr<std::vector<xla::LayoutMode>> output_layout_modes =
+      GetOutputLayoutModesFromHloModules(
+          pjrt_loaded_executable->GetHloModules(), output_dtypes);
+  if (output_layout_modes.ok()) {
+    TF_ASSIGN_OR_RETURN(output_layouts,
+                        GetLayouts(pjrt_loaded_executable->GetOutputLayouts(),
+                                   *output_layout_modes));
   }
-  return CreateInternal(client, std::move(pjrt_loaded_executable),
-                        result_element_types, result_dimensions,
-                        /*result_hlo_sharding=*/std::nullopt,
-                        result_memory_kinds, output_layouts,
-                        loaded_host_callbacks, std::move(executable_devices));
+
+  return LoadedExecutableRef(new PjRtLoadedExecutable(
+      client, std::move(pjrt_loaded_executable), std::move(executable_devices),
+      std::move(loaded_host_callbacks), std::move(output_dtypes),
+      std::move(output_shapes), std::move(output_shardings),
+      std::move(output_layouts)));
 }
 
 static absl::StatusOr<std::vector<xla::Shape>> ResultShapesOfModule(
@@ -366,280 +538,49 @@ absl::StatusOr<LoadedExecutableRef> PjRtLoadedExecutable::Create(
     module.dump();
   }
   VLOG(3) << compile_options.ToProto()->DebugString();
-  const auto& build_options = compile_options.executable_build_options;
-  const bool auto_spmd_partitioning =
-      build_options.use_spmd_partitioning() &&
-      build_options.num_partitions() > 1 &&
-      (build_options.use_auto_spmd_partitioning() ||
-       build_options.any_allow_spmd_sharding_propagation_to_parameters() ||
-       build_options.any_allow_spmd_sharding_propagation_to_output());
 
   // We have to do process the MLIR before the compile call, since the latter
   // will use the MLIR as scratch space, or possibly even deallocate it.
-  TF_ASSIGN_OR_RETURN(const std::vector<xla::Shape> result_shapes,
-                      ResultShapesOfModule(module));
-  absl::StatusOr<std::vector<xla::LayoutMode>> output_layout_modes =
-      GetOutputLayoutModes(module);
-
-  TF_ASSIGN_OR_RETURN(auto pjrt_loaded_executable,
-                      client->pjrt_client()->CompileAndLoad(
-                          std::move(module), std::move(compile_options)));
-
-  if (auto_spmd_partitioning) {
-    // TODO(hyeontaek): Use a full shape and a sharding rather than a per-shard
-    // shape.
-    VLOG(3) << "Using per-shard shape";
-    TF_ASSIGN_OR_RETURN(
-        auto result_element_types,
-        GetFirstModuleOutputElementTypes(pjrt_loaded_executable.get()));
-    TF_ASSIGN_OR_RETURN(
-        auto result_dimensions,
-        GetFirstModuleOutputDimensions(pjrt_loaded_executable.get()));
-    TF_ASSIGN_OR_RETURN(
-        auto result_memory_kinds,
-        GetFirstModuleOutputMemoryKinds(pjrt_loaded_executable.get()));
-    // Obtaining output layout modes and output layouts directly from
-    // `PjRtLoadedExecutable` may fail because the currently PjRt
-    // implementations often fetch and serialize the optimized HLO. For now, we
-    // gracefully handle it by omitting output layouts at creation time and
-    // using output `PjRtBuffer`'s concrete layouts.
-    // TODO(hyeontaek): Add a way to obtain output layout modes and
-    // `PjRtLoadedExecutable::GetOutputLayouts()` without causing the optimized
-    // HLO to be serialized and fetched.
-    std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-        output_layouts;
-    if (output_layout_modes.ok()) {
-      absl::StatusOr<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-          first_module_output_layouts = GetFirstModuleOutputLayouts(
-              pjrt_loaded_executable.get(), *output_layout_modes);
-      if (first_module_output_layouts.ok()) {
-        output_layouts = *std::move(first_module_output_layouts);
-      }
-    }
-    return CreateInternal(client, std::move(pjrt_loaded_executable),
-                          result_element_types, result_dimensions,
-                          /*result_hlo_sharding=*/std::nullopt,
-                          result_memory_kinds, output_layouts,
-                          std::move(loaded_host_callbacks),
-                          std::move(executable_devices));
-  } else {
-    VLOG(3) << "Using full shape";
-    // TODO(yueshengys): Consider getting element types and dimensions directly
-    // from module.
-    bool tuple_output = result_shapes.size() != 1;
-    xla::Shape result_shape;
-    std::vector<xla::Shape> output_shapes;
-    if (tuple_output) {
-      result_shape = xla::ShapeUtil::MakeTupleShape(result_shapes);
-      output_shapes = std::move(result_shapes);
-    } else {
-      result_shape = result_shapes.front();
-      output_shapes = result_shape.IsTuple()
-                          ? result_shape.tuple_shapes()
-                          : std::vector<xla::Shape>{result_shape};
-    }
-    TF_ASSIGN_OR_RETURN(auto shape_partial_info,
-                        CreateShapePartialInfo(output_shapes));
-    TF_ASSIGN_OR_RETURN(auto result_hlo_sharding,
-                        GetFirstModuleOutputSharding(
-                            pjrt_loaded_executable.get(), result_shape));
-    TF_ASSIGN_OR_RETURN(
-        auto result_memory_kinds,
-        GetFirstModuleOutputMemoryKinds(pjrt_loaded_executable.get()));
-    // Obtaining output layout modes and output layouts directly from
-    // `PjRtLoadedExecutable` may fail because the currently PjRt
-    // implementations often fetch and serialize the optimized HLO. For now, we
-    // gracefully handle it by omitting output layouts at creation time and
-    // using output `PjRtBuffer`'s concrete layouts.
-    // TODO(hyeontaek): Add a way to obtain output layout modes and
-    // `PjRtLoadedExecutable::GetOutputLayouts()` without causing the optimized
-    // HLO to be serialized and fetched.
-    std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-        output_layouts;
-    if (output_layout_modes.ok()) {
-      absl::StatusOr<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
-          first_module_output_layouts = GetFirstModuleOutputLayouts(
-              pjrt_loaded_executable.get(), *output_layout_modes);
-      if (first_module_output_layouts.ok()) {
-        output_layouts = *std::move(first_module_output_layouts);
-      }
-    }
-    return CreateInternal(
-        client, std::move(pjrt_loaded_executable),
-        shape_partial_info.element_types, shape_partial_info.dimensions,
-        result_hlo_sharding, result_memory_kinds, output_layouts,
-        std::move(loaded_host_callbacks), std::move(executable_devices));
-  }
-}
-
-absl::StatusOr<LoadedExecutableRef> PjRtLoadedExecutable::CreateInternal(
-    PjRtClient* client,
-    std::shared_ptr<xla::PjRtLoadedExecutable> pjrt_loaded_executable,
-    absl::Span<const xla::PrimitiveType> result_element_types,
-    absl::Span<const xla::DimensionVector> result_dimensions,
-    const std::optional<xla::HloSharding>& result_hlo_sharding,
-    const std::optional<std::vector<absl::string_view>>& result_memory_kinds,
-    const std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>&
-        output_layouts,
-    std::vector<tsl::RCReference<LoadedHostCallback>> loaded_host_callbacks,
-    DeviceListRef executable_devices) {
-  // For jit(pmap(...)), the device assignment (passed as `executable_devices`)
-  // may contain a single device while the PjRt executable has multiple
-  // addressable devices. We check for this condition and replace
-  // `executable_devices` with the executable's addressable devices if
-  // necessary.
-  if (pjrt_loaded_executable->num_replicas() > 1 &&
-      executable_devices->devices().size() == 1) {
-    if (pjrt_loaded_executable->addressable_devices().size() > 1) {
-      BasicDeviceList::Devices ds;
-      ds.reserve(pjrt_loaded_executable->addressable_devices().size());
-      for (xla::PjRtDevice* device :
-           pjrt_loaded_executable->addressable_devices()) {
-        TF_ASSIGN_OR_RETURN(Device * ifrt_device,
-                            client->LookupPjRtDevice(device));
-        ds.push_back(ifrt_device);
-      }
-      executable_devices = BasicDeviceList::Create(std::move(ds));
-    } else if (pjrt_loaded_executable->addressable_devices().size() == 1) {
-      TF_ASSIGN_OR_RETURN(
-          Device * ifrt_device,
-          client->LookupPjRtDevice(
-              pjrt_loaded_executable->addressable_devices().front()));
-      if (ifrt_device != executable_devices->devices().front()) {
-        return FailedPrecondition(
-            "Addressable device does not match sharding device");
-      }
-    }
-  }
-  if (executable_devices->devices().size() <
-      pjrt_loaded_executable->addressable_devices().size()) {
-    return FailedPrecondition(
-        "Sharding devices must be at least as many as addressable devices");
-  }
-  std::vector<DType> output_dtypes;
-  std::vector<Shape> output_shapes;
-  std::vector<ShardingRef> output_shardings;
-
-  auto append_arg = [&](const xla::PrimitiveType& element_type,
-                        const xla::DimensionVector& dimensions,
-                        const xla::HloSharding* sharding,
-                        MemoryKind memory_kind) -> absl::Status {
-    TF_ASSIGN_OR_RETURN(auto dtype, ToDType(element_type));
-    output_dtypes.push_back(dtype);
-    output_shapes.push_back(Shape(dimensions));
-
-    CHECK(xla::primitive_util::IsArrayType(element_type));
-
-    if (sharding != nullptr) {
-      output_shardings.push_back(ifrt::HloSharding::Create(
-          executable_devices, memory_kind, *sharding));
-    } else {
-      // Assume a traditional replication computation where tile shapes are
-      // the same as global shapes.
-      const xla::DimensionVector& tile_shape_dimensions = dimensions;
-      output_shardings.push_back(ifrt::ConcreteEvenSharding::Create(
-          executable_devices, memory_kind,
-          /*shape=*/ifrt::Shape(dimensions),
-          /*shard_shape=*/ifrt::Shape(tile_shape_dimensions)));
-    }
-    return absl::OkStatus();
-  };
-  auto append_token = [&](MemoryKind memory_kind) {
-    output_dtypes.push_back(DType(DType::kToken));
-    output_shapes.push_back(Shape({}));
-    output_shardings.push_back(
-        ifrt::ConcreteEvenSharding::Create(executable_devices, memory_kind,
-                                           /*shape=*/ifrt::Shape({}),
-                                           /*shard_shape=*/ifrt::Shape({})));
-  };
-  auto check_output_sharding_condition =
-      [](absl::Span<const xla::PrimitiveType> element_types,
-         const xla::HloSharding& sharding) {
-        if (sharding.IsTuple()) {
-          // Check that the HLO sharding of the result has the same number of
-          // elements as the output tuple shape. If the output is an empty tuple
-          // then the output sharding will have a single element for the tuple
-          // as a special case, so we will have to allow that by checking this
-          // condition specifically.
-          return element_types.size() == sharding.tuple_elements().size() ||
-                 (element_types.empty() &&
-                  sharding.tuple_elements().size() == 1);
-        }
-        return element_types.size() == 1;
-      };
-
-  if (result_memory_kinds.has_value() &&
-      result_memory_kinds->size() != result_element_types.size()) {
-    return FailedPrecondition(
-        "Output memory kinds are inconsistent with the output shape");
-  }
-  if (result_hlo_sharding.has_value() &&
-      !check_output_sharding_condition(result_element_types,
-                                       *result_hlo_sharding)) {
-    return FailedPrecondition(
-        "Output sharding is inconsistent with the output shape");
-  }
-
-  CHECK_EQ(result_element_types.size(), result_dimensions.size());
-  output_dtypes.reserve(result_element_types.size());
-  output_shapes.reserve(result_element_types.size());
-  output_shardings.reserve(result_element_types.size());
-  for (int i = 0; i < result_element_types.size(); ++i) {
-    const auto& element_type = result_element_types[i];
-    MemoryKind element_memory_kind;
-    if (result_memory_kinds.has_value()) {
-      element_memory_kind = MemoryKind((*result_memory_kinds)[i]);
-    }
-    if (xla::primitive_util::IsArrayType(element_type)) {
-      const xla::HloSharding* element_hlo_sharding = nullptr;
-      if (result_hlo_sharding.has_value()) {
-        element_hlo_sharding = result_hlo_sharding->IsTuple()
-                                   ? &result_hlo_sharding->tuple_elements()[i]
-                                   : &*result_hlo_sharding;
-        if (element_hlo_sharding->IsTuple()) {
-          return FailedPrecondition(
-              "Nested-tupled output sharding is not supported");
-        }
-      }
-      TF_RETURN_IF_ERROR(append_arg(element_type, result_dimensions[i],
-                                    element_hlo_sharding, element_memory_kind));
-    } else if (element_type == TOKEN) {
-      append_token(element_memory_kind);
-    } else {
-      return FailedPrecondition(
-          "The element type is not a supported type (array, token)");
-    }
-  }
+  TF_ASSIGN_OR_RETURN(
+      const std::vector<xla::Shape> mlir_module_output_xla_shapes,
+      ResultShapesOfModule(module));
+  TF_ASSIGN_OR_RETURN(const std::vector<xla::LayoutMode> output_layout_modes,
+                      GetOutputLayoutModes(module));
 
-  std::vector<PjRtHostSendAndRecvLoadedHostCallback*>
-      host_send_and_recv_callbacks;
-  host_send_and_recv_callbacks.reserve(loaded_host_callbacks.size());
-  // Gather all `PjRtLoadedHostCallback` separately, as each execution will
-  // register `PjRtLoadedHostCallback` for host send and recv. All host
-  // callbacks will be referenced by the executable and any pending execution to
-  // guarantee the liveliness of host callbacks during executions.
-  for (auto& loaded_host_callback : loaded_host_callbacks) {
-    auto* host_send_and_recv_callback =
-        llvm::dyn_cast<PjRtHostSendAndRecvLoadedHostCallback>(
-            loaded_host_callback.get());
-    if (host_send_and_recv_callback != nullptr) {
-      host_send_and_recv_callbacks.push_back(host_send_and_recv_callback);
-    }
-  }
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<xla::PjRtLoadedExecutable> pjrt_loaded_executable,
+      client->pjrt_client()->CompileAndLoad(std::move(module),
+                                            std::move(compile_options)));
 
-  std::vector<Device*> addressable_devices;
-  addressable_devices.reserve(
-      pjrt_loaded_executable->addressable_devices().size());
-  for (xla::PjRtDevice* device :
-       pjrt_loaded_executable->addressable_devices()) {
-    TF_ASSIGN_OR_RETURN(Device * ifrt_device, client->LookupPjRtDevice(device));
-    addressable_devices.push_back(ifrt_device);
-  }
+  TF_ASSIGN_OR_RETURN(
+      executable_devices,
+      AdjustExecutableDevicesForPmap(client, pjrt_loaded_executable.get(),
+                                     std::move(executable_devices)));
+
+  TF_ASSIGN_OR_RETURN(auto output_dtypes_and_shapes,
+                      GetDTypesAndShapes(mlir_module_output_xla_shapes));
+  std::vector<DType> output_dtypes = std::move(output_dtypes_and_shapes.first);
+  std::vector<Shape> output_shapes = std::move(output_dtypes_and_shapes.second);
+  TF_ASSIGN_OR_RETURN(
+      std::optional<std::vector<xla::HloSharding>> output_hlo_shardings,
+      GetHloShardings(pjrt_loaded_executable->GetOutputShardings(),
+                      output_dtypes, /*is_output=*/true));
+  TF_ASSIGN_OR_RETURN(
+      std::vector<absl::string_view> output_memory_kinds,
+      GetMemoryKinds(pjrt_loaded_executable->GetOutputMemoryKinds(),
+                     output_dtypes));
+  std::vector<ShardingRef> output_shardings =
+      MakeShardings(output_shapes, output_hlo_shardings, output_memory_kinds,
+                    executable_devices);
+  TF_ASSIGN_OR_RETURN(
+      std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
+          output_layouts,
+      GetLayouts(pjrt_loaded_executable->GetOutputLayouts(),
+                 output_layout_modes));
 
   return LoadedExecutableRef(new PjRtLoadedExecutable(
       client, std::move(pjrt_loaded_executable), std::move(executable_devices),
-      std::move(addressable_devices), std::move(loaded_host_callbacks),
-      std::move(host_send_and_recv_callbacks), std::move(output_dtypes),
+      std::move(loaded_host_callbacks), std::move(output_dtypes),
       std::move(output_shapes), std::move(output_shardings),
       std::move(output_layouts)));
 }
@@ -647,10 +588,8 @@ absl::StatusOr<LoadedExecutableRef> PjRtLoadedExecutable::CreateInternal(
 PjRtLoadedExecutable::PjRtLoadedExecutable(
     PjRtClient* client,
     std::shared_ptr<xla::PjRtLoadedExecutable> pjrt_loaded_executable,
-    DeviceListRef devices, std::vector<Device*> addressable_devices,
+    DeviceListRef devices,
     std::vector<tsl::RCReference<LoadedHostCallback>> all_loaded_host_callbacks,
-    std::vector<PjRtHostSendAndRecvLoadedHostCallback*>
-        host_send_recv_callbacks,
     std::vector<DType> output_dtypes, std::vector<Shape> output_shapes,
     std::vector<ShardingRef> output_shardings,
     std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
@@ -658,11 +597,12 @@ PjRtLoadedExecutable::PjRtLoadedExecutable(
     : client_(client),
       pjrt_loaded_executable_(std::move(pjrt_loaded_executable)),
       devices_(std::move(devices)),
-      addressable_devices_(std::move(addressable_devices)),
+      addressable_devices_(devices_->AddressableDeviceList()->devices()),
       all_loaded_host_callbacks_(
           std::make_shared<std::vector<tsl::RCReference<LoadedHostCallback>>>(
               std::move(all_loaded_host_callbacks))),
-      host_send_recv_callbacks_(std::move(host_send_recv_callbacks)),
+      host_send_recv_callbacks_(
+          GatherHostSendAndRecvCallbacks(*all_loaded_host_callbacks_)),
       output_dtypes_(std::move(output_dtypes)),
       output_shapes_(std::move(output_shapes)),
       output_shardings_(std::move(output_shardings)),
@@ -877,20 +817,9 @@ PjRtLoadedExecutable::Execute(absl::Span<ArrayRef> args,
     }
   } else {
     auto maybe_layouts = GetOutputLayouts();
+    // An unimplemented error is converted into all-default layouts.
     if (absl::IsUnimplemented(maybe_layouts.status())) {
-      for (int i = 0; i < num_outputs; ++i) {
-        std::shared_ptr<const xla::PjRtLayout> layout;
-        if (output_dtypes_[i].kind() == xla::ifrt::DType::kToken) {
-          layout = std::make_shared<xla::PjRtLayout>(xla::Layout());
-        } else {
-          TF_ASSIGN_OR_RETURN(layout,
-                              client_->GetDefaultPjRtLayout(
-                                  output_dtypes_[i], output_shapes_[i].dims(),
-                                  devices_->devices().front(),
-                                  output_shardings_[i]->memory_kind()));
-        }
-        layouts.push_back(std::move(layout));
-      }
+      layouts.resize(/*size=*/num_outputs, /*value=*/nullptr);
     } else {
       TF_RETURN_IF_ERROR(maybe_layouts.status());
       layouts = *std::move(maybe_layouts);
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
index 9dd2d445da4504..c3e4bd2111dc7d 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
@@ -32,7 +32,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/Support/ExtensibleRTTI.h"
 #include "mlir/IR/BuiltinOps.h"
-#include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
@@ -350,26 +349,12 @@ class PjRtLoadedExecutable final
   static char ID;  // NOLINT
 
  private:
-  static absl::StatusOr<LoadedExecutableRef> CreateInternal(
-      PjRtClient* client,
-      std::shared_ptr<xla::PjRtLoadedExecutable> pjrt_loaded_executable,
-      absl::Span<const xla::PrimitiveType> result_element_types,
-      absl::Span<const xla::DimensionVector> result_dimensions,
-      const std::optional<xla::HloSharding>& result_hlo_sharding,
-      const std::optional<std::vector<absl::string_view>>& result_memory_kinds,
-      const std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>&
-          output_layouts,
-      std::vector<tsl::RCReference<LoadedHostCallback>> loaded_host_callbacks,
-      DeviceListRef executable_devices);
-
   PjRtLoadedExecutable(
       PjRtClient* client,
       std::shared_ptr<xla::PjRtLoadedExecutable> pjrt_loaded_executable,
-      DeviceListRef devices, std::vector<Device*> addressable_devices,
+      DeviceListRef devices,
       std::vector<tsl::RCReference<LoadedHostCallback>>
           all_loaded_host_callbacks,
-      std::vector<PjRtHostSendAndRecvLoadedHostCallback*>
-          host_send_recv_callbacks,
       std::vector<DType> output_dtypes, std::vector<Shape> output_shapes,
       std::vector<ShardingRef> output_shardings,
       std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
@@ -380,17 +365,19 @@ class PjRtLoadedExecutable final
   // Devices that `pjrt_loaded_executable_` runs on. Empty if the executable is
   // portable.
   DeviceListRef devices_;
-  std::vector<Device*> addressable_devices_;
+  // Addressable devices. The underlying device list is owned by
+  // `devices_->AddressableDeviceList()`.
+  absl::Span<Device* const> addressable_devices_;
   std::shared_ptr<std::vector<tsl::RCReference<LoadedHostCallback>>>
       all_loaded_host_callbacks_;
   std::vector<PjRtHostSendAndRecvLoadedHostCallback*> host_send_recv_callbacks_;
 
-  // Output array specs. If the executable is portable, shardings in
-  // `output_shardings_` will use an arbitrary addressable device, and will be
-  // overridden by a `SingleDeviceSharding` generated on the fly at execution
-  // time.
+  // Output array specs.
   std::vector<DType> output_dtypes_;
   std::vector<Shape> output_shapes_;
+  // If the executable is portable, shardings in `output_shardings_` will use an
+  // arbitrary addressable device, and will be overridden by a
+  // `SingleDeviceSharding` generated on the fly at execution time.
   std::vector<ShardingRef> output_shardings_;
   std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
       output_layouts_;

From edd6824511a6747949a06a811b2f66618e197996 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <isergachev@nvidia.com>
Date: Wed, 10 Dec 2025 11:48:45 -0800
Subject: [PATCH 136/753] PR #34047: [GPU] Upgrade cuDNN frontend to 1.16.1.

Imported from GitHub PR https://github.com/openxla/xla/pull/34047

Copybara import of the project:

--
3f5e1f59906682459d9e819a104781c16f45fe84 by Ilia Sergachev <isergachev@nvidia.com>:

[GPU] Upgrade cuDNN frontend to 1.16.1.

Merging this change closes #34047

PiperOrigin-RevId: 842816375
---
 third_party/xla/third_party/cudnn_frontend/workspace.bzl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/third_party/cudnn_frontend/workspace.bzl b/third_party/xla/third_party/cudnn_frontend/workspace.bzl
index f954a198969f87..1bea852e045b88 100644
--- a/third_party/xla/third_party/cudnn_frontend/workspace.bzl
+++ b/third_party/xla/third_party/cudnn_frontend/workspace.bzl
@@ -7,7 +7,7 @@ def repo():
         name = "cudnn_frontend_archive",
         build_file = "//third_party:cudnn_frontend.BUILD",
         patch_file = ["//third_party:cudnn_frontend_header_fix.patch"],
-        sha256 = "257b3b7f8a99abc096094abc9e5011659117b647d55293bcd2c5659f9181b99e",
-        strip_prefix = "cudnn-frontend-1.13.0",
-        urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.13.0.zip"),
+        sha256 = "453d4650e6a25ede58fbbd7077c64ebe92734218d474ec7371bb13fa6d2181fa",
+        strip_prefix = "cudnn-frontend-1.16.1",
+        urls = tf_mirror_urls("https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.16.1.zip"),
     )

From ddbf18af4ef3bc26474470b9473439fc1d87c21f Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Wed, 10 Dec 2025 11:56:34 -0800
Subject: [PATCH 137/753] Prefer `all-gather` over
 `all-reduce(dynamic-update-slice)` in spmd partitioner.

Before this change, we can disable creating all-gather explicitly in the test files. With this change, we always enable creating all-gather.

A follow up of cl/820593767.

PiperOrigin-RevId: 842819486
---
 .../xla/xla/service/spmd/spmd_partitioner.cc  | 33 +++----
 .../xla/service/spmd/spmd_partitioner_test.cc | 97 ++++++-------------
 .../tools/hlo_control_flow_flattening_test.cc |  1 -
 3 files changed, 43 insertions(+), 88 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index 052fe73912d8ba..f9f8d1dbdc694c 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -1397,28 +1397,17 @@ HloInstruction* PartitionedHlo::ReplicatePartial(
     return broadcast;
   }
 
-  HloInstruction* result = nullptr;
-  if (state_.collective_ops_creator.create_cross_partition_all_gather) {
-    result = state_.partitioner->AllGatherShards(
-        state_.b, broadcast, sharding(), state_.next_channel_id, ag_dims,
-        state_.collective_ops_creator);
-  }
-
-  if (result == nullptr) {
-    // We do not create all-gather instructions.
-    dus_ar_dims.insert(dus_ar_dims.end(), ag_dims.begin(), ag_dims.end());
-    result = broadcast;
-  } else {
-    // We create all-gather instructions, which may contain padding. Add a slice
-    // to remove the padding.
-    if (!ShapeUtil::Compatible(result->shape(), ag_result_shape)) {
-      std::vector<int64_t> start_indices(ag_result_shape.dimensions().size(),
-                                         0);
-      std::vector<int64_t> strides(ag_result_shape.dimensions().size(), 1);
-      result = state_.b->AddInstruction(
-          HloInstruction::CreateSlice(ag_result_shape, result, start_indices,
-                                      ag_result_shape.dimensions(), strides));
-    }
+  HloInstruction* result = state_.partitioner->AllGatherShards(
+      state_.b, broadcast, sharding(), state_.next_channel_id, ag_dims,
+      state_.collective_ops_creator);
+  // We create all-gather instructions, which may contain padding. Add a slice
+  // to remove the padding.
+  if (!ShapeUtil::Compatible(result->shape(), ag_result_shape)) {
+    std::vector<int64_t> start_indices(ag_result_shape.dimensions().size(), 0);
+    std::vector<int64_t> strides(ag_result_shape.dimensions().size(), 1);
+    result = state_.b->AddInstruction(
+        HloInstruction::CreateSlice(ag_result_shape, result, start_indices,
+                                    ag_result_shape.dimensions(), strides));
   }
 
   if (!dus_ar_dims.empty()) {
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
index fa880889ce9bf9..9e2118ef7d8e16 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
@@ -85,13 +85,10 @@ class SpmdPartitioningTest
   absl::StatusOr<std::unique_ptr<HloModule>> PartitionComputation(
       absl::string_view hlo_module, int64_t num_devices,
       SpmdPartitionerOptions options = SpmdPartitionerOptions(),
-      bool use_all_gather = true, bool enable_enzyme_opt = false) {
+      bool enable_enzyme_opt = false) {
     options.allow_module_signature_change = true;
     auto collective_ops_creator =
         GetDefaultCollectiveOpsCreator(num_devices, /*num_replicas=*/1);
-    if (!use_all_gather) {
-      collective_ops_creator.create_cross_partition_all_gather = nullptr;
-    }
 
     HloModuleConfig config = GetModuleConfigForTest();
     config.set_use_spmd_partitioning(true);
@@ -8100,10 +8097,10 @@ TEST_P(SpmdPartitioningTest, DynamicUpdateSliceOfConstantInRange) {
       dynamic-update-slice(%input, %update, %c59, %c27),
       sharding={devices=[1,2]<=[2]}
   })";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module, PartitionComputation(
-                       hlo_string, /*num_devices=*/2, SpmdPartitionerOptions(),
-                       /*use_all_gather=*/true, /*enable_enzyme_opt=*/true));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2,
+                                               SpmdPartitionerOptions(),
+                                               /*enable_enzyme_opt=*/true));
   const auto root = module->entry_computation()->root_instruction();
   auto sharded_input = AllOf(op::Parameter(0), op::Shape("s32[128,32]"));
   auto sharded_update = AllOf(op::Parameter(1), op::Shape("s32[10,5]"));
@@ -8154,10 +8151,10 @@ TEST_P(SpmdPartitioningTest, DynamicUpdateSliceOfConstantOutOfRange) {
       sharding={devices=[1,2]<=[2]}
   })";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module, PartitionComputation(
-                       hlo_string, /*num_devices=*/2, SpmdPartitionerOptions(),
-                       /*use_all_gather=*/true, /*enable_enzyme_opt=*/true));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2,
+                                               SpmdPartitionerOptions(),
+                                               /*enable_enzyme_opt=*/true));
   const auto root = module->entry_computation()->root_instruction();
   auto sharded_input = AllOf(op::Parameter(0), op::Shape("s32[128,32]"));
   auto sharded_update = AllOf(op::Parameter(1), op::Shape("s32[128,10]"));
@@ -8187,10 +8184,10 @@ TEST_P(SpmdPartitioningTest, DynamicUpdateSliceSingleDimensionWithEnzymeOpt) {
         sharding={devices=[4]<=[4]}
     })";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module, PartitionComputation(
-                       hlo_string, /*num_devices=*/4, SpmdPartitionerOptions(),
-                       /*use_all_gather=*/true, /*enable_enzyme_opt=*/true));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/4,
+                                               SpmdPartitionerOptions(),
+                                               /*enable_enzyme_opt=*/true));
   const auto root = module->entry_computation()->root_instruction();
   auto sharded_input = AllOf(op::Parameter(0), op::Shape("s32[4]"));
   auto sharded_update = AllOf(op::Parameter(1), op::Shape("s32[2]"));
@@ -12261,29 +12258,22 @@ TEST_P(SpmdPartitioningTest,
 HloModule module
 
 ENTRY %module {
-  %parameter.0 = s32[8,4,2,2]{3,2,1,0} parameter(0),
-    sharding={devices=[2,2,1,1,2]<=[8] last_tile_dim_replicate}
-  %parameter.1 = s32[2,8,4]{2,1,0} parameter(1),
-    sharding={devices=[1,2,1,4]<=[8] last_tile_dim_replicate}
-  ROOT %gather.20 = s32[8,4,2,2]{3,2,1,0} gather(
-    s32[8,4,2,2]{3,2,1,0} %parameter.0,
-    s32[2,8,4]{2,1,0} %parameter.1), offset_dims={2,3},
-    collapsed_slice_dims={0,1}, start_index_map={0,1}, index_vector_dim=0,
-    slice_sizes={1,1,2,2}, sharding={replicated}
+  %operand = s32[18,14,2,2] parameter(0), sharding={devices=[2,2,1,1,2]<=[8] last_tile_dim_replicate}
+  %indices = s32[2,8,4] parameter(1), sharding={devices=[1,2,1,4]<=[8] last_tile_dim_replicate}
+  ROOT %gather.20 = s32[8,4,2,2] gather(%operand, %indices),
+    offset_dims={2,3}, collapsed_slice_dims={0,1}, start_index_map={0,1},
+    index_vector_dim=0, slice_sizes={1,1,2,2}, sharding={replicated}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      PartitionComputation(hlo_string, /*num_devices=*/8,
-                           SpmdPartitionerOptions(), /*use_all_gather=*/false));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
   VLOG(1) << module->ToString();
   const auto root = module->entry_computation()->root_instruction();
-  auto operand = AllOf(op::Shape("s32[4,2,2,2]"), op::Parameter());
+  auto operand = AllOf(op::Shape("s32[9,7,2,2]"), op::Parameter());
   auto indices = AllOf(op::Shape("s32[2,4,4]"), op::Subtract());
   auto gather = AllOf(op::Shape("s32[4,4,2,2]"), op::Gather(operand, indices));
   EXPECT_THAT(
-      root, op::AllReduce(op::DynamicUpdateSlice(
-                _, op::AllReduce(op::AllReduce(op::Select(_, _, gather))), _, _,
-                _, _)));
+      root,
+      op::AllGather(op::AllReduce(op::AllReduce(op::Select(_, _, gather)))));
 }
 
 TEST_P(SpmdPartitioningTest,
@@ -13249,12 +13239,10 @@ ENTRY %module {
     update_window_dims={2,3},
     inserted_window_dims={0,1},
     scatter_dims_to_operand_dims={0,1},
-    index_vector_dim=0, sharding={replicated}
+    index_vector_dim=0, sharding={devices=[2,2,2,1]<=[8]}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      PartitionComputation(hlo_string, /*num_devices=*/8,
-                           SpmdPartitionerOptions(), /*use_all_gather=*/false));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
   VLOG(1) << module->ToString();
   const auto root = module->entry_computation()->root_instruction();
   auto operand = AllOf(op::Shape("s32[8,4,1,2]"), op::Select());
@@ -13262,8 +13250,8 @@ ENTRY %module {
   auto update = AllOf(op::Shape("s32[4,2,1,2]"), op::DynamicSlice());
   auto scatter =
       AllOf(op::Shape("s32[8,4,1,2]"), op::Scatter(operand, indices, update));
-  EXPECT_THAT(root, op::AllReduce(op::DynamicUpdateSlice(
-                        _, op::AllReduce(op::AllReduce(scatter)), _, _, _, _)));
+  EXPECT_THAT(root, op::DynamicSlice(op::AllReduce(op::AllReduce(scatter)), _,
+                                     _, _, _));
 }
 
 TEST_P(SpmdPartitioningTest,
@@ -14935,10 +14923,10 @@ ENTRY entry {
   ROOT c = bf16[16,224,224,384]{3,2,1,0} copy(dynamic-update-slice.128), sharding={devices=[2,2,2,1]<=[8]}
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module, PartitionComputation(
-                       hlo_string, /*num_devices=*/8, SpmdPartitionerOptions(),
-                       /*use_all_gather=*/true, /*enable_enzyme_opt=*/true));
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8,
+                                               SpmdPartitionerOptions(),
+                                               /*enable_enzyme_opt=*/true));
 
   XLA_VLOG_LINES(1, module->ToString());
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -16379,27 +16367,6 @@ ENTRY entry {
   EXPECT_EQ(FindInstruction(module.get(), HloOpcode::kAllReduce), nullptr);
 }
 
-TEST_P(SpmdPartitioningTest, UnreducedPopulation) {
-  absl::string_view hlo_string = R"(
-HloModule module
-
-ENTRY entry {
-  constant = s32[2,4]{1,0} constant({{1,1,1,1},{1,1,1,1}}), sharding={maximal device=0}
-  a = s32[2,4]{1,0} parameter(0), sharding={devices=[1,2]0,1}
-  add = s32[2,4]{1,0} add(constant, a), sharding={unreduced}
-  ROOT copy = s32[2,4]{1,0} copy(%add), sharding={unreduced}
-})";
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto module,
-      PartitionComputation(hlo_string, /*num_devices=*/2,
-                           SpmdPartitionerOptions(), /*use_all_gather=*/false));
-  VLOG(1) << module->ToString();
-  // Check that we use all-reduce to reshard the operands of the add in spite
-  // that the `add` has unreduced axes.
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Copy(op::Add(op::AllReduce(), op::AllReduce())));
-}
-
 TEST_P(SpmdPartitioningTest, UnreducedParam) {
   absl::string_view hlo_string = R"(
 HloModule module
diff --git a/third_party/xla/xla/tools/hlo_control_flow_flattening_test.cc b/third_party/xla/xla/tools/hlo_control_flow_flattening_test.cc
index 11bec92ba9a32d..4fd83f5f10c84f 100644
--- a/third_party/xla/xla/tools/hlo_control_flow_flattening_test.cc
+++ b/third_party/xla/xla/tools/hlo_control_flow_flattening_test.cc
@@ -49,7 +49,6 @@ class HloControlFlowFlatteningTest : public HloHardwareIndependentTestBase {
     spmd::SpmdPartitionerOptions options;
     auto collective_ops_creator =
         spmd::GetDefaultCollectiveOpsCreator(num_devices, /*num_replicas=*/1);
-    collective_ops_creator.create_cross_partition_all_gather = nullptr;
 
     HloModuleConfig config = GetModuleConfigForTest();
     config.set_use_spmd_partitioning(true);

From ea93d433c3a1a99c918292e9ea846727413b5e2f Mon Sep 17 00:00:00 2001
From: Matthias Guenther <mrguenther@google.com>
Date: Wed, 10 Dec 2025 12:15:39 -0800
Subject: [PATCH 138/753] Integrate StableHLO at openxla/stablehlo@1ef9e390

PiperOrigin-RevId: 842827681
---
 .../xla/third_party/stablehlo/temporary.patch | 914 +-----------------
 .../xla/third_party/stablehlo/workspace.bzl   |   4 +-
 2 files changed, 14 insertions(+), 904 deletions(-)

diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index 5e49416bc43a26..ca34dee010d16d 100755
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -34,41 +34,6 @@ diff --ruN a/stablehlo/BUILD.bazel b/stablehlo/BUILD.bazel
  gentbl_cc_library(
      name = "func_builder_inc",
      tbl_outs = {
-diff --ruN a/stablehlo/stablehlo/conversions/linalg/tests/miscellaneous.mlir b/stablehlo/stablehlo/conversions/linalg/tests/miscellaneous.mlir
---- stablehlo/stablehlo/conversions/linalg/tests/miscellaneous.mlir
-+++ stablehlo/stablehlo/conversions/linalg/tests/miscellaneous.mlir
-@@ -913,6 +913,15 @@
- 
- // -----
- 
-+// CHECK-LABEL: func @reshape_0D_0D
-+func.func @reshape_0D_0D(%arg0: tensor<i32>) ->tensor<i32> {
-+  %0 = "stablehlo.reshape"(%arg0) : (tensor<i32>) -> tensor<i32>
-+  func.return %0 : tensor<i32>
-+}
-+// CHECK: return %arg0 : tensor<i32>
-+
-+// -----
-+
- // CHECK-LABEL: func @reshape_0D_1D_unsigned
- // CHECK-SAME:    %[[ARG_UNSIGNED:[a-zA-Z0-9_]*]]
- func.func @reshape_0D_1D_unsigned(%arg0: tensor<ui32>) -> tensor<1xui32> {
-diff --ruN a/stablehlo/stablehlo/conversions/linalg/transforms/StablehloLegalizeToLinalg.cpp b/stablehlo/stablehlo/conversions/linalg/transforms/StablehloLegalizeToLinalg.cpp
---- stablehlo/stablehlo/conversions/linalg/transforms/StablehloLegalizeToLinalg.cpp
-+++ stablehlo/stablehlo/conversions/linalg/transforms/StablehloLegalizeToLinalg.cpp
-@@ -1103,6 +1103,12 @@
- 
-     if (!resultType.hasStaticShape()) return failure();
- 
-+    // If the reshape is a no-op simply fold it away.
-+    if (resultType == operandType) {
-+      rewriter.replaceOp(reshapeOp, operand);
-+      return success();
-+    }
-+
-     // If any of the output dimensions is 0, the tensor has no elements. In that
-     // case, we can just replace the reshape with an empty op.
-     if (llvm::is_contained(resultType.getShape(), 0)) {
 diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/Base.cpp
 --- stablehlo/stablehlo/dialect/Base.cpp
 +++ stablehlo/stablehlo/dialect/Base.cpp
@@ -130,91 +95,6 @@ diff --ruN a/stablehlo/stablehlo/dialect/ChloOps.cpp b/stablehlo/stablehlo/diale
    return success();
  }
  
-diff --ruN a/stablehlo/stablehlo/dialect/StablehloOps.cpp b/stablehlo/stablehlo/dialect/StablehloOps.cpp
---- stablehlo/stablehlo/dialect/StablehloOps.cpp
-+++ stablehlo/stablehlo/dialect/StablehloOps.cpp
-@@ -4024,6 +4024,61 @@
-   ReturnOp::create(*builder, loc, compare);
- }
- 
-+void buildMaxAndArgmaxBody(Type elementType, Type indices_type, Region& body,
-+                           OpBuilder& builder) {
-+  OpBuilder::InsertionGuard guard(builder);
-+  if (body.getBlocks().empty()) builder.createBlock(&body);
-+  Block* block = &body.getBlocks().front();
-+
-+  Type value_type = RankedTensorType::get(/*shape=*/{}, elementType);
-+  Type index_type = RankedTensorType::get(/*shape=*/{}, indices_type);
-+  Location loc = body.getLoc();
-+  block->addArguments({value_type, index_type}, {loc, loc});
-+  block->addArguments({value_type, index_type}, {loc, loc});
-+
-+  auto lhs_value = block->getArgument(0);
-+  auto lhs_index = block->getArgument(1);
-+  auto rhs_value = block->getArgument(2);
-+  auto rhs_index = block->getArgument(3);
-+
-+  auto gt_pred =
-+      builder
-+          .create<CompareOp>(loc, lhs_value, rhs_value, ComparisonDirection::GT)
-+          .getResult();
-+
-+  // Tie-Breaker Condition: (lhs == rhs) AND (lhs_index < rhs_index)
-+  auto eq_pred =
-+      builder
-+          .create<CompareOp>(loc, lhs_value, rhs_value, ComparisonDirection::EQ)
-+          .getResult();
-+  auto lt_index_pred =
-+      builder
-+          .create<CompareOp>(loc, lhs_index, rhs_index, ComparisonDirection::LT)
-+          .getResult();
-+  auto tie_breaker_condition =
-+      builder.create<AndOp>(loc, eq_pred, lt_index_pred).getResult();
-+
-+  // Final lhs Selection Condition: (gt_pred) OR (tie_breaker_condition)
-+  auto final_lhs_condition =
-+      builder.create<OrOp>(loc, gt_pred, tie_breaker_condition).getResult();
-+
-+  // Select Final Results:
-+  // if final_lhs_condition:
-+  //     return (lhs_value, lhs_index)
-+  // else:
-+  //     return (rhs_value, rhs_index)
-+  auto selected_value = builder
-+                            .create<stablehlo::SelectOp>(
-+                                loc, final_lhs_condition, lhs_value, rhs_value)
-+                            .getResult();
-+  auto selected_index = builder
-+                            .create<stablehlo::SelectOp>(
-+                                loc, final_lhs_condition, lhs_index, rhs_index)
-+                            .getResult();
-+  builder.create<stablehlo::ReturnOp>(
-+      loc, mlir::ValueRange{selected_value, selected_index});
-+}
-+
- SortOp createSortOp(PatternRewriter* rewriter, const Location& loc,
-                     const llvm::ArrayRef<Value>& operands,
-                     const llvm::ArrayRef<Type>& elementTypes, int64_t dimension,
-diff --ruN a/stablehlo/stablehlo/dialect/StablehloOps.h b/stablehlo/stablehlo/dialect/StablehloOps.h
---- stablehlo/stablehlo/dialect/StablehloOps.h
-+++ stablehlo/stablehlo/dialect/StablehloOps.h
-@@ -204,6 +204,16 @@
-   stablehlo::ReturnOp::create(builder, loc, reducer.getResult());
- }
- 
-+// Builds the region `body` for a max-and-argmax computation, suitable for
-+// use in ReduceWindow operations with varidic value and index inputs.
-+// It creates four block arguments (val1, idx1, val2, idx2) of `elementType` and
-+// `indices_type`, and returns two results: result_val and result_idx.
-+// result_val is the maximum of val1 and val2, and result_idx is the index
-+// corresponding to result_val. If val1 >= val2, idx1 is returned, otherwise
-+// idx2 is returned.
-+void buildMaxAndArgmaxBody(Type elementType, Type indices_type, Region& body,
-+                           OpBuilder& builder);
-+
- // PrecisionConfigAttr is a constraint attribute on ArrayAttrs.
- // Create this class to allow for building this attr similar to other
- // attributes.
 diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt b/stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt
 --- stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt
 +++ stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt
@@ -414,28 +294,6 @@ diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilderTest.cpp b/
 +
 +}  // namespace chlo
 +}  // namespace mlir
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/MlirBuilderTblgen.cpp b/stablehlo/stablehlo/integrations/cpp/builder/MlirBuilderTblgen.cpp
---- stablehlo/stablehlo/integrations/cpp/builder/MlirBuilderTblgen.cpp
-+++ stablehlo/stablehlo/integrations/cpp/builder/MlirBuilderTblgen.cpp
-@@ -203,6 +203,9 @@
-   // If the op does not support type inference, return a default output shape
-   // parameter that must be injected.
-   MethodParameter getDefaultOutputShape() {
-+    if (hasSingleVariadicResult(getOp()) || getOp().getNumResults() > 1) {
-+      return MethodParameter("TypeRange", "resultTypes");
-+    }
-     return MethodParameter("Type", "resultType");
-   }
- 
-@@ -276,7 +279,7 @@
-     BuilderParams params = getOpBuilderParameters();
-     SmallVector<MethodParameter> parameters;
-     if (params.outputShape.has_value()) {
--      parameters.push_back(getDefaultOutputShape());
-+      parameters.push_back(params.outputShape.value());
-     }
-     for (auto& operand : params.operands) {
-       parameters.push_back(
 diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp b/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp
 --- stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp
 +++ stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp
@@ -447,61 +305,6 @@ diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp b
    if (isa<ComplexType>(inputType.getElementType()) &&
        !isa<ComplexType>(resultElementType)) {
      operand = stablehlo::Real(operand);
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilderTest.cpp b/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilderTest.cpp
---- stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilderTest.cpp
-+++ stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilderTest.cpp
-@@ -17,12 +17,12 @@
- #include <cstdint>
- #include <string>
- 
--#include "gtest/gtest.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/BuiltinOps.h"
- #include "mlir/IR/DialectRegistry.h"
- #include "mlir/IR/MLIRContext.h"
- #include "mlir/IR/OwningOpRef.h"
-+#include "mlir/IR/Types.h"
- #include "mlir/IR/Verifier.h"
- #include "mlir/Support/DebugStringHelper.h"
- #include "mlir/Support/LLVM.h"
-@@ -32,6 +32,7 @@
- #include "stablehlo/integrations/cpp/builder/FuncBuilder.h"
- #include "stablehlo/integrations/cpp/builder/MlirBuilder.h"
- #include "stablehlo/integrations/cpp/builder/StablehloBuilder.h"
-+#include "gtest/gtest.h"
- 
- namespace mlir {
- namespace stablehlo {
-@@ -1517,6 +1518,29 @@
-   EXPECT_EQ(expected, debugString(*module));
- }
- 
-+TEST(MlirBuilderTest, VariadicResult) {
-+  std::string expected = R"mlir(module {
-+  func.func @main() -> (tensor<f64>, tensor<f64>) {
-+    %0:2 = stablehlo.custom_call @two_outs() : () -> (tensor<f64>, tensor<f64>)
-+    return %0#0, %0#1 : tensor<f64>, tensor<f64>
-+  }
-+})mlir";
-+
-+  StablehloModuleBuilder mb;
-+  {
-+    Location funcLoc = fileLineColLoc(mb->getContext(), "main.mlir", 1, 1);
-+    func::FunctionBuilder fb(mb.get(), "main", funcLoc);
-+    auto type = makeTensorType(fb.getContext(), {}, ElementType::F64);
-+    SmallVector<Type> resultTypes = {type, type};
-+    // Pass double data with i64 type.
-+    auto cc = stablehlo::CustomCall(fb, resultTypes, {}, "two_outs");
-+    func::Return(fb, {cc});
-+  }
-+
-+  OwningOpRef<ModuleOp> module = mb->build();
-+  EXPECT_EQ(expected, debugString(*module));
-+}
-+
- ////////
- // Custom Attribute Tests
- ////////
 diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir b/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
 --- stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
 +++ stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
@@ -888,100 +691,6 @@ diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo_broadcast
      -> tensor<4xf32> {
    // CHECK-NOT: chlo.broadcast_zeta
    // CHECK-NOT: chlo.zeta
-diff --ruN a/stablehlo/stablehlo/tests/ops_broadcasting.mlir b/stablehlo/stablehlo/tests/ops_broadcasting.mlir
---- stablehlo/stablehlo/tests/ops_broadcasting.mlir
-+++ stablehlo/stablehlo/tests/ops_broadcasting.mlir
-@@ -92,6 +92,8 @@
- // [<=10] x [1] => [<=10]
- // [1] x [<=10] => [<=10]
- // [1] x [1, <=10, 1] => [1, <=10, 1]
-+// [5] x [10, 1] => [10, 5]
-+// [5] x [<=10, 1] => [<=10, 5]
- 
- 
- // [1] x [1] => [1]
-@@ -232,6 +234,38 @@
- 
- // -----
- 
-+// [5] x [10, 1] => [10, 5]
-+// CHECK-LABEL: func @tensor_broadcast_5_x_10_1
-+func.func @tensor_broadcast_5_x_10_1(%arg0: tensor<5xf64>, %arg1: tensor<10x1xf64>) -> !stablehlo.token {
-+  // CHECK: %[[LHS_BCAST:.+]] = stablehlo.broadcast_in_dim %arg0, dims = [1] : (tensor<5xf64>) -> tensor<10x5xf64>
-+  // CHECK: %[[RHS_BCAST:.+]] = stablehlo.broadcast_in_dim %arg1, dims = [0, 1] : (tensor<10x1xf64>) -> tensor<10x5xf64>
-+  // CHECK-NEXT: stablehlo.custom_call @numpy_broadcasted(%[[LHS_BCAST]], %[[RHS_BCAST]])
-+  %0 = "hlo_test_broadcast.numpy_broadcast"(%arg0, %arg1) : (tensor<5xf64>, tensor<10x1xf64>) -> !stablehlo.token
-+  return %0 : !stablehlo.token
-+}
-+
-+// -----
-+
-+// [<=10, 1] x [5] => [<=10, 5]
-+// CHECK-LABEL: func @tensor_broadcast_b5_1_x_5
-+func.func @tensor_broadcast_b5_1_x_5(
-+  %arg0: tensor<?x1xf64, #stablehlo.bounds<10, ?>>,
-+  %arg1: tensor<5xf64>
-+) -> !stablehlo.token {
-+  // CHECK: %[[LHS_BCAST:.+]] = stablehlo.broadcast_in_dim %arg0, dims = [0, 1] : (tensor<?x1xf64, #stablehlo.bounds<10, ?>>) -> tensor<?x5xf64, #stablehlo.bounds<10, ?>>
-+  // CHECK: %[[RHS_BCAST_STATIC:.+]] = stablehlo.broadcast_in_dim %arg1, dims = [1] : (tensor<5xf64>) -> tensor<10x5xf64>
-+  // CHECK: %[[ARG0_DIM0_SIZE:.+]] = stablehlo.get_dimension_size %arg0, dim = 0
-+  // CHECK: %[[RHS_BCAST_DYN:.+]] = stablehlo.set_dimension_size %[[RHS_BCAST_STATIC]], %[[ARG0_DIM0_SIZE]], dim = 0
-+  // CHECK-NEXT: stablehlo.custom_call @numpy_broadcasted(%[[LHS_BCAST]], %[[RHS_BCAST_DYN]])
-+  %0 = "hlo_test_broadcast.numpy_broadcast"(%arg0, %arg1) : (
-+    tensor<?x1xf64, #stablehlo.bounds<10, ?>>,
-+    tensor<5xf64>
-+  ) -> !stablehlo.token
-+  return %0 : !stablehlo.token
-+}
-+
-+// -----
-+
- //////
- // N-ary broadcast tests.
- 
-@@ -247,3 +281,42 @@
-   return %0 : !stablehlo.token
- }
- 
-+// -----
-+
-+/////
-+// Broadcast errors
-+
-+// [10] x [5] => error
-+// expected-error @+1 {{incompatible shapes for broadcasting 10 and 5}}
-+func.func @broadcast_error_10_x_5(%arg0: tensor<10xf64>, %arg1: tensor<5xf64>) -> !stablehlo.token {
-+  %0 = "hlo_test_broadcast.numpy_broadcast"(%arg0, %arg1) : (tensor<10xf64>, tensor<5xf64>) -> !stablehlo.token
-+  return %0 : !stablehlo.token
-+}
-+
-+// -----
-+
-+// [10] x [<=10] => error
-+// expected-error @+1 {{cannot mix bounded and static dimensions in broadcast}}
-+func.func @broadcast_error_10_x_b10(%arg0: tensor<10xf64>, %arg1: tensor<?xf64, #stablehlo.bounds<10>>) -> !stablehlo.token {
-+  %0 = "hlo_test_broadcast.numpy_broadcast"(%arg0, %arg1) : (tensor<10xf64>, tensor<?xf64, #stablehlo.bounds<10>>) -> !stablehlo.token
-+  return %0 : !stablehlo.token
-+}
-+
-+// -----
-+
-+// [10] x not_tensor => error
-+func.func @broadcast_error_not_tensor(%arg0: tensor<10xf64>, %arg1: !stablehlo.token) -> !stablehlo.token {
-+  // expected-error @+1 {{expected ranked tensor type for broadcast inputs}}
-+  %0 = "hlo_test_broadcast.numpy_broadcast"(%arg0, %arg1) : (tensor<10xf64>, !stablehlo.token) -> !stablehlo.token
-+  return %0 : !stablehlo.token
-+}
-+
-+// -----
-+
-+// [] => error
-+func.func @broadcast_error_empty() -> !stablehlo.token {
-+  // expected-error @+1 {{requires at least one operand to broadcast}}
-+  %0 = "hlo_test_broadcast.numpy_broadcast"() : () -> !stablehlo.token
-+  return %0 : !stablehlo.token
-+}
-+
 diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir b/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
 --- stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
 +++ stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
@@ -994,104 +703,6 @@ diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir b/stablehlo/
    func.return %0 : tensor<16x16xf32>
  }
  
-diff --ruN a/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir b/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
---- stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
-+++ stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
-@@ -47,8 +47,8 @@
- ////////
- // CaseOp
- 
--// CHECK-LABEL: func.func @case_fold_constant_branch_index
--func.func @case_fold_constant_branch_index(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
-+// CHECK-LABEL: func.func @case_fold_constant_branch_index_int_result
-+func.func @case_fold_constant_branch_index_int_result(%arg0: tensor<i32>, %arg1: tensor<i32>, %arg2: tensor<i32>) -> tensor<i32> {
-   // CHECK-NEXT: {{(^ *|func\.)}}return %arg1
-   // CHECK-NOT:  stablehlo.case
-   %branch_index = stablehlo.constant dense<1> : tensor<i32>
-@@ -60,6 +60,47 @@
-     stablehlo.return %arg2 : tensor<i32>
-   }) : (tensor<i32>) -> tensor<i32>
-   func.return %result: tensor<i32>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func.func @case_fold_constant_branch_index_complex_result
-+func.func @case_fold_constant_branch_index_complex_result(%arg0: tensor<complex<f32>>, %arg1: tensor<complex<f32>>, %arg2: tensor<complex<f32>>) -> tensor<complex<f32>> {
-+  // CHECK-NEXT: {{(^ *|func\.)}}return %arg1
-+  // CHECK-NOT:  stablehlo.case
-+  %branch_index = stablehlo.constant dense<1> : tensor<i32>
-+  %result = "stablehlo.case"(%branch_index) ({
-+    stablehlo.return %arg0 : tensor<complex<f32>>
-+  }, {
-+    stablehlo.return %arg1 : tensor<complex<f32>>
-+  }, {
-+    stablehlo.return %arg2 : tensor<complex<f32>>
-+  }) : (tensor<i32>) -> tensor<complex<f32>>
-+  func.return %result: tensor<complex<f32>>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func.func @case_fold_inline_call_tf_function
-+func.func @case_fold_inline_call_tf_function(%arg0: !stablehlo.token {jax.token = true}, %arg1: tensor<16xi32>, %arg2: tensor<16xi64>) -> (!stablehlo.token {jax.token = true}, tensor<16xi32> {jax.result_info = "result"}) {
-+  // CHECK: [[RESULT_TOKEN:%.+]] = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1, %arg2)
-+  // CHECK: [[UNUSED_TOKEN:%.+]] = {{"?}}stablehlo.case{{"?}}(
-+  // CHECK: return [[RESULT_TOKEN]], %arg1
-+  %c = stablehlo.constant dense<1> : tensor<i32>
-+  %c_0 = stablehlo.constant dense<0> : tensor<i32>
-+  %0 = "stablehlo.case"(%c_0) ({
-+    stablehlo.return %c_0 : tensor<i32>
-+  }, {
-+    stablehlo.return %c : tensor<i32>
-+  }) : (tensor<i32>) -> tensor<i32>
-+  %1 = "stablehlo.case"(%0) ({
-+    %2 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1, %arg2) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_index = 0 : i64, has_token_input_output = true}} : (!stablehlo.token, tensor<16xi32>, tensor<16xi64>) -> !stablehlo.token
-+    stablehlo.return %2 : !stablehlo.token
-+  }, {
-+    %2 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1, %arg2) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_index = 1 : i64, has_token_input_output = true}} : (!stablehlo.token, tensor<16xi32>, tensor<16xi64>) -> !stablehlo.token
-+    stablehlo.return %2 : !stablehlo.token
-+  }) : (tensor<i32>) -> !stablehlo.token
-+  return %1, %arg1 : !stablehlo.token, tensor<16xi32>
- }
- 
- // -----
-diff --ruN a/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_simplification.mlir b/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_simplification.mlir
---- stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_simplification.mlir
-+++ stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_simplification.mlir
-@@ -128,6 +128,16 @@
-   return %7 : tensor<3x2x3x3xi32>
- }
- 
-+// CHECK-LABEL: func.func @broadcast_in_dim_nested_bounded
-+func.func @broadcast_in_dim_nested_bounded(%arg0: tensor<3x3xi32>, %arg1: tensor<i32>) -> tensor<3x2x?x3xi32, #stablehlo.bounds<?, ?, 3, ?>> {
-+  // CHECK: [[SDS:%.+]] = stablehlo.set_dimension_size
-+  // CHECK-NEXT: stablehlo.broadcast_in_dim [[SDS]], dims = [2, 0] : (tensor<?x3xi32, #stablehlo.bounds<3, ?>>) -> tensor<3x2x?x3xi32, #stablehlo.bounds<?, ?, 3, ?>>
-+  %0 = stablehlo.set_dimension_size %arg0, %arg1, dim = 0 : (tensor<3x3xi32>, tensor<i32>) -> tensor<?x3xi32, #stablehlo.bounds<3, ?>>
-+  %1 = stablehlo.broadcast_in_dim %0, dims = [1, 0] : (tensor<?x3xi32, #stablehlo.bounds<3, ?>>) -> tensor<3x?x2xi32, #stablehlo.bounds<?, 3, ?>>
-+  %2 = stablehlo.broadcast_in_dim %1, dims = [0, 2, 1] : (tensor<3x?x2xi32, #stablehlo.bounds<?, 3, ?>>) -> tensor<3x2x?x3xi32, #stablehlo.bounds<?, ?, 3, ?>>
-+  return %2 : tensor<3x2x?x3xi32, #stablehlo.bounds<?, ?, 3, ?>>
-+}
-+
- // CHECK-LABEL: func.func @broadcast_in_dim_reshape
- // CHECK-SAME:   ([[ARG0:%.+]]: tensor<3x6xi32>)
- func.func @broadcast_in_dim_reshape(%arg0: tensor<3x6xi32>)
-@@ -140,6 +150,15 @@
- 
-   // CHECK-NEXT: return [[R0]], [[R5]]
-   return %0, %5 : tensor<1x3x6xi32>, tensor<3x6x1xi32>
-+}
-+
-+// CHECK-LABEL: func.func @broadcast_in_dim_bounded_no_reshape
-+func.func @broadcast_in_dim_bounded_no_reshape(%arg0: tensor<20xf32>, %arg1: tensor<i32>) -> tensor<1x?xf32, #stablehlo.bounds<?, 20>> {
-+  %0 = stablehlo.set_dimension_size %arg0, %arg1, dim = 0 : (tensor<20xf32>, tensor<i32>) -> tensor<?xf32, #stablehlo.bounds<20>>
-+  // CHECK: stablehlo.set_dimension_size
-+  // CHECK-NEXT: stablehlo.broadcast_in_dim
-+  %1 = stablehlo.broadcast_in_dim %0, dims = [1] : (tensor<?xf32, #stablehlo.bounds<20>>) -> tensor<1x?xf32, #stablehlo.bounds<?, 20>>
-+  return %1 : tensor<1x?xf32, #stablehlo.bounds<?, 20>>
- }
- 
- // CHECK-LABEL: func.func @broadcast_in_dim_prefer_nested_reshape
 diff --ruN a/stablehlo/stablehlo/transforms/CMakeLists.txt b/stablehlo/stablehlo/transforms/CMakeLists.txt
 --- stablehlo/stablehlo/transforms/CMakeLists.txt
 +++ stablehlo/stablehlo/transforms/CMakeLists.txt
@@ -1268,7 +879,7 @@ diff --ruN a/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp b/stable
 diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp b/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
 --- stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
 +++ stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
-@@ -59,29 +59,11 @@
+@@ -59,26 +59,6 @@
    };
  }
  
@@ -1276,7 +887,8 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp b/sta
 -  // Get tensor type
 -  mlir::RankedTensorType tensor_type = dyn_cast<RankedTensorType>(op.getType());
 -  if (!tensor_type)
--    return emitError(op.getLoc(), "expected ranked tensor type");
+-    return emitError(op.getLoc(),
+-                     "expected ranked tensor type for broadcast inputs");
 -
 -  auto encoding =
 -      mlir::dyn_cast_if_present<mlir::stablehlo::TypeExtensionsAttr>(
@@ -1291,40 +903,12 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp b/sta
 -  return dimensions;
 -}
 -
--FailureOr<Dimensions> getNumpyBroadcastShapeWithBounds(const Dimensions& a,
-+FailureOr<Dimensions> getNumpyBroadcastShapeWithBounds(Value op,
-+                                                       const Dimensions& a,
+ FailureOr<Dimensions> getNumpyBroadcastShapeWithBounds(Value op,
+                                                        const Dimensions& a,
                                                         const Dimensions& b) {
-   LLVM_DEBUG(llvm::dbgs() << "[getNumpyBroadcastShapeWithBounds] inputs: "
--                          << toString(a) << " * " << toString(b));
-+                          << toString(a) << " * " << toString(b) << "\n");
-   size_t max_rank = std::max(a.size(), b.size());
-   Dimensions result(max_rank);
- 
-@@ -110,14 +92,14 @@
- 
-     // If both LHS and RHS are not 1, dim size must match.
-     if (dim_a.size != dim_b.size) {
--      return emitError(a[a_idx].boundOp.value().getLoc(),
--                       "incompatible shapes for broadcasting ")
-+      // FIXME
-+      return emitError(op.getLoc(), "incompatible shapes for broadcasting ")
-              << dim_a.size << " and " << dim_b.size;
-     }
- 
-     // If bounded both must be bounded
-     if (dim_a.boundOp.has_value() != dim_b.boundOp.has_value()) {
--      return emitError(a[a_idx].boundOp.value().getLoc(),
-+      return emitError(op.getLoc(),
-                        "cannot mix bounded and static dimensions in broadcast");
-     }
- 
-@@ -126,8 +108,30 @@
-   }
- 
+@@ -130,6 +110,28 @@
    LLVM_DEBUG(llvm::dbgs() << "[getNumpyBroadcastShapeWithBounds] result: "
--                          << toString(result));
-+                          << toString(result) << "\n");
+                           << toString(result) << "\n");
    return result;
 +}
 +
@@ -1351,170 +935,18 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp b/sta
  }
  
  mlir::RankedTensorType getRankedTensorType(const Dimensions& dims,
-@@ -153,10 +157,12 @@
+@@ -155,7 +157,6 @@
    return mlir::RankedTensorType::get(shape, element_type, encoding);
  }
  
 -}  // namespace
--
--FailureOr<Dimensions> getNumpyBroadcastShape(ArrayRef<Value> ops) {
--  if (ops.empty()) return failure();
-+
-+FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
-+                                             ArrayRef<Value> ops) {
-+  if (ops.empty())
-+    return emitError(builder.getInsertionPoint()->getLoc(),
-+                     "requires at least one operand to broadcast");
- 
-   Value first = ops[0];
-   auto bcastShapeOrFail = getDimensions(first);
-@@ -168,7 +174,7 @@
-     auto dims = getDimensions(currOp);
-     if (failed(dims)) return failure();
-     auto currBcastShapeOrFail =
--        getNumpyBroadcastShapeWithBounds(bcastShape, *dims);
-+        getNumpyBroadcastShapeWithBounds(currOp, bcastShape, *dims);
-     if (failed(currBcastShapeOrFail)) return failure();
-     bcastShape = std::move(*currBcastShapeOrFail);
-   }
-@@ -192,7 +198,7 @@
- FailureOr<SmallVector<Value>> numpyBroadcastIfNeeded(OpBuilder& builder,
-                                                      ArrayRef<Value> operands) {
-   // Figure out the broadcast shape
--  auto bcastShapeOrFail = getNumpyBroadcastShape(operands);
-+  auto bcastShapeOrFail = getNumpyBroadcastShape(builder, operands);
-   if (failed(bcastShapeOrFail)) return failure();
-   Dimensions bcastShape = std::move(*bcastShapeOrFail);
- 
-@@ -208,35 +214,34 @@
- 
- FailureOr<Value> numpyBroadcastIfNeeded(OpBuilder& builder, Value input,
-                                         const Dimensions& shape) {
--  LLVM_DEBUG(llvm::dbgs() << "[BroadcastIfNeeded] input: " << input
--                          << " shape: " << toString(shape));
-+  LLVM_DEBUG(llvm::dbgs() << "[numpyBroadcastIfNeeded] Broadcasting input "
-+                          << input.getType() << " => " << toString(shape)
-+                          << "\n");
-   auto loc = input.getLoc();
--  mlir::RankedTensorType input_type =
-+  mlir::RankedTensorType inputType =
-       dyn_cast<RankedTensorType>(input.getType());
--  if (!input_type) return emitError(input.getLoc(), "expected tensor type");
--  mlir::RankedTensorType output_type =
--      getRankedTensorType(shape, input_type.getElementType());
-+  if (!inputType)
-+    return emitError(loc, "expected ranked tensor type for broadcast inputs");
-+  mlir::RankedTensorType outputType =
-+      getRankedTensorType(shape, inputType.getElementType());
- 
-   // Short circuit if no broadcasting is needed.
--  if (input_type == output_type) return input;
--
--  int64_t input_rank = input_type.getRank();
--  int64_t output_rank = output_type.getRank();
--  if (input_rank > output_rank)
-+  if (inputType == outputType) return input;
-+
-+  int64_t inputRank = inputType.getRank();
-+  int64_t outputRank = outputType.getRank();
-+  if (inputRank > outputRank)
-     return emitError(loc, "input rank must be <= output rank, got ")
--           << input_rank << " vs " << output_rank;
--
--  size_t rank_diff = output_rank - input_rank;
--  SmallVector<int64_t> bcast_dims;
--  bcast_dims.reserve(input_rank);
--
-+           << inputRank << " vs " << outputRank;
-+
-+  size_t rankDiff = outputRank - inputRank;
-   auto inputShapeOrFail = getDimensions(input);
-   if (failed(inputShapeOrFail)) return failure();
-   Dimensions inputShape = std::move(*inputShapeOrFail);
- 
-   // Construct broadcast dimensions.
-   auto broadcastDimensions = llvm::to_vector(
--      llvm::seq<int64_t>(output_rank - input_rank, output_rank));
-+      llvm::seq<int64_t>(outputRank - inputRank, outputRank));
- 
-   // Construct the result type of the broadcast
-   //  - If input is static and target shape is static, use static shape.
-@@ -244,33 +249,35 @@
-   //  - If input is not bounded, but target shape is bounded, broadcast to
-   //    the padded shape then call SetDimensionSize to make dynamic.
-   auto bcastShape = shape;
--  for (int64_t i = 0; i < input_rank; ++i) {
--    int64_t input_dim_size = inputShape[i].size;
--    int64_t result_idx = i + rank_diff;
--    int64_t result_dim_size = shape[result_idx].size;
--    if (input_dim_size != 1 && input_dim_size != result_dim_size)
-+  for (int64_t i = 0; i < inputRank; ++i) {
-+    int64_t inputDimSize = inputShape[i].size;
-+    int64_t resultIdx = i + rankDiff;
-+    int64_t resultDimSize = shape[resultIdx].size;
-+    if (inputDimSize != 1 && inputDimSize != resultDimSize)
-       return emitError(loc, "Cannot broadcast input: ")
--             << input_type << " to target shape " << toString(shape);
-+             << inputType << " to target shape " << toString(shape);
- 
-     if (!inputShape[i].boundOp.has_value() &&
--        shape[result_idx].boundOp.has_value()) {
-+        shape[resultIdx].boundOp.has_value()) {
-       // Use padded shape in broadcast.
--      bcastShape[result_idx] = DimensionInfo{shape[result_idx].size};
--    }
--    bcast_dims.push_back(result_idx);
-+      bcastShape[resultIdx] = DimensionInfo{shape[resultIdx].size};
-+    }
-   }
- 
-   // Broadcast to padded size for remaining dimensions.
--  for (size_t i = input_rank; i < shape.size(); ++i) {
-+  for (size_t i = 0; i < rankDiff; ++i) {
-     bcastShape[i] = DimensionInfo{shape[i].size};
-   }
- 
-   // Insert broadcast ops
--  mlir::RankedTensorType bcast_type =
--      getRankedTensorType(bcastShape, input_type.getElementType());
--  Value bcast_op = stablehlo::BroadcastInDimOp::create(
--      builder, loc, bcast_type, input, broadcastDimensions);
--  if (bcast_op.getType() == output_type) return bcast_op;
-+  mlir::RankedTensorType bcastType =
-+      getRankedTensorType(bcastShape, inputType.getElementType());
-+  LLVM_DEBUG(
-+      llvm::dbgs() << "[numpyBroadcastIfNeeded] Broadcast to padded type "
-+                   << bcastType << "\n");
-+  Value bcastOp = stablehlo::BroadcastInDimOp::create(
-+      builder, loc, bcastType, input, broadcastDimensions);
-+  if (bcastOp.getType() == outputType) return bcastOp;
- 
-   // Mark the padded broadcast as dynamic where the result is bounded.
-   // Inserts `GetDimSize(boundOp)->SetDimSize(inputBcast)` for any bounded
-@@ -278,13 +285,13 @@
-   for (size_t i = 0; i < shape.size(); ++i) {
-     if (!bcastShape[i].boundOp.has_value() && shape[i].boundOp.has_value()) {
-       Value boundOp = shape[i].boundOp.value();
--      auto dim_size = stablehlo::GetDimensionSizeOp::create(
-+      auto dimSize = stablehlo::GetDimensionSizeOp::create(
-           builder, loc, boundOp, shape[i].boundOpDim);
--      bcast_op = stablehlo::SetDimensionSizeOp::create(builder, loc, bcast_op,
--                                                       dim_size, i);
--    }
--  }
--  return bcast_op;
-+      bcastOp = stablehlo::SetDimensionSizeOp::create(builder, loc, bcastOp,
-+                                                       dimSize, i);
-+    }
-+  }
-+  return bcastOp;
- }
  
- }  // namespace stablehlo
+ FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
+                                              ArrayRef<Value> ops) {
 diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h b/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
 --- stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
 +++ stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
-@@ -47,9 +47,18 @@
+@@ -47,6 +47,14 @@
  using Dimensions = SmallVector<DimensionInfo>;
  std::string toString(const Dimensions& dims);
  
@@ -1528,327 +960,5 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h b/stabl
 +
  // Returns the common shape these ops would broadcast to, or an error if the
  // ops are not broadcastable.
--FailureOr<Dimensions> getNumpyBroadcastShape(ArrayRef<Value> ops);
-+FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
-+                                             ArrayRef<Value> ops);
- 
- // Apply numpy broadcasting to the given operands, returning an error if any
- // operands are not broadcastable.
-diff --ruN a/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp b/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
---- stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
-+++ stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
-@@ -14,6 +14,7 @@
- 
- #include <cassert>
- #include <cmath>
-+#include <complex>
- #include <cstddef>
- #include <cstdint>
- #include <functional>
-@@ -38,6 +39,7 @@
- #include "mlir/Dialect/CommonFolders.h"
- #include "mlir/Dialect/Func/IR/FuncOps.h"
- #include "mlir/Dialect/Utils/IndexingUtils.h"
-+#include "mlir/IR/Builders.h"
- #include "mlir/IR/BuiltinAttributeInterfaces.h"
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/BuiltinTypeInterfaces.h"
-@@ -82,6 +84,71 @@
-                 /*isUnsigned=*/!isSigned);
- }
- 
-+class LazyPlaceholderValue {
-+ public:
-+  static FailureOr<LazyPlaceholderValue> preparePlaceholderFor(
-+      PatternRewriter& rewriter, Value likeValue) {
-+    Type valueType = likeValue.getType();
-+
-+    // If `getZeroAttr(valueType)` returns a valid attribute, simply wrap the
-+    // result in a `stablehlo.constant` op.
-+    if (TypedAttr placeholderAttr = rewriter.getZeroAttr(valueType)) {
-+      return LazyPlaceholderValue([&rewriter, placeholderAttr](Location loc) {
-+        return ConstantOp::create(rewriter, loc, placeholderAttr);
-+      });
-+    }
-+
-+    // `getZeroAttr` doesn't support complex types, so we handle that case here.
-+    if (auto shapedType = dyn_cast<ShapedType>(valueType)) {
-+      if (auto complexElementType =
-+              dyn_cast<ComplexType>(shapedType.getElementType())) {
-+        if (!isa<FloatType>(complexElementType.getElementType()))
-+          return rewriter.notifyMatchFailure(
-+              likeValue.getLoc(),
-+              "unexpected real component type for complex element type");
-+        auto realImagComponentFloatType =
-+            cast<FloatType>(complexElementType.getElementType());
-+        APFloat apFloatZero(0.0);
-+        bool losesInfo;
-+        apFloatZero.convert(realImagComponentFloatType.getFloatSemantics(),
-+                            llvm::RoundingMode::NearestTiesToEven, &losesInfo);
-+        std::complex<APFloat> complexZeroScalar(apFloatZero, apFloatZero);
-+        auto complexZeroSplat =
-+            SplatElementsAttr::get(shapedType, complexZeroScalar);
-+        return LazyPlaceholderValue(
-+            [&rewriter, complexZeroSplat](Location loc) {
-+              return ConstantOp::create(rewriter, loc, complexZeroSplat);
-+            });
-+      }
-+    }
-+
-+    // If `valueType` is a token type, use `stablehlo.after_all` with no
-+    // arguments to create a placeholder token.
-+    if (isa<TokenType>(valueType)) {
-+      return LazyPlaceholderValue([&rewriter](Location loc) {  //
-+        return AfterAllOp::create(rewriter, loc, {});
-+      });
-+    }
-+
-+    // TODO: Support quantized and buffer types.
-+
-+    return rewriter.notifyMatchFailure(
-+        likeValue.getLoc(), "unable to create placeholder value for type");
-+  }
-+
-+  Value createAt(Location loc) const {
-+    if (!lazyInitializer)
-+      llvm::report_fatal_error("No lazy initializer for this value type.");
-+    return lazyInitializer(loc);
-+  }
-+
-+ private:
-+  LazyPlaceholderValue(std::function<Value(Location)> lazyInitializer)
-+      : lazyInitializer(std::move(lazyInitializer)) {}
-+
-+  std::function<Value(Location)> lazyInitializer;
-+};
-+
- LogicalResult validateStaticShapeResult(PatternRewriter& rewriter,
-                                         Operation* op, ShapedType resultType) {
-   if (!resultType.hasStaticShape())
-@@ -737,18 +804,14 @@
-     Operation* terminator = blockToInline->getTerminator();
-     ValueRange results = terminator->getOperands();
- 
--    // TODO: Add support for complex, quantized, and token return types.
--    // Currently, this pattern only supports int and float return types. We'll
--    // need a more general equivalent of `getZeroAttr` to support other types.
--    SmallVector<TypedAttr> placeholderAttrs;
-+    SmallVector<LazyPlaceholderValue> lazyPlaceholderResults;
-     for (auto result : op.getResults()) {
--      TypedAttr placeholderAttr = rewriter.getZeroAttr(result.getType());
--      if (!placeholderAttr)
--        return rewriter.notifyMatchFailure(
--            op,
--            "The case op's return type isn't currently supported by this "
--            "optimization pattern.");
--      placeholderAttrs.push_back(placeholderAttr);
-+      auto placeholder =
-+          LazyPlaceholderValue::preparePlaceholderFor(rewriter, result);
-+
-+      if (failed(placeholder)) return failure();
-+
-+      lazyPlaceholderResults.push_back(std::move(placeholder.value()));
-     }
- 
-     // Inline the active branch of the `case` op.
-@@ -763,9 +826,9 @@
-     Block& noopBlock = region.emplaceBlock();
-     SmallVector<Value> placeholderResults;
-     rewriter.setInsertionPointToEnd(&noopBlock);
--    for (auto placeholderAttr : placeholderAttrs) {
-+    for (const auto& lazyPlaceholderResult : lazyPlaceholderResults) {
-       placeholderResults.push_back(
--          ConstantOp::create(rewriter, region.getLoc(), placeholderAttr));
-+          lazyPlaceholderResult.createAt(region.getLoc()));
-     }
-     stablehlo::ReturnOp::create(rewriter, region.getLoc(), placeholderResults);
- 
-diff --ruN a/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveSimplificationPatterns.td b/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveSimplificationPatterns.td
---- stablehlo/stablehlo/transforms/optimization/StablehloAggressiveSimplificationPatterns.td
-+++ stablehlo/stablehlo/transforms/optimization/StablehloAggressiveSimplificationPatterns.td
-@@ -44,7 +44,8 @@
-     "same number of elements">;
- 
- def BroadcastNotReducibleToReshape : Constraint<
--    CPred<"llvm::isa<stablehlo::BroadcastInDimOp>($0.getDefiningOp()) && "
-+    CPred<"!llvm::cast<ShapedType>($0.getType()).hasStaticShape() || "
-+          "llvm::isa<stablehlo::BroadcastInDimOp>($0.getDefiningOp()) && "
-           "!("
-             "llvm::is_sorted($0.getDefiningOp<stablehlo::BroadcastInDimOp>().getBroadcastDimensions()) && "
-             "llvm::cast<ShapedType>($0.getType()).getNumElements() == llvm::cast<ShapedType>($1.getType()).getNumElements()"
-@@ -134,8 +135,7 @@
- 
- def MergePermutations : NativeCodeCall<"getMergedTransposePermutation($_builder, $0, $1)">;
- 
--def MergeDiscardableAttributes
--    : NativeCodeCall<"mergeDiscardableAttributes($0, $1)">;
-+def MergeDiscardableAttributes : NativeCodeCall<"mergeDiscardableAttributes($0, $1)">;
- 
- def StableHLO_ConvertOpWithShape : NativeCodeCall<
-     "stablehlo::ConvertOp::create($_builder, $_loc, $0.getType(), $1)">;
-@@ -151,10 +151,10 @@
- 
- // op(cst, X) -> op(X, cst)
- class CanonicalizeConstantToRhs<Op StableHLO_OpType>
--    : Pat<(StableHLO_OpType:$op (StableHLO_ConstantOp:$lhs $value), $rhs),
--          (StableHLO_OpType:$new_op $rhs, $lhs),
--          [(NotConstantOp $rhs), (CommutativeOp $op)],
--          [(MergeDiscardableAttributes $op, $new_op)]>;
-+  : Pat<(StableHLO_OpType:$op (StableHLO_ConstantOp:$lhs $value), $rhs),
-+        (StableHLO_OpType:$new_op $rhs, $lhs),
-+        [(NotConstantOp $rhs), (CommutativeOp $op)],
-+        [(MergeDiscardableAttributes $op, $new_op)]>;
- 
- ////////
- // AddOp
-@@ -165,9 +165,9 @@
- 
- // Pattern: add(X, 0) -> X
- def AddOp_RemoveNoop
--    : Pat<(StableHLO_AddOp:$op $lhs, (ConstantLikeMatcher AnyZero:$value)),
--          (replaceWithValue $lhs), [],
--          [(MergeDiscardableAttributes $op, $lhs)]>;
-+  : Pat<(StableHLO_AddOp:$op $lhs, (ConstantLikeMatcher AnyZero:$value)),
-+        (replaceWithValue $lhs), [],
-+        [(MergeDiscardableAttributes $op, $lhs)]>;
- 
- ////////
- // AndOp
-@@ -177,25 +177,26 @@
-   : CanonicalizeConstantToRhs<StableHLO_AndOp>;
- 
- // Pattern: and(X, 0) -> 0
--def AndOp_FoldToZero : Pat<(StableHLO_AndOp:$op $lhs,
--                               (StableHLO_ConstantOp:$zero IntZero:$value)),
--                           (replaceWithValue $zero), [],
--                           [(MergeDiscardableAttributes $op, $zero)]>;
-+def AndOp_FoldToZero
-+  : Pat<(StableHLO_AndOp:$op $lhs, (StableHLO_ConstantOp:$zero IntZero:$value)),
-+        (replaceWithValue $zero), [],
-+        [(MergeDiscardableAttributes $op, $zero)]>;
- 
- // Pattern: and(X, 1) -> X
--def AndOp_RemoveNoop : Pat<(StableHLO_AndOp:$op $lhs,
--                               (StableHLO_ConstantOp:$one IntAllOnes:$value)),
--                           (replaceWithValue $lhs), [],
--                           [(MergeDiscardableAttributes $op, $lhs)]>;
-+def AndOp_RemoveNoop
-+  : Pat<(StableHLO_AndOp:$op $lhs, (StableHLO_ConstantOp:$one IntAllOnes:$value)),
-+        (replaceWithValue $lhs), [],
-+        [(MergeDiscardableAttributes $op, $lhs)]>;
- 
- ////////
- // BroadcastInDimOp
- 
- // Pattern: broadcast_in_dim(X, [iota...]) -> X
- def BroadcastInDimOp_RemoveNoop
--    : Pat<(StableHLO_BroadcastInDimOp:$op $operand, IotaDims:$dims),
--          (replaceWithValue $operand), [(TypesEqual $op, $operand)],
--          [(MergeDiscardableAttributes $op, $operand)]>;
-+  : Pat<(StableHLO_BroadcastInDimOp:$op $operand, IotaDims:$dims),
-+        (replaceWithValue $operand),
-+        [(TypesEqual $op, $operand)],
-+        [(MergeDiscardableAttributes $op, $operand)]>;
- 
- // Pattern: broadcast_in_dim(broadcast_in_dim(X, [dimsA...]), [dimsB...])
- //       -> broadcast_in_dim(X, merge(dimsA, dimsB))
-@@ -210,8 +211,10 @@
- 
- // Pattern: broadcast_in_dim(X, [sorted...]) -> reshape(X, [sorted...])
- //          [if same numel]
-+// TODO: Figure out if static extents matching is valid (i.e. <=10 -> 1x[<=10])
-+// for bounded dynamism, same for BroadcastInDimOp_ReplaceWithReshape
- def BroadcastInDimOp_ReplaceWithReshape
--  : Pat<(StableHLO_BroadcastInDimOp:$op $operand, SortedDims:$dims),
-+  : Pat<(StableHLO_BroadcastInDimOp:$op AnyStaticShapeTensor:$operand, SortedDims:$dims),
-         (StableHLO_ReshapeOpWithShape $op, $operand),
-         [(NumberOfElementsEqual $op, $operand)],
-         [],
-@@ -220,7 +223,7 @@
- // Pattern: broadcast_in_dim(X, [dims...]) -> transpose(X, [dims...])
- //          [if same numel & rank]
- def BroadcastInDimOp_ReplaceWithTranspose
--  : Pat<(StableHLO_BroadcastInDimOp:$op $operand, $dims),
-+  : Pat<(StableHLO_BroadcastInDimOp:$op AnyStaticShapeTensor:$operand, $dims),
-         (StableHLO_TransposeOp $operand, (InvertBroadcastDims $dims)),
-         [(NumberOfElementsEqual $op, $operand), (RankEqual $op, $operand)]>;
- 
-@@ -259,9 +262,10 @@
- 
- // Pattern: convert(X, [X.type]) -> X
- def ConvertOp_RemoveNoop
--    : Pat<(StableHLO_ConvertOp:$convert $operand),
--          (replaceWithValue $operand), [(TypesEqual $convert, $operand)],
--          [(MergeDiscardableAttributes $convert, $operand)]>;
-+  : Pat<(StableHLO_ConvertOp:$convert $operand),
-+        (replaceWithValue $operand),
-+        [(TypesEqual $convert, $operand)],
-+        [(MergeDiscardableAttributes $convert, $operand)]>;
- 
- ////////
- // DynamicBroadcastInDimOp
-@@ -447,16 +451,16 @@
- //
- // Multiplication by 0. This fold is not trivial for floats in presence of NaNs,
- // so we currently only enable it for ints.
--def MulOp_FoldToZero : Pat<(StableHLO_MulOp:$mul_op $lhs,
--                               (StableHLO_ConstantOp:$zero IntZero:$value)),
--                           (replaceWithValue $zero), [],
--                           [(MergeDiscardableAttributes $mul_op, $zero)]>;
-+def MulOp_FoldToZero
-+  : Pat<(StableHLO_MulOp:$mul_op $lhs, (StableHLO_ConstantOp:$zero IntZero:$value)),
-+        (replaceWithValue $zero), [],
-+        [(MergeDiscardableAttributes $mul_op, $zero)]>;
- 
- // Pattern: multiply(X, 1i) -> X
- def MulOp_RemoveNoop
--    : Pat<(StableHLO_MulOp:$mul_op $lhs, (StableHLO_ConstantOp AnyOne:$value)),
--          (replaceWithValue $lhs), [],
--          [(MergeDiscardableAttributes $mul_op, $lhs)]>;
-+  : Pat<(StableHLO_MulOp:$mul_op $lhs, (StableHLO_ConstantOp AnyOne:$value)),
-+        (replaceWithValue $lhs), [],
-+        [(MergeDiscardableAttributes $mul_op, $lhs)]>;
- 
- ////////
- // OrOp
-@@ -465,16 +469,16 @@
- def OrOp_CanonicalizeConstantToRhs : CanonicalizeConstantToRhs<StableHLO_OrOp>;
- 
- // Pattern: or(X, 1) -> 1
--def OrOp_FoldToOne : Pat<(StableHLO_OrOp:$op $lhs,
--                             (StableHLO_ConstantOp:$one IntAllOnes:$value)),
--                         (replaceWithValue $one), [],
--                         [(MergeDiscardableAttributes $op, $one)]>;
-+def OrOp_FoldToOne
-+  : Pat<(StableHLO_OrOp:$op $lhs, (StableHLO_ConstantOp:$one IntAllOnes:$value)),
-+        (replaceWithValue $one), [],
-+        [(MergeDiscardableAttributes $op, $one)]>;
- 
- // Pattern: or(X, 0) -> X
--def OrOp_RemoveNoop : Pat<(StableHLO_OrOp:$op $lhs,
--                              (StableHLO_ConstantOp:$zero IntZero:$value)),
--                          (replaceWithValue $lhs), [],
--                          [(MergeDiscardableAttributes $op, $lhs)]>;
-+def OrOp_RemoveNoop
-+  : Pat<(StableHLO_OrOp:$op $lhs, (StableHLO_ConstantOp:$zero IntZero:$value)),
-+        (replaceWithValue $lhs), [],
-+        [(MergeDiscardableAttributes $op, $lhs)]>;
- 
- ////////
- // PadOp
-@@ -574,10 +578,10 @@
-         (StableHLO_ConstantLike<"0"> $operand)>;
- 
- // Pattern: subtract(X, 0) -> X
--def SubtractOp_RemoveNoop : Pat<(StableHLO_SubtractOp:$op $lhs,
--                                    (StableHLO_ConstantOp AnyZero:$value)),
--                                (replaceWithValue $lhs), [],
--                                [(MergeDiscardableAttributes $op, $lhs)]>;
-+def SubtractOp_RemoveNoop
-+  : Pat<(StableHLO_SubtractOp:$op $lhs, (StableHLO_ConstantOp AnyZero:$value)),
-+        (replaceWithValue $lhs), [],
-+        [(MergeDiscardableAttributes $op, $lhs)]>;
- 
- ////////
- // SliceOp
+ FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
 
diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl
index 6012798b53e02e..48e631619a6888 100644
--- a/third_party/xla/third_party/stablehlo/workspace.bzl
+++ b/third_party/xla/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "96acdcb7724f4a9eec6d2e5af2597b0750c13948"
-    STABLEHLO_SHA256 = "68e068a78d71f0764d5dd385ef434df922050530de99001969493298a00d64a0"
+    STABLEHLO_COMMIT = "1ef9e390b5295e676d2b864fe1924bc2f3f4cf0f"
+    STABLEHLO_SHA256 = "818c951ad0ba0ac6c26d3ed01fed8f9a0e5ca93f5aed35005f75f0faf11bdfb0"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(

From eaa38d83551b08018ec3e374caf412544ae88276 Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Wed, 10 Dec 2025 12:21:42 -0800
Subject: [PATCH 139/753] [XLA:GPU] Fix dispatch decomposition when replica
 groups are shuffled.

When replica groups for for singe host are not contiguous, we need an extra step to reorder offsets and sizes metadata operands. We need a similar for combine ragged-all-to-all that I'll do in the following change.

PiperOrigin-RevId: 842830042
---
 .../xla/xla/service/gpu/transforms/BUILD      |   1 +
 ...ragged_all_to_all_multi_host_decomposer.cc | 123 +++++++++++++++++-
 ...d_all_to_all_multi_host_decomposer_test.cc |  38 +++++-
 .../xla/tests/ragged_all_to_all_e2e_test.cc   |  60 +++++++++
 4 files changed, 213 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 1acb680d9e73d7..1a95a47a225823 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -3106,6 +3106,7 @@ cc_library(
     srcs = ["ragged_all_to_all_multi_host_decomposer.cc"],
     hdrs = ["ragged_all_to_all_multi_host_decomposer.h"],
     deps = [
+        "//xla:array",
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:util",
diff --git a/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc b/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc
index e6da5ffb509679..a3d967d7b4cb40 100644
--- a/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc
+++ b/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "xla/array.h"
 #include "xla/hlo/ir/dfs_hlo_visitor.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -50,6 +51,90 @@ namespace gpu {
 
 using hlo_query::NextChannelId;
 
+// Returns a permutation of the devices in the replica group such that devices
+// on the same host are next to each other. The order of the devices within a
+// host is preserved.
+absl::InlinedVector<int64_t, 8> FindPermutation(
+    const ReplicaGroup& replica_group, int64_t num_devices_per_host) {
+  int64_t num_devices_in_replica = replica_group.replica_ids_size();
+
+  absl::InlinedVector<int64_t, 8> permutation(num_devices_in_replica);
+  absl::c_iota(permutation, 0);
+
+  absl::c_stable_sort(permutation, [&](int64_t i, int64_t j) {
+    int64_t host_i = replica_group.replica_ids(i) / num_devices_per_host;
+    int64_t host_j = replica_group.replica_ids(j) / num_devices_per_host;
+    return host_i < host_j;
+    return replica_group.replica_ids(i) < replica_group.replica_ids(j);
+  });
+  return permutation;
+}
+
+// Returns a permutation of the devices in the replica groups such that devices
+// on the same host are next to each other. Returns std::nullopt if the
+// permutation is not the same for all replica groups.
+std::optional<absl::InlinedVector<int64_t, 8>> FindReplicaGroupsPermutation(
+    absl::Span<ReplicaGroup const> replica_groups,
+    int64_t num_devices_per_host) {
+  absl::InlinedVector<int64_t, 8> permutation =
+      FindPermutation(replica_groups[0], num_devices_per_host);
+
+  // Check that all replica groups have the same permutation. Operand
+  // permutation doesn't not depend on the device id, so if permutations are
+  // different, we can't rewrite the ragged-all-to-all.
+  for (int64_t i = 1; i < replica_groups.size(); ++i) {
+    auto replica_group_permutation =
+        FindPermutation(replica_groups[i], num_devices_per_host);
+    if (replica_group_permutation != permutation) {
+      return std::nullopt;
+    }
+  }
+
+  return permutation;
+}
+
+// Shuffle values in the hlo instruction based on the permutation.
+HloInstruction* ShuffleMetadataOperandValues(
+    HloInstruction* hlo, absl::Span<int64_t const> permutation) {
+  // If the permutation is already sorted, then we don't need to shuffle.
+  if (absl::c_is_sorted(permutation)) {
+    return hlo;
+  }
+
+  HloComputation* computation = hlo->parent();
+
+  const Shape& shape = hlo->shape();
+  CHECK_EQ(shape.dimensions().size(), 1);
+
+  int64_t num_elements = shape.dimensions(0);
+  int64_t num_replicas = permutation.size();
+  int64_t num_elements_per_replica = num_elements / permutation.size();
+
+  Array<int64_t> permutation_array({num_replicas, 1});
+  for (int64_t i = 0; i < permutation.size(); ++i) {
+    permutation_array(i, 0) = num_elements_per_replica * permutation[i];
+  }
+
+  auto permutation_constant =
+      computation->AddInstruction(HloInstruction::CreateConstant(
+          LiteralUtil::CreateFromArray(permutation_array)));
+
+  Shape new_shape = ShapeUtil::MakeShape(
+      shape.element_type(), {num_replicas, num_elements_per_replica});
+
+  hlo = computation->AddInstruction(
+      HloInstruction::CreateGather(new_shape, hlo, permutation_constant,
+                                   HloGatherInstruction::MakeGatherDimNumbers(
+                                       /*offset_dims=*/{1},
+                                       /*collapsed_slice_dims=*/{},
+                                       /*start_index_map=*/{0},
+                                       /*index_vector_dim=*/1),
+                                   /*slice_sizes=*/{num_elements_per_replica},
+                                   /*indices_are_sorted=*/false));
+
+  return computation->AddInstruction(HloInstruction::CreateReshape(shape, hlo));
+}
+
 // Corrects the offsets in the local metadata to account for the number of input
 // rows in the combined ragged tensor.
 HloInstruction* CorrectOffsets(int64_t offset, HloInstruction* local_metadata,
@@ -83,13 +168,16 @@ HloInstruction* CorrectOffsets(int64_t offset, HloInstruction* local_metadata,
 absl::InlinedVector<HloInstruction*, 4> GetIntraHostMetadata(
     HloRaggedAllToAllInstruction* ragged_all_to_all,
     HloComputation* computation, absl::Span<ReplicaGroup const> replica_groups,
-    int64_t num_hosts, int64_t num_devices_in_replica) {
+    absl::Span<int64_t const> replica_groups_permutation, int64_t num_hosts,
+    int64_t num_devices_in_replica) {
   int64_t num_devices_in_replica_per_host = num_devices_in_replica / num_hosts;
 
   absl::InlinedVector<HloInstruction*, 4> metadata_operands;
   metadata_operands.reserve(4);
   for (int i = 2; i < 6; ++i) {
     metadata_operands.push_back(ragged_all_to_all->mutable_operand(i));
+    metadata_operands.back() = ShuffleMetadataOperandValues(
+        metadata_operands.back(), replica_groups_permutation);
   }
 
   Shape metadata_operand_shape = metadata_operands[0]->shape();
@@ -179,7 +267,8 @@ absl::StatusOr<bool> DecomposeDispatchRaggedAllToAll(
     HloRaggedAllToAllInstruction* ragged_all_to_all,
     HloComputation* computation,
     absl::Span<ReplicaGroup const> inter_host_replica_groups,
-    absl::Span<ReplicaGroup const> intra_host_replica_groups, int64_t num_hosts,
+    absl::Span<ReplicaGroup const> intra_host_replica_groups,
+    absl::Span<int64_t const> replica_groups_permutation, int64_t num_hosts,
     int64_t num_devices_in_replica) {
   HloInstruction* input_operand = ragged_all_to_all->mutable_operand(0);
 
@@ -208,9 +297,9 @@ absl::StatusOr<bool> DecomposeDispatchRaggedAllToAll(
           ragged_all_to_all->channel_id().has_value()));
 
   absl::InlinedVector<HloInstruction*, 4> intra_host_metadata =
-      GetIntraHostMetadata(ragged_all_to_all, computation,
-                           inter_host_replica_groups, num_hosts,
-                           num_devices_in_replica);
+      GetIntraHostMetadata(
+          ragged_all_to_all, computation, inter_host_replica_groups,
+          replica_groups_permutation, num_hosts, num_devices_in_replica);
 
   HloInstruction* new_ragged_all_to_all =
       computation->AddInstruction(HloInstruction::CreateRaggedAllToAll(
@@ -219,7 +308,7 @@ absl::StatusOr<bool> DecomposeDispatchRaggedAllToAll(
           {all_gather_input, ragged_all_to_all->mutable_operand(1),
            intra_host_metadata[0], intra_host_metadata[1],
            intra_host_metadata[2], intra_host_metadata[3]},
-          /*replica_groups=*/intra_host_replica_groups,
+          /*device_list=*/CollectiveDeviceList(intra_host_replica_groups),
           /*channel_id=*/ragged_all_to_all->channel_id()));
 
   TF_RETURN_IF_ERROR(computation->ReplaceInstruction(ragged_all_to_all,
@@ -430,6 +519,25 @@ absl::StatusOr<bool> DecomposeRaggedAllToAll(
     return false;
   }
 
+  // Offsets and sizes in metadata operands are stored in the order of replica
+  // groups. For example, if the replica groups are:
+  //   {{0, 2, 4, 6, 1, 3, 5, 7}}
+  // Then the offsets and sizes are stored in the order of
+  //   [0, 2, 4, 6, 1, 3, 5, 7]
+  // In the decomposition, we want to exchange all the intra-host metadata
+  // between hosts. To do that we want to group the metadata by hosts. We
+  // compute permutation that need to be performed on the metadata operand and
+  // use gather to move values. After the shuffle, offsets and sizes will be
+  // ordered as:
+  //   [0, 2, 1, 3, 4, 6, 5, 7]
+  auto replica_groups_permutation = FindReplicaGroupsPermutation(
+      replica_groups, fast_interconnect_slice_size);
+  // Empty value means that we can not find such permutation and the
+  // ragged-all-to-all can not be decomposed.
+  if (!replica_groups_permutation.has_value()) {
+    return false;
+  }
+
   // Decompose the replica groups into inter-host and intra-host replica groups.
   // For example, if the original replica groups were:
   //   {{0, 2, 4, 6, 8, 10, 12, 14}, {1, 3, 5, 7, 9, 11, 13, 15}}
@@ -488,7 +596,8 @@ absl::StatusOr<bool> DecomposeRaggedAllToAll(
 
   return DecomposeDispatchRaggedAllToAll(
       ragged_all_to_all, computation, inter_host_replica_groups,
-      intra_host_replica_groups, num_hosts, num_devices_in_replica);
+      intra_host_replica_groups, *replica_groups_permutation, num_hosts,
+      num_devices_in_replica);
 }
 
 absl::StatusOr<bool> RaggedAllToAllMultiHostDecomposer::RunImpl(
diff --git a/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer_test.cc b/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer_test.cc
index 9508b9395c15de..e0df20ff42a189 100644
--- a/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer_test.cc
@@ -68,8 +68,7 @@ ENTRY main {
   )"));
 }
 
-TEST_F(RaggedAllToAllDecomposerTest,
-       SimpleRaggedAllToAllCrossPartitionIsSupported) {
+TEST_F(RaggedAllToAllDecomposerTest, DispatchRaggedAllToAllIsDecomposed) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
 HloModule module, num_partitions=16
 
@@ -102,6 +101,41 @@ ENTRY main {
   )"));
 }
 
+TEST_F(RaggedAllToAllDecomposerTest,
+       DispatchRaggedAllToAllWithShuffledReplicaGroupsIsDecomposed) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+HloModule module, num_partitions=16
+
+ENTRY main {
+  input = bf16[128] parameter(0)
+  output = bf16[256] parameter(1)
+  input_offsets = s64[32] parameter(2)
+  send_sizes = s64[32] parameter(3)
+  output_offsets = s64[32] parameter(4)
+  recv_sizes = s64[32] parameter(5)
+  ROOT ra2a = bf16[256] ragged-all-to-all(input, output, input_offsets,
+    send_sizes, output_offsets, recv_sizes),
+    replica_groups={{0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15}}
+}
+)"));
+
+  RaggedAllToAllMultiHostDecomposer decomposer(
+      /*fast_interconnect_slice_size=*/8);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get(), {}));
+
+  EXPECT_TRUE(changed);
+  EXPECT_OK(VerifyHloModule(module.get(), true, true));
+  EXPECT_OK(HloDCE().Run(module.get()));
+  EXPECT_OK(HloCSE(true).Run(module.get()));
+
+  EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
+    // CHECK: all-gather{{.*}}, replica_groups={{[{]}}{0,8},{4,12},{1,9},{5,13},{2,10},{6,14},{3,11},{7,15}{{[}]}}
+    // CHECK-COUNT-4: s64[16,2]{1,0} gather
+    // CHECK: all-to-all{{.*}}, replica_groups={{[{]}}{0,8},{4,12},{1,9},{5,13},{2,10},{6,14},{3,11},{7,15}{{[}]}}
+    // CHECK: ragged-all-to-all{{.*}}, replica_groups={{[{]}}{0,4,1,5,2,6,3,7},{8,12,9,13,10,14,11,15}{{[}]}}
+  )"));
+}
+
 TEST_F(RaggedAllToAllDecomposerTest, SingleHostRaggedAllToAllIsNotDecomposed) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
 HloModule module
diff --git a/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc b/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
index cad950b9f08c83..f74275d7933bc3 100644
--- a/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
+++ b/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
@@ -931,6 +931,66 @@ TEST_P(RaggedAllToAllMultiHostDecomposerTest, RaggedAllToAll_2GPUs_SliceSize1) {
   }
 }
 
+TEST_P(RaggedAllToAllMultiHostDecomposerTest,
+       RaggedAllToAll_8GPUs_SliceSize4_ShuffledReplicaGroups) {
+  auto [num_input_rows, num_output_rows] = GetParam();
+
+  if (num_input_rows > num_output_rows) {
+    // TODO(b/445380264): Fix decomposer for combine ragged-all-to-all.
+    GTEST_SKIP()
+        << "The test will currently fail for combine ragged-all-to-all (when "
+           "input is larger than output).";
+  }
+
+  std::string kModuleReplicatedStr =
+      absl::Substitute(R"(
+  HloModule module
+
+  ENTRY entry {
+    input = f32[$0,5,32] parameter(0)
+    output = f32[$1,5,32] parameter(1)
+    input_offsets = s32[32] parameter(2)
+    send_sizes = s32[32] parameter(3)
+    output_offsets = s32[32] parameter(4)
+    recv_sizes = s32[32] parameter(5)
+    ROOT ra2a = f32[$1,5,32] ragged-all-to-all(input, output,
+      input_offsets, send_sizes, output_offsets, recv_sizes),
+      replica_groups={{0,2,4,6,1,3,5,7}}
+  })",
+                       num_input_rows, num_output_rows);
+
+  const int64_t kNumReplicas = 8;
+  const int64_t kNumUpdatesPerReplica = 4;
+  if (hlo_runner_->device_count() < kNumReplicas) {
+    GTEST_SKIP() << "Test requires at least " << kNumReplicas << " devices ("
+                 << hlo_runner_->device_count() << " available)";
+  }
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(
+                                           kModuleReplicatedStr, kNumReplicas));
+
+  module->mutable_config()
+      .mutable_debug_options()
+      .set_xla_gpu_unsupported_override_fast_interconnect_slice_size(4);
+
+  Array<int64_t> input_sizes(
+      {kNumReplicas, kNumReplicas, kNumUpdatesPerReplica});
+  input_sizes.FillRandomUniform(0, 10);
+
+  TF_ASSERT_OK(CreateRandomTestData(module.get(), input_sizes));
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ExecutionResult execution_result,
+      ExecuteReplicated(std::move(module), GetInputLiteralPtrs()));
+
+  const std::vector<Literal>& results = execution_result.results;
+  ASSERT_EQ(results.size(), kNumReplicas);
+
+  for (int i = 0; i < kNumReplicas; ++i) {
+    EXPECT_TRUE(LiteralTestUtil::Equal(expected_outputs_[i], results[i]));
+  }
+}
+
 TEST_P(RaggedAllToAllMultiHostDecomposerTest, RaggedAllToAll_8GPUs_SliceSize4) {
   auto [num_input_rows, num_output_rows] = GetParam();
 

From e1fab9f0bba793ed5a839b92ff9c7cad4789bc84 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 12:30:31 -0800
Subject: [PATCH 140/753] Handle wrapt.ObjectProxy in tensor_util and
 TrackableObject.

Resolves a TypeError in Python 3.12 caused by stricter inspect module behavior when accessing __dict__ on wrapt.ObjectProxy subclasses like _DictWrapper. The fix modifies the type-checking logic to bypass incompatible introspection on proxy objects during serialization. This restores tf.saved_model.save functionality and ensures compatibility with the Python 3.12+ runtime.

PiperOrigin-RevId: 842833037
---
 tensorflow/python/framework/tensor_util.py     |  4 ++++
 tensorflow/python/trackable/data_structures.py | 14 +++++++++++---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index b44ef77a7e901d..7101f046e60b04 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -1281,6 +1281,10 @@ def is_tf_type(x):  # pylint: disable=invalid-name
   Returns:
     `True` if `x` is a TensorFlow-native type.
   """
+  # ObjectProxy is a special type of object that is used by wrapt to wrap
+  # objects. It is not a Tensor.
+  if (type(x).__name__ == "ObjectProxy"):
+    return False
   return isinstance(x, tf_type_classes)
 
 
diff --git a/tensorflow/python/trackable/data_structures.py b/tensorflow/python/trackable/data_structures.py
index c920dd882aac35..3bcb4c9ccba3b9 100644
--- a/tensorflow/python/trackable/data_structures.py
+++ b/tensorflow/python/trackable/data_structures.py
@@ -23,9 +23,6 @@
   # Fall back to the build-time dependency if the system package is not available.
   from .....third_party import wrapt  # pylint: disable=relative-beyond-top-level
 
-from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function as defun
-from tensorflow.python.ops import variables
 from tensorflow.python.trackable import base
 from tensorflow.python.trackable import layer_utils
 from tensorflow.python.util.compat import collections_abc
@@ -195,6 +192,8 @@ def trainable(self, value):
 
   def _track_value(self, value, name):
     """Add a dependency on `value`."""
+    # pylint: disable=g-import-not-at-top
+    from tensorflow.python.ops import variables
     value = sticky_attribute_assignment(
         trackable=self, value=value, name=name)
     if isinstance(value, variables.Variable):
@@ -810,6 +809,12 @@ def __reduce_ex__(self, protocol):
             (self.__wrapped__,))
 
   def __getattribute__(self, name):
+    if name == "__dict__":
+      # Returns __dict__ from wrapt.ObjectProxy
+      try:
+        return object.__getattribute__(self, "__dict__")
+      except (AttributeError, TypeError):
+        return {}
     if (hasattr(type(self), name)
         and isinstance(getattr(type(self), name), property)):
       # Bypass ObjectProxy for properties. Whether this workaround is necessary
@@ -1108,6 +1113,9 @@ def __getattribute__(self, name):
 
 
 def _is_function(x):
+  # pylint: disable=g-import-not-at-top
+  from tensorflow.python.eager import def_function
+  from tensorflow.python.eager import function as defun
   return isinstance(x, (def_function.Function, defun.ConcreteFunction))
 
 
From ed28da9693926bd490b66e3dac7a2f5a22860a1c Mon Sep 17 00:00:00 2001
From: Raviteja Gorijala <gorijala@google.com>
Date: Wed, 10 Dec 2025 13:50:15 -0800
Subject: [PATCH 141/753] Remove TPU environment

PiperOrigin-RevId: 842864798
---
 ci/official/envs/linux_x86_tpu | 23 -----------------------
 1 file changed, 23 deletions(-)
 delete mode 100644 ci/official/envs/linux_x86_tpu

diff --git a/ci/official/envs/linux_x86_tpu b/ci/official/envs/linux_x86_tpu
deleted file mode 100644
index bde958b1a5b3d4..00000000000000
--- a/ci/official/envs/linux_x86_tpu
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-source ci/official/envs/linux_x86
-TFCI_BAZEL_COMMON_ARGS="--repo_env=HERMETIC_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config release_cpu_linux --config=tpu"
-TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_tpu
-TFCI_BUILD_PIP_PACKAGE_WHEEL_NAME_ARG="--repo_env=WHEEL_NAME=tensorflow_tpu"
-TFCI_LIB_SUFFIX="-tpu-linux-x86_64"
-TFCI_WHL_BAZEL_TEST_ENABLE=0
-TFCI_WHL_IMPORT_TEST_ENABLE=0
-TFCI_WHL_SIZE_LIMIT=580M
-TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS="-f https://storage.googleapis.com/libtpu-wheels/index.html"

From 73956bab48f069372ce06334757bec6b89673fcb Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Wed, 10 Dec 2025 13:51:36 -0800
Subject: [PATCH 142/753] Integrate LLVM at llvm/llvm-project@87bf5ee23863

Updates LLVM usage to match
[87bf5ee23863](https://github.com/llvm/llvm-project/commit/87bf5ee23863)

PiperOrigin-RevId: 842865253
---
 .../xla/third_party/llvm/generated.patch      | 1073 ++++++++++++++++
 .../xla/third_party/llvm/workspace.bzl        |    4 +-
 .../xla/third_party/shardy/temporary.patch    | 1089 ++++++++++++++++-
 .../xla/third_party/shardy/workspace.bzl      |    4 +-
 4 files changed, 2161 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/third_party/llvm/generated.patch b/third_party/xla/third_party/llvm/generated.patch
index 509398da979e83..2948da46566950 100644
--- a/third_party/xla/third_party/llvm/generated.patch
+++ b/third_party/xla/third_party/llvm/generated.patch
@@ -1 +1,1074 @@
 Auto generated patch. Do not edit or delete it, even if empty.
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
+--- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
++++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
+@@ -554,8 +554,6 @@
+     if (const auto &Dir = Params.initializationOptions.compilationDatabasePath)
+       CDBOpts.CompileCommandsDir = Dir;
+     CDBOpts.ContextProvider = Opts.ContextProvider;
+-    if (Opts.StrongWorkspaceMode)
+-      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
+     BaseCDB =
+         std::make_unique<DirectoryBasedGlobalCompilationDatabase>(CDBOpts);
+   }
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
+--- a/clang-tools-extra/clangd/ClangdServer.h
++++ b/clang-tools-extra/clangd/ClangdServer.h
+@@ -152,11 +152,6 @@
+     /// FIXME: If not set, should use the current working directory.
+     std::optional<std::string> WorkspaceRoot;
+ 
+-    /// Sets an alternate mode of operation. Current effects are:
+-    /// - Using the current working directory as the working directory for
+-    ///   fallback commands
+-    bool StrongWorkspaceMode;
+-
+     /// The resource directory is used to find internal headers, overriding
+     /// defaults and -resource-dir compiler flag).
+     /// If std::nullopt, ClangdServer calls
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
++++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+@@ -64,9 +64,7 @@
+   if (FileExtension.empty() || FileExtension == ".h")
+     Argv.push_back("-xobjective-c++-header");
+   Argv.push_back(std::string(File));
+-  tooling::CompileCommand Cmd(FallbackWorkingDirectory
+-                                  ? *FallbackWorkingDirectory
+-                                  : llvm::sys::path::parent_path(File),
++  tooling::CompileCommand Cmd(llvm::sys::path::parent_path(File),
+                               llvm::sys::path::filename(File), std::move(Argv),
+                               /*Output=*/"");
+   Cmd.Heuristic = "clangd fallback";
+@@ -351,8 +349,7 @@
+ 
+ DirectoryBasedGlobalCompilationDatabase::
+     DirectoryBasedGlobalCompilationDatabase(const Options &Opts)
+-    : GlobalCompilationDatabase(Opts.FallbackWorkingDirectory), Opts(Opts),
+-      Broadcaster(std::make_unique<BroadcastThread>(*this)) {
++    : Opts(Opts), Broadcaster(std::make_unique<BroadcastThread>(*this)) {
+   if (!this->Opts.ContextProvider)
+     this->Opts.ContextProvider = [](llvm::StringRef) {
+       return Context::current().clone();
+@@ -463,21 +460,6 @@
+   return Result;
+ }
+ 
+-void DirectoryBasedGlobalCompilationDatabase::Options::
+-    applyFallbackWorkingDirectory(
+-        std::optional<std::string> FallbackWorkingDirectory) {
+-  if (FallbackWorkingDirectory)
+-    this->FallbackWorkingDirectory = *FallbackWorkingDirectory;
+-  else {
+-    // Clangd is running in strong workspace mode but the client didn't
+-    // specify a workspace path in the `initialize` request.
+-    // Fallback to current working directory.
+-    SmallString<256> CWD;
+-    llvm::sys::fs::current_path(CWD);
+-    this->FallbackWorkingDirectory = std::string(CWD);
+-  }
+-}
+-
+ // The broadcast thread announces files with new compile commands to the world.
+ // Primarily this is used to enqueue them for background indexing.
+ //
+@@ -777,10 +759,9 @@
+ 
+ OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base,
+                        std::vector<std::string> FallbackFlags,
+-                       CommandMangler Mangler,
+-                       std::optional<std::string> FallbackWorkingDirectory)
+-    : DelegatingCDB(Base, FallbackWorkingDirectory),
+-      Mangler(std::move(Mangler)), FallbackFlags(std::move(FallbackFlags)) {}
++                       CommandMangler Mangler)
++    : DelegatingCDB(Base), Mangler(std::move(Mangler)),
++      FallbackFlags(std::move(FallbackFlags)) {}
+ 
+ std::optional<tooling::CompileCommand>
+ OverlayCDB::getCompileCommand(PathRef File) const {
+@@ -863,20 +844,16 @@
+   return MDB;
+ }
+ 
+-DelegatingCDB::DelegatingCDB(
+-    const GlobalCompilationDatabase *Base,
+-    std::optional<std::string> FallbackWorkingDirectory)
+-    : GlobalCompilationDatabase(FallbackWorkingDirectory), Base(Base) {
++DelegatingCDB::DelegatingCDB(const GlobalCompilationDatabase *Base)
++    : Base(Base) {
+   if (Base)
+     BaseChanged = Base->watch([this](const std::vector<std::string> Changes) {
+       OnCommandChanged.broadcast(Changes);
+     });
+ }
+ 
+-DelegatingCDB::DelegatingCDB(
+-    std::unique_ptr<GlobalCompilationDatabase> Base,
+-    std::optional<std::string> FallbackWorkingDirectory)
+-    : DelegatingCDB(Base.get(), FallbackWorkingDirectory) {
++DelegatingCDB::DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base)
++    : DelegatingCDB(Base.get()) {
+   BaseOwner = std::move(Base);
+ }
+ 
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
+--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h
++++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
+@@ -35,9 +35,6 @@
+ /// Provides compilation arguments used for parsing C and C++ files.
+ class GlobalCompilationDatabase {
+ public:
+-  GlobalCompilationDatabase(
+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt)
+-      : FallbackWorkingDirectory(FallbackWorkingDirectory) {}
+   virtual ~GlobalCompilationDatabase() = default;
+ 
+   /// If there are any known-good commands for building this file, returns one.
+@@ -72,19 +69,14 @@
+   }
+ 
+ protected:
+-  std::optional<std::string> FallbackWorkingDirectory;
+   mutable CommandChanged OnCommandChanged;
+ };
+ 
+ // Helper class for implementing GlobalCompilationDatabases that wrap others.
+ class DelegatingCDB : public GlobalCompilationDatabase {
+ public:
+-  DelegatingCDB(
+-      const GlobalCompilationDatabase *Base,
+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
+-  DelegatingCDB(
+-      std::unique_ptr<GlobalCompilationDatabase> Base,
+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
++  DelegatingCDB(const GlobalCompilationDatabase *Base);
++  DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base);
+ 
+   std::optional<tooling::CompileCommand>
+   getCompileCommand(PathRef File) const override;
+@@ -125,12 +117,6 @@
+     // Only look for a compilation database in this one fixed directory.
+     // FIXME: fold this into config/context mechanism.
+     std::optional<Path> CompileCommandsDir;
+-    // Working directory for fallback commands
+-    // If unset, parent directory of file should be used
+-    std::optional<std::string> FallbackWorkingDirectory;
+-
+-    void applyFallbackWorkingDirectory(
+-        std::optional<std::string> FallbackWorkingDirectory);
+   };
+ 
+   DirectoryBasedGlobalCompilationDatabase(const Options &Opts);
+@@ -208,11 +194,9 @@
+   // Base may be null, in which case no entries are inherited.
+   // FallbackFlags are added to the fallback compile command.
+   // Adjuster is applied to all commands, fallback or not.
+-  OverlayCDB(
+-      const GlobalCompilationDatabase *Base,
+-      std::vector<std::string> FallbackFlags = {},
+-      CommandMangler Mangler = nullptr,
+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
++  OverlayCDB(const GlobalCompilationDatabase *Base,
++             std::vector<std::string> FallbackFlags = {},
++             CommandMangler Mangler = nullptr);
+ 
+   std::optional<tooling::CompileCommand>
+   getCompileCommand(PathRef File) const override;
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
+--- a/clang-tools-extra/clangd/tool/Check.cpp
++++ b/clang-tools-extra/clangd/tool/Check.cpp
+@@ -169,8 +169,6 @@
+   bool buildCommand(const ThreadsafeFS &TFS) {
+     log("Loading compilation database...");
+     DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
+-    if (Opts.StrongWorkspaceMode)
+-      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
+     CDBOpts.CompileCommandsDir =
+         Config::current().CompileFlags.CDBSearch.FixedCDBPath;
+     BaseCDB =
+@@ -180,10 +178,8 @@
+         getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs));
+     if (Opts.ResourceDir)
+       Mangler.ResourceDir = *Opts.ResourceDir;
+-
+     CDB = std::make_unique<OverlayCDB>(
+-        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler),
+-        CDBOpts.FallbackWorkingDirectory);
++        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler));
+ 
+     if (auto TrueCmd = CDB->getCompileCommand(File)) {
+       Cmd = std::move(*TrueCmd);
+@@ -506,7 +502,7 @@
+                  config::DiagnosticCallback Diag) const override {
+       config::Fragment F;
+       // If we're timing clang-tidy checks, implicitly disabling the slow ones
+-      // is counterproductive!
++      // is counterproductive! 
+       if (CheckTidyTime.getNumOccurrences())
+         F.Diagnostics.ClangTidy.FastCheckFilter.emplace("None");
+       return {std::move(F).compile(Diag)};
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
+--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
++++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
+@@ -500,17 +500,6 @@
+     init(true),
+ };
+ 
+-opt<bool> StrongWorkspaceMode{
+-    "strong-workspace-mode",
+-    cat(Features),
+-    desc("An alternate mode of operation for clangd, where the clangd instance "
+-         "is used to edit a single workspace.\n"
+-         "When enabled, fallback commands use the workspace directory as their "
+-         "working directory instead of the parent folder."),
+-    init(false),
+-    Hidden,
+-};
+-
+ opt<bool> UseDirtyHeaders{"use-dirty-headers", cat(Misc),
+                           desc("Use files open in the editor when parsing "
+                                "headers instead of reading from the disk"),
+@@ -918,7 +907,6 @@
+   }
+   if (!ResourceDir.empty())
+     Opts.ResourceDir = ResourceDir;
+-  Opts.StrongWorkspaceMode = StrongWorkspaceMode;
+   Opts.BuildDynamicSymbolIndex = true;
+ #if CLANGD_ENABLE_REMOTE
+   if (RemoteIndexAddress.empty() != ProjectRoot.empty()) {
+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
+--- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
++++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
+@@ -55,20 +55,6 @@
+                                            testPath("foo/bar")));
+ }
+ 
+-TEST(GlobalCompilationDatabaseTest, FallbackWorkingDirectory) {
+-  MockFS TFS;
+-  DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
+-  CDBOpts.applyFallbackWorkingDirectory(testPath("foo"));
+-  EXPECT_EQ(CDBOpts.FallbackWorkingDirectory, testPath("foo"));
+-
+-  DirectoryBasedGlobalCompilationDatabase DB(CDBOpts);
+-  auto Cmd = DB.getFallbackCommand(testPath("foo/src/bar.cc"));
+-  EXPECT_EQ(Cmd.Directory, testPath("foo"));
+-  EXPECT_THAT(Cmd.CommandLine,
+-              ElementsAre("clang", testPath("foo/src/bar.cc")));
+-  EXPECT_EQ(Cmd.Output, "");
+-}
+-
+ static tooling::CompileCommand cmd(llvm::StringRef File, llvm::StringRef Arg) {
+   return tooling::CompileCommand(
+       testRoot(), File, {"clang", std::string(Arg), std::string(File)}, "");
+diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
+--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
++++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
+@@ -618,8 +618,6 @@
+ 
+   DenseSet<const MachineBasicBlock *> DirtyBBs;
+   for (MachineBasicBlock &MBB : MF) {
+-    if (!MDT->isReachableFromEntry(&MBB))
+-      continue;
+     if (MBB.isEHPad()) {
+       DirtyBBs.insert(&MBB);
+       continue;
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+@@ -708,53 +708,6 @@
+   return 2;
+ }
+ 
+-bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
+-                               const TargetInstrInfo &TII) {
+-  for (MachineInstr &MI : MBB->terminators()) {
+-    unsigned Opc = MI.getOpcode();
+-    switch (Opc) {
+-    case AArch64::CBZW:
+-    case AArch64::CBZX:
+-    case AArch64::TBZW:
+-    case AArch64::TBZX:
+-      // CBZ/TBZ with WZR/XZR -> unconditional B
+-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+-          MI.getOperand(0).getReg() == AArch64::XZR) {
+-        DEBUG_WITH_TYPE("optimizeTerminators",
+-                        dbgs() << "Removing always taken branch: " << MI);
+-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+-        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
+-        for (auto *S : Succs)
+-          if (S != Target)
+-            MBB->removeSuccessor(S);
+-        DebugLoc DL = MI.getDebugLoc();
+-        while (MBB->rbegin() != &MI)
+-          MBB->rbegin()->eraseFromParent();
+-        MI.eraseFromParent();
+-        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
+-        return true;
+-      }
+-      break;
+-    case AArch64::CBNZW:
+-    case AArch64::CBNZX:
+-    case AArch64::TBNZW:
+-    case AArch64::TBNZX:
+-      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
+-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+-          MI.getOperand(0).getReg() == AArch64::XZR) {
+-        DEBUG_WITH_TYPE("optimizeTerminators",
+-                        dbgs() << "Removing never taken branch: " << MI);
+-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+-        MI.getParent()->removeSuccessor(Target);
+-        MI.eraseFromParent();
+-        return true;
+-      }
+-      break;
+-    }
+-  }
+-  return false;
+-}
+-
+ // Find the original register that VReg is copied from.
+ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+   while (Register::isVirtualRegister(VReg)) {
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+@@ -705,8 +705,6 @@
+                               unsigned *OutUnscaledOp = nullptr,
+                               int64_t *EmittableOffset = nullptr);
+ 
+-bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
+-
+ static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
+ 
+ static inline bool isCondBranchOpcode(int Opc) {
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+--- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
++++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+@@ -14,7 +14,6 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "AArch64.h"
+-#include "AArch64InstrInfo.h"
+ #include "llvm/CodeGen/MachineFunctionPass.h"
+ #include "llvm/CodeGen/MachineInstrBuilder.h"
+ #include "llvm/CodeGen/TargetInstrInfo.h"
+@@ -46,6 +45,51 @@
+                 "AArch64 Redundant Conditional Branch Elimination pass", false,
+                 false)
+ 
++static bool optimizeTerminators(MachineBasicBlock *MBB,
++                                const TargetInstrInfo &TII) {
++  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
++    unsigned Opc = MI.getOpcode();
++    switch (Opc) {
++    case AArch64::CBZW:
++    case AArch64::CBZX:
++    case AArch64::TBZW:
++    case AArch64::TBZX:
++      // CBZ/TBZ with WZR/XZR -> unconditional B
++      if (MI.getOperand(0).getReg() == AArch64::WZR ||
++          MI.getOperand(0).getReg() == AArch64::XZR) {
++        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
++        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
++        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
++        for (auto *S : Succs)
++          if (S != Target)
++            MBB->removeSuccessor(S);
++        DebugLoc DL = MI.getDebugLoc();
++        while (MBB->rbegin() != &MI)
++          MBB->rbegin()->eraseFromParent();
++        MI.eraseFromParent();
++        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
++        return true;
++      }
++      break;
++    case AArch64::CBNZW:
++    case AArch64::CBNZX:
++    case AArch64::TBNZW:
++    case AArch64::TBNZX:
++      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
++      if (MI.getOperand(0).getReg() == AArch64::WZR ||
++          MI.getOperand(0).getReg() == AArch64::XZR) {
++        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
++        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
++        MI.getParent()->removeSuccessor(Target);
++        MI.eraseFromParent();
++        return true;
++      }
++      break;
++    }
++  }
++  return false;
++}
++
+ bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
+   if (skipFunction(MF.getFunction()))
+     return false;
+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+--- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
++++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+@@ -50,7 +50,6 @@
+ //        to use WZR/XZR directly in some cases.
+ //===----------------------------------------------------------------------===//
+ #include "AArch64.h"
+-#include "AArch64InstrInfo.h"
+ #include "llvm/ADT/SetVector.h"
+ #include "llvm/ADT/Statistic.h"
+ #include "llvm/ADT/iterator_range.h"
+@@ -476,7 +475,6 @@
+     return false;
+   TRI = MF.getSubtarget().getRegisterInfo();
+   MRI = &MF.getRegInfo();
+-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ 
+   // Resize the clobbered and used register unit trackers.  We do this once per
+   // function.
+@@ -486,10 +484,8 @@
+   OptBBUsedRegs.init(*TRI);
+ 
+   bool Changed = false;
+-  for (MachineBasicBlock &MBB : MF) {
+-    Changed |= optimizeTerminators(&MBB, TII);
++  for (MachineBasicBlock &MBB : MF)
+     Changed |= optimizeBlock(&MBB);
+-  }
+   return Changed;
+ }
+ 
+diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+@@ -1827,12 +1827,8 @@
+     // profile info.
+     CostTooHigh =
+         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+-    if (CostTooHigh) {
+-      // Mark runtime checks as never succeeding when they exceed the threshold.
+-      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+-      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
++    if (CostTooHigh)
+       return;
+-    }
+ 
+     BasicBlock *LoopHeader = L->getHeader();
+     BasicBlock *Preheader = L->getLoopPreheader();
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
++++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+@@ -735,15 +735,21 @@
+ ; ENABLE-NEXT:    .cfi_offset w29, -16
+ ; ENABLE-NEXT:    .cfi_offset w19, -24
+ ; ENABLE-NEXT:    .cfi_offset w20, -32
++; ENABLE-NEXT:  ; %bb.1: ; %if.then
+ ; ENABLE-NEXT:    sub x19, sp, #16
+ ; ENABLE-NEXT:    mov sp, x19
+ ; ENABLE-NEXT:    mov w20, wzr
+-; ENABLE-NEXT:  LBB10_1: ; %for.body
++; ENABLE-NEXT:  LBB10_2: ; %for.body
+ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+ ; ENABLE-NEXT:    bl _something
+ ; ENABLE-NEXT:    add w20, w0, w20
+ ; ENABLE-NEXT:    str w20, [x19]
+-; ENABLE-NEXT:    b LBB10_1
++; ENABLE-NEXT:    b LBB10_2
++; ENABLE-NEXT:  ; %bb.3: ; %if.end
++; ENABLE-NEXT:    sub sp, x29, #16
++; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
++; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
++; ENABLE-NEXT:    ret
+ ;
+ ; DISABLE-LABEL: infiniteloop:
+ ; DISABLE:       ; %bb.0: ; %entry
+@@ -755,15 +761,21 @@
+ ; DISABLE-NEXT:    .cfi_offset w29, -16
+ ; DISABLE-NEXT:    .cfi_offset w19, -24
+ ; DISABLE-NEXT:    .cfi_offset w20, -32
++; DISABLE-NEXT:  ; %bb.1: ; %if.then
+ ; DISABLE-NEXT:    sub x19, sp, #16
+ ; DISABLE-NEXT:    mov sp, x19
+ ; DISABLE-NEXT:    mov w20, wzr
+-; DISABLE-NEXT:  LBB10_1: ; %for.body
++; DISABLE-NEXT:  LBB10_2: ; %for.body
+ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+ ; DISABLE-NEXT:    bl _something
+ ; DISABLE-NEXT:    add w20, w0, w20
+ ; DISABLE-NEXT:    str w20, [x19]
+-; DISABLE-NEXT:    b LBB10_1
++; DISABLE-NEXT:    b LBB10_2
++; DISABLE-NEXT:  ; %bb.3: ; %if.end
++; DISABLE-NEXT:    sub sp, x29, #16
++; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
++; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
++; DISABLE-NEXT:    ret
+ entry:
+   br i1 undef, label %if.then, label %if.end
+ 
+@@ -794,10 +806,11 @@
+ ; ENABLE-NEXT:    .cfi_offset w29, -16
+ ; ENABLE-NEXT:    .cfi_offset w19, -24
+ ; ENABLE-NEXT:    .cfi_offset w20, -32
++; ENABLE-NEXT:  ; %bb.1: ; %if.then
+ ; ENABLE-NEXT:    sub x8, sp, #16
+ ; ENABLE-NEXT:    mov sp, x8
+ ; ENABLE-NEXT:    mov w9, wzr
+-; ENABLE-NEXT:  LBB11_1: ; %for.body
++; ENABLE-NEXT:  LBB11_2: ; %for.body
+ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+ ; ENABLE-NEXT:    ; InlineAsm Start
+ ; ENABLE-NEXT:    mov x10, #0 ; =0x0
+@@ -808,7 +821,12 @@
+ ; ENABLE-NEXT:    ; InlineAsm Start
+ ; ENABLE-NEXT:    nop
+ ; ENABLE-NEXT:    ; InlineAsm End
+-; ENABLE-NEXT:    b LBB11_1
++; ENABLE-NEXT:    b LBB11_2
++; ENABLE-NEXT:  ; %bb.3: ; %if.end
++; ENABLE-NEXT:    sub sp, x29, #16
++; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
++; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
++; ENABLE-NEXT:    ret
+ ;
+ ; DISABLE-LABEL: infiniteloop2:
+ ; DISABLE:       ; %bb.0: ; %entry
+@@ -820,10 +838,11 @@
+ ; DISABLE-NEXT:    .cfi_offset w29, -16
+ ; DISABLE-NEXT:    .cfi_offset w19, -24
+ ; DISABLE-NEXT:    .cfi_offset w20, -32
++; DISABLE-NEXT:  ; %bb.1: ; %if.then
+ ; DISABLE-NEXT:    sub x8, sp, #16
+ ; DISABLE-NEXT:    mov sp, x8
+ ; DISABLE-NEXT:    mov w9, wzr
+-; DISABLE-NEXT:  LBB11_1: ; %for.body
++; DISABLE-NEXT:  LBB11_2: ; %for.body
+ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+ ; DISABLE-NEXT:    ; InlineAsm Start
+ ; DISABLE-NEXT:    mov x10, #0 ; =0x0
+@@ -834,7 +853,12 @@
+ ; DISABLE-NEXT:    ; InlineAsm Start
+ ; DISABLE-NEXT:    nop
+ ; DISABLE-NEXT:    ; InlineAsm End
+-; DISABLE-NEXT:    b LBB11_1
++; DISABLE-NEXT:    b LBB11_2
++; DISABLE-NEXT:  ; %bb.3: ; %if.end
++; DISABLE-NEXT:    sub sp, x29, #16
++; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
++; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
++; DISABLE-NEXT:    ret
+ entry:
+   br i1 undef, label %if.then, label %if.end
+ 
+@@ -865,43 +889,49 @@
+ define void @infiniteloop3() {
+ ; ENABLE-LABEL: infiniteloop3:
+ ; ENABLE:       ; %bb.0: ; %entry
++; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
+ ; ENABLE-NEXT:    mov x8, xzr
+ ; ENABLE-NEXT:    mov x9, xzr
+ ; ENABLE-NEXT:    mov x11, xzr
+-; ENABLE-NEXT:    b LBB12_2
+-; ENABLE-NEXT:  LBB12_1: ; %loop2b
+-; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
++; ENABLE-NEXT:    b LBB12_3
++; ENABLE-NEXT:  LBB12_2: ; %loop2b
++; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
+ ; ENABLE-NEXT:    str x10, [x11]
+ ; ENABLE-NEXT:    mov x11, x10
+-; ENABLE-NEXT:  LBB12_2: ; %loop1
++; ENABLE-NEXT:  LBB12_3: ; %loop1
+ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+ ; ENABLE-NEXT:    mov x10, x9
+ ; ENABLE-NEXT:    ldr x9, [x8]
+-; ENABLE-NEXT:    cbnz x8, LBB12_1
+-; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
++; ENABLE-NEXT:    cbnz x8, LBB12_2
++; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
+ ; ENABLE-NEXT:    mov x8, x10
+ ; ENABLE-NEXT:    mov x11, x10
+-; ENABLE-NEXT:    b LBB12_2
++; ENABLE-NEXT:    b LBB12_3
++; ENABLE-NEXT:  ; %bb.5: ; %end
++; ENABLE-NEXT:    ret
+ ;
+ ; DISABLE-LABEL: infiniteloop3:
+ ; DISABLE:       ; %bb.0: ; %entry
++; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
+ ; DISABLE-NEXT:    mov x8, xzr
+ ; DISABLE-NEXT:    mov x9, xzr
+ ; DISABLE-NEXT:    mov x11, xzr
+-; DISABLE-NEXT:    b LBB12_2
+-; DISABLE-NEXT:  LBB12_1: ; %loop2b
+-; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
++; DISABLE-NEXT:    b LBB12_3
++; DISABLE-NEXT:  LBB12_2: ; %loop2b
++; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
+ ; DISABLE-NEXT:    str x10, [x11]
+ ; DISABLE-NEXT:    mov x11, x10
+-; DISABLE-NEXT:  LBB12_2: ; %loop1
++; DISABLE-NEXT:  LBB12_3: ; %loop1
+ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+ ; DISABLE-NEXT:    mov x10, x9
+ ; DISABLE-NEXT:    ldr x9, [x8]
+-; DISABLE-NEXT:    cbnz x8, LBB12_1
+-; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
++; DISABLE-NEXT:    cbnz x8, LBB12_2
++; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
+ ; DISABLE-NEXT:    mov x8, x10
+ ; DISABLE-NEXT:    mov x11, x10
+-; DISABLE-NEXT:    b LBB12_2
++; DISABLE-NEXT:    b LBB12_3
++; DISABLE-NEXT:  ; %bb.5: ; %end
++; DISABLE-NEXT:    ret
+ entry:
+   br i1 undef, label %loop2a, label %body
+ 
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+--- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
++++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+@@ -8,14 +8,20 @@
+ define i8 @foo_optsize(i32 %v4) optsize {
+ ; CHECK-LABEL: foo_optsize:
+ ; CHECK:       // %bb.0: // %entry
+-; CHECK-NEXT:    cbnz w0, .LBB0_2
+-; CHECK-NEXT:  // %bb.1: // %b2
+-; CHECK-NEXT:    mov w0, #1 // =0x1
++; CHECK-NEXT:    b .LBB0_2
++; CHECK-NEXT:  .LBB0_1:
++; CHECK-NEXT:    mov w0, wzr
+ ; CHECK-NEXT:    ret
+ ; CHECK-NEXT:  .LBB0_2: // %b1
+-; CHECK-NEXT:    cmp w0, #1
+-; CHECK-NEXT:    mov w0, wzr
++; CHECK-NEXT:    cbnz w0, .LBB0_4
++; CHECK-NEXT:  // %bb.3: // %b2
++; CHECK-NEXT:    mov w0, #1 // =0x1
+ ; CHECK-NEXT:    ret
++; CHECK-NEXT:  .LBB0_4: // %b1
++; CHECK-NEXT:    cmp w0, #1
++; CHECK-NEXT:    b.ne .LBB0_1
++; CHECK-NEXT:  // %bb.5: // %b3
++; CHECK-NEXT:    b .LBB0_1
+ entry:
+   %v2 = icmp eq i32 0, 0
+   br i1 %v2, label %b1, label %b4
+@@ -41,14 +47,20 @@
+ define i8 @foo_optspeed(i32 %v4) {
+ ; CHECK-LABEL: foo_optspeed:
+ ; CHECK:       // %bb.0: // %entry
+-; CHECK-NEXT:    cbnz w0, .LBB1_2
+-; CHECK-NEXT:  // %bb.1: // %b2
+-; CHECK-NEXT:    mov w0, #1 // =0x1
++; CHECK-NEXT:    b .LBB1_2
++; CHECK-NEXT:  .LBB1_1:
++; CHECK-NEXT:    mov w0, wzr
+ ; CHECK-NEXT:    ret
+ ; CHECK-NEXT:  .LBB1_2: // %b1
+-; CHECK-NEXT:    cmp w0, #1
+-; CHECK-NEXT:    mov w0, wzr
++; CHECK-NEXT:    cbnz w0, .LBB1_4
++; CHECK-NEXT:  // %bb.3: // %b2
++; CHECK-NEXT:    mov w0, #1 // =0x1
+ ; CHECK-NEXT:    ret
++; CHECK-NEXT:  .LBB1_4: // %b1
++; CHECK-NEXT:    cmp w0, #1
++; CHECK-NEXT:    b.ne .LBB1_1
++; CHECK-NEXT:  // %bb.5: // %b3
++; CHECK-NEXT:    b .LBB1_1
+ entry:
+   %v2 = icmp eq i32 0, 0
+   br i1 %v2, label %b1, label %b4
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+--- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
++++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+@@ -21,8 +21,10 @@
+   ; CHECK-NEXT:   B %bb.3
+   ; CHECK-NEXT: {{  $}}
+   ; CHECK-NEXT: bb.1.bb:
++  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
+   ; CHECK-NEXT:   liveins: $w0, $lr
+   ; CHECK-NEXT: {{  $}}
++  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
+   ; CHECK-NEXT:   B %bb.2
+   ; CHECK-NEXT: {{  $}}
+   ; CHECK-NEXT: bb.2.bb1:
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
+--- a/llvm/test/CodeGen/AArch64/pr164181.ll
++++ b/llvm/test/CodeGen/AArch64/pr164181.ll
+@@ -29,11 +29,11 @@
+ ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
+ ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
+ ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
+-; CHECK-NEXT:    tbz w5, #0, .LBB0_40
++; CHECK-NEXT:    tbz w5, #0, .LBB0_43
+ ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
+ ; CHECK-NEXT:    ldr x4, [sp, #312]
+ ; CHECK-NEXT:    ldr x14, [sp, #280]
+-; CHECK-NEXT:    tbz w0, #0, .LBB0_39
++; CHECK-NEXT:    tbz w0, #0, .LBB0_42
+ ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
+ ; CHECK-NEXT:    ldrb w8, [sp, #368]
+ ; CHECK-NEXT:    ldrb w12, [sp, #256]
+@@ -92,7 +92,7 @@
+ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+ ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
+ ; CHECK-NEXT:    mov x12, x24
+ ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
+@@ -117,7 +117,7 @@
+ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+ ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
+ ; CHECK-NEXT:    cmn x24, #30
+ ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
+@@ -142,7 +142,7 @@
+ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+ ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
+ ; CHECK-NEXT:    mov w14, #1152 // =0x480
+ ; CHECK-NEXT:    mov w24, #1 // =0x1
+@@ -176,7 +176,7 @@
+ ; CHECK-NEXT:    // => This Loop Header: Depth=4
+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+ ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
+ ; CHECK-NEXT:    and w8, w8, w8, asr #31
+ ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
+@@ -281,23 +281,31 @@
+ ; CHECK-NEXT:    mov x24, xzr
+ ; CHECK-NEXT:    mul w12, w12, w22
+ ; CHECK-NEXT:    mov x22, x5
+-; CHECK-NEXT:    tbz w0, #0, .LBB0_33
+-; CHECK-NEXT:  .LBB0_28: // %if.then222.us
++; CHECK-NEXT:    tbz w0, #0, .LBB0_36
++; CHECK-NEXT:  .LBB0_28: // %for.body194.us
+ ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
+ ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
+ ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
+ ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
+ ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
++; CHECK-NEXT:  // %bb.29: // %if.then222.us
++; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+ ; CHECK-NEXT:    adrp x27, :got:var_32
+ ; CHECK-NEXT:    ldur w8, [x19, #-12]
+ ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
+ ; CHECK-NEXT:    strh w8, [x27]
+ ; CHECK-NEXT:    sxtb w8, w25
+-; CHECK-NEXT:    strb w3, [x16]
+ ; CHECK-NEXT:    bic w25, w8, w8, asr #31
++; CHECK-NEXT:    b .LBB0_31
++; CHECK-NEXT:    .p2align 5, , 16
++; CHECK-NEXT:  // %bb.30:
++; CHECK-NEXT:    mov w25, wzr
++; CHECK-NEXT:  .LBB0_31: // %if.end239.us
++; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
++; CHECK-NEXT:    strb w3, [x16]
+ ; CHECK-NEXT:    tst w13, #0xff
+-; CHECK-NEXT:    b.eq .LBB0_30
+-; CHECK-NEXT:  // %bb.29: // %if.then254.us
++; CHECK-NEXT:    b.eq .LBB0_33
++; CHECK-NEXT:  // %bb.32: // %if.then254.us
+ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+ ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
+ ; CHECK-NEXT:    adrp x27, :got:var_35
+@@ -306,7 +314,7 @@
+ ; CHECK-NEXT:    csel x8, xzr, x7, eq
+ ; CHECK-NEXT:    str x8, [x27]
+ ; CHECK-NEXT:    strh w1, [x17]
+-; CHECK-NEXT:  .LBB0_30: // %if.end282.us
++; CHECK-NEXT:  .LBB0_33: // %if.end282.us
+ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+ ; CHECK-NEXT:    orr x27, x24, x4
+ ; CHECK-NEXT:    adrp x8, :got:var_39
+@@ -317,14 +325,14 @@
+ ; CHECK-NEXT:    str x8, [x18]
+ ; CHECK-NEXT:    mov w8, #1 // =0x1
+ ; CHECK-NEXT:    cbnz x2, .LBB0_27
+-; CHECK-NEXT:  // %bb.31: // %if.then327.us
++; CHECK-NEXT:  // %bb.34: // %if.then327.us
+ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+ ; CHECK-NEXT:    cbz w8, .LBB0_25
+-; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
++; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
+ ; CHECK-NEXT:    mov w4, wzr
+ ; CHECK-NEXT:    b .LBB0_26
+ ; CHECK-NEXT:    .p2align 5, , 16
+-; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
++; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
+ ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
+ ; CHECK-NEXT:    mov w3, #1152 // =0x480
+ ; CHECK-NEXT:    mov x22, xzr
+@@ -335,24 +343,24 @@
+ ; CHECK-NEXT:    madd x14, x14, x3, x11
+ ; CHECK-NEXT:    mov w28, w30
+ ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
+-; CHECK-NEXT:    b .LBB0_36
++; CHECK-NEXT:    b .LBB0_39
+ ; CHECK-NEXT:    .p2align 5, , 16
+-; CHECK-NEXT:  .LBB0_34: // %if.then466.us
+-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
++; CHECK-NEXT:  .LBB0_37: // %if.then466.us
++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
+ ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
+ ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
+ ; CHECK-NEXT:    sxtb w4, w4
+ ; CHECK-NEXT:    bic w4, w4, w4, asr #31
+ ; CHECK-NEXT:    str x3, [x28]
+ ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
+-; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
+-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
++; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
+ ; CHECK-NEXT:    add x22, x22, #1
+ ; CHECK-NEXT:    add x27, x27, #1
+ ; CHECK-NEXT:    mov w28, wzr
+ ; CHECK-NEXT:    cmp x27, #0
+ ; CHECK-NEXT:    b.hs .LBB0_9
+-; CHECK-NEXT:  .LBB0_36: // %for.body380.us
++; CHECK-NEXT:  .LBB0_39: // %for.body380.us
+ ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
+ ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
+ ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
+@@ -364,18 +372,18 @@
+ ; CHECK-NEXT:    strh w28, [x11]
+ ; CHECK-NEXT:    csel w28, w21, w3, ne
+ ; CHECK-NEXT:    str w28, [x20]
+-; CHECK-NEXT:    cbz x15, .LBB0_35
+-; CHECK-NEXT:  // %bb.37: // %if.then436.us
+-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
++; CHECK-NEXT:    cbz x15, .LBB0_38
++; CHECK-NEXT:  // %bb.40: // %if.then436.us
++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
+ ; CHECK-NEXT:    ldrh w28, [x14]
+-; CHECK-NEXT:    cbnz w28, .LBB0_34
+-; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
++; CHECK-NEXT:    cbnz w28, .LBB0_37
++; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
+ ; CHECK-NEXT:    mov w4, wzr
+-; CHECK-NEXT:    b .LBB0_35
+-; CHECK-NEXT:  .LBB0_39: // %for.body41
++; CHECK-NEXT:    b .LBB0_38
++; CHECK-NEXT:  .LBB0_42: // %for.body41
+ ; CHECK-NEXT:    strb wzr, [x4]
+ ; CHECK-NEXT:    strb wzr, [x14]
+-; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
++; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
+ ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+ ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+ ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
+--- a/llvm/test/CodeGen/AArch64/pr166870.ll
++++ b/llvm/test/CodeGen/AArch64/pr166870.ll
+@@ -26,11 +26,12 @@
+ ; CHECK-NEXT:    mov x21, x1
+ ; CHECK-NEXT:    bl baz
+ ; CHECK-NEXT:    mov w0, #0 // =0x0
++; CHECK-NEXT:  // %bb.5: // %bb6
+ ; CHECK-NEXT:    mov w10, #1 // =0x1
++; CHECK-NEXT:    cbnz w10, .LBB0_11
++; CHECK-NEXT:  // %bb.6: // %bb7
+ ; CHECK-NEXT:    cbnz w10, .LBB0_10
+-; CHECK-NEXT:  // %bb.5: // %bb7
+-; CHECK-NEXT:    cbnz w10, .LBB0_9
+-; CHECK-NEXT:  // %bb.6: // %bb8
++; CHECK-NEXT:  // %bb.7: // %bb8
+ ; CHECK-NEXT:    mov x8, x21
+ ; CHECK-NEXT:    mov x9, x20
+ ; CHECK-NEXT:    mov w20, #0 // =0x0
+@@ -38,17 +39,17 @@
+ ; CHECK-NEXT:    mov x21, x9
+ ; CHECK-NEXT:    mov w8, w8
+ ; CHECK-NEXT:    mov x22, x8
+-; CHECK-NEXT:  .LBB0_7: // %bb10
++; CHECK-NEXT:  .LBB0_8: // %bb10
+ ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+ ; CHECK-NEXT:    strb w20, [x19]
+-; CHECK-NEXT:    cbnz x21, .LBB0_7
+-; CHECK-NEXT:  // %bb.8: // %bb12
+-; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
++; CHECK-NEXT:    cbnz x21, .LBB0_8
++; CHECK-NEXT:  // %bb.9: // %bb12
++; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
+ ; CHECK-NEXT:    bl snork
+-; CHECK-NEXT:    cbnz x22, .LBB0_7
+-; CHECK-NEXT:  .LBB0_9:
+-; CHECK-NEXT:    mov w0, #0 // =0x0
++; CHECK-NEXT:    cbnz x22, .LBB0_8
+ ; CHECK-NEXT:  .LBB0_10:
++; CHECK-NEXT:    mov w0, #0 // =0x0
++; CHECK-NEXT:  .LBB0_11:
+ ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+ ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+ ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
++++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+@@ -71,21 +71,27 @@
+ ; CHECK-NEXT:    .cfi_def_cfa w29, 16
+ ; CHECK-NEXT:    .cfi_offset w30, -8
+ ; CHECK-NEXT:    .cfi_offset w29, -16
++; CHECK-NEXT:    .cfi_remember_state
+ ; CHECK-NEXT:    mov w8, #1 // =0x1
+-; CHECK-NEXT:    mov w9, #2 // =0x2
+ ; CHECK-NEXT:    stur xzr, [x29, #-8]
+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-; CHECK-NEXT:    ldur w8, [x29, #-8]
+-; CHECK-NEXT:    cbz w8, .LBB0_2
++; CHECK-NEXT:    b .LBB0_3
+ ; CHECK-NEXT:  // %bb.1:
+-; CHECK-NEXT:    mov w8, #1 // =0x1
+ ; CHECK-NEXT:    str w8, [sp, #16]
+-; CHECK-NEXT:    b .LBB0_3
++; CHECK-NEXT:    ldur w8, [x29, #-8]
++; CHECK-NEXT:    cbz w8, .LBB0_4
+ ; CHECK-NEXT:  .LBB0_2:
++; CHECK-NEXT:    .cfi_restore_state
+ ; CHECK-NEXT:    mov w8, #1 // =0x1
+-; CHECK-NEXT:    mov w9, #2 // =0x2
+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++; CHECK-NEXT:    str w8, [sp, #16]
++; CHECK-NEXT:    b .LBB0_5
+ ; CHECK-NEXT:  .LBB0_3:
++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++; CHECK-NEXT:    ldur w8, [x29, #-8]
++; CHECK-NEXT:    cbnz w8, .LBB0_2
++; CHECK-NEXT:  .LBB0_4:
++; CHECK-NEXT:    mov w8, #1 // =0x1
++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++; CHECK-NEXT:  .LBB0_5:
+ ; CHECK-NEXT:    mov w0, wzr
+ ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+ ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+@@ -128,6 +134,7 @@
+ ;
+ ; CHECK-LABEL: OUTLINED_FUNCTION_0:
+ ; CHECK:       // %bb.0:
++; CHECK-NEXT:    mov w9, #2 // =0x2
+ ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
+ ; CHECK-NEXT:    mov w9, #3 // =0x3
+ ; CHECK-NEXT:    mov w8, #4 // =0x4
+diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
++++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+@@ -12,21 +12,27 @@
+ ; CHECK-NEXT:    .cfi_def_cfa w29, 16
+ ; CHECK-NEXT:    .cfi_offset w30, -8
+ ; CHECK-NEXT:    .cfi_offset w29, -16
++; CHECK-NEXT:    .cfi_remember_state
+ ; CHECK-NEXT:    mov w8, #1 // =0x1
+-; CHECK-NEXT:    mov w9, #2 // =0x2
+ ; CHECK-NEXT:    stur xzr, [x29, #-8]
+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-; CHECK-NEXT:    ldur w8, [x29, #-8]
+-; CHECK-NEXT:    cbz w8, .LBB0_2
++; CHECK-NEXT:    b .LBB0_3
+ ; CHECK-NEXT:  // %bb.1:
+-; CHECK-NEXT:    mov w8, #1 // =0x1
+ ; CHECK-NEXT:    str w8, [sp, #16]
+-; CHECK-NEXT:    b .LBB0_3
++; CHECK-NEXT:    ldur w8, [x29, #-8]
++; CHECK-NEXT:    cbz w8, .LBB0_4
+ ; CHECK-NEXT:  .LBB0_2:
++; CHECK-NEXT:    .cfi_restore_state
+ ; CHECK-NEXT:    mov w8, #1 // =0x1
+-; CHECK-NEXT:    mov w9, #2 // =0x2
+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++; CHECK-NEXT:    str w8, [sp, #16]
++; CHECK-NEXT:    b .LBB0_5
+ ; CHECK-NEXT:  .LBB0_3:
++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++; CHECK-NEXT:    ldur w8, [x29, #-8]
++; CHECK-NEXT:    cbnz w8, .LBB0_2
++; CHECK-NEXT:  .LBB0_4:
++; CHECK-NEXT:    mov w8, #1 // =0x1
++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++; CHECK-NEXT:  .LBB0_5:
+ ; CHECK-NEXT:    mov w0, wzr
+ ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+ ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+--- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
++++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+@@ -2,23 +2,29 @@
+ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
+ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
+ 
+-; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
+-; no runtime check is generated and the loop should not be vectorized.
++; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
++; no runtime check is generated even though one is needed and !noalias
++; annotations are added.
+ define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
+ ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
+ ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+-; LIMIT0-NEXT:  [[ENTRY:.*]]:
+-; LIMIT0-NEXT:    br label %[[LOOP:.*]]
+-; LIMIT0:       [[LOOP]]:
+-; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+-; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
++; LIMIT0-NEXT:  [[ENTRY:.*:]]
++; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
++; LIMIT0:       [[VECTOR_PH]]:
++; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
++; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
++; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
++; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
++; LIMIT0:       [[VECTOR_BODY]]:
++; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+ ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+-; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
+-; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
++; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
++; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+ ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+-; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
++; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
++; LIMIT0:       [[MIDDLE_BLOCK]]:
++; LIMIT0-NEXT:    br label %[[EXIT:.*]]
+ ; LIMIT0:       [[EXIT]]:
+-; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
+ ; LIMIT0-NEXT:    ret i16 [[TMP0]]
+ ;
+ ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
+@@ -82,9 +88,14 @@
+ !3 = !{!"llvm.loop.vectorize.enable", i1 true}
+ 
+ ;.
+-; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+-; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
+-; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
++; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
++; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
++; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
++; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
++; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
++; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
++; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
++; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+ ;.
+ ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
+ ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+diff -ruN --strip-trailing-cr a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
++++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+@@ -1320,8 +1320,9 @@
+ }
+ 
+ template <typename T>
+-T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
+-                               StringRef entryType, uint64_t depth) {
++T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
++                               uint64_t index, StringRef entryType,
++                               uint64_t depth) {
+   if (index >= entries.size()) {
+     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
+     return {};
diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index dd3d4e4de4509d..e573782a756d19 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "c6e23ab80753a01dce270f5f8a133fbec942315d"
-    LLVM_SHA256 = "5a6b8aacd2d87ce9c4456843a76d0a54fd7cd0ae788ed3f19e7487ecd2ce4326"
+    LLVM_COMMIT = "87bf5ee23863bc0b467ee44b2184b2c134a98464"
+    LLVM_SHA256 = "9d0bca271bfb266de8453cd34156741fd41f64b911f580262d187ce4d4d9b6d9"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 13d339429b0101..486c1d22e6cb1e 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,15 +1,1094 @@
+diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
+index 509398d..2948da4 100644
+--- a/third_party/llvm/generated.patch
++++ b/third_party/llvm/generated.patch
+@@ -1 +1,1074 @@
+ Auto generated patch. Do not edit or delete it, even if empty.
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
++--- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
+++++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
++@@ -554,8 +554,6 @@
++     if (const auto &Dir = Params.initializationOptions.compilationDatabasePath)
++       CDBOpts.CompileCommandsDir = Dir;
++     CDBOpts.ContextProvider = Opts.ContextProvider;
++-    if (Opts.StrongWorkspaceMode)
++-      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
++     BaseCDB =
++         std::make_unique<DirectoryBasedGlobalCompilationDatabase>(CDBOpts);
++   }
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
++--- a/clang-tools-extra/clangd/ClangdServer.h
+++++ b/clang-tools-extra/clangd/ClangdServer.h
++@@ -152,11 +152,6 @@
++     /// FIXME: If not set, should use the current working directory.
++     std::optional<std::string> WorkspaceRoot;
++ 
++-    /// Sets an alternate mode of operation. Current effects are:
++-    /// - Using the current working directory as the working directory for
++-    ///   fallback commands
++-    bool StrongWorkspaceMode;
++-
++     /// The resource directory is used to find internal headers, overriding
++     /// defaults and -resource-dir compiler flag).
++     /// If std::nullopt, ClangdServer calls
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
++--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+++++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
++@@ -64,9 +64,7 @@
++   if (FileExtension.empty() || FileExtension == ".h")
++     Argv.push_back("-xobjective-c++-header");
++   Argv.push_back(std::string(File));
++-  tooling::CompileCommand Cmd(FallbackWorkingDirectory
++-                                  ? *FallbackWorkingDirectory
++-                                  : llvm::sys::path::parent_path(File),
+++  tooling::CompileCommand Cmd(llvm::sys::path::parent_path(File),
++                               llvm::sys::path::filename(File), std::move(Argv),
++                               /*Output=*/"");
++   Cmd.Heuristic = "clangd fallback";
++@@ -351,8 +349,7 @@
++ 
++ DirectoryBasedGlobalCompilationDatabase::
++     DirectoryBasedGlobalCompilationDatabase(const Options &Opts)
++-    : GlobalCompilationDatabase(Opts.FallbackWorkingDirectory), Opts(Opts),
++-      Broadcaster(std::make_unique<BroadcastThread>(*this)) {
+++    : Opts(Opts), Broadcaster(std::make_unique<BroadcastThread>(*this)) {
++   if (!this->Opts.ContextProvider)
++     this->Opts.ContextProvider = [](llvm::StringRef) {
++       return Context::current().clone();
++@@ -463,21 +460,6 @@
++   return Result;
++ }
++ 
++-void DirectoryBasedGlobalCompilationDatabase::Options::
++-    applyFallbackWorkingDirectory(
++-        std::optional<std::string> FallbackWorkingDirectory) {
++-  if (FallbackWorkingDirectory)
++-    this->FallbackWorkingDirectory = *FallbackWorkingDirectory;
++-  else {
++-    // Clangd is running in strong workspace mode but the client didn't
++-    // specify a workspace path in the `initialize` request.
++-    // Fallback to current working directory.
++-    SmallString<256> CWD;
++-    llvm::sys::fs::current_path(CWD);
++-    this->FallbackWorkingDirectory = std::string(CWD);
++-  }
++-}
++-
++ // The broadcast thread announces files with new compile commands to the world.
++ // Primarily this is used to enqueue them for background indexing.
++ //
++@@ -777,10 +759,9 @@
++ 
++ OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base,
++                        std::vector<std::string> FallbackFlags,
++-                       CommandMangler Mangler,
++-                       std::optional<std::string> FallbackWorkingDirectory)
++-    : DelegatingCDB(Base, FallbackWorkingDirectory),
++-      Mangler(std::move(Mangler)), FallbackFlags(std::move(FallbackFlags)) {}
+++                       CommandMangler Mangler)
+++    : DelegatingCDB(Base), Mangler(std::move(Mangler)),
+++      FallbackFlags(std::move(FallbackFlags)) {}
++ 
++ std::optional<tooling::CompileCommand>
++ OverlayCDB::getCompileCommand(PathRef File) const {
++@@ -863,20 +844,16 @@
++   return MDB;
++ }
++ 
++-DelegatingCDB::DelegatingCDB(
++-    const GlobalCompilationDatabase *Base,
++-    std::optional<std::string> FallbackWorkingDirectory)
++-    : GlobalCompilationDatabase(FallbackWorkingDirectory), Base(Base) {
+++DelegatingCDB::DelegatingCDB(const GlobalCompilationDatabase *Base)
+++    : Base(Base) {
++   if (Base)
++     BaseChanged = Base->watch([this](const std::vector<std::string> Changes) {
++       OnCommandChanged.broadcast(Changes);
++     });
++ }
++ 
++-DelegatingCDB::DelegatingCDB(
++-    std::unique_ptr<GlobalCompilationDatabase> Base,
++-    std::optional<std::string> FallbackWorkingDirectory)
++-    : DelegatingCDB(Base.get(), FallbackWorkingDirectory) {
+++DelegatingCDB::DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base)
+++    : DelegatingCDB(Base.get()) {
++   BaseOwner = std::move(Base);
++ }
++ 
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
++--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h
+++++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
++@@ -35,9 +35,6 @@
++ /// Provides compilation arguments used for parsing C and C++ files.
++ class GlobalCompilationDatabase {
++ public:
++-  GlobalCompilationDatabase(
++-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt)
++-      : FallbackWorkingDirectory(FallbackWorkingDirectory) {}
++   virtual ~GlobalCompilationDatabase() = default;
++ 
++   /// If there are any known-good commands for building this file, returns one.
++@@ -72,19 +69,14 @@
++   }
++ 
++ protected:
++-  std::optional<std::string> FallbackWorkingDirectory;
++   mutable CommandChanged OnCommandChanged;
++ };
++ 
++ // Helper class for implementing GlobalCompilationDatabases that wrap others.
++ class DelegatingCDB : public GlobalCompilationDatabase {
++ public:
++-  DelegatingCDB(
++-      const GlobalCompilationDatabase *Base,
++-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
++-  DelegatingCDB(
++-      std::unique_ptr<GlobalCompilationDatabase> Base,
++-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
+++  DelegatingCDB(const GlobalCompilationDatabase *Base);
+++  DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base);
++ 
++   std::optional<tooling::CompileCommand>
++   getCompileCommand(PathRef File) const override;
++@@ -125,12 +117,6 @@
++     // Only look for a compilation database in this one fixed directory.
++     // FIXME: fold this into config/context mechanism.
++     std::optional<Path> CompileCommandsDir;
++-    // Working directory for fallback commands
++-    // If unset, parent directory of file should be used
++-    std::optional<std::string> FallbackWorkingDirectory;
++-
++-    void applyFallbackWorkingDirectory(
++-        std::optional<std::string> FallbackWorkingDirectory);
++   };
++ 
++   DirectoryBasedGlobalCompilationDatabase(const Options &Opts);
++@@ -208,11 +194,9 @@
++   // Base may be null, in which case no entries are inherited.
++   // FallbackFlags are added to the fallback compile command.
++   // Adjuster is applied to all commands, fallback or not.
++-  OverlayCDB(
++-      const GlobalCompilationDatabase *Base,
++-      std::vector<std::string> FallbackFlags = {},
++-      CommandMangler Mangler = nullptr,
++-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
+++  OverlayCDB(const GlobalCompilationDatabase *Base,
+++             std::vector<std::string> FallbackFlags = {},
+++             CommandMangler Mangler = nullptr);
++ 
++   std::optional<tooling::CompileCommand>
++   getCompileCommand(PathRef File) const override;
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
++--- a/clang-tools-extra/clangd/tool/Check.cpp
+++++ b/clang-tools-extra/clangd/tool/Check.cpp
++@@ -169,8 +169,6 @@
++   bool buildCommand(const ThreadsafeFS &TFS) {
++     log("Loading compilation database...");
++     DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
++-    if (Opts.StrongWorkspaceMode)
++-      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
++     CDBOpts.CompileCommandsDir =
++         Config::current().CompileFlags.CDBSearch.FixedCDBPath;
++     BaseCDB =
++@@ -180,10 +178,8 @@
++         getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs));
++     if (Opts.ResourceDir)
++       Mangler.ResourceDir = *Opts.ResourceDir;
++-
++     CDB = std::make_unique<OverlayCDB>(
++-        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler),
++-        CDBOpts.FallbackWorkingDirectory);
+++        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler));
++ 
++     if (auto TrueCmd = CDB->getCompileCommand(File)) {
++       Cmd = std::move(*TrueCmd);
++@@ -506,7 +502,7 @@
++                  config::DiagnosticCallback Diag) const override {
++       config::Fragment F;
++       // If we're timing clang-tidy checks, implicitly disabling the slow ones
++-      // is counterproductive!
+++      // is counterproductive! 
++       if (CheckTidyTime.getNumOccurrences())
++         F.Diagnostics.ClangTidy.FastCheckFilter.emplace("None");
++       return {std::move(F).compile(Diag)};
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
++--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+++++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
++@@ -500,17 +500,6 @@
++     init(true),
++ };
++ 
++-opt<bool> StrongWorkspaceMode{
++-    "strong-workspace-mode",
++-    cat(Features),
++-    desc("An alternate mode of operation for clangd, where the clangd instance "
++-         "is used to edit a single workspace.\n"
++-         "When enabled, fallback commands use the workspace directory as their "
++-         "working directory instead of the parent folder."),
++-    init(false),
++-    Hidden,
++-};
++-
++ opt<bool> UseDirtyHeaders{"use-dirty-headers", cat(Misc),
++                           desc("Use files open in the editor when parsing "
++                                "headers instead of reading from the disk"),
++@@ -918,7 +907,6 @@
++   }
++   if (!ResourceDir.empty())
++     Opts.ResourceDir = ResourceDir;
++-  Opts.StrongWorkspaceMode = StrongWorkspaceMode;
++   Opts.BuildDynamicSymbolIndex = true;
++ #if CLANGD_ENABLE_REMOTE
++   if (RemoteIndexAddress.empty() != ProjectRoot.empty()) {
++diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
++--- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
+++++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
++@@ -55,20 +55,6 @@
++                                            testPath("foo/bar")));
++ }
++ 
++-TEST(GlobalCompilationDatabaseTest, FallbackWorkingDirectory) {
++-  MockFS TFS;
++-  DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
++-  CDBOpts.applyFallbackWorkingDirectory(testPath("foo"));
++-  EXPECT_EQ(CDBOpts.FallbackWorkingDirectory, testPath("foo"));
++-
++-  DirectoryBasedGlobalCompilationDatabase DB(CDBOpts);
++-  auto Cmd = DB.getFallbackCommand(testPath("foo/src/bar.cc"));
++-  EXPECT_EQ(Cmd.Directory, testPath("foo"));
++-  EXPECT_THAT(Cmd.CommandLine,
++-              ElementsAre("clang", testPath("foo/src/bar.cc")));
++-  EXPECT_EQ(Cmd.Output, "");
++-}
++-
++ static tooling::CompileCommand cmd(llvm::StringRef File, llvm::StringRef Arg) {
++   return tooling::CompileCommand(
++       testRoot(), File, {"clang", std::string(Arg), std::string(File)}, "");
++diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
++--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
++@@ -618,8 +618,6 @@
++ 
++   DenseSet<const MachineBasicBlock *> DirtyBBs;
++   for (MachineBasicBlock &MBB : MF) {
++-    if (!MDT->isReachableFromEntry(&MBB))
++-      continue;
++     if (MBB.isEHPad()) {
++       DirtyBBs.insert(&MBB);
++       continue;
++diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
++--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
++@@ -708,53 +708,6 @@
++   return 2;
++ }
++ 
++-bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
++-                               const TargetInstrInfo &TII) {
++-  for (MachineInstr &MI : MBB->terminators()) {
++-    unsigned Opc = MI.getOpcode();
++-    switch (Opc) {
++-    case AArch64::CBZW:
++-    case AArch64::CBZX:
++-    case AArch64::TBZW:
++-    case AArch64::TBZX:
++-      // CBZ/TBZ with WZR/XZR -> unconditional B
++-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
++-          MI.getOperand(0).getReg() == AArch64::XZR) {
++-        DEBUG_WITH_TYPE("optimizeTerminators",
++-                        dbgs() << "Removing always taken branch: " << MI);
++-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
++-        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
++-        for (auto *S : Succs)
++-          if (S != Target)
++-            MBB->removeSuccessor(S);
++-        DebugLoc DL = MI.getDebugLoc();
++-        while (MBB->rbegin() != &MI)
++-          MBB->rbegin()->eraseFromParent();
++-        MI.eraseFromParent();
++-        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
++-        return true;
++-      }
++-      break;
++-    case AArch64::CBNZW:
++-    case AArch64::CBNZX:
++-    case AArch64::TBNZW:
++-    case AArch64::TBNZX:
++-      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
++-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
++-          MI.getOperand(0).getReg() == AArch64::XZR) {
++-        DEBUG_WITH_TYPE("optimizeTerminators",
++-                        dbgs() << "Removing never taken branch: " << MI);
++-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
++-        MI.getParent()->removeSuccessor(Target);
++-        MI.eraseFromParent();
++-        return true;
++-      }
++-      break;
++-    }
++-  }
++-  return false;
++-}
++-
++ // Find the original register that VReg is copied from.
++ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
++   while (Register::isVirtualRegister(VReg)) {
++diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
++--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
++@@ -705,8 +705,6 @@
++                               unsigned *OutUnscaledOp = nullptr,
++                               int64_t *EmittableOffset = nullptr);
++ 
++-bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
++-
++ static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
++ 
++ static inline bool isCondBranchOpcode(int Opc) {
++diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
++--- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+++++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
++@@ -14,7 +14,6 @@
++ //===----------------------------------------------------------------------===//
++ 
++ #include "AArch64.h"
++-#include "AArch64InstrInfo.h"
++ #include "llvm/CodeGen/MachineFunctionPass.h"
++ #include "llvm/CodeGen/MachineInstrBuilder.h"
++ #include "llvm/CodeGen/TargetInstrInfo.h"
++@@ -46,6 +45,51 @@
++                 "AArch64 Redundant Conditional Branch Elimination pass", false,
++                 false)
++ 
+++static bool optimizeTerminators(MachineBasicBlock *MBB,
+++                                const TargetInstrInfo &TII) {
+++  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
+++    unsigned Opc = MI.getOpcode();
+++    switch (Opc) {
+++    case AArch64::CBZW:
+++    case AArch64::CBZX:
+++    case AArch64::TBZW:
+++    case AArch64::TBZX:
+++      // CBZ/TBZ with WZR/XZR -> unconditional B
+++      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+++          MI.getOperand(0).getReg() == AArch64::XZR) {
+++        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
+++        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+++        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
+++        for (auto *S : Succs)
+++          if (S != Target)
+++            MBB->removeSuccessor(S);
+++        DebugLoc DL = MI.getDebugLoc();
+++        while (MBB->rbegin() != &MI)
+++          MBB->rbegin()->eraseFromParent();
+++        MI.eraseFromParent();
+++        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
+++        return true;
+++      }
+++      break;
+++    case AArch64::CBNZW:
+++    case AArch64::CBNZX:
+++    case AArch64::TBNZW:
+++    case AArch64::TBNZX:
+++      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
+++      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+++          MI.getOperand(0).getReg() == AArch64::XZR) {
+++        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
+++        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+++        MI.getParent()->removeSuccessor(Target);
+++        MI.eraseFromParent();
+++        return true;
+++      }
+++      break;
+++    }
+++  }
+++  return false;
+++}
+++
++ bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
++   if (skipFunction(MF.getFunction()))
++     return false;
++diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
++--- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
++@@ -50,7 +50,6 @@
++ //        to use WZR/XZR directly in some cases.
++ //===----------------------------------------------------------------------===//
++ #include "AArch64.h"
++-#include "AArch64InstrInfo.h"
++ #include "llvm/ADT/SetVector.h"
++ #include "llvm/ADT/Statistic.h"
++ #include "llvm/ADT/iterator_range.h"
++@@ -476,7 +475,6 @@
++     return false;
++   TRI = MF.getSubtarget().getRegisterInfo();
++   MRI = &MF.getRegInfo();
++-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
++ 
++   // Resize the clobbered and used register unit trackers.  We do this once per
++   // function.
++@@ -486,10 +484,8 @@
++   OptBBUsedRegs.init(*TRI);
++ 
++   bool Changed = false;
++-  for (MachineBasicBlock &MBB : MF) {
++-    Changed |= optimizeTerminators(&MBB, TII);
+++  for (MachineBasicBlock &MBB : MF)
++     Changed |= optimizeBlock(&MBB);
++-  }
++   return Changed;
++ }
++ 
++diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
++--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
++@@ -1827,12 +1827,8 @@
++     // profile info.
++     CostTooHigh =
++         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
++-    if (CostTooHigh) {
++-      // Mark runtime checks as never succeeding when they exceed the threshold.
++-      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
++-      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+++    if (CostTooHigh)
++       return;
++-    }
++ 
++     BasicBlock *LoopHeader = L->getHeader();
++     BasicBlock *Preheader = L->getLoopPreheader();
++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
++--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
++@@ -735,15 +735,21 @@
++ ; ENABLE-NEXT:    .cfi_offset w29, -16
++ ; ENABLE-NEXT:    .cfi_offset w19, -24
++ ; ENABLE-NEXT:    .cfi_offset w20, -32
+++; ENABLE-NEXT:  ; %bb.1: ; %if.then
++ ; ENABLE-NEXT:    sub x19, sp, #16
++ ; ENABLE-NEXT:    mov sp, x19
++ ; ENABLE-NEXT:    mov w20, wzr
++-; ENABLE-NEXT:  LBB10_1: ; %for.body
+++; ENABLE-NEXT:  LBB10_2: ; %for.body
++ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
++ ; ENABLE-NEXT:    bl _something
++ ; ENABLE-NEXT:    add w20, w0, w20
++ ; ENABLE-NEXT:    str w20, [x19]
++-; ENABLE-NEXT:    b LBB10_1
+++; ENABLE-NEXT:    b LBB10_2
+++; ENABLE-NEXT:  ; %bb.3: ; %if.end
+++; ENABLE-NEXT:    sub sp, x29, #16
+++; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+++; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+++; ENABLE-NEXT:    ret
++ ;
++ ; DISABLE-LABEL: infiniteloop:
++ ; DISABLE:       ; %bb.0: ; %entry
++@@ -755,15 +761,21 @@
++ ; DISABLE-NEXT:    .cfi_offset w29, -16
++ ; DISABLE-NEXT:    .cfi_offset w19, -24
++ ; DISABLE-NEXT:    .cfi_offset w20, -32
+++; DISABLE-NEXT:  ; %bb.1: ; %if.then
++ ; DISABLE-NEXT:    sub x19, sp, #16
++ ; DISABLE-NEXT:    mov sp, x19
++ ; DISABLE-NEXT:    mov w20, wzr
++-; DISABLE-NEXT:  LBB10_1: ; %for.body
+++; DISABLE-NEXT:  LBB10_2: ; %for.body
++ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
++ ; DISABLE-NEXT:    bl _something
++ ; DISABLE-NEXT:    add w20, w0, w20
++ ; DISABLE-NEXT:    str w20, [x19]
++-; DISABLE-NEXT:    b LBB10_1
+++; DISABLE-NEXT:    b LBB10_2
+++; DISABLE-NEXT:  ; %bb.3: ; %if.end
+++; DISABLE-NEXT:    sub sp, x29, #16
+++; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+++; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+++; DISABLE-NEXT:    ret
++ entry:
++   br i1 undef, label %if.then, label %if.end
++ 
++@@ -794,10 +806,11 @@
++ ; ENABLE-NEXT:    .cfi_offset w29, -16
++ ; ENABLE-NEXT:    .cfi_offset w19, -24
++ ; ENABLE-NEXT:    .cfi_offset w20, -32
+++; ENABLE-NEXT:  ; %bb.1: ; %if.then
++ ; ENABLE-NEXT:    sub x8, sp, #16
++ ; ENABLE-NEXT:    mov sp, x8
++ ; ENABLE-NEXT:    mov w9, wzr
++-; ENABLE-NEXT:  LBB11_1: ; %for.body
+++; ENABLE-NEXT:  LBB11_2: ; %for.body
++ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
++ ; ENABLE-NEXT:    ; InlineAsm Start
++ ; ENABLE-NEXT:    mov x10, #0 ; =0x0
++@@ -808,7 +821,12 @@
++ ; ENABLE-NEXT:    ; InlineAsm Start
++ ; ENABLE-NEXT:    nop
++ ; ENABLE-NEXT:    ; InlineAsm End
++-; ENABLE-NEXT:    b LBB11_1
+++; ENABLE-NEXT:    b LBB11_2
+++; ENABLE-NEXT:  ; %bb.3: ; %if.end
+++; ENABLE-NEXT:    sub sp, x29, #16
+++; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+++; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+++; ENABLE-NEXT:    ret
++ ;
++ ; DISABLE-LABEL: infiniteloop2:
++ ; DISABLE:       ; %bb.0: ; %entry
++@@ -820,10 +838,11 @@
++ ; DISABLE-NEXT:    .cfi_offset w29, -16
++ ; DISABLE-NEXT:    .cfi_offset w19, -24
++ ; DISABLE-NEXT:    .cfi_offset w20, -32
+++; DISABLE-NEXT:  ; %bb.1: ; %if.then
++ ; DISABLE-NEXT:    sub x8, sp, #16
++ ; DISABLE-NEXT:    mov sp, x8
++ ; DISABLE-NEXT:    mov w9, wzr
++-; DISABLE-NEXT:  LBB11_1: ; %for.body
+++; DISABLE-NEXT:  LBB11_2: ; %for.body
++ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
++ ; DISABLE-NEXT:    ; InlineAsm Start
++ ; DISABLE-NEXT:    mov x10, #0 ; =0x0
++@@ -834,7 +853,12 @@
++ ; DISABLE-NEXT:    ; InlineAsm Start
++ ; DISABLE-NEXT:    nop
++ ; DISABLE-NEXT:    ; InlineAsm End
++-; DISABLE-NEXT:    b LBB11_1
+++; DISABLE-NEXT:    b LBB11_2
+++; DISABLE-NEXT:  ; %bb.3: ; %if.end
+++; DISABLE-NEXT:    sub sp, x29, #16
+++; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+++; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+++; DISABLE-NEXT:    ret
++ entry:
++   br i1 undef, label %if.then, label %if.end
++ 
++@@ -865,43 +889,49 @@
++ define void @infiniteloop3() {
++ ; ENABLE-LABEL: infiniteloop3:
++ ; ENABLE:       ; %bb.0: ; %entry
+++; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
++ ; ENABLE-NEXT:    mov x8, xzr
++ ; ENABLE-NEXT:    mov x9, xzr
++ ; ENABLE-NEXT:    mov x11, xzr
++-; ENABLE-NEXT:    b LBB12_2
++-; ENABLE-NEXT:  LBB12_1: ; %loop2b
++-; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
+++; ENABLE-NEXT:    b LBB12_3
+++; ENABLE-NEXT:  LBB12_2: ; %loop2b
+++; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
++ ; ENABLE-NEXT:    str x10, [x11]
++ ; ENABLE-NEXT:    mov x11, x10
++-; ENABLE-NEXT:  LBB12_2: ; %loop1
+++; ENABLE-NEXT:  LBB12_3: ; %loop1
++ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
++ ; ENABLE-NEXT:    mov x10, x9
++ ; ENABLE-NEXT:    ldr x9, [x8]
++-; ENABLE-NEXT:    cbnz x8, LBB12_1
++-; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
+++; ENABLE-NEXT:    cbnz x8, LBB12_2
+++; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
++ ; ENABLE-NEXT:    mov x8, x10
++ ; ENABLE-NEXT:    mov x11, x10
++-; ENABLE-NEXT:    b LBB12_2
+++; ENABLE-NEXT:    b LBB12_3
+++; ENABLE-NEXT:  ; %bb.5: ; %end
+++; ENABLE-NEXT:    ret
++ ;
++ ; DISABLE-LABEL: infiniteloop3:
++ ; DISABLE:       ; %bb.0: ; %entry
+++; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
++ ; DISABLE-NEXT:    mov x8, xzr
++ ; DISABLE-NEXT:    mov x9, xzr
++ ; DISABLE-NEXT:    mov x11, xzr
++-; DISABLE-NEXT:    b LBB12_2
++-; DISABLE-NEXT:  LBB12_1: ; %loop2b
++-; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
+++; DISABLE-NEXT:    b LBB12_3
+++; DISABLE-NEXT:  LBB12_2: ; %loop2b
+++; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
++ ; DISABLE-NEXT:    str x10, [x11]
++ ; DISABLE-NEXT:    mov x11, x10
++-; DISABLE-NEXT:  LBB12_2: ; %loop1
+++; DISABLE-NEXT:  LBB12_3: ; %loop1
++ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
++ ; DISABLE-NEXT:    mov x10, x9
++ ; DISABLE-NEXT:    ldr x9, [x8]
++-; DISABLE-NEXT:    cbnz x8, LBB12_1
++-; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
+++; DISABLE-NEXT:    cbnz x8, LBB12_2
+++; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
++ ; DISABLE-NEXT:    mov x8, x10
++ ; DISABLE-NEXT:    mov x11, x10
++-; DISABLE-NEXT:    b LBB12_2
+++; DISABLE-NEXT:    b LBB12_3
+++; DISABLE-NEXT:  ; %bb.5: ; %end
+++; DISABLE-NEXT:    ret
++ entry:
++   br i1 undef, label %loop2a, label %body
++ 
++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
++--- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+++++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
++@@ -8,14 +8,20 @@
++ define i8 @foo_optsize(i32 %v4) optsize {
++ ; CHECK-LABEL: foo_optsize:
++ ; CHECK:       // %bb.0: // %entry
++-; CHECK-NEXT:    cbnz w0, .LBB0_2
++-; CHECK-NEXT:  // %bb.1: // %b2
++-; CHECK-NEXT:    mov w0, #1 // =0x1
+++; CHECK-NEXT:    b .LBB0_2
+++; CHECK-NEXT:  .LBB0_1:
+++; CHECK-NEXT:    mov w0, wzr
++ ; CHECK-NEXT:    ret
++ ; CHECK-NEXT:  .LBB0_2: // %b1
++-; CHECK-NEXT:    cmp w0, #1
++-; CHECK-NEXT:    mov w0, wzr
+++; CHECK-NEXT:    cbnz w0, .LBB0_4
+++; CHECK-NEXT:  // %bb.3: // %b2
+++; CHECK-NEXT:    mov w0, #1 // =0x1
++ ; CHECK-NEXT:    ret
+++; CHECK-NEXT:  .LBB0_4: // %b1
+++; CHECK-NEXT:    cmp w0, #1
+++; CHECK-NEXT:    b.ne .LBB0_1
+++; CHECK-NEXT:  // %bb.5: // %b3
+++; CHECK-NEXT:    b .LBB0_1
++ entry:
++   %v2 = icmp eq i32 0, 0
++   br i1 %v2, label %b1, label %b4
++@@ -41,14 +47,20 @@
++ define i8 @foo_optspeed(i32 %v4) {
++ ; CHECK-LABEL: foo_optspeed:
++ ; CHECK:       // %bb.0: // %entry
++-; CHECK-NEXT:    cbnz w0, .LBB1_2
++-; CHECK-NEXT:  // %bb.1: // %b2
++-; CHECK-NEXT:    mov w0, #1 // =0x1
+++; CHECK-NEXT:    b .LBB1_2
+++; CHECK-NEXT:  .LBB1_1:
+++; CHECK-NEXT:    mov w0, wzr
++ ; CHECK-NEXT:    ret
++ ; CHECK-NEXT:  .LBB1_2: // %b1
++-; CHECK-NEXT:    cmp w0, #1
++-; CHECK-NEXT:    mov w0, wzr
+++; CHECK-NEXT:    cbnz w0, .LBB1_4
+++; CHECK-NEXT:  // %bb.3: // %b2
+++; CHECK-NEXT:    mov w0, #1 // =0x1
++ ; CHECK-NEXT:    ret
+++; CHECK-NEXT:  .LBB1_4: // %b1
+++; CHECK-NEXT:    cmp w0, #1
+++; CHECK-NEXT:    b.ne .LBB1_1
+++; CHECK-NEXT:  // %bb.5: // %b3
+++; CHECK-NEXT:    b .LBB1_1
++ entry:
++   %v2 = icmp eq i32 0, 0
++   br i1 %v2, label %b1, label %b4
++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
++--- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+++++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
++@@ -21,8 +21,10 @@
++   ; CHECK-NEXT:   B %bb.3
++   ; CHECK-NEXT: {{  $}}
++   ; CHECK-NEXT: bb.1.bb:
+++  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
++   ; CHECK-NEXT:   liveins: $w0, $lr
++   ; CHECK-NEXT: {{  $}}
+++  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
++   ; CHECK-NEXT:   B %bb.2
++   ; CHECK-NEXT: {{  $}}
++   ; CHECK-NEXT: bb.2.bb1:
++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
++--- a/llvm/test/CodeGen/AArch64/pr164181.ll
+++++ b/llvm/test/CodeGen/AArch64/pr164181.ll
++@@ -29,11 +29,11 @@
++ ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
++ ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
++ ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
++-; CHECK-NEXT:    tbz w5, #0, .LBB0_40
+++; CHECK-NEXT:    tbz w5, #0, .LBB0_43
++ ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
++ ; CHECK-NEXT:    ldr x4, [sp, #312]
++ ; CHECK-NEXT:    ldr x14, [sp, #280]
++-; CHECK-NEXT:    tbz w0, #0, .LBB0_39
+++; CHECK-NEXT:    tbz w0, #0, .LBB0_42
++ ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
++ ; CHECK-NEXT:    ldrb w8, [sp, #368]
++ ; CHECK-NEXT:    ldrb w12, [sp, #256]
++@@ -92,7 +92,7 @@
++ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
++ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
++ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
++-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
++ ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
++ ; CHECK-NEXT:    mov x12, x24
++ ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
++@@ -117,7 +117,7 @@
++ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
++ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
++ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
++-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
++ ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
++ ; CHECK-NEXT:    cmn x24, #30
++ ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
++@@ -142,7 +142,7 @@
++ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
++ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
++ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
++-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
++ ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
++ ; CHECK-NEXT:    mov w14, #1152 // =0x480
++ ; CHECK-NEXT:    mov w24, #1 // =0x1
++@@ -176,7 +176,7 @@
++ ; CHECK-NEXT:    // => This Loop Header: Depth=4
++ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
++ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
++-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
++ ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
++ ; CHECK-NEXT:    and w8, w8, w8, asr #31
++ ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
++@@ -281,23 +281,31 @@
++ ; CHECK-NEXT:    mov x24, xzr
++ ; CHECK-NEXT:    mul w12, w12, w22
++ ; CHECK-NEXT:    mov x22, x5
++-; CHECK-NEXT:    tbz w0, #0, .LBB0_33
++-; CHECK-NEXT:  .LBB0_28: // %if.then222.us
+++; CHECK-NEXT:    tbz w0, #0, .LBB0_36
+++; CHECK-NEXT:  .LBB0_28: // %for.body194.us
++ ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
++ ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
++ ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
++ ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
++ ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
+++; CHECK-NEXT:  // %bb.29: // %if.then222.us
+++; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
++ ; CHECK-NEXT:    adrp x27, :got:var_32
++ ; CHECK-NEXT:    ldur w8, [x19, #-12]
++ ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
++ ; CHECK-NEXT:    strh w8, [x27]
++ ; CHECK-NEXT:    sxtb w8, w25
++-; CHECK-NEXT:    strb w3, [x16]
++ ; CHECK-NEXT:    bic w25, w8, w8, asr #31
+++; CHECK-NEXT:    b .LBB0_31
+++; CHECK-NEXT:    .p2align 5, , 16
+++; CHECK-NEXT:  // %bb.30:
+++; CHECK-NEXT:    mov w25, wzr
+++; CHECK-NEXT:  .LBB0_31: // %if.end239.us
+++; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+++; CHECK-NEXT:    strb w3, [x16]
++ ; CHECK-NEXT:    tst w13, #0xff
++-; CHECK-NEXT:    b.eq .LBB0_30
++-; CHECK-NEXT:  // %bb.29: // %if.then254.us
+++; CHECK-NEXT:    b.eq .LBB0_33
+++; CHECK-NEXT:  // %bb.32: // %if.then254.us
++ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
++ ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
++ ; CHECK-NEXT:    adrp x27, :got:var_35
++@@ -306,7 +314,7 @@
++ ; CHECK-NEXT:    csel x8, xzr, x7, eq
++ ; CHECK-NEXT:    str x8, [x27]
++ ; CHECK-NEXT:    strh w1, [x17]
++-; CHECK-NEXT:  .LBB0_30: // %if.end282.us
+++; CHECK-NEXT:  .LBB0_33: // %if.end282.us
++ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
++ ; CHECK-NEXT:    orr x27, x24, x4
++ ; CHECK-NEXT:    adrp x8, :got:var_39
++@@ -317,14 +325,14 @@
++ ; CHECK-NEXT:    str x8, [x18]
++ ; CHECK-NEXT:    mov w8, #1 // =0x1
++ ; CHECK-NEXT:    cbnz x2, .LBB0_27
++-; CHECK-NEXT:  // %bb.31: // %if.then327.us
+++; CHECK-NEXT:  // %bb.34: // %if.then327.us
++ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
++ ; CHECK-NEXT:    cbz w8, .LBB0_25
++-; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
+++; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
++ ; CHECK-NEXT:    mov w4, wzr
++ ; CHECK-NEXT:    b .LBB0_26
++ ; CHECK-NEXT:    .p2align 5, , 16
++-; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
+++; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
++ ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
++ ; CHECK-NEXT:    mov w3, #1152 // =0x480
++ ; CHECK-NEXT:    mov x22, xzr
++@@ -335,24 +343,24 @@
++ ; CHECK-NEXT:    madd x14, x14, x3, x11
++ ; CHECK-NEXT:    mov w28, w30
++ ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
++-; CHECK-NEXT:    b .LBB0_36
+++; CHECK-NEXT:    b .LBB0_39
++ ; CHECK-NEXT:    .p2align 5, , 16
++-; CHECK-NEXT:  .LBB0_34: // %if.then466.us
++-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+++; CHECK-NEXT:  .LBB0_37: // %if.then466.us
+++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
++ ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
++ ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
++ ; CHECK-NEXT:    sxtb w4, w4
++ ; CHECK-NEXT:    bic w4, w4, w4, asr #31
++ ; CHECK-NEXT:    str x3, [x28]
++ ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
++-; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
++-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+++; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
+++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
++ ; CHECK-NEXT:    add x22, x22, #1
++ ; CHECK-NEXT:    add x27, x27, #1
++ ; CHECK-NEXT:    mov w28, wzr
++ ; CHECK-NEXT:    cmp x27, #0
++ ; CHECK-NEXT:    b.hs .LBB0_9
++-; CHECK-NEXT:  .LBB0_36: // %for.body380.us
+++; CHECK-NEXT:  .LBB0_39: // %for.body380.us
++ ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
++ ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
++ ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
++@@ -364,18 +372,18 @@
++ ; CHECK-NEXT:    strh w28, [x11]
++ ; CHECK-NEXT:    csel w28, w21, w3, ne
++ ; CHECK-NEXT:    str w28, [x20]
++-; CHECK-NEXT:    cbz x15, .LBB0_35
++-; CHECK-NEXT:  // %bb.37: // %if.then436.us
++-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+++; CHECK-NEXT:    cbz x15, .LBB0_38
+++; CHECK-NEXT:  // %bb.40: // %if.then436.us
+++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
++ ; CHECK-NEXT:    ldrh w28, [x14]
++-; CHECK-NEXT:    cbnz w28, .LBB0_34
++-; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
+++; CHECK-NEXT:    cbnz w28, .LBB0_37
+++; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
++ ; CHECK-NEXT:    mov w4, wzr
++-; CHECK-NEXT:    b .LBB0_35
++-; CHECK-NEXT:  .LBB0_39: // %for.body41
+++; CHECK-NEXT:    b .LBB0_38
+++; CHECK-NEXT:  .LBB0_42: // %for.body41
++ ; CHECK-NEXT:    strb wzr, [x4]
++ ; CHECK-NEXT:    strb wzr, [x14]
++-; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
+++; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
++ ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
++ ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
++ ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
++diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
++--- a/llvm/test/CodeGen/AArch64/pr166870.ll
+++++ b/llvm/test/CodeGen/AArch64/pr166870.ll
++@@ -26,11 +26,12 @@
++ ; CHECK-NEXT:    mov x21, x1
++ ; CHECK-NEXT:    bl baz
++ ; CHECK-NEXT:    mov w0, #0 // =0x0
+++; CHECK-NEXT:  // %bb.5: // %bb6
++ ; CHECK-NEXT:    mov w10, #1 // =0x1
+++; CHECK-NEXT:    cbnz w10, .LBB0_11
+++; CHECK-NEXT:  // %bb.6: // %bb7
++ ; CHECK-NEXT:    cbnz w10, .LBB0_10
++-; CHECK-NEXT:  // %bb.5: // %bb7
++-; CHECK-NEXT:    cbnz w10, .LBB0_9
++-; CHECK-NEXT:  // %bb.6: // %bb8
+++; CHECK-NEXT:  // %bb.7: // %bb8
++ ; CHECK-NEXT:    mov x8, x21
++ ; CHECK-NEXT:    mov x9, x20
++ ; CHECK-NEXT:    mov w20, #0 // =0x0
++@@ -38,17 +39,17 @@
++ ; CHECK-NEXT:    mov x21, x9
++ ; CHECK-NEXT:    mov w8, w8
++ ; CHECK-NEXT:    mov x22, x8
++-; CHECK-NEXT:  .LBB0_7: // %bb10
+++; CHECK-NEXT:  .LBB0_8: // %bb10
++ ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
++ ; CHECK-NEXT:    strb w20, [x19]
++-; CHECK-NEXT:    cbnz x21, .LBB0_7
++-; CHECK-NEXT:  // %bb.8: // %bb12
++-; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
+++; CHECK-NEXT:    cbnz x21, .LBB0_8
+++; CHECK-NEXT:  // %bb.9: // %bb12
+++; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
++ ; CHECK-NEXT:    bl snork
++-; CHECK-NEXT:    cbnz x22, .LBB0_7
++-; CHECK-NEXT:  .LBB0_9:
++-; CHECK-NEXT:    mov w0, #0 // =0x0
+++; CHECK-NEXT:    cbnz x22, .LBB0_8
++ ; CHECK-NEXT:  .LBB0_10:
+++; CHECK-NEXT:    mov w0, #0 // =0x0
+++; CHECK-NEXT:  .LBB0_11:
++ ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
++ ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
++ ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
++diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
++--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+++++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
++@@ -71,21 +71,27 @@
++ ; CHECK-NEXT:    .cfi_def_cfa w29, 16
++ ; CHECK-NEXT:    .cfi_offset w30, -8
++ ; CHECK-NEXT:    .cfi_offset w29, -16
+++; CHECK-NEXT:    .cfi_remember_state
++ ; CHECK-NEXT:    mov w8, #1 // =0x1
++-; CHECK-NEXT:    mov w9, #2 // =0x2
++ ; CHECK-NEXT:    stur xzr, [x29, #-8]
++-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++-; CHECK-NEXT:    ldur w8, [x29, #-8]
++-; CHECK-NEXT:    cbz w8, .LBB0_2
+++; CHECK-NEXT:    b .LBB0_3
++ ; CHECK-NEXT:  // %bb.1:
++-; CHECK-NEXT:    mov w8, #1 // =0x1
++ ; CHECK-NEXT:    str w8, [sp, #16]
++-; CHECK-NEXT:    b .LBB0_3
+++; CHECK-NEXT:    ldur w8, [x29, #-8]
+++; CHECK-NEXT:    cbz w8, .LBB0_4
++ ; CHECK-NEXT:  .LBB0_2:
+++; CHECK-NEXT:    .cfi_restore_state
++ ; CHECK-NEXT:    mov w8, #1 // =0x1
++-; CHECK-NEXT:    mov w9, #2 // =0x2
++-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+++; CHECK-NEXT:    str w8, [sp, #16]
+++; CHECK-NEXT:    b .LBB0_5
++ ; CHECK-NEXT:  .LBB0_3:
+++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+++; CHECK-NEXT:    ldur w8, [x29, #-8]
+++; CHECK-NEXT:    cbnz w8, .LBB0_2
+++; CHECK-NEXT:  .LBB0_4:
+++; CHECK-NEXT:    mov w8, #1 // =0x1
+++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+++; CHECK-NEXT:  .LBB0_5:
++ ; CHECK-NEXT:    mov w0, wzr
++ ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
++ ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
++@@ -128,6 +134,7 @@
++ ;
++ ; CHECK-LABEL: OUTLINED_FUNCTION_0:
++ ; CHECK:       // %bb.0:
+++; CHECK-NEXT:    mov w9, #2 // =0x2
++ ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
++ ; CHECK-NEXT:    mov w9, #3 // =0x3
++ ; CHECK-NEXT:    mov w8, #4 // =0x4
++diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
++--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+++++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
++@@ -12,21 +12,27 @@
++ ; CHECK-NEXT:    .cfi_def_cfa w29, 16
++ ; CHECK-NEXT:    .cfi_offset w30, -8
++ ; CHECK-NEXT:    .cfi_offset w29, -16
+++; CHECK-NEXT:    .cfi_remember_state
++ ; CHECK-NEXT:    mov w8, #1 // =0x1
++-; CHECK-NEXT:    mov w9, #2 // =0x2
++ ; CHECK-NEXT:    stur xzr, [x29, #-8]
++-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
++-; CHECK-NEXT:    ldur w8, [x29, #-8]
++-; CHECK-NEXT:    cbz w8, .LBB0_2
+++; CHECK-NEXT:    b .LBB0_3
++ ; CHECK-NEXT:  // %bb.1:
++-; CHECK-NEXT:    mov w8, #1 // =0x1
++ ; CHECK-NEXT:    str w8, [sp, #16]
++-; CHECK-NEXT:    b .LBB0_3
+++; CHECK-NEXT:    ldur w8, [x29, #-8]
+++; CHECK-NEXT:    cbz w8, .LBB0_4
++ ; CHECK-NEXT:  .LBB0_2:
+++; CHECK-NEXT:    .cfi_restore_state
++ ; CHECK-NEXT:    mov w8, #1 // =0x1
++-; CHECK-NEXT:    mov w9, #2 // =0x2
++-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+++; CHECK-NEXT:    str w8, [sp, #16]
+++; CHECK-NEXT:    b .LBB0_5
++ ; CHECK-NEXT:  .LBB0_3:
+++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+++; CHECK-NEXT:    ldur w8, [x29, #-8]
+++; CHECK-NEXT:    cbnz w8, .LBB0_2
+++; CHECK-NEXT:  .LBB0_4:
+++; CHECK-NEXT:    mov w8, #1 // =0x1
+++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+++; CHECK-NEXT:  .LBB0_5:
++ ; CHECK-NEXT:    mov w0, wzr
++ ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
++ ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
++diff -ruN --strip-trailing-cr a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
++--- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+++++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
++@@ -2,23 +2,29 @@
++ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
++ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
++ 
++-; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
++-; no runtime check is generated and the loop should not be vectorized.
+++; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
+++; no runtime check is generated even though one is needed and !noalias
+++; annotations are added.
++ define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
++ ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
++ ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
++-; LIMIT0-NEXT:  [[ENTRY:.*]]:
++-; LIMIT0-NEXT:    br label %[[LOOP:.*]]
++-; LIMIT0:       [[LOOP]]:
++-; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
++-; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
+++; LIMIT0-NEXT:  [[ENTRY:.*:]]
+++; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
+++; LIMIT0:       [[VECTOR_PH]]:
+++; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+++; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+++; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+++; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
+++; LIMIT0:       [[VECTOR_BODY]]:
+++; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
++ ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
++-; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
++-; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+++; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+++; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
++ ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
++-; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+++; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+++; LIMIT0:       [[MIDDLE_BLOCK]]:
+++; LIMIT0-NEXT:    br label %[[EXIT:.*]]
++ ; LIMIT0:       [[EXIT]]:
++-; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
++ ; LIMIT0-NEXT:    ret i16 [[TMP0]]
++ ;
++ ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
++@@ -82,9 +88,14 @@
++ !3 = !{!"llvm.loop.vectorize.enable", i1 true}
++ 
++ ;.
++-; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
++-; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
++-; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
+++; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
+++; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+++; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+++; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
+++; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
+++; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+++; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+++; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
++ ;.
++ ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
++ ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
++diff -ruN --strip-trailing-cr a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
++--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
++@@ -1320,8 +1320,9 @@
++ }
++ 
++ template <typename T>
++-T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
++-                               StringRef entryType, uint64_t depth) {
+++T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
+++                               uint64_t index, StringRef entryType,
+++                               uint64_t depth) {
++   if (index >= entries.size()) {
++     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
++     return {};
 diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index 5e3d8f2..dd3d4e4 100644
+index dd3d4e4..e573782 100644
 --- a/third_party/llvm/workspace.bzl
 +++ b/third_party/llvm/workspace.bzl
 @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
  
  def repo(name):
      """Imports LLVM."""
--    LLVM_COMMIT = "8dee997a8558b460b82b23fb43b197d68258baac"
--    LLVM_SHA256 = "6a26975000c2cb45787813317bfeeadeafa0cba762e9434fb7940481ec4b27de"
-+    LLVM_COMMIT = "c6e23ab80753a01dce270f5f8a133fbec942315d"
-+    LLVM_SHA256 = "5a6b8aacd2d87ce9c4456843a76d0a54fd7cd0ae788ed3f19e7487ecd2ce4326"
+-    LLVM_COMMIT = "c6e23ab80753a01dce270f5f8a133fbec942315d"
+-    LLVM_SHA256 = "5a6b8aacd2d87ce9c4456843a76d0a54fd7cd0ae788ed3f19e7487ecd2ce4326"
++    LLVM_COMMIT = "87bf5ee23863bc0b467ee44b2184b2c134a98464"
++    LLVM_SHA256 = "9d0bca271bfb266de8453cd34156741fd41f64b911f580262d187ce4d4d9b6d9"
  
      tf_http_archive(
          name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index 22625dcbb1fff3..bda3ea1501fc0e 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "bda4ef8940c146fed6477d03b375c06d04475003"
-    SHARDY_SHA256 = "4d54264a91c6ae7977ea072eef92a4df0ffa4b5cc97bdf814095b0f8f7f6d5ce"
+    SHARDY_COMMIT = "879ea7b95f957a2fc66da58b10d21390eb5f449a"
+    SHARDY_SHA256 = "222ec7b6c591888207d4de5795e03fe81ec94b3acb59a421ce02ff7ace38dc07"
 
     tf_http_archive(
         name = "shardy",

From 45fb84067d13d40feee87d26bd6a4b086fa8b319 Mon Sep 17 00:00:00 2001
From: Fengwu Yao <fengwuyao@google.com>
Date: Wed, 10 Dec 2025 14:01:41 -0800
Subject: [PATCH 143/753] Update to use half data type in test_util

PiperOrigin-RevId: 842870236
---
 tensorflow/lite/kernels/BUILD                 | 30 ++++--
 tensorflow/lite/kernels/activations_test.cc   | 43 ++++-----
 tensorflow/lite/kernels/atan2_test.cc         | 29 +++---
 tensorflow/lite/kernels/cast_test.cc          | 19 ++--
 tensorflow/lite/kernels/comparisons_test.cc   | 11 +--
 tensorflow/lite/kernels/concatenation_test.cc | 44 ++++-----
 .../lite/kernels/dynamic_update_slice_test.cc | 17 ++--
 tensorflow/lite/kernels/fill_test.cc          |  5 +-
 tensorflow/lite/kernels/floor_test.cc         | 31 +++---
 tensorflow/lite/kernels/gather_nd_test.cc     | 62 ++++++------
 tensorflow/lite/kernels/gather_test.cc        |  4 +-
 .../lite/kernels/maximum_minimum_test.cc      | 96 +++++++++----------
 tensorflow/lite/kernels/neg_test.cc           | 13 ++-
 tensorflow/lite/kernels/pad_test.cc           | 46 +++++----
 tensorflow/lite/kernels/reverse_test.cc       | 42 ++++----
 tensorflow/lite/kernels/round_test.cc         | 37 ++++---
 tensorflow/lite/kernels/slice_test.cc         | 39 ++++----
 tensorflow/lite/kernels/strided_slice_test.cc |  8 +-
 tensorflow/lite/kernels/test_util.h           | 10 +-
 tensorflow/lite/kernels/test_util_test.cc     |  9 ++
 20 files changed, 301 insertions(+), 294 deletions(-)

diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 6a3ec9f57e2a02..db2435b081d36b 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -167,7 +167,7 @@ cc_library(
         "//tensorflow/lite/tools/optimize:quantization_utils",
         "//tensorflow/lite/tools/serialization:writer_lib",
         "//tensorflow/lite/tools/versioning",
-        "@FP16",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
@@ -574,6 +574,7 @@ cc_test(
         "//tensorflow/lite:array",
         "//tensorflow/lite:util",
         "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -1118,6 +1119,7 @@ cc_test(
         "//tensorflow/lite/core:framework_stable",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
@@ -1499,6 +1501,7 @@ cc_test(
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/random",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest",
@@ -1515,8 +1518,8 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
         "@flatbuffers",
     ],
 )
@@ -1709,6 +1712,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1739,6 +1743,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -1853,6 +1858,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -1913,8 +1919,8 @@ cc_test(
         ":test_util",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@flatbuffers",
     ],
 )
 
@@ -1972,6 +1978,7 @@ cc_test(
         "//tensorflow/lite:framework_stable",
         "//tensorflow/lite/core:framework_stable",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2082,12 +2089,12 @@ cc_test(
     deps = [
         ":test_main",
         ":test_util",
-        "//tensorflow/lite:string",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@flatbuffers",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2101,7 +2108,9 @@ cc_test(
         "//tensorflow/lite:string",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@eigen_archive//:eigen3",
         "@flatbuffers",
     ],
 )
@@ -2485,7 +2494,9 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2530,6 +2541,7 @@ cc_test(
         ":test_util",
         "//tensorflow/lite:string",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2544,6 +2556,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2581,8 +2594,8 @@ cc_test(
         "//tensorflow/lite/kernels/internal:tensor_ctypes",
         "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2881,6 +2894,7 @@ cc_test(
         ":test_util",
         "//tensorflow/lite:string",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -2905,6 +2919,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -3173,12 +3188,13 @@ cc_test(
     size = "small",
     srcs = ["dynamic_update_slice_test.cc"],
     deps = [
+        ":subgraph_test_util",
         ":test_main",
         ":test_util",
         "//tensorflow/lite:framework_stable",
         "//tensorflow/lite/core:framework_stable",
-        "//tensorflow/lite/kernels:subgraph_test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
         "@flatbuffers",
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 42747a87e61b2a..96bb22ed76c431 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 
@@ -574,18 +575,17 @@ TEST_P(TanhOpTest, Tanh) {
 }
 
 TEST_P(TanhOpTest, TanhFloat16) {
-  FloatActivationsOpModel<Eigen::half> m(
-      GetRegistration(), BuiltinOperator_TANH,
-      /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
+  FloatActivationsOpModel<half> m(GetRegistration(), BuiltinOperator_TANH,
+                                  /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
   m.SetInput({
-      Eigen::half(0),
-      Eigen::half(-6),
-      Eigen::half(2),
-      Eigen::half(4),
-      Eigen::half(3),
-      Eigen::half(-2),
-      Eigen::half(10),
-      Eigen::half(1),
+      half(0),
+      half(-6),
+      half(2),
+      half(4),
+      half(3),
+      half(-2),
+      half(10),
+      half(1),
   });
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
@@ -1210,18 +1210,17 @@ TEST_P(LogisticOpTest, SigmoidFloat32) {
 }
 
 TEST_P(LogisticOpTest, SigmoidFloat16) {
-  FloatActivationsOpModel<Eigen::half> m(
-      GetRegistration(), BuiltinOperator_LOGISTIC,
-      /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
+  FloatActivationsOpModel<half> m(GetRegistration(), BuiltinOperator_LOGISTIC,
+                                  /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
   m.SetInput({
-      Eigen::half{-1.2f},
-      Eigen::half{-6.0f},
-      Eigen::half{2.0f},
-      Eigen::half{4.0f},
-      Eigen::half{3.0f},
-      Eigen::half{-2.0f},
-      Eigen::half{10.0f},
-      Eigen::half{1.0f},
+      half{-1.2f},
+      half{-6.0f},
+      half{2.0f},
+      half{4.0f},
+      half{3.0f},
+      half{-2.0f},
+      half{10.0f},
+      half{1.0f},
   });
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/atan2_test.cc b/tensorflow/lite/kernels/atan2_test.cc
index 309ba79f284f3f..0c3839361570a6 100644
--- a/tensorflow/lite/kernels/atan2_test.cc
+++ b/tensorflow/lite/kernels/atan2_test.cc
@@ -17,6 +17,7 @@
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -34,7 +35,7 @@ tflite::TensorType GetTTEnum<double>() {
 }
 
 template <>
-tflite::TensorType GetTTEnum<Eigen::half>() {
+tflite::TensorType GetTTEnum<half>() {
   return tflite::TensorType_FLOAT16;
 }
 
@@ -74,7 +75,7 @@ class Atan2Test : public ::testing::Test {
   using FloatType = Float;
 };
 
-using TestTypes = ::testing::Types<float, double, Eigen::half, Eigen::bfloat16>;
+using TestTypes = ::testing::Types<float, double, half, Eigen::bfloat16>;
 
 TYPED_TEST_SUITE(Atan2Test, TestTypes);
 
@@ -85,15 +86,15 @@ TYPED_TEST(Atan2Test, TestScalar) {
   tflite::TensorData output = {GetTTEnum<Float>(), {}};
   Atan2Model m(y, x, output);
 
-  auto got = m.GetOutput<Float>({Float(0.0)}, {Float(0.0)});
+  auto got = m.GetOutput<Float>({Float(0.0f)}, {Float(0.0f)});
   ASSERT_EQ(got.size(), 1);
   EXPECT_FLOAT_EQ(got[0], 0.0);
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(1.0)}, {Float(0.0)})[0],
-                  Float(M_PI / 2));
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(0.0)}, {Float(1.0)})[0],
-                  Float(0.0));
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(-1.0)}, {Float(0.0)})[0],
-                  Float(-M_PI / 2));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(1.0f)}, {Float(0.0f)})[0],
+                  Float(static_cast<float>(M_PI / 2)));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(0.0f)}, {Float(1.0f)})[0],
+                  Float(0.0f));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(-1.0f)}, {Float(0.0f)})[0],
+                  Float(-static_cast<float>(M_PI / 2)));
 }
 
 TYPED_TEST(Atan2Test, TestBatch) {
@@ -102,10 +103,12 @@ TYPED_TEST(Atan2Test, TestBatch) {
   tflite::TensorData x = {GetTTEnum<Float>(), {4, 2, 1}};
   tflite::TensorData output = {GetTTEnum<Float>(), {4, 2, 1}};
   Atan2Model m(y, x, output);
-  std::vector<Float> y_data = {Float(0.1), Float(0.2), Float(0.3), Float(0.4),
-                               Float(0.5), Float(0.6), Float(0.7), Float(0.8)};
-  std::vector<Float> x_data = {Float(0.8), Float(0.7), Float(0.6), Float(0.5),
-                               Float(0.4), Float(0.3), Float(0.2), Float(0.1)};
+  std::vector<Float> y_data = {Float(0.1f), Float(0.2f), Float(0.3f),
+                               Float(0.4f), Float(0.5f), Float(0.6f),
+                               Float(0.7f), Float(0.8f)};
+  std::vector<Float> x_data = {Float(0.8f), Float(0.7f), Float(0.6f),
+                               Float(0.5f), Float(0.4f), Float(0.3f),
+                               Float(0.2f), Float(0.1f)};
   auto got = m.GetOutput<Float>(y_data, x_data);
   ASSERT_EQ(got.size(), 8);
   for (int i = 0; i < 8; ++i) {
diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
index 77cc2f3442b1c2..bcc9b4bc058003 100644
--- a/tensorflow/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -413,11 +414,10 @@ TEST(CastOpModel, CastFloatToFloat16) {
   m.PopulateTensor<float>(m.input(), {100.f, 1.0f, 0.f, 0.4f, 1.999f, 1.1f});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
-      m.ExtractVector<Eigen::half>(m.output()),
-      ElementsAreArray(
-          {static_cast<Eigen::half>(100.f), static_cast<Eigen::half>(1.0f),
-           static_cast<Eigen::half>(0.f), static_cast<Eigen::half>(0.4f),
-           static_cast<Eigen::half>(1.999f), static_cast<Eigen::half>(1.1)}));
+      m.ExtractVector<half>(m.output()),
+      ElementsAreArray({static_cast<half>(100.f), static_cast<half>(1.0f),
+                        static_cast<half>(0.f), static_cast<half>(0.4f),
+                        static_cast<half>(1.999f), static_cast<half>(1.1f)}));
 }
 
 TEST(CastOpModel, CastFloatToBFloat16) {
@@ -435,11 +435,10 @@ TEST(CastOpModel, CastFloatToBFloat16) {
 
 TEST(CastOpModel, CastFloat16ToFloat) {
   CastOpModel m({TensorType_FLOAT16, {3, 2}}, {TensorType_FLOAT32, {3, 2}});
-  m.PopulateTensor<Eigen::half>(
-      m.input(),
-      {static_cast<Eigen::half>(100.f), static_cast<Eigen::half>(1.0f),
-       static_cast<Eigen::half>(0.f), static_cast<Eigen::half>(0.4f),
-       static_cast<Eigen::half>(1.999f), static_cast<Eigen::half>(1.1f)});
+  m.PopulateTensor<half>(m.input(),
+                         {static_cast<half>(100.f), static_cast<half>(1.0f),
+                          static_cast<half>(0.f), static_cast<half>(0.4f),
+                          static_cast<half>(1.999f), static_cast<half>(1.1f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.ExtractVector<float>(m.output()),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index 10226bb60a8ed8..bc2091aa823832 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -396,12 +397,10 @@ TEST(ComparisonsTest, LessFloat) {
 TEST(ComparisonsTest, LessFloat16) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT16,
                           BuiltinOperator_LESS);
-  model.PopulateTensor<Eigen::half>(
-      model.input1(),
-      {Eigen::half(0.1), Eigen::half(0.9), Eigen::half(0.7), Eigen::half(0.3)});
-  model.PopulateTensor<Eigen::half>(
-      model.input2(),
-      {Eigen::half(0.1), Eigen::half(0.2), Eigen::half(0.6), Eigen::half(0.5)});
+  model.PopulateTensor<half>(model.input1(),
+                             {half(0.1f), half(0.9f), half(0.7f), half(0.3f)});
+  model.PopulateTensor<half>(model.input2(),
+                             {half(0.1f), half(0.2f), half(0.6f), half(0.5f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true));
diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
index 28692ae1528dd3..f9c765375cc20f 100644
--- a/tensorflow/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -121,12 +122,11 @@ TEST(ConcatenationOpTest, ThreeDimensionalOneInputBFloat16) {
 }
 
 TEST(ConcatenationOpTest, ThreeDimensionalOneInputFloat16) {
-  ConcatenationOpModel<Eigen::half> m({TensorType_FLOAT16, {2, 1, 2}},
-                                      /*axis=*/1,
-                                      /*num_inputs=*/1);
-  m.SetInput(0,
-             {static_cast<Eigen::half>(1.0f), static_cast<Eigen::half>(3.0f),
-              static_cast<Eigen::half>(4.0f), static_cast<Eigen::half>(7.0f)});
+  ConcatenationOpModel<half> m({TensorType_FLOAT16, {2, 1, 2}},
+                               /*axis=*/1,
+                               /*num_inputs=*/1);
+  m.SetInput(0, {static_cast<half>(1.0f), static_cast<half>(3.0f),
+                 static_cast<half>(4.0f), static_cast<half>(7.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
 }
@@ -206,23 +206,21 @@ TEST(ConcatenationOpTest, FiveDimensionalTwoInputBFloat16) {
 }
 
 TEST(ConcatenationOpTest, FiveDimensionalTwoInputFloat16) {
-  ConcatenationOpModel<Eigen::half> m({TensorType_FLOAT16, {2, 1, 2, 1, 3}},
-                                      /*axis=*/0,
-                                      /*num_inputs=*/2);
-  m.SetInput(
-      0, {static_cast<Eigen::half>(1.0f), static_cast<Eigen::half>(2.0f),
-          static_cast<Eigen::half>(3.0f), static_cast<Eigen::half>(4.0f),
-          static_cast<Eigen::half>(5.0f), static_cast<Eigen::half>(6.0f),
-          static_cast<Eigen::half>(7.0f), Eigen::half{8.0f},
-          static_cast<Eigen::half>(9.0f), static_cast<Eigen::half>(10.0f),
-          static_cast<Eigen::half>(11.0f), static_cast<Eigen::half>(12.0f)});
-  m.SetInput(
-      1, {static_cast<Eigen::half>(13.0f), static_cast<Eigen::half>(14.0f),
-          Eigen::half{15.0f}, static_cast<Eigen::half>(16.0f),
-          Eigen::half{17.0f}, static_cast<Eigen::half>(18.0f),
-          static_cast<Eigen::half>(19.0f), static_cast<Eigen::half>(20.0f),
-          static_cast<Eigen::half>(21.0f), static_cast<Eigen::half>(22.0f),
-          static_cast<Eigen::half>(23.0f), static_cast<Eigen::half>(24.0f)});
+  ConcatenationOpModel<half> m({TensorType_FLOAT16, {2, 1, 2, 1, 3}},
+                               /*axis=*/0,
+                               /*num_inputs=*/2);
+  m.SetInput(0, {static_cast<half>(1.0f), static_cast<half>(2.0f),
+                 static_cast<half>(3.0f), static_cast<half>(4.0f),
+                 static_cast<half>(5.0f), static_cast<half>(6.0f),
+                 static_cast<half>(7.0f), half{8.0f}, static_cast<half>(9.0f),
+                 static_cast<half>(10.0f), static_cast<half>(11.0f),
+                 static_cast<half>(12.0f)});
+  m.SetInput(1,
+             {static_cast<half>(13.0f), static_cast<half>(14.0f), half{15.0f},
+              static_cast<half>(16.0f), half{17.0f}, static_cast<half>(18.0f),
+              static_cast<half>(19.0f), static_cast<half>(20.0f),
+              static_cast<half>(21.0f), static_cast<half>(22.0f),
+              static_cast<half>(23.0f), static_cast<half>(24.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
       m.GetOutput(),
diff --git a/tensorflow/lite/kernels/dynamic_update_slice_test.cc b/tensorflow/lite/kernels/dynamic_update_slice_test.cc
index 373a719d5ac412..99aa637a068d23 100644
--- a/tensorflow/lite/kernels/dynamic_update_slice_test.cc
+++ b/tensorflow/lite/kernels/dynamic_update_slice_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/subgraph_test_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -112,10 +113,9 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) {
   DynamicUpdateSliceOpModel m({TensorType_FLOAT16, {3, 3}},
                               {TensorType_FLOAT16, {2, 1}},
                               {TensorType_INT32, {2}});
-  m.SetInput<Eigen::half>({Eigen::half(1), Eigen::half(2), Eigen::half(3),
-                           Eigen::half(4), Eigen::half(5), Eigen::half(6),
-                           Eigen::half(7), Eigen::half(8), Eigen::half(9)});
-  m.SetUpdate<Eigen::half>({Eigen::half(-1), Eigen::half(-2)});
+  m.SetInput<half>({half(1), half(2), half(3), half(4), half(5), half(6),
+                    half(7), half(8), half(9)});
+  m.SetUpdate<half>({half(-1), half(-2)});
   m.SetStartIndices<int32_t>({1, 1});
   const int kInplaceInputTensorIdx = 0;
   const int kInplaceOutputTensorIdx = 0;
@@ -123,11 +123,10 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) {
   TfLiteTensor* output_tensor = m.GetOutputTensor(kInplaceOutputTensorIdx);
   output_tensor->data.data = input_tensor->data.data;
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput<Eigen::half>(),
-              ElementsAreArray(ArrayFloatNear(
-                  {Eigen::half(1), Eigen::half(2), Eigen::half(3),
-                   Eigen::half(4), Eigen::half(-1), Eigen::half(6),
-                   Eigen::half(7), Eigen::half(-2), Eigen::half(9)})));
+  EXPECT_THAT(m.GetOutput<half>(),
+              ElementsAreArray(
+                  ArrayFloatNear({half(1), half(2), half(3), half(4), half(-1),
+                                  half(6), half(7), half(-2), half(9)})));
   EXPECT_EQ(output_tensor->data.data, input_tensor->data.data);
 }
 
diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc
index 028623e3a0a321..a8e9815f30bc61 100644
--- a/tensorflow/lite/kernels/fill_test.cc
+++ b/tensorflow/lite/kernels/fill_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -139,8 +140,8 @@ TEST_P(FillOpTest, FillFloat) {
 }
 
 TEST_P(FillOpTest, FillFloat16) {
-  FillOpModel<int64_t, Eigen::half> m(TensorType_INT64, {3}, {2, 2, 2},
-                                      Eigen::half(4.0f), GetParam());
+  FillOpModel<int64_t, half> m(TensorType_INT64, {3}, {2, 2, 2}, half(4.0f),
+                               GetParam());
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
       m.GetOutput(),
diff --git a/tensorflow/lite/kernels/floor_test.cc b/tensorflow/lite/kernels/floor_test.cc
index 86ea68ad39e599..13154175e334cc 100644
--- a/tensorflow/lite/kernels/floor_test.cc
+++ b/tensorflow/lite/kernels/floor_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -79,28 +80,28 @@ TEST(FloorOpTest, MultiDims) {
 
 TEST(FloorOpTest, SingleDimFloat16) {
   FloorOpModel model({2}, TensorType_FLOAT16);
-  model.PopulateTensor<>(model.input(), {Eigen::half(8.5), Eigen::half(0.0)});
+  model.PopulateTensor<>(model.input(), {half(8.5f), half(0.0f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput<Eigen::half>(), ElementsAreArray({8, 0}));
+  EXPECT_THAT(model.GetOutput<half>(), ElementsAreArray({8, 0}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
 }
 
 TEST(FloorOpTest, MultiDimsFloat16) {
   FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT16);
-  model.PopulateTensor<Eigen::half>(model.input(), {
-                                                       Eigen::half(0.75),
-                                                       Eigen::half(8.25),
-                                                       Eigen::half(0.49),
-                                                       Eigen::half(9.99),
-                                                       Eigen::half(0.5),
-                                                       Eigen::half(-0.25),
-                                                       Eigen::half(-8.75),
-                                                       Eigen::half(-0.99),
-                                                       Eigen::half(-9.49),
-                                                       Eigen::half(-0.5),
-                                                   });
+  model.PopulateTensor<half>(model.input(), {
+                                                half(0.75f),
+                                                half(8.25f),
+                                                half(0.49f),
+                                                half(9.99f),
+                                                half(0.5f),
+                                                half(-0.25f),
+                                                half(-8.75f),
+                                                half(-0.99f),
+                                                half(-9.49f),
+                                                half(-0.5f),
+                                            });
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput<Eigen::half>(),
+  EXPECT_THAT(model.GetOutput<half>(),
               ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
 }
diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc
index 2bd9a0235ebe2c..f4b9f65711fbdc 100644
--- a/tensorflow/lite/kernels/gather_nd_test.cc
+++ b/tensorflow/lite/kernels/gather_nd_test.cc
@@ -20,10 +20,12 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -244,21 +246,19 @@ TEST(GatherNdOpTest, BFloat16Int32) {
 TEST(GatherNdOpTest, Float16Int32) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT32, {2, 2}});
-  m.SetInput<Eigen::half>(
-      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
-       Eigen::half(2.2), Eigen::half(2.3),  //
-       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
-       Eigen::half(-4.2), Eigen::half(4.3),  //
-       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
-       Eigen::half(-6.2), Eigen::half(6.3)});
+  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
+                    half(2.2f), half(2.3f),  //
+                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
+                    half(-4.2f), half(4.3f),  //
+                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
+                    half(-6.2f), half(6.3f)});
   m.SetPositions<int32_t>({0, 1, 1, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<Eigen::half>(),
-      Pointwise(FloatingPointEq(),
-                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
-                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
+      m.GetOutput<half>(),
+      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
+                                    half(3.1f), half(3.2f), half(-3.3f)}));
 }
 
 TEST(GatherNdOpTest, Float32Int32) {
@@ -297,21 +297,19 @@ TEST(GatherNdOpTest, BFloat16Int64) {
 TEST(GatherNdOpTest, Float16Int64) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT64, {2, 2}});
-  m.SetInput<Eigen::half>(
-      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
-       Eigen::half(2.2), Eigen::half(2.3),  //
-       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
-       Eigen::half(-4.2), Eigen::half(4.3),  //
-       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
-       Eigen::half(-6.2), Eigen::half(6.3)});
+  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
+                    half(2.2f), half(2.3f),  //
+                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
+                    half(-4.2f), half(4.3f),  //
+                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
+                    half(-6.2f), half(6.3f)});
   m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<Eigen::half>(),
-      Pointwise(FloatingPointEq(),
-                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
-                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
+      m.GetOutput<half>(),
+      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
+                                    half(3.1f), half(3.2f), half(-3.3f)}));
 }
 
 TEST(GatherNdOpTest, Float32Int64) {
@@ -462,21 +460,19 @@ TEST(GatherNdOpTest, BFloat16Int16) {
 TEST(GatherNdOpTest, Float16Int16) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT16, {2, 2}});
-  m.SetInput<Eigen::half>(
-      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
-       Eigen::half(2.2), Eigen::half(2.3),  //
-       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
-       Eigen::half(-4.2), Eigen::half(4.3),  //
-       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
-       Eigen::half(-6.2), Eigen::half(6.3)});
+  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
+                    half(2.2f), half(2.3f),  //
+                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
+                    half(-4.2f), half(4.3f),  //
+                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
+                    half(-6.2f), half(6.3f)});
   m.SetPositions<int16_t>({0, 1, 1, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<Eigen::half>(),
-      Pointwise(FloatingPointEq(),
-                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
-                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
+      m.GetOutput<half>(),
+      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
+                                    half(3.1f), half(3.2f), half(-3.3f)}));
 }
 
 TEST(GatherNdOpTest, Float32Int16) {
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 23e30eb7867774..61ca1b654f6160 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -20,9 +20,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -252,7 +254,7 @@ TEST_P(GatherOpTest, LastAxis0DIndex) {
 }
 
 using TestTypes = testing::Types<int8_t, uint8_t, int16_t, int32_t, int64_t,
-                                 float, Eigen::half, Eigen::bfloat16>;
+                                 float, half, Eigen::bfloat16>;
 
 template <typename T>
 struct TypedGatherOpTest : public testing::Test {};
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index babdb4f69fad03..00e25ee9b86500 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -247,24 +248,20 @@ TEST(MaximumOpTest, Int32WithBroadcastTest5D) {
 }
 
 TEST(MaximumOpTest, Float16Test) {
-  std::initializer_list<Eigen::half> data1 = {
-      Eigen::half(1.0),  Eigen::half(0.0),  Eigen::half(-1.0),
-      Eigen::half(11.0), Eigen::half(-2.0), Eigen::half(-1.44)};
-  std::initializer_list<Eigen::half> data2 = {
-      Eigen::half(-1.0), Eigen::half(0.0),  Eigen::half(1.0),
-      Eigen::half(12.0), Eigen::half(-3.0), Eigen::half(-1.43)};
-  TestModel<Eigen::half>(
-      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}},
-      {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1,
-      data2,
-      {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(1.0), Eigen::half(12.0),
-       Eigen::half(-2.0), Eigen::half(-1.43)});
-  TestModel<Eigen::half>(
-      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}},
-      {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1,
-      data2,
-      {Eigen::half(-1.0), Eigen::half(0.0), Eigen::half(-1.0),
-       Eigen::half(11.0), Eigen::half(-3.0), Eigen::half(-1.44)});
+  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f),  half(-1.0f),
+                                       half(11.0f), half(-2.0f), half(-1.44f)};
+  std::initializer_list<half> data2 = {half(-1.0f), half(0.0f),  half(1.0f),
+                                       half(12.0f), half(-3.0f), half(-1.43f)};
+  TestModel<half>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}}, data1, data2,
+                  {half(1.0f), half(0.0f), half(1.0f), half(12.0f), half(-2.0f),
+                   half(-1.43f)});
+  TestModel<half>(BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}}, data1, data2,
+                  {half(-1.0f), half(0.0f), half(-1.0f), half(11.0f),
+                   half(-3.0f), half(-1.44f)});
 }
 
 TEST(MaximumOpTest, BFloat16Test) {
@@ -308,42 +305,39 @@ TEST(MaximumOpTest, BFloat16WithBroadcastTest5DScalarY) {
 }
 
 TEST(MaximumOpTest, Float16WithBroadcastTest5DScalarY) {
-  std::initializer_list<Eigen::half> data1 = {
-      Eigen::half(1.0),  Eigen::half(0.0), Eigen::half(-1.0),
-      Eigen::half(-2.0), Eigen::half(3.0), Eigen::half(11.0)};
-  std::initializer_list<Eigen::half> data2 = {Eigen::half(2.0)};
-  TestModel<Eigen::half>(
-      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
-      {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1,
-      data2,
-      {Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0),
-       Eigen::half(3.0), Eigen::half(11.0)});
-  TestModel<Eigen::half>(
-      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
-      {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1,
-      data2,
-      {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0),
-       Eigen::half(2.0), Eigen::half(2.0)});
+  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f), half(-1.0f),
+                                       half(-2.0f), half(3.0f), half(11.0f)};
+  std::initializer_list<half> data2 = {half(2.0f)};
+  TestModel<half>(BuiltinOperator_MAXIMUM,
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
+                  {TensorType_FLOAT16, {1}},
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2,
+                  {half(2.0f), half(2.0f), half(2.0f), half(2.0f), half(3.0f),
+                   half(11.0f)});
+  TestModel<half>(BuiltinOperator_MINIMUM,
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
+                  {TensorType_FLOAT16, {1}},
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2,
+                  {half(1.0f), half(0.0f), half(-1.0f), half(-2.0f), half(2.0f),
+                   half(2.0f)});
 }
 
 TEST(MaximumOpTest, Float16WithBroadcastTest5D) {
-  std::initializer_list<Eigen::half> data1 = {
-      Eigen::half(1.0),  Eigen::half(0.0),   Eigen::half(-1.0),
-      Eigen::half(-2.0), Eigen::half(-1.44), Eigen::half(11.0)};
-  std::initializer_list<Eigen::half> data2 = {Eigen::half(0.5),
-                                              Eigen::half(2.0)};
-  TestModel<Eigen::half>(
-      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
-      {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1,
-      data2,
-      {Eigen::half(1.0), Eigen::half(2.0), Eigen::half(0.5), Eigen::half(2.0),
-       Eigen::half(0.5), Eigen::half(11.0)});
-  TestModel<Eigen::half>(
-      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
-      {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1,
-      data2,
-      {Eigen::half(0.5), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0),
-       Eigen::half(-1.44), Eigen::half(2.0)});
+  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f),   half(-1.0f),
+                                       half(-2.0f), half(-1.44f), half(11.0f)};
+  std::initializer_list<half> data2 = {half(0.5f), half(2.0f)};
+  TestModel<half>(BuiltinOperator_MAXIMUM,
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
+                  {TensorType_FLOAT16, {2}},
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2,
+                  {half(1.0f), half(2.0f), half(0.5f), half(2.0f), half(0.5f),
+                   half(11.0f)});
+  TestModel<half>(BuiltinOperator_MINIMUM,
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
+                  {TensorType_FLOAT16, {2}},
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2,
+                  {half(0.5f), half(0.0f), half(-1.0f), half(-2.0f),
+                   half(-1.44f), half(2.0f)});
 }
 
 TEST(MaximumOpTest, BFloat16WithBroadcastTest5D) {
diff --git a/tensorflow/lite/kernels/neg_test.cc b/tensorflow/lite/kernels/neg_test.cc
index fe9cc68bdf8a4d..883f9182758412 100644
--- a/tensorflow/lite/kernels/neg_test.cc
+++ b/tensorflow/lite/kernels/neg_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -67,14 +68,12 @@ TEST(NegOpModel, NegFloat32) {
 
 TEST(NegOpModel, NegFloat16) {
   NegOpModel m({TensorType_FLOAT16, {6}}, {TensorType_FLOAT16, {6}});
-  m.SetInput<Eigen::half>({Eigen::half(-2.0f), Eigen::half(-1.0f),
-                           Eigen::half(0.f), Eigen::half(1.0f),
-                           Eigen::half(2.0f), Eigen::half(3.0f)});
+  m.SetInput<half>({half(-2.0f), half(-1.0f), half(0.f), half(1.0f), half(2.0f),
+                    half(3.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput<Eigen::half>(),
-              ElementsAreArray({Eigen::half(2.0f), Eigen::half(1.0f),
-                                Eigen::half(0.f), Eigen::half(-1.0f),
-                                Eigen::half(-2.0f), Eigen::half(-3.0f)}));
+  EXPECT_THAT(m.GetOutput<half>(),
+              ElementsAreArray({half(2.0f), half(1.0f), half(0.f), half(-1.0f),
+                                half(-2.0f), half(-3.0f)}));
 }
 
 TEST(NegOpModel, NegBfloat16) {
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 971be96a915b4b..b985abccddcee7 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -927,19 +928,16 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleConstFloat32ValuedTestInt8) {
 
 template <typename padding_integer_type>
 void SimpleConstFloat16ValuedTest() {
-  PadV2OpConstModel<Eigen::half, padding_integer_type> m(
+  PadV2OpConstModel<half, padding_integer_type> m(
       {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-      Eigen::half{4.0f}, {TensorType_FLOAT16});
-  m.SetInput({Eigen::half{1.5f}, Eigen::half{2.5f}, Eigen::half{3.5f},
-              Eigen::half{4.5}});
+      half{4.0f}, {TensorType_FLOAT16});
+  m.SetInput({half{1.5f}, half{2.5f}, half{3.5f}, half{4.5f}});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray(ArrayFloatNear(
-          {Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4},
-           Eigen::half{4}, Eigen::half{1.5}, Eigen::half{2.5}, Eigen::half{4},
-           Eigen::half{4}, Eigen::half{3.5}, Eigen::half{4.5}, Eigen::half{4},
-           Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4}})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {half{4}, half{4}, half{4}, half{4}, half{4}, half{1.5f},
+                   half{2.5f}, half{4}, half{4}, half{3.5f}, half{4.5f},
+                   half{4}, half{4}, half{4}, half{4}, half{4}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
@@ -1050,12 +1048,15 @@ TEST_F(PadV2OpTest, Int16PaddingSimple4DConstFloat32ValuedTest) {
 
 template <typename padding_integer_type>
 void Simple4DConstFloat16ValuedTest() {
-  PadV2OpConstModel<Eigen::half, padding_integer_type> m(
+  PadV2OpConstModel<half, padding_integer_type> m(
       {TensorType_FLOAT16, {1, 1, 2, 1}}, {4, 2}, {0, 1, 0, 0, 0, 0, 0, 1},
-      Eigen::half{7.0}, {TensorType_FLOAT16});
-  m.SetInput({Eigen::half{3.0f}, Eigen::half{6.0f}});
+      half{7.0f}, {TensorType_FLOAT16});
+  m.SetInput({half{3.0f}, half{6.0f}});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 7, 6, 7, 7, 7, 7, 7}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {half{3.0f}, half{7.0f}, half{6.0f}, half{7.0f}, half{7.0f},
+                   half{7.0f}, half{7.0f}, half{7.0f}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2, 2}));
 }
 
@@ -1167,15 +1168,18 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleDynamicTest) {
 
 template <typename padding_integer_type>
 void SimpleDynamicTestV2Float16() {
-  PadV2OpDynamicModel<Eigen::half, padding_integer_type> m(
-      {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, Eigen::half{0.0},
+  PadV2OpDynamicModel<half, padding_integer_type> m(
+      {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, half{0.0f},
       {TensorType_FLOAT16});
-  m.SetInput({Eigen::half{1.0f}, Eigen::half{2.0f}, Eigen::half{3.0f},
-              Eigen::half{4.0f}});
+  m.SetInput({half{1.0f}, half{2.0f}, half{3.0f}, half{4.0f}});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
-                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f},
+                   half{1.0f}, half{2.0f}, half{0.0f}, half{0.0f}, half{3.0f},
+                   half{4.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f},
+                   half{0.0f}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc
index 4301b0120f53c3..7e2d3df543ba28 100644
--- a/tensorflow/lite/kernels/reverse_test.cc
+++ b/tensorflow/lite/kernels/reverse_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -354,45 +355,38 @@ TEST(ReverseOpTest, Int16MultiDimensions) {
 
 // float16 tests.
 TEST(ReverseOpTest, Float16OneDimension) {
-  ReverseOpModel<Eigen::half> model({TensorType_FLOAT16, {4}},
-                                    {TensorType_INT32, {1}});
-  model.PopulateTensor<Eigen::half>(
-      model.input(),
-      {Eigen::half(1), Eigen::half(2), Eigen::half(3), Eigen::half(4)});
+  ReverseOpModel<half> model({TensorType_FLOAT16, {4}},
+                             {TensorType_INT32, {1}});
+  model.PopulateTensor<half>(model.input(),
+                             {half(1), half(2), half(3), half(4)});
   model.PopulateTensor<int32_t>(model.axis(), {0});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
   EXPECT_THAT(model.GetOutput(),
-              ElementsAreArray({Eigen::half(4), Eigen::half(3), Eigen::half(2),
-                                Eigen::half(1)}));
+              ElementsAreArray({half(4), half(3), half(2), half(1)}));
 }
 
 TEST(ReverseOpTest, Float16MultiDimensions) {
-  ReverseOpModel<Eigen::half> model({TensorType_FLOAT16, {4, 3, 2}},
-                                    {TensorType_INT32, {1}});
-  model.PopulateTensor<Eigen::half>(
+  ReverseOpModel<half> model({TensorType_FLOAT16, {4, 3, 2}},
+                             {TensorType_INT32, {1}});
+  model.PopulateTensor<half>(
       model.input(),
-      {Eigen::half(1),  Eigen::half(2),  Eigen::half(3),  Eigen::half(4),
-       Eigen::half(5),  Eigen::half(6),  Eigen::half(7),  Eigen::half(8),
-       Eigen::half(9),  Eigen::half(10), Eigen::half(11), Eigen::half(12),
-       Eigen::half(13), Eigen::half(14), Eigen::half(15), Eigen::half(16),
-       Eigen::half(17), Eigen::half(18), Eigen::half(19), Eigen::half(20),
-       Eigen::half(21), Eigen::half(22), Eigen::half(23), Eigen::half(24)});
+      {half(1),  half(2),  half(3),  half(4),  half(5),  half(6),
+       half(7),  half(8),  half(9),  half(10), half(11), half(12),
+       half(13), half(14), half(15), half(16), half(17), half(18),
+       half(19), half(20), half(21), half(22), half(23), half(24)});
   model.PopulateTensor<int32_t>(model.axis(), {1});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
   EXPECT_THAT(
       model.GetOutput(),
-      ElementsAreArray({Eigen::half(5),  Eigen::half(6),  Eigen::half(3),
-                        Eigen::half(4),  Eigen::half(1),  Eigen::half(2),
-                        Eigen::half(11), Eigen::half(12), Eigen::half(9),
-                        Eigen::half(10), Eigen::half(7),  Eigen::half(8),
-                        Eigen::half(17), Eigen::half(18), Eigen::half(15),
-                        Eigen::half(16), Eigen::half(13), Eigen::half(14),
-                        Eigen::half(23), Eigen::half(24), Eigen::half(21),
-                        Eigen::half(22), Eigen::half(19), Eigen::half(20)}));
+      ElementsAreArray({half(5),  half(6),  half(3),  half(4),  half(1),
+                        half(2),  half(11), half(12), half(9),  half(10),
+                        half(7),  half(8),  half(17), half(18), half(15),
+                        half(16), half(13), half(14), half(23), half(24),
+                        half(21), half(22), half(19), half(20)}));
 }
 
 // bfloat16 tests.
diff --git a/tensorflow/lite/kernels/round_test.cc b/tensorflow/lite/kernels/round_test.cc
index c3752827f3e61c..e3fccf888c9815 100644
--- a/tensorflow/lite/kernels/round_test.cc
+++ b/tensorflow/lite/kernels/round_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -68,33 +69,29 @@ TEST(RoundOpTest, MultiDims) {
 }
 
 TEST(RoundOpTest, Float16SingleDim) {
-  RoundOpModel<Eigen::half> model({6});
-  model.PopulateTensor<Eigen::half>(
-      model.input(), {Eigen::half(8.5), Eigen::half(0.0), Eigen::half(3.5),
-                      Eigen::half(4.2), Eigen::half(-3.5), Eigen::half(-4.5)});
+  RoundOpModel<half> model({6});
+  model.PopulateTensor<half>(model.input(),
+                             {half(8.5f), half(0.0f), half(3.5f), half(4.2f),
+                              half(-3.5f), half(-4.5f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(
-      model.GetOutput(),
-      ElementsAreArray({Eigen::half(8), Eigen::half(0), Eigen::half(4),
-                        Eigen::half(4), Eigen::half(-4), Eigen::half(-4)}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray(
+                  {half(8), half(0), half(4), half(4), half(-4), half(-4)}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({6}));
 }
 
 TEST(RoundOpTest, Float16MultiDims) {
-  RoundOpModel<Eigen::half> model({2, 1, 1, 6});
-  model.PopulateTensor<Eigen::half>(
+  RoundOpModel<half> model({2, 1, 1, 6});
+  model.PopulateTensor<half>(
       model.input(),
-      {Eigen::half(0.0001), Eigen::half(8.0001), Eigen::half(0.9999),
-       Eigen::half(9.9999), Eigen::half(0.5), Eigen::half(-0.0001),
-       Eigen::half(-8.0001), Eigen::half(-0.9999), Eigen::half(-9.9999),
-       Eigen::half(-0.5), Eigen::half(-2.5), Eigen::half(1.5)});
+      {half(0.0001f), half(8.0001f), half(0.9999f), half(9.9999f), half(0.5f),
+       half(-0.0001f), half(-8.0001f), half(-0.9999f), half(-9.9999f),
+       half(-0.5f), half(-2.5f), half(1.5f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(
-      model.GetOutput(),
-      ElementsAreArray({Eigen::half(0), Eigen::half(8), Eigen::half(1),
-                        Eigen::half(10), Eigen::half(0), Eigen::half(0),
-                        Eigen::half(-8), Eigen::half(-1), Eigen::half(-10),
-                        Eigen::half(-0), Eigen::half(-2), Eigen::half(2)}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({half(0), half(8), half(1), half(10), half(0),
+                                half(0), half(-8), half(-1), half(-10),
+                                half(-0), half(-2), half(2)}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 6}));
 }
 
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index feb02c48d2f3aa..2f3430770f7b68 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
-#include "Eigen/Core"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/core/c/common.h"
@@ -29,6 +28,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -338,20 +338,16 @@ TEST_P(SliceOpTest, SliceBool) {
 }
 
 TEST_P(SliceOpTest, SliceFloat16) {
-  SliceOpModel<Eigen::half, int32_t> m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4},
-                                       {2, 1, -1, 1}, TensorType_INT32,
-                                       TensorType_FLOAT16, GetParam());
-  m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(1), Eigen::half(2),
-              Eigen::half(2), Eigen::half(2), Eigen::half(3), Eigen::half(3),
-              Eigen::half(3), Eigen::half(4), Eigen::half(4), Eigen::half(4),
-              Eigen::half(5), Eigen::half(5), Eigen::half(5), Eigen::half(6),
-              Eigen::half(6), Eigen::half(6)});
+  SliceOpModel<half, int32_t> m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4},
+                                {2, 1, -1, 1}, TensorType_INT32,
+                                TensorType_FLOAT16, GetParam());
+  m.SetInput({half(1), half(1), half(1), half(2), half(2), half(2), half(3),
+              half(3), half(3), half(4), half(4), half(4), half(5), half(5),
+              half(5), half(6), half(6), half(6)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray({Eigen::half(3), Eigen::half(3), Eigen::half(3),
-                        Eigen::half(5), Eigen::half(5), Eigen::half(5)}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({half(3), half(3), half(3),
+                                               half(5), half(5), half(5)}));
 }
 
 TEST_P(SliceOpTest, SliceBFloat16) {
@@ -373,19 +369,16 @@ TEST_P(SliceOpTest, SliceBFloat16) {
 }
 
 TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1Float16) {
-  SliceOpModel<Eigen::half, int32_t> m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4},
-                                       {2, -1, 1, 1}, TensorType_INT32,
-                                       TensorType_FLOAT16, GetParam());
-  m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(2), Eigen::half(2),
-              Eigen::half(3), Eigen::half(3), Eigen::half(4), Eigen::half(4),
-              Eigen::half(5), Eigen::half(5), Eigen::half(6), Eigen::half(6),
-              Eigen::half(7), Eigen::half(7), Eigen::half(8), Eigen::half(8),
-              Eigen::half(9), Eigen::half(9)});
+  SliceOpModel<half, int32_t> m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4},
+                                {2, -1, 1, 1}, TensorType_INT32,
+                                TensorType_FLOAT16, GetParam());
+  m.SetInput({half(1), half(1), half(2), half(2), half(3), half(3), half(4),
+              half(4), half(5), half(5), half(6), half(6), half(7), half(7),
+              half(8), half(8), half(9), half(9)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1, 1}));
   EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray({Eigen::half(5), Eigen::half(6), Eigen::half(8),
-                                Eigen::half(9)}));
+              ElementsAreArray({half(5), half(6), half(8), half(9)}));
 }
 
 TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1BFloat16) {
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 6ba4ef3b78977f..f7c79680576fe1 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -22,8 +22,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive  // IWYU pragma: keep
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -152,7 +154,7 @@ class StridedSliceOpModel : public SingleOpModel {
 template <typename T>
 class StridedSliceOpTest : public ::testing::Test {};
 
-using DataTypes = ::testing::Types<float, Eigen::half, Eigen::bfloat16, uint8_t,
+using DataTypes = ::testing::Types<float, half, Eigen::bfloat16, uint8_t,
                                    uint32_t, int8_t, int16_t, int32_t>;
 TYPED_TEST_SUITE(StridedSliceOpTest, DataTypes);
 
@@ -347,7 +349,9 @@ TYPED_TEST(StridedSliceOpTest, In1D_Int32End) {
       continue;
     }
     std::vector<TypeParam> values(32768);
-    std::iota(values.begin(), values.end(), TypeParam(0));
+    for (int i = 0; i < 32768; ++i) {
+      values[i] = static_cast<TypeParam>(i);
+    }
 
     StridedSliceOpModel<TypeParam> m({32768}, {1}, {1}, {1}, values, {0},
                                      {32768}, {1}, 0, 0, 0, 0, 0,
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index cbdb74d29d04aa..1bd870d2c56c92 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -38,7 +38,6 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "fp16/fp16.h"  // from @FP16
 #include "absl/algorithm/container.h"
 #include "absl/log/absl_check.h"
 #include "absl/log/absl_log.h"
@@ -57,6 +56,7 @@ limitations under the License.
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/util.h"  // IWYU pragma: keep
 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include "tensorflow/lite/types/half.h"
 #include "tensorflow/lite/util.h"
 #include "tsl/platform/logging.h"
 
@@ -134,7 +134,7 @@ inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
 }
 
 template <>
-constexpr TfLiteType typeToTfLiteType<Eigen::half>() {
+constexpr TfLiteType typeToTfLiteType<half>() {
   return kTfLiteFloat16;
 }
 
@@ -1362,7 +1362,7 @@ TFLITE_TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16);
 TFLITE_TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32);
 TFLITE_TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64);
 TFLITE_TENSOR_TYPE_ASSOC(TfLiteFloat16, TensorType_FLOAT16);
-TFLITE_TENSOR_TYPE_ASSOC(Eigen::half, TensorType_FLOAT16);
+TFLITE_TENSOR_TYPE_ASSOC(half, TensorType_FLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(TfLiteBFloat16, TensorType_BFLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(Eigen::bfloat16, TensorType_BFLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32);
@@ -1461,13 +1461,13 @@ struct TypeUnion<uint8_t> {
 };
 
 template <>
-struct TypeUnion<Eigen::half> {
+struct TypeUnion<half> {
  public:
   // NOLINTNEXTLINE
   static constexpr TensorType tensor_type = TensorType::TensorType_FLOAT16;
   // NOLINTNEXTLINE
   static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteFloat16;
-  typedef Eigen::half ScalarType;
+  typedef half ScalarType;
 };
 
 template <>
diff --git a/tensorflow/lite/kernels/test_util_test.cc b/tensorflow/lite/kernels/test_util_test.cc
index ed9a679b4e4d33..01f514692b0616 100644
--- a/tensorflow/lite/kernels/test_util_test.cc
+++ b/tensorflow/lite/kernels/test_util_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/array.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/kernels/test_delegate_providers.h"
+#include "tensorflow/lite/types/half.h"
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
@@ -197,6 +198,14 @@ TEST(TestUtilTest, QuantizeVectorScalingUp) {
   EXPECT_THAT(q_data, ElementsAreArray(expected));
 }
 
+TEST(TestUtilTest, DequantizeVectorFp16) {
+  std::vector<half> data = {half(-1.0f), half(-0.5f), half(0.0f), half(0.5f),
+                            half(1.0f)};
+  auto f_data = Dequantize<half>(data, /*scale=*/0.1f, /*zero_point=*/0);
+  std::vector<float> expected = {-0.1f, -0.05f, 0.0f, 0.05f, 0.1f};
+  EXPECT_THAT(f_data, ElementsAreArray(tflite::ArrayFloatNear(expected, 1e-7)));
+}
+
 TEST(DimsAreMatcherTestTensor, ValidOneD) {
   TensorUniquePtr t = BuildTfLiteTensor(kTfLiteInt32, {2}, kTfLiteDynamic);
   EXPECT_THAT(t.get(), DimsAre({2}));

From ca3cb67ac904514dcee47e4a9aa8b8be7d1f1ba3 Mon Sep 17 00:00:00 2001
From: Hyeontaek Lim <hyeontaek@google.com>
Date: Wed, 10 Dec 2025 15:23:46 -0800
Subject: [PATCH 144/753] [PJRT:CPU] Delay the fulfillment of `returned_future`
 until (possibly async) all execution activities finish

`returned_future` from `PjRtLoadedExecutable::Execute()` is frequently used as an indicator of whether the entire execution activity has finished and it is safe to destroy external resources used for prior execution dispatches (e.g., loaded callbacks, execute_context).

In the PJRT CPU client, with execution poisoning, `execute_event` may be set before an in-flight execution using the computation backend (e.g., eigen) finishes completely, and `execute_event`'s ready state was forwarded immediately `returned_future`, which has made `returned_future` become prematurely ready. If the user code releases the external resources, the inflight execution may attempt to access the external resources and causes a segfault or sigill.

This change makes `returned_future` to be fulfilled only if all execution activities are complete. This allows the user code to tear down of the execution environment and loaded executables safely.

PiperOrigin-RevId: 842904201
---
 third_party/xla/xla/pjrt/cpu/cpu_client.cc | 65 ++++++++++++++--------
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 55690711a5fc40..98d5c48ef63655 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -1390,6 +1390,15 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
   auto execute_event = tsl::MakeConstructedAsyncValueRef<CpuEvent>();
   MarkEventReadyOnExit ready_on_exit(execute_event);
   auto execute_usage_event = tsl::MakeRef<CpuTrackedDeviceEvent>(execute_event);
+  // `returned_future_can_be_set_event` indicates when `returned_future` can be
+  // set using `execute_event`. This is necessary to delay setting the
+  // `returned_future` until all (async) execution activities are complete even
+  // if `execute_event` itself may be set early due to execution poisoning. This
+  // lets the user rely on `returned_future` when there is no more in-flight
+  // executions and destroy any external resources such as loaded callbacks and
+  // execute contexts.
+  auto returned_future_can_be_set_event =
+      tsl::MakeConstructedAsyncValueRef<CpuEvent>();
 
   absl::InlinedVector<CommonPjRtBuffer::ScopedHold, 4> donation_transactions;
 
@@ -1689,6 +1698,8 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
       return thunks_execute_event.GetError();
     }
 
+    returned_future_can_be_set_event.SetStateConcrete();
+
   } else {
     // Asynchronously call generated function.
 
@@ -1718,20 +1729,21 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
         device->async_execution_tracker()->NewAsyncExecution(
             run_id.ToInt(), std::move(ready_on_exit).Release());
     client()->async_work_runner()->ScheduleWhenReady(
-        input_deps,
-        [cpu_executable, buffer_alloc = std::move(buffer_alloc),
-         buffer_alloc_and_copy = std::move(buffer_alloc_and_copy),
-         buffer_table = std::move(buffer_table),
-         run_options = std::move(run_options),
-         device_assignment = std::move(device_assignment),
-         cpu_run_options = std::move(cpu_run_options),
-         compute_reservation = std::move(compute_reservation),
-         tuple_index_table = std::move(tuple_index_table),
-         donation_transactions = std::move(donation_transactions),
-         scoped_async_execution = std::move(scoped_async_execution),
-         input_deps_avs = std::move(input_deps_avs_copy),
-         allocator = client()->allocator(),
-         eigen_device = client()->eigen_intraop_device()]() mutable {
+        input_deps, [cpu_executable, buffer_alloc = std::move(buffer_alloc),
+                     buffer_alloc_and_copy = std::move(buffer_alloc_and_copy),
+                     buffer_table = std::move(buffer_table),
+                     run_options = std::move(run_options),
+                     device_assignment = std::move(device_assignment),
+                     cpu_run_options = std::move(cpu_run_options),
+                     compute_reservation = std::move(compute_reservation),
+                     tuple_index_table = std::move(tuple_index_table),
+                     donation_transactions = std::move(donation_transactions),
+                     scoped_async_execution = std::move(scoped_async_execution),
+                     input_deps_avs = std::move(input_deps_avs_copy),
+                     allocator = client()->allocator(),
+                     eigen_device = client()->eigen_intraop_device(),
+                     returned_future_can_be_set_event =
+                         returned_future_can_be_set_event.CopyRef()]() mutable {
           // Because `input_deps` contains the definition events of all inputs,
           // when it is ready, all input buffers must have been allocated. So,
           // we are safe to allocate and copy memory here. Since `execute_event`
@@ -1743,6 +1755,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
             if (auto* error = av->GetErrorIfPresent()) {
               scoped_async_execution.SetError(Internal(
                   "Error dispatching computation: %s", error->message()));
+              returned_future_can_be_set_event.SetStateConcrete();
               return;
             }
           }
@@ -1758,6 +1771,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
               scoped_async_execution.SetError(
                   Internal("Error preparing computation: %s",
                            buffer_info.buffer.GetError().message()));
+              returned_future_can_be_set_event.SetStateConcrete();
               return;
             }
           }
@@ -1840,10 +1854,13 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
           if (!status.ok()) {
             // CPU computation fails with an error.
             scoped_async_execution.SetError(std::move(status));
+            returned_future_can_be_set_event.SetStateConcrete();
+            return;
           }
 
           // CPU computation completes.
           scoped_async_execution.SetStateConcrete();
+          returned_future_can_be_set_event.SetStateConcrete();
         });
   }
 
@@ -1884,14 +1901,18 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
 
   if (fill_future) {
     auto [promise, future] = Future<>::MakePromise();
-    execute_event.AndThen([promise = std::move(promise),
-                           event = execute_event.CopyRef()]() mutable {
-      if (auto* error = event.GetErrorIfPresent()) {
-        promise.Set(Internal("Compute error: %s", error->message()));
-      } else {
-        promise.Set();
-      }
-    });
+    returned_future_can_be_set_event.AndThen(
+        [execute_event = std::move(execute_event),
+         promise = std::move(promise)]() mutable {
+          execute_event.AndThen([execute_event = execute_event.CopyRef(),
+                                 promise = std::move(promise)]() mutable {
+            if (auto* error = execute_event.GetErrorIfPresent()) {
+              promise.Set(Internal("Compute error: %s", error->message()));
+            } else {
+              promise.Set();
+            }
+          });
+        });
     return Result({std::move(future), /*buffers=*/std::move(res)});
   }
 

From 7a8a6036a1d6ccecbb6b76a6e0521cec8dee88bf Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Wed, 10 Dec 2025 15:30:16 -0800
Subject: [PATCH 145/753] [Utilities][ReplicaGroupV3] Add V3 ExpandDeviceGroups
 function. This utilizes the functionality of V2 implementation.

PiperOrigin-RevId: 842906395
---
 .../xla/xla/service/spmd/spmd_partitioner_util.cc        | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
index 8e79e7c16d2e84..9b7c68b257a358 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
@@ -536,6 +536,15 @@ std::optional<IotaReplicaGroupList> ExpandDeviceGroupsWithIota(
                               processed_device_groups.iota()->transpose_perm());
 }
 
+// Expand the device groups, given a mesh-axes partition group list.
+// NOLINTNEXTLINE(clang-diagnostic-unused-function)
+std::optional<IotaReplicaGroupList> ExpandDeviceGroupsWithMeshAxes(
+    const DeviceGroupTileAssignment& device_groups,
+    MeshAxesReplicaGroupList* partition_group_list) {
+  return ExpandDeviceGroupsWithIota(
+      device_groups, partition_group_list->ToIotaReplicaGroupList());
+}
+
 SPMDCollectiveOpsCreator GetPerGroupCollectiveOpsCreator(
     const SPMDCollectiveOpsCreator& creator,
     const DeviceGroupTileAssignment& device_groups) {

From f3366660078e859f3aa33684505574d7ae4b03ac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 15:44:11 -0800
Subject: [PATCH 146/753] Reverts 45fb84067d13d40feee87d26bd6a4b086fa8b319

PiperOrigin-RevId: 842911482
---
 tensorflow/lite/kernels/BUILD                 | 30 ++----
 tensorflow/lite/kernels/activations_test.cc   | 43 +++++----
 tensorflow/lite/kernels/atan2_test.cc         | 29 +++---
 tensorflow/lite/kernels/cast_test.cc          | 19 ++--
 tensorflow/lite/kernels/comparisons_test.cc   | 11 ++-
 tensorflow/lite/kernels/concatenation_test.cc | 44 +++++----
 .../lite/kernels/dynamic_update_slice_test.cc | 17 ++--
 tensorflow/lite/kernels/fill_test.cc          |  5 +-
 tensorflow/lite/kernels/floor_test.cc         | 31 +++---
 tensorflow/lite/kernels/gather_nd_test.cc     | 62 ++++++------
 tensorflow/lite/kernels/gather_test.cc        |  4 +-
 .../lite/kernels/maximum_minimum_test.cc      | 96 ++++++++++---------
 tensorflow/lite/kernels/neg_test.cc           | 13 +--
 tensorflow/lite/kernels/pad_test.cc           | 46 ++++-----
 tensorflow/lite/kernels/reverse_test.cc       | 42 ++++----
 tensorflow/lite/kernels/round_test.cc         | 37 +++----
 tensorflow/lite/kernels/slice_test.cc         | 39 ++++----
 tensorflow/lite/kernels/strided_slice_test.cc |  8 +-
 tensorflow/lite/kernels/test_util.h           | 10 +-
 tensorflow/lite/kernels/test_util_test.cc     |  9 --
 20 files changed, 294 insertions(+), 301 deletions(-)

diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index db2435b081d36b..6a3ec9f57e2a02 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -167,7 +167,7 @@ cc_library(
         "//tensorflow/lite/tools/optimize:quantization_utils",
         "//tensorflow/lite/tools/serialization:writer_lib",
         "//tensorflow/lite/tools/versioning",
-        "//tensorflow/lite/types:half",
+        "@FP16",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
@@ -574,7 +574,6 @@ cc_test(
         "//tensorflow/lite:array",
         "//tensorflow/lite:util",
         "//tensorflow/lite/core/c:common",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -1119,7 +1118,6 @@ cc_test(
         "//tensorflow/lite/core:framework_stable",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
@@ -1501,7 +1499,6 @@ cc_test(
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/random",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest",
@@ -1518,8 +1515,8 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@eigen_archive//:eigen3",
         "@flatbuffers",
     ],
 )
@@ -1712,7 +1709,6 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1743,7 +1739,6 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -1858,7 +1853,6 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -1919,8 +1913,8 @@ cc_test(
         ":test_util",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@flatbuffers",
     ],
 )
 
@@ -1978,7 +1972,6 @@ cc_test(
         "//tensorflow/lite:framework_stable",
         "//tensorflow/lite/core:framework_stable",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2089,12 +2082,12 @@ cc_test(
     deps = [
         ":test_main",
         ":test_util",
+        "//tensorflow/lite:string",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
+        "@flatbuffers",
     ],
 )
 
@@ -2108,9 +2101,7 @@ cc_test(
         "//tensorflow/lite:string",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
         "@flatbuffers",
     ],
 )
@@ -2494,9 +2485,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2541,7 +2530,6 @@ cc_test(
         ":test_util",
         "//tensorflow/lite:string",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2556,7 +2544,6 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2594,8 +2581,8 @@ cc_test(
         "//tensorflow/lite/kernels/internal:tensor_ctypes",
         "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2894,7 +2881,6 @@ cc_test(
         ":test_util",
         "//tensorflow/lite:string",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -2919,7 +2905,6 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -3188,13 +3173,12 @@ cc_test(
     size = "small",
     srcs = ["dynamic_update_slice_test.cc"],
     deps = [
-        ":subgraph_test_util",
         ":test_main",
         ":test_util",
         "//tensorflow/lite:framework_stable",
         "//tensorflow/lite/core:framework_stable",
+        "//tensorflow/lite/kernels:subgraph_test_util",
         "//tensorflow/lite/schema:schema_fbs",
-        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
         "@flatbuffers",
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 96bb22ed76c431..42747a87e61b2a 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 
@@ -575,17 +574,18 @@ TEST_P(TanhOpTest, Tanh) {
 }
 
 TEST_P(TanhOpTest, TanhFloat16) {
-  FloatActivationsOpModel<half> m(GetRegistration(), BuiltinOperator_TANH,
-                                  /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
+  FloatActivationsOpModel<Eigen::half> m(
+      GetRegistration(), BuiltinOperator_TANH,
+      /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
   m.SetInput({
-      half(0),
-      half(-6),
-      half(2),
-      half(4),
-      half(3),
-      half(-2),
-      half(10),
-      half(1),
+      Eigen::half(0),
+      Eigen::half(-6),
+      Eigen::half(2),
+      Eigen::half(4),
+      Eigen::half(3),
+      Eigen::half(-2),
+      Eigen::half(10),
+      Eigen::half(1),
   });
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
@@ -1210,17 +1210,18 @@ TEST_P(LogisticOpTest, SigmoidFloat32) {
 }
 
 TEST_P(LogisticOpTest, SigmoidFloat16) {
-  FloatActivationsOpModel<half> m(GetRegistration(), BuiltinOperator_LOGISTIC,
-                                  /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
+  FloatActivationsOpModel<Eigen::half> m(
+      GetRegistration(), BuiltinOperator_LOGISTIC,
+      /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
   m.SetInput({
-      half{-1.2f},
-      half{-6.0f},
-      half{2.0f},
-      half{4.0f},
-      half{3.0f},
-      half{-2.0f},
-      half{10.0f},
-      half{1.0f},
+      Eigen::half{-1.2f},
+      Eigen::half{-6.0f},
+      Eigen::half{2.0f},
+      Eigen::half{4.0f},
+      Eigen::half{3.0f},
+      Eigen::half{-2.0f},
+      Eigen::half{10.0f},
+      Eigen::half{1.0f},
   });
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/atan2_test.cc b/tensorflow/lite/kernels/atan2_test.cc
index 0c3839361570a6..309ba79f284f3f 100644
--- a/tensorflow/lite/kernels/atan2_test.cc
+++ b/tensorflow/lite/kernels/atan2_test.cc
@@ -17,7 +17,6 @@
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -35,7 +34,7 @@ tflite::TensorType GetTTEnum<double>() {
 }
 
 template <>
-tflite::TensorType GetTTEnum<half>() {
+tflite::TensorType GetTTEnum<Eigen::half>() {
   return tflite::TensorType_FLOAT16;
 }
 
@@ -75,7 +74,7 @@ class Atan2Test : public ::testing::Test {
   using FloatType = Float;
 };
 
-using TestTypes = ::testing::Types<float, double, half, Eigen::bfloat16>;
+using TestTypes = ::testing::Types<float, double, Eigen::half, Eigen::bfloat16>;
 
 TYPED_TEST_SUITE(Atan2Test, TestTypes);
 
@@ -86,15 +85,15 @@ TYPED_TEST(Atan2Test, TestScalar) {
   tflite::TensorData output = {GetTTEnum<Float>(), {}};
   Atan2Model m(y, x, output);
 
-  auto got = m.GetOutput<Float>({Float(0.0f)}, {Float(0.0f)});
+  auto got = m.GetOutput<Float>({Float(0.0)}, {Float(0.0)});
   ASSERT_EQ(got.size(), 1);
   EXPECT_FLOAT_EQ(got[0], 0.0);
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(1.0f)}, {Float(0.0f)})[0],
-                  Float(static_cast<float>(M_PI / 2)));
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(0.0f)}, {Float(1.0f)})[0],
-                  Float(0.0f));
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(-1.0f)}, {Float(0.0f)})[0],
-                  Float(-static_cast<float>(M_PI / 2)));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(1.0)}, {Float(0.0)})[0],
+                  Float(M_PI / 2));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(0.0)}, {Float(1.0)})[0],
+                  Float(0.0));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(-1.0)}, {Float(0.0)})[0],
+                  Float(-M_PI / 2));
 }
 
 TYPED_TEST(Atan2Test, TestBatch) {
@@ -103,12 +102,10 @@ TYPED_TEST(Atan2Test, TestBatch) {
   tflite::TensorData x = {GetTTEnum<Float>(), {4, 2, 1}};
   tflite::TensorData output = {GetTTEnum<Float>(), {4, 2, 1}};
   Atan2Model m(y, x, output);
-  std::vector<Float> y_data = {Float(0.1f), Float(0.2f), Float(0.3f),
-                               Float(0.4f), Float(0.5f), Float(0.6f),
-                               Float(0.7f), Float(0.8f)};
-  std::vector<Float> x_data = {Float(0.8f), Float(0.7f), Float(0.6f),
-                               Float(0.5f), Float(0.4f), Float(0.3f),
-                               Float(0.2f), Float(0.1f)};
+  std::vector<Float> y_data = {Float(0.1), Float(0.2), Float(0.3), Float(0.4),
+                               Float(0.5), Float(0.6), Float(0.7), Float(0.8)};
+  std::vector<Float> x_data = {Float(0.8), Float(0.7), Float(0.6), Float(0.5),
+                               Float(0.4), Float(0.3), Float(0.2), Float(0.1)};
   auto got = m.GetOutput<Float>(y_data, x_data);
   ASSERT_EQ(got.size(), 8);
   for (int i = 0; i < 8; ++i) {
diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
index bcc9b4bc058003..77cc2f3442b1c2 100644
--- a/tensorflow/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -414,10 +413,11 @@ TEST(CastOpModel, CastFloatToFloat16) {
   m.PopulateTensor<float>(m.input(), {100.f, 1.0f, 0.f, 0.4f, 1.999f, 1.1f});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
-      m.ExtractVector<half>(m.output()),
-      ElementsAreArray({static_cast<half>(100.f), static_cast<half>(1.0f),
-                        static_cast<half>(0.f), static_cast<half>(0.4f),
-                        static_cast<half>(1.999f), static_cast<half>(1.1f)}));
+      m.ExtractVector<Eigen::half>(m.output()),
+      ElementsAreArray(
+          {static_cast<Eigen::half>(100.f), static_cast<Eigen::half>(1.0f),
+           static_cast<Eigen::half>(0.f), static_cast<Eigen::half>(0.4f),
+           static_cast<Eigen::half>(1.999f), static_cast<Eigen::half>(1.1)}));
 }
 
 TEST(CastOpModel, CastFloatToBFloat16) {
@@ -435,10 +435,11 @@ TEST(CastOpModel, CastFloatToBFloat16) {
 
 TEST(CastOpModel, CastFloat16ToFloat) {
   CastOpModel m({TensorType_FLOAT16, {3, 2}}, {TensorType_FLOAT32, {3, 2}});
-  m.PopulateTensor<half>(m.input(),
-                         {static_cast<half>(100.f), static_cast<half>(1.0f),
-                          static_cast<half>(0.f), static_cast<half>(0.4f),
-                          static_cast<half>(1.999f), static_cast<half>(1.1f)});
+  m.PopulateTensor<Eigen::half>(
+      m.input(),
+      {static_cast<Eigen::half>(100.f), static_cast<Eigen::half>(1.0f),
+       static_cast<Eigen::half>(0.f), static_cast<Eigen::half>(0.4f),
+       static_cast<Eigen::half>(1.999f), static_cast<Eigen::half>(1.1f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.ExtractVector<float>(m.output()),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index bc2091aa823832..10226bb60a8ed8 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -397,10 +396,12 @@ TEST(ComparisonsTest, LessFloat) {
 TEST(ComparisonsTest, LessFloat16) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT16,
                           BuiltinOperator_LESS);
-  model.PopulateTensor<half>(model.input1(),
-                             {half(0.1f), half(0.9f), half(0.7f), half(0.3f)});
-  model.PopulateTensor<half>(model.input2(),
-                             {half(0.1f), half(0.2f), half(0.6f), half(0.5f)});
+  model.PopulateTensor<Eigen::half>(
+      model.input1(),
+      {Eigen::half(0.1), Eigen::half(0.9), Eigen::half(0.7), Eigen::half(0.3)});
+  model.PopulateTensor<Eigen::half>(
+      model.input2(),
+      {Eigen::half(0.1), Eigen::half(0.2), Eigen::half(0.6), Eigen::half(0.5)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true));
diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
index f9c765375cc20f..28692ae1528dd3 100644
--- a/tensorflow/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -122,11 +121,12 @@ TEST(ConcatenationOpTest, ThreeDimensionalOneInputBFloat16) {
 }
 
 TEST(ConcatenationOpTest, ThreeDimensionalOneInputFloat16) {
-  ConcatenationOpModel<half> m({TensorType_FLOAT16, {2, 1, 2}},
-                               /*axis=*/1,
-                               /*num_inputs=*/1);
-  m.SetInput(0, {static_cast<half>(1.0f), static_cast<half>(3.0f),
-                 static_cast<half>(4.0f), static_cast<half>(7.0f)});
+  ConcatenationOpModel<Eigen::half> m({TensorType_FLOAT16, {2, 1, 2}},
+                                      /*axis=*/1,
+                                      /*num_inputs=*/1);
+  m.SetInput(0,
+             {static_cast<Eigen::half>(1.0f), static_cast<Eigen::half>(3.0f),
+              static_cast<Eigen::half>(4.0f), static_cast<Eigen::half>(7.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
 }
@@ -206,21 +206,23 @@ TEST(ConcatenationOpTest, FiveDimensionalTwoInputBFloat16) {
 }
 
 TEST(ConcatenationOpTest, FiveDimensionalTwoInputFloat16) {
-  ConcatenationOpModel<half> m({TensorType_FLOAT16, {2, 1, 2, 1, 3}},
-                               /*axis=*/0,
-                               /*num_inputs=*/2);
-  m.SetInput(0, {static_cast<half>(1.0f), static_cast<half>(2.0f),
-                 static_cast<half>(3.0f), static_cast<half>(4.0f),
-                 static_cast<half>(5.0f), static_cast<half>(6.0f),
-                 static_cast<half>(7.0f), half{8.0f}, static_cast<half>(9.0f),
-                 static_cast<half>(10.0f), static_cast<half>(11.0f),
-                 static_cast<half>(12.0f)});
-  m.SetInput(1,
-             {static_cast<half>(13.0f), static_cast<half>(14.0f), half{15.0f},
-              static_cast<half>(16.0f), half{17.0f}, static_cast<half>(18.0f),
-              static_cast<half>(19.0f), static_cast<half>(20.0f),
-              static_cast<half>(21.0f), static_cast<half>(22.0f),
-              static_cast<half>(23.0f), static_cast<half>(24.0f)});
+  ConcatenationOpModel<Eigen::half> m({TensorType_FLOAT16, {2, 1, 2, 1, 3}},
+                                      /*axis=*/0,
+                                      /*num_inputs=*/2);
+  m.SetInput(
+      0, {static_cast<Eigen::half>(1.0f), static_cast<Eigen::half>(2.0f),
+          static_cast<Eigen::half>(3.0f), static_cast<Eigen::half>(4.0f),
+          static_cast<Eigen::half>(5.0f), static_cast<Eigen::half>(6.0f),
+          static_cast<Eigen::half>(7.0f), Eigen::half{8.0f},
+          static_cast<Eigen::half>(9.0f), static_cast<Eigen::half>(10.0f),
+          static_cast<Eigen::half>(11.0f), static_cast<Eigen::half>(12.0f)});
+  m.SetInput(
+      1, {static_cast<Eigen::half>(13.0f), static_cast<Eigen::half>(14.0f),
+          Eigen::half{15.0f}, static_cast<Eigen::half>(16.0f),
+          Eigen::half{17.0f}, static_cast<Eigen::half>(18.0f),
+          static_cast<Eigen::half>(19.0f), static_cast<Eigen::half>(20.0f),
+          static_cast<Eigen::half>(21.0f), static_cast<Eigen::half>(22.0f),
+          static_cast<Eigen::half>(23.0f), static_cast<Eigen::half>(24.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
       m.GetOutput(),
diff --git a/tensorflow/lite/kernels/dynamic_update_slice_test.cc b/tensorflow/lite/kernels/dynamic_update_slice_test.cc
index 99aa637a068d23..373a719d5ac412 100644
--- a/tensorflow/lite/kernels/dynamic_update_slice_test.cc
+++ b/tensorflow/lite/kernels/dynamic_update_slice_test.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/subgraph_test_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -113,9 +112,10 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) {
   DynamicUpdateSliceOpModel m({TensorType_FLOAT16, {3, 3}},
                               {TensorType_FLOAT16, {2, 1}},
                               {TensorType_INT32, {2}});
-  m.SetInput<half>({half(1), half(2), half(3), half(4), half(5), half(6),
-                    half(7), half(8), half(9)});
-  m.SetUpdate<half>({half(-1), half(-2)});
+  m.SetInput<Eigen::half>({Eigen::half(1), Eigen::half(2), Eigen::half(3),
+                           Eigen::half(4), Eigen::half(5), Eigen::half(6),
+                           Eigen::half(7), Eigen::half(8), Eigen::half(9)});
+  m.SetUpdate<Eigen::half>({Eigen::half(-1), Eigen::half(-2)});
   m.SetStartIndices<int32_t>({1, 1});
   const int kInplaceInputTensorIdx = 0;
   const int kInplaceOutputTensorIdx = 0;
@@ -123,10 +123,11 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) {
   TfLiteTensor* output_tensor = m.GetOutputTensor(kInplaceOutputTensorIdx);
   output_tensor->data.data = input_tensor->data.data;
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput<half>(),
-              ElementsAreArray(
-                  ArrayFloatNear({half(1), half(2), half(3), half(4), half(-1),
-                                  half(6), half(7), half(-2), half(9)})));
+  EXPECT_THAT(m.GetOutput<Eigen::half>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {Eigen::half(1), Eigen::half(2), Eigen::half(3),
+                   Eigen::half(4), Eigen::half(-1), Eigen::half(6),
+                   Eigen::half(7), Eigen::half(-2), Eigen::half(9)})));
   EXPECT_EQ(output_tensor->data.data, input_tensor->data.data);
 }
 
diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc
index a8e9815f30bc61..028623e3a0a321 100644
--- a/tensorflow/lite/kernels/fill_test.cc
+++ b/tensorflow/lite/kernels/fill_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -140,8 +139,8 @@ TEST_P(FillOpTest, FillFloat) {
 }
 
 TEST_P(FillOpTest, FillFloat16) {
-  FillOpModel<int64_t, half> m(TensorType_INT64, {3}, {2, 2, 2}, half(4.0f),
-                               GetParam());
+  FillOpModel<int64_t, Eigen::half> m(TensorType_INT64, {3}, {2, 2, 2},
+                                      Eigen::half(4.0f), GetParam());
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
       m.GetOutput(),
diff --git a/tensorflow/lite/kernels/floor_test.cc b/tensorflow/lite/kernels/floor_test.cc
index 13154175e334cc..86ea68ad39e599 100644
--- a/tensorflow/lite/kernels/floor_test.cc
+++ b/tensorflow/lite/kernels/floor_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -80,28 +79,28 @@ TEST(FloorOpTest, MultiDims) {
 
 TEST(FloorOpTest, SingleDimFloat16) {
   FloorOpModel model({2}, TensorType_FLOAT16);
-  model.PopulateTensor<>(model.input(), {half(8.5f), half(0.0f)});
+  model.PopulateTensor<>(model.input(), {Eigen::half(8.5), Eigen::half(0.0)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput<half>(), ElementsAreArray({8, 0}));
+  EXPECT_THAT(model.GetOutput<Eigen::half>(), ElementsAreArray({8, 0}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
 }
 
 TEST(FloorOpTest, MultiDimsFloat16) {
   FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT16);
-  model.PopulateTensor<half>(model.input(), {
-                                                half(0.75f),
-                                                half(8.25f),
-                                                half(0.49f),
-                                                half(9.99f),
-                                                half(0.5f),
-                                                half(-0.25f),
-                                                half(-8.75f),
-                                                half(-0.99f),
-                                                half(-9.49f),
-                                                half(-0.5f),
-                                            });
+  model.PopulateTensor<Eigen::half>(model.input(), {
+                                                       Eigen::half(0.75),
+                                                       Eigen::half(8.25),
+                                                       Eigen::half(0.49),
+                                                       Eigen::half(9.99),
+                                                       Eigen::half(0.5),
+                                                       Eigen::half(-0.25),
+                                                       Eigen::half(-8.75),
+                                                       Eigen::half(-0.99),
+                                                       Eigen::half(-9.49),
+                                                       Eigen::half(-0.5),
+                                                   });
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput<half>(),
+  EXPECT_THAT(model.GetOutput<Eigen::half>(),
               ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
 }
diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc
index f4b9f65711fbdc..2bd9a0235ebe2c 100644
--- a/tensorflow/lite/kernels/gather_nd_test.cc
+++ b/tensorflow/lite/kernels/gather_nd_test.cc
@@ -20,12 +20,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "Eigen/Core"  // from @eigen_archive
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -246,19 +244,21 @@ TEST(GatherNdOpTest, BFloat16Int32) {
 TEST(GatherNdOpTest, Float16Int32) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT32, {2, 2}});
-  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
-                    half(2.2f), half(2.3f),  //
-                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
-                    half(-4.2f), half(4.3f),  //
-                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
-                    half(-6.2f), half(6.3f)});
+  m.SetInput<Eigen::half>(
+      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
+       Eigen::half(2.2), Eigen::half(2.3),  //
+       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
+       Eigen::half(-4.2), Eigen::half(4.3),  //
+       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
+       Eigen::half(-6.2), Eigen::half(6.3)});
   m.SetPositions<int32_t>({0, 1, 1, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<half>(),
-      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
-                                    half(3.1f), half(3.2f), half(-3.3f)}));
+      m.GetOutput<Eigen::half>(),
+      Pointwise(FloatingPointEq(),
+                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
+                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
 }
 
 TEST(GatherNdOpTest, Float32Int32) {
@@ -297,19 +297,21 @@ TEST(GatherNdOpTest, BFloat16Int64) {
 TEST(GatherNdOpTest, Float16Int64) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT64, {2, 2}});
-  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
-                    half(2.2f), half(2.3f),  //
-                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
-                    half(-4.2f), half(4.3f),  //
-                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
-                    half(-6.2f), half(6.3f)});
+  m.SetInput<Eigen::half>(
+      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
+       Eigen::half(2.2), Eigen::half(2.3),  //
+       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
+       Eigen::half(-4.2), Eigen::half(4.3),  //
+       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
+       Eigen::half(-6.2), Eigen::half(6.3)});
   m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<half>(),
-      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
-                                    half(3.1f), half(3.2f), half(-3.3f)}));
+      m.GetOutput<Eigen::half>(),
+      Pointwise(FloatingPointEq(),
+                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
+                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
 }
 
 TEST(GatherNdOpTest, Float32Int64) {
@@ -460,19 +462,21 @@ TEST(GatherNdOpTest, BFloat16Int16) {
 TEST(GatherNdOpTest, Float16Int16) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT16, {2, 2}});
-  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
-                    half(2.2f), half(2.3f),  //
-                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
-                    half(-4.2f), half(4.3f),  //
-                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
-                    half(-6.2f), half(6.3f)});
+  m.SetInput<Eigen::half>(
+      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
+       Eigen::half(2.2), Eigen::half(2.3),  //
+       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
+       Eigen::half(-4.2), Eigen::half(4.3),  //
+       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
+       Eigen::half(-6.2), Eigen::half(6.3)});
   m.SetPositions<int16_t>({0, 1, 1, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<half>(),
-      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
-                                    half(3.1f), half(3.2f), half(-3.3f)}));
+      m.GetOutput<Eigen::half>(),
+      Pointwise(FloatingPointEq(),
+                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
+                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
 }
 
 TEST(GatherNdOpTest, Float32Int16) {
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 61ca1b654f6160..23e30eb7867774 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -20,11 +20,9 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -254,7 +252,7 @@ TEST_P(GatherOpTest, LastAxis0DIndex) {
 }
 
 using TestTypes = testing::Types<int8_t, uint8_t, int16_t, int32_t, int64_t,
-                                 float, half, Eigen::bfloat16>;
+                                 float, Eigen::half, Eigen::bfloat16>;
 
 template <typename T>
 struct TypedGatherOpTest : public testing::Test {};
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index 00e25ee9b86500..babdb4f69fad03 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -248,20 +247,24 @@ TEST(MaximumOpTest, Int32WithBroadcastTest5D) {
 }
 
 TEST(MaximumOpTest, Float16Test) {
-  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f),  half(-1.0f),
-                                       half(11.0f), half(-2.0f), half(-1.44f)};
-  std::initializer_list<half> data2 = {half(-1.0f), half(0.0f),  half(1.0f),
-                                       half(12.0f), half(-3.0f), half(-1.43f)};
-  TestModel<half>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}},
-                  {TensorType_FLOAT16, {3, 1, 2}},
-                  {TensorType_FLOAT16, {3, 1, 2}}, data1, data2,
-                  {half(1.0f), half(0.0f), half(1.0f), half(12.0f), half(-2.0f),
-                   half(-1.43f)});
-  TestModel<half>(BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}},
-                  {TensorType_FLOAT16, {3, 1, 2}},
-                  {TensorType_FLOAT16, {3, 1, 2}}, data1, data2,
-                  {half(-1.0f), half(0.0f), half(-1.0f), half(11.0f),
-                   half(-3.0f), half(-1.44f)});
+  std::initializer_list<Eigen::half> data1 = {
+      Eigen::half(1.0),  Eigen::half(0.0),  Eigen::half(-1.0),
+      Eigen::half(11.0), Eigen::half(-2.0), Eigen::half(-1.44)};
+  std::initializer_list<Eigen::half> data2 = {
+      Eigen::half(-1.0), Eigen::half(0.0),  Eigen::half(1.0),
+      Eigen::half(12.0), Eigen::half(-3.0), Eigen::half(-1.43)};
+  TestModel<Eigen::half>(
+      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}},
+      {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1,
+      data2,
+      {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(1.0), Eigen::half(12.0),
+       Eigen::half(-2.0), Eigen::half(-1.43)});
+  TestModel<Eigen::half>(
+      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}},
+      {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1,
+      data2,
+      {Eigen::half(-1.0), Eigen::half(0.0), Eigen::half(-1.0),
+       Eigen::half(11.0), Eigen::half(-3.0), Eigen::half(-1.44)});
 }
 
 TEST(MaximumOpTest, BFloat16Test) {
@@ -305,39 +308,42 @@ TEST(MaximumOpTest, BFloat16WithBroadcastTest5DScalarY) {
 }
 
 TEST(MaximumOpTest, Float16WithBroadcastTest5DScalarY) {
-  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f), half(-1.0f),
-                                       half(-2.0f), half(3.0f), half(11.0f)};
-  std::initializer_list<half> data2 = {half(2.0f)};
-  TestModel<half>(BuiltinOperator_MAXIMUM,
-                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
-                  {TensorType_FLOAT16, {1}},
-                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2,
-                  {half(2.0f), half(2.0f), half(2.0f), half(2.0f), half(3.0f),
-                   half(11.0f)});
-  TestModel<half>(BuiltinOperator_MINIMUM,
-                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
-                  {TensorType_FLOAT16, {1}},
-                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2,
-                  {half(1.0f), half(0.0f), half(-1.0f), half(-2.0f), half(2.0f),
-                   half(2.0f)});
+  std::initializer_list<Eigen::half> data1 = {
+      Eigen::half(1.0),  Eigen::half(0.0), Eigen::half(-1.0),
+      Eigen::half(-2.0), Eigen::half(3.0), Eigen::half(11.0)};
+  std::initializer_list<Eigen::half> data2 = {Eigen::half(2.0)};
+  TestModel<Eigen::half>(
+      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
+      {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1,
+      data2,
+      {Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0),
+       Eigen::half(3.0), Eigen::half(11.0)});
+  TestModel<Eigen::half>(
+      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
+      {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1,
+      data2,
+      {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0),
+       Eigen::half(2.0), Eigen::half(2.0)});
 }
 
 TEST(MaximumOpTest, Float16WithBroadcastTest5D) {
-  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f),   half(-1.0f),
-                                       half(-2.0f), half(-1.44f), half(11.0f)};
-  std::initializer_list<half> data2 = {half(0.5f), half(2.0f)};
-  TestModel<half>(BuiltinOperator_MAXIMUM,
-                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
-                  {TensorType_FLOAT16, {2}},
-                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2,
-                  {half(1.0f), half(2.0f), half(0.5f), half(2.0f), half(0.5f),
-                   half(11.0f)});
-  TestModel<half>(BuiltinOperator_MINIMUM,
-                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
-                  {TensorType_FLOAT16, {2}},
-                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2,
-                  {half(0.5f), half(0.0f), half(-1.0f), half(-2.0f),
-                   half(-1.44f), half(2.0f)});
+  std::initializer_list<Eigen::half> data1 = {
+      Eigen::half(1.0),  Eigen::half(0.0),   Eigen::half(-1.0),
+      Eigen::half(-2.0), Eigen::half(-1.44), Eigen::half(11.0)};
+  std::initializer_list<Eigen::half> data2 = {Eigen::half(0.5),
+                                              Eigen::half(2.0)};
+  TestModel<Eigen::half>(
+      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
+      {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1,
+      data2,
+      {Eigen::half(1.0), Eigen::half(2.0), Eigen::half(0.5), Eigen::half(2.0),
+       Eigen::half(0.5), Eigen::half(11.0)});
+  TestModel<Eigen::half>(
+      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
+      {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1,
+      data2,
+      {Eigen::half(0.5), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0),
+       Eigen::half(-1.44), Eigen::half(2.0)});
 }
 
 TEST(MaximumOpTest, BFloat16WithBroadcastTest5D) {
diff --git a/tensorflow/lite/kernels/neg_test.cc b/tensorflow/lite/kernels/neg_test.cc
index 883f9182758412..fe9cc68bdf8a4d 100644
--- a/tensorflow/lite/kernels/neg_test.cc
+++ b/tensorflow/lite/kernels/neg_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -68,12 +67,14 @@ TEST(NegOpModel, NegFloat32) {
 
 TEST(NegOpModel, NegFloat16) {
   NegOpModel m({TensorType_FLOAT16, {6}}, {TensorType_FLOAT16, {6}});
-  m.SetInput<half>({half(-2.0f), half(-1.0f), half(0.f), half(1.0f), half(2.0f),
-                    half(3.0f)});
+  m.SetInput<Eigen::half>({Eigen::half(-2.0f), Eigen::half(-1.0f),
+                           Eigen::half(0.f), Eigen::half(1.0f),
+                           Eigen::half(2.0f), Eigen::half(3.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput<half>(),
-              ElementsAreArray({half(2.0f), half(1.0f), half(0.f), half(-1.0f),
-                                half(-2.0f), half(-3.0f)}));
+  EXPECT_THAT(m.GetOutput<Eigen::half>(),
+              ElementsAreArray({Eigen::half(2.0f), Eigen::half(1.0f),
+                                Eigen::half(0.f), Eigen::half(-1.0f),
+                                Eigen::half(-2.0f), Eigen::half(-3.0f)}));
 }
 
 TEST(NegOpModel, NegBfloat16) {
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index b985abccddcee7..971be96a915b4b 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -928,16 +927,19 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleConstFloat32ValuedTestInt8) {
 
 template <typename padding_integer_type>
 void SimpleConstFloat16ValuedTest() {
-  PadV2OpConstModel<half, padding_integer_type> m(
+  PadV2OpConstModel<Eigen::half, padding_integer_type> m(
       {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-      half{4.0f}, {TensorType_FLOAT16});
-  m.SetInput({half{1.5f}, half{2.5f}, half{3.5f}, half{4.5f}});
+      Eigen::half{4.0f}, {TensorType_FLOAT16});
+  m.SetInput({Eigen::half{1.5f}, Eigen::half{2.5f}, Eigen::half{3.5f},
+              Eigen::half{4.5}});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {half{4}, half{4}, half{4}, half{4}, half{4}, half{1.5f},
-                   half{2.5f}, half{4}, half{4}, half{3.5f}, half{4.5f},
-                   half{4}, half{4}, half{4}, half{4}, half{4}})));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4},
+           Eigen::half{4}, Eigen::half{1.5}, Eigen::half{2.5}, Eigen::half{4},
+           Eigen::half{4}, Eigen::half{3.5}, Eigen::half{4.5}, Eigen::half{4},
+           Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
@@ -1048,15 +1050,12 @@ TEST_F(PadV2OpTest, Int16PaddingSimple4DConstFloat32ValuedTest) {
 
 template <typename padding_integer_type>
 void Simple4DConstFloat16ValuedTest() {
-  PadV2OpConstModel<half, padding_integer_type> m(
+  PadV2OpConstModel<Eigen::half, padding_integer_type> m(
       {TensorType_FLOAT16, {1, 1, 2, 1}}, {4, 2}, {0, 1, 0, 0, 0, 0, 0, 1},
-      half{7.0f}, {TensorType_FLOAT16});
-  m.SetInput({half{3.0f}, half{6.0f}});
+      Eigen::half{7.0}, {TensorType_FLOAT16});
+  m.SetInput({Eigen::half{3.0f}, Eigen::half{6.0f}});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {half{3.0f}, half{7.0f}, half{6.0f}, half{7.0f}, half{7.0f},
-                   half{7.0f}, half{7.0f}, half{7.0f}})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 7, 6, 7, 7, 7, 7, 7}));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2, 2}));
 }
 
@@ -1168,18 +1167,15 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleDynamicTest) {
 
 template <typename padding_integer_type>
 void SimpleDynamicTestV2Float16() {
-  PadV2OpDynamicModel<half, padding_integer_type> m(
-      {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, half{0.0f},
+  PadV2OpDynamicModel<Eigen::half, padding_integer_type> m(
+      {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, Eigen::half{0.0},
       {TensorType_FLOAT16});
-  m.SetInput({half{1.0f}, half{2.0f}, half{3.0f}, half{4.0f}});
+  m.SetInput({Eigen::half{1.0f}, Eigen::half{2.0f}, Eigen::half{3.0f},
+              Eigen::half{4.0f}});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray(ArrayFloatNear(
-                  {half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f},
-                   half{1.0f}, half{2.0f}, half{0.0f}, half{0.0f}, half{3.0f},
-                   half{4.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f},
-                   half{0.0f}})));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
+                                               0, 0, 0, 0, 0}));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc
index 7e2d3df543ba28..4301b0120f53c3 100644
--- a/tensorflow/lite/kernels/reverse_test.cc
+++ b/tensorflow/lite/kernels/reverse_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -355,38 +354,45 @@ TEST(ReverseOpTest, Int16MultiDimensions) {
 
 // float16 tests.
 TEST(ReverseOpTest, Float16OneDimension) {
-  ReverseOpModel<half> model({TensorType_FLOAT16, {4}},
-                             {TensorType_INT32, {1}});
-  model.PopulateTensor<half>(model.input(),
-                             {half(1), half(2), half(3), half(4)});
+  ReverseOpModel<Eigen::half> model({TensorType_FLOAT16, {4}},
+                                    {TensorType_INT32, {1}});
+  model.PopulateTensor<Eigen::half>(
+      model.input(),
+      {Eigen::half(1), Eigen::half(2), Eigen::half(3), Eigen::half(4)});
   model.PopulateTensor<int32_t>(model.axis(), {0});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
   EXPECT_THAT(model.GetOutput(),
-              ElementsAreArray({half(4), half(3), half(2), half(1)}));
+              ElementsAreArray({Eigen::half(4), Eigen::half(3), Eigen::half(2),
+                                Eigen::half(1)}));
 }
 
 TEST(ReverseOpTest, Float16MultiDimensions) {
-  ReverseOpModel<half> model({TensorType_FLOAT16, {4, 3, 2}},
-                             {TensorType_INT32, {1}});
-  model.PopulateTensor<half>(
+  ReverseOpModel<Eigen::half> model({TensorType_FLOAT16, {4, 3, 2}},
+                                    {TensorType_INT32, {1}});
+  model.PopulateTensor<Eigen::half>(
       model.input(),
-      {half(1),  half(2),  half(3),  half(4),  half(5),  half(6),
-       half(7),  half(8),  half(9),  half(10), half(11), half(12),
-       half(13), half(14), half(15), half(16), half(17), half(18),
-       half(19), half(20), half(21), half(22), half(23), half(24)});
+      {Eigen::half(1),  Eigen::half(2),  Eigen::half(3),  Eigen::half(4),
+       Eigen::half(5),  Eigen::half(6),  Eigen::half(7),  Eigen::half(8),
+       Eigen::half(9),  Eigen::half(10), Eigen::half(11), Eigen::half(12),
+       Eigen::half(13), Eigen::half(14), Eigen::half(15), Eigen::half(16),
+       Eigen::half(17), Eigen::half(18), Eigen::half(19), Eigen::half(20),
+       Eigen::half(21), Eigen::half(22), Eigen::half(23), Eigen::half(24)});
   model.PopulateTensor<int32_t>(model.axis(), {1});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
   EXPECT_THAT(
       model.GetOutput(),
-      ElementsAreArray({half(5),  half(6),  half(3),  half(4),  half(1),
-                        half(2),  half(11), half(12), half(9),  half(10),
-                        half(7),  half(8),  half(17), half(18), half(15),
-                        half(16), half(13), half(14), half(23), half(24),
-                        half(21), half(22), half(19), half(20)}));
+      ElementsAreArray({Eigen::half(5),  Eigen::half(6),  Eigen::half(3),
+                        Eigen::half(4),  Eigen::half(1),  Eigen::half(2),
+                        Eigen::half(11), Eigen::half(12), Eigen::half(9),
+                        Eigen::half(10), Eigen::half(7),  Eigen::half(8),
+                        Eigen::half(17), Eigen::half(18), Eigen::half(15),
+                        Eigen::half(16), Eigen::half(13), Eigen::half(14),
+                        Eigen::half(23), Eigen::half(24), Eigen::half(21),
+                        Eigen::half(22), Eigen::half(19), Eigen::half(20)}));
 }
 
 // bfloat16 tests.
diff --git a/tensorflow/lite/kernels/round_test.cc b/tensorflow/lite/kernels/round_test.cc
index e3fccf888c9815..c3752827f3e61c 100644
--- a/tensorflow/lite/kernels/round_test.cc
+++ b/tensorflow/lite/kernels/round_test.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -69,29 +68,33 @@ TEST(RoundOpTest, MultiDims) {
 }
 
 TEST(RoundOpTest, Float16SingleDim) {
-  RoundOpModel<half> model({6});
-  model.PopulateTensor<half>(model.input(),
-                             {half(8.5f), half(0.0f), half(3.5f), half(4.2f),
-                              half(-3.5f), half(-4.5f)});
+  RoundOpModel<Eigen::half> model({6});
+  model.PopulateTensor<Eigen::half>(
+      model.input(), {Eigen::half(8.5), Eigen::half(0.0), Eigen::half(3.5),
+                      Eigen::half(4.2), Eigen::half(-3.5), Eigen::half(-4.5)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput(),
-              ElementsAreArray(
-                  {half(8), half(0), half(4), half(4), half(-4), half(-4)}));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({Eigen::half(8), Eigen::half(0), Eigen::half(4),
+                        Eigen::half(4), Eigen::half(-4), Eigen::half(-4)}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({6}));
 }
 
 TEST(RoundOpTest, Float16MultiDims) {
-  RoundOpModel<half> model({2, 1, 1, 6});
-  model.PopulateTensor<half>(
+  RoundOpModel<Eigen::half> model({2, 1, 1, 6});
+  model.PopulateTensor<Eigen::half>(
       model.input(),
-      {half(0.0001f), half(8.0001f), half(0.9999f), half(9.9999f), half(0.5f),
-       half(-0.0001f), half(-8.0001f), half(-0.9999f), half(-9.9999f),
-       half(-0.5f), half(-2.5f), half(1.5f)});
+      {Eigen::half(0.0001), Eigen::half(8.0001), Eigen::half(0.9999),
+       Eigen::half(9.9999), Eigen::half(0.5), Eigen::half(-0.0001),
+       Eigen::half(-8.0001), Eigen::half(-0.9999), Eigen::half(-9.9999),
+       Eigen::half(-0.5), Eigen::half(-2.5), Eigen::half(1.5)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput(),
-              ElementsAreArray({half(0), half(8), half(1), half(10), half(0),
-                                half(0), half(-8), half(-1), half(-10),
-                                half(-0), half(-2), half(2)}));
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({Eigen::half(0), Eigen::half(8), Eigen::half(1),
+                        Eigen::half(10), Eigen::half(0), Eigen::half(0),
+                        Eigen::half(-8), Eigen::half(-1), Eigen::half(-10),
+                        Eigen::half(-0), Eigen::half(-2), Eigen::half(2)}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 6}));
 }
 
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index 2f3430770f7b68..feb02c48d2f3aa 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "Eigen/Core"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/core/c/common.h"
@@ -28,7 +29,6 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -338,16 +338,20 @@ TEST_P(SliceOpTest, SliceBool) {
 }
 
 TEST_P(SliceOpTest, SliceFloat16) {
-  SliceOpModel<half, int32_t> m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4},
-                                {2, 1, -1, 1}, TensorType_INT32,
-                                TensorType_FLOAT16, GetParam());
-  m.SetInput({half(1), half(1), half(1), half(2), half(2), half(2), half(3),
-              half(3), half(3), half(4), half(4), half(4), half(5), half(5),
-              half(5), half(6), half(6), half(6)});
+  SliceOpModel<Eigen::half, int32_t> m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4},
+                                       {2, 1, -1, 1}, TensorType_INT32,
+                                       TensorType_FLOAT16, GetParam());
+  m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(1), Eigen::half(2),
+              Eigen::half(2), Eigen::half(2), Eigen::half(3), Eigen::half(3),
+              Eigen::half(3), Eigen::half(4), Eigen::half(4), Eigen::half(4),
+              Eigen::half(5), Eigen::half(5), Eigen::half(5), Eigen::half(6),
+              Eigen::half(6), Eigen::half(6)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({half(3), half(3), half(3),
-                                               half(5), half(5), half(5)}));
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({Eigen::half(3), Eigen::half(3), Eigen::half(3),
+                        Eigen::half(5), Eigen::half(5), Eigen::half(5)}));
 }
 
 TEST_P(SliceOpTest, SliceBFloat16) {
@@ -369,16 +373,19 @@ TEST_P(SliceOpTest, SliceBFloat16) {
 }
 
 TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1Float16) {
-  SliceOpModel<half, int32_t> m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4},
-                                {2, -1, 1, 1}, TensorType_INT32,
-                                TensorType_FLOAT16, GetParam());
-  m.SetInput({half(1), half(1), half(2), half(2), half(3), half(3), half(4),
-              half(4), half(5), half(5), half(6), half(6), half(7), half(7),
-              half(8), half(8), half(9), half(9)});
+  SliceOpModel<Eigen::half, int32_t> m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4},
+                                       {2, -1, 1, 1}, TensorType_INT32,
+                                       TensorType_FLOAT16, GetParam());
+  m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(2), Eigen::half(2),
+              Eigen::half(3), Eigen::half(3), Eigen::half(4), Eigen::half(4),
+              Eigen::half(5), Eigen::half(5), Eigen::half(6), Eigen::half(6),
+              Eigen::half(7), Eigen::half(7), Eigen::half(8), Eigen::half(8),
+              Eigen::half(9), Eigen::half(9)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1, 1}));
   EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray({half(5), half(6), half(8), half(9)}));
+              ElementsAreArray({Eigen::half(5), Eigen::half(6), Eigen::half(8),
+                                Eigen::half(9)}));
 }
 
 TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1BFloat16) {
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index f7c79680576fe1..6ba4ef3b78977f 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -22,10 +22,8 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "Eigen/Core"  // from @eigen_archive  // IWYU pragma: keep
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
-#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -154,7 +152,7 @@ class StridedSliceOpModel : public SingleOpModel {
 template <typename T>
 class StridedSliceOpTest : public ::testing::Test {};
 
-using DataTypes = ::testing::Types<float, half, Eigen::bfloat16, uint8_t,
+using DataTypes = ::testing::Types<float, Eigen::half, Eigen::bfloat16, uint8_t,
                                    uint32_t, int8_t, int16_t, int32_t>;
 TYPED_TEST_SUITE(StridedSliceOpTest, DataTypes);
 
@@ -349,9 +347,7 @@ TYPED_TEST(StridedSliceOpTest, In1D_Int32End) {
       continue;
     }
     std::vector<TypeParam> values(32768);
-    for (int i = 0; i < 32768; ++i) {
-      values[i] = static_cast<TypeParam>(i);
-    }
+    std::iota(values.begin(), values.end(), TypeParam(0));
 
     StridedSliceOpModel<TypeParam> m({32768}, {1}, {1}, {1}, values, {0},
                                      {32768}, {1}, 0, 0, 0, 0, 0,
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 1bd870d2c56c92..cbdb74d29d04aa 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -38,6 +38,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "fp16/fp16.h"  // from @FP16
 #include "absl/algorithm/container.h"
 #include "absl/log/absl_check.h"
 #include "absl/log/absl_log.h"
@@ -56,7 +57,6 @@ limitations under the License.
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/util.h"  // IWYU pragma: keep
 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
-#include "tensorflow/lite/types/half.h"
 #include "tensorflow/lite/util.h"
 #include "tsl/platform/logging.h"
 
@@ -134,7 +134,7 @@ inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
 }
 
 template <>
-constexpr TfLiteType typeToTfLiteType<half>() {
+constexpr TfLiteType typeToTfLiteType<Eigen::half>() {
   return kTfLiteFloat16;
 }
 
@@ -1362,7 +1362,7 @@ TFLITE_TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16);
 TFLITE_TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32);
 TFLITE_TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64);
 TFLITE_TENSOR_TYPE_ASSOC(TfLiteFloat16, TensorType_FLOAT16);
-TFLITE_TENSOR_TYPE_ASSOC(half, TensorType_FLOAT16);
+TFLITE_TENSOR_TYPE_ASSOC(Eigen::half, TensorType_FLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(TfLiteBFloat16, TensorType_BFLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(Eigen::bfloat16, TensorType_BFLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32);
@@ -1461,13 +1461,13 @@ struct TypeUnion<uint8_t> {
 };
 
 template <>
-struct TypeUnion<half> {
+struct TypeUnion<Eigen::half> {
  public:
   // NOLINTNEXTLINE
   static constexpr TensorType tensor_type = TensorType::TensorType_FLOAT16;
   // NOLINTNEXTLINE
   static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteFloat16;
-  typedef half ScalarType;
+  typedef Eigen::half ScalarType;
 };
 
 template <>
diff --git a/tensorflow/lite/kernels/test_util_test.cc b/tensorflow/lite/kernels/test_util_test.cc
index 01f514692b0616..ed9a679b4e4d33 100644
--- a/tensorflow/lite/kernels/test_util_test.cc
+++ b/tensorflow/lite/kernels/test_util_test.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/lite/array.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/kernels/test_delegate_providers.h"
-#include "tensorflow/lite/types/half.h"
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
@@ -198,14 +197,6 @@ TEST(TestUtilTest, QuantizeVectorScalingUp) {
   EXPECT_THAT(q_data, ElementsAreArray(expected));
 }
 
-TEST(TestUtilTest, DequantizeVectorFp16) {
-  std::vector<half> data = {half(-1.0f), half(-0.5f), half(0.0f), half(0.5f),
-                            half(1.0f)};
-  auto f_data = Dequantize<half>(data, /*scale=*/0.1f, /*zero_point=*/0);
-  std::vector<float> expected = {-0.1f, -0.05f, 0.0f, 0.05f, 0.1f};
-  EXPECT_THAT(f_data, ElementsAreArray(tflite::ArrayFloatNear(expected, 1e-7)));
-}
-
 TEST(DimsAreMatcherTestTensor, ValidOneD) {
   TensorUniquePtr t = BuildTfLiteTensor(kTfLiteInt32, {2}, kTfLiteDynamic);
   EXPECT_THAT(t.get(), DimsAre({2}));

From 6823891de4050b232bac02e402eb7a1e5c6ea036 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Wed, 10 Dec 2025 16:01:58 -0800
Subject: [PATCH 147/753] All the dus_ar_dims needs padding.

The dus_ar_dims satisfies that `base_shape().dimensions(i) <= partitions / 2`. Hence, we must introduce padding in this dimension.

PiperOrigin-RevId: 842917144
---
 .../xla/xla/service/spmd/spmd_partitioner.cc  | 29 +++++--------------
 1 file changed, 8 insertions(+), 21 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index f9f8d1dbdc694c..1f4c6f84b6b968 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -1413,29 +1413,16 @@ HloInstruction* PartitionedHlo::ReplicatePartial(
   if (!dus_ar_dims.empty()) {
     auto zero = state_.b->AddInstruction(HloInstruction::CreateConstant(
         LiteralUtil::Zero(shard_shape.element_type())));
-    std::vector<int64_t> masking_dims;
-    for (int64_t dim : dus_ar_dims) {
-      if (shard_shape.dimensions(dim) * sharding().dimension(dim) !=
-          base_shape().dimensions(dim)) {
-        // DUS will be out-of-bound and offset will be clamped, so we need to
-        // mask this dim with 0.
-        masking_dims.push_back(dim);
-      }
-    }
-    if (!masking_dims.empty()) {
-      std::vector<int64_t> skipped_dims;
-      for (int64_t i = 0; i < base_shape().dimensions().size(); ++i) {
-        if (!absl::c_linear_search(masking_dims, i)) {
-          skipped_dims.push_back(i);
-        }
+    std::vector<int64_t> skipped_dims;
+    for (int64_t i = 0; i < base_shape().dimensions().size(); ++i) {
+      if (!absl::c_linear_search(dus_ar_dims, i)) {
+        skipped_dims.push_back(i);
       }
-      result->copy_sharding(hlo_);
-      result = PartitionedHlo(result, final_result_shape, state_)
-                   .PadWithValue(zero,
-                                 /*left_padded_dims=*/{},
-                                 /*skipped_dims=*/skipped_dims)
-                   .hlo();
     }
+    result->copy_sharding(hlo_);
+    result = PartitionedHlo(result, final_result_shape, state_)
+                 .PadWithValue(zero, /*left_padded_dims=*/{}, skipped_dims)
+                 .hlo();
     auto zero_bcast = state_.b->AddInstruction(
         HloInstruction::CreateBroadcast(final_result_shape, zero, {}));
     auto offsets = MakePartitionOffsets(

From 95a9086a3545a0a0fd4c562b162acef4b573fb0c Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Wed, 10 Dec 2025 16:04:58 -0800
Subject: [PATCH 148/753] (3/N) Add support for `NamedSharding` in existing
 `HloSharding` methods. Remaining methods will be updated in follow up cl's.

PiperOrigin-RevId: 842918400
---
 third_party/xla/xla/hlo/ir/hlo_sharding.h     | 17 +++++-
 third_party/xla/xla/hlo/ir/mesh_and_axis.cc   |  8 +++
 third_party/xla/xla/hlo/ir/mesh_and_axis.h    |  2 +
 .../xla/xla/hlo/ir/mesh_and_axis_test.cc      | 14 ++++-
 third_party/xla/xla/hlo/ir/named_sharding.cc  |  9 +++
 third_party/xla/xla/hlo/ir/named_sharding.h   | 15 +++++
 .../xla/xla/hlo/ir/named_sharding_test.cc     | 56 +++++++++++++++++++
 7 files changed, 118 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_sharding.h b/third_party/xla/xla/hlo/ir/hlo_sharding.h
index 488dfdb2793421..289c93a6640964 100644
--- a/third_party/xla/xla/hlo/ir/hlo_sharding.h
+++ b/third_party/xla/xla/hlo/ir/hlo_sharding.h
@@ -489,10 +489,18 @@ class HloSharding {
   }
 
   // Returns the number of dimensions.
-  int64_t num_dimensions() const { return tile_assignment().num_dimensions(); }
+  int64_t num_dimensions() const {
+    if (UseNamedShardingLeaf()) {
+      return named_sharding_->num_dimensions();
+    }
+    return tile_assignment().num_dimensions();
+  }
 
   // Returns number of shards in the given dimension.
   int64_t dimension(int64_t dim_index) const {
+    if (UseNamedShardingLeaf()) {
+      return named_sharding_->dimension(dim_index);
+    }
     return tile_assignment().dim(dim_index);
   }
 
@@ -502,7 +510,12 @@ class HloSharding {
   }
 
   // Returns the total number of devices used by sharding.
-  int64_t num_devices() const { return tile_assignment().num_elements(); }
+  int64_t num_devices() const {
+    if (UseNamedShardingLeaf()) {
+      return named_sharding_->num_devices();
+    }
+    return tile_assignment().num_elements();
+  }
 
   // Gets the subgroup types array.
   // REQUIRES: !IsTuple()
diff --git a/third_party/xla/xla/hlo/ir/mesh_and_axis.cc b/third_party/xla/xla/hlo/ir/mesh_and_axis.cc
index 6a7807f51d698e..ae638cd8d9bdc3 100644
--- a/third_party/xla/xla/hlo/ir/mesh_and_axis.cc
+++ b/third_party/xla/xla/hlo/ir/mesh_and_axis.cc
@@ -309,6 +309,14 @@ bool AxisRef::CanCoexistWithoutOverlap(const AxisRef& other) const {
   return max_pre_size % min_next_pre_size == 0;
 }
 
+int64_t AxisRef::size(const Mesh& mesh) const {
+  if (sub_axis_info_.has_value()) {
+    return sub_axis_info_->size;
+  }
+
+  return mesh.axis_size(mesh_axis_index_);
+}
+
 bool AxesCanCoexistWithoutOverlap(absl::Span<const AxisRef> axes) {
   for (int64_t i = 0; i < axes.size() - 1; ++i) {
     for (int64_t j = i + 1; j < axes.size(); ++j) {
diff --git a/third_party/xla/xla/hlo/ir/mesh_and_axis.h b/third_party/xla/xla/hlo/ir/mesh_and_axis.h
index 2b913f7638dad0..f6190d038ff625 100644
--- a/third_party/xla/xla/hlo/ir/mesh_and_axis.h
+++ b/third_party/xla/xla/hlo/ir/mesh_and_axis.h
@@ -202,6 +202,8 @@ class AxisRef {
   int64_t mesh_axis_index() const { return mesh_axis_index_; }
   std::optional<SubAxis> sub_axis_info() const { return sub_axis_info_; }
 
+  int64_t size(const Mesh& mesh) const;
+
  private:
   absl::Status ValidateAxisRef();
 };
diff --git a/third_party/xla/xla/hlo/ir/mesh_and_axis_test.cc b/third_party/xla/xla/hlo/ir/mesh_and_axis_test.cc
index 3dbcc53db77246..1bc5b9896b90a6 100644
--- a/third_party/xla/xla/hlo/ir/mesh_and_axis_test.cc
+++ b/third_party/xla/xla/hlo/ir/mesh_and_axis_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "xla/hlo/ir/mesh_and_axis.h"
 
 #include <cstdint>
-#include <string>
 #include <vector>
 
 #include <gmock/gmock.h>
@@ -350,4 +349,17 @@ TEST(MeshAndAxisTest, MaximalMesh) {
   EXPECT_EQ(maximal_mesh, Mesh::FromProto(maximal_mesh.ToProto()));
 }
 
+TEST(MeshAndAxisTest, AxisRefSize) {
+  Mesh mesh({2 * 7, 3 * 11, 5 * 13}, {"a", "b", "c"});
+  EXPECT_EQ(AxisRef(0).size(mesh), 14);
+  EXPECT_EQ(AxisRef(1).size(mesh), 33);
+  EXPECT_EQ(AxisRef(2).size(mesh), 65);
+  EXPECT_EQ(AxisRef(0, {1, 2}).size(mesh), 2);
+  EXPECT_EQ(AxisRef(0, {2, 7}).size(mesh), 7);
+  EXPECT_EQ(AxisRef(1, {1, 3}).size(mesh), 3);
+  EXPECT_EQ(AxisRef(1, {3, 11}).size(mesh), 11);
+  EXPECT_EQ(AxisRef(2, {1, 5}).size(mesh), 5);
+  EXPECT_EQ(AxisRef(2, {5, 13}).size(mesh), 13);
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/hlo/ir/named_sharding.cc b/third_party/xla/xla/hlo/ir/named_sharding.cc
index 0db5d8e916aa53..efdecbcea0fa16 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.cc
+++ b/third_party/xla/xla/hlo/ir/named_sharding.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstdint>
 #include <map>
+#include <numeric>
 #include <string>
 #include <utility>
 #include <vector>
@@ -27,6 +28,14 @@ limitations under the License.
 
 namespace xla {
 
+int64_t NamedSharding::DimensionSharding::getShardedSize(
+    const Mesh& mesh) const {
+  return std::accumulate(axes_.begin(), axes_.end(), 1,
+                         [&mesh](int64_t cur, const AxisRef& axis) {
+                           return cur * axis.size(mesh);
+                         });
+}
+
 namespace test_utils {
 // Construct sharding with given mesh. 'dim_shardings', 'replicated_axes',
 // 'unreduced_axes' refer to axis names in the mesh.
diff --git a/third_party/xla/xla/hlo/ir/named_sharding.h b/third_party/xla/xla/hlo/ir/named_sharding.h
index 53134af857dc5d..35919d7b4befa3 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.h
+++ b/third_party/xla/xla/hlo/ir/named_sharding.h
@@ -47,6 +47,8 @@ class NamedSharding {
 
     absl::Span<const AxisRef> axes() const { return axes_; }
 
+    int64_t getShardedSize(const Mesh& mesh) const;
+
    private:
     std::vector<AxisRef> axes_;
     bool is_closed_;
@@ -84,6 +86,19 @@ class NamedSharding {
   absl::Span<const AxisRef> unreduced_axes() const { return unreduced_axes_; }
   absl::Span<const OpMetadata> metadata() const { return metadata_; }
 
+  // Returns number of dimensions.
+  int64_t num_dimensions() const { return dim_shardings_.size(); }
+
+  // Returns size of the given dimension.
+  int64_t dimension(int64_t dim) const {
+    return dim_shardings_[dim].getShardedSize(mesh_);
+  }
+
+  // Returns the total number of devices used by sharding.
+  int64_t num_devices() const {
+    return mesh_.device_assignment().num_elements();
+  }
+
  private:
   friend class HloSharding;
 
diff --git a/third_party/xla/xla/hlo/ir/named_sharding_test.cc b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
index 611ade960943f3..22930e733aca5d 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding_test.cc
+++ b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
@@ -109,5 +109,61 @@ TEST(NamedShardingTest, Equality) {
             NamedSharding(mesh_diff_shape, {ds_ab, ds_dc}, {axis_b}, {axis_c}));
 }
 
+TEST(NamedShardingTest, GetShardedSize) {
+  Mesh mesh({2, 4, 3, 8}, {"a", "b", "c", "d"});
+
+  AxisRef axis_a(0);
+  AxisRef axis_b(1, {2, 2});
+  AxisRef axis_c(2);
+  AxisRef axis_d(3, {4, 2});
+
+  DimensionSharding ds_ab({axis_a, axis_b}, /*is_closed=*/true);
+  EXPECT_EQ(ds_ab.getShardedSize(mesh), 2 * 2);
+
+  DimensionSharding ds_dc({axis_d, axis_c}, /*is_closed=*/true);
+  EXPECT_EQ(ds_dc.getShardedSize(mesh), 2 * 3);
+
+  DimensionSharding ds_b({axis_b}, /*is_closed=*/true);
+  EXPECT_EQ(ds_b.getShardedSize(mesh), 2);
+
+  DimensionSharding ds_empty({}, /*is_closed=*/true);
+  EXPECT_EQ(ds_empty.getShardedSize(mesh), 1);
+}
+
+TEST(NamedShardingTest, Dimension) {
+  Mesh mesh({2, 4, 3, 8}, {"a", "b", "c", "d"});
+
+  AxisRef axis_a(0);
+  AxisRef axis_b(1, {2, 2});
+  AxisRef axis_c(2);
+  AxisRef axis_d(3, {4, 2});
+
+  DimensionSharding ds_ab({axis_a, axis_b}, /*is_closed=*/true);
+  DimensionSharding ds_dc({axis_d, axis_c}, /*is_closed=*/true);
+
+  NamedSharding sharding(mesh, /*dim_shardings=*/{ds_ab, ds_dc});
+
+  EXPECT_EQ(sharding.dimension(0), 2 * 2);
+  EXPECT_EQ(sharding.dimension(1), 2 * 3);
+  EXPECT_EQ(sharding.num_dimensions(), 2);
+
+  NamedSharding empty_sharding(mesh, /*dim_shardings=*/{});
+  EXPECT_EQ(empty_sharding.num_dimensions(), 0);
+}
+
+TEST(NamedShardingTest, NumDevices) {
+  Mesh mesh({2, 4, 3, 8}, {"a", "b", "c", "d"});
+  NamedSharding sharding(mesh, {});
+  EXPECT_EQ(sharding.num_devices(), 2 * 4 * 3 * 8);
+
+  Mesh maximal_mesh(5);
+  NamedSharding maximal_sharding(maximal_mesh);
+  EXPECT_EQ(maximal_sharding.num_devices(), 1);
+
+  Mesh empty_mesh;
+  NamedSharding empty_sharding(empty_mesh);
+  EXPECT_EQ(empty_sharding.num_devices(), 0);
+}
+
 }  // namespace
 }  // namespace xla

From 1f1065a43ef68726ae91f774fdebfc816d708b4b Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Wed, 10 Dec 2025 16:26:11 -0800
Subject: [PATCH 149/753] [xla:codegen] split LowerToLLVMPass into CPU and GPU
 versions

PiperOrigin-RevId: 842927209
---
 .../xla/xla/backends/cpu/codegen/BUILD        |   1 +
 .../backends/cpu/codegen/fusion_compiler.cc   |   3 +-
 .../xla/backends/gpu/codegen/emitters/BUILD   |   1 +
 .../gpu/codegen/emitters/emitter_base.cc      |   3 +-
 .../xla/xla/codegen/emitters/transforms/BUILD | 113 +++++++++++++++---
 .../emitters/transforms/lower_to_llvm_cpu.cc  | 103 ++++++++++++++++
 .../emitters/transforms/lower_to_llvm_cpu.h   |  36 ++++++
 .../emitters/transforms/lower_to_llvm_cpu.td  |  35 ++++++
 ...{lower_to_llvm.cc => lower_to_llvm_gpu.cc} | 104 ++++++++--------
 .../emitters/transforms/lower_to_llvm_gpu.h   |  44 +++++++
 .../emitters/transforms/lower_to_llvm_gpu.td  |  42 +++++++
 .../xla/codegen/emitters/transforms/passes.h  |   5 -
 .../xla/codegen/emitters/transforms/passes.td |  23 ----
 13 files changed, 411 insertions(+), 102 deletions(-)
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.h
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.td
 rename third_party/xla/xla/codegen/emitters/transforms/{lower_to_llvm.cc => lower_to_llvm_gpu.cc} (65%)
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.td

diff --git a/third_party/xla/xla/backends/cpu/codegen/BUILD b/third_party/xla/xla/backends/cpu/codegen/BUILD
index db9b0a2cfbd267..e770f8a6a7907f 100644
--- a/third_party/xla/xla/backends/cpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/BUILD
@@ -193,6 +193,7 @@ cc_library(
         "//xla/codegen:trace_pass_instrumentation",
         "//xla/codegen/emitters/ir:xla",
         "//xla/codegen/emitters/ir:xla_attrs_inc_gen",
+        "//xla/codegen/emitters/transforms:lower_to_llvm_cpu_pass",
         "//xla/codegen/emitters/transforms:pass_pipelines",
         "//xla/codegen/emitters/transforms:passes",
         "//xla/codegen/xtile/ir:xtile",
diff --git a/third_party/xla/xla/backends/cpu/codegen/fusion_compiler.cc b/third_party/xla/xla/backends/cpu/codegen/fusion_compiler.cc
index 65996cf9b41d30..4b6ac965f10230 100644
--- a/third_party/xla/xla/backends/cpu/codegen/fusion_compiler.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/fusion_compiler.cc
@@ -106,6 +106,7 @@ limitations under the License.
 #include "xla/codegen/emitters/ir/xla_attrs.h.inc"
 #include "xla/codegen/emitters/ir/xla_dialect.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
+#include "xla/codegen/emitters/transforms/lower_to_llvm_cpu.h"
 #include "xla/codegen/emitters/transforms/pass_pipelines.h"
 #include "xla/codegen/emitters/transforms/passes.h"
 #include "xla/codegen/llvm_kernel_source.h"
@@ -212,7 +213,7 @@ static void AddGenericLoweringPasses(mlir::OpPassManager& pm,
   pm.addPass(mlir::createSCFToControlFlowPass());
   pm.addPass(emitters::CreateLowerXlaIntrinsicLibPass());
   pm.addNestedPass<mlir::func::FuncOp>(CreateConvertMathToLLVMPass());
-  pm.addPass(emitters::CreateLowerToLLVMPass(/*target_type=*/"cpu"));
+  pm.addPass(emitters::CreateLowerToLLVMCPUPass());
   pm.addPass(mlir::createReconcileUnrealizedCastsPass());
   pm.addPass(mlir::createCanonicalizerPass());
   pm.addPass(mlir::createCSEPass());
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD b/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
index b72a97787b82fe..6f91eb138bab3c 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
@@ -59,6 +59,7 @@ cc_library(
         "//xla/codegen/emitters:kernel_api_builder",
         "//xla/codegen/emitters:kernel_arguments",
         "//xla/codegen/emitters/ir:xla",
+        "//xla/codegen/emitters/transforms:lower_to_llvm_gpu_pass",
         "//xla/codegen/emitters/transforms:pass_pipelines",
         "//xla/codegen/emitters/transforms:passes",
         "//xla/hlo/analysis:indexing_analysis",
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
index f171a5cb6b4f33..2821b28b53215b 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
@@ -88,6 +88,7 @@ limitations under the License.
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/codegen/emitters/kernel_api_builder.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
+#include "xla/codegen/emitters/transforms/lower_to_llvm_gpu.h"
 #include "xla/codegen/emitters/transforms/pass_pipelines.h"
 #include "xla/codegen/emitters/transforms/passes.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
@@ -511,7 +512,7 @@ void AddLoweringPasses(mlir::OpPassManager& pm,
   pm.addPass(emitters::CreateExpandFloatOpsPass());
   pm.addPass(mlir::createLowerAffinePass());
   pm.addPass(mlir::createSCFToControlFlowPass());
-  pm.addPass(emitters::CreateLowerToLLVMPass(device));
+  pm.addPass(emitters::CreateLowerToLLVMGPUPass(device));
   pm.addPass(mlir::createReconcileUnrealizedCastsPass());
 }
 
diff --git a/third_party/xla/xla/codegen/emitters/transforms/BUILD b/third_party/xla/xla/codegen/emitters/transforms/BUILD
index 4f910f30935a01..9e07eef2ba0f25 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/BUILD
+++ b/third_party/xla/xla/codegen/emitters/transforms/BUILD
@@ -29,6 +29,101 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "lower_to_llvm_cpu_inc_gen",
+    compatible_with = get_compatible_with_portable(),
+    tbl_outs = {"lower_to_llvm_cpu.h.inc": [
+        "-gen-pass-decls",
+        "-name=TransformsLLVMCPU",
+    ]},
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "lower_to_llvm_cpu.td",
+    visibility = ["//visibility:private"],
+    deps = ["@llvm-project//mlir:PassBaseTdFiles"],
+)
+
+cc_library(
+    name = "lower_to_llvm_cpu_pass",
+    srcs = ["lower_to_llvm_cpu.cc"],
+    hdrs = ["lower_to_llvm_cpu.h"],
+    deps = [
+        ":lower_to_llvm_cpu_inc_gen",
+        "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:ArithToLLVM",
+        "@llvm-project//mlir:ArithTransforms",
+        "@llvm-project//mlir:ComplexDialect",
+        "@llvm-project//mlir:ComplexToLLVM",
+        "@llvm-project//mlir:ControlFlowToLLVM",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:FuncToLLVM",
+        "@llvm-project//mlir:LLVMCommonConversion",
+        "@llvm-project//mlir:MathDialect",
+        "@llvm-project//mlir:MathToLLVM",
+        "@llvm-project//mlir:MemRefToLLVM",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SCFToControlFlow",
+        "@llvm-project//mlir:TransformUtils",
+        "@llvm-project//mlir:UBToLLVM",
+        "@llvm-project//mlir:VectorToLLVM",
+    ],
+)
+
+gentbl_cc_library(
+    name = "lower_to_llvm_gpu_inc_gen",
+    compatible_with = get_compatible_with_portable(),
+    tbl_outs = {"lower_to_llvm_gpu.h.inc": [
+        "-gen-pass-decls",
+        "-name=TransformsLLVMGPU",
+    ]},
+    tblgen = "@llvm-project//mlir:mlir-tblgen",
+    td_file = "lower_to_llvm_gpu.td",
+    visibility = ["//visibility:private"],
+    deps = ["@llvm-project//mlir:PassBaseTdFiles"],
+)
+
+cc_library(
+    name = "lower_to_llvm_gpu_pass",
+    srcs = ["lower_to_llvm_gpu.cc"],
+    hdrs = ["lower_to_llvm_gpu.h"],
+    deps = [
+        ":lower_to_llvm_gpu_inc_gen",
+        "//xla/codegen:device_spec",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:device_description_proto_cc",
+        "//xla/tsl/platform:logging",
+        "@com_google_protobuf//:protobuf",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:AMDGPUUtils",
+        "@llvm-project//mlir:AffineToStandard",
+        "@llvm-project//mlir:ArithDialect",
+        "@llvm-project//mlir:ArithToLLVM",
+        "@llvm-project//mlir:ArithTransforms",
+        "@llvm-project//mlir:ComplexDialect",
+        "@llvm-project//mlir:ComplexToLLVM",
+        "@llvm-project//mlir:ControlFlowToLLVM",
+        "@llvm-project//mlir:DataLayoutInterfaces",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:FuncToLLVM",
+        "@llvm-project//mlir:GPUToLLVMSPVTransforms",
+        "@llvm-project//mlir:GPUToNVVMTransforms",
+        "@llvm-project//mlir:GPUToROCDLTransforms",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:LLVMCommonConversion",
+        "@llvm-project//mlir:LLVMDialect",
+        "@llvm-project//mlir:MathDialect",
+        "@llvm-project//mlir:MathToLLVM",
+        "@llvm-project//mlir:MemRefToLLVM",
+        "@llvm-project//mlir:NVVMDialect",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:SCFToControlFlow",
+        "@llvm-project//mlir:Support",
+        "@llvm-project//mlir:TransformUtils",
+        "@llvm-project//mlir:UBToLLVM",
+        "@llvm-project//mlir:VectorToLLVM",
+        "@local_tsl//tsl/platform:protobuf",
+    ],
+)
+
 gentbl_cc_library(
     name = "passes_inc_gen",
     compatible_with = get_compatible_with_portable(),
@@ -49,7 +144,6 @@ cc_library(
         "expand_float_ops.cc",
         "flatten_tensors.cc",
         "lower_tensors.cc",
-        "lower_to_llvm.cc",
         "lower_xla_intrinsic_lib.cc",
         "lower_xla_to_scf.cc",
         "merge_pointers_to_same_slice.cc",
@@ -89,7 +183,6 @@ cc_library(
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/stream_executor/rocm:rocm_compute_capability",
-        "//xla/tsl/platform:logging",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/log:check",
@@ -98,41 +191,25 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@com_google_protobuf//:protobuf",
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:AMDGPUUtils",
-        "@llvm-project//mlir:AffineToStandard",
         "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:ArithToLLVM",
-        "@llvm-project//mlir:ArithTransforms",
         "@llvm-project//mlir:ComplexDialect",
-        "@llvm-project//mlir:ComplexToLLVM",
-        "@llvm-project//mlir:ControlFlowToLLVM",
         "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:DialectUtils",
         "@llvm-project//mlir:FuncDialect",
-        "@llvm-project//mlir:FuncToLLVM",
         "@llvm-project//mlir:GPUDialect",
-        "@llvm-project//mlir:GPUToLLVMSPVTransforms",
-        "@llvm-project//mlir:GPUToNVVMTransforms",
-        "@llvm-project//mlir:GPUToROCDLTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMCommonConversion",
         "@llvm-project//mlir:LLVMDialect",
         "@llvm-project//mlir:MathDialect",
-        "@llvm-project//mlir:MathToLLVM",
         "@llvm-project//mlir:MathTransforms",
-        "@llvm-project//mlir:MemRefToLLVM",
-        "@llvm-project//mlir:NVVMDialect",
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ROCDLDialect",
         "@llvm-project//mlir:SCFDialect",
-        "@llvm-project//mlir:SCFToControlFlow",
         "@llvm-project//mlir:SCFUtils",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TransformUtils",
-        "@llvm-project//mlir:UBToLLVM",
         "@llvm-project//mlir:VectorDialect",
-        "@llvm-project//mlir:VectorToLLVM",
         "@llvm-project//mlir:VectorTransforms",
         "@llvm-project//mlir:VectorUtils",
         "@local_tsl//tsl/platform:protobuf",
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc
new file mode 100644
index 00000000000000..d98dc0cd8e7055
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc
@@ -0,0 +1,103 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
+#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace xla {
+namespace emitters {
+namespace {
+
+#define GEN_PASS_DEF_LOWERTOLLVMCPUPASS
+#include "xla/codegen/emitters/transforms/lower_to_llvm_cpu.h.inc"
+
+class LowerToLLVMCPUPass
+    : public impl::LowerToLLVMCPUPassBase<LowerToLLVMCPUPass> {
+ public:
+  LowerToLLVMCPUPass() : LowerToLLVMCPUPassBase() {}
+
+  void runOnOperation() override {
+    // Populate type conversions.
+    mlir::LowerToLLVMOptions llvm_opts(&getContext(),
+                                       mlir::DataLayout(getOperation()));
+    mlir::LLVMTypeConverter type_converter(getOperation().getContext(),
+                                           llvm_opts);
+    mlir::LLVMConversionTarget target(*getOperation().getContext());
+
+    // Populate patterns.
+    mlir::RewritePatternSet patterns(&getContext());
+    mlir::arith::populateArithExpandOpsPatterns(patterns);
+    mlir::arith::populateArithToLLVMConversionPatterns(type_converter,
+                                                       patterns);
+    mlir::populateFuncToLLVMConversionPatterns(type_converter, patterns);
+    mlir::populateFinalizeMemRefToLLVMConversionPatterns(type_converter,
+                                                         patterns);
+    mlir::ub::populateUBToLLVMConversionPatterns(type_converter, patterns);
+    mlir::populateVectorToLLVMConversionPatterns(type_converter, patterns);
+    mlir::cf::populateControlFlowToLLVMConversionPatterns(type_converter,
+                                                          patterns);
+    mlir::populateComplexToLLVMConversionPatterns(type_converter, patterns);
+
+    //  Set up target.
+    target.addIllegalDialect<mlir::arith::ArithDialect, mlir::func::FuncDialect,
+                             mlir::complex::ComplexDialect>();
+    target.addLegalOp<mlir::ModuleOp>();
+
+    if (failed(applyPartialConversion(getOperation(), target,
+                                      std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+
+    // Clean up any leftover math ops.
+    mlir::RewritePatternSet mathPatterns(&getContext());
+    mlir::populateMathToLLVMConversionPatterns(type_converter, mathPatterns,
+                                               /*approximateLog1p=*/false);
+    target.addIllegalDialect<mlir::math::MathDialect>();
+
+    if (failed(applyFullConversion(getOperation(), target,
+                                   std::move(mathPatterns)))) {
+      signalPassFailure();
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<::mlir::Pass> CreateLowerToLLVMCPUPass() {
+  return std::make_unique<LowerToLLVMCPUPass>();
+}
+
+}  // namespace emitters
+}  // namespace xla
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.h b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.h
new file mode 100644
index 00000000000000..3fb1d73d3b2606
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.h
@@ -0,0 +1,36 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_CPU_H_
+#define XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_CPU_H_
+
+#include <memory>
+
+#include "mlir/Pass/Pass.h"
+
+namespace xla {
+namespace emitters {
+
+#define GEN_PASS_DECL
+#include "xla/codegen/emitters/transforms/lower_to_llvm_cpu.h.inc"
+
+std::unique_ptr<mlir::Pass> CreateLowerToLLVMCPUPass();
+
+#define GEN_PASS_REGISTRATION
+#include "xla/codegen/emitters/transforms/lower_to_llvm_cpu.h.inc"
+
+}  // namespace emitters
+}  // namespace xla
+
+#endif  // XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_CPU_H_
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.td b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.td
new file mode 100644
index 00000000000000..f937a75c75fd40
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.td
@@ -0,0 +1,35 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_CPU_TD_
+#define XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_CPU_TD_
+
+include "mlir/Pass/PassBase.td"
+
+def LowerToLLVMCPUPass :
+   Pass<"xla-lower-to-llvm-cpu", "mlir::ModuleOp"> {
+  let summary = "Lowers to LLVM (CPU Version).";
+
+  let description = [{
+    Lowers the rest to LLVM (CPU Version)
+  }];
+
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+  ];
+  let constructor = "CreateLowerToLLVMCPUPass()";
+}
+
+#endif  // XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_CPU_TD_
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
similarity index 65%
rename from third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm.cc
rename to third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
index 4fe70076991a27..ab4de23a4ed646 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
@@ -1,4 +1,4 @@
-/* Copyright 2024 The OpenXLA Authors.
+/* Copyright 2025 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "xla/codegen/emitters/transforms/lower_to_llvm_gpu.h"
+
 #include <cstdint>
 #include <memory>
 #include <string>
@@ -51,7 +53,6 @@ limitations under the License.
 #include "mlir/Transforms/DialectConversion.h"
 #include "google/protobuf/text_format.h"
 #include "xla/codegen/device_spec.h"
-#include "xla/codegen/emitters/transforms/passes.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/tsl/platform/logging.h"
@@ -63,19 +64,20 @@ namespace {
 
 namespace se = ::stream_executor;
 
-#define GEN_PASS_DEF_LOWERTOLLVMPASS
-#include "xla/codegen/emitters/transforms/passes.h.inc"
+#define GEN_PASS_DEF_LOWERTOLLVMGPUPASS
+#include "xla/codegen/emitters/transforms/lower_to_llvm_gpu.h.inc"
 
-class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
+class LowerToLLVMGPUPass
+    : public impl::LowerToLLVMGPUPassBase<LowerToLLVMGPUPass> {
  public:
-  explicit LowerToLLVMPass(const LowerToLLVMPassOptions& options)
-      : LowerToLLVMPassBase(options) {}
+  explicit LowerToLLVMGPUPass(const LowerToLLVMGPUPassOptions& options)
+      : LowerToLLVMGPUPassBase(options) {}
 
-  explicit LowerToLLVMPass(const se::DeviceDescription& device_description)
+  explicit LowerToLLVMGPUPass(const se::DeviceDescription& device_description)
       : device_spec_(device_description) {}
 
   void runOnOperation() override {
-    if (target_type_ == "gpu" && !gpu_device_info_.empty()) {
+    if (gpu_device_info_.empty()) {
       se::GpuDeviceInfoProto device_info;
       CHECK(tsl::protobuf::TextFormat::ParseFromString(gpu_device_info_,
                                                        &device_info));
@@ -83,9 +85,6 @@ class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
           se::DeviceDescription::FromProto(device_info);
       CHECK_OK(device_description.status());
       *device_spec_.mutable_type() = *device_description;
-    } else if (target_type_ == "cpu") {
-      CHECK(gpu_device_info_.empty());
-      *device_spec_.mutable_type() = CpuDeviceSpec{};
     }
     // Populate type conversions.
     mlir::LowerToLLVMOptions llvm_opts(&getContext(),
@@ -99,39 +98,37 @@ class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
     mlir::arith::populateArithExpandOpsPatterns(patterns);
     mlir::arith::populateArithToLLVMConversionPatterns(type_converter,
                                                        patterns);
-    if (device_spec_.IsGpu()) {
-      if (device_spec_.IsAmdGpu()) {
-        std::string chipset =
-            device_spec_.gpu().rocm_compute_capability().gfx_version();
-        llvm::FailureOr<mlir::amdgpu::Chipset> maybeChipset =
-            mlir::amdgpu::Chipset::parse(chipset);
-        if (failed(maybeChipset)) {
-          mlir::emitError(mlir::UnknownLoc::get(&getContext()),
-                          "Invalid chipset name: " + chipset);
-          return signalPassFailure();
-        }
-        mlir::populateGpuToROCDLConversionPatterns(
-            type_converter, patterns, mlir::gpu::amd::Runtime::Unknown,
-            *maybeChipset);
-        mlir::configureGpuToROCDLConversionLegality(target);
-      } else if (device_spec_.IsIntelGpu()) {
-        // Add sub-group-size attribute to functions.
-        int32_t sub_group_size = device_spec_.gpu().threads_per_warp();
-        if (auto module_op = mlir::dyn_cast<mlir::ModuleOp>(getOperation())) {
-          module_op.walk([sub_group_size](mlir::func::FuncOp func) {
-            if (!func.getBody().empty()) {
-              mlir::OpBuilder b(func.getContext());
-              auto sub_group_attr = b.getI32IntegerAttr(sub_group_size);
-              func->setAttr("intel_reqd_sub_group_size", sub_group_attr);
-            }
-          });
-        }
-        populateGpuToLLVMSPVConversionPatterns(type_converter, patterns);
-        populateGpuMemorySpaceAttributeConversions(type_converter);
-      } else {
-        mlir::populateGpuToNVVMConversionPatterns(type_converter, patterns);
-        mlir::configureGpuToNVVMConversionLegality(target);
+    if (device_spec_.IsAmdGpu()) {
+      std::string chipset =
+          device_spec_.gpu().rocm_compute_capability().gfx_version();
+      llvm::FailureOr<mlir::amdgpu::Chipset> maybeChipset =
+          mlir::amdgpu::Chipset::parse(chipset);
+      if (failed(maybeChipset)) {
+        mlir::emitError(mlir::UnknownLoc::get(&getContext()),
+                        "Invalid chipset name: " + chipset);
+        return signalPassFailure();
+      }
+      mlir::populateGpuToROCDLConversionPatterns(
+          type_converter, patterns, mlir::gpu::amd::Runtime::Unknown,
+          *maybeChipset);
+      mlir::configureGpuToROCDLConversionLegality(target);
+    } else if (device_spec_.IsIntelGpu()) {
+      // Add sub-group-size attribute to functions.
+      int32_t sub_group_size = device_spec_.gpu().threads_per_warp();
+      if (auto module_op = mlir::dyn_cast<mlir::ModuleOp>(getOperation())) {
+        module_op.walk([sub_group_size](mlir::func::FuncOp func) {
+          if (!func.getBody().empty()) {
+            mlir::OpBuilder b(func.getContext());
+            auto sub_group_attr = b.getI32IntegerAttr(sub_group_size);
+            func->setAttr("intel_reqd_sub_group_size", sub_group_attr);
+          }
+        });
       }
+      populateGpuToLLVMSPVConversionPatterns(type_converter, patterns);
+      populateGpuMemorySpaceAttributeConversions(type_converter);
+    } else {
+      mlir::populateGpuToNVVMConversionPatterns(type_converter, patterns);
+      mlir::configureGpuToNVVMConversionLegality(target);
     }
     mlir::populateFuncToLLVMConversionPatterns(type_converter, patterns);
     mlir::populateFinalizeMemRefToLLVMConversionPatterns(type_converter,
@@ -142,7 +139,7 @@ class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
                                                           patterns);
     mlir::populateComplexToLLVMConversionPatterns(type_converter, patterns);
 
-    //  Setup target.
+    // Set up target.
     target.addIllegalDialect<mlir::arith::ArithDialect, mlir::func::FuncDialect,
                              mlir::complex::ComplexDialect>();
     target.addLegalOp<mlir::ModuleOp>();
@@ -153,10 +150,10 @@ class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
       return;
     }
 
-    // Cleanup any leftover math ops not handled NVVM or ROCDL lowering
+    // Clean up any leftover math ops not handled NVVM or ROCDL lowering.
     mlir::RewritePatternSet mathPatterns(&getContext());
     mlir::populateMathToLLVMConversionPatterns(type_converter, mathPatterns,
-                                               /* approximateLog1p */ false);
+                                               /*approximateLog1p=*/false);
     target.addIllegalDialect<mlir::math::MathDialect>();
 
     if (failed(applyFullConversion(getOperation(), target,
@@ -171,17 +168,16 @@ class LowerToLLVMPass : public impl::LowerToLLVMPassBase<LowerToLLVMPass> {
 
 }  // namespace
 
-std::unique_ptr<::mlir::Pass> CreateLowerToLLVMPass(
-    const std::string& target_type, const std::string& gpu_device_info) {
-  LowerToLLVMPassOptions options;
+std::unique_ptr<::mlir::Pass> CreateLowerToLLVMGPUPass(
+    const std::string& gpu_device_info) {
+  LowerToLLVMGPUPassOptions options;
   options.gpu_device_info_ = gpu_device_info;
-  options.target_type_ = target_type;
-  return std::make_unique<LowerToLLVMPass>(options);
+  return std::make_unique<LowerToLLVMGPUPass>(options);
 }
 
-std::unique_ptr<::mlir::Pass> CreateLowerToLLVMPass(
+std::unique_ptr<::mlir::Pass> CreateLowerToLLVMGPUPass(
     const se::DeviceDescription& device_description) {
-  return std::make_unique<LowerToLLVMPass>(device_description);
+  return std::make_unique<LowerToLLVMGPUPass>(device_description);
 }
 
 }  // namespace emitters
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h
new file mode 100644
index 00000000000000..35bd6a0d33766b
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h
@@ -0,0 +1,44 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_H_
+#define XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "mlir/Pass/Pass.h"
+
+namespace stream_executor {
+class DeviceDescription;
+}  // namespace stream_executor
+
+namespace xla {
+namespace emitters {
+
+#define GEN_PASS_DECL
+#include "xla/codegen/emitters/transforms/lower_to_llvm_gpu.h.inc"
+
+std::unique_ptr<mlir::Pass> CreateLowerToLLVMGPUPass(
+    const std::string& gpu_device_info = "");
+std::unique_ptr<mlir::Pass> CreateLowerToLLVMGPUPass(
+    const stream_executor::DeviceDescription& device_description);
+#define GEN_PASS_REGISTRATION
+#include "xla/codegen/emitters/transforms/lower_to_llvm_gpu.h.inc"
+
+}  // namespace emitters
+}  // namespace xla
+
+#endif  // XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_H_
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.td b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.td
new file mode 100644
index 00000000000000..765dab04a6fb79
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.td
@@ -0,0 +1,42 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_TD_
+#define XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_TD_
+
+include "mlir/Pass/PassBase.td"
+
+def LowerToLLVMGPUPass :
+   Pass<"xla-lower-to-llvm-gpu", "mlir::ModuleOp"> {
+  let summary = "Lowers to LLVM (GPU Version).";
+
+  let description = [{
+    Lowers the rest to LLVM (GPU Version)
+  }];
+
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+    "mlir::LLVM::LLVMDialect",
+    "mlir::NVVM::NVVMDialect",
+  ];
+
+  let options = [
+    Option<"gpu_device_info_", "gpu_device_info", "std::string", /*default=*/"",
+           "Serialized stream_executor::GPUDeviceInfo proto.">,
+  ];
+  let constructor = "CreateLowerToLLVMGPUPass()";
+}
+
+#endif  // XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_TD_
diff --git a/third_party/xla/xla/codegen/emitters/transforms/passes.h b/third_party/xla/xla/codegen/emitters/transforms/passes.h
index 74a5ab5fe2f897..d1bb595ab029a7 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/passes.h
+++ b/third_party/xla/xla/codegen/emitters/transforms/passes.h
@@ -42,11 +42,6 @@ std::unique_ptr<mlir::Pass> CreateLowerTensorsPass(
     const std::string& gpu_device_info = "");
 std::unique_ptr<mlir::Pass> CreateLowerTensorsPass(
     const stream_executor::DeviceDescription& device_description);
-std::unique_ptr<mlir::Pass> CreateLowerToLLVMPass(
-    const std::string& target_type = "gpu",
-    const std::string& gpu_device_info = "");
-std::unique_ptr<mlir::Pass> CreateLowerToLLVMPass(
-    const stream_executor::DeviceDescription& device_description);
 std::unique_ptr<mlir::Pass> CreateLowerXlaToScfPass(int64_t warp_size = 32);
 std::unique_ptr<mlir::Pass> CreateLowerXlaLoopsToScfPass();
 std::unique_ptr<mlir::Pass> CreateMergePointersToSameSlicePass();
diff --git a/third_party/xla/xla/codegen/emitters/transforms/passes.td b/third_party/xla/xla/codegen/emitters/transforms/passes.td
index 0b7afb432042f6..c82a0d26ea39a2 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/passes.td
+++ b/third_party/xla/xla/codegen/emitters/transforms/passes.td
@@ -129,29 +129,6 @@ def LowerTensorsPass : Pass<"xla-lower-tensors", "mlir::ModuleOp"> {
   let constructor = "CreateLowerTensorsPass()";
 }
 
-def LowerToLLVMPass :
-   Pass<"xla-lower-to-llvm", "mlir::ModuleOp"> {
-  let summary = "Lowers to LLVM.";
-
-  let description = [{
-    Lowers the rest to LLVM
-  }];
-
-  let dependentDialects = [
-    "mlir::func::FuncDialect",
-    "mlir::LLVM::LLVMDialect",
-    "mlir::NVVM::NVVMDialect",
-  ];
-
-  let options = [
-    Option<"gpu_device_info_", "gpu_device_info", "std::string", /*default=*/"",
-           "Serialized stream_executor::GPUDeviceInfo proto.">,
-    Option<"target_type_", "target_type", "std::string", /*default=*/"\"gpu\"",
-           "Whether the pass targets a 'cpu' or 'gpu'. If 'cpu', gpu_device_info_ must be empty.">,
-  ];
-  let constructor = "CreateLowerToLLVMPass()";
-}
-
 def LowerXlaToScfPass :
    Pass<"xla-lower-xla-to-scf", "mlir::func::FuncOp"> {
   let summary = "Lowers xla to SCF.";

From 2f41ba94e4625718e339800da9527fac595b8e9d Mon Sep 17 00:00:00 2001
From: Bryan Massoth <bmassoth@google.com>
Date: Wed, 10 Dec 2025 16:35:38 -0800
Subject: [PATCH 150/753] Integrate SparseCore offloading metadata into
 grouping logic to accurately group SparseCore execution with TensorCore.

PiperOrigin-RevId: 842931062
---
 third_party/xla/xla/tsl/profiler/utils/BUILD  |   6 +-
 .../xla/tsl/profiler/utils/group_events.cc    | 367 +++++++-----
 .../xla/xla/tsl/profiler/utils/group_events.h |  23 +-
 .../tsl/profiler/utils/group_events_test.cc   | 526 +++++++++++++++++-
 .../tsl/profiler/utils/xplane_test_utils.cc   |   9 +-
 .../tsl/profiler/utils/xplane_test_utils.h    |  10 +-
 .../xla/tsl/profiler/utils/xplane_visitor.h   |   2 +
 7 files changed, 790 insertions(+), 153 deletions(-)

diff --git a/third_party/xla/xla/tsl/profiler/utils/BUILD b/third_party/xla/xla/tsl/profiler/utils/BUILD
index 5088f54edeb11c..175eabf5b957bc 100644
--- a/third_party/xla/xla/tsl/profiler/utils/BUILD
+++ b/third_party/xla/xla/tsl/profiler/utils/BUILD
@@ -297,6 +297,7 @@ cc_library(
     deps = [
         ":tf_xplane_visitor",
         ":timespan",
+        ":tpu_xplane_utils",
         ":xplane_builder",
         ":xplane_schema",
         ":xplane_utils",
@@ -311,8 +312,8 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/functional:bind_front",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:dso_loader",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
     ],
 )
@@ -344,6 +345,7 @@ tsl_cc_test(
         ":preprocess_xplane",
         ":tf_xplane_visitor",
         ":timespan",
+        ":trace_utils",
         ":xplane_builder",
         ":xplane_schema",
         ":xplane_test_utils",
@@ -353,8 +355,10 @@ tsl_cc_test(
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:types",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
+        "@local_tsl//tsl/profiler/lib:context_types_hdrs",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
     ],
 )
diff --git a/third_party/xla/xla/tsl/profiler/utils/group_events.cc b/third_party/xla/xla/tsl/profiler/utils/group_events.cc
index 7edf70559c50f3..20a74dd74eea91 100644
--- a/third_party/xla/xla/tsl/profiler/utils/group_events.cc
+++ b/third_party/xla/xla/tsl/profiler/utils/group_events.cc
@@ -15,11 +15,8 @@ limitations under the License.
 
 #include "xla/tsl/profiler/utils/group_events.h"
 
-#include <algorithm>
 #include <cstdint>
 #include <functional>
-#include <iterator>
-#include <map>
 #include <memory>
 #include <optional>
 #include <queue>
@@ -31,18 +28,21 @@ limitations under the License.
 #include "absl/base/no_destructor.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/functional/bind_front.h"
+#include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "xla/tsl/lib/gtl/map_util.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/types.h"
 #include "xla/tsl/profiler/utils/tf_xplane_visitor.h"
 #include "xla/tsl/profiler/utils/timespan.h"
+#include "xla/tsl/profiler/utils/tpu_xplane_utils.h"
 #include "xla/tsl/profiler/utils/xplane_builder.h"
 #include "xla/tsl/profiler/utils/xplane_schema.h"
 #include "xla/tsl/profiler/utils/xplane_utils.h"
 #include "xla/tsl/profiler/utils/xplane_visitor.h"
-#include "tsl/platform/dso_loader.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
 
 namespace tsl {
@@ -144,16 +144,13 @@ void SetContextGroup(const GroupingEventStats& stats, EventNode* event,
 }
 
 void ConnectContextGroups(const ContextGroupMap& context_groups) {
-  for (auto& type_id_group : context_groups) {
-    for (auto& id_group : type_id_group.second) {
-      const ContextGroup& group = id_group.second;
+  for (auto& [type, id_group] : context_groups) {
+    for (auto& [id, group] : id_group) {
       if (group.producers.size() >= 64 && group.consumers.size() >= 64) {
         LOG_EVERY_N(WARNING, 1000)
-            << "id:" << id_group.first
+            << "type: " << type << " id: " << id
             << " producers:" << group.producers.size() << " : "
-            << group.producers[0]->GetEventVisitor().Name()
-            << " consumers:" << group.consumers.size() << " : "
-            << group.consumers[0]->GetEventVisitor().Name();
+            << " consumers:" << group.consumers.size() << " : ";
         continue;
       }
 
@@ -166,6 +163,13 @@ void ConnectContextGroups(const ContextGroupMap& context_groups) {
   }
 }
 
+bool IsTPUParentLineEvent(const XEventVisitor& event) {
+  return event.LineName() == kStepLineName ||
+         event.LineName() == kSparseCoreStepLineName ||
+         event.LineName() == kXlaModuleLineName ||
+         event.LineName() == kSparseCoreModuleLineName;
+}
+
 bool IsImplicitRootEvent(const XEventVisitor& event) {
   static const absl::NoDestructor<absl::flat_hash_set<int64_t>>
       kImplicitRootEvents({
@@ -174,8 +178,9 @@ bool IsImplicitRootEvent(const XEventVisitor& event) {
           HostEventType::kRunGraph,
           HostEventType::kExecutorStateProcess,
       });
-  return event.Type().has_value() &&
-         kImplicitRootEvents->contains(*event.Type());
+  return (event.Type().has_value() &&
+          kImplicitRootEvents->contains(*event.Type())) ||
+         IsTPUParentLineEvent(event);
 }
 
 void ProcessRootEvent(int64_t group_id, EventNode* root_event,
@@ -269,7 +274,7 @@ std::string EventNode::GetGroupName() const {
   std::string name;
   if (std::optional<XStatVisitor> stat = GetContextStat(StatType::kGraphType)) {
     absl::StrAppend(&name, stat->StrOrRefValue(), " ");
-  } else if (!(IsImplicitRootEvent(visitor_))) {
+  } else if (!IsImplicitRootEvent(visitor_)) {
     absl::StrAppend(&name, GetEventVisitor().Name(), " ");
   }
   int64_t step_num = group_id_.value_or(0);
@@ -381,41 +386,133 @@ void EventForest::FindEventNodeAndApply(
   }
 }
 
-void EventForest::ConnectIntraThread(XPlane* plane, XPlaneVisitor* visitor,
-                                     ContextGroupMap* context_groups) {
-  bool is_host_plane = (visitor->Name() == kHostThreadsPlaneName);
+// Finds the primary line used for grouping TPU events (StepLine or ModuleLine).
+// Returns nullptr if no suitable grouping line is found.
+XLine* GetGroupingLineForTPU(XPlane* plane) {
+  XLine* step_line = nullptr;
+  XLine* module_line = nullptr;
+  for (auto& line : *plane->mutable_lines()) {
+    if (line.name() == kStepLineName ||
+        line.name() == kSparseCoreStepLineName) {
+      step_line = &line;
+    } else if (line.name() == kXlaModuleLineName ||
+               line.name() == kSparseCoreModuleLineName) {
+      module_line = &line;
+    }
+  }
+
+  if (step_line != nullptr && step_line->events_size() > 0) {
+    // Prefer the step line for grouping if it is not empty.
+    return step_line;
+  }
+  if (module_line != nullptr && module_line->events_size() > 0) {
+    // Fall back to the module line for inference grouping.
+    return module_line;
+  }
+  return nullptr;
+}
+
+void EventForest::ConnectIntraThreadTPU(XPlane* plane, XPlaneVisitor* visitor,
+                                        ContextGroupMap* context_groups) {
+  std::optional<int64_t> tc_id = GetTensorCoreId(visitor->Name());
+  std::optional<int64_t> sc_id = GetSparseCoreId(visitor->Name());
+  if (!tc_id.has_value() && !sc_id.has_value()) {
+    LOG(ERROR) << "TensorCore or SparseCore ID is missing. Skipping grouping "
+                  "for device plane: "
+               << visitor->Name();
+    return;
+  }
+  XLine* grouping_line = GetGroupingLineForTPU(plane);
+  if (grouping_line == nullptr) {
+    LOG(ERROR) << "No grouping line found. Skipping grouping for device plane: "
+               << visitor->Name();
+    return;
+  }
+
+  // Step 1: Connect the grouping events and store them as the parent nodes for
+  // future connection delegation. NOTE: This will need to be updated to support
+  // sub-step grouping.
+  std::vector<EventNode*> parent_nodes;
+  parent_nodes.reserve(grouping_line->events_size());
+  for (auto& event : *grouping_line->mutable_events()) {
+    XEventVisitor event_visitor(visitor, grouping_line, &event);
+    int64_t event_type = GetEventType(/*is_host_plane=*/false, event_visitor);
+    EventNode* step_node =
+        &event_node_map_[event_type].emplace_back(std::move(event_visitor));
+    GroupingEventStats stats(step_node->GetEventVisitor());
+    parent_nodes.push_back(step_node);
+    SetContextGroup(stats, step_node, context_groups);
+  }
+  // Step 2: Process all other events and propagate their connection metadata to
+  // the parent nodes.
   for (auto& line : *plane->mutable_lines()) {
-    if (line.name() == kTensorCoreSyncFlagLineName ||
-        line.name() == kSparseCoreSyncsLineName) {
-      VLOG(1) << "Skipping Xline with name: " << line.name()
-              << " in plane: " << visitor->Name();
+    if (&line == grouping_line) {
       continue;
     }
-    std::vector<EventNode*> parent_nodes;
+    int parent_index = 0;  // Reset index for each line
     for (auto& event : *line.mutable_events()) {
       XEventVisitor event_visitor(visitor, &line, &event);
-      int64_t event_type = GetEventType(is_host_plane, event_visitor);
-      EventNode* cur_node =
-          &event_node_map_[event_type].emplace_back(std::move(event_visitor));
-      GroupingEventStats stats(cur_node->GetEventVisitor());
-      if (stats.root_level.has_value()) {
-        cur_node->SetRootLevel(*stats.root_level);
+      GroupingEventStats stats(event_visitor);
+      // Find the first step node that *may* be the parent of this event.
+      while (parent_index < parent_nodes.size() &&
+             parent_nodes[parent_index]
+                     ->GetEventVisitor()
+                     .GetTimespan()
+                     .end_ps() <= event_visitor.GetTimespan().begin_ps()) {
+        parent_index++;
       }
-      // Update `context_groups` for `ConnectInterThread`.
-      SetContextGroup(stats, cur_node, context_groups);
-      // Async events are ignored when processing the nesting relationship.
-      if (!stats.is_async) {
-        while (!parent_nodes.empty()) {
-          EventNode* parent_node = parent_nodes.back();
-          if (parent_node->GetEventVisitor().GetTimespan().Includes(
-                  cur_node->GetEventVisitor().GetTimespan())) {
-            parent_node->AddChild(cur_node);
-            break;
-          } else {
+      if (parent_index == parent_nodes.size()) {
+        // Short-circuit when we've reached the end of the parent line.
+        break;
+      }
+      if (parent_nodes[parent_index]->GetEventVisitor().GetTimespan().Includes(
+              event_visitor.GetTimespan())) {
+        // For device events, the parent nodes will consume the
+        // producer/consumer stats of children to reduce the number of nodes
+        // in DAG.
+        SetContextGroup(stats, parent_nodes[parent_index], context_groups);
+      }
+    }
+  }
+  // Step 3: [Only for TensorCore] Store the parent nodes for later if they fail
+  // to be grouped with the host events.
+  if (tc_id.has_value()) {
+    tensor_core_root_events_per_core_.emplace_back(std::move(parent_nodes));
+  }
+}
+
+void EventForest::ConnectIntraThread(XPlane* plane, XPlaneVisitor* visitor,
+                                     ContextGroupMap* context_groups) {
+  bool is_host_plane = (visitor->Name() == kHostThreadsPlaneName);
+  if (absl::StartsWith(visitor->Name(), kTpuPlanePrefix)) {
+    ConnectIntraThreadTPU(plane, visitor, context_groups);
+  } else {
+    for (auto& line : *plane->mutable_lines()) {
+      std::vector<EventNode*> parent_nodes;
+      for (auto& event : *line.mutable_events()) {
+        XEventVisitor event_visitor(visitor, &line, &event);
+        int64_t event_type = GetEventType(is_host_plane, event_visitor);
+        EventNode* cur_node =
+            &event_node_map_[event_type].emplace_back(std::move(event_visitor));
+        GroupingEventStats stats(cur_node->GetEventVisitor());
+        if (stats.root_level.has_value()) {
+          cur_node->SetRootLevel(*stats.root_level);
+        }
+        // Update `context_groups` for `ConnectInterThread`.
+        SetContextGroup(stats, cur_node, context_groups);
+        // Async events are ignored when processing the nesting relationship.
+        if (!stats.is_async) {
+          while (!parent_nodes.empty()) {
+            EventNode* parent_node = parent_nodes.back();
+            if (parent_node->GetEventVisitor().GetTimespan().Includes(
+                    cur_node->GetEventVisitor().GetTimespan())) {
+              parent_node->AddChild(cur_node);
+              break;
+            }
             parent_nodes.pop_back();
           }
+          parent_nodes.push_back(cur_node);
         }
-        parent_nodes.push_back(cur_node);
       }
     }
   }
@@ -468,7 +565,7 @@ bool RootNeedsGrouping(const EventNode* root) {
   // different levels are grouped separately.
   const EventNode* root_parent = FindParentWithComparator(
       [root](const EventNode* parent) {
-        return parent->RootLevel() == root->RootLevel();
+        return parent->IsRoot() && parent->RootLevel() == root->RootLevel();
       },
       root,
       /*include_self=*/false);
@@ -493,28 +590,56 @@ void EventForest::CreateEventGroups() {
     for (EventNode* root_event : tf_loop_root_events_) {
       ProcessRootEvent(group_id++, root_event, &group_metadata_map_);
     }
-    return;
-  }
+  } else {
+    // Iterate over all events and collect all root events.
+    EventList root_events;
+    EventList implicit_root_events;
+    for (auto& [event_type, events] : event_node_map_) {
+      for (EventNode& event : events) {
+        if (!event.IsRoot()) {
+          continue;
+        }
+        std::optional<XStatVisitor> step_id_stat =
+            event.GetEventVisitor().GetStat(StatType::kStepId);
+        // If this is a root event that associated with tf.data, skip.
+        if (step_id_stat &&
+            tf_data_step_ids_.contains(step_id_stat->IntValue())) {
+          continue;
+        }
+        root_events.push_back(&event);
+      }
+    }
 
-  // Iterate over all events and collect all root events.
-  EventList root_events;
-  for (auto& [event_type, events] : event_node_map_) {
-    for (EventNode& event : events) {
-      if (!event.RootLevel()) continue;
-      std::optional<XStatVisitor> step_id_stat =
-          event.GetEventVisitor().GetStat(StatType::kStepId);
-      // If this is a root event that associated with tf.data, skip.
-      if (step_id_stat && tf_data_step_ids_.contains(step_id_stat->IntValue()))
-        continue;
-      root_events.push_back(&event);
+    SortRootEventList(&root_events);
+
+    for (EventNode* root_event : root_events) {
+      if (RootNeedsGrouping(root_event)) {
+        ProcessRootEvent(group_id++, root_event, &group_metadata_map_);
+      }
     }
   }
 
-  SortRootEventList(&root_events);
-
-  for (EventNode* root_event : root_events) {
-    if (RootNeedsGrouping(root_event)) {
-      ProcessRootEvent(group_id++, root_event, &group_metadata_map_);
+  // Check if any TPU root events were grouped. If not, group all in lock step.
+  bool tpu_needs_grouping = absl::c_all_of(
+      tensor_core_root_events_per_core_, [](const auto& core_root_events) {
+        return absl::c_all_of(core_root_events, [](const auto& event) {
+          return RootNeedsGrouping(event);
+        });
+      });
+  if (tpu_needs_grouping) {
+    for (auto& core_root_events : tensor_core_root_events_per_core_) {
+      // Do not change the group_id. This is a cheap way to align the TensorCore
+      // and SparseCore device step events. But can be incorrect if somehow one
+      // core started from an earlier step.
+      uint64_t device_step_group_id = group_id;
+      for (EventNode* root_event : core_root_events) {
+        // If the device step event hasn't been grouped, then treat it as a root
+        // event and group it.
+        if (RootNeedsGrouping(root_event)) {
+          ProcessRootEvent(device_step_group_id++, root_event,
+                           &group_metadata_map_);
+        }
+      }
     }
   }
 }
@@ -615,6 +740,10 @@ void EventForest::ProcessTensorFlowLoop() {
 void EventForest::AddPlane(
     const std::function<XPlaneVisitor(const XPlane*)> visitor_factory,
     XPlane* plane) {
+  if (registered_planes_.contains(plane)) {
+    return;
+  }
+  registered_planes_.insert(plane);
   CreateStatMetadata(plane);
   planes_.push_back({plane, visitor_factory(plane)});
 }
@@ -828,32 +957,55 @@ void MergeHostSteps(const XStatMetadata& group_id_stat_metadata,
       GetStatTypeStr(StatType::kDeviceDurationPs));
   auto device_offset_stat_metadata = *plane_builder->GetOrCreateStatMetadata(
       GetStatTypeStr(StatType::kDeviceOffsetPs));
+  auto step_idle_time_stat_metadata = *plane_builder->GetOrCreateStatMetadata(
+      GetStatTypeStr(StatType::kStepIdleTimePs));
   std::optional<int64_t> merged_group_id;
   std::optional<Timespan> merged_device_timespan;
   std::optional<XEventBuilder> merged_step_builder;
+  int64_t merged_step_idle_time = 0;
   absl::flat_hash_set<const XEvent*> events_to_remove;
   for (XEvent& step_event : *step_line->mutable_events()) {
     XEventVisitor step_visitor(&plane_visitor, step_line, &step_event);
     auto group_id = GetGroupId(step_visitor, group_id_stat_metadata);
-    if (!group_id) {
+    if (!group_id.has_value()) {
       // Discard ungrouped event.
       // This usually happens at the beginning of a trace collected using
       // sampling mode, since the host is ahead of the device.
       merged_group_id.reset();
       merged_step_builder.reset();
+      merged_step_idle_time = 0;
       events_to_remove.insert(&step_event);
     } else if (merged_group_id != group_id) {
       // Start a new step with the current event.
       merged_group_id = group_id;
       merged_device_timespan.reset();
-      if (step_visitor.GetStat(StatType::kDeviceOffsetPs).has_value() &&
-          step_visitor.GetStat(StatType::kDeviceDurationPs).has_value()) {
+      if (std::optional<XStatVisitor> current_step_idle_time =
+              step_visitor.GetStat(StatType::kStepIdleTimePs,
+                                   step_idle_time_stat_metadata);
+          current_step_idle_time.has_value()) {
+        merged_step_idle_time = current_step_idle_time->IntOrUintValue();
+      }
+      if (step_visitor
+              .GetStat(StatType::kDeviceOffsetPs, device_offset_stat_metadata)
+              .has_value() &&
+          step_visitor
+              .GetStat(StatType::kDeviceDurationPs,
+                       device_duration_stat_metadata)
+              .has_value()) {
         merged_device_timespan = GetDeviceEventTimespan(step_visitor);
       }
       merged_step_builder.emplace(step_line, plane_builder, &step_event);
+      merged_step_builder->SetOrAddStatValue(step_idle_time_stat_metadata,
+                                             merged_step_idle_time);
     } else {
       // Multi-module step: extend the previous step until the end of the
       // current event and discard the current event.
+      if (std::optional<XStatVisitor> current_step_idle_time =
+              step_visitor.GetStat(StatType::kStepIdleTimePs,
+                                   step_idle_time_stat_metadata);
+          current_step_idle_time.has_value()) {
+        merged_step_idle_time += current_step_idle_time->IntOrUintValue();
+      }
       if (merged_device_timespan.has_value()) {
         merged_device_timespan->ExpandToInclude(
             GetDeviceEventTimespan(step_visitor));
@@ -864,6 +1016,8 @@ void MergeHostSteps(const XStatMetadata& group_id_stat_metadata,
             merged_device_timespan->duration_ps());
       }
       merged_step_builder->SetEndTimestampPs(step_visitor.EndTimestampPs());
+      merged_step_builder->SetOrAddStatValue(step_idle_time_stat_metadata,
+                                             merged_step_idle_time);
       events_to_remove.insert(&step_event);
     }
   }
@@ -902,89 +1056,40 @@ void GroupHostAndPlanes(
   event_forest->GroupEvents();
 }
 
-void GroupXplaneEvents(tensorflow::profiler::XPlane* plane,
-                       const GroupMetadataMap& group_metadata_map) {
-  // For each device_trace, the following happens:
-  // (1) Find the module line and the step line.
-  // (2) Assigns group_id to step events. group_id is read from the module
-  //     events nested by the step events.
-  // (3) Assigns group_id to other events nested by the grouped module events.
-  XLine* module_line = nullptr;
+// Groups the events in the device plane using the step line or module line as
+// the grouping line depending on whether the loop is on the device or host.
+void GroupTpuXPlaneEvents(tensorflow::profiler::XPlane* plane,
+                          const GroupMetadataMap& group_metadata_map) {
   XLine* step_line = nullptr;
   std::vector<XLine*> other_lines;
   for (XLine& line : *plane->mutable_lines()) {
-    if (line.name() == "XLA Modules") {
-      module_line = &line;
-    } else if (line.name() == "Steps") {
+    if (line.name() == kStepLineName ||
+        line.name() == kSparseCoreStepLineName) {
       step_line = &line;
     } else {
       other_lines.push_back(&line);
     }
   }
-
-  if (!module_line) return;
-
+  XLine* grouping_line = GetGroupingLineForTPU(plane);
+  if (grouping_line == nullptr) {
+    return;
+  }
   XPlaneBuilder plane_builder(plane);
   const XStatMetadata* group_id_stat_metadata =
       plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kGroupId));
   // NOTE: Create plane_visitor after adding new stat metadata to
   // plane_builder, so plane_visitor picks up the changes.
   XPlaneVisitor plane_visitor = CreateTfXPlaneVisitor(plane);
-  const XLine* group_line = module_line;
-  if (step_line) {
-    bool device_loop = (step_line->events_size() > module_line->events_size());
-    if (device_loop) {
-      int32_t group_id = 0;
-      for (XEvent& event : *step_line->mutable_events()) {
-        XEventBuilder step_builder(step_line, &plane_builder, &event);
-        XEventVisitor step_visitor(&plane_visitor, step_line, &event);
-        if (!step_visitor.GetStat(StatType::kGroupId).has_value()) {
-          step_builder.AddStatValue(*group_id_stat_metadata, group_id++);
-        }
-      }
-      group_line = step_line;
-    } else {  // host loop
-      if (group_line) {
-        // Determine whether the module line has been grouped.
-        bool is_grouped = false;
-        for (XEvent& event : *module_line->mutable_events()) {
-          XEventVisitor module_visitor(&plane_visitor, module_line, &event);
-          if (module_visitor.GetStat(StatType::kGroupId).has_value()) {
-            is_grouped = true;
-            break;
-          }
-        }
-        if (!is_grouped) {
-          // If the module line has not been grouped, then:
-          // (1) Assign group_id to each step event.
-          int32_t group_id = 0;
-          for (XEvent& event : *step_line->mutable_events()) {
-            XEventBuilder step_builder(step_line, &plane_builder, &event);
-            XEventVisitor step_visitor(&plane_visitor, step_line, &event);
-            if (!step_visitor.GetStat(StatType::kGroupId).has_value()) {
-              step_builder.AddStatValue(*group_id_stat_metadata, group_id++);
-            }
-          }
-          // (2) Group the module events nested by the step events.
-          GroupLine(*group_id_stat_metadata, plane_visitor, *step_line,
-                    &plane_builder, module_line);
-        }
-        // Host loop steps take the group_id from their module.
-        GroupLine(*group_id_stat_metadata, plane_visitor, *group_line,
-                  &plane_builder, step_line);
-        // Merge consecutive steps with the same group_id.
-        MergeHostSteps(*group_id_stat_metadata, plane_visitor, &plane_builder,
-                       step_line);
-        XLineBuilder step_line_builder(step_line, &plane_builder);
-        AddGroupMetadataToStepEvents(group_metadata_map, step_line_builder);
-      }
-    }
+  if (step_line != nullptr) {
+    // Merge consecutive steps with the same group_id.
+    MergeHostSteps(*group_id_stat_metadata, plane_visitor, &plane_builder,
+                   step_line);
+    XLineBuilder step_line_builder(step_line, &plane_builder);
+    AddGroupMetadataToStepEvents(group_metadata_map, step_line_builder);
   }
-  if (group_line) {
-    for (XLine* line : other_lines) {
-      GroupLine(*group_id_stat_metadata, plane_visitor, *group_line,
-                &plane_builder, line);
-    }
+  for (XLine* line : other_lines) {
+    GroupLine(*group_id_stat_metadata, plane_visitor, *grouping_line,
+              &plane_builder, line);
   }
 }
 
@@ -1008,7 +1113,7 @@ void GroupTpuEventsOSS(
   for (XPlane* plane : device_traces) {
     threads.emplace_back(Env::Default()->StartThread(
         thread_options, "group_xplane_events",
-        absl::bind_front(GroupXplaneEvents, plane,
+        absl::bind_front(GroupTpuXPlaneEvents, plane,
                          std::ref(group_metadata_map))));
   }
 }
diff --git a/third_party/xla/xla/tsl/profiler/utils/group_events.h b/third_party/xla/xla/tsl/profiler/utils/group_events.h
index 2bb0aa811a92b9..51c3cf4c3c94c0 100644
--- a/third_party/xla/xla/tsl/profiler/utils/group_events.h
+++ b/third_party/xla/xla/tsl/profiler/utils/group_events.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef XLA_TSL_PROFILER_UTILS_GROUP_EVENTS_H_
 #define XLA_TSL_PROFILER_UTILS_GROUP_EVENTS_H_
 
+#include <cstdint>
 #include <deque>
 #include <functional>
 #include <memory>
@@ -101,7 +102,12 @@ class EventNode {
 
   void SetRootLevel(int root_level) { root_level_ = root_level; }
 
-  int RootLevel() const { return root_level_; }
+  // Returns the root level of this event.
+  // NOTE: return 0 if this event is not a root event to maintain the legacy
+  // behavior.
+  int RootLevel() const { return root_level_.value_or(0); }
+
+  bool IsRoot() const { return root_level_.has_value(); }
 
   bool IsCompiledFunc() const;
 
@@ -121,7 +127,7 @@ class EventNode {
   // Root event level.
   // By default root_level_ is set to 0, which means it is not a root event.
   // Events with root_level_ greater than 0 are considered as root events.
-  int root_level_ = 0;
+  std::optional<int> root_level_;
 };
 
 using EventNodeMap =
@@ -173,6 +179,9 @@ class EventForest {
       std::function<XPlaneVisitor(const tensorflow::profiler::XPlane*)>
           visitor_factory,
       tensorflow::profiler::XPlane* plane);
+  void ConnectIntraThreadTPU(tensorflow::profiler::XPlane* plane,
+                             XPlaneVisitor* visitor,
+                             ContextGroupMap* context_groups);
 
   // Creates an EventNode for each event in event_node_map and connect events
   // according to the nesting relationship within the thread.
@@ -216,12 +225,16 @@ class EventForest {
 
   EventNodeMap event_node_map_;
   std::vector<XPlaneVisitor> visitors_;
+  absl::flat_hash_set<XPlane*> registered_planes_;
   // std::deque for pointer stability.
   std::deque<std::pair<tensorflow::profiler::XPlane*, XPlaneVisitor>> planes_;
   // The "step" id (actually it is "function" id that are associated with
   // the tf.data pipeline.
   absl::flat_hash_set<int64_t> tf_data_step_ids_;
   EventList tf_loop_root_events_;
+  // The root events for TPUs per core.
+  std::vector<std::vector<EventNode*>> tensor_core_root_events_per_core_;
+  std::vector<std::vector<EventNode*>> sparse_core_root_events_per_core_;
   GroupMetadataMap group_metadata_map_;
 };
 
@@ -246,8 +259,10 @@ void GroupHostAndPlanes(
     const std::vector<tensorflow::profiler::XPlane*>& device_traces,
     EventForest* event_forest);
 
-void GroupXplaneEvents(tensorflow::profiler::XPlane* plane,
-                       const GroupMetadataMap& group_metadata_map);
+// Groups the events in the provided TPU plane by the step line or module line
+// depending on whether the loop is on the device or host.
+void GroupTpuXPlaneEvents(tensorflow::profiler::XPlane* plane,
+                          const GroupMetadataMap& group_metadata_map);
 
 void GroupTpuEventsOSS(
     tensorflow::profiler::XSpace* space,
diff --git a/third_party/xla/xla/tsl/profiler/utils/group_events_test.cc b/third_party/xla/xla/tsl/profiler/utils/group_events_test.cc
index f3b8e20a56ebe6..04e2e33f178bba 100644
--- a/third_party/xla/xla/tsl/profiler/utils/group_events_test.cc
+++ b/third_party/xla/xla/tsl/profiler/utils/group_events_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <optional>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "xla/tsl/platform/test.h"
@@ -26,11 +27,13 @@ limitations under the License.
 #include "xla/tsl/profiler/utils/preprocess_xplane.h"
 #include "xla/tsl/profiler/utils/tf_xplane_visitor.h"
 #include "xla/tsl/profiler/utils/timespan.h"
+#include "xla/tsl/profiler/utils/trace_utils.h"
 #include "xla/tsl/profiler/utils/xplane_builder.h"
 #include "xla/tsl/profiler/utils/xplane_schema.h"
 #include "xla/tsl/profiler/utils/xplane_test_utils.h"
 #include "xla/tsl/profiler/utils/xplane_utils.h"
 #include "xla/tsl/profiler/utils/xplane_visitor.h"
+#include "tsl/profiler/lib/context_types.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
 
 namespace tsl {
@@ -734,19 +737,20 @@ TEST(GroupTPUEventsTest, TpuProgramCallbackTest) {
 
 TEST(GroupTPUEventsTest, ModuleRootEventTest) {
   tensorflow::profiler::XSpace space;
-  tensorflow::profiler::XPlane* device_plane = space.add_planes();
+  tensorflow::profiler::XPlane* device_plane =
+      GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0);
   XPlaneBuilder device_plane_builder(device_plane);
-  device_plane_builder.ReserveLines(1);
+  device_plane_builder.ReserveLines(3);
   auto step_line = device_plane_builder.GetOrCreateLine(0);
   step_line.SetName("Steps");
   CreateXEvent(&device_plane_builder, &step_line, "1", 100, 200,
                {{StatType::kStepNum, int64_t{1}}});
   auto module_line = device_plane_builder.GetOrCreateLine(1);
   module_line.SetName("XLA Modules");
-  CreateXEvent(&device_plane_builder, &module_line, "module", 105, 199,
+  CreateXEvent(&device_plane_builder, &module_line, "module", 105, 194,
                {{StatType::kRunId, int64_t{123}},
                 {StatType::kQueueId, int64_t{0}},
-                {StatType::kDeviceOrdinal, int64_t{1}}});
+                {StatType::kDeviceOrdinal, int64_t{0}}});
   auto hlo_line = device_plane_builder.GetOrCreateLine(2);
   hlo_line.SetName("XLA Ops");
   CreateXEvent(&device_plane_builder, &hlo_line, "matmul", 110, 190, {});
@@ -771,19 +775,23 @@ TEST(GroupTPUEventsTest, MergeHostStepsTest) {
   CreateXEvent(
       &host_plane_builder, &main_thread, "train", 100, 10,
       {{StatType::kStepNum, int64_t{1}}, {StatType::kIsRoot, int64_t{1}}});
-  CreateXEvent(&host_plane_builder, &main_thread, "DoEnqueueProgram", 100, 1,
+  CreateXEvent(&host_plane_builder, &main_thread,
+               HostEventType::kDoEnqueueProgram, 100, 1,
                {{StatType::kRunId, int64_t{2}},
                 {StatType::kQueueId, int64_t{0}},
                 {StatType::kDeviceOrdinal, int64_t{0}}});
-  CreateXEvent(&host_plane_builder, &main_thread, "DoEnqueueProgram", 101, 2,
+  CreateXEvent(&host_plane_builder, &main_thread,
+               HostEventType::kDoEnqueueProgram, 101, 2,
                {{StatType::kRunId, int64_t{3}},
                 {StatType::kQueueId, int64_t{0}},
                 {StatType::kDeviceOrdinal, int64_t{0}}});
-  CreateXEvent(&host_plane_builder, &main_thread, "DoEnqueueProgram", 103, 2,
+  CreateXEvent(&host_plane_builder, &main_thread,
+               HostEventType::kDoEnqueueProgram, 103, 2,
                {{StatType::kRunId, int64_t{4}},
                 {StatType::kQueueId, int64_t{0}},
                 {StatType::kDeviceOrdinal, int64_t{0}}});
-  CreateXEvent(&host_plane_builder, &main_thread, "DoEnqueueProgram", 105, 4,
+  CreateXEvent(&host_plane_builder, &main_thread,
+               HostEventType::kDoEnqueueProgram, 105, 4,
                {{StatType::kRunId, int64_t{5}},
                 {StatType::kQueueId, int64_t{0}},
                 {StatType::kDeviceOrdinal, int64_t{0}}});
@@ -796,13 +804,13 @@ TEST(GroupTPUEventsTest, MergeHostStepsTest) {
       &device_plane_builder, &module_line, "jit_something(1)", 1000, 10,
       {{StatType::kRunId, int64_t{2}}, {StatType::kQueueId, int64_t{0}}});
   CreateXEvent(
-      &device_plane_builder, &module_line, "jit_something(2)", 1015, 100,
+      &device_plane_builder, &module_line, "jit_something(1)", 1015, 100,
       {{StatType::kRunId, int64_t{3}}, {StatType::kQueueId, int64_t{0}}});
   CreateXEvent(
-      &device_plane_builder, &module_line, "jit_something(3)", 1125, 50,
+      &device_plane_builder, &module_line, "jit_something(1)", 1125, 50,
       {{StatType::kRunId, int64_t{4}}, {StatType::kQueueId, int64_t{0}}});
   CreateXEvent(
-      &device_plane_builder, &module_line, "jit_something(4)", 1180, 25,
+      &device_plane_builder, &module_line, "jit_something(1)", 1180, 25,
       {{StatType::kRunId, int64_t{5}}, {StatType::kQueueId, int64_t{0}}});
   auto step_line = device_plane_builder.GetOrCreateLine(1);
   step_line.SetName(kStepLineName);
@@ -818,11 +826,97 @@ TEST(GroupTPUEventsTest, MergeHostStepsTest) {
   CreateXEvent(&device_plane_builder, &step_line, "3", 1180, 25,
                {{StatType::kDeviceOffsetPs, int64_t{1180}},
                 {StatType::kDeviceDurationPs, int64_t{25}}});
+  auto op_line = device_plane_builder.GetOrCreateLine(2);
+  op_line.SetName(kXlaOpLineName);
+  CreateXEvent(&device_plane_builder, &op_line, "offload.start.1", 1000, 5,
+               {{StatType::kTcOffloadStartId, int64_t{1}},
+                {StatType::kOffloadCoreId, int64_t{0}},
+                {StatType::kOffloadExecutionIndex, int64_t{0}},
+                {StatType::kProducerId, int64_t{1}},
+                {StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kScOffload)}});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.done.1", 1005, 5, {});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.start.1", 1015, 5,
+               {{StatType::kTcOffloadStartId, int64_t{1}},
+                {StatType::kOffloadCoreId, int64_t{0}},
+                {StatType::kOffloadExecutionIndex, int64_t{1}},
+                {StatType::kProducerId, int64_t{2}},
+                {StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kScOffload)}});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.done.1", 1020, 95, {});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.start.1", 1125, 5,
+               {{StatType::kTcOffloadStartId, int64_t{1}},
+                {StatType::kOffloadCoreId, int64_t{0}},
+                {StatType::kOffloadExecutionIndex, int64_t{2}},
+                {StatType::kProducerId, int64_t{3}},
+                {StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kScOffload)}});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.done.1", 1130, 45, {});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.start.1", 1180, 5,
+               {{StatType::kTcOffloadStartId, int64_t{1}},
+                {StatType::kOffloadCoreId, int64_t{0}},
+                {StatType::kOffloadExecutionIndex, int64_t{3}},
+                {StatType::kProducerId, int64_t{4}},
+                {StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kScOffload)}});
+  CreateXEvent(&device_plane_builder, &op_line, "offload.done.1", 1185, 20, {});
+
+  // TPU SparseCore Plane (device_id 0, core_type 1)
+  XPlane* sparsecore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0, 0);
+  XPlaneBuilder sc_plane_builder(sparsecore_plane);
+  sc_plane_builder.ReserveLines(3);
+
+  auto sc_module_line = sc_plane_builder.GetOrCreateLine(0);
+  sc_module_line.SetName(kSparseCoreModuleLineName);
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(1)", 1001, 8,
+               {
+                   {StatType::kTcOffloadStartId, int64_t{1}},
+               });
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(1)", 1016, 98,
+               {
+                   {StatType::kTcOffloadStartId, int64_t{1}},
+               });
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(1)", 1126, 48,
+               {
+                   {StatType::kTcOffloadStartId, int64_t{1}},
+               });
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(1)", 1181, 23,
+               {
+                   {StatType::kTcOffloadStartId, int64_t{1}},
+               });
+
+  auto sc_step_line = sc_plane_builder.GetOrCreateLine(1);
+  sc_step_line.SetName(kSparseCoreStepLineName);
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 0", 1000, 10, {});
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 1", 1015, 100, {});
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 2", 1125, 50, {});
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 3", 1180, 25, {});
+
+  auto sc_op_line = sc_plane_builder.GetOrCreateLine(2);
+  sc_op_line.SetName(kSparseCoreOpLineName);
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "sc_op_1", 1001, 8,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{1}}});
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "sc_op_1", 1016, 98,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{2}}});
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "sc_op_1", 1126, 48,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{3}}});
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "sc_op_1", 1181, 23,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{4}}});
+
   // Make sure to preprocess so that the Runtime events have a Producer/Consumer
   // event set created.
   PreprocessXSpace(&space);
   EventForest event_forest;
-  GroupTpuEventsOSS(&space, {device_plane}, &event_forest);
+  GroupTpuEventsOSS(&space, {device_plane, sparsecore_plane}, &event_forest);
+  EXPECT_EQ(event_forest.GetGroupMetadataMap().size(), 1);
   auto visitor = CreateTfXPlaneVisitor(device_plane);
   bool step_line_found = false;
   visitor.ForEachLine([&](const XLineVisitor& line) {
@@ -838,6 +932,414 @@ TEST(GroupTPUEventsTest, MergeHostStepsTest) {
     EXPECT_EQ(GetDeviceEventTimespan(step_event).end_ps(), 1205);
   });
   EXPECT_TRUE(step_line_found);
+
+  auto sc_visitor = CreateTfXPlaneVisitor(sparsecore_plane);
+  bool sc_step_line_found = false;
+  sc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    if (line.Name() != kSparseCoreStepLineName) {
+      return;
+    }
+    sc_step_line_found = true;
+    EXPECT_EQ(line.NumEvents(), 1);
+    auto step_event = line.GetFirstEvent();
+    EXPECT_EQ(step_event.GetTimespan().begin_ps(), 1000);
+    EXPECT_EQ(step_event.GetTimespan().end_ps(), 1205);
+    EXPECT_EQ(GetDeviceEventTimespan(step_event).begin_ps(), 1000);
+    EXPECT_EQ(GetDeviceEventTimespan(step_event).end_ps(), 1205);
+  });
+  EXPECT_TRUE(sc_step_line_found);
+}
+
+TEST(GroupTPUEventsTest, MergeOffloadedScSteps) {
+  tensorflow::profiler::XSpace space;
+  // No host plane in this test.
+
+  // TPU TensorCore Plane (device_id 0)
+  XPlane* tensorcore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0);
+  XPlaneBuilder tc_plane_builder(tensorcore_plane);
+  tc_plane_builder.ReserveLines(3);
+
+  auto tc_module_line = tc_plane_builder.GetOrCreateLine(0);
+  tc_module_line.SetName(kXlaModuleLineName);
+  // The module event is strictly within the step event (1000-2000).
+  CreateXEvent(&tc_plane_builder, &tc_module_line, "jit_tc_module", 1010, 980,
+               {{StatType::kRunId, int64_t{1}}});
+
+  auto tc_step_line = tc_plane_builder.GetOrCreateLine(1);
+  tc_step_line.SetName(kStepLineName);
+  CreateXEvent(&tc_plane_builder, &tc_step_line, "tc step 0", 1000, 1000,
+               {{StatType::kDeviceOffsetPs, int64_t{1000}},
+                {StatType::kDeviceDurationPs, int64_t{1000}}});
+
+  auto tc_op_line = tc_plane_builder.GetOrCreateLine(2);
+  tc_op_line.SetName(kXlaOpLineName);
+  // First offload
+  CreateXEvent(&tc_plane_builder, &tc_op_line, "offload.start.1", 1050, 50,
+               {{StatType::kTcOffloadStartId, int64_t{1}},
+                {StatType::kOffloadCoreId, int64_t{0}},
+                {StatType::kOffloadExecutionIndex, int64_t{0}},
+                {StatType::kProducerId, int64_t{1}},
+                {StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kScOffload)}});
+  CreateXEvent(&tc_plane_builder, &tc_op_line, "offload.done.1", 1100, 400, {});
+  // Second offload
+  CreateXEvent(&tc_plane_builder, &tc_op_line, "offload.start.1", 1550, 50,
+               {{StatType::kTcOffloadStartId, int64_t{1}},
+                {StatType::kOffloadCoreId, int64_t{0}},
+                {StatType::kOffloadExecutionIndex, int64_t{1}},
+                {StatType::kProducerId, int64_t{2}},
+                {StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kScOffload)}});
+  CreateXEvent(&tc_plane_builder, &tc_op_line, "offload.done.1", 1600, 400, {});
+
+  // TPU SparseCore Plane (device_id 0, core_type 1)
+  XPlane* sparsecore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0, 1);
+  XPlaneBuilder sc_plane_builder(sparsecore_plane);
+  sc_plane_builder.ReserveLines(3);
+
+  auto sc_module_line = sc_plane_builder.GetOrCreateLine(0);
+  sc_module_line.SetName(kSparseCoreModuleLineName);
+  // These module events are strictly within their respective step events.
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(1)", 1101, 398,
+               {{StatType::kTcOffloadStartId, int64_t{1}}});
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(1)", 1601, 398,
+               {{StatType::kTcOffloadStartId, int64_t{1}}});
+
+  auto sc_step_line = sc_plane_builder.GetOrCreateLine(1);
+  sc_step_line.SetName(kSparseCoreStepLineName);
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 0", 1100, 400, {});
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 1", 1600, 400, {});
+
+  auto sc_op_line = sc_plane_builder.GetOrCreateLine(2);
+  sc_op_line.SetName(kSparseCoreOpLineName);
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "sc_op_1a", 1110, 100,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{1}}});
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "sc_op_2a", 1610, 100,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{2}}});
+
+  // Make sure to preprocess so that the Runtime events have a Producer/Consumer
+  // event set created.
+  PreprocessXSpace(&space);
+  EventForest event_forest;
+  GroupTpuEventsOSS(&space, {tensorcore_plane, sparsecore_plane},
+                    &event_forest);
+
+  // We expect only one group as all events are linked.
+  const GroupMetadataMap& group_metadata_map =
+      event_forest.GetGroupMetadataMap();
+  EXPECT_EQ(group_metadata_map.size(), 1);
+  const int64_t expected_group_id = group_metadata_map.begin()->first;
+
+  // Check the merged TensorCore step event.
+  auto tc_visitor = CreateTfXPlaneVisitor(tensorcore_plane);
+  tc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(absl::StrCat(tensorcore_plane->name(), ": ", line.Name(),
+                                " ", event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      EXPECT_EQ(group_id_stat->IntOrUintValue(), expected_group_id);
+    });
+  });
+
+  // Check the merged SparseCore step event.
+  auto sc_visitor = CreateTfXPlaneVisitor(sparsecore_plane);
+  sc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(absl::StrCat(sparsecore_plane->name(), ": ", line.Name(),
+                                " ", event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      EXPECT_EQ(group_id_stat->IntOrUintValue(), expected_group_id);
+    });
+    if (line.Name() == kSparseCoreStepLineName) {
+      EXPECT_EQ(line.NumEvents(), 1);
+      auto step_event = line.GetFirstEvent();
+      EXPECT_EQ(step_event.GetTimespan().begin_ps(), 1100);
+      EXPECT_EQ(step_event.GetTimespan().end_ps(), 2000);
+    }
+  });
+}
+
+TEST(GroupTPUEventsTest, GroupOffloadedSparseCoreModulesHostLoopTest) {
+  tensorflow::profiler::XSpace space;
+  tensorflow::profiler::XPlane* host_plane = GetOrCreateHostXPlane(&space);
+  XPlaneBuilder host_plane_builder(host_plane);
+  host_plane_builder.ReserveLines(1);
+  auto main_thread = host_plane_builder.GetOrCreateLine(0);
+  main_thread.SetName("main");
+
+  CreateXEvent(&host_plane_builder, &main_thread, "host step 0", 0, 200,
+               {{StatType::kIsRoot, int64_t{1}}});
+  // Host event for TensorCore.
+  CreateXEvent(&host_plane_builder, &main_thread, "DoEnqueueProgram", 100, 10,
+               {{StatType::kRunId, int64_t{1}},
+                {StatType::kQueueId, int64_t{0}},
+                {StatType::kReplicaId, int64_t{0}},
+                {StatType::kDeviceOrdinal, int64_t{0}},
+                {StatType::kCoreType, int64_t{0}}});  // kTpuTensorCore
+
+  // TPU TensorCore Plane (device_id 0)
+  XPlane* tensorcore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0);
+  XPlaneBuilder tc_plane_builder(tensorcore_plane);
+  tc_plane_builder.ReserveLines(3);
+
+  auto tc_module_line = tc_plane_builder.GetOrCreateLine(0);
+  tc_module_line.SetName(kXlaModuleLineName);
+  CreateXEvent(&tc_plane_builder, &tc_module_line, "jit(123)", 1000, 1000,
+               {{StatType::kRunId, int64_t{1}},
+                {StatType::kQueueId, int64_t{0}},
+                {StatType::kReplicaId, int64_t{0}},
+                {StatType::kCoreType, int64_t{0}}});
+
+  auto tc_step_line = tc_plane_builder.GetOrCreateLine(1);
+  tc_step_line.SetName(kStepLineName);
+  CreateXEvent(&tc_plane_builder, &tc_step_line, "tc step 0", 1000, 1000, {});
+
+  auto tc_op_line = tc_plane_builder.GetOrCreateLine(2);
+  tc_op_line.SetName(kXlaOpLineName);
+  CreateXEvent(
+      &tc_plane_builder, &tc_op_line, "offload_start", 1050, 100,
+      {{StatType::kTcOffloadStartId, int64_t{123}},
+       {StatType::kOffloadCoreId, int64_t{0}},
+       {StatType::kOffloadExecutionIndex, int64_t{0}},
+       {StatType::kProducerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kProducerId, int64_t{1}}});
+  CreateXEvent(&tc_plane_builder, &tc_op_line, "offload_done", 1200, 750, {});
+
+  // TPU SparseCore Plane (device_id 1)
+  XPlane* sparsecore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0, 0);
+  XPlaneBuilder sc_plane_builder(sparsecore_plane);
+  sc_plane_builder.ReserveLines(3);
+
+  auto sc_module_line = sc_plane_builder.GetOrCreateLine(0);
+  sc_module_line.SetName(kSparseCoreModuleLineName);
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(123)", 1100, 800,
+               {{StatType::kTcOffloadStartId, int64_t{123}}});
+
+  auto sc_step_line = sc_plane_builder.GetOrCreateLine(1);
+  sc_step_line.SetName(kSparseCoreStepLineName);
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 0", 1100, 800, {});
+
+  auto sc_op_line = sc_plane_builder.GetOrCreateLine(2);
+  sc_op_line.SetName(kSparseCoreOpLineName);
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "offloaded_start.copy", 1100, 10,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{1}}});
+  CreateXEvent(&sc_plane_builder, &sc_op_line, "offloaded_done.copy", 1120, 180,
+               {});
+
+  // Preprocess to create Producer/Consumer events.
+  PreprocessXSpace(&space);
+  EventForest event_forest;
+  GroupTpuEventsOSS(&space, {tensorcore_plane, sparsecore_plane},
+                    &event_forest);
+
+  // We expect one group, where all events are grouped under the same group.
+  EXPECT_EQ(event_forest.GetGroupMetadataMap().size(), 1);
+  const int64_t expected_group_id =
+      event_forest.GetGroupMetadataMap().begin()->first;
+
+  // Check Host events.
+  XPlaneVisitor host_visitor = CreateTfXPlaneVisitor(host_plane);
+  int host_event_idx = 0;
+  host_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(absl::StrCat(host_plane->name(), ": ", line.Name(), " ",
+                                event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      EXPECT_EQ(group_id_stat->IntValue(), expected_group_id);
+      host_event_idx++;
+    });
+  });
+  EXPECT_EQ(host_event_idx, 2);
+
+  // Check TensorCore events.
+  XPlaneVisitor tc_visitor = CreateTfXPlaneVisitor(tensorcore_plane);
+  tc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(absl::StrCat(tensorcore_plane->name(), ": ",
+
+                                line.Name(), " ", event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      // TensorCore events are associated with run_id 1, likely getting group_id
+      // 0.
+      EXPECT_EQ(group_id_stat->IntValue(), expected_group_id);
+    });
+  });
+
+  // Check SparseCore events.
+  XPlaneVisitor sc_visitor = CreateTfXPlaneVisitor(sparsecore_plane);
+  sc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(
+          absl::StrCat(sparsecore_plane->name(), ": ",
+                       ParseDeviceOrdinal(sparsecore_plane->name()).value(),
+                       " ", line.Name(), " ", event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      // SparseCore events are associated with run_id 2, likely getting
+      // group_id 1.
+      EXPECT_EQ(group_id_stat->IntValue(), expected_group_id);
+    });
+  });
+}
+
+TEST(GroupTPUEventsTest, GroupOffloadedSparseCoreModulesDeviceLoopTest) {
+  tensorflow::profiler::XSpace space;
+  tensorflow::profiler::XPlane* host_plane = GetOrCreateHostXPlane(&space);
+  XPlaneBuilder host_plane_builder(host_plane);
+  host_plane_builder.ReserveLines(2);
+  auto main_thread = host_plane_builder.GetOrCreateLine(0);
+  main_thread.SetName("main");
+
+  // Tf Loop event
+  CreateXEvent(
+      &host_plane_builder, &main_thread, HostEventType::kExecutorStateProcess,
+      100, 10,
+      {{StatType::kStepId, int64_t{1}}, {StatType::kIterNum, int64_t{99}}});
+  CreateXEvent(&host_plane_builder, &main_thread,
+               HostEventType::kTpuSystemExecute, 100, 9,
+               {{StatType::kProducerType,
+                 static_cast<int64_t>(ContextType::kTfrtTpuRuntime)},
+                {StatType::kProducerId, int64_t{1}}});
+
+  auto enqueue_thread = host_plane_builder.GetOrCreateLine(1);
+  enqueue_thread.SetName("tf_enqueue");
+  CreateXEvent(&host_plane_builder, &enqueue_thread,
+               "tpu::System::Execute=>IssueSequencedEvent", 102, 10,
+               {{StatType::kConsumerType,
+                 static_cast<int64_t>(ContextType::kTfrtTpuRuntime)},
+                {StatType::kConsumerId, int64_t{1}}});
+  CreateXEvent(&host_plane_builder, &enqueue_thread,
+               HostEventType::kDoEnqueueProgram, 103, 8,
+               {{StatType::kRunId, int64_t{1}},
+                {StatType::kQueueId, int64_t{0}},
+                {StatType::kCoreType, int64_t{0}},
+                {StatType::kDeviceOrdinal, int64_t{0}}});
+
+  // TPU TensorCore Plane (device_id 0)
+  XPlane* tensorcore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0);
+  XPlaneBuilder tc_plane_builder(tensorcore_plane);
+  tc_plane_builder.ReserveLines(3);
+
+  auto tc_module_line = tc_plane_builder.GetOrCreateLine(0);
+  tc_module_line.SetName(kXlaModuleLineName);
+  // The module event encompasses the step event's time range (1000-2000).
+  CreateXEvent(&tc_plane_builder, &tc_module_line, "jit(123)", 900, 1200,
+               {{StatType::kRunId, int64_t{1}},
+                {StatType::kQueueId, int64_t{0}},
+                {StatType::kReplicaId, int64_t{0}},
+                {StatType::kCoreType, int64_t{0}}});
+
+  auto tc_step_line = tc_plane_builder.GetOrCreateLine(1);
+  tc_step_line.SetName(kStepLineName);
+  CreateXEvent(&tc_plane_builder, &tc_step_line, "tc step 0", 1000, 1000, {});
+
+  auto tc_op_line = tc_plane_builder.GetOrCreateLine(2);
+  tc_op_line.SetName(kXlaOpLineName);
+  CreateXEvent(
+      &tc_plane_builder, &tc_op_line, "offload_start", 1050, 100,
+      {{StatType::kTcOffloadStartId, int64_t{123}},
+       {StatType::kOffloadCoreId, int64_t{0}},
+       {StatType::kOffloadExecutionIndex, int64_t{0}},
+       {StatType::kProducerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kProducerId, int64_t{1}}});
+  CreateXEvent(&tc_plane_builder, &tc_op_line, "offload_done", 1200, 750, {});
+
+  // TPU SparseCore Plane (device_id 1)
+  XPlane* sparsecore_plane = GetOrCreateTpuXPlane(&space, 0, "TPUv4", 0, 0, 0);
+  XPlaneBuilder sc_plane_builder(sparsecore_plane);
+  sc_plane_builder.ReserveLines(3);
+
+  auto sc_module_line = sc_plane_builder.GetOrCreateLine(0);
+  sc_module_line.SetName(kSparseCoreModuleLineName);
+  CreateXEvent(&sc_plane_builder, &sc_module_line, "offloaded(123)", 1100, 800,
+               {{StatType::kTcOffloadStartId, int64_t{123}}});
+
+  auto sc_step_line = sc_plane_builder.GetOrCreateLine(1);
+  sc_step_line.SetName(kSparseCoreStepLineName);
+  CreateXEvent(&sc_plane_builder, &sc_step_line, "sc step 0", 1100, 800, {});
+
+  auto sc_op_line = sc_plane_builder.GetOrCreateLine(2);
+  sc_op_line.SetName(kSparseCoreOpLineName);
+  CreateXEvent(
+      &sc_plane_builder, &sc_op_line, "offloaded_start.copy", 1100, 100,
+      {{StatType::kConsumerType, static_cast<int64_t>(ContextType::kScOffload)},
+       {StatType::kConsumerId, int64_t{1}}});
+  CreateXEvent(&sc_plane_builder, &sc_op_line, "offloaded_done.copy", 1300, 100,
+               {});
+
+  // Preprocess to create Producer/Consumer events.
+  PreprocessXSpace(&space);
+  EventForest event_forest;
+  GroupTpuEventsOSS(&space, {tensorcore_plane, sparsecore_plane},
+                    &event_forest);
+
+  // We expect two groups, one for the host events and one for the device
+  // events.
+  EXPECT_EQ(event_forest.GetGroupMetadataMap().size(), 2);
+
+  // Check Host events.
+  XPlaneVisitor host_visitor = CreateTfXPlaneVisitor(host_plane);
+  int host_event_idx = 0;
+  host_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(absl::StrCat(host_plane->name(), ": ", line.Name(), " ",
+                                event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      EXPECT_EQ(group_id_stat->IntValue(), 0);
+      host_event_idx++;
+    });
+  });
+  EXPECT_EQ(host_event_idx, 4);
+
+  // Check TensorCore events.
+  XPlaneVisitor tc_visitor = CreateTfXPlaneVisitor(tensorcore_plane);
+  tc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      if (line.Name() == kXlaModuleLineName) {
+        // The module event encompasses multiple steps, so it cannot be grouped.
+        return;
+      }
+      SCOPED_TRACE(absl::StrCat(tensorcore_plane->name(), ": ",
+
+                                line.Name(), " ", event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      EXPECT_EQ(group_id_stat->IntValue(), 1);
+    });
+  });
+
+  // Check SparseCore events.
+  XPlaneVisitor sc_visitor = CreateTfXPlaneVisitor(sparsecore_plane);
+  sc_visitor.ForEachLine([&](const XLineVisitor& line) {
+    line.ForEachEvent([&](const XEventVisitor& event) {
+      SCOPED_TRACE(
+          absl::StrCat(sparsecore_plane->name(), ": ",
+                       ParseDeviceOrdinal(sparsecore_plane->name()).value(),
+                       " ", line.Name(), " ", event.Name()));
+      std::optional<XStatVisitor> group_id_stat =
+          event.GetStat(StatType::kGroupId);
+      ASSERT_TRUE(group_id_stat.has_value());
+      EXPECT_EQ(group_id_stat->IntValue(), 1);
+    });
+  });
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.cc b/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.cc
index b693dcfebe9579..f95fc81f1769ed 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.cc
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.cc
@@ -16,11 +16,13 @@ limitations under the License.
 
 #include <cstdint>
 #include <initializer_list>
+#include <optional>
 #include <string>
 #include <utility>
 #include <variant>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "xla/tsl/platform/types.h"
 #include "xla/tsl/profiler/utils/xplane_builder.h"
@@ -58,8 +60,13 @@ XPlane* GetOrCreateHostXPlane(XSpace* space) {
 XPlane* GetOrCreateTpuXPlane(XSpace* space, int32_t device_ordinal,
                              absl::string_view device_type,
                              double peak_tera_flops_per_second,
-                             double peak_hbm_bw_gigabytes_per_second) {
+                             double peak_hbm_bw_gigabytes_per_second,
+                             std::optional<int32_t> sparsecore_core_id) {
   std::string name = TpuPlaneName(device_ordinal);
+  if (sparsecore_core_id.has_value()) {
+    name = std::string(
+        absl::StrCat(name, " SparseCore ", sparsecore_core_id.value()));
+  }
   XPlane* xplane = FindOrAddMutablePlaneWithName(space, name);
   XPlaneBuilder builder(xplane);
   builder.AddStatValue(*builder.GetOrCreateStatMetadata(
diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.h b/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.h
index f7292594df0af0..7d2a38c2ec1e4f 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.h
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_test_utils.h
@@ -15,7 +15,9 @@ limitations under the License.
 #ifndef XLA_TSL_PROFILER_UTILS_XPLANE_TEST_UTILS_H_
 #define XLA_TSL_PROFILER_UTILS_XPLANE_TEST_UTILS_H_
 
+#include <cstdint>
 #include <initializer_list>
+#include <optional>
 #include <utility>
 #include <variant>
 
@@ -34,10 +36,10 @@ XPlane* GetOrCreateHostXPlane(XSpace* space);
 
 XPlane* GetOrCreateGpuXPlane(XSpace* space, int32_t device_ordinal);
 
-XPlane* GetOrCreateTpuXPlane(XSpace* space, int32_t device_ordinal,
-                             absl::string_view device_type,
-                             double peak_tera_flops_per_second,
-                             double peak_hbm_bw_gigabytes_per_second);
+XPlane* GetOrCreateTpuXPlane(
+    XSpace* space, int32_t device_ordinal, absl::string_view device_type,
+    double peak_tera_flops_per_second, double peak_hbm_bw_gigabytes_per_second,
+    std::optional<int32_t> sparsecore_core_id = std::nullopt);
 
 void CreateXEvent(
     XPlaneBuilder* plane_builder, XLineBuilder* line_builder,
diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_visitor.h b/third_party/xla/xla/tsl/profiler/utils/xplane_visitor.h
index b2eb5af4ab47b3..f5f8b0fdfaf0ad 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_visitor.h
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_visitor.h
@@ -169,6 +169,8 @@ class XEventVisitor : public XStatsOwner<XEvent> {
 
   absl::string_view Name() const { return metadata_->name(); }
 
+  absl::string_view LineName() const { return line_->name(); }
+
   std::optional<int64_t> Type() const { return type_; }
 
   bool HasDisplayName() const { return !metadata_->display_name().empty(); }

From 2175701ae9313dd73f0cebceb482ef941cb1f933 Mon Sep 17 00:00:00 2001
From: Sayan Saha <sayans@mathworks.com>
Date: Wed, 10 Dec 2025 20:02:55 -0500
Subject: [PATCH 151/753] [tosa] : Use QuantizedType signed info for
 legalization. (#105376)

---
 .../mlir/tosa/tests/tfl-to-tosa-pipeline.mlir | 33 +++++++++++++
 .../mlir/tosa/tests/tfl-unequal-ranks.mlir    |  9 ++++
 .../mlir/tosa/transforms/legalize_tfl.cc      | 47 ++++++++++---------
 .../mlir/tosa/transforms/legalize_utils.cc    | 34 +++++++++-----
 4 files changed, 90 insertions(+), 33 deletions(-)

diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
index 0b6dd410c57d9a..7e4573aa5a09e4 100644
--- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
+++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
@@ -2930,6 +2930,21 @@ func.func @test_relu_qi8(%arg0: tensor<13x21x3x!quant.uniform<i8:f32, 0.01568594
 
 // -----
 
+// CHECK-LABEL:   func.func @test_relu_qu16(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802>>) -> tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802>> {
+// CHECK:           %[[VAL_0:.*]] = "tosa.const"() <{values = dense<1073741824> : tensor<1xi32>}> : () -> tensor<1xi32>
+// CHECK:           %[[VAL_1:.*]] = "tosa.const"() <{values = dense<30> : tensor<1xi8>}> : () -> tensor<1xi8>
+// CHECK:           %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xui16>}> : () -> tensor<1xui16>
+// CHECK:           %[[RESCALE_0:.*]] = tosa.rescale %[[ARG0]], %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]] {input_unsigned = true, output_unsigned = true, per_channel = false, rounding_mode = SINGLE_ROUND, scale32 = true} : (tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802>>, tensor<1xi32>, tensor<1xi8>, tensor<1xui16>, tensor<1xui16>) -> tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802>>
+// CHECK:           %[[CLAMP_0:.*]] = tosa.clamp %[[RESCALE_0]] {max_val = 65535 : ui16, min_val = 0 : ui16} : (tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802>>) -> tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802>>
+// CHECK:           return %[[CLAMP_0]]
+func.func @test_relu_qu16(%arg0:tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802:0>>) -> (tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802:0>>) {
+    %0 = "tfl.relu"(%arg0) : (tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802:0>>) -> tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802:0>>
+    return %0 : tensor<?x112x112x32x!quant.uniform<u16:f32, 0.023529412224888802:0>>
+}
+
+// -----
+
 // CHECK-LABEL: test_relu0To1_qi8
 // CHECK-DAG: %[[VAL_0:.*]]: tensor<13x21x3x!quant.uniform<i8:f32, 0.015686025843024254:-1>>
 // CHECK-DAG: %[[VAL_1:.*]] = "tosa.const"() <{values = dense<2147449478> : tensor<1xi32>}>
@@ -3684,6 +3699,24 @@ func.func @test_conv2d_int8_input_variable_bias(%input: tensor<1x32x32x8x!quant.
 
 // -----
 
+// CHECK-LABEL:   func.func @test_conv2d_qu16(
+// CHECK-SAME:      %[[ARG0:.*]]: tensor<1x32x32x8x!quant.uniform<u16:f32, 1.000000e+00>>,
+// CHECK-SAME:      %[[ARG1:.*]]: tensor<3x3x8x16x!quant.uniform<i8:f32, 1.000000e+00>>) -> tensor<1x32x32x3x!quant.uniform<u16:f32, 1.000000e+00>> {
+// CHECK:           %[[VAL_0:.*]] = "tosa.const"() <{values = dense<14> : tensor<1xi8>}> : () -> tensor<1xi8>
+// CHECK:           %[[VAL_1:.*]] = "tosa.const"() <{values = dense<16384> : tensor<1xi16>}> : () -> tensor<1xi16>
+// CHECK:           %[[VAL_2:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi48>}> : () -> tensor<1xi48>
+// CHECK:           %[[VAL_3:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xui16>}> : () -> tensor<1xui16>
+// CHECK:           %[[VAL_4:.*]] = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+// CHECK:           %[[VAL_5:.*]] = tosa.conv2d %[[ARG0]], %[[ARG1]], %[[VAL_2]], %[[VAL_3]], %[[VAL_4]] {acc_type = i48, dilation = array<i64: 1, 1>, pad = array<i64: 1, 1, 3, 4>, stride = array<i64: 1, 1>} : (tensor<1x32x32x8x!quant.uniform<u16:f32, 1.000000e+00>>, tensor<3x3x8x16x!quant.uniform<i8:f32, 1.000000e+00>>, tensor<1xi48>, tensor<1xui16>, tensor<1xi8>) -> tensor<1x32x32x3xi48>
+// CHECK:           %[[RESCALE_0:.*]] = tosa.rescale %[[VAL_5]], %[[VAL_1]], %[[VAL_0]], %[[VAL_2]], %[[VAL_3]] {input_unsigned = true, output_unsigned = true, per_channel = false, rounding_mode = SINGLE_ROUND, scale32 = false} : (tensor<1x32x32x3xi48>, tensor<1xi16>, tensor<1xi8>, tensor<1xi48>, tensor<1xui16>) -> tensor<1x32x32x3x!quant.uniform<u16:f32, 1.000000e+00>>
+// CHECK:           return %[[RESCALE_0]]
+func.func @test_conv2d_qu16(%input: tensor<1x32x32x8x!quant.uniform<u16:f32, 1.0>>, %filter: tensor<3x3x8x16x!quant.uniform<i8:f32, 1.0>>) -> tensor<1x32x32x3x!quant.uniform<u16:f32, 1.0>> {
+  %bias = "tfl.no_value"() {value} : () -> none
+  %0 = "tfl.conv_2d"(%input, %filter, %bias) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform<u16:f32, 1.0>>, tensor<3x3x8x16x!quant.uniform<i8:f32, 1.0>>, none) -> tensor<1x32x32x3x!quant.uniform<u16:f32, 1.0>>
+  return %0 : tensor<1x32x32x3x!quant.uniform<u16:f32, 1.0>>
+}
+// -----
+
 // CHECK-LABEL: @test_squeeze
 func.func @test_squeeze(%arg0: tensor<2x1x3x1xf32>) -> tensor<2x3x1xf32> {
   // CHECK: tosa.reshape
diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir
index c4d07792549543..7805fdd9742f11 100644
--- a/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir
+++ b/tensorflow/compiler/mlir/tosa/tests/tfl-unequal-ranks.mlir
@@ -13,6 +13,15 @@ func.func @test_add(%arg0: tensor<192x192x3xf32>, %arg1: tensor<16x192x192x3xf32
 
 // -----
 
+// CHECK-LABEL: test_add_dynamic
+func.func @test_add_dynamic(%arg0: tensor<?x?x?xf32>, %arg1: tensor<5xf32>) -> tensor<?x?x5xf32> {
+    // CHECK: tosa.add
+    %1 = tfl.add(%arg0, %arg1) {fused_activation_function = "NONE"} : (tensor<?x?x?xf32>, tensor<5xf32>) -> tensor<?x?x5xf32>
+    func.return %1 : tensor<?x?x5xf32>
+}
+
+// -----
+
 // CHECK-LABEL: test_add_qi8
 func.func @test_add_qi8(%arg0: tensor<13x21x1x!quant.uniform<i8:f32, 0.01568480022251606:-1>>, %arg1: tensor<1x13x21x3x!quant.uniform<i8:f32, 0.015686055645346642:-1>>) -> tensor<1x13x21x3x!quant.uniform<i8:f32, 0.031318482011556625:-1>> {
   // CHECK: tosa.add
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
index 37a9f4234d992a..c7e49c5703fcd7 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
@@ -31,25 +31,26 @@ limitations under the License.
 #include <unordered_set>
 
 #include "llvm/ADT/ArrayRef.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
-#include "mlir/Dialect/Quant/IR/QuantTypes.h"  // from @llvm-project
-#include "mlir/Dialect/Tosa/IR/TosaOps.h"  // from @llvm-project
+#include "mlir/Dialect/Func/IR/FuncOps.h"             // from @llvm-project
+#include "mlir/Dialect/Quant/IR/QuantTypes.h"         // from @llvm-project
+#include "mlir/Dialect/Tosa/IR/TosaOps.h"             // from @llvm-project
 #include "mlir/Dialect/Tosa/Utils/ConversionUtils.h"  // from @llvm-project
-#include "mlir/IR/Block.h"  // from @llvm-project
+#include "mlir/Dialect/Tosa/Utils/QuantUtils.h"
+#include "mlir/IR/Block.h"                       // from @llvm-project
 #include "mlir/IR/BuiltinAttributeInterfaces.h"  // from @llvm-project
-#include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypeInterfaces.h"  // from @llvm-project
-#include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
-#include "mlir/IR/MLIRContext.h"  // from @llvm-project
-#include "mlir/IR/Matchers.h"  // from @llvm-project
-#include "mlir/IR/PatternMatch.h"  // from @llvm-project
-#include "mlir/IR/Region.h"  // from @llvm-project
-#include "mlir/IR/TypeUtilities.h"  // from @llvm-project
-#include "mlir/IR/Types.h"  // from @llvm-project
-#include "mlir/IR/Value.h"  // from @llvm-project
-#include "mlir/IR/ValueRange.h"  // from @llvm-project
-#include "mlir/Support/LLVM.h"  // from @llvm-project
-#include "mlir/Support/LogicalResult.h"  // from @llvm-project
+#include "mlir/IR/BuiltinAttributes.h"           // from @llvm-project
+#include "mlir/IR/BuiltinTypeInterfaces.h"       // from @llvm-project
+#include "mlir/IR/BuiltinTypes.h"                // from @llvm-project
+#include "mlir/IR/MLIRContext.h"                 // from @llvm-project
+#include "mlir/IR/Matchers.h"                    // from @llvm-project
+#include "mlir/IR/PatternMatch.h"                // from @llvm-project
+#include "mlir/IR/Region.h"                      // from @llvm-project
+#include "mlir/IR/TypeUtilities.h"               // from @llvm-project
+#include "mlir/IR/Types.h"                       // from @llvm-project
+#include "mlir/IR/Value.h"                       // from @llvm-project
+#include "mlir/IR/ValueRange.h"                  // from @llvm-project
+#include "mlir/Support/LLVM.h"                   // from @llvm-project
+#include "mlir/Support/LogicalResult.h"          // from @llvm-project
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h"
@@ -359,7 +360,8 @@ LogicalResult ConvertTFLReluOp::matchAndRewrite(
   auto element_type = input_type.getElementType();
   if (auto quant_type =
           dyn_cast<mlir::quant::UniformQuantizedType>(element_type)) {
-    element_type = quant_type.getStorageType();
+    element_type =
+        tosa::getStorageElementTypeFromQuantized(quant_type);
   }
 
   mlir::Attribute min_val, max_val;
@@ -429,7 +431,7 @@ LogicalResult ConvertTFLRelu1Op::matchAndRewrite(
   auto element_type = input_type.getElementType();
   if (auto quant_type =
           dyn_cast<mlir::quant::UniformQuantizedType>(element_type)) {
-    element_type = quant_type.getStorageType();
+    element_type = tosa::getStorageElementTypeFromQuantized(quant_type);
   }
 
   mlir::Attribute min_val, max_val;
@@ -496,7 +498,7 @@ LogicalResult ConvertTFLRelu0To1Op::matchAndRewrite(
   auto element_type = input_type.getElementType();
   if (auto quant_type =
           dyn_cast<mlir::quant::UniformQuantizedType>(element_type)) {
-    element_type = quant_type.getStorageType();
+    element_type = tosa::getStorageElementTypeFromQuantized(quant_type);
   }
 
   mlir::Attribute min_val, max_val;
@@ -563,7 +565,7 @@ LogicalResult ConvertTFLRelu6Op::matchAndRewrite(
   auto element_type = input_type.getElementType();
   if (auto quant_type =
           dyn_cast<mlir::quant::UniformQuantizedType>(element_type)) {
-    element_type = quant_type.getStorageType();
+    element_type = tosa::getStorageElementTypeFromQuantized(quant_type);
   }
 
   mlir::Attribute min_val, max_val;
@@ -1405,7 +1407,8 @@ RankedTensorType getTypeForSlice(RankedTensorType type, int64_t slice_dim,
         per_channel_qtype.getZeroPoints().begin() + offset,
         per_channel_qtype.getZeroPoints().begin() + offset + slice_size);
     auto output_per_channel_qtype = quant::UniformQuantizedPerAxisType::get(
-        per_channel_qtype.getFlags(), per_channel_qtype.getStorageType(),
+        per_channel_qtype.getFlags(),
+        tosa::getStorageElementTypeFromQuantized(per_channel_qtype),
         per_channel_qtype.getExpressedType(), output_scale_arr, output_zp_arr,
         per_channel_qtype.getQuantizedDimension(),
         per_channel_qtype.getStorageTypeMin(),
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
index d1f6772ae6c5fa..b1bde08cf929eb 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
@@ -54,11 +54,11 @@ mlir::TypeAttr getConvAccTypeAttr(PatternRewriter& rewriter,
   // in case of quantized types: get base element types
   if (auto qtype =
           llvm::dyn_cast<mlir::quant::UniformQuantizedType>(input_etype))
-    input_etype = qtype.getStorageType();
+    input_etype = tosa::getStorageElementTypeFromQuantized(qtype);
 
   if (auto qtype =
           llvm::dyn_cast<mlir::quant::UniformQuantizedType>(output_etype))
-    output_etype = qtype.getStorageType();
+    output_etype = tosa::getStorageElementTypeFromQuantized(qtype);
 
   // special cases: input_etype and output_etype are both f16 or bf16: use
   // acc_type=f32
@@ -355,8 +355,19 @@ Value buildRescale(PatternRewriter& rewriter, Operation* op,
                    int32_t scale_multiplier, int32_t scale_shift,
                    int64_t input_zp, int64_t output_zp, tosa::RoundingMode rounding_mode,
                    bool scale32) {
-  bool input_unsigned = input_val.getType().isUnsignedInteger();
-  bool output_unsigned = output_type.isUnsignedInteger();
+  bool input_unsigned, output_unsigned;
+  if (auto qtype = dyn_cast<mlir::quant::QuantizedType>(
+          cast<ShapedType>(input_val.getType()).getElementType())) {
+    input_unsigned = !qtype.isSigned();
+  } else {
+    input_unsigned = input_val.getType().isUnsignedInteger();
+  }
+  if (auto qtype =
+          dyn_cast<mlir::quant::QuantizedType>(output_type.getElementType())) {
+    output_unsigned = !qtype.isSigned();
+  } else {
+    output_unsigned = output_type.isUnsignedInteger();
+  }
   auto loc = op->getLoc();
   Value multiplier_val =
       buildRescaleMultiplier(scale32, rewriter, loc, {scale_multiplier});
@@ -486,8 +497,8 @@ Value buildRescaleOpConvOutput(PatternRewriter& rewriter, Operation* op,
   const auto rounding_mode_attr = tosa::RoundingModeAttr::get(
       rewriter.getContext(), rounding_mode);
 
-  bool input_unsigned = input_qtype.isUnsignedInteger();
-  bool output_unsigned = output_qtype.isUnsignedInteger();
+  bool input_unsigned = !input_qtype.isSigned();
+  bool output_unsigned = !output_qtype.isSigned();
 
   auto loc = op->getLoc();
   const Value empty_output_val = rewriter.create<tensor::EmptyOp>(
@@ -664,7 +675,7 @@ Value getTosaConstHardSwish8bitTable(PatternRewriter& rewriter, Operation* op,
                                 rewriter.getF32Type(), 1.0f, 0, -128, 127);
   auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype);
   auto storage_type = tensorflow::GetTypeFromTFTensorShape(
-      {256}, element_qtype.getStorageType());
+      {256}, getStorageElementTypeFromQuantized(element_qtype));
   auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table));
 
   auto const_op =
@@ -718,7 +729,8 @@ Value getTosaConstRsqrt8bitTable(PatternRewriter& rewriter, Operation* op,
                                 rewriter.getF32Type(), 1.0f, 0, -128, 127);
   auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype);
   auto storage_type = tensorflow::GetTypeFromTFTensorShape(
-      {256}, element_qtype.getStorageType());
+      {256},
+      tosa::getStorageElementTypeFromQuantized(element_qtype));
   auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table));
 
   auto const_op =
@@ -756,7 +768,7 @@ Value getTosaConst8bitTable(PatternRewriter& rewriter, Operation* op,
                                 rewriter.getF32Type(), 1.0f, 0, -128, 127);
   auto const_type = tensorflow::GetTypeFromTFTensorShape({256}, element_qtype);
   auto storage_type = tensorflow::GetTypeFromTFTensorShape(
-      {256}, element_qtype.getStorageType());
+      {256}, tosa::getStorageElementTypeFromQuantized(element_qtype));
   auto const_attr = DenseElementsAttr::get(storage_type, llvm::ArrayRef(table));
 
   auto const_op =
@@ -880,7 +892,7 @@ void getTosaConst32bitSoftmaxExpTable(PatternRewriter& rewriter, Operation* op,
                                 rewriter.getF32Type(), 1.0f, 0, -32768, 32767);
   auto const_type = tensorflow::GetTypeFromTFTensorShape({513}, element_qtype);
   auto storage_type = tensorflow::GetTypeFromTFTensorShape(
-      {513}, element_qtype.getStorageType());
+      {513}, tosa::getStorageElementTypeFromQuantized(element_qtype));
 
   auto first_const_attr =
       DenseElementsAttr::get(storage_type, llvm::ArrayRef(first_table));
@@ -1409,7 +1421,7 @@ Value reshapeScalarTo1D(PatternRewriter& rewriter, Location loc, Value value) {
     auto element_qtype = dyn_cast<quant::QuantizedType>(element_type);
     if (element_qtype) {
       storage_type = tensorflow::GetTypeFromTFTensorShape(
-          {1}, element_qtype.getStorageType());
+          {1}, tosa::getStorageElementTypeFromQuantized(element_qtype));
     }
 
     DenseElementsAttr const_attr;

From a44a6d3d75c4b8ee48ce8d165efd40f5a51cddba Mon Sep 17 00:00:00 2001
From: Georg Stefan Schmid <gschmid@nvidia.com>
Date: Wed, 10 Dec 2025 17:15:04 -0800
Subject: [PATCH 152/753] PR #34110: [pjrt] Add cuda_version attribute to GPU
 plugin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34110

📝 Summary of Changes
Adds a C API plugin attribute `cuda_version` that's set to the cuda runtime version the plugin was compiled against. The verison may be queried from JAX, e.g.,
```bash
$ python3 -c "import jax.extend; print(jax.extend.backend.get_backend().cuda_version)"
13000
```

🎯 Justification
This allows CUDA-version-specific packages to detect the CUDA version present in JAX. See also https://github.com/jax-ml/jax/issues/32729

🚀 Kind of Contribution
New Feature

📊 Benchmark (for Performance Improvements)
N/A

🧪 Unit Tests:
None

🧪 Execution Tests:
None

cc @skye
Copybara import of the project:

--
c8b9097739f4416a6e5939ae8102d7141ecc03a2 by Georg Stefan Schmid <gschmid@nvidia.com>:

[pjrt] Add cuda_version attribute to GPU plugin

Merging this change closes #34110

PiperOrigin-RevId: 842948422
---
 third_party/xla/xla/pjrt/c/BUILD              | 10 +++-
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc | 50 ++++++++++++++++++-
 2 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index 6ea1ff0213911f..203269226200e8 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -417,7 +417,11 @@ cc_library(
     name = "pjrt_c_api_gpu_internal",
     srcs = ["pjrt_c_api_gpu_internal.cc"],
     hdrs = ["pjrt_c_api_gpu_internal.h"],
-    local_defines = if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
+    local_defines = (
+        if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
+            "TENSORFLOW_USE_ROCM=1",
+        ])
+    ),
     visibility = ["//visibility:public"],
     deps = [
         ":pjrt_c_api_custom_partitioner_extension_hdrs",
@@ -462,7 +466,9 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
-    ],
+    ] + if_cuda_is_configured([
+        "@local_config_cuda//cuda:cuda_headers",
+    ]),
 )
 
 cc_library(
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
index 6f9d0d70e56df0..dd5fa7400a7ccb 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -59,6 +59,10 @@ limitations under the License.
 #include "xla/service/compiler.h"
 #include "xla/service/custom_call_target_registry.h"
 
+#if GOOGLE_CUDA
+#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
+#endif  // GOOGLE_CUDA
+
 namespace pjrt {
 namespace gpu_plugin {
 
@@ -322,6 +326,50 @@ PJRT_Error* PJRT_GpuDeviceTopology_Create(
   return nullptr;
 }
 
+#if GOOGLE_CUDA && defined(CUDART_VERSION)  // cuda
+namespace {
+
+const std::vector<PJRT_NamedValue>* MakeCudaPluginCAttributes() {
+  std::vector<PJRT_NamedValue>* attributes = new std::vector<PJRT_NamedValue>();
+  const std::vector<PJRT_NamedValue>& base_attributes =
+      pjrt::GetXlaPluginCAttributes();
+  attributes->reserve(base_attributes.size() + 1);
+  attributes->assign(base_attributes.begin(), base_attributes.end());
+  {
+    // Include the cuda_version attribute.
+    PJRT_NamedValue c_value;
+    c_value.struct_size = PJRT_NamedValue_STRUCT_SIZE;
+    c_value.extension_start = nullptr;
+    absl::string_view name = "cuda_version";
+    c_value.name = name.data();
+    c_value.name_size = name.size();
+    c_value.type = PJRT_NamedValue_Type::PJRT_NamedValue_kInt64;
+    c_value.int64_value = CUDART_VERSION;
+    c_value.value_size = 1;
+    attributes->push_back(c_value);
+  }
+  return attributes;
+}
+
+}  // namespace
+#endif
+
+PJRT_Error* PJRT_Plugin_Attributes_Gpu(PJRT_Plugin_Attributes_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Plugin_Attributes_Args", PJRT_Plugin_Attributes_Args_STRUCT_SIZE,
+      args->struct_size));
+#if GOOGLE_CUDA && defined(CUDART_VERSION)  // cuda
+  static const std::vector<PJRT_NamedValue>* attributes =
+      MakeCudaPluginCAttributes();
+#else
+  const std::vector<PJRT_NamedValue>* attributes =
+      &pjrt::GetXlaPluginCAttributes();
+#endif
+  args->num_attributes = attributes->size();
+  args->attributes = attributes->data();
+  return nullptr;
+}
+
 PLUGIN_Profiler_Api profiler_api{
     /*struct_size=*/PLUGIN_Profiler_Api_STRUCT_SIZE,
     /*priv=*/nullptr,
@@ -470,7 +518,7 @@ const PJRT_Api* GetGpuPjrtApi() {
       pjrt::gpu_plugin::PJRT_ExecuteContext_Create,
       pjrt::gpu_plugin::PJRT_GpuDeviceTopology_Create,
       pjrt::PJRT_Plugin_Initialize_NoOp, &cross_host_transfers_extension.base,
-      pjrt::PJRT_Plugin_Attributes_Xla);
+      pjrt::gpu_plugin::PJRT_Plugin_Attributes_Gpu);
 
   return &pjrt_api;
 }

From 1c9d10e7967d37c36c499ea0c030b0a41632e790 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 17:22:19 -0800
Subject: [PATCH 153/753] [XLA:GPU cleanup] Remove unused
 _xla_send_recv_validation frontend attribute.

PiperOrigin-RevId: 842951517
---
 .../backends/gpu/runtime/p2p_thunk_common.cc  |  31 -----
 .../xla/xla/service/collective_ops_utils.h    |  21 ---
 .../xla/xla/service/collective_pipeliner.cc   | 127 ------------------
 .../xla/service/collective_pipeliner_test.cc  |  28 ++--
 .../service/gpu/transforms/collectives/BUILD  |   1 -
 .../collective_permute_cycle_decomposer.cc    |  21 +--
 ...ollective_permute_cycle_decomposer_test.cc |  24 ++--
 .../double_buffer_loop_unrolling.cc           | 120 -----------------
 .../double_buffer_loop_unrolling_test.cc      |  53 ++++----
 third_party/xla/xla/service/hlo_verifier.cc   |   9 +-
 10 files changed, 47 insertions(+), 388 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc b/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc
index e0bfbc72f844de..52beaae671e837 100644
--- a/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc
@@ -127,30 +127,8 @@ P2PConfig GetP2PConfigForSendRecv(const HloSendRecvInstruction* instr,
   }
 
   std::vector<ReplicaGroup> replica_groups = statusor.value();
-  auto validation_it =
-      instr->frontend_attributes().map().find(kSendRecvValidationAttr);
   P2PConfig::ValidationKind validation_kind = P2PConfig::ValidationKind::kValid;
-  std::vector<ReplicaGroup> bounds;
-  if (validation_it != instr->frontend_attributes().map().end()) {
-    if (validation_it->second == "invalid") {
-      validation_kind = P2PConfig::ValidationKind::kInvalid;
-    } else {
-      auto statusor_bounds = ParseReplicaGroupsOnly(validation_it->second);
-      if (!statusor_bounds.ok() ||
-          statusor_bounds.value().size() != replica_groups.size()) {
-        // Ignore problems related to the source-target-pair string to avoid
-        // using absl::StatusOr for the return type.
-        return p2p_config;
-      }
-      validation_kind = P2PConfig::ValidationKind::kConditional;
-      bounds = statusor_bounds.value();
-    }
-  }
-
-  int i = 0;
   p2p_config.validation_kind = validation_kind;
-  P2PConfig::SourceTargetToBounds& source_target_to_bounds =
-      p2p_config.source_target_to_bounds;
   for (const ReplicaGroup& replica_group : replica_groups) {
     int64_t source = replica_group.replica_ids(0);
     int64_t target = replica_group.replica_ids(1);
@@ -159,15 +137,6 @@ P2PConfig GetP2PConfigForSendRecv(const HloSendRecvInstruction* instr,
         source;
     p2p_config.id_to_source_target.insert({source, {}}).first->second.target =
         target;
-
-    if (validation_kind == P2PConfig::ValidationKind::kConditional) {
-      const ReplicaGroup& bound = bounds[i];
-      int64_t lower = bound.replica_ids(0);
-      int64_t upper = bound.replica_ids(1);
-      source_target_to_bounds[std::make_pair(source, target)] =
-          std::make_pair(lower, upper);
-      i++;
-    }
   }
 
   return p2p_config;
diff --git a/third_party/xla/xla/service/collective_ops_utils.h b/third_party/xla/xla/service/collective_ops_utils.h
index 4c86d1fab8b06c..adaa3acc37ac49 100644
--- a/third_party/xla/xla/service/collective_ops_utils.h
+++ b/third_party/xla/xla/service/collective_ops_utils.h
@@ -296,27 +296,6 @@ inline bool MayPipelineSendRecvChannel(int64_t channel_id) {
 // Send or Recv. For all other cases, asynchronous stream kP2P0 is used.
 constexpr char kSendRecvPipelineAttr[] = "_xla_send_recv_pipeline";
 
-// This frontend attribute conveys the following information:
-// (1) _xla_send_recv_validation="invalid": the runtime should skip sending or
-// receiving data when the instruction is executed.
-// (2) the absent of the attribute: the runtime should faithfully perform the
-// Send or Recv operation when the instruction is executed.
-// (3) _xla_send_recv_validation={list-of-bounds}: the list-of-bounds
-// corresponds to the value of _xla_send_recv_source_target_pairs, and specifies
-// the execution instances for which the runtime should faithfully perform the
-// Send or Recv operation. Here is an example:
-//   _xla_send_recv_source_target_pairs={{0,1}, {1,2}}
-//   _xla_send_recv_validation={{2,3}, {5,7}}
-// The Send or Recv instruction with the above two attributes have the
-// following semantics:
-// The communication between device 0 and 1 will only send or receive data
-// for execution instances 2 and 3 of the instruction on devices 0 and 1.
-// For execution instances 0, 1, and beyond 3, the runtime should skip sending
-// or receiving any data.
-// Similarly, the communication between device 1 and 2 will only send or
-// receive data on execution instances 5 and 7.
-constexpr char kSendRecvValidationAttr[] = "_xla_send_recv_validation";
-
 // Attribute to indicate that collective operations should be issued on a
 // dedicated p2p stream. This is a hint and there is no guarantee that this will
 // be honored.
diff --git a/third_party/xla/xla/service/collective_pipeliner.cc b/third_party/xla/xla/service/collective_pipeliner.cc
index 553beccf215b3c..85a04feae59633 100644
--- a/third_party/xla/xla/service/collective_pipeliner.cc
+++ b/third_party/xla/xla/service/collective_pipeliner.cc
@@ -1636,24 +1636,6 @@ HloInstruction* CreateZero(HloComputation* comp, const Shape& shape,
 
 }  // namespace
 
-using Interval = std::pair<int64_t, int64_t>;
-using Intervals = std::vector<Interval>;
-// Parses a string "{{a,b},{c,d},{e,f},...}" to a vector of pairs.
-absl::StatusOr<std::vector<Interval>> ParseVectorOfPairs(
-    absl::string_view str) {
-  TF_ASSIGN_OR_RETURN(std::vector<ReplicaGroup> replica_groups,
-                      ParseReplicaGroupsOnly(str));
-  std::vector<Interval> res;
-  res.reserve(replica_groups.size());
-  for (const ReplicaGroup& replica_group : replica_groups) {
-    TF_RET_CHECK(replica_group.replica_ids_size() == 2);
-    int64_t a = replica_group.replica_ids(0);
-    int64_t b = replica_group.replica_ids(1);
-    res.emplace_back(a, b);
-  }
-  return res;
-}
-
 // If there is a collective-permute instruction with _xla_send_recv_validation
 // attribute in the computation, then during pipelining the loop trip count
 // changes. This function fixes the attribute for the cloned instruction.
@@ -1680,87 +1662,6 @@ absl::StatusOr<std::vector<Interval>> ParseVectorOfPairs(
 // attribute will become {{1,0},{1,0},{1,0},{0,0},{0,0}} and for the collective
 // inside while loop, this attribute will become
 // {{0,4},{0,4},{1,5},{1,5},{2,5}}.
-absl::Status UpdateSendRecvValidation(
-    HloInstruction* instruction, bool is_peeled,
-    collective_pipeliner_utils::PipeliningDirection direction,
-    const WhileLoopAnalysis& loop_analysis) {
-  if (instruction->opcode() != HloOpcode::kCollectivePermute) {
-    return absl::OkStatus();
-  }
-  const auto& frontend_attributes = instruction->frontend_attributes().map();
-  if (!frontend_attributes.contains(kSendRecvValidationAttr)) {
-    return absl::OkStatus();
-  }
-  VLOG(3) << "Trip count = "
-          << loop_analysis.GetLoopIterationCount()->GetSignedValue();
-  VLOG(3) << "Collective permute with _xla_send_recv_validation: "
-          << instruction->ToString();
-  TF_ASSIGN_OR_RETURN(
-      Intervals old_intervals,
-      ParseVectorOfPairs(frontend_attributes.at(kSendRecvValidationAttr)));
-
-  Intervals intervals;
-
-  if (direction == collective_pipeliner_utils::PipeliningDirection::kForward) {
-    // It is a forward pipelining which means that the peeled collective permute
-    // is before the loop. It should run once for the devices executing the
-    // first iteration and the internal collective permute now sees each
-    // original iteration decreased by one.
-    //
-    // peeled collective permute:
-    //      {{0,0} if {a,b} in old and a<=0<=b, {1,0} otherwise}
-    // internal collective permute: {{max(0, a-1), max(0, b-1)} | {a,b} in old}
-    for (auto [a, b] : old_intervals) {
-      if (is_peeled) {
-        if (a <= 0 && 0 <= b) {
-          intervals.push_back({0, 0});
-        } else {
-          intervals.push_back({1, 0});
-        }
-      } else {
-        intervals.push_back(
-            {std::max(int64_t{0}, a - 1), std::max(int64_t{0}, b - 1)});
-      }
-    }
-  } else if (direction ==
-             collective_pipeliner_utils::PipeliningDirection::kBackward) {
-    // It is a backward pipelining which means that the peeled collective is
-    // after the loop. It should run once for the devices executing the last
-    // iteration and the internal collective permute doesn't see the last
-    // iteration.
-    //
-    // peeled collective permute:
-    //      {{0,0} if {a,b} in old and a<=n<=b where n=#last_iteration, {1,0}
-    //      otherwise}
-    // interval collective permute:
-    //      {{a,min(n-1,b)} | {a,b} in old and n=#last_iteration}
-    auto trip_count_value = loop_analysis.GetLoopIterationCount();
-    if (!trip_count_value) {
-      return absl::InternalError(
-          "Unable to deduce loop trip count in collective pipeliner. This is "
-          "required for backward pipelining while fixing the "
-          "_xla_send_recv_validation attribute");
-    }
-    int64_t trip_count = trip_count_value->GetSignedValue();
-    int64_t last_iteration = trip_count - 1;
-    for (auto [a, b] : old_intervals) {
-      if (is_peeled) {
-        if (a <= last_iteration && last_iteration <= b) {
-          intervals.push_back({0, 0});
-        } else {
-          intervals.push_back({1, 0});
-        }
-      } else {
-        intervals.push_back({a, std::min(last_iteration - 1, b)});
-      }
-    }
-  }
-  hlo_instruction_utils::AddOrUpdateVectorOfPairsAsAttribute(
-      instruction, kSendRecvValidationAttr, intervals);
-  VLOG(3) << "Updated collective_permute with _xla_send_recv_validation: "
-          << instruction->ToString();
-  return absl::OkStatus();
-}
 
 // Function that does the work of pushing forward instructions that have been
 // determined that can be pipelined. Rough transformation:
@@ -1923,12 +1824,6 @@ absl::Status TransformLoopForward(
       TF_RETURN_IF_ERROR(UpdateInstructionSchedulingAnnotation(
           cloned_instr, next_scheduling_id, annotation_map));
     }
-    // TODO(b/398891001): Remove this once we have eliminated the need for
-    // send/recv validation.
-    TF_RETURN_IF_ERROR(UpdateSendRecvValidation(
-        cloned_instr, true,
-        collective_pipeliner_utils::PipeliningDirection::kForward,
-        loop_analysis));
     while_body_to_peeled[instr] = cloned_instr;
     auto output_it = is_output_instruction.find(instr);
     if (output_it != is_output_instruction.end()) {
@@ -1992,14 +1887,6 @@ absl::Status TransformLoopForward(
   HloComputation* new_while_body =
       loop_computation->parent()->AddEmbeddedComputation(
           while_body->CloneWithReplacements(&replacements));
-  for (HloInstruction* instruction : new_while_body->instructions()) {
-    // TODO(b/398891001): Remove this once we have eliminated the need for
-    // send/recv validation.
-    TF_RETURN_IF_ERROR(UpdateSendRecvValidation(
-        instruction, false,
-        collective_pipeliner_utils::PipeliningDirection::kForward,
-        loop_analysis));
-  }
   HloInstruction* new_init = loop_computation->AddInstruction(
       HloInstruction::CreateTuple(new_init_operands));
   while_body_to_peeled[while_body->root_instruction()] = new_init;
@@ -3150,14 +3037,6 @@ static absl::Status TransformLoopBackward(
   TF_RETURN_IF_ERROR(UpdateControlDependencies(while_body->root_instruction(),
                                                new_loop_root,
                                                while_body_replacement_map));
-  for (HloInstruction* instruction : new_while_body->instructions()) {
-    // TODO(b/398891001): Remove this once we have eliminated the need for
-    // send/recv validation.
-    TF_RETURN_IF_ERROR(UpdateSendRecvValidation(
-        instruction, false,
-        collective_pipeliner_utils::PipeliningDirection::kBackward,
-        loop_analysis));
-  }
   auto cond_builder =
       HloComputation::Builder(while_loop->while_condition()->name());
   HloInstruction* new_cond_param =
@@ -3244,12 +3123,6 @@ static absl::Status TransformLoopBackward(
                                update_collective_channel_id);
     TF_RETURN_IF_ERROR(UpdateInstructionSchedulingAnnotation(
         cloned_instr, next_scheduling_id, annotation_map));
-    // TODO(b/398891001): Remove this once we have eliminated the need for
-    // send/recv validation.
-    TF_RETURN_IF_ERROR(UpdateSendRecvValidation(
-        cloned_instr, true,
-        collective_pipeliner_utils::PipeliningDirection::kBackward,
-        loop_analysis));
     while_body_replacement_map[instr] = cloned_instr;
     if (instruction_is_output_it != is_output_instruction.end()) {
       for (int64_t index : instruction_is_output_it->second) {
diff --git a/third_party/xla/xla/service/collective_pipeliner_test.cc b/third_party/xla/xla/service/collective_pipeliner_test.cc
index 24e6ba9be4aa1b..7b01e0d9fdd61e 100644
--- a/third_party/xla/xla/service/collective_pipeliner_test.cc
+++ b/third_party/xla/xla/service/collective_pipeliner_test.cc
@@ -352,8 +352,7 @@ while_body {
   get-tuple-element.394 = s32[] get-tuple-element(param), index=0
   get-tuple-element.395 = bf16[3,8,128] get-tuple-element(param), index=1
   get-tuple-element.5 = bf16[3,8,128] get-tuple-element(param), index=2
-  cp = bf16[3,8,128] collective-permute(get-tuple-element.5), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,0}},
-                     frontend_attributes={_xla_send_recv_validation="{{0,6},{1,7},{2,8},{3,9},{4,10},{5,11},{6,12},{7,13}}"}
+  cp = bf16[3,8,128] collective-permute(get-tuple-element.5), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,0}}
   constant.2557 = s32[] constant(1)
   add.230 = s32[] add(get-tuple-element.394, constant.2557)
   constant.2559 = s32[] constant(14)
@@ -388,14 +387,14 @@ ENTRY entry {
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: HloModule
     // CHECK: %while_body
-    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}_xla_send_recv_validation={{[{]}}{0,5},{0,6},{1,7},{2,8},{3,9},{4,10},{5,11},{6,12}{{[}]}}
+    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}})
     // CHECK:   %[[dus:.+]] = {{.+}} dynamic-slice({{.*}}%[[cp]], {{.*}})
     // CHECK:   %[[mul:.+]] = {{.+}} multiply({{.*}}%[[dus]], {{.*}}%[[dus]])
     // CHECK:   %[[dus2:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[mul]], {{.*}})
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[dus2]], {{.*}})
     // CHECK: }
     // CHECK: ENTRY %entry
-    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}{_xla_send_recv_validation={{[{]}}{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}{{[}]}}
+    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}})
     // CHECK:   %[[ds:.+]] = {{.+}} dynamic-slice({{.*}}%[[cp]], {{.*}})
     // CHECK:   %[[mul:.+]] = {{.+}} multiply({{.*}}%[[ds]], {{.*}}%[[ds]])
     // CHECK:   %[[dus:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[mul]], {{.*}})
@@ -428,8 +427,7 @@ while_body {
   get-tuple-element.394 = s32[] get-tuple-element(param), index=0
   get-tuple-element.395 = bf16[3,8,128] get-tuple-element(param), index=1
   get-tuple-element.5 = bf16[3,8,128] get-tuple-element(param), index=2
-  cp = bf16[3,8,128] collective-permute(get-tuple-element.5), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}},
-                     frontend_attributes={_xla_send_recv_validation="{{7,13},{6,12},{5,11},{4,10},{3,9},{2,8},{1,7},{0,6}}"}
+  cp = bf16[3,8,128] collective-permute(get-tuple-element.5), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}}
   constant.2557 = s32[] constant(1)
   add.230 = s32[] add(get-tuple-element.394, constant.2557)
   constant.2559 = s32[] constant(14)
@@ -464,14 +462,14 @@ ENTRY entry {
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: HloModule
     // CHECK: %while_body
-    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}_xla_send_recv_validation={{[{]}}{6,12},{5,11},{4,10},{3,9},{2,8},{1,7},{0,6},{0,5}{{[}]}}
+    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}})
     // CHECK:   %[[dus:.+]] = {{.+}} dynamic-slice({{.*}}%[[cp]], {{.*}})
     // CHECK:   %[[mul:.+]] = {{.+}} multiply({{.*}}%[[dus]], {{.*}}%[[dus]])
     // CHECK:   %[[dus2:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[mul]], {{.*}})
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[dus2]], {{.*}})
     // CHECK: }
     // CHECK: ENTRY %entry
-    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}{_xla_send_recv_validation={{[{]}}{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0}{{[}]}}
+    // CHECK:   %[[cp:.+]] = {{.+}} collective-permute({{.+}})
     // CHECK:   %[[ds:.+]] = {{.+}} dynamic-slice({{.*}}%[[cp]], {{.*}})
     // CHECK:   %[[mul:.+]] = {{.+}} multiply({{.*}}%[[ds]], {{.*}}%[[ds]])
     // CHECK:   %[[dus:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[mul]], {{.*}})
@@ -1995,8 +1993,7 @@ while_body {
   get-tuple-element.394 = s32[] get-tuple-element(param), index=0
   get-tuple-element.395 = bf16[3,8,128] get-tuple-element(param), index=1
   get-tuple-element.k = bf16[3,1,2,128] get-tuple-element(param), index=2
-  cp = bf16[3,8,128] collective-permute(get-tuple-element.395), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,0}},
-                     frontend_attributes={_xla_send_recv_validation="{{0,6},{1,7},{2,8},{3,9},{4,10},{5,11},{6,12},{7,13}}"}
+  cp = bf16[3,8,128] collective-permute(get-tuple-element.395), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,0}}
   constant.2561 = s32[] constant(0)
   constant.2557 = s32[] constant(1)
   add.230 = s32[] add(get-tuple-element.394, constant.2557)
@@ -2041,13 +2038,13 @@ ENTRY entry {
   XLA_VLOG_LINES(1, module->ToString());
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
   // CHECK: %while_body
-  // CHECK: %[[cp:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}_xla_send_recv_validation={{[{]}}{0,6},{1,7},{2,8},{3,9},{4,10},{5,11},{6,12},{7,12}{{[}]}}}
+  // CHECK: %[[cp:.+]] = {{.+}} collective-permute({{.+}})
   // CHECK: %[[dus:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[cp]], {{.*}})
   // CHECK: ROOT {{.+}} = {{.+}} tuple({{.*}}%[[dus]], {{.*}})
   // CHECK: ENTRY %entry
   // CHECK: %[[while:.+]] = {{.+}} while({{.*}})
   // CHECK: %[[gte:.+]] = {{.+}} get-tuple-element({{.*}}%[[while]]), index=1
-  // CHECK: %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[gte]]), {{.+}}_xla_send_recv_validation={{[{]}}{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0}{{[}]}}
+  // CHECK: %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[gte]])
   // CHECK: %[[dus:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[cp2]], {{.*}})
   // CHECK: %[[tuple:.+]] = {{.+}} tuple({{.*}}%[[dus]], {{.*}})
   // CHECK: ROOT {{.+}} = {{.+}} get-tuple-element({{.*}}%[[tuple]]), index=1
@@ -2074,8 +2071,7 @@ while_body {
   get-tuple-element.394 = s32[] get-tuple-element(param), index=0
   get-tuple-element.395 = bf16[3,8,128] get-tuple-element(param), index=1
   get-tuple-element.k = bf16[3,1,2,128] get-tuple-element(param), index=2
-  cp = bf16[3,8,128] collective-permute(get-tuple-element.395), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}},
-                     frontend_attributes={_xla_send_recv_validation="{{7,13},{6,12},{5,11},{4,10},{3,9},{2,8},{1,7},{0,6}}"}
+  cp = bf16[3,8,128] collective-permute(get-tuple-element.395), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}}
   constant.2561 = s32[] constant(0)
   constant.2557 = s32[] constant(1)
   add.230 = s32[] add(get-tuple-element.394, constant.2557)
@@ -2120,13 +2116,13 @@ ENTRY entry {
   XLA_VLOG_LINES(1, module->ToString());
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
   // CHECK: %while_body
-  // CHECK: %[[cp:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}_xla_send_recv_validation={{[{]}}{7,12},{6,12},{5,11},{4,10},{3,9},{2,8},{1,7},{0,6}{{[}]}}}
+  // CHECK: %[[cp:.+]] = {{.+}} collective-permute({{.+}})
   // CHECK: %[[dus:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[cp]], {{.*}})
   // CHECK: ROOT {{.+}} = {{.+}} tuple({{.*}}%[[dus]], {{.*}})
   // CHECK: ENTRY %entry
   // CHECK: %[[while:.+]] = {{.+}} while({{.+}})
   // CHECK: %[[gte:.+]] = {{.+}} get-tuple-element({{.*}}%[[while]]), index=1
-  // CHECK: %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[gte]]), {{.+}}_xla_send_recv_validation={{[{]}}{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}{{[}]}}
+  // CHECK: %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[gte]])
   // CHECK: %[[dus:.+]] = {{.+}} dynamic-update-slice({{.*}}%[[cp2]], {{.*}})
   // CHECK: %[[tuple:.+]] = {{.+}} tuple({{.*}}%[[dus]], {{.*}})
   // CHECK: ROOT {{.+}} = {{.+}} get-tuple-element({{.*}}%[[tuple]]), index=1
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/BUILD b/third_party/xla/xla/service/gpu/transforms/collectives/BUILD
index c2939ffff5ff47..2bd05f3b369ee6 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/BUILD
@@ -593,7 +593,6 @@ cc_library(
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:collective_ops_utils",
-        "//xla/service:source_target_pairs",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer.cc b/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer.cc
index faf8c9375d9a8a..5c64c8a8d40c66 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer.cc
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "xla/literal_util.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/backend_configs.pb.h"
-#include "xla/service/source_target_pairs.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/platform/errors.h"
@@ -142,29 +141,11 @@ std::pair<CycleType, std::set<int>> GetCycleTypeAndIndicesArray(
   return GetCycleTypeAndIndices(pairs);
 }
 
-// Copies the frontend attributes from the original CP and splits the
-// _xla_send_recv_validation attribute;
+// Copies the frontend attributes from the original CP.
 absl::StatusOr<std::pair<FrontendAttributes, FrontendAttributes>>
 DecomposeFrontendAttributes(const FrontendAttributes& orig,
                             const CycleType cycle_type) {
   FrontendAttributes attr1 = orig, attr2 = orig;
-  auto it = orig.map().find(kSendRecvValidationAttr);
-  if (it == orig.map().end() || it->second == "invalid") {
-    return std::make_pair(attr1, attr2);
-  }
-
-  TF_ASSIGN_OR_RETURN(SourceTargetPairs bounds,
-                      SourceTargetPairs::FromString(it->second));
-  int64_t num_pairs = bounds.size();
-  if (num_pairs < 2) {
-    return Internal("Invalid number of replica groups");
-  }
-
-  // TODO: b/391377472 - this also need to be able to work with multiple cycles.
-  auto [cp1_bounds, cp2_bounds] =
-      collective_permute_cycle::SplitEdges(bounds, cycle_type);
-  (*attr1.mutable_map())[kSendRecvValidationAttr] = cp1_bounds.ToString();
-  (*attr2.mutable_map())[kSendRecvValidationAttr] = cp2_bounds.ToString();
   return std::make_pair(attr1, attr2);
 }
 
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer_test.cc b/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer_test.cc
index 1ff9b780cb5bbd..e05dcb487bb724 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/collective_permute_cycle_decomposer_test.cc
@@ -112,7 +112,6 @@ TEST_F(CollectivePermuteCycleDecomposerTest, ForwardCycle) {
       p = u32[8,8] parameter(0)
       ROOT start = u32[8,8] collective-permute(p), channel_id=1,
         source_target_pairs={{0,1},{1,2},{2,3},{3,0}},
-        frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10}}"},
         metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
     }
   )";
@@ -126,10 +125,10 @@ TEST_F(CollectivePermuteCycleDecomposerTest, ForwardCycle) {
     // CHECK-DAG:   %{{.+}} = u32[8,8] parameter(0)
 
     // CHECK-DAG:   %[[cp1:.+]] = u32[8,8] collective-permute(%{{.+}}), channel_id=1,
-    // CHECK-SAME{LITERAL}: source_target_pairs={{3,0}}, frontend_attributes={_xla_send_recv_validation={{3,10}}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
+    // CHECK-SAME{LITERAL}: source_target_pairs={{3,0}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
 
     // CHECK-DAG:   %[[cp2:.+]] = u32[8,8] collective-permute(%{{.+}}), channel_id=2,
-    // CHECK-SAME{LITERAL}: source_target_pairs={{0,1},{1,2},{2,3}}, frontend_attributes={_xla_send_recv_validation={{0,7},{1,8},{2,9}}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
+    // CHECK-SAME{LITERAL}: source_target_pairs={{0,1},{1,2},{2,3}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
 
     // CHECK-DAG:   ROOT %{{.+}} = u32[8,8] select(%[[compare]], %[[cp1]], %[[cp2]])
     // CHECK-DAG: }
@@ -220,8 +219,7 @@ TEST_F(CollectivePermuteCycleDecomposerTest, ForwardCycleWithMatmul) {
     weights = f32[2,2] get-tuple-element(param), index=2
     cp = f32[2,2] collective-permute(data),
       channel_id=1,
-      source_target_pairs={{0,1}, {1,2}, {2,3}, {3,0}},
-      frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10}}"}
+      source_target_pairs={{0,1}, {1,2}, {2,3}, {3,0}}
     matmul = f32[2,2] dot(weights, cp), lhs_contracting_dims={1}, rhs_contracting_dims={0}
     iter_increment = u32[] constant(1)
     next_iter = u32[] add(iter, iter_increment)
@@ -241,12 +239,8 @@ TEST_F(CollectivePermuteCycleDecomposerTest, ForwardCycleWithMatmul) {
   Decomposed deco = FindComponents(module.get(), "cp");
   EXPECT_THAT(deco.cp_bwd->ToString(),
               HasSubstr("source_target_pairs={{3,0}}"));
-  EXPECT_THAT(deco.cp_bwd->ToString(),
-              HasSubstr("_xla_send_recv_validation={{3,10}}"));
   EXPECT_THAT(deco.cp_fwd->ToString(),
               HasSubstr("source_target_pairs={{0,1},{1,2},{2,3}}"));
-  EXPECT_THAT(deco.cp_fwd->ToString(),
-              HasSubstr("_xla_send_recv_validation={{0,7},{1,8},{2,9}}"));
 }
 
 TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycle) {
@@ -260,7 +254,6 @@ TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycle) {
       p = u32[8,8] parameter(0)
       ROOT start = u32[8,8] collective-permute(p), channel_id=1,
         source_target_pairs={{0,3},{1,0},{2,1},{3,2}},
-        frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10}}"},
         metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
     })";
 
@@ -274,10 +267,10 @@ TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycle) {
     // CHECK-DAG:   %{{.+}} = u32[8,8] parameter(0)
 
     // CHECK-DAG:   %[[cp1:.+]] = u32[8,8] collective-permute(%{{.+}}), channel_id=1, source_target_pairs=
-    // CHECK-SAME{LITERAL}: {{0,3}}, frontend_attributes={_xla_send_recv_validation={{0,7}}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
+    // CHECK-SAME{LITERAL}: {{0,3}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
 
     // CHECK-DAG:   %[[cp2:.+]] = u32[8,8] collective-permute(%{{.+}}), channel_id=2, source_target_pairs=
-    // CHECK-SAME{LITERAL}: {{1,0},{2,1},{3,2}}, frontend_attributes={_xla_send_recv_validation={{1,8},{2,9},{3,10}}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
+    // CHECK-SAME{LITERAL}: {{1,0},{2,1},{3,2}}, metadata={op_name="op1/op2/add" source_file="foo/bar/mysource.py" source_line=35}
 
     // CHECK-DAG:   ROOT %{{.+}} = u32[8,8] select(%[[compare]], %[[cp1]], %[[cp2]])
     // CHECK-DAG: }
@@ -293,8 +286,7 @@ TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycleNoChannel) {
     ENTRY test_computation {
       p = u32[8,8] parameter(0)
       ROOT start = u32[8,8] collective-permute(p),
-        source_target_pairs={{0,3},{1,0},{2,1},{3,2}},
-        frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10}}"}
+        source_target_pairs={{0,3},{1,0},{2,1},{3,2}}
     })";
 
   TF_ASSERT_OK_AND_ASSIGN(auto module,
@@ -307,10 +299,10 @@ TEST_F(CollectivePermuteCycleDecomposerTest, BackwardCycleNoChannel) {
     // CHECK-DAG:   %{{.+}} = u32[8,8] parameter(0)
 
     // CHECK-DAG:   %[[cp1:.+]] = u32[8,8] collective-permute(%{{.+}}), source_target_pairs=
-    // CHECK-SAME{LITERAL}: {{0,3}}, frontend_attributes={_xla_send_recv_validation={{0,7}}}
+    // CHECK-SAME{LITERAL}: {{0,3}}
 
     // CHECK-DAG:   %[[cp2:.+]] = u32[8,8] collective-permute(%{{.+}}), source_target_pairs=
-    // CHECK-SAME{LITERAL}: {{1,0},{2,1},{3,2}}, frontend_attributes={_xla_send_recv_validation={{1,8},{2,9},{3,10}}}
+    // CHECK-SAME{LITERAL}: {{1,0},{2,1},{3,2}}
 
     // CHECK-DAG:   ROOT %{{.+}} = u32[8,8] select(%[[compare]], %[[cp1]], %[[cp2]])
     // CHECK-DAG: }
diff --git a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
index aa5c0c396ef34e..589633cef66afd 100644
--- a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
+++ b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
@@ -88,70 +88,12 @@ void SetChannelIdForNewCollective(HloInstruction* new_instr,
 
 using Interval = std::pair<int64_t, int64_t>;
 
-// Parses a string of the format `{{a,b},{c,d},{e,f}...}` to a vector of pairs.
-absl::StatusOr<std::vector<Interval>> ParseVectorOfPairs(
-    absl::string_view str) {
-  TF_ASSIGN_OR_RETURN(std::vector<ReplicaGroup> replica_groups,
-                      ParseReplicaGroupsOnly(str));
-  std::vector<Interval> res;
-  res.reserve(replica_groups.size());
-  for (const ReplicaGroup& replica_group : replica_groups) {
-    TF_RET_CHECK(replica_group.replica_ids_size() == 2);
-    int64_t a = replica_group.replica_ids(0);
-    int64_t b = replica_group.replica_ids(1);
-    res.emplace_back(a, b);
-  }
-  return res;
-}
-
 // This function fixes the `_xla_send_recv_validation` attribute for peeled
 // instructions. When the loop trip count is odd, the peeled instructions are
 // moved before the loop. The collectives in these instructions correspond to
 // the first iteration of the original loop. We have to run this peeled
 // collective for all those devices that had the 0-th iteration as a valid
 // iteration.
-absl::Status SetSendRecvValidationForPeeledInstr(HloInstruction* new_instr,
-                                                 HloInstruction* old_instr) {
-  TF_RET_CHECK(
-      new_instr->opcode() == old_instr->opcode() &&
-      "cloned instruction and original instruction have different opcodes");
-  if (HloPredicateIsNotOp<HloOpcode::kCollectivePermute,
-                          HloOpcode::kCollectivePermuteStart, HloOpcode::kSend,
-                          HloOpcode::kRecv>(old_instr)) {
-    return absl::OkStatus();
-  }
-
-  const auto& attribute_map = new_instr->frontend_attributes().map();
-  if (!attribute_map.contains(kSendRecvValidationAttr)) {
-    return absl::OkStatus();
-  }
-
-  VLOG(3) << "Original send-recv iterations: "
-          << attribute_map.at(kSendRecvValidationAttr);
-
-  TF_ASSIGN_OR_RETURN(
-      auto send_recv_validation_attr,
-      ParseVectorOfPairs(attribute_map.at(kSendRecvValidationAttr)));
-
-  uint64_t n_pairs = send_recv_validation_attr.size();
-  if (n_pairs == 0) {
-    return absl::OkStatus();
-  }
-  std::vector<Interval> send_recv_validation_attr_updated(n_pairs, {1, 0});
-  // Check which of the attributes have iteration number zero as valid
-  // iteration. For all those, set the peeled instruction to run.
-  for (std::uint64_t i = 0; i < send_recv_validation_attr.size(); i++) {
-    if (send_recv_validation_attr[i].first <= 0 &&
-        send_recv_validation_attr[i].second >= 0) {
-      send_recv_validation_attr_updated[i] = {0, 0};
-    }
-  }
-
-  hlo_instruction_utils::AddOrUpdateVectorOfPairsAsAttribute(
-      /*instr=*/new_instr, /*attr_name=*/kSendRecvValidationAttr,
-      /*intervals=*/send_recv_validation_attr_updated);
-  return absl::OkStatus();
-}
 
 // This function fixes the `_xla_send_recv_validation` attribute for the two new
 // collectives inside the loop. The calculation of the new valid iterations
@@ -180,65 +122,6 @@ absl::Status SetSendRecvValidationForPeeledInstr(HloInstruction* new_instr,
 //
 // In a similar fashion we can generalize the computation of new values based on
 // the values of the old attribute as done in the logic below.
-absl::Status SetSendRecvValidation(HloInstruction* cp1, HloInstruction* cp2,
-                                   bool is_peeled) {
-  TF_RET_CHECK(
-      cp2->opcode() == cp1->opcode() &&
-      "cloned instruction and original instruction have different opcodes");
-  if (HloPredicateIsNotOp<HloOpcode::kCollectivePermute,
-                          HloOpcode::kCollectivePermuteStart, HloOpcode::kSend,
-                          HloOpcode::kRecv>(cp1)) {
-    return absl::OkStatus();
-  }
-  const auto& attribute_map = cp2->frontend_attributes().map();
-  if (!attribute_map.contains(kSendRecvValidationAttr)) {
-    return absl::OkStatus();
-  }
-  VLOG(3) << "Original send-recv iterations: "
-          << attribute_map.at(kSendRecvValidationAttr);
-
-  TF_ASSIGN_OR_RETURN(
-      auto send_recv_validation_attr,
-      ParseVectorOfPairs(attribute_map.at(kSendRecvValidationAttr)));
-
-  if (send_recv_validation_attr.size() == 0) {
-    return absl::OkStatus();
-  }
-
-  std::vector<Interval> send_recv_iterations_new_instr1,
-      send_recv_iterations_new_instr2;
-  send_recv_iterations_new_instr1.reserve(send_recv_validation_attr.size());
-  send_recv_iterations_new_instr2.reserve(send_recv_validation_attr.size());
-  for (const Interval& pair : send_recv_validation_attr) {
-    int64_t a = pair.first;
-    int64_t b = pair.second;
-    if (is_peeled) {
-      send_recv_iterations_new_instr1.emplace_back(
-          std::floor(a / 2.0), std::max(0.0, std::floor((b - 1) / 2.0)));
-      send_recv_iterations_new_instr2.emplace_back(
-          std::max(0.0, std::floor((a - 1) / 2.0)),
-          std::max(0.0, std::floor((b - 2) / 2.0)));
-    } else {
-      send_recv_iterations_new_instr1.emplace_back(std::floor((a + 1) / 2.0),
-                                                   std::floor(b / 2.0));
-      send_recv_iterations_new_instr2.emplace_back(
-          std::floor(a / 2.0), std::max(0.0, std::floor((b - 1) / 2.0)));
-    }
-  }
-
-  hlo_instruction_utils::AddOrUpdateVectorOfPairsAsAttribute(
-      /*instr=*/cp1, /*attr_name=*/kSendRecvValidationAttr,
-      /*intervals=*/send_recv_iterations_new_instr1);
-  hlo_instruction_utils::AddOrUpdateVectorOfPairsAsAttribute(
-      /*instr=*/cp2, /*attr_name=*/kSendRecvValidationAttr,
-      /*intervals=*/send_recv_iterations_new_instr2);
-
-  VLOG(3) << "Updated send-recv iterations for " << cp1->name() << " : "
-          << cp1->frontend_attributes().map().at(kSendRecvValidationAttr);
-  VLOG(3) << "Updated send-recv iterations for " << cp2->name() << " : "
-          << cp2->frontend_attributes().map().at(kSendRecvValidationAttr);
-  return absl::OkStatus();
-}
 
 // Handle control predecessors/successors for every old-new instruction pair.
 // For every new instruction, we find the relevant predecessor/successor
@@ -406,7 +289,6 @@ absl::Status PeelInstructionsForOddTripCount(HloModule* module,
             old_instr->shape(), new_operands, suffix));
 
     SetChannelIdForNewCollective(new_instr, module);
-    CHECK_OK(SetSendRecvValidationForPeeledInstr(new_instr, old_instr));
     old_to_new_map[old_instr] = new_instr;
     VLOG(2) << "Added instruction " << new_instr->ToString()
             << " to parent computation.";
@@ -498,8 +380,6 @@ absl::StatusOr<bool> DoubleBufferingUnroll(HloInstruction* while_instr,
       skip_control_dep_injection.insert(old_instr);
     }
     SetChannelIdForNewCollective(new_instr, module);
-    CHECK_OK(SetSendRecvValidation(old_instr, new_instr,
-                                   /*is_peeled=*/peel_one_iteration));
     old_to_new_map[old_instr] = new_instr;
     VLOG(2) << "Added instruction " << new_instr->ToString();
   }
diff --git a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc
index 6e16fcb9754c91..3c643a3b83a110 100644
--- a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc
@@ -1003,8 +1003,7 @@ body {
   input_tuple = (f32[], s32[]) parameter(0)
   param_0 = f32[] get-tuple-element(input_tuple), index=0
   cond = s32[] get-tuple-element(input_tuple), index=1
-  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,0}},
-                             frontend_attributes={_xla_send_recv_validation="{{0,6},{1,7},{2,8},{3,9}}"}
+  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,0}}
   one = s32[] constant(1)
   cond_plus_1 = s32[] add(cond, one)
   ROOT output_tuple = (f32[], s32[]) tuple(collective-permute, cond_plus_1)
@@ -1026,10 +1025,10 @@ ENTRY main {
   VLOG(1) << module->ToString();
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body {{.+}} {
-    // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,3},{1,3},{1,4},{2,4}{{[}]}}}
+    // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}
     // CHECK:   %[[out1:.+]] = {{.+}} tuple({{.*}}%[[cp1]], {{.*}})
     // CHECK:   %[[param2:.+]] = {{.+}} get-tuple-element({{.*}}%[[out1]]), index=0
-    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[param2]]), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,2},{0,3},{1,3},{1,4}{{[}]}}}
+    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[param2]]), {{.+}}
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[cp2]], {{.*}})
     // CHECK: }
     // CHECK: ENTRY %main {{.+}} {
@@ -1057,8 +1056,7 @@ body {
   input_tuple = (f32[], s32[]) parameter(0)
   param_0 = f32[] get-tuple-element(input_tuple), index=0
   cond = s32[] get-tuple-element(input_tuple), index=1
-  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,0}},
-                             frontend_attributes={_xla_send_recv_validation="{{0,7},{1,8},{2,9},{3,10},{4,11},{5,12},{6,13},{7,14}}"}
+  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,0}}
   one = s32[] constant(1)
   cond_plus_1 = s32[] add(cond, one)
   ROOT output_tuple = (f32[], s32[]) tuple(collective-permute, cond_plus_1)
@@ -1080,13 +1078,13 @@ ENTRY main {
   VLOG(1) << module->ToString();
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body
-    // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute({{.*}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,3},{0,3},{1,4},{1,4},{2,5},{2,5},{3,6},{3,6}{{[}]}}}
+    // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute({{.*}}), {{.+}}
     // CHECK:   %[[out1:.+]] = {{.+}} tuple({{.*}}%[[cp1]], {{.*}})
     // CHECK:   %[[param2:.+]] = {{.+}} get-tuple-element({{.*}}%[[out1]])
-    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,2},{0,3},{0,3},{1,4},{1,4},{2,5},{2,5},{3,6}{{[}]}}}
+    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}), {{.+}}
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[cp2]], {{.*}})
     // CHECK: ENTRY %main {{.+}} {
-    // CHECK:   %[[cp_peeled:.+]] = {{.+}} collective-permute({{.*}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0}{{[}]}}}
+    // CHECK:   %[[cp_peeled:.+]] = {{.+}} collective-permute({{.*}}), {{.+}}
     // CHECK:   %[[out_peeled:.+]] = {{.+}} tuple({{.*}}%[[cp_peeled]], {{.*}})
     // CHECK:   %[[while:.+]] = {{.+}} while({{.*}}%[[out_peeled]])
     // CHECK: }
@@ -1112,8 +1110,7 @@ body {
   input_tuple = (f32[], s32[]) parameter(0)
   param_0 = f32[] get-tuple-element(input_tuple), index=0
   cond = s32[] get-tuple-element(input_tuple), index=1
-  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}},
-                             frontend_attributes={_xla_send_recv_validation="{{7,13},{6,12},{5,11},{4,10},{3,9},{2,8},{1,7},{0,6}}"}
+  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}}
   one = s32[] constant(1)
   cond_plus_1 = s32[] add(cond, one)
   ROOT output_tuple = (f32[], s32[]) tuple(collective-permute, cond_plus_1)
@@ -1135,10 +1132,10 @@ ENTRY main {
 
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body
-    // CHECK:   %[[cp1:.+]] = f32[] collective-permute(%param_0), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{4,6},{3,6},{3,5},{2,5},{2,4},{1,4},{1,3},{0,3}{{[}]}}}
+    // CHECK:   %[[cp1:.+]] = f32[] collective-permute(%param_0), {{.+}}
     // CHECK:   %[[out1:.+]] = {{.+}} tuple({{.*}}%[[cp1]], {{.*}})
     // CHECK:   %[[param2:.+]] = {{.+}} get-tuple-element({{.*}}%[[out1]]), index=0
-    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[param2]]), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{3,6},{3,5},{2,5},{2,4},{1,4},{1,3},{0,3},{0,2}{{[}]}}}
+    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[param2]]), {{.+}}
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[cp2]], {{.*}})
     // CHECK: ENTRY %main
     // CHECK-NOT: collective-permute
@@ -1165,8 +1162,7 @@ body {
   input_tuple = (f32[], s32[]) parameter(0)
   param_0 = f32[] get-tuple-element(input_tuple), index=0
   cond = s32[] get-tuple-element(input_tuple), index=1
-  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}},
-                             frontend_attributes={_xla_send_recv_validation="{{7,14},{6,13},{5,12},{4,11},{3,10},{2,9},{1,8},{0,7}}"}
+  collective-permute = f32[] collective-permute(param_0), channel_id=1, source_target_pairs={{0,7},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6}}
   one = s32[] constant(1)
   cond_plus_1 = s32[] add(cond, one)
   ROOT output_tuple = (f32[], s32[]) tuple(collective-permute, cond_plus_1)
@@ -1188,14 +1184,14 @@ ENTRY main {
 
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body
-    // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{3,6},{3,6},{2,5},{2,5},{1,4},{1,4},{0,3},{0,3}{{[}]}}}
+    // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}
     // CHECK:   %[[out1:.+]] = {{.+}} tuple({{.*}}%[[cp1]], {{.*}})
     // CHECK:   %[[param2:.+]] = {{.+}} get-tuple-element({{.*}}%[[out1]]), index=0
-    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[param2]]), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{3,6},{2,5},{2,5},{1,4},{1,4},{0,3},{0,3},{0,2}{{[}]}}}
+    // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute({{.*}}%[[param2]]), {{.+}}
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[cp2]], {{.*}})
     // CHECK: }
     // CHECK: ENTRY %main
-    // CHECK:   %[[cp_peeled:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{1,0},{0,0}{{[}]}}}
+    // CHECK:   %[[cp_peeled:.+]] = {{.+}} collective-permute({{.+}}), {{.+}}
     // CHECK:   %[[out_peeled:.+]] = {{.+}} tuple({{.*}}%[[cp_peeled]], {{.*}})
     // CHECK:   ROOT {{.+}} = {{.+}} while({{.*}}%[[out_peeled]])
     // CHECK: }
@@ -1221,8 +1217,7 @@ body {
   input_tuple = (f32[], s32[]) parameter(0)
   param_0 = f32[] get-tuple-element(input_tuple), index=0
   cond = s32[] get-tuple-element(input_tuple), index=1
-  collective-permute-start = (f32[], f32[], u32[], u32[]) collective-permute-start(param_0), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,0}},
-                             frontend_attributes={_xla_send_recv_validation="{{0,6},{1,7},{2,8},{3,9}}"}
+  collective-permute-start = (f32[], f32[], u32[], u32[]) collective-permute-start(param_0), channel_id=1, source_target_pairs={{0,1},{1,2},{2,3},{3,0}}
   collective-permute = f32[] collective-permute-done(collective-permute-start)
   one = s32[] constant(1)
   cond_plus_1 = s32[] add(cond, one)
@@ -1245,11 +1240,11 @@ ENTRY main {
 
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body
-    // CHECK:   %[[cp_start1:.+]] = {{.+}} collective-permute-start({{.+}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,3},{1,3},{1,4},{2,4}{{[}]}}}
+    // CHECK:   %[[cp_start1:.+]] = {{.+}} collective-permute-start({{.+}}), {{.+}}
     // CHECK:   %[[cp1:.+]] = {{.+}} collective-permute-done({{.*}}%[[cp_start1]])
     // CHECK:   %[[out1:.+]] = {{.+}} tuple({{.*}}%[[cp1]], {{.*}})
     // CHECK:   %[[param2:.+]] = {{.+}} get-tuple-element({{.*}}%[[out1]]), index=0
-    // CHECK:   %[[cp_start2:.+]] = {{.+}} collective-permute-start({{.*}}), {{.+}}, frontend_attributes={_xla_send_recv_validation={{[{]}}{0,2},{0,3},{1,3},{1,4}{{[}]}}}
+    // CHECK:   %[[cp_start2:.+]] = {{.+}} collective-permute-start({{.*}}), {{.+}}
     // CHECK:   %[[cp2:.+]] = {{.+}} collective-permute-done({{.*}}%[[cp_start2]])
     // CHECK:   ROOT {{.+}} = {{.+}} tuple({{.*}}%[[cp2]], {{.*}})
     // CHECK: }
@@ -1281,8 +1276,7 @@ body {
   recv.0 = (f32[], u32[], token[]) recv(after-all.0), channel_id=1,
         frontend_attributes={
           _xla_send_recv_source_target_pairs="{{0,1},{1,2},{2,3},{3,0}}",
-          _xla_send_recv_pipeline="0",
-          _xla_send_recv_validation="{{0,6},{1,7},{2,8},{3,9}}"
+          _xla_send_recv_pipeline="0"
         }
   recv-done.0 = (f32[], token[]) recv-done(recv.0), channel_id=1,
         frontend_attributes={
@@ -1310,8 +1304,8 @@ ENTRY main {
 
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body
-    // CHECK:   %[[recv1:.+]] = {{.+}} recv({{.+}}), {{.+}},_xla_send_recv_validation={{[{]}}{0,3},{1,3},{1,4},{2,4}{{[}]}}
-    // CHECK:   %[[recv2:.+]] = {{.+}} recv({{.+}}), {{.+}},_xla_send_recv_validation={{[{]}}{0,2},{0,3},{1,3},{1,4}{{[}]}}
+    // CHECK:   %[[recv1:.+]] = {{.+}} recv({{.+}}), {{.+}}
+    // CHECK:   %[[recv2:.+]] = {{.+}} recv({{.+}}), {{.+}}
     // CHECK: ENTRY %main
     // CHECK-NOT: recv
     // CHECK: }
@@ -1340,8 +1334,7 @@ body {
   send.0 = (f32[], u32[], token[]) send(param_0, after-all.0), channel_id=1,
         frontend_attributes={
           _xla_send_recv_source_target_pairs="{{0,1},{1,2},{2,3},{3,0}}",
-          _xla_send_recv_pipeline="0",
-          _xla_send_recv_validation="{{0,6},{1,7},{2,8},{3,9}}"
+          _xla_send_recv_pipeline="0"
         }
   send-done.0 = token[] send-done(send.0), channel_id=1,
         frontend_attributes={
@@ -1368,8 +1361,8 @@ ENTRY main {
 
   EXPECT_TRUE(*RunFileCheck(module->ToString(), R"(
     // CHECK: %body
-    // CHECK:   %[[send1:.+]] = {{.+}} send({{.+}}), {{.+}},_xla_send_recv_validation={{[{]}}{0,3},{1,3},{1,4},{2,4}{{[}]}}
-    // CHECK:   %[[send2:.+]] = {{.+}} send({{.+}}), {{.+}},_xla_send_recv_validation={{[{]}}{0,2},{0,3},{1,3},{1,4}{{[}]}}
+    // CHECK:   %[[send1:.+]] = {{.+}} send({{.+}}), {{.+}}
+    // CHECK:   %[[send2:.+]] = {{.+}} send({{.+}}), {{.+}}
     // CHECK: ENTRY %main
     // CHECK-NOT: send
     // CHECK: }
diff --git a/third_party/xla/xla/service/hlo_verifier.cc b/third_party/xla/xla/service/hlo_verifier.cc
index e2a5cd29821fb2..e4443bf16350b9 100644
--- a/third_party/xla/xla/service/hlo_verifier.cc
+++ b/third_party/xla/xla/service/hlo_verifier.cc
@@ -2562,16 +2562,13 @@ absl::StatusOr<bool> ShouldSkipDeadlockCheck(const T* instruction) {
   if (instruction->is_host_transfer()) {
     return true;
   }
-  // TODO: b/441038687 - Remove kSendRecvValidationAttr
   // TODO: b/441088186 - update static analyzer logic to also handle
   // instructions annotated with _xla_send_recv_pipeline
   // For now we will skip checks for instructions annotated with
-  // _xla_send_recv_pipeline and _xla_send_recv_validation, since they introduce
-  // extra constraints that have not been modeled by this function.
+  // _xla_send_recv_pipeline, since they introduce extra constraints that have
+  // not been modeled by this function.
   if (instruction->frontend_attributes().map().contains(
-          kSendRecvPipelineAttr) ||
-      instruction->frontend_attributes().map().contains(
-          kSendRecvValidationAttr)) {
+          kSendRecvPipelineAttr)) {
     return true;
   }
   // Check that the instruction itself does not have conflicting

From 2a0841fb753474f17b8b47e30d35c58b660ea56a Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Wed, 10 Dec 2025 17:58:37 -0800
Subject: [PATCH 154/753] Rename num_replicas to num_devices in
 HloRunnerInterface to reduce confusion.

num_replicas in the hlo runner interface is not the number of replicas to
execute on, unless the number of partitions is 1. Thus far, tests have only used
a single partition for testing, so they were equivalent. This is now changing.
The field should be renamed to avoid confusion.

PiperOrigin-RevId: 842964006
---
 third_party/xla/xla/service/hlo_runner.cc     | 38 +++++++++----------
 .../xla/xla/service/hlo_runner_interface.h    |  2 +-
 .../xla/xla/service/hlo_runner_pjrt.cc        | 30 +++++++--------
 .../xla/tests/collective_ops_e2e_test_base.cc |  2 +-
 .../xla/xla/tests/collective_ops_test.cc      |  2 +-
 .../tests/hlo_runner_agnostic_test_base.cc    | 12 +++---
 .../xla/xla/tests/replicated_io_feed_test.cc  |  2 +-
 7 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/third_party/xla/xla/service/hlo_runner.cc b/third_party/xla/xla/service/hlo_runner.cc
index 077c92bf517de3..9ce2dec42dc218 100644
--- a/third_party/xla/xla/service/hlo_runner.cc
+++ b/third_party/xla/xla/service/hlo_runner.cc
@@ -497,7 +497,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
   // argument_buffers.
   const int64_t total_argument_count = [&]() {
     int64_t total = 0;
-    for (int64_t i = 0; i < options.num_replicas; ++i) {
+    for (int64_t i = 0; i < options.num_devices; ++i) {
       total += argument_count_provider(i);
     }
     return total;
@@ -511,7 +511,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
   std::vector<absl::Span<const ShapedBuffer* const>> argument_buffer_slices;
   int64_t index = 0;
   RunId run_id;
-  for (int64_t i = 0; i < options.num_replicas; ++i) {
+  for (int64_t i = 0; i < options.num_devices; ++i) {
     int64_t device =
         (*device_assignment)(i / num_partitions, i % num_partitions);
     TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
@@ -543,10 +543,10 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
 
   std::unique_ptr<tsl::thread::ThreadPool> pool;
   TF_RET_CHECK(options.infeed_values.empty() ||
-               options.infeed_values.size() == options.num_replicas);
+               options.infeed_values.size() == options.num_devices);
   int64_t num_threads = options.infeed_values.size();
   if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
-    num_threads += options.num_replicas;
+    num_threads += options.num_devices;
   }
   if (num_threads > 0) {
     pool = std::make_unique<tsl::thread::ThreadPool>(
@@ -554,7 +554,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
         /*num_threads=*/num_threads);
   }
   if (!options.infeed_values.empty()) {
-    for (int64_t i = 0; i < options.num_replicas; ++i) {
+    for (int64_t i = 0; i < options.num_devices; ++i) {
       int64_t device =
           (*device_assignment)(i / num_partitions, i % num_partitions);
       pool->Schedule([this, device, &options, i]() {
@@ -574,9 +574,9 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
   }
   if (ShapeUtil::IsInitialized(options.outfeed_shape)) {
     if (options.outfeed_values) {
-      options.outfeed_values->resize(options.num_replicas);
+      options.outfeed_values->resize(options.num_devices);
     }
-    for (int64_t i = 0; i < options.num_replicas; ++i) {
+    for (int64_t i = 0; i < options.num_devices; ++i) {
       int64_t device =
           (*device_assignment)(i / num_partitions, i % num_partitions);
       pool->Schedule([this, device, &options, i]() {
@@ -606,8 +606,8 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
   VLOG(1) << "Replicated execution terminated";
 
   std::vector<Literal> exec_results;
-  exec_results.reserve(options.num_replicas);
-  for (int64_t i = 0; i < options.num_replicas; ++i) {
+  exec_results.reserve(options.num_devices);
+  for (int64_t i = 0; i < options.num_devices; ++i) {
     TF_RETURN_IF_ERROR(streams[i]->BlockHostUntilDone());
     TF_ASSIGN_OR_RETURN(Literal literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
@@ -636,13 +636,13 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
         } else {
           absl::Mutex mutex;
           std::vector<absl::StatusOr<ScopedShapedBuffer>> thread_results(
-              options.num_replicas);
+              options.num_devices);
           {
-            VLOG(1) << "Creating thread pool for " << options.num_replicas
+            VLOG(1) << "Creating thread pool for " << options.num_devices
                     << " replicas";
             tsl::thread::ThreadPool pool(tsl::Env::Default(), "replicas",
-                                         options.num_replicas);
-            for (int64_t i = 0; i < options.num_replicas; ++i) {
+                                         options.num_devices);
+            for (int64_t i = 0; i < options.num_devices; ++i) {
               pool.Schedule([&, i] {
                 auto result = executable->ExecuteOnStream(
                     &service_run_options[i], argument_buffer_slices[i]);
@@ -678,7 +678,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   if (device_assignment == nullptr) {
     TF_ASSIGN_OR_RETURN(
         computation_device_assignment,
-        backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+        backend().computation_placer()->AssignDevices(options.num_devices, 1));
     device_assignment = &computation_device_assignment;
   }
   CHECK_NE(device_assignment, nullptr);
@@ -691,13 +691,13 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
         std::vector<ScopedShapedBuffer> results;
         absl::Mutex mutex;
         std::vector<absl::StatusOr<ScopedShapedBuffer>> thread_results(
-            options.num_replicas);
+            options.num_devices);
         {
-          VLOG(1) << "Creating thread pool for " << options.num_replicas
+          VLOG(1) << "Creating thread pool for " << options.num_devices
                   << " replicas";
           tsl::thread::ThreadPool pool(tsl::Env::Default(), "replicas",
-                                       options.num_replicas);
-          for (int64_t i = 0; i < options.num_replicas; ++i) {
+                                       options.num_devices);
+          for (int64_t i = 0; i < options.num_devices; ++i) {
             for (const auto& arg : argument_buffer_slices[i]) {
               TF_RET_CHECK(arg != nullptr);
             }
@@ -732,7 +732,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
     const ReplicatedExecuteOptions& options) {
   TF_ASSIGN_OR_RETURN(
       DeviceAssignment device_assignment,
-      backend().computation_placer()->AssignDevices(options.num_replicas, 1));
+      backend().computation_placer()->AssignDevices(options.num_devices, 1));
   return ExecuteReplicated(std::move(module), options, &device_assignment);
 }
 
diff --git a/third_party/xla/xla/service/hlo_runner_interface.h b/third_party/xla/xla/service/hlo_runner_interface.h
index a7141a9c02482e..eb498d61185311 100644
--- a/third_party/xla/xla/service/hlo_runner_interface.h
+++ b/third_party/xla/xla/service/hlo_runner_interface.h
@@ -166,7 +166,7 @@ class HloRunnerInterface {
   // The options used to configure an ExecuteReplicated() call.
   struct ReplicatedExecuteOptions {
     // The number of devices the HLO module should be replicated onto.
-    int64_t num_replicas = 1;
+    int64_t num_devices = 1;
 
     // The arguments to be fed to each replica. Since this is used for a
     // replicated execution, all the arguments are the same for all replicas.
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index eebcdc0e0401f3..4d7b4105b32f69 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -560,7 +560,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
     const HloRunnerInterface::ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
-  module->mutable_config().set_replica_count(options.num_replicas);
+  module->mutable_config().set_replica_count(options.num_devices);
 
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<OpaqueExecutable> executable,
@@ -618,15 +618,15 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
         // The underlying data is modified concurrently. We don't need to
         // protect access as each replica writes only to its own slot.
         std::vector<absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>>
-            per_replica_results(options.num_replicas);
+            per_replica_results(options.num_devices);
         absl::c_fill(per_replica_results,
                      absl::InternalError("No result for replica."));
 
         {
           // NB: `pool` is joined on destruction.
           tsl::thread::ThreadPool pool(tsl::Env::Default(), "replicas",
-                                       options.num_replicas);
-          for (int64_t i = 0; i < options.num_replicas; ++i) {
+                                       options.num_devices);
+          for (int64_t i = 0; i < options.num_devices; ++i) {
             for (const PjRtBuffer* const buffer : argument_buffer_slices[i]) {
               TF_RET_CHECK(buffer != nullptr);
             }
@@ -659,7 +659,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
         }
         // Aggregate results.
         std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> results;
-        for (int64_t i = 0; i < options.num_replicas; ++i) {
+        for (int64_t i = 0; i < options.num_devices; ++i) {
           absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>&
               replica_result = per_replica_results[i];
           if (!replica_result.ok()) {
@@ -685,12 +685,12 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
     const ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
   TF_RET_CHECK(options.infeed_values.empty() ||
-               options.infeed_values.size() == options.num_replicas);
+               options.infeed_values.size() == options.num_devices);
 
-  std::vector<PjRtDevice*> replica_devices(options.num_replicas, nullptr);
+  std::vector<PjRtDevice*> replica_devices(options.num_devices, nullptr);
   std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> argument_buffer_slices;
-  argument_buffer_slices.reserve(options.num_replicas);
-  for (int64_t i = 0; i < options.num_replicas; ++i) {
+  argument_buffer_slices.reserve(options.num_devices);
+  for (int64_t i = 0; i < options.num_devices; ++i) {
     // Amortize device lookup.
     TF_ASSIGN_OR_RETURN(PjRtDevice* const device_ptr,
                         pjrt_client_->LookupDevice(
@@ -732,12 +732,12 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
   if (has_infeed || has_outfeed) {
     // One infeed per infeed value and one outfeed per replica.
     const int64_t num_threads =
-        options.infeed_values.size() + (has_outfeed ? options.num_replicas : 0);
+        options.infeed_values.size() + (has_outfeed ? options.num_devices : 0);
     pool = std::make_unique<tsl::thread::ThreadPool>(
         tsl::Env::Default(), "infeed_outfeed", num_threads);
   }
   if (has_infeed) {
-    for (int64_t i = 0; i < options.num_replicas; ++i) {
+    for (int64_t i = 0; i < options.num_devices; ++i) {
       pool->Schedule(
           [device = replica_devices[i],
            &infeed_literal = *ABSL_DIE_IF_NULL(options.infeed_values[i]),
@@ -759,9 +759,9 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
   }
   if (has_outfeed) {
     if (options.outfeed_values != nullptr) {
-      options.outfeed_values->resize(options.num_replicas);
+      options.outfeed_values->resize(options.num_devices);
     }
-    for (int64_t i = 0; i < options.num_replicas; ++i) {
+    for (int64_t i = 0; i < options.num_devices; ++i) {
       pool->Schedule([i, device = replica_devices[i],
                       outfeed_values = options.outfeed_values,
                       outfeed_shape = options.outfeed_shape,
@@ -796,8 +796,8 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
 
   // Get the result from execution.
   std::vector<Literal> result_literals;
-  result_literals.reserve(options.num_replicas);
-  for (int64_t i = 0; i < options.num_replicas; ++i) {
+  result_literals.reserve(options.num_devices);
+  for (int64_t i = 0; i < options.num_devices; ++i) {
     TF_ASSIGN_OR_RETURN(Literal literal,
                         TransferLiteralsFromDevice(
                             result_buffers[i], result_buffers[i].size() != 1));
diff --git a/third_party/xla/xla/tests/collective_ops_e2e_test_base.cc b/third_party/xla/xla/tests/collective_ops_e2e_test_base.cc
index 775007fa085eec..d8302a06c5ad3e 100644
--- a/third_party/xla/xla/tests/collective_ops_e2e_test_base.cc
+++ b/third_party/xla/xla/tests/collective_ops_e2e_test_base.cc
@@ -161,7 +161,7 @@ CollectiveOpsE2ETestBase::ExecuteReplicated(
   // TODO(b/441865120): Use designated initializers this once XLA moves to
   // C++20.
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_replicas = num_devices;
+  options.num_devices = num_devices;
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = true;
 
diff --git a/third_party/xla/xla/tests/collective_ops_test.cc b/third_party/xla/xla/tests/collective_ops_test.cc
index 26dab88fea8a8b..b27f4e65c9db88 100644
--- a/third_party/xla/xla/tests/collective_ops_test.cc
+++ b/third_party/xla/xla/tests/collective_ops_test.cc
@@ -435,7 +435,7 @@ TEST_F(CollectiveOpsTest, AllReduce_ManyConcurrentAllReduces) {
   auto device_assn = MakeDeviceAssn(devices);
 
   HloRunnerInterface::ReplicatedExecuteOptions opts;
-  opts.num_replicas = devices.size();
+  opts.num_devices = devices.size();
   opts.use_threads = true;
   opts.arguments.push_back(&input_literal);
 
diff --git a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
index db5fd00500278a..3331f59e00ce21 100644
--- a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
@@ -152,7 +152,7 @@ HloRunnerAgnosticTestBase::ExecuteReplicated(
     const int64_t num_replicas, const bool use_threads,
     const bool run_hlo_passes) {
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_replicas = num_replicas;
+  options.num_devices = num_replicas;
   options.arguments = {arguments.begin(), arguments.end()};
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = use_threads;
@@ -167,7 +167,7 @@ HloRunnerAgnosticTestBase::ExecuteReplicated(
     const int64_t num_replicas, DeviceAssignment* const device_assignment,
     const bool run_hlo_passes, const bool use_threads) {
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_replicas = num_replicas;
+  options.num_devices = num_replicas;
   options.arguments = {arguments.begin(), arguments.end()};
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = use_threads;
@@ -184,7 +184,7 @@ HloRunnerAgnosticTestBase::ExecuteReplicated(
     const int64_t num_replicas, const bool run_hlo_passes,
     DeviceAssignment* const device_assignment) {
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_replicas = num_replicas;
+  options.num_devices = num_replicas;
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = true;
   return test_runner_->ExecuteReplicated(
@@ -259,11 +259,11 @@ HloRunnerAgnosticTestBase::RunAndCompareTwoModulesReplicated(
            << "Number of replicas is not the same: " << replica_count << " Vs "
            << module_1->config().replica_count();
   }
-  if (options.num_replicas != replica_count) {
+  if (options.num_devices != replica_count) {
     return ::testing::AssertionFailure()
            << "Number of execution replicas is different from number of "
               "replicas in the module: requested number of replicas = "
-           << options.num_replicas
+           << options.num_devices
            << ", number of replicas in hlo = " << replica_count;
   }
 
@@ -540,7 +540,7 @@ ::testing::AssertionResult HloRunnerAgnosticTestBase::RunReplicated(
   }
 
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_replicas = num_replicas;
+  options.num_devices = num_replicas;
   options.arguments = {fake_argument_ptrs.begin(), fake_argument_ptrs.end()};
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = true;
diff --git a/third_party/xla/xla/tests/replicated_io_feed_test.cc b/third_party/xla/xla/tests/replicated_io_feed_test.cc
index a6d82d33112c40..e7ff6762b41a72 100644
--- a/third_party/xla/xla/tests/replicated_io_feed_test.cc
+++ b/third_party/xla/xla/tests/replicated_io_feed_test.cc
@@ -63,7 +63,7 @@ TEST_F(ReplicatedIOFeedTest, InfeedAndOutfeed) {
   std::vector<Literal> outfeed_literals;
 
   HloRunnerInterface::ReplicatedExecuteOptions opts;
-  opts.num_replicas = kNumReplicas;
+  opts.num_devices = kNumReplicas;
 
   // Initialize infeed literal = replica_id * 10
   std::vector<Literal> infeed_literals(kNumReplicas);

From f30bc84ef2d931c63a0eb4d992d8c4e8d324841e Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Wed, 10 Dec 2025 18:47:11 -0800
Subject: [PATCH 155/753] Updated python version from `3.13.6` to `3.13.11`.

PiperOrigin-RevId: 842980941
---
 third_party/py/rules_python_versions.patch    | 64 +++++++++++++++++--
 .../py/rules_python_versions.patch            | 64 +++++++++++++++++--
 2 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/third_party/py/rules_python_versions.patch b/third_party/py/rules_python_versions.patch
index 8dbc70bad193d7..c31b6772c2675f 100644
--- a/third_party/py/rules_python_versions.patch
+++ b/third_party/py/rules_python_versions.patch
@@ -1,8 +1,60 @@
 diff --git a/python/versions.bzl b/python/versions.bzl
-index 30929f82..8e79225a 100644
+index 30929f82..c0856d70 100644
 --- a/python/versions.bzl
 +++ b/python/versions.bzl
-@@ -855,6 +855,51 @@ TOOL_VERSIONS = {
+@@ -810,6 +810,51 @@ TOOL_VERSIONS = {
+             "x86_64-unknown-linux-gnu-freethreaded": "python/install",
+         },
+     },
++    "3.13.11": {
++        "url": "20251209/cpython-{python_version}+20251209-{platform}-{build}.{ext}",
++        "sha256": {
++            "aarch64-apple-darwin": "295a9f7bc899ea1cc08baf60bbf511bdd1e4a29b2dd7e5f59b48f18bfa6bf585",
++            "aarch64-unknown-linux-gnu": "ea1e678e6e82301bb32bf3917732125949b6e46d541504465972024a3f165343",
++            "ppc64le-unknown-linux-gnu": "7660e53aad9d35ee256913c6d98427f81f078699962035c5fa8b5c3138695109",
++            "riscv64-unknown-linux-gnu": "763fa1548e6a432e9402916e690c74ea30f26dcd2e131893dd506f72b87c27c9",
++            "s390x-unknown-linux-gnu": "ffb6af51fbfabfc6fbc4e7379bdec70c2f51e972b1d2f45c053493b9da3a1bbe",
++            "x86_64-apple-darwin": "dac4a0a0a9b71f6b02a8b0886547fa22814474239bffb948e3e77185406ea136",
++            "x86_64-pc-windows-msvc": "87822417007045a28a7eccc47fe67b8c61265b99b10dbbfa24d231a3622b1c27",
++            "aarch64-pc-windows-msvc": "ba646d0c3b7dd7bdfb770d9b2ebd6cd2df02a37fda90c9c79a7cf59c7df6f165",
++            "aarch64-pc-windows-msvc-freethreaded": "6daf6d092c7294cfe68c4c7bf2698ac134235489c874b3bf796c7972b9dbba30",
++            "x86_64-unknown-linux-gnu": "1ffa06d714a44aea14c0c54c30656413e5955a6c92074b4b3cb4351dcc28b63b",
++            "x86_64-unknown-linux-musl": "969fe24017380b987c4e3ce15e9edf82a4618c1e61672b2cc9b021a1c98eae78",
++            "aarch64-apple-darwin-freethreaded": "4213058b7fcd875596c12b58cd46a399358b0a87ecde4b349cbdd00cf87ed79a",
++            "aarch64-unknown-linux-gnu-freethreaded": "290ca3bd0007db9e551f90b08dfcb6c1b2d62c33b2fc3e9a43e77d385d94f569",
++            "ppc64le-unknown-linux-gnu-freethreaded": "09d4b50f8abb443f7e3af858c920aa61c2430b0954df465e861caa7078e55e69",
++            "riscv64-unknown-linux-gnu-freethreaded": "5406f2a7cacafbd2aac3ce2de066a0929aab55423824276c36e04cb83babc36c",
++            "s390x-unknown-linux-gnu-freethreaded": "3984b67c4292892eaccdd1c094c7ec788884c4c9b3534ab6995f6be96d5ed51d",
++            "x86_64-apple-darwin-freethreaded": "d6f489464045d6895ae68b0a04a9e16477e74fe3185a75f3a9a0af8ccd25eade",
++            "x86_64-pc-windows-msvc-freethreaded": "bb9a29a7ba8f179273b79971da6aaa7be592d78c606a63f99eff3e4c12fb0fae",
++            "x86_64-unknown-linux-gnu-freethreaded": "33f89c957d986d525529b8a980103735776f4d20cf52f55960a057c760188ac3",
++        },
++        "strip_prefix": {
++            "aarch64-apple-darwin": "python",
++            "aarch64-unknown-linux-gnu": "python",
++            "ppc64le-unknown-linux-gnu": "python",
++            "s390x-unknown-linux-gnu": "python",
++            "riscv64-unknown-linux-gnu": "python",
++            "x86_64-apple-darwin": "python",
++            "x86_64-pc-windows-msvc": "python",
++            "aarch64-pc-windows-msvc": "python",
++            "x86_64-unknown-linux-gnu": "python",
++            "x86_64-unknown-linux-musl": "python",
++            "aarch64-apple-darwin-freethreaded": "python/install",
++            "aarch64-unknown-linux-gnu-freethreaded": "python/install",
++            "ppc64le-unknown-linux-gnu-freethreaded": "python/install",
++            "riscv64-unknown-linux-gnu-freethreaded": "python/install",
++            "s390x-unknown-linux-gnu-freethreaded": "python/install",
++            "x86_64-apple-darwin-freethreaded": "python/install",
++            "x86_64-pc-windows-msvc-freethreaded": "python/install",
++            "aarch64-pc-windows-msvc-freethreaded": "python/install",
++            "x86_64-unknown-linux-gnu-freethreaded": "python/install",
++        },
++    },
+     "3.14.0rc1": {
+         "url": "20250808/cpython-{python_version}+20250808-{platform}-{build}.{ext}",
+         "sha256": {
+@@ -855,6 +900,51 @@ TOOL_VERSIONS = {
              "x86_64-unknown-linux-gnu-freethreaded": "python/install",
          },
      },
@@ -54,16 +106,18 @@ index 30929f82..8e79225a 100644
  }
  
  # buildifier: disable=unsorted-dict-items
-@@ -865,7 +910,7 @@ MINOR_MAPPING = {
+@@ -864,8 +954,8 @@ MINOR_MAPPING = {
+     "3.10": "3.10.18",
      "3.11": "3.11.13",
      "3.12": "3.12.11",
-     "3.13": "3.13.6",
+-    "3.13": "3.13.6",
 -    "3.14": "3.14.0rc1",
++    "3.13": "3.13.11",
 +    "3.14": "3.14.0",
  }
  
  def _generate_platforms():
-@@ -1045,29 +1090,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U
+@@ -1045,29 +1135,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U
      for u in url:
          p, _, _ = platform.partition(FREETHREADED)
  
diff --git a/third_party/xla/third_party/py/rules_python_versions.patch b/third_party/xla/third_party/py/rules_python_versions.patch
index 8dbc70bad193d7..c31b6772c2675f 100644
--- a/third_party/xla/third_party/py/rules_python_versions.patch
+++ b/third_party/xla/third_party/py/rules_python_versions.patch
@@ -1,8 +1,60 @@
 diff --git a/python/versions.bzl b/python/versions.bzl
-index 30929f82..8e79225a 100644
+index 30929f82..c0856d70 100644
 --- a/python/versions.bzl
 +++ b/python/versions.bzl
-@@ -855,6 +855,51 @@ TOOL_VERSIONS = {
+@@ -810,6 +810,51 @@ TOOL_VERSIONS = {
+             "x86_64-unknown-linux-gnu-freethreaded": "python/install",
+         },
+     },
++    "3.13.11": {
++        "url": "20251209/cpython-{python_version}+20251209-{platform}-{build}.{ext}",
++        "sha256": {
++            "aarch64-apple-darwin": "295a9f7bc899ea1cc08baf60bbf511bdd1e4a29b2dd7e5f59b48f18bfa6bf585",
++            "aarch64-unknown-linux-gnu": "ea1e678e6e82301bb32bf3917732125949b6e46d541504465972024a3f165343",
++            "ppc64le-unknown-linux-gnu": "7660e53aad9d35ee256913c6d98427f81f078699962035c5fa8b5c3138695109",
++            "riscv64-unknown-linux-gnu": "763fa1548e6a432e9402916e690c74ea30f26dcd2e131893dd506f72b87c27c9",
++            "s390x-unknown-linux-gnu": "ffb6af51fbfabfc6fbc4e7379bdec70c2f51e972b1d2f45c053493b9da3a1bbe",
++            "x86_64-apple-darwin": "dac4a0a0a9b71f6b02a8b0886547fa22814474239bffb948e3e77185406ea136",
++            "x86_64-pc-windows-msvc": "87822417007045a28a7eccc47fe67b8c61265b99b10dbbfa24d231a3622b1c27",
++            "aarch64-pc-windows-msvc": "ba646d0c3b7dd7bdfb770d9b2ebd6cd2df02a37fda90c9c79a7cf59c7df6f165",
++            "aarch64-pc-windows-msvc-freethreaded": "6daf6d092c7294cfe68c4c7bf2698ac134235489c874b3bf796c7972b9dbba30",
++            "x86_64-unknown-linux-gnu": "1ffa06d714a44aea14c0c54c30656413e5955a6c92074b4b3cb4351dcc28b63b",
++            "x86_64-unknown-linux-musl": "969fe24017380b987c4e3ce15e9edf82a4618c1e61672b2cc9b021a1c98eae78",
++            "aarch64-apple-darwin-freethreaded": "4213058b7fcd875596c12b58cd46a399358b0a87ecde4b349cbdd00cf87ed79a",
++            "aarch64-unknown-linux-gnu-freethreaded": "290ca3bd0007db9e551f90b08dfcb6c1b2d62c33b2fc3e9a43e77d385d94f569",
++            "ppc64le-unknown-linux-gnu-freethreaded": "09d4b50f8abb443f7e3af858c920aa61c2430b0954df465e861caa7078e55e69",
++            "riscv64-unknown-linux-gnu-freethreaded": "5406f2a7cacafbd2aac3ce2de066a0929aab55423824276c36e04cb83babc36c",
++            "s390x-unknown-linux-gnu-freethreaded": "3984b67c4292892eaccdd1c094c7ec788884c4c9b3534ab6995f6be96d5ed51d",
++            "x86_64-apple-darwin-freethreaded": "d6f489464045d6895ae68b0a04a9e16477e74fe3185a75f3a9a0af8ccd25eade",
++            "x86_64-pc-windows-msvc-freethreaded": "bb9a29a7ba8f179273b79971da6aaa7be592d78c606a63f99eff3e4c12fb0fae",
++            "x86_64-unknown-linux-gnu-freethreaded": "33f89c957d986d525529b8a980103735776f4d20cf52f55960a057c760188ac3",
++        },
++        "strip_prefix": {
++            "aarch64-apple-darwin": "python",
++            "aarch64-unknown-linux-gnu": "python",
++            "ppc64le-unknown-linux-gnu": "python",
++            "s390x-unknown-linux-gnu": "python",
++            "riscv64-unknown-linux-gnu": "python",
++            "x86_64-apple-darwin": "python",
++            "x86_64-pc-windows-msvc": "python",
++            "aarch64-pc-windows-msvc": "python",
++            "x86_64-unknown-linux-gnu": "python",
++            "x86_64-unknown-linux-musl": "python",
++            "aarch64-apple-darwin-freethreaded": "python/install",
++            "aarch64-unknown-linux-gnu-freethreaded": "python/install",
++            "ppc64le-unknown-linux-gnu-freethreaded": "python/install",
++            "riscv64-unknown-linux-gnu-freethreaded": "python/install",
++            "s390x-unknown-linux-gnu-freethreaded": "python/install",
++            "x86_64-apple-darwin-freethreaded": "python/install",
++            "x86_64-pc-windows-msvc-freethreaded": "python/install",
++            "aarch64-pc-windows-msvc-freethreaded": "python/install",
++            "x86_64-unknown-linux-gnu-freethreaded": "python/install",
++        },
++    },
+     "3.14.0rc1": {
+         "url": "20250808/cpython-{python_version}+20250808-{platform}-{build}.{ext}",
+         "sha256": {
+@@ -855,6 +900,51 @@ TOOL_VERSIONS = {
              "x86_64-unknown-linux-gnu-freethreaded": "python/install",
          },
      },
@@ -54,16 +106,18 @@ index 30929f82..8e79225a 100644
  }
  
  # buildifier: disable=unsorted-dict-items
-@@ -865,7 +910,7 @@ MINOR_MAPPING = {
+@@ -864,8 +954,8 @@ MINOR_MAPPING = {
+     "3.10": "3.10.18",
      "3.11": "3.11.13",
      "3.12": "3.12.11",
-     "3.13": "3.13.6",
+-    "3.13": "3.13.6",
 -    "3.14": "3.14.0rc1",
++    "3.13": "3.13.11",
 +    "3.14": "3.14.0",
  }
  
  def _generate_platforms():
-@@ -1045,29 +1090,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U
+@@ -1045,29 +1135,25 @@ def get_release_info(platform, python_version, base_url = DEFAULT_RELEASE_BASE_U
      for u in url:
          p, _, _ = platform.partition(FREETHREADED)
  

From 774404d4af867ed70564738242b77771553a611a Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Wed, 10 Dec 2025 20:52:41 -0800
Subject: [PATCH 156/753] Rename num_replicas to num_devices in
 HloRunnerAgnosticTestBase.

PiperOrigin-RevId: 843023763
---
 .../tests/hlo_runner_agnostic_test_base.cc    | 50 +++++++++----------
 .../xla/tests/hlo_runner_agnostic_test_base.h | 24 ++++-----
 2 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
index 3331f59e00ce21..71f4c3b69df799 100644
--- a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
@@ -148,11 +148,10 @@ absl::StatusOr<Literal> HloRunnerAgnosticTestBase::Execute(
 absl::StatusOr<std::vector<Literal>>
 HloRunnerAgnosticTestBase::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
-    const absl::Span<const Literal* const> arguments,
-    const int64_t num_replicas, const bool use_threads,
-    const bool run_hlo_passes) {
+    const absl::Span<const Literal* const> arguments, const int64_t num_devices,
+    const bool use_threads, const bool run_hlo_passes) {
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_devices = num_replicas;
+  options.num_devices = num_devices;
   options.arguments = {arguments.begin(), arguments.end()};
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = use_threads;
@@ -163,11 +162,11 @@ HloRunnerAgnosticTestBase::ExecuteReplicated(
 absl::StatusOr<std::vector<Literal>>
 HloRunnerAgnosticTestBase::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
-    const absl::Span<const Literal* const> arguments,
-    const int64_t num_replicas, DeviceAssignment* const device_assignment,
-    const bool run_hlo_passes, const bool use_threads) {
+    const absl::Span<const Literal* const> arguments, const int64_t num_devices,
+    DeviceAssignment* const device_assignment, const bool run_hlo_passes,
+    const bool use_threads) {
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_devices = num_replicas;
+  options.num_devices = num_devices;
   options.arguments = {arguments.begin(), arguments.end()};
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = use_threads;
@@ -181,10 +180,10 @@ HloRunnerAgnosticTestBase::ExecuteReplicated(
     absl::AnyInvocable<OpaqueExecutable*(int64_t)> executable_provider,
     absl::AnyInvocable<int64_t(int64_t)> argument_count_provider,
     absl::AnyInvocable<const Literal*(int64_t, int64_t)> argument_provider,
-    const int64_t num_replicas, const bool run_hlo_passes,
+    const int64_t num_devices, const bool run_hlo_passes,
     DeviceAssignment* const device_assignment) {
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_devices = num_replicas;
+  options.num_devices = num_devices;
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = true;
   return test_runner_->ExecuteReplicated(
@@ -196,11 +195,10 @@ absl::StatusOr<std::vector<Literal>>
 HloRunnerAgnosticTestBase::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
     const std::vector<std::vector<Literal*>> arguments,
-    const int64_t num_replicas, const bool run_hlo_passes,
+    const int64_t num_devices, const bool run_hlo_passes,
     DeviceAssignment* const device_assignment) {
-  CHECK(num_replicas > 0 && "expect at least one replica");
-  CHECK(num_replicas == arguments.size() &&
-        "expect arguments for each replica");
+  CHECK(num_devices > 0 && "expected at least one device");
+  CHECK(num_devices == arguments.size() && "expect arguments for each device");
   int64_t argument_count = arguments.front().size();
   TF_RETURN_IF_ERROR(PreprocessModuleForTestRunner(module.get()));
   TF_ASSIGN_OR_RETURN(
@@ -213,7 +211,7 @@ HloRunnerAgnosticTestBase::ExecuteReplicated(
       [&](int64_t replica_idx, int64_t argument_idx) -> const Literal* {
         return arguments[replica_idx][argument_idx];
       },
-      num_replicas, /*run_hlo_passes=*/run_hlo_passes,
+      num_devices, /*run_hlo_passes=*/run_hlo_passes,
       /*device_assignment=*/device_assignment);
 }
 
@@ -313,15 +311,12 @@ HloRunnerAgnosticTestBase::RunAndCompareTwoModulesReplicated(
     std::unique_ptr<HloModule> module_0, std::unique_ptr<HloModule> module_1,
     const std::vector<Literal>& fake_arguments, const bool run_hlo_passes,
     const bool use_threads, const std::optional<ErrorSpec>& error) {
-  const HloRunnerInterface::ReplicatedExecuteOptions options{
-      /*num_replicas=*/module_0->config().replica_count(),
-      /*arguments=*/LiteralUtil::MakePointers(fake_arguments),
-      /*infeed_values=*/{},
-      /*infeed_steps=*/-1,
-      /*outfeed_shape=*/{},
-      /*outfeed_values=*/nullptr,
-      /*run_hlo_passes=*/run_hlo_passes,
-      /*use_threads=*/use_threads};
+  HloRunnerInterface::ReplicatedExecuteOptions options;
+  options.num_devices =
+      module_0->config().replica_count() * module_0->config().num_partitions();
+  options.arguments = LiteralUtil::MakePointers(fake_arguments);
+  options.run_hlo_passes = run_hlo_passes;
+  options.use_threads = use_threads;
   return RunAndCompareTwoModulesReplicated(std::move(module_0),
                                            std::move(module_1), options, error);
 }
@@ -512,9 +507,10 @@ ::testing::AssertionResult HloRunnerAgnosticTestBase::Run(
 
 ::testing::AssertionResult HloRunnerAgnosticTestBase::RunReplicated(
     const absl::string_view hlo_string, const bool run_hlo_passes,
-    const int64_t num_replicas, const tsl::protobuf::Message* backend_config) {
+    const int64_t num_devices, const tsl::protobuf::Message* backend_config) {
   absl::StatusOr<std::unique_ptr<VerifiedHloModule>> module =
-      ParseAndReturnVerifiedModule(hlo_string, num_replicas);
+      ParseAndReturnVerifiedModule(hlo_string, /*num_replicas=*/num_devices,
+                                   /*num_partitions=*/1);
   if (!module.ok()) {
     return ::testing::AssertionFailure()
            << "Error while parsing HLO text format: "
@@ -540,7 +536,7 @@ ::testing::AssertionResult HloRunnerAgnosticTestBase::RunReplicated(
   }
 
   HloRunnerInterface::ReplicatedExecuteOptions options;
-  options.num_devices = num_replicas;
+  options.num_devices = num_devices;
   options.arguments = {fake_argument_ptrs.begin(), fake_argument_ptrs.end()};
   options.run_hlo_passes = run_hlo_passes;
   options.use_threads = true;
diff --git a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h
index fc1fc3ccc75c0e..ea3ffce16ff77c 100644
--- a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h
+++ b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h
@@ -159,36 +159,36 @@ class HloRunnerAgnosticTestBase : public HloHardwareIndependentTestBase {
     return test_runner_->CreateExecutable(std::move(module), run_hlo_passes);
   }
 
-  // Executes the given module on multiple replicas.
+  // Executes the given module on multiple devices.
   //
   // use_threads indicates whether this replicated computation will be executed
-  // with a thread-per-replica, vs using an implicitly async call such as
+  // with a thread-per-device, vs using an implicitly async call such as
   // Executable::ExecuteOnStreams.
   absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
       std::unique_ptr<HloModule> module,
-      absl::Span<const Literal* const> arguments, int64_t num_replicas,
+      absl::Span<const Literal* const> arguments, int64_t num_devices,
       bool use_threads, bool run_hlo_passes = false);
 
   // Same as above, but uses specified device assignment.
   absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
       std::unique_ptr<HloModule> module,
-      absl::Span<const Literal* const> arguments, int64_t num_replicas,
+      absl::Span<const Literal* const> arguments, int64_t num_devices,
       DeviceAssignment* device_assignment, bool run_hlo_passes,
       bool use_threads);
 
-  // Same as above, but allows passing different programs for replicas.
+  // Same as above, but allows passing different programs for devices.
   absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
       absl::AnyInvocable<OpaqueExecutable*(int64_t)> executable_provider,
       absl::AnyInvocable<int64_t(int64_t)> argument_count_provider,
       absl::AnyInvocable<const Literal*(int64_t, int64_t)> argument_provider,
-      int64_t num_replicas, bool run_hlo_passes,
+      int64_t num_devices, bool run_hlo_passes,
       DeviceAssignment* device_assignment = nullptr);
 
   // Convenience function for above. Allows passing different inputs to
-  // different replicas of the same program.
+  // different devices of the same program.
   absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
       std::unique_ptr<HloModule> module,
-      std::vector<std::vector<Literal*>> arguments, int64_t num_replicas,
+      std::vector<std::vector<Literal*>> arguments, int64_t num_devices,
       bool run_hlo_passes, DeviceAssignment* device_assignment = nullptr);
 
   // Executes an hlo module with fake inputs and checks that the execution is
@@ -225,8 +225,8 @@ class HloRunnerAgnosticTestBase : public HloHardwareIndependentTestBase {
       bool use_threads, const std::optional<ErrorSpec>& error);
 
   // Parses the modules, and executes them based on `run_hlo_passes` and
-  // `use_threads` flags. The replica count should be mentioned in the module
-  // itself.
+  // `use_threads` flags. The replica + partition count should be set in the
+  // module itself.
   ::testing::AssertionResult RunAndCompareTwoModulesReplicated(
       absl::string_view module_0_str, absl::string_view module_1_str,
       bool run_hlo_passes, bool use_threads,
@@ -268,10 +268,10 @@ class HloRunnerAgnosticTestBase : public HloHardwareIndependentTestBase {
       absl::Span<const Literal* const> arguments,
       const std::optional<ErrorSpec>& error, bool run_hlo_passes = true);
 
-  // Executes an hlo module with fake inputs on multiple replicas.
+  // Executes an hlo module with fake inputs on multiple devices.
   ::testing::AssertionResult RunReplicated(
       absl::string_view hlo_string, bool run_hlo_passes = true,
-      int64_t num_replicas = 1,
+      int64_t num_devices = 1,
       const tsl::protobuf::Message* backend_config = nullptr);
 
   // If assert_determinism is true, the assertion will fail unless all runs

From 3c3712f6a0066643c17b9627cd2c54d6ce7fb9a3 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Wed, 10 Dec 2025 20:55:47 -0800
Subject: [PATCH 157/753] Tidy up the parser checks for AsyncStart/Update/Done.

Previously, we check that AsyncUpdate/Done have a single operand with a shape that conforms to the shape of AsyncStart, and the single operand is an Asynchronous op. This is now replaced by ensuring that AsyncUpdate/Done have a single operand which is an AsyncStart/Update.

Previously, we check that AsyncUpdate has a result shape that conforms to the shape of AsyncStart. This is now replaced by ensuring that the AsyncUpdate operand and result have the same shape, like the HloVerifier does.

PiperOrigin-RevId: 843024671
---
 third_party/xla/xla/hlo/parser/hlo_parser.cc  | 41 +++++++------------
 .../xla/xla/hlo/parser/hlo_parser_test.cc     | 28 +++++--------
 .../xla/xla/service/hlo_verifier_test.cc      | 27 ------------
 3 files changed, 25 insertions(+), 71 deletions(-)

diff --git a/third_party/xla/xla/hlo/parser/hlo_parser.cc b/third_party/xla/xla/hlo/parser/hlo_parser.cc
index fe1e5298e5db58..57d934e6437ea0 100644
--- a/third_party/xla/xla/hlo/parser/hlo_parser.cc
+++ b/third_party/xla/xla/hlo/parser/hlo_parser.cc
@@ -2070,27 +2070,11 @@ HloInstruction* HloParserImpl::CreateInstruction(  // NOLINT
       if (!preset_operands && !ParseOperands(&operands, builder)) {
         return nullptr;
       }
-      auto is_async_shape_correct = [](const Shape& shape) {
-        return shape.IsTuple() && shape.tuple_shapes().size() >= 2 &&
-               shape.tuple_shapes(0).IsTuple();
-      };
-      // Verify operand/resulting shapes
-      if (opcode == HloOpcode::kAsyncUpdate ||
-          opcode == HloOpcode::kAsyncDone) {
-        if (operands.size() != 1 ||
-            !is_async_shape_correct(operands[0]->shape())) {
-          TokenError(
-              "AsyncUpdate and AsyncDone expect a single operand in the form "
-              "of ((async-operands), async-outputs, state).");
-          return nullptr;
-        }
-      }
-      if (opcode == HloOpcode::kAsyncStart ||
-          opcode == HloOpcode::kAsyncUpdate) {
-        if (!is_async_shape_correct(*shape)) {
+      if (opcode == HloOpcode::kAsyncStart) {
+        if (!shape->IsTuple() || shape->tuple_shapes().size() < 2 ||
+            !shape->tuple_shapes(0).IsTuple()) {
           TokenError(
-              "AsyncStart and AsyncUpdate expect the op shape to be in the "
-              "form of "
+              "AsyncStart expects the op shape to be in the form of "
               "((async-operands), async-outputs, state).");
           return nullptr;
         }
@@ -2099,17 +2083,20 @@ HloInstruction* HloParserImpl::CreateInstruction(  // NOLINT
       // previous async op.
       if (opcode == HloOpcode::kAsyncUpdate ||
           opcode == HloOpcode::kAsyncDone) {
-        if (operands.size() != 1 ||
-            !is_async_shape_correct(operands[0]->shape())) {
+        if (operands.size() != 1 || !operands[0]->IsAsynchronous() ||
+            operands[0]->opcode() == HloOpcode::kAsyncDone) {
           TokenError(
-              "AsyncUpdate and AsyncDone expect a single operand in the form "
-              "of ((async-operands), async-outputs, state).");
+              "AsyncUpdate and AsyncDone expect a single async op as their "
+              "operand.");
           return nullptr;
         }
-        if (!operands[0]->IsAsynchronous()) {
+      }
+      // For AsyncUpdate, the operand and the result should have the same shape.
+      if (opcode == HloOpcode::kAsyncUpdate) {
+        if (operands[0]->shape() != *shape) {
           TokenError(
-              "AsyncUpdate and AsyncDone expect their operand to be the "
-              "previous async op.");
+              "AsyncUpdate expects the op shape to be the same as the operand "
+              "shape.");
           return nullptr;
         }
       }
diff --git a/third_party/xla/xla/hlo/parser/hlo_parser_test.cc b/third_party/xla/xla/hlo/parser/hlo_parser_test.cc
index 3e33268136e874..2d2cbf31651e68 100644
--- a/third_party/xla/xla/hlo/parser/hlo_parser_test.cc
+++ b/third_party/xla/xla/hlo/parser/hlo_parser_test.cc
@@ -5362,8 +5362,7 @@ ENTRY AsyncStartMissingOperandWrapper {
       ParseAndReturnUnverifiedModule(hlo_string).status(),
       absl_testing::StatusIs(
           tsl::error::INVALID_ARGUMENT,
-          HasSubstr("AsyncStart and AsyncUpdate expect the op shape to be "
-                    "in the form of "
+          HasSubstr("AsyncStart expects the op shape to be in the form of "
                     "((async-operands), async-outputs, state).")));
 }
 
@@ -5385,11 +5384,9 @@ ENTRY AsyncUpdateMissingOperandWrapper {
   )";
   EXPECT_THAT(
       ParseAndReturnUnverifiedModule(hlo_string).status(),
-      absl_testing::StatusIs(
-          tsl::error::INVALID_ARGUMENT,
-          HasSubstr("AsyncStart and AsyncUpdate expect the op shape to be "
-                    "in the form of "
-                    "((async-operands), async-outputs, state).")));
+      absl_testing::StatusIs(tsl::error::INVALID_ARGUMENT,
+                             HasSubstr("AsyncUpdate expects the op shape to be "
+                                       "the same as the operand shape.")));
 }
 
 TEST_F(HloParserTest, AsyncOpTupleWrongType) {
@@ -5411,8 +5408,7 @@ ENTRY AsyncStartAndAsyncDone {
       ParseAndReturnUnverifiedModule(hlo_string).status(),
       absl_testing::StatusIs(
           tsl::error::INVALID_ARGUMENT,
-          HasSubstr("AsyncStart and AsyncUpdate expect the op shape to be "
-                    "in the form of "
+          HasSubstr("AsyncStart expects the op shape to be in the form of "
                     "((async-operands), async-outputs, state).")));
 }
 
@@ -5429,10 +5425,9 @@ ENTRY AsyncStartAndAsyncDone {
   )";
   EXPECT_THAT(
       ParseAndReturnUnverifiedModule(hlo_string).status(),
-      absl_testing::StatusIs(
-          tsl::error::INVALID_ARGUMENT,
-          HasSubstr("AsyncUpdate and AsyncDone expect their operand to be "
-                    "the previous async op.")));
+      absl_testing::StatusIs(tsl::error::INVALID_ARGUMENT,
+                             HasSubstr("AsyncUpdate and AsyncDone expect a "
+                                       "single async op as their operand.")));
 }
 
 TEST_F(HloParserTest, AsyncUpdateAndAsyncDoneNoAsyncStart) {
@@ -5449,10 +5444,9 @@ ENTRY AsyncStartAndAsyncDone {
   )";
   EXPECT_THAT(
       ParseAndReturnUnverifiedModule(hlo_string).status(),
-      absl_testing::StatusIs(
-          tsl::error::INVALID_ARGUMENT,
-          HasSubstr("AsyncUpdate and AsyncDone expect their operand to be "
-                    "the previous async op.")));
+      absl_testing::StatusIs(tsl::error::INVALID_ARGUMENT,
+                             HasSubstr("AsyncUpdate and AsyncDone expect a "
+                                       "single async op as their operand.")));
 }
 
 TEST_F(HloParserTest, AsyncUpdateWithSyntaxSugarWrongOp) {
diff --git a/third_party/xla/xla/service/hlo_verifier_test.cc b/third_party/xla/xla/service/hlo_verifier_test.cc
index 24d69e257b8d2a..99010d48c72fa7 100644
--- a/third_party/xla/xla/service/hlo_verifier_test.cc
+++ b/third_party/xla/xla/service/hlo_verifier_test.cc
@@ -1431,33 +1431,6 @@ TEST_F(HloVerifierTest, AsyncDoneOutputWrongType) {
                         "async shape at index {1}"));
 }
 
-TEST_F(HloVerifierTest, AsyncUpdateWrongType) {
-  const char* const hlo_string = R"(
-  HloModule Module
-
-  async_computation {
-    p = f32[2,3] parameter(0)
-    ROOT custom-call = f32[3,2] custom-call(p), custom_call_target="foo"
-  }
-
-  ENTRY AsyncStartAndAsyncDone {
-    p0 = f32[2,3] parameter(0)
-    async-start = ((f32[2,3]), f32[3,2], u32[]) async-start(p0), calls=async_computation
-    async-update = ((f32[3,2]), f32[3,2], u32[]) async-update(async-start), calls=async_computation
-    ROOT async-done = f32[3,2] async-done(async-update), calls=async_computation
-  }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
-
-  auto status = verifier().Run(module.get()).status();
-  ASSERT_FALSE(status.ok());
-  EXPECT_THAT(
-      status.message(),
-      HasSubstr(
-          "async-update expects the shape of operand and output to match"));
-}
-
 TEST_F(HloVerifierTest, AsyncOpComputationNotTrivial) {
   const char* const hlo_string = R"(
   HloModule Module

From d5035162934d95bc6f1961a4abff11567ea44ea5 Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Wed, 10 Dec 2025 21:21:22 -0800
Subject: [PATCH 158/753] [Utilities][ReplicaGroupV3] Refactor
 GetPerGroupCollectiveOpsCreator to reduce cognitive complexity, and make it
 easier to refactor for V3 support.

PiperOrigin-RevId: 843032534
---
 .../xla/service/spmd/spmd_partitioner_util.cc | 299 ++++++++++--------
 1 file changed, 173 insertions(+), 126 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
index 9b7c68b257a358..cac9a5766d88ac 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
@@ -545,6 +545,153 @@ std::optional<IotaReplicaGroupList> ExpandDeviceGroupsWithMeshAxes(
       device_groups, partition_group_list->ToIotaReplicaGroupList());
 }
 
+// Lambdas for creating SPMDCollectiveOps functions.
+decltype(SPMDCollectiveOpsCreator::create_cross_partition_all_reduce)
+CreateCrossPartitionAllReduce(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
+             const std::vector<std::vector<int64_t>>& partition_subgroups,
+             int64_t channel_id) {
+    return creator.create_cross_partition_all_reduce(
+        b, operand, reduction,
+        ExpandDeviceGroups(*device_groups_ptr, partition_subgroups),
+        channel_id);
+  };
+}
+
+decltype(SPMDCollectiveOpsCreator::
+             create_cross_partition_all_reduce_with_iota_device_list)
+CreateCrossPartitionAllReduceWithIotaDeviceList(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
+             const IotaReplicaGroupList& partition_group_list,
+             int64_t channel_id) {
+    // Try to expand the device group list, but if this fails fallback
+    // to creating collective with list of list of integers representation.
+    std::optional<IotaReplicaGroupList> expanded_iota_partition_group_list =
+        ExpandDeviceGroupsWithIota(*device_groups_ptr, partition_group_list);
+    if (!expanded_iota_partition_group_list.has_value()) {
+      return creator.create_cross_partition_all_reduce(
+          b, operand, reduction,
+          ExpandDeviceGroups(*device_groups_ptr,
+                             partition_group_list.flattened_replica_groups()),
+          channel_id);
+    }
+    return creator.create_cross_partition_all_reduce_with_iota_device_list(
+        b, operand, reduction, *expanded_iota_partition_group_list, channel_id);
+  };
+}
+
+decltype(SPMDCollectiveOpsCreator::create_cross_partition_collective_permute)
+CreateCrossPartitionCollectivePermute(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, HloInstruction* operand,
+             std::vector<std::pair<int64_t, int64_t>>& src_dst_pairs,
+             int64_t next_channel_id) {
+    std::vector<std::pair<int64_t, int64_t>> expanded_pairs(
+        src_dst_pairs.size() * device_groups_ptr->num_groups());
+    for (int64_t g = 0; g < device_groups_ptr->num_groups(); ++g) {
+      for (int64_t i = 0; i < src_dst_pairs.size(); ++i) {
+        expanded_pairs[g * src_dst_pairs.size() + i] =
+            std::pair<int64_t, int64_t>{
+                device_groups_ptr->array()(g, src_dst_pairs[i].first),
+                device_groups_ptr->array()(g, src_dst_pairs[i].second)};
+      }
+    }
+    return creator.create_cross_partition_collective_permute(
+        b, operand, expanded_pairs, next_channel_id);
+  };
+}
+
+decltype(SPMDCollectiveOpsCreator::create_cross_partition_all_to_all)
+CreateCrossPartitionAllToAll(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
+             const std::vector<std::vector<int64_t>>& partition_subgroups,
+             int64_t channel_id, std::optional<int64_t> split_dimension) {
+    return creator.create_cross_partition_all_to_all(
+        b, operands,
+        ExpandDeviceGroups(*device_groups_ptr, partition_subgroups), channel_id,
+        split_dimension);
+  };
+}
+
+decltype(SPMDCollectiveOpsCreator::
+             create_cross_partition_all_to_all_with_iota_device_list)
+CreateCrossPartitionAllToAllWithIotaDeviceList(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
+             const IotaReplicaGroupList& partition_group_list,
+             int64_t channel_id, std::optional<int64_t> split_dimension) {
+    // Try to expand the partition group list, but if this fails fallback
+    // to creating collective with list of list of integers representation.
+    std::optional<IotaReplicaGroupList> expanded_iota_partition_group_list =
+        ExpandDeviceGroupsWithIota(*device_groups_ptr, partition_group_list);
+    if (!expanded_iota_partition_group_list.has_value()) {
+      return creator.create_cross_partition_all_to_all(
+          b, operands,
+          ExpandDeviceGroups(*device_groups_ptr,
+                             partition_group_list.flattened_replica_groups()),
+          channel_id, split_dimension);
+    }
+    return creator.create_cross_partition_all_to_all_with_iota_device_list(
+        b, operands, *expanded_iota_partition_group_list, channel_id,
+        split_dimension);
+  };
+}
+
+decltype(SPMDCollectiveOpsCreator::create_cross_partition_all_gather)
+CreateCrossPartitionAllGather(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
+             const std::vector<std::vector<int64_t>>& partition_subgroups,
+             int64_t channel_id, int64_t all_gather_dimension) {
+    return creator.create_cross_partition_all_gather(
+        b, operand, ag_shape,
+        ExpandDeviceGroups(*device_groups_ptr, partition_subgroups), channel_id,
+        all_gather_dimension);
+  };
+}
+
+decltype(SPMDCollectiveOpsCreator::
+             create_cross_partition_all_gather_with_iota_device_list)
+CreateCrossPartitionAllGatherWithIotaDeviceList(
+    const SPMDCollectiveOpsCreator& creator,
+    std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
+  return [creator, device_groups_ptr](
+             SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
+             const IotaReplicaGroupList& partition_group_list,
+             int64_t channel_id, int64_t all_gather_dimension) {
+    // Try to expand the device group list, but if this fails fallback
+    // to creating collective with list of list of integers
+    // representation.
+    std::optional<IotaReplicaGroupList> expanded_iota_partition_group_list =
+        ExpandDeviceGroupsWithIota(*device_groups_ptr, partition_group_list);
+    if (!expanded_iota_partition_group_list.has_value()) {
+      return creator.create_cross_partition_all_gather(
+          b, operand, ag_shape,
+          ExpandDeviceGroups(*device_groups_ptr,
+                             partition_group_list.flattened_replica_groups()),
+          channel_id, all_gather_dimension);
+    }
+    return creator.create_cross_partition_all_gather_with_iota_device_list(
+        b, operand, ag_shape, *expanded_iota_partition_group_list, channel_id,
+        all_gather_dimension);
+  };
+}
+
 SPMDCollectiveOpsCreator GetPerGroupCollectiveOpsCreator(
     const SPMDCollectiveOpsCreator& creator,
     const DeviceGroupTileAssignment& device_groups) {
@@ -568,124 +715,25 @@ SPMDCollectiveOpsCreator GetPerGroupCollectiveOpsCreator(
                                  *device_groups_ptr, b);
   };
   result.create_cross_partition_all_reduce =
-      [creator, device_groups_ptr](
-          SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
-          const std::vector<std::vector<int64_t>>& partition_subgroups,
-          int64_t channel_id) {
-        return creator.create_cross_partition_all_reduce(
-            b, operand, reduction,
-            ExpandDeviceGroups(*device_groups_ptr, partition_subgroups),
-            channel_id);
-      };
+      CreateCrossPartitionAllReduce(creator, device_groups_ptr);
   result.create_cross_partition_all_reduce_with_iota_device_list =
-      [creator, device_groups_ptr](
-          SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
-          const IotaReplicaGroupList& partition_group_list,
-          int64_t channel_id) {
-        // Try to expand the device group list, but if this fails fallback
-        // to creating collective with list of list of integers representation.
-        std::optional<IotaReplicaGroupList> expanded_iota_partition_group_list =
-            ExpandDeviceGroupsWithIota(*device_groups_ptr,
-                                       partition_group_list);
-        if (!expanded_iota_partition_group_list.has_value()) {
-          return creator.create_cross_partition_all_reduce(
-              b, operand, reduction,
-              ExpandDeviceGroups(
-                  *device_groups_ptr,
-                  partition_group_list.flattened_replica_groups()),
-              channel_id);
-        }
-        return creator.create_cross_partition_all_reduce_with_iota_device_list(
-            b, operand, reduction, *expanded_iota_partition_group_list,
-            channel_id);
-      };
+      CreateCrossPartitionAllReduceWithIotaDeviceList(creator,
+                                                      device_groups_ptr);
   result.create_cross_partition_collective_permute =
-      [creator, device_groups_ptr](
-          SpmdBuilder* b, HloInstruction* operand,
-          std::vector<std::pair<int64_t, int64_t>>& src_dst_pairs,
-          int64_t next_channel_id) {
-        std::vector<std::pair<int64_t, int64_t>> expanded_pairs(
-            src_dst_pairs.size() * device_groups_ptr->num_groups());
-        for (int64_t g = 0; g < device_groups_ptr->num_groups(); ++g) {
-          for (int64_t i = 0; i < src_dst_pairs.size(); ++i) {
-            expanded_pairs[g * src_dst_pairs.size() + i] =
-                std::pair<int64_t, int64_t>{
-                    device_groups_ptr->array()(g, src_dst_pairs[i].first),
-                    device_groups_ptr->array()(g, src_dst_pairs[i].second)};
-          }
-        }
-        return creator.create_cross_partition_collective_permute(
-            b, operand, expanded_pairs, next_channel_id);
-      };
+      CreateCrossPartitionCollectivePermute(creator, device_groups_ptr);
   result.create_cross_partition_all_to_all =
-      [creator, device_groups_ptr](
-          SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-          const std::vector<std::vector<int64_t>>& partition_subgroups,
-          int64_t channel_id, std::optional<int64_t> split_dimension) {
-        return creator.create_cross_partition_all_to_all(
-            b, operands,
-            ExpandDeviceGroups(*device_groups_ptr, partition_subgroups),
-            channel_id, split_dimension);
-      };
+      CreateCrossPartitionAllToAll(creator, device_groups_ptr);
   result.create_cross_partition_all_to_all_with_iota_device_list =
-      [creator, device_groups_ptr](
-          SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-          const IotaReplicaGroupList& partition_group_list, int64_t channel_id,
-          std::optional<int64_t> split_dimension) {
-        // Try to expand the partition group list, but if this fails fallback
-        // to creating collective with list of list of integers representation.
-        std::optional<IotaReplicaGroupList> expanded_iota_partition_group_list =
-            ExpandDeviceGroupsWithIota(*device_groups_ptr,
-                                       partition_group_list);
-        if (!expanded_iota_partition_group_list.has_value()) {
-          return creator.create_cross_partition_all_to_all(
-              b, operands,
-              ExpandDeviceGroups(
-                  *device_groups_ptr,
-                  partition_group_list.flattened_replica_groups()),
-              channel_id, split_dimension);
-        }
-        return creator.create_cross_partition_all_to_all_with_iota_device_list(
-            b, operands, *expanded_iota_partition_group_list, channel_id,
-            split_dimension);
-      };
+      CreateCrossPartitionAllToAllWithIotaDeviceList(creator,
+                                                     device_groups_ptr);
   if (creator.create_cross_partition_all_gather) {
     result.create_cross_partition_all_gather =
-        [creator, device_groups_ptr](
-            SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-            const std::vector<std::vector<int64_t>>& partition_subgroups,
-            int64_t channel_id, int64_t all_gather_dimension) {
-          return creator.create_cross_partition_all_gather(
-              b, operand, ag_shape,
-              ExpandDeviceGroups(*device_groups_ptr, partition_subgroups),
-              channel_id, all_gather_dimension);
-        };
+        CreateCrossPartitionAllGather(creator, device_groups_ptr);
   }
   if (creator.create_cross_partition_all_gather_with_iota_device_list) {
     result.create_cross_partition_all_gather_with_iota_device_list =
-        [creator, device_groups_ptr](
-            SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-            const IotaReplicaGroupList& partition_group_list,
-            int64_t channel_id, int64_t all_gather_dimension) {
-          // Try to expand the device group list, but if this fails fallback
-          // to creating collective with list of list of integers
-          // representation.
-          std::optional<IotaReplicaGroupList>
-              expanded_iota_partition_group_list = ExpandDeviceGroupsWithIota(
-                  *device_groups_ptr, partition_group_list);
-          if (!expanded_iota_partition_group_list.has_value()) {
-            return creator.create_cross_partition_all_gather(
-                b, operand, ag_shape,
-                ExpandDeviceGroups(
-                    *device_groups_ptr,
-                    partition_group_list.flattened_replica_groups()),
-                channel_id, all_gather_dimension);
-          }
-          return creator
-              .create_cross_partition_all_gather_with_iota_device_list(
-                  b, operand, ag_shape, *expanded_iota_partition_group_list,
-                  channel_id, all_gather_dimension);
-        };
+        CreateCrossPartitionAllGatherWithIotaDeviceList(creator,
+                                                        device_groups_ptr);
   }
   return result;
 }
@@ -2902,23 +2950,22 @@ std::vector<std::vector<int64_t>> GetPartitionGroupsAcrossTargetDims(
       group_sizes.begin(), group_sizes.end(), 1, std::multiplies<int64_t>());
   std::vector<std::vector<int64_t>> groups(sharding.num_devices() /
                                            total_group_size);
-  sharding.tile_assignment().Each(
-      [&](absl::Span<const int64_t> indices, int64_t device) {
-        int64_t group_id = 0;
-        for (int64_t dim = 0; dim < indices.size(); ++dim) {
-          if (auto it = absl::c_find(target_dims, dim);
-              it != target_dims.end()) {
-            int64_t group_size =
-                group_sizes[std::distance(target_dims.begin(), it)];
-            group_id *= sharding.dimension(dim) / group_size;
-            group_id += indices[dim] / group_size;
-          } else {
-            group_id *= sharding.dimension(dim);
-            group_id += indices[dim];
-          }
-        }
-        groups[group_id].push_back(device);
-      });
+  sharding.tile_assignment().Each([&](absl::Span<const int64_t> indices,
+                                      int64_t device) {
+    int64_t group_id = 0;
+    for (int64_t dim = 0; dim < indices.size(); ++dim) {
+      if (auto it = absl::c_find(target_dims, dim); it != target_dims.end()) {
+        int64_t group_size =
+            group_sizes[std::distance(target_dims.begin(), it)];
+        group_id *= sharding.dimension(dim) / group_size;
+        group_id += indices[dim] / group_size;
+      } else {
+        group_id *= sharding.dimension(dim);
+        group_id += indices[dim];
+      }
+    }
+    groups[group_id].push_back(device);
+  });
   return groups;
 }
 

From e4f5e50efb3708eee252250132c9501689f58801 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 22:14:24 -0800
Subject: [PATCH 159/753] Automated Code Change

PiperOrigin-RevId: 843048586
---
 tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc  | 2 ++
 .../c/experimental/ops/gen/cpp/renderers/guard_renderer.cc     | 1 +
 .../c/experimental/ops/gen/cpp/renderers/include_renderer.cc   | 2 ++
 .../c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc | 2 ++
 tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc | 1 +
 tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc    | 3 +++
 .../c/experimental/ops/gen/cpp/renderers/renderer_test.cc      | 2 ++
 7 files changed, 13 insertions(+)

diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
index 7c8231a71133f5..cd4e0af1ec8454 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h"
 
+#include <string>
+
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
index 50db08df1db988..b3d33c379549b5 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/guard_renderer.h"
 
 #include <algorithm>
+#include <string>
 
 #include "tensorflow/c/experimental/ops/gen/common/case_format.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc
index 0ec8108bee7aaf..5aea065a45dffc 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/include_renderer.h"
 
+#include <string>
+
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/path.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc
index b490cc7fe9e86a..96f317f6201286 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/namespace_renderer.h"
 
+#include <string>
+
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/platform/types.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc
index 63cb5f30eb1d9d..766adae9a558a1 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/op_renderer.h"
 
 #include <iterator>
+#include <string>
 #include <vector>
 
 #include "absl/strings/str_cat.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc
index 6a608d759a3753..5acf000cd71169 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
 
+#include <string>
+
 #include "absl/log/log.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_context.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc
index 6621d1aea2c217..cdcbad089a556e 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/renderers/renderer_test.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/renderer.h"
 
+#include <string>
+
 #include "tensorflow/c/experimental/ops/gen/common/path_config.h"
 #include "tensorflow/c/experimental/ops/gen/common/source_code.h"
 #include "tensorflow/c/experimental/ops/gen/cpp/renderers/cpp_config.h"

From ac49bbd684c0a1446d53eaaee4739e7780ecebfe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 10 Dec 2025 23:26:19 -0800
Subject: [PATCH 160/753] Automated Code Change

PiperOrigin-RevId: 843070119
---
 tensorflow/cc/saved_model/experimental/tests/BUILD               | 1 +
 .../cc/saved_model/experimental/tests/saved_model_api_test.cc    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/cc/saved_model/experimental/tests/BUILD b/tensorflow/cc/saved_model/experimental/tests/BUILD
index 3270ca916e14a0..995f2a18d6979b 100644
--- a/tensorflow/cc/saved_model/experimental/tests/BUILD
+++ b/tensorflow/cc/saved_model/experimental/tests/BUILD
@@ -23,5 +23,6 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings:string_view",
     ],
 )
diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc
index ac85bd728cb7e4..baa3b6be991076 100644
--- a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc
+++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 #include <unordered_set>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/c/tf_status.h"
 #include "tensorflow/cc/experimental/base/public/runtime.h"
 #include "tensorflow/cc/experimental/base/public/runtime_builder.h"

From 78fb2c83b1ed0499651bd7239e6b1499e7072067 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 00:04:56 -0800
Subject: [PATCH 161/753] Automated Code Change

PiperOrigin-RevId: 843080851
---
 tensorflow/core/graph/regularization/simple_delete_test.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/graph/regularization/simple_delete_test.cc b/tensorflow/core/graph/regularization/simple_delete_test.cc
index 2eac003707755f..424c0384823cb8 100644
--- a/tensorflow/core/graph/regularization/simple_delete_test.cc
+++ b/tensorflow/core/graph/regularization/simple_delete_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/graph/regularization/simple_delete.h"
 
+#include <cstdint>
 #include <string>
 
 #include "absl/status/statusor.h"

From e1ab7b41a28747f544144be4f928c3af2d9deecb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 00:23:19 -0800
Subject: [PATCH 162/753] Automated Code Change

PiperOrigin-RevId: 843086902
---
 third_party/xla/xla/tools/BUILD                         | 1 +
 third_party/xla/xla/tools/hlo_module_loader.cc          | 1 +
 third_party/xla/xla/tools/matmul_perf_table_gen_test.cc | 1 -
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 60993b0f7d19ab..f6dfc5b93ba278 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -387,6 +387,7 @@ cc_library(
         "//xla/hlo/translate/stablehlo_to_hlo:translate",
         "//xla/service:hlo_module_config",
         "//xla/service:hlo_proto_cc",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
diff --git a/third_party/xla/xla/tools/hlo_module_loader.cc b/third_party/xla/xla/tools/hlo_module_loader.cc
index d805b45d9426ec..96ea37ed9353f4 100644
--- a/third_party/xla/xla/tools/hlo_module_loader.cc
+++ b/third_party/xla/xla/tools/hlo_module_loader.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
diff --git a/third_party/xla/xla/tools/matmul_perf_table_gen_test.cc b/third_party/xla/xla/tools/matmul_perf_table_gen_test.cc
index 7aa3d6863d752e..aa11c8fe86b2bd 100644
--- a/third_party/xla/xla/tools/matmul_perf_table_gen_test.cc
+++ b/third_party/xla/xla/tools/matmul_perf_table_gen_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "xla/tools/matmul_perf_table_gen.h"
 
 #include <cstdint>
-#include <variant>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>

From c0f76388d0edaa36dd721bdbb8ed588087493eba Mon Sep 17 00:00:00 2001
From: Terry Sun <tesun@nvidia.com>
Date: Thu, 11 Dec 2025 00:35:07 -0800
Subject: [PATCH 163/753] PR #34143: [GPU] Add all-to-all support to S-curve
 model
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34143

📝 Summary of Changes
Added all-to-all support to S-curve model.

🎯 Justification
S-curve model doesn't support all-to-all, fallback may lead to bad performance, benchmarking justified that the added all-to-all model can improve performance for models with cross-NVL domain all-to-all.

🚀 Kind of Contribution
⚡️ Performance Improvement/✨ New Feature

📊 Benchmark (for Performance Improvements)
| Branch | End-to-end execution time mean on mixtral_8x7b_bf16_2x8 |
| :------- | :------: |
| main     | 1128328 us   |
| terryysun/a2a_s_curve (this branch)   | 1009397 us |

Speedup over main: 11.78%.

🧪 Unit Tests:
Added exact-matching unit tests to guard the estimation value.

🧪 Execution Tests:
Added execution tests to guard the comm-compute overlapping behavior.

Copybara import of the project:

--
794ef568fe9fcc0f6b4571f19e2a6ce6e06d0099 by Terry Sun <tesun@nvidia.com>:

s-curve a2a support

--
4f85dae4e688af0e6b1f0f5ff1aa0bfef052f15f by Terry Sun <tesun@nvidia.com>:

fix buffer size calculation

--
1dc94f78cd73ec3f0784b6b2db795a608468cdc7 by Terry Sun <tesun@nvidia.com>:

add LHS test

--
56aef84b36c2bc99bf39562fa868398240ae79c3 by Terry Sun <tesun@nvidia.com>:

add model dispatching test

--
d20ed933cbd97d56f1664bbea1b8d35f9092146e by Terry Sun <tesun@nvidia.com>:

fix merge issue

--
f09474bc513804097956e15a7684f1299bef4173 by Terry Sun <tesun@nvidia.com>:

rephase doc string

Merging this change closes #34143

PiperOrigin-RevId: 843090023
---
 .../gpu/gpu_latency_hiding_scheduler_test.cc  | 46 ++++++++++++
 .../service/gpu/model/sol_gpu_cost_model.cc   | 21 ++++++
 .../service/gpu/model/sol_gpu_cost_model.h    | 13 +++-
 .../gpu/model/sol_gpu_cost_model_test.cc      | 70 ++++++++++++++-----
 .../gpu/model/sol_latency_estimator.cc        | 17 ++++-
 .../gpu/model/sol_latency_estimator_test.cc   | 51 +++++++++++++-
 6 files changed, 195 insertions(+), 23 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
index 28eabcb1cd7680..397e1c85737bf2 100644
--- a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
@@ -408,6 +408,52 @@ TEST_F(GpuLatencyHidingSchedulerBaseTest,
                   GetIndexByName(instruction_sequence, "rs_1"));
 }
 
+TEST_F(GpuLatencyHidingSchedulerBaseTest,
+       AllToAllAndGemmOverlapWithSolCostModel) {
+  // Verify SoL cost model successfully enables all-to-all overlap with compute.
+  absl::string_view kHloModule = R"(
+    HloModule m, replica_count=16
+
+    async_a2a {
+      param = f32[2048,2048] parameter(0)
+      ROOT a2a_inner = f32[2048,2048] all-to-all(param), dimensions={0},
+        replica_groups={{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}}
+    }
+
+    ENTRY main {
+      lhs = f32[8192,8192] parameter(0)
+      rhs = f32[8192,8192] parameter(1)
+      comm = f32[2048,2048] parameter(2)
+      compute = f32[8192,8192] dot(lhs, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+      a2a = ((f32[2048,2048]), f32[2048,2048]) async-start(comm), calls=async_a2a
+      a2a_done = f32[2048,2048] async-done(a2a)
+      ROOT tuple = (f32[2048,2048], f32[8192,8192]) tuple(a2a_done, compute)
+    }
+  )";
+
+  auto config = GetModuleConfig("");
+  DebugOptions& debug_options = config.mutable_debug_options();
+  debug_options.set_xla_gpu_enable_latency_hiding_scheduler(true);
+  debug_options.set_xla_gpu_enable_analytical_sol_latency_estimator(true);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(kHloModule, config));
+  auto scheduled = ScheduleModule(module.get(), /*num_parallel_resources=*/1);
+  TF_ASSERT_OK(scheduled.status());
+
+  const auto& sequence = scheduled.value()
+                             ->schedule()
+                             .sequence(module->entry_computation())
+                             .instructions();
+  int64_t a2a_idx = GetIndexByName(sequence, "a2a");
+  int64_t compute_idx = GetIndexByName(sequence, "compute");
+  int64_t a2a_done_idx = GetIndexByName(sequence, "a2a_done");
+
+  // Check that overlap occurs: a2a < compute < a2a_done
+  EXPECT_LT(a2a_idx, compute_idx);
+  EXPECT_LT(compute_idx, a2a_done_idx);
+}
+
 TEST_F(GpuLatencyHidingSchedulerBaseTest,
        OverlappingRanksPreventOverlappingCollectives) {
   // TODO TJ re-enable this test when the multi-streamed
diff --git a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.cc b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.cc
index 05417dbb997dc3..01f7df1228dd53 100644
--- a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.cc
@@ -224,6 +224,27 @@ absl::StatusOr<absl::Duration> SolGPUCostModel::RingLatency(
   return ret + xla_flag_config_.nccl_op_launch_time;
 }
 
+absl::StatusOr<absl::Duration> SolGPUCostModel::AllToAllLatency(
+    const int64_t buff_size_bytes, const int num_nodes,
+    const int num_communicators) const {
+  TF_ASSIGN_OR_RETURN(
+      int num_gpus,
+      NumGpusPerComm(num_nodes, SolGPUCostModel::CollectiveType::kAllToAll,
+                     num_communicators));
+
+  const int num_gpus_per_node = num_gpus / num_nodes;
+  // Each GPU sends to (num_gpus_per_node * (num_nodes-1)) peers off-node.
+  const int inter_node_peers_per_gpu = num_gpus_per_node * (num_nodes - 1);
+  // Sending buff_size_bytes / (num_gpus - 1) bytes to each peer off-node.
+  const int64_t per_peer_bytes = buff_size_bytes / (num_gpus - 1);
+  absl::Duration per_peer_duration = TransferDuration(per_peer_bytes) +
+                                     ChunkPrepLatency(per_peer_bytes) +
+                                     xla_flag_config_.rtt;
+  absl::Duration total = inter_node_peers_per_gpu * per_peer_duration;
+
+  return total + xla_flag_config_.nccl_op_launch_time;
+}
+
 // Helper functions
 absl::StatusOr<int> SolGPUCostModel::NumGpusPerComm(
     int num_nodes, const CollectiveType& coll_type,
diff --git a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.h b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.h
index 0634f118c528cd..21cf4bd32f9cc7 100644
--- a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.h
+++ b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model.h
@@ -50,8 +50,9 @@ class SolGPUCostModel {
   };
 
   enum class CollectiveType {
-    kAllReduce,
     kAllGather,
+    kAllReduce,
+    kAllToAll,
     kReduceScatter,
     kSendRecv,
   };
@@ -73,6 +74,16 @@ class SolGPUCostModel {
                                              const CollectiveType& coll_type,
                                              int num_communicators) const;
 
+  // Returns the latency of an AllToAll collective across multiple nodes.
+  //
+  // `buff_size_bytes`: the size of the message to be transferred.
+  // `num_nodes`: the number of nodes participating in the all-to-all.
+  // `num_communicators`: the number of communicators participating in the
+  // all-to-all.
+  absl::StatusOr<absl::Duration> AllToAllLatency(int64_t buff_size_bytes,
+                                                 int num_nodes,
+                                                 int num_communicators) const;
+
  private:
   // Helper functions to estimate the latency subcomponents
   absl::Duration ChunkPrepLatency(int64_t per_gpu_msg_size_bytes) const;
diff --git a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_test.cc b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_test.cc
index 3adfda9b07671d..7b778e0f682b61 100644
--- a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_test.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_test.cc
@@ -27,17 +27,18 @@ limitations under the License.
 namespace xla::gpu {
 namespace {
 
-constexpr int64_t kTenMB = 10 * 1024 * 1024;  // 10MB
+constexpr int64_t kEightMB = 8 * 1024 * 1024;  // 8MB
 
 using ::testing::TestWithParam;
 using ::testing::ValuesIn;
 
-struct RingLatencyTestCase {
+struct LatencyTestCase {
   SolGPUCostModel::CollectiveType collective_type;
+  int num_nodes;
   absl::Duration expected_latency;
 };
 
-class SolGPUCostModelTest : public TestWithParam<RingLatencyTestCase> {
+class SolGPUCostModelTest : public TestWithParam<LatencyTestCase> {
  protected:
   SolGPUCostModelTest()
       : model_({
@@ -45,30 +46,61 @@ class SolGPUCostModelTest : public TestWithParam<RingLatencyTestCase> {
             /*nic_speed_gbps=*/100,
             /*chunk_prep_time=*/absl::Microseconds(100),
             /*rtt=*/absl::Microseconds(100),
-            /*gpus_per_node=*/100,
+            /*gpus_per_node=*/8,
             /*chunk_size_bytes=*/4 * 1024 * 1024,
         }) {}
   SolGPUCostModel model_;
 };
 
-TEST_P(SolGPUCostModelTest, TestRingLatency) {
-  const RingLatencyTestCase& test_case = GetParam();
-  absl::Duration actual_latency =
-      absl::Trunc(*model_.RingLatency(kTenMB, 1, test_case.collective_type,
-                                      /*num_communicators=*/1),
-                  absl::Microseconds(1));
+TEST_P(SolGPUCostModelTest, TestLatency) {
+  const LatencyTestCase& test_case = GetParam();
+  absl::Duration actual_latency;
+  if (test_case.collective_type == SolGPUCostModel::CollectiveType::kAllToAll) {
+    actual_latency =
+        absl::Trunc(*model_.AllToAllLatency(kEightMB, test_case.num_nodes,
+                                            /*num_communicators=*/1),
+                    absl::Microseconds(1));
+  } else {
+    actual_latency =
+        absl::Trunc(*model_.RingLatency(kEightMB, test_case.num_nodes,
+                                        test_case.collective_type,
+                                        /*num_communicators=*/1),
+                    absl::Microseconds(1));
+  }
   EXPECT_EQ(actual_latency, test_case.expected_latency);
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    SolGPUCostModelTests, SolGPUCostModelTest,
-    ValuesIn<RingLatencyTestCase>({
-        {SolGPUCostModel::CollectiveType::kAllGather, absl::Microseconds(299)},
-        {SolGPUCostModel::CollectiveType::kAllReduce, absl::Microseconds(498)},
-        {SolGPUCostModel::CollectiveType::kReduceScatter,
-         absl::Microseconds(299)},
-        {SolGPUCostModel::CollectiveType::kSendRecv, absl::Microseconds(353)},
-    }));
+INSTANTIATE_TEST_SUITE_P(SolGPUCostModelTests, SolGPUCostModelTest,
+                         ValuesIn<LatencyTestCase>({
+                             {SolGPUCostModel::CollectiveType::kAllGather,
+                              /*num_nodes=*/1, absl::Microseconds(284)},
+                             {SolGPUCostModel::CollectiveType::kAllGather,
+                              /*num_nodes=*/2, absl::Microseconds(485)},
+                             {SolGPUCostModel::CollectiveType::kAllGather,
+                              /*num_nodes=*/4, absl::Microseconds(885)},
+                             {SolGPUCostModel::CollectiveType::kAllReduce,
+                              /*num_nodes=*/1, absl::Microseconds(468)},
+                             {SolGPUCostModel::CollectiveType::kAllReduce,
+                              /*num_nodes=*/2, absl::Microseconds(870)},
+                             {SolGPUCostModel::CollectiveType::kAllReduce,
+                              /*num_nodes=*/4, absl::Microseconds(1670)},
+                             {SolGPUCostModel::CollectiveType::kReduceScatter,
+                              /*num_nodes=*/1, absl::Microseconds(284)},
+                             {SolGPUCostModel::CollectiveType::kReduceScatter,
+                              /*num_nodes=*/2, absl::Microseconds(485)},
+                             {SolGPUCostModel::CollectiveType::kReduceScatter,
+                              /*num_nodes=*/4, absl::Microseconds(885)},
+                             {SolGPUCostModel::CollectiveType::kSendRecv,
+                              /*num_nodes=*/1, absl::Microseconds(292)},
+                             {SolGPUCostModel::CollectiveType::kSendRecv,
+                              /*num_nodes=*/2, absl::Microseconds(485)},
+                             {SolGPUCostModel::CollectiveType::kAllToAll,
+                              /*num_nodes=*/1, absl::Microseconds(100)},
+                             {SolGPUCostModel::CollectiveType::kAllToAll,
+                              /*num_nodes=*/2, absl::Microseconds(1745)},
+                             {SolGPUCostModel::CollectiveType::kAllToAll,
+                              /*num_nodes=*/4, absl::Microseconds(4966)},
+                         }));
 
 TEST(SolGPUCostModelGetConfigTest, ConfigForHopper) {
   constexpr absl::string_view kDummyModule = R"(
diff --git a/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc b/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc
index 72bb3f5033896e..8fdcdb8232159d 100644
--- a/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc
@@ -61,7 +61,7 @@ using ::mlir::MLIRContext;
 bool IsSupportedCollectiveOp(const HloInstruction& instr) {
   return HloPredicateIsOp<HloOpcode::kAllReduceStart, HloOpcode::kAllReduce,
                           HloOpcode::kReduceScatter, HloOpcode::kAllGatherStart,
-                          HloOpcode::kAllGather>(&instr);
+                          HloOpcode::kAllGather, HloOpcode::kAllToAll>(&instr);
 }
 
 bool IsHostOffloaded(const HloInstruction& instr) {
@@ -127,6 +127,14 @@ absl::StatusOr<absl::Duration> DCNCollectiveDuration(
       result += runtime;
       break;
     }
+    case HloOpcode::kAllToAll: {
+      TF_ASSIGN_OR_RETURN(
+          absl::Duration runtime,
+          sol_model.AllToAllLatency(msg_size, num_participating_hosts,
+                                    num_communicators));
+      result += runtime;
+      break;
+    }
     case HloOpcode::kAllReduce:
     case HloOpcode::kAllReduceStart: {
       result += gpu_performance_model.Get()
@@ -165,6 +173,13 @@ absl::StatusOr<absl::Duration> DCNCollectiveDuration(
                                 num_communicators));
         result += runtime;
       }
+      if (instr.async_wrapped_opcode() == HloOpcode::kAllToAll) {
+        TF_ASSIGN_OR_RETURN(
+            absl::Duration runtime,
+            sol_model.AllToAllLatency(msg_size, num_participating_hosts,
+                                      num_communicators));
+        result += runtime;
+      }
       break;
     }
     case HloOpcode::kRecv:
diff --git a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
index 4ecd0253fee7ab..088099eb468ee2 100644
--- a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/literal_util.h"
-#include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/model/collective_interpolator.h"
 #include "xla/service/gpu/model/sol_gpu_cost_model.h"
@@ -45,7 +44,6 @@ limitations under the License.
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
-#include "xla/xla_data.pb.h"
 
 namespace xla::gpu {
 namespace {
@@ -553,6 +551,55 @@ INSTANTIATE_TEST_SUITE_P(SolLatencyEstimatorTests, SolLatencyEstimatorTest,
                            return info.param.test_name;
                          });
 
+TEST_F(HloHardwareIndependentTestBase, CollectiveCostModelDispatching) {
+  const auto shape_size_fn = HloCostAnalysis::DefaultShapeSize;
+  const auto gpu_info = TestGpuDeviceInfo::RTXH100SXMDeviceInfo();
+  const SolGPUCostModel::Config sol_flags = {
+      absl::Microseconds(100), 100, absl::Microseconds(100),
+      absl::Microseconds(100), 8,   4 * 1024 * 1024};
+  mlir::MLIRContext mlir_ctx;
+  auto interpolator =
+      *CollectiveInterpolator::Create(sol_flags.gpus_per_node, gpu_info,
+                                      /*analysis=*/nullptr);
+
+  // NVLink domain collective should use CollectiveInterpolator.
+  TF_ASSERT_OK_AND_ASSIGN(auto nvl_module, ParseAndReturnVerifiedModule(R"(
+HloModule m, num_partitions=16
+ENTRY main {
+  p = bf16[8,16000,1000] parameter(0)
+  ROOT a2a = bf16[8,16000,1000] all-to-all(p),
+    replica_groups={{0,1,2,3,4,5,6,7},{8,9,10,11,12,13,14,15}},
+    channel_id=1, dimensions={0}
+})"));
+  HloInstruction* nvl_instr = hlo_query::FindInstruction(
+      nvl_module->entry_computation(), HloOpcode::kAllToAll);
+  EXPECT_FALSE(SolLatencyEstimator::ComputeCollectiveTime(
+                   *nvl_instr, gpu_info, shape_size_fn, sol_flags, &mlir_ctx,
+                   /*collective_interpolator=*/nullptr)
+                   .ok());
+  EXPECT_TRUE(SolLatencyEstimator::ComputeCollectiveTime(
+                  *nvl_instr, gpu_info, shape_size_fn, sol_flags, &mlir_ctx,
+                  interpolator.get())
+                  .ok());
+
+  // Cross-partition collective should use S-curve model (world-level across 2
+  // hosts).
+  TF_ASSERT_OK_AND_ASSIGN(auto ib_module, ParseAndReturnVerifiedModule(R"(
+HloModule m, num_partitions=16
+ENTRY main {
+  p = bf16[16,16000,1000] parameter(0)
+  ROOT a2a = bf16[16,16000,1000] all-to-all(p),
+    replica_groups={{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}},
+    channel_id=1, dimensions={0}
+})"));
+  HloInstruction* ib_instr = hlo_query::FindInstruction(
+      ib_module->entry_computation(), HloOpcode::kAllToAll);
+  EXPECT_TRUE(SolLatencyEstimator::ComputeCollectiveTime(
+                  *ib_instr, gpu_info, shape_size_fn, sol_flags, &mlir_ctx,
+                  /*collective_interpolator=*/nullptr)
+                  .ok());
+}
+
 class IsSolLatencyEstimatorEnabledTest : public HloTestBase {
  protected:
   IsSolLatencyEstimatorEnabledTest()

From fe87a77274ede90f62f422628926939eec9ef5be Mon Sep 17 00:00:00 2001
From: Alex <alexandros.theodoridis@amd.com>
Date: Thu, 11 Dec 2025 00:36:28 -0800
Subject: [PATCH 164/753] PR #35108: [ROCm] Fix crashes in hermetic build tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35108

📝 Summary of Changes
Fixing crash due to conflicting llvm in rocm vs llvm-project symbols

🎯 Justification
new versions of rocm are linked agains an internal version of llvm
the symbols of this library are hidden behind the libamd_comgr_loader.so library.
It is wrong to link libamd_comgr.so directoy to any xla target. Instead we have to link
libamd_comgr_loader.so and a specific stub file. This will ensure that internal
rocm llvm is loaded dynamically and calls are propagated to it through the loader.

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
Not relevant

🧪 Unit Tests:
Existing u-tests in hermetic build with rocm 7.10.0

🧪 Execution Tests:
Rocm CI hermetic tests + //xla/service/gpu:gpu_offloading_test_amdgpu_any locally

Copybara import of the project:

--
bec3e66474a0a611f69265fdbf6f672566ad3a15 by Alexandros Theodoridis <atheodor@amd.com>:

Fix crashes in hermetic build tests

Merging this change closes #35108

PiperOrigin-RevId: 843090382
---
 .../xla/third_party/gpus/rocm/BUILD.tpl       | 52 +++++++++++++------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/third_party/gpus/rocm/BUILD.tpl b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
index a9f510d7e0c7aa..d3b6f87a4adf18 100644
--- a/third_party/xla/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
@@ -116,12 +116,8 @@ cc_library(
         ":rocsolver",
         ":rocsparse",
         ":roctracer",
-    ] + select_threshold(
-        above_or_eq = [":hipfft"],
-        below = [":rocfft"],
-        threshold = 40100,
-        value = rocm_version_number(),
-    ),
+        ":hipfft",
+    ],
 )
 
 cc_library(
@@ -535,8 +531,9 @@ cc_library(
 )
 
 cc_library(
-    name = "amd_comgr",
+    name = "amd_comgr_dynamic",
     hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]),
+    srcs = ["%{rocm_root}/lib/libamd_comgr_stub.a"],
     data = glob([
         "%{rocm_root}/lib/libamd_comgr_loader.so*",
         "%{rocm_root}/lib/libamd_comgr.so*",
@@ -546,17 +543,27 @@ cc_library(
     includes = [
         "%{rocm_root}/include",
     ],
-    linkopts = select({
-        ":build_hermetic": [
-            "-lamd_comgr_loader",
-            "-lamd_comgr",
-        ],
-        "//conditions:default": [
-            "-lamd_comgr",
-        ],
-    }),
+    linkopts = ["-lamd_comgr_loader"],
+    strip_include_prefix = "%{rocm_root}",
+    deps = [
+        ":rocm_config",
+        ":rocm_rpath",
+        ":system_libs",
+    ],
+)
+
+cc_library(
+    name = "amd_comgr_static",
+    hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]),
+    data = glob([
+        "%{rocm_root}/lib/libamd_comgr.so*",
+    ]),
+    include_prefix = "rocm",
+    includes = [
+        "%{rocm_root}/include",
+    ],
+    linkopts = ["-lamd_comgr"],
     strip_include_prefix = "%{rocm_root}",
-    visibility = ["//visibility:public"],
     deps = [
         ":rocm_config",
         ":rocm_rpath",
@@ -564,6 +571,17 @@ cc_library(
     ],
 )
 
+alias(
+    name = "amd_comgr",
+    actual = select_threshold(
+        above_or_eq = ":amd_comgr_dynamic",
+        below = ":amd_comgr_static",
+        threshold = 71000,
+        value = rocm_version_number(),
+    ),
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "rocm_smi",
     srcs = glob([

From 87ca8553797086ca912f34153ad4707b60d3463c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 01:03:40 -0800
Subject: [PATCH 165/753] Update GraphDef version to 2438.

PiperOrigin-RevId: 843098878
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 71f77a1df57898..58dbf3272f4164 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2437  // Updated: 2025/12/10
+#define TF_GRAPH_DEF_VERSION 2438  // Updated: 2025/12/11
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 4f4ef39cdad82f3a00a40620acf18a029c1bd28f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 01:03:43 -0800
Subject: [PATCH 166/753] compat: Update forward compatibility horizon to
 2025-12-11

PiperOrigin-RevId: 843098900
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 8f5d395630c3fe..638340af389d5e 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 10)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 11)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 5441dfdbc89d3b6989777f22624a526a124e4ccd Mon Sep 17 00:00:00 2001
From: Chun-nien Chan <cnchan@google.com>
Date: Thu, 11 Dec 2025 01:36:39 -0800
Subject: [PATCH 167/753] Add extenral_buffer attribute to tfl.external_const

PiperOrigin-RevId: 843109775
---
 .../compiler/mlir/lite/flatbuffer_export.cc   | 108 ++++++++++++---
 .../compiler/mlir/lite/flatbuffer_import.cc   | 125 +++++++++++++-----
 .../compiler/mlir/lite/ir/tfl_op_enums.td     |  12 ++
 tensorflow/compiler/mlir/lite/ir/tfl_ops.td   |  10 +-
 .../flatbuffer2mlir/external_buffer.mlir      |  14 ++
 .../mlir2flatbuffer/external_buffer.mlir      |  34 +++++
 6 files changed, 247 insertions(+), 56 deletions(-)
 create mode 100644 tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir
 create mode 100644 tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir

diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
index 41dffc228a6b2c..67eef87eb872ad 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc
@@ -269,7 +269,7 @@ static StatusOr<tflite::TensorType> GetTFLiteType(Type type,
 static bool IsConst(Operation* op) {
   return isa<mlir::func::ConstantOp, mlir::arith::ConstantOp, mlir::TF::ConstOp,
              tfl::ConstOp, tfl::QConstOp, tfl::SparseConstOp,
-             tfl::SparseQConstOp, mlir::TFL::NoValueOp,
+             tfl::ExternalConstOp, tfl::SparseQConstOp, mlir::TFL::NoValueOp,
              mlir::stablehlo::ConstantOp, mlir::vhlo::ConstantOpV1>(op);
 }
 
@@ -632,6 +632,12 @@ class Translator {
   std::optional<BufferOffset<tflite::Buffer>> BuildBuffer(
       Value value, bool can_be_deduplicated, int& index);
 
+  // Builds external buffer and external buffer group from the given value. If
+  // the value is not defined by a constant op with external buffer attributes,
+  // returns std::nullopt.
+  std::optional<BufferOffset<tflite::ExternalBuffer>> BuildExternalBuffer(
+      Value value, uint32_t external_buffer_id);
+
   // Build TFLite tensor from the given type. This function is for tfl.lstm
   // intermediates, which should have UniformQuantizedType.
   std::optional<BufferOffset<tflite::Tensor>> BuildTensorFromType(
@@ -647,6 +653,7 @@ class Translator {
   // corresponding buffer. Emits error and returns std::nullopt on failure.
   std::optional<BufferOffset<tflite::Tensor>> BuildTensor(
       Value value, const std::string& name, unsigned buffer_idx,
+      unsigned external_buffer_id,
       const std::optional<BufferOffset<tflite::QuantizationParameters>>&
           quant_parameters);
 
@@ -858,6 +865,13 @@ class Translator {
   BufferOffset<tflite::Buffer> empty_buffer_;
 
   std::vector<BufferOffset<tflite::Buffer>> buffers_;
+
+  // External buffers
+  std::vector<BufferOffset<tflite::ExternalBuffer>> external_buffers_;
+  std::vector<BufferOffset<tflite::ExternalBufferGroup>>
+      external_buffer_groups_;
+  absl::flat_hash_map<std::string, uint32_t> external_buffer_group_map_;
+
   // Maps subgraph index and tensor name in the graph to the tensor index.
   absl::flat_hash_map<int, absl::flat_hash_map<std::string, int>>
       tensor_index_map_;
@@ -986,6 +1000,44 @@ std::string Translator::UniqueName(mlir::Value val) {
   return std::string(name_mapper_.GetUniqueName(val));
 }
 
+std::optional<BufferOffset<tflite::ExternalBuffer>>
+Translator::BuildExternalBuffer(mlir::Value value,
+                                uint32_t external_buffer_id) {
+  if (value.getDefiningOp() == nullptr) {
+    return std::nullopt;
+  }
+  auto inst = mlir::dyn_cast<tfl::ExternalConstOp>(value.getDefiningOp());
+  if (!inst) {
+    return std::nullopt;
+  }
+  auto meta = inst.getExternalBufferAttr();
+  if (!meta) {
+    return std::nullopt;
+  }
+
+  std::string group_name = meta.getGroupName().str();
+  uint64_t offset = meta.getOffset();
+  uint64_t length = meta.getLength();
+  std::string packing = meta.getPacking().str();
+
+  uint32_t group_index = 0;
+  if (auto it = external_buffer_group_map_.find(group_name);
+      it != external_buffer_group_map_.end()) {
+    group_index = it->second;
+  } else {
+    int index = external_buffer_groups_.size();
+    external_buffer_groups_.push_back(tflite::CreateExternalBufferGroup(
+        builder_, builder_.CreateString(group_name)));
+    external_buffer_group_map_[group_name] = index;
+    group_index = index;
+  }
+
+  auto external_buffer = tflite::CreateExternalBuffer(
+      builder_, external_buffer_id, group_index, offset, length,
+      builder_.CreateString(packing));
+  return external_buffer;
+}
+
 std::optional<BufferOffset<tflite::Buffer>> Translator::BuildBuffer(
     mlir::Value value, bool can_be_deduplicated, int& index) {
   can_be_deduplicated = can_be_deduplicated && !disable_buffer_deduping_;
@@ -1241,11 +1293,13 @@ std::optional<BufferOffset<tflite::Tensor>> Translator::BuildTensorFromType(
       /*buffer=*/0, builder_.CreateString(name), q_params,
       /*is_variable=*/false, /*sparsity=*/0, /*shape_signature=*/0,
       /*has_rank=*/tensor_type.hasRank(),
-      variant_params->empty() ? 0 : builder_.CreateVector(*variant_params));
+      variant_params->empty() ? 0 : builder_.CreateVector(*variant_params),
+      /*external_buffer=*/0);
 }
 
 std::optional<BufferOffset<tflite::Tensor>> Translator::BuildTensor(
     Value value, const std::string& name, unsigned buffer_idx,
+    unsigned external_buffer_id,
     const std::optional<BufferOffset<tflite::QuantizationParameters>>&
         quant_parameters) {
   auto type = mlir::cast<TensorType>(value.getType());
@@ -1371,7 +1425,8 @@ std::optional<BufferOffset<tflite::Tensor>> Translator::BuildTensor(
         (is_variable ? 0 : buffer_idx), builder_.CreateString(name), q_params,
         /*is_variable=*/is_variable, s_params, /*shape_signature=*/0,
         /*has_rank=*/has_rank,
-        variant_params->empty() ? 0 : builder_.CreateVector(*variant_params));
+        variant_params->empty() ? 0 : builder_.CreateVector(*variant_params),
+        external_buffer_id);
   } else {
     return tflite::CreateTensor(
         builder_, builder_.CreateVector(shape), tflite_element_type,
@@ -1379,7 +1434,8 @@ std::optional<BufferOffset<tflite::Tensor>> Translator::BuildTensor(
         /*is_variable=*/is_variable, s_params,
         /*shape_signature=*/builder_.CreateVector(shape_signature),
         /*has_rank=*/has_rank,
-        variant_params->empty() ? 0 : builder_.CreateVector(*variant_params));
+        variant_params->empty() ? 0 : builder_.CreateVector(*variant_params),
+        external_buffer_id);
   }
 }
 
@@ -3292,27 +3348,41 @@ std::optional<BufferOffset<tflite::SubGraph>> Translator::BuildSubGraph(
       }
     }
 
+    // External buffer id is enforced to have MSB set to 1 to distinguish from
+    // buffer index/id, with the assumption that the number of external buffers
+    // are less than 2^31.
+    uint32_t external_buffer_id =
+        (1 << 31) | static_cast<uint32_t>(external_buffers_.size());
     int buffer_index = buffers_.size();
-    // If a constant is returned as subgraph's output, this constant cannot be
-    // deduplicated.
-    const bool not_returned_by_subgraph = llvm::none_of(
-        value.getUsers(),
-        [](Operation* user) { return llvm::isa<mlir::func::ReturnOp>(user); });
+
     // TODO(ashwinm): Check if for stateful tensors, if it is also needed to
     // make the Buffer empty apart from setting the buffer_idx=0 in the
     // Tensor. This does not seem to affect runtime behavior for RNN/LSTM,
     // but would be good for reducing memory footprint.
-    if (value.getDefiningOp()) {
+    if (auto external_buffer_or =
+            BuildExternalBuffer(value, external_buffer_id);
+        external_buffer_or.has_value()) {
+      buffer_index = 0;
+      external_buffers_.push_back(*external_buffer_or);
+    } else if (value.getDefiningOp()) {
+      // If a constant is returned as subgraph's output, this constant cannot be
+      // deduplicated.
+      const bool not_returned_by_subgraph =
+          llvm::none_of(value.getUsers(), [](Operation* user) {
+            return llvm::isa<mlir::func::ReturnOp>(user);
+          });
       auto buffer_or =
           BuildBuffer(value, not_returned_by_subgraph, buffer_index);
       if (!buffer_or) return false;
+      external_buffer_id = 0;
       buffers_.push_back(*buffer_or);
     } else {
+      external_buffer_id = 0;
       buffers_.push_back(empty_buffer_);
     }
 
-    auto tensor_or =
-        BuildTensor(value, tensor_name, buffer_index, quant_parameters);
+    auto tensor_or = BuildTensor(value, tensor_name, buffer_index,
+                                 external_buffer_id, quant_parameters);
     if (!tensor_or) return false;
     tensors.push_back(*tensor_or);
 
@@ -4192,11 +4262,15 @@ std::optional<std::string> Translator::TranslateInternal() {
   }
   auto signature_defs = CreateSignatureDefs(signature_defs_vec);
 
-  auto model = tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION,
-                                   builder_.CreateVector(opcodes_),
-                                   builder_.CreateVector(subgraphs_),
-                                   description, builder_.CreateVector(buffers_),
-                                   metadata_buffer, *metadata, *signature_defs);
+  bool has_external_buffers = !external_buffers_.empty();
+  auto model = tflite::CreateModel(
+      builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(opcodes_),
+      builder_.CreateVector(subgraphs_), description,
+      builder_.CreateVector(buffers_), metadata_buffer, *metadata,
+      *signature_defs,
+      has_external_buffers ? builder_.CreateVector(external_buffer_groups_) : 0,
+      has_external_buffers ? builder_.CreateVector(external_buffers_) : 0);
+
   tflite::FinishModelBuffer(builder_, model);
   // There is a limit of 2GB for a flatbuffer.
   bool flatbuffer_limit_exceeded = builder_.GetSize() > flatbuffer_size_max;
diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
index 4bd8ae5ce0dbb3..19aae278c33178 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
@@ -457,9 +457,9 @@ std::string GetMlirOpName(const tflite::OperatorT& op,
   return mlir::GetMlirOpNameFromOpCode(op_code);
 }
 
-StatusOr<Operation*> BuildExternalConstOp(const tflite::TensorT& tensor,
-                                          int32_t buffer_index,
-                                          OpBuilder builder, Location loc) {
+StatusOr<Operation*> BuildExternalConstOpWithBufferIndex(
+    const tflite::TensorT& tensor, int32_t buffer_index, OpBuilder builder,
+    Location loc) {
   TF_ASSIGN_OR_RETURN(mlir::TensorType type,
                       tfl::GetTensorType(tensor, builder,
                                          /*is_constant=*/true));
@@ -468,7 +468,45 @@ StatusOr<Operation*> BuildExternalConstOp(const tflite::TensorT& tensor,
     return errors::Internal("Constant doesn't have a shape");
   }
   auto op = builder.create<tfl::ExternalConstOp>(
-      loc, shaped_type, builder.getI32IntegerAttr(buffer_index));
+      loc, shaped_type,
+      /*buffer_index=*/builder.getI32IntegerAttr(buffer_index),
+      /*external_buffer=*/nullptr);
+  return op.getOperation();
+}
+
+StatusOr<Operation*> BuildExternalConstOpWithExternalBuffer(
+    const tflite::ModelT& model, const tflite::TensorT& tensor,
+    OpBuilder builder, Location loc) {
+  TF_ASSIGN_OR_RETURN(mlir::TensorType type,
+                      tfl::GetTensorType(tensor, builder,
+                                         /*is_constant=*/true));
+  auto shaped_type = llvm::dyn_cast<mlir::RankedTensorType>(type);
+  if (!shaped_type) {
+    return errors::Internal("Constant doesn't have a shape");
+  }
+
+  tflite::ExternalBufferT* external_buffer = nullptr;
+  for (const auto& extbuf : model.external_buffers) {
+    if (extbuf->id == tensor.external_buffer) {
+      external_buffer = extbuf.get();
+      break;
+    }
+  }
+  if (external_buffer == nullptr) {
+    return errors::Internal("External buffer not found");
+  }
+
+  std::string group_name =
+      model.external_buffer_groups[external_buffer->group]->name;
+  auto op = builder.create<tfl::ExternalConstOp>(
+      loc, shaped_type, /*buffer_index=*/nullptr,
+      /*external_buffer=*/
+      tfl::ExternalBufferAttr::get(
+          builder.getContext(),
+          /*group_name=*/builder.getStringAttr(group_name),
+          /*offset=*/external_buffer->offset,
+          /*length=*/external_buffer->length,
+          /*packing=*/builder.getStringAttr(external_buffer->packing)));
   return op.getOperation();
 }
 
@@ -1347,7 +1385,8 @@ mlir::ResultRange MaybeWrapInControlNode(mlir::Operation* op,
 // ordered_output_arrays in the same order. If signature is not null, then the
 // inputs/outputs in signature will be attached to the FuncOp.
 StatusOr<FuncOp> ConvertSubgraph(
-    const tflite::SubGraphT& subgraph, llvm::StringRef name,
+    const tflite::ModelT& model, const tflite::SubGraphT& subgraph,
+    llvm::StringRef name,
     const std::vector<std::unique_ptr<tflite::OperatorCodeT>>& op_codes,
     const std::vector<std::string>& func_names,
     const std::vector<std::unique_ptr<tflite::BufferT>>& buffers,
@@ -1511,22 +1550,30 @@ StatusOr<FuncOp> ConvertSubgraph(
         StatusOr<Operation*> op_or_err;
         std::vector<uint8_t> buffer;
         // Check if constant tensor is stored outside of the flatbuffers.
-        if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) {
-          const uint8_t* file_begin_ptr =
-              reinterpret_cast<const uint8_t*>(model_ptr->allocation()->base());
-          buffer = std::vector<uint8_t>(
-              file_begin_ptr + buffers[const_tensor.buffer]->offset,
-              file_begin_ptr + buffers[const_tensor.buffer]->offset +
-                  buffers[const_tensor.buffer]->size);
+        if (const_tensor.external_buffer != 0) {
+          op_or_err = BuildExternalConstOpWithExternalBuffer(
+              model, const_tensor, op_builder, const_loc);
         } else {
-          buffer = buffers[const_tensor.buffer]->data;
+          if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) {
+            const uint8_t* file_begin_ptr = reinterpret_cast<const uint8_t*>(
+                model_ptr->allocation()->base());
+
+            buffer = std::vector<uint8_t>(
+                file_begin_ptr + buffers[const_tensor.buffer]->offset,
+                file_begin_ptr + buffers[const_tensor.buffer]->offset +
+                    buffers[const_tensor.buffer]->size);
+          } else {
+            buffer = buffers[const_tensor.buffer]->data;
+          }
+          op_or_err =
+              use_external_constant
+                  ? BuildExternalConstOpWithBufferIndex(const_tensor,
+                                                        const_tensor.buffer,
+                                                        op_builder, const_loc)
+                  : BuildConstOp(const_tensor, buffer, const_tensor.is_variable,
+                                 op_builder, const_loc, use_stablehlo_constant);
         }
-        op_or_err =
-            use_external_constant
-                ? BuildExternalConstOp(const_tensor, const_tensor.buffer,
-                                       op_builder, const_loc)
-                : BuildConstOp(const_tensor, buffer, const_tensor.is_variable,
-                               op_builder, const_loc, use_stablehlo_constant);
+
         if (!op_or_err.ok()) {
           return emitError(const_loc, op_or_err.status().ToString()),
                  op_or_err.status();
@@ -1584,23 +1631,29 @@ StatusOr<FuncOp> ConvertSubgraph(
       StatusOr<Operation*> op_or_err;
       std::vector<uint8_t> buffer;
       // Check if constant tensor is stored outside of the flatbuffers.
-      if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) {
-        const uint8_t* file_begin_ptr =
-            reinterpret_cast<const uint8_t*>(model_ptr->allocation()->base());
-
-        buffer = std::vector<uint8_t>(
-            file_begin_ptr + buffers[const_tensor.buffer]->offset,
-            file_begin_ptr + buffers[const_tensor.buffer]->offset +
-                buffers[const_tensor.buffer]->size);
+      if (const_tensor.external_buffer != 0) {
+        op_or_err = BuildExternalConstOpWithExternalBuffer(
+            model, const_tensor, op_builder, const_loc);
       } else {
-        buffer = buffers[const_tensor.buffer]->data;
+        if (IsValidBufferOffset(buffers[const_tensor.buffer]->offset)) {
+          const uint8_t* file_begin_ptr =
+              reinterpret_cast<const uint8_t*>(model_ptr->allocation()->base());
+
+          buffer = std::vector<uint8_t>(
+              file_begin_ptr + buffers[const_tensor.buffer]->offset,
+              file_begin_ptr + buffers[const_tensor.buffer]->offset +
+                  buffers[const_tensor.buffer]->size);
+        } else {
+          buffer = buffers[const_tensor.buffer]->data;
+        }
+        op_or_err =
+            use_external_constant
+                ? BuildExternalConstOpWithBufferIndex(
+                      const_tensor, const_tensor.buffer, op_builder, const_loc)
+                : BuildConstOp(const_tensor, buffer, const_tensor.is_variable,
+                               op_builder, const_loc, use_stablehlo_constant);
       }
-      op_or_err =
-          use_external_constant
-              ? BuildExternalConstOp(const_tensor, const_tensor.buffer,
-                                     op_builder, const_loc)
-              : BuildConstOp(const_tensor, buffer, const_tensor.is_variable,
-                             op_builder, const_loc, use_stablehlo_constant);
+
       if (!op_or_err.ok()) {
         return emitError(const_loc, op_or_err.status().ToString()),
                op_or_err.status();
@@ -1862,8 +1915,8 @@ OwningOpRef<mlir::ModuleOp> tflite::FlatBufferToMlir(
         SubgraphName(set_implicit_main_func, e.index(), *subgraph);
     uint32_t subgraph_index = static_cast<uint32_t>(e.index());
     auto func_or_error = ConvertSubgraph(
-        *subgraph, name, model->operator_codes, func_names, model->buffers,
-        base_loc, builder,
+        *model, *subgraph, name, model->operator_codes, func_names,
+        model->buffers, base_loc, builder,
         /*is_entry_point=*/
         set_implicit_main_func
             ? e.index() == 0
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td b/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td
index 57e4ec22976df3..6fa287a8c8b013 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_op_enums.td
@@ -166,4 +166,16 @@ def TFL_ConstBytesAttr : AttrDef<TFL_Dialect, "ConstBytes"> {
   let hasCustomAssemblyFormat = 1;
 }
 
+def TFL_ExternalBufferAttr : AttrDef<TFL_Dialect, "ExternalBuffer"> {
+  let mnemonic = "external_buffer";
+  let parameters = (ins
+      "::mlir::StringAttr":$group_name,
+      "uint64_t":$offset,
+      "uint64_t":$length,
+      "::mlir::StringAttr":$packing
+  );
+  let summary = "Flatbuffer external buffer metadata.";
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
 #endif // TFL_OP_ENUMS
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index c90859cd6accfe..4c7e784d5069fd 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -951,11 +951,15 @@ def TFL_ExternalConstOp : Op<TFL_Dialect, "external_const", [
   let summary = "External const op.";
 
   let description = [{
-    External const op holds a `buffer_index` which points to a constant
-    in the flatbuffer.
+    External const op that can hold :
+    - `buffer_index` which points to a constant in the flatbuffer.
+    - `external_buffer` which contains metadata for external buffer outside flatbuffer.
   }];
 
-  let arguments = (ins I32Attr:$buffer_index);
+  let arguments = (ins
+    OptionalAttr<I32Attr>:$buffer_index,
+    OptionalAttr<TFL_ExternalBufferAttr>:$external_buffer
+  );
 
   let results = (outs AnyTensor:$output);
 }
diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir
new file mode 100644
index 00000000000000..987f5a90e374f5
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/external_buffer.mlir
@@ -0,0 +1,14 @@
+// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck %s
+
+module {
+  func.func public @main(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
+    %0 = "tfl.external_const"() <{external_buffer = #tfl.external_buffer<group_name = "test.bin", offset = 0, length = 13, packing = "unpacked">}> : () -> tensor<2x2xf32>
+    %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<2x2xf32>
+    return %1 : tensor<2x2xf32>
+  }
+}
+
+// CHECK-LABEL: @main
+// CHECK:      %0 = "tfl.external_const"() <{external_buffer = #tfl.external_buffer<group_name = "test.bin", offset = 0, length = 13, packing = "unpacked">}>
+// CHECK-NEXT: %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<2x2xf32>
+// CHECK-NEXT: return %1
diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir
new file mode 100644
index 00000000000000..09d7e764b1f7a2
--- /dev/null
+++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/external_buffer.mlir
@@ -0,0 +1,34 @@
+// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s
+
+module {
+  func.func public @main(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> {
+    %0 = "tfl.external_const"() <{external_buffer = #tfl.external_buffer<group_name = "test.bin", offset = 0, length = 13, packing = "unpacked">}> : () -> tensor<2x2xf32>
+    %1 = tfl.add %arg0, %0 {fused_activation_function = "NONE"} : tensor<2x2xf32>
+    return %1 : tensor<2x2xf32>
+  }
+}
+
+// CHECK:  tensors: [ {
+// CHECK:    shape: [ 2, 2 ],
+// CHECK:    buffer: 1,
+// CHECK:    name: "arg0",
+// CHECK:    has_rank: true
+// CHECK:  }, {
+// CHECK:    shape: [ 2, 2 ],
+// CHECK:    name: "tfl.external_const",
+// CHECK:    has_rank: true,
+// CHECK:    external_buffer: 2147483648
+// CHECK:  }, {
+// CHECK:    shape: [ 2, 2 ],
+// CHECK:    buffer: 2,
+// CHECK:    name: "tfl.add",
+// CHECK:    has_rank: true
+// CHECK:  } ],
+// CHECK:  external_buffer_groups: [ {
+// CHECK:    name: "test.bin"
+// CHECK:  } ],
+// CHECK:  external_buffers: [ {
+// CHECK:    id: 2147483648,
+// CHECK:    length: 13,
+// CHECK:    packing: "unpacked"
+// CHECK:  } ]

From 1fa905e7f633db4c5a96e01a00e1204f49714f71 Mon Sep 17 00:00:00 2001
From: Yun Peng <pcloudy@google.com>
Date: Thu, 11 Dec 2025 02:22:48 -0800
Subject: [PATCH 168/753] Remove unused dependency on
 absl/base/internal/endian.h.

The include and build dependency for absl/base/internal/endian.h are no longer required by hlo_evaluator.cc.

PiperOrigin-RevId: 843125098
---
 third_party/xla/xla/hlo/evaluator/BUILD            | 1 -
 third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc | 1 -
 2 files changed, 2 deletions(-)

diff --git a/third_party/xla/xla/hlo/evaluator/BUILD b/third_party/xla/xla/hlo/evaluator/BUILD
index c24f1ca6ce17e0..379b128680313e 100644
--- a/third_party/xla/xla/hlo/evaluator/BUILD
+++ b/third_party/xla/xla/hlo/evaluator/BUILD
@@ -89,7 +89,6 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/base:endian",
         "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
diff --git a/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc b/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
index ce763d545a5dfa..cea61cf467cb32 100644
--- a/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
+++ b/third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include <vector>
 
 #include "absl/algorithm/container.h"
-#include "absl/base/internal/endian.h"
 #include "absl/cleanup/cleanup.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"

From 229b6a8eb359a5f24563b5864457341aad9a34f4 Mon Sep 17 00:00:00 2001
From: Ville Vesilehto <ville@vesilehto.fi>
Date: Thu, 11 Dec 2025 03:12:40 -0800
Subject: [PATCH 169/753] PR #35049: refactor: use Literal::Make() in
 TextLiteralReader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35049

📝 Summary of Changes

Replace deprecated `Literal(shape)` constructor with `Literal::Make(shape)` which returns StatusOr and handles allocation failure gracefully.

🎯 Justification

The `Literal(shape)` constructor is deprecated as seen in `literal.h`:

```cpp
  ABSL_DEPRECATED(
      "This ctor may crash if allocation fails. Use Literal::Make() instead.")
  explicit Literal(
      const Shape& shape, bool allocate_arrays = true,
      ArrayValueState leaf_array_value_state = ArrayValueState::kKnown);
```

🚀 Kind of Contribution
Please remove what does not apply: ♻️ Cleanup

📊 Benchmark (for Performance Improvements)
Not applicable.

🧪 Unit Tests:
Not applicable.

🧪 Execution Tests:
Not applicable.

Copybara import of the project:

--
d2ea47c670f9e25cc4ac42190cabe60304c31d52 by Ville Vesilehto <ville@vesilehto.fi>:

refactor: use Literal::Make() in TextLiteralReader

Replace deprecated Literal(shape) constructor with Literal::Make()
which returns StatusOr and handles allocation failure gracefully.

Signed-off-by: Ville Vesilehto <ville@vesilehto.fi>

Merging this change closes #35049

PiperOrigin-RevId: 843140989
---
 third_party/xla/xla/text_literal_reader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/text_literal_reader.cc b/third_party/xla/xla/text_literal_reader.cc
index 8acf1307c678eb..d5fde2a80aa0f3 100644
--- a/third_party/xla/xla/text_literal_reader.cc
+++ b/third_party/xla/xla/text_literal_reader.cc
@@ -90,7 +90,7 @@ absl::StatusOr<Literal> TextLiteralReader::ReadAllLines() {
         ShapeUtil::HumanString(shape));
   }
 
-  Literal result(shape);
+  TF_ASSIGN_OR_RETURN(Literal result, Literal::Make(shape));
   const float fill = std::numeric_limits<float>::quiet_NaN();
   result.PopulateWithValue<float>(fill);
   std::vector<absl::string_view> pieces;

From 4fb9488700235410453bf977977f35d31ba7b38b Mon Sep 17 00:00:00 2001
From: spiao <Songlin.Piao@amd.com>
Date: Thu, 11 Dec 2025 03:12:55 -0800
Subject: [PATCH 170/753] PR #35026: [ROCm] fixed
 TritonFusionNumericsVerifierTest.VerifyThatDisablingTritonIsFast on rocm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35026

The original warp size 32 leads to the following error message: "INTERNAL: Failed to launch ROCm kernel: triton_softmax with block dimensions: 2048x1x1: HIP_ERROR_InvalidValue".

32 warps × 64 threads/wavefront = 2048 threads (exceeds 1024 limit)

For comparison on NVIDIA:
32 warps × 32 threads/warp = 1024 threads (within limit)

This test should be a general test for all the platforms.

@xla-rotation could you review my PR, please?
Copybara import of the project:

--
26edce6666fd55e7ad965aa677d7721140c60739 by Songlin Piao <Songlin.Piao@amd.com>:

adapt the num_warps so that the hlo could be compiled on both amd and nvidia

Merging this change closes #35026

PiperOrigin-RevId: 843141066
---
 .../gpu/transforms/triton_fusion_numerics_verifier_test.cc      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
index dae6b73eba50a6..c0492c26cdac1c 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
@@ -517,7 +517,7 @@ ENTRY main {
       "kind":"__triton",
       "block_level_fusion_config":{
         "output_tiles":[{"sizes":["1","1","1","16384"]}],
-        "num_warps":"32",
+        "num_warps":"16",
         "num_ctas":"1",
         "num_stages":"1"}}}
 }

From 8f00104abb79cddd70ba1ba4fd0812a0933b4fbe Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Thu, 11 Dec 2025 03:21:01 -0800
Subject: [PATCH 171/753] Implement `HloSharding::dimensions()` for
 `NamedSharding`

PiperOrigin-RevId: 843143103
---
 third_party/xla/xla/hlo/ir/hlo_sharding.h     |  3 +++
 third_party/xla/xla/hlo/ir/named_sharding.h   | 17 +++++++++++++++-
 .../xla/xla/hlo/ir/named_sharding_test.cc     | 20 +++++++++++++++++++
 3 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_sharding.h b/third_party/xla/xla/hlo/ir/hlo_sharding.h
index 289c93a6640964..02673f27e7ff1e 100644
--- a/third_party/xla/xla/hlo/ir/hlo_sharding.h
+++ b/third_party/xla/xla/hlo/ir/hlo_sharding.h
@@ -506,6 +506,9 @@ class HloSharding {
 
   // Returns all sharding dimensions.
   absl::Span<const int64_t> dimensions() const {
+    if (UseNamedShardingLeaf()) {
+      return named_sharding_->dimensions();
+    }
     return tile_assignment().dimensions();
   }
 
diff --git a/third_party/xla/xla/hlo/ir/named_sharding.h b/third_party/xla/xla/hlo/ir/named_sharding.h
index 35919d7b4befa3..0795df6588a397 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.h
+++ b/third_party/xla/xla/hlo/ir/named_sharding.h
@@ -76,7 +76,12 @@ class NamedSharding {
         dim_shardings_(CanonicalizedDimShardings(dim_shardings)),
         replicated_axes_(replicated_axes.begin(), replicated_axes.end()),
         unreduced_axes_(unreduced_axes.begin(), unreduced_axes.end()),
-        metadata_(metadata.begin(), metadata.end()) {}
+        metadata_(metadata.begin(), metadata.end()) {
+    sharded_sizes_.reserve(dim_shardings_.size());
+    for (const DimensionSharding& dim_sharding : dim_shardings_) {
+      sharded_sizes_.push_back(dim_sharding.getShardedSize(mesh_));
+    }
+  }
 
   const Mesh& mesh() const { return mesh_; }
   absl::Span<const DimensionSharding> dim_shardings() const {
@@ -94,6 +99,9 @@ class NamedSharding {
     return dim_shardings_[dim].getShardedSize(mesh_);
   }
 
+  // Returns all sharding dimensions.
+  absl::Span<const int64_t> dimensions() const { return sharded_sizes_; }
+
   // Returns the total number of devices used by sharding.
   int64_t num_devices() const {
     return mesh_.device_assignment().num_elements();
@@ -154,6 +162,13 @@ class NamedSharding {
   std::vector<AxisRef> replicated_axes_;
   std::vector<AxisRef> unreduced_axes_;
   std::vector<OpMetadata> metadata_;
+
+  // Stores sharded sizes for each dimension. Required to maintain backward
+  // compatibility with existing `HloSharding::dimensions()` implementation
+  // returning a span.
+  // Once we make API change for `HloSharding::dimensions()` to return a vector,
+  // we can remove this field.
+  std::vector<int64_t> sharded_sizes_;
 };
 
 // Contains test only helper functions.
diff --git a/third_party/xla/xla/hlo/ir/named_sharding_test.cc b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
index 22930e733aca5d..19764b7042f3b4 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding_test.cc
+++ b/third_party/xla/xla/hlo/ir/named_sharding_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/hlo/ir/named_sharding.h"
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "xla/hlo/ir/mesh_and_axis.h"
 #include "xla/xla_data.pb.h"
@@ -23,6 +24,7 @@ namespace xla {
 namespace {
 
 using DimensionSharding = NamedSharding::DimensionSharding;
+using ::testing::ElementsAre;
 
 TEST(NamedShardingTest, CanonicalizedDimShardings) {
   Mesh mesh_abcd({2, 4}, {"a", "b"});
@@ -151,6 +153,24 @@ TEST(NamedShardingTest, Dimension) {
   EXPECT_EQ(empty_sharding.num_dimensions(), 0);
 }
 
+TEST(NamedShardingTest, Dimensions) {
+  Mesh mesh({2, 4, 3, 8}, {"a", "b", "c", "d"});
+
+  AxisRef axis_a(0);
+  AxisRef axis_b(1, {2, 2});
+  AxisRef axis_c(2);
+  AxisRef axis_d(3, {4, 2});
+
+  DimensionSharding ds_ab({axis_a, axis_b}, /*is_closed=*/true);
+  DimensionSharding ds_dc({axis_d, axis_c}, /*is_closed=*/true);
+
+  NamedSharding sharding(mesh, /*dim_shardings=*/{ds_ab, ds_dc});
+  EXPECT_THAT(sharding.dimensions(), ElementsAre(2 * 2, 2 * 3));
+
+  NamedSharding empty_sharding(mesh, /*dim_shardings=*/{});
+  EXPECT_THAT(empty_sharding.dimensions(), ElementsAre());
+}
+
 TEST(NamedShardingTest, NumDevices) {
   Mesh mesh({2, 4, 3, 8}, {"a", "b", "c", "d"});
   NamedSharding sharding(mesh, {});

From c42eb74c903537acae5ec48dc0fa09c3a6abad6d Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Thu, 11 Dec 2025 04:01:36 -0800
Subject: [PATCH 172/753] [Autotuner] Fix behavior when cublas is disabled and
 autotune level is set to 0.

PiperOrigin-RevId: 843153465
---
 .../xla/xla/backends/autotuner/autotuner.cc   | 20 ++++++++++---------
 .../xla/backends/autotuner/autotuner_test.cc  |  6 ------
 .../service/gpu/autotuning/autotuner_pass.cc  |  3 ++-
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/backends/autotuner/autotuner.cc b/third_party/xla/xla/backends/autotuner/autotuner.cc
index 2d98a517270eb0..2578a24a614d8f 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner.cc
@@ -332,6 +332,17 @@ absl::StatusOr<Autotuner::Config> Autotuner::TuneBestConfig(
     }
   }
 
+  if (autotune_config_.exclude_cublas_config) {
+    executable_candidates.erase(
+        std::remove_if(executable_candidates.begin(),
+                       executable_candidates.end(),
+                       [](const ExecutableCandidate& candidate) {
+                         return candidate.config.codegen_backend->name() ==
+                                "Cublas_fission";
+                       }),
+        executable_candidates.end());
+  }
+
   if (executable_candidates.empty()) {
     return absl::InternalError(
         absl::StrCat("Autotuner could not compile any configs for HLO: ",
@@ -502,15 +513,6 @@ absl::StatusOr<std::vector<Autotuner::ConfigResult>> Autotuner::ProfileAll(
 
 absl::StatusOr<Autotuner::ConfigResult> Autotuner::PickBestConfig(
     std::vector<ConfigResult>& results) {
-  if (autotune_config_.exclude_cublas_config) {
-    results.erase(
-        std::remove_if(results.begin(), results.end(),
-                       [](const ConfigResult& result) {
-                         return result.config.codegen_backend->name() ==
-                                "Cublas_fission";
-                       }),
-        results.end());
-  }
 
   absl::Duration min_duration = absl::InfiniteDuration();
   ConfigResult* best_result = nullptr;
diff --git a/third_party/xla/xla/backends/autotuner/autotuner_test.cc b/third_party/xla/xla/backends/autotuner/autotuner_test.cc
index 1fc74269cead16..bd4cc84715935b 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner_test.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner_test.cc
@@ -709,12 +709,6 @@ TEST_F(AutotunerTest, ExcludeCublasConfig) {
   backends.push_back(std::move(backend));
 
   auto profiler = std::make_unique<MockProfiler>();
-  EXPECT_CALL(*profiler, CreateInputBuffers(_))
-      .WillOnce(Return(std::make_unique<InputBuffers>()));
-  EXPECT_CALL(*profiler, Profile(_, _))
-      .WillOnce(Return(ProfileResult({absl::Seconds(1)})))
-      .WillOnce(Return(ProfileResult({absl::Seconds(2)})));
-
   TF_ASSERT_OK_AND_ASSIGN(
       auto autotuner, Autotuner::Create(std::move(backends),
                                         std::move(profiler), config_, nullptr));
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
index 0835c4fae539fe..714f6ef7fe1c68 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
@@ -63,7 +63,8 @@ AutotuneConfig GetAutotuneConfig(const DebugOptions& debug_options,
       !debug_options.xla_gpu_cublas_fallback();
   autotune_config.select_first_config =
       debug_options.xla_gpu_deterministic_ops() ||
-      debug_options.xla_gpu_exclude_nondeterministic_ops();
+      debug_options.xla_gpu_exclude_nondeterministic_ops() ||
+      debug_options.xla_gpu_autotune_level() == 0;
 
   if (is_deviceless) {
     // If we are running on a deviceless target, we want to use default configs.

From 493c298f0b4d673cabc6175e1a1f99d6ffccde71 Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Thu, 11 Dec 2025 04:12:58 -0800
Subject: [PATCH 173/753] [XLA:GPU] Fix combine decomposition when replica
 groups are shuffled.

When replica groups for for singe host are not contiguous, we need an extra step to reorder offsets and sizes metadata operands.

https://github.com/openxla/xla/pull/35096 was a similar change for dispatch ragged-all-to-all.

PiperOrigin-RevId: 843157263
---
 ...ragged_all_to_all_multi_host_decomposer.cc | 69 +++++++++++--------
 .../xla/tests/ragged_all_to_all_e2e_test.cc   |  7 --
 2 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc b/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc
index a3d967d7b4cb40..578470d6f18e74 100644
--- a/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc
+++ b/third_party/xla/xla/service/gpu/transforms/ragged_all_to_all_multi_host_decomposer.cc
@@ -103,12 +103,13 @@ HloInstruction* ShuffleMetadataOperandValues(
 
   HloComputation* computation = hlo->parent();
 
-  const Shape& shape = hlo->shape();
-  CHECK_EQ(shape.dimensions().size(), 1);
-
-  int64_t num_elements = shape.dimensions(0);
+  PrimitiveType element_type = hlo->shape().element_type();
+  int64_t num_elements = ShapeUtil::ElementsIn(hlo->shape());
   int64_t num_replicas = permutation.size();
   int64_t num_elements_per_replica = num_elements / permutation.size();
+  Shape linear_shape = ShapeUtil::MakeShape(element_type, {num_elements});
+  Shape gather_shape = ShapeUtil::MakeShape(
+      element_type, {num_replicas, num_elements_per_replica});
 
   Array<int64_t> permutation_array({num_replicas, 1});
   for (int64_t i = 0; i < permutation.size(); ++i) {
@@ -119,11 +120,11 @@ HloInstruction* ShuffleMetadataOperandValues(
       computation->AddInstruction(HloInstruction::CreateConstant(
           LiteralUtil::CreateFromArray(permutation_array)));
 
-  Shape new_shape = ShapeUtil::MakeShape(
-      shape.element_type(), {num_replicas, num_elements_per_replica});
+  hlo = computation->AddInstruction(
+      HloInstruction::CreateReshape(linear_shape, hlo));
 
   hlo = computation->AddInstruction(
-      HloInstruction::CreateGather(new_shape, hlo, permutation_constant,
+      HloInstruction::CreateGather(gather_shape, hlo, permutation_constant,
                                    HloGatherInstruction::MakeGatherDimNumbers(
                                        /*offset_dims=*/{1},
                                        /*collapsed_slice_dims=*/{},
@@ -132,7 +133,8 @@ HloInstruction* ShuffleMetadataOperandValues(
                                    /*slice_sizes=*/{num_elements_per_replica},
                                    /*indices_are_sorted=*/false));
 
-  return computation->AddInstruction(HloInstruction::CreateReshape(shape, hlo));
+  return computation->AddInstruction(
+      HloInstruction::CreateReshape(linear_shape, hlo));
 }
 
 // Corrects the offsets in the local metadata to account for the number of input
@@ -337,8 +339,11 @@ absl::StatusOr<bool> DecomposeCombineRaggedAllToAll(
     HloRaggedAllToAllInstruction* ragged_all_to_all,
     HloComputation* computation,
     absl::Span<ReplicaGroup const> inter_host_replica_groups,
-    absl::Span<ReplicaGroup const> intra_host_replica_groups, int64_t num_hosts,
+    absl::Span<ReplicaGroup const> intra_host_replica_groups,
+    absl::Span<int64_t const> replica_groups_permutation, int64_t num_hosts,
     int64_t num_devices_in_replica, int64_t num_participating_devices) {
+  const Shape& metadata_operand_shape = ragged_all_to_all->operand(2)->shape();
+
   auto* zero = computation->AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::Zero(
           ragged_all_to_all->operand(1)->shape().element_type())));
@@ -359,6 +364,9 @@ absl::StatusOr<bool> DecomposeCombineRaggedAllToAll(
 
   auto get_intra_host_metadata = [&](HloInstruction* metadata_operand,
                                      bool correct_offsets) {
+    metadata_operand = ShuffleMetadataOperandValues(metadata_operand,
+                                                    replica_groups_permutation);
+
     metadata_operand =
         computation->AddInstruction(HloInstruction::CreateReshape(
             /*shape=*/ShapeUtil::MakeShape(
@@ -383,8 +391,7 @@ absl::StatusOr<bool> DecomposeCombineRaggedAllToAll(
             /*dimensions=*/{1, 0, 2}));
 
     return computation->AddInstruction(HloInstruction::CreateReshape(
-        /*shape=*/ragged_all_to_all->operand(2)->shape(),
-        /*operand=*/metadata_operand));
+        /*shape=*/metadata_operand_shape, /*operand=*/metadata_operand));
   };
 
   absl::InlinedVector<HloInstruction*, 4> intra_host_ragged_all_to_all_operands{
@@ -443,36 +450,40 @@ absl::StatusOr<bool> DecomposeCombineRaggedAllToAll(
           : std::nullopt,
       /*split_dimension=*/0));
 
-  HloInstruction* corrected_output_offsets = output_offsets;
+  output_offsets = computation->AddInstruction(HloInstruction::CreateReshape(
+      /*shape=*/metadata_operand_shape, /*operand=*/output_offsets));
+
+  std::vector<HloInstruction*> local_ragged_all_to_all_operands = {
+      local_inputs,   ragged_all_to_all->mutable_operand(1),
+      output_offsets, ragged_all_to_all->mutable_operand(5),
+      output_offsets, ragged_all_to_all->mutable_operand(5),
+  };
+
+  for (int i = 2; i < 6; ++i) {
+    local_ragged_all_to_all_operands[i] = ShuffleMetadataOperandValues(
+        local_ragged_all_to_all_operands[i], replica_groups_permutation);
+  }
 
-  corrected_output_offsets =
+  HloInstruction* local_input_offsets =
       computation->AddInstruction(HloInstruction::CreateReshape(
           /*shape=*/ShapeUtil::MakeShape(
               output_offsets->shape().element_type(),
               {num_hosts, num_devices_in_replica_per_host,
                num_updates_per_replica}),
-          /*operand=*/corrected_output_offsets));
+          /*operand=*/local_ragged_all_to_all_operands[2]));
 
-  corrected_output_offsets =
+  local_input_offsets =
       CorrectOffsets(ragged_all_to_all->operand(1)->shape().dimensions(0),
-                     corrected_output_offsets, computation);
+                     local_input_offsets, computation);
 
-  output_offsets = computation->AddInstruction(HloInstruction::CreateReshape(
-      /*shape=*/ragged_all_to_all->operand(2)->shape(),
-      /*operand=*/output_offsets));
-
-  corrected_output_offsets =
+  local_ragged_all_to_all_operands[2] =
       computation->AddInstruction(HloInstruction::CreateReshape(
-          /*shape=*/ragged_all_to_all->operand(2)->shape(),
-          /*operand=*/corrected_output_offsets));
+          /*shape=*/metadata_operand_shape, /*operand=*/local_input_offsets));
 
   HloInstruction* local_ragged_all_to_all =
       computation->AddInstruction(HloInstruction::CreateRaggedAllToAll(
           /*shape=*/ragged_all_to_all->shape(),
-          /*operands=*/
-          {local_inputs, ragged_all_to_all->mutable_operand(1),
-           corrected_output_offsets, ragged_all_to_all->mutable_operand(5),
-           output_offsets, ragged_all_to_all->mutable_operand(5)},
+          /*operands=*/local_ragged_all_to_all_operands,
           /*device_list=*/CollectiveDeviceList(degenerated_replica_groups),
           /*channel_id=*/ragged_all_to_all->channel_id()));
 
@@ -590,8 +601,8 @@ absl::StatusOr<bool> DecomposeRaggedAllToAll(
   if (num_input_rows > num_output_rows) {
     return DecomposeCombineRaggedAllToAll(
         ragged_all_to_all, computation, inter_host_replica_groups,
-        intra_host_replica_groups, num_hosts, num_devices_in_replica,
-        num_participating_devices);
+        intra_host_replica_groups, *replica_groups_permutation, num_hosts,
+        num_devices_in_replica, num_participating_devices);
   }
 
   return DecomposeDispatchRaggedAllToAll(
diff --git a/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc b/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
index f74275d7933bc3..257d2b9c2625b2 100644
--- a/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
+++ b/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
@@ -935,13 +935,6 @@ TEST_P(RaggedAllToAllMultiHostDecomposerTest,
        RaggedAllToAll_8GPUs_SliceSize4_ShuffledReplicaGroups) {
   auto [num_input_rows, num_output_rows] = GetParam();
 
-  if (num_input_rows > num_output_rows) {
-    // TODO(b/445380264): Fix decomposer for combine ragged-all-to-all.
-    GTEST_SKIP()
-        << "The test will currently fail for combine ragged-all-to-all (when "
-           "input is larger than output).";
-  }
-
   std::string kModuleReplicatedStr =
       absl::Substitute(R"(
   HloModule module

From f6d55d77560fa15908fd811ad59c9a587e108b19 Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Thu, 11 Dec 2025 04:15:50 -0800
Subject: [PATCH 174/753] [XLA:GPU] DotMerger: Stop merging when dots have
 become too large. Perform expensive reachability checks last.

PiperOrigin-RevId: 843157999
---
 .../xla/xla/hlo/transforms/simplifiers/dot_merger.cc   | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
index db0040ed1ba75b..dc55e09dbdb953 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
@@ -473,6 +473,9 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
     return false;
   }
 
+  VLOG(0) << "Merging Dots in computation: " << comp->name();
+  VLOG(1) << "Found " << equivalence_classes.size() << " equivalence classes.";
+
   // Build a dependency graph representing the whole computation.
   GraphCycles graph;
 
@@ -537,9 +540,10 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
 
         if (dead_instrs.contains(a) || dead_instrs.contains(b) ||
             (!is_merge_candidate(a) && !is_merge_candidate(b)) ||
+            !can_merge(a, b) ||
             // Perform reachability checks last since they can be expensive.
             graph.IsReachableNonConst(a_id, b_id) ||
-            graph.IsReachableNonConst(b_id, a_id) || !can_merge(a, b)) {
+            graph.IsReachableNonConst(b_id, a_id)) {
           continue;
         }
 
@@ -559,6 +563,10 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
           dead_instrs.insert(b);
           dots[i] = merged;
           dots[j] = nullptr;
+          if (!is_merge_candidate(merged)) {
+            // The merged dot is not a candidate for futher merging.
+            break;
+          }
         }
       }
     }

From b67559d7326c68972ad666929be6d6a5fdcd1955 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 11 Dec 2025 04:26:36 -0800
Subject: [PATCH 175/753] Remove `Compiler*` argument from
 AotCompilationResult::LoadExecutable

Loading an executable should not depend on the Compiler instance. If a particular instance needs access to the Compiler, it can keep a Compiler* in its implementation. So that's what I'm doing for LegacyGpuAotCompilationResult which in fact needs the compiler to produce an executable.

I had to make `Compiler::Export` non-const for this to work, but this will go away again in a subsequent change which moves `Export` to `Executable`.

PiperOrigin-RevId: 843160651
---
 tensorflow/compiler/aot/codegen.cc            |  6 ++--
 .../xla_compiled_cpu_function_thunks.cc       |  2 +-
 third_party/xla/xla/client/local_client.cc    | 13 ++++-----
 third_party/xla/xla/client/local_client.h     |  2 +-
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    |  4 +--
 third_party/xla/xla/pjrt/gpu/BUILD            |  6 +++-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc  | 28 +++++++++++++------
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h   | 10 +++++++
 third_party/xla/xla/service/compiler.h        | 12 ++++++--
 .../service/cpu/cpu_aot_compilation_result.cc | 11 ++------
 .../service/cpu/cpu_aot_compilation_result.h  |  8 +++---
 .../xla/service/cpu/cpu_aot_compiler_test.cc  |  3 +-
 .../xla/xla/service/cpu/cpu_aot_loader.cc     |  2 +-
 .../xla/xla/service/cpu/cpu_compiler.cc       |  2 +-
 .../xla/xla/service/cpu/cpu_compiler.h        |  2 +-
 .../service/cpu/tests/cpu_aot_export_test.cc  |  2 +-
 .../service/gpu/gpu_aot_compilation_result.h  |  5 ++--
 .../gpu/gpu_aot_compilation_result_test.cc    |  5 ++--
 .../service/gpu/gpu_aot_compilation_test.cc   | 15 ++++------
 .../xla/xla/service/gpu/gpu_compiler.cc       |  8 +++---
 .../xla/xla/service/gpu/gpu_compiler.h        |  2 +-
 .../xla/xla/service/gpu/gpu_compiler_test.cc  | 13 ++++-----
 .../gpu/legacy_gpu_aot_compilation_result.cc  | 20 ++++++-------
 .../gpu/legacy_gpu_aot_compilation_result.h   | 21 ++++++++------
 24 files changed, 110 insertions(+), 92 deletions(-)

diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index 87cb051b75df63..1042ff1fa7a896 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -1206,9 +1206,9 @@ absl::StatusOr<EmbeddedConstantBuffers> GenerateConstantBuffersData(
       auto aot_thunk_result_temp,
       xla::cpu::CpuAotCompilationResult::FromString(serialized, nullptr));
 
-  TF_ASSIGN_OR_RETURN(
-      auto executable,
-      std::move(*aot_thunk_result_temp).LoadExecutable(nullptr, nullptr));
+  TF_ASSIGN_OR_RETURN(auto executable,
+                      std::move(*aot_thunk_result_temp)
+                          .LoadExecutable(/*stream_exec=*/nullptr));
 
   xla::cpu::CpuExecutable* cpu_executable =
       tsl::down_cast<xla::cpu::CpuExecutable*>(executable.get());
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc
index c2b9cc26d5d461..68c4d7f90b204c 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc
@@ -50,7 +50,7 @@ XlaCompiledCpuFunctionThunks::XlaCompiledCpuFunctionThunks(
   TF_CHECK_OK(aot_compilation_result.status());
   // NO_CDC: aot_compilation_result is checked to be OK above.
   auto cpu_executable = std::move(*aot_compilation_result.value())
-                            .LoadExecutable(nullptr, nullptr);
+                            .LoadExecutable(/*stream_exec=*/nullptr);
 
   TF_CHECK_OK(cpu_executable.status());
   auto executable_or_err =
diff --git a/third_party/xla/xla/client/local_client.cc b/third_party/xla/xla/client/local_client.cc
index e1f348a755521d..aa0a0e4c289055 100644
--- a/third_party/xla/xla/client/local_client.cc
+++ b/third_party/xla/xla/client/local_client.cc
@@ -482,19 +482,17 @@ absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Load(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<xla::AotCompilationResult> aot_result,
       compiler->LoadAotCompilationResult(serialized_aot_result));
-  return LoadInternal(std::move(aot_result), compiler.get(), options);
+  return LoadInternal(std::move(aot_result), options);
 }
 
 absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Load(
     std::unique_ptr<xla::AotCompilationResult> aot_result,
     const ExecutableBuildOptions& options) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Compiler> compiler,
-                      Compiler::GetForPlatform(platform()));
-  return LoadInternal(std::move(aot_result), compiler.get(), options);
+  return LoadInternal(std::move(aot_result), options);
 }
 
 absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::LoadInternal(
-    std::unique_ptr<xla::AotCompilationResult> aot_result, Compiler* compiler,
+    std::unique_ptr<xla::AotCompilationResult> aot_result,
     const ExecutableBuildOptions& options) {
   TF_ASSIGN_OR_RETURN(ExecutableBuildOptions updated_options,
                       UpdateBuildOptions(options, default_device_ordinal()));
@@ -502,9 +500,8 @@ absl::StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::LoadInternal(
       se::StreamExecutor * executor,
       backend().stream_executor(updated_options.device_ordinal()));
 
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable,
-      std::move(*aot_result).LoadExecutable(compiler, executor));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      std::move(*aot_result).LoadExecutable(executor));
   return std::make_unique<LocalExecutable>(std::move(executable),
                                            local_service_->mutable_backend(),
                                            updated_options);
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 3c237ef37a1973..4429dc84664f6b 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -249,7 +249,7 @@ class LocalClient : public Client {
   LocalService* local_service_;
 
   absl::StatusOr<std::unique_ptr<LocalExecutable>> LoadInternal(
-      std::unique_ptr<xla::AotCompilationResult> aot_result, Compiler* compiler,
+      std::unique_ptr<xla::AotCompilationResult> aot_result,
       const ExecutableBuildOptions& options);
 };
 
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 98d5c48ef63655..8a5e603efd34e6 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -488,7 +488,7 @@ PjRtCpuClient::LoadSerializedExecutable(absl::string_view serialized,
                       compiler.LoadAotCompilationResult(str));
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<Executable> executable,
-      std::move(*aot_result).LoadExecutable(&compiler, /*executor=*/nullptr));
+      std::move(*aot_result).LoadExecutable(/*executor=*/nullptr));
 
   // Set up other arguments for PjRtCpuExecutable
   // TODO(b/232263665): Remove duplicated code in DeserializeExecutable and
@@ -620,7 +620,7 @@ static absl::StatusOr<std::unique_ptr<xla::Executable>> CompileAheadOfTime(
   TF_ASSIGN_OR_RETURN(std::unique_ptr<AotCompilationResult> aot_result,
                       compiler.LoadAotCompilationResult(serialized_aot_result));
 
-  return std::move(*aot_result).LoadExecutable(&compiler, /*executor=*/nullptr);
+  return std::move(*aot_result).LoadExecutable(/*executor=*/nullptr);
 }
 
 absl::StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index ddecf07a49901f..8aa72f6c944892 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -462,7 +462,7 @@ cc_library(
         "//xla:status_macros",
         "//xla/hlo/builder:xla_computation",
         "//xla/hlo/ir:hlo",
-        "//xla/hlo/ir:hlo_module_group",
+        "//xla/mlir_hlo:mhlo_passes",
         "//xla/pjrt:mlir_to_hlo",
         "//xla/pjrt:pjrt_client",
         "//xla/pjrt:pjrt_compiler",
@@ -480,9 +480,11 @@ cc_library(
         "//xla/stream_executor:platform_manager",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/synchronization",
         "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:casts",
     ],
@@ -536,7 +538,9 @@ cc_library(
         "//xla/pjrt:pjrt_executable",
         "//xla/service:compiler",
         "//xla/stream_executor:platform",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/synchronization",
         "@llvm-project//mlir:IR",
     ] + if_cuda([
         ":se_gpu_pjrt_compiler_cuda_registration",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 5d0632a7f7aa4d..bb0668ba72cf28 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -25,11 +25,12 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "xla/hlo/builder/xla_computation.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/hlo/ir/hlo_module_group.h"
 #include "xla/layout_util.h"
+#include "xla/mlir_hlo/mhlo/transforms/passes.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/mlir_to_hlo.h"
@@ -110,18 +111,27 @@ StreamExecutorGpuCompiler::StreamExecutorGpuCompiler(
     stream_executor::Platform::Id platform_id)
     : requested_platform_id_(platform_id) {}
 
+absl::StatusOr<Compiler*> StreamExecutorGpuCompiler::GetOrCreateCompiler() {
+  absl::MutexLock lock(compiler_mutex_);
+  if (compiler_ == nullptr) {
+    // We get the compiler here because doing so in the constructor might fail
+    // due to static initialization order shenanigans (An instance of this class
+    // is initialized statically and this might happen before the compiler is
+    // registered with Compiler::RegisterCompilerFactory). For the same reason,
+    // we can't fail construction of this class, therefore we have this
+    // GetOrCreate function and we can return on error when calling Compile.
+    TF_ASSIGN_OR_RETURN(compiler_,
+                        GetCompilerForPlatform(requested_platform_id_));
+  }
+  return compiler_.get();
+}
+
 absl::StatusOr<std::unique_ptr<PjRtExecutable>>
 StreamExecutorGpuCompiler::Compile(CompileOptions options,
                                    const XlaComputation& computation,
                                    const PjRtTopologyDescription& topology,
                                    PjRtClient* client) {
-  // We get the compiler here because doing so in the constructor might fail due
-  // to static initialization order shenanigans. Also we can't fail construction
-  // of this class because it's also statically constructed.
-  // TODO(b/382417973): Use factories instead of static initialization of
-  // singletons.
-  TF_ASSIGN_OR_RETURN(auto gpu_compiler,
-                      GetCompilerForPlatform(requested_platform_id_));
+  TF_ASSIGN_OR_RETURN(Compiler * gpu_compiler, GetOrCreateCompiler());
 
   CompileOptions input_options = options;
   if (!options.gpu_target_config) {
@@ -165,7 +175,7 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
       HloModule::CreateFromProto(hlo_module_proto, *hlo_config));
   UpdateEntryComputationLayout(
       hlo_module.get(), std::bind(&Compiler::DefaultDeviceShapeRepresentation,
-                                  gpu_compiler.get(), std::placeholders::_1));
+                                  gpu_compiler, std::placeholders::_1));
   DumpHloModuleIfEnabled(*hlo_module, kBeforeOptimizationsDumpName);
   Compiler::CompileOptions opts;
   opts.gpu_target_config = options.gpu_target_config;
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
index 80197b5fe0dcf2..16728c2f6f6dbb 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.h
@@ -19,11 +19,14 @@ limitations under the License.
 #include <memory>
 #include <optional>
 
+#include "absl/base/thread_annotations.h"
 #include "absl/status/statusor.h"
+#include "absl/synchronization/mutex.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "xla/hlo/builder/xla_computation.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
+#include "xla/service/compiler.h"
 #include "xla/stream_executor/platform.h"
 
 namespace xla {
@@ -49,6 +52,13 @@ class StreamExecutorGpuCompiler : public PjRtCompiler {
 
  private:
   std::optional<stream_executor::Platform::Id> requested_platform_id_;
+  mutable absl::Mutex compiler_mutex_;
+  std::unique_ptr<Compiler> compiler_ ABSL_GUARDED_BY(compiler_mutex_);
+
+  // Returns an instance of the compiler for the given platform (or the default
+  // GPU platform if none is specified). If one does not exist, creates one. The
+  // compiler is cached for subsequent calls.
+  absl::StatusOr<Compiler*> GetOrCreateCompiler();
 };
 }  // namespace xla
 #endif  // XLA_PJRT_GPU_SE_GPU_PJRT_COMPILER_H_
diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index 4d83f602f2cb53..b47c105d72160c 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -28,7 +28,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "absl/base/attributes.h"
+#include "absl/base/macros.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
@@ -85,10 +85,16 @@ class AotCompilationResult {
   }
 
   virtual absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      Compiler* compiler, const se::StreamExecutor* executor) && {
+      const se::StreamExecutor* executor) && {
     return Unimplemented("LoadExecutable unimplemented.");
   }
 
+  ABSL_DEPRECATE_AND_INLINE()
+  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
+      Compiler*, const se::StreamExecutor* executor) && {
+    return std::move(*this).LoadExecutable(executor);
+  }
+
   virtual absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
       const {
     return Unimplemented("buffer_assignment unimplemented.");
@@ -356,7 +362,7 @@ class Compiler {
 
   // Returns an AotCompilationResult of the executable for serialization.
   virtual absl::StatusOr<std::unique_ptr<AotCompilationResult>> Export(
-      Executable* executable) const {
+      Executable* executable) {
     return Unimplemented("Export unimplemented");
   }
 
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc
index 31ca1d590cd292..ad60a542567127 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.cc
@@ -146,7 +146,6 @@ CpuAotCompilationResult::CpuAotCompilationResult(
 
 absl::StatusOr<std::unique_ptr<Executable>>
 CpuAotCompilationResult::LoadExecutable(
-    [[maybe_unused]] Compiler* compiler,
     const se::StreamExecutor* stream_exec) && {
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> module,
@@ -156,13 +155,9 @@ CpuAotCompilationResult::LoadExecutable(
 
   // Copied from cpu_compiler.cc in order to avoid dependency on cpu_compiler.
   std::function<int64_t(const BufferValue&)> buffer_size_bytes_function_getter =
-      compiler ? compiler->BufferSizeBytesFunction() : []() {
-        HloCostAnalysis::ShapeSizeFunction shape_size =
-            CpuExecutable::ShapeSizeBytes;
-        return [shape_size](const BufferValue& buffer) {
-          return shape_size(buffer.shape());
-        };
-      }();
+      [](const BufferValue& buffer) {
+        return CpuExecutable::ShapeSizeBytes(buffer.shape());
+      };
 
   // Recreate BufferAssignment from proto.
   AliasInfo alias_info;
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index 4817200999814f..1f845de703b5ec 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -116,10 +116,10 @@ class CpuAotCompilationResult : public AotCompilationResult {
     return proto_.SerializeAsString();
   }
 
-  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      [[maybe_unused]] Compiler* compiler,
-      const se::StreamExecutor* stream_exec) &&
-      override;
+  using AotCompilationResult::LoadExecutable;
+
+  absl::StatusOr<std::unique_ptr<Executable>>
+      LoadExecutable(const se::StreamExecutor* stream_exec) && override;
 
   const HloModule* optimized_module() const override { return module_.get(); }
 
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc b/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc
index b3544407d61546..114829539d4e8e 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc
@@ -112,8 +112,7 @@ ENTRY e {
 
     TF_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<Executable> executable,
-        std::move(*aot_result)
-            .LoadExecutable(compiler, aot_options->executor()));
+        std::move(*aot_result).LoadExecutable(aot_options->executor()));
     std::unique_ptr<OpaqueExecutable> wrapped_executable =
         test_runner_as_hlo_runner().WrapExecutable(std::move(executable));
 
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_loader.cc b/third_party/xla/xla/service/cpu/cpu_aot_loader.cc
index 21ebdebf6730f2..9175ab43b33216 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_loader.cc
+++ b/third_party/xla/xla/service/cpu/cpu_aot_loader.cc
@@ -148,7 +148,7 @@ absl::StatusOr<std::unique_ptr<Executable>> CpuAotLoader::LoadExecutable(
 
 absl::StatusOr<std::unique_ptr<Executable>> CpuAotLoader::LoadExecutable(
     xla::AotCompilationResult&& compilation_result) {
-  return std::move(compilation_result).LoadExecutable(nullptr, nullptr);
+  return std::move(compilation_result).LoadExecutable(/*executor=*/nullptr);
 }
 
 absl::StatusOr<std::unique_ptr<AotCompilationResult>>
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 12b1e38459b79e..5cced18ff84246 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -2273,7 +2273,7 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
 }
 
 absl::StatusOr<std::unique_ptr<AotCompilationResult>> CpuCompiler::Export(
-    Executable* executable) const {
+    Executable* executable) {
   auto* cpu_executable = tensorflow::down_cast<CpuExecutable*>(executable);
   if (!cpu_executable)
     return Internal("Could not downcast Executable to CpuExecutable");
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.h b/third_party/xla/xla/service/cpu/cpu_compiler.h
index 0816a51979f7d2..a01fef46396135 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.h
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.h
@@ -82,7 +82,7 @@ class CpuCompiler : public LLVMCompiler {
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
   absl::StatusOr<std::unique_ptr<AotCompilationResult>> Export(
-      Executable* executable) const override;
+      Executable* executable) override;
 
   // Returns a (deserialized) AotCompilationResult from a serialized
   // AotCompilationResult.
diff --git a/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc b/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc
index ba580c9997131d..e69757ba09d095 100644
--- a/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc
+++ b/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc
@@ -67,7 +67,7 @@ class CpuAotCompilationTest : public HloTestBase {
     // Load Executable from AOT compilation result.
     TF_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<Executable> executable,
-        std::move(*loaded_aot_result).LoadExecutable(compiler, stream_exec));
+        std::move(*loaded_aot_result).LoadExecutable(stream_exec));
   }
 };
 
diff --git a/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h b/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h
index 11141620ef3d66..1fbf15aaa8865c 100644
--- a/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h
@@ -62,9 +62,8 @@ class GpuAotCompilationResult : public AotCompilationResult {
     return serialized;
   }
 
-  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      Compiler* compiler, const se::StreamExecutor* stream_exec) &&
-      final {
+  absl::StatusOr<std::unique_ptr<Executable>>
+      LoadExecutable(const se::StreamExecutor* stream_exec) && final {
     stream_executor::Platform::Id platform_id =
         stream_exec->GetPlatform()->id();
     const auto symbol_resolver = [&](absl::string_view symbol_name) {
diff --git a/third_party/xla/xla/service/gpu/gpu_aot_compilation_result_test.cc b/third_party/xla/xla/service/gpu/gpu_aot_compilation_result_test.cc
index 091e616bac25f7..65fa6965ea14f8 100644
--- a/third_party/xla/xla/service/gpu/gpu_aot_compilation_result_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_aot_compilation_result_test.cc
@@ -189,9 +189,8 @@ TEST_F(GpuAotCompilationResultTest, LoadExecutable) {
 
   EnsureCudaSymbolIsRegistered();
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<Executable> executable,
-      std::move(*result).LoadExecutable(/*compiler=*/nullptr, &executor_));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
+                          std::move(*result).LoadExecutable(&executor_));
 
   auto* gpu_executable = dynamic_cast<GpuExecutable*>(executable.get());
   ASSERT_NE(gpu_executable, nullptr) << "Executable is not a GpuExecutable.";
diff --git a/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc b/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc
index bf02596ba3a477..3acb11b387fabb 100644
--- a/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_aot_compilation_test.cc
@@ -102,9 +102,8 @@ TEST_P(GpuAotCompilationTest, ExportAndLoadExecutable) {
       compiler->LoadAotCompilationResult(serialized_aot_result));
 
   // Load Executable from AOT compilation result.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<Executable> executable,
-      std::move(*aot_result).LoadExecutable(compiler, stream_exec));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
+                          std::move(*aot_result).LoadExecutable(stream_exec));
 }
 
 TEST_P(GpuAotCompilationTest, AotCompilationWithoutGpuDevice) {
@@ -144,9 +143,8 @@ TEST_P(GpuAotCompilationTest, AotCompilationWithoutGpuDevice) {
       compiler->LoadAotCompilationResult(serialized_aot_result));
 
   // Load Executable from AOT compilation result.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<Executable> executable,
-      std::move(*aot_result).LoadExecutable(compiler, stream_exec));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
+                          std::move(*aot_result).LoadExecutable(stream_exec));
 }
 
 namespace {
@@ -257,9 +255,8 @@ TEST_P(GpuAotCompilationTest, ExportAndLoadExecutableWithTriton) {
       compiler->LoadAotCompilationResult(serialized_aot_result));
 
   // Load Executable from AOT compilation result.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<Executable> executable,
-      std::move(*aot_result).LoadExecutable(compiler, stream_exec));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
+                          std::move(*aot_result).LoadExecutable(stream_exec));
   std::unique_ptr<OpaqueExecutable> wrapped_executable =
       test_runner_as_hlo_runner().WrapExecutable(std::move(executable));
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index a98c18b53adc89..d949a15e15783a 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -2703,7 +2703,7 @@ GpuCompiler::LegacyCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
           optimized_module.get(),
           res.compile_module_results.buffer_assignment.get(),
           res.backend_result.asm_text, res.backend_result.binary,
-          res.backend_result.dnn_compiled_graphs, pointer_size_));
+          res.backend_result.dnn_compiled_graphs, pointer_size_, this));
 
   return std::move(results);
 }
@@ -2714,7 +2714,7 @@ HloCostAnalysis::ShapeSizeFunction GpuCompiler::ShapeSizeBytesFunction() const {
 }
 
 absl::StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
-    Executable* executable) const {
+    Executable* executable) {
   auto* gpu_executable = tensorflow::down_cast<GpuExecutable*>(executable);
   if (!gpu_executable) {
     return Internal("GpuExecutable is null");
@@ -2731,7 +2731,7 @@ absl::StatusOr<std::unique_ptr<AotCompilationResult>> GpuCompiler::Export(
   return LegacyGpuAotCompilationResult::FromModule(
       &gpu_executable->module(), gpu_executable->buffer_assignment(),
       gpu_executable->text(), gpu_executable->binary(),
-      gpu_executable->dnn_compiled_graphs(), pointer_size_);
+      gpu_executable->dnn_compiled_graphs(), pointer_size_, this);
 }
 
 absl::Status GpuCompiler::RunPreSchedulingPasses(
@@ -2968,7 +2968,7 @@ GpuCompiler::LoadAotCompilationResult(
   }
 
   return LegacyGpuAotCompilationResult::FromProto(gpu_executable_proto,
-                                                  pointer_size_);
+                                                  pointer_size_, this);
 }
 
 absl::StatusOr<std::unique_ptr<Executable>>
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 235c184ecf22c6..ad2e6840dba02e 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -90,7 +90,7 @@ class GpuCompiler : public LLVMCompiler {
   LoadAotCompilationResult(const std::string& serialized_aot_result) override;
 
   absl::StatusOr<std::unique_ptr<AotCompilationResult>> Export(
-      Executable* executable) const override;
+      Executable* executable) override;
 
   absl::Status RunPostSchedulingPipelines(
       HloModule* module, int64_t scheduler_mem_limit,
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 7d5f568bbdbddb..944a71ab6a7ee8 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -950,9 +950,8 @@ TEST_P(AotCompilationTest, CompileAndLoadAotResult) {
       std::unique_ptr<AotCompilationResult> aot_result,
       compiler_->LoadAotCompilationResult(serialized_aot_result));
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<Executable> executable,
-      std::move(*aot_result).LoadExecutable(compiler_, stream_exec_));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> executable,
+                          std::move(*aot_result).LoadExecutable(stream_exec_));
   std::unique_ptr<OpaqueExecutable> wrapped_executable =
       test_runner_as_hlo_runner().WrapExecutable(std::move(executable));
 
@@ -988,9 +987,8 @@ TEST_P(AotCompilationTest, ExportAndImportAotResult) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<AotCompilationResult> aot_result,
                           compiler_->Export(executable.get()));
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<Executable> new_executable,
-      std::move(*aot_result).LoadExecutable(compiler_, stream_exec_));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> new_executable,
+                          std::move(*aot_result).LoadExecutable(stream_exec_));
   std::unique_ptr<OpaqueExecutable> wrapped_executable =
       test_runner_as_hlo_runner().WrapExecutable(std::move(new_executable));
 
@@ -1156,8 +1154,7 @@ ENTRY e {
 
     TF_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<Executable> executable,
-        std::move(*aot_result)
-            .LoadExecutable(compiler, aot_options.executor()));
+        std::move(*aot_result).LoadExecutable(aot_options.executor()));
     std::unique_ptr<OpaqueExecutable> wrapped_executable =
         test_runner_as_hlo_runner().WrapExecutable(std::move(executable));
 
diff --git a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc
index 0985ad404f4b9e..86f920b6d57ed0 100644
--- a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc
+++ b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <string>
-#include <utility>
 
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
@@ -28,7 +27,6 @@ limitations under the License.
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/buffer_value.h"
 #include "xla/service/compiler.h"
-#include "xla/service/executable.h"
 #include "xla/service/gpu/gpu_executable.pb.h"
 #include "xla/service/gpu/gpu_latency_hiding_scheduler.h"
 #include "xla/service/gpu/ir_emission_utils.h"
@@ -44,7 +42,8 @@ absl::StatusOr<std::unique_ptr<LegacyGpuAotCompilationResult>>
 LegacyGpuAotCompilationResult::FromModule(
     const HloModule* hlo_module, const BufferAssignment* buffer_assignment,
     absl::string_view asm_text, absl::Span<const uint8_t> binary,
-    const BinaryMap& dnn_compiled_graphs, int pointer_size) {
+    const BinaryMap& dnn_compiled_graphs, int pointer_size,
+    Compiler* compiler) {
   tsl::profiler::TraceMe traceme("ResultFromModule");
   GpuExecutableProto proto;
   *proto.mutable_hlo_module_with_config() = hlo_module->ToProtoWithConfig();
@@ -55,12 +54,13 @@ LegacyGpuAotCompilationResult::FromModule(
                                               dnn_compiled_graphs.cend());
   return std::unique_ptr<LegacyGpuAotCompilationResult>(
       new LegacyGpuAotCompilationResult(hlo_module->Clone(), std::move(proto),
-                                        pointer_size));
+                                        pointer_size, compiler));
 }
 
 absl::StatusOr<std::unique_ptr<LegacyGpuAotCompilationResult>>
 LegacyGpuAotCompilationResult::FromString(const std::string& serialized,
-                                          int pointer_size) {
+                                          int pointer_size,
+                                          Compiler* compiler) {
   tsl::profiler::TraceMe traceme("ResultFromString");
   GpuExecutableProto proto;
   if (!proto.ParseFromString(serialized)) {
@@ -68,19 +68,19 @@ LegacyGpuAotCompilationResult::FromString(const std::string& serialized,
         "Failed to parse serialized LegacyGpuAotCompilationResult.");
   }
 
-  return FromProto(proto, pointer_size);
+  return FromProto(proto, pointer_size, compiler);
 }
 
 absl::StatusOr<std::unique_ptr<LegacyGpuAotCompilationResult>>
 LegacyGpuAotCompilationResult::FromProto(const GpuExecutableProto& proto,
-                                         int pointer_size) {
+                                         int pointer_size, Compiler* compiler) {
   tsl::profiler::TraceMe traceme("ResultFromProto");
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> module,
       HloModule::CreateFromProtoWithConfig(proto.hlo_module_with_config()));
   return std::unique_ptr<LegacyGpuAotCompilationResult>(
       new LegacyGpuAotCompilationResult(std::move(module), std::move(proto),
-                                        pointer_size));
+                                        pointer_size, compiler));
 }
 
 absl::StatusOr<std::string> LegacyGpuAotCompilationResult::SerializeAsString()
@@ -90,12 +90,12 @@ absl::StatusOr<std::string> LegacyGpuAotCompilationResult::SerializeAsString()
 
 absl::StatusOr<std::unique_ptr<Executable>>
 LegacyGpuAotCompilationResult::LoadExecutable(
-    Compiler* compiler, const se::StreamExecutor* stream_exec) && {
+    const se::StreamExecutor* stream_exec) && {
   if (stream_exec == nullptr) {
     return InvalidArgument("Stream executor is null.");
   }
 
-  return compiler->LoadExecutableFromAotResult(*this, *stream_exec);
+  return compiler_->LoadExecutableFromAotResult(*this, *stream_exec);
 }
 
 absl::StatusOr<std::unique_ptr<BufferAssignment>>
diff --git a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
index b97cecd5a2ebce..d511992ce5e371 100644
--- a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
@@ -50,19 +50,21 @@ class LegacyGpuAotCompilationResult : public AotCompilationResult {
   FromModule(const HloModule* hlo_module,
              const BufferAssignment* buffer_assignment,
              absl::string_view asm_text, absl::Span<const uint8_t> binary,
-             const BinaryMap& dnn_compiled_graphs, int pointer_size);
+             const BinaryMap& dnn_compiled_graphs, int pointer_size,
+             Compiler* compiler);
 
   static absl::StatusOr<std::unique_ptr<LegacyGpuAotCompilationResult>>
-  FromString(const std::string& serialized, int pointer_size);
+  FromString(const std::string& serialized, int pointer_size,
+             Compiler* compiler);
 
   static absl::StatusOr<std::unique_ptr<LegacyGpuAotCompilationResult>>
-  FromProto(const GpuExecutableProto& proto, int pointer_size);
+  FromProto(const GpuExecutableProto& proto, int pointer_size,
+            Compiler* compiler);
 
   absl::StatusOr<std::string> SerializeAsString() const override;
 
-  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      Compiler* compiler, const se::StreamExecutor* stream_exec) &&
-      override;
+  absl::StatusOr<std::unique_ptr<Executable>>
+      LoadExecutable(const se::StreamExecutor* stream_exec) && override;
 
   const HloModule* optimized_module() const override { return module_.get(); }
   std::unique_ptr<HloModule> consume_optimized_module() override {
@@ -76,14 +78,17 @@ class LegacyGpuAotCompilationResult : public AotCompilationResult {
 
  private:
   LegacyGpuAotCompilationResult(std::unique_ptr<HloModule> module,
-                                GpuExecutableProto proto, int pointer_size)
+                                GpuExecutableProto proto, int pointer_size,
+                                Compiler* compiler)
       : module_(std::move(module)),
         proto_(std::move(proto)),
-        pointer_size_(pointer_size) {}
+        pointer_size_(pointer_size),
+        compiler_(compiler) {}
 
   std::unique_ptr<HloModule> module_;
   GpuExecutableProto proto_;
   int pointer_size_;
+  Compiler* compiler_;
 };
 
 }  // namespace gpu

From 9a6ae99708e9d13d4562c090508d0e789aee236d Mon Sep 17 00:00:00 2001
From: Alex <alexandros.theodoridis@amd.com>
Date: Thu, 11 Dec 2025 05:52:13 -0800
Subject: [PATCH 176/753] PR #35138: [ROCm] Fix rocm build due to inverted
 condition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35138

📝 Summary of Changes
Looks there is invalid check introduced in this commit:
https://github.com/openxla/xla/commit/349a1d4e00cf33dbabd1db5771d03b0c5a4aea61
<img width="1012" height="175" alt="image" src="https://github.com/user-attachments/assets/03ead4ea-b5ab-40f5-9af2-17d85e402cd0" />

🎯 Justification
This commit breaks all the rocm tests, we start getting ir code for nvidia gpus.
Inverting the check fixes them.

🚀 Kind of Contribution
🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
Not relevant

🧪 Unit Tests:
All rocm u-tests

🧪 Execution Tests:
CI

Copybara import of the project:

--
5e4a09ec7140e6e4b20c94ac71789e4b67f96c09 by Alexandros Theodoridis <atheodor@amd.com>:

Fix rocm build due to inverted condition

Merging this change closes #35138

PiperOrigin-RevId: 843183609
---
 .../xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
index ab4de23a4ed646..3c22e505ff63ef 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
@@ -77,7 +77,7 @@ class LowerToLLVMGPUPass
       : device_spec_(device_description) {}
 
   void runOnOperation() override {
-    if (gpu_device_info_.empty()) {
+    if (!gpu_device_info_.empty()) {
       se::GpuDeviceInfoProto device_info;
       CHECK(tsl::protobuf::TextFormat::ParseFromString(gpu_device_info_,
                                                        &device_info));

From de08322b72d7c6f07e6ae7812142044b0ce4834f Mon Sep 17 00:00:00 2001
From: Mikhail Goncharov <goncharov@google.com>
Date: Thu, 11 Dec 2025 06:22:46 -0800
Subject: [PATCH 177/753] [XLA:GPU] simplify testing in nest_gemm_fusion

PiperOrigin-RevId: 843192398
---
 .../gpu/transforms/nest_gemm_fusion_test.cc   | 309 +++++-------------
 1 file changed, 73 insertions(+), 236 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
index fd81e8ef39895b..23cb8c8e5a1f4a 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
@@ -84,6 +84,17 @@ class NestGemmFusionTest : public HloHardwareIndependentTestBase {
       TestGpuDeviceInfo::RTXA6000DeviceInfo(
           se::GpuComputeCapability{se::CudaComputeCapability::Ampere()})};
   mlir::MLIRContext mlir_context_;
+
+  std::unique_ptr<VerifiedHloModule> ParseAndRunNestGemmFusion(
+      absl::string_view hlo, const bool expect_change = true) {
+    std::unique_ptr<VerifiedHloModule> module =
+        ParseAndReturnVerifiedModule(hlo).value();
+    EXPECT_THAT(
+        NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
+        IsOkAndHolds(expect_change));
+    EXPECT_OK(verifier().Run(module.get()).status());
+    return module;
+  }
 };
 
 TEST_F(NestGemmFusionTest, BasicTest) {
@@ -109,12 +120,7 @@ ENTRY entry {
     }
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
-
+  std::unique_ptr<VerifiedHloModule> module = ParseAndRunNestGemmFusion(hlo);
   const HloInstruction* fusion = nullptr;
   ASSERT_THAT(module->entry_computation()->root_instruction(),
               GmockMatch(match::Fusion(&fusion)));
@@ -168,11 +174,7 @@ ENTRY e {
                          "split_k":1,"num_stages":1,"num_warps":2,
                          "num_ctas":1}}}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module = ParseAndRunNestGemmFusion(hlo);
   HloComputation* fusion_computation = module->entry_computation()
                                            ->root_instruction()
                                            ->fused_instructions_computation();
@@ -232,11 +234,7 @@ ENTRY e {
   ROOT result = (f32[128,128], f32[8192,512]) tuple(r1, r2)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
-  TF_ASSERT_OK_AND_ASSIGN(
-      bool updated,
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()));
-  EXPECT_TRUE(updated);
+  std::unique_ptr<VerifiedHloModule> module = ParseAndRunNestGemmFusion(hlo);
   HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
   EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kFusion);
@@ -285,13 +283,8 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 
   const HloInstruction* fusion = nullptr;
   ASSERT_THAT(module->entry_computation()->root_instruction(),
@@ -332,13 +325,8 @@ ENTRY entry {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 
   const HloInstruction* fusion = nullptr;
   ASSERT_THAT(module->entry_computation()->root_instruction(),
@@ -379,13 +367,7 @@ ENTRY entry {
     }
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest,
@@ -414,13 +396,7 @@ ENTRY entry {
       }
     }
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsCanBeHoistedPastConvertEpilogues) {
@@ -448,12 +424,8 @@ ENTRY entry {
       }
     }
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: f16[3,11]{1,0} convert(
@@ -494,13 +466,7 @@ ENTRY entry {
       }
     }
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsKeepElementSizeInBits) {
@@ -530,12 +496,8 @@ ENTRY entry {
       }
     }
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
   CHECK: ENTRY
@@ -574,13 +536,7 @@ ENTRY entry {
       }
     }
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest, TritonFusionEmitterDeviceLegacyTestSample2) {
@@ -613,13 +569,7 @@ ENTRY entry_computation {
     }
 })";
   // Note: block sizes were 16,16,32, but that now fails to satisfy constraints.
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest, TritonFusionEmitterDeviceLegacyTestSample3) {
@@ -649,13 +599,7 @@ ENTRY entry_computation {
       }
     }
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedPastCompare) {
@@ -684,13 +628,7 @@ ENTRY e {
       "block_m":32,"block_n":16,"block_k":128,
       "split_k":1,"num_stages":1,"num_warps":4, "num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
 }
 
 TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedUpThroughBroadcasts) {
@@ -716,13 +654,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 // Broadcast fusion:
@@ -761,14 +694,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  // We can nest the fusion including the broadcast.
-  ASSERT_TRUE(NestGemmFusion(device_description_, &mlir_context_)
-                  .Run(module.get())
-                  .ok());
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   // Cos should not be rewritten as we cannot hoist bitcast.
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -803,14 +730,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  // We can nest the fusion including the broadcast.
-  ASSERT_TRUE(NestGemmFusion(device_description_, &mlir_context_)
-                  .Run(module.get())
-                  .ok());
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   // Cos should not be rewritten as we cannot hoist bitcast.
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -847,13 +768,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: [[p0:[^ ]+]] = f32[15]{0} parameter(0)
@@ -887,13 +803,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            R"(
 // Broadcast fusion:
@@ -938,13 +849,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
 CHECK: {{.*}} {
@@ -989,13 +895,8 @@ ENTRY entry {
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={0,3}
@@ -1025,13 +926,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK:      ROOT transpose
@@ -1063,13 +959,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK:      ROOT transpose
@@ -1100,13 +991,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK:      transpose
@@ -1137,13 +1023,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
 CHECK:      f32[2,3,5]{2,1,0} $0
@@ -1176,13 +1057,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
 }
           )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   // Checks that transpose is on rank 3 tensor from hoisting bitcast1, not rank
   // 4 tensor from hoisting bitcast0 first and then failing to hoist bitcast1.
   EXPECT_THAT(
@@ -1215,13 +1091,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK:      ROOT transpose
@@ -1251,13 +1122,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK:      ROOT broadcast
@@ -1288,13 +1154,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
 CHECK:      f32[2,3,5]{2,1,0} $0(dot)
@@ -1323,13 +1184,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: ROOT dot
@@ -1360,13 +1216,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK: ROOT add = f32[3,5]{1,0} add
@@ -1401,12 +1252,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK-NOT: bitcast
@@ -1452,12 +1299,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK-NOT: bitcast
@@ -1500,12 +1343,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK-NOT: bitcast
@@ -1513,11 +1352,11 @@ CHECK-NOT: reshape
 CHECK: ENTRY
 )"),
       IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
 }
 
+// TODO(b/393299275): this test was not written correctly and now fails.
 TEST_P(NestGemmFusionReshapeTest,
-       BitcastsAreNotHoistedOutThroughLayoutChangingTranspose) {
+       DISABLED_BitcastsAreNotHoistedOutThroughLayoutChangingTranspose) {
   HloOpcode opcode = GetParam();
   absl::string_view hlo = R"(
 HloModule t
@@ -1542,9 +1381,8 @@ ENTRY e {
     triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
+  std::unique_ptr<VerifiedHloModule> module =
+      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
 CHECK: $0.1 = f32[2,7]{0,1} $0
@@ -1555,7 +1393,6 @@ CHECK-NOT: reshape
         )",
                                             HloOpcodeString(opcode))),
               IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
 }
 
 INSTANTIATE_TEST_SUITE_P(NestGemmFusionReshapeTestSuite,

From d4c9bda1fc4f1da8060dc44ae2be374da5ce129f Mon Sep 17 00:00:00 2001
From: Ilya Tikhonovskiy <loislo@google.com>
Date: Thu, 11 Dec 2025 07:03:23 -0800
Subject: [PATCH 178/753] Integrate Triton up to 8d445186

https://github.com/openxla/triton/tree/triton_integrate_branch-1.15

PiperOrigin-RevId: 843206657
---
 .../triton/llvm_integration/series.bzl        |   4 -
 .../temporary/launched_cluster_dim_fix.patch  | 106 +++++++++++++
 .../triton/temporary/launcher_addition.patch  |  70 +++++++++
 .../launcher_non_portable_clusters.patch      |  19 +++
 .../temporary/launcher_tma_desc_fix.patch     | 144 ------------------
 .../triton/temporary/utility-fix.patch        |  22 ---
 .../xla/third_party/triton/workspace.bzl      |   4 +-
 .../codegen/triton/compilation_pipeline.cc    |  18 +--
 .../gpu/codegen/triton/compilation_pipeline.h |  17 +--
 .../triton/compilation_pipeline_cuda.cc       |  14 +-
 .../triton/compilation_pipeline_rocm.cc       |   2 +
 .../triton/compilation_pipeline_test.cc       |   1 +
 .../xla/backends/gpu/codegen/triton/fusion.cc |   8 +-
 .../gpu/codegen/triton/xtile_compiler.cc      |  27 +---
 .../gpu/codegen/triton/xtile_compiler.h       |   4 -
 .../xla/pjrt/c/pjrt_c_api_triton_extension.h  |   5 +-
 .../xla/pjrt/c/pjrt_c_api_triton_internal.cc  |   3 -
 third_party/xla/xla/pjrt/triton.h             |   3 -
 third_party/xla/xla/pjrt/triton_cuda.cc       |   6 +-
 .../gpu/autotuning/autotune_cache_key.h       |   2 +-
 .../xla/xla/service/gpu/tests/xla-opt.cc      |   5 +-
 .../xla/xla/service/gpu/thunk_emitter.cc      |   4 +-
 22 files changed, 225 insertions(+), 263 deletions(-)
 create mode 100644 third_party/xla/third_party/triton/temporary/launched_cluster_dim_fix.patch
 create mode 100644 third_party/xla/third_party/triton/temporary/launcher_addition.patch
 create mode 100644 third_party/xla/third_party/triton/temporary/launcher_non_portable_clusters.patch
 delete mode 100644 third_party/xla/third_party/triton/temporary/launcher_tma_desc_fix.patch
 delete mode 100644 third_party/xla/third_party/triton/temporary/utility-fix.patch

diff --git a/third_party/xla/third_party/triton/llvm_integration/series.bzl b/third_party/xla/third_party/triton/llvm_integration/series.bzl
index 15b35c93bc457c..656b9c894904d8 100644
--- a/third_party/xla/third_party/triton/llvm_integration/series.bzl
+++ b/third_party/xla/third_party/triton/llvm_integration/series.bzl
@@ -8,9 +8,5 @@ LLVM nor MLIR integrator, please do not add any patches to this list.
 """
 
 llvm_patch_list = [
-    "//third_party/triton:llvm_integration/cl831451347.patch",
-    "//third_party/triton:llvm_integration/cl833447018.patch",
-    "//third_party/triton:llvm_integration/cl835942347.patch",
-    "//third_party/triton:llvm_integration/cl838780160.patch",
     # Add new patches just above this line
 ]
diff --git a/third_party/xla/third_party/triton/temporary/launched_cluster_dim_fix.patch b/third_party/xla/third_party/triton/temporary/launched_cluster_dim_fix.patch
new file mode 100644
index 00000000000000..9386923e48f637
--- /dev/null
+++ b/third_party/xla/third_party/triton/temporary/launched_cluster_dim_fix.patch
@@ -0,0 +1,106 @@
+https://github.com/triton-lang/triton/pull/8645 removed cluster dimensions from the
+kernel metadata. We don't need to pass them to the launch API anymore.
+
+--- a/third_party/nvidia/backend/cuda_utils.cc	2025-11-17 02:33:44.000000000 -0800
++++ b/third_party/nvidia/backend/cuda_utils.cc	2025-11-18 03:58:02.000000000 -0800
+@@ -119,8 +119,8 @@
+     constexpr int size() const { return x * y * z; }
+   };
+   Dim grid;                     // Number of clusters per grid
+-  Dim cluster;                  // Number of blocks per cluster
+   int num_warps;                // number of warps per block
++  int num_ctas;                 // number of CTAs per block
+   int shared_memory;            // Size of shared memory in bytes to allocate
+   int launch_cooperative_grid;  // Non-zero to launch coop grid
+   int launch_pdl;               // Non-zero to use programatic-dependent launch
+@@ -137,16 +137,15 @@
+   // APIs if needed.
+   Py_BEGIN_ALLOW_THREADS;
+   const auto& grid = config.grid;
+-  const auto& cluster = config.cluster;
+   if (grid.size() == 0) {
+     PyEval_RestoreThread(_save);
+     Py_RETURN_NONE;
+   }
+ 
+   CUlaunchConfig cu_config;
+-  cu_config.gridDimX = grid.x * cluster.x;
+-  cu_config.gridDimY = grid.y * cluster.y;
+-  cu_config.gridDimZ = grid.z * cluster.z;
++  cu_config.gridDimX = grid.x * config.num_ctas;
++  cu_config.gridDimY = grid.y;
++  cu_config.gridDimZ = grid.z;
+   cu_config.blockDimX = 32 * config.num_warps;
+   cu_config.blockDimY = 1;
+   cu_config.blockDimZ = 1;
+@@ -169,12 +168,12 @@
+       .value = { .cooperative =  1}
+     };
+   }
+-  if (config.cluster.size() > 1) {
++  if (config.num_ctas != 1) {
+     auto& clusterDimAttr = launchAttr[cu_config.numAttrs++];
+     clusterDimAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+-    clusterDimAttr.value.clusterDim.x = cluster.x;
+-    clusterDimAttr.value.clusterDim.y = cluster.y;
+-    clusterDimAttr.value.clusterDim.z = cluster.z;
++    clusterDimAttr.value.clusterDim.x = config.num_ctas;
++    clusterDimAttr.value.clusterDim.y = 1;
++    clusterDimAttr.value.clusterDim.z = 1;
+     auto& clusterDimSchedulingAttr = launchAttr[cu_config.numAttrs++];
+     clusterDimSchedulingAttr.id =
+         CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
+@@ -518,9 +517,8 @@
+ :param launch_pdl: enable programmatic dependent launch
+ :type launch_pdl: bool
+ :param packed_metadata: Kernel metadata, including in sequence:
+-    number of warps, number of CTAs, required bytes of shared memory,
+-    cluster dimensions x, y, and z
+-:type packed_metadata: 6-tuple
++    number of warps, number of CTAs, required bytes of shared memory
++:type packed_metadata: 3-tuple
+ :param hook_args: arguments to pass to the enter and exit hooks
+ :type hook_args: object
+ :param launch_enter_hook: hook to call just before launching the kernel
+@@ -542,7 +540,6 @@
+   ensureCudaContext();
+   TritonLaunchConfig config{};
+   auto& grid = config.grid;
+-  auto& cluster = config.cluster;
+   // PyObject* kernel_metadata = nullptr;
+   PyObject* hook_args = nullptr;
+   PyObject* launch_enter_hook = nullptr;
+@@ -551,15 +548,13 @@
+   PyObject* kernel_args = nullptr;
+   PyObject* global_scratch = nullptr;
+   PyObject* profile_scratch = nullptr;
+-  int num_ctas = 0;
+-  if (!PyArg_ParseTuple(args, "iiiKKpp(iiiiii)OOOSOOO", &grid.x, &grid.y,
+-                        &grid.z, &config.stream, &config.function,
++  if (!PyArg_ParseTuple(args, "iiiKKpp(iii)OOOSOOO", &grid.x, &grid.y, &grid.z,
++                        &config.stream, &config.function,
+                         &config.launch_cooperative_grid, &config.launch_pdl,
+-                        &config.num_warps, &num_ctas, &config.shared_memory,
+-                        &cluster.x, &cluster.y, &cluster.z, &hook_args,
+-                        &launch_enter_hook, &launch_exit_hook,
+-                        &signature_metadata_bytes, &global_scratch,
+-                        &profile_scratch, &kernel_args)) {
++                        &config.num_warps, &config.num_ctas,
++                        &config.shared_memory, &hook_args, &launch_enter_hook,
++                        &launch_exit_hook, &signature_metadata_bytes,
++                        &global_scratch, &profile_scratch, &kernel_args)) {
+     return nullptr;
+   }
+   llvm::ArrayRef<char> signature_metadata(
+
+--- a/third_party/nvidia/backend/driver.py	2025-11-13 05:31:00.000000000 -0800
++++ b/third_party/nvidia/backend/driver.py	2025-11-18 03:45:28.000000000 -0800
+@@ -223,7 +223,7 @@
+     def wrapper(grid_dim_x: int, grid_dim_y: int, grid_dim_z: int,
+                 stream: int, kernel: int, launch_cooperative_grid: bool,
+                 launch_pdl: bool, global_scratch: any, profile_scratch: any,
+-                packed_metadata: tuple[int, int, int, int, int, int],
++                packed_metadata: tuple[int, int, int],
+                 hook_args: any,
+                 launch_enter_hook: Callable[..., None],
+                 launch_exit_hook: Callable[..., None],
diff --git a/third_party/xla/third_party/triton/temporary/launcher_addition.patch b/third_party/xla/third_party/triton/temporary/launcher_addition.patch
new file mode 100644
index 00000000000000..7aa0f2e9551d9a
--- /dev/null
+++ b/third_party/xla/third_party/triton/temporary/launcher_addition.patch
@@ -0,0 +1,70 @@
+
+--- a/third_party/nvidia/backend/cuda_utils.cc	2025-11-13 05:31:00.000000000 -0800
++++ b/third_party/nvidia/backend/cuda_utils.cc	2025-11-14 02:50:50.000000000 -0800
+@@ -59,7 +59,7 @@
+ #define CUDA_CHECK_AND_RETURN_NULL(ans)                                        \
+   do {                                                                         \
+     if (!gpuAssert((ans), __FILE__, __LINE__))                                 \
+-      goto cleanup;                                                            \
++      return NULL;                                                            \
+   } while (0)
+ 
+ // To be used inside a Py_{BEGIN,END}_ALLOW_THREADS block.
+@@ -77,7 +77,7 @@
+     if ((funcPointer) == NULL) {                                               \
+       (funcPointer) = (initializerFunction)();                                 \
+       if ((funcPointer) == NULL) {                                             \
+-        goto cleanup;                                                          \
++        return NULL;                                                          \
+       }                                                                        \
+     }                                                                          \
+   } while (0)
+@@ -912,16 +912,21 @@
+ 
+ // clang-format off
+ static PyTypeObject PyCUtensorMapType = {
+-    PyVarObject_HEAD_INIT(NULL, 0)
++    .ob_base = {
++        .ob_base = {
++            .ob_type = NULL,
++        },
++        .ob_size = 0,
++    },
+     .tp_name = "triton.backends.nvidia.PyCUtensorMap",
+     .tp_basicsize = sizeof(PyCUtensorMapObject),
+     .tp_itemsize = 0,
++    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
+     .tp_flags = Py_TPFLAGS_DEFAULT,
+     .tp_doc = "<PyCUtensorMap object>",
+     .tp_methods = PyCUtensorMap_methods,
++    .tp_alloc = PyCUtensorMap_alloc,
+     .tp_new = PyType_GenericNew,
+-    .tp_alloc = PyCUtensorMap_alloc,
+-    .tp_dealloc = (destructor)PyCUtensorMap_dealloc,
+     .tp_free = PyCUtensorMap_free,
+ };
+ // clang-format on
+@@ -1056,9 +1061,11 @@
+   INITIALIZE_FUNCTION_POINTER_IF_NULL(cuTensorMapEncodeTiled,
+                                       getCuTensorMapEncodeTiledHandle);
+   CUresult res = cuTensorMapEncodeTiled(
+-      &desc->tensorMap, elemType, rank, (void *)global_address, shapeInt,
+-      stridesLL, blockSizeInt, elementStrides, CU_TENSOR_MAP_INTERLEAVE_NONE,
+-      swizzle, CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill);
++      &desc->tensorMap, (CUtensorMapDataType)elemType, rank,
++      (void*)global_address, shapeInt, stridesLL, blockSizeInt, elementStrides,
++      CU_TENSOR_MAP_INTERLEAVE_NONE, (CUtensorMapSwizzle)swizzle,
++      CU_TENSOR_MAP_L2_PROMOTION_L2_128B, fill);
++
+   if (res != CUDA_SUCCESS) {
+     const char *str;
+     cuGetErrorString(res, &str);
+@@ -1104,8 +1111,6 @@
+   }
+ 
+   return (PyObject *)desc;
+-
+-  return result;
+ }
+ 
+ static PyMethodDef ModuleMethods[] = {
diff --git a/third_party/xla/third_party/triton/temporary/launcher_non_portable_clusters.patch b/third_party/xla/third_party/triton/temporary/launcher_non_portable_clusters.patch
new file mode 100644
index 00000000000000..c4e4872e455f08
--- /dev/null
+++ b/third_party/xla/third_party/triton/temporary/launcher_non_portable_clusters.patch
@@ -0,0 +1,19 @@
+This should be merged with the launcher patch afterwards.
+
+--- a/third_party/nvidia/backend/cuda_utils.cc	2025-11-18 05:00:39.000000000 -0800
++++ b/third_party/nvidia/backend/cuda_utils.cc	2025-12-02 08:10:32.000000000 -0800
+@@ -180,6 +180,14 @@
+     clusterDimSchedulingAttr.value.clusterSchedulingPolicyPreference =
+         CU_CLUSTER_SCHEDULING_POLICY_SPREAD;
+   }
++
++  // As per the comment in third_party/nvidia/backend/driver.py,
++  // "num_ctas == 16 is non-portable. Does work for H100 and B200 tho".
++  if (config.num_ctas == 16) {
++    CUDA_CHECK(cuFuncSetAttribute(
++        config.function, CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED,
++        1));
++  }
+ 
+   // cuLaunchKernelEx was added in CUDA 12, so load it dynamically to be
+   // able to link on CUDA 11 and earlier.
diff --git a/third_party/xla/third_party/triton/temporary/launcher_tma_desc_fix.patch b/third_party/xla/third_party/triton/temporary/launcher_tma_desc_fix.patch
deleted file mode 100644
index 57d4c2121e37ea..00000000000000
--- a/third_party/xla/third_party/triton/temporary/launcher_tma_desc_fix.patch
+++ /dev/null
@@ -1,144 +0,0 @@
-diff --git a/third_party/nvidia/backend/cuda_utils.cc b/third_party/nvidia/backend/cuda_utils.cc
---- a/third_party/nvidia/backend/cuda_utils.cc
-+++ b/third_party/nvidia/backend/cuda_utils.cc
-@@ -270,51 +270,16 @@ bool extractPointer(PyObject* obj, void*
-   return true;
- }
- 
-+CUtensorMap* getTmaDesc(PyObject* obj);
-+
- // Extract a CUtensorMap descriptor from a python object, and store it to the
- // memory location pointed by ptr.
- bool extractTmaDesc(PyObject* obj, void* ptr) {
--  if (sizeof(CUtensorMap*) != 8) {
--    PyErr_SetString(PyExc_SystemError,
--                "extractTmaDesc() requires 64-bit compilation");
--    return false;
--  }
--
--  UniquePyObjectPtr method_ret(
--      PyObject_CallMethod(obj, "tma_desc_cpu_ptr", nullptr));
--  // Checking the error retains context if tma_desc_cpu_ptr raises an exception.
--  if (PyErr_Occurred()) {
--    return false;
--  }
--
--  if (!method_ret) {
--    PyErr_SetString(PyExc_SystemError, "Call to tma_desc_cpu_ptr() failed");
-+  CUtensorMap* tensor_map = getTmaDesc(obj);
-+  if (tensor_map == nullptr) {
-     return false;
-   }
--
--  if (!PyLong_Check(method_ret.get())) {
--    PyErr_SetString(PyExc_TypeError,
--                    "tma_desc_cpu_ptr() must return 64-bit int");
--    return false;
--  }
--
--  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret.get());
--  if (PyErr_Occurred()) {
--    return false;
--  }
--
--  if (!ptr_as_uint) {
--    PyErr_SetString(PyExc_ValueError,
--                    "received NULL ptr from tma_desc_cpu_ptr()");
--    return false;
--  }
--  if (ptr_as_uint % 64 != 0) {
--    PyErr_SetString(PyExc_ValueError,
--                    "tma_desc_cpu_ptr() must be 64-byte aligned");
--    return false;
--  }
--
--  *static_cast<CUtensorMap*>(ptr) =
--      *reinterpret_cast<CUtensorMap*>(ptr_as_uint);
-+  *static_cast<CUtensorMap*>(ptr) = *tensor_map;
-   return true;
- }
- 
-@@ -392,6 +357,7 @@ struct ExtractionInfo {
-   // Prefixes of types reprs supported by the extractor.
-   llvm::SmallVector<llvm::StringRef> supported_type_repr_prefixes;
-   std::size_t size;         // Size required by the extracted value.
-+  std::size_t alignment;    // Alignment requirement for the extracted value.
-   ExtractorType extractor;  // Function to call to extract the value.
- 
-   // Builds an ExtractionInfo for a given type T and a list of type reprs that
-@@ -400,7 +366,7 @@ struct ExtractionInfo {
-   static ExtractionInfo build(
-       std::initializer_list<llvm::StringRef> supported_type_reprs,
-       ExtractorType extractor = extractValue<T>) {
--    return {supported_type_reprs, sizeof(T), extractor};
-+    return {supported_type_reprs, sizeof(T), alignof(T), extractor};
-   }
- 
-   // Checks if the extractor supports extracting a given type repr.
-@@ -428,7 +394,7 @@ const ExtractionInfo kExtractionInfos[]{
-     // Note: types are e.g. '*fp32', so no closing quote is intentional.
-     ExtractionInfo::build<void*>({"'*"}, extractPointer),
-     ExtractionInfo{
--        {"None", "'none'"}, 0, nullptr},  // Represent constexprs as None
-+        {"None", "'none'"}, 0, 0, nullptr},  // Represent constexprs as None
-     ExtractionInfo::build<CUtensorMap>({"'nvTmaDesc'"}, extractTmaDesc),
- };
- 
-@@ -628,7 +594,19 @@ PyObject* launch(PyObject* self, PyObjec
-     if (extraction_info.size == 0) {
-       continue;  // skip adding constexpr parameters
-     }
--    config.params[params_idx] = alloca(extraction_info.size);
-+    size_t alignment = std::max(1UL, extraction_info.alignment);
-+
-+    // Allocate enough space on the stack to guarantee an aligned block.
-+    size_t size_with_alignment = extraction_info.size + alignment - 1;
-+    void *param_storage_ptr = alloca(size_with_alignment);
-+
-+    void *aligned_ptr = std::align(alignment, extraction_info.size,
-+                                   param_storage_ptr, size_with_alignment);
-+    if (aligned_ptr == nullptr) {
-+      PyErr_SetString(PyExc_MemoryError, "Failed to align parameter storage");
-+      return nullptr;
-+    }
-+    config.params[params_idx] = aligned_ptr;
-     if (!extraction_info.extractor(arg, config.params[params_idx])) {
-       return nullptr;
-     }
-@@ -940,6 +918,36 @@ static PyTypeObject PyCUtensorMapType = 
- };
- // clang-format on
- 
-+namespace {
-+
-+// Extracts a pointer to `CUtensorMap` from a `PyCUtensorMapObject`.
-+CUtensorMap* getTmaDesc(PyObject* obj) {
-+  if (sizeof(CUtensorMap*) != 8) {
-+    PyErr_SetString(PyExc_SystemError,
-+                    "getTmaDesc() requires 64-bit compilation");
-+    return nullptr;
-+  }
-+  if (Py_TYPE(obj) != static_cast<PyTypeObject*>(&PyCUtensorMapType)) {
-+    PyErr_Format(PyExc_TypeError,
-+                 "object must be of type PyCUtensorMap, got %s",
-+                 Py_TYPE(obj)->tp_name);
-+    return nullptr;
-+  }
-+  CUtensorMap* map = &((PyCUtensorMapObject*)obj)->tensorMap;
-+  // PyCUtensorMapObject aligns tensorMap to 128.
-+  uintptr_t align_128 = (uintptr_t)map & (128 - 1);
-+  if (align_128 != 0) {
-+    PyErr_Format(
-+        PyExc_ValueError,
-+        "CUtensorMap must be aligned to 128B, but got (&map) mod 128 = %ld",
-+        align_128);
-+    return nullptr;
-+  }
-+  return map;
-+}
-+
-+}  // namespace
-+
- static PyObject *fillTMADescriptor(PyObject *self, PyObject *args) {
-   unsigned long long global_address;
-   int swizzle;
diff --git a/third_party/xla/third_party/triton/temporary/utility-fix.patch b/third_party/xla/third_party/triton/temporary/utility-fix.patch
deleted file mode 100644
index f8cc5d0821f098..00000000000000
--- a/third_party/xla/third_party/triton/temporary/utility-fix.patch
+++ /dev/null
@@ -1,22 +0,0 @@
-This patch would probably not be accepted upstream because our infrastructure
-uses Index type for indexing, while they use Integer type. Triton frontend
-wouldn't generate a situation that would run into this issue.
-
-diff --git a/lib/Dialect/Triton/IR/Utility.cpp b/lib/Dialect/Triton/IR/Utility.cpp
---- a/lib/Dialect/Triton/IR/Utility.cpp
-+++ b/lib/Dialect/Triton/IR/Utility.cpp
-@@ -97,8 +97,12 @@ Value tt::getLastInductionValue(OpBuilde
-   // (ub - lb -1) // step * step + lb
-   Value diff =
-       b.create<arith::SubIOp>(loc, loop.getUpperBound(), loop.getLowerBound());
--  diff = b.create<arith::SubIOp>(
--      loc, diff, b.create<arith::ConstantOp>(loc, b.getI32IntegerAttr(1)));
-+  Value one;
-+  if (diff.getType().isIndex())
-+    one = b.create<arith::ConstantIndexOp>(loc, 1);
-+  else
-+    one = b.create<arith::ConstantOp>(loc, b.getIntegerAttr(diff.getType(), 1));
-+  diff = b.create<arith::SubIOp>(loc, diff, one);
-   Value ceilStep = b.create<arith::MulIOp>(
-       loc, b.create<arith::DivSIOp>(loc, diff, loop.getStep()), loop.getStep());
-   return b.create<arith::AddIOp>(loc, ceilStep, loop.getLowerBound());
diff --git a/third_party/xla/third_party/triton/workspace.bzl b/third_party/xla/third_party/triton/workspace.bzl
index 6316fc91a1a9fa..fd263961ed17fd 100644
--- a/third_party/xla/third_party/triton/workspace.bzl
+++ b/third_party/xla/third_party/triton/workspace.bzl
@@ -7,8 +7,8 @@ load("//third_party/triton:temporary/series.bzl", "temporary_patch_list")
 def repo():
     """Imports Triton."""
 
-    TRITON_COMMIT = "triton_integrate_branch-1.14"
-    TRITON_SHA256 = "b684cff8d07e839f8a1ea6cc7d331f370615b4c5530489db76f619aa7aa66608"
+    TRITON_COMMIT = "triton_integrate_branch-1.15"
+    TRITON_SHA256 = "a502364ad54bd822dae5d2fc6215695f7d343617a8c643a39a49f40ef474d013"
     tf_http_archive(
         name = "triton",
         sha256 = TRITON_SHA256,
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
index 67c1eb9b94d7a7..3fb0c9c2a1e1de 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "xla/codegen/emitters/transforms/passes.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/device_description.h"
-#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
 
 namespace xla::gpu {
 
@@ -68,30 +67,23 @@ void CreateTritonXlaPipeline(
 void CreateTritonCudaPipeline(
     mlir::OpPassManager* pm,
     const stream_executor::CudaComputeCapability& cuda_cc, int num_warps,
-    int num_ctas, int num_stages,
-    mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info);
+    int num_ctas, int num_stages);
 
 void CreateTritonRocmPipeline(
     mlir::OpPassManager* pm,
     const stream_executor::RocmComputeCapability& rocm_cc, int num_warps,
     int num_ctas, int num_stages);
 
-void CreateTritonPipeline(
-    mlir::OpPassManager* pm,
-    const stream_executor::GpuComputeCapability& gpu_cc, int num_warps,
-    int num_ctas, int num_stages,
-    mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info) {
+void CreateTritonPipeline(mlir::OpPassManager* pm,
+                          const stream_executor::GpuComputeCapability& gpu_cc,
+                          int num_warps, int num_ctas, int num_stages) {
   if (auto* cuda_cc = gpu_cc.cuda_compute_capability()) {
     return CreateTritonCudaPipeline(pm, *cuda_cc, num_warps, num_ctas,
-                                    num_stages, out_cluster_info);
+                                    num_stages);
   }
 
   CreateTritonRocmPipeline(pm, *gpu_cc.rocm_compute_capability(), num_warps,
                            num_ctas, num_stages);
-  // There is no clusters in ROCm for now.
-  out_cluster_info.clusterDimX = 1;
-  out_cluster_info.clusterDimY = 1;
-  out_cluster_info.clusterDimZ = 1;
 }
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.h b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.h
index 8ae26d3a691cc5..2d4fc0ee7fa5b3 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.h
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "mlir/Pass/PassManager.h"
 #include "xla/stream_executor/device_description.h"
-#include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
 
 namespace xla::gpu {
 
@@ -29,19 +28,9 @@ void CreateTritonXlaPipeline(
     bool allow_tma, int num_stages);
 
 // Creates a Triton compilation pipeline.
-//
-// `out_cluster_info` must be kept alive at least until pm.run() is called.
-// It should be read after that. We have to pass the cluster dims to
-// LaunchDimensions. Triton currently uses this as an out-parameter to return
-// the cluster dims determined based on `config.num_ctas` and a heuristic. There
-// are some signs that show that this was intended to be used as an in-out
-// parameter which would give a hint to Triton which cluster dims we prefer to
-// use, but that's not the case currently.
-void CreateTritonPipeline(
-    mlir::OpPassManager* pm,
-    const stream_executor::GpuComputeCapability& gpu_cc, int num_warps,
-    int num_ctas, int num_stages,
-    mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info);
+void CreateTritonPipeline(mlir::OpPassManager* pm,
+                          const stream_executor::GpuComputeCapability& gpu_cc,
+                          int num_warps, int num_ctas, int num_stages);
 
 }  // namespace xla::gpu
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
index 133de281e50e85..00ba104d3bc82e 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
@@ -61,17 +61,14 @@ static void MakeTTIR(mlir::OpPassManager* pm,
 // @triton//:third_party/nvidia/backend/compiler.py
 static void MakeTTGIR(mlir::OpPassManager* pm,
                       const stream_executor::CudaComputeCapability& cuda_cc,
-                      int num_warps, int num_ctas, int num_stages,
-                      mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info) {
+                      int num_warps, int num_ctas, int num_stages) {
   const int cuda_cc_as_int = cuda_cc.major * 10 + cuda_cc.minor;
   pm->addPass(mt::createConvertTritonToTritonGPU(
       {absl::StrFormat("cuda:%u", cuda_cc_as_int), num_warps,
        /*threads_per_warp=*/32, num_ctas}));
   pm->addPass(mt::gpu::createTritonGPUCoalesce());
-  if (cuda_cc.IsAtLeastAmpere()) {
-    pm->addPass(mt::gpu::createTritonGPUF32DotTC());
-  }
-  pm->addPass(ttng::createTritonNvidiaGPUPlanCTAPass(&out_cluster_info));
+  pm->addPass(mt::gpu::createTritonGPUF32DotTC({cuda_cc.IsAtLeastAmpere()}));
+  pm->addPass(ttng::createTritonNvidiaGPUPlanCTAPass());
   pm->addPass(mt::gpu::createTritonGPURemoveLayoutConversions());
   pm->addPass(mt::gpu::createTritonGPUOptimizeThreadLocality());
   pm->addPass(mt::gpu::createTritonGPUAccelerateMatmul());
@@ -169,10 +166,9 @@ static void MakeLLIR(mlir::OpPassManager* pm,
 void CreateTritonCudaPipeline(
     mlir::OpPassManager* pm,
     const stream_executor::CudaComputeCapability& cuda_cc, int num_warps,
-    int num_ctas, int num_stages,
-    mlir::triton::nvidia_gpu::ClusterInfo& out_cluster_info) {
+    int num_ctas, int num_stages) {
   MakeTTIR(pm, cuda_cc);
-  MakeTTGIR(pm, cuda_cc, num_warps, num_ctas, num_stages, out_cluster_info);
+  MakeTTGIR(pm, cuda_cc, num_warps, num_ctas, num_stages);
   MakeLLIR(pm, cuda_cc);
 }
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
index 502f22fafa6b35..2786b61fc4fd73 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
@@ -63,6 +63,7 @@ static void MakeTTGIR(mlir::OpPassManager* pm,
       {absl::StrCat("hip:", rocm_cc.gfx_version()), num_warps, threadsPerWarp,
        num_ctas}));
   pm->addPass(mt::gpu::createTritonGPUCoalesce());
+  pm->addPass(mt::gpu::createTritonGPUF32DotTC({false}));
   pm->addPass(mt::gpu::createTritonGPURemoveLayoutConversions());
   pm->addPass(mt::gpu::createTritonGPUOptimizeThreadLocality());
   // TODO ROCm Pass rocm_cc.gfx_version() after fixing issue with fmfa
@@ -123,6 +124,7 @@ static void MakeLLIR(mlir::OpPassManager* pm,
                      const stream_executor::RocmComputeCapability& rocm_cc,
                      int num_stages) {
   const int custom_lds_size = 0;
+  pm->addPass(mlir::createTritonAMDGPUUpdateAsyncWaitCount());
   pm->addPass(mlir::triton::AMD::createOptimizeLDSUsagePass(
       rocm_cc.gfx_version(), custom_lds_size));
   pm->addPass(mlir::createSCFToControlFlowPass());
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc
index 69d5d43dcb20e6..a1be422730922c 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/strings/str_join.h"
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion.cc
index 2caf28722c47e8..88443f24071477 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion.cc
@@ -203,9 +203,8 @@ absl::StatusOr<TritonFusion::EmitResult> TritonFusion::Emit(
         local_module.get()));
 
     return {{kernel->getName().str(), launch_dimensions,
-             triton_wrapper_result.cluster_dim,
-             triton_wrapper_result.shmem_bytes, /*binary=*/"",
-             triton_wrapper_result.tma_metadata}};
+             /*cluster_dim=*/std::nullopt, triton_wrapper_result.shmem_bytes,
+             /*binary=*/"", triton_wrapper_result.tma_metadata}};
   };
 
   auto [status_or_entry, was_cached] =
@@ -218,7 +217,8 @@ absl::StatusOr<TritonFusion::EmitResult> TritonFusion::Emit(
           Thunk::ThunkInfo::WithProfileAnnotation(
               &fusion, ir_emitter_context.GetNextThunkId()),
           entry->kernel_name, kernel_arguments, entry->launch_dimensions,
-          entry->cluster_dim, entry->shmem_bytes, entry->tma_metadata),
+          /*cluster_dim=*/std::nullopt, entry->shmem_bytes,
+          entry->tma_metadata),
       was_cached ? nullptr : std::move(local_module)};
 }
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.cc b/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.cc
index 8d2b17c25097dd..36fadbcef88696 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.cc
@@ -432,9 +432,7 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
         "(num_warps, num_ctas, num_stages) must be positive, but got: (",
         num_warps, ", ", num_ctas, ", ", num_stages, ")"));
   }
-  mlir::triton::nvidia_gpu::ClusterInfo cluster_info;
-  CreateTritonPipeline(&pm, gpu_cc, num_warps, num_ctas, num_stages,
-                       cluster_info);
+  CreateTritonPipeline(&pm, gpu_cc, num_warps, num_ctas, num_stages);
 
   // Triton generates pointers to the global address space, while XLA needs a
   // kernel signature with pointers to the generic address space.
@@ -496,24 +494,6 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
     }
   }
 
-  // `cluster_info` must be read after pm.run().
-  std::optional<se::ClusterDim> cluster_dim;
-  if (block_level_parameters.num_ctas > 1) {
-    VLOG(3) << "num_ctas: " << block_level_parameters.num_ctas
-            << ", cluster_info: " << cluster_info.clusterDimX << ","
-            << cluster_info.clusterDimY << "," << cluster_info.clusterDimZ;
-    if (cluster_info.clusterDimX > 1 || cluster_info.clusterDimY > 1 ||
-        cluster_info.clusterDimZ > 1) {
-      cluster_dim =
-          se::ClusterDim(cluster_info.clusterDimX, cluster_info.clusterDimY,
-                         cluster_info.clusterDimZ);
-    }
-  } else {
-    TF_RET_CHECK(cluster_info.clusterDimX == 1 &&
-                 cluster_info.clusterDimY == 1 &&
-                 cluster_info.clusterDimZ == 1);
-  }
-
   SmallVector<mlir::LLVM::LLVMFuncOp> func_ops;
   for (auto func : triton_module.getOps<mlir::LLVM::LLVMFuncOp>()) {
     // Custom calls will also match to LLVMFuncOp, so we are only interested in
@@ -535,10 +515,7 @@ absl::StatusOr<TritonWrapperResult> CompileTritonToLLVM(
   // - TMA metadata.
   // - Total threads per block. Computed from module attributes.
   // - Captured NVVM annotations.
-  TritonWrapperResult result = {shared_mem_bytes,
-                                cluster_dim,
-                                tma_metadata,
-                                thread_dims,
+  TritonWrapperResult result = {shared_mem_bytes, tma_metadata, thread_dims,
                                 captured_nvvm_annotations,
                                 std::move(ll_triton_module)};
   return result;
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.h b/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.h
index 3370b7f81a12c0..a1fe63ad299d04 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.h
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/xtile_compiler.h
@@ -48,9 +48,6 @@ limitations under the License.
 
 namespace mlir {
 namespace triton {
-namespace nvidia_gpu {
-struct ClusterInfo;
-}
 }  // namespace triton
 }  // namespace mlir
 
@@ -59,7 +56,6 @@ namespace gpu {
 
 struct TritonWrapperResult {
   int64_t shmem_bytes = 0;
-  std::optional<se::ClusterDim> cluster_dim;
   se::gpu::TmaMetadata tma_metadata;
   se::ThreadDim thread_dims;
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_extension.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_extension.h
index 3eef6aca6b2c75..282e84b109e24e 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_extension.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_extension.h
@@ -39,11 +39,8 @@ struct PJRT_Triton_Compile_Args {
   const char* out_asm;  // owned
   size_t out_asm_size;
   int64_t out_smem_bytes;
-  int out_cluster_dim_x;
-  int out_cluster_dim_y;
-  int out_cluster_dim_z;
 };
-PJRT_DEFINE_STRUCT_TRAITS(PJRT_Triton_Compile_Args, out_cluster_dim_z);
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Triton_Compile_Args, out_smem_bytes);
 
 // Compiles a given Triton kernel.
 typedef PJRT_Error* PJRT_Triton_Compile(PJRT_Triton_Compile_Args* args);
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_internal.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_internal.cc
index df5070f806e3f7..4212c92475d24b 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_internal.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_triton_internal.cc
@@ -42,9 +42,6 @@ PJRT_Error* PJRT_Triton_Compile(PJRT_Triton_Compile_Args* args) {
   args->out_asm = asm_copy;
   args->out_asm_size = result.asm_text.size();
   args->out_smem_bytes = result.smem_bytes;
-  args->out_cluster_dim_x = result.cluster_dim_x;
-  args->out_cluster_dim_y = result.cluster_dim_y;
-  args->out_cluster_dim_z = result.cluster_dim_z;
   return nullptr;
 }
 
diff --git a/third_party/xla/xla/pjrt/triton.h b/third_party/xla/xla/pjrt/triton.h
index 81eabeb1adefaf..528922d38558bc 100644
--- a/third_party/xla/xla/pjrt/triton.h
+++ b/third_party/xla/xla/pjrt/triton.h
@@ -27,9 +27,6 @@ namespace xla::triton {
 struct CompilationResult {
   std::string asm_text;
   int64_t smem_bytes;
-  int cluster_dim_x;
-  int cluster_dim_y;
-  int cluster_dim_z;
 };
 
 absl::StatusOr<CompilationResult> Compile(absl::string_view module,
diff --git a/third_party/xla/xla/pjrt/triton_cuda.cc b/third_party/xla/xla/pjrt/triton_cuda.cc
index 5b4b8a69395d45..acd8866aa40bb6 100644
--- a/third_party/xla/xla/pjrt/triton_cuda.cc
+++ b/third_party/xla/xla/pjrt/triton_cuda.cc
@@ -228,13 +228,12 @@ absl::StatusOr<CompilationResult> Compile(absl::string_view module,
 
   mlir::PassManager pm(&context);
   pm.enableVerifier();
-  mlir::triton::nvidia_gpu::ClusterInfo cluster_info;
   TF_ASSIGN_OR_RETURN(
       auto cuda_cc,
       stream_executor::CudaComputeCapability::FromString(arch_name));
   xla::gpu::CreateTritonPipeline(&pm,
                                  stream_executor::GpuComputeCapability(cuda_cc),
-                                 num_warps, num_ctas, num_stages, cluster_info);
+                                 num_warps, num_ctas, num_stages);
   if (failed(pm.run(*module_op))) {
     return absl::InternalError("Failed to compile Triton IR to LLVM IR");
   }
@@ -247,9 +246,6 @@ absl::StatusOr<CompilationResult> Compile(absl::string_view module,
   return CompilationResult{
       ptx,
       shared_mem_bytes,
-      cluster_info.clusterDimX,
-      cluster_info.clusterDimY,
-      cluster_info.clusterDimZ,
   };
 }
 
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h
index c133650da12dda..83a360383f2d60 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h
+++ b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h
@@ -32,7 +32,7 @@ class AutotuneCacheKey {
   // Tie a version to the cache key in order to invalidate the cache when
   // necessary. This should be incremented on triton upgrades or any other
   // changes that may affect the autotuning results.
-  static constexpr int kCurrentVersion = 19;
+  static constexpr int kCurrentVersion = 20;
 
   AutotuneCacheKey(const se::DeviceDescription& device_description,
                    const HloInstruction& instruction,
diff --git a/third_party/xla/xla/service/gpu/tests/xla-opt.cc b/third_party/xla/xla/service/gpu/tests/xla-opt.cc
index 850a788aaf73ef..2a1cf0e0cec05a 100644
--- a/third_party/xla/xla/service/gpu/tests/xla-opt.cc
+++ b/third_party/xla/xla/service/gpu/tests/xla-opt.cc
@@ -43,8 +43,6 @@ limitations under the License.
 
 namespace {
 
-mlir::triton::nvidia_gpu::ClusterInfo cluster_info;
-
 struct TritonPipelineOptions
     : public mlir::PassPipelineOptions<TritonPipelineOptions> {
   Option<std::string> target{*this, "target", llvm::cl::init("8.0")};
@@ -75,8 +73,7 @@ mlir::PassPipelineRegistration<TritonPipelineOptions>
                                             options.allow_tma,
                                             options.num_stages);
           xla::gpu::CreateTritonPipeline(&pm, gpu_cc, options.num_warps,
-                                         options.num_ctas, options.num_stages,
-                                         cluster_info);
+                                         options.num_ctas, options.num_stages);
         });
 
 }  // namespace
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 4f38a16b7232d0..3822ebc2438363 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -1340,7 +1340,7 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitTritonCustomCall(
     }
 
     kernel_modules_.push_back(std::move(result.llvm_module));
-    return {{kernel_name, launch_dimensions, result.cluster_dim,
+    return {{kernel_name, launch_dimensions, /*cluster_dim=*/std::nullopt,
              result.shmem_bytes}};
   };
 
@@ -1358,7 +1358,7 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitTritonCustomCall(
       Thunk::ThunkInfo::WithProfileAnnotation(
           instr, ir_emitter_context_->GetNextThunkId()),
       entry->kernel_name, kernel_arguments, entry->launch_dimensions,
-      entry->cluster_dim, entry->shmem_bytes, entry->tma_metadata));
+      /*cluster_dim=*/std::nullopt, entry->shmem_bytes, entry->tma_metadata));
 }
 
 absl::StatusOr<ThunkSequence> ThunkEmitter::EmitAsyncComputation(

From 5d1d09eeac829964d196ac97dd6c0362999a272a Mon Sep 17 00:00:00 2001
From: Mikhail Goncharov <goncharov@google.com>
Date: Thu, 11 Dec 2025 07:14:28 -0800
Subject: [PATCH 179/753] [XLA:GPU] ignore dim=1 sizes when swapping broadcast
 and bitcast

Previously rewrite would fail if brodcast introduced a 1 between
dimenstions that would collapse in the following reshape, for example:

p0 = f32[2,3] parameter(0)
t1 = f32[2,1,3,5] broadcast(p0), dimensions={0,2}
t2 = f32[6,5] reshape(t1)

But we can safely ignore dimensions of size 1 as they do not change the physical layout.

Will follow up with a symmetrical case for the reshape <-> broadcast.

PiperOrigin-RevId: 843210080
---
 .../gpu/transforms/nest_gemm_fusion.cc        | 11 ++-
 .../gpu/transforms/nest_gemm_fusion_test.cc   | 87 ++++++++++++++++++-
 2 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
index f9812d96ca0747..28a31ba41a8d50 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
@@ -468,19 +468,26 @@ absl::StatusOr<BitcastParams> CalculateBitcastOfBroadcast(
 
   // Dimensions of the new broadcast.
   llvm::SmallVector<int64_t> new_dims;
+  llvm::SmallVector<int64_t> broadcast_physical_dims =
+      GetPhysicalDimensions(broadcast_shape);
   auto factors = CommonFactors(GetPhysicalDimensions(result_shape),
-                               GetPhysicalDimensions(broadcast_shape));
+                               broadcast_physical_dims);
   for (int64_t i = 1; i < factors.size(); ++i) {
     auto [result_from, broadcast_from] = factors[i - 1];
     auto [result_to, broadcast_to] = factors[i];
 
     bool all_operands = true, any_operands = false;
     for (int64_t j = broadcast_from; j < broadcast_to; ++j) {
+      if (broadcast_physical_dims[j] == 1) {
+        // If dimension size is 1 then we can ignore it: it's either immediately
+        // dropped by old reshape or it's coming from the operand and then the
+        // new reshape will handle it.
+        continue;
+      }
       bool value = is_operand_dim[broadcast_shape.layout().minor_to_major(j)];
       all_operands &= value;
       any_operands |= value;
     }
-
     if (!any_operands) {
       continue;  // All dimensions in this group are broadcast dimensions.
     }
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
index 23cb8c8e5a1f4a..bab6311f57991e 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
@@ -670,6 +670,50 @@ CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
       IsOkAndHolds(true));
 }
 
+TEST_P(NestGemmFusionReshapeTest,
+       BitcastsAreHoistedUpThroughBroadcastsWithTrivialDimensions) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[11,24,1] parameter(0)
+  p0_broadcast = f32[11,1,24,1,128] broadcast(p0), dimensions={0,2,3}
+  p0_reshape = f32[264,128] $0(p0_broadcast)
+  p1 = f32[128,8]{1,0} parameter(1)
+  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[11,24,1] parameter(0)
+  p1 = f32[128,8] parameter(1)
+  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(
+                              absl::Substitute(hlo, HloOpcodeString(opcode))));
+  ASSERT_THAT(
+      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
+      IsOkAndHolds(true));
+  ASSERT_OK(verifier().Run(module.get()).status());
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+// Broadcast fusion:
+CHECK: {{.*}} {
+CHECK-NEXT: [[broadcast_p0:[^ ]+]] = f32[264]{0} parameter(0)
+CHECK-NEXT: ROOT {{.*}} = f32[264,128]{1,0} broadcast([[broadcast_p0]]), dimensions={0}
+CHECK-NEXT: }
+CHECK: ENTRY {{.*}} {
+CHECK: [[entry_p0:[^ ]+]] = f32[11,24,1]{{.*}} parameter(0)
+CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
+)"),
+      IsOkAndHolds(true));
+}
+
 TEST_P(NestGemmFusionReshapeTest,
        BitcastOfOperandAndBroadcastDimsIsNotHoistedUp) {
   HloOpcode opcode = GetParam();
@@ -899,8 +943,8 @@ ENTRY entry {
       ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={0,3}
-CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={0,3}
+CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={3}
+CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={3}
 )"),
       IsOkAndHolds(true));
 }
@@ -1133,6 +1177,45 @@ CHECK-SAME: dimensions={0,1}
       IsOkAndHolds(true));
 }
 
+// TODO(b/467306121): handle the case when we need to sink the reshape through
+// broadcast.
+TEST_P(NestGemmFusionReshapeTest,
+       DISABLED_BitcastsAreHoistedDownThroughBroadcastsWithTrivialDimensions) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[6,7] parameter(1)
+  dot = f32[3,6] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  bitcast = f32[3,2,3] $0(dot)
+  ROOT broadcast = f32[3,2,1,3,7] broadcast(bitcast), dimensions={0,1,3}
+}
+
+ENTRY e {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[6,7] parameter(1)
+  ROOT result = f32[3,2,1,3,7] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(
+                              absl::Substitute(hlo, HloOpcodeString(opcode))));
+  ASSERT_THAT(
+      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
+      IsOkAndHolds(true));
+  ASSERT_OK(verifier().Run(module.get()).status());
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      ROOT broadcast
+CHECK-SAME: f32[3,5,6,2]{2,1,0,3} broadcast
+CHECK-SAME: dimensions={0,1}
+)"),
+      IsOkAndHolds(true));
+}
+
 TEST_P(NestGemmFusionReshapeTest,
        BitcastsAreHoistedDownThroughBroadcastsWithNonDefaultLayout) {
   HloOpcode opcode = GetParam();

From 347bf33edd1df1331c2f9d6b443cf90651e9d59e Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 11 Dec 2025 08:25:33 -0800
Subject: [PATCH 180/753] Remove deprecated LoadExecutable overload

The two parameter version of LoadExecutable in AotCompilationResult is no longer used and can be removed. The using declaration in CpuAotCompilationResult is also removed as it's no longer necessary.

PiperOrigin-RevId: 843233101
---
 third_party/xla/xla/service/compiler.h                     | 7 -------
 .../xla/xla/service/cpu/cpu_aot_compilation_result.h       | 2 --
 2 files changed, 9 deletions(-)

diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index b47c105d72160c..99f4cfa1171eed 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -69,7 +69,6 @@ namespace xla {
 // computation.
 using ObjectFileData = std::vector<char>;
 
-class Compiler;
 class AotCompilationOptions;
 
 // Abstract superclass describing the result of an ahead-of-time compilation.
@@ -89,12 +88,6 @@ class AotCompilationResult {
     return Unimplemented("LoadExecutable unimplemented.");
   }
 
-  ABSL_DEPRECATE_AND_INLINE()
-  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      Compiler*, const se::StreamExecutor* executor) && {
-    return std::move(*this).LoadExecutable(executor);
-  }
-
   virtual absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
       const {
     return Unimplemented("buffer_assignment unimplemented.");
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index 1f845de703b5ec..59a3a0597ab7c6 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -116,8 +116,6 @@ class CpuAotCompilationResult : public AotCompilationResult {
     return proto_.SerializeAsString();
   }
 
-  using AotCompilationResult::LoadExecutable;
-
   absl::StatusOr<std::unique_ptr<Executable>>
       LoadExecutable(const se::StreamExecutor* stream_exec) && override;
 

From 80c1d06aa8f4240698858829f1b262f77b65e6b8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 08:31:06 -0800
Subject: [PATCH 181/753] Exclude `_DictWrapper` from `is_tf_type`.

The `is_tf_type` function now returns False for objects whose type name is `_DictWrapper`, preventing these internal wrapper types from being treated as TensorFlow types.

PiperOrigin-RevId: 843234946
---
 tensorflow/python/framework/tensor_util.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 7101f046e60b04..0aaa5add6081a9 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -1285,6 +1285,8 @@ def is_tf_type(x):  # pylint: disable=invalid-name
   # objects. It is not a Tensor.
   if (type(x).__name__ == "ObjectProxy"):
     return False
+  if (type(x).__name__ == "_DictWrapper"):
+    return False
   return isinstance(x, tf_type_classes)
 
 
From 52178014a650ee63df1791f0c023e437a1e7033e Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Thu, 11 Dec 2025 09:23:46 -0800
Subject: [PATCH 182/753] Integrate LLVM at llvm/llvm-project@48d942c7158a

Updates LLVM usage to match
[48d942c7158a](https://github.com/llvm/llvm-project/commit/48d942c7158a)

PiperOrigin-RevId: 843253742
---
 .../xla/third_party/llvm/generated.patch      | 1073 --------
 .../xla/third_party/llvm/workspace.bzl        |    4 +-
 .../xla/third_party/shardy/temporary.patch    | 2160 ++++++++---------
 .../xla/third_party/shardy/workspace.bzl      |    4 +-
 4 files changed, 1084 insertions(+), 2157 deletions(-)

diff --git a/third_party/xla/third_party/llvm/generated.patch b/third_party/xla/third_party/llvm/generated.patch
index 2948da46566950..509398da979e83 100644
--- a/third_party/xla/third_party/llvm/generated.patch
+++ b/third_party/xla/third_party/llvm/generated.patch
@@ -1,1074 +1 @@
 Auto generated patch. Do not edit or delete it, even if empty.
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
---- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
-+++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
-@@ -554,8 +554,6 @@
-     if (const auto &Dir = Params.initializationOptions.compilationDatabasePath)
-       CDBOpts.CompileCommandsDir = Dir;
-     CDBOpts.ContextProvider = Opts.ContextProvider;
--    if (Opts.StrongWorkspaceMode)
--      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
-     BaseCDB =
-         std::make_unique<DirectoryBasedGlobalCompilationDatabase>(CDBOpts);
-   }
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
---- a/clang-tools-extra/clangd/ClangdServer.h
-+++ b/clang-tools-extra/clangd/ClangdServer.h
-@@ -152,11 +152,6 @@
-     /// FIXME: If not set, should use the current working directory.
-     std::optional<std::string> WorkspaceRoot;
- 
--    /// Sets an alternate mode of operation. Current effects are:
--    /// - Using the current working directory as the working directory for
--    ///   fallback commands
--    bool StrongWorkspaceMode;
--
-     /// The resource directory is used to find internal headers, overriding
-     /// defaults and -resource-dir compiler flag).
-     /// If std::nullopt, ClangdServer calls
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
---- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
-+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
-@@ -64,9 +64,7 @@
-   if (FileExtension.empty() || FileExtension == ".h")
-     Argv.push_back("-xobjective-c++-header");
-   Argv.push_back(std::string(File));
--  tooling::CompileCommand Cmd(FallbackWorkingDirectory
--                                  ? *FallbackWorkingDirectory
--                                  : llvm::sys::path::parent_path(File),
-+  tooling::CompileCommand Cmd(llvm::sys::path::parent_path(File),
-                               llvm::sys::path::filename(File), std::move(Argv),
-                               /*Output=*/"");
-   Cmd.Heuristic = "clangd fallback";
-@@ -351,8 +349,7 @@
- 
- DirectoryBasedGlobalCompilationDatabase::
-     DirectoryBasedGlobalCompilationDatabase(const Options &Opts)
--    : GlobalCompilationDatabase(Opts.FallbackWorkingDirectory), Opts(Opts),
--      Broadcaster(std::make_unique<BroadcastThread>(*this)) {
-+    : Opts(Opts), Broadcaster(std::make_unique<BroadcastThread>(*this)) {
-   if (!this->Opts.ContextProvider)
-     this->Opts.ContextProvider = [](llvm::StringRef) {
-       return Context::current().clone();
-@@ -463,21 +460,6 @@
-   return Result;
- }
- 
--void DirectoryBasedGlobalCompilationDatabase::Options::
--    applyFallbackWorkingDirectory(
--        std::optional<std::string> FallbackWorkingDirectory) {
--  if (FallbackWorkingDirectory)
--    this->FallbackWorkingDirectory = *FallbackWorkingDirectory;
--  else {
--    // Clangd is running in strong workspace mode but the client didn't
--    // specify a workspace path in the `initialize` request.
--    // Fallback to current working directory.
--    SmallString<256> CWD;
--    llvm::sys::fs::current_path(CWD);
--    this->FallbackWorkingDirectory = std::string(CWD);
--  }
--}
--
- // The broadcast thread announces files with new compile commands to the world.
- // Primarily this is used to enqueue them for background indexing.
- //
-@@ -777,10 +759,9 @@
- 
- OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base,
-                        std::vector<std::string> FallbackFlags,
--                       CommandMangler Mangler,
--                       std::optional<std::string> FallbackWorkingDirectory)
--    : DelegatingCDB(Base, FallbackWorkingDirectory),
--      Mangler(std::move(Mangler)), FallbackFlags(std::move(FallbackFlags)) {}
-+                       CommandMangler Mangler)
-+    : DelegatingCDB(Base), Mangler(std::move(Mangler)),
-+      FallbackFlags(std::move(FallbackFlags)) {}
- 
- std::optional<tooling::CompileCommand>
- OverlayCDB::getCompileCommand(PathRef File) const {
-@@ -863,20 +844,16 @@
-   return MDB;
- }
- 
--DelegatingCDB::DelegatingCDB(
--    const GlobalCompilationDatabase *Base,
--    std::optional<std::string> FallbackWorkingDirectory)
--    : GlobalCompilationDatabase(FallbackWorkingDirectory), Base(Base) {
-+DelegatingCDB::DelegatingCDB(const GlobalCompilationDatabase *Base)
-+    : Base(Base) {
-   if (Base)
-     BaseChanged = Base->watch([this](const std::vector<std::string> Changes) {
-       OnCommandChanged.broadcast(Changes);
-     });
- }
- 
--DelegatingCDB::DelegatingCDB(
--    std::unique_ptr<GlobalCompilationDatabase> Base,
--    std::optional<std::string> FallbackWorkingDirectory)
--    : DelegatingCDB(Base.get(), FallbackWorkingDirectory) {
-+DelegatingCDB::DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base)
-+    : DelegatingCDB(Base.get()) {
-   BaseOwner = std::move(Base);
- }
- 
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
---- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h
-+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
-@@ -35,9 +35,6 @@
- /// Provides compilation arguments used for parsing C and C++ files.
- class GlobalCompilationDatabase {
- public:
--  GlobalCompilationDatabase(
--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt)
--      : FallbackWorkingDirectory(FallbackWorkingDirectory) {}
-   virtual ~GlobalCompilationDatabase() = default;
- 
-   /// If there are any known-good commands for building this file, returns one.
-@@ -72,19 +69,14 @@
-   }
- 
- protected:
--  std::optional<std::string> FallbackWorkingDirectory;
-   mutable CommandChanged OnCommandChanged;
- };
- 
- // Helper class for implementing GlobalCompilationDatabases that wrap others.
- class DelegatingCDB : public GlobalCompilationDatabase {
- public:
--  DelegatingCDB(
--      const GlobalCompilationDatabase *Base,
--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
--  DelegatingCDB(
--      std::unique_ptr<GlobalCompilationDatabase> Base,
--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
-+  DelegatingCDB(const GlobalCompilationDatabase *Base);
-+  DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base);
- 
-   std::optional<tooling::CompileCommand>
-   getCompileCommand(PathRef File) const override;
-@@ -125,12 +117,6 @@
-     // Only look for a compilation database in this one fixed directory.
-     // FIXME: fold this into config/context mechanism.
-     std::optional<Path> CompileCommandsDir;
--    // Working directory for fallback commands
--    // If unset, parent directory of file should be used
--    std::optional<std::string> FallbackWorkingDirectory;
--
--    void applyFallbackWorkingDirectory(
--        std::optional<std::string> FallbackWorkingDirectory);
-   };
- 
-   DirectoryBasedGlobalCompilationDatabase(const Options &Opts);
-@@ -208,11 +194,9 @@
-   // Base may be null, in which case no entries are inherited.
-   // FallbackFlags are added to the fallback compile command.
-   // Adjuster is applied to all commands, fallback or not.
--  OverlayCDB(
--      const GlobalCompilationDatabase *Base,
--      std::vector<std::string> FallbackFlags = {},
--      CommandMangler Mangler = nullptr,
--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
-+  OverlayCDB(const GlobalCompilationDatabase *Base,
-+             std::vector<std::string> FallbackFlags = {},
-+             CommandMangler Mangler = nullptr);
- 
-   std::optional<tooling::CompileCommand>
-   getCompileCommand(PathRef File) const override;
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
---- a/clang-tools-extra/clangd/tool/Check.cpp
-+++ b/clang-tools-extra/clangd/tool/Check.cpp
-@@ -169,8 +169,6 @@
-   bool buildCommand(const ThreadsafeFS &TFS) {
-     log("Loading compilation database...");
-     DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
--    if (Opts.StrongWorkspaceMode)
--      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
-     CDBOpts.CompileCommandsDir =
-         Config::current().CompileFlags.CDBSearch.FixedCDBPath;
-     BaseCDB =
-@@ -180,10 +178,8 @@
-         getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs));
-     if (Opts.ResourceDir)
-       Mangler.ResourceDir = *Opts.ResourceDir;
--
-     CDB = std::make_unique<OverlayCDB>(
--        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler),
--        CDBOpts.FallbackWorkingDirectory);
-+        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler));
- 
-     if (auto TrueCmd = CDB->getCompileCommand(File)) {
-       Cmd = std::move(*TrueCmd);
-@@ -506,7 +502,7 @@
-                  config::DiagnosticCallback Diag) const override {
-       config::Fragment F;
-       // If we're timing clang-tidy checks, implicitly disabling the slow ones
--      // is counterproductive!
-+      // is counterproductive! 
-       if (CheckTidyTime.getNumOccurrences())
-         F.Diagnostics.ClangTidy.FastCheckFilter.emplace("None");
-       return {std::move(F).compile(Diag)};
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
---- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
-+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
-@@ -500,17 +500,6 @@
-     init(true),
- };
- 
--opt<bool> StrongWorkspaceMode{
--    "strong-workspace-mode",
--    cat(Features),
--    desc("An alternate mode of operation for clangd, where the clangd instance "
--         "is used to edit a single workspace.\n"
--         "When enabled, fallback commands use the workspace directory as their "
--         "working directory instead of the parent folder."),
--    init(false),
--    Hidden,
--};
--
- opt<bool> UseDirtyHeaders{"use-dirty-headers", cat(Misc),
-                           desc("Use files open in the editor when parsing "
-                                "headers instead of reading from the disk"),
-@@ -918,7 +907,6 @@
-   }
-   if (!ResourceDir.empty())
-     Opts.ResourceDir = ResourceDir;
--  Opts.StrongWorkspaceMode = StrongWorkspaceMode;
-   Opts.BuildDynamicSymbolIndex = true;
- #if CLANGD_ENABLE_REMOTE
-   if (RemoteIndexAddress.empty() != ProjectRoot.empty()) {
-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
---- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
-+++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
-@@ -55,20 +55,6 @@
-                                            testPath("foo/bar")));
- }
- 
--TEST(GlobalCompilationDatabaseTest, FallbackWorkingDirectory) {
--  MockFS TFS;
--  DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
--  CDBOpts.applyFallbackWorkingDirectory(testPath("foo"));
--  EXPECT_EQ(CDBOpts.FallbackWorkingDirectory, testPath("foo"));
--
--  DirectoryBasedGlobalCompilationDatabase DB(CDBOpts);
--  auto Cmd = DB.getFallbackCommand(testPath("foo/src/bar.cc"));
--  EXPECT_EQ(Cmd.Directory, testPath("foo"));
--  EXPECT_THAT(Cmd.CommandLine,
--              ElementsAre("clang", testPath("foo/src/bar.cc")));
--  EXPECT_EQ(Cmd.Output, "");
--}
--
- static tooling::CompileCommand cmd(llvm::StringRef File, llvm::StringRef Arg) {
-   return tooling::CompileCommand(
-       testRoot(), File, {"clang", std::string(Arg), std::string(File)}, "");
-diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
---- a/llvm/lib/CodeGen/ShrinkWrap.cpp
-+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
-@@ -618,8 +618,6 @@
- 
-   DenseSet<const MachineBasicBlock *> DirtyBBs;
-   for (MachineBasicBlock &MBB : MF) {
--    if (!MDT->isReachableFromEntry(&MBB))
--      continue;
-     if (MBB.isEHPad()) {
-       DirtyBBs.insert(&MBB);
-       continue;
-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-@@ -708,53 +708,6 @@
-   return 2;
- }
- 
--bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
--                               const TargetInstrInfo &TII) {
--  for (MachineInstr &MI : MBB->terminators()) {
--    unsigned Opc = MI.getOpcode();
--    switch (Opc) {
--    case AArch64::CBZW:
--    case AArch64::CBZX:
--    case AArch64::TBZW:
--    case AArch64::TBZX:
--      // CBZ/TBZ with WZR/XZR -> unconditional B
--      if (MI.getOperand(0).getReg() == AArch64::WZR ||
--          MI.getOperand(0).getReg() == AArch64::XZR) {
--        DEBUG_WITH_TYPE("optimizeTerminators",
--                        dbgs() << "Removing always taken branch: " << MI);
--        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
--        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
--        for (auto *S : Succs)
--          if (S != Target)
--            MBB->removeSuccessor(S);
--        DebugLoc DL = MI.getDebugLoc();
--        while (MBB->rbegin() != &MI)
--          MBB->rbegin()->eraseFromParent();
--        MI.eraseFromParent();
--        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
--        return true;
--      }
--      break;
--    case AArch64::CBNZW:
--    case AArch64::CBNZX:
--    case AArch64::TBNZW:
--    case AArch64::TBNZX:
--      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
--      if (MI.getOperand(0).getReg() == AArch64::WZR ||
--          MI.getOperand(0).getReg() == AArch64::XZR) {
--        DEBUG_WITH_TYPE("optimizeTerminators",
--                        dbgs() << "Removing never taken branch: " << MI);
--        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
--        MI.getParent()->removeSuccessor(Target);
--        MI.eraseFromParent();
--        return true;
--      }
--      break;
--    }
--  }
--  return false;
--}
--
- // Find the original register that VReg is copied from.
- static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
-   while (Register::isVirtualRegister(VReg)) {
-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-@@ -705,8 +705,6 @@
-                               unsigned *OutUnscaledOp = nullptr,
-                               int64_t *EmittableOffset = nullptr);
- 
--bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
--
- static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
- 
- static inline bool isCondBranchOpcode(int Opc) {
-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
---- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
-@@ -14,7 +14,6 @@
- //===----------------------------------------------------------------------===//
- 
- #include "AArch64.h"
--#include "AArch64InstrInfo.h"
- #include "llvm/CodeGen/MachineFunctionPass.h"
- #include "llvm/CodeGen/MachineInstrBuilder.h"
- #include "llvm/CodeGen/TargetInstrInfo.h"
-@@ -46,6 +45,51 @@
-                 "AArch64 Redundant Conditional Branch Elimination pass", false,
-                 false)
- 
-+static bool optimizeTerminators(MachineBasicBlock *MBB,
-+                                const TargetInstrInfo &TII) {
-+  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
-+    unsigned Opc = MI.getOpcode();
-+    switch (Opc) {
-+    case AArch64::CBZW:
-+    case AArch64::CBZX:
-+    case AArch64::TBZW:
-+    case AArch64::TBZX:
-+      // CBZ/TBZ with WZR/XZR -> unconditional B
-+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-+          MI.getOperand(0).getReg() == AArch64::XZR) {
-+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
-+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-+        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
-+        for (auto *S : Succs)
-+          if (S != Target)
-+            MBB->removeSuccessor(S);
-+        DebugLoc DL = MI.getDebugLoc();
-+        while (MBB->rbegin() != &MI)
-+          MBB->rbegin()->eraseFromParent();
-+        MI.eraseFromParent();
-+        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
-+        return true;
-+      }
-+      break;
-+    case AArch64::CBNZW:
-+    case AArch64::CBNZX:
-+    case AArch64::TBNZW:
-+    case AArch64::TBNZX:
-+      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
-+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-+          MI.getOperand(0).getReg() == AArch64::XZR) {
-+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
-+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-+        MI.getParent()->removeSuccessor(Target);
-+        MI.eraseFromParent();
-+        return true;
-+      }
-+      break;
-+    }
-+  }
-+  return false;
-+}
-+
- bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
-   if (skipFunction(MF.getFunction()))
-     return false;
-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
---- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
-+++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
-@@ -50,7 +50,6 @@
- //        to use WZR/XZR directly in some cases.
- //===----------------------------------------------------------------------===//
- #include "AArch64.h"
--#include "AArch64InstrInfo.h"
- #include "llvm/ADT/SetVector.h"
- #include "llvm/ADT/Statistic.h"
- #include "llvm/ADT/iterator_range.h"
-@@ -476,7 +475,6 @@
-     return false;
-   TRI = MF.getSubtarget().getRegisterInfo();
-   MRI = &MF.getRegInfo();
--  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
- 
-   // Resize the clobbered and used register unit trackers.  We do this once per
-   // function.
-@@ -486,10 +484,8 @@
-   OptBBUsedRegs.init(*TRI);
- 
-   bool Changed = false;
--  for (MachineBasicBlock &MBB : MF) {
--    Changed |= optimizeTerminators(&MBB, TII);
-+  for (MachineBasicBlock &MBB : MF)
-     Changed |= optimizeBlock(&MBB);
--  }
-   return Changed;
- }
- 
-diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
---- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
-+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
-@@ -1827,12 +1827,8 @@
-     // profile info.
-     CostTooHigh =
-         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
--    if (CostTooHigh) {
--      // Mark runtime checks as never succeeding when they exceed the threshold.
--      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
--      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
-+    if (CostTooHigh)
-       return;
--    }
- 
-     BasicBlock *LoopHeader = L->getHeader();
-     BasicBlock *Preheader = L->getLoopPreheader();
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
---- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
-+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
-@@ -735,15 +735,21 @@
- ; ENABLE-NEXT:    .cfi_offset w29, -16
- ; ENABLE-NEXT:    .cfi_offset w19, -24
- ; ENABLE-NEXT:    .cfi_offset w20, -32
-+; ENABLE-NEXT:  ; %bb.1: ; %if.then
- ; ENABLE-NEXT:    sub x19, sp, #16
- ; ENABLE-NEXT:    mov sp, x19
- ; ENABLE-NEXT:    mov w20, wzr
--; ENABLE-NEXT:  LBB10_1: ; %for.body
-+; ENABLE-NEXT:  LBB10_2: ; %for.body
- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
- ; ENABLE-NEXT:    bl _something
- ; ENABLE-NEXT:    add w20, w0, w20
- ; ENABLE-NEXT:    str w20, [x19]
--; ENABLE-NEXT:    b LBB10_1
-+; ENABLE-NEXT:    b LBB10_2
-+; ENABLE-NEXT:  ; %bb.3: ; %if.end
-+; ENABLE-NEXT:    sub sp, x29, #16
-+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-+; ENABLE-NEXT:    ret
- ;
- ; DISABLE-LABEL: infiniteloop:
- ; DISABLE:       ; %bb.0: ; %entry
-@@ -755,15 +761,21 @@
- ; DISABLE-NEXT:    .cfi_offset w29, -16
- ; DISABLE-NEXT:    .cfi_offset w19, -24
- ; DISABLE-NEXT:    .cfi_offset w20, -32
-+; DISABLE-NEXT:  ; %bb.1: ; %if.then
- ; DISABLE-NEXT:    sub x19, sp, #16
- ; DISABLE-NEXT:    mov sp, x19
- ; DISABLE-NEXT:    mov w20, wzr
--; DISABLE-NEXT:  LBB10_1: ; %for.body
-+; DISABLE-NEXT:  LBB10_2: ; %for.body
- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
- ; DISABLE-NEXT:    bl _something
- ; DISABLE-NEXT:    add w20, w0, w20
- ; DISABLE-NEXT:    str w20, [x19]
--; DISABLE-NEXT:    b LBB10_1
-+; DISABLE-NEXT:    b LBB10_2
-+; DISABLE-NEXT:  ; %bb.3: ; %if.end
-+; DISABLE-NEXT:    sub sp, x29, #16
-+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-+; DISABLE-NEXT:    ret
- entry:
-   br i1 undef, label %if.then, label %if.end
- 
-@@ -794,10 +806,11 @@
- ; ENABLE-NEXT:    .cfi_offset w29, -16
- ; ENABLE-NEXT:    .cfi_offset w19, -24
- ; ENABLE-NEXT:    .cfi_offset w20, -32
-+; ENABLE-NEXT:  ; %bb.1: ; %if.then
- ; ENABLE-NEXT:    sub x8, sp, #16
- ; ENABLE-NEXT:    mov sp, x8
- ; ENABLE-NEXT:    mov w9, wzr
--; ENABLE-NEXT:  LBB11_1: ; %for.body
-+; ENABLE-NEXT:  LBB11_2: ; %for.body
- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
- ; ENABLE-NEXT:    ; InlineAsm Start
- ; ENABLE-NEXT:    mov x10, #0 ; =0x0
-@@ -808,7 +821,12 @@
- ; ENABLE-NEXT:    ; InlineAsm Start
- ; ENABLE-NEXT:    nop
- ; ENABLE-NEXT:    ; InlineAsm End
--; ENABLE-NEXT:    b LBB11_1
-+; ENABLE-NEXT:    b LBB11_2
-+; ENABLE-NEXT:  ; %bb.3: ; %if.end
-+; ENABLE-NEXT:    sub sp, x29, #16
-+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-+; ENABLE-NEXT:    ret
- ;
- ; DISABLE-LABEL: infiniteloop2:
- ; DISABLE:       ; %bb.0: ; %entry
-@@ -820,10 +838,11 @@
- ; DISABLE-NEXT:    .cfi_offset w29, -16
- ; DISABLE-NEXT:    .cfi_offset w19, -24
- ; DISABLE-NEXT:    .cfi_offset w20, -32
-+; DISABLE-NEXT:  ; %bb.1: ; %if.then
- ; DISABLE-NEXT:    sub x8, sp, #16
- ; DISABLE-NEXT:    mov sp, x8
- ; DISABLE-NEXT:    mov w9, wzr
--; DISABLE-NEXT:  LBB11_1: ; %for.body
-+; DISABLE-NEXT:  LBB11_2: ; %for.body
- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
- ; DISABLE-NEXT:    ; InlineAsm Start
- ; DISABLE-NEXT:    mov x10, #0 ; =0x0
-@@ -834,7 +853,12 @@
- ; DISABLE-NEXT:    ; InlineAsm Start
- ; DISABLE-NEXT:    nop
- ; DISABLE-NEXT:    ; InlineAsm End
--; DISABLE-NEXT:    b LBB11_1
-+; DISABLE-NEXT:    b LBB11_2
-+; DISABLE-NEXT:  ; %bb.3: ; %if.end
-+; DISABLE-NEXT:    sub sp, x29, #16
-+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-+; DISABLE-NEXT:    ret
- entry:
-   br i1 undef, label %if.then, label %if.end
- 
-@@ -865,43 +889,49 @@
- define void @infiniteloop3() {
- ; ENABLE-LABEL: infiniteloop3:
- ; ENABLE:       ; %bb.0: ; %entry
-+; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
- ; ENABLE-NEXT:    mov x8, xzr
- ; ENABLE-NEXT:    mov x9, xzr
- ; ENABLE-NEXT:    mov x11, xzr
--; ENABLE-NEXT:    b LBB12_2
--; ENABLE-NEXT:  LBB12_1: ; %loop2b
--; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
-+; ENABLE-NEXT:    b LBB12_3
-+; ENABLE-NEXT:  LBB12_2: ; %loop2b
-+; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
- ; ENABLE-NEXT:    str x10, [x11]
- ; ENABLE-NEXT:    mov x11, x10
--; ENABLE-NEXT:  LBB12_2: ; %loop1
-+; ENABLE-NEXT:  LBB12_3: ; %loop1
- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
- ; ENABLE-NEXT:    mov x10, x9
- ; ENABLE-NEXT:    ldr x9, [x8]
--; ENABLE-NEXT:    cbnz x8, LBB12_1
--; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
-+; ENABLE-NEXT:    cbnz x8, LBB12_2
-+; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
- ; ENABLE-NEXT:    mov x8, x10
- ; ENABLE-NEXT:    mov x11, x10
--; ENABLE-NEXT:    b LBB12_2
-+; ENABLE-NEXT:    b LBB12_3
-+; ENABLE-NEXT:  ; %bb.5: ; %end
-+; ENABLE-NEXT:    ret
- ;
- ; DISABLE-LABEL: infiniteloop3:
- ; DISABLE:       ; %bb.0: ; %entry
-+; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
- ; DISABLE-NEXT:    mov x8, xzr
- ; DISABLE-NEXT:    mov x9, xzr
- ; DISABLE-NEXT:    mov x11, xzr
--; DISABLE-NEXT:    b LBB12_2
--; DISABLE-NEXT:  LBB12_1: ; %loop2b
--; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
-+; DISABLE-NEXT:    b LBB12_3
-+; DISABLE-NEXT:  LBB12_2: ; %loop2b
-+; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
- ; DISABLE-NEXT:    str x10, [x11]
- ; DISABLE-NEXT:    mov x11, x10
--; DISABLE-NEXT:  LBB12_2: ; %loop1
-+; DISABLE-NEXT:  LBB12_3: ; %loop1
- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
- ; DISABLE-NEXT:    mov x10, x9
- ; DISABLE-NEXT:    ldr x9, [x8]
--; DISABLE-NEXT:    cbnz x8, LBB12_1
--; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
-+; DISABLE-NEXT:    cbnz x8, LBB12_2
-+; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
- ; DISABLE-NEXT:    mov x8, x10
- ; DISABLE-NEXT:    mov x11, x10
--; DISABLE-NEXT:    b LBB12_2
-+; DISABLE-NEXT:    b LBB12_3
-+; DISABLE-NEXT:  ; %bb.5: ; %end
-+; DISABLE-NEXT:    ret
- entry:
-   br i1 undef, label %loop2a, label %body
- 
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
---- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
-+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
-@@ -8,14 +8,20 @@
- define i8 @foo_optsize(i32 %v4) optsize {
- ; CHECK-LABEL: foo_optsize:
- ; CHECK:       // %bb.0: // %entry
--; CHECK-NEXT:    cbnz w0, .LBB0_2
--; CHECK-NEXT:  // %bb.1: // %b2
--; CHECK-NEXT:    mov w0, #1 // =0x1
-+; CHECK-NEXT:    b .LBB0_2
-+; CHECK-NEXT:  .LBB0_1:
-+; CHECK-NEXT:    mov w0, wzr
- ; CHECK-NEXT:    ret
- ; CHECK-NEXT:  .LBB0_2: // %b1
--; CHECK-NEXT:    cmp w0, #1
--; CHECK-NEXT:    mov w0, wzr
-+; CHECK-NEXT:    cbnz w0, .LBB0_4
-+; CHECK-NEXT:  // %bb.3: // %b2
-+; CHECK-NEXT:    mov w0, #1 // =0x1
- ; CHECK-NEXT:    ret
-+; CHECK-NEXT:  .LBB0_4: // %b1
-+; CHECK-NEXT:    cmp w0, #1
-+; CHECK-NEXT:    b.ne .LBB0_1
-+; CHECK-NEXT:  // %bb.5: // %b3
-+; CHECK-NEXT:    b .LBB0_1
- entry:
-   %v2 = icmp eq i32 0, 0
-   br i1 %v2, label %b1, label %b4
-@@ -41,14 +47,20 @@
- define i8 @foo_optspeed(i32 %v4) {
- ; CHECK-LABEL: foo_optspeed:
- ; CHECK:       // %bb.0: // %entry
--; CHECK-NEXT:    cbnz w0, .LBB1_2
--; CHECK-NEXT:  // %bb.1: // %b2
--; CHECK-NEXT:    mov w0, #1 // =0x1
-+; CHECK-NEXT:    b .LBB1_2
-+; CHECK-NEXT:  .LBB1_1:
-+; CHECK-NEXT:    mov w0, wzr
- ; CHECK-NEXT:    ret
- ; CHECK-NEXT:  .LBB1_2: // %b1
--; CHECK-NEXT:    cmp w0, #1
--; CHECK-NEXT:    mov w0, wzr
-+; CHECK-NEXT:    cbnz w0, .LBB1_4
-+; CHECK-NEXT:  // %bb.3: // %b2
-+; CHECK-NEXT:    mov w0, #1 // =0x1
- ; CHECK-NEXT:    ret
-+; CHECK-NEXT:  .LBB1_4: // %b1
-+; CHECK-NEXT:    cmp w0, #1
-+; CHECK-NEXT:    b.ne .LBB1_1
-+; CHECK-NEXT:  // %bb.5: // %b3
-+; CHECK-NEXT:    b .LBB1_1
- entry:
-   %v2 = icmp eq i32 0, 0
-   br i1 %v2, label %b1, label %b4
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
---- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
-+++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
-@@ -21,8 +21,10 @@
-   ; CHECK-NEXT:   B %bb.3
-   ; CHECK-NEXT: {{  $}}
-   ; CHECK-NEXT: bb.1.bb:
-+  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
-   ; CHECK-NEXT:   liveins: $w0, $lr
-   ; CHECK-NEXT: {{  $}}
-+  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
-   ; CHECK-NEXT:   B %bb.2
-   ; CHECK-NEXT: {{  $}}
-   ; CHECK-NEXT: bb.2.bb1:
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
---- a/llvm/test/CodeGen/AArch64/pr164181.ll
-+++ b/llvm/test/CodeGen/AArch64/pr164181.ll
-@@ -29,11 +29,11 @@
- ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
- ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
- ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
--; CHECK-NEXT:    tbz w5, #0, .LBB0_40
-+; CHECK-NEXT:    tbz w5, #0, .LBB0_43
- ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
- ; CHECK-NEXT:    ldr x4, [sp, #312]
- ; CHECK-NEXT:    ldr x14, [sp, #280]
--; CHECK-NEXT:    tbz w0, #0, .LBB0_39
-+; CHECK-NEXT:    tbz w0, #0, .LBB0_42
- ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
- ; CHECK-NEXT:    ldrb w8, [sp, #368]
- ; CHECK-NEXT:    ldrb w12, [sp, #256]
-@@ -92,7 +92,7 @@
- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
- ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
- ; CHECK-NEXT:    mov x12, x24
- ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
-@@ -117,7 +117,7 @@
- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
- ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
- ; CHECK-NEXT:    cmn x24, #30
- ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
-@@ -142,7 +142,7 @@
- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
- ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
- ; CHECK-NEXT:    mov w14, #1152 // =0x480
- ; CHECK-NEXT:    mov w24, #1 // =0x1
-@@ -176,7 +176,7 @@
- ; CHECK-NEXT:    // => This Loop Header: Depth=4
- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
- ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
- ; CHECK-NEXT:    and w8, w8, w8, asr #31
- ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
-@@ -281,23 +281,31 @@
- ; CHECK-NEXT:    mov x24, xzr
- ; CHECK-NEXT:    mul w12, w12, w22
- ; CHECK-NEXT:    mov x22, x5
--; CHECK-NEXT:    tbz w0, #0, .LBB0_33
--; CHECK-NEXT:  .LBB0_28: // %if.then222.us
-+; CHECK-NEXT:    tbz w0, #0, .LBB0_36
-+; CHECK-NEXT:  .LBB0_28: // %for.body194.us
- ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
- ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
- ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
- ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
- ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
-+; CHECK-NEXT:  // %bb.29: // %if.then222.us
-+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
- ; CHECK-NEXT:    adrp x27, :got:var_32
- ; CHECK-NEXT:    ldur w8, [x19, #-12]
- ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
- ; CHECK-NEXT:    strh w8, [x27]
- ; CHECK-NEXT:    sxtb w8, w25
--; CHECK-NEXT:    strb w3, [x16]
- ; CHECK-NEXT:    bic w25, w8, w8, asr #31
-+; CHECK-NEXT:    b .LBB0_31
-+; CHECK-NEXT:    .p2align 5, , 16
-+; CHECK-NEXT:  // %bb.30:
-+; CHECK-NEXT:    mov w25, wzr
-+; CHECK-NEXT:  .LBB0_31: // %if.end239.us
-+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-+; CHECK-NEXT:    strb w3, [x16]
- ; CHECK-NEXT:    tst w13, #0xff
--; CHECK-NEXT:    b.eq .LBB0_30
--; CHECK-NEXT:  // %bb.29: // %if.then254.us
-+; CHECK-NEXT:    b.eq .LBB0_33
-+; CHECK-NEXT:  // %bb.32: // %if.then254.us
- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
- ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
- ; CHECK-NEXT:    adrp x27, :got:var_35
-@@ -306,7 +314,7 @@
- ; CHECK-NEXT:    csel x8, xzr, x7, eq
- ; CHECK-NEXT:    str x8, [x27]
- ; CHECK-NEXT:    strh w1, [x17]
--; CHECK-NEXT:  .LBB0_30: // %if.end282.us
-+; CHECK-NEXT:  .LBB0_33: // %if.end282.us
- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
- ; CHECK-NEXT:    orr x27, x24, x4
- ; CHECK-NEXT:    adrp x8, :got:var_39
-@@ -317,14 +325,14 @@
- ; CHECK-NEXT:    str x8, [x18]
- ; CHECK-NEXT:    mov w8, #1 // =0x1
- ; CHECK-NEXT:    cbnz x2, .LBB0_27
--; CHECK-NEXT:  // %bb.31: // %if.then327.us
-+; CHECK-NEXT:  // %bb.34: // %if.then327.us
- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
- ; CHECK-NEXT:    cbz w8, .LBB0_25
--; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
-+; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
- ; CHECK-NEXT:    mov w4, wzr
- ; CHECK-NEXT:    b .LBB0_26
- ; CHECK-NEXT:    .p2align 5, , 16
--; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
-+; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
- ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
- ; CHECK-NEXT:    mov w3, #1152 // =0x480
- ; CHECK-NEXT:    mov x22, xzr
-@@ -335,24 +343,24 @@
- ; CHECK-NEXT:    madd x14, x14, x3, x11
- ; CHECK-NEXT:    mov w28, w30
- ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
--; CHECK-NEXT:    b .LBB0_36
-+; CHECK-NEXT:    b .LBB0_39
- ; CHECK-NEXT:    .p2align 5, , 16
--; CHECK-NEXT:  .LBB0_34: // %if.then466.us
--; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
-+; CHECK-NEXT:  .LBB0_37: // %if.then466.us
-+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
- ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
- ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
- ; CHECK-NEXT:    sxtb w4, w4
- ; CHECK-NEXT:    bic w4, w4, w4, asr #31
- ; CHECK-NEXT:    str x3, [x28]
- ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
--; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
--; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
-+; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
-+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
- ; CHECK-NEXT:    add x22, x22, #1
- ; CHECK-NEXT:    add x27, x27, #1
- ; CHECK-NEXT:    mov w28, wzr
- ; CHECK-NEXT:    cmp x27, #0
- ; CHECK-NEXT:    b.hs .LBB0_9
--; CHECK-NEXT:  .LBB0_36: // %for.body380.us
-+; CHECK-NEXT:  .LBB0_39: // %for.body380.us
- ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
- ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
- ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
-@@ -364,18 +372,18 @@
- ; CHECK-NEXT:    strh w28, [x11]
- ; CHECK-NEXT:    csel w28, w21, w3, ne
- ; CHECK-NEXT:    str w28, [x20]
--; CHECK-NEXT:    cbz x15, .LBB0_35
--; CHECK-NEXT:  // %bb.37: // %if.then436.us
--; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
-+; CHECK-NEXT:    cbz x15, .LBB0_38
-+; CHECK-NEXT:  // %bb.40: // %if.then436.us
-+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
- ; CHECK-NEXT:    ldrh w28, [x14]
--; CHECK-NEXT:    cbnz w28, .LBB0_34
--; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
-+; CHECK-NEXT:    cbnz w28, .LBB0_37
-+; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
- ; CHECK-NEXT:    mov w4, wzr
--; CHECK-NEXT:    b .LBB0_35
--; CHECK-NEXT:  .LBB0_39: // %for.body41
-+; CHECK-NEXT:    b .LBB0_38
-+; CHECK-NEXT:  .LBB0_42: // %for.body41
- ; CHECK-NEXT:    strb wzr, [x4]
- ; CHECK-NEXT:    strb wzr, [x14]
--; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
-+; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
- ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
- ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
- ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
---- a/llvm/test/CodeGen/AArch64/pr166870.ll
-+++ b/llvm/test/CodeGen/AArch64/pr166870.ll
-@@ -26,11 +26,12 @@
- ; CHECK-NEXT:    mov x21, x1
- ; CHECK-NEXT:    bl baz
- ; CHECK-NEXT:    mov w0, #0 // =0x0
-+; CHECK-NEXT:  // %bb.5: // %bb6
- ; CHECK-NEXT:    mov w10, #1 // =0x1
-+; CHECK-NEXT:    cbnz w10, .LBB0_11
-+; CHECK-NEXT:  // %bb.6: // %bb7
- ; CHECK-NEXT:    cbnz w10, .LBB0_10
--; CHECK-NEXT:  // %bb.5: // %bb7
--; CHECK-NEXT:    cbnz w10, .LBB0_9
--; CHECK-NEXT:  // %bb.6: // %bb8
-+; CHECK-NEXT:  // %bb.7: // %bb8
- ; CHECK-NEXT:    mov x8, x21
- ; CHECK-NEXT:    mov x9, x20
- ; CHECK-NEXT:    mov w20, #0 // =0x0
-@@ -38,17 +39,17 @@
- ; CHECK-NEXT:    mov x21, x9
- ; CHECK-NEXT:    mov w8, w8
- ; CHECK-NEXT:    mov x22, x8
--; CHECK-NEXT:  .LBB0_7: // %bb10
-+; CHECK-NEXT:  .LBB0_8: // %bb10
- ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
- ; CHECK-NEXT:    strb w20, [x19]
--; CHECK-NEXT:    cbnz x21, .LBB0_7
--; CHECK-NEXT:  // %bb.8: // %bb12
--; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
-+; CHECK-NEXT:    cbnz x21, .LBB0_8
-+; CHECK-NEXT:  // %bb.9: // %bb12
-+; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
- ; CHECK-NEXT:    bl snork
--; CHECK-NEXT:    cbnz x22, .LBB0_7
--; CHECK-NEXT:  .LBB0_9:
--; CHECK-NEXT:    mov w0, #0 // =0x0
-+; CHECK-NEXT:    cbnz x22, .LBB0_8
- ; CHECK-NEXT:  .LBB0_10:
-+; CHECK-NEXT:    mov w0, #0 // =0x0
-+; CHECK-NEXT:  .LBB0_11:
- ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
- ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
- ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
-diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
---- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
-+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
-@@ -71,21 +71,27 @@
- ; CHECK-NEXT:    .cfi_def_cfa w29, 16
- ; CHECK-NEXT:    .cfi_offset w30, -8
- ; CHECK-NEXT:    .cfi_offset w29, -16
-+; CHECK-NEXT:    .cfi_remember_state
- ; CHECK-NEXT:    mov w8, #1 // =0x1
--; CHECK-NEXT:    mov w9, #2 // =0x2
- ; CHECK-NEXT:    stur xzr, [x29, #-8]
--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--; CHECK-NEXT:    ldur w8, [x29, #-8]
--; CHECK-NEXT:    cbz w8, .LBB0_2
-+; CHECK-NEXT:    b .LBB0_3
- ; CHECK-NEXT:  // %bb.1:
--; CHECK-NEXT:    mov w8, #1 // =0x1
- ; CHECK-NEXT:    str w8, [sp, #16]
--; CHECK-NEXT:    b .LBB0_3
-+; CHECK-NEXT:    ldur w8, [x29, #-8]
-+; CHECK-NEXT:    cbz w8, .LBB0_4
- ; CHECK-NEXT:  .LBB0_2:
-+; CHECK-NEXT:    .cfi_restore_state
- ; CHECK-NEXT:    mov w8, #1 // =0x1
--; CHECK-NEXT:    mov w9, #2 // =0x2
--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+; CHECK-NEXT:    str w8, [sp, #16]
-+; CHECK-NEXT:    b .LBB0_5
- ; CHECK-NEXT:  .LBB0_3:
-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+; CHECK-NEXT:    ldur w8, [x29, #-8]
-+; CHECK-NEXT:    cbnz w8, .LBB0_2
-+; CHECK-NEXT:  .LBB0_4:
-+; CHECK-NEXT:    mov w8, #1 // =0x1
-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+; CHECK-NEXT:  .LBB0_5:
- ; CHECK-NEXT:    mov w0, wzr
- ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
- ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-@@ -128,6 +134,7 @@
- ;
- ; CHECK-LABEL: OUTLINED_FUNCTION_0:
- ; CHECK:       // %bb.0:
-+; CHECK-NEXT:    mov w9, #2 // =0x2
- ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
- ; CHECK-NEXT:    mov w9, #3 // =0x3
- ; CHECK-NEXT:    mov w8, #4 // =0x4
-diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
---- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
-+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
-@@ -12,21 +12,27 @@
- ; CHECK-NEXT:    .cfi_def_cfa w29, 16
- ; CHECK-NEXT:    .cfi_offset w30, -8
- ; CHECK-NEXT:    .cfi_offset w29, -16
-+; CHECK-NEXT:    .cfi_remember_state
- ; CHECK-NEXT:    mov w8, #1 // =0x1
--; CHECK-NEXT:    mov w9, #2 // =0x2
- ; CHECK-NEXT:    stur xzr, [x29, #-8]
--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--; CHECK-NEXT:    ldur w8, [x29, #-8]
--; CHECK-NEXT:    cbz w8, .LBB0_2
-+; CHECK-NEXT:    b .LBB0_3
- ; CHECK-NEXT:  // %bb.1:
--; CHECK-NEXT:    mov w8, #1 // =0x1
- ; CHECK-NEXT:    str w8, [sp, #16]
--; CHECK-NEXT:    b .LBB0_3
-+; CHECK-NEXT:    ldur w8, [x29, #-8]
-+; CHECK-NEXT:    cbz w8, .LBB0_4
- ; CHECK-NEXT:  .LBB0_2:
-+; CHECK-NEXT:    .cfi_restore_state
- ; CHECK-NEXT:    mov w8, #1 // =0x1
--; CHECK-NEXT:    mov w9, #2 // =0x2
--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+; CHECK-NEXT:    str w8, [sp, #16]
-+; CHECK-NEXT:    b .LBB0_5
- ; CHECK-NEXT:  .LBB0_3:
-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+; CHECK-NEXT:    ldur w8, [x29, #-8]
-+; CHECK-NEXT:    cbnz w8, .LBB0_2
-+; CHECK-NEXT:  .LBB0_4:
-+; CHECK-NEXT:    mov w8, #1 // =0x1
-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+; CHECK-NEXT:  .LBB0_5:
- ; CHECK-NEXT:    mov w0, wzr
- ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
- ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
---- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
-+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
-@@ -2,23 +2,29 @@
- ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
- ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
- 
--; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
--; no runtime check is generated and the loop should not be vectorized.
-+; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
-+; no runtime check is generated even though one is needed and !noalias
-+; annotations are added.
- define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
- ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
- ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
--; LIMIT0-NEXT:  [[ENTRY:.*]]:
--; LIMIT0-NEXT:    br label %[[LOOP:.*]]
--; LIMIT0:       [[LOOP]]:
--; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
--; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
-+; LIMIT0-NEXT:  [[ENTRY:.*:]]
-+; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
-+; LIMIT0:       [[VECTOR_PH]]:
-+; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
-+; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-+; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-+; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
-+; LIMIT0:       [[VECTOR_BODY]]:
-+; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
- ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
--; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
--; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
-+; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-+; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
- ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
--; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
-+; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-+; LIMIT0:       [[MIDDLE_BLOCK]]:
-+; LIMIT0-NEXT:    br label %[[EXIT:.*]]
- ; LIMIT0:       [[EXIT]]:
--; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
- ; LIMIT0-NEXT:    ret i16 [[TMP0]]
- ;
- ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
-@@ -82,9 +88,14 @@
- !3 = !{!"llvm.loop.vectorize.enable", i1 true}
- 
- ;.
--; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
--; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
--; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
-+; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
-+; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
-+; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
-+; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
-+; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
-+; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
-+; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
-+; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
- ;.
- ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
- ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
-diff -ruN --strip-trailing-cr a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
---- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
-+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
-@@ -1320,8 +1320,9 @@
- }
- 
- template <typename T>
--T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
--                               StringRef entryType, uint64_t depth) {
-+T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
-+                               uint64_t index, StringRef entryType,
-+                               uint64_t depth) {
-   if (index >= entries.size()) {
-     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
-     return {};
diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index e573782a756d19..3c9c005f2315d3 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "87bf5ee23863bc0b467ee44b2184b2c134a98464"
-    LLVM_SHA256 = "9d0bca271bfb266de8453cd34156741fd41f64b911f580262d187ce4d4d9b6d9"
+    LLVM_COMMIT = "48d942c7158af43094db1b5e6c59c6e6fcf1b5aa"
+    LLVM_SHA256 = "6ce4ac276a4687625e9f57e53715285d99b60c6553e0cde4db9b7e74f2179f69"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 486c1d22e6cb1e..4caadcc3b73011 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,1094 +1,1094 @@
 diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
-index 509398d..2948da4 100644
+index 2948da4..509398d 100644
 --- a/third_party/llvm/generated.patch
 +++ b/third_party/llvm/generated.patch
-@@ -1 +1,1074 @@
+@@ -1,1074 +1 @@
  Auto generated patch. Do not edit or delete it, even if empty.
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
-+--- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
-++++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
-+@@ -554,8 +554,6 @@
-+     if (const auto &Dir = Params.initializationOptions.compilationDatabasePath)
-+       CDBOpts.CompileCommandsDir = Dir;
-+     CDBOpts.ContextProvider = Opts.ContextProvider;
-+-    if (Opts.StrongWorkspaceMode)
-+-      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
-+     BaseCDB =
-+         std::make_unique<DirectoryBasedGlobalCompilationDatabase>(CDBOpts);
-+   }
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
-+--- a/clang-tools-extra/clangd/ClangdServer.h
-++++ b/clang-tools-extra/clangd/ClangdServer.h
-+@@ -152,11 +152,6 @@
-+     /// FIXME: If not set, should use the current working directory.
-+     std::optional<std::string> WorkspaceRoot;
-+ 
-+-    /// Sets an alternate mode of operation. Current effects are:
-+-    /// - Using the current working directory as the working directory for
-+-    ///   fallback commands
-+-    bool StrongWorkspaceMode;
-+-
-+     /// The resource directory is used to find internal headers, overriding
-+     /// defaults and -resource-dir compiler flag).
-+     /// If std::nullopt, ClangdServer calls
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
-+--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
-++++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
-+@@ -64,9 +64,7 @@
-+   if (FileExtension.empty() || FileExtension == ".h")
-+     Argv.push_back("-xobjective-c++-header");
-+   Argv.push_back(std::string(File));
-+-  tooling::CompileCommand Cmd(FallbackWorkingDirectory
-+-                                  ? *FallbackWorkingDirectory
-+-                                  : llvm::sys::path::parent_path(File),
-++  tooling::CompileCommand Cmd(llvm::sys::path::parent_path(File),
-+                               llvm::sys::path::filename(File), std::move(Argv),
-+                               /*Output=*/"");
-+   Cmd.Heuristic = "clangd fallback";
-+@@ -351,8 +349,7 @@
-+ 
-+ DirectoryBasedGlobalCompilationDatabase::
-+     DirectoryBasedGlobalCompilationDatabase(const Options &Opts)
-+-    : GlobalCompilationDatabase(Opts.FallbackWorkingDirectory), Opts(Opts),
-+-      Broadcaster(std::make_unique<BroadcastThread>(*this)) {
-++    : Opts(Opts), Broadcaster(std::make_unique<BroadcastThread>(*this)) {
-+   if (!this->Opts.ContextProvider)
-+     this->Opts.ContextProvider = [](llvm::StringRef) {
-+       return Context::current().clone();
-+@@ -463,21 +460,6 @@
-+   return Result;
-+ }
-+ 
-+-void DirectoryBasedGlobalCompilationDatabase::Options::
-+-    applyFallbackWorkingDirectory(
-+-        std::optional<std::string> FallbackWorkingDirectory) {
-+-  if (FallbackWorkingDirectory)
-+-    this->FallbackWorkingDirectory = *FallbackWorkingDirectory;
-+-  else {
-+-    // Clangd is running in strong workspace mode but the client didn't
-+-    // specify a workspace path in the `initialize` request.
-+-    // Fallback to current working directory.
-+-    SmallString<256> CWD;
-+-    llvm::sys::fs::current_path(CWD);
-+-    this->FallbackWorkingDirectory = std::string(CWD);
-+-  }
-+-}
-+-
-+ // The broadcast thread announces files with new compile commands to the world.
-+ // Primarily this is used to enqueue them for background indexing.
-+ //
-+@@ -777,10 +759,9 @@
-+ 
-+ OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base,
-+                        std::vector<std::string> FallbackFlags,
-+-                       CommandMangler Mangler,
-+-                       std::optional<std::string> FallbackWorkingDirectory)
-+-    : DelegatingCDB(Base, FallbackWorkingDirectory),
-+-      Mangler(std::move(Mangler)), FallbackFlags(std::move(FallbackFlags)) {}
-++                       CommandMangler Mangler)
-++    : DelegatingCDB(Base), Mangler(std::move(Mangler)),
-++      FallbackFlags(std::move(FallbackFlags)) {}
-+ 
-+ std::optional<tooling::CompileCommand>
-+ OverlayCDB::getCompileCommand(PathRef File) const {
-+@@ -863,20 +844,16 @@
-+   return MDB;
-+ }
-+ 
-+-DelegatingCDB::DelegatingCDB(
-+-    const GlobalCompilationDatabase *Base,
-+-    std::optional<std::string> FallbackWorkingDirectory)
-+-    : GlobalCompilationDatabase(FallbackWorkingDirectory), Base(Base) {
-++DelegatingCDB::DelegatingCDB(const GlobalCompilationDatabase *Base)
-++    : Base(Base) {
-+   if (Base)
-+     BaseChanged = Base->watch([this](const std::vector<std::string> Changes) {
-+       OnCommandChanged.broadcast(Changes);
-+     });
-+ }
-+ 
-+-DelegatingCDB::DelegatingCDB(
-+-    std::unique_ptr<GlobalCompilationDatabase> Base,
-+-    std::optional<std::string> FallbackWorkingDirectory)
-+-    : DelegatingCDB(Base.get(), FallbackWorkingDirectory) {
-++DelegatingCDB::DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base)
-++    : DelegatingCDB(Base.get()) {
-+   BaseOwner = std::move(Base);
-+ }
-+ 
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
-+--- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h
-++++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
-+@@ -35,9 +35,6 @@
-+ /// Provides compilation arguments used for parsing C and C++ files.
-+ class GlobalCompilationDatabase {
-+ public:
-+-  GlobalCompilationDatabase(
-+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt)
-+-      : FallbackWorkingDirectory(FallbackWorkingDirectory) {}
-+   virtual ~GlobalCompilationDatabase() = default;
-+ 
-+   /// If there are any known-good commands for building this file, returns one.
-+@@ -72,19 +69,14 @@
-+   }
-+ 
-+ protected:
-+-  std::optional<std::string> FallbackWorkingDirectory;
-+   mutable CommandChanged OnCommandChanged;
-+ };
-+ 
-+ // Helper class for implementing GlobalCompilationDatabases that wrap others.
-+ class DelegatingCDB : public GlobalCompilationDatabase {
-+ public:
-+-  DelegatingCDB(
-+-      const GlobalCompilationDatabase *Base,
-+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
-+-  DelegatingCDB(
-+-      std::unique_ptr<GlobalCompilationDatabase> Base,
-+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
-++  DelegatingCDB(const GlobalCompilationDatabase *Base);
-++  DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base);
-+ 
-+   std::optional<tooling::CompileCommand>
-+   getCompileCommand(PathRef File) const override;
-+@@ -125,12 +117,6 @@
-+     // Only look for a compilation database in this one fixed directory.
-+     // FIXME: fold this into config/context mechanism.
-+     std::optional<Path> CompileCommandsDir;
-+-    // Working directory for fallback commands
-+-    // If unset, parent directory of file should be used
-+-    std::optional<std::string> FallbackWorkingDirectory;
-+-
-+-    void applyFallbackWorkingDirectory(
-+-        std::optional<std::string> FallbackWorkingDirectory);
-+   };
-+ 
-+   DirectoryBasedGlobalCompilationDatabase(const Options &Opts);
-+@@ -208,11 +194,9 @@
-+   // Base may be null, in which case no entries are inherited.
-+   // FallbackFlags are added to the fallback compile command.
-+   // Adjuster is applied to all commands, fallback or not.
-+-  OverlayCDB(
-+-      const GlobalCompilationDatabase *Base,
-+-      std::vector<std::string> FallbackFlags = {},
-+-      CommandMangler Mangler = nullptr,
-+-      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
-++  OverlayCDB(const GlobalCompilationDatabase *Base,
-++             std::vector<std::string> FallbackFlags = {},
-++             CommandMangler Mangler = nullptr);
-+ 
-+   std::optional<tooling::CompileCommand>
-+   getCompileCommand(PathRef File) const override;
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
-+--- a/clang-tools-extra/clangd/tool/Check.cpp
-++++ b/clang-tools-extra/clangd/tool/Check.cpp
-+@@ -169,8 +169,6 @@
-+   bool buildCommand(const ThreadsafeFS &TFS) {
-+     log("Loading compilation database...");
-+     DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
-+-    if (Opts.StrongWorkspaceMode)
-+-      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
-+     CDBOpts.CompileCommandsDir =
-+         Config::current().CompileFlags.CDBSearch.FixedCDBPath;
-+     BaseCDB =
-+@@ -180,10 +178,8 @@
-+         getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs));
-+     if (Opts.ResourceDir)
-+       Mangler.ResourceDir = *Opts.ResourceDir;
-+-
-+     CDB = std::make_unique<OverlayCDB>(
-+-        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler),
-+-        CDBOpts.FallbackWorkingDirectory);
-++        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler));
-+ 
-+     if (auto TrueCmd = CDB->getCompileCommand(File)) {
-+       Cmd = std::move(*TrueCmd);
-+@@ -506,7 +502,7 @@
-+                  config::DiagnosticCallback Diag) const override {
-+       config::Fragment F;
-+       // If we're timing clang-tidy checks, implicitly disabling the slow ones
-+-      // is counterproductive!
-++      // is counterproductive! 
-+       if (CheckTidyTime.getNumOccurrences())
-+         F.Diagnostics.ClangTidy.FastCheckFilter.emplace("None");
-+       return {std::move(F).compile(Diag)};
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
-+--- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
-++++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
-+@@ -500,17 +500,6 @@
-+     init(true),
-+ };
-+ 
-+-opt<bool> StrongWorkspaceMode{
-+-    "strong-workspace-mode",
-+-    cat(Features),
-+-    desc("An alternate mode of operation for clangd, where the clangd instance "
-+-         "is used to edit a single workspace.\n"
-+-         "When enabled, fallback commands use the workspace directory as their "
-+-         "working directory instead of the parent folder."),
-+-    init(false),
-+-    Hidden,
-+-};
-+-
-+ opt<bool> UseDirtyHeaders{"use-dirty-headers", cat(Misc),
-+                           desc("Use files open in the editor when parsing "
-+                                "headers instead of reading from the disk"),
-+@@ -918,7 +907,6 @@
-+   }
-+   if (!ResourceDir.empty())
-+     Opts.ResourceDir = ResourceDir;
-+-  Opts.StrongWorkspaceMode = StrongWorkspaceMode;
-+   Opts.BuildDynamicSymbolIndex = true;
-+ #if CLANGD_ENABLE_REMOTE
-+   if (RemoteIndexAddress.empty() != ProjectRoot.empty()) {
-+diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
-+--- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
-++++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
-+@@ -55,20 +55,6 @@
-+                                            testPath("foo/bar")));
-+ }
-+ 
-+-TEST(GlobalCompilationDatabaseTest, FallbackWorkingDirectory) {
-+-  MockFS TFS;
-+-  DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
-+-  CDBOpts.applyFallbackWorkingDirectory(testPath("foo"));
-+-  EXPECT_EQ(CDBOpts.FallbackWorkingDirectory, testPath("foo"));
-+-
-+-  DirectoryBasedGlobalCompilationDatabase DB(CDBOpts);
-+-  auto Cmd = DB.getFallbackCommand(testPath("foo/src/bar.cc"));
-+-  EXPECT_EQ(Cmd.Directory, testPath("foo"));
-+-  EXPECT_THAT(Cmd.CommandLine,
-+-              ElementsAre("clang", testPath("foo/src/bar.cc")));
-+-  EXPECT_EQ(Cmd.Output, "");
-+-}
-+-
-+ static tooling::CompileCommand cmd(llvm::StringRef File, llvm::StringRef Arg) {
-+   return tooling::CompileCommand(
-+       testRoot(), File, {"clang", std::string(Arg), std::string(File)}, "");
-+diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
-+--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
-++++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
-+@@ -618,8 +618,6 @@
-+ 
-+   DenseSet<const MachineBasicBlock *> DirtyBBs;
-+   for (MachineBasicBlock &MBB : MF) {
-+-    if (!MDT->isReachableFromEntry(&MBB))
-+-      continue;
-+     if (MBB.isEHPad()) {
-+       DirtyBBs.insert(&MBB);
-+       continue;
-+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-+--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
-+@@ -708,53 +708,6 @@
-+   return 2;
-+ }
-+ 
-+-bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
-+-                               const TargetInstrInfo &TII) {
-+-  for (MachineInstr &MI : MBB->terminators()) {
-+-    unsigned Opc = MI.getOpcode();
-+-    switch (Opc) {
-+-    case AArch64::CBZW:
-+-    case AArch64::CBZX:
-+-    case AArch64::TBZW:
-+-    case AArch64::TBZX:
-+-      // CBZ/TBZ with WZR/XZR -> unconditional B
-+-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-+-          MI.getOperand(0).getReg() == AArch64::XZR) {
-+-        DEBUG_WITH_TYPE("optimizeTerminators",
-+-                        dbgs() << "Removing always taken branch: " << MI);
-+-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-+-        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
-+-        for (auto *S : Succs)
-+-          if (S != Target)
-+-            MBB->removeSuccessor(S);
-+-        DebugLoc DL = MI.getDebugLoc();
-+-        while (MBB->rbegin() != &MI)
-+-          MBB->rbegin()->eraseFromParent();
-+-        MI.eraseFromParent();
-+-        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
-+-        return true;
-+-      }
-+-      break;
-+-    case AArch64::CBNZW:
-+-    case AArch64::CBNZX:
-+-    case AArch64::TBNZW:
-+-    case AArch64::TBNZX:
-+-      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
-+-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-+-          MI.getOperand(0).getReg() == AArch64::XZR) {
-+-        DEBUG_WITH_TYPE("optimizeTerminators",
-+-                        dbgs() << "Removing never taken branch: " << MI);
-+-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-+-        MI.getParent()->removeSuccessor(Target);
-+-        MI.eraseFromParent();
-+-        return true;
-+-      }
-+-      break;
-+-    }
-+-  }
-+-  return false;
-+-}
-+-
-+ // Find the original register that VReg is copied from.
-+ static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
-+   while (Register::isVirtualRegister(VReg)) {
-+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-+--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-++++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
-+@@ -705,8 +705,6 @@
-+                               unsigned *OutUnscaledOp = nullptr,
-+                               int64_t *EmittableOffset = nullptr);
-+ 
-+-bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
-+-
-+ static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
-+ 
-+ static inline bool isCondBranchOpcode(int Opc) {
-+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
-+--- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
-++++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
-+@@ -14,7 +14,6 @@
-+ //===----------------------------------------------------------------------===//
-+ 
-+ #include "AArch64.h"
-+-#include "AArch64InstrInfo.h"
-+ #include "llvm/CodeGen/MachineFunctionPass.h"
-+ #include "llvm/CodeGen/MachineInstrBuilder.h"
-+ #include "llvm/CodeGen/TargetInstrInfo.h"
-+@@ -46,6 +45,51 @@
-+                 "AArch64 Redundant Conditional Branch Elimination pass", false,
-+                 false)
-+ 
-++static bool optimizeTerminators(MachineBasicBlock *MBB,
-++                                const TargetInstrInfo &TII) {
-++  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
-++    unsigned Opc = MI.getOpcode();
-++    switch (Opc) {
-++    case AArch64::CBZW:
-++    case AArch64::CBZX:
-++    case AArch64::TBZW:
-++    case AArch64::TBZX:
-++      // CBZ/TBZ with WZR/XZR -> unconditional B
-++      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-++          MI.getOperand(0).getReg() == AArch64::XZR) {
-++        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
-++        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-++        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
-++        for (auto *S : Succs)
-++          if (S != Target)
-++            MBB->removeSuccessor(S);
-++        DebugLoc DL = MI.getDebugLoc();
-++        while (MBB->rbegin() != &MI)
-++          MBB->rbegin()->eraseFromParent();
-++        MI.eraseFromParent();
-++        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
-++        return true;
-++      }
-++      break;
-++    case AArch64::CBNZW:
-++    case AArch64::CBNZX:
-++    case AArch64::TBNZW:
-++    case AArch64::TBNZX:
-++      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
-++      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-++          MI.getOperand(0).getReg() == AArch64::XZR) {
-++        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
-++        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-++        MI.getParent()->removeSuccessor(Target);
-++        MI.eraseFromParent();
-++        return true;
-++      }
-++      break;
-++    }
-++  }
-++  return false;
-++}
-++
-+ bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
-+   if (skipFunction(MF.getFunction()))
-+     return false;
-+diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
-+--- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
-++++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
-+@@ -50,7 +50,6 @@
-+ //        to use WZR/XZR directly in some cases.
-+ //===----------------------------------------------------------------------===//
-+ #include "AArch64.h"
-+-#include "AArch64InstrInfo.h"
-+ #include "llvm/ADT/SetVector.h"
-+ #include "llvm/ADT/Statistic.h"
-+ #include "llvm/ADT/iterator_range.h"
-+@@ -476,7 +475,6 @@
-+     return false;
-+   TRI = MF.getSubtarget().getRegisterInfo();
-+   MRI = &MF.getRegInfo();
-+-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-+ 
-+   // Resize the clobbered and used register unit trackers.  We do this once per
-+   // function.
-+@@ -486,10 +484,8 @@
-+   OptBBUsedRegs.init(*TRI);
-+ 
-+   bool Changed = false;
-+-  for (MachineBasicBlock &MBB : MF) {
-+-    Changed |= optimizeTerminators(&MBB, TII);
-++  for (MachineBasicBlock &MBB : MF)
-+     Changed |= optimizeBlock(&MBB);
-+-  }
-+   return Changed;
-+ }
-+ 
-+diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
-+--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
-++++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
-+@@ -1827,12 +1827,8 @@
-+     // profile info.
-+     CostTooHigh =
-+         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
-+-    if (CostTooHigh) {
-+-      // Mark runtime checks as never succeeding when they exceed the threshold.
-+-      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
-+-      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
-++    if (CostTooHigh)
-+       return;
-+-    }
-+ 
-+     BasicBlock *LoopHeader = L->getHeader();
-+     BasicBlock *Preheader = L->getLoopPreheader();
-+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
-+--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
-++++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
-+@@ -735,15 +735,21 @@
-+ ; ENABLE-NEXT:    .cfi_offset w29, -16
-+ ; ENABLE-NEXT:    .cfi_offset w19, -24
-+ ; ENABLE-NEXT:    .cfi_offset w20, -32
-++; ENABLE-NEXT:  ; %bb.1: ; %if.then
-+ ; ENABLE-NEXT:    sub x19, sp, #16
-+ ; ENABLE-NEXT:    mov sp, x19
-+ ; ENABLE-NEXT:    mov w20, wzr
-+-; ENABLE-NEXT:  LBB10_1: ; %for.body
-++; ENABLE-NEXT:  LBB10_2: ; %for.body
-+ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-+ ; ENABLE-NEXT:    bl _something
-+ ; ENABLE-NEXT:    add w20, w0, w20
-+ ; ENABLE-NEXT:    str w20, [x19]
-+-; ENABLE-NEXT:    b LBB10_1
-++; ENABLE-NEXT:    b LBB10_2
-++; ENABLE-NEXT:  ; %bb.3: ; %if.end
-++; ENABLE-NEXT:    sub sp, x29, #16
-++; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-++; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-++; ENABLE-NEXT:    ret
-+ ;
-+ ; DISABLE-LABEL: infiniteloop:
-+ ; DISABLE:       ; %bb.0: ; %entry
-+@@ -755,15 +761,21 @@
-+ ; DISABLE-NEXT:    .cfi_offset w29, -16
-+ ; DISABLE-NEXT:    .cfi_offset w19, -24
-+ ; DISABLE-NEXT:    .cfi_offset w20, -32
-++; DISABLE-NEXT:  ; %bb.1: ; %if.then
-+ ; DISABLE-NEXT:    sub x19, sp, #16
-+ ; DISABLE-NEXT:    mov sp, x19
-+ ; DISABLE-NEXT:    mov w20, wzr
-+-; DISABLE-NEXT:  LBB10_1: ; %for.body
-++; DISABLE-NEXT:  LBB10_2: ; %for.body
-+ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-+ ; DISABLE-NEXT:    bl _something
-+ ; DISABLE-NEXT:    add w20, w0, w20
-+ ; DISABLE-NEXT:    str w20, [x19]
-+-; DISABLE-NEXT:    b LBB10_1
-++; DISABLE-NEXT:    b LBB10_2
-++; DISABLE-NEXT:  ; %bb.3: ; %if.end
-++; DISABLE-NEXT:    sub sp, x29, #16
-++; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-++; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-++; DISABLE-NEXT:    ret
-+ entry:
-+   br i1 undef, label %if.then, label %if.end
-+ 
-+@@ -794,10 +806,11 @@
-+ ; ENABLE-NEXT:    .cfi_offset w29, -16
-+ ; ENABLE-NEXT:    .cfi_offset w19, -24
-+ ; ENABLE-NEXT:    .cfi_offset w20, -32
-++; ENABLE-NEXT:  ; %bb.1: ; %if.then
-+ ; ENABLE-NEXT:    sub x8, sp, #16
-+ ; ENABLE-NEXT:    mov sp, x8
-+ ; ENABLE-NEXT:    mov w9, wzr
-+-; ENABLE-NEXT:  LBB11_1: ; %for.body
-++; ENABLE-NEXT:  LBB11_2: ; %for.body
-+ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-+ ; ENABLE-NEXT:    ; InlineAsm Start
-+ ; ENABLE-NEXT:    mov x10, #0 ; =0x0
-+@@ -808,7 +821,12 @@
-+ ; ENABLE-NEXT:    ; InlineAsm Start
-+ ; ENABLE-NEXT:    nop
-+ ; ENABLE-NEXT:    ; InlineAsm End
-+-; ENABLE-NEXT:    b LBB11_1
-++; ENABLE-NEXT:    b LBB11_2
-++; ENABLE-NEXT:  ; %bb.3: ; %if.end
-++; ENABLE-NEXT:    sub sp, x29, #16
-++; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-++; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-++; ENABLE-NEXT:    ret
-+ ;
-+ ; DISABLE-LABEL: infiniteloop2:
-+ ; DISABLE:       ; %bb.0: ; %entry
-+@@ -820,10 +838,11 @@
-+ ; DISABLE-NEXT:    .cfi_offset w29, -16
-+ ; DISABLE-NEXT:    .cfi_offset w19, -24
-+ ; DISABLE-NEXT:    .cfi_offset w20, -32
-++; DISABLE-NEXT:  ; %bb.1: ; %if.then
-+ ; DISABLE-NEXT:    sub x8, sp, #16
-+ ; DISABLE-NEXT:    mov sp, x8
-+ ; DISABLE-NEXT:    mov w9, wzr
-+-; DISABLE-NEXT:  LBB11_1: ; %for.body
-++; DISABLE-NEXT:  LBB11_2: ; %for.body
-+ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-+ ; DISABLE-NEXT:    ; InlineAsm Start
-+ ; DISABLE-NEXT:    mov x10, #0 ; =0x0
-+@@ -834,7 +853,12 @@
-+ ; DISABLE-NEXT:    ; InlineAsm Start
-+ ; DISABLE-NEXT:    nop
-+ ; DISABLE-NEXT:    ; InlineAsm End
-+-; DISABLE-NEXT:    b LBB11_1
-++; DISABLE-NEXT:    b LBB11_2
-++; DISABLE-NEXT:  ; %bb.3: ; %if.end
-++; DISABLE-NEXT:    sub sp, x29, #16
-++; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
-++; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
-++; DISABLE-NEXT:    ret
-+ entry:
-+   br i1 undef, label %if.then, label %if.end
-+ 
-+@@ -865,43 +889,49 @@
-+ define void @infiniteloop3() {
-+ ; ENABLE-LABEL: infiniteloop3:
-+ ; ENABLE:       ; %bb.0: ; %entry
-++; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
-+ ; ENABLE-NEXT:    mov x8, xzr
-+ ; ENABLE-NEXT:    mov x9, xzr
-+ ; ENABLE-NEXT:    mov x11, xzr
-+-; ENABLE-NEXT:    b LBB12_2
-+-; ENABLE-NEXT:  LBB12_1: ; %loop2b
-+-; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
-++; ENABLE-NEXT:    b LBB12_3
-++; ENABLE-NEXT:  LBB12_2: ; %loop2b
-++; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
-+ ; ENABLE-NEXT:    str x10, [x11]
-+ ; ENABLE-NEXT:    mov x11, x10
-+-; ENABLE-NEXT:  LBB12_2: ; %loop1
-++; ENABLE-NEXT:  LBB12_3: ; %loop1
-+ ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-+ ; ENABLE-NEXT:    mov x10, x9
-+ ; ENABLE-NEXT:    ldr x9, [x8]
-+-; ENABLE-NEXT:    cbnz x8, LBB12_1
-+-; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
-++; ENABLE-NEXT:    cbnz x8, LBB12_2
-++; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
-+ ; ENABLE-NEXT:    mov x8, x10
-+ ; ENABLE-NEXT:    mov x11, x10
-+-; ENABLE-NEXT:    b LBB12_2
-++; ENABLE-NEXT:    b LBB12_3
-++; ENABLE-NEXT:  ; %bb.5: ; %end
-++; ENABLE-NEXT:    ret
-+ ;
-+ ; DISABLE-LABEL: infiniteloop3:
-+ ; DISABLE:       ; %bb.0: ; %entry
-++; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
-+ ; DISABLE-NEXT:    mov x8, xzr
-+ ; DISABLE-NEXT:    mov x9, xzr
-+ ; DISABLE-NEXT:    mov x11, xzr
-+-; DISABLE-NEXT:    b LBB12_2
-+-; DISABLE-NEXT:  LBB12_1: ; %loop2b
-+-; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
-++; DISABLE-NEXT:    b LBB12_3
-++; DISABLE-NEXT:  LBB12_2: ; %loop2b
-++; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
-+ ; DISABLE-NEXT:    str x10, [x11]
-+ ; DISABLE-NEXT:    mov x11, x10
-+-; DISABLE-NEXT:  LBB12_2: ; %loop1
-++; DISABLE-NEXT:  LBB12_3: ; %loop1
-+ ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-+ ; DISABLE-NEXT:    mov x10, x9
-+ ; DISABLE-NEXT:    ldr x9, [x8]
-+-; DISABLE-NEXT:    cbnz x8, LBB12_1
-+-; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
-++; DISABLE-NEXT:    cbnz x8, LBB12_2
-++; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
-+ ; DISABLE-NEXT:    mov x8, x10
-+ ; DISABLE-NEXT:    mov x11, x10
-+-; DISABLE-NEXT:    b LBB12_2
-++; DISABLE-NEXT:    b LBB12_3
-++; DISABLE-NEXT:  ; %bb.5: ; %end
-++; DISABLE-NEXT:    ret
-+ entry:
-+   br i1 undef, label %loop2a, label %body
-+ 
-+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
-+--- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
-++++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
-+@@ -8,14 +8,20 @@
-+ define i8 @foo_optsize(i32 %v4) optsize {
-+ ; CHECK-LABEL: foo_optsize:
-+ ; CHECK:       // %bb.0: // %entry
-+-; CHECK-NEXT:    cbnz w0, .LBB0_2
-+-; CHECK-NEXT:  // %bb.1: // %b2
-+-; CHECK-NEXT:    mov w0, #1 // =0x1
-++; CHECK-NEXT:    b .LBB0_2
-++; CHECK-NEXT:  .LBB0_1:
-++; CHECK-NEXT:    mov w0, wzr
-+ ; CHECK-NEXT:    ret
-+ ; CHECK-NEXT:  .LBB0_2: // %b1
-+-; CHECK-NEXT:    cmp w0, #1
-+-; CHECK-NEXT:    mov w0, wzr
-++; CHECK-NEXT:    cbnz w0, .LBB0_4
-++; CHECK-NEXT:  // %bb.3: // %b2
-++; CHECK-NEXT:    mov w0, #1 // =0x1
-+ ; CHECK-NEXT:    ret
-++; CHECK-NEXT:  .LBB0_4: // %b1
-++; CHECK-NEXT:    cmp w0, #1
-++; CHECK-NEXT:    b.ne .LBB0_1
-++; CHECK-NEXT:  // %bb.5: // %b3
-++; CHECK-NEXT:    b .LBB0_1
-+ entry:
-+   %v2 = icmp eq i32 0, 0
-+   br i1 %v2, label %b1, label %b4
-+@@ -41,14 +47,20 @@
-+ define i8 @foo_optspeed(i32 %v4) {
-+ ; CHECK-LABEL: foo_optspeed:
-+ ; CHECK:       // %bb.0: // %entry
-+-; CHECK-NEXT:    cbnz w0, .LBB1_2
-+-; CHECK-NEXT:  // %bb.1: // %b2
-+-; CHECK-NEXT:    mov w0, #1 // =0x1
-++; CHECK-NEXT:    b .LBB1_2
-++; CHECK-NEXT:  .LBB1_1:
-++; CHECK-NEXT:    mov w0, wzr
-+ ; CHECK-NEXT:    ret
-+ ; CHECK-NEXT:  .LBB1_2: // %b1
-+-; CHECK-NEXT:    cmp w0, #1
-+-; CHECK-NEXT:    mov w0, wzr
-++; CHECK-NEXT:    cbnz w0, .LBB1_4
-++; CHECK-NEXT:  // %bb.3: // %b2
-++; CHECK-NEXT:    mov w0, #1 // =0x1
-+ ; CHECK-NEXT:    ret
-++; CHECK-NEXT:  .LBB1_4: // %b1
-++; CHECK-NEXT:    cmp w0, #1
-++; CHECK-NEXT:    b.ne .LBB1_1
-++; CHECK-NEXT:  // %bb.5: // %b3
-++; CHECK-NEXT:    b .LBB1_1
-+ entry:
-+   %v2 = icmp eq i32 0, 0
-+   br i1 %v2, label %b1, label %b4
-+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
-+--- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
-++++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
-+@@ -21,8 +21,10 @@
-+   ; CHECK-NEXT:   B %bb.3
-+   ; CHECK-NEXT: {{  $}}
-+   ; CHECK-NEXT: bb.1.bb:
-++  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
-+   ; CHECK-NEXT:   liveins: $w0, $lr
-+   ; CHECK-NEXT: {{  $}}
-++  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
-+   ; CHECK-NEXT:   B %bb.2
-+   ; CHECK-NEXT: {{  $}}
-+   ; CHECK-NEXT: bb.2.bb1:
-+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
-+--- a/llvm/test/CodeGen/AArch64/pr164181.ll
-++++ b/llvm/test/CodeGen/AArch64/pr164181.ll
-+@@ -29,11 +29,11 @@
-+ ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
-+ ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
-+ ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
-+-; CHECK-NEXT:    tbz w5, #0, .LBB0_40
-++; CHECK-NEXT:    tbz w5, #0, .LBB0_43
-+ ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
-+ ; CHECK-NEXT:    ldr x4, [sp, #312]
-+ ; CHECK-NEXT:    ldr x14, [sp, #280]
-+-; CHECK-NEXT:    tbz w0, #0, .LBB0_39
-++; CHECK-NEXT:    tbz w0, #0, .LBB0_42
-+ ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
-+ ; CHECK-NEXT:    ldrb w8, [sp, #368]
-+ ; CHECK-NEXT:    ldrb w12, [sp, #256]
-+@@ -92,7 +92,7 @@
-+ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
-+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-+ ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
-+ ; CHECK-NEXT:    mov x12, x24
-+ ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
-+@@ -117,7 +117,7 @@
-+ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
-+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-+ ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
-+ ; CHECK-NEXT:    cmn x24, #30
-+ ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
-+@@ -142,7 +142,7 @@
-+ ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
-+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-+ ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
-+ ; CHECK-NEXT:    mov w14, #1152 // =0x480
-+ ; CHECK-NEXT:    mov w24, #1 // =0x1
-+@@ -176,7 +176,7 @@
-+ ; CHECK-NEXT:    // => This Loop Header: Depth=4
-+ ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-+ ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-+-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
-++; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-+ ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
-+ ; CHECK-NEXT:    and w8, w8, w8, asr #31
-+ ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
-+@@ -281,23 +281,31 @@
-+ ; CHECK-NEXT:    mov x24, xzr
-+ ; CHECK-NEXT:    mul w12, w12, w22
-+ ; CHECK-NEXT:    mov x22, x5
-+-; CHECK-NEXT:    tbz w0, #0, .LBB0_33
-+-; CHECK-NEXT:  .LBB0_28: // %if.then222.us
-++; CHECK-NEXT:    tbz w0, #0, .LBB0_36
-++; CHECK-NEXT:  .LBB0_28: // %for.body194.us
-+ ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
-+ ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
-+ ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
-+ ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
-+ ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
-++; CHECK-NEXT:  // %bb.29: // %if.then222.us
-++; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-+ ; CHECK-NEXT:    adrp x27, :got:var_32
-+ ; CHECK-NEXT:    ldur w8, [x19, #-12]
-+ ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
-+ ; CHECK-NEXT:    strh w8, [x27]
-+ ; CHECK-NEXT:    sxtb w8, w25
-+-; CHECK-NEXT:    strb w3, [x16]
-+ ; CHECK-NEXT:    bic w25, w8, w8, asr #31
-++; CHECK-NEXT:    b .LBB0_31
-++; CHECK-NEXT:    .p2align 5, , 16
-++; CHECK-NEXT:  // %bb.30:
-++; CHECK-NEXT:    mov w25, wzr
-++; CHECK-NEXT:  .LBB0_31: // %if.end239.us
-++; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-++; CHECK-NEXT:    strb w3, [x16]
-+ ; CHECK-NEXT:    tst w13, #0xff
-+-; CHECK-NEXT:    b.eq .LBB0_30
-+-; CHECK-NEXT:  // %bb.29: // %if.then254.us
-++; CHECK-NEXT:    b.eq .LBB0_33
-++; CHECK-NEXT:  // %bb.32: // %if.then254.us
-+ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-+ ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
-+ ; CHECK-NEXT:    adrp x27, :got:var_35
-+@@ -306,7 +314,7 @@
-+ ; CHECK-NEXT:    csel x8, xzr, x7, eq
-+ ; CHECK-NEXT:    str x8, [x27]
-+ ; CHECK-NEXT:    strh w1, [x17]
-+-; CHECK-NEXT:  .LBB0_30: // %if.end282.us
-++; CHECK-NEXT:  .LBB0_33: // %if.end282.us
-+ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-+ ; CHECK-NEXT:    orr x27, x24, x4
-+ ; CHECK-NEXT:    adrp x8, :got:var_39
-+@@ -317,14 +325,14 @@
-+ ; CHECK-NEXT:    str x8, [x18]
-+ ; CHECK-NEXT:    mov w8, #1 // =0x1
-+ ; CHECK-NEXT:    cbnz x2, .LBB0_27
-+-; CHECK-NEXT:  // %bb.31: // %if.then327.us
-++; CHECK-NEXT:  // %bb.34: // %if.then327.us
-+ ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-+ ; CHECK-NEXT:    cbz w8, .LBB0_25
-+-; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
-++; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
-+ ; CHECK-NEXT:    mov w4, wzr
-+ ; CHECK-NEXT:    b .LBB0_26
-+ ; CHECK-NEXT:    .p2align 5, , 16
-+-; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
-++; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
-+ ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
-+ ; CHECK-NEXT:    mov w3, #1152 // =0x480
-+ ; CHECK-NEXT:    mov x22, xzr
-+@@ -335,24 +343,24 @@
-+ ; CHECK-NEXT:    madd x14, x14, x3, x11
-+ ; CHECK-NEXT:    mov w28, w30
-+ ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
-+-; CHECK-NEXT:    b .LBB0_36
-++; CHECK-NEXT:    b .LBB0_39
-+ ; CHECK-NEXT:    .p2align 5, , 16
-+-; CHECK-NEXT:  .LBB0_34: // %if.then466.us
-+-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
-++; CHECK-NEXT:  .LBB0_37: // %if.then466.us
-++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
-+ ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
-+ ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
-+ ; CHECK-NEXT:    sxtb w4, w4
-+ ; CHECK-NEXT:    bic w4, w4, w4, asr #31
-+ ; CHECK-NEXT:    str x3, [x28]
-+ ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
-+-; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
-+-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
-++; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
-++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
-+ ; CHECK-NEXT:    add x22, x22, #1
-+ ; CHECK-NEXT:    add x27, x27, #1
-+ ; CHECK-NEXT:    mov w28, wzr
-+ ; CHECK-NEXT:    cmp x27, #0
-+ ; CHECK-NEXT:    b.hs .LBB0_9
-+-; CHECK-NEXT:  .LBB0_36: // %for.body380.us
-++; CHECK-NEXT:  .LBB0_39: // %for.body380.us
-+ ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
-+ ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
-+ ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
-+@@ -364,18 +372,18 @@
-+ ; CHECK-NEXT:    strh w28, [x11]
-+ ; CHECK-NEXT:    csel w28, w21, w3, ne
-+ ; CHECK-NEXT:    str w28, [x20]
-+-; CHECK-NEXT:    cbz x15, .LBB0_35
-+-; CHECK-NEXT:  // %bb.37: // %if.then436.us
-+-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
-++; CHECK-NEXT:    cbz x15, .LBB0_38
-++; CHECK-NEXT:  // %bb.40: // %if.then436.us
-++; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
-+ ; CHECK-NEXT:    ldrh w28, [x14]
-+-; CHECK-NEXT:    cbnz w28, .LBB0_34
-+-; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
-++; CHECK-NEXT:    cbnz w28, .LBB0_37
-++; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
-+ ; CHECK-NEXT:    mov w4, wzr
-+-; CHECK-NEXT:    b .LBB0_35
-+-; CHECK-NEXT:  .LBB0_39: // %for.body41
-++; CHECK-NEXT:    b .LBB0_38
-++; CHECK-NEXT:  .LBB0_42: // %for.body41
-+ ; CHECK-NEXT:    strb wzr, [x4]
-+ ; CHECK-NEXT:    strb wzr, [x14]
-+-; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
-++; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
-+ ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
-+ ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
-+ ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
-+diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
-+--- a/llvm/test/CodeGen/AArch64/pr166870.ll
-++++ b/llvm/test/CodeGen/AArch64/pr166870.ll
-+@@ -26,11 +26,12 @@
-+ ; CHECK-NEXT:    mov x21, x1
-+ ; CHECK-NEXT:    bl baz
-+ ; CHECK-NEXT:    mov w0, #0 // =0x0
-++; CHECK-NEXT:  // %bb.5: // %bb6
-+ ; CHECK-NEXT:    mov w10, #1 // =0x1
-++; CHECK-NEXT:    cbnz w10, .LBB0_11
-++; CHECK-NEXT:  // %bb.6: // %bb7
-+ ; CHECK-NEXT:    cbnz w10, .LBB0_10
-+-; CHECK-NEXT:  // %bb.5: // %bb7
-+-; CHECK-NEXT:    cbnz w10, .LBB0_9
-+-; CHECK-NEXT:  // %bb.6: // %bb8
-++; CHECK-NEXT:  // %bb.7: // %bb8
-+ ; CHECK-NEXT:    mov x8, x21
-+ ; CHECK-NEXT:    mov x9, x20
-+ ; CHECK-NEXT:    mov w20, #0 // =0x0
-+@@ -38,17 +39,17 @@
-+ ; CHECK-NEXT:    mov x21, x9
-+ ; CHECK-NEXT:    mov w8, w8
-+ ; CHECK-NEXT:    mov x22, x8
-+-; CHECK-NEXT:  .LBB0_7: // %bb10
-++; CHECK-NEXT:  .LBB0_8: // %bb10
-+ ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-+ ; CHECK-NEXT:    strb w20, [x19]
-+-; CHECK-NEXT:    cbnz x21, .LBB0_7
-+-; CHECK-NEXT:  // %bb.8: // %bb12
-+-; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
-++; CHECK-NEXT:    cbnz x21, .LBB0_8
-++; CHECK-NEXT:  // %bb.9: // %bb12
-++; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
-+ ; CHECK-NEXT:    bl snork
-+-; CHECK-NEXT:    cbnz x22, .LBB0_7
-+-; CHECK-NEXT:  .LBB0_9:
-+-; CHECK-NEXT:    mov w0, #0 // =0x0
-++; CHECK-NEXT:    cbnz x22, .LBB0_8
-+ ; CHECK-NEXT:  .LBB0_10:
-++; CHECK-NEXT:    mov w0, #0 // =0x0
-++; CHECK-NEXT:  .LBB0_11:
-+ ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-+ ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-+ ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
-+diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
-+--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
-++++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
-+@@ -71,21 +71,27 @@
-+ ; CHECK-NEXT:    .cfi_def_cfa w29, 16
-+ ; CHECK-NEXT:    .cfi_offset w30, -8
-+ ; CHECK-NEXT:    .cfi_offset w29, -16
-++; CHECK-NEXT:    .cfi_remember_state
-+ ; CHECK-NEXT:    mov w8, #1 // =0x1
-+-; CHECK-NEXT:    mov w9, #2 // =0x2
-+ ; CHECK-NEXT:    stur xzr, [x29, #-8]
-+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+-; CHECK-NEXT:    ldur w8, [x29, #-8]
-+-; CHECK-NEXT:    cbz w8, .LBB0_2
-++; CHECK-NEXT:    b .LBB0_3
-+ ; CHECK-NEXT:  // %bb.1:
-+-; CHECK-NEXT:    mov w8, #1 // =0x1
-+ ; CHECK-NEXT:    str w8, [sp, #16]
-+-; CHECK-NEXT:    b .LBB0_3
-++; CHECK-NEXT:    ldur w8, [x29, #-8]
-++; CHECK-NEXT:    cbz w8, .LBB0_4
-+ ; CHECK-NEXT:  .LBB0_2:
-++; CHECK-NEXT:    .cfi_restore_state
-+ ; CHECK-NEXT:    mov w8, #1 // =0x1
-+-; CHECK-NEXT:    mov w9, #2 // =0x2
-+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-++; CHECK-NEXT:    str w8, [sp, #16]
-++; CHECK-NEXT:    b .LBB0_5
-+ ; CHECK-NEXT:  .LBB0_3:
-++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-++; CHECK-NEXT:    ldur w8, [x29, #-8]
-++; CHECK-NEXT:    cbnz w8, .LBB0_2
-++; CHECK-NEXT:  .LBB0_4:
-++; CHECK-NEXT:    mov w8, #1 // =0x1
-++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-++; CHECK-NEXT:  .LBB0_5:
-+ ; CHECK-NEXT:    mov w0, wzr
-+ ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
-+ ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-+@@ -128,6 +134,7 @@
-+ ;
-+ ; CHECK-LABEL: OUTLINED_FUNCTION_0:
-+ ; CHECK:       // %bb.0:
-++; CHECK-NEXT:    mov w9, #2 // =0x2
-+ ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
-+ ; CHECK-NEXT:    mov w9, #3 // =0x3
-+ ; CHECK-NEXT:    mov w8, #4 // =0x4
-+diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
-+--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
-++++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
-+@@ -12,21 +12,27 @@
-+ ; CHECK-NEXT:    .cfi_def_cfa w29, 16
-+ ; CHECK-NEXT:    .cfi_offset w30, -8
-+ ; CHECK-NEXT:    .cfi_offset w29, -16
-++; CHECK-NEXT:    .cfi_remember_state
-+ ; CHECK-NEXT:    mov w8, #1 // =0x1
-+-; CHECK-NEXT:    mov w9, #2 // =0x2
-+ ; CHECK-NEXT:    stur xzr, [x29, #-8]
-+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-+-; CHECK-NEXT:    ldur w8, [x29, #-8]
-+-; CHECK-NEXT:    cbz w8, .LBB0_2
-++; CHECK-NEXT:    b .LBB0_3
-+ ; CHECK-NEXT:  // %bb.1:
-+-; CHECK-NEXT:    mov w8, #1 // =0x1
-+ ; CHECK-NEXT:    str w8, [sp, #16]
-+-; CHECK-NEXT:    b .LBB0_3
-++; CHECK-NEXT:    ldur w8, [x29, #-8]
-++; CHECK-NEXT:    cbz w8, .LBB0_4
-+ ; CHECK-NEXT:  .LBB0_2:
-++; CHECK-NEXT:    .cfi_restore_state
-+ ; CHECK-NEXT:    mov w8, #1 // =0x1
-+-; CHECK-NEXT:    mov w9, #2 // =0x2
-+-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-++; CHECK-NEXT:    str w8, [sp, #16]
-++; CHECK-NEXT:    b .LBB0_5
-+ ; CHECK-NEXT:  .LBB0_3:
-++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-++; CHECK-NEXT:    ldur w8, [x29, #-8]
-++; CHECK-NEXT:    cbnz w8, .LBB0_2
-++; CHECK-NEXT:  .LBB0_4:
-++; CHECK-NEXT:    mov w8, #1 // =0x1
-++; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-++; CHECK-NEXT:  .LBB0_5:
-+ ; CHECK-NEXT:    mov w0, wzr
-+ ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
-+ ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
-+diff -ruN --strip-trailing-cr a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
-+--- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
-++++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
-+@@ -2,23 +2,29 @@
-+ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
-+ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
-+ 
-+-; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
-+-; no runtime check is generated and the loop should not be vectorized.
-++; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
-++; no runtime check is generated even though one is needed and !noalias
-++; annotations are added.
-+ define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
-+ ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
-+ ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
-+-; LIMIT0-NEXT:  [[ENTRY:.*]]:
-+-; LIMIT0-NEXT:    br label %[[LOOP:.*]]
-+-; LIMIT0:       [[LOOP]]:
-+-; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
-+-; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
-++; LIMIT0-NEXT:  [[ENTRY:.*:]]
-++; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
-++; LIMIT0:       [[VECTOR_PH]]:
-++; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
-++; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-++; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-++; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
-++; LIMIT0:       [[VECTOR_BODY]]:
-++; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-+ ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
-+-; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
-+-; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
-++; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-++; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-+ ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-+-; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
-++; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-++; LIMIT0:       [[MIDDLE_BLOCK]]:
-++; LIMIT0-NEXT:    br label %[[EXIT:.*]]
-+ ; LIMIT0:       [[EXIT]]:
-+-; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
-+ ; LIMIT0-NEXT:    ret i16 [[TMP0]]
-+ ;
-+ ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
-+@@ -82,9 +88,14 @@
-+ !3 = !{!"llvm.loop.vectorize.enable", i1 true}
-+ 
-+ ;.
-+-; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-+-; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
-+-; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
-++; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
-++; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
-++; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
-++; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
-++; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
-++; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
-++; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
-++; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
-+ ;.
-+ ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
-+ ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
-+diff -ruN --strip-trailing-cr a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
-+--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
-++++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
-+@@ -1320,8 +1320,9 @@
-+ }
-+ 
-+ template <typename T>
-+-T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-+-                               StringRef entryType, uint64_t depth) {
-++T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
-++                               uint64_t index, StringRef entryType,
-++                               uint64_t depth) {
-+   if (index >= entries.size()) {
-+     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
-+     return {};
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
+---- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
+-+++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
+-@@ -554,8 +554,6 @@
+-     if (const auto &Dir = Params.initializationOptions.compilationDatabasePath)
+-       CDBOpts.CompileCommandsDir = Dir;
+-     CDBOpts.ContextProvider = Opts.ContextProvider;
+--    if (Opts.StrongWorkspaceMode)
+--      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
+-     BaseCDB =
+-         std::make_unique<DirectoryBasedGlobalCompilationDatabase>(CDBOpts);
+-   }
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
+---- a/clang-tools-extra/clangd/ClangdServer.h
+-+++ b/clang-tools-extra/clangd/ClangdServer.h
+-@@ -152,11 +152,6 @@
+-     /// FIXME: If not set, should use the current working directory.
+-     std::optional<std::string> WorkspaceRoot;
+- 
+--    /// Sets an alternate mode of operation. Current effects are:
+--    /// - Using the current working directory as the working directory for
+--    ///   fallback commands
+--    bool StrongWorkspaceMode;
+--
+-     /// The resource directory is used to find internal headers, overriding
+-     /// defaults and -resource-dir compiler flag).
+-     /// If std::nullopt, ClangdServer calls
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+---- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+-+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
+-@@ -64,9 +64,7 @@
+-   if (FileExtension.empty() || FileExtension == ".h")
+-     Argv.push_back("-xobjective-c++-header");
+-   Argv.push_back(std::string(File));
+--  tooling::CompileCommand Cmd(FallbackWorkingDirectory
+--                                  ? *FallbackWorkingDirectory
+--                                  : llvm::sys::path::parent_path(File),
+-+  tooling::CompileCommand Cmd(llvm::sys::path::parent_path(File),
+-                               llvm::sys::path::filename(File), std::move(Argv),
+-                               /*Output=*/"");
+-   Cmd.Heuristic = "clangd fallback";
+-@@ -351,8 +349,7 @@
+- 
+- DirectoryBasedGlobalCompilationDatabase::
+-     DirectoryBasedGlobalCompilationDatabase(const Options &Opts)
+--    : GlobalCompilationDatabase(Opts.FallbackWorkingDirectory), Opts(Opts),
+--      Broadcaster(std::make_unique<BroadcastThread>(*this)) {
+-+    : Opts(Opts), Broadcaster(std::make_unique<BroadcastThread>(*this)) {
+-   if (!this->Opts.ContextProvider)
+-     this->Opts.ContextProvider = [](llvm::StringRef) {
+-       return Context::current().clone();
+-@@ -463,21 +460,6 @@
+-   return Result;
+- }
+- 
+--void DirectoryBasedGlobalCompilationDatabase::Options::
+--    applyFallbackWorkingDirectory(
+--        std::optional<std::string> FallbackWorkingDirectory) {
+--  if (FallbackWorkingDirectory)
+--    this->FallbackWorkingDirectory = *FallbackWorkingDirectory;
+--  else {
+--    // Clangd is running in strong workspace mode but the client didn't
+--    // specify a workspace path in the `initialize` request.
+--    // Fallback to current working directory.
+--    SmallString<256> CWD;
+--    llvm::sys::fs::current_path(CWD);
+--    this->FallbackWorkingDirectory = std::string(CWD);
+--  }
+--}
+--
+- // The broadcast thread announces files with new compile commands to the world.
+- // Primarily this is used to enqueue them for background indexing.
+- //
+-@@ -777,10 +759,9 @@
+- 
+- OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base,
+-                        std::vector<std::string> FallbackFlags,
+--                       CommandMangler Mangler,
+--                       std::optional<std::string> FallbackWorkingDirectory)
+--    : DelegatingCDB(Base, FallbackWorkingDirectory),
+--      Mangler(std::move(Mangler)), FallbackFlags(std::move(FallbackFlags)) {}
+-+                       CommandMangler Mangler)
+-+    : DelegatingCDB(Base), Mangler(std::move(Mangler)),
+-+      FallbackFlags(std::move(FallbackFlags)) {}
+- 
+- std::optional<tooling::CompileCommand>
+- OverlayCDB::getCompileCommand(PathRef File) const {
+-@@ -863,20 +844,16 @@
+-   return MDB;
+- }
+- 
+--DelegatingCDB::DelegatingCDB(
+--    const GlobalCompilationDatabase *Base,
+--    std::optional<std::string> FallbackWorkingDirectory)
+--    : GlobalCompilationDatabase(FallbackWorkingDirectory), Base(Base) {
+-+DelegatingCDB::DelegatingCDB(const GlobalCompilationDatabase *Base)
+-+    : Base(Base) {
+-   if (Base)
+-     BaseChanged = Base->watch([this](const std::vector<std::string> Changes) {
+-       OnCommandChanged.broadcast(Changes);
+-     });
+- }
+- 
+--DelegatingCDB::DelegatingCDB(
+--    std::unique_ptr<GlobalCompilationDatabase> Base,
+--    std::optional<std::string> FallbackWorkingDirectory)
+--    : DelegatingCDB(Base.get(), FallbackWorkingDirectory) {
+-+DelegatingCDB::DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base)
+-+    : DelegatingCDB(Base.get()) {
+-   BaseOwner = std::move(Base);
+- }
+- 
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
+---- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h
+-+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
+-@@ -35,9 +35,6 @@
+- /// Provides compilation arguments used for parsing C and C++ files.
+- class GlobalCompilationDatabase {
+- public:
+--  GlobalCompilationDatabase(
+--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt)
+--      : FallbackWorkingDirectory(FallbackWorkingDirectory) {}
+-   virtual ~GlobalCompilationDatabase() = default;
+- 
+-   /// If there are any known-good commands for building this file, returns one.
+-@@ -72,19 +69,14 @@
+-   }
+- 
+- protected:
+--  std::optional<std::string> FallbackWorkingDirectory;
+-   mutable CommandChanged OnCommandChanged;
+- };
+- 
+- // Helper class for implementing GlobalCompilationDatabases that wrap others.
+- class DelegatingCDB : public GlobalCompilationDatabase {
+- public:
+--  DelegatingCDB(
+--      const GlobalCompilationDatabase *Base,
+--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
+--  DelegatingCDB(
+--      std::unique_ptr<GlobalCompilationDatabase> Base,
+--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
+-+  DelegatingCDB(const GlobalCompilationDatabase *Base);
+-+  DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base);
+- 
+-   std::optional<tooling::CompileCommand>
+-   getCompileCommand(PathRef File) const override;
+-@@ -125,12 +117,6 @@
+-     // Only look for a compilation database in this one fixed directory.
+-     // FIXME: fold this into config/context mechanism.
+-     std::optional<Path> CompileCommandsDir;
+--    // Working directory for fallback commands
+--    // If unset, parent directory of file should be used
+--    std::optional<std::string> FallbackWorkingDirectory;
+--
+--    void applyFallbackWorkingDirectory(
+--        std::optional<std::string> FallbackWorkingDirectory);
+-   };
+- 
+-   DirectoryBasedGlobalCompilationDatabase(const Options &Opts);
+-@@ -208,11 +194,9 @@
+-   // Base may be null, in which case no entries are inherited.
+-   // FallbackFlags are added to the fallback compile command.
+-   // Adjuster is applied to all commands, fallback or not.
+--  OverlayCDB(
+--      const GlobalCompilationDatabase *Base,
+--      std::vector<std::string> FallbackFlags = {},
+--      CommandMangler Mangler = nullptr,
+--      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
+-+  OverlayCDB(const GlobalCompilationDatabase *Base,
+-+             std::vector<std::string> FallbackFlags = {},
+-+             CommandMangler Mangler = nullptr);
+- 
+-   std::optional<tooling::CompileCommand>
+-   getCompileCommand(PathRef File) const override;
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
+---- a/clang-tools-extra/clangd/tool/Check.cpp
+-+++ b/clang-tools-extra/clangd/tool/Check.cpp
+-@@ -169,8 +169,6 @@
+-   bool buildCommand(const ThreadsafeFS &TFS) {
+-     log("Loading compilation database...");
+-     DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
+--    if (Opts.StrongWorkspaceMode)
+--      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
+-     CDBOpts.CompileCommandsDir =
+-         Config::current().CompileFlags.CDBSearch.FixedCDBPath;
+-     BaseCDB =
+-@@ -180,10 +178,8 @@
+-         getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs));
+-     if (Opts.ResourceDir)
+-       Mangler.ResourceDir = *Opts.ResourceDir;
+--
+-     CDB = std::make_unique<OverlayCDB>(
+--        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler),
+--        CDBOpts.FallbackWorkingDirectory);
+-+        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler));
+- 
+-     if (auto TrueCmd = CDB->getCompileCommand(File)) {
+-       Cmd = std::move(*TrueCmd);
+-@@ -506,7 +502,7 @@
+-                  config::DiagnosticCallback Diag) const override {
+-       config::Fragment F;
+-       // If we're timing clang-tidy checks, implicitly disabling the slow ones
+--      // is counterproductive!
+-+      // is counterproductive! 
+-       if (CheckTidyTime.getNumOccurrences())
+-         F.Diagnostics.ClangTidy.FastCheckFilter.emplace("None");
+-       return {std::move(F).compile(Diag)};
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
+---- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
+-+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
+-@@ -500,17 +500,6 @@
+-     init(true),
+- };
+- 
+--opt<bool> StrongWorkspaceMode{
+--    "strong-workspace-mode",
+--    cat(Features),
+--    desc("An alternate mode of operation for clangd, where the clangd instance "
+--         "is used to edit a single workspace.\n"
+--         "When enabled, fallback commands use the workspace directory as their "
+--         "working directory instead of the parent folder."),
+--    init(false),
+--    Hidden,
+--};
+--
+- opt<bool> UseDirtyHeaders{"use-dirty-headers", cat(Misc),
+-                           desc("Use files open in the editor when parsing "
+-                                "headers instead of reading from the disk"),
+-@@ -918,7 +907,6 @@
+-   }
+-   if (!ResourceDir.empty())
+-     Opts.ResourceDir = ResourceDir;
+--  Opts.StrongWorkspaceMode = StrongWorkspaceMode;
+-   Opts.BuildDynamicSymbolIndex = true;
+- #if CLANGD_ENABLE_REMOTE
+-   if (RemoteIndexAddress.empty() != ProjectRoot.empty()) {
+-diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
+---- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
+-+++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
+-@@ -55,20 +55,6 @@
+-                                            testPath("foo/bar")));
+- }
+- 
+--TEST(GlobalCompilationDatabaseTest, FallbackWorkingDirectory) {
+--  MockFS TFS;
+--  DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
+--  CDBOpts.applyFallbackWorkingDirectory(testPath("foo"));
+--  EXPECT_EQ(CDBOpts.FallbackWorkingDirectory, testPath("foo"));
+--
+--  DirectoryBasedGlobalCompilationDatabase DB(CDBOpts);
+--  auto Cmd = DB.getFallbackCommand(testPath("foo/src/bar.cc"));
+--  EXPECT_EQ(Cmd.Directory, testPath("foo"));
+--  EXPECT_THAT(Cmd.CommandLine,
+--              ElementsAre("clang", testPath("foo/src/bar.cc")));
+--  EXPECT_EQ(Cmd.Output, "");
+--}
+--
+- static tooling::CompileCommand cmd(llvm::StringRef File, llvm::StringRef Arg) {
+-   return tooling::CompileCommand(
+-       testRoot(), File, {"clang", std::string(Arg), std::string(File)}, "");
+-diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
+---- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+-+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
+-@@ -618,8 +618,6 @@
+- 
+-   DenseSet<const MachineBasicBlock *> DirtyBBs;
+-   for (MachineBasicBlock &MBB : MF) {
+--    if (!MDT->isReachableFromEntry(&MBB))
+--      continue;
+-     if (MBB.isEHPad()) {
+-       DirtyBBs.insert(&MBB);
+-       continue;
+-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+-@@ -708,53 +708,6 @@
+-   return 2;
+- }
+- 
+--bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
+--                               const TargetInstrInfo &TII) {
+--  for (MachineInstr &MI : MBB->terminators()) {
+--    unsigned Opc = MI.getOpcode();
+--    switch (Opc) {
+--    case AArch64::CBZW:
+--    case AArch64::CBZX:
+--    case AArch64::TBZW:
+--    case AArch64::TBZX:
+--      // CBZ/TBZ with WZR/XZR -> unconditional B
+--      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+--          MI.getOperand(0).getReg() == AArch64::XZR) {
+--        DEBUG_WITH_TYPE("optimizeTerminators",
+--                        dbgs() << "Removing always taken branch: " << MI);
+--        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+--        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
+--        for (auto *S : Succs)
+--          if (S != Target)
+--            MBB->removeSuccessor(S);
+--        DebugLoc DL = MI.getDebugLoc();
+--        while (MBB->rbegin() != &MI)
+--          MBB->rbegin()->eraseFromParent();
+--        MI.eraseFromParent();
+--        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
+--        return true;
+--      }
+--      break;
+--    case AArch64::CBNZW:
+--    case AArch64::CBNZX:
+--    case AArch64::TBNZW:
+--    case AArch64::TBNZX:
+--      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
+--      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+--          MI.getOperand(0).getReg() == AArch64::XZR) {
+--        DEBUG_WITH_TYPE("optimizeTerminators",
+--                        dbgs() << "Removing never taken branch: " << MI);
+--        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+--        MI.getParent()->removeSuccessor(Target);
+--        MI.eraseFromParent();
+--        return true;
+--      }
+--      break;
+--    }
+--  }
+--  return false;
+--}
+--
+- // Find the original register that VReg is copied from.
+- static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+-   while (Register::isVirtualRegister(VReg)) {
+-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+---- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+-+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+-@@ -705,8 +705,6 @@
+-                               unsigned *OutUnscaledOp = nullptr,
+-                               int64_t *EmittableOffset = nullptr);
+- 
+--bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
+--
+- static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
+- 
+- static inline bool isCondBranchOpcode(int Opc) {
+-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+---- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+-+++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+-@@ -14,7 +14,6 @@
+- //===----------------------------------------------------------------------===//
+- 
+- #include "AArch64.h"
+--#include "AArch64InstrInfo.h"
+- #include "llvm/CodeGen/MachineFunctionPass.h"
+- #include "llvm/CodeGen/MachineInstrBuilder.h"
+- #include "llvm/CodeGen/TargetInstrInfo.h"
+-@@ -46,6 +45,51 @@
+-                 "AArch64 Redundant Conditional Branch Elimination pass", false,
+-                 false)
+- 
+-+static bool optimizeTerminators(MachineBasicBlock *MBB,
+-+                                const TargetInstrInfo &TII) {
+-+  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
+-+    unsigned Opc = MI.getOpcode();
+-+    switch (Opc) {
+-+    case AArch64::CBZW:
+-+    case AArch64::CBZX:
+-+    case AArch64::TBZW:
+-+    case AArch64::TBZX:
+-+      // CBZ/TBZ with WZR/XZR -> unconditional B
+-+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+-+          MI.getOperand(0).getReg() == AArch64::XZR) {
+-+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
+-+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+-+        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
+-+        for (auto *S : Succs)
+-+          if (S != Target)
+-+            MBB->removeSuccessor(S);
+-+        DebugLoc DL = MI.getDebugLoc();
+-+        while (MBB->rbegin() != &MI)
+-+          MBB->rbegin()->eraseFromParent();
+-+        MI.eraseFromParent();
+-+        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
+-+        return true;
+-+      }
+-+      break;
+-+    case AArch64::CBNZW:
+-+    case AArch64::CBNZX:
+-+    case AArch64::TBNZW:
+-+    case AArch64::TBNZX:
+-+      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
+-+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+-+          MI.getOperand(0).getReg() == AArch64::XZR) {
+-+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
+-+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+-+        MI.getParent()->removeSuccessor(Target);
+-+        MI.eraseFromParent();
+-+        return true;
+-+      }
+-+      break;
+-+    }
+-+  }
+-+  return false;
+-+}
+-+
+- bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
+-   if (skipFunction(MF.getFunction()))
+-     return false;
+-diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+---- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+-+++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+-@@ -50,7 +50,6 @@
+- //        to use WZR/XZR directly in some cases.
+- //===----------------------------------------------------------------------===//
+- #include "AArch64.h"
+--#include "AArch64InstrInfo.h"
+- #include "llvm/ADT/SetVector.h"
+- #include "llvm/ADT/Statistic.h"
+- #include "llvm/ADT/iterator_range.h"
+-@@ -476,7 +475,6 @@
+-     return false;
+-   TRI = MF.getSubtarget().getRegisterInfo();
+-   MRI = &MF.getRegInfo();
+--  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+- 
+-   // Resize the clobbered and used register unit trackers.  We do this once per
+-   // function.
+-@@ -486,10 +484,8 @@
+-   OptBBUsedRegs.init(*TRI);
+- 
+-   bool Changed = false;
+--  for (MachineBasicBlock &MBB : MF) {
+--    Changed |= optimizeTerminators(&MBB, TII);
+-+  for (MachineBasicBlock &MBB : MF)
+-     Changed |= optimizeBlock(&MBB);
+--  }
+-   return Changed;
+- }
+- 
+-diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+---- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+-+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+-@@ -1827,12 +1827,8 @@
+-     // profile info.
+-     CostTooHigh =
+-         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
+--    if (CostTooHigh) {
+--      // Mark runtime checks as never succeeding when they exceed the threshold.
+--      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+--      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+-+    if (CostTooHigh)
+-       return;
+--    }
+- 
+-     BasicBlock *LoopHeader = L->getHeader();
+-     BasicBlock *Preheader = L->getLoopPreheader();
+-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+---- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+-+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+-@@ -735,15 +735,21 @@
+- ; ENABLE-NEXT:    .cfi_offset w29, -16
+- ; ENABLE-NEXT:    .cfi_offset w19, -24
+- ; ENABLE-NEXT:    .cfi_offset w20, -32
+-+; ENABLE-NEXT:  ; %bb.1: ; %if.then
+- ; ENABLE-NEXT:    sub x19, sp, #16
+- ; ENABLE-NEXT:    mov sp, x19
+- ; ENABLE-NEXT:    mov w20, wzr
+--; ENABLE-NEXT:  LBB10_1: ; %for.body
+-+; ENABLE-NEXT:  LBB10_2: ; %for.body
+- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+- ; ENABLE-NEXT:    bl _something
+- ; ENABLE-NEXT:    add w20, w0, w20
+- ; ENABLE-NEXT:    str w20, [x19]
+--; ENABLE-NEXT:    b LBB10_1
+-+; ENABLE-NEXT:    b LBB10_2
+-+; ENABLE-NEXT:  ; %bb.3: ; %if.end
+-+; ENABLE-NEXT:    sub sp, x29, #16
+-+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+-+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+-+; ENABLE-NEXT:    ret
+- ;
+- ; DISABLE-LABEL: infiniteloop:
+- ; DISABLE:       ; %bb.0: ; %entry
+-@@ -755,15 +761,21 @@
+- ; DISABLE-NEXT:    .cfi_offset w29, -16
+- ; DISABLE-NEXT:    .cfi_offset w19, -24
+- ; DISABLE-NEXT:    .cfi_offset w20, -32
+-+; DISABLE-NEXT:  ; %bb.1: ; %if.then
+- ; DISABLE-NEXT:    sub x19, sp, #16
+- ; DISABLE-NEXT:    mov sp, x19
+- ; DISABLE-NEXT:    mov w20, wzr
+--; DISABLE-NEXT:  LBB10_1: ; %for.body
+-+; DISABLE-NEXT:  LBB10_2: ; %for.body
+- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+- ; DISABLE-NEXT:    bl _something
+- ; DISABLE-NEXT:    add w20, w0, w20
+- ; DISABLE-NEXT:    str w20, [x19]
+--; DISABLE-NEXT:    b LBB10_1
+-+; DISABLE-NEXT:    b LBB10_2
+-+; DISABLE-NEXT:  ; %bb.3: ; %if.end
+-+; DISABLE-NEXT:    sub sp, x29, #16
+-+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+-+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+-+; DISABLE-NEXT:    ret
+- entry:
+-   br i1 undef, label %if.then, label %if.end
+- 
+-@@ -794,10 +806,11 @@
+- ; ENABLE-NEXT:    .cfi_offset w29, -16
+- ; ENABLE-NEXT:    .cfi_offset w19, -24
+- ; ENABLE-NEXT:    .cfi_offset w20, -32
+-+; ENABLE-NEXT:  ; %bb.1: ; %if.then
+- ; ENABLE-NEXT:    sub x8, sp, #16
+- ; ENABLE-NEXT:    mov sp, x8
+- ; ENABLE-NEXT:    mov w9, wzr
+--; ENABLE-NEXT:  LBB11_1: ; %for.body
+-+; ENABLE-NEXT:  LBB11_2: ; %for.body
+- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+- ; ENABLE-NEXT:    ; InlineAsm Start
+- ; ENABLE-NEXT:    mov x10, #0 ; =0x0
+-@@ -808,7 +821,12 @@
+- ; ENABLE-NEXT:    ; InlineAsm Start
+- ; ENABLE-NEXT:    nop
+- ; ENABLE-NEXT:    ; InlineAsm End
+--; ENABLE-NEXT:    b LBB11_1
+-+; ENABLE-NEXT:    b LBB11_2
+-+; ENABLE-NEXT:  ; %bb.3: ; %if.end
+-+; ENABLE-NEXT:    sub sp, x29, #16
+-+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+-+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+-+; ENABLE-NEXT:    ret
+- ;
+- ; DISABLE-LABEL: infiniteloop2:
+- ; DISABLE:       ; %bb.0: ; %entry
+-@@ -820,10 +838,11 @@
+- ; DISABLE-NEXT:    .cfi_offset w29, -16
+- ; DISABLE-NEXT:    .cfi_offset w19, -24
+- ; DISABLE-NEXT:    .cfi_offset w20, -32
+-+; DISABLE-NEXT:  ; %bb.1: ; %if.then
+- ; DISABLE-NEXT:    sub x8, sp, #16
+- ; DISABLE-NEXT:    mov sp, x8
+- ; DISABLE-NEXT:    mov w9, wzr
+--; DISABLE-NEXT:  LBB11_1: ; %for.body
+-+; DISABLE-NEXT:  LBB11_2: ; %for.body
+- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+- ; DISABLE-NEXT:    ; InlineAsm Start
+- ; DISABLE-NEXT:    mov x10, #0 ; =0x0
+-@@ -834,7 +853,12 @@
+- ; DISABLE-NEXT:    ; InlineAsm Start
+- ; DISABLE-NEXT:    nop
+- ; DISABLE-NEXT:    ; InlineAsm End
+--; DISABLE-NEXT:    b LBB11_1
+-+; DISABLE-NEXT:    b LBB11_2
+-+; DISABLE-NEXT:  ; %bb.3: ; %if.end
+-+; DISABLE-NEXT:    sub sp, x29, #16
+-+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+-+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+-+; DISABLE-NEXT:    ret
+- entry:
+-   br i1 undef, label %if.then, label %if.end
+- 
+-@@ -865,43 +889,49 @@
+- define void @infiniteloop3() {
+- ; ENABLE-LABEL: infiniteloop3:
+- ; ENABLE:       ; %bb.0: ; %entry
+-+; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
+- ; ENABLE-NEXT:    mov x8, xzr
+- ; ENABLE-NEXT:    mov x9, xzr
+- ; ENABLE-NEXT:    mov x11, xzr
+--; ENABLE-NEXT:    b LBB12_2
+--; ENABLE-NEXT:  LBB12_1: ; %loop2b
+--; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
+-+; ENABLE-NEXT:    b LBB12_3
+-+; ENABLE-NEXT:  LBB12_2: ; %loop2b
+-+; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
+- ; ENABLE-NEXT:    str x10, [x11]
+- ; ENABLE-NEXT:    mov x11, x10
+--; ENABLE-NEXT:  LBB12_2: ; %loop1
+-+; ENABLE-NEXT:  LBB12_3: ; %loop1
+- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+- ; ENABLE-NEXT:    mov x10, x9
+- ; ENABLE-NEXT:    ldr x9, [x8]
+--; ENABLE-NEXT:    cbnz x8, LBB12_1
+--; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
+-+; ENABLE-NEXT:    cbnz x8, LBB12_2
+-+; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
+- ; ENABLE-NEXT:    mov x8, x10
+- ; ENABLE-NEXT:    mov x11, x10
+--; ENABLE-NEXT:    b LBB12_2
+-+; ENABLE-NEXT:    b LBB12_3
+-+; ENABLE-NEXT:  ; %bb.5: ; %end
+-+; ENABLE-NEXT:    ret
+- ;
+- ; DISABLE-LABEL: infiniteloop3:
+- ; DISABLE:       ; %bb.0: ; %entry
+-+; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
+- ; DISABLE-NEXT:    mov x8, xzr
+- ; DISABLE-NEXT:    mov x9, xzr
+- ; DISABLE-NEXT:    mov x11, xzr
+--; DISABLE-NEXT:    b LBB12_2
+--; DISABLE-NEXT:  LBB12_1: ; %loop2b
+--; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
+-+; DISABLE-NEXT:    b LBB12_3
+-+; DISABLE-NEXT:  LBB12_2: ; %loop2b
+-+; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
+- ; DISABLE-NEXT:    str x10, [x11]
+- ; DISABLE-NEXT:    mov x11, x10
+--; DISABLE-NEXT:  LBB12_2: ; %loop1
+-+; DISABLE-NEXT:  LBB12_3: ; %loop1
+- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
+- ; DISABLE-NEXT:    mov x10, x9
+- ; DISABLE-NEXT:    ldr x9, [x8]
+--; DISABLE-NEXT:    cbnz x8, LBB12_1
+--; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
+-+; DISABLE-NEXT:    cbnz x8, LBB12_2
+-+; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
+- ; DISABLE-NEXT:    mov x8, x10
+- ; DISABLE-NEXT:    mov x11, x10
+--; DISABLE-NEXT:    b LBB12_2
+-+; DISABLE-NEXT:    b LBB12_3
+-+; DISABLE-NEXT:  ; %bb.5: ; %end
+-+; DISABLE-NEXT:    ret
+- entry:
+-   br i1 undef, label %loop2a, label %body
+- 
+-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+---- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+-+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+-@@ -8,14 +8,20 @@
+- define i8 @foo_optsize(i32 %v4) optsize {
+- ; CHECK-LABEL: foo_optsize:
+- ; CHECK:       // %bb.0: // %entry
+--; CHECK-NEXT:    cbnz w0, .LBB0_2
+--; CHECK-NEXT:  // %bb.1: // %b2
+--; CHECK-NEXT:    mov w0, #1 // =0x1
+-+; CHECK-NEXT:    b .LBB0_2
+-+; CHECK-NEXT:  .LBB0_1:
+-+; CHECK-NEXT:    mov w0, wzr
+- ; CHECK-NEXT:    ret
+- ; CHECK-NEXT:  .LBB0_2: // %b1
+--; CHECK-NEXT:    cmp w0, #1
+--; CHECK-NEXT:    mov w0, wzr
+-+; CHECK-NEXT:    cbnz w0, .LBB0_4
+-+; CHECK-NEXT:  // %bb.3: // %b2
+-+; CHECK-NEXT:    mov w0, #1 // =0x1
+- ; CHECK-NEXT:    ret
+-+; CHECK-NEXT:  .LBB0_4: // %b1
+-+; CHECK-NEXT:    cmp w0, #1
+-+; CHECK-NEXT:    b.ne .LBB0_1
+-+; CHECK-NEXT:  // %bb.5: // %b3
+-+; CHECK-NEXT:    b .LBB0_1
+- entry:
+-   %v2 = icmp eq i32 0, 0
+-   br i1 %v2, label %b1, label %b4
+-@@ -41,14 +47,20 @@
+- define i8 @foo_optspeed(i32 %v4) {
+- ; CHECK-LABEL: foo_optspeed:
+- ; CHECK:       // %bb.0: // %entry
+--; CHECK-NEXT:    cbnz w0, .LBB1_2
+--; CHECK-NEXT:  // %bb.1: // %b2
+--; CHECK-NEXT:    mov w0, #1 // =0x1
+-+; CHECK-NEXT:    b .LBB1_2
+-+; CHECK-NEXT:  .LBB1_1:
+-+; CHECK-NEXT:    mov w0, wzr
+- ; CHECK-NEXT:    ret
+- ; CHECK-NEXT:  .LBB1_2: // %b1
+--; CHECK-NEXT:    cmp w0, #1
+--; CHECK-NEXT:    mov w0, wzr
+-+; CHECK-NEXT:    cbnz w0, .LBB1_4
+-+; CHECK-NEXT:  // %bb.3: // %b2
+-+; CHECK-NEXT:    mov w0, #1 // =0x1
+- ; CHECK-NEXT:    ret
+-+; CHECK-NEXT:  .LBB1_4: // %b1
+-+; CHECK-NEXT:    cmp w0, #1
+-+; CHECK-NEXT:    b.ne .LBB1_1
+-+; CHECK-NEXT:  // %bb.5: // %b3
+-+; CHECK-NEXT:    b .LBB1_1
+- entry:
+-   %v2 = icmp eq i32 0, 0
+-   br i1 %v2, label %b1, label %b4
+-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+---- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+-+++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+-@@ -21,8 +21,10 @@
+-   ; CHECK-NEXT:   B %bb.3
+-   ; CHECK-NEXT: {{  $}}
+-   ; CHECK-NEXT: bb.1.bb:
+-+  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
+-   ; CHECK-NEXT:   liveins: $w0, $lr
+-   ; CHECK-NEXT: {{  $}}
+-+  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
+-   ; CHECK-NEXT:   B %bb.2
+-   ; CHECK-NEXT: {{  $}}
+-   ; CHECK-NEXT: bb.2.bb1:
+-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
+---- a/llvm/test/CodeGen/AArch64/pr164181.ll
+-+++ b/llvm/test/CodeGen/AArch64/pr164181.ll
+-@@ -29,11 +29,11 @@
+- ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
+- ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
+- ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
+--; CHECK-NEXT:    tbz w5, #0, .LBB0_40
+-+; CHECK-NEXT:    tbz w5, #0, .LBB0_43
+- ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
+- ; CHECK-NEXT:    ldr x4, [sp, #312]
+- ; CHECK-NEXT:    ldr x14, [sp, #280]
+--; CHECK-NEXT:    tbz w0, #0, .LBB0_39
+-+; CHECK-NEXT:    tbz w0, #0, .LBB0_42
+- ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
+- ; CHECK-NEXT:    ldrb w8, [sp, #368]
+- ; CHECK-NEXT:    ldrb w12, [sp, #256]
+-@@ -92,7 +92,7 @@
+- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
+- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+- ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
+- ; CHECK-NEXT:    mov x12, x24
+- ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
+-@@ -117,7 +117,7 @@
+- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
+- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+- ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
+- ; CHECK-NEXT:    cmn x24, #30
+- ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
+-@@ -142,7 +142,7 @@
+- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
+- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+- ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
+- ; CHECK-NEXT:    mov w14, #1152 // =0x480
+- ; CHECK-NEXT:    mov w24, #1 // =0x1
+-@@ -176,7 +176,7 @@
+- ; CHECK-NEXT:    // => This Loop Header: Depth=4
+- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
+- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
+--; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+-+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
+- ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
+- ; CHECK-NEXT:    and w8, w8, w8, asr #31
+- ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
+-@@ -281,23 +281,31 @@
+- ; CHECK-NEXT:    mov x24, xzr
+- ; CHECK-NEXT:    mul w12, w12, w22
+- ; CHECK-NEXT:    mov x22, x5
+--; CHECK-NEXT:    tbz w0, #0, .LBB0_33
+--; CHECK-NEXT:  .LBB0_28: // %if.then222.us
+-+; CHECK-NEXT:    tbz w0, #0, .LBB0_36
+-+; CHECK-NEXT:  .LBB0_28: // %for.body194.us
+- ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
+- ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
+- ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
+- ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
+- ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
+-+; CHECK-NEXT:  // %bb.29: // %if.then222.us
+-+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+- ; CHECK-NEXT:    adrp x27, :got:var_32
+- ; CHECK-NEXT:    ldur w8, [x19, #-12]
+- ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
+- ; CHECK-NEXT:    strh w8, [x27]
+- ; CHECK-NEXT:    sxtb w8, w25
+--; CHECK-NEXT:    strb w3, [x16]
+- ; CHECK-NEXT:    bic w25, w8, w8, asr #31
+-+; CHECK-NEXT:    b .LBB0_31
+-+; CHECK-NEXT:    .p2align 5, , 16
+-+; CHECK-NEXT:  // %bb.30:
+-+; CHECK-NEXT:    mov w25, wzr
+-+; CHECK-NEXT:  .LBB0_31: // %if.end239.us
+-+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+-+; CHECK-NEXT:    strb w3, [x16]
+- ; CHECK-NEXT:    tst w13, #0xff
+--; CHECK-NEXT:    b.eq .LBB0_30
+--; CHECK-NEXT:  // %bb.29: // %if.then254.us
+-+; CHECK-NEXT:    b.eq .LBB0_33
+-+; CHECK-NEXT:  // %bb.32: // %if.then254.us
+- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+- ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
+- ; CHECK-NEXT:    adrp x27, :got:var_35
+-@@ -306,7 +314,7 @@
+- ; CHECK-NEXT:    csel x8, xzr, x7, eq
+- ; CHECK-NEXT:    str x8, [x27]
+- ; CHECK-NEXT:    strh w1, [x17]
+--; CHECK-NEXT:  .LBB0_30: // %if.end282.us
+-+; CHECK-NEXT:  .LBB0_33: // %if.end282.us
+- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+- ; CHECK-NEXT:    orr x27, x24, x4
+- ; CHECK-NEXT:    adrp x8, :got:var_39
+-@@ -317,14 +325,14 @@
+- ; CHECK-NEXT:    str x8, [x18]
+- ; CHECK-NEXT:    mov w8, #1 // =0x1
+- ; CHECK-NEXT:    cbnz x2, .LBB0_27
+--; CHECK-NEXT:  // %bb.31: // %if.then327.us
+-+; CHECK-NEXT:  // %bb.34: // %if.then327.us
+- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+- ; CHECK-NEXT:    cbz w8, .LBB0_25
+--; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
+-+; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
+- ; CHECK-NEXT:    mov w4, wzr
+- ; CHECK-NEXT:    b .LBB0_26
+- ; CHECK-NEXT:    .p2align 5, , 16
+--; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
+-+; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
+- ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
+- ; CHECK-NEXT:    mov w3, #1152 // =0x480
+- ; CHECK-NEXT:    mov x22, xzr
+-@@ -335,24 +343,24 @@
+- ; CHECK-NEXT:    madd x14, x14, x3, x11
+- ; CHECK-NEXT:    mov w28, w30
+- ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
+--; CHECK-NEXT:    b .LBB0_36
+-+; CHECK-NEXT:    b .LBB0_39
+- ; CHECK-NEXT:    .p2align 5, , 16
+--; CHECK-NEXT:  .LBB0_34: // %if.then466.us
+--; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+-+; CHECK-NEXT:  .LBB0_37: // %if.then466.us
+-+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
+- ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
+- ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
+- ; CHECK-NEXT:    sxtb w4, w4
+- ; CHECK-NEXT:    bic w4, w4, w4, asr #31
+- ; CHECK-NEXT:    str x3, [x28]
+- ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
+--; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
+--; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+-+; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
+-+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
+- ; CHECK-NEXT:    add x22, x22, #1
+- ; CHECK-NEXT:    add x27, x27, #1
+- ; CHECK-NEXT:    mov w28, wzr
+- ; CHECK-NEXT:    cmp x27, #0
+- ; CHECK-NEXT:    b.hs .LBB0_9
+--; CHECK-NEXT:  .LBB0_36: // %for.body380.us
+-+; CHECK-NEXT:  .LBB0_39: // %for.body380.us
+- ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
+- ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
+- ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
+-@@ -364,18 +372,18 @@
+- ; CHECK-NEXT:    strh w28, [x11]
+- ; CHECK-NEXT:    csel w28, w21, w3, ne
+- ; CHECK-NEXT:    str w28, [x20]
+--; CHECK-NEXT:    cbz x15, .LBB0_35
+--; CHECK-NEXT:  // %bb.37: // %if.then436.us
+--; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+-+; CHECK-NEXT:    cbz x15, .LBB0_38
+-+; CHECK-NEXT:  // %bb.40: // %if.then436.us
+-+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
+- ; CHECK-NEXT:    ldrh w28, [x14]
+--; CHECK-NEXT:    cbnz w28, .LBB0_34
+--; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
+-+; CHECK-NEXT:    cbnz w28, .LBB0_37
+-+; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
+- ; CHECK-NEXT:    mov w4, wzr
+--; CHECK-NEXT:    b .LBB0_35
+--; CHECK-NEXT:  .LBB0_39: // %for.body41
+-+; CHECK-NEXT:    b .LBB0_38
+-+; CHECK-NEXT:  .LBB0_42: // %for.body41
+- ; CHECK-NEXT:    strb wzr, [x4]
+- ; CHECK-NEXT:    strb wzr, [x14]
+--; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
+-+; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
+- ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
+- ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
+- ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
+-diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
+---- a/llvm/test/CodeGen/AArch64/pr166870.ll
+-+++ b/llvm/test/CodeGen/AArch64/pr166870.ll
+-@@ -26,11 +26,12 @@
+- ; CHECK-NEXT:    mov x21, x1
+- ; CHECK-NEXT:    bl baz
+- ; CHECK-NEXT:    mov w0, #0 // =0x0
+-+; CHECK-NEXT:  // %bb.5: // %bb6
+- ; CHECK-NEXT:    mov w10, #1 // =0x1
+-+; CHECK-NEXT:    cbnz w10, .LBB0_11
+-+; CHECK-NEXT:  // %bb.6: // %bb7
+- ; CHECK-NEXT:    cbnz w10, .LBB0_10
+--; CHECK-NEXT:  // %bb.5: // %bb7
+--; CHECK-NEXT:    cbnz w10, .LBB0_9
+--; CHECK-NEXT:  // %bb.6: // %bb8
+-+; CHECK-NEXT:  // %bb.7: // %bb8
+- ; CHECK-NEXT:    mov x8, x21
+- ; CHECK-NEXT:    mov x9, x20
+- ; CHECK-NEXT:    mov w20, #0 // =0x0
+-@@ -38,17 +39,17 @@
+- ; CHECK-NEXT:    mov x21, x9
+- ; CHECK-NEXT:    mov w8, w8
+- ; CHECK-NEXT:    mov x22, x8
+--; CHECK-NEXT:  .LBB0_7: // %bb10
+-+; CHECK-NEXT:  .LBB0_8: // %bb10
+- ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+- ; CHECK-NEXT:    strb w20, [x19]
+--; CHECK-NEXT:    cbnz x21, .LBB0_7
+--; CHECK-NEXT:  // %bb.8: // %bb12
+--; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
+-+; CHECK-NEXT:    cbnz x21, .LBB0_8
+-+; CHECK-NEXT:  // %bb.9: // %bb12
+-+; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
+- ; CHECK-NEXT:    bl snork
+--; CHECK-NEXT:    cbnz x22, .LBB0_7
+--; CHECK-NEXT:  .LBB0_9:
+--; CHECK-NEXT:    mov w0, #0 // =0x0
+-+; CHECK-NEXT:    cbnz x22, .LBB0_8
+- ; CHECK-NEXT:  .LBB0_10:
+-+; CHECK-NEXT:    mov w0, #0 // =0x0
+-+; CHECK-NEXT:  .LBB0_11:
+- ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+- ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+- ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
+-diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+---- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+-+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+-@@ -71,21 +71,27 @@
+- ; CHECK-NEXT:    .cfi_def_cfa w29, 16
+- ; CHECK-NEXT:    .cfi_offset w30, -8
+- ; CHECK-NEXT:    .cfi_offset w29, -16
+-+; CHECK-NEXT:    .cfi_remember_state
+- ; CHECK-NEXT:    mov w8, #1 // =0x1
+--; CHECK-NEXT:    mov w9, #2 // =0x2
+- ; CHECK-NEXT:    stur xzr, [x29, #-8]
+--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+--; CHECK-NEXT:    ldur w8, [x29, #-8]
+--; CHECK-NEXT:    cbz w8, .LBB0_2
+-+; CHECK-NEXT:    b .LBB0_3
+- ; CHECK-NEXT:  // %bb.1:
+--; CHECK-NEXT:    mov w8, #1 // =0x1
+- ; CHECK-NEXT:    str w8, [sp, #16]
+--; CHECK-NEXT:    b .LBB0_3
+-+; CHECK-NEXT:    ldur w8, [x29, #-8]
+-+; CHECK-NEXT:    cbz w8, .LBB0_4
+- ; CHECK-NEXT:  .LBB0_2:
+-+; CHECK-NEXT:    .cfi_restore_state
+- ; CHECK-NEXT:    mov w8, #1 // =0x1
+--; CHECK-NEXT:    mov w9, #2 // =0x2
+--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-+; CHECK-NEXT:    str w8, [sp, #16]
+-+; CHECK-NEXT:    b .LBB0_5
+- ; CHECK-NEXT:  .LBB0_3:
+-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-+; CHECK-NEXT:    ldur w8, [x29, #-8]
+-+; CHECK-NEXT:    cbnz w8, .LBB0_2
+-+; CHECK-NEXT:  .LBB0_4:
+-+; CHECK-NEXT:    mov w8, #1 // =0x1
+-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-+; CHECK-NEXT:  .LBB0_5:
+- ; CHECK-NEXT:    mov w0, wzr
+- ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+- ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+-@@ -128,6 +134,7 @@
+- ;
+- ; CHECK-LABEL: OUTLINED_FUNCTION_0:
+- ; CHECK:       // %bb.0:
+-+; CHECK-NEXT:    mov w9, #2 // =0x2
+- ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
+- ; CHECK-NEXT:    mov w9, #3 // =0x3
+- ; CHECK-NEXT:    mov w8, #4 // =0x4
+-diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+---- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+-+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+-@@ -12,21 +12,27 @@
+- ; CHECK-NEXT:    .cfi_def_cfa w29, 16
+- ; CHECK-NEXT:    .cfi_offset w30, -8
+- ; CHECK-NEXT:    .cfi_offset w29, -16
+-+; CHECK-NEXT:    .cfi_remember_state
+- ; CHECK-NEXT:    mov w8, #1 // =0x1
+--; CHECK-NEXT:    mov w9, #2 // =0x2
+- ; CHECK-NEXT:    stur xzr, [x29, #-8]
+--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+--; CHECK-NEXT:    ldur w8, [x29, #-8]
+--; CHECK-NEXT:    cbz w8, .LBB0_2
+-+; CHECK-NEXT:    b .LBB0_3
+- ; CHECK-NEXT:  // %bb.1:
+--; CHECK-NEXT:    mov w8, #1 // =0x1
+- ; CHECK-NEXT:    str w8, [sp, #16]
+--; CHECK-NEXT:    b .LBB0_3
+-+; CHECK-NEXT:    ldur w8, [x29, #-8]
+-+; CHECK-NEXT:    cbz w8, .LBB0_4
+- ; CHECK-NEXT:  .LBB0_2:
+-+; CHECK-NEXT:    .cfi_restore_state
+- ; CHECK-NEXT:    mov w8, #1 // =0x1
+--; CHECK-NEXT:    mov w9, #2 // =0x2
+--; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-+; CHECK-NEXT:    str w8, [sp, #16]
+-+; CHECK-NEXT:    b .LBB0_5
+- ; CHECK-NEXT:  .LBB0_3:
+-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-+; CHECK-NEXT:    ldur w8, [x29, #-8]
+-+; CHECK-NEXT:    cbnz w8, .LBB0_2
+-+; CHECK-NEXT:  .LBB0_4:
+-+; CHECK-NEXT:    mov w8, #1 // =0x1
+-+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+-+; CHECK-NEXT:  .LBB0_5:
+- ; CHECK-NEXT:    mov w0, wzr
+- ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
+- ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+-diff -ruN --strip-trailing-cr a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+---- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+-+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+-@@ -2,23 +2,29 @@
+- ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
+- ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
+- 
+--; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
+--; no runtime check is generated and the loop should not be vectorized.
+-+; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
+-+; no runtime check is generated even though one is needed and !noalias
+-+; annotations are added.
+- define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
+- ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
+- ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+--; LIMIT0-NEXT:  [[ENTRY:.*]]:
+--; LIMIT0-NEXT:    br label %[[LOOP:.*]]
+--; LIMIT0:       [[LOOP]]:
+--; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+--; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
+-+; LIMIT0-NEXT:  [[ENTRY:.*:]]
+-+; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
+-+; LIMIT0:       [[VECTOR_PH]]:
+-+; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+-+; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+-+; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+-+; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
+-+; LIMIT0:       [[VECTOR_BODY]]:
+-+; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+- ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+--; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
+--; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+-+; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+-+; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+- ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+--; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+-+; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+-+; LIMIT0:       [[MIDDLE_BLOCK]]:
+-+; LIMIT0-NEXT:    br label %[[EXIT:.*]]
+- ; LIMIT0:       [[EXIT]]:
+--; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
+- ; LIMIT0-NEXT:    ret i16 [[TMP0]]
+- ;
+- ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
+-@@ -82,9 +88,14 @@
+- !3 = !{!"llvm.loop.vectorize.enable", i1 true}
+- 
+- ;.
+--; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+--; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
+--; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
+-+; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
+-+; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+-+; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+-+; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
+-+; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
+-+; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+-+; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+-+; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+- ;.
+- ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
+- ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+-diff -ruN --strip-trailing-cr a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+---- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+-+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+-@@ -1320,8 +1320,9 @@
+- }
+- 
+- template <typename T>
+--T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
+--                               StringRef entryType, uint64_t depth) {
+-+T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
+-+                               uint64_t index, StringRef entryType,
+-+                               uint64_t depth) {
+-   if (index >= entries.size()) {
+-     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
+-     return {};
 diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index dd3d4e4..e573782 100644
+index e573782..3c9c005 100644
 --- a/third_party/llvm/workspace.bzl
 +++ b/third_party/llvm/workspace.bzl
 @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
  
  def repo(name):
      """Imports LLVM."""
--    LLVM_COMMIT = "c6e23ab80753a01dce270f5f8a133fbec942315d"
--    LLVM_SHA256 = "5a6b8aacd2d87ce9c4456843a76d0a54fd7cd0ae788ed3f19e7487ecd2ce4326"
-+    LLVM_COMMIT = "87bf5ee23863bc0b467ee44b2184b2c134a98464"
-+    LLVM_SHA256 = "9d0bca271bfb266de8453cd34156741fd41f64b911f580262d187ce4d4d9b6d9"
+-    LLVM_COMMIT = "87bf5ee23863bc0b467ee44b2184b2c134a98464"
+-    LLVM_SHA256 = "9d0bca271bfb266de8453cd34156741fd41f64b911f580262d187ce4d4d9b6d9"
++    LLVM_COMMIT = "48d942c7158af43094db1b5e6c59c6e6fcf1b5aa"
++    LLVM_SHA256 = "6ce4ac276a4687625e9f57e53715285d99b60c6553e0cde4db9b7e74f2179f69"
  
      tf_http_archive(
          name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index bda3ea1501fc0e..230123596f420e 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "879ea7b95f957a2fc66da58b10d21390eb5f449a"
-    SHARDY_SHA256 = "222ec7b6c591888207d4de5795e03fe81ec94b3acb59a421ce02ff7ace38dc07"
+    SHARDY_COMMIT = "d9023f29bb8ad1fcb72b8183de06f8bc86fc195d"
+    SHARDY_SHA256 = "2b8951f25c0c1e6c1569b842ef3f68a3cefdcc2a1a53eb6f4970d5bf1df91eb5"
 
     tf_http_archive(
         name = "shardy",

From 5e534dd15d0287afcb5d08b04192896dfc4d5eb1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 10:20:39 -0800
Subject: [PATCH 183/753] Reverts a05e35e330f70fe1920a07573e709247b09ddb15

PiperOrigin-RevId: 843276404
---
 .../xla/hlo/analysis/hlo_dataflow_analysis.cc | 28 +------------------
 1 file changed, 1 insertion(+), 27 deletions(-)

diff --git a/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc b/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc
index 893c233f9bd1c2..ca00349f4c25d5 100644
--- a/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc
+++ b/third_party/xla/xla/hlo/analysis/hlo_dataflow_analysis.cc
@@ -44,7 +44,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/layout.h"
 #include "xla/map_util.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/hlo_value.h"
@@ -1617,26 +1616,6 @@ HloDataflowAnalysis::GetInPlaceInputOutputPairs(
   return alias_info->GetInPlaceInputOutputPairs(instruction);
 }
 
-// Returns true if the instruction is a fusion consisting of a single copy which
-// changes tiling. This is handled by the emitters and effectively are no-ops.
-static bool IsChangeTilingCopyFusion(HloInstruction* instr) {
-  if (!instr->parent()->IsFusionComputation() ||
-      instr->opcode() != HloOpcode::kFusion ||
-      instr->called_computations().size() != 1 || instr->operand_count() != 1) {
-    return false;
-  }
-  // These copy fusions should only change tiling (and sometimes memory space).
-  HloInstruction* fusion_root = instr->fused_expression_root();
-  const Layout& operand_layout = fusion_root->operand(0)->shape().layout();
-  const Layout& output_layout = fusion_root->shape().layout();
-  absl::Span<const Tile> operand_tiles = operand_layout.tiles();
-  absl::Span<const Tile> output_tiles = output_layout.tiles();
-  return fusion_root->opcode() == HloOpcode::kCopy &&
-         Layout::Equal().IgnoreTiles().IgnoreMemorySpace()(operand_layout,
-                                                           output_layout) &&
-         operand_tiles != output_tiles;
-}
-
 bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     HloInstruction* operand, const ShapeIndex& operand_index,
     HloInstruction* user, const ShapeIndex& user_index,
@@ -1652,12 +1631,7 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
   const Shape& user_subshape =
       ShapeUtil::GetSubshape(user->shape(), user_index);
 
-  // During tiling assignment, we can add no-op instructions which appear to
-  // change tiling (and memory space) of the operand, but don't.
-  if (IsChangeTilingCopyFusion(user) || IsChangeTilingCopyFusion(operand)) {
-    return true;
-  }
-  const bool shapes_equal = ShapeUtil::Equal(operand_subshape, user_subshape);
+  auto shapes_equal = ShapeUtil::Equal(operand_subshape, user_subshape);
   // Check that operand and user emit the same shape and layout.
   if (shapes_equal) {
     // Must-alias relationship returns true for in-place operations (DUS and DUS

From adc9ab4117fe1ebdc85964c2c7e141e0088f4035 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 11 Dec 2025 10:57:53 -0800
Subject: [PATCH 184/753] Fix replicated execution defaults in HloRunnerPjRt.

PiperOrigin-RevId: 843293521
---
 third_party/xla/xla/service/BUILD              |  1 +
 third_party/xla/xla/service/hlo_runner_pjrt.cc | 13 ++++++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index e5e8114809599e..a18e1b9660700f 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4536,6 +4536,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/log:die_if_null",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index 4d7b4105b32f69..40d43d98357280 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/base/nullability.h"
 #include "absl/functional/any_invocable.h"
+#include "absl/log/check.h"
 #include "absl/log/die_if_null.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
@@ -560,12 +561,9 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
     const HloRunnerInterface::ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
-  module->mutable_config().set_replica_count(options.num_devices);
-
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<OpaqueExecutable> executable,
       CreateExecutable(std::move(module), options.run_hlo_passes));
-
   return ExecuteReplicated(executable.get(), options, device_assignment);
 }
 
@@ -603,8 +601,13 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
     absl::AnyInvocable<const Literal*(int64_t, int64_t)> argument_provider,
     const HloRunnerInterface::ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
-  TF_RET_CHECK(device_assignment->computation_count() == 1)
-      << "Only single-computation execution is supported.";
+  std::optional<DeviceAssignment> default_device_assignment = std::nullopt;
+  if (device_assignment == nullptr) {
+    TF_ASSIGN_OR_RETURN(default_device_assignment,
+                        GetDefaultDeviceAssignment(options.num_devices, 1));
+    device_assignment = &*default_device_assignment;
+  }
+  CHECK_NE(device_assignment, nullptr);
   return ExecuteReplicatedImpl(
       [&](absl::Span<const std::vector<PjRtBuffer*>> argument_buffer_slices,
           absl::AnyInvocable<OpaqueExecutable*(int64_t)>

From 41f9c3d1cf0c163cf1e28d3b2977b18504e90882 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Thu, 11 Dec 2025 11:34:22 -0800
Subject: [PATCH 185/753] Add Shape to CopyThunk buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 843308614
---
 .../xla/xla/backends/gpu/codegen/BUILD        |   1 +
 .../xla/xla/backends/gpu/codegen/copy.cc      |   7 +-
 .../xla/xla/backends/gpu/codegen/custom.cc    |   4 +-
 .../xla/xla/backends/gpu/runtime/BUILD        |   5 +
 .../gpu/runtime/command_buffer_cmd.cc         |  19 +-
 .../backends/gpu/runtime/command_buffer_cmd.h |   9 +-
 .../gpu/runtime/command_buffer_cmd_test.cc    |  14 +-
 .../command_buffer_conversion_pass_test.cc    |   8 +-
 .../gpu/runtime/command_buffer_thunk_test.cc  |   5 +-
 .../xla/backends/gpu/runtime/copy_thunk.cc    | 122 +++++---
 .../xla/xla/backends/gpu/runtime/copy_thunk.h |  70 ++---
 .../backends/gpu/runtime/copy_thunk_test.cc   | 290 ++++++++++++++----
 .../xla/xla/backends/gpu/runtime/thunk.proto  |   4 +-
 .../thunk_proto_deserialization_test.cc       | 290 +++++++++++++++---
 third_party/xla/xla/service/gpu/BUILD         |   1 +
 .../xla/service/gpu/gpu_executable_test.cc    |  15 +-
 .../xla/xla/service/gpu/thunk_emitter.cc      |  41 +--
 17 files changed, 668 insertions(+), 237 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/BUILD b/third_party/xla/xla/backends/gpu/codegen/BUILD
index 62f99605665307..506a39ee7b28b8 100644
--- a/third_party/xla/xla/backends/gpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/BUILD
@@ -25,6 +25,7 @@ cc_library(
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/runtime:copy_thunk",
+        "//xla/backends/gpu/runtime:shaped_slice",
         "//xla/backends/gpu/runtime:thunk",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
diff --git a/third_party/xla/xla/backends/gpu/codegen/copy.cc b/third_party/xla/xla/backends/gpu/codegen/copy.cc
index 901103333277b7..2997e07de0e0ef 100644
--- a/third_party/xla/xla/backends/gpu/codegen/copy.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/copy.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "xla/backends/gpu/codegen/fusion_emitter.h"
 #include "xla/backends/gpu/runtime/copy_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
@@ -64,6 +65,7 @@ absl::StatusOr<FusionEmissionResult> MemcpyFusion::Emit(
     IrEmitterContext& ir_emitter_context,
     const HloFusionInstruction& fusion) const {
   std::vector<BufferAllocation::Slice> src_buffers;
+  std::vector<Shape> src_shapes;
   for (const HloInstructionAdaptor& root_adaptor : analysis_.fusion_roots()) {
     const HloInstruction* root = &root_adaptor.instruction();
     const HloInstruction* src_instr =
@@ -71,6 +73,7 @@ absl::StatusOr<FusionEmissionResult> MemcpyFusion::Emit(
     TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
                         buffer_assignment_->GetUniqueSlice(src_instr, {}));
     src_buffers.push_back(slice);
+    src_shapes.push_back(root->operand(0)->shape());
   }
 
   std::vector<BufferAllocation::Slice> dst_buffers;
@@ -91,8 +94,8 @@ absl::StatusOr<FusionEmissionResult> MemcpyFusion::Emit(
       result.thunks.emplace_back(std::make_unique<DeviceToDeviceCopyThunk>(
           Thunk::ThunkInfo::WithProfileAnnotation(
               &fusion, ir_emitter_context.GetNextThunkId()),
-          /*source_buffer=*/src_buffers[i],
-          /*destination_buffer=*/dst_buffers[i],
+          /*source_buffer=*/ShapedSlice{src_buffers[i], src_shapes[i]},
+          /*destination_buffer=*/ShapedSlice{dst_buffers[i], src_shapes[i]},
           /*mem_size=*/src_buffers[i].size()));
     }
   }
diff --git a/third_party/xla/xla/backends/gpu/codegen/custom.cc b/third_party/xla/xla/backends/gpu/codegen/custom.cc
index 22a2e1d159b29d..6cbc17cc246fe0 100644
--- a/third_party/xla/xla/backends/gpu/codegen/custom.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/custom.cc
@@ -1289,8 +1289,8 @@ absl::StatusOr<FusionEmissionResult> EmitCollective(
              "collective";
       seq.emplace_back(std::make_unique<DeviceToDeviceCopyThunk>(
           thunk_info,
-          /*source_buffer=*/src.value(),
-          /*destination_buffer=*/dst.value(),
+          /*source_buffer=*/ShapedSlice{src.value(), shape},
+          /*destination_buffer=*/ShapedSlice{dst.value(), shape},
           /*mem_size=*/ShapeUtil::ByteSizeOf(shape)));
     }
   } else if (implementable_status.ok()) {
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 27b4149b9e25bd..3876537b50c5b2 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -592,9 +592,11 @@ cc_library(
     srcs = ["copy_thunk.cc"],
     hdrs = ["copy_thunk.h"],
     deps = [
+        ":shaped_slice",
         ":thunk",
         ":thunk_proto_cc",
         ":while_thunk",
+        "//xla:shape_util",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:buffer_use",
         "//xla/service:buffer_assignment",
@@ -610,6 +612,7 @@ cc_library(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/container:node_hash_map",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/synchronization",
@@ -624,6 +627,7 @@ xla_cc_test(
         ":copy_thunk",
         ":thunk",
         ":thunk_proto_cc",
+        "//xla:shape_util",
         "//xla/service:buffer_assignment",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/util/proto:parse_text_proto",
@@ -3205,6 +3209,7 @@ xla_test(
         ":gemm_thunk",
         ":replica_id_thunk",
         ":sequential_thunk",
+        ":shaped_slice",
         ":thunk",
         ":thunk_pass_pipeline",
         ":while_thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
index 866e9a9f0b870b..5412864c823977 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
@@ -1324,13 +1324,19 @@ CommandBufferCmd::BufferUseVector CustomKernelLaunchCmd::buffers() const {
 // MemcpyDeviceToDeviceCmd
 //===----------------------------------------------------------------------===//
 
-MemcpyDeviceToDeviceCmd::MemcpyDeviceToDeviceCmd(BufferAllocation::Slice dst,
-                                                 BufferAllocation::Slice src,
+MemcpyDeviceToDeviceCmd::MemcpyDeviceToDeviceCmd(ShapedSlice dst,
+                                                 ShapedSlice src,
                                                  int64_t num_bytes)
     : CommandBufferCmd(CommandBufferCmdType::kMemcpyDeviceToDeviceCmd),
       dst_(dst),
       src_(src),
-      num_bytes_(num_bytes) {}
+      num_bytes_(num_bytes) {
+  CHECK_EQ(ShapeUtil::ByteSizeOfElements(src_.shape),
+           ShapeUtil::ByteSizeOfElements(dst_.shape));
+  CHECK_LE(num_bytes, dst_.slice.size());
+  CHECK_LE(num_bytes, src_.slice.size());
+  CHECK_GE(src_.slice.size(), ShapeUtil::ByteSizeOf(src_.shape));
+}
 
 absl::StatusOr<const se::CommandBuffer::Command*>
 MemcpyDeviceToDeviceCmd::Record(const Thunk::ExecuteParams& execute_params,
@@ -1338,9 +1344,9 @@ MemcpyDeviceToDeviceCmd::Record(const Thunk::ExecuteParams& execute_params,
                                 RecordAction record_action,
                                 se::CommandBuffer* command_buffer) {
   se::DeviceAddressBase dst =
-      execute_params.buffer_allocations->GetDeviceAddress(dst_);
+      execute_params.buffer_allocations->GetDeviceAddress(dst_.slice);
   se::DeviceAddressBase src =
-      execute_params.buffer_allocations->GetDeviceAddress(src_);
+      execute_params.buffer_allocations->GetDeviceAddress(src_.slice);
 
   VLOG(5) << "MemcpyDeviceToDeviceCmd: num_bytes = " << num_bytes_;
   VLOG(5) << "  Dst: " << dst_ << " (" << dst.opaque() << ")";
@@ -1363,7 +1369,8 @@ MemcpyDeviceToDeviceCmd::Record(const Thunk::ExecuteParams& execute_params,
 }
 
 CommandBufferCmd::BufferUseVector MemcpyDeviceToDeviceCmd::buffers() const {
-  return {BufferUse::Write(dst_), BufferUse::Read(src_)};
+  return {BufferUse::Write(dst_.slice, dst_.shape),
+          BufferUse::Read(src_.slice, src_.shape)};
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
index 31dc41b43596d0..abeb0971888d22 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
@@ -781,8 +781,7 @@ class CustomKernelLaunchCmd : public CommandBufferCmd {
 
 class MemcpyDeviceToDeviceCmd : public CommandBufferCmd {
  public:
-  MemcpyDeviceToDeviceCmd(BufferAllocation::Slice dst,
-                          BufferAllocation::Slice src, int64_t num_bytes);
+  MemcpyDeviceToDeviceCmd(ShapedSlice dst, ShapedSlice src, int64_t num_bytes);
 
   absl::StatusOr<const se::CommandBuffer::Command*> Record(
       const Thunk::ExecuteParams& execute_params,
@@ -792,9 +791,9 @@ class MemcpyDeviceToDeviceCmd : public CommandBufferCmd {
   BufferUseVector buffers() const override;
 
  private:
-  BufferAllocation::Slice dst_;
-  BufferAllocation::Slice src_;
-  int64_t num_bytes_;
+  ShapedSlice dst_;
+  ShapedSlice src_;
+  uint64_t num_bytes_;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc
index c8e59ba1093233..c995a45d181cac 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_test.cc
@@ -252,6 +252,7 @@ TEST(CommandBufferCmdTest, MemcpyCmd) {
   auto stream = stream_executor->CreateStream().value();
   int64_t length = 4;
   int64_t byte_length = sizeof(int32_t) * length;
+  Shape shape = ShapeUtil::MakeShape(S32, {length});
 
   // Prepare arguments: a=42, b=0
   se::DeviceAddress<int32_t> a =
@@ -271,7 +272,8 @@ TEST(CommandBufferCmdTest, MemcpyCmd) {
 
   // Prepare commands sequence for constructing command buffer.
   CommandBufferCmdSequence commands;
-  commands.Emplace<MemcpyDeviceToDeviceCmd>(slice_b, slice_a, byte_length);
+  commands.Emplace<MemcpyDeviceToDeviceCmd>(
+      ShapedSlice{slice_b, shape}, ShapedSlice{slice_a, shape}, byte_length);
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor executor,
       CommandBufferCmdExecutor::Create(std::move(commands), serialize));
@@ -609,6 +611,7 @@ TEST(CommandBufferCmdTest, RecordExecutorsWithDependencies) {
   auto stream = stream_executor->CreateStream().value();
   int64_t length = 4;
   int64_t byte_length = sizeof(int32_t) * length;
+  Shape shape = ShapeUtil::MakeShape(S32, {length});
 
   // Device buffers: a, b, c
   se::DeviceAddress<int32_t> a =
@@ -654,7 +657,8 @@ TEST(CommandBufferCmdTest, RecordExecutorsWithDependencies) {
 
   // Executor C: c = b (memcpy)
   CommandBufferCmdSequence seq_c;
-  seq_c.Emplace<MemcpyDeviceToDeviceCmd>(slice_c, slice_b, byte_length);
+  seq_c.Emplace<MemcpyDeviceToDeviceCmd>(
+      ShapedSlice{slice_c, shape}, ShapedSlice{slice_b, shape}, byte_length);
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor exec_c,
       CommandBufferCmdExecutor::Create(std::move(seq_c), serialize));
@@ -749,7 +753,8 @@ TEST(CommandBufferCmdTest, NestedChildCmdCreateAndUpdate) {
 
   // Inner child: c = a (device-to-device memcpy)
   CommandBufferCmdSequence inner_seq;
-  inner_seq.Emplace<MemcpyDeviceToDeviceCmd>(slice_c, slice_a, byte_length);
+  inner_seq.Emplace<MemcpyDeviceToDeviceCmd>(
+      ShapedSlice{slice_c, shape}, ShapedSlice{slice_a, shape}, byte_length);
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor inner_executor,
       CommandBufferCmdExecutor::Create(std::move(inner_seq), serialize));
@@ -759,7 +764,8 @@ TEST(CommandBufferCmdTest, NestedChildCmdCreateAndUpdate) {
   middle_seq.Emplace<ChildCmd>(std::move(inner_executor));
   // Add a couple of extra commands that don't affect `c`.
   middle_seq.Emplace<Memset32Cmd>(slice_b, /*bit_pattern=*/3);
-  middle_seq.Emplace<MemcpyDeviceToDeviceCmd>(slice_b, slice_b, byte_length);
+  middle_seq.Emplace<MemcpyDeviceToDeviceCmd>(
+      ShapedSlice{slice_b, shape}, ShapedSlice{slice_b, shape}, byte_length);
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor middle_executor,
       CommandBufferCmdExecutor::Create(std::move(middle_seq), serialize));
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc
index d8854aa70db011..19a47a6ff0f5df 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/gemm_thunk.h"
 #include "xla/backends/gpu/runtime/replica_id_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk_pass_pipeline.h"
 #include "xla/backends/gpu/runtime/while_thunk.h"
@@ -51,6 +52,7 @@ limitations under the License.
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/platform_util.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_description.h"
@@ -134,8 +136,10 @@ std::unique_ptr<AllGatherStartThunk> CreateAllGatherStartThunk(
 std::unique_ptr<DeviceToDeviceCopyThunk> CreateCopyThunk(
     const BufferAllocation& alloc0) {
   BufferAllocation::Slice slice0(&alloc0, 0, 1024);
-  return std::make_unique<DeviceToDeviceCopyThunk>(Thunk::ThunkInfo(), slice0,
-                                                   slice0, 1024);
+  Shape shape = ShapeUtil::MakeShape(S32, {256});
+  return std::make_unique<DeviceToDeviceCopyThunk>(
+      Thunk::ThunkInfo(), ShapedSlice{slice0, shape},
+      ShapedSlice{slice0, shape}, 1024);
 }
 
 std::unique_ptr<GemmThunk> CreateGemmThunk(const BufferAllocation& alloc1) {
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
index fff417ba5dacbe..9c80bec9cba67c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
@@ -159,7 +159,7 @@ TEST(CommandBufferThunkTest, MemcpyCmd) {
 
   int64_t length = 4;
   int64_t byte_length = sizeof(int32_t) * length;
-
+  Shape shape = ShapeUtil::MakeShape(S32, {length});
   // Prepare arguments: a=42, b=0
   se::DeviceAddress<int32_t> a =
       stream_executor->AllocateArray<int32_t>(length, 0);
@@ -178,7 +178,8 @@ TEST(CommandBufferThunkTest, MemcpyCmd) {
 
   // Prepare commands sequence for constructing command buffer.
   CommandBufferCmdSequence commands;
-  commands.Emplace<MemcpyDeviceToDeviceCmd>(slice_b, slice_a, byte_length);
+  commands.Emplace<MemcpyDeviceToDeviceCmd>(
+      ShapedSlice{slice_b, shape}, ShapedSlice{slice_a, shape}, byte_length);
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor executor,
       CommandBufferCmdExecutor::Create(std::move(commands), serialize));
diff --git a/third_party/xla/xla/backends/gpu/runtime/copy_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/copy_thunk.cc
index a98f4a05f38dbc..a5aba50e345156 100644
--- a/third_party/xla/xla/backends/gpu/runtime/copy_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/copy_thunk.cc
@@ -22,15 +22,18 @@ limitations under the License.
 
 #include "absl/base/casts.h"
 #include "absl/container/node_hash_map.h"
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/backends/gpu/runtime/while_thunk.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/stream.h"
@@ -42,22 +45,31 @@ namespace xla {
 namespace gpu {
 
 DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk(
-    ThunkInfo thunk_info, const BufferAllocation::Slice& source_buffer,
-    const BufferAllocation::Slice& destination_buffer, uint64_t mem_size)
+    ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+    const ShapedSlice& destination_buffer, int64_t mem_size)
     : Thunk(Kind::kCopy, std::move(thunk_info)),
       source_buffer_(source_buffer),
       destination_buffer_(destination_buffer),
-      mem_size_(mem_size) {}
+      mem_size_(mem_size) {
+  // TODO(b/460846009): Determine size based on shape.
+  // Bounded dynamic shape contains extra header after data.
+  // Header size needs to be accounted for.
+  CHECK_EQ(ShapeUtil::ByteSizeOf(source_buffer_.shape),
+           ShapeUtil::ByteSizeOf(destination_buffer_.shape));
+
+  CHECK_GE(source_buffer_.slice.size(), mem_size);
+  CHECK_GE(destination_buffer_.slice.size(), mem_size);
+}
 
 absl::Status DeviceToDeviceCopyThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   se::DeviceAddressBase destination_data =
-      params.buffer_allocations->GetDeviceAddress(destination_buffer_);
+      params.buffer_allocations->GetDeviceAddress(destination_buffer_.slice);
   se::DeviceAddressBase source_data =
-      params.buffer_allocations->GetDeviceAddress(source_buffer_);
-  VLOG(3) << "Memcpy D2D of size " << mem_size_ << " from "
+      params.buffer_allocations->GetDeviceAddress(source_buffer_.slice);
+  VLOG(3) << "Memcpy D2D of size " << size_bytes() << " from "
           << source_data.opaque() << " to " << destination_data.opaque();
-  return params.stream->Memcpy(&destination_data, source_data, mem_size_);
+  return params.stream->Memcpy(&destination_data, source_data, size_bytes());
 }
 
 absl::StatusOr<ThunkProto> DeviceToDeviceCopyThunk::ToProto() const {
@@ -67,9 +79,9 @@ absl::StatusOr<ThunkProto> DeviceToDeviceCopyThunk::ToProto() const {
       proto.mutable_device_to_device_copy_thunk();
   CopyThunkProto* copy_thunk_proto = d2d_copy_thunk_proto->mutable_copy_thunk();
   TF_ASSIGN_OR_RETURN(*copy_thunk_proto->mutable_source_buffer(),
-                      source().ToProto());
+                      source_buffer_.ToProto());
   TF_ASSIGN_OR_RETURN(*copy_thunk_proto->mutable_destination_buffer(),
-                      destination().ToProto());
+                      destination_buffer_.ToProto());
   copy_thunk_proto->set_mem_size(size_bytes());
   return proto;
 }
@@ -79,13 +91,18 @@ DeviceToDeviceCopyThunk::FromProto(
     ThunkInfo thunk_info, const DeviceToDeviceCopyThunkProto& thunk_proto,
     absl::Span<const BufferAllocation> buffer_allocations) {
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice src_slice,
-      BufferAllocation::Slice::FromProto(
-          thunk_proto.copy_thunk().source_buffer(), buffer_allocations));
+      ShapedSlice src_slice,
+      ShapedSlice::FromProto(thunk_proto.copy_thunk().source_buffer(),
+                             buffer_allocations));
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice dst_slice,
-      BufferAllocation::Slice::FromProto(
-          thunk_proto.copy_thunk().destination_buffer(), buffer_allocations));
+      ShapedSlice dst_slice,
+      ShapedSlice::FromProto(thunk_proto.copy_thunk().destination_buffer(),
+                             buffer_allocations));
+  if (ShapeUtil::ByteSizeOfElements(src_slice.shape) !=
+      ShapeUtil::ByteSizeOfElements(dst_slice.shape)) {
+    return absl::FailedPreconditionError(
+        "DeviceToDeviceCopyThunkProto with incompatible shapes.");
+  }
   return std::make_unique<DeviceToDeviceCopyThunk>(
       std::move(thunk_info), src_slice, dst_slice,
       thunk_proto.copy_thunk().mem_size());
@@ -95,14 +112,18 @@ DeviceToDeviceCopyThunk::FromProto(
 // CopyThunk
 //===----------------------------------------------------------------------===//
 
-CopyThunk::CopyThunk(ThunkInfo thunk_info,
-                     const BufferAllocation::Slice& source_buffer,
-                     const BufferAllocation::Slice& destination_buffer,
-                     uint64_t mem_size)
+CopyThunk::CopyThunk(ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+                     const ShapedSlice& destination_buffer, int64_t mem_size)
     : Thunk(Kind::kCopy, std::move(thunk_info)),
       source_buffer_(source_buffer),
       destination_buffer_(destination_buffer),
-      mem_size_(mem_size) {}
+      mem_size_(mem_size) {
+  CHECK_EQ(ShapeUtil::ByteSizeOfElements(source_buffer_.shape),
+           ShapeUtil::ByteSizeOfElements(destination_buffer_.shape));
+
+  CHECK_GE(source_buffer_.slice.size(), mem_size);
+  CHECK_GE(destination_buffer_.slice.size(), mem_size);
+}
 
 absl::Status CopyThunk::ExecuteOnStream(const ExecuteParams& params) {
   return absl::OkStatus();
@@ -146,9 +167,9 @@ absl::StatusOr<ThunkProto> CopyThunk::ToProto() const {
 
   CopyThunkProto* copy_thunk_proto = proto.mutable_copy_thunk();
   TF_ASSIGN_OR_RETURN(*copy_thunk_proto->mutable_source_buffer(),
-                      source().ToProto());
+                      source_buffer_.ToProto());
   TF_ASSIGN_OR_RETURN(*copy_thunk_proto->mutable_destination_buffer(),
-                      destination().ToProto());
+                      destination_buffer_.ToProto());
   copy_thunk_proto->set_mem_size(size_bytes());
   return proto;
 }
@@ -156,13 +177,18 @@ absl::StatusOr<ThunkProto> CopyThunk::ToProto() const {
 absl::StatusOr<std::unique_ptr<CopyThunk>> CopyThunk::FromProto(
     ThunkInfo thunk_info, const CopyThunkProto& thunk_proto,
     absl::Span<const BufferAllocation> buffer_allocations) {
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice src_slice,
-                      BufferAllocation::Slice::FromProto(
-                          thunk_proto.source_buffer(), buffer_allocations));
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice dst_slice,
-      BufferAllocation::Slice::FromProto(thunk_proto.destination_buffer(),
-                                         buffer_allocations));
+      ShapedSlice src_slice,
+      ShapedSlice::FromProto(thunk_proto.source_buffer(), buffer_allocations));
+  TF_ASSIGN_OR_RETURN(ShapedSlice dst_slice,
+                      ShapedSlice::FromProto(thunk_proto.destination_buffer(),
+                                             buffer_allocations));
+  if (ShapeUtil::ByteSizeOfElements(src_slice.shape) !=
+      ShapeUtil::ByteSizeOfElements(dst_slice.shape)) {
+    return absl::FailedPreconditionError(
+        "DeviceToDeviceCopyThunkProto with incompatible shapes.");
+  }
+
   return std::make_unique<CopyThunk>(std::move(thunk_info), src_slice,
                                      dst_slice, thunk_proto.mem_size());
 }
@@ -171,8 +197,8 @@ absl::StatusOr<std::unique_ptr<CopyThunk>> CopyThunk::FromProto(
 // DeviceToHostCopyThunk
 //===----------------------------------------------------------------------===//
 DeviceToHostCopyThunk::DeviceToHostCopyThunk(
-    ThunkInfo thunk_info, const BufferAllocation::Slice& source_buffer,
-    const BufferAllocation::Slice& destination_buffer, uint64_t mem_size,
+    ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+    const ShapedSlice& destination_buffer, int64_t mem_size,
     std::shared_ptr<CopyThunk::AsyncEvents> async_events,
     const HloInstruction* instr)
     : CopyThunk(std::move(thunk_info), source_buffer, destination_buffer,
@@ -183,9 +209,9 @@ DeviceToHostCopyThunk::DeviceToHostCopyThunk(
 absl::Status DeviceToHostCopyThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   se::DeviceAddressBase destination_data =
-      params.buffer_allocations->GetDeviceAddress(destination());
+      params.buffer_allocations->GetDeviceAddress(destination().slice);
   se::DeviceAddressBase source_data =
-      params.buffer_allocations->GetDeviceAddress(source());
+      params.buffer_allocations->GetDeviceAddress(source().slice);
   void* cpu_dst = destination_data.opaque();
   TF_ASSIGN_OR_RETURN(
       se::Stream * stream,
@@ -225,13 +251,13 @@ DeviceToHostCopyThunk::FromProto(
     ThunkInfo thunk_info, const DeviceToHostCopyThunkProto& thunk_proto,
     absl::Span<const BufferAllocation> buffer_allocations) {
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice src_slice,
-      BufferAllocation::Slice::FromProto(
-          thunk_proto.copy_thunk().source_buffer(), buffer_allocations));
+      ShapedSlice src_slice,
+      ShapedSlice::FromProto(thunk_proto.copy_thunk().source_buffer(),
+                             buffer_allocations));
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice dst_slice,
-      BufferAllocation::Slice::FromProto(
-          thunk_proto.copy_thunk().destination_buffer(), buffer_allocations));
+      ShapedSlice dst_slice,
+      ShapedSlice::FromProto(thunk_proto.copy_thunk().destination_buffer(),
+                             buffer_allocations));
   return std::make_unique<DeviceToHostCopyThunk>(
       std::move(thunk_info), src_slice, dst_slice,
       thunk_proto.copy_thunk().mem_size(),
@@ -252,8 +278,8 @@ DeviceToHostCopyThunk::GetAsyncEventsUniqueId() const {
 // HostToDeviceCopyThunk
 //===----------------------------------------------------------------------===//
 HostToDeviceCopyThunk::HostToDeviceCopyThunk(
-    ThunkInfo thunk_info, const BufferAllocation::Slice& source_buffer,
-    const BufferAllocation::Slice& destination_buffer, uint64_t mem_size,
+    ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+    const ShapedSlice& destination_buffer, int64_t mem_size,
     std::shared_ptr<CopyThunk::AsyncEvents> async_events,
     const HloInstruction* instr)
     : CopyThunk(std::move(thunk_info), source_buffer, destination_buffer,
@@ -264,9 +290,9 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk(
 absl::Status HostToDeviceCopyThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   se::DeviceAddressBase destination_data =
-      params.buffer_allocations->GetDeviceAddress(destination());
+      params.buffer_allocations->GetDeviceAddress(destination().slice);
   se::DeviceAddressBase source_data =
-      params.buffer_allocations->GetDeviceAddress(source());
+      params.buffer_allocations->GetDeviceAddress(source().slice);
   void* cpu_src = source_data.opaque();
   TF_ASSIGN_OR_RETURN(
       se::Stream * stream,
@@ -306,13 +332,13 @@ HostToDeviceCopyThunk::FromProto(
     ThunkInfo thunk_info, const HostToDeviceCopyThunkProto& thunk_proto,
     absl::Span<const BufferAllocation> buffer_allocations) {
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice src_slice,
-      BufferAllocation::Slice::FromProto(
-          thunk_proto.copy_thunk().source_buffer(), buffer_allocations));
+      ShapedSlice src_slice,
+      ShapedSlice::FromProto(thunk_proto.copy_thunk().source_buffer(),
+                             buffer_allocations));
   TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice dst_slice,
-      BufferAllocation::Slice::FromProto(
-          thunk_proto.copy_thunk().destination_buffer(), buffer_allocations));
+      ShapedSlice dst_slice,
+      ShapedSlice::FromProto(thunk_proto.copy_thunk().destination_buffer(),
+                             buffer_allocations));
   return std::make_unique<HostToDeviceCopyThunk>(
       std::move(thunk_info), src_slice, dst_slice,
       thunk_proto.copy_thunk().mem_size(),
diff --git a/third_party/xla/xla/backends/gpu/runtime/copy_thunk.h b/third_party/xla/xla/backends/gpu/runtime/copy_thunk.h
index 381c426a66f35f..a6afb8f0e3c7e4 100644
--- a/third_party/xla/xla/backends/gpu/runtime/copy_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/copy_thunk.h
@@ -30,11 +30,13 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/stream_executor.h"
 
@@ -45,28 +47,25 @@ namespace gpu {
 class DeviceToDeviceCopyThunk : public Thunk {
  public:
   // Constructs a CopyThunk that copies host data from `source_buffer` to the
-  // device buffer `destination_buffer`. `mem_size` is the size of the data in
-  // bytes.
+  // device buffer `destination_buffer`.
   DeviceToDeviceCopyThunk(ThunkInfo thunk_info,
-                          const BufferAllocation::Slice& source_buffer,
-                          const BufferAllocation::Slice& destination_buffer,
-                          uint64_t mem_size);
+                          const ShapedSlice& source_buffer,
+                          const ShapedSlice& destination_buffer,
+                          int64_t mem_size);
 
   DeviceToDeviceCopyThunk(const DeviceToDeviceCopyThunk&) = delete;
   DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete;
 
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
 
-  const BufferAllocation::Slice& source() const { return source_buffer_; }
-  const BufferAllocation::Slice& destination() const {
-    return destination_buffer_;
-  }
-  uint64_t size_bytes() const { return mem_size_; }
+  const ShapedSlice& source() const { return source_buffer_; }
+  const ShapedSlice& destination() const { return destination_buffer_; }
+  int64_t size_bytes() const { return mem_size_; }
 
   BufferUses buffer_uses() const override {
     return {
-        BufferUse::Read(source_buffer_),
-        BufferUse::Write(destination_buffer_),
+        BufferUse::Read(source_buffer_.slice, source_buffer_.shape),
+        BufferUse::Write(destination_buffer_.slice, destination_buffer_.shape),
     };
   }
 
@@ -78,9 +77,11 @@ class DeviceToDeviceCopyThunk : public Thunk {
 
   friend bool operator==(const DeviceToDeviceCopyThunk& lhs,
                          const DeviceToDeviceCopyThunk& rhs) {
-    return std::tie(lhs.source_buffer_, lhs.destination_buffer_,
-                    lhs.mem_size_) ==
-           std::tie(rhs.source_buffer_, rhs.destination_buffer_, rhs.mem_size_);
+    if (lhs.size_bytes() != rhs.size_bytes()) {
+      return false;
+    }
+    return std::tie(lhs.source_buffer_, lhs.destination_buffer_) ==
+           std::tie(rhs.source_buffer_, rhs.destination_buffer_);
   }
 
   friend bool operator!=(const DeviceToDeviceCopyThunk& lhs,
@@ -89,9 +90,9 @@ class DeviceToDeviceCopyThunk : public Thunk {
   }
 
  private:
-  const BufferAllocation::Slice source_buffer_;
-  const BufferAllocation::Slice destination_buffer_;
-  const uint64_t mem_size_;
+  const ShapedSlice source_buffer_;
+  const ShapedSlice destination_buffer_;
+  const int64_t mem_size_;
 };
 
 //===----------------------------------------------------------------------===//
@@ -117,20 +118,17 @@ class CopyThunk : public Thunk {
     absl::flat_hash_map<Key, std::unique_ptr<se::Event>> events_
         ABSL_GUARDED_BY(mutex_);
   };
-  CopyThunk(ThunkInfo thunk_info, const BufferAllocation::Slice& source_buffer,
-            const BufferAllocation::Slice& destination_buffer,
-            uint64_t mem_size);
+  CopyThunk(ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+            const ShapedSlice& destination_buffer, int64_t mem_size);
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
-  const BufferAllocation::Slice& source() const { return source_buffer_; }
-  const BufferAllocation::Slice& destination() const {
-    return destination_buffer_;
-  }
+  const ShapedSlice& source() const { return source_buffer_; }
+  const ShapedSlice& destination() const { return destination_buffer_; }
   uint64_t size_bytes() const { return mem_size_; }
 
   BufferUses buffer_uses() const override {
     return {
-        BufferUse::Read(source_buffer_),
-        BufferUse::Write(destination_buffer_),
+        BufferUse::Read(source_buffer_.slice, source_buffer_.shape),
+        BufferUse::Write(destination_buffer_.slice, destination_buffer_.shape),
     };
   }
 
@@ -146,9 +144,9 @@ class CopyThunk : public Thunk {
       absl::Span<const BufferAllocation> buffer_allocations);
 
  private:
-  const BufferAllocation::Slice source_buffer_;
-  const BufferAllocation::Slice destination_buffer_;
-  const uint64_t mem_size_;
+  const ShapedSlice source_buffer_;
+  const ShapedSlice destination_buffer_;
+  const int64_t mem_size_;
 };
 
 //===----------------------------------------------------------------------===//
@@ -163,10 +161,8 @@ class DeviceToHostCopyThunk : public CopyThunk {
   // the device buffer `destination_buffer`. `mem_size` is the size of the data
   // in bytes. `events` are the cuda record/wait events.
   // `instr` is the copy-start instruction.
-  DeviceToHostCopyThunk(ThunkInfo thunk_info,
-                        const BufferAllocation::Slice& source_buffer,
-                        const BufferAllocation::Slice& destination_buffer,
-                        uint64_t mem_size,
+  DeviceToHostCopyThunk(ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+                        const ShapedSlice& destination_buffer, int64_t mem_size,
                         std::shared_ptr<CopyThunk::AsyncEvents> events,
                         const HloInstruction* instr);
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
@@ -198,10 +194,8 @@ class HostToDeviceCopyThunk : public CopyThunk {
   // the host buffer `destination_buffer`. `mem_size` is the size of the data
   // in bytes. `events` are the cuda record/wait events.
   // `instr` is the copy-start instruction.
-  HostToDeviceCopyThunk(ThunkInfo thunk_info,
-                        const BufferAllocation::Slice& source_buffer,
-                        const BufferAllocation::Slice& destination_buffer,
-                        uint64_t mem_size,
+  HostToDeviceCopyThunk(ThunkInfo thunk_info, const ShapedSlice& source_buffer,
+                        const ShapedSlice& destination_buffer, int64_t mem_size,
                         std::shared_ptr<CopyThunk::AsyncEvents> events,
                         const HloInstruction* instr);
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
diff --git a/third_party/xla/xla/backends/gpu/runtime/copy_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/copy_thunk_test.cc
index 0be4c722ce265e..3b556de2685d4d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/copy_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/copy_thunk_test.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/parse_text_proto.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
@@ -41,22 +43,40 @@ TEST(CopyThunkTest, ToProto) {
   BufferAllocation alloc0(/*index=*/0, /*size=*/1024, /*color=*/0);
   BufferAllocation alloc1(/*index=*/1, /*size=*/1024, /*color=*/0);
   auto src_slice =
-      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/384);
+      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/256);
   auto dst_slice = BufferAllocation::Slice(&alloc1, /*offset=*/0, /*size=*/256);
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
 
-  CopyThunk thunk(thunk_info, src_slice, dst_slice, /*mem_size=*/256);
+  CopyThunk thunk(thunk_info, {src_slice, shape}, {dst_slice, shape}, 256);
   TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, thunk.ToProto());
-  EXPECT_THAT(proto, EqualsProto(R"pb(
-                thunk_info {
-                  profile_annotation: "profile_annotation"
-                  execution_stream_id: 123
-                }
-                copy_thunk {
-                  source_buffer { offset: 128 size: 384 }
-                  destination_buffer { size: 256 buffer_allocation_index: 1 }
-                  mem_size: 256
-                }
-              )pb"));
+  EXPECT_THAT(
+      proto, EqualsProto(R"pb(
+        thunk_info {
+          profile_annotation: "profile_annotation"
+          execution_stream_id: 123
+        }
+        copy_thunk {
+          source_buffer {
+            slice { offset: 128 size: 256 }
+            shape {
+              dimensions: 64
+              element_type: S32
+              is_dynamic_dimension: false
+              layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            }
+          }
+          destination_buffer {
+            slice { size: 256 buffer_allocation_index: 1 }
+            shape {
+              dimensions: 64
+              element_type: S32
+              is_dynamic_dimension: false
+              layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            }
+          }
+          mem_size: 256
+        }
+      )pb"));
 }
 
 TEST(CopyThunkTest, FromProto) {
@@ -67,8 +87,24 @@ TEST(CopyThunkTest, FromProto) {
           execution_stream_id: 123
         }
         copy_thunk {
-          source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
-          destination_buffer { offset: 0 size: 256 buffer_allocation_index: 1 }
+          source_buffer {
+            slice { offset: 128 size: 256 }
+            shape {
+              dimensions: 64
+              element_type: S32
+              is_dynamic_dimension: false
+              layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            }
+          }
+          destination_buffer {
+            slice { size: 256 buffer_allocation_index: 1 }
+            shape {
+              dimensions: 64
+              element_type: S32
+              is_dynamic_dimension: false
+              layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            }
+          }
           mem_size: 256
         }
       )pb");
@@ -83,15 +119,18 @@ TEST(CopyThunkTest, FromProto) {
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<CopyThunk> thunk,
       CopyThunk::FromProto(thunk_info, proto.copy_thunk(), buffer_allocations));
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
 
   EXPECT_EQ(
       *thunk.get(),
       CopyThunk(thunk_info,
-                BufferAllocation::Slice(&buffer_allocations[0],
-                                        /*offset=*/128, /*size=*/384),
-                BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
-                                        /*size=*/256),
-                /*mem_size=*/256));
+                {BufferAllocation::Slice(&buffer_allocations[0],
+                                         /*offset=*/128, /*size=*/256),
+                 shape},
+                {BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
+                                         /*size=*/256),
+                 shape},
+                256));
 }
 
 TEST(DeviceToHostCopyThunkProtoTest, ToProto) {
@@ -102,11 +141,12 @@ TEST(DeviceToHostCopyThunkProtoTest, ToProto) {
   BufferAllocation alloc0(/*index=*/0, /*size=*/1024, /*color=*/0);
   BufferAllocation alloc1(/*index=*/1, /*size=*/1024, /*color=*/0);
   auto src_slice =
-      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/384);
+      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/256);
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
   auto dst_slice = BufferAllocation::Slice(&alloc1, /*offset=*/0, /*size=*/256);
 
-  DeviceToHostCopyThunk thunk(thunk_info, src_slice, dst_slice,
-                              /*mem_size=*/256,
+  DeviceToHostCopyThunk thunk(thunk_info, {src_slice, shape},
+                              {dst_slice, shape}, 256,
                               /*events=*/nullptr,
                               /*instr=*/nullptr);
   TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, thunk.ToProto());
@@ -117,8 +157,30 @@ TEST(DeviceToHostCopyThunkProtoTest, ToProto) {
                 }
                 device_to_host_copy_thunk {
                   copy_thunk {
-                    source_buffer { offset: 128 size: 384 }
-                    destination_buffer { size: 256 buffer_allocation_index: 1 }
+                    source_buffer {
+                      slice { offset: 128 size: 256 }
+                      shape {
+                        dimensions: 64
+                        element_type: S32
+                        is_dynamic_dimension: false
+                        layout {
+                          minor_to_major: 0
+                          tail_padding_alignment_in_elements: 1
+                        }
+                      }
+                    }
+                    destination_buffer {
+                      slice { size: 256 buffer_allocation_index: 1 }
+                      shape {
+                        dimensions: 64
+                        element_type: S32
+                        is_dynamic_dimension: false
+                        layout {
+                          minor_to_major: 0
+                          tail_padding_alignment_in_elements: 1
+                        }
+                      }
+                    }
                     mem_size: 256
                   }
                 }
@@ -134,11 +196,29 @@ TEST(DeviceToHostCopyThunkProtoTest, FromProto) {
         }
         device_to_host_copy_thunk {
           copy_thunk {
-            source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
+            source_buffer {
+              slice { offset: 128 size: 256 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
+            }
             destination_buffer {
-              offset: 0
-              size: 256
-              buffer_allocation_index: 1
+              slice { size: 256 buffer_allocation_index: 1 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
             }
             mem_size: 256
           }
@@ -156,14 +236,17 @@ TEST(DeviceToHostCopyThunkProtoTest, FromProto) {
       std::unique_ptr<DeviceToHostCopyThunk> thunk,
       DeviceToHostCopyThunk::FromProto(
           thunk_info, proto.device_to_host_copy_thunk(), buffer_allocations));
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
 
   EXPECT_EQ(*thunk.get(),
             DeviceToHostCopyThunk(
                 thunk_info,
-                BufferAllocation::Slice(&buffer_allocations[0],
-                                        /*offset=*/128, /*size=*/384),
-                BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
-                                        /*size=*/256),
+                {BufferAllocation::Slice(&buffer_allocations[0],
+                                         /*offset=*/128, /*size=*/256),
+                 shape},
+                {BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
+                                         /*size=*/256),
+                 shape},
                 /*mem_size=*/256,
                 /*events=*/nullptr,
                 /*instr=*/nullptr));
@@ -177,10 +260,12 @@ TEST(HostToDeviceCopyThunkProtoTest, ToProto) {
   BufferAllocation alloc0(/*index=*/0, /*size=*/1024, /*color=*/0);
   BufferAllocation alloc1(/*index=*/1, /*size=*/1024, /*color=*/0);
   auto src_slice =
-      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/384);
+      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/256);
   auto dst_slice = BufferAllocation::Slice(&alloc1, /*offset=*/0, /*size=*/256);
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
 
-  HostToDeviceCopyThunk thunk(thunk_info, src_slice, dst_slice,
+  HostToDeviceCopyThunk thunk(thunk_info, {src_slice, shape},
+                              {dst_slice, shape},
                               /*mem_size=*/256,
                               /*events=*/nullptr,
                               /*instr=*/nullptr);
@@ -192,8 +277,30 @@ TEST(HostToDeviceCopyThunkProtoTest, ToProto) {
                 }
                 host_to_device_copy_thunk {
                   copy_thunk {
-                    source_buffer { offset: 128 size: 384 }
-                    destination_buffer { size: 256 buffer_allocation_index: 1 }
+                    source_buffer {
+                      slice { offset: 128 size: 256 }
+                      shape {
+                        dimensions: 64
+                        element_type: S32
+                        is_dynamic_dimension: false
+                        layout {
+                          minor_to_major: 0
+                          tail_padding_alignment_in_elements: 1
+                        }
+                      }
+                    }
+                    destination_buffer {
+                      slice { size: 256 buffer_allocation_index: 1 }
+                      shape {
+                        dimensions: 64
+                        element_type: S32
+                        is_dynamic_dimension: false
+                        layout {
+                          minor_to_major: 0
+                          tail_padding_alignment_in_elements: 1
+                        }
+                      }
+                    }
                     mem_size: 256
                   }
                 }
@@ -209,11 +316,29 @@ TEST(HostToDeviceCopyThunkProtoTest, FromProto) {
         }
         host_to_device_copy_thunk {
           copy_thunk {
-            source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
+            source_buffer {
+              slice { offset: 128 size: 256 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
+            }
             destination_buffer {
-              offset: 0
-              size: 256
-              buffer_allocation_index: 1
+              slice { size: 256 buffer_allocation_index: 1 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
             }
             mem_size: 256
           }
@@ -226,6 +351,7 @@ TEST(HostToDeviceCopyThunkProtoTest, FromProto) {
   std::vector<BufferAllocation> buffer_allocations = {
       BufferAllocation(/*index=*/0, /*size=*/1024, /*color=*/0),
       BufferAllocation(/*index=*/1, /*size=*/1024, /*color=*/0)};
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
 
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<HostToDeviceCopyThunk> thunk,
@@ -235,10 +361,12 @@ TEST(HostToDeviceCopyThunkProtoTest, FromProto) {
   EXPECT_EQ(*thunk.get(),
             HostToDeviceCopyThunk(
                 thunk_info,
-                BufferAllocation::Slice(&buffer_allocations[0],
-                                        /*offset=*/128, /*size=*/384),
-                BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
-                                        /*size=*/256),
+                {BufferAllocation::Slice(&buffer_allocations[0],
+                                         /*offset=*/128, /*size=*/256),
+                 shape},
+                {BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
+                                         /*size=*/256),
+                 shape},
                 /*mem_size=*/256,
                 /*events=*/nullptr,
                 /*instr=*/nullptr));
@@ -252,11 +380,12 @@ TEST(DeviceToDeviceCopyThunkProtoTest, ToProto) {
   BufferAllocation alloc0(/*index=*/0, /*size=*/1024, /*color=*/0);
   BufferAllocation alloc1(/*index=*/1, /*size=*/1024, /*color=*/0);
   auto src_slice =
-      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/384);
+      BufferAllocation::Slice(&alloc0, /*offset=*/128, /*size=*/256);
   auto dst_slice = BufferAllocation::Slice(&alloc1, /*offset=*/0, /*size=*/256);
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
 
-  DeviceToDeviceCopyThunk thunk(thunk_info, src_slice, dst_slice,
-                                /*mem_size=*/256);
+  DeviceToDeviceCopyThunk thunk(thunk_info, {src_slice, shape},
+                                {dst_slice, shape}, 256);
   TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, thunk.ToProto());
   EXPECT_THAT(proto, EqualsProto(R"pb(
                 thunk_info {
@@ -265,8 +394,30 @@ TEST(DeviceToDeviceCopyThunkProtoTest, ToProto) {
                 }
                 device_to_device_copy_thunk {
                   copy_thunk {
-                    source_buffer { offset: 128 size: 384 }
-                    destination_buffer { size: 256 buffer_allocation_index: 1 }
+                    source_buffer {
+                      slice { offset: 128 size: 256 }
+                      shape {
+                        dimensions: 64
+                        element_type: S32
+                        is_dynamic_dimension: false
+                        layout {
+                          minor_to_major: 0
+                          tail_padding_alignment_in_elements: 1
+                        }
+                      }
+                    }
+                    destination_buffer {
+                      slice { size: 256 buffer_allocation_index: 1 }
+                      shape {
+                        dimensions: 64
+                        element_type: S32
+                        is_dynamic_dimension: false
+                        layout {
+                          minor_to_major: 0
+                          tail_padding_alignment_in_elements: 1
+                        }
+                      }
+                    }
                     mem_size: 256
                   }
                 }
@@ -282,11 +433,29 @@ TEST(DeviceToDeviceCopyThunkProtoTest, FromProto) {
         }
         device_to_device_copy_thunk {
           copy_thunk {
-            source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
+            source_buffer {
+              slice { offset: 128 size: 256 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
+            }
             destination_buffer {
-              offset: 0
-              size: 256
-              buffer_allocation_index: 1
+              slice { size: 256 buffer_allocation_index: 1 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
             }
             mem_size: 256
           }
@@ -305,14 +474,17 @@ TEST(DeviceToDeviceCopyThunkProtoTest, FromProto) {
       DeviceToDeviceCopyThunk::FromProto(
           thunk_info, proto.device_to_device_copy_thunk(), buffer_allocations));
 
+  Shape shape = ShapeUtil::MakeShape(S32, {64});
   EXPECT_EQ(*thunk.get(),
             DeviceToDeviceCopyThunk(
                 thunk_info,
-                BufferAllocation::Slice(&buffer_allocations[0],
-                                        /*offset=*/128, /*size=*/384),
-                BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
-                                        /*size=*/256),
-                /*mem_size=*/256));
+                {BufferAllocation::Slice(&buffer_allocations[0],
+                                         /*offset=*/128, /*size=*/256),
+                 shape},
+                {BufferAllocation::Slice(&buffer_allocations[1], /*offset=*/0,
+                                         /*size=*/256),
+                 shape},
+                256));
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 7b9bbf093b6863..a84798192bfe65 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -53,8 +53,8 @@ message ThunkMetadataListProto {
 }
 
 message CopyThunkProto {
-  xla.buffer_assignment.BufferAllocationSliceProto source_buffer = 1;
-  xla.buffer_assignment.BufferAllocationSliceProto destination_buffer = 2;
+  ShapedSliceProto source_buffer = 1;
+  ShapedSliceProto destination_buffer = 2;
   int64 mem_size = 3;
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
index 83a1db3b20b828..b3afd91438161a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
@@ -101,8 +101,24 @@ TEST(ThunkProtoDeserializationTest, CopyThunk) {
           execution_stream_id: 123
         }
         copy_thunk {
-          source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
-          destination_buffer { offset: 0 size: 256 buffer_allocation_index: 1 }
+          source_buffer {
+            slice { offset: 128 size: 384 buffer_allocation_index: 0 }
+            shape {
+              dimensions: 64
+              element_type: S32
+              is_dynamic_dimension: false
+              layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            }
+          }
+          destination_buffer {
+            slice { offset: 0 size: 256 buffer_allocation_index: 1 }
+            shape {
+              dimensions: 64
+              element_type: S32
+              is_dynamic_dimension: false
+              layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            }
+          }
           mem_size: 256
         }
       )pb");
@@ -130,11 +146,29 @@ TEST(ThunkProtoDeserializationTest, DeviceToHostCopyThunk) {
         }
         device_to_host_copy_thunk {
           copy_thunk {
-            source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
+            source_buffer {
+              slice { offset: 128 size: 384 buffer_allocation_index: 0 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
+            }
             destination_buffer {
-              offset: 0
-              size: 256
-              buffer_allocation_index: 1
+              slice { offset: 0 size: 256 buffer_allocation_index: 1 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
             }
             mem_size: 256
           }
@@ -164,11 +198,29 @@ TEST(ThunkProtoDeserializationTest, HostToDeviceCopyThunk) {
         }
         host_to_device_copy_thunk {
           copy_thunk {
-            source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
+            source_buffer {
+              slice { offset: 128 size: 384 buffer_allocation_index: 0 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
+            }
             destination_buffer {
-              offset: 0
-              size: 256
-              buffer_allocation_index: 1
+              slice { offset: 0 size: 256 buffer_allocation_index: 1 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
             }
             mem_size: 256
           }
@@ -198,11 +250,29 @@ TEST(ThunkProtoDeserializationTest, DeviceToDeviceCopyThunk) {
         }
         device_to_device_copy_thunk {
           copy_thunk {
-            source_buffer { offset: 128 size: 384 buffer_allocation_index: 0 }
+            source_buffer {
+              slice { offset: 128 size: 384 buffer_allocation_index: 0 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
+            }
             destination_buffer {
-              offset: 0
-              size: 256
-              buffer_allocation_index: 1
+              slice { offset: 0 size: 256 buffer_allocation_index: 1 }
+              shape {
+                dimensions: 64
+                element_type: S32
+                is_dynamic_dimension: false
+                layout {
+                  minor_to_major: 0
+                  tail_padding_alignment_in_elements: 1
+                }
+              }
             }
             mem_size: 256
           }
@@ -243,18 +313,30 @@ TEST(ThunkProtoDeserializationTest, WhileThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { buffer_allocation_index: 0 }
-                destination_buffer { buffer_allocation_index: 1 }
-              }
-            }
-            thunks {
-              thunk_info {
-                profile_annotation: "profile_annotation"
-                execution_stream_id: 123
-              }
-              copy_thunk {
-                source_buffer { buffer_allocation_index: 1 }
-                destination_buffer { buffer_allocation_index: 2 }
+                source_buffer {
+                  slice { offset: 128 size: 384 buffer_allocation_index: 0 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
+                destination_buffer {
+                  slice { offset: 0 size: 256 buffer_allocation_index: 1 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
               }
             }
           }
@@ -265,8 +347,30 @@ TEST(ThunkProtoDeserializationTest, WhileThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { buffer_allocation_index: 2 }
-                destination_buffer { buffer_allocation_index: 3 }
+                source_buffer {
+                  slice { offset: 128 size: 384 buffer_allocation_index: 2 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
+                destination_buffer {
+                  slice { offset: 0 size: 256 buffer_allocation_index: 3 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
               }
             }
             thunks {
@@ -275,8 +379,30 @@ TEST(ThunkProtoDeserializationTest, WhileThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { buffer_allocation_index: 3 }
-                destination_buffer { buffer_allocation_index: 4 }
+                source_buffer {
+                  slice { offset: 128 size: 384 buffer_allocation_index: 3 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
+                destination_buffer {
+                  slice { offset: 0 size: 256 buffer_allocation_index: 4 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
               }
             }
           }
@@ -318,11 +444,29 @@ TEST(ThunkProtoDeserializationTest, ConditionalThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { offset: 0 size: 256 buffer_allocation_index: 0 }
+                source_buffer {
+                  slice { offset: 0 size: 256 buffer_allocation_index: 0 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
                 destination_buffer {
-                  offset: 1
-                  size: 257
-                  buffer_allocation_index: 1
+                  slice { offset: 1 size: 257 buffer_allocation_index: 1 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
                 }
               }
             }
@@ -332,11 +476,29 @@ TEST(ThunkProtoDeserializationTest, ConditionalThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { offset: 2 size: 258 buffer_allocation_index: 1 }
+                source_buffer {
+                  slice { offset: 2 size: 258 buffer_allocation_index: 1 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
                 destination_buffer {
-                  offset: 3
-                  size: 259
-                  buffer_allocation_index: 2
+                  slice { offset: 3 size: 259 buffer_allocation_index: 2 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
                 }
               }
             }
@@ -348,11 +510,29 @@ TEST(ThunkProtoDeserializationTest, ConditionalThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { offset: 4 size: 260 buffer_allocation_index: 2 }
+                source_buffer {
+                  slice { offset: 4 size: 260 buffer_allocation_index: 3 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
                 destination_buffer {
-                  offset: 5
-                  size: 261
-                  buffer_allocation_index: 3
+                  slice { offset: 5 size: 261 buffer_allocation_index: 3 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
                 }
               }
             }
@@ -362,11 +542,29 @@ TEST(ThunkProtoDeserializationTest, ConditionalThunk) {
                 execution_stream_id: 123
               }
               copy_thunk {
-                source_buffer { offset: 6 size: 262 buffer_allocation_index: 3 }
+                source_buffer {
+                  slice { offset: 6 size: 262 buffer_allocation_index: 3 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
+                }
                 destination_buffer {
-                  offset: 7
-                  size: 263
-                  buffer_allocation_index: 4
+                  slice { offset: 7 size: 263 buffer_allocation_index: 4 }
+                  shape {
+                    dimensions: 64
+                    element_type: S32
+                    is_dynamic_dimension: false
+                    layout {
+                      minor_to_major: 0
+                      tail_padding_alignment_in_elements: 1
+                    }
+                  }
                 }
               }
             }
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index b9a929cdf282fe..b5bde9888fb0f8 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -784,6 +784,7 @@ xla_cc_test(
         "//xla/backends/gpu/runtime:custom_kernel_thunk",
         "//xla/backends/gpu/runtime:kernel_thunk",
         "//xla/backends/gpu/runtime:sequential_thunk",
+        "//xla/backends/gpu/runtime:shaped_slice",
         "//xla/backends/gpu/runtime:thunk",
         "//xla/client:executable_build_options",
         "//xla/codegen/emitters:kernel_arguments",
diff --git a/third_party/xla/xla/service/gpu/gpu_executable_test.cc b/third_party/xla/xla/service/gpu/gpu_executable_test.cc
index 33483843b616cc..d4e884d50898b0 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable_test.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/custom_kernel_thunk.h"
 #include "xla/backends/gpu/runtime/kernel_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/client/executable_build_options.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -49,6 +50,7 @@ limitations under the License.
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/logical_buffer.h"
+#include "xla/shape.h"
 #include "xla/shape_layout.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
@@ -145,6 +147,7 @@ TEST(GpuExecutableTest, RunThunkPasses) {
   auto create_executable = [&]() {
     Thunk::ThunkInfo thunk_info;
     BufferAllocation alloc(0, 1024, 0);
+    Shape shape = ShapeUtil::MakeShape(S32, {256});
     BufferAllocation::Slice slice(&alloc, 0, 1024);
 
     ThunkSequence thunk_sequence;
@@ -157,7 +160,8 @@ TEST(GpuExecutableTest, RunThunkPasses) {
         /*shmem_bytes=*/0,
         /*tma_metadata=*/se::gpu::TmaMetadata()));
     thunk_sequence.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
-        thunk_info, slice, slice, 1024));
+        thunk_info, ShapedSlice{slice, shape}, ShapedSlice{slice, shape},
+        1024));
 
     GpuExecutable::Params params;
     params.executable = std::make_unique<SequentialThunk>(
@@ -391,6 +395,7 @@ TEST(GpuExecutableTest, DumpsMetadataListProto) {
   auto create_executable = [&]() {
     BufferAllocation alloc(0, 1024, 0);
     BufferAllocation::Slice slice(&alloc, 0, 1024);
+    Shape shape = ShapeUtil::MakeShape(S32, {256});
 
     ThunkSequence thunk_sequence;
     thunk_sequence.push_back(std::make_unique<KernelThunk>(
@@ -402,7 +407,8 @@ TEST(GpuExecutableTest, DumpsMetadataListProto) {
         /*shmem_bytes=*/0,
         /*tma_metadata=*/se::gpu::TmaMetadata()));
     thunk_sequence.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
-        ThunkInfoWithId(456), slice, slice, 1024));
+        ThunkInfoWithId(456), ShapedSlice{slice, shape},
+        ShapedSlice{slice, shape}, 1024));
 
     GpuExecutable::Params params;
     params.executable = std::make_unique<SequentialThunk>(
@@ -511,6 +517,8 @@ TEST(GpuExecutableTest, GpuExecutableDump) {
   auto create_executable = [&]() {
     ThunkSequence thunk_sequence;
     BufferAllocation::Slice slice(&alloc, 0, 1024);
+    Shape shape = ShapeUtil::MakeShape(S32, {256});
+
     thunk_sequence.push_back(std::make_unique<KernelThunk>(
         ThunkInfoWithId(123),
         /*kernel_name=*/"test_kernel",
@@ -520,7 +528,8 @@ TEST(GpuExecutableTest, GpuExecutableDump) {
         /*shmem_bytes=*/0,
         /*tma_metadata=*/se::gpu::TmaMetadata()));
     thunk_sequence.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
-        ThunkInfoWithId(456), slice, slice, 1024));
+        ThunkInfoWithId(456), ShapedSlice{slice, shape},
+        ShapedSlice{slice, shape}, 1024));
 
     GpuExecutable::Params params;
     params.executable = std::make_unique<SequentialThunk>(
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 3822ebc2438363..890f1c660302fa 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -1151,8 +1151,8 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitTriangularSolveCustomCall(
     thunks.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(
             instr, ir_emitter_context_->GetNextThunkId()),
-        /*source_buffer=*/b_slice,
-        /*destination_buffer=*/result_slice,
+        /*source_buffer=*/ShapedSlice{b_slice, b_shape},
+        /*destination_buffer=*/ShapedSlice{result_slice, b_shape},
         /*mem_size=*/ShapeUtil::ByteSizeOf(b_shape)));
   }
 
@@ -1438,8 +1438,8 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCopy(
   return GetThunkSequence(std::make_unique<DeviceToDeviceCopyThunk>(
       Thunk::ThunkInfo::WithProfileAnnotation(
           instr, ir_emitter_context_->GetNextThunkId()),
-      /*source_buffer=*/src_buffer,
-      /*destination_buffer=*/dst_buffer,
+      /*source_buffer=*/ShapedSlice{src_buffer, instr->operand(0)->shape()},
+      /*destination_buffer=*/ShapedSlice{dst_buffer, instr->shape()},
       /*mem_size=*/src_buffer.size()));
 }
 
@@ -1589,9 +1589,10 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitSort(
       thunks.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
           Thunk::ThunkInfo::WithProfileAnnotation(
               sort, ir_emitter_context_->GetNextThunkId()),
-          /*source_buffer=*/source_address,
-          /*destination_buffer=*/destination_buffer,
-          /*mem_size=*/
+          /*source_buffer=*/
+          ShapedSlice{source_address, sort->operand(i)->shape()},
+          /*destination_buffer=*/
+          ShapedSlice{destination_buffer, sort->operand(i)->shape()},
           ShapeUtil::ByteSizeOf(sort->operand(i)->shape())));
     }
   }
@@ -1676,10 +1677,12 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCollectivePermute(
                                                           : normal_shape_idx));
 
     const int64_t src_memory_space = operand_shape.layout().memory_space();
+    Shape result_buffer_shape = (result_shape.IsTuple())
+                                    ? result_shape.tuple_shapes(oprd_idx)
+                                    : result_shape;
+
     const int64_t dst_memory_space =
-        (result_shape.IsTuple())
-            ? result_shape.tuple_shapes(0).layout().memory_space()
-            : result_shape.layout().memory_space();
+        result_buffer_shape.layout().memory_space();
 
     TF_ASSIGN_OR_RETURN(BufferAllocation::Slice source_slice,
                         GetAllocationSliceForHlo(operand));
@@ -1690,8 +1693,9 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCollectivePermute(
       thunks.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
           Thunk::ThunkInfo::WithProfileAnnotation(
               instr, ir_emitter_context_->GetNextThunkId()),
-          /*source_buffer=*/source_slice,
-          /*destination_buffer=*/result_slice,
+          /*source_buffer=*/ShapedSlice{source_slice, operand_shape},
+          /*destination_buffer=*/
+          ShapedSlice{result_slice, result_buffer_shape},
           /*mem_size=*/ShapeUtil::ByteSizeOf(operand_shape)));
       // Signal that start thunk not created with nullptr.
       GetCollectivesAsyncEvents().try_emplace(instr, nullptr);
@@ -2097,8 +2101,9 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitDegeneratedCollectiveThunk(
     thunks.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(
             inst, ir_emitter_context_->GetNextThunkId()),
-        /*source_buffer=*/buffers[i].source_buffer,
-        /*destination_buffer=*/buffers[i].destination_buffer,
+        /*source_buffer=*/ShapedSlice{buffers[i].source_buffer, shape},
+        /*destination_buffer=*/
+        ShapedSlice{buffers[i].destination_buffer, shape},
         /*mem_size=*/ShapeUtil::ByteSizeOf(shape)));
   }
   if (thunks.size() == 1) {
@@ -2233,8 +2238,8 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCopyStartThunk(
     auto thunk = std::make_unique<DeviceToHostCopyThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(
             copy_start_instr, ir_emitter_context_->GetNextThunkId()),
-        /*source_buffer=*/src_buffer,
-        /*destination_buffer=*/dst_buffer,
+        /*source_buffer=*/ShapedSlice{src_buffer, input_shape},
+        /*destination_buffer=*/ShapedSlice{dst_buffer, input_shape},
         /*mem_size=*/ShapeUtil::ByteSizeOf(input_shape),
         /*copy_events=*/copy_events_,
         /*copy_start_instr=*/copy_start_instr);
@@ -2244,8 +2249,8 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCopyStartThunk(
     auto thunk = std::make_unique<HostToDeviceCopyThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(
             copy_start_instr, ir_emitter_context_->GetNextThunkId()),
-        /*source_buffer=*/src_buffer,
-        /*destination_buffer=*/dst_buffer,
+        /*source_buffer=*/ShapedSlice{src_buffer, input_shape},
+        /*destination_buffer=*/ShapedSlice{dst_buffer, input_shape},
         /*mem_size=*/ShapeUtil::ByteSizeOf(input_shape),
         /*copy_events=*/copy_events_,
         /*copy_start_instr=*/copy_start_instr);

From 7940664e6b72631d8d6d5a78b83802539a7f1931 Mon Sep 17 00:00:00 2001
From: Ilia Sergachev <isergachev@nvidia.com>
Date: Thu, 11 Dec 2025 12:00:05 -0800
Subject: [PATCH 186/753] PR #35098: [GPU] Wrap single instructions in fusions
 before autotuning.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35098

📝 Summary of Changes
Wrap single instructions into fusions before autotuning so that they get picked up.

🎯 Justification
Creating fusions out of single instructions earlier enables more autotuning.

🚀 Kind of Contribution
⚡️ Performance Improvement ♻️ Cleanup

📊 Benchmark (for Performance Improvements)

🧪 Unit Tests:
yes

🧪 Execution Tests:
no
Copybara import of the project:

--
9da7077726da5b5dc36f204ee004996313dd8102 by Ilia Sergachev <isergachev@nvidia.com>:

[GPU] Wrap single instructions in fusions before autotuning.

Merging this change closes #35098

PiperOrigin-RevId: 843319000
---
 .../codegen/triton/triton_gemm_fusion_test.cc | 33 +++++++++++--------
 .../xla/xla/service/gpu/gpu_compiler.cc       |  1 +
 .../xla/xla/service/gpu/gpu_compiler_test.cc  | 18 ++++++++++
 .../xla/xla/service/gpu/tests/dot_bf16.hlo    | 12 +++----
 .../gpu/tests/sub_byte_collectives.hlo        |  4 +--
 .../transforms/cublas_gemm_rewriter_test.cc   | 12 +++----
 .../gpu/transforms/gemm_rewriter_test.cc      |  8 ++---
 7 files changed, 56 insertions(+), 32 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
index 038a0ad926d729..ad4de9bf2257ac 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
@@ -685,7 +685,8 @@ ENTRY e {
 
   MatchOptimizedHlo(kHloText, R"(
 ; CHECK: ENTRY
-; CHECK: transpose
+; CHECK: fusion
+; CHECK-SAME: kind=kLoop
 ; CHECK: fusion
 ; CHECK-SAME: kind=kCustom
 ; CHECK-SAME: "__triton_nested_gemm_fusion"
@@ -710,7 +711,8 @@ ENTRY e {
 
   MatchOptimizedHlo(kHloText, R"(
 ; CHECK: ENTRY
-; CHECK: transpose
+; CHECK: fusion
+; CHECK-SAME: kind=kLoop
 ; CHECK: fusion
 ; CHECK-SAME: kind=kCustom
 ; CHECK-SAME: "__triton_nested_gemm_fusion"
@@ -1205,7 +1207,8 @@ ENTRY e {
 
   MatchOptimizedHlo(kHloText, R"(
 ; CHECK:      ENTRY
-; CHECK:      concatenate
+; CHECK:      fusion
+; CHECK-SAME:   kind=kLoop
 ; CHECK:      fusion
 ; CHECK-SAME:   kind=kCustom
 ; CHECK-SAME:   "__triton_nested_gemm_fusion"
@@ -1336,12 +1339,14 @@ ENTRY e {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           GetOptimizedModule(kHloText));
 
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Add(
-                  m::Fusion(m::Parameter(), m::Parameter())
-                      .WithFusionKind(HloInstruction::FusionKind::kCustom),
-                  m::Fusion(m::Parameter(), m::Parameter())
-                      .WithFusionKind(HloInstruction::FusionKind::kCustom))));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(
+          m::Fusion(m::Fusion(m::Parameter(), m::Parameter())
+                        .WithFusionKind(HloInstruction::FusionKind::kCustom),
+                    m::Fusion(m::Parameter(), m::Parameter())
+                        .WithFusionKind(HloInstruction::FusionKind::kCustom))
+              .WithFusionKind(HloInstruction::FusionKind::kLoop)));
 
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-2, /*arel=*/1e-2}));
 }
@@ -1511,10 +1516,12 @@ ENTRY e {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           GetOptimizedModule(kHloText));
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Sin(
-                  m::Fusion(m::Parameter(), m::Parameter())
-                      .WithFusionKind(HloInstruction::FusionKind::kCustom))));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(
+          m::Fusion(m::Fusion(m::Parameter(), m::Parameter())
+                        .WithFusionKind(HloInstruction::FusionKind::kCustom))
+              .WithFusionKind(HloInstruction::FusionKind::kLoop)));
 }
 
 // TODO(b/393299275): this should just be a fusion test and does not need to be
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index d949a15e15783a..d8b6a86f155288 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1487,6 +1487,7 @@ absl::Status GpuCompiler::OptimizeHloModule(
   TF_RETURN_IF_ERROR(RunAsyncDotPasses(hlo_module));
   {
     HloPassPipeline pipeline("autotune-fusion-emitters");
+    pipeline.AddPass<FusionWrapper>(gpu_target_config.device_description);
     TF_RETURN_IF_ERROR(AddFusionAutotuningPass(
         &pipeline, hlo_module, options, thread_pool.get_mutable(), stream_exec,
         &gpu_target_config, ShapeSizeBytesFunction()));
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 944a71ab6a7ee8..0e204cb3466ef4 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -455,6 +455,24 @@ TEST_F(PersistedAutotuningTest, WriteResultsOnEachCompilation) {
   }
 }
 
+TEST_F(PersistedAutotuningTest, SingleOperationGetsAutotuned) {
+  xla_gpu_dump_autotune_results_to_ = GetUniqueTempFilePath(".txt");
+
+  TF_EXPECT_OK(GetOptimizedModule(R"(
+e {
+  a = f32[64,128] parameter(0)
+  t = f32[128,64] transpose(a), dimensions={1,0}
+})")
+                   .status());
+
+  TF_ASSERT_OK_AND_ASSIGN(std::string autotune_results_str,
+                          ReadNonEmptyFile(xla_gpu_dump_autotune_results_to_));
+  AutotuneResults results;
+  EXPECT_TRUE(tsl::protobuf::TextFormat::ParseFromString(autotune_results_str,
+                                                         &results));
+  EXPECT_THAT(results.results(), Not(IsEmpty()));
+}
+
 int64_t CountCopies(const HloComputation& computation) {
   int64_t count = 0;
   for (const auto& instruction : computation.instructions()) {
diff --git a/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo b/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
index dd2a75881159ba..982d34501f4730 100644
--- a/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
+++ b/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
@@ -3,11 +3,11 @@
 // RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
 
 
-// CHECK-SM70: %[[convert1:.+]] = f32[1536,6144]{1,0} convert(%{{.+}})
-// CHECK-SM70: %[[convert2:.+]] = f32[32,1536]{1,0} convert(%{{.+}})
+// CHECK-SM70: %[[convert1:.+]] = f32[1536,6144]{1,0} fusion(%{{.+}})
+// CHECK-SM70: %[[convert2:.+]] = f32[32,1536]{1,0} fusion(%{{.+}})
 // CHECK-SM70: custom-call(%[[convert1]], %[[convert2]]), custom_call_target="__cublas$gemm"
 
-// CHECK-SM80: %[[convert:.+]] = bf16[1536,6144]{1,0} convert(%{{.+}})
+// CHECK-SM80: %[[convert:.+]] = bf16[1536,6144]{1,0} fusion(%{{.+}})
 // CHECK-SM80: %[[b:.+]] = bf16[32,1536]{1,0} parameter(1)
 // CHECK-SM80: custom-call(%[[convert]], %[[b]]), custom_call_target="__cublas$gemm"
 
@@ -22,11 +22,11 @@ ENTRY %computation1 {
 
 // -----
 
-// CHECK-SM70: %[[convert1:.+]] = f32[1536,6144]{1,0} convert(%{{.+}})
-// CHECK-SM70: %[[convert2:.+]] = f32[32,1536]{1,0} convert(%{{.+}})
+// CHECK-SM70: %[[convert1:.+]] = f32[1536,6144]{1,0} fusion(%{{.+}})
+// CHECK-SM70: %[[convert2:.+]] = f32[32,1536]{1,0} fusion(%{{.+}})
 // CHECK-SM70: (f32[6144,32]{1,0}, s8[4194304]{0}) custom-call(%[[convert1]], %[[convert2]]), custom_call_target="__cublas$gemm"
 
-// CHECK-SM80: %[[convert:.+]] = bf16[1536,6144]{1,0} convert(%{{.+}})
+// CHECK-SM80: %[[convert:.+]] = bf16[1536,6144]{1,0} fusion(%{{.+}})
 // CHECK-SM80: %[[b:.+]] = bf16[32,1536]{1,0} parameter(1)
 // CHECK-SM80: (f32[6144,32]{1,0}, s8[4194304]{0}) custom-call(%[[convert]], %[[b]]), custom_call_target="__cublas$gemm"
 
diff --git a/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo b/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
index 74a24914d4b718..fd858818c3dbcd 100644
--- a/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
+++ b/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
@@ -19,11 +19,11 @@ e {
 
 // CHECK-NOT: convert
 // CHECK:      s4[4,2]{1,0:E(4)} parameter
-// CHECK-NEXT: s4[2,4]{1,0:E(4)} transpose
+// CHECK:      s4[2,4]{1,0:E(4)} fusion(%{{.+}})
 // CHECK-NEXT: s8[2,2]{0,1} bitcast
 // CHECK:      s8[2,4]{0,1} all-gather-done
 // CHECK-NEXT: s4[4,4]{1,0:E(4)} bitcast
-// CHECK-NEXT: s4[4,4]{1,0:E(4)} transpose
+// CHECK:      s4[4,4]{1,0:E(4)} fusion(%{{.+}})
 
 // -----
 
diff --git a/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
index 16c26d533e50e0..4c412e16ade5e0 100644
--- a/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
@@ -583,7 +583,7 @@ ENTRY test {
     EXPECT_THAT(optimized_module->entry_computation()->root_instruction(),
                 GmockMatch(m::GetTupleElement(
                     m::CustomCall(m::Parameter(0), m::Parameter(1),
-                                  m::Negate(m::Parameter(2))),
+                                  m::Fusion(m::Parameter(2))),
                     0)));
   }
 }
@@ -625,7 +625,7 @@ ENTRY test {
     EXPECT_THAT(optimized_module->entry_computation()->root_instruction(),
                 GmockMatch(m::GetTupleElement(
                     m::CustomCall(m::Parameter(0), m::Parameter(1),
-                                  m::Negate(m::Parameter(2))),
+                                  m::Fusion(m::Parameter(2))),
                     0)));
   }
 }
@@ -932,7 +932,7 @@ ENTRY AddDotsFunc {
 ; CHECK-DAG:         "epilogue":"DEFAULT"
 ; CHECK:           }
 ; CHECK-NEXT:  [[GEMM:%[^ ]+]] = f32[1024,1024]{1,0} get-tuple-element([[GEMM_TUPLE]]), index=0
-; CHECK-NEXT:  ROOT [[OUT:%[^ ]+]] = f32[1024,1024]{1,0} add([[GEMM]], [[BIAS]])
+; CHECK:  ROOT [[OUT:%[^ ]+]] = f32[1024,1024]{1,0} fusion([[GEMM]], [[BIAS]]), kind=kLoop
 )");
 }
 
@@ -1399,7 +1399,7 @@ ENTRY test {
 ; CHECK-DAG:         "epilogue":"BIAS"
 ; CHECK:           }
 ; CHECK-NEXT:    [[GETTUPLE:%[^ ]+]] = f32[4,4]{1,0} get-tuple-element([[MATMUL]]), index=0
-; CHECK-NEXT:    ROOT [[OUT:%[^ ]+]] = f32[2,3]{1,0} slice([[GETTUPLE]]), slice={[0:2], [0:3]}
+; CHECK:    ROOT [[OUT:%[^ ]+]] = f32[2,3]{1,0} fusion([[GETTUPLE]]), kind=kLoop
       )");
 }
 
@@ -1775,7 +1775,7 @@ ENTRY test {
 ; CHECK-DAG:         "epilogue":"RELU"
 ; CHECK:           }
 ; CHECK:         [[MATMUL:%[^ ]+]] = f32[2,4]{1,0} get-tuple-element([[MATMUL_TUPLE]]), index=0
-; CHECK-NEXT:    ROOT [[OUT:%[^ ]+]] = f32[2,2]{1,0} slice([[MATMUL]]), slice={[0:2], [0:2]}
+; CHECK:    ROOT [[OUT:%[^ ]+]] = f32[2,2]{1,0} fusion([[MATMUL]]), kind=kLoop
       )");
 }
 
@@ -3335,7 +3335,7 @@ ENTRY test {
     EXPECT_THAT(optimized_module->entry_computation()->root_instruction(),
                 GmockMatch(m::GetTupleElement(
                     m::CustomCall(m::Parameter(0), m::Parameter(1),
-                                  m::Negate(m::Parameter(2))),
+                                  m::Fusion(m::Parameter(2))),
                     0)));
   }
 }
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
index 4ab0fb261d837c..449dfe481c19a6 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
@@ -585,10 +585,9 @@ ENTRY AddDotsFunc {
                     R"(
 ; CHECK-LABEL: ENTRY %AddDotsFunc ({{.*}}: f32[3,2,5], {{.*}}: f32[5,3,4]) -> f32[5,2,4] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[3,2,5]{2,1,0} parameter(0)
+; CHECK-DAG:     [[FUSION:%[^ ]+]] = f32[5,2,3]{2,1,0} fusion([[P0]])
 ; CHECK-DAG:     [[P1:%[^ ]+]] = f32[5,3,4]{2,1,0} parameter(1)
-; CHECK-DAG:     [[FUSION:%[^ ]+]] = f32[5,2,3]{2,1,0} transpose([[P0]])
-; CHECK-NEXT:    [[GEMM:%[^ ]+]] = {{.*}} custom-call([[FUSION]], [[P1]]),
-; CHECK:           custom_call_target="<<CUBLAS_CUSTOM_CALL_TARGET_PLACEHOLDER>>",
+; CHECK:         {{[^ ]+}} = {{.*}} custom-call([[FUSION]], [[P1]]), custom_call_target="<<CUBLAS_CUSTOM_CALL_TARGET_PLACEHOLDER>>",
 ; CHECK:           backend_config={
 ; CHECK-DAG:         "alpha_real":1
 ; CHECK-DAG:         "alpha_imag":0
@@ -1545,8 +1544,7 @@ ENTRY DotFunc {
 ; CHECK-LABEL: ENTRY %DotFunc ({{.*}}: f32[3,3], {{.*}}: f32[3,3]) -> f32[3,3] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[3,3]{1,0} parameter(0)
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[3,3]{1,0} parameter(1)
-; CHECK-NEXT:    [[GEMM:%[^ ]+]] = {{.*}} dot([[P0]], [[P1]]),
-; CHECK:           lhs_contracting_dims={1}, rhs_contracting_dims={0}
+; CHECK-NEXT:    ROOT {{[^ ]+}} = f32[3,3]{1,0} fusion([[P0]], [[P1]]), kind=kLoop
 )");
 }
 

From 2878fedcc765484eadd407e18476ffa4ceef9fba Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Thu, 11 Dec 2025 12:28:59 -0800
Subject: [PATCH 187/753] Support padding in convolution op with YNNPACK
 enabled.

PiperOrigin-RevId: 843329663
---
 .../xla/xla/backends/cpu/ynn_emitter.cc       | 57 ++++++++++++++++---
 .../xla/xla/backends/cpu/ynn_support.cc       |  8 ---
 2 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.cc b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
index 790e9f62c610e5..e70e7757577001 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
@@ -370,20 +370,54 @@ static ynn_status DefineBatchMatrixMultiply(ynn_subgraph_t subgraph,
 }
 
 static ynn_status DefineConvolution(
-    ynn_subgraph_t subgraph, uint32_t input1_id, uint32_t input2_id,
-    uint32_t output_id, const std::vector<int32_t>& stencil_axes,
+    ynn_subgraph_t subgraph, ynn_type input1_id_type, uint32_t input1_id,
+    uint32_t input2_id, uint32_t output_id,
+    const std::vector<int32_t>& stencil_axes,
     const std::vector<int32_t> new_axes,
     const std::vector<size_t>& stencil_dims,
     const std::vector<size_t>& stencil_strides,
-    const std::vector<size_t>& stencil_dilations) {
-  uint32_t padding_id = YNN_INVALID_VALUE_ID;
+    const std::vector<size_t>& stencil_dilations,
+    const std::vector<int64_t>& padding_lows,
+    const std::vector<int64_t>& padding_highs) {
   uint32_t stencil_id = YNN_INVALID_VALUE_ID;
 
+  ynn_status status;
+
+  // If any of paddings is not zero, define a padding value and pad the input.
+  if (absl::c_any_of(padding_lows, [](int32_t i) { return i != 0; }) ||
+      absl::c_any_of(padding_highs, [](int32_t i) { return i != 0; })) {
+    uint32_t padding_id = YNN_INVALID_VALUE_ID;
+
+    // Define padding value.
+    uint64_t padding_value = 0;
+    status = ynn_define_tensor_value(subgraph, input1_id_type,
+                                     /*rank=*/0, /*dims=*/nullptr,
+                                     /*data=*/&padding_value,
+                                     /*zero_point_id=*/YNN_INVALID_VALUE_ID,
+                                     /*scale_id=*/YNN_INVALID_VALUE_ID,
+                                     /*flags=*/YNN_VALUE_FLAG_COPY_DATA,
+                                     &padding_id);
+
+    if (status != ynn_status_success) {
+      return status;
+    }
+
+    uint32_t padded_id = YNN_INVALID_VALUE_ID;
+    status = ynn_define_static_pad(
+        subgraph, stencil_axes.size(), stencil_axes.data(), padding_lows.data(),
+        padding_highs.data(), input1_id, padding_id, &padded_id, /*flags=*/0);
+    if (status != ynn_status_success) {
+      return status;
+    }
+    input1_id = padded_id;
+    padding_id = YNN_INVALID_VALUE_ID;
+  }
+
   // Make a stenciled view of the input [n, h, w, ci] -> [n, h, w, kh, kw, ci].
-  ynn_status status = ynn_define_stencil_copy(
+  status = ynn_define_stencil_copy(
       subgraph, /*num_stencils=*/stencil_dims.size(), stencil_axes.data(),
       new_axes.data(), stencil_dims.data(), stencil_strides.data(),
-      stencil_dilations.data(), input1_id, padding_id, &stencil_id,
+      stencil_dilations.data(), input1_id, YNN_INVALID_VALUE_ID, &stencil_id,
       /*flags=*/0);
   if (status != ynn_status_success) {
     return status;
@@ -532,19 +566,24 @@ static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
   std::vector<size_t> stencil_dims(conv_window_dims_size);
   std::vector<size_t> stencil_strides(conv_window_dims_size);
   std::vector<size_t> stencil_dilations(conv_window_dims_size);
+  std::vector<int64_t> padding_lows(conv_window_dims_size);
+  std::vector<int64_t> padding_highs(conv_window_dims_size);
 
   for (size_t i = 0; i < conv_window.dimensions_size(); ++i) {
     stencil_axes[i] = conv_dimensions.input_spatial_dimensions(i);
     stencil_dims[i] = conv_window.dimensions(i).size();
     stencil_strides[i] = conv_window.dimensions(i).stride();
     stencil_dilations[i] = 1;
+    padding_lows[i] = conv_window.dimensions(i).padding_low();
+    padding_highs[i] = conv_window.dimensions(i).padding_high();
   }
 
   std::iota(new_axes.begin(), new_axes.end(), lhs_dims.size() - 1);
 
-  YNN_RETURN_IF_ERROR(DefineConvolution(subgraph.get(), lhs_id, rhs_id, out_id,
-                                        stencil_axes, new_axes, stencil_dims,
-                                        stencil_strides, stencil_dilations));
+  YNN_RETURN_IF_ERROR(
+      DefineConvolution(subgraph.get(), ynn_lhs_type, lhs_id, rhs_id, out_id,
+                        stencil_axes, new_axes, stencil_dims, stencil_strides,
+                        stencil_dilations, padding_lows, padding_highs));
 
   ynn_status status = ynn_optimize_subgraph(
       subgraph.get(), /*threadpool=*/nullptr, /*flags=*/0);
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index 43cbf1a4749c81..b455c00c7734fa 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -327,14 +327,6 @@ bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
     return false;
   }
 
-  // Only VALID padding for now.
-  if ((window.dimensions(0).padding_low() != 0) ||
-      (window.dimensions(0).padding_high() != 0) ||
-      (window.dimensions(1).padding_low() != 0) ||
-      (window.dimensions(1).padding_high() != 0)) {
-    return false;
-  }
-
   // No dilation for now.
   if ((window.dimensions(0).window_dilation() != 1) ||
       (window.dimensions(1).window_dilation() != 1) ||

From a32a58016e73281abdb21aa091b0e9f2461211af Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Thu, 11 Dec 2025 13:13:31 -0800
Subject: [PATCH 188/753] Remove redundant environment variables.

PiperOrigin-RevId: 843346350
---
 .bazelrc                           | 6 ------
 third_party/xla/tensorflow.bazelrc | 6 ------
 2 files changed, 12 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 898642bf6acbc4..14a2128d591243 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -637,12 +637,6 @@ common:rbe_linux_cpu --python_path="/usr/bin/python3"
 # These you may need to change for your own GCP project.
 common:rbe_linux_cpu --remote_instance_name=projects/tensorflow-testing/instances/default_instance
 
-# Download CUDA/CUDNN redistributions to preserve the repositories cache between
-# CPU and GPU builds.
-# TODO(ybaturina): Uncomment when RBE is ready to support this.
-common:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1
-common:rbe_linux_cpu --config=cuda_version
-
 # Deprecated RBE config with non-hermetic toolchains.
 common:rbe_linux_cpu_clang_local --config=rbe_linux_cpu
 common:rbe_linux_cpu_clang_local --config=clang_local
diff --git a/third_party/xla/tensorflow.bazelrc b/third_party/xla/tensorflow.bazelrc
index 16c8a9c1bfecc1..80b55cb9db8fd0 100644
--- a/third_party/xla/tensorflow.bazelrc
+++ b/third_party/xla/tensorflow.bazelrc
@@ -532,12 +532,6 @@ common:rbe_linux_cpu_clang_local --extra_toolchains="@local_config_cuda//crossto
 common:rbe_linux_cpu_clang_local --repo_env=CC="/usr/lib/llvm-18/bin/clang"
 common:rbe_linux_cpu_clang_local --repo_env=TF_SYSROOT="/dt9"
 
-# Download CUDA/CUDNN redistributions to preserve the repositories cache between
-# CPU and GPU builds.
-# TODO(ybaturina): Uncomment when RBE is ready to support this.
-# common:rbe_linux_cpu --repo_env USE_CUDA_REDISTRIBUTIONS=1
-# common:rbe_linux_cpu --config=cuda_version
-
 common:rbe_linux_cuda --config=cuda_clang_official
 common:rbe_linux_cuda --config=rbe_linux_cpu
 # dt9 is based on glibc 2.17, which is outdated and incompatible with CUDA 12.8.0

From 17d6e98b76810250de1b626c16607f5189137a52 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 13:39:03 -0800
Subject: [PATCH 189/753] [XLA:Original Value] Propagate OriginalValue in
 WhileLoopSimplifier when removing constant loop-carried values.

When constant tuple elements are removed from a while loop's loop-carried values, update the `OriginalValue` trees for the modified tuples (init, body root, cond root) and the new while instruction to reflect the shape changes. This ensures that debugging information remains accurate after simplification.

PiperOrigin-RevId: 843355843
---
 .../xla/xla/service/while_loop_simplifier.cc  |  32 ++++-
 .../xla/service/while_loop_simplifier_test.cc | 111 ++++++++++++++++++
 2 files changed, 140 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/while_loop_simplifier.cc b/third_party/xla/xla/service/while_loop_simplifier.cc
index 55c260fb08571d..2bf42ac92b32a2 100644
--- a/third_party/xla/xla/service/while_loop_simplifier.cc
+++ b/third_party/xla/xla/service/while_loop_simplifier.cc
@@ -875,7 +875,8 @@ static absl::StatusOr<bool> TryRemoveConstantParams(HloInstruction* while_op) {
   };
 
   // Returns a new tuple without the elements of constant_tuple_indices.
-  auto remove_constant_elems = [&](HloInstruction* instr) {
+  auto remove_constant_elems =
+      [&](HloInstruction* instr) -> std::unique_ptr<HloInstruction> {
     CHECK(ShapeUtil::Compatible(instr->shape(), while_shape));
 
     std::vector<HloInstruction*> tuple_elems;
@@ -886,10 +887,24 @@ static absl::StatusOr<bool> TryRemoveConstantParams(HloInstruction* while_op) {
                 while_shape.tuple_shapes(i), instr, i)));
       }
     }
-    return HloInstruction::CreateTuple(tuple_elems);
+    std::unique_ptr<HloInstruction> new_tuple =
+        HloInstruction::CreateTuple(tuple_elems);
+    if (instr->original_value()) {
+      auto new_ov = std::make_shared<OriginalValue>(new_tuple->shape());
+      int64_t new_i = 0;
+      for (int i = 0; i < while_shape.tuple_shapes().size(); ++i) {
+        if (!constant_tuple_indices.count(i)) {
+          CHECK_OK(new_ov->mutable_tree()->CopyCompatibleSubtreeFrom(
+              instr->original_value()->tree(), {i}, {new_i++}));
+        }
+      }
+      new_tuple->set_original_value(new_ov);
+    }
+    return new_tuple;
   };
 
-  auto add_constant_elems = [&](HloInstruction* instr) {
+  auto add_constant_elems =
+      [&](HloInstruction* instr) -> std::unique_ptr<HloInstruction> {
     CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
 
     std::vector<HloInstruction*> tuple_elems;
@@ -952,6 +967,17 @@ static absl::StatusOr<bool> TryRemoveConstantParams(HloInstruction* while_op) {
       module->AddEmbeddedComputation(std::move(new_while_cond)),
       module->AddEmbeddedComputation(std::move(new_while_body)),
       add_new_instr(remove_constant_elems(while_init))));
+  if (while_op->original_value()) {
+    auto new_ov = std::make_shared<OriginalValue>(new_while_op->shape());
+    int64_t new_i = 0;
+    for (int i = 0; i < while_shape.tuple_shapes().size(); ++i) {
+      if (!constant_tuple_indices.count(i)) {
+        CHECK_OK(new_ov->mutable_tree()->CopyCompatibleSubtreeFrom(
+            while_op->original_value()->tree(), {i}, {new_i++}));
+      }
+    }
+    new_while_op->set_original_value(new_ov);
+  }
   new_while_op->CopyBackendConfigFrom(while_op);
   CopyFrontendAttributes(while_op, new_while_op);
   CopyMetadata(while_op, new_while_op);
diff --git a/third_party/xla/xla/service/while_loop_simplifier_test.cc b/third_party/xla/xla/service/while_loop_simplifier_test.cc
index 24202035575301..9ee413ca8b655c 100644
--- a/third_party/xla/xla/service/while_loop_simplifier_test.cc
+++ b/third_party/xla/xla/service/while_loop_simplifier_test.cc
@@ -819,6 +819,35 @@ TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarry) {
               op::Tuple(op::Constant()));
 }
 
+TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarryWithOriginalValue) {
+  const std::string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1]) parameter(0)
+    a = s32[1] constant({0})
+    ROOT tuple = (s32[1]) tuple(a)
+  }
+  Cond {
+    param = (s32[1]) parameter(0)
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    init = (s32[1]) tuple(a), origin={({"a"})}
+    ROOT while = (s32[1]) while(init), condition=Cond, body=Body, origin={({"w"})}
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).value();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).value());
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(op::Constant()));
+  HloInstruction* root_instr = m->entry_computation()->root_instruction();
+  ASSERT_NE(root_instr->original_value(), nullptr);
+  EXPECT_EQ(root_instr->original_value()->ToString(), R"(({"w"}))");
+}
+
 TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
   const std::string hlo_string = R"(
   HloModule Test
@@ -1481,6 +1510,88 @@ ENTRY %main (arg.0: f32[3], arg.1: f32[2]) -> (f32[3], f32[2], f32[2], f32[3]) {
                         op::GetTupleElement(op::While(), 0)));
 }
 
+TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarryWithOriginalValue) {
+  const std::string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] get-tuple-element(param), index=0
+    a.1 = s32[1] add(a, a)
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({10,10,10})
+    ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a.1, b, c)
+  }
+  Cond {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] get-tuple-element(param), index=0
+    b = s32[2] get-tuple-element(param), index=1
+    c = s32[3] get-tuple-element(param), index=2
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({2,2,2})
+    init = (s32[1], s32[2], s32[3]) tuple(a,b,c), origin={({"a"},{"b"},{"c"})}
+    ROOT while = (s32[1], s32[2], s32[3]) while(init),
+      condition=Cond, body=Body, origin={({"w0"},{"w1"},{"w2"})}
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).value());
+  HloInstruction* while_instr = FindFirstWhile(m.get());
+  ASSERT_NE(while_instr->original_value(), nullptr);
+  EXPECT_EQ(while_instr->original_value()->ToString(), R"(({"w0"}, {"w2"}))");
+  HloInstruction* while_init = while_instr->while_init();
+  ASSERT_NE(while_init->original_value(), nullptr);
+  EXPECT_EQ(while_init->original_value()->ToString(), R"(({"a"}, {"c"}))");
+  HloInstruction* root_instr = m->entry_computation()->root_instruction();
+  ASSERT_NE(root_instr->original_value(), nullptr);
+  EXPECT_EQ(root_instr->original_value()->ToString(),
+            R"(({"w0"}, {"w1"}, {"w2"}))");
+}
+
+TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarryWithOriginalValue2) {
+  const std::string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] constant({1})
+    b = s32[2] get-tuple-element(param), index=1
+    b.1 = s32[2] add(b, b)
+    c = s32[3] constant({10,10,10})
+    ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a, b.1, c)
+  }
+  Cond {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] get-tuple-element(param), index=0
+    b = s32[2] get-tuple-element(param), index=1
+    c = s32[3] get-tuple-element(param), index=2
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({1})
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({10,10,10})
+    init = (s32[1], s32[2], s32[3]) tuple(a,b,c), origin={({"a"},{"b"},{"c"})}
+    ROOT while = (s32[1], s32[2], s32[3]) while(init),
+      condition=Cond, body=Body, origin={({"w0"},{"w1"},{"w2"})}
+  })";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).value());
+  HloInstruction* while_instr = FindFirstWhile(m.get());
+  ASSERT_NE(while_instr->original_value(), nullptr);
+  EXPECT_EQ(while_instr->original_value()->ToString(), R"(({"w1"}))");
+  HloInstruction* while_init = while_instr->while_init();
+  ASSERT_NE(while_init->original_value(), nullptr);
+  EXPECT_EQ(while_init->original_value()->ToString(), R"(({"b"}))");
+  HloInstruction* root_instr = m->entry_computation()->root_instruction();
+  ASSERT_NE(root_instr->original_value(), nullptr);
+  EXPECT_EQ(root_instr->original_value()->ToString(),
+            R"(({"w0"}, {"w1"}, {"w2"}))");
+}
+
 TEST_F(WhileLoopSimplifierTest, RemoveDeadTupleIndicesWithOriginalValue) {
   const std::string hlo_string = R"(
   HloModule dus

From b650c8ec468c0a73e515e8bd334696d815ef1f36 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Thu, 11 Dec 2025 13:55:12 -0800
Subject: [PATCH 190/753] Remove unused `CoordinationClientCache`.

PiperOrigin-RevId: 843362068
---
 .../xla/pjrt/distributed/coordination/BUILD   |  28 +-
 .../coordination/client_server_test.cc        |   8 +-
 .../coordination/coordination_client.h        |  15 -
 .../coordination/coordination_service.cc      |  61 +---
 .../coordination/coordination_service.h       |  14 +-
 ...ordination_service_recoverable_job_test.cc | 272 ------------------
 .../coordination/coordination_service_test.cc | 256 +++++------------
 .../coordination/grpc_coordination_client.cc  |  69 -----
 .../coordination/grpc_coordination_client.h   |   3 -
 .../preemption_sync_manager_test.cc           |   3 +-
 .../xla/xla/pjrt/distributed/service.cc       |   2 +-
 11 files changed, 92 insertions(+), 639 deletions(-)
 delete mode 100644 third_party/xla/xla/pjrt/distributed/coordination/coordination_service_recoverable_job_test.cc

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/BUILD b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
index 2f4d808af8c8f0..fc6c3a9001f0af 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
@@ -103,7 +103,9 @@ xla_cc_test(
         ":coordination_service",
         ":coordination_service_error_util",
         ":test_device_proto_cc",
+        "//xla/service:global_device_id",
         "//xla/tsl/distributed_runtime:call_options",
+        "//xla/tsl/distributed_runtime/coordination:coordination_service",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:status",
@@ -245,32 +247,6 @@ xla_cc_test(
     ],
 )
 
-xla_cc_test(
-    name = "coordination_service_recoverable_job_test",
-    srcs = ["coordination_service_recoverable_job_test.cc"],
-    deps = [
-        ":coordination_client",
-        ":coordination_service",
-        ":coordination_service_agent",
-        ":grpc_coordination_client",
-        ":grpc_coordination_service_impl",
-        "//xla/tsl/distributed_runtime/rpc:async_service_interface",
-        "//xla/tsl/lib/core:status_test_util",
-        "//xla/tsl/platform:env",
-        "//xla/tsl/platform:status",
-        "//xla/tsl/platform:test",
-        "//xla/tsl/protobuf:coordination_config_proto_cc_impl",
-        "@com_github_grpc_grpc//:grpc++",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/synchronization",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
 xla_cc_test(
     name = "client_server_test",
     size = "medium",
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
index 2cc16bd7d9bf76..07fd5fe468f415 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
@@ -115,8 +115,8 @@ class ClientServerTest : public ::testing::Test {
         config.mutable_coordinated_job_list()->Add();
     job->set_name("agent");
     job->set_num_tasks(num_nodes);
-    auto service = CoordinationService::Create(tsl::Env::Default(), config,
-                                               /*cache=*/nullptr);
+    auto service =
+        std::make_unique<CoordinationService>(tsl::Env::Default(), config);
     return config;
   }
 
@@ -160,8 +160,8 @@ class ClientServerTest : public ::testing::Test {
                              grpc::InsecureServerCredentials());
     // Set up the actual coordination service (where all the real logic
     // lives).
-    coord_service_ = CoordinationService::Create(tsl::Env::Default(), config,
-                                                 /*cache=*/nullptr);
+    coord_service_ =
+        std::make_unique<CoordinationService>(tsl::Env::Default(), config);
     // Set up threads and RPC service.
     coord_compute_pool_ = std::make_unique<tsl::thread::ThreadPool>(
         tsl::Env::Default(), "CoordinationServiceRpcHandler",
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
index 141fd0d69ae8a2..49313c4177e4bf 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
@@ -154,21 +154,6 @@ class CoordinationClient {
                                  tsl::StatusCallback done) = 0;
 };
 
-// Simple wrapper class that can be used to retrieve CoordinationClients.
-class CoordinationClientCache {
- public:
-  virtual ~CoordinationClientCache() = default;
-
-  // If the `target` names a remote task, returns a pointer of the
-  // CoordinationClient object wrapping that channel to the remote task.
-  virtual CoordinationClient* GetClient(const std::string& target) = 0;
-
-  // If the `target` names a remote task, returns an owned pointer of the
-  // CoordinationClient object wrapping that channel to the remote task.
-  virtual std::unique_ptr<CoordinationClient> GetOwnedClient(
-      const std::string& target) = 0;
-};
-
 }  // namespace xla
 
 #endif  // XLA_PJRT_DISTRIBUTED_COORDINATION_COORDINATION_CLIENT_H_
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
index 54a026aefb5b93..7df2ef53488c5f 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
@@ -201,10 +201,8 @@ bool CoordinationService::TaskState::IsDisconnectedBeyondGracePeriod() {
 }
 
 CoordinationService::CoordinationService(
-    tsl::Env* env, const CoordinationServiceConfig& config,
-    std::unique_ptr<CoordinationClientCache> client_cache)
-    : client_cache_(std::move(client_cache)),
-      env_(*env),
+    tsl::Env* env, const CoordinationServiceConfig& config)
+    : env_(*env),
       heartbeat_timeout_ms_([&config]() -> uint64_t {
         return config.heartbeat_timeout_in_ms() > 0
                    ? config.heartbeat_timeout_in_ms()
@@ -965,52 +963,7 @@ void CoordinationService::PropagateError(
     VLOG(3) << "All tasks are recoverable, skip propagating error.";
     return;
   }
-  // If there is no service-to-client connection, use error polling or stop
-  // the service.
-  if (client_cache_ == nullptr) {
-    SendErrorPollingResponseOrFailAllTasks(error);
-    return;
-  }
-
-  ReportErrorToTaskRequest request;
-  request.set_error_code(error.raw_code());
-  request.set_error_message(std::string(error.message()));
-  CoordinationServiceError* payload = request.mutable_error_payload();
-  payload->set_is_reported_error(is_reported_by_task);
-  tsl::CallOptions call_opts;
-  call_opts.SetTimeout(kServiceToClientTimeoutMs);
-  // TODO(b/369222279): This logic will be removed shortly, so we don't bother
-  // adding the full list of source tasks.
-  if (!source_tasks.empty()) {
-    *payload->mutable_source_task() = source_tasks[0];
-  }
-
-  std::vector<std::shared_ptr<absl::Notification>> notifications;
-
-  for (const auto& pair : cluster_state_) {
-    // Propagate error only to tasks that are connected
-    if (pair.second->GetState() != CoordinatedTaskState::TASKSTATE_CONNECTED) {
-      continue;
-    }
-    std::string task = pair.first;
-
-    CoordinationClient* client = client_cache_->GetClient(task);
-    auto response = std::make_shared<ReportErrorToTaskResponse>();
-    auto n = std::make_shared<absl::Notification>();
-    client->ReportErrorToTaskAsync(
-        &call_opts, &request, response.get(),
-        [response, n, task](const absl::Status& s) {
-          if (!s.ok()) {
-            LOG(ERROR) << "Encountered another error while reporting to "
-                       << task << ": " << s;
-          }
-          n->Notify();
-        });
-    notifications.push_back(n);
-  }
-  for (auto& n : notifications) {
-    n->WaitForNotification();
-  }
+  SendErrorPollingResponseOrFailAllTasks(error);
 }
 
 // Utility for normalizing structured config key string.
@@ -1125,13 +1078,6 @@ void CoordinationService::PollForErrorAsync(const CoordinatedTask& task,
     return;
   }
 
-  if (client_cache_ != nullptr) {
-    done(MakeCoordinationError(
-        absl::InternalError("Should not use error polling from service when "
-                            "there is service to client connection.")));
-    return;
-  }
-
   client_polling_for_error_ = true;
 
   if (!cluster_state_.contains(task_name)) {
@@ -1891,7 +1837,6 @@ void CoordinationService::SendErrorPollingResponseOrFailAllTasks(
   CHECK(!error.ok()) << "SendErrorPollingResponseOrFailAllTasks called with OK "
                         "status. Should always return an error.";
   // Should be called only when there is no service-to-client connection.
-  assert(client_cache_ == nullptr);
   if (IsClientPollingForError()) {
     LOG(ERROR)
         << "Use error polling to propagate the following error to all tasks: "
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
index 35e174af3a5228..e95bd9c2adf23d 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
@@ -89,15 +89,8 @@ class CoordinationService {
       absl::flat_hash_set<tensorflow::CoordinatedTask, CoordinatedTaskHash,
                           CoordinatedTaskEqual>;
 
-  static std::unique_ptr<CoordinationService> Create(
-      tsl::Env* env, const tensorflow::CoordinationServiceConfig& config,
-      std::unique_ptr<CoordinationClientCache> cache) {
-    return std::make_unique<CoordinationService>(env, config, std::move(cache));
-  }
-
   CoordinationService(tsl::Env* env,
-                      const tensorflow::CoordinationServiceConfig& config,
-                      std::unique_ptr<CoordinationClientCache> client_cache);
+                      const tensorflow::CoordinationServiceConfig& config);
 
   ~CoordinationService() {
     absl::MutexLock lock(state_mu_);
@@ -280,9 +273,7 @@ class CoordinationService {
                           GetAliveTasksCallback done);
 
   // Gets error from the coordination service. Block until the service
-  // returns an error or the task/service is shutdown. This should never be used
-  // when there is service to client connection (i.e. `CoordinationClientCache`
-  // is passed in during construction).
+  // returns an error or the task/service is shutdown.
   //
   // The first call to this function will trigger the error polling mode in the
   // coordination service, so once an error occurs after the first call, the
@@ -619,7 +610,6 @@ class CoordinationService {
   // such that NotifyWatchJobStateCallbacks should be called.
   void ClusterStateUpdated() ABSL_EXCLUSIVE_LOCKS_REQUIRED(state_mu_);
 
-  std::unique_ptr<CoordinationClientCache> client_cache_;
   tsl::Env& env_;
   const IncarnationId service_incarnation_{tsl::random::New64()};
   const uint64_t heartbeat_timeout_ms_;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_recoverable_job_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_recoverable_job_test.cc
deleted file mode 100644
index 8f2dfb02ab135d..00000000000000
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_recoverable_job_test.cc
+++ /dev/null
@@ -1,272 +0,0 @@
-/* Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#include <memory>
-#include <string>
-#include <utility>
-
-#include "absl/base/thread_annotations.h"
-#include "absl/container/flat_hash_map.h"
-#include "absl/log/log.h"
-#include "absl/memory/memory.h"
-#include "absl/status/status.h"
-#include "absl/synchronization/mutex.h"
-#include "grpcpp/server.h"
-#include "grpcpp/server_builder.h"
-#include "grpcpp/support/channel_arguments.h"
-#include "xla/pjrt/distributed/coordination/coordination_client.h"
-#include "xla/pjrt/distributed/coordination/coordination_service.h"
-#include "xla/pjrt/distributed/coordination/coordination_service_agent.h"
-#include "xla/pjrt/distributed/coordination/grpc_coordination_client.h"
-#include "xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h"
-#include "xla/tsl/distributed_runtime/rpc/async_service_interface.h"
-#include "xla/tsl/lib/core/status_test_util.h"
-#include "xla/tsl/platform/env.h"
-#include "xla/tsl/platform/status.h"
-#include "xla/tsl/platform/test.h"
-#include "xla/tsl/platform/threadpool.h"
-#include "xla/tsl/protobuf/coordination_config.pb.h"
-
-namespace xla {
-namespace {
-using tensorflow::CoordinatedJob;
-using tensorflow::CoordinationServiceConfig;
-
-constexpr char kParameterServerJobName[] = "parameter_server";
-constexpr char kWorkerJobName[] = "worker";
-constexpr char kCoordinationServiceType[] = "standalone";
-constexpr char kServiceLeader[] = "/job:parameter_server/replica:0/task:0";
-
-class TestCoordinationClientCache : public CoordinationClientCache {
- public:
-  void AddTask(const std::string& target, CoordinationClient* client) {
-    absl::MutexLock l(clients_mu_);
-    clients_.emplace(target, client);
-  }
-
-  CoordinationClient* GetClient(const std::string& target) override {
-    absl::MutexLock l(clients_mu_);
-    if (auto it = clients_.find(target); it != clients_.end()) {
-      return it->second;
-    }
-    return nullptr;
-  }
-
-  std::unique_ptr<CoordinationClient> GetOwnedClient(
-      const std::string& target) override {
-    LOG(ERROR) << "GetOwnedClient is not supported.";
-    return nullptr;
-  }
-
- private:
-  absl::Mutex clients_mu_;
-  absl::flat_hash_map<std::string, CoordinationClient*> clients_
-      ABSL_GUARDED_BY(clients_mu_);
-};
-
-class TestCoordinationServiceTaskState {
- public:
-  TestCoordinationServiceTaskState() = default;
-
-  ~TestCoordinationServiceTaskState() = default;
-
-  void Shutdown() {
-    coord_client_.reset();
-    coord_agent_.reset();
-    coord_compute_pool_.reset();
-    static_cast<GrpcCoordinationServiceImpl*>(coord_rpc_service_.get())
-        ->SetCoordinationServiceInstance(nullptr);
-    grpc_server_->Shutdown();
-    coord_rpc_service_->Shutdown();
-  }
-
-  void StartGrpcServer() {
-    ::grpc::ServerBuilder builder;
-    coord_compute_pool_ = std::make_unique<tsl::thread::ThreadPool>(
-        tsl::Env::Default(), /*name=*/"CoordinationServiceRpcHandler",
-        /*num_threads=*/5);
-    coord_rpc_service_ = std::make_unique<GrpcCoordinationServiceImpl>(
-        coord_compute_pool_.get(), &builder);
-    auto* grpc_coord_service =
-        static_cast<GrpcCoordinationServiceImpl*>(coord_rpc_service_.get());
-    grpc_coord_service->SetCoordinationServiceAgentInstance(coord_agent_.get());
-    grpc_server_ = builder.BuildAndStart();
-    coord_client_ = absl::WrapUnique(NewGrpcCoordinationClient(
-        grpc_server_->InProcessChannel(::grpc::ChannelArguments())));
-    coord_rpc_thread_ = absl::WrapUnique(tsl::Env::Default()->StartThread(
-        /*thread_options=*/{}, /*name=*/"CoordinationServiceHandleRPCsLoop",
-        [service = coord_rpc_service_.get()]() { service->HandleRPCsLoop(); }));
-  }
-
-  void SetCoordinationService(CoordinationService* service) {
-    auto* grpc_coord_service =
-        static_cast<GrpcCoordinationServiceImpl*>(coord_rpc_service_.get());
-    grpc_coord_service->SetCoordinationServiceInstance(service);
-  }
-
-  void InitializeAndConnectCoordinationAgents(
-      const std::string& job_name, int task_id,
-      const CoordinationServiceConfig& coordination_config) {
-    auto error_fn = [this, job_name](const absl::Status& status) {
-      this->status_ = status;
-      LOG(ERROR) << "Coordination service agent of " << job_name
-                 << " is in error status: " << status;
-    };
-
-    TF_CHECK_OK(coord_agent_->Initialize(tsl::Env::Default(), job_name, task_id,
-                                         coordination_config,
-                                         std::move(coord_client_), error_fn));
-    TF_CHECK_OK(coord_agent_->Connect());
-    TF_CHECK_OK(status_);
-  }
-
-  CoordinationClient* GetCoordinationClient() { return coord_client_.get(); }
-
-  absl::Status ReportError(const absl::Status& status) {
-    return coord_agent_->ReportError(status);
-  }
-
-  absl::Status GetStatus() const { return status_; }
-
- private:
-  std::unique_ptr<::grpc::Server> grpc_server_;
-  std::unique_ptr<tsl::thread::ThreadPool> coord_compute_pool_;
-  std::unique_ptr<tsl::AsyncServiceInterface> coord_rpc_service_;
-  std::unique_ptr<tsl::Thread> coord_rpc_thread_;
-  std::unique_ptr<CoordinationServiceAgent> coord_agent_ =
-      CreateCoordinationServiceAgent();
-  std::unique_ptr<CoordinationClient> coord_client_;
-  absl::Status status_;
-};
-
-class CoordinationServiceRecoverableJobTest : public ::testing::Test {
- public:
-  void SetUp() override {
-    state_ps_0_.StartGrpcServer();
-    state_ps_1_.StartGrpcServer();
-    state_worker_0_.StartGrpcServer();
-    state_worker_1_.StartGrpcServer();
-  }
-
-  void TearDown() override {
-    state_ps_0_.Shutdown();
-    state_ps_1_.Shutdown();
-    state_worker_0_.Shutdown();
-    state_worker_1_.Shutdown();
-    coord_service_.reset();
-  }
-
-  void Initialize() {
-    ConfigureCoordinationService();
-    auto client_cache = std::make_unique<TestCoordinationClientCache>();
-    client_cache->AddTask(
-        /*target=*/kServiceLeader, state_ps_0_.GetCoordinationClient());
-    client_cache->AddTask(
-        /*target=*/"/job:parameter_server/replica:0/task:1",
-        state_ps_1_.GetCoordinationClient());
-    client_cache->AddTask(
-        /*target=*/"/job:worker/replica:0/task:0",
-        state_worker_0_.GetCoordinationClient());
-    client_cache->AddTask(
-        /*target=*/"/job:worker/replica:0/task:1",
-        state_worker_1_.GetCoordinationClient());
-    coord_service_ = CoordinationService::Create(
-        tsl::Env::Default(), coordination_config_, std::move(client_cache));
-    // Set the service pointer for all the tasks since it is needed for handling
-    // error propagations. In reality, every task has its own service pointer.
-    // To mimic that, we need multi-process tests.
-    state_ps_0_.SetCoordinationService(coord_service_.get());
-    state_ps_1_.SetCoordinationService(coord_service_.get());
-    state_worker_0_.SetCoordinationService(coord_service_.get());
-    state_worker_1_.SetCoordinationService(coord_service_.get());
-    state_ps_0_.InitializeAndConnectCoordinationAgents(kParameterServerJobName,
-                                                       /*task_id=*/0,
-                                                       coordination_config_);
-    state_ps_1_.InitializeAndConnectCoordinationAgents(kParameterServerJobName,
-                                                       /*task_id=*/1,
-                                                       coordination_config_);
-    state_worker_0_.InitializeAndConnectCoordinationAgents(
-        kWorkerJobName,
-        /*task_id=*/0, coordination_config_);
-    state_worker_1_.InitializeAndConnectCoordinationAgents(
-        kWorkerJobName,
-        /*task_id=*/1, coordination_config_);
-  }
-
-  void ConfigureCoordinationService() {
-    // Assume the coordination service is deployed in the parameter server.
-    coordination_config_.set_service_type(kCoordinationServiceType);
-    coordination_config_.set_service_leader(kServiceLeader);
-    CoordinatedJob* ps =
-        coordination_config_.mutable_coordinated_job_list()->Add();
-    ps->set_name(kParameterServerJobName);
-    ps->set_num_tasks(2);
-    CoordinatedJob* worker =
-        coordination_config_.mutable_coordinated_job_list()->Add();
-    worker->set_name(kWorkerJobName);
-    worker->set_num_tasks(2);
-  }
-
-  void AddJobToRecoverableJobs(const std::string& job_name) {
-    coordination_config_.add_recoverable_jobs(job_name);
-  }
-
- protected:
-  CoordinationServiceConfig coordination_config_;
-  std::unique_ptr<CoordinationService> coord_service_;
-  TestCoordinationServiceTaskState state_ps_0_;
-  TestCoordinationServiceTaskState state_ps_1_;
-  TestCoordinationServiceTaskState state_worker_0_;
-  TestCoordinationServiceTaskState state_worker_1_;
-};
-
-TEST_F(CoordinationServiceRecoverableJobTest,
-       UnrecoverableWorkerFailurePropagated) {
-  Initialize();
-  TF_ASSERT_OK(state_worker_0_.ReportError(absl::InternalError("Test Error.")));
-
-  // For unrecoverable task, error propagates to all connected tasks.
-  EXPECT_TRUE(absl::IsInternal(state_ps_0_.GetStatus()));
-  EXPECT_TRUE(absl::IsInternal(state_ps_1_.GetStatus()));
-  EXPECT_TRUE(absl::IsInternal(state_worker_0_.GetStatus()));
-  EXPECT_TRUE(absl::IsInternal(state_worker_1_.GetStatus()));
-}
-
-TEST_F(CoordinationServiceRecoverableJobTest,
-       UnrecoverablePSFailurePropagated) {
-  Initialize();
-  TF_ASSERT_OK(state_ps_0_.ReportError(absl::InternalError("Test Error.")));
-
-  // For unrecoverable task, error propagates to all connected tasks.
-  EXPECT_TRUE(absl::IsInternal(state_ps_0_.GetStatus()));
-  EXPECT_TRUE(absl::IsInternal(state_ps_1_.GetStatus()));
-  EXPECT_TRUE(absl::IsInternal(state_worker_0_.GetStatus()));
-  EXPECT_TRUE(absl::IsInternal(state_worker_1_.GetStatus()));
-}
-
-TEST_F(CoordinationServiceRecoverableJobTest,
-       RecoverableWorkerFailureNotPropagated) {
-  AddJobToRecoverableJobs(kWorkerJobName);
-  Initialize();
-  TF_ASSERT_OK(state_worker_0_.ReportError(absl::InternalError("Test Error.")));
-
-  // For recoverable task, error does not propagate.
-  EXPECT_TRUE(state_ps_0_.GetStatus().ok());
-  EXPECT_TRUE(state_ps_1_.GetStatus().ok());
-  EXPECT_TRUE(absl::IsInternal(state_worker_0_.GetStatus()));
-  EXPECT_TRUE(state_worker_1_.GetStatus().ok());
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
index 55ee5a32ed1527..b201c1100da1c5 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <string>
-#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -43,7 +42,6 @@ limitations under the License.
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/status.h"
 #include "xla/tsl/platform/test.h"
-#include "xla/tsl/platform/types.h"
 #include "xla/tsl/protobuf/coordination_config.pb.h"
 #include "xla/tsl/protobuf/coordination_service.pb.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
@@ -68,7 +66,6 @@ using tensorflow::CoordinationServiceConfig;
 using tensorflow::DeviceInfo;
 using tensorflow::KeyValueEntry;
 using xla::TestDevice;
-using xla::TestDeviceList;
 
 constexpr absl::Duration kHeartbeatTimeout = absl::Seconds(2);
 constexpr absl::Duration kShutdownBarrierTimeout = absl::Milliseconds(500);
@@ -156,50 +153,25 @@ class TestCoordinationClient : public CoordinationClient {
   absl::Status status_ ABSL_GUARDED_BY(mu_);
 };
 
-class TestCoordinationClientCache : public CoordinationClientCache {
- public:
-  void AddTask(const std::string& target, CoordinationClient* client) {
-    clients_.emplace(target, client);
-  }
-
-  CoordinationClient* GetClient(const std::string& target) override {
-    auto it = clients_.find(target);
-    if (it == clients_.end()) return nullptr;
-    return it->second;
-  }
-
-  std::unique_ptr<CoordinationClient> GetOwnedClient(
-      const std::string& target) override {
-    LOG(ERROR) << "GetOwnedClient is not supported.";
-    return nullptr;
-  }
-
- private:
-  std::unordered_map<std::string, CoordinationClient*> clients_;
-};
-
 class CoordinationBarrierTest : public ::testing::Test {
  protected:
-  CoordinationBarrierTest() {
+  explicit CoordinationBarrierTest(bool recoverable = false) {
     // Set up fake cluster with 3 tasks.
     const int num_tasks = 3;
-    auto client_cache = std::make_unique<TestCoordinationClientCache>();
     for (int i = 0; i < num_tasks; ++i) {
       CoordinatedTask task;
       task.set_job_name("worker");
       task.set_task_id(i);
+      task.set_recoverable(recoverable);
 
       auto client = std::make_unique<TestCoordinationClient>();
-      client_cache->AddTask(absl::StrCat("/job:worker/replica:0/task:", i),
-                            client.get());
-
       tasks_.push_back(task);
       clients_.push_back(std::move(client));
     }
     CoordinationServiceConfig config = GetCoordinationServiceConfig(num_tasks);
 
-    coord_service_ = CoordinationService::Create(tsl::Env::Default(), config,
-                                                 std::move(client_cache));
+    coord_service_ =
+        std::make_unique<CoordinationService>(tsl::Env::Default(), config);
     // Register the tasks.
     for (int i = 0; i < num_tasks; ++i) {
       absl::Status s =
@@ -256,20 +228,12 @@ class CoordinateTwoTasksTest : public ::testing::Test {
 
   // Set up coordination service.
   void EnableCoordinationService(
-      bool has_service_to_client_connection = true,
       bool enable_shutdown_barrier = false,
       bool enable_register_barrier = false,
       bool set_worker_job_recoverable = false,
       bool allow_new_incarnation_to_reconnect = false) {
     CoordinationServiceConfig config =
         GetCoordinationServiceConfig(/*num_tasks=*/2);
-    auto client_cache = std::make_unique<TestCoordinationClientCache>();
-    if (has_service_to_client_connection) {
-      client_cache->AddTask("/job:worker/replica:0/task:0", &client_0_);
-      client_cache->AddTask("/job:worker/replica:0/task:1", &client_1_);
-    } else {
-      client_cache = nullptr;
-    }
     config.set_heartbeat_timeout_in_ms(kHeartbeatTimeout /
                                        absl::Milliseconds(1));
     if (set_worker_job_recoverable) {
@@ -288,8 +252,8 @@ class CoordinateTwoTasksTest : public ::testing::Test {
       config.set_allow_new_incarnation_to_reconnect(true);
     }
     // Init service.
-    coord_service_ = CoordinationService::Create(tsl::Env::Default(), config,
-                                                 std::move(client_cache));
+    coord_service_ =
+        std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   }
 
   CoordinatedTask task_0_;
@@ -342,8 +306,6 @@ TEST_F(CoordinateTwoTasksTest, TestStandaloneService) {
               StatusIs(absl::StatusCode::kAborted));
   EXPECT_THAT(coord_service_->RecordHeartbeat(task_1_, IncarnationId(0)),
               StatusIs(absl::StatusCode::kAborted));
-  // Error is propagated to other tasks.
-  EXPECT_THAT(client_0_.GetStatus(), StatusIs(absl::StatusCode::kAborted));
 }
 
 TEST(CoordinationServiceTest, TestCoordinatedJobs) {
@@ -369,18 +331,8 @@ TEST(CoordinationServiceTest, TestCoordinatedJobs) {
   worker_job->set_name("worker");
   worker_job->set_num_tasks(2);
 
-  auto client_cache = std::make_unique<TestCoordinationClientCache>();
-  TestCoordinationClient ci;
-  client_cache->AddTask("/job:chief/replica:0/task:0", &ci);
-  TestCoordinationClient wi0;
-  client_cache->AddTask("/job:worker/replica:0/task:0", &wi0);
-  TestCoordinationClient wi1;
-  client_cache->AddTask("/job:worker/replica:0/task:1", &wi1);
-  TestCoordinationClient ei;
-  client_cache->AddTask("/job:evaluator/replica:0/task:0", &ei);
-  std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  std::move(client_cache));
+  auto coord_service =
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
 
   // Each coordinated task registers and waits for other tasks.
   absl::Notification register_chief;
@@ -423,8 +375,7 @@ TEST(CoordinationServiceTest, RegisterTask_AlreadyConnected_Succeeds) {
   task_0.set_job_name("worker");
   task_0.set_task_id(0);
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  /*cache=*/nullptr);
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   // Task connects to coordination service.
   ASSERT_OK(coord_service->RegisterTask(task_0, IncarnationId(0)));
 
@@ -443,8 +394,7 @@ TEST(CoordinationServiceTest,
   task_0.set_job_name("worker");
   task_0.set_task_id(0);
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  /*cache=*/nullptr);
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   // Task connects to coordination service.
   ASSERT_OK(coord_service->RegisterTask(task_0, IncarnationId(0)));
 
@@ -464,8 +414,7 @@ TEST(CoordinationServiceTest, RegisterTask_AlreadyInError_Fails) {
   task_0.set_job_name("worker");
   task_0.set_task_id(0);
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  /*cache=*/nullptr);
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   // Task connects to coordination service.
   ASSERT_OK(coord_service->RegisterTask(task_0, IncarnationId(0)));
   // Arbitrarily set task to be in error.
@@ -495,7 +444,7 @@ TEST_F(CoordinateTwoTasksTest, TestTaskHeartbeatTimeout) {
 
 TEST_F(CoordinateTwoTasksTest,
        ErrorPollingRequestsGotCancelledErrorUponServiceShutdown) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   std::vector<absl::Status> statuses;
@@ -518,7 +467,7 @@ TEST_F(CoordinateTwoTasksTest,
 
 TEST_F(CoordinateTwoTasksTest,
        HeartbeatTimeoutWithoutServerToClientConnection) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
 
@@ -534,7 +483,7 @@ TEST_F(CoordinateTwoTasksTest,
 
 TEST_F(CoordinateTwoTasksTest,
        HeartbeatTimeoutErrorCanPropagateThroughErrorPolling) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   // Use notifications to guarantee the ordering of operations across threads.
@@ -565,7 +514,7 @@ TEST_F(CoordinateTwoTasksTest,
 
 TEST_F(CoordinateTwoTasksTest,
        HeartbeatTimeoutErrorFromOneTaskCanPropagateThroughErrorPolling) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   // Use notifications to guarantee the ordering of operations across threads.
@@ -603,7 +552,7 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, ReportedErrorCanPropagateThroughErrorPolling) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   std::vector<absl::Status> statuses;
@@ -630,8 +579,6 @@ TEST_F(CoordinateTwoTasksTest, TestTaskRestart) {
       task_1_, IncarnationId(tsl::random::New64()));
 
   EXPECT_THAT(s, StatusIs(absl::StatusCode::kAborted));
-  // Aborted error is also propagated to other tasks in cluster.
-  EXPECT_THAT(client_0_.GetStatus(), StatusIs(absl::StatusCode::kAborted));
 }
 
 tensorflow::CoordinatedTaskStateInfo info(
@@ -909,10 +856,8 @@ TEST_F(CoordinateTwoTasksTest, TestSetGetValues) {
 TEST(CoordinationServiceTest, TryGetKeyValue) {
   const CoordinationServiceConfig config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
-  auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  std::move(client_cache));
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
 
   // Try to get nonexistent key.
   absl::StatusOr<std::string> result =
@@ -933,10 +878,8 @@ TEST(CoordinationServiceTest, TryGetKeyValue) {
 TEST(CoordinationServiceTest, IncrementKeyValue) {
   const CoordinationServiceConfig config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
-  auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  std::move(client_cache));
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   ASSERT_OK(coord_service->InsertKeyValue("test_key", "1"));
   ASSERT_OK(coord_service->IncrementKeyValue("test_key", 3));
   ASSERT_OK_AND_ASSIGN(std::string result_0,
@@ -1044,10 +987,8 @@ TEST(CoordinationServiceTest, ListClusterDevices_TfDevice) {
   task_2.set_job_name("worker");
   task_2.set_task_id(2);
   absl::Status status = absl::OkStatus();
-  auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  std::move(client_cache));
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   absl::Notification n;
   // Map fake devices to each task.
   DeviceInfo local_devices_0;
@@ -1101,10 +1042,8 @@ TEST(CoordinationServiceTest, ListClusterDevices_DevicesAreNotAddedTwice) {
   task_1.set_task_id(1);
   absl::Status status = absl::OkStatus();
   absl::Status initial_wait_for_all_tasks_status;
-  auto client_cache = std::make_unique<TestCoordinationClientCache>();
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  std::move(client_cache));
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   absl::Notification n;
   // Map fake devices to each task.
   DeviceInfo local_devices_0;
@@ -1866,8 +1805,8 @@ TEST_F(CoordinateTwoTasksTest, Reset_HeartbeatsAreAcceptedForAGracePeriod) {
 }
 
 TEST_F(CoordinateTwoTasksTest, Reset_FailsOngoingBarrier) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   absl::Status barrier_status;
   absl::Notification barrier_n;
@@ -1887,8 +1826,8 @@ TEST_F(CoordinateTwoTasksTest, Reset_FailsOngoingBarrier) {
 }
 
 TEST_F(CoordinateTwoTasksTest, Shutdown_HeartbeatsAreAcceptedForAGracePeriod) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
 
   absl::Notification n;
@@ -1910,8 +1849,8 @@ TEST_F(CoordinateTwoTasksTest, Shutdown_HeartbeatsAreAcceptedForAGracePeriod) {
 }
 
 TEST_F(CoordinateTwoTasksTest, Shutdown_FailsOngoingBarrier) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   absl::Status barrier_status;
   absl::Notification barrier_n;
@@ -1936,8 +1875,8 @@ TEST_F(CoordinateTwoTasksTest, Shutdown_FailsOngoingBarrier) {
 }
 
 TEST_F(CoordinateTwoTasksTest, ShutdownWithBarrier_BarrierSucceeds) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/true);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Status barrier_status;
@@ -1960,8 +1899,8 @@ TEST_F(CoordinateTwoTasksTest, ShutdownWithBarrier_BarrierSucceeds) {
 
 TEST_F(CoordinateTwoTasksTest,
        ShutdownWithBarrier_BarrierFails_TaskDisconnectsOtherTaskIsAlerted) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/true);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Status barrier_status;
@@ -1984,16 +1923,12 @@ TEST_F(CoordinateTwoTasksTest,
               StatusIs(absl::StatusCode::kAborted));
   EXPECT_THAT(coord_service_->RegisterTask(task_0_, incarnation_1_),
               StatusIs(absl::StatusCode::kAborted));
-
-  // Other task is alerted that shutdown has been initiated without it.
-  absl::Status other_task_status = client_1_.GetStatus();
-  EXPECT_THAT(other_task_status, StatusIs(absl::StatusCode::kInternal));
 }
 
 TEST_F(CoordinateTwoTasksTest,
        ShutdownWithBarrier_BarrierFailsWithoutClientConnection_SetTaskToError) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false,
-                            /*enable_shutdown_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/true);
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Status barrier_status;
@@ -2020,7 +1955,7 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, BarrierFailsIfTaskIsInError) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Notification n0;
@@ -2043,7 +1978,7 @@ TEST_F(CoordinateTwoTasksTest, BarrierFailsIfTaskIsInError) {
 
 TEST_F(CoordinateTwoTasksTest,
        BarrierWithParticipatingTasksFailsIfTaskIsStale) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Notification n0;
@@ -2064,7 +1999,7 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, BarrierFailsAfterErrorPollingResponse) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   // Use notifications to guarantee the ordering of operations across threads.
@@ -2107,7 +2042,7 @@ TEST_F(CoordinateTwoTasksTest, BarrierFailsAfterErrorPollingResponse) {
 }
 
 TEST_F(CoordinateTwoTasksTest, BarrierWithSubsetFailsIfTaskIsStale) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Notification n0;
@@ -2130,27 +2065,11 @@ TEST_F(CoordinateTwoTasksTest, BarrierWithSubsetFailsIfTaskIsStale) {
   EXPECT_THAT(barrier_status, StatusIs(absl::StatusCode::kInternal));
 }
 
-TEST_F(CoordinateTwoTasksTest, UnrecoverableTaskPropagatesError) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/false,
-                            /*set_worker_job_recoverable=*/false);
-
-  TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
-  TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
-
-  ASSERT_OK(coord_service_->ReportTaskError(task_0_,
-                                            absl::InternalError("test_error")));
-
-  // For unrecoverable task, error propagates to all connected tasks.
-  EXPECT_THAT(client_1_.GetStatus(), StatusIs(absl::StatusCode::kInternal));
-}
-
 TEST_F(CoordinateTwoTasksTest, RecoverableTaskWillNotPropagateError) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/false,
-                            /*set_worker_job_recoverable=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/false,
+      /*set_worker_job_recoverable=*/true);
 
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
@@ -2165,10 +2084,10 @@ TEST_F(CoordinateTwoTasksTest, RecoverableTaskWillNotPropagateError) {
 
 TEST_F(CoordinateTwoTasksTest,
        RecoverableTaskWithErrorPollingWillNotPropagateError) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/false,
-                            /*set_worker_job_recoverable=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/false,
+      /*set_worker_job_recoverable=*/true);
   // These callbacks may be invoked after this test (e.g. cancellations during
   // coord service dtor), so we use shared pointers to extend their lifetimes
   // beyond the test to avoid use-after-free errors.
@@ -2212,8 +2131,7 @@ TEST(CoordinationServiceTest, RecoverableAndNonRecoverableTasks) {
   worker_job->set_num_tasks(2);
 
   std::unique_ptr<CoordinationService> coord_service =
-      CoordinationService::Create(tsl::Env::Default(), config,
-                                  /*cache=*/nullptr);
+      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
 
   // Each coordinated task registers and polls for errors.
   ASSERT_OK(coord_service->RegisterTask(chief, IncarnationId(0)));
@@ -2257,10 +2175,10 @@ TEST(CoordinationServiceTest, RecoverableAndNonRecoverableTasks) {
 
 TEST_F(CoordinateTwoTasksTest,
        RecoverableTaskReportErrorResetAndRegisterAgain) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/false,
-                            /*set_worker_job_recoverable=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/false,
+      /*set_worker_job_recoverable=*/true);
 
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   TF_EXPECT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
@@ -2282,11 +2200,11 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, UnavailableTaskCanReconnect) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/false,
-                            /*set_worker_job_recoverable=*/false,
-                            /*allow_new_incarnation_to_reconnect=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/false,
+      /*set_worker_job_recoverable=*/false,
+      /*allow_new_incarnation_to_reconnect=*/true);
 
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
 
@@ -2296,27 +2214,8 @@ TEST_F(CoordinateTwoTasksTest, UnavailableTaskCanReconnect) {
   TF_EXPECT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_new_));
 }
 
-TEST_F(CoordinateTwoTasksTest,
-       DoNotAllowPollForErrorIfHasServiceToClientConnection) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/true);
-  ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
-  ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
-  std::vector<absl::Status> statuses;
-  statuses.reserve(2);
-
-  for (const CoordinatedTask& task : {task_0_, task_1_}) {
-    coord_service_->PollForErrorAsync(
-        task, [&](const absl::Status& status) { statuses.push_back(status); });
-  }
-
-  // The error polling requests will get immediate error because there is
-  // service to client connection.
-  EXPECT_EQ(statuses.size(), 2);
-  EXPECT_THAT(statuses, Each(StatusIs(absl::StatusCode::kInternal)));
-}
-
 TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfNotInCluster) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   CoordinatedTask task_not_in_cluster;
   absl::Status s;
 
@@ -2328,7 +2227,7 @@ TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfNotInCluster) {
 }
 
 TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfTaskNotRegistered) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   absl::Status s;
 
   coord_service_->PollForErrorAsync(
@@ -2340,7 +2239,7 @@ TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfTaskNotRegistered) {
 
 TEST_F(CoordinateTwoTasksTest,
        AllowPollForErrorWithinGracePeriodIfTaskHasShutDown) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   absl::Status s;
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
@@ -2359,7 +2258,7 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfTaskHasShutDown) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   absl::Status s;
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
@@ -2378,7 +2277,7 @@ TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfTaskHasShutDown) {
 }
 
 TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorAfterReset) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   absl::Status s;
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->ResetTask(task_0_));
@@ -2393,7 +2292,7 @@ TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorAfterReset) {
 }
 
 TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorWhenInErrorState) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   absl::Status s;
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->ReportTaskError(task_0_,
@@ -2405,7 +2304,7 @@ TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorWhenInErrorState) {
 }
 
 TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfTaskIsStale) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   // No heartbeat for a while, leader consider the task as stale.
@@ -2422,7 +2321,7 @@ TEST_F(CoordinateTwoTasksTest, DoNotAllowPollForErrorIfTaskIsStale) {
 
 TEST_F(CoordinateTwoTasksTest,
        CanPropagateTaskRegistrationErrorThroughErrorPolling) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   absl::Status s0;
@@ -2440,7 +2339,7 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, LatePollingTaskCanGetError) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false);
+  EnableCoordinationService();
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
   std::vector<absl::Status> statuses;
@@ -2465,9 +2364,9 @@ TEST_F(CoordinateTwoTasksTest, LatePollingTaskCanGetError) {
 
 TEST_F(CoordinateTwoTasksTest,
        RegisterWithBarrier_OldHeartbeat_RestartedTasksCanReconnect) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/true);
   // Service restarted.
   // Old task 0 sends an unexpected heartbeat, which should fail.
   ASSERT_THAT(coord_service_->RecordHeartbeat(task_0_, incarnation_0_ - 1),
@@ -2488,9 +2387,9 @@ TEST_F(CoordinateTwoTasksTest,
 
 TEST_F(CoordinateTwoTasksTest,
        RegisterWithBarrier_RestartBeforeBarrier_Succeeds) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/true);
   absl::Status task0_status = absl::InternalError("uninitialized_status");
   absl::Status restarted_task0_status =
       absl::InternalError("uninitialized_status");
@@ -2514,9 +2413,9 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST_F(CoordinateTwoTasksTest, RegisterWithBarrier_RestartAfterBarrier_Fails) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/true);
   absl::Status task0_status = absl::InternalError("uninitialized_status");
   // Task 0 registers first.
   coord_service_->RegisterTaskAsync(
@@ -2544,16 +2443,19 @@ TEST_F(CoordinateTwoTasksTest, RegisterWithBarrier_RestartAfterBarrier_Fails) {
 }
 
 TEST_F(CoordinateTwoTasksTest, RegisterWithBarrier_Timeout) {
-  EnableCoordinationService(/*has_service_to_client_connection=*/false,
-                            /*enable_shutdown_barrier=*/false,
-                            /*enable_register_barrier=*/true);
+  EnableCoordinationService(
+      /*enable_shutdown_barrier=*/false,
+      /*enable_register_barrier=*/true);
   // Task 0 joins without task 1. Times out eventually as this function is
   // blocking.
   EXPECT_THAT(coord_service_->RegisterTask(task_0_, incarnation_0_),
               StatusIs(absl::StatusCode::kDeadlineExceeded));
 }
 
-using GetAliveTasksTest = CoordinationBarrierTest;
+class GetAliveTasksTest : public CoordinationBarrierTest {
+ public:
+  GetAliveTasksTest() : CoordinationBarrierTest(true) {}
+};
 
 TEST_F(GetAliveTasksTest, SuccessfulGetAliveTasks) {
   // This test has three tasks successfully call GetAliveTasks.
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
index bdcd5d76b579ea..ee5f5ebafc27cc 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
@@ -329,77 +329,8 @@ class GrpcCoordinationClient : public CoordinationClient {
   std::unique_ptr<GrpcCoordinationClientThread> client_thread_;
 };
 
-class GrpcCoordinationClientCache : public CoordinationClientCache {
- public:
-  explicit GrpcCoordinationClientCache(
-      std::shared_ptr<tsl::GrpcChannelCache> channel_cache)
-      : next_round_robin_assignment_(0),
-        channel_cache_(channel_cache),
-        threads_(4) {}
-
-  ~GrpcCoordinationClientCache() override = default;
-
-  CoordinationClient* GetClient(const std::string& target) override {
-    absl::MutexLock l(clients_mu_);
-    auto it = clients_.find(target);
-    if (it == clients_.end()) {
-      tsl::SharedGrpcChannelPtr channel =
-          channel_cache_->FindWorkerChannel(target);
-      if (channel == nullptr) {
-        VLOG(2) << "Coordination client for target " << target << " not found.";
-      }
-      int assigned_index = AssignClientToThread(target);
-      auto coord_client = std::make_unique<GrpcCoordinationClient>(
-          channel, threads_[assigned_index].completion_queue(), target);
-      it = clients_.emplace(target, std::move(coord_client)).first;
-    }
-    return it->second.get();
-  }
-
-  std::unique_ptr<CoordinationClient> GetOwnedClient(
-      const std::string& target) override {
-    tsl::SharedGrpcChannelPtr channel =
-        channel_cache_->FindWorkerChannel(target);
-    if (channel == nullptr) {
-      VLOG(2) << "Coordination client for target " << target << " not found.";
-    }
-    return std::make_unique<GrpcCoordinationClient>(channel, target);
-  }
-
- private:
-  absl::Mutex assignment_mu_;
-  std::unordered_map<std::string, size_t> target_assignments_
-      ABSL_GUARDED_BY(assignment_mu_);
-  size_t next_round_robin_assignment_ ABSL_GUARDED_BY(assignment_mu_);
-
-  size_t AssignClientToThread(const std::string& target) {
-    // Round-robin target assignment, but keeps the same target on the same
-    // polling thread always, as this is important for gRPC performance
-    absl::MutexLock l(assignment_mu_);
-    auto it = target_assignments_.find(target);
-    if (it == target_assignments_.end()) {
-      it = target_assignments_
-               .insert(std::make_pair(
-                   target, (next_round_robin_assignment_++) % threads_.size()))
-               .first;
-    }
-    return it->second;
-  }
-
-  std::shared_ptr<tsl::GrpcChannelCache> channel_cache_;
-  mutable absl::Mutex clients_mu_;
-  std::unordered_map<std::string, std::unique_ptr<CoordinationClient>> clients_
-      ABSL_GUARDED_BY(clients_mu_);
-  std::vector<GrpcCoordinationClientThread> threads_;
-};
-
 }  // namespace
 
-CoordinationClientCache* NewGrpcCoordinationClientCache(
-    std::shared_ptr<tsl::GrpcChannelCache> channel_cache) {
-  return new GrpcCoordinationClientCache(channel_cache);
-}
-
 CoordinationClient* NewGrpcCoordinationClient(
     std::shared_ptr<::grpc::Channel> channel) {
   return new GrpcCoordinationClient(channel, /*target=*/"coordination_service");
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.h b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.h
index 0661b3a7b18df8..a1c41ec4484e4e 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.h
@@ -23,9 +23,6 @@ limitations under the License.
 
 namespace xla {
 
-CoordinationClientCache* NewGrpcCoordinationClientCache(
-    std::shared_ptr<tsl::GrpcChannelCache> channel);
-
 CoordinationClient* NewGrpcCoordinationClient(
     std::shared_ptr<::grpc::Channel> channel);
 
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
index 6db1efa3849749..c8edb09a60bcd0 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
@@ -149,8 +149,7 @@ class PreemptionSyncManagerTest : public ::testing::Test {
     CoordinatedJob* job = config.mutable_coordinated_job_list()->Add();
     job->set_name(kJobName);
     job->set_num_tasks(2);
-    return CoordinationService::Create(tsl::Env::Default(), config,
-                                       /*cache=*/nullptr);
+    return std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   }
   void InitializeAndConnectCoordinationAgents() {
     std::unique_ptr<CoordinationClient> coord_client =
diff --git a/third_party/xla/xla/pjrt/distributed/service.cc b/third_party/xla/xla/pjrt/distributed/service.cc
index 1f1a58f326c21b..c5688fc14e8e37 100644
--- a/third_party/xla/xla/pjrt/distributed/service.cc
+++ b/third_party/xla/xla/pjrt/distributed/service.cc
@@ -52,7 +52,7 @@ std::unique_ptr<xla::CoordinationService> EnableCoordinationService(
   job->set_name(job_name);
   job->set_num_tasks(options.num_nodes);
   auto service =
-      xla::CoordinationService::Create(options.env, config, /*cache=*/nullptr);
+      std::make_unique<xla::CoordinationService>(options.env, config);
   return service;
 }
 }  // namespace

From 2817085d03d119181d4a8091957e7f49841028f1 Mon Sep 17 00:00:00 2001
From: Felix Wang <wfelix@google.com>
Date: Thu, 11 Dec 2025 14:32:47 -0800
Subject: [PATCH 191/753] Enhance `collective_perf_table_gen_main` to merge
 collective performance profiles and generate the new perf table profile
 including collective permutes.

This functionality can update the `kDefaultCollectivePTable` in `collective_interpolator_data.h`. The tool reads the current profiles from the header, merges them with new profiles provided as a file or directory of `.pbtxt` files, and writes the combined data back into the header file, replacing the existing proto string. This helps adding new collective into the perftable without changing existing perf table.

PiperOrigin-RevId: 843377472
---
 third_party/xla/xla/service/gpu/model/BUILD   |     7 +
 .../gpu/model/collective_interpolator_data.h  | 13054 ++++++++++------
 third_party/xla/xla/tools/BUILD               |     9 +
 .../tools/collective_perf_table_gen_main.cc   |   132 +-
 4 files changed, 8434 insertions(+), 4768 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/model/BUILD b/third_party/xla/xla/service/gpu/model/BUILD
index 7ad8e03a450351..ac73cffff1ef0d 100644
--- a/third_party/xla/xla/service/gpu/model/BUILD
+++ b/third_party/xla/xla/service/gpu/model/BUILD
@@ -921,6 +921,12 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "collective_interpolator_data",
+    hdrs = ["collective_interpolator_data.h"],
+    compatible_with = get_compatible_with_portable(),
+)
+
 cc_library(
     name = "collective_interpolator",
     srcs = ["collective_interpolator.cc"],
@@ -929,6 +935,7 @@ cc_library(
         "collective_interpolator_data.h",
     ],
     deps = [
+        ":collective_interpolator_data",
         ":gpu_hlo_cost_analysis",
         ":hlo_op_profile_proto_cc",
         ":hlo_op_profiles",
diff --git a/third_party/xla/xla/service/gpu/model/collective_interpolator_data.h b/third_party/xla/xla/service/gpu/model/collective_interpolator_data.h
index 5dfdbaa6591553..ac9a6fc3571738 100644
--- a/third_party/xla/xla/service/gpu/model/collective_interpolator_data.h
+++ b/third_party/xla/xla/service/gpu/model/collective_interpolator_data.h
@@ -23,41 +23,54 @@ limitations under the License.
 // BEGIN_DEFAULT_PERF_TABLE
 constexpr char kDefaultCollectivePTable[] = R"pb(
   entries {
-    key: "sm_90"
+    key: "sm_100"
     value {
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 20765736
+        fingerprint: "82724b215353fdf447c8f5867b927fe2"
+        network_throughput_bytes_per_sec: 14185281385
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -67,42 +80,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 13787160
+        fingerprint: "70a080ed258662e7a7c448a580386531"
+        network_throughput_bytes_per_sec: 155528554
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 8344198
+        fingerprint: "8c4f72b22cf1c427b5192fde2275b82d"
+        network_throughput_bytes_per_sec: 75804898766
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -112,43 +138,54 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 32064128
+        fingerprint: "b119f67d214e8219e6b672422c7ff82d"
+        network_throughput_bytes_per_sec: 4940892641
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 15873015
+        fingerprint: "30228c58f0f8bfd498c30a0b4c75491e"
+        network_throughput_bytes_per_sec: 523521123354
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -158,20 +195,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 18593840
+        fingerprint: "6a6da3f8a701c6de63d3f3eff5a326d0"
+        network_throughput_bytes_per_sec: 307692307
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 32
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -181,175 +224,233 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 47904191
+        fingerprint: "4f39fbed3ed5b26fd8f01ceb12a6958e"
+        network_throughput_bytes_per_sec: 11359602
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 64
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 28243601
+        fingerprint: "d99a31a5731cd087d2a57a04dbeda416"
+        network_throughput_bytes_per_sec: 277694915254
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 128
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 11838697
+        fingerprint: "991c99cf49ca7ed01041e4151f354da3"
+        network_throughput_bytes_per_sec: 71111111
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 10571522
+        fingerprint: "8dd875a32ca9280cac2fb6c8b4a3f900"
+        network_throughput_bytes_per_sec: 2458583433
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 7492390
+        fingerprint: "36c99c9ecd2afb910616dba7e7604d76"
+        network_throughput_bytes_per_sec: 1120350109
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 7626310
+        fingerprint: "9efe18a44677d5ebf6d950d12f0105d0"
+        network_throughput_bytes_per_sec: 129005394058
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 56939501
+        fingerprint: "b6cdb267bfc64ec38e1e2740095c8805"
+        network_throughput_bytes_per_sec: 7111111111
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 90523338
+        fingerprint: "27a51e9c9148298fd01ee900e6a81c2c"
+        network_throughput_bytes_per_sec: 851808285946
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -359,66 +460,83 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 122137404
+        fingerprint: "22444f0e312a3499cbfd75eaf67c0888"
+        network_throughput_bytes_per_sec: 127937530502
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 58447488
+        fingerprint: "bee86716cff01212fb687be758fc96e1"
+        network_throughput_bytes_per_sec: 520870550009
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 104746317
+        fingerprint: "e67540b745e062c37cb2d5e38a645a43"
+        network_throughput_bytes_per_sec: 363836224843
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -428,88 +546,139 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 134736842
+        fingerprint: "02f84dcfcb5697b10aa0548ff15c1379"
+        network_throughput_bytes_per_sec: 7074265975
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 64
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "260f965f1a4678225622c8bb1bb605bd"
+        network_throughput_bytes_per_sec: 117960007874
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
           }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 55220017
+        fingerprint: "f6f986332989bc7ea465794f1c2b6856"
+        network_throughput_bytes_per_sec: 536489870069
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 128
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 51990251
+        fingerprint: "4fd09bd390682728f7da420003fb7c37"
+        network_throughput_bytes_per_sec: 533889468
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 154216867
+        fingerprint: "f9d967d15c65b7c80d2055c0c6dbf3c6"
+        network_throughput_bytes_per_sec: 333722196805
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -519,41 +688,57 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 53691275
+        fingerprint: "bb6b916563714f9bbad245e71a3d56e3"
+        network_throughput_bytes_per_sec: 32031280547
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 90267983
+        fingerprint: "c04d9b7c7ac56f98fcf87ee9c131ab68"
+        network_throughput_bytes_per_sec: 733454255330
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -563,41 +748,57 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 66528066
+        fingerprint: "ef5d4bc6c48c17023f1713afce3c88c3"
+        network_throughput_bytes_per_sec: 26947368421
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 125984251
+        fingerprint: "644ed15c889d04ca582b384ff68fc71e"
+        network_throughput_bytes_per_sec: 380952380
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -607,65 +808,88 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 127617148
+        fingerprint: "5b2d32958f53fbfe8142848551afad7c"
+        network_throughput_bytes_per_sec: 9683215130
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 221837088
+        fingerprint: "498af7a3213702edfadb727477672515"
+        network_throughput_bytes_per_sec: 25051987767
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 140814081
+        fingerprint: "6b3fd8cf011b133409ba2ce19f78aed7"
+        network_throughput_bytes_per_sec: 6239146991
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -675,44 +899,53 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 225749559
+        fingerprint: "f9040b578d0f9eba41c4ac1a07ee4224"
+        network_throughput_bytes_per_sec: 44582312925
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 209492635
+        fingerprint: "ea38ae3f2a296149dde0cac2c673fd8f"
+        network_throughput_bytes_per_sec: 429631462026
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 128
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
               num_devices_per_group: 8
@@ -721,20 +954,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 136606189
+        fingerprint: "8154a3ab4411af0b0a94fc14f69ac096"
+        network_throughput_bytes_per_sec: 101057825751
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -744,42 +984,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 160200250
+        fingerprint: "64fc8a1589366b418b88651876990852"
+        network_throughput_bytes_per_sec: 590414414414
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
-        }
-        network_throughput_bytes_per_sec: 342245989
-      }
-      entries {
-        instruction {
-          opcode: "all-to-all"
-          shape {
-            element_type: F32
-            dimensions: 1024
-            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
-            is_dynamic_dimension: false
-          }
-          dimensions: 0
-          channel_id: 1
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -789,63 +1015,82 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 101426307
+        fingerprint: "8abe97935f10f37b31406be7fe615de0"
+        network_throughput_bytes_per_sec: 262026612
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 156670746
+        fingerprint: "61c2170cfeaf7234b58eea7d3c5cc136"
+        network_throughput_bytes_per_sec: 12709395908
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 70368334
+        fingerprint: "9f6af0d6b827293d0386ca2378e5d71b"
+        network_throughput_bytes_per_sec: 22743425
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -855,41 +1100,56 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 253968253
+        fingerprint: "b8e3907c6dfb227acf1602dcedd1dfae"
+        network_throughput_bytes_per_sec: 695270135305
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 228980322
+        fingerprint: "83d2f5e591feefe0553a301d532b898f"
+        network_throughput_bytes_per_sec: 337325398101
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -899,43 +1159,57 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 254726368
+        fingerprint: "413eb782ec4e2c4409f95026e7b720e6"
+        network_throughput_bytes_per_sec: 48617210682
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 162539682
+        fingerprint: "ca28c1cb14c2361c8daf298f6de5fc47"
+        network_throughput_bytes_per_sec: 102480062548
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -945,20 +1219,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 275565123
+        fingerprint: "c8859ac88de21b0d40acf7b94c89a34e"
+        network_throughput_bytes_per_sec: 757137293394
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -968,20 +1249,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 709141274
+        fingerprint: "e5dc26fcdbdb577aa5941155f0d4fc57"
+        network_throughput_bytes_per_sec: 10252816020
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -991,153 +1278,205 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 277657266
+        fingerprint: "85bb8349a62442dcab56384b99cbe6d0"
+        network_throughput_bytes_per_sec: 511875030510
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 343163538
+        fingerprint: "6632fafd05450c9ca5bdd86c67f7cc0a"
+        network_throughput_bytes_per_sec: 1452482269
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 725212464
+        fingerprint: "c6fbc0a09d2d44806949eef49196c7a2"
+        network_throughput_bytes_per_sec: 639200998
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 251226692
+        fingerprint: "4ffa457833b6c4ff6e5147781e31302a"
+        network_throughput_bytes_per_sec: 113817915388
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 280087527
+        fingerprint: "ad9a13d7c03557ae4a78b547c910aaa9"
+        network_throughput_bytes_per_sec: 18244988864
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 536687631
+        fingerprint: "2e4edd9d5f901a539189122797458ea4"
+        network_throughput_bytes_per_sec: 24526946107
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 471889400
+        fingerprint: "e39d4c38abe32c0b8bf790196f492d26"
+        network_throughput_bytes_per_sec: 383216445865
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -1147,201 +1486,257 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 519796954
+        fingerprint: "ca3eb58708ad6e9c3551f90b9d193653"
+        network_throughput_bytes_per_sec: 227654363873
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 676354029
+        fingerprint: "0f527662a7b68694b48d50d10a297e0b"
+        network_throughput_bytes_per_sec: 3205007824
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 532224532
+        fingerprint: "4351be00ad096ee1fcfe565c2215c7dd"
+        network_throughput_bytes_per_sec: 693502645502
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 1160997732
+        fingerprint: "6ab6cfdfc119a9143b046bd2262766d6"
+        network_throughput_bytes_per_sec: 102687525
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 1406593406
+        fingerprint: "aa53481ae940f8be73d0809b81bd85ee"
+        network_throughput_bytes_per_sec: 28971790125
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 497570456
+        fingerprint: "84086fea224a69018a6bcf0db282b861"
+        network_throughput_bytes_per_sec: 584490523968
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 537250786
+        fingerprint: "695c9c2e1a16cd287cd6b80d66c3cf24"
+        network_throughput_bytes_per_sec: 170638893409
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 1142857142
+        fingerprint: "0120a2fd4590718b617dbde5030314f0"
+        network_throughput_bytes_per_sec: 111888111
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 503937007
+        fingerprint: "b398f3b5618fef0e8beefc3d9fb45eee"
+        network_throughput_bytes_per_sec: 292082451253
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -1351,41 +1746,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 393543428
+        fingerprint: "a9f1ccc1dcdd0ea4b6d345c626fc0464"
+        network_throughput_bytes_per_sec: 27777777
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 756277695
+        fingerprint: "44d6bfd785c449cc8640234e220f77aa"
+        network_throughput_bytes_per_sec: 10613547107
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -1395,87 +1804,117 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 870008496
+        fingerprint: "b51292fa84bc7fc3b5a42c808ed0538a"
+        network_throughput_bytes_per_sec: 650456170278
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 1400820793
+        fingerprint: "19bfe133a0d02c4cac3fe71ed6e3e741"
+        network_throughput_bytes_per_sec: 273208963001
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 2285714285
+        fingerprint: "d04da75eb576bed3d4db82103261bf34"
+        network_throughput_bytes_per_sec: 6370139968
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 1777777777
+        fingerprint: "c986af5a1df1d20f73d7d40cf5b1e067"
+        network_throughput_bytes_per_sec: 1066666666
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -1485,20 +1924,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 2343249427
+        fingerprint: "d71debdb12a8986a0f288def2a8ac093"
+        network_throughput_bytes_per_sec: 390095238095
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -1508,132 +1954,170 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 2813186813
+        fingerprint: "d6ceb25936203837f994d4ea62fccbcb"
+        network_throughput_bytes_per_sec: 228367528
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 822489959
+        fingerprint: "a83d894078ce2cfadd898cfbdd4955ea"
+        network_throughput_bytes_per_sec: 16094302554
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 722143864
+        fingerprint: "a9e70188f014a7fd5d3664cb93a8ceea"
+        network_throughput_bytes_per_sec: 333550488
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 867796610
+        fingerprint: "04a5265203d9bcb5e98c819340f31d6c"
+        network_throughput_bytes_per_sec: 66651369003
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 503937007
+        fingerprint: "99043b67c066b466fdfabf4ba0e10d9d"
+        network_throughput_bytes_per_sec: 246925488
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 1109425785
+        fingerprint: "10f1eac7685082516c77e28c1c570603"
+        network_throughput_bytes_per_sec: 749183531303
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -1643,63 +2127,85 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 713091922
+        fingerprint: "62c4fc6430ca1eb1c9da917231cf7c2c"
+        network_throughput_bytes_per_sec: 424438777575
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 1650282030
+        fingerprint: "f7fd5b7bdf4d97b0eb10f5fbab3117c5"
+        network_throughput_bytes_per_sec: 14234578627
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 3631205673
+        fingerprint: "a5c25d90d3703c3e05a5428a5fbafe10"
+        network_throughput_bytes_per_sec: 1961450975
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -1709,89 +2215,114 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 4302521008
+        fingerprint: "35882d22990344fadfe4e45b8e2721eb"
+        network_throughput_bytes_per_sec: 478822324015
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 3524956970
+        fingerprint: "0f9cdfcb5a2647c85e89d23875e02e61"
+        network_throughput_bytes_per_sec: 70450636
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 4729792147
+        fingerprint: "71fef15b131813a4d472ccf5528d373b"
+        network_throughput_bytes_per_sec: 210526315
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 5535135135
+        fingerprint: "c9746f1c866d390a80ecaa1cd0747467"
+        network_throughput_bytes_per_sec: 213494044589
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -1801,43 +2332,56 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 1740016992
+        fingerprint: "d0a5ced62829c6ceb591eb442eb1b79c"
+        network_throughput_bytes_per_sec: 653061224
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 4511013215
+        fingerprint: "7090d041bc0599ce9b8dc0095e8d7135"
+        network_throughput_bytes_per_sec: 34168925964
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -1847,152 +2391,200 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 3605633802
+        fingerprint: "584651046fab6aa022a9fcdaa741ca49"
+        network_throughput_bytes_per_sec: 4556173526
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 2763832658
+        fingerprint: "e78c4bfa48ee5d7f81743a050aa5f803"
+        network_throughput_bytes_per_sec: 79559438818
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 2167195767
+        fingerprint: "3e00e3751db2f0a54edfc160509a0c32"
+        network_throughput_bytes_per_sec: 7185964912
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 3778597785
+        fingerprint: "df2f19ebe8fc637197e39b52d97a794c"
+        network_throughput_bytes_per_sec: 5143911149
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 3289959839
+        fingerprint: "3fbaf73ace028a5c0673748316b980bf"
+        network_throughput_bytes_per_sec: 331029083303
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 5031941031
+        fingerprint: "28ed8d8831773650ac1210294feb985d"
+        network_throughput_bytes_per_sec: 1213270142
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 7816793893
+        fingerprint: "6411603f6b84e86918c660203d2586d6"
+        network_throughput_bytes_per_sec: 138988802
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -2002,20 +2594,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 4437703141
+        fingerprint: "361bab01e1544bbd7d4c57964c7cb2e8"
+        network_throughput_bytes_per_sec: 296124258683
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -2025,20 +2625,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 7262411347
+        fingerprint: "f2e7c307868389c7b945a60985fdbfb7"
+        network_throughput_bytes_per_sec: 600473013600
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2048,66 +2656,88 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 7876923076
+        fingerprint: "d8ba4d1f1855930537e677def956da0c"
+        network_throughput_bytes_per_sec: 85333333333
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 5859799713
+        fingerprint: "21c3dea5239284aed45517eccc2c77a1"
+        network_throughput_bytes_per_sec: 12118916
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 3602462620
+        fingerprint: "b5ef73f8707a38b85661790207aa156e"
+        network_throughput_bytes_per_sec: 704488436788
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2117,175 +2747,229 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 6942372881
+        fingerprint: "e188b963e5912029be041b83f8c32803"
+        network_throughput_bytes_per_sec: 130031746031
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 3599297012
+        fingerprint: "58212b0c758c8c906c2df8d9cd23841e"
+        network_throughput_bytes_per_sec: 675411272141
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 4075621890
+        fingerprint: "aedd2df037a65be1250f90a94291ed61"
+        network_throughput_bytes_per_sec: 11394345076
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 5298835705
+        fingerprint: "b01e85aab5ef3da6357ca6f9cfb67b8f"
+        network_throughput_bytes_per_sec: 99244105294
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 10014669926
+        fingerprint: "3fc3265f1d4f48b553f24df733a0ec07"
+        network_throughput_bytes_per_sec: 137931034
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 13107200000
+        fingerprint: "905de60dd6e7b89f580e55ae80ac8d79"
+        network_throughput_bytes_per_sec: 330989898989
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 16031311154
+        fingerprint: "923306d188529fd23828978fba917eca"
+        network_throughput_bytes_per_sec: 1007564110
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 7953398058
+        fingerprint: "012c7d96d1729d9e95f6cb6f9cc6646d"
+        network_throughput_bytes_per_sec: 1183815028
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -2295,20 +2979,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 12720496894
+        fingerprint: "add1d98ea03d7ecc59ff9877b5bd5e93"
+        network_throughput_bytes_per_sec: 203567462628
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2318,20 +3010,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 20739240506
+        fingerprint: "8f93e3c7983b80f6171d861fa67a2bb5"
+        network_throughput_bytes_per_sec: 2135557872
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -2341,20 +3039,54 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 9173572228
+        fingerprint: "8370a1824c7b75672651c80e67bfcc33"
+        network_throughput_bytes_per_sec: 671948734380
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 134217728
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "faccbf0108d642668cb20ee319a39541"
+        network_throughput_bytes_per_sec: 510805041535
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -2364,20 +3096,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 17031185031
+        fingerprint: "fc2857874212cf751e4b60decec734a7"
+        network_throughput_bytes_per_sec: 52588331
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2387,63 +3126,80 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 8933478735
+        fingerprint: "eebdac40c94b93e0e8351eba013b7958"
+        network_throughput_bytes_per_sec: 44826265389
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 10317380352
+        fingerprint: "5357a763f8b75abbd8c2b3aa99d399b0"
+        network_throughput_bytes_per_sec: 147976878
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 6913080168
+        fingerprint: "00842feada0344771c1b4e414c197917"
+        network_throughput_bytes_per_sec: 513185355119
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2453,86 +3209,112 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 5970845481
+        fingerprint: "25f7485f5479f664a4f493d61ff1a4d8"
+        network_throughput_bytes_per_sec: 5207485
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 12593389700
+        fingerprint: "ea64b110db7c46aafbb394dd547e9e23"
+        network_throughput_bytes_per_sec: 804836343575
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 12328066215
+        fingerprint: "766a46b63049615920fa933700606ad3"
+        network_throughput_bytes_per_sec: 4003910068
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 27125827814
+        fingerprint: "a9dda3180d9de81b8ac47f5af4e3717c"
+        network_throughput_bytes_per_sec: 127023506
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -2542,20 +3324,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 27443886097
+        fingerprint: "4f6b04c57baf41d308928831e49e6f05"
+        network_throughput_bytes_per_sec: 6924767540
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -2565,111 +3354,143 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 32125490196
+        fingerprint: "a80bdc2e68784e2f6b048f9070199b93"
+        network_throughput_bytes_per_sec: 1185185185
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 35158798283
+        fingerprint: "e824c0c6cda89c864c487464c4714920"
+        network_throughput_bytes_per_sec: 132441685004
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 128
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 12681114551
+        fingerprint: "22006dffcceb6352ce0d7f47b568a045"
+        network_throughput_bytes_per_sec: 5436629
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 18265328874
+        fingerprint: "ae02e28f43d17a07b5679b487f20fb4a"
+        network_throughput_bytes_per_sec: 157349339735
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 26047694753
+        fingerprint: "ef648be80ef0366d556323938cba8b8b"
+        network_throughput_bytes_per_sec: 145797552836
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -2679,41 +3500,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 8650475184
+        fingerprint: "4a53acb196d9813effc6e5c5955a28c2"
+        network_throughput_bytes_per_sec: 3744058500
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 13826160337
+        fingerprint: "f08943c7fb325a80c9aedfc3d9219cfb"
+        network_throughput_bytes_per_sec: 935159817
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
             dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2723,10 +3558,12 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 27629005059
+        fingerprint: "cc87abd4e4f49d00a84464d67ac1cc63"
+        network_throughput_bytes_per_sec: 26947368421
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
@@ -2734,8 +3571,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -2745,19 +3588,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 20189772027
+        fingerprint: "b064bd92e8bb26128984b39785b63827"
+        network_throughput_bytes_per_sec: 21375081539
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -2767,179 +3619,262 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 24129602356
+        fingerprint: "5cfc87185d90d1302586da3e1c0f6fde"
+        network_throughput_bytes_per_sec: 1067778936
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 50027480916
+        fingerprint: "e6fb7f1db9f0def5ca27f01f28291a63"
+        network_throughput_bytes_per_sec: 842105263
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 33677286742
+        fingerprint: "9c002f2bdf400f2638aac27df778dfc1"
+        network_throughput_bytes_per_sec: 438013106023
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 51200000000
+        fingerprint: "b9ca069b3d1f1eeefa7a6ecf54baacae"
+        network_throughput_bytes_per_sec: 848534088610
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 59795620437
+        fingerprint: "9ca39fa8794daa9631163ee0e266097e"
+        network_throughput_bytes_per_sec: 453900709
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 51280125195
+        fingerprint: "7137b6e88084dce1309b8c65093ae1ff"
+        network_throughput_bytes_per_sec: 938598637743
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 36008791208
+        fingerprint: "d08ba83901c2428a2cacfc0d6e826840"
+        network_throughput_bytes_per_sec: 293924597056
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 66737270875
+        fingerprint: "c2aa9452829d26d4fbfc6be7dcd22902"
+        network_throughput_bytes_per_sec: 638305280779
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 512
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "4ce7115cb0355a0436faffe9d4f63a60"
+        network_throughput_bytes_per_sec: 49960967
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -2949,41 +3884,57 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 19106705539
+        fingerprint: "b253ab825e72a17754e0f920ba0ac47b"
+        network_throughput_bytes_per_sec: 619725768321
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 30681647940
+        fingerprint: "90fc6688a7894e5c0b2688b8b3b56e5b"
+        network_throughput_bytes_per_sec: 1204705882
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -2993,19 +3944,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 28972590627
+        fingerprint: "0c83ea7171fd3e561e96f900b77b901b"
+        network_throughput_bytes_per_sec: 34026998961
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
             dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3015,19 +3973,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 36612290502
+        fingerprint: "5ecb8bfdacc065c41d076a95fde10de2"
+        network_throughput_bytes_per_sec: 51360501567
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -3037,19 +4003,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 35463203463
+        fingerprint: "98b831c4ca72406f37389214c5e19865"
+        network_throughput_bytes_per_sec: 311496769402
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -3059,43 +4033,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 47524292965
+        fingerprint: "224b0c65bfb5a0772e3b3ef5e529c630"
+        network_throughput_bytes_per_sec: 20505632040
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 66264914054
+        fingerprint: "7c15270553f884594396ea3a9e22288a"
+        network_throughput_bytes_per_sec: 208464413518
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -3105,43 +4091,56 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 73388577827
+        fingerprint: "b5dc833e1006db332dcf16fc073558e4"
+        network_throughput_bytes_per_sec: 215578947368
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 82022528160
+        fingerprint: "0cb65c209a26b3c7c2d519854b62c67c"
+        network_throughput_bytes_per_sec: 8162157113
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3151,20 +4150,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 60963720930
+        fingerprint: "7cac54d6e1d08a6cd127ab0f81c94048"
+        network_throughput_bytes_per_sec: 608355065632
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -3174,20 +4180,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 93090909090
+        fingerprint: "f40071ad502637ac5fbb867e29baea24"
+        network_throughput_bytes_per_sec: 37037037
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -3197,41 +4211,58 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 68266666666
+        fingerprint: "a7adb6a1534bbf74d4512d8e05eb5ad0"
+        network_throughput_bytes_per_sec: 859356451365
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 83591836734
+        fingerprint: "96b26a3b1e59d77bab95e0026d23275c"
+        network_throughput_bytes_per_sec: 2074974670
       }
       entries {
         instruction {
-          opcode: "all-to-all"
-          shape {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -3241,32 +4272,41 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 58514285714
+        fingerprint: "7607dfea803f89904d8e152fb8189956"
+        network_throughput_bytes_per_sec: 4271115745
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 42833986928
+        fingerprint: "25a76511510793444ae58b29d1d310cd"
+        network_throughput_bytes_per_sec: 612396554241
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
@@ -3274,8 +4314,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3285,64 +4331,79 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 61077353215
+        fingerprint: "17e8b701cba039ffed5c4c64b81bd38e"
+        network_throughput_bytes_per_sec: 50567901234
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 70888047593
+        fingerprint: "f6d54f4055232256fcd6c0bf18d19f73"
+        network_throughput_bytes_per_sec: 155498651441
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 97234421364
+        fingerprint: "1f93b4ce5f502f4229278ea4c2936bb8"
+        network_throughput_bytes_per_sec: 8393442622
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3352,66 +4413,87 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 88983027834
+        fingerprint: "01cbebe328fc80e3c97d6b476fec64ff"
+        network_throughput_bytes_per_sec: 228571428
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 99447647951
+        fingerprint: "abd8c8f7ce2fd95b63645316537b74e7"
+        network_throughput_bytes_per_sec: 528429514717
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 136818371607
+        fingerprint: "bac7b1d59bfbf3ce3c9770a882b7d9d9"
+        network_throughput_bytes_per_sec: 1057851239
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3421,10 +4503,12 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 117658886894
+        fingerprint: "4afbc2a251fde8fe2a2f21f9b3680acf"
+        network_throughput_bytes_per_sec: 12700775193
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
@@ -3432,32 +4516,45 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 98847662141
+        fingerprint: "0f6fd7c2255ef5d6619d3e266e4493b1"
+        network_throughput_bytes_per_sec: 162017305315
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -3467,19 +4564,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 132262361251
+        fingerprint: "b3774b957d17024bdec4dfb6aa7015b8"
+        network_throughput_bytes_per_sec: 10680573663
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "d1ed25619ddb7091e280b64f8bf17b71"
+        network_throughput_bytes_per_sec: 499055741
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3489,19 +4622,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 112123182207
+        fingerprint: "d23936a34187f9cecae83b71e3b7c071"
+        network_throughput_bytes_per_sec: 363894811
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -3511,108 +4651,117 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 79054282267
+        fingerprint: "0ac37c0c61461a98a53778858abef7c2"
+        network_throughput_bytes_per_sec: 1773160173
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 75112893982
+        fingerprint: "c5c9c12452980db70f61be1deb2f609c"
+        network_throughput_bytes_per_sec: 4000000000
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 80117359413
+        fingerprint: "9ea76d324a2a87d6007857f1bbd58e8b"
+        network_throughput_bytes_per_sec: 6557534520
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 98698795180
+        fingerprint: "e001496ab38237a7a240b9087b52ce67"
+        network_throughput_bytes_per_sec: 759837681159
       }
       entries {
         instruction {
-          opcode: "all-reduce"
-          shape {
-            element_type: F32
-            dimensions: 2097152
-            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
-            is_dynamic_dimension: false
-          }
-          channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
-        }
-        network_throughput_bytes_per_sec: 115076382791
-      }
-      entries {
-        instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 128
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3622,43 +4771,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 128944417117
+        fingerprint: "687f7f1c2f260eab74a9b0fe20926d8a"
+        network_throughput_bytes_per_sec: 100000000
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 147936794582
+        fingerprint: "7d42a2c52405b0487725b632c32b3246"
+        network_throughput_bytes_per_sec: 56303161285
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
             dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -3668,66 +4829,86 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 208547334924
+        fingerprint: "4fcbe7e7d7bb2d20520c051dd60ad89e"
+        network_throughput_bytes_per_sec: 118832275611
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 141470048569
+        fingerprint: "713bd2c46abfb6ce361f71da4eb7d023"
+        network_throughput_bytes_per_sec: 75851851851
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 153210987726
+        fingerprint: "a2d264c6a62d0909b4dc78be82a6aff6"
+        network_throughput_bytes_per_sec: 124878048
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -3737,19 +4918,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 196509745127
+        fingerprint: "03e84199997302127c56cdb92f61f92d"
+        network_throughput_bytes_per_sec: 218271440466
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3759,107 +4948,146 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 118402890695
+        fingerprint: "64fcb07cb34dfdc261edff3094f5e329"
+        network_throughput_bytes_per_sec: 675180232207
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 106649308380
+        fingerprint: "9f83571c52a45ed6fa719a3ef1b35f56"
+        network_throughput_bytes_per_sec: 189910979
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 177364005412
+        fingerprint: "e70429d2d9bdfaadf7c6d6cd451ad2d0"
+        network_throughput_bytes_per_sec: 150433377
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 115481938325
+        fingerprint: "378df9b975bd6f83307c54c5ebf8a5e3"
+        network_throughput_bytes_per_sec: 7192273924
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 109821533305
+        fingerprint: "50e0778e29cbb85bbe9e52c8a9e3f53b"
+        network_throughput_bytes_per_sec: 89134308058
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -3869,20 +5097,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 165913924050
+        fingerprint: "fd47bb6bf2e69ef941ae3455e0980ec2"
+        network_throughput_bytes_per_sec: 68195629552
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -3892,20 +5127,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 204003112840
+        fingerprint: "70cc3d28822cfa34a6f4f77936bd4122"
+        network_throughput_bytes_per_sec: 567411255411
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -3915,133 +5157,174 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 239729309556
+        fingerprint: "7e46aa6a95b79c94bd27c5e9de8038c1"
+        network_throughput_bytes_per_sec: 726223530430
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 277989395546
+        fingerprint: "906601acb6c0bdb8e772d3adb5a7e148"
+        network_throughput_bytes_per_sec: 318474107820
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 193750184774
+        fingerprint: "d1dbd5305c09a5bbfd5a1e9c7c51dd3f"
+        network_throughput_bytes_per_sec: 269367354
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 240388812471
+        fingerprint: "29421a4f1a4df068f67122b659311863"
+        network_throughput_bytes_per_sec: 2371742906
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 246607714016
+        fingerprint: "f6813a624e0d4cd5931fd1911a59ce8d"
+        network_throughput_bytes_per_sec: 781679389
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 158875151515
+        fingerprint: "d84ec32ec9d5bce065cad02d30309053"
+        network_throughput_bytes_per_sec: 413557878130
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4051,19 +5334,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 191345985401
+        fingerprint: "a490a74ef07c6ec14e8318d4b8142f8e"
+        network_throughput_bytes_per_sec: 367401723439
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -4073,41 +5365,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 211321241434
+        fingerprint: "a6f8b72cb4b9af96eef324934f9de021"
+        network_throughput_bytes_per_sec: 22199098
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 148544553052
+        fingerprint: "561c9e59f33d316b74205823cb42d04d"
+        network_throughput_bytes_per_sec: 513444984578
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4117,19 +5423,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 153772693943
+        fingerprint: "668e8b905ffbce1829608cbf4befc332"
+        network_throughput_bytes_per_sec: 85333333333
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -4139,20 +5453,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 178663486113
+        fingerprint: "48e28feb64dc3689b568926698d8e02e"
+        network_throughput_bytes_per_sec: 5913879
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -4162,89 +5484,114 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 259355923818
+        fingerprint: "0690a7935a759eb144aeec77b49771d9"
+        network_throughput_bytes_per_sec: 51160031225
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 296124258683
+        fingerprint: "5d605c04268d15a46391c0ef5400e98e"
+        network_throughput_bytes_per_sec: 128000000000
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 347901791639
+        fingerprint: "3aa698dd32575d30602a31ddba9fefac"
+        network_throughput_bytes_per_sec: 38120333006
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 251276300023
+        fingerprint: "3f33c91a674d3a639ad8460a9fee21f0"
+        network_throughput_bytes_per_sec: 76830732
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4254,86 +5601,3383 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 271581455581
+        fingerprint: "ea826b5b3a3cd11ff917bb395a93cd6f"
+        network_throughput_bytes_per_sec: 524288000000
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 266001014713
+        fingerprint: "d5bb5db4849bbe872fc0a3ab1ec6d0d9"
+        network_throughput_bytes_per_sec: 3657142857
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 253218063269
+        fingerprint: "4447fc843a198e997259f2a45d1c5078"
+        network_throughput_bytes_per_sec: 515460734914
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "7e50a143617aca892c3824cf04c47087"
+        network_throughput_bytes_per_sec: 461495736370
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 64
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "790112769afed6a30fcc1c8cbce08768"
+        network_throughput_bytes_per_sec: 38415366
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 67108864
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "e77afb055f2c55ce8a9f881f93f4ccec"
+        network_throughput_bytes_per_sec: 592290333968
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 67108864
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "6dd3b94c1709ebff95103a4422009c8b"
+        network_throughput_bytes_per_sec: 413455961358
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "c708aebbaea40d6ad370faaa7d411d0f"
+        network_throughput_bytes_per_sec: 115481938325
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 134217728
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "9c2db8231f5ef24a566554bcd16a60d4"
+        network_throughput_bytes_per_sec: 626225822104
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 1048576
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "b9f629195d1e0d1afeebe40aef9955df"
+        network_throughput_bytes_per_sec: 73253995144
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 512
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "bffcfdab46001339d858d2bde0841588"
+        network_throughput_bytes_per_sec: 12424771
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 512
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "d4bb9864d0881eafd2fa5efec09c6e99"
+        network_throughput_bytes_per_sec: 50039093
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "d9af6aa7b045746dbcb1015482010070"
+        network_throughput_bytes_per_sec: 26750752589
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "7c2845488c50871c6df2b01b09b93607"
+        network_throughput_bytes_per_sec: 524681639325
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "275f04921ccc4b0fa29127fb3a80a2f2"
+        network_throughput_bytes_per_sec: 2967175261
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "abd659445180bd7e7b25417a2c970841"
+        network_throughput_bytes_per_sec: 220474348191
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 65536
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "696f90418e9a91559c454b6ef6ce9b85"
+        network_throughput_bytes_per_sec: 6948261238
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "3007a47f1e00d56e53224073fd790288"
+        network_throughput_bytes_per_sec: 1878899082
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "8eb0c3afb6d4a5a42937daf0b2ca9327"
+        network_throughput_bytes_per_sec: 1853393665
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 256
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "6823bce11114d8ceb5386584c6072fc8"
+        network_throughput_bytes_per_sec: 62972756
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 8388608
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "e1c68238c33369569964e0715085e11e"
+        network_throughput_bytes_per_sec: 333728835136
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 1024
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "3c0dab67f668aa96622d925405aa4c35"
+        network_throughput_bytes_per_sec: 278053085
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 256
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "353ed7150bd91617fa8843f2620c704d"
+        network_throughput_bytes_per_sec: 70156207
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "8f9d476f5290f9fd2653c4f975ec810e"
+        network_throughput_bytes_per_sec: 28370562770
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "46eb6d6f2c352a68dd5943ccbf21f917"
+        network_throughput_bytes_per_sec: 455111111111
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "f85a8aefce6c3bcea1643e35008f1774"
+        network_throughput_bytes_per_sec: 390822213939
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "b0b00085d45064632771977a738377fa"
+        network_throughput_bytes_per_sec: 405874201664
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 1024
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "75c0063e60c072af0ddaa5e2dbbfa741"
+        network_throughput_bytes_per_sec: 356050069
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ade554f65ab408a90ce8976d623c021f"
+        network_throughput_bytes_per_sec: 25680250783
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 32768
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "d28e01182634547a1027d109446f3b88"
+        network_throughput_bytes_per_sec: 4882729846
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "a69547b3c276d3341642608b1db494a6"
+        network_throughput_bytes_per_sec: 91511754502
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "baf06cabffc7267e4ba88f3f0469f867"
+        network_throughput_bytes_per_sec: 538168440
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 8388608
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "c37087da0a44968200e66995156557e7"
+        network_throughput_bytes_per_sec: 399305407463
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 1024
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "02feb5e94baeeab2372bc39cf64b4dc3"
+        network_throughput_bytes_per_sec: 121212121
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "067cb94ec99578ac4a5d635f77d6836e"
+        network_throughput_bytes_per_sec: 857217600
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "61804978175d4c86d2e9535c315a7c67"
+        network_throughput_bytes_per_sec: 281572502685
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 8192
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "3b80581304fa3c1ea59b5614ce83e167"
+        network_throughput_bytes_per_sec: 4566332218
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 33554432
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "71a05d2710cbb210f44bd79ab02e4544"
+        network_throughput_bytes_per_sec: 155802442594
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "87766ec8522f636b65d29ffbbac0f005"
+        network_throughput_bytes_per_sec: 2681286310
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "77c8cbc50f987483fee44f7a20bf8b1f"
+        network_throughput_bytes_per_sec: 528649357196
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 1048576
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "3b97564ed96cdb745f18dfba342d89a0"
+        network_throughput_bytes_per_sec: 198443603330
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 134217728
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "fb071244e8d81a3688446f7a2515f445"
+        network_throughput_bytes_per_sec: 633006942348
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "3a320de13af8d0a2b9013cfbf39985fb"
+        network_throughput_bytes_per_sec: 295953757
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "60c70a6d99cce4e304e50c40c8f99fce"
+        network_throughput_bytes_per_sec: 378611362482
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "c8fe77ad4d6a5f9c9e08e64b66c5aa36"
+        network_throughput_bytes_per_sec: 2064516129
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 1024
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "fab7de7fff87c4825d063b15cf576f4a"
+        network_throughput_bytes_per_sec: 81632653
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "762f33c509d278627132d4806885cfc9"
+        network_throughput_bytes_per_sec: 29283288650
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ad7d779a1ec5f906606de9e4e93650d9"
+        network_throughput_bytes_per_sec: 1601250977
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 8192
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "dd72a6e820b6d534b73545dc695b9277"
+        network_throughput_bytes_per_sec: 1924812030
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "141f0055829b89a154053476d8d5fabe"
+        network_throughput_bytes_per_sec: 43690666666
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 1048576
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "96ca5c7e1f1638828edc08d3daa8f8f7"
+        network_throughput_bytes_per_sec: 78486227544
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "1c34dad9da64a4a58f0c8f86c14a73a2"
+        network_throughput_bytes_per_sec: 30654738934
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 128
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "e94911366d0b4cfff0d6742f123be8a9"
+        network_throughput_bytes_per_sec: 77575757
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 67108864
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "7da45b992c6f90f878e16349e67435ed"
+        network_throughput_bytes_per_sec: 818281032044
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "656be27a022b3b74531f5f2327584a2a"
+        network_throughput_bytes_per_sec: 3757798165
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 268435456
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "24af8009842c9b30f9fc309675fd46c8"
+        network_throughput_bytes_per_sec: 687140235910
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "0588b7c974105348d0a2515c0abd898b"
+        network_throughput_bytes_per_sec: 99712438189
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "188b7c8f49ee518ef3f12f7239d0542a"
+        network_throughput_bytes_per_sec: 8245596376
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 8388608
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "83591db4b7a2726af43dd49fc404007b"
+        network_throughput_bytes_per_sec: 132297300387
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "21ea7fae57499ff2b1700818db1ccc37"
+        network_throughput_bytes_per_sec: 130470612022
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 268435456
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "afc15c1b4deac0e2789adf42fce916f4"
+        network_throughput_bytes_per_sec: 694823821751
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "2965a87c08fc2bb44b5e63f4a8232930"
+        network_throughput_bytes_per_sec: 178086956521
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "66a5b4b136e3e63482a26491f2086663"
+        network_throughput_bytes_per_sec: 380386329
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 33554432
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ffb987920d3dcd0f23601030220f2c32"
+        network_throughput_bytes_per_sec: 515524090462
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "96e82cb9d09f5d9b43c2800f01b5f3ff"
+        network_throughput_bytes_per_sec: 280105783357
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 1048576
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "542e81cf1beafb7de263b511ad1f5d7c"
+        network_throughput_bytes_per_sec: 77926278240
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 32768
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "2c9f9cea76ec0a7f35987d8bdeea1d30"
+        network_throughput_bytes_per_sec: 3552471812
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 8192
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "7ab9f4f3c725f05b8e8f79d1d9a79a65"
+        network_throughput_bytes_per_sec: 890434782
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "03330f0defed011fd622da3ddcd58de5"
+        network_throughput_bytes_per_sec: 17636167922
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 134217728
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "d51ae4f483b29ba1518794ed1f635d41"
+        network_throughput_bytes_per_sec: 351171449502
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "fac36931d73a8f77c6af1f29aa01f950"
+        network_throughput_bytes_per_sec: 196952667167
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "954fa4e199cb0e689954013992370dc4"
+        network_throughput_bytes_per_sec: 731428571
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "32921213d78db478164e6ece7132d57a"
+        network_throughput_bytes_per_sec: 399457523809
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 33554432
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "3a965e23c93556616a155131d28e076f"
+        network_throughput_bytes_per_sec: 693387998016
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "e937e6f6cd2b6db16fe9cd7c2979d357"
+        network_throughput_bytes_per_sec: 66064516129
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "1f4b3db733dab96bb8fa97d8f2bb2c7e"
+        network_throughput_bytes_per_sec: 531221766925
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 16384
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "c578baa139f847a50200e65a501cfe37"
+        network_throughput_bytes_per_sec: 1456614509
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 256
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "997ccdcc3c56a275439d3efb9d75628b"
+        network_throughput_bytes_per_sec: 6134969
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 512
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ecf9d950bcd7b6c6223401f066216fd4"
+        network_throughput_bytes_per_sec: 310303030
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "bdda763fe4cbd4dbd2cd6e538df4d2f5"
+        network_throughput_bytes_per_sec: 350752968723
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "59b183e8704a0ba13869ade15fa2b92b"
+        network_throughput_bytes_per_sec: 156878515858
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "0b76c03ffc5616ae8aaf7fe05d58a8e5"
+        network_throughput_bytes_per_sec: 585960324112
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ee0857faea7d71857c6fb8036e979d2d"
+        network_throughput_bytes_per_sec: 48725650557
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "393073cf2c2fb7f64008edb89f2f877a"
+        network_throughput_bytes_per_sec: 47310939156
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 33554432
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "22d939e4ee859968ac17ebf1c62fef05"
+        network_throughput_bytes_per_sec: 715049908366
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 32768
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "bd85fa7c974409dc2c659688fdfb262f"
+        network_throughput_bytes_per_sec: 4413793103
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 64
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "f4c16425b37cb7c0469f632bc5f2954f"
+        network_throughput_bytes_per_sec: 62256809
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "b4c5c1f7997e35be9d661c5ab8917bbc"
+        network_throughput_bytes_per_sec: 332036316
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "4a5e09d90eb94f08a4eb31421f1e8443"
+        network_throughput_bytes_per_sec: 80874710166
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 67108864
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "96c966eff2db96826fa05a63d70abdfb"
+        network_throughput_bytes_per_sec: 293133731697
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "fd886f592d094bb3c992eb050b3aeb7a"
+        network_throughput_bytes_per_sec: 76920187793
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 8192
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "a0c14787b94ec324f39d4b4cde6aaee8"
+        network_throughput_bytes_per_sec: 1561049973
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 8388608
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "b3181d33b743bf7d8e93edc0102fa54b"
+        network_throughput_bytes_per_sec: 577250756950
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "c092460e68acec60d687e3ff6c5a6674"
+        network_throughput_bytes_per_sec: 788440058273
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ef3bbaf6d14ca5829ae80710febc85eb"
+        network_throughput_bytes_per_sec: 500000000
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 67108864
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "15185f0d28c7a7edc1b4aecfa6d8f221"
+        network_throughput_bytes_per_sec: 293947770934
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 67108864
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "6b3993ffc7a2464a3f8f42c61f55394d"
+        network_throughput_bytes_per_sec: 633198067632
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "1287402cc6f1747882876a7ef488e090"
+        network_throughput_bytes_per_sec: 33590978985
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "9baae3a878bfbfff40ac180eb9c74753"
+        network_throughput_bytes_per_sec: 16384000000
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "47bea0ecc1ea80cb91e9b291e81894f8"
+        network_throughput_bytes_per_sec: 68246672524
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "a9ce40b744201dc85700fe442e18c6d3"
+        network_throughput_bytes_per_sec: 46512420156
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ae9b470b1a59d43e5b5fa203ca090cd9"
+        network_throughput_bytes_per_sec: 327680000000
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 512
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "f36d3cb56a18e44b99c8e2bd01e47a20"
+        network_throughput_bytes_per_sec: 100000000
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "56f48d98625522ae94180017c6e4235a"
+        network_throughput_bytes_per_sec: 592592592
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 1024
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "53d4043108c4d45128534295dc2c4234"
+        network_throughput_bytes_per_sec: 468007312
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "f57f58c135e77c22b55a8bdbe156e2a4"
+        network_throughput_bytes_per_sec: 15208656049
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "5016df44b768d5084d1607b448e77a3d"
+        network_throughput_bytes_per_sec: 13462612982
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "be567d531287055cea40ba66db60de94"
+        network_throughput_bytes_per_sec: 413313362238
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 268435456
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "680f6ed8e838fee643167dc7a214bcd4"
+        network_throughput_bytes_per_sec: 344699539776
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 536870912
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "d8c02d04f293873b5f6c56e662530193"
+        network_throughput_bytes_per_sec: 883755583649
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 4194304
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "8426634cb954bc9a04f5d3df48489b36"
+        network_throughput_bytes_per_sec: 584653470867
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 33554432
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "c5eb925239f355be19b2a2ae51019597"
+        network_throughput_bytes_per_sec: 241998128458
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "2d89d55373f92fb4e1b55e7456f04f79"
+        network_throughput_bytes_per_sec: 134226318484
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "147048ce1f70eaaf4c41e3a478797c71"
+        network_throughput_bytes_per_sec: 648922686
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 8192
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 1
+              num_devices_per_group: 8
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "c161c6ad6758b19e819c0db57a16b335"
+        network_throughput_bytes_per_sec: 6090706319
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 1048576
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 247189061763
+        fingerprint: "0ee3ab31a2cbac43bf62cb9214b7c1d1"
+        network_throughput_bytes_per_sec: 102480062548
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -4343,19 +8987,55 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 317942995755
+        fingerprint: "76f465d808cb817683770b3e6ab6838d"
+        network_throughput_bytes_per_sec: 92827195467
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 4
+              num_devices_per_group: 2
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "4a4598a1e81e510779605c8c674a3ac0"
+        network_throughput_bytes_per_sec: 810630589713
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -4365,19 +9045,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 176691549414
+        fingerprint: "1576d013b336b385ef44a95d0cd74a2c"
+        network_throughput_bytes_per_sec: 15968810916
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4387,42 +9075,56 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 170016376165
+        fingerprint: "95b6a84a10fab4f19072622cadfd1acf"
+        network_throughput_bytes_per_sec: 2560000000
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 203192713884
+        fingerprint: "44ab650a56833b85a693cb515c43dee0"
+        network_throughput_bytes_per_sec: 2058291457
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -4432,20 +9134,59 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 314227150134
+        fingerprint: "5326b51b78d6cb159c04918bfead91ed"
+        network_throughput_bytes_per_sec: 584898061637
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "99623f61db0dd4933df0e1e1215c75eb"
+        network_throughput_bytes_per_sec: 124121212121
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
             dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4455,20 +9196,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 342671895424
+        fingerprint: "1dc1e161c38f7eff0cea4c9c323dcfad"
+        network_throughput_bytes_per_sec: 642214668504
       }
       entries {
         instruction {
+          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -4478,20 +9226,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 416680309954
+        fingerprint: "6fdb3d7a311222b991ec1edef14c5c26"
+        network_throughput_bytes_per_sec: 809086419753
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -4501,20 +9256,86 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 306960187353
+        fingerprint: "f791188a95c2a553c9030fa46c38f2a0"
+        network_throughput_bytes_per_sec: 97523809523
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 4096
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "ecaeb13272e9384319989d342680549c"
+        network_throughput_bytes_per_sec: 531671858
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 2097152
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
+          collective_device_list {
+            iota_replica_group_list {
+              num_replica_groups: 2
+              num_devices_per_group: 4
+              iota_reshape_dims: 8
+              iota_transpose_perm: 0
+            }
+          }
+        }
+        fingerprint: "6b9183f8b747bcf1e11a605c2867f7ff"
+        network_throughput_bytes_per_sec: 128313264806
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4524,64 +9345,86 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 299037786967
+        fingerprint: "7c115f6e4ef6c361899235020514a18f"
+        network_throughput_bytes_per_sec: 68912723449
       }
       entries {
         instruction {
+          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 303847000869
+        fingerprint: "0516ad83d2a5538970091f77531c73c6"
+        network_throughput_bytes_per_sec: 766958445714
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 306825457205
+        fingerprint: "31a79cde8d9fdcd983bb9252c3c92f73"
+        network_throughput_bytes_per_sec: 42622441721
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4591,19 +9434,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 288545954870
+        fingerprint: "48cd9372c015dda07a982da1a5620727"
+        network_throughput_bytes_per_sec: 121442125
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -4613,19 +9464,54 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 395838429596
+        fingerprint: "e4c41dd6f65b2df6f78f48e9ef705d3e"
+        network_throughput_bytes_per_sec: 468532618409
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "0a1bea9a023ace07a93df0c757a64d8d"
+        network_throughput_bytes_per_sec: 3642507781
+      }
+      entries {
+        instruction {
+          name: "_"
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 512
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          dimensions: 0
+          channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -4635,19 +9521,28 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 200281921497
+        fingerprint: "b4310b6629879dd69750c1adf77eca35"
+        network_throughput_bytes_per_sec: 20792722
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4657,65 +9552,86 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 181823478411
+        fingerprint: "22929fda0f320c7393deb2c18af09d78"
+        network_throughput_bytes_per_sec: 114695340
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 216201237113
+        fingerprint: "1f1e13b498f3efcf560154972498ced3"
+        network_throughput_bytes_per_sec: 5616005
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 343936367363
+        fingerprint: "5c21cfc6706586931dd655d0d05f90bb"
+        network_throughput_bytes_per_sec: 126984126
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4725,43 +9641,58 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 383321513434
+        fingerprint: "15f365ac896c0b583d549fa6577c6d56"
+        network_throughput_bytes_per_sec: 266112266
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "_"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 32
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 4
+          operand_ids: 3
+          called_computation_ids: 2
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 454420801733
+        fingerprint: "ad8d88294e7338062a0ebdb4f4cd2eb5"
+        network_throughput_bytes_per_sec: 22727272
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "_"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           dimensions: 0
           channel_id: 1
+          id: 1
+          operand_ids: 0
+          frontend_attributes {}
           use_global_device_ids: true
+          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -4771,20 +9702,25 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 326151166407
+        fingerprint: "2168cceecb93680ecc7859d77fc1492b"
+        network_throughput_bytes_per_sec: 30303030
       }
+    }
+  }
+  entries {
+    key: "sm_90"
+    value {
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4794,14 +9730,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 331854102381
+        network_throughput_bytes_per_sec: 421983399567
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -4817,14 +9753,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 330312175145
+        network_throughput_bytes_per_sec: 1406593406
       }
       entries {
         instruction {
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -4832,43 +9768,44 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           channel_id: 1
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 325265917022
+        network_throughput_bytes_per_sec: 2167195767
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 374926611245
+        network_throughput_bytes_per_sec: 5535135135
       }
       entries {
         instruction {
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -4876,87 +9813,99 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           channel_id: 1
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 435771844155
+        network_throughput_bytes_per_sec: 118402890695
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 212665939916
+        fingerprint: "df2f19ebe8fc637197e39b52d97a794c"
+        network_throughput_bytes_per_sec: 7423231579
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 189098712833
+        fingerprint: "f6f986332989bc7ea465794f1c2b6856"
+        network_throughput_bytes_per_sec: 292016122948
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 227204246905
+        network_throughput_bytes_per_sec: 19106705539
       }
       entries {
         instruction {
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -4965,27 +9914,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 368082843352
+        network_throughput_bytes_per_sec: 209492635
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -4995,14 +9943,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 402872346556
+        network_throughput_bytes_per_sec: 1109425785
       }
       entries {
         instruction {
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5018,14 +9966,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 488704223711
+        network_throughput_bytes_per_sec: 277989395546
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5034,89 +9982,99 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 348118354981
+        network_throughput_bytes_per_sec: 537782991954
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 346765656649
+        network_throughput_bytes_per_sec: 28972590627
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 347397523501
+        fingerprint: "d1dbd5305c09a5bbfd5a1e9c7c51dd3f"
+        network_throughput_bytes_per_sec: 342618151
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 353651264755
+        fingerprint: "f08943c7fb325a80c9aedfc3d9219cfb"
+        network_throughput_bytes_per_sec: 1242435732
       }
       entries {
         instruction {
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5124,92 +10082,96 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           channel_id: 1
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 400219847328
+        network_throughput_bytes_per_sec: 3599297012
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 519129154031
+        network_throughput_bytes_per_sec: 497570456
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 220787703321
+        network_throughput_bytes_per_sec: 3602462620
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 193319306331
+        fingerprint: "aa53481ae940f8be73d0809b81bd85ee"
+        network_throughput_bytes_per_sec: 33755343806
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
             dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -5219,14 +10181,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 232468006096
+        network_throughput_bytes_per_sec: 558905190219
       }
       entries {
         instruction {
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5242,37 +10204,36 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 385603346434
+        network_throughput_bytes_per_sec: 532224532
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
             dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 423399772870
+        network_throughput_bytes_per_sec: 220787703321
       }
       entries {
         instruction {
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5288,18 +10249,17 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 511594072086
+        network_throughput_bytes_per_sec: 208547334924
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
@@ -5311,41 +10271,39 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 358832552668
+        network_throughput_bytes_per_sec: 471889400
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 360350874178
+        network_throughput_bytes_per_sec: 20189772027
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
@@ -5357,14 +10315,14 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 365819545593
+        network_throughput_bytes_per_sec: 232468006096
       }
       entries {
         instruction {
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5372,26 +10330,27 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           channel_id: 1
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 367582840366
+        network_throughput_bytes_per_sec: 75112893982
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 128
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -5401,61 +10360,63 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 421983399567
+        network_throughput_bytes_per_sec: 51990251
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 558905190219
+        network_throughput_bytes_per_sec: 239729309556
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 227444546849
+        network_throughput_bytes_per_sec: 713091922
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
@@ -5467,36 +10428,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 198288807469
+        network_throughput_bytes_per_sec: 2343249427
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 242242337131
+        fingerprint: "6411603f6b84e86918c660203d2586d6"
+        network_throughput_bytes_per_sec: 182449888
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5512,37 +10479,60 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 393268230936
+        network_throughput_bytes_per_sec: 12681114551
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 256
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "6823bce11114d8ceb5386584c6072fc8"
+        network_throughput_bytes_per_sec: 86122792
+      }
+      entries {
+        instruction {
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 434192960662
+        network_throughput_bytes_per_sec: 503937007
       }
       entries {
         instruction {
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5551,21 +10541,21 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 537782991954
+        network_throughput_bytes_per_sec: 385603346434
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5574,27 +10564,26 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 367611030160
+        network_throughput_bytes_per_sec: 2813186813
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -5604,42 +10593,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 371222516014
+        network_throughput_bytes_per_sec: 6913080168
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 389556301155
+        network_throughput_bytes_per_sec: 13107200000
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -5649,80 +10638,90 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 383957524230
+        network_throughput_bytes_per_sec: 60963720930
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 450461571507
+        fingerprint: "e70429d2d9bdfaadf7c6d6cd451ad2d0"
+        network_throughput_bytes_per_sec: 174624829
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 623839068919
+        fingerprint: "ea38ae3f2a296149dde0cac2c673fd8f"
+        network_throughput_bytes_per_sec: 263586259582
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 229968237690
+        fingerprint: "61c2170cfeaf7234b58eea7d3c5cc136"
+        network_throughput_bytes_per_sec: 15856283078
       }
       entries {
         instruction {
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5737,42 +10736,47 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 201566254276
+        network_throughput_bytes_per_sec: 13787160
       }
       entries {
         instruction {
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 250289469463
+        fingerprint: "f57f58c135e77c22b55a8bdbe156e2a4"
+        network_throughput_bytes_per_sec: 17673026360
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -5782,20 +10786,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 401897616481
+        network_throughput_bytes_per_sec: 101426307
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -5805,37 +10808,36 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 443094410881
+        network_throughput_bytes_per_sec: 450461571507
       }
       entries {
         instruction {
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
-          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 549964466006
+        network_throughput_bytes_per_sec: 7492390
       }
       entries {
         instruction {
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
@@ -5851,263 +10853,243 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        network_throughput_bytes_per_sec: 376090719201
+        network_throughput_bytes_per_sec: 5859799713
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
           use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 383183624061
+        network_throughput_bytes_per_sec: 16031311154
       }
       entries {
         instruction {
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
             dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
+          metadata {}
           channel_id: 1
-          use_global_device_ids: true
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        network_throughput_bytes_per_sec: 410431685299
+        fingerprint: "abd8c8f7ce2fd95b63645316537b74e7"
+        network_throughput_bytes_per_sec: 289717760542
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 388143599946
+        network_throughput_bytes_per_sec: 360350874178
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
             dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 468722523974
+        network_throughput_bytes_per_sec: 250289469463
       }
       entries {
         instruction {
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           dimensions: 0
           channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        network_throughput_bytes_per_sec: 642663627744
+        network_throughput_bytes_per_sec: 12720496894
       }
-    }
-  }
-  entries {
-    key: "sm_100"
-    value {
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "9f6af0d6b827293d0386ca2378e5d71b"
-        network_throughput_bytes_per_sec: 22743425
+        fingerprint: "47bea0ecc1ea80cb91e9b291e81894f8"
+        network_throughput_bytes_per_sec: 65284552465
       }
       entries {
-        instruction {
-          name: "_"
-          opcode: "all-reduce"
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "a9f1ccc1dcdd0ea4b6d345c626fc0464"
-        network_throughput_bytes_per_sec: 27777777
+        fingerprint: "275f04921ccc4b0fa29127fb3a80a2f2"
+        network_throughput_bytes_per_sec: 4457321635
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "997ccdcc3c56a275439d3efb9d75628b"
-        network_throughput_bytes_per_sec: 6134969
+        fingerprint: "f6d54f4055232256fcd6c0bf18d19f73"
+        network_throughput_bytes_per_sec: 142461701757
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "2168cceecb93680ecc7859d77fc1492b"
-        network_throughput_bytes_per_sec: 30303030
+        network_throughput_bytes_per_sec: 211321241434
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -6117,89 +11099,70 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "f40071ad502637ac5fbb867e29baea24"
-        network_throughput_bytes_per_sec: 37037037
+        network_throughput_bytes_per_sec: 98698795180
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "48e28feb64dc3689b568926698d8e02e"
-        network_throughput_bytes_per_sec: 5913879
+        fingerprint: "44d6bfd785c449cc8640234e220f77aa"
+        network_throughput_bytes_per_sec: 12531982025
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ad8d88294e7338062a0ebdb4f4cd2eb5"
-        network_throughput_bytes_per_sec: 22727272
+        network_throughput_bytes_per_sec: 299037786967
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 64
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -6209,57 +11172,41 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "790112769afed6a30fcc1c8cbce08768"
-        network_throughput_bytes_per_sec: 38415366
+        network_throughput_bytes_per_sec: 181823478411
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 128
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "22006dffcceb6352ce0d7f47b568a045"
-        network_throughput_bytes_per_sec: 5436629
+        network_throughput_bytes_per_sec: 176691549414
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -6269,235 +11216,183 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "4f39fbed3ed5b26fd8f01ceb12a6958e"
-        network_throughput_bytes_per_sec: 11359602
+        network_throughput_bytes_per_sec: 251226692
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "1f1e13b498f3efcf560154972498ced3"
-        network_throughput_bytes_per_sec: 5616005
+        network_throughput_bytes_per_sec: 227204246905
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "25f7485f5479f664a4f493d61ff1a4d8"
-        network_throughput_bytes_per_sec: 5207485
+        network_throughput_bytes_per_sec: 376090719201
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "d4bb9864d0881eafd2fa5efec09c6e99"
-        network_throughput_bytes_per_sec: 50039093
+        network_throughput_bytes_per_sec: 393543428
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "4ce7115cb0355a0436faffe9d4f63a60"
-        network_throughput_bytes_per_sec: 49960967
+        network_throughput_bytes_per_sec: 82022528160
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "f36d3cb56a18e44b99c8e2bd01e47a20"
-        network_throughput_bytes_per_sec: 100000000
+        network_throughput_bytes_per_sec: 314227150134
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "991c99cf49ca7ed01041e4151f354da3"
-        network_throughput_bytes_per_sec: 71111111
+        network_throughput_bytes_per_sec: 59795620437
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "21c3dea5239284aed45517eccc2c77a1"
-        network_throughput_bytes_per_sec: 12118916
+        fingerprint: "4ffa457833b6c4ff6e5147781e31302a"
+        network_throughput_bytes_per_sec: 103156802223
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -6507,119 +11402,87 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "bffcfdab46001339d858d2bde0841588"
-        network_throughput_bytes_per_sec: 12424771
+        network_throughput_bytes_per_sec: 177364005412
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 64
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "f4c16425b37cb7c0469f632bc5f2954f"
-        network_throughput_bytes_per_sec: 62256809
+        network_throughput_bytes_per_sec: 347901791639
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 128
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "e94911366d0b4cfff0d6742f123be8a9"
-        network_throughput_bytes_per_sec: 77575757
+        network_throughput_bytes_per_sec: 3605633802
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "a6f8b72cb4b9af96eef324934f9de021"
-        network_throughput_bytes_per_sec: 22199098
+        network_throughput_bytes_per_sec: 198288807469
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -6629,145 +11492,109 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "b4310b6629879dd69750c1adf77eca35"
-        network_throughput_bytes_per_sec: 20792722
+        network_throughput_bytes_per_sec: 20765736
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "fc2857874212cf751e4b60decec734a7"
-        network_throughput_bytes_per_sec: 52588331
+        network_throughput_bytes_per_sec: 42833986928
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "3f33c91a674d3a639ad8460a9fee21f0"
-        network_throughput_bytes_per_sec: 76830732
+        network_throughput_bytes_per_sec: 73388577827
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "fab7de7fff87c4825d063b15cf576f4a"
-        network_throughput_bytes_per_sec: 81632653
+        network_throughput_bytes_per_sec: 170016376165
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "48cd9372c015dda07a982da1a5620727"
-        network_throughput_bytes_per_sec: 121442125
+        network_throughput_bytes_per_sec: 3524956970
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -6777,27 +11604,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "9f83571c52a45ed6fa719a3ef1b35f56"
-        network_throughput_bytes_per_sec: 189910979
+        network_throughput_bytes_per_sec: 165913924050
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -6807,119 +11626,95 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "02feb5e94baeeab2372bc39cf64b4dc3"
-        network_throughput_bytes_per_sec: 121212121
+        network_throughput_bytes_per_sec: 12593389700
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "3fc3265f1d4f48b553f24df733a0ec07"
-        network_throughput_bytes_per_sec: 137931034
+        fingerprint: "d9af6aa7b045746dbcb1015482010070"
+        network_throughput_bytes_per_sec: 29791629968
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "5357a763f8b75abbd8c2b3aa99d399b0"
-        network_throughput_bytes_per_sec: 147976878
+        fingerprint: "c5eb925239f355be19b2a2ae51019597"
+        network_throughput_bytes_per_sec: 206503744116
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 128
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "687f7f1c2f260eab74a9b0fe20926d8a"
-        network_throughput_bytes_per_sec: 100000000
+        fingerprint: "d28e01182634547a1027d109446f3b88"
+        network_throughput_bytes_per_sec: 6836280185
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -6929,28 +11724,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "22929fda0f320c7393deb2c18af09d78"
-        network_throughput_bytes_per_sec: 114695340
+        network_throughput_bytes_per_sec: 70888047593
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -6960,55 +11746,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "5c21cfc6706586931dd655d0d05f90bb"
-        network_throughput_bytes_per_sec: 126984126
+        network_throughput_bytes_per_sec: 115076382791
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "0120a2fd4590718b617dbde5030314f0"
-        network_throughput_bytes_per_sec: 111888111
+        network_throughput_bytes_per_sec: 68266666666
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -7018,56 +11791,71 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "70a080ed258662e7a7c448a580386531"
-        network_throughput_bytes_per_sec: 155528554
+        network_throughput_bytes_per_sec: 5031941031
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "a2d264c6a62d0909b4dc78be82a6aff6"
-        network_throughput_bytes_per_sec: 124878048
+        fingerprint: "a69547b3c276d3341642608b1db494a6"
+        network_throughput_bytes_per_sec: 90527151860
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "83591db4b7a2726af43dd49fc404007b"
+        network_throughput_bytes_per_sec: 112599898656
+      }
+      entries {
+        instruction {
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 134217728
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -7077,27 +11865,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "6ab6cfdfc119a9143b046bd2262766d6"
-        network_throughput_bytes_per_sec: 102687525
+        network_throughput_bytes_per_sec: 367582840366
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -7107,27 +11887,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "b4c5c1f7997e35be9d661c5ab8917bbc"
-        network_throughput_bytes_per_sec: 332036316
+        network_throughput_bytes_per_sec: 189098712833
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -7137,180 +11909,139 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d6ceb25936203837f994d4ea62fccbcb"
-        network_throughput_bytes_per_sec: 228367528
+        network_throughput_bytes_per_sec: 97234421364
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "01cbebe328fc80e3c97d6b476fec64ff"
-        network_throughput_bytes_per_sec: 228571428
+        fingerprint: "15185f0d28c7a7edc1b4aecfa6d8f221"
+        network_throughput_bytes_per_sec: 249948514054
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "3a320de13af8d0a2b9013cfbf39985fb"
-        network_throughput_bytes_per_sec: 295953757
+        network_throughput_bytes_per_sec: 27443886097
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "66a5b4b136e3e63482a26491f2086663"
-        network_throughput_bytes_per_sec: 380386329
+        fingerprint: "99043b67c066b466fdfabf4ba0e10d9d"
+        network_throughput_bytes_per_sec: 334503879
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 256
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "8abe97935f10f37b31406be7fe615de0"
-        network_throughput_bytes_per_sec: 262026612
+        network_throughput_bytes_per_sec: 254726368
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ecf9d950bcd7b6c6223401f066216fd4"
-        network_throughput_bytes_per_sec: 310303030
+        network_throughput_bytes_per_sec: 3778597785
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -7320,26 +12051,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "75c0063e60c072af0ddaa5e2dbbfa741"
-        network_throughput_bytes_per_sec: 356050069
+        network_throughput_bytes_per_sec: 549964466006
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -7349,55 +12073,75 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "71fef15b131813a4d472ccf5528d373b"
-        network_throughput_bytes_per_sec: 210526315
+        network_throughput_bytes_per_sec: 3289959839
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
         }
-        fingerprint: "15f365ac896c0b583d549fa6577c6d56"
-        network_throughput_bytes_per_sec: 266112266
+        fingerprint: "4a5e09d90eb94f08a4eb31421f1e8443"
+        network_throughput_bytes_per_sec: 76497126892
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
           statistics_viz {}
+        }
+        fingerprint: "561c9e59f33d316b74205823cb42d04d"
+        network_throughput_bytes_per_sec: 285606702146
+      }
+      entries {
+        instruction {
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 524288
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -7407,147 +12151,112 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "6a6da3f8a701c6de63d3f3eff5a326d0"
-        network_throughput_bytes_per_sec: 307692307
+        network_throughput_bytes_per_sec: 47524292965
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "d23936a34187f9cecae83b71e3b7c071"
-        network_throughput_bytes_per_sec: 363894811
+        network_throughput_bytes_per_sec: 303847000869
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "ef3bbaf6d14ca5829ae80710febc85eb"
-        network_throughput_bytes_per_sec: 500000000
+        fingerprint: "b01e85aab5ef3da6357ca6f9cfb67b8f"
+        network_throughput_bytes_per_sec: 96956830291
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "954fa4e199cb0e689954013992370dc4"
-        network_throughput_bytes_per_sec: 731428571
+        network_throughput_bytes_per_sec: 371222516014
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "56f48d98625522ae94180017c6e4235a"
-        network_throughput_bytes_per_sec: 592592592
+        network_throughput_bytes_per_sec: 99447647951
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -7557,58 +12266,44 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c6fbc0a09d2d44806949eef49196c7a2"
-        network_throughput_bytes_per_sec: 639200998
+        network_throughput_bytes_per_sec: 468722523974
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "c986af5a1df1d20f73d7d40cf5b1e067"
-        network_throughput_bytes_per_sec: 1066666666
+        fingerprint: "bee86716cff01212fb687be758fc96e1"
+        network_throughput_bytes_per_sec: 271017233924
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 512
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -7618,28 +12313,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "a9e70188f014a7fd5d3664cb93a8ceea"
-        network_throughput_bytes_per_sec: 333550488
+        network_throughput_bytes_per_sec: 1740016992
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -7649,28 +12336,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "53d4043108c4d45128534295dc2c4234"
-        network_throughput_bytes_per_sec: 468007312
+        network_throughput_bytes_per_sec: 15873015
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -7680,26 +12359,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "644ed15c889d04ca582b384ff68fc71e"
-        network_throughput_bytes_per_sec: 380952380
+        network_throughput_bytes_per_sec: 342245989
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -7709,26 +12381,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "4fd09bd390682728f7da420003fb7c37"
-        network_throughput_bytes_per_sec: 533889468
+        network_throughput_bytes_per_sec: 870008496
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -7738,56 +12404,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ecaeb13272e9384319989d342680549c"
-        network_throughput_bytes_per_sec: 531671858
+        network_throughput_bytes_per_sec: 4729792147
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
-          shape {
-            element_type: F32
-            dimensions: 4096
-            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
-            is_dynamic_dimension: false
-          }
-          metadata {}
-          dimensions: 0
-          channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
-        }
-        fingerprint: "147048ce1f70eaaf4c41e3a478797c71"
-        network_throughput_bytes_per_sec: 648922686
-      }
-      entries {
-        instruction {
-          name: "_"
-          opcode: "all-reduce"
           shape {
             element_type: F32
             dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -7797,57 +12426,43 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d0a5ced62829c6ceb591eb442eb1b79c"
-        network_throughput_bytes_per_sec: 653061224
+        network_throughput_bytes_per_sec: 503937007
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "e6fb7f1db9f0def5ca27f01f28291a63"
-        network_throughput_bytes_per_sec: 842105263
+        network_throughput_bytes_per_sec: 822489959
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -7857,57 +12472,66 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "6632fafd05450c9ca5bdd86c67f7cc0a"
-        network_throughput_bytes_per_sec: 1452482269
+        network_throughput_bytes_per_sec: 488704223711
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "a9dda3180d9de81b8ac47f5af4e3717c"
+        network_throughput_bytes_per_sec: 171596145
+      }
+      entries {
+        instruction {
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 33554432
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "f6813a624e0d4cd5931fd1911a59ce8d"
-        network_throughput_bytes_per_sec: 781679389
+        network_throughput_bytes_per_sec: 216201237113
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -7917,58 +12541,43 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "a80bdc2e68784e2f6b048f9070199b93"
-        network_throughput_bytes_per_sec: 1185185185
+        network_throughput_bytes_per_sec: 537250786
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "012c7d96d1729d9e95f6cb6f9cc6646d"
-        network_throughput_bytes_per_sec: 1183815028
+        network_throughput_bytes_per_sec: 104746317
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1024
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -7978,28 +12587,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "bac7b1d59bfbf3ce3c9770a882b7d9d9"
-        network_throughput_bytes_per_sec: 1057851239
+        network_throughput_bytes_per_sec: 326151166407
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -8009,28 +12609,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "5cfc87185d90d1302586da3e1c0f6fde"
-        network_throughput_bytes_per_sec: 1067778936
+        network_throughput_bytes_per_sec: 58514285714
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -8040,55 +12632,70 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "28ed8d8831773650ac1210294feb985d"
-        network_throughput_bytes_per_sec: 1213270142
+        network_throughput_bytes_per_sec: 389556301155
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "36c99c9ecd2afb910616dba7e7604d76"
-        network_throughput_bytes_per_sec: 1120350109
+        network_throughput_bytes_per_sec: 147936794582
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
           statistics_viz {}
+        }
+        fingerprint: "aedd2df037a65be1250f90a94291ed61"
+        network_throughput_bytes_per_sec: 13073861652
+      }
+      entries {
+        instruction {
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -8098,56 +12705,44 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "7ab9f4f3c725f05b8e8f79d1d9a79a65"
-        network_throughput_bytes_per_sec: 890434782
+        network_throughput_bytes_per_sec: 30681647940
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "90fc6688a7894e5c0b2688b8b3b56e5b"
-        network_throughput_bytes_per_sec: 1204705882
+        fingerprint: "e824c0c6cda89c864c487464c4714920"
+        network_throughput_bytes_per_sec: 115599104957
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8157,27 +12752,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c578baa139f847a50200e65a501cfe37"
-        network_throughput_bytes_per_sec: 1456614509
+        network_throughput_bytes_per_sec: 259355923818
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -8187,27 +12775,48 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c8fe77ad4d6a5f9c9e08e64b66c5aa36"
-        network_throughput_bytes_per_sec: 2064516129
+        network_throughput_bytes_per_sec: 343163538
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "31a79cde8d9fdcd983bb9252c3c92f73"
+        network_throughput_bytes_per_sec: 46920350814
+      }
+      entries {
+        instruction {
+          opcode: "reduce-scatter"
+          shape {
+            element_type: F32
+            dimensions: 1048576
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -8217,27 +12826,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "3007a47f1e00d56e53224073fd790288"
-        network_throughput_bytes_per_sec: 1878899082
+        network_throughput_bytes_per_sec: 196509745127
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8247,27 +12848,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "44ab650a56833b85a693cb515c43dee0"
-        network_throughput_bytes_per_sec: 2058291457
+        network_throughput_bytes_per_sec: 115481938325
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -8277,27 +12871,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "95b6a84a10fab4f19072622cadfd1acf"
-        network_throughput_bytes_per_sec: 2560000000
+        network_throughput_bytes_per_sec: 32125490196
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -8307,28 +12893,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "656be27a022b3b74531f5f2327584a2a"
-        network_throughput_bytes_per_sec: 3757798165
+        network_throughput_bytes_per_sec: 203192713884
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2048
+            dimensions: 32
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8338,117 +12916,86 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ad7d779a1ec5f906606de9e4e93650d9"
-        network_throughput_bytes_per_sec: 1601250977
+        network_throughput_bytes_per_sec: 47904191
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "96b26a3b1e59d77bab95e0026d23275c"
-        network_throughput_bytes_per_sec: 2074974670
+        network_throughput_bytes_per_sec: 158875151515
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "8f93e3c7983b80f6171d861fa67a2bb5"
-        network_throughput_bytes_per_sec: 2135557872
+        network_throughput_bytes_per_sec: 148544553052
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "8eb0c3afb6d4a5a42937daf0b2ca9327"
-        network_throughput_bytes_per_sec: 1853393665
+        network_throughput_bytes_per_sec: 27125827814
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -8458,56 +13005,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "0ac37c0c61461a98a53778858abef7c2"
-        network_throughput_bytes_per_sec: 1773160173
+        network_throughput_bytes_per_sec: 1160997732
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 64
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "8dd875a32ca9280cac2fb6c8b4a3f900"
-        network_throughput_bytes_per_sec: 2458583433
+        network_throughput_bytes_per_sec: 55220017
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8517,87 +13050,75 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "29421a4f1a4df068f67122b659311863"
-        network_throughput_bytes_per_sec: 2371742906
+        network_throughput_bytes_per_sec: 200281921497
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "7607dfea803f89904d8e152fb8189956"
-        network_throughput_bytes_per_sec: 4271115745
+        fingerprint: "a0c14787b94ec324f39d4b4cde6aaee8"
+        network_throughput_bytes_per_sec: 2500419687
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "766a46b63049615920fa933700606ad3"
-        network_throughput_bytes_per_sec: 4003910068
+        fingerprint: "21ea7fae57499ff2b1700818db1ccc37"
+        network_throughput_bytes_per_sec: 117049187304
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8607,57 +13128,41 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d5bb5db4849bbe872fc0a3ab1ec6d0d9"
-        network_throughput_bytes_per_sec: 3657142857
+        network_throughput_bytes_per_sec: 53691275
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "c5c9c12452980db70f61be1deb2f609c"
-        network_throughput_bytes_per_sec: 4000000000
+        network_throughput_bytes_per_sec: 642663627744
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -8667,90 +13172,63 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "bd85fa7c974409dc2c659688fdfb262f"
-        network_throughput_bytes_per_sec: 4413793103
+        network_throughput_bytes_per_sec: 623839068919
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4096
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "4a53acb196d9813effc6e5c5955a28c2"
-        network_throughput_bytes_per_sec: 3744058500
+        network_throughput_bytes_per_sec: 35463203463
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "3b80581304fa3c1ea59b5614ce83e167"
-        network_throughput_bytes_per_sec: 4566332218
+        network_throughput_bytes_per_sec: 10317380352
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
-            is_dynamic_dimension: false
-          }
-          metadata {}
-          dimensions: 0
-          channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -8760,26 +13238,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "02f84dcfcb5697b10aa0548ff15c1379"
-        network_throughput_bytes_per_sec: 7074265975
+        network_throughput_bytes_per_sec: 536687631
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8789,85 +13261,71 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "0f527662a7b68694b48d50d10a297e0b"
-        network_throughput_bytes_per_sec: 3205007824
+        network_throughput_bytes_per_sec: 367611030160
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "2c9f9cea76ec0a7f35987d8bdeea1d30"
-        network_throughput_bytes_per_sec: 3552471812
+        network_throughput_bytes_per_sec: 204003112840
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "584651046fab6aa022a9fcdaa741ca49"
-        network_throughput_bytes_per_sec: 4556173526
+        fingerprint: "a5c25d90d3703c3e05a5428a5fbafe10"
+        network_throughput_bytes_per_sec: 2575290789
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -8877,57 +13335,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "b119f67d214e8219e6b672422c7ff82d"
-        network_throughput_bytes_per_sec: 4940892641
+        network_throughput_bytes_per_sec: 88983027834
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "6b3fd8cf011b133409ba2ce19f78aed7"
-        network_throughput_bytes_per_sec: 6239146991
+        network_throughput_bytes_per_sec: 410431685299
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -8937,118 +13380,92 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "b3774b957d17024bdec4dfb6aa7015b8"
-        network_throughput_bytes_per_sec: 10680573663
+        network_throughput_bytes_per_sec: 5298835705
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "4f6b04c57baf41d308928831e49e6f05"
-        network_throughput_bytes_per_sec: 6924767540
+        network_throughput_bytes_per_sec: 79054282267
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "696f90418e9a91559c454b6ef6ce9b85"
-        network_throughput_bytes_per_sec: 6948261238
+        network_throughput_bytes_per_sec: 227444546849
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "e5dc26fcdbdb577aa5941155f0d4fc57"
-        network_throughput_bytes_per_sec: 10252816020
+        fingerprint: "353ed7150bd91617fa8843f2620c704d"
+        network_throughput_bytes_per_sec: 87089641
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8192
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9058,28 +13475,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c161c6ad6758b19e819c0db57a16b335"
-        network_throughput_bytes_per_sec: 6090706319
+        network_throughput_bytes_per_sec: 343936367363
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -9089,44 +13497,35 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "378df9b975bd6f83307c54c5ebf8a5e3"
-        network_throughput_bytes_per_sec: 7192273924
+        network_throughput_bytes_per_sec: 193319306331
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 128
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b6cdb267bfc64ec38e1e2740095c8805"
-        network_throughput_bytes_per_sec: 7111111111
+        network_throughput_bytes_per_sec: 136606189
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
             dimensions: 65536
@@ -9134,12 +13533,34 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
           statistics_viz {}
+        }
+        fingerprint: "0cb65c209a26b3c7c2d519854b62c67c"
+        network_throughput_bytes_per_sec: 9752380952
+      }
+      entries {
+        instruction {
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9149,85 +13570,71 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d04da75eb576bed3d4db82103261bf34"
-        network_throughput_bytes_per_sec: 6370139968
+        network_throughput_bytes_per_sec: 8650475184
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "5b2d32958f53fbfe8142848551afad7c"
-        network_throughput_bytes_per_sec: 9683215130
+        fingerprint: "7d42a2c52405b0487725b632c32b3246"
+        network_throughput_bytes_per_sec: 57640194044
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
             dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "3e00e3751db2f0a54edfc160509a0c32"
-        network_throughput_bytes_per_sec: 7185964912
+        network_throughput_bytes_per_sec: 7953398058
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9237,27 +13644,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "188b7c8f49ee518ef3f12f7239d0542a"
-        network_throughput_bytes_per_sec: 8245596376
+        network_throughput_bytes_per_sec: 348118354981
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -9267,57 +13667,48 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "f7fd5b7bdf4d97b0eb10f5fbab3117c5"
-        network_throughput_bytes_per_sec: 14234578627
+        network_throughput_bytes_per_sec: 240388812471
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "224b0c65bfb5a0772e3b3ef5e529c630"
-        network_throughput_bytes_per_sec: 20505632040
+        fingerprint: "d1ed25619ddb7091e280b64f8bf17b71"
+        network_throughput_bytes_per_sec: 640300140
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9327,27 +13718,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "9baae3a878bfbfff40ac180eb9c74753"
-        network_throughput_bytes_per_sec: 16384000000
+        network_throughput_bytes_per_sec: 277657266
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -9357,58 +13741,48 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "03330f0defed011fd622da3ddcd58de5"
-        network_throughput_bytes_per_sec: 17636167922
+        network_throughput_bytes_per_sec: 275565123
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "cc87abd4e4f49d00a84464d67ac1cc63"
-        network_throughput_bytes_per_sec: 26947368421
+        fingerprint: "71a05d2710cbb210f44bd79ab02e4544"
+        network_throughput_bytes_per_sec: 144412841361
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16384
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9418,59 +13792,43 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "4afbc2a251fde8fe2a2f21f9b3680acf"
-        network_throughput_bytes_per_sec: 12700775193
+        network_throughput_bytes_per_sec: 393268230936
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "a83d894078ce2cfadd898cfbdd4955ea"
-        network_throughput_bytes_per_sec: 16094302554
+        network_throughput_bytes_per_sec: 35158798283
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
             dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -9480,26 +13838,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ad9a13d7c03557ae4a78b547c910aaa9"
-        network_throughput_bytes_per_sec: 18244988864
+        network_throughput_bytes_per_sec: 20739240506
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9509,145 +13861,123 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "1576d013b336b385ef44a95d0cd74a2c"
-        network_throughput_bytes_per_sec: 15968810916
+        network_throughput_bytes_per_sec: 128944417117
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "82724b215353fdf447c8f5867b927fe2"
-        network_throughput_bytes_per_sec: 14185281385
+        fingerprint: "3c0dab67f668aa96622d925405aa4c35"
+        network_throughput_bytes_per_sec: 352404714
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "5016df44b768d5084d1607b448e77a3d"
-        network_throughput_bytes_per_sec: 13462612982
+        fingerprint: "1c34dad9da64a4a58f0c8f86c14a73a2"
+        network_throughput_bytes_per_sec: 32431522949
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
             dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b064bd92e8bb26128984b39785b63827"
-        network_throughput_bytes_per_sec: 21375081539
+        network_throughput_bytes_per_sec: 51200000000
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "8f9d476f5290f9fd2653c4f975ec810e"
-        network_throughput_bytes_per_sec: 28370562770
+        fingerprint: "260f965f1a4678225622c8bb1bb605bd"
+        network_throughput_bytes_per_sec: 109586962343
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -9657,27 +13987,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "7090d041bc0599ce9b8dc0095e8d7135"
-        network_throughput_bytes_per_sec: 34168925964
+        network_throughput_bytes_per_sec: 330312175145
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9687,179 +14010,164 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "762f33c509d278627132d4806885cfc9"
-        network_throughput_bytes_per_sec: 29283288650
+        network_throughput_bytes_per_sec: 51280125195
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 256
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "0f9cdfcb5a2647c85e89d23875e02e61"
+        network_throughput_bytes_per_sec: 90820399
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "f9040b578d0f9eba41c4ac1a07ee4224"
-        network_throughput_bytes_per_sec: 44582312925
+        network_throughput_bytes_per_sec: 56939501
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "eebdac40c94b93e0e8351eba013b7958"
-        network_throughput_bytes_per_sec: 44826265389
+        network_throughput_bytes_per_sec: 32064128
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 32768
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "bb6b916563714f9bbad245e71a3d56e3"
-        network_throughput_bytes_per_sec: 32031280547
+        network_throughput_bytes_per_sec: 756277695
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "498af7a3213702edfadb727477672515"
-        network_throughput_bytes_per_sec: 25051987767
+        fingerprint: "30228c58f0f8bfd498c30a0b4c75491e"
+        network_throughput_bytes_per_sec: 290438102739
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "0c83ea7171fd3e561e96f900b77b901b"
-        network_throughput_bytes_per_sec: 34026998961
+        network_throughput_bytes_per_sec: 306825457205
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9869,85 +14177,63 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "2e4edd9d5f901a539189122797458ea4"
-        network_throughput_bytes_per_sec: 24526946107
+        network_throughput_bytes_per_sec: 83591836734
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ade554f65ab408a90ce8976d623c021f"
-        network_throughput_bytes_per_sec: 25680250783
+        network_throughput_bytes_per_sec: 70368334
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ef5d4bc6c48c17023f1713afce3c88c3"
-        network_throughput_bytes_per_sec: 26947368421
+        network_throughput_bytes_per_sec: 153772693943
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -9957,117 +14243,88 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "1287402cc6f1747882876a7ef488e090"
-        network_throughput_bytes_per_sec: 33590978985
+        network_throughput_bytes_per_sec: 253218063269
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "141f0055829b89a154053476d8d5fabe"
-        network_throughput_bytes_per_sec: 43690666666
+        network_throughput_bytes_per_sec: 1650282030
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "e937e6f6cd2b6db16fe9cd7c2979d357"
-        network_throughput_bytes_per_sec: 66064516129
+        network_throughput_bytes_per_sec: 160200250
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
             dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "a9ce40b744201dc85700fe442e18c6d3"
-        network_throughput_bytes_per_sec: 46512420156
+        network_throughput_bytes_per_sec: 153210987726
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -10077,27 +14334,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "7c115f6e4ef6c361899235020514a18f"
-        network_throughput_bytes_per_sec: 68912723449
+        network_throughput_bytes_per_sec: 17031185031
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -10107,59 +14356,43 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "713bd2c46abfb6ce361f71da4eb7d023"
-        network_throughput_bytes_per_sec: 75851851851
+        network_throughput_bytes_per_sec: 221837088
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 65536
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "0690a7935a759eb144aeec77b49771d9"
-        network_throughput_bytes_per_sec: 51160031225
+        network_throughput_bytes_per_sec: 346765656649
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 131072
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -10169,28 +14402,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "fd886f592d094bb3c992eb050b3aeb7a"
-        network_throughput_bytes_per_sec: 76920187793
+        network_throughput_bytes_per_sec: 225749559
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -10200,26 +14425,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d8ba4d1f1855930537e677def956da0c"
-        network_throughput_bytes_per_sec: 85333333333
+        network_throughput_bytes_per_sec: 6942372881
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -10229,175 +14447,132 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "5ecb8bfdacc065c41d076a95fde10de2"
-        network_throughput_bytes_per_sec: 51360501567
+        network_throughput_bytes_per_sec: 212665939916
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ee0857faea7d71857c6fb8036e979d2d"
-        network_throughput_bytes_per_sec: 48725650557
+        network_throughput_bytes_per_sec: 136818371607
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "413eb782ec4e2c4409f95026e7b720e6"
-        network_throughput_bytes_per_sec: 48617210682
+        network_throughput_bytes_per_sec: 106649308380
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "17e8b701cba039ffed5c4c64b81bd38e"
-        network_throughput_bytes_per_sec: 50567901234
+        network_throughput_bytes_per_sec: 156670746
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "668e8b905ffbce1829608cbf4befc332"
-        network_throughput_bytes_per_sec: 85333333333
+        network_throughput_bytes_per_sec: 709141274
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "fd47bb6bf2e69ef941ae3455e0980ec2"
-        network_throughput_bytes_per_sec: 68195629552
+        network_throughput_bytes_per_sec: 201566254276
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
             dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -10407,27 +14582,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "f791188a95c2a553c9030fa46c38f2a0"
-        network_throughput_bytes_per_sec: 97523809523
+        network_throughput_bytes_per_sec: 251276300023
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -10437,27 +14604,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ca28c1cb14c2361c8daf298f6de5fc47"
-        network_throughput_bytes_per_sec: 102480062548
+        network_throughput_bytes_per_sec: 1400820793
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -10467,12 +14627,10 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "0ee3ab31a2cbac43bf62cb9214b7c1d1"
-        network_throughput_bytes_per_sec: 102480062548
+        network_throughput_bytes_per_sec: 867796610
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
@@ -10480,15 +14638,9 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -10498,28 +14650,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "8154a3ab4411af0b0a94fc14f69ac096"
-        network_throughput_bytes_per_sec: 101057825751
+        network_throughput_bytes_per_sec: 117658886894
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -10529,205 +14673,160 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c708aebbaea40d6ad370faaa7d411d0f"
-        network_throughput_bytes_per_sec: 115481938325
+        network_throughput_bytes_per_sec: 402872346556
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "e188b963e5912029be041b83f8c32803"
-        network_throughput_bytes_per_sec: 130031746031
+        network_throughput_bytes_per_sec: 722143864
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "96ca5c7e1f1638828edc08d3daa8f8f7"
-        network_throughput_bytes_per_sec: 78486227544
+        network_throughput_bytes_per_sec: 365819545593
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "542e81cf1beafb7de263b511ad1f5d7c"
-        network_throughput_bytes_per_sec: 77926278240
+        fingerprint: "96c966eff2db96826fa05a63d70abdfb"
+        network_throughput_bytes_per_sec: 244936493728
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "76f465d808cb817683770b3e6ab6838d"
-        network_throughput_bytes_per_sec: 92827195467
+        network_throughput_bytes_per_sec: 401897616481
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "50e0778e29cbb85bbe9e52c8a9e3f53b"
-        network_throughput_bytes_per_sec: 89134308058
+        network_throughput_bytes_per_sec: 122137404
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "6b9183f8b747bcf1e11a605c2867f7ff"
-        network_throughput_bytes_per_sec: 128313264806
+        network_throughput_bytes_per_sec: 2763832658
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -10737,27 +14836,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "4fcbe7e7d7bb2d20520c051dd60ad89e"
-        network_throughput_bytes_per_sec: 118832275611
+        network_throughput_bytes_per_sec: 676354029
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
             dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -10767,150 +14858,112 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "2d89d55373f92fb4e1b55e7456f04f79"
-        network_throughput_bytes_per_sec: 134226318484
+        network_throughput_bytes_per_sec: 80117359413
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "0588b7c974105348d0a2515c0abd898b"
-        network_throughput_bytes_per_sec: 99712438189
+        fingerprint: "b9f629195d1e0d1afeebe40aef9955df"
+        network_throughput_bytes_per_sec: 77324336780
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "fac36931d73a8f77c6af1f29aa01f950"
-        network_throughput_bytes_per_sec: 196952667167
+        network_throughput_bytes_per_sec: 374926611245
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 262144
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "0f6fd7c2255ef5d6619d3e266e4493b1"
-        network_throughput_bytes_per_sec: 162017305315
+        network_throughput_bytes_per_sec: 383183624061
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 524288
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "99623f61db0dd4933df0e1e1215c75eb"
-        network_throughput_bytes_per_sec: 124121212121
+        network_throughput_bytes_per_sec: 1777777777
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -10920,26 +14973,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ae02e28f43d17a07b5679b487f20fb4a"
-        network_throughput_bytes_per_sec: 157349339735
+        network_throughput_bytes_per_sec: 246607714016
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -10949,26 +14996,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "59b183e8704a0ba13869ade15fa2b92b"
-        network_throughput_bytes_per_sec: 156878515858
+        network_throughput_bytes_per_sec: 162539682
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -10978,26 +15018,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ef648be80ef0366d556323938cba8b8b"
-        network_throughput_bytes_per_sec: 145797552836
+        network_throughput_bytes_per_sec: 288545954870
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -11007,87 +15041,65 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "22444f0e312a3499cbfd75eaf67c0888"
-        network_throughput_bytes_per_sec: 127937530502
+        network_throughput_bytes_per_sec: 66737270875
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "5d605c04268d15a46391c0ef5400e98e"
-        network_throughput_bytes_per_sec: 128000000000
+        network_throughput_bytes_per_sec: 1142857142
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "2965a87c08fc2bb44b5e63f4a8232930"
-        network_throughput_bytes_per_sec: 178086956521
+        network_throughput_bytes_per_sec: 4302521008
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -11097,57 +15109,65 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "03e84199997302127c56cdb92f61f92d"
-        network_throughput_bytes_per_sec: 218271440466
+        network_throughput_bytes_per_sec: 8933478735
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "61804978175d4c86d2e9535c315a7c67"
-        network_throughput_bytes_per_sec: 281572502685
+        network_throughput_bytes_per_sec: 242242337131
+      }
+      entries {
+        instruction {
+          name: "collective-permute"
+          opcode: "collective-permute"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          metadata {}
+          channel_id: 1
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
+          frontend_attributes {}
+          statistics_viz {}
+        }
+        fingerprint: "9ca39fa8794daa9631163ee0e266097e"
+        network_throughput_bytes_per_sec: 668352778
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -11157,27 +15177,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ca3eb58708ad6e9c3551f90b9d193653"
-        network_throughput_bytes_per_sec: 227654363873
+        network_throughput_bytes_per_sec: 247189061763
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -11187,13 +15200,12 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "ae9b470b1a59d43e5b5fa203ca090cd9"
-        network_throughput_bytes_per_sec: 327680000000
+        network_throughput_bytes_per_sec: 154216867
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
             dimensions: 524288
@@ -11201,45 +15213,52 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "393073cf2c2fb7f64008edb89f2f877a"
+        network_throughput_bytes_per_sec: 55877860968
+      }
+      entries {
+        instruction {
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 131072
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "19bfe133a0d02c4cac3fe71ed6e3e741"
-        network_throughput_bytes_per_sec: 273208963001
+        network_throughput_bytes_per_sec: 12328066215
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -11249,28 +15268,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "3b97564ed96cdb745f18dfba342d89a0"
-        network_throughput_bytes_per_sec: 198443603330
+        network_throughput_bytes_per_sec: 400219847328
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -11280,26 +15291,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "905de60dd6e7b89f580e55ae80ac8d79"
-        network_throughput_bytes_per_sec: 330989898989
+        network_throughput_bytes_per_sec: 347397523501
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -11309,266 +15313,240 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "abd659445180bd7e7b25417a2c970841"
-        network_throughput_bytes_per_sec: 220474348191
+        network_throughput_bytes_per_sec: 229968237690
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b5dc833e1006db332dcf16fc073558e4"
-        network_throughput_bytes_per_sec: 215578947368
+        network_throughput_bytes_per_sec: 132262361251
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "7c15270553f884594396ea3a9e22288a"
-        network_throughput_bytes_per_sec: 208464413518
+        network_throughput_bytes_per_sec: 4075621890
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "695c9c2e1a16cd287cd6b80d66c3cf24"
-        network_throughput_bytes_per_sec: 170638893409
+        network_throughput_bytes_per_sec: 127617148
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "add1d98ea03d7ecc59ff9877b5bd5e93"
-        network_throughput_bytes_per_sec: 203567462628
+        fingerprint: "1f4b3db733dab96bb8fa97d8f2bb2c7e"
+        network_throughput_bytes_per_sec: 292410046019
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "d99a31a5731cd087d2a57a04dbeda416"
-        network_throughput_bytes_per_sec: 277694915254
+        fingerprint: "faccbf0108d642668cb20ee319a39541"
+        network_throughput_bytes_per_sec: 286685266708
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "c37087da0a44968200e66995156557e7"
-        network_throughput_bytes_per_sec: 399305407463
+        network_throughput_bytes_per_sec: 271581455581
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "d71debdb12a8986a0f288def2a8ac093"
-        network_throughput_bytes_per_sec: 390095238095
+        fingerprint: "dd72a6e820b6d534b73545dc695b9277"
+        network_throughput_bytes_per_sec: 2518484359
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "e4c41dd6f65b2df6f78f48e9ef705d3e"
-        network_throughput_bytes_per_sec: 468532618409
+        network_throughput_bytes_per_sec: 66264914054
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 1048576
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "8c4f72b22cf1c427b5192fde2275b82d"
+        network_throughput_bytes_per_sec: 70532512139
+      }
+      entries {
+        instruction {
+          opcode: "all-to-all"
+          shape {
+            element_type: F32
+            dimensions: 256
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -11578,117 +15556,88 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "e67540b745e062c37cb2d5e38a645a43"
-        network_throughput_bytes_per_sec: 363836224843
+        network_throughput_bytes_per_sec: 10571522
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "32921213d78db478164e6ece7132d57a"
-        network_throughput_bytes_per_sec: 399457523809
+        network_throughput_bytes_per_sec: 140814081
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "46eb6d6f2c352a68dd5943ccbf21f917"
-        network_throughput_bytes_per_sec: 455111111111
+        network_throughput_bytes_per_sec: 4437703141
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "361bab01e1544bbd7d4c57964c7cb2e8"
-        network_throughput_bytes_per_sec: 296124258683
+        network_throughput_bytes_per_sec: 511594072086
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -11698,146 +15647,111 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "e1c68238c33369569964e0715085e11e"
-        network_throughput_bytes_per_sec: 333728835136
+        network_throughput_bytes_per_sec: 191345985401
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
             dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b398f3b5618fef0e8beefc3d9fb45eee"
-        network_throughput_bytes_per_sec: 292082451253
+        network_throughput_bytes_per_sec: 331854102381
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "c9746f1c866d390a80ecaa1cd0747467"
-        network_throughput_bytes_per_sec: 213494044589
+        network_throughput_bytes_per_sec: 8344198
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "96e82cb9d09f5d9b43c2800f01b5f3ff"
-        network_throughput_bytes_per_sec: 280105783357
+        network_throughput_bytes_per_sec: 112123182207
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "83d2f5e591feefe0553a301d532b898f"
-        network_throughput_bytes_per_sec: 337325398101
+        fingerprint: "067cb94ec99578ac4a5d635f77d6836e"
+        network_throughput_bytes_per_sec: 1329007138
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -11847,208 +15761,160 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "be567d531287055cea40ba66db60de94"
-        network_throughput_bytes_per_sec: 413313362238
+        network_throughput_bytes_per_sec: 58447488
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
             dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ea826b5b3a3cd11ff917bb395a93cd6f"
-        network_throughput_bytes_per_sec: 524288000000
+        network_throughput_bytes_per_sec: 395838429596
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "0b76c03ffc5616ae8aaf7fe05d58a8e5"
-        network_throughput_bytes_per_sec: 585960324112
+        fingerprint: "9ea76d324a2a87d6007857f1bbd58e8b"
+        network_throughput_bytes_per_sec: 7858504706
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 2097152
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "f85a8aefce6c3bcea1643e35008f1774"
-        network_throughput_bytes_per_sec: 390822213939
+        network_throughput_bytes_per_sec: 178663486113
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "77c8cbc50f987483fee44f7a20bf8b1f"
-        network_throughput_bytes_per_sec: 528649357196
+        network_throughput_bytes_per_sec: 134736842
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
             dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b3181d33b743bf7d8e93edc0102fa54b"
-        network_throughput_bytes_per_sec: 577250756950
+        network_throughput_bytes_per_sec: 296124258683
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b0b00085d45064632771977a738377fa"
-        network_throughput_bytes_per_sec: 405874201664
+        network_throughput_bytes_per_sec: 7876923076
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -12058,56 +15924,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d84ec32ec9d5bce065cad02d30309053"
-        network_throughput_bytes_per_sec: 413557878130
+        network_throughput_bytes_per_sec: 280087527
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "62c4fc6430ca1eb1c9da917231cf7c2c"
-        network_throughput_bytes_per_sec: 424438777575
+        network_throughput_bytes_per_sec: 443094410881
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -12117,27 +15969,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d08ba83901c2428a2cacfc0d6e826840"
-        network_throughput_bytes_per_sec: 293924597056
+        network_throughput_bytes_per_sec: 353651264755
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -12147,102 +15992,89 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "98b831c4ca72406f37389214c5e19865"
-        network_throughput_bytes_per_sec: 311496769402
+        network_throughput_bytes_per_sec: 93090909090
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "e39d4c38abe32c0b8bf790196f492d26"
-        network_throughput_bytes_per_sec: 383216445865
+        fingerprint: "9efe18a44677d5ebf6d950d12f0105d0"
+        network_throughput_bytes_per_sec: 111468000783
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "70cc3d28822cfa34a6f4f77936bd4122"
-        network_throughput_bytes_per_sec: 567411255411
+        fingerprint: "baf06cabffc7267e4ba88f3f0469f867"
+        network_throughput_bytes_per_sec: 657252888
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "64fc8a1589366b418b88651876990852"
-        network_throughput_bytes_per_sec: 590414414414
+        network_throughput_bytes_per_sec: 9173572228
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
@@ -12250,136 +16082,100 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "3a965e23c93556616a155131d28e076f"
-        network_throughput_bytes_per_sec: 693387998016
+        network_throughput_bytes_per_sec: 383321513434
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 4194304
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "8426634cb954bc9a04f5d3df48489b36"
-        network_throughput_bytes_per_sec: 584653470867
+        network_throughput_bytes_per_sec: 317942995755
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "f2e7c307868389c7b945a60985fdbfb7"
-        network_throughput_bytes_per_sec: 600473013600
+        fingerprint: "1f93b4ce5f502f4229278ea4c2936bb8"
+        network_throughput_bytes_per_sec: 11199384799
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 2097152
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "58212b0c758c8c906c2df8d9cd23841e"
-        network_throughput_bytes_per_sec: 675411272141
+        network_throughput_bytes_per_sec: 306960187353
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -12389,205 +16185,164 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "85bb8349a62442dcab56384b99cbe6d0"
-        network_throughput_bytes_per_sec: 511875030510
+        network_throughput_bytes_per_sec: 383957524230
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ffb987920d3dcd0f23601030220f2c32"
-        network_throughput_bytes_per_sec: 515524090462
+        network_throughput_bytes_per_sec: 7626310
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "4447fc843a198e997259f2a45d1c5078"
-        network_throughput_bytes_per_sec: 515460734914
+        network_throughput_bytes_per_sec: 193750184774
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "906601acb6c0bdb8e772d3adb5a7e148"
-        network_throughput_bytes_per_sec: 318474107820
+        network_throughput_bytes_per_sec: 7816793893
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "3fbaf73ace028a5c0673748316b980bf"
-        network_throughput_bytes_per_sec: 331029083303
+        fingerprint: "04a5265203d9bcb5e98c819340f31d6c"
+        network_throughput_bytes_per_sec: 69698295057
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 8388608
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "6dd3b94c1709ebff95103a4422009c8b"
-        network_throughput_bytes_per_sec: 413455961358
+        fingerprint: "e78c4bfa48ee5d7f81743a050aa5f803"
+        network_throughput_bytes_per_sec: 74252717998
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b253ab825e72a17754e0f920ba0ac47b"
-        network_throughput_bytes_per_sec: 619725768321
+        network_throughput_bytes_per_sec: 109821533305
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -12597,58 +16352,72 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "6b3993ffc7a2464a3f8f42c61f55394d"
-        network_throughput_bytes_per_sec: 633198067632
+        network_throughput_bytes_per_sec: 24129602356
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "10f1eac7685082516c77e28c1c570603"
-        network_throughput_bytes_per_sec: 749183531303
+        fingerprint: "7c2845488c50871c6df2b01b09b93607"
+        network_throughput_bytes_per_sec: 271990404586
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 8388608
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 target: 2 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 4 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 6 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "3aa698dd32575d30602a31ddba9fefac"
+        network_throughput_bytes_per_sec: 41157750127
+      }
+      entries {
+        instruction {
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 262144
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -12658,28 +16427,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "7cac54d6e1d08a6cd127ab0f81c94048"
-        network_throughput_bytes_per_sec: 608355065632
+        network_throughput_bytes_per_sec: 33677286742
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 16777216
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -12689,28 +16450,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "1dc1e161c38f7eff0cea4c9c323dcfad"
-        network_throughput_bytes_per_sec: 642214668504
+        network_throughput_bytes_per_sec: 36008791208
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 4194304
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -12720,174 +16473,133 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "e001496ab38237a7a240b9087b52ce67"
-        network_throughput_bytes_per_sec: 759837681159
+        network_throughput_bytes_per_sec: 266001014713
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "5326b51b78d6cb159c04918bfead91ed"
-        network_throughput_bytes_per_sec: 584898061637
+        fingerprint: "87766ec8522f636b65d29ffbbac0f005"
+        network_throughput_bytes_per_sec: 4868583314
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "84086fea224a69018a6bcf0db282b861"
-        network_throughput_bytes_per_sec: 584490523968
+        network_throughput_bytes_per_sec: 325265917022
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 268435456
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "e77afb055f2c55ce8a9f881f93f4ccec"
-        network_throughput_bytes_per_sec: 592290333968
+        network_throughput_bytes_per_sec: 434192960662
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "f9d967d15c65b7c80d2055c0c6dbf3c6"
-        network_throughput_bytes_per_sec: 333722196805
+        network_throughput_bytes_per_sec: 50027480916
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "d51ae4f483b29ba1518794ed1f635d41"
-        network_throughput_bytes_per_sec: 351171449502
+        network_throughput_bytes_per_sec: 26047694753
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -12897,87 +16609,64 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "9c002f2bdf400f2638aac27df778dfc1"
-        network_throughput_bytes_per_sec: 438013106023
+        network_throughput_bytes_per_sec: 519129154031
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b51292fa84bc7fc3b5a42c808ed0538a"
-        network_throughput_bytes_per_sec: 650456170278
+        network_throughput_bytes_per_sec: 4511013215
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
-        }
-        fingerprint: "4351be00ad096ee1fcfe565c2215c7dd"
-        network_throughput_bytes_per_sec: 693502645502
+        }
+        network_throughput_bytes_per_sec: 27629005059
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 8192
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -12987,12 +16676,10 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "6fdb3d7a311222b991ec1edef14c5c26"
-        network_throughput_bytes_per_sec: 809086419753
+        network_throughput_bytes_per_sec: 2285714285
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
@@ -13000,15 +16687,9 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -13018,88 +16699,65 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c2aa9452829d26d4fbfc6be7dcd22902"
-        network_throughput_bytes_per_sec: 638305280779
+        network_throughput_bytes_per_sec: 358832552668
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "22d939e4ee859968ac17ebf1c62fef05"
-        network_throughput_bytes_per_sec: 715049908366
+        network_throughput_bytes_per_sec: 725212464
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-gather"
           shape {
             element_type: F32
             dimensions: 67108864
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "7da45b992c6f90f878e16349e67435ed"
-        network_throughput_bytes_per_sec: 818281032044
+        network_throughput_bytes_per_sec: 368082843352
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 1024
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -13109,115 +16767,86 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "25a76511510793444ae58b29d1d310cd"
-        network_throughput_bytes_per_sec: 612396554241
+        network_throughput_bytes_per_sec: 125984251
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 256
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "9c2db8231f5ef24a566554bcd16a60d4"
-        network_throughput_bytes_per_sec: 626225822104
+        network_throughput_bytes_per_sec: 18593840
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 536870912
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "fb071244e8d81a3688446f7a2515f445"
-        network_throughput_bytes_per_sec: 633006942348
+        network_throughput_bytes_per_sec: 388143599946
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "680f6ed8e838fee643167dc7a214bcd4"
-        network_throughput_bytes_per_sec: 344699539776
+        network_throughput_bytes_per_sec: 435771844155
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -13227,57 +16856,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "a490a74ef07c6ec14e8318d4b8142f8e"
-        network_throughput_bytes_per_sec: 367401723439
+        network_throughput_bytes_per_sec: 3631205673
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "7e50a143617aca892c3824cf04c47087"
-        network_throughput_bytes_per_sec: 461495736370
+        network_throughput_bytes_per_sec: 423399772870
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 524288
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -13287,27 +16901,47 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "64fcb07cb34dfdc261edff3094f5e329"
-        network_throughput_bytes_per_sec: 675180232207
+        network_throughput_bytes_per_sec: 36612290502
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 16384
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "0a1bea9a023ace07a93df0c757a64d8d"
+        network_throughput_bytes_per_sec: 4709737693
+      }
+      entries {
+        instruction {
+          opcode: "all-reduce"
+          shape {
+            element_type: F32
+            dimensions: 2048
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -13317,27 +16951,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "7e46aa6a95b79c94bd27c5e9de8038c1"
-        network_throughput_bytes_per_sec: 726223530430
+        network_throughput_bytes_per_sec: 228980322
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 128
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -13347,28 +16974,19 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "27a51e9c9148298fd01ee900e6a81c2c"
-        network_throughput_bytes_per_sec: 851808285946
+        network_throughput_bytes_per_sec: 11838697
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 33554432
+            dimensions: 1048576
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 1
@@ -13378,59 +16996,47 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "b8e3907c6dfb227acf1602dcedd1dfae"
-        network_throughput_bytes_per_sec: 695270135305
+        network_throughput_bytes_per_sec: 61077353215
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 1 }
+          source_target_pairs { source: 1 }
+          source_target_pairs { source: 2 target: 3 }
+          source_target_pairs { source: 3 target: 2 }
+          source_target_pairs { source: 4 target: 5 }
+          source_target_pairs { source: 5 target: 4 }
+          source_target_pairs { source: 6 target: 7 }
+          source_target_pairs { source: 7 target: 6 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
-          collective_device_list {
-            iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
-              iota_reshape_dims: 8
-              iota_transpose_perm: 0
-            }
-          }
         }
-        fingerprint: "0516ad83d2a5538970091f77531c73c6"
-        network_throughput_bytes_per_sec: 766958445714
+        fingerprint: "923306d188529fd23828978fba917eca"
+        network_throughput_bytes_per_sec: 1285019607
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
-          channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
+          dimensions: 0
+          channel_id: 1
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -13440,55 +17046,42 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "a7adb6a1534bbf74d4512d8e05eb5ad0"
-        network_throughput_bytes_per_sec: 859356451365
+        network_throughput_bytes_per_sec: 5970845481
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 131072
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "24af8009842c9b30f9fc309675fd46c8"
-        network_throughput_bytes_per_sec: 687140235910
+        network_throughput_bytes_per_sec: 13826160337
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 64
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -13498,176 +17091,131 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "afc15c1b4deac0e2789adf42fce916f4"
-        network_throughput_bytes_per_sec: 694823821751
+        network_throughput_bytes_per_sec: 28243601
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "4a4598a1e81e510779605c8c674a3ac0"
-        network_throughput_bytes_per_sec: 810630589713
+        network_throughput_bytes_per_sec: 90267983
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-reduce"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
+          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "bdda763fe4cbd4dbd2cd6e538df4d2f5"
-        network_throughput_bytes_per_sec: 350752968723
+        network_throughput_bytes_per_sec: 98847662141
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 2048
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "60c70a6d99cce4e304e50c40c8f99fce"
-        network_throughput_bytes_per_sec: 378611362482
+        network_throughput_bytes_per_sec: 253968253
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 65536
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "35882d22990344fadfe4e45b8e2721eb"
-        network_throughput_bytes_per_sec: 478822324015
+        network_throughput_bytes_per_sec: 10014669926
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 33554432
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b5ef73f8707a38b85661790207aa156e"
-        network_throughput_bytes_per_sec: 704488436788
+        network_throughput_bytes_per_sec: 454420801733
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-gather"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -13677,27 +17225,20 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c8859ac88de21b0d40acf7b94c89a34e"
-        network_throughput_bytes_per_sec: 757137293394
+        network_throughput_bytes_per_sec: 90523338
       }
       entries {
         instruction {
-          name: "_"
           opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 16777216
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 4
@@ -13707,148 +17248,134 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "d8c02d04f293873b5f6c56e662530193"
-        network_throughput_bytes_per_sec: 883755583649
+        network_throughput_bytes_per_sec: 416680309954
       }
       entries {
         instruction {
-          name: "_"
           opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 67108864
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
           use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "c04d9b7c7ac56f98fcf87ee9c131ab68"
-        network_throughput_bytes_per_sec: 733454255330
+        network_throughput_bytes_per_sec: 18265328874
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          opcode: "all-to-all"
           shape {
             element_type: F32
-            dimensions: 134217728
+            dimensions: 512
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
-          frontend_attributes {}
-          use_global_device_ids: true
-          statistics_viz {}
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 2
-              num_devices_per_group: 4
+              num_replica_groups: 4
+              num_devices_per_group: 2
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "ea64b110db7c46aafbb394dd547e9e23"
-        network_throughput_bytes_per_sec: 804836343575
+        network_throughput_bytes_per_sec: 66528066
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "reduce-scatter"
+          name: "collective-permute"
+          opcode: "collective-permute"
           shape {
             element_type: F32
-            dimensions: 268435456
+            dimensions: 134217728
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
           metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 4
-          operand_ids: 3
-          called_computation_ids: 2
+          id: 4294967299
+          operand_ids: 4294967298
+          source_target_pairs { target: 4 }
+          source_target_pairs { source: 1 target: 5 }
+          source_target_pairs { source: 2 target: 6 }
+          source_target_pairs { source: 3 target: 7 }
           frontend_attributes {}
-          use_global_device_ids: true
           statistics_viz {}
+        }
+        fingerprint: "00842feada0344771c1b4e414c197917"
+        network_throughput_bytes_per_sec: 269253988123
+      }
+      entries {
+        instruction {
+          opcode: "all-gather"
+          shape {
+            element_type: F32
+            dimensions: 16777216
+            layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
+            is_dynamic_dimension: false
+          }
+          dimensions: 0
+          channel_id: 1
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "7137b6e88084dce1309b8c65093ae1ff"
-        network_throughput_bytes_per_sec: 938598637743
+        network_throughput_bytes_per_sec: 342671895424
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-gather"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 32768
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 1
-              num_devices_per_group: 8
+              num_replica_groups: 2
+              num_devices_per_group: 4
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "8370a1824c7b75672651c80e67bfcc33"
-        network_throughput_bytes_per_sec: 671948734380
+        network_throughput_bytes_per_sec: 7262411347
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "all-reduce"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 4096
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
-          dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
               num_replica_groups: 2
@@ -13858,37 +17385,30 @@ constexpr char kDefaultCollectivePTable[] = R"pb(
             }
           }
         }
-        fingerprint: "c092460e68acec60d687e3ff6c5a6674"
-        network_throughput_bytes_per_sec: 788440058273
+        network_throughput_bytes_per_sec: 519796954
       }
       entries {
         instruction {
-          name: "_"
-          opcode: "all-to-all"
+          opcode: "reduce-scatter"
           shape {
             element_type: F32
-            dimensions: 536870912
+            dimensions: 262144
             layout { minor_to_major: 0 tail_padding_alignment_in_elements: 1 }
             is_dynamic_dimension: false
           }
-          metadata {}
           dimensions: 0
           channel_id: 1
-          id: 1
-          operand_ids: 0
-          frontend_attributes {}
-          statistics_viz {}
+          use_global_device_ids: true
           collective_device_list {
             iota_replica_group_list {
-              num_replica_groups: 4
-              num_devices_per_group: 2
+              num_replica_groups: 1
+              num_devices_per_group: 8
               iota_reshape_dims: 8
               iota_transpose_perm: 0
             }
           }
         }
-        fingerprint: "b9ca069b3d1f1eeefa7a6ecf54baacae"
-        network_throughput_bytes_per_sec: 848534088610
+        network_throughput_bytes_per_sec: 141470048569
       }
     }
   }
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index f6dfc5b93ba278..56acaa4060e343 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -795,12 +795,21 @@ cc_library(
         ":collective_perf_table_gen",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
+        "//xla/service/gpu/model:collective_interpolator_data",
         "//xla/service/gpu/model:hlo_op_profile_proto_cc",
+        "//xla/tsl/platform:env",
+        "//xla/tsl/platform:errors",
         "//xla/tsl/util:command_line_flags",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
+        "@com_google_protobuf//:protobuf",
+        "@local_tsl//tsl/platform:env",
+        "@local_tsl//tsl/platform:errors",
+        "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:platform_port",
     ] + if_cuda([
         "//xla/service:gpu_plugin",
diff --git a/third_party/xla/xla/tools/collective_perf_table_gen_main.cc b/third_party/xla/xla/tools/collective_perf_table_gen_main.cc
index 7f8ca1015cdba0..2ec4b59813245e 100644
--- a/third_party/xla/xla/tools/collective_perf_table_gen_main.cc
+++ b/third_party/xla/xla/tools/collective_perf_table_gen_main.cc
@@ -18,23 +18,32 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <iostream>
+#include <iterator>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/status/status.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
+#include "google/protobuf/text_format.h"
+#include "xla/service/gpu/model/collective_interpolator_data.h"
 #include "xla/service/gpu/model/hlo_op_profile.pb.h"
 #include "xla/tools/collective_perf_table_gen.h"
+#include "xla/tsl/platform/env.h"
+#include "xla/tsl/platform/errors.h"
 #include "xla/tsl/util/command_line_flags.h"
 #include "tsl/platform/init_main.h"
+#include "tsl/platform/path.h"
 
 namespace {
 
@@ -72,6 +81,24 @@ to 4 GPUs.
   (--tensor_size_bytes_spec)
 * AllReduce will run across all 8 devices.
   (--collective_devices_spec, HloShardingV2 format)
+
+This tool can also merge new profiles (either generated or loaded using
+--merge or --merge_path) into the static performance table defined in
+a C++ header file, e.g. `collective_interpolator_data.h`.
+Use `--update_header_path` to specify header file to update in-place.
+
+Example for generating COLLECTIVE_PERMUTE profiles:
+  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bazel run --config=cuda  -- \
+    tools:collective_perf_table_gen_main  \
+    --num_nodes=1 --task_id=0 --collectives=COLLECTIVE_PERMUTE \
+    --collective_devices_spec='[1,8]<=[8]' \
+    --tensor_size_bytes_spec='start=1024,stop=2147483648,factor=2' \
+
+Example for merging profiles from `cp_perf_table.pbtxt` into
+`service/gpu/model/collective_interpolator_data.h`:
+  bazel run tools:collective_perf_table_gen_main -- \
+    --merge=cp_perf_table.pbtxt \
+    --update_header_path=service/gpu/model/collective_interpolator_data.h
 )";
 
 constexpr absl::string_view kDefaultCoordinatorAddress = "127.0.0.1:1234";
@@ -161,6 +188,93 @@ std::string DefaultCollectiveDevicesIfEmpty(
   return collective_devices_spec_unparsed;
 }
 
+// Helper to get full path if running under bazel run.
+std::string GetFullPath(const std::string& path) {
+  if (tsl::io::IsAbsolutePath(path)) {
+    return path;
+  }
+  const char* build_workspace_dir = getenv("BUILD_WORKSPACE_DIRECTORY");
+  if (build_workspace_dir != nullptr) {
+    return tsl::io::JoinPath(build_workspace_dir, path);
+  }
+  return path;  // Fallback to relative path if not in bazel run
+}
+
+// Helper to inject proto string into header by replacing content between
+// R"pb( and )pb tags.
+// Note: this function assumes there is only one R"pb(...)pb" block in the
+// header file, and finds the first opening tag R"pb(\n and last closing tag
+// \n)pb".
+std::string InjectProtoToString(const std::string& header_content,
+                                const std::string& new_proto_string) {
+  const std::string start_str = "R\"pb(\n";
+  const std::string end_str = "\n)pb";
+  size_t start = header_content.find(start_str);
+  size_t end = header_content.rfind(end_str);
+  CHECK(start != std::string::npos && end != std::string::npos);
+  start += start_str.length();
+
+  std::string result = header_content.substr(0, start);
+  result += new_proto_string;
+  result += header_content.substr(end);
+  return result;
+}
+
+absl::Status UpdateHeader(const DeviceHloInstructionProfiles& new_profiles,
+                          const std::string& header_path_flag,
+                          CollectivePerfTableGen* gen) {
+  std::string header_path = GetFullPath(header_path_flag);
+
+  // 1. Parse kDefaultCollectivePTable to get current profiles
+  DeviceHloInstructionProfiles current_profiles_proto;
+  CHECK(tsl::protobuf::TextFormat::ParseFromString(kDefaultCollectivePTable,
+                                                   &current_profiles_proto));
+  std::string current_profiles_pbtxt;
+  tsl::protobuf::TextFormat::PrintToString(current_profiles_proto,
+                                           &current_profiles_pbtxt);
+
+  // 2. Save current profiles to temp file
+  std::string temp_file_current =
+      tsl::io::JoinPath("/tmp", "xla_gpu_perf_merge_current.pbtxt");
+  TF_RETURN_IF_ERROR(tsl::WriteStringToFile(
+      tsl::Env::Default(), temp_file_current, current_profiles_pbtxt));
+
+  // 3. Save new profiles to temp file
+  std::string new_profiles_pbtxt;
+  tsl::protobuf::TextFormat::PrintToString(new_profiles, &new_profiles_pbtxt);
+  std::string temp_file_new =
+      tsl::io::JoinPath("/tmp", "xla_gpu_perf_merge_new.pbtxt");
+  TF_RETURN_IF_ERROR(tsl::WriteStringToFile(tsl::Env::Default(), temp_file_new,
+                                            new_profiles_pbtxt));
+
+  // 4. Merge
+  std::vector<std::string> files_to_merge = {temp_file_current, temp_file_new};
+  DeviceHloInstructionProfiles merged_profiles = gen->Merge(files_to_merge);
+
+  // 5. Format as text
+  tsl::protobuf::TextFormat::Printer printer;
+  printer.SetInitialIndentLevel(1);
+  std::string merged_profiles_pbtxt;
+  printer.PrintToString(merged_profiles, &merged_profiles_pbtxt);
+  // The printer might add a trailing newline which we don't want inside
+  // R"pb(...)pb" to avoid unnecessary ClangTidy warnings.
+  if (!merged_profiles_pbtxt.empty() && merged_profiles_pbtxt.back() == '\n') {
+    merged_profiles_pbtxt.pop_back();
+  }
+
+  // 6. Update header
+  std::string header_content;
+  TF_RETURN_IF_ERROR(
+      tsl::ReadFileToString(tsl::Env::Default(), header_path, &header_content));
+  std::string new_header_content =
+      InjectProtoToString(header_content, merged_profiles_pbtxt);
+  TF_RETURN_IF_ERROR(tsl::WriteStringToFile(tsl::Env::Default(), header_path,
+                                            new_header_content));
+
+  LOG(INFO) << "Successfully merged profiles into " << header_path_flag;
+  return absl::OkStatus();
+}
+
 }  // namespace
 
 int main(int argc, char* argv[]) {
@@ -178,6 +292,7 @@ int main(int argc, char* argv[]) {
   std::string output = std::string(CollectivePerfTableGen::Config::kStdout);
   std::string merge_path;
   std::vector<std::string> merge_files;
+  std::string update_header_path;
 
   // Parse flags.
   std::vector<tsl::Flag> flag_list = {
@@ -219,6 +334,9 @@ int main(int argc, char* argv[]) {
           "none",
           "Path to individual DeviceHloInstructionProfiles files. If "
           "specified, these files will be merged into a single one."),
+      tsl::Flag(
+          "update_header_path", &update_header_path,
+          "Path to C++ header file to update in-place with new profiles."),
   };
 
   std::string kUsageString =
@@ -248,10 +366,22 @@ int main(int argc, char* argv[]) {
   if (!merge_path.empty()) {
     profiles = gen->Merge(merge_path);
   } else if (!merge_files.empty()) {
-    profiles = gen->Merge(merge_files);
+    std::vector<std::string> full_path_merge_files;
+    full_path_merge_files.reserve(merge_files.size());
+    absl::c_transform(merge_files, std::back_inserter(full_path_merge_files),
+                      GetFullPath);
+    profiles = gen->Merge(full_path_merge_files);
   } else {
     profiles = gen->ComputeTable();
   }
+
+  if (!update_header_path.empty()) {
+    CHECK_OK(UpdateHeader(profiles, update_header_path, gen.get()));
+    if (output == CollectivePerfTableGen::Config::kStdout) {
+      return 0;  // If header is updated, avoid printing to stdout.
+    }
+  }
+
   CHECK_OK(gen->Dump(profiles));
   return 0;
 }

From ccd3917bab768c842f017a131b222788f6b9c710 Mon Sep 17 00:00:00 2001
From: Kevin Gleason <gleasonk@google.com>
Date: Thu, 11 Dec 2025 15:04:42 -0800
Subject: [PATCH 192/753] [StableHLO optim] Fix issue in compare folder

PiperOrigin-RevId: 843390028
---
 .../xla/third_party/stablehlo/temporary.patch | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index ca34dee010d16d..f87d4742f36878 100755
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -703,6 +703,31 @@ diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir b/stablehlo/
    func.return %0 : tensor<16x16xf32>
  }
  
+diff --ruN a/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir b/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
+--- stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
++++ stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
+@@ -218,6 +218,21 @@
+   // CHECK-NEXT: return [[TRUE]], [[FALSE]], [[TRUE]], [[TRUE]], [[TRUE]], [[FALSE]], [[TRUE]], [[FALSE]]
+   return %0, %1, %2, %3, %4, %5, %6, %7 :
+          tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>
++}
++
++// -----
++
++// CHECK-LABEL: func.func @compare_fold_with_implicit_comparison_type
++func.func @compare_fold_with_implicit_comparison_type() -> (tensor<3xi1>, tensor<3xi1>) {
++  %c_0 = stablehlo.constant dense<0> : tensor<3xi64>
++  %c = stablehlo.constant dense<[-1, 0, 1]> : tensor<3xi64>
++  %c_1 = stablehlo.constant dense<0.0> : tensor<3xf64>
++  %c_2 = stablehlo.constant dense<[-1.0, 0.0, 1.0]> : tensor<3xf64>
++  %0 = stablehlo.compare GE, %c, %c_0 : (tensor<3xi64>, tensor<3xi64>) -> tensor<3xi1>
++  %1 = stablehlo.compare GE, %c_2, %c_1 : (tensor<3xf64>, tensor<3xf64>) -> tensor<3xi1>
++  // CHECK-DAG:  [[RES:%.+]] = stablehlo.constant dense<[false, true, true]> : tensor<3xi1>
++  // CHECK-NEXT: return [[RES]], [[RES]] : tensor<3xi1>, tensor<3xi1>
++  return %0, %1 : tensor<3xi1>, tensor<3xi1>
+ }
+ 
+ // -----
 diff --ruN a/stablehlo/stablehlo/transforms/CMakeLists.txt b/stablehlo/stablehlo/transforms/CMakeLists.txt
 --- stablehlo/stablehlo/transforms/CMakeLists.txt
 +++ stablehlo/stablehlo/transforms/CMakeLists.txt
@@ -961,4 +986,72 @@ diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h b/stabl
  // Returns the common shape these ops would broadcast to, or an error if the
  // ops are not broadcastable.
  FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
+diff --ruN a/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp b/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
+--- stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
++++ stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
+@@ -701,22 +701,50 @@
+     if (failed(validateShapeFoldDtype(rewriter, op, resultType)))
+       return failure();
+ 
++    ComparisonType comparisonType = getComparisonType(op);
++    if (comparisonType == ComparisonType::NOTYPE)
++      return rewriter.notifyMatchFailure(
++          op, "Could not determine comparison type.");
++
++    LLVM_DEBUG(llvm::dbgs() << "comparisonType: " << comparisonType << "\n");
++
+     auto res = foldBinaryOpIntOrFloat<FoldCompare, IntegerAttr, IntegerAttr>(
+-        rewriter, op,
+-        FoldCompare(op.getComparisonDirection(), op.getCompareType()));
++        rewriter, op, FoldCompare(op.getComparisonDirection(), comparisonType));
+     if (failed(res)) return failure();
+     rewriter.replaceOpWithNewOp<mlir::stablehlo::ConstantOp>(op, res.value());
+     return success();
+   }
+ 
++  // Return the comparison type if set, else return the assumed comparison type
++  // according to the StableHLO spec.
++  ComparisonType getComparisonType(CompareOp op) const {
++    auto compareType = op.getCompareType();
++    if (compareType.has_value() &&
++        compareType.value() != ComparisonType::NOTYPE)
++      return *compareType;
++
++    Type elementType = op.getLhs().getType().getElementType();
++    if (elementType.isUnsignedInteger() || elementType.isSignlessInteger(1))
++      return ComparisonType::UNSIGNED;
++    if (elementType.isSignlessInteger())
++      return ComparisonType::SIGNED;
++    else if (elementType.isFloat() || mlir::isa<ComplexType>(elementType))
++      return ComparisonType::FLOAT;
++    else
++      return ComparisonType::NOTYPE;
++  }
++
+   struct FoldCompare {
+     FoldCompare(ComparisonDirection direction,
+-                std::optional<ComparisonType> kind)
++                ComparisonType kind)
+         : direction(direction), kind(kind) {}
+     ComparisonDirection direction;
+-    std::optional<ComparisonType> kind;
++    ComparisonType kind;
+ 
+     APInt operator()(APFloat lhs, APFloat rhs) {
++      if (kind != ComparisonType::FLOAT && kind != ComparisonType::TOTALORDER)
++        llvm::report_fatal_error("invalid float comparison");
++
+       bool result = false;
+       switch (direction) {
+         case ComparisonDirection::EQ:
+@@ -741,6 +769,9 @@
+       return APInt(/*bitwidth=*/1, result);
+     }
+     APInt operator()(APInt lhs, APInt rhs) {
++      if (kind != ComparisonType::UNSIGNED && kind != ComparisonType::SIGNED)
++        llvm::report_fatal_error("invalid integer comparison");
++
+       bool result = false;
+       switch (direction) {
+         case ComparisonDirection::EQ:
 

From 5694e42c9773c82f3fbe2d775e4b549d4003ed43 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Thu, 11 Dec 2025 15:06:04 -0800
Subject: [PATCH 193/753] Integrate LLVM at llvm/llvm-project@16c0893f04c0

Updates LLVM usage to match
[16c0893f04c0](https://github.com/llvm/llvm-project/commit/16c0893f04c0)

PiperOrigin-RevId: 843390697
---
 .../xla/third_party/llvm/workspace.bzl        |    4 +-
 .../xla/third_party/shardy/temporary.patch    | 1089 +----------------
 .../xla/third_party/shardy/workspace.bzl      |    4 +-
 3 files changed, 9 insertions(+), 1088 deletions(-)

diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index 3c9c005f2315d3..26b3bf8809ac7b 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "48d942c7158af43094db1b5e6c59c6e6fcf1b5aa"
-    LLVM_SHA256 = "6ce4ac276a4687625e9f57e53715285d99b60c6553e0cde4db9b7e74f2179f69"
+    LLVM_COMMIT = "16c0893f04c04faa8ac36495363344840f7c5db1"
+    LLVM_SHA256 = "3f786bc56ecb8fce511fe504f9b0848c12b5312beb7bded23edfc77272698b90"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 4caadcc3b73011..ee2078ba7263f1 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,1094 +1,15 @@
-diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
-index 2948da4..509398d 100644
---- a/third_party/llvm/generated.patch
-+++ b/third_party/llvm/generated.patch
-@@ -1,1074 +1 @@
- Auto generated patch. Do not edit or delete it, even if empty.
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp
----- a/clang-tools-extra/clangd/ClangdLSPServer.cpp
--+++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp
--@@ -554,8 +554,6 @@
--     if (const auto &Dir = Params.initializationOptions.compilationDatabasePath)
--       CDBOpts.CompileCommandsDir = Dir;
--     CDBOpts.ContextProvider = Opts.ContextProvider;
---    if (Opts.StrongWorkspaceMode)
---      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
--     BaseCDB =
--         std::make_unique<DirectoryBasedGlobalCompilationDatabase>(CDBOpts);
--   }
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h
----- a/clang-tools-extra/clangd/ClangdServer.h
--+++ b/clang-tools-extra/clangd/ClangdServer.h
--@@ -152,11 +152,6 @@
--     /// FIXME: If not set, should use the current working directory.
--     std::optional<std::string> WorkspaceRoot;
-- 
---    /// Sets an alternate mode of operation. Current effects are:
---    /// - Using the current working directory as the working directory for
---    ///   fallback commands
---    bool StrongWorkspaceMode;
---
--     /// The resource directory is used to find internal headers, overriding
--     /// defaults and -resource-dir compiler flag).
--     /// If std::nullopt, ClangdServer calls
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
----- a/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
--+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.cpp
--@@ -64,9 +64,7 @@
--   if (FileExtension.empty() || FileExtension == ".h")
--     Argv.push_back("-xobjective-c++-header");
--   Argv.push_back(std::string(File));
---  tooling::CompileCommand Cmd(FallbackWorkingDirectory
---                                  ? *FallbackWorkingDirectory
---                                  : llvm::sys::path::parent_path(File),
--+  tooling::CompileCommand Cmd(llvm::sys::path::parent_path(File),
--                               llvm::sys::path::filename(File), std::move(Argv),
--                               /*Output=*/"");
--   Cmd.Heuristic = "clangd fallback";
--@@ -351,8 +349,7 @@
-- 
-- DirectoryBasedGlobalCompilationDatabase::
--     DirectoryBasedGlobalCompilationDatabase(const Options &Opts)
---    : GlobalCompilationDatabase(Opts.FallbackWorkingDirectory), Opts(Opts),
---      Broadcaster(std::make_unique<BroadcastThread>(*this)) {
--+    : Opts(Opts), Broadcaster(std::make_unique<BroadcastThread>(*this)) {
--   if (!this->Opts.ContextProvider)
--     this->Opts.ContextProvider = [](llvm::StringRef) {
--       return Context::current().clone();
--@@ -463,21 +460,6 @@
--   return Result;
-- }
-- 
---void DirectoryBasedGlobalCompilationDatabase::Options::
---    applyFallbackWorkingDirectory(
---        std::optional<std::string> FallbackWorkingDirectory) {
---  if (FallbackWorkingDirectory)
---    this->FallbackWorkingDirectory = *FallbackWorkingDirectory;
---  else {
---    // Clangd is running in strong workspace mode but the client didn't
---    // specify a workspace path in the `initialize` request.
---    // Fallback to current working directory.
---    SmallString<256> CWD;
---    llvm::sys::fs::current_path(CWD);
---    this->FallbackWorkingDirectory = std::string(CWD);
---  }
---}
---
-- // The broadcast thread announces files with new compile commands to the world.
-- // Primarily this is used to enqueue them for background indexing.
-- //
--@@ -777,10 +759,9 @@
-- 
-- OverlayCDB::OverlayCDB(const GlobalCompilationDatabase *Base,
--                        std::vector<std::string> FallbackFlags,
---                       CommandMangler Mangler,
---                       std::optional<std::string> FallbackWorkingDirectory)
---    : DelegatingCDB(Base, FallbackWorkingDirectory),
---      Mangler(std::move(Mangler)), FallbackFlags(std::move(FallbackFlags)) {}
--+                       CommandMangler Mangler)
--+    : DelegatingCDB(Base), Mangler(std::move(Mangler)),
--+      FallbackFlags(std::move(FallbackFlags)) {}
-- 
-- std::optional<tooling::CompileCommand>
-- OverlayCDB::getCompileCommand(PathRef File) const {
--@@ -863,20 +844,16 @@
--   return MDB;
-- }
-- 
---DelegatingCDB::DelegatingCDB(
---    const GlobalCompilationDatabase *Base,
---    std::optional<std::string> FallbackWorkingDirectory)
---    : GlobalCompilationDatabase(FallbackWorkingDirectory), Base(Base) {
--+DelegatingCDB::DelegatingCDB(const GlobalCompilationDatabase *Base)
--+    : Base(Base) {
--   if (Base)
--     BaseChanged = Base->watch([this](const std::vector<std::string> Changes) {
--       OnCommandChanged.broadcast(Changes);
--     });
-- }
-- 
---DelegatingCDB::DelegatingCDB(
---    std::unique_ptr<GlobalCompilationDatabase> Base,
---    std::optional<std::string> FallbackWorkingDirectory)
---    : DelegatingCDB(Base.get(), FallbackWorkingDirectory) {
--+DelegatingCDB::DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base)
--+    : DelegatingCDB(Base.get()) {
--   BaseOwner = std::move(Base);
-- }
-- 
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/GlobalCompilationDatabase.h b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
----- a/clang-tools-extra/clangd/GlobalCompilationDatabase.h
--+++ b/clang-tools-extra/clangd/GlobalCompilationDatabase.h
--@@ -35,9 +35,6 @@
-- /// Provides compilation arguments used for parsing C and C++ files.
-- class GlobalCompilationDatabase {
-- public:
---  GlobalCompilationDatabase(
---      std::optional<std::string> FallbackWorkingDirectory = std::nullopt)
---      : FallbackWorkingDirectory(FallbackWorkingDirectory) {}
--   virtual ~GlobalCompilationDatabase() = default;
-- 
--   /// If there are any known-good commands for building this file, returns one.
--@@ -72,19 +69,14 @@
--   }
-- 
-- protected:
---  std::optional<std::string> FallbackWorkingDirectory;
--   mutable CommandChanged OnCommandChanged;
-- };
-- 
-- // Helper class for implementing GlobalCompilationDatabases that wrap others.
-- class DelegatingCDB : public GlobalCompilationDatabase {
-- public:
---  DelegatingCDB(
---      const GlobalCompilationDatabase *Base,
---      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
---  DelegatingCDB(
---      std::unique_ptr<GlobalCompilationDatabase> Base,
---      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
--+  DelegatingCDB(const GlobalCompilationDatabase *Base);
--+  DelegatingCDB(std::unique_ptr<GlobalCompilationDatabase> Base);
-- 
--   std::optional<tooling::CompileCommand>
--   getCompileCommand(PathRef File) const override;
--@@ -125,12 +117,6 @@
--     // Only look for a compilation database in this one fixed directory.
--     // FIXME: fold this into config/context mechanism.
--     std::optional<Path> CompileCommandsDir;
---    // Working directory for fallback commands
---    // If unset, parent directory of file should be used
---    std::optional<std::string> FallbackWorkingDirectory;
---
---    void applyFallbackWorkingDirectory(
---        std::optional<std::string> FallbackWorkingDirectory);
--   };
-- 
--   DirectoryBasedGlobalCompilationDatabase(const Options &Opts);
--@@ -208,11 +194,9 @@
--   // Base may be null, in which case no entries are inherited.
--   // FallbackFlags are added to the fallback compile command.
--   // Adjuster is applied to all commands, fallback or not.
---  OverlayCDB(
---      const GlobalCompilationDatabase *Base,
---      std::vector<std::string> FallbackFlags = {},
---      CommandMangler Mangler = nullptr,
---      std::optional<std::string> FallbackWorkingDirectory = std::nullopt);
--+  OverlayCDB(const GlobalCompilationDatabase *Base,
--+             std::vector<std::string> FallbackFlags = {},
--+             CommandMangler Mangler = nullptr);
-- 
--   std::optional<tooling::CompileCommand>
--   getCompileCommand(PathRef File) const override;
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/Check.cpp b/clang-tools-extra/clangd/tool/Check.cpp
----- a/clang-tools-extra/clangd/tool/Check.cpp
--+++ b/clang-tools-extra/clangd/tool/Check.cpp
--@@ -169,8 +169,6 @@
--   bool buildCommand(const ThreadsafeFS &TFS) {
--     log("Loading compilation database...");
--     DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
---    if (Opts.StrongWorkspaceMode)
---      CDBOpts.applyFallbackWorkingDirectory(Opts.WorkspaceRoot);
--     CDBOpts.CompileCommandsDir =
--         Config::current().CompileFlags.CDBSearch.FixedCDBPath;
--     BaseCDB =
--@@ -180,10 +178,8 @@
--         getSystemIncludeExtractor(llvm::ArrayRef(Opts.QueryDriverGlobs));
--     if (Opts.ResourceDir)
--       Mangler.ResourceDir = *Opts.ResourceDir;
---
--     CDB = std::make_unique<OverlayCDB>(
---        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler),
---        CDBOpts.FallbackWorkingDirectory);
--+        BaseCDB.get(), std::vector<std::string>{}, std::move(Mangler));
-- 
--     if (auto TrueCmd = CDB->getCompileCommand(File)) {
--       Cmd = std::move(*TrueCmd);
--@@ -506,7 +502,7 @@
--                  config::DiagnosticCallback Diag) const override {
--       config::Fragment F;
--       // If we're timing clang-tidy checks, implicitly disabling the slow ones
---      // is counterproductive!
--+      // is counterproductive! 
--       if (CheckTidyTime.getNumOccurrences())
--         F.Diagnostics.ClangTidy.FastCheckFilter.emplace("None");
--       return {std::move(F).compile(Diag)};
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp
----- a/clang-tools-extra/clangd/tool/ClangdMain.cpp
--+++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp
--@@ -500,17 +500,6 @@
--     init(true),
-- };
-- 
---opt<bool> StrongWorkspaceMode{
---    "strong-workspace-mode",
---    cat(Features),
---    desc("An alternate mode of operation for clangd, where the clangd instance "
---         "is used to edit a single workspace.\n"
---         "When enabled, fallback commands use the workspace directory as their "
---         "working directory instead of the parent folder."),
---    init(false),
---    Hidden,
---};
---
-- opt<bool> UseDirtyHeaders{"use-dirty-headers", cat(Misc),
--                           desc("Use files open in the editor when parsing "
--                                "headers instead of reading from the disk"),
--@@ -918,7 +907,6 @@
--   }
--   if (!ResourceDir.empty())
--     Opts.ResourceDir = ResourceDir;
---  Opts.StrongWorkspaceMode = StrongWorkspaceMode;
--   Opts.BuildDynamicSymbolIndex = true;
-- #if CLANGD_ENABLE_REMOTE
--   if (RemoteIndexAddress.empty() != ProjectRoot.empty()) {
--diff -ruN --strip-trailing-cr a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
----- a/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
--+++ b/clang-tools-extra/clangd/unittests/GlobalCompilationDatabaseTests.cpp
--@@ -55,20 +55,6 @@
--                                            testPath("foo/bar")));
-- }
-- 
---TEST(GlobalCompilationDatabaseTest, FallbackWorkingDirectory) {
---  MockFS TFS;
---  DirectoryBasedGlobalCompilationDatabase::Options CDBOpts(TFS);
---  CDBOpts.applyFallbackWorkingDirectory(testPath("foo"));
---  EXPECT_EQ(CDBOpts.FallbackWorkingDirectory, testPath("foo"));
---
---  DirectoryBasedGlobalCompilationDatabase DB(CDBOpts);
---  auto Cmd = DB.getFallbackCommand(testPath("foo/src/bar.cc"));
---  EXPECT_EQ(Cmd.Directory, testPath("foo"));
---  EXPECT_THAT(Cmd.CommandLine,
---              ElementsAre("clang", testPath("foo/src/bar.cc")));
---  EXPECT_EQ(Cmd.Output, "");
---}
---
-- static tooling::CompileCommand cmd(llvm::StringRef File, llvm::StringRef Arg) {
--   return tooling::CompileCommand(
--       testRoot(), File, {"clang", std::string(Arg), std::string(File)}, "");
--diff -ruN --strip-trailing-cr a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
----- a/llvm/lib/CodeGen/ShrinkWrap.cpp
--+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
--@@ -618,8 +618,6 @@
-- 
--   DenseSet<const MachineBasicBlock *> DirtyBBs;
--   for (MachineBasicBlock &MBB : MF) {
---    if (!MDT->isReachableFromEntry(&MBB))
---      continue;
--     if (MBB.isEHPad()) {
--       DirtyBBs.insert(&MBB);
--       continue;
--diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
----- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--@@ -708,53 +708,6 @@
--   return 2;
-- }
-- 
---bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
---                               const TargetInstrInfo &TII) {
---  for (MachineInstr &MI : MBB->terminators()) {
---    unsigned Opc = MI.getOpcode();
---    switch (Opc) {
---    case AArch64::CBZW:
---    case AArch64::CBZX:
---    case AArch64::TBZW:
---    case AArch64::TBZX:
---      // CBZ/TBZ with WZR/XZR -> unconditional B
---      if (MI.getOperand(0).getReg() == AArch64::WZR ||
---          MI.getOperand(0).getReg() == AArch64::XZR) {
---        DEBUG_WITH_TYPE("optimizeTerminators",
---                        dbgs() << "Removing always taken branch: " << MI);
---        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
---        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
---        for (auto *S : Succs)
---          if (S != Target)
---            MBB->removeSuccessor(S);
---        DebugLoc DL = MI.getDebugLoc();
---        while (MBB->rbegin() != &MI)
---          MBB->rbegin()->eraseFromParent();
---        MI.eraseFromParent();
---        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
---        return true;
---      }
---      break;
---    case AArch64::CBNZW:
---    case AArch64::CBNZX:
---    case AArch64::TBNZW:
---    case AArch64::TBNZX:
---      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
---      if (MI.getOperand(0).getReg() == AArch64::WZR ||
---          MI.getOperand(0).getReg() == AArch64::XZR) {
---        DEBUG_WITH_TYPE("optimizeTerminators",
---                        dbgs() << "Removing never taken branch: " << MI);
---        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
---        MI.getParent()->removeSuccessor(Target);
---        MI.eraseFromParent();
---        return true;
---      }
---      break;
---    }
---  }
---  return false;
---}
---
-- // Find the original register that VReg is copied from.
-- static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
--   while (Register::isVirtualRegister(VReg)) {
--diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
----- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--@@ -705,8 +705,6 @@
--                               unsigned *OutUnscaledOp = nullptr,
--                               int64_t *EmittableOffset = nullptr);
-- 
---bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
---
-- static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
-- 
-- static inline bool isCondBranchOpcode(int Opc) {
--diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
----- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
--+++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
--@@ -14,7 +14,6 @@
-- //===----------------------------------------------------------------------===//
-- 
-- #include "AArch64.h"
---#include "AArch64InstrInfo.h"
-- #include "llvm/CodeGen/MachineFunctionPass.h"
-- #include "llvm/CodeGen/MachineInstrBuilder.h"
-- #include "llvm/CodeGen/TargetInstrInfo.h"
--@@ -46,6 +45,51 @@
--                 "AArch64 Redundant Conditional Branch Elimination pass", false,
--                 false)
-- 
--+static bool optimizeTerminators(MachineBasicBlock *MBB,
--+                                const TargetInstrInfo &TII) {
--+  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
--+    unsigned Opc = MI.getOpcode();
--+    switch (Opc) {
--+    case AArch64::CBZW:
--+    case AArch64::CBZX:
--+    case AArch64::TBZW:
--+    case AArch64::TBZX:
--+      // CBZ/TBZ with WZR/XZR -> unconditional B
--+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
--+          MI.getOperand(0).getReg() == AArch64::XZR) {
--+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
--+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
--+        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
--+        for (auto *S : Succs)
--+          if (S != Target)
--+            MBB->removeSuccessor(S);
--+        DebugLoc DL = MI.getDebugLoc();
--+        while (MBB->rbegin() != &MI)
--+          MBB->rbegin()->eraseFromParent();
--+        MI.eraseFromParent();
--+        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
--+        return true;
--+      }
--+      break;
--+    case AArch64::CBNZW:
--+    case AArch64::CBNZX:
--+    case AArch64::TBNZW:
--+    case AArch64::TBNZX:
--+      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
--+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
--+          MI.getOperand(0).getReg() == AArch64::XZR) {
--+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
--+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
--+        MI.getParent()->removeSuccessor(Target);
--+        MI.eraseFromParent();
--+        return true;
--+      }
--+      break;
--+    }
--+  }
--+  return false;
--+}
--+
-- bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
--   if (skipFunction(MF.getFunction()))
--     return false;
--diff -ruN --strip-trailing-cr a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
----- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
--+++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
--@@ -50,7 +50,6 @@
-- //        to use WZR/XZR directly in some cases.
-- //===----------------------------------------------------------------------===//
-- #include "AArch64.h"
---#include "AArch64InstrInfo.h"
-- #include "llvm/ADT/SetVector.h"
-- #include "llvm/ADT/Statistic.h"
-- #include "llvm/ADT/iterator_range.h"
--@@ -476,7 +475,6 @@
--     return false;
--   TRI = MF.getSubtarget().getRegisterInfo();
--   MRI = &MF.getRegInfo();
---  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-- 
--   // Resize the clobbered and used register unit trackers.  We do this once per
--   // function.
--@@ -486,10 +484,8 @@
--   OptBBUsedRegs.init(*TRI);
-- 
--   bool Changed = false;
---  for (MachineBasicBlock &MBB : MF) {
---    Changed |= optimizeTerminators(&MBB, TII);
--+  for (MachineBasicBlock &MBB : MF)
--     Changed |= optimizeBlock(&MBB);
---  }
--   return Changed;
-- }
-- 
--diff -ruN --strip-trailing-cr a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
----- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--@@ -1827,12 +1827,8 @@
--     // profile info.
--     CostTooHigh =
--         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
---    if (CostTooHigh) {
---      // Mark runtime checks as never succeeding when they exceed the threshold.
---      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
---      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
--+    if (CostTooHigh)
--       return;
---    }
-- 
--     BasicBlock *LoopHeader = L->getHeader();
--     BasicBlock *Preheader = L->getLoopPreheader();
--diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
----- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
--+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
--@@ -735,15 +735,21 @@
-- ; ENABLE-NEXT:    .cfi_offset w29, -16
-- ; ENABLE-NEXT:    .cfi_offset w19, -24
-- ; ENABLE-NEXT:    .cfi_offset w20, -32
--+; ENABLE-NEXT:  ; %bb.1: ; %if.then
-- ; ENABLE-NEXT:    sub x19, sp, #16
-- ; ENABLE-NEXT:    mov sp, x19
-- ; ENABLE-NEXT:    mov w20, wzr
---; ENABLE-NEXT:  LBB10_1: ; %for.body
--+; ENABLE-NEXT:  LBB10_2: ; %for.body
-- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-- ; ENABLE-NEXT:    bl _something
-- ; ENABLE-NEXT:    add w20, w0, w20
-- ; ENABLE-NEXT:    str w20, [x19]
---; ENABLE-NEXT:    b LBB10_1
--+; ENABLE-NEXT:    b LBB10_2
--+; ENABLE-NEXT:  ; %bb.3: ; %if.end
--+; ENABLE-NEXT:    sub sp, x29, #16
--+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
--+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
--+; ENABLE-NEXT:    ret
-- ;
-- ; DISABLE-LABEL: infiniteloop:
-- ; DISABLE:       ; %bb.0: ; %entry
--@@ -755,15 +761,21 @@
-- ; DISABLE-NEXT:    .cfi_offset w29, -16
-- ; DISABLE-NEXT:    .cfi_offset w19, -24
-- ; DISABLE-NEXT:    .cfi_offset w20, -32
--+; DISABLE-NEXT:  ; %bb.1: ; %if.then
-- ; DISABLE-NEXT:    sub x19, sp, #16
-- ; DISABLE-NEXT:    mov sp, x19
-- ; DISABLE-NEXT:    mov w20, wzr
---; DISABLE-NEXT:  LBB10_1: ; %for.body
--+; DISABLE-NEXT:  LBB10_2: ; %for.body
-- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-- ; DISABLE-NEXT:    bl _something
-- ; DISABLE-NEXT:    add w20, w0, w20
-- ; DISABLE-NEXT:    str w20, [x19]
---; DISABLE-NEXT:    b LBB10_1
--+; DISABLE-NEXT:    b LBB10_2
--+; DISABLE-NEXT:  ; %bb.3: ; %if.end
--+; DISABLE-NEXT:    sub sp, x29, #16
--+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
--+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
--+; DISABLE-NEXT:    ret
-- entry:
--   br i1 undef, label %if.then, label %if.end
-- 
--@@ -794,10 +806,11 @@
-- ; ENABLE-NEXT:    .cfi_offset w29, -16
-- ; ENABLE-NEXT:    .cfi_offset w19, -24
-- ; ENABLE-NEXT:    .cfi_offset w20, -32
--+; ENABLE-NEXT:  ; %bb.1: ; %if.then
-- ; ENABLE-NEXT:    sub x8, sp, #16
-- ; ENABLE-NEXT:    mov sp, x8
-- ; ENABLE-NEXT:    mov w9, wzr
---; ENABLE-NEXT:  LBB11_1: ; %for.body
--+; ENABLE-NEXT:  LBB11_2: ; %for.body
-- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-- ; ENABLE-NEXT:    ; InlineAsm Start
-- ; ENABLE-NEXT:    mov x10, #0 ; =0x0
--@@ -808,7 +821,12 @@
-- ; ENABLE-NEXT:    ; InlineAsm Start
-- ; ENABLE-NEXT:    nop
-- ; ENABLE-NEXT:    ; InlineAsm End
---; ENABLE-NEXT:    b LBB11_1
--+; ENABLE-NEXT:    b LBB11_2
--+; ENABLE-NEXT:  ; %bb.3: ; %if.end
--+; ENABLE-NEXT:    sub sp, x29, #16
--+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
--+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
--+; ENABLE-NEXT:    ret
-- ;
-- ; DISABLE-LABEL: infiniteloop2:
-- ; DISABLE:       ; %bb.0: ; %entry
--@@ -820,10 +838,11 @@
-- ; DISABLE-NEXT:    .cfi_offset w29, -16
-- ; DISABLE-NEXT:    .cfi_offset w19, -24
-- ; DISABLE-NEXT:    .cfi_offset w20, -32
--+; DISABLE-NEXT:  ; %bb.1: ; %if.then
-- ; DISABLE-NEXT:    sub x8, sp, #16
-- ; DISABLE-NEXT:    mov sp, x8
-- ; DISABLE-NEXT:    mov w9, wzr
---; DISABLE-NEXT:  LBB11_1: ; %for.body
--+; DISABLE-NEXT:  LBB11_2: ; %for.body
-- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-- ; DISABLE-NEXT:    ; InlineAsm Start
-- ; DISABLE-NEXT:    mov x10, #0 ; =0x0
--@@ -834,7 +853,12 @@
-- ; DISABLE-NEXT:    ; InlineAsm Start
-- ; DISABLE-NEXT:    nop
-- ; DISABLE-NEXT:    ; InlineAsm End
---; DISABLE-NEXT:    b LBB11_1
--+; DISABLE-NEXT:    b LBB11_2
--+; DISABLE-NEXT:  ; %bb.3: ; %if.end
--+; DISABLE-NEXT:    sub sp, x29, #16
--+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
--+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
--+; DISABLE-NEXT:    ret
-- entry:
--   br i1 undef, label %if.then, label %if.end
-- 
--@@ -865,43 +889,49 @@
-- define void @infiniteloop3() {
-- ; ENABLE-LABEL: infiniteloop3:
-- ; ENABLE:       ; %bb.0: ; %entry
--+; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
-- ; ENABLE-NEXT:    mov x8, xzr
-- ; ENABLE-NEXT:    mov x9, xzr
-- ; ENABLE-NEXT:    mov x11, xzr
---; ENABLE-NEXT:    b LBB12_2
---; ENABLE-NEXT:  LBB12_1: ; %loop2b
---; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
--+; ENABLE-NEXT:    b LBB12_3
--+; ENABLE-NEXT:  LBB12_2: ; %loop2b
--+; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
-- ; ENABLE-NEXT:    str x10, [x11]
-- ; ENABLE-NEXT:    mov x11, x10
---; ENABLE-NEXT:  LBB12_2: ; %loop1
--+; ENABLE-NEXT:  LBB12_3: ; %loop1
-- ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-- ; ENABLE-NEXT:    mov x10, x9
-- ; ENABLE-NEXT:    ldr x9, [x8]
---; ENABLE-NEXT:    cbnz x8, LBB12_1
---; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
--+; ENABLE-NEXT:    cbnz x8, LBB12_2
--+; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
-- ; ENABLE-NEXT:    mov x8, x10
-- ; ENABLE-NEXT:    mov x11, x10
---; ENABLE-NEXT:    b LBB12_2
--+; ENABLE-NEXT:    b LBB12_3
--+; ENABLE-NEXT:  ; %bb.5: ; %end
--+; ENABLE-NEXT:    ret
-- ;
-- ; DISABLE-LABEL: infiniteloop3:
-- ; DISABLE:       ; %bb.0: ; %entry
--+; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
-- ; DISABLE-NEXT:    mov x8, xzr
-- ; DISABLE-NEXT:    mov x9, xzr
-- ; DISABLE-NEXT:    mov x11, xzr
---; DISABLE-NEXT:    b LBB12_2
---; DISABLE-NEXT:  LBB12_1: ; %loop2b
---; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
--+; DISABLE-NEXT:    b LBB12_3
--+; DISABLE-NEXT:  LBB12_2: ; %loop2b
--+; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
-- ; DISABLE-NEXT:    str x10, [x11]
-- ; DISABLE-NEXT:    mov x11, x10
---; DISABLE-NEXT:  LBB12_2: ; %loop1
--+; DISABLE-NEXT:  LBB12_3: ; %loop1
-- ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
-- ; DISABLE-NEXT:    mov x10, x9
-- ; DISABLE-NEXT:    ldr x9, [x8]
---; DISABLE-NEXT:    cbnz x8, LBB12_1
---; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
--+; DISABLE-NEXT:    cbnz x8, LBB12_2
--+; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
-- ; DISABLE-NEXT:    mov x8, x10
-- ; DISABLE-NEXT:    mov x11, x10
---; DISABLE-NEXT:    b LBB12_2
--+; DISABLE-NEXT:    b LBB12_3
--+; DISABLE-NEXT:  ; %bb.5: ; %end
--+; DISABLE-NEXT:    ret
-- entry:
--   br i1 undef, label %loop2a, label %body
-- 
--diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
----- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
--+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
--@@ -8,14 +8,20 @@
-- define i8 @foo_optsize(i32 %v4) optsize {
-- ; CHECK-LABEL: foo_optsize:
-- ; CHECK:       // %bb.0: // %entry
---; CHECK-NEXT:    cbnz w0, .LBB0_2
---; CHECK-NEXT:  // %bb.1: // %b2
---; CHECK-NEXT:    mov w0, #1 // =0x1
--+; CHECK-NEXT:    b .LBB0_2
--+; CHECK-NEXT:  .LBB0_1:
--+; CHECK-NEXT:    mov w0, wzr
-- ; CHECK-NEXT:    ret
-- ; CHECK-NEXT:  .LBB0_2: // %b1
---; CHECK-NEXT:    cmp w0, #1
---; CHECK-NEXT:    mov w0, wzr
--+; CHECK-NEXT:    cbnz w0, .LBB0_4
--+; CHECK-NEXT:  // %bb.3: // %b2
--+; CHECK-NEXT:    mov w0, #1 // =0x1
-- ; CHECK-NEXT:    ret
--+; CHECK-NEXT:  .LBB0_4: // %b1
--+; CHECK-NEXT:    cmp w0, #1
--+; CHECK-NEXT:    b.ne .LBB0_1
--+; CHECK-NEXT:  // %bb.5: // %b3
--+; CHECK-NEXT:    b .LBB0_1
-- entry:
--   %v2 = icmp eq i32 0, 0
--   br i1 %v2, label %b1, label %b4
--@@ -41,14 +47,20 @@
-- define i8 @foo_optspeed(i32 %v4) {
-- ; CHECK-LABEL: foo_optspeed:
-- ; CHECK:       // %bb.0: // %entry
---; CHECK-NEXT:    cbnz w0, .LBB1_2
---; CHECK-NEXT:  // %bb.1: // %b2
---; CHECK-NEXT:    mov w0, #1 // =0x1
--+; CHECK-NEXT:    b .LBB1_2
--+; CHECK-NEXT:  .LBB1_1:
--+; CHECK-NEXT:    mov w0, wzr
-- ; CHECK-NEXT:    ret
-- ; CHECK-NEXT:  .LBB1_2: // %b1
---; CHECK-NEXT:    cmp w0, #1
---; CHECK-NEXT:    mov w0, wzr
--+; CHECK-NEXT:    cbnz w0, .LBB1_4
--+; CHECK-NEXT:  // %bb.3: // %b2
--+; CHECK-NEXT:    mov w0, #1 // =0x1
-- ; CHECK-NEXT:    ret
--+; CHECK-NEXT:  .LBB1_4: // %b1
--+; CHECK-NEXT:    cmp w0, #1
--+; CHECK-NEXT:    b.ne .LBB1_1
--+; CHECK-NEXT:  // %bb.5: // %b3
--+; CHECK-NEXT:    b .LBB1_1
-- entry:
--   %v2 = icmp eq i32 0, 0
--   br i1 %v2, label %b1, label %b4
--diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
----- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
--+++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
--@@ -21,8 +21,10 @@
--   ; CHECK-NEXT:   B %bb.3
--   ; CHECK-NEXT: {{  $}}
--   ; CHECK-NEXT: bb.1.bb:
--+  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
--   ; CHECK-NEXT:   liveins: $w0, $lr
--   ; CHECK-NEXT: {{  $}}
--+  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
--   ; CHECK-NEXT:   B %bb.2
--   ; CHECK-NEXT: {{  $}}
--   ; CHECK-NEXT: bb.2.bb1:
--diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
----- a/llvm/test/CodeGen/AArch64/pr164181.ll
--+++ b/llvm/test/CodeGen/AArch64/pr164181.ll
--@@ -29,11 +29,11 @@
-- ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
-- ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
-- ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
---; CHECK-NEXT:    tbz w5, #0, .LBB0_40
--+; CHECK-NEXT:    tbz w5, #0, .LBB0_43
-- ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
-- ; CHECK-NEXT:    ldr x4, [sp, #312]
-- ; CHECK-NEXT:    ldr x14, [sp, #280]
---; CHECK-NEXT:    tbz w0, #0, .LBB0_39
--+; CHECK-NEXT:    tbz w0, #0, .LBB0_42
-- ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
-- ; CHECK-NEXT:    ldrb w8, [sp, #368]
-- ; CHECK-NEXT:    ldrb w12, [sp, #256]
--@@ -92,7 +92,7 @@
-- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
-- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
---; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
--+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-- ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
-- ; CHECK-NEXT:    mov x12, x24
-- ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
--@@ -117,7 +117,7 @@
-- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
-- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
---; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
--+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-- ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
-- ; CHECK-NEXT:    cmn x24, #30
-- ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
--@@ -142,7 +142,7 @@
-- ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
-- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
---; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
--+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-- ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
-- ; CHECK-NEXT:    mov w14, #1152 // =0x480
-- ; CHECK-NEXT:    mov w24, #1 // =0x1
--@@ -176,7 +176,7 @@
-- ; CHECK-NEXT:    // => This Loop Header: Depth=4
-- ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
-- ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
---; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
--+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
-- ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
-- ; CHECK-NEXT:    and w8, w8, w8, asr #31
-- ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
--@@ -281,23 +281,31 @@
-- ; CHECK-NEXT:    mov x24, xzr
-- ; CHECK-NEXT:    mul w12, w12, w22
-- ; CHECK-NEXT:    mov x22, x5
---; CHECK-NEXT:    tbz w0, #0, .LBB0_33
---; CHECK-NEXT:  .LBB0_28: // %if.then222.us
--+; CHECK-NEXT:    tbz w0, #0, .LBB0_36
--+; CHECK-NEXT:  .LBB0_28: // %for.body194.us
-- ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
-- ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
-- ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
-- ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
-- ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
--+; CHECK-NEXT:  // %bb.29: // %if.then222.us
--+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-- ; CHECK-NEXT:    adrp x27, :got:var_32
-- ; CHECK-NEXT:    ldur w8, [x19, #-12]
-- ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
-- ; CHECK-NEXT:    strh w8, [x27]
-- ; CHECK-NEXT:    sxtb w8, w25
---; CHECK-NEXT:    strb w3, [x16]
-- ; CHECK-NEXT:    bic w25, w8, w8, asr #31
--+; CHECK-NEXT:    b .LBB0_31
--+; CHECK-NEXT:    .p2align 5, , 16
--+; CHECK-NEXT:  // %bb.30:
--+; CHECK-NEXT:    mov w25, wzr
--+; CHECK-NEXT:  .LBB0_31: // %if.end239.us
--+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
--+; CHECK-NEXT:    strb w3, [x16]
-- ; CHECK-NEXT:    tst w13, #0xff
---; CHECK-NEXT:    b.eq .LBB0_30
---; CHECK-NEXT:  // %bb.29: // %if.then254.us
--+; CHECK-NEXT:    b.eq .LBB0_33
--+; CHECK-NEXT:  // %bb.32: // %if.then254.us
-- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-- ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
-- ; CHECK-NEXT:    adrp x27, :got:var_35
--@@ -306,7 +314,7 @@
-- ; CHECK-NEXT:    csel x8, xzr, x7, eq
-- ; CHECK-NEXT:    str x8, [x27]
-- ; CHECK-NEXT:    strh w1, [x17]
---; CHECK-NEXT:  .LBB0_30: // %if.end282.us
--+; CHECK-NEXT:  .LBB0_33: // %if.end282.us
-- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-- ; CHECK-NEXT:    orr x27, x24, x4
-- ; CHECK-NEXT:    adrp x8, :got:var_39
--@@ -317,14 +325,14 @@
-- ; CHECK-NEXT:    str x8, [x18]
-- ; CHECK-NEXT:    mov w8, #1 // =0x1
-- ; CHECK-NEXT:    cbnz x2, .LBB0_27
---; CHECK-NEXT:  // %bb.31: // %if.then327.us
--+; CHECK-NEXT:  // %bb.34: // %if.then327.us
-- ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
-- ; CHECK-NEXT:    cbz w8, .LBB0_25
---; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
--+; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
-- ; CHECK-NEXT:    mov w4, wzr
-- ; CHECK-NEXT:    b .LBB0_26
-- ; CHECK-NEXT:    .p2align 5, , 16
---; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
--+; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
-- ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
-- ; CHECK-NEXT:    mov w3, #1152 // =0x480
-- ; CHECK-NEXT:    mov x22, xzr
--@@ -335,24 +343,24 @@
-- ; CHECK-NEXT:    madd x14, x14, x3, x11
-- ; CHECK-NEXT:    mov w28, w30
-- ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
---; CHECK-NEXT:    b .LBB0_36
--+; CHECK-NEXT:    b .LBB0_39
-- ; CHECK-NEXT:    .p2align 5, , 16
---; CHECK-NEXT:  .LBB0_34: // %if.then466.us
---; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
--+; CHECK-NEXT:  .LBB0_37: // %if.then466.us
--+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
-- ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
-- ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
-- ; CHECK-NEXT:    sxtb w4, w4
-- ; CHECK-NEXT:    bic w4, w4, w4, asr #31
-- ; CHECK-NEXT:    str x3, [x28]
-- ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
---; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
---; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
--+; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
--+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
-- ; CHECK-NEXT:    add x22, x22, #1
-- ; CHECK-NEXT:    add x27, x27, #1
-- ; CHECK-NEXT:    mov w28, wzr
-- ; CHECK-NEXT:    cmp x27, #0
-- ; CHECK-NEXT:    b.hs .LBB0_9
---; CHECK-NEXT:  .LBB0_36: // %for.body380.us
--+; CHECK-NEXT:  .LBB0_39: // %for.body380.us
-- ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
-- ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
-- ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
--@@ -364,18 +372,18 @@
-- ; CHECK-NEXT:    strh w28, [x11]
-- ; CHECK-NEXT:    csel w28, w21, w3, ne
-- ; CHECK-NEXT:    str w28, [x20]
---; CHECK-NEXT:    cbz x15, .LBB0_35
---; CHECK-NEXT:  // %bb.37: // %if.then436.us
---; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
--+; CHECK-NEXT:    cbz x15, .LBB0_38
--+; CHECK-NEXT:  // %bb.40: // %if.then436.us
--+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
-- ; CHECK-NEXT:    ldrh w28, [x14]
---; CHECK-NEXT:    cbnz w28, .LBB0_34
---; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
--+; CHECK-NEXT:    cbnz w28, .LBB0_37
--+; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
-- ; CHECK-NEXT:    mov w4, wzr
---; CHECK-NEXT:    b .LBB0_35
---; CHECK-NEXT:  .LBB0_39: // %for.body41
--+; CHECK-NEXT:    b .LBB0_38
--+; CHECK-NEXT:  .LBB0_42: // %for.body41
-- ; CHECK-NEXT:    strb wzr, [x4]
-- ; CHECK-NEXT:    strb wzr, [x14]
---; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
--+; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
-- ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
-- ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
-- ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
--diff -ruN --strip-trailing-cr a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
----- a/llvm/test/CodeGen/AArch64/pr166870.ll
--+++ b/llvm/test/CodeGen/AArch64/pr166870.ll
--@@ -26,11 +26,12 @@
-- ; CHECK-NEXT:    mov x21, x1
-- ; CHECK-NEXT:    bl baz
-- ; CHECK-NEXT:    mov w0, #0 // =0x0
--+; CHECK-NEXT:  // %bb.5: // %bb6
-- ; CHECK-NEXT:    mov w10, #1 // =0x1
--+; CHECK-NEXT:    cbnz w10, .LBB0_11
--+; CHECK-NEXT:  // %bb.6: // %bb7
-- ; CHECK-NEXT:    cbnz w10, .LBB0_10
---; CHECK-NEXT:  // %bb.5: // %bb7
---; CHECK-NEXT:    cbnz w10, .LBB0_9
---; CHECK-NEXT:  // %bb.6: // %bb8
--+; CHECK-NEXT:  // %bb.7: // %bb8
-- ; CHECK-NEXT:    mov x8, x21
-- ; CHECK-NEXT:    mov x9, x20
-- ; CHECK-NEXT:    mov w20, #0 // =0x0
--@@ -38,17 +39,17 @@
-- ; CHECK-NEXT:    mov x21, x9
-- ; CHECK-NEXT:    mov w8, w8
-- ; CHECK-NEXT:    mov x22, x8
---; CHECK-NEXT:  .LBB0_7: // %bb10
--+; CHECK-NEXT:  .LBB0_8: // %bb10
-- ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-- ; CHECK-NEXT:    strb w20, [x19]
---; CHECK-NEXT:    cbnz x21, .LBB0_7
---; CHECK-NEXT:  // %bb.8: // %bb12
---; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
--+; CHECK-NEXT:    cbnz x21, .LBB0_8
--+; CHECK-NEXT:  // %bb.9: // %bb12
--+; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
-- ; CHECK-NEXT:    bl snork
---; CHECK-NEXT:    cbnz x22, .LBB0_7
---; CHECK-NEXT:  .LBB0_9:
---; CHECK-NEXT:    mov w0, #0 // =0x0
--+; CHECK-NEXT:    cbnz x22, .LBB0_8
-- ; CHECK-NEXT:  .LBB0_10:
--+; CHECK-NEXT:    mov w0, #0 // =0x0
--+; CHECK-NEXT:  .LBB0_11:
-- ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-- ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-- ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
--diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
----- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
--+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
--@@ -71,21 +71,27 @@
-- ; CHECK-NEXT:    .cfi_def_cfa w29, 16
-- ; CHECK-NEXT:    .cfi_offset w30, -8
-- ; CHECK-NEXT:    .cfi_offset w29, -16
--+; CHECK-NEXT:    .cfi_remember_state
-- ; CHECK-NEXT:    mov w8, #1 // =0x1
---; CHECK-NEXT:    mov w9, #2 // =0x2
-- ; CHECK-NEXT:    stur xzr, [x29, #-8]
---; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
---; CHECK-NEXT:    ldur w8, [x29, #-8]
---; CHECK-NEXT:    cbz w8, .LBB0_2
--+; CHECK-NEXT:    b .LBB0_3
-- ; CHECK-NEXT:  // %bb.1:
---; CHECK-NEXT:    mov w8, #1 // =0x1
-- ; CHECK-NEXT:    str w8, [sp, #16]
---; CHECK-NEXT:    b .LBB0_3
--+; CHECK-NEXT:    ldur w8, [x29, #-8]
--+; CHECK-NEXT:    cbz w8, .LBB0_4
-- ; CHECK-NEXT:  .LBB0_2:
--+; CHECK-NEXT:    .cfi_restore_state
-- ; CHECK-NEXT:    mov w8, #1 // =0x1
---; CHECK-NEXT:    mov w9, #2 // =0x2
---; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--+; CHECK-NEXT:    str w8, [sp, #16]
--+; CHECK-NEXT:    b .LBB0_5
-- ; CHECK-NEXT:  .LBB0_3:
--+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--+; CHECK-NEXT:    ldur w8, [x29, #-8]
--+; CHECK-NEXT:    cbnz w8, .LBB0_2
--+; CHECK-NEXT:  .LBB0_4:
--+; CHECK-NEXT:    mov w8, #1 // =0x1
--+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--+; CHECK-NEXT:  .LBB0_5:
-- ; CHECK-NEXT:    mov w0, wzr
-- ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
-- ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
--@@ -128,6 +134,7 @@
-- ;
-- ; CHECK-LABEL: OUTLINED_FUNCTION_0:
-- ; CHECK:       // %bb.0:
--+; CHECK-NEXT:    mov w9, #2 // =0x2
-- ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
-- ; CHECK-NEXT:    mov w9, #3 // =0x3
-- ; CHECK-NEXT:    mov w8, #4 // =0x4
--diff -ruN --strip-trailing-cr a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
----- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
--+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
--@@ -12,21 +12,27 @@
-- ; CHECK-NEXT:    .cfi_def_cfa w29, 16
-- ; CHECK-NEXT:    .cfi_offset w30, -8
-- ; CHECK-NEXT:    .cfi_offset w29, -16
--+; CHECK-NEXT:    .cfi_remember_state
-- ; CHECK-NEXT:    mov w8, #1 // =0x1
---; CHECK-NEXT:    mov w9, #2 // =0x2
-- ; CHECK-NEXT:    stur xzr, [x29, #-8]
---; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
---; CHECK-NEXT:    ldur w8, [x29, #-8]
---; CHECK-NEXT:    cbz w8, .LBB0_2
--+; CHECK-NEXT:    b .LBB0_3
-- ; CHECK-NEXT:  // %bb.1:
---; CHECK-NEXT:    mov w8, #1 // =0x1
-- ; CHECK-NEXT:    str w8, [sp, #16]
---; CHECK-NEXT:    b .LBB0_3
--+; CHECK-NEXT:    ldur w8, [x29, #-8]
--+; CHECK-NEXT:    cbz w8, .LBB0_4
-- ; CHECK-NEXT:  .LBB0_2:
--+; CHECK-NEXT:    .cfi_restore_state
-- ; CHECK-NEXT:    mov w8, #1 // =0x1
---; CHECK-NEXT:    mov w9, #2 // =0x2
---; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--+; CHECK-NEXT:    str w8, [sp, #16]
--+; CHECK-NEXT:    b .LBB0_5
-- ; CHECK-NEXT:  .LBB0_3:
--+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--+; CHECK-NEXT:    ldur w8, [x29, #-8]
--+; CHECK-NEXT:    cbnz w8, .LBB0_2
--+; CHECK-NEXT:  .LBB0_4:
--+; CHECK-NEXT:    mov w8, #1 // =0x1
--+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
--+; CHECK-NEXT:  .LBB0_5:
-- ; CHECK-NEXT:    mov w0, wzr
-- ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
-- ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
--diff -ruN --strip-trailing-cr a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
----- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
--+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
--@@ -2,23 +2,29 @@
-- ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
-- ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
-- 
---; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
---; no runtime check is generated and the loop should not be vectorized.
--+; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
--+; no runtime check is generated even though one is needed and !noalias
--+; annotations are added.
-- define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
-- ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
-- ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
---; LIMIT0-NEXT:  [[ENTRY:.*]]:
---; LIMIT0-NEXT:    br label %[[LOOP:.*]]
---; LIMIT0:       [[LOOP]]:
---; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
---; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
--+; LIMIT0-NEXT:  [[ENTRY:.*:]]
--+; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
--+; LIMIT0:       [[VECTOR_PH]]:
--+; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
--+; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
--+; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
--+; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
--+; LIMIT0:       [[VECTOR_BODY]]:
--+; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-- ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
---; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
---; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
--+; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
--+; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-- ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
---; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
--+; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
--+; LIMIT0:       [[MIDDLE_BLOCK]]:
--+; LIMIT0-NEXT:    br label %[[EXIT:.*]]
-- ; LIMIT0:       [[EXIT]]:
---; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
-- ; LIMIT0-NEXT:    ret i16 [[TMP0]]
-- ;
-- ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
--@@ -82,9 +88,14 @@
-- !3 = !{!"llvm.loop.vectorize.enable", i1 true}
-- 
-- ;.
---; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
---; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
---; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
--+; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
--+; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
--+; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
--+; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
--+; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
--+; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
--+; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
--+; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
-- ;.
-- ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
-- ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
--diff -ruN --strip-trailing-cr a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
----- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
--+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
--@@ -1320,8 +1320,9 @@
-- }
-- 
-- template <typename T>
---T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
---                               StringRef entryType, uint64_t depth) {
--+T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
--+                               uint64_t index, StringRef entryType,
--+                               uint64_t depth) {
--   if (index >= entries.size()) {
--     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
--     return {};
 diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index e573782..3c9c005 100644
+index 3c9c005..26b3bf8 100644
 --- a/third_party/llvm/workspace.bzl
 +++ b/third_party/llvm/workspace.bzl
 @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
  
  def repo(name):
      """Imports LLVM."""
--    LLVM_COMMIT = "87bf5ee23863bc0b467ee44b2184b2c134a98464"
--    LLVM_SHA256 = "9d0bca271bfb266de8453cd34156741fd41f64b911f580262d187ce4d4d9b6d9"
-+    LLVM_COMMIT = "48d942c7158af43094db1b5e6c59c6e6fcf1b5aa"
-+    LLVM_SHA256 = "6ce4ac276a4687625e9f57e53715285d99b60c6553e0cde4db9b7e74f2179f69"
+-    LLVM_COMMIT = "48d942c7158af43094db1b5e6c59c6e6fcf1b5aa"
+-    LLVM_SHA256 = "6ce4ac276a4687625e9f57e53715285d99b60c6553e0cde4db9b7e74f2179f69"
++    LLVM_COMMIT = "16c0893f04c04faa8ac36495363344840f7c5db1"
++    LLVM_SHA256 = "3f786bc56ecb8fce511fe504f9b0848c12b5312beb7bded23edfc77272698b90"
  
      tf_http_archive(
          name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index 230123596f420e..a0a63a05ad95ea 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "d9023f29bb8ad1fcb72b8183de06f8bc86fc195d"
-    SHARDY_SHA256 = "2b8951f25c0c1e6c1569b842ef3f68a3cefdcc2a1a53eb6f4970d5bf1df91eb5"
+    SHARDY_COMMIT = "179bcb16dc3c2b132f9bccff096cb5559486fdc2"
+    SHARDY_SHA256 = "ba6475e764d830d3e8f9ede9c28f3e67f6703606af10f1398cfcca6a13979e09"
 
     tf_http_archive(
         name = "shardy",

From f5047b172a3141ae75aae1ee843f5b1d44f7af09 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 15:14:10 -0800
Subject: [PATCH 194/753] Reverts 347bf33edd1df1331c2f9d6b443cf90651e9d59e

PiperOrigin-RevId: 843394019
---
 third_party/xla/xla/service/compiler.h                     | 7 +++++++
 .../xla/xla/service/cpu/cpu_aot_compilation_result.h       | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index 99f4cfa1171eed..b47c105d72160c 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -69,6 +69,7 @@ namespace xla {
 // computation.
 using ObjectFileData = std::vector<char>;
 
+class Compiler;
 class AotCompilationOptions;
 
 // Abstract superclass describing the result of an ahead-of-time compilation.
@@ -88,6 +89,12 @@ class AotCompilationResult {
     return Unimplemented("LoadExecutable unimplemented.");
   }
 
+  ABSL_DEPRECATE_AND_INLINE()
+  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
+      Compiler*, const se::StreamExecutor* executor) && {
+    return std::move(*this).LoadExecutable(executor);
+  }
+
   virtual absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
       const {
     return Unimplemented("buffer_assignment unimplemented.");
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index 59a3a0597ab7c6..1f845de703b5ec 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -116,6 +116,8 @@ class CpuAotCompilationResult : public AotCompilationResult {
     return proto_.SerializeAsString();
   }
 
+  using AotCompilationResult::LoadExecutable;
+
   absl::StatusOr<std::unique_ptr<Executable>>
       LoadExecutable(const se::StreamExecutor* stream_exec) && override;
 

From ce5fd34ac6c50e52e59e3d6c32301fcadc8c89f6 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 11 Dec 2025 15:29:00 -0800
Subject: [PATCH 195/753] Stop using recordphase in HloRunnerPjRt.

We are removing this functionality due to the limited utility it provides.

PiperOrigin-RevId: 843399237
---
 third_party/xla/xla/service/BUILD              |  1 -
 third_party/xla/xla/service/hlo_runner_pjrt.cc | 17 -----------------
 2 files changed, 18 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index a18e1b9660700f..2fc73006ca32d4 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4529,7 +4529,6 @@ cc_library(
         "//xla/pjrt:pjrt_executable",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:recordphase",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:nullability",
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index 40d43d98357280..c85e137fe88825 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -58,7 +58,6 @@ limitations under the License.
 #include "xla/status_macros.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/recordphase.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/threadpool.h"
 #include "xla/util.h"
@@ -302,9 +301,6 @@ absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>
 HloRunnerPjRt::TransferLiteralsToDevice(
     const absl::Span<const ShapeLayout> layouts,
     const absl::Span<const Literal* const> literals) {
-  tsl::recordphase::RecordScoped rs("HloRunnerPjRt_TransferLiteralsToDevice",
-                                    /*use_unique_phase_name=*/true);
-
   // Note: This function is used for single (default) device execution.
   if (pjrt_client_->addressable_device_count() <= kDeviceIdx) {
     return absl::InternalError("No addressable devices available");
@@ -365,9 +361,6 @@ HloRunnerPjRt::TransferLiteralsToDevice(
 absl::StatusOr<Literal> HloRunnerPjRt::TransferLiteralsFromDevice(
     absl::Span<const std::unique_ptr<PjRtBuffer>> output_buffers,
     const bool untuple_result) {
-  tsl::recordphase::RecordScoped rs("HloRunnerPjRt_TransferLiteralsFromDevice",
-                                    /*use_unique_phase_name=*/true);
-
   if (!untuple_result) {
     // If not flattened, the tuple should only contain arrays with layouts.
     TF_RET_CHECK(output_buffers.size() == 1)
@@ -408,9 +401,6 @@ HloRunnerPjRt::ExecuteWithDeviceBuffers(
     OpaqueExecutable* executable,
     const std::vector<std::unique_ptr<PjRtBuffer>>& arguments,
     const ExecuteOptions* execute_options) {
-  tsl::recordphase::RecordScoped rs("HloRunnerPjRt_Execute",
-                                    /*use_unique_phase_name=*/true);
-
   TF_ASSIGN_OR_RETURN(HloRunnerPjRtExecutable* const wrapped_executable,
                       HloRunnerPjRtExecutable::TryUnwrap(*this, executable));
   TF_ASSIGN_OR_RETURN(std::vector<std::shared_ptr<HloModule>> hlo_modules,
@@ -479,9 +469,6 @@ HloRunnerPjRt::ExecuteWithExecutable(OpaqueExecutable* executable,
 absl::StatusOr<std::unique_ptr<OpaqueExecutable>>
 HloRunnerPjRt::CreateExecutable(std::unique_ptr<HloModule> module,
                                 bool run_hlo_passes) {
-  tsl::recordphase::RecordScoped rs("HloRunnerPjRt_Compile",
-                                    /*use_unique_phase_name=*/true);
-
   TF_ASSIGN_OR_RETURN(
       CompileOptions compile_options,
       GenerateDefaultCompileOptions(module.get(), run_hlo_passes));
@@ -581,8 +568,6 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
               executable_provider_arg)
           -> absl::StatusOr<
               std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> {
-        tsl::recordphase::RecordScoped rs("HloRunnerPjRt_Execute",
-                                          /*use_unique_phase_name=*/true);
         TF_ASSIGN_OR_RETURN(
             PjRtLoadedExecutable * pjrt_executable,
             wrapped_executable->GetOrLoadExecutable(pjrt_client_.get()));
@@ -614,8 +599,6 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
               executable_provider_arg)
           -> absl::StatusOr<
               std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> {
-        tsl::recordphase::RecordScoped rs("HloRunnerPjRt_Execute",
-                                          /*use_unique_phase_name=*/true);
         TF_RET_CHECK(options.use_threads);
 
         // The underlying data is modified concurrently. We don't need to

From 6b125c7ff789ae59dd1678552c491a4e77e7efbf Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 11 Dec 2025 16:42:12 -0800
Subject: [PATCH 196/753] Remove the Google-internal recordphase feature due to
 lack of use & utility.

Reverts changelist 769271048

PiperOrigin-RevId: 843424305
---
 third_party/xla/xla/tsl/platform/BUILD        |  8 --
 .../xla/xla/tsl/platform/default/BUILD        | 14 ---
 .../xla/tsl/platform/default/recordphase.cc   | 34 -------
 .../xla/xla/tsl/platform/recordphase.h        | 99 -------------------
 4 files changed, 155 deletions(-)
 delete mode 100644 third_party/xla/xla/tsl/platform/default/recordphase.cc
 delete mode 100644 third_party/xla/xla/tsl/platform/recordphase.h

diff --git a/third_party/xla/xla/tsl/platform/BUILD b/third_party/xla/xla/tsl/platform/BUILD
index 16110e0432404f..cc094e447e7d8f 100644
--- a/third_party/xla/xla/tsl/platform/BUILD
+++ b/third_party/xla/xla/tsl/platform/BUILD
@@ -46,7 +46,6 @@ exports_files(
         "env.cc",
         "ram_file_system.h",
         "grpc_credentials.h",
-        "recordphase.h",
         "resource.h",
         "rocm_rocdl_path.h",
         "resource_loader.h",
@@ -133,7 +132,6 @@ filegroup(
         "file_system_helper.h",
         "prefetch.h",
         "ram_file_system.h",
-        "recordphase.h",
         "resource.h",
         "stack_frame.h",
         "statusor.h",
@@ -906,12 +904,6 @@ tsl_cc_test(
     ],
 )
 
-cc_library(
-    name = "recordphase",
-    textual_hdrs = ["recordphase.h"],
-    deps = tf_platform_deps("recordphase") + ["@com_google_absl//absl/strings:string_view"],
-)
-
 cc_library(
     name = "debug_me_context",
     hdrs = ["debug_me_context.h"],
diff --git a/third_party/xla/xla/tsl/platform/default/BUILD b/third_party/xla/xla/tsl/platform/default/BUILD
index 0d82327fbcdfad..699c941b26c651 100644
--- a/third_party/xla/xla/tsl/platform/default/BUILD
+++ b/third_party/xla/xla/tsl/platform/default/BUILD
@@ -670,17 +670,3 @@ exports_files(
         "//tensorflow/core/platform:__pkg__",
     ]),
 )
-
-cc_library(
-    name = "recordphase",
-    srcs = ["recordphase.cc"],
-    hdrs = ["//xla/tsl/platform:recordphase.h"],
-    tags = [
-        "manual",
-        "no_oss",
-        "nobuilder",
-    ],
-    deps = [
-        "@com_google_absl//absl/strings:string_view",
-    ],
-)
diff --git a/third_party/xla/xla/tsl/platform/default/recordphase.cc b/third_party/xla/xla/tsl/platform/default/recordphase.cc
deleted file mode 100644
index fb823690d0581f..00000000000000
--- a/third_party/xla/xla/tsl/platform/default/recordphase.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/tsl/platform/recordphase.h"
-
-#include <string>
-#include <vector>
-
-#include "absl/strings/string_view.h"
-
-namespace tsl::recordphase {
-void StartPhase(const absl::string_view phase_name,
-                const std::vector<absl::string_view>& dependencies) {}
-
-std::string StartPhaseUnique(
-    absl::string_view phase_name,
-    const std::vector<absl::string_view>& dependencies) {
-  return std::string(phase_name);
-}
-
-void EndPhase(const absl::string_view phase_name) {}
-}  // namespace tsl::recordphase
diff --git a/third_party/xla/xla/tsl/platform/recordphase.h b/third_party/xla/xla/tsl/platform/recordphase.h
deleted file mode 100644
index 23bf927136525a..00000000000000
--- a/third_party/xla/xla/tsl/platform/recordphase.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Client library for recording action subphase timing metrics.
-//
-// This is **different** to the TSL profiler TraceMe functionality.
-//
-// Currently the public implementation of this library is a stub that does
-// nothing. There is an implementation internally at Google.
-//
-// This library is used to record the start and end of a subphase in an action.
-// A subphase is a named section of work that happens within an action.
-//
-// Example:
-//   // Start a phase named "parse_action".
-//   StartPhase("parse_action");
-//   // Do some work.
-//   // ...
-//   // End the phase.
-//   EndPhase("parse_action");
-//   // Start another phase named "link_executable" which depends on
-//   // (always starts after) "parse_action".
-//   StartPhase("link_executable", {"parse_action"});
-//   // Do some work.
-//   // ...
-//   // End the phase.
-//   EndPhase("link_executable");
-//
-// The StartPhase and EndPhase methods are thread-safe.
-//
-// The LoadPhase and LoadAllPhases methods can be used in
-// unit tests to verify the recorded phase timing information.
-
-#ifndef XLA_TSL_PLATFORM_RECORDPHASE_H_
-#define XLA_TSL_PLATFORM_RECORDPHASE_H_
-
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "absl/strings/string_view.h"
-
-namespace tsl::recordphase {
-// Records the start of a phase.
-// * phase_name: the name of the phase, must be unique within the namespace.
-// * dependencies: the phases that must complete before this phase can start.
-// phase_name and dependencies must contain only alphanumeric characters,
-// dashes and underscores.
-// If a phase has already started, or a dependency does not exist,
-// or if there are illegal characters in the phase name or its dependencies,
-// the method will log an error and fail silently.
-void StartPhase(absl::string_view phase_name,
-                const std::vector<absl::string_view>& dependencies = {});
-
-// This is like StartPhase, but it generates a unique phase name (which it uses
-// to invoke StartPhase) and returns it.
-std::string StartPhaseUnique(
-    absl::string_view phase_name,
-    const std::vector<absl::string_view>& dependencies = {});
-
-// Records the end of a phase. The phase must have been started before.
-void EndPhase(absl::string_view phase_name);
-
-// Simple RAII wrapper around StartPhase and EndPhase. Does not perform any
-// additional checking.
-class RecordScoped {
- public:
-  explicit RecordScoped(const absl::string_view phase_name,
-                        bool use_unique_phase_name = false,
-                        const std::vector<absl::string_view>& dependencies = {})
-      : phase_name_(phase_name) {
-    if (!use_unique_phase_name) {
-      StartPhase(phase_name_, dependencies);
-    } else {
-      phase_name_ = StartPhaseUnique(phase_name, dependencies);
-    }
-  }
-  ~RecordScoped() { EndPhase(phase_name_); }
-
-  absl::string_view phase_name() const { return phase_name_; }
-
- private:
-  std::string phase_name_;
-};
-}  // namespace tsl::recordphase
-
-#endif  // XLA_TSL_PLATFORM_RECORDPHASE_H_

From f54f40d8f2b6bd14965d531eaf7ec25dabb8a3bf Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Thu, 11 Dec 2025 17:10:06 -0800
Subject: [PATCH 197/753] Support bf16 x bf16 -> f32 and i8 x i8 -> i32 data
 types  in convolution op with YNNPACK enabled.

PiperOrigin-RevId: 843432738
---
 .../xla/xla/backends/cpu/ynn_support.cc       |  4 +-
 .../xla/xla/service/cpu/cpu_compiler.cc       | 49 ++++++++++++++-----
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index b455c00c7734fa..4c8825d192aa8b 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -280,9 +280,7 @@ bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
   // Stores tuple of allowed (input, output) dtypes.
   static const absl::NoDestructor<absl::flat_hash_set<
       std::tuple<PrimitiveType, PrimitiveType, PrimitiveType>>>
-      kAllowedTypes({
-          {F32, F32, F32},
-      });
+      kAllowedTypes({{F32, F32, F32}, {BF16, BF16, F32}, {S8, S8, S32}});
 
   PrimitiveType lhs_dtype = conv->operand(0)->shape().element_type();
   PrimitiveType rhs_dtype = conv->operand(1)->shape().element_type();
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 5cced18ff84246..300eaaa319baa9 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -542,6 +542,19 @@ std::unique_ptr<HloPassFix<HloPassPipeline>> CreateSimplificationPipeline(
   return pipeline;
 }
 
+auto LibrarySupportsConvolution(
+    HloModule* module, TargetMachineFeatures* target_machine_features) {
+  const bool ynnpack_convolution_enabled = absl::c_linear_search(
+      module->config().debug_options().xla_cpu_experimental_ynn_fusion_type(),
+      DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION);
+  return [=](const HloInstruction& instr) {
+#ifdef XLA_YNNPACK
+    return ynnpack_convolution_enabled && IsConvolutionOpSupportedByYnn(&instr);
+#endif  // XLA_YNNPACK
+    return false;
+  };
+}
+
 auto LibrarySupportsDot(HloModule* module,
                         TargetMachineFeatures* target_machine_features) {
   // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN
@@ -670,31 +683,41 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   auto library_supports_dot =
       LibrarySupportsDot(module, target_machine_features);
 
-  auto call_library_for_dot = [&](const HloInstruction& instr) {
-    if (instr.opcode() != HloOpcode::kDot) {
+  auto library_supports_convolution =
+      LibrarySupportsConvolution(module, target_machine_features);
+
+  auto call_library_for_instruction = [&](const HloInstruction& instr) {
+    if (instr.opcode() != HloOpcode::kDot &&
+        instr.opcode() != HloOpcode::kConvolution) {
       return false;
     }
 
-    auto dot_strategy = GetDotImplementationStrategy(
-        module->config(), instr, *target_machine_features,
-        /*allow_runtime_calls=*/true);
-    if (dot_strategy != DotImplementationStrategy::kEigen) {
-      // We aren't going to call a library for this dot.
-      return false;
+    if (instr.opcode() == HloOpcode::kDot) {
+      auto dot_strategy = GetDotImplementationStrategy(
+          module->config(), instr, *target_machine_features,
+          /*allow_runtime_calls=*/true);
+      if (dot_strategy != DotImplementationStrategy::kEigen) {
+        // We aren't going to call a library for this dot.
+        return false;
+      }
+      return library_supports_dot(instr);
+    }
+    if (instr.opcode() == HloOpcode::kConvolution) {
+      return library_supports_convolution(instr);
     }
 
-    return library_supports_dot(instr);
+    return false;
   };
 
   // If YNNPACK is enabled, we only need to upcast dots that YnnDotThunk does
   // not support. `upcaster_filter` returns false if the instruction shouldn't
   // be processed.
   HloPredicate upcaster_filter = [&](const HloInstruction* instr) {
-    return !call_library_for_dot(*instr);
+    return !call_library_for_instruction(*instr);
   };
 
-  // xla::cpu::GetDotImplementationStrategy (used by call_library_for_dot)
-  // relies on the canonical form of dots.
+  // xla::cpu::GetDotImplementationStrategy (used by
+  // call_library_for_instruction) relies on the canonical form of dots.
   pipeline.AddPass<DotDecomposer>();
   pipeline.AddPass<OperandUpcaster>(upcaster_filter);
 
@@ -754,7 +777,7 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn(
   // Convert BF16 and F8 operations to F32 and F16 respectively so that the CPU
   // backend can support BF16/F8 operations without directly implementing a
   // BF16/F8 lowering for most ops.
-  CpuFloatSupport bf16_support(BF16, call_library_for_dot);
+  CpuFloatSupport bf16_support(BF16, call_library_for_instruction);
 #ifdef XLA_ONEDNN
   bool use_onednn_graph =
       module->config().debug_options().xla_cpu_use_onednn() &&

From e735855c1b19677e8a276958ec949d57dce8ec0f Mon Sep 17 00:00:00 2001
From: Gregory Pataky <gregpataky@google.com>
Date: Thu, 11 Dec 2025 17:47:20 -0800
Subject: [PATCH 198/753] Clean up xla/types.h

Remove unused include, add missin include.

PiperOrigin-RevId: 843442850
---
 third_party/xla/xla/BUILD   | 1 +
 third_party/xla/xla/types.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index e653fc2cc8b7e3..1e53b7268f50fd 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -217,6 +217,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@eigen_archive//:eigen3",
         "@local_tsl//tsl/platform:ml_dtypes",
+        "@ml_dtypes_py//ml_dtypes:intn",
     ],
 )
 
diff --git a/third_party/xla/xla/types.h b/third_party/xla/xla/types.h
index b702404601dae7..89ad42ea911609 100644
--- a/third_party/xla/xla/types.h
+++ b/third_party/xla/xla/types.h
@@ -19,11 +19,11 @@ limitations under the License.
 #include <complex>
 #include <cstdint>
 #include <limits>
-#include <string>
 #include <type_traits>
 
 #include "absl/strings/str_cat.h"
 #include "Eigen/Core"  // IWYU pragma: export
+#include "ml_dtypes/include/intn.h"
 #include "tsl/platform/ml_dtypes.h"  // IWYU pragma: export
 
 namespace xla {

From d7c1cdeb5a563b84c50cdc6c3737eefe45d3ed49 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Thu, 11 Dec 2025 17:54:29 -0800
Subject: [PATCH 199/753] Rename std::unique_ptr<HloModule>
 consume_optimized_module() -> std::shared_ptr<HloModule>
 shared_optimized_module()

PiperOrigin-RevId: 843444886
---
 third_party/xla/xla/service/compiler.h                      | 2 +-
 .../xla/xla/service/cpu/cpu_aot_compilation_result.h        | 6 +++---
 .../xla/xla/service/gpu/gpu_aot_compilation_result.h        | 6 +++---
 .../xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h | 6 +++---
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index b47c105d72160c..e69bd65eeea5b9 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -103,7 +103,7 @@ class AotCompilationResult {
   // Returns the optimized HLO module if one was computed and the implementation
   // supports it.
   virtual const HloModule* optimized_module() const = 0;
-  virtual std::unique_ptr<HloModule> consume_optimized_module() = 0;
+  virtual std::shared_ptr<HloModule> shared_optimized_module() = 0;
 
  protected:
   AotCompilationResult() = default;
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index 1f845de703b5ec..2ca82de23ae14f 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -123,8 +123,8 @@ class CpuAotCompilationResult : public AotCompilationResult {
 
   const HloModule* optimized_module() const override { return module_.get(); }
 
-  std::unique_ptr<HloModule> consume_optimized_module() override {
-    return std::move(module_);
+  std::shared_ptr<HloModule> shared_optimized_module() override {
+    return module_;
   }
 
   const CompilationResultProto& proto() const { return proto_; }
@@ -193,7 +193,7 @@ class CpuAotCompilationResult : public AotCompilationResult {
         function_library_(std::move(function_library)) {}
 
   CompilationResultProto proto_;
-  std::unique_ptr<HloModule> module_;
+  std::shared_ptr<HloModule> module_;
   std::optional<size_t> temp_allocation_index_;
   std::vector<BufferAllocationInfo> buffer_allocation_infos_;
 
diff --git a/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h b/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h
index 1fbf15aaa8865c..e9da581af3382b 100644
--- a/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/gpu/gpu_aot_compilation_result.h
@@ -78,8 +78,8 @@ class GpuAotCompilationResult : public AotCompilationResult {
 
   const HloModule* optimized_module() const final { return hlo_module_.get(); };
 
-  std::unique_ptr<HloModule> consume_optimized_module() final {
-    return std::move(hlo_module_);
+  std::shared_ptr<HloModule> shared_optimized_module() final {
+    return hlo_module_;
   };
 
  private:
@@ -89,7 +89,7 @@ class GpuAotCompilationResult : public AotCompilationResult {
         hlo_module_(std::move(hlo_module)) {}
 
   GpuExecutableProto executable_;
-  std::unique_ptr<HloModule> hlo_module_;
+  std::shared_ptr<HloModule> hlo_module_;
 };
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
index d511992ce5e371..f4c462b782251a 100644
--- a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
@@ -67,8 +67,8 @@ class LegacyGpuAotCompilationResult : public AotCompilationResult {
       LoadExecutable(const se::StreamExecutor* stream_exec) && override;
 
   const HloModule* optimized_module() const override { return module_.get(); }
-  std::unique_ptr<HloModule> consume_optimized_module() override {
-    return std::move(module_);
+  std::shared_ptr<HloModule> shared_optimized_module() override {
+    return module_;
   }
 
   absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
@@ -85,7 +85,7 @@ class LegacyGpuAotCompilationResult : public AotCompilationResult {
         pointer_size_(pointer_size),
         compiler_(compiler) {}
 
-  std::unique_ptr<HloModule> module_;
+  std::shared_ptr<HloModule> module_;
   GpuExecutableProto proto_;
   int pointer_size_;
   Compiler* compiler_;

From c2fbe3931d8bb8c5d3eed32c501d3662aa27549a Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Thu, 11 Dec 2025 18:24:49 -0800
Subject: [PATCH 200/753] Update XNNPACK in XLA

PiperOrigin-RevId: 843455527
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake | 2 +-
 tensorflow/workspace2.bzl                         | 6 +++---
 third_party/xla/third_party/xnnpack/workspace.bzl | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index b28d5c5b01c1ed..e3efb7cf5ab430 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG dc05a09f076534ce56c6f5b82a0327850c66bf3c
+  GIT_TAG 6400256d3a687d52ae268a553d7208534f39800a
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 13be76236cb855..924a23bbe2fd2b 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,9 +168,9 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "7480edcb300368d5516b583d6312b596cd8c23395c214bb786ec2a1e09eb6b4b",
-        strip_prefix = "XNNPACK-dc05a09f076534ce56c6f5b82a0327850c66bf3c",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/dc05a09f076534ce56c6f5b82a0327850c66bf3c.zip"),
+        sha256 = "2d5e0b17d2c25c7100f66e58e7d76b9c4b8a65b1d86c33c9214dc05fce00ee69",
+        strip_prefix = "XNNPACK-6400256d3a687d52ae268a553d7208534f39800a",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/6400256d3a687d52ae268a553d7208534f39800a.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index d4696680d3f47b..1c2e9b15daa1e7 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "7480edcb300368d5516b583d6312b596cd8c23395c214bb786ec2a1e09eb6b4b",
-        strip_prefix = "XNNPACK-dc05a09f076534ce56c6f5b82a0327850c66bf3c",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/dc05a09f076534ce56c6f5b82a0327850c66bf3c.zip"),
+        sha256 = "2d5e0b17d2c25c7100f66e58e7d76b9c4b8a65b1d86c33c9214dc05fce00ee69",
+        strip_prefix = "XNNPACK-6400256d3a687d52ae268a553d7208534f39800a",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/6400256d3a687d52ae268a553d7208534f39800a.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)

From 4d4ebae8265e94e9a6f96f1e68588e705497366c Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Thu, 11 Dec 2025 18:35:29 -0800
Subject: [PATCH 201/753] Replace `http_archive` with `tf_http_archive` for
 Github links to avoid timeout issues.

PiperOrigin-RevId: 843458022
---
 WORKSPACE                 | 14 ++++++-----
 tensorflow/workspace0.bzl | 14 +++++------
 tensorflow/workspace1.bzl |  9 ++++---
 tensorflow/workspace3.bzl | 49 +++++++++++++++++----------------------
 4 files changed, 40 insertions(+), 46 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 0fc24cb3edd116..97d26fb10fd770 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -4,26 +4,28 @@ workspace(name = "org_tensorflow")
 
 # buildifier: disable=load-on-top
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
-http_archive(
+tf_http_archive(
     name = "rules_shell",
     sha256 = "bc61ef94facc78e20a645726f64756e5e285a045037c7a61f65af2941f4c25e1",
     strip_prefix = "rules_shell-0.4.1",
-    url = "https://github.com/bazelbuild/rules_shell/releases/download/v0.4.1/rules_shell-v0.4.1.tar.gz",
+    urls = tf_mirror_urls(
+        "https://github.com/bazelbuild/rules_shell/releases/download/v0.4.1/rules_shell-v0.4.1.tar.gz",
+    ),
 )
 
 # Initialize toolchains for ML projects.
 #
 # A hermetic build system is designed to produce completely reproducible builds for C++.
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
-http_archive(
+tf_http_archive(
     name = "rules_ml_toolchain",
     sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18",
     strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9",
-    urls = [
+    urls = tf_mirror_urls(
         "https://github.com/yuriivcs/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz",
-    ],
+    ),
 )
 
 load(
diff --git a/tensorflow/workspace0.bzl b/tensorflow/workspace0.bzl
index 144e34d7460806..e6507c60a4090b 100644
--- a/tensorflow/workspace0.bzl
+++ b/tensorflow/workspace0.bzl
@@ -8,6 +8,7 @@ load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependenci
 load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
 load("@local_config_android//:android.bzl", "android_workspace")
 load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 load("//third_party/googleapis:repository_rules.bzl", "config_googleapis")
 
 def _tf_bind():
@@ -79,14 +80,13 @@ def workspace():
     # Note: We add this to fix Kokoro builds.
     # The rules below call into `rules_proto` but the hash has changed and
     # Bazel refuses to continue. So, we add our own mirror.
-    http_archive(
+    tf_http_archive(
         name = "rules_proto",
         sha256 = "20b240eba17a36be4b0b22635aca63053913d5c1ee36e16be36499d167a2f533",
         strip_prefix = "rules_proto-11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8",
-        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_proto/archive/11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_proto/archive/11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8.tar.gz",
-        ],
+        ),
     )
 
     # Now, finally use the rules
@@ -106,13 +106,13 @@ def workspace():
 
     # Toolchains for ML projects hermetic builds.
     # Details: https://github.com/google-ml-infra/rules_ml_toolchain
-    http_archive(
+    tf_http_archive(
         name = "rules_ml_toolchain",
         sha256 = "b1e5e306d8b1103e73b9b778dfc3a9e069d20664437a03246a235724962b5c94",
         strip_prefix = "rules_ml_toolchain-484235be45e6843db962c45d08fe4b2b65a6a24c",
-        urls = [
+        urls = tf_mirror_urls(
             "https://github.com/google-ml-infra/rules_ml_toolchain/archive/484235be45e6843db962c45d08fe4b2b65a6a24c.tar.gz",
-        ],
+        ),
     )
 
 # Alias so it can be loaded without assigning to a different symbol to prevent
diff --git a/tensorflow/workspace1.bzl b/tensorflow/workspace1.bzl
index 399ff8f7579a7d..408e9a89183f0f 100644
--- a/tensorflow/workspace1.bzl
+++ b/tensorflow/workspace1.bzl
@@ -1,11 +1,11 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
 load("@com_google_benchmark//:bazel/benchmark_deps.bzl", "benchmark_deps")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 load("@local_xla//third_party/llvm:setup.bzl", "llvm_setup")
 load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 load("//third_party/android:android_configure.bzl", "android_configure")
 
 # buildifier: disable=unnamed-macro
@@ -21,14 +21,13 @@ def workspace(with_rules_cc = True):
 
     closure_repositories()
 
-    http_archive(
+    tf_http_archive(
         name = "bazel_toolchains",
         sha256 = "294cdd859e57fcaf101d4301978c408c88683fbc46fbc1a3829da92afbea55fb",
         strip_prefix = "bazel-toolchains-8c717f8258cd5f6c7a45b97d974292755852b658",
-        urls = [
-            "http://mirror.tensorflow.org/github.com/bazelbuild/bazel-toolchains/archive/8c717f8258cd5f6c7a45b97d974292755852b658.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/bazel-toolchains/archive/8c717f8258cd5f6c7a45b97d974292755852b658.tar.gz",
-        ],
+        ),
     )
 
     android_configure(name = "local_config_android")
diff --git a/tensorflow/workspace3.bzl b/tensorflow/workspace3.bzl
index adabcc54fc586d..b74e2e012b0e3f 100644
--- a/tensorflow/workspace3.bzl
+++ b/tensorflow/workspace3.bzl
@@ -1,80 +1,73 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
-load("//third_party:repo.bzl", "tf_vendored")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls", "tf_vendored")
 load("//third_party/tf_runtime:workspace.bzl", tf_runtime = "repo")
 
 def workspace():
     tf_vendored(name = "local_xla", path = "third_party/xla")
     tf_vendored(name = "local_tsl", path = "third_party/xla/third_party/tsl")
 
-    http_archive(
+    tf_http_archive(
         name = "io_bazel_rules_closure",
         sha256 = "5b00383d08dd71f28503736db0500b6fb4dda47489ff5fc6bed42557c07c6ba9",
         strip_prefix = "rules_closure-308b05b2419edb5c8ee0471b67a40403df940149",
-        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz",  # 2019-06-13
-        ],
+        ),
     )
 
     tf_runtime()
 
     # https://github.com/bazelbuild/bazel-skylib/releases
-    http_archive(
+    tf_http_archive(
         name = "bazel_skylib",
         sha256 = "bc283cdfcd526a52c3201279cda4bc298652efa898b10b4db0837dc51652756f",
-        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz",
-        ],
+        ),
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_license",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz",
-        ],
+        ),
         sha256 = "4531deccb913639c30e5c7512a054d5d875698daeb75d8cf90f284375fe7c360",
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_pkg",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.7.1/rules_pkg-0.7.1.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_pkg/releases/download/0.7.1/rules_pkg-0.7.1.tar.gz",
-        ],
+        ),
         sha256 = "451e08a4d78988c06fa3f9306ec813b836b1d076d0f055595444ba4ff22b867f",
     )
 
-    http_archive(
+    tf_http_archive(
         name = "bazel_features",
         sha256 = "4fd9922d464686820ffd8fcefa28ccffa147f7cdc6b6ac0d8b07fde565c65d66",
         strip_prefix = "bazel_features-1.25.0",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazel-contrib/bazel_features/releases/download/v1.25.0/bazel_features-v1.25.0.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazel-contrib/bazel_features/releases/download/v1.25.0/bazel_features-v1.25.0.tar.gz",
-        ],
+        ),
     )
 
     # Maven dependencies.
     RULES_JVM_EXTERNAL_TAG = "4.3"
-    http_archive(
+    tf_http_archive(
         name = "rules_jvm_external",
         strip_prefix = "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG,
         sha256 = "6274687f6fc5783b589f56a2f1ed60de3ce1f99bc4e8f9edef3de43bdf7c6e74",
-        url = "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG,
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG),
     )
 
     # Platforms
-    http_archive(
+    tf_http_archive(
         name = "platforms",
         sha256 = "29742e87275809b5e598dc2f04d86960cc7a55b3067d97221c9abbc9926bff0f",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.11/platforms-0.0.11.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/platforms/releases/download/0.0.11/platforms-0.0.11.tar.gz",
-        ],
+        ),
     )
 
 # Alias so it can be loaded without assigning to a different symbol to prevent

From c1dc3772d29977666737355147cbd2f61b5d21a9 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Thu, 11 Dec 2025 18:41:40 -0800
Subject: [PATCH 202/753] Replace `http_archive` with `tf_http_archive` to use
 mirrored archives for python repository rules.

PiperOrigin-RevId: 843459412
---
 third_party/py/python_init_rules.bzl          | 23 ++++++++-----------
 .../xla/third_party/py/python_init_rules.bzl  | 23 ++++++++-----------
 2 files changed, 20 insertions(+), 26 deletions(-)

diff --git a/third_party/py/python_init_rules.bzl b/third_party/py/python_init_rules.bzl
index ac9b8eb3893441..e8bfd6548965e4 100644
--- a/third_party/py/python_init_rules.bzl
+++ b/third_party/py/python_init_rules.bzl
@@ -1,6 +1,5 @@
 """Hermetic Python initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def python_init_rules(extra_patches = []):
@@ -11,15 +10,14 @@ def python_init_rules(extra_patches = []):
         set of patches.
     """
 
-    http_archive(
+    tf_http_archive(
         name = "rules_cc",
-        urls = ["https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"],
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"),
         strip_prefix = "rules_cc-0.1.0",
         sha256 = "4b12149a041ddfb8306a8fd0e904e39d673552ce82e4296e96fac9cbf0780e59",
-        patches = [
-            Label("//third_party/py:rules_cc_protobuf.patch"),
+        patch_file = [
+            "@local_xla//third_party/py:rules_cc_protobuf.patch",
         ],
-        patch_args = ["-p1"],
     )
 
     tf_http_archive(
@@ -34,15 +32,14 @@ def python_init_rules(extra_patches = []):
         },
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_python",
         sha256 = "fa7dd2c6b7d63b3585028dd8a90a6cf9db83c33b250959c2ee7b583a6c130e12",
         strip_prefix = "rules_python-1.6.0",
-        url = "https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz",
-        patch_args = ["-p1"],
-        patches = [
-            Label("//third_party/py:rules_python_pip_version.patch"),
-            Label("//third_party/py:rules_python_freethreaded.patch"),
-            Label("//third_party/py:rules_python_versions.patch"),
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz"),
+        patch_file = [
+            "@local_xla//third_party/py:rules_python_pip_version.patch",
+            "@local_xla//third_party/py:rules_python_freethreaded.patch",
+            "@local_xla//third_party/py:rules_python_versions.patch",
         ] + extra_patches,
     )
diff --git a/third_party/xla/third_party/py/python_init_rules.bzl b/third_party/xla/third_party/py/python_init_rules.bzl
index ac9b8eb3893441..e8bfd6548965e4 100644
--- a/third_party/xla/third_party/py/python_init_rules.bzl
+++ b/third_party/xla/third_party/py/python_init_rules.bzl
@@ -1,6 +1,5 @@
 """Hermetic Python initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def python_init_rules(extra_patches = []):
@@ -11,15 +10,14 @@ def python_init_rules(extra_patches = []):
         set of patches.
     """
 
-    http_archive(
+    tf_http_archive(
         name = "rules_cc",
-        urls = ["https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"],
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_cc/archive/refs/tags/0.1.0.tar.gz"),
         strip_prefix = "rules_cc-0.1.0",
         sha256 = "4b12149a041ddfb8306a8fd0e904e39d673552ce82e4296e96fac9cbf0780e59",
-        patches = [
-            Label("//third_party/py:rules_cc_protobuf.patch"),
+        patch_file = [
+            "@local_xla//third_party/py:rules_cc_protobuf.patch",
         ],
-        patch_args = ["-p1"],
     )
 
     tf_http_archive(
@@ -34,15 +32,14 @@ def python_init_rules(extra_patches = []):
         },
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_python",
         sha256 = "fa7dd2c6b7d63b3585028dd8a90a6cf9db83c33b250959c2ee7b583a6c130e12",
         strip_prefix = "rules_python-1.6.0",
-        url = "https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz",
-        patch_args = ["-p1"],
-        patches = [
-            Label("//third_party/py:rules_python_pip_version.patch"),
-            Label("//third_party/py:rules_python_freethreaded.patch"),
-            Label("//third_party/py:rules_python_versions.patch"),
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_python/releases/download/1.6.0/rules_python-1.6.0.tar.gz"),
+        patch_file = [
+            "@local_xla//third_party/py:rules_python_pip_version.patch",
+            "@local_xla//third_party/py:rules_python_freethreaded.patch",
+            "@local_xla//third_party/py:rules_python_versions.patch",
         ] + extra_patches,
     )

From eb6b5bf40ac554455a3eb583deeffdb8572a7e50 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Thu, 11 Dec 2025 18:50:49 -0800
Subject: [PATCH 203/753] Move timeout to the actual build/test step.

PiperOrigin-RevId: 843461430
---
 third_party/xla/.github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/.github/workflows/ci.yml b/third_party/xla/.github/workflows/ci.yml
index 69509b5a71321a..db5629dac494b0 100644
--- a/third_party/xla/.github/workflows/ci.yml
+++ b/third_party/xla/.github/workflows/ci.yml
@@ -115,7 +115,6 @@ jobs:
     defaults:
       run:
         shell: bash
-    timeout-minutes: 60
     steps:
       - name: "Checking out openxla/xla"
         uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
@@ -133,6 +132,7 @@ jobs:
           halt-dispatch-input: ${{ inputs.halt-for-connection }}
       - name: "Run build.py"
         working-directory: ${{ matrix.job_info.repo }}
+        timeout-minutes: 60
         run: |
           if [[ "${{ matrix.job_info.pool }}" == *windows* ]]; then
             python $GITHUB_WORKSPACE\\openxla\\xla\\build_tools\\ci\\build.py --build="${{ matrix.job_info.name }}_github_actions"

From a3d2213eb0da1387b15ce781d62e550b6839f864 Mon Sep 17 00:00:00 2001
From: Penporn Koanantakool <penporn@google.com>
Date: Thu, 11 Dec 2025 19:40:38 -0800
Subject: [PATCH 204/753] [xla:cpu:ynn] Enable rewriting elementwise op fusions
 with YNNPACK.

+ Update library_rewriter_test to match.

PiperOrigin-RevId: 843473334
---
 .../xla/xla/backends/cpu/transforms/BUILD     |  4 +-
 .../cpu/transforms/library_rewriter_test.cc   | 54 +++++++++++--------
 .../xla/xla/service/cpu/cpu_compiler.cc       |  5 +-
 3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/transforms/BUILD b/third_party/xla/xla/backends/cpu/transforms/BUILD
index d23039f371acb1..d1d0503dc032b8 100644
--- a/third_party/xla/xla/backends/cpu/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/transforms/BUILD
@@ -22,7 +22,7 @@ cc_library(
     name = "library_rewriter",
     srcs = ["library_rewriter.cc"],
     hdrs = ["library_rewriter.h"],
-    defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]),
+    defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]) + if_ynnpack(["XLA_YNNPACK"]),
     deps = [
         ":library_matcher",
         ":onednn_matcher",
@@ -49,7 +49,7 @@ cc_library(
 xla_cc_test(
     name = "library_rewriter_test",
     srcs = ["library_rewriter_test.cc"],
-    local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]),
+    local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]) + if_ynnpack(["XLA_YNNPACK"]),
     deps = [
         ":library_rewriter",
         "//xla:xla_data_proto_cc",
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc b/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
index 2646ee286b0259..4d050182773bd6 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
@@ -64,7 +64,7 @@ class CpuLibraryTest : public TargetMachineTestBase {
 
   static const DotRewriteTestSpec& GetDefaultTestSpec() {
     static const absl::NoDestructor<DotRewriteTestSpec> kDefaultTestSpec(
-        {"xnn", "f32", "f32", "znver3", "+avx,+avx2", "dot"});
+        {"ynn", "f32", "f32", "znver3", "+avx,+avx2", "dot"});
     return *kDefaultTestSpec;
   }
 
@@ -158,15 +158,22 @@ class CpuLibraryFullParamTest
     RunTestInternal(GetParam(), hlo_template, expected);
   }
 
+  // Manually update expected dtype support for each library.
   bool IsDotEnabledOnCPU() {
     DotRewriteTestSpec spec = GetParam();
-    bool bf16_dot_supported = absl::StrContains(spec.features, "+avx512bf16");
-    bool fp16_dot_supported = absl::StrContains(spec.features, "+avx512fp16");
+    EXPECT_TRUE(spec.lib == "onednn" || spec.lib == "ynn");
+
+    if (spec.lib == "ynn") {
+      return (spec.in_dtype == "f32" || spec.in_dtype == "bf16");
+    }
+
     if (spec.in_dtype == "bf16") {
-      return bf16_dot_supported;
+      return absl::StrContains(spec.features, "+avx512bf16") ||
+             absl::StrContains(spec.features, "+amx_bf16");
     }
     if (spec.in_dtype == "f16") {
-      return fp16_dot_supported;
+      return absl::StrContains(spec.features, "+avx512fp16") ||
+             absl::StrContains(spec.features, "+amx_fp16");
     }
     return true;
   }
@@ -259,7 +266,7 @@ TEST_P(CpuLibraryFullParamTest, MatMulDimSizeUnqual) {
 
   DotRewriteTestSpec spec = GetParam();
   FusionProperties expected = {HloOpcode::kDot, 0, 0, false};
-  if (spec.lib == "xnn" && IsDotEnabledOnCPU()) {
+  if (spec.lib == "ynn" && IsDotEnabledOnCPU()) {
     expected = FusionProperties{HloOpcode::kDot, 2, 3, true};
   }
   RunTest(hlo_template, expected);
@@ -307,8 +314,8 @@ TEST_P(CpuLibraryFullParamTest, MatMulAddSubMulSameInputs) {
   DotRewriteTestSpec spec = GetParam();
   FusionProperties expected = {HloOpcode::kMultiply, 0, 0, false};
   if (IsDotEnabledOnCPU()) {
-    // {Dot, Add, Sub, Mul} for XNN, {Dot, Add} for oneDNN.
-    expected = spec.lib == "xnn"
+    // {Dot, Add, Sub, Mul} for YNN, {Dot, Add} for oneDNN.
+    expected = spec.lib == "ynn"
                    ? FusionProperties{HloOpcode::kMultiply, 3, 7, true}
                    : FusionProperties{HloOpcode::kAdd, 3, 5, true};
   } else if (spec.fusion_mode == "greedy") {
@@ -338,8 +345,8 @@ TEST_P(CpuLibraryFullParamTest, MatMulAddSubMulDifferentInputs) {
   DotRewriteTestSpec spec = GetParam();
   FusionProperties expected = {HloOpcode::kMultiply, 0, 0, false};
   if (IsDotEnabledOnCPU()) {
-    // {Dot, Add, Sub, Mul} for XNN, {Dot, Add} for oneDNN.
-    expected = spec.lib == "xnn"
+    // {Dot, Add, Sub, Mul} for YNN, {Dot, Add} for oneDNN.
+    expected = spec.lib == "ynn"
                    ? FusionProperties{HloOpcode::kMultiply, 5, 9, true}
                    : FusionProperties{HloOpcode::kAdd, 3, 5, true};
   } else if (spec.fusion_mode == "greedy") {
@@ -373,12 +380,12 @@ TEST_P(CpuLibraryFullParamTest, MatMulAddMinExpSort) {
                      dimensions={0}, to_apply=compare
     })";
 
-  // Sort is not supported by xnn_emitter and should not be in the fusion.
+  // Sort is not supported by ynn_emitter and should not be in the fusion.
   DotRewriteTestSpec spec = GetParam();
   FusionProperties expected = {HloOpcode::kExp, 0, 0, false};
   if (IsDotEnabledOnCPU()) {
-    // {Dot, Add, Min, Exp} for XNN, {Dot, Add} for oneDNN.
-    expected = spec.lib == "xnn"
+    // {Dot, Add, Min, Exp} for YNN, {Dot, Add} for oneDNN.
+    expected = spec.lib == "ynn"
                    ? FusionProperties{HloOpcode::kExp, 4, 8, true}
                    : FusionProperties{HloOpcode::kAdd, 3, 5, true};
   } else if (spec.fusion_mode == "greedy") {
@@ -430,23 +437,23 @@ std::vector<DotRewriteTestSpec> GetDotRewriteTestSpecs() {
   absl::flat_hash_map<std::string, std::string> cpu_to_features = {
       {"znver3", "+avx,+avx2"},
       {"sapphirerapids",
-       "+avx512vnni,+avx512bf16,+amx-bf16,+avx512fp16,+amx-int8,+amx-tile,+amx-"
-       "transpose"},
+       "+avx512vnni,+avx512bf16,+amx-bf16,+avx512fp16,+amx-int8,+amx-tile"},
   };
 
   // Input and output data types to test per each library + CPU combination.
   using StrPair = std::pair<std::string, std::string>;
   absl::flat_hash_map<StrPair, std::vector<StrPair>> dtype_map = {
-      {{"xnn", "znver3"}, {{"f32", "f32"}, {"bf16", "f32"}}},
-      {{"xnn", "sapphirerapids"},
-       {{"f32", "f32"}, {"bf16", "f32"}, {"bf16", "bf16"}}},
+      {{"ynn", "znver3"}, {{"f32", "f32"}, {"bf16", "f32"}}},
+      {{"ynn", "sapphirerapids"}, {{"f32", "f32"}, {"bf16", "f32"}}},
   };
 
   // Fusion modes to test for each library.
-  // We temporarily use XNN_GRAPH_FUSION_MODE_DISABLED to denote the dot fusion
-  // mode (starting fusion nodes with dots).
-  absl::flat_hash_map<std::string, std::vector<std::string>> fusion_modes = {
-      {"xnn", {"dot", "greedy"}}};
+  absl::flat_hash_map<std::string, std::vector<std::string>> fusion_modes;
+
+#if XLA_YNNPACK
+  // Don't test YNNPACK if we don't build with it.
+  fusion_modes["ynn"] = {"dot", "greedy"};
+#endif
 
 #if XLA_ONEDNN_USE_GRAPH_API
   // Don't test oneDNN if we don't build with it.
@@ -595,7 +602,8 @@ TEST_P(CpuLibraryFusionTypeTest, JoiningFusions) {
   }
 }
 
-TEST_P(CpuLibraryFusionTypeTest, Reduce) {
+// TODO(penporn): Re-enable this test when YNNPACK supports reduce.
+TEST_P(CpuLibraryFusionTypeTest, DISABLED_Reduce) {
   const absl::string_view hlo_template = R"(
     HloModule reduce
 
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 300eaaa319baa9..843e96ab9ab445 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -1007,9 +1007,8 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
   // XNNPACK ops availability checks depend on the layout information,
   // so until another solution is developed the passes creating XNNPACK fusions
   // have to run after layout assignment.
-  const bool use_ynnpack = absl::c_linear_search(
-      debug_options.xla_cpu_experimental_ynn_fusion_type(),
-      DebugOptions::LIBRARY_FUSION_TYPE_REDUCE);
+  const bool use_ynnpack =
+      !debug_options.xla_cpu_experimental_ynn_fusion_type().empty();
   LibraryRewriterOptions options = {
       /*use_onednn=*/debug_options.xla_cpu_use_onednn(),
       /*use_xnnpack=*/debug_options.xla_cpu_use_xnnpack(),

From f7391ff899c306243ec060fb5c48ee8ea1155642 Mon Sep 17 00:00:00 2001
From: Chunlei Niu <niuchl@google.com>
Date: Thu, 11 Dec 2025 20:18:30 -0800
Subject: [PATCH 205/753] Include Interpreter Java API in LiteRT Maven package.

PiperOrigin-RevId: 843483402
---
 tensorflow/lite/java/BUILD | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD
index 3249969563db1d..00fdb0c9b77f18 100644
--- a/tensorflow/lite/java/BUILD
+++ b/tensorflow/lite/java/BUILD
@@ -39,6 +39,21 @@ exports_files([
     "tflite_version_script.lds",
 ])
 
+exports_files([
+    # go/keep-sorted start
+    "src/main/java/org/tensorflow/lite/DataType.java",
+    "src/main/java/org/tensorflow/lite/DataTypeUtils.java",
+    "src/main/java/org/tensorflow/lite/InterpreterFactory.java",
+    "src/main/java/org/tensorflow/lite/NativeInterpreterWrapperExperimental.java",
+    "src/main/java/org/tensorflow/lite/NativeSignatureRunnerWrapper.java",
+    "src/main/java/org/tensorflow/lite/RuntimeFlavor.java",
+    "src/main/java/org/tensorflow/lite/Tensor.java",
+    "src/main/java/org/tensorflow/lite/TensorImpl.java",
+    "src/main/java/org/tensorflow/lite/annotations/UsedByReflection.java",
+    "src/main/java/org/tensorflow/lite/package-info.java",
+    # go/keep-sorted end
+])
+
 #-----------------------------------------------------------------------------
 # Filegroup targets.
 
@@ -928,6 +943,17 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
+filegroup(
+    name = "portable_tests_for_litert",
+    srcs = [
+        "src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java",
+        "src/test/java/org/tensorflow/lite/SupportedFeatures.java",
+        "src/test/java/org/tensorflow/lite/TensorFlowLiteTest.java",
+        "src/test/java/org/tensorflow/lite/TestInit.java",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 # portable_flex_tests includes files for testing interpreter with Flex delegate.
 filegroup(
     name = "portable_flex_tests",

From 3917bee74b369e877ef66f04eb97672f27f122cc Mon Sep 17 00:00:00 2001
From: Marcello Maggioni <maggioni@google.com>
Date: Thu, 11 Dec 2025 20:46:14 -0800
Subject: [PATCH 206/753] [XLA] Adding new attribute for the scheduler that
 allows force delaying the start only of async operations

Also guarantees that the starts are performed in the same order as the scheduling of the dones.

PiperOrigin-RevId: 843492464
---
 .../xla/service/latency_hiding_scheduler.cc   | 22 +++++++
 .../xla/service/latency_hiding_scheduler.h    |  1 +
 .../service/latency_hiding_scheduler_test.cc  | 60 +++++++++++++++++++
 3 files changed, 83 insertions(+)

diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.cc b/third_party/xla/xla/service/latency_hiding_scheduler.cc
index 62995d048976f9..1fd1e01dd9dd11 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.cc
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.cc
@@ -163,6 +163,23 @@ int GetCustomCallForceDelayPriority(const HloInstruction* instr) {
   return 0;
 }
 
+bool HasForceDelayAsyncAttribute(const HloInstruction* instr) {
+  auto attr = instr->get_frontend_attribute("scheduler_hint");
+  return attr.has_value() && attr.value() == "force_delay_async";
+}
+
+const HloGraphNode* AnyStartHasForceDelay(const HloGraphNode* n) {
+  CHECK(n->IsSupportedAsyncDone())
+      << "Meant to check if any start feeding a done has forced delay";
+  for (auto& v : n->GetPredecessors()) {
+    if (v.Target().IsSupportedAsyncStart() &&
+        HasForceDelayAsyncAttribute(&v.Target().GetInstr())) {
+      return v.TargetPtr();
+    }
+  }
+  return nullptr;
+}
+
 absl::flat_hash_map<int64_t, int64_t>
 GetNumResourcesNeededForAnnotationWithKeepOriginalOrderAttrs(
     const DefaultSchedulerCore::SchedulingState& sched_state,
@@ -2622,6 +2639,9 @@ HloScheduleGraph::HloScheduleGraph(
       n->SetForceDelay(true);
       n->SetForceDelayPriority(GetCustomCallForceDelayPriority(instr));
     }
+    if (n->IsSupportedAsyncStart() && HasForceDelayAsyncAttribute(instr)) {
+      n->SetForceDelay(true);
+    }
   }
 
   // num_predecessors[i]: number of predecessors for instruction number "i"
@@ -3001,6 +3021,8 @@ absl::Status DefaultSchedulerCore::InitializeScheduler(
               continue;
             }
             if (count > it->second) {
+              VLOG(5) << "Cross overlap limit for resource: " << resource
+                      << " count: " << count << " limit: " << it->second;
               return true;
             }
           }
diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.h b/third_party/xla/xla/service/latency_hiding_scheduler.h
index 01630bbaa5bf5e..5f4f6d449162be 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.h
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
+#include <deque>
 #include <functional>
 #include <limits>
 #include <memory>
diff --git a/third_party/xla/xla/service/latency_hiding_scheduler_test.cc b/third_party/xla/xla/service/latency_hiding_scheduler_test.cc
index 9a3dd47c32949b..c1bc14e0268974 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler_test.cc
+++ b/third_party/xla/xla/service/latency_hiding_scheduler_test.cc
@@ -812,6 +812,66 @@ ENTRY %module {
   EXPECT_TRUE(result.value());
 }
 
+TEST_F(LatencyHidingSchedulerTest, ForceDelayAsyncAllGather) {
+  absl::string_view hlo_string = R"(
+HloModule module, is_scheduled=true
+
+ENTRY %module {
+  %constant.19 = u32[] constant(1)
+  %replica_id = u32[]{:T(128)} replica-id()
+  %add.1 = u32[]{:T(128)} add(replica_id, constant.19)
+  %convert = f32[]{:T(128)} convert(u32[]{:T(128)} %replica_id)
+  %convert.1 = f32[]{:T(128)} convert(u32[]{:T(128)} %add.1)
+  %color_operand.1 = f32[8,256,256]{2,1,0:T(8,128)} broadcast(f32[]{:T(128)} %convert), dimensions={}
+  %color_operand.2 = f32[8,256,256]{2,1,0:T(8,128)} broadcast(f32[]{:T(128)} %convert.1), dimensions={}
+  %ag-start.2 = (f32[8,256,256], f32[16,256,256]) all-gather-start(f32[8,256,256] %color_operand.2), replica_groups={{0,1}}, dimensions={0},
+    metadata={op_type="AllGather" op_name="ag1"}
+  %ag-start = (f32[8,256,256], f32[16,256,256]) all-gather-start(f32[8,256,256] %color_operand.1), replica_groups={{0,1}}, dimensions={0},
+    frontend_attributes={scheduler_hint="force_delay_async"},
+    metadata={op_type="AllGather" op_name="ag0"}
+  %ag-done = f32[16,256,256] all-gather-done((f32[8,256,256], f32[16,256,256]) %ag-start),
+    metadata={op_type="AllGather" op_name="ag0"}
+  %ag-done.2 = f32[16,256,256] all-gather-done((f32[8,256,256], f32[16,256,256]) %ag-start.2),
+    metadata={op_type="AllGather" op_name="ag1"}
+  p0 = f32[16,64,256]{2,1,0} parameter(0)
+  p1 = f32[16,64,256]{2,1,0} parameter(1)
+  c0 = f32[16,256,256]{2,1,0} convolution(p0, p1),
+    window={size=16 stride=15 lhs_dilate=16}, dim_labels=0fb_0io->0fb
+  ROOT a2 = f32[16,256,256]{2,1,0} add(%ag-done, %ag-done.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloText(hlo_string));
+  HloSchedule& module_schedule = hlo_module->schedule();
+  EXPECT_TRUE(hlo_module->has_entry_computation());
+  HloComputation* entry_computation = hlo_module->entry_computation();
+  std::vector<HloInstruction*> original_instruction_sequence =
+      module_schedule.sequence(entry_computation).instructions();
+
+  TF_EXPECT_OK(RunScheduler(hlo_module.get()));
+  std::vector<HloInstruction*> new_instruction_sequence =
+      module_schedule.sequence(entry_computation).instructions();
+
+  if (VLOG_IS_ON(1)) {
+    for (auto* new_i : new_instruction_sequence) {
+      VLOG(1) << new_i->ToString();
+    }
+  }
+
+  // The all-gather with force_delay_async (ag0) should be scheduled earlier
+  // than ag1 because force_delay_async affects the scheduling priority.
+  EXPECT_LT(GetOpcodeIndexUsingMetaData(HloOpcode::kAllGatherStart,
+                                        new_instruction_sequence, "ag0"),
+            GetOpcodeIndexUsingMetaData(HloOpcode::kAllGatherStart,
+                                        new_instruction_sequence, "ag1"));
+
+  // Check the order stays the same for the dones.
+  EXPECT_LT(GetOpcodeIndexUsingMetaData(HloOpcode::kAllGatherDone,
+                                        new_instruction_sequence, "ag0"),
+            GetOpcodeIndexUsingMetaData(HloOpcode::kAllGatherDone,
+                                        new_instruction_sequence, "ag1"));
+}
+
 TEST_F(LatencyHidingSchedulerTest, WhileLoopAliasingBug2) {
   // Like WhileLoopAliasingBug above, but this time the input buffer of the
   // first collective permute aliases with the output buffer of the second

From 0668ace3e1e2da0026787bae3828d10e8845b81a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 21:29:11 -0800
Subject: [PATCH 207/753] Automated Code Change

PiperOrigin-RevId: 843507538
---
 third_party/xla/xla/service/gpu/tests/regression_dot_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/tests/regression_dot_test.cc b/third_party/xla/xla/service/gpu/tests/regression_dot_test.cc
index d5bce4d4325985..fee55012b58f32 100755
--- a/third_party/xla/xla/service/gpu/tests/regression_dot_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/regression_dot_test.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cstdint>
 #include <utility>
 
 #include <gtest/gtest.h>

From 8c57d32c2532dff9cd79cfbd8cbefae4b5ad1d40 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 22:06:03 -0800
Subject: [PATCH 208/753] Automated Code Change

PiperOrigin-RevId: 843517944
---
 third_party/xla/xla/pjrt/cpu/BUILD                    |  2 ++
 third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc   | 11 -----------
 .../xla/xla/pjrt/cpu/cpu_pjrt_compiler_test.cc        |  2 ++
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/pjrt/cpu/BUILD b/third_party/xla/xla/pjrt/cpu/BUILD
index 0756d7beac73bb..2743355a6ed4c0 100644
--- a/third_party/xla/xla/pjrt/cpu/BUILD
+++ b/third_party/xla/xla/pjrt/cpu/BUILD
@@ -354,6 +354,8 @@ xla_cc_test(
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt/plugin/xla_cpu:cpu_topology",
         "//xla/pjrt/plugin/xla_cpu:cpu_topology_description",
+        "//xla/pjrt/proto:compile_options_proto_cc",
+        "//xla/service/cpu:executable_proto_cc",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test_main",
         "@com_google_absl//absl/strings:string_view",
diff --git a/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc b/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc
index f330c75ca62e13..a62f5a8541060d 100644
--- a/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/abstract_cpu_buffer.cc
@@ -18,21 +18,10 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
-#include <memory>
 #include <optional>
-#include <utility>
-#include <vector>
 
 #include "absl/base/casts.h"
-#include "absl/container/inlined_vector.h"
-#include "absl/functional/any_invocable.h"
 #include "absl/log/check.h"
-#include "absl/log/log.h"
-#include "absl/status/status.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
-#include "absl/strings/string_view.h"
-#include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/backends/cpu/alignment.h"
 #include "xla/layout_util.h"
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/cpu/cpu_pjrt_compiler_test.cc
index e8fd02e09bc7c8..5d700cb535019a 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_pjrt_compiler_test.cc
@@ -35,6 +35,8 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/plugin/xla_cpu/cpu_topology.h"
 #include "xla/pjrt/plugin/xla_cpu/cpu_topology_description.h"
+#include "xla/pjrt/proto/compile_options.pb.h"
+#include "xla/service/cpu/executable.pb.h"
 #include "xla/tsl/platform/statusor.h"
 
 namespace xla::cpu {

From aaf62abff4f633dc90343c56abd38172566ada30 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 22:11:38 -0800
Subject: [PATCH 209/753] Automated Code Change

PiperOrigin-RevId: 843519808
---
 third_party/xla/xla/backends/cpu/runtime/BUILD                  | 2 ++
 third_party/xla/xla/backends/cpu/runtime/conditional_thunk.cc   | 1 +
 .../xla/xla/backends/cpu/runtime/conditional_thunk_test.cc      | 1 +
 3 files changed, 4 insertions(+)

diff --git a/third_party/xla/xla/backends/cpu/runtime/BUILD b/third_party/xla/xla/backends/cpu/runtime/BUILD
index 026a0476d92786..e193153dd5b85b 100644
--- a/third_party/xla/xla/backends/cpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/BUILD
@@ -324,6 +324,7 @@ cc_library(
         ":thunk_executor",
         "//xla:shape_util",
         "//xla:util",
+        "//xla:xla_data_proto_cc",
         "//xla/runtime:buffer_use",
         "//xla/service:buffer_assignment",
         "//xla/stream_executor:device_address",
@@ -345,6 +346,7 @@ xla_cc_test(
         ":thunk",
         ":thunk_testlib",
         "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
         "//xla/runtime:buffer_use",
         "//xla/runtime:resource_use",
         "//xla/service:buffer_assignment",
diff --git a/third_party/xla/xla/backends/cpu/runtime/conditional_thunk.cc b/third_party/xla/xla/backends/cpu/runtime/conditional_thunk.cc
index 17ee2002d1b6b0..b324c7bf4b3b7c 100644
--- a/third_party/xla/xla/backends/cpu/runtime/conditional_thunk.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/conditional_thunk.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla::cpu {
 namespace {
diff --git a/third_party/xla/xla/backends/cpu/runtime/conditional_thunk_test.cc b/third_party/xla/xla/backends/cpu/runtime/conditional_thunk_test.cc
index b2a918018fc137..019800e844304d 100644
--- a/third_party/xla/xla/backends/cpu/runtime/conditional_thunk_test.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/conditional_thunk_test.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "xla/shape_util.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla::cpu {
 namespace {

From 4906bccaeae8fa7ba05411e788e9051f9e4b56f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 11 Dec 2025 22:46:21 -0800
Subject: [PATCH 210/753] Add PyClif proto library for XPlane.

PiperOrigin-RevId: 843529599
---
 .../xla/third_party/tsl/tsl/profiler/protobuf/BUILD   | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD b/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD
index 0b151f9ab38b3c..dcac9f547373b8 100644
--- a/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD
+++ b/third_party/xla/third_party/tsl/tsl/profiler/protobuf/BUILD
@@ -1,3 +1,4 @@
+# copybara:uncomment(oss-unused) load("//devtools/clif/python:clif_build_rule.bzl", "pyclif_proto_library")
 # copybara:uncomment(oss-unused) load("//net/grpc/go/build_defs:go_grpc_library.bzl", "go_grpc_library")
 # Placeholder: load py_proto_library
 load("@local_xla//xla/tsl:tsl.bzl", "internal_visibility")
@@ -19,6 +20,16 @@ tf_proto_library(
     visibility = internal_visibility([":friends"]),
 )
 
+# copybara:uncomment_begin(google-only)
+# pyclif_proto_library(
+#     name = "xplane_pyclif",
+#     proto_lib = ":xplane_proto",
+#     visibility = [
+#         "//visibility:public",
+#     ],
+# )
+# copybara:uncomment_end
+
 tf_proto_library(
     name = "profiler_options_proto",
     srcs = ["profiler_options.proto"],

From ff5a8e6365ce800ea0aab07373ee7c08acfa77e3 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 11 Dec 2025 23:02:08 -0800
Subject: [PATCH 211/753] [stream_executor] Move MemoryType to separate header
 and rename to MemorySpace

PiperOrigin-RevId: 843533783
---
 third_party/xla/xla/stream_executor/BUILD     |  9 +++++
 .../xla/stream_executor/cuda/cuda_executor.cc | 28 +++++++-------
 .../xla/stream_executor/cuda/cuda_executor.h  |  4 +-
 .../cuda/cuda_executor_multigpu_test.cc       |  2 +-
 .../cuda/cuda_executor_test.cc                | 24 ++++++------
 .../stream_executor/gpu/gpu_executor_test.cc  | 10 ++---
 .../xla/stream_executor/host/host_executor.cc |  4 +-
 .../xla/stream_executor/host/host_executor.h  |  2 +-
 .../integrations/stream_executor_allocator.cc | 18 ++++-----
 .../integrations/stream_executor_allocator.h  |  4 +-
 .../stream_executor_allocator_test.cc         | 10 ++---
 .../xla/stream_executor/memory_allocator.h    |  7 +++-
 .../xla/xla/stream_executor/memory_space.h    | 38 +++++++++++++++++++
 .../stream_executor/mock_stream_executor.h    |  3 +-
 .../xla/stream_executor/rocm/rocm_executor.cc | 24 ++++++------
 .../xla/stream_executor/rocm/rocm_executor.h  |  4 +-
 .../rocm/rocm_executor_test.cc                |  8 ++--
 .../xla/xla/stream_executor/stream_executor.h |  9 ++---
 18 files changed, 130 insertions(+), 78 deletions(-)
 create mode 100644 third_party/xla/xla/stream_executor/memory_space.h

diff --git a/third_party/xla/xla/stream_executor/BUILD b/third_party/xla/xla/stream_executor/BUILD
index b94670ea68a49c..b7336030ff9e37 100644
--- a/third_party/xla/xla/stream_executor/BUILD
+++ b/third_party/xla/xla/stream_executor/BUILD
@@ -255,6 +255,7 @@ cc_library(
         ":kernel_spec",
         ":memory_allocation",
         ":memory_allocator",
+        ":memory_space",
         ":module_spec",
         ":platform",
         ":stream",
@@ -484,9 +485,11 @@ cc_library(
         ":kernel_spec",
         ":memory_allocation",
         ":memory_allocator",
+        ":memory_space",
         ":module_spec",
         ":platform",
         ":stream",
+        ":tensor_map",
         "//xla/stream_executor/gpu:tma_metadata",
         "//xla/tsl/lib/gtl:int_type",
         "@com_google_absl//absl/base:core_headers",
@@ -518,6 +521,12 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "memory_space",
+    hdrs = ["memory_space.h"],
+    deps = ["@com_google_absl//absl/base:core_headers"],
+)
+
 cc_library(
     name = "generic_memory_allocator",
     hdrs = ["generic_memory_allocator.h"],
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index b2e7027ce1a567..39e49d407abe47 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -1016,8 +1016,8 @@ absl::Status CollectiveMemoryDeallocate(StreamExecutor* executor,
 }
 
 absl::StatusOr<std::unique_ptr<MemoryAllocator>>
-CudaExecutor::CreateMemoryAllocator(MemoryType type) {
-  if (type == MemoryType::kUnified) {
+CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
+  if (type == MemorySpace::kUnified) {
     return std::make_unique<GenericMemoryAllocator>(
         [this](uint64_t size)
             -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
@@ -1049,7 +1049,7 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
         });
   }
 
-  if (type == MemoryType::kCollective) {
+  if (type == MemorySpace::kCollective) {
     return std::make_unique<GenericMemoryAllocator>(
         [this](uint64_t size)
             -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
@@ -1073,7 +1073,7 @@ CudaExecutor::CreateMemoryAllocator(MemoryType type) {
         });
   }
 
-  if (type == MemoryType::kHost) {
+  if (type == MemorySpace::kHost) {
     return std::make_unique<GenericMemoryAllocator>([this](uint64_t size) {
       return AllocateHostMemory(cuda_context_, numa_node_, size);
     });
@@ -1408,7 +1408,7 @@ DeviceAddressBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
       << "CudaExecutor::Allocate size: " << size
       << " memory_space: " << memory_space;
 
-  if (memory_space == static_cast<int64_t>(MemoryType::kCollective)) {
+  if (memory_space == static_cast<int64_t>(MemorySpace::kCollective)) {
     auto result = CollectiveMemoryAllocate(this, size);
     if (!result.ok()) {
       XLA_LOG_DEVICE(ERROR, device_ordinal())
@@ -1419,7 +1419,7 @@ DeviceAddressBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
     return DeviceAddressBase(result.value(), size);
   }
 
-  if (memory_space == static_cast<int64_t>(MemoryType::kHost)) {
+  if (memory_space == static_cast<int64_t>(MemorySpace::kHost)) {
     auto result = HostAllocate(cuda_context_, numa_node_, size);
     if (!result.ok()) {
       XLA_LOG_DEVICE(ERROR, device_ordinal())
@@ -1431,7 +1431,7 @@ DeviceAddressBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
     return DeviceAddressBase(result.value(), size);
   }
 
-  if (memory_space == static_cast<int64_t>(MemoryType::kP2P) &&
+  if (memory_space == static_cast<int64_t>(MemorySpace::kP2P) &&
       is_vmm_supported_) {
     auto device_buf_base = VmmAllocateMemory(size);
 
@@ -1445,8 +1445,8 @@ DeviceAddressBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
     return DeviceAddressBase(nullptr, 0);
   }
 
-  CHECK(memory_space == static_cast<int64_t>(MemoryType::kDevice) ||
-        memory_space == static_cast<int64_t>(MemoryType::kP2P));
+  CHECK(memory_space == static_cast<int64_t>(MemorySpace::kDevice) ||
+        memory_space == static_cast<int64_t>(MemorySpace::kP2P));
 
   auto device_buf_base = DeviceAllocate(cuda_context_, size);
   XLA_VLOG_DEVICE(1, device_ordinal())
@@ -1469,7 +1469,7 @@ void CudaExecutor::Deallocate(DeviceAddressBase* mem) {
     return;
   }
   auto memory_space = status_or_memory_space.value();
-  if (memory_space == MemoryType::kHost) {
+  if (memory_space == MemorySpace::kHost) {
     HostDeallocate(cuda_context_, numa_node_, mem->opaque(), mem->size());
   } else {
     // Memory space is always kDevice here, so the only way to check if the
@@ -1899,7 +1899,7 @@ CudaExecutor::CreateDeviceDescription(int device_ordinal) {
   return std::make_unique<DeviceDescription>(std::move(desc));
 }
 
-absl::StatusOr<MemoryType> CudaExecutor::GetPointerMemorySpace(
+absl::StatusOr<MemorySpace> CudaExecutor::GetPointerMemorySpace(
     const void* ptr) {
   CUdeviceptr pointer = reinterpret_cast<CUdeviceptr>(const_cast<void*>(ptr));
   unsigned int is_managed;
@@ -1907,7 +1907,7 @@ absl::StatusOr<MemoryType> CudaExecutor::GetPointerMemorySpace(
       &is_managed, CU_POINTER_ATTRIBUTE_IS_MANAGED, pointer)));
 
   if (is_managed) {
-    return MemoryType::kUnified;
+    return MemorySpace::kUnified;
   }
 
   unsigned int value;
@@ -1915,9 +1915,9 @@ absl::StatusOr<MemoryType> CudaExecutor::GetPointerMemorySpace(
       &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, pointer)));
   switch (value) {
     case CU_MEMORYTYPE_DEVICE:
-      return MemoryType::kDevice;
+      return MemorySpace::kDevice;
     case CU_MEMORYTYPE_HOST:
-      return MemoryType::kHost;
+      return MemorySpace::kHost;
     default:
       return absl::InternalError(
           absl::StrCat("unknown memory space provided by CUDA API: ", value));
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
index 4c92d0eac36255..ff9c0c3d49a165 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
@@ -114,7 +114,7 @@ class CudaExecutor : public GpuExecutor {
   bool HostMemoryRegister(void* location, uint64_t size) override;
   bool HostMemoryUnregister(void* location) override;
 
-  absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override;
+  absl::StatusOr<MemorySpace> GetPointerMemorySpace(const void* ptr) override;
 
   Stream* FindAllocatedStream(void* gpu_stream) override {
     absl::MutexLock lock(alive_gpu_streams_mu_);
@@ -138,7 +138,7 @@ class CudaExecutor : public GpuExecutor {
   absl::StatusOr<TensorMap> CreateTensorMap(const TmaDescriptor& tma_desc,
                                             void* global_address) override;
   absl::StatusOr<std::unique_ptr<MemoryAllocator>> CreateMemoryAllocator(
-      MemoryType type) override;
+      MemorySpace type) override;
 
   // Returns the granularity which is the minimum unit of memory that can be
   // allocated with VMM API. In order to map the memory slices to multicast
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
index 1bd00cb53a35bb..9ad1336dc5343f 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
@@ -45,7 +45,7 @@ template <typename T>
 absl::StatusOr<stream_executor::DeviceAddressBase> AllocateInitializedMemory(
     CudaExecutor* executor, size_t size, size_t offset, T value) {
   stream_executor::DeviceAddressBase device_memory = executor->Allocate(
-      size + offset, static_cast<int64_t>(stream_executor::MemoryType::kP2P));
+      size + offset, static_cast<int64_t>(stream_executor::MemorySpace::kP2P));
   if (device_memory.opaque() == nullptr) {
     return absl::InternalError("Failed to allocate memory.");
   }
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
index 076495e91cf41e..8b6c6ea3491fe9 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
@@ -115,7 +115,7 @@ TEST(CudaExecutorTest, CreateUnifiedMemoryAllocatorWorks) {
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MemoryAllocator> allocator,
-      executor->CreateMemoryAllocator(MemoryType::kUnified));
+      executor->CreateMemoryAllocator(MemorySpace::kUnified));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
   EXPECT_NE(allocation->opaque(), nullptr);
@@ -128,7 +128,7 @@ TEST(CudaExecutorTest, CreateHostMemoryAllocatorWorks) {
   TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocator> allocator,
-                          executor->CreateMemoryAllocator(MemoryType::kHost));
+                          executor->CreateMemoryAllocator(MemorySpace::kHost));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
   EXPECT_NE(allocation->opaque(), nullptr);
@@ -142,7 +142,7 @@ TEST(CudaExecutorTest, CreateCollectiveMemoryAllocatorWorks) {
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MemoryAllocator> allocator,
-      executor->CreateMemoryAllocator(MemoryType::kCollective));
+      executor->CreateMemoryAllocator(MemorySpace::kCollective));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
   EXPECT_NE(allocation->opaque(), nullptr);
@@ -158,7 +158,7 @@ TEST(CudaExecutorTest,
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MemoryAllocator> allocator,
-      executor->CreateMemoryAllocator(MemoryType::kCollective));
+      executor->CreateMemoryAllocator(MemorySpace::kCollective));
   constexpr uint64_t kTooBig = 1125899906842624;  // 1 PiB
   EXPECT_THAT(
       allocator->Allocate(kTooBig),
@@ -173,7 +173,7 @@ TEST(CudaExecutorTest, CreateUnsupportedMemoryAllocatorsFail) {
                           PlatformManager::PlatformWithName("CUDA"));
   TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
                           platform->ExecutorForDevice(0));
-  EXPECT_THAT(executor->CreateMemoryAllocator(MemoryType::kDevice),
+  EXPECT_THAT(executor->CreateMemoryAllocator(MemorySpace::kDevice),
               Not(absl_testing::IsOk()));
 }
 
@@ -185,12 +185,12 @@ TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithUnifiedMemory) {
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto unified_memory_allocator,
-      executor->CreateMemoryAllocator(MemoryType::kUnified));
+      executor->CreateMemoryAllocator(MemorySpace::kUnified));
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           unified_memory_allocator->Allocate(256));
   EXPECT_THAT(executor->GetPointerMemorySpace(allocation->opaque()),
-              absl_testing::IsOkAndHolds(MemoryType::kUnified));
+              absl_testing::IsOkAndHolds(MemorySpace::kUnified));
 }
 
 TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithHostMemory) {
@@ -202,7 +202,7 @@ TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithHostMemory) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           executor->HostMemoryAllocate(256));
   EXPECT_THAT(executor->GetPointerMemorySpace(allocation->opaque()),
-              absl_testing::IsOkAndHolds(MemoryType::kHost));
+              absl_testing::IsOkAndHolds(MemorySpace::kHost));
 }
 
 TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithDeviceAddress) {
@@ -214,7 +214,7 @@ TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithDeviceAddress) {
   DeviceAddressBase allocation = executor->Allocate(256);
   EXPECT_NE(allocation.opaque(), nullptr);
   EXPECT_THAT(executor->GetPointerMemorySpace(allocation.opaque()),
-              absl_testing::IsOkAndHolds(MemoryType::kDevice));
+              absl_testing::IsOkAndHolds(MemorySpace::kDevice));
 }
 
 TEST(CudaExecutorTest, AllocateMemoryWithVmmApi) {
@@ -226,12 +226,12 @@ TEST(CudaExecutorTest, AllocateMemoryWithVmmApi) {
   auto cuda_executor = dynamic_cast<CudaExecutor*>(executor);
   ASSERT_NE(cuda_executor, nullptr);
   DeviceAddressBase ptr =
-      cuda_executor->Allocate(1024, static_cast<int>(MemoryType::kP2P));
+      cuda_executor->Allocate(1024, static_cast<int>(MemorySpace::kP2P));
 
   EXPECT_NE(ptr.opaque(), nullptr);
   EXPECT_EQ(ptr.size(), 1024);
   EXPECT_THAT(executor->GetPointerMemorySpace(ptr.opaque()),
-              absl_testing::IsOkAndHolds(MemoryType::kDevice));
+              absl_testing::IsOkAndHolds(MemorySpace::kDevice));
 
   TF_ASSERT_OK_AND_ASSIGN(CudaExecutor::VmmMemoryHandle handle,
                           cuda_executor->RetainVmmMemoryHandle(ptr.opaque()));
@@ -248,7 +248,7 @@ TEST(CudaExecutorTest,
   auto cuda_executor = dynamic_cast<CudaExecutor*>(executor);
   ASSERT_NE(cuda_executor, nullptr);
   DeviceAddressBase ptr =
-      cuda_executor->Allocate(1024, static_cast<int>(MemoryType::kDevice));
+      cuda_executor->Allocate(1024, static_cast<int>(MemorySpace::kDevice));
 
   EXPECT_NE(ptr.opaque(), nullptr);
   EXPECT_EQ(ptr.size(), 1024);
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
index 4d0bb1b1e3c711..fabea8c509c04f 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
@@ -44,17 +44,17 @@ TEST_F(GetPointerMemorySpaceTest, Host) {
   TF_ASSERT_OK_AND_ASSIGN(auto host_ptr, executor->HostMemoryAllocate(64));
   TF_ASSERT_OK_AND_ASSIGN(auto memory_space,
                           executor->GetPointerMemorySpace(host_ptr->opaque()));
-  EXPECT_EQ(memory_space, MemoryType::kHost);
+  EXPECT_EQ(memory_space, MemorySpace::kHost);
 }
 
 TEST_F(GetPointerMemorySpaceTest, HostAllocatedWithMemoryKind) {
   StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
   DeviceAddressBase host_ptr = executor->Allocate(
-      64, static_cast<int64_t>(stream_executor::MemoryType::kHost));
+      64, static_cast<int64_t>(stream_executor::MemorySpace::kHost));
   EXPECT_FALSE(host_ptr.is_null());
-  TF_ASSERT_OK_AND_ASSIGN(MemoryType memory_space,
+  TF_ASSERT_OK_AND_ASSIGN(MemorySpace memory_space,
                           executor->GetPointerMemorySpace(host_ptr.opaque()));
-  EXPECT_EQ(memory_space, MemoryType::kHost);
+  EXPECT_EQ(memory_space, MemorySpace::kHost);
   executor->Deallocate(&host_ptr);
 }
 
@@ -64,7 +64,7 @@ TEST_F(GetPointerMemorySpaceTest, Device) {
   ASSERT_NE(mem, nullptr);
   TF_ASSERT_OK_AND_ASSIGN(auto memory_space,
                           executor->GetPointerMemorySpace(mem.opaque()));
-  EXPECT_EQ(memory_space, MemoryType::kDevice);
+  EXPECT_EQ(memory_space, MemorySpace::kDevice);
   executor->Deallocate(&mem);
 }
 
diff --git a/third_party/xla/xla/stream_executor/host/host_executor.cc b/third_party/xla/xla/stream_executor/host/host_executor.cc
index a8f70bd25ccac0..151d7b51306d24 100644
--- a/third_party/xla/xla/stream_executor/host/host_executor.cc
+++ b/third_party/xla/xla/stream_executor/host/host_executor.cc
@@ -148,8 +148,8 @@ absl::StatusOr<std::unique_ptr<Stream>> HostExecutor::CreateStream(
 }
 
 absl::StatusOr<std::unique_ptr<MemoryAllocator>>
-HostExecutor::CreateMemoryAllocator(MemoryType type) {
-  if (type == MemoryType::kHost) {
+HostExecutor::CreateMemoryAllocator(MemorySpace type) {
+  if (type == MemorySpace::kHost) {
     return std::make_unique<GenericMemoryAllocator>(
         [](uint64_t size) -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
           void* ptr = new char[size];
diff --git a/third_party/xla/xla/stream_executor/host/host_executor.h b/third_party/xla/xla/stream_executor/host/host_executor.h
index 50475bb0116296..69e40a59a880bf 100644
--- a/third_party/xla/xla/stream_executor/host/host_executor.h
+++ b/third_party/xla/xla/stream_executor/host/host_executor.h
@@ -101,7 +101,7 @@ class HostExecutor : public StreamExecutorCommon {
   absl::StatusOr<std::unique_ptr<Stream>> CreateStream(
       std::optional<std::variant<StreamPriority, int>> priority) override;
   absl::StatusOr<std::unique_ptr<MemoryAllocator>> CreateMemoryAllocator(
-      MemoryType type) override;
+      MemorySpace type) override;
 
  private:
   int device_ordinal_;
diff --git a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc
index bcdeebf8eb022b..c49e1b17c61e49 100644
--- a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc
+++ b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc
@@ -31,7 +31,7 @@ limitations under the License.
 namespace stream_executor {
 
 StreamExecutorAllocator::StreamExecutorAllocator(
-    std::unique_ptr<MemoryAllocator> memory_allocator, MemoryType memory_type,
+    std::unique_ptr<MemoryAllocator> memory_allocator, MemorySpace memory_type,
     int index, const std::vector<Visitor>& alloc_visitors,
     const std::vector<Visitor>& free_visitors)
     : tsl::SubAllocator(alloc_visitors, free_visitors),
@@ -39,16 +39,16 @@ StreamExecutorAllocator::StreamExecutorAllocator(
       memory_type_(memory_type),
       index_(index) {}
 
-// Converts MemoryType to a human-readable string for allocation error messages
-static absl::string_view MemoryTypeToString(MemoryType type) {
+// Converts MemorySpace to a human-readable string for allocation error messages
+static absl::string_view MemorySpaceToString(MemorySpace type) {
   switch (type) {
-    case MemoryType::kDevice:
+    case MemorySpace::kDevice:
       return "device";
-    case MemoryType::kUnified:
+    case MemorySpace::kUnified:
       return "unified";
-    case MemoryType::kHost:
+    case MemorySpace::kHost:
       return "pinned host";
-    case MemoryType::kCollective:
+    case MemorySpace::kCollective:
       return "collective";
     default:
       return "unknown";
@@ -64,7 +64,7 @@ void* StreamExecutorAllocator::Alloc(size_t alignment, size_t num_bytes,
   if (num_bytes > 0) {
     auto allocation = memory_allocator_->Allocate(num_bytes);
     if (!allocation.ok()) {
-      LOG(WARNING) << "could not allocate " << MemoryTypeToString(memory_type_)
+      LOG(WARNING) << "could not allocate " << MemorySpaceToString(memory_type_)
                    << " of size: " << num_bytes;
       *bytes_received = 0;
       return nullptr;
@@ -95,7 +95,7 @@ void StreamExecutorAllocator::Free(void* ptr, size_t num_bytes) {
 bool StreamExecutorAllocator::SupportsCoalescing() const { return false; }
 
 tsl::AllocatorMemoryType StreamExecutorAllocator::GetMemoryType() const {
-  if (memory_type_ == MemoryType::kHost) {
+  if (memory_type_ == MemorySpace::kHost) {
     return tsl::AllocatorMemoryType::kHostPinned;
   } else {
     return tsl::AllocatorMemoryType::kDevice;
diff --git a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.h b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.h
index 312de7bbba9616..8b104ca784c66e 100644
--- a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.h
+++ b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.h
@@ -35,7 +35,7 @@ namespace stream_executor {
 class StreamExecutorAllocator : public tsl::SubAllocator {
  public:
   StreamExecutorAllocator(std::unique_ptr<MemoryAllocator> memory_allocator,
-                          MemoryType memory_type, int index,
+                          MemorySpace memory_type, int index,
                           const std::vector<Visitor>& alloc_visitors = {},
                           const std::vector<Visitor>& free_visitors = {});
 
@@ -48,7 +48,7 @@ class StreamExecutorAllocator : public tsl::SubAllocator {
 
  private:
   std::unique_ptr<MemoryAllocator> memory_allocator_;
-  MemoryType memory_type_;
+  MemorySpace memory_type_;
   int index_;
 
   StreamExecutorAllocator(const StreamExecutorAllocator&) = delete;
diff --git a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator_test.cc b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator_test.cc
index 84e2580aff9d90..8a40b3b8c796c1 100644
--- a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator_test.cc
+++ b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator_test.cc
@@ -39,7 +39,7 @@ TEST(StreamExecutorAllocatorTest, NoMemoryReturnsNullptr) {
       });
 
   StreamExecutorAllocator stream_executor_allocator(
-      std::move(allocator), MemoryType::kHost, /*index=*/0,
+      std::move(allocator), MemorySpace::kHost, /*index=*/0,
       /*alloc_visitors=*/{},
       /*free_visitors=*/{});
   size_t bytes_received = 0;
@@ -55,7 +55,7 @@ TEST(StreamExecutorAllocatorTest, DoesntSupportCoalescing) {
         return absl::InternalError("Failed to allocate memory");
       });
   StreamExecutorAllocator stream_executor_allocator(
-      std::move(allocator), MemoryType::kHost, /*index=*/0,
+      std::move(allocator), MemorySpace::kHost, /*index=*/0,
       /*alloc_visitors=*/{},
       /*free_visitors=*/{});
   EXPECT_FALSE(stream_executor_allocator.SupportsCoalescing());
@@ -67,7 +67,7 @@ TEST(StreamExecutorAllocatorTest, GetMemoryTypeReturnsHostPinnedForHostMemory) {
         return absl::InternalError("Failed to allocate memory");
       });
   StreamExecutorAllocator stream_executor_allocator(
-      std::move(allocator), MemoryType::kHost, /*index=*/0,
+      std::move(allocator), MemorySpace::kHost, /*index=*/0,
       /*alloc_visitors=*/{},
       /*free_visitors=*/{});
   EXPECT_EQ(tsl::AllocatorMemoryType::kHostPinned,
@@ -80,7 +80,7 @@ TEST(StreamExecutorAllocatorTest, GetMemoryTypeReturnsDeviceForDeviceAddress) {
         return absl::InternalError("Failed to allocate memory");
       });
   StreamExecutorAllocator stream_executor_allocator(
-      std::move(allocator), MemoryType::kDevice, /*index=*/0,
+      std::move(allocator), MemorySpace::kDevice, /*index=*/0,
       /*alloc_visitors=*/{},
       /*free_visitors=*/{});
   EXPECT_EQ(tsl::AllocatorMemoryType::kDevice,
@@ -122,7 +122,7 @@ TEST(StreamExecutorAllocatorTest,
     free_visitor_called = true;
   };
   StreamExecutorAllocator stream_executor_allocator(
-      std::move(allocator), MemoryType::kDevice, /*index=*/0, {alloc_visitor},
+      std::move(allocator), MemorySpace::kDevice, /*index=*/0, {alloc_visitor},
       {free_visitor});
   EXPECT_FALSE(free_visitor_called);
   EXPECT_FALSE(alloc_visitor_called);
diff --git a/third_party/xla/xla/stream_executor/memory_allocator.h b/third_party/xla/xla/stream_executor/memory_allocator.h
index 5183768efe12e5..3aed5aac37ea44 100644
--- a/third_party/xla/xla/stream_executor/memory_allocator.h
+++ b/third_party/xla/xla/stream_executor/memory_allocator.h
@@ -24,7 +24,12 @@ limitations under the License.
 
 namespace stream_executor {
 
-// This class defines the interface for memory allocators.
+// A base class for stream executor memory allocators.
+//
+// Memory allocators are responsible allocating physical memory for a given
+// stream executor, this physical memory might reside in different memory spaces
+// such as device memory, unified memory, host memory, etc. See MemoryAllocation
+// documentation for more details.
 class MemoryAllocator {
  public:
   virtual ~MemoryAllocator() = default;
diff --git a/third_party/xla/xla/stream_executor/memory_space.h b/third_party/xla/xla/stream_executor/memory_space.h
new file mode 100644
index 00000000000000..251dc9bbd5b339
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/memory_space.h
@@ -0,0 +1,38 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_MEMORY_SPACE_H_
+#define XLA_STREAM_EXECUTOR_MEMORY_SPACE_H_
+
+#include <cstdint>
+
+#include "absl/base/macros.h"
+
+namespace stream_executor {
+
+// Identifies the memory space where a physical allocation resides.
+enum class MemorySpace : uint8_t {
+  kDevice = 0,
+  kUnified,
+  kCollective,
+  kP2P,
+  kHost = 5,
+};
+
+using MemoryType ABSL_DEPRECATE_AND_INLINE() = MemorySpace;
+
+}  // namespace stream_executor
+
+#endif  // XLA_STREAM_EXECUTOR_MEMORY_SPACE_H_
diff --git a/third_party/xla/xla/stream_executor/mock_stream_executor.h b/third_party/xla/xla/stream_executor/mock_stream_executor.h
index 231ddf297e3e66..589a590e65c36e 100644
--- a/third_party/xla/xla/stream_executor/mock_stream_executor.h
+++ b/third_party/xla/xla/stream_executor/mock_stream_executor.h
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/memory_space.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
@@ -112,7 +113,7 @@ class MockStreamExecutor : public StreamExecutor {
               CreateEventBasedTimer, (Stream * stream, bool use_delay_kernel),
               (override));
   MOCK_METHOD(absl::StatusOr<std::unique_ptr<MemoryAllocator>>,
-              CreateMemoryAllocator, (MemoryType type), (override));
+              CreateMemoryAllocator, (MemorySpace memory_space), (override));
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
index 316028d7b4109f..7d3d2c8ebaf459 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor.cc
@@ -779,12 +779,12 @@ absl::StatusOr<ModuleHandle> RocmExecutor::LoadModuleFromHsaco(
 }
 
 DeviceAddressBase RocmExecutor::Allocate(uint64_t size, int64_t memory_space) {
-  switch (static_cast<MemoryType>(memory_space)) {
-    case MemoryType::kCollective:
-    case MemoryType::kDevice:
+  switch (static_cast<MemorySpace>(memory_space)) {
+    case MemorySpace::kCollective:
+    case MemorySpace::kDevice:
       return DeviceAddressBase(
           DeviceAllocate(rocm_context_, size, /*is_fine_grained*/ false), size);
-    case MemoryType::kP2P:
+    case MemorySpace::kP2P:
       // On the ROCm platform, differences in cache design (e.g., coherence
       // protocol) can cause cache coherence issues for some archs (e.g., MI200)
       // when using normal device memory. To avoid these problems, we use
@@ -792,7 +792,7 @@ DeviceAddressBase RocmExecutor::Allocate(uint64_t size, int64_t memory_space) {
       // the correctness.
       return DeviceAddressBase(
           DeviceAllocate(rocm_context_, size, /*is_fine_grained*/ true), size);
-    case MemoryType::kHost:
+    case MemorySpace::kHost:
       if (auto result = HostAllocate(rocm_context_, size); result.ok()) {
         return DeviceAddressBase(*result, size);
       }
@@ -811,9 +811,9 @@ void RocmExecutor::Deallocate(DeviceAddressBase* mem) {
 }
 
 absl::StatusOr<std::unique_ptr<MemoryAllocator>>
-RocmExecutor::CreateMemoryAllocator(MemoryType type) {
+RocmExecutor::CreateMemoryAllocator(MemorySpace type) {
   switch (type) {
-    case MemoryType::kUnified:
+    case MemorySpace::kUnified:
       return std::make_unique<GenericMemoryAllocator>(
           [this](uint64_t size)
               -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
@@ -841,7 +841,7 @@ RocmExecutor::CreateMemoryAllocator(MemoryType type) {
                   }
                 });
           });
-    case MemoryType::kCollective:
+    case MemorySpace::kCollective:
       return std::make_unique<GenericMemoryAllocator>(
           [](uint64_t size)
               -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
@@ -868,7 +868,7 @@ RocmExecutor::CreateMemoryAllocator(MemoryType type) {
                   }
                 });
           });
-    case MemoryType::kHost:
+    case MemorySpace::kHost:
       return std::make_unique<GenericMemoryAllocator>([this](uint64_t size) {
         return AllocateHostMemory(rocm_context_, size);
       });
@@ -1242,7 +1242,7 @@ RocmExecutor::CreateDeviceDescription(int device_ordinal) {
   return std::make_unique<DeviceDescription>(std::move(desc));
 }
 
-absl::StatusOr<MemoryType> RocmExecutor::GetPointerMemorySpace(
+absl::StatusOr<MemorySpace> RocmExecutor::GetPointerMemorySpace(
     const void* ptr) {
   hipDeviceptr_t pointer =
       reinterpret_cast<hipDeviceptr_t>(const_cast<void*>(ptr));
@@ -1252,9 +1252,9 @@ absl::StatusOr<MemoryType> RocmExecutor::GetPointerMemorySpace(
   if (result == hipSuccess) {
     switch (value) {
       case hipMemoryTypeDevice:
-        return MemoryType::kDevice;
+        return MemorySpace::kDevice;
       case hipMemoryTypeHost:
-        return MemoryType::kHost;
+        return MemorySpace::kHost;
       default:
         return absl::InternalError(
             absl::StrCat("unknown memory space provided by ROCM API: ", value));
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor.h b/third_party/xla/xla/stream_executor/rocm/rocm_executor.h
index 71bec6a2376f8c..cbf064795206c6 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor.h
@@ -113,7 +113,7 @@ class RocmExecutor : public GpuExecutor {
   bool HostMemoryRegister(void* location, uint64_t size) override;
   bool HostMemoryUnregister(void* location) override;
 
-  absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) override;
+  absl::StatusOr<MemorySpace> GetPointerMemorySpace(const void* ptr) override;
 
   Stream* FindAllocatedStream(void* gpu_stream) override {
     absl::MutexLock lock(alive_gpu_streams_mu_);
@@ -131,7 +131,7 @@ class RocmExecutor : public GpuExecutor {
   // associated with this executor. Otherwise a NotFound error is returned.
   absl::StatusOr<const RocmKernel*> GetRocmKernel(const Kernel* kernel);
   absl::StatusOr<std::unique_ptr<MemoryAllocator>> CreateMemoryAllocator(
-      MemoryType type) override;
+      MemorySpace type) override;
 
   int GetGpuStreamPriority(StreamPriority priority) override;
 
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc b/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
index 4ede48411b45b5..60e1e72cd4b657 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
@@ -90,7 +90,7 @@ TEST(RocmExecutorTest, CreateUnifiedMemoryAllocatorWorks) {
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MemoryAllocator> allocator,
-      executor->CreateMemoryAllocator(MemoryType::kUnified));
+      executor->CreateMemoryAllocator(MemorySpace::kUnified));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
   EXPECT_NE(allocation->opaque(), nullptr);
@@ -104,7 +104,7 @@ TEST(RocmExecutorTest, CreateHostMemoryAllocatorWorks) {
   TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocator> allocator,
-                          executor->CreateMemoryAllocator(MemoryType::kHost));
+                          executor->CreateMemoryAllocator(MemorySpace::kHost));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
   EXPECT_NE(allocation->opaque(), nullptr);
@@ -119,7 +119,7 @@ TEST(RocmExecutorTest, CreateCollectiveMemoryAllocatorWorks) {
                           platform->ExecutorForDevice(0));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<MemoryAllocator> allocator,
-      executor->CreateMemoryAllocator(MemoryType::kCollective));
+      executor->CreateMemoryAllocator(MemorySpace::kCollective));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
   EXPECT_NE(allocation->opaque(), nullptr);
@@ -132,7 +132,7 @@ TEST(RocmExecutorTest, CreateUnsupportedMemoryAllocatorsFail) {
                           PlatformManager::PlatformWithName("ROCM"));
   TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
                           platform->ExecutorForDevice(0));
-  EXPECT_THAT(executor->CreateMemoryAllocator(MemoryType::kDevice),
+  EXPECT_THAT(executor->CreateMemoryAllocator(MemorySpace::kDevice),
               Not(absl_testing::IsOk()));
 }
 
diff --git a/third_party/xla/xla/stream_executor/stream_executor.h b/third_party/xla/xla/stream_executor/stream_executor.h
index 8b43c676ea7014..045d30aefe1aba 100644
--- a/third_party/xla/xla/stream_executor/stream_executor.h
+++ b/third_party/xla/xla/stream_executor/stream_executor.h
@@ -46,9 +46,11 @@ limitations under the License.
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/memory_space.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/tensor_map.h"
 #include "xla/tsl/lib/gtl/int_type.h"
 
 // TODO(ezhulenev): Remove this once transitive dependencies are fixed.
@@ -56,9 +58,6 @@ limitations under the License.
 
 namespace stream_executor {
 
-// Identifies the memory space where an allocation resides.
-enum class MemoryType { kDevice = 0, kUnified, kCollective, kP2P, kHost = 5 };
-
 /// The StreamExecutor is a single-device abstraction for:
 //
 // * Loading/launching data-parallel-kernels
@@ -109,7 +108,7 @@ class StreamExecutor {
 
   // Creates a MemoryAllocator for the given type.
   virtual absl::StatusOr<std::unique_ptr<MemoryAllocator>>
-  CreateMemoryAllocator(MemoryType type) {
+  CreateMemoryAllocator(MemorySpace memory_space) {
     return absl::UnimplementedError("Not Implemented");
   }
 
@@ -179,7 +178,7 @@ class StreamExecutor {
       uint64_t size) = 0;
 
   // Returns the memory space of the given pointer.
-  virtual absl::StatusOr<MemoryType> GetPointerMemorySpace(const void* ptr) {
+  virtual absl::StatusOr<MemorySpace> GetPointerMemorySpace(const void* ptr) {
     return absl::UnimplementedError("Not implemented");
   }
 

From a383df817ea33a46f6ece3e915762d8e192dcbfb Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Thu, 11 Dec 2025 23:13:16 -0800
Subject: [PATCH 212/753] Remove XNNPACK support

This is now mostly unused, because YNNPACK is now enabled by default in all cases. XNNPACK was only ever enabled by default for F32 dots, and only when a cost model allowed it to be used, for a subset of shapes, on AMD CPUs only.

PiperOrigin-RevId: 843537066
---
 third_party/xla/build_tools/ci/build.py       |   2 -
 .../xla/build_tools/ci/golden_commands.txt    |   4 +-
 .../xla/build_tools/configure/configure.py    |  12 -
 .../configure/testdata/gcc.bazelrc            |   2 -
 .../configure/testdata/nvcc_gcc.bazelrc       |   2 -
 third_party/xla/tensorflow.bazelrc            |   2 -
 third_party/xla/xla/backends/cpu/BUILD        |  77 +--
 .../xla/xla/backends/cpu/autotuner/BUILD      |  42 --
 .../backends/cpu/autotuner/xnnpack_backend.cc | 124 -----
 .../backends/cpu/autotuner/xnnpack_backend.h  |  58 --
 .../cpu/autotuner/xnnpack_backend_test.cc     | 170 ------
 .../xla/xla/backends/cpu/benchmarks/BUILD     |  22 -
 .../xla/xla/backends/cpu/runtime/BUILD        |   9 -
 .../xla/xla/backends/cpu/runtime/thunk.cc     |  14 -
 .../xla/xla/backends/cpu/runtime/thunk.h      |  20 +-
 .../xla/xla/backends/cpu/runtime/thunk.proto  |   2 +
 .../cpu/runtime/thunk_proto_serdes.cc         | 199 +------
 .../cpu/runtime/thunk_sequence_serdes_test.cc | 202 -------
 .../xla/backends/cpu/runtime/xnnpack/BUILD    | 213 --------
 .../runtime/xnnpack/xnn_convolution_thunk.cc  | 192 -------
 .../runtime/xnnpack/xnn_convolution_thunk.h   |  89 ---
 .../xnnpack/xnn_convolution_thunk_test.cc     | 202 -------
 .../cpu/runtime/xnnpack/xnn_dot_thunk.cc      | 191 -------
 .../cpu/runtime/xnnpack/xnn_dot_thunk.h       |  81 ---
 .../cpu/runtime/xnnpack/xnn_dot_thunk_test.cc | 124 -----
 .../cpu/runtime/xnnpack/xnn_fusion_thunk.cc   | 364 -------------
 .../cpu/runtime/xnnpack/xnn_fusion_thunk.h    | 184 -------
 .../runtime/xnnpack/xnn_fusion_thunk_test.cc  | 155 ------
 .../cpu/runtime/xnnpack/xnn_interop.cc        |  71 ---
 .../cpu/runtime/xnnpack/xnn_interop.h         | 125 -----
 .../cpu/runtime/xnnpack/xnn_threadpool.cc     |  62 ---
 .../cpu/runtime/xnnpack/xnn_threadpool.h      |  39 --
 .../xla/xla/backends/cpu/transforms/BUILD     |  55 --
 .../cpu/transforms/library_rewriter.h         |   8 -
 .../cpu/transforms/library_rewriter_test.cc   |   7 -
 .../cpu/transforms/xnn_graph_fusion.cc        | 156 ------
 .../cpu/transforms/xnn_graph_fusion.h         |  55 --
 .../cpu/transforms/xnn_graph_fusion_test.cc   | 333 ------------
 .../xla/backends/cpu/transforms/xnn_matcher.h | 118 ----
 .../xla/xla/backends/cpu/xnn_emitter.cc       | 507 ------------------
 .../xla/xla/backends/cpu/xnn_emitter.h        |  31 --
 .../xla/xla/backends/cpu/xnn_gemm_config.cc   | 323 -----------
 .../xla/xla/backends/cpu/xnn_gemm_config.h    |  62 ---
 .../xla/xla/backends/cpu/xnn_support.cc       | 315 -----------
 .../xla/xla/backends/cpu/xnn_support.h        |  77 ---
 .../xla/xla/backends/cpu/xnn_support_test.cc  |  69 ---
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    |  18 +-
 third_party/xla/xla/service/cpu/BUILD         |   6 -
 .../xla/xla/service/cpu/backend_config.proto  |   1 +
 .../xla/xla/service/cpu/cpu_compiler.cc       |  55 +-
 .../xla/xla/service/cpu/cpu_executable.cc     |  15 +-
 third_party/xla/xla/service/cpu/tests/BUILD   |  17 -
 .../xla/service/cpu/tests/xnn_fusion_test.cc  | 388 --------------
 .../xla/xla/service/cpu/thunk_emitter.cc      |  71 +--
 54 files changed, 27 insertions(+), 5715 deletions(-)
 delete mode 100644 third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.h
 delete mode 100644 third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend_test.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk_test.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk_test.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk_test.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.h
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h
 delete mode 100644 third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.h
 delete mode 100644 third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion_test.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/transforms/xnn_matcher.h
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_emitter.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_emitter.h
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_gemm_config.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_gemm_config.h
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_support.cc
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_support.h
 delete mode 100644 third_party/xla/xla/backends/cpu/xnn_support_test.cc
 delete mode 100644 third_party/xla/xla/service/cpu/tests/xnn_fusion_test.cc

diff --git a/third_party/xla/build_tools/ci/build.py b/third_party/xla/build_tools/ci/build.py
index bf4c2071dce9ce..8a6efe40f3a71f 100755
--- a/third_party/xla/build_tools/ci/build.py
+++ b/third_party/xla/build_tools/ci/build.py
@@ -672,7 +672,6 @@ def nvidia_gpu_build_with_compute_capability(
         **_DEFAULT_BAZEL_OPTIONS,
         "macos_minimum_os": "10.15",
         "test_tmpdir": "/Volumes/BuildData/bazel_output",
-        "define": "xnn_enable_avxvnniint8=false",
         "//xla/tsl:ci_build": True,
     },
     build_tag_filters=macos_tag_filter,
@@ -708,7 +707,6 @@ def nvidia_gpu_build_with_compute_capability(
         "macos_minimum_os": "10.15",
         "test_tmpdir": "/tmpfs/bazel_output",
         "test_size_filters": "small,medium",
-        "define": "xnn_enable_avxvnniint8=false",
         "//xla/tsl:ci_build": True,
     },
     build_tag_filters=macos_tag_filter,
diff --git a/third_party/xla/build_tools/ci/golden_commands.txt b/third_party/xla/build_tools/ci/golden_commands.txt
index 3510dd02708fe8..feccb0fc7cdd78 100644
--- a/third_party/xla/build_tools/ci/golden_commands.txt
+++ b/third_party/xla/build_tools/ci/golden_commands.txt
@@ -110,7 +110,7 @@ bazel analyze-profile profile.json.gz
 df -h
 bazel --version
 mkdir -p /tmpfs/bazel_output
-bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/tmpfs/bazel_output --test_size_filters=small,medium --define=xnn_enable_avxvnniint8=false --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/...
+bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/tmpfs/bazel_output --test_size_filters=small,medium --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/...
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_MACOS_ARM64_CPU_KOKORO
 # BEGIN BuildType.XLA_MACOS_X86_CPU_KOKORO
@@ -118,7 +118,7 @@ sudo wget --no-verbose -O /usr/local/bin/bazel https://github.com/bazelbuild/baz
 chmod +x /usr/local/bin/bazel
 bazel --version
 mkdir -p /Volumes/BuildData/bazel_output
-bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/Volumes/BuildData/bazel_output --define=xnn_enable_avxvnniint8=false --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/...
+bazel test --build_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --test_tag_filters=-no_oss,-gpu,-no_mac,-mac_excluded,-requires-gpu-nvidia,-requires-gpu-amd,-requires-gpu-intel --config=nonccl --color=yes --test_output=errors --verbose_failures --keep_going --nobuild_tests_only --profile=profile.json.gz --flaky_test_attempts=3 --jobs=150 --bes_upload_mode=fully_async --macos_minimum_os=10.15 --test_tmpdir=/Volumes/BuildData/bazel_output --//xla/tsl:ci_build -- //xla/... -//xla/hlo/experimental/... -//xla/python_api/... -//xla/python/... -//xla/service/gpu/...
 bazel analyze-profile profile.json.gz
 # END BuildType.XLA_MACOS_X86_CPU_KOKORO
 # BEGIN BuildType.XLA_WINDOWS_X86_CPU_GITHUB_ACTIONS
diff --git a/third_party/xla/build_tools/configure/configure.py b/third_party/xla/build_tools/configure/configure.py
index 30729ca1031561..a54852db554b35 100755
--- a/third_party/xla/build_tools/configure/configure.py
+++ b/third_party/xla/build_tools/configure/configure.py
@@ -452,18 +452,6 @@ def to_bazelrc_lines(
     if dpav.clang_major_version and dpav.clang_major_version >= 19:
       self.compiler_options.append("-Wno-c23-extensions")
 
-    # Avoid XNNPACK using `-mavxvnniint8` (needs clang-16+/gcc-13+)
-    if (
-        dpav.clang_major_version is not None and dpav.clang_major_version < 16
-    ) or (dpav.gcc_major_version is not None and dpav.gcc_major_version < 13):
-      rc.append("build --define=xnn_enable_avxvnniint8=false")
-
-    # Avoid XNNPACK using `-mavx512fp16` (needs clang-14+/gcc-12+).
-    if (
-        dpav.clang_major_version is not None and dpav.clang_major_version < 14
-    ) or (dpav.gcc_major_version is not None and dpav.gcc_major_version < 12):
-      rc.append("build --define=xnn_enable_avx512fp16=false")
-
     rc.append(f"build --action_env PYTHON_BIN_PATH={self.python_bin_path}")
     rc.append(f"build --python_path {self.python_bin_path}")
     rc.append("test --test_env LD_LIBRARY_PATH")
diff --git a/third_party/xla/build_tools/configure/testdata/gcc.bazelrc b/third_party/xla/build_tools/configure/testdata/gcc.bazelrc
index 54545cbb9914bc..8eefec15ee8efb 100644
--- a/third_party/xla/build_tools/configure/testdata/gcc.bazelrc
+++ b/third_party/xla/build_tools/configure/testdata/gcc.bazelrc
@@ -1,6 +1,4 @@
 build --action_env GCC_HOST_COMPILER_PATH=/usr/bin/gcc
-build --define=xnn_enable_avxvnniint8=false
-build --define=xnn_enable_avx512fp16=false
 build --action_env PYTHON_BIN_PATH=/usr/bin/python3
 build --python_path /usr/bin/python3
 test --test_env LD_LIBRARY_PATH
diff --git a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc
index 3155b30218df08..373613415c1f7c 100644
--- a/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc
+++ b/third_party/xla/build_tools/configure/testdata/nvcc_gcc.bazelrc
@@ -5,8 +5,6 @@ build:cuda --repo_env HERMETIC_CUDA_COMPUTE_CAPABILITIES=7.5
 build:cuda --repo_env HERMETIC_CUDNN_VERSION="9.8.0"
 build --config nonccl
 build --action_env LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
-build --define=xnn_enable_avxvnniint8=false
-build --define=xnn_enable_avx512fp16=false
 build --action_env PYTHON_BIN_PATH=/usr/bin/python3
 build --python_path /usr/bin/python3
 test --test_env LD_LIBRARY_PATH
diff --git a/third_party/xla/tensorflow.bazelrc b/third_party/xla/tensorflow.bazelrc
index 80b55cb9db8fd0..a08ecafdedae0a 100644
--- a/third_party/xla/tensorflow.bazelrc
+++ b/third_party/xla/tensorflow.bazelrc
@@ -274,8 +274,6 @@ common:rocm_base --copt=-Wno-gnu-offsetof-extensions
 common:rocm_base --crosstool_top=@local_config_rocm//crosstool:toolchain
 common:rocm_base --define=using_rocm_hipcc=true
 common:rocm_base --define=tensorflow_mkldnn_contraction_kernel=0
-common:rocm_base --define=xnn_enable_avxvnniint8=false
-common:rocm_base --define=xnn_enable_avx512fp16=false
 common:rocm_base --repo_env TF_NEED_ROCM=1
 
 common:rocm_clang_official --config=rocm_base
diff --git a/third_party/xla/xla/backends/cpu/BUILD b/third_party/xla/xla/backends/cpu/BUILD
index 05df8d4e5fd66d..690cbbcb3a3b13 100644
--- a/third_party/xla/xla/backends/cpu/BUILD
+++ b/third_party/xla/xla/backends/cpu/BUILD
@@ -120,34 +120,12 @@ onednn_graph_cc_library(
     ],
 )
 
+# TODO: b/467367981, this is deprecated and should be removed.
 tf_proto_library(
     name = "xnn_fusion_options_proto",
     srcs = ["xnn_fusion_options.proto"],
 )
 
-cc_library(
-    name = "xnn_emitter",
-    srcs = ["xnn_emitter.cc"],
-    hdrs = ["xnn_emitter.h"],
-    deps = [
-        ":xnn_support",
-        "//xla:literal",
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_interop",
-        "//xla/hlo/ir:hlo",
-        "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:statusor",
-        "@XNNPACK",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 cc_library(
     name = "ynn_emitter",
     srcs = ["ynn_emitter.cc"],
@@ -175,59 +153,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "xnn_gemm_config",
-    srcs = ["xnn_gemm_config.cc"],
-    hdrs = ["xnn_gemm_config.h"],
-    deps = [
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/codegen:target_machine_features",
-        "//xla/backends/cpu/runtime:dot_dims",
-        "@com_google_absl//absl/base:no_destructor",
-        "@com_google_absl//absl/log:check",
-        "@llvm-project//llvm:Target",
-    ],
-)
-
-cc_library(
-    name = "xnn_support",
-    srcs = ["xnn_support.cc"],
-    hdrs = ["xnn_support.h"],
-    deps = [
-        ":xnn_gemm_config",
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/codegen:target_machine_features",
-        "//xla/backends/cpu/runtime:dot_dims",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_interop",
-        "//xla/hlo/ir:hlo",
-        "//xla/service:pattern_matcher",
-        "//xla/tsl/platform:statusor",
-        "@XNNPACK",
-        "@com_google_absl//absl/base:no_destructor",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings:string_view",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-xla_cc_test(
-    name = "xnn_support_test",
-    srcs = ["xnn_support_test.cc"],
-    deps = [
-        ":xnn_support",
-        "//xla/hlo/ir:hlo",
-        "@XNNPACK",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
 tf_proto_library(
     name = "ynn_fusion_options_proto",
     srcs = ["ynn_fusion_options.proto"],
diff --git a/third_party/xla/xla/backends/cpu/autotuner/BUILD b/third_party/xla/xla/backends/cpu/autotuner/BUILD
index 81247efe7e118a..34e11696f7d15f 100644
--- a/third_party/xla/xla/backends/cpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/cpu/autotuner/BUILD
@@ -79,48 +79,6 @@ xla_cc_test(
     ],
 )
 
-cc_library(
-    name = "xnnpack_backend",
-    srcs = ["xnnpack_backend.cc"],
-    hdrs = ["xnnpack_backend.h"],
-    deps = [
-        ":cpu_codegen_backend",
-        "//xla:util",
-        "//xla/backends/autotuner:codegen_backend",
-        "//xla/backends/cpu:xnn_fusion_options_proto_cc",
-        "//xla/backends/cpu:xnn_support",
-        "//xla/hlo/ir:hlo",
-        "//xla/service:compiler",
-        "//xla/service/cpu:backend_config_proto_cc",
-        "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings:string_view",
-    ],
-)
-
-xla_cc_test(
-    name = "xnnpack_backend_test",
-    srcs = ["xnnpack_backend_test.cc"],
-    deps = [
-        ":cpu_codegen_backend",
-        ":xnnpack_backend",
-        "//xla/backends/autotuner:codegen_backend",
-        "//xla/hlo/ir:hlo",
-        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
-        "//xla/service:compiler",
-        "//xla/service/cpu:backend_config_proto_cc",
-        "//xla/service/cpu:cpu_compiler",
-        "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:status_matchers",
-        "@com_google_absl//absl/strings:string_view",
-        "@com_google_googletest//:gtest_main",
-    ],
-)
-
 cc_library(
     name = "llvm_kernel_backend",
     srcs = ["llvm_kernel_backend.cc"],
diff --git a/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.cc b/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.cc
deleted file mode 100644
index 765a50a887cd54..00000000000000
--- a/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/autotuner/xnnpack_backend.h"
-
-#include <memory>
-#include <utility>
-#include <vector>
-
-#include "absl/memory/memory.h"
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-#include "xla/backends/autotuner/codegen_backend.h"
-#include "xla/backends/cpu/xnn_fusion_options.pb.h"
-#include "xla/backends/cpu/xnn_support.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/service/compiler.h"
-#include "xla/service/cpu/backend_config.pb.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/util.h"
-
-namespace xla::cpu {
-
-absl::Status CheckIfXnnFusion(const HloInstruction& instr) {
-  if (instr.opcode() != HloOpcode::kFusion) {
-    return xla::InvalidArgument(
-        "XnnpackBackend only supports fusion instructions. Received %s.",
-        HloOpcodeString(instr.opcode()));
-  }
-  if (!instr.has_backend_config()) {
-    return xla::InvalidArgument("Instruction %s does not have backend config.",
-                                instr.ToString());
-  }
-
-  TF_ASSIGN_OR_RETURN(auto backend_config,
-                      instr.backend_config<BackendConfig>());
-
-  if (!backend_config.has_fusion_config()) {
-    return xla::InvalidArgument(
-        "Backend config %s does not have an fusion config.",
-        backend_config.DebugString());
-  }
-
-  if (backend_config.fusion_config().kind() != kXnnFusionKind) {
-    return xla::InvalidArgument(
-        "Backend kind %s doesn't match expected kind %s.",
-        backend_config.fusion_config().kind(), kXnnFusionKind);
-  }
-
-  return absl::OkStatus();
-}
-
-absl::StatusOr<std::unique_ptr<CodegenBackend>> XnnpackBackend::Create(
-    Compiler* compiler) {
-  return absl::WrapUnique(new XnnpackBackend(compiler));
-}
-
-bool XnnpackBackend::IsSupported(const HloInstruction& instr) {
-  return CheckIfXnnFusion(instr).ok();
-}
-
-absl::StatusOr<std::vector<std::unique_ptr<xla::BackendConfig>>>
-XnnpackBackend::GetSupportedConfigs(const HloInstruction& instr) {
-  TF_RETURN_IF_ERROR(CheckIfXnnFusion(instr));
-  std::vector<std::unique_ptr<xla::BackendConfig>> configs;
-  {
-    XnnFusionOptions options;
-    options.set_use_threadpool(true);
-    auto any = std::make_unique<xla::BackendConfig>();
-    any->PackFrom(options);
-    configs.push_back(std::move(any));
-  }
-
-  {
-    XnnFusionOptions options;
-    options.set_use_threadpool(false);
-    auto any = std::make_unique<xla::BackendConfig>();
-    any->PackFrom(options);
-    configs.push_back(std::move(any));
-  }
-  return configs;
-}
-absl::StatusOr<std::unique_ptr<xla::BackendConfig>>
-XnnpackBackend::GetDefaultConfig(const HloInstruction& instr) {
-  TF_RETURN_IF_ERROR(CheckIfXnnFusion(instr));
-  auto config = std::make_unique<XnnFusionOptions>();
-  config->set_use_threadpool(true);
-  auto any = std::make_unique<xla::BackendConfig>();
-  any->PackFrom(*config);
-  return any;
-}
-
-absl::Status XnnpackBackend::ApplyConfig(HloInstruction& instr,
-                                         const xla::BackendConfig& config) {
-  TF_RETURN_IF_ERROR(CheckIfXnnFusion(instr));
-  TF_ASSIGN_OR_RETURN(auto backend_config,
-                      instr.backend_config<xla::cpu::BackendConfig>());
-
-  XnnFusionOptions options;
-  config.UnpackTo(&options);
-
-  *backend_config.mutable_fusion_config()->mutable_xnn_fusion_options() =
-      options;
-
-  TF_RETURN_IF_ERROR(instr.set_backend_config(backend_config));
-
-  return absl::OkStatus();
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.h b/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.h
deleted file mode 100644
index 71b8b8c86d8011..00000000000000
--- a/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_AUTOTUNER_XNNPACK_BACKEND_H_
-#define XLA_BACKENDS_CPU_AUTOTUNER_XNNPACK_BACKEND_H_
-
-#include <memory>
-#include <vector>
-
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/string_view.h"
-#include "xla/backends/autotuner/codegen_backend.h"
-#include "xla/backends/cpu/autotuner/cpu_codegen_backend.h"
-#include "xla/backends/cpu/xnn_fusion_options.pb.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/service/compiler.h"
-
-namespace xla::cpu {
-
-inline constexpr absl::string_view kXnnpackBackendName = "xnnpack";
-
-class XnnpackBackend : public CpuCodegenBackend {
- public:
-  static absl::StatusOr<std::unique_ptr<CodegenBackend>> Create(
-      Compiler* compiler);
-
-  bool IsSupported(const HloInstruction& instr);
-
-  absl::StatusOr<std::vector<std::unique_ptr<xla::BackendConfig>>>
-  GetSupportedConfigs(const HloInstruction& instr) final;
-
-  absl::StatusOr<std::unique_ptr<xla::BackendConfig>> GetDefaultConfig(
-      const HloInstruction& instr) final;
-
-  absl::Status ApplyConfig(HloInstruction& instr,
-                           const xla::BackendConfig& config) final;
-
- protected:
-  explicit XnnpackBackend(Compiler* compiler)
-      : CpuCodegenBackend(compiler, kXnnpackBackendName) {}
-};
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_AUTOTUNER_XNNPACK_BACKEND_H_
diff --git a/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend_test.cc b/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend_test.cc
deleted file mode 100644
index 9dea563a7b6d0a..00000000000000
--- a/third_party/xla/xla/backends/cpu/autotuner/xnnpack_backend_test.cc
+++ /dev/null
@@ -1,170 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/autotuner/xnnpack_backend.h"
-
-#include <memory>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "absl/status/status.h"
-#include "absl/status/status_matchers.h"
-#include "absl/strings/string_view.h"
-#include "xla/backends/autotuner/codegen_backend.h"
-#include "xla/backends/cpu/autotuner/cpu_codegen_backend.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_module.h"
-#include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
-#include "xla/service/compiler.h"
-#include "xla/service/cpu/backend_config.pb.h"
-#include "xla/tsl/platform/statusor.h"
-
-namespace xla::cpu {
-namespace {
-
-constexpr absl::string_view kXnnpackFusionHlo = R"(
-    HloModule eltwise_f32_0
-
-    xnn_fusion {
-      p0 = f32[1024,1024] parameter(0)
-      p1 = f32[1024,1024] parameter(1)
-      add0 = f32[1024,1024] add(p0, p1)
-      mul0 = f32[1024,1024] multiply(add0, add0)
-      ROOT sub = f32[1024,1024] subtract(mul0, p0)
-    }
-
-    ENTRY e {
-      p0 = f32[1024,1024] parameter(0)
-      p1 = f32[1024,1024] parameter(1)
-      ROOT %result = f32[1024,1024] fusion(%p0, %p1), kind=kCustom,
-        calls=xnn_fusion,
-        backend_config={"fusion_config": {"kind": "__xnn_fusion"}}
-    }
-  )";
-
-class XnnpackBackendTest : public HloHardwareIndependentTestBase {
- protected:
-  void SetUp() override {
-    TF_ASSERT_OK_AND_ASSIGN(compiler_,
-                            CpuCodegenBackend::CreateBackendCompiler());
-    TF_ASSERT_OK_AND_ASSIGN(backend_, XnnpackBackend::Create(compiler_.get()));
-  }
-  std::unique_ptr<CodegenBackend> backend_;
-  std::unique_ptr<Compiler> compiler_;
-};
-
-TEST_F(XnnpackBackendTest, NameTest) {
-  EXPECT_THAT(backend_->name(), "xnnpack");
-}
-
-TEST_F(XnnpackBackendTest, GetDefaultConfigTest) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kXnnpackFusionHlo));
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto config, backend_->GetDefaultConfig(
-                       *module->entry_computation()->root_instruction()));
-  XnnFusionOptions xnn_fusion_options;
-  config->UnpackTo(&xnn_fusion_options);
-
-  EXPECT_TRUE(xnn_fusion_options.use_threadpool());
-}
-
-TEST_F(XnnpackBackendTest, InvalidFusionKind) {
-  constexpr absl::string_view bad_fusion_kind_hlo = R"(
-    HloModule eltwise_f32_0
-
-    not_xnn_fusion {
-      p0 = f32[1024,1024] parameter(0)
-      p1 = f32[1024,1024] parameter(1)
-      add0 = f32[1024,1024] add(p0, p1)
-      mul0 = f32[1024,1024] multiply(add0, add0)
-      ROOT sub = f32[1024,1024] subtract(mul0, p0)
-    }
-
-    ENTRY e {
-      p0 = f32[1024,1024] parameter(0)
-      p1 = f32[1024,1024] parameter(1)
-      ROOT %result = f32[1024,1024] fusion(%p0, %p1), kind=kCustom,
-        calls=not_xnn_fusion,
-        backend_config={fusion_config: {kind: "__not_xnn_fusion"}}
-    }
-  )";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(bad_fusion_kind_hlo));
-  auto config = backend_->GetDefaultConfig(
-      *module->entry_computation()->root_instruction());
-
-  EXPECT_THAT(config,
-              absl_testing::StatusIs(
-                  absl::StatusCode::kInvalidArgument,
-                  testing::HasSubstr("Backend kind __not_xnn_fusion doesn't "
-                                     "match expected kind __xnn_fusion.")));
-}
-
-TEST_F(XnnpackBackendTest, GetSupportedConfigsTest) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kXnnpackFusionHlo));
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto configs, backend_->GetSupportedConfigs(
-                        *module->entry_computation()->root_instruction()));
-
-  EXPECT_EQ(configs.size(), 2);
-  XnnFusionOptions xnn_fusion_options0;
-  configs[0]->UnpackTo(&xnn_fusion_options0);
-  EXPECT_TRUE(xnn_fusion_options0.use_threadpool());
-  XnnFusionOptions xnn_fusion_options1;
-  configs[1]->UnpackTo(&xnn_fusion_options1);
-  EXPECT_FALSE(xnn_fusion_options1.use_threadpool());
-}
-
-TEST_F(XnnpackBackendTest, CompileSupportedBackends) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kXnnpackFusionHlo));
-  HloInstruction* fusion_instruction =
-      module->entry_computation()->root_instruction();
-  TF_ASSERT_OK_AND_ASSIGN(auto configs,
-                          backend_->GetSupportedConfigs(*fusion_instruction));
-  for (auto& config : configs) {
-    TF_ASSERT_OK_AND_ASSIGN(auto executable,
-                            backend_->Compile(*fusion_instruction, *config));
-  }
-}
-
-TEST_F(XnnpackBackendTest, EnsureConfigIsApplied) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kXnnpackFusionHlo));
-  HloInstruction* fusion_instruction =
-      module->entry_computation()->root_instruction();
-  TF_ASSERT_OK_AND_ASSIGN(auto configs,
-                          backend_->GetSupportedConfigs(*fusion_instruction));
-
-  for (const auto& config : configs) {
-    XnnFusionOptions xnn_fusion_options;
-    config->UnpackTo(&xnn_fusion_options);
-    EXPECT_TRUE(backend_->ApplyConfig(*fusion_instruction, *config).ok());
-
-    TF_ASSERT_OK_AND_ASSIGN(
-        auto instruction_backend_config,
-        fusion_instruction->backend_config<BackendConfig>());
-
-    EXPECT_EQ(instruction_backend_config.fusion_config()
-                  .xnn_fusion_options()
-                  .use_threadpool(),
-              xnn_fusion_options.use_threadpool());
-  }
-}
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/benchmarks/BUILD b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
index 71cd84d147728b..b90dd573c2d6b7 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/BUILD
+++ b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
@@ -586,28 +586,6 @@ xla_cc_test(
     ],
 )
 
-xla_cc_test(
-    name = "xnn_fusion_benchmark_test",
-    srcs = ["xnn_fusion_benchmark_test.cc"],
-    fail_if_no_test_linked = False,  # NOLINT=This contains benchmarks only, no tests.
-    fail_if_no_test_selected = False,  # NOLINT=This contains benchmarks only, no tests.
-    linkstatic = 1,  # required to override pthreadpool symbols
-    deps = [
-        ":hlo_benchmark_runner",
-        ":multi_benchmark_config",
-        "//xla:literal",
-        "//xla:literal_util",
-        "//xla:shape_util",
-        "//xla:xla_data_proto_cc",
-        "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:test_benchmark",
-        "//xla/tsl/platform:test_main",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:string_view",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 xla_cc_test(
     name = "snapshot_loading_test",
     srcs = ["snapshot_loading_test.cc"],
diff --git a/third_party/xla/xla/backends/cpu/runtime/BUILD b/third_party/xla/xla/backends/cpu/runtime/BUILD
index e193153dd5b85b..b567d7f7aefe9e 100644
--- a/third_party/xla/xla/backends/cpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/BUILD
@@ -177,8 +177,6 @@ cc_library(
         "//xla:executable_run_options",
         "//xla/backends/cpu/collectives:cpu_collectives",
         "//xla/backends/cpu/collectives:in_process_collectives",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_interop",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_threadpool",
         "//xla/ffi:execution_context",
         "//xla/runtime:buffer_use",
         "//xla/runtime:device_id",
@@ -1252,11 +1250,7 @@ cc_library(
         ":while_thunk",
         "//xla:shape_util",
         "//xla:util",
-        "//xla/backends/cpu:xnn_fusion_options_proto_cc",
         "//xla/backends/cpu:ynn_fusion_options_proto_cc",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_convolution_thunk",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:resource_use",
         "//xla/runtime:work_group",
@@ -1351,9 +1345,6 @@ xla_cc_test(
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_convolution_thunk",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk",
         "//xla/ffi",
         "//xla/ffi:ffi_api",
         "//xla/runtime:resource_use",
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk.cc b/third_party/xla/xla/backends/cpu/runtime/thunk.cc
index c1c948124c1257..d675604b4ae202 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk.cc
@@ -29,8 +29,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/backends/cpu/collectives/cpu_collectives.h"
 #include "xla/backends/cpu/collectives/in_process_collectives.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h"
 #include "xla/executable_run_options.h"
 #include "xla/runtime/device_id.h"
 #include "xla/service/cpu/cpu_executable_run_options.h"
@@ -91,8 +89,6 @@ absl::string_view Thunk::KindToString(Kind kind) {
       return "topk";
     case Kind::kWhile:
       return "while";
-    case Kind::kXnnFusion:
-      return "xnn-fusion";
     case Kind::kYnnFusion:
       return "ynn-fusion";
     case Kind::kOneDnnFusion:
@@ -165,16 +161,6 @@ Thunk::CustomCallExecuteParams::CustomCallExecuteParams(
       intra_op_thread_pool(intra_op_thread_pool),
       ffi_execution_context(ffi_execution_context) {}
 
-absl::StatusOr<Thunk::XnnParams> Thunk::XnnParams::Create(
-    const ExecutableRunOptions* run_options) {
-  TF_ASSIGN_OR_RETURN(XnnThreadpool threadpool,
-                      CreateXnnThreadpool(run_options->intra_op_thread_pool()));
-  return XnnParams(std::move(threadpool));
-}
-
-Thunk::XnnParams::XnnParams(XnnThreadpool threadpool)
-    : threadpool(std::move(threadpool)) {}
-
 #ifdef XLA_YNNPACK
 absl::StatusOr<Thunk::YnnParams> Thunk::YnnParams::Create(
     const ExecutableRunOptions* run_options) {
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk.h b/third_party/xla/xla/backends/cpu/runtime/thunk.h
index 154dea8200b7f9..0c48855c06622b 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk.h
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk.h
@@ -35,8 +35,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/buffer_allocations.h"
 #include "xla/backends/cpu/runtime/function_library.h"
 #include "xla/backends/cpu/runtime/xfeed_manager.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h"
 #include "xla/executable_run_options.h"
 #include "xla/ffi/execution_context.h"
 #include "xla/runtime/buffer_use.h"
@@ -91,7 +89,6 @@ class Thunk {
     kSort,
     kTopK,
     kWhile,
-    kXnnFusion,
     kYnnFusion,
     kOneDnnFusion,
   };
@@ -254,20 +251,6 @@ class Thunk {
                             const ffi::ExecutionContext* ffi_execution_context);
   };
 
-  //===--------------------------------------------------------------------===//
-  // XnnParams
-  //===--------------------------------------------------------------------===//
-
-  // Parameters capturing all the details required for running XNNPACK fusions.
-  struct XnnParams {
-    static absl::StatusOr<XnnParams> Create(
-        const ExecutableRunOptions* run_options);
-
-    XnnThreadpool threadpool = nullptr;
-
-    explicit XnnParams(XnnThreadpool threadpool);
-  };
-
   //===--------------------------------------------------------------------===//
   // YnnParams
   //===--------------------------------------------------------------------===//
@@ -284,7 +267,7 @@ class Thunk {
   };
 #else
   // Use XnnParams for placeholder. The parameter won't be used anyway.
-  using YnnParams = XnnParams;
+  struct YnnParams {};
 #endif  // XLA_YNNPACK
 
   //===--------------------------------------------------------------------===//
@@ -301,7 +284,6 @@ class Thunk {
     TaskRunner* task_runner = nullptr;
     CollectiveExecuteParams* collective_params = nullptr;
     CustomCallExecuteParams* custom_call_params = nullptr;
-    XnnParams* xnn_params = nullptr;
     YnnParams* ynn_params = nullptr;
     int64_t run_id = -1;          // -1 means no run id is set.
     int64_t device_ordinal = -1;  // -1 means no device ordinal is set.
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk.proto b/third_party/xla/xla/backends/cpu/runtime/thunk.proto
index 0af36ecb40e915..f4501ddce0a0c8 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk.proto
@@ -145,6 +145,7 @@ message SortThunkProto {
   repeated ShapeBufferAllocationSliceProto inputs_shapes = 5;
 }
 
+// TODO: b/467367981, this is deprecated and should be removed.
 message XnnDotThunkProto {
   DotDimensionNumbers dot_dimensions = 1;
   ShapeBufferAllocationSliceProto lhs_buffer_shape = 2;
@@ -295,6 +296,7 @@ message ThunkProto {
     CallThunkProto call_thunk = 3;
     ConditionalThunkProto conditional_thunk = 4;
     SortThunkProto sort_thunk = 5;
+    // TODO: b/467367981, this is deprecated and should be removed.
     XnnFusionThunkProto xnn_fusion_thunk = 6;
     DotThunkProto dot_thunk = 7;
     RngGetAndUpdateStateThunkProto rng_get_and_update_state_thunk = 8;
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc b/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc
index 0d515fd787b1fb..4f5255b2681b2a 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc
@@ -61,10 +61,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/thunk.pb.h"
 #include "xla/backends/cpu/runtime/topk_thunk.h"
 #include "xla/backends/cpu/runtime/while_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-#include "xla/backends/cpu/xnn_fusion_options.pb.h"
 #include "xla/backends/cpu/ynn_fusion_options.pb.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -125,20 +121,6 @@ ProtoCollectiveThunkToCollectiveThunkKind(const CollectiveThunkProto& proto) {
   }
 }
 
-static absl::StatusOr<XnnFusionThunk::XnnFusionKind>
-ProtoXnnFusionThunkToXnnFusionThunkKind(const XnnFusionThunkProto& proto) {
-  switch (proto.impl_case()) {
-    case XnnFusionThunkProto::ImplCase::kXnnFusionThunk:
-      return XnnFusionThunk::XnnFusionKind::kFusion;
-    case XnnFusionThunkProto::ImplCase::kXnnDotThunk:
-      return XnnFusionThunk::XnnFusionKind::kDot;
-    case XnnFusionThunkProto::ImplCase::kXnnConvolutionThunk:
-      return XnnFusionThunk::XnnFusionKind::kConvolution;
-    case XnnFusionThunkProto::ImplCase::IMPL_NOT_SET:
-      return Internal("XNN fusion thunk kind not set.");
-  }
-}
-
 static absl::StatusOr<Thunk::Kind> ProtoThunkToThunkKind(
     const ThunkProto& proto) {
   switch (proto.impl_case()) {
@@ -173,7 +155,7 @@ static absl::StatusOr<Thunk::Kind> ProtoThunkToThunkKind(
     case ThunkProto::ImplCase::kWhileThunk:
       return Thunk::Kind::kWhile;
     case ThunkProto::ImplCase::kXnnFusionThunk:
-      return Thunk::Kind::kXnnFusion;
+      return Internal("Thunk kind kXnnFusionThunk is deprecated.");
     case ThunkProto::ImplCase::kPartitionIdThunk:
       return Thunk::Kind::kPartitionId;
     case ThunkProto::ImplCase::kReplicaIdThunk:
@@ -756,66 +738,6 @@ static absl::Status ToProto(const YnnFusionThunk& thunk, ThunkProto& proto) {
 }
 #endif  // XLA_YNNPACK
 
-static absl::Status ToProto(const XnnFusionThunk& thunk, ThunkProto& proto) {
-  // TODO(basioli) XnnFusionThunk is not serializable because it contains
-  // a builder function that is not serializable.
-  // This would require a serialization of the XNNPACK subgraph.
-  return absl::UnimplementedError("XnnFusionThunk is not serializable.");
-}
-
-static absl::Status ToProto(const XnnDotThunk& thunk, ThunkProto& proto) {
-  XnnDotThunkProto* xnn_dot_thunk_proto =
-      proto.mutable_xnn_fusion_thunk()->mutable_xnn_dot_thunk();
-  *xnn_dot_thunk_proto->mutable_dot_dimensions() = thunk.dot_dimensions();
-  TF_RETURN_IF_ERROR(SerializeSliceShapeIntoProto(
-      thunk.dot_slices().lhs_buffer, thunk.dot_slices().lhs_shape,
-      xnn_dot_thunk_proto->mutable_lhs_buffer_shape()));
-  TF_RETURN_IF_ERROR(SerializeSliceShapeIntoProto(
-      thunk.dot_slices().rhs_buffer, thunk.dot_slices().rhs_shape,
-      xnn_dot_thunk_proto->mutable_rhs_buffer_shape()));
-  TF_RETURN_IF_ERROR(SerializeSliceShapeIntoProto(
-      thunk.dot_slices().out_buffer, thunk.dot_slices().out_shape,
-      xnn_dot_thunk_proto->mutable_out_buffer_shape()));
-  proto.mutable_xnn_fusion_thunk()->mutable_options()->set_use_threadpool(
-      thunk.options().use_threadpool);
-  xnn_dot_thunk_proto->set_capture_rhs(thunk.capture_rhs());
-  return absl::OkStatus();
-}
-
-static absl::Status ToProto(const XnnConvolutionThunk& thunk,
-                            ThunkProto& proto) {
-  XnnConvolutionThunkProto* convolution_thunk_proto =
-      proto.mutable_xnn_fusion_thunk()->mutable_xnn_convolution_thunk();
-
-  const std::string dnums_as_str = thunk.dnums().SerializeAsString();
-  convolution_thunk_proto->mutable_dimension_numbers()->ParseFromString(
-      dnums_as_str);
-
-  const std::string window_as_str = thunk.window().SerializeAsString();
-  convolution_thunk_proto->mutable_window()->ParseFromString(window_as_str);
-
-  convolution_thunk_proto->set_feature_group_count(thunk.feature_group_count());
-
-  const ConvolutionSlices& convolution_slices = thunk.convolution_slices();
-
-  TF_RETURN_IF_ERROR(SerializeSliceShapeIntoProto(
-      convolution_slices.input_buffer, convolution_slices.input_shape,
-      convolution_thunk_proto->mutable_input_buffer_shape()));
-
-  TF_RETURN_IF_ERROR(SerializeSliceShapeIntoProto(
-      convolution_slices.output_buffer, convolution_slices.output_shape,
-      convolution_thunk_proto->mutable_output_buffer_shape()));
-
-  TF_RETURN_IF_ERROR(SerializeSliceShapeIntoProto(
-      convolution_slices.kernel_buffer, convolution_slices.kernel_shape,
-      convolution_thunk_proto->mutable_kernel_buffer_shape()));
-
-  proto.mutable_xnn_fusion_thunk()->mutable_options()->set_use_threadpool(
-      thunk.options().use_threadpool);
-
-  return absl::OkStatus();
-}
-
 static absl::Status ToProto(const FftThunk& thunk, ThunkProto& proto) {
   FftThunkProto* fft_thunk_proto = proto.mutable_fft_thunk();
 
@@ -983,25 +905,6 @@ absl::StatusOr<ThunkProto> ThunkSerDesProtobuf::ToProto(
       TF_RETURN_IF_ERROR(
           ::xla::cpu::ToProto(tsl::down_cast<const WhileThunk&>(thunk), proto));
       break;
-    case Thunk::Kind::kXnnFusion: {
-      const XnnFusionThunk& xnn_fusion_thunk =
-          tsl::down_cast<const XnnFusionThunk&>(thunk);
-      switch (xnn_fusion_thunk.xnn_fusion_kind()) {
-        case XnnFusionThunk::XnnFusionKind::kFusion:
-          TF_RETURN_IF_ERROR(::xla::cpu::ToProto(
-              tsl::down_cast<const XnnFusionThunk&>(thunk), proto));
-          break;
-        case XnnFusionThunk::XnnFusionKind::kDot:
-          TF_RETURN_IF_ERROR(::xla::cpu::ToProto(
-              tsl::down_cast<const XnnDotThunk&>(thunk), proto));
-          break;
-        case XnnFusionThunk::XnnFusionKind::kConvolution:
-          TF_RETURN_IF_ERROR(::xla::cpu::ToProto(
-              tsl::down_cast<const XnnConvolutionThunk&>(thunk), proto));
-          break;
-      }
-      break;
-    }
     case Thunk::Kind::kPartitionId:
       TF_RETURN_IF_ERROR(::xla::cpu::ToProto(
           static_cast<const PartitionIdThunk&>(
@@ -1632,93 +1535,6 @@ static absl::StatusOr<std::unique_ptr<YnnFusionThunk>> YnnFusionThunkFromProto(
 }
 #endif  // XLA_YNNPACK
 
-static absl::StatusOr<std::unique_ptr<XnnFusionThunk>> XnnFusionThunkFromProto(
-    const ThunkProto& proto,
-    const std::vector<BufferAllocation>& buffer_allocations) {
-  return absl::UnimplementedError("XnnFusionThunkFromProto is not implemented");
-}
-
-static absl::StatusOr<std::unique_ptr<XnnDotThunk>> XnnDotThunkFromProto(
-    const ThunkProto& proto,
-    const std::vector<BufferAllocation>& buffer_allocations) {
-  TF_ASSIGN_OR_RETURN(Thunk::Info info, ThunkInfoFromProto(proto.info()));
-
-  XnnDotThunk::Options options = {
-      proto.xnn_fusion_thunk().options().use_threadpool(),
-  };
-
-  TF_ASSIGN_OR_RETURN(
-      auto lhs_slice_shape,
-      DeserializeSliceShapeFromProto(
-          proto.xnn_fusion_thunk().xnn_dot_thunk().lhs_buffer_shape(),
-          buffer_allocations));
-
-  TF_ASSIGN_OR_RETURN(
-      auto rhs_slice_shape,
-      DeserializeSliceShapeFromProto(
-          proto.xnn_fusion_thunk().xnn_dot_thunk().rhs_buffer_shape(),
-          buffer_allocations));
-  TF_ASSIGN_OR_RETURN(
-      auto out_slice_shape,
-      DeserializeSliceShapeFromProto(
-          proto.xnn_fusion_thunk().xnn_dot_thunk().out_buffer_shape(),
-          buffer_allocations));
-
-  const auto& [lhs_buffer, lhs_shape] = lhs_slice_shape;
-  const auto& [rhs_buffer, rhs_shape] = rhs_slice_shape;
-  const auto& [out_buffer, out_shape] = out_slice_shape;
-
-  bool capture_rhs = proto.xnn_fusion_thunk().xnn_dot_thunk().capture_rhs();
-
-  return XnnDotThunk::Create(
-      std::move(options), std::move(info),
-      proto.xnn_fusion_thunk().xnn_dot_thunk().dot_dimensions(), lhs_buffer,
-      lhs_shape, rhs_buffer, rhs_shape, out_buffer, out_shape, capture_rhs);
-}
-
-static absl::StatusOr<std::unique_ptr<XnnConvolutionThunk>>
-XnnConvolutionThunkFromProto(
-    const ThunkProto& proto,
-    const std::vector<BufferAllocation>& buffer_allocations) {
-  TF_ASSIGN_OR_RETURN(Thunk::Info info, ThunkInfoFromProto(proto.info()));
-
-  XnnConvolutionThunk::Options options = {
-      proto.xnn_fusion_thunk().options().use_threadpool(),
-  };
-
-  const auto& conv_proto = proto.xnn_fusion_thunk().xnn_convolution_thunk();
-
-  // Dimension numbers.
-  ConvolutionDimensionNumbers dnums = conv_proto.dimension_numbers();
-
-  // Window.
-  Window window = conv_proto.window();
-
-  // Feature group count.
-  int64_t feature_group_count = conv_proto.feature_group_count();
-
-  TF_ASSIGN_OR_RETURN(auto input_slice_shape,
-                      DeserializeSliceShapeFromProto(
-                          conv_proto.input_buffer_shape(), buffer_allocations));
-  TF_ASSIGN_OR_RETURN(
-      auto kernel_slice_shape,
-      DeserializeSliceShapeFromProto(conv_proto.kernel_buffer_shape(),
-                                     buffer_allocations));
-  TF_ASSIGN_OR_RETURN(
-      auto output_slice_shape,
-      DeserializeSliceShapeFromProto(conv_proto.output_buffer_shape(),
-                                     buffer_allocations));
-
-  const auto& [input_buffer, input_shape] = input_slice_shape;
-  const auto& [kernel_buffer, kernel_shape] = kernel_slice_shape;
-  const auto& [output_buffer, output_shape] = output_slice_shape;
-
-  return XnnConvolutionThunk::Create(
-      std::move(options), std::move(info), std::move(input_buffer), input_shape,
-      std::move(kernel_buffer), kernel_shape, std::move(output_buffer),
-      output_shape, dnums, window, feature_group_count);
-}
-
 static absl::StatusOr<std::unique_ptr<Thunk>> PartitionIdThunkFromProto(
     const ThunkProto& proto,
     const std::vector<BufferAllocation>& buffer_allocations) {
@@ -1813,19 +1629,6 @@ absl::StatusOr<std::unique_ptr<Thunk>> ThunkSerDesProtobuf::FromProto(
       return TopKThunkFromProto(proto, *buffer_allocations_);
     case Thunk::Kind::kWhile:
       return WhileThunkFromProto(proto, hlo_module_, buffer_allocations_);
-    case Thunk::Kind::kXnnFusion: {
-      TF_ASSIGN_OR_RETURN(
-          auto xnn_fusion_kind,
-          ProtoXnnFusionThunkToXnnFusionThunkKind(proto.xnn_fusion_thunk()));
-      switch (xnn_fusion_kind) {
-        case XnnFusionThunk::XnnFusionKind::kFusion:
-          return XnnFusionThunkFromProto(proto, *buffer_allocations_);
-        case XnnFusionThunk::XnnFusionKind::kDot:
-          return XnnDotThunkFromProto(proto, *buffer_allocations_);
-        case XnnFusionThunk::XnnFusionKind::kConvolution:
-          return XnnConvolutionThunkFromProto(proto, *buffer_allocations_);
-      }
-    }
     case Thunk::Kind::kPartitionId:
       return PartitionIdThunkFromProto(proto, *buffer_allocations_);
     case Thunk::Kind::kReplicaId:
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc b/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc
index a8e425116af038..9fd66395fa1a28 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc
@@ -59,9 +59,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/thunk_testlib.h"
 #include "xla/backends/cpu/runtime/topk_thunk.h"
 #include "xla/backends/cpu/runtime/while_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
 #include "xla/ffi/ffi.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/literal.h"
@@ -185,9 +182,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
     TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(), CreateTopKThunk());
     TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(), CreateWhileThunk());
     TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(), CreateWhileThunk(1));
-    TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(), CreateXnnDotThunk());
-    TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(),
-                        CreateXnnConvolutionThunk());
     TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(), CreateKernelThunk());
     TF_ASSIGN_OR_RETURN(thunk_sequence.emplace_back(),
                         CreateConvolutionThunk());
@@ -606,79 +600,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
         /*trip_count=*/trip_count);
   }
 
-  absl::StatusOr<std::unique_ptr<Thunk>> CreateXnnDotThunk() {
-    TF_RETURN_IF_ERROR(AddBufferAllocations(3));
-    DotDimensionNumbers dot_dimensions;
-    dot_dimensions.add_lhs_contracting_dimensions(1);
-    dot_dimensions.add_rhs_contracting_dimensions(0);
-    return XnnDotThunk::Create(
-        XnnFusionThunk::Options(), Thunk::Info(),
-        /*dot_dimensions=*/dot_dimensions,
-        /*lhs_buffer=*/
-        CreateBufferAllocationSlice(
-            buffer_allocations_[buffer_allocations_.size() - 3]),
-        /*lhs_shape=*/literals_[buffer_allocations_.size() - 3].shape(),
-        /*rhs_buffer=*/
-        CreateBufferAllocationSlice(
-            buffer_allocations_[buffer_allocations_.size() - 2]),
-        /*rhs_shape=*/literals_[buffer_allocations_.size() - 2].shape(),
-        /*out_buffer=*/
-        CreateBufferAllocationSlice(
-            buffer_allocations_[buffer_allocations_.size() - 1]),
-        /*out_shape=*/literals_[buffer_allocations_.size() - 1].shape(), true);
-  }
-
-  absl::StatusOr<std::unique_ptr<Thunk>> CreateXnnConvolutionThunk() {
-    std::vector<int64_t> input_dims = {1, 8, 8, 16};
-    std::vector<int64_t> kernel_dims = {32, 1, 1, 16};
-    std::vector<int64_t> output_dims = {1, 8, 8, 32};
-
-    // Convolution rank inferred from the input dimensions.
-    int convolution_rank = input_dims.size() - 2;
-
-    // Convolution parameters.
-    ConvolutionDimensionNumbers conv_dims =
-        MakeConvolutionDimensionNumbers(convolution_rank);
-    Window window = MakeWindow(convolution_rank);
-
-    // Adjust kernel dimensions for XNNPACK.
-    conv_dims.set_kernel_input_feature_dimension(3);
-    conv_dims.set_kernel_output_feature_dimension(0);
-    conv_dims.set_kernel_spatial_dimensions(0, 1);
-    conv_dims.set_kernel_spatial_dimensions(1, 2);
-
-    // Actual data.
-    literals_.push_back(
-        LiteralUtil::CreateFull<float>(input_dims, 0.0));  // input
-    literals_.push_back(
-        LiteralUtil::CreateFull<float>(kernel_dims, 0.0));  // kernel
-    literals_.push_back(
-        LiteralUtil::CreateFull<float>(output_dims, 0.0));  // output
-
-    TF_RETURN_IF_ERROR(buffer_allocations_.push_back(CreateBufferAllocation(
-        buffer_allocations_.size(), literals_[literals_.size() - 3])));
-    TF_RETURN_IF_ERROR(buffer_allocations_.push_back(CreateBufferAllocation(
-        buffer_allocations_.size(), literals_[literals_.size() - 2])));
-    TF_RETURN_IF_ERROR(buffer_allocations_.push_back(CreateBufferAllocation(
-        buffer_allocations_.size(), literals_[literals_.size() - 1])));
-
-    return XnnConvolutionThunk::Create(
-        XnnFusionThunk::Options(), Thunk::Info(),
-        /*input_buffer=*/
-        CreateBufferAllocationSlice(
-            buffer_allocations_[buffer_allocations_.size() - 3]),
-        /*input_shape=*/literals_[buffer_allocations_.size() - 3].shape(),
-        /*kernel_buffer=*/
-        CreateBufferAllocationSlice(
-            buffer_allocations_[buffer_allocations_.size() - 2]),
-        /*kernel_shape=*/literals_[buffer_allocations_.size() - 2].shape(),
-        /*output_buffer=*/
-        CreateBufferAllocationSlice(
-            buffer_allocations_[buffer_allocations_.size() - 1]),
-        /*output_shape=*/literals_[buffer_allocations_.size() - 1].shape(),
-        conv_dims, window, /*feature_group_count=*/1);
-  }
-
   absl::StatusOr<std::unique_ptr<Thunk>> CreateKernelThunk() {
     TF_RETURN_IF_ERROR(AddBufferAllocations(2));
     return KernelThunk::Create(
@@ -1107,13 +1028,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
            thunk_1.trip_count() == thunk_2.trip_count();
   }
 
-  bool VerifyXnnFusionThunkEquality(const XnnFusionThunk& thunk_1,
-                                    const XnnFusionThunk& thunk_2) {
-    // TODO(basioli) assume this is always false until we implement
-    // serialization of XnnFusionThunk.
-    return false;
-  }
-
 #ifdef XLA_YNNPACK
   bool VerifyYnnFusionThunkEquality(const YnnFusionThunk& thunk_1,
                                     const YnnFusionThunk& thunk_2) {
@@ -1123,98 +1037,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
   }
 #endif  // XLA_YNNPACK
 
-  bool VerifyXnnDotThunkEquality(const XnnDotThunk& thunk_1,
-                                 const XnnDotThunk& thunk_2) {
-    const bool are_dot_dimensions_equal =
-        absl::c_equal(thunk_1.dot_dimensions().lhs_batch_dimensions(),
-                      thunk_2.dot_dimensions().lhs_batch_dimensions()) &&
-        absl::c_equal(thunk_1.dot_dimensions().rhs_batch_dimensions(),
-                      thunk_2.dot_dimensions().rhs_batch_dimensions()) &&
-        absl::c_equal(thunk_1.dot_dimensions().lhs_contracting_dimensions(),
-                      thunk_2.dot_dimensions().lhs_contracting_dimensions()) &&
-        absl::c_equal(thunk_1.dot_dimensions().rhs_contracting_dimensions(),
-                      thunk_2.dot_dimensions().rhs_contracting_dimensions());
-
-    const bool are_options_equal =
-        thunk_1.options().use_threadpool == thunk_2.options().use_threadpool;
-
-    const bool is_capturing_rhs_equal =
-        thunk_1.capture_rhs() == thunk_2.capture_rhs();
-
-    return are_options_equal && are_dot_dimensions_equal &&
-           is_capturing_rhs_equal &&
-           VerifySliceShapeEquality(thunk_1.dot_slices().lhs_buffer,
-                                    thunk_1.dot_slices().lhs_shape,
-                                    thunk_2.dot_slices().lhs_buffer,
-                                    thunk_2.dot_slices().lhs_shape) &&
-           VerifySliceShapeEquality(thunk_1.dot_slices().rhs_buffer,
-                                    thunk_1.dot_slices().rhs_shape,
-                                    thunk_2.dot_slices().rhs_buffer,
-                                    thunk_2.dot_slices().rhs_shape) &&
-           VerifySliceShapeEquality(
-               thunk_1.dot_slices().out_buffer, thunk_1.dot_slices().out_shape,
-               thunk_2.dot_slices().out_buffer, thunk_2.dot_slices().out_shape);
-  }
-
-  bool VerifyXnnConvolutionThunkEquality(const XnnConvolutionThunk& thunk_1,
-                                         const XnnConvolutionThunk& thunk_2) {
-    const bool are_dnums_equal =
-        absl::c_equal(thunk_1.dnums().input_spatial_dimensions(),
-                      thunk_2.dnums().input_spatial_dimensions()) &&
-        absl::c_equal(thunk_1.dnums().kernel_spatial_dimensions(),
-                      thunk_2.dnums().kernel_spatial_dimensions()) &&
-        absl::c_equal(thunk_1.dnums().output_spatial_dimensions(),
-                      thunk_2.dnums().output_spatial_dimensions()) &&
-        thunk_1.dnums().input_batch_dimension() ==
-            thunk_2.dnums().input_batch_dimension() &&
-        thunk_1.dnums().input_feature_dimension() ==
-            thunk_2.dnums().input_feature_dimension() &&
-        thunk_1.dnums().kernel_input_feature_dimension() ==
-            thunk_2.dnums().kernel_input_feature_dimension() &&
-        thunk_1.dnums().kernel_output_feature_dimension() ==
-            thunk_2.dnums().kernel_output_feature_dimension() &&
-        thunk_1.dnums().output_batch_dimension() ==
-            thunk_2.dnums().output_batch_dimension() &&
-        thunk_1.dnums().output_feature_dimension() ==
-            thunk_2.dnums().output_feature_dimension();
-
-    const bool are_options_equal =
-        thunk_1.options().use_threadpool == thunk_2.options().use_threadpool;
-
-    const bool are_windows_equal = absl::c_equal(
-        thunk_1.window().dimensions(), thunk_2.window().dimensions(),
-        [](const WindowDimension& window_dimension_1,
-           const WindowDimension& window_dimension_2) {
-          return window_dimension_1.size() == window_dimension_2.size() &&
-                 window_dimension_1.stride() == window_dimension_2.stride() &&
-                 window_dimension_1.padding_low() ==
-                     window_dimension_2.padding_low() &&
-                 window_dimension_1.padding_high() ==
-                     window_dimension_2.padding_high() &&
-                 window_dimension_1.window_dilation() ==
-                     window_dimension_2.window_dilation() &&
-                 window_dimension_1.base_dilation() ==
-                     window_dimension_2.base_dilation() &&
-                 window_dimension_1.window_reversal() ==
-                     window_dimension_2.window_reversal();
-        });
-
-    return are_dnums_equal && are_windows_equal && are_options_equal &&
-           thunk_1.feature_group_count() == thunk_2.feature_group_count() &&
-           VerifySliceShapeEquality(thunk_1.convolution_slices().input_buffer,
-                                    thunk_1.convolution_slices().input_shape,
-                                    thunk_2.convolution_slices().input_buffer,
-                                    thunk_2.convolution_slices().input_shape);
-    VerifySliceShapeEquality(thunk_1.convolution_slices().kernel_buffer,
-                             thunk_1.convolution_slices().kernel_shape,
-                             thunk_2.convolution_slices().kernel_buffer,
-                             thunk_2.convolution_slices().kernel_shape);
-    VerifySliceShapeEquality(thunk_1.convolution_slices().output_buffer,
-                             thunk_1.convolution_slices().output_shape,
-                             thunk_2.convolution_slices().output_buffer,
-                             thunk_2.convolution_slices().output_shape);
-  }
-
   bool VerifyKernelThunkEquality(const KernelThunkBase& thunk_1,
                                  const KernelThunkBase& thunk_2) {
     return thunk_1.kernel_name() == thunk_2.kernel_name() &&
@@ -1408,30 +1230,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
         return VerifyWhileThunkEquality(
             tsl::down_cast<const WhileThunk&>(thunk_1),
             tsl::down_cast<const WhileThunk&>(thunk_2));
-      case Thunk::Kind::kXnnFusion: {
-        const XnnFusionThunk& xnn_fusion_thunk_1 =
-            tsl::down_cast<const XnnFusionThunk&>(thunk_1);
-        const XnnFusionThunk& xnn_fusion_thunk_2 =
-            tsl::down_cast<const XnnFusionThunk&>(thunk_2);
-        if (xnn_fusion_thunk_1.xnn_fusion_kind() !=
-            xnn_fusion_thunk_2.xnn_fusion_kind()) {
-          return false;
-        }
-        switch (xnn_fusion_thunk_1.xnn_fusion_kind()) {
-          case XnnFusionThunk::XnnFusionKind::kFusion:
-            return VerifyXnnFusionThunkEquality(
-                tsl::down_cast<const XnnFusionThunk&>(thunk_1),
-                tsl::down_cast<const XnnFusionThunk&>(thunk_2));
-          case XnnFusionThunk::XnnFusionKind::kDot:
-            return VerifyXnnDotThunkEquality(
-                tsl::down_cast<const XnnDotThunk&>(thunk_1),
-                tsl::down_cast<const XnnDotThunk&>(thunk_2));
-          case XnnFusionThunk::XnnFusionKind::kConvolution:
-            return VerifyXnnConvolutionThunkEquality(
-                tsl::down_cast<const XnnConvolutionThunk&>(thunk_1),
-                tsl::down_cast<const XnnConvolutionThunk&>(thunk_2));
-        }
-      }
       case Thunk::Kind::kYnnFusion: {
 #ifdef XLA_YNNPACK
         const YnnFusionThunk& ynn_fusion_thunk_1 =
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD b/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD
deleted file mode 100644
index dc32dac687585b..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/BUILD
+++ /dev/null
@@ -1,213 +0,0 @@
-load("//xla:xla.default.bzl", "xla_cc_test")
-load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
-
-package(
-    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
-    default_visibility = [":friends"],
-    licenses = ["notice"],
-)
-
-package_group(
-    name = "friends",
-    includes = [
-        "//xla:friends",
-    ],
-)
-
-cc_library(
-    name = "xnn_interop",
-    srcs = ["xnn_interop.cc"],
-    hdrs = ["xnn_interop.h"],
-    deps = [
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla/tsl/platform:logging",
-        "@XNNPACK",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/functional:function_ref",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-    ],
-)
-
-cc_library(
-    name = "xnn_threadpool",
-    srcs = ["xnn_threadpool.cc"],
-    hdrs = ["xnn_threadpool.h"],
-    deps = [
-        ":xnn_interop",
-        "@XNNPACK",
-        "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/status:statusor",
-        "@eigen_archive//:eigen3",
-    ],
-)
-
-cc_library(
-    name = "xnn_convolution_thunk",
-    srcs = ["xnn_convolution_thunk.cc"],
-    hdrs = ["xnn_convolution_thunk.h"],
-    deps = [
-        ":xnn_fusion_thunk",
-        ":xnn_interop",
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime:convolution_dims",
-        "//xla/backends/cpu/runtime:thunk",
-        "//xla/service:buffer_assignment",
-        "//xla/stream_executor:device_address",
-        "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:statusor",
-        "@XNNPACK",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-xla_cc_test(
-    name = "xnn_convolution_thunk_test",
-    srcs = ["xnn_convolution_thunk_test.cc"],
-    deps = [
-        ":xnn_convolution_thunk",
-        ":xnn_interop",
-        ":xnn_threadpool",
-        "//xla:error_spec",
-        "//xla:literal",
-        "//xla:literal_util",
-        "//xla:shape_util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime:buffer_allocations",
-        "//xla/backends/cpu/runtime:thunk",
-        "//xla/backends/cpu/runtime:thunk_testlib",
-        "//xla/hlo/evaluator:hlo_evaluator",
-        "//xla/hlo/ir:hlo",
-        "//xla/hlo/parser:hlo_parser",
-        "//xla/hlo/utils:hlo_query",
-        "//xla/service:hlo_module_config",
-        "//xla/tests:literal_test_util",
-        "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:env",
-        "//xla/tsl/platform:statusor",
-        "//xla/tsl/platform:test",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-        "@com_google_googletest//:gtest_main",
-        "@eigen_archive//:eigen3",
-    ],
-)
-
-cc_library(
-    name = "xnn_dot_thunk",
-    srcs = ["xnn_dot_thunk.cc"],
-    hdrs = ["xnn_dot_thunk.h"],
-    deps = [
-        ":xnn_fusion_thunk",
-        ":xnn_interop",
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime:dot_dims",
-        "//xla/backends/cpu/runtime:thunk",
-        "//xla/service:buffer_assignment",
-        "//xla/stream_executor:device_address",
-        "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:statusor",
-        "@XNNPACK",
-        "@com_google_absl//absl/functional:bind_front",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-xla_cc_test(
-    name = "xnn_dot_thunk_test",
-    srcs = ["xnn_dot_thunk_test.cc"],
-    deps = [
-        ":xnn_dot_thunk",
-        ":xnn_interop",
-        ":xnn_threadpool",
-        "//xla:literal_util",
-        "//xla:shape_util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime:buffer_allocations",
-        "//xla/backends/cpu/runtime:thunk",
-        "//xla/backends/cpu/runtime:thunk_testlib",
-        "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:env",
-        "//xla/tsl/platform:statusor",
-        "//xla/tsl/platform:test",
-        "@com_google_absl//absl/strings",
-        "@com_google_googletest//:gtest_main",
-        "@eigen_archive//:eigen3",
-        "@local_tsl//tsl/platform:platform_port",
-    ],
-)
-
-cc_library(
-    name = "xnn_fusion_thunk",
-    srcs = ["xnn_fusion_thunk.cc"],
-    hdrs = ["xnn_fusion_thunk.h"],
-    deps = [
-        ":xnn_interop",
-        "//xla:shape_util",
-        "//xla/backends/cpu/runtime:thunk",
-        "//xla/runtime:buffer_use",
-        "//xla/runtime:object_pool",
-        "//xla/service:buffer_assignment",
-        "//xla/stream_executor:device_address",
-        "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:logging",
-        "//xla/tsl/platform:statusor",
-        "@XNNPACK",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:no_destructor",
-        "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/functional:any_invocable",
-        "@com_google_absl//absl/functional:bind_front",
-        "@com_google_absl//absl/functional:function_ref",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/memory",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
-xla_cc_test(
-    name = "xnn_fusion_thunk_test",
-    srcs = ["xnn_fusion_thunk_test.cc"],
-    deps = [
-        ":xnn_fusion_thunk",
-        ":xnn_interop",
-        ":xnn_threadpool",
-        "//xla:literal_util",
-        "//xla:shape_util",
-        "//xla:xla_data_proto_cc",
-        "//xla/backends/cpu/runtime:buffer_allocations",
-        "//xla/backends/cpu/runtime:thunk",
-        "//xla/backends/cpu/runtime:thunk_testlib",
-        "//xla/tsl/concurrency:async_value",
-        "//xla/tsl/platform:env",
-        "//xla/tsl/platform:statusor",
-        "//xla/tsl/platform:test",
-        "@XNNPACK",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
-        "@com_google_googletest//:gtest_main",
-        "@eigen_archive//:eigen3",
-    ],
-)
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.cc
deleted file mode 100644
index 0d83fbec77698d..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.cc
+++ /dev/null
@@ -1,192 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <functional>
-#include <limits>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "xnnpack.h"
-#include "absl/memory/memory.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/convolution_dims.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/service/buffer_assignment.h"
-#include "xla/shape.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/logging.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/util.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-absl::StatusOr<XnnSubgraph> XnnConvolutionThunk::BuildConvolutionSubgraph(
-    absl::Span<const Argument> arguments, absl::Span<const Result> results,
-    absl::Span<const se::DeviceAddressBase> arguments_buffers) {
-  TF_ASSIGN_OR_RETURN(XnnSubgraph subgraph,
-                      CreateXnnSubgraph([&](xnn_subgraph_t* subgraph) {
-                        return xnn_create_subgraph(
-                            /*external_value_ids=*/3,
-                            /*flags=*/0, subgraph);
-                      }));
-
-  uint32_t input_id = XNN_INVALID_VALUE_ID;
-  uint32_t kernel_id = XNN_INVALID_VALUE_ID;
-  uint32_t out_id = XNN_INVALID_VALUE_ID;
-
-  auto dims = [](absl::Span<const int64_t> dims) -> std::vector<size_t> {
-    return {dims.begin(), dims.end()};
-  };
-
-  VLOG(3) << absl::StreamFormat(
-      "Create XNNPACK convolution: input_shape=%s kernel_shape=%s out_shape=%s",
-      convolution_slices_.input_shape.ToString(true),
-      convolution_slices_.kernel_shape.ToString(true),
-      convolution_slices_.output_shape.ToString(true));
-
-  std::vector<size_t> input_dims =
-      dims(convolution_slices_.input_shape.dimensions());
-  std::vector<size_t> kernel_dims =
-      dims(convolution_slices_.kernel_shape.dimensions());
-  std::vector<size_t> out_dims =
-      dims(convolution_slices_.output_shape.dimensions());
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), xnn_datatype_fp32, input_dims.size(), input_dims.data(),
-      nullptr,
-      /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), xnn_datatype_fp32, kernel_dims.size(), kernel_dims.data(),
-      /*data=*/arguments_buffers[1].opaque(),
-      /*external_id=*/1, /*flags=*/0, &kernel_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), xnn_datatype_fp32, out_dims.size(), out_dims.data(),
-      nullptr,
-      /*external_id=*/2, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &out_id));
-
-  auto& ds = convolution_canonical_dims_;
-  XNN_RETURN_IF_ERROR(xnn_define_convolution_2d(
-      subgraph.get(),  //
-      /*input_padding_top=*/ds.padding_before.x,
-      /*input_padding_right=*/ds.padding_before.y,
-      /*input_padding_bottom=*/ds.padding_after.x,
-      /*input_padding_left=*/ds.padding_after.y,
-      /*kernel_height=*/ds.kernel_dims.x,
-      /*kernel_width=*/ds.kernel_dims.y,
-      /*subsampling_height=*/ds.strides.x,
-      /*subsampling_width=*/ds.strides.y,
-      /*dilation_height=*/ds.base_dilation.x,
-      /*dilation_width=*/ds.base_dilation.y,
-      /*groups=*/ds.feature_group_count,
-      /*group_input_channels=*/ds.input_channels,
-      /*group_output_channels=*/ds.kernel_filters,
-      /*output_min=*/std::numeric_limits<float>::lowest(),
-      /*output_max=*/std::numeric_limits<float>::max(), input_id, kernel_id,
-      /*bias_id=*/XNN_INVALID_VALUE_ID, out_id,
-      /*flags=*/XNN_FLAG_TENSORFLOW_SAME_PADDING));
-
-  return subgraph;
-}
-
-absl::StatusOr<std::unique_ptr<XnnConvolutionThunk>>
-XnnConvolutionThunk::Create(
-    Options options, Info info, BufferAllocation::Slice input_buffer,
-    const Shape& input_shape, BufferAllocation::Slice kernel_buffer,
-    const Shape& kernel_shape, BufferAllocation::Slice output_buffer,
-    const Shape& output_shape, const ConvolutionDimensionNumbers& dnums,
-    const Window& window, int64_t feature_group_count) {
-  TF_RETURN_IF_ERROR(InitializeXnnPack());
-
-  if (dnums.kernel_input_feature_dimension() != 3 ||
-      dnums.kernel_output_feature_dimension() != 0) {
-    return InvalidArgument(
-        "XNNPACK convolution expects kernel (filter) in OHWI format");
-  }
-
-  ConvolutionSlices slices = {input_buffer, input_shape,   kernel_buffer,
-                              kernel_shape, output_buffer, output_shape};
-
-  TF_ASSIGN_OR_RETURN(
-      ConvolutionCanonicalDims canonical_dims,
-      GetConvolutionCanonicalDims(slices, dnums, window, feature_group_count));
-
-  return absl::WrapUnique(new XnnConvolutionThunk(
-      std::move(options), std::move(info), std::move(slices),
-      std::move(canonical_dims), dnums, window));
-}
-
-static std::vector<XnnFusionThunk::Argument> ConvolutionArguments(
-    const ConvolutionSlices& slices) {
-  return {XnnFusionThunk::Argument{slices.input_buffer, slices.input_shape},
-          XnnFusionThunk::Argument{slices.kernel_buffer, slices.kernel_shape}};
-}
-
-static std::vector<XnnFusionThunk::Result> ConvolutionResults(
-    const ConvolutionSlices& slices) {
-  return {XnnFusionThunk::Result{slices.output_buffer, slices.output_shape}};
-}
-
-XnnConvolutionThunk::XnnConvolutionThunk(
-    Options options, Info info, ConvolutionSlices convolution_slices,
-    ConvolutionCanonicalDims convolution_canonical_dims,
-    ConvolutionDimensionNumbers dnums, Window window)
-    : XnnFusionThunk(XnnFusionKind::kConvolution, std::move(options),
-                     std::move(info), ConvolutionArguments(convolution_slices),
-                     ConvolutionResults(convolution_slices),
-                     CapturingBuilder(std::bind(
-                         &XnnConvolutionThunk::BuildConvolutionSubgraph, this,
-                         std::placeholders::_1, std::placeholders::_2,
-                         std::placeholders::_3)),
-                     /*captured_arguments_ids=*/{1}),
-      convolution_slices_(std::move(convolution_slices)),
-      convolution_canonical_dims_(std::move(convolution_canonical_dims)),
-      dnums_(std::move(dnums)),
-      window_(std::move(window)) {}
-
-std::string XnnConvolutionThunk::fusion_kind() const { return "convolution"; }
-
-std::string XnnConvolutionThunk::fusion_description() const {
-  return absl::StrFormat("convolution_rank=%d",
-                         convolution_canonical_dims_.convolution_rank());
-}
-
-std::vector<std::string> XnnConvolutionThunk::fusion_details() const {
-  return {absl::StrCat(convolution_canonical_dims_)};
-}
-
-std::string XnnConvolutionThunk::argument_name(size_t index) const {
-  return index == 0 ? "input" : "kernel";
-}
-
-std::string XnnConvolutionThunk::result_name(size_t index) const {
-  return "out";
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h
deleted file mode 100644
index 7269ddff7f20d9..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_CONVOLUTION_THUNK_H_
-#define XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_CONVOLUTION_THUNK_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "absl/status/statusor.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/convolution_dims.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-#include "xla/service/buffer_assignment.h"
-#include "xla/shape.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-// Convolution operation implemented on top of XNNPACK.
-class XnnConvolutionThunk final : public XnnFusionThunk {
- public:
-  static absl::StatusOr<std::unique_ptr<XnnConvolutionThunk>> Create(
-      Options options, Info info, BufferAllocation::Slice input_buffer,
-      const Shape& input_shape, BufferAllocation::Slice kernel_buffer,
-      const Shape& kernel_shape, BufferAllocation::Slice output_buffer,
-      const Shape& output_shape, const ConvolutionDimensionNumbers& dnums,
-      const Window& window, int64_t feature_group_count);
-
-  ConvolutionDimensionNumbers dnums() const { return dnums_; }
-  Window window() const { return window_; }
-
-  int64_t feature_group_count() const {
-    return convolution_canonical_dims_.feature_group_count;
-  }
-
-  const ConvolutionSlices& convolution_slices() const {
-    return convolution_slices_;
-  }
-
- protected:
-  std::string fusion_kind() const final;
-  std::string fusion_description() const final;
-
-  bool has_fusion_details() const final { return true; }
-  std::vector<std::string> fusion_details() const final;
-
-  std::string argument_name(size_t index) const final;
-  std::string result_name(size_t index) const final;
-
- private:
-  XnnConvolutionThunk(Options options, Info info,
-                      ConvolutionSlices convolution_slices,
-                      ConvolutionCanonicalDims convolution_canonical_dims,
-                      ConvolutionDimensionNumbers dnums, Window window);
-
-  absl::StatusOr<XnnSubgraph> BuildConvolutionSubgraph(
-      absl::Span<const Argument> arguments, absl::Span<const Result> results,
-      absl::Span<const se::DeviceAddressBase> arguments_buffers);
-
-  ConvolutionSlices convolution_slices_;
-  ConvolutionCanonicalDims convolution_canonical_dims_;
-
-  // Convolution operation parameters that were used to construct this thunk. We
-  // only keep them around to be able to serialize/deserialize thunk.
-  ConvolutionDimensionNumbers dnums_;
-  Window window_;
-};
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_CONVOLUTION_THUNK_H_
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk_test.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk_test.cc
deleted file mode 100644
index 32eb78cf23f09c..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk_test.cc
+++ /dev/null
@@ -1,202 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_convolution_thunk.h"
-
-#include <cstdint>
-#include <memory>
-#include <random>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-#include "absl/log/check.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/substitute.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/buffer_allocations.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/thunk_testlib.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h"
-#include "xla/error_spec.h"
-#include "xla/hlo/evaluator/hlo_evaluator.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_module.h"
-#include "xla/hlo/parser/hlo_parser.h"
-#include "xla/hlo/utils/hlo_query.h"
-#include "xla/layout.h"
-#include "xla/layout_util.h"
-#include "xla/literal.h"
-#include "xla/literal_util.h"
-#include "xla/service/hlo_module_config.h"
-#include "xla/shape.h"
-#include "xla/shape_util.h"
-#include "xla/tests/literal_test_util.h"
-#include "xla/tsl/concurrency/async_value_ref.h"
-#include "xla/tsl/platform/env.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/tsl/platform/test.h"
-#include "xla/tsl/platform/threadpool.h"
-#include "xla/xla_data.pb.h"
-
-#define EIGEN_USE_THREADS
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace xla::cpu {
-namespace {
-
-class XnnConvolutionThunkTest
-    : public ::testing::TestWithParam<std::tuple<bool, std::vector<int32_t>>> {
- protected:
-  bool use_threadpool() const { return std::get<0>(GetParam()); }
-
-  int32_t dimension(int32_t index) const {
-    return std::get<1>(GetParam())[index];
-  }
-
-  bool IsOdd(int n) { return n % 2 == 1; }
-};
-
-TEST_P(XnnConvolutionThunkTest, SimpleConvolution) {
-  int32_t batch = dimension(0);
-  int32_t height = dimension(1);
-  int32_t width = dimension(2);
-  int32_t input_channels = dimension(3);
-  int32_t kernel_h = dimension(4);
-  int32_t kernel_w = dimension(5);
-  int32_t output_channels = dimension(6);
-
-  // Padding values for 'SAME' padding. Only odd kernel sizes are supported.
-  CHECK(IsOdd(kernel_h) && IsOdd(kernel_w));
-  int padding_h = (kernel_h - 1) / 2;
-  int padding_w = (kernel_w - 1) / 2;
-
-  std::minstd_rand0 engine;
-
-  // Input format is NHWC.
-  auto input_shape =
-      ShapeUtil::MakeShape(F32, {batch, height, width, input_channels});
-
-  // Kernel format is HWIO.
-  auto kernel_shape = ShapeUtil::MakeShape(
-      F32, {kernel_h, kernel_w, input_channels, output_channels});
-
-  auto input =
-      *LiteralUtil::CreateRandomLiteral<F32>(input_shape, &engine, 1.0f, 0.1f);
-  auto kernel =
-      *LiteralUtil::CreateRandomLiteral<F32>(kernel_shape, &engine, 1.0f, 0.1f);
-
-  // Create a reference HLO module that we can use to compare the results.
-  std::string hlo_module_template = R"(
-    HloModule convolution
-
-    ENTRY TestComputation {
-      %p0 = $0 parameter(0)
-      %p1 = $1 parameter(1)
-      ROOT conv = convolution(p0, p1), window={size=$2 pad=$3},
-        dim_labels=b01f_01io->b01f
-    }
-  )";
-
-  std::string hlo_module = absl::Substitute(
-      hlo_module_template, input_shape.ToString(), kernel_shape.ToString(),
-      absl::StrCat(kernel_h, "x", kernel_w),
-      absl::StrCat(padding_h, "_", padding_h, "x", padding_w, "_", padding_w));
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloModule> module,
-      ParseAndReturnUnverifiedModule(hlo_module, HloModuleConfig()));
-
-  HloEvaluator evaluator;
-  TF_ASSERT_OK_AND_ASSIGN(Literal expected_result,
-                          evaluator.Evaluate(*module, {&input, &kernel}));
-
-  HloInstruction* conv =
-      hlo_query::FindInstruction(module->entry_computation(), "conv");
-  ASSERT_NE(conv, nullptr);
-
-  tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8);
-  Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(),
-                                 threads.NumThreads());
-
-  // XNNPACK expects OHWI format for the kernel.
-  Literal kernel_transposed =
-      kernel.Transpose({3, 0, 1, 2})
-          .Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
-
-  // Create a Literal with the expected shape.
-  const Shape& out_shape = expected_result.shape();
-  auto out = LiteralUtil::CreateFull(out_shape.dimensions(), 0.f);
-
-  BufferAllocations allocations =
-      CreateBufferAllocations(input, kernel_transposed, out);
-
-  auto [input_alloc, kernel_transposed_alloc, out_alloc] =
-      CreateBufferAllocation(input, kernel_transposed, out);
-  auto [input_slice, kernel_transposed_slice, out_slice] =
-      CreateBufferAllocationSlice(input_alloc, kernel_transposed_alloc,
-                                  out_alloc);
-
-  // Adjust kernel dimensions for XNNPACK.
-  ConvolutionDimensionNumbers dnums = conv->convolution_dimension_numbers();
-  dnums.set_kernel_input_feature_dimension(3);
-  dnums.set_kernel_output_feature_dimension(0);
-  dnums.set_kernel_spatial_dimensions(0, 1);
-  dnums.set_output_spatial_dimensions(1, 2);
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto thunk,
-      XnnConvolutionThunk::Create(
-          XnnConvolutionThunk::Options{use_threadpool()}, {"convolution"},
-          input_slice, input_shape, kernel_transposed_slice,
-          kernel_transposed.shape(), out_slice, out_shape, dnums,
-          conv->window(), conv->feature_group_count()));
-
-  XnnThreadpool threadpool;
-  if (use_threadpool()) {
-    TF_ASSERT_OK_AND_ASSIGN(threadpool, CreateXnnThreadpool(&device));
-  }
-  Thunk::XnnParams xnn_params(std::move(threadpool));
-
-  Thunk::ExecuteParams params;
-  params.buffer_allocations = &allocations;
-  params.intra_op_threadpool = use_threadpool() ? &device : nullptr;
-  params.xnn_params = &xnn_params;
-
-  auto execute_event = thunk->Execute(params);
-  tsl::BlockUntilReady(execute_event);
-  ASSERT_FALSE(execute_event.IsError()) << execute_event.GetError();
-
-  ErrorSpec error_spec{1e-5};
-  EXPECT_TRUE(LiteralTestUtil::Near(expected_result, out, error_spec));
-
-  // Execute thunk one more time to test that we reuse XNN runtime.
-  execute_event = thunk->Execute(params);
-  tsl::BlockUntilReady(execute_event);
-  ASSERT_FALSE(execute_event.IsError()) << execute_event.GetError();
-
-  EXPECT_TRUE(LiteralTestUtil::Near(expected_result, out, error_spec));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    XnnConvolution, XnnConvolutionThunkTest,
-    ::testing::Combine(::testing::Values(true, false),
-                       ::testing::Values(std::vector<int32_t>{1, 8, 8, 16, 1, 1,
-                                                              32})));
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.cc
deleted file mode 100644
index 44ec1b8139bfc5..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.cc
+++ /dev/null
@@ -1,191 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "xnnpack.h"
-#include "absl/functional/bind_front.h"
-#include "absl/memory/memory.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/str_format.h"
-#include "absl/strings/str_join.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/dot_dims.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/primitive_util.h"
-#include "xla/service/buffer_assignment.h"
-#include "xla/shape.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/logging.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/util.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-absl::StatusOr<XnnSubgraph> XnnDotThunk::BuildDotSubgraph(
-    absl::Span<const Argument> arguments, absl::Span<const Result> results,
-    absl::Span<const se::DeviceAddressBase> arguments_buffers) {
-  TF_ASSIGN_OR_RETURN(XnnSubgraph subgraph,
-                      CreateXnnSubgraph([](xnn_subgraph_t* subgraph) {
-                        return xnn_create_subgraph(
-                            /*external_value_ids=*/3,
-                            /*flags=*/0, subgraph);
-                      }));
-
-  uint32_t lhs_id = XNN_INVALID_VALUE_ID;
-  uint32_t rhs_id = XNN_INVALID_VALUE_ID;
-  uint32_t out_id = XNN_INVALID_VALUE_ID;
-
-  auto dims = [](absl::Span<const int64_t> dims) -> std::vector<size_t> {
-    return {dims.begin(), dims.end()};
-  };
-
-  std::vector<size_t> lhs_dims = dims(dot_slices_.lhs_shape.dimensions());
-  std::vector<size_t> rhs_dims = dims(dot_slices_.rhs_shape.dimensions());
-  std::vector<size_t> out_dims = dims(dot_slices_.out_shape.dimensions());
-
-  PrimitiveType dtype = dot_slices_.lhs_shape.element_type();
-  if (dtype != F32 && dtype != BF16) {
-    return InvalidArgument("Unsupported input data type for XnnDotThunk: %s",
-                           primitive_util::LowercasePrimitiveTypeName(dtype));
-  }
-  xnn_datatype input_dtype =
-      (dtype == F32) ? xnn_datatype_fp32 : xnn_datatype_bf16;
-  xnn_datatype output_dtype = xnn_datatype_fp32;
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), input_dtype, lhs_dims.size(), lhs_dims.data(), nullptr,
-      /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &lhs_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), input_dtype, rhs_dims.size(), rhs_dims.data(),
-      capture_rhs_ ? arguments_buffers[1].opaque() : nullptr,
-      /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &rhs_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), output_dtype, out_dims.size(), out_dims.data(), nullptr,
-      /*external_id=*/2, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &out_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_batch_matrix_multiply(
-      subgraph.get(), lhs_id, rhs_id, out_id,
-      (/*flags=*/dot_canonical_dims_.rhs_canonical ? 0 : XNN_FLAG_TRANSPOSE_B) |
-          XNN_FLAG_NO_BROADCAST));
-
-  return subgraph;
-}
-
-absl::StatusOr<std::unique_ptr<XnnDotThunk>> XnnDotThunk::Create(
-    Options options, Info info, DotDimensionNumbers dot_dimensions,
-    BufferAllocation::Slice lhs_buffer, Shape lhs_shape,
-    BufferAllocation::Slice rhs_buffer, Shape rhs_shape,
-    BufferAllocation::Slice out_buffer, Shape out_shape, bool capture_rhs) {
-  TF_RETURN_IF_ERROR(InitializeXnnPack());
-
-  TF_ASSIGN_OR_RETURN(DotShape dot_shape, GetDotShape(dot_dimensions, lhs_shape,
-                                                      rhs_shape, out_shape));
-
-  TF_ASSIGN_OR_RETURN(DotCanonicalDims dot_canonical_dims,
-                      GetDotCanonicalDims(dot_dimensions, dot_shape));
-
-  DotSlices dot_slices{lhs_buffer, std::move(lhs_shape),
-                       rhs_buffer, std::move(rhs_shape),
-                       out_buffer, std::move(out_shape)};
-
-  return absl::WrapUnique(new XnnDotThunk(
-      std::move(options), std::move(info), std::move(dot_dimensions),
-      std::move(dot_slices), std::move(dot_shape),
-      std::move(dot_canonical_dims), capture_rhs));
-}
-
-static std::vector<XnnFusionThunk::Argument> DotArguments(
-    const DotSlices& slices) {
-  return {XnnFusionThunk::Argument{slices.lhs_buffer, slices.lhs_shape},
-          XnnFusionThunk::Argument{slices.rhs_buffer, slices.rhs_shape}};
-}
-
-static std::vector<XnnFusionThunk::Result> DotResults(const DotSlices& slices) {
-  return {XnnFusionThunk::Result{slices.out_buffer, slices.out_shape}};
-}
-
-static absl::Span<const int64_t> DotCapturedArgumentIds(bool capture_rhs) {
-  static constexpr int64_t kRhsIndex = 1;
-  return capture_rhs ? absl::Span<const int64_t>(&kRhsIndex, 1)
-                     : absl::Span<const int64_t>();
-}
-
-XnnDotThunk::XnnDotThunk(Options options, Info info,
-                         DotDimensionNumbers dot_dimensions,
-                         DotSlices dot_slices, DotShape dot_shape,
-                         DotCanonicalDims dot_canonical_dims, bool capture_rhs)
-    : XnnFusionThunk(XnnFusionKind::kDot, std::move(options), std::move(info),
-                     DotArguments(dot_slices), DotResults(dot_slices),
-                     CapturingBuilder(absl::bind_front(
-                         &XnnDotThunk::BuildDotSubgraph, this)),
-                     DotCapturedArgumentIds(capture_rhs)),
-      dot_dimensions_(std::move(dot_dimensions)),
-      dot_slices_(std::move(dot_slices)),
-      dot_shape_(std::move(dot_shape)),
-      dot_canonical_dims_(std::move(dot_canonical_dims)),
-      capture_rhs_(capture_rhs) {}
-
-std::string XnnDotThunk::fusion_kind() const { return "dot"; }
-
-std::string XnnDotThunk::fusion_description() const {
-  return absl::StrFormat(
-      "lhs_batch_dims=[%s], rhs_batch_dims=[%s], "
-      "lhs_contract_dims=[%s], rhs_contract_dims=[%s], capture_rhs=%v",
-      absl::StrJoin(dot_dimensions_.lhs_batch_dimensions(), ","),
-      absl::StrJoin(dot_dimensions_.rhs_batch_dimensions(), ","),
-      absl::StrJoin(dot_dimensions_.lhs_contracting_dimensions(), ","),
-      absl::StrJoin(dot_dimensions_.rhs_contracting_dimensions(), ","),
-      capture_rhs_);
-}
-
-std::vector<std::string> XnnDotThunk::fusion_details() const {
-  return {
-      absl::StrFormat("  matmul shape: batch_size=%d, lhs=%s, rhs=%s, out=%s",
-                      dot_shape_.batch_size,
-                      dot_shape_.lhs_matmul_shape.ToString(true),
-                      dot_shape_.rhs_matmul_shape.ToString(true),
-                      dot_shape_.out_matmul_shape.ToString(true)),
-      absl::StrFormat("  matmul dims: m=%d, k=%d, n=%d, lhs_column_major=%v, "
-                      "lhs_canonical=%v rhs_column_major=%v, rhs_canonical=%v",
-                      dot_canonical_dims_.m, dot_canonical_dims_.k,
-                      dot_canonical_dims_.n,
-                      dot_canonical_dims_.lhs_column_major,
-                      dot_canonical_dims_.lhs_canonical,
-                      dot_canonical_dims_.rhs_column_major,
-                      dot_canonical_dims_.rhs_canonical),
-  };
-}
-
-std::string XnnDotThunk::argument_name(size_t index) const {
-  return index == 0 ? "lhs" : "rhs";
-}
-
-std::string XnnDotThunk::result_name(size_t index) const { return "out"; }
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h
deleted file mode 100644
index 448897ad0eb662..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_DOT_THUNK_H_
-#define XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_DOT_THUNK_H_
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "absl/status/statusor.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/dot_dims.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-#include "xla/service/buffer_assignment.h"
-#include "xla/shape.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-// Dot operation implemented on top of XNNPACK.
-class XnnDotThunk final : public XnnFusionThunk {
- public:
-  static absl::StatusOr<std::unique_ptr<XnnDotThunk>> Create(
-      Options options, Info info, DotDimensionNumbers dot_dimensions,
-      BufferAllocation::Slice lhs_buffer, Shape lhs_shape,
-      BufferAllocation::Slice rhs_buffer, Shape rhs_shape,
-      BufferAllocation::Slice out_buffer, Shape out_shape, bool capture_rhs);
-
-  DotDimensionNumbers dot_dimensions() const { return dot_dimensions_; }
-  DotSlices dot_slices() const { return dot_slices_; }
-  bool capture_rhs() const { return capture_rhs_; }
-
- protected:
-  std::string fusion_kind() const final;
-  std::string fusion_description() const final;
-
-  bool has_fusion_details() const final { return true; }
-  std::vector<std::string> fusion_details() const final;
-
-  std::string argument_name(size_t index) const final;
-  std::string result_name(size_t index) const final;
-
- private:
-  XnnDotThunk(Options options, Info info, DotDimensionNumbers dot_dimensions,
-              DotSlices dot_slices, DotShape dot_shape,
-              DotCanonicalDims dot_canonical_dims, bool capture_rhs);
-
-  absl::StatusOr<XnnSubgraph> BuildDotSubgraph(
-      absl::Span<const Argument> arguments, absl::Span<const Result> results,
-      absl::Span<const se::DeviceAddressBase> arguments_buffers);
-
-  DotDimensionNumbers dot_dimensions_;
-  DotSlices dot_slices_;
-  DotShape dot_shape_;
-  DotCanonicalDims dot_canonical_dims_;
-
-  // If true, the RHS buffer might be captured by XNNPACK graph by value. This
-  // allows XNNPACK to do packing at graph compile time.
-  bool capture_rhs_;
-};
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_DOT_THUNK_H_
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk_test.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk_test.cc
deleted file mode 100644
index 16f0efb0910c92..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk_test.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h"
-
-#include <string>
-#include <tuple>
-#include <utility>
-
-#include "absl/strings/str_cat.h"
-#include "xla/backends/cpu/runtime/buffer_allocations.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/thunk_testlib.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h"
-#include "xla/literal_util.h"
-#include "xla/primitive_util.h"
-#include "xla/shape.h"
-#include "xla/shape_util.h"
-#include "xla/tsl/concurrency/async_value_ref.h"
-#include "xla/tsl/platform/env.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/tsl/platform/test.h"
-#include "xla/tsl/platform/threadpool.h"
-#include "xla/xla_data.pb.h"
-#include "tsl/platform/cpu_info.h"
-
-#define EIGEN_USE_THREADS
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace xla::cpu {
-namespace {
-
-using XnnDotThunkTestSpec = std::tuple<PrimitiveType, bool, bool>;
-
-class XnnDotThunkTest : public testing::TestWithParam<XnnDotThunkTestSpec> {
- public:
-  static std::string Name(
-      const ::testing::TestParamInfo<XnnDotThunkTestSpec>& info) {
-    return absl::StrCat(
-        primitive_util::LowercasePrimitiveTypeName(std::get<0>(info.param)),
-        "_", std::get<1>(info.param) ? "threadpool" : "single_threaded", "_",
-        std::get<2>(info.param) ? "capture_rhs" : "no_capture_rhs");
-  }
-};
-
-TEST_P(XnnDotThunkTest, SimpleDot) {
-  auto [input_type, use_threadpool, capture_rhs] = GetParam();
-
-  if (input_type == BF16 &&
-      !tsl::port::TestCPUFeature(tsl::port::AVX512_BF16)) {
-    GTEST_SKIP() << "CPU needs AVX512_BF16 for this test.";
-  }
-
-  tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8);
-  Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(),
-                                 threads.NumThreads());
-
-  auto lhs = LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
-  auto rhs = LiteralUtil::CreateR2<float>({{4.0, 3.0}, {2.0, 1.0}});
-  auto out = LiteralUtil::CreateR2<float>({{0.0, 0.0}, {0.0, 0.0}});
-  if (input_type == BF16) {
-    lhs = LiteralUtil::ConvertF32ToBF16(lhs);
-    rhs = LiteralUtil::ConvertF32ToBF16(rhs);
-  }
-
-  BufferAllocations allocations = CreateBufferAllocations(lhs, rhs, out);
-
-  auto [lhs_alloc, rhs_alloc, out_alloc] =
-      CreateBufferAllocation(lhs, rhs, out);
-  auto [lhs_slice, rhs_slice, out_slice] =
-      CreateBufferAllocationSlice(lhs_alloc, rhs_alloc, out_alloc);
-
-  Shape input_shape = ShapeUtil::MakeShape(input_type, {2, 2});
-  Shape output_shape = ShapeUtil::MakeShape(F32, {2, 2});
-
-  DotDimensionNumbers dot_dimensions;
-  dot_dimensions.add_lhs_contracting_dimensions(1);
-  dot_dimensions.add_rhs_contracting_dimensions(0);
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto thunk,
-      XnnDotThunk::Create(XnnDotThunk::Options{use_threadpool}, {"dot"},
-                          dot_dimensions, lhs_slice, input_shape, rhs_slice,
-                          input_shape, out_slice, output_shape, capture_rhs));
-
-  XnnThreadpool threadpool;
-  if (use_threadpool) {
-    TF_ASSERT_OK_AND_ASSIGN(threadpool, CreateXnnThreadpool(&device));
-  }
-  Thunk::XnnParams xnn_params(std::move(threadpool));
-
-  Thunk::ExecuteParams params;
-  params.buffer_allocations = &allocations;
-  params.intra_op_threadpool = use_threadpool ? &device : nullptr;
-  params.xnn_params = &xnn_params;
-
-  auto execute_event = thunk->Execute(params);
-  tsl::BlockUntilReady(execute_event);
-  ASSERT_FALSE(execute_event.IsError()) << execute_event.GetError();
-
-  EXPECT_EQ(out, LiteralUtil::CreateR2<float>({{8.0, 5.0}, {20.0, 13.0}}));
-}
-
-INSTANTIATE_TEST_SUITE_P(XnnDot, XnnDotThunkTest,
-                         ::testing::Combine(::testing::ValuesIn({F32, BF16}),
-                                            ::testing::Bool(),
-                                            ::testing::Bool()),
-                         XnnDotThunkTest::Name);
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc
deleted file mode 100644
index 6ab367af62fce7..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.cc
+++ /dev/null
@@ -1,364 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <ostream>
-#include <utility>
-#include <vector>
-
-#include "experimental.h"  // xnnpack
-#include "xnnpack.h"
-#include "absl/algorithm/container.h"
-#include "absl/base/no_destructor.h"
-#include "absl/container/inlined_vector.h"
-#include "absl/functional/bind_front.h"
-#include "absl/functional/function_ref.h"
-#include "absl/log/check.h"
-#include "absl/memory/memory.h"
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/str_format.h"
-#include "absl/strings/string_view.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/runtime/buffer_use.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/tsl/concurrency/async_value_ref.h"
-#include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/logging.h"
-#include "xla/tsl/platform/statusor.h"
-
-namespace xla::cpu {
-
-absl::string_view XnnFusionThunk::XnnFusionKindToString(XnnFusionKind kind) {
-  switch (kind) {
-    case XnnFusionKind::kFusion:
-      return "xnn-fusion";
-    case XnnFusionKind::kDot:
-      return "xnn-dot";
-    case XnnFusionKind::kConvolution:
-      return "xnn-convolution";
-  }
-}
-
-std::ostream& operator<<(std::ostream& os, XnnFusionThunk::XnnFusionKind kind) {
-  return os << XnnFusionThunk::XnnFusionKindToString(kind);
-}
-
-// XNNPACK executable instantiated for the fusion operation.
-struct XnnFusionThunk::XnnExecutable {
-  tsl::AsyncValueRef<XnnFusionThunk::ExecuteEvent> Invoke(
-      const XnnThreadpool& threadpool,
-      absl::Span<se::DeviceAddressBase> arguments,
-      absl::Span<se::DeviceAddressBase> results,
-      absl::FunctionRef<bool(size_t)> is_captured_argument);
-
-  // Resets XNNPACK runtime and subgraph.
-  absl::Status Reset();
-
-  XnnSubgraph subgraph = nullptr;
-  XnnRuntime runtime = nullptr;
-
-  // TODO(ezhulenev): Today we rely on device memory as an identity of the
-  // captured argument, and this is not correct as we can have multiple
-  // arguments allocated to the heap address. This is work in progress, and will
-  // be migrated to a buffer identity passed to XLA by the client (PjRt).
-  std::vector<se::DeviceAddressBase> captured_arguments;
-};
-
-tsl::AsyncValueRef<XnnFusionThunk::ExecuteEvent>
-XnnFusionThunk::XnnExecutable::Invoke(
-    const XnnThreadpool& threadpool,
-    absl::Span<se::DeviceAddressBase> arguments,
-    absl::Span<se::DeviceAddressBase> results,
-    absl::FunctionRef<bool(size_t)> is_captured_argument) {
-  // Create external values for all arguments and results.
-  absl::InlinedVector<xnn_external_value, 8> external_values;
-  external_values.reserve(arguments.size() + results.size());
-
-  // External tensor id for arguments and results.
-  uint32_t id = 0;
-
-  for (const se::DeviceAddressBase& argument : arguments) {
-    xnn_external_value value{id++, argument.opaque()};
-    if (!is_captured_argument(value.id)) {
-      external_values.push_back(value);
-    }
-  }
-
-  for (const se::DeviceAddressBase& result : results) {
-    xnn_external_value value{id++, result.opaque()};
-    external_values.push_back(value);
-  }
-
-  DCHECK_NE(runtime.get(), nullptr) << "XNNPACK runtime is not initialized";
-  XNN_RETURN_IF_ERROR(xnn_setup_runtime_v2(
-      runtime.get(), external_values.size(), external_values.data()));
-
-  // Update threadpool used by the XNNPACK runtime.
-  xnn_update_runtime_with_threadpool(runtime.get(), threadpool.get());
-
-  // Execute XNNPACK runtime in the caller thread.
-  XNN_RETURN_IF_ERROR(xnn_invoke_runtime(runtime.get()));
-  return OkExecuteEvent();
-}
-
-absl::Status XnnFusionThunk::XnnExecutable::Reset() {
-  runtime.reset();
-  subgraph.reset();
-  return absl::OkStatus();
-}
-
-absl::StatusOr<XnnFusionThunk::XnnExecutable>
-XnnFusionThunk::CreateXnnExecutable(
-    const XnnThreadpool& threadpool,
-    absl::Span<const se::DeviceAddressBase> arguments_buffers) {
-  bool capturing = !captured_arguments_ids_.empty();
-  VLOG(3) << absl::StreamFormat(
-      "Create %s XNN executable for `%s` operation: num_created=%d",
-      capturing ? "capturing" : "pooled", info().op_name,
-      capturing ? num_capturing_created_.fetch_add(1)
-                : xnn_executable_pool_.num_created());
-
-  XnnExecutable executable;
-
-  // Keep track of the arguments captured by value.
-  executable.captured_arguments = CaptureArguments(arguments_buffers);
-
-  if (builder_) {
-    TF_ASSIGN_OR_RETURN(executable.subgraph, builder_(arguments_, results_));
-  } else {
-    TF_ASSIGN_OR_RETURN(
-        executable.subgraph,
-        capturing_builder_(arguments_, results_, arguments_buffers));
-  }
-
-  uint32_t flags = XNN_FLAG_SLINKY_ENABLED | XNN_FLAG_SLINKY_STATIC_BOUNDS |
-                   XNN_FLAG_DONT_SPIN_WORKERS;
-
-  TF_ASSIGN_OR_RETURN(
-      executable.runtime, CreateXnnRuntime([&](xnn_runtime_t* runtime) {
-        return xnn_create_runtime_with_threadpool(
-            executable.subgraph.get(), /*weights_cache=*/nullptr,
-            threadpool.get(), flags, runtime);
-      }));
-  XNN_RETURN_IF_ERROR(xnn_reshape_runtime(executable.runtime.get()));
-
-  return {std::move(executable)};
-}
-
-absl::Status XnnFusionThunk::UpdateXnnExecutable(
-    const XnnThreadpool& threadpool, XnnExecutable& executable,
-    absl::Span<const se::DeviceAddressBase> arguments_buffers) {
-  DCHECK(capturing_builder_) << "XNN executable is not capturing arguments";
-  DCHECK_EQ(executable.captured_arguments.size(),
-            captured_arguments_ids_.size())
-      << "Unexpected number of captured arguments";
-
-  // If all arguments captured by value are the same as the last execution,
-  // we can reuse the XNN executable.
-  auto capture_arguments = CaptureArguments(arguments_buffers);
-  if (executable.captured_arguments == capture_arguments) {
-    VLOG(3) << absl::StreamFormat("Reuse XNN executable for `%s` operation",
-                                  info().op_name);
-    return absl::OkStatus();
-  }
-
-  VLOG(3) << absl::StreamFormat("Update XNN executable for `%s` operation",
-                                info().op_name);
-
-  TF_RETURN_IF_ERROR(executable.Reset());
-
-  // Keep track of the updated arguments captured by value.
-  executable.captured_arguments = std::move(capture_arguments);
-
-  TF_ASSIGN_OR_RETURN(
-      executable.subgraph,
-      capturing_builder_(arguments_, results_, arguments_buffers));
-
-  uint32_t flags = XNN_FLAG_SLINKY_ENABLED | XNN_FLAG_SLINKY_STATIC_BOUNDS |
-                   XNN_FLAG_DONT_SPIN_WORKERS;
-
-  TF_ASSIGN_OR_RETURN(
-      executable.runtime, CreateXnnRuntime([&](xnn_runtime_t* runtime) {
-        return xnn_create_runtime_with_threadpool(
-            executable.subgraph.get(), /*weights_cache=*/nullptr,
-            threadpool.get(), flags, runtime);
-      }));
-  XNN_RETURN_IF_ERROR(xnn_reshape_runtime(executable.runtime.get()));
-
-  return absl::OkStatus();
-}
-
-std::vector<se::DeviceAddressBase> XnnFusionThunk::CaptureArguments(
-    absl::Span<const se::DeviceAddressBase> arguments_buffers) {
-  std::vector<se::DeviceAddressBase> captured_arguments_ids;
-  captured_arguments_ids.reserve(captured_arguments_ids_.size());
-  for (int64_t i = 0; i < captured_arguments_ids_.size(); ++i) {
-    int32_t arg_index = captured_arguments_ids_[i];
-    captured_arguments_ids.push_back(arguments_buffers[arg_index]);
-  }
-  return captured_arguments_ids;
-}
-
-absl::StatusOr<std::unique_ptr<XnnFusionThunk>> XnnFusionThunk::Create(
-    Options options, Info info, std::vector<Argument> arguments,
-    std::vector<Result> results, Builder builder) {
-  TF_RETURN_IF_ERROR(InitializeXnnPack());
-
-  return absl::WrapUnique(new XnnFusionThunk(
-      XnnFusionKind::kFusion, std::move(options), std::move(info),
-      std::move(arguments), std::move(results), std::move(builder)));
-}
-
-absl::StatusOr<std::unique_ptr<XnnFusionThunk>> XnnFusionThunk::Create(
-    Options options, Info info, std::vector<Argument> arguments,
-    std::vector<Result> results, CapturingBuilder capturing_builder,
-    absl::Span<const int64_t> captured_arguments_ids) {
-  TF_RETURN_IF_ERROR(InitializeXnnPack());
-
-  return absl::WrapUnique(new XnnFusionThunk(
-      XnnFusionKind::kFusion, std::move(options), std::move(info),
-      std::move(arguments), std::move(results), std::move(capturing_builder),
-      captured_arguments_ids));
-}
-
-XnnFusionThunk::XnnFusionThunk(XnnFusionKind kind, Options options, Info info,
-                               std::vector<Argument> arguments,
-                               std::vector<Result> results, Builder builder)
-    : Thunk(Kind::kXnnFusion, std::move(info)),
-      xnn_fusion_kind_(kind),
-      options_(std::move(options)),
-      arguments_(std::move(arguments)),
-      results_(std::move(results)),
-      builder_(std::move(builder)),
-      xnn_executable_pool_(
-          absl::bind_front(&XnnFusionThunk::CreateXnnExecutable, this)) {}
-
-XnnFusionThunk::XnnFusionThunk(XnnFusionKind kind, Options options, Info info,
-                               std::vector<Argument> arguments,
-                               std::vector<Result> results,
-                               CapturingBuilder capturing_builder,
-                               absl::Span<const int64_t> captured_arguments_ids)
-    : Thunk(Kind::kXnnFusion, std::move(info)),
-      xnn_fusion_kind_(kind),
-      options_(std::move(options)),
-      arguments_(std::move(arguments)),
-      results_(std::move(results)),
-      capturing_builder_(std::move(capturing_builder)),
-      captured_arguments_ids_(captured_arguments_ids.begin(),
-                              captured_arguments_ids.end()),
-      xnn_executable_pool_(
-          absl::bind_front(&XnnFusionThunk::CreateXnnExecutable, this)) {}
-
-XnnFusionThunk::~XnnFusionThunk() = default;
-
-XnnFusionThunk::BufferUses XnnFusionThunk::buffer_uses() const {
-  BufferUses buffer_uses;
-  for (const Argument& argument : arguments_) {
-    buffer_uses.push_back(BufferUse::Read(argument.slice, argument.shape));
-  }
-  for (const Result& result : results_) {
-    buffer_uses.push_back(BufferUse::Write(result.slice, result.shape));
-  }
-
-  return buffer_uses;
-}
-
-const XnnThreadpool& GetXnnThreadpool(const Thunk::ExecuteParams& params) {
-  static absl::NoDestructor<XnnThreadpool> no_threadpool(nullptr);
-  return params.xnn_params ? params.xnn_params->threadpool : *no_threadpool;
-}
-
-tsl::AsyncValueRef<XnnFusionThunk::ExecuteEvent> XnnFusionThunk::Execute(
-    const ExecuteParams& params) {
-  VLOG(3) << absl::StreamFormat("XNN %s `%s`: %s", fusion_kind(),
-                                info().op_name, fusion_description());
-
-  if (VLOG_IS_ON(3) && has_fusion_details()) {
-    for (auto& detail : fusion_details()) {
-      VLOG(3) << detail;
-    }
-  }
-
-  // Resolve device memory for arguments.
-  absl::InlinedVector<se::DeviceAddressBase, 8> arguments_buffers;
-  arguments_buffers.resize(arguments_.size());
-  for (size_t i = 0; i < arguments_.size(); ++i) {
-    Argument& argument = arguments_[i];
-
-    TF_ASSIGN_OR_RETURN(
-        arguments_buffers[i],
-        params.buffer_allocations->GetDeviceAddress(argument.slice));
-
-    VLOG(3) << absl::StreamFormat("  %s: %s in slice %s (%p)", argument_name(i),
-                                  argument.shape.ToString(true),
-                                  argument.slice.ToString(),
-                                  arguments_buffers[i].opaque());
-  }
-
-  // Resolve device memory for results.
-  absl::InlinedVector<se::DeviceAddressBase, 4> results_buffers;
-  results_buffers.resize(results_.size());
-  for (size_t i = 0; i < results_.size(); ++i) {
-    Result& result = results_[i];
-
-    TF_ASSIGN_OR_RETURN(
-        results_buffers[i],
-        params.buffer_allocations->GetDeviceAddress(results_[i].slice));
-
-    VLOG(3) << absl::StreamFormat("  %s: %s in slice %s (%p)", result_name(i),
-                                  result.shape.ToString(true),
-                                  result.slice.ToString(),
-                                  results_buffers[i].opaque());
-  }
-
-  DCHECK(builder_ || capturing_builder_) << "One of the builders must be set.";
-
-  auto invoke = [&](typename XnnExecutablePool::BorrowedObject executable) {
-    auto executed = executable->Invoke(
-        GetXnnThreadpool(params), absl::MakeSpan(arguments_buffers),
-        absl::MakeSpan(results_buffers), [&](size_t id) {
-          return absl::c_linear_search(captured_arguments_ids_, id);
-        });
-
-    // Do not return executable to the pool until the execution is done.
-    executed.AndThen([executable = std::move(executable)] {});
-    return executed;
-  };
-
-  // Borrow XnnExecutable from the pool.
-  TF_ASSIGN_OR_RETURN(auto executable,
-                      xnn_executable_pool_.GetOrCreate(GetXnnThreadpool(params),
-                                                       arguments_buffers));
-
-  // If XNN graph doesn't capture any of the arguments by value, we can execute
-  // XnnExecutable immediately.
-  if (captured_arguments_ids_.empty()) {
-    return invoke(std::move(executable));
-  }
-
-  // Otherwise reset XnnExecutable to capture new arguments buffers.
-  TF_RETURN_IF_ERROR(UpdateXnnExecutable(GetXnnThreadpool(params), *executable,
-                                         arguments_buffers));
-  return invoke(std::move(executable));
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h
deleted file mode 100644
index 21deb08bfd6fd6..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_FUSION_THUNK_H_
-#define XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_FUSION_THUNK_H_
-
-#include <stdbool.h>
-
-#include <atomic>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <ostream>
-#include <string>
-#include <vector>
-
-#include "absl/functional/any_invocable.h"
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/string_view.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/runtime/object_pool.h"
-#include "xla/service/buffer_assignment.h"
-#include "xla/shape.h"
-#include "xla/stream_executor/device_address.h"
-#include "xla/tsl/concurrency/async_value_ref.h"
-
-namespace xla::cpu {
-
-// XNN fusion thunk encapsulates XNNPACK subgraph contructed from an XLA fusion
-// operation, where each HLO op has a corresponding XNNPACK operator.
-class XnnFusionThunk : public Thunk {
- public:
-  enum class XnnFusionKind {
-    kFusion,
-    kDot,
-    kConvolution,
-  };
-
-  static absl::string_view XnnFusionKindToString(XnnFusionKind kind);
-
-  ~XnnFusionThunk() override;
-
-  struct Options {
-    // Pass XnnThreadpool constructed from the intra-op threadpool to the
-    // XNNPACK runtime to allow XNNPACK to parallelize the execution.
-    bool use_threadpool = true;
-  };
-
-  struct Argument {
-    BufferAllocation::Slice slice;
-    Shape shape;
-  };
-
-  struct Result {
-    BufferAllocation::Slice slice;
-    Shape shape;
-  };
-
-  // Builder function constructs XNNPACK subgraph for the fusion operation.
-  using Builder = absl::AnyInvocable<absl::StatusOr<XnnSubgraph>(
-      absl::Span<const Argument> arguments, absl::Span<const Result> results)>;
-
-  // Builder function that constructs XNNPACK subgraph for the fusion operation
-  // and captures some of the arguments buffers by value. Such XNNPACK subgraphs
-  // can't be reused if captured arguments are not the same, and can lead to
-  // crashes and undefined behavior if captured arguments are destroyed.
-  // Capturing arguments by value allows XNNPACK to do packing at graph compile
-  // time, and avoid re-packing costs at run time (at inference weights stay
-  // constant, i.e. convolution filters and one of the dot arguments).
-  using CapturingBuilder = absl::AnyInvocable<absl::StatusOr<XnnSubgraph>(
-      absl::Span<const Argument> arguments, absl::Span<const Result> results,
-      absl::Span<const se::DeviceAddressBase> arguments_buffers)>;
-
-  static absl::StatusOr<std::unique_ptr<XnnFusionThunk>> Create(
-      Options options, Info info, std::vector<Argument> arguments,
-      std::vector<Result> results, Builder builder);
-
-  static absl::StatusOr<std::unique_ptr<XnnFusionThunk>> Create(
-      Options options, Info info, std::vector<Argument> arguments,
-      std::vector<Result> results, CapturingBuilder capturing_builder,
-      absl::Span<const int64_t> captured_arguments_ids);
-
-  tsl::AsyncValueRef<ExecuteEvent> Execute(const ExecuteParams& params) final;
-
-  bool ExecuteMayBlock() const final { return true; }
-
-  BufferUses buffer_uses() const final;
-
-  Options options() const { return options_; }
-
-  XnnFusionKind xnn_fusion_kind() const { return xnn_fusion_kind_; }
-
- protected:
-  XnnFusionThunk(XnnFusionKind kind, Options options, Info info,
-                 std::vector<Argument> arguments, std::vector<Result> results,
-                 Builder builder);
-
-  XnnFusionThunk(XnnFusionKind kind, Options options, Info info,
-                 std::vector<Argument> arguments, std::vector<Result> results,
-                 CapturingBuilder capturing_builder,
-                 absl::Span<const int64_t> captured_arguments_ids);
-
-  // Extension points for subclasses to customize the logging behavior.
-  virtual std::string fusion_kind() const { return "fusion"; }
-  virtual std::string fusion_description() const { return ""; }
-
-  virtual bool has_fusion_details() const { return false; }
-  virtual std::vector<std::string> fusion_details() const { return {}; }
-
-  virtual std::string argument_name(size_t index) const {
-    return absl::StrCat("arg #", index);
-  }
-
-  virtual std::string result_name(size_t index) const {
-    return absl::StrCat("res #", index);
-  }
-
- private:
-  // XNNPACK subgraph + runtime instantiated and ready for execution.
-  struct XnnExecutable;
-
-  // Creates XnnExecutable for the fusion operation using one of the builders.
-  absl::StatusOr<XnnExecutable> CreateXnnExecutable(
-      const XnnThreadpool& threadpool,
-      absl::Span<const se::DeviceAddressBase> arguments_buffers);
-
-  // Updates XnnExecutable to the XNN subgraph constructed with the given
-  // arguments buffers.
-  absl::Status UpdateXnnExecutable(
-      const XnnThreadpool& threadpool, XnnExecutable& executable,
-      absl::Span<const se::DeviceAddressBase> arguments_buffers);
-
-  // Returns the list of captured arguments buffers.
-  std::vector<se::DeviceAddressBase> CaptureArguments(
-      absl::Span<const se::DeviceAddressBase> arguments_buffers);
-
-  XnnFusionKind xnn_fusion_kind_;
-  Options options_;
-
-  std::vector<Argument> arguments_;
-  std::vector<Result> results_;
-
-  // Builder that constructs XNNPACK subgraph for the fusion operation.
-  Builder builder_;
-
-  // Builder that constructs XNNPACK subgraph for the fusion operation and
-  // captures some of the arguments buffers by value. Such subgraphs can't be
-  // reused if captured arguments changed since the last execution.
-  CapturingBuilder capturing_builder_;
-
-  // Indices of arguments that are captured by XNNPACK subgraph by value.
-  std::vector<int64_t> captured_arguments_ids_;
-
-  // XLA:CPU executable can be called concurrently from multiple threads,
-  // and we need to keep a pool of XNNPACK executables to avoid data races.
-  using XnnExecutablePool = ObjectPool<XnnExecutable, const XnnThreadpool&,
-                                       absl::Span<const se::DeviceAddressBase>>;
-  XnnExecutablePool xnn_executable_pool_;
-
-  // The number of XNNPACK executables created for capturing graphs.
-  std::atomic<int64_t> num_capturing_created_{0};
-};
-
-std::ostream& operator<<(std::ostream& os, XnnFusionThunk::XnnFusionKind kind);
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_FUSION_THUNK_H_
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk_test.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk_test.cc
deleted file mode 100644
index 4f802f04bc6530..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk_test.cc
+++ /dev/null
@@ -1,155 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "xnnpack.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/str_cat.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/buffer_allocations.h"
-#include "xla/backends/cpu/runtime/thunk.h"
-#include "xla/backends/cpu/runtime/thunk_testlib.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h"
-#include "xla/literal_util.h"
-#include "xla/shape.h"
-#include "xla/shape_util.h"
-#include "xla/tsl/concurrency/async_value_ref.h"
-#include "xla/tsl/platform/env.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/tsl/platform/test.h"
-#include "xla/tsl/platform/threadpool.h"
-#include "xla/xla_data.pb.h"
-
-#define EIGEN_USE_THREADS
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace xla::cpu {
-namespace {
-
-static absl::StatusOr<XnnSubgraph> BuildBinaryAddSubgraph(
-    absl::Span<const XnnFusionThunk::Argument> arguments,
-    absl::Span<const XnnFusionThunk::Result> results) {
-  TF_ASSIGN_OR_RETURN(XnnSubgraph subgraph,
-                      CreateXnnSubgraph([&](xnn_subgraph_t* subgraph) {
-                        return xnn_create_subgraph(
-                            /*external_value_ids=*/3,
-                            /*flags=*/0, subgraph);
-                      }));
-
-  auto dims = [](absl::Span<const int64_t> dims) -> std::vector<size_t> {
-    return {dims.begin(), dims.end()};
-  };
-
-  uint32_t lhs_id = XNN_INVALID_VALUE_ID;
-  uint32_t rhs_id = XNN_INVALID_VALUE_ID;
-  uint32_t out_id = XNN_INVALID_VALUE_ID;
-
-  std::vector<size_t> lhs_dims = dims(arguments[0].shape.dimensions());
-  std::vector<size_t> rhs_dims = dims(arguments[1].shape.dimensions());
-  std::vector<size_t> out_dims = dims(results[0].shape.dimensions());
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), xnn_datatype_fp32, lhs_dims.size(), lhs_dims.data(),
-      nullptr,
-      /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &lhs_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), xnn_datatype_fp32, rhs_dims.size(), rhs_dims.data(),
-      nullptr,
-      /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &rhs_id));
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph.get(), xnn_datatype_fp32, out_dims.size(), out_dims.data(),
-      nullptr,
-      /*external_id=*/2, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &out_id));
-
-  xnn_binary_params params = {-std::numeric_limits<float>::infinity(),
-                              std::numeric_limits<float>::infinity()};
-
-  XNN_RETURN_IF_ERROR(xnn_define_binary(subgraph.get(), xnn_binary_add, &params,
-                                        lhs_id, rhs_id, out_id, /*flags=*/0));
-
-  return subgraph;
-}
-
-class XnnFusionThunkTest : public testing::TestWithParam<bool> {
- public:
-  static std::string Name(const ::testing::TestParamInfo<bool>& info) {
-    return absl::StrCat(info.param ? "threadpool" : "single_threaded");
-  }
-
- protected:
-  bool use_threadpool() const { return GetParam(); }
-};
-
-TEST_P(XnnFusionThunkTest, ElementwiseAdd) {
-  tsl::thread::ThreadPool threads(tsl::Env::Default(), "test", 8);
-  Eigen::ThreadPoolDevice device(threads.AsEigenThreadPool(),
-                                 threads.NumThreads());
-
-  auto lhs = LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0, 4.0});
-  auto rhs = LiteralUtil::CreateR1<float>({4.0, 3.0, 2.0, 1.0});
-  auto out = LiteralUtil::CreateR1<float>({0.0, 0.0, 0.0, 0.0});
-
-  BufferAllocations allocations = CreateBufferAllocations(lhs, rhs, out);
-
-  auto [lhs_alloc, rhs_alloc, out_alloc] =
-      CreateBufferAllocation(lhs, rhs, out);
-  auto [lhs_slice, rhs_slice, out_slice] =
-      CreateBufferAllocationSlice(lhs_alloc, rhs_alloc, out_alloc);
-
-  Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
-
-  XnnFusionThunk::Argument lhs_arg = {lhs_slice, shape};
-  XnnFusionThunk::Argument rhs_arg = {rhs_slice, shape};
-  XnnFusionThunk::Result out_res = {out_slice, shape};
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto thunk, XnnFusionThunk::Create(
-                      XnnFusionThunk::Options{use_threadpool()}, {"fusion"},
-                      {lhs_arg, rhs_arg}, {out_res}, &BuildBinaryAddSubgraph));
-
-  XnnThreadpool threadpool;
-  if (use_threadpool()) {
-    TF_ASSERT_OK_AND_ASSIGN(threadpool, CreateXnnThreadpool(&device));
-  }
-  Thunk::XnnParams xnn_params(std::move(threadpool));
-
-  Thunk::ExecuteParams params;
-  params.buffer_allocations = &allocations;
-  params.intra_op_threadpool = use_threadpool() ? &device : nullptr;
-  params.xnn_params = &xnn_params;
-
-  auto execute_event = thunk->Execute(params);
-  tsl::BlockUntilReady(execute_event);
-  ASSERT_FALSE(execute_event.IsError()) << execute_event.GetError();
-
-  EXPECT_EQ(out, LiteralUtil::CreateR1<float>({5.0, 5.0, 5.0, 5.0}));
-}
-
-INSTANTIATE_TEST_SUITE_P(XnnFusion, XnnFusionThunkTest, ::testing::Bool(),
-                         XnnFusionThunkTest::Name);
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.cc
deleted file mode 100644
index 3d219ee8b267f5..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.cc
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-
-#include "experimental.h"  // xnnpack
-#include "xnnpack.h"
-#include "absl/functional/function_ref.h"
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-#include "xla/primitive_util.h"
-#include "xla/util.h"
-
-namespace xla::cpu {
-
-absl::Status InitializeXnnPack() {
-  static xnn_status status = xnn_initialize(/*allocator=*/nullptr);
-  if (status != xnn_status_success) {
-    return Internal("XNNPACK initialization failed");
-  }
-  return absl::OkStatus();
-}
-
-absl::StatusOr<XnnSubgraph> CreateXnnSubgraph(
-    absl::FunctionRef<xnn_status(xnn_subgraph_t*)> builder) {
-  xnn_subgraph_t subgraph = nullptr;
-  XNN_RETURN_IF_ERROR(builder(&subgraph));
-  return XnnSubgraph(subgraph);
-}
-
-absl::StatusOr<XnnRuntime> CreateXnnRuntime(
-    absl::FunctionRef<xnn_status(xnn_runtime_t*)> builder) {
-  xnn_runtime_t runtime = nullptr;
-  XNN_RETURN_IF_ERROR(builder(&runtime));
-  return XnnRuntime(runtime);
-}
-
-absl::StatusOr<XnnThreadpool> CreateXnnThreadpool(
-    absl::FunctionRef<xnn_status(xnn_threadpool_t*)> builder) {
-  xnn_threadpool_t threadpool = nullptr;
-  XNN_RETURN_IF_ERROR(builder(&threadpool));
-  return XnnThreadpool(threadpool);
-}
-
-absl::StatusOr<xnn_datatype> XnnDatatype(const PrimitiveType& type) {
-  switch (type) {
-    case BF16:
-      return xnn_datatype_bf16;
-    case F16:
-      return xnn_datatype_fp16;
-    case F32:
-      return xnn_datatype_fp32;
-    default:
-      return InvalidArgument("Unsupported XNNPACK data type: %s",
-                             primitive_util::LowercasePrimitiveTypeName(type));
-  }
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.h b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.h
deleted file mode 100644
index e591665c0f38f3..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_interop.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_INTEROP_H_
-#define XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_INTEROP_H_
-
-#include <memory>
-
-#include "experimental.h"  // xnnpack
-#include "xnnpack.h"
-#include "absl/base/optimization.h"
-#include "absl/functional/function_ref.h"
-#include "absl/status/status.h"
-#include "xla/tsl/platform/logging.h"
-#include "xla/util.h"
-
-namespace xla::cpu {
-
-//===----------------------------------------------------------------------===//
-// XNNPACK status to ABSL status conversion macros.
-//===----------------------------------------------------------------------===//
-
-#define XNN_RETURN_IF_ERROR(expr)             \
-  do {                                        \
-    absl::Status s = XnnStatusToStatus(expr); \
-    if (!s.ok()) {                            \
-      return s;                               \
-    }                                         \
-  } while (0)
-
-#define XNN_LOG_IF_ERROR(expr)                         \
-  do {                                                 \
-    absl::Status s = XnnStatusToStatus(expr);          \
-    if (!s.ok()) {                                     \
-      LOG(ERROR) << "XNNPACK operation failed: " << s; \
-    }                                                  \
-  } while (0)
-
-// Statically initializes XNNPACK for the current process.
-absl::Status InitializeXnnPack();
-
-// Converts XNNPACK status to absl::Status.
-inline absl::Status XnnStatusToStatus(xnn_status status) {
-  if (ABSL_PREDICT_TRUE(status == xnn_status_success)) {
-    return absl::OkStatus();
-  }
-
-  auto error_message = [](xnn_status status) {
-    switch (status) {
-      case xnn_status_success:
-        return "";
-      case xnn_status_uninitialized:
-        return "uninitialized";
-      case xnn_status_invalid_parameter:
-        return "invalid parameter";
-      case xnn_status_invalid_state:
-        return "invalid state";
-      case xnn_status_unsupported_parameter:
-        return "unsupported parameter";
-      case xnn_status_unsupported_hardware:
-        return "unsupported hardware";
-      case xnn_status_out_of_memory:
-        return "out of memory";
-      case xnn_status_reallocation_required:
-        return "reallocation required";
-      case xnn_status_deprecated:
-        return "deprecated";
-    }
-  };
-
-  return Internal("XNNPACK operation failed: %s", error_message(status));
-}
-
-//===----------------------------------------------------------------------===//
-// XLA to XNNPACK type conversions.
-//===----------------------------------------------------------------------===//
-
-absl::StatusOr<xnn_datatype> XnnDatatype(const PrimitiveType& type);
-
-//===----------------------------------------------------------------------===//
-// RAII wrappers for XNNPACK types.
-//===----------------------------------------------------------------------===//
-
-namespace internal {
-struct XnnDeleter {
-  void operator()(xnn_subgraph* subgraph) {
-    XNN_LOG_IF_ERROR(xnn_delete_subgraph(subgraph));
-  }
-  void operator()(xnn_runtime* runtime) {
-    XNN_LOG_IF_ERROR(xnn_delete_runtime(runtime));
-  }
-  void operator()(xnn_threadpool* threadpool) {
-    XNN_LOG_IF_ERROR(xnn_delete_threadpool(threadpool));
-  }
-};
-}  // namespace internal
-
-using XnnSubgraph = std::unique_ptr<xnn_subgraph, internal::XnnDeleter>;
-using XnnRuntime = std::unique_ptr<xnn_runtime, internal::XnnDeleter>;
-using XnnThreadpool = std::unique_ptr<xnn_threadpool, internal::XnnDeleter>;
-
-absl::StatusOr<XnnSubgraph> CreateXnnSubgraph(
-    absl::FunctionRef<xnn_status(xnn_subgraph_t*)> builder);
-
-absl::StatusOr<XnnRuntime> CreateXnnRuntime(
-    absl::FunctionRef<xnn_status(xnn_runtime_t*)> builder);
-
-absl::StatusOr<XnnThreadpool> CreateXnnThreadpool(
-    absl::FunctionRef<xnn_status(xnn_threadpool_t*)> builder);
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_INTEROP_H_
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.cc b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.cc
deleted file mode 100644
index 8ca982278e689e..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h"
-
-#include <cstdint>
-
-#include "experimental.h"  // xnnpack
-#include "absl/base/optimization.h"
-#include "absl/status/statusor.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-
-#define EIGEN_USE_THREADS
-#include "Eigen/ThreadPool"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace xla::cpu {
-
-static int32_t NumThreads(void* pool) {
-  if (ABSL_PREDICT_FALSE(pool == nullptr)) {
-    return 0;
-  }
-  return reinterpret_cast<Eigen::ThreadPoolInterface*>(pool)->NumThreads();
-}
-
-static void Schedule(void* pool, void* context, void (*task)(void* context)) {
-  if (ABSL_PREDICT_FALSE(pool == nullptr)) {
-    (*task)(context);
-  }
-  reinterpret_cast<Eigen::ThreadPoolInterface*>(pool)->Schedule(
-      [task, context]() { (*task)(context); });
-}
-
-// And adaptor from Eigen::ThreadPoolInterface to xnn_threadpool_t.
-static constexpr xnn_scheduler_v2 kXnnScheduler = {&NumThreads, &Schedule};
-
-absl::StatusOr<XnnThreadpool> CreateXnnThreadpool(
-    Eigen::ThreadPoolInterface* threadpool) {
-  return CreateXnnThreadpool([&](xnn_threadpool_t* xnn_threadpool) {
-    return xnn_create_threadpool_v2(kXnnScheduler, threadpool, /*flags=*/1,
-                                    xnn_threadpool);
-  });
-}
-
-absl::StatusOr<XnnThreadpool> CreateXnnThreadpool(
-    const Eigen::ThreadPoolDevice* device) {
-  return CreateXnnThreadpool(device->getPool());
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h b/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h
deleted file mode 100644
index d154af861814ff..00000000000000
--- a/third_party/xla/xla/backends/cpu/runtime/xnnpack/xnn_threadpool.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_THREADPOOL_H_
-#define XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_THREADPOOL_H_
-
-#include "absl/status/statusor.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-
-namespace Eigen {
-struct ThreadPoolDevice;
-class ThreadPoolInterface;
-}  // namespace Eigen
-
-namespace xla::cpu {
-
-// Creates an XNNPACK threadpool from an Eigen threadpool.
-absl::StatusOr<XnnThreadpool> CreateXnnThreadpool(
-    Eigen::ThreadPoolInterface* threadpool);
-
-// Creates an XNNPACK threadpool from an Eigen ThreadPoolDevice.
-absl::StatusOr<XnnThreadpool> CreateXnnThreadpool(
-    const Eigen::ThreadPoolDevice* device);
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_RUNTIME_XNNPACK_XNN_THREADPOOL_H_
diff --git a/third_party/xla/xla/backends/cpu/transforms/BUILD b/third_party/xla/xla/backends/cpu/transforms/BUILD
index d1d0503dc032b8..fde7907429e1eb 100644
--- a/third_party/xla/xla/backends/cpu/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/transforms/BUILD
@@ -26,7 +26,6 @@ cc_library(
     deps = [
         ":library_matcher",
         ":onednn_matcher",
-        ":xnn_matcher",
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
@@ -54,7 +53,6 @@ xla_cc_test(
         ":library_rewriter",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
-        "//xla/backends/cpu:xnn_gemm_config",
         "//xla/backends/cpu/codegen:target_machine_features",
         "//xla/backends/cpu/codegen:target_machine_test_base",
         "//xla/hlo/ir:hlo",
@@ -100,22 +98,6 @@ onednn_graph_cc_library(
     ],
 )
 
-cc_library(
-    name = "xnn_matcher",
-    hdrs = ["xnn_matcher.h"],
-    deps = [
-        ":library_matcher",
-        "//xla/backends/cpu:xnn_support",
-        "//xla/backends/cpu/codegen:target_machine_features",
-        "//xla/hlo/ir:hlo",
-        "@com_google_absl//absl/base:no_destructor",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings:string_view",
-        "@local_tsl//tsl/platform:protobuf",
-    ],
-)
-
 cc_library(
     name = "ynn_matcher",
     hdrs = ["ynn_matcher.h"],
@@ -130,40 +112,3 @@ cc_library(
         "@local_tsl//tsl/platform:protobuf",
     ] + if_ynnpack(["//xla/backends/cpu:ynn_support"]),
 )
-
-cc_library(
-    name = "xnn_graph_fusion",
-    srcs = ["xnn_graph_fusion.cc"],
-    hdrs = ["xnn_graph_fusion.h"],
-    deps = [
-        "//xla:shape_util",
-        "//xla:xla_proto_cc",
-        "//xla/backends/cpu:xnn_support",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_interop",
-        "//xla/hlo/ir:hlo",
-        "//xla/service:call_graph",
-        "//xla/service:instruction_fusion",
-        "//xla/service/cpu:backend_config_proto_cc",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/strings:string_view",
-    ],
-)
-
-xla_cc_test(
-    name = "xnn_graph_fusion_test",
-    srcs = ["xnn_graph_fusion_test.cc"],
-    deps = [
-        ":xnn_graph_fusion",
-        "//xla:xla_data_proto_cc",
-        "//xla:xla_proto_cc",
-        "//xla/backends/cpu:xnn_support",
-        "//xla/hlo/ir:hlo",
-        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
-        "//xla/hlo/utils:hlo_matchers",
-        "//xla/service/cpu:backend_config_proto_cc",
-        "//xla/tests:xla_internal_test_main",
-        "//xla/tsl/platform:statusor",
-        "@com_google_googletest//:gtest",
-    ],
-)
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
index a36f612c655442..38dba96cac1c67 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/backends/cpu/codegen/target_machine_features.h"
 #include "xla/backends/cpu/transforms/library_matcher.h"
-#include "xla/backends/cpu/transforms/xnn_matcher.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
@@ -53,10 +52,8 @@ enum class FusionDirection {
 
 struct LibraryRewriterOptions {
   bool use_onednn = false;
-  bool use_xnnpack = false;
   bool use_ynnpack = false;
   const tsl::protobuf::RepeatedField<int>* onednn_fusion_types = nullptr;
-  const tsl::protobuf::RepeatedField<int>* xnn_fusion_types = nullptr;
   const tsl::protobuf::RepeatedField<int>* ynn_fusion_types = nullptr;
 };
 
@@ -75,11 +72,6 @@ class LibraryRewriter : public HloModulePass {
           target_machine_features_, options_.onednn_fusion_types));
     }
 #endif  // XLA_ONEDNN_USE_GRAPH_API
-    if (options_.use_xnnpack && options_.xnn_fusion_types != nullptr &&
-        !options_.xnn_fusion_types->empty()) {
-      libs_.push_back(std::make_unique<XnnMatcher>(target_machine_features_,
-                                                   options_.xnn_fusion_types));
-    }
 #ifdef XLA_YNNPACK
     if (options_.use_ynnpack && options_.ynn_fusion_types != nullptr &&
         !options_.ynn_fusion_types->empty()) {
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc b/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
index 4d050182773bd6..430986457d215f 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/backends/cpu/codegen/target_machine_features.h"
 #include "xla/backends/cpu/codegen/target_machine_test_base.h"
-#include "xla/backends/cpu/xnn_gemm_config.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
@@ -79,9 +78,6 @@ class CpuLibraryTest : public TargetMachineTestBase {
             /*triple_string=*/"x86_64-unknown-linux-gnu", spec.cpu_name,
             spec.features);
 
-    // Override XnnGemmConfig.
-    GetXnnGemmConfig().SetTestFilter([](const XnnGemm&) { return true; });
-
     // Create an HLO module with the specified input and output data types.
     std::string hlo_text = absl::StrReplaceAll(
         hlo_template,
@@ -100,15 +96,12 @@ class CpuLibraryTest : public TargetMachineTestBase {
     }
     tsl::protobuf::RepeatedField<int> empty_fusion_types;
     bool use_onednn = spec.lib == "onednn";
-    bool use_xnnpack = spec.lib == "xnn";
     bool use_ynnpack = spec.lib == "ynn";
     LibraryRewriterOptions options = {
         use_onednn,
-        use_xnnpack,
         use_ynnpack,
         /*onednn_fusion_types=*/
         use_onednn ? &fusion_types : &empty_fusion_types,
-        /*xnn_fusion_types=*/use_xnnpack ? &fusion_types : &empty_fusion_types,
         /*ynn_fusion_types=*/use_ynnpack ? &fusion_types : &empty_fusion_types,
     };
     LibraryRewriter rewriter(features.get(), options);
diff --git a/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.cc b/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.cc
deleted file mode 100644
index b360691f66f6d6..00000000000000
--- a/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/transforms/xnn_graph_fusion.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "absl/container/flat_hash_set.h"
-#include "absl/log/check.h"
-#include "absl/strings/string_view.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/xnn_support.h"
-#include "xla/hlo/ir/hlo_casting_utils.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/primitive_util.h"
-#include "xla/service/call_graph.h"
-#include "xla/service/cpu/backend_config.pb.h"
-#include "xla/service/instruction_fusion.h"
-#include "xla/xla.pb.h"
-
-namespace xla::cpu {
-
-namespace {
-
-bool IsWideningConvert(const HloInstruction* instr) {
-  return instr->opcode() == HloOpcode::kConvert &&
-         primitive_util::BitWidth(instr->operand(0)->shape().element_type()) <
-             primitive_util::BitWidth(instr->shape().element_type());
-}
-
-}  // namespace
-
-FusionDecision XnnGraphFusion::ShouldFuse(HloInstruction* consumer,
-                                          int64_t operand_index) {
-  if (!IsXnnGraphFusion(consumer) && !IsOpSupported(consumer)) {
-    return FusionDecision::Forbid("Unsupported consumer");
-  }
-
-  if (consumer->opcode() == HloOpcode::kBroadcast) {
-    return FusionDecision::Forbid(
-        "Do not start growing fusions from broadcasts");
-  }
-
-  if (IsWideningConvert(consumer)) {
-    // We don't want to start a fusion with a widening convert, because that
-    // makes the buffer the fusion writes to bigger, and it would be better to
-    // fuse the convert into the consumer of the convert.
-    return FusionDecision::Forbid(
-        "Do not start growing fusions from widening converts");
-  }
-
-  HloInstruction* producer = consumer->mutable_operand(operand_index);
-  if (!(producer->opcode() == HloOpcode::kParameter ||
-        IsOpSupported(producer))) {
-    return FusionDecision::Forbid("Unsupported producer");
-  }
-  return FusionDecision::Allow();
-}
-
-HloInstruction::FusionKind XnnGraphFusion::ChooseKind(
-    const HloInstruction* producer, const HloInstruction* consumer) {
-  return HloInstruction::FusionKind::kCustom;
-}
-
-HloInstruction* XnnGraphFusion::Fuse(HloInstruction* producer,
-                                     HloInstruction* consumer,
-                                     HloComputation* computation) {
-  HloInstruction* fusion =
-      InstructionFusion::Fuse(producer, consumer, computation);
-
-  BackendConfig backend_config;
-  FusionBackendConfig* fusion_config = backend_config.mutable_fusion_config();
-  fusion_config->set_kind(kXnnFusionKind);
-  CHECK(backend_config.has_fusion_config());
-  CHECK_OK(fusion->set_backend_config(backend_config));
-  return fusion;
-}
-
-std::vector<HloComputation*> XnnGraphFusion::GetNonFusionComputations(
-    HloModule* module,
-    const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  std::vector<HloComputation*> non_fusion_computations =
-      InstructionFusion::GetNonFusionComputations(module, execution_threads);
-  std::unique_ptr<CallGraph> call_graph =
-      CallGraph::Build(module, execution_threads);
-  auto SkipComputation = [&](HloComputation* c) {
-    auto callers = call_graph->GetComputationCallers(c);
-    return std::any_of(
-        callers.begin(), callers.end(),
-        [&](HloInstruction* caller) { return caller->has_to_apply(); });
-  };
-  auto it = std::remove_if(non_fusion_computations.begin(),
-                           non_fusion_computations.end(), SkipComputation);
-  non_fusion_computations.erase(it, non_fusion_computations.end());
-  return non_fusion_computations;
-}
-
-bool XnnGraphFusion::IsOpSupported(const HloInstruction* instr) {
-  if (!IsLayoutSupportedByXnn(instr->shape())) {
-    return false;
-  }
-  if (!XnnDatatype(instr->shape().element_type()).ok()) {
-    return false;
-  }
-  if (instr->IsConstant()) {
-    return IsConstantSupportedByXnn(instr);
-  }
-  if (instr->IsElementwise()) {
-    return IsElementwiseOpSupportedByXnn(instr);
-  }
-
-  switch (instr->opcode()) {
-    case HloOpcode::kBitcast:
-      return IsBitcastOpSupportedByXnn(instr);
-    case HloOpcode::kBroadcast:
-      return IsBroadcastOpSupportedByXnn(instr);
-    case HloOpcode::kReduce:
-      return IsReduceOpSupportedByXnn(instr);
-    default:
-      return false;
-  }
-}
-
-bool XnnGraphFusion::IsXnnGraphFusion(const HloInstruction* instr) {
-  if (instr->opcode() != HloOpcode::kFusion) {
-    return false;
-  }
-  const HloFusionInstruction* fusion = Cast<HloFusionInstruction>(instr);
-  if (fusion->fusion_kind() != HloInstruction::FusionKind::kCustom) {
-    return false;
-  }
-  auto backend_config = fusion->backend_config<BackendConfig>();
-  if (!backend_config.ok() || !backend_config->has_fusion_config()) {
-    return false;
-  }
-  return backend_config->fusion_config().kind() == kXnnFusionKind;
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.h b/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.h
deleted file mode 100644
index ca596ad7a94ac0..00000000000000
--- a/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_TRANSFORMS_XNN_GRAPH_FUSION_H_
-#define XLA_BACKENDS_CPU_TRANSFORMS_XNN_GRAPH_FUSION_H_
-
-#include <cstdint>
-
-#include "absl/container/flat_hash_set.h"
-#include "absl/strings/string_view.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/service/instruction_fusion.h"
-
-namespace xla {
-namespace cpu {
-
-class XnnGraphFusion : public InstructionFusion {
- public:
-  XnnGraphFusion() : InstructionFusion(XnnGraphFusion::IsExpensive) {}
-  ~XnnGraphFusion() override = default;
-
- private:
-  FusionDecision ShouldFuse(HloInstruction* consumer,
-                            int64_t operand_index) override;
-  HloInstruction::FusionKind ChooseKind(
-      const HloInstruction* producer, const HloInstruction* consumer) override;
-
-  HloInstruction* Fuse(HloInstruction* producer, HloInstruction* consumer,
-                       HloComputation* computation) override;
-
-  std::vector<HloComputation*> GetNonFusionComputations(
-      HloModule* module,
-      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
-
-  static bool IsOpSupported(const HloInstruction* instr);
-
-  static bool IsXnnGraphFusion(const HloInstruction* instr);
-};
-
-}  // namespace cpu
-}  // namespace xla
-
-#endif  // XLA_BACKENDS_CPU_TRANSFORMS_XNN_GRAPH_FUSION_H_
diff --git a/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion_test.cc b/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion_test.cc
deleted file mode 100644
index b992f7aa6b74d6..00000000000000
--- a/third_party/xla/xla/backends/cpu/transforms/xnn_graph_fusion_test.cc
+++ /dev/null
@@ -1,333 +0,0 @@
-/* Copyright 2017 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/transforms/xnn_graph_fusion.h"
-
-#include <memory>
-#include <string>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "xla/backends/cpu/xnn_support.h"
-#include "xla/hlo/ir/hlo_casting_utils.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
-#include "xla/hlo/utils/hlo_matchers.h"
-#include "xla/service/cpu/backend_config.pb.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/xla.pb.h"
-#include "xla/xla_data.pb.h"
-
-namespace op = xla::testing::opcode_matchers;
-
-namespace xla::cpu {
-namespace {
-
-using XnnGraphFusionTest = HloHardwareIndependentTestBase;
-
-TEST_F(XnnGraphFusionTest, BasicFusion) {
-  std::string hlo_string = R"(
-HloModule FusionDemonstration
-
-ENTRY entry {
-   %param.0 = f32[2,2] parameter(0)
-   %constant.0 = f32[2,2] constant({ { 1, 2 }, { 3, 4 } })
-   %add.0 = f32[2,2] add(f32[2,2] %param.0, f32[2,2]{1,0} %constant.0)
-   %sub.0 = f32[2,2] subtract(f32[2,2] %param.0, f32[2,2] %constant.0)
-   ROOT %result = f32[2,2] multiply(f32[2,2] %add.0, f32[2,2] %sub.0)
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_TRUE(changed);
-  EXPECT_THAT(module.get()->entry_computation()->root_instruction(),
-              op::Fusion());
-  HloInstruction* root = module->entry_computation()->root_instruction();
-  ASSERT_EQ(root->opcode(), HloOpcode::kFusion);
-  HloFusionInstruction* fusion = Cast<HloFusionInstruction>(root);
-  TF_ASSERT_OK_AND_ASSIGN(auto backend_config,
-                          fusion->backend_config<BackendConfig>());
-  ASSERT_TRUE(backend_config.has_fusion_config());
-  EXPECT_EQ(backend_config.fusion_config().kind(), kXnnFusionKind);
-}
-
-TEST_F(XnnGraphFusionTest, BasicFusionUnsupportedType) {
-  std::string hlo_string = R"(
-HloModule FusionDemonstration
-
-ENTRY entry {
-   %param.0 = s2[2,2] parameter(0)
-   %constant.0 = s2[2,2] constant({ { 0, 1 }, { 1, 0 } })
-   %add.0 = s2[2,2] add(s2[2,2] %param.0, s2[2,2] %constant.0)
-   %sub.0 = s2[2,2] subtract(s2[2,2] %param.0, s2[2,2] %constant.0)
-   ROOT %result = s2[2,2] multiply(s2[2,2] %add.0, s2[2,2] %sub.0)
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, BasicFusionUnsupportedLayout) {
-  std::string hlo_string = R"(
-HloModule FusionDemonstration
-
-ENTRY entry {
-   %param.0 = f32[2,2]{0,1} parameter(0)
-   %constant.0 = f32[2,2]{0,1} constant({ { 0, 1 }, { 1, 0 } })
-   %add.0 = f32[2,2]{0,1} add(f32[2,2]{0,1} %param.0, f32[2,2]{0,1} %constant.0)
-   %sub.0 = f32[2,2]{0,1} subtract(f32[2,2]{0,1} %param.0, f32[2,2]{0,1} %constant.0)
-   ROOT %result = f32[2,2]{0,1} multiply(f32[2,2]{0,1} %add.0, f32[2,2]{0,1} %sub.0)
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-static void SetFusionMode(HloModule* module,
-                          DebugOptions::XnnGraphFusionMode mode) {
-  module->mutable_config()
-      .mutable_debug_options()
-      .set_xla_cpu_experimental_xnn_graph_fusion_mode(mode);
-}
-
-TEST_F(XnnGraphFusionTest, BasicBroadcast) {
-  std::string hlo_string = R"(
-HloModule BroadcastFusion
-
-ENTRY entry {
-  %param.0 = f32[] parameter(0)
-  %broadcast.0 = f32[2,2] broadcast(f32[] %param.0), dimensions={}
-  ROOT result = f32[2,2] add(f32[2,2] %broadcast.0, f32[2,2] %broadcast.0)
-}
-
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_TRUE(changed);
-  EXPECT_THAT(module.get()->entry_computation()->root_instruction(),
-              op::Fusion());
-
-  HloInstruction* root = module->entry_computation()->root_instruction();
-  ASSERT_EQ(root->opcode(), HloOpcode::kFusion);
-  HloFusionInstruction* fusion = Cast<HloFusionInstruction>(root);
-  TF_ASSERT_OK_AND_ASSIGN(auto backend_config,
-                          fusion->backend_config<BackendConfig>());
-  ASSERT_TRUE(backend_config.has_fusion_config());
-  EXPECT_EQ(backend_config.fusion_config().kind(), kXnnFusionKind);
-}
-
-TEST_F(XnnGraphFusionTest, SkipRootBroadcast) {
-  std::string hlo_string = R"(
-HloModule SkipRootBroadcast
-
-ENTRY entry {
-  %param.0 = f32[] parameter(0)
-  %add.0 = f32[] add(f32[] %param.0, f32[] %param.0)
-  ROOT result = f32[2,2] broadcast(f32[] %param.0), dimensions={}
-}
-
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, SkipUnsupportedBroadcast) {
-  // Broadcast changes the relative order of dimensions.
-  std::string hlo_string = R"(
-HloModule SkipUnsupportedBroadcast
-
-ENTRY entry {
-  %param.0 = f32[2,3] parameter(0)
-  %broadcast.0 = f32[4,3,2] broadcast(f32[2,3] %param.0), dimensions={2,1}
-  ROOT result = f32[4,3,2] add(f32[4,3,2] %broadcast.0, f32[4,3,2] %broadcast.0)
-}
-
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, SkipRootWideningConvert) {
-  std::string hlo_string = R"(
-HloModule SkipRootWideningConvert
-
-ENTRY entry {
-  %param.0 = f32[4] parameter(0)
-  %to_bf16.0 = bf16[4] convert(f32[4] %param.0)
-  ROOT result = f32[4] convert(bf16[4] %to_bf16.0)
-}
-
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, BasicFusionUnsupportedOperandType) {
-  std::string hlo_string = R"(
-HloModule BasicFusionUnsupportedOperandType
-
-ENTRY entry {
-   %param.0 = s1[2,2] parameter(0)
-   ROOT %converted_param.0 = f32[2,2] convert(s1[2,2] %param.0)
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, BasicReduce) {
-  std::string hlo_string = R"(
-HloModule BasicReduce
-
-reducer {
-  arg_0 = f32[] parameter(0)
-  arg_1 = f32[] parameter(1)
-  ROOT maximum = f32[] maximum(arg_0, arg_1)
-}
-
-ENTRY main {
-  arg_0 = f32[3,2] parameter(0)
-  init = f32[] constant(-inf)
-  ROOT result = f32[] reduce(arg_0, init), dimensions={0,1}, to_apply=reducer
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_TRUE(changed);
-  EXPECT_THAT(module.get()->entry_computation()->root_instruction(),
-              op::Fusion());
-
-  HloInstruction* root = module->entry_computation()->root_instruction();
-  HloFusionInstruction* fusion = Cast<HloFusionInstruction>(root);
-  TF_ASSERT_OK_AND_ASSIGN(auto backend_config,
-                          fusion->backend_config<BackendConfig>());
-  ASSERT_TRUE(backend_config.has_fusion_config());
-  EXPECT_EQ(backend_config.fusion_config().kind(), kXnnFusionKind);
-}
-
-TEST_F(XnnGraphFusionTest, SkipReduceWithUnsupportedInit) {
-  std::string hlo_string = R"(
-HloModule SkipReduceWithUnsupportedInit
-
-reducer {
-  arg_0 = f32[] parameter(0)
-  arg_1 = f32[] parameter(1)
-  ROOT maximum = f32[] maximum(arg_0, arg_1)
-}
-
-ENTRY main {
-  arg_0 = f32[3,2] parameter(0)
-  init = f32[] constant(1.33)
-  ROOT result = f32[] reduce(arg_0, init), dimensions={0,1}, to_apply=reducer
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, SkipReduceWithUnsupportedReducer) {
-  std::string hlo_string = R"(
-HloModule SkipReduceWithUnsupportedReducer
-
-reducer {
-  arg_0 = f32[] parameter(0)
-  arg_1 = f32[] parameter(1)
-  ROOT sub = f32[] subtract(arg_0, arg_1)
-}
-
-ENTRY main {
-  arg_0 = f32[3,2] parameter(0)
-  init = f32[] constant(1.33)
-  ROOT result = f32[] reduce(arg_0, init), dimensions={0,1}, to_apply=reducer
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-TEST_F(XnnGraphFusionTest, NoFusionInsideReducer) {
-  std::string hlo_string = R"(
-HloModule NoFusionInsideReducer
-
-reducer {
-  arg_0 = f32[] parameter(0)
-  arg_1 = f32[] parameter(1)
-  mul = f32[] multiply(arg_0, arg_1)
-  ROOT result = f32[] add(arg_0, mul)
-}
-
-ENTRY main {
-  arg_0 = f32[3,2] parameter(0)
-  init = f32[] constant(1.33)
-  ROOT result = f32[] reduce(arg_0, init), dimensions={0,1}, to_apply=reducer
-}
-)";
-
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(hlo_string));
-  SetFusionMode(module.get(),
-                DebugOptions::XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY);
-  TF_ASSERT_OK_AND_ASSIGN(bool changed, XnnGraphFusion().Run(module.get()));
-  ASSERT_FALSE(changed);
-}
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/transforms/xnn_matcher.h b/third_party/xla/xla/backends/cpu/transforms/xnn_matcher.h
deleted file mode 100644
index faa943fa4ce929..00000000000000
--- a/third_party/xla/xla/backends/cpu/transforms/xnn_matcher.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_TRANSFORMS_XNN_MATCHER_H_
-#define XLA_BACKENDS_CPU_TRANSFORMS_XNN_MATCHER_H_
-
-#include <string>
-
-#include "absl/base/no_destructor.h"
-#include "absl/container/flat_hash_set.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/string_view.h"
-#include "xla/backends/cpu/codegen/target_machine_features.h"
-#include "xla/backends/cpu/transforms/library_matcher.h"
-#include "xla/backends/cpu/xnn_support.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "tsl/platform/protobuf.h"
-
-namespace xla::cpu {
-
-class XnnMatcher : public LibraryMatcher {
- public:
-  explicit XnnMatcher(const TargetMachineFeatures* target_machine_features,
-                      const tsl::protobuf::RepeatedField<int>* fusion_types)
-      : LibraryMatcher(target_machine_features, fusion_types) {}
-  ~XnnMatcher() override = default;
-
-  // Returns the set of supported HLO instructions.
-  absl::flat_hash_set<HloOpcode> SupportedOps() const override {
-    static const absl::NoDestructor<absl::flat_hash_set<HloOpcode>>
-        kSupportedOps{[]() {
-          absl::flat_hash_set<HloOpcode> supported_ops{
-              HloOpcode::kDot, HloOpcode::kReduce, HloOpcode::kConstant};
-          for (const auto& [op, _] : GetXnnUnaryOpMap()) {
-            supported_ops.insert(op);
-          }
-          for (const auto& [op, _] : GetXnnBinaryOpMap()) {
-            supported_ops.insert(op);
-          }
-          return supported_ops;
-        }()};
-    return *kSupportedOps;
-  }
-
-  // Returns true if the HLO instruction is supported by the library.
-  absl::StatusOr<bool> IsOpSupported(const HloInstruction* instr) override {
-    if (instr->opcode() == HloOpcode::kDot) {
-      return IsDotSupportedByXnn(
-          instr->dot_dimension_numbers(), instr->operand(0)->shape(),
-          instr->operand(1)->shape(), instr->shape(), target_machine_features_);
-    }
-    if (instr->opcode() == HloOpcode::kReduce) {
-      return IsReduceOpSupportedByXnn(instr);
-    }
-    if (instr->IsConstant()) {
-      return IsConstantSupportedByXnn(instr);
-    }
-    // TODO(b/441837668): Need to get the reduction performance/cost model
-    // right before enabling fusions. Fusions make performance analysis quite
-    // challenging.
-    if (fuse_reduce_) {
-      return false;
-    }
-    if (instr->IsElementwise()) {
-      return IsElementwiseOpSupportedByXnn(instr);
-    }
-    return false;
-  }
-
-  // Returns true if we should start a new fusion containing just the given HLO
-  // instruction. We control the instructions that can start a fusion with the
-  // `--xla_cpu_experimental_xnn_fusion_type` flag.
-  bool ShouldCreateFusion(const HloInstruction* instr) override {
-    if (fuse_dot_ && instr->opcode() == HloOpcode::kDot) {
-      return true;
-    }
-    if (fuse_reduce_ && instr->opcode() == HloOpcode::kReduce) {
-      return true;
-    }
-    return fuse_eltwise_ && instr->IsElementwise();
-  }
-
-  // Returns the output type of the XNN op, so we can insert a convert node if
-  // the op does not support the original HLO output type.
-  PrimitiveType LibraryOpOutputType(const HloInstruction* instr) override {
-    auto out_type = instr->shape().element_type();
-    if (instr->opcode() != HloOpcode::kDot) {
-      return out_type;
-    }
-    return out_type == BF16 ? F32 : out_type;
-  }
-
-  // Returns a prefix string for the fusion op's name.
-  std::string fusion_prefix() const override { return "xnn_"; }
-
-  // Returns a string for FusionBackendConfig's fusion kind.
-  absl::string_view fusion_kind() const override { return kXnnFusionKind; }
-
- private:
-  absl::flat_hash_set<DebugOptions::LibraryFusionType> fusion_types_;
-};
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_TRANSFORMS_XNN_MATCHER_H_
diff --git a/third_party/xla/xla/backends/cpu/xnn_emitter.cc b/third_party/xla/xla/backends/cpu/xnn_emitter.cc
deleted file mode 100644
index 30473a0bc2ed9a..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_emitter.cc
+++ /dev/null
@@ -1,507 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/xnn_emitter.h"
-
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <vector>
-
-#include "xnnpack.h"
-#include "absl/container/flat_hash_map.h"
-#include "absl/functional/any_invocable.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/str_format.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/xnn_support.h"
-#include "xla/hlo/ir/hlo_casting_utils.h"
-#include "xla/hlo/ir/hlo_computation.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/literal.h"
-#include "xla/shape.h"
-#include "xla/tsl/platform/logging.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/util.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-// A mapping from HloInstruction to XNNPACK subgraph tensor id.
-using TensorIdMap = absl::flat_hash_map<const HloInstruction*, uint32_t>;
-
-//===----------------------------------------------------------------------===//
-// XLA <-> XNNPACK type conversion library.
-//===----------------------------------------------------------------------===//
-
-static std::vector<size_t> XnnDimensions(const Shape& shape) {
-  std::vector<size_t> dims;
-  for (auto& dim : shape.dimensions()) {
-    dims.push_back(dim);
-  }
-  return dims;
-}
-
-//===----------------------------------------------------------------------===//
-// XLA <-> XNNPACK emitters.
-//===----------------------------------------------------------------------===//
-
-static absl::StatusOr<uint32_t> FindTensorValue(const TensorIdMap& tensor_ids,
-                                                const HloInstruction* instr) {
-  if (auto it = tensor_ids.find(instr); it != tensor_ids.end()) {
-    return it->second;
-  }
-  return Internal("Can't fine XNNPACK tensor value for instruction %s",
-                  instr->ToString());
-}
-
-static absl::StatusOr<uint32_t> DefineTensorValue(
-    xnn_subgraph_t subgraph, xnn_datatype type, absl::Span<const size_t> dims) {
-  uint32_t tensor_id = XNN_INVALID_VALUE_ID;
-  uint32_t tensor_flags = 0;
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph, type, dims.size(), dims.data(), nullptr,
-      /*external_id=*/tensor_id, tensor_flags, &tensor_id));
-
-  return tensor_id;
-}
-
-static absl::StatusOr<uint32_t> DefineTensorValue(xnn_subgraph_t subgraph,
-                                                  const HloInstruction* instr) {
-  // We do not support instructions with multiple results (tuples).
-  if (!instr->shape().IsArray()) {
-    return Internal("Unsupported XNNPACK instruction shape: %s",
-                    instr->ToString());
-  }
-
-  auto dims = XnnDimensions(instr->shape());
-  TF_ASSIGN_OR_RETURN(auto type, XnnDatatype(instr->shape().element_type()));
-
-  uint32_t tensor_id = XNN_INVALID_VALUE_ID;
-  uint32_t tensor_flags = 0;
-
-  // If instruction is a root instruction of the parent computation we assign it
-  // an external tensor id corresponding to the result index.
-  const HloComputation* computation = instr->parent();
-  if (computation->root_instruction() == instr) {
-    tensor_id = computation->num_parameters();
-    tensor_flags = XNN_VALUE_FLAG_EXTERNAL_OUTPUT;
-  }
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph, type, dims.size(), dims.data(), nullptr,
-      /*external_id=*/tensor_id, tensor_flags, &tensor_id));
-
-  return tensor_id;
-}
-
-static absl::StatusOr<uint32_t> DefineConstant(
-    xnn_subgraph_t subgraph, std::vector<std::unique_ptr<Literal>>& literals,
-    const HloInstruction* instr) {
-  // We do not support instructions with multiple results (tuples).
-  if (!instr->shape().IsArray()) {
-    return Internal("Unsupported XNNPACK instruction shape: %s",
-                    instr->ToString());
-  }
-
-  auto dims = XnnDimensions(instr->shape());
-  TF_ASSIGN_OR_RETURN(auto type, XnnDatatype(instr->shape().element_type()));
-
-  uint32_t tensor_id = XNN_INVALID_VALUE_ID;
-  uint32_t tensor_flags = 0;
-
-  literals.push_back(instr->literal().CloneToUnique());
-  const void* value = literals.back()->untyped_data();
-
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph, type, dims.size(), dims.data(), value,
-      /*external_id=*/tensor_id, tensor_flags, &tensor_id));
-
-  return tensor_id;
-}
-
-static absl::StatusOr<uint32_t> DefineParameter(xnn_subgraph_t subgraph,
-                                                const HloInstruction* param) {
-  VLOG(3) << absl::StreamFormat("Define tensor value for parameter: %s",
-                                param->ToString());
-
-  auto dims = XnnDimensions(param->shape());
-  TF_ASSIGN_OR_RETURN(auto type, XnnDatatype(param->shape().element_type()));
-
-  uint32_t tensor_id = param->parameter_number();
-  XNN_RETURN_IF_ERROR(xnn_define_tensor_value(
-      subgraph, type, dims.size(), dims.data(), nullptr,
-      /*external_id=*/tensor_id, XNN_VALUE_FLAG_EXTERNAL_INPUT, &tensor_id));
-
-  return tensor_id;
-}
-
-static absl::StatusOr<uint32_t> DefineBitcastOp(xnn_subgraph_t subgraph,
-                                                TensorIdMap& tensor_ids,
-                                                const HloInstruction* instr) {
-  VLOG(3) << absl::StreamFormat("Define tensor value for bitcast op: %s",
-                                instr->ToString());
-  CHECK_EQ(instr->opcode(), HloOpcode::kBitcast);
-  const HloInstruction* input = instr->operand(0);
-  CHECK_EQ(input->shape().element_type(), instr->shape().element_type());
-  TF_ASSIGN_OR_RETURN(auto in, FindTensorValue(tensor_ids, input));
-  TF_ASSIGN_OR_RETURN(auto out, DefineTensorValue(subgraph, instr));
-
-  auto dims = XnnDimensions(instr->shape());
-  XNN_RETURN_IF_ERROR(xnn_define_static_reshape(subgraph, dims.size(),
-                                                dims.data(), in, out,
-                                                /*flags=*/0));
-  return out;
-}
-
-static absl::StatusOr<uint32_t> DefineBroadcastOp(xnn_subgraph_t subgraph,
-                                                  TensorIdMap& tensor_ids,
-                                                  const HloInstruction* instr) {
-  VLOG(3) << absl::StreamFormat("Define tensor value for broadcast op: %s",
-                                instr->ToString());
-  CHECK_EQ(instr->opcode(), HloOpcode::kBroadcast);
-  const HloBroadcastInstruction* broadcast_instr =
-      Cast<HloBroadcastInstruction>(instr);
-  const HloInstruction* input = broadcast_instr->operand(0);
-  CHECK_EQ(input->shape().element_type(), instr->shape().element_type());
-
-  const absl::Span<const int64_t> input_dims = input->shape().dimensions();
-  const absl::Span<const int64_t> output_dims = instr->shape().dimensions();
-  const absl::Span<const int64_t> dims = broadcast_instr->dimensions();
-  CHECK(std::is_sorted(dims.begin(), dims.end()));
-  CHECK_LE(input_dims.size(), output_dims.size());
-
-  const size_t num_new_axes = output_dims.size() - input_dims.size();
-  // New axis positions used by XNNPACK expand_dims.
-  std::vector<size_t> xnn_expand_dims_new_axes;
-  xnn_expand_dims_new_axes.reserve(num_new_axes);
-  std::vector<size_t> xnn_expand_dims_dimensions;
-  xnn_expand_dims_dimensions.reserve(output_dims.size());
-
-  // Mask used by XNNPACK broadcast.
-  std::vector<size_t> xnn_new_shape;
-  xnn_new_shape.reserve(output_dims.size());
-
-  for (size_t dim_idx = 0; dim_idx < output_dims.size(); ++dim_idx) {
-    const auto it = std::find(dims.begin(), dims.end(), dim_idx);
-    if (it == dims.end()) {
-      // New dimension case.
-      xnn_expand_dims_new_axes.push_back(dim_idx);
-      xnn_expand_dims_dimensions.push_back(1u);
-      // Broadcasted dimension.
-      xnn_new_shape.push_back(output_dims[dim_idx]);
-    } else {
-      // Pass through the input dimension.
-      const size_t input_dim_idx = it - dims.begin();
-      CHECK_EQ(*it, dim_idx);
-      const size_t input_dim = input_dims[input_dim_idx];
-      CHECK_EQ(input_dim, output_dims[dim_idx]);
-      xnn_expand_dims_dimensions.push_back(input_dim);
-      // 0 means keeping the dimension of the input.
-      // See the description of xnn_define_static_broadcast in xnnpack.h
-      xnn_new_shape.push_back(0u);
-    }
-  }
-
-  CHECK_EQ(xnn_expand_dims_dimensions.size(), output_dims.size());
-  CHECK_EQ(xnn_expand_dims_new_axes.size(), num_new_axes);
-  CHECK_EQ(xnn_new_shape.size(), output_dims.size());
-
-  TF_ASSIGN_OR_RETURN(auto type, XnnDatatype(input->shape().element_type()));
-  TF_ASSIGN_OR_RETURN(auto in, FindTensorValue(tensor_ids, input));
-  TF_ASSIGN_OR_RETURN(
-      auto xnn_dims_expanded,
-      DefineTensorValue(subgraph, type, xnn_expand_dims_dimensions));
-  TF_ASSIGN_OR_RETURN(auto xnn_broadcast, DefineTensorValue(subgraph, instr));
-
-  XNN_RETURN_IF_ERROR(xnn_define_static_expand_dims(
-      subgraph, num_new_axes, xnn_expand_dims_new_axes.data(), in,
-      xnn_dims_expanded, /*flags=*/0));
-
-  XNN_RETURN_IF_ERROR(xnn_define_static_broadcast(
-      subgraph, xnn_new_shape.size(), xnn_new_shape.data(), xnn_dims_expanded,
-      xnn_broadcast, /*flags=*/0));
-
-  return xnn_broadcast;
-}
-
-static absl::StatusOr<uint32_t> DefineReduceOp(xnn_subgraph_t subgraph,
-                                               TensorIdMap& tensor_ids,
-                                               const HloInstruction* instr) {
-  VLOG(3) << absl::StreamFormat("Define tensor value for reduce op: %s",
-                                instr->ToString());
-  CHECK_EQ(instr->opcode(), HloOpcode::kReduce);
-  const HloReduceInstruction* reduce_instr = Cast<HloReduceInstruction>(instr);
-  const HloInstruction* input = instr->operand(0);
-  CHECK_EQ(input->shape().element_type(), instr->shape().element_type());
-
-  xnn_reduce_operator xnn_reduce_op = xnn_reduce_invalid;
-  CHECK_EQ(reduce_instr->to_apply()->num_parameters(), 2);
-  CHECK_EQ(reduce_instr->to_apply()->instruction_count(), 3);
-
-  switch (reduce_instr->to_apply()->root_instruction()->opcode()) {
-    case HloOpcode::kAdd:
-      xnn_reduce_op = xnn_reduce_sum;
-      break;
-    case HloOpcode::kMaximum:
-      xnn_reduce_op = xnn_reduce_max;
-      break;
-    case HloOpcode::kMinimum:
-      xnn_reduce_op = xnn_reduce_min;
-      break;
-    default:
-      LOG(FATAL) << "Unsupported reduction: " << instr->to_apply()->ToString();
-  }
-
-  const absl::Span<const int64_t> dims = reduce_instr->dimensions();
-  TF_ASSIGN_OR_RETURN(auto in, FindTensorValue(tensor_ids, input));
-  TF_ASSIGN_OR_RETURN(auto out, DefineTensorValue(subgraph, instr));
-  XNN_RETURN_IF_ERROR(xnn_define_static_reduce(
-      subgraph, xnn_reduce_op, dims.size(),
-      reinterpret_cast<const size_t*>(dims.data()), in, out,
-      /*flags=*/0));
-  return out;
-}
-
-static absl::StatusOr<uint32_t> DefineUnaryOp(xnn_subgraph_t subgraph,
-                                              TensorIdMap& tensor_ids,
-                                              const HloInstruction* instr) {
-  VLOG(3) << absl::StreamFormat("Define tensor value for unary op: %s",
-                                instr->ToString());
-  TF_ASSIGN_OR_RETURN(auto unary_op, XnnUnaryOperator(instr->opcode()));
-
-  TF_ASSIGN_OR_RETURN(auto in, FindTensorValue(tensor_ids, instr->operand(0)));
-  TF_ASSIGN_OR_RETURN(auto out, DefineTensorValue(subgraph, instr));
-
-  VLOG(3) << absl::StreamFormat("  tensors: in=%d, out=%d", in, out);
-
-  xnn_unary_params params;
-  XNN_RETURN_IF_ERROR(
-      xnn_define_unary(subgraph, unary_op, &params, in, out, /*flags=*/0));
-
-  return out;
-}
-
-static absl::StatusOr<uint32_t> DefineBinaryOp(xnn_subgraph_t subgraph,
-                                               TensorIdMap& tensor_ids,
-                                               const HloInstruction* instr) {
-  VLOG(3) << absl::StreamFormat("Define tensor value for binary op: %s",
-                                instr->ToString());
-
-  TF_ASSIGN_OR_RETURN(auto binary_op, XnnBinaryOperator(instr->opcode()));
-
-  TF_ASSIGN_OR_RETURN(auto lhs, FindTensorValue(tensor_ids, instr->operand(0)));
-  TF_ASSIGN_OR_RETURN(auto rhs, FindTensorValue(tensor_ids, instr->operand(1)));
-  TF_ASSIGN_OR_RETURN(auto out, DefineTensorValue(subgraph, instr));
-
-  VLOG(3) << absl::StreamFormat("  tensors: lhs=%d, rhs=%d, out=%d", lhs, rhs,
-                                out);
-
-  xnn_binary_params params = {-std::numeric_limits<float>::infinity(),
-                              std::numeric_limits<float>::infinity()};
-
-  // In XLA, broadcasts are explicit ops, allowing XNNPACK to assume there is no
-  // broadcasting in the elementwise operation itself, which simplifies data
-  // dependencies.
-  const uint32_t flags = XNN_FLAG_NO_BROADCAST;
-  XNN_RETURN_IF_ERROR(xnn_define_binary(subgraph, binary_op, &params, lhs, rhs,
-                                        out, /*flags=*/flags));
-
-  return out;
-}
-
-static absl::StatusOr<uint32_t> DefineBatchMatMul(xnn_subgraph_t subgraph,
-                                                  TensorIdMap& tensor_ids,
-                                                  const HloInstruction* instr) {
-  // Verify that this Dot is supported by XNNPACK.
-  const DotDimensionNumbers& dnums = instr->dot_dimension_numbers();
-  const Shape& lhs_shape = instr->operand(0)->shape();
-  const Shape& rhs_shape = instr->operand(1)->shape();
-  TF_ASSIGN_OR_RETURN(
-      bool is_supported,
-      IsDotSupportedByXnn(dnums, lhs_shape, rhs_shape, instr->shape(),
-                          /*cpu_features=*/nullptr, /*use_cost_model=*/false));
-
-  if (!is_supported) {
-    return InvalidArgument("Unsupported XNNPACK Dot op variation: %s",
-                           instr->ToString());
-  }
-
-  VLOG(3) << "Define tensor values for batch_matrix_multiply op";
-
-  TF_ASSIGN_OR_RETURN(uint32_t lhs,
-                      FindTensorValue(tensor_ids, instr->operand(0)));
-  TF_ASSIGN_OR_RETURN(uint32_t rhs,
-                      FindTensorValue(tensor_ids, instr->operand(1)));
-  TF_ASSIGN_OR_RETURN(uint32_t out, DefineTensorValue(subgraph, instr));
-
-  VLOG(3) << absl::StreamFormat("  tensors: lhs=%d, rhs=%d, out=%d", lhs, rhs,
-                                out);
-
-  // In XLA, broadcasts are explicit ops, allowing XNNPACK to assume there is no
-  // broadcasting in the elementwise operation itself, which simplifies data
-  // dependencies.
-  uint32_t flags = XNN_FLAG_NO_BROADCAST;
-  // IsXnnDotSupported has verified that rhs_contracting_dimensions has size 1.
-  if (dnums.rhs_contracting_dimensions(0) !=
-      dnums.rhs_batch_dimensions_size()) {
-    flags |= XNN_FLAG_TRANSPOSE_B;
-  }
-  XNN_RETURN_IF_ERROR(xnn_define_batch_matrix_multiply(subgraph, lhs, rhs, out,
-                                                       /*flags=*/flags));
-
-  return out;
-}
-
-//===----------------------------------------------------------------------===//
-// Emit XNNPACK subgraph for the given HLO computation.
-//===----------------------------------------------------------------------===//
-
-static absl::StatusOr<XnnSubgraph> EmitXnnSubgraph(
-    const HloComputation* computation,
-    std::vector<std::unique_ptr<Literal>>& literals) {
-  VLOG(3) << "Emit XNNPACK subgraph for computation: " << computation->name();
-
-  TF_ASSIGN_OR_RETURN(
-      XnnSubgraph subgraph, CreateXnnSubgraph([&](xnn_subgraph_t* subgraph) {
-        return xnn_create_subgraph(
-            /*external_value_ids=*/computation->num_parameters() + 1,
-            /*flags=*/0, subgraph);
-      }));
-
-  // Traverse fused computation in post-order and define XNNPACK operations
-  // corresponding to each HLO instruction.
-  TensorIdMap tensor_ids;
-  auto instructions = computation->MakeInstructionPostOrder();
-
-  for (const HloInstruction* instr : instructions) {
-    if (!IsLayoutSupportedByXnn(instr->shape())) {
-      return InvalidArgument(
-          "Instruction with unsupported layout in XNN fusion: %s",
-          instr->ToString());
-    }
-
-    if (instr->IsConstant()) {
-      if (!IsConstantSupportedByXnn(instr)) {
-        return InvalidArgument(
-            "Unsupported constant instruction in XNN fusion: %s",
-            instr->ToString());
-      }
-      TF_ASSIGN_OR_RETURN(tensor_ids[instr],
-                          DefineConstant(subgraph.get(), literals, instr));
-      continue;
-    }
-
-    if (instr->IsElementwise()) {
-      if (!IsElementwiseOpSupportedByXnn(instr)) {
-        return InvalidArgument(
-            "Unsupported elementwise instruction in XNN fusion: %s",
-            instr->ToString());
-      }
-      if (instr->operand_count() == 1) {
-        TF_ASSIGN_OR_RETURN(tensor_ids[instr],
-                            DefineUnaryOp(subgraph.get(), tensor_ids, instr));
-      } else if (instr->operand_count() == 2) {
-        TF_ASSIGN_OR_RETURN(tensor_ids[instr],
-                            DefineBinaryOp(subgraph.get(), tensor_ids, instr));
-      } else {
-        LOG(FATAL) << "Unexpected operand count " << instr->operand_count();
-      }
-      continue;
-    }
-
-    switch (instr->opcode()) {
-      case HloOpcode::kParameter: {
-        TF_ASSIGN_OR_RETURN(tensor_ids[instr],
-                            DefineParameter(subgraph.get(), instr));
-      } break;
-
-      case HloOpcode::kBitcast: {
-        if (!IsBitcastOpSupportedByXnn(instr)) {
-          return InvalidArgument(
-              "Unsupported bitcast instruction in XNN fusion: %s",
-              instr->ToString());
-        }
-        TF_ASSIGN_OR_RETURN(tensor_ids[instr],
-                            DefineBitcastOp(subgraph.get(), tensor_ids, instr));
-      } break;
-
-      case HloOpcode::kBroadcast: {
-        if (!IsBroadcastOpSupportedByXnn(instr)) {
-          return InvalidArgument(
-              "Unsupported broadcast instruction in XNN fusion: %s",
-              instr->ToString());
-        }
-        TF_ASSIGN_OR_RETURN(
-            tensor_ids[instr],
-            DefineBroadcastOp(subgraph.get(), tensor_ids, instr));
-      } break;
-
-      case HloOpcode::kReduce: {
-        // FIXME: Validate the reduce instruction.
-        // One cannot directly use IsReduceOpSupportedByXnn since the invariant
-        // value is not necessarily included into the same fusion. This might
-        // happen if the original instruction has multiple users or was rejected
-        // by the fusion compiler pass.
-        TF_ASSIGN_OR_RETURN(tensor_ids[instr],
-                            DefineReduceOp(subgraph.get(), tensor_ids, instr));
-      } break;
-
-      case HloOpcode::kDot: {
-        TF_ASSIGN_OR_RETURN(
-            tensor_ids[instr],
-            DefineBatchMatMul(subgraph.get(), tensor_ids, instr));
-      } break;
-
-      default: {
-        return InvalidArgument("Unsupported XNNPACK fusion instruction: %s",
-                               instr->ToString());
-      }
-    }
-  }
-
-  return subgraph;
-}
-
-absl::StatusOr<absl::AnyInvocable<absl::StatusOr<XnnSubgraph>()>>
-EmitXnnFusionBuilder(const HloComputation* computation) {
-  // We do not support non-array parameters for XNNPACK operations.
-  for (auto& param : computation->parameter_instructions()) {
-    if (!param->shape().IsArray()) {
-      return InvalidArgument(
-          "XNNPACK fusion parameters must have array shapes, got %s",
-          param->shape().ToString());
-    }
-  }
-
-  // Result also must be a single array.
-  if (!computation->root_instruction()->shape().IsArray()) {
-    return InvalidArgument("XNNPACK fusion result must be an array, got %s",
-                           computation->root_instruction()->shape().ToString());
-  }
-
-  return [computation,
-          literals = std::vector<std::unique_ptr<Literal>>()]() mutable {
-    return EmitXnnSubgraph(computation, literals);
-  };
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/xnn_emitter.h b/third_party/xla/xla/backends/cpu/xnn_emitter.h
deleted file mode 100644
index 439e7f25d84e0c..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_emitter.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_XNN_EMITTER_H_
-#define XLA_BACKENDS_CPU_XNN_EMITTER_H_
-
-#include "absl/functional/any_invocable.h"
-#include "absl/status/statusor.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/hlo/ir/hlo_computation.h"
-
-namespace xla::cpu {
-
-absl::StatusOr<absl::AnyInvocable<absl::StatusOr<XnnSubgraph>()>>
-EmitXnnFusionBuilder(const HloComputation* computation);
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_XNN_EMITTER_H_
diff --git a/third_party/xla/xla/backends/cpu/xnn_gemm_config.cc b/third_party/xla/xla/backends/cpu/xnn_gemm_config.cc
deleted file mode 100644
index 6750a849524741..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_gemm_config.cc
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/xnn_gemm_config.h"
-
-#include <algorithm>
-#include <array>
-#include <cmath>
-#include <cstddef>
-#include <limits>
-#include <numeric>
-
-#include "absl/base/no_destructor.h"
-#include "absl/log/check.h"
-#include "llvm/Target/TargetMachine.h"
-#include "xla/backends/cpu/codegen/target_machine_features.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-namespace {
-
-double Relu(double x) { return std::max(0.0, x); }
-
-template <size_t Size>
-std::array<double, Size> Relu(const std::array<double, Size>& input) {
-  std::array<double, Size> output{};
-  for (size_t i = 0; i < Size; ++i) {
-    output[i] = Relu(input[i]);
-  }
-  return output;
-}
-
-double Sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }
-
-double Sigmoid(std::array<double, 1> input) { return Sigmoid(input[0]); }
-
-template <size_t InSize, size_t OutSize>
-struct Layer {
-  std::array<std::array<double, InSize>, OutSize> weights;
-  std::array<double, OutSize> biases;
-
-  std::array<double, OutSize> operator()(
-      const std::array<double, InSize>& input) const {
-    std::array<double, OutSize> output{};
-    for (size_t i = 0; i < OutSize; ++i) {
-      output[i] = std::inner_product(input.begin(), input.end(),
-                                     weights[i].begin(), 0.0);
-      output[i] += biases[i];
-    }
-    return output;
-  }
-};
-
-template <size_t InSize>
-struct Scaler {
-  std::array<double, InSize> mean;
-  std::array<double, InSize> scale;
-
-  std::array<double, InSize> operator()(
-      const std::array<double, InSize>& features) const {
-    std::array<double, InSize> out;
-    for (size_t i = 0; i < features.size(); ++i) {
-      out[i] = (features[i] - mean[i]) / scale[i];
-    }
-    return out;
-  }
-};
-
-std::array<double, 6> ExtractFeatures(int m, int k, int n) {
-  std::array<double, 6> features = {static_cast<double>(m),
-                                    static_cast<double>(k),
-                                    static_cast<double>(n),
-                                    std::log(m),
-                                    std::log(k),
-                                    std::log(n)};
-  return features;
-}
-
-struct Net {
-  static constexpr size_t kNumFeatures = 6;
-  static constexpr size_t kHiddenLayer1Size = 8;
-  static constexpr size_t kHiddenLayer2Size = 8;
-
-  Scaler<kNumFeatures> scaler;
-  Layer<kNumFeatures, kHiddenLayer1Size> hidden_layer_1;
-  Layer<kHiddenLayer1Size, kHiddenLayer2Size> hidden_layer_2;
-  Layer<kHiddenLayer2Size, 1> output_layer;
-  double threshold;
-
-  int operator()(double m, double k, double n) const {
-    std::array<double, kNumFeatures> features = ExtractFeatures(m, k, n);
-    double probability = Sigmoid(output_layer(
-        Relu(hidden_layer_2(Relu(hidden_layer_1(scaler(features)))))));
-    return probability < threshold ? 1 : 0;
-  }
-};
-
-struct Range {
-  int min;
-  int max;
-
-  template <class... Args>
-  bool Contains(Args... args) const {
-    auto check = [this](int x) -> bool { return min <= x && x <= max; };
-    return (check(args) && ...);
-  }
-};
-
-struct GemmFilter {
-  Range input_range;
-  PrimitiveType lhs_dtype;
-  PrimitiveType rhs_dtype;
-  PrimitiveType out_dtype;
-
-  bool operator()(const XnnGemm& gemm) const {
-    return input_range.Contains(gemm.dot_canonical_dims.m,
-                                gemm.dot_canonical_dims.k,
-                                gemm.dot_canonical_dims.n) &&
-           gemm.lhs_dtype == lhs_dtype && gemm.rhs_dtype == rhs_dtype &&
-           gemm.out_dtype == out_dtype &&
-           gemm.dot_canonical_dims.lhs_canonical &&
-           !gemm.dot_canonical_dims.rhs_column_major &&
-           gemm.dot_canonical_dims.rhs_canonical &&
-           !gemm.dot_canonical_dims.output_column_major;
-  }
-};
-
-// NOLINTBEGIN
-// clang-format off
-
-static constexpr GemmFilter BF16BF16F32GemmFilter{
-  /*input_range=*/{0, std::numeric_limits<int>::max()},
-  /*lhs_dtype=*/PrimitiveType::BF16,
-  /*rhs_dtype=*/PrimitiveType::BF16,
-  /*out_dtype=*/PrimitiveType::F32,
-};
-
-static constexpr GemmFilter AMDRomeGemmFilter{
-  /*input_range=*/{16, 4096},
-  /*lhs_dtype=*/PrimitiveType::F32,
-  /*rhs_dtype=*/PrimitiveType::F32,
-  /*out_dtype=*/PrimitiveType::F32,
-};
-
-static constexpr Net AMDRomeNet{
-  /*scaler=*/{
-    /*mean=*/
-    {{ 2031.4479060265578, 2036.3171603677222, 2062.2170582226763, 7.29227087924762, 7.308301476602625, 7.331674465299577 }},
-    /*scale=*/
-    {{ 1188.2177375470617, 1178.7350461452038, 1179.7790996965598, 1.0416890873676914, 1.0053399234375506, 0.9757991392501179 }},
-  },
-  /*hidden_layer_1=*/{
-    /*weights=*/{{
-      {{ 0.5255922128957278, -0.8065013670906714, -0.5264014380189966, -1.2772498330118651, 1.3840216299823802, 0.7322759674330881 }},
-      {{ -0.7597171548555842, -1.2571169773685882, -0.32518437620636936, 1.0212806356673838, 0.9165371224616725, -0.19250317971610814 }},
-      {{ -2.3497882574965994, 0.23878289300722322, -2.5867259166595944, 0.8432052252434499, -0.7374592701571068, 0.6061228206232958 }},
-      {{ 0.3412638507438349, 0.009127030753615727, -0.43271581733053577, 0.3058216852138156, 0.4132978840654225, 0.08892908864656021 }},
-      {{ -0.3843556431761765, -0.5398088470059381, -2.0478454682095735, -1.9041927205327738, -1.0368295384919808, -0.1653666006655781 }},
-      {{ 0.9415170642828504, -0.4671602009419241, -2.594401365132767, 0.5011818371933664, 2.6743454901058725, 1.090931094328555 }},
-      {{ -2.030867525769208, 0.9360281369657524, -2.179490537456837, 0.6315631977398317, -0.2797813498393135, 1.1780045163240112 }},
-      {{ 2.026780502536945, 1.1382782700184098, 0.7076892737809293, -0.5003242829913847, 1.7337823655903326, 0.676979521067241 }},
-    }},
-    /*biases=*/{
-      { 2.827760670625431, -0.9347274494671962, 1.7748650815163647, -0.5102747570142624, 1.1443725632238269, 2.0573020231014616, 0.33721201132380757, 2.7437956980307643 },
-    }
-  },
-  /*hidden_layer_2=*/{
-    /*weights=*/{{
-      {{ 2.571821311709108, 0.16869445337763503, 0.3541411973512104, 0.31040383433531593, -1.9138308971941267, 1.577267326066108, 1.0358680188904088, -0.48597239908310547 }},
-      {{ -0.3168524372865204, -0.8109707535168992, -0.6883758912881943, 0.20041683878416458, 0.29562419861502953, 2.9699371941875183, -0.06378706528945598, -1.2627270412739198 }},
-      {{ 1.2121865841893051, 0.4324679330555888, 0.5756742637802713, -0.3965637421226802, -0.8316876650525071, 1.4267737797853521, 0.6590628275882154, 1.0969896994507335 }},
-      {{ 0.08152092107879703, 0.987281670566132, 2.711801967605775, 0.03262333498333622, -0.24851434369301018, 0.5857580261361529, -0.14172228489696118, 1.0096244465236095 }},
-      {{ -1.099617291565094, -0.96182176932886, 1.1198642662894356, 0.09569259551658717, 0.9865508260397995, -1.7073686127591108, 0.8545686868857858, 1.276785903326864 }},
-      {{ 0.6284115174399925, -0.5692706408214737, -0.3776497427936689, 0.2850473804130665, 0.5611912673866001, 0.7074167980672433, 1.3602397130866593, -2.4641849404042104 }},
-      {{ -0.2235255127724266, -0.6066818030776572, 2.098453748102861, -0.551860833640914, -0.6607678541967575, -1.0968858307838945, -3.097129404864497, 1.22936241411423 }},
-      {{ -0.35359032516179434, 0.16659401401800453, 0.7409562527506246, 0.12880569714035928, 1.6235584538175323, 0.35055754805485, -0.5085408039033421, 0.03832167245213557 }},
-    }},
-    /*biases=*/{
-      { -0.9650088973529635, 0.18404512445819377, -1.1301082618712814, -0.4114680200097482, -2.16829227705252, -0.792693003568079, 2.0186809343196432, 0.6651750830570318 },
-    }
-  },
-  /*output_layer=*/{
-    /*weights=*/{{
-      {{ -3.4950798141841886, 3.052869401349734, -1.9332425183341917, -2.4468455334890375, 3.1182134156177734, 2.662143418701658, 3.609609051057281, -1.6114776062537006 }},
-    }},
-    /*biases=*/{
-      { -0.8627209596023582 },
-    }
-  },
-  /*threshold=*/0.03,
-};
-
-static constexpr GemmFilter AMDGenoaGemmFilter{
-  /*input_range=*/{16, 4096},
-  /*lhs_dtype=*/PrimitiveType::F32,
-  /*rhs_dtype=*/PrimitiveType::F32,
-  /*out_dtype=*/PrimitiveType::F32,
-};
-
-static constexpr Net AMDGenoaNet {
-  /*scaler=*/{
-    /*mean=*/
-    {{ 2048.487742594484, 2032.4805924412667, 2042.0275791624106, 7.311636506981553, 7.331182177414692, 7.324348610024091 }},
-    /*scale=*/
-    {{ 1191.317145630777, 1166.4230415375375, 1162.7572402044934, 1.0130577584567735, 0.9372130582909888, 0.9819331632142719 }},
-  },
-  /*hidden_layer_1=*/{
-    /*weights=*/{{
-      {{ -0.3975566315544443, 0.5914998393825349, 0.6099048505253704, -2.2657754130482575, 0.36614796953745665, -0.9019941522654611 }},
-      {{ -1.634528631004246, -1.0247790097319367, 0.7441596497436759, 1.1627072134985457, 0.05409335988074912, -0.12091065051829138 }},
-      {{ 0.38395072299848293, 0.6541884828037803, 0.417837898603066, -0.9405446354332785, 2.184810649384631, -0.36876630139170674 }},
-      {{ 1.4311717327837925, 0.9019482519954495, 0.010222966815173684, 0.3734603575926762, -0.48722286699557477, 0.6097423536728197 }},
-      {{ -0.7136793187709407, -1.9428210404652928, 0.4274609198312262, 0.7241649472475438, 0.7127139917668667, -0.17169269406677637 }},
-      {{ 0.7274093691413374, 1.5619764328746881, 0.3132760663502329, 0.1150444561729908, 0.2015964262316955, -1.6488397218364703 }},
-      {{ -0.2753144111803734, 0.851664634951511, -0.7668837132534746, 0.8536953128922471, 0.5346385907475031, -0.3903852123459044 }},
-      {{ -0.33049518181245935, -0.1445885038395346, 0.33671360297244707, 0.19923558301288513, 0.47714692266995923, 2.673625950077934 }},
-    }},
-    /*biases=*/{
-      { 1.8781920773242509, 0.6510580145727756, 1.3641835181490685, -1.237083419397511, 0.09563962519162661, 1.0633713668067988, -0.2750294272946441, 0.4082406241441991 },
-    }
-  },
-  /*hidden_layer_2=*/{
-    /*weights=*/{{
-      {{ 1.482788775138106, -0.5911919348052194, -0.35265948412831416, 0.5693173975201452, 0.08299331485534553, -1.0926309595949408, 0.334160671733911, -0.8259113265483281 }},
-      {{ -0.7244072332431708, 1.7167578358580047, -0.4425799291591407, 0.38193961610444616, -0.3131049026459214, 0.7057668457879581, -0.8977670579096759, -1.1564071580034785 }},
-      {{ 0.2358887563481682, 0.845047198622242, 0.3965633248481624, -0.9292260319808021, 0.38780851270938177, 0.9073719197977955, 0.8942857890487362, 2.2078844573893486 }},
-      {{ 0.7588397006376895, 0.39649528525833017, 1.1922103753418032, -0.2623025347145879, -1.8688404509544276, 0.23950836230216038, 0.15018196046213705, 1.1091046070474726 }},
-      {{ -0.06639877236719088, 0.09408482409872725, 0.08853697547037886, -0.027191640785169502, -0.025050403848262424, -0.14821218627938373, -0.05119778874800481, -0.003846457076482196 }},
-      {{ -1.3626737341753659, -0.509211567650967, -1.3709529389911908, 0.8181695565961004, -0.9154056938786789, 1.6786394527771, -0.38910973671573107, 0.6109302318778375 }},
-      {{ -0.9490250745418807, -0.22890259271729135, -0.7669763564967859, -1.2378100390537607, 0.9325554827865082, -0.7707072257516585, -0.6101643395959798, 0.6438447441624673 }},
-      {{ 1.1581876959277013, 1.4439015663052703, -1.4659507082977212, 1.0425420146162472, -0.20891484120663645, 0.3292514803046433, 0.38947771607697135, 0.06588859566944062 }},
-    }},
-    /*biases=*/{
-      { 2.0991435035679293, 0.9220598032166089, 0.001237522670163396, -0.2035381110666839, -0.7214610628375114, -2.275782698263265, 3.2572710355363337, -1.309956720253099 },
-    }
-  },
-  /*output_layer=*/{
-    /*weights=*/{{
-      {{ -2.214950317234679, 2.3173207097966624, -2.4148863077632057, 2.440952250974181, 0.016504153668811035, 3.00219780922754, 2.454200734592688, 2.444832006369846 }},
-    }},
-    /*biases=*/{
-      { -0.2538826384470055 },
-    }
-  },
-  /*threshold=*/0.05,
-};
-
-// clang-format on
-// NOLINTEND
-
-bool IsAMDRome(const llvm::TargetMachine* target_machine) {
-  CHECK(target_machine);
-  return target_machine->getTargetCPU() == "znver2";
-}
-
-bool IsAMDMilan(const llvm::TargetMachine* target_machine) {
-  CHECK(target_machine);
-  return target_machine->getTargetCPU() == "znver3";
-}
-
-bool IsAMDGenoa(const llvm::TargetMachine* target_machine) {
-  CHECK(target_machine);
-  return target_machine->getTargetCPU() == "znver4";
-}
-
-}  // namespace
-
-XnnGemmConfig::Opinion XnnGemmConfig::Evaluate(
-    const XnnGemm& gemm, const TargetMachineFeatures* cpu_features) const {
-  if (test_filter_) {
-    return test_filter_(gemm) ? XnnGemmConfig::Opinion::kAccept
-                              : XnnGemmConfig::Opinion::kReject;
-  }
-
-  if (!cpu_features || !cpu_features->target_machine()) {
-    return XnnGemmConfig::Opinion::kNoIdea;
-  }
-
-  CHECK(cpu_features);
-  CHECK(cpu_features->target_machine());
-
-  if (BF16BF16F32GemmFilter(gemm)) {
-    return XnnGemmConfig::Opinion::kAccept;
-  }
-
-  if ((IsAMDRome(cpu_features->target_machine()) ||
-       IsAMDMilan(cpu_features->target_machine())) &&
-      AMDRomeGemmFilter(gemm)) {
-    int out = AMDRomeNet(gemm.dot_canonical_dims.m, gemm.dot_canonical_dims.k,
-                         gemm.dot_canonical_dims.n);
-    return out == 1 ? XnnGemmConfig::Opinion::kAccept
-                    : XnnGemmConfig::Opinion::kReject;
-  }
-
-  if (IsAMDGenoa(cpu_features->target_machine()) && AMDGenoaGemmFilter(gemm)) {
-    int out = AMDGenoaNet(gemm.dot_canonical_dims.m, gemm.dot_canonical_dims.k,
-                          gemm.dot_canonical_dims.n);
-    return out == 1 ? XnnGemmConfig::Opinion::kAccept
-                    : XnnGemmConfig::Opinion::kReject;
-  }
-
-  return XnnGemmConfig::Opinion::kNoIdea;
-}
-
-const XnnGemmConfig& GetXnnGemmConfig() {
-  static const absl::NoDestructor<XnnGemmConfig> gemm_config;
-  return *gemm_config;
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/xnn_gemm_config.h b/third_party/xla/xla/backends/cpu/xnn_gemm_config.h
deleted file mode 100644
index 83bac68b0c2ce8..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_gemm_config.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_XNN_GEMM_CONFIG_H_
-#define XLA_BACKENDS_CPU_XNN_GEMM_CONFIG_H_
-
-#include <functional>
-
-#include "xla/backends/cpu/codegen/target_machine_features.h"
-#include "xla/backends/cpu/runtime/dot_dims.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-struct XnnGemm {
-  DotCanonicalDims dot_canonical_dims;
-  PrimitiveType lhs_dtype;
-  PrimitiveType rhs_dtype;
-  PrimitiveType out_dtype;
-};
-
-// XnnGemmConfig is a static lightweight  mechanism for determining if a given
-// gemm should be offloaded to XNNPACK vs handled by OneDNN/Eigen.
-// Currently it uses a classifier - neural network with: 6 input features
-// m, k, n, log(m), log(k), log(n), two hidden layers of size 8 and a cut-off
-// threshold for the predicted probability tuned to keep the false positive rate
-// below 1%. The classifier was trained on synthetic data (20K random gemms).
-// TODO(ashaposhnikov): add a reference to documentation / collab.
-class XnnGemmConfig {
-  mutable std::function<bool(const XnnGemm&)> test_filter_ = nullptr;
-
- public:
-  XnnGemmConfig() = default;
-
-  enum class Opinion { kAccept, kReject, kNoIdea };
-
-  Opinion Evaluate(const XnnGemm& xnn_gemm,
-                   const TargetMachineFeatures* cpu_features) const;
-
-  template <typename Filter>
-  void SetTestFilter(Filter&& test_filter) const {
-    test_filter_ = std::forward<Filter>(test_filter);
-  }
-};
-
-const XnnGemmConfig& GetXnnGemmConfig();
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_XNN_GEMM_CONFIG_H_
diff --git a/third_party/xla/xla/backends/cpu/xnn_support.cc b/third_party/xla/xla/backends/cpu/xnn_support.cc
deleted file mode 100644
index 307d7adb859472..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_support.cc
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/xnn_support.h"
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-#include <utility>
-
-#include "xnnpack.h"
-#include "absl/base/no_destructor.h"
-#include "absl/container/flat_hash_map.h"
-#include "absl/container/flat_hash_set.h"
-#include "absl/log/check.h"
-#include "absl/log/log.h"
-#include "absl/status/statusor.h"
-#include "absl/types/span.h"
-#include "xla/backends/cpu/codegen/target_machine_features.h"
-#include "xla/backends/cpu/runtime/dot_dims.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_interop.h"
-#include "xla/backends/cpu/xnn_gemm_config.h"
-#include "xla/hlo/ir/hlo_casting_utils.h"
-#include "xla/hlo/ir/hlo_computation.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/layout_util.h"
-#include "xla/primitive_util.h"
-#include "xla/service/pattern_matcher.h"
-#include "xla/shape.h"
-#include "xla/shape_util.h"
-#include "xla/tsl/platform/statusor.h"
-#include "xla/util.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-bool AreDtypesSupported(const Shape& lhs_shape, const Shape& rhs_shape,
-                        const Shape& out_shape,
-                        const TargetMachineFeatures* cpu_features) {
-  // Stores tuple of allowed (input, output) dtypes.
-  static const auto* kAllowedTypes =
-      new absl::flat_hash_set<std::pair<PrimitiveType, PrimitiveType>>(
-          {{F32, F32}, {BF16, F32}, {BF16, BF16}});
-
-  // Types must be in the allowed set.
-  PrimitiveType lhs_dtype = lhs_shape.element_type();
-  PrimitiveType rhs_dtype = rhs_shape.element_type();
-  PrimitiveType out_dtype = out_shape.element_type();
-  if (lhs_dtype != rhs_dtype ||
-      !kAllowedTypes->contains({lhs_dtype, out_dtype})) {
-    return false;
-  }
-
-  // BF16 matmuls can only run when CPU has AVX512_BF16.
-  if (lhs_dtype == BF16) {
-    return cpu_features == nullptr || cpu_features->has_avx512bf16();
-  }
-  return true;
-}
-
-absl::StatusOr<bool> IsDotSupportedByXnn(
-    const DotDimensionNumbers& dot_dimensions, const Shape& lhs_shape,
-    const Shape& rhs_shape, const Shape& out_shape,
-    const TargetMachineFeatures* cpu_features, bool use_cost_model) {
-  // Check data types.
-  if (!AreDtypesSupported(lhs_shape, rhs_shape, out_shape, cpu_features)) {
-    return false;
-  }
-  if (!IsLayoutSupportedByXnn(lhs_shape) ||
-      !IsLayoutSupportedByXnn(rhs_shape) ||
-      !IsLayoutSupportedByXnn(out_shape)) {
-    return false;
-  }
-
-  // Check shapes.
-  TF_ASSIGN_OR_RETURN(DotShape dot_shape, GetDotShape(dot_dimensions, lhs_shape,
-                                                      rhs_shape, out_shape));
-
-  TF_ASSIGN_OR_RETURN(DotCanonicalDims dot_canonical_dims,
-                      GetDotCanonicalDims(dot_dimensions, dot_shape));
-
-  if (dot_canonical_dims.m == 1 && dot_canonical_dims.n == 1 &&
-      dot_shape.batch_size > 1) {
-    // TODO(b/430079105): XNNPACK does not handle batch dimensions that are not
-    // matrix dimensions. We could handle this case by fully implementing dot
-    // (b/430079105), but we also could just insert dummy dimensions of size 1
-    // for the matrix dimensions, so the batch dimensions get handled correctly.
-    return false;
-  }
-
-  // XNNPACK does not support transposing LHS or col-major layouts.
-  if (!dot_canonical_dims.lhs_canonical ||
-      dot_canonical_dims.lhs_column_major ||
-      dot_canonical_dims.rhs_column_major) {
-    return false;
-  }
-
-  if (!use_cost_model) {
-    return true;
-  }
-
-  const XnnGemm gemm{/*dot_canonical_dims=*/dot_canonical_dims,
-                     /*lhs_dtype=*/lhs_shape.element_type(),
-                     /*rhs_dtype=*/rhs_shape.element_type(),
-                     /*out_dtype=*/out_shape.element_type()};
-  switch (GetXnnGemmConfig().Evaluate(gemm, cpu_features)) {
-    case XnnGemmConfig::Opinion::kAccept:
-      return true;
-    default:
-      return false;
-  }
-}
-
-const absl::flat_hash_map<HloOpcode, xnn_unary_operator>& GetXnnUnaryOpMap() {
-  // TODO(ashaposhnikov): Investigate adding support for kErf, kExpm1, kLog1p,
-  // kNot, kRoundNearestAfz, kTan.
-  static absl::NoDestructor<absl::flat_hash_map<HloOpcode, xnn_unary_operator>>
-      unary_op_map({
-          {HloOpcode::kAbs, xnn_unary_abs},
-          {HloOpcode::kCeil, xnn_unary_ceiling},
-          {HloOpcode::kClz, xnn_unary_count_leading_zeros},
-          {HloOpcode::kConvert, xnn_unary_convert},
-          {HloOpcode::kCos, xnn_unary_cosine},
-          {HloOpcode::kExp, xnn_unary_exp},
-          {HloOpcode::kCbrt, xnn_unary_cube_root},
-          {HloOpcode::kFloor, xnn_unary_floor},
-          {HloOpcode::kLog, xnn_unary_log},
-          {HloOpcode::kLogistic, xnn_unary_sigmoid},
-          {HloOpcode::kNegate, xnn_unary_negate},
-          {HloOpcode::kRoundNearestEven, xnn_unary_bankers_rounding},
-          {HloOpcode::kRsqrt, xnn_unary_reciprocal_square_root},
-          {HloOpcode::kSign, xnn_unary_sign},
-          {HloOpcode::kSin, xnn_unary_sine},
-          {HloOpcode::kSqrt, xnn_unary_square_root},
-          {HloOpcode::kTanh, xnn_unary_tanh},
-      });
-  return *unary_op_map;
-}
-
-absl::StatusOr<xnn_unary_operator> XnnUnaryOperator(const HloOpcode& opcode) {
-  const auto& unary_op_map = GetXnnUnaryOpMap();
-  auto result = unary_op_map.find(opcode);
-  if (result == unary_op_map.end()) {
-    return InvalidArgument("Unsupported XNNPACK unary operator: %s",
-                           HloOpcodeString(opcode));
-  }
-  return result->second;
-}
-
-const absl::flat_hash_map<HloOpcode, xnn_binary_operator>& GetXnnBinaryOpMap() {
-  static absl::NoDestructor<absl::flat_hash_map<HloOpcode, xnn_binary_operator>>
-      binary_op_map({
-          {HloOpcode::kAdd, xnn_binary_add},
-          {HloOpcode::kAnd, xnn_binary_bitwise_and},
-          {HloOpcode::kDivide, xnn_binary_divide},
-          {HloOpcode::kMaximum, xnn_binary_maximum},
-          {HloOpcode::kMinimum, xnn_binary_minimum},
-          {HloOpcode::kMultiply, xnn_binary_multiply},
-          {HloOpcode::kOr, xnn_binary_bitwise_or},
-          {HloOpcode::kPower, xnn_binary_pow},
-          {HloOpcode::kRemainder, xnn_binary_modulus},
-          {HloOpcode::kShiftLeft, xnn_binary_shift_left},
-          {HloOpcode::kShiftRightArithmetic, xnn_binary_shift_right_arithmetic},
-          {HloOpcode::kShiftRightLogical, xnn_binary_shift_right_logical},
-          {HloOpcode::kSubtract, xnn_binary_subtract},
-          {HloOpcode::kXor, xnn_binary_bitwise_xor},
-      });
-  return *binary_op_map;
-}
-
-absl::StatusOr<xnn_binary_operator> XnnBinaryOperator(const HloOpcode& opcode) {
-  const auto& binary_op_map = GetXnnBinaryOpMap();
-  auto result = binary_op_map.find(opcode);
-  if (result == binary_op_map.end()) {
-    return InvalidArgument("Unsupported XNNPACK binary operator: %s",
-                           HloOpcodeString(opcode));
-  }
-  return result->second;
-}
-
-bool IsLayoutSupportedByXnn(const Shape& shape) {
-  return !shape.has_layout() || LayoutUtil::HasDescendingLayout(shape.layout());
-}
-
-bool IsConstantSupportedByXnn(const HloInstruction* hlo) {
-  CHECK(hlo->IsConstant());
-
-  if (!XnnDatatype(hlo->shape().element_type()).ok()) {
-    return false;
-  }
-
-  return hlo->shape().IsArray();
-}
-
-bool IsElementwiseOpSupportedByXnn(const HloInstruction* hlo) {
-  CHECK(hlo->IsElementwise());
-  // In XLA IsElementwise is true for constants.
-  CHECK(!hlo->IsConstant());
-
-  if (!XnnDatatype(hlo->shape().element_type()).ok()) {
-    return false;
-  }
-
-  if (!std::all_of(hlo->operands().begin(), hlo->operands().end(),
-                   [](const HloInstruction* op) {
-                     return XnnDatatype(op->shape().element_type()).ok();
-                   })) {
-    return false;
-  }
-
-  switch (hlo->operand_count()) {
-    case 1:
-      return XnnUnaryOperator(hlo->opcode()).ok();
-    case 2:
-      return XnnBinaryOperator(hlo->opcode()).ok();
-    default:
-      return false;
-  }
-}
-
-bool IsBitcastOpSupportedByXnn(const HloInstruction* hlo) {
-  CHECK_EQ(hlo->opcode(), HloOpcode::kBitcast);
-  if (!XnnDatatype(hlo->shape().element_type()).ok()) {
-    return false;
-  }
-  const HloInstruction* input = hlo->operand(0);
-  return hlo->shape().element_type() == input->shape().element_type();
-}
-
-bool IsBroadcastOpSupportedByXnn(const HloInstruction* hlo) {
-  CHECK_EQ(hlo->opcode(), HloOpcode::kBroadcast);
-  if (!XnnDatatype(hlo->shape().element_type()).ok()) {
-    return false;
-  }
-  const absl::Span<const int64_t> dims =
-      Cast<HloBroadcastInstruction>(hlo)->dimensions();
-  if (dims.empty()) {
-    return true;
-  }
-  if (!std::is_sorted(dims.begin(), dims.end())) {
-    return false;
-  }
-  // TODO(ashaposhnikov): this case works well, but we should investigate the
-  // performance regressions that occur if this condition is removed.
-  return dims.back() + 1 == dims.size();
-}
-
-template <class T>
-static T InvariantValueFor(HloOpcode opcode) {
-  switch (opcode) {
-    case HloOpcode::kAdd:
-      return T{0};
-    case HloOpcode::kMinimum:
-      return std::numeric_limits<T>::infinity();
-    case HloOpcode::kMaximum:
-      return -std::numeric_limits<T>::infinity();
-    default:
-      LOG(FATAL) << "Unexpected opcode " << opcode;
-  }
-}
-
-bool IsReduceOpSupportedByXnn(const HloInstruction* hlo) {
-  CHECK_EQ(hlo->opcode(), HloOpcode::kReduce);
-  if (!XnnDatatype(hlo->shape().element_type()).ok()) {
-    return false;
-  }
-  const HloReduceInstruction* reduce = Cast<HloReduceInstruction>(hlo);
-  CHECK_NE(reduce, nullptr);
-  // TODO(ashaposhnikov): we can support this edge case,
-  // planning to come back to this later.
-  if (reduce->dimensions().empty()) {
-    return false;
-  }
-  const HloComputation* to_apply = reduce->to_apply();
-  CHECK_NE(to_apply, nullptr);
-  if (!Match(to_apply->root_instruction(),
-             match::AnyOf<HloInstruction>(match::Add(), match::Maximum(),
-                                          match::Minimum())
-                 .WithBinaryOperandsAnyOrder(match::Parameter(0),
-                                             match::Parameter(1)))) {
-    return false;
-  }
-  if (reduce->init_values().size() != 1) {
-    return false;
-  }
-  HloInstruction* init = reduce->init_values().front();
-  CHECK_EQ(init->shape().element_type(), hlo->shape().element_type());
-  const HloOpcode opcode = to_apply->root_instruction()->opcode();
-  const PrimitiveType ty = init->shape().element_type();
-  return primitive_util::FloatingPointTypeSwitch(
-      [&](auto primitive_type) {
-        return Match(
-            init,
-            match::ConstantScalar(
-                InvariantValueFor<primitive_util::NativeTypeOf<primitive_type>>(
-                    opcode)));
-      },
-      ty);
-}
-
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/backends/cpu/xnn_support.h b/third_party/xla/xla/backends/cpu/xnn_support.h
deleted file mode 100644
index 2e39e9430e3eb0..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_support.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_BACKENDS_CPU_XNN_SUPPORT_H_
-#define XLA_BACKENDS_CPU_XNN_SUPPORT_H_
-
-#include "xnnpack.h"
-#include "absl/container/flat_hash_map.h"
-#include "absl/status/statusor.h"
-#include "absl/strings/string_view.h"
-#include "xla/backends/cpu/codegen/target_machine_features.h"
-#include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/shape.h"
-#include "xla/xla_data.pb.h"
-
-namespace xla::cpu {
-
-inline constexpr absl::string_view kXnnFusionKind = "__xnn_fusion";
-
-// Returns true if the dot operation is supported by XNNPACK. Returns an error
-// if the dot operation shape is invalid.
-absl::StatusOr<bool> IsDotSupportedByXnn(
-    const DotDimensionNumbers& dot_dimensions, const Shape& lhs_shape,
-    const Shape& rhs_shape, const Shape& out_shape,
-    const TargetMachineFeatures* cpu_features = nullptr,
-    bool use_cost_model = true);
-
-// Returns the mappings from HLO opcodes to XNNPACK unary operators.
-const absl::flat_hash_map<HloOpcode, xnn_unary_operator>& GetXnnUnaryOpMap();
-
-// Returns the XNNPACK unary operator corresponding to the given HLO opcode.
-// Returns `InvalidArgument` if the opcode is not supported.
-absl::StatusOr<xnn_unary_operator> XnnUnaryOperator(const HloOpcode& opcode);
-
-// Returns the mappings from HLO opcodes to XNNPACK binary operators.
-const absl::flat_hash_map<HloOpcode, xnn_binary_operator>& GetXnnBinaryOpMap();
-
-// Returns the XNNPACK binary operator corresponding to the given HLO opcode.
-// Returns `InvalidArgument` if the opcode is not supported.
-absl::StatusOr<xnn_binary_operator> XnnBinaryOperator(const HloOpcode& opcode);
-
-// Returns true if the shape either doesn't have a layout or the layout is
-// descending. Shapes without layout are accepted to make HLO tests less
-// verbose.
-bool IsLayoutSupportedByXnn(const Shape& shape);
-
-// Returns true if the constant is supported by XNNPACK.
-bool IsConstantSupportedByXnn(const HloInstruction* hlo);
-
-// Returns true if the nonconstant elementwise op is supported by XNNPACK.
-bool IsElementwiseOpSupportedByXnn(const HloInstruction* hlo);
-
-// Returns true if the bitcast op is supported by XNNPACK.
-bool IsBitcastOpSupportedByXnn(const HloInstruction* hlo);
-
-// Returns true if the broadcast op is supported by XNNPACK.
-bool IsBroadcastOpSupportedByXnn(const HloInstruction* hlo);
-
-// Returns true if the reduce op is supported by XNNPACK.
-bool IsReduceOpSupportedByXnn(const HloInstruction* hlo);
-
-}  // namespace xla::cpu
-
-#endif  // XLA_BACKENDS_CPU_XNN_SUPPORT_H_
diff --git a/third_party/xla/xla/backends/cpu/xnn_support_test.cc b/third_party/xla/xla/backends/cpu/xnn_support_test.cc
deleted file mode 100644
index a69b3287f54d8a..00000000000000
--- a/third_party/xla/xla/backends/cpu/xnn_support_test.cc
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/backends/cpu/xnn_support.h"
-
-#include <gtest/gtest.h>
-#include "xnnpack.h"
-#include "absl/container/flat_hash_map.h"
-#include "xla/hlo/ir/hlo_opcode.h"
-
-namespace xla::cpu {
-namespace {
-
-class XnnSupportTest : public ::testing::Test {};
-
-TEST_F(XnnSupportTest, UnaryEltwiseOpMap) {
-  const auto& unary_map = GetXnnUnaryOpMap();
-
-  auto check = [&](const HloOpcode opcode, const xnn_unary_operator expected) {
-    auto result = unary_map.find(opcode);
-    EXPECT_NE(result, unary_map.end());
-    EXPECT_EQ(result->second, expected);
-  };
-
-  // Supported unary ops.
-  check(HloOpcode::kAbs, xnn_unary_abs);
-  check(HloOpcode::kExp, xnn_unary_exp);
-  check(HloOpcode::kFloor, xnn_unary_floor);
-  check(HloOpcode::kSqrt, xnn_unary_square_root);
-
-  // Unsupported unary ops.
-  EXPECT_EQ(unary_map.find(HloOpcode::kErf), unary_map.end());
-  EXPECT_EQ(unary_map.find(HloOpcode::kSort), unary_map.end());
-}
-
-TEST_F(XnnSupportTest, BinaryEltwiseOpMap) {
-  const auto& binary_map = GetXnnBinaryOpMap();
-
-  auto check = [&](const HloOpcode opcode, const xnn_binary_operator expected) {
-    auto result = binary_map.find(opcode);
-    EXPECT_NE(result, binary_map.end());
-    EXPECT_EQ(result->second, expected);
-  };
-
-  // Supported unary ops.
-  check(HloOpcode::kAdd, xnn_binary_add);
-  check(HloOpcode::kMultiply, xnn_binary_multiply);
-  check(HloOpcode::kSubtract, xnn_binary_subtract);
-  check(HloOpcode::kDivide, xnn_binary_divide);
-
-  // Unsupported unary ops.
-  EXPECT_EQ(binary_map.find(HloOpcode::kAtan2), binary_map.end());
-  EXPECT_EQ(binary_map.find(HloOpcode::kComplex), binary_map.end());
-}
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 8a5e603efd34e6..06f551239b6163 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -1647,17 +1647,13 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
           cpu::Thunk::CustomCallExecuteParams custom_call_execute_params,
           cpu::Thunk::CustomCallExecuteParams::Create(&run_options));
 
-      std::optional<cpu::Thunk::XnnParams> xnn_params;
-      if (cpu_executable->has_xnn_fusions()) {
-        TF_ASSIGN_OR_RETURN(xnn_params,
-                            cpu::Thunk::XnnParams::Create(&run_options));
-      }
-
       std::optional<cpu::Thunk::YnnParams> ynn_params;
+#ifdef XLA_YNNPACK
       if (cpu_executable->has_ynn_fusions()) {
         TF_ASSIGN_OR_RETURN(ynn_params,
                             cpu::Thunk::YnnParams::Create(&run_options));
       }
+#endif  // XLA_YNNPACK
 
       cpu::ThreadPoolTaskRunner task_runner(
           run_options.intra_op_thread_pool()->getPool());
@@ -1670,7 +1666,6 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
           &task_runner,
           &collective_params,
           &custom_call_execute_params,
-          xnn_params ? &*xnn_params : nullptr,
           ynn_params ? &*ynn_params : nullptr,
           run_options.run_id().ToInt(),
           run_options.device_ordinal(),
@@ -1796,17 +1791,13 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
                 custom_call_params =
                     cpu::Thunk::CustomCallExecuteParams::Create(&run_options);
 
-            absl::StatusOr<std::optional<cpu::Thunk::XnnParams>> xnn_params(
-                std::nullopt);
-            if (cpu_executable->has_xnn_fusions()) {
-              xnn_params = cpu::Thunk::XnnParams::Create(&run_options);
-            }
-
             absl::StatusOr<std::optional<cpu::Thunk::YnnParams>> ynn_params(
                 std::nullopt);
+#ifdef XLA_YNNPACK
             if (cpu_executable->has_ynn_fusions()) {
               ynn_params = cpu::Thunk::YnnParams::Create(&run_options);
             }
+#endif  // XLA_YNNPACK
 
             cpu::ThreadPoolTaskRunner task_runner(
                 run_options.intra_op_thread_pool()->getPool());
@@ -1820,7 +1811,6 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
                   &task_runner,
                   &*collective_params,
                   &*custom_call_params,
-                  *xnn_params ? &**xnn_params : nullptr,
                   *ynn_params ? &**ynn_params : nullptr,
                   run_options.run_id().ToInt(),
                   run_options.device_ordinal(),
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index 7154ec3e7ff9c5..3b5fcce8539a67 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -158,7 +158,6 @@ cc_library(
         "//xla/backends/cpu:alignment",
         "//xla/backends/cpu:constant_allocation",
         "//xla/backends/cpu:target_machine_options",
-        "//xla/backends/cpu:xnn_support",
         "//xla/backends/cpu/codegen:builtin_definition_generator",
         "//xla/backends/cpu/codegen:compiled_function_library",
         "//xla/backends/cpu/codegen:cpu_features",
@@ -174,7 +173,6 @@ cc_library(
         "//xla/backends/cpu/runtime:thunk_proto_cc_impl",
         "//xla/backends/cpu/runtime:thunk_proto_serdes",
         "//xla/backends/cpu/transforms:library_rewriter",
-        "//xla/backends/cpu/transforms:xnn_graph_fusion",
         "//xla/backends/cpu/transforms/collectives:all_reduce_combiner",
         "//xla/hlo/analysis:alias_info",
         "//xla/hlo/analysis:hlo_ordering",
@@ -821,8 +819,6 @@ cc_library(
         "//xla/backends/cpu:alignment",
         "//xla/backends/cpu:onednn_emitter",
         "//xla/backends/cpu:onednn_support",
-        "//xla/backends/cpu:xnn_emitter",
-        "//xla/backends/cpu:xnn_support",
         "//xla/backends/cpu/codegen:computation_kernel_emitter",
         "//xla/backends/cpu/codegen:fusion_compiler",
         "//xla/backends/cpu/codegen:fusion_emitter",
@@ -855,8 +851,6 @@ cc_library(
         "//xla/backends/cpu/runtime:topk_thunk",
         "//xla/backends/cpu/runtime:while_thunk",
         "//xla/backends/cpu/runtime/onednn:onednn_fusion_thunk",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_dot_thunk",
-        "//xla/backends/cpu/runtime/xnnpack:xnn_fusion_thunk",
         "//xla/codegen:kernel_definition",
         "//xla/codegen:kernel_spec",
         "//xla/codegen:llvm_kernel_source",
diff --git a/third_party/xla/xla/service/cpu/backend_config.proto b/third_party/xla/xla/service/cpu/backend_config.proto
index 3da0ca6b17a641..1071345acd574a 100644
--- a/third_party/xla/xla/service/cpu/backend_config.proto
+++ b/third_party/xla/xla/service/cpu/backend_config.proto
@@ -20,6 +20,7 @@ message CustomCallBackendConfig {
 message FusionBackendConfig {
   string kind = 1;
   oneof custom_fusion_config_oneof {
+    // TODO: b/467367981, this is deprecated and should be removed.
     XnnFusionOptions xnn_fusion_options = 2;
     YnnFusionOptions ynn_fusion_options = 3;
   }
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 843e96ab9ab445..2edab7cd05064f 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -98,8 +98,6 @@ limitations under the License.
 #include "xla/backends/cpu/target_machine_options.h"
 #include "xla/backends/cpu/transforms/collectives/all_reduce_combiner.h"
 #include "xla/backends/cpu/transforms/library_rewriter.h"
-#include "xla/backends/cpu/transforms/xnn_graph_fusion.h"
-#include "xla/backends/cpu/xnn_support.h"
 #include "xla/hlo/analysis/alias_info.h"
 #include "xla/hlo/analysis/hlo_ordering.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
@@ -492,15 +490,7 @@ std::unique_ptr<HloPassFix<HloPassPipeline>> CreateSimplificationPipeline(
     pipeline->AddPass<GatherSimplifier>();
   }
 
-  if (module->config()
-              .debug_options()
-              .xla_cpu_experimental_xnn_graph_fusion_mode() ==
-          DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED &&
-      !absl::c_contains(module->config()
-                            .debug_options()
-                            .xla_cpu_experimental_xnn_fusion_type(),
-                        DebugOptions::LIBRARY_FUSION_TYPE_REDUCE) &&
-      !absl::c_contains(module->config()
+  if (!absl::c_contains(module->config()
                             .debug_options()
                             .xla_cpu_experimental_ynn_fusion_type(),
                         DebugOptions::LIBRARY_FUSION_TYPE_REDUCE)) {
@@ -544,15 +534,16 @@ std::unique_ptr<HloPassFix<HloPassPipeline>> CreateSimplificationPipeline(
 
 auto LibrarySupportsConvolution(
     HloModule* module, TargetMachineFeatures* target_machine_features) {
+#ifdef XLA_YNNPACK
   const bool ynnpack_convolution_enabled = absl::c_linear_search(
       module->config().debug_options().xla_cpu_experimental_ynn_fusion_type(),
       DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION);
   return [=](const HloInstruction& instr) {
-#ifdef XLA_YNNPACK
     return ynnpack_convolution_enabled && IsConvolutionOpSupportedByYnn(&instr);
-#endif  // XLA_YNNPACK
-    return false;
   };
+#else
+  return [](const HloInstruction&) { return false; };
+#endif  // XLA_YNNPACK
 }
 
 auto LibrarySupportsDot(HloModule* module,
@@ -560,23 +551,11 @@ auto LibrarySupportsDot(HloModule* module,
   // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN
   // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in
   // `XnnFusionThunk`.
-  const bool xnnpack_enabled =
-      module->config().debug_options().xla_cpu_use_xnnpack();
-  const auto xnn_graph_fusion_mode =
-      module->config()
-          .debug_options()
-          .xla_cpu_experimental_xnn_graph_fusion_mode();
-  const bool xnnpack_use_cost_model =
-      xnn_graph_fusion_mode !=
-      DebugOptions::XNN_GRAPH_FUSION_MODE_BYPASS_COST_MODEL;
-  const bool xnnpack_dot_enabled =
-      xnnpack_enabled &&
-      xnn_graph_fusion_mode != DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED;
+#ifdef XLA_YNNPACK
   const bool ynnpack_dot_enabled = absl::c_linear_search(
       module->config().debug_options().xla_cpu_experimental_ynn_fusion_type(),
       DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_DOT);
   return [=](const HloInstruction& instr) {
-#ifdef XLA_YNNPACK
     if (ynnpack_dot_enabled &&
         IsDotSupportedByYnn(instr.dot_dimension_numbers(),
                             instr.operand(0)->shape(),
@@ -584,18 +563,12 @@ auto LibrarySupportsDot(HloModule* module,
             .value_or(false)) {
       return true;
     }
-#endif  // XLA_YNNPACK
 
-    if (xnnpack_dot_enabled &&
-        IsDotSupportedByXnn(instr.dot_dimension_numbers(),
-                            instr.operand(0)->shape(),
-                            instr.operand(1)->shape(), instr.shape(),
-                            target_machine_features, xnnpack_use_cost_model)
-            .value_or(false)) {
-      return true;
-    }
     return false;
   };
+#else
+  return [](const HloInstruction&) { return false; };
+#endif  // XLA_YNNPACK
 }
 
 }  // namespace
@@ -1011,26 +984,18 @@ absl::Status CpuCompiler::RunHloPassesAfterLayoutAssn(
       !debug_options.xla_cpu_experimental_ynn_fusion_type().empty();
   LibraryRewriterOptions options = {
       /*use_onednn=*/debug_options.xla_cpu_use_onednn(),
-      /*use_xnnpack=*/debug_options.xla_cpu_use_xnnpack(),
       /*use_ynnpack=*/use_ynnpack,
       /*onednn_fusion_types=*/
       &debug_options.xla_cpu_experimental_onednn_fusion_type(),
-      /*xnn_fusion_types=*/
-      &debug_options.xla_cpu_experimental_xnn_fusion_type(),
       /*ynn_fusion_types=*/
       &debug_options.xla_cpu_experimental_ynn_fusion_type()};
-  if (options.use_onednn || options.use_xnnpack || options.use_ynnpack) {
+  if (options.use_onednn || options.use_ynnpack) {
     HloPassPipeline lib_pipeline("dot-library-passes");
     lib_pipeline.AddPass<DotDecomposer>();
     lib_pipeline.AddPass<LibraryRewriter>(target_machine_features, options);
     TF_RETURN_IF_ERROR(lib_pipeline.Run(module).status());
   }
 
-  if (debug_options.xla_cpu_experimental_xnn_graph_fusion_mode() !=
-      DebugOptions::XNN_GRAPH_FUSION_MODE_DISABLED) {
-    pipeline.AddPass<XnnGraphFusion>();
-  }
-
   bool use_multi_output_fusion =
       options::UseMultiOutputFusion(module->config());
   pipeline.AddPass<CpuInstructionFusion>(
diff --git a/third_party/xla/xla/service/cpu/cpu_executable.cc b/third_party/xla/xla/service/cpu/cpu_executable.cc
index c0c1e6446220fa..97bcff96d927ec 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.cc
+++ b/third_party/xla/xla/service/cpu/cpu_executable.cc
@@ -103,12 +103,6 @@ absl::StatusOr<std::unique_ptr<CpuExecutable>> CpuExecutable::Create(
       executable->thunks_,
       ThunkExecutor::Create(std::move(thunks), thunk_executor_options));
 
-  // Find if the thunk sequence contains any XNN fusion thunks. If we do have
-  // any, we will prepare the XNNPACK thread pool for them at run time.
-  executable->thunks_->thunk_sequence().ForEach([&](const Thunk& thunk) {
-    executable->has_xnn_fusions_ |= thunk.kind() == Thunk::Kind::kXnnFusion;
-  });
-
   // Find if the thunk sequence contains any YNN fusion thunks. If we do have
   // any, we will prepare the YNNPACK thread pool for them at run time.
   executable->thunks_->thunk_sequence().ForEach([&](const Thunk& thunk) {
@@ -262,17 +256,13 @@ absl::Status CpuExecutable::ExecuteThunks(
   TF_ASSIGN_OR_RETURN(Thunk::CustomCallExecuteParams custom_call_execute_params,
                       Thunk::CustomCallExecuteParams::Create(run_options));
 
-  // Prepare for executing XNNPACK fusions.
-  std::optional<Thunk::XnnParams> xnn_params;
-  if (has_xnn_fusions()) {
-    TF_ASSIGN_OR_RETURN(xnn_params, Thunk::XnnParams::Create(run_options));
-  }
-
   // Prepare for executing YNNPACK fusions.
   std::optional<Thunk::YnnParams> ynn_params;
+#ifdef XLA_YNNPACK
   if (has_ynn_fusions()) {
     TF_ASSIGN_OR_RETURN(ynn_params, Thunk::YnnParams::Create(run_options));
   }
+#endif  // XLA_YNNPACK
 
   // Use the intra-op thread pool to offload thunk executor tasks.
   auto* intra_op_thread_pool = run_options->intra_op_thread_pool();
@@ -287,7 +277,6 @@ absl::Status CpuExecutable::ExecuteThunks(
       &task_runner,
       &collective_execute_params,
       &custom_call_execute_params,
-      xnn_params ? &*xnn_params : nullptr,
       ynn_params ? &*ynn_params : nullptr};
 
   auto executed_event = thunks_->Execute(execute_params);
diff --git a/third_party/xla/xla/service/cpu/tests/BUILD b/third_party/xla/xla/service/cpu/tests/BUILD
index dd27f5eb8a38e9..7c8f397f542d90 100644
--- a/third_party/xla/xla/service/cpu/tests/BUILD
+++ b/third_party/xla/xla/service/cpu/tests/BUILD
@@ -426,23 +426,6 @@ xla_cc_test(
     ],
 )
 
-xla_cc_test(
-    name = "xnn_fusion_test",
-    srcs = ["xnn_fusion_test.cc"],
-    deps = [
-        "//xla:error_spec",
-        "//xla/backends/cpu:xnn_gemm_config",
-        "//xla/service:cpu_plugin",
-        "//xla/tests:hlo_test_base",
-        "//xla/tsl/platform:test",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:string_view",
-        "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:platform_port",
-    ],
-)
-
 xla_cc_test(
     name = "cpu_copy_test",
     srcs = ["cpu_copy_test.cc"],
diff --git a/third_party/xla/xla/service/cpu/tests/xnn_fusion_test.cc b/third_party/xla/xla/service/cpu/tests/xnn_fusion_test.cc
deleted file mode 100644
index 095805fdfffe2f..00000000000000
--- a/third_party/xla/xla/service/cpu/tests/xnn_fusion_test.cc
+++ /dev/null
@@ -1,388 +0,0 @@
-/* Copyright 2024 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <string>
-#include <vector>
-
-#include <gtest/gtest.h>
-#include "absl/container/flat_hash_map.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_replace.h"
-#include "absl/strings/string_view.h"
-#include "xla/backends/cpu/xnn_gemm_config.h"
-#include "xla/error_spec.h"
-#include "xla/tests/hlo_test_base.h"
-#include "xla/tsl/platform/test.h"
-#include "tsl/platform/cpu_info.h"
-
-namespace xla::cpu {
-namespace {
-
-using ::testing::HasSubstr;
-
-struct XnnFusionTestParams {
-  std::string in_dtype;
-  std::string out_dtype;  // Only used for mixed input/output types.
-};
-
-class XnnFusionTest
-    : public HloTestBase,
-      public ::testing::WithParamInterface<XnnFusionTestParams> {
- public:
-  static std::string Name(
-      const ::testing::TestParamInfo<XnnFusionTestParams>& info) {
-    return absl::StrCat(info.param.in_dtype, "_", info.param.out_dtype);
-  }
-
- protected:
-  XnnFusionTest() {
-    // Override XnnGemmConfig.
-    GetXnnGemmConfig().SetTestFilter([](const XnnGemm&) { return true; });
-  }
-
-  ~XnnFusionTest() override { GetXnnGemmConfig().SetTestFilter(nullptr); }
-
-  void RunTest(absl::string_view hlo_template, absl::string_view check_str) {
-    XnnFusionTestParams params = GetParam();
-    std::string hlo_text =
-        absl::StrReplaceAll(hlo_template, {{"$dtype", params.in_dtype},
-                                           {"$in_dtype", params.in_dtype},
-                                           {"$out_dtype", params.out_dtype}});
-    bool bf16_compute = params.in_dtype == "bf16" || params.out_dtype == "bf16";
-    double tolerance = bf16_compute ? 1e-2 : 1e-7;
-    EXPECT_TRUE(RunAndCompare(
-        hlo_text, ErrorSpec{/*aabs=*/tolerance, /*arel=*/tolerance}));
-
-    if (bf16_compute && !check_str.empty()) {
-      std::string check_text =
-          absl::StrReplaceAll(check_str, {{"$dtype", params.in_dtype},
-                                          {"$in_dtype", params.in_dtype},
-                                          {"$out_dtype", params.out_dtype}});
-      MatchOptimizedHlo(hlo_text, check_text);
-    }
-  }
-};
-
-bool ShouldSkipDotBf16Test(absl::string_view in_dtype) {
-  return in_dtype == "bf16" &&
-         !tsl::port::TestCPUFeature(tsl::port::AVX512_BF16);
-}
-
-absl::string_view GetOutputTypeSupportedByXnnBatchMatMul(
-    absl::string_view in_dtype) {
-  static const auto* kSupportedOutputTypes =
-      new absl::flat_hash_map<absl::string_view, absl::string_view>(
-          {{"f32", "f32"}, {"bf16", "f32"}});
-
-  return kSupportedOutputTypes->at(in_dtype);
-}
-
-std::string InsertConvertIfNecessary(absl::string_view hlo_text,
-                                     absl::string_view in_dtype,
-                                     absl::string_view out_dtype,
-                                     absl::string_view convert_text) {
-  absl::string_view supported_dtype =
-      GetOutputTypeSupportedByXnnBatchMatMul(in_dtype);
-  bool need_convert = out_dtype != supported_dtype;
-  return absl::StrReplaceAll(
-      hlo_text, {{"$root ", need_convert ? "" : "ROOT "},
-                 {"$dot_dtype", need_convert ? supported_dtype : "$out_dtype"},
-                 {"$convert_if_necessary", need_convert ? convert_text : ""},
-                 {"$dot_or_convert", need_convert ? "%convert" : "%dot"}});
-}
-
-// For tests that always have same input/output types.
-using SameTypeTest = XnnFusionTest;
-
-TEST_P(SameTypeTest, AddAndMultiply) {
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule add_and_multiply
-
-    xnn_fusion {
-      %lhs = $dtype[4] parameter(0)
-      %rhs = $dtype[4] parameter(1)
-      %add = $dtype[4] add(%lhs, %rhs)
-      ROOT %mul = $in_dtype[4] multiply(%add, %add)
-    }
-
-    ENTRY entry {
-      %p0 = $dtype[4] parameter(0)
-      %p1 = $dtype[4] parameter(1)
-      ROOT %fusion = $dtype[4] fusion(%p0, %p1), kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  // Optimized HLO shouldn't have any convert.
-  constexpr absl::string_view kCheckStr = R"(
-    CHECK:      %xnn_fusion
-    CHECK-NOT:  convert
-    CHECK:      multiply
-  )";
-
-  RunTest(kModuleStr, kCheckStr);
-}
-
-TEST_P(SameTypeTest, DotAddMultiply) {
-  XnnFusionTestParams params = GetParam();
-  if (ShouldSkipDotBf16Test(params.in_dtype)) {
-    GTEST_SKIP() << "XNNPACK bf16 matmul requires AVX512_BF16 which this CPU "
-                    "doesn't have.";
-  }
-
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule dot_add_multiply
-
-    xnn_fusion {
-      %lhs = $dtype[4,5] parameter(0)
-      %rhs = $dtype[5,6] parameter(1)
-      %addend = $dtype[4,6] parameter(2)
-      %multiplier = $dtype[4,6] parameter(3)
-      %dot = $dot_dtype[4,6] dot(%lhs, %rhs),
-        lhs_contracting_dims={1}, rhs_contracting_dims={0}
-      $convert_if_necessary
-      %add = $dtype[4,6] add($dot_or_convert, %addend)
-      ROOT %mul = $dtype[4,6] multiply(%add, %multiplier)
-    }
-
-    ENTRY entry {
-      %lhs = $dtype[4,5] parameter(0)
-      %rhs = $dtype[5,6] parameter(1)
-      %addend = $dtype[4, 6] parameter(2)
-      %multiplier = $dtype[4, 6] parameter(3)
-      ROOT %fusion = $dtype[4,6] fusion(%lhs, %rhs, %addend, %multiplier),
-        kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  constexpr absl::string_view kConvertStr =
-      "%convert = $dtype[4,6] convert(%dot)";
-
-  // Optimized HLO shouldn't have any convert before the dot.
-  constexpr absl::string_view kCheckStr = R"(
-    CHECK:      %xnn_fusion
-    CHECK-NOT:  convert
-    CHECK:      dot
-  )";
-
-  RunTest(InsertConvertIfNecessary(kModuleStr, params.in_dtype,
-                                   params.out_dtype, kConvertStr),
-          kCheckStr);
-}
-
-TEST_P(SameTypeTest, DotRhsTransposedAndMultiply) {
-  XnnFusionTestParams params = GetParam();
-  if (ShouldSkipDotBf16Test(params.in_dtype)) {
-    GTEST_SKIP() << "XNNPACK bf16 matmul requires AVX512_BF16 which this CPU "
-                    "doesn't have.";
-  }
-
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule dot_rhs_transposed_and_multiply
-
-    xnn_fusion {
-      %lhs = $dtype[4,5] parameter(0)
-      %rhs = $dtype[6,5] parameter(1)
-      %multiplier = $dtype[4,6] parameter(2)
-      %dot = $dot_dtype[4,6] dot(%lhs, %rhs),
-        lhs_contracting_dims={1}, rhs_contracting_dims={1}
-      $convert_if_necessary
-      ROOT %mul = $dtype[4,6] multiply($dot_or_convert, %multiplier)
-    }
-
-    ENTRY entry {
-      %lhs = $dtype[4,5] parameter(0)
-      %rhs = $dtype[6,5] parameter(1)
-      %multiplier = $dtype[4, 6] parameter(2)
-      ROOT %fusion = $dtype[4,6] fusion(%lhs, %rhs, %multiplier),
-        kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  constexpr absl::string_view kConvertStr =
-      "%convert = $dtype[4,6] convert(%dot)";
-
-  // Optimized HLO shouldn't have any convert before the dot.
-  constexpr absl::string_view kCheckStr = R"(
-    CHECK:      %xnn_fusion
-    CHECK-NOT:  convert
-    CHECK:      dot
-  )";
-
-  RunTest(InsertConvertIfNecessary(kModuleStr, params.in_dtype,
-                                   params.out_dtype, kConvertStr),
-          kCheckStr);
-}
-
-std::vector<XnnFusionTestParams> GetSameTypeTestCases() {
-  return std::vector<XnnFusionTestParams>({
-      XnnFusionTestParams{"f32", "f32" /*unused*/},
-      XnnFusionTestParams{"bf16", "bf16" /*unused*/},
-  });
-}
-
-INSTANTIATE_TEST_SUITE_P(SameTypeTestInstantiation, SameTypeTest,
-                         ::testing::ValuesIn(GetSameTypeTestCases()),
-                         XnnFusionTest::Name);
-
-// For tests that we might want to use different input/output types.
-using MixedTypesTest = XnnFusionTest;
-
-TEST_P(MixedTypesTest, BatchedDot) {
-  XnnFusionTestParams params = GetParam();
-  if (ShouldSkipDotBf16Test(params.in_dtype)) {
-    GTEST_SKIP() << "XNNPACK bf16 matmul requires AVX512_BF16 which this CPU"
-                    "doesn't have.";
-  }
-
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule dot_add_multiply
-
-    xnn_fusion {
-      %lhs = $in_dtype[2,3,4,5] parameter(0)
-      %rhs = $in_dtype[2,3,5,6] parameter(1)
-      $root %dot = $dot_dtype[2,3,4,6] dot(%lhs, %rhs),
-        lhs_batch_dims={0,1}, rhs_batch_dims={0,1},
-        lhs_contracting_dims={3}, rhs_contracting_dims={2}
-      $convert_if_necessary
-    }
-
-    ENTRY entry {
-      %lhs = $in_dtype[2,3,4,5] parameter(0)
-      %rhs = $in_dtype[2,3,5,6] parameter(1)
-      ROOT %fusion = $out_dtype[2,3,4,6] fusion(%lhs, %rhs),
-        kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  constexpr absl::string_view kConvertStr =
-      "ROOT %convert = $out_dtype[2,3,4,6] convert(%dot)";
-
-  // Optimized HLO shouldn't have any convert before the dot.
-  constexpr absl::string_view kCheckStr = R"(
-    CHECK:      %xnn_fusion
-    CHECK-NOT:  convert
-    CHECK:      dot
-  )";
-
-  RunTest(InsertConvertIfNecessary(kModuleStr, params.in_dtype,
-                                   params.out_dtype, kConvertStr),
-          kCheckStr);
-}
-
-std::vector<XnnFusionTestParams> GetMixedTypesTestCases() {
-  return std::vector<XnnFusionTestParams>({
-      XnnFusionTestParams{"f32", "f32"},
-      XnnFusionTestParams{"bf16", "f32"},
-      XnnFusionTestParams{"bf16", "bf16"},
-  });
-}
-
-INSTANTIATE_TEST_SUITE_P(MixedTypesTestInstantiation, MixedTypesTest,
-                         ::testing::ValuesIn(GetMixedTypesTestCases()),
-                         XnnFusionTest::Name);
-
-TEST_F(XnnFusionTest, ConvertF32ToBF16) {
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule convert
-
-    xnn_fusion {
-      %input = f32[2,3,4,5] parameter(0)
-      ROOT %dot = bf16[2,3,4,5] convert(%input)
-    }
-
-    ENTRY entry {
-      %input = f32[2,3,4,5] parameter(0)
-      ROOT %fusion = bf16[2,3,4,5] fusion(%input),
-        kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  EXPECT_TRUE(RunAndCompare(kModuleStr, ErrorSpec{1e-2}));
-}
-
-// The following tests don't need to be run with different data types.
-TEST_F(XnnFusionTest, UnsupportedDot) {
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule unsupported_dot
-
-    xnn_fusion {
-      %lhs = f32[5,4] parameter(0)
-      %rhs = f32[5,6] parameter(1)
-      ROOT %dot = f32[4,6] dot(%lhs, %rhs),
-        lhs_contracting_dims={0}, rhs_contracting_dims={0}
-    }
-
-    ENTRY entry {
-      %lhs = f32[5,4] parameter(0)
-      %rhs = f32[5,6] parameter(1)
-      ROOT %fusion = f32[4,6] fusion(%lhs, %rhs),
-        kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  auto status = RunAndCompare(kModuleStr, ErrorSpec{0.0});
-  EXPECT_FALSE(status);
-  EXPECT_THAT(status.message(),
-              HasSubstr("Unsupported XNNPACK Dot op variation"));
-}
-
-TEST_F(XnnFusionTest, UnsupportedBatchDot) {
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule unsupported_dot
-
-    xnn_fusion {
-      %lhs = f32[64,64] parameter(0)
-      %rhs = f32[64,64] parameter(1)
-      ROOT %dot = f32[64]{0} dot(%lhs, %rhs),
-        lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_batch_dims={0}, rhs_contracting_dims={1}
-    }
-
-    ENTRY entry {
-      %lhs = f32[64,64] parameter(0)
-      %rhs = f32[64,64] parameter(1)
-      ROOT %fusion = f32[64] fusion(%lhs, %rhs),
-        kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  auto status = RunAndCompare(kModuleStr, ErrorSpec{0.0});
-  EXPECT_FALSE(status);
-  EXPECT_THAT(status.message(),
-              HasSubstr("Unsupported XNNPACK Dot op variation"));
-}
-
-TEST_F(XnnFusionTest, UnsupportedOp) {
-  constexpr absl::string_view kModuleStr = R"(
-    HloModule unsupported_sqrt
-
-    xnn_fusion {
-      %x = f32[10] parameter(0)
-      ROOT %e = f32[10] erf(%x)
-    }
-
-    ENTRY entry {
-      %x = f32[10] parameter(0)
-      ROOT %e = f32[10] fusion(%x), kind=kCustom, calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
-    })";
-
-  auto status = RunAndCompare(kModuleStr, ErrorSpec{0.0});
-  EXPECT_FALSE(status);
-  EXPECT_THAT(status.message(),
-              HasSubstr("Unsupported elementwise instruction in XNN fusion"));
-}
-
-}  // namespace
-}  // namespace xla::cpu
diff --git a/third_party/xla/xla/service/cpu/thunk_emitter.cc b/third_party/xla/xla/service/cpu/thunk_emitter.cc
index 5b91a6c097c6d8..80a9fe9245982f 100644
--- a/third_party/xla/xla/service/cpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/cpu/thunk_emitter.cc
@@ -69,10 +69,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/thunk.h"
 #include "xla/backends/cpu/runtime/topk_thunk.h"
 #include "xla/backends/cpu/runtime/while_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_dot_thunk.h"
-#include "xla/backends/cpu/runtime/xnnpack/xnn_fusion_thunk.h"
-#include "xla/backends/cpu/xnn_emitter.h"
-#include "xla/backends/cpu/xnn_support.h"
 #include "xla/codegen/emitters/computation_fingerprint.h"
 #include "xla/codegen/emitters/kernel_api_builder.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -445,10 +441,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitHloInstruction(
         }
 #endif  // XLA_ONEDNN_USE_GRAPH_API
 
-        if (backend_config.fusion_config().kind() == kXnnFusionKind) {
-          return EmitXnnFusionThunk(instruction);
-        }
-
 #ifdef XLA_YNNPACK
         if (backend_config.fusion_config().kind() == kYnnFusionKind) {
           return EmitYnnFusionThunk(instruction);
@@ -1114,31 +1106,9 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitDotThunk(
       }
 #endif  // XLA_YNNPACK
 
-      // Decide whether to use XNNPACK or Eigen.
-      bool use_xnn = hlo_module_config_.debug_options().xla_cpu_use_xnnpack();
-      if (use_xnn) {
-        const bool use_cost_model =
-            hlo_module_config_.debug_options()
-                .xla_cpu_experimental_xnn_graph_fusion_mode() !=
-            DebugOptions::XNN_GRAPH_FUSION_MODE_BYPASS_COST_MODEL;
-        TF_ASSIGN_OR_RETURN(
-            use_xnn,
-            IsDotSupportedByXnn(dnums, lhs->shape(), rhs->shape(),
-                                instruction->shape(), &target_machine_features_,
-                                use_cost_model));
-      }
-
-      if (use_xnn) {
-        bool capture_rhs = HloPredicateIsOp<HloOpcode::kParameter>(rhs);
-        return ThunkSequence::Of<XnnDotThunk>(
-            XnnDotThunk::Options{}, ThunkInfo(instruction), dnums, lhs_slice,
-            lhs->shape(), rhs_slice, rhs->shape(), out_slice,
-            instruction->shape(), capture_rhs);
-      } else {
-        return ThunkSequence::Of<DotThunk>(
-            ThunkInfo(instruction), dnums, lhs_slice, lhs->shape(), rhs_slice,
-            rhs->shape(), out_slice, instruction->shape());
-      }
+      return ThunkSequence::Of<DotThunk>(
+          ThunkInfo(instruction), dnums, lhs_slice, lhs->shape(), rhs_slice,
+          rhs->shape(), out_slice, instruction->shape());
     }
   }
 }
@@ -1502,41 +1472,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitOneDnnFusionThunk(
 #endif  // XLA_ONEDNN_USE_GRAPH_API
 }
 
-absl::StatusOr<ThunkSequence> ThunkEmitter::EmitXnnFusionThunk(
-    const HloInstruction* instruction) {
-  auto* fusion = Cast<HloFusionInstruction>(instruction);
-
-  // Collect XNNPACK fusion arguments.
-  std::vector<XnnFusionThunk::Argument> arguments;
-  for (HloInstruction* operand : instruction->operands()) {
-    for (auto& indexed : ShapeUtil::GetLeafShapes(operand->shape())) {
-      TF_ASSIGN_OR_RETURN(
-          BufferAllocation::Slice slice,
-          buffer_assignment_.GetUniqueSlice(operand, indexed.index));
-      arguments.push_back(XnnFusionThunk::Argument{slice, indexed.shape});
-    }
-  }
-
-  // Collect XNNPACK fusion results.
-  std::vector<XnnFusionThunk::Result> results;
-  for (auto& indexed : ShapeUtil::GetLeafShapes(instruction->shape())) {
-    TF_ASSIGN_OR_RETURN(
-        BufferAllocation::Slice slice,
-        buffer_assignment_.GetUniqueSlice(instruction, indexed.index));
-    results.push_back(XnnFusionThunk::Result{slice, indexed.shape});
-  }
-
-  const HloComputation* computation = fusion->fused_instructions_computation();
-
-  // Construct XNNPACK subgraph builder from the fusion computation.
-  TF_ASSIGN_OR_RETURN(auto builder, EmitXnnFusionBuilder(computation));
-
-  return ThunkSequence::Of<XnnFusionThunk>(
-      XnnFusionThunk::Options{}, ThunkInfo(instruction), std::move(arguments),
-      std::move(results),
-      [b = std::move(builder)](auto, auto) mutable { return b(); });
-}
-
 absl::StatusOr<ThunkSequence> ThunkEmitter::EmitYnnFusionThunk(
     const HloInstruction* instruction) {
 #ifdef XLA_YNNPACK

From 1d655a94d025b5f6c7ece756e102c32e7c122558 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Thu, 11 Dec 2025 23:28:59 -0800
Subject: [PATCH 213/753] Add Shape to ConditionalThunk buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 843541791
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  5 ++
 .../gpu/runtime/command_buffer_cmd.cc         |  8 +--
 .../backends/gpu/runtime/command_buffer_cmd.h |  5 +-
 .../gpu/runtime/command_buffer_cmd_emitter.cc |  1 -
 .../command_buffer_conversion_pass_test.cc    |  7 ++-
 .../gpu/runtime/command_buffer_thunk_test.cc  |  4 +-
 .../backends/gpu/runtime/conditional_thunk.cc | 33 ++++++-----
 .../backends/gpu/runtime/conditional_thunk.h  | 14 ++---
 .../gpu/runtime/conditional_thunk_test.cc     | 58 ++++++++++++-------
 .../gpu/runtime/for_all_thunks_test.cc        | 11 +++-
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  3 +-
 .../runtime/thunk_buffer_debug_pass_test.cc   | 11 ++--
 .../thunk_proto_deserialization_test.cc       |  6 +-
 .../xla/xla/service/gpu/thunk_emitter.cc      |  7 ++-
 14 files changed, 103 insertions(+), 70 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 3876537b50c5b2..e51e4f946e9955 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -449,6 +449,7 @@ cc_library(
     deps = [
         ":host_memory_pool",
         ":sequential_thunk",
+        ":shaped_slice",
         ":thunk",
         ":thunk_proto_cc",
         "//xla:status_macros",
@@ -481,8 +482,10 @@ xla_cc_test(
     deps = [
         ":conditional_thunk",
         ":sequential_thunk",
+        ":shaped_slice",
         ":thunk",
         ":thunk_proto_cc",
+        "//xla:shape_util",
         "//xla/service:buffer_assignment",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
@@ -2243,6 +2246,7 @@ xla_cc_test(
         ":thunk",
         ":thunk_proto_cc",
         ":while_thunk",
+        "//xla:shape_util",
         "//xla/service:buffer_assignment",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
@@ -3307,6 +3311,7 @@ xla_cc_test(
         ":custom_call_thunk",
         ":runtime_intrinsics",
         ":sequential_thunk",
+        ":shaped_slice",
         ":thunk",
         ":thunk_buffer_debug_pass",
         ":thunk_id",
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
index 5412864c823977..f5cbe9d43370fc 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
@@ -1505,11 +1505,11 @@ absl::StatusOr<const se::CommandBuffer::Command*> ChildCmd::Record(
 // CaseCmd
 //===----------------------------------------------------------------------===//
 
-CaseCmd::CaseCmd(BufferAllocation::Slice index, bool index_is_bool,
+CaseCmd::CaseCmd(ShapedSlice index,
                  std::vector<CommandBufferCmdExecutor> branches)
     : CommandBufferCmd(CommandBufferCmdType::kCaseCmd),
       index_(index),
-      index_is_bool_(index_is_bool),
+      index_is_bool_(index.shape.element_type() == PRED),
       branches_(std::move(branches)) {}
 
 absl::Status CaseCmd::Initialize(const Thunk::InitializeParams& params,
@@ -1525,7 +1525,7 @@ absl::StatusOr<const se::CommandBuffer::Command*> CaseCmd::Record(
     const RecordParams& record_params, RecordAction record_action,
     se::CommandBuffer* command_buffer) {
   se::DeviceAddressBase index =
-      execute_params.buffer_allocations->GetDeviceAddress(index_);
+      execute_params.buffer_allocations->GetDeviceAddress(index_.slice);
 
   VLOG(5) << "CaseCmd:";
   VLOG(5) << "  index: " << index_ << " (" << index.opaque() << ")";
@@ -1568,7 +1568,7 @@ bool CaseCmd::force_update() {
 
 CommandBufferCmd::BufferUseVector CaseCmd::buffers() const {
   absl::flat_hash_set<BufferUse> buffers;
-  buffers.emplace(BufferUse::Read(index_));
+  buffers.emplace(BufferUse::Read(index_.slice, index_.shape));
   for (auto& branch : branches_) {
     buffers.insert(branch.buffers().begin(), branch.buffers().end());
   }
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
index abeb0971888d22..f6bc947262cde5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
@@ -869,8 +869,7 @@ class ChildCmd : public CommandBufferCmd {
 
 class CaseCmd : public CommandBufferCmd {
  public:
-  CaseCmd(BufferAllocation::Slice index, bool index_is_bool,
-          std::vector<CommandBufferCmdExecutor> branches);
+  CaseCmd(ShapedSlice index, std::vector<CommandBufferCmdExecutor> branches);
 
   absl::Status Initialize(const Thunk::InitializeParams& params,
                           StateManager& state) override;
@@ -889,7 +888,7 @@ class CaseCmd : public CommandBufferCmd {
   BufferUseVector buffers() const override;
 
  private:
-  BufferAllocation::Slice index_;
+  ShapedSlice index_;
   bool index_is_bool_;
   std::vector<CommandBufferCmdExecutor> branches_;
 };
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc
index 1af03c86094247..8865fdf5853060 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc
@@ -160,7 +160,6 @@ static absl::StatusOr<Command> Convert(
     }
   }
   return std::make_unique<CaseCmd>(thunk.branch_index_buffer(),
-                                   thunk.branch_index_is_bool(),
                                    std::move(branch_cmds));
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc
index 19a47a6ff0f5df..f03e3c3b3e944e 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass_test.cc
@@ -183,6 +183,7 @@ std::unique_ptr<ConditionalThunk> CreateConditionalThunk(
     std::vector<std::vector<std::unique_ptr<Thunk>>> branch_thunks) {
   BufferAllocation alloc(0, 1024, 0);
   BufferAllocation::Slice slice(&alloc, 0, 1024);
+  Shape shape = ShapeUtil::MakeShape(S32, {});
 
   std::vector<std::unique_ptr<SequentialThunk>> branch_thunk_sequences;
   for (auto& thunks : branch_thunks) {
@@ -190,9 +191,9 @@ std::unique_ptr<ConditionalThunk> CreateConditionalThunk(
         Thunk::ThunkInfo(), std::move(thunks)));
   }
 
-  return std::make_unique<ConditionalThunk>(Thunk::ThunkInfo(), slice,
-                                            std::move(branch_thunk_sequences),
-                                            /*branch_index_is_bool=*/false);
+  return std::make_unique<ConditionalThunk>(Thunk::ThunkInfo(),
+                                            ShapedSlice{slice, shape},
+                                            std::move(branch_thunk_sequences));
 }
 
 std::unique_ptr<CustomCallThunk> CreateCustomCallThunk(
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
index 9c80bec9cba67c..b67163dc794508 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
@@ -1384,6 +1384,8 @@ TEST(CommandBufferThunkTest, CaseCmd) {
   BufferAllocation alloc_b(/*index=*/2, byte_length, /*color=*/0);
 
   BufferAllocation::Slice slice_i(&alloc_i, 0, sizeof(int32_t));
+  Shape i_shape = ShapeUtil::MakeShape(S32, {});
+
   BufferAllocation::Slice slice_a(&alloc_a, 0, byte_length);
   BufferAllocation::Slice slice_b(&alloc_b, 0, byte_length);
 
@@ -1417,7 +1419,7 @@ TEST(CommandBufferThunkTest, CaseCmd) {
 
   // Prepare commands sequence for thunk.
   CommandBufferCmdSequence commands;
-  commands.Emplace<CaseCmd>(slice_i, false, std::move(branches));
+  commands.Emplace<CaseCmd>(ShapedSlice{slice_i, i_shape}, std::move(branches));
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor executor,
       CommandBufferCmdExecutor::Create(std::move(commands), serialize));
diff --git a/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.cc
index f1077fcf368d3f..49602cbf3fda36 100644
--- a/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/host_memory_pool.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/service/buffer_assignment.h"
@@ -48,14 +49,18 @@ namespace xla {
 namespace gpu {
 
 ConditionalThunk::ConditionalThunk(
-    ThunkInfo thunk_info,
-    const BufferAllocation::Slice& branch_index_buffer_index,
-    std::vector<std::unique_ptr<SequentialThunk>>&& branch_thunks,
-    bool branch_index_is_bool)
+    ThunkInfo thunk_info, const ShapedSlice& branch_index_buffer_index,
+    std::vector<std::unique_ptr<SequentialThunk>>&& branch_thunks)
     : Thunk(Kind::kConditional, thunk_info),
       branch_index_buffer_index_(branch_index_buffer_index),
       branch_thunks_(std::move(branch_thunks)),
-      branch_index_is_bool_(branch_index_is_bool) {}
+      branch_index_is_bool_(branch_index_buffer_index.shape.element_type() ==
+                            PRED) {
+  PrimitiveType element_type = branch_index_buffer_index.shape.element_type();
+  CHECK(element_type == PRED || element_type == S32);
+  CHECK_EQ(branch_index_buffer_index.shape.dimensions(),
+           std::vector<int64_t>{});
+}
 
 absl::Status ConditionalThunk::Prepare(const PrepareParams& params) {
   if (branch_index_is_bool_) {
@@ -111,7 +116,8 @@ absl::Status ConditionalThunk::ExecuteOnStream(const ExecuteParams& params) {
   }();
 
   se::DeviceAddressBase branch_index_address =
-      params.buffer_allocations->GetDeviceAddress(branch_index_buffer_index_);
+      params.buffer_allocations->GetDeviceAddress(
+          branch_index_buffer_index_.slice);
   if (branch_index_is_bool_) {
     TF_RETURN_IF_ERROR(stream.Memcpy(std::get<bool*>(branch_index_or_pred),
                                      branch_index_address, sizeof(bool)));
@@ -191,8 +197,6 @@ absl::StatusOr<ThunkProto> ConditionalThunk::ToProto() const {
     *conditional_thunk_proto->add_branch_thunks() =
         std::move(seq_thunk_proto).sequential_thunk();
   }
-
-  conditional_thunk_proto->set_branch_index_is_bool(branch_index_is_bool_);
   return proto;
 }
 
@@ -200,10 +204,9 @@ absl::StatusOr<std::unique_ptr<ConditionalThunk>> ConditionalThunk::FromProto(
     ThunkInfo thunk_info, const ConditionalThunkProto& thunk_proto,
     absl::Span<const BufferAllocation> buffer_allocations,
     const Deserializer& deserializer) {
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice branch_index_buffer_index,
-      BufferAllocation::Slice::FromProto(thunk_proto.branch_index_buffer(),
-                                         buffer_allocations));
+  TF_ASSIGN_OR_RETURN(ShapedSlice branch_index_buffer_index,
+                      ShapedSlice::FromProto(thunk_proto.branch_index_buffer(),
+                                             buffer_allocations));
 
   std::vector<std::unique_ptr<SequentialThunk>> branch_thunks;
   branch_thunks.reserve(thunk_proto.branch_thunks_size());
@@ -213,9 +216,9 @@ absl::StatusOr<std::unique_ptr<ConditionalThunk>> ConditionalThunk::FromProto(
         SequentialThunk::FromProto(thunk_info, seq_thunk_proto, deserializer));
     branch_thunks.push_back(std::move(seq_thunk));
   }
-  return std::make_unique<ConditionalThunk>(
-      std::move(thunk_info), branch_index_buffer_index,
-      std::move(branch_thunks), thunk_proto.branch_index_is_bool());
+  return std::make_unique<ConditionalThunk>(std::move(thunk_info),
+                                            branch_index_buffer_index,
+                                            std::move(branch_thunks));
 }
 
 std::string ConditionalThunk::ToString(int indent) const {
diff --git a/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.h b/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.h
index 312574a4a91037..cc03a6cdc4c0fa 100644
--- a/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/conditional_thunk.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/host_memory_pool.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/runtime/buffer_use.h"
@@ -51,10 +52,8 @@ namespace gpu {
 class ConditionalThunk : public Thunk {
  public:
   ConditionalThunk(
-      ThunkInfo thunk_info,
-      const BufferAllocation::Slice& branch_index_buffer_index,
-      std::vector<std::unique_ptr<SequentialThunk>>&& branch_thunks,
-      bool branch_index_is_bool);
+      ThunkInfo thunk_info, const ShapedSlice& branch_index_buffer_index,
+      std::vector<std::unique_ptr<SequentialThunk>>&& branch_thunks);
 
   ConditionalThunk(const ConditionalThunk&) = delete;
   ConditionalThunk& operator=(const ConditionalThunk&) = delete;
@@ -67,7 +66,7 @@ class ConditionalThunk : public Thunk {
     return branch_thunks_;
   }
 
-  const BufferAllocation::Slice& branch_index_buffer() const {
+  const ShapedSlice& branch_index_buffer() const {
     return branch_index_buffer_index_;
   }
 
@@ -82,7 +81,8 @@ class ConditionalThunk : public Thunk {
 
   BufferUses buffer_uses() const override {
     return {
-        BufferUse::Read(branch_index_buffer_index_),
+        BufferUse::Read(branch_index_buffer_index_.slice,
+                        branch_index_buffer_index_.shape),
     };
   }
 
@@ -105,7 +105,7 @@ class ConditionalThunk : public Thunk {
   std::string ToString(int indent) const override;
 
  private:
-  const BufferAllocation::Slice branch_index_buffer_index_;
+  const ShapedSlice branch_index_buffer_index_;
   std::vector<std::unique_ptr<SequentialThunk>> branch_thunks_;
   bool branch_index_is_bool_;
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/conditional_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/conditional_thunk_test.cc
index 247ab4eca9835f..4207aca6afbbb7 100644
--- a/third_party/xla/xla/backends/gpu/runtime/conditional_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/conditional_thunk_test.cc
@@ -27,9 +27,12 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "google/protobuf/text_format.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
@@ -67,9 +70,8 @@ struct DummyThunk : public Thunk {
 
 std::unique_ptr<ConditionalThunk> CreateConditionalThunk(
     const Thunk::ThunkInfo& thunk_info,
-    const BufferAllocation::Slice& branch_index_buffer_index,
-    std::vector<ThunkSequence> branch_thunk_sequences,
-    bool kBranchIndexIsBool) {
+    const ShapedSlice& branch_index_buffer_index,
+    std::vector<ThunkSequence> branch_thunk_sequences) {
   std::vector<std::unique_ptr<SequentialThunk>> branch_thunks;
   for (auto& thunk_sequence : branch_thunk_sequences) {
     branch_thunks.push_back(std::make_unique<SequentialThunk>(
@@ -77,8 +79,7 @@ std::unique_ptr<ConditionalThunk> CreateConditionalThunk(
   }
 
   return std::make_unique<ConditionalThunk>(
-      thunk_info, branch_index_buffer_index, std::move(branch_thunks),
-      kBranchIndexIsBool);
+      thunk_info, branch_index_buffer_index, std::move(branch_thunks));
 }
 
 TEST(ConditionalThunkTest, BufferUses) {
@@ -87,7 +88,10 @@ TEST(ConditionalThunkTest, BufferUses) {
   thunk_info.execution_stream_id = 123;
 
   BufferAllocation alloc(/*index=*/0, /*size=*/1024, /*color=*/0);
+
+  constexpr bool kBranchIndexIsBool = true;
   BufferAllocation::Slice slice(&alloc, /*offset=*/0, /*size=*/256);
+  Shape shape = ShapeUtil::MakeShape(PRED, {});
 
   ThunkSequence false_seq;
   false_seq.push_back(std::make_unique<DummyThunk>(Kind::kGemm, thunk_info));
@@ -101,12 +105,11 @@ TEST(ConditionalThunkTest, BufferUses) {
   branch_thunk_sequences.push_back(std::move(false_seq));
   branch_thunk_sequences.push_back(std::move(true_seq));
 
-  constexpr bool kBranchIndexIsBool = true;
   std::unique_ptr<ConditionalThunk> thunk = CreateConditionalThunk(
-      thunk_info, slice, std::move(branch_thunk_sequences), kBranchIndexIsBool);
+      thunk_info, {slice, shape}, std::move(branch_thunk_sequences));
 
   EXPECT_EQ(thunk->branch_index_is_bool(), kBranchIndexIsBool);
-  EXPECT_EQ(thunk->branch_index_buffer(), slice);
+  EXPECT_EQ(thunk->branch_index_buffer().slice, slice);
 
   auto thunk_matcher = Pointee(Property(&Thunk::kind, Thunk::Kind::kGemm));
   auto branch_matcher = Pointee(Property(
@@ -122,6 +125,7 @@ TEST(ConditionalThunkTest, ToProto) {
 
   BufferAllocation alloc(/*index=*/0, /*size=*/1024, /*color=*/0);
   BufferAllocation::Slice slice(&alloc, /*offset=*/0, /*size=*/256);
+  Shape shape = ShapeUtil::MakeShape(PRED, {});
 
   ThunkSequence false_seq;
   false_seq.push_back(std::make_unique<DummyThunk>(Kind::kGemm, thunk_info));
@@ -135,9 +139,8 @@ TEST(ConditionalThunkTest, ToProto) {
   branch_thunk_seq.push_back(std::move(false_seq));
   branch_thunk_seq.push_back(std::move(true_seq));
 
-  constexpr bool kBranchIndexIsBool = true;
   std::unique_ptr<ConditionalThunk> thunk = CreateConditionalThunk(
-      thunk_info, slice, std::move(branch_thunk_seq), kBranchIndexIsBool);
+      thunk_info, {slice, shape}, std::move(branch_thunk_seq));
   TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, thunk->ToProto());
 
   std::string expected = R"pb(
@@ -146,7 +149,13 @@ TEST(ConditionalThunkTest, ToProto) {
       execution_stream_id: 123
     }
     conditional_thunk {
-      branch_index_buffer { size: 256 }
+      branch_index_buffer {
+        slice { size: 256 }
+        shape {
+          element_type: PRED
+          layout { tail_padding_alignment_in_elements: 1 }
+        }
+      }
       branch_thunks {
         thunks {
           thunk_info {
@@ -175,7 +184,6 @@ TEST(ConditionalThunkTest, ToProto) {
           }
         }
       }
-      branch_index_is_bool: true
     }
   )pb";
   EXPECT_THAT(proto, EqualsProto(expected));
@@ -190,7 +198,13 @@ TEST(ConditionalThunkTest, FromProto) {
           execution_stream_id: 123
         }
         conditional_thunk {
-          branch_index_buffer { offset: 8 size: 256 buffer_allocation_index: 0 }
+          branch_index_buffer {
+            slice { offset: 8 size: 256 buffer_allocation_index: 0 }
+            shape {
+              element_type: PRED
+              layout { tail_padding_alignment_in_elements: 1 }
+            }
+          }
           branch_thunks {
             thunks {
               thunk_info {
@@ -219,7 +233,6 @@ TEST(ConditionalThunkTest, FromProto) {
               }
             }
           }
-          branch_index_is_bool: true
         }
       )pb",
       &proto));
@@ -248,6 +261,8 @@ TEST(ConditionalThunkTest, ToString) {
 
   BufferAllocation alloc(/*index=*/0, /*size=*/1024, /*color=*/0);
   BufferAllocation::Slice slice(&alloc, /*offset=*/0, /*size=*/256);
+  Shape bool_shape = ShapeUtil::MakeShape(PRED, {});
+  Shape int_shape = ShapeUtil::MakeShape(S32, {});
 
   auto create_branch_thunk_sequences = [&]() -> std::vector<ThunkSequence> {
     ThunkSequence false_seq;
@@ -264,9 +279,8 @@ TEST(ConditionalThunkTest, ToString) {
   };
 
   ThunkSequence thunk_sequence;
-  thunk_sequence.push_back(
-      CreateConditionalThunk(thunk_info, slice, create_branch_thunk_sequences(),
-                             /*kBranchIndexIsBool=*/true));
+  thunk_sequence.push_back(CreateConditionalThunk(
+      thunk_info, {slice, bool_shape}, create_branch_thunk_sequences()));
   auto sequential_thunk =
       std::make_unique<SequentialThunk>(thunk_info, std::move(thunk_sequence));
   EXPECT_EQ(sequential_thunk->ToString(/*indent=*/0),
@@ -277,9 +291,8 @@ TEST(ConditionalThunkTest, ToString) {
             "    000: kGemm\t\n"
             "    000: kGemm\t\n\n");
 
-  std::unique_ptr<ConditionalThunk> thunk =
-      CreateConditionalThunk(thunk_info, slice, create_branch_thunk_sequences(),
-                             /*kBranchIndexIsBool=*/false);
+  std::unique_ptr<ConditionalThunk> thunk = CreateConditionalThunk(
+      thunk_info, {slice, int_shape}, create_branch_thunk_sequences());
 
   EXPECT_EQ(thunk->ToString(/*indent=*/0),
             "\n"
@@ -292,14 +305,15 @@ TEST(ConditionalThunkTest, ToString) {
 
 TEST(ConditionalThunkTest, TransformAllNestedThunks) {
   BufferAllocation::Slice slice;
+  Shape shape = ShapeUtil::MakeShape(S32, {});
+
   std::vector<std::unique_ptr<SequentialThunk>> branch_thunks;
   branch_thunks.push_back(
       std::make_unique<SequentialThunk>(Thunk::ThunkInfo(), ThunkSequence()));
   branch_thunks.push_back(
       std::make_unique<SequentialThunk>(Thunk::ThunkInfo(), ThunkSequence()));
   auto conditional_thunk = std::make_unique<ConditionalThunk>(
-      Thunk::ThunkInfo(), slice, std::move(branch_thunks),
-      /*branch_index_is_bool=*/false);
+      Thunk::ThunkInfo(), ShapedSlice{slice, shape}, std::move(branch_thunks));
 
   TF_EXPECT_OK(conditional_thunk->TransformAllNestedThunks([](auto) {
     return std::make_unique<DummyThunk>(Kind::kCustomCall, Thunk::ThunkInfo());
diff --git a/third_party/xla/xla/backends/gpu/runtime/for_all_thunks_test.cc b/third_party/xla/xla/backends/gpu/runtime/for_all_thunks_test.cc
index 7f5349975ee739..bec1a8fedb07a1 100644
--- a/third_party/xla/xla/backends/gpu/runtime/for_all_thunks_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/for_all_thunks_test.cc
@@ -30,6 +30,8 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/backends/gpu/runtime/while_thunk.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 
 namespace xla::gpu {
 namespace {
@@ -106,11 +108,14 @@ TEST(ForAllThunksTest, ConditionalThunk) {
       Thunk::ThunkInfo(), std::move(thunk_sequence));
   SequentialThunk* sequential_thunk_ptr = sequential_thunk.get();
 
+  BufferAllocation alloc(0, 1024, 0);
+  BufferAllocation::Slice slice(&alloc, 0, 4);
+  Shape shape = ShapeUtil::MakeShape(S32, {});
+
   std::vector<std::unique_ptr<SequentialThunk>> branch_thunks;
   branch_thunks.push_back(std::move(sequential_thunk));
-  ConditionalThunk conditional_thunk(
-      Thunk::ThunkInfo(), BufferAllocation::Slice(), std::move(branch_thunks),
-      /*branch_index_is_bool=*/false);
+  ConditionalThunk conditional_thunk(Thunk::ThunkInfo(), {slice, shape},
+                                     std::move(branch_thunks));
 
   EXPECT_THAT(GetAllThunks(&conditional_thunk),
               UnorderedElementsAre(thunk_ptr, sequential_thunk_ptr,
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index a84798192bfe65..5a698ae7e8046f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -71,9 +71,8 @@ message DeviceToDeviceCopyThunkProto {
 }
 
 message ConditionalThunkProto {
-  xla.buffer_assignment.BufferAllocationSliceProto branch_index_buffer = 1;
+  ShapedSliceProto branch_index_buffer = 1;
   repeated SequentialThunkProto branch_thunks = 2;
-  bool branch_index_is_bool = 3;
 }
 
 message WhileThunkProto {
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc
index 62bc737997450a..135f3e42694bd2 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_buffer_debug_pass_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/custom_call_thunk.h"
 #include "xla/backends/gpu/runtime/runtime_intrinsics.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk_buffer_debug_saver_inserter.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
@@ -300,11 +301,13 @@ TEST_F(ThunkBufferDebugPassTest, RecursivelyInsertsBuffersDebugChecksumThunks) {
       SequentialThunk::FromThunk(std::move(conditional_branch0_thunk)));
   branch_thunks.push_back(
       SequentialThunk::FromThunk(std::move(conditional_branch1_thunk)));
+
+  Shape condition_shape = ShapeUtil::MakeShape(PRED, {});
+  BufferAllocation::Slice condition_slice = CreateSlice();
+
   auto conditional_thunk = std::make_unique<ConditionalThunk>(
-      Thunk::ThunkInfo(),
-      /*branch_index_buffer_index=*/BufferAllocation::Slice(),
-      std::move(branch_thunks),
-      /*branch_index_is_bool=*/true);
+      Thunk::ThunkInfo(), ShapedSlice{condition_slice, condition_shape},
+      std::move(branch_thunks));
   const Thunk* const conditional_thunk_ptr = conditional_thunk.get();
   std::vector<std::unique_ptr<Thunk>> while_body_thunks;
   while_body_thunks.push_back(std::move(while_body_fake_thunk));
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
index b3afd91438161a..ca04379d3bf194 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
@@ -436,7 +436,10 @@ TEST(ThunkProtoDeserializationTest, ConditionalThunk) {
           execution_stream_id: 123
         }
         conditional_thunk {
-          branch_index_buffer { offset: 8 size: 256 buffer_allocation_index: 5 }
+          branch_index_buffer {
+            slice { offset: 8 size: 1 buffer_allocation_index: 5 }
+            shape { element_type: PRED }
+          }
           branch_thunks {
             thunks {
               thunk_info {
@@ -569,7 +572,6 @@ TEST(ThunkProtoDeserializationTest, ConditionalThunk) {
               }
             }
           }
-          branch_index_is_bool: true
         }
       )pb");
 
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 890f1c660302fa..2dc34ce63ca7ea 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -428,12 +428,13 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitConditional(
   }
   TF_ASSIGN_OR_RETURN(auto slice,
                       GetAllocationSliceForHlo(instr->operand(0), {}));
-  bool branch_index_is_bool = instr->operand(0)->shape().element_type() == PRED;
 
-  return GetThunkSequence(std::make_unique<ConditionalThunk>(
+  auto placeholder = GetThunkSequence(std::make_unique<ConditionalThunk>(
       Thunk::ThunkInfo::WithProfileAnnotation(
           instr, ir_emitter_context_->GetNextThunkId()),
-      slice, std::move(branch_thunks), branch_index_is_bool));
+      ShapedSlice{slice, instr->operand(0)->shape()},
+      std::move(branch_thunks)));
+  return placeholder;
 }
 
 // Input = {dynamic array(with dynamic dimension meta data at the end)}

From d040514b80c6197229ebf85fc987af506b4401ad Mon Sep 17 00:00:00 2001
From: Mehrdad Khani <mehrdadk@google.com>
Date: Thu, 11 Dec 2025 23:37:39 -0800
Subject: [PATCH 214/753] [XLA:TPU] Modify memory space assignment to not
 double count the ConcatBitcast shared buffers between input and output
 operands while exporting its heap simulator trace. MSA adds such
 ConcatBitcast operations when it adds sliced prefetches.

PiperOrigin-RevId: 843544179
---
 .../memory_space_assignment.cc                |  70 +++++++++-
 .../memory_space_assignment_test.cc           | 129 ++++++++++++++++++
 2 files changed, 196 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
index 58241c0e147225..0d2933aebbd6b0 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment.cc
@@ -1176,6 +1176,62 @@ absl::Status MemorySpaceAssignment::FixSchedule() {
   return absl::OkStatus();
 }
 
+namespace {
+
+bool IsConcatBitcastCustomCall(const HloInstruction* instruction) {
+  return instruction->opcode() == HloOpcode::kCustomCall &&
+         instruction->custom_call_target() ==
+             memory_space_assignment::kConcatBitcastCustomCall;
+}
+
+// If a use is a ConcatBitcastCustomCall, we add the uses of the
+// ConcatBitcastCustomCall's output buffer to the list of uses for the
+// current value. Since the instructions that have ConcatBitcastCustomCall as a
+// user are slices, this means we are considering all uses of the concatenated
+// buffer as uses of the original slices.
+std::vector<HloUse> GetUsesAndExtendIfConcatBitcast(
+    const HloValue* value, const HloDataflowAnalysis& dataflow_analysis) {
+  std::vector<HloUse> uses;
+  for (const HloUse& use : value->GetUses()) {
+    if (IsConcatBitcastCustomCall(use.instruction)) {
+      const HloValue& concat_bitcast_value =
+          dataflow_analysis.GetUniqueValueAt(use.instruction);
+      absl::c_copy(concat_bitcast_value.GetUses(), std::back_inserter(uses));
+    } else {
+      uses.push_back(use);
+    }
+  }
+  return uses;
+}
+
+// If a value is used by a ConcatBitcastCustomCall, we extend the time bound to
+// represent the time bound of the concatenated value.
+HloLiveRange::TimeBound GetTimeBoundAndExtendIfConcatBitcast(
+    const HloValue* value, const HloDataflowAnalysis& dataflow_analysis,
+    const HloLiveRange& hlo_live_range) {
+  HloLiveRange::TimeBound time_bound =
+      hlo_live_range.buffer_live_ranges().at(value);
+  for (const HloUse& use : value->GetUses()) {
+    if (IsConcatBitcastCustomCall(use.instruction)) {
+      const HloValue& concat_bitcast_value =
+          dataflow_analysis.GetUniqueValueAt(use.instruction);
+      const HloLiveRange::TimeBound& concat_time_bound =
+          hlo_live_range.buffer_live_ranges().at(&concat_bitcast_value);
+      time_bound.start = std::min(time_bound.start, concat_time_bound.start);
+      time_bound.end = std::max(time_bound.end, concat_time_bound.end);
+      if (hlo_live_range.instruction_schedule().at(
+              time_bound.end_position.instruction) <
+          hlo_live_range.instruction_schedule().at(
+              concat_time_bound.end_position.instruction)) {
+        time_bound.end_position = concat_time_bound.end_position;
+      }
+    }
+  }
+  return time_bound;
+}
+
+}  // namespace
+
 absl::Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace(
     const HloAliasAnalysis& alias_analysis,
     std::vector<int64_t>* alt_mem_bytes_occupied) {
@@ -1197,6 +1253,9 @@ absl::Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace(
   auto add_allocation_and_verify = [&](int64_t start_time, int64_t end_time,
                                        const HeapSimulator::Chunk& chunk,
                                        const HloValue* value) -> absl::Status {
+    if (IsConcatBitcastCustomCall(value->instruction())) {
+      return absl::OkStatus();
+    }
     events[std::make_tuple(start_time, /*is_free=*/false, value->id())] =
         std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC);
     events[std::make_tuple(end_time, /*is_free=*/true, value->id())] =
@@ -1256,10 +1315,13 @@ absl::Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace(
 
     for (const HloValue* value : buffer.values()) {
       const HloLiveRange::TimeBound& time_bound =
-          hlo_live_range->buffer_live_ranges().at(value);
+          GetTimeBoundAndExtendIfConcatBitcast(
+              value, alias_analysis.dataflow_analysis(), *hlo_live_range);
       const HloInstruction* last_use_instruction = nullptr;
       int64_t last_use_time = time_bound.start;
-      for (const HloUse& use : value->GetUses()) {
+      std::vector<HloUse> uses = GetUsesAndExtendIfConcatBitcast(
+          value, alias_analysis.dataflow_analysis());
+      for (const HloUse& use : uses) {
         int64_t use_time =
             hlo_live_range->instruction_schedule().at(use.instruction);
         if (use_time > last_use_time) {
@@ -1293,7 +1355,9 @@ absl::Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace(
               std::min(earliest_computation_start_time, computation_start_time);
           int64_t last_use_time = -1;
           const HloInstruction* last_use_instruction = nullptr;
-          for (const HloUse& use : value->GetUses()) {
+          std::vector<HloUse> uses = GetUsesAndExtendIfConcatBitcast(
+              value, alias_analysis.dataflow_analysis());
+          for (const HloUse& use : uses) {
             int64_t use_time =
                 hlo_live_range->instruction_schedule().at(use.instruction);
             if (use.instruction->parent() == called_computation &&
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
index b60ef8872a01e7..f97b490e296afb 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
@@ -13370,6 +13370,135 @@ ENTRY main {
   TF_EXPECT_OK(CheckSliceChunks(*assignments, root->operand(1)));
 }
 
+// This module is optmized as below, which adds two slices followed by a concat
+// bitcast. Since the concat bitcast uses the same slice buffers for its output,
+// the heap simulator trace should not have freed any of the slices before the
+// concat bitcast users are allocated/processed.
+//
+// ENTRY main {
+//   ...
+//   p1 = f32[8,8]{1,0} parameter(1)
+//   slice-start = slice-start(p1), slice={[4:8], [0:8]}
+//   ...
+//   slice-start.1 = slice-start(p1), slice={[0:4], [0:8]}
+//   ...
+//   c = f32[8,8]{1,0:S(1)} tanh(b)
+//   slice-done = f32[4,8]{1,0:S(1)} slice-done(slice-start)
+//   slice-done.1 = f32[4,8]{1,0:S(1)} slice-done(slice-start.1)
+//   custom-call = f32[8,8]{1,0:S(1)} custom-call(slice-done, slice-done.1),
+//                                    custom_call_target="ConcatBitcast"
+//   r = f32[8,8]{1,0:S(1)} add(c, custom-call)
+//   n = f32[8,8]{1,0:S(1)} negate(custom-call)
+// ROOT f = f32[8,8]{1,0} add(r, n)
+// }
+//
+TEST_F(SlicedPrefetchTest, SlicedPrefetchHeapSimulatorTrace) {
+  std::string hlo_text = R"zz(
+HloModule Slice, is_scheduled=true
+
+ENTRY main {
+  p0 = f32[8,8] parameter(0)
+  p1 = f32[8,8] parameter(1)
+
+  a = f32[8,8] tanh(p0)
+  b = f32[8,8] tanh(a)
+  c = f32[8,8] tanh(b)
+
+  r = f32[8,8] add(c, p1)
+  n = f32[8,8] negate(p1)
+  ROOT f = f32[8,8] add(r, n)
+})zz";
+
+  SetupProposeSlicesToExpect2SlicesOfF32x8x8();
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text));
+  VLOG(1) << "Original module:\n"
+          << module->ToString(HloPrintOptions::ShortParsable());
+
+  std::unique_ptr<PresetAssignments> assignments = AssignMemorySpace(
+      module.get(), MakeDefaultOptions(),
+      /*max_prefetch_interval=*/10, /*min_prefetch_interval=*/1);
+
+  VLOG(1) << "Post-MSA module:\n"
+          << module->ToString(HloPrintOptions::ShortParsable());
+
+  auto* r_instr = FindInstruction(module.get(), "r");
+  EXPECT_NE(r_instr, nullptr);
+  auto* n_instr = FindInstruction(module.get(), "n");
+  EXPECT_NE(n_instr, nullptr);
+
+  // Expect p1 to be copied via a sliced prefetch for use in r and n.
+  EXPECT_THAT(
+      r_instr,
+      op::Add(_, IsAsyncSlicedCopy(kAlternateMemorySpace, kDefaultMemorySpace,
+                                   {{{0, 4}, {0, 8}}, {{4, 8}, {0, 8}}},
+                                   op::Parameter(1))));
+  EXPECT_THAT(n_instr, op::Negate(r_instr->operand(1)));
+
+  // Check the instruction schedule.
+  TF_EXPECT_OK(
+      CheckSchedule(*module, r_instr->operand(1),
+                    /*slices_start_after_instruction_name=*/"p1",
+                    /*slices_done_before_instruction_name=*/"r",
+                    /*expect_slices_started_at_different_times=*/true));
+
+  // Check expectations on the chunks assigned to the asynchronous sliced copy.
+  TF_EXPECT_OK(CheckSliceChunks(*assignments, r_instr->operand(1)));
+
+  const HeapSimulatorTrace& heap_trace =
+      assignments->assignment_information_for_space(kAlternateMemorySpace)
+          ->heap_simulator_trace;
+  // Track the set of instructions currently living in the alternate memory
+  // space.
+  // - ALLOC event: Instruction should not be in the set. Add it.
+  // - FREE event: Instruction should be in the set. Remove it.
+  // - Concat bitcast instruction: Slice operands should remain in the set until
+  //   all concat bitcast users are allocated.
+  absl::flat_hash_set<const HloInstruction*> allocated_instructions;
+  for (const auto& event : heap_trace.events()) {
+    VLOG(3) << "event: " << event.DebugString();
+    const HloInstruction* instruction =
+        FindInstruction(module.get(), event.instruction_name());
+    EXPECT_NE(instruction, nullptr)
+        << "Instruction not found: " << event.instruction_name();
+    if (instruction->opcode() == HloOpcode::kCustomCall) {
+      EXPECT_NE(instruction->custom_call_target(),
+                memory_space_assignment::kConcatBitcastCustomCall)
+          << "We do not expect concat bitcast custom call to add any "
+             "independent events to the heap trace.";
+    }
+    if (instruction->opcode() == HloOpcode::kSlice) {
+      EXPECT_TRUE(event.kind() == HeapSimulatorTrace::Event::ALLOC ||
+                  event.kind() == HeapSimulatorTrace::Event::FREE);
+    }
+    if (event.kind() == HeapSimulatorTrace::Event::ALLOC) {
+      allocated_instructions.insert(instruction);
+    } else if (event.kind() == HeapSimulatorTrace::Event::FREE) {
+      EXPECT_TRUE(allocated_instructions.contains(instruction))
+          << "FREE is called on slice instruction before its ALLOC or is its "
+             "being called more than once on the same slice buffer.";
+      allocated_instructions.erase(instruction);
+    } else {
+      FAIL() << "Unexpected event kind: " << event.kind()
+             << " for instruction: " << event.instruction_name();
+    }
+    // At the time we allocate the r and n instructions, we should still have
+    // valid allocations for both slices of the concatbitcast operands, because
+    // the concatbitcast should share that buffer with its users, i.e. r and n
+    // instructions.
+    if ((instruction == r_instr || instruction == n_instr) &&
+        event.kind() == HeapSimulatorTrace::Event::ALLOC) {
+      int slice_count = absl::c_count_if(
+          allocated_instructions, [](const HloInstruction* inst) {
+            return inst->opcode() == HloOpcode::kSlice;
+          });
+      EXPECT_EQ(slice_count, 2)
+          << "Did not find enough valid allocations for both slice buffers in "
+             "the trace at the time of allocation for r or n instructions.";
+    }
+  }
+}
+
 TEST_F(SlicedPrefetchTest, TwoSlicesWithCopyReplacement) {
   std::string hlo_text = R"zz(
 HloModule Slice, is_scheduled=true

From 91b16be2f078084c9657aba0aa8c6ec332ea5387 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 11 Dec 2025 23:39:01 -0800
Subject: [PATCH 215/753] [stream_executor] Create NCCL memory allocator for
 collective memory on CUDA

It is a layering violation to depend from SE to XLA:GPU collectives. All memory allocations should be done via correct se::MemoryAllocator instances. Prepare for removing memory allocation APIs from GPU collectives.

PiperOrigin-RevId: 843544612
---
 .../xla/xla/stream_executor/cuda/BUILD        |  26 +++++
 .../cuda/nccl_memory_allocator.cc             | 108 ++++++++++++++++++
 .../cuda/nccl_memory_allocator.h              |  43 +++++++
 3 files changed, 177 insertions(+)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.h

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 1342a13898784c..dbc9b40220d4f9 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -928,6 +928,32 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "nccl_memory_allocator",
+    srcs = ["nccl_memory_allocator.cc"],
+    hdrs = ["nccl_memory_allocator.h"],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        "//xla:util",
+        "//xla/stream_executor:activate_context",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:memory_allocation",
+        "//xla/stream_executor:memory_allocator",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/cuda:nccl",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
+        "@local_tsl//tsl/platform:numbers",
+    ],
+)
+
 cc_library(
     name = "nvjitlink_support",
     srcs = ["nvjitlink_support.cc"],
diff --git a/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc b/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc
new file mode 100644
index 00000000000000..38d8d5508acffb
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc
@@ -0,0 +1,108 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/cuda/nccl_memory_allocator.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_format.h"
+#include "third_party/nccl/nccl.h"
+#include "xla/stream_executor/activate_context.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/util.h"
+#include "tsl/platform/numbers.h"
+
+namespace stream_executor::gpu {
+namespace {
+
+absl::StatusOr<void*> NcclAllocate(StreamExecutor* executor, uint64_t size) {
+  std::unique_ptr<ActivateContext> activate = executor->Activate();
+
+  void* ptr = nullptr;
+  ncclResult_t res = ncclMemAlloc(&ptr, size);
+  if (res != ncclSuccess) {
+    return absl::InternalError(absl::StrFormat(
+        "Failed to allocate %s (%llu bytes) from NCCL: %s. Last "
+        "NCCL warning(error) log entry (may be unrelated): %s",
+        tsl::strings::HumanReadableNumBytes(size), size,
+        ncclGetErrorString(res), ncclGetLastError(nullptr)));
+  }
+  XLA_VLOG_DEVICE(2, executor->device_ordinal())
+      << "Allocated memory " << ptr << " of " << size << " bytes from NCCL";
+  return ptr;
+}
+
+absl::Status NcclFree(StreamExecutor* executor, void* ptr, uint64_t size) {
+  std::unique_ptr<ActivateContext> activate = executor->Activate();
+
+  ncclResult_t res = ncclMemFree(ptr);
+  if (res != ncclSuccess) {
+    return absl::InternalError(absl::StrFormat(
+        "Failed to free NCCL memory at %p; result: %s. Last "
+        "NCCL warning(error) log entry (may be unrelated): %s",
+        ptr, ncclGetErrorString(res), ncclGetLastError(nullptr)));
+  }
+
+  XLA_VLOG_DEVICE(2, executor->device_ordinal())
+      << "Freed NCCL memory " << ptr << " of " << size << " bytes";
+  return absl::OkStatus();
+}
+
+// A memory allocated from NCCL on the given executor.
+class NcclMemoryAllocation : public MemoryAllocation {
+ public:
+  NcclMemoryAllocation(StreamExecutor* executor, void* ptr, uint64_t size);
+
+  ~NcclMemoryAllocation() final;
+  DeviceAddressBase address() const final;
+
+ private:
+  StreamExecutor* executor_;
+  void* ptr_;
+  uint64_t size_;
+};
+
+}  // namespace
+
+NcclMemoryAllocation::NcclMemoryAllocation(StreamExecutor* executor, void* ptr,
+                                           uint64_t size)
+    : executor_(executor), ptr_(ptr), size_(size) {}
+
+NcclMemoryAllocation::~NcclMemoryAllocation() {
+  CHECK_OK(NcclFree(executor_, ptr_, size_));  // Crash OK
+}
+
+DeviceAddressBase NcclMemoryAllocation::address() const {
+  return DeviceAddressBase(ptr_, size_);
+}
+
+NcclMemoryAllocator::NcclMemoryAllocator(StreamExecutor* executor)
+    : executor_(executor) {}
+
+absl::StatusOr<std::unique_ptr<MemoryAllocation>> NcclMemoryAllocator::Allocate(
+    uint64_t size) {
+  TF_ASSIGN_OR_RETURN(void* ptr, NcclAllocate(executor_, size));
+  return std::make_unique<NcclMemoryAllocation>(executor_, ptr, size);
+}
+
+}  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.h b/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.h
new file mode 100644
index 00000000000000..0c678d0c569e57
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.h
@@ -0,0 +1,43 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_CUDA_NCCL_MEMORY_ALLOCATOR_H_
+#define XLA_STREAM_EXECUTOR_CUDA_NCCL_MEMORY_ALLOCATOR_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "absl/status/statusor.h"
+#include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/stream_executor.h"
+
+namespace stream_executor::gpu {
+
+// A memory allocator that uses NCCL to allocate memory.
+class NcclMemoryAllocator : public MemoryAllocator {
+ public:
+  explicit NcclMemoryAllocator(StreamExecutor* executor);
+
+  absl::StatusOr<std::unique_ptr<MemoryAllocation>> Allocate(
+      uint64_t size) final;
+
+ private:
+  StreamExecutor* executor_;
+};
+
+}  // namespace stream_executor::gpu
+
+#endif  // XLA_STREAM_EXECUTOR_CUDA_NCCL_MEMORY_ALLOCATOR_H_

From 30c336d8474626a6d1cecbd7a4e40558161ab416 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 12 Dec 2025 01:04:37 -0800
Subject: [PATCH 216/753] compat: Update forward compatibility horizon to
 2025-12-12

PiperOrigin-RevId: 843570421
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 638340af389d5e..5847142d8ab4d5 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 11)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 12)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 69fdd2b7c70407a2b82c53ee6784b871a678f08d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 12 Dec 2025 01:04:53 -0800
Subject: [PATCH 217/753] Update GraphDef version to 2439.

PiperOrigin-RevId: 843570515
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 58dbf3272f4164..9476cab5497275 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2438  // Updated: 2025/12/11
+#define TF_GRAPH_DEF_VERSION 2439  // Updated: 2025/12/12
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From a13231ac06b24ecf058a5898c07d3ed553ceadca Mon Sep 17 00:00:00 2001
From: Mohammed Anany <manany@google.com>
Date: Fri, 12 Dec 2025 02:51:47 -0800
Subject: [PATCH 218/753] [XLA:GPU/TMA] Extend all configurations to include
 their TMA counterpart during exhaustive search instead of using the
 recommended configurations only.

PiperOrigin-RevId: 843603850
---
 third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc b/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc
index 4446900f5569a6..c7da8c617c1199 100644
--- a/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/dot_search_space.cc
@@ -638,7 +638,7 @@ void TritonDotFusionSearchSpace::AddTmaParameter(
   new_config.config.is_tma_allowed = false;
   updated_configs.push_back(new_config);
 
-  if (IsTmaRecommended(config.config)) {
+  if (exhaustive_tiling_search_ || IsTmaRecommended(config.config)) {
     new_config.config.is_tma_allowed = true;
     updated_configs.push_back(new_config);
   }

From 7bb18b14ae07287896747b429eda92ec0c026997 Mon Sep 17 00:00:00 2001
From: Marcin Radomski <dextero@google.com>
Date: Fri, 12 Dec 2025 03:04:44 -0800
Subject: [PATCH 219/753] [XLA] Fix ASSERT_OK_AND_ASSIGN macro in patch

PiperOrigin-RevId: 843607123
---
 ...dd-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch b/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch
index 5b16fd63bb94a2..501d22d00cb301 100644
--- a/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch
+++ b/third_party/xla/third_party/googletest/0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch
@@ -1,4 +1,4 @@
-From 925be4390f717899f3e825abe6e9a0548f0630e4 Mon Sep 17 00:00:00 2001
+From 5c2d2d62a71fe19c92c6f807d533c0ea90c15f03 Mon Sep 17 00:00:00 2001
 From: Marcin Radomski <dextero@google.com>
 Date: Thu, 4 Dec 2025 15:37:45 +0000
 Subject: [PATCH 1/2] Add ASSERT_OK/EXPECT_OK/ASSERT_OK_AND_ASSIGN macros
@@ -37,7 +37,7 @@ index c78fb8ee..69b33572 100644
  #endif  // GOOGLEMOCK_INCLUDE_GMOCK_GMOCK_H_
 diff --git a/googlemock/include/gmock/internal/xla-gmock-macros.h b/googlemock/include/gmock/internal/xla-gmock-macros.h
 new file mode 100644
-index 00000000..fd48a21a
+index 00000000..b851bcca
 --- /dev/null
 +++ b/googlemock/include/gmock/internal/xla-gmock-macros.h
 @@ -0,0 +1,118 @@
@@ -94,7 +94,7 @@ index 00000000..fd48a21a
 +  ASSERT_THAT(expression, ::xla_testing::internal::IsOk())
 +
 +#define ASSERT_OK_AND_ASSIGN(lhs, rexpr)                            \
-+  TF_ASSERT_OK_AND_ASSIGN_IMPL(                                     \
++  ASSERT_OK_AND_ASSIGN_IMPL(                                     \
 +      XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \
 +      lhs, rexpr);
 +

From dd2f53c829df4bd70079b9ef619dd3bc4b60e6a4 Mon Sep 17 00:00:00 2001
From: Mikhail Goncharov <goncharov@google.com>
Date: Fri, 12 Dec 2025 04:22:27 -0800
Subject: [PATCH 220/753] [XLA:GPU] move hoisting of bitcasts to a separate
 pass

PiperOrigin-RevId: 843628242
---
 .../xla/xla/backends/gpu/autotuner/BUILD      |    3 +-
 .../xla/xla/backends/gpu/autotuner/triton.cc  |    5 +-
 .../xla/xla/backends/gpu/codegen/triton/BUILD |    2 +-
 .../codegen/triton/triton_gemm_fusion_test.cc |    3 +-
 third_party/xla/xla/service/gpu/BUILD         |    1 +
 .../xla/xla/service/gpu/autotuning/BUILD      |    1 +
 .../gpu/autotuning/gemm_fusion_autotuner.cc   |    4 +-
 .../xla/xla/service/gpu/gpu_compiler.cc       |    6 +-
 .../xla/xla/service/gpu/gpu_compiler_test.cc  |    8 +-
 .../xla/xla/service/gpu/transforms/BUILD      |   62 +
 .../gpu/transforms/hoist_fused_bitcasts.cc    |  961 ++++++++++++
 .../gpu/transforms/hoist_fused_bitcasts.h     |   66 +
 .../transforms/hoist_fused_bitcasts_test.cc   | 1314 ++++++++++++++++
 .../gpu/transforms/nest_gemm_fusion.cc        |  814 ----------
 .../service/gpu/transforms/nest_gemm_fusion.h |   15 -
 .../gpu/transforms/nest_gemm_fusion_test.cc   | 1318 +----------------
 16 files changed, 2465 insertions(+), 2118 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
 create mode 100644 third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.h
 create mode 100644 third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc

diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
index 5b4eea2a700a9a..c626469d5be7b3 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -427,7 +427,6 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
-        "//xla/backends/gpu/codegen/triton:tma_utils",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/transforms/simplifiers:float_normalization",
         "//xla/hlo/utils:hlo_query",
@@ -442,12 +441,12 @@ cc_library(
         "//xla/service/gpu/autotuning:dot_search_space",
         "//xla/service/gpu/autotuning:triton_configs",
         "//xla/service/gpu/transforms:fusion_wrapper",
+        "//xla/service/gpu/transforms:hoist_fused_bitcasts",
         "//xla/service/gpu/transforms:nest_gemm_fusion",
         "//xla/service/gpu/transforms:priority_fusion",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/stream_executor/gpu:tma_metadata",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
diff --git a/third_party/xla/xla/backends/gpu/autotuner/triton.cc b/third_party/xla/xla/backends/gpu/autotuner/triton.cc
index 0ee4e09dc60572..71a113e2e4631d 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/triton.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/triton.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "xla/autotuning.pb.h"
 #include "xla/backends/autotuner/codegen_backend.h"
-#include "xla/backends/gpu/codegen/triton/tma_utils.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
@@ -42,12 +41,12 @@ limitations under the License.
 #include "xla/service/gpu/matmul_utils.h"
 #include "xla/service/gpu/split_k_gemm_rewriter.h"
 #include "xla/service/gpu/transforms/fusion_wrapper.h"
+#include "xla/service/gpu/transforms/hoist_fused_bitcasts.h"
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 #include "xla/service/gpu/transforms/priority_fusion.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/device_description.h"
-#include "xla/stream_executor/gpu/tma_metadata.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
@@ -221,7 +220,7 @@ absl::StatusOr<std::unique_ptr<HloModule>> TritonBackend::RunHloPasses(
   // into fusions.
   FusionWrapper fusion_wrapper(gpu_device_info);
   TF_RETURN_IF_ERROR(fusion_wrapper.Run(hlo_module.get()).status());
-
+  TF_RETURN_IF_ERROR(HoistFusedBitcasts().Run(hlo_module.get()).status());
   NestGemmFusion nest_gemm_fusion(gpu_device_info, mlir_context_);
   TF_RETURN_IF_ERROR(nest_gemm_fusion.Run(hlo_module.get()).status());
   return hlo_module;
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index edd7570f1ed64c..faefcc8e14163f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -551,7 +551,6 @@ xla_test(
         "no_mac",
     ],
     deps = [
-        ":fusion_emitter",
         ":test_utils",
         ":xtile_compiler",
         "//xla:autotuning_proto_cc",
@@ -569,6 +568,7 @@ xla_test(
         "//xla/service/gpu:target_constants",
         "//xla/service/gpu/model:block_level_parameters",
         "//xla/service/gpu/tests:gpu_codegen_test",
+        "//xla/service/gpu/transforms:hoist_fused_bitcasts",
         "//xla/service/gpu/transforms:nest_gemm_fusion",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cuda_compute_capability",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
index ad4de9bf2257ac..52e84704781010 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"
 #include "xla/autotuning.pb.h"
-#include "xla/backends/gpu/codegen/triton/fusion_emitter.h"
 #include "xla/backends/gpu/codegen/triton/test_utils.h"
 #include "xla/backends/gpu/codegen/triton/xtile_compiler.h"
 #include "xla/error_spec.h"
@@ -50,6 +49,7 @@ limitations under the License.
 #include "xla/service/gpu/model/block_level_parameters.h"
 #include "xla/service/gpu/target_constants.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/service/gpu/transforms/hoist_fused_bitcasts.h"
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
@@ -112,6 +112,7 @@ class TritonTest : public GpuCodegenTest {
   GetModuleAndNestedFusionMetadata(absl::string_view hlo_text) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<VerifiedHloModule> module,
                         ParseAndReturnVerifiedModule(hlo_text));
+    TF_RETURN_IF_ERROR(HoistFusedBitcasts().Run(module.get()).status());
     TF_ASSIGN_OR_RETURN(
         bool fusion_was_nested,
         NestGemmFusion(device_desc(), &mlir_context_).Run(module.get()));
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index b5bde9888fb0f8..be400454a9386c 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1781,6 +1781,7 @@ cc_library(
         "//xla/service/gpu/transforms:gemm_fusion_swap_operands",
         "//xla/service/gpu/transforms:gemm_rewriter",
         "//xla/service/gpu/transforms:gemv_rewriter",
+        "//xla/service/gpu/transforms:hoist_fused_bitcasts",
         "//xla/service/gpu/transforms:layout_assignment",
         "//xla/service/gpu/transforms:move_copy_to_users",
         "//xla/service/gpu/transforms:nest_gemm_fusion",
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 77b0d2daf56629..544ac7a01b382b 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -185,6 +185,7 @@ cc_library(
         "//xla/service/gpu/transforms:dot_algorithm_rewriter",
         "//xla/service/gpu/transforms:fusion_wrapper",
         "//xla/service/gpu/transforms:gemm_rewriter",
+        "//xla/service/gpu/transforms:hoist_fused_bitcasts",
         "//xla/service/gpu/transforms:nest_gemm_fusion",
         "//xla/service/gpu/transforms:priority_fusion",
         "//xla/service/gpu/transforms:scaled_dot_rewriter",
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
index 548c9281bda625..5828cb9d6b3487 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc
@@ -92,6 +92,7 @@ limitations under the License.
 #include "xla/service/gpu/transforms/dot_algorithm_rewriter.h"
 #include "xla/service/gpu/transforms/fusion_wrapper.h"
 #include "xla/service/gpu/transforms/gemm_rewriter.h"
+#include "xla/service/gpu/transforms/hoist_fused_bitcasts.h"
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 #include "xla/service/gpu/transforms/priority_fusion.h"
 #include "xla/service/gpu/transforms/scaled_dot_rewriter.h"
@@ -106,7 +107,6 @@ limitations under the License.
 #include "xla/stream_executor/device_address_allocator.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/redzone_allocator.h"
-#include "xla/stream_executor/gpu/tma_metadata.h"
 #include "xla/stream_executor/integrations/tf_allocator_adapter.h"
 #include "xla/stream_executor/semantic_version.h"
 #include "xla/stream_executor/stream.h"
@@ -351,6 +351,8 @@ absl::StatusOr<std::unique_ptr<HloModule>> TritonGemmAutotuneExtractor(
     TF_RETURN_IF_ERROR(fusion_wrapper.Run(new_module.get()).status());
   }
 
+  HoistFusedBitcasts hoist_fused_bitcasts;
+  TF_RETURN_IF_ERROR(hoist_fused_bitcasts.Run(new_module.get()).status());
   NestGemmFusion nest_gemm_fusion(gpu_device_info, mlir_context);
   TF_RETURN_IF_ERROR(nest_gemm_fusion.Run(new_module.get()).status());
   return new_module;
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index d8b6a86f155288..e5b83fb8f3b13f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -239,6 +239,7 @@ limitations under the License.
 #include "xla/service/gpu/transforms/gemm_fusion_swap_operands.h"
 #include "xla/service/gpu/transforms/gemm_rewriter.h"
 #include "xla/service/gpu/transforms/gemv_rewriter.h"
+#include "xla/service/gpu/transforms/hoist_fused_bitcasts.h"
 #include "xla/service/gpu/transforms/layout_assignment.h"
 #include "xla/service/gpu/transforms/move_copy_to_users.h"
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
@@ -1757,8 +1758,9 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   // normalized again.
   add_float_normalization(pipeline);
 
-  // Match the location of this pass in `gemm_fusion_autotuner.cc` to make sure
-  // that there is no discrepancy.
+  // GemmFusionAutotuner runs hoist-fused-bitcasts and nest-gemm-fusion,
+  // matching its behavior here.
+  pipeline.AddPass<HoistFusedBitcasts>();
   pipeline.AddPass<NestGemmFusion>(gpu_target_config.device_description,
                                    &mlir_context_);
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 0e204cb3466ef4..a12b23b02ee29a 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -1812,10 +1812,14 @@ TEST_F(PassOrderTest, GemmRewriterRunsAfterDotNormalizer) {
   VerifyNotRunInBetween(pass_range, /*pass_regex=*/"algsimp");
 }
 
-TEST_F(PassOrderTest, NestGemmFusionRunsAfterGemmFusionAutotuner) {
+TEST_F(PassOrderTest, HoistFusedBitcastsRunsAfterGemmFusionAutotuner) {
+  VerifyPassOrder("gemm-fusion-autotuner", "hoist-fused-bitcasts");
+}
+
+TEST_F(PassOrderTest, NestGemmFusionRunsAfterHoistFusedBitcasts) {
   // NestGemmFusion expect to see __triton_gemm custom call with a backend
   // config created by gemm_fusion_autotuner.
-  VerifyPassOrder("gemm-fusion-autotuner", "nest_gemm_fusion");
+  VerifyPassOrder("hoist-fused-bitcasts", "nest_gemm_fusion");
 }
 
 TEST_F(PassOrderTest, TransposeDimensionGrouperRunsBeforeGemmRewriter) {
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 1a95a47a225823..6b98ddb81a56eb 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1843,6 +1843,68 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "hoist_fused_bitcasts",
+    srcs = ["hoist_fused_bitcasts.cc"],
+    hdrs = ["hoist_fused_bitcasts.h"],
+    deps = [
+        "//xla:shape_util",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/pass:hlo_pass",
+        "//xla/hlo/utils:hlo_query",
+        "//xla/service:call_graph",
+        "//xla/service:matmul_indexing_utils",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/service/gpu/model:block_level_parameters",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@llvm-project//llvm:Support",
+    ],
+)
+
+xla_cc_test(
+    name = "hoist_fused_bitcasts_test",
+    srcs = ["hoist_fused_bitcasts_test.cc"],
+    tags = [
+        "nomsan",
+    ],
+    deps = [
+        ":hoist_fused_bitcasts",
+        "//xla:xla_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/testlib:filecheck",
+        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/hlo/testlib:pattern_matcher_gmock",
+        "//xla/hlo/testlib:verified_hlo_module",
+        "//xla/service:pattern_matcher",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:gpu_device_info_for_tests",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor/cuda:cuda_compute_capability",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status:status_matchers",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//mlir:IR",
+    ],
+)
+
 cc_library(
     name = "nest_gemm_fusion",
     srcs = ["nest_gemm_fusion.cc"],
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
new file mode 100644
index 00000000000000..fbc791ee7ad58c
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
@@ -0,0 +1,961 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/hoist_fused_bitcasts.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <optional>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_print_options.h"
+#include "xla/hlo/utils/hlo_query.h"
+#include "xla/layout.h"
+#include "xla/service/call_graph.h"
+#include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/matmul_utils.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/util.h"
+#include "xla/xla.pb.h"
+#include "xla/xla_data.pb.h"
+
+namespace xla::gpu {
+namespace {
+
+// Extracts the TritonGemmConfig from the given fusion's backend config.
+absl::StatusOr<TritonGemmConfig> GetTritonGemmConfig(
+    const HloFusionInstruction& fusion) {
+  TF_ASSIGN_OR_RETURN(auto gpu_config,
+                      fusion.backend_config<GpuBackendConfig>());
+  const FusionBackendConfig& backend_config =
+      gpu_config.fusion_backend_config();
+  if (!backend_config.has_triton_gemm_config()) {
+    return absl::InternalError(
+        "The fusion's backend config doesn't have a triton_gemm_config.");
+  }
+  return TritonGemmConfig::FromProto(backend_config.triton_gemm_config());
+}
+
+using HloInstructionSetVector =
+    llvm::SetVector<HloInstruction*, std::vector<HloInstruction*>,
+                    HloInstructionSet>;
+
+// Returns the set of instructions that are reachable from 'instruction' using
+// the given accessor.
+template <typename T>
+HloInstructionSetVector GetTransitiveInstructionSet(
+    const HloInstruction* instruction, T (HloInstruction::*get)() const) {
+  std::deque<HloInstruction*> worklist;
+  auto append = [&](const auto& instructions) {
+    worklist.insert(worklist.end(), instructions.begin(), instructions.end());
+  };
+  append((instruction->*get)());
+  HloInstructionSetVector result;
+  while (!worklist.empty()) {
+    HloInstruction* front = worklist.front();
+    worklist.pop_front();
+    if (result.insert(front)) {
+      append((front->*get)());
+    }
+  }
+  return result;
+}
+
+// Returns the set of producers reachable from 'instruction' in use-before-def
+// order.
+HloInstructionSetVector GetProducerSet(const HloInstruction* instruction) {
+  return GetTransitiveInstructionSet(instruction, &HloInstruction::operands);
+}
+// Returns the set of consumers reachable from 'instruction' in def-before-use
+// order.
+HloInstructionSetVector GetConsumerSet(const HloInstruction* instruction) {
+  return GetTransitiveInstructionSet(instruction, &HloInstruction::users);
+}
+
+// Verifies that the set of instructions is closed under the given accessor,
+// i.e. that the set of instructions reachable through the given accessor are
+// either in the set itself or the root.
+template <typename T>
+absl::Status VerifyIsClosedInstructionSet(
+    const HloInstructionSetVector& instructions, const HloInstruction* root,
+    T (HloInstruction::*get)() const) {
+  for (HloInstruction* instruction : instructions) {
+    for (HloInstruction* reachable : (instruction->*get)()) {
+      if (reachable != root && instructions.count(reachable) == 0) {
+        return absl::FailedPreconditionError(
+            absl::StrCat("Instruction ", reachable->ToString(),
+                         " is reachable from ", instruction->ToString(),
+                         ", which is not in the recursive set of, or ",
+                         root->ToString(), " itself."));
+      }
+    }
+  }
+
+  return absl::OkStatus();
+}
+
+absl::Status VerifyIsClosedProducerSet(
+    const HloInstructionSetVector& instructions, const HloInstruction* root) {
+  return VerifyIsClosedInstructionSet(instructions, root,
+                                      &HloInstruction::users);
+}
+
+// Copies the element type and size from `source` to `destination`.
+void CopyElementType(const Shape& source, Shape* destination) {
+  destination->set_element_type(source.element_type());
+  destination->mutable_layout()->set_element_size_in_bits(
+      source.layout().element_size_in_bits());
+}
+
+llvm::SmallVector<int64_t> GetInversePermutation(
+    absl::Span<const int64_t> permutation) {
+  llvm::SmallVector<int64_t> result(permutation.size());
+  for (int64_t i = 0; i < permutation.size(); ++i) {
+    result[permutation[i]] = i;
+  }
+  return result;
+}
+
+// Applies the backward-mapping 'permutation' to 'values'.
+llvm::SmallVector<int64_t> ApplyPermutation(
+    absl::Span<const int64_t> values, absl::Span<const int64_t> permutation) {
+  llvm::SmallVector<int64_t> result;
+  result.reserve(permutation.size());
+  for (int64_t index : permutation) {
+    result.push_back(values[index]);
+  }
+  return result;
+}
+
+// Returns the dimensions of 'shape' in minor-to-major order.
+llvm::SmallVector<int64_t> GetPhysicalDimensions(const Shape& shape) {
+  return ApplyPermutation(shape.dimensions(), shape.layout().minor_to_major());
+}
+
+// Parameters to rewrite a bitcast(broadcast/transpose) as
+// broadcast/transpose(bitcast) and vice versa.
+struct BitcastParams {
+  Shape new_shape;                      // The bitcast output shape.
+  llvm::SmallVector<int64_t> new_dims;  // The dims of the broadcast/transpose.
+};
+
+// Returns parameters to rewrite a broadcast + bitcast as bitcast + broadcast.
+//
+// Example:
+//
+// broadcast = broadcast(operand)
+// result = result_shape bitcast(broadcast)
+//
+// to
+//
+// bitcast = new_shape bitcast(operand)
+// result = broadcast(bitcast), dimensions={new_dims}.
+//
+// Assumes that:
+// - broadcast does not transpose dimensions (checked by hlo_verifier);
+// - bitcast does not mix operand and broadcast dimensions (checks);
+absl::StatusOr<BitcastParams> CalculateBitcastOfBroadcast(
+    const HloBroadcastInstruction* broadcast, const Shape& result_shape) {
+  const Shape& broadcast_shape = broadcast->shape();
+
+  // Maps broadcast dimension index to whether it's an operand dimension.
+  llvm::SmallVector<bool> is_operand_dim(broadcast_shape.dimensions().size());
+  for (const int64_t index : broadcast->dimensions()) {
+    is_operand_dim[index] = true;
+  }
+
+  // Dimensions of the new broadcast.
+  llvm::SmallVector<int64_t> new_dims;
+  llvm::SmallVector<int64_t> broadcast_physical_dims =
+      GetPhysicalDimensions(broadcast_shape);
+  auto factors = CommonFactors(GetPhysicalDimensions(result_shape),
+                               broadcast_physical_dims);
+  for (int64_t i = 1; i < factors.size(); ++i) {
+    auto [result_from, broadcast_from] = factors[i - 1];
+    auto [result_to, broadcast_to] = factors[i];
+
+    bool all_operands = true, any_operands = false;
+    for (int64_t j = broadcast_from; j < broadcast_to; ++j) {
+      if (broadcast_physical_dims[j] == 1) {
+        // If dimension size is 1 then we can ignore it: it's either immediately
+        // dropped by old reshape or it's coming from the operand and then the
+        // new reshape will handle it.
+        continue;
+      }
+      bool value = is_operand_dim[broadcast_shape.layout().minor_to_major(j)];
+      all_operands &= value;
+      any_operands |= value;
+    }
+    if (!any_operands) {
+      continue;  // All dimensions in this group are broadcast dimensions.
+    }
+    if (!all_operands) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Cannot hoist bitcast across ", broadcast->ToString(),
+                       " as it mixes operand and broadcast dimensions."));
+    }
+
+    for (int64_t j = result_from; j < result_to; ++j) {
+      new_dims.push_back(result_shape.layout().minor_to_major(j));
+    }
+  }
+  absl::c_sort(new_dims);  // Sort into logical order.
+
+  BitcastParams result;
+  CopyElementType(result_shape, &result.new_shape);
+  for (int64_t index : new_dims) {
+    result.new_shape.add_dimensions(result_shape.dimensions(index));
+  }
+  auto* new_layout =
+      result.new_shape.mutable_layout()->mutable_minor_to_major();
+  new_layout->reserve(new_dims.size());
+  for (int64_t index : result_shape.layout().minor_to_major()) {
+    if (auto it = absl::c_lower_bound(new_dims, index);
+        it != new_dims.end() && *it == index) {
+      new_layout->push_back(it - new_dims.begin());
+    }
+  }
+  result.new_dims = std::move(new_dims);
+
+  VLOG(3) << "CalculateBitcastOfBroadcast:";
+  VLOG(3) << "  broadcast = " << broadcast_shape.ToString(true) << " broadcast("
+          << broadcast->operand(0)->shape().ToString(true)
+          << " operand), dimensions="
+          << absl::StrJoin(broadcast->dimensions(), ",");
+  VLOG(3) << "  result    = " << result_shape.ToString(true) << " bitcast("
+          << broadcast_shape.ToString(true) << " broadcast)";
+  VLOG(3) << "--------------------------------";
+  VLOG(3) << "  bitcast   = " << result.new_shape.ToString(true) << " bitcast("
+          << broadcast->operand(0)->shape().ToString(true) << " operand)";
+  VLOG(3) << "  result    = " << result_shape.ToString(true) << " broadcast("
+          << result.new_shape.ToString(true)
+          << " bitcast), dimensions=" << absl::StrJoin(result.new_dims, ",");
+
+  return result;
+}
+
+// Returns parameters to rewrite a bitcast + broadcast as broadcast + bitcast.
+//
+// Example:
+//
+// bitcast = bitcast(operand_shape operand)
+// result = broadcast(bitcast)
+//
+// to
+//
+// broadcast = new_shape broadcast(operand), dimensions={new_dims}.
+// result = bitcast(broadcast)
+//
+// Assumes that:
+// - broadcast does not transpose dimensions (checked by hlo_verifier);
+// - bitcast does not mix operand and broadcast dimensions (checks);
+absl::StatusOr<BitcastParams> CalculateBroadcastOfBitcast(
+    const HloBroadcastInstruction* broadcast, const Shape& operand_shape) {
+  const Shape& bitcast_shape = broadcast->operand(0)->shape();
+  const Shape& result_shape = broadcast->shape();
+
+  // Maps logical result dimension index to a range of physical operand
+  // dimensions, or nullopt if the dimension is broadcasted.
+  llvm::SmallVector<std::optional<std::pair<int64_t, int64_t>>>
+      result_to_operand_range(result_shape.dimensions().size());
+  auto result_inv_layout =
+      GetInversePermutation(result_shape.layout().minor_to_major());
+  auto factors = CommonFactors(GetPhysicalDimensions(bitcast_shape),
+                               GetPhysicalDimensions(operand_shape));
+  for (int64_t i = 1; i < factors.size(); ++i) {
+    auto [bitcast_from, operand_from] = factors[i - 1];
+    auto [bitcast_to, operand_to] = factors[i];
+
+    llvm::SmallVector<int64_t> indices;
+    indices.reserve(bitcast_to - bitcast_from);
+    for (int64_t j = bitcast_from; j < bitcast_to; ++j) {
+      int64_t index =
+          broadcast->dimensions()[bitcast_shape.layout().minor_to_major(j)];
+
+      // Store the entire operand dimension range in the minor-most dimension
+      // index and an empty range in all others.
+      result_to_operand_range[index].emplace(operand_from, operand_to);
+      operand_from = operand_to;
+
+      // Check that the physical result indices form a contiguous range.
+      indices.push_back(result_inv_layout[index]);
+    };
+
+    if (indices.back() - indices.front() >= bitcast_to - bitcast_from ||
+        !absl::c_is_sorted(indices)) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Cannot hoist bitcast across ", broadcast->ToString(),
+                       " because result dimensions are not contiguous."));
+    }
+  }
+
+  BitcastParams result;
+  CopyElementType(operand_shape, &result.new_shape);
+  result.new_dims.resize(operand_shape.dimensions().size());
+  auto* new_layout =
+      result.new_shape.mutable_layout()->mutable_minor_to_major();
+  int64_t new_rank = operand_shape.dimensions().size() +
+                     result_shape.dimensions().size() -
+                     bitcast_shape.dimensions().size();
+  new_layout->reserve(new_rank);
+  llvm::SmallVector<int64_t> new_shape_dims(new_rank);
+
+  // We are free to insert the broadcast dimensions in any order. Insert them
+  // at the end of the the logical dimension order.
+  int64_t broadcast_index = operand_shape.dimensions().size();
+
+  // Iterate through the logical result dimension indices in physical order.
+  for (int64_t result_index : result_shape.layout().minor_to_major()) {
+    if (auto range = result_to_operand_range[result_index]) {
+      // This result dimension corresponds to a group of operand dimensions.
+      // Iterate through the range of physical operand dimension indices.
+      for (int64_t i = range->first; i < range->second; ++i) {
+        int64_t operand_index = operand_shape.layout().minor_to_major(i);
+        int64_t new_index = operand_index;
+        new_shape_dims[new_index] = operand_shape.dimensions(operand_index);
+        new_layout->push_back(new_index);
+        result.new_dims[operand_index] = new_index;
+      }
+    } else {
+      // This is a new dimension introduced by the original broadcast.
+      int64_t new_index = broadcast_index++;
+      new_shape_dims[new_index] = result_shape.dimensions(result_index);
+      new_layout->push_back(new_index);
+    }
+  }
+  absl::c_sort(result.new_dims);  // Sort into logical order.
+  for (int64_t dimension : new_shape_dims) {
+    result.new_shape.add_dimensions(dimension);
+  }
+
+  VLOG(3) << "CalculateBroadcastOfBitcast:";
+  VLOG(3) << "  bitcast   = " << bitcast_shape.ToString(true) << " bitcast("
+          << operand_shape.ToString(true) << " operand)";
+  VLOG(3) << "  result    = " << result_shape.ToString(true) << " broadcast("
+          << bitcast_shape.ToString(true) << " bitcast), dimensions="
+          << absl::StrJoin(broadcast->dimensions(), ",");
+  VLOG(3) << "--------------------------------";
+  VLOG(3) << "  broadcast = " << result.new_shape.ToString(true)
+          << " broadcast(" << operand_shape.ToString(true)
+          << " operand), dimensions=" << absl::StrJoin(result.new_dims, ",");
+  VLOG(3) << "  result    = " << result_shape.ToString(true) << " bitcast("
+          << result.new_shape.ToString(true) << " broadcast)";
+
+  return result;
+}
+
+// Implements CalculateBitcastOfTranspose(), except that result.new_dims is
+// the inverse permutation, mapping the input dimensions to the output
+// dimensions.
+absl::StatusOr<BitcastParams> CalculateBitcastOfTransposeImpl(
+    const HloTransposeInstruction* transpose, const Shape& result_shape,
+    const Shape& transpose_shape, const Shape& operand_shape,
+    absl::Span<const int64_t> transpose_dims) {
+  if (transpose->shape().layout() != transpose->operand(0)->shape().layout()) {
+    return absl::InternalError(
+        absl::StrCat("Expected input and output layouts to be the same for ",
+                     transpose->ToString()));
+  }
+
+  // Maps physical operand dimension index to a range of physical result
+  // dimensions.
+  llvm::SmallVector<std::pair<int64_t, int64_t>> operand_to_result_range(
+      operand_shape.dimensions().size());
+  // Maps logical operand dimension index to the physical dimension index.
+  llvm::SmallVector<int64_t> operand_inv_layout =
+      GetInversePermutation(operand_shape.layout().minor_to_major());
+
+  const absl::InlinedVector<std::pair<int64_t, int64_t>, 8> factors =
+      ::xla::gpu::detail::CommonFactorsMergingTrivialRanges(
+          GetPhysicalDimensions(result_shape),
+          GetPhysicalDimensions(transpose_shape));
+  for (int64_t i = 1; i < factors.size(); ++i) {
+    auto [result_from, transpose_from] = factors[i - 1];
+    auto [result_to, transpose_to] = factors[i];
+
+    llvm::SmallVector<int64_t> indices;
+    indices.reserve(transpose_to - transpose_from);
+    for (int64_t j = transpose_from; j < transpose_to; ++j) {
+      int64_t index = operand_inv_layout
+          [transpose_dims[transpose_shape.layout().minor_to_major(j)]];
+
+      // Store the entire result dimension range in the minor-most dimension
+      // index and an empty range in all others.
+      operand_to_result_range[index] = {result_from, result_to};
+      result_from = result_to;
+
+      // Check that the physical operand indices form a contiguous range.
+      indices.push_back(index);
+    };
+
+    if (indices.empty()) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Cannot hoist bitcast across ", transpose->ToString(),
+                       " because size-1 dims in bitcasts are not yet supported "
+                       "(b/466065483)."));
+    }
+    if (indices.back() - indices.front() >= transpose_to - transpose_from ||
+        !absl::c_is_sorted(indices)) {
+      return absl::InvalidArgumentError(
+          absl::StrCat("Cannot hoist bitcast across ", transpose->ToString(),
+                       " because result dimensions are not contiguous."));
+    }
+  }
+
+  BitcastParams result;
+  CopyElementType(result_shape, &result.new_shape);
+  // Just like the old transpose, the new transpose does not change the
+  // layout.
+  *result.new_shape.mutable_layout() = result_shape.layout();
+  result.new_dims.resize(result_shape.dimensions().size());
+  llvm::SmallVector<int64_t> new_shape_dims(result_shape.dimensions().size());
+  // Iterate through the physical operand and new_shape dimension indices.
+  for (int64_t i = 0, j = 0; i < operand_shape.dimensions().size(); ++i) {
+    auto range = operand_to_result_range[i];
+    // Iterate through corresponding range of physical result dimension
+    // indices.
+    for (int64_t k = range.first; k < range.second; ++k) {
+      int64_t new_index = result_shape.layout().minor_to_major(j++);
+      int64_t result_index = result_shape.layout().minor_to_major(k);
+      new_shape_dims[new_index] = result_shape.dimensions(result_index);
+      result.new_dims[new_index] = result_index;
+    }
+  }
+  for (int64_t dimension : new_shape_dims) {
+    result.new_shape.add_dimensions(dimension);
+  }
+
+  VLOG(3) << "CalculateBitcastOfTransposeImpl:";
+  VLOG(3) << "  transpose = " << transpose_shape.ToString(true) << " transpose("
+          << operand_shape.ToString(true)
+          << " operand), dimensions=" << absl::StrJoin(transpose_dims, ",");
+  VLOG(3) << "  result    = " << result_shape.ToString(true) << " bitcast("
+          << transpose_shape.ToString(true) << " transpose)";
+  VLOG(3) << "--------------------------------";
+  VLOG(3) << "  bitcast   = " << result.new_shape.ToString(true) << " bitcast("
+          << operand_shape.ToString(true) << " operand)";
+  VLOG(3) << "  result    = " << result_shape.ToString(true) << " transpose("
+          << result.new_shape.ToString(true) << " bitcast), dimensions="
+          << absl::StrJoin(GetInversePermutation(result.new_dims), ",");
+
+  return result;
+}
+
+// Returns parameters to rewrite a transpose + bitcast as bitcast + transpose.
+//
+// Example:
+//
+// transpose = transpose(operand)
+// result = result_shape bitcast(transpose)
+//
+// to
+//
+// bitcast = new_shape bitcast(operand)
+// result = transpose(bitcast), dimensions={new_dims}.
+//
+// Assumes that:
+// - bitcast only mixes contiguous dimensions (checks);
+// - transpose does not change layout (checks);
+absl::StatusOr<BitcastParams> CalculateBitcastOfTranspose(
+    const HloTransposeInstruction* transpose, const Shape& result_shape) {
+  TF_ASSIGN_OR_RETURN(
+      BitcastParams result,
+      CalculateBitcastOfTransposeImpl(
+          transpose, result_shape, transpose->shape(),
+          transpose->operand(0)->shape(), transpose->dimensions()));
+  result.new_dims = GetInversePermutation(result.new_dims);
+  return result;
+}
+
+// Returns parameters to rewrite a bitcast + transpose as transpose + bitcast.
+//
+// Example:
+//
+// bitcast = bitcast(operand_shape operand)
+// result = transpose(bitcast)
+//
+// to
+//
+// transpose = new_shape transpose(operand), dimensions={new_dims}.
+// result = bitcast(transpose)
+//
+// Assumes that:
+// - bitcast only mixes contiguous dimensions (checks);
+// - transpose does not change layout (checks);
+absl::StatusOr<BitcastParams> CalculateTransposeOfBitcast(
+    const HloTransposeInstruction* transpose, const Shape& operand_shape) {
+  return CalculateBitcastOfTransposeImpl(
+      transpose, operand_shape, transpose->operand(0)->shape(),
+      transpose->shape(), GetInversePermutation(transpose->dimensions()));
+}
+
+// Simulates a rewrite of all producers of a given bitcast/reshape, moving the
+// instruction outside of the computation. Returns the new shapes of affected
+// instructions in order of traversal from consumers to producers.
+absl::StatusOr<std::vector<std::pair<HloInstruction*, Shape>>>
+PlanHoistBitcastUpwardsToCallers(const HloInstruction* bitcast) {
+  // Check that all producers only affect the bitcast. If there are any
+  // other consumers: refuse the hoisting.
+  // It is possible to support more cases by sinking the bitcast from such
+  // producers downward.
+  HloInstructionSetVector producers = GetProducerSet(bitcast);
+  TF_RETURN_IF_ERROR(VerifyIsClosedProducerSet(producers, bitcast));
+  if (bitcast->shape().element_type() !=
+      bitcast->operand(0)->shape().element_type()) {
+    return absl::UnimplementedError(
+        absl::StrCat("Hoisting bitcast with type conversion is not supported: ",
+                     bitcast->ToString()));
+  }
+
+  HloInstructionMap<Shape> result_shapes;
+  auto set_result_shape =
+      [&](const absl::Span<HloInstruction* const> instructions,
+          const Shape& shape) -> absl::Status {
+    for (HloInstruction* instruction : instructions) {
+      // Only update the dimensions keeping the type intact.
+      Shape new_shape(shape);
+      CopyElementType(instruction->shape(), &new_shape);
+      CHECK_EQ(ShapeUtil::ArrayDataSize(new_shape),
+               ShapeUtil::ArrayDataSize(instruction->shape()))
+          << " instruction " << instruction->ToString()
+          << " updating result shape from "
+          << ShapeUtil::HumanStringWithLayout(instruction->shape()) << " to "
+          << ShapeUtil::HumanStringWithLayout(new_shape)
+          << " with different data size";
+      auto it = result_shapes.find(instruction);
+      if (it == result_shapes.end()) {
+        VLOG(2) << "updating the result shape of " << instruction->ToString()
+                << " to " << ShapeUtil::HumanStringWithLayout(new_shape);
+        result_shapes.emplace(instruction, new_shape);
+      } else if (it->second != new_shape) {
+        return absl::FailedPreconditionError(absl::StrCat(
+            "Conflicting shape assignment for ", instruction->ToString(),
+            " got ", ShapeUtil::HumanStringWithLayout(it->second), " and ",
+            ShapeUtil::HumanStringWithLayout(shape)));
+      }
+    }
+    return absl::OkStatus();
+  };
+  TF_RETURN_IF_ERROR(set_result_shape(bitcast->operands(), bitcast->shape()));
+
+  std::vector<std::pair<HloInstruction*, Shape>> result;
+  // We want to visit instructions in order from consumers to producers: we
+  // hoist the bitcast upwards and having a valid HLO at every rewrite step
+  // helps a lot. A simple DFS or BFS over operands will not work in non-tree
+  // situations when there are multiple consumers of the same producer. Instead
+  // of writing a custom traversal we can simply walk the post-order (producers
+  // before consumers) list backward and only update the instructions affected.
+  // TODO(b/393299275): use MakeInstructionPostOrderFrom(bitcast) - that should
+  // be slightly more efficient.
+  auto def_before_use = bitcast->parent()->MakeInstructionPostOrder();
+  for (HloInstruction* instruction :
+       llvm::make_range(def_before_use.rbegin(), def_before_use.rend())) {
+    auto it = result_shapes.find(instruction);
+    if (it == result_shapes.end()) {
+      continue;  // Not affected.
+    }
+    Shape& result_shape = it->second;
+    if (instruction->shape() == result_shape) {
+      continue;  // No change.
+    }
+    result.emplace_back(instruction, result_shape);
+    switch (instruction->opcode()) {
+      case HloOpcode::kParameter:
+      case HloOpcode::kConstant:
+        // No operands.
+        break;
+      case HloOpcode::kReshape:  // Reshape is a bitcast.
+      case HloOpcode::kBitcast:
+        // Other bitcast will be hoisted separately so we don't need to
+        // update its operand.
+        break;
+      case HloOpcode::kBroadcast: {
+        TF_ASSIGN_OR_RETURN(
+            BitcastParams params,
+            CalculateBitcastOfBroadcast(
+                Cast<HloBroadcastInstruction>(instruction), result_shape));
+        TF_RETURN_IF_ERROR(
+            set_result_shape(instruction->operands(), params.new_shape));
+        break;
+      }
+      case HloOpcode::kTranspose: {
+        TF_ASSIGN_OR_RETURN(
+            BitcastParams params,
+            CalculateBitcastOfTranspose(
+                Cast<HloTransposeInstruction>(instruction), result_shape));
+        TF_RETURN_IF_ERROR(
+            set_result_shape(instruction->operands(), params.new_shape));
+        break;
+      }
+      default:
+        if (!instruction->IsElementwise()) {
+          return absl::FailedPreconditionError(absl::StrCat(
+              "Cannot hoist bitcast past ", instruction->ToString()));
+        }
+        TF_RETURN_IF_ERROR(
+            set_result_shape(instruction->operands(), result_shape));
+        break;
+    }
+  }
+  return result;
+}
+
+// Returns the shape of the root instruction after hoisting all bitcasts.
+//
+// For example, given:
+//
+// dot = dot_shape dot
+// bitcast = bitcast(dot)
+// ROOT root = transpose(bitcast)
+//
+// Returns root_shape for:
+//
+// dot = dot_shape dot
+// ROOT root = roots_shape transpose(dot)
+//
+absl::StatusOr<Shape> ComputeRootShapeAfterHoistingBitcasts(
+    const HloInstruction* dot) {
+  if (dot->IsRoot()) {
+    return dot->shape();
+  }
+
+  HloInstructionMap<Shape> operand_shapes;
+  auto set_operand_shape =
+      [&](const absl::Span<HloInstruction* const> instructions,
+          const Shape& shape) -> absl::Status {
+    for (HloInstruction* instruction : instructions) {
+      // Only update the dimensions keeping the type intact.
+      Shape new_shape(shape);
+      const HloInstruction* operand = instruction->operand(0);
+      CopyElementType(operand->shape(), &new_shape);
+      CHECK_EQ(ShapeUtil::ArrayDataSize(new_shape),
+               ShapeUtil::ArrayDataSize(operand->shape()))
+          << " instruction " << instruction->ToString()
+          << " updating operand shape from "
+          << ShapeUtil::HumanStringWithLayout(operand->shape()) << " to "
+          << ShapeUtil::HumanStringWithLayout(new_shape)
+          << " with different data size";
+      auto it = operand_shapes.find(instruction);
+      if (it == operand_shapes.end()) {
+        VLOG(2) << "updating the operand shape of "
+                << instruction->ToString(
+                       HloPrintOptions().set_print_operand_shape(true))
+                << " to " << ShapeUtil::HumanStringWithLayout(new_shape);
+        operand_shapes.emplace(instruction, new_shape);
+      } else if (it->second != new_shape) {
+        return absl::FailedPreconditionError(absl::StrCat(
+            "Conflicting shape assignment for ", instruction->ToString(),
+            " got ", ShapeUtil::HumanStringWithLayout(it->second), " and ",
+            ShapeUtil::HumanStringWithLayout(shape)));
+      }
+    }
+    return absl::OkStatus();
+  };
+  TF_RETURN_IF_ERROR(set_operand_shape(dot->users(), dot->shape()));
+
+  for (HloInstruction* instruction : GetConsumerSet(dot)) {
+    auto it = operand_shapes.find(instruction);
+    if (it == operand_shapes.end()) {
+      continue;  // Not affected.
+    }
+    Shape& operand_shape = it->second;
+    TF_ASSIGN_OR_RETURN(Shape result_shape, [&]() -> absl::StatusOr<Shape> {
+      switch (instruction->opcode()) {
+        case HloOpcode::kBroadcast: {
+          TF_ASSIGN_OR_RETURN(
+              BitcastParams params,
+              CalculateBroadcastOfBitcast(
+                  Cast<HloBroadcastInstruction>(instruction), operand_shape));
+          return params.new_shape;
+        }
+        case HloOpcode::kTranspose: {
+          TF_ASSIGN_OR_RETURN(
+              BitcastParams params,
+              CalculateTransposeOfBitcast(
+                  Cast<HloTransposeInstruction>(instruction), operand_shape));
+          return params.new_shape;
+        }
+        default:
+          if (!instruction->IsElementwise()) {
+            return absl::FailedPreconditionError(absl::StrCat(
+                "Cannot hoist bitcast past ", instruction->ToString()));
+          }
+          [[fallthrough]];
+        case HloOpcode::kReshape:  // Reshape is a bitcast.
+        case HloOpcode::kBitcast:
+          return operand_shape;
+      }
+    }());
+    if (instruction->IsRoot()) {
+      CopyElementType(instruction->shape(), &result_shape);
+      return result_shape;
+    }
+    TF_RETURN_IF_ERROR(set_operand_shape(instruction->users(), result_shape));
+  }
+  return absl::InternalError("No root found");
+}
+
+// Hoists the given 'bitcast' upwards out of its computation, to the parent of
+// each caller.
+absl::Status HoistBitcastUpwardsToCallers(HloInstruction* bitcast,
+                                          absl::Span<HloInstruction*> callers) {
+  TF_ASSIGN_OR_RETURN(auto rewrite_plan,
+                      PlanHoistBitcastUpwardsToCallers(bitcast));
+  for (auto [instruction, result_shape] : rewrite_plan) {
+    VLOG(2) << absl::StrCat("rewriting result shape of ",
+                            instruction->ToString(), " to ",
+                            ShapeUtil::HumanStringWithLayout(result_shape));
+    switch (instruction->opcode()) {
+      case HloOpcode::kParameter: {
+        // Create a new bitcast in callers.
+        int64_t number = instruction->parameter_number();
+        for (HloInstruction* caller : callers) {
+          // Create a more generic `bitcast` even if the caller has a
+          // `reshape`.
+          HloInstruction* new_bitcast =
+              caller->AddInstruction(HloInstruction::CreateBitcast(
+                  result_shape, caller->mutable_operand(number)));
+          TF_RETURN_IF_ERROR(
+              caller->ReplaceOperandWithDifferentShape(number, new_bitcast));
+        }
+        break;
+      }
+      case HloOpcode::kBroadcast: {
+        auto* broadcast = Cast<HloBroadcastInstruction>(instruction);
+        auto params = CalculateBitcastOfBroadcast(broadcast, result_shape);
+        // Must be OK, already succeeded in PlanHoistBitcasUpwardsToCallers.
+        QCHECK_OK(params);
+        broadcast->mutable_dimensions()->assign(params->new_dims.begin(),
+                                                params->new_dims.end());
+        break;
+      }
+      case HloOpcode::kTranspose: {
+        auto* transpose = Cast<HloTransposeInstruction>(instruction);
+        auto params = CalculateBitcastOfTranspose(transpose, result_shape);
+        // Must be OK, already succeeded in PlanHoistBitcastUpwardsToCallers.
+        QCHECK_OK(params);
+        transpose->mutable_dimensions()->assign(params->new_dims.begin(),
+                                                params->new_dims.end());
+        break;
+      }
+      default:
+        break;
+    }
+    *instruction->mutable_shape() = result_shape;
+  }
+  TF_RETURN_IF_ERROR(bitcast->ReplaceAllUsesWith(bitcast->mutable_operand(0)));
+  TF_RETURN_IF_ERROR(bitcast->parent()->RemoveInstruction(bitcast));
+  return absl::OkStatus();
+}
+
+// Inserts a bitcast at the root if the root shape is different from the dot
+// shape. The bitcast is chosen so that it cancels out bitcasts and reshapes
+// along the way up to the dot. Updates the callers of the dot to expect the new
+// root shape.
+absl::Status MaybeInsertRootBitcast(HloInstruction* dot,
+                                    absl::Span<HloInstruction*> callers) {
+  TF_ASSIGN_OR_RETURN(Shape root_shape,
+                      ComputeRootShapeAfterHoistingBitcasts(dot));
+
+  HloComputation* computation = dot->parent();
+  HloInstruction* root = computation->root_instruction();
+  if (root->shape() == root_shape) {
+    return absl::OkStatus();
+  }
+
+  // Insert a new bitcast at the root.
+  computation->set_root_instruction(
+      root->AddInstruction(HloInstruction::CreateBitcast(root_shape, root)));
+
+  // Insert new bitcast for each caller's result.
+  for (HloInstruction* caller : callers) {
+    HloInstruction* new_bitcast = caller->AddInstruction(
+        HloInstruction::CreateBitcast(caller->shape(), caller));
+    TF_RETURN_IF_ERROR(caller->ReplaceAllUsesWith(new_bitcast));
+    *caller->mutable_shape() = root_shape;
+  }
+
+  return absl::OkStatus();
+}
+
+// Try hoisting bitcasts and reshapes in the computation away from 'dot' to the
+// callers of the computation. Some bitcasts or reshapes may remain in the
+// computation, because they cannot be hoisted across all ops, e.g. across some
+// transposes and broadcasts. This is not reported as an error.
+absl::Status TryHoistBitcastsInComputationToCallers(HloInstruction* dot,
+                                                    CallGraph* call_graph) {
+  VLOG(2) << "Before hoisting bitcasts: " << dot->parent()->ToString();
+
+  auto callers = call_graph->GetComputationCallers(dot->parent());
+  if (auto status = MaybeInsertRootBitcast(dot, absl::MakeSpan(callers));
+      !status.ok()) {
+    VLOG(2) << "Failed to insert root bitcast: " << status;
+  }
+  VLOG(2) << "After inserting root bitcast: " << dot->parent()->ToString();
+
+  auto def_before_use = dot->parent()->MakeInstructionPostOrder();
+  for (HloInstruction* instruction :
+       llvm::make_range(def_before_use.rbegin(), def_before_use.rend())) {
+    if (!HloPredicateIsOp<HloOpcode::kBitcast, HloOpcode::kReshape>(
+            instruction)) {
+      continue;
+    }
+    VLOG(2) << "Hoisting bitcast upwards " << instruction->ToString();
+    auto status =
+        HoistBitcastUpwardsToCallers(instruction, absl::MakeSpan(callers));
+    if (!status.ok()) {
+      VLOG(2) << "Failed to hoist " << instruction->ToString()
+              << " upwards: " << status;
+    }
+  }
+
+  VLOG(2) << "After hoisting bitcasts: " << dot->parent()->ToString();
+  return absl::OkStatus();
+}
+
+class HoistFusedBitcastsVisitor : public DfsHloRewriteVisitor {
+ public:
+  explicit HoistFusedBitcastsVisitor(CallGraph* call_graph)
+      : call_graph_(call_graph) {}
+
+ private:
+  absl::Status RewriteFusion(HloFusionInstruction* fusion,
+                             CallGraph* call_graph) {
+    HloComputation* computation = fusion->called_computation();
+    HloInstruction* instr =
+        hlo_query::GetFirstInstructionWithOpcode(*computation, HloOpcode::kDot);
+    if (instr == nullptr) {
+      instr = hlo_query::GetFirstInstructionWithOpcode(*computation,
+                                                       HloOpcode::kScaledDot);
+      if (instr == nullptr) {
+        return absl::InternalError(absl::StrCat("Computation of fusion ",
+                                                fusion->ToString(),
+                                                " has no dot instruction"));
+      }
+    }
+
+    TF_RETURN_IF_ERROR(
+        TryHoistBitcastsInComputationToCallers(instr, call_graph));
+    // TODO(b/446827313): don't mark as changed if no changes were made.
+    MarkAsChanged();
+    return absl::OkStatus();
+  }
+
+  absl::Status HandleFusion(HloInstruction* instruction) override {
+    HloFusionInstruction* fusion = Cast<HloFusionInstruction>(instruction);
+
+    // Check if we target this fusion.
+    absl::StatusOr<TritonGemmConfig> config = GetTritonGemmConfig(*fusion);
+    if (!config.ok()) {
+      VLOG(2) << "Skipping fusion as it does not have a TritonGemmConfig";
+      return absl::OkStatus();
+    }
+    HloComputation* computation = fusion->called_computation();
+    HloInstruction* instr =
+        hlo_query::GetFirstInstructionWithOpcode(*computation, HloOpcode::kDot);
+    if (instr == nullptr) {
+      instr = hlo_query::GetFirstInstructionWithOpcode(*computation,
+                                                       HloOpcode::kScaledDot);
+      if (instr == nullptr) {
+        VLOG(2) << "Skipping fusion as it has no dot instruction";
+        return absl::OkStatus();
+      }
+    }
+    return RewriteFusion(fusion, call_graph_);
+  }
+
+ private:
+  CallGraph* call_graph_;
+};
+
+}  // namespace
+
+absl::StatusOr<bool> HoistFusedBitcasts::RunOnModule(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  bool changed = false;
+  auto call_graph = CallGraph::Build(module, execution_threads);
+  for (HloComputation* computation :
+       module->MakeNonfusionComputations(execution_threads)) {
+    HoistFusedBitcastsVisitor visitor(call_graph.get());
+    TF_RETURN_IF_ERROR(computation->Accept(&visitor));
+    changed |= visitor.changed();
+  }
+  return changed;
+}
+
+absl::StatusOr<bool> HoistFusedBitcasts::RunImpl(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  return RunOnModule(module, execution_threads);
+}
+
+namespace detail {
+
+absl::InlinedVector<std::pair<int64_t, int64_t>, 8>
+CommonFactorsMergingTrivialRanges(absl::Span<const int64_t> a,
+                                  absl::Span<const int64_t> b) {
+  // CommonFactors does what we need but it also creates empty groups with
+  // product of 1, e.g. `[1] -> []` or `[] -> [1]`. We remove the bounds of
+  // such ranges to merge them with neighbors. There are many different ways
+  // to do this, here we continously append ranges to the start of the next
+  // group unless it is the very last range.
+  absl::InlinedVector<std::pair<int64_t, int64_t>, 8> bounds =
+      CommonFactors(a, b);
+  for (size_t i = 0; i + 1 < bounds.size() && bounds.size() > 2;) {
+    auto [a_start, b_start] = bounds[i];
+    auto [a_end, b_end] = bounds[i + 1];
+    if (a_start != a_end && b_start != b_end) {
+      i++;
+      continue;
+    }
+    if (i + 2 == bounds.size()) {
+      // Very last range - append it to the previous one.
+      bounds.erase(bounds.begin() + i);
+    } else {
+      bounds.erase(bounds.begin() + i + 1);
+    }
+  }
+  return bounds;
+}
+
+}  // namespace detail
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.h b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.h
new file mode 100644
index 00000000000000..01d1fb3c367cd9
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.h
@@ -0,0 +1,66 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_TRANSFORMS_HOIST_FUSED_BITCASTS_H_
+#define XLA_SERVICE_GPU_TRANSFORMS_HOIST_FUSED_BITCASTS_H_
+
+#include <cstdint>
+#include <utility>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/pass/hlo_pass_interface.h"
+
+namespace xla::gpu {
+
+// Hoist bitcasts and reshapes in the computation out of "__triton_gemm" fusions
+// with a dot instruction.
+class HoistFusedBitcasts : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "hoist-fused-bitcasts"; }
+
+ protected:
+  absl::StatusOr<bool> RunImpl(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+
+ private:
+  absl::StatusOr<bool> RunOnModule(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads);
+};
+
+namespace detail {
+
+// Returns the start indices of consecutive non-overlapping subsequences of `a`
+// and `b` with the same product (see `CommonFactors` from `util.h`) grouping
+// ranges having product of 1 with neighbors.
+//
+// For example, if a=[2, 5, 1, 3] and b=[1, 10, 3, 1], the result will be
+// {{0, 0}, {2, 2}, {4, 4}}, grouping [2,5] with [1,10] and [1,3] with [3,1].
+absl::InlinedVector<std::pair<int64_t, int64_t>, 8>
+CommonFactorsMergingTrivialRanges(absl::Span<const int64_t> a,
+                                  absl::Span<const int64_t> b);
+
+}  // namespace detail
+
+}  // namespace xla::gpu
+
+#endif  // XLA_SERVICE_GPU_TRANSFORMS_HOIST_FUSED_BITCASTS_H_
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
new file mode 100644
index 00000000000000..e14c6bba099ca5
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
@@ -0,0 +1,1314 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/hoist_fused_bitcasts.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/container/inlined_vector.h"
+#include "absl/log/log.h"
+#include "absl/status/status_matchers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
+#include "absl/strings/substitute.h"
+#include "mlir/IR/MLIRContext.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_print_options.h"
+#include "xla/hlo/testlib/filecheck.h"
+#include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
+#include "xla/hlo/testlib/verified_hlo_module.h"
+#include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/gpu_device_info_for_tests.h"
+#include "xla/stream_executor/cuda/cuda_compute_capability.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/xla.pb.h"
+
+using ::absl_testing::IsOkAndHolds;
+
+namespace xla {
+
+namespace gpu {
+namespace {
+
+// Wraps a matcher for a fusion instruction's output tile sizes.
+// Proto matchers would be nice, but b/229726259 is P2.
+MATCHER_P(OutputTileSizesIs, matcher, "") {
+  auto backend_config = arg.template backend_config<GpuBackendConfig>();
+  if (!backend_config.ok()) {
+    *result_listener << "failed to get backend config: "
+                     << backend_config.status();
+    return false;
+  }
+  FusionBackendConfig fusion_backend_config =
+      backend_config->fusion_backend_config();
+  if (!fusion_backend_config.has_block_level_fusion_config()) {
+    *result_listener << "has no block level fusion config";
+    return false;
+  }
+  if (fusion_backend_config.kind() != "__triton_nested_gemm_fusion") {
+    *result_listener << "fusion kind is not __triton_nested_gemm_fusion";
+    return false;
+  }
+  auto output_tile_sizes =
+      fusion_backend_config.block_level_fusion_config().output_tiles(0).sizes();
+  return ExplainMatchResult(matcher, output_tile_sizes, result_listener);
+}
+
+class HoistFusedBitcastsReshapeTest
+    : public HloHardwareIndependentTestBase,
+      public ::testing::WithParamInterface<HloOpcode> {
+ protected:
+  const se::DeviceDescription device_description_{
+      TestGpuDeviceInfo::RTXA6000DeviceInfo(
+          se::GpuComputeCapability{se::CudaComputeCapability::Ampere()})};
+  mlir::MLIRContext mlir_context_;
+
+  std::unique_ptr<VerifiedHloModule> RunHoistFusedBitcasts(
+      absl::string_view hlo, const bool expect_change = true) {
+    std::unique_ptr<VerifiedHloModule> module =
+        ParseAndReturnVerifiedModule(hlo).value();
+    EXPECT_THAT(HoistFusedBitcasts().Run(module.get()),
+                IsOkAndHolds(expect_change));
+    EXPECT_OK(verifier().Run(module.get()).status());
+    return module;
+  }
+};
+
+// Tests hoisting of bitcasts which would otherwise trigger unsatisfiable
+// constraints during symbolic tile analysis.
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedOutOfGemmFusions) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  lhs = f32[21] parameter(0)
+  bitcast = f32[3,7]{0,1} $0(lhs)
+  rhs = f32[7,11] parameter(1)
+  ROOT dot = f32[3,11] dot(bitcast, rhs),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY entry {
+  p0 = f32[21] parameter(0)
+  p1 = f32[7,11] parameter(1)
+  ROOT fusion = f32[3,11] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config": {
+        "kind":"__triton_gemm",  "triton_gemm_config": {
+          "block_m":"32", "block_n":"64", "block_k":"16",
+          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
+        }
+      }
+    }
+}
+)";
+
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: dot {
+CHECK-NEXT: [[lhs:[^ ]+]] = f32[3,7]{0,1} parameter(0)
+CHECK-NEXT: [[rhs:[^ ]+]] = f32[7,11]{1,0} parameter(1)
+CHECK-NEXT: ROOT {{.*}} = f32[3,11]{1,0} dot([[lhs]], [[rhs]]), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+CHECK-NEXT: }
+CHECK: ENTRY
+CHECK: bitcast
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsCanBeHoistedPastOtherBitcasts) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  lhs = f32[3,7] parameter(0)
+  bitcast0 = f32[21] $0(lhs)
+  bitcast1 = f32[3,7] $0(bitcast0)
+  rhs = f32[7,11] parameter(1)
+  ROOT dot = f32[3,11] dot(bitcast1, rhs),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY entry {
+  p0 = f32[3, 7] parameter(0)
+  p1 = f32[7,11] parameter(1)
+  ROOT fusion = f32[3,11] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config": {
+        "kind":"__triton_gemm",  "triton_gemm_config": {
+          "block_m":"32", "block_n":"64", "block_k":"16",
+          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
+        }
+      }
+    }
+}
+)";
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsCanBeHoistedPastElementwiseEpilogues) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  lhs = f32[3,7] parameter(0)
+  rhs = f32[7,11] parameter(1)
+  dot = f32[3,11] dot(lhs, rhs),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  bitcast = f32[33] $0(dot)
+  ROOT add = f32[33] add(bitcast, bitcast)
+}
+
+ENTRY entry {
+  p0 = f32[3, 7] parameter(0)
+  p1 = f32[7,11] parameter(1)
+  ROOT fusion = f32[33] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config": {
+        "kind":"__triton_gemm",  "triton_gemm_config": {
+          "block_m":"32", "block_n":"64", "block_k":"16",
+          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
+        }
+      }
+    }
+})";
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsCanBeHoistedPastConvertEpilogues) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  lhs = f32[3,7] parameter(0)
+  rhs = f32[7,11] parameter(1)
+  dot = f32[3,11] dot(lhs, rhs),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  bitcast = f32[33] $0(dot)
+  ROOT convert = f16[33] convert(bitcast)
+}
+
+ENTRY entry {
+  p0 = f32[3, 7] parameter(0)
+  p1 = f32[7,11] parameter(1)
+  ROOT fusion = f16[33] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config": {
+        "kind":"__triton_gemm",  "triton_gemm_config": {
+          "block_m":"32", "block_n":"64", "block_k":"16",
+          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
+        }
+      }
+    }
+})";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: f16[3,11]{1,0} convert(
+CHECK: f16[3,11]{1,0} fusion(
+)"),
+      IsOkAndHolds(true));
+}
+
+// We cannot hoist bitcasts past transposes, but we don't need to hoist
+// because the bitcast is not rank-expanding and symbolic tile analysis
+// works fine.
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsCannotBeHoistedPastTransposes) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  p0 = f32[72,36,2] parameter(0)
+  transpose0 = f32[72,2,36] transpose(p0), dimensions={0,2,1}
+  bitcast0 = f32[144,36] $0(transpose0)
+  p1 = f32[36,3] parameter(1)
+  dot = f32[144,3] dot(bitcast0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  bitcast1 = f32[144,3] $0(dot)
+  ROOT transpose1 = f32[3,144] transpose(bitcast1), dimensions={1,0}
+}
+
+ENTRY entry {
+  p0 = f32[72,36,2] parameter(0)
+  p1 = f32[36,3] parameter(1)
+  ROOT fusion = f32[3,144] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config":{
+        "kind":"__triton_gemm","triton_gemm_config":{
+          "block_m":"128","block_n":"16","block_k":"32",
+          "split_k":"1","num_stages":"4","num_warps":"4","num_ctas":"1"
+        }
+      }
+    }
+})";
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsKeepElementSizeInBits) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  lhs = s8[21]{0:E(4)} parameter(0)
+  c1 = s8[21] convert(lhs)
+  c2 = f32[21] convert(c1)
+  b0 = f32[3,7] $0(c2)
+  rhs = f32[7,11] parameter(1)
+  dot = f32[3,11] dot(b0, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  b1 = f32[33] $0(dot)
+  ROOT c = s8[33]{0:E(4)} convert(b1)
+}
+
+ENTRY entry {
+  p0 = s8[21]{0:E(4)} parameter(0)
+  p1 = f32[7,11] parameter(1)
+  ROOT fusion = s8[33]{0:E(4)} fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config": {
+        "kind":"__triton_gemm",  "triton_gemm_config": {
+          "block_m":"32", "block_n":"64", "block_k":"16",
+          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
+        }
+      }
+    }
+})";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+  CHECK: ENTRY
+  CHECK: {{.*}} = s8[3,7]{1,0:E(4)} bitcast({{.*}})
+  CHECK: [[fusion:[^ ]+]] = s8[3,11]{1,0:E(4)} fusion({{.*}})
+  CHECK: ROOT {{.*}} = s8[33]{0:E(4)} bitcast([[fusion]])
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       TritonFusionEmitterDeviceLegacyTestSample1) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  p0 = f16[1,16,17,3] parameter(0)
+  bitcast0 = f16[16,51] $0(f16[1,16,17,3] p0)
+  p1 = s8[16,17,3] parameter(1)
+  bitcast1 = s8[16,51] $0(s8[16,17,3] p1)
+  convert = f16[16,51] convert(s8[16,51] bitcast1)
+  bitcast2 = f16[51,16]{0,1} $0(f16[16,51] convert)
+  dot = f16[16,16] dot(bitcast0, bitcast2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT bitcast3 = f16[1,16,16] $0(f16[16,16] dot)
+}
+
+ENTRY entry {
+  p0 = f16[1,16,17,3] parameter(0)
+  p1 = s8[16,17,3] parameter(1)
+  ROOT fusion = f16[1,16,16] fusion(f16[1,16,17,3] p0, s8[16,17,3] p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config":{
+        "kind":"__triton_gemm","triton_gemm_config":{
+          "block_m":"16","block_n":"16","block_k":"32",
+          "split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"
+        }
+      }
+    }
+})";
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       TritonFusionEmitterDeviceLegacyTestSample2) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  p0 = pred[3,122,96,12] parameter(0)
+  transpose = pred[3,96,12,122] transpose(p0), dimensions={0,2,3,1}
+  bitcast0 = pred[3456,122] $0(transpose)
+  convert0 = f16[3456,122] convert(bitcast0)
+  p1 = pred[1,5,122] parameter(1)
+  bitcast1 = pred[5,122] $0(p1)
+  convert1 = f16[5,122] convert(bitcast1)
+  bitcast2 = f16[122,5]{0,1} $0(convert1)
+  dot.1 = f16[3456,5] dot(convert0, bitcast2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT bitcast3 = f16[3,96,12,1,5] $0(dot.1)
+}
+
+ENTRY entry_computation {
+  p0 = pred[3,122,96,12] parameter(0)
+  p1 = pred[1,5,122] parameter(1)
+  ROOT gemm_fusion_dot = f16[3,96,12,1,5] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config":{
+        "kind":"__triton_gemm","triton_gemm_config":{
+          "block_m":"4","block_n":"16","block_k":"128",
+          "split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"
+        }
+      }
+    }
+})";
+  // Note: block sizes were 16,16,32, but that now fails to satisfy constraints.
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       TritonFusionEmitterDeviceLegacyTestSample3) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  p0 = f32[1,40] parameter(0)
+  bitcast0 = f32[40] $0(p0)
+  bitcast1 = f32[40,1] $0(bitcast0)
+  p1 = f32[1,40,250000] parameter(1)
+  bitcast2 = f32[40,250000] $0(p1)
+  dot = f32[1,250000] dot(bitcast1, bitcast2), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  bitcast3 = f32[250000] $0(dot)
+  ROOT bitcast4 = f32[1,250000] $0(bitcast3)
+}
+
+ENTRY entry_computation {
+  p0 = f32[1,40] parameter(0)
+  p1 = f32[1,40,250000] parameter(1)
+  ROOT gemm_fusion_dot.2 = f32[1,250000] fusion(p0, p1),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config":{
+        "kind":"__triton_gemm","triton_gemm_config":{
+          "block_m":"16","block_n":"16","block_k":"32",
+          "split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"
+        }
+      }
+    }
+})";
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedPastCompare) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = s32[11,24,128]{2,1,0} parameter(0)
+  p1 = s32[11,24,128]{2,1,0} parameter(1)
+  eq = pred[11,24,128]{2,1,0} compare(p0, p1), direction=EQ
+  eq_reshape = pred[264,128]{1,0} $0(eq)
+  eq_f32 = f32[264,128]{1,0} convert(eq_reshape)
+  p2 = f32[128,8]{1,0} parameter(2)
+  ROOT result = f32[264,8]{1,0} dot(eq_f32, p2),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = s32[11,24, 128]{2,1,0} parameter(0)
+  p1 = s32[11,24,128]{2,1,0} parameter(1)
+  p2 = f32[128,8]{1,0} parameter(2)
+  ROOT result = f32[264,8] fusion(p0, p1, p2), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {
+      "block_m":32,"block_n":16,"block_k":128,
+      "split_k":1,"num_stages":1,"num_warps":4, "num_ctas":1}}}}
+)";
+  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedUpThroughBroadcasts) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[11,1,24,1] parameter(0)
+  p0_broadcast = f32[11,1,24,1,128] broadcast(p0), dimensions={0,1,2,3}
+  p0_reshape = f32[264,128] $0(p0_broadcast)
+
+  p1 = f32[128,8]{1,0} parameter(1)
+  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[11,1,24,1] parameter(0)
+  p1 = f32[128,8] parameter(1)
+  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+// Broadcast fusion:
+CHECK: {{.*}} {
+CHECK-NEXT: [[dot_p0:[^ ]+]] = f32[264]{0} parameter(0)
+CHECK-NEXT: {{.*}} = f32[264,128]{1,0} broadcast([[dot_p0]]), dimensions={0}
+CHECK: ENTRY {{.*}} {
+CHECK: [[entry_p0:[^ ]+]] = f32[11,1,24,1]{3,2,1,0} parameter(0)
+CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsAreHoistedUpThroughBroadcastsWithTrivialDimensions) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[11,24,1] parameter(0)
+  p0_broadcast = f32[11,1,24,1,128] broadcast(p0), dimensions={0,2,3}
+  p0_reshape = f32[264,128] $0(p0_broadcast)
+  p1 = f32[128,8]{1,0} parameter(1)
+  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[11,24,1] parameter(0)
+  p1 = f32[128,8] parameter(1)
+  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+// Broadcast fusion:
+CHECK: {{.*}} {
+CHECK-NEXT: [[dot_p0:[^ ]+]] = f32[264]{0} parameter(0)
+CHECK-NEXT: {{.*}} = f32[264,128]{1,0} broadcast([[dot_p0]]), dimensions={0}
+CHECK: ENTRY {{.*}} {
+CHECK: [[entry_p0:[^ ]+]] = f32[11,24,1]{{.*}} parameter(0)
+CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastOfOperandAndBroadcastDimsIsNotHoistedUp) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[3,4] parameter(0)
+  p1 = f32[64,7]{1,0} parameter(1)
+  broadcast = f32[3,4,16] broadcast(p0), dimensions={0,1}
+  // Bitcast mixes operand and broadcasted dimensions and cannot be hoisted.
+  reshape = f32[3,64] $0(broadcast)
+  ROOT dot = f32[3,7]{1,0} dot(reshape, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[3,4] parameter(0)
+  p1 = f32[64,7] parameter(1)
+  ROOT result = f32[3,7] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  // Cos should not be rewritten as we cannot hoist bitcast.
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           absl::Substitute(R"(
+CHECK:      f32[3,4,16]{2,1,0} broadcast
+CHECK-NEXT: f32[3,64]{1,0} $0
+)",
+                                            HloOpcodeString(opcode))),
+              IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastOfOperandAndBroadcastDimsIsNotHoistedDown) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[6,7] parameter(0)
+  p1 = f32[5,7]{1,0} parameter(1)
+  dot = f32[6,5]{1,0} dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  // Bitcast mixes operand and broadcasted dimensions and cannot be hoisted.
+  reshape = f32[2,3,5] $0(dot)
+  ROOT broadcast = f32[2,4,3,5] broadcast(reshape), dimensions={0,2,3}
+}
+
+ENTRY e {
+  p0 = f32[6,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  ROOT result = f32[2,4,3,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  // Cos should not be rewritten as we cannot hoist bitcast.
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           absl::Substitute(R"(
+CHECK:      f32[2,3,5]{2,1,0} $0
+CHECK-NEXT: f32[2,4,3,5]{3,2,1,0} broadcast
+)",
+                                            HloOpcodeString(opcode))),
+              IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsAreHoistedUpThroughBroadcastDiamonds) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[3,5] parameter(0)
+  b0 = f32[3,5,77,1] broadcast(p0), dimensions={0,1}
+  b1 = f32[3,5,1] broadcast(p0), dimensions={0,1}
+  b2 = f32[3,5,77,1] broadcast(b1), dimensions={0,1,3}
+  sum = add(b0, b2)
+  sum_reshape = f32[15,77] $0(sum)
+  p1 = f32[77,8]{1,0} parameter(1)
+  ROOT result = f32[15,8] dot(sum_reshape, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[3,5] parameter(0)
+  p1 = f32[77,8] parameter(1)
+  ROOT result = f32[15,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: [[p0:[^ ]+]] = f32[15]{0} parameter(0)
+CHECK-DAG: {{.*}} = f32[15,77]{1,0} broadcast([[p0]]), dimensions={0}
+CHECK-DAG: [[br:[^ ]+]] = f32[15]{0} broadcast([[p0]]), dimensions={0}
+CHECK-DAG: {{.*}} = f32[15,77]{1,0} broadcast([[br]]), dimensions={0}
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedOverBroadcasts) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[11,1,24,1] parameter(0)
+  p0_broadcast = f32[11,1,24,1,128,1] broadcast(p0), dimensions={0,1,2,5}
+  p0_reshape = f32[264,128] $0(p0_broadcast)
+
+  p1 = f32[128,8]{1,0} parameter(1)
+  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[11,1,24,1] parameter(0)
+  p1 = f32[128,8] parameter(1)
+  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           R"(
+// Broadcast fusion:
+CHECK: {{.*}} {
+CHECK-NEXT: [[dot_p0:[^ ]+]] = f32[264]{0} parameter(0)
+CHECK-NEXT: {{.*}} = f32[264,128]{1,0} broadcast([[dot_p0]]), dimensions={0}
+CHECK: ENTRY {{.*}} {
+CHECK: [[entry_p0:[^ ]+]] = f32[11,1,24,1]{3,2,1,0} parameter(0)
+CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
+)"),
+
+              IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsLayoutIsPreserved) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+gemm_dot {
+  p0 = pred[3,122,96,12] parameter(0)
+  bitcast0 = pred[3,122,1152] $0(p0)
+  transpose0 = pred[3,1152,122] transpose(bitcast0), dimensions={0,2,1}
+  bitcast2 = pred[3456,122] $0(transpose0)
+  convert0 = f16[3456,122] convert(bitcast2)
+  p1 = pred[1,5,122] parameter(1)
+  bitcast3 = pred[5,122] $0(p1)
+  convert1 = f16[5,122] convert(bitcast3)
+  bitcast4 = f16[122,5]{0,1} $0(convert1)
+  dot0 = f16[3456,5]{1,0} dot(convert0, bitcast4), lhs_contracting_dims={1},
+    rhs_contracting_dims={0}
+  ROOT bitcast5 = f16[3,96,12,1,5] $0(dot0)
+}
+
+ENTRY e {
+  p0 = pred[3,122,96,12] parameter(0)
+  p1 = pred[1,5,122] parameter(1)
+  ROOT fusion = f16[3,96,12,1,5] fusion(p0, p1), kind=kCustom, calls=gemm_dot,
+    backend_config={"fusion_backend_config":{kind:"__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":32,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
+}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           absl::Substitute(R"(
+CHECK: {{.*}} {
+CHECK: [[re:[^ ]+]] = pred[3456,122]{1,0} $0({{.*}})
+CHECK: {{.*}} = f16[3456,122]{1,0} convert([[re]])
+CHECK-NOT: $0
+CHECK: {{.*}} = f16[122,5]{0,1} convert({{.*}})
+CHECK-NEXT: }
+CHECK: ENTRY {{.*}} {
+CHECK: {{.*}} = pred[122,5]{0,1} bitcast({{.*}})
+)",
+                                            HloOpcodeString(opcode))),
+              IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       CheckDimensionsOfBroadcastAfterBitcastIsHoisted) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+dot {
+  p0 = bf16[1,8] parameter(0)
+  broadcast0 = bf16[1,8,8] broadcast(p0), dimensions={0,2}
+  lhs = bf16[1,2,4,8] $0(broadcast0)
+
+  p1 = bf16[1,8] parameter(1)
+  broadcast1 = bf16[1,8,8] broadcast(p1), dimensions={0,2}
+  rhs = bf16[1,2,4,8] $0(broadcast1)
+
+  ROOT dot = bf16[2,1,4,4] dot(lhs, rhs),
+    lhs_contracting_dims={3}, lhs_batch_dims={1,0},
+    rhs_contracting_dims={3}, rhs_batch_dims={1,0}
+}
+
+ENTRY entry {
+  p0 = bf16[1,8] parameter(0)
+  ROOT fusion = bf16[2,1,4,4] fusion(p0, p0), kind=kCustom, calls=dot,
+    backend_config={"fusion_backend_config":{kind:"__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":32,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
+})";
+
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={3}
+CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={3}
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedUpThroughTransposes) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[7,6] parameter(0)
+  transpose = f32[6,7] transpose(p0), dimensions={1,0}
+  bitcast = f32[2,3,7] $0(transpose)
+  p1 = f32[2,5,7] parameter(1)
+  ROOT result = f32[2,3,5] dot(bitcast, p1),
+    lhs_contracting_dims={2}, lhs_batch_dims={0},
+    rhs_contracting_dims={2}, rhs_batch_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[7,6] parameter(0)
+  p1 = f32[2,5,7] parameter(1)
+  ROOT result = f32[2,3,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: {{.*}} {
+CHECK-NEXT: [[p0:[^ ]*]] = f32[7,2,3]{2,1,0} parameter(0)
+CHECK-NEXT: {{.*}} = f32[2,3,7]{2,1,0} transpose([[p0]]), dimensions={1,2,0}
+CHECK ENTRY
+CHECK f32[7,2,3]{2,1,0} bitcast
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsWithSize1DimensionsAreHoistedUpThroughTransposes) {
+  const HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[7,6] parameter(0)
+  transpose = f32[6,7] transpose(p0), dimensions={1,0}
+  bitcast = f32[1,6,7] $0(transpose)
+  p1 = f32[1,5,7] parameter(1)
+  ROOT result = f32[1,6,5] dot(bitcast, p1),
+    lhs_contracting_dims={2}, lhs_batch_dims={0},
+    rhs_contracting_dims={2}, rhs_batch_dims={0}
+}
+
+ENTRY e {
+  p0 = f32[7,6] parameter(0)
+  p1 = f32[1,5,7] parameter(1)
+  ROOT result = f32[1,6,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: {{.*}} {
+CHECK-NEXT: [[p0:[^ ]+]] = f32[7,1,6]{2,1,0} parameter(0)
+CHECK-NEXT: {{.*}} = f32[1,6,7]{2,1,0} transpose([[p0]]), dimensions={1,2,0}
+CHECK-NOT: bitcast
+CHECK: }
+CHECK ENTRY {{.*}} {
+CHECK: bitcast
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       RankReducingBitcastsAreNotHoistedUpThroughTransposes) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[2,7,3] parameter(0)
+  transpose = f32[3,2,7] transpose(p0), dimensions={2,0,1}
+  $0 = f32[6,7] $0(transpose)
+  p1 = f32[5,7] parameter(1)
+  ROOT dot = f32[6,5] dot($0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+}
+
+ENTRY e {
+  p0 = f32[2,7,3] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  ROOT result = f32[6,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      transpose
+CHECK-SAME: f32[3,2,7]{2,1,0} transpose
+CHECK-SAME: dimensions={2,0,1}
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       RankReducingBitcastsAreNotHoistedDownThroughTransposes) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[6,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  dot = f32[6,5] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  $0 = f32[2,3,5] $0(dot)
+  ROOT transpose = f32[2,5,3] transpose($0), dimensions={0,2,1}
+}
+
+ENTRY e {
+  p0 = f32[6,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  ROOT result = f32[2,5,3] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           absl::Substitute(R"(
+CHECK:      f32[2,3,5]{2,1,0} $0
+CHECK-NEXT: f32[2,5,3]{2,1,0} transpose
+)",
+                                            HloOpcodeString(opcode))),
+              IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       HoistingBitcastDoesNotIntroduceArtificialDimension) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+gemm_dot {
+  p0 = f16[3,122,1152] parameter(0)
+  transpose = f16[3,1152,122] transpose(p0), dimensions={0,2,1}
+  bitcast0 = f16[3,96,12,122] $0(transpose)
+  bitcast1 = f16[3456,122] $0(bitcast0)
+  p1 = f16[122,5] parameter(1)
+  ROOT dot = f16[3456,5]{1,0} dot(bitcast1, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY e {
+  p0 = f16[3,122,1152] parameter(0)
+  p1 = f16[122,5] parameter(1)
+  ROOT fusion = f16[3456,5] fusion(p0, p1), kind=kCustom, calls=gemm_dot,
+    backend_config={"fusion_backend_config":{kind:"__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":32,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
+}
+          )";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  // Checks that transpose is on rank 3 tensor from hoisting bitcast1, not rank
+  // 4 tensor from hoisting bitcast0 first and then failing to hoist bitcast1.
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      transpose
+CHECK-SAME: f16[3,1152,122]{2,1,0} transpose
+CHECK-SAME: dimensions={0,2,1}
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedDownThroughTransposes) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[2,3,7] parameter(0)
+  p1 = f32[2,5,7] parameter(1)
+  dot = f32[2,3,5] dot(p0, p1),
+    lhs_contracting_dims={2}, lhs_batch_dims={0},
+    rhs_contracting_dims={2}, rhs_batch_dims={0}
+  bitcast = f32[6,5] $0(dot)
+  ROOT transpose = f32[5,6] transpose(bitcast), dimensions={1,0}
+}
+
+ENTRY e {
+  p0 = f32[2,3,7] parameter(0)
+  p1 = f32[2,5,7] parameter(1)
+  ROOT result = f32[5,6] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      ROOT transpose
+CHECK-SAME: f32[5,2,3]{2,1,0} transpose
+CHECK-SAME: dimensions={2,0,1}
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastsAreHoistedDownThroughBroadcasts) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  dot = f32[3,5] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  bitcast = f32[15] $0(dot)
+  ROOT broadcast = f32[2,15,6] broadcast(bitcast), dimensions={1}
+}
+
+ENTRY e {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  ROOT result = f32[2,15,6] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      ROOT broadcast
+CHECK-SAME: f32[3,5,6,2]{2,1,0,3} broadcast
+CHECK-SAME: dimensions={0,1}
+)"),
+      IsOkAndHolds(true));
+}
+
+// TODO(b/467306121): handle the case when we need to sink the reshape through
+// broadcast.
+TEST_P(HoistFusedBitcastsReshapeTest,
+       DISABLED_BitcastsAreHoistedDownThroughBroadcastsWithTrivialDimensions) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[6,7] parameter(1)
+  dot = f32[3,6] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  bitcast = f32[3,2,3] $0(dot)
+  ROOT broadcast = f32[3,2,1,3,7] broadcast(bitcast), dimensions={0,1,3}
+}
+
+ENTRY e {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[6,7] parameter(1)
+  ROOT result = f32[3,2,1,3,7] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK:      ROOT broadcast
+CHECK-SAME: f32[3,5,6,2]{2,1,0,3} broadcast
+CHECK-SAME: dimensions={0,1}
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsAreHoistedDownThroughBroadcastsWithNonDefaultLayout) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[6,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  dot = f32[6,5] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  bitcast = f32[2,3,5]{2,1,0} $0(dot)
+  ROOT broadcast = f32[2,3,5]{2,0,1} broadcast(bitcast), dimensions={0,1,2}
+}
+
+ENTRY e {
+  p0 = f32[6,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  ROOT result = f32[2,3,5]{2,0,1} fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           absl::Substitute(R"(
+CHECK:      f32[2,3,5]{2,1,0} $0(dot)
+CHECK-NEXT: f32[2,3,5]{2,0,1} broadcast
+)",
+                                            HloOpcodeString(opcode))),
+              IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, BitcastRootsAreHoistedDown) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  dot = f32[3,5] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  ROOT bitcast = f32[15] $0(dot)
+}
+
+ENTRY e {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  ROOT result = f32[15] fusion(p0, p1), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: ROOT dot
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastAreHoistedDownThroughBinaryElementwiseOps) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+triton_dot {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  p2 = f32[15] parameter(2)
+  dot = f32[3,5] dot(p0, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  $0 = f32[15] $0(dot)
+  ROOT add = f32[15] add($0, p2)
+}
+
+ENTRY e {
+  p0 = f32[3,7] parameter(0)
+  p1 = f32[5,7] parameter(1)
+  p2 = f32[15] parameter(2)
+  ROOT result = f32[15] fusion(p0, p1, p2), kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: ROOT add = f32[3,5]{1,0} add
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsWithNonDefaultLayoutAreHoistedOutThroughBroadcast) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[7,2]{0,1} parameter(0)
+  broadcast.1 = f32[15,7,2]{1,0,2} broadcast(p0), dimensions={1,2}
+  $0.1 = f32[2,7,15]{1,2,0} $0(broadcast.1)
+  p1 = f32[2,15,15]{2,1,0} parameter(1)
+  dot = f32[2,7,15]{2,1,0} dot($0.1, p1),
+    lhs_batch_dims={0}, lhs_contracting_dims={2},
+    rhs_batch_dims={0}, rhs_contracting_dims={2}
+  $0.2 = f32[15,14]{0,1} $0(dot)
+  ROOT broadcast.2 = f32[15,11,14]{0,2,1} broadcast($0.2), dimensions={0,2}
+}
+
+ENTRY e {
+  p0 = f32[7,2]{0,1} parameter(0)
+  p1 = f32[2,15,15]{2,1,0} parameter(1)
+  ROOT result = f32[15,11,14]{0,2,1} fusion(p0, p1),
+    kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK-NOT: bitcast
+CHECK-NOT: reshape
+CHECK: f32[2,7,15]{1,2,0} broadcast({{.*}}), dimensions={0,1}
+CHECK-NOT: bitcast
+CHECK-NOT: reshape
+CHECK: f32[2,7,15,11]{2,1,0,3} broadcast({{.*}}), dimensions={0,1,2}
+CHECK: ENTRY
+CHECK: f32[7,2]{0,1} parameter(0)
+CHECK: f32[2,7]{1,0} bitcast(p0
+CHECK: result = f32[2,7,15,11]{2,1,0,3} fusion
+CHECK: ROOT {{.*}} = f32[15,11,14]{0,2,1} bitcast(result)
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest,
+       BitcastsWithNonDefaultLayoutAreHoistedOutThroughTranspose) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[2,3,7]{0,2,1} parameter(0)
+  $0.1 = f32[7,3,2]{2,0,1} $0(p0)
+  transpose.1 = f32[3,2,7]{2,0,1} transpose($0.1), dimensions={1,2,0}
+  p1 = f32[3,5,7]{2,1,0} parameter(1)
+  dot = f32[3,2,5]{2,1,0} dot(transpose.1, p1),
+    lhs_batch_dims={0}, lhs_contracting_dims={2},
+    rhs_batch_dims={0}, rhs_contracting_dims={2}
+  $0.2 = f32[5,3,2]{0,2,1} $0(dot)
+  ROOT transpose.2 = f32[2,3,5]{0,2,1} transpose($0.2), dimensions={2,1,0}
+}
+
+ENTRY e {
+  p0 = f32[2,3,7]{0,2,1} parameter(0)
+  p1 = f32[3,5,7]{2,1,0} parameter(1)
+  ROOT result = f32[2,3,5]{0,2,1} fusion(p0, p1),
+    kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK-NOT: bitcast
+CHECK-NOT: reshape
+CHECK: f32[3,2,7]{2,0,1} transpose({{.*}}), dimensions={1,2,0}
+CHECK-NOT: bitcast
+CHECK-NOT: reshape
+CHECK: f32[3,5,2]{2,1,0} transpose({{.*}}), dimensions={0,2,1}
+CHECK: ENTRY
+CHECK: f32[2,3,7]{0,2,1} parameter(0)
+CHECK: f32[7,3,2]{2,0,1} bitcast(p0
+CHECK: result = f32[3,5,2]{2,1,0} fusion
+CHECK: ROOT {{.*}} = f32[2,3,5]{0,2,1} bitcast(result)
+)"),
+      IsOkAndHolds(true));
+}
+
+TEST_P(HoistFusedBitcastsReshapeTest, MultipleBitcastsAreHoistedOut) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[3,3]{1,0} parameter(0)
+  $0.1 = f32[3,3]{1,0} $0(p0)
+  $0.2 = f32[3,3]{1,0} $0($0.1)
+  p1 = f32[3,3]{1,0} parameter(1)
+  dot = f32[3,3]{1,0} dot($0.2, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  $0.3 = f32[3,3]{1,0} $0(dot)
+  ROOT $0.4 = f32[3,3]{0,1} $0($0.3)
+}
+
+ENTRY e {
+  p0 = f32[3,3]{1,0} parameter(0)
+  ROOT result = f32[3,3]{0,1} fusion(p0, p0),
+    kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK-NOT: bitcast
+CHECK-NOT: reshape
+CHECK: ENTRY
+)"),
+      IsOkAndHolds(true));
+}
+
+// TODO(b/393299275): this test was not written correctly and now fails.
+TEST_P(HoistFusedBitcastsReshapeTest,
+       DISABLED_BitcastsAreNotHoistedOutThroughLayoutChangingTranspose) {
+  HloOpcode opcode = GetParam();
+  absl::string_view hlo = R"(
+HloModule t
+
+triton_dot {
+  p0 = f32[7,2]{1,0} parameter(0)
+  $0.1 = f32[2,7]{0,1} $0(p0)
+  transpose.1 = f32[2,7]{1,0} transpose($0.1), dimensions={0,1}
+  p1 = f32[5,7]{1,0} parameter(1)
+  dot = f32[2,5]{1,0} dot(transpose.1, p1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  $0.2 = f32[5,2]{0,1} $0(dot)
+  ROOT transpose.2 = f32[5,2]{1,0} transpose($0.2), dimensions={0,1}
+}
+
+ENTRY e {
+  p0 = f32[7,2]{1,0} parameter(0)
+  p1 = f32[5,7]{1,0} parameter(1)
+  ROOT result = f32[5,2]{1,0} fusion(p0, p1),
+    kind=kCustom, calls=triton_dot,
+    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
+    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
+    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
+)";
+  std::unique_ptr<VerifiedHloModule> module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
+                           absl::Substitute(R"(
+CHECK: $0.1 = f32[2,7]{0,1} $0
+CHECK: $0.2 = f32[5,2]{0,1} $0
+CHECK: ENTRY
+CHECK-NOT: bitcast
+CHECK-NOT: reshape
+        )",
+                                            HloOpcodeString(opcode))),
+              IsOkAndHolds(true));
+}
+
+INSTANTIATE_TEST_SUITE_P(HoistFusedBitcastsReshapeTestSuite,
+                         HoistFusedBitcastsReshapeTest,
+                         ::testing::ValuesIn({HloOpcode::kReshape,
+                                              HloOpcode::kBitcast}),
+                         [](const ::testing::TestParamInfo<HloOpcode>& info) {
+                           return std::string(HloOpcodeString(info.param));
+                         });
+
+struct CommonFactorsTestCase {
+  std::vector<int64_t> from, to;
+  absl::InlinedVector<std::pair<int64_t, int64_t>, 8> expected;
+};
+
+class CommonFactorsMergingTrivialRangesTest
+    : public ::testing::TestWithParam<CommonFactorsTestCase> {};
+
+TEST_P(CommonFactorsMergingTrivialRangesTest, Example) {
+  const CommonFactorsTestCase& test_case = GetParam();
+  EXPECT_EQ(test_case.expected, detail::CommonFactorsMergingTrivialRanges(
+                                    test_case.from, test_case.to));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    CommonFactorsMergingTrivialRangesTestSuite,
+    CommonFactorsMergingTrivialRangesTest,
+    ::testing::Values(
+        CommonFactorsTestCase{{1}, {}, {{0, 0}, {1, 0}}},
+        CommonFactorsTestCase{{}, {1}, {{0, 0}, {0, 1}}},
+        CommonFactorsTestCase{{}, {}, {{0, 0}}},
+        CommonFactorsTestCase{{1, 2, 0}, {2, 0, 3}, {{0, 0}, {3, 3}}},
+        CommonFactorsTestCase{{2, 3, 0}, {1, 0, 1000}, {{0, 0}, {3, 3}}},
+        CommonFactorsTestCase{{1, 1, 1}, {1, 1}, {{0, 0}, {1, 1}, {3, 2}}},
+        CommonFactorsTestCase{{1, 1, 3}, {3, 1, 1}, {{0, 0}, {3, 3}}},
+        CommonFactorsTestCase{{2, 6}, {4, 3}, {{0, 0}, {2, 2}}},
+        CommonFactorsTestCase{{1, 2, 6}, {4, 1, 3, 1}, {{0, 0}, {3, 4}}},
+        CommonFactorsTestCase{{2, 3, 4, 5}, {6, 20}, {{0, 0}, {2, 1}, {4, 2}}},
+        CommonFactorsTestCase{
+            {2, 3, 4, 5, 6}, {6, 20, 6}, {{0, 0}, {2, 1}, {4, 2}, {5, 3}}},
+        CommonFactorsTestCase{{2, 2, 2, 2}, {4, 4}, {{0, 0}, {2, 1}, {4, 2}}},
+        CommonFactorsTestCase{
+            {2, 5, 1, 3}, {1, 10, 3, 1}, {{0, 0}, {2, 2}, {4, 4}}}),
+    [](const ::testing::TestParamInfo<CommonFactorsTestCase>& info) {
+      return absl::StrCat(absl::StrJoin(info.param.from, "_"), "_to_",
+                          absl::StrJoin(info.param.to, "_"));
+    });
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
index 28a31ba41a8d50..8a8069c5a04c5b 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.cc
@@ -15,11 +15,8 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 
-#include <cstddef>
 #include <cstdint>
-#include <deque>
 #include <memory>
-#include <optional>
 #include <utility>
 #include <variant>
 #include <vector>
@@ -36,9 +33,7 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator_range.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/triton/support.h"
 #include "xla/codegen/tiling/symbolic_tile.h"
@@ -51,10 +46,8 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/hlo/transforms/simplifiers/hlo_dce.h"
 #include "xla/hlo/utils/hlo_query.h"
-#include "xla/layout.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/ir_emission_utils.h"
@@ -64,7 +57,6 @@ limitations under the License.
 #include "xla/service/instruction_fusion.h"
 #include "xla/service/matmul_indexing_utils.h"
 #include "xla/shape.h"
-#include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/tools/hlo_decomposer.h"
 #include "xla/tsl/platform/errors.h"
@@ -338,783 +330,6 @@ absl::Status MakeNestedFusionFromGemmFusion(
   return absl::OkStatus();
 }
 
-using HloInstructionSetVector =
-    llvm::SetVector<HloInstruction*, std::vector<HloInstruction*>,
-                    HloInstructionSet>;
-
-// Returns the set of instructions that are reachable from 'instruction' using
-// the given accessor.
-template <typename T>
-HloInstructionSetVector GetTransitiveInstructionSet(
-    const HloInstruction* instruction, T (HloInstruction::*get)() const) {
-  std::deque<HloInstruction*> worklist;
-  auto append = [&](const auto& instructions) {
-    worklist.insert(worklist.end(), instructions.begin(), instructions.end());
-  };
-  append((instruction->*get)());
-  HloInstructionSetVector result;
-  while (!worklist.empty()) {
-    HloInstruction* front = worklist.front();
-    worklist.pop_front();
-    if (result.insert(front)) {
-      append((front->*get)());
-    }
-  }
-  return result;
-}
-
-// Returns the set of producers reachable from 'instruction' in use-before-def
-// order.
-HloInstructionSetVector GetProducerSet(const HloInstruction* instruction) {
-  return GetTransitiveInstructionSet(instruction, &HloInstruction::operands);
-}
-// Returns the set of consumers reachable from 'instruction' in def-before-use
-// order.
-HloInstructionSetVector GetConsumerSet(const HloInstruction* instruction) {
-  return GetTransitiveInstructionSet(instruction, &HloInstruction::users);
-}
-
-// Verifies that the set of instructions is closed under the given accessor,
-// i.e. that the set of instructions reachable through the given accessor are
-// either in the set itself or the root.
-template <typename T>
-absl::Status VerifyIsClosedInstructionSet(
-    const HloInstructionSetVector& instructions, const HloInstruction* root,
-    T (HloInstruction::*get)() const) {
-  for (HloInstruction* instruction : instructions) {
-    for (HloInstruction* reachable : (instruction->*get)()) {
-      if (reachable != root && instructions.count(reachable) == 0) {
-        return absl::FailedPreconditionError(
-            absl::StrCat("Instruction ", reachable->ToString(),
-                         " is reachable from ", instruction->ToString(),
-                         ", which is not in the recursive set of, or ",
-                         root->ToString(), " itself."));
-      }
-    }
-  }
-
-  return absl::OkStatus();
-}
-
-absl::Status VerifyIsClosedProducerSet(
-    const HloInstructionSetVector& instructions, const HloInstruction* root) {
-  return VerifyIsClosedInstructionSet(instructions, root,
-                                      &HloInstruction::users);
-}
-
-// Copies the element type and size from `source` to `destination`.
-void CopyElementType(const Shape& source, Shape* destination) {
-  destination->set_element_type(source.element_type());
-  destination->mutable_layout()->set_element_size_in_bits(
-      source.layout().element_size_in_bits());
-}
-
-llvm::SmallVector<int64_t> GetInversePermutation(
-    absl::Span<const int64_t> permutation) {
-  llvm::SmallVector<int64_t> result(permutation.size());
-  for (int64_t i = 0; i < permutation.size(); ++i) {
-    result[permutation[i]] = i;
-  }
-  return result;
-}
-
-// Applies the backward-mapping 'permutation' to 'values'.
-llvm::SmallVector<int64_t> ApplyPermutation(
-    absl::Span<const int64_t> values, absl::Span<const int64_t> permutation) {
-  llvm::SmallVector<int64_t> result;
-  result.reserve(permutation.size());
-  for (int64_t index : permutation) {
-    result.push_back(values[index]);
-  }
-  return result;
-}
-
-// Returns the dimensions of 'shape' in minor-to-major order.
-llvm::SmallVector<int64_t> GetPhysicalDimensions(const Shape& shape) {
-  return ApplyPermutation(shape.dimensions(), shape.layout().minor_to_major());
-}
-
-// Parameters to rewrite a bitcast(broadcast/transpose) as
-// broadcast/transpose(bitcast) and vice versa.
-struct BitcastParams {
-  Shape new_shape;                      // The bitcast output shape.
-  llvm::SmallVector<int64_t> new_dims;  // The dims of the broadcast/transpose.
-};
-
-// Returns parameters to rewrite a broadcast + bitcast as bitcast + broadcast.
-//
-// Example:
-//
-// broadcast = broadcast(operand)
-// result = result_shape bitcast(broadcast)
-//
-// to
-//
-// bitcast = new_shape bitcast(operand)
-// result = broadcast(bitcast), dimensions={new_dims}.
-//
-// Assumes that:
-// - broadcast does not transpose dimensions (checked by hlo_verifier);
-// - bitcast does not mix operand and broadcast dimensions (checks);
-absl::StatusOr<BitcastParams> CalculateBitcastOfBroadcast(
-    const HloBroadcastInstruction* broadcast, const Shape& result_shape) {
-  const Shape& broadcast_shape = broadcast->shape();
-
-  // Maps broadcast dimension index to whether it's an operand dimension.
-  llvm::SmallVector<bool> is_operand_dim(broadcast_shape.dimensions().size());
-  for (const int64_t index : broadcast->dimensions()) {
-    is_operand_dim[index] = true;
-  }
-
-  // Dimensions of the new broadcast.
-  llvm::SmallVector<int64_t> new_dims;
-  llvm::SmallVector<int64_t> broadcast_physical_dims =
-      GetPhysicalDimensions(broadcast_shape);
-  auto factors = CommonFactors(GetPhysicalDimensions(result_shape),
-                               broadcast_physical_dims);
-  for (int64_t i = 1; i < factors.size(); ++i) {
-    auto [result_from, broadcast_from] = factors[i - 1];
-    auto [result_to, broadcast_to] = factors[i];
-
-    bool all_operands = true, any_operands = false;
-    for (int64_t j = broadcast_from; j < broadcast_to; ++j) {
-      if (broadcast_physical_dims[j] == 1) {
-        // If dimension size is 1 then we can ignore it: it's either immediately
-        // dropped by old reshape or it's coming from the operand and then the
-        // new reshape will handle it.
-        continue;
-      }
-      bool value = is_operand_dim[broadcast_shape.layout().minor_to_major(j)];
-      all_operands &= value;
-      any_operands |= value;
-    }
-    if (!any_operands) {
-      continue;  // All dimensions in this group are broadcast dimensions.
-    }
-    if (!all_operands) {
-      return absl::InvalidArgumentError(
-          absl::StrCat("Cannot hoist bitcast across ", broadcast->ToString(),
-                       " as it mixes operand and broadcast dimensions."));
-    }
-
-    for (int64_t j = result_from; j < result_to; ++j) {
-      new_dims.push_back(result_shape.layout().minor_to_major(j));
-    }
-  }
-  absl::c_sort(new_dims);  // Sort into logical order.
-
-  BitcastParams result;
-  CopyElementType(result_shape, &result.new_shape);
-  for (int64_t index : new_dims) {
-    result.new_shape.add_dimensions(result_shape.dimensions(index));
-  }
-  auto* new_layout =
-      result.new_shape.mutable_layout()->mutable_minor_to_major();
-  new_layout->reserve(new_dims.size());
-  for (int64_t index : result_shape.layout().minor_to_major()) {
-    if (auto it = absl::c_lower_bound(new_dims, index);
-        it != new_dims.end() && *it == index) {
-      new_layout->push_back(it - new_dims.begin());
-    }
-  }
-  result.new_dims = std::move(new_dims);
-
-  VLOG(3) << "CalculateBitcastOfBroadcast:";
-  VLOG(3) << "  broadcast = " << broadcast_shape.ToString(true) << " broadcast("
-          << broadcast->operand(0)->shape().ToString(true)
-          << " operand), dimensions="
-          << absl::StrJoin(broadcast->dimensions(), ",");
-  VLOG(3) << "  result    = " << result_shape.ToString(true) << " bitcast("
-          << broadcast_shape.ToString(true) << " broadcast)";
-  VLOG(3) << "--------------------------------";
-  VLOG(3) << "  bitcast   = " << result.new_shape.ToString(true) << " bitcast("
-          << broadcast->operand(0)->shape().ToString(true) << " operand)";
-  VLOG(3) << "  result    = " << result_shape.ToString(true) << " broadcast("
-          << result.new_shape.ToString(true)
-          << " bitcast), dimensions=" << absl::StrJoin(result.new_dims, ",");
-
-  return result;
-}
-
-// Returns parameters to rewrite a bitcast + broadcast as broadcast + bitcast.
-//
-// Example:
-//
-// bitcast = bitcast(operand_shape operand)
-// result = broadcast(bitcast)
-//
-// to
-//
-// broadcast = new_shape broadcast(operand), dimensions={new_dims}.
-// result = bitcast(broadcast)
-//
-// Assumes that:
-// - broadcast does not transpose dimensions (checked by hlo_verifier);
-// - bitcast does not mix operand and broadcast dimensions (checks);
-absl::StatusOr<BitcastParams> CalculateBroadcastOfBitcast(
-    const HloBroadcastInstruction* broadcast, const Shape& operand_shape) {
-  const Shape& bitcast_shape = broadcast->operand(0)->shape();
-  const Shape& result_shape = broadcast->shape();
-
-  // Maps logical result dimension index to a range of physical operand
-  // dimensions, or nullopt if the dimension is broadcasted.
-  llvm::SmallVector<std::optional<std::pair<int64_t, int64_t>>>
-      result_to_operand_range(result_shape.dimensions().size());
-  auto result_inv_layout =
-      GetInversePermutation(result_shape.layout().minor_to_major());
-  auto factors = CommonFactors(GetPhysicalDimensions(bitcast_shape),
-                               GetPhysicalDimensions(operand_shape));
-  for (int64_t i = 1; i < factors.size(); ++i) {
-    auto [bitcast_from, operand_from] = factors[i - 1];
-    auto [bitcast_to, operand_to] = factors[i];
-
-    llvm::SmallVector<int64_t> indices;
-    indices.reserve(bitcast_to - bitcast_from);
-    for (int64_t j = bitcast_from; j < bitcast_to; ++j) {
-      int64_t index =
-          broadcast->dimensions()[bitcast_shape.layout().minor_to_major(j)];
-
-      // Store the entire operand dimension range in the minor-most dimension
-      // index and an empty range in all others.
-      result_to_operand_range[index].emplace(operand_from, operand_to);
-      operand_from = operand_to;
-
-      // Check that the physical result indices form a contiguous range.
-      indices.push_back(result_inv_layout[index]);
-    };
-
-    if (indices.back() - indices.front() >= bitcast_to - bitcast_from ||
-        !absl::c_is_sorted(indices)) {
-      return absl::InvalidArgumentError(
-          absl::StrCat("Cannot hoist bitcast across ", broadcast->ToString(),
-                       " because result dimensions are not contiguous."));
-    }
-  }
-
-  BitcastParams result;
-  CopyElementType(operand_shape, &result.new_shape);
-  result.new_dims.resize(operand_shape.dimensions().size());
-  auto* new_layout =
-      result.new_shape.mutable_layout()->mutable_minor_to_major();
-  int64_t new_rank = operand_shape.dimensions().size() +
-                     result_shape.dimensions().size() -
-                     bitcast_shape.dimensions().size();
-  new_layout->reserve(new_rank);
-  llvm::SmallVector<int64_t> new_shape_dims(new_rank);
-
-  // We are free to insert the broadcast dimensions in any order. Insert them
-  // at the end of the the logical dimension order.
-  int64_t broadcast_index = operand_shape.dimensions().size();
-
-  // Iterate through the logical result dimension indices in physical order.
-  for (int64_t result_index : result_shape.layout().minor_to_major()) {
-    if (auto range = result_to_operand_range[result_index]) {
-      // This result dimension corresponds to a group of operand dimensions.
-      // Iterate through the range of physical operand dimension indices.
-      for (int64_t i = range->first; i < range->second; ++i) {
-        int64_t operand_index = operand_shape.layout().minor_to_major(i);
-        int64_t new_index = operand_index;
-        new_shape_dims[new_index] = operand_shape.dimensions(operand_index);
-        new_layout->push_back(new_index);
-        result.new_dims[operand_index] = new_index;
-      }
-    } else {
-      // This is a new dimension introduced by the original broadcast.
-      int64_t new_index = broadcast_index++;
-      new_shape_dims[new_index] = result_shape.dimensions(result_index);
-      new_layout->push_back(new_index);
-    }
-  }
-  absl::c_sort(result.new_dims);  // Sort into logical order.
-  for (int64_t dimension : new_shape_dims) {
-    result.new_shape.add_dimensions(dimension);
-  }
-
-  VLOG(3) << "CalculateBroadcastOfBitcast:";
-  VLOG(3) << "  bitcast   = " << bitcast_shape.ToString(true) << " bitcast("
-          << operand_shape.ToString(true) << " operand)";
-  VLOG(3) << "  result    = " << result_shape.ToString(true) << " broadcast("
-          << bitcast_shape.ToString(true) << " bitcast), dimensions="
-          << absl::StrJoin(broadcast->dimensions(), ",");
-  VLOG(3) << "--------------------------------";
-  VLOG(3) << "  broadcast = " << result.new_shape.ToString(true)
-          << " broadcast(" << operand_shape.ToString(true)
-          << " operand), dimensions=" << absl::StrJoin(result.new_dims, ",");
-  VLOG(3) << "  result    = " << result_shape.ToString(true) << " bitcast("
-          << result.new_shape.ToString(true) << " broadcast)";
-
-  return result;
-}
-
-// Implements CalculateBitcastOfTranspose(), except that result.new_dims is
-// the inverse permutation, mapping the input dimensions to the output
-// dimensions.
-absl::StatusOr<BitcastParams> CalculateBitcastOfTransposeImpl(
-    const HloTransposeInstruction* transpose, const Shape& result_shape,
-    const Shape& transpose_shape, const Shape& operand_shape,
-    absl::Span<const int64_t> transpose_dims) {
-  if (transpose->shape().layout() != transpose->operand(0)->shape().layout()) {
-    return absl::InternalError(
-        absl::StrCat("Expected input and output layouts to be the same for ",
-                     transpose->ToString()));
-  }
-
-  // Maps physical operand dimension index to a range of physical result
-  // dimensions.
-  llvm::SmallVector<std::pair<int64_t, int64_t>> operand_to_result_range(
-      operand_shape.dimensions().size());
-  // Maps logical operand dimension index to the physical dimension index.
-  llvm::SmallVector<int64_t> operand_inv_layout =
-      GetInversePermutation(operand_shape.layout().minor_to_major());
-
-  const absl::InlinedVector<std::pair<int64_t, int64_t>, 8> factors =
-      ::xla::gpu::detail::CommonFactorsMergingTrivialRanges(
-          GetPhysicalDimensions(result_shape),
-          GetPhysicalDimensions(transpose_shape));
-  for (int64_t i = 1; i < factors.size(); ++i) {
-    auto [result_from, transpose_from] = factors[i - 1];
-    auto [result_to, transpose_to] = factors[i];
-
-    llvm::SmallVector<int64_t> indices;
-    indices.reserve(transpose_to - transpose_from);
-    for (int64_t j = transpose_from; j < transpose_to; ++j) {
-      int64_t index = operand_inv_layout
-          [transpose_dims[transpose_shape.layout().minor_to_major(j)]];
-
-      // Store the entire result dimension range in the minor-most dimension
-      // index and an empty range in all others.
-      operand_to_result_range[index] = {result_from, result_to};
-      result_from = result_to;
-
-      // Check that the physical operand indices form a contiguous range.
-      indices.push_back(index);
-    };
-
-    if (indices.empty()) {
-      return absl::InvalidArgumentError(
-          absl::StrCat("Cannot hoist bitcast across ", transpose->ToString(),
-                       " because size-1 dims in bitcasts are not yet supported "
-                       "(b/466065483)."));
-    }
-    if (indices.back() - indices.front() >= transpose_to - transpose_from ||
-        !absl::c_is_sorted(indices)) {
-      return absl::InvalidArgumentError(
-          absl::StrCat("Cannot hoist bitcast across ", transpose->ToString(),
-                       " because result dimensions are not contiguous."));
-    }
-  }
-
-  BitcastParams result;
-  CopyElementType(result_shape, &result.new_shape);
-  // Just like the old transpose, the new transpose does not change the
-  // layout.
-  *result.new_shape.mutable_layout() = result_shape.layout();
-  result.new_dims.resize(result_shape.dimensions().size());
-  llvm::SmallVector<int64_t> new_shape_dims(result_shape.dimensions().size());
-  // Iterate through the physical operand and new_shape dimension indices.
-  for (int64_t i = 0, j = 0; i < operand_shape.dimensions().size(); ++i) {
-    auto range = operand_to_result_range[i];
-    // Iterate through corresponding range of physical result dimension
-    // indices.
-    for (int64_t k = range.first; k < range.second; ++k) {
-      int64_t new_index = result_shape.layout().minor_to_major(j++);
-      int64_t result_index = result_shape.layout().minor_to_major(k);
-      new_shape_dims[new_index] = result_shape.dimensions(result_index);
-      result.new_dims[new_index] = result_index;
-    }
-  }
-  for (int64_t dimension : new_shape_dims) {
-    result.new_shape.add_dimensions(dimension);
-  }
-
-  VLOG(3) << "CalculateBitcastOfTransposeImpl:";
-  VLOG(3) << "  transpose = " << transpose_shape.ToString(true) << " transpose("
-          << operand_shape.ToString(true)
-          << " operand), dimensions=" << absl::StrJoin(transpose_dims, ",");
-  VLOG(3) << "  result    = " << result_shape.ToString(true) << " bitcast("
-          << transpose_shape.ToString(true) << " transpose)";
-  VLOG(3) << "--------------------------------";
-  VLOG(3) << "  bitcast   = " << result.new_shape.ToString(true) << " bitcast("
-          << operand_shape.ToString(true) << " operand)";
-  VLOG(3) << "  result    = " << result_shape.ToString(true) << " transpose("
-          << result.new_shape.ToString(true) << " bitcast), dimensions="
-          << absl::StrJoin(GetInversePermutation(result.new_dims), ",");
-
-  return result;
-}
-
-// Returns parameters to rewrite a transpose + bitcast as bitcast + transpose.
-//
-// Example:
-//
-// transpose = transpose(operand)
-// result = result_shape bitcast(transpose)
-//
-// to
-//
-// bitcast = new_shape bitcast(operand)
-// result = transpose(bitcast), dimensions={new_dims}.
-//
-// Assumes that:
-// - bitcast only mixes contiguous dimensions (checks);
-// - transpose does not change layout (checks);
-absl::StatusOr<BitcastParams> CalculateBitcastOfTranspose(
-    const HloTransposeInstruction* transpose, const Shape& result_shape) {
-  TF_ASSIGN_OR_RETURN(
-      BitcastParams result,
-      CalculateBitcastOfTransposeImpl(
-          transpose, result_shape, transpose->shape(),
-          transpose->operand(0)->shape(), transpose->dimensions()));
-  result.new_dims = GetInversePermutation(result.new_dims);
-  return result;
-}
-
-// Returns parameters to rewrite a bitcast + transpose as transpose + bitcast.
-//
-// Example:
-//
-// bitcast = bitcast(operand_shape operand)
-// result = transpose(bitcast)
-//
-// to
-//
-// transpose = new_shape transpose(operand), dimensions={new_dims}.
-// result = bitcast(transpose)
-//
-// Assumes that:
-// - bitcast only mixes contiguous dimensions (checks);
-// - transpose does not change layout (checks);
-absl::StatusOr<BitcastParams> CalculateTransposeOfBitcast(
-    const HloTransposeInstruction* transpose, const Shape& operand_shape) {
-  return CalculateBitcastOfTransposeImpl(
-      transpose, operand_shape, transpose->operand(0)->shape(),
-      transpose->shape(), GetInversePermutation(transpose->dimensions()));
-}
-
-// Simulates a rewrite of all producers of a given bitcast/reshape, moving the
-// instruction outside of the computation. Returns the new shapes of affected
-// instructions in order of traversal from consumers to producers.
-absl::StatusOr<std::vector<std::pair<HloInstruction*, Shape>>>
-PlanHoistBitcastUpwardsToCallers(const HloInstruction* bitcast) {
-  // Check that all producers only affect the bitcast. If there are any
-  // other consumers: refuse the hoisting.
-  // It is possible to support more cases by sinking the bitcast from such
-  // producers downward.
-  HloInstructionSetVector producers = GetProducerSet(bitcast);
-  TF_RETURN_IF_ERROR(VerifyIsClosedProducerSet(producers, bitcast));
-  if (bitcast->shape().element_type() !=
-      bitcast->operand(0)->shape().element_type()) {
-    return absl::UnimplementedError(
-        absl::StrCat("Hoisting bitcast with type conversion is not supported: ",
-                     bitcast->ToString()));
-  }
-
-  HloInstructionMap<Shape> result_shapes;
-  auto set_result_shape =
-      [&](const absl::Span<HloInstruction* const> instructions,
-          const Shape& shape) -> absl::Status {
-    for (HloInstruction* instruction : instructions) {
-      // Only update the dimensions keeping the type intact.
-      Shape new_shape(shape);
-      CopyElementType(instruction->shape(), &new_shape);
-      CHECK_EQ(ShapeUtil::ArrayDataSize(new_shape),
-               ShapeUtil::ArrayDataSize(instruction->shape()))
-          << " instruction " << instruction->ToString()
-          << " updating result shape from "
-          << ShapeUtil::HumanStringWithLayout(instruction->shape()) << " to "
-          << ShapeUtil::HumanStringWithLayout(new_shape)
-          << " with different data size";
-      auto it = result_shapes.find(instruction);
-      if (it == result_shapes.end()) {
-        VLOG(2) << "updating the result shape of " << instruction->ToString()
-                << " to " << ShapeUtil::HumanStringWithLayout(new_shape);
-        result_shapes.emplace(instruction, new_shape);
-      } else if (it->second != new_shape) {
-        return absl::FailedPreconditionError(absl::StrCat(
-            "Conflicting shape assignment for ", instruction->ToString(),
-            " got ", ShapeUtil::HumanStringWithLayout(it->second), " and ",
-            ShapeUtil::HumanStringWithLayout(shape)));
-      }
-    }
-    return absl::OkStatus();
-  };
-  TF_RETURN_IF_ERROR(set_result_shape(bitcast->operands(), bitcast->shape()));
-
-  std::vector<std::pair<HloInstruction*, Shape>> result;
-  // We want to visit instructions in order from consumers to producers: we
-  // hoist the bitcast upwards and having a valid HLO at every rewrite step
-  // helps a lot. A simple DFS or BFS over operands will not work in non-tree
-  // situations when there are multiple consumers of the same producer. Instead
-  // of writing a custom traversal we can simply walk the post-order (producers
-  // before consumers) list backward and only update the instructions affected.
-  // TODO(b/393299275): use MakeInstructionPostOrderFrom(bitcast) - that should
-  // be slightly more efficient.
-  auto def_before_use = bitcast->parent()->MakeInstructionPostOrder();
-  for (HloInstruction* instruction :
-       llvm::make_range(def_before_use.rbegin(), def_before_use.rend())) {
-    auto it = result_shapes.find(instruction);
-    if (it == result_shapes.end()) {
-      continue;  // Not affected.
-    }
-    Shape& result_shape = it->second;
-    if (instruction->shape() == result_shape) {
-      continue;  // No change.
-    }
-    result.emplace_back(instruction, result_shape);
-    switch (instruction->opcode()) {
-      case HloOpcode::kParameter:
-      case HloOpcode::kConstant:
-        // No operands.
-        break;
-      case HloOpcode::kReshape:  // Reshape is a bitcast.
-      case HloOpcode::kBitcast:
-        // Other bitcast will be hoisted separately so we don't need to
-        // update its operand.
-        break;
-      case HloOpcode::kBroadcast: {
-        TF_ASSIGN_OR_RETURN(
-            BitcastParams params,
-            CalculateBitcastOfBroadcast(
-                Cast<HloBroadcastInstruction>(instruction), result_shape));
-        TF_RETURN_IF_ERROR(
-            set_result_shape(instruction->operands(), params.new_shape));
-        break;
-      }
-      case HloOpcode::kTranspose: {
-        TF_ASSIGN_OR_RETURN(
-            BitcastParams params,
-            CalculateBitcastOfTranspose(
-                Cast<HloTransposeInstruction>(instruction), result_shape));
-        TF_RETURN_IF_ERROR(
-            set_result_shape(instruction->operands(), params.new_shape));
-        break;
-      }
-      default:
-        if (!instruction->IsElementwise()) {
-          return absl::FailedPreconditionError(absl::StrCat(
-              "Cannot hoist bitcast past ", instruction->ToString()));
-        }
-        TF_RETURN_IF_ERROR(
-            set_result_shape(instruction->operands(), result_shape));
-        break;
-    }
-  }
-  return result;
-}
-
-// Returns the shape of the root instruction after hoisting all bitcasts.
-//
-// For example, given:
-//
-// dot = dot_shape dot
-// bitcast = bitcast(dot)
-// ROOT root = transpose(bitcast)
-//
-// Returns root_shape for:
-//
-// dot = dot_shape dot
-// ROOT root = roots_shape transpose(dot)
-//
-absl::StatusOr<Shape> ComputeRootShapeAfterHoistingBitcasts(
-    const HloInstruction* dot) {
-  if (dot->IsRoot()) {
-    return dot->shape();
-  }
-
-  HloInstructionMap<Shape> operand_shapes;
-  auto set_operand_shape =
-      [&](const absl::Span<HloInstruction* const> instructions,
-          const Shape& shape) -> absl::Status {
-    for (HloInstruction* instruction : instructions) {
-      // Only update the dimensions keeping the type intact.
-      Shape new_shape(shape);
-      const HloInstruction* operand = instruction->operand(0);
-      CopyElementType(operand->shape(), &new_shape);
-      CHECK_EQ(ShapeUtil::ArrayDataSize(new_shape),
-               ShapeUtil::ArrayDataSize(operand->shape()))
-          << " instruction " << instruction->ToString()
-          << " updating operand shape from "
-          << ShapeUtil::HumanStringWithLayout(operand->shape()) << " to "
-          << ShapeUtil::HumanStringWithLayout(new_shape)
-          << " with different data size";
-      auto it = operand_shapes.find(instruction);
-      if (it == operand_shapes.end()) {
-        VLOG(2) << "updating the operand shape of "
-                << instruction->ToString(
-                       HloPrintOptions().set_print_operand_shape(true))
-                << " to " << ShapeUtil::HumanStringWithLayout(new_shape);
-        operand_shapes.emplace(instruction, new_shape);
-      } else if (it->second != new_shape) {
-        return absl::FailedPreconditionError(absl::StrCat(
-            "Conflicting shape assignment for ", instruction->ToString(),
-            " got ", ShapeUtil::HumanStringWithLayout(it->second), " and ",
-            ShapeUtil::HumanStringWithLayout(shape)));
-      }
-    }
-    return absl::OkStatus();
-  };
-  TF_RETURN_IF_ERROR(set_operand_shape(dot->users(), dot->shape()));
-
-  for (HloInstruction* instruction : GetConsumerSet(dot)) {
-    auto it = operand_shapes.find(instruction);
-    if (it == operand_shapes.end()) {
-      continue;  // Not affected.
-    }
-    Shape& operand_shape = it->second;
-    TF_ASSIGN_OR_RETURN(Shape result_shape, [&]() -> absl::StatusOr<Shape> {
-      switch (instruction->opcode()) {
-        case HloOpcode::kBroadcast: {
-          TF_ASSIGN_OR_RETURN(
-              BitcastParams params,
-              CalculateBroadcastOfBitcast(
-                  Cast<HloBroadcastInstruction>(instruction), operand_shape));
-          return params.new_shape;
-        }
-        case HloOpcode::kTranspose: {
-          TF_ASSIGN_OR_RETURN(
-              BitcastParams params,
-              CalculateTransposeOfBitcast(
-                  Cast<HloTransposeInstruction>(instruction), operand_shape));
-          return params.new_shape;
-        }
-        default:
-          if (!instruction->IsElementwise()) {
-            return absl::FailedPreconditionError(absl::StrCat(
-                "Cannot hoist bitcast past ", instruction->ToString()));
-          }
-          [[fallthrough]];
-        case HloOpcode::kReshape:  // Reshape is a bitcast.
-        case HloOpcode::kBitcast:
-          return operand_shape;
-      }
-    }());
-    if (instruction->IsRoot()) {
-      CopyElementType(instruction->shape(), &result_shape);
-      return result_shape;
-    }
-    TF_RETURN_IF_ERROR(set_operand_shape(instruction->users(), result_shape));
-  }
-  return absl::InternalError("No root found");
-}
-
-// Hoists the given 'bitcast' upwards out of its computation, to the parent of
-// each caller.
-absl::Status HoistBitcastUpwardsToCallers(HloInstruction* bitcast,
-                                          absl::Span<HloInstruction*> callers) {
-  TF_ASSIGN_OR_RETURN(auto rewrite_plan,
-                      PlanHoistBitcastUpwardsToCallers(bitcast));
-  for (auto [instruction, result_shape] : rewrite_plan) {
-    VLOG(2) << absl::StrCat("rewriting result shape of ",
-                            instruction->ToString(), " to ",
-                            ShapeUtil::HumanStringWithLayout(result_shape));
-    switch (instruction->opcode()) {
-      case HloOpcode::kParameter: {
-        // Create a new bitcast in callers.
-        int64_t number = instruction->parameter_number();
-        for (HloInstruction* caller : callers) {
-          // Create a more generic `bitcast` even if the caller has a
-          // `reshape`.
-          HloInstruction* new_bitcast =
-              caller->AddInstruction(HloInstruction::CreateBitcast(
-                  result_shape, caller->mutable_operand(number)));
-          TF_RETURN_IF_ERROR(
-              caller->ReplaceOperandWithDifferentShape(number, new_bitcast));
-        }
-        break;
-      }
-      case HloOpcode::kBroadcast: {
-        auto* broadcast = Cast<HloBroadcastInstruction>(instruction);
-        auto params = CalculateBitcastOfBroadcast(broadcast, result_shape);
-        // Must be OK, already succeeded in PlanHoistBitcasUpwardsToCallers.
-        QCHECK_OK(params);
-        broadcast->mutable_dimensions()->assign(params->new_dims.begin(),
-                                                params->new_dims.end());
-        break;
-      }
-      case HloOpcode::kTranspose: {
-        auto* transpose = Cast<HloTransposeInstruction>(instruction);
-        auto params = CalculateBitcastOfTranspose(transpose, result_shape);
-        // Must be OK, already succeeded in PlanHoistBitcastUpwardsToCallers.
-        QCHECK_OK(params);
-        transpose->mutable_dimensions()->assign(params->new_dims.begin(),
-                                                params->new_dims.end());
-        break;
-      }
-      default:
-        break;
-    }
-    *instruction->mutable_shape() = result_shape;
-  }
-  TF_RETURN_IF_ERROR(bitcast->ReplaceAllUsesWith(bitcast->mutable_operand(0)));
-  TF_RETURN_IF_ERROR(bitcast->parent()->RemoveInstruction(bitcast));
-  return absl::OkStatus();
-}
-
-// Inserts a bitcast at the root if the root shape is different from the dot
-// shape. The bitcast is chosen so that it cancels out bitcasts and reshapes
-// along the way up to the dot. Updates the callers of the dot to expect the new
-// root shape.
-absl::Status MaybeInsertRootBitcast(HloInstruction* dot,
-                                    absl::Span<HloInstruction*> callers) {
-  TF_ASSIGN_OR_RETURN(Shape root_shape,
-                      ComputeRootShapeAfterHoistingBitcasts(dot));
-
-  HloComputation* computation = dot->parent();
-  HloInstruction* root = computation->root_instruction();
-  if (root->shape() == root_shape) {
-    return absl::OkStatus();
-  }
-
-  // Insert a new bitcast at the root.
-  computation->set_root_instruction(
-      root->AddInstruction(HloInstruction::CreateBitcast(root_shape, root)));
-
-  // Insert new bitcast for each caller's result.
-  for (HloInstruction* caller : callers) {
-    HloInstruction* new_bitcast = caller->AddInstruction(
-        HloInstruction::CreateBitcast(caller->shape(), caller));
-    TF_RETURN_IF_ERROR(caller->ReplaceAllUsesWith(new_bitcast));
-    *caller->mutable_shape() = root_shape;
-  }
-
-  return absl::OkStatus();
-}
-
-// Try hoisting bitcasts and reshapes in the computation away from 'dot' to the
-// callers of the computation. Some bitcasts or reshapes may remain in the
-// computation, because they cannot be hoisted across all ops, e.g. across some
-// transposes and broadcasts. This is not reported as an error.
-absl::Status TryHoistBitcastsInComputationToCallers(HloInstruction* dot,
-                                                    CallGraph* call_graph) {
-  VLOG(2) << "Before hoisting bitcasts: " << dot->parent()->ToString();
-
-  auto callers = call_graph->GetComputationCallers(dot->parent());
-  if (auto status = MaybeInsertRootBitcast(dot, absl::MakeSpan(callers));
-      !status.ok()) {
-    VLOG(2) << "Failed to insert root bitcast: " << status;
-  }
-  VLOG(2) << "After inserting root bitcast: " << dot->parent()->ToString();
-
-  auto def_before_use = dot->parent()->MakeInstructionPostOrder();
-  for (HloInstruction* instruction :
-       llvm::make_range(def_before_use.rbegin(), def_before_use.rend())) {
-    if (!HloPredicateIsOp<HloOpcode::kBitcast, HloOpcode::kReshape>(
-            instruction)) {
-      continue;
-    }
-    VLOG(2) << "Hoisting bitcast upwards " << instruction->ToString();
-    auto status =
-        HoistBitcastUpwardsToCallers(instruction, absl::MakeSpan(callers));
-    if (!status.ok()) {
-      VLOG(2) << "Failed to hoist " << instruction->ToString()
-              << " upwards: " << status;
-    }
-  }
-
-  VLOG(2) << "After hoisting bitcasts: " << dot->parent()->ToString();
-  return absl::OkStatus();
-}
-
 class NestGemmFusionVisitor : public DfsHloRewriteVisitor {
  public:
   explicit NestGemmFusionVisitor(
@@ -1171,8 +386,6 @@ class NestGemmFusionVisitor : public DfsHloRewriteVisitor {
       }
     }
 
-    TF_RETURN_IF_ERROR(
-        TryHoistBitcastsInComputationToCallers(instr, call_graph));
     TF_RETURN_IF_ERROR(MakeNestedFusionFromGemmFusion(
         fusion, instr, mlir_context_, device_description_));
 
@@ -1359,32 +572,5 @@ absl::StatusOr<BlockLevelParameters> FindBlockLevelParameters(
       "Couldn't find output tile sizes that satisfy ", tiled_dot.ToString()));
 }
 
-absl::InlinedVector<std::pair<int64_t, int64_t>, 8>
-CommonFactorsMergingTrivialRanges(absl::Span<const int64_t> a,
-                                  absl::Span<const int64_t> b) {
-  // CommonFactors does what we need but it also creates empty groups with
-  // product of 1, e.g. `[1] -> []` or `[] -> [1]`. We remove the bounds of
-  // such ranges to merge them with neighbors. There are many different ways
-  // to do this, here we continously append ranges to the start of the next
-  // group unless it is the very last range.
-  absl::InlinedVector<std::pair<int64_t, int64_t>, 8> bounds =
-      CommonFactors(a, b);
-  for (size_t i = 0; i + 1 < bounds.size() && bounds.size() > 2;) {
-    auto [a_start, b_start] = bounds[i];
-    auto [a_end, b_end] = bounds[i + 1];
-    if (a_start != a_end && b_start != b_end) {
-      i++;
-      continue;
-    }
-    if (i + 2 == bounds.size()) {
-      // Very last range - append it to the previous one.
-      bounds.erase(bounds.begin() + i);
-    } else {
-      bounds.erase(bounds.begin() + i + 1);
-    }
-  }
-  return bounds;
-}
-
 }  // namespace detail
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h
index bc1a54cfadd09a..720dd8b116bdb0 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion.h
@@ -16,14 +16,9 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_TRANSFORMS_NEST_GEMM_FUSION_H_
 #define XLA_SERVICE_GPU_TRANSFORMS_NEST_GEMM_FUSION_H_
 
-#include <cstdint>
-#include <utility>
-
 #include "absl/container/flat_hash_set.h"
-#include "absl/container/inlined_vector.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "absl/types/span.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -86,16 +81,6 @@ absl::StatusOr<BlockLevelParameters> FindBlockLevelParameters(
     mlir::MLIRContext* mlir_context,
     const se::DeviceDescription& device_description);
 
-// Returns the start indices of consecutive non-overlapping subsequences of `a`
-// and `b` with the same product (see `CommonFactors` from `util.h`) grouping
-// ranges having product of 1 with neighbors.
-//
-// For example, if a=[2, 5, 1, 3] and b=[1, 10, 3, 1], the result will be
-// {{0, 0}, {2, 2}, {4, 4}}, grouping [2,5] with [1,10] and [1,3] with [3,1].
-absl::InlinedVector<std::pair<int64_t, int64_t>, 8>
-CommonFactorsMergingTrivialRanges(absl::Span<const int64_t> a,
-                                  absl::Span<const int64_t> b);
-
 }  // namespace detail
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
index bab6311f57991e..da4c0d8adc6eb1 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
@@ -85,7 +85,7 @@ class NestGemmFusionTest : public HloHardwareIndependentTestBase {
           se::GpuComputeCapability{se::CudaComputeCapability::Ampere()})};
   mlir::MLIRContext mlir_context_;
 
-  std::unique_ptr<VerifiedHloModule> ParseAndRunNestGemmFusion(
+  std::unique_ptr<VerifiedHloModule> RunNestGemmFusion(
       absl::string_view hlo, const bool expect_change = true) {
     std::unique_ptr<VerifiedHloModule> module =
         ParseAndReturnVerifiedModule(hlo).value();
@@ -120,7 +120,7 @@ ENTRY entry {
     }
 })";
 
-  std::unique_ptr<VerifiedHloModule> module = ParseAndRunNestGemmFusion(hlo);
+  std::unique_ptr<VerifiedHloModule> module = RunNestGemmFusion(hlo);
   const HloInstruction* fusion = nullptr;
   ASSERT_THAT(module->entry_computation()->root_instruction(),
               GmockMatch(match::Fusion(&fusion)));
@@ -174,7 +174,7 @@ ENTRY e {
                          "split_k":1,"num_stages":1,"num_warps":2,
                          "num_ctas":1}}}
 })";
-  std::unique_ptr<VerifiedHloModule> module = ParseAndRunNestGemmFusion(hlo);
+  std::unique_ptr<VerifiedHloModule> module = RunNestGemmFusion(hlo);
   HloComputation* fusion_computation = module->entry_computation()
                                            ->root_instruction()
                                            ->fused_instructions_computation();
@@ -187,6 +187,43 @@ ENTRY e {
               GmockMatch(match::Concatenate(match::Fusion(), match::Fusion())));
 }
 
+TEST_F(NestGemmFusionTest, CreatesTwoNestedFusionsFromSameParameter) {
+  absl::string_view hlo = R"(
+dot {
+  p0 = f32[32] parameter(0)
+  lhs = f32[4,8] reshape(p0)
+  rhs = f32[8,4] reshape(p0)
+  ROOT dot = f32[4,4] dot(lhs, rhs),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+
+ENTRY entry {
+  p0 = f32[32] parameter(0)
+  ROOT fusion = f32[4,4] fusion(p0),
+    kind=kCustom, calls=dot, backend_config={
+      "fusion_backend_config": {
+        "kind":"__triton_gemm",  "triton_gemm_config": {
+          "block_m":"4", "block_n":"4", "block_k":"8",
+          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
+        }
+      }
+    }
+}
+)";
+  std::unique_ptr<VerifiedHloModule> module = RunNestGemmFusion(hlo);
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+CHECK: {{.*}} {
+CHECK-NEXT: {{.*}} f32[32]{0} parameter(0)
+CHECK-NEXT: ROOT {{.*}} reshape
+CHECK-NEXT: }
+CHECK: {{.*}} {
+CHECK-NEXT: {{.*}} f32[32]{0} parameter(0)
+CHECK-NEXT: ROOT {{.*}} reshape
+)"),
+      IsOkAndHolds(true));
+}
+
 // TODO(b/393299275): update test to use a unsupported operation.
 TEST_F(NestGemmFusionTest, DISABLED_UnsupportedComputationsAreNotChanged) {
   // Fusions other than kTritonNestedGemmFusionKind are not supported.
@@ -234,7 +271,7 @@ ENTRY e {
   ROOT result = (f32[128,128], f32[8192,512]) tuple(r1, r2)
 }
 )";
-  std::unique_ptr<VerifiedHloModule> module = ParseAndRunNestGemmFusion(hlo);
+  std::unique_ptr<VerifiedHloModule> module = RunNestGemmFusion(hlo);
   HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
   EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kFusion);
@@ -251,1279 +288,6 @@ ENTRY e {
             "__triton_nested_gemm_fusion");
 }
 
-class NestGemmFusionReshapeTest
-    : public NestGemmFusionTest,
-      public ::testing::WithParamInterface<HloOpcode> {};
-
-// Tests hoisting of bitcasts which would otherwise trigger unsatisfiable
-// constraints during symbolic tile analysis.
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedOutOfGemmFusions) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  lhs = f32[21] parameter(0)
-  bitcast = f32[3,7]{0,1} $0(lhs)
-  rhs = f32[7,11] parameter(1)
-  ROOT dot = f32[3,11] dot(bitcast, rhs),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY entry {
-  p0 = f32[21] parameter(0)
-  p1 = f32[7,11] parameter(1)
-  ROOT fusion = f32[3,11] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config": {
-        "kind":"__triton_gemm",  "triton_gemm_config": {
-          "block_m":"32", "block_n":"64", "block_k":"16",
-          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
-        }
-      }
-    }
-}
-)";
-
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-
-  const HloInstruction* fusion = nullptr;
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(match::Fusion(&fusion)));
-  EXPECT_THAT(fusion->operand(0), GmockMatch(match::Bitcast()));
-  EXPECT_THAT(*fusion, OutputTileSizesIs(ElementsAre(32, 64)));
-
-  const HloInstruction* lhs = nullptr;
-  const HloInstruction* rhs = nullptr;
-  EXPECT_THAT(fusion->fused_expression_root(),
-              GmockMatch(match::Dot(match::Fusion(&lhs), match::Fusion(&rhs))));
-  EXPECT_THAT(*lhs, OutputTileSizesIs(ElementsAre(32, 16)));
-  EXPECT_THAT(*rhs, OutputTileSizesIs(ElementsAre(16, 64)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, SupportsTwoBitcastsFromSameParameter) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  p0 = f32[32] parameter(0)
-  lhs = f32[4,8] $0(p0)
-  rhs = f32[8,4] $0(p0)
-  ROOT dot = f32[4,4] dot(lhs, rhs),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY entry {
-  p0 = f32[32] parameter(0)
-  ROOT fusion = f32[4,4] fusion(p0),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config": {
-        "kind":"__triton_gemm",  "triton_gemm_config": {
-          "block_m":"4", "block_n":"4", "block_k":"8",
-          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
-        }
-      }
-    }
-}
-)";
-
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-
-  const HloInstruction* fusion = nullptr;
-  ASSERT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(match::Fusion(&fusion)));
-  EXPECT_THAT(*fusion, OutputTileSizesIs(ElementsAre(4, 4)));
-
-  const HloInstruction* lhs = nullptr;
-  const HloInstruction* rhs = nullptr;
-  EXPECT_THAT(fusion->fused_expression_root(),
-              GmockMatch(match::Dot(match::Fusion(&lhs), match::Fusion(&rhs))));
-  EXPECT_THAT(*lhs, OutputTileSizesIs(ElementsAre(4, 8)));
-  EXPECT_THAT(*rhs, OutputTileSizesIs(ElementsAre(8, 4)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsCanBeHoistedPastOtherBitcasts) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  lhs = f32[3,7] parameter(0)
-  bitcast0 = f32[21] $0(lhs)
-  bitcast1 = f32[3,7] $0(bitcast0)
-  rhs = f32[7,11] parameter(1)
-  ROOT dot = f32[3,11] dot(bitcast1, rhs),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY entry {
-  p0 = f32[3, 7] parameter(0)
-  p1 = f32[7,11] parameter(1)
-  ROOT fusion = f32[3,11] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config": {
-        "kind":"__triton_gemm",  "triton_gemm_config": {
-          "block_m":"32", "block_n":"64", "block_k":"16",
-          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
-        }
-      }
-    }
-}
-)";
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsCanBeHoistedPastElementwiseEpilogues) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  lhs = f32[3,7] parameter(0)
-  rhs = f32[7,11] parameter(1)
-  dot = f32[3,11] dot(lhs, rhs),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  bitcast = f32[33] $0(dot)
-  ROOT add = f32[33] add(bitcast, bitcast)
-}
-
-ENTRY entry {
-  p0 = f32[3, 7] parameter(0)
-  p1 = f32[7,11] parameter(1)
-  ROOT fusion = f32[33] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config": {
-        "kind":"__triton_gemm",  "triton_gemm_config": {
-          "block_m":"32", "block_n":"64", "block_k":"16",
-          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
-        }
-      }
-    }
-})";
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsCanBeHoistedPastConvertEpilogues) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  lhs = f32[3,7] parameter(0)
-  rhs = f32[7,11] parameter(1)
-  dot = f32[3,11] dot(lhs, rhs),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  bitcast = f32[33] $0(dot)
-  ROOT convert = f16[33] convert(bitcast)
-}
-
-ENTRY entry {
-  p0 = f32[3, 7] parameter(0)
-  p1 = f32[7,11] parameter(1)
-  ROOT fusion = f16[33] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config": {
-        "kind":"__triton_gemm",  "triton_gemm_config": {
-          "block_m":"32", "block_n":"64", "block_k":"16",
-          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
-        }
-      }
-    }
-})";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK: f16[3,11]{1,0} convert(
-CHECK: f16[3,11]{1,0} fusion(
-)"),
-      IsOkAndHolds(true));
-
-  ASSERT_OK(verifier().Run(module.get()).status());
-}
-
-// We cannot hoist bitcasts past transposes, but we don't need to hoist
-// because the bitcast is not rank-expanding and symbolic tile analysis
-// works fine.
-TEST_P(NestGemmFusionReshapeTest, BitcastsCannotBeHoistedPastTransposes) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  p0 = f32[72,36,2] parameter(0)
-  transpose0 = f32[72,2,36] transpose(p0), dimensions={0,2,1}
-  bitcast0 = f32[144,36] $0(transpose0)
-  p1 = f32[36,3] parameter(1)
-  dot = f32[144,3] dot(bitcast0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  bitcast1 = f32[144,3] $0(dot)
-  ROOT transpose1 = f32[3,144] transpose(bitcast1), dimensions={1,0}
-}
-
-ENTRY entry {
-  p0 = f32[72,36,2] parameter(0)
-  p1 = f32[36,3] parameter(1)
-  ROOT fusion = f32[3,144] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config":{
-        "kind":"__triton_gemm","triton_gemm_config":{
-          "block_m":"128","block_n":"16","block_k":"32",
-          "split_k":"1","num_stages":"4","num_warps":"4","num_ctas":"1"
-        }
-      }
-    }
-})";
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsKeepElementSizeInBits) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  lhs = s8[21]{0:E(4)} parameter(0)
-  c1 = s8[21] convert(lhs)
-  c2 = f32[21] convert(c1)
-  b0 = f32[3,7] $0(c2)
-  rhs = f32[7,11] parameter(1)
-  dot = f32[3,11] dot(b0, rhs), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  b1 = f32[33] $0(dot)
-  ROOT c = s8[33]{0:E(4)} convert(b1)
-}
-
-ENTRY entry {
-  p0 = s8[21]{0:E(4)} parameter(0)
-  p1 = f32[7,11] parameter(1)
-  ROOT fusion = s8[33]{0:E(4)} fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config": {
-        "kind":"__triton_gemm",  "triton_gemm_config": {
-          "block_m":"32", "block_n":"64", "block_k":"16",
-          "split_k":"1", "num_stages":"1", "num_warps":"1", "num_ctas":"1"
-        }
-      }
-    }
-})";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-  CHECK: ENTRY
-  CHECK: {{.*}} = s8[3,7]{1,0:E(4)} bitcast({{.*}})
-  CHECK: [[fusion:[^ ]+]] = s8[3,11]{1,0:E(4)} fusion({{.*}})
-  CHECK: ROOT {{.*}} = s8[33]{0:E(4)} bitcast([[fusion]])
-)"),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
-}
-
-TEST_P(NestGemmFusionReshapeTest, TritonFusionEmitterDeviceLegacyTestSample1) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  p0 = f16[1,16,17,3] parameter(0)
-  bitcast0 = f16[16,51] $0(f16[1,16,17,3] p0)
-  p1 = s8[16,17,3] parameter(1)
-  bitcast1 = s8[16,51] $0(s8[16,17,3] p1)
-  convert = f16[16,51] convert(s8[16,51] bitcast1)
-  bitcast2 = f16[51,16]{0,1} $0(f16[16,51] convert)
-  dot = f16[16,16] dot(bitcast0, bitcast2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  ROOT bitcast3 = f16[1,16,16] $0(f16[16,16] dot)
-}
-
-ENTRY entry {
-  p0 = f16[1,16,17,3] parameter(0)
-  p1 = s8[16,17,3] parameter(1)
-  ROOT fusion = f16[1,16,16] fusion(f16[1,16,17,3] p0, s8[16,17,3] p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config":{
-        "kind":"__triton_gemm","triton_gemm_config":{
-          "block_m":"16","block_n":"16","block_k":"32",
-          "split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"
-        }
-      }
-    }
-})";
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, TritonFusionEmitterDeviceLegacyTestSample2) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  p0 = pred[3,122,96,12] parameter(0)
-  transpose = pred[3,96,12,122] transpose(p0), dimensions={0,2,3,1}
-  bitcast0 = pred[3456,122] $0(transpose)
-  convert0 = f16[3456,122] convert(bitcast0)
-  p1 = pred[1,5,122] parameter(1)
-  bitcast1 = pred[5,122] $0(p1)
-  convert1 = f16[5,122] convert(bitcast1)
-  bitcast2 = f16[122,5]{0,1} $0(convert1)
-  dot.1 = f16[3456,5] dot(convert0, bitcast2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  ROOT bitcast3 = f16[3,96,12,1,5] $0(dot.1)
-}
-
-ENTRY entry_computation {
-  p0 = pred[3,122,96,12] parameter(0)
-  p1 = pred[1,5,122] parameter(1)
-  ROOT gemm_fusion_dot = f16[3,96,12,1,5] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config":{
-        "kind":"__triton_gemm","triton_gemm_config":{
-          "block_m":"4","block_n":"16","block_k":"128",
-          "split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"
-        }
-      }
-    }
-})";
-  // Note: block sizes were 16,16,32, but that now fails to satisfy constraints.
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, TritonFusionEmitterDeviceLegacyTestSample3) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  p0 = f32[1,40] parameter(0)
-  bitcast0 = f32[40] $0(p0)
-  bitcast1 = f32[40,1] $0(bitcast0)
-  p1 = f32[1,40,250000] parameter(1)
-  bitcast2 = f32[40,250000] $0(p1)
-  dot = f32[1,250000] dot(bitcast1, bitcast2), lhs_contracting_dims={0}, rhs_contracting_dims={0}
-  bitcast3 = f32[250000] $0(dot)
-  ROOT bitcast4 = f32[1,250000] $0(bitcast3)
-}
-
-ENTRY entry_computation {
-  p0 = f32[1,40] parameter(0)
-  p1 = f32[1,40,250000] parameter(1)
-  ROOT gemm_fusion_dot.2 = f32[1,250000] fusion(p0, p1),
-    kind=kCustom, calls=dot, backend_config={
-      "fusion_backend_config":{
-        "kind":"__triton_gemm","triton_gemm_config":{
-          "block_m":"16","block_n":"16","block_k":"32",
-          "split_k":"1","num_stages":"1","num_warps":"4","num_ctas":"1"
-        }
-      }
-    }
-})";
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedPastCompare) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = s32[11,24,128]{2,1,0} parameter(0)
-  p1 = s32[11,24,128]{2,1,0} parameter(1)
-  eq = pred[11,24,128]{2,1,0} compare(p0, p1), direction=EQ
-  eq_reshape = pred[264,128]{1,0} $0(eq)
-  eq_f32 = f32[264,128]{1,0} convert(eq_reshape)
-  p2 = f32[128,8]{1,0} parameter(2)
-  ROOT result = f32[264,8]{1,0} dot(eq_f32, p2),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = s32[11,24, 128]{2,1,0} parameter(0)
-  p1 = s32[11,24,128]{2,1,0} parameter(1)
-  p2 = f32[128,8]{1,0} parameter(2)
-  ROOT result = f32[264,8] fusion(p0, p1, p2), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {
-      "block_m":32,"block_n":16,"block_k":128,
-      "split_k":1,"num_stages":1,"num_warps":4, "num_ctas":1}}}}
-)";
-  ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedUpThroughBroadcasts) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[11,1,24,1] parameter(0)
-  p0_broadcast = f32[11,1,24,1,128] broadcast(p0), dimensions={0,1,2,3}
-  p0_reshape = f32[264,128] $0(p0_broadcast)
-
-  p1 = f32[128,8]{1,0} parameter(1)
-  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[11,1,24,1] parameter(0)
-  p1 = f32[128,8] parameter(1)
-  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-// Broadcast fusion:
-CHECK: {{.*}} {
-CHECK-NEXT: [[broadcast_p0:[^ ]+]] = f32[264]{0} parameter(0)
-CHECK-NEXT: ROOT {{.*}} = f32[264,128]{1,0} broadcast([[broadcast_p0]]), dimensions={0}
-CHECK-NEXT: }
-CHECK: ENTRY {{.*}} {
-CHECK: [[entry_p0:[^ ]+]] = f32[11,1,24,1]{3,2,1,0} parameter(0)
-CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsAreHoistedUpThroughBroadcastsWithTrivialDimensions) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[11,24,1] parameter(0)
-  p0_broadcast = f32[11,1,24,1,128] broadcast(p0), dimensions={0,2,3}
-  p0_reshape = f32[264,128] $0(p0_broadcast)
-  p1 = f32[128,8]{1,0} parameter(1)
-  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[11,24,1] parameter(0)
-  p1 = f32[128,8] parameter(1)
-  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-// Broadcast fusion:
-CHECK: {{.*}} {
-CHECK-NEXT: [[broadcast_p0:[^ ]+]] = f32[264]{0} parameter(0)
-CHECK-NEXT: ROOT {{.*}} = f32[264,128]{1,0} broadcast([[broadcast_p0]]), dimensions={0}
-CHECK-NEXT: }
-CHECK: ENTRY {{.*}} {
-CHECK: [[entry_p0:[^ ]+]] = f32[11,24,1]{{.*}} parameter(0)
-CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastOfOperandAndBroadcastDimsIsNotHoistedUp) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[3,4] parameter(0)
-  p1 = f32[64,7]{1,0} parameter(1)
-  broadcast = f32[3,4,16] broadcast(p0), dimensions={0,1}
-  // Bitcast mixes operand and broadcasted dimensions and cannot be hoisted.
-  reshape = f32[3,64] $0(broadcast)
-  ROOT dot = f32[3,7]{1,0} dot(reshape, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[3,4] parameter(0)
-  p1 = f32[64,7] parameter(1)
-  ROOT result = f32[3,7] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  // Cos should not be rewritten as we cannot hoist bitcast.
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           absl::Substitute(R"(
-CHECK:      f32[3,4,16]{2,1,0} broadcast
-CHECK-NEXT: f32[3,64]{1,0} $0
-)",
-                                            HloOpcodeString(opcode))),
-              IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastOfOperandAndBroadcastDimsIsNotHoistedDown) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[6,7] parameter(0)
-  p1 = f32[5,7]{1,0} parameter(1)
-  dot = f32[6,5]{1,0} dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  // Bitcast mixes operand and broadcasted dimensions and cannot be hoisted.
-  reshape = f32[2,3,5] $0(dot)
-  ROOT broadcast = f32[2,4,3,5] broadcast(reshape), dimensions={0,2,3}
-}
-
-ENTRY e {
-  p0 = f32[6,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  ROOT result = f32[2,4,3,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  // Cos should not be rewritten as we cannot hoist bitcast.
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           absl::Substitute(R"(
-CHECK:      f32[2,3,5]{2,1,0} $0
-CHECK-NEXT: f32[2,4,3,5]{3,2,1,0} broadcast
-)",
-                                            HloOpcodeString(opcode))),
-              IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsAreHoistedUpThroughBroadcastDiamonds) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[3,5] parameter(0)
-  b0 = f32[3,5,77,1] broadcast(p0), dimensions={0,1}
-  b1 = f32[3,5,1] broadcast(p0), dimensions={0,1}
-  b2 = f32[3,5,77,1] broadcast(b1), dimensions={0,1,3}
-  sum = add(b0, b2)
-  sum_reshape = f32[15,77] $0(sum)
-  p1 = f32[77,8]{1,0} parameter(1)
-  ROOT result = f32[15,8] dot(sum_reshape, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[3,5] parameter(0)
-  p1 = f32[77,8] parameter(1)
-  ROOT result = f32[15,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK: [[p0:[^ ]+]] = f32[15]{0} parameter(0)
-CHECK-DAG: {{.*}} = f32[15,77]{1,0} broadcast([[p0]]), dimensions={0}
-CHECK-DAG: [[br:[^ ]+]] = f32[15]{0} broadcast([[p0]]), dimensions={0}
-CHECK-DAG: {{.*}} = f32[15,77]{1,0} broadcast([[br]]), dimensions={0}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedOverBroadcasts) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[11,1,24,1] parameter(0)
-  p0_broadcast = f32[11,1,24,1,128,1] broadcast(p0), dimensions={0,1,2,5}
-  p0_reshape = f32[264,128] $0(p0_broadcast)
-
-  p1 = f32[128,8]{1,0} parameter(1)
-  ROOT result = f32[264,8]{1,0} dot(p0_reshape, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[11,1,24,1] parameter(0)
-  p1 = f32[128,8] parameter(1)
-  ROOT result = f32[264,8] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           R"(
-// Broadcast fusion:
-CHECK: {{.*}} {
-CHECK-NEXT: [[broadcast_p0:[^ ]+]] = f32[264]{0} parameter(0)
-CHECK-NEXT: ROOT {{.*}} = f32[264,128]{1,0} broadcast([[broadcast_p0]]), dimensions={0}
-CHECK-NEXT: }
-CHECK: ENTRY {{.*}} {
-CHECK: [[entry_p0:[^ ]+]] = f32[11,1,24,1]{3,2,1,0} parameter(0)
-CHECK: {{.*}} = f32[264]{0} bitcast([[entry_p0]])
-)"),
-
-              IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsLayoutIsPreserved) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-gemm_dot {
-  p0 = pred[3,122,96,12] parameter(0)
-  bitcast0 = pred[3,122,1152] $0(p0)
-  transpose0 = pred[3,1152,122] transpose(bitcast0), dimensions={0,2,1}
-  bitcast2 = pred[3456,122] $0(transpose0)
-  convert0 = f16[3456,122] convert(bitcast2)
-  p1 = pred[1,5,122] parameter(1)
-  bitcast3 = pred[5,122] $0(p1)
-  convert1 = f16[5,122] convert(bitcast3)
-  bitcast4 = f16[122,5]{0,1} $0(convert1)
-  dot0 = f16[3456,5]{1,0} dot(convert0, bitcast4), lhs_contracting_dims={1},
-    rhs_contracting_dims={0}
-  ROOT bitcast5 = f16[3,96,12,1,5] $0(dot0)
-}
-
-ENTRY e {
-  p0 = pred[3,122,96,12] parameter(0)
-  p1 = pred[1,5,122] parameter(1)
-  ROOT fusion = f16[3,96,12,1,5] fusion(p0, p1), kind=kCustom, calls=gemm_dot,
-    backend_config={"fusion_backend_config":{kind:"__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":32,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
-}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           absl::Substitute(R"(
-CHECK: {{.*}} {
-CHECK: transpose
-CHECK: [[bitcast_or_reshape:[^ ]+]] = pred[3456,122]{1,0} $0({{.*}})
-CHECK: ROOT {{.*}} = f16[3456,122]{1,0} convert([[bitcast_or_reshape]])
-CHECK-NEXT: }
-CHECK: {{.*}} {
-CHECK-NOT: $0
-CHECK: ROOT {{.*}} = f16[122,5]{0,1} convert({{.*}})
-CHECK-NEXT: }
-CHECK: ENTRY {{.*}} {
-CHECK: {{.*}} = pred[122,5]{0,1} bitcast({{.*}})
-)",
-                                            HloOpcodeString(opcode))),
-              IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       CheckDimensionsOfBroadcastAfterBitcastIsHoisted) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-dot {
-  p0 = bf16[1,8] parameter(0)
-  broadcast0 = bf16[1,8,8] broadcast(p0), dimensions={0,2}
-  lhs = bf16[1,2,4,8] $0(broadcast0)
-
-  p1 = bf16[1,8] parameter(1)
-  broadcast1 = bf16[1,8,8] broadcast(p1), dimensions={0,2}
-  rhs = bf16[1,2,4,8] $0(broadcast1)
-
-  ROOT dot = bf16[2,1,4,4] dot(lhs, rhs),
-    lhs_contracting_dims={3}, lhs_batch_dims={1,0},
-    rhs_contracting_dims={3}, rhs_batch_dims={1,0}
-}
-
-ENTRY entry {
-  p0 = bf16[1,8] parameter(0)
-  ROOT fusion = bf16[2,1,4,4] fusion(p0, p0), kind=kCustom, calls=dot,
-    backend_config={"fusion_backend_config":{kind:"__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":32,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
-})";
-
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={3}
-CHECK: bf16[1,2,4,8]{{.*}} broadcast({{.*}}), dimensions={3}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedUpThroughTransposes) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[7,6] parameter(0)
-  transpose = f32[6,7] transpose(p0), dimensions={1,0}
-  bitcast = f32[2,3,7] $0(transpose)
-  p1 = f32[2,5,7] parameter(1)
-  ROOT result = f32[2,3,5] dot(bitcast, p1),
-    lhs_contracting_dims={2}, lhs_batch_dims={0},
-    rhs_contracting_dims={2}, rhs_batch_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[7,6] parameter(0)
-  p1 = f32[2,5,7] parameter(1)
-  ROOT result = f32[2,3,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      ROOT transpose
-CHECK-SAME: f32[2,3,7]{2,1,0} transpose
-CHECK-SAME: dimensions={1,2,0}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsWithSize1DimensionsAreHoistedUpThroughTransposes) {
-  const HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[7,6] parameter(0)
-  transpose = f32[6,7] transpose(p0), dimensions={1,0}
-  bitcast = f32[1,6,7] $0(transpose)
-  p1 = f32[1,5,7] parameter(1)
-  ROOT result = f32[1,6,5] dot(bitcast, p1),
-    lhs_contracting_dims={2}, lhs_batch_dims={0},
-    rhs_contracting_dims={2}, rhs_batch_dims={0}
-}
-
-ENTRY e {
-  p0 = f32[7,6] parameter(0)
-  p1 = f32[1,5,7] parameter(1)
-  ROOT result = f32[1,6,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      ROOT transpose
-CHECK-SAME: f32[1,6,7]{2,1,0} transpose
-CHECK-SAME: dimensions={1,2,0}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       RankReducingBitcastsAreNotHoistedUpThroughTransposes) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[2,7,3] parameter(0)
-  transpose = f32[3,2,7] transpose(p0), dimensions={2,0,1}
-  $0 = f32[6,7] $0(transpose)
-  p1 = f32[5,7] parameter(1)
-  ROOT dot = f32[6,5] dot($0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-}
-
-ENTRY e {
-  p0 = f32[2,7,3] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  ROOT result = f32[6,5] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      transpose
-CHECK-SAME: f32[3,2,7]{2,1,0} transpose
-CHECK-SAME: dimensions={2,0,1}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       RankReducingBitcastsAreNotHoistedDownThroughTransposes) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[6,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  dot = f32[6,5] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  $0 = f32[2,3,5] $0(dot)
-  ROOT transpose = f32[2,5,3] transpose($0), dimensions={0,2,1}
-}
-
-ENTRY e {
-  p0 = f32[6,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  ROOT result = f32[2,5,3] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           absl::Substitute(R"(
-CHECK:      f32[2,3,5]{2,1,0} $0
-CHECK-NEXT: f32[2,5,3]{2,1,0} transpose
-)",
-                                            HloOpcodeString(opcode))),
-              IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       HoistingBitcastDoesNotIntroduceArtificialDimension) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-gemm_dot {
-  p0 = f16[3,122,1152] parameter(0)
-  transpose = f16[3,1152,122] transpose(p0), dimensions={0,2,1}
-  bitcast0 = f16[3,96,12,122] $0(transpose)
-  bitcast1 = f16[3456,122] $0(bitcast0)
-  p1 = f16[122,5] parameter(1)
-  ROOT dot = f16[3456,5]{1,0} dot(bitcast1, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-}
-
-ENTRY e {
-  p0 = f16[3,122,1152] parameter(0)
-  p1 = f16[122,5] parameter(1)
-  ROOT fusion = f16[3456,5] fusion(p0, p1), kind=kCustom, calls=gemm_dot,
-    backend_config={"fusion_backend_config":{kind:"__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":32,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}
-}
-          )";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  // Checks that transpose is on rank 3 tensor from hoisting bitcast1, not rank
-  // 4 tensor from hoisting bitcast0 first and then failing to hoist bitcast1.
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      transpose
-CHECK-SAME: f16[3,1152,122]{2,1,0} transpose
-CHECK-SAME: dimensions={0,2,1}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedDownThroughTransposes) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[2,3,7] parameter(0)
-  p1 = f32[2,5,7] parameter(1)
-  dot = f32[2,3,5] dot(p0, p1),
-    lhs_contracting_dims={2}, lhs_batch_dims={0},
-    rhs_contracting_dims={2}, rhs_batch_dims={0}
-  bitcast = f32[6,5] $0(dot)
-  ROOT transpose = f32[5,6] transpose(bitcast), dimensions={1,0}
-}
-
-ENTRY e {
-  p0 = f32[2,3,7] parameter(0)
-  p1 = f32[2,5,7] parameter(1)
-  ROOT result = f32[5,6] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      ROOT transpose
-CHECK-SAME: f32[5,2,3]{2,1,0} transpose
-CHECK-SAME: dimensions={2,0,1}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastsAreHoistedDownThroughBroadcasts) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  dot = f32[3,5] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  bitcast = f32[15] $0(dot)
-  ROOT broadcast = f32[2,15,6] broadcast(bitcast), dimensions={1}
-}
-
-ENTRY e {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  ROOT result = f32[2,15,6] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      ROOT broadcast
-CHECK-SAME: f32[3,5,6,2]{2,1,0,3} broadcast
-CHECK-SAME: dimensions={0,1}
-)"),
-      IsOkAndHolds(true));
-}
-
-// TODO(b/467306121): handle the case when we need to sink the reshape through
-// broadcast.
-TEST_P(NestGemmFusionReshapeTest,
-       DISABLED_BitcastsAreHoistedDownThroughBroadcastsWithTrivialDimensions) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[6,7] parameter(1)
-  dot = f32[3,6] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  bitcast = f32[3,2,3] $0(dot)
-  ROOT broadcast = f32[3,2,1,3,7] broadcast(bitcast), dimensions={0,1,3}
-}
-
-ENTRY e {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[6,7] parameter(1)
-  ROOT result = f32[3,2,1,3,7] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnVerifiedModule(
-                              absl::Substitute(hlo, HloOpcodeString(opcode))));
-  ASSERT_THAT(
-      NestGemmFusion(device_description_, &mlir_context_).Run(module.get()),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK:      ROOT broadcast
-CHECK-SAME: f32[3,5,6,2]{2,1,0,3} broadcast
-CHECK-SAME: dimensions={0,1}
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsAreHoistedDownThroughBroadcastsWithNonDefaultLayout) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[6,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  dot = f32[6,5] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  bitcast = f32[2,3,5]{2,1,0} $0(dot)
-  ROOT broadcast = f32[2,3,5]{2,0,1} broadcast(bitcast), dimensions={0,1,2}
-}
-
-ENTRY e {
-  p0 = f32[6,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  ROOT result = f32[2,3,5]{2,0,1} fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           absl::Substitute(R"(
-CHECK:      f32[2,3,5]{2,1,0} $0(dot)
-CHECK-NEXT: f32[2,3,5]{2,0,1} broadcast
-)",
-                                            HloOpcodeString(opcode))),
-              IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest, BitcastRootsAreHoistedDown) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  dot = f32[3,5] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  ROOT bitcast = f32[15] $0(dot)
-}
-
-ENTRY e {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  ROOT result = f32[15] fusion(p0, p1), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK: ROOT dot
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastAreHoistedDownThroughBinaryElementwiseOps) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-triton_dot {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  p2 = f32[15] parameter(2)
-  dot = f32[3,5] dot(p0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  $0 = f32[15] $0(dot)
-  ROOT add = f32[15] add($0, p2)
-}
-
-ENTRY e {
-  p0 = f32[3,7] parameter(0)
-  p1 = f32[5,7] parameter(1)
-  p2 = f32[15] parameter(2)
-  ROOT result = f32[15] fusion(p0, p1, p2), kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":16,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK: ROOT add = f32[3,5]{1,0} add
-)"),
-      IsOkAndHolds(true));
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsWithNonDefaultLayoutAreHoistedOutThroughBroadcast) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[7,2]{0,1} parameter(0)
-  broadcast.1 = f32[15,7,2]{1,0,2} broadcast(p0), dimensions={1,2}
-  $0.1 = f32[2,7,15]{1,2,0} $0(broadcast.1)
-  p1 = f32[2,15,15]{2,1,0} parameter(1)
-  dot = f32[2,7,15]{2,1,0} dot($0.1, p1),
-    lhs_batch_dims={0}, lhs_contracting_dims={2},
-    rhs_batch_dims={0}, rhs_contracting_dims={2}
-  $0.2 = f32[15,14]{0,1} $0(dot)
-  ROOT broadcast.2 = f32[15,11,14]{0,2,1} broadcast($0.2), dimensions={0,2}
-}
-
-ENTRY e {
-  p0 = f32[7,2]{0,1} parameter(0)
-  p1 = f32[2,15,15]{2,1,0} parameter(1)
-  ROOT result = f32[15,11,14]{0,2,1} fusion(p0, p1),
-    kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK-NOT: bitcast
-CHECK-NOT: reshape
-CHECK: f32[2,7,15]{1,2,0} broadcast({{.*}}), dimensions={0,1}
-CHECK-NOT: bitcast
-CHECK-NOT: reshape
-CHECK: f32[2,7,15,11]{2,1,0,3} broadcast({{.*}}), dimensions={0,1,2}
-CHECK: ENTRY
-CHECK: f32[7,2]{0,1} parameter(0)
-CHECK: f32[2,7]{1,0} bitcast(p0
-CHECK: result = f32[2,7,15,11]{2,1,0,3} fusion
-CHECK: ROOT {{.*}} = f32[15,11,14]{0,2,1} bitcast(result)
-)"),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
-}
-
-TEST_P(NestGemmFusionReshapeTest,
-       BitcastsWithNonDefaultLayoutAreHoistedOutThroughTranspose) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[2,3,7]{0,2,1} parameter(0)
-  $0.1 = f32[7,3,2]{2,0,1} $0(p0)
-  transpose.1 = f32[3,2,7]{2,0,1} transpose($0.1), dimensions={1,2,0}
-  p1 = f32[3,5,7]{2,1,0} parameter(1)
-  dot = f32[3,2,5]{2,1,0} dot(transpose.1, p1),
-    lhs_batch_dims={0}, lhs_contracting_dims={2},
-    rhs_batch_dims={0}, rhs_contracting_dims={2}
-  $0.2 = f32[5,3,2]{0,2,1} $0(dot)
-  ROOT transpose.2 = f32[2,3,5]{0,2,1} transpose($0.2), dimensions={2,1,0}
-}
-
-ENTRY e {
-  p0 = f32[2,3,7]{0,2,1} parameter(0)
-  p1 = f32[3,5,7]{2,1,0} parameter(1)
-  ROOT result = f32[2,3,5]{0,2,1} fusion(p0, p1),
-    kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK-NOT: bitcast
-CHECK-NOT: reshape
-CHECK: f32[3,2,7]{2,0,1} transpose({{.*}}), dimensions={1,2,0}
-CHECK-NOT: bitcast
-CHECK-NOT: reshape
-CHECK: f32[3,5,2]{2,1,0} transpose({{.*}}), dimensions={0,2,1}
-CHECK: ENTRY
-CHECK: f32[2,3,7]{0,2,1} parameter(0)
-CHECK: f32[7,3,2]{2,0,1} bitcast(p0
-CHECK: result = f32[3,5,2]{2,1,0} fusion
-CHECK: ROOT {{.*}} = f32[2,3,5]{0,2,1} bitcast(result)
-)"),
-      IsOkAndHolds(true));
-  ASSERT_OK(verifier().Run(module.get()).status());
-}
-
-TEST_P(NestGemmFusionReshapeTest, MultipleBitcastsAreHoistedOut) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[3,3]{1,0} parameter(0)
-  $0.1 = f32[3,3]{1,0} $0(p0)
-  $0.2 = f32[3,3]{1,0} $0($0.1)
-  p1 = f32[3,3]{1,0} parameter(1)
-  dot = f32[3,3]{1,0} dot($0.2, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  $0.3 = f32[3,3]{1,0} $0(dot)
-  ROOT $0.4 = f32[3,3]{0,1} $0($0.3)
-}
-
-ENTRY e {
-  p0 = f32[3,3]{1,0} parameter(0)
-  ROOT result = f32[3,3]{0,1} fusion(p0, p0),
-    kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(
-      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
-CHECK-NOT: bitcast
-CHECK-NOT: reshape
-CHECK: ENTRY
-)"),
-      IsOkAndHolds(true));
-}
-
-// TODO(b/393299275): this test was not written correctly and now fails.
-TEST_P(NestGemmFusionReshapeTest,
-       DISABLED_BitcastsAreNotHoistedOutThroughLayoutChangingTranspose) {
-  HloOpcode opcode = GetParam();
-  absl::string_view hlo = R"(
-HloModule t
-
-triton_dot {
-  p0 = f32[7,2]{1,0} parameter(0)
-  $0.1 = f32[2,7]{0,1} $0(p0)
-  transpose.1 = f32[2,7]{1,0} transpose($0.1), dimensions={0,1}
-  p1 = f32[5,7]{1,0} parameter(1)
-  dot = f32[2,5]{1,0} dot(transpose.1, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={1}
-  $0.2 = f32[5,2]{0,1} $0(dot)
-  ROOT transpose.2 = f32[5,2]{1,0} transpose($0.2), dimensions={0,1}
-}
-
-ENTRY e {
-  p0 = f32[7,2]{1,0} parameter(0)
-  p1 = f32[5,7]{1,0} parameter(1)
-  ROOT result = f32[5,2]{1,0} fusion(p0, p1),
-    kind=kCustom, calls=triton_dot,
-    backend_config={"fusion_backend_config": {kind: "__triton_gemm",
-    triton_gemm_config: {"block_m":32,"block_n":16,"block_k":8,
-    "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
-)";
-  std::unique_ptr<VerifiedHloModule> module =
-      ParseAndRunNestGemmFusion(absl::Substitute(hlo, HloOpcodeString(opcode)));
-  EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
-                           absl::Substitute(R"(
-CHECK: $0.1 = f32[2,7]{0,1} $0
-CHECK: $0.2 = f32[5,2]{0,1} $0
-CHECK: ENTRY
-CHECK-NOT: bitcast
-CHECK-NOT: reshape
-        )",
-                                            HloOpcodeString(opcode))),
-              IsOkAndHolds(true));
-}
-
-INSTANTIATE_TEST_SUITE_P(NestGemmFusionReshapeTestSuite,
-                         NestGemmFusionReshapeTest,
-                         ::testing::ValuesIn({HloOpcode::kReshape,
-                                              HloOpcode::kBitcast}),
-                         [](const ::testing::TestParamInfo<HloOpcode>& info) {
-                           return std::string(HloOpcodeString(info.param));
-                         });
-
-struct CommonFactorsTestCase {
-  std::vector<int64_t> from, to;
-  absl::InlinedVector<std::pair<int64_t, int64_t>, 8> expected;
-};
-
-class CommonFactorsMergingTrivialRangesTest
-    : public ::testing::TestWithParam<CommonFactorsTestCase> {};
-
-TEST_P(CommonFactorsMergingTrivialRangesTest, Example) {
-  const CommonFactorsTestCase& test_case = GetParam();
-  EXPECT_EQ(test_case.expected, detail::CommonFactorsMergingTrivialRanges(
-                                    test_case.from, test_case.to));
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    CommonFactorsMergingTrivialRangesTestSuite,
-    CommonFactorsMergingTrivialRangesTest,
-    ::testing::Values(
-        CommonFactorsTestCase{{1}, {}, {{0, 0}, {1, 0}}},
-        CommonFactorsTestCase{{}, {1}, {{0, 0}, {0, 1}}},
-        CommonFactorsTestCase{{}, {}, {{0, 0}}},
-        CommonFactorsTestCase{{1, 2, 0}, {2, 0, 3}, {{0, 0}, {3, 3}}},
-        CommonFactorsTestCase{{2, 3, 0}, {1, 0, 1000}, {{0, 0}, {3, 3}}},
-        CommonFactorsTestCase{{1, 1, 1}, {1, 1}, {{0, 0}, {1, 1}, {3, 2}}},
-        CommonFactorsTestCase{{1, 1, 3}, {3, 1, 1}, {{0, 0}, {3, 3}}},
-        CommonFactorsTestCase{{2, 6}, {4, 3}, {{0, 0}, {2, 2}}},
-        CommonFactorsTestCase{{1, 2, 6}, {4, 1, 3, 1}, {{0, 0}, {3, 4}}},
-        CommonFactorsTestCase{{2, 3, 4, 5}, {6, 20}, {{0, 0}, {2, 1}, {4, 2}}},
-        CommonFactorsTestCase{
-            {2, 3, 4, 5, 6}, {6, 20, 6}, {{0, 0}, {2, 1}, {4, 2}, {5, 3}}},
-        CommonFactorsTestCase{{2, 2, 2, 2}, {4, 4}, {{0, 0}, {2, 1}, {4, 2}}},
-        CommonFactorsTestCase{
-            {2, 5, 1, 3}, {1, 10, 3, 1}, {{0, 0}, {2, 2}, {4, 4}}}),
-    [](const ::testing::TestParamInfo<CommonFactorsTestCase>& info) {
-      return absl::StrCat(absl::StrJoin(info.param.from, "_"), "_to_",
-                          absl::StrJoin(info.param.to, "_"));
-    });
-
 }  // namespace
 }  // namespace gpu
 }  // namespace xla

From 14c26537105096eeb91357a899e0fabc275a0527 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Fri, 12 Dec 2025 05:17:00 -0800
Subject: [PATCH 221/753] [PJRT] Remove dead code.

PiperOrigin-RevId: 843642761
---
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index bb0668ba72cf28..6b4d08c8553341 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -177,8 +177,6 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
       hlo_module.get(), std::bind(&Compiler::DefaultDeviceShapeRepresentation,
                                   gpu_compiler, std::placeholders::_1));
   DumpHloModuleIfEnabled(*hlo_module, kBeforeOptimizationsDumpName);
-  Compiler::CompileOptions opts;
-  opts.gpu_target_config = options.gpu_target_config;
 
   AotCompilationOptions aot_options(gpu_compiler->PlatformId());
   aot_options.set_gpu_target_config(*options.gpu_target_config);
@@ -212,14 +210,13 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
     return executable;
   }
 
-  CompileOptions input_options = options;
   XlaComputation xla_computation;
   TF_RETURN_IF_ERROR(MlirToXlaComputation(
       module, xla_computation,
       /*use_tuple_args=*/options.parameter_is_tupled_arguments,
       /*return_tuple=*/false,
-      /*exec_build_options=*/&input_options.executable_build_options,
+      /*exec_build_options=*/&options.executable_build_options,
       mlir::mhlo::getGpuChloToHighLevelMhloOptions()));
-  return Compile(std::move(input_options), xla_computation, topology, client);
+  return Compile(std::move(options), xla_computation, topology, client);
 }
 }  // namespace xla

From dac03bd184399cc8369fcf3421fc5f08af3b250d Mon Sep 17 00:00:00 2001
From: Aliia Khasanova <aliia@google.com>
Date: Fri, 12 Dec 2025 05:37:39 -0800
Subject: [PATCH 222/753] Add serialization of ExecutionState for
 CustomCallThunk.

This change adds the `xla.ffi.ExecutionStateProto` to the `CustomCallThunkProto` and updates the `CustomCallThunk` de/serialization to include the `ffi::ExecutionState`. The `CustomCallThunk::Create` methods now accept an optional `std::unique_ptr<ffi::ExecutionState>`, allowing a pre-existing state to be provided during deserialization.

This allows users to move the compilation of custom calls from FFI_Execute handler to FFI_Instantiate and pass the compiled kernel via ExecutionState, while keeping the CustomCallThunk serializable

PiperOrigin-RevId: 843648575
---
 .../xla/xla/backends/gpu/runtime/BUILD        |   3 +
 .../backends/gpu/runtime/custom_call_thunk.cc |  66 ++++---
 .../backends/gpu/runtime/custom_call_thunk.h  |   7 +-
 .../gpu/runtime/custom_call_thunk_test.cc     | 171 +++++++++++++++++-
 .../xla/xla/backends/gpu/runtime/thunk.proto  |   2 +
 .../thunk_proto_deserialization_test.cc       |   1 +
 third_party/xla/xla/ffi/execution_state.cc    |   7 +
 third_party/xla/xla/ffi/execution_state.h     |   1 +
 .../xla/xla/ffi/execution_state_test.cc       |  19 ++
 9 files changed, 250 insertions(+), 27 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index e51e4f946e9955..91708edcc9ef5b 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -762,7 +762,9 @@ xla_test(
         "//xla:shape_util",
         "//xla/ffi",
         "//xla/ffi:attribute_map",
+        "//xla/ffi:execution_state",
         "//xla/ffi:ffi_api",
+        "//xla/ffi:type_registry",
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service:custom_call_status_public_headers",
@@ -2765,6 +2767,7 @@ tf_proto_library(
         "//xla:xla_data_proto",
         "//xla/core/host_offloading:host_offloading_executable_proto",
         "//xla/ffi:attribute_map_proto",
+        "//xla/ffi:execution_state_proto",
         "//xla/service:buffer_assignment_proto",
         "//xla/service:hlo_proto",
         "//xla/service/gpu:backend_configs",
diff --git a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.cc
index a8bdbd6719435a..10425250431f34 100644
--- a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.cc
@@ -215,38 +215,42 @@ absl::StatusOr<std::unique_ptr<CustomCallThunk>> CustomCallThunk::Create(
     ThunkInfo thunk_info, std::string target_name,
     std::vector<NullableShapedSlice> operands,
     std::vector<NullableShapedSlice> results, ffi::AttributesMap attributes,
-    const HloComputation* called_computation, absl::string_view platform_name) {
+    const HloComputation* called_computation, absl::string_view platform_name,
+    std::unique_ptr<ffi::ExecutionState> execution_state) {
   TF_ASSIGN_OR_RETURN(ffi::HandlerRegistration registration,
                       ffi::FindHandler(target_name, platform_name));
 
   return Create(thunk_info, std::move(target_name),
                 std::move(registration.bundle), std::move(operands),
-                std::move(results), std::move(attributes), called_computation);
+                std::move(results), std::move(attributes), called_computation,
+                std::move(execution_state));
 }
 
 absl::StatusOr<std::unique_ptr<CustomCallThunk>> CustomCallThunk::Create(
     ThunkInfo thunk_info, std::string target_name,
     XLA_FFI_Handler_Bundle bundle, std::vector<NullableShapedSlice> operands,
     std::vector<NullableShapedSlice> results, ffi::AttributesMap attributes,
-    const HloComputation* called_computation) {
-  auto execution_state = std::make_unique<ffi::ExecutionState>();
-
+    const HloComputation* called_computation,
+    std::unique_ptr<ffi::ExecutionState> execution_state) {
   // Initialize FFI handler state if it has an instantiate callback.
-  if (bundle.instantiate) {
-    // At FFI handler instantiation time, we don't have any arguments or
-    // results or access to the underlying device (stream, etc.)
-    CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/0);
-
-    CallFrameBuilder::AttributesBuilder attrs;
-    attrs.Append(attributes);
-
-    builder.AddAttributes(attrs.Build());
-    CallFrame call_frame = builder.Build();
-
-    CallOptions options;
-    options.execution_state = execution_state.get();
-    TF_RETURN_IF_ERROR(Call(bundle.instantiate, call_frame, options,
-                            XLA_FFI_ExecutionStage_INSTANTIATE));
+  if (execution_state == nullptr) {
+    execution_state = std::make_unique<ffi::ExecutionState>();
+    if (bundle.instantiate) {
+      // At FFI handler instantiation time, we don't have any arguments or
+      // results or access to the underlying device (stream, etc.)
+      CallFrameBuilder builder(/*num_args=*/0, /*num_rets=*/0);
+
+      CallFrameBuilder::AttributesBuilder attrs;
+      attrs.Append(attributes);
+
+      builder.AddAttributes(attrs.Build());
+      CallFrame call_frame = builder.Build();
+
+      CallOptions options;
+      options.execution_state = execution_state.get();
+      TF_RETURN_IF_ERROR(Call(bundle.instantiate, call_frame, options,
+                              XLA_FFI_ExecutionStage_INSTANTIATE));
+    }
   }
 
   TF_ASSIGN_OR_RETURN(CallFrame call_frame,
@@ -602,6 +606,12 @@ absl::StatusOr<ThunkProto> CustomCallThunk::ToProto() const {
     *proto.mutable_custom_call_thunk()->mutable_attributes() =
         attributes_->ToProto();
   }
+
+  if (execution_state_ && execution_state_->IsSerializable()) {
+    TF_ASSIGN_OR_RETURN(
+        *proto.mutable_custom_call_thunk()->mutable_execution_state(),
+        execution_state_->ToProto());
+  }
   return proto;
 }
 
@@ -629,6 +639,14 @@ absl::StatusOr<std::unique_ptr<CustomCallThunk>> CustomCallThunk::FromProto(
         NullableShapedSlice::FromProto(result_proto, buffer_allocations));
     results.push_back(std::move(result));
   }
+
+  if (proto.api_version() != CustomCallApiVersion::API_VERSION_TYPED_FFI) {
+    // Create a thunk that uses the legacy custom call registry.
+    return CustomCallThunk::Create(
+        std::move(thunk_info), proto.target_name(), std::move(operands),
+        std::move(results), proto.opaque(), proto.api_version(), platform_name);
+  }
+
   TF_ASSIGN_OR_RETURN(ffi::AttributesMap attributes,
                       ffi::AttributesMap::FromProto(proto.attributes()));
 
@@ -643,11 +661,17 @@ absl::StatusOr<std::unique_ptr<CustomCallThunk>> CustomCallThunk::FromProto(
           "' not found in the HloModule with name '", hlo_module->name(), "'"));
     }
   }
+  std::unique_ptr<ffi::ExecutionState> execution_state;
+  if (proto.has_execution_state()) {
+    TF_ASSIGN_OR_RETURN(
+        auto state, ffi::ExecutionState::FromProto(proto.execution_state()));
+    execution_state = std::make_unique<ffi::ExecutionState>(std::move(state));
+  }
 
   return CustomCallThunk::Create(std::move(thunk_info), proto.target_name(),
                                  std::move(operands), std::move(results),
                                  std::move(attributes), called_computation,
-                                 platform_name);
+                                 platform_name, std::move(execution_state));
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.h b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.h
index 0ac5393d5238c0..f032c954acc1b5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk.h
@@ -103,8 +103,8 @@ class CustomCallThunk : public Thunk {
       std::vector<NullableShapedSlice> operands,
       std::vector<NullableShapedSlice> results,
       xla::ffi::AttributesMap attributes,
-      const HloComputation* called_computation,
-      absl::string_view platform_name);
+      const HloComputation* called_computation, absl::string_view platform_name,
+      std::unique_ptr<xla::ffi::ExecutionState> execution_state = nullptr);
 
   // Creates a serializable custom call thunk from the given XLA FFI handler
   // bundle. Note that `target_name` needs to refer to a registered XLA FFI
@@ -114,7 +114,8 @@ class CustomCallThunk : public Thunk {
       XLA_FFI_Handler_Bundle bundle, std::vector<NullableShapedSlice> operands,
       std::vector<NullableShapedSlice> results,
       xla::ffi::AttributesMap attributes,
-      const HloComputation* called_computation);
+      const HloComputation* called_computation,
+      std::unique_ptr<xla::ffi::ExecutionState> execution_state = nullptr);
 
   // Creates a custom call thunk from a bundle of handlers created with
   // xla::ffi::Bind(). Any pointer or reference lambda captures must be valid
diff --git a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc
index 8aa5c9853bfcc3..897a3bda32f429 100644
--- a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <utility>
 
 #include <gmock/gmock.h>
@@ -35,8 +36,10 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/executable_run_options.h"
 #include "xla/ffi/attribute_map.h"
+#include "xla/ffi/execution_state.h"
 #include "xla/ffi/ffi.h"
 #include "xla/ffi/ffi_api.h"
+#include "xla/ffi/type_registry.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -56,6 +59,49 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/parse_text_proto.h"
 
+namespace xla::gpu {
+struct TestState {
+  std::string value;
+};
+
+struct NonSerializableTestState {
+  int value;
+};
+
+struct FailingSerializableTestState {
+  int value;
+};
+}  // namespace xla::gpu
+
+namespace xla::ffi {
+template <>
+struct TypeRegistry::SerDes<xla::gpu::TestState> : public std::true_type {
+  static absl::StatusOr<std::string> Serialize(
+      const xla::gpu::TestState& value) {
+    return value.value;
+  }
+  static absl::StatusOr<std::unique_ptr<xla::gpu::TestState>> Deserialize(
+      absl::string_view data) {
+    return std::make_unique<xla::gpu::TestState>(
+        xla::gpu::TestState{std::string(data)});
+  }
+};
+
+template <>
+struct TypeRegistry::SerDes<xla::gpu::FailingSerializableTestState>
+    : public std::true_type {
+  static absl::StatusOr<std::string> Serialize(
+      const xla::gpu::FailingSerializableTestState& value) {
+    return absl::InternalError("Serialization failed");
+  }
+  static absl::StatusOr<std::unique_ptr<xla::gpu::FailingSerializableTestState>>
+  Deserialize(absl::string_view data) {
+    return std::make_unique<xla::gpu::FailingSerializableTestState>(
+        xla::gpu::FailingSerializableTestState{0});
+  }
+};
+}  // namespace xla::ffi
+
 namespace xla::gpu {
 namespace {
 using absl_testing::IsOk;
@@ -335,7 +381,8 @@ TEST(CustomCallThunkTest, CustomCallWithOwnedHandlersWithoutExecute) {
 absl::Status VerifyCallbackArguments(int my_attribute,
                                      ffi::AnyBuffer my_operand,
                                      ffi::Result<ffi::AnyBuffer> my_result,
-                                     const HloComputation* called_computation) {
+                                     const HloComputation* called_computation,
+                                     xla::gpu::TestState* state) {
   EXPECT_EQ(my_attribute, 42);
   EXPECT_EQ(my_operand.element_type(), xla::PrimitiveType::U8);
   EXPECT_EQ(my_operand.device_memory().opaque(),
@@ -344,6 +391,7 @@ absl::Status VerifyCallbackArguments(int my_attribute,
   EXPECT_EQ(my_result->device_memory().opaque(),
             absl::bit_cast<void*>(static_cast<intptr_t>(0xABCDEF)));
   EXPECT_EQ(called_computation->name(), "test_computation");
+  EXPECT_EQ(state->value, "some state");
   return absl::OkStatus();
 }
 
@@ -352,7 +400,8 @@ XLA_FFI_DEFINE_HANDLER(kVerifyCallbackArguments, VerifyCallbackArguments,
                            .Attr<int>("my_attribute")
                            .Arg<ffi::AnyBuffer>()
                            .Ret<ffi::AnyBuffer>()
-                           .Ctx<ffi::CalledComputation>(),
+                           .Ctx<ffi::CalledComputation>()
+                           .Ctx<ffi::State<xla::gpu::TestState>>(),
                        {ffi::Traits::kCmdBufferCompatible});
 
 constexpr absl::string_view kVerifyCallbackArgumentsCustomCallName =
@@ -383,6 +432,11 @@ TEST(CustomCallThunkTest, ProtoConversion) {
   ShapedSlice result_slice{BufferAllocation::Slice{&alloc1, 0, 1024},
                            ShapeUtil::MakeShape(U16, {512})};
 
+  auto execution_state = std::make_unique<ffi::ExecutionState>();
+  ASSERT_THAT(execution_state->Set(
+                  std::make_unique<TestState>(TestState{"some state"})),
+              IsOk());
+
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<CustomCallThunk> original_thunk,
       CustomCallThunk::Create(
@@ -391,9 +445,10 @@ TEST(CustomCallThunkTest, ProtoConversion) {
           /*operands=*/{operand_slice},
           /*results=*/{result_slice}, /*attributes=*/{{"my_attribute", 42}},
           hlo_module.entry_computation(),
-          /*platform_name=*/kTestPlatformName));
+          /*platform_name=*/kTestPlatformName, std::move(execution_state)));
   TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, original_thunk->ToProto());
   ASSERT_TRUE(proto.has_custom_call_thunk());
+  ASSERT_TRUE(proto.custom_call_thunk().has_execution_state());
   original_thunk.reset();
 
   std::array allocations = {alloc0, alloc1};
@@ -442,5 +497,115 @@ TEST(CustomCallThunkTest, DeserializationFailsWithMissingHloModule) {
               StatusIs(absl::StatusCode::kInvalidArgument));
 }
 
+TEST(CustomCallThunkTest, RoundtripWithNonSerializableExecutionState) {
+  TF_ASSERT_OK_AND_ASSIGN(se::StreamExecutor * executor, GpuExecutor());
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
+                          executor->CreateStream());
+
+  HloModuleConfig config;
+  HloModule hlo_module("test_module", config);
+  HloComputation::Builder builder("test_computation");
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(U32, {42}), "parameter"));
+  hlo_module.AddEntryComputation(builder.Build());
+
+  auto execution_state = std::make_unique<ffi::ExecutionState>();
+  ASSERT_THAT(execution_state->Set(std::make_unique<NonSerializableTestState>(
+                  NonSerializableTestState{42})),
+              IsOk());
+  EXPECT_FALSE(execution_state->IsSerializable());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<CustomCallThunk> original_thunk,
+      CustomCallThunk::Create(
+          Thunk::ThunkInfo(),
+          /*target_name=*/std::string(kVerifyCallbackArgumentsCustomCallName),
+          /*operands=*/{},
+          /*results=*/{}, /*attributes=*/{}, hlo_module.entry_computation(),
+          /*platform_name=*/kTestPlatformName, std::move(execution_state)));
+
+  TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, original_thunk->ToProto());
+  ASSERT_TRUE(proto.has_custom_call_thunk());
+  EXPECT_FALSE(proto.custom_call_thunk().has_execution_state());
+
+  original_thunk.reset();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<CustomCallThunk> new_thunk,
+      CustomCallThunk::FromProto(Thunk::ThunkInfo(), proto.custom_call_thunk(),
+                                 /*buffer_allocations=*/{}, &hlo_module,
+                                 kTestPlatformName));
+
+  EXPECT_NE(new_thunk->execution_state(), nullptr);
+  EXPECT_FALSE(new_thunk->execution_state()->IsSet());
+}
+
+TEST(CustomCallThunkTest, SerializationFails) {
+  HloModuleConfig config;
+  HloModule hlo_module("test_module", config);
+  HloComputation::Builder builder("test_computation");
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeShape(U32, {42}), "parameter"));
+  hlo_module.AddEntryComputation(builder.Build());
+
+  auto execution_state = std::make_unique<ffi::ExecutionState>();
+  ASSERT_OK(execution_state->Set(std::make_unique<FailingSerializableTestState>(
+      FailingSerializableTestState{42})));
+  EXPECT_TRUE(execution_state->IsSerializable());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<CustomCallThunk> thunk,
+      CustomCallThunk::Create(
+          Thunk::ThunkInfo(),
+          /*target_name=*/std::string(kVerifyCallbackArgumentsCustomCallName),
+          /*operands=*/{},
+          /*results=*/{}, /*attributes=*/{}, hlo_module.entry_computation(),
+          /*platform_name=*/kTestPlatformName, std::move(execution_state)));
+
+  EXPECT_THAT(thunk->ToProto(), StatusIs(absl::StatusCode::kInternal,
+                                         HasSubstr("Serialization failed")));
+}
+
+TEST(CustomCallThunkTest, LegacyCustomCallRoundTrip) {
+  TF_ASSERT_OK_AND_ASSIGN(se::StreamExecutor * executor, GpuExecutor());
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<se::Stream> stream,
+                          executor->CreateStream());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<CustomCallThunk> original_thunk,
+      CustomCallThunk::Create(
+          Thunk::ThunkInfo(),
+          /*target_name=*/"Callback_WithStatusFailed",
+          /*operands=*/{},
+          /*results=*/{}, /*opaque=*/"opaque",
+          CustomCallApiVersion::API_VERSION_STATUS_RETURNING,
+          /*platform_name=*/executor->GetPlatform()->Name()));
+
+  TF_ASSERT_OK_AND_ASSIGN(ThunkProto proto, original_thunk->ToProto());
+  original_thunk.reset();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<CustomCallThunk> new_thunk,
+      CustomCallThunk::FromProto(Thunk::ThunkInfo(), proto.custom_call_thunk(),
+                                 /*buffer_allocations=*/{},
+                                 /*hlo_module=*/nullptr,
+                                 executor->GetPlatform()->Name()));
+
+  se::StreamExecutorMemoryAllocator allocator(executor);
+  BufferAllocations empty_unused_allocations({}, 0, &allocator);
+  Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
+      ServiceExecutableRunOptions(), empty_unused_allocations,
+      /*stream=*/stream.get(),
+      /*command_buffer_trace_stream=*/stream.get(),
+      /*collective_params=*/nullptr,
+      /*collective_cliques=*/nullptr);
+
+  // We check that the new thunk behaves like the original one (returning
+  // internal error with specific message).
+  EXPECT_THAT(new_thunk->ExecuteOnStream(params),
+              StatusIs(absl::StatusCode::kInternal,
+                       HasSubstr("Legacy Custom call was executed!")));
+}
+
 }  // namespace
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 5a698ae7e8046f..f2a93d80e31eb8 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -22,6 +22,7 @@ import "xla/backends/gpu/runtime/dynamic_slice_thunk.proto";
 import "xla/backends/gpu/runtime/shaped_slice.proto";
 import "xla/core/host_offloading/host_offloading_executable.proto";
 import "xla/ffi/attribute_map.proto";
+import "xla/ffi/execution_state.proto";
 import "xla/service/buffer_assignment.proto";
 import "xla/service/gpu/gpu_conv_runner.proto";
 import "xla/service/gpu/gpu_norm_runner.proto";
@@ -290,6 +291,7 @@ message CustomCallThunkProto {
   // The name of the called computation. It needs to match the HloCompuation in
   // the HloModule that is used to deserialize the thunk.
   optional string called_computation = 7;
+  optional xla.ffi.ExecutionStateProto execution_state = 8;
 }
 
 message CustomKernelThunkProto {
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
index ca04379d3bf194..8613d387871abf 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization_test.cc
@@ -764,6 +764,7 @@ TEST(ThunkProtoDeserializationTest, CustomCallThunk) {
             }
           }
           called_computation: "called_computation"
+          execution_state {}
         }
       )pb");
   std::vector<BufferAllocation> buffer_allocations = {
diff --git a/third_party/xla/xla/ffi/execution_state.cc b/third_party/xla/xla/ffi/execution_state.cc
index 966c195d05fd7f..a3b5439a1ba1e7 100644
--- a/third_party/xla/xla/ffi/execution_state.cc
+++ b/third_party/xla/xla/ffi/execution_state.cc
@@ -125,4 +125,11 @@ bool ExecutionState::IsSet() const {
   return type_id_ != TypeRegistry::kUnknownTypeId;
 }
 
+bool ExecutionState::IsSerializable() const {
+  if (!IsSet()) {
+    return true;
+  }
+  return type_info_.serializer != nullptr;
+}
+
 }  // namespace xla::ffi
diff --git a/third_party/xla/xla/ffi/execution_state.h b/third_party/xla/xla/ffi/execution_state.h
index 17308665653dc6..d4aa091932e066 100644
--- a/third_party/xla/xla/ffi/execution_state.h
+++ b/third_party/xla/xla/ffi/execution_state.h
@@ -91,6 +91,7 @@ class ExecutionState {
   absl::StatusOr<T*> Get() const;
 
   bool IsSet() const;
+  bool IsSerializable() const;
 
  private:
   absl::Status Set(TypeId type_id, TypeInfo type_info, void* state);
diff --git a/third_party/xla/xla/ffi/execution_state_test.cc b/third_party/xla/xla/ffi/execution_state_test.cc
index 254cc1500e2113..e79a54f339edc4 100644
--- a/third_party/xla/xla/ffi/execution_state_test.cc
+++ b/third_party/xla/xla/ffi/execution_state_test.cc
@@ -123,4 +123,23 @@ TEST(ExecutionStateTest, Serialization) {
   EXPECT_EQ(static_cast<MyState*>(round_trip_data)->value, "some_state_data");
 }
 
+TEST(ExecutionStateTest, IsSerializable) {
+  ExecutionState state;
+  // Empty state is serializable (as empty proto).
+  EXPECT_TRUE(state.IsSerializable());
+
+  // State without serializer.
+  struct NoSerializer {
+    int x;
+  };
+  TF_ASSERT_OK(state.Set(std::make_unique<NoSerializer>(NoSerializer{42})));
+  EXPECT_FALSE(state.IsSerializable());
+
+  // State with serializer.
+  ExecutionState serializable_state;
+  TF_ASSERT_OK(
+      serializable_state.Set(std::make_unique<MyState>(MyState{"foo"})));
+  EXPECT_TRUE(serializable_state.IsSerializable());
+}
+
 }  // namespace xla::ffi

From a2d7bc83f6dc59938b438632dd663f41c2837797 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eusebio=20Dur=C3=A1n=20Monta=C3=B1a?= <eusebiodm@google.com>
Date: Fri, 12 Dec 2025 06:02:04 -0800
Subject: [PATCH 223/753] Fix `rocm_trace_test` missing dep and ignoring return
 value errors

The target wasn't building internally.

PiperOrigin-RevId: 843654566
---
 third_party/xla/xla/backends/profiler/gpu/BUILD               | 3 ++-
 third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD
index 1a559898f65e3e..50eb8fc6fd1980 100644
--- a/third_party/xla/xla/backends/profiler/gpu/BUILD
+++ b/third_party/xla/xla/backends/profiler/gpu/BUILD
@@ -533,7 +533,8 @@ xla_cc_test(
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
-        "@local_config_rocm//rocm:hip",
+        "@local_config_rocm//rocm:hip",  # buildcleaner: keep
+        "@local_config_rocm//rocm:rocm_headers",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
     ],
 )
diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc
index d03bb15dc80527..1d19b33269fc03 100644
--- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc
@@ -185,7 +185,7 @@ TEST(RocmTracerTest, CapturesHipEvents) {
   HIP_ASSERT_OK(hipDeviceSynchronize());
 
   tracer.Disable();
-  hipFree(device_data);
+  HIP_ASSERT_OK(hipFree(device_data));
 
 #undef HIP_ASSERT_OK
 

From 68827ae4f44cc014658460077bd3a412ed2bdcfb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 12 Dec 2025 06:17:01 -0800
Subject: [PATCH 224/753] Automated Code Change

PiperOrigin-RevId: 843659107
---
 .../xla/xla/backends/cpu/codegen/tools/ir_compiler_opt_main.cc   | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/backends/cpu/codegen/tools/ir_compiler_opt_main.cc b/third_party/xla/xla/backends/cpu/codegen/tools/ir_compiler_opt_main.cc
index 03c71381010e3a..32d24489c95343 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tools/ir_compiler_opt_main.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tools/ir_compiler_opt_main.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <cstdio>
 #include <iostream>
 #include <memory>
-#include <optional>
 #include <string>
 #include <utility>
 #include <vector>

From ca4dba961a78ef3b6f3c39075e6b31c60d77307e Mon Sep 17 00:00:00 2001
From: deeptanshusekhri <deeptanshu.sekhri@arm.com>
Date: Fri, 12 Dec 2025 15:43:11 +0000
Subject: [PATCH 225/753] [tosa] Update pass macro usage to remove deprecated
 GEN_PASS_CLASSES API (#106129)

---
 tensorflow/compiler/mlir/tosa/transforms/passes.h          | 1 -
 tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc | 7 ++++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/tosa/transforms/passes.h b/tensorflow/compiler/mlir/tosa/transforms/passes.h
index 0475d46a37a091..bd170f61cb2fb8 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tosa/transforms/passes.h
@@ -70,7 +70,6 @@ std::unique_ptr<OperationPass<func::FuncOp>> createVerifyFullyConvertedPass();
 std::unique_ptr<OperationPass<ModuleOp>> createLegalizeTFLStatefulPass();
 
 #define GEN_PASS_REGISTRATION
-#define GEN_PASS_CLASSES
 #define GEN_PASS_DECL_TOSALEGALIZETFPASS
 #define GEN_PASS_DECL_TOSALEGALIZETFLPASS
 #define GEN_PASS_DECL_TOSALEGALIZETFTFLPASS
diff --git a/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc b/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc
index e4a6ca5a6e56a5..53d21de0195999 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/strip_metadata.cc
@@ -25,7 +25,8 @@ limitations under the License.
 
 namespace mlir::tosa {
 
-#define GEN_PASS_DEF_STRIPM
+#define GEN_PASS_DEF_STRIPFUNCTIONMETADATA
+#define GEN_PASS_DEF_STRIPMODULEMETADATA
 #include "tensorflow/compiler/mlir/tosa/transforms/passes.h.inc"
 
 namespace {
@@ -45,7 +46,7 @@ static bool isTFLAttr(NamedAttribute &namedAttr) {
 }
 
 class StripModuleMetadataPass
-    : public StripModuleMetadataBase<StripModuleMetadataPass> {
+    : public impl::StripModuleMetadataBase<StripModuleMetadataPass> {
  public:
   void runOnOperation() override {
     auto moduleOp = getOperation();
@@ -59,7 +60,7 @@ class StripModuleMetadataPass
 };
 
 class StripFunctionMetadataPass
-    : public StripFunctionMetadataBase<StripFunctionMetadataPass> {
+    : public impl::StripFunctionMetadataBase<StripFunctionMetadataPass> {
  public:
   void runOnOperation() override {
     auto funcOp = getOperation();

From a83ab83e5fe10b5c6c6348ae73abaaab5827cac2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 12 Dec 2025 09:21:46 -0800
Subject: [PATCH 226/753] Squashing commits after sync tool breakage

PiperOrigin-RevId: 843718817
---
 tensorflow/compiler/mlir/tensorflow/BUILD     |   3 -
 .../mlir/tensorflow/utils/error_util_test.cc  |  98 +++++++------
 tensorflow/lite/core/c/common.h               |  10 +-
 tensorflow/lite/core/subgraph.cc              |  18 ++-
 third_party/py/python_init_toolchains.bzl     |   5 +-
 .../third_party/py/python_init_toolchains.bzl |   5 +-
 .../cpu/transforms/library_rewriter.cc        |   6 +-
 .../xla/xla/backends/gpu/collectives/BUILD    |   8 +-
 .../gpu/collectives/nvshmem_collectives.cc    | 100 ++-----------
 .../gpu/collectives/nvshmem_collectives.h     |  20 +--
 .../collectives/nvshmem_collectives_test.cc   |   4 +-
 .../xla/xla/backends/gpu/runtime/BUILD        |   5 -
 third_party/xla/xla/pjrt/distributed/BUILD    |   1 -
 .../xla/pjrt/distributed/coordination/BUILD   |   3 -
 third_party/xla/xla/pjrt/gpu/BUILD            |   1 +
 .../pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc |   9 ++
 .../xla/pjrt/stream_executor_executable.cc    |  22 +++
 .../xla/xla/pjrt/stream_executor_executable.h |  11 +-
 third_party/xla/xla/service/cpu/tests/BUILD   |   5 -
 third_party/xla/xla/service/gpu/model/BUILD   |   2 -
 .../export_manual_reduction_collectives.cc    |  93 ++++++++++++
 ...o_export_manual_reduction_collectives.mlir |  28 +++-
 .../xla/xla/stream_executor/cuda/BUILD        |  25 ++++
 .../xla/xla/stream_executor/cuda/nvshmem.cc   | 134 ++++++++++++++++++
 .../xla/xla/stream_executor/cuda/nvshmem.h    |  43 ++++++
 25 files changed, 467 insertions(+), 192 deletions(-)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nvshmem.cc
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nvshmem.h

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index cbd6bc3b283504..094070fa86a602 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -905,7 +905,6 @@ cc_library(
         "//tensorflow/core/platform:status",
         "//tensorflow/core/util:managed_stack_trace",
         "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings:string_view",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
         "@local_xla//xla/mlir/utils:error_util",
@@ -949,10 +948,8 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
         "@local_xla//xla/hlo/testlib:test",
-        "@local_xla//xla/mlir/utils:error_util",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc
index dcd71dedc9790f..a7ea08924aea5e 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc
@@ -15,57 +15,71 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
 
-#include "llvm/ADT/Twine.h"
 #include "mlir/IR/Builders.h"  // from @llvm-project
+#include "mlir/IR/Diagnostics.h"  // from @llvm-project
+#include "mlir/IR/Location.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 #include "xla/hlo/testlib/test.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
+#include "xla/tsl/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/status.h"
 
 namespace mlir {
 namespace {
 
-using testing::HasSubstr;
+using ::testing::HasSubstr;
 
-TEST(ErrorUtilTest, StatusScopedDiagnosticHandler) {
-  MLIRContext context;
-  auto id = StringAttr::get(&context, "//tensorflow/python/test.py");
-  auto loc = FileLineColLoc::get(&context, id, 0, 0);
+class ErrorUtilTest : public ::testing::Test {
+ protected:
+  ErrorUtilTest()
+      : id_(StringAttr::get(&context_, "//tensorflow/python/test.py")),
+        loc_(FileLineColLoc::get(&context_, id_, 0, 0)) {}
+
+  MLIRContext context_;
+  StringAttr id_;
+  FileLineColLoc loc_;
+};
+
+using StatusScopedDiagnosticHandlerTest = ErrorUtilTest;
+
+TEST_F(StatusScopedDiagnosticHandlerTest,
+       OkWithoutDiagnosticGetsPassedThrough) {
+  TF_ASSERT_OK(
+      StatusScopedDiagnosticHandler(&context_).Combine(tensorflow::OkStatus()));
+}
+
+TEST_F(StatusScopedDiagnosticHandlerTest,
+       VerifyDiagnosticsAreCapturedAsUnknownStatus) {
+  StatusScopedDiagnosticHandler handler(&context_);
+  emitError(loc_) << "Diagnostic message";
+  ASSERT_TRUE(tensorflow::errors::IsUnknown(handler.ConsumeStatus()));
+}
+
+TEST_F(StatusScopedDiagnosticHandlerTest, VerifyPassedInErrorsArePropagated) {
+  const Status err = tensorflow::errors::Internal("Passed in error");
+  ASSERT_TRUE(tensorflow::errors::IsInternal(
+      StatusScopedDiagnosticHandler(&context_).Combine(err)));
+}
+
+TEST_F(StatusScopedDiagnosticHandlerTest,
+       VerifyThatReportedDiagnosticsAreAppendedToPassedInError) {
+  StatusScopedDiagnosticHandler ssdh(&context_);
+  emitError(loc_) << "Diagnostic message reported";
+  emitError(loc_) << "Second diagnostic message reported";
+  const Status s =
+      ssdh.Combine(tensorflow::errors::Internal("Passed in error"));
+  ASSERT_TRUE(tensorflow::errors::IsInternal(s));
+  EXPECT_THAT(s.message(), HasSubstr("Passed in error"));
+  EXPECT_THAT(s.message(), HasSubstr("Diagnostic message reported"));
+  EXPECT_THAT(s.message(), HasSubstr("Second diagnostic message reported"));
+}
 
-  // Test OK without diagnostic gets passed through.
-  {
-    TF_ASSERT_OK(
-        StatusScopedDiagnosticHandler(&context).Combine(absl::OkStatus()));
-  }
-
-  // Verify diagnostics are captured as Unknown status.
-  {
-    StatusScopedDiagnosticHandler handler(&context);
-    emitError(loc) << "Diagnostic message";
-    ASSERT_TRUE(absl::IsUnknown(handler.ConsumeStatus()));
-  }
-
-  // Verify passed in errors are propagated.
-  {
-    Status err = tensorflow::errors::Internal("Passed in error");
-    ASSERT_TRUE(
-        absl::IsInternal(StatusScopedDiagnosticHandler(&context).Combine(err)));
-  }
-
-  // Verify diagnostic reported are append to passed in error.
-  {
-    auto function = [&]() {
-      emitError(loc) << "Diagnostic message reported";
-      emitError(loc) << "Second diagnostic message reported";
-      return tensorflow::errors::Internal("Passed in error");
-    };
-    StatusScopedDiagnosticHandler ssdh(&context);
-    Status s = ssdh.Combine(function());
-    ASSERT_TRUE(absl::IsInternal(s));
-    EXPECT_THAT(s.message(), HasSubstr("Passed in error"));
-    EXPECT_THAT(s.message(), HasSubstr("Diagnostic message reported"));
-    EXPECT_THAT(s.message(), HasSubstr("Second diagnostic message reported"));
-  }
+TEST_F(StatusScopedDiagnosticHandlerTest, VerifyThatWarningsAreIgnored) {
+  // Note: this logic is actually implemented in BaseScopedDiagnosticHandler's
+  // handler() function, but only StatusScopedDiagnosticHandler uses it.
+  StatusScopedDiagnosticHandler handler(&context_);
+  emitWarning(loc_) << "Warning message";
+  TF_EXPECT_OK(handler.ConsumeStatus());
 }
 
 TEST(ErrorUtilTest, StatusScopedDiagnosticHandlerWithFilter) {
diff --git a/tensorflow/lite/core/c/common.h b/tensorflow/lite/core/c/common.h
index 765c2bc12f2d7d..2c2e703735ef07 100644
--- a/tensorflow/lite/core/c/common.h
+++ b/tensorflow/lite/core/c/common.h
@@ -1352,7 +1352,15 @@ typedef enum TfLiteDelegateFlags {
   /// operator information using `Profiler::EventType::OPERATOR_INVOKE_EVENT`
   /// and the results will appear in the operator-wise Profiling section and not
   /// in the Delegate internal section.
-  kTfLiteDelegateFlagsPerOperatorProfiling = 4
+  kTfLiteDelegateFlagsPerOperatorProfiling = 4,
+
+  // This flag can be used by callers to hint that the delegate is likely to
+  // delegate the entire graph to a single delegate so certain allocations can
+  // be skipped.
+  // This is an ADVANCED feature and should only be used if the caller has
+  // prior knowledge that the delegate will fully delegate all subgraphs
+  // to a single delegate.
+  kTfLiteDelegateFlagsHintFullyDelegatedToSingleDelegate = 8,
 } TfLiteDelegateFlags;
 
 /// WARNING: This is an experimental interface that is subject to change.
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 996d36b7e9725f..5b0e15d4515ffa 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -2489,9 +2489,11 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) {
   // Restore delegation state if applicable.
   TF_LITE_ENSURE_STATUS(RedoAllDelegates());
 
+  int64_t delegate_flags = TfLiteDelegateGetFlagsInternal(delegate);
   const bool delegate_supports_dynamic_shapes =
-      TfLiteDelegateGetFlagsInternal(delegate) &
-      kTfLiteDelegateFlagsAllowDynamicTensors;
+      delegate_flags & kTfLiteDelegateFlagsAllowDynamicTensors;
+  const bool hint_fully_delegated_to_single_delegate =
+      delegate_flags & kTfLiteDelegateFlagsHintFullyDelegatedToSingleDelegate;
   const auto pre_delegation_state = state_;
 
   if (state_ == kStateInvokableAndImmutable) {
@@ -2500,7 +2502,8 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) {
     // tensors.
     // Reset the state to force tensor/op reallocation.
     state_ = kStateUninvokable;
-  } else if (!delegate_supports_dynamic_shapes) {
+  } else if (!delegate_supports_dynamic_shapes &&
+             !hint_fully_delegated_to_single_delegate) {
     // Check if graph has dynamic tensors by preparing ops.
     int last_execution_plan_index_prepared;
     TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
@@ -2539,9 +2542,12 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) {
   if (!delegate_supports_dynamic_shapes) {
     // CASE 1: Current delegate does not support dynamic shapes.
     // Reset the state to force tensor/op reallocation.
-    state_ = kStateUninvokable;
-    TF_LITE_ENSURE_STATUS(
-        reset_delegation_if_not_ok(EnsureMemoryAllocations()));
+    if (!hint_fully_delegated_to_single_delegate) {
+      state_ = kStateUninvokable;
+      TF_LITE_ENSURE_STATUS(
+          reset_delegation_if_not_ok(EnsureMemoryAllocations()));
+    }
+
     // After using a delegate which doesn't support dynamic tensors, make the
     // entire graph immutable.
     state_ = kStateInvokableAndImmutable;
diff --git a/third_party/py/python_init_toolchains.bzl b/third_party/py/python_init_toolchains.bzl
index 860fc08ceda2a8..82d755c32bbfba 100644
--- a/third_party/py/python_init_toolchains.bzl
+++ b/third_party/py/python_init_toolchains.bzl
@@ -41,7 +41,6 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs):
         tool_version = MINOR_MAPPING.get(HERMETIC_PYTHON_VERSION)
         if not tool_version:
             tool_version = HERMETIC_PYTHON_VERSION + ".0"
-        url_components = HERMETIC_PYTHON_URL.split("://", 1)
 
         sha256s = {}
         for platform in PLATFORMS.keys():
@@ -51,12 +50,12 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs):
 
         python_register_toolchains(
             name = get_toolchain_name_per_python_version(name),
-            base_url = url_components[0] + "://",
+            base_url = "",
             ignore_root_user_error = True,
             python_version = tool_version,
             tool_versions = {
                 tool_version: {
-                    "url": url_components[1],
+                    "url": HERMETIC_PYTHON_URL,
                     "sha256": sha256s,
                     "strip_prefix": HERMETIC_PYTHON_PREFIX,
                 },
diff --git a/third_party/xla/third_party/py/python_init_toolchains.bzl b/third_party/xla/third_party/py/python_init_toolchains.bzl
index 860fc08ceda2a8..82d755c32bbfba 100644
--- a/third_party/xla/third_party/py/python_init_toolchains.bzl
+++ b/third_party/xla/third_party/py/python_init_toolchains.bzl
@@ -41,7 +41,6 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs):
         tool_version = MINOR_MAPPING.get(HERMETIC_PYTHON_VERSION)
         if not tool_version:
             tool_version = HERMETIC_PYTHON_VERSION + ".0"
-        url_components = HERMETIC_PYTHON_URL.split("://", 1)
 
         sha256s = {}
         for platform in PLATFORMS.keys():
@@ -51,12 +50,12 @@ def python_init_toolchains(name = "python", python_version = None, **kwargs):
 
         python_register_toolchains(
             name = get_toolchain_name_per_python_version(name),
-            base_url = url_components[0] + "://",
+            base_url = "",
             ignore_root_user_error = True,
             python_version = tool_version,
             tool_versions = {
                 tool_version: {
-                    "url": url_components[1],
+                    "url": HERMETIC_PYTHON_URL,
                     "sha256": sha256s,
                     "strip_prefix": HERMETIC_PYTHON_PREFIX,
                 },
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.cc b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.cc
index 4e3fd3e2d1a0d9..10c8aafaa98f99 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.cc
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.cc
@@ -160,6 +160,10 @@ inline absl::Status InsertConvertIfNecessary(
   return absl::OkStatus();
 }
 
+inline bool IsElementwiseAndNotConstant(const HloInstruction* instr) {
+  return instr->IsElementwise() && !instr->IsConstant();
+}
+
 }  // namespace
 
 absl::StatusOr<LibraryMatcher*> LibraryRewriter::ChooseLibrary(
@@ -298,7 +302,7 @@ absl::StatusOr<bool> LibraryRewriter::ProcessComputation(
       fusion_starters.push_back(*it);
     } else if (fuse_reduce_ && (*it)->opcode() == HloOpcode::kReduce) {
       fusion_starters.push_back(*it);
-    } else if (fuse_eltwise_ && (*it)->IsElementwise()) {
+    } else if (fuse_eltwise_ && IsElementwiseAndNotConstant(*it)) {
       eltwise_ops.push_back(*it);
     }
   }
diff --git a/third_party/xla/xla/backends/gpu/collectives/BUILD b/third_party/xla/xla/backends/gpu/collectives/BUILD
index 6859a56e88c0fa..dacd10d369da45 100644
--- a/third_party/xla/xla/backends/gpu/collectives/BUILD
+++ b/third_party/xla/xla/backends/gpu/collectives/BUILD
@@ -479,6 +479,7 @@ cc_library(
         "//xla/service:collective_ops_utils",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
+        "//xla/stream_executor/cuda:nvshmem",
         "//xla/stream_executor/gpu:gpu_stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
@@ -568,9 +569,11 @@ xla_test(
     env = {
         "XLA_FLAGS": "--xla_gpu_experimental_enable_nvshmem=true",
     },
+    tags = ["cuda-only"],
     deps = [
         "//xla:debug_options_flags",
         "//xla:status_macros",
+        "//xla/core/collectives:communicator",
         "//xla/pjrt/distributed",
         "//xla/pjrt/distributed:client",
         "//xla/pjrt/distributed:service",
@@ -585,7 +588,10 @@ xla_test(
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
         "@local_config_cuda//cuda:cuda_headers",
-    ] + if_cuda_is_configured([":nvshmem_collectives"]),
+    ] + if_cuda_is_configured([
+        ":nvshmem_collectives",
+        "//xla/stream_executor/cuda:nvshmem",
+    ]),
 )
 
 cc_library(
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
index 215f42b89fab0e..2398a7cc37bec5 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
@@ -14,39 +14,38 @@ limitations under the License.
 ==============================================================================*/
 #include "xla/backends/gpu/collectives/nvshmem_collectives.h"
 
-#include <cstddef>
 #include <cstdint>
-#include <cstring>
 #include <memory>
-#include <string>
 
-#include "absl/base/call_once.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
-#include "absl/time/time.h"
 #include "third_party/nvshmem/nvshmem.h"   // IWYU pragma: keep
 #include "third_party/nvshmem/nvshmemx.h"  // IWYU pragma: keep
 #include "xla/backends/gpu/collectives/nvshmem_communicator.h"
 #include "xla/core/collectives/collectives.h"
 #include "xla/core/collectives/collectives_registry.h"
-#include "xla/pjrt/distributed/key_value_store_interface.h"
+#include "xla/core/collectives/communicator.h"
+#include "xla/stream_executor/cuda/nvshmem.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/statusor.h"
 #include "tsl/platform/casts.h"
 #include "tsl/platform/numbers.h"
 
 namespace xla::gpu {
 
 NvshmemCollectives::~NvshmemCollectives() {
-  if (initialized_) {
-    Finalize();
+  if (se::gpu::nvshmem::IsInitialized()) {
+    se::gpu::nvshmem::Finalize();
   }
 }
 
+bool NvshmemCollectives::IsInitialized() const {
+  return se::gpu::nvshmem::IsInitialized();
+}
+
 NvshmemCollectives* NvshmemCollectives::Default() {
   absl::StatusOr<Collectives*> collectives =
       CollectivesRegistry::Get("gpu", "nvshmem");
@@ -61,87 +60,14 @@ NvshmemCollectives* NvshmemCollectives::Default() {
 }
 
 absl::Status NvshmemCollectives::InitializeTopology(Topology topology) {
-  SetEnvInfo(topology.node_id, topology.num_nodes,
-             topology.device_count_per_process, topology.kv_store);
+  se::gpu::nvshmem::SetEnvInfo(topology.node_id, topology.num_nodes,
+                               topology.device_count_per_process,
+                               topology.kv_store);
   return absl::OkStatus();
 }
 
-void NvshmemCollectives::SetEnvInfo(
-    int process_id, size_t num_processes, size_t device_count_per_process,
-    std::weak_ptr<KeyValueStoreInterface> kv_store) {
-  process_id_ = process_id;
-  num_processes_ = num_processes;
-  device_count_per_process_ = device_count_per_process;
-  kv_store_ = kv_store;
-}
-
-absl::Status NvshmemCollectives::InitializeOnce() {
-  auto init_fn = [this]() -> absl::Status {
-    if (process_id_ == -1) {
-      LOG(FATAL)
-          << "NvshmemCollectives::SetEnvInfo was not called before using "
-             "NVSHMEM API";
-    }
-    if (device_count_per_process_ != 1) {
-      LOG(FATAL) << "NVSHMEM API is only supported with one device per process";
-    }
-    nvshmemx_init_attr_t nvshmem_init_attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
-    nvshmemx_uniqueid_t nvshmem_id = NVSHMEMX_UNIQUEID_INITIALIZER;
-
-    // Initialize NVSHMEM
-    if (std::shared_ptr<KeyValueStoreInterface> kv_store = kv_store_.lock()) {
-      if (process_id_ == 0) {
-        if (nvshmemx_get_uniqueid(&nvshmem_id) != 0) {
-          return absl::InternalError("nvshmemx_get_uniqueid failed.");
-        }
-        char buf[sizeof(nvshmemx_uniqueid_t)];
-        std::memcpy(buf, &nvshmem_id, sizeof(nvshmemx_uniqueid_t));
-        absl::string_view nvshmem_id_str{buf, sizeof(buf)};
-        TF_RETURN_IF_ERROR(kv_store->Set(kKvStoreKey, nvshmem_id_str));
-      } else {
-        TF_ASSIGN_OR_RETURN(std::string id_str,
-                            kv_store->Get(kKvStoreKey, absl::Minutes(10)));
-        CHECK(id_str.size() >= sizeof(nvshmemx_uniqueid_t));
-        std::memcpy(&nvshmem_id, id_str.data(), sizeof(nvshmemx_uniqueid_t));
-      }
-    } else {
-      return absl::InternalError(
-          "KV store is not available for nvshmem initialization.");
-    }
-
-    if (nvshmemx_set_attr_uniqueid_args(process_id_, num_processes_,
-                                        &nvshmem_id, &nvshmem_init_attr) != 0) {
-      return absl::InternalError("nvshmemx_set_attr_uniqueid_args failed.");
-    }
-    if (nvshmemx_hostlib_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID,
-                                   &nvshmem_init_attr) != 0) {
-      return absl::InternalError("nvshmemx_hostlib_init_attr failed.");
-    }
-
-    VLOG(3) << absl::StreamFormat(
-        "Initialized NVSHMEM on process %d; num_processes=%llu", process_id_,
-        num_processes_);
-    return absl::OkStatus();
-  };
-
-  static absl::once_flag once_flag;
-  absl::Status status = absl::OkStatus();
-  absl::call_once(once_flag, [&]() {
-    status = init_fn();
-    initialized_ = true;
-  });
-  return status;
-}
-
-void NvshmemCollectives::Finalize() {
-  VLOG(3) << absl::StreamFormat(
-      "Finilizing NVSHMEM on process %d; num_processes=%llu", process_id_,
-      num_processes_);
-  nvshmemx_hostlib_finalize();
-}
-
 absl::StatusOr<void*> NvshmemCollectives::Allocate(uint64_t bytes) {
-  TF_RETURN_IF_ERROR(InitializeOnce());
+  TF_RETURN_IF_ERROR(se::gpu::nvshmem::InitializeOnce());
   VLOG(3) << absl::StreamFormat(
       "Start allocation of %s (%llu bytes) for NVSHMEM",
       tsl::strings::HumanReadableNumBytes(bytes), bytes);
@@ -155,7 +81,7 @@ absl::StatusOr<void*> NvshmemCollectives::Allocate(uint64_t bytes) {
 }
 
 absl::Status NvshmemCollectives::Deallocate(void* buffer) {
-  TF_RETURN_IF_ERROR(InitializeOnce());
+  TF_RETURN_IF_ERROR(se::gpu::nvshmem::InitializeOnce());
   VLOG(3) << absl::StreamFormat("Start de-allocation for NVSHMEM buffer: %p",
                                 buffer);
   nvshmem_free(buffer);
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h
index 4fc7485d48a0e1..82f717e8cb85d9 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h
+++ b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.h
@@ -16,7 +16,6 @@ limitations under the License.
 #ifndef XLA_BACKENDS_GPU_COLLECTIVES_NVSHMEM_COLLECTIVES_H_
 #define XLA_BACKENDS_GPU_COLLECTIVES_NVSHMEM_COLLECTIVES_H_
 
-#include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -31,7 +30,6 @@ limitations under the License.
 #include "xla/core/collectives/collectives.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
-#include "xla/pjrt/distributed/key_value_store_interface.h"
 
 namespace xla::gpu {
 
@@ -41,11 +39,8 @@ class NvshmemCollectives : public GpuCollectives {
   ~NvshmemCollectives() override;
 
   static NvshmemCollectives* Default();
-  bool IsInitialized() { return initialized_; }
 
-  void SetEnvInfo(int process_id, size_t num_processes,
-                  size_t device_count_per_process,
-                  std::weak_ptr<KeyValueStoreInterface> kv_store);
+  bool IsInitialized() const;
 
   absl::StatusOr<void*> Allocate(uint64_t bytes) final;
 
@@ -82,19 +77,6 @@ class NvshmemCollectives : public GpuCollectives {
   }
 
   absl::Status InitializeTopology(Topology topology) final;
-
- private:
-  absl::Status InitializeOnce();
-
-  void Finalize();
-
-  int process_id_ = -1;
-  size_t num_processes_ = 0;
-  size_t device_count_per_process_ = 0;
-  std::weak_ptr<KeyValueStoreInterface> kv_store_;
-  bool initialized_ = false;
-
-  static constexpr char kKvStoreKey[] = "nvshmem_global_init";
 };
 
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives_test.cc b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives_test.cc
index 9598c9aa7e866d..ef5ddcc9459ae7 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives_test.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives_test.cc
@@ -25,11 +25,13 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/time/time.h"
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
+#include "xla/core/collectives/communicator.h"
 #include "xla/debug_options_flags.h"
 #include "xla/pjrt/distributed/client.h"
 #include "xla/pjrt/distributed/distributed.h"
 #include "xla/pjrt/distributed/service.h"
 #include "xla/status_macros.h"
+#include "xla/stream_executor/cuda/nvshmem.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/status.h"
 #include "xla/tsl/platform/statusor.h"
@@ -82,7 +84,7 @@ absl::Status InitializationTestBody(const int node_id, const int num_nodes) {
   auto kv_store =
       GetDistributedKeyValueStore(distributed_client, /*key_prefix=*/"gpu:");
 
-  NvshmemCollectives::Default()->SetEnvInfo(node_id, num_nodes, 1, kv_store);
+  se::gpu::nvshmem::SetEnvInfo(node_id, num_nodes, 1, kv_store);
   cudaSetDevice(node_id);
   TF_ASSIGN_OR_RETURN(void* ptr, NvshmemCollectives::Default()->Allocate(1024));
   TF_RET_CHECK(ptr != nullptr);
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 91708edcc9ef5b..b686b7a78f6a47 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -2030,7 +2030,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
-        "//xla/service:rendezvous",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
@@ -2038,15 +2037,11 @@ cc_library(
         "//xla/stream_executor/gpu:collective_kernel_metadata",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
-        "@com_google_absl//absl/types:span",
         "@com_google_protobuf//:protobuf_lite",
     ],
 )
diff --git a/third_party/xla/xla/pjrt/distributed/BUILD b/third_party/xla/xla/pjrt/distributed/BUILD
index fb1bdb6f7800f2..4a2f1f754c3536 100644
--- a/third_party/xla/xla/pjrt/distributed/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/BUILD
@@ -144,7 +144,6 @@ xla_cc_test(
         ":service",
         ":topology_util",
         "//xla:status_macros",
-        "//xla/pjrt/distributed/coordination:coordination_service_agent",
         "//xla/runtime:device_id",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/BUILD b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
index fc6c3a9001f0af..3798e213c65051 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
@@ -103,14 +103,11 @@ xla_cc_test(
         ":coordination_service",
         ":coordination_service_error_util",
         ":test_device_proto_cc",
-        "//xla/service:global_device_id",
         "//xla/tsl/distributed_runtime:call_options",
-        "//xla/tsl/distributed_runtime/coordination:coordination_service",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:status",
         "//xla/tsl/platform:test",
-        "//xla/tsl/platform:types",
         "//xla/tsl/protobuf:coordination_config_proto_cc",
         "//xla/tsl/protobuf:coordination_service_proto_cc",
         "//xla/tsl/util/proto:proto_matchers",
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 8aa72f6c944892..9c8ff6f6b206d2 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -615,6 +615,7 @@ xla_test(
         "//xla/tests:literal_test_util",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index 246fb166b12771..173e1017162708 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "absl/memory/memory.h"
+#include "absl/status/status_matchers.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -47,6 +48,9 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::absl_testing::IsOkAndHolds;
+using ::testing::SizeIs;
+
 constexpr absl::string_view kProgram = R"(HloModule Computation
 
 ENTRY Computation() -> s32[] {
@@ -100,6 +104,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
   TF_ASSERT_OK_AND_ASSIGN(auto executable,
                           compiler.Compile(opts, mlir_module.get(), *topology,
                                            /*client=*/nullptr));
+  EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1)));
   TF_ASSERT_OK_AND_ASSIGN(
       auto loaded_executable,
       se_client->Load(std::move(executable), LoadOptions()));
@@ -129,6 +134,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<PjRtExecutable> executable,
       compiler.Compile(opts, computation, *topology, /*client=*/nullptr));
+  EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1)));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<PjRtLoadedExecutable> loaded_executable,
       se_client->Load(std::move(executable), LoadOptions()));
@@ -155,6 +161,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<PjRtExecutable> executable,
       compiler.Compile(opts, computation, *topology, /*client=*/nullptr));
+  EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1)));
 
   // Serialize the executable and load it.
   TF_ASSERT_OK_AND_ASSIGN(std::string serialized_executable,
@@ -192,6 +199,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessSerializeDeserialize) {
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<PjRtExecutable> executable,
       compiler.Compile(opts, computation, *topology, /*client=*/nullptr));
+  EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1)));
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<PjRtLoadedExecutable> loaded_executable,
       se_client->Load(std::move(executable), LoadOptions()));
@@ -242,6 +250,7 @@ TEST(StreamExecutorGpuCompilerTest, UnloadedExecutableMemoryStats) {
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<PjRtExecutable> executable,
       compiler.Compile(options, computation, *topology, /*client=*/nullptr));
+  EXPECT_THAT(executable->GetHloModules(), IsOkAndHolds(SizeIs(1)));
 
   TF_ASSERT_OK_AND_ASSIGN(CompiledMemoryStats compiled_memory_stats,
                           executable->GetCompiledMemoryStats());
diff --git a/third_party/xla/xla/pjrt/stream_executor_executable.cc b/third_party/xla/xla/pjrt/stream_executor_executable.cc
index 61210775a4461c..4245c4da9ba100 100644
--- a/third_party/xla/xla/pjrt/stream_executor_executable.cc
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "xla/util.h"
 
 namespace xla {
+
 absl::StatusOr<std::string> StreamExecutorExecutable::SerializeExecutable()
     const {
   std::string serialized;
@@ -81,6 +82,27 @@ absl::StatusOr<std::string> StreamExecutorExecutable::SerializeExecutable()
   return proto.SerializeAsString();
 }
 
+StreamExecutorExecutable::StreamExecutorExecutable(
+    const CompileOptions& compile_options,
+    std::vector<std::unique_ptr<xla::AotCompilationResult>> executables,
+    int num_replicas, int num_partitions, absl::string_view name,
+    absl::string_view fingerprint, absl::string_view default_memory_kind)
+    : compile_options_(compile_options),
+      executables_(std::move(executables)),
+      num_replicas_(num_replicas),
+      num_partitions_(num_partitions),
+      name_(name),
+      fingerprint_(fingerprint),
+      default_memory_kind_(default_memory_kind) {
+  std::vector<std::shared_ptr<HloModule>> hlo_modules;
+  for (const auto& executable :
+       std::get<std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
+           executables_)) {
+    hlo_modules.push_back(executable->shared_optimized_module());
+  }
+  hlo_modules_ = std::move(hlo_modules);
+}
+
 StreamExecutorExecutable::StreamExecutorExecutable(
     const CompileOptions& compile_options,
     std::optional<HloModuleProto> unoptimized_hlo_module_proto,
diff --git a/third_party/xla/xla/pjrt/stream_executor_executable.h b/third_party/xla/xla/pjrt/stream_executor_executable.h
index cff9d15b53edde..a0a97daa935646 100644
--- a/third_party/xla/xla/pjrt/stream_executor_executable.h
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
@@ -37,20 +38,14 @@ limitations under the License.
 #include "xla/service/hlo_proto_util.h"
 
 namespace xla {
+
 class StreamExecutorExecutable : public PjRtExecutable {
  public:
   StreamExecutorExecutable(
       const CompileOptions& compile_options,
       std::vector<std::unique_ptr<xla::AotCompilationResult>> executables,
       int num_replicas, int num_partitions, absl::string_view name,
-      absl::string_view fingerprint, absl::string_view default_memory_kind)
-      : compile_options_(compile_options),
-        executables_(std::move(executables)),
-        num_replicas_(num_replicas),
-        num_partitions_(num_partitions),
-        name_(name),
-        fingerprint_(fingerprint),
-        default_memory_kind_(default_memory_kind) {}
+      absl::string_view fingerprint, absl::string_view default_memory_kind);
 
   StreamExecutorExecutable(
       const CompileOptions& compile_options,
diff --git a/third_party/xla/xla/service/cpu/tests/BUILD b/third_party/xla/xla/service/cpu/tests/BUILD
index 7c8f397f542d90..0c80d94287c65c 100644
--- a/third_party/xla/xla/service/cpu/tests/BUILD
+++ b/third_party/xla/xla/service/cpu/tests/BUILD
@@ -432,12 +432,7 @@ xla_cc_test(
     deps = [
         ":cpu_codegen_test_main",
         "//xla:literal",
-        "//xla:literal_util",
-        "//xla/hlo/ir:hlo",
-        "//xla/tests:literal_test_util",
-        "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/model/BUILD b/third_party/xla/xla/service/gpu/model/BUILD
index ac73cffff1ef0d..698fc2f88afa2e 100644
--- a/third_party/xla/xla/service/gpu/model/BUILD
+++ b/third_party/xla/xla/service/gpu/model/BUILD
@@ -90,14 +90,12 @@ xla_cc_test(
         ":sol_latency_estimator",
         "//xla:literal_util",
         "//xla:shape_util",
-        "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_module_config",
         "//xla/service:latency_hiding_scheduler",
-        "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cuda_compute_capability",
diff --git a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
index 0b392639b96bbb..c0f7e612472574 100644
--- a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
+++ b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypeInterfaces.h"
@@ -258,6 +259,95 @@ int64_t convertReduceScatter(sdy::ReduceScatterOp op, int64_t nextChannelId,
   return nextChannelId;
 }
 
+void convertShardedToUnreduced(sdy::ShardedToUnreducedOp op,
+                               mlir::IRRewriter& rewriter) {
+  TensorShardingAttr outSharding = op.getOutSharding();
+  MeshAttr mesh = outSharding.getMesh(op);
+  // If the mesh does not have iota device ids, we need an extra step to convert
+  // partition id to logical device id. We do not support this case for now.
+  CHECK(mesh.getDeviceIds().empty());
+
+  mlir::Location loc = op.getLoc();
+  rewriter.setInsertionPoint(op);
+
+  ManualComputationOp manualComputation = createFullyManualComputation(
+      loc, op.getTensor(), outSharding, mesh, rewriter,
+      [&](mlir::BlockArgument arg, OpBuilder& blockBuilder) {
+        RankedTensorType fullType =
+            mlir::cast<RankedTensorType>(op.getResult().getType());
+        RankedTensorType inputType =
+            sdy::getSharding(op.getTensor())
+                .getLocalTensorType(fullType, mesh,
+                                    /*allowNonDivisible=*/false);
+        CHECK(inputType) << kNonDivisibleShardingError;
+        RankedTensorType outputType =
+            outSharding.getLocalTensorType(fullType, mesh);
+
+        Value zero = stablehlo::ConstantOp::create(
+            blockBuilder, loc,
+            blockBuilder.getZeroAttr(outputType.getElementType()));
+        Value broadcast = stablehlo::BroadcastOp::create(
+            blockBuilder, loc, outputType, zero, outputType.getShape());
+
+        // Decompose partitionId into axis coordinates.
+        Value partitionId = stablehlo::PartitionIdOp::create(blockBuilder, loc);
+        Value currentRem = stablehlo::ConvertOp::create(
+            blockBuilder, loc,
+            RankedTensorType::get({}, blockBuilder.getIntegerType(32)),
+            partitionId);
+        llvm::StringMap<Value> axisSizes, axisCoordinates;
+        for (sdy::MeshAxisAttr axis : llvm::reverse(mesh.getAxes())) {
+          Value axisSize = stablehlo::ConstantOp::create(
+              blockBuilder, loc,
+              blockBuilder.getI32IntegerAttr(axis.getSize()));
+          axisSizes[axis.getName()] = axisSize;
+          axisCoordinates[axis.getName()] =
+              stablehlo::RemOp::create(blockBuilder, loc, currentRem, axisSize);
+          currentRem =
+              stablehlo::DivOp::create(blockBuilder, loc, currentRem, axisSize);
+        }
+
+        SmallVector<Value> offsets;
+        offsets.reserve(outputType.getRank());
+        Value zeroOffset = stablehlo::ConstantOp::create(
+            blockBuilder, loc, blockBuilder.getI32IntegerAttr(0));
+        for (int64_t dim = 0; dim < outputType.getRank(); ++dim) {
+          if (op.getAxes()[dim].empty()) {
+            offsets.push_back(zeroOffset);
+            continue;
+          }
+
+          Value offset, prevAxisSize;
+          for (AxisRefAttr axis : op.getAxes()[dim].getValue()) {
+            CHECK(!axis.getSubAxisInfo()) << "Sub-axes not supported in "
+                                             "ShardedToUnreducedOp.";
+            StringRef axisName = axis.getName();
+            if (prevAxisSize == nullptr) {
+              offset = axisCoordinates[axisName];
+            } else {
+              offset = stablehlo::MulOp::create(blockBuilder, loc, offset,
+                                                prevAxisSize);
+              offset = stablehlo::AddOp::create(blockBuilder, loc, offset,
+                                                axisCoordinates[axisName]);
+            }
+
+            prevAxisSize = axisSizes[axisName];
+          }
+
+          Value localDimSize = stablehlo::ConstantOp::create(
+              blockBuilder, loc,
+              blockBuilder.getI32IntegerAttr(inputType.getDimSize(dim)));
+          offset =
+              stablehlo::MulOp::create(blockBuilder, loc, offset, localDimSize);
+          offsets.push_back(offset);
+        }
+
+        return stablehlo::DynamicUpdateSliceOp::create(
+            blockBuilder, loc, outputType, broadcast, arg, offsets);
+      });
+  rewriter.replaceOp(op, manualComputation);
+}
+
 void syncInOutUnreducedAxes(mlir::Operation* op) {
   Value input = op->getOperand(0);
   TensorShardingAttr outSharding = sdy::getSharding(op->getResult(0));
@@ -322,6 +412,9 @@ class StablehloExportManualReductionCollectivesPass
           nextChannelId =
               convertReduceScatter(reduceScatter, nextChannelId, rewriter);
         }
+      } else if (auto shardedToUnreduced =
+                     mlir::dyn_cast<sdy::ShardedToUnreducedOp>(op)) {
+        convertShardedToUnreduced(shardedToUnreduced, rewriter);
       }
     });
   }
diff --git a/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir b/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
index ef1e4ce2438dca..288dff8d476c67 100644
--- a/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
+++ b/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
@@ -84,7 +84,6 @@ func.func @all_reduce_single_axis(%arg0: tensor<8x8xf32> {sdy.sharding = #sdy.sh
   return %0 : tensor<8x8xf32>
 }
 
-
 // CHECK-LABEL: func @all_reduce_single_axis_2
 func.func @all_reduce_single_axis_2(%arg0: tensor<8x8xf32> {sdy.sharding = #sdy.sharding<@mesh_x_2_y_2, [{"x"}, {}], unreduced={"y"}>}) -> tensor<8x8xf32> {
   // CHECK{LITERAL}: replica_groups = dense<[[0, 1], [2, 3]]>
@@ -345,3 +344,30 @@ func.func @unreduced_sine_of_replicated_dot(%arg0: tensor<8x4xf32>, %arg1: tenso
   %1 = stablehlo.sine %0 {sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{}, {}], unreduced={"x"}>]>} : tensor<8x2xf32>
   return %1 : tensor<8x2xf32>
 }
+
+// CHECK-LABEL: func @sharded_to_unreduced
+func.func @sharded_to_unreduced(%arg0: tensor<16x16xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {"y"}]>}) -> tensor<16x16xf32> {
+  // CHECK-NEXT: %[[MANUAL_COMP:.*]] = sdy.manual_computation(%arg0)
+  // CHECK-SAME:     in_shardings=[<@mesh, [{"x"}, {"y"}]>]
+  // CHECK-SAME:     out_shardings=[<@mesh, [{}, {"y"}], unreduced={"x"}>]
+  // CHECK-SAME:     manual_axes={"x", "y"} (%arg1: tensor<4x8xf32>) {
+  // CHECK-NEXT:   %[[CST:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK-NEXT:   %[[BROADCAST:.*]] = stablehlo.broadcast %[[CST]], sizes = [16, 8] : (tensor<f32>) -> tensor<16x8xf32>
+  // CHECK-NEXT:   %[[PID:.*]] = stablehlo.partition_id : tensor<ui32>
+  // CHECK-NEXT:   %[[PID_I32:.*]] = stablehlo.convert %[[PID]] : (tensor<ui32>) -> tensor<i32>
+  // CHECK-NEXT:   %[[C2:.*]] = stablehlo.constant dense<2> : tensor<i32>
+  // CHECK-NEXT:   %[[REM2:.*]] = stablehlo.remainder %[[PID_I32]], %[[C2]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV2:.*]] = stablehlo.divide %[[PID_I32]], %[[C2]] : tensor<i32>
+  // CHECK-NEXT:   %[[C4:.*]] = stablehlo.constant dense<4> : tensor<i32>
+  // CHECK-NEXT:   %[[REM4:.*]] = stablehlo.remainder %[[DIV2]], %[[C4]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV4:.*]] = stablehlo.divide %[[DIV2]], %[[C4]] : tensor<i32>
+  // CHECK-NEXT:   %[[C0:.*]] = stablehlo.constant dense<0> : tensor<i32>
+  // CHECK-NEXT:   %[[C4_2:.*]] = stablehlo.constant dense<4> : tensor<i32>
+  // CHECK-NEXT:   %[[MULT:.*]] = stablehlo.multiply %[[REM4]], %[[C4_2]] : tensor<i32>
+  // CHECK-NEXT:   %[[DUS:.*]] = stablehlo.dynamic_update_slice %[[BROADCAST]], %arg1, %[[MULT]], %[[C0]] : (tensor<16x8xf32>, tensor<4x8xf32>, tensor<i32>, tensor<i32>) -> tensor<16x8xf32>
+  // CHECK-NEXT:   sdy.return %[[DUS]] : tensor<16x8xf32>
+  // CHECK-NEXT: } : (tensor<16x16xf32>) -> tensor<16x16xf32>
+  // CHECK-NEXT: return %[[MANUAL_COMP]] : tensor<16x16xf32>
+  %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {"y"}], unreduced={"x"}> : tensor<16x16xf32>
+  return %0 : tensor<16x16xf32>
+}
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index dbc9b40220d4f9..03eb4da25e21f6 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -954,6 +954,31 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "nvshmem",
+    srcs = ["nvshmem.cc"],
+    hdrs = ["nvshmem.h"],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        "//xla/pjrt/distributed:key_value_store_interface",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:no_destructor",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/time",
+        "@local_config_cuda//cuda:cuda_headers",
+        "@nvshmem//:nvshmem_lib",
+    ],
+)
+
 cc_library(
     name = "nvjitlink_support",
     srcs = ["nvjitlink_support.cc"],
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem.cc b/third_party/xla/xla/stream_executor/cuda/nvshmem.cc
new file mode 100644
index 00000000000000..a88b84332c6c28
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nvshmem.cc
@@ -0,0 +1,134 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/cuda/nvshmem.h"
+
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "absl/base/call_once.h"
+#include "absl/base/no_destructor.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/string_view.h"
+#include "absl/time/time.h"
+#include "third_party/nvshmem/nvshmem.h"   // IWYU pragma: keep
+#include "third_party/nvshmem/nvshmemx.h"  // IWYU pragma: keep
+#include "xla/pjrt/distributed/key_value_store_interface.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace stream_executor::gpu::nvshmem {
+
+// NVSHMEM environment information is stored per process in a static variable.
+namespace {
+struct EnvInfo {
+  int process_id = -1;
+  size_t num_processes = 0;
+  size_t device_count_per_process = 0;
+  std::weak_ptr<xla::KeyValueStoreInterface> kv_store;
+  bool initialized = false;
+};
+
+static absl::NoDestructor<EnvInfo> env;
+}  // namespace
+
+void SetEnvInfo(int process_id, size_t num_processes,
+                size_t device_count_per_process,
+                std::weak_ptr<xla::KeyValueStoreInterface> kv_store) {
+  env->process_id = process_id;
+  env->num_processes = num_processes;
+  env->device_count_per_process = device_count_per_process;
+  env->kv_store = kv_store;
+}
+
+bool IsInitialized() { return env->initialized; }
+
+absl::Status InitializeOnce() {
+  static constexpr absl::string_view kKvStoreKey = "nvshmem_global_init";
+
+  auto init_fn = []() -> absl::Status {
+    VLOG(2) << "Initializing NVSHMEM: process_id=" << env->process_id
+            << ", num_processes=" << env->num_processes
+            << ", device_count_per_process=" << env->device_count_per_process;
+
+    if (env->process_id == -1) {
+      LOG(FATAL)
+          << "NvshmemCollectives::SetEnvInfo was not called before using "
+             "NVSHMEM API";
+    }
+    if (env->device_count_per_process != 1) {
+      LOG(FATAL) << "NVSHMEM API is only supported with one device per process";
+    }
+    nvshmemx_init_attr_t nvshmem_init_attr = NVSHMEMX_INIT_ATTR_INITIALIZER;
+    nvshmemx_uniqueid_t nvshmem_id = NVSHMEMX_UNIQUEID_INITIALIZER;
+
+    // Initialize NVSHMEM
+    if (std::shared_ptr<xla::KeyValueStoreInterface> kv_store =
+            env->kv_store.lock()) {
+      if (env->process_id == 0) {
+        if (nvshmemx_get_uniqueid(&nvshmem_id) != 0) {
+          return absl::InternalError("nvshmemx_get_uniqueid failed.");
+        }
+        char buf[sizeof(nvshmemx_uniqueid_t)];
+        std::memcpy(buf, &nvshmem_id, sizeof(nvshmemx_uniqueid_t));
+        absl::string_view nvshmem_id_str{buf, sizeof(buf)};
+        TF_RETURN_IF_ERROR(kv_store->Set(kKvStoreKey, nvshmem_id_str));
+      } else {
+        TF_ASSIGN_OR_RETURN(std::string id_str,
+                            kv_store->Get(kKvStoreKey, absl::Minutes(10)));
+        CHECK(id_str.size() >= sizeof(nvshmemx_uniqueid_t));
+        std::memcpy(&nvshmem_id, id_str.data(), sizeof(nvshmemx_uniqueid_t));
+      }
+    } else {
+      return absl::InternalError(
+          "KV store is not available for nvshmem initialization.");
+    }
+
+    if (nvshmemx_set_attr_uniqueid_args(env->process_id, env->num_processes,
+                                        &nvshmem_id, &nvshmem_init_attr) != 0) {
+      return absl::InternalError("nvshmemx_set_attr_uniqueid_args failed.");
+    }
+    if (nvshmemx_hostlib_init_attr(NVSHMEMX_INIT_WITH_UNIQUEID,
+                                   &nvshmem_init_attr) != 0) {
+      return absl::InternalError("nvshmemx_hostlib_init_attr failed.");
+    }
+
+    VLOG(3) << absl::StreamFormat(
+        "Initialized NVSHMEM on process %d; num_processes=%llu",
+        env->process_id, env->num_processes);
+    return absl::OkStatus();
+  };
+
+  static absl::once_flag once_flag;
+  absl::Status status = absl::OkStatus();
+  absl::call_once(once_flag, [&]() {
+    status = init_fn();
+    env->initialized = true;
+  });
+  return status;
+}
+
+void Finalize() {
+  VLOG(3) << absl::StreamFormat(
+      "Finilizing NVSHMEM on process %d; num_processes=%llu", env->process_id,
+      env->num_processes);
+  nvshmemx_hostlib_finalize();
+}
+
+}  // namespace stream_executor::gpu::nvshmem
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem.h b/third_party/xla/xla/stream_executor/cuda/nvshmem.h
new file mode 100644
index 00000000000000..4fa7313a9e57d2
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nvshmem.h
@@ -0,0 +1,43 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_CUDA_NVSHMEM_H_
+#define XLA_STREAM_EXECUTOR_CUDA_NVSHMEM_H_
+
+#include <cstddef>
+#include <memory>
+
+#include "absl/status/status.h"
+#include "xla/pjrt/distributed/key_value_store_interface.h"
+
+namespace stream_executor::gpu::nvshmem {
+
+// Set environment information for NVSHMEM library.
+void SetEnvInfo(int process_id, size_t num_processes,
+                size_t device_count_per_process,
+                std::weak_ptr<xla::KeyValueStoreInterface> kv_store);
+
+// Returns true if NVSHMEM library is initialized.
+bool IsInitialized();
+
+// Initializes NVSHMEM library once per process.
+absl::Status InitializeOnce();
+
+// Finalizes NVSHMEM library
+void Finalize();
+
+}  // namespace stream_executor::gpu::nvshmem
+
+#endif  // XLA_STREAM_EXECUTOR_CUDA_NVSHMEM_H_

From 10ef2e94f70d0b57d0fc89cb0f138cf7d8155c78 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Fri, 12 Dec 2025 09:44:50 -0800
Subject: [PATCH 227/753] Replace `http_archive` with `tf_http_archive` for
 Github links to avoid timeout issues.

PiperOrigin-RevId: 843727468
---
 third_party/xla/WORKSPACE      |  8 +++---
 third_party/xla/workspace0.bzl | 18 +++++++-------
 third_party/xla/workspace1.bzl |  9 +++----
 third_party/xla/workspace3.bzl | 45 +++++++++++++++-------------------
 4 files changed, 37 insertions(+), 43 deletions(-)

diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE
index 69ebc22643da3b..cf7afb583b2199 100644
--- a/third_party/xla/WORKSPACE
+++ b/third_party/xla/WORKSPACE
@@ -1,19 +1,19 @@
 # buildifier: disable=load-on-top
 workspace(name = "xla")
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 # Initialize toolchains for ML projects.
 #
 # A hermetic build system is designed to produce completely reproducible builds for C++.
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
-http_archive(
+tf_http_archive(
     name = "rules_ml_toolchain",
     sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18",
     strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9",
-    urls = [
+    urls = tf_mirror_urls(
         "https://github.com/google-ml-infra/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz",
-    ],
+    ),
 )
 
 load(
diff --git a/third_party/xla/workspace0.bzl b/third_party/xla/workspace0.bzl
index 0507da08abb6d3..4e66ac333f0e23 100644
--- a/third_party/xla/workspace0.bzl
+++ b/third_party/xla/workspace0.bzl
@@ -7,6 +7,7 @@ load("@build_bazel_rules_apple//apple:repositories.bzl", "apple_rules_dependenci
 load("@build_bazel_rules_swift//swift:repositories.bzl", "swift_rules_dependencies")
 load("@com_github_grpc_grpc//bazel:grpc_extra_deps.bzl", "grpc_extra_deps")
 load("@com_google_benchmark//:bazel/benchmark_deps.bzl", "benchmark_deps")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def _tf_bind():
     """Bind targets for some external repositories"""
@@ -109,21 +110,20 @@ def workspace():
     # Note: We add this to fix Kokoro builds.
     # The rules below call into `rules_proto` but the hash has changed and
     # Bazel refuses to continue. So, we add our own mirror.
-    http_archive(
+    tf_http_archive(
         name = "rules_proto",
         sha256 = "20b240eba17a36be4b0b22635aca63053913d5c1ee36e16be36499d167a2f533",
         strip_prefix = "rules_proto-11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8",
-        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_proto/archive/11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_proto/archive/11bf7c25e666dd7ddacbcd4d4c4a9de7a25175f8.tar.gz",
-        ],
+        ),
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_shell",
         sha256 = "bc61ef94facc78e20a645726f64756e5e285a045037c7a61f65af2941f4c25e1",
         strip_prefix = "rules_shell-0.4.1",
-        url = "https://github.com/bazelbuild/rules_shell/releases/download/v0.4.1/rules_shell-v0.4.1.tar.gz",
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_shell/releases/download/v0.4.1/rules_shell-v0.4.1.tar.gz"),
     )
 
     # Now, finally use the rules
@@ -138,13 +138,13 @@ def workspace():
     # Toolchains for ML projects hermetic builds.
     # Details: https://github.com/google-ml-infra/rules_ml_toolchain
     if "rules_ml_toolchain" not in native.existing_rules():
-        http_archive(
+        tf_http_archive(
             name = "rules_ml_toolchain",
             sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18",
             strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9",
-            urls = [
+            urls = tf_mirror_urls(
                 "https://github.com/google-ml-infra/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz",
-            ],
+            ),
         )
 
     # If a target is bound twice, the later one wins, so we have to do tf bindings
diff --git a/third_party/xla/workspace1.bzl b/third_party/xla/workspace1.bzl
index 05121708fc5fa5..cbd3d24aec8a1a 100644
--- a/third_party/xla/workspace1.bzl
+++ b/third_party/xla/workspace1.bzl
@@ -1,9 +1,9 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories")
 load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 load("//third_party/llvm:setup.bzl", "llvm_setup")
 
 # buildifier: disable=unnamed-macro
@@ -15,14 +15,13 @@ def workspace():
 
     closure_repositories()
 
-    http_archive(
+    tf_http_archive(
         name = "bazel_toolchains",
         sha256 = "294cdd859e57fcaf101d4301978c408c88683fbc46fbc1a3829da92afbea55fb",
         strip_prefix = "bazel-toolchains-8c717f8258cd5f6c7a45b97d974292755852b658",
-        urls = [
-            "http://mirror.tensorflow.org/github.com/bazelbuild/bazel-toolchains/archive/8c717f8258cd5f6c7a45b97d974292755852b658.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/bazel-toolchains/archive/8c717f8258cd5f6c7a45b97d974292755852b658.tar.gz",
-        ],
+        ),
     )
 
     grpc_deps()
diff --git a/third_party/xla/workspace3.bzl b/third_party/xla/workspace3.bzl
index feee699160ed44..10212d417103d1 100644
--- a/third_party/xla/workspace3.bzl
+++ b/third_party/xla/workspace3.bzl
@@ -1,71 +1,66 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 # buildifier: disable=function-docstring
 # buildifier: disable=unnamed-macro
 def workspace():
-    http_archive(
+    tf_http_archive(
         name = "io_bazel_rules_closure",
         sha256 = "5b00383d08dd71f28503736db0500b6fb4dda47489ff5fc6bed42557c07c6ba9",
         strip_prefix = "rules_closure-308b05b2419edb5c8ee0471b67a40403df940149",
-        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz",  # 2019-06-13
-        ],
+        ),
     )
 
     # https://github.com/bazelbuild/bazel-skylib/releases
-    http_archive(
+    tf_http_archive(
         name = "bazel_skylib",
         sha256 = "bc283cdfcd526a52c3201279cda4bc298652efa898b10b4db0837dc51652756f",
-        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/bazel-skylib/releases/download/1.7.1/bazel-skylib-1.7.1.tar.gz",
-        ],
+        ),
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_license",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_license/releases/download/0.0.7/rules_license-0.0.7.tar.gz",
-        ],
+        ),
         sha256 = "4531deccb913639c30e5c7512a054d5d875698daeb75d8cf90f284375fe7c360",
     )
 
-    http_archive(
+    tf_http_archive(
         name = "rules_pkg",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/rules_pkg/releases/download/0.7.1/rules_pkg-0.7.1.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/rules_pkg/releases/download/0.7.1/rules_pkg-0.7.1.tar.gz",
-        ],
+        ),
         sha256 = "451e08a4d78988c06fa3f9306ec813b836b1d076d0f055595444ba4ff22b867f",
     )
 
-    http_archive(
+    tf_http_archive(
         name = "bazel_features",
         sha256 = "4fd9922d464686820ffd8fcefa28ccffa147f7cdc6b6ac0d8b07fde565c65d66",
         strip_prefix = "bazel_features-1.25.0",
-        url = "https://github.com/bazel-contrib/bazel_features/releases/download/v1.25.0/bazel_features-v1.25.0.tar.gz",
+        urls = tf_mirror_urls("https://github.com/bazel-contrib/bazel_features/releases/download/v1.25.0/bazel_features-v1.25.0.tar.gz"),
     )
 
     # Maven dependencies.
     RULES_JVM_EXTERNAL_TAG = "4.3"
-    http_archive(
+    tf_http_archive(
         name = "rules_jvm_external",
         strip_prefix = "rules_jvm_external-%s" % RULES_JVM_EXTERNAL_TAG,
         sha256 = "6274687f6fc5783b589f56a2f1ed60de3ce1f99bc4e8f9edef3de43bdf7c6e74",
-        url = "https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG,
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_jvm_external/archive/%s.zip" % RULES_JVM_EXTERNAL_TAG),
     )
 
     # Platforms
-    http_archive(
+    tf_http_archive(
         name = "platforms",
-        urls = [
-            "https://mirror.bazel.build/github.com/bazelbuild/platforms/releases/download/0.0.11/platforms-0.0.11.tar.gz",
+        urls = tf_mirror_urls(
             "https://github.com/bazelbuild/platforms/releases/download/0.0.11/platforms-0.0.11.tar.gz",
-        ],
+        ),
         sha256 = "29742e87275809b5e598dc2f04d86960cc7a55b3067d97221c9abbc9926bff0f",
     )
 

From 5210e9cd5f4484f21b714e3e2165ee1394beaf54 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Fri, 12 Dec 2025 09:53:39 -0800
Subject: [PATCH 228/753] Rename the status payload for compilation errors to
 be compliant to the recommended naming scheme

PiperOrigin-RevId: 843730949
---
 third_party/xla/xla/pjrt/errors.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/pjrt/errors.cc b/third_party/xla/xla/pjrt/errors.cc
index 2a269a3caa2588..698067d1beed18 100644
--- a/third_party/xla/xla/pjrt/errors.cc
+++ b/third_party/xla/xla/pjrt/errors.cc
@@ -23,7 +23,7 @@ namespace xla {
 // The payload attached to the absl::Status returned by the compilation
 // service when the compilation fails due to compilation errors.
 inline constexpr absl::string_view kCompilationErrorPayload =
-    "compilation_error";
+    "type.googleapis.com/xla.CompilationError";
 
 bool HasCompilationErrorPayload(const absl::Status& status) {
   return status.GetPayload(kCompilationErrorPayload).has_value();

From 92cb3c46e88ca489baa03ba7d965abb64b561471 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Fri, 12 Dec 2025 09:59:53 -0800
Subject: [PATCH 229/753] [stream_executor:cuda] Add NvshmemMemoryAllocator
 class

It is a layering violation to depend from SE to XLA:GPU collectives. All memory allocations should be done via correct se::MemoryAllocator instances. Prepare for removing memory allocation APIs from GPU collectives.

PiperOrigin-RevId: 843733369
---
 .../xla/xla/stream_executor/cuda/BUILD        | 25 +++++
 .../cuda/nvshmem_memory_allocator.cc          | 92 +++++++++++++++++++
 .../cuda/nvshmem_memory_allocator.h           | 37 ++++++++
 3 files changed, 154 insertions(+)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.h

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 03eb4da25e21f6..18502d54c0e9df 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -979,6 +979,31 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "nvshmem_memory_allocator",
+    srcs = ["nvshmem_memory_allocator.cc"],
+    hdrs = ["nvshmem_memory_allocator.h"],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        ":nvshmem",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:memory_allocation",
+        "//xla/stream_executor:memory_allocator",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
+        "@local_tsl//tsl/platform:numbers",
+        "@nvshmem//:nvshmem_lib",
+    ],
+)
+
 cc_library(
     name = "nvjitlink_support",
     srcs = ["nvjitlink_support.cc"],
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc
new file mode 100644
index 00000000000000..b05c32458c31a5
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc
@@ -0,0 +1,92 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/cuda/nvshmem_memory_allocator.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_format.h"
+#include "third_party/nvshmem/nvshmem.h"   // IWYU pragma: keep
+#include "third_party/nvshmem/nvshmemx.h"  // IWYU pragma: keep
+#include "xla/stream_executor/cuda/nvshmem.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/memory_allocation.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "tsl/platform/numbers.h"
+
+namespace stream_executor::gpu {
+namespace {
+
+absl::StatusOr<void*> NvshmemAllocate(uint64_t size) {
+  TF_RETURN_IF_ERROR(nvshmem::InitializeOnce());
+  VLOG(3) << absl::StreamFormat(
+      "Start allocation of %s (%llu bytes) for NVSHMEM",
+      tsl::strings::HumanReadableNumBytes(size), size);
+  void* buffer = nvshmem_malloc(size);
+  if (buffer == nullptr) {
+    return absl::InternalError(absl::StrFormat(
+        "Failed to allocate %s (%llu bytes) from NVSHMEM memory",
+        tsl::strings::HumanReadableNumBytes(size), size));
+  }
+  return buffer;
+}
+
+absl::Status NvshmemFree(void* ptr) {
+  TF_RETURN_IF_ERROR(nvshmem::InitializeOnce());
+  VLOG(3) << absl::StreamFormat("Start de-allocation for NVSHMEM buffer: %p",
+                                ptr);
+  nvshmem_free(ptr);
+  return absl::OkStatus();
+}
+
+// A memory allocated from NVSHMEM on the given executor.
+class NvshmemMemoryAllocation : public MemoryAllocation {
+ public:
+  NvshmemMemoryAllocation(void* ptr, uint64_t size);
+
+  ~NvshmemMemoryAllocation() final;
+  DeviceAddressBase address() const final;
+
+ private:
+  void* ptr_;
+  uint64_t size_;
+};
+
+}  // namespace
+
+NvshmemMemoryAllocation::NvshmemMemoryAllocation(void* ptr, uint64_t size)
+    : ptr_(ptr), size_(size) {}
+
+NvshmemMemoryAllocation::~NvshmemMemoryAllocation() {
+  CHECK_OK(NvshmemFree(ptr_));  // Crash OK
+}
+
+DeviceAddressBase NvshmemMemoryAllocation::address() const {
+  return DeviceAddressBase(ptr_, size_);
+}
+
+absl::StatusOr<std::unique_ptr<MemoryAllocation>>
+NvshmemMemoryAllocator::Allocate(uint64_t size) {
+  TF_ASSIGN_OR_RETURN(void* ptr, NvshmemAllocate(size));
+  return std::make_unique<NvshmemMemoryAllocation>(ptr, size);
+}
+
+}  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.h b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.h
new file mode 100644
index 00000000000000..ba19a2f8c8e66d
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.h
@@ -0,0 +1,37 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_CUDA_NVSHMEM_MEMORY_ALLOCATOR_H_
+#define XLA_STREAM_EXECUTOR_CUDA_NVSHMEM_MEMORY_ALLOCATOR_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "absl/status/statusor.h"
+#include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/memory_allocator.h"
+
+namespace stream_executor::gpu {
+
+// A memory allocator that uses NVSHMEM to allocate memory.
+class NvshmemMemoryAllocator : public MemoryAllocator {
+ public:
+  absl::StatusOr<std::unique_ptr<MemoryAllocation>> Allocate(
+      uint64_t size) final;
+};
+
+}  // namespace stream_executor::gpu
+
+#endif  // XLA_STREAM_EXECUTOR_CUDA_NVSHMEM_MEMORY_ALLOCATOR_H_

From 4c6f6242d0901eb4e152efd76dde5dc2227612cc Mon Sep 17 00:00:00 2001
From: Hariprasad Ravishankar <hravisha@mathworks.com>
Date: Fri, 12 Dec 2025 13:17:08 -0500
Subject: [PATCH 230/753] [mlir][tosa] Enhance TosaConvertTFLUInt8 pass to
 handle tosa.const with quant u8 values (#105178)

* Enhance TosaConvertTFLUInt8 pass to handle tosa.const with quant u8 values

* Fix test to use ui8 attribute values

* Revert "Fixes to convert tfl uint8"

This reverts commit 583daed6aa73fa7b6f5b8e53cb825f4228defb72.

* Revert "Enhance TosaConvertTFLUInt8 pass to handle tosa.const with quant u8 values"

This reverts commit 496b46ef927e427fc470c202ebc497638df9fcdd.

* Error if tosa.const operations are observed when running tosa-convert-tfl-uint8

* Update convert_tfl_uint8 pass to error if there tosa ops

* Address feedback
---
 .../mlir/tosa/tests/convert-tfl-uint8.mlir    | 19 ++++++++++++++++++-
 .../mlir/tosa/transforms/convert_tfl_uint8.cc | 10 +++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir b/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir
index 02e9c0649e3f78..cd9a2dcdf746fd 100644
--- a/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir
+++ b/tensorflow/compiler/mlir/tosa/tests/convert-tfl-uint8.mlir
@@ -1,4 +1,4 @@
-// RUN: tf-tosa-opt --tosa-convert-tfl-uint8  --verify-each %s | FileCheck %s
+// RUN: tf-tosa-opt --tosa-convert-tfl-uint8 --verify-diagnostics --verify-each %s | FileCheck %s
 
 
 // Operations for testing --tosa-convert-tfl-uint8
@@ -28,3 +28,20 @@ func.func @test_cast_ui8(%arg0: tensor<1x256x256x3x!quant.uniform<u8:f32, 0.0156
   %0 = "tfl.cast"(%arg0) : (tensor<1x256x256x3x!quant.uniform<u8:f32, 0.015603500418365002:128>>) -> tensor<1x256x256x3xf32>
   func.return %0 : tensor<1x256x256x3xf32>
 }
+
+// ----
+
+// CHECK-LABEL: test_error_tosa_ops
+func.func @test_error_tosa_ops(%arg0: tensor<5x10xi8>) -> (tensor<5x10xi8>, none) {
+
+  // Dummy use to TFL dialect to load TFL dialect in MLIR context
+  %0 = "tfl.no_value"() <{value}> : () -> none
+
+  // expected-error @+1 {{tosa operations are not expected in this pass. Run tosa-convert-tfl-uint8 before tosa-legalize-tfl}}
+  %cst1 = "tosa.const"() <{values = dense<1> : tensor<5x10xi8>}> : () -> tensor<5x10xi8>
+  // expected-error @+1 {{tosa operations are not expected in this pass. Run tosa-convert-tfl-uint8 before tosa-legalize-tfl}}
+  %1 = "tosa.add"(%arg0, %cst1) : (tensor<5x10xi8>, tensor<5x10xi8>) -> tensor<5x10xi8>
+
+
+  func.return %1, %0 : tensor<5x10xi8>, none
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc
index afd66102b8a29c..6edfc57ad8a89a 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc
@@ -264,7 +264,8 @@ LogicalResult convert_graph_uint8_tensor(mlir::MLIRContext &context,
     // Convert intermediate tensor.
     for (auto &op : bb) {
       if (llvm::dyn_cast<tosa::ConstOp>(&op)) {
-        continue;  // Skip if the operation is a tosa::ConstOp
+        // Skip tosa const ops created during rescaling. 
+        continue;
       }
 
       for (Value output_val : op.getResults()) {
@@ -355,6 +356,13 @@ void ConvertUint8ToInt8::runOnOperation() {
   auto &ctx = getContext();
   mlir::func::FuncOp func = getOperation();
 
+  func.walk([&](Operation *op) {
+    if (isa<TosaOp>(op)){
+      // Run this before calling convert_graph_uint8_tensor as rescaling introduces tosa ops
+      op->emitError("tosa operations are not expected in this pass. Run tosa-convert-tfl-uint8 before tosa-legalize-tfl");
+    }
+  });
+
   // Convert uint8 const tensor. const needs to be handled specifically.
   patterns.add<ConvertUint8QConstOp>(&ctx);
   (void)applyPatternsGreedily(func, std::move(patterns));

From 6e9b9161dd19952a661cf2ee8bffaa93472d263b Mon Sep 17 00:00:00 2001
From: Shaogang Wang <shawnw@nvidia.com>
Date: Fri, 12 Dec 2025 10:01:23 -0800
Subject: [PATCH 231/753] PR #34735: [XLA:GPU] enable dynamic slice fusion
 default lowered to cuda graph
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34735

📝 Summary of Changes
Added DebugOptions::DYNAMIC_SLICE_FUSION to the list of enabled GPU command buffers in the default debug options.

🚀 Kind of Contribution
⚡️ Performance Improvement

🧪 Unit Tests:
change the default setting, unittest has already been added.
Copybara import of the project:

--
12db9b02864a72046775dbd3684ec60beff0c791 by Shawn Wang <shawnw@nvidia.com>:

enable dynamic slice fusion default lowered to cuda graph

fix unittest

fix

fix

Merging this change closes #34735

PiperOrigin-RevId: 843733907
---
 .../xla/xla/backends/gpu/codegen/BUILD        | 11 ++++-
 .../gpu/codegen/dynamic_slice_fusion_test.cc  |  5 ++-
 third_party/xla/xla/debug_options_flags.cc    |  1 +
 .../xla/xla/debug_options_parsers_test.cc     | 41 +++++++++++--------
 4 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/BUILD b/third_party/xla/xla/backends/gpu/codegen/BUILD
index 506a39ee7b28b8..8c4cbe8e9941d2 100644
--- a/third_party/xla/xla/backends/gpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/BUILD
@@ -1,6 +1,7 @@
 load("@rules_cc//cc:cc_library.bzl", "cc_library")
 load("//xla:xla.default.bzl", "xla_cc_test")
 load("//xla/tests:build_defs.bzl", "xla_test")
+load("//xla/tsl:tsl.bzl", "if_google")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -185,6 +186,15 @@ cc_library(
 xla_test(
     name = "dynamic_slice_fusion_test",
     srcs = ["dynamic_slice_fusion_test.cc"],
+    # TODO(b/46791573): Remove heap_check= once the bug is fixed.
+    backend_args = if_google(
+        {
+            "b200": ["--heap_check="],
+            "a100": ["--heap_check="],
+            "h100": ["--heap_check="],
+        },
+        {},
+    ),
     backend_tags = {
         "gpu": [
             "multi_gpu",
@@ -224,7 +234,6 @@ xla_test(
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/tests:hlo_test_base",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/backends/gpu/codegen/dynamic_slice_fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/dynamic_slice_fusion_test.cc
index 6b6b860885065c..d99cd48be0455a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/dynamic_slice_fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/dynamic_slice_fusion_test.cc
@@ -48,7 +48,6 @@ limitations under the License.
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream.h"
@@ -3412,8 +3411,10 @@ TEST_F(DynamicSliceFusionTest,
       ROOT while = (s32[], s32[32,32], s32[32,32]) while(tuple), body=body, condition=condition
     }
   )";
+  HloModuleConfig config = GetModuleConfigWithoutCommandBuffer();
+  config.mutable_debug_options().set_xla_gpu_enable_dynamic_slice_fusion(true);
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> fused_module,
-                          ParseAndReturnVerifiedModule(hlo_fused));
+                          ParseAndReturnVerifiedModule(hlo_fused, config));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<OpaqueExecutable> wrapped_exec,
                           CreateExecutable(fused_module->Clone(), false));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Executable> exec,
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 2023cdbcba5ca1..5487a31836ecb3 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -243,6 +243,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::CUBLASLT);
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::CUSTOM_CALL);
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::CUDNN);
+  opts.add_xla_gpu_enable_command_buffer(DebugOptions::DYNAMIC_SLICE_FUSION);
   opts.set_xla_gpu_graph_min_graph_size(5);
   opts.set_xla_gpu_command_buffer_scheduling_mode(DebugOptions::LHS);
   opts.set_xla_gpu_command_buffer_unroll_loops(false);
diff --git a/third_party/xla/xla/debug_options_parsers_test.cc b/third_party/xla/xla/debug_options_parsers_test.cc
index 717394aded86d4..1fea535da5e44b 100644
--- a/third_party/xla/xla/debug_options_parsers_test.cc
+++ b/third_party/xla/xla/debug_options_parsers_test.cc
@@ -391,13 +391,14 @@ TEST(ParseRepeatedEnumModifiersTest, Invalid) {
 TEST(ParseRepeatedEnumFlagsTest, CommandBufferCmdType) {
   DebugOptions debug_options = DefaultDebugOptionsIgnoringFlags();
 
-  // Check that the default setting has 5 types.
+  // Check that the default setting has 6 types.
   const auto& enabled_types = debug_options.xla_gpu_enable_command_buffer();
-  ASSERT_EQ(enabled_types.size(), 5);
-  ASSERT_THAT(enabled_types,
-              ElementsAre(DebugOptions::FUSION, DebugOptions::CUBLAS,
-                          DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                          DebugOptions::CUDNN));
+  ASSERT_EQ(enabled_types.size(), 6);
+  ASSERT_THAT(
+      enabled_types,
+      ElementsAre(DebugOptions::FUSION, DebugOptions::CUBLAS,
+                  DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION));
 
   // Initialize the flag objects.
   std::vector<tsl::Flag> flag_objects;
@@ -406,26 +407,30 @@ TEST(ParseRepeatedEnumFlagsTest, CommandBufferCmdType) {
   // Removing options from the existing setting.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=-fusion,-cublas");
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
-  EXPECT_EQ(enabled_types.size(), 3);
-  EXPECT_THAT(enabled_types,
-              ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                          DebugOptions::CUDNN));
+  EXPECT_EQ(enabled_types.size(), 4);
+  EXPECT_THAT(
+      enabled_types,
+      ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION));
 
   // Removing an option that isn't there and adding a duplicate.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=+cublaslt,-fusion");
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
-  EXPECT_EQ(enabled_types.size(), 3);
-  EXPECT_THAT(enabled_types,
-              ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                          DebugOptions::CUDNN));
+  EXPECT_EQ(enabled_types.size(), 4);
+  EXPECT_THAT(
+      enabled_types,
+      ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION));
 
   // Adding an option.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=+cublas");
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
-  EXPECT_EQ(enabled_types.size(), 4);
-  EXPECT_THAT(enabled_types,
-              ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                          DebugOptions::CUDNN, DebugOptions::CUBLAS));
+  EXPECT_EQ(enabled_types.size(), 5);
+  EXPECT_THAT(
+      enabled_types,
+      ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION,
+                  DebugOptions::CUBLAS));
 
   // Overwriting the default setting.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=custom_call,fusion");

From 0e81d28322ab962b9abf41c5848cb825676d02b0 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Fri, 12 Dec 2025 10:21:57 -0800
Subject: [PATCH 232/753] Integrate LLVM at llvm/llvm-project@43bfec29cbec

Updates LLVM usage to match
[43bfec29cbec](https://github.com/llvm/llvm-project/commit/43bfec29cbec)

PiperOrigin-RevId: 843742433
---
 third_party/xla/third_party/llvm/workspace.bzl         |  4 ++--
 third_party/xla/third_party/shardy/temporary.patch     | 10 +++++-----
 third_party/xla/third_party/shardy/workspace.bzl       |  4 ++--
 .../codegen/emitters/transforms/convert_float_amd.cc   |  4 ++--
 .../emitters/transforms/vectorize_loads_stores.cc      |  4 ++--
 5 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index 26b3bf8809ac7b..69a8c63368c081 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "16c0893f04c04faa8ac36495363344840f7c5db1"
-    LLVM_SHA256 = "3f786bc56ecb8fce511fe504f9b0848c12b5312beb7bded23edfc77272698b90"
+    LLVM_COMMIT = "43bfec29cbecc1ff2e5aa6f8908c4d63e9c896c5"
+    LLVM_SHA256 = "d9c35a7c3764666abcf464955530154d528b2e5edeb97bfa8890f02cb52d1f30"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index ee2078ba7263f1..b67860af06a64b 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,15 +1,15 @@
 diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index 3c9c005..26b3bf8 100644
+index 26b3bf8..69a8c63 100644
 --- a/third_party/llvm/workspace.bzl
 +++ b/third_party/llvm/workspace.bzl
 @@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
  
  def repo(name):
      """Imports LLVM."""
--    LLVM_COMMIT = "48d942c7158af43094db1b5e6c59c6e6fcf1b5aa"
--    LLVM_SHA256 = "6ce4ac276a4687625e9f57e53715285d99b60c6553e0cde4db9b7e74f2179f69"
-+    LLVM_COMMIT = "16c0893f04c04faa8ac36495363344840f7c5db1"
-+    LLVM_SHA256 = "3f786bc56ecb8fce511fe504f9b0848c12b5312beb7bded23edfc77272698b90"
+-    LLVM_COMMIT = "16c0893f04c04faa8ac36495363344840f7c5db1"
+-    LLVM_SHA256 = "3f786bc56ecb8fce511fe504f9b0848c12b5312beb7bded23edfc77272698b90"
++    LLVM_COMMIT = "43bfec29cbecc1ff2e5aa6f8908c4d63e9c896c5"
++    LLVM_SHA256 = "d9c35a7c3764666abcf464955530154d528b2e5edeb97bfa8890f02cb52d1f30"
  
      tf_http_archive(
          name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index a0a63a05ad95ea..f2f1025815a856 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "179bcb16dc3c2b132f9bccff096cb5559486fdc2"
-    SHARDY_SHA256 = "ba6475e764d830d3e8f9ede9c28f3e67f6703606af10f1398cfcca6a13979e09"
+    SHARDY_COMMIT = "f36aaacad42e307da330bace41c920bdf23f1869"
+    SHARDY_SHA256 = "dd8f9591e7328564222df3e964009d34cf97bf753225b2c172418c3b946c7ee0"
 
     tf_http_archive(
         name = "shardy",
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
index 8dead6470b4763..1b06367c81ff93 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
@@ -365,7 +365,7 @@ struct RewriteFp8ExtFPattern : public Fp8OpRewritePattern<arith::ExtFOp> {
       return std::nullopt;
     }
 
-    mlir::Value vector = extract.getVector();
+    mlir::Value vector = extract.getSource();
 
     size_t element_count =
         mlir::cast<FixedVectorValue>(vector).getType().getNumElements();
@@ -389,7 +389,7 @@ struct RewriteFp8ExtFPattern : public Fp8OpRewritePattern<arith::ExtFOp> {
 
     for (const mlir::OpOperand& use : vector.getUses()) {
       extract = mlir::dyn_cast<vector::ExtractOp>(use.getOwner());
-      if (!extract || !extract->hasOneUse() || extract.getVector() != vector ||
+      if (!extract || !extract->hasOneUse() || extract.getSource() != vector ||
           !matchPos(extract, &pos)) {
         return std::nullopt;
       }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc b/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
index dc8dd8eebcf41b..c0a2398d7eec57 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
@@ -506,7 +506,7 @@ struct FoldVectorInsertExtractPairs
     auto bbarg = mlir::cast<mlir::BlockArgument>(insert.getDest());
     int64_t result_index = bbarg.getArgNumber() - 1;
     if (auto transfer_read =
-            extract.getVector().getDefiningOp<mlir::vector::TransferReadOp>()) {
+            extract.getSource().getDefiningOp<mlir::vector::TransferReadOp>()) {
       if (transfer_read.getBase().getType().getNumElements() ==
           vector_type.getNumElements()) {
         return rewriter.notifyMatchFailure(
@@ -538,7 +538,7 @@ struct FoldVectorInsertExtractPairs
       yield_op->setOperand(result_index, insert.getDest());
     });
     rewriter.replaceAllUsesWith(loop->getResult(result_index),
-                                extract.getVector());
+                                extract.getSource());
     return mlir::success();
   }
 };

From 6005dca05054c76177052dabb32e962e56a402c5 Mon Sep 17 00:00:00 2001
From: Nihar0071 <niharpatel007rd@gmail.com>
Date: Fri, 12 Dec 2025 14:11:13 -0500
Subject: [PATCH 233/753] Fix iOS build: correct apple_support platform paths

The platform references in .bazelrc were pointing to the incorrect
path @build_bazel_apple_support//configs/platforms which doesn't
exist in apple_support v1.24.5. Updated all 5 Apple platform
configurations to use the correct path
@build_bazel_apple_support//platforms.

This fixes the build error:
'no such package @@build_bazel_apple_support//configs/platforms':
BUILD file not found in directory 'configs/platforms'

Fixes #105127

Tested on:
- macOS Sequoia 26.0.1 (M1)
- Bazel 7.7.0 (via bazelisk 1.27.0)
- TensorFlow 2.20.0

All 5 platform configurations verified:
- ios_arm64
- ios_arm64e
- ios_sim_arm64
- ios_x86_64
- macos_arm64
---
 .bazelrc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 14a2128d591243..308d8593feee86 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -234,7 +234,7 @@ common:apple-toolchain --host_crosstool_top=@local_config_apple_cc//:toolchain
 common:macos_arm64 --cpu=darwin_arm64
 common:macos_arm64 --macos_minimum_os=11.0
 common:macos_arm64 --config=clang_local
-common:macos_arm64 --platforms=@build_bazel_apple_support//configs/platforms:darwin_arm64
+common:macos_arm64 --platforms=@build_bazel_apple_support//platforms:darwin_arm64
 
 # iOS configs for each architecture and the fat binary builds.
 common:ios --apple_platform_type=ios
@@ -247,16 +247,16 @@ common:ios_armv7 --cpu=ios_armv7
 common:ios_armv7 --platforms=@org_tensorflow//tensorflow/tools/toolchains/ios:ios_armv7
 common:ios_arm64 --config=ios
 common:ios_arm64 --cpu=ios_arm64
-common:ios_arm64 --platforms=@build_bazel_apple_support//configs/platforms:ios_arm64
+common:ios_arm64 --platforms=@build_bazel_apple_support//platforms:ios_arm64
 common:ios_arm64e --config=ios
 common:ios_arm64e --cpu=ios_arm64e
-common:ios_arm64e --platforms=@build_bazel_apple_support//configs/platforms:ios_arm64e
+common:ios_arm64e --platforms=@build_bazel_apple_support//platforms:ios_arm64e
 common:ios_sim_arm64 --config=ios
 common:ios_sim_arm64 --cpu=ios_sim_arm64
-common:ios_sim_arm64 --platforms=@build_bazel_apple_support//configs/platforms:ios_sim_arm64
+common:ios_sim_arm64 --platforms=@build_bazel_apple_support//platforms:ios_sim_arm64
 common:ios_x86_64 --config=ios
 common:ios_x86_64 --cpu=ios_x86_64
-common:ios_x86_64 --platforms=@build_bazel_apple_support//configs/platforms:ios_x86_64
+common:ios_x86_64 --platforms=@build_bazel_apple_support//platforms:ios_x86_64
 common:ios_fat --config=ios
 common:ios_fat --ios_multi_cpus=armv7,arm64,i386,x86_64
 

From 10689c979e1348a2135078c3f54b73ebb9fa3bc7 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Fri, 12 Dec 2025 11:44:03 -0800
Subject: [PATCH 234/753] [XLA:GPU] Add early exit after the layout assignment
 to XLA:GPU.

PiperOrigin-RevId: 843774817
---
 third_party/xla/xla/service/compiler.h        | 16 +++++++++
 .../xla/xla/service/gpu/gpu_compiler.cc       | 33 ++++++++++++++++++-
 .../xla/xla/service/gpu/gpu_compiler.h        |  4 +++
 .../xla/xla/service/gpu/gpu_compiler_test.cc  | 29 ++++++++++++++++
 .../gpu/legacy_gpu_aot_compilation_result.cc  | 19 +++++++++++
 .../gpu/legacy_gpu_aot_compilation_result.h   | 22 +++++++++++++
 6 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index e69bd65eeea5b9..5de2f37f65a9b4 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -205,6 +205,9 @@ class Compiler {
 
     // Embed HLO module in the executable. Only used on GPU at the moment.
     bool embed_hlo_module = true;
+
+    // If true, the compiler will exit after the layout assignment pass.
+    bool early_exit_with_layouts = false;
   };
 
   virtual ~Compiler() = default;
@@ -506,6 +509,18 @@ class AotCompilationOptions {
     gpu_target_config_ = gpu_target_config;
   }
 
+  // Provides a way to end compilation early and get partial outputs.
+  enum class EarlyExitPoint {
+    kNone,
+    kAfterLayoutAssignment,
+    kAfterBufferAssignment,
+  };
+
+  EarlyExitPoint early_exit_point() const { return early_exit_point_; }
+  void set_early_exit_point(EarlyExitPoint early_exit_point) {
+    early_exit_point_ = early_exit_point;
+  }
+
  protected:
   AotCompilationOptions();
 
@@ -525,6 +540,7 @@ class AotCompilationOptions {
   std::vector<std::string> sanitize_abilists_dataflow_;
   // Contains target-specific information required by AOT compilation.
   std::optional<Compiler::GpuTargetConfig> gpu_target_config_;
+  EarlyExitPoint early_exit_point_ = EarlyExitPoint::kNone;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index e5b83fb8f3b13f..4f3ca9e1cc7e67 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1435,6 +1435,9 @@ absl::Status GpuCompiler::OptimizeHloModule(
 
   TF_RETURN_IF_ERROR(RunLayoutAssignmentPasses(
       hlo_module, gpu_version, dnn_version, device_description));
+  if (options.early_exit_with_layouts) {
+    return absl::OkStatus();
+  }
 
   TF_RETURN_IF_ERROR(RunLayoutNormalizationPasses(
       hlo_module,
@@ -1900,6 +1903,9 @@ absl::StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
   TF_RETURN_IF_ERROR(
       OptimizeHloModule(module.get(), is_deviceless ? nullptr : stream_exec,
                         options, gpu_target_config, alias_info.get()));
+  if (options.early_exit_with_layouts) {
+    return std::move(module);
+  }
 
   TF_RETURN_IF_ERROR(RunPreSchedulingCopyInsertion(*module, device_description,
                                                    alias_info.get()));
@@ -2640,6 +2646,11 @@ GpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
   // compilation.
   CHECK_EQ(options.PlatformId(), PlatformId());
 
+  if (options.early_exit_point() !=
+      AotCompilationOptions::EarlyExitPoint::kNone) {
+    return EarlyExitCompileAheadOfTime(std::move(hlo_module), options);
+  }
+
   if (hlo_module->config()
           .debug_options()
           .xla_gpu_experimental_aot_compiled_thunks()) {
@@ -2665,6 +2676,26 @@ GpuCompiler::NewCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
   return results;
 }
 
+absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+GpuCompiler::EarlyExitCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
+                                         const AotCompilationOptions& options) {
+  bool early_exit_with_layouts =
+      options.early_exit_point() ==
+      AotCompilationOptions::EarlyExitPoint::kAfterLayoutAssignment;
+  CompileOptions compile_options;
+  compile_options.device_allocator = options.device_allocator();
+  compile_options.gpu_target_config = options.gpu_target_config();
+  compile_options.early_exit_with_layouts = early_exit_with_layouts;
+
+  std::vector<std::unique_ptr<AotCompilationResult>> results;
+  TF_ASSIGN_OR_RETURN(
+      auto optimized_module,
+      RunHloPasses(std::move(hlo_module), options.executor(), compile_options));
+  results.push_back(std::make_unique<EarlyExitCompilationResult>(
+      std::move(optimized_module)));
+  return std::move(results);
+}
+
 absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
 GpuCompiler::LegacyCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
                                       const AotCompilationOptions& options) {
@@ -2685,7 +2716,6 @@ GpuCompiler::LegacyCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
     optimized_module = std::move(hlo_module);
   }
 
-  std::vector<std::unique_ptr<AotCompilationResult>> results;
 
   const std::optional<Compiler::GpuTargetConfig>& target_config =
       options.gpu_target_config();
@@ -2700,6 +2730,7 @@ GpuCompiler::LegacyCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
                              {options.device_allocator()}, gpu_device_info));
 
   // Create GpuThunkAotCompilationResult if thunk runtime is enabled.
+  std::vector<std::unique_ptr<AotCompilationResult>> results;
   TF_ASSIGN_OR_RETURN(
       results.emplace_back(),
       LegacyGpuAotCompilationResult::FromModule(
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index ad2e6840dba02e..595e9210c8c061 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -284,6 +284,10 @@ class GpuCompiler : public LLVMCompiler {
   LegacyCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
                            const AotCompilationOptions& options);
 
+  absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  EarlyExitCompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
+                              const AotCompilationOptions& options);
+
   se::Platform::Id platform_id_;
 
   // The triple that represents our target.
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index a12b23b02ee29a..61c501b9fa8b3f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -107,6 +107,9 @@ using ::testing::IsEmpty;
 using ::testing::IsSupersetOf;
 using ::testing::Matches;
 using ::testing::Not;
+using ::testing::NotNull;
+using ::testing::Pointee;
+using ::testing::Property;
 using ::testing::SizeIs;
 using ::testing::StartsWith;
 using ::testing::TempDir;
@@ -1019,6 +1022,32 @@ TEST_P(AotCompilationTest, ExportAndImportAotResult) {
   EXPECT_TRUE(LiteralTestUtil::Equal(result, literal_expected_result));
 }
 
+TEST_P(AotCompilationTest, EarlyExitWithLayouts) {
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> add_1_hlo,
+      ParseAndReturnVerifiedModule(R"hlo(
+    add1 {
+      p = s32[] parameter(0)
+      c = s32[] constant(1)
+      ROOT a = s32[] add(p, c)
+    }
+
+    ENTRY e {
+      p = s32[] parameter(0)
+      ROOT r = s32[] fusion(p), kind=kLoop, calls=add1
+    })hlo",
+                                   GetModuleConfigForTest()));
+
+  aot_options_->set_early_exit_point(
+      AotCompilationOptions::EarlyExitPoint::kAfterLayoutAssignment);
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::vector<std::unique_ptr<AotCompilationResult>> aot_results,
+      compiler_->CompileAheadOfTime(std::move(add_1_hlo), *aot_options_));
+  EXPECT_THAT(aot_results,
+              ElementsAre(Pointee(Property(
+                  &AotCompilationResult::optimized_module, NotNull()))));
+}
+
 class KernelCacheTest : public HloTestBase {
  public:
   void SetUp() override {
diff --git a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc
index 86f920b6d57ed0..36c87453f8ba98 100644
--- a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc
+++ b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.cc
@@ -116,5 +116,24 @@ LegacyGpuAotCompilationResult::buffer_assignment() const {
                                      buffer_size_bytes_function, &alias_info);
 }
 
+absl::StatusOr<std::string> EarlyExitCompilationResult::SerializeAsString()
+    const {
+  return Unavailable(
+      "SerializeAsString() is not supported by EarlyExitCompilationResult.");
+}
+
+absl::StatusOr<std::unique_ptr<Executable>>
+EarlyExitCompilationResult::LoadExecutable(
+    const se::StreamExecutor* stream_exec) && {
+  return Unavailable(
+      "LoadExecutable() is not supported by EarlyExitCompilationResult.");
+}
+
+absl::StatusOr<std::unique_ptr<BufferAssignment>>
+EarlyExitCompilationResult::buffer_assignment() const {
+  return Unavailable(
+      "buffer_assignment() is not supported by EarlyExitCompilationResult.");
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
index f4c462b782251a..476cc2b8dd3ec8 100644
--- a/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/gpu/legacy_gpu_aot_compilation_result.h
@@ -91,6 +91,28 @@ class LegacyGpuAotCompilationResult : public AotCompilationResult {
   Compiler* compiler_;
 };
 
+class EarlyExitCompilationResult : public AotCompilationResult {
+ public:
+  explicit EarlyExitCompilationResult(std::unique_ptr<HloModule> module)
+      : module_(std::move(module)) {}
+
+  absl::StatusOr<std::string> SerializeAsString() const override;
+
+  absl::StatusOr<std::unique_ptr<Executable>>
+      LoadExecutable(const se::StreamExecutor* stream_exec) && override;
+
+  const HloModule* optimized_module() const override { return module_.get(); }
+  std::shared_ptr<HloModule> shared_optimized_module() override {
+    return module_;
+  }
+
+  absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
+      const override;
+
+ private:
+  std::shared_ptr<HloModule> module_;
+};
+
 }  // namespace gpu
 }  // namespace xla
 

From 619f7362044e25c37127e50edcba86cc627f94ea Mon Sep 17 00:00:00 2001
From: Matthias Kramm <kramm@google.com>
Date: Fri, 12 Dec 2025 12:25:39 -0800
Subject: [PATCH 235/753] Fix memory leak in MakeShapesInfo() and re-enable
 layout conversion.

PiperOrigin-RevId: 843790085
---
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 49 ++++++++++++++-----
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index a173d3c87e0674..08592d5a9c7776 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -1013,7 +1013,7 @@ absl::Status PjRtCApiClient::DmaUnmap(void* data) {
 // Helper struct and method used to serialize shapes past the C API boundary.
 struct ShapesInfo {
   std::vector<size_t> shape_num_dims;
-  std::vector<PJRT_Buffer_MemoryLayout*> layout_list;
+  std::vector<std::optional<pjrt::BufferMemoryLayoutData>> layout_list;
   std::vector<const int64_t*> num_dims;
   std::vector<PJRT_Buffer_Type> element_type_list;
 };
@@ -1021,7 +1021,7 @@ struct ShapesInfo {
 ShapesInfo MakeShapesInfo(absl::Span<const Shape> shapes) {
   std::vector<size_t> shape_num_dims;
   shape_num_dims.reserve(shapes.size());
-  std::vector<PJRT_Buffer_MemoryLayout*> layout_list;
+  std::vector<std::optional<pjrt::BufferMemoryLayoutData>> layout_list;
   layout_list.reserve(shapes.size());
   std::vector<const int64_t*> num_dims;
   num_dims.reserve(shapes.size());
@@ -1034,15 +1034,20 @@ ShapesInfo MakeShapesInfo(absl::Span<const Shape> shapes) {
     num_dims.push_back(shapes[i].dimensions().data());
     element_type_list.push_back(
         pjrt::ConvertToPjRtBufferType(shapes[i].element_type()));
-    // TODO(b/434246423): Enable this once ASAN failure is fixed.
-    // if (shapes[i].has_layout()) {
-    //   // this is messed up
-    //   auto& layout = shapes[i].layout();
-    //   TF_ASSIGN_OR_RETURN(
-    //       pjrt::BufferMemoryLayoutData c_layout_data,
-    //       pjrt::ConvertToBufferMemoryLayoutData(layout));
-    //   layout_list.push_back(&(c_layout_data.c_layout));
-    layout_list.push_back(nullptr);
+
+    if (shapes[i].has_layout()) {
+      auto& layout = shapes[i].layout();
+      absl::StatusOr<pjrt::BufferMemoryLayoutData> c_layout_data =
+          pjrt::ConvertToBufferMemoryLayoutData(layout);
+      if (c_layout_data.ok()) {
+        layout_list.push_back(std::optional<pjrt::BufferMemoryLayoutData>(
+            std::move(*c_layout_data)));
+      } else {
+        layout_list.push_back({});
+      }
+    } else {
+      layout_list.push_back({});
+    }
   }
 
   return ShapesInfo{
@@ -1088,7 +1093,16 @@ PjRtCApiClient::MakeCrossHostReceiveBuffers(
   args.shape_num_dims = shapes_info.shape_num_dims.data();
   args.num_dims = shapes_info.num_dims.data();
   args.element_types = shapes_info.element_type_list.data();
-  args.layouts = shapes_info.layout_list.data();
+
+  std::vector<PJRT_Buffer_MemoryLayout*> layout_list;
+  for (int i = 0; i < shapes_info.layout_list.size(); i++) {
+    if (shapes_info.layout_list[i].has_value()) {
+      layout_list.push_back(&shapes_info.layout_list[i]->c_layout);
+    } else {
+      layout_list.push_back(nullptr);
+    }
+  }
+  args.layouts = layout_list.data();
 
   args.notifier = pjrt::CppCrossHostRecvNotifierToC(c_api, std::move(notifier));
   args.device = tensorflow::down_cast<PjRtCApiDevice*>(device)->c_device();
@@ -1179,7 +1193,16 @@ PjRtCApiClient::CrossHostReceiveBuffers(
   args.shape_num_dims = shapes_info.shape_num_dims.data();
   args.num_dims = shapes_info.num_dims.data();
   args.element_types = shapes_info.element_type_list.data();
-  args.layouts = shapes_info.layout_list.data();
+
+  std::vector<PJRT_Buffer_MemoryLayout*> layout_list;
+  for (int i = 0; i < shapes_info.layout_list.size(); i++) {
+    if (shapes_info.layout_list[i].has_value()) {
+      layout_list.push_back(&shapes_info.layout_list[i]->c_layout);
+    } else {
+      layout_list.push_back(nullptr);
+    }
+  }
+  args.layouts = layout_list.data();
 
   args.device = tensorflow::down_cast<PjRtCApiDevice*>(device)->c_device();
   args.src_global_device_ids = src_global_device_ids.data();

From 4ebc189ac60e9a2218b28ae0e8a99272c4294b4a Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 12 Dec 2025 13:57:54 -0800
Subject: [PATCH 236/753] Update XNNPACK in XLA

PiperOrigin-RevId: 843822351
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake | 2 +-
 tensorflow/workspace2.bzl                         | 6 +++---
 third_party/xla/third_party/xnnpack/workspace.bzl | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index e3efb7cf5ab430..9fde94f7b1a847 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG 6400256d3a687d52ae268a553d7208534f39800a
+  GIT_TAG e436865104ef12ff872db68ec94ce1c5332a6ecb
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 924a23bbe2fd2b..a8db5fd1117c1e 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,9 +168,9 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "2d5e0b17d2c25c7100f66e58e7d76b9c4b8a65b1d86c33c9214dc05fce00ee69",
-        strip_prefix = "XNNPACK-6400256d3a687d52ae268a553d7208534f39800a",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/6400256d3a687d52ae268a553d7208534f39800a.zip"),
+        sha256 = "f855387f6c4e7db5facdcd83fc41bc94b1888239b396e055ba48dc6da9d89446",
+        strip_prefix = "XNNPACK-e436865104ef12ff872db68ec94ce1c5332a6ecb",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/e436865104ef12ff872db68ec94ce1c5332a6ecb.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index 1c2e9b15daa1e7..6bf3c0ec9ef322 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "2d5e0b17d2c25c7100f66e58e7d76b9c4b8a65b1d86c33c9214dc05fce00ee69",
-        strip_prefix = "XNNPACK-6400256d3a687d52ae268a553d7208534f39800a",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/6400256d3a687d52ae268a553d7208534f39800a.zip"),
+        sha256 = "f855387f6c4e7db5facdcd83fc41bc94b1888239b396e055ba48dc6da9d89446",
+        strip_prefix = "XNNPACK-e436865104ef12ff872db68ec94ce1c5332a6ecb",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/e436865104ef12ff872db68ec94ce1c5332a6ecb.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)

From 1c8cc3218eabe568a443f4f7f2a1944f7c3ca42b Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Fri, 12 Dec 2025 14:50:59 -0800
Subject: [PATCH 237/753] Update Shardy to
 e8435cb5c0b852b0e249b3fbf5f42dd51988afc9. Fix jax typo

PiperOrigin-RevId: 843841248
---
 .../xla/third_party/shardy/temporary.patch        | 15 ---------------
 third_party/xla/third_party/shardy/workspace.bzl  |  4 ++--
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index b67860af06a64b..e69de29bb2d1d6 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,15 +0,0 @@
-diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index 26b3bf8..69a8c63 100644
---- a/third_party/llvm/workspace.bzl
-+++ b/third_party/llvm/workspace.bzl
-@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
- 
- def repo(name):
-     """Imports LLVM."""
--    LLVM_COMMIT = "16c0893f04c04faa8ac36495363344840f7c5db1"
--    LLVM_SHA256 = "3f786bc56ecb8fce511fe504f9b0848c12b5312beb7bded23edfc77272698b90"
-+    LLVM_COMMIT = "43bfec29cbecc1ff2e5aa6f8908c4d63e9c896c5"
-+    LLVM_SHA256 = "d9c35a7c3764666abcf464955530154d528b2e5edeb97bfa8890f02cb52d1f30"
- 
-     tf_http_archive(
-         name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index f2f1025815a856..beb5197aeec510 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "f36aaacad42e307da330bace41c920bdf23f1869"
-    SHARDY_SHA256 = "dd8f9591e7328564222df3e964009d34cf97bf753225b2c172418c3b946c7ee0"
+    SHARDY_COMMIT = "e8435cb5c0b852b0e249b3fbf5f42dd51988afc9"
+    SHARDY_SHA256 = "59e1e10fef4f425cd3ef7f5200a5d8111476230818496402cc83b234b277b4be"
 
     tf_http_archive(
         name = "shardy",

From bfe99e7876b8163b5b254a2f966c418e2c4e0421 Mon Sep 17 00:00:00 2001
From: Seher Ellis <sacer@google.com>
Date: Fri, 12 Dec 2025 15:04:12 -0800
Subject: [PATCH 238/753] [XLA:LHS] Remove unused AnyStartHasForceDelay
 function.

PiperOrigin-RevId: 843846331
---
 .../xla/xla/service/latency_hiding_scheduler.cc      | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/third_party/xla/xla/service/latency_hiding_scheduler.cc b/third_party/xla/xla/service/latency_hiding_scheduler.cc
index 1fd1e01dd9dd11..aefb20a2718470 100644
--- a/third_party/xla/xla/service/latency_hiding_scheduler.cc
+++ b/third_party/xla/xla/service/latency_hiding_scheduler.cc
@@ -168,18 +168,6 @@ bool HasForceDelayAsyncAttribute(const HloInstruction* instr) {
   return attr.has_value() && attr.value() == "force_delay_async";
 }
 
-const HloGraphNode* AnyStartHasForceDelay(const HloGraphNode* n) {
-  CHECK(n->IsSupportedAsyncDone())
-      << "Meant to check if any start feeding a done has forced delay";
-  for (auto& v : n->GetPredecessors()) {
-    if (v.Target().IsSupportedAsyncStart() &&
-        HasForceDelayAsyncAttribute(&v.Target().GetInstr())) {
-      return v.TargetPtr();
-    }
-  }
-  return nullptr;
-}
-
 absl::flat_hash_map<int64_t, int64_t>
 GetNumResourcesNeededForAnnotationWithKeepOriginalOrderAttrs(
     const DefaultSchedulerCore::SchedulingState& sched_state,

From ab3cb0704610923c13ac5b99f4aa9fd7c22f74a5 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Fri, 12 Dec 2025 15:05:05 -0800
Subject: [PATCH 239/753] Add proto serialization for Collective(Done)Thunk

PiperOrigin-RevId: 843846667
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  17 +
 .../backends/gpu/runtime/collective_thunk.cc  |  51 +++
 .../backends/gpu/runtime/collective_thunk.h   |   9 +
 .../gpu/runtime/collective_thunk_test.cc      |  69 ++++
 .../xla/xla/backends/gpu/runtime/thunk.cc     | 305 ++++++++++++++++++
 .../xla/xla/backends/gpu/runtime/thunk.h      |   3 +
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  90 ++++++
 .../runtime/thunk_proto_deserialization.cc    |  15 +-
 8 files changed, 555 insertions(+), 4 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index b686b7a78f6a47..2b410193a09098 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1795,6 +1795,21 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "collective_thunk_test",
+    srcs = ["collective_thunk_test.cc"],
+    deps = [
+        ":collective_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla/tsl/platform:statusor",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_absl//absl/log:check",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "p2p_thunk_common",
     srcs = ["p2p_thunk_common.cc"],
@@ -2171,6 +2186,7 @@ cc_library(
         "@com_google_absl//absl/functional:function_ref",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
@@ -2799,6 +2815,7 @@ cc_library(
     srcs = ["thunk_proto_deserialization.cc"],
     hdrs = ["thunk_proto_deserialization.h"],
     deps = [
+        ":collective_thunk",
         ":conditional_thunk",
         ":convolution_reorder_thunk",
         ":convolution_thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
index f67cca8c784ad3..1d94ec8c5c8ce0 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
@@ -410,6 +410,22 @@ std::optional<AsyncEventsUniqueId> CollectiveThunk::GetAsyncEventsUniqueId()
   return absl::bit_cast<AsyncEventsUniqueId>(async_events_.get());
 }
 
+absl::StatusOr<CollectiveThunkProto> CollectiveThunk::ToCollectiveThunkProto()
+    const {
+  CollectiveThunkProto proto;
+
+  proto.set_async_stream_kind(stream_kind_);
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (!async_events_id.has_value()) {
+    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  }
+  proto.set_async_events_unique_id(async_events_id->value());
+  proto.set_thunk_kind(Thunk::KindToProto(kind()));
+
+  return proto;
+}
+
 CollectiveDoneThunk::CollectiveDoneThunk(
     Thunk::Kind kind, ThunkInfo thunk_info,
     std::shared_ptr<CollectiveThunk::AsyncEvents> async_events,
@@ -446,4 +462,39 @@ std::optional<AsyncEventsUniqueId> CollectiveDoneThunk::GetAsyncEventsUniqueId()
   // We rely on the fact that the pointer to async_events_ is unique.
   return absl::bit_cast<AsyncEventsUniqueId>(async_events_.get());
 }
+
+absl::StatusOr<ThunkProto> CollectiveDoneThunk::ToProto() const {
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  CollectiveDoneThunkProto* thunk_proto = proto.mutable_collective_done_thunk();
+  thunk_proto->set_async_stream_kind(stream_kind_);
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (!async_events_id.has_value()) {
+    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  }
+  thunk_proto->set_async_events_unique_id(async_events_id->value());
+  thunk_proto->set_thunk_kind(Thunk::KindToProto(kind()));
+  return proto;
+}
+
+absl::StatusOr<std::unique_ptr<CollectiveDoneThunk>>
+CollectiveDoneThunk::FromProto(
+    ThunkInfo thunk_info, const CollectiveDoneThunkProto& thunk_proto,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
+      async_events_map[AsyncEventsUniqueId{
+          thunk_proto.async_events_unique_id()}];
+  if (!async_events) {
+    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  }
+
+  TF_ASSIGN_OR_RETURN(Thunk::Kind kind,
+                      Thunk::KindFromProto(thunk_proto.thunk_kind()));
+  return std::make_unique<CollectiveDoneThunk>(kind, std::move(thunk_info),
+                                               async_events,
+                                               thunk_proto.async_stream_kind());
+}
+
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
index ba936ac9d9d472..be08146f9a95c5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
@@ -117,6 +117,8 @@ class CollectiveThunk : public Thunk {
     absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<se::Event>> events_
         ABSL_GUARDED_BY(mu_);
   };
+  using AsyncEventsMap =
+      absl::flat_hash_map<AsyncEventsUniqueId, std::shared_ptr<AsyncEvents>>;
 
   // Logging support.
   static std::string GetDeviceString(const CollectiveParams& params);
@@ -149,6 +151,8 @@ class CollectiveThunk : public Thunk {
                              nccl_stream_id().value());
   }
 
+  absl::StatusOr<CollectiveThunkProto> ToCollectiveThunkProto() const;
+
  protected:
   // Run collective operation on a given stream and return if the first call
   // rendezvous with other participants is needed.
@@ -220,6 +224,11 @@ class CollectiveDoneThunk : public Thunk {
     return async_events_;
   }
 
+  absl::StatusOr<ThunkProto> ToProto() const override;
+  static absl::StatusOr<std::unique_ptr<CollectiveDoneThunk>> FromProto(
+      ThunkInfo thunk_info, const CollectiveDoneThunkProto& thunk_proto,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
  private:
   std::shared_ptr<CollectiveThunk::AsyncEvents> async_events_;
   AsyncStreamKind stream_kind_ = AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE;
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc
new file mode 100644
index 00000000000000..43aa470e095778
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc
@@ -0,0 +1,69 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/collective_thunk.h"
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/log/check.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        collective_done_thunk {
+          thunk_kind: 1
+          async_stream_kind: 2
+          async_events_unique_id: 3
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<CollectiveDoneThunk> thunk,
+      CollectiveDoneThunk::FromProto(thunk_info, proto.collective_done_thunk(),
+                                     async_events_map));
+  CHECK_NE(thunk->async_events(), nullptr);
+
+  TF_ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_collective_done_thunk()->set_async_events_unique_id(
+      round_trip_proto.collective_done_thunk().async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.cc b/third_party/xla/xla/backends/gpu/runtime/thunk.cc
index 5bd1e169aea492..5e174cf83177b0 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/functional/function_ref.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
@@ -109,6 +110,310 @@ Thunk::ExecuteParams::ExecuteParams(
 
 //===----------------------------------------------------------------------===//
 
+ThunkKindProto Thunk::KindToProto(Kind kind) {
+  switch (kind) {
+    case kAllGather:
+      return THUNK_KIND_ALL_GATHER;
+    case kAllGatherDone:
+      return THUNK_KIND_ALL_GATHER_DONE;
+    case kAllGatherStart:
+      return THUNK_KIND_ALL_GATHER_START;
+    case kAllReduce:
+      return THUNK_KIND_ALL_REDUCE;
+    case kAllReduceDone:
+      return THUNK_KIND_ALL_REDUCE_DONE;
+    case kAllReduceStart:
+      return THUNK_KIND_ALL_REDUCE_START;
+    case kAllToAll:
+      return THUNK_KIND_ALL_TO_ALL;
+    case kAllToAllDone:
+      return THUNK_KIND_ALL_TO_ALL_DONE;
+    case kAllToAllStart:
+      return THUNK_KIND_ALL_TO_ALL_START;
+    case kBuffersDebugChecksum:
+      return THUNK_KIND_BUFFERS_DEBUG_CHECKSUM;
+    case kBuffersDebugFloatCheck:
+      return THUNK_KIND_BUFFERS_DEBUG_FLOAT_CHECK;
+    case kCollectiveBroadcast:
+      return THUNK_KIND_COLLECTIVE_BROADCAST;
+    case kCollectiveBroadcastDone:
+      return THUNK_KIND_COLLECTIVE_BROADCAST_DONE;
+    case kCollectiveBroadcastStart:
+      return THUNK_KIND_COLLECTIVE_BROADCAST_START;
+    case kCollectiveKernel:
+      return THUNK_KIND_COLLECTIVE_KERNEL;
+    case kCollectiveMetadata:
+      return THUNK_KIND_COLLECTIVE_METADATA;
+    case kCollectivePermute:
+      return THUNK_KIND_COLLECTIVE_PERMUTE;
+    case kCollectivePermuteDone:
+      return THUNK_KIND_COLLECTIVE_PERMUTE_DONE;
+    case kCollectivePermuteStart:
+      return THUNK_KIND_COLLECTIVE_PERMUTE_START;
+    case kCommandBuffer:
+      return THUNK_KIND_COMMAND_BUFFER;
+    case kConditional:
+      return THUNK_KIND_CONDITIONAL;
+    case kConvolution:
+      return THUNK_KIND_CONVOLUTION;
+    case kConvolutionReorder:
+      return THUNK_KIND_CONVOLUTION_REORDER;
+    case kCopy:
+      return THUNK_KIND_COPY;
+    case kCopyDone:
+      return THUNK_KIND_COPY_DONE;
+    case kCuDnn:
+      return THUNK_KIND_CU_DNN;
+    case kCubSort:
+      return THUNK_KIND_CUB_SORT;
+    case kCublasLtMatmul:
+      return THUNK_KIND_CUBLAS_LT_MATMUL;
+    case kCustomCall:
+      return THUNK_KIND_CUSTOM_CALL;
+    case kCustomKernel:
+      return THUNK_KIND_CUSTOM_KERNEL;
+    case kDynamicSlice:
+      return THUNK_KIND_DYNAMIC_SLICE;
+    case kFft:
+      return THUNK_KIND_FFT;
+    case kGemm:
+      return THUNK_KIND_GEMM;
+    case kGroupDone:
+      return THUNK_KIND_GROUP_DONE;
+    case kGroupStart:
+      return THUNK_KIND_GROUP_START;
+    case kHostExecuteDone:
+      return THUNK_KIND_HOST_EXECUTE_DONE;
+    case kHostExecuteStart:
+      return THUNK_KIND_HOST_EXECUTE_START;
+    case kHostRecv:
+      return THUNK_KIND_HOST_RECV;
+    case kHostRecvDone:
+      return THUNK_KIND_HOST_RECV_DONE;
+    case kHostSend:
+      return THUNK_KIND_HOST_SEND;
+    case kHostSendDone:
+      return THUNK_KIND_HOST_SEND_DONE;
+    case kInfeed:
+      return THUNK_KIND_INFEED;
+    case kKernel:
+      return THUNK_KIND_KERNEL;
+    case kMemset32BitValue:
+      return THUNK_KIND_MEMSET32_BIT_VALUE;
+    case kMemzero:
+      return THUNK_KIND_MEMZERO;
+    case kNorm:
+      return THUNK_KIND_NORM;
+    case kNvshmemAllReduceDone:
+      return THUNK_KIND_NVSHMEM_ALL_REDUCE_DONE;
+    case kNvshmemAllReduceStart:
+      return THUNK_KIND_NVSHMEM_ALL_REDUCE_START;
+    case kNvshmemCollectivePermute:
+      return THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE;
+    case kNvshmemCollectivePermuteDone:
+      return THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE_DONE;
+    case kNvshmemCollectivePermuteStart:
+      return THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE_START;
+    case kNvshmemRecv:
+      return THUNK_KIND_NVSHMEM_RECV;
+    case kNvshmemRecvDone:
+      return THUNK_KIND_NVSHMEM_RECV_DONE;
+    case kNvshmemSend:
+      return THUNK_KIND_NVSHMEM_SEND;
+    case kNvshmemSendDone:
+      return THUNK_KIND_NVSHMEM_SEND_DONE;
+    case kOutfeed:
+      return THUNK_KIND_OUTFEED;
+    case kPartitionId:
+      return THUNK_KIND_PARTITION_ID;
+    case kRaggedAllToAll:
+      return THUNK_KIND_RAGGED_ALL_TO_ALL;
+    case kRaggedAllToAllDone:
+      return THUNK_KIND_RAGGED_ALL_TO_ALL_DONE;
+    case kRaggedAllToAllStart:
+      return THUNK_KIND_RAGGED_ALL_TO_ALL_START;
+    case kRecv:
+      return THUNK_KIND_RECV;
+    case kRecvDone:
+      return THUNK_KIND_RECV_DONE;
+    case kReduceScatter:
+      return THUNK_KIND_REDUCE_SCATTER;
+    case kReduceScatterDone:
+      return THUNK_KIND_REDUCE_SCATTER_DONE;
+    case kReduceScatterStart:
+      return THUNK_KIND_REDUCE_SCATTER_START;
+    case kReplicaId:
+      return THUNK_KIND_REPLICA_ID;
+    case kSelectK:
+      return THUNK_KIND_SELECT_K;
+    case kSend:
+      return THUNK_KIND_SEND;
+    case kSendDone:
+      return THUNK_KIND_SEND_DONE;
+    case kSequential:
+      return THUNK_KIND_SEQUENTIAL;
+    case kTriangularSolve:
+      return THUNK_KIND_TRIANGULAR_SOLVE;
+    case kWaitForStreams:
+      return THUNK_KIND_WAIT_FOR_STREAMS;
+    case kWhile:
+      return THUNK_KIND_WHILE;
+  };
+}
+
+absl::StatusOr<Thunk::Kind> Thunk::KindFromProto(ThunkKindProto kind) {
+  switch (kind) {
+    case THUNK_KIND_ALL_GATHER:
+      return kAllGather;
+    case THUNK_KIND_ALL_GATHER_DONE:
+      return kAllGatherDone;
+    case THUNK_KIND_ALL_GATHER_START:
+      return kAllGatherStart;
+    case THUNK_KIND_ALL_REDUCE:
+      return kAllReduce;
+    case THUNK_KIND_ALL_REDUCE_DONE:
+      return kAllReduceDone;
+    case THUNK_KIND_ALL_REDUCE_START:
+      return kAllReduceStart;
+    case THUNK_KIND_ALL_TO_ALL:
+      return kAllToAll;
+    case THUNK_KIND_ALL_TO_ALL_DONE:
+      return kAllToAllDone;
+    case THUNK_KIND_ALL_TO_ALL_START:
+      return kAllToAllStart;
+    case THUNK_KIND_BUFFERS_DEBUG_CHECKSUM:
+      return kBuffersDebugChecksum;
+    case THUNK_KIND_BUFFERS_DEBUG_FLOAT_CHECK:
+      return kBuffersDebugFloatCheck;
+    case THUNK_KIND_COLLECTIVE_BROADCAST:
+      return kCollectiveBroadcast;
+    case THUNK_KIND_COLLECTIVE_BROADCAST_DONE:
+      return kCollectiveBroadcastDone;
+    case THUNK_KIND_COLLECTIVE_BROADCAST_START:
+      return kCollectiveBroadcastStart;
+    case THUNK_KIND_COLLECTIVE_KERNEL:
+      return kCollectiveKernel;
+    case THUNK_KIND_COLLECTIVE_METADATA:
+      return kCollectiveMetadata;
+    case THUNK_KIND_COLLECTIVE_PERMUTE:
+      return kCollectivePermute;
+    case THUNK_KIND_COLLECTIVE_PERMUTE_DONE:
+      return kCollectivePermuteDone;
+    case THUNK_KIND_COLLECTIVE_PERMUTE_START:
+      return kCollectivePermuteStart;
+    case THUNK_KIND_COMMAND_BUFFER:
+      return kCommandBuffer;
+    case THUNK_KIND_CONDITIONAL:
+      return kConditional;
+    case THUNK_KIND_CONVOLUTION:
+      return kConvolution;
+    case THUNK_KIND_CONVOLUTION_REORDER:
+      return kConvolutionReorder;
+    case THUNK_KIND_COPY:
+      return kCopy;
+    case THUNK_KIND_COPY_DONE:
+      return kCopyDone;
+    case THUNK_KIND_CU_DNN:
+      return kCuDnn;
+    case THUNK_KIND_CUB_SORT:
+      return kCubSort;
+    case THUNK_KIND_CUBLAS_LT_MATMUL:
+      return kCublasLtMatmul;
+    case THUNK_KIND_CUSTOM_CALL:
+      return kCustomCall;
+    case THUNK_KIND_CUSTOM_KERNEL:
+      return kCustomKernel;
+    case THUNK_KIND_DYNAMIC_SLICE:
+      return kDynamicSlice;
+    case THUNK_KIND_FFT:
+      return kFft;
+    case THUNK_KIND_GEMM:
+      return kGemm;
+    case THUNK_KIND_GROUP_DONE:
+      return kGroupDone;
+    case THUNK_KIND_GROUP_START:
+      return kGroupStart;
+    case THUNK_KIND_HOST_EXECUTE_DONE:
+      return kHostExecuteDone;
+    case THUNK_KIND_HOST_EXECUTE_START:
+      return kHostExecuteStart;
+    case THUNK_KIND_HOST_RECV:
+      return kHostRecv;
+    case THUNK_KIND_HOST_RECV_DONE:
+      return kHostRecvDone;
+    case THUNK_KIND_HOST_SEND:
+      return kHostSend;
+    case THUNK_KIND_HOST_SEND_DONE:
+      return kHostSendDone;
+    case THUNK_KIND_INFEED:
+      return kInfeed;
+    case THUNK_KIND_KERNEL:
+      return kKernel;
+    case THUNK_KIND_MEMSET32_BIT_VALUE:
+      return kMemset32BitValue;
+    case THUNK_KIND_MEMZERO:
+      return kMemzero;
+    case THUNK_KIND_NORM:
+      return kNorm;
+    case THUNK_KIND_NVSHMEM_ALL_REDUCE_DONE:
+      return kNvshmemAllReduceDone;
+    case THUNK_KIND_NVSHMEM_ALL_REDUCE_START:
+      return kNvshmemAllReduceStart;
+    case THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE:
+      return kNvshmemCollectivePermute;
+    case THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE_DONE:
+      return kNvshmemCollectivePermuteDone;
+    case THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE_START:
+      return kNvshmemCollectivePermuteStart;
+    case THUNK_KIND_NVSHMEM_RECV:
+      return kNvshmemRecv;
+    case THUNK_KIND_NVSHMEM_RECV_DONE:
+      return kNvshmemRecvDone;
+    case THUNK_KIND_NVSHMEM_SEND:
+      return kNvshmemSend;
+    case THUNK_KIND_NVSHMEM_SEND_DONE:
+      return kNvshmemSendDone;
+    case THUNK_KIND_OUTFEED:
+      return kOutfeed;
+    case THUNK_KIND_PARTITION_ID:
+      return kPartitionId;
+    case THUNK_KIND_RAGGED_ALL_TO_ALL:
+      return kRaggedAllToAll;
+    case THUNK_KIND_RAGGED_ALL_TO_ALL_DONE:
+      return kRaggedAllToAllDone;
+    case THUNK_KIND_RAGGED_ALL_TO_ALL_START:
+      return kRaggedAllToAllStart;
+    case THUNK_KIND_RECV:
+      return kRecv;
+    case THUNK_KIND_RECV_DONE:
+      return kRecvDone;
+    case THUNK_KIND_REDUCE_SCATTER:
+      return kReduceScatter;
+    case THUNK_KIND_REDUCE_SCATTER_DONE:
+      return kReduceScatterDone;
+    case THUNK_KIND_REDUCE_SCATTER_START:
+      return kReduceScatterStart;
+    case THUNK_KIND_REPLICA_ID:
+      return kReplicaId;
+    case THUNK_KIND_SELECT_K:
+      return kSelectK;
+    case THUNK_KIND_SEND:
+      return kSend;
+    case THUNK_KIND_SEND_DONE:
+      return kSendDone;
+    case THUNK_KIND_SEQUENTIAL:
+      return kSequential;
+    case THUNK_KIND_TRIANGULAR_SOLVE:
+      return kTriangularSolve;
+    case THUNK_KIND_WAIT_FOR_STREAMS:
+      return kWaitForStreams;
+    case THUNK_KIND_WHILE:
+      return kWhile;
+    default:
+      return absl::InternalError(absl::StrCat("Unknown ThunkKindProto:", kind));
+  };
+}
+
 /*static*/ absl::string_view Thunk::KindToString(Thunk::Kind kind) {
 #define CASE(x)  \
   case Thunk::x: \
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.h b/third_party/xla/xla/backends/gpu/runtime/thunk.h
index dbbfa3309a27ad..bd88654eaf963a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.h
@@ -202,6 +202,9 @@ class Thunk {
     // go/keep-sorted end
   };
 
+  static ThunkKindProto KindToProto(Kind kind);
+  static absl::StatusOr<Thunk::Kind> KindFromProto(ThunkKindProto kind);
+
   // TODO(ezhulenev): This should become a part of StreamExecutor library, but
   // for now we keep it here as a Thunk implementation detail. It's not yet
   // clear what else should become a part of "executable source", we likely
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index f2a93d80e31eb8..0cd18e571009c5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -53,6 +53,83 @@ message ThunkMetadataListProto {
   repeated ThunkMetadataProto thunk_metadata = 1;
 }
 
+enum ThunkKindProto {
+  THUNK_KIND_UNSPECIFIED = 0;
+  THUNK_KIND_ALL_GATHER = 1;
+  THUNK_KIND_ALL_GATHER_DONE = 2;
+  THUNK_KIND_ALL_GATHER_START = 3;
+  THUNK_KIND_ALL_REDUCE = 4;
+  THUNK_KIND_ALL_REDUCE_DONE = 5;
+  THUNK_KIND_ALL_REDUCE_START = 6;
+  THUNK_KIND_ALL_TO_ALL = 7;
+  THUNK_KIND_ALL_TO_ALL_DONE = 8;
+  THUNK_KIND_ALL_TO_ALL_START = 9;
+  THUNK_KIND_BUFFERS_DEBUG_CHECKSUM = 10;
+  THUNK_KIND_BUFFERS_DEBUG_FLOAT_CHECK = 11;
+  THUNK_KIND_COLLECTIVE_BROADCAST = 12;
+  THUNK_KIND_COLLECTIVE_BROADCAST_DONE = 13;
+  THUNK_KIND_COLLECTIVE_BROADCAST_START = 14;
+  THUNK_KIND_COLLECTIVE_KERNEL = 15;
+  THUNK_KIND_COLLECTIVE_METADATA = 16;
+  THUNK_KIND_COLLECTIVE_PERMUTE = 17;
+  THUNK_KIND_COLLECTIVE_PERMUTE_DONE = 18;
+  THUNK_KIND_COLLECTIVE_PERMUTE_START = 19;
+  THUNK_KIND_COMMAND_BUFFER = 20;
+  THUNK_KIND_CONDITIONAL = 21;
+  THUNK_KIND_CONVOLUTION = 22;
+  THUNK_KIND_CONVOLUTION_REORDER = 23;
+  THUNK_KIND_COPY = 24;
+  THUNK_KIND_COPY_DONE = 25;
+  THUNK_KIND_CU_DNN = 26;
+  THUNK_KIND_CUB_SORT = 27;
+  THUNK_KIND_CUBLAS_LT_MATMUL = 28;
+  THUNK_KIND_CUSTOM_CALL = 29;
+  THUNK_KIND_CUSTOM_KERNEL = 30;
+  THUNK_KIND_DYNAMIC_SLICE = 31;
+  THUNK_KIND_FFT = 32;
+  THUNK_KIND_GEMM = 33;
+  THUNK_KIND_GROUP_DONE = 34;
+  THUNK_KIND_GROUP_START = 35;
+  THUNK_KIND_HOST_EXECUTE_DONE = 36;
+  THUNK_KIND_HOST_EXECUTE_START = 37;
+  THUNK_KIND_HOST_RECV = 38;
+  THUNK_KIND_HOST_RECV_DONE = 39;
+  THUNK_KIND_HOST_SEND = 40;
+  THUNK_KIND_HOST_SEND_DONE = 41;
+  THUNK_KIND_INFEED = 42;
+  THUNK_KIND_KERNEL = 43;
+  THUNK_KIND_MEMSET32_BIT_VALUE = 44;
+  THUNK_KIND_MEMZERO = 45;
+  THUNK_KIND_NORM = 46;
+  THUNK_KIND_NVSHMEM_ALL_REDUCE_DONE = 47;
+  THUNK_KIND_NVSHMEM_ALL_REDUCE_START = 48;
+  THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE = 49;
+  THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE_DONE = 50;
+  THUNK_KIND_NVSHMEM_COLLECTIVE_PERMUTE_START = 51;
+  THUNK_KIND_NVSHMEM_RECV = 52;
+  THUNK_KIND_NVSHMEM_RECV_DONE = 53;
+  THUNK_KIND_NVSHMEM_SEND = 54;
+  THUNK_KIND_NVSHMEM_SEND_DONE = 55;
+  THUNK_KIND_OUTFEED = 56;
+  THUNK_KIND_PARTITION_ID = 57;
+  THUNK_KIND_RAGGED_ALL_TO_ALL = 58;
+  THUNK_KIND_RAGGED_ALL_TO_ALL_DONE = 59;
+  THUNK_KIND_RAGGED_ALL_TO_ALL_START = 60;
+  THUNK_KIND_RECV = 61;
+  THUNK_KIND_RECV_DONE = 62;
+  THUNK_KIND_REDUCE_SCATTER = 63;
+  THUNK_KIND_REDUCE_SCATTER_DONE = 64;
+  THUNK_KIND_REDUCE_SCATTER_START = 65;
+  THUNK_KIND_REPLICA_ID = 66;
+  THUNK_KIND_SELECT_K = 67;
+  THUNK_KIND_SEND = 68;
+  THUNK_KIND_SEND_DONE = 69;
+  THUNK_KIND_SEQUENTIAL = 70;
+  THUNK_KIND_TRIANGULAR_SOLVE = 71;
+  THUNK_KIND_WAIT_FOR_STREAMS = 72;
+  THUNK_KIND_WHILE = 73;
+}
+
 message CopyThunkProto {
   ShapedSliceProto source_buffer = 1;
   ShapedSliceProto destination_buffer = 2;
@@ -300,6 +377,18 @@ message CustomKernelThunkProto {
   CustomKernelProto custom_kernel = 3;
 }
 
+message CollectiveThunkProto {
+  ThunkKindProto thunk_kind = 1;
+  AsyncStreamKind async_stream_kind = 2;
+  uint64 async_events_unique_id = 3;
+}
+
+message CollectiveDoneThunkProto {
+  ThunkKindProto thunk_kind = 1;
+  AsyncStreamKind async_stream_kind = 2;
+  uint64 async_events_unique_id = 3;
+}
+
 message ThunkProto {
   ThunkInfoProto thunk_info = 1;
 
@@ -338,6 +427,7 @@ message ThunkProto {
     HostRecvThunkProto host_recv_thunk = 34;
     HostRecvDoneThunkProto host_recv_done_thunk = 35;
     CustomKernelThunkProto custom_kernel_thunk = 36;
+    CollectiveDoneThunkProto collective_done_thunk = 37;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index e323e9f722bee8..a34814163fe4be 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "google/protobuf/descriptor.h"
 #include "google/protobuf/message.h"
+#include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_reorder_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_thunk.h"
@@ -87,6 +88,7 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
     const HloModule* absl_nullable hlo_module, absl::string_view platform_name,
     HostExecuteAsyncEventsMap& host_executable_async_events_map,
     HostSendRecvAsyncEventsMap& host_send_recv_async_events_map,
+    CollectiveThunk::AsyncEventsMap& collective_async_events_map,
     const std::optional<stream_executor::KernelLoaderSpec::SymbolResolver>&
         symbol_resolver) {
   TF_ASSIGN_OR_RETURN(Thunk::ThunkInfo thunk_info,
@@ -95,7 +97,7 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
     return DeserializeThunkProtoImpl(
         thunk_proto, buffer_allocations, hlo_module, platform_name,
         host_executable_async_events_map, host_send_recv_async_events_map,
-        symbol_resolver);
+        collective_async_events_map, symbol_resolver);
   };
 
   switch (thunk_proto.impl_case()) {
@@ -189,7 +191,8 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
             return DeserializeThunkProtoImpl(
                 thunk_proto, custom_allocations, hlo_module, platform_name,
                 host_executable_async_events_map,
-                host_send_recv_async_events_map, symbol_resolver);
+                host_send_recv_async_events_map, collective_async_events_map,
+                symbol_resolver);
           };
       return DynamicSliceThunk::FromProto(std::move(thunk_info),
                                           thunk_proto.dynamic_slice_thunk(),
@@ -235,7 +238,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return CustomKernelThunk::FromProto(std::move(thunk_info),
                                           thunk_proto.custom_kernel_thunk(),
                                           buffer_allocations, symbol_resolver);
-
+    case ThunkProto::kCollectiveDoneThunk:
+      return CollectiveDoneThunk::FromProto(std::move(thunk_info),
+                                            thunk_proto.collective_done_thunk(),
+                                            collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);
@@ -263,10 +269,11 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProto(
         symbol_resolver) {
   HostExecuteAsyncEventsMap host_executable_async_events_map;
   HostSendRecvAsyncEventsMap host_send_recv_async_events_map;
+  CollectiveThunk::AsyncEventsMap collective_async_events_map;
   return DeserializeThunkProtoImpl(
       thunk_proto, buffer_allocations, hlo_module, platform_name,
       host_executable_async_events_map, host_send_recv_async_events_map,
-      symbol_resolver);
+      collective_async_events_map, symbol_resolver);
 }
 
 }  // namespace xla::gpu

From 23b6ac7fa8cbf9945d0832be62e1c54935b46af2 Mon Sep 17 00:00:00 2001
From: Michael Kuperstein <mkuper@google.com>
Date: Fri, 12 Dec 2025 15:30:53 -0800
Subject: [PATCH 240/753] [XLA] Remove unused cache_key field from
 HloModuleGroup

PiperOrigin-RevId: 843854869
---
 third_party/xla/xla/hlo/ir/hlo_module_group.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_module_group.h b/third_party/xla/xla/hlo/ir/hlo_module_group.h
index 63fba3704d07f7..c7ff76d818823c 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module_group.h
+++ b/third_party/xla/xla/hlo/ir/hlo_module_group.h
@@ -92,18 +92,10 @@ class HloModuleGroup {
   // Returns true if there are no modules in the module group.
   bool empty() const { return !module_; }
 
-  absl::string_view cache_key() const { return cache_key_; }
-  void set_cache_key(absl::string_view cache_key) {
-    cache_key_ = std::string(cache_key);
-  }
-
- private:
   std::string name_;
 
   // Vector of modules as std::unique_ptrs.
   std::unique_ptr<HloModule> module_;
-
-  std::string cache_key_;
 };
 
 std::ostream& operator<<(std::ostream& out, const HloModuleGroup& group);

From 0ceec291ab9a2a59f6374d0fdbf5305c85eb3e6a Mon Sep 17 00:00:00 2001
From: Tommy Chiang <ototot@google.com>
Date: Fri, 12 Dec 2025 15:35:12 -0800
Subject: [PATCH 241/753] Remove unused `stripped_cc_info`

PiperOrigin-RevId: 843856280
---
 tensorflow/tensorflow.bzl | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 58d5872cf92976..d7e791ea40cb98 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -89,7 +89,6 @@ load(
     "@local_xla//third_party/py/rules_pywrap:pywrap.default.bzl",
     "use_pywrap_rules",
     _pybind_extension = "pybind_extension",
-    _stripped_cc_info = "stripped_cc_info",
 )
 
 # Do not sort: copybara rule changes this
@@ -3341,8 +3340,6 @@ def pybind_extension(
             **kwargs
         )
 
-stripped_cc_info = _stripped_cc_info
-
 # Note: we cannot add //third_party/tf_runtime:__subpackages__ here,
 # because that builds all of tf_runtime's packages, and some of them
 # are known not to build on big endian systems.

From c74f0f14608dcea307183288466d079e32f045f8 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Fri, 12 Dec 2025 15:37:26 -0800
Subject: [PATCH 242/753] Fix unused std::optional<T>::has_value()

This ended up being unused with how the if statement was written. This becomes a compile time error with an upcoming version of libc++.

PiperOrigin-RevId: 843856944
---
 third_party/xla/xla/fp_util.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/fp_util.h b/third_party/xla/xla/fp_util.h
index 7295a1bee56949..e877f7c5dd9de0 100644
--- a/third_party/xla/xla/fp_util.h
+++ b/third_party/xla/xla/fp_util.h
@@ -279,8 +279,10 @@ constexpr T GoldbergUlp(T x) {
     return GoldbergUlp(std::numeric_limits<T>::min());
   }
   std::optional<int> maybe_exponent = LogBase(x);
-  if (maybe_exponent.has_value(); const int exponent = *maybe_exponent) {
-    return ScaleBase(std::numeric_limits<T>::epsilon(), exponent);
+  if (maybe_exponent.has_value()) {
+    if (const int exponent = *maybe_exponent) {
+      return ScaleBase(std::numeric_limits<T>::epsilon(), exponent);
+    }
   }
   if constexpr (std::numeric_limits<T>::has_quiet_NaN) {
     return std::numeric_limits<T>::quiet_NaN();

From d4210972a899074f9bebd73f85803f40afdf0e8b Mon Sep 17 00:00:00 2001
From: Yurii Topin <yuriit@google.com>
Date: Fri, 12 Dec 2025 15:41:45 -0800
Subject: [PATCH 243/753] Add Hermetic C++ Toolchains for Linux aarch64 builds.

Hermetic toolchains provide builds that are isolated from the host system, cutting down on unexpected dependencies and side effects.

By default, TF now builds for Linux aarch64 architectures using hermetic C++ toolchains. For non-hermetic builds, add the flag --config=clang_local. Cross-compilation on Linux x86_64 to Linux aarch64 still runs non-hermetically and will be updated after the next rules_ml_toolchain release.

OpenMP changes are connected to Hermetic C++ because it uses the -nodefaultlibs flag, simply passing -fopenmp is insufficient. OpenMP's dependencies must be explicitly linked to ensure correct inclusion, as automatic linking is disabled.

PiperOrigin-RevId: 843858303
---
 .bazelrc                           | 16 +++++++++-------
 WORKSPACE                          |  6 +++---
 ci/official/envs/linux_arm64       |  2 +-
 tensorflow/tools/pip_package/BUILD |  2 +-
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 14a2128d591243..768289676da89a 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -285,6 +285,7 @@ common:mkl_threadpool -c opt
 # Config setting to build oneDNN with Compute Library for the Arm Architecture (ACL).
 # with Eigen threadpool support
 common:mkl_aarch64_threadpool --define=build_with_mkl_aarch64=true
+common:mkl_aarch64_threadpool --@compute_library//:openmp=false
 common:mkl_aarch64_threadpool -c opt
 
 # This is an alias for the mkl_aarch64_threadpool build.
@@ -297,7 +298,6 @@ common:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
 
 # CUDA: This config refers to building CUDA op kernels with nvcc.
 common:cuda --repo_env TF_NEED_CUDA=1
-common:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain
 common:cuda --@local_config_cuda//:enable_cuda
 common:cuda --config=cuda_version
 # This flag is needed to include CUDA libraries.
@@ -332,8 +332,6 @@ common:cuda_clang --linkopt="-lm"
 # Set up compilation CUDA version and paths and use the CUDA Clang toolchain.
 common:cuda_clang_official --config=cuda_clang
 common:cuda_clang_official --config=cuda_version
-common:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-18/bin/clang"
-common:cuda_clang_official --crosstool_top="@local_config_cuda//crosstool:toolchain"
 
 # Build with nvcc for CUDA and clang for host
 common:cuda_nvcc --config=cuda
@@ -763,14 +761,18 @@ common:release_gpu_linux_clang_local --config=release_cpu_linux_clang_local
 
 common:release_arm64_linux --config=release_linux_base
 common:release_arm64_linux --config=linux_arm64
-common:release_arm64_linux --config=clang_local
-common:release_arm64_linux --repo_env=CC="/usr/lib/llvm-18/bin/clang"
-common:release_arm64_linux --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/clang"
-common:release_arm64_linux --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain"
 common:release_arm64_linux --config=mkl_aarch64_threadpool
 common:release_arm64_linux --copt=-flax-vector-conversions
 test:release_arm64_linux --flaky_test_attempts=3
 
+# Deprecated release CPU config with non-hermetic toolchains.
+common:release_arm64_linux_clang_local --config=release_arm64_linux
+common:release_arm64_linux_clang_local --config=clang_local
+common:release_arm64_linux_clang_local --repo_env=CC="/usr/lib/llvm-18/bin/clang"
+common:release_arm64_linux_clang_local --repo_env=BAZEL_COMPILER="/usr/lib/llvm-18/bin/clang"
+common:release_arm64_linux_clang_local --crosstool_top="@ml2014_clang_aarch64_config_aarch64//crosstool:toolchain"
+test:release_arm64_linux_clang_local --flaky_test_attempts=3
+
 common:release_cpu_macos --config=avx_linux
 
 # Base build configs for macOS
diff --git a/WORKSPACE b/WORKSPACE
index 97d26fb10fd770..a334cf6080074a 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -21,10 +21,10 @@ tf_http_archive(
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18",
-    strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9",
+    sha256 = "1a911c79fc734c39538781a7a4672b06aab8354c1ddb985c98e3df78f430bcde",
+    strip_prefix = "rules_ml_toolchain-f13852164b6fe240f8a989a744221a51e0d485cd",
     urls = tf_mirror_urls(
-        "https://github.com/yuriivcs/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/f13852164b6fe240f8a989a744221a51e0d485cd.tar.gz",
     ),
 )
 
diff --git a/ci/official/envs/linux_arm64 b/ci/official/envs/linux_arm64
index c886ca75f93f57..9c312f132a4a2f 100644
--- a/ci/official/envs/linux_arm64
+++ b/ci/official/envs/linux_arm64
@@ -26,7 +26,7 @@ TFCI_INDEX_HTML_ENABLE=1
 TFCI_LIB_SUFFIX="-cpu-linux-arm64"
 TFCI_OUTPUT_DIR=build_output
 TFCI_WHL_AUDIT_ENABLE=1
-TFCI_WHL_AUDIT_PLAT=manylinux2014_aarch64
+TFCI_WHL_AUDIT_PLAT=manylinux_2_27_aarch64
 TFCI_WHL_BAZEL_TEST_ENABLE=1
 TFCI_WHL_SIZE_LIMIT=275M
 TFCI_WHL_SIZE_LIMIT_ENABLE=1
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 924c43f6afc3f2..8a38b8146a1fd0 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -441,7 +441,7 @@ py_test(
 
 verify_manylinux_compliance_test(
     name = "manylinux_compliance_test",
-    aarch64_compliance_tag = "manylinux_2_17_aarch64",
+    aarch64_compliance_tag = "manylinux_2_27_aarch64",
     ppc64le_compliance_tag = "manylinux_2_17_ppc64le",
     test_tags = [
         "manual",

From 1245d3c045aee40e71484b5d2e3f3d0bbc9b5b25 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Fri, 12 Dec 2025 15:43:40 -0800
Subject: [PATCH 244/753] [XLA:GPU] Fix typo in bug number and add heap_check
 disable for "gpu" backend.

Corrected the bug number in the TODO comment and added the `--heap_check=` argument for the generic "gpu" backend in dynamic_slice_fusion_test.

PiperOrigin-RevId: 843858899
---
 third_party/xla/xla/backends/gpu/codegen/BUILD | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/BUILD b/third_party/xla/xla/backends/gpu/codegen/BUILD
index 8c4cbe8e9941d2..edf523644fe7ba 100644
--- a/third_party/xla/xla/backends/gpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/BUILD
@@ -186,9 +186,10 @@ cc_library(
 xla_test(
     name = "dynamic_slice_fusion_test",
     srcs = ["dynamic_slice_fusion_test.cc"],
-    # TODO(b/46791573): Remove heap_check= once the bug is fixed.
+    # TODO(b/467915739): Remove heap_check= once the bug is fixed.
     backend_args = if_google(
         {
+            "gpu": ["--heap_check="],
             "b200": ["--heap_check="],
             "a100": ["--heap_check="],
             "h100": ["--heap_check="],

From a895a5a90765fdbe5dc4cd288e3e3167d95ffce2 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Fri, 12 Dec 2025 15:44:07 -0800
Subject: [PATCH 245/753] [TSL] Don't change binding during topology
 enumeration

We do not need the information from this level of detail and it has performance implications

PiperOrigin-RevId: 843859045
---
 third_party/xla/xla/tsl/platform/numa_hwloc.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/xla/tsl/platform/numa_hwloc.cc b/third_party/xla/xla/tsl/platform/numa_hwloc.cc
index 50ba2c6a664fd1..971363aea7d1c1 100644
--- a/third_party/xla/xla/tsl/platform/numa_hwloc.cc
+++ b/third_party/xla/xla/tsl/platform/numa_hwloc.cc
@@ -35,6 +35,11 @@ hwloc_topology_t GetHWLocTopology() {
       LOG(ERROR) << "Call to hwloc_topology_init() failed";
       return;
     }
+    if (hwloc_topology_set_flags(hwloc_topology_handle,
+                                 HWLOC_TOPOLOGY_FLAG_DONT_CHANGE_BINDING)) {
+      LOG(ERROR) << "Call to hwloc_topology_set_flags() failed";
+      return;
+    }
     if (hwloc_topology_load(hwloc_topology_handle)) {
       LOG(ERROR) << "Call to hwloc_topology_load() failed";
       return;

From ae87db29fb489dd85ef195647121a3a85ce11ff3 Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Fri, 12 Dec 2025 15:46:58 -0800
Subject: [PATCH 246/753] Add support and tests for sharded -> unreduced
 operation.

**But why do we need such an operation?**

You might want to use it directly: it's a kind of a lazy (i.e. no-comms) reduce_sum over shards, without changing the logical shape:

```python
reshard(f32[4@x], P(Unreduced={x})) : f32[4]{U:x}
```

Physically, we would zero-pad:

 ```
# f32[4@x] i.e. per-device shape is (2,)
Device-0   Device-1
[0, 1]     [2, 3]

# f32[4]{U:x} i.e. per-device shape is (4,)
Device-0         Device-1
[0, 1, 0, 0]     [0, 0, 2, 3]
```

(There are other valid physical possibilities because Unreduced is flexible. For example, `Device-0: [0/2, 1/2, 2/2, 3/2]` and `Device-1: [0/2, 1/2, 2/2, 3/2]` is valid, but would require comms and would have weird numeric effects. Terrible.)

The inverse operation (not the transpose, since those change the types) is `reshard(f32[4]{U:x}, P('x'))` and physically is a reduce-scatter, which naturally has the right effect on the physical buffers.

**But as another motivation**, this operation naturally arises from autodiff, if we allow other reasonable expressions. For example, if we want to allow elementwise multiplication of sharded and Reduced values at the user level (because everything that works with Replicated should work with Reduced):

```python
a: f32[4@x]
b: f32[4]{R: x}
c: f32[4@x] = a * b
```

we would desugar that as

```python
b_: f32[4@x] = reshard(b, P('x'))  # Reduced -> Sharded
c: f32[4@x] = mul(a, b_)
```

Then the backward pass would require a Sharded -> Unreduced operation:

```python
db_: f32[4@x] = mul(a, dc)
db: f32[4]{U:x} = reshard(db_, P(Unreduced={x}))  # Sharded -> Unreduced
```

**Before this change**, we actually had buggy behavior in that autodiff example where we multiply Reduced with Sharded. We would get incorrect gradients because our lowering of the backward pass's Sharded->Unreduced operation used to all-gather instead of zero-pad.

One very very interesting thing is comparing to varying -> unreduced support inside shard_map, which works via shape-changing rather than zero-padding! How? The varying -> unreduced pcast is shape-preserving operation inside shmap, but when returning shard_map naturally concats so as to increase shapes. If we want exactly the same to be expressible outside shard_map, we might additionally need shape-changing operations like `f32[4@x] -> f32[2]{U:x}` and its transpose. But we'll leave that to future work.

Co-authored-by: Matthew Johnson <mattjj@google.com>
PiperOrigin-RevId: 843859866
---
 third_party/xla/xla/python/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 46d996d59c1a9b..6361abdfe96083 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,6 +18,6 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER 39  // New coordination service implementation.
+#define JAX_IFRT_VERSION_NUMBER 40  // Shardy sharded -> unreduced
 
 #endif  // XLA_PYTHON_VERSION_H_

From e5edcc5c018d1d3825b55e56dd65f92b32e7aeea Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Fri, 12 Dec 2025 15:54:23 -0800
Subject: [PATCH 247/753] Implement device time measurement in IFRT Proxy

This CL propagates device time measured during execution back to the IFRT Proxy client. If there is an active device time measurement on the client side, IFRT Proxy enables device time measurement on the server side and propagate the measured time back to the client. The caller can wait for `ExecuteResult::status` and read the device time.

PiperOrigin-RevId: 843862101
---
 .../xla/xla/python/ifrt_proxy/client/BUILD    |   5 +
 .../python/ifrt_proxy/client/executable.cc    |  73 +++++++++--
 .../xla/python/ifrt_proxy/client/executable.h |   3 +
 .../ifrt_proxy/client/executable_test.cc      | 102 ++++++++++++++--
 .../python/ifrt_proxy/client/rpc_helper.cc    |   1 +
 .../xla/python/ifrt_proxy/client/rpc_helper.h |   3 +
 .../xla/python/ifrt_proxy/common/VERSION.md   |   7 ++
 .../ifrt_proxy/common/ifrt_service.proto      |  16 ++-
 .../xla/python/ifrt_proxy/common/versions.h   |   4 +
 .../xla/xla/python/ifrt_proxy/server/BUILD    |   4 +
 .../python/ifrt_proxy/server/ifrt_backend.cc  | 107 +++++++++++++++--
 .../python/ifrt_proxy/server/ifrt_backend.h   |  13 ++
 .../ifrt_proxy/server/ifrt_backend_test.cc    | 113 ++++++++++++++++--
 13 files changed, 411 insertions(+), 40 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt_proxy/client/BUILD b/third_party/xla/xla/python/ifrt_proxy/client/BUILD
index 9892f3aebc5e62..1c34eea924511b 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/BUILD
+++ b/third_party/xla/xla/python/ifrt_proxy/client/BUILD
@@ -442,6 +442,7 @@ cc_library(
         "//xla/pjrt:host_callback",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_layout",
+        "//xla/pjrt/profiling:device_time_measurement",
         "//xla/python/ifrt",
         "//xla/python/ifrt:attribute_map",
         "//xla/python/ifrt:sharding_serdes",
@@ -470,6 +471,7 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:cord",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@local_tsl//tsl/platform:platform_port",
@@ -623,6 +625,8 @@ ifrt_proxy_cc_test(
         ":version",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
+        "//xla/pjrt/profiling:device_time_measurement",
+        "//xla/pjrt/profiling/test_util:mock_device_time_measurement",
         "//xla/python/ifrt",
         "//xla/python/ifrt:basic_device_list",
         "//xla/python/ifrt:mock",
@@ -643,6 +647,7 @@ ifrt_proxy_cc_test(
         "@com_google_googletest//:gtest_main",
         "@com_google_protobuf//:protobuf",
         "@llvm-project//llvm:Support",
+        "@local_tsl//tsl/platform",
         "@local_tsl//tsl/platform:protobuf",
     ],
 )
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable.cc b/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
index a1173d09c8ee7d..957b9fd5e4841d 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
@@ -35,6 +35,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "llvm/Support/Casting.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -42,6 +43,7 @@
 #include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
+#include "xla/pjrt/profiling/device_time_measurement.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/array_spec.h"
 #include "xla/python/ifrt/attribute_map.h"
@@ -735,10 +737,17 @@ LoadedExecutable::Execute(absl::Span<xla::ifrt::ArrayRef> args,
     }
   }
 
+  std::optional<uint64_t> device_time_key = xla::GetDeviceTimeMeasurementKey();
+  if (device_time_key.has_value()) {
+    // An active device time measurement requires the server to respond with
+    // measured device times after the execution is complete.
+    req->mutable_execute_options()->set_fill_status(true);
+  }
+
   // Starting version 6, the server populates the status future only if it was
   // explicitly requested via `options.fill_status`.
-  const bool result_needs_exec_status =
-      rpc_helper_->protocol_version() < 6 || options.fill_status;
+  const bool result_needs_exec_status = rpc_helper_->protocol_version() < 6 ||
+                                        req->execute_options().fill_status();
 
   // The client generates handles if the protocol version is sufficiently newer,
   // and we've already seen at least one response from an execute (and thus know
@@ -789,10 +798,13 @@ LoadedExecutable::Execute(absl::Span<xla::ifrt::ArrayRef> args,
     }
     rpc_helper_->LoadedExecutableExecute(std::move(req));
     if (result_needs_exec_status) {
-      // Note that `CheckFuture` needs to be sent after
+      // Note that the RPCs within `FetchExecuteResult` need to be sent after
       // `LoadedExecutableExecute` above, or the server will not recognize the
       // handle being sent.
-      result.status = rpc_helper_->CheckFuture(status_handle);
+      tsl::Future<> status = FetchExecuteResult(status_handle, device_time_key);
+      if (options.fill_status) {
+        result.status = std::move(status);
+      }
     }
 
     return result;
@@ -808,8 +820,8 @@ LoadedExecutable::Execute(absl::Span<xla::ifrt::ArrayRef> args,
       Array::Destruct(rpc_helper_.get(), ArrayHandle{output.array_handle()});
     }
     if (result_needs_exec_status) {
-      // `CheckFuture` deletes the server-side future handle.
-      rpc_helper_->CheckFuture(response->status_handle());
+      // `FetchExecuteResult` deletes the server-side future handle.
+      FetchExecuteResult(response->status_handle(), device_time_key);
     }
     return status;
   }
@@ -836,7 +848,11 @@ LoadedExecutable::Execute(absl::Span<xla::ifrt::ArrayRef> args,
     }
   }
   if (result_needs_exec_status) {
-    result.status = rpc_helper_->CheckFuture(response->status_handle());
+    tsl::Future<> status =
+        FetchExecuteResult(response->status_handle(), device_time_key);
+    if (options.fill_status) {
+      result.status = std::move(status);
+    }
   } else {
     CHECK_EQ(response->status_handle(), 0);
   }
@@ -853,6 +869,49 @@ absl::Span<xla::ifrt::Device* const> LoadedExecutable::addressable_devices()
   return addressable_devices_;
 }
 
+tsl::Future<> LoadedExecutable::FetchExecuteResult(
+    uint64_t status_handle, std::optional<uint64_t> device_time_key) {
+  if (rpc_helper_->protocol_version() < protocol_version::kExecuteResult) {
+    return rpc_helper_->CheckFuture(status_handle);
+  }
+  auto req = std::make_unique<LoadedExecutableFetchExecuteResultRequest>();
+  req->set_result_status_handle(status_handle);
+
+  using RespT = std::shared_ptr<LoadedExecutableFetchExecuteResultResponse>;
+
+  tsl::Future<RespT> result =
+      rpc_helper_->LoadedExecutableFetchExecuteResult(std::move(req));
+
+  if (device_time_key.has_value()) {
+    result.OnReady([device_time_key](const absl::StatusOr<RespT>& resp) {
+      if (!resp.ok()) {
+        LOG_EVERY_N_SEC(ERROR, 60)
+            << "Device time measurement was requested but failed to retrieve "
+               "the execution result: "
+            << resp.status();
+        return;
+      }
+
+      for (const auto& [device_type_name, duration] : (*resp)->device_time()) {
+        xla::DeviceTimeMeasurement::DeviceType device_type;
+        if (device_type_name == "tpu") {
+          device_type = xla::DeviceTimeMeasurement::DeviceType::kTpu;
+        } else if (device_type_name == "gpu") {
+          device_type = xla::DeviceTimeMeasurement::DeviceType::kGpu;
+        } else {
+          device_type = xla::DeviceTimeMeasurement::DeviceType::kUnknown;
+        }
+        if (device_type != xla::DeviceTimeMeasurement::DeviceType::kUnknown) {
+          xla::RecordDeviceTimeMeasurement(
+              *device_time_key, absl::Microseconds(duration), device_type);
+        }
+      }
+    });
+  }
+
+  return result.GetReadyFuture();
+}
+
 char LoadedExecutable::ID = 0;  // NOLINT
 
 }  // namespace proxy
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable.h b/third_party/xla/xla/python/ifrt_proxy/client/executable.h
index 749c7af950543f..56c74a83e718bb 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable.h
@@ -143,6 +143,9 @@ class LoadedExecutable final
     int64_t size_of_generated_code_in_bytes;
   };
 
+  tsl::Future<> FetchExecuteResult(uint64_t status_handle,
+                                   std::optional<uint64_t> device_time_key);
+
   xla::ifrt::Client* client_;
   std::shared_ptr<RpcHelper> rpc_helper_;
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc b/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc
index 54d11c3cdf4539..c2c46a215ce144 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable_test.cc
@@ -16,6 +16,7 @@
 
 #include <cstdint>
 #include <memory>
+#include <optional>
 #include <utility>
 #include <vector>
 
@@ -28,6 +29,8 @@
 #include "llvm/Support/Casting.h"
 #include "google/protobuf/text_format.h"
 #include "xla/layout_util.h"
+#include "xla/pjrt/profiling/device_time_measurement.h"
+#include "xla/pjrt/profiling/test_util/mock_device_time_measurement.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/basic_device_list.h"
 #include "xla/python/ifrt/device.h"
@@ -55,6 +58,7 @@
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/platform.h"
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
 
 using ::testing::_;
@@ -232,7 +236,7 @@ TEST_F(LoadedExecutableTest, Execute) {
   exec_options.fill_status = true;
 
   IfrtResponse execute_response;
-  IfrtResponse check_future_response;
+  IfrtResponse fetch_execute_result_response;
 
   ASSERT_TRUE(TextFormat::ParseFromString(R"pb(
                                             loaded_executable_execute_response {
@@ -277,12 +281,13 @@ TEST_F(LoadedExecutableTest, Execute) {
                                               }
                                             }
                                           )pb",
-                                          &check_future_response));
+                                          &fetch_execute_result_response));
   EXPECT_CALL(*session_,
-              Enqueue(Pointee(Partially(EquivToProto(R"pb(check_future_request {
-                                                            future_handle: 2000
-                                                          })pb")))))
-      .WillOnce(MockClientSessionReturnResponse(check_future_response));
+              Enqueue(Pointee(Partially(EquivToProto(
+                  R"pb(loaded_executable_fetch_execute_result_request {
+                         result_status_handle: 2000
+                       })pb")))))
+      .WillOnce(MockClientSessionReturnResponse(fetch_execute_result_response));
 
   DeviceListRef devices = BasicDeviceList::Create({&device});
 
@@ -333,21 +338,23 @@ TEST_F(LoadedExecutableTest, Execute) {
       Enqueue(IfrtRequestOfType(IfrtRequest::kLoadedExecutableExecuteRequest)))
       .WillOnce(MockClientCaptureAndReturn(&requests_queue, execute_response));
   EXPECT_CALL(*session_,
-              Enqueue(IfrtRequestOfType(IfrtRequest::kCheckFutureRequest)))
-      .WillOnce(
-          MockClientCaptureAndReturn(&requests_queue, check_future_response));
+              Enqueue(IfrtRequestOfType(
+                  IfrtRequest::kLoadedExecutableFetchExecuteResultRequest)))
+      .WillOnce(MockClientCaptureAndReturn(&requests_queue,
+                                           fetch_execute_result_response));
 
   TF_ASSERT_OK_AND_ASSIGN(
       result, executable.Execute(absl::MakeSpan(args), exec_options, devices));
 
   auto execute_req = requests_queue.Pop().loaded_executable_execute_request();
-  auto check_future_req = requests_queue.Pop().check_future_request();
+  auto fetch_execute_result_req =
+      requests_queue.Pop().loaded_executable_fetch_execute_result_request();
 
   EXPECT_THAT(
       result.status.Await(),
       absl_testing::StatusIs(absl::StatusCode::kUnknown, "injected error"));
   EXPECT_EQ(execute_req.result_status_handle(),
-            check_future_req.future_handle());
+            fetch_execute_result_req.result_status_handle());
 
   ASSERT_THAT(result.outputs, SizeIs(2));
   ASSERT_THAT(execute_req.result_array_handle(), SizeIs(2));
@@ -361,6 +368,79 @@ TEST_F(LoadedExecutableTest, Execute) {
             execute_req.result_array_handle()[1]);
 }
 
+TEST_F(LoadedExecutableTest, DeviceTime) {
+  if (tsl::kIsOpenSource) {
+    GTEST_SKIP()
+        << "DeviceTimeMeasurement implementation isn't available in OSS.";
+  }
+
+  MockClient client;
+
+  IfrtResponse response;
+  ASSERT_TRUE(TextFormat::ParseFromString(
+      R"pb(
+        loaded_executable_metadata_response {
+          parameter_shardings {}
+          output_shardings {}
+          output_layouts_list {}
+        }
+      )pb",
+      &response));
+  EXPECT_CALL(*session_, Enqueue(Pointee(Partially(EquivToProto(
+                             R"pb(loaded_executable_metadata_request {
+                                    loaded_executable_handle: 1234
+                                  })pb")))))
+      .WillOnce(MockClientSessionReturnResponse(response));
+
+  LoadedExecutable executable(
+      &client, rpc_helper_, /*handle=*/1234, /*name=*/"foo",
+      /*num_devices=*/1, /*devices=*/{}, /*addressable_devices=*/{},
+      /*fingerprint=*/"fingerprint",
+      /*ready_future=*/tsl::Future<>(absl::OkStatus()),
+      /*loaded_host_callbacks=*/{}, /*loaded_host_callback_handles=*/{});
+
+  xla::ifrt::LoadedExecutable::ExecuteOptions exec_options;
+  exec_options.fill_status = true;
+
+  IfrtResponse execute_response;
+  IfrtResponse fetch_execute_result_response;
+
+  ASSERT_TRUE(TextFormat::ParseFromString(
+      R"pb(
+        loaded_executable_execute_response { status_handle: 2000 }
+      )pb",
+      &execute_response));
+  EXPECT_CALL(*session_, Enqueue(Pointee(Partially(EquivToProto(
+                             R"pb(loaded_executable_execute_request {
+                                    loaded_executable_handle: 1234
+                                  })pb")))))
+      .WillOnce(MockClientSessionReturnResponse(execute_response));
+
+  ASSERT_TRUE(TextFormat::ParseFromString(
+      R"pb(
+        loaded_executable_fetch_execute_result_response {
+          device_time { key: "tpu" value: 1234.0 }
+        }
+      )pb",
+      &fetch_execute_result_response));
+  EXPECT_CALL(*session_,
+              Enqueue(Pointee(Partially(EquivToProto(
+                  R"pb(loaded_executable_fetch_execute_result_request {
+                         result_status_handle: 2000
+                       })pb")))))
+      .WillOnce(MockClientSessionReturnResponse(fetch_execute_result_response));
+
+  auto device_time = xla::CreateDeviceTimeMeasurement();
+
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          executable.Execute({}, exec_options, std::nullopt));
+  EXPECT_OK(result.status.Await());
+
+  EXPECT_THAT(device_time->GetTotalDuration(
+                  xla::DeviceTimeMeasurement::DeviceType::kTpu),
+              absl::Microseconds(1234.0));
+}
+
 }  // namespace
 }  // namespace proxy
 }  // namespace ifrt
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.cc b/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.cc
index bc728d0002e249..42ade73d4d3c51 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.cc
@@ -477,6 +477,7 @@ RPC(LoadedExecutableMpmdCostAnalysis, loaded_executable_mpmd_cost_analysis);
 RPC(LoadedExecutableHumanReadableProgramText,
     loaded_executable_human_readable_program_text);
 RPC(LoadedExecutableExecute, loaded_executable_execute);
+RPC(LoadedExecutableFetchExecuteResult, loaded_executable_fetch_execute_result);
 RPC(LoadedExecutableDelete, loaded_executable_delete);
 RPC(LoadedExecutableIsDeleted, loaded_executable_is_deleted);
 RPC(LoadedExecutableDestruct, loaded_executable_destruct);
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.h b/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.h
index 47193c84f9abb5..14e78f496b2147 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/rpc_helper.h
@@ -152,6 +152,9 @@ class RpcHelper {
       std::unique_ptr<LoadedExecutableHumanReadableProgramTextRequest> req);
   ResponseFuture<LoadedExecutableExecuteResponse> LoadedExecutableExecute(
       std::unique_ptr<LoadedExecutableExecuteRequest> req);
+  ResponseFuture<LoadedExecutableFetchExecuteResultResponse>
+  LoadedExecutableFetchExecuteResult(
+      std::unique_ptr<LoadedExecutableFetchExecuteResultRequest> req);
   ResponseFuture<LoadedExecutableDeleteResponse> LoadedExecutableDelete(
       std::unique_ptr<LoadedExecutableDeleteRequest> req);
   ResponseFuture<LoadedExecutableIsDeletedResponse> LoadedExecutableIsDeleted(
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md b/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md
index 23894f6ea2938e..84691a8c5de6bd 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md
+++ b/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md
@@ -124,3 +124,10 @@
     *   Added support for `MpmdLoadedExecutable::GetMpmdAddressableDevices()`.
     *   Added support for `MpmdLoadedExecutable::GetMpmdCompiledMemoryStats()`.
     *   Added support for `MpmdLoadedExecutable::GetMpmdCostAnalysis()`.
+
+## Version kExecutionResult
+
+*   Added date: 2025-12-11
+*   Changes:
+    *   Added a new op `LoadedExecutableFetchExecuteResult` for reading
+        execution results.
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto b/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto
index 347a211608edbe..d8638b40629de4 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto
+++ b/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto
@@ -37,7 +37,7 @@ message IfrtProxyVersion {
   int32 ifrt_serdes_version_number = 2;
 }
 
-// Next ID: 32.
+// Next ID: 33.
 message IfrtRequest {
   RequestMetadata request_metadata = 1;
 
@@ -73,6 +73,8 @@ message IfrtRequest {
     // ===== LoadedExecutable =====
     LoadedExecutableMetadataRequest loaded_executable_metadata_request = 14;
     LoadedExecutableExecuteRequest loaded_executable_execute_request = 15;
+    LoadedExecutableFetchExecuteResultRequest
+        loaded_executable_fetch_execute_result_request = 32;
     LoadedExecutableCostAnalysisRequest
         loaded_executable_cost_analysis_request = 28;
     LoadedExecutableHumanReadableProgramTextRequest
@@ -102,7 +104,7 @@ message IfrtRequest {
   reserved 10;
 }
 
-// Next ID: 32.
+// Next ID: 33.
 message IfrtResponse {
   ResponseMetadata response_metadata = 1;
 
@@ -138,6 +140,8 @@ message IfrtResponse {
     // ===== LoadedExecutable =====
     LoadedExecutableMetadataResponse loaded_executable_metadata_response = 14;
     LoadedExecutableExecuteResponse loaded_executable_execute_response = 15;
+    LoadedExecutableFetchExecuteResultResponse
+        loaded_executable_fetch_execute_result_response = 32;
     LoadedExecutableCostAnalysisResponse
         loaded_executable_cost_analysis_response = 28;
     LoadedExecutableHumanReadableProgramTextResponse
@@ -604,6 +608,14 @@ message LoadedExecutableExecuteResponse {
   repeated Output outputs = 2;
 }
 
+message LoadedExecutableFetchExecuteResultRequest {
+  fixed64 result_status_handle = 1;
+}
+message LoadedExecutableFetchExecuteResultResponse {
+  // Map from device types to device time in microseconds.
+  map<string, double> device_time = 1;
+}
+
 // Mirrors `LoadedExecutable::Delete`. Returns a handle of a future that becomes
 // ready when the deletion completes.
 message LoadedExecutableDeleteRequest {
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/versions.h b/third_party/xla/xla/python/ifrt_proxy/common/versions.h
index ec024d9cb18810..fbbc0675084768 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/versions.h
+++ b/third_party/xla/xla/python/ifrt_proxy/common/versions.h
@@ -66,6 +66,10 @@ enum {
   // GetMpmdCostAnalysis.
   kMpmdLoadedExecutableMethods = 20,
 
+  // kExecuteResult adds a separate request/response type for Execution
+  // results to return extra information such as device time measurement.
+  kExecuteResult = 21,
+
   // kSentiel is used to derive kCurrent below. Keep this as the last value of
   // the enum.
   kSentiel,
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/BUILD b/third_party/xla/xla/python/ifrt_proxy/server/BUILD
index 340fa770032977..8faa6a591e9647 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/BUILD
+++ b/third_party/xla/xla/python/ifrt_proxy/server/BUILD
@@ -134,6 +134,7 @@ cc_library(
         "//xla:status_macros",
         "//xla:xla_data_proto_cc",
         "//xla/pjrt:pjrt_layout",
+        "//xla/pjrt/profiling:device_time_measurement",
         "//xla/python/ifrt",
         "//xla/python/ifrt:attribute_map",
         "//xla/python/ifrt:basic_device_list",
@@ -197,6 +198,7 @@ ifrt_proxy_cc_test(
         "//xla/pjrt:host_callback",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_layout",
+        "//xla/pjrt/profiling:device_time_measurement",
         "//xla/python/ifrt",
         "//xla/python/ifrt:attribute_map",
         "//xla/python/ifrt:basic_device_list",
@@ -234,10 +236,12 @@ ifrt_proxy_cc_test(
         "@com_google_absl//absl/strings:cord",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
         "@com_google_protobuf//:protobuf",
         "@llvm-project//llvm:Support",
+        "@local_tsl//tsl/platform",
         "@local_tsl//tsl/platform:protobuf",
     ],
 )
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
index cfc07e10bfb631..21d8272662662a 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
@@ -46,6 +46,7 @@
 #include "xla/future.h"
 #include "xla/layout.h"
 #include "xla/pjrt/pjrt_layout.h"
+#include "xla/pjrt/profiling/device_time_measurement.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/array_spec.h"
 #include "xla/python/ifrt/attribute_map.h"
@@ -636,19 +637,34 @@ tsl::Future<BackendInterface::Response> IfrtBackend::ProcessInternal(
           HandleLoadedExecutableExecuteRequest(*asr, std::move(request));
       if (client_generated_status_handle != 0) {
         // Populate the handle if not already populated.
-        absl::MutexLock l(futures_mutex_);
-        const bool inserted = futures_
-                                  .insert({client_generated_status_handle,
-                                           tsl::Future<>(result.status())})
-                                  .second;
-        // If `HandleLoadedExecutableExecuteRequest` returned OK, verify that
-        // it already has populated status_handle.
-        if (result.ok()) {
-          CHECK(!inserted);
+        if (protocol_version() >= protocol_version::kExecuteResult) {
+          absl::MutexLock l(execute_results_mutex_);
+          if (result.ok()) {
+            CHECK(execute_results_.contains(client_generated_status_handle));
+          } else {
+            CHECK(execute_results_
+                      .insert({client_generated_status_handle,
+                               tsl::Future<ExecuteResult>(result.status())})
+                      .second);
+          }
+        } else {
+          absl::MutexLock l(futures_mutex_);
+          const bool inserted = futures_
+                                    .insert({client_generated_status_handle,
+                                             tsl::Future<>(result.status())})
+                                    .second;
+          // If `HandleLoadedExecutableExecuteRequest` returned OK, verify that
+          // it already has populated status_handle.
+          if (result.ok()) {
+            CHECK(!inserted);
+          }
         }
       }
       return tsl::Future<Response>(asr->ProcessResponse(std::move(result)));
     }
+    case IfrtRequest::RequestCase::kLoadedExecutableFetchExecuteResultRequest:
+      return HandleLoadedExecutableFetchExecuteResultRequest(
+          std::move(request));
     case IfrtRequest::RequestCase::kLoadedExecutableDeleteRequest:
       return tsl::Future<Response>(
           HandleLoadedExecutableDeleteRequest(std::move(request)));
@@ -1815,6 +1831,11 @@ IfrtBackend::HandleLoadedExecutableExecuteRequest(
     TF_ASSIGN_OR_RETURN(devices, client_->MakeDeviceList(std::move(d)));
   }
 
+  std::unique_ptr<xla::DeviceTimeMeasurement> device_time;
+  if (execute_options.fill_status) {
+    device_time = xla::CreateDeviceTimeMeasurement();
+  }
+
   TF_ASSIGN_OR_RETURN(xla::ifrt::LoadedExecutable::ExecuteResult result,
                       executable_info->executable->Execute(
                           absl::MakeSpan(args), execute_options, devices));
@@ -1894,15 +1915,31 @@ IfrtBackend::HandleLoadedExecutableExecuteRequest(
   // atomically (as in ACID) across all handles.
   [&]() -> void {
     if (execute_options.fill_status) {
-      // Caller is expected to call `CheckFuture` exactly once to check for its
-      // status and erase it.
-      absl::MutexLock lock(futures_mutex_);
       uint64_t status_handle = execute.result_status_handle();
       if (status_handle == 0) {
         status_handle = handle_generator_.GenerateAtServer();
       }
       execute_response->set_status_handle(status_handle);
-      futures_.insert({status_handle, std::move(result.status)});
+
+      if (version_.protocol_version() >= protocol_version::kExecuteResult) {
+        // Caller is expected to call `LoadedExecutableFetchExecuteResult`
+        // exactly once to check for its status and erase it.
+        absl::MutexLock lock(execute_results_mutex_);
+        tsl::Future<ExecuteResult> future = result.status.Map<ExecuteResult>(
+            [device_time = std::move(device_time)]() mutable {
+              ExecuteResult result;
+              if (device_time != nullptr) {
+                result.device_time = device_time->GetTotalDurations();
+              }
+              return result;
+            });
+        execute_results_.insert({status_handle, std::move(future)});
+      } else {
+        // Caller is expected to call `CheckFuture` exactly once to check for
+        // its status and erase it.
+        absl::MutexLock lock(futures_mutex_);
+        futures_.insert({status_handle, std::move(result.status)});
+      }
     }
 
     std::vector<uint64_t> result_handles = asr.Fill(result.outputs);
@@ -1926,6 +1963,50 @@ IfrtBackend::HandleLoadedExecutableExecuteRequest(
   return ifrt_resp;
 }
 
+tsl::Future<BackendInterface::Response>
+IfrtBackend::HandleLoadedExecutableFetchExecuteResultRequest(
+    std::unique_ptr<IfrtRequest> request) {
+  const auto& fetch = request->loaded_executable_fetch_execute_result_request();
+
+  tsl::Future<ExecuteResult> result;
+  {
+    absl::MutexLock lock(execute_results_mutex_);
+    const auto it = execute_results_.find(fetch.result_status_handle());
+    if (it == execute_results_.end()) {
+      return tsl::Future<Response>(absl::NotFoundError(absl::StrCat(
+          "Unknown result status handle: ", fetch.result_status_handle())));
+    }
+    result = std::move(it->second);
+    execute_results_.erase(it);
+  }
+
+  return result.Map<BackendInterface::Response>(
+      [op_id =
+           request->request_metadata().op_id()](const ExecuteResult& result) {
+        auto ifrt_resp = NewIfrtResponse(op_id);
+
+        auto* const fetch_response =
+            ifrt_resp
+                ->mutable_loaded_executable_fetch_execute_result_response();
+        for (const auto& [device_type, duration] : result.device_time) {
+          switch (device_type) {
+            case xla::DeviceTimeMeasurement::DeviceType::kTpu:
+              fetch_response->mutable_device_time()->insert(
+                  {"tpu", absl::ToDoubleMicroseconds(duration)});
+              break;
+            case xla::DeviceTimeMeasurement::DeviceType::kGpu:
+              fetch_response->mutable_device_time()->insert(
+                  {"gpu", absl::ToDoubleMicroseconds(duration)});
+              break;
+            case xla::DeviceTimeMeasurement::DeviceType::kUnknown:
+              break;
+          }
+        }
+
+        return ifrt_resp;
+      });
+}
+
 // This handler will be deleted on 2025-06-06 since the underlying IFRT API is
 // deprecated. An error is returned until then to gracefully handle old clients.
 absl::StatusOr<BackendInterface::Response>
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.h b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.h
index 2ca312d98566fa..ed5945bbc5a8b8 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.h
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.h
@@ -27,7 +27,9 @@
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
 #include "absl/types/span.h"
+#include "xla/pjrt/profiling/device_time_measurement.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/array_spec.h"
 #include "xla/python/ifrt/client.h"
@@ -157,6 +159,11 @@ class IfrtBackend final : public BackendInterface {
         ABSL_GUARDED_BY(mu_);
   };
 
+  struct ExecuteResult {
+    absl::flat_hash_map<xla::DeviceTimeMeasurement::DeviceType, absl::Duration>
+        device_time;
+  };
+
   IfrtBackend(IfrtProxyVersion version, uint64_t session_id,
               std::shared_ptr<xla::ifrt::Client> ifrt_client,
               std::shared_ptr<HostBufferStore> host_buffer_store);
@@ -223,6 +230,8 @@ class IfrtBackend final : public BackendInterface {
       std::unique_ptr<IfrtRequest> request);
   absl::StatusOr<Response> HandleLoadedExecutableExecuteRequest(
       ArrayStore::Reservation& asr, std::unique_ptr<IfrtRequest> request);
+  tsl::Future<Response> HandleLoadedExecutableFetchExecuteResultRequest(
+      std::unique_ptr<IfrtRequest> request);
   absl::StatusOr<Response> HandleLoadedExecutableDeleteRequest(
       std::unique_ptr<IfrtRequest> request);
   absl::StatusOr<Response> HandleLoadedExecutableIsDeletedRequest(
@@ -292,6 +301,10 @@ class IfrtBackend final : public BackendInterface {
   absl::flat_hash_map<uint64_t, std::shared_ptr<LoadedExecutableWithInfo>>
       executables_ ABSL_GUARDED_BY(executables_mutex_);
 
+  absl::Mutex execute_results_mutex_;
+  absl::flat_hash_map<uint64_t, tsl::Future<ExecuteResult>> execute_results_
+      ABSL_GUARDED_BY(execute_results_mutex_);
+
   absl::Mutex host_callback_queues_mutex_;
   absl::flat_hash_map<uint64_t, std::shared_ptr<RemoteLoadedHostCallbackQueue>>
       host_callback_queues_ ABSL_GUARDED_BY(host_callback_queues_mutex_);
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
index b6100518833f7b..332c4c7e3b201c 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
@@ -37,6 +37,7 @@
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ExtensibleRTTI.h"
@@ -48,6 +49,7 @@
 #include "xla/pjrt/host_callback.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
+#include "xla/pjrt/profiling/device_time_measurement.h"
 #include "xla/python/ifrt/array.h"
 #include "xla/python/ifrt/attribute_map.h"
 #include "xla/python/ifrt/basic_device_list.h"
@@ -88,6 +90,7 @@
 #include "xla/tsl/protobuf/status.pb.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
 #include "xla/xla_data.pb.h"
+#include "tsl/platform/platform.h"
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
 
 namespace xla {
@@ -102,6 +105,7 @@ using ::testing::ElementsAre;
 using ::testing::ElementsAreArray;
 using ::testing::HasSubstr;
 using ::testing::Invoke;
+using ::testing::MatchesRegex;
 using ::testing::Not;
 using ::testing::NotNull;
 using ::testing::Optional;
@@ -1375,19 +1379,36 @@ TEST_P(IfrtBackendHandlerTest, LoadedExecutableExecute) {
     EXPECT_NE(output.array_handle(), 0);
   }
 
+  auto check_execution_result = [&](uint64_t handle) -> absl::Status {
+    if (handle == 0) {
+      return absl::InternalError("Test error, future handle is 0");
+    }
+    if (Version().protocol_version() >= protocol_version::kExecuteResult) {
+      auto request = NewIfrtRequest(NewOpId());
+      request->mutable_loaded_executable_fetch_execute_result_request()
+          ->set_result_status_handle(handle);
+      TF_ASSIGN_OR_RETURN(std::shared_ptr<IfrtResponse> response,
+                          CallBackend(std::move(request)));
+      return tsl::StatusFromProto(response->response_metadata().status());
+    } else {
+      return CheckFuture(handle);
+    }
+  };
+
   EXPECT_THAT(
-      CheckFuture(
+      check_execution_result(
           response->loaded_executable_execute_response().status_handle()),
       absl_testing::StatusIs(absl::StatusCode::kInternal,
                              StrEq("injected error")));
 
-  // The second call to `CheckFuture` fails since `CheckFuture` above performs a
-  // destructive read.
+  // The second call to `check_execution_result` fails since
+  // `check_execution_result` above performs a destructive read.
   EXPECT_THAT(
-      CheckFuture(
+      check_execution_result(
           response->loaded_executable_execute_response().status_handle()),
-      absl_testing::StatusIs(absl::StatusCode::kNotFound,
-                             HasSubstr("Unknown future handle")));
+      absl_testing::StatusIs(
+          absl::StatusCode::kNotFound,
+          MatchesRegex("Unknown (future|result status) handle.*")));
 }
 
 TEST_P(IfrtBackendHandlerTest, LoadedExecutableExecuteErrorWithClientHandles) {
@@ -1450,13 +1471,91 @@ TEST_P(IfrtBackendHandlerTest, LoadedExecutableExecuteErrorWithClientHandles) {
 
   EXPECT_THAT(CallBackend(std::move(request)), status_is_err);
 
-  EXPECT_THAT(CheckFuture(kFirstResultHandle + kNumOutputs), status_is_err);
+  {
+    const uint64_t handle = kFirstResultHandle + kNumOutputs;
+    if (Version().protocol_version() >= protocol_version::kExecuteResult) {
+      auto request = NewIfrtRequest(NewOpId());
+      request->mutable_loaded_executable_fetch_execute_result_request()
+          ->set_result_status_handle(handle);
+      EXPECT_THAT(CallBackend(std::move(request)), status_is_err);
+    } else {
+      EXPECT_THAT(CheckFuture(handle), status_is_err);
+    }
+  }
 
   for (int i = 0; i < kNumOutputs; ++i) {
     EXPECT_THAT(CheckValueReady(kFirstResultHandle + i), status_is_err);
   }
 }
 
+TEST_P(IfrtBackendHandlerTest, LoadedExecutableDeviceTime) {
+  if (tsl::kIsOpenSource) {
+    GTEST_SKIP()
+        << "DeviceTimeMeasurement implementation isn't available in OSS.";
+  }
+  if (Version().protocol_version() < protocol_version::kExecuteResult) {
+    GTEST_SKIP()
+        << "Device time measurement is not supported in this protocol version";
+  }
+
+  MockLoadedExecutable* executable;
+  uint64_t handle;
+  {
+    auto e = std::make_unique<MockLoadedExecutable>();
+    executable = e.get();
+    TF_ASSERT_OK_AND_ASSIGN(CompileResponse response,
+                            CompileTestLoadedExecutable(std::move(e)));
+    handle = response.loaded_executable_handle();
+  }
+
+  EXPECT_CALL(*executable, Execute(_, _, _))
+      .WillOnce([&](absl::Span<ArrayRef> args,
+                    const xla::ifrt::LoadedExecutable::ExecuteOptions& options,
+                    std::optional<DeviceListRef> devices)
+                    -> absl::StatusOr<LoadedExecutable::ExecuteResult> {
+        std::optional<uint64_t> device_time_key =
+            xla::GetDeviceTimeMeasurementKey();
+        if (device_time_key.has_value()) {
+          xla::RecordDeviceTimeMeasurement(
+              *device_time_key, absl::Microseconds(1234),
+              xla::DeviceTimeMeasurement::DeviceType::kTpu);
+        }
+        LoadedExecutable::ExecuteResult result;
+        result.status = tsl::Future<>(absl::OkStatus());
+        return result;
+      });
+
+  constexpr uint64_t kResultStatusHandle = 1000;
+  {
+    auto request = NewIfrtRequest(NewOpId());
+    LoadedExecutableExecuteRequest* execute_request =
+        request->mutable_loaded_executable_execute_request();
+    execute_request->set_loaded_executable_handle(handle);
+    execute_request->set_result_status_handle(kResultStatusHandle);
+
+    xla::ifrt::LoadedExecutable::ExecuteOptions execute_options;
+    execute_options.fill_status = true;
+    TF_ASSERT_OK(execute_options.ToProto(
+        *execute_request->mutable_execute_options(), ifrt_serdes_version()));
+
+    EXPECT_OK(CallBackend(std::move(request)));
+  }
+
+  {
+    auto request = NewIfrtRequest(NewOpId());
+    request->mutable_loaded_executable_fetch_execute_result_request()
+        ->set_result_status_handle(kResultStatusHandle);
+    TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<IfrtResponse> response,
+                            CallBackend(std::move(request)));
+    EXPECT_THAT(response, Pointee(Partially(EquivToProto(R"pb(
+                  loaded_executable_fetch_execute_result_response {
+                    device_time { key: "tpu" value: 1234.0 }
+                    device_time { key: "gpu" value: 0 }
+                  }
+                )pb"))));
+  }
+}
+
 TEST_P(IfrtBackendHandlerTest, LoadedExecutableDestruct) {
   MockLoadedExecutable* executable;
   uint64_t handle;

From 5d6c0bf62b655c3c4fed0c56d5a2004e13093e2d Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Fri, 12 Dec 2025 16:17:05 -0800
Subject: [PATCH 248/753] Unconditionally include YNNPACK

This was disabled on windows due to lack of compatibility, it is hopefully now compatible.

PiperOrigin-RevId: 843870100
---
 .../xla/xla/backends/cpu/runtime/BUILD        | 24 +++++-------
 .../xla/xla/backends/cpu/runtime/thunk.cc     |  9 +----
 .../xla/xla/backends/cpu/runtime/thunk.h      | 12 +-----
 .../cpu/runtime/thunk_proto_serdes.cc         | 17 ++-------
 .../cpu/runtime/thunk_sequence_serdes_test.cc | 12 +-----
 .../xla/backends/cpu/runtime/ynnpack/BUILD    |  6 +--
 .../xla/xla/backends/cpu/transforms/BUILD     | 11 +++---
 .../cpu/transforms/library_rewriter.h         |  6 +--
 .../cpu/transforms/library_rewriter_test.cc   |  2 -
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    |  4 --
 third_party/xla/xla/service/cpu/BUILD         | 15 +++-----
 .../xla/xla/service/cpu/cpu_compiler.cc       | 15 +-------
 .../xla/xla/service/cpu/cpu_executable.cc     |  2 -
 .../xla/xla/service/cpu/thunk_emitter.cc      | 21 ++--------
 .../xla/xla/tsl/xnnpack/build_defs.bzl        | 38 -------------------
 15 files changed, 38 insertions(+), 156 deletions(-)
 delete mode 100644 third_party/xla/xla/tsl/xnnpack/build_defs.bzl

diff --git a/third_party/xla/xla/backends/cpu/runtime/BUILD b/third_party/xla/xla/backends/cpu/runtime/BUILD
index b567d7f7aefe9e..6fd59cdf1d6bd9 100644
--- a/third_party/xla/xla/backends/cpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/BUILD
@@ -4,7 +4,6 @@ load("//xla/tsl:tsl.bzl", "if_google", "if_windows", "internal_visibility")
 load("//xla/tsl:tsl.default.bzl", "filegroup")
 load("//xla/tsl/platform:build_config.bzl", "tf_proto_library")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
-load("//xla/tsl/xnnpack:build_defs.bzl", "if_ynnpack")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -169,7 +168,6 @@ cc_library(
     name = "thunk",
     srcs = ["thunk.cc"],
     hdrs = ["thunk.h"],
-    defines = if_ynnpack(["XLA_YNNPACK"]),
     deps = [
         ":buffer_allocations",
         ":function_library",
@@ -177,6 +175,8 @@ cc_library(
         "//xla:executable_run_options",
         "//xla/backends/cpu/collectives:cpu_collectives",
         "//xla/backends/cpu/collectives:in_process_collectives",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_interop",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_threadpool",
         "//xla/ffi:execution_context",
         "//xla/runtime:buffer_use",
         "//xla/runtime:device_id",
@@ -196,10 +196,7 @@ cc_library(
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/profiler/lib:traceme",
         "@local_tsl//tsl/profiler/lib:traceme_encode",
-    ] + if_ynnpack([
-        "//xla/backends/cpu/runtime/ynnpack:ynn_interop",
-        "//xla/backends/cpu/runtime/ynnpack:ynn_threadpool",
-    ]),
+    ],
 )
 
 cc_library(
@@ -1250,7 +1247,10 @@ cc_library(
         ":while_thunk",
         "//xla:shape_util",
         "//xla:util",
+        "//xla/backends/cpu:ynn_emitter",
         "//xla/backends/cpu:ynn_fusion_options_proto_cc",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_fusion_thunk",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_interop",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:resource_use",
         "//xla/runtime:work_group",
@@ -1271,11 +1271,7 @@ cc_library(
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
-    ] + if_ynnpack([
-        "//xla/backends/cpu/runtime/ynnpack:ynn_fusion_thunk",
-        "//xla/backends/cpu:ynn_emitter",
-        "//xla/backends/cpu/runtime/ynnpack:ynn_interop",
-    ]),
+    ],
 )
 
 cc_library(
@@ -1310,7 +1306,6 @@ xla_cc_test(
 xla_cc_test(
     name = "thunk_sequence_serdes_test",
     srcs = ["thunk_sequence_serdes_test.cc"],
-    local_defines = if_ynnpack(["XLA_YNNPACK"]),
     deps = [
         ":all_gather_thunk",
         ":all_reduce_thunk",
@@ -1345,6 +1340,7 @@ xla_cc_test(
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_fusion_thunk",
         "//xla/ffi",
         "//xla/ffi:ffi_api",
         "//xla/runtime:resource_use",
@@ -1365,9 +1361,7 @@ xla_cc_test(
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:casts",
-    ] + if_ynnpack([
-        "//xla/backends/cpu/runtime/ynnpack:ynn_fusion_thunk",
-    ]),
+    ],
 )
 
 cc_library(
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk.cc b/third_party/xla/xla/backends/cpu/runtime/thunk.cc
index d675604b4ae202..0468b0c3e84b11 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk.cc
@@ -29,6 +29,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/backends/cpu/collectives/cpu_collectives.h"
 #include "xla/backends/cpu/collectives/in_process_collectives.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_threadpool.h"
 #include "xla/executable_run_options.h"
 #include "xla/runtime/device_id.h"
 #include "xla/service/cpu/cpu_executable_run_options.h"
@@ -41,11 +43,6 @@ limitations under the License.
 #include "tsl/profiler/lib/traceme.h"
 #include "tsl/profiler/lib/traceme_encode.h"
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
-#include "xla/backends/cpu/runtime/ynnpack/ynn_threadpool.h"
-#endif  // XLA_YNNPACK
-
 namespace xla::cpu {
 
 // Ok execute event allocated with the static storage duration.
@@ -161,7 +158,6 @@ Thunk::CustomCallExecuteParams::CustomCallExecuteParams(
       intra_op_thread_pool(intra_op_thread_pool),
       ffi_execution_context(ffi_execution_context) {}
 
-#ifdef XLA_YNNPACK
 absl::StatusOr<Thunk::YnnParams> Thunk::YnnParams::Create(
     const ExecutableRunOptions* run_options) {
   TF_ASSIGN_OR_RETURN(YnnThreadpool threadpool,
@@ -171,7 +167,6 @@ absl::StatusOr<Thunk::YnnParams> Thunk::YnnParams::Create(
 
 Thunk::YnnParams::YnnParams(YnnThreadpool threadpool)
     : threadpool(std::move(threadpool)) {}
-#endif  // XLA_YNNPACK
 
 Thunk::ExecuteSession::ExecuteSession(int64_t max_workers,
                                       int64_t split_threshold)
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk.h b/third_party/xla/xla/backends/cpu/runtime/thunk.h
index 0c48855c06622b..a30eb2b27f76dd 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk.h
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk.h
@@ -35,6 +35,8 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/buffer_allocations.h"
 #include "xla/backends/cpu/runtime/function_library.h"
 #include "xla/backends/cpu/runtime/xfeed_manager.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_threadpool.h"
 #include "xla/executable_run_options.h"
 #include "xla/ffi/execution_context.h"
 #include "xla/runtime/buffer_use.h"
@@ -45,11 +47,6 @@ limitations under the License.
 #include "xla/tsl/platform/logging.h"
 #include "xla/tsl/platform/statusor.h"
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
-#include "xla/backends/cpu/runtime/ynnpack/ynn_threadpool.h"
-#endif  // XLA_YNNPACK
-
 namespace Eigen {
 struct ThreadPoolDevice;
 }  // namespace Eigen
@@ -255,7 +252,6 @@ class Thunk {
   // YnnParams
   //===--------------------------------------------------------------------===//
 
-#ifdef XLA_YNNPACK
   // Parameters capturing all the details required for running XNNPACK fusions.
   struct YnnParams {
     static absl::StatusOr<YnnParams> Create(
@@ -265,10 +261,6 @@ class Thunk {
 
     explicit YnnParams(YnnThreadpool threadpool);
   };
-#else
-  // Use XnnParams for placeholder. The parameter won't be used anyway.
-  struct YnnParams {};
-#endif  // XLA_YNNPACK
 
   //===--------------------------------------------------------------------===//
   // ExecuteParams
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc b/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc
index 4f5255b2681b2a..d8381b601813d1 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk_proto_serdes.cc
@@ -61,6 +61,9 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/thunk.pb.h"
 #include "xla/backends/cpu/runtime/topk_thunk.h"
 #include "xla/backends/cpu/runtime/while_thunk.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_fusion_thunk.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
+#include "xla/backends/cpu/ynn_emitter.h"
 #include "xla/backends/cpu/ynn_fusion_options.pb.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -77,12 +80,6 @@ limitations under the License.
 #include "xla/util.h"
 #include "tsl/platform/casts.h"
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/runtime/ynnpack/ynn_fusion_thunk.h"
-#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
-#include "xla/backends/cpu/ynn_emitter.h"
-#endif  // XLA_YNNPACK
-
 namespace xla::cpu {
 
 void ForEachThunkProto(const ThunkSequenceProto& proto,
@@ -716,7 +713,6 @@ static absl::Status ToProto(const WhileThunk& thunk, ThunkProto& proto) {
   return absl::OkStatus();
 }
 
-#ifdef XLA_YNNPACK
 static absl::Status ToProto(const YnnFusionThunk& thunk, ThunkProto& proto) {
   YnnFusionThunkProto* ynn_fusion_proto = proto.mutable_ynn_fusion_thunk();
   ynn_fusion_proto->mutable_options()->set_use_threadpool(
@@ -736,7 +732,6 @@ static absl::Status ToProto(const YnnFusionThunk& thunk, ThunkProto& proto) {
 
   return absl::OkStatus();
 }
-#endif  // XLA_YNNPACK
 
 static absl::Status ToProto(const FftThunk& thunk, ThunkProto& proto) {
   FftThunkProto* fft_thunk_proto = proto.mutable_fft_thunk();
@@ -919,12 +914,10 @@ absl::StatusOr<ThunkProto> ThunkSerDesProtobuf::ToProto(
                   internal::LogicalIdKind::kReplicaId>&>(thunk)),
           proto));
       break;
-#ifdef XLA_YNNPACK
     case Thunk::Kind::kYnnFusion:
       TF_RETURN_IF_ERROR(::xla::cpu::ToProto(
           tsl::down_cast<const YnnFusionThunk&>(thunk), proto));
       break;
-#endif  // XLA_YNNPACK
     default:
       return absl::UnimplementedError(
           absl::StrFormat("ToProto is not implemented for thunk kind: %s",
@@ -1455,7 +1448,6 @@ static absl::StatusOr<std::unique_ptr<WhileThunk>> WhileThunkFromProto(
                             std::move(*body_sequence), trip_count);
 }
 
-#ifdef XLA_YNNPACK
 static absl::StatusOr<std::unique_ptr<YnnFusionThunk>> YnnFusionThunkFromProto(
     const ThunkProto& proto, const HloModule* hlo_module,
     const std::vector<BufferAllocation>& buffer_allocations) {
@@ -1533,7 +1525,6 @@ static absl::StatusOr<std::unique_ptr<YnnFusionThunk>> YnnFusionThunkFromProto(
       },
       captured_arguments_ids);
 }
-#endif  // XLA_YNNPACK
 
 static absl::StatusOr<std::unique_ptr<Thunk>> PartitionIdThunkFromProto(
     const ThunkProto& proto,
@@ -1633,10 +1624,8 @@ absl::StatusOr<std::unique_ptr<Thunk>> ThunkSerDesProtobuf::FromProto(
       return PartitionIdThunkFromProto(proto, *buffer_allocations_);
     case Thunk::Kind::kReplicaId:
       return ReplicaIdThunkFromProto(proto, *buffer_allocations_);
-#ifdef XLA_YNNPACK
     case Thunk::Kind::kYnnFusion:
       return YnnFusionThunkFromProto(proto, hlo_module_, *buffer_allocations_);
-#endif  // XLA_YNNPACK
     default:
       return absl::Status(absl::StatusCode::kInvalidArgument,
                           absl::StrFormat("Unsupported thunk kind: %s",
diff --git a/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc b/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc
index 9fd66395fa1a28..f4720cd7756d99 100644
--- a/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/thunk_sequence_serdes_test.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/thunk_testlib.h"
 #include "xla/backends/cpu/runtime/topk_thunk.h"
 #include "xla/backends/cpu/runtime/while_thunk.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_fusion_thunk.h"
 #include "xla/ffi/ffi.h"
 #include "xla/ffi/ffi_api.h"
 #include "xla/literal.h"
@@ -77,10 +78,6 @@ limitations under the License.
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/runtime/ynnpack/ynn_fusion_thunk.h"
-#endif  // XLA_YNNPACK
-
 namespace xla::cpu {
 namespace {
 
@@ -1028,14 +1025,12 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
            thunk_1.trip_count() == thunk_2.trip_count();
   }
 
-#ifdef XLA_YNNPACK
   bool VerifyYnnFusionThunkEquality(const YnnFusionThunk& thunk_1,
                                     const YnnFusionThunk& thunk_2) {
     // TODO(ashaposhnikov) assume this is always false until we implement
     // serialization of YnnFusionThunk.
     return false;
   }
-#endif  // XLA_YNNPACK
 
   bool VerifyKernelThunkEquality(const KernelThunkBase& thunk_1,
                                  const KernelThunkBase& thunk_2) {
@@ -1231,7 +1226,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
             tsl::down_cast<const WhileThunk&>(thunk_1),
             tsl::down_cast<const WhileThunk&>(thunk_2));
       case Thunk::Kind::kYnnFusion: {
-#ifdef XLA_YNNPACK
         const YnnFusionThunk& ynn_fusion_thunk_1 =
             tsl::down_cast<const YnnFusionThunk&>(thunk_1);
         const YnnFusionThunk& ynn_fusion_thunk_2 =
@@ -1243,10 +1237,6 @@ class ThunkSequenceSerdesTest : public ::testing::Test {
         return VerifyYnnFusionThunkEquality(
             tsl::down_cast<const YnnFusionThunk&>(thunk_1),
             tsl::down_cast<const YnnFusionThunk&>(thunk_2));
-#else
-        CHECK(false) << "Unsupported YNN fusion thunk type";
-        return false;
-#endif  // XLA_YNNPACK
       }
       case Thunk::Kind::kKernel:
         return VerifyKernelThunkEquality(
diff --git a/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD b/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD
index f5b22b22c19d8c..2c07477fc16b25 100644
--- a/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD
+++ b/third_party/xla/xla/backends/cpu/runtime/ynnpack/BUILD
@@ -1,5 +1,5 @@
+load("//xla:xla.default.bzl", "xla_cc_test")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
-load("//xla/tsl/xnnpack:build_defs.bzl", "ynn_cc_test")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -30,7 +30,7 @@ cc_library(
     ],
 )
 
-ynn_cc_test(
+xla_cc_test(
     name = "slinky_threadpool_test",
     srcs = ["slinky_threadpool_test.cc"],
     deps = [
@@ -110,7 +110,7 @@ cc_library(
     ],
 )
 
-ynn_cc_test(
+xla_cc_test(
     name = "ynn_fusion_thunk_test",
     srcs = ["ynn_fusion_thunk_test.cc"],
     deps = [
diff --git a/third_party/xla/xla/backends/cpu/transforms/BUILD b/third_party/xla/xla/backends/cpu/transforms/BUILD
index fde7907429e1eb..67603bc203cdac 100644
--- a/third_party/xla/xla/backends/cpu/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/transforms/BUILD
@@ -3,7 +3,6 @@ load("//xla/tsl:tsl.bzl", "internal_visibility")
 load("//xla/tsl/mkl:build_defs.bzl", "if_graph_api")
 load("//xla/tsl/mkl:graph.bzl", "onednn_graph_cc_library")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
-load("//xla/tsl/xnnpack:build_defs.bzl", "if_ynnpack")
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -22,10 +21,11 @@ cc_library(
     name = "library_rewriter",
     srcs = ["library_rewriter.cc"],
     hdrs = ["library_rewriter.h"],
-    defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]) + if_ynnpack(["XLA_YNNPACK"]),
+    defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]),
     deps = [
         ":library_matcher",
         ":onednn_matcher",
+        ":ynn_matcher",
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
@@ -42,13 +42,13 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:protobuf",
-    ] + if_ynnpack([":ynn_matcher"]),
+    ],
 )
 
 xla_cc_test(
     name = "library_rewriter_test",
     srcs = ["library_rewriter_test.cc"],
-    local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]) + if_ynnpack(["XLA_YNNPACK"]),
+    local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]),
     deps = [
         ":library_rewriter",
         "//xla:xla_data_proto_cc",
@@ -103,6 +103,7 @@ cc_library(
     hdrs = ["ynn_matcher.h"],
     deps = [
         ":library_matcher",
+        "//xla/backends/cpu:ynn_support",
         "//xla/backends/cpu/codegen:target_machine_features",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/base:no_destructor",
@@ -110,5 +111,5 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:protobuf",
-    ] + if_ynnpack(["//xla/backends/cpu:ynn_support"]),
+    ],
 )
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
index 38dba96cac1c67..5dfae6ee54a641 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/backends/cpu/codegen/target_machine_features.h"
 #include "xla/backends/cpu/transforms/library_matcher.h"
+#include "xla/backends/cpu/transforms/ynn_matcher.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
@@ -38,9 +39,6 @@ limitations under the License.
 #include "xla/backends/cpu/transforms/onednn_matcher.h"
 #endif  // XLA_ONEDNN_USE_GRAPH_API
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/transforms/ynn_matcher.h"
-#endif
 
 namespace xla::cpu {
 
@@ -72,13 +70,11 @@ class LibraryRewriter : public HloModulePass {
           target_machine_features_, options_.onednn_fusion_types));
     }
 #endif  // XLA_ONEDNN_USE_GRAPH_API
-#ifdef XLA_YNNPACK
     if (options_.use_ynnpack && options_.ynn_fusion_types != nullptr &&
         !options_.ynn_fusion_types->empty()) {
       libs_.push_back(std::make_unique<YnnMatcher>(target_machine_features_,
                                                    options_.ynn_fusion_types));
     }
-#endif  // XLA_YNNPACK
 
     for (std::unique_ptr<LibraryMatcher>& lib : libs_) {
       supported_ops_.merge(lib->SupportedOps());
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc b/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
index 430986457d215f..dc02e86a829da6 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter_test.cc
@@ -443,10 +443,8 @@ std::vector<DotRewriteTestSpec> GetDotRewriteTestSpecs() {
   // Fusion modes to test for each library.
   absl::flat_hash_map<std::string, std::vector<std::string>> fusion_modes;
 
-#if XLA_YNNPACK
   // Don't test YNNPACK if we don't build with it.
   fusion_modes["ynn"] = {"dot", "greedy"};
-#endif
 
 #if XLA_ONEDNN_USE_GRAPH_API
   // Don't test oneDNN if we don't build with it.
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 06f551239b6163..125ef05f15c00a 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -1648,12 +1648,10 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
           cpu::Thunk::CustomCallExecuteParams::Create(&run_options));
 
       std::optional<cpu::Thunk::YnnParams> ynn_params;
-#ifdef XLA_YNNPACK
       if (cpu_executable->has_ynn_fusions()) {
         TF_ASSIGN_OR_RETURN(ynn_params,
                             cpu::Thunk::YnnParams::Create(&run_options));
       }
-#endif  // XLA_YNNPACK
 
       cpu::ThreadPoolTaskRunner task_runner(
           run_options.intra_op_thread_pool()->getPool());
@@ -1793,11 +1791,9 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
 
             absl::StatusOr<std::optional<cpu::Thunk::YnnParams>> ynn_params(
                 std::nullopt);
-#ifdef XLA_YNNPACK
             if (cpu_executable->has_ynn_fusions()) {
               ynn_params = cpu::Thunk::YnnParams::Create(&run_options);
             }
-#endif  // XLA_YNNPACK
 
             cpu::ThreadPoolTaskRunner task_runner(
                 run_options.intra_op_thread_pool()->getPool());
diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index 3b5fcce8539a67..af4a362ded3a28 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -17,7 +17,6 @@ load(
     "if_llvm_x86_available",
 )
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
-load("//xla/tsl/xnnpack:build_defs.bzl", "if_ynnpack")
 load(":build_defs.bzl", "runtime_copts")
 
 package(
@@ -158,6 +157,7 @@ cc_library(
         "//xla/backends/cpu:alignment",
         "//xla/backends/cpu:constant_allocation",
         "//xla/backends/cpu:target_machine_options",
+        "//xla/backends/cpu:ynn_support",
         "//xla/backends/cpu/codegen:builtin_definition_generator",
         "//xla/backends/cpu/codegen:compiled_function_library",
         "//xla/backends/cpu/codegen:cpu_features",
@@ -354,8 +354,6 @@ cc_library(
         ":onednn_contraction_rewriter",
         ":onednn_float_support",
         ":onednn_ops_rewriter",
-    ]) + if_ynnpack([
-        "//xla/backends/cpu:ynn_support",
     ]),
 )
 
@@ -803,7 +801,7 @@ cc_library(
     srcs = ["thunk_emitter.cc"],
     hdrs = ["thunk_emitter.h"],
     copts = tsl_copts(),
-    local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]) + if_ynnpack(["XLA_YNNPACK"]),
+    local_defines = if_graph_api(["XLA_ONEDNN_USE_GRAPH_API=1"]),
     deps = [
         ":backend_config_proto_cc",
         ":cpu_options",
@@ -819,6 +817,8 @@ cc_library(
         "//xla/backends/cpu:alignment",
         "//xla/backends/cpu:onednn_emitter",
         "//xla/backends/cpu:onednn_support",
+        "//xla/backends/cpu:ynn_emitter",
+        "//xla/backends/cpu:ynn_support",
         "//xla/backends/cpu/codegen:computation_kernel_emitter",
         "//xla/backends/cpu/codegen:fusion_compiler",
         "//xla/backends/cpu/codegen:fusion_emitter",
@@ -851,6 +851,8 @@ cc_library(
         "//xla/backends/cpu/runtime:topk_thunk",
         "//xla/backends/cpu/runtime:while_thunk",
         "//xla/backends/cpu/runtime/onednn:onednn_fusion_thunk",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_fusion_thunk",
+        "//xla/backends/cpu/runtime/ynnpack:ynn_interop",
         "//xla/codegen:kernel_definition",
         "//xla/codegen:kernel_spec",
         "//xla/codegen:llvm_kernel_source",
@@ -890,11 +892,6 @@ cc_library(
         "@local_tsl//tsl/profiler/lib:traceme",
     ] + if_onednn([
         "//xla/backends/cpu/runtime/onednn:onednn_op_thunk",
-    ]) + if_ynnpack([
-        "//xla/backends/cpu:ynn_emitter",
-        "//xla/backends/cpu:ynn_support",
-        "//xla/backends/cpu/runtime/ynnpack:ynn_interop",
-        "//xla/backends/cpu/runtime/ynnpack:ynn_fusion_thunk",
     ]),
 )
 
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 2edab7cd05064f..19538cd5831074 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -98,6 +98,7 @@ limitations under the License.
 #include "xla/backends/cpu/target_machine_options.h"
 #include "xla/backends/cpu/transforms/collectives/all_reduce_combiner.h"
 #include "xla/backends/cpu/transforms/library_rewriter.h"
+#include "xla/backends/cpu/ynn_support.h"
 #include "xla/hlo/analysis/alias_info.h"
 #include "xla/hlo/analysis/hlo_ordering.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
@@ -249,10 +250,6 @@ limitations under the License.
 #include "xla/service/cpu/onednn_ops_rewriter.h"
 #endif  // XLA_ONEDNN
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/ynn_support.h"
-#endif  // XLA_YNNPACK
-
 namespace xla {
 namespace {
 
@@ -497,7 +494,6 @@ std::unique_ptr<HloPassFix<HloPassPipeline>> CreateSimplificationPipeline(
     pipeline->AddPass<TreeReductionRewriter>();
   }
 
-#ifdef XLA_YNNPACK
   if (absl::c_contains(module->config()
                            .debug_options()
                            .xla_cpu_experimental_ynn_fusion_type(),
@@ -507,7 +503,6 @@ std::unique_ptr<HloPassFix<HloPassPipeline>> CreateSimplificationPipeline(
           return !IsReduceOpOffloadedToYnn(hlo);
         });
   }
-#endif
 
   // BatchNormExpander can create zero-sized ops, so zero-sized HLO
   // elimination has to come after that pass.
@@ -534,16 +529,12 @@ std::unique_ptr<HloPassFix<HloPassPipeline>> CreateSimplificationPipeline(
 
 auto LibrarySupportsConvolution(
     HloModule* module, TargetMachineFeatures* target_machine_features) {
-#ifdef XLA_YNNPACK
   const bool ynnpack_convolution_enabled = absl::c_linear_search(
       module->config().debug_options().xla_cpu_experimental_ynn_fusion_type(),
       DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION);
   return [=](const HloInstruction& instr) {
     return ynnpack_convolution_enabled && IsConvolutionOpSupportedByYnn(&instr);
   };
-#else
-  return [](const HloInstruction&) { return false; };
-#endif  // XLA_YNNPACK
 }
 
 auto LibrarySupportsDot(HloModule* module,
@@ -551,7 +542,6 @@ auto LibrarySupportsDot(HloModule* module,
   // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN
   // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in
   // `XnnFusionThunk`.
-#ifdef XLA_YNNPACK
   const bool ynnpack_dot_enabled = absl::c_linear_search(
       module->config().debug_options().xla_cpu_experimental_ynn_fusion_type(),
       DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_DOT);
@@ -566,9 +556,6 @@ auto LibrarySupportsDot(HloModule* module,
 
     return false;
   };
-#else
-  return [](const HloInstruction&) { return false; };
-#endif  // XLA_YNNPACK
 }
 
 }  // namespace
diff --git a/third_party/xla/xla/service/cpu/cpu_executable.cc b/third_party/xla/xla/service/cpu/cpu_executable.cc
index 97bcff96d927ec..e04c3205a92c0e 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.cc
+++ b/third_party/xla/xla/service/cpu/cpu_executable.cc
@@ -258,11 +258,9 @@ absl::Status CpuExecutable::ExecuteThunks(
 
   // Prepare for executing YNNPACK fusions.
   std::optional<Thunk::YnnParams> ynn_params;
-#ifdef XLA_YNNPACK
   if (has_ynn_fusions()) {
     TF_ASSIGN_OR_RETURN(ynn_params, Thunk::YnnParams::Create(run_options));
   }
-#endif  // XLA_YNNPACK
 
   // Use the intra-op thread pool to offload thunk executor tasks.
   auto* intra_op_thread_pool = run_options->intra_op_thread_pool();
diff --git a/third_party/xla/xla/service/cpu/thunk_emitter.cc b/third_party/xla/xla/service/cpu/thunk_emitter.cc
index 80a9fe9245982f..5a77a764108d54 100644
--- a/third_party/xla/xla/service/cpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/cpu/thunk_emitter.cc
@@ -69,6 +69,10 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/thunk.h"
 #include "xla/backends/cpu/runtime/topk_thunk.h"
 #include "xla/backends/cpu/runtime/while_thunk.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_fusion_thunk.h"
+#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
+#include "xla/backends/cpu/ynn_emitter.h"
+#include "xla/backends/cpu/ynn_support.h"
 #include "xla/codegen/emitters/computation_fingerprint.h"
 #include "xla/codegen/emitters/kernel_api_builder.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -121,13 +125,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/onednn/onednn_fusion_thunk.h"
 #endif  // XLA_ONEDNN_USE_GRAPH_API
 
-#ifdef XLA_YNNPACK
-#include "xla/backends/cpu/runtime/ynnpack/ynn_fusion_thunk.h"
-#include "xla/backends/cpu/runtime/ynnpack/ynn_interop.h"
-#include "xla/backends/cpu/ynn_emitter.h"
-#include "xla/backends/cpu/ynn_support.h"
-#endif  // XLA_YNNPACK
-
 namespace xla::cpu {
 
 namespace {
@@ -441,11 +438,9 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitHloInstruction(
         }
 #endif  // XLA_ONEDNN_USE_GRAPH_API
 
-#ifdef XLA_YNNPACK
         if (backend_config.fusion_config().kind() == kYnnFusionKind) {
           return EmitYnnFusionThunk(instruction);
         }
-#endif  // XLA_YNNPACK
 
         return Internal("Unsupported custom fusion kind: %s",
                         backend_config.DebugString());
@@ -762,7 +757,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitConvolutionThunk(
       /*supported_types=*/
       {PRED, S8, U8, S16, U16, S32, U32, S64, U64, F16, F32, F64, C64, C128}));
 
-#ifdef XLA_YNNPACK
   const bool use_ynn = absl::c_linear_search(
       hlo_module_config_.debug_options().xla_cpu_experimental_ynn_fusion_type(),
       DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION);
@@ -771,7 +765,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitConvolutionThunk(
       return EmitYnnFusionThunk(instruction);
     }
   }
-#endif  // XLA_YNNPACK
 
   // TODO(tonywy): Add PotentiallyImplementedAsMKLConvolution to support
   // different data layouts.
@@ -1090,7 +1083,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitDotThunk(
       TF_ASSIGN_OR_RETURN(BufferAllocation::Slice out_slice,
                           GetAllocationSlice(instruction));
 
-#ifdef XLA_YNNPACK
       const bool use_ynn = absl::c_linear_search(
           hlo_module_config_.debug_options()
               .xla_cpu_experimental_ynn_fusion_type(),
@@ -1104,7 +1096,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitDotThunk(
           return EmitYnnFusionThunk(instruction);
         }
       }
-#endif  // XLA_YNNPACK
 
       return ThunkSequence::Of<DotThunk>(
           ThunkInfo(instruction), dnums, lhs_slice, lhs->shape(), rhs_slice,
@@ -1474,7 +1465,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitOneDnnFusionThunk(
 
 absl::StatusOr<ThunkSequence> ThunkEmitter::EmitYnnFusionThunk(
     const HloInstruction* instruction) {
-#ifdef XLA_YNNPACK
   // Collect YNNPACK fusion arguments.
   std::vector<YnnFusionThunk::Argument> arguments;
   for (HloInstruction* operand : instruction->operands()) {
@@ -1530,9 +1520,6 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitYnnFusionThunk(
         return b(arg_buffers);
       },
       captured_arguments_ids);
-#else
-  return Unimplemented("XLA is not built with YNNPACK.");
-#endif  // XLA_YNNPACK
 }
 
 absl::StatusOr<ThunkEmitter::HostKernelAllocationSlices>
diff --git a/third_party/xla/xla/tsl/xnnpack/build_defs.bzl b/third_party/xla/xla/tsl/xnnpack/build_defs.bzl
deleted file mode 100644
index 5aab05cadd1132..00000000000000
--- a/third_party/xla/xla/tsl/xnnpack/build_defs.bzl
+++ /dev/null
@@ -1,38 +0,0 @@
-"""Macros for XNNPACK and YNNPACK."""
-
-load("//xla:xla.default.bzl", "xla_cc_test")
-load("//xla/tsl:package_groups.bzl", "DEFAULT_LOAD_VISIBILITY")
-
-visibility(DEFAULT_LOAD_VISIBILITY)
-
-def if_ynnpack(if_true, if_false = []):
-    """Selection based on whether we are building XLA with YNNPACK integration.
-
-    Args:
-      if_true: Expression to evaluate if building with YNNPACK.
-      if_false: Expression to evaluate if building without YNNPACK.
-
-    Returns:
-      A select evaluating to either if_true or if_false as appropriate.
-    """
-    return select({
-        # YNNPACK is not tested on Windows.
-        "//xla/tsl:windows": if_false,
-        "//conditions:default": if_true,
-    })
-
-def ynn_cc_test(
-        srcs = [],
-        deps = [],
-        **kwargs):
-    """xla_cc_test rule with empty src and deps if not building with YNNPACK."""
-    xla_cc_test(
-        # CC_TEST_OK=Just defining `xla_cc_test` rule to be used in XLA.
-        srcs = if_ynnpack(srcs),
-        deps = if_ynnpack(if_true = deps, if_false = ["@com_google_googletest//:gtest_main"]),
-        # If not building with YNNPACK, we don't have any tests linked.
-        fail_if_no_test_linked = False,
-        # If not building with YNNPACK, we don't have any tests defined either.
-        fail_if_no_test_selected = False,
-        **kwargs
-    )

From bc4254e120fd2a8f3f13d94bdf886e6b7981f858 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 12 Dec 2025 16:22:44 -0800
Subject: [PATCH 249/753] add H200 to GetDeviceMemoryInBytes.

PiperOrigin-RevId: 843871678
---
 third_party/xla/xla/python/ifrt/ir/utils.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/third_party/xla/xla/python/ifrt/ir/utils.cc b/third_party/xla/xla/python/ifrt/ir/utils.cc
index c8015a51eccc50..6744cce7d1d796 100644
--- a/third_party/xla/xla/python/ifrt/ir/utils.cc
+++ b/third_party/xla/xla/python/ifrt/ir/utils.cc
@@ -51,6 +51,9 @@ absl::StatusOr<int64_t> GetDeviceMemoryInBytes(absl::string_view device_kind) {
   if (device_kind == "NVIDIA H100 80GB HBM3") {
     return 80LL * kGB;
   }
+  if (device_kind == "NVIDIA H200") {
+    return 141LL * kGB;
+  }
   if (device_kind == "NVIDIA B200") {
     return 192LL * kGB;
   }

From bb14c57f59f9e12602b6ab277a36cf486470b4f8 Mon Sep 17 00:00:00 2001
From: Juhyun Lee <impjdi@google.com>
Date: Fri, 12 Dec 2025 16:37:09 -0800
Subject: [PATCH 250/753] Expose TypeScript of TFLite schema.

PiperOrigin-RevId: 843876097
---
 tensorflow/compiler/mlir/lite/schema/BUILD | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/compiler/mlir/lite/schema/BUILD b/tensorflow/compiler/mlir/lite/schema/BUILD
index 0c0381439d2b5e..649e198336c911 100644
--- a/tensorflow/compiler/mlir/lite/schema/BUILD
+++ b/tensorflow/compiler/mlir/lite/schema/BUILD
@@ -1,5 +1,10 @@
 load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
 load("@rules_cc//cc:cc_library.bzl", "cc_library")
+
+# copybara:uncomment_begin(google-only)
+# load("@flatbuffers//:flatbuffers.bzl", "flatbuffers_library", "ts_flatbuffers_library")
+# copybara:uncomment_end
+
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable")
 
@@ -131,3 +136,15 @@ tf_cc_test(
         "@flatbuffers//:flatc_library",
     ],
 )
+
+# copybara:uncomment_begin(google-only)
+# flatbuffers_library(
+#     name = "schema_fbslib",
+#     srcs = ["schema.fbs"],
+# )
+#
+# ts_flatbuffers_library(
+#     name = "schema_ts_fbs",
+#     deps = [":schema_fbslib"],
+# )
+# copybara:uncomment_end

From fb270466c6c0660a32d9d3cbdd8ed6fc84f48e9a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 12 Dec 2025 20:09:02 -0800
Subject: [PATCH 251/753] Automated Code Change

PiperOrigin-RevId: 843930021
---
 tensorflow/c/experimental/ops/gen/cpp/BUILD                 | 1 -
 tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc      | 1 +
 tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/c/experimental/ops/gen/cpp/BUILD b/tensorflow/c/experimental/ops/gen/cpp/BUILD
index 1e1d4eca98106a..05bd307fd499ec 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/BUILD
+++ b/tensorflow/c/experimental/ops/gen/cpp/BUILD
@@ -28,7 +28,6 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:op_gen_lib",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/platform:str_util",
         "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc
index 45e7b87069e361..e4b82c59072123 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h"
 
+#include <string>
 #include <vector>
 
 #include "tensorflow/c/experimental/ops/gen/common/path_config.h"
diff --git a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc
index e1db2c9b8ce14b..d97bd7ee2d921f 100644
--- a/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc
+++ b/tensorflow/c/experimental/ops/gen/cpp/cpp_generator_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/c/experimental/ops/gen/cpp/cpp_generator.h"
 
 #include <algorithm>
+#include <string>
 #include <vector>
 
 #include "tensorflow/c/experimental/ops/gen/common/path_config.h"

From 8216c30a188639a92ce365cf3af195d2917ce663 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Fri, 12 Dec 2025 21:16:43 -0800
Subject: [PATCH 252/753] [stream_executor:cuda] Use NCCL or NVSHMEM allocators
 to allocate collective memory

GpuCollectives::Allocate and Free will be removed in followup changes.
PiperOrigin-RevId: 843948327
---
 .../xla/xla/backends/gpu/collectives/BUILD    | 83 ++++++------------
 .../xla/xla/stream_executor/cuda/BUILD        | 54 ++++++++++++
 .../xla/stream_executor/cuda/cuda_executor.cc | 26 ++----
 .../xla/stream_executor/cuda/cuda_executor.h  | 12 ++-
 .../cuda/cuda_memory_allocator.cc             | 87 +++++++++++++++++++
 .../cuda/cuda_memory_allocator.h              | 62 +++++++++++++
 .../xla/stream_executor/cuda/cuda_platform.cc | 10 ++-
 .../cuda/nccl_memory_allocator.cc             | 11 +++
 .../cuda/nvshmem_memory_allocator.cc          | 12 +++
 9 files changed, 278 insertions(+), 79 deletions(-)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.cc
 create mode 100644 third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h

diff --git a/third_party/xla/xla/backends/gpu/collectives/BUILD b/third_party/xla/xla/backends/gpu/collectives/BUILD
index dacd10d369da45..316f0e882b1b30 100644
--- a/third_party/xla/xla/backends/gpu/collectives/BUILD
+++ b/third_party/xla/xla/backends/gpu/collectives/BUILD
@@ -1,4 +1,3 @@
-load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 load("//xla:xla.default.bzl", "xla_cc_test")
 load("//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm_is_configured")
@@ -20,57 +19,6 @@ package_group(
     ],
 )
 
-# Allows to explicitely disable nvshmem collectives using invocation flag.
-bool_flag(
-    name = "nvshmem_enabled",
-    build_setting_default = True,
-)
-
-# NVSHMEM requires builtin functions since it uses printf's for debugging.
-config_setting(
-    name = "no_builtin_used",
-    values = {
-        "copt": "-fno-builtin",
-    },
-)
-
-cc_library(
-    name = "nvshmem_collectives_if_builtin_used",
-    tags = [
-        "cuda-only",
-        "gpu",
-    ],
-    deps =
-        select({
-            ":no_builtin_used": [],
-            "//conditions:default": [":nvshmem_collectives"],
-        }),
-)
-
-config_setting(
-    name = "nvshmem_supported",
-    constraint_values = [
-        "@platforms//os:linux",
-    ],
-    flag_values = {
-        ":nvshmem_enabled": "True",
-    },
-)
-
-# Since selects can't be nested we need to create this intermediate target
-cc_library(
-    name = "nvshmem_collectives_if_supported",
-    tags = [
-        "cuda-only",
-        "gpu",
-    ],
-    deps =
-        select({
-            ":nvshmem_supported": [":nvshmem_collectives_if_builtin_used"],
-            "//conditions:default": [],
-        }),
-)
-
 # Build target that registers all available GPU collectives implementations with the collectives
 # registry at link time.
 cc_library(
@@ -373,6 +321,7 @@ cc_library(
         "@local_tsl//tsl/platform:numbers",
     ] + if_cuda_is_configured([
         "//xla/tsl/cuda:nccl",
+        "//xla/stream_executor/cuda:nccl_memory_allocator",  # buildcleaner: keep (static registration)
     ]) + if_rocm_is_configured([
         "@local_config_rocm//rocm:rocm_headers",
         "@local_config_rocm//rocm:rccl",
@@ -475,16 +424,15 @@ cc_library(
         "//xla/core/collectives:collectives_registry",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
-        "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/service:collective_ops_utils",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/stream_executor/cuda:nvshmem",
+        "//xla/stream_executor/cuda:nvshmem_memory_allocator",  # buildcleaner: keep (static registration)
         "//xla/stream_executor/gpu:gpu_stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
@@ -492,7 +440,6 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
-        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_tsl//tsl/platform:casts",
@@ -502,6 +449,32 @@ cc_library(
     alwayslink = True,
 )
 
+cc_library(
+    name = "nvshmem_collectives_if_builtin_used",
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = select({
+        "//xla/stream_executor/cuda:no_builtin_used": [],
+        "//conditions:default": [":nvshmem_collectives"],
+    }),
+)
+
+cc_library(
+    name = "nvshmem_collectives_if_supported",
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = select({
+        "//xla/stream_executor/cuda:nvshmem_supported": [
+            ":nvshmem_collectives_if_builtin_used",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
 xla_test(
     name = "nccl_communicator_test",
     srcs = ["nccl_communicator_test.cc"],
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 18502d54c0e9df..faa047a83318fc 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -74,6 +74,24 @@ bool_flag(
     ),
 )
 
+# Allows to explicitely disable nvshmem collectives using invocation flag.
+bool_flag(
+    name = "nvshmem_enabled",
+    build_setting_default = True,
+)
+
+# NVSHMEM requires builtin functions since it uses printf's for debugging.
+config_setting(
+    name = "no_builtin_used",
+    values = {"copt": "-fno-builtin"},
+)
+
+config_setting(
+    name = "nvshmem_supported",
+    constraint_values = ["@platforms//os:linux"],
+    flag_values = {":nvshmem_enabled": "True"},
+)
+
 config_setting(
     name = "libnvjitlink_support_enabled",
     flag_values = {
@@ -100,8 +118,10 @@ cc_library(
     deps = [
         ":cuda_diagnostics",
         ":cuda_executor",
+        ":cuda_memory_allocator",
         ":cuda_platform_id",
         ":cuda_status",
+        "//xla:debug_options_flags",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:executor_cache",
         "//xla/stream_executor:platform",
@@ -928,6 +948,30 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "cuda_memory_allocator",
+    srcs = ["cuda_memory_allocator.cc"],
+    hdrs = ["cuda_memory_allocator.h"],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        "//xla/stream_executor:memory_allocation",
+        "//xla/stream_executor:memory_allocator",
+        "//xla/stream_executor:stream_executor_h",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/base:no_destructor",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/synchronization",
+    ],
+)
+
 cc_library(
     name = "nccl_memory_allocator",
     srcs = ["nccl_memory_allocator.cc"],
@@ -937,12 +981,14 @@ cc_library(
         "gpu",
     ],
     deps = [
+        ":cuda_memory_allocator",
         "//xla:util",
         "//xla/stream_executor:activate_context",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:memory_allocation",
         "//xla/stream_executor:memory_allocator",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor/platform:initialize",
         "//xla/tsl/cuda:nccl",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
@@ -952,6 +998,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@local_tsl//tsl/platform:numbers",
     ],
+    alwayslink = True,  # static registration
 )
 
 cc_library(
@@ -988,10 +1035,13 @@ cc_library(
         "gpu",
     ],
     deps = [
+        ":cuda_memory_allocator",
         ":nvshmem",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:memory_allocation",
         "//xla/stream_executor:memory_allocator",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor/platform:initialize",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
@@ -1002,6 +1052,7 @@ cc_library(
         "@local_tsl//tsl/platform:numbers",
         "@nvshmem//:nvshmem_lib",
     ],
+    alwayslink = True,  # static registration
 )
 
 cc_library(
@@ -1206,6 +1257,7 @@ cc_library(
         ":cuda_context",
         ":cuda_event",
         ":cuda_kernel",
+        ":cuda_memory_allocator",
         ":cuda_platform_id",
         ":cuda_status",
         ":cuda_stream",
@@ -1236,12 +1288,14 @@ cc_library(
         "//xla/stream_executor:launch_dim",
         "//xla/stream_executor:memory_allocation",
         "//xla/stream_executor:memory_allocator",
+        "//xla/stream_executor:memory_space",
         "//xla/stream_executor:module_spec",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:plugin_registry",
         "//xla/stream_executor:semantic_version",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor:tensor_map",
         "//xla/stream_executor/gpu:context",
         "//xla/stream_executor/gpu:gpu_executor_header",
         "//xla/stream_executor/gpu:multicast_memory",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 39e49d407abe47..fa28dc809e5b0a 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "absl/base/call_once.h"
 #include "absl/base/casts.h"
 #include "absl/cleanup/cleanup.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/numeric/int128.h"
@@ -61,6 +62,7 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_context.h"
 #include "xla/stream_executor/cuda/cuda_event.h"
 #include "xla/stream_executor/cuda/cuda_kernel.h"
+#include "xla/stream_executor/cuda/cuda_memory_allocator.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_status.h"
 #include "xla/stream_executor/cuda/cuda_stream.h"
@@ -90,12 +92,14 @@ limitations under the License.
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/memory_space.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/semantic_version.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/stream_executor/tensor_map.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/logging.h"
@@ -1050,27 +1054,7 @@ CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
   }
 
   if (type == MemorySpace::kCollective) {
-    return std::make_unique<GenericMemoryAllocator>(
-        [this](uint64_t size)
-            -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
-          TF_ASSIGN_OR_RETURN(void* ptr, CollectiveMemoryAllocate(this, size));
-          XLA_VLOG_DEVICE(2, device_ordinal())
-              << "allocated " << ptr << " for context " << cuda_context_
-              << " of " << size << " bytes of collective memory";
-          return std::make_unique<GenericMemoryAllocation>(
-              ptr, size, [this](void* location, uint64_t size) {
-                auto status = CollectiveMemoryDeallocate(this, location);
-                if (!status.ok()) {
-                  XLA_LOG_DEVICE(ERROR, device_ordinal())
-                      << "failed to free collective memory at " << location
-                      << "; result: " << status;
-                } else {
-                  XLA_VLOG_DEVICE(2, device_ordinal())
-                      << "deallocated collective memory at " << location
-                      << " for context " << cuda_context_;
-                }
-              });
-        });
+    return CreateCollectiveMemoryAllocator(this, collective_allocator_type_);
   }
 
   if (type == MemorySpace::kHost) {
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
index ff9c0c3d49a165..00a6c0ca48f2a6 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/stream_executor/command_buffer.h"
 #include "xla/stream_executor/cuda/cuda_context.h"
 #include "xla/stream_executor/cuda/cuda_kernel.h"
+#include "xla/stream_executor/cuda/cuda_memory_allocator.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
@@ -53,17 +54,22 @@ limitations under the License.
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/memory_space.h"
 #include "xla/stream_executor/module_spec.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/stream_executor/tensor_map.h"
 
 namespace stream_executor::gpu {
 
 // This class implements GpuExecutor for NVIDIA GPUs that use CUDA libraries.
 class CudaExecutor : public GpuExecutor {
  public:
-  CudaExecutor(Platform* platform, int device_ordinal)
-      : GpuExecutor(platform, device_ordinal) {}
+  CudaExecutor(Platform* platform, int device_ordinal,
+               CollectiveAllocatorType collective_allocator_type)
+      : GpuExecutor(platform, device_ordinal),
+        collective_allocator_type_(collective_allocator_type) {}
+
   ~CudaExecutor() override;
   std::unique_ptr<ActivateContext> Activate() override;
   absl::Status Init() override;
@@ -225,6 +231,8 @@ class CudaExecutor : public GpuExecutor {
   // Returns true if a delay kernel is supported.
   absl::StatusOr<bool> DelayKernelIsSupported();
 
+  CollectiveAllocatorType collective_allocator_type_;
+
   bool is_vmm_supported_ = false;
 
   bool is_rdma_supported_ = false;
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.cc b/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.cc
new file mode 100644
index 00000000000000..2aa70468f488eb
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.cc
@@ -0,0 +1,87 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/stream_executor/cuda/cuda_memory_allocator.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "absl/base/const_init.h"
+#include "absl/base/no_destructor.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/functional/any_invocable.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/synchronization/mutex.h"
+#include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/stream_executor.h"
+
+namespace stream_executor::gpu {
+
+// Per-process registry of collective allocator factories.
+static absl::Mutex collective_allocators_mu(absl::kConstInit);
+static absl::NoDestructor<
+    absl::flat_hash_map<CollectiveAllocatorType, CollectiveAllocatorFactory>>
+    collective_allocators ABSL_GUARDED_BY(collective_allocators_mu);
+
+namespace {
+// Instead of failing early we return a memory allocator that always fails when
+// asked to allocate collective memory.
+//
+// TODO(patrios): We should fail early, but in open source builds something is
+// wrong with linking order and allocators are not registered.
+class NoCollectiveMemoryAllocator : public MemoryAllocator {
+ public:
+  explicit NoCollectiveMemoryAllocator(CollectiveAllocatorType allocator_type)
+      : allocator_type_(allocator_type) {}
+
+  absl::StatusOr<std::unique_ptr<MemoryAllocation>> Allocate(
+      uint64_t size) override {
+    return absl::UnimplementedError(absl::StrCat(
+        "No collective memory allocator registered for ", allocator_type_));
+  }
+
+ private:
+  CollectiveAllocatorType allocator_type_;
+};
+}  // namespace
+
+void RegisterCollectiveAllocatorFactory(
+    CollectiveAllocatorType allocator_type,
+    absl::AnyInvocable<std::unique_ptr<MemoryAllocator>(StreamExecutor*)>
+        allocator_factory) {
+  VLOG(1) << "Registering collective allocator factory for "
+          << absl::StrCat(allocator_type);
+  absl::MutexLock lock(collective_allocators_mu);
+  collective_allocators->insert({allocator_type, std::move(allocator_factory)});
+}
+
+absl::StatusOr<std::unique_ptr<MemoryAllocator>>
+CreateCollectiveMemoryAllocator(StreamExecutor* executor,
+                                CollectiveAllocatorType allocator_type) {
+  absl::MutexLock lock(collective_allocators_mu);
+  auto it = collective_allocators->find(allocator_type);
+  if (it == collective_allocators->end()) {
+    return std::make_unique<NoCollectiveMemoryAllocator>(allocator_type);
+  }
+  return it->second(executor);
+}
+
+}  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h b/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h
new file mode 100644
index 00000000000000..6aef4027a7b77f
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h
@@ -0,0 +1,62 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_STREAM_EXECUTOR_CUDA_CUDA_MEMORY_ALLOCATOR_H_
+#define XLA_STREAM_EXECUTOR_CUDA_CUDA_MEMORY_ALLOCATOR_H_
+
+#include <memory>
+#include <string>
+
+#include "absl/functional/any_invocable.h"
+#include "absl/status/statusor.h"
+#include "xla/stream_executor/memory_allocator.h"
+#include "xla/stream_executor/stream_executor.h"
+
+namespace stream_executor::gpu {
+
+// A type of memory allocator for kCollective memory space.
+enum class CollectiveAllocatorType { kNccl, kNvshmem };
+
+template <typename T>
+void AbslStringify(std::string* str, CollectiveAllocatorType allocator_type) {
+  switch (allocator_type) {
+    case CollectiveAllocatorType::kNccl:
+      *str = "NCCL";
+      break;
+    case CollectiveAllocatorType::kNvshmem:
+      *str = "NVSHMEM";
+      break;
+  }
+}
+
+using CollectiveAllocatorFactory =  // NOLINT
+    absl::AnyInvocable<std::unique_ptr<MemoryAllocator>(StreamExecutor*)>;
+
+// Static registration of a collective memory allocator factory. NCCL and
+// NVSHMEM allocators are not supported in all build configurations, and
+// we rely on the static registration pattern as a way to ensure that
+// we can dynamically select between available allocators.
+void RegisterCollectiveAllocatorFactory(
+    CollectiveAllocatorType allocator_type,
+    CollectiveAllocatorFactory allocator_factory);
+
+// Creates a collective memory allocator for the given allocator type.
+absl::StatusOr<std::unique_ptr<MemoryAllocator>>
+CreateCollectiveMemoryAllocator(StreamExecutor* executor,
+                                CollectiveAllocatorType allocator_type);
+
+}  // namespace stream_executor::gpu
+
+#endif  // XLA_STREAM_EXECUTOR_CUDA_CUDA_MEMORY_ALLOCATOR_H_
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc b/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
index 87bc56a136f65e..5247fc7a9abe77 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_platform.cc
@@ -26,8 +26,10 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/nvml/include/nvml.h"
+#include "xla/debug_options_flags.h"
 #include "xla/stream_executor/cuda/cuda_diagnostics.h"
 #include "xla/stream_executor/cuda/cuda_executor.h"
+#include "xla/stream_executor/cuda/cuda_memory_allocator.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
 #include "xla/stream_executor/cuda/cuda_status.h"
 #include "xla/stream_executor/device_description.h"
@@ -120,7 +122,13 @@ absl::StatusOr<StreamExecutor*> CudaPlatform::FindExisting(int ordinal) {
 
 absl::StatusOr<std::unique_ptr<StreamExecutor>>
 CudaPlatform::GetUncachedExecutor(int ordinal) {
-  auto executor = std::make_unique<CudaExecutor>(this, ordinal);
+  // TODO(b/468297040): We should not be using DebugOptions here.
+  xla::DebugOptions debug_options = xla::GetDebugOptionsFromFlags();
+  auto executor = std::make_unique<CudaExecutor>(
+      this, ordinal,
+      debug_options.xla_gpu_experimental_enable_nvshmem()
+          ? CollectiveAllocatorType::kNvshmem
+          : CollectiveAllocatorType::kNccl);
   TF_RETURN_IF_ERROR(executor->Init());
   return std::move(executor);
 }
diff --git a/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc b/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc
index 38d8d5508acffb..0d8e3bcbd0dc9c 100644
--- a/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc
+++ b/third_party/xla/xla/stream_executor/cuda/nccl_memory_allocator.cc
@@ -25,8 +25,10 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "third_party/nccl/nccl.h"
 #include "xla/stream_executor/activate_context.h"
+#include "xla/stream_executor/cuda/cuda_memory_allocator.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/platform/initialize.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
@@ -106,3 +108,12 @@ absl::StatusOr<std::unique_ptr<MemoryAllocation>> NcclMemoryAllocator::Allocate(
 }
 
 }  // namespace stream_executor::gpu
+
+STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(
+    nccl_memory_allocator,
+    stream_executor::gpu::RegisterCollectiveAllocatorFactory(
+        stream_executor::gpu::CollectiveAllocatorType::kNccl,
+        [](stream_executor::StreamExecutor* executor) {
+          return std::make_unique<stream_executor::gpu::NcclMemoryAllocator>(
+              executor);
+        }));
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc
index b05c32458c31a5..3988de1106c821 100644
--- a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc
+++ b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator.cc
@@ -25,9 +25,12 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "third_party/nvshmem/nvshmem.h"   // IWYU pragma: keep
 #include "third_party/nvshmem/nvshmemx.h"  // IWYU pragma: keep
+#include "xla/stream_executor/cuda/cuda_memory_allocator.h"
 #include "xla/stream_executor/cuda/nvshmem.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/platform/initialize.h"
+#include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "tsl/platform/numbers.h"
@@ -90,3 +93,12 @@ NvshmemMemoryAllocator::Allocate(uint64_t size) {
 }
 
 }  // namespace stream_executor::gpu
+
+STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(
+    nvshmem_memory_allocator,
+    stream_executor::gpu::RegisterCollectiveAllocatorFactory(
+        stream_executor::gpu::CollectiveAllocatorType::kNvshmem,
+        [](stream_executor::StreamExecutor* executor) {
+          return std::make_unique<
+              stream_executor::gpu::NvshmemMemoryAllocator>();
+        }));

From a67b3331b951bb7f27990b8861555bb377ac56e7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 13 Dec 2025 00:37:37 -0800
Subject: [PATCH 253/753] Automated Code Change

PiperOrigin-RevId: 843996657
---
 third_party/xla/xla/service/cpu/BUILD             | 1 +
 third_party/xla/xla/service/cpu/fusion_wrapper.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD
index af4a362ded3a28..8f0c7d635d3879 100644
--- a/third_party/xla/xla/service/cpu/BUILD
+++ b/third_party/xla/xla/service/cpu/BUILD
@@ -1088,6 +1088,7 @@ cc_library(
     srcs = ["fusion_wrapper.cc"],
     hdrs = ["fusion_wrapper.h"],
     deps = [
+        "//xla:xla_data_proto_cc",
         "//xla/backends/cpu/codegen/tiled:tiled_fusion_emitter",
         "//xla/codegen/emitters:fusion_wrapper_base",
         "//xla/hlo/ir:hlo",
diff --git a/third_party/xla/xla/service/cpu/fusion_wrapper.cc b/third_party/xla/xla/service/cpu/fusion_wrapper.cc
index 1bef382eff71fe..5a8a5c6e558fa5 100644
--- a/third_party/xla/xla/service/cpu/fusion_wrapper.cc
+++ b/third_party/xla/xla/service/cpu/fusion_wrapper.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "xla/backends/cpu/codegen/tiled/tiled_fusion_emitter.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace cpu {

From 06318830d3e13bcf56cd8483cb60ccdd0dc6a4a8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 13 Dec 2025 01:03:13 -0800
Subject: [PATCH 254/753] compat: Update forward compatibility horizon to
 2025-12-13

PiperOrigin-RevId: 844002426
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 5847142d8ab4d5..291882942b6c4e 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 12)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 13)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 0f48d364f34f7168ab5f5311475782ac5b0ca768 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 13 Dec 2025 01:03:34 -0800
Subject: [PATCH 255/753] Update GraphDef version to 2440.

PiperOrigin-RevId: 844002537
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 9476cab5497275..95c0b503271a81 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2439  // Updated: 2025/12/12
+#define TF_GRAPH_DEF_VERSION 2440  // Updated: 2025/12/13
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 71f10c97469388f97046943fa67073bd44b0f48e Mon Sep 17 00:00:00 2001
From: Chun-nien Chan <cnchan@google.com>
Date: Sat, 13 Dec 2025 01:10:19 -0800
Subject: [PATCH 256/753] Fork flatbuffer_utils to litert.

PiperOrigin-RevId: 844004548
---
 tensorflow/lite/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index 8a2549e1c623db..e9c6e8fa5f2c52 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -12,6 +12,7 @@ package(
         "//tensorflow:__subpackages__",
         "//tensorflow:internal",
         "//third_party/odml/infra/genai/conversion:__subpackages__",
+        "//third_party/odml/litert/litert/python:__subpackages__",
         "//third_party/odml/model_customization/quantization:__subpackages__",
         "//third_party/py/ai_edge_torch:__subpackages__",
         "//third_party/py/tensorflow_federated:__subpackages__",

From f41fee8b2ebc4bd8d1cd2bdd845f6ccada6b3c21 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 13 Dec 2025 01:21:45 -0800
Subject: [PATCH 257/753] Automated Code Change

PiperOrigin-RevId: 844007709
---
 .../legalize_einsum_to_dot_general.cc                         | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc
index 0a20c69af6b6fa..7502231cc5814b 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <algorithm>
-#include <cctype>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 #include <iterator>
 #include <memory>
 #include <utility>

From 68062bd3ca8a6f938c50e0a778ad2557e64080de Mon Sep 17 00:00:00 2001
From: Mikhail Goncharov <goncharov@google.com>
Date: Sat, 13 Dec 2025 01:49:40 -0800
Subject: [PATCH 258/753] [XLA:GPU] replace includes of symbolic expression
 with mlir context

PiperOrigin-RevId: 844014544
---
 .../xla/xla/backends/cpu/codegen/BUILD        |  1 -
 .../xla/backends/cpu/codegen/emitters/BUILD   |  2 --
 .../codegen/emitters/cpu_fusion_emitter.cc    |  2 +-
 .../cpu/codegen/emitters/cpu_fusion_emitter.h |  2 +-
 .../emitters/cpu_fusion_emitter_test.cc       |  1 -
 .../codegen/emitters/cpu_scatter_emitter.cc   |  2 +-
 .../codegen/emitters/cpu_scatter_emitter.h    |  2 +-
 .../backends/cpu/codegen/fusion_emitter.cc    |  1 -
 .../xla/backends/cpu/codegen/fusion_emitter.h |  1 -
 .../xla/xla/backends/gpu/autotuner/BUILD      |  2 +-
 .../backends/gpu/autotuner/autotuner_main.cc  |  1 -
 .../backends/gpu/autotuner/factory_rocm.cc    |  2 +-
 .../xla/backends/gpu/codegen/emitters/BUILD   |  8 --------
 .../gpu/codegen/emitters/concatenate.cc       |  2 +-
 .../gpu/codegen/emitters/concatenate.h        |  2 +-
 .../gpu/codegen/emitters/emitter_base.cc      |  1 -
 .../gpu/codegen/emitters/emitter_base.h       |  2 +-
 .../gpu/codegen/emitters/emitter_base_test.cc |  1 -
 .../emitters/in_place_dynamic_update_slice.cc |  2 +-
 .../emitters/in_place_dynamic_update_slice.h  |  2 +-
 .../xla/backends/gpu/codegen/emitters/loop.h  |  2 +-
 .../gpu/codegen/emitters/reduction.cc         |  2 +-
 .../backends/gpu/codegen/emitters/reduction.h |  2 +-
 .../backends/gpu/codegen/emitters/scatter.cc  |  1 -
 .../backends/gpu/codegen/emitters/scatter.h   |  2 +-
 .../gpu/codegen/emitters/transpose.cc         |  1 -
 .../backends/gpu/codegen/emitters/transpose.h |  2 +-
 .../xla/xla/backends/gpu/codegen/tools/BUILD  |  1 +
 .../gpu/codegen/tools/gpu_test_correctness.cc |  2 +-
 .../xla/xla/backends/gpu/codegen/triton/BUILD |  2 --
 .../gpu/codegen/triton/fusion_test.cc         |  1 -
 .../backends/gpu/codegen/triton/test_utils.h  |  1 -
 third_party/xla/xla/codegen/BUILD             |  1 -
 third_party/xla/xla/codegen/emitters/BUILD    |  7 -------
 .../emitters/computation_partitioner.cc       |  1 -
 .../emitters/computation_partitioner.h        |  1 -
 .../emitters/computation_partitioner_test.cc  |  1 -
 .../emitters/concatenate_kernel_emitter.cc    |  2 +-
 .../emitters/concatenate_kernel_emitter.h     |  2 +-
 .../dynamic_update_slice_kernel_emitter.cc    |  2 +-
 .../dynamic_update_slice_kernel_emitter.h     |  2 +-
 .../codegen/emitters/elemental_hlo_to_mlir.cc |  2 +-
 .../emitters/elemental_hlo_to_mlir_test.cc    |  1 -
 third_party/xla/xla/codegen/emitters/ir/BUILD |  2 --
 .../xla/xla/codegen/emitters/ir/xla_ops.cc    |  1 -
 .../xla/codegen/emitters/ir/xla_ops_test.cc   |  1 -
 .../codegen/emitters/loop_kernel_emitter.cc   |  2 +-
 .../codegen/emitters/loop_kernel_emitter.h    |  2 +-
 .../xla/xla/codegen/mlir_kernel_source.cc     |  4 +---
 .../xla/xla/codegen/mlir_kernel_source.h      |  4 +---
 third_party/xla/xla/codegen/tiling/BUILD      |  6 ------
 .../codegen/tiling/symbolic_tile_analysis.h   |  1 -
 .../tiling/symbolic_tile_analysis_test.cc     |  1 -
 .../symbolic_tiled_hlo_instruction_test.cc    |  1 -
 .../xla/codegen/tiling/tiled_hlo_schedule.cc  |  1 -
 .../xla/codegen/tiling/tiled_hlo_schedule.h   |  2 +-
 .../codegen/tiling/tiled_hlo_schedule_test.cc |  1 -
 .../tiling/tiling_specification_test.cc       |  1 -
 third_party/xla/xla/hlo/analysis/BUILD        |  4 ----
 .../xla/xla/hlo/analysis/indexing_analysis.cc |  1 -
 .../xla/xla/hlo/analysis/indexing_analysis.h  |  2 +-
 .../analysis/indexing_map_serialization.cc    |  2 +-
 .../hlo/analysis/indexing_map_serialization.h |  2 +-
 .../indexing_map_serialization_test.cc        |  1 -
 .../xla/xla/hlo/analysis/indexing_map_test.cc |  1 -
 .../xla/hlo/analysis/indexing_test_utils.cc   |  2 +-
 .../xla/hlo/analysis/indexing_test_utils.h    |  2 +-
 .../xla/xla/service/cpu/thunk_emitter.h       |  1 -
 third_party/xla/xla/service/gpu/BUILD         | 10 ++--------
 .../xla/xla/service/gpu/autotuning/BUILD      |  5 ++---
 .../gpu/autotuning/gemm_fusion_autotuner.h    |  2 +-
 .../autotuning/gemm_fusion_autotuner_test.cc  |  1 -
 .../service/gpu/fusion_dispatch_pipeline.cc   |  2 +-
 .../service/gpu/fusion_dispatch_pipeline.h    |  2 +-
 .../xla/xla/service/gpu/fusion_pipeline.cc    |  1 -
 .../xla/xla/service/gpu/fusion_pipeline.h     |  2 +-
 .../xla/xla/service/gpu/gpu_compiler.h        |  1 -
 .../xla/xla/service/gpu/gpu_hlo_schedule.cc   |  2 +-
 .../xla/xla/service/gpu/gpu_hlo_schedule.h    |  2 +-
 .../gpu/gpu_latency_hiding_scheduler_test.cc  |  1 -
 .../xla/xla/service/gpu/ir_emitter_context.h  |  1 -
 .../xla/xla/service/gpu/kernel_call.cc        |  2 +-
 third_party/xla/xla/service/gpu/kernel_call.h |  2 +-
 .../xla/xla/service/gpu/kernel_call_test.cc   |  1 -
 third_party/xla/xla/service/gpu/model/BUILD   | 19 ++++---------------
 .../gpu/model/analytical_latency_estimator.cc |  2 +-
 .../gpu/model/analytical_latency_estimator.h  |  1 -
 .../analytical_latency_estimator_test.cc      |  1 -
 .../service/gpu/model/coalescing_analysis.cc  |  1 -
 .../service/gpu/model/coalescing_analysis.h   |  2 +-
 .../gpu/model/coalescing_analysis_test.cc     |  1 -
 .../model/gpu_cost_model_stats_collection.h   |  2 +-
 .../gpu_cost_model_stats_collection_test.cc   |  1 -
 .../model/gpu_indexing_performance_model.h    |  1 -
 .../gpu_indexing_performance_model_test.cc    |  1 -
 .../gpu/model/gpu_performance_model.cc        |  3 +--
 .../service/gpu/model/gpu_performance_model.h |  2 +-
 .../gpu/model/gpu_performance_model_base.cc   |  2 +-
 .../gpu/model/gpu_performance_model_base.h    |  2 +-
 .../model/gpu_performance_model_base_test.cc  |  1 -
 .../gpu/model/gpu_performance_model_test.cc   |  1 -
 .../sol_gpu_cost_model_stats_collection.h     |  2 +-
 ...ol_gpu_cost_model_stats_collection_test.cc |  1 -
 .../model/triton_emitter_constraints_test.cc  |  1 -
 .../xla/service/gpu/nvptx_compiler_test.cc    |  1 -
 .../xla/xla/service/gpu/transforms/BUILD      |  6 +-----
 .../service/gpu/transforms/collectives/BUILD  |  3 +--
 .../collective_combiner_annotator.cc          |  2 +-
 .../collective_combiner_annotator.h           |  2 +-
 .../collective_combiner_annotator_test.cc     |  1 -
 .../gpu/transforms/priority_fusion_test.cc    |  1 -
 .../gpu/transforms/softmax_rewriter_triton.cc |  1 -
 .../gpu/transforms/softmax_rewriter_triton.h  |  2 +-
 .../softmax_rewriter_triton_test.cc           |  1 -
 .../triton_fusion_numerics_verifier.cc        |  2 +-
 .../triton_fusion_numerics_verifier.h         |  2 +-
 .../triton_fusion_numerics_verifier_test.cc   |  1 -
 third_party/xla/xla/tests/BUILD               |  1 -
 .../tests/hlo_test_base_with_mlir_context.h   |  1 -
 third_party/xla/xla/tools/BUILD               |  5 -----
 120 files changed, 64 insertions(+), 180 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/codegen/BUILD b/third_party/xla/xla/backends/cpu/codegen/BUILD
index e770f8a6a7907f..4a4d285391904d 100644
--- a/third_party/xla/xla/backends/cpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/BUILD
@@ -712,7 +712,6 @@ cc_library(
         "//xla/codegen/emitters:kernel_arguments",
         "//xla/codegen/emitters:loop_kernel_emitter",
         "//xla/codegen/emitters/ir:xla",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/runtime:work_cluster",
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD b/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
index a7f171019e92cf..5832c34b6c6fe1 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/BUILD
@@ -48,7 +48,6 @@ cc_library(
         "//xla/codegen/emitters/ir:xla",
         "//xla/codegen/emitters/ir:xla_attrs_inc_gen",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc",
         "//xla/mlir_hlo",
@@ -105,7 +104,6 @@ xla_cc_test(
         "//xla/codegen:mlir_kernel_source",
         "//xla/hlo/analysis:alias_info",
         "//xla/hlo/analysis:hlo_ordering",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
index 53f2f0dc055c5f..e7fc61a4c0e774 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/IR/Types.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
@@ -64,7 +65,6 @@ limitations under the License.
 #include "xla/codegen/emitters/type_util.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.h b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.h
index 9dc42fa9c96f15..042417aac54ee4 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.h
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.h
@@ -28,11 +28,11 @@ limitations under the License.
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Pass/PassManager.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/mlir/tools/mlir_replay/public/compiler_trace.pb.h"
 #include "xla/service/buffer_assignment.h"
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_test.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_test.cc
index 0c09ff70124398..e8b0a586fc14e2 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_test.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter_test.cc
@@ -34,7 +34,6 @@ limitations under the License.
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/alias_info.h"
 #include "xla/hlo/analysis/hlo_ordering.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
index 537569a53404e5..711bdb28709ed0 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -62,7 +63,6 @@ limitations under the License.
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.h b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.h
index 20c4a7332b86f0..5f8fa0d525cda8 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.h
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.h
@@ -25,12 +25,12 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Value.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/kernel_emitter.h"
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/buffer_assignment.h"
 
diff --git a/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.cc
index 5e0bdaddccbb5f..cec9c5c1769275 100644
--- a/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.cc
@@ -47,7 +47,6 @@ limitations under the License.
 #include "xla/codegen/ir_emission_utils.h"
 #include "xla/codegen/kernel_definition.h"
 #include "xla/codegen/mlir_kernel_source.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.h b/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.h
index 74302f25fd368e..15ccb3b8fce2ae 100644
--- a/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.h
+++ b/third_party/xla/xla/backends/cpu/codegen/fusion_emitter.h
@@ -21,7 +21,6 @@ limitations under the License.
 #include "xla/codegen/emitters/kernel_arguments.h"
 #include "xla/codegen/kernel_definition.h"
 #include "xla/codegen/mlir_kernel_source.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/buffer_assignment.h"
 
diff --git a/third_party/xla/xla/backends/gpu/autotuner/BUILD b/third_party/xla/xla/backends/gpu/autotuner/BUILD
index c626469d5be7b3..db0708210980ca 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/BUILD
+++ b/third_party/xla/xla/backends/gpu/autotuner/BUILD
@@ -633,11 +633,11 @@ cc_library(
         ":triton",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/service:compiler",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/platform:platform_object_registry",
         "//xla/stream_executor/rocm:rocm_platform_id",
+        "@llvm-project//mlir:IR",
     ],
     alwayslink = True,
 )
diff --git a/third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc b/third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc
index b81cf80665db88..b32110d7971e4a 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/autotuner_main.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include "xla/backends/gpu/autotuner/gpu_profiler.h"
 #include "xla/backends/gpu/autotuner/legacy_cache.h"
 #include "xla/debug_options_flags.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc b/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
index e327f6abdde0e0..142738a9ef16e6 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/factory_rocm.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "mlir/IR/MLIRContext.h"
 #include "xla/backends/autotuner/codegen_backend.h"
 #include "xla/backends/gpu/autotuner/cublas.h"
 #include "xla/backends/gpu/autotuner/factory.h"
 #include "xla/backends/gpu/autotuner/triton.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/service/compiler.h"
 #include "xla/stream_executor/platform/platform_object_registry.h"
 #include "xla/stream_executor/rocm/rocm_platform_id.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD b/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
index 6f91eb138bab3c..debca012ccc125 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/BUILD
@@ -25,7 +25,6 @@ cc_library(
         "//xla/codegen/emitters:concatenate_kernel_emitter",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service/gpu:gpu_constants",
@@ -63,7 +62,6 @@ cc_library(
         "//xla/codegen/emitters/transforms:pass_pipelines",
         "//xla/codegen/emitters/transforms:passes",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/mlir/tools/mlir_replay/public:compiler_trace_instrumentation",
         "//xla/mlir/tools/mlir_replay/public:compiler_trace_proto_cc",
@@ -137,7 +135,6 @@ xla_cc_test(
         ":emitter_base",
         "//xla/codegen/emitters:computation_partitioner",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
@@ -175,7 +172,6 @@ cc_library(
         "//xla/codegen/emitters:dynamic_update_slice_kernel_emitter",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/runtime:work_dimensions",
@@ -207,7 +203,6 @@ cc_library(
         "//xla/codegen/emitters:loop_kernel_emitter",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/runtime:work_dimensions",
@@ -243,7 +238,6 @@ cc_library(
         "//xla/codegen/emitters:type_util",
         "//xla/codegen/emitters:utils",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service:platform_util",
@@ -335,7 +329,6 @@ cc_library(
         "//xla/codegen/emitters:utils",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service:scatter_simplifier",
         "//xla/service/gpu:gpu_fusible",
@@ -378,7 +371,6 @@ cc_library(
         "//xla/codegen/emitters:utils",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service/gpu:hlo_fusion_analysis",
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.cc
index 5490797ffd64c6..c7470ee44b7efa 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.cc
@@ -26,12 +26,12 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/concatenate_kernel_emitter.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_constants.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.h b/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.h
index 673dbb8e50a327..497e3a5bc3d0b3 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/concatenate.h
@@ -25,11 +25,11 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "xla/backends/gpu/codegen/emitters/emitter_base.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/buffer_assignment.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
index 2821b28b53215b..45a5f5ced1c791 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.cc
@@ -93,7 +93,6 @@ limitations under the License.
 #include "xla/codegen/emitters/transforms/passes.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.h b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.h
index bb0adddf675d1d..c019f8fb51535d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base.h
@@ -30,6 +30,7 @@ limitations under the License.
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -38,7 +39,6 @@ limitations under the License.
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/mlir/tools/mlir_replay/public/compiler_trace.pb.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base_test.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base_test.cc
index 151456f686069a..536daf99c5e728 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/emitter_base_test.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/testlib/filecheck.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.cc
index 4eb2b55f210f88..1e77fbbba4f168 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.cc
@@ -27,12 +27,12 @@ limitations under the License.
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/dynamic_update_slice_kernel_emitter.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/runtime/work_dimensions.h"
 #include "xla/service/buffer_assignment.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.h b/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.h
index 2b96e3f1fe54ab..66f09a64c11a06 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/in_place_dynamic_update_slice.h
@@ -22,10 +22,10 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/emitters/emitter_base.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/loop.h b/third_party/xla/xla/backends/gpu/codegen/emitters/loop.h
index d71111d9f1329b..9f2ad846138a8a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/loop.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/loop.h
@@ -22,10 +22,10 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/emitters/emitter_base.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/gpu_fusible.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.cc
index 19936a63675ce8..e366022385e018 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.cc
@@ -44,6 +44,7 @@ limitations under the License.
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/TypeRange.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -57,7 +58,6 @@ limitations under the License.
 #include "xla/codegen/emitters/utils.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.h b/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.h
index 5841543ccf5148..3de57127a1b009 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/reduction.h
@@ -28,13 +28,13 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
 #include "xla/backends/gpu/codegen/emitters/emitter_base.h"
 #include "xla/backends/gpu/codegen/emitters/reduction_base.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.cc
index 7a2ba02dfda734..fd07b7ee32e156 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.cc
@@ -55,7 +55,6 @@ limitations under the License.
 #include "xla/codegen/emitters/utils.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.h b/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.h
index d6261abfaadc56..a20866185c3db8 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/scatter.h
@@ -25,12 +25,12 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
 #include "xla/backends/gpu/codegen/emitters/emitter_base.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc
index 3b8cd150b0eb08..7059f439a844e3 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc
@@ -59,7 +59,6 @@ limitations under the License.
 #include "xla/codegen/emitters/utils.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h
index e3a8091c07f95f..ad3948f09a4b99 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h
@@ -29,13 +29,13 @@ limitations under the License.
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
 #include "mlir/Support/LLVM.h"
 #include "xla/backends/gpu/codegen/emitters/emitter_base.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/tools/BUILD b/third_party/xla/xla/backends/gpu/codegen/tools/BUILD
index df59b5ebed0cec..1e9e86b0dca529 100644
--- a/third_party/xla/xla/backends/gpu/codegen/tools/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/tools/BUILD
@@ -83,6 +83,7 @@ xla_cc_binary(
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "//xla/codegen/tools:test_lib",
+        "//xla/hlo/analysis:symbolic_expr",
         "//xla:debug_options_flags",
         "//xla:error_spec",
         "//xla:shape_util",
diff --git a/third_party/xla/xla/backends/gpu/codegen/tools/gpu_test_correctness.cc b/third_party/xla/xla/backends/gpu/codegen/tools/gpu_test_correctness.cc
index edc204a7ef8833..a8a606a22557ac 100644
--- a/third_party/xla/xla/backends/gpu/codegen/tools/gpu_test_correctness.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/tools/gpu_test_correctness.cc
@@ -35,11 +35,11 @@ limitations under the License.
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
 #include "xla/hlo/analysis/indexing_test_utils.h"
+#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
 #include "xla/shape.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
-#include "tsl/platform/statusor.h"
 
 struct Flags {
   std::string input_file = "";
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index faefcc8e14163f..daeb00d2397e2d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -81,7 +81,6 @@ xla_cc_test(
         ":fusion",
         "//xla/backends/gpu/codegen:fusion_emitter",
         "//xla/backends/gpu/codegen:fusions",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/service/gpu:gpu_device_info_for_tests",
@@ -760,7 +759,6 @@ cc_library(
         "//xla:status_macros",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/hlo/testlib:filecheck",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc
index e305a4c5cadbe0..e0f0c5272eb37d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/fusion_emitter.h"
 #include "xla/backends/gpu/codegen/fusions.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.h b/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.h
index 0d32097bf4cd88..74da9f00311e7e 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.h
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/test_utils.h
@@ -33,7 +33,6 @@ limitations under the License.
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/codegen/BUILD b/third_party/xla/xla/codegen/BUILD
index ddcc8810df0a6f..fce3eb83bd9efb 100644
--- a/third_party/xla/xla/codegen/BUILD
+++ b/third_party/xla/xla/codegen/BUILD
@@ -85,7 +85,6 @@ cc_library(
     deps = [
         ":kernel_source",
         "//xla:util",
-        "//xla/hlo/analysis:symbolic_expr",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
diff --git a/third_party/xla/xla/codegen/emitters/BUILD b/third_party/xla/xla/codegen/emitters/BUILD
index 5fc4861574b9ed..63ea5a16a9bc61 100644
--- a/third_party/xla/xla/codegen/emitters/BUILD
+++ b/third_party/xla/xla/codegen/emitters/BUILD
@@ -23,7 +23,6 @@ cc_library(
         "//xla:shape_util",
         "//xla:util",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service/llvm_ir:llvm_util",
         "@com_google_absl//absl/algorithm:container",
@@ -48,7 +47,6 @@ xla_cc_test(
     deps = [
         ":computation_partitioner",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/tests:xla_internal_test_main",
@@ -91,7 +89,6 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/transforms/simplifiers:gather_simplifier",
         "//xla/hlo/translate/hlo_to_mhlo:hlo_utils",
@@ -137,7 +134,6 @@ xla_cc_test(
         "//xla/backends/gpu/codegen/emitters/ir:xla_gpu",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
         "//xla/hlo/testlib:filecheck",
@@ -314,7 +310,6 @@ cc_library(
         "//xla/codegen:mlir_kernel_source",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/runtime:work_dimensions",
@@ -366,7 +361,6 @@ cc_library(
         "//xla/codegen:mlir_kernel_source",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/runtime:work_dimensions",
@@ -425,7 +419,6 @@ cc_library(
         "//xla/codegen:mlir_kernel_source",
         "//xla/codegen/emitters/ir:xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/runtime:work_dimensions",
diff --git a/third_party/xla/xla/codegen/emitters/computation_partitioner.cc b/third_party/xla/xla/codegen/emitters/computation_partitioner.cc
index a610c928102591..dd452ddf321f93 100644
--- a/third_party/xla/xla/codegen/emitters/computation_partitioner.cc
+++ b/third_party/xla/xla/codegen/emitters/computation_partitioner.cc
@@ -47,7 +47,6 @@ limitations under the License.
 #include "xla/codegen/emitters/type_util.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/codegen/emitters/computation_partitioner.h b/third_party/xla/xla/codegen/emitters/computation_partitioner.h
index 8fd113f43602da..8c5b6053298dcb 100644
--- a/third_party/xla/xla/codegen/emitters/computation_partitioner.h
+++ b/third_party/xla/xla/codegen/emitters/computation_partitioner.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/util.h"
diff --git a/third_party/xla/xla/codegen/emitters/computation_partitioner_test.cc b/third_party/xla/xla/codegen/emitters/computation_partitioner_test.cc
index 39ef286b2d089c..3fb8c566fb157c 100644
--- a/third_party/xla/xla/codegen/emitters/computation_partitioner_test.cc
+++ b/third_party/xla/xla/codegen/emitters/computation_partitioner_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "mlir/IR/Location.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
diff --git a/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.cc b/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.cc
index 090a1dea456744..69d9df07487f12 100644
--- a/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.cc
+++ b/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -56,7 +57,6 @@ limitations under the License.
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.h b/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.h
index b7d5d7d9b9b970..2ffc8e9fc8a36e 100644
--- a/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.h
+++ b/third_party/xla/xla/codegen/emitters/concatenate_kernel_emitter.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -30,7 +31,6 @@ limitations under the License.
 #include "xla/codegen/kernel_emitter.h"
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/runtime/work_dimensions.h"
 #include "xla/service/buffer_assignment.h"
diff --git a/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.cc b/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.cc
index fb8d3e6445e32d..af61c02c7e73d0 100644
--- a/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.cc
+++ b/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -54,7 +55,6 @@ limitations under the License.
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.h b/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.h
index e727d8296176a5..7346e4b8890c16 100644
--- a/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.h
+++ b/third_party/xla/xla/codegen/emitters/dynamic_update_slice_kernel_emitter.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "xla/codegen/kernel_spec.h"
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/runtime/work_dimensions.h"
diff --git a/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir.cc b/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir.cc
index 14d5adaadb48c6..0f800f3789a95b 100644
--- a/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir.cc
+++ b/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir.cc
@@ -55,6 +55,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/TypeRange.h"
 #include "mlir/IR/Types.h"
 #include "mlir/IR/Value.h"
@@ -67,7 +68,6 @@ limitations under the License.
 #include "xla/comparison_util.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir_test.cc b/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir_test.cc
index fd85579d9ccb3f..a90c1a51d74ea6 100644
--- a/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir_test.cc
+++ b/third_party/xla/xla/codegen/emitters/elemental_hlo_to_mlir_test.cc
@@ -44,7 +44,6 @@ limitations under the License.
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/hlo/testlib/filecheck.h"
diff --git a/third_party/xla/xla/codegen/emitters/ir/BUILD b/third_party/xla/xla/codegen/emitters/ir/BUILD
index bb534898b5fa96..d529bfef000b37 100644
--- a/third_party/xla/xla/codegen/emitters/ir/BUILD
+++ b/third_party/xla/xla/codegen/emitters/ir/BUILD
@@ -93,7 +93,6 @@ cc_library(
         ":xla_ops_inc_gen",
         "//xla/codegen/emitters:type_util",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -118,7 +117,6 @@ xla_test(
     deps = [
         ":xla",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/testlib:filecheck",
         "//xla/mlir/utils:error_util",
         "//xla/tests:hlo_pjrt_test_base",
diff --git a/third_party/xla/xla/codegen/emitters/ir/xla_ops.cc b/third_party/xla/xla/codegen/emitters/ir/xla_ops.cc
index 1239f8afe06f5e..09f351a845baf5 100644
--- a/third_party/xla/xla/codegen/emitters/ir/xla_ops.cc
+++ b/third_party/xla/xla/codegen/emitters/ir/xla_ops.cc
@@ -57,7 +57,6 @@ limitations under the License.
 #include "xla/codegen/emitters/ir/xla_dialect.cc.inc"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/codegen/emitters/ir/xla_ops_test.cc b/third_party/xla/xla/codegen/emitters/ir/xla_ops_test.cc
index cc4002b42c5ea5..d6c565694107de 100644
--- a/third_party/xla/xla/codegen/emitters/ir/xla_ops_test.cc
+++ b/third_party/xla/xla/codegen/emitters/ir/xla_ops_test.cc
@@ -39,7 +39,6 @@ limitations under the License.
 #include "mlir/Parser/Parser.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/testlib/filecheck.h"
 #include "xla/mlir/utils/error_util.h"
 #include "xla/tests/hlo_pjrt_test_base.h"
diff --git a/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.cc b/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.cc
index 467e80fd26888e..acbd64326bdc69 100644
--- a/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.cc
+++ b/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/IR/Value.h"
 #include "mlir/IR/ValueRange.h"
@@ -52,7 +53,6 @@ limitations under the License.
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.h b/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.h
index 46546815d2f8cb..b5d0761060f7a3 100644
--- a/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.h
+++ b/third_party/xla/xla/codegen/emitters/loop_kernel_emitter.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/emitters/computation_partitioner.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
@@ -29,7 +30,6 @@ limitations under the License.
 #include "xla/codegen/kernel_emitter.h"
 #include "xla/codegen/mlir_kernel_source.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/runtime/work_dimensions.h"
 #include "xla/service/buffer_assignment.h"
diff --git a/third_party/xla/xla/codegen/mlir_kernel_source.cc b/third_party/xla/xla/codegen/mlir_kernel_source.cc
index 8e7ea28a9b8198..80ebf56e4a468c 100644
--- a/third_party/xla/xla/codegen/mlir_kernel_source.cc
+++ b/third_party/xla/xla/codegen/mlir_kernel_source.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Parser/Parser.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/util.h"
 
 namespace xla {
@@ -54,8 +53,7 @@ absl::StatusOr<MlirKernelSource> MlirKernelSource::ParseFromString(
     return Internal("Failed to parse MLIR IR: %s", error_string);
   }
 
-  return MlirKernelSource(std::move(mlir_context),
-                          std::move(mlir_module));
+  return MlirKernelSource(std::move(mlir_context), std::move(mlir_module));
 }
 
 }  // namespace xla
diff --git a/third_party/xla/xla/codegen/mlir_kernel_source.h b/third_party/xla/xla/codegen/mlir_kernel_source.h
index 1beeb9b5a3a708..98342c01a80238 100644
--- a/third_party/xla/xla/codegen/mlir_kernel_source.h
+++ b/third_party/xla/xla/codegen/mlir_kernel_source.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Support/DebugStringHelper.h"
 #include "xla/codegen/kernel_source.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 
 namespace xla {
 
@@ -45,8 +44,7 @@ class MlirKernelSource final : public KernelSource {
   // context.
   MlirKernelSource(std::unique_ptr<mlir::MLIRContext> mlir_context,
                    mlir::OwningOpRef<mlir::ModuleOp> module)
-      : mlir_context_(std::move(mlir_context)),
-        module_(std::move(module)) {}
+      : mlir_context_(std::move(mlir_context)), module_(std::move(module)) {}
 
   // Construct a MLIR kernel source from a module but don't take any ownership
   // of the MLIR context.
diff --git a/third_party/xla/xla/codegen/tiling/BUILD b/third_party/xla/xla/codegen/tiling/BUILD
index 07ebadad917774..709dab0990718a 100644
--- a/third_party/xla/xla/codegen/tiling/BUILD
+++ b/third_party/xla/xla/codegen/tiling/BUILD
@@ -117,7 +117,6 @@ cc_library(
         ":tiling_specification",
         "//xla:util",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
@@ -142,7 +141,6 @@ xla_cc_test(
         ":tiling_specification",
         "//xla/hlo/analysis:indexing_analysis",
         "//xla/hlo/analysis:interval",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:verified_hlo_module",
@@ -290,7 +288,6 @@ xla_cc_test(
         ":symbolic_tile",
         ":symbolic_tiled_hlo_instruction",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/utils:hlo_traversal",
@@ -325,7 +322,6 @@ xla_cc_test(
     deps = [
         ":symbolic_tile_analysis",
         ":tiling_specification",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:verified_hlo_module",
@@ -353,7 +349,6 @@ cc_library(
         "//xla:util",
         "//xla/hlo/analysis:indexing_analysis",
         "//xla/hlo/analysis:interval",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service:instruction_fusion",
@@ -396,7 +391,6 @@ xla_cc_test(
         ":tiling_specification",
         "//xla:util",
         "//xla/hlo/analysis:indexing_test_utils",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
diff --git a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.h b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.h
index 6460e62c0b6882..bf1260c2b19820 100644
--- a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.h
+++ b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.h
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/codegen/tiling/tiled_hlo_schedule.h"
 #include "xla/codegen/tiling/tiling_specification.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis_test.cc b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis_test.cc
index 23beabca918ab3..4f76367dad31d3 100644
--- a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis_test.cc
+++ b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis_test.cc
@@ -45,7 +45,6 @@ limitations under the License.
 #include "xla/codegen/tiling/tiled_hlo_schedule.h"
 #include "xla/codegen/tiling/tiling_specification.h"
 #include "xla/hlo/analysis/indexing_test_utils.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/codegen/tiling/symbolic_tiled_hlo_instruction_test.cc b/third_party/xla/xla/codegen/tiling/symbolic_tiled_hlo_instruction_test.cc
index b480395b10dcc6..136c83ce3215d7 100644
--- a/third_party/xla/xla/codegen/tiling/symbolic_tiled_hlo_instruction_test.cc
+++ b/third_party/xla/xla/codegen/tiling/symbolic_tiled_hlo_instruction_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "xla/codegen/tiling/symbolic_tile.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.cc b/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.cc
index aa90ba63242859..71eee0d68f8e81 100644
--- a/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.cc
+++ b/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/codegen/tiling/tiling_specification.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.h b/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.h
index c7868092a37278..0d616b78b73c64 100644
--- a/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.h
+++ b/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule.h
@@ -21,9 +21,9 @@ limitations under the License.
 
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/tiling/tiling_specification.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule_test.cc b/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule_test.cc
index 77a1c5279051b9..9de0b05f029597 100644
--- a/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule_test.cc
+++ b/third_party/xla/xla/codegen/tiling/tiled_hlo_schedule_test.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
 #include "xla/hlo/analysis/interval.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/verified_hlo_module.h"
diff --git a/third_party/xla/xla/codegen/tiling/tiling_specification_test.cc b/third_party/xla/xla/codegen/tiling/tiling_specification_test.cc
index b5d4c3cb160641..86c4aca14d8ff4 100644
--- a/third_party/xla/xla/codegen/tiling/tiling_specification_test.cc
+++ b/third_party/xla/xla/codegen/tiling/tiling_specification_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/tiling/symbolic_tile_analysis.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/hlo/analysis/BUILD b/third_party/xla/xla/hlo/analysis/BUILD
index 9795916a0a8291..627179d0643995 100644
--- a/third_party/xla/xla/hlo/analysis/BUILD
+++ b/third_party/xla/xla/hlo/analysis/BUILD
@@ -604,7 +604,6 @@ cc_library(
     ],
     deps = [
         ":interval",
-        ":symbolic_expr",
         "//xla:permutation_util",
         "//xla:shape_util",
         "//xla:util",
@@ -637,7 +636,6 @@ xla_cc_test(
         ":indexing_analysis",
         ":indexing_test_utils",
         ":interval",
-        ":symbolic_expr",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:verified_hlo_module",
         "//xla/tests:xla_internal_test_main",
@@ -657,7 +655,6 @@ xla_cc_test(
     deps = [
         ":indexing_analysis",
         ":indexing_test_utils",
-        ":symbolic_expr",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings:string_view",
@@ -674,7 +671,6 @@ cc_library(
     hdrs = ["indexing_test_utils.h"],
     deps = [
         ":indexing_analysis",
-        ":symbolic_expr",
         "//xla:status_macros",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis.cc b/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
index 8f4e2ec440ae3f..7f686baea60ada 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
@@ -46,7 +46,6 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis.h b/third_party/xla/xla/hlo/analysis/indexing_analysis.h
index e0650668397b83..56e51d9b5804cc 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_analysis.h
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis.h
@@ -30,8 +30,8 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/shape.h"
diff --git a/third_party/xla/xla/hlo/analysis/indexing_map_serialization.cc b/third_party/xla/xla/hlo/analysis/indexing_map_serialization.cc
index bcb0b595ac114e..5646e22a76ba1f 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_map_serialization.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_map_serialization.cc
@@ -39,10 +39,10 @@ limitations under the License.
 #include "mlir/AsmParser/AsmParser.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/interval.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 
 namespace xla {
 namespace {
diff --git a/third_party/xla/xla/hlo/analysis/indexing_map_serialization.h b/third_party/xla/xla/hlo/analysis/indexing_map_serialization.h
index b531cdc2ddb14b..d6b5823e2d388f 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_map_serialization.h
+++ b/third_party/xla/xla/hlo/analysis/indexing_map_serialization.h
@@ -25,8 +25,8 @@ limitations under the License.
 #include "llvm/ADT/StringRef.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/hlo/analysis/indexing_map_serialization_test.cc b/third_party/xla/xla/hlo/analysis/indexing_map_serialization_test.cc
index 2ae50aae18f57b..27041cb7ee2555 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_map_serialization_test.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_map_serialization_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_test_utils.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "tsl/platform/test.h"
 
diff --git a/third_party/xla/xla/hlo/analysis/indexing_map_test.cc b/third_party/xla/xla/hlo/analysis/indexing_map_test.cc
index 93f727b9b35e91..98283eb8d7fe44 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_map_test.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_map_test.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "xla/hlo/analysis/indexing_map_serialization.h"
 #include "xla/hlo/analysis/indexing_test_utils.h"
 #include "xla/hlo/analysis/interval.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/verified_hlo_module.h"
 #include "tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/hlo/analysis/indexing_test_utils.cc b/third_party/xla/xla/hlo/analysis/indexing_test_utils.cc
index f7db83e6f76331..d7b24a58e4cd11 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_test_utils.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_test_utils.cc
@@ -39,10 +39,10 @@ limitations under the License.
 #include "mlir/AsmParser/AsmParser.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/status_macros.h"
 #include "tsl/platform/errors.h"
diff --git a/third_party/xla/xla/hlo/analysis/indexing_test_utils.h b/third_party/xla/xla/hlo/analysis/indexing_test_utils.h
index 2c8a67f3ae65cc..dc53be5c8ce0db 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_test_utils.h
+++ b/third_party/xla/xla/hlo/analysis/indexing_test_utils.h
@@ -28,10 +28,10 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/verified_hlo_module.h"
diff --git a/third_party/xla/xla/service/cpu/thunk_emitter.h b/third_party/xla/xla/service/cpu/thunk_emitter.h
index 6d034ddc6234aa..bb716b166487f0 100644
--- a/third_party/xla/xla/service/cpu/thunk_emitter.h
+++ b/third_party/xla/xla/service/cpu/thunk_emitter.h
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/sort_thunk.h"
 #include "xla/backends/cpu/runtime/thunk.h"
 #include "xla/codegen/kernel_spec.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index be400454a9386c..87e5ac19c0e0d6 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -424,7 +424,6 @@ cc_library(
         "//xla/backends/gpu/runtime:collective_thunk",
         "//xla/backends/gpu/runtime:host_execute_thunk",
         "//xla/backends/gpu/runtime:thunk_id",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service:call_inliner",
@@ -605,7 +604,6 @@ cc_library(
     srcs = ["kernel_call.cc"],
     hdrs = ["kernel_call.h"],
     deps = [
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/stream_executor:launch_dim",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
@@ -626,7 +624,6 @@ xla_cc_test(
     srcs = ["kernel_call_test.cc"],
     deps = [
         ":kernel_call",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/tests:xla_internal_test_main",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/strings:string_view",
@@ -1504,13 +1501,13 @@ cc_library(
     hdrs = ["fusion_dispatch_pipeline.h"],
     deps = [
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/hlo/transforms/simplifiers:hlo_dce",
         "//xla/service:hlo_cost_analysis",
         "//xla/service/gpu/transforms:fusion_block_level_rewriter",
         "//xla/service/gpu/transforms:fusion_dynamic_memcpy_rewriter",
         "//xla/stream_executor:device_description",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -1521,7 +1518,6 @@ cc_library(
     deps = [
         ":alias_info",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/pass:hlo_pass_pipeline",
@@ -2245,7 +2241,6 @@ xla_test(
         "//xla:util",
         "//xla:xla_proto_cc",
         "//xla/hlo/analysis:hlo_ordering",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:backend",
@@ -2498,7 +2493,6 @@ cc_library(
         ":ir_emission_utils",
         "//xla:shape_util",
         "//xla:util",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/hlo/transforms/collectives:async_collective_creator",
@@ -2530,6 +2524,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
+        "@llvm-project//mlir:IR",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:protobuf",
         "@local_tsl//tsl/profiler/lib:traceme",
@@ -3288,7 +3283,6 @@ xla_cc_test(
         ":gpu_device_info_for_tests",
         ":gpu_hlo_schedule",
         ":gpu_latency_hiding_scheduler",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/service:hlo_module_config",
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 544ac7a01b382b..10c0b530b33f79 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -50,7 +50,6 @@ cc_library(
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
         "//xla/backends/gpu/autotuner:cudnn",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/pjrt/distributed:key_value_store_interface",
@@ -76,6 +75,7 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
+        "@llvm-project//mlir:IR",
         "@local_config_cuda//cuda:cuda_headers",
     ],
 )
@@ -98,7 +98,6 @@ cc_library(
         "//xla:autotuning_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/autotuner:codegen_backend",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/pjrt/distributed:key_value_store_interface",
@@ -117,6 +116,7 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
+        "@llvm-project//mlir:IR",
         "@local_config_rocm//rocm:rocm_headers",
     ],
 )
@@ -257,7 +257,6 @@ xla_test(
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/backends/gpu/autotuner:gpu_codegen_backend",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/hlo/testlib:filecheck",
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.h b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.h
index 1ca4059f91d77e..19023367bfb7b8 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.h
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner.h
@@ -28,9 +28,9 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/autotuning.pb.h"
 #include "xla/backends/autotuner/codegen_backend.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
index 1744f373b71300..1198ba3ec5af83 100644
--- a/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/gemm_fusion_autotuner_test.cc
@@ -39,7 +39,6 @@ limitations under the License.
 #include "xla/autotuning.pb.h"
 #include "xla/backends/gpu/autotuner/gpu_codegen_backend.h"
 #include "xla/error_spec.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
diff --git a/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.cc b/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.cc
index 992bef7b2ae8e2..a8b83730b145a9 100644
--- a/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.cc
+++ b/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "xla/service/gpu/fusion_dispatch_pipeline.h"
 
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/hlo/transforms/simplifiers/hlo_dce.h"
 #include "xla/service/gpu/transforms/fusion_block_level_rewriter.h"
diff --git a/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.h b/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.h
index 8d8b26a2ef9387..2fae2a309bf1ac 100644
--- a/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.h
+++ b/third_party/xla/xla/service/gpu/fusion_dispatch_pipeline.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_FUSION_DISPATCH_PIPELINE_H_
 #define XLA_SERVICE_GPU_FUSION_DISPATCH_PIPELINE_H_
 
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/stream_executor/device_description.h"
diff --git a/third_party/xla/xla/service/gpu/fusion_pipeline.cc b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
index ce4176081976e6..2758ac1fe479f5 100644
--- a/third_party/xla/xla/service/gpu/fusion_pipeline.cc
+++ b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <utility>
 
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/pass/hlo_pass_fix.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
diff --git a/third_party/xla/xla/service/gpu/fusion_pipeline.h b/third_party/xla/xla/service/gpu/fusion_pipeline.h
index 64b4ceea176562..89802eb3a8b422 100644
--- a/third_party/xla/xla/service/gpu/fusion_pipeline.h
+++ b/third_party/xla/xla/service/gpu/fusion_pipeline.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef XLA_SERVICE_GPU_FUSION_PIPELINE_H_
 #define XLA_SERVICE_GPU_FUSION_PIPELINE_H_
 
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/service/gpu/alias_info.h"
 #include "xla/service/hlo_cost_analysis.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.h b/third_party/xla/xla/service/gpu/gpu_compiler.h
index 595e9210c8c061..be97807e6c8c89 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.h
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "llvm/IR/Module.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/autotune_results.pb.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/hlo/transforms/simplifiers/algebraic_simplifier.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
index 2afc4787298d11..5a8070693dd481 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.cc
@@ -36,7 +36,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_input_output_alias_config.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
index ef5c5dacda34b6..1cf84062318131 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule.h
@@ -21,7 +21,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/service/gpu/alias_info.h"
diff --git a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
index 397e1c85737bf2..d56c3628e91e37 100644
--- a/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_latency_hiding_scheduler_test.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/service/gpu/ir_emitter_context.h b/third_party/xla/xla/service/gpu/ir_emitter_context.h
index 8268f48c5809de..528be1317cc0fb 100644
--- a/third_party/xla/xla/service/gpu/ir_emitter_context.h
+++ b/third_party/xla/xla/service/gpu/ir_emitter_context.h
@@ -32,7 +32,6 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/host_execute_thunk.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/buffer_assignment.h"
diff --git a/third_party/xla/xla/service/gpu/kernel_call.cc b/third_party/xla/xla/service/gpu/kernel_call.cc
index c5c876e02837bf..0038faa9153d93 100644
--- a/third_party/xla/xla/service/gpu/kernel_call.cc
+++ b/third_party/xla/xla/service/gpu/kernel_call.cc
@@ -29,9 +29,9 @@ limitations under the License.
 #include "mlir/AsmParser/AsmParser.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
 #include "mlir/Parser/Parser.h"
 #include "mlir/Support/LLVM.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/tsl/platform/logging.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/service/gpu/kernel_call.h b/third_party/xla/xla/service/gpu/kernel_call.h
index 104ef7fcbdf548..c158bbf9cd30c0 100644
--- a/third_party/xla/xla/service/gpu/kernel_call.h
+++ b/third_party/xla/xla/service/gpu/kernel_call.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/stream_executor/launch_dim.h"
 
 namespace xla::gpu {
diff --git a/third_party/xla/xla/service/gpu/kernel_call_test.cc b/third_party/xla/xla/service/gpu/kernel_call_test.cc
index 030324a94548e4..de0c4c66ea25d6 100644
--- a/third_party/xla/xla/service/gpu/kernel_call_test.cc
+++ b/third_party/xla/xla/service/gpu/kernel_call_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/string_view.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/tsl/platform/statusor.h"
 
 namespace xla::gpu {
diff --git a/third_party/xla/xla/service/gpu/model/BUILD b/third_party/xla/xla/service/gpu/model/BUILD
index 698fc2f88afa2e..2b1781b7923784 100644
--- a/third_party/xla/xla/service/gpu/model/BUILD
+++ b/third_party/xla/xla/service/gpu/model/BUILD
@@ -31,7 +31,6 @@ cc_library(
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_cost_analysis",
@@ -159,7 +158,6 @@ xla_test(
     deps = [
         ":analytical_latency_estimator",
         "//xla:shape_util",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:latency_hiding_scheduler",
@@ -220,7 +218,6 @@ cc_library(
         ":gpu_dot_fusion_cost_model",
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/utils:hlo_query",
@@ -238,6 +235,7 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/time",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -247,7 +245,6 @@ xla_cc_test(
     deps = [
         ":gpu_cost_model_stats_collection",
         ":gpu_hlo_cost_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/service:hlo_cost_analysis",
@@ -323,7 +320,6 @@ cc_library(
         "//xla/backends/gpu/codegen:fusion_emitter",
         "//xla/backends/gpu/codegen:fusions",
         "//xla/backends/gpu/codegen/triton:fusion",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service/gpu:backend_configs_cc",
@@ -337,6 +333,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -346,7 +343,6 @@ xla_cc_test(
     deps = [
         ":gpu_hlo_cost_analysis",
         ":gpu_performance_model_base",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:test_helpers",
@@ -374,7 +370,6 @@ cc_library(
         ":gpu_performance_model_base",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:hlo_fusion_analysis",
@@ -386,6 +381,7 @@ cc_library(
         "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -400,7 +396,6 @@ xla_cc_test(
         ":gpu_performance_model_base",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:test_helpers",
@@ -502,7 +497,6 @@ cc_library(
         "//xla/codegen/tiling:symbolic_tile_analysis",
         "//xla/codegen/tiling:tiled_hlo_computation",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/service:hlo_cost_analysis",
@@ -539,7 +533,6 @@ xla_cc_test(
         "//xla/codegen/tiling:symbolic_tile_analysis",
         "//xla/codegen/tiling:tiled_hlo_computation",
         "//xla/codegen/tiling:tiling_specification",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:test_helpers",
@@ -617,7 +610,6 @@ xla_cc_test(
     deps = [
         ":triton_emitter_constraints",
         "//xla/codegen/tiling:symbolic_tile_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:verified_hlo_module",
@@ -648,7 +640,6 @@ cc_library(
         "//xla/codegen/tiling:tiled_hlo_instruction",
         "//xla/hlo/analysis:indexing_analysis",
         "//xla/hlo/analysis:interval",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service/gpu:gpu_fusible",
         "//xla/service/gpu:hlo_fusion_analysis",
@@ -676,7 +667,6 @@ xla_cc_test(
         "//xla/codegen/tiling:tiled_hlo_instruction",
         "//xla/codegen/tiling:tiled_hlo_schedule",
         "//xla/codegen/tiling:tiling_specification",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/utils:hlo_traversal",
@@ -855,7 +845,6 @@ cc_library(
         ":gpu_hlo_cost_analysis",
         ":sol_latency_estimator",
         "//xla/backends/gpu/codegen/triton:support",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/utils:hlo_query",
@@ -873,6 +862,7 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -882,7 +872,6 @@ xla_cc_test(
     deps = [
         ":sol_gpu_cost_model_stats_collection",
         "//xla:shape_util",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/service:hlo_cost_analysis",
diff --git a/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.cc b/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.cc
index 9b5e7654a2e36c..0c7aa67f8cbc55 100644
--- a/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.cc
+++ b/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/time/time.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
diff --git a/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.h b/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.h
index c23b78bbaefeda..2b1c51737dea1f 100644
--- a/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.h
+++ b/third_party/xla/xla/service/gpu/model/analytical_latency_estimator.h
@@ -20,7 +20,6 @@ limitations under the License.
 #include <optional>
 
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/model/gpu_hlo_cost_analysis.h"
diff --git a/third_party/xla/xla/service/gpu/model/analytical_latency_estimator_test.cc b/third_party/xla/xla/service/gpu/model/analytical_latency_estimator_test.cc
index 9e9e60691146ac..c4dbc694860320 100644
--- a/third_party/xla/xla/service/gpu/model/analytical_latency_estimator_test.cc
+++ b/third_party/xla/xla/service/gpu/model/analytical_latency_estimator_test.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/service/gpu/alias_info.h"
diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc
index 5346566cb7c281..59479b7e7114a7 100644
--- a/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc
+++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "xla/hlo/analysis/indexing_analysis.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/interval.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/layout.h"
diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis.h b/third_party/xla/xla/service/gpu/model/coalescing_analysis.h
index c3ac7c3a9a9054..550d847d4af1b7 100644
--- a/third_party/xla/xla/service/gpu/model/coalescing_analysis.h
+++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis.h
@@ -21,9 +21,9 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/types/span.h"
 #include "llvm/ADT/SmallVector.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/tiling/tiled_hlo_instruction.h"
 #include "xla/hlo/analysis/indexing_map.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
 #include "xla/stream_executor/device_description.h"
diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
index d682e41e49b365..6d81b8c935ec17 100644
--- a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
+++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "xla/codegen/tiling/tiled_hlo_instruction.h"
 #include "xla/codegen/tiling/tiled_hlo_schedule.h"
 #include "xla/codegen/tiling/tiling_specification.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection.h b/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection.h
index 8c0fdd437b8614..843b3f6b8b31ba 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection.h
+++ b/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc b/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc
index 2e39608c5780ca..975e02453cfc94 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_cost_model_stats_collection_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/status/status_matchers.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/service/gpu/backend_configs.pb.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h
index 88c15cd960926a..85f40e8505482d 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h
+++ b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/tiling/tiled_hlo_computation.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model_test.cc b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model_test.cc
index 4e54b437174750..27fba492d0bf81 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model_test.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_indexing_performance_model_test.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "xla/codegen/tiling/symbolic_tile_analysis.h"
 #include "xla/codegen/tiling/tiled_hlo_computation.h"
 #include "xla/codegen/tiling/tiling_specification.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model.cc b/third_party/xla/xla/service/gpu/model/gpu_performance_model.cc
index 4a97ae21959e91..956bc9f926ca01 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_performance_model.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_performance_model.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "llvm/ADT/STLExtras.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
@@ -319,6 +319,5 @@ void GpuPerformanceModel::RecordEstimatedRunTime(
   VLOG(8) << "RecordEstimatedRunTime: " << instruction->ToString();
 }
 
-
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model.h b/third_party/xla/xla/service/gpu/model/gpu_performance_model.h
index 9882cec1c2824f..c150db6dde043f 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_performance_model.h
+++ b/third_party/xla/xla/service/gpu/model/gpu_performance_model.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "absl/time/time.h"
 #include "absl/types/span.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/model/fusion_analysis_cache.h"
 #include "xla/service/gpu/model/gpu_hlo_cost_analysis.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.cc b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.cc
index 3975bbfafe32a0..e369f215601a70 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.cc
@@ -27,10 +27,10 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/fusion_emitter.h"
 #include "xla/backends/gpu/codegen/fusions.h"
 #include "xla/backends/gpu/codegen/triton/fusion.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.h b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.h
index 166fc8e6842e9c..25dd6242985702 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.h
+++ b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
 #include "xla/service/gpu/launch_dimensions.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base_test.cc b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base_test.cc
index cbdf16d34ffcda..c517f352eeba0f 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_performance_model_base_test.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_performance_model_base_test.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/strings/string_view.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
diff --git a/third_party/xla/xla/service/gpu/model/gpu_performance_model_test.cc b/third_party/xla/xla/service/gpu/model/gpu_performance_model_test.cc
index e14d7dfc1ed7a7..8bc33e08942fe1 100644
--- a/third_party/xla/xla/service/gpu/model/gpu_performance_model_test.cc
+++ b/third_party/xla/xla/service/gpu/model/gpu_performance_model_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/time/time.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection.h b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection.h
index 6d7ccde25851f5..3deed08bd99c69 100644
--- a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection.h
+++ b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
 #include "xla/service/hlo_verifier.h"
diff --git a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection_test.cc b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection_test.cc
index ea03349b3ab584..13cf1dfa711985 100644
--- a/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection_test.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_gpu_cost_model_stats_collection_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/strings/string_view.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/service/gpu/backend_configs.pb.h"
diff --git a/third_party/xla/xla/service/gpu/model/triton_emitter_constraints_test.cc b/third_party/xla/xla/service/gpu/model/triton_emitter_constraints_test.cc
index 8be49e50bd197a..f7ded1b6f880b6 100644
--- a/third_party/xla/xla/service/gpu/model/triton_emitter_constraints_test.cc
+++ b/third_party/xla/xla/service/gpu/model/triton_emitter_constraints_test.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "absl/status/status_matchers.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/codegen/tiling/symbolic_tile_analysis.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
index fe0469980f45ed..6a09c25d21220d 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/analysis/hlo_ordering.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_query.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 6b98ddb81a56eb..6bd2d3f396eccf 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -2039,7 +2039,6 @@ xla_cc_test(
     deps = [
         ":priority_fusion",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:pattern_matcher_gmock",
@@ -2508,7 +2507,6 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/codegen/triton:support",
         "//xla/codegen/tiling:symbolic_tile_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/pass:hlo_pass_pipeline",
@@ -2549,7 +2547,6 @@ xla_cc_test(
         ":softmax_rewriter_triton",
         "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/codegen/triton:support",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:pattern_matcher_gmock",
@@ -2925,7 +2922,6 @@ cc_library(
         "//xla:util",
         "//xla:xla_proto_cc",
         "//xla/backends/gpu/runtime:buffer_comparator",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/pass:hlo_pass_pipeline",
@@ -2955,6 +2951,7 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -2974,7 +2971,6 @@ xla_test(
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/service:platform_util",
         "//xla/service/gpu/autotuning:autotuner_compile_util",
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/BUILD b/third_party/xla/xla/service/gpu/transforms/collectives/BUILD
index 2bd05f3b369ee6..27c3592138be77 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/BUILD
@@ -300,7 +300,6 @@ cc_library(
         ":collective_ops_utils",
         ":convert_async_collectives_to_sync",
         "//xla:util",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/pass:hlo_pass_pipeline",
@@ -316,6 +315,7 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//mlir:IR",
     ],
 )
 
@@ -324,7 +324,6 @@ xla_cc_test(
     srcs = ["collective_combiner_annotator_test.cc"],
     deps = [
         ":collective_combiner_annotator",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/service:hlo_module_config",
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.cc b/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.cc
index d9bbc6808310db..5798942c82a05a 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.cc
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.h b/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.h
index c8854d79b7861a..b211e9647edd9f 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.h
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator_test.cc b/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator_test.cc
index 522094a560e749..4b62feaf8069bb 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/collective_combiner_annotator_test.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/service/gpu/alias_info.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc
index f3253d1a1257b7..e9c4905147cb9d 100644
--- a/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/priority_fusion_test.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.cc b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.cc
index ef7b6269cacb51..5aa9efb309d25f 100644
--- a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.cc
+++ b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/triton/support.h"
 #include "xla/codegen/tiling/symbolic_tile_analysis.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.h b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.h
index 315623af25a736..87fd2e45775559 100644
--- a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.h
+++ b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc
index 8f3da454abe41d..eb189496d3c11f 100644
--- a/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/softmax_rewriter_triton_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/status/status_matchers.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/codegen/triton/support.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.cc
index 774bd1185fd184..5095b9d5934222 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.cc
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.cc
@@ -25,8 +25,8 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/backends/gpu/runtime/buffer_comparator.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.h b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.h
index 5f7101cb05ae9e..ed2199bb7748c2 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.h
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
+#include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
index c0492c26cdac1c..132b398fd0f439 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/primitive_util.h"
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 4466fb094ab53d..18becb5c2cb638 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -4137,7 +4137,6 @@ cc_library(
     hdrs = ["hlo_test_base_with_mlir_context.h"],
     deps = [
         ":hlo_test_base",
-        "//xla/hlo/analysis:symbolic_expr",
         "@llvm-project//mlir:IR",
     ],
 )
diff --git a/third_party/xla/xla/tests/hlo_test_base_with_mlir_context.h b/third_party/xla/xla/tests/hlo_test_base_with_mlir_context.h
index 516aec73ef38e4..5f11e1cbb4896f 100644
--- a/third_party/xla/xla/tests/hlo_test_base_with_mlir_context.h
+++ b/third_party/xla/xla/tests/hlo_test_base_with_mlir_context.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define XLA_TESTS_HLO_TEST_BASE_WITH_MLIR_CONTEXT_H_
 
 #include "mlir/IR/MLIRContext.h"
-#include "xla/hlo/analysis/symbolic_expr.h"
 #include "xla/tests/hlo_test_base.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 56acaa4060e343..999ad17ad36944 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -475,7 +475,6 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@local_tsl//tsl/platform:statusor",
     ],
@@ -692,7 +691,6 @@ cc_library(
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:btree",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/hash",
         "@com_google_absl//absl/log",
@@ -714,7 +712,6 @@ xla_test(
     deps = [
         ":matmul_perf_table_gen",
         "//xla:xla_data_proto_cc",
-        "//xla/hlo/testlib:pattern_matcher_gmock",
         "//xla/service/gpu/model:hlo_op_profile_proto_cc",
         "//xla/stream_executor:device_description",
         "//xla/tests:hlo_test_base",
@@ -875,7 +872,6 @@ xla_test(
         ":collective_perf_table_gen",
         "//xla/service/gpu/model:hlo_op_profile_proto_cc",
         "//xla/service/gpu/transforms/collectives:collective_ops_utils",
-        "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/tests:hlo_test_base",
         "@com_google_absl//absl/log",
         "@com_google_googletest//:gtest_main",
@@ -1158,7 +1154,6 @@ xla_cc_binary(
     deps = [
         ":hlo_module_loader",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/analysis:symbolic_expr",
         "//xla/hlo/ir:hlo",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/util:command_line_flags",

From 17f0ad702cec8cd2679995fc021ceaa4299bad17 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 14 Dec 2025 01:03:56 -0800
Subject: [PATCH 259/753] compat: Update forward compatibility horizon to
 2025-12-14

PiperOrigin-RevId: 844306913
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 291882942b6c4e..f195ab19953375 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 13)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 14)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 1cc0481475f4960e04dac7c9bad1e18bacb3c338 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 14 Dec 2025 01:04:00 -0800
Subject: [PATCH 260/753] Update GraphDef version to 2441.

PiperOrigin-RevId: 844306930
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 95c0b503271a81..2ad27b7d7e8251 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2440  // Updated: 2025/12/13
+#define TF_GRAPH_DEF_VERSION 2441  // Updated: 2025/12/14
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 924a6f2bc48e3c13546c6bfe39faa83eb1a611bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 14 Dec 2025 10:44:06 -0800
Subject: [PATCH 261/753] Automated Code Change

PiperOrigin-RevId: 844422017
---
 third_party/xla/xla/service/lockable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/service/lockable.h b/third_party/xla/xla/service/lockable.h
index 91b43b070aa5fc..b5c999fd62b363 100644
--- a/third_party/xla/xla/service/lockable.h
+++ b/third_party/xla/xla/service/lockable.h
@@ -45,12 +45,12 @@ class Lockable {
    public:
     Lock() = default;
 
-    Lock(Lock&& other) {
+    Lock(Lock&& other) noexcept {
       lockable_ = other.lockable_;
       other.lockable_ = nullptr;
     }
 
-    Lock& operator=(Lock&& other) {
+    Lock& operator=(Lock&& other) noexcept {
       lockable_ = other.lockable_;
       other.lockable_ = nullptr;
       return *this;

From 71c58be1653778d8b94a20847fb36e6924cdcaa4 Mon Sep 17 00:00:00 2001
From: Venkat6871 <maayara@google.com>
Date: Mon, 15 Dec 2025 10:52:37 +0530
Subject: [PATCH 262/753] Add note that bicubic resize always runs on CPU due
 to missing GPU kernel

---
 tensorflow/python/ops/image_ops_impl.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 087af6a842fc86..46bba6571a1ff0 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1717,6 +1717,12 @@ def resize_images_v2(images,
   >>> max_10_20 = tf.image.resize(image, [10,20], preserve_aspect_ratio=True)
   >>> max_10_20.shape.as_list()
   [1, 10, 10, 1]
+  
+  Note:
+    The `bicubic` interpolation method currently does not have a GPU kernel
+    implementation. As a result, `tf.image.resize(..., method='bicubic')`
+    always executes on the CPU, even when GPU devices are available.
+
 
   Args:
     images: 4-D Tensor of shape `[batch, height, width, channels]` or 3-D Tensor

From e1c25dae57a15fc804ed55735a55e1e0bcf12768 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 01:04:10 -0800
Subject: [PATCH 263/753] compat: Update forward compatibility horizon to
 2025-12-15

PiperOrigin-RevId: 844645695
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index f195ab19953375..e7e8921efcb85a 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 14)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 15)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 1a44539603da1afbbc2b7363ae47e08e141db50d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 01:04:40 -0800
Subject: [PATCH 264/753] Update GraphDef version to 2442.

PiperOrigin-RevId: 844645840
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 2ad27b7d7e8251..5d23b83edfa05e 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2441  // Updated: 2025/12/14
+#define TF_GRAPH_DEF_VERSION 2442  // Updated: 2025/12/15
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 368aaa4c035551a2bc9eb0a228853a0bfe9bfca1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 03:29:33 -0800
Subject: [PATCH 265/753] Automated Code Change

PiperOrigin-RevId: 844693934
---
 third_party/xla/xla/codegen/emitters/kernel_arguments.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/codegen/emitters/kernel_arguments.cc b/third_party/xla/xla/codegen/emitters/kernel_arguments.cc
index 2b61fad0714bac..5e00a7ca077f73 100644
--- a/third_party/xla/xla/codegen/emitters/kernel_arguments.cc
+++ b/third_party/xla/xla/codegen/emitters/kernel_arguments.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <iterator>
+#include <optional>
 #include <utility>
 #include <vector>
 

From 17e9ec1866031576d3ff2801bbcdabe112ed3ef6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 04:46:42 -0800
Subject: [PATCH 266/753] Automated Code Change

PiperOrigin-RevId: 844715656
---
 third_party/xla/xla/backends/gpu/codegen/triton/BUILD          | 3 +++
 .../xla/xla/backends/gpu/codegen/triton/collective_emitter.cc  | 1 +
 .../xla/backends/gpu/codegen/triton/collective_emitter_test.cc | 1 +
 .../backends/gpu/codegen/triton/compilation_pipeline_cuda.cc   | 2 --
 .../backends/gpu/codegen/triton/compilation_pipeline_test.cc   | 1 -
 .../xla/xla/backends/gpu/codegen/triton/dot_algorithms.cc      | 2 +-
 .../xla/xla/backends/gpu/codegen/triton/dot_algorithms.h       | 1 +
 .../gpu/codegen/triton/fusion_emitter_deviceless_test.cc       | 1 +
 .../backends/gpu/codegen/triton/fusion_emitter_large_test.cc   | 1 -
 third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc | 1 +
 10 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index daeb00d2397e2d..6ed2f74e9325c1 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -520,6 +520,7 @@ xla_cc_test(
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:verified_hlo_module",
+        "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/service/gpu:target_constants",
         "//xla/service/gpu/model:block_level_parameters",
@@ -1005,6 +1006,7 @@ cc_library(
         "//xla:status_macros",
         "//xla:types",
         "//xla:util",
+        "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/codegen/triton/ir:triton_xla",
         "//xla/backends/gpu/runtime:all_reduce",
         "//xla/codegen/tiling:tiled_hlo_instruction",
@@ -1045,6 +1047,7 @@ xla_cc_test(
         ":xtile_compiler",
         "//xla:shape_util",
         "//xla:status_macros",
+        "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/codegen:fusion_emitter",
         "//xla/backends/gpu/codegen:fusions",
         "//xla/hlo/ir:hlo",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc
index 66d7d22fd27cc9..e659734dd922af 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter.cc
@@ -60,6 +60,7 @@ limitations under the License.
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
+#include "xla/xla_data.pb.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/Triton/IR/Types.h"
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter_test.cc
index 5191a7ef22b438..5972869610880e 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/collective_emitter_test.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla::gpu {
 namespace {
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
index 00ba104d3bc82e..9c445f68a3d5fc 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <string>
-
 #include "nvidia/hopper/include/Transforms/Passes.h"
 #include "nvidia/include/NVGPUToLLVM/Passes.h"
 #include "nvidia/include/TritonNVIDIAGPUToLLVM/Passes.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc
index a1be422730922c..58e7690e35aca3 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_test.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/backends/gpu/codegen/triton/compilation_pipeline.h"
 
-#include <algorithm>
 #include <iterator>
 #include <string>
 #include <vector>
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.cc b/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.cc
index 6e819ccbdb2dcf..e73d7f5d793016 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "xla/backends/gpu/codegen/triton/dot_algorithms.h"
 
-#include <limits>
+#include <cstdint>
 #include <optional>
 #include <string>
 #include <vector>
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.h b/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.h
index bf0d228ef87f9a..0eab343cfd957f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.h
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "mlir/IR/Value.h"
 #include "stablehlo/dialect/StablehloOps.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/xla_data.pb.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc
index fa77b022d841a4..19fb63a74c651b 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_deviceless_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "xla/hlo/testlib/filecheck.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/verified_hlo_module.h"
+#include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/model/block_level_parameters.h"
 #include "xla/service/gpu/target_constants.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_large_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_large_test.cc
index 4af7ab9cbf76e1..6e0c33e70d8d8f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_large_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_large_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include <string>
-#include <variant>
 
 #include <gtest/gtest.h>
 #include "absl/log/check.h"
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc
index e0f0c5272eb37d..452c2bf63e2ada 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <memory>
 #include <optional>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>

From a49deb86e5155945aa5af63194366c281c65d88b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 05:06:07 -0800
Subject: [PATCH 267/753] Automated Code Change

PiperOrigin-RevId: 844721506
---
 third_party/xla/xla/service/gpu/kernels/custom_kernel.cc       | 1 +
 third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/kernels/custom_kernel.cc b/third_party/xla/xla/service/gpu/kernels/custom_kernel.cc
index d5b413bd166f9d..6f51d83ddf2ab7 100644
--- a/third_party/xla/xla/service/gpu/kernels/custom_kernel.cc
+++ b/third_party/xla/xla/service/gpu/kernels/custom_kernel.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "xla/service/gpu/kernels/custom_kernel.pb.h"
 #include "xla/stream_executor/kernel_spec.h"
 #include "xla/stream_executor/launch_dim.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc
index 2f8950b839117d..63c8848a038663 100644
--- a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc
+++ b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_fusion.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/algorithm/container.h"

From 3f22e93a0cfd95fef374af28853633aaf057e922 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Mon, 15 Dec 2025 06:21:44 -0800
Subject: [PATCH 268/753] Remove unused header.

indexing_map doesn't use HloInstruction. The removal of that header required updating other places.

PiperOrigin-RevId: 844743070
---
 .../gpu/codegen/emitters/transforms/convert_float_amd.cc       | 3 ++-
 .../xla/xla/backends/gpu/codegen/emitters/transforms/passes.h  | 2 +-
 .../xla/codegen/emitters/transforms/vectorize_loads_stores.cc  | 3 ++-
 third_party/xla/xla/codegen/tiling/BUILD                       | 1 +
 .../xla/xla/codegen/tiling/size_and_stride_expression.cc       | 1 +
 third_party/xla/xla/hlo/analysis/indexing_map.h                | 1 -
 6 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
index 1b06367c81ff93..da448aa7ae0b76 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
@@ -43,11 +43,11 @@ limitations under the License.
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "google/protobuf/text_format.h"
 #include "xla/backends/gpu/codegen/emitters/transforms/passes.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/rocm/rocm_compute_capability.h"
+#include "tsl/platform/protobuf.h"
 
 namespace xla {
 namespace gpu {
@@ -59,6 +59,7 @@ namespace {
 
 namespace LLVM = ::mlir::LLVM;
 namespace arith = ::mlir::arith;
+namespace se = stream_executor;
 namespace vector = ::mlir::vector;
 
 template <typename SourceOp>
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/passes.h b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/passes.h
index 5e3b87105c049c..bdd72354685ccd 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/passes.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/passes.h
@@ -39,7 +39,7 @@ std::unique_ptr<mlir::Pass> CreateConvertFloatNvidiaPass(
 std::unique_ptr<mlir::Pass> CreateConvertFloatAMDPass(
     const std::string& gpu_device_info = "");
 std::unique_ptr<mlir::Pass> CreateConvertFloatAMDPass(
-    const se::RocmComputeCapability& cc);
+    const stream_executor::RocmComputeCapability& cc);
 std::unique_ptr<mlir::Pass> CreateConvertIndexTypePass();
 std::unique_ptr<mlir::Pass> CreateOptimizeLoopsPass();
 std::unique_ptr<mlir::Pass> CreatePeelLoopsPass();
diff --git a/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc b/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
index c0a2398d7eec57..cb8ad0580d740d 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
@@ -42,12 +42,12 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "google/protobuf/text_format.h"
 #include "xla/codegen/device_spec.h"
 #include "xla/codegen/emitters/ir/xla_ops.h"
 #include "xla/codegen/emitters/transforms/atomic_rmw_utils.h"
 #include "xla/codegen/emitters/transforms/passes.h"
 #include "xla/stream_executor/device_description.h"
+#include "tsl/platform/protobuf.h"
 
 namespace xla {
 namespace emitters {
@@ -61,6 +61,7 @@ using mlir::Value;
 namespace arith = ::mlir::arith;
 namespace ml = ::mlir::LLVM;
 namespace scf = mlir::scf;
+namespace se = stream_executor;
 
 // Tries to find the stride of a symbol or dimension in an affine expression.
 // Returns std::nullopt if the stride could not be determined.
diff --git a/third_party/xla/xla/codegen/tiling/BUILD b/third_party/xla/xla/codegen/tiling/BUILD
index 709dab0990718a..e974b0c17d2eb6 100644
--- a/third_party/xla/xla/codegen/tiling/BUILD
+++ b/third_party/xla/xla/codegen/tiling/BUILD
@@ -226,6 +226,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/log:vlog_is_on",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
diff --git a/third_party/xla/xla/codegen/tiling/size_and_stride_expression.cc b/third_party/xla/xla/codegen/tiling/size_and_stride_expression.cc
index 4cb4100e9d55c9..cfc461c8b31a24 100644
--- a/third_party/xla/xla/codegen/tiling/size_and_stride_expression.cc
+++ b/third_party/xla/xla/codegen/tiling/size_and_stride_expression.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/algorithm/container.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/log/vlog_is_on.h"
 #include "absl/types/span.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
diff --git a/third_party/xla/xla/hlo/analysis/indexing_map.h b/third_party/xla/xla/hlo/analysis/indexing_map.h
index 58b5f5bbec139d..e91d063b2f3408 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_map.h
+++ b/third_party/xla/xla/hlo/analysis/indexing_map.h
@@ -36,7 +36,6 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Support/LLVM.h"
 #include "xla/hlo/analysis/interval.h"
-#include "xla/hlo/ir/hlo_instruction.h"
 
 namespace xla {
 

From 3e2a99cdb107ba74cf48b57ff49d8c5b6fc475b4 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Mon, 15 Dec 2025 06:40:18 -0800
Subject: [PATCH 269/753] Add an overload for `IsCompatibleCacheFile` that
 takes a file descriptor.

PiperOrigin-RevId: 844749188
---
 tensorflow/lite/delegates/xnnpack/weight_cache.cc | 10 ++++++++++
 tensorflow/lite/delegates/xnnpack/weight_cache.h  | 11 +++++++++++
 2 files changed, 21 insertions(+)

diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index e9ccdbfd8eedd9..94b0b76ab4b4cd 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -682,6 +682,16 @@ bool IsCompatibleCacheFile(const char* path) {
   FileDescriptor fd = FileDescriptor::Open(path, O_RDONLY);
   XNNPACK_RETURN_CHECK(fd.IsValid(), "Could not open file: %s: %s.", path,
                        strerror(errno));
+  return IsCompatibleCacheFile(std::move(fd));
+}
+
+bool IsCompatibleCacheFile(const FileDescriptor& fd) {
+  XNNPACK_RETURN_CHECK(fd.IsValid(), "Invalid file descriptor: %d.",
+                       fd.Value());
+  const size_t current_pos = fd.GetPos();
+  ScopeGuard reset_pos_on_return(
+      [current_pos, &fd] { fd.SetPos(current_pos); });
+
   XNNPackCacheHeader header;
   XNNPACK_RETURN_CHECK(fd.Read(&header, sizeof(header)),
                        "Couldn't read file header.");
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.h b/tensorflow/lite/delegates/xnnpack/weight_cache.h
index 7dd04a20f2095f..270b48bb4092af 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.h
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.h
@@ -63,8 +63,19 @@ struct XNNPackCacheHeader {
   uint64_t buffer_list_size;
 };
 
+// Checks if the file at the given path is compatible with the current XNNPack
+// weight cache.
 bool IsCompatibleCacheFile(const char* path);
 
+// Checks if the opened file is compatible with the current XNNPack weight
+// cache.
+//
+// Position in the file may be changed during the function execution but is
+// restored upon exiting.
+//
+// Note: the file descriptor must be open and valid.
+bool IsCompatibleCacheFile(const FileDescriptor& fd);
+
 struct PackIdentifier {
   enum { kNoId = SIZE_MAX };
   uint64_t pack_algorithm_id = kNoId;

From 2e688abc239119ee3e18b41bae4b196aa4626e5a Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Mon, 15 Dec 2025 06:51:01 -0800
Subject: [PATCH 270/753] On Windows, don't abort when a test file cannot be
 opened.

PiperOrigin-RevId: 844752050
---
 tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h b/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h
index 365f94dc6ce885..ab29545730664d 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h
@@ -86,14 +86,14 @@ class TempFileDesc : public FileDescriptor {
     errno_t err = tmpnam_s(filename, L_tmpnam_s);
     if (err) {
       fprintf(stderr, "Could not create temporary filename.\n");
-      std::abort();
+      return;
     }
     path_ = filename;
     FileDescriptor fd =
         FileDescriptor::Open(path_.c_str(), _O_CREAT | _O_EXCL | _O_RDWR, 0644);
     if (!fd.IsValid()) {
       fprintf(stderr, "Could not create temporary file.\n");
-      std::abort();
+      return;
     }
     Reset(fd.Release());
   }

From 82c4d81c96320639e466cc1573a1489df8d7e7a3 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 15 Dec 2025 16:23:21 +0000
Subject: [PATCH 271/753] Bump werkzeug in
 /ci/official/requirements_updater/numpy1_requirements

Bumps [werkzeug](https://github.com/pallets/werkzeug) from 3.1.3 to 3.1.4.
- [Release notes](https://github.com/pallets/werkzeug/releases)
- [Changelog](https://github.com/pallets/werkzeug/blob/main/CHANGES.rst)
- [Commits](https://github.com/pallets/werkzeug/compare/3.1.3...3.1.4)

---
updated-dependencies:
- dependency-name: werkzeug
  dependency-version: 3.1.4
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 .../numpy1_requirements/requirements_lock_3_10.txt          | 6 +++---
 .../numpy1_requirements/requirements_lock_3_11.txt          | 6 +++---
 .../numpy1_requirements/requirements_lock_3_12.txt          | 6 +++---
 .../numpy1_requirements/requirements_lock_3_9.txt           | 6 +++---
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt
index 898ea6c0418532..fb8ab2398ca308 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_10.txt
@@ -733,9 +733,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt
index eae965757fad3b..f28cc6e55aad19 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_11.txt
@@ -732,9 +732,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt
index ca6904da19ebbe..39d9e567a4f3ec 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_12.txt
@@ -732,9 +732,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \
diff --git a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt
index e34567660cc5f7..dfe108225bb4bf 100644
--- a/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt
+++ b/ci/official/requirements_updater/numpy1_requirements/requirements_lock_3_9.txt
@@ -729,9 +729,9 @@ urllib3==2.6.0 \
     --hash=sha256:c90f7a39f716c572c4e3e58509581ebd83f9b59cced005b7db7ad2d22b0db99f \
     --hash=sha256:cb9bcef5a4b345d5da5d145dc3e30834f58e8018828cbc724d30b4cb7d4d49f1
     # via requests
-werkzeug==3.1.3 \
-    --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \
-    --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746
+werkzeug==3.1.4 \
+    --hash=sha256:2ad50fb9ed09cc3af22c54698351027ace879a0b60a3b5edf5730b2f7d876905 \
+    --hash=sha256:cd3cd98b1b92dc3b7b3995038826c68097dcb16f9baa63abe35f20eafeb9fe5e
     # via tb-nightly
 wheel==0.41.3 \
     --hash=sha256:488609bc63a29322326e05560731bf7bfea8e48ad646e1f5e40d366607de0942 \

From da48cdca4bcae3512a1b5fa12e073b804137cd0a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 08:22:30 -0800
Subject: [PATCH 272/753] Automated Code Change

PiperOrigin-RevId: 844780663
---
 .../xla/xla/backends/gpu/codegen/triton/ir/triton_xla_attrs.cc  | 2 --
 1 file changed, 2 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/ir/triton_xla_attrs.cc b/third_party/xla/xla/backends/gpu/codegen/triton/ir/triton_xla_attrs.cc
index b64d94cd87becb..8211e65adf870a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/ir/triton_xla_attrs.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/ir/triton_xla_attrs.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cstdint>
-
 #include "llvm/ADT/STLExtras.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/Attributes.h"

From 2f794f10fd0581689464a3dea6b2a949c88e4a8f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 09:12:18 -0800
Subject: [PATCH 273/753] Mark nvshmem test as NVIDIA GPU specific.

PiperOrigin-RevId: 844798507
---
 third_party/xla/xla/backends/gpu/collectives/BUILD | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/xla/backends/gpu/collectives/BUILD b/third_party/xla/xla/backends/gpu/collectives/BUILD
index 316f0e882b1b30..f0f7ee78a657e3 100644
--- a/third_party/xla/xla/backends/gpu/collectives/BUILD
+++ b/third_party/xla/xla/backends/gpu/collectives/BUILD
@@ -532,6 +532,11 @@ xla_test(
             "no_oss",
             "noasan",
             "nomsan",
+            "requires-gpu-nvidia",
+        ],
+        "b200": [
+            "multi_gpu",
+            "broken",
         ],
         "nvgpu_any": [
             "broken",

From b8da0f5494deac8f9cec3966b62f6b5a03cd0d0b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 09:15:22 -0800
Subject: [PATCH 274/753] [XLA:GPU] Collect requested multimem objects in
 thunks in prepare state and construct them once per HLO run.

Multimem object should be allocated and sets up multimem object once per clique and device address.

PiperOrigin-RevId: 844799607
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  29 ++-
 .../xla/backends/gpu/runtime/all_reduce.cc    |   3 +
 .../backends/gpu/runtime/all_reduce_thunk.cc  |  14 +-
 .../gpu/runtime/collective_kernel_thunk.cc    | 201 +++++++++++-------
 .../gpu/runtime/collective_kernel_thunk.h     |  38 ++--
 .../runtime/collective_kernel_thunk_test.cc   |  18 +-
 .../gpu/runtime/collective_metadata_thunk.cc  |  48 ++++-
 .../gpu/runtime/collective_metadata_thunk.h   |  14 +-
 .../gpu/runtime/collective_multimem.cc        |  23 +-
 .../gpu/runtime/collective_multimem.h         |  30 +--
 .../runtime/collective_multimem_registry.cc   |  55 +++++
 .../runtime/collective_multimem_registry.h    |  83 ++++++++
 .../xla/xla/backends/gpu/runtime/thunk.h      |  10 +
 third_party/xla/xla/service/gpu/BUILD         |   2 +
 .../xla/xla/service/gpu/gpu_executable.cc     |  10 +-
 .../xla/stream_executor/cuda/cuda_executor.cc |  21 +-
 .../xla/stream_executor/cuda/cuda_executor.h  |   2 +
 .../xla/xla/stream_executor/stream_executor.h |   3 +
 .../xla/xla/tests/collective_metadata_test.cc |  85 ++++++--
 19 files changed, 513 insertions(+), 176 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.cc
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.h

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 2b410193a09098..53daeea846b443 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1269,6 +1269,7 @@ cc_library(
         ":collective_cliques",
         ":collective_metadata_thunk",
         ":collective_multimem",
+        ":collective_params",
         ":collective_thunk",
         ":thunk",
         "//xla:shape_util",
@@ -1278,7 +1279,7 @@ cc_library(
         "//xla/backends/gpu/collectives:gpu_clique_key",
         "//xla/core/collectives:rank_id",
         "//xla/core/collectives:reduction_kind",
-        "//xla/service:collective_ops_utils",
+        "//xla/runtime:device_id",
         "//xla/service/gpu:gpu_constants",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:stream_executor_util",
@@ -1316,12 +1317,15 @@ xla_test(
     backends = ["h100"],
     deps = [
         ":collective_kernel_thunk",
+        ":collective_multimem_registry",
         ":collective_params",
         ":collective_thunk",
         ":thunk",
         "//xla:array",
         "//xla:util",
         "//xla:xla_data_proto_cc",
+        "//xla/backends/gpu/runtime:collective_clique_requests",
+        "//xla/core/collectives:reduction_kind",
         "//xla/pjrt:worker_thread",
         "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
@@ -1675,6 +1679,26 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "collective_multimem_registry",
+    srcs = ["collective_multimem_registry.cc"],
+    hdrs = ["collective_multimem_registry.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        ":collective_multimem",
+        "//xla/backends/gpu/collectives:gpu_clique_key",
+        "//xla/runtime:device_id",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base:nullability",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:str_format",
+    ],
+)
+
 cc_library(
     name = "collective_params",
     srcs = ["collective_params.cc"],
@@ -2034,6 +2058,7 @@ cc_library(
     hdrs = ["collective_metadata_thunk.h"],
     deps = [
         ":collective_multimem",
+        ":collective_multimem_registry",
         ":collective_thunk",
         ":thunk",
         "//xla:shape_util",
@@ -2052,6 +2077,7 @@ cc_library(
         "//xla/stream_executor/gpu:collective_kernel_metadata",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
@@ -2159,6 +2185,7 @@ cc_library(
     deps = [
         ":collective_clique_requests",
         ":collective_cliques",
+        ":collective_multimem_registry",
         ":collective_params",
         ":thunk_id",
         ":thunk_proto_cc",
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce.cc b/third_party/xla/xla/backends/gpu/runtime/all_reduce.cc
index fad17784419356..624e0124c086fb 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_reduce.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce.cc
@@ -202,6 +202,7 @@ bool IsAllReduceKernelSupported(int64_t num_ranks, int64_t num_elements,
                                 ReductionKind reduction_kind,
                                 AllReduceStrategy all_reduce_strategy) {
   if (!IsElementReductionSupported(element_type, reduction_kind)) {
+    VLOG(3) << "Element type and reduction kind combination is not supported.";
     return false;
   }
   const int64_t alignment_requirement =
@@ -211,6 +212,8 @@ bool IsAllReduceKernelSupported(int64_t num_ranks, int64_t num_elements,
           : se::gpu::kNumElementsPerThread * num_ranks;
 
   if (num_elements % alignment_requirement != 0) {
+    VLOG(3)
+        << "Number of elements is not aligned to the alignment requirement.";
     return false;
   }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
index 5bad2a67644375..c8efc4c3c933cd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
@@ -153,9 +153,10 @@ absl::Status AllReduceStartThunk::Initialize(const InitializeParams& params) {
   TF_ASSIGN_OR_RETURN(
       GpuCliqueKey clique_key,
       GetCollectiveGpuCliqueKey(*params.collective_params, config()));
-  TF_ASSIGN_OR_RETURN(bool use_collective_kernel,
-                      collective_kernel_thunk_->IsSupported(
-                          clique_key, params.collective_cliques));
+  TF_ASSIGN_OR_RETURN(
+      bool use_collective_kernel,
+      collective_kernel_thunk_->IsSupported(clique_key, *params.executor,
+                                            *params.collective_params));
   if (use_collective_kernel) {
     TF_RETURN_IF_ERROR(collective_kernel_thunk_->Initialize(params));
   }
@@ -170,9 +171,10 @@ absl::StatusOr<bool> AllReduceStartThunk::RunCollective(
       ConvertToDeviceBuffers(params, buffers_,
                              config_.config.operand_element_type));
 
-  TF_ASSIGN_OR_RETURN(bool use_collective_kernel,
-                      collective_kernel_thunk_->IsSupported(
-                          clique_key, params.collective_cliques));
+  TF_ASSIGN_OR_RETURN(
+      bool use_collective_kernel,
+      collective_kernel_thunk_->IsSupported(
+          clique_key, *params.stream->parent(), *params.collective_params));
 
   if (use_collective_kernel) {
     TF_RETURN_IF_ERROR(collective_kernel_thunk_->ExecuteOnStream(params));
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.cc
index 345d6148d18713..703babbbb6f648 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.cc
@@ -33,12 +33,12 @@ limitations under the License.*/
 #include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/runtime/all_reduce.h"
-#include "xla/backends/gpu/runtime/collective_cliques.h"
 #include "xla/backends/gpu/runtime/collective_metadata_thunk.h"
-#include "xla/backends/gpu/runtime/collective_multimem.h"
+#include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/core/collectives/rank_id.h"
+#include "xla/runtime/device_id.h"
 #include "xla/service/gpu/gpu_constants.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/stream_executor_util.h"
@@ -86,49 +86,125 @@ absl::StatusOr<se::DeviceAddressHandle> AllocateMemory(
   return local_buffer_alloc;
 };
 
+absl::StatusOr<int> GetLocalDeviceId(
+    const GlobalDeviceId& global_device_id,
+    const CollectiveParams& collective_params) {
+  // If the global device id map is not provided, then we can assume that
+  // execution is local.
+  if (!collective_params.global_device_id_map) {
+    return global_device_id.value();
+  }
+
+  for (const auto& local_device : *collective_params.global_device_id_map) {
+    if (local_device.second == global_device_id) {
+      return local_device.first.value();
+    }
+  }
+  return absl::NotFoundError(
+      absl::StrFormat("Global device id %d not found in global device id map.",
+                      global_device_id.value()));
+}
+
 }  // namespace
 
 absl::StatusOr<bool> CollectiveKernelThunk::IsSupported(
-    const GpuCliqueKey& clique_key,
-    const CollectiveCliques* collective_cliques) const {
+    const GpuCliqueKey& clique_key, se::StreamExecutor& executor,
+    const CollectiveParams& collective_params) const {
   if (!collective_kernel_enabled_) {
+    XLA_VLOG_DEVICE(3, executor.device_ordinal())
+        << "Collective kernel is not enabled.";
     return false;
   }
 
   // TODO(b/407736956): Support variadic all-reduce.
   if (buffers_.size() != 1) {
+    XLA_VLOG_DEVICE(3, executor.device_ordinal())
+        << "Variadic arguments are not implemented for collective kernels.";
     return false;
   }
-
   const int64_t num_elements = buffers_[0].element_count;
   const int64_t input_size_bytes = GetInputSizeBytes();
   const AllReduceStrategy strategy =
       GetAllReduceStrategy(input_size_bytes, is_multimem_enabled_);
   // Custom all-reduce strategy is only supported for small inputs.
   if (input_size_bytes > GetMaxSupportedAllReduceSizeBytes(strategy)) {
+    XLA_VLOG_DEVICE(3, executor.device_ordinal())
+        << "Custom all-reduce strategy is only supported for small inputs.";
     return false;
   }
 
-  TF_ASSIGN_OR_RETURN(bool peer_access_enabled,
-                      collective_cliques->peer_access_enabled(clique_key));
-
-  // Check that peer access is enabled.
-  if (!peer_access_enabled) {
+  // Only single-host collectives are supported for now.
+  if (!clique_key.is_local()) {
+    XLA_VLOG_DEVICE(3, executor.device_ordinal())
+        << "Cross-host symmetric memory collectives are not supported.";
     return false;
   }
 
+  for (const GlobalDeviceId& device : clique_key.devices()) {
+    TF_ASSIGN_OR_RETURN(const int peer_device_id,
+                        GetLocalDeviceId(device, collective_params));
+    if (!executor.CanEnablePeerAccessTo(peer_device_id)) {
+      XLA_VLOG_DEVICE(3, executor.device_ordinal())
+          << "Peer access is not supported with device " << peer_device_id;
+      return false;
+    }
+  }
+
   return IsAllReduceKernelSupported(
       clique_key.num_local_participants(), num_elements,
       collective_config_.operand_element_type[0], reduction_kind_, strategy);
 }
 
 absl::Status CollectiveKernelThunk::Prepare(const PrepareParams& params) {
-  TF_RET_CHECK(params.collective_params != nullptr);
   TF_ASSIGN_OR_RETURN(
       GpuCliqueKey clique_key,
       GetCollectiveGpuCliqueKey(*params.collective_params, collective_config_,
                                 /*include_participant_groups=*/false));
-  return params.clique_requests->RequestClique(clique_key);
+  TF_ASSIGN_OR_RETURN(
+      bool use_collective_kernel,
+      IsSupported(clique_key, *params.executor, *params.collective_params));
+  if (!use_collective_kernel) {
+    return absl::OkStatus();
+  }
+  TF_RETURN_IF_ERROR(params.clique_requests->RequestClique(clique_key));
+
+  absl::MutexLock lock(mutex_);
+  if (!per_stream_memory_.contains(params.executor)) {
+    // Allocate scratch buffers.
+    const AllReduceStrategy strategy =
+        GetAllReduceStrategy(GetInputSizeBytes(), is_multimem_enabled_);
+    const LaunchDimensions launch_dimensions = AllReduceLaunchDimensions(
+        buffers_[0].element_count, clique_key.num_local_participants(),
+        strategy);
+    const int64_t kNumSignalFlags =
+        clique_key.num_local_participants() * launch_dimensions.num_blocks();
+    const int64_t kSignalBufferSize = xla::RoundUpTo<uint64_t>(
+        kNumSignalFlags * sizeof(int32_t), kXlaAllocatedBufferAlignBytes);
+    const int64_t kLocalBufferSize = xla::RoundUpTo<uint64_t>(
+        buffers_[0].source_buffer.size(), kXlaAllocatedBufferAlignBytes);
+    TF_ASSIGN_OR_RETURN(
+        se::DeviceAddressHandle local_buffers_handle,
+        AllocateMemory(params.executor, kLocalBufferSize * kNumBuffers,
+                       "Local buffers"));
+
+    TF_ASSIGN_OR_RETURN(
+        se::DeviceAddressHandle signal_buffers_handle,
+        AllocateMemory(params.executor, kSignalBufferSize * kNumBuffers,
+                       "Signal buffers"));
+
+    se::DeviceAddressBase local_buffers_ptr = local_buffers_handle.address();
+    per_stream_memory_.emplace(
+        params.executor,
+        std::make_unique<StreamMemory>(StreamMemory{
+            std::move(local_buffers_handle), std::move(signal_buffers_handle),
+            strategy, kLocalBufferSize, kSignalBufferSize}));
+    if (is_multimem_enabled_ && strategy == AllReduceStrategy::kMultimem) {
+      params.multimem_registry->Register(
+          {clique_key, /*map_to=*/local_buffers_ptr});
+    }
+  }
+
+  return absl::OkStatus();
 }
 
 int64_t CollectiveKernelThunk::GetInputSizeBytes() const {
@@ -137,30 +213,6 @@ int64_t CollectiveKernelThunk::GetInputSizeBytes() const {
              collective_config_.operand_element_type[0]);
 }
 
-absl::Status CollectiveKernelThunk::ExchangeStateMetadata(
-    const GpuCliqueKey& clique_key, const InitializeParams& params,
-    StreamState& state) {
-  const std::optional<RankId> rank =
-      clique_key.rank(params.collective_params->global_device_id);
-  TF_RET_CHECK(rank.has_value())
-      << "Device " << params.collective_params->global_device_id
-      << "is not in the clique.";
-
-  std::vector<se::DeviceAddressBase> parameters{
-      state.local_buffers_handle.memory(),
-      state.signal_buffers_handle.memory()};
-  TF_RET_CHECK(parameters.size() == kNumParameters);
-
-  const size_t param_to_peers_ptrs_size_bytes =
-      parameters.size() * clique_key.num_devices() * sizeof(uint64_t);
-  state.metadata = params.executor->Allocate(
-      sizeof(CollectiveKernelMetadata) + param_to_peers_ptrs_size_bytes, 0);
-
-  return CollectiveMetadataThunk::ConstructCollectiveMetadata(
-      clique_key, state.rank, params.stream, std::move(parameters),
-      state.collective_multimem, state.metadata);
-}
-
 absl::Status CollectiveKernelThunk::Initialize(const InitializeParams& params) {
   TF_ASSIGN_OR_RETURN(
       const GpuCliqueKey clique_key,
@@ -171,43 +223,21 @@ absl::Status CollectiveKernelThunk::Initialize(const InitializeParams& params) {
   TF_RET_CHECK(rank.has_value())
       << "Device " << params.collective_params->global_device_id
       << "is not in the clique.";
-  const AllReduceStrategy strategy =
-      GetAllReduceStrategy(GetInputSizeBytes(), is_multimem_enabled_);
-  const LaunchDimensions launch_dimensions = AllReduceLaunchDimensions(
-      buffers_[0].element_count, clique_key.num_local_participants(), strategy);
 
   StreamState* state = nullptr;
   {
     absl::MutexLock lock(mutex_);
     if (!per_stream_state_.contains(params.executor)) {
-      // Step1: Allocate signal and local buffers.
-      const int64_t kNumSignalFlags =
-          clique_key.num_local_participants() * launch_dimensions.num_blocks();
-
-      int64_t kSignalBufferSize = xla::RoundUpTo<uint64_t>(
-          kNumSignalFlags * sizeof(int32_t), kXlaAllocatedBufferAlignBytes);
-      const int64_t kLocalBufferSize = xla::RoundUpTo<uint64_t>(
-          buffers_[0].source_buffer.size(), kXlaAllocatedBufferAlignBytes);
-
-      TF_ASSIGN_OR_RETURN(
-          se::DeviceAddressHandle local_buffers_handle,
-          AllocateMemory(params.executor, kLocalBufferSize * kNumBuffers,
-                         "Local buffers"));
-
-      TF_ASSIGN_OR_RETURN(
-          se::DeviceAddressHandle signal_buffers_handle,
-          AllocateMemory(params.executor, kSignalBufferSize * kNumBuffers,
-                         "Signal buffers"));
-
-      // Step2: We needs 1 atomic flag per block per device on each device.
+      StreamMemory* memory_state = per_stream_memory_.at(params.executor).get();
+      // Step1: We needs 1 atomic flag per block per device on each device.
       // One-shot kernel expects that the signal flags buffer is zeroed out.
       // Initial state of device memory is undefined, so we need to zero out
       // the buffer. The kernel will take care of leaving the buffer in
       // correct state after use, so we don't need to zero out after
       // initialization.
       TF_RETURN_IF_ERROR(params.executor->SynchronousMemZero(
-          signal_buffers_handle.memory_ptr(),
-          signal_buffers_handle.memory().size()));
+          memory_state->signal_buffers_handle.memory_ptr(),
+          memory_state->signal_buffers_handle.memory().size()));
       // Create a kernel for execution.
       std::unique_ptr<se::Kernel> kernel = nullptr;
       if (!kernel_name_.empty()) {
@@ -224,13 +254,11 @@ absl::Status CollectiveKernelThunk::Initialize(const InitializeParams& params) {
                            params.executor, shmem_bytes_));
         }
       }
-      // Step3: Emplace into the stream state.
+      // Step2: Emplace into the stream state.
       per_stream_state_.emplace(
           params.executor,
-          std::make_unique<StreamState>(
-              params.executor->device_ordinal(), rank.value(),
-              std::move(local_buffers_handle), std::move(signal_buffers_handle),
-              std::move(kernel)));
+          std::make_unique<StreamState>(params.executor->device_ordinal(),
+                                        rank.value(), std::move(kernel)));
 
       state = per_stream_state_.at(params.executor).get();
 
@@ -238,28 +266,47 @@ absl::Status CollectiveKernelThunk::Initialize(const InitializeParams& params) {
       // half of the total allocation.
       for (int i = 0; i < kNumBuffers; ++i) {
         state->remote_buffer_ptrs[i] =
-            state->local_buffers_handle.memory_ptr()->GetByteSlice(
-                /*offset_bytes=*/i * kLocalBufferSize,
-                /*size_bytes=*/kLocalBufferSize);
+            memory_state->local_buffers_handle.memory_ptr()->GetByteSlice(
+                /*offset_bytes=*/i * memory_state->local_buffer_size_bytes,
+                /*size_bytes=*/memory_state->local_buffer_size_bytes);
 
         state->signal_buffer_ptrs[i] =
-            state->signal_buffers_handle.memory_ptr()->GetByteSlice(
-                /*offset_bytes=*/i * kSignalBufferSize,
-                /*size_bytes=*/kSignalBufferSize);
+            memory_state->signal_buffers_handle.memory_ptr()->GetByteSlice(
+                /*offset_bytes=*/i * memory_state->signal_buffer_size_bytes,
+                /*size_bytes=*/memory_state->signal_buffer_size_bytes);
       }
     }
   }
 
+  StreamMemory* memory_state = nullptr;
+  {
+    absl::MutexLock lock(mutex_);
+    memory_state = per_stream_memory_.at(params.executor).get();
+  }
+
   if (state != nullptr) {
-    if (strategy == AllReduceStrategy::kMultimem) {
+    if (memory_state->strategy == AllReduceStrategy::kMultimem) {
       TF_ASSIGN_OR_RETURN(
           state->collective_multimem,
-          CollectiveMultimem::Allocate(params.executor, clique_key, *rank,
-                                       state->local_buffers_handle.memory()));
+          params.multicast_memory_registry->Get(
+              {clique_key, memory_state->local_buffers_handle.memory()}));
       state->multicast_device_ptr =
           state->collective_multimem->mapped_ptr(*rank);
     }
-    TF_RETURN_IF_ERROR(ExchangeStateMetadata(clique_key, params, *state));
+
+    std::vector<se::DeviceAddressBase> parameters{
+        memory_state->local_buffers_handle.memory(),
+        memory_state->signal_buffers_handle.memory()};
+    TF_RET_CHECK(parameters.size() == kNumParameters);
+
+    const size_t param_to_peers_ptrs_size_bytes =
+        parameters.size() * clique_key.num_devices() * sizeof(uint64_t);
+    state->metadata = params.executor->Allocate(
+        sizeof(CollectiveKernelMetadata) + param_to_peers_ptrs_size_bytes, 0);
+
+    return CollectiveMetadataThunk::ConstructCollectiveMetadata(
+        clique_key, state->rank, params.stream, std::move(parameters),
+        state->collective_multimem, state->metadata);
   }
 
   return absl::OkStatus();
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h
index 8350a7de83b7c1..8476cf19e90f64 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h
@@ -28,16 +28,14 @@ limitations under the License.*/
 #include "absl/status/status.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
-#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/runtime/collective_cliques.h"
-#include "xla/backends/gpu/runtime/collective_metadata_thunk.h"
 #include "xla/backends/gpu/runtime/collective_multimem.h"
+#include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/core/collectives/reduction_kind.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_handle.h"
 #include "xla/stream_executor/gpu/all_reduce_kernel.h"
@@ -82,8 +80,8 @@ class CollectiveKernelThunk : public Thunk {
 
   // Returns true if the collective kernel is supported for the given clique.
   absl::StatusOr<bool> IsSupported(
-      const GpuCliqueKey& clique_key,
-      const CollectiveCliques* collective_cliques) const;
+      const GpuCliqueKey& clique_key, se::StreamExecutor& executor,
+      const CollectiveParams& collective_params) const;
 
   // The single host collective thunk actually requires a clique key.
   absl::Status Prepare(const PrepareParams& params) final;
@@ -100,10 +98,9 @@ class CollectiveKernelThunk : public Thunk {
   // We use a double buffering strategy for the buffers.
   // See docs on struct StreamState for more details.
   static constexpr int64_t kNumBuffers = 2;
-  // Per-executor state that needs to be synchronized for access.
-  struct StreamState {
-    int device_ordinal;
-    RankId rank;
+
+  // Per-executor scratch memory.
+  struct StreamMemory {
     // Buffers allocated for the collective.
     // Buffers are double buffered to allow for consecutive invocation
     // of the kernel on different GPUs.
@@ -118,6 +115,17 @@ class CollectiveKernelThunk : public Thunk {
     // Also double buffered for the same reason as local buffers.
     se::DeviceAddressHandle signal_buffers_handle;
 
+    se::gpu::AllReduceStrategy strategy;
+
+    const int64_t local_buffer_size_bytes = 0;
+    const int64_t signal_buffer_size_bytes = 0;
+  };
+
+  // Per-executor state that needs to be synchronized for access.
+  struct StreamState {
+    int device_ordinal = 0;
+    RankId rank = RankId(0);
+
     // Pointer to the collective kernel metadata on device.
     se::DeviceAddressBase metadata;
 
@@ -136,25 +144,15 @@ class CollectiveKernelThunk : public Thunk {
     // Constructor to make OSS builds happy.
     StreamState() = default;
     StreamState(int device_ordinal_arg, RankId rank_arg,
-                se::DeviceAddressHandle local_buffers_handle_arg,
-                se::DeviceAddressHandle signal_buffers_handle_arg,
                 std::unique_ptr<se::Kernel> kernel_arg)
         : device_ordinal(device_ordinal_arg),
           rank(rank_arg),
-          local_buffers_handle(std::move(local_buffers_handle_arg)),
-          signal_buffers_handle(std::move(signal_buffers_handle_arg)),
           kernel(std::move(kernel_arg)) {}
   };
 
   // Returns the input size in bytes for the collective.
   int64_t GetInputSizeBytes() const;
 
-  // Internal method to sync thread after Initialize.
-  // Returns the collective kernel metadata for the given clique key.
-  absl::Status ExchangeStateMetadata(const GpuCliqueKey& clique_key,
-                                     const InitializeParams& params,
-                                     StreamState& state);
-
   // Whether the one-shot kernel is enabled.
   const bool collective_kernel_enabled_;
   // Whether the collective is run on an async stream.
@@ -177,6 +175,8 @@ class CollectiveKernelThunk : public Thunk {
   absl::Mutex mutex_;
   absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<StreamState>>
       per_stream_state_ ABSL_GUARDED_BY(mutex_);
+  absl::flat_hash_map<se::StreamExecutor*, std::unique_ptr<StreamMemory>>
+      per_stream_memory_ ABSL_GUARDED_BY(mutex_);
   const bool is_multimem_enabled_;
 };
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
index e65d7760e8981f..43631845f37ad3 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
@@ -26,12 +26,14 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "xla/backends/gpu/runtime/collective_clique_requests.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
 #include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/runtime/device_id.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/gpu_constants.h"
@@ -274,12 +276,26 @@ absl::StatusOr<se::DeviceAddressBase> RunCollectiveKernelThunk(
     TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
   }
 
+  Thunk::PrepareParams prepare_params;
+  CollectiveMultimemRegistry multimem_registry(
+      executor, collective_params.global_device_id);
+  CollectiveCliqueRequests clique_requests;
+  prepare_params.executor = executor;
+  prepare_params.buffer_allocations = &buffer_allocations;
+  prepare_params.collective_params = &collective_params;
+  prepare_params.clique_requests = &clique_requests;
+  prepare_params.multimem_registry = &multimem_registry;
+  TF_RETURN_IF_ERROR(metadata.thunk->Prepare(prepare_params));
+
+  TF_RETURN_IF_ERROR(multimem_registry.Build());
+
   Thunk::InitializeParams initialize_params;
   initialize_params.executor = executor;
   initialize_params.stream = stream.get();
   initialize_params.buffer_allocations = &buffer_allocations;
   initialize_params.collective_params = &collective_params;
   initialize_params.src = {kKernelSource};
+  initialize_params.multicast_memory_registry = &multimem_registry;
 
   GpuExecutableRunOptions::DeviceIdMap global_device_id_map = {
       {LocalDeviceId(0), GlobalDeviceId(0)}};
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
index a5e44c890f34fe..d6fb24c4b59581 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
@@ -30,6 +31,7 @@ limitations under the License.
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_clique_rendezvous.h"
 #include "xla/backends/gpu/runtime/collective_multimem.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -143,6 +145,36 @@ CollectiveMetadataThunk::GetParameterDeviceMemoryBase(
       /*size_bytes=*/num_devices * sizeof(void*));
 }
 
+absl::Status CollectiveMetadataThunk::Prepare(const PrepareParams& params) {
+  // We currently support only a single memory space for multimem parameters.
+  // So we just pick the first one here.
+  auto fast_memory_parameter =
+      absl::c_find_if(parameters_, [](const Buffer& parameter) {
+        return parameter.memory_space == xla::Layout::kGenericFastMemorySpace;
+      });
+  if (fast_memory_parameter == parameters_.end()) {
+    return absl::OkStatus();
+  }
+
+  se::DeviceAddressBase memory_range;
+  TF_ASSIGN_OR_RETURN(memory_range,
+                      params.executor->GetMemoryRange(
+                          params.buffer_allocations->GetDeviceAddress(
+                              fast_memory_parameter->slice)));
+
+  // Since there is no parameter in the collective memory space, we don't need
+  // to set up the collective multimem.
+  if (memory_range.is_null()) {
+    return absl::OkStatus();
+  }
+  TF_ASSIGN_OR_RETURN(
+      const GpuCliqueKey clique_key,
+      GetCollectiveGpuCliqueKey(*params.collective_params, collective_config_,
+                                /*include_participant_groups=*/false));
+  params.multimem_registry->Register({clique_key, /*map_to=*/memory_range});
+  return absl::OkStatus();
+}
+
 absl::Status CollectiveMetadataThunk::Initialize(
     const InitializeParams& params) {
   TF_ASSIGN_OR_RETURN(
@@ -164,11 +196,11 @@ absl::Status CollectiveMetadataThunk::Initialize(
       params.buffer_allocations->GetDeviceAddress(result_);
 
   GlobalDeviceId global_device_id = params.collective_params->global_device_id;
-  std::optional<RankId> rank = clique_key.rank(global_device_id);
 
-  TF_ASSIGN_OR_RETURN(auto multimem,
-                      AllocateMultimem(clique_key, *rank, params));
+  TF_ASSIGN_OR_RETURN(auto multimem, GetCollectiveMultimem(clique_key, params));
 
+  std::optional<RankId> rank = clique_key.rank(global_device_id);
+  TF_RET_CHECK(rank.has_value());
   return ConstructCollectiveMetadata(clique_key, *rank, params.stream,
                                      std::move(parameters), std::move(multimem),
                                      result_ptr);
@@ -180,9 +212,8 @@ absl::Status CollectiveMetadataThunk::ExecuteOnStream(
 }
 
 absl::StatusOr<std::shared_ptr<CollectiveMultimem>>
-CollectiveMetadataThunk::AllocateMultimem(const GpuCliqueKey& clique_key,
-                                          RankId rank,
-                                          const InitializeParams& params) {
+CollectiveMetadataThunk::GetCollectiveMultimem(const GpuCliqueKey& clique_key,
+                                               const InitializeParams& params) {
   se::DeviceAddressBase memory_range;
   for (const Buffer& parameter : parameters_) {
     if (parameter.memory_space == xla::Layout::kGenericFastMemorySpace) {
@@ -200,10 +231,9 @@ CollectiveMetadataThunk::AllocateMultimem(const GpuCliqueKey& clique_key,
     return nullptr;
   }
 
+  const MultimemRequest request{clique_key, memory_range};
   TF_ASSIGN_OR_RETURN(std::shared_ptr<CollectiveMultimem> collective_multimem,
-                      CollectiveMultimem::Allocate(params.executor, clique_key,
-                                                   rank, memory_range));
-
+                      params.multicast_memory_registry->Get(request));
   absl::MutexLock lock(mutex_);
   return (collective_multimem_[params.executor] =
               std::move(collective_multimem));
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.h
index 73a36ca88102fe..01677b001866c7 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.h
@@ -45,15 +45,16 @@ class CollectiveMetadataThunk : public Thunk {
     int64_t memory_space;
   };
 
-  explicit CollectiveMetadataThunk(ThunkInfo thunk_info,
-                                   CollectiveConfig collective_config,
-                                   std::vector<Buffer> parameters,
-                                   BufferAllocation::Slice result)
+  CollectiveMetadataThunk(ThunkInfo thunk_info,
+                          CollectiveConfig collective_config,
+                          std::vector<Buffer> parameters,
+                          BufferAllocation::Slice result)
       : Thunk(Thunk::Kind::kCollectiveMetadata, thunk_info),
         collective_config_(std::move(collective_config)),
         parameters_(std::move(parameters)),
         result_(result) {}
 
+  absl::Status Prepare(const PrepareParams& params) override;
   absl::Status Initialize(const InitializeParams& params) override;
   absl::Status ExecuteOnStream(const ExecuteParams& params) override;
 
@@ -75,9 +76,8 @@ class CollectiveMetadataThunk : public Thunk {
       int64_t num_devices, int64_t parameter_index);
 
  private:
-  absl::StatusOr<std::shared_ptr<CollectiveMultimem>> AllocateMultimem(
-      const GpuCliqueKey& clique_key, RankId rank,
-      const InitializeParams& params);
+  absl::StatusOr<std::shared_ptr<CollectiveMultimem>> GetCollectiveMultimem(
+      const GpuCliqueKey& clique_key, const InitializeParams& params);
 
   const CollectiveConfig collective_config_;
   std::vector<Buffer> parameters_;
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_multimem.cc b/third_party/xla/xla/backends/gpu/runtime/collective_multimem.cc
index bbf2535d5f1216..ad17e9b3b8a45b 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_multimem.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_multimem.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "xla/backends/gpu/runtime/collective_multimem.h"
 
-#include <any>
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -74,7 +73,6 @@ struct AllocateParams {
   se::StreamExecutor* executor;
   RankId rank;
   se::DeviceAddressBase map_to;
-  std::any payload;
 };
 
 struct RankCmp {
@@ -100,9 +98,9 @@ struct MappedPtrFormatter {
 }  // namespace
 
 absl::StatusOr<std::shared_ptr<CollectiveMultimem>>
-CollectiveMultimem::Allocate(se::StreamExecutor* executor,
+CollectiveMultimem::Allocate(se::StreamExecutor& executor,
                              const GpuCliqueKey& clique_key, RankId rank,
-                             se::DeviceAddressBase map_to, std::any payload) {
+                             se::DeviceAddressBase map_to) {
   VLOG(3) << absl::StrFormat(
       "rank=[%d] Allocate collective multimem for clique: %s", rank.value(),
       clique_key.ToString());
@@ -112,13 +110,13 @@ CollectiveMultimem::Allocate(se::StreamExecutor* executor,
   if (!clique_key.is_local()) {
     return Unimplemented(
         "%sMultimem is not supported in multi-process mode in clique %s",
-        XlaFormatDevice(executor->device_ordinal()), clique_key.ToString());
+        XlaFormatDevice(executor.device_ordinal()), clique_key.ToString());
   }
 
   std::string rendezvous_name = absl::StrFormat(
       "CollectiveMultimem::Allocate for clique %s", clique_key.ToString());
   AllocateRendezvousKey rendezvous_key = {clique_key};
-  AllocateParams params = {executor, rank, map_to, std::move(payload)};
+  AllocateParams params = {&executor, rank, map_to};
 
   // A callback for rendezvous to allocate and map the multicast memory.
   auto allocate = [&](absl::Span<const AllocateParams*> params)
@@ -159,12 +157,6 @@ CollectiveMultimem::Allocate(se::StreamExecutor* executor,
               dynamic_cast<se::gpu::GpuExecutor*>(param->executor)));
     }
 
-    // For all participating devices move payloads to the collective multimem.
-    absl::btree_map<RankId, std::any> payloads;
-    for (const auto* param : params) {
-      payloads[param->rank] = std::move(param->payload);
-    }
-
     VLOG(3) << absl::StrFormat(
         "Allocated collective multimem for clique: %s; mapped_ptrs: [%s]",
         clique_key.ToString(),
@@ -182,14 +174,13 @@ CollectiveMultimem::Allocate(se::StreamExecutor* executor,
 }
 
 absl::StatusOr<std::shared_ptr<CollectiveMultimem>>
-CollectiveMultimem::Allocate(se::StreamExecutor* executor,
+CollectiveMultimem::Allocate(se::StreamExecutor& executor,
                              const GpuCliqueKey& clique_key,
                              GlobalDeviceId global_device_id,
-                             se::DeviceAddressBase map_to, std::any payload) {
+                             se::DeviceAddressBase map_to) {
   if (std::optional<RankId> rank = clique_key.rank(global_device_id)) {
-    return Allocate(executor, clique_key, *rank, map_to, std::move(payload));
+    return Allocate(executor, clique_key, *rank, map_to);
   }
   return InvalidArgument("Rank not found for device %v", global_device_id);
 }
-
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_multimem.h b/third_party/xla/xla/backends/gpu/runtime/collective_multimem.h
index 20ce764570c53f..a2ee82ea77e102 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_multimem.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_multimem.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define XLA_BACKENDS_GPU_RUNTIME_COLLECTIVE_MULTIMEM_H_
 
 #include <any>
-#include <functional>
 #include <memory>
 
 #include "absl/container/btree_map.h"
@@ -52,25 +51,19 @@ class CollectiveMultimem {
   // The optional `payload` argument is captured by the returned shared pointer
   // to allow callers to associate arbitrary data with the collective multimem.
   static absl::StatusOr<std::shared_ptr<CollectiveMultimem>> Allocate(
-      se::StreamExecutor* executor, const GpuCliqueKey& clique_key, RankId rank,
-      se::DeviceAddressBase map_to, std::any payload = {});
+      se::StreamExecutor& executor, const GpuCliqueKey& clique_key, RankId rank,
+      se::DeviceAddressBase map_to);
 
   // Allocates a CollectiveMultimem for the given global device id.
   static absl::StatusOr<std::shared_ptr<CollectiveMultimem>> Allocate(
-      se::StreamExecutor* executor, const GpuCliqueKey& clique_key,
-      GlobalDeviceId global_device_id, se::DeviceAddressBase map_to,
-      std::any payload = {});
+      se::StreamExecutor& executor, const GpuCliqueKey& clique_key,
+      GlobalDeviceId global_device_id, se::DeviceAddressBase map_to);
 
   const GpuCliqueKey& clique_key() const { return clique_key_; }
 
   // Returns the device pointer to the multicast memory for the given rank.
   void* mapped_ptr(RankId rank) const { return mapped_ptrs_.at(rank); }
 
-  // Returns the payload associated with the given rank. If payload type is not
-  // the same as `T`, returns an error.
-  template <typename T>
-  absl::StatusOr<std::reference_wrapper<T>> payload(RankId rank) const;
-
  private:
   CollectiveMultimem(
       GpuCliqueKey clique_key, absl::btree_map<RankId, void*> mapped_ptrs,
@@ -89,21 +82,6 @@ class CollectiveMultimem {
   std::unique_ptr<se::gpu::MulticastMemory> multicast_memory_;
 };
 
-template <typename T>
-absl::StatusOr<std::reference_wrapper<T>> CollectiveMultimem::payload(
-    RankId rank) const {
-  auto it = payload_.find(rank);
-  if (it == payload_.end()) {
-    return NotFound("Payload not found for rank %d", rank.value());
-  }
-
-  if (std::any_cast<T>(&it->second) == nullptr) {
-    return InvalidArgument("Payload type mismatch for rank %d", rank.value());
-  }
-
-  return std::ref(std::any_cast<T&>(&it->second));
-}
-
 }  // namespace xla::gpu
 
 #endif  // XLA_BACKENDS_GPU_RUNTIME_COLLECTIVE_MULTIMEM_H_
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.cc b/third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.cc
new file mode 100644
index 00000000000000..dee42a12108375
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.cc
@@ -0,0 +1,55 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
+
+#include <memory>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_format.h"
+#include "xla/backends/gpu/runtime/collective_multimem.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla::gpu {
+
+void CollectiveMultimemRegistry::Register(const MultimemRequest& request) {
+  requests_.push_back(request);
+}
+
+absl::Status CollectiveMultimemRegistry::Build() {
+  for (const MultimemRequest& request : requests_) {
+    TF_ASSIGN_OR_RETURN(
+        std::shared_ptr<CollectiveMultimem> multimem,
+        CollectiveMultimem::Allocate(executor_, request.key, global_device_id_,
+                                     request.map_to));
+    multimems_[request] = multimem;
+  }
+
+  requests_.clear();
+  return absl::OkStatus();
+}
+
+absl::StatusOr<std::shared_ptr<CollectiveMultimem>>
+CollectiveMultimemRegistry::Get(const MultimemRequest& request) const {
+  auto it = multimems_.find(request);
+  if (it == multimems_.end()) {
+    return absl::NotFoundError(absl::StrFormat(
+        "Multimem not found for request: %s", request.key.ToString()));
+  }
+  return it->second;
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.h b/third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.h
new file mode 100644
index 00000000000000..46e7069cf18e44
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_multimem_registry.h
@@ -0,0 +1,83 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_GPU_RUNTIME_COLLECTIVE_MULTIMEM_REGISTRY_H_
+#define XLA_BACKENDS_GPU_RUNTIME_COLLECTIVE_MULTIMEM_REGISTRY_H_
+
+#include <cstdint>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "absl/base/nullability.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/backends/gpu/collectives/gpu_clique_key.h"
+#include "xla/backends/gpu/runtime/collective_multimem.h"
+#include "xla/runtime/device_id.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/stream_executor.h"
+
+namespace xla::gpu {
+
+// A request for a multimem for a given clique on a given address space.
+struct MultimemRequest {
+  static std::tuple<GpuCliqueKey, void*, uint64_t> CmpKey(
+      const MultimemRequest& key) {
+    return {key.key, key.map_to.opaque(), key.map_to.size()};
+  }
+
+  friend bool operator==(const MultimemRequest& a, const MultimemRequest& b) {
+    return a.key == b.key && a.map_to == b.map_to;
+  }
+
+  template <typename H>
+  friend H AbslHashValue(H h, const MultimemRequest& key) {
+    return H::combine(std::move(h), key.key, key.map_to.opaque(),
+                      key.map_to.size());
+  }
+
+  GpuCliqueKey key;
+  se::DeviceAddressBase map_to;
+};
+
+// Allocates and provides thunks requested multimem objects.
+class CollectiveMultimemRegistry {
+ public:
+  // Does not take ownership of `executor`, which must outlive this object.
+  CollectiveMultimemRegistry(se::StreamExecutor* absl_nonnull executor,
+                             GlobalDeviceId global_device_id)
+      : executor_(*executor), global_device_id_(global_device_id) {}
+
+  void Register(const MultimemRequest& request);
+
+  absl::Status Build();
+
+  absl::StatusOr<std::shared_ptr<CollectiveMultimem>> Get(
+      const MultimemRequest& request) const;
+
+ private:
+  std::vector<MultimemRequest> requests_;
+  absl::flat_hash_map<MultimemRequest, std::shared_ptr<CollectiveMultimem>>
+      multimems_;
+  se::StreamExecutor& executor_;
+  GlobalDeviceId global_device_id_;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_BACKENDS_GPU_RUNTIME_COLLECTIVE_MULTIMEM_REGISTRY_H_
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.h b/third_party/xla/xla/backends/gpu/runtime/thunk.h
index bd88654eaf963a..e8b8471e7403e9 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.h
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/runtime/collective_clique_requests.h"
 #include "xla/backends/gpu/runtime/collective_cliques.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
 #include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
@@ -250,6 +251,12 @@ class Thunk {
     const CollectiveParams* collective_params = nullptr;
     // Clique requests for preparing collective communicators.
     CollectiveCliqueRequests* clique_requests = nullptr;
+    // Multimem registry for preparing multimem objects.
+    CollectiveMultimemRegistry* absl_nonnull multimem_registry = nullptr;
+    // Stream executor for the thunk.
+    se::StreamExecutor* absl_nonnull executor = nullptr;
+    // Buffer allocations for the thunk.
+    const BufferAllocations* absl_nonnull buffer_allocations = nullptr;
   };
 
   //===--------------------------------------------------------------------===//
@@ -282,6 +289,9 @@ class Thunk {
     // Collective cliques acquired based on resource requests.
     CollectiveCliques* collective_cliques = nullptr;
 
+    // Multimem registry for preparing collective communicators.
+    CollectiveMultimemRegistry* multicast_memory_registry = nullptr;
+
     // XLA FFI execution context.
     const ffi::ExecutionContext* ffi_execution_context = nullptr;
 
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 87e5ac19c0e0d6..e20aee3d7818af 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -483,6 +483,7 @@ cc_library(
         "//xla/backends/gpu/runtime:collective_group_thunk",
         "//xla/backends/gpu/runtime:collective_kernel_thunk",
         "//xla/backends/gpu/runtime:collective_metadata_thunk",
+        "//xla/backends/gpu/runtime:collective_multimem",
         "//xla/backends/gpu/runtime:collective_permute_thunk",
         "//xla/backends/gpu/runtime:collective_thunk",
         "//xla/backends/gpu/runtime:command_buffer_cmd",
@@ -691,6 +692,7 @@ cc_library(
         "//xla/backends/gpu/runtime:annotation",
         "//xla/backends/gpu/runtime:collective_clique_requests",
         "//xla/backends/gpu/runtime:collective_cliques",
+        "//xla/backends/gpu/runtime:collective_multimem_registry",
         "//xla/backends/gpu/runtime:collective_params",
         "//xla/backends/gpu/runtime:command_buffer_conversion_pass",
         "//xla/backends/gpu/runtime:nvshmem_collective_thunk",
diff --git a/third_party/xla/xla/service/gpu/gpu_executable.cc b/third_party/xla/xla/service/gpu/gpu_executable.cc
index 905cac3061925c..1c73b3a5a8002e 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable.cc
+++ b/third_party/xla/xla/service/gpu/gpu_executable.cc
@@ -44,6 +44,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/annotation.h"
 #include "xla/backends/gpu/runtime/collective_clique_requests.h"
 #include "xla/backends/gpu/runtime/collective_cliques.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
 #include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/command_buffer_conversion_pass.h"
 #include "xla/backends/gpu/runtime/nvshmem_collective_thunk.h"
@@ -460,9 +461,13 @@ absl::Status ExecuteThunksImpl(
           collective_max_nchannels, p2p_max_nchannels));
 
   CollectiveCliqueRequests clique_requests;
+  CollectiveMultimemRegistry multimem_registry(
+      executor, collective_params.global_device_id);
 
   {  // Prepare thunks for execution and collect requested GPU cliques.
-    Thunk::PrepareParams prepare_params{&collective_params, &clique_requests};
+    Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                        &multimem_registry, executor,
+                                        &buffer_allocations};
 
     tsl::profiler::TraceMe trace_prepare("Thunks::Prepare");
     TF_RETURN_IF_ERROR(thunk_sequence.Prepare(prepare_params));
@@ -488,6 +493,8 @@ absl::Status ExecuteThunksImpl(
                 : false));
   }
 
+  TF_RETURN_IF_ERROR(multimem_registry.Build());
+
   {  // Initialize thunks using prepared resources before execution.
     Thunk::InitializeParams initialize_params{
         executor,
@@ -497,6 +504,7 @@ absl::Status ExecuteThunksImpl(
         command_buffer_trace_stream,
         &collective_params,
         &collective_cliques,
+        &multimem_registry,
         run_options->run_options().ffi_execution_context(),
         run_options->local_device_count()};
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index fa28dc809e5b0a..1dbcf218156283 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -412,6 +412,7 @@ bool CanEnablePeerAccess(CUdevice from, CUdevice to) {
     LOG(ERROR) << "failed to detect peer access capability: " << status;
     return false;
   }
+
   return can_access_peer;
 }
 
@@ -949,7 +950,7 @@ absl::StatusOr<void*> CudaExecutor::VmmAllocateMemory(uint64_t bytes) {
   int device_count = 0;
   TF_RETURN_IF_ERROR(cuda::ToStatus(cudaGetDeviceCount(&device_count)));
   for (int peer = 0; peer < device_count; peer++) {
-    if (peer == device_ordinal() || CanEnablePeerAccess(peer, device_)) {
+    if (peer == device_ordinal() || CanEnablePeerAccessTo(peer)) {
       CUmemAccessDesc accessDesc = GetVmmAccessDescriptor(peer);
       TF_RETURN_IF_ERROR(
           cuda::ToStatus(cuMemSetAccess(ptr, padded_size, &accessDesc, 1)));
@@ -1602,11 +1603,29 @@ fft::FftSupport* CudaExecutor::AsFft() {
   return fft_.get();
 }
 
+// TODO(468297175): Precalculate peer access in stream executor constructor.
 bool CudaExecutor::CanEnablePeerAccessTo(StreamExecutor* other) {
   CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
   return CanEnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
 }
 
+bool CudaExecutor::CanEnablePeerAccessTo(int other_device_ordinal) {
+  if (other_device_ordinal == device_ordinal()) {
+    // Self-access is always allowed.
+    return true;
+  }
+
+  auto it = peer_access_cache_.find(other_device_ordinal);
+  if (it != peer_access_cache_.end()) {
+    return it->second;
+  }
+
+  const bool result =
+      CanEnablePeerAccess(device_ordinal(), other_device_ordinal);
+  peer_access_cache_[other_device_ordinal] = result;
+  return result;
+}
+
 absl::Status CudaExecutor::EnablePeerAccessTo(StreamExecutor* other) {
   CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
   return EnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
index 00a6c0ca48f2a6..b115fc95d07e13 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.h
@@ -90,6 +90,7 @@ class CudaExecutor : public GpuExecutor {
   void DeallocateStream(Stream* stream) override;
   absl::Status EnablePeerAccessTo(StreamExecutor* other) override;
   bool CanEnablePeerAccessTo(StreamExecutor* other) override;
+  bool CanEnablePeerAccessTo(int other_device_ordinal) override;
   bool DeviceMemoryUsage(int64_t* free_out, int64_t* total_out) const override;
   absl::StatusOr<std::unique_ptr<Kernel>> LoadKernel(
       const KernelLoaderSpec& spec) override;
@@ -318,6 +319,7 @@ class CudaExecutor : public GpuExecutor {
   int stream_priority_lowest_ = 0;
   int stream_priority_highest_ = 0;
   bool stream_priority_query_ok_ = false;
+  absl::flat_hash_map<int, bool> peer_access_cache_;
 };
 
 }  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/stream_executor.h b/third_party/xla/xla/stream_executor/stream_executor.h
index 045d30aefe1aba..c3e8bd145fc7fe 100644
--- a/third_party/xla/xla/stream_executor/stream_executor.h
+++ b/third_party/xla/xla/stream_executor/stream_executor.h
@@ -235,6 +235,9 @@ class StreamExecutor {
   // StreamExecutor to memory allocated by another.
   virtual bool CanEnablePeerAccessTo(StreamExecutor* other) = 0;
 
+  // Same as above, but takes the device ordinal of the other device.
+  virtual bool CanEnablePeerAccessTo(int other_device_ordinal) { return false; }
+
   // Returns the underlying device memory usage information, if it is available.
   // If it is not available (false is returned), free/total may not be
   // initialized.
diff --git a/third_party/xla/xla/tests/collective_metadata_test.cc b/third_party/xla/xla/tests/collective_metadata_test.cc
index 33aad38269cee9..b5655496b18eb8 100644
--- a/third_party/xla/xla/tests/collective_metadata_test.cc
+++ b/third_party/xla/xla/tests/collective_metadata_test.cc
@@ -84,25 +84,86 @@ TEST_F(CollectiveMetadataTest, ConstructCollectiveMetadata) {
   ASSERT_EQ(first_result_data.size(), kNumElements);
   ASSERT_EQ(second_result_data.size(), kNumElements);
 
-  // Check the rank in the first position.
-  EXPECT_EQ(first_result_data[0], 0);
-  EXPECT_EQ(second_result_data[0], 1);
+  EXPECT_EQ(first_result_data[0], 0) << "First result rank is not 0.";
+  EXPECT_EQ(second_result_data[0], 1) << "Second result rank is not 1.";
 
-  // Check pointer to peers in the second position.
-  EXPECT_NE(first_result_data[1], 0);
-  EXPECT_NE(second_result_data[1], 0);
+  EXPECT_NE(first_result_data[1], 0)
+      << "First result pointer to peers is NULL.";
+  EXPECT_NE(second_result_data[1], 0)
+      << "Second result pointer to peers is NULL.";
 
-  // Check pointer to multimem metadata in the third position.
-  EXPECT_NE(first_result_data[2], 0);
-  EXPECT_NE(second_result_data[2], 0);
+  EXPECT_NE(first_result_data[2], 0)
+      << "First result pointer to multimem metadata is not set.";
+  EXPECT_NE(second_result_data[2], 0)
+      << "Second result pointer to multimem metadata is not set.";
 
-  // Check param_to_peers structure.
   for (int i = 3; i < kNumElements; ++i) {
-    EXPECT_NE(first_result_data[i], 0);
-    EXPECT_EQ(second_result_data[i], first_result_data[i]);
+    EXPECT_NE(first_result_data[i], 0)
+        << "First result param_to_peers is NULL.";
+    EXPECT_EQ(second_result_data[i], first_result_data[i])
+        << "Param_to_peers mismatch at index " << i
+        << " in the first result: " << first_result_data[i]
+        << " and in the second result: " << second_result_data[i];
   }
 }
 
+TEST_F(CollectiveMetadataTest, BuildMultimemOnlyOncePerModuleExecution) {
+  const absl::string_view kModuleStr = R"(
+  HloModule test, replica_count=2
+
+  ENTRY test_computation {
+    param_0 = f32[1] parameter(0)
+    copy_1 = f32[1]{0:S(1)} copy(param_0)
+
+    first_result_tuple = (f32[1]{0:S(1)}, u64[5]) custom-call(copy_1), custom_call_target="CollectiveMetadata", output_to_operand_aliasing={{0}: (0, {})}
+    first_result = u64[5] get-tuple-element(first_result_tuple), index=1
+    second_result_tuple = (f32[1]{0:S(1)}, u64[5]) custom-call(copy_1), custom_call_target="CollectiveMetadata", output_to_operand_aliasing={{0}: (0, {})}
+    second_result = u64[5] get-tuple-element(second_result_tuple), index=1
+    ROOT result_tuple = (u64[5], u64[5]) tuple(first_result, second_result)
+  })";
+
+  constexpr int kNumReplicas = 2;
+  ASSERT_GE(hlo_runner_->device_count(), kNumReplicas)
+      << "Test requires at least " << kNumReplicas << " devices ("
+      << hlo_runner_->device_count() << " available)";
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto module, ParseAndReturnVerifiedModule(kModuleStr, kNumReplicas));
+
+  Literal input_0 = LiteralUtil::CreateR1<float>({1.0f});
+  Literal input_1 = LiteralUtil::CreateR1<float>({1.0f});
+  TF_ASSERT_OK_AND_ASSIGN(
+      ExecutionResult execution_result,
+      ExecuteReplicated(std::move(module),
+                        /*arguments=*/std::vector<Literal*>{&input_0, &input_1},
+                        /*run_hlo_passes=*/false));
+
+  std::vector<Literal>& literals = execution_result.results;
+  ASSERT_EQ(literals.size(), kNumReplicas);
+
+  std::vector<Literal> first_result = literals[0].DecomposeTuple();
+  std::vector<Literal> second_result = literals[1].DecomposeTuple();
+
+  absl::Span<const uint64_t> first_device_first_result =
+      first_result[0].data<uint64_t>();
+  absl::Span<const uint64_t> first_device_second_result =
+      first_result[1].data<uint64_t>();
+  absl::Span<const uint64_t> second_device_first_result =
+      second_result[0].data<uint64_t>();
+  absl::Span<const uint64_t> second_device_second_result =
+      second_result[1].data<uint64_t>();
+  constexpr int kNumElements = 5;
+  ASSERT_EQ(first_device_first_result.size(), kNumElements);
+  ASSERT_EQ(first_device_second_result.size(), kNumElements);
+  ASSERT_EQ(second_device_first_result.size(), kNumElements);
+  ASSERT_EQ(second_device_second_result.size(), kNumElements);
+
+  EXPECT_EQ(first_device_first_result[2], first_device_second_result[2])
+      << "Multimem metadata should be the same for both results.";
+  EXPECT_EQ(second_device_first_result[2], second_device_second_result[2])
+      << "Multimem metadata should be the same for both results.";
+}
+
 TEST_F(CollectiveMetadataTest, ConstructCollectiveMetadataWithReplicaGroup) {
   const absl::string_view kModuleStr = R"(
   HloModule test, replica_count=4

From 67360fac620c966137bcb964c30c04b5511d82ef Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Mon, 15 Dec 2025 09:16:20 -0800
Subject: [PATCH 275/753] [XLA:GPU] Move TransposeDescription back to
 xla/service/gpu.

Only used in gpu emitters.

Pre requisite to consolidate with TransposeSpec

PiperOrigin-RevId: 844800024
---
 .../xla/xla/codegen/ir_emission_utils.cc      |  9 ----
 .../xla/xla/codegen/ir_emission_utils.h       | 42 -------------------
 .../xla/service/gpu/hlo_fusion_analysis.cc    |  1 -
 .../xla/xla/service/gpu/hlo_fusion_analysis.h |  1 +
 .../xla/xla/service/gpu/ir_emission_utils.cc  |  7 ++++
 .../xla/xla/service/gpu/ir_emission_utils.h   | 41 +++++++++++++++++-
 6 files changed, 48 insertions(+), 53 deletions(-)

diff --git a/third_party/xla/xla/codegen/ir_emission_utils.cc b/third_party/xla/xla/codegen/ir_emission_utils.cc
index 5021929753c8c9..58ed76d4ee1a1d 100644
--- a/third_party/xla/xla/codegen/ir_emission_utils.cc
+++ b/third_party/xla/xla/codegen/ir_emission_utils.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_traversal.h"
-#include "xla/primitive_util.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
@@ -40,14 +39,6 @@ limitations under the License.
 #include "xla/xla_data.pb.h"
 
 namespace xla {
-
-int GetBitwidth(PrimitiveType type) {
-  if (type == PRED) {
-    return 8;
-  }
-  return primitive_util::BitWidth(type);
-}
-
 bool IsIntermediate(const HloInstruction* instr, int allowed_operand_count) {
   // Number of operands should be in range [1, allowed_operand_count].
   if (instr->operand_count() == 0 ||
diff --git a/third_party/xla/xla/codegen/ir_emission_utils.h b/third_party/xla/xla/codegen/ir_emission_utils.h
index 8245e2f0b056a1..d2c18b139385c8 100644
--- a/third_party/xla/xla/codegen/ir_emission_utils.h
+++ b/third_party/xla/xla/codegen/ir_emission_utils.h
@@ -16,12 +16,10 @@ limitations under the License.
 #ifndef XLA_CODEGEN_IR_EMISSION_UTILS_H_
 #define XLA_CODEGEN_IR_EMISSION_UTILS_H_
 
-#include <cstdint>
 #include <functional>
 #include <optional>
 #include <vector>
 
-#include "absl/container/inlined_vector.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
@@ -29,51 +27,11 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/service/buffer_assignment.h"
-#include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
 
-// Returns the bitwidth of the given primitive type. Unfortunately,
-// primitive_util::BitWidth(PRED) return 1 instead of 8.
-int GetBitwidth(PrimitiveType type);
-
-/// Description of how to emit a given transposition.
-struct TransposeDescription {
-  // Transpose instruction.
-  const HloInstruction* instr;
-
-  // Normalized transpose dimensions.
-  absl::InlinedVector<int64_t, 3> dimensions;
-
-  // Permutations of normalized transpose dimensions.
-  absl::InlinedVector<int64_t, 3> permutation;
-
-  // Required amount of shared memory in bytes.
-  int64_t shmem_usage = 0;
-
-  TransposeDescription(const HloInstruction* instr,
-                       absl::InlinedVector<int64_t, 3> dimensions,
-                       absl::InlinedVector<int64_t, 3> permutation,
-                       int64_t shmem_usage)
-      : instr(instr),
-        dimensions(dimensions),
-        permutation(permutation),
-        shmem_usage(shmem_usage) {}
-
-  // Transpose instruction input shape.
-  const Shape& input_shape() const { return instr->operand(0)->shape(); }
-
-  // Returns true, if both descriptions have the same dimensions and
-  // permutation, even if they're produced by different instructions.
-  bool IsEquivalent(const TransposeDescription& other) const {
-    return dimensions == other.dimensions && permutation == other.permutation &&
-           GetBitwidth(instr->shape().element_type()) ==
-               GetBitwidth(other.instr->shape().element_type());
-  }
-};
-
 // Checks if the instruction is elementwise.
 bool IsIntermediate(const HloInstruction* instr, int allowed_operand_count = 1);
 
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
index 1f889842500cfb..3566a20ac1c4a3 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/ADT/STLExtras.h"
 #include "xla/codegen/hlo_fusion_spec.h"
-#include "xla/codegen/ir_emission_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/utils/hlo_traversal.h"
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
index c5b8980abdd5ab..a6bcd04e371213 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/stream_executor/device_description.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.cc b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
index a749ef0923f87b..b22a642419c531 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
@@ -177,6 +177,13 @@ static bool IsContiguousSlice(
   return true;
 }
 
+int GetBitwidth(PrimitiveType type) {
+  if (type == PRED) {
+    return 8;
+  }
+  return primitive_util::BitWidth(type);
+}
+
 bool IsContiguousSlice(const HloInstruction& instr) {
   if (auto slice = DynCast<HloSliceInstruction>(&instr)) {
     const Shape& full_shape = slice->operand(0)->shape();
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.h b/third_party/xla/xla/service/gpu/ir_emission_utils.h
index 18499b6472dabe..012716126d0943 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.h
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.h
@@ -33,7 +33,6 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
-#include "xla/codegen/ir_emission_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_print_options.h"
@@ -170,6 +169,46 @@ HloInstructionAdaptor FindNonTrivialHero(const HloInstructionAdaptor& instr);
 // Same as above, but fusion is the parent computation of the hlo instruction.
 const HloInstruction& FindNonTrivialHero(const HloInstruction& instr);
 
+// Returns the bitwidth of the given primitive type. Unfortunately,
+// primitive_util::BitWidth(PRED) return 1 instead of 8.
+int GetBitwidth(PrimitiveType type);
+
+/// Description of how to emit a given transposition.
+struct TransposeDescription {
+  // Transpose instruction.
+  const HloInstruction* instr;
+
+  // Normalized transpose dimensions.
+  absl::InlinedVector<int64_t, 3> dimensions;
+
+  // Permutations of normalized transpose dimensions.
+  // Normalized means that permutation[i] + 1 != permutation[i + 1].
+  absl::InlinedVector<int64_t, 3> permutation;
+
+  // Required amount of shared memory in bytes.
+  int64_t shmem_usage = 0;
+
+  TransposeDescription(const HloInstruction* instr,
+                       absl::InlinedVector<int64_t, 3> dimensions,
+                       absl::InlinedVector<int64_t, 3> permutation,
+                       int64_t shmem_usage)
+      : instr(instr),
+        dimensions(dimensions),
+        permutation(permutation),
+        shmem_usage(shmem_usage) {}
+
+  // Transpose instruction input shape.
+  const Shape& input_shape() const { return instr->operand(0)->shape(); }
+
+  // Returns true, if both descriptions have the same dimensions and
+  // permutation, even if they're produced by different instructions.
+  bool IsEquivalent(const TransposeDescription& other) const {
+    return dimensions == other.dimensions && permutation == other.permutation &&
+           GetBitwidth(instr->shape().element_type()) ==
+               GetBitwidth(other.instr->shape().element_type());
+  }
+};
+
 std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
     const HloInstruction& hero);
 

From f3dc139eb1d7a2cba45d43f5a621b6225d36311c Mon Sep 17 00:00:00 2001
From: Karlo Basioli <basioli@google.com>
Date: Mon, 15 Dec 2025 09:16:37 -0800
Subject: [PATCH 276/753] [XLA][codegen] Integers with bitwidth 1 should be
 emitted in MLIR as I8

PREDs are emitted as I8 for historical reasons so the same should be true for S1 and U1.

PiperOrigin-RevId: 844800138
---
 .../xla/xla/codegen/emitters/type_util.cc        |  6 +++++-
 .../xla/xla/codegen/emitters/type_util_test.cc   | 16 ++++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/codegen/emitters/type_util.cc b/third_party/xla/xla/codegen/emitters/type_util.cc
index 460b9079022257..c498ed84daf604 100644
--- a/third_party/xla/xla/codegen/emitters/type_util.cc
+++ b/third_party/xla/xla/codegen/emitters/type_util.cc
@@ -33,7 +33,11 @@ namespace emitters {
 
 mlir::Type PrimitiveTypeToMlirType(PrimitiveType type, mlir::OpBuilder& b) {
   if (primitive_util::IsIntegralType(type)) {
-    return b.getIntegerType(primitive_util::BitWidth(type));
+    auto bitwidth = primitive_util::BitWidth(type);
+    if (bitwidth == 1) {
+      return b.getI8Type();
+    }
+    return b.getIntegerType(bitwidth);
   }
   return PrimitiveTypeToMlirTypeWithSign(type, b);
 }
diff --git a/third_party/xla/xla/codegen/emitters/type_util_test.cc b/third_party/xla/xla/codegen/emitters/type_util_test.cc
index c11c4d5f768568..96df881acc0891 100644
--- a/third_party/xla/xla/codegen/emitters/type_util_test.cc
+++ b/third_party/xla/xla/codegen/emitters/type_util_test.cc
@@ -61,6 +61,22 @@ TEST(TensorShapeTest, ConvertsPred) {
             "tensor<4x5x6xi8>");
 }
 
+TEST(TensorShapeTest, ConvertsU1) {
+  mlir::MLIRContext ctx;
+  mlir::OpBuilder b(&ctx);
+  EXPECT_EQ(TypeToString(
+                TensorShapeToMlirType(ShapeUtil::MakeShape(U1, {4, 5, 6}), b)),
+            "tensor<4x5x6xi8>");
+}
+
+TEST(TensorShapeTest, ConvertsS1) {
+  mlir::MLIRContext ctx;
+  mlir::OpBuilder b(&ctx);
+  EXPECT_EQ(TypeToString(
+                TensorShapeToMlirType(ShapeUtil::MakeShape(S1, {4, 5, 6}), b)),
+            "tensor<4x5x6xi8>");
+}
+
 TEST(TensorShapeTest, ConvertsLayout) {
   mlir::MLIRContext ctx;
   mlir::OpBuilder b(&ctx);

From eacb3d2330330875c7ed530cb41ff1cf995999d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eusebio=20Dur=C3=A1n=20Monta=C3=B1a?= <eusebiodm@google.com>
Date: Mon, 15 Dec 2025 09:22:10 -0800
Subject: [PATCH 277/753] Add riegeli as a dependency in XLA

We'll be using it to be able to generate AOT binaries for models bigger than 2GiB (proto size limit). Additional changes:

* Add brotli as a dependency (required by riegeli)
* Upgraded the Snappy versions, which was also required by riegeli

PiperOrigin-RevId: 844802068
---
 third_party/xla/MODULE.bazel                      |  7 +++++--
 third_party/xla/third_party/brotli/BUILD.bazel    |  0
 third_party/xla/third_party/brotli/workspace.bzl  | 11 +++++++++++
 third_party/xla/third_party/riegeli/BUILD.bazel   |  0
 third_party/xla/third_party/riegeli/workspace.bzl | 11 +++++++++++
 third_party/xla/workspace2.bzl                    | 10 +++++++---
 6 files changed, 34 insertions(+), 5 deletions(-)
 create mode 100644 third_party/xla/third_party/brotli/BUILD.bazel
 create mode 100644 third_party/xla/third_party/brotli/workspace.bzl
 create mode 100644 third_party/xla/third_party/riegeli/BUILD.bazel
 create mode 100644 third_party/xla/third_party/riegeli/workspace.bzl

diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel
index 0aa63a6b4b78f5..9b372e6b9d9ad0 100644
--- a/third_party/xla/MODULE.bazel
+++ b/third_party/xla/MODULE.bazel
@@ -3,6 +3,7 @@ module(name = "xla")
 ##############################################################
 # Bazel module dependencies
 
+# go/keep-sorted start
 bazel_dep(name = "abseil-cpp", version = "20250814.0", repo_name = "com_google_absl")
 bazel_dep(name = "abseil-py", version = "2.1.0", repo_name = "absl_py")
 bazel_dep(name = "bazel_features", version = "1.36.0")
@@ -21,6 +22,7 @@ bazel_dep(name = "pybind11_abseil", version = "202402.0")
 bazel_dep(name = "pybind11_bazel", version = "2.13.6")
 bazel_dep(name = "pybind11_protobuf", version = "0.0.0-20250210-f02a2b7")
 bazel_dep(name = "re2", version = "2024-07-02.bcr.1", repo_name = "com_googlesource_code_re2")
+bazel_dep(name = "riegeli", version = "0.0.0-20250822-9f2744d", repo_name = "com_google_riegeli")
 bazel_dep(name = "rules_cc", version = "0.2.0")
 bazel_dep(name = "rules_java", version = "8.16.1")
 bazel_dep(name = "rules_license", version = "1.0.0")
@@ -28,6 +30,7 @@ bazel_dep(name = "rules_python", version = "1.6.0")
 bazel_dep(name = "rules_shell", version = "0.6.1")
 bazel_dep(name = "snappy", version = "1.2.1")
 bazel_dep(name = "zlib", version = "1.3.1.bcr.5")
+# go/keep-sorted end
 
 # Only for compatibility, not directly used, change repo_name to None after upgrading Bazel to latest 7.x
 bazel_dep(name = "eigen", version = "4.0.0-20241125.bcr.3", repo_name = "DO_NOT_USE_eigen")
@@ -71,12 +74,12 @@ single_version_override(
 # Use an unreleased version of googletest
 archive_override(
     module_name = "googletest",
-    strip_prefix = "googletest-28e9d1f26771c6517c3b4be10254887673c94018",
-    urls = ["https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c94018.zip"],
     patch_strip = 1,
     patches = [
         "//third_party/googletest:0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch",
     ],
+    strip_prefix = "googletest-28e9d1f26771c6517c3b4be10254887673c94018",
+    urls = ["https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c94018.zip"],
 )
 
 ##############################################################
diff --git a/third_party/xla/third_party/brotli/BUILD.bazel b/third_party/xla/third_party/brotli/BUILD.bazel
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/third_party/xla/third_party/brotli/workspace.bzl b/third_party/xla/third_party/brotli/workspace.bzl
new file mode 100644
index 00000000000000..ec76237744b347
--- /dev/null
+++ b/third_party/xla/third_party/brotli/workspace.bzl
@@ -0,0 +1,11 @@
+"""Provides the repo macro to import brotli"""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    tf_http_archive(
+        name = "org_brotli",
+        sha256 = "e720a6ca29428b803f4ad165371771f5398faba397edf6778837a18599ea13ff",
+        strip_prefix = "brotli-1.1.0",
+        urls = tf_mirror_urls("https://github.com/google/brotli/archive/refs/tags/v1.1.0.tar.gz"),
+    )
diff --git a/third_party/xla/third_party/riegeli/BUILD.bazel b/third_party/xla/third_party/riegeli/BUILD.bazel
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/third_party/xla/third_party/riegeli/workspace.bzl b/third_party/xla/third_party/riegeli/workspace.bzl
new file mode 100644
index 00000000000000..577511ee10e83e
--- /dev/null
+++ b/third_party/xla/third_party/riegeli/workspace.bzl
@@ -0,0 +1,11 @@
+"""Provides the repo macro to import riegeli"""
+
+load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
+
+def repo():
+    tf_http_archive(
+        name = "com_google_riegeli",
+        sha256 = "f63337f63f794ba9dc7dd281b20af3d036dfe0c1a5a4b7b8dc20b39f7e323b97",
+        strip_prefix = "riegeli-9f2744dc23e81d84c02f6f51244e9e9bb9802d57",
+        urls = tf_mirror_urls("https://github.com/google/riegeli/archive/9f2744dc23e81d84c02f6f51244e9e9bb9802d57.tar.gz"),
+    )
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index d81988a636a1f2..9438e40f32fe26 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -10,6 +10,7 @@ load("@rules_ml_toolchain//gpu/sycl:sycl_init_repository.bzl", "sycl_init_reposi
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 load("//third_party/absl:workspace.bzl", absl = "repo")
 load("//third_party/benchmark:workspace.bzl", benchmark = "repo")
+load("//third_party/brotli:workspace.bzl", brotli = "repo")
 load("//third_party/clang_toolchain:cc_configure_clang.bzl", "cc_download_clang_toolchain")
 load("//third_party/cpuinfo:workspace.bzl", cpuinfo = "repo")
 load("//third_party/cudnn_frontend:workspace.bzl", cudnn_frontend = "repo")
@@ -42,6 +43,7 @@ load("//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo")
 load("//third_party/pybind11_bazel:workspace.bzl", pybind11_bazel = "repo")
 load("//third_party/raft:workspace.bzl", raft = "repo")
 load("//third_party/rapids_logger:workspace.bzl", rapids_logger = "repo")
+load("//third_party/riegeli:workspace.bzl", riegeli = "repo")
 load("//third_party/rmm:workspace.bzl", rmm = "repo")
 load("//third_party/robin_map:workspace.bzl", robin_map = "repo")
 load("//third_party/rocm_device_libs:workspace.bzl", rocm_device_libs = "repo")
@@ -76,6 +78,8 @@ def _initialize_third_party():
     farmhash()
     fmt()
     fxdiv()
+    riegeli()
+    brotli()
     gemmlowp()
     gloo()
     gutil()
@@ -389,9 +393,9 @@ def _tf_repositories():
     tf_http_archive(
         name = "snappy",
         build_file = "//third_party:snappy.BUILD",
-        sha256 = "2e458b7017cd58dcf1469ab315389e85e7f445bd035188f2983f81fb19ecfb29",
-        strip_prefix = "snappy-984b191f0fefdeb17050b42a90b7625999c13b8d",
-        urls = tf_mirror_urls("https://github.com/google/snappy/archive/984b191f0fefdeb17050b42a90b7625999c13b8d.tar.gz"),
+        sha256 = "736aeb64d86566d2236ddffa2865ee5d7a82d26c9016b36218fcc27ea4f09f86",
+        strip_prefix = "snappy-1.2.1",
+        urls = tf_mirror_urls("https://github.com/google/snappy/archive/refs/tags/1.2.1.tar.gz"),
     )
 
     tf_http_archive(

From 6d2a2a067073b0cf1965855265df4f3bac5fd1d0 Mon Sep 17 00:00:00 2001
From: TJ Xu <tjx@nvidia.com>
Date: Mon, 15 Dec 2025 09:22:31 -0800
Subject: [PATCH 278/753] PR #34196: [NVIDIA GPU] Fix a deadlock when doing
 comm split
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34196

📝 Summary of Changes
If participant group of a collective is empty, doing split can sometimes cause deadlocks since we dont know what are the participating ranks. Instead it should proceed to a regular comm init.

🎯 Justification
A target case would be if there are multiple communicators cached from different modules that use different device groups.
We rely one a rendezvous to return early if a communicator has been cached. Since participant groups are part of the key, there will be inconsistent cache hits among ranks leading to some ranks go to split but other won't.  ncclCommSplit requires all ranks to be call the api otherwise it will hang.
🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
NA
🧪 Unit Tests:
can only be tested with execution test

🧪 Execution Tests:
added

Copybara import of the project:

--
4e61773e4bc99ef8671d8acada954917643897cb by TJ Xu <tjx@nvidia.com>:

Fix a deadlock when doing comm split

--
11c103570c8f1126e3e57208b2e091458f62fc8f by TJ Xu <tjx@nvidia.com>:

added execution test

--
13848529b145c13e061a97e4e5cda910a50640c4 by TJ Xu <tjx@nvidia.com>:

change test to take input param's element count

--
eef95c1c4b94577050a252baa7e361ce0c14da4d by TJ Xu <tjx@nvidia.com>:

added vlogs and tracing for device sync before split

--
adf545721d0d36308e9e39bc86655d92de1613d7 by TJ Xu <tjx@nvidia.com>:

skip the test if less than 4 gpus

--
9421fbb5159e90d4e245dcfa2427d6bbb9e18b87 by TJ Xu <tjx@nvidia.com>:

reduce input size of test

Merging this change closes #34196

PiperOrigin-RevId: 844802220
---
 .../gpu/collectives/gpu_clique_key.cc         |  5 +
 .../backends/gpu/collectives/gpu_clique_key.h |  2 +
 .../backends/gpu/collectives/gpu_cliques.cc   | 19 +++-
 .../xla/xla/tests/collective_ops_e2e_test.cc  | 91 +++++++++++++++++++
 4 files changed, 116 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.cc b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.cc
index d588bff13c6f8e..3eca048ba75175 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.cc
@@ -86,6 +86,11 @@ bool GpuCliqueKey::is_p2p() const { return is_p2p_; }
 
 GlobalDeviceId GpuCliqueKey::root_device() const { return root_device_; }
 
+std::vector<std::vector<GlobalDeviceId>> GpuCliqueKey::ParticipantGroups()
+    const {
+  return participant_groups_;
+};
+
 bool GpuCliqueKey::IsSubsetOf(const CliqueKey& other) const {
   auto* other_gpu = tsl::down_cast<const GpuCliqueKey*>(&other);
   if (other_gpu == nullptr) {
diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.h b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.h
index a687ede7bcda2c..9099fd9befd897 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.h
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_clique_key.h
@@ -62,6 +62,8 @@ class GpuCliqueKey : public CliqueKey {
 
   CollectiveStreamId stream_id() const;
 
+  std::vector<std::vector<GlobalDeviceId>> ParticipantGroups() const;
+
   // Device generating the unique id for this key
   GlobalDeviceId root_device() const;
 
diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_cliques.cc b/third_party/xla/xla/backends/gpu/collectives/gpu_cliques.cc
index 64654bac8b85a5..696c8925e8ac82 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_cliques.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_cliques.cc
@@ -492,6 +492,17 @@ InitializeGpuClique(GpuCollectives* collectives, se::StreamExecutor* device,
   GpuCollectives::DeviceRank device_rank = {&gpu_device, rank};
   RankPair rank_pair = {parent_rank, device_rank};
 
+  // Synchronize the device to make sure no other collectives are
+  // running before we do the split.
+  {
+    tsl::profiler::TraceMe trace("SynchronizeAllActivityBeforeSplit");
+    if (!device->SynchronizeAllActivity()) {
+      return Internal(
+          "Failed to synchronize GPU before splitting communicators.");
+    }
+    VLOG(3) << "Synchronized device before splitting";
+  }
+
   // Current approach for communicator splitting works because of XLAs SPMD
   // programming model where all collective operations have replica groups that
   // include all ranks. This property guarantees that we'll split each
@@ -718,10 +729,16 @@ absl::StatusOr<std::shared_ptr<LockableGpuClique::Lock>> AcquireGpuClique(
 
   if (enable_nccl_comm_splitting) {
     for (auto& [acquired_clique_key, acquired_clique] : acquired_cliques) {
-      if (clique_key.IsSubsetOf(acquired_clique_key)) {
+      // If the participant group is empty, we won't know if there are other
+      // ranks involved in the split. Proceed to normal initialization.
+      if (clique_key.IsSubsetOf(acquired_clique_key) &&
+          !clique_key.ParticipantGroups().empty()) {
         return InitializeGpuClique(collectives, device, run_id, clique_key,
                                    acquired_clique, num_local_participants,
                                    rank, config);
+      } else if (clique_key.ParticipantGroups().empty()) {
+        LOG(WARNING) << "Found empty participant groups."
+                     << " Skip splitting communicators.";
       }
     }
   }
diff --git a/third_party/xla/xla/tests/collective_ops_e2e_test.cc b/third_party/xla/xla/tests/collective_ops_e2e_test.cc
index 872492ffecbdeb..0d68205cd6e6eb 100644
--- a/third_party/xla/xla/tests/collective_ops_e2e_test.cc
+++ b/third_party/xla/xla/tests/collective_ops_e2e_test.cc
@@ -2889,5 +2889,96 @@ INSTANTIATE_TEST_SUITE_P(
       return absl::StrCat(GetAsyncTestName(std::get<0>(info.param)), "_",
                           std::get<1>(info.param) ? "one_shot" : "nccl");
     });
+
+TEST_F(CollectiveOpsTestE2E, MultipleModuleDifferentDeviceGroupsShouldRun) {
+  const absl::string_view kModuleStr_1 = R"(
+  HloModule test
+
+  apply_op {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT apply_op = f32[] add(x, y)
+  }
+
+  ENTRY test_computation {
+    param_0 = f32[8] parameter(0)
+    ROOT all-reduce = f32[8] all-reduce(param_0), to_apply=apply_op, replica_groups={{0,1}}
+  }
+  )";
+  const absl::string_view kModuleStr_2 = R"(
+  HloModule test
+
+  apply_op {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    ROOT apply_op = f32[] add(x, y)
+  }
+
+  ENTRY test_computation {
+    param_0 = f32[8] parameter(0)
+    all-reduce.1 = f32[8] all-reduce(param_0), to_apply=apply_op, replica_groups={{0,1}, {2,3}}
+    all-reduce.2 = f32[8] all-reduce(all-reduce.1), to_apply=apply_op, replica_groups={{0,1}, {2,3}}
+    all-reduce.3 = f32[8] all-reduce(all-reduce.2), to_apply=apply_op, replica_groups={{0,1}, {2,3}}
+    ROOT all-reduce.4 = f32[8] all-reduce(all-reduce.3), to_apply=apply_op, replica_groups={{0,1,2,3}}
+  }
+  )";
+
+  const int64_t kNumReplicas_1 = 2;
+  const int64_t kNumReplicas_2 = 4;
+  if (hlo_runner_->device_count() < kNumReplicas_2) {
+    GTEST_SKIP() << "Test requires at least " << kNumReplicas_2 << " devices ("
+                 << hlo_runner_->device_count() << " available)";
+  }
+
+  HloModuleConfig config_1 =
+      GetModuleConfigForTest(/*replica_count=*/kNumReplicas_1);
+  HloModuleConfig config_2 =
+      GetModuleConfigForTest(/*replica_count=*/kNumReplicas_2);
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module_1,
+                          ParseAndReturnVerifiedModule(kModuleStr_1, config_1));
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module_2,
+                          ParseAndReturnVerifiedModule(kModuleStr_2, config_2));
+
+  int64_t num_elements_1 = ShapeUtil::ElementsIn(
+      module_1->entry_computation()->parameter_instructions()[0]->shape());
+
+  int64_t num_elements_2 = ShapeUtil::ElementsIn(
+      module_2->entry_computation()->parameter_instructions()[0]->shape());
+
+  Array<float> input1_1({num_elements_1}), input1_2({num_elements_1});
+  input1_1.FillRandom(1.0f, 10.0f, /*seed=*/0);
+  input1_2.FillRandom(1.0f, 10.0f, /*seed=*/1);
+
+  Literal input_literal1_1 = LiteralUtil::CreateFromArray(input1_1);
+  Literal input_literal1_2 = LiteralUtil::CreateFromArray(input1_2);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ExecutionResult execution_result_1,
+      ExecuteReplicated(std::move(module_1),
+                        std::vector<std::vector<Literal*>>{
+                            {&input_literal1_1}, {&input_literal1_2}}));
+
+  Array<float> input2_1({num_elements_2}), input2_2({num_elements_2}),
+      input2_3({num_elements_2}), input2_4({num_elements_2});
+  input2_1.FillRandom(1.0f, 10.0f, /*seed=*/0);
+  input2_2.FillRandom(1.0f, 10.0f, /*seed=*/1);
+  input2_3.FillRandom(1.0f, 10.0f, /*seed=*/2);
+  input2_4.FillRandom(1.0f, 10.0f, /*seed=*/3);
+
+  Literal input_literal2_1 = LiteralUtil::CreateFromArray(input2_1);
+  Literal input_literal2_2 = LiteralUtil::CreateFromArray(input2_2);
+  Literal input_literal2_3 = LiteralUtil::CreateFromArray(input2_3);
+  Literal input_literal2_4 = LiteralUtil::CreateFromArray(input2_4);
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      ExecutionResult execution_result_2,
+      ExecuteReplicated(std::move(module_2), std::vector<std::vector<Literal*>>{
+                                                 {&input_literal2_1},
+                                                 {&input_literal2_2},
+                                                 {&input_literal2_3},
+                                                 {&input_literal2_4}}));
+}
 }  // namespace
 }  // namespace xla

From 9ffcc63595660033d898815015df54a43041aa8b Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Mon, 15 Dec 2025 09:29:34 -0800
Subject: [PATCH 279/753] Add support for dot ops in YNN fusions

Currently we only have support for dots *or* fusions, this fixes that (but leaves the refactoring cleanup to a subsequent CL).

PiperOrigin-RevId: 844804682
---
 .../xla/xla/backends/cpu/ynn_emitter.cc       | 64 +++++++++++++++++++
 .../xla/xla/backends/cpu/ynn_support.cc       |  7 ++
 .../xla/xla/backends/cpu/ynn_support.h        |  1 +
 .../xla/xla/service/cpu/cpu_compiler.cc       |  6 +-
 .../cpu/parallel_task_assignment_test.cc      |  4 +-
 5 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.cc b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
index e70e7757577001..5229eadb70dbc9 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
@@ -246,6 +246,58 @@ static absl::StatusOr<uint32_t> DefineReduceOp(ynn_subgraph_t subgraph,
   return out;
 }
 
+static absl::StatusOr<uint32_t> DefineDotOp(ynn_subgraph_t subgraph,
+                                            TensorIdMap& tensor_ids,
+                                            const HloInstruction* instr) {
+  VLOG(3) << absl::StreamFormat("Define tensor value for dot op: %s",
+                                instr->ToString());
+  CHECK_EQ(instr->opcode(), HloOpcode::kDot);
+  const HloInstruction* lhs = instr->operand(0);
+  const HloInstruction* rhs = instr->operand(1);
+  CHECK_EQ(lhs->shape().element_type(), instr->shape().element_type());
+  CHECK_EQ(rhs->shape().element_type(), instr->shape().element_type());
+
+  TF_ASSIGN_OR_RETURN(auto lhs_id, FindTensorValue(tensor_ids, lhs));
+  TF_ASSIGN_OR_RETURN(auto rhs_id, FindTensorValue(tensor_ids, rhs));
+  TF_ASSIGN_OR_RETURN(auto output_id, DefineTensorValue(subgraph, instr));
+
+  const Shape& lhs_shape = lhs->shape();
+  const Shape& rhs_shape = rhs->shape();
+  const Shape& out_shape = instr->shape();
+
+  DotDimensionNumbers dot_dimensions = instr->dot_dimension_numbers();
+  TF_ASSIGN_OR_RETURN(DotShape dot_shape, GetDotShape(dot_dimensions, lhs_shape,
+                                                      rhs_shape, out_shape));
+
+  TF_ASSIGN_OR_RETURN(DotCanonicalDims dot_canonical_dims,
+                      GetDotCanonicalDims(dot_dimensions, dot_shape));
+
+  const size_t b_rank = rhs_shape.dimensions().size();
+  const bool transpose_b = !dot_canonical_dims.rhs_canonical;
+
+  if (transpose_b) {
+    uint32_t rhs_id_transposed = YNN_INVALID_VALUE_ID;
+    std::array<int32_t, YNN_MAX_TENSOR_RANK> perm;
+    absl::c_iota(perm, 0);
+    CHECK_LT(b_rank, YNN_MAX_TENSOR_RANK);
+    CHECK_GE(b_rank, 2);
+    std::swap(perm[b_rank - 1], perm[b_rank - 2]);
+    ynn_status status = ynn_define_static_transpose(
+        subgraph,
+        /*num_dims=*/b_rank, perm.data(), rhs_id, &rhs_id_transposed,
+        /*flags=*/0);
+    if (status != ynn_status_success) {
+      return status;
+    }
+    rhs_id = rhs_id_transposed;
+  }
+
+  YNN_RETURN_IF_ERROR(ynn_define_dot(subgraph, /*num_k_dims=*/1, lhs_id, rhs_id,
+                                     YNN_INVALID_VALUE_ID, &output_id,
+                                     /*flags=*/0));
+  return output_id;
+}
+
 //===----------------------------------------------------------------------===//
 // Emit YNNPACK subgraph for the given HLO computation.
 //===----------------------------------------------------------------------===//
@@ -320,6 +372,16 @@ static absl::StatusOr<YnnSubgraph> EmitYnnSubgraph(
                             DefineBitcastOp(subgraph.get(), tensor_ids, instr));
       } break;
 
+      case HloOpcode::kDot: {
+        if (!IsDotSupportedByYnn(instr).value_or(false)) {
+          return InvalidArgument(
+              "Unsupported dot instruction in YNN fusion: %s",
+              instr->ToString());
+        }
+        TF_ASSIGN_OR_RETURN(tensor_ids[instr],
+                            DefineDotOp(subgraph.get(), tensor_ids, instr));
+      } break;
+
       case HloOpcode::kReduce: {
         TF_ASSIGN_OR_RETURN(tensor_ids[instr],
                             DefineReduceOp(subgraph.get(), tensor_ids, instr));
@@ -432,6 +494,8 @@ static absl::StatusOr<YnnSubgraph> EmitYnnDotSubgraph(
     std::vector<std::unique_ptr<Literal>>& literals,
     absl::Span<const se::DeviceAddressBase> arguments_buffers,
     bool capture_rhs) {
+  // TODO(b/468895209): Use the fusion emitter above instead of replicating the
+  // logic here.
   TF_ASSIGN_OR_RETURN(
       YnnSubgraph subgraph, CreateYnnSubgraph([&](ynn_subgraph_t* subgraph) {
         return ynn_create_subgraph(
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index 4c8825d192aa8b..949748ad143071 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -218,6 +218,13 @@ absl::StatusOr<bool> IsDotSupportedByYnn(
   return true;
 }
 
+absl::StatusOr<bool> IsDotSupportedByYnn(const HloInstruction* hlo) {
+  CHECK_EQ(hlo->opcode(), HloOpcode::kDot);
+  return IsDotSupportedByYnn(hlo->dot_dimension_numbers(),
+                             hlo->operand(0)->shape(), hlo->operand(1)->shape(),
+                             hlo->shape());
+}
+
 bool IsReduceOpSupportedByYnn(const HloInstruction* hlo) {
   CHECK_EQ(hlo->opcode(), HloOpcode::kReduce);
   if (!YnnType(hlo->shape().element_type()).ok()) {
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.h b/third_party/xla/xla/backends/cpu/ynn_support.h
index f7352adfe4164b..4586126a617187 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.h
+++ b/third_party/xla/xla/backends/cpu/ynn_support.h
@@ -64,6 +64,7 @@ bool IsElementwiseOpSupportedByYnn(const HloInstruction* hlo);
 absl::StatusOr<bool> IsDotSupportedByYnn(
     const DotDimensionNumbers& dot_dimensions, const Shape& lhs_shape,
     const Shape& rhs_shape, const Shape& out_shape);
+absl::StatusOr<bool> IsDotSupportedByYnn(const HloInstruction* hlo);
 
 // Returns true if the reduce op is supported by YNNPACK.
 bool IsReduceOpSupportedByYnn(const HloInstruction* hlo);
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 19538cd5831074..540a2a6f1550fa 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -539,9 +539,9 @@ auto LibrarySupportsConvolution(
 
 auto LibrarySupportsDot(HloModule* module,
                         TargetMachineFeatures* target_machine_features) {
-  // TODO(b/406806134): Stop calling XNNPACK from regular Dot thunks. All XNN
-  // Dots should be wrapped in an `__xnn_fusion` fusion region and processed in
-  // `XnnFusionThunk`.
+  // TODO(b/468895209): Stop calling YNNPACK from regular Dot thunks. All YNN
+  // Dots should be wrapped in an `__ynn_fusion` fusion region and processed in
+  // `YnnFusionThunk`.
   const bool ynnpack_dot_enabled = absl::c_linear_search(
       module->config().debug_options().xla_cpu_experimental_ynn_fusion_type(),
       DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_DOT);
diff --git a/third_party/xla/xla/service/cpu/parallel_task_assignment_test.cc b/third_party/xla/xla/service/cpu/parallel_task_assignment_test.cc
index bf77a1729a4e54..ff6a8a4c99b947 100644
--- a/third_party/xla/xla/service/cpu/parallel_task_assignment_test.cc
+++ b/third_party/xla/xla/service/cpu/parallel_task_assignment_test.cc
@@ -247,7 +247,7 @@ TEST_F(ParallelTaskAssignmentTest, ConstantNotParallelized) {
 
 TEST_F(ParallelTaskAssignmentTest, CustomFusionUnchanged) {
   constexpr absl::string_view hlo_string = R"(
-HloModule jit_xnn_bin_ops
+HloModule jit_ynn_bin_ops
 
 fused_computation (matrix_a: f32[1000,1000], matrix_b: f32[1000,1000]) -> f32[1000,1000] {
   matrix_a = f32[1000,1000] parameter(0)
@@ -260,7 +260,7 @@ fused_computation (matrix_a: f32[1000,1000], matrix_b: f32[1000,1000]) -> f32[10
 ENTRY main (input_x: f32[1000,1000], input_y: f32[1000,1000]) -> f32[1000,1000] {
   input_x = f32[1000,1000] parameter(0)
   input_y = f32[1000,1000] parameter(1)
-  ROOT fused_result = f32[1000,1000] fusion(input_x, input_y), kind=kCustom, calls=fused_computation, backend_config={"outer_dimension_partitions":[],"fusion_config":{"kind":"__xnn_fusion"}}
+  ROOT fused_result = f32[1000,1000] fusion(input_x, input_y), kind=kCustom, calls=fused_computation, backend_config={"outer_dimension_partitions":[],"fusion_config":{"kind":"__ynn_fusion"}}
 }
 )";
 

From 858944ab6c092964c5c9c63d9341c144560db874 Mon Sep 17 00:00:00 2001
From: Zviki Nozadze <zviki@google.com>
Date: Mon, 15 Dec 2025 10:12:39 -0800
Subject: [PATCH 280/753] Test for HLO module splitting and linking

Adds end-to-end execution tests for HLO module splitting and linking on both CPU and TPU backends. These tests verify that splitting a module and then linking it back produces a module that is semantically equivalent to the original, by executing both versions and comparing their outputs for numerical consistency.

PiperOrigin-RevId: 844821926
---
 .../xla/xla/hlo/separate_compilation/hlo_module_linking.cc      | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking.cc b/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking.cc
index 82cf884716c34e..1e40fbf64e9f66 100644
--- a/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking.cc
+++ b/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking.cc
@@ -202,6 +202,8 @@ absl::StatusOr<std::unique_ptr<HloModule>> LinkComputation(
   TF_ASSIGN_OR_RETURN(HloComputation * linked_clone_ptr, linker.Link());
 
   linked_module->ReplaceEntryComputation(linked_clone_ptr);
+  linked_module->mutable_config().SetComputationLayoutIfExists(
+      linked_clone_ptr->ComputeProgramShape());
   xla::HloDCE dce_pass;
   TF_RETURN_IF_ERROR(dce_pass.Run(linked_module.get()).status());
 

From 707b6623cee3cbecff53309030ee958a730bb23c Mon Sep 17 00:00:00 2001
From: Marissa Ikonomidis <marissaw@google.com>
Date: Mon, 15 Dec 2025 10:19:55 -0800
Subject: [PATCH 281/753] Error if hint flag is incorrectly set

Users can provide a flag to tell LiteRT that their subgraph
will be fully delegated to a single delegate. Detect if
the flag is incorrectly set and warn the user.

PiperOrigin-RevId: 844824810
---
 tensorflow/lite/core/subgraph.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
index 5b0e15d4515ffa..4d28de5a21ca2a 100644
--- a/tensorflow/lite/core/subgraph.cc
+++ b/tensorflow/lite/core/subgraph.cc
@@ -2536,6 +2536,13 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegateImpl(TfLiteDelegate* delegate) {
   SwitchToKernelContext();
   TF_LITE_ENSURE_STATUS(reset_delegation_if_not_ok(status));
 
+  if (hint_fully_delegated_to_single_delegate && !IsFullyDelegated()) {
+    ReportError(
+        "Hint fully delegated to single delegate is set, but the graph is not "
+        "fully delegated.");
+    return kTfLiteApplicationError;
+  }
+
   // STEP 3: Leave graph in consistent state based on delegate & previous state.
   // ===========================================================================
 

From 29f16c487d4f1a4d6732dd7e91519cf3ac7a6867 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Mon, 15 Dec 2025 10:25:11 -0800
Subject: [PATCH 282/753] [XLA] AllGather cancellation with Dynamic slice was
 too permissive if the dynamic slice size did not match the input size of the
 all-gather.

PiperOrigin-RevId: 844826833
---
 third_party/xla/xla/service/BUILD             |  1 +
 .../xla/xla/service/all_gather_simplifier.cc  | 14 +++++++---
 .../xla/service/all_gather_simplifier_test.cc | 26 +++++++++++++++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 2fc73006ca32d4..52716071f9a4ec 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -2489,6 +2489,7 @@ cc_library(
         ":collective_opt_utils",
         ":hlo_module_config",
         "//xla:shape_util",
+        "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
diff --git a/third_party/xla/xla/service/all_gather_simplifier.cc b/third_party/xla/xla/service/all_gather_simplifier.cc
index d1e6f1a285f32a..479784ee969a9d 100644
--- a/third_party/xla/xla/service/all_gather_simplifier.cc
+++ b/third_party/xla/xla/service/all_gather_simplifier.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "xla/service/hlo_module_config.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/platform/errors.h"
+#include "xla/util.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
@@ -61,11 +62,18 @@ absl::StatusOr<bool> AllGatherSimplifier::RunImpl(
                 HloPredicateIsOp<HloOpcode::kReplicaId>);
         if (spec.has_value() &&
             spec->split_dim == all_gather->all_gather_dimension()) {
-          changed = true;
           CHECK_EQ(all_gather->users().size(), 1);
           HloInstruction* ds = all_gather->users().front();
-          TF_RETURN_IF_ERROR(
-              ds->ReplaceAllUsesWith(all_gather->mutable_operand(0)));
+          HloInstruction* ag_operand = all_gather->mutable_operand(0);
+          if (!ShapeUtil::Compatible(ds->shape(), ag_operand->shape())) {
+            ag_operand = ag_operand->AddInstruction(HloInstruction::CreateSlice(
+                ds->shape(), ag_operand,
+                DimensionVector(ds->shape().dimensions().size(), 0),
+                ds->shape().dimensions(),
+                DimensionVector(ds->shape().dimensions().size(), 1)));
+          }
+          changed = true;
+          TF_RETURN_IF_ERROR(ds->ReplaceAllUsesWith(ag_operand));
           TF_RETURN_IF_ERROR(
               computation->RemoveInstructionAndUnusedOperands(ds));
         }
diff --git a/third_party/xla/xla/service/all_gather_simplifier_test.cc b/third_party/xla/xla/service/all_gather_simplifier_test.cc
index 13cb631ea38f32..928b749d9b2ba4 100644
--- a/third_party/xla/xla/service/all_gather_simplifier_test.cc
+++ b/third_party/xla/xla/service/all_gather_simplifier_test.cc
@@ -59,5 +59,31 @@ test {
               GmockMatch(m::Add(m::Parameter(0), m::Parameter(1))));
 }
 
+TEST_F(AllGatherSimplifierTest, DoesNotReplaceIfInputShapeMismatch) {
+  const absl::string_view kModuleStr = R"(
+HloModule m
+
+test {
+  p0 = f32[1, 5920, 4, 2304] parameter(0)
+  all-gather = f32[1, 23680, 4, 2304] all-gather(p0), replica_groups={{0, 1, 2, 3}}, dimensions={1}, use_global_device_ids=true, channel_id=1
+  replica-id = u32[] replica-id()
+  table = s32[4] constant({0, 5520, 11040, 16560})
+  ds_index = s32[1] dynamic-slice(table, replica-id), dynamic_slice_sizes={1}
+  reshape = s32[] reshape(ds_index)
+  zero = s32[] constant(0)
+  ROOT dynamic-slice = f32[1, 5520, 4, 2304] dynamic-slice(all-gather, zero, reshape, zero, zero), dynamic_slice_sizes={1, 5520, 4, 2304}
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(
+                                           kModuleStr, /*replica_count=*/4));
+  module->mutable_config().set_use_spmd_partitioning(true);
+  AllGatherSimplifier ag_simplifier;
+  auto result = ag_simplifier.Run(module.get());
+  ASSERT_TRUE(result.ok()) << result.status();
+  ASSERT_TRUE(result.value());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
+}
+
 }  // namespace
 }  // namespace xla

From 0b09cda9f622e2fe0968d8b7882534e6b217973f Mon Sep 17 00:00:00 2001
From: Misha Gutman <aelphy@google.com>
Date: Mon, 15 Dec 2025 11:23:02 -0800
Subject: [PATCH 283/753] Updated XNNPACK version.

PiperOrigin-RevId: 844851639
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake | 2 +-
 tensorflow/workspace2.bzl                         | 6 +++---
 third_party/xla/third_party/xnnpack/workspace.bzl | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index 9fde94f7b1a847..ecbbf91866a8c0 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG e436865104ef12ff872db68ec94ce1c5332a6ecb
+  GIT_TAG 77468446ebfd9baab7fc4349c32608c9675cf6d9
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index a8db5fd1117c1e..0b42230ec0f651 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,9 +168,9 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "f855387f6c4e7db5facdcd83fc41bc94b1888239b396e055ba48dc6da9d89446",
-        strip_prefix = "XNNPACK-e436865104ef12ff872db68ec94ce1c5332a6ecb",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/e436865104ef12ff872db68ec94ce1c5332a6ecb.zip"),
+        sha256 = "a89879422c6da8240cffb8ff67f5cd11f0362cb2a174ee9cd96b450e53902ca3",
+        strip_prefix = "XNNPACK-77468446ebfd9baab7fc4349c32608c9675cf6d9",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/77468446ebfd9baab7fc4349c32608c9675cf6d9.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index 6bf3c0ec9ef322..129071d9fa793f 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "f855387f6c4e7db5facdcd83fc41bc94b1888239b396e055ba48dc6da9d89446",
-        strip_prefix = "XNNPACK-e436865104ef12ff872db68ec94ce1c5332a6ecb",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/e436865104ef12ff872db68ec94ce1c5332a6ecb.zip"),
+        sha256 = "a89879422c6da8240cffb8ff67f5cd11f0362cb2a174ee9cd96b450e53902ca3",
+        strip_prefix = "XNNPACK-77468446ebfd9baab7fc4349c32608c9675cf6d9",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/77468446ebfd9baab7fc4349c32608c9675cf6d9.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)

From 624ed913976487817d590f21e5a0ccabf0e6bcd7 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Mon, 15 Dec 2025 11:39:32 -0800
Subject: [PATCH 284/753] Remove out-of bounds dimensions before
 `hlo_sharding_util::PartiallyReplicateTiledShardingOnDims`.

A preparation for cl/842153305.

PiperOrigin-RevId: 844858313
---
 third_party/xla/xla/service/spmd/dot_handler.cc | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/spmd/dot_handler.cc b/third_party/xla/xla/service/spmd/dot_handler.cc
index ddc9012283166d..e9739de81ca05d 100644
--- a/third_party/xla/xla/service/spmd/dot_handler.cc
+++ b/third_party/xla/xla/service/spmd/dot_handler.cc
@@ -2595,9 +2595,19 @@ absl::StatusOr<HloInstruction*> PartitionDotGroupOnNonContractingImpl(
     if (!other.sharding().ReplicateOnLastTileDim() || !device_group_match) {
       other = other.Reshard(target_sharding);
     }
+
+    DimensionVector dims_to_replicate = other_grouped->group_dims;
+    for (auto it = dims_to_replicate.begin(); it != dims_to_replicate.end();) {
+      if (*it >= other.base_shape().dimensions().size()) {
+        it = dims_to_replicate.erase(it);
+      } else {
+        ++it;
+      }
+    }
+
     partially_replicated_other =
         other.Reshard(hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(
-            other.sharding(), other_grouped->group_dims));
+            other.sharding(), dims_to_replicate));
     top_level_sharding_to_reset.emplace_back(
         partially_replicated_other, partially_replicated_other.sharding());
     partially_replicated_other.set_sharding(other_grouped->sharding);

From deb256b360c624f8d476d888eb5dc1f902210fac Mon Sep 17 00:00:00 2001
From: Nikita Putikhin <nputikhin@google.com>
Date: Mon, 15 Dec 2025 11:41:55 -0800
Subject: [PATCH 285/753] [XLA:GPU] Fuse more slices into GeMMs with K<1024

Fusing slices with the consumer is potentially beneficial. However split-K GeMM rewriter does not support sliced contracting dimension, so we restrict the fusion to cases where we have small K.

PiperOrigin-RevId: 844859324
---
 .../gpu/transforms/gemm_fusion_test.cc        | 52 +++++++++++++++++++
 .../xla/service/gpu/triton_fusion_analysis.cc | 30 ++++++++---
 .../service/gpu/triton_tiling_propagation.cc  | 28 ++++++++--
 .../service/gpu/triton_tiling_propagation.h   |  2 +
 4 files changed, 102 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
index c77d76d6954aed..8036bc09ab2ce6 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
@@ -227,6 +227,58 @@ ENTRY e {
   EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
 }
 
+TEST_F(GemmFusionTest, FuseSliceWithOtherUsersWhenDotHasSmallK) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = bf16[512,3584]{1,0} parameter(0)
+  p1 = bf16[3584,14400]{0,1} parameter(1)
+  p2 = bf16[64,14336]{1,0} parameter(2)
+
+  d0 = bf16[512,14400]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  sl0 = bf16[512,14336]{1,0} slice(d0), slice={[0:512], [0:14336]}
+
+  sl1 = bf16[512,64]{1,0} slice(d0), slice={[0:512], [14336:14400]}
+  d1 = bf16[512,14336]{1,0} dot(sl1, p2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT a0 = bf16[512,14336]{1,0} add(sl0, d1)
+})"));
+
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::kHopper, 0};
+  EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
+
+  // Check that the second dot is fused and the fusion contains sl1.
+  // We make no assumptions about other fusions.
+  constexpr absl::string_view kExpectedHloText = R"(
+    CHECK: %[[FUSION_DOT:.*]] (
+    CHECK:   %[[SLICE:.*]] = bf16[512,64]{1,0} slice(%parameter_0), slice={[0:512], [14336:14400]}
+    CHECK:   ROOT {{.*}} = bf16[512,14336]{1,0} dot(%[[SLICE]], %parameter_1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    CHECK: ENTRY
+    CHECK-DAG: %[[FUSION_D1:.*]] = bf16[512,14336]{1,0} fusion({{.*}}, {{.*}}), kind=kCustom, calls=%[[FUSION_DOT]]
+    CHECK-DAG: ROOT %a0 = bf16[512,14336]{1,0} add({{.*}}, %[[FUSION_D1]])
+  )";
+  MatchHloModule(*module, kExpectedHloText);
+}
+
+TEST_F(GemmFusionTest, DoNotFuseSliceWithOtherUsersWhenDotHasLargeK) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = bf16[512,3584]{1,0} parameter(0)
+  p1 = bf16[3584,14400]{0,1} parameter(1)
+  p2 = bf16[1400,14336]{1,0} parameter(2)
+
+  d0 = bf16[512,14400]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  sl0 = bf16[512,14336]{1,0} slice(d0), slice={[0:512], [0:14336]}
+  sl1 = bf16[512,1400]{1,0} slice(d0), slice={[0:512], [13000:14400]}
+
+  d1 = bf16[512,14336]{1,0} dot(sl1, p2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT a0 = bf16[512,14336]{1,0} add(sl0, d1)
+})"));
+
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::kHopper, 0};
+  EXPECT_FALSE(GemmFusion(cc).Run(module.get()).value());
+}
+
 TEST_F(GemmFusionTest, DoNotFuseSliceOfMixedDimensions) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
                           ParseAndReturnVerifiedModule(R"(
diff --git a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
index cfc47a333955d0..9f7fbcba0aa98b 100644
--- a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
@@ -57,6 +57,16 @@ using triton_fusion::GetPropagatedDimOrdersAndRequirements;
 using triton_fusion::kNoSplitRequirement;
 using triton_fusion::TransformDirection;
 
+int64_t GetContractingDimSize(const HloInstruction& dot) {
+  const auto& contracting_dims =
+      ContractingDimensionsForOperand(dot, /*operand_number=*/0);
+  int64_t contracting_dim_size = 1;
+  for (int64_t dim : contracting_dims) {
+    contracting_dim_size *= dot.operand(0)->shape().dimensions(dim);
+  }
+  return contracting_dim_size;
+}
+
 }  // namespace
 
 namespace triton_fusion {
@@ -81,9 +91,13 @@ namespace triton_fusion {
           0) {
     splittable_dimension_index = non_contracting_dimension_index;
   }
-  FusionContext context(DotProperties{non_contracting_dimension_index,
-                                      splittable_dimension_index},
-                        DotRequirements(kNoSplitRequirement));
+
+  int64_t contracting_size = GetContractingDimSize(dot);
+
+  FusionContext context(
+      DotProperties{non_contracting_dimension_index, splittable_dimension_index,
+                    contracting_size},
+      DotRequirements(kNoSplitRequirement));
   context.dim_orders_[dot.operand(operand_number)] =
       DimensionOrder::FromDotOperandOrOutput(*dot.operand(operand_number),
                                              split_k_dimension_index);
@@ -102,9 +116,13 @@ namespace triton_fusion {
     // LHS non-contracting follows (batch is absent in this case).
     splittable_dimension_index = (split_k > 1) ? 1 : 0;
   }
-  FusionContext context(DotProperties{/*noncontracting_dimension=*/-1,
-                                      splittable_dimension_index},
-                        std::move(requirements));
+
+  int64_t contracting_size = GetContractingDimSize(dot);
+
+  FusionContext context(
+      DotProperties{/*noncontracting_dimension=*/-1, splittable_dimension_index,
+                    contracting_size},
+      std::move(requirements));
   context.dim_orders_[&dot] = DimensionOrder::FromDotOperandOrOutput(dot);
   return context;
 }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
index 926307d6f0c0e7..24b5e4879240c4 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
@@ -992,11 +992,16 @@ bool CanNotBeFusedIntoAUser(const HloInstruction& hlo) {
                           hlo.users()[0]->opcode() == HloOpcode::kTuple);
 }
 
+// Maximum contracting dimension size for which slice fusion is allowed when
+// the operand has multiple users.
+constexpr int kMaxContractingDimSizeForSliceFusion = 1024;
+
 // Let input and output data volumes of a fusion grow by small amounts.
 constexpr int kIoToleranceBytes = 1024;
 
 // Tells that fusing an instruction as an input is efficient.
-bool IsInputWorthFusing(const HloInstruction& hlo) {
+bool IsInputWorthFusing(const HloInstruction& hlo,
+                        const DotProperties& properties) {
   std::optional<int64_t> input_minus_output_bytes = InputMinusOutputBytes(hlo);
   if (!input_minus_output_bytes.has_value()) {
     return false;
@@ -1011,6 +1016,21 @@ bool IsInputWorthFusing(const HloInstruction& hlo) {
       hlo_query::AllOperandsAreParametersOrConstants(hlo)) {
     return true;
   }
+  // Explanation:
+  // * Operand user count > 1 - if the producer of the slice has a single user
+  //   the slice can be fused into the producer instead of here.
+  // * contracting_dim_size < 1024 - fusing slices disables split-K rewriter,
+  //   which may outweigh the benefit of fusing it in the first place. Small
+  //   contracting dimension almost never benefits from splitting it, so we
+  //   allow the fusion.
+
+  // TODO: b/393299275 - Remove the contracting dim size restriction once the
+  // new emitter lands and we can support slices in contracting dimension with
+  // splits.
+  if (hlo.opcode() == HloOpcode::kSlice && hlo.operand(0)->user_count() > 1 &&
+      properties.contracting_dim_size <= kMaxContractingDimSizeForSliceFusion) {
+    return true;
+  }
   const bool enable_subchannel_dequantisation_fusion =
       hlo.GetModule()
           ->config()
@@ -1018,8 +1038,8 @@ bool IsInputWorthFusing(const HloInstruction& hlo) {
           .xla_gpu_experimental_enable_subchannel_dequantisation_fusion();
   if (hlo.opcode() == HloOpcode::kMultiply) {
     return enable_subchannel_dequantisation_fusion &&
-           IsInputWorthFusing(*hlo.operand(0)) &&
-           IsInputWorthFusing(*hlo.operand(1));
+           IsInputWorthFusing(*hlo.operand(0), properties) &&
+           IsInputWorthFusing(*hlo.operand(1), properties);
   }
   return hlo_query::AllOperandsAreParametersOrConstantsWithSingleUser(hlo);
 }
@@ -1139,7 +1159,7 @@ GetPropagatedDimOrdersAndRequirementsIfProfitablyFusible(
         }
       }
     }
-    if (!accepted && !IsInputWorthFusing(hlo)) {
+    if (!accepted && !IsInputWorthFusing(hlo, properties)) {
       return FusionDecision::Forbid(
           "Not obviously profitable to fuse as input.");
     }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
index a83dd9c976f8c4..df09b35a1f0ffc 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
@@ -249,6 +249,8 @@ struct DotProperties {
   // Index of dot dimension that can be split.
   // Currently typically LHS non-contracting one.
   const int splittable_dimension_index;
+  // Size of the contracting dimension (K).
+  const int64_t contracting_dim_size;
 };
 
 // A special value for splittable_dimension_major_part_size.

From f3ec010de9577c6f6709d96e7a3270b3f10568eb Mon Sep 17 00:00:00 2001
From: Gregory Pataky <gregpataky@google.com>
Date: Mon, 15 Dec 2025 11:50:20 -0800
Subject: [PATCH 286/753] Add F4 and F8 types to xla namespace

Allow specifying these types from `xla::` namespace instead of `tsl::` or other.

PiperOrigin-RevId: 844862727
---
 third_party/xla/xla/types.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/third_party/xla/xla/types.h b/third_party/xla/xla/types.h
index 89ad42ea911609..eb2b6ae8384f45 100644
--- a/third_party/xla/xla/types.h
+++ b/third_party/xla/xla/types.h
@@ -28,6 +28,17 @@ limitations under the License.
 
 namespace xla {
 
+using ::tsl::float4_e2m1fn;  // NOLINT(misc-unused-using-decls)
+
+using ::tsl::float8_e3m4;         // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e4m3;         // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e4m3b11fnuz;  // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e4m3fn;       // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e4m3fnuz;     // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e5m2;         // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e5m2fnuz;     // NOLINT(misc-unused-using-decls)
+using ::tsl::float8_e8m0fnu;      // NOLINT(misc-unused-using-decls)
+
 using ::Eigen::bfloat16;  // NOLINT(misc-unused-using-decls)
 using ::Eigen::half;      // NOLINT(misc-unused-using-decls)
 

From 46151f082e5d165c998639a1187d9eb04447e900 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Mon, 15 Dec 2025 11:53:40 -0800
Subject: [PATCH 287/753] Add an option to use a custom host memory allocator
 for PjRt GPU

PiperOrigin-RevId: 844863896
---
 tensorflow/core/common_runtime/gpu/BUILD      |  1 +
 .../core/common_runtime/gpu/gpu_device.cc     |  7 +-
 tensorflow/core/tfrt/common/BUILD             |  1 +
 tensorflow/core/tfrt/common/pjrt_state.h      |  3 +-
 third_party/xla/xla/pjrt/BUILD                | 12 +++
 third_party/xla/xla/pjrt/gpu/BUILD            |  1 +
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    | 37 ++++++++-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  3 +-
 third_party/xla/xla/pjrt/gpu/tfrt/BUILD       | 10 +--
 .../xla/pjrt/gpu/tfrt/host_memory_allocator.h | 46 -----------
 ...u_async_host_to_device_transfer_manager.cc |  2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc  |  2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc  | 38 ++++++++--
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h   |  6 +-
 .../xla/xla/pjrt/host_memory_allocator.cc     | 45 +++++++++++
 .../xla/xla/pjrt/host_memory_allocator.h      | 76 +++++++++++++++++++
 .../xla/pjrt/pjrt_stream_executor_client.cc   | 14 ++--
 .../xla/pjrt/pjrt_stream_executor_client.h    |  7 +-
 third_party/xla/xla/pjrt/plugin/xla_gpu/BUILD |  1 +
 .../plugin/xla_gpu/xla_gpu_client_options.h   |  5 ++
 third_party/xla/xla/pjrt/se_raw_buffer.cc     | 27 ++-----
 21 files changed, 239 insertions(+), 105 deletions(-)
 delete mode 100644 third_party/xla/xla/pjrt/gpu/tfrt/host_memory_allocator.h
 create mode 100644 third_party/xla/xla/pjrt/host_memory_allocator.cc
 create mode 100644 third_party/xla/xla/pjrt/host_memory_allocator.h

diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD
index dcc9bd60f08c21..6383fc562d55f2 100644
--- a/tensorflow/core/common_runtime/gpu/BUILD
+++ b/tensorflow/core/common_runtime/gpu/BUILD
@@ -205,6 +205,7 @@ tf_cuda_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
         "@local_xla//xla:shape_util",
+        "@local_xla//xla/pjrt:host_memory_allocator",
         "@local_xla//xla/stream_executor/gpu:gpu_init_impl",
         "@local_xla//xla/stream_executor/integrations:stream_executor_allocator",
         "@local_xla//xla/tsl/framework:device_id_utils",
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 22eecde5ba7d8a..f40fd04472700c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 // TODO(opensource): Use a more generic sounding preprocessor name than
 // GOOGLE_CUDA
+#include "xla/pjrt/host_memory_allocator.h"
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 
@@ -1880,8 +1881,10 @@ Status BaseGPUDeviceFactory::CreateDevices(
     // TODO(chuanhao): Use the correct NUMA_NODE.
     const int64_t numa_node = 0;
 
-    std::unique_ptr<tsl::Allocator> pjrt_gpu_host_allocator(
-        process_state->GetGpuHostAllocator(/*options=*/{}, numa_node));
+    auto pjrt_gpu_host_allocator =
+        std::make_unique<xla::BasicHostMemoryAllocator>(
+            std::unique_ptr<tsl::Allocator>(
+                process_state->GetGpuHostAllocator(/*options=*/{}, numa_node)));
 
     if (populate_pjrt_gpu_client_creation_info &&
         !should_create_new_pjrt_client) {
diff --git a/tensorflow/core/tfrt/common/BUILD b/tensorflow/core/tfrt/common/BUILD
index 5658c7db2ca6bb..571caba934dfe7 100644
--- a/tensorflow/core/tfrt/common/BUILD
+++ b/tensorflow/core/tfrt/common/BUILD
@@ -106,6 +106,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@local_tsl//tsl/platform:statusor",
         "@local_xla//xla/client:local_client",
+        "@local_xla//xla/pjrt:host_memory_allocator",
         "@local_xla//xla/pjrt:local_device_state",
         "@local_xla//xla/pjrt:pjrt_client",
         "@local_xla//xla/pjrt:tf_pjrt_client",
diff --git a/tensorflow/core/tfrt/common/pjrt_state.h b/tensorflow/core/tfrt/common/pjrt_state.h
index 0c6f78cfd82ba8..3da5fb930a9e1b 100644
--- a/tensorflow/core/tfrt/common/pjrt_state.h
+++ b/tensorflow/core/tfrt/common/pjrt_state.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "xla/client/local_client.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/local_device_state.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/stream_executor/integrations/tf_allocator_adapter.h"
@@ -44,7 +45,7 @@ using PjRtClientsMap = std::map<DeviceType, std::unique_ptr<xla::PjRtClient>>;
 struct PjRtGpuClientCreationInfo {
   std::set<int> allowed_devices;
   std::unique_ptr<se::MultiDeviceAdapter> allocator;
-  std::unique_ptr<tsl::Allocator> host_memory_allocator;
+  std::unique_ptr<xla::HostMemoryAllocator> host_memory_allocator;
   std::map<int, std::unique_ptr<xla::LocalDeviceState>> local_device_states;
   xla::LocalClient* local_client;
 };
diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 954225f6a10310..1821bf0aff7057 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -671,6 +671,7 @@ cc_library(
         ":device_event",
         ":event_pool",
         ":host_callback",
+        ":host_memory_allocator",
         ":host_memory_spaces",
         ":layout_mode",
         ":local_device_state",
@@ -1400,6 +1401,17 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "host_memory_allocator",
+    srcs = ["host_memory_allocator.cc"],
+    hdrs = ["host_memory_allocator.h"],
+    deps = [
+        "//xla/tsl/framework:allocator",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/status",
+    ],
+)
+
 xla_cc_test(
     name = "errors_test",
     srcs = ["errors_test.cc"],
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index 9c8ff6f6b206d2..be2ae8bb7e80e1 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -85,6 +85,7 @@ cc_library(
         "//xla/pjrt:common_pjrt_client",
         "//xla/pjrt:device_event",
         "//xla/pjrt:event_pool",
+        "//xla/pjrt:host_memory_allocator",
         "//xla/pjrt:host_memory_spaces",
         "//xla/pjrt:local_device_state",
         "//xla/pjrt:mlir_to_hlo",
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index c2e23e6cd3a796..b12dcf160cfdbc 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -76,6 +76,7 @@ limitations under the License.
 #include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/host_memory_spaces.h"
 #include "xla/pjrt/host_to_device_transfer_manager.h"
 #include "xla/pjrt/local_device_state.h"
@@ -206,7 +207,7 @@ StreamExecutorGpuClient::StreamExecutorGpuClient(
     std::string platform_name, LocalClient* client,
     std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
     int process_index, std::unique_ptr<se::DeviceAddressAllocator> allocator,
-    std::unique_ptr<tsl::Allocator> host_memory_allocator,
+    std::unique_ptr<HostMemoryAllocator> host_memory_allocator,
     bool should_stage_host_to_device_transfers,
     std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
     std::shared_ptr<KeyValueStoreInterface> kv_store,
@@ -1790,9 +1791,37 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
                       GetStreamExecutorGpuDeviceAllocator(
                           xla_client->platform(), options.allocator_config,
                           local_device_states));
-  TF_ASSIGN_OR_RETURN(
-      auto host_memory_allocator,
-      GetGpuHostAllocator(local_device_states.begin()->second->executor()));
+  std::unique_ptr<HostMemoryAllocator> host_memory_allocator;
+  if (options.host_memory_allocator_factory != nullptr) {
+    stream_executor::StreamExecutor* const stream_executor =
+        local_device_states.begin()->second->compute_stream()->parent();
+    HostMemoryAllocator::Options allocator_options;
+    allocator_options.alignment = tsl::Allocator::kAllocatorAlignment;
+    allocator_options.map_fn = [stream_executor](void* data, size_t size) {
+      bool success = stream_executor->HostMemoryRegister(data, size);
+      if (!success) {
+        return absl::InternalError(absl::StrFormat(
+            "Failed to register host memory at address: %ps", data));
+      }
+      return absl::OkStatus();
+    };
+    allocator_options.unmap_fn = [stream_executor](void* data) {
+      bool success = stream_executor->HostMemoryUnregister(data);
+      if (!success) {
+        return absl::InternalError(absl::StrFormat(
+            "Failed to unregister host memory at address: %ps", data));
+      }
+      return absl::OkStatus();
+    };
+    host_memory_allocator =
+        options.host_memory_allocator_factory(allocator_options);
+  } else {
+    TF_ASSIGN_OR_RETURN(
+        auto allocator,
+        GetGpuHostAllocator(local_device_states.begin()->second->executor()));
+    host_memory_allocator = std::make_unique<BasicHostMemoryAllocator>(
+        std::move(allocator), tsl::Allocator::kAllocatorAlignment);
+  }
 
   auto gpu_run_options = std::make_unique<gpu::GpuExecutableRunOptions>();
   if (options.enable_mock_nccl) {
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index c56d5757a3c929..b65f9a7f4af02a 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/local_device_state.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
@@ -110,7 +111,7 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
       std::string platform_name, LocalClient* client,
       std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
       int process_index, std::unique_ptr<se::DeviceAddressAllocator> allocator,
-      std::unique_ptr<tsl::Allocator> host_memory_allocator,
+      std::unique_ptr<HostMemoryAllocator> host_memory_allocator,
       bool should_stage_host_to_device_transfers,
       std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
       std::shared_ptr<KeyValueStoreInterface> kv_store,
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
index 7cb5b0892be377..04d720161c86b5 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
@@ -37,7 +37,6 @@ cc_library(
     visibility = internal_visibility(["//xla/pjrt/gpu:legacy_gpu_client_users"]),
     deps = [
         ":gpu_event",
-        ":host_memory_allocator",
         ":tracked_gpu_device_buffer",
         "//xla:debug_options_flags",
         "//xla:executable_run_options",
@@ -64,6 +63,7 @@ cc_library(
         "//xla/pjrt:abstract_tracked_device_buffer",
         "//xla/pjrt:device_event",
         "//xla/pjrt:host_callback",
+        "//xla/pjrt:host_memory_allocator",
         "//xla/pjrt:host_memory_spaces",
         "//xla/pjrt:layout_mode",
         "//xla/pjrt:mlir_to_hlo",
@@ -356,11 +356,3 @@ xla_cc_test(
         "@local_tsl//tsl/platform:casts",
     ],
 )
-
-cc_library(
-    name = "host_memory_allocator",
-    hdrs = ["host_memory_allocator.h"],
-    deps = [
-        "//xla/tsl/framework:allocator",
-    ],
-)
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/host_memory_allocator.h b/third_party/xla/xla/pjrt/gpu/tfrt/host_memory_allocator.h
deleted file mode 100644
index cef01b496e48ee..00000000000000
--- a/third_party/xla/xla/pjrt/gpu/tfrt/host_memory_allocator.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef XLA_PJRT_GPU_TFRT_HOST_MEMORY_ALLOCATOR_H_
-#define XLA_PJRT_GPU_TFRT_HOST_MEMORY_ALLOCATOR_H_
-
-#include <cstddef>
-#include <functional>
-#include <memory>
-#include <utility>
-
-#include "xla/tsl/framework/allocator.h"
-
-namespace xla {
-class HostMemoryAllocator {
- public:
-  explicit HostMemoryAllocator(std::unique_ptr<tsl::Allocator> allocator)
-      : allocator_(std::move(allocator)) {}
-
-  // Uses tsl::Allocator destructor as the deleter for owned pointer.
-  using OwnedPtr = std::unique_ptr<void, std::function<void(void*)>>;
-  OwnedPtr Allocate(size_t size) {
-    if (size == 0) return OwnedPtr(nullptr, [](void* ptr) {});
-    return OwnedPtr(
-        allocator_->AllocateRaw(tsl::Allocator::kAllocatorAlignment, size),
-        [this](void* ptr) { allocator_->DeallocateRaw(ptr); });
-  }
-
- private:
-  std::unique_ptr<tsl::Allocator> allocator_;
-};
-}  // namespace xla
-
-#endif  // XLA_PJRT_GPU_TFRT_HOST_MEMORY_ALLOCATOR_H_
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
index 44031d7249faa5..3f987409f0b21f 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
@@ -40,11 +40,11 @@ limitations under the License.
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
-#include "xla/pjrt/gpu/tfrt/host_memory_allocator.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
 #include "xla/pjrt/gpu/tfrt/utils.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
index 6c043547ed5e60..1aceca82c59b84 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
@@ -41,10 +41,10 @@ limitations under the License.
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
-#include "xla/pjrt/gpu/tfrt/host_memory_allocator.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
 #include "xla/pjrt/gpu/tfrt/utils.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/host_memory_spaces.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
index f63c05f5aa8b77..d36b6acc5510ed 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
@@ -63,12 +63,12 @@ limitations under the License.
 #include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
-#include "xla/pjrt/gpu/tfrt/host_memory_allocator.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_executable.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
 #include "xla/pjrt/gpu/tfrt/utils.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/host_memory_spaces.h"
 #include "xla/pjrt/layout_mode.h"
 #include "xla/pjrt/mlir_to_hlo.h"
@@ -149,7 +149,7 @@ TfrtGpuClient::TfrtGpuClient(
     bool should_stage_host_to_device_transfers,
     bool abort_collectives_on_failure,
     MaybeOwning<se::DeviceAddressAllocator> allocator,
-    std::unique_ptr<tsl::Allocator> host_memory_allocator,
+    std::shared_ptr<HostMemoryAllocator> host_memory_allocator,
     std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
     std::shared_ptr<KeyValueStoreInterface> kv_store,
     std::shared_ptr<const GpuTopology> gpu_topology)
@@ -160,8 +160,7 @@ TfrtGpuClient::TfrtGpuClient(
           should_stage_host_to_device_transfers),
       abort_collectives_on_failure_(abort_collectives_on_failure),
       allocator_(std::move(allocator)),
-      host_memory_allocator_(std::make_unique<HostMemoryAllocator>(
-          std::move(host_memory_allocator))),
+      host_memory_allocator_(std::move(host_memory_allocator)),
       devices_(InitializeDevices(this, devices)),
       id_to_device_(GetIdToDeviceMap(devices)),
       addressable_devices_(GetAddressableDevicePointers(devices)),
@@ -1189,11 +1188,36 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> GetTfrtGpuClient(
       GetGpuXlaClient(options.platform_name, options.allowed_devices));
   EnablePeerAccess(xla_client->backend().stream_executors());
 
-  std::unique_ptr<tsl::Allocator> host_memory_allocator;
-  if (!xla_client->backend().stream_executors().empty()) {
+  std::shared_ptr<HostMemoryAllocator> host_memory_allocator;
+  if (options.host_memory_allocator_factory != nullptr) {
+    stream_executor::StreamExecutor* const stream_executor =
+        xla_client->backend().stream_executors().front();
+    HostMemoryAllocator::Options allocator_options;
+    allocator_options.alignment = tsl::Allocator::kAllocatorAlignment;
+    allocator_options.map_fn = [stream_executor](void* data, size_t size) {
+      bool success = stream_executor->HostMemoryRegister(data, size);
+      if (!success) {
+        return absl::InternalError(absl::StrFormat(
+            "Failed to register host memory at address: %ps", data));
+      }
+      return absl::OkStatus();
+    };
+    allocator_options.unmap_fn = [stream_executor](void* data) {
+      bool success = stream_executor->HostMemoryUnregister(data);
+      if (!success) {
+        return absl::InternalError(absl::StrFormat(
+            "Failed to unregister host memory at address: %ps", data));
+      }
+      return absl::OkStatus();
+    };
+    host_memory_allocator =
+        options.host_memory_allocator_factory(allocator_options);
+  } else if (!xla_client->backend().stream_executors().empty()) {
     TF_ASSIGN_OR_RETURN(
-        host_memory_allocator,
+        std::unique_ptr<tsl::Allocator> allocator,
         GetGpuHostAllocator(xla_client->backend().stream_executors().front()));
+    host_memory_allocator = std::make_shared<BasicHostMemoryAllocator>(
+        std::move(allocator), tsl::Allocator::kAllocatorAlignment);
   }
 
   auto gpu_run_options = std::make_unique<gpu::GpuExecutableRunOptions>();
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
index 88bed1881f355f..f08c7d9076c9d8 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
@@ -48,10 +48,10 @@ limitations under the License.
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
-#include "xla/pjrt/gpu/tfrt/host_memory_allocator.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_compiler.h"
@@ -120,7 +120,7 @@ class TfrtGpuClient final : public PjRtClient {
                 bool should_stage_host_to_device_transfers,
                 bool abort_collectives_on_failure,
                 MaybeOwning<se::DeviceAddressAllocator> allocator,
-                std::unique_ptr<tsl::Allocator> host_memory_allocator,
+                std::shared_ptr<HostMemoryAllocator> host_memory_allocator,
                 std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options,
                 std::shared_ptr<KeyValueStoreInterface> kv_store,
                 std::shared_ptr<const GpuTopology> gpu_topology);
@@ -339,7 +339,7 @@ class TfrtGpuClient final : public PjRtClient {
   // complete.
   MaybeOwning<se::DeviceAddressAllocator> allocator_;
   // Allocator to be used for staging memory transfers to devices.
-  std::unique_ptr<HostMemoryAllocator> host_memory_allocator_;
+  std::shared_ptr<HostMemoryAllocator> host_memory_allocator_;
 
   // Pointers to `owned_devices_`.
   std::vector<PjRtDevice*> devices_;
diff --git a/third_party/xla/xla/pjrt/host_memory_allocator.cc b/third_party/xla/xla/pjrt/host_memory_allocator.cc
new file mode 100644
index 00000000000000..a0a9448a2527ad
--- /dev/null
+++ b/third_party/xla/xla/pjrt/host_memory_allocator.cc
@@ -0,0 +1,45 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/pjrt/host_memory_allocator.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "xla/tsl/framework/allocator.h"
+
+namespace xla {
+
+BasicHostMemoryAllocator::BasicHostMemoryAllocator(
+    std::unique_ptr<tsl::Allocator> allocator, size_t alignment)
+    : allocator_(std::move(allocator)), alignment_(alignment) {}
+
+HostMemoryAllocator::OwnedPtr BasicHostMemoryAllocator::Allocate(size_t size) {
+  if (size == 0) {
+    return nullptr;
+  }
+  return OwnedPtr(
+      reinterpret_cast<uint8_t*>(allocator_->AllocateRaw(alignment_, size)),
+      {
+          +[](void* ptr, void* arg) {
+            reinterpret_cast<tsl::Allocator*>(arg)->DeallocateRaw(ptr);
+          },
+          allocator_.get(),
+      });
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/pjrt/host_memory_allocator.h b/third_party/xla/xla/pjrt/host_memory_allocator.h
new file mode 100644
index 00000000000000..8123ade19c7887
--- /dev/null
+++ b/third_party/xla/xla/pjrt/host_memory_allocator.h
@@ -0,0 +1,76 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_PJRT_HOST_MEMORY_ALLOCATOR_H_
+#define XLA_PJRT_HOST_MEMORY_ALLOCATOR_H_
+
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <memory>
+
+#include "absl/functional/any_invocable.h"
+#include "absl/status/status.h"
+#include "xla/tsl/framework/allocator.h"
+
+namespace xla {
+
+// An interface for host memory allocation.
+class HostMemoryAllocator {
+ public:
+  struct Options {
+    // Minimum alignment of the allocated memory.
+    size_t alignment = tsl::Allocator::kAllocatorAlignment;
+
+    // Functions for mapping and unmapping the allocated memory.
+    absl::AnyInvocable<absl::Status(void*, size_t)> map_fn;
+    absl::AnyInvocable<absl::Status(void*)> unmap_fn;
+  };
+
+  using Factory = std::function<std::unique_ptr<HostMemoryAllocator>(
+      const Options& options)>;
+
+  struct Deleter {
+    void operator()(void* ptr) { deleter(ptr, arg); }
+    void (*deleter)(void* ptr, void* arg);
+    void* arg;
+  };
+  using OwnedPtr = std::unique_ptr<uint8_t[], Deleter>;
+
+  virtual ~HostMemoryAllocator() = default;
+
+  // Allocates `size` bytes of memory. The returned pointer is guaranteed to be
+  // aligned to `options_.alignment`.
+  virtual OwnedPtr Allocate(size_t size) = 0;
+};
+
+// `HostMemoryAllocator` implementation that uses a `tsl::Allocator` to back
+// allocations.
+class BasicHostMemoryAllocator : public HostMemoryAllocator {
+ public:
+  explicit BasicHostMemoryAllocator(
+      std::unique_ptr<tsl::Allocator> allocator,
+      size_t alignment = tsl::Allocator::kAllocatorAlignment);
+
+  OwnedPtr Allocate(size_t size) override;
+
+ private:
+  const std::unique_ptr<tsl::Allocator> allocator_;
+  const size_t alignment_;
+};
+
+}  // namespace xla
+
+#endif  // XLA_PJRT_HOST_MEMORY_ALLOCATOR_H_
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index d11f6e966f5ec2..391ed56fc386c0 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -115,6 +115,7 @@ limitations under the License.
 #include "xla/pjrt/dump/dump.h"
 #include "xla/pjrt/event_pool.h"
 #include "xla/pjrt/host_callback.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/host_memory_spaces.h"
 #include "xla/pjrt/layout_mode.h"
 #include "xla/pjrt/local_device_state.h"
@@ -276,7 +277,7 @@ PjRtStreamExecutorClient::PjRtStreamExecutorClient(
     int process_index,
     std::vector<std::unique_ptr<PjRtMemorySpace>> memory_spaces,
     std::unique_ptr<se::DeviceAddressAllocator> allocator,
-    std::unique_ptr<tsl::Allocator> host_memory_allocator,
+    std::unique_ptr<HostMemoryAllocator> host_memory_allocator,
     bool should_stage_host_to_device_transfers,
     std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options)
     : platform_id_(tsl::Fingerprint64(platform_name)),
@@ -302,7 +303,8 @@ PjRtStreamExecutorClient::PjRtStreamExecutorClient(
   }
 
   if (!host_memory_allocator_) {
-    host_memory_allocator_ = std::make_unique<CpuAllocator>();
+    host_memory_allocator_ = std::make_unique<BasicHostMemoryAllocator>(
+        std::make_unique<CpuAllocator>());
   }
 
   for (const std::unique_ptr<PjRtStreamExecutorDevice>& device :
@@ -669,12 +671,8 @@ PjRtStreamExecutorClient::LinearizeHostBufferInto(
   if (must_use_staging_buffer || (!IsDmaMapped(data, packed_size) &&
                                   (should_stage_host_to_device_transfers() &&
                                    packed_size < (int64_t{1} << 30)))) {
-    void* ptr = host_memory_allocator()->AllocateRaw(
-        tsl::Allocator::kAllocatorAlignment, transpose ? size : packed_size);
-    staging_buffer = std::shared_ptr<void>(
-        ptr, [host_memory_allocator = host_memory_allocator()](void* ptr) {
-          host_memory_allocator->DeallocateRaw(ptr);
-        });
+    staging_buffer =
+        host_memory_allocator()->Allocate(transpose ? size : packed_size);
   }
 
   // Copy the buffer into a staging buffer before returning control to the
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 4220db893cb1dc..c00be14ba84295 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -49,6 +49,7 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/pjrt/abstract_tracked_device_buffer.h"
 #include "xla/pjrt/common_pjrt_client.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/local_device_state.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
@@ -238,7 +239,7 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
       int process_index,
       std::vector<std::unique_ptr<PjRtMemorySpace>> memory_spaces,
       std::unique_ptr<se::DeviceAddressAllocator> allocator,
-      std::unique_ptr<tsl::Allocator> host_memory_allocator,
+      std::unique_ptr<HostMemoryAllocator> host_memory_allocator,
       bool should_stage_host_to_device_transfers,
       std::unique_ptr<gpu::GpuExecutableRunOptions> gpu_run_options);
   ~PjRtStreamExecutorClient() override = default;
@@ -341,7 +342,7 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
   }
   LocalClient* client() const { return client_; }
   se::DeviceAddressAllocator* allocator() const { return allocator_; }
-  tsl::Allocator* host_memory_allocator() const {
+  HostMemoryAllocator* host_memory_allocator() const {
     return host_memory_allocator_.get();
   }
   bool should_stage_host_to_device_transfers() const {
@@ -483,7 +484,7 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
   LocalClient* client_;
 
   // Allocator to be used for staging memory transfers to devices.
-  std::unique_ptr<tsl::Allocator> host_memory_allocator_;
+  std::unique_ptr<HostMemoryAllocator> host_memory_allocator_;
 
   // Device memory allocator. If owned, the allocator must outlive the devices,
   // because it is the device destructor that waits for any outstanding work to
diff --git a/third_party/xla/xla/pjrt/plugin/xla_gpu/BUILD b/third_party/xla/xla/pjrt/plugin/xla_gpu/BUILD
index 8f593fb2e05fbd..84fa6ae1997577 100644
--- a/third_party/xla/xla/pjrt/plugin/xla_gpu/BUILD
+++ b/third_party/xla/xla/pjrt/plugin/xla_gpu/BUILD
@@ -29,6 +29,7 @@ cc_library(
     hdrs = ["xla_gpu_client_options.h"],
     deps = [
         ":xla_gpu_allocator_config",
+        "//xla/pjrt:host_memory_allocator",
         "//xla/pjrt/distributed:key_value_store_interface",
     ],
 )
diff --git a/third_party/xla/xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h b/third_party/xla/xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h
index 8e7aa87b935372..771506c9fecf2c 100644
--- a/third_party/xla/xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h
+++ b/third_party/xla/xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <string>
 
 #include "xla/pjrt/distributed/key_value_store_interface.h"
+#include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/plugin/xla_gpu/xla_gpu_allocator_config.h"
 
 namespace xla {
@@ -40,6 +41,10 @@ struct GpuClientOptions {
 
   bool should_stage_host_to_device_transfers = true;
 
+  // Optional factory for a host memory allocator to use for transfer. Used only
+  // if `should_stage_host_to_device_transfers` is true.
+  HostMemoryAllocator::Factory host_memory_allocator_factory;
+
   // kv_store must be non-null if num_nodes > 1.
   std::shared_ptr<KeyValueStoreInterface> kv_store = nullptr;
 
diff --git a/third_party/xla/xla/pjrt/se_raw_buffer.cc b/third_party/xla/xla/pjrt/se_raw_buffer.cc
index 4ba31cb16cb1d9..95a5e499f8e3bc 100644
--- a/third_party/xla/xla/pjrt/se_raw_buffer.cc
+++ b/third_party/xla/xla/pjrt/se_raw_buffer.cc
@@ -151,12 +151,8 @@ PjRtStreamExecutorRawBuffer::CopyRawHostToDeviceAndReturnEvent(
                 "host_memory_allocator should be initialized for "
                 "staging buffer transfer.");
           }
-          void* ptr = client->host_memory_allocator()->AllocateRaw(
-              tsl::Allocator::kAllocatorAlignment, transfer_size);
-          staging_buffer = std::shared_ptr<void>(
-              ptr,
-              [host_memory_allocator = client->host_memory_allocator()](
-                  void* ptr) { host_memory_allocator->DeallocateRaw(ptr); });
+          staging_buffer =
+              client->host_memory_allocator()->Allocate(transfer_size);
           auto copy_to_staging_buffer = [src, transfer_size,
                                          staging_buffer]() mutable {
             std::memcpy(staging_buffer.get(), src, transfer_size);
@@ -210,12 +206,8 @@ PjRtStreamExecutorRawBuffer::CopyRawDeviceToHostAndReturnEvent(
                 "host_memory_allocator should be initialized for "
                 "staging buffer transfer.");
           }
-          void* ptr = client->host_memory_allocator()->AllocateRaw(
-              tsl::Allocator::kAllocatorAlignment, transfer_size);
-          std::shared_ptr<void> staging_buffer = std::shared_ptr<void>(
-              ptr,
-              [host_memory_allocator = client->host_memory_allocator()](
-                  void* ptr) { host_memory_allocator->DeallocateRaw(ptr); });
+          std::shared_ptr<void> staging_buffer =
+              client->host_memory_allocator()->Allocate(transfer_size);
           TF_RETURN_IF_ERROR(
               stream->Memcpy(staging_buffer.get(), sub_buffer, transfer_size));
           auto copy_from_staging_buffer = [dst, transfer_size,
@@ -496,13 +488,10 @@ void PjRtStreamExecutorRawBuffer::CopyTo(
     src_usage_event_promise->Set(*std::move(d2h_event));
     return;
   } else {
-    void* ptr = client_->host_memory_allocator()->AllocateRaw(
-        tsl::Allocator::kAllocatorAlignment, GetOnDeviceSizeInBytes());
-    std::shared_ptr<void> staging_buffer = std::shared_ptr<void>(
-        ptr, [host_memory_allocator = client_->host_memory_allocator()](
-                 void* ptr) { host_memory_allocator->DeallocateRaw(ptr); });
-    auto d2h_event =
-        CopyRawDeviceToHostAndReturnEvent(ptr, 0, GetOnDeviceSizeInBytes());
+    std::shared_ptr<void> staging_buffer =
+        client_->host_memory_allocator()->Allocate(GetOnDeviceSizeInBytes());
+    auto d2h_event = CopyRawDeviceToHostAndReturnEvent(
+        staging_buffer.get(), 0, GetOnDeviceSizeInBytes());
     if (!d2h_event.ok()) {
       definition_event_promise->SetError(d2h_event.status());
       src_usage_event_promise->SetError(d2h_event.status());

From ad8dc5068d2cd02bca662d1c7ce8f82228fc2b02 Mon Sep 17 00:00:00 2001
From: Luke Baumann <lukebaumann@google.com>
Date: Mon, 15 Dec 2025 11:57:30 -0800
Subject: [PATCH 288/753] Expose profiler advanced configuration as a Python
 dict.

In profiler.cc, the advanced_configuration property of tensorflow::ProfileOptions is now exposed as a Python dictionary. The getter converts the proto map to a nb::dict, handling different value types (bool, int64, string).

Example error:
```
ProfileOptions().advanced_configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: Unable to convert function return value to a Python type! The signature was
    (self) -> proto2::Map<std::__u::basic_string<char, std::__u::char_traits<char>, std::__u::allocator<char>>, tensorflow::ProfileOptions_AdvancedConfigValue>
```
PiperOrigin-RevId: 844865140
---
 third_party/xla/xla/python/profiler.cc | 14 +++++++++++++-
 third_party/xla/xla/python/version.h   |  2 +-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/python/profiler.cc b/third_party/xla/xla/python/profiler.cc
index 7b141248168629..a59429d66487fb 100644
--- a/third_party/xla/xla/python/profiler.cc
+++ b/third_party/xla/xla/python/profiler.cc
@@ -260,7 +260,19 @@ NB_MODULE(_profiler, m) {
           &tensorflow::ProfileOptions::set_raise_error_on_start_failure)
       .def_prop_rw(
           "advanced_configuration",
-          &tensorflow::ProfileOptions::advanced_configuration,
+          [](const tensorflow::ProfileOptions& options) {
+            nb::dict dict;
+            for (const auto& [key, value] : options.advanced_configuration()) {
+              if (value.has_bool_value()) {
+                dict[key.c_str()] = value.bool_value();
+              } else if (value.has_int64_value()) {
+                dict[key.c_str()] = value.int64_value();
+              } else {
+                dict[key.c_str()] = value.string_value();
+              }
+            }
+            return dict;
+          },
           [](tensorflow::ProfileOptions* options, const nb::dict& dict) {
             if (options->mutable_advanced_configuration() == nullptr) {
               throw xla::XlaRuntimeError("advanced_configuration is null");
diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 6361abdfe96083..862c0598a7a7aa 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,6 +18,6 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER 40  // Shardy sharded -> unreduced
+#define JAX_IFRT_VERSION_NUMBER 41  // Python getter for advanced_configuration
 
 #endif  // XLA_PYTHON_VERSION_H_

From 46e7f7fb144fd11cf6d17c23dd47620328d77082 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 13:20:46 -0800
Subject: [PATCH 289/753] Disable HDF5 plugin loading when importing h5py in
 Tensorflow.

Set the HDF5_PLUGIN_PATH environment variable to "disable" before importing h5py to prevent the loading of external HDF5 plugins.

PiperOrigin-RevId: 844896565
---
 tensorflow/python/keras/engine/training.py     | 3 +++
 tensorflow/python/keras/keras_parameterized.py | 4 ++++
 tensorflow/python/keras/saving/hdf5_format.py  | 3 +++
 tensorflow/python/keras/saving/save.py         | 5 ++++-
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 9369ffa456392a..4206ef9f882ffc 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -83,6 +83,9 @@
 
 # pylint: disable=g-import-not-at-top
 try:
+  # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480.
+  # Importing h5py prior to importing tensorflow will restore the old behavior.
+  os.environ['HDF5_PLUGIN_PATH'] = 'disable'
   import h5py
 except ImportError:
   h5py = None
diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py
index 054df939e8e59a..1a44e6b76f3276 100644
--- a/tensorflow/python/keras/keras_parameterized.py
+++ b/tensorflow/python/keras/keras_parameterized.py
@@ -17,6 +17,7 @@
 import collections
 import functools
 import itertools
+import os
 import unittest
 
 from absl.testing import parameterized
@@ -30,6 +31,9 @@
 from tensorflow.python.util import nest
 
 try:
+  # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480.
+  # Importing h5py prior to importing tensorflow will restore the old behavior.
+  os.environ['HDF5_PLUGIN_PATH'] = 'disable'
   import h5py  # pylint:disable=g-import-not-at-top
 except ImportError:
   h5py = None
diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py
index 1f6bbc43320d0a..05a2c9282909a2 100644
--- a/tensorflow/python/keras/saving/hdf5_format.py
+++ b/tensorflow/python/keras/saving/hdf5_format.py
@@ -34,6 +34,9 @@
 
 # pylint: disable=g-import-not-at-top
 try:
+  # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480.
+  # Importing h5py prior to importing tensorflow will restore the old behavior.
+  os.environ['HDF5_PLUGIN_PATH'] = 'disable'
   import h5py
   HDF5_OBJECT_HEADER_LIMIT = 64512
 except ImportError:
diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py
index eee859233e5eba..b9ba0bc20d0ba0 100644
--- a/tensorflow/python/keras/saving/save.py
+++ b/tensorflow/python/keras/saving/save.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 # ==============================================================================
 """Keras model saving code."""
-
+import os
 from tensorflow.python import tf2
 from tensorflow.python.keras.saving import hdf5_format
 from tensorflow.python.keras.saving import saving_utils
@@ -25,6 +25,9 @@
 
 # pylint: disable=g-import-not-at-top
 try:
+  # Disable loading HDF5 plugins from a default path and prevent ZDI-CAN-25480.
+  # Importing h5py prior to importing tensorflow will restore the old behavior.
+  os.environ['HDF5_PLUGIN_PATH'] = 'disable'
   import h5py
 except ImportError:
   h5py = None

From f9718d6341aa387f114d706015fde29da038d1b7 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Mon, 15 Dec 2025 13:38:39 -0800
Subject: [PATCH 290/753] Simplify coordination service configs.

Previously, both the `CoordinationService` and `CoordinationServiceAgent` used
the `tensorflow::CoordinationServiceConfig` proto. Now, both classes have their
own `Config` structs with the options they need.

PiperOrigin-RevId: 844902942
---
 .../xla/xla/pjrt/distributed/client.cc        | 26 +++---
 .../xla/pjrt/distributed/coordination/BUILD   |  6 --
 .../coordination/client_server_test.cc        | 45 +++++-----
 .../coordination/coordination_service.cc      | 50 ++++-------
 .../coordination/coordination_service.h       | 56 +++++++++----
 .../coordination_service_agent.cc             | 50 ++++-------
 .../coordination/coordination_service_agent.h | 42 ++++++++--
 .../coordination_service_agent_test.cc        | 26 +++---
 .../coordination/coordination_service_test.cc | 84 +++++++++----------
 .../preemption_sync_manager_test.cc           | 15 ++--
 .../xla/xla/pjrt/distributed/service.cc       | 23 ++---
 11 files changed, 208 insertions(+), 215 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/client.cc b/third_party/xla/xla/pjrt/distributed/client.cc
index b6c3087a56f808..09bbb1060763cc 100644
--- a/third_party/xla/xla/pjrt/distributed/client.cc
+++ b/third_party/xla/xla/pjrt/distributed/client.cc
@@ -78,7 +78,7 @@ class DistributedRuntimeCoordinationServiceClient
 
  private:
   std::unique_ptr<CoordinationServiceAgent> coord_agent_;
-  tensorflow::CoordinationServiceConfig config_;
+  CoordinationServiceAgent::Config config_;
   absl::Duration min_connect_barrier_timeout_;
   int task_id_;
 };
@@ -87,20 +87,14 @@ DistributedRuntimeCoordinationServiceClient::
     DistributedRuntimeCoordinationServiceClient(
         std::shared_ptr<::grpc::Channel> channel, const Options& options) {
   // Convert options to coordination config.
-  tensorflow::CoordinationServiceConfig config;
-  config.set_service_type("standalone");
-  config.set_service_leader("/job:jax_worker/task:0");
-  config.set_cluster_register_timeout_in_ms(
-      absl::ToInt64Milliseconds(options.init_timeout));
-  config.set_heartbeat_timeout_in_ms(
-      absl::ToInt64Milliseconds(options.heartbeat_timeout));
-  config.set_cluster_register_with_barrier(true);
-  config.set_shutdown_barrier_timeout_in_ms(
-      absl::ToInt64Milliseconds(options.shutdown_timeout));
-  config.set_agent_destruction_without_shutdown(
-      !options.shutdown_on_destruction);
-  config.set_poll_for_error_from_service_at_startup(
-      options.poll_for_error_from_service_at_startup);
+  CoordinationServiceAgent::Config config;
+  config.service_leader = "/job:jax_worker/task:0";
+  config.cluster_register_timeout = options.init_timeout;
+  config.heartbeat_timeout = options.heartbeat_timeout;
+  config.shutdown_barrier_timeout = options.shutdown_timeout;
+  config.agent_destruction_without_shutdown = !options.shutdown_on_destruction;
+  config.poll_for_error_from_service_at_startup =
+      options.poll_for_error_from_service_at_startup;
 
   std::unique_ptr<CoordinationClient> leader_client;
   leader_client.reset(NewGrpcCoordinationClient(channel));
@@ -132,7 +126,7 @@ absl::Status DistributedRuntimeCoordinationServiceClient::Connect() {
            "scheduled, or 3) scheduling delays. Consider setting a longer "
            "initialization timeout if such delays are expected, the timeout is "
            "currently set to: "
-        << absl::Milliseconds(config_.cluster_register_timeout_in_ms())
+        << config_.cluster_register_timeout
         << ".\n\nOriginal runtime error: " << s;
   } else {
     LOG(ERROR) << "Failed to connect to distributed JAX controller: " << s;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/BUILD b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
index 3798e213c65051..a24a4726930aca 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
@@ -53,12 +53,9 @@ cc_library(
     srcs = ["coordination_service.cc"],
     hdrs = ["coordination_service.h"],
     deps = [
-        ":coordination_client",
         ":coordination_service_error_util",
         ":key_value_store",
         "//xla/service:global_device_id",
-        "//xla/tsl/distributed_runtime:call_options",
-        "//xla/tsl/lib/gtl:int_type",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status",
@@ -142,18 +139,15 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/functional:bind_front",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
-        "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:random",
     ],
 )
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
index 07fd5fe468f415..783efd39fa4010 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
@@ -54,7 +54,6 @@ limitations under the License.
 
 namespace xla {
 namespace {
-using ::tensorflow::CoordinationServiceConfig;
 using ::testing::AnyOf;
 using ::testing::ElementsAre;
 using ::testing::HasSubstr;
@@ -79,42 +78,40 @@ MATCHER_P2(IsKvEntry, key, value, "") {
 
 class ClientServerTest : public ::testing::Test {
  public:
-  CoordinationServiceConfig GetConfig(
+  CoordinationServiceAgent::Config GetConfig(
       absl::Duration init_and_shutdown_timeout,
       bool shutdown_on_destruction = true,
       bool cluster_register_with_barrier = true,
       bool cluster_shutdown_with_barrier = true) {
     // Set config.
-    tensorflow::CoordinationServiceConfig config;
-    config.set_service_type("standalone");
-    config.set_service_leader("/job:agent/task:0");
-    config.set_cluster_register_timeout_in_ms(
-        absl::ToInt64Milliseconds(init_and_shutdown_timeout));
-    config.set_heartbeat_timeout_in_ms(
-        absl::ToInt64Milliseconds(kHeartbeatTimeout));
+    CoordinationServiceAgent::Config config;
+    config.service_leader = "/job:agent/task:0";
+    config.cluster_register_timeout = init_and_shutdown_timeout;
+    config.heartbeat_timeout = kHeartbeatTimeout;
     if (cluster_shutdown_with_barrier) {
-      config.set_shutdown_barrier_timeout_in_ms(
-          absl::ToInt64Milliseconds(init_and_shutdown_timeout));
+      config.shutdown_barrier_timeout = init_and_shutdown_timeout;
     }
-    config.set_agent_destruction_without_shutdown(!shutdown_on_destruction);
+    config.agent_destruction_without_shutdown = !shutdown_on_destruction;
     // TODO(b/369222279): Add more test cases that exercise TF behaviour (no
     // barrier).
-    config.set_cluster_register_with_barrier(cluster_register_with_barrier);
-    config.set_poll_for_error_from_service_at_startup(true);
+    config.poll_for_error_from_service_at_startup = true;
     return config;
   }
 
-  CoordinationServiceConfig GetServiceConfig(
+  CoordinationService::Config GetServiceConfig(
       int num_nodes, absl::Duration init_and_shutdown_timeout,
       bool cluster_register_with_barrier, bool cluster_shutdown_with_barrier) {
-    auto config =
-        GetConfig(init_and_shutdown_timeout,
-                  /*shutdown_on_destruction=*/true,
-                  cluster_register_with_barrier, cluster_shutdown_with_barrier);
-    tensorflow::CoordinatedJob* job =
-        config.mutable_coordinated_job_list()->Add();
-    job->set_name("agent");
-    job->set_num_tasks(num_nodes);
+    CoordinationService::Config config;
+    config.cluster_register_timeout = init_and_shutdown_timeout;
+    config.heartbeat_timeout = kHeartbeatTimeout;
+    if (cluster_shutdown_with_barrier) {
+      config.shutdown_barrier_timeout = init_and_shutdown_timeout;
+    }
+    config.cluster_register_with_barrier = cluster_register_with_barrier;
+    tensorflow::CoordinatedJob job;
+    job.set_name("agent");
+    job.set_num_tasks(num_nodes);
+    config.coordinated_job_list.push_back(job);
     auto service =
         std::make_unique<CoordinationService>(tsl::Env::Default(), config);
     return config;
@@ -134,7 +131,7 @@ class ClientServerTest : public ::testing::Test {
     leader_client.reset(NewGrpcCoordinationClient(channel));
 
     auto coord_agent = CreateCoordinationServiceAgent();
-    CoordinationServiceConfig config =
+    CoordinationServiceAgent::Config config =
         GetConfig(init_and_shutdown_timeout, shutdown_on_destruction);
     const absl::Status status = coord_agent->Initialize(
         tsl::Env::Default(), "agent", node_id, config, std::move(leader_client),
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
index 7df2ef53488c5f..a3ec50ee8f5971 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
@@ -42,9 +42,7 @@ limitations under the License.
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "absl/types/span.h"
-#include "xla/pjrt/distributed/coordination/coordination_client.h"
 #include "xla/pjrt/distributed/coordination/coordination_service_error_util.h"
-#include "xla/tsl/distributed_runtime/call_options.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/status.h"
@@ -61,16 +59,12 @@ namespace {
 using tensorflow::CoordinatedTask;
 using tensorflow::CoordinatedTaskState;
 using tensorflow::CoordinatedTaskStateInfo;
-using tensorflow::CoordinationServiceConfig;
-using tensorflow::CoordinationServiceError;
 using tensorflow::DeviceInfo;
 using tensorflow::KeyValueEntry;
 
 constexpr char kClusterRegisterBarrierId[] =
     "[Init]Wait_for_all_tasks_to_register";
 constexpr absl::Duration kDevicePropagationTimeout = absl::Hours(1);
-constexpr int kDefaultHeartbeatTimeoutMs = 10 * 1000;  // 10 seconds
-constexpr int kServiceToClientTimeoutMs = 10 * 1000;   // 10 seconds
 constexpr size_t kOngoingBarriersSoftLimit = 20;
 constexpr char kHealthCheckThread[] = "CoordinationServiceHealthCheck";
 // Limit the number of stragglers we log to avoid `RESOURCE_EXHAUSTED` errors in
@@ -200,25 +194,10 @@ bool CoordinationService::TaskState::IsDisconnectedBeyondGracePeriod() {
          tsl::Env::Default()->NowMicros() > disconnect_grace_period_us_;
 }
 
-CoordinationService::CoordinationService(
-    tsl::Env* env, const CoordinationServiceConfig& config)
-    : env_(*env),
-      heartbeat_timeout_ms_([&config]() -> uint64_t {
-        return config.heartbeat_timeout_in_ms() > 0
-                   ? config.heartbeat_timeout_in_ms()
-                   : kDefaultHeartbeatTimeoutMs;
-      }()),
-      cluster_register_with_barrier_(config.cluster_register_with_barrier()),
-      cluster_register_timeout_(
-          absl::Milliseconds(config.cluster_register_timeout_in_ms())),
-      shutdown_barrier_timeout_(
-          absl::Milliseconds(config.shutdown_barrier_timeout_in_ms())),
-      allow_new_incarnation_to_reconnect_(
-          config.allow_new_incarnation_to_reconnect()) {
+CoordinationService::CoordinationService(tsl::Env* env, const Config& config)
+    : env_(*env), config_(config) {
   LOG(INFO) << "Initializing CoordinationService";
-  recoverable_jobs_ = absl::flat_hash_set<std::string>(
-      config.recoverable_jobs().cbegin(), config.recoverable_jobs().cend());
-  for (const auto& job : config.coordinated_job_list()) {
+  for (const auto& job : config_.coordinated_job_list) {
     for (int i = 0; i < job.num_tasks(); ++i) {
       const std::string task_name = GetTaskName(job.name(), i);
       cluster_state_.emplace(task_name, std::make_unique<TaskState>(task_name));
@@ -237,7 +216,8 @@ void CoordinationService::CheckHeartbeatTimeout() {
       continue;
     }
     const bool is_stale =
-        task_state->TimeSinceLastHeartbeatMs() > heartbeat_timeout_ms_;
+        absl::Milliseconds(task_state->TimeSinceLastHeartbeatMs()) >
+        config_.heartbeat_timeout;
     VLOG(10) << "Checking staleness for " << task_name
              << " stale?=" << is_stale;
     if (is_stale) {
@@ -598,7 +578,7 @@ void CoordinationService::RegisterTaskAsync(const CoordinatedTask& task,
   const auto task_status = task_cluster_state->GetStatus();
 
   if (task_state == CoordinatedTaskState::TASKSTATE_DISCONNECTED ||
-      ((allow_new_incarnation_to_reconnect_ ||
+      ((config_.allow_new_incarnation_to_reconnect ||
         task_cluster_state->IsRecoverable()) &&
        (absl::IsUnavailable(task_status) &&
         task_status.GetPayload(CoordinationErrorPayloadKey())))) {
@@ -609,7 +589,7 @@ void CoordinationService::RegisterTaskAsync(const CoordinatedTask& task,
     //   an unavailable error state, but has now restarted (possibly with
     //   a new incarnation). This is only allowed if configured with
     //   `allow_new_incarnation_to_reconnect`.
-    if (cluster_register_with_barrier_) {
+    if (config_.cluster_register_with_barrier) {
       // Impose barrier so that all tasks can register together.
       // Note: it is possible that the same task restarts multiple times and
       // registers itself with new incarnations.
@@ -633,7 +613,7 @@ void CoordinationService::RegisterTaskAsync(const CoordinatedTask& task,
       }
       BarrierAsyncLocked(
           kClusterRegisterBarrierId, kUniqueBarrierCounter,
-          cluster_register_timeout_, task, {},
+          config_.cluster_register_timeout, task, {},
           ConnectAfterBarrierPasses(task_name, incarnation, std::move(done)));
       ClusterStateUpdated();
       return;
@@ -711,7 +691,8 @@ void CoordinationService::WaitForAllTasks(const CoordinatedTask& task,
 void CoordinationService::ShutdownTaskAsync(const CoordinatedTask& task,
                                             tsl::StatusCallback done) {
   VLOG(3) << "Task " << GetTaskName(task) << " invoked ShutdownTaskAsync()";
-  if (shutdown_barrier_timeout_ > absl::ZeroDuration() && !task.recoverable()) {
+  if (config_.shutdown_barrier_timeout > absl::ZeroDuration() &&
+      !task.recoverable()) {
     // Impose shutdown barrier so that all (non-recoverable) tasks can
     // disconnect together.
     // Notes:
@@ -725,7 +706,7 @@ void CoordinationService::ShutdownTaskAsync(const CoordinatedTask& task,
     //    all tasks.
     auto shutdown_tasks = GetTasksForShutdownBarrier();
     BarrierAsync(shutdown_barrier_id_, kUniqueBarrierCounter,
-                 shutdown_barrier_timeout_, task, shutdown_tasks,
+                 config_.shutdown_barrier_timeout, task, shutdown_tasks,
                  [done = std::move(done)](const absl::Status& s,
                                           int64_t unused_counter) {
                    if (s.ok()) {
@@ -776,7 +757,8 @@ absl::Status CoordinationService::DisconnectTask(const CoordinatedTask& task) {
 
   // Disconnect task.
   task_state->Disconnect(
-      /*grace_period_duration_us=*/heartbeat_timeout_ms_ * 1000);
+      /*grace_period_duration_us=*/absl::ToInt64Milliseconds(
+          config_.heartbeat_timeout));
   LeaveOngoingBarriers(task, "task disconnected");
   RefreshAliveness();
   error_polling_state_.RemoveTask(task, "task has disconnected.");
@@ -1443,7 +1425,8 @@ void CoordinationService::PassBarrier(BarrierState* barrier,
            "some tasks were never scheduled, or 3) scheduling delays. Consider "
            "setting a longer initialization timeout if such delays are "
            "expected, the timeout is currently set to: "
-        << cluster_register_timeout_ << ".\n\nOriginal error: " << result;
+        << config_.cluster_register_timeout
+        << ".\n\nOriginal error: " << result;
     return;
   }
   // Special hook for shutdown barrier to disconnect tasks at the barrier and
@@ -1829,7 +1812,8 @@ void CoordinationService::CompleteShutdownAfterBarrier(
 
 bool CoordinationService::isRecoverableJob(
     const absl::string_view task_name) const {
-  return recoverable_jobs_.find(task_name) != recoverable_jobs_.end();
+  return config_.recoverable_jobs.find(task_name) !=
+         config_.recoverable_jobs.end();
 }
 
 void CoordinationService::SendErrorPollingResponseOrFailAllTasks(
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
index e95bd9c2adf23d..898f262691fbad 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
@@ -19,8 +19,9 @@ limitations under the License.
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <optional>
 #include <string>
-#include <utility>
+#include <tuple>
 #include <vector>
 
 #include "absl/base/thread_annotations.h"
@@ -35,10 +36,9 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
-#include "xla/pjrt/distributed/coordination/coordination_client.h"
+#include "absl/types/span.h"
 #include "xla/pjrt/distributed/coordination/key_value_store.h"
 #include "xla/service/global_device_id.h"
-#include "xla/tsl/lib/gtl/int_type.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/status.h"
 #include "xla/tsl/protobuf/coordination_config.pb.h"
@@ -65,6 +65,42 @@ namespace xla {
 // tasks. Each task interacts with the service through CoordinationServiceAgent.
 class CoordinationService {
  public:
+  struct Config {
+    // Maximum wait time for all members in the cluster to be registered.
+    absl::Duration cluster_register_timeout = absl::Minutes(60);
+
+    // Denotes if we should synchronize the agents' register attempts by
+    // blocking on a barrier. This is useful for synchronized restarts.
+    bool cluster_register_with_barrier = false;
+
+    // Heartbeat timeout, if a task does not record heartbeat in this time
+    // window, it will be considered disconnected.
+    // Note: This is also used as a grace period to accept any heartbeats after
+    // the agent has disconnected, to account for the lag time between the
+    // service recording the state change and the agent stopping heartbeats.
+    absl::Duration heartbeat_timeout = absl::Seconds(10);
+
+    // The list of `CoordinatedJob`s that will register in coordination service.
+    std::vector<tensorflow::CoordinatedJob> coordinated_job_list;
+
+    // Denotes how long to wait for all coordination agents to reach the
+    // barriers (after the first shutdown request) before disconnecting
+    // together. If set to 0, no barrier is imposed upon shutdown and each
+    // worker can disconnect individually.
+    absl::Duration shutdown_barrier_timeout = absl::ZeroDuration();
+
+    // The list of jobs which are recoverable. If a task in this list fails,
+    // it will not propagate error to other tasks.
+    // If empty, no jobs will be recoverable and every task failure will cause
+    // error propagation to other tasks.
+    absl::flat_hash_set<std::string> recoverable_jobs;
+
+    // If a task restarts with a new incarnation, we may allow it to reconnect
+    // silently. This is useful when we know that a task can immediately resume
+    // work upon re-connecting to the service.
+    bool allow_new_incarnation_to_reconnect = false;
+  };
+
   using StatusOrValueCallback =
       std::function<void(const absl::StatusOr<absl::string_view>&)>;
   using BarrierCallback = std::function<void(const absl::Status&, int64_t)>;
@@ -89,8 +125,7 @@ class CoordinationService {
       absl::flat_hash_set<tensorflow::CoordinatedTask, CoordinatedTaskHash,
                           CoordinatedTaskEqual>;
 
-  CoordinationService(tsl::Env* env,
-                      const tensorflow::CoordinationServiceConfig& config);
+  CoordinationService(tsl::Env* env, const Config& config);
 
   ~CoordinationService() {
     absl::MutexLock lock(state_mu_);
@@ -612,14 +647,7 @@ class CoordinationService {
 
   tsl::Env& env_;
   const IncarnationId service_incarnation_{tsl::random::New64()};
-  const uint64_t heartbeat_timeout_ms_;
-  bool cluster_register_with_barrier_ = false;
-  const absl::Duration cluster_register_timeout_;
-  const absl::Duration shutdown_barrier_timeout_;
-  // If a task restarts with a new incarnation, we may allow it to reconnect
-  // silently if configured. This is useful when we know that a task can
-  // immediately resume work upon re-connecting to the service.
-  bool allow_new_incarnation_to_reconnect_ = false;
+  const Config config_;
 
   std::function<tensorflow::DeviceInfo(const tensorflow::DeviceInfo& devices)>
       post_aggregate_device_fn_;
@@ -650,8 +678,6 @@ class CoordinationService {
   // The state of all pending GetAliveTasks calls.
   std::vector<AlivenessState> aliveness_states_ ABSL_GUARDED_BY(state_mu_);
 
-  absl::flat_hash_set<std::string> recoverable_jobs_;
-
   // When the tasks connect to coordination service after cluster initialization
   // is done, they will be added to this set.
   // Tasks connecting after cluster initialization indicate that they
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index 72dfd996032f16..5ab814869e4b96 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -36,16 +36,13 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
-#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/substitute.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
-#include "absl/types/span.h"
 #include "xla/pjrt/distributed/coordination/coordination_client.h"
-#include "xla/pjrt/distributed/coordination/coordination_service.h"
 #include "xla/pjrt/distributed/coordination/coordination_service_error_util.h"
 #include "xla/tsl/distributed_runtime/call_options.h"
 #include "xla/tsl/framework/cancellation.h"
@@ -59,7 +56,6 @@ namespace xla {
 using tensorflow::CoordinatedTask;
 using tensorflow::CoordinatedTaskState;
 using tensorflow::CoordinatedTaskStateInfo;
-using tensorflow::CoordinationServiceConfig;
 using tensorflow::DeviceInfo;
 using tensorflow::KeyValueEntry;
 
@@ -69,27 +65,22 @@ auto* enabled_usage_metric = tsl::monitoring::Gauge<bool, 0>::New(
     "/coordination_service/v2/agent/enabled",
     "Tracks usage of coordination service.");
 
-constexpr absl::Duration kDefaultClusterRegisterTimeout = absl::Hours(1);
-constexpr absl::Duration kDefaultHeartbeatTimeout = absl::Seconds(10);
-constexpr absl::Duration kDefaultShutdownTimeout = absl::Seconds(10);
 constexpr char kHeartbeatThread[] = "CoordinationServiceHeartbeatLoop";
 
 }  // namespace
 
 absl::Status CoordinationServiceAgent::Initialize(
     tsl::Env* env, absl::string_view job_name, int task_id,
-    const CoordinationServiceConfig& configs,
-    std::unique_ptr<CoordinationClient> leader_client,
+    const Config& config, std::unique_ptr<CoordinationClient> leader_client,
     tsl::StatusCallback error_fn) {
-  return Initialize(env, job_name, task_id, configs, std::move(leader_client),
+  return Initialize(env, job_name, task_id, config, std::move(leader_client),
                     error_fn,
                     /*recoverable=*/false);
 }
 
 absl::Status CoordinationServiceAgent::Initialize(
     tsl::Env* env, absl::string_view job_name, int task_id,
-    const CoordinationServiceConfig& configs,
-    std::unique_ptr<CoordinationClient> leader_client,
+    const Config& config, std::unique_ptr<CoordinationClient> leader_client,
     tsl::StatusCallback error_fn, bool recoverable) {
   CoordinatedTask task;
   task.set_job_name(std::string(job_name));
@@ -102,12 +93,11 @@ absl::Status CoordinationServiceAgent::Initialize(
            "`WaitAtBarrier` explicitly at the end of the program.";
     task.set_recoverable(true);
   }
-  return Initialize(env, task, configs, std::move(leader_client), error_fn);
+  return Initialize(env, task, config, std::move(leader_client), error_fn);
 }
 
 absl::Status CoordinationServiceAgent::Initialize(
-    tsl::Env* env, const CoordinatedTask& task,
-    const CoordinationServiceConfig& configs,
+    tsl::Env* env, const CoordinatedTask& task, const Config& config,
     std::unique_ptr<CoordinationClient> leader_client,
     tsl::StatusCallback error_fn) {
   enabled_usage_metric->GetCell()->Set(true);
@@ -119,8 +109,8 @@ absl::Status CoordinationServiceAgent::Initialize(
 
   env_ = env;
   task_ = task;
-  configs_ = configs;
-  if (configs_.service_leader().empty()) {
+  config_ = config;
+  if (config_.service_leader.empty()) {
     return MakeCoordinationError(absl::InvalidArgumentError(
         "CoordinationServiceAgent must be initialized with a valid leader."));
   }
@@ -183,13 +173,9 @@ absl::Status CoordinationServiceAgent::Connect() {
   request.set_incarnation(incarnation_id_.value());
   RegisterTaskResponse response;
 
-  const int64_t register_timeout =
-      configs_.cluster_register_timeout_in_ms() > 0
-          ? configs_.cluster_register_timeout_in_ms()
-          : absl::ToInt64Milliseconds(kDefaultClusterRegisterTimeout);
   // Give 5 seconds for any service-related timeouts to propagate.
   const absl::Time deadline =
-      absl::Now() + absl::Milliseconds(register_timeout) + absl::Seconds(5);
+      absl::Now() + config_.cluster_register_timeout + absl::Seconds(5);
   int attempt = 0;
   std::default_random_engine generator;
   std::uniform_real_distribution<double> distribution(0.0, 1.0);
@@ -244,7 +230,7 @@ absl::Status CoordinationServiceAgent::Connect() {
       tsl::ThreadOptions(), kHeartbeatThread,
       absl::bind_front(&CoordinationServiceAgent::StartSendingHeartbeats,
                        this)));
-  if (configs_.poll_for_error_from_service_at_startup()) {
+  if (config_.poll_for_error_from_service_at_startup) {
     StartPollingForError();
   }
   return absl::OkStatus();
@@ -255,12 +241,9 @@ void CoordinationServiceAgent::StartSendingHeartbeats() {
   *request.mutable_source_task() = task_;
   request.set_incarnation(incarnation_id_.value());
   HeartbeatResponse response;
-  const int64_t heartbeat_interval_ms =
-      configs_.heartbeat_timeout_in_ms() > 0
-          ? configs_.heartbeat_timeout_in_ms() / 2
-          : absl::ToInt64Milliseconds(kDefaultHeartbeatTimeout) / 2;
+  const absl::Duration heartbeat_interval = config_.heartbeat_timeout;
   tsl::CallOptions call_opts;
-  call_opts.SetTimeout(heartbeat_interval_ms);
+  call_opts.SetTimeout(absl::ToInt64Milliseconds(heartbeat_interval));
 
   while (true) {
     absl::Status status;
@@ -302,7 +285,7 @@ void CoordinationServiceAgent::StartSendingHeartbeats() {
     {
       absl::MutexLock l(shutdown_mu_);
       shutdown_mu_.AwaitWithTimeout(absl::Condition(&shutting_down_),
-                                    absl::Milliseconds(heartbeat_interval_ms));
+                                    config_.heartbeat_timeout);
       if (shutting_down_) {
         return;
       }
@@ -524,18 +507,15 @@ absl::Status CoordinationServiceAgent::ShutdownInternal() {
     is_connected = state_ == CoordinatedTaskState::TASKSTATE_CONNECTED;
   }
   // Disconnect agent from service.
-  if (!configs_.agent_destruction_without_shutdown() && is_connected) {
+  if (!config_.agent_destruction_without_shutdown && is_connected) {
     LOG(INFO) << "Coordination agent has initiated Shutdown().";
     ShutdownTaskRequest request;
     *request.mutable_source_task() = task_;
     ShutdownTaskResponse response;
     tsl::CallOptions call_opts;
+    // Add 5s for service-related errors to propagate.
     const int64_t shutdown_timeout =
-        (configs_.shutdown_barrier_timeout_in_ms() > 0
-             ? configs_.shutdown_barrier_timeout_in_ms()
-             : absl::ToInt64Milliseconds(kDefaultShutdownTimeout)) +
-        // Add 5s for service-related errors to propagate.
-        5 * 1000;
+        absl::ToInt64Milliseconds(config_.shutdown_barrier_timeout) + 5 * 1000;
     call_opts.SetTimeout(shutdown_timeout);
 
     absl::Notification n;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
index 15231c562b94fe..77621a073b14cd 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
@@ -27,14 +27,12 @@ limitations under the License.
 #include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
-#include "absl/functional/any_invocable.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
-#include "absl/types/span.h"
 #include "xla/pjrt/distributed/coordination/coordination_client.h"
 #include "xla/pjrt/distributed/coordination/coordination_service.h"
 #include "xla/tsl/distributed_runtime/call_options.h"
@@ -70,6 +68,36 @@ namespace xla {
 //                               registered or wrong config).
 class CoordinationServiceAgent {
  public:
+  struct Config {
+    // Address where the coordination service instance is hosted.
+    std::string service_leader;
+
+    // Maximum wait time for all members in the cluster to be registered.
+    absl::Duration cluster_register_timeout = absl::Hours(1);
+
+    // Heartbeat timeout, if a task does not record heartbeat in this time
+    // window, it will be considered disconnected.
+    // Note: This is also used as a grace period to accept any heartbeats after
+    // the agent has disconnected, to account for the lag time between the
+    // service recording the state change and the agent stopping heartbeats.
+    absl::Duration heartbeat_timeout = absl::Seconds(10);
+
+    // Denotes how long to wait for all coordination agents to reach the
+    // barriers (after the first shutdown request) before disconnecting
+    // together. If set to 0, no barrier is imposed upon shutdown and each
+    // worker can disconnect individually.
+    absl::Duration shutdown_barrier_timeout = absl::Seconds(10);
+
+    // If set, agents do not make an explicit Shutdown() call. Service will only
+    // find out about the disconnected agent via stale heartbeats. Used for
+    // testing.
+    bool agent_destruction_without_shutdown = false;
+
+    // Use long polling to get error from coordination service as the error
+    // propagation mechanism.
+    bool poll_for_error_from_service_at_startup = false;
+  };
+
   using StatusOrValueCallback =
       std::function<void(const absl::StatusOr<std::string>&)>;
   // Collection of key-value pairs in the same directory.
@@ -86,18 +114,16 @@ class CoordinationServiceAgent {
   }
 
   absl::Status Initialize(tsl::Env* env, absl::string_view job_name,
-                          int task_id,
-                          const tensorflow::CoordinationServiceConfig& configs,
+                          int task_id, const Config& config,
                           std::unique_ptr<CoordinationClient> leader_client,
                           tsl::StatusCallback error_fn, bool recoverable);
   absl::Status Initialize(tsl::Env* env, absl::string_view job_name,
-                          int task_id,
-                          const tensorflow::CoordinationServiceConfig& configs,
+                          int task_id, const Config& config,
                           std::unique_ptr<CoordinationClient> leader_client,
                           tsl::StatusCallback error_fn);
   absl::Status Initialize(tsl::Env* env,
                           const tensorflow::CoordinatedTask& task,
-                          const tensorflow::CoordinationServiceConfig& configs,
+                          const Config& config,
                           std::unique_ptr<CoordinationClient> leader_client,
                           tsl::StatusCallback error_fn);
 
@@ -380,7 +406,7 @@ class CoordinationServiceAgent {
   tsl::Env* env_ = nullptr;  // Not owned.
   const IncarnationId incarnation_id_{tsl::random::New64()};
   tensorflow::CoordinatedTask task_;
-  tensorflow::CoordinationServiceConfig configs_;
+  Config config_;
   tsl::StatusCallback error_fn_;
 
   mutable absl::Mutex state_mu_;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
index 92a35738368af4..0caa18b97102a1 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
@@ -41,7 +41,6 @@ limitations under the License.
 namespace xla {
 namespace {
 using tensorflow::CoordinatedTask;
-using tensorflow::CoordinationServiceConfig;
 using tensorflow::KeyValueEntry;
 
 using ::testing::_;
@@ -181,8 +180,8 @@ class CoordinationServiceAgentTest : public ::testing::Test {
   }
 
   // Should be called after mocking service responses, before testing the agent.
-  void InitializeAgent(CoordinationServiceConfig config = {}) {
-    config.set_service_leader("test_leader");
+  void InitializeAgent(CoordinationServiceAgent::Config config = {}) {
+    config.service_leader = "test_leader";
     TF_ASSERT_OK(agent_->Initialize(
         tsl::Env::Default(), /*job_name=*/"test_job",
         /*task_id=*/0, config, std::move(client_),
@@ -456,8 +455,8 @@ TEST_F(CoordinationServiceAgentTest, ConnectAfterReset_WithErrorPolling) {
       .WillOnce(DoAll(SetArgPointee<2>(mocked_response),
                       InvokeArgument<3>(absl::InternalError("Test Error."))));
 
-  CoordinationServiceConfig config;
-  config.set_poll_for_error_from_service_at_startup(true);
+  CoordinationServiceAgent::Config config;
+  config.poll_for_error_from_service_at_startup = true;
   InitializeAgent(config);
   // The agent will be in ERROR state after the first call to Connect()
   // because the error polling thread will be created and will immediately
@@ -483,8 +482,8 @@ TEST_F(CoordinationServiceAgentTest, CancelledPollForErrorRequest) {
       .WillOnce(DoAll(SetArgPointee<2>(mocked_response),
                       InvokeArgument<3>(absl::CancelledError("Test Error."))));
 
-  CoordinationServiceConfig config;
-  config.set_poll_for_error_from_service_at_startup(true);
+  CoordinationServiceAgent::Config config;
+  config.poll_for_error_from_service_at_startup = true;
   InitializeAgent(config);
   TF_ASSERT_OK(agent_->Connect());
   // Wait a bit for the error polling thread to start.
@@ -501,8 +500,8 @@ TEST_F(CoordinationServiceAgentTest, InvalidPollForErrorRequest) {
           DoAll(SetArgPointee<2>(mocked_response),
                 InvokeArgument<3>(absl::InvalidArgumentError("Test Error."))));
 
-  CoordinationServiceConfig config;
-  config.set_poll_for_error_from_service_at_startup(true);
+  CoordinationServiceAgent::Config config;
+  config.poll_for_error_from_service_at_startup = true;
   InitializeAgent(config);
   TF_ASSERT_OK(agent_->Connect());
   // Wait a bit for the error polling thread to start.
@@ -519,8 +518,8 @@ TEST_F(CoordinationServiceAgentTest,
           SetArgPointee<2>(mocked_response),
           InvokeArgument<3>(absl::FailedPreconditionError("Test Error."))));
 
-  CoordinationServiceConfig config;
-  config.set_poll_for_error_from_service_at_startup(true);
+  CoordinationServiceAgent::Config config;
+  config.poll_for_error_from_service_at_startup = true;
   InitializeAgent(config);
   TF_ASSERT_OK(agent_->Connect());
   // Wait a bit for the error polling thread to start.
@@ -598,10 +597,9 @@ TEST_F(CoordinationServiceAgentTest, Connect_AbortedErrorShouldFailEventually) {
   EXPECT_CALL(*GetClient(), RegisterTaskAsync(_, _, _, _))
       .WillRepeatedly(
           InvokeArgument<3>(absl::AbortedError("DuplicateTaskRegistration")));
-  CoordinationServiceConfig config;
+  CoordinationServiceAgent::Config config;
   // Connect should only be retried for 3 seconds.
-  config.set_cluster_register_timeout_in_ms(
-      absl::ToInt64Milliseconds(absl::Seconds(3)));
+  config.cluster_register_timeout = absl::Seconds(3);
   InitializeAgent(config);
 
   absl::Status s = agent_->Connect();
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
index b201c1100da1c5..7a08a8c0bf3100 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
@@ -1,4 +1,4 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+/*r Copyright 2021 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -78,12 +78,12 @@ KeyValueEntry CreateKv(const std::string& key, const std::string& value) {
   return kv;
 }
 
-CoordinationServiceConfig GetCoordinationServiceConfig(int num_tasks) {
-  CoordinationServiceConfig config;
-  config.set_service_type(kCoordinationServiceType);
-  CoordinatedJob* job = config.mutable_coordinated_job_list()->Add();
-  job->set_name("worker");
-  job->set_num_tasks(num_tasks);
+CoordinationService::Config GetCoordinationServiceConfig(int num_tasks) {
+  CoordinationService::Config config;
+  CoordinatedJob job;
+  job.set_name("worker");
+  job.set_num_tasks(num_tasks);
+  config.coordinated_job_list.push_back(std::move(job));
   return config;
 }
 
@@ -168,7 +168,8 @@ class CoordinationBarrierTest : public ::testing::Test {
       tasks_.push_back(task);
       clients_.push_back(std::move(client));
     }
-    CoordinationServiceConfig config = GetCoordinationServiceConfig(num_tasks);
+    CoordinationService::Config config =
+        GetCoordinationServiceConfig(num_tasks);
 
     coord_service_ =
         std::make_unique<CoordinationService>(tsl::Env::Default(), config);
@@ -232,24 +233,21 @@ class CoordinateTwoTasksTest : public ::testing::Test {
       bool enable_register_barrier = false,
       bool set_worker_job_recoverable = false,
       bool allow_new_incarnation_to_reconnect = false) {
-    CoordinationServiceConfig config =
+    CoordinationService::Config config =
         GetCoordinationServiceConfig(/*num_tasks=*/2);
-    config.set_heartbeat_timeout_in_ms(kHeartbeatTimeout /
-                                       absl::Milliseconds(1));
+    config.heartbeat_timeout = kHeartbeatTimeout;
     if (set_worker_job_recoverable) {
-      config.mutable_recoverable_jobs()->Add("worker");
+      config.recoverable_jobs.insert("worker");
     }
     if (enable_shutdown_barrier) {
-      config.set_shutdown_barrier_timeout_in_ms(kShutdownBarrierTimeout /
-                                                absl::Milliseconds(1));
+      config.shutdown_barrier_timeout = kShutdownBarrierTimeout;
     }
     if (enable_register_barrier) {
-      config.set_cluster_register_with_barrier(true);
-      config.set_cluster_register_timeout_in_ms(absl::Seconds(1) /
-                                                absl::Milliseconds(1));
+      config.cluster_register_with_barrier = true;
+      config.cluster_register_timeout = absl::Seconds(1);
     }
     if (allow_new_incarnation_to_reconnect) {
-      config.set_allow_new_incarnation_to_reconnect(true);
+      config.allow_new_incarnation_to_reconnect = true;
     }
     // Init service.
     coord_service_ =
@@ -322,14 +320,15 @@ TEST(CoordinationServiceTest, TestCoordinatedJobs) {
   evaluator.set_job_name("evaluator");
   evaluator.set_task_id(0);
 
-  CoordinationServiceConfig config;
-  config.set_service_type(kCoordinationServiceType);
-  CoordinatedJob* chief_job = config.mutable_coordinated_job_list()->Add();
-  chief_job->set_name("chief");
-  chief_job->set_num_tasks(1);
-  CoordinatedJob* worker_job = config.mutable_coordinated_job_list()->Add();
-  worker_job->set_name("worker");
-  worker_job->set_num_tasks(2);
+  CoordinationService::Config config;
+  CoordinatedJob chief_job;
+  chief_job.set_name("chief");
+  chief_job.set_num_tasks(1);
+  config.coordinated_job_list.push_back(chief_job);
+  CoordinatedJob worker_job;
+  worker_job.set_name("worker");
+  worker_job.set_num_tasks(2);
+  config.coordinated_job_list.push_back(worker_job);
 
   auto coord_service =
       std::make_unique<CoordinationService>(tsl::Env::Default(), config);
@@ -369,7 +368,7 @@ TEST(CoordinationServiceTest, TestCoordinatedJobs) {
 // In this case, the agent would retry Connect() and should succeed if it has
 // the same incarnation.
 TEST(CoordinationServiceTest, RegisterTask_AlreadyConnected_Succeeds) {
-  const CoordinationServiceConfig config =
+  const CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
   CoordinatedTask task_0;
   task_0.set_job_name("worker");
@@ -388,7 +387,7 @@ TEST(CoordinationServiceTest, RegisterTask_AlreadyConnected_Succeeds) {
 
 TEST(CoordinationServiceTest,
      RegisterTask_AlreadyConnectedDifferentIncarnation_Fails) {
-  const CoordinationServiceConfig config =
+  const CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
   CoordinatedTask task_0;
   task_0.set_job_name("worker");
@@ -408,7 +407,7 @@ TEST(CoordinationServiceTest,
 }
 
 TEST(CoordinationServiceTest, RegisterTask_AlreadyInError_Fails) {
-  CoordinationServiceConfig config =
+  CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
   CoordinatedTask task_0;
   task_0.set_job_name("worker");
@@ -854,7 +853,7 @@ TEST_F(CoordinateTwoTasksTest, TestSetGetValues) {
 }
 
 TEST(CoordinationServiceTest, TryGetKeyValue) {
-  const CoordinationServiceConfig config =
+  const CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
   std::unique_ptr<CoordinationService> coord_service =
       std::make_unique<CoordinationService>(tsl::Env::Default(), config);
@@ -876,7 +875,7 @@ TEST(CoordinationServiceTest, TryGetKeyValue) {
 }
 
 TEST(CoordinationServiceTest, IncrementKeyValue) {
-  const CoordinationServiceConfig config =
+  const CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/1);
   std::unique_ptr<CoordinationService> coord_service =
       std::make_unique<CoordinationService>(tsl::Env::Default(), config);
@@ -975,7 +974,7 @@ TEST_F(CoordinateTwoTasksTest,
 // Verify that coordination service can gather each task's device info and
 // propagate the aggregated cluster device info correctly.
 TEST(CoordinationServiceTest, ListClusterDevices_TfDevice) {
-  const CoordinationServiceConfig config =
+  const CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/3);
   CoordinatedTask task_0;
   task_0.set_job_name("worker");
@@ -1032,7 +1031,7 @@ TEST(CoordinationServiceTest, ListClusterDevices_TfDevice) {
 // Task devices should not be added twice if same task calls WaitForAllDevices()
 // twice.
 TEST(CoordinationServiceTest, ListClusterDevices_DevicesAreNotAddedTwice) {
-  const CoordinationServiceConfig config =
+  const CoordinationService::Config config =
       GetCoordinationServiceConfig(/*num_tasks=*/2);
   CoordinatedTask task_0;
   task_0.set_job_name("worker");
@@ -2110,10 +2109,9 @@ TEST_F(CoordinateTwoTasksTest,
 }
 
 TEST(CoordinationServiceTest, RecoverableAndNonRecoverableTasks) {
-  CoordinationServiceConfig config;
-  config.set_service_type(kCoordinationServiceType);
+  CoordinationService::Config config;
   // Workers are recoverable, chief is not.
-  config.mutable_recoverable_jobs()->Add("worker");
+  config.recoverable_jobs.insert("worker");
   CoordinatedTask chief;
   chief.set_job_name("chief");
   chief.set_task_id(0);
@@ -2123,12 +2121,14 @@ TEST(CoordinationServiceTest, RecoverableAndNonRecoverableTasks) {
   CoordinatedTask task_1;
   task_1.set_job_name("worker");
   task_1.set_task_id(1);
-  CoordinatedJob* chief_job = config.mutable_coordinated_job_list()->Add();
-  chief_job->set_name("chief");
-  chief_job->set_num_tasks(1);
-  CoordinatedJob* worker_job = config.mutable_coordinated_job_list()->Add();
-  worker_job->set_name("worker");
-  worker_job->set_num_tasks(2);
+  CoordinatedJob chief_job;
+  chief_job.set_name("chief");
+  chief_job.set_num_tasks(1);
+  config.coordinated_job_list.push_back(chief_job);
+  CoordinatedJob worker_job;
+  worker_job.set_name("worker");
+  worker_job.set_num_tasks(2);
+  config.coordinated_job_list.push_back(worker_job);
 
   std::unique_ptr<CoordinationService> coord_service =
       std::make_unique<CoordinationService>(tsl::Env::Default(), config);
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
index c8edb09a60bcd0..3fed3b8b36cd1b 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
@@ -43,7 +43,6 @@ namespace xla {
 namespace {
 using tensorflow::CoordinatedJob;
 using tensorflow::CoordinatedTask;
-using tensorflow::CoordinationServiceConfig;
 
 constexpr char kJobName[] = "test_worker";
 
@@ -144,11 +143,11 @@ class PreemptionSyncManagerTest : public ::testing::Test {
         [service = coord_rpc_service_.get()]() { service->HandleRPCsLoop(); }));
   }
   std::unique_ptr<CoordinationService> EnableCoordinationService() {
-    CoordinationServiceConfig config;
-    config.set_service_type("standalone");
-    CoordinatedJob* job = config.mutable_coordinated_job_list()->Add();
-    job->set_name(kJobName);
-    job->set_num_tasks(2);
+    CoordinationService::Config config;
+    CoordinatedJob job;
+    job.set_name(kJobName);
+    job.set_num_tasks(2);
+    config.coordinated_job_list.push_back(job);
     return std::make_unique<CoordinationService>(tsl::Env::Default(), config);
   }
   void InitializeAndConnectCoordinationAgents() {
@@ -161,8 +160,8 @@ class PreemptionSyncManagerTest : public ::testing::Test {
     auto error_fn = [](const absl::Status& status) {
       LOG(ERROR) << "Coordination service agent in error status: " << status;
     };
-    CoordinationServiceConfig coord_config;
-    coord_config.set_service_leader("test_leader");
+    CoordinationServiceAgent::Config coord_config;
+    coord_config.service_leader = "test_leader";
     CHECK_OK(coord_agent_->Initialize(tsl::Env::Default(), kJobName,
                                       /*task_id=*/0, coord_config,
                                       std::move(coord_client), error_fn));
diff --git a/third_party/xla/xla/pjrt/distributed/service.cc b/third_party/xla/xla/pjrt/distributed/service.cc
index c5688fc14e8e37..513341bbd42c3e 100644
--- a/third_party/xla/xla/pjrt/distributed/service.cc
+++ b/third_party/xla/xla/pjrt/distributed/service.cc
@@ -37,20 +37,15 @@ namespace {
 std::unique_ptr<xla::CoordinationService> EnableCoordinationService(
     const xla::CoordinationServiceImpl::Options& options) {
   const std::string job_name = "jax_worker";
-  tensorflow::CoordinationServiceConfig config;
-  config.set_service_type("standalone");
-  config.set_service_leader(absl::StrCat("/job:", job_name, "/task:0"));
-  config.set_cluster_register_timeout_in_ms(
-      absl::ToInt64Milliseconds(options.cluster_register_timeout));
-  config.set_cluster_register_with_barrier(true);
-  config.set_heartbeat_timeout_in_ms(
-      absl::ToInt64Milliseconds(options.heartbeat_timeout));
-  config.set_shutdown_barrier_timeout_in_ms(
-      absl::ToInt64Milliseconds(options.shutdown_timeout));
-  tensorflow::CoordinatedJob* job =
-      config.mutable_coordinated_job_list()->Add();
-  job->set_name(job_name);
-  job->set_num_tasks(options.num_nodes);
+  xla::CoordinationService::Config config;
+  config.cluster_register_timeout = options.cluster_register_timeout;
+  config.cluster_register_with_barrier = true;
+  config.heartbeat_timeout = options.heartbeat_timeout;
+  config.shutdown_barrier_timeout = options.shutdown_timeout;
+  tensorflow::CoordinatedJob job;
+  job.set_name(job_name);
+  job.set_num_tasks(options.num_nodes);
+  config.coordinated_job_list.push_back(job);
   auto service =
       std::make_unique<xla::CoordinationService>(options.env, config);
   return service;

From 05e273cafa8a2fe1fb74b6ec01aaec6aaa1d3f09 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Mon, 15 Dec 2025 13:44:27 -0800
Subject: [PATCH 291/753] Add tests for recently added `NamedSharding`
 implementations

PiperOrigin-RevId: 844905452
---
 .../xla/xla/hlo/utils/hlo_sharding_util.cc    | 21 ++++++-----
 .../xla/hlo/utils/hlo_sharding_util_test.cc   | 36 +++++++++++++++++++
 2 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index c8ac759cf20973..dbfbeaa362fb66 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -1516,9 +1516,10 @@ HloSharding PartiallyReplicateTiledShardingOnDims(
         sharding.named_sharding().dim_shardings().begin(),
         sharding.named_sharding().dim_shardings().end());
     for (int64_t dim : dims_to_replicate) {
-      if (dim < dim_shardings.size()) {
-        dim_shardings[dim] = NamedSharding::DimensionSharding();
-      }
+      CHECK_LT(dim, dim_shardings.size())
+          << "Dimension " << dim << " is out of bounds for number dimensions "
+          << dim_shardings.size();
+      dim_shardings[dim] = NamedSharding::DimensionSharding();
     }
     return HloSharding(NamedSharding(
         sharding.named_sharding().mesh(), dim_shardings,
@@ -1527,12 +1528,10 @@ HloSharding PartiallyReplicateTiledShardingOnDims(
   }
 
   int64_t group_count = 1;
-  DimensionVector valid_dims_to_replicate;
   for (int64_t dim : dims_to_replicate) {
-    if (dim >= sharding.TiledDataRank()) {
-      continue;
-    }
-    valid_dims_to_replicate.push_back(dim);
+    CHECK_LT(dim, sharding.TiledDataRank())
+        << "Dimension " << dim << " is out of bounds for number dimensions "
+        << sharding.TiledDataRank();
     group_count *= sharding.dimension(dim);
   }
   if (group_count == 1) {
@@ -1544,14 +1543,14 @@ HloSharding PartiallyReplicateTiledShardingOnDims(
   DimensionVector dim_permutation(sharding.TiledDataRank());
   absl::c_iota(dim_permutation, 0);
   absl::c_stable_sort(dim_permutation, [&](const int64_t a, const int64_t b) {
-    return absl::c_linear_search(valid_dims_to_replicate, a) <
-           absl::c_linear_search(valid_dims_to_replicate, b);
+    return absl::c_linear_search(dims_to_replicate, a) <
+           absl::c_linear_search(dims_to_replicate, b);
   });
   auto new_tile =
       TransposeSharding(sharding, dim_permutation).tile_assignment();
   DimensionVector new_tile_shape(sharding.dimensions().begin(),
                                  sharding.dimensions().end());
-  for (int64_t dim : valid_dims_to_replicate) {
+  for (int64_t dim : dims_to_replicate) {
     new_tile_shape[dim] = 1;
   }
   if (sharding.ReplicateOnLastTileDim()) {
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
index ab7a203e0d2ae1..d5deceb27c1970 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
@@ -647,6 +647,42 @@ TEST(HloShardingUtilTest, PropagateShardingAlongDimsAndReplicateOthers4) {
   EXPECT_EQ(target_sharding.named_sharding(), expected);
 }
 
+TEST(HloShardingUtilTest, PartiallyReplicateTiledShardingOnDims) {
+  Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+  NamedSharding source_sharding =
+      test_utils::FromAxisNames(mesh, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}});
+  std::vector<int64_t> dims_to_replicate = {3, 1};
+  HloSharding target_sharding = PartiallyReplicateTiledShardingOnDims(
+      HloSharding(source_sharding), dims_to_replicate);
+  NamedSharding expected =
+      test_utils::FromAxisNames(mesh, {{"a"}, {}, {"c"}, {}, {"e"}});
+  EXPECT_EQ(target_sharding.named_sharding(), expected);
+}
+
+TEST(HloShardingUtilTest, ReplicateAllDataDims) {
+  Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+  NamedSharding source_sharding = test_utils::FromAxisNames(
+      mesh, {{"a"}, {}, {"c"}, {}, {"e"}}, /*replicated_axes=*/{"d"},
+      /*unreduced_axes=*/{"b"});
+  HloSharding target_sharding =
+      ReplicateAllDataDims(HloSharding(source_sharding), 3);
+  NamedSharding expected =
+      test_utils::FromAxisNames(mesh, {{}, {}, {}}, {"d"}, {"b"});
+  EXPECT_EQ(target_sharding.named_sharding(), expected);
+}
+
+TEST(HloShardingUtilTest, RemoveShapeDimensions) {
+  Mesh mesh({2, 3, 5, 7, 11}, {"a", "b", "c", "d", "e"});
+  NamedSharding source_sharding =
+      test_utils::FromAxisNames(mesh, {{"a"}, {}, {"c"}, {}, {"e"}});
+  std::vector<int64_t> dims_to_remove = {1, 3};
+  HloSharding target_sharding =
+      RemoveShapeDimensions(HloSharding(source_sharding), dims_to_remove);
+  NamedSharding expected =
+      test_utils::FromAxisNames(mesh, {{"a"}, {"c"}, {"e"}});
+  EXPECT_EQ(target_sharding.named_sharding(), expected);
+}
+
 TEST(HloShardingUtilTest, MergeManualSubgroupSharding) {
   TileAssignment tile_assignment({16, 4});
   std::vector<OpSharding::Type> subgroup_types = {OpSharding::MANUAL,

From dd3ff5ae83710345274b0db4b423ae3f287a5a73 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 15:05:48 -0800
Subject: [PATCH 292/753] Integrate LLVM at llvm/llvm-project@8f264586d752

Updates LLVM usage to match
[8f264586d752](https://github.com/llvm/llvm-project/commit/8f264586d752)

PiperOrigin-RevId: 844935558
---
 third_party/xla/third_party/llvm/workspace.bzl    |  4 ++--
 .../xla/third_party/shardy/temporary.patch        | 15 +++++++++++++++
 third_party/xla/third_party/shardy/workspace.bzl  |  4 ++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index 69a8c63368c081..f2c3289a046872 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "43bfec29cbecc1ff2e5aa6f8908c4d63e9c896c5"
-    LLVM_SHA256 = "d9c35a7c3764666abcf464955530154d528b2e5edeb97bfa8890f02cb52d1f30"
+    LLVM_COMMIT = "8f264586d7521b0e305ca7bb78825aa3382ffef7"
+    LLVM_SHA256 = "5784c4af94caba66bc8c460e07e222f751e4f4c9db9c45b3a68ff55379cf587d"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index e69de29bb2d1d6..3f2ad26d310aa0 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -0,0 +1,15 @@
+diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
+index 69a8c63..f2c3289 100644
+--- a/third_party/llvm/workspace.bzl
++++ b/third_party/llvm/workspace.bzl
+@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
+ 
+ def repo(name):
+     """Imports LLVM."""
+-    LLVM_COMMIT = "43bfec29cbecc1ff2e5aa6f8908c4d63e9c896c5"
+-    LLVM_SHA256 = "d9c35a7c3764666abcf464955530154d528b2e5edeb97bfa8890f02cb52d1f30"
++    LLVM_COMMIT = "8f264586d7521b0e305ca7bb78825aa3382ffef7"
++    LLVM_SHA256 = "5784c4af94caba66bc8c460e07e222f751e4f4c9db9c45b3a68ff55379cf587d"
+ 
+     tf_http_archive(
+         name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index beb5197aeec510..504e4d6b2c6ce2 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "e8435cb5c0b852b0e249b3fbf5f42dd51988afc9"
-    SHARDY_SHA256 = "59e1e10fef4f425cd3ef7f5200a5d8111476230818496402cc83b234b277b4be"
+    SHARDY_COMMIT = "940091203da82097e358114a6622d81b73693698"
+    SHARDY_SHA256 = "fa4cdeda270efd2faf3bd957d0a11c2dca6a36a9f071423dcbcbbb6cee43af0d"
 
     tf_http_archive(
         name = "shardy",

From 3fea46db70d0e48b6f54f2b2df420265905f8df2 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Mon, 15 Dec 2025 15:30:40 -0800
Subject: [PATCH 293/753] Remove `goto` in `hlo_sharding_util`.

Pure refactoring.

PiperOrigin-RevId: 844944658
---
 .../xla/xla/hlo/utils/hlo_sharding_util.cc    | 25 ++++++++-----------
 .../xla/xla/hlo/utils/hlo_sharding_util.h     |  4 +--
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index dbfbeaa362fb66..5bf1e9cdb42b23 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -405,19 +405,17 @@ bool MergeSharding(const HloSharding& to_merge, HloSharding* dst,
     }
     return changed;
   }
-  if (!may_combine_partial_sharding || !to_merge.HasPartialReplication() ||
-      !dst->HasPartialReplication() ||
-      to_merge.num_devices() != dst->num_devices()) {
-    goto check_if_more_specific;
-  }
 
-  if (MergeShardingIfCompatible(
+  if (may_combine_partial_sharding && to_merge.HasPartialReplication() &&
+      dst->HasPartialReplication() &&
+      to_merge.num_devices() == dst->num_devices() &&
+      MergeShardingIfCompatible(
           to_merge,
           /*minimum_tiles=*/std::max(to_merge.NumTiles(), dst->NumTiles()) + 1,
           dst)) {
     return true;
   }
-check_if_more_specific:
+
   return IsLeafShardingMoreSpecific(*dst, to_merge);
 }
 
@@ -2872,13 +2870,12 @@ std::optional<HloSharding> ReturnImprovedShardingImpl(
     // with the existing one. This avoids unexpected resharding when `sharding`
     // just has more tiles than existing sharding but they are not mergeable.
     if (!allow_aggressive_resharding && to_improved_shape.IsArray() &&
-        !to_improved->IsTileMaximal() && from.NumTiles() == sharding_tiles) {
-      if (!IsSubTilingOrEqualSharding(to_improved_shape, from, *to_improved)) {
-        VLOG(10) << "Not merging because of different device distribution";
-        VLOG(10) << "Instr sharding: " << to_improved->ToString();
-        VLOG(10) << "New sharding " << from.ToString();
-        return std::nullopt;
-      }
+        !to_improved->IsTileMaximal() && from.NumTiles() == sharding_tiles &&
+        !IsSubTilingOrEqualSharding(to_improved_shape, from, *to_improved)) {
+      VLOG(10) << "Not merging because of different device distribution";
+      VLOG(10) << "Instr sharding: " << to_improved->ToString();
+      VLOG(10) << "New sharding " << from.ToString();
+      return std::nullopt;
     }
     return from;
   }
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
index 1f521eedaa8006..66e60692386523 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
@@ -99,8 +99,8 @@ bool IsSubTilingOrEqualSharding(const Shape& shape,
 // sharding with same preference level.
 bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs);
 
-// Tries to refine `to_merge` by combining with `old`. Returns if the final
-// `to_merge` is more specific than `old`.
+// Tries to refine `dst` by merging `to_merge` into it. Returns if the final
+// `dst` is more specific than `to_merge`.
 bool MergeSharding(const HloSharding& to_merge, HloSharding* dst,
                    bool may_combine_partial_sharding);
 

From 4043b4b2816148d05ad05b4cadf4c4c94457015f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 15:48:53 -0800
Subject: [PATCH 294/753] IFRT proxy, nit: Remove `array_is_deleted_hack` flag.

PiperOrigin-RevId: 844951374
---
 third_party/xla/xla/python/ifrt_proxy/client/array.cc         | 3 ---
 third_party/xla/xla/python/ifrt_proxy/client/global_flags.h   | 4 ----
 .../xla/xla/python/ifrt_proxy/client/global_flags_oss.cc      | 2 --
 3 files changed, 9 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt_proxy/client/array.cc b/third_party/xla/xla/python/ifrt_proxy/client/array.cc
index d30ee6f9e0d03d..38ae216b880237 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/array.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/array.cc
@@ -497,9 +497,6 @@ bool Array::IsDeleted() const {
       return false;
     }
   }
-  if (GetGlobalClientFlags()->array_is_deleted_hack) {
-    return false;
-  }
   auto req = std::make_unique<IsArrayDeletedRequest>();
   req->set_array_handle(handle_.handle);
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/global_flags.h b/third_party/xla/xla/python/ifrt_proxy/client/global_flags.h
index 0a0a0ce0286b5d..505cac0be580b6 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/global_flags.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/global_flags.h
@@ -35,9 +35,6 @@ struct GlobalClientFlags {
   // codepath works well.
   bool synchronous_host_buffer_store;
 
-  // TODO(b/393445969): Implement faster is_delete without needing a hack.
-  bool array_is_deleted_hack;
-
   // Zero or negative values are interpreted as no maximum.
   int grpc_max_ongoing_host_buffer_stores;
   int grpc_max_ongoing_host_buffer_lookups;
@@ -51,7 +48,6 @@ inline std::ostream& operator<<(std::ostream& os, GlobalClientFlags flags) {
   return os << "xla::ifrt::proxy::GlobalClientFlags{"
             << "synchronous_host_buffer_store="
             << flags.synchronous_host_buffer_store << ","
-            << "array_is_deleted_hack=" << flags.array_is_deleted_hack << ","
             << "grpc_max_ongoing_host_buffer_stores="
             << flags.grpc_max_ongoing_host_buffer_stores << ","
             << "grpc_max_ongoing_host_buffer_lookups="
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/global_flags_oss.cc b/third_party/xla/xla/python/ifrt_proxy/client/global_flags_oss.cc
index 5c819a4dc8b146..3387a3f5a78b77 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/global_flags_oss.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/global_flags_oss.cc
@@ -55,8 +55,6 @@ IntType GetIntFromEnv(const char* key, IntType default_value) {
 static GlobalClientFlags DefaultGlobalClientFlags() {
   GlobalClientFlags result;
   result.synchronous_host_buffer_store = false;
-  result.array_is_deleted_hack =
-      GetBoolFromEnv("IFRT_PROXY_ARRAY_IS_DELETED_HACK", false);
   result.grpc_max_ongoing_host_buffer_stores =
       GetIntFromEnv<int>("IFRT_PROXY_GRPC_MAX_ONGOING_HOST_BUFFER_STORES", 0);
   result.grpc_max_ongoing_host_buffer_lookups =

From bc72b2ce8d0eb465d783f2bc4127e487e5a229e3 Mon Sep 17 00:00:00 2001
From: skill <skill@google.com>
Date: Mon, 15 Dec 2025 15:58:41 -0800
Subject: [PATCH 295/753] Use deps instead of deprecated protodeps field in
 tf_proto_library as it has better compatibility with the tools.

PiperOrigin-RevId: 844954652
---
 tensorflow/core/BUILD           |  6 +--
 tensorflow/core/framework/BUILD | 84 ++++++++++++++++-----------------
 tensorflow/core/util/BUILD      | 26 +++++-----
 3 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index dd8da3665c5294..6ba886d2266ad1 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -174,23 +174,23 @@ tf_proto_library(
     srcs = [],
     create_go_proto = False,
     make_default_target_header_only = True,
-    protodeps = [
+    visibility = ["//visibility:public"],
+    deps = [
         "//tensorflow/core/example:protos_all",
         "//tensorflow/core/framework:protos_all",
+        "//tensorflow/core/grappler/costs:op_performance_data",
         "//tensorflow/core/lib/core:error_codes_proto",
         "//tensorflow/core/profiler:profiler_options_proto",
         "//tensorflow/core/protobuf:error_codes_proto_impl",
         "//tensorflow/core/protobuf:for_core_protos",
         "//tensorflow/core/util:protos_all",
         "//tensorflow/core/util:test_log_proto",
-        "//tensorflow/core/grappler/costs:op_performance_data",
         "@local_tsl//tsl/profiler/protobuf:profiler_options_proto",
         "@local_tsl//tsl/profiler/protobuf:xplane_proto",
         "@local_xla//xla/tsl/protobuf:coordination_config_proto",
         "@local_xla//xla/tsl/protobuf:distributed_runtime_payloads_proto",
         "@local_xla//xla/tsl/protobuf:status_proto",
     ],
-    visibility = ["//visibility:public"],
 )
 
 tf_jspb_proto_library(
diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD
index 9de70eb28c1b07..b6d0a1bee44ad3 100644
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@@ -1554,7 +1554,7 @@ tf_proto_library(
     name = "log_memory_proto",
     srcs = ["log_memory.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":allocation_description_proto",
         ":tensor_description_proto",
         ":tensor_shape_proto",
@@ -1572,7 +1572,8 @@ tf_proto_library(
     name = "graph_proto",
     srcs = ["graph.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    visibility = ["//visibility:public"],
+    deps = [
         ":attr_value_proto",
         ":function_proto",
         ":graph_debug_info_proto",
@@ -1584,14 +1585,13 @@ tf_proto_library(
         ":types_proto",
         ":versions_proto",
     ],
-    visibility = ["//visibility:public"],
 )
 
 tf_proto_library(
     name = "node_def_proto",
     srcs = ["node_def.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":attr_value_proto",
         ":full_type_proto",
         ":resource_handle_proto",
@@ -1623,7 +1623,7 @@ tf_proto_library(
     name = "tensor_description_proto",
     srcs = ["tensor_description.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":allocation_description_proto",
         ":tensor_shape_proto",
         ":types_proto",
@@ -1641,7 +1641,7 @@ tf_proto_library(
     name = "resource_handle_proto",
     srcs = ["resource_handle.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":tensor_shape_proto",
         ":types_proto",
     ],
@@ -1651,7 +1651,7 @@ tf_proto_library(
     name = "step_stats_proto",
     srcs = ["step_stats.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":allocation_description_proto",
         ":tensor_description_proto",
         ":tensor_shape_proto",
@@ -1669,7 +1669,7 @@ tf_proto_library(
     name = "kernel_def_proto",
     srcs = ["kernel_def.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":attr_value_proto",
         ":resource_handle_proto",
         ":tensor_proto",
@@ -1682,7 +1682,11 @@ tf_proto_library(
     name = "op_def_proto",
     srcs = ["op_def.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    visibility = [
+        "//tensorflow/core:__subpackages__",
+        "//tensorflow/python:__pkg__",
+    ],
+    deps = [
         ":attr_value_proto",
         ":full_type_proto",
         ":resource_handle_proto",
@@ -1690,22 +1694,12 @@ tf_proto_library(
         ":tensor_shape_proto",
         ":types_proto",
     ],
-    visibility = [
-        "//tensorflow/core:__subpackages__",
-        "//tensorflow/python:__pkg__",
-    ],
 )
 
 tf_proto_library(
     name = "attr_value_proto",
     srcs = ["attr_value.proto"],
     make_default_target_header_only = True,
-    protodeps = [
-        ":resource_handle_proto",
-        ":tensor_proto",
-        ":tensor_shape_proto",
-        ":types_proto",
-    ],
     visibility = [
         #internal library,
         "//tensorflow/core:__subpackages__",
@@ -1714,20 +1708,26 @@ tf_proto_library(
         "//tensorflow/security/fuzzing:__subpackages__",
         "//waymo/ml/deploy/benchmark:__subpackages__",
     ],
+    deps = [
+        ":resource_handle_proto",
+        ":tensor_proto",
+        ":tensor_shape_proto",
+        ":types_proto",
+    ],
 )
 
 tf_proto_library(
     name = "full_type_proto",
     srcs = ["full_type.proto"],
     make_default_target_header_only = True,
-    protodeps = [],
+    deps = [],
 )
 
 tf_proto_library(
     name = "tensor_proto",
     srcs = ["tensor.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":resource_handle_proto",
         ":tensor_shape_proto",
         ":types_proto",
@@ -1744,7 +1744,7 @@ tf_proto_library(
     name = "api_def_proto",
     srcs = ["api_def.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":attr_value_proto",
         ":resource_handle_proto",
         ":tensor_proto",
@@ -1757,7 +1757,7 @@ tf_proto_library(
     name = "cpp_shape_inference_proto",
     srcs = ["cpp_shape_inference.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":full_type_proto",
         ":tensor_shape_proto",
         ":types_proto",
@@ -1774,7 +1774,7 @@ tf_proto_library(
     name = "graph_transfer_info_proto",
     srcs = ["graph_transfer_info.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":types_proto",
     ],
 )
@@ -1796,7 +1796,7 @@ tf_proto_library(
     name = "cost_graph_proto",
     srcs = ["cost_graph.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":tensor_shape_proto",
         ":types_proto",
     ],
@@ -1812,7 +1812,10 @@ tf_proto_library(
     name = "function_proto",
     srcs = ["function.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    visibility = [
+        "//tensorflow/python:__pkg__",
+    ] + default_visibility,
+    deps = [
         ":attr_value_proto",
         ":node_def_proto",
         ":op_def_proto",
@@ -1821,9 +1824,6 @@ tf_proto_library(
         ":tensor_shape_proto",
         ":types_proto",
     ],
-    visibility = [
-        "//tensorflow/python:__pkg__",
-    ] + default_visibility,
 )
 
 # copybara:uncomment_begin(google-only)
@@ -1840,14 +1840,14 @@ tf_proto_library(
     name = "summary_proto",
     srcs = ["summary.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    exports = ["@local_xla//xla/tsl/protobuf:histogram_proto"],
+    deps = [
         ":resource_handle_proto",
         ":tensor_proto",
         ":tensor_shape_proto",
         ":types_proto",
         "@local_xla//xla/tsl/protobuf:histogram_proto",
     ],
-    exports = ["@local_xla//xla/tsl/protobuf:histogram_proto"],
 )
 
 tf_proto_library(
@@ -1860,7 +1860,7 @@ tf_proto_library(
     name = "dataset_proto",
     srcs = ["dataset.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":tensor_proto",
         ":tensor_shape_proto",
         ":types_proto",
@@ -1877,7 +1877,7 @@ tf_proto_library(
     name = "dataset_options_proto",
     srcs = ["dataset_options.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    deps = [
         ":model_proto",
     ],
 )
@@ -1886,24 +1886,27 @@ tf_proto_library(
     name = "optimized_function_graph_proto",
     srcs = ["optimized_function_graph.proto"],
     make_default_target_header_only = True,
-    protodeps = [
-        ":types_proto",
+    deps = [
         ":graph_proto",
+        ":types_proto",
     ],
 )
 
 tf_proto_library(
     name = "protos_all",
     make_default_target_header_only = True,
-    protodeps = [
+    tags = [
+        "alt_dep=//third_party/tensorflow/core:protos_all",
+    ],
+    deps = [
         ":allocation_description_proto",
         ":api_def_proto",
-        ":cpp_shape_inference_proto",
         ":attr_value_proto",
         ":cost_graph_proto",
-        ":dataset_proto",
+        ":cpp_shape_inference_proto",
         ":dataset_metadata_proto",
         ":dataset_options_proto",
+        ":dataset_proto",
         ":device_attributes_proto",
         ":full_type_proto",
         ":function_proto",
@@ -1914,8 +1917,8 @@ tf_proto_library(
         ":log_memory_proto",
         ":model_proto",
         ":node_def_proto",
-        ":optimized_function_graph_proto",
         ":op_def_proto",
+        ":optimized_function_graph_proto",
         ":reader_base_proto",
         ":resource_handle_proto",
         ":step_stats_proto",
@@ -1928,9 +1931,6 @@ tf_proto_library(
         ":variable_proto",
         ":versions_proto",
     ],
-    tags = [
-        "alt_dep=//third_party/tensorflow/core:protos_all",
-    ],
 )
 
 tf_cc_fuzz_test(
diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD
index 72cd0b7751e2cc..05ae29c1619d87 100644
--- a/tensorflow/core/util/BUILD
+++ b/tensorflow/core/util/BUILD
@@ -953,38 +953,39 @@ tf_proto_library(
     name = "test_log_proto",
     srcs = ["test_log.proto"],
     make_default_target_header_only = True,
-    protodeps = ["@local_xla//xla/tsl/protobuf:test_log_proto"],
     # Not to be used outside of tensorflow/core.
     visibility = ["//tensorflow/core:__pkg__"],
     exports = ["@local_xla//xla/tsl/protobuf:test_log_proto"],
+    deps = ["@local_xla//xla/tsl/protobuf:test_log_proto"],
 )
 
 tf_proto_library(
     name = "protos_test",
     srcs = ["example_proto_fast_parsing_test.proto"],
-    protodeps = ["//tensorflow/core:protos_all"],
     visibility = ["//visibility:public"],
+    deps = ["//tensorflow/core:protos_all"],
 )
 
 tf_proto_library(
     name = "event_proto",
     srcs = ["event.proto"],
     make_default_target_header_only = True,
-    protodeps = [
-        "//tensorflow/core/framework:summary_proto",
+    visibility = ["//visibility:public"],
+    deps = [
         "//tensorflow/core/framework:resource_handle_proto",
+        "//tensorflow/core/framework:summary_proto",
         "//tensorflow/core/framework:tensor_proto",
         "//tensorflow/core/framework:tensor_shape_proto",
         "//tensorflow/core/framework:types_proto",
     ],
-    visibility = ["//visibility:public"],
 )
 
 tf_proto_library(
     name = "saved_tensor_slice_proto",
     srcs = ["saved_tensor_slice.proto"],
     make_default_target_header_only = True,
-    protodeps = [
+    visibility = ["//visibility:public"],
+    deps = [
         "//tensorflow/core/framework:resource_handle_proto",
         "//tensorflow/core/framework:tensor_proto",
         "//tensorflow/core/framework:tensor_shape_proto",
@@ -992,7 +993,6 @@ tf_proto_library(
         "//tensorflow/core/framework:types_proto",
         "//tensorflow/core/framework:versions_proto",
     ],
-    visibility = ["//visibility:public"],
 )
 
 tf_proto_library(
@@ -1005,16 +1005,16 @@ tf_proto_library(
 tf_proto_library(
     name = "protos_all",
     make_default_target_header_only = True,
-    protodeps = [
-        ":event_proto",
-        ":saved_tensor_slice_proto",
-        ":memmapped_file_system_proto",
-        "//tensorflow/core/util/quantization:uniform_quant_ops_attr_proto",
-    ],
     tags = [
         "alt_dep=//third_party/tensorflow/core:protos_all",
     ],
     visibility = ["//tensorflow/core:__subpackages__"],
+    deps = [
+        ":event_proto",
+        ":memmapped_file_system_proto",
+        ":saved_tensor_slice_proto",
+        "//tensorflow/core/util/quantization:uniform_quant_ops_attr_proto",
+    ],
 )
 
 cc_library(

From ee2f4c2e059f99c8fd3564f3832a2eddab48655a Mon Sep 17 00:00:00 2001
From: Zviki Nozadze <zviki@google.com>
Date: Mon, 15 Dec 2025 16:25:35 -0800
Subject: [PATCH 296/753] HloModuleSplit - convenience function to get cloned
 computation from the original computation

PiperOrigin-RevId: 844963973
---
 .../hlo_module_linking_test.cc                | 19 +++++++++---------
 .../hlo_module_splitting.cc                   | 20 +++++++++++++++++++
 .../hlo_module_splitting.h                    |  5 +++++
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking_test.cc b/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking_test.cc
index 2ebba94ada5c87..5a46cfb331364e 100644
--- a/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking_test.cc
+++ b/third_party/xla/xla/hlo/separate_compilation/hlo_module_linking_test.cc
@@ -147,8 +147,9 @@ TEST_F(LinkingTest, SingleCallLinking) {
   const HloLinkingManifest& linking_manifest =
       module_split_group->linking_manifest;
   auto* original_root = FindComputation(original_module.get(), "main");
-  auto* split_group_root = module_split_group->address_book.at(original_root)
-                               ->computation_map.at(original_root);
+  TF_ASSERT_OK_AND_ASSIGN(
+      const HloComputation* split_group_root,
+      module_split_group->GetClonedComputation(original_root));
 
   TF_ASSERT_OK_AND_ASSIGN(auto linked_module,
                           LinkComputation(linking_manifest, split_group_root));
@@ -208,8 +209,9 @@ TEST_F(LinkingTest, ChainGraphLinking) {
   const HloLinkingManifest& linking_manifest =
       module_split_group->linking_manifest;
   auto* original_root = FindComputation(original_module.get(), "main");
-  auto* split_group_root = module_split_group->address_book.at(original_root)
-                               ->computation_map.at(original_root);
+  TF_ASSERT_OK_AND_ASSIGN(
+      const HloComputation* split_group_root,
+      module_split_group->GetClonedComputation(original_root));
 
   TF_ASSERT_OK_AND_ASSIGN(auto linked_module,
                           LinkComputation(linking_manifest, split_group_root));
@@ -280,13 +282,12 @@ TEST_F(LinkingTest, DiamondGraphLinking) {
   const HloLinkingManifest& linking_manifest =
       module_split_group->linking_manifest;
   auto* original_root = FindComputation(original_module.get(), "main");
-  ASSERT_TRUE(module_split_group->address_book.contains(original_root));
-  auto* split = module_split_group->address_book.at(original_root);
-  ASSERT_TRUE(split->computation_map.contains(original_root));
-  auto* split_root = split->computation_map.at(original_root);
+  TF_ASSERT_OK_AND_ASSIGN(
+      const HloComputation* split_group_root,
+      module_split_group->GetClonedComputation(original_root));
 
   TF_ASSERT_OK_AND_ASSIGN(auto linked_module,
-                          LinkComputation(linking_manifest, split_root));
+                          LinkComputation(linking_manifest, split_group_root));
   HloVerifier verifier(HloVerifierOpts{});
   TF_ASSERT_OK(verifier.Run(linked_module.get()));
 
diff --git a/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.cc b/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.cc
index ac8afaeaeaf0a2..00a0bd2a0acefc 100644
--- a/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.cc
+++ b/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.cc
@@ -332,4 +332,24 @@ absl::StatusOr<std::unique_ptr<HloModuleSplitGroup>> CreateHloModuleSplitGroup(
       std::move(linking_manifest));
 }
 
+absl::StatusOr<const HloComputation*> HloModuleSplitGroup::GetClonedComputation(
+    const HloComputation* original_computation) const {
+  auto it = address_book.find(original_computation);
+  if (it == address_book.end()) {
+    return absl::NotFoundError(
+        absl::StrCat("Original computation '", original_computation->name(),
+                     "' not found in HloModuleSplitGroup address book."));
+  }
+  auto& computation_map = it->second->computation_map;
+  auto it2 = computation_map.find(original_computation);
+  if (it2 == computation_map.end()) {
+    return absl::InternalError(absl::StrCat(
+        "Original computation '", original_computation->name(),
+        "' found in address book but not in computation map for its "
+        "module split '",
+        it->second->submodule->name(), "'."));
+  }
+  return it2->second;
+}
+
 }  // namespace xla::separate_compilation
diff --git a/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.h b/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.h
index 4569edd925e5ca..97a0031035511c 100644
--- a/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.h
+++ b/third_party/xla/xla/hlo/separate_compilation/hlo_module_splitting.h
@@ -94,6 +94,11 @@ struct HloModuleSplitGroup {
       : address_book(std::move(address_book)),
         module_splits(std::move(module_splits)),
         linking_manifest(std::move(linking_manifest)) {}
+
+  // Returns the cloned version of the given original computation, or
+  // an error if the computation is not part of this split group.
+  absl::StatusOr<const HloComputation*> GetClonedComputation(
+      const HloComputation* original_computation) const;
 };
 
 // Split the given module. Returns a mapping from `HloComputation*` to

From eb61483d764c3ce21cbeb67bfce012344337f921 Mon Sep 17 00:00:00 2001
From: Hyeontaek Lim <hyeontaek@google.com>
Date: Mon, 15 Dec 2025 17:13:17 -0800
Subject: [PATCH 297/753] [PjRt-IFRT] Create `ifrt::PjRtExecutable` only from
 `ifrt::PjRtCompiler` and `CompileOnlyIfrtCompiler`

This change migrates direct calls to `ifrt::PjRtExecutable::Create()` outside to use a public IFRT API `ifrt::PjRtCompiler::Compile()` instead.

This change should be no-op in practice. For PjRt-IFRT, it now performs IFRT device ID -> PjRt device ID conversion in `xla::CompileOptions::executable_build_options` (which was missing before) and thus can handle a client using a different device ID mapping.

PiperOrigin-RevId: 844980890
---
 .../xla/xla/python/compile_only_ifrt/BUILD    |  3 ++
 .../xla/python/compile_only_ifrt/client.cc    | 39 +++++++++++++++++++
 .../xla/xla/python/compile_only_ifrt/client.h |  4 +-
 .../xla/python/pjrt_ifrt/pjrt_executable.cc   |  5 ---
 .../xla/python/pjrt_ifrt/pjrt_executable.h    |  7 ----
 third_party/xla/xla/python/version.h          |  3 +-
 6 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/python/compile_only_ifrt/BUILD b/third_party/xla/xla/python/compile_only_ifrt/BUILD
index 943810b3b1f647..a9279abc8fb5e0 100644
--- a/third_party/xla/xla/python/compile_only_ifrt/BUILD
+++ b/third_party/xla/xla/python/compile_only_ifrt/BUILD
@@ -19,14 +19,17 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/pjrt:host_memory_spaces",
         "//xla/pjrt:pjrt_device_description",
+        "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_layout",
         "//xla/python/ifrt",
         "//xla/python/ifrt:attribute_map",
         "//xla/python/ifrt:basic_device_list",
         "//xla/python/ifrt:user_context",
+        "//xla/python/ifrt/hlo:hlo_program",
         "//xla/python/pjrt_ifrt",
         "//xla/python/pjrt_ifrt:pjrt_attribute_map_util",
         "//xla/python/pjrt_ifrt:pjrt_dtype",
+        "//xla/python/pjrt_ifrt:xla_ifrt",
         "//xla/service:computation_placer_hdr",
         "//xla/tsl/concurrency:future",
         "//xla/tsl/concurrency:ref_count",
diff --git a/third_party/xla/xla/python/compile_only_ifrt/client.cc b/third_party/xla/xla/python/compile_only_ifrt/client.cc
index f837702ca2d5d5..380dc756cbce2f 100644
--- a/third_party/xla/xla/python/compile_only_ifrt/client.cc
+++ b/third_party/xla/xla/python/compile_only_ifrt/client.cc
@@ -15,6 +15,22 @@ limitations under the License.
 
 #include "xla/python/compile_only_ifrt/client.h"
 
+#include <memory>
+#include <utility>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/pjrt/pjrt_executable.h"
+#include "xla/python/ifrt/compiler.h"
+#include "xla/python/ifrt/executable.h"
+#include "xla/python/ifrt/hlo/hlo_program.h"
+#include "xla/python/ifrt/program.h"
+#include "xla/python/ifrt/topology.h"
+#include "xla/python/pjrt_ifrt/pjrt_executable.h"
+#include "xla/python/pjrt_ifrt/pjrt_topology.h"
+#include "xla/python/pjrt_ifrt/xla_compiler.h"
+#include "xla/tsl/platform/statusor.h"
+
 namespace xla {
 
 [[maybe_unused]] char CompileOnlyMemory::ID = 0;
@@ -22,4 +38,27 @@ namespace xla {
 [[maybe_unused]] char CompileOnlyIfrtCompiler::ID = 0;
 [[maybe_unused]] char CompileOnlyIfRtClient::ID = 0;
 
+absl::StatusOr<ifrt::ExecutableRef> CompileOnlyIfrtCompiler::Compile(
+    std::unique_ptr<ifrt::Program> program, const ifrt::Topology& topology,
+    std::unique_ptr<ifrt::CompileOptions> options) {
+  const auto* xla_program = llvm::dyn_cast<ifrt::HloProgram>(program.get());
+  if (xla_program == nullptr) {
+    return absl::InvalidArgumentError(
+        "CompileOnlyIfrtCompiler requires an HloProgram");
+  }
+  TF_ASSIGN_OR_RETURN(auto xla_compile_options,
+                      ifrt::GetXlaCompileOptions(std::move(options)));
+  // Unlike PjRt-IFRT, device ID translation is unnecessary because
+  // `CompileOnlyIfrtClient` does not support device ID mapping.
+  const auto* pjrt_topology = llvm::dyn_cast<ifrt::PjRtTopology>(&topology);
+  if (pjrt_topology == nullptr) {
+    return absl::InvalidArgumentError(
+        "CompileOnlyIfrtCompiler requires a PjRtTopology");
+  }
+  return ifrt::PjRtExecutable::Create(
+      xla_program->mlir_module(),
+      std::move(xla_compile_options->compile_options),
+      *pjrt_topology->description());
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/python/compile_only_ifrt/client.h b/third_party/xla/xla/python/compile_only_ifrt/client.h
index 659227fcecbdca..ef30a57d8cdd8a 100644
--- a/third_party/xla/xla/python/compile_only_ifrt/client.h
+++ b/third_party/xla/xla/python/compile_only_ifrt/client.h
@@ -167,9 +167,7 @@ class CompileOnlyIfrtCompiler final
 
   absl::StatusOr<ifrt::ExecutableRef> Compile(
       std::unique_ptr<ifrt::Program> program, const ifrt::Topology& topology,
-      std::unique_ptr<ifrt::CompileOptions> options) override {
-    return Unimplemented("Compile not implemented.");
-  }
+      std::unique_ptr<ifrt::CompileOptions> options) override;
 
   absl::Status IsExecutableVersionCompatible(
       const xla::ifrt::ExecutableVersion& executable_version,
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
index 2ea62a8d42fda4..d668d190ba5ce6 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
@@ -418,11 +418,6 @@ char PjRtCompatibleLoadedExecutable::ID = 0;
 char PjRtExecutable::ID = 0;
 char PjRtLoadedExecutable::ID = 0;
 
-absl::StatusOr<ExecutableRef> PjRtExecutable::Create(
-    std::shared_ptr<xla::PjRtExecutable> pjrt_executable) {
-  return ExecutableRef(new PjRtExecutable(std::move(pjrt_executable)));
-}
-
 absl::StatusOr<ExecutableRef> PjRtExecutable::Create(
     mlir::ModuleOp module, xla::CompileOptions compile_options,
     const xla::PjRtTopologyDescription& topology) {
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
index c3e4bd2111dc7d..71a3fb944ca0be 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
@@ -90,13 +90,6 @@ class PjRtCompatibleLoadedExecutable
 class PjRtExecutable final
     : public llvm::RTTIExtends<PjRtExecutable, PjRtCompatibleExecutable> {
  public:
-  // Creates PjRtExecutable from xla::PjRtExecutable.
-  ABSL_DEPRECATED(
-      "Use the `Create()` that takes an MLIR module and compiles it "
-      "internally.")
-  static absl::StatusOr<ExecutableRef> Create(
-      std::shared_ptr<xla::PjRtExecutable> pjrt_executable);
-
   // Creates PjRtExecutable from an MLIR module. Internally, it compiles the
   // provided MLIR module into an `xla::PjRtExecutable`.
   static absl::StatusOr<ExecutableRef> Create(
diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 862c0598a7a7aa..8c3c21a0f9f178 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER 41  // Python getter for advanced_configuration
+#define JAX_IFRT_VERSION_NUMBER \
+  42  // PjRtExecutable is created using IFRT Compiler::Compile() API.
 
 #endif  // XLA_PYTHON_VERSION_H_

From d7a2f4a21ea36000f33fa0485d91bf287a70f0f3 Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Mon, 15 Dec 2025 18:00:56 -0800
Subject: [PATCH 298/753] Change xnn_fusion_benchmark_test to
 ynn_fusion_benchmark_test

I also incorrectly removed the build rule for this in a previous change.

PiperOrigin-RevId: 844995933
---
 .../xla/xla/backends/cpu/benchmarks/BUILD     | 21 ++++++++++++++
 ...k_test.cc => ynn_fusion_benchmark_test.cc} | 28 +++++++++----------
 2 files changed, 35 insertions(+), 14 deletions(-)
 rename third_party/xla/xla/backends/cpu/benchmarks/{xnn_fusion_benchmark_test.cc => ynn_fusion_benchmark_test.cc} (90%)

diff --git a/third_party/xla/xla/backends/cpu/benchmarks/BUILD b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
index b90dd573c2d6b7..c9a377c6ecb010 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/BUILD
+++ b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
@@ -586,6 +586,27 @@ xla_cc_test(
     ],
 )
 
+xla_cc_test(
+    name = "ynn_fusion_benchmark_test",
+    srcs = ["ynn_fusion_benchmark_test.cc"],
+    fail_if_no_test_linked = False,  # NOLINT=This contains benchmarks only, no tests.
+    fail_if_no_test_selected = False,  # NOLINT=This contains benchmarks only, no tests.
+    deps = [
+        ":hlo_benchmark_runner",
+        ":multi_benchmark_config",
+        "//xla:literal",
+        "//xla:literal_util",
+        "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
+        "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:test_benchmark",
+        "//xla/tsl/platform:test_main",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 xla_cc_test(
     name = "snapshot_loading_test",
     srcs = ["snapshot_loading_test.cc"],
diff --git a/third_party/xla/xla/backends/cpu/benchmarks/xnn_fusion_benchmark_test.cc b/third_party/xla/xla/backends/cpu/benchmarks/ynn_fusion_benchmark_test.cc
similarity index 90%
rename from third_party/xla/xla/backends/cpu/benchmarks/xnn_fusion_benchmark_test.cc
rename to third_party/xla/xla/backends/cpu/benchmarks/ynn_fusion_benchmark_test.cc
index e0aee67ebae66b..061e681937330c 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/xnn_fusion_benchmark_test.cc
+++ b/third_party/xla/xla/backends/cpu/benchmarks/ynn_fusion_benchmark_test.cc
@@ -37,7 +37,7 @@ namespace xla::cpu {
 static absl::Status RunFusionBenchmark(benchmark::State& state,
                                        HloBenchmarkOptions options,
                                        absl::string_view hlo,
-                                       bool is_xnn_fusion = false) {
+                                       bool is_ynn_fusion = false) {
   int64_t d0 = state.range(0);  // Tensor size.
   int64_t n = state.range(1);   // Number of add-multiply iterations.
 
@@ -58,7 +58,7 @@ static absl::Status RunFusionBenchmark(benchmark::State& state,
       ShapeUtil::MakeShape(F32, {d0, d0}), &engine, 1.0f, 0.1f);
   std::vector<const Literal*> args = {&p0, &p1};
 
-  if (is_xnn_fusion) {
+  if (is_ynn_fusion) {
     options.disable_parallel_task_assigner = true;
     options.aot_options = nullptr;
   }
@@ -88,13 +88,13 @@ static void BM_EltwiseF32(benchmark::State& state,
   CHECK_OK(RunFusionBenchmark(state, std::move(options), hlo));
 }
 
-static void BM_XnnEltwiseF32(benchmark::State& state,
+static void BM_YnnEltwiseF32(benchmark::State& state,
                              HloBenchmarkOptions options) {
   // Perform `n+1` iterations of `add` and `multiply`, then end with `subtract`.
   absl::string_view hlo = R"(
     HloModule eltwise_f32_$n
 
-    xnn_fusion {
+    ynn_fusion {
       p0 = f32[$d0,$d0] parameter(0)
       p1 = f32[$d0,$d0] parameter(1)
       add0 = f32[$d0,$d0] add(p0, p1)
@@ -107,12 +107,12 @@ static void BM_XnnEltwiseF32(benchmark::State& state,
       p0 = f32[$d0,$d0] parameter(0)
       p1 = f32[$d0,$d0] parameter(1)
       ROOT %result = f32[$d0,$d0] fusion(%p0, %p1), kind=kCustom,
-        calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
+        calls=ynn_fusion,
+        backend_config={"fusion_config": {kind: "__ynn_fusion"}}
     }
   )";
   CHECK_OK(RunFusionBenchmark(state, std::move(options), hlo,
-                              /*is_xnn_fusion=*/true));
+                              /*is_ynn_fusion=*/true));
 }
 
 static void BM_DotAndEltwiseF32(benchmark::State& state,
@@ -136,14 +136,14 @@ static void BM_DotAndEltwiseF32(benchmark::State& state,
   CHECK_OK(RunFusionBenchmark(state, std::move(options), hlo));
 }
 
-static void BM_XnnDotAndEltwiseF32(benchmark::State& state,
+static void BM_YnnDotAndEltwiseF32(benchmark::State& state,
                                    HloBenchmarkOptions options) {
   // Perform `dot` followed by `n+1` iterations of `add` and `multiply`, then
   // end with `subtract`.
   absl::string_view hlo = R"(
     HloModule dot_and_eltwise_f32_$n
 
-    xnn_fusion {
+    ynn_fusion {
       p0 = f32[$d0,$d0] parameter(0)
       p1 = f32[$d0,$d0] parameter(1)
       dot0 = f32[$d0,$d0] dot(p0, p1), lhs_contracting_dims={1},
@@ -158,12 +158,12 @@ static void BM_XnnDotAndEltwiseF32(benchmark::State& state,
       p0 = f32[$d0,$d0] parameter(0)
       p1 = f32[$d0,$d0] parameter(1)
       ROOT %result = f32[$d0,$d0] fusion(%p0, %p1), kind=kCustom,
-        calls=xnn_fusion,
-        backend_config={"fusion_config": {kind: "__xnn_fusion"}}
+        calls=ynn_fusion,
+        backend_config={"fusion_config": {kind: "__ynn_fusion"}}
     }
   )";
   CHECK_OK(RunFusionBenchmark(state, std::move(options), hlo,
-                              /*is_xnn_fusion=*/true));
+                              /*is_ynn_fusion=*/true));
 }
 
 #define BENCHMARK_FUSION(name)  \
@@ -175,8 +175,8 @@ static void BM_XnnDotAndEltwiseF32(benchmark::State& state,
       ->Args({1024, 32})
 
 BENCHMARK_FUSION(BM_EltwiseF32);
-BENCHMARK_FUSION(BM_XnnEltwiseF32);
+BENCHMARK_FUSION(BM_YnnEltwiseF32);
 BENCHMARK_FUSION(BM_DotAndEltwiseF32);
-BENCHMARK_FUSION(BM_XnnDotAndEltwiseF32);
+BENCHMARK_FUSION(BM_YnnDotAndEltwiseF32);
 
 }  // namespace xla::cpu

From 090567d56d3527db6e544e5456b7f6c770701859 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 19:51:12 -0800
Subject: [PATCH 299/753] Automated Code Change

PiperOrigin-RevId: 845035652
---
 third_party/xla/xla/BUILD   | 1 +
 third_party/xla/xla/util.cc | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/third_party/xla/xla/BUILD b/third_party/xla/xla/BUILD
index 1e53b7268f50fd..be1d16e34727eb 100644
--- a/third_party/xla/xla/BUILD
+++ b/third_party/xla/xla/BUILD
@@ -340,6 +340,7 @@ cc_library(
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
+        "@com_google_protobuf//:protobuf",
         "@eigen_archive//:eigen3",
         "@local_tsl//tsl/platform:bfloat16",
         "@local_tsl//tsl/platform:casts",
diff --git a/third_party/xla/xla/util.cc b/third_party/xla/xla/util.cc
index fee72cf92ca78e..6d5c5344c9e70c 100644
--- a/third_party/xla/xla/util.cc
+++ b/third_party/xla/xla/util.cc
@@ -49,6 +49,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "google/protobuf/descriptor.h"
+#include "google/protobuf/text_format.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/logging.h"
 #include "xla/types.h"

From 82d621c1b916e13de42c0cf7966cc32063eb3662 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 20:00:03 -0800
Subject: [PATCH 300/753] Automated Code Change

PiperOrigin-RevId: 845038593
---
 third_party/xla/xla/backends/gpu/codegen/llvm/BUILD           | 1 +
 third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
index 4f7e246607e1c0..45c42e67aadd6d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
@@ -27,6 +27,7 @@ cc_library(
         "//xla:shape_util",
         "//xla:status_macros",
         "//xla:util",
+        "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/codegen:fusion_emitter",
         "//xla/backends/gpu/runtime:kernel_thunk",
         "//xla/backends/gpu/runtime:thunk",
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
index 8ecab390d4d120..4192e2898a6c84 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
@@ -91,6 +91,7 @@ limitations under the License.
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
+#include "xla/xla_data.pb.h"
 #include "tsl/platform/fingerprint.h"
 
 namespace xla::gpu {

From 08b52d49714b67cda1f28efb32e6cc9df4ceb62f Mon Sep 17 00:00:00 2001
From: Felix Wang <wfelix@google.com>
Date: Mon, 15 Dec 2025 21:45:34 -0800
Subject: [PATCH 301/753] Support collective-permute to use the s-curve for
 cross-partition and perf-table for intra-partition.

PiperOrigin-RevId: 845072088
---
 .../gpu/model/collective_interpolator.cc      |   5 +-
 .../gpu/model/sol_latency_estimator.cc        |  72 ++++++---
 .../gpu/model/sol_latency_estimator_test.cc   | 146 ++++++++++++++++--
 3 files changed, 192 insertions(+), 31 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/model/collective_interpolator.cc b/third_party/xla/xla/service/gpu/model/collective_interpolator.cc
index 61ac03f237ff80..5663de4e143406 100644
--- a/third_party/xla/xla/service/gpu/model/collective_interpolator.cc
+++ b/third_party/xla/xla/service/gpu/model/collective_interpolator.cc
@@ -687,7 +687,8 @@ absl::StatusOr<absl::Duration> CollectiveInterpolator::EstimatedRuntime(
   int64_t bytes_transferred =
       GetBytesTransferred(instr, device_info_, analysis_);
 
-  if (instr.opcode() == HloOpcode::kCollectivePermute) {
+  if (instr.opcode() == HloOpcode::kCollectivePermute ||
+      instr.opcode() == HloOpcode::kCollectivePermuteStart) {
     auto* cp = Cast<HloCollectivePermuteInstruction>(&instr);
     const CollectivePermuteCostModelType& permute_type =
         GetCollectivePermuteCostModelType(
@@ -700,7 +701,7 @@ absl::StatusOr<absl::Duration> CollectiveInterpolator::EstimatedRuntime(
              << " for instr: " << instr.ToString() << " num_partitions:"
              << cp->GetModule()->config().num_partitions();
     ExactInterpolatorKey exact_key{
-        /*opcode=*/instr.opcode(),
+        /*opcode=*/HloOpcode::kCollectivePermute,
         /*collective_params=*/permute_type,
         /*data_type=*/std::nullopt,
     };
diff --git a/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc b/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc
index 8fdcdb8232159d..3b734ee1bb3f6e 100644
--- a/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_latency_estimator.cc
@@ -61,7 +61,10 @@ using ::mlir::MLIRContext;
 bool IsSupportedCollectiveOp(const HloInstruction& instr) {
   return HloPredicateIsOp<HloOpcode::kAllReduceStart, HloOpcode::kAllReduce,
                           HloOpcode::kReduceScatter, HloOpcode::kAllGatherStart,
-                          HloOpcode::kAllGather, HloOpcode::kAllToAll>(&instr);
+                          HloOpcode::kAllToAll,
+                          HloOpcode::kCollectivePermuteStart,
+                          HloOpcode::kCollectivePermute, HloOpcode::kAllGather>(
+      &instr);
 }
 
 bool IsHostOffloaded(const HloInstruction& instr) {
@@ -183,7 +186,9 @@ absl::StatusOr<absl::Duration> DCNCollectiveDuration(
       break;
     }
     case HloOpcode::kRecv:
-    case HloOpcode::kSend: {
+    case HloOpcode::kSend:
+    case HloOpcode::kCollectivePermute:
+    case HloOpcode::kCollectivePermuteStart: {
       TF_ASSIGN_OR_RETURN(
           absl::Duration runtime,
           sol_model.RingLatency(msg_size, num_participating_hosts,
@@ -325,26 +330,53 @@ SolLatencyEstimator::ComputeCollectiveTime(
     return absl::ZeroDuration();
   }
 
-  const HloCollectiveInstruction* collective_instr =
-      DynCast<HloCollectiveInstruction>(
-          instr.IsAsynchronous() ? instr.async_wrapped_instruction() : &instr);
-
-  if (collective_instr == nullptr) {
-    return absl::InvalidArgumentError(
-        absl::StrCat("Unsupported collective instruction: ", instr.ToString()));
+  const HloInstruction* collective =
+      instr.IsAsynchronous() ? instr.async_wrapped_instruction() : &instr;
+  if (const auto* cp = DynCast<HloCollectivePermuteInstruction>(collective)) {
+    // Handles the collective-permute ops.
+    int64_t partition_size = GetPartitionSize(*cp, sol_flags);
+    CollectivePermuteCostModelType cost_model_type =
+        GetCollectivePermuteCostModelType(*cp, partition_size);
+
+    switch (cost_model_type) {
+      case CollectivePermuteCostModelType::kIntraPartitionOneWay:
+      case CollectivePermuteCostModelType::kIntraPartitionTwoWayAllMutual:
+      case CollectivePermuteCostModelType::kIntraPartitionTwoWayHasNonMutual:
+        return collective_interpolator->EstimatedRuntime(*cp);
+      case CollectivePermuteCostModelType::kInterPartitionOneWay:
+      case CollectivePermuteCostModelType::kInterPartitionTwoWayAllMutual:
+      case CollectivePermuteCostModelType::kInterPartitionTwoWayHasNonMutual: {
+        // TODO(wfelix): Distinguish different types of inter-partition
+        // collectives.
+        TF_ASSIGN_OR_RETURN(
+            absl::Duration duration,
+            DCNCollectiveDuration(/*num_participating_hosts=*/2,
+                                  /*num_communicators=*/1, *cp, gpu_device_info,
+                                  sol_flags, analysis, mlir_context));
+        return duration;
+      }
+      case CollectivePermuteCostModelType::kUnknown:
+        return absl::InvalidArgumentError(
+            "Unknown collective permute cost model type.");
+    }
+  } else if (const auto* collective_instr =
+                 DynCast<HloCollectiveInstruction>(collective)) {
+    // Handles the collective ops.
+    int64_t partition_size = GetPartitionSize(*collective_instr, sol_flags);
+    TF_ASSIGN_OR_RETURN(
+        GPUCommunicationType communication_type,
+        CommunicationType(partition_size, *collective_instr,
+                          gpu_device_info.gpu_compute_capability()));
+    TF_ASSIGN_OR_RETURN(
+        absl::Duration result,
+        DispatchEstimation(communication_type, *collective_instr,
+                           gpu_device_info, sol_flags, analysis,
+                           collective_interpolator, mlir_context));
+    return result;
   }
 
-  int64_t partition_size = GetPartitionSize(*collective_instr, sol_flags);
-  TF_ASSIGN_OR_RETURN(
-      GPUCommunicationType communication_type,
-      CommunicationType(partition_size, *collective_instr,
-                        gpu_device_info.gpu_compute_capability()));
-  TF_ASSIGN_OR_RETURN(
-      absl::Duration result,
-      DispatchEstimation(communication_type, *collective_instr, gpu_device_info,
-                         sol_flags, analysis, collective_interpolator,
-                         mlir_context));
-  return result;
+  return absl::InvalidArgumentError(
+      absl::StrCat("Unsupported collective instruction: ", instr.ToString()));
 }
 
 /*static*/ absl::StatusOr<std::unique_ptr<SolLatencyEstimator>>
diff --git a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
index 088099eb468ee2..49756f5d8bb53e 100644
--- a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/replica_group.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/literal_util.h"
@@ -349,7 +350,7 @@ HloModule m
 ENTRY e {
   p0 = bf16[1024,1024] parameter(0)
   p1 = bf16[1024,1024] parameter(1)
-  ROOT _ =  (bf16[1024,1024], s8[2097152]{0}) custom-call(p0,p1),
+  ROOT _ =  (bf16[1024,1024], s8[2097152]) custom-call(p0,p1),
     custom_call_target="__cublas$gemm",
     backend_config={
       "gemm_backend_config":{
@@ -377,7 +378,7 @@ HloModule m
 ENTRY e {
   p0 = f8e5m2[1024,1024] parameter(0)
   p1 = f8e4m3fn[1024,1024] parameter(1)
-  ROOT _ =  (bf16[1024,1024], s8[2097152]{0}) custom-call(p0,p1),
+  ROOT _ =  (bf16[1024,1024], s8[2097152]) custom-call(p0,p1),
     custom_call_target="__cublas$lt$matmul$f8",
     backend_config={
       "gemm_backend_config":{
@@ -405,7 +406,7 @@ HloModule m
 ENTRY e {
   p0 = f8e5m2[1024,1024] parameter(0)
   p1 = f8e4m3fn[1024,1024] parameter(1)
-  ROOT _ =  (bf16[1024,1024], s8[2097152]{0}) custom-call(p0,p1),
+  ROOT _ =  (bf16[1024,1024], s8[2097152]) custom-call(p0,p1),
     custom_call_target="__cublas$lt$matmul$f8",
     backend_config={
       "gemm_backend_config":{
@@ -433,7 +434,7 @@ HloModule m
 ENTRY e {
   p0 = f8e4m3fn[1024,1024] parameter(0)
   p1 = f8e5m2[1024,1024] parameter(1)
-  ROOT _ =  (bf16[1024,1024], s8[2097152]{0}) custom-call(p0,p1),
+  ROOT _ =  (bf16[1024,1024], s8[2097152]) custom-call(p0,p1),
     custom_call_target="__cublas$lt$matmul$f8",
     backend_config={
       "gemm_backend_config":{
@@ -461,7 +462,7 @@ HloModule m
 ENTRY e {
   p0 = f8e4m3fn[1024,1024] parameter(0)
   p1 = f8e4m3fn[1024,1024] parameter(1)
-  ROOT _ =  (bf16[1024,1024], s8[2097152]{0}) custom-call(p0,p1),
+  ROOT _ =  (bf16[1024,1024], s8[2097152]) custom-call(p0,p1),
     custom_call_target="__cublas$lt$matmul$f8",
     backend_config={
       "gemm_backend_config":{
@@ -530,8 +531,109 @@ ENTRY e {
       /*cost_type=*/CostType::kNodeCost,
       /*expected_latency=*/absl::ZeroDuration(),
   };
+  // Test for CollectivePermuteCostModelType::kIntraPartitionTwoWayHasNonMutual
+  EstimatorTestCase collective_permute_intra_host_ring_shift = {
+      /*test_name=*/"collective_permute_intra_host_ring_shift",
+      /*module_string=*/R"(
+HloModule m, num_partitions=4
+
+ENTRY main {
+  %param.2 = f32[262144,1024] parameter(0), sharding={devices=[4,1]<=[4]}
+  %collective-permute-start = (f32[262144,1024], f32[262144,1024]) collective-permute-start(%param.2), channel_id=1, source_target_pairs={{0,3},{1,0},{2,1},{3,2}}
+  ROOT %collective-permute-done = f32[262144,1024] collective-permute-done(%collective-permute-start)
+})",
+      /*opcode_to_find=*/HloOpcode::kCollectivePermuteStart,
+      /*cost_type=*/CostType::kEdgeCost,
+      /*expected_latency=*/absl::Microseconds(3706),
+  };
+
+  // Test for CollectivePermuteCostModelType::kIntraPartitionTwoWayAllMutual
+  EstimatorTestCase collective_permute_intra_host_bidirectional = {
+      /*test_name=*/"collective_permute_intra_host_bidirectional",
+      /*module_string=*/R"(
+HloModule m, num_partitions=4
+
+ENTRY main {
+  %param.2 = f32[262144,1024] parameter(0), sharding={devices=[4,1]<=[4]}
+  %collective-permute-start = (f32[262144,1024], f32[262144,1024]) collective-permute-start(%param.2), channel_id=1, source_target_pairs={{0,1},{1,0},{2,3},{3,2}}
+  ROOT %collective-permute-done = f32[262144,1024] collective-permute-done(%collective-permute-start)
+})",
+      /*opcode_to_find=*/HloOpcode::kCollectivePermuteStart,
+      /*cost_type=*/CostType::kEdgeCost,
+      /*expected_latency=*/absl::Microseconds(3696),
+  };
+
+  // Test for CollectivePermuteCostModelType::kIntraPartitionOneWay
+  EstimatorTestCase collective_permute_intra_host_one_way = {
+      /*test_name=*/"collective_permute_intra_host_one_way",
+      /*module_string=*/R"(
+HloModule m, num_partitions=4
+
+ENTRY main {
+  %param.2 = f32[262144,1024] parameter(0), sharding={devices=[4,1]<=[4]}
+  %collective-permute-start = (f32[262144,1024], f32[262144,1024]) collective-permute-start(%param.2), channel_id=1, source_target_pairs={{0,1},{2,3}}
+  ROOT %collective-permute-done = f32[262144,1024] collective-permute-done(%collective-permute-start)
+})",
+      /*opcode_to_find=*/HloOpcode::kCollectivePermuteStart,
+      /*cost_type=*/CostType::kEdgeCost,
+      /*expected_latency=*/absl::Microseconds(3961),
+  };
+
+  EstimatorTestCase collective_permute_inter_host_global = {
+      /*test_name=*/"collective_permute_inter_host_global",
+      /*module_string=*/R"(
+HloModule m, num_partitions=16
+
+ENTRY main {
+  %param.2 = f32[262144,1024] parameter(0)
+  %collective-permute-start = (f32[262144,1024], f32[262144,1024]) collective-permute-start(%param.2), channel_id=1,
+      source_target_pairs={{0,15},{1,0},{2,1},{3,2},{4,3},{5,4},{6,5},{7,6},{8,7},{9,8},{10,9},{11,10},{12,11},{13,12},{14,13},{15,14}}
+  ROOT %collective-permute-done = f32[262144,1024] collective-permute-done(%collective-permute-start)
+})",
+      /*opcode_to_find=*/HloOpcode::kCollectivePermuteStart,
+      /*cost_type=*/CostType::kEdgeCost,
+      /*expected_latency=*/absl::Microseconds(27816),
+  };
 
-  return {all_gather_intra_host,
+  EstimatorTestCase collective_permute_inter_host_rail_aligned_bidirection = {
+      /*test_name=*/"collective_permute_inter_host_rail_aligned_bidirection",
+      /*module_string=*/R"(
+HloModule m, num_partitions=16
+
+ENTRY main {
+  %param.2 = f32[262144,1024] parameter(0)
+  %collective-permute-start = (f32[262144,1024], f32[262144,1024]) collective-permute-start(%param.2), channel_id=1,
+      source_target_pairs={{0,8},{8,0},{1,9},{9,1},{2,10},{10,2},{3,11},{11,3},{4,12},{12,4},{5,13},{13,5},{6,14},{14,6},{7,15},{15,7}}
+  ROOT %collective-permute-done = f32[262144,1024] collective-permute-done(%collective-permute-start)
+})",
+      /*opcode_to_find=*/HloOpcode::kCollectivePermuteStart,
+      /*cost_type=*/CostType::kEdgeCost,
+      /*expected_latency=*/absl::Microseconds(27816),
+  };
+
+  EstimatorTestCase collective_permute_inter_host_rail_aligned_unidirection = {
+      /*test_name=*/"collective_permute_inter_host_rail_aligned_unidirection",
+      /*module_string=*/R"(
+HloModule m, num_partitions=16
+
+ENTRY main {
+  %param.2 = f32[262144,1024] parameter(0)
+  %collective-permute-start = (f32[262144,1024], f32[262144,1024]) collective-permute-start(%param.2), channel_id=1,
+      source_target_pairs={{0,8},{1,9},{2,10},{3,11},{4,12},{5,13},{6,14},{7,15}}
+  ROOT %collective-permute-done = f32[262144,1024] collective-permute-done(%collective-permute-start)
+})",
+      /*opcode_to_find=*/HloOpcode::kCollectivePermuteStart,
+      /*cost_type=*/CostType::kEdgeCost,
+      /*expected_latency=*/absl::Microseconds(27816),
+  };
+
+  return {collective_permute_intra_host_ring_shift,
+          collective_permute_intra_host_bidirectional,
+          collective_permute_intra_host_one_way,
+          collective_permute_inter_host_global,
+          collective_permute_inter_host_rail_aligned_bidirection,
+          collective_permute_inter_host_rail_aligned_unidirection,
+          all_gather_intra_host,
           all_gather_inter_host_pairwise,
           all_gather_all_ranks,
           reduce_scatter_all_ranks,
@@ -639,6 +741,30 @@ class IsSolLatencyEstimatorEnabledTest : public HloTestBase {
         /*channel_id=*/std::nullopt, /*use_global_device_ids=*/false));
   }
 
+  // Helper to add a AllToAll instruction.
+  void AddAlltoAll(HloModule* module) {
+    HloComputation* entry = module->entry_computation();
+    Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
+    auto dummy_operand = entry->AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}})));
+    entry->AddInstruction(HloInstruction::CreateAllToAll(
+        shape, {dummy_operand},
+        /*device_list=*/CollectiveDeviceList(),
+        /*constrain_layout=*/false, /*channel_id=*/false,
+        /*split_dimension=*/std::nullopt));
+  }
+
+  void AddCollectiveBcast(HloModule* module) {
+    HloComputation* entry = module->entry_computation();
+    Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
+    auto dummy_operand = entry->AddInstruction(HloInstruction::CreateConstant(
+        LiteralUtil::CreateR2<float>({{1, 2}, {3, 4}})));
+    entry->AddInstruction(HloInstruction::CreateCollectiveBroadcast(
+        shape, {dummy_operand},
+        /*device_list=*/CollectiveDeviceList(),
+        /*constrain_layout=*/false, /*channel_id=*/std::nullopt));
+  }
+
   // Helper to add a CollectivePermute instruction.
   void AddCollectivePermute(HloModule* module) {
     HloComputation* entry = module->entry_computation();
@@ -702,7 +828,7 @@ TEST_F(IsSolLatencyEstimatorEnabledTest,
       stream_executor::CudaComputeCapability::Hopper());
 
   auto module = CreateTestModule(config);
-  AddCollectivePermute(module.get());  // Unsupported collective
+  AddCollectiveBcast(module.get());  // Unsupported collective
 
   EXPECT_FALSE(
       SolLatencyEstimator::IsSupportedForModule(*module, gpu_device_info_));
@@ -718,8 +844,10 @@ TEST_F(IsSolLatencyEstimatorEnabledTest,
       stream_executor::CudaComputeCapability::Hopper());
 
   auto module = CreateTestModule(config);
-  AddAllReduce(module.get());          // Supported
-  AddCollectivePermute(module.get());  // Unsupported
+  AddAllReduce(module.get());          // Supported collective
+  AddCollectivePermute(module.get());  // Supported collective
+  AddAlltoAll(module.get());           // Supported collective
+  AddCollectiveBcast(module.get());    // Unsupported collective
 
   EXPECT_FALSE(
       SolLatencyEstimator::IsSupportedForModule(*module, gpu_device_info_));

From 840a3f7832191d3efedd46504a4c52b335627371 Mon Sep 17 00:00:00 2001
From: Tori Baker <vwbaker@google.com>
Date: Mon, 15 Dec 2025 21:56:42 -0800
Subject: [PATCH 302/753] [xla:gpu] Fix triton pipeline discrepancies

It seems that in a few previous triton integrations, we have failed to copy over some of the pipeline changes. I went through all of them & think they should be aligned now:

* triton's version: triton/third_party/nvidia/backend/compiler.py
* xla's version: xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc

Test change: I updated the verifier test to change the tiling as it is now causing a resource exhausted error.

PiperOrigin-RevId: 845075604
---
 third_party/xla/xla/backends/gpu/codegen/triton/BUILD         | 1 +
 .../backends/gpu/codegen/triton/compilation_pipeline_cuda.cc  | 4 +++-
 .../gpu/transforms/triton_fusion_numerics_verifier_test.cc    | 4 ++--
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index 6ed2f74e9325c1..81cdd32773c6e8 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -211,6 +211,7 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:SCFToControlFlow",
         "@llvm-project//mlir:Transforms",
+        "@triton//:GluonTransforms",
         "@triton//:TritonDialects",
         "@triton//:TritonGPUToLLVM",
         "@triton//:TritonGPUTransforms",
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
index 9c445f68a3d5fc..5bf494417e65c2 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_cuda.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "triton/Conversion/TritonGPUToLLVM/Passes.h"
 #include "triton/Conversion/TritonToTritonGPU/Passes.h"
+#include "triton/Dialect/Gluon/Transforms/Passes.h"
 #include "triton/Dialect/Triton/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonNvidiaGPU/Transforms/Passes.h"
@@ -138,10 +139,12 @@ static void MakeLLIR(mlir::OpPassManager* pm,
   pm->addPass(mt::gpu::createTritonGPUCombineTensorSelectAndIf());
   pm->addPass(mt::gpu::createTritonGPUAllocateWarpGroups());
   pm->addPass(mlir::createSCFToControlFlowPass());
+  pm->addPass(mlir::triton::gluon::createGluonInline());
   pm->addPass(mt::createAllocateSharedMemoryNvPass(
       cuda_cc_as_int,
       mlir::triton::AllocateSharedMemoryNvOptions{}.ptxVersion));
   pm->addPass(ttng::createTritonTensorMemoryAllocationPass());
+  pm->addPass(ttng::createTritonNvidiaGPUCheckMatmulTwoCTAPass());
   // We could add a flag to XLA to optionally enable the following pass:
   // pm->addPass(mt::instrument::createTritonInstrumentConcurrencySanitizer());
   pm->addPass(mt::gpu::createTritonGPUGlobalScratchAllocationPass());
@@ -153,7 +156,6 @@ static void MakeLLIR(mlir::OpPassManager* pm,
   pm->addPass(mlir::createCSEPass());
   pm->addPass(mt::createConvertNVGPUToLLVM());
   pm->addPass(mt::createConvertWarpSpecializeToLLVM());
-  pm->addPass(mlir::createArithToLLVMConversionPass());
   pm->addPass(mlir::createCanonicalizerPass());
   pm->addPass(mlir::createCSEPass());
   pm->addPass(mlir::createSymbolDCEPass());
diff --git a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
index 132b398fd0f439..ccb54900e2ea2f 100644
--- a/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/triton_fusion_numerics_verifier_test.cc
@@ -268,7 +268,7 @@ gemm_computation (p0: bf16[128,512], p1: bf16[256,512], p2: bf16[512,512]) -> bf
       "kind":"__triton_nested_gemm_fusion",
       "block_level_fusion_config":{
         "num_warps":"8",
-        "output_tiles":[{"sizes":["128","64"]}],
+        "output_tiles":[{"sizes":["128","32"]}],
         "num_ctas":1,
         "num_stages":4,
         "is_tma_allowed":false}}}
@@ -281,7 +281,7 @@ gemm_computation (p0: bf16[128,512], p1: bf16[256,512], p2: bf16[512,512]) -> bf
       "kind":"__triton_nested_gemm_fusion",
       "block_level_fusion_config":{
         "num_warps":"8",
-        "output_tiles":[{"sizes":["64","256"]}],
+        "output_tiles":[{"sizes":["32","256"]}],
         "num_ctas":1,
         "num_stages":4,
         "is_tma_allowed":false}}}

From 6c7405dc15c90b2ab41763060f3347443fa81e8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 22:58:11 -0800
Subject: [PATCH 303/753] Internal build infrastructure cleanup.

PiperOrigin-RevId: 845095420
---
 ....cuda10.1-cudnn7-ubuntu16.04-manylinux2010 |  87 ---------------
 ...dnn7-ubuntu16.04-manylinux2010-multipython |  87 ---------------
 ...dnn7-ubuntu18.04-manylinux2010-multipython |  88 ---------------
 ...dnn7-ubuntu18.04-manylinux2010-multipython |  78 -------------
 ...dnn8-ubuntu18.04-manylinux2010-multipython |  93 ----------------
 ...n8.1-ubuntu20.04-manylinux2014-multipython | 105 ------------------
 tensorflow/tools/ci_build/Dockerfile.rbe.gpu  |  26 -----
 ...rocm-ubuntu18.04-manylinux2010-multipython |  82 --------------
 ...rocm-ubuntu20.04-manylinux2014-multipython |  86 --------------
 .../ci_build/install/install_latest_clang.sh  |  25 -----
 10 files changed, 757 deletions(-)
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.gpu
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython
 delete mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython
 delete mode 100755 tensorflow/tools/ci_build/install/install_latest_clang.sh

diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010
deleted file mode 100644
index 91d501109d08a1..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010
+++ /dev/null
@@ -1,87 +0,0 @@
-# Dockerfile to build a manylinux 2010 compliant cross-compiler.
-#
-# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible
-# glibc (2.12) and system libstdc++ (4.4).
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 \
-#  --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010" .
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010
-
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 as devtoolset
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y \
-      cpio \
-      file \
-      flex \
-      g++ \
-      make \
-      rpm2cpio \
-      unar \
-      wget \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-ADD devtoolset/fixlinks.sh fixlinks.sh
-ADD devtoolset/build_devtoolset.sh build_devtoolset.sh
-ADD devtoolset/rpm-patch.sh rpm-patch.sh
-
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7.
-RUN /build_devtoolset.sh devtoolset-7 /dt7
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8.
-RUN /build_devtoolset.sh devtoolset-8 /dt8
-
-# TODO(klimek): Split up into two different docker images.
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04
-COPY --from=devtoolset /dt7 /dt7
-COPY --from=devtoolset /dt8 /dt8
-
-# Install TensorRT.
-RUN apt-get update && apt-get install -y \
-    libnvinfer-dev=6.0.1-1+cuda10.1 \
-    libnvinfer6=6.0.1-1+cuda10.1 \
-    libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \
-    libnvinfer-plugin6=6.0.1-1+cuda10.1 \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy and run the install scripts.
-ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d"
-COPY install/*.sh /install/
-ARG DEBIAN_FRONTEND=noninteractive
-RUN /install/install_bootstrap_deb_packages.sh
-RUN /install/install_deb_packages.sh
-RUN /install/install_latest_clang.sh
-RUN /install/install_bazel.sh
-
-# Install python 3.6.
-RUN apt-get install --reinstall python3-apt
-RUN yes "" | add-apt-repository ppa:deadsnakes/ppa
-RUN apt-get update && apt-get install -y \
-    python3.6 python3.6-dev python3-pip python3.6-venv && \
-    rm -rf /var/lib/apt/lists/* && \
-    python3.6 -m pip install pip --upgrade && \
-    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 0
-
-RUN /install/install_pip_packages.sh
-
-# Install python 3.8.
-RUN apt-get update && apt-get install -y python3.8 python3.8-dev python3.8-venv
-RUN rm -rf /var/lib/apt/lists/*
-# Have to download get-pip.py due to a pip circular issue
-# https://stackoverflow.com/questions/58758447/how-to-fix-module-platform-has-no-attribute-linux-distribution-when-instal
-RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
-RUN python3.8 get-pip.py
-RUN python3.8 -m pip install --upgrade pip setuptools wheel
-
-# Overwrite include paths that are generated for the multipython image.
-RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m"
-RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m"
-
-RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8"
-RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8"
-
-# Make apt work with python 3.6.
-RUN cp /usr/lib/python3/dist-packages/apt_pkg.cpython-35m-x86_64-linux-gnu.so \
-       /usr/lib/python3/dist-packages/apt_pkg.so
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython
deleted file mode 100644
index c135dd5bd5d667..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython
+++ /dev/null
@@ -1,87 +0,0 @@
-# Dockerfile to build a manylinux 2010 compliant cross-compiler.
-#
-# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible
-# glibc (2.12) and system libstdc++ (4.4).
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython
-
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 as devtoolset
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y \
-      cpio \
-      file \
-      flex \
-      g++ \
-      make \
-      rpm2cpio \
-      unar \
-      wget \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-ADD devtoolset/fixlinks.sh fixlinks.sh
-ADD devtoolset/build_devtoolset.sh build_devtoolset.sh
-ADD devtoolset/rpm-patch.sh rpm-patch.sh
-
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7.
-RUN /build_devtoolset.sh devtoolset-7 /dt7
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8.
-RUN /build_devtoolset.sh devtoolset-8 /dt8
-
-# TODO(klimek): Split up into two different docker images.
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04
-COPY --from=devtoolset /dt7 /dt7
-COPY --from=devtoolset /dt8 /dt8
-
-# Install TensorRT.
-RUN apt-get update && apt-get install -y \
-    libnvinfer-dev=6.0.1-1+cuda10.1 \
-    libnvinfer6=6.0.1-1+cuda10.1 \
-    libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \
-    libnvinfer-plugin6=6.0.1-1+cuda10.1 \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy and run the install scripts.
-ARG DEBIAN_FRONTEND=noninteractive
-
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.6.9"
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
-
-ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython
deleted file mode 100644
index b8b9e2195b7830..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython
+++ /dev/null
@@ -1,88 +0,0 @@
-# Dockerfile to build a manylinux 2010 compliant cross-compiler.
-#
-# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible
-# glibc (2.12) and system libstdc++ (4.4).
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython
-
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 as devtoolset
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y \
-      cpio \
-      file \
-      flex \
-      g++ \
-      make \
-      rpm2cpio \
-      unar \
-      wget \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-ADD devtoolset/fixlinks.sh fixlinks.sh
-ADD devtoolset/build_devtoolset.sh build_devtoolset.sh
-ADD devtoolset/rpm-patch.sh rpm-patch.sh
-
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7.
-RUN /build_devtoolset.sh devtoolset-7 /dt7
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8.
-RUN /build_devtoolset.sh devtoolset-8 /dt8
-
-# TODO(klimek): Split up into two different docker images.
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-COPY --from=devtoolset /dt7 /dt7
-COPY --from=devtoolset /dt8 /dt8
-
-# Install TensorRT.
-RUN apt-get update && apt-get install -y \
-    libnvinfer-dev=6.0.1-1+cuda10.1 \
-    libnvinfer6=6.0.1-1+cuda10.1 \
-    libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \
-    libnvinfer-plugin6=6.0.1-1+cuda10.1 \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy and run the install scripts.
-ARG DEBIAN_FRONTEND=noninteractive
-
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.5.9"
-RUN /install/build_and_install_python.sh "3.6.9"
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
-
-ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython
deleted file mode 100644
index 5a86fb05d119b6..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython
+++ /dev/null
@@ -1,78 +0,0 @@
-# Dockerfile to build a manylinux 2010 compliant cross-compiler.
-#
-# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible
-# glibc (2.12) and system libstdc++ (4.4).
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.rbe.cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.2-cudnn7-ubuntu18.04-manylinux2010-multipython
-
-FROM gcr.io/tensorflow-testing/nosla-cuda10.0-cudnn7-ubuntu16.04-manylinux2010
-
-RUN apt-get update
-RUN apt-get remove -y --allow-change-held-packages cuda-license-10-0 libcudnn7 libcudnn8 libnccl2 libnccl-dev
-RUN apt-get install -y --no-install-recommends --allow-downgrades --allow-change-held-packages \
-  libcublas10 \
-  libcublas-dev \
-  cuda-nvml-dev-10.2 \
-  cuda-command-line-tools-10.2 \
-  cuda-libraries-dev-10.2 \
-  cuda-minimal-build-10.2 \
-  libcudnn7=7.6.5.32-1+cuda10.2 \
-  libcudnn7-dev=7.6.5.32-1+cuda10.2
-RUN rm -f /usr/local/cuda
-RUN ln -s /usr/local/cuda-10.2 /usr/local/cuda
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y \
-      cpio \
-      file \
-      flex \
-      g++ \
-      make \
-      rpm2cpio \
-      unar \
-      wget \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy and run the install scripts.
-ARG DEBIAN_FRONTEND=noninteractive
-
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-RUN /install/build_and_install_python.sh "3.9.0"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9"
-
-ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython
deleted file mode 100644
index 3f90ac008459fc..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython
+++ /dev/null
@@ -1,93 +0,0 @@
-# Dockerfile to build a manylinux 2010 compliant cross-compiler.
-#
-# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible
-# glibc (2.12) and system libstdc++ (4.4).
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.rbe.cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda11.0-cudnn8-ubuntu18.04-manylinux2010-multipython
-
-FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 as devtoolset
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y \
-      cpio \
-      file \
-      flex \
-      g++ \
-      make \
-      patch \
-      rpm2cpio \
-      unar \
-      wget \
-      xz-utils \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-ADD devtoolset/fixlinks.sh fixlinks.sh
-ADD devtoolset/build_devtoolset.sh build_devtoolset.sh
-ADD devtoolset/rpm-patch.sh rpm-patch.sh
-
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7.
-RUN /build_devtoolset.sh devtoolset-7 /dt7
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8.
-RUN /build_devtoolset.sh devtoolset-8 /dt8
-
-# TODO(klimek): Split up into two different docker images.
-FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04
-COPY --from=devtoolset /dt7 /dt7
-COPY --from=devtoolset /dt8 /dt8
-
-# Install TensorRT.
-RUN echo \
-    deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 / \
-    > /etc/apt/sources.list.d/nvidia-ml.list \
-      && \
-    apt-get update && apt-get install -y \
-    libnvinfer-dev=7.1.3-1+cuda11.0 \
-    libnvinfer7=7.1.3-1+cuda11.0 \
-    libnvinfer-plugin-dev=7.1.3-1+cuda11.0 \
-    libnvinfer-plugin7=7.1.3-1+cuda11.0 \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy and run the install scripts.
-ARG DEBIAN_FRONTEND=noninteractive
-
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.6.9"
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
-
-ENV CLANG_VERSION="r7f6f9f4cf966c78a315d15d6e913c43cfa45c47c"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython
deleted file mode 100644
index cb149c9d82ba21..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython
+++ /dev/null
@@ -1,105 +0,0 @@
-# Dockerfile to build a manylinxu2010/manylinux 2014 compliant cross-compiler.
-#
-# Builds a devtoolset-7 environment with manylinux2010 compatible glibc (2.12) and
-# libstdc++ (4.4) in /dt7. 
-#
-# Builds a devtoolset-9 environment with manylinux2014 compatible glibc (2.17) and
-# libstdc++ (4.8) in /dt9.
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.rbe.cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-cuda11.2-cudnn8.1-ubuntu20.04-manylinux2014-multipython
-
-FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu20.04 as devtoolset
-
-ENV DEBIAN_FRONTEND=noninteractive
-RUN apt-get update && apt-get install -y \
-      cpio \
-      file \
-      flex \
-      g++ \
-      make \
-      patch \
-      rpm2cpio \
-      unar \
-      wget \
-      xz-utils \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-ADD devtoolset/fixlinks.sh fixlinks.sh
-ADD devtoolset/build_devtoolset.sh build_devtoolset.sh
-ADD devtoolset/rpm-patch.sh rpm-patch.sh
-
-# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7.
-RUN /build_devtoolset.sh devtoolset-7 /dt7
-# Set up a sysroot for glibc 2.17 / libstdc++ 4.8 / devtoolset-9 in /dt9.
-RUN /build_devtoolset.sh devtoolset-9 /dt9
-
-# TODO(klimek): Split up into two different docker images.
-FROM nvidia/cuda:11.2.1-cudnn8-devel-ubuntu20.04
-COPY --from=devtoolset /dt7 /dt7
-COPY --from=devtoolset /dt9 /dt9
-
-# Install TensorRT.
-RUN echo \
-    deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 / \
-    > /etc/apt/sources.list.d/nvidia-ml.list \
-      && \
-    apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F60F4B3D7FA2AF80 && \
-    apt-get update && apt-get install -y \
-    libnvinfer-dev=7.2.2-1+cuda11.1 \
-    libnvinfer7=7.2.2-1+cuda11.1 \
-    libnvinfer-plugin-dev=7.2.2-1+cuda11.1 \
-    libnvinfer-plugin7=7.2.2-1+cuda11.1 \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Copy and run the install scripts.
-ARG DEBIAN_FRONTEND=noninteractive
-
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    libsqlite3-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-RUN /install/build_and_install_python.sh "3.9.4"
-RUN /install/build_and_install_python.sh "3.10.0"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10"
-
-ENV CLANG_VERSION="rf2b94bd7eaa83d853dc7568fac87b1f8bf4ddec6"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
-
-# TensorRT 7 for CUDA 11.1 is compatible with CUDA 11.2, but requires
-# libnvrtc.so.11.1. See https://github.com/NVIDIA/TensorRT/issues/1064.
-# TODO(b/187962120): Remove when upgrading to TensorRT 8.
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/cuda-11.1/lib64"
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu b/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
deleted file mode 100644
index c4912a65b65d61..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.gpu
+++ /dev/null
@@ -1,26 +0,0 @@
-FROM nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04
-
-LABEL maintainer="Nick Lopez <ngiraldo@google.com>"
-
-# In the Ubuntu 16.04 images, cudnn is placed in system paths. Move them to
-# /usr/local/cuda
-RUN cp -P /usr/include/cudnn.h /usr/local/cuda/include
-RUN cp -P /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64
-
-# Copy and run the install scripts.
-COPY install/*.sh /install/
-ARG DEBIAN_FRONTEND=noninteractive
-RUN /install/install_bootstrap_deb_packages.sh
-RUN add-apt-repository -y ppa:openjdk-r/ppa && \
-    add-apt-repository -y ppa:george-edison55/cmake-3.x
-RUN /install/install_deb_packages.sh
-RUN /install/install_pip_packages.sh
-RUN /install/install_golang.sh
-
-# Install clang from pre-built package
-RUN cd /tmp && \
-    wget https://storage.googleapis.com/clang-builds-stable/clang-ubuntu16_04/clang_r337145.tar.gz && \
-    echo "ab98c63eb09c04112cc992bc95ebc0dcea8c5e9d0760438789be2896cdc69ff8  clang_r337145.tar.gz" | sha256sum -c && \
-    tar -C /usr/local -xf clang_r323528.tar.gz && \
-    rm clang_r337145.tar.gz
-
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython
deleted file mode 100644
index e9974b05b3cab8..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython
+++ /dev/null
@@ -1,82 +0,0 @@
-# Dockerfile for ROCm RBE builds.
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.local-toolchain-ubuntu18.04-manylinux2010 \
-#  --tag "local-toolchain-ubuntu18.04-manylinux2010" .
-# $ docker build -f Dockerfile.rbe.rocm-ubuntu18.04-manylinux2010-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-rocm-ubuntu18.04-manylinux2010-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-rocm-ubuntu18.04-manylinux2010-multipython
-
-FROM ubuntu:18.04
-COPY --from=local-toolchain-ubuntu18.04-manylinux2010 /dt7 /dt7
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-# Install ROCm packages
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl libnuma-dev gnupg sudo libelf1 build-essential \
-  && curl -k -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-  && printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/5.0/ ubuntu main" | tee /etc/apt/sources.list.d/rocm.list \
-  && apt-get update && apt-get install -y --no-install-recommends \
-    rocm-dev rocm-libs rccl \
-  && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-# Set ROCm environment variables and paths.
-# We use /opt/rocm without version suffix so that the toolchain configuration
-# for builtin headers doesn't need to be adapted
-ARG ROCM_PATH=/opt/rocm
-ENV HCC_HOME=$ROCM_PATH/hcc
-ENV HIP_PATH=$ROCM_PATH/hip
-ENV OPENCL_ROOT=$ROCM_PATH/opencl
-ENV PATH="$ROCM_PATH/bin:${PATH}"
-ENV PATH="$HCC_HOME/bin:$HIP_PATH/bin:${PATH}"
-ENV PATH="$OPENCL_ROOT/bin:${PATH}"
-
-# Set target file to help determine which device(s) to build for
-RUN bash -c "ls -al /opt/roc*"
-RUN bash -c "echo -e 'gfx900\ngfx906\ngfx908' > $ROCM_PATH/bin/target.lst"
-
-# Copy and run the install scripts.
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - bsdmainutils (hexdump) for MLIR generated GPU kernels
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    bsdmainutils \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install openjdk 11
-RUN yes "" | add-apt-repository ppa:openjdk-r/ppa
-RUN apt-get update && apt-get install -y openjdk-11-jdk && \
-    update-alternatives --auto java
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-RUN /install/build_and_install_python.sh "3.9.4"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9"
-
-ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython
deleted file mode 100644
index 32834ccac2a3af..00000000000000
--- a/tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython
+++ /dev/null
@@ -1,86 +0,0 @@
-# Dockerfile for ROCm RBE builds.
-#
-# To push a new version, run:
-# $ docker build -f Dockerfile.local-toolchain-ubuntu20.04-manylinux2014 \
-#  --tag "local-toolchain-ubuntu20.04-manylinux2014" .
-# $ docker build -f Dockerfile.rbe.rocm-ubuntu20.04-manylinux2014-multipython \
-#  --tag "gcr.io/tensorflow-testing/nosla-rocm-ubuntu20.04-manylinux2014-multipython" .
-# $ docker push gcr.io/tensorflow-testing/nosla-rocm-ubuntu20.04-manylinux2014-multipython
-
-FROM ubuntu:20.04
-COPY --from=local-toolchain-ubuntu20.04-manylinux2014 /dt7 /dt7
-COPY --from=local-toolchain-ubuntu20.04-manylinux2014 /dt9 /dt9
-
-ARG DEBIAN_FRONTEND=noninteractive
-
-# Install ROCm packages
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    curl libnuma-dev gnupg sudo libelf1 build-essential \
-  && curl -k -sL https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add - \
-  && printf "deb [arch=amd64] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main" \
-    | tee /etc/apt/sources.list.d/rocm.list \
-  && apt-get update && apt-get install -y \
-    rocm-dev rocm-libs rccl \
-  && apt-get clean && rm -rf /var/lib/apt/lists/*
-
-# Set ROCm environment variables and paths.
-# We use /opt/rocm without version suffix so that the toolchain configuration
-# for builtin headers doesn't need to be adapted
-ARG ROCM_PATH=/opt/rocm
-ENV HCC_HOME=$ROCM_PATH/hcc
-ENV HIP_PATH=$ROCM_PATH/hip
-ENV OPENCL_ROOT=$ROCM_PATH/opencl
-ENV PATH="$ROCM_PATH/bin:${PATH}"
-ENV PATH="$HCC_HOME/bin:$HIP_PATH/bin:${PATH}"
-ENV PATH="$OPENCL_ROOT/bin:${PATH}"
-
-# Set target file to help determine which device(s) to build for
-RUN bash -c "ls -al /opt/roc*"
-RUN bash -c "echo -e 'gfx900\ngfx906\ngfx908' > $ROCM_PATH/bin/target.lst"
-
-# Copy and run the install scripts.
-COPY install/install_bootstrap_deb_packages.sh /install/
-RUN /install/install_bootstrap_deb_packages.sh
-
-COPY install/install_deb_packages.sh /install/
-RUN /install/install_deb_packages.sh
-
-# Install additional packages needed for this image:
-# - dependencies to build Python from source
-# - patchelf, as it is required by auditwheel
-RUN apt-get update && apt-get install -y \
-    libbz2-dev \
-    libffi-dev \
-    libgdbm-dev \
-    libncurses5-dev \
-    libnss3-dev \
-    libreadline-dev \
-    patchelf \
-      && \
-    rm -rf /var/lib/apt/lists/*
-
-# Install openjdk 11
-RUN yes "" | add-apt-repository ppa:openjdk-r/ppa
-RUN apt-get update && apt-get install -y openjdk-11-jdk && \
-    update-alternatives --auto java
-
-COPY install/install_bazel.sh /install/
-RUN /install/install_bazel.sh
-
-COPY install/build_and_install_python.sh /install/
-RUN /install/build_and_install_python.sh "3.7.7"
-RUN /install/build_and_install_python.sh "3.8.2"
-RUN /install/build_and_install_python.sh "3.9.4"
-RUN /install/build_and_install_python.sh "3.10.0"
-RUN /install/build_and_install_python.sh "3.11.0"
-
-COPY install/install_pip_packages_by_version.sh /install/
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" "nojax"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8" "nojax"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.9" "nojax"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.10" "nojax"
-RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.11" "nojax"
-
-ENV CLANG_VERSION="rf2b94bd7eaa83d853dc7568fac87b1f8bf4ddec6"
-COPY install/install_latest_clang.sh /install/
-RUN /install/install_latest_clang.sh
diff --git a/tensorflow/tools/ci_build/install/install_latest_clang.sh b/tensorflow/tools/ci_build/install/install_latest_clang.sh
deleted file mode 100755
index 5eed5d2141fcf0..00000000000000
--- a/tensorflow/tools/ci_build/install/install_latest_clang.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash -eu
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-# Contact c-toolchain-team@ for new releases or new ubuntu versions.
-DIST="$(grep "DISTRIB_RELEASE" /etc/lsb-release |sed 's,.*=,,; s,\.,_,')"
-INSTALL_DIR="/clang_${CLANG_VERSION}"
-STORAGE="https://storage.googleapis.com/clang-builds-stable"
-mkdir -p "${INSTALL_DIR}"
-cd "${INSTALL_DIR}"
-wget "${STORAGE}/clang-ubuntu${DIST}/clang_${CLANG_VERSION}.tar.gz"
-tar xvzf clang_${CLANG_VERSION}.tar.gz
-rm clang_${CLANG_VERSION}.tar.gz

From 7d619b015b8526fc6aa1340046cff360c1622840 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 15 Dec 2025 23:24:33 -0800
Subject: [PATCH 304/753] PR #35268: Bump github/codeql-action from 4.31.7 to
 4.31.8

Imported from GitHub PR https://github.com/openxla/xla/pull/35268

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.31.7 to 4.31.8.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/github/codeql-action/releases">github/codeql-action's releases</a>.</em></p>
<blockquote>
<h2>v4.31.8</h2>
<h1>CodeQL Action Changelog</h1>
<p>See the <a href="https://github.com/github/codeql-action/releases">releases page</a> for the relevant changes to the CodeQL CLI and language packs.</p>
<h2>4.31.8 - 11 Dec 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.8. <a href="https://redirect.github.com/github/codeql-action/pull/3354">#3354</a></li>
</ul>
<p>See the full <a href="https://github.com/github/codeql-action/blob/v4.31.8/CHANGELOG.md">CHANGELOG.md</a> for more information.</p>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/github/codeql-action/blob/main/CHANGELOG.md">github/codeql-action's changelog</a>.</em></p>
<blockquote>
<h1>CodeQL Action Changelog</h1>
<p>See the <a href="https://github.com/github/codeql-action/releases">releases page</a> for the relevant changes to the CodeQL CLI and language packs.</p>
<h2>[UNRELEASED]</h2>
<p>No user facing changes.</p>
<h2>4.31.8 - 11 Dec 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.8. <a href="https://redirect.github.com/github/codeql-action/pull/3354">#3354</a></li>
</ul>
<h2>4.31.7 - 05 Dec 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.7. <a href="https://redirect.github.com/github/codeql-action/pull/3343">#3343</a></li>
</ul>
<h2>4.31.6 - 01 Dec 2025</h2>
<p>No user facing changes.</p>
<h2>4.31.5 - 24 Nov 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.6. <a href="https://redirect.github.com/github/codeql-action/pull/3321">#3321</a></li>
</ul>
<h2>4.31.4 - 18 Nov 2025</h2>
<p>No user facing changes.</p>
<h2>4.31.3 - 13 Nov 2025</h2>
<ul>
<li>CodeQL Action v3 will be deprecated in December 2026.  The Action now logs a warning for customers who are running v3 but could be running v4. For more information, see <a href="https://github.blog/changelog/2025-10-28-upcoming-deprecation-of-codeql-action-v3/">Upcoming deprecation of CodeQL Action v3</a>.</li>
<li>Update default CodeQL bundle version to 2.23.5. <a href="https://redirect.github.com/github/codeql-action/pull/3288">#3288</a></li>
</ul>
<h2>4.31.2 - 30 Oct 2025</h2>
<p>No user facing changes.</p>
<h2>4.31.1 - 30 Oct 2025</h2>
<ul>
<li>The <code>add-snippets</code> input has been removed from the <code>analyze</code> action. This input has been deprecated since CodeQL Action 3.26.4 in August 2024 when this removal was announced.</li>
</ul>
<h2>4.31.0 - 24 Oct 2025</h2>
<ul>
<li>Bump minimum CodeQL bundle version to 2.17.6. <a href="https://redirect.github.com/github/codeql-action/pull/3223">#3223</a></li>
<li>When SARIF files are uploaded by the <code>analyze</code> or <code>upload-sarif</code> actions, the CodeQL Action automatically performs post-processing steps to prepare the data for the upload. Previously, these post-processing steps were only performed before an upload took place. We are now changing this so that the post-processing steps will always be performed, even when the SARIF files are not uploaded. This does not change anything for the <code>upload-sarif</code> action. For <code>analyze</code>, this may affect Advanced Setup for CodeQL users who specify a value other than <code>always</code> for the <code>upload</code> input. <a href="https://redirect.github.com/github/codeql-action/pull/3222">#3222</a></li>
</ul>
<h2>4.30.9 - 17 Oct 2025</h2>
<ul>
<li>Update default CodeQL bundle version to 2.23.3. <a href="https://redirect.github.com/github/codeql-action/pull/3205">#3205</a></li>
<li>Experimental: A new <code>setup-codeql</code> action has been added which is similar to <code>init</code>, except it only installs the CodeQL CLI and does not initialize a database. Do not use this in production as it is part of an internal experiment and subject to change at any time. <a href="https://redirect.github.com/github/codeql-action/pull/3204">#3204</a></li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="https://github.com/github/codeql-action/commit/1b168cd39490f61582a9beae412bb7057a6b2c4e"><code>1b168cd</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3355">#3355</a> from github/update-v4.31.8-1b0b941e1</li>
<li><a href="https://github.com/github/codeql-action/commit/120f277b1613fcef1261eb850ba9b01ca444bbef"><code>120f277</code></a> Update changelog for v4.31.8</li>
<li><a href="https://github.com/github/codeql-action/commit/1b0b941e1fbd5cb8122c5ebdf087be9d02534840"><code>1b0b941</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3354">#3354</a> from github/update-bundle/codeql-bundle-v2.23.8</li>
<li><a href="https://github.com/github/codeql-action/commit/db812c1ae639aa466b2b1f4a921f823c54371173"><code>db812c1</code></a> Add changelog note</li>
<li><a href="https://github.com/github/codeql-action/commit/2930dba17ac868bf1d3114f09837dbfb9619aa05"><code>2930dba</code></a> Update default bundle to codeql-bundle-v2.23.8</li>
<li><a href="https://github.com/github/codeql-action/commit/c43362b91a940600cde2ebae39ec7a35ad66bdc0"><code>c43362b</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3340">#3340</a> from github/kaspersv/check-for-overlayBaseSpecifier</li>
<li><a href="https://github.com/github/codeql-action/commit/002a7f25fdbaa5bc68ab7b87a336015eebea0b1f"><code>002a7f2</code></a> Overlay: log overlayBaseSpecifier at debug log-level</li>
<li><a href="https://github.com/github/codeql-action/commit/5b7e7fcc9c5a25e1129581e9733c0f6fb5078a71"><code>5b7e7fc</code></a> Update src/codeql.ts</li>
<li><a href="https://github.com/github/codeql-action/commit/149d184a5153ea45e6fbcef5588ac7b8c7af9835"><code>149d184</code></a> Merge pull request <a href="https://redirect.github.com/github/codeql-action/issues/3345">#3345</a> from github/mergeback/v4.31.7-to-main-cf1bb45a</li>
<li><a href="https://github.com/github/codeql-action/commit/97c2630b10bd11032a1791444ba86763b11a21e1"><code>97c2630</code></a> Rebuild</li>
<li>Additional commits viewable in <a href="https://github.com/github/codeql-action/compare/cf1bb45a277cb3c205638b2cd5c984db1c46a412...1b168cd39490f61582a9beae412bb7057a6b2c4e">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=github/codeql-action&package-manager=github_actions&previous-version=4.31.7&new-version=4.31.8)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

</details>
Copybara import of the project:

--
783dbfc90802f02381e5e374bb268e75dc616ae2 by dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>:

Bump github/codeql-action from 4.31.7 to 4.31.8

Bumps [github/codeql-action](https://github.com/github/codeql-action) from 4.31.7 to 4.31.8.
- [Release notes](https://github.com/github/codeql-action/releases)
- [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md)
- [Commits](https://github.com/github/codeql-action/compare/cf1bb45a277cb3c205638b2cd5c984db1c46a412...1b168cd39490f61582a9beae412bb7057a6b2c4e)

---
updated-dependencies:
- dependency-name: github/codeql-action
  dependency-version: 4.31.8
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>

Merging this change closes #35268

PiperOrigin-RevId: 845103388
---
 third_party/xla/.github/workflows/scorecards-analysis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/.github/workflows/scorecards-analysis.yml b/third_party/xla/.github/workflows/scorecards-analysis.yml
index 0e410de5bd29bf..0751547d4aea89 100644
--- a/third_party/xla/.github/workflows/scorecards-analysis.yml
+++ b/third_party/xla/.github/workflows/scorecards-analysis.yml
@@ -67,6 +67,6 @@ jobs:
       # Upload the results to GitHub's code scanning dashboard (optional).
       # Commenting out will disable upload of results to your repo's Code Scanning dashboard
       - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@cf1bb45a277cb3c205638b2cd5c984db1c46a412 # v4.31.7
+        uses: github/codeql-action/upload-sarif@1b168cd39490f61582a9beae412bb7057a6b2c4e # v4.31.8
         with:
           sarif_file: results.sarif

From be28108d5d3c00ef1a69680784d2f38aace68ef0 Mon Sep 17 00:00:00 2001
From: spiao <Songlin.Piao@amd.com>
Date: Mon, 15 Dec 2025 23:26:23 -0800
Subject: [PATCH 305/753] PR #35148: [ROCm] fixed
 //xla/tools/hlo_opt:tests/gpu_hlo_llvm.hlo.test

Imported from GitHub PR https://github.com/openxla/xla/pull/35148

The PR fixed the failed //xla/tools/hlo_opt:tests/gpu_hlo_llvm.hlo.test on rocm.

The @wrapped_b kernel (the transpose) was never actually tested because its name doesn't contain "fusion".

The test passed on NVIDIA "by accident" - it was checking the reduce kernel twice, not the transpose kernel. The fix to use CHECK-LABEL: wrapped_b actually makes the test check what it was originally intended to check.

For the second transpose test, both platforms use the direct element-wise copy approach.

With this PR,  the test can pass on both MI300x and H100.

**./bazel-7.4.1-linux-x86_64 run --test_sharding_strategy=disabled //xla/tools/hlo_opt:tests/gpu_hlo_llvm.hlo.test**

@xla-rotation could you review my PR, please?
Copybara import of the project:

--
8d459548d05b7679fbbe21d4205e405004971dcc by Songlin Piao <Songlin.Piao@amd.com>:

correct the kernel name as the original test was checking the wrong thing - the wrapped_b kernel doesn't use barriers on either platform.  For this second transpose test, both platforms use the direct element-wise copy approach.

Merging this change closes #35148

PiperOrigin-RevId: 845103912
---
 third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
index e92abf0eaee91f..2dac74fda61516 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
@@ -22,9 +22,9 @@ ENTRY e {
 HloModule Test, is_scheduled=true
 
 
-// CHECK-LABEL: fusion
-// CHECK-PTX:     call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
-// CHECK-GCN:     call void @llvm.amdgcn.s.barrier
+// CHECK-LABEL: wrapped_b
+// CHECK-PTX:     call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-GCN:     call i32 @llvm.amdgcn.workitem.id.x()
 fused_computation {
   param_0 = f32[100,200]{1,0} parameter(0)
   ROOT b.1 = f32[100,200]{0,1} copy(f32[100,200]{1,0} param_0)

From 5373efd36023abfb481a175e6bce5bf19bc2069f Mon Sep 17 00:00:00 2001
From: Dillon Sharlet <dsharlet@google.com>
Date: Mon, 15 Dec 2025 23:36:35 -0800
Subject: [PATCH 306/753] Enable more elementwise ops to fuse in YNNPACK

PiperOrigin-RevId: 845107315
---
 .../xla/backends/cpu/tests/ynn_fusion_test.cc | 14 ++++-----
 .../xla/xla/backends/cpu/ynn_support.cc       | 12 +++++++
 .../xla/xla/pjrt/cpu/cpu_client_test.cc       | 31 ++++++++++++-------
 3 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/tests/ynn_fusion_test.cc b/third_party/xla/xla/backends/cpu/tests/ynn_fusion_test.cc
index 413f90b4ab959b..4903ef04530452 100644
--- a/third_party/xla/xla/backends/cpu/tests/ynn_fusion_test.cc
+++ b/third_party/xla/xla/backends/cpu/tests/ynn_fusion_test.cc
@@ -61,16 +61,16 @@ TEST_P(YnnFusionTest, AddAndMultiply) {
     HloModule add_and_multiply
 
     ynn_fusion {
-      %lhs = $dtype[4] parameter(0)
-      %rhs = $dtype[4] parameter(1)
-      %add = $dtype[4] add(%lhs, %rhs)
-      ROOT %mul = $in_dtype[4] multiply(%add, %add)
+      %lhs = $dtype[100] parameter(0)
+      %rhs = $dtype[100] parameter(1)
+      %add = $dtype[100] add(%lhs, %rhs)
+      ROOT %mul = $in_dtype[100] multiply(%add, %add)
     }
 
     ENTRY entry {
-      %p0 = $dtype[4] parameter(0)
-      %p1 = $dtype[4] parameter(1)
-      ROOT %fusion = $dtype[4] fusion(%p0, %p1), kind=kCustom, calls=ynn_fusion,
+      %p0 = $dtype[100] parameter(0)
+      %p1 = $dtype[100] parameter(1)
+      ROOT %fusion = $dtype[100] fusion(%p0, %p1), kind=kCustom, calls=ynn_fusion,
         backend_config={"fusion_config": {kind: "__ynn_fusion"}}
     })";
 
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index 949748ad143071..a4281088298088 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -48,10 +48,13 @@ const absl::flat_hash_map<HloOpcode, ynn_unary_operator>& GetYnnUnaryOpMap() {
           {HloOpcode::kCeil, ynn_unary_ceil},
           {HloOpcode::kConvert, ynn_unary_convert},
           {HloOpcode::kCos, ynn_unary_cosine},
+          {HloOpcode::kErf, ynn_unary_erf},
           {HloOpcode::kExp, ynn_unary_exp},
+          {HloOpcode::kExpm1, ynn_unary_expm1},
           {HloOpcode::kCbrt, ynn_unary_cube_root},
           {HloOpcode::kFloor, ynn_unary_floor},
           {HloOpcode::kLog, ynn_unary_log},
+          {HloOpcode::kLog1p, ynn_unary_log1p},
           {HloOpcode::kLogistic, ynn_unary_sigmoid},
           {HloOpcode::kNegate, ynn_unary_negate},
           {HloOpcode::kRoundNearestEven, ynn_unary_round},
@@ -141,6 +144,15 @@ bool IsElementwiseOpSupportedByYnn(const HloInstruction* hlo) {
     return false;
   }
 
+  // We don't want to handle ops that are too small, overhead will be
+  // significant.
+  // TODO(b/469236467): This threshold is probably too small in some cases and
+  // too big in others.
+  constexpr int64_t kMinElements = 64;
+  if (ShapeUtil::ElementsIn(hlo->shape()) < kMinElements) {
+    return false;
+  }
+
   switch (hlo->operand_count()) {
     case 1:
       return YnnUnaryOperator(hlo->opcode()).ok();
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc b/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc
index f936afbe72e944..c7b6b231e5a7db 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include <array>
+#include <numeric>
 
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/MLIRContext.h"
@@ -1037,16 +1038,16 @@ TEST(PjRtCpuClientTest, SerializeYnnFusions) {
     HloModule add_and_multiply
 
     ynn_fusion {
-      %lhs = f32[4] parameter(0)
-      %rhs = f32[4] parameter(1)
-      %add = f32[4] add(%lhs, %rhs)
-      ROOT %mul = f32[4] multiply(%add, %add)
+      %lhs = f32[100] parameter(0)
+      %rhs = f32[100] parameter(1)
+      %add = f32[100] add(%lhs, %rhs)
+      ROOT %mul = f32[100] multiply(%add, %add)
     }
 
     ENTRY entry {
-      %p0 = f32[4] parameter(0)
-      %p1 = f32[4] parameter(1)
-      ROOT %fusion = f32[4] fusion(%p0, %p1), kind=kCustom, calls=ynn_fusion,
+      %p0 = f32[100] parameter(0)
+      %p1 = f32[100] parameter(1)
+      ROOT %fusion = f32[100] fusion(%p0, %p1), kind=kCustom, calls=ynn_fusion,
         backend_config={"fusion_config": {kind: "__ynn_fusion"}}
     })";
 
@@ -1057,7 +1058,15 @@ TEST(PjRtCpuClientTest, SerializeYnnFusions) {
   TF_ASSERT_OK_AND_ASSIGN(auto executable,
                           client->CompileAndLoad(xla_computation, {}));
 
-  Literal literal = LiteralUtil::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f});
+  std::vector<float> literal_data(100);
+  std::iota(literal_data.begin(), literal_data.end(), 1.0f);
+
+  std::vector<float> literal_data_x2_squared(literal_data);
+  for (float& i : literal_data_x2_squared) {
+    i = 4 * i * i;
+  }
+
+  Literal literal = LiteralUtil::CreateR1<float>(literal_data);
   TF_ASSERT_OK_AND_ASSIGN(auto buf, client->BufferFromHostLiteral(
                                         literal, client->memory_spaces()[0]));
 
@@ -1067,8 +1076,7 @@ TEST(PjRtCpuClientTest, SerializeYnnFusions) {
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal,
                           result->at(0).at(0)->ToLiteralSync());
   EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR1<float>({4.0f, 16.0f, 36.0f, 64.0f}),
-      *result_literal));
+      LiteralUtil::CreateR1<float>(literal_data_x2_squared), *result_literal));
 
   // Check that serialized/deserialized executable works and produces the same
   // result.
@@ -1081,8 +1089,7 @@ TEST(PjRtCpuClientTest, SerializeYnnFusions) {
   result = executable->Execute({{buf.get(), buf.get()}}, opts);
   TF_ASSERT_OK_AND_ASSIGN(result_literal, result->at(0).at(0)->ToLiteralSync());
   EXPECT_TRUE(LiteralTestUtil::Equal(
-      LiteralUtil::CreateR1<float>({4.0f, 16.0f, 36.0f, 64.0f}),
-      *result_literal));
+      LiteralUtil::CreateR1<float>(literal_data_x2_squared), *result_literal));
 }
 
 }  // namespace

From 955183ead5b40a329bf86892eecac69729eec4d5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 15 Dec 2025 23:46:32 -0800
Subject: [PATCH 307/753] Automated Code Change

PiperOrigin-RevId: 845110147
---
 third_party/xla/xla/mlir_hlo/bindings/c/Attributes.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/mlir_hlo/bindings/c/Attributes.cc b/third_party/xla/xla/mlir_hlo/bindings/c/Attributes.cc
index c23bfee06fa7bd..f611687ea4be92 100644
--- a/third_party/xla/xla/mlir_hlo/bindings/c/Attributes.cc
+++ b/third_party/xla/xla/mlir_hlo/bindings/c/Attributes.cc
@@ -12,6 +12,7 @@ limitations under the License.
 
 #include "bindings/c/Attributes.h"
 
+#include <cstdint>
 #include <optional>
 
 #include "mhlo/IR/hlo_ops.h"

From 1440bf89416dd4e9478d078007bc3c7662dc1aa6 Mon Sep 17 00:00:00 2001
From: Ce Zheng <zce@google.com>
Date: Mon, 15 Dec 2025 23:53:25 -0800
Subject: [PATCH 308/753] [PJRT] Change HostMemoryAllocator::Factory to take
 Options by value.

PiperOrigin-RevId: 845111876
---
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc   | 2 +-
 third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc | 2 +-
 third_party/xla/xla/pjrt/host_memory_allocator.h     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index b12dcf160cfdbc..b0b2b83841c0c1 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -1814,7 +1814,7 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> GetStreamExecutorGpuClient(
       return absl::OkStatus();
     };
     host_memory_allocator =
-        options.host_memory_allocator_factory(allocator_options);
+        options.host_memory_allocator_factory(std::move(allocator_options));
   } else {
     TF_ASSIGN_OR_RETURN(
         auto allocator,
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
index d36b6acc5510ed..377d27976a6f87 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
@@ -1211,7 +1211,7 @@ absl::StatusOr<std::unique_ptr<PjRtClient>> GetTfrtGpuClient(
       return absl::OkStatus();
     };
     host_memory_allocator =
-        options.host_memory_allocator_factory(allocator_options);
+        options.host_memory_allocator_factory(std::move(allocator_options));
   } else if (!xla_client->backend().stream_executors().empty()) {
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<tsl::Allocator> allocator,
diff --git a/third_party/xla/xla/pjrt/host_memory_allocator.h b/third_party/xla/xla/pjrt/host_memory_allocator.h
index 8123ade19c7887..8ebf83ecadcbd4 100644
--- a/third_party/xla/xla/pjrt/host_memory_allocator.h
+++ b/third_party/xla/xla/pjrt/host_memory_allocator.h
@@ -39,8 +39,8 @@ class HostMemoryAllocator {
     absl::AnyInvocable<absl::Status(void*)> unmap_fn;
   };
 
-  using Factory = std::function<std::unique_ptr<HostMemoryAllocator>(
-      const Options& options)>;
+  using Factory =
+      std::function<std::unique_ptr<HostMemoryAllocator>(Options options)>;
 
   struct Deleter {
     void operator()(void* ptr) { deleter(ptr, arg); }

From b2dfa37f332b641f219bdc5afcd5d8553b95a641 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 00:00:58 -0800
Subject: [PATCH 309/753] Automated Code Change

PiperOrigin-RevId: 845114273
---
 tensorflow/c/kernels/BUILD               | 1 +
 tensorflow/c/kernels/bitcast_op_test.cc  | 1 +
 tensorflow/c/kernels/merge_summary_op.cc | 1 +
 3 files changed, 3 insertions(+)

diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD
index 6e8dbc8512fa86..a7e93841a98627 100644
--- a/tensorflow/c/kernels/BUILD
+++ b/tensorflow/c/kernels/BUILD
@@ -117,6 +117,7 @@ tf_cc_test(
         "//tensorflow/core:testlib",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_xla//xla/tsl/protobuf:error_codes_proto_impl_cc",
     ],
 )
diff --git a/tensorflow/c/kernels/bitcast_op_test.cc b/tensorflow/c/kernels/bitcast_op_test.cc
index c44bc832547dab..e7ae841194f226 100644
--- a/tensorflow/c/kernels/bitcast_op_test.cc
+++ b/tensorflow/c/kernels/bitcast_op_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/container/inlined_vector.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "xla/tsl/protobuf/error_codes.pb.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
diff --git a/tensorflow/c/kernels/merge_summary_op.cc b/tensorflow/c/kernels/merge_summary_op.cc
index ddbc3440d47dc1..9945f473874e20 100644
--- a/tensorflow/c/kernels/merge_summary_op.cc
+++ b/tensorflow/c/kernels/merge_summary_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include <memory>
 #include <sstream>
+#include <string>
 #include <unordered_set>
 
 #include "absl/log/check.h"

From dfa4f241a097b1c524090592173da2ebde1481c5 Mon Sep 17 00:00:00 2001
From: Shaogang Wang <shawnw@nvidia.com>
Date: Tue, 16 Dec 2025 00:10:27 -0800
Subject: [PATCH 310/753] PR #35237: [XLA:GPU] Add synchronize API for cuda
 event
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35237

📝 Summary of Changes
This PR adds support for synchronizing CUDA events from the host and includes a corresponding unit test.

🎯 Justification
We may have XLA optimization that might needs this support, e.g. from command buffer update-free cuda-graph.

🚀 Kind of Contribution
✨ New Feature

🧪 Unit Tests:
xla/stream_executor/cuda/cuda_event_test.cc

Copybara import of the project:

--
53b38a2526453bf19300f2a1e7786e8949f5dc26 by Shawn Wang <shawnw@nvidia.com>:

Add synchronize API for cuda event

--
041fb9a19969390d665f2aab35a142fa4d1731bf by Shawn Wang <shawnw@nvidia.com>:

add missing header

Merging this change closes #35237

PiperOrigin-RevId: 845118206
---
 .../xla/xla/stream_executor/cuda/BUILD        |  3 ++
 .../xla/stream_executor/cuda/cuda_event.cc    |  5 ++++
 .../xla/xla/stream_executor/cuda/cuda_event.h |  1 +
 .../stream_executor/cuda/cuda_event_test.cc   | 29 +++++++++++++++++++
 third_party/xla/xla/stream_executor/event.h   |  7 +++++
 5 files changed, 45 insertions(+)

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index faa047a83318fc..a74214ce0f1f73 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -783,10 +783,13 @@ xla_test(
         ":cuda_event",
         ":cuda_executor",
         ":cuda_platform_id",
+        ":cuda_stream",
         "//xla/stream_executor:event",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/lib/core:status_test_util",
+        "@com_google_absl//absl/status:status_matchers",
         "@com_google_googletest//:gtest_main",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_tsl//tsl/platform:statusor",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_event.cc b/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
index 5656939a68c091..f232232db3f053 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_event.cc
@@ -88,6 +88,11 @@ absl::Status CudaEvent::WaitForEventOnExternalStream(std::intptr_t stream) {
                            handle_);
 }
 
+absl::Status CudaEvent::Synchronize() {
+  std::unique_ptr<ActivateContext> activation = executor_->Activate();
+  return cuda::ToStatus(cuEventSynchronize(handle_));
+}
+
 absl::StatusOr<CudaEvent> CudaEvent::Create(StreamExecutor *executor,
                                             bool allow_timing) {
   TF_ASSIGN_OR_RETURN(
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_event.h b/third_party/xla/xla/stream_executor/cuda/cuda_event.h
index 0d6f871d0fbcc7..c338da7795fa8b 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_event.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_event.h
@@ -33,6 +33,7 @@ class CudaEvent : public Event {
  public:
   Event::Status PollForStatus() override;
   absl::Status WaitForEventOnExternalStream(std::intptr_t stream) override;
+  absl::Status Synchronize() override;
 
   // Creates a new CudaEvent. If allow_timing is false, the event will not
   // support timing, which is cheaper to create.
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_event_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_event_test.cc
index 1a8469f4a33e81..d9b230f7f198d9 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_event_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_event_test.cc
@@ -15,16 +15,20 @@ limitations under the License.
 
 #include "xla/stream_executor/cuda/cuda_event.h"
 
+#include <memory>
 #include <utility>
 
 #include <gtest/gtest.h>
+#include "absl/status/status_matchers.h"
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "xla/stream_executor/cuda/cuda_executor.h"
 #include "xla/stream_executor/cuda/cuda_platform_id.h"
+#include "xla/stream_executor/cuda/cuda_stream.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/lib/core/status_test_util.h"
 #include "tsl/platform/statusor.h"
 #include "tsl/platform/test.h"
 
@@ -50,6 +54,31 @@ TEST(CudaEventTest, CreateEvent) {
   EXPECT_EQ(event2.GetHandle(), handle);
 }
 
+TEST(CudaEventTest, Synchronize) {
+  TF_ASSERT_OK_AND_ASSIGN(Platform * platform,
+                          stream_executor::PlatformManager::PlatformWithId(
+                              stream_executor::cuda::kCudaPlatformId));
+  TF_ASSERT_OK_AND_ASSIGN(StreamExecutor * executor,
+                          platform->ExecutorForDevice(0));
+  CudaExecutor* cuda_executor = reinterpret_cast<CudaExecutor*>(executor);
+
+  TF_ASSERT_OK_AND_ASSIGN(CudaEvent event,
+                          CudaEvent::Create(cuda_executor, false));
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<CudaStream> stream,
+                          CudaStream::Create(cuda_executor,
+                                             /*priority=*/std::nullopt));
+
+  // Record the event on the stream.
+  TF_ASSERT_OK(stream->RecordEvent(&event));
+
+  // Synchronize on the event (blocks until the event is recorded).
+  EXPECT_THAT(event.Synchronize(), absl_testing::IsOk());
+
+  // After synchronization, the event should be complete.
+  EXPECT_EQ(event.PollForStatus(), Event::Status::kComplete);
+}
+
 }  // namespace
 
 }  // namespace stream_executor::gpu
diff --git a/third_party/xla/xla/stream_executor/event.h b/third_party/xla/xla/stream_executor/event.h
index d79c10b74f4b95..f63e9736e30e9d 100644
--- a/third_party/xla/xla/stream_executor/event.h
+++ b/third_party/xla/xla/stream_executor/event.h
@@ -50,6 +50,13 @@ class Event {
   virtual absl::Status WaitForEventOnExternalStream(std::intptr_t stream) {
     return absl::UnimplementedError("Not supported for this Event.");
   }
+
+  // Blocks the calling host thread until the event has been recorded.
+  // Wraps the underlying platform-specific synchronization (e.g.
+  // cuEventSynchronize for CUDA).
+  virtual absl::Status Synchronize() {
+    return absl::UnimplementedError("Not supported for this Event.");
+  }
 };
 
 }  // namespace stream_executor

From da574d33a05ddb2800545320f418b4675a12bad3 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Tue, 16 Dec 2025 00:39:22 -0800
Subject: [PATCH 311/753] Prepare XLA Bazel dependencies for enabling the
 layering check

This makes a bunch of XLA's third party dependencies compliant with
Bazel's layering check by either:

1. disabling the layering check for the dependency's targets, or by
2. fixing the build rules so that they comply with the layering check

Both is either achieved by:

1. patching the packages itself through patch files
2. changing the BUILD files directly if we maintain Bazel BUILD files separately in the XLA repo.

Some of the patches can be upstreamed which I'm going to attempt separately.

PiperOrigin-RevId: 845127409
---
 third_party/py/python_init_pip.bzl            |    4 +
 .../gen_disable_layering_check_patch.sh       |   61 +
 third_party/xla/third_party/boringssl.patch   |   13 +
 third_party/xla/third_party/curl.BUILD        |    1 +
 .../third_party/googletest/googletest.patch   |   46 +-
 .../third_party/highwayhash/highwayhash.BUILD |    1 +
 .../third_party/ortools/layering_check.patch  | 4261 +++++++++++++++++
 .../xla/third_party/py/python_init_pip.bzl    |    4 +
 .../rocm_device_libs/rocm_device_libs.BUILD   |    1 +
 .../xnnpack/layering_check_fix.patch          |   12 +
 .../xla/third_party/xnnpack/workspace.bzl     |    1 +
 third_party/xla/third_party/zlib.BUILD        |    6 +-
 third_party/xla/workspace2.bzl                |   18 +-
 13 files changed, 4387 insertions(+), 42 deletions(-)
 create mode 100755 third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh
 create mode 100644 third_party/xla/third_party/boringssl.patch
 create mode 100644 third_party/xla/third_party/ortools/layering_check.patch
 create mode 100644 third_party/xla/third_party/xnnpack/layering_check_fix.patch

diff --git a/third_party/py/python_init_pip.bzl b/third_party/py/python_init_pip.bzl
index 7689b92b60a00a..39901b9b2e64ea 100644
--- a/third_party/py/python_init_pip.bzl
+++ b/third_party/py/python_init_pip.bzl
@@ -24,6 +24,10 @@ cc_library(
 cc_library(
     name = "numpy_headers",
     deps = [":numpy_headers_2", ":numpy_headers_1"],
+    # For the layering check to work we need to re-export the headers from the
+    # dependencies.
+    hdrs = glob(["site-packages/numpy/_core/include/**/*.h"]) +
+           glob(["site-packages/numpy/core/include/**/*.h"]),
 )
 """,
         ),
diff --git a/third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh b/third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh
new file mode 100755
index 00000000000000..f1d9f7d670eaa9
--- /dev/null
+++ b/third_party/xla/build_tools/dependencies/gen_disable_layering_check_patch.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright 2025 The OpenXLA Authors.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Generates a patch file that disables the layering check for all cc_library
+# targets in the archive. Both BUILD and BUILD.bazel files are taken into account.
+#
+# The script takes one argument: the URL of the .tar.gz archive to download.
+#
+# The following tools are needed (need to be installed on the machine):
+# - curl
+# - git
+# - buildozer (from Bazel buildtools)
+#
+# The tool has originally been written for ortools but should work for similarly structured
+# projects as well.
+#
+# Example:
+# build_tools/dependencies/gen_disable_layering_check_patch.sh \
+# https://github.com/google/or-tools/archive/v9.11.tar.gz \
+# > third_party/ortools/layering_check.patch
+
+set -euo pipefail
+
+readonly TMP_DIR=$(mktemp -d)
+trap 'rm -rf -- $TMP_DIR' EXIT
+
+echo "Downloading archive $1..." >&2
+curl -Lqo "$TMP_DIR/archive.tar.gz" "$1" 1>&2
+
+echo "Extracting archive..." >&2
+mkdir -p "$TMP_DIR/extracted" 1>&2
+tar  -x -C "$TMP_DIR/extracted" -f "$TMP_DIR/archive.tar.gz" --strip-components=1 1>&2
+
+echo "Initialzing temporary git repo..." >&2
+git -C "$TMP_DIR/extracted" init 1>&2
+git -C "$TMP_DIR/extracted" add . 1>&2
+git -C "$TMP_DIR/extracted" commit --no-verify -m "original state" -q 1>&2
+
+echo "Patching build targets..." >&2
+find $TMP_DIR/extracted -name BUILD.bazel -or -name BUILD | while read f; do
+   buildozer 'add features "-layering_check"' $(dirname $f):%cc_library 1>&2 || exit_code=$?
+   if [[ $exit_code -ne 0 && $exit_code -ne 3 ]]; then
+     echo "Buildozer command failed with exit code: $exit_code" >&2
+     exit $exit_code
+   fi
+done
+
+echo "Generating diff..." >&2
+git -C "$TMP_DIR/extracted" --no-pager diff
diff --git a/third_party/xla/third_party/boringssl.patch b/third_party/xla/third_party/boringssl.patch
new file mode 100644
index 00000000000000..31433753e3abde
--- /dev/null
+++ b/third_party/xla/third_party/boringssl.patch
@@ -0,0 +1,13 @@
+diff --git a/BUILD b/BUILD
+index 206786442..3d1624382 100644
+--- a/BUILD
++++ b/BUILD
+@@ -145,7 +145,7 @@ cc_library(
+ 
+ cc_library(
+     name = "ssl",
+-    srcs = ssl_sources + ssl_internal_headers,
++    srcs = ssl_sources + ssl_internal_headers + crypto_internal_headers,
+     hdrs = ssl_headers,
+     copts = boringssl_copts_cxx,
+     includes = ["src/include"],
diff --git a/third_party/xla/third_party/curl.BUILD b/third_party/xla/third_party/curl.BUILD
index cb33aa940fe48f..c1884050002a5e 100644
--- a/third_party/xla/third_party/curl.BUILD
+++ b/third_party/xla/third_party/curl.BUILD
@@ -442,6 +442,7 @@ cc_library(
         "@local_xla//xla/tsl:ios": [],
         "@local_xla//xla/tsl:windows": [],
         "//conditions:default": [
+            "@boringssl//:crypto",
             "@boringssl//:ssl",
         ],
     }),
diff --git a/third_party/xla/third_party/googletest/googletest.patch b/third_party/xla/third_party/googletest/googletest.patch
index 7e6e300ed273a1..b9f95d91084e6d 100644
--- a/third_party/xla/third_party/googletest/googletest.patch
+++ b/third_party/xla/third_party/googletest/googletest.patch
@@ -2,40 +2,14 @@ diff --git a/BUILD.bazel b/BUILD.bazel
 index cc254457..49120384 100644
 --- a/BUILD.bazel
 +++ b/BUILD.bazel
-@@ -142,16 +142,16 @@ cc_library(
-     }),
-     deps = select({
-         ":has_absl": [
--            "@abseil-cpp//absl/container:flat_hash_set",
--            "@abseil-cpp//absl/debugging:failure_signal_handler",
--            "@abseil-cpp//absl/debugging:stacktrace",
--            "@abseil-cpp//absl/debugging:symbolize",
--            "@abseil-cpp//absl/flags:flag",
--            "@abseil-cpp//absl/flags:parse",
--            "@abseil-cpp//absl/flags:reflection",
--            "@abseil-cpp//absl/flags:usage",
--            "@abseil-cpp//absl/strings",
--            "@re2",
-+            "@com_google_absl//absl/container:flat_hash_set",
-+            "@com_google_absl//absl/debugging:failure_signal_handler",
-+            "@com_google_absl//absl/debugging:stacktrace",
-+            "@com_google_absl//absl/debugging:symbolize",
-+            "@com_google_absl//absl/flags:flag",
-+            "@com_google_absl//absl/flags:parse",
-+            "@com_google_absl//absl/flags:reflection",
-+            "@com_google_absl//absl/flags:usage",
-+            "@com_google_absl//absl/strings",
-+            "@com_googlesource_code_re2//:re2",
-         ],
+@@ -178,6 +178,10 @@ alias(
+ cc_library(
+     name = "gtest_main",
+     srcs = ["googlemock/src/gmock_main.cc"],
++    hdrs = glob([
++        "googletest/include/gtest/*.h",
++        "googlemock/include/gmock/*.h",
++    ]),
+     features = select({
+         ":windows": ["windows_export_all_symbols"],
          "//conditions:default": [],
-     }) + select({
-@@ -160,9 +160,6 @@ cc_library(
-         # so that's why these libraries are needed.
-         # Otherwise, builds targeting Fuchsia would fail to compile.
-         ":fuchsia": [
--            "@fuchsia_sdk//pkg/fdio",
--            "@fuchsia_sdk//pkg/syslog",
--            "@fuchsia_sdk//pkg/zx",
-         ],
-         "//conditions:default": [],
-     }),
diff --git a/third_party/xla/third_party/highwayhash/highwayhash.BUILD b/third_party/xla/third_party/highwayhash/highwayhash.BUILD
index 0314bd443f2617..2c409c8eb8597e 100644
--- a/third_party/xla/third_party/highwayhash/highwayhash.BUILD
+++ b/third_party/xla/third_party/highwayhash/highwayhash.BUILD
@@ -255,6 +255,7 @@ cc_library(
     deps = [
         ":arch_specific",
         ":compiler_specific",
+        ":endianess",
         ":hh_types",
         ":iaca",
         ":load3",
diff --git a/third_party/xla/third_party/ortools/layering_check.patch b/third_party/xla/third_party/ortools/layering_check.patch
new file mode 100644
index 00000000000000..3c2240d8d39e44
--- /dev/null
+++ b/third_party/xla/third_party/ortools/layering_check.patch
@@ -0,0 +1,4261 @@
+diff --git a/examples/cpp/BUILD.bazel b/examples/cpp/BUILD.bazel
+index 6cc1490..a7fa5c1 100644
+--- a/examples/cpp/BUILD.bazel
++++ b/examples/cpp/BUILD.bazel
+@@ -711,6 +711,7 @@ cc_test(
+ cc_library(
+     name = "print_dimacs_assignment",
+     hdrs = ["print_dimacs_assignment.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:file",
+@@ -725,6 +726,7 @@ cc_library(
+ cc_library(
+     name = "parse_dimacs_assignment",
+     hdrs = ["parse_dimacs_assignment.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/graph:ebert_graph",
+@@ -878,6 +880,7 @@ cc_test(
+ cc_library(
+     name = "fap_parser",
+     hdrs = ["fap_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:file",
+@@ -891,6 +894,7 @@ cc_library(
+ cc_library(
+     name = "fap_model_printer",
+     hdrs = ["fap_model_printer.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":fap_parser",
+         "//ortools/base",
+@@ -903,6 +907,7 @@ cc_library(
+ cc_library(
+     name = "fap_utilities",
+     hdrs = ["fap_utilities.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":fap_parser",
+         "//ortools/base",
+diff --git a/ortools/algorithms/BUILD.bazel b/ortools/algorithms/BUILD.bazel
+index be5f372..4d1c6ae 100644
+--- a/ortools/algorithms/BUILD.bazel
++++ b/ortools/algorithms/BUILD.bazel
+@@ -65,6 +65,7 @@ cc_library(
+     name = "binary_search",
+     srcs = [],
+     hdrs = ["binary_search.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/functional:function_ref",
+@@ -95,6 +96,7 @@ cc_library(
+     name = "radix_sort",
+     srcs = [],
+     hdrs = ["radix_sort.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/algorithm:container",
+         "@com_google_absl//absl/base",
+@@ -132,6 +134,7 @@ cc_library(
+     name = "duplicate_remover",
+     srcs = ["duplicate_remover.cc"],
+     hdrs = ["duplicate_remover.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_absl//absl/numeric:bits",
+@@ -147,6 +150,7 @@ cc_library(
+     name = "hungarian",
+     srcs = ["hungarian.cc"],
+     hdrs = ["hungarian.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/container:flat_hash_map",
+@@ -174,6 +178,7 @@ cc_test(
+ cc_library(
+     name = "adjustable_k_ary_heap",
+     hdrs = ["adjustable_k_ary_heap.h"],
++    features = ["-layering_check"],
+     deps = ["@com_google_absl//absl/log:check"],
+ )
+ 
+@@ -213,6 +218,7 @@ cc_library(
+         ":use_scip": ["-DUSE_SCIP"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_absl//absl/strings",
+@@ -269,6 +275,7 @@ cc_library(
+     name = "set_cover_lagrangian",
+     srcs = ["set_cover_lagrangian.cc"],
+     hdrs = ["set_cover_lagrangian.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":adjustable_k_ary_heap",
+         ":set_cover_invariant",
+@@ -282,6 +289,7 @@ cc_library(
+     name = "set_cover_model",
+     srcs = ["set_cover_model.cc"],
+     hdrs = ["set_cover_model.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":set_cover_cc_proto",
+         "//ortools/base:intops",
+@@ -297,6 +305,7 @@ cc_library(
+     name = "set_cover_invariant",
+     srcs = ["set_cover_invariant.cc"],
+     hdrs = ["set_cover_invariant.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":set_cover_cc_proto",
+         ":set_cover_model",
+@@ -311,6 +320,7 @@ cc_library(
+     name = "set_cover_heuristics",
+     srcs = ["set_cover_heuristics.cc"],
+     hdrs = ["set_cover_heuristics.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":adjustable_k_ary_heap",
+         ":set_cover_invariant",
+@@ -328,6 +338,7 @@ cc_library(
+     name = "set_cover_mip",
+     srcs = ["set_cover_mip.cc"],
+     hdrs = ["set_cover_mip.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":set_cover_invariant",
+         ":set_cover_model",
+@@ -343,6 +354,7 @@ cc_library(
+     name = "set_cover_reader",
+     srcs = ["set_cover_reader.cc"],
+     hdrs = ["set_cover_reader.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":set_cover_model",
+         "//ortools/base:file",
+@@ -378,6 +390,7 @@ cc_test(
+ cc_library(
+     name = "dense_doubly_linked_list",
+     hdrs = ["dense_doubly_linked_list.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -387,6 +400,7 @@ cc_library(
+     name = "dynamic_partition",
+     srcs = ["dynamic_partition.cc"],
+     hdrs = ["dynamic_partition.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:murmur",
+         "@com_google_absl//absl/log:check",
+@@ -411,6 +425,7 @@ cc_library(
+     name = "sparse_permutation",
+     srcs = ["sparse_permutation.cc"],
+     hdrs = ["sparse_permutation.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/strings",
+@@ -433,6 +448,7 @@ cc_library(
+     name = "dynamic_permutation",
+     srcs = ["dynamic_permutation.cc"],
+     hdrs = ["dynamic_permutation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":sparse_permutation",
+         "//ortools/base",
+@@ -453,6 +469,7 @@ cc_library(
+     name = "find_graph_symmetries",
+     srcs = ["find_graph_symmetries.cc"],
+     hdrs = ["find_graph_symmetries.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":dense_doubly_linked_list",
+         ":dynamic_partition",
+@@ -507,6 +524,7 @@ cc_test(
+ cc_library(
+     name = "binary_indexed_tree",
+     hdrs = ["binary_indexed_tree.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+     ],
+@@ -525,6 +543,7 @@ cc_library(
+     name = "n_choose_k",
+     srcs = ["n_choose_k.cc"],
+     hdrs = ["n_choose_k.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":binary_search",
+         "//ortools/base:mathutil",
+diff --git a/ortools/algorithms/python/BUILD.bazel b/ortools/algorithms/python/BUILD.bazel
+index fe3de2c..0a4ccf9 100644
+--- a/ortools/algorithms/python/BUILD.bazel
++++ b/ortools/algorithms/python/BUILD.bazel
+@@ -48,6 +48,7 @@ config_setting(
+ cc_library(
+     name = "knapsack_solver_doc",
+     hdrs = ["knapsack_solver_doc.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+ )
+ 
+diff --git a/ortools/base/BUILD.bazel b/ortools/base/BUILD.bazel
+index c57c0d2..6ebc65a 100644
+--- a/ortools/base/BUILD.bazel
++++ b/ortools/base/BUILD.bazel
+@@ -54,6 +54,7 @@ cc_library(
+         "-DOR_TOOLS_MINOR=11",
+         "-DOR_TOOLS_PATCH=9999",
+     ],
++    features = ["-layering_check"],
+     linkopts = select({
+         "on_linux": [],
+         "on_macos": ["-framework CoreFoundation"],
+@@ -83,6 +84,7 @@ cc_library(
+ cc_library(
+     name = "accurate_sum",
+     hdrs = ["accurate_sum.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+@@ -91,6 +93,7 @@ cc_library(
+         "adjustable_priority_queue.h",
+         "adjustable_priority_queue-inl.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+     ],
+@@ -99,18 +102,21 @@ cc_library(
+ cc_library(
+     name = "basictypes",
+     hdrs = ["basictypes.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "bitmap",
+     srcs = ["bitmap.cc"],
+     hdrs = ["bitmap.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "case",
+     srcs = ["case.cc"],
+     hdrs = ["case.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+@@ -120,6 +126,7 @@ cc_library(
+         "commandlineflags.cc",
+     ],
+     hdrs = ["commandlineflags.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/flags:flag",
+         "@com_google_absl//absl/flags:parse",
+@@ -130,6 +137,7 @@ cc_library(
+ cc_library(
+     name = "container_logging",
+     hdrs = ["container_logging.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+@@ -142,6 +150,7 @@ cc_library(
+         "on_windows": ["/Zc:preprocessor"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/container:inlined_vector",
+     ],
+@@ -167,6 +176,7 @@ cc_test(
+ cc_library(
+     name = "dynamic_library",
+     hdrs = ["dynamic_library.h"],
++    features = ["-layering_check"],
+     linkopts = select({
+         "on_linux": ["-Wl,--no-as-needed -ldl"],
+         "on_macos": [],
+@@ -182,12 +192,14 @@ cc_library(
+ cc_library(
+     name = "encodingutils",
+     hdrs = ["encodingutils.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+ cc_library(
+     name = "flags",
+     hdrs = ["flags.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/flags:flag",
+     ],
+@@ -205,6 +217,7 @@ cc_library(
+         "helpers.h",
+         "options.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":status_macros",
+         "@com_google_absl//absl/log",
+@@ -218,6 +231,7 @@ cc_library(
+ cc_library(
+     name = "status_matchers",
+     hdrs = ["status_matchers.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "@com_google_absl//absl/status",
+@@ -230,6 +244,7 @@ cc_library(
+ cc_library(
+     name = "message_matchers",
+     hdrs = ["message_matchers.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/strings",
+         "@com_google_googletest//:gtest",
+@@ -240,6 +255,7 @@ cc_library(
+ cc_library(
+     name = "gmock",
+     hdrs = ["gmock.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":message_matchers",
+         ":status_matchers",
+@@ -249,6 +265,7 @@ cc_library(
+ 
+ cc_library(
+     name = "gmock_main",
++    features = ["-layering_check"],
+     deps = [
+         ":gmock",
+         "@com_google_googletest//:gtest_main",
+@@ -259,6 +276,7 @@ cc_library(
+     name = "gzipfile",
+     srcs = ["gzipfile.cc"],
+     hdrs = ["gzipfile.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":basictypes",
+@@ -272,6 +290,7 @@ cc_library(
+ cc_library(
+     name = "gzipstring",
+     hdrs = ["gzipstring.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "@zlib",
+@@ -286,6 +305,7 @@ cc_library(
+     hdrs = [
+         "hash.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/strings",
+     ],
+@@ -294,24 +314,28 @@ cc_library(
+ cc_library(
+     name = "int_type",
+     hdrs = ["int_type.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+ cc_library(
+     name = "intops",
+     hdrs = ["strong_int.h"],
++    features = ["-layering_check"],
+     deps = [":int_type"],
+ )
+ 
+ cc_library(
+     name = "iterator_adaptors",
+     hdrs = ["iterator_adaptors.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+ cc_library(
+     name = "linked_hash_map",
+     hdrs = ["linked_hash_map.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":logging",
+@@ -324,6 +348,7 @@ cc_library(
+     name = "logging",
+     srcs = ["logging.cc"],
+     hdrs = ["logging.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":macros",
+         "@com_google_absl//absl/base:log_severity",
+@@ -344,11 +369,13 @@ cc_library(
+ cc_library(
+     name = "macros",
+     hdrs = ["macros.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "map_util",
+     hdrs = ["map_util.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+@@ -356,6 +383,7 @@ cc_library(
+     name = "mathutil",
+     srcs = ["mathutil.cc"],
+     hdrs = ["mathutil.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+     ],
+@@ -364,12 +392,14 @@ cc_library(
+ cc_library(
+     name = "memfile",
+     hdrs = ["memfile.h"],
++    features = ["-layering_check"],
+     deps = [],
+ )
+ 
+ cc_library(
+     name = "murmur",
+     hdrs = ["murmur.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":hash",
+@@ -380,6 +410,7 @@ cc_library(
+ cc_library(
+     name = "mutable_memfile",
+     hdrs = ["mutable_memfile.h"],
++    features = ["-layering_check"],
+     deps = [],
+ )
+ 
+@@ -387,6 +418,7 @@ cc_library(
+     name = "numbers",
+     srcs = ["numbers.cc"],
+     hdrs = ["numbers.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":strtoint",
+         "@com_google_absl//absl/strings",
+@@ -396,6 +428,7 @@ cc_library(
+ cc_library(
+     name = "parse_text_proto",
+     hdrs = ["parse_text_proto.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_protobuf//:protobuf",
+@@ -406,6 +439,7 @@ cc_library(
+     name = "path",
+     srcs = ["path.cc"],
+     hdrs = ["path.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "@com_google_absl//absl/strings",
+@@ -416,6 +450,7 @@ cc_library(
+     name = "temp_path",
+     srcs = ["temp_path.cc"],
+     hdrs = ["temp_path.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":file",
+@@ -429,11 +464,13 @@ cc_library(
+ cc_library(
+     name = "protobuf_util",
+     hdrs = ["protobuf_util.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "protoutil",
+     hdrs = ["protoutil.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":timer",
+         "@com_google_absl//absl/status",
+@@ -445,12 +482,14 @@ cc_library(
+ cc_library(
+     name = "ptr_util",
+     hdrs = ["ptr_util.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "recordio",
+     srcs = ["recordio.cc"],
+     hdrs = ["recordio.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":file",
+@@ -465,18 +504,21 @@ cc_library(
+ cc_library(
+     name = "small_map",
+     hdrs = ["small_map.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+ cc_library(
+     name = "source_location",
+     hdrs = ["source_location.h"],
++    features = ["-layering_check"],
+     deps = ["@com_google_absl//absl/base:config"],
+ )
+ 
+ cc_library(
+     name = "status_builder",
+     hdrs = ["status_builder.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "@com_google_absl//absl/status",
+@@ -487,6 +529,7 @@ cc_library(
+ cc_library(
+     name = "status_macros",
+     hdrs = ["status_macros.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":status_builder",
+@@ -498,12 +541,14 @@ cc_library(
+ cc_library(
+     name = "stl_util",
+     hdrs = ["stl_util.h"],
++    features = ["-layering_check"],
+     deps = [":base"],
+ )
+ 
+ cc_library(
+     name = "strong_vector",
+     hdrs = ["strong_vector.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":intops",
+@@ -514,6 +559,7 @@ cc_library(
+     name = "strtoint",
+     srcs = ["strtoint.cc"],
+     hdrs = ["strtoint.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_absl//absl/strings",
+@@ -524,6 +570,7 @@ cc_library(
+     name = "sysinfo",
+     srcs = ["sysinfo.cc"],
+     hdrs = ["sysinfo.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/strings",
+     ],
+@@ -533,6 +580,7 @@ cc_library(
+     name = "threadpool",
+     srcs = ["threadpool.cc"],
+     hdrs = ["threadpool.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_absl//absl/synchronization",
+@@ -543,6 +591,7 @@ cc_library(
+     name = "timer",
+     srcs = ["timer.cc"],
+     hdrs = ["timer.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":macros",
+         "@com_google_absl//absl/log:check",
+@@ -553,22 +602,26 @@ cc_library(
+ cc_library(
+     name = "top_n",
+     hdrs = ["top_n.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "typeid",
+     hdrs = ["typeid.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "types",
+     hdrs = ["types.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "zipfile",
+     srcs = ["zipfile.cc"],
+     hdrs = ["zipfile.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":basictypes",
+         ":file",
+diff --git a/ortools/bop/BUILD.bazel b/ortools/bop/BUILD.bazel
+index 4720990..605ce2b 100644
+--- a/ortools/bop/BUILD.bazel
++++ b/ortools/bop/BUILD.bazel
+@@ -30,6 +30,7 @@ cc_proto_library(
+ cc_library(
+     name = "bop_types",
+     hdrs = ["bop_types.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:intops",
+@@ -41,6 +42,7 @@ cc_library(
+     name = "bop_base",
+     srcs = ["bop_base.cc"],
+     hdrs = ["bop_base.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_parameters_cc_proto",
+         ":bop_solution",
+@@ -67,6 +69,7 @@ cc_library(
+     name = "bop_util",
+     srcs = ["bop_util.cc"],
+     hdrs = ["bop_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_solution",
+@@ -80,6 +83,7 @@ cc_library(
+     name = "bop_solution",
+     srcs = ["bop_solution.cc"],
+     hdrs = ["bop_solution.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_types",
+         "//ortools/base",
+@@ -94,6 +98,7 @@ cc_library(
+     name = "bop_fs",
+     srcs = ["bop_fs.cc"],
+     hdrs = ["bop_fs.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_parameters_cc_proto",
+@@ -126,6 +131,7 @@ cc_library(
+     name = "bop_lns",
+     srcs = ["bop_lns.cc"],
+     hdrs = ["bop_lns.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_parameters_cc_proto",
+@@ -156,6 +162,7 @@ cc_library(
+     name = "complete_optimizer",
+     srcs = ["complete_optimizer.cc"],
+     hdrs = ["complete_optimizer.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_solution",
+@@ -178,6 +185,7 @@ cc_library(
+     name = "bop_ls",
+     srcs = ["bop_ls.cc"],
+     hdrs = ["bop_ls.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_solution",
+@@ -199,6 +207,7 @@ cc_library(
+     name = "bop_portfolio",
+     srcs = ["bop_portfolio.cc"],
+     hdrs = ["bop_portfolio.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_fs",
+@@ -231,6 +240,7 @@ cc_library(
+     name = "bop_solver",
+     srcs = ["bop_solver.cc"],
+     hdrs = ["bop_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_fs",
+@@ -265,6 +275,7 @@ cc_library(
+     name = "integral_solver",
+     srcs = ["integral_solver.cc"],
+     hdrs = ["integral_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bop_base",
+         ":bop_fs",
+diff --git a/ortools/constraint_solver/BUILD.bazel b/ortools/constraint_solver/BUILD.bazel
+index 99d9b4d..6cedaa6 100644
+--- a/ortools/constraint_solver/BUILD.bazel
++++ b/ortools/constraint_solver/BUILD.bazel
+@@ -169,6 +169,7 @@ cc_library(
+         "constraint_solver.h",
+         "constraint_solveri.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":assignment_cc_proto",
+         ":demon_profiler_cc_proto",
+@@ -267,6 +268,7 @@ cc_library(
+     name = "routing_parameters",
+     srcs = ["routing_parameters.cc"],
+     hdrs = ["routing_parameters.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp",
+         ":routing_enums_cc_proto",
+@@ -286,6 +288,7 @@ cc_library(
+ cc_library(
+     name = "routing_types",
+     hdrs = ["routing_types.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:intops",
+@@ -296,6 +299,7 @@ cc_library(
+     name = "routing_utils",
+     srcs = ["routing_utils.cc"],
+     hdrs = ["routing_utils.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base",
+@@ -307,6 +311,7 @@ cc_library(
+     name = "routing_neighborhoods",
+     srcs = ["routing_neighborhoods.cc"],
+     hdrs = ["routing_neighborhoods.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":cp",
+@@ -320,6 +325,7 @@ cc_library(
+     name = "routing_index_manager",
+     srcs = ["routing_index_manager.cc"],
+     hdrs = ["routing_index_manager.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":routing_types",
+         "//ortools/base",
+@@ -360,6 +366,7 @@ cc_library(
+         "on_windows": ["/Zc:preprocessor"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         ":cp",
+         ":routing_enums_cc_proto",
+diff --git a/ortools/flatzinc/BUILD.bazel b/ortools/flatzinc/BUILD.bazel
+index d3e8b22..5015c77 100644
+--- a/ortools/flatzinc/BUILD.bazel
++++ b/ortools/flatzinc/BUILD.bazel
+@@ -46,6 +46,7 @@ cc_library(
+     name = "model",
+     srcs = ["model.cc"],
+     hdrs = ["model.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:hash",
+@@ -71,6 +72,7 @@ cc_library(
+     copts = [
+         "$(STACK_FRAME_UNLIMITED)",  # parser.tab.cc
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         "//ortools/base",
+@@ -90,6 +92,7 @@ cc_library(
+         "on_windows": [],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         ":parser_yacc_lib",
+         "//ortools/base",
+@@ -102,6 +105,7 @@ cc_library(
+     name = "parser_lib",
+     srcs = ["parser.cc"],
+     hdrs = ["parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         ":parser_lex_lib",
+@@ -113,6 +117,7 @@ cc_library(
+     name = "presolve",
+     srcs = ["presolve.cc"],
+     hdrs = ["presolve.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         "//ortools/base",
+@@ -128,6 +133,7 @@ cc_library(
+     name = "checker",
+     srcs = ["checker.cc"],
+     hdrs = ["checker.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         "//ortools/base",
+@@ -142,6 +148,7 @@ cc_library(
+     name = "cp_model_fz_solver",
+     srcs = ["cp_model_fz_solver.cc"],
+     hdrs = ["cp_model_fz_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":checker",
+         ":model",
+diff --git a/ortools/glop/BUILD.bazel b/ortools/glop/BUILD.bazel
+index 687c48d..48e856c 100644
+--- a/ortools/glop/BUILD.bazel
++++ b/ortools/glop/BUILD.bazel
+@@ -54,6 +54,7 @@ SAFE_FP_CODE = select({
+ cc_library(
+     name = "pricing",
+     hdrs = ["pricing.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/lp_data:base",
+@@ -69,6 +70,7 @@ cc_library(
+     srcs = ["revised_simplex.cc"],
+     hdrs = ["revised_simplex.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":dual_edge_norms",
+@@ -106,6 +108,7 @@ cc_library(
+     srcs = ["update_row.cc"],
+     hdrs = ["update_row.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":parameters_cc_proto",
+@@ -125,6 +128,7 @@ cc_library(
+     srcs = ["variables_info.cc"],
+     hdrs = ["variables_info.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/lp_data:base",
+@@ -139,6 +143,7 @@ cc_library(
+     srcs = ["lu_factorization.cc"],
+     hdrs = ["lu_factorization.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":markowitz",
+         ":parameters_cc_proto",
+@@ -155,6 +160,7 @@ cc_library(
+     srcs = ["markowitz.cc"],
+     hdrs = ["markowitz.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":parameters_cc_proto",
+         ":status",
+@@ -174,6 +180,7 @@ cc_library(
+     srcs = ["basis_representation.cc"],
+     hdrs = ["basis_representation.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":lu_factorization",
+         ":parameters_cc_proto",
+@@ -193,6 +200,7 @@ cc_library(
+     name = "rank_one_update",
+     hdrs = ["rank_one_update.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":status",
+         "//ortools/base",
+@@ -210,6 +218,7 @@ cc_library(
+     srcs = ["initial_basis.cc"],
+     hdrs = ["initial_basis.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":markowitz",
+         "//ortools/base",
+@@ -227,6 +236,7 @@ cc_library(
+     srcs = ["status.cc"],
+     hdrs = ["status.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -255,6 +265,7 @@ cc_library(
+     srcs = ["dual_edge_norms.cc"],
+     hdrs = ["dual_edge_norms.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":parameters_cc_proto",
+@@ -274,6 +285,7 @@ cc_library(
+     srcs = ["primal_edge_norms.cc"],
+     hdrs = ["primal_edge_norms.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":parameters_cc_proto",
+@@ -293,6 +305,7 @@ cc_library(
+     srcs = ["reduced_costs.cc"],
+     hdrs = ["reduced_costs.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":parameters_cc_proto",
+@@ -317,6 +330,7 @@ cc_library(
+     srcs = ["variable_values.cc"],
+     hdrs = ["variable_values.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":dual_edge_norms",
+@@ -338,6 +352,7 @@ cc_library(
+     srcs = ["entering_variable.cc"],
+     hdrs = ["entering_variable.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":basis_representation",
+         ":parameters_cc_proto",
+@@ -366,6 +381,7 @@ cc_library(
+     srcs = ["preprocessor.cc"],
+     hdrs = ["preprocessor.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":parameters_cc_proto",
+         ":revised_simplex",
+@@ -389,6 +405,7 @@ cc_library(
+     srcs = ["lp_solver.cc"],
+     hdrs = ["lp_solver.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":parameters_cc_proto",
+         ":preprocessor",
+@@ -413,6 +430,7 @@ cc_library(
+     name = "parameters_validation",
+     srcs = ["parameters_validation.cc"],
+     hdrs = ["parameters_validation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":parameters_cc_proto",
+         "@com_google_absl//absl/strings",
+diff --git a/ortools/glpk/BUILD.bazel b/ortools/glpk/BUILD.bazel
+index 246ee67..7f2c088 100644
+--- a/ortools/glpk/BUILD.bazel
++++ b/ortools/glpk/BUILD.bazel
+@@ -18,6 +18,7 @@ cc_library(
+     name = "glpk_env_deleter",
+     srcs = ["glpk_env_deleter.cc"],
+     hdrs = ["glpk_env_deleter.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@glpk",
+@@ -28,6 +29,7 @@ cc_library(
+     name = "glpk_formatters",
+     srcs = ["glpk_formatters.cc"],
+     hdrs = ["glpk_formatters.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/strings",
+@@ -38,6 +40,7 @@ cc_library(
+ cc_library(
+     name = "glpk_computational_form",
+     hdrs = ["glpk_computational_form.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@glpk",
+     ],
+diff --git a/ortools/graph/BUILD.bazel b/ortools/graph/BUILD.bazel
+index fe0f588..4bb9556 100644
+--- a/ortools/graph/BUILD.bazel
++++ b/ortools/graph/BUILD.bazel
+@@ -35,6 +35,7 @@ config_setting(
+ cc_library(
+     name = "graphs",
+     hdrs = ["graphs.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         ":graph",
+@@ -44,6 +45,7 @@ cc_library(
+ cc_library(
+     name = "graph",
+     hdrs = ["graph.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":iterators",
+         "//ortools/base",
+@@ -55,6 +57,7 @@ cc_library(
+ cc_library(
+     name = "bfs",
+     hdrs = ["bfs.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/status",
+         "@com_google_absl//absl/strings:str_format",
+@@ -64,6 +67,7 @@ cc_library(
+ cc_library(
+     name = "bounded_dijkstra",
+     hdrs = ["bounded_dijkstra.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":graph",
+         "//ortools/base:iterator_adaptors",
+@@ -78,6 +82,7 @@ cc_library(
+ cc_library(
+     name = "multi_dijkstra",
+     hdrs = ["multi_dijkstra.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:map_util",
+         "//ortools/base:types",
+@@ -88,6 +93,7 @@ cc_library(
+ cc_library(
+     name = "bidirectional_dijkstra",
+     hdrs = ["bidirectional_dijkstra.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:iterator_adaptors",
+@@ -103,6 +109,7 @@ cc_library(
+     name = "cliques",
+     srcs = ["cliques.cc"],
+     hdrs = ["cliques.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:int_type",
+@@ -116,6 +123,7 @@ cc_library(
+ cc_library(
+     name = "hamiltonian_path",
+     hdrs = ["hamiltonian_path.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:types",
+@@ -129,6 +137,7 @@ cc_library(
+ cc_library(
+     name = "christofides",
+     hdrs = ["christofides.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":eulerian_path",
+         ":graph",
+@@ -147,6 +156,7 @@ cc_library(
+ cc_library(
+     name = "eulerian_path",
+     hdrs = ["eulerian_path.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -155,6 +165,7 @@ cc_library(
+ cc_library(
+     name = "minimum_spanning_tree",
+     hdrs = ["minimum_spanning_tree.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":connected_components",
+         "//ortools/base:adjustable_priority_queue",
+@@ -167,6 +178,7 @@ cc_library(
+ cc_library(
+     name = "one_tree_lower_bound",
+     hdrs = ["one_tree_lower_bound.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":christofides",
+         ":graph",
+@@ -179,6 +191,7 @@ cc_library(
+ cc_library(
+     name = "ebert_graph",
+     hdrs = ["ebert_graph.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:types",
+@@ -192,6 +205,7 @@ cc_library(
+     name = "shortest_paths",
+     srcs = ["shortest_paths.cc"],
+     hdrs = ["shortest_paths.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         ":graph",
+@@ -212,6 +226,7 @@ cc_library(
+ cc_library(
+     name = "k_shortest_paths",
+     hdrs = ["k_shortest_paths.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bounded_dijkstra",
+         ":ebert_graph",
+@@ -242,6 +257,7 @@ cc_library(
+     name = "max_flow",
+     srcs = ["max_flow.cc"],
+     hdrs = ["max_flow.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         ":flow_problem_cc_proto",
+@@ -290,6 +306,7 @@ cc_library(
+         "on_windows": ["/Zc:preprocessor"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         ":graph",
+@@ -336,6 +353,7 @@ cc_library(
+     name = "assignment",
+     srcs = ["assignment.cc"],
+     hdrs = ["assignment.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         ":linear_assignment",
+@@ -349,6 +367,7 @@ cc_library(
+     name = "linear_assignment",
+     srcs = ["linear_assignment.cc"],
+     hdrs = ["linear_assignment.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         "//ortools/base",
+@@ -364,6 +383,7 @@ cc_library(
+     name = "perfect_matching",
+     srcs = ["perfect_matching.cc"],
+     hdrs = ["perfect_matching.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:adjustable_priority_queue",
+@@ -382,6 +402,7 @@ cc_library(
+     name = "dag_shortest_path",
+     srcs = ["dag_shortest_path.cc"],
+     hdrs = ["dag_shortest_path.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ebert_graph",
+         ":graph",
+@@ -399,6 +420,7 @@ cc_library(
+     name = "dag_constrained_shortest_path",
+     srcs = ["dag_constrained_shortest_path.cc"],
+     hdrs = ["dag_constrained_shortest_path.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":dag_shortest_path",
+         ":graph",
+@@ -416,6 +438,7 @@ cc_library(
+ cc_library(
+     name = "rooted_tree",
+     hdrs = ["rooted_tree.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "@com_google_absl//absl/algorithm:container",
+@@ -437,6 +460,7 @@ cc_library(
+     hdrs = [
+         "connected_components.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:map_util",
+@@ -450,6 +474,7 @@ cc_library(
+ cc_library(
+     name = "io",
+     hdrs = ["io.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":graph",
+         "//ortools/base:numbers",
+@@ -463,12 +488,14 @@ cc_library(
+ cc_library(
+     name = "iterators",
+     hdrs = ["iterators.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "random_graph",
+     srcs = ["random_graph.cc"],
+     hdrs = ["random_graph.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":graph",
+         "//ortools/base:logging",
+@@ -485,6 +512,7 @@ cc_library(
+     hdrs = [
+         "strongly_connected_components.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -494,6 +522,7 @@ cc_library(
+     name = "topologicalsorter",
+     srcs = ["topologicalsorter.cc"],
+     hdrs = ["topologicalsorter.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":graph",
+         "//ortools/base",
+@@ -512,6 +541,7 @@ cc_library(
+     name = "util",
+     srcs = ["util.cc"],
+     hdrs = ["util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":connected_components",
+         ":graph",
+diff --git a/ortools/gscip/BUILD.bazel b/ortools/gscip/BUILD.bazel
+index d949483..37dd2ee 100644
+--- a/ortools/gscip/BUILD.bazel
++++ b/ortools/gscip/BUILD.bazel
+@@ -39,6 +39,7 @@ cc_library(
+     name = "gscip_parameters",
+     srcs = ["gscip_parameters.cc"],
+     hdrs = ["gscip_parameters.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":gscip_cc_proto",
+         "//ortools/base:status_macros",
+@@ -62,6 +63,7 @@ cc_library(
+     name = "legacy_scip_params",
+     srcs = ["legacy_scip_params.cc"],
+     hdrs = ["legacy_scip_params.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/linear_solver:scip_helper_macros",
+         "//ortools/linear_solver:scip_with_glop",
+@@ -81,6 +83,7 @@ cc_library(
+         "gscip.h",
+         "gscip_event_handler.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":gscip_cc_proto",
+         ":gscip_message_handler",
+@@ -106,6 +109,7 @@ cc_library(
+     name = "gscip_ext",
+     srcs = ["gscip_ext.cc"],
+     hdrs = ["gscip_ext.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":gscip",
+         "//ortools/base:status_macros",
+@@ -118,6 +122,7 @@ cc_library(
+     name = "gscip_message_handler",
+     srcs = ["gscip_message_handler.cc"],
+     hdrs = ["gscip_message_handler.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/linear_solver:scip_helper_macros",
+@@ -131,6 +136,7 @@ cc_library(
+     name = "gscip_callback_result",
+     srcs = ["gscip_callback_result.cc"],
+     hdrs = ["gscip_callback_result.h"],
++    features = ["-layering_check"],
+     deps = ["@scip//:libscip"],
+ )
+ 
+@@ -138,6 +144,7 @@ cc_library(
+     name = "gscip_constraint_handler",
+     srcs = ["gscip_constraint_handler.cc"],
+     hdrs = ["gscip_constraint_handler.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":gscip",
+         ":gscip_callback_result",
+diff --git a/ortools/gurobi/BUILD.bazel b/ortools/gurobi/BUILD.bazel
+index d8e4a72..83da625 100644
+--- a/ortools/gurobi/BUILD.bazel
++++ b/ortools/gurobi/BUILD.bazel
+@@ -21,6 +21,7 @@ cc_library(
+     hdrs = [
+         "environment.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:dynamic_library",
+@@ -39,6 +40,7 @@ cc_library(
+     name = "gurobi_util",
+     srcs = ["gurobi_util.cc"],
+     hdrs = ["gurobi_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":environment",
+         "@com_google_absl//absl/strings",
+@@ -51,5 +53,6 @@ cc_library(
+     testonly = True,
+     srcs = ["gurobi_stdout_matchers.cc"],
+     hdrs = ["gurobi_stdout_matchers.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base:gmock"],
+ )
+diff --git a/ortools/gurobi/isv_public/BUILD.bazel b/ortools/gurobi/isv_public/BUILD.bazel
+index efae616..1006da8 100644
+--- a/ortools/gurobi/isv_public/BUILD.bazel
++++ b/ortools/gurobi/isv_public/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "gurobi_isv",
+     srcs = ["gurobi_isv.cc"],
+     hdrs = ["gurobi_isv.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/gurobi:environment",
+         "//ortools/math_opt/solvers:gurobi_cc_proto",
+diff --git a/ortools/init/BUILD.bazel b/ortools/init/BUILD.bazel
+index 0705399..aec2da3 100644
+--- a/ortools/init/BUILD.bazel
++++ b/ortools/init/BUILD.bazel
+@@ -16,6 +16,7 @@ package(default_visibility = ["//visibility:public"])
+ cc_library(
+     name = "init",
+     hdrs = ["init.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/gurobi:environment",
+diff --git a/ortools/init/python/BUILD.bazel b/ortools/init/python/BUILD.bazel
+index 1774f36..eb75897 100644
+--- a/ortools/init/python/BUILD.bazel
++++ b/ortools/init/python/BUILD.bazel
+@@ -21,6 +21,7 @@ load("@rules_python//python:defs.bzl", "py_test")
+ cc_library(
+     name = "init_doc",
+     hdrs = ["init_doc.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+ )
+ 
+diff --git a/ortools/linear_solver/BUILD.bazel b/ortools/linear_solver/BUILD.bazel
+index 618e192..b7bcf34 100644
+--- a/ortools/linear_solver/BUILD.bazel
++++ b/ortools/linear_solver/BUILD.bazel
+@@ -252,6 +252,7 @@ cc_library(
+         ":use_cplex": ["-DUSE_CPLEX"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         ":linear_solver_cc_proto",
+         ":model_exporter",
+@@ -323,6 +324,7 @@ cc_library(
+     name = "model_validator",
+     srcs = ["model_validator.cc"],
+     hdrs = ["model_validator.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":linear_solver_cc_proto",
+@@ -352,6 +354,7 @@ copy_file(
+ cc_library(
+     name = "scip_with_glop",
+     srcs = ["lpi_glop.cpp"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/glop:lp_solver",
+         "@scip//:libscip",
+@@ -361,6 +364,7 @@ cc_library(
+ cc_library(
+     name = "scip_helper_macros",
+     hdrs = ["scip_helper_macros.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "@com_google_absl//absl/status",
+@@ -373,6 +377,7 @@ cc_library(
+     name = "model_exporter",
+     srcs = ["model_exporter.cc"],
+     hdrs = ["model_exporter.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":linear_solver_cc_proto",
+         "//ortools/base",
+@@ -412,6 +417,7 @@ cc_library(
+     name = "solve_mp_model",
+     srcs = ["solve_mp_model.cc"],
+     hdrs = ["solve_mp_model.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":linear_solver",
+diff --git a/ortools/linear_solver/proto_solver/BUILD.bazel b/ortools/linear_solver/proto_solver/BUILD.bazel
+index 57a1d82..3998779 100644
+--- a/ortools/linear_solver/proto_solver/BUILD.bazel
++++ b/ortools/linear_solver/proto_solver/BUILD.bazel
+@@ -16,6 +16,7 @@ package(default_visibility = ["//visibility:public"])
+ cc_library(
+     name = "proto_utils",
+     hdrs = ["proto_utils.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/port:proto_utils",
+@@ -28,6 +29,7 @@ cc_library(
+     name = "glop_proto_solver",
+     srcs = ["glop_proto_solver.cc"],
+     hdrs = ["glop_proto_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":proto_utils",
+         "//ortools/glop:lp_solver",
+@@ -52,6 +54,7 @@ cc_library(
+     name = "pdlp_proto_solver",
+     srcs = ["pdlp_proto_solver.cc"],
+     hdrs = ["pdlp_proto_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:logging",
+         "//ortools/linear_solver:linear_solver_cc_proto",
+@@ -71,6 +74,7 @@ cc_library(
+     name = "sat_solver_utils",
+     srcs = ["sat_solver_utils.cc"],
+     hdrs = ["sat_solver_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/glop:parameters_cc_proto",
+         "//ortools/glop:preprocessor",
+@@ -85,6 +89,7 @@ cc_library(
+     name = "sat_proto_solver",
+     srcs = ["sat_proto_solver.cc"],
+     hdrs = ["sat_proto_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":proto_utils",
+         ":sat_solver_utils",
+@@ -118,6 +123,7 @@ cc_library(
+         "//ortools/linear_solver:use_scip": ["USE_SCIP"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:timer",
+@@ -144,6 +150,7 @@ cc_library(
+     name = "gurobi_proto_solver",
+     srcs = ["gurobi_proto_solver.cc"],
+     hdrs = ["gurobi_proto_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:timer",
+         "//ortools/gurobi:environment",
+@@ -171,6 +178,7 @@ cc_library(
+         "//ortools/linear_solver:use_highs": ["USE_HIGHS"],
+         "//conditions:default": [],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:timer",
+         "//ortools/linear_solver:linear_solver_cc_proto",
+@@ -186,6 +194,7 @@ cc_library(
+     name = "xpress_proto_solver",
+     srcs = ["xpress_proto_solver.cc"],
+     hdrs = ["xpress_proto_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:timer",
+         "//ortools/linear_solver:linear_solver_cc_proto",
+diff --git a/ortools/linear_solver/wrappers/BUILD.bazel b/ortools/linear_solver/wrappers/BUILD.bazel
+index f0f031b..fce5554 100644
+--- a/ortools/linear_solver/wrappers/BUILD.bazel
++++ b/ortools/linear_solver/wrappers/BUILD.bazel
+@@ -35,6 +35,7 @@ cc_library(
+         "-DUSE_SCIP",
+         "-DUSE_LP_PARSER",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base:file",
+diff --git a/ortools/lp_data/BUILD.bazel b/ortools/lp_data/BUILD.bazel
+index c0e2993..b8bbb47 100644
+--- a/ortools/lp_data/BUILD.bazel
++++ b/ortools/lp_data/BUILD.bazel
+@@ -48,6 +48,7 @@ cc_library(
+     name = "base",
+     srcs = ["lp_types.cc"],
+     hdrs = ["lp_types.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:hash",
+@@ -61,6 +62,7 @@ cc_library(
+     name = "permutation",
+     hdrs = ["permutation.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "//ortools/base",
+@@ -73,6 +75,7 @@ cc_library(
+ cc_library(
+     name = "scattered_vector",
+     hdrs = ["scattered_vector.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "//ortools/base",
+@@ -86,6 +89,7 @@ cc_library(
+     name = "sparse_vector",
+     hdrs = ["sparse_vector.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":permutation",
+@@ -102,6 +106,7 @@ cc_library(
+     srcs = ["sparse_column.cc"],
+     hdrs = ["sparse_column.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":sparse_vector",
+@@ -113,6 +118,7 @@ cc_library(
+     name = "sparse_row",
+     hdrs = ["sparse_row.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":sparse_vector",
+@@ -127,6 +133,7 @@ cc_library(
+         "sparse.h",
+     ],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":matrix_scaler_hdr",
+@@ -148,6 +155,7 @@ cc_library(
+     srcs = ["matrix_scaler.cc"],
+     hdrs = ["matrix_scaler.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_utils",
+@@ -165,6 +173,7 @@ cc_library(
+ cc_library(
+     name = "matrix_scaler_hdr",
+     hdrs = ["matrix_scaler.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "//ortools/base",
+@@ -177,6 +186,7 @@ cc_library(
+     srcs = ["lp_data.cc"],
+     hdrs = ["lp_data.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_print_utils",
+@@ -200,6 +210,7 @@ cc_library(
+     name = "lp_data_utils",
+     srcs = ["lp_data_utils.cc"],
+     hdrs = ["lp_data_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_data",
+@@ -213,6 +224,7 @@ cc_library(
+     srcs = ["lp_utils.cc"],
+     hdrs = ["lp_utils.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":scattered_vector",
+@@ -227,6 +239,7 @@ cc_library(
+     srcs = ["matrix_utils.cc"],
+     hdrs = ["matrix_utils.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":sparse",
+@@ -241,6 +254,7 @@ cc_library(
+     hdrs = ["lp_parser.h"],
+     copts = SAFE_FP_CODE,
+     defines = ["USE_LP_PARSER"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_data",
+@@ -271,6 +285,7 @@ cc_library(
+     srcs = ["lp_print_utils.cc"],
+     hdrs = ["lp_print_utils.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         "//ortools/base",
+@@ -285,6 +300,7 @@ cc_library(
+     srcs = ["proto_utils.cc"],
+     hdrs = ["proto_utils.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_data",
+@@ -297,6 +313,7 @@ cc_library(
+     name = "mps_reader_template",
+     srcs = ["mps_reader_template.cc"],
+     hdrs = ["mps_reader_template.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:map_util",
+@@ -317,6 +334,7 @@ cc_library(
+     srcs = ["mps_reader.cc"],
+     hdrs = ["mps_reader.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":lp_data",
+         ":lp_print_utils",
+@@ -337,6 +355,7 @@ cc_library(
+     name = "model_reader",
+     srcs = ["model_reader.cc"],
+     hdrs = ["model_reader.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":lp_data",
+         ":mps_reader",
+@@ -354,6 +373,7 @@ cc_library(
+     srcs = ["lp_decomposer.cc"],
+     hdrs = ["lp_decomposer.h"],
+     copts = SAFE_FP_CODE,
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_data",
+@@ -370,6 +390,7 @@ cc_library(
+     name = "sol_reader",
+     srcs = ["sol_reader.cc"],
+     hdrs = ["sol_reader.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base",
+         ":lp_data",
+diff --git a/ortools/math_opt/constraints/indicator/BUILD.bazel b/ortools/math_opt/constraints/indicator/BUILD.bazel
+index 12fdf6d..e4d2fa4 100644
+--- a/ortools/math_opt/constraints/indicator/BUILD.bazel
++++ b/ortools/math_opt/constraints/indicator/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "indicator_constraint",
+     srcs = ["indicator_constraint.cc"],
+     hdrs = ["indicator_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "//ortools/math_opt/constraints/util:model_util",
+@@ -45,6 +46,7 @@ cc_library(
+     name = "storage",
+     srcs = ["storage.cc"],
+     hdrs = ["storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "//ortools/math_opt:model_cc_proto",
+@@ -75,6 +77,7 @@ cc_library(
+     name = "validator",
+     srcs = ["validator.cc"],
+     hdrs = ["validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/math_opt:model_cc_proto",
+diff --git a/ortools/math_opt/constraints/quadratic/BUILD.bazel b/ortools/math_opt/constraints/quadratic/BUILD.bazel
+index e4a0925..d521c19 100644
+--- a/ortools/math_opt/constraints/quadratic/BUILD.bazel
++++ b/ortools/math_opt/constraints/quadratic/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "quadratic_constraint",
+     srcs = ["quadratic_constraint.cc"],
+     hdrs = ["quadratic_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "//ortools/math_opt/constraints/util:model_util",
+@@ -50,6 +51,7 @@ cc_library(
+     name = "storage",
+     srcs = ["storage.cc"],
+     hdrs = ["storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt:model_cc_proto",
+         "//ortools/math_opt:model_update_cc_proto",
+@@ -81,6 +83,7 @@ cc_library(
+     name = "validator",
+     srcs = ["validator.cc"],
+     hdrs = ["validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/math_opt:model_cc_proto",
+diff --git a/ortools/math_opt/constraints/second_order_cone/BUILD.bazel b/ortools/math_opt/constraints/second_order_cone/BUILD.bazel
+index 37ed646..17b383d 100644
+--- a/ortools/math_opt/constraints/second_order_cone/BUILD.bazel
++++ b/ortools/math_opt/constraints/second_order_cone/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "second_order_cone_constraint",
+     srcs = ["second_order_cone_constraint.cc"],
+     hdrs = ["second_order_cone_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":storage",
+         "//ortools/base:intops",
+@@ -47,6 +48,7 @@ cc_library(
+     name = "storage",
+     srcs = ["storage.cc"],
+     hdrs = ["storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "//ortools/math_opt:model_cc_proto",
+@@ -79,6 +81,7 @@ cc_library(
+     name = "validator",
+     srcs = ["validator.cc"],
+     hdrs = ["validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/math_opt:model_cc_proto",
+diff --git a/ortools/math_opt/constraints/sos/BUILD.bazel b/ortools/math_opt/constraints/sos/BUILD.bazel
+index fade5cb..aad7cd5 100644
+--- a/ortools/math_opt/constraints/sos/BUILD.bazel
++++ b/ortools/math_opt/constraints/sos/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "sos1_constraint",
+     srcs = ["sos1_constraint.cc"],
+     hdrs = ["sos1_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":util",
+         "//ortools/base:intops",
+@@ -50,6 +51,7 @@ cc_library(
+     name = "sos2_constraint",
+     srcs = ["sos2_constraint.cc"],
+     hdrs = ["sos2_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":util",
+         "//ortools/base:intops",
+@@ -82,6 +84,7 @@ cc_test(
+ cc_library(
+     name = "storage",
+     hdrs = ["storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "//ortools/math_opt:model_cc_proto",
+@@ -112,6 +115,7 @@ cc_test(
+ cc_library(
+     name = "util",
+     hdrs = ["util.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt/cpp:variable_and_expressions",
+         "//ortools/util:fp_roundtrip_conv",
+@@ -123,6 +127,7 @@ cc_library(
+     name = "validator",
+     srcs = ["validator.cc"],
+     hdrs = ["validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/math_opt:model_cc_proto",
+diff --git a/ortools/math_opt/constraints/util/BUILD.bazel b/ortools/math_opt/constraints/util/BUILD.bazel
+index c3d0c06..968ba25 100644
+--- a/ortools/math_opt/constraints/util/BUILD.bazel
++++ b/ortools/math_opt/constraints/util/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "model_util",
+     srcs = ["model_util.cc"],
+     hdrs = ["model_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "//ortools/math_opt/cpp:variable_and_expressions",
+diff --git a/ortools/math_opt/core/BUILD.bazel b/ortools/math_opt/core/BUILD.bazel
+index 06da18f..45f3170 100644
+--- a/ortools/math_opt/core/BUILD.bazel
++++ b/ortools/math_opt/core/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "math_opt_proto_utils",
+     srcs = ["math_opt_proto_utils.cc"],
+     hdrs = ["math_opt_proto_utils.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":sparse_vector_view",
+@@ -42,6 +43,7 @@ cc_library(
+ cc_library(
+     name = "sparse_vector_view",
+     hdrs = ["sparse_vector_view.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":arrow_operator_proxy",
+         ":sparse_vector",
+@@ -59,6 +61,7 @@ cc_library(
+     name = "model_summary",
+     srcs = ["model_summary.cc"],
+     hdrs = ["model_summary.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:linked_hash_map",
+         "//ortools/base:status_macros",
+@@ -78,6 +81,7 @@ cc_library(
+     name = "solver_interface",
+     srcs = ["solver_interface.cc"],
+     hdrs = ["solver_interface.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":non_streamable_solver_init_arguments",
+         "//ortools/base:map_util",
+@@ -104,6 +108,7 @@ cc_library(
+     name = "solver",
+     srcs = ["solver.cc"],
+     hdrs = ["solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver",
+         ":concurrent_calls_guard",
+@@ -139,6 +144,7 @@ cc_library(
+     name = "non_streamable_solver_init_arguments",
+     srcs = ["non_streamable_solver_init_arguments.cc"],
+     hdrs = ["non_streamable_solver_init_arguments.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/math_opt:parameters_cc_proto"],
+ )
+ 
+@@ -146,22 +152,26 @@ cc_library(
+     name = "solver_debug",
+     srcs = ["solver_debug.cc"],
+     hdrs = ["solver_debug.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "arrow_operator_proxy",
+     hdrs = ["arrow_operator_proxy.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "sparse_vector",
+     hdrs = ["sparse_vector.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "sparse_submatrix",
+     srcs = ["sparse_submatrix.cc"],
+     hdrs = ["sparse_submatrix.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":sparse_vector",
+         ":sparse_vector_view",
+@@ -176,6 +186,7 @@ cc_library(
+     name = "inverted_bounds",
+     srcs = ["inverted_bounds.cc"],
+     hdrs = ["inverted_bounds.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "@com_google_absl//absl/status",
+@@ -188,6 +199,7 @@ cc_library(
+     name = "invalid_indicators",
+     srcs = ["invalid_indicators.cc"],
+     hdrs = ["invalid_indicators.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "@com_google_absl//absl/algorithm:container",
+@@ -200,6 +212,7 @@ cc_library(
+     name = "concurrent_calls_guard",
+     srcs = ["concurrent_calls_guard.cc"],
+     hdrs = ["concurrent_calls_guard.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/base:core_headers",
+         "@com_google_absl//absl/log:check",
+@@ -213,6 +226,7 @@ cc_library(
+     name = "empty_bounds",
+     srcs = ["empty_bounds.cc"],
+     hdrs = ["empty_bounds.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt:result_cc_proto",
+         "//ortools/util:fp_roundtrip_conv",
+@@ -223,6 +237,7 @@ cc_library(
+ cc_library(
+     name = "sorted",
+     hdrs = ["sorted.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/algorithm:container",
+         "@com_google_absl//absl/container:flat_hash_map",
+@@ -235,6 +250,7 @@ cc_library(
+     name = "base_solver",
+     srcs = ["base_solver.cc"],
+     hdrs = ["base_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt:callback_cc_proto",
+         "//ortools/math_opt:infeasible_subsystem_cc_proto",
+diff --git a/ortools/math_opt/cpp/BUILD.bazel b/ortools/math_opt/cpp/BUILD.bazel
+index a606388..51e4b90 100644
+--- a/ortools/math_opt/cpp/BUILD.bazel
++++ b/ortools/math_opt/cpp/BUILD.bazel
+@@ -20,6 +20,7 @@ package(default_visibility = [
+ cc_library(
+     name = "math_opt",
+     hdrs = ["math_opt.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":model",
+@@ -32,6 +33,7 @@ cc_library(
+     name = "basis_status",
+     srcs = ["basis_status.cc"],
+     hdrs = ["basis_status.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":enums",
+         "//ortools/math_opt:solution_cc_proto",
+@@ -44,6 +46,7 @@ cc_library(
+     name = "sparse_containers",
+     srcs = ["sparse_containers.cc"],
+     hdrs = ["sparse_containers.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":basis_status",
+         ":linear_constraint",
+@@ -71,6 +74,7 @@ cc_library(
+     name = "model",
+     srcs = ["model.cc"],
+     hdrs = ["model.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":key_types",
+         ":linear_constraint",
+@@ -105,6 +109,7 @@ cc_library(
+     name = "variable_and_expressions",
+     srcs = ["variable_and_expressions.cc"],
+     hdrs = ["variable_and_expressions.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":formatters",
+         ":key_types",
+@@ -126,6 +131,7 @@ cc_library(
+     name = "objective",
+     srcs = ["objective.cc"],
+     hdrs = ["objective.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":key_types",
+         ":variable_and_expressions",
+@@ -140,6 +146,7 @@ cc_library(
+ cc_library(
+     name = "linear_constraint",
+     hdrs = ["linear_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":key_types",
+         ":variable_and_expressions",
+@@ -156,6 +163,7 @@ cc_library(
+     name = "solution",
+     srcs = ["solution.cc"],
+     hdrs = ["solution.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":basis_status",
+         ":enums",
+@@ -184,6 +192,7 @@ cc_library(
+     name = "solve_result",
+     srcs = ["solve_result.cc"],
+     hdrs = ["solve_result.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":enums",
+         ":linear_constraint",
+@@ -212,6 +221,7 @@ cc_library(
+     name = "map_filter",
+     srcs = ["map_filter.cc"],
+     hdrs = ["map_filter.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":key_types",
+         ":linear_constraint",
+@@ -231,6 +241,7 @@ cc_library(
+     name = "callback",
+     srcs = ["callback.cc"],
+     hdrs = ["callback.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":enums",
+         ":map_filter",
+@@ -256,6 +267,7 @@ cc_library(
+ cc_library(
+     name = "key_types",
+     hdrs = ["key_types.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt/storage:model_storage",
+         "@com_google_absl//absl/algorithm:container",
+@@ -270,6 +282,7 @@ cc_library(
+     name = "model_solve_parameters",
+     srcs = ["model_solve_parameters.cc"],
+     hdrs = ["model_solve_parameters.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":linear_constraint",
+         ":map_filter",
+@@ -295,6 +308,7 @@ cc_library(
+     name = "update_tracker",
+     srcs = ["update_tracker.cc"],
+     hdrs = ["update_tracker.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:logging",
+         "//ortools/math_opt:model_cc_proto",
+@@ -310,6 +324,7 @@ cc_library(
+     name = "message_callback",
+     srcs = ["message_callback.cc"],
+     hdrs = ["message_callback.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:logging",
+         "//ortools/base:source_location",
+@@ -323,6 +338,7 @@ cc_library(
+ cc_library(
+     name = "solver_init_arguments",
+     hdrs = ["solver_init_arguments.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":streamable_solver_init_arguments",
+         "//ortools/math_opt/core:non_streamable_solver_init_arguments",
+@@ -333,6 +349,7 @@ cc_library(
+     name = "solve_arguments",
+     srcs = ["solve_arguments.cc"],
+     hdrs = ["solve_arguments.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":callback",
+         ":message_callback",
+@@ -350,6 +367,7 @@ cc_library(
+     name = "solve",
+     srcs = ["solve.cc"],
+     hdrs = ["solve.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":compute_infeasible_subsystem_arguments",
+         ":compute_infeasible_subsystem_result",
+@@ -375,6 +393,7 @@ cc_library(
+     name = "streamable_solver_init_arguments",
+     srcs = ["streamable_solver_init_arguments.cc"],
+     hdrs = ["streamable_solver_init_arguments.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt:parameters_cc_proto",
+         "//ortools/math_opt/solvers:gurobi_cc_proto",
+@@ -386,6 +405,7 @@ cc_library(
+     name = "parameters",
+     srcs = ["parameters.cc"],
+     hdrs = ["parameters.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":enums",
+         "//ortools/base:linked_hash_map",
+@@ -414,6 +434,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["matchers.cc"],
+     hdrs = ["matchers.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":linear_constraint",
+@@ -435,6 +456,7 @@ cc_library(
+ cc_library(
+     name = "enums",
+     hdrs = ["enums.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_absl//absl/strings",
+@@ -446,6 +468,7 @@ cc_library(
+     name = "statistics",
+     srcs = ["statistics.cc"],
+     hdrs = ["statistics.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":model",
+@@ -456,12 +479,14 @@ cc_library(
+ cc_library(
+     name = "formatters",
+     hdrs = ["formatters.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/util:fp_roundtrip_conv"],
+ )
+ 
+ cc_library(
+     name = "update_result",
+     hdrs = ["update_result.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/math_opt:model_update_cc_proto"],
+ )
+ 
+@@ -469,6 +494,7 @@ cc_library(
+     name = "compute_infeasible_subsystem_result",
+     srcs = ["compute_infeasible_subsystem_result.cc"],
+     hdrs = ["compute_infeasible_subsystem_result.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":enums",
+         ":key_types",
+@@ -500,6 +526,7 @@ cc_library(
+ cc_library(
+     name = "compute_infeasible_subsystem_arguments",
+     hdrs = ["compute_infeasible_subsystem_arguments.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":message_callback",
+         ":parameters",
+@@ -511,6 +538,7 @@ cc_library(
+     name = "solver_resources",
+     srcs = ["solver_resources.cc"],
+     hdrs = ["solver_resources.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt:rpc_cc_proto",
+         "//ortools/port:proto_utils",
+@@ -524,6 +552,7 @@ cc_library(
+     name = "solve_impl",
+     srcs = ["solve_impl.cc"],
+     hdrs = ["solve_impl.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":compute_infeasible_subsystem_arguments",
+         ":compute_infeasible_subsystem_result",
+@@ -551,6 +580,7 @@ cc_library(
+ cc_library(
+     name = "incremental_solver",
+     hdrs = ["incremental_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":compute_infeasible_subsystem_arguments",
+         ":compute_infeasible_subsystem_result",
+diff --git a/ortools/math_opt/io/BUILD.bazel b/ortools/math_opt/io/BUILD.bazel
+index 428beaf..8e1bb4a 100644
+--- a/ortools/math_opt/io/BUILD.bazel
++++ b/ortools/math_opt/io/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "proto_converter",
+     srcs = ["proto_converter.cc"],
+     hdrs = ["proto_converter.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/linear_solver:linear_solver_cc_proto",
+@@ -41,6 +42,7 @@ cc_library(
+     name = "mps_converter",
+     srcs = ["mps_converter.cc"],
+     hdrs = ["mps_converter.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":proto_converter",
+         "//ortools/base:status_macros",
+@@ -57,6 +59,7 @@ cc_library(
+     name = "names_removal",
+     srcs = ["names_removal.cc"],
+     hdrs = ["names_removal.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt:model_cc_proto",
+         "//ortools/math_opt:model_update_cc_proto",
+@@ -67,6 +70,7 @@ cc_library(
+     name = "lp_converter",
+     srcs = ["lp_converter.cc"],
+     hdrs = ["lp_converter.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":proto_converter",
+         "//ortools/base:status_macros",
+@@ -81,6 +85,7 @@ cc_library(
+     name = "lp_parser",
+     srcs = ["lp_parser.cc"],
+     hdrs = ["lp_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":mps_converter",
+         "//ortools/base",
+diff --git a/ortools/math_opt/io/lp/BUILD.bazel b/ortools/math_opt/io/lp/BUILD.bazel
+index 95d079c..02f1634 100644
+--- a/ortools/math_opt/io/lp/BUILD.bazel
++++ b/ortools/math_opt/io/lp/BUILD.bazel
+@@ -15,6 +15,7 @@ cc_library(
+     name = "lp_model",
+     srcs = ["lp_model.cc"],
+     hdrs = ["lp_model.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":lp_name",
+         "//ortools/base:intops",
+@@ -32,6 +33,7 @@ cc_library(
+     name = "lp_name",
+     srcs = ["lp_name.cc"],
+     hdrs = ["lp_name.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "@com_google_absl//absl/status",
+@@ -44,6 +46,7 @@ cc_library(
+     name = "model_utils",
+     srcs = ["model_utils.cc"],
+     hdrs = ["model_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":lp_model",
+         "//ortools/base:status_macros",
+diff --git a/ortools/math_opt/labs/BUILD.bazel b/ortools/math_opt/labs/BUILD.bazel
+index d048d84..c57e06a 100644
+--- a/ortools/math_opt/labs/BUILD.bazel
++++ b/ortools/math_opt/labs/BUILD.bazel
+@@ -15,6 +15,7 @@ cc_library(
+     name = "general_constraint_to_mip",
+     srcs = ["general_constraint_to_mip.cc"],
+     hdrs = ["general_constraint_to_mip.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":linear_expr_util",
+@@ -28,6 +29,7 @@ cc_library(
+     name = "linear_expr_util",
+     srcs = ["linear_expr_util.cc"],
+     hdrs = ["linear_expr_util.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/math_opt/cpp:math_opt",
+@@ -39,6 +41,7 @@ cc_library(
+     name = "solution_feasibility_checker",
+     srcs = ["solution_feasibility_checker.cc"],
+     hdrs = ["solution_feasibility_checker.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base:mathutil",
+@@ -57,6 +60,7 @@ cc_library(
+     name = "solution_improvement",
+     srcs = ["solution_improvement.cc"],
+     hdrs = ["solution_improvement.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base:status_macros",
+@@ -77,6 +81,7 @@ cc_library(
+         "dualizer.cc",
+     ],
+     hdrs = ["dualizer.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base:map_util",
+diff --git a/ortools/math_opt/solver_tests/BUILD.bazel b/ortools/math_opt/solver_tests/BUILD.bazel
+index 48fc1d2..e43f488 100644
+--- a/ortools/math_opt/solver_tests/BUILD.bazel
++++ b/ortools/math_opt/solver_tests/BUILD.bazel
+@@ -18,6 +18,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["base_solver_test.cc"],
+     hdrs = ["base_solver_test.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/base:linked_hash_map",
+@@ -34,6 +35,7 @@ cc_library(
+     data = [
+         "//ortools/math_opt/solver_tests/testdata:23588.mps",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         ":test_models",
+@@ -69,6 +71,7 @@ cc_library(
+     data = [
+         "//ortools/math_opt/solver_tests/testdata:23588.mps",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":test_models",
+         "//ortools/base",
+@@ -91,6 +94,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["lp_tests.cc"],
+     hdrs = ["lp_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         "//ortools/base:gmock",
+@@ -111,6 +115,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["lp_incomplete_solve_tests.cc"],
+     hdrs = ["lp_incomplete_solve_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":test_models",
+         "//ortools/base",
+@@ -131,6 +136,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["invalid_input_tests.cc"],
+     hdrs = ["invalid_input_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         "//ortools/base:gmock",
+@@ -167,6 +173,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["mip_tests.cc"],
+     hdrs = ["mip_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         "//ortools/base",
+@@ -185,6 +192,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["ip_model_solve_parameters_tests.cc"],
+     hdrs = ["ip_model_solve_parameters_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         "//ortools/base:gmock",
+@@ -204,6 +212,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["ip_multiple_solutions_tests.cc"],
+     hdrs = ["ip_multiple_solutions_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/math_opt/cpp:matchers",
+@@ -220,6 +229,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["lp_model_solve_parameters_tests.cc"],
+     hdrs = ["lp_model_solve_parameters_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         ":test_models",
+@@ -237,6 +247,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["lp_parameter_tests.cc"],
+     hdrs = ["lp_parameter_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/base:status_macros",
+@@ -258,6 +269,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["lp_initial_basis_tests.cc"],
+     hdrs = ["lp_initial_basis_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":base_solver_test",
+         "//ortools/base:gmock",
+@@ -279,6 +291,7 @@ cc_library(
+         "//ortools/math_opt/solver_tests/testdata:23588.mps",
+         "//ortools/math_opt/solver_tests/testdata:beavma.mps",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         ":test_models",
+         "//ortools/base",
+@@ -305,6 +318,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["multi_objective_tests.cc"],
+     hdrs = ["multi_objective_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/base:status_macros",
+@@ -327,6 +341,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["qp_tests.cc"],
+     hdrs = ["qp_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:gmock",
+@@ -345,6 +360,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["qc_tests.cc"],
+     hdrs = ["qc_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/math_opt/cpp:matchers",
+@@ -363,6 +379,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["second_order_cone_tests.cc"],
+     hdrs = ["second_order_cone_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/math_opt/cpp:matchers",
+@@ -381,6 +398,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["logical_constraint_tests.cc"],
+     hdrs = ["logical_constraint_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/math_opt:model_update_cc_proto",
+@@ -401,6 +419,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["test_models.cc"],
+     hdrs = ["test_models.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt/cpp:math_opt",
+         "@com_google_absl//absl/log:check",
+@@ -426,6 +445,7 @@ cc_library(
+     testonly = True,
+     srcs = ["generic_tests.cc"],
+     hdrs = ["generic_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":test_models",
+         "//ortools/base:gmock",
+@@ -452,6 +472,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["infeasible_subsystem_tests.cc"],
+     hdrs = ["infeasible_subsystem_tests.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+         "//ortools/gurobi:gurobi_stdout_matchers",
+diff --git a/ortools/math_opt/solvers/BUILD.bazel b/ortools/math_opt/solvers/BUILD.bazel
+index e7e8054..ef6c123 100644
+--- a/ortools/math_opt/solvers/BUILD.bazel
++++ b/ortools/math_opt/solvers/BUILD.bazel
+@@ -22,6 +22,7 @@ cc_library(
+         "gscip_solver.cc",
+         "gscip_solver.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":message_callback_data",
+@@ -106,6 +107,7 @@ cc_library(
+     name = "gurobi_callback",
+     srcs = ["gurobi_callback.cc"],
+     hdrs = ["gurobi_callback.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":message_callback_data",
+         "//ortools/base:linked_hash_map",
+@@ -140,6 +142,7 @@ cc_library(
+     hdrs = [
+         "gurobi_init_arguments.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":gurobi_callback",
+@@ -195,6 +198,7 @@ cc_library(
+         "glop_solver.cc",
+         "glop_solver.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base:map_util",
+@@ -245,6 +249,7 @@ cc_library(
+         "cp_sat_solver.cc",
+         "cp_sat_solver.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base:protoutil",
+@@ -311,9 +316,9 @@ cc_test(
+ 
+ cc_test(
+     name = "cp_sat_solver_test",
++    timeout = "eternal",
+     srcs = ["cp_sat_solver_test.cc"],
+     shard_count = 10,
+-    timeout = "eternal",
+     deps = [
+         ":cp_sat_solver",
+         "//ortools/base:gmock_main",
+@@ -342,6 +347,7 @@ cc_library(
+     name = "message_callback_data",
+     srcs = ["message_callback_data.cc"],
+     hdrs = ["message_callback_data.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/math_opt/core:solver_interface",
+         "@com_google_absl//absl/strings",
+@@ -365,6 +371,7 @@ cc_library(
+     name = "pdlp_bridge",
+     srcs = ["pdlp_bridge.cc"],
+     hdrs = ["pdlp_bridge.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/math_opt:model_cc_proto",
+@@ -391,6 +398,7 @@ cc_library(
+         "pdlp_solver.cc",
+         "pdlp_solver.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":pdlp_bridge",
+@@ -464,6 +472,7 @@ cc_library(
+         "glpk_solver.cc",
+         "glpk_solver.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":glpk_cc_proto",
+@@ -563,6 +572,7 @@ cc_library(
+         "highs_solver.cc",
+         "highs_solver.h",
+     ],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":highs_cc_proto",
+diff --git a/ortools/math_opt/solvers/glpk/BUILD.bazel b/ortools/math_opt/solvers/glpk/BUILD.bazel
+index b33dd3b..af07950 100644
+--- a/ortools/math_opt/solvers/glpk/BUILD.bazel
++++ b/ortools/math_opt/solvers/glpk/BUILD.bazel
+@@ -18,6 +18,7 @@ cc_library(
+     name = "rays",
+     srcs = ["rays.cc"],
+     hdrs = ["rays.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:logging",
+         "//ortools/base:status_macros",
+@@ -35,6 +36,7 @@ cc_library(
+     name = "glpk_sparse_vector",
+     srcs = ["glpk_sparse_vector.cc"],
+     hdrs = ["glpk_sparse_vector.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:logging",
+         "@com_google_absl//absl/log:check",
+@@ -54,6 +56,7 @@ cc_library(
+     name = "gap",
+     srcs = ["gap.cc"],
+     hdrs = ["gap.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_test(
+diff --git a/ortools/math_opt/solvers/gscip/BUILD.bazel b/ortools/math_opt/solvers/gscip/BUILD.bazel
+index fd91d85..fcc2c9c 100644
+--- a/ortools/math_opt/solvers/gscip/BUILD.bazel
++++ b/ortools/math_opt/solvers/gscip/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "gscip_solver_constraint_handler",
+     srcs = ["gscip_solver_constraint_handler.cc"],
+     hdrs = ["gscip_solver_constraint_handler.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:linked_hash_map",
+         "//ortools/base:protoutil",
+diff --git a/ortools/math_opt/solvers/gurobi/BUILD.bazel b/ortools/math_opt/solvers/gurobi/BUILD.bazel
+index 32f8f39..5039d84 100644
+--- a/ortools/math_opt/solvers/gurobi/BUILD.bazel
++++ b/ortools/math_opt/solvers/gurobi/BUILD.bazel
+@@ -19,6 +19,7 @@ cc_library(
+     hdrs = [
+         "g_gurobi.h",
+     ],
++    features = ["-layering_check"],
+     visibility = [
+         "//ortools/gurobi:__subpackages__",
+         "//ortools/math_opt:__subpackages__",
+diff --git a/ortools/math_opt/storage/BUILD.bazel b/ortools/math_opt/storage/BUILD.bazel
+index cb85a81..459d2b0 100644
+--- a/ortools/math_opt/storage/BUILD.bazel
++++ b/ortools/math_opt/storage/BUILD.bazel
+@@ -16,6 +16,7 @@ package(default_visibility = ["//ortools/math_opt:__subpackages__"])
+ cc_library(
+     name = "model_storage_types",
+     hdrs = ["model_storage_types.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:intops",
+         "@com_google_absl//absl/strings",
+@@ -25,11 +26,13 @@ cc_library(
+ cc_library(
+     name = "range",
+     hdrs = ["range.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "iterators",
+     hdrs = ["iterators.h"],
++    features = ["-layering_check"],
+     deps = [":range"],
+ )
+ 
+@@ -37,6 +40,7 @@ cc_library(
+     name = "sparse_coefficient_map",
+     srcs = ["sparse_coefficient_map.cc"],
+     hdrs = ["sparse_coefficient_map.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model_storage_types",
+         "//ortools/base:intops",
+@@ -51,6 +55,7 @@ cc_library(
+     name = "sparse_matrix",
+     srcs = ["sparse_matrix.cc"],
+     hdrs = ["sparse_matrix.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model_storage_types",
+         "//ortools/base:intops",
+@@ -67,6 +72,7 @@ cc_library(
+ cc_library(
+     name = "linear_expression_data",
+     hdrs = ["linear_expression_data.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":sparse_coefficient_map",
+         "//ortools/math_opt:sparse_containers_cc_proto",
+@@ -78,6 +84,7 @@ cc_library(
+ cc_library(
+     name = "update_trackers",
+     hdrs = ["update_trackers.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model_storage_types",
+         "//ortools/base:intops",
+@@ -93,6 +100,7 @@ cc_library(
+     name = "variable_storage",
+     srcs = ["variable_storage.cc"],
+     hdrs = ["variable_storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model_storage_types",
+         ":range",
+@@ -112,6 +120,7 @@ cc_library(
+     name = "objective_storage",
+     srcs = ["objective_storage.cc"],
+     hdrs = ["objective_storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":range",
+         ":sparse_coefficient_map",
+@@ -136,6 +145,7 @@ cc_library(
+     name = "linear_constraint_storage",
+     srcs = ["linear_constraint_storage.cc"],
+     hdrs = ["linear_constraint_storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model_storage_types",
+         ":range",
+@@ -158,6 +168,7 @@ cc_library(
+ cc_library(
+     name = "atomic_constraint_storage",
+     hdrs = ["atomic_constraint_storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model_storage_types",
+         ":range",
+@@ -176,6 +187,7 @@ cc_library(
+     name = "model_storage",
+     srcs = ["model_storage.cc"],
+     hdrs = ["model_storage.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":atomic_constraint_storage",
+         ":iterators",
+diff --git a/ortools/math_opt/testing/BUILD.bazel b/ortools/math_opt/testing/BUILD.bazel
+index e80e4e0..058bd4b 100644
+--- a/ortools/math_opt/testing/BUILD.bazel
++++ b/ortools/math_opt/testing/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "param_name",
+     testonly = True,
+     hdrs = ["param_name.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:gmock",
+     ],
+@@ -25,4 +26,5 @@ cc_library(
+ cc_library(
+     name = "stream",
+     hdrs = ["stream.h"],
++    features = ["-layering_check"],
+ )
+diff --git a/ortools/math_opt/tools/BUILD.bazel b/ortools/math_opt/tools/BUILD.bazel
+index 55c1f2f..adfeac6 100644
+--- a/ortools/math_opt/tools/BUILD.bazel
++++ b/ortools/math_opt/tools/BUILD.bazel
+@@ -66,6 +66,7 @@ cc_library(
+     name = "file_format_flags",
+     srcs = ["file_format_flags.cc"],
+     hdrs = ["file_format_flags.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:file",
+diff --git a/ortools/math_opt/validators/BUILD.bazel b/ortools/math_opt/validators/BUILD.bazel
+index 5448a93..c23ef8e 100644
+--- a/ortools/math_opt/validators/BUILD.bazel
++++ b/ortools/math_opt/validators/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "ids_validator",
+     srcs = ["ids_validator.cc"],
+     hdrs = ["ids_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "//ortools/math_opt/core:model_summary",
+@@ -31,6 +32,7 @@ cc_library(
+     name = "scalar_validator",
+     srcs = ["scalar_validator.cc"],
+     hdrs = ["scalar_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/status",
+         "@com_google_absl//absl/strings",
+@@ -41,6 +43,7 @@ cc_library(
+     name = "sparse_matrix_validator",
+     srcs = ["sparse_matrix_validator.cc"],
+     hdrs = ["sparse_matrix_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ids_validator",
+         "//ortools/base:status_macros",
+@@ -56,6 +59,7 @@ cc_library(
+ cc_library(
+     name = "sparse_vector_validator",
+     hdrs = ["sparse_vector_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ids_validator",
+         ":scalar_validator",
+@@ -70,6 +74,7 @@ cc_library(
+     name = "model_validator",
+     srcs = ["model_validator.cc"],
+     hdrs = ["model_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ids_validator",
+         ":scalar_validator",
+@@ -94,6 +99,7 @@ cc_library(
+     name = "solve_stats_validator",
+     srcs = ["solve_stats_validator.cc"],
+     hdrs = ["solve_stats_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:protoutil",
+         "//ortools/math_opt:result_cc_proto",
+@@ -108,6 +114,7 @@ cc_library(
+     name = "result_validator",
+     srcs = ["result_validator.cc"],
+     hdrs = ["result_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":solution_validator",
+         ":solve_stats_validator",
+@@ -128,6 +135,7 @@ cc_library(
+     name = "solution_validator",
+     srcs = ["solution_validator.cc"],
+     hdrs = ["solution_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ids_validator",
+         ":scalar_validator",
+@@ -148,6 +156,7 @@ cc_library(
+     name = "solve_parameters_validator",
+     srcs = ["solve_parameters_validator.cc"],
+     hdrs = ["solve_parameters_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:protoutil",
+         "//ortools/base:status_macros",
+@@ -164,6 +173,7 @@ cc_library(
+     name = "callback_validator",
+     srcs = ["callback_validator.cc"],
+     hdrs = ["callback_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ids_validator",
+         ":model_parameters_validator",
+@@ -190,6 +200,7 @@ cc_library(
+     name = "model_parameters_validator",
+     srcs = ["model_parameters_validator.cc"],
+     hdrs = ["model_parameters_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":ids_validator",
+         ":solution_validator",
+@@ -208,6 +219,7 @@ cc_library(
+     name = "linear_expression_validator",
+     srcs = ["linear_expression_validator.cc"],
+     hdrs = ["linear_expression_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":scalar_validator",
+         ":sparse_vector_validator",
+@@ -223,6 +235,7 @@ cc_library(
+     name = "infeasible_subsystem_validator",
+     srcs = ["infeasible_subsystem_validator.cc"],
+     hdrs = ["infeasible_subsystem_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bounds_and_status_validator",
+         ":ids_validator",
+@@ -238,6 +251,7 @@ cc_library(
+     name = "bounds_and_status_validator",
+     srcs = ["bounds_and_status_validator.cc"],
+     hdrs = ["bounds_and_status_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":scalar_validator",
+         "//ortools/base:status_macros",
+@@ -252,6 +266,7 @@ cc_library(
+     name = "termination_validator",
+     srcs = ["termination_validator.cc"],
+     hdrs = ["termination_validator.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bounds_and_status_validator",
+         "//ortools/base:status_macros",
+diff --git a/ortools/packing/BUILD.bazel b/ortools/packing/BUILD.bazel
+index 04b7014..a774e8f 100644
+--- a/ortools/packing/BUILD.bazel
++++ b/ortools/packing/BUILD.bazel
+@@ -21,6 +21,7 @@ cc_library(
+     name = "arc_flow_builder",
+     srcs = ["arc_flow_builder.cc"],
+     hdrs = ["arc_flow_builder.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:map_util",
+@@ -37,6 +38,7 @@ cc_library(
+         "arc_flow_solver.cc",
+     ],
+     hdrs = ["arc_flow_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":arc_flow_builder",
+         "//ortools/base",
+@@ -65,6 +67,7 @@ cc_library(
+     name = "vector_bin_packing_parser",
+     srcs = ["vector_bin_packing_parser.cc"],
+     hdrs = ["vector_bin_packing_parser.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":vector_bin_packing_cc_proto",
+@@ -131,6 +134,7 @@ cc_library(
+     name = "binpacking_2d_parser",
+     srcs = ["binpacking_2d_parser.cc"],
+     hdrs = ["binpacking_2d_parser.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":multiple_dimensions_bin_packing_cc_proto",
+diff --git a/ortools/pdlp/BUILD.bazel b/ortools/pdlp/BUILD.bazel
+index 5b68856..739a948 100644
+--- a/ortools/pdlp/BUILD.bazel
++++ b/ortools/pdlp/BUILD.bazel
+@@ -20,6 +20,7 @@ package(default_visibility = ["//visibility:public"])
+ cc_library(
+     name = "scheduler",
+     hdrs = ["scheduler.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/functional:any_invocable",
+     ],
+@@ -62,6 +63,7 @@ py_proto_library(
+ cc_library(
+     name = "gtest_main",
+     srcs = ["gtest_main.cc"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:gmock",
+@@ -72,6 +74,7 @@ cc_library(
+     name = "iteration_stats",
+     srcs = ["iteration_stats.cc"],
+     hdrs = ["iteration_stats.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":quadratic_program",
+         ":sharded_quadratic_program",
+@@ -105,6 +108,7 @@ cc_library(
+     name = "primal_dual_hybrid_gradient",
+     srcs = ["primal_dual_hybrid_gradient.cc"],
+     hdrs = ["primal_dual_hybrid_gradient.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":iteration_stats",
+         ":quadratic_program",
+@@ -169,6 +173,7 @@ cc_library(
+     name = "quadratic_program",
+     srcs = ["quadratic_program.cc"],
+     hdrs = ["quadratic_program.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:status_macros",
+@@ -201,6 +206,7 @@ cc_library(
+     name = "quadratic_program_io",
+     srcs = ["quadratic_program_io.cc"],
+     hdrs = ["quadratic_program_io.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":quadratic_program",
+         "//ortools/base",
+@@ -224,6 +230,7 @@ cc_library(
+     name = "sharded_optimization_utils",
+     srcs = ["sharded_optimization_utils.cc"],
+     hdrs = ["sharded_optimization_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":quadratic_program",
+         ":sharded_quadratic_program",
+@@ -256,6 +263,7 @@ cc_library(
+     name = "sharded_quadratic_program",
+     srcs = ["sharded_quadratic_program.cc"],
+     hdrs = ["sharded_quadratic_program.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":quadratic_program",
+         ":sharder",
+@@ -285,6 +293,7 @@ cc_library(
+     name = "sharder",
+     srcs = ["sharder.cc"],
+     hdrs = ["sharder.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:mathutil",
+@@ -315,6 +324,7 @@ cc_library(
+     name = "solvers_proto_validation",
+     srcs = ["solvers_proto_validation.cc"],
+     hdrs = ["solvers_proto_validation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":solvers_cc_proto",
+         "//ortools/base:status_macros",
+@@ -340,6 +350,7 @@ cc_library(
+     name = "termination",
+     srcs = ["termination.cc"],
+     hdrs = ["termination.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":solve_log_cc_proto",
+         ":solvers_cc_proto",
+@@ -365,6 +376,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["test_util.cc"],
+     hdrs = ["test_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":quadratic_program",
+         "//ortools/base",
+@@ -390,6 +402,7 @@ cc_library(
+     name = "trust_region",
+     srcs = ["trust_region.cc"],
+     hdrs = ["trust_region.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":quadratic_program",
+         ":sharded_optimization_utils",
+diff --git a/ortools/port/BUILD.bazel b/ortools/port/BUILD.bazel
+index 00b8585..b947b31 100644
+--- a/ortools/port/BUILD.bazel
++++ b/ortools/port/BUILD.bazel
+@@ -17,6 +17,7 @@ cc_library(
+     name = "sysinfo",
+     srcs = ["sysinfo.cc"],
+     hdrs = ["sysinfo.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:sysinfo",
+@@ -27,6 +28,7 @@ cc_library(
+     name = "proto_utils",
+     srcs = ["proto_utils.cc"],
+     hdrs = ["proto_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/util:parse_proto",
+@@ -38,6 +40,7 @@ cc_library(
+ cc_library(
+     name = "utf8",
+     hdrs = ["utf8.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:encodingutils",
+@@ -52,6 +55,7 @@ cc_library(
+     hdrs = [
+         "file.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:file",
+         "@com_google_absl//absl/status",
+@@ -64,4 +68,5 @@ cc_library(
+ cc_library(
+     name = "scoped_std_stream_capture",
+     hdrs = ["scoped_std_stream_capture.h"],
++    features = ["-layering_check"],
+ )
+diff --git a/ortools/routing/parsers/BUILD.bazel b/ortools/routing/parsers/BUILD.bazel
+index 94690f3..a99b6dd 100644
+--- a/ortools/routing/parsers/BUILD.bazel
++++ b/ortools/routing/parsers/BUILD.bazel
+@@ -30,6 +30,7 @@ cc_library(
+     name = "simple_graph",
+     srcs = ["simple_graph.cc"],
+     hdrs = ["simple_graph.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/hash",
+     ],
+@@ -50,6 +51,7 @@ cc_library(
+     name = "solomon_parser",
+     srcs = ["solomon_parser.cc"],
+     hdrs = ["solomon_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":simple_graph",
+         "//ortools/base",
+@@ -79,6 +81,7 @@ cc_library(
+     name = "lilim_parser",
+     srcs = ["lilim_parser.cc"],
+     hdrs = ["lilim_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":simple_graph",
+         "//ortools/base:file",
+@@ -110,6 +113,7 @@ cc_library(
+     name = "carp_parser",
+     srcs = ["carp_parser.cc"],
+     hdrs = ["carp_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":simple_graph",
+         "//ortools/base",
+@@ -153,6 +157,7 @@ cc_library(
+     name = "nearp_parser",
+     srcs = ["nearp_parser.cc"],
+     hdrs = ["nearp_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":simple_graph",
+         "//ortools/base",
+@@ -186,6 +191,7 @@ cc_library(
+     name = "pdtsp_parser",
+     srcs = ["pdtsp_parser.cc"],
+     hdrs = ["pdtsp_parser.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         "//ortools/base",
+@@ -220,6 +226,7 @@ cc_library(
+     name = "tsplib_parser",
+     srcs = ["tsplib_parser.cc"],
+     hdrs = ["tsplib_parser.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":simple_graph",
+@@ -268,6 +275,7 @@ cc_library(
+     name = "tsptw_parser",
+     srcs = ["tsptw_parser.cc"],
+     hdrs = ["tsptw_parser.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":simple_graph",
+@@ -302,6 +310,7 @@ cc_library(
+     name = "solution_serializer",
+     srcs = ["solution_serializer.cc"],
+     hdrs = ["solution_serializer.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":simple_graph",
+         "//ortools/base",
+@@ -332,6 +341,7 @@ cc_library(
+     name = "cvrptw_lib",
+     srcs = ["cvrptw_lib.cc"],
+     hdrs = ["cvrptw_lib.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/constraint_solver:routing",
+@@ -343,6 +353,7 @@ cc_library(
+     name = "dow_parser",
+     srcs = ["dow_parser.cc"],
+     hdrs = ["dow_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":capacity_planning_cc_proto",
+         "//ortools/base",
+diff --git a/ortools/sat/BUILD.bazel b/ortools/sat/BUILD.bazel
+index 222559f..b05763d 100644
+--- a/ortools/sat/BUILD.bazel
++++ b/ortools/sat/BUILD.bazel
+@@ -24,6 +24,7 @@ cc_library(
+     name = "cp_model",
+     srcs = ["cp_model.cc"],
+     hdrs = ["cp_model.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_solver",
+@@ -42,6 +43,7 @@ cc_library(
+ cc_library(
+     name = "model",
+     hdrs = ["model.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:typeid",
+@@ -95,6 +97,7 @@ cc_library(
+     name = "cp_model_utils",
+     srcs = ["cp_model_utils.cc"],
+     hdrs = ["cp_model_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":sat_base",
+@@ -117,6 +120,7 @@ cc_library(
+     name = "synchronization",
+     srcs = ["synchronization.cc"],
+     hdrs = ["synchronization.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -162,6 +166,7 @@ cc_library(
+     name = "cp_model_checker",
+     srcs = ["cp_model_checker.cc"],
+     hdrs = ["cp_model_checker.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -185,6 +190,7 @@ cc_library(
+     name = "constraint_violation",
+     srcs = ["constraint_violation.cc"],
+     hdrs = ["constraint_violation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -208,6 +214,7 @@ cc_library(
+     name = "feasibility_jump",
+     srcs = ["feasibility_jump.cc"],
+     hdrs = ["feasibility_jump.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":constraint_violation",
+         ":cp_model_cc_proto",
+@@ -243,6 +250,7 @@ cc_library(
+     name = "linear_model",
+     srcs = ["linear_model.cc"],
+     hdrs = ["linear_model.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -258,6 +266,7 @@ cc_library(
+     name = "parameters_validation",
+     srcs = ["parameters_validation.cc"],
+     hdrs = ["parameters_validation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_search",
+         ":sat_parameters_cc_proto",
+@@ -269,6 +278,7 @@ cc_library(
+     name = "cp_model_search",
+     srcs = ["cp_model_search.cc"],
+     hdrs = ["cp_model_search.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_mapping",
+@@ -297,6 +307,7 @@ cc_library(
+     name = "cp_model_solver_helpers",
+     srcs = ["cp_model_solver_helpers.cc"],
+     hdrs = ["cp_model_solver_helpers.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":circuit",
+         ":clause",
+@@ -381,6 +392,7 @@ cc_library(
+     name = "shaving_solver",
+     srcs = ["shaving_solver.cc"],
+     hdrs = ["shaving_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_lns",
+@@ -411,6 +423,7 @@ cc_library(
+     name = "cp_model_solver",
+     srcs = ["cp_model_solver.cc"],
+     hdrs = ["cp_model_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":circuit",
+         ":clause",
+@@ -498,6 +511,7 @@ cc_library(
+ cc_library(
+     name = "cp_model_mapping",
+     hdrs = ["cp_model_mapping.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -521,6 +535,7 @@ cc_library(
+     name = "cp_model_loader",
+     srcs = ["cp_model_loader.cc"],
+     hdrs = ["cp_model_loader.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":all_different",
+         ":circuit",
+@@ -582,6 +597,7 @@ cc_library(
+     name = "presolve_util",
+     srcs = ["presolve_util.cc"],
+     hdrs = ["presolve_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -612,6 +628,7 @@ cc_library(
+     name = "presolve_context",
+     srcs = ["presolve_context.cc"],
+     hdrs = ["presolve_context.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_loader",
+@@ -653,6 +670,7 @@ cc_library(
+         "cp_model_presolve.cc",
+     ],
+     hdrs = ["cp_model_presolve.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":2d_rectangle_presolve",
+         ":circuit",
+@@ -718,6 +736,7 @@ cc_library(
+         "cp_model_postsolve.cc",
+     ],
+     hdrs = ["cp_model_postsolve.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -734,6 +753,7 @@ cc_library(
+     name = "cp_model_expand",
+     srcs = ["cp_model_expand.cc"],
+     hdrs = ["cp_model_expand.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_checker",
+@@ -762,6 +782,7 @@ cc_library(
+ cc_library(
+     name = "sat_base",
+     hdrs = ["sat_base.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         "//ortools/base",
+@@ -788,6 +809,7 @@ cc_library(
+         "sat_solver.cc",
+     ],
+     hdrs = ["sat_solver.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":clause",
+         ":drat_proof_handler",
+@@ -826,6 +848,7 @@ cc_library(
+     name = "restart",
+     srcs = ["restart.cc"],
+     hdrs = ["restart.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         ":sat_decision",
+@@ -844,6 +867,7 @@ cc_library(
+     name = "probing",
+     srcs = ["probing.cc"],
+     hdrs = ["probing.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":clause",
+         ":implied_bounds",
+@@ -874,6 +898,7 @@ cc_library(
+     name = "sat_inprocessing",
+     srcs = ["sat_inprocessing.cc"],
+     hdrs = ["sat_inprocessing.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":clause",
+         ":drat_checker",
+@@ -907,6 +932,7 @@ cc_library(
+     name = "sat_decision",
+     srcs = ["sat_decision.cc"],
+     hdrs = ["sat_decision.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         ":pb_constraint",
+@@ -927,6 +953,7 @@ cc_library(
+     name = "clause",
+     srcs = ["clause.cc"],
+     hdrs = ["clause.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":drat_proof_handler",
+         ":inclusion",
+@@ -959,6 +986,7 @@ cc_library(
+     name = "simplification",
+     srcs = ["simplification.cc"],
+     hdrs = ["simplification.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":drat_proof_handler",
+         ":model",
+@@ -988,6 +1016,7 @@ cc_library(
+     name = "pb_constraint",
+     srcs = ["pb_constraint.cc"],
+     hdrs = ["pb_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         ":sat_base",
+@@ -1012,6 +1041,7 @@ cc_library(
+     name = "symmetry",
+     srcs = ["symmetry.cc"],
+     hdrs = ["symmetry.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":sat_base",
+         "//ortools/algorithms:sparse_permutation",
+@@ -1027,6 +1057,7 @@ cc_library(
+     name = "symmetry_util",
+     srcs = ["symmetry_util.cc"],
+     hdrs = ["symmetry_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/algorithms:dynamic_partition",
+         "//ortools/algorithms:sparse_permutation",
+@@ -1040,6 +1071,7 @@ cc_library(
+     name = "var_domination",
+     srcs = ["var_domination.cc"],
+     hdrs = ["var_domination.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_utils",
+@@ -1068,6 +1100,7 @@ cc_library(
+     name = "integer",
+     srcs = ["integer.cc"],
+     hdrs = ["integer.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         ":sat_base",
+@@ -1097,6 +1130,7 @@ cc_library(
+     name = "integer_search",
+     srcs = ["integer_search.cc"],
+     hdrs = ["integer_search.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":clause",
+         ":cp_model_cc_proto",
+@@ -1134,6 +1168,7 @@ cc_library(
+     name = "lb_tree_search",
+     srcs = ["lb_tree_search.cc"],
+     hdrs = ["lb_tree_search.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_mapping",
+         ":integer",
+@@ -1165,6 +1200,7 @@ cc_library(
+     name = "pseudo_costs",
+     srcs = ["pseudo_costs.cc"],
+     hdrs = ["pseudo_costs.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_mapping",
+         ":integer",
+@@ -1187,6 +1223,7 @@ cc_library(
+     name = "intervals",
+     srcs = ["intervals.cc"],
+     hdrs = ["intervals.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_constraints",
+         ":implied_bounds",
+@@ -1216,6 +1253,7 @@ cc_library(
+     name = "precedences",
+     srcs = ["precedences.cc"],
+     hdrs = ["precedences.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":clause",
+         ":cp_constraints",
+@@ -1251,6 +1289,7 @@ cc_library(
+     name = "integer_expr",
+     srcs = ["integer_expr.cc"],
+     hdrs = ["integer_expr.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":linear_constraint",
+@@ -1278,6 +1317,7 @@ cc_library(
+     name = "linear_propagation",
+     srcs = ["linear_propagation.cc"],
+     hdrs = ["linear_propagation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":model",
+@@ -1310,6 +1350,7 @@ cc_library(
+     name = "all_different",
+     srcs = ["all_different.cc"],
+     hdrs = ["all_different.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":model",
+@@ -1331,6 +1372,7 @@ cc_library(
+     name = "theta_tree",
+     srcs = ["theta_tree.cc"],
+     hdrs = ["theta_tree.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         "//ortools/base",
+@@ -1342,6 +1384,7 @@ cc_library(
+     name = "disjunctive",
+     srcs = ["disjunctive.cc"],
+     hdrs = ["disjunctive.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":all_different",
+         ":integer",
+@@ -1368,6 +1411,7 @@ cc_library(
+     name = "timetable",
+     srcs = ["timetable.cc"],
+     hdrs = ["timetable.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":intervals",
+@@ -1384,6 +1428,7 @@ cc_library(
+     name = "timetable_edgefinding",
+     srcs = ["timetable_edgefinding.cc"],
+     hdrs = ["timetable_edgefinding.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":intervals",
+@@ -1399,6 +1444,7 @@ cc_library(
+     name = "cumulative",
+     srcs = ["cumulative.cc"],
+     hdrs = ["cumulative.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cumulative_energy",
+         ":disjunctive",
+@@ -1425,6 +1471,7 @@ cc_library(
+     name = "cumulative_energy",
+     srcs = ["cumulative_energy.cc"],
+     hdrs = ["cumulative_energy.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":2d_orthogonal_packing",
+         ":diffn_util",
+@@ -1447,6 +1494,7 @@ cc_library(
+     name = "boolean_problem",
+     srcs = ["boolean_problem.cc"],
+     hdrs = ["boolean_problem.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":boolean_problem_cc_proto",
+         ":cp_model_cc_proto",
+@@ -1480,6 +1528,7 @@ cc_library(
+     name = "linear_relaxation",
+     srcs = ["linear_relaxation.cc"],
+     hdrs = ["linear_relaxation.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":circuit",
+         ":clause",
+@@ -1524,6 +1573,7 @@ cc_library(
+     name = "linear_constraint",
+     srcs = ["linear_constraint.cc"],
+     hdrs = ["linear_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":model",
+@@ -1544,6 +1594,7 @@ cc_library(
+     name = "linear_programming_constraint",
+     srcs = ["linear_programming_constraint.cc"],
+     hdrs = ["linear_programming_constraint.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_mapping",
+@@ -1590,6 +1641,7 @@ cc_library(
+     name = "linear_constraint_manager",
+     srcs = ["linear_constraint_manager.cc"],
+     hdrs = ["linear_constraint_manager.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":linear_constraint",
+@@ -1620,6 +1672,7 @@ cc_library(
+     name = "cuts",
+     srcs = ["cuts.cc"],
+     hdrs = ["cuts.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":clause",
+         ":implied_bounds",
+@@ -1653,6 +1706,7 @@ cc_library(
+     name = "routing_cuts",
+     srcs = ["routing_cuts.cc"],
+     hdrs = ["routing_cuts.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cuts",
+@@ -1680,6 +1734,7 @@ cc_library(
+     name = "scheduling_cuts",
+     srcs = ["scheduling_cuts.cc"],
+     hdrs = ["scheduling_cuts.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cuts",
+         ":implied_bounds",
+@@ -1710,6 +1765,7 @@ cc_library(
+     name = "diffn_cuts",
+     srcs = ["diffn_cuts.cc"],
+     hdrs = ["diffn_cuts.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cuts",
+         ":diffn_util",
+@@ -1741,6 +1797,7 @@ cc_library(
+     name = "zero_half_cuts",
+     srcs = ["zero_half_cuts.cc"],
+     hdrs = ["zero_half_cuts.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":util",
+@@ -1756,6 +1813,7 @@ cc_library(
+     name = "lp_utils",
+     srcs = ["lp_utils.cc"],
+     hdrs = ["lp_utils.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":boolean_problem",
+         ":boolean_problem_cc_proto",
+@@ -1786,6 +1844,7 @@ cc_library(
+     name = "optimization",
+     srcs = ["optimization.cc"],
+     hdrs = ["optimization.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":boolean_problem",
+         ":boolean_problem_cc_proto",
+@@ -1825,6 +1884,7 @@ cc_library(
+     name = "max_hs",
+     srcs = ["max_hs.cc"],
+     hdrs = ["max_hs.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":boolean_problem",
+         ":cp_model_cc_proto",
+@@ -1871,6 +1931,7 @@ cc_library(
+     name = "util",
+     srcs = ["util.cc"],
+     hdrs = ["util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":model",
+         ":sat_base",
+@@ -1904,6 +1965,7 @@ cc_library(
+     name = "stat_tables",
+     srcs = ["stat_tables.cc"],
+     hdrs = ["stat_tables.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_lns",
+@@ -1926,6 +1988,7 @@ cc_library(
+     name = "table",
+     srcs = ["table.cc"],
+     hdrs = ["table.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":model",
+@@ -1943,6 +2006,7 @@ cc_library(
+     name = "cp_constraints",
+     srcs = ["cp_constraints.cc"],
+     hdrs = ["cp_constraints.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":model",
+@@ -1960,6 +2024,7 @@ cc_library(
+     name = "diffn_util",
+     srcs = ["diffn_util.cc"],
+     hdrs = ["diffn_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":intervals",
+@@ -1981,6 +2046,7 @@ cc_library(
+     name = "2d_orthogonal_packing",
+     srcs = ["2d_orthogonal_packing.cc"],
+     hdrs = ["2d_orthogonal_packing.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":2d_packing_brute_force",
+         ":integer",
+@@ -2000,6 +2066,7 @@ cc_library(
+     name = "2d_packing_brute_force",
+     srcs = ["2d_packing_brute_force.cc"],
+     hdrs = ["2d_packing_brute_force.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":diffn_util",
+         ":integer",
+@@ -2016,6 +2083,7 @@ cc_library(
+     name = "2d_rectangle_presolve",
+     srcs = ["2d_rectangle_presolve.cc"],
+     hdrs = ["2d_rectangle_presolve.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":diffn_util",
+         ":integer",
+@@ -2032,6 +2100,7 @@ cc_library(
+     testonly = 1,
+     srcs = ["2d_orthogonal_packing_testing.cc"],
+     hdrs = ["2d_orthogonal_packing_testing.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":diffn_util",
+         ":integer",
+@@ -2046,6 +2115,7 @@ cc_library(
+     name = "diffn",
+     srcs = ["diffn.cc"],
+     hdrs = ["diffn.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":2d_orthogonal_packing",
+         ":cumulative_energy",
+@@ -2075,6 +2145,7 @@ cc_library(
+     name = "circuit",
+     srcs = ["circuit.cc"],
+     hdrs = ["circuit.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":integer",
+         ":model",
+@@ -2097,6 +2168,7 @@ cc_library(
+     name = "encoding",
+     srcs = ["encoding.cc"],
+     hdrs = ["encoding.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":boolean_problem_cc_proto",
+         ":pb_constraint",
+@@ -2117,6 +2189,7 @@ cc_library(
+     name = "cp_model_lns",
+     srcs = ["cp_model_lns.cc"],
+     hdrs = ["cp_model_lns.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_mapping",
+@@ -2160,6 +2233,7 @@ cc_library(
+     name = "feasibility_pump",
+     srcs = ["feasibility_pump.cc"],
+     hdrs = ["feasibility_pump.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_mapping",
+         ":integer",
+@@ -2193,6 +2267,7 @@ cc_library(
+     name = "rins",
+     srcs = ["rins.cc"],
+     hdrs = ["rins.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_mapping",
+         ":integer",
+@@ -2211,6 +2286,7 @@ cc_library(
+     name = "subsolver",
+     srcs = ["subsolver.cc"],
+     hdrs = ["subsolver.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:threadpool",
+@@ -2230,6 +2306,7 @@ cc_library(
+     name = "drat_proof_handler",
+     srcs = ["drat_proof_handler.cc"],
+     hdrs = ["drat_proof_handler.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":drat_checker",
+         ":drat_writer",
+@@ -2246,6 +2323,7 @@ cc_library(
+     name = "drat_checker",
+     srcs = ["drat_checker.cc"],
+     hdrs = ["drat_checker.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":sat_base",
+         "//ortools/base",
+@@ -2265,6 +2343,7 @@ cc_library(
+     name = "drat_writer",
+     srcs = ["drat_writer.cc"],
+     hdrs = ["drat_writer.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":sat_base",
+         "//ortools/base",
+@@ -2311,6 +2390,7 @@ cc_binary(
+ cc_library(
+     name = "sat_cnf_reader",
+     hdrs = ["sat_cnf_reader.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":boolean_problem_cc_proto",
+         ":cp_model_cc_proto",
+@@ -2328,6 +2408,7 @@ cc_library(
+     name = "cp_model_symmetries",
+     srcs = ["cp_model_symmetries.cc"],
+     hdrs = ["cp_model_symmetries.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_cc_proto",
+         ":cp_model_checker",
+@@ -2367,6 +2448,7 @@ cc_library(
+     name = "swig_helper",
+     srcs = ["swig_helper.cc"],
+     hdrs = ["swig_helper.h"],
++    features = ["-layering_check"],
+     visibility = [
+         "//ortools/sat/java:__pkg__",
+         "//ortools/sat/python:__pkg__",
+@@ -2389,6 +2471,7 @@ cc_library(
+     name = "implied_bounds",
+     srcs = ["implied_bounds.cc"],
+     hdrs = ["implied_bounds.h"],
++    features = ["-layering_check"],
+     deps = [
+         "linear_constraint",
+         ":clause",
+@@ -2418,6 +2501,7 @@ cc_library(
+ cc_library(
+     name = "inclusion",
+     hdrs = ["inclusion.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/log:check",
+@@ -2429,6 +2513,7 @@ cc_library(
+     name = "diophantine",
+     srcs = ["diophantine.cc"],
+     hdrs = ["diophantine.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":util",
+         "@com_google_absl//absl/log:check",
+@@ -2441,6 +2526,7 @@ cc_library(
+     name = "work_assignment",
+     srcs = ["work_assignment.cc"],
+     hdrs = ["work_assignment.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":cp_model_mapping",
+         ":cp_model_utils",
+diff --git a/ortools/scheduling/BUILD.bazel b/ortools/scheduling/BUILD.bazel
+index d2c0ef0..5c794d4 100644
+--- a/ortools/scheduling/BUILD.bazel
++++ b/ortools/scheduling/BUILD.bazel
+@@ -34,6 +34,7 @@ cc_library(
+     name = "jobshop_scheduling_parser",
+     srcs = ["jobshop_scheduling_parser.cc"],
+     hdrs = ["jobshop_scheduling_parser.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":jobshop_scheduling_cc_proto",
+         "//ortools/base",
+@@ -63,6 +64,7 @@ cc_library(
+     name = "rcpsp_parser",
+     srcs = ["rcpsp_parser.cc"],
+     hdrs = ["rcpsp_parser.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+     deps = [
+         ":rcpsp_cc_proto",
+diff --git a/ortools/util/BUILD.bazel b/ortools/util/BUILD.bazel
+index b2ee315..a123c8d 100644
+--- a/ortools/util/BUILD.bazel
++++ b/ortools/util/BUILD.bazel
+@@ -56,6 +56,7 @@ py_proto_library(
+ cc_library(
+     name = "affine_relation",
+     hdrs = ["affine_relation.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:iterator_adaptors",
+@@ -65,6 +66,7 @@ cc_library(
+ cc_library(
+     name = "filelineiter",
+     hdrs = ["filelineiter.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:file",
+@@ -77,6 +79,7 @@ cc_library(
+     name = "bitset",
+     srcs = ["bitset.cc"],
+     hdrs = ["bitset.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+@@ -85,6 +88,7 @@ cc_library(
+     hdrs = [
+         "integer_pq.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -94,6 +98,7 @@ cc_library(
+     name = "cached_log",
+     srcs = ["cached_log.cc"],
+     hdrs = ["cached_log.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:types",
+@@ -103,18 +108,21 @@ cc_library(
+ cc_library(
+     name = "zvector",
+     hdrs = ["zvector.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+ cc_library(
+     name = "permutation",
+     hdrs = ["permutation.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+ cc_library(
+     name = "saturated_arithmetic",
+     hdrs = ["saturated_arithmetic.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":bitset",
+         "//ortools/base",
+@@ -126,6 +134,7 @@ cc_library(
+     name = "piecewise_linear_function",
+     srcs = ["piecewise_linear_function.cc"],
+     hdrs = ["piecewise_linear_function.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":saturated_arithmetic",
+         "//ortools/base",
+@@ -140,6 +149,7 @@ cc_library(
+     name = "rational_approximation",
+     srcs = ["rational_approximation.cc"],
+     hdrs = ["rational_approximation.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/strings",
+@@ -150,6 +160,7 @@ cc_library(
+     name = "sorted_interval_list",
+     srcs = ["sorted_interval_list.cc"],
+     hdrs = ["sorted_interval_list.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":saturated_arithmetic",
+         "//ortools/base",
+@@ -163,6 +174,7 @@ cc_library(
+ cc_library(
+     name = "string_array",
+     hdrs = ["string_array.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/strings",
+     ],
+@@ -171,6 +183,7 @@ cc_library(
+ cc_library(
+     name = "tuple_set",
+     hdrs = ["tuple_set.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:hash",
+@@ -183,6 +196,7 @@ cc_library(
+     name = "stats",
+     srcs = ["stats.cc"],
+     hdrs = ["stats.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:stl_util",
+@@ -200,6 +214,7 @@ cc_library(
+     name = "time_limit",
+     srcs = ["time_limit.cc"],
+     hdrs = ["time_limit.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":running_stat",
+         "//ortools/base",
+@@ -216,6 +231,7 @@ cc_library(
+     name = "sigint",
+     srcs = ["sigint.cc"],
+     hdrs = ["sigint.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -234,6 +250,7 @@ cc_library(
+         "on_windows": [],
+         "//conditions:default": ["-frounding-math"],
+     }),
++    features = ["-layering_check"],
+     deps = [
+         ":bitset",
+         "//ortools/base",
+@@ -244,18 +261,21 @@ cc_library(
+     name = "monoid_operation_tree",
+     srcs = [],
+     hdrs = ["monoid_operation_tree.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+ cc_library(
+     name = "return_macros",
+     hdrs = ["return_macros.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+ cc_library(
+     name = "running_stat",
+     hdrs = ["running_stat.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+@@ -263,6 +283,7 @@ cc_library(
+     name = "proto_tools",
+     srcs = ["proto_tools.cc"],
+     hdrs = ["proto_tools.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/status",
+         "@com_google_absl//absl/status:statusor",
+@@ -302,6 +323,7 @@ cc_library(
+     hdrs = [
+         "functions_swig_helpers.h",
+     ],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+@@ -311,12 +333,14 @@ cc_library(
+     hdrs = [
+         "functions_swig_test_helpers.h",
+     ],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+ cc_library(
+     name = "range_minimum_query",
+     hdrs = ["range_minimum_query.h"],
++    features = ["-layering_check"],
+     deps = [":bitset"],
+ )
+ 
+@@ -324,6 +348,7 @@ cc_library(
+     name = "range_query_function",
+     srcs = ["range_query_function.cc"],
+     hdrs = ["range_query_function.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":range_minimum_query",
+         "//ortools/base",
+@@ -333,6 +358,7 @@ cc_library(
+ cc_library(
+     name = "rev",
+     hdrs = ["rev.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:map_util",
+@@ -343,6 +369,7 @@ cc_library(
+ cc_library(
+     name = "vector_or_function",
+     hdrs = ["vector_or_function.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -354,6 +381,7 @@ cc_library(
+     name = "qap_reader",
+     srcs = ["qap_reader.cc"],
+     hdrs = ["qap_reader.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/util:filelineiter",
+         "@com_google_absl//absl/strings",
+@@ -363,6 +391,7 @@ cc_library(
+ cc_library(
+     name = "sort",
+     hdrs = ["sort.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+     ],
+@@ -372,6 +401,7 @@ cc_library(
+     name = "file_util",
+     srcs = ["file_util.cc"],
+     hdrs = ["file_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:dump_vars",
+@@ -391,6 +421,7 @@ cc_library(
+ cc_library(
+     name = "random_engine",
+     hdrs = ["random_engine.h"],
++    features = ["-layering_check"],
+     deps = [],
+ )
+ 
+@@ -398,6 +429,7 @@ cc_library(
+     name = "string_util",
+     srcs = ["string_util.cc"],
+     hdrs = ["string_util.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/strings",
+@@ -408,12 +440,14 @@ cc_library(
+ cc_library(
+     name = "adaptative_parameter_value",
+     hdrs = ["adaptative_parameter_value.h"],
++    features = ["-layering_check"],
+     deps = ["//ortools/base"],
+ )
+ 
+ cc_library(
+     name = "lazy_mutable_copy",
+     hdrs = ["lazy_mutable_copy.h"],
++    features = ["-layering_check"],
+     deps = ["@com_google_absl//absl/memory"],
+ )
+ 
+@@ -421,6 +455,7 @@ cc_library(
+     name = "logging",
+     srcs = ["logging.cc"],
+     hdrs = ["logging.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:timer",
+@@ -431,11 +466,13 @@ cc_library(
+ cc_library(
+     name = "testing_utils",
+     hdrs = ["testing_utils.h"],
++    features = ["-layering_check"],
+ )
+ 
+ cc_library(
+     name = "strong_integers",
+     hdrs = ["strong_integers.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "@com_google_absl//absl/strings",
+@@ -445,6 +482,7 @@ cc_library(
+ cc_library(
+     name = "status_macros",
+     hdrs = ["status_macros.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:status_macros",
+         "@com_google_absl//absl/status",
+@@ -455,6 +493,7 @@ cc_library(
+     name = "fp_roundtrip_conv",
+     srcs = ["fp_roundtrip_conv.cc"],
+     hdrs = ["fp_roundtrip_conv.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:status_builder",
+@@ -468,6 +507,7 @@ cc_library(
+ cc_library(
+     name = "flat_matrix",
+     hdrs = ["flat_matrix.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/types:span",
+     ],
+@@ -477,6 +517,7 @@ cc_library(
+     name = "fp_roundtrip_conv_testing",
+     testonly = 1,
+     hdrs = ["fp_roundtrip_conv_testing.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/strings",
+     ],
+@@ -486,6 +527,7 @@ cc_library(
+     name = "aligned_memory",
+     srcs = ["aligned_memory_internal.h"],
+     hdrs = ["aligned_memory.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base:mathutil",
+     ],
+@@ -495,6 +537,7 @@ cc_library(
+     name = "vector_sum",
+     srcs = ["vector_sum_internal.h"],
+     hdrs = ["vector_sum.h"],
++    features = ["-layering_check"],
+     deps = [
+         ":aligned_memory",
+         "@com_google_absl//absl/base:core_headers",
+@@ -506,6 +549,7 @@ cc_library(
+     name = "parse_proto",
+     srcs = ["parse_proto.cc"],
+     hdrs = ["parse_proto.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/strings",
+         "@com_google_protobuf//:protobuf",
+@@ -516,6 +560,7 @@ cc_library(
+     name = "solve_interrupter",
+     srcs = ["solve_interrupter.cc"],
+     hdrs = ["solve_interrupter.h"],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:intops",
+@@ -529,6 +574,7 @@ cc_library(
+ cc_library(
+     name = "dense_set",
+     hdrs = ["dense_set.h"],
++    features = ["-layering_check"],
+     deps = [
+         "@com_google_absl//absl/log:check",
+         "@com_google_absl//absl/types:span",
+diff --git a/ortools/util/python/BUILD.bazel b/ortools/util/python/BUILD.bazel
+index 925cf57..765f573 100644
+--- a/ortools/util/python/BUILD.bazel
++++ b/ortools/util/python/BUILD.bazel
+@@ -21,6 +21,7 @@ load("@rules_python//python:defs.bzl", "py_test")
+ cc_library(
+     name = "sorted_interval_list_doc",
+     hdrs = ["sorted_interval_list_doc.h"],
++    features = ["-layering_check"],
+     visibility = ["//visibility:public"],
+ )
+ 
+diff --git a/ortools/xpress/BUILD.bazel b/ortools/xpress/BUILD.bazel
+index 22b6ed9..a86bc1d 100644
+--- a/ortools/xpress/BUILD.bazel
++++ b/ortools/xpress/BUILD.bazel
+@@ -21,6 +21,7 @@ cc_library(
+     hdrs = [
+         "environment.h",
+     ],
++    features = ["-layering_check"],
+     deps = [
+         "//ortools/base",
+         "//ortools/base:dynamic_library",
diff --git a/third_party/xla/third_party/py/python_init_pip.bzl b/third_party/xla/third_party/py/python_init_pip.bzl
index 7689b92b60a00a..39901b9b2e64ea 100644
--- a/third_party/xla/third_party/py/python_init_pip.bzl
+++ b/third_party/xla/third_party/py/python_init_pip.bzl
@@ -24,6 +24,10 @@ cc_library(
 cc_library(
     name = "numpy_headers",
     deps = [":numpy_headers_2", ":numpy_headers_1"],
+    # For the layering check to work we need to re-export the headers from the
+    # dependencies.
+    hdrs = glob(["site-packages/numpy/_core/include/**/*.h"]) +
+           glob(["site-packages/numpy/core/include/**/*.h"]),
 )
 """,
         ),
diff --git a/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD b/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD
index 11795b3537e7a9..1e52bb31c540fc 100644
--- a/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD
+++ b/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD
@@ -24,6 +24,7 @@ cc_binary(
         "@llvm-project//llvm:Core",
         "@llvm-project//llvm:IRReader",
         "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:config",
     ],
 )
 
diff --git a/third_party/xla/third_party/xnnpack/layering_check_fix.patch b/third_party/xla/third_party/xnnpack/layering_check_fix.patch
new file mode 100644
index 00000000000000..1e663770ba66e5
--- /dev/null
+++ b/third_party/xla/third_party/xnnpack/layering_check_fix.patch
@@ -0,0 +1,12 @@
+diff --git a/ynnpack/kernels/unary/BUILD b/ynnpack/kernels/unary/BUILD
+index 9c46262..2e5ac81 100644
+--- a/ynnpack/kernels/unary/BUILD
++++ b/ynnpack/kernels/unary/BUILD
+@@ -197,6 +197,7 @@ ynn_cc_library(
+         "x86_avx.inc",
+         "x86_avx2.inc",
+         "x86_avx512f.inc",
++        "x86_f16c.inc",
+         "x86_fma3.inc",
+         "x86_sse2.inc",
+         "x86_sse41.inc",
\ No newline at end of file
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index 129071d9fa793f..f67720ec702007 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -9,5 +9,6 @@ def repo():
         sha256 = "a89879422c6da8240cffb8ff67f5cd11f0362cb2a174ee9cd96b450e53902ca3",
         strip_prefix = "XNNPACK-77468446ebfd9baab7fc4349c32608c9675cf6d9",
         urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/77468446ebfd9baab7fc4349c32608c9675cf6d9.zip"),
+        patch_file = ["//third_party/xnnpack:layering_check_fix.patch"],
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
diff --git a/third_party/xla/third_party/zlib.BUILD b/third_party/xla/third_party/zlib.BUILD
index bab615b54dfd50..ae7a59d28af143 100644
--- a/third_party/xla/third_party/zlib.BUILD
+++ b/third_party/xla/third_party/zlib.BUILD
@@ -27,11 +27,13 @@ cc_library(
         "trees.c",
         "trees.h",
         "uncompr.c",
-        "zconf.h",
         "zutil.c",
         "zutil.h",
     ],
-    hdrs = ["zlib.h"],
+    hdrs = [
+        "zconf.h",
+        "zlib.h",
+    ],
     copts = select({
         "@local_xla//xla/tsl:windows": [],
         "//conditions:default": [
diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl
index 9438e40f32fe26..8430d9842b1c0c 100644
--- a/third_party/xla/workspace2.bzl
+++ b/third_party/xla/workspace2.bzl
@@ -316,9 +316,7 @@ def _tf_repositories():
         sha256 = "f253ca1a07262f8efde8328e4b2c68979e40ddfcfc001f70d1d5f612c7de2974",
         strip_prefix = "googletest-28e9d1f26771c6517c3b4be10254887673c94018",
         # Patch googletest to:
-        #   - avoid dependencies on @fuchsia_sdk,
-        #   - refer to re2 as @com_googlesource_code_re2,
-        #   - refer to abseil as @com_google_absl.
+        #   - make the gtest_main target export the gtest.h header.
         #   - add status assert macros for consistency with internal gmock (see
         #     README.add-status-macros.md).
         #
@@ -339,6 +337,10 @@ def _tf_repositories():
             "//third_party/googletest:0002-Rename-dependencies-for-workspace.bzl-build.patch",
         ],
         urls = tf_mirror_urls("https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c940189.zip"),
+        repo_mapping = {
+            "@abseil-cpp": "@com_google_absl",
+            "@re2": "@com_googlesource_code_re2",
+        },
     )
 
     tf_http_archive(
@@ -436,6 +438,7 @@ def _tf_repositories():
         sha256 = "9dc53f851107eaf87b391136d13b815df97ec8f76dadb487b58b2fc45e624d2c",
         strip_prefix = "boringssl-c00d7ca810e93780bd0c8ee4eea28f4f2ea4bcdc",
         system_build_file = "//third_party:boringssl.BUILD",
+        patch_file = ["//third_party:boringssl.patch"],
         urls = tf_mirror_urls("https://github.com/google/boringssl/archive/c00d7ca810e93780bd0c8ee4eea28f4f2ea4bcdc.tar.gz"),
     )
 
@@ -443,7 +446,14 @@ def _tf_repositories():
         name = "com_google_ortools",
         sha256 = "f6a0bd5b9f3058aa1a814b798db5d393c31ec9cbb6103486728997b49ab127bc",
         strip_prefix = "or-tools-9.11",
-        patch_file = ["//third_party/ortools:ortools.patch"],
+        patch_file = [
+            "//third_party/ortools:ortools.patch",
+            # On a version upgrade, this patch can be regenerated with the command:
+            # third_party/gen_disable_layering_check_patch.sh \
+            #   https://github.com/google/or-tools/archive/v9.11.tar.gz \
+            #   > third_party/ortools/layering_check.patch
+            "//third_party/ortools:layering_check.patch",
+        ],
         urls = tf_mirror_urls("https://github.com/google/or-tools/archive/v9.11.tar.gz"),
         repo_mapping = {
             "@com_google_protobuf_cc": "@com_google_protobuf",

From a61541785e664c7edb19ec1ce419e5fc63e9ef62 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 00:51:48 -0800
Subject: [PATCH 312/753] Automated Code Change

PiperOrigin-RevId: 845131635
---
 third_party/xla/xla/python/ifrt/BUILD  | 1 +
 third_party/xla/xla/python/ifrt/mock.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/python/ifrt/BUILD b/third_party/xla/xla/python/ifrt/BUILD
index 59ee58576058f8..2951220554eb77 100644
--- a/third_party/xla/xla/python/ifrt/BUILD
+++ b/third_party/xla/xla/python/ifrt/BUILD
@@ -591,6 +591,7 @@ cc_library(
         "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/framework:allocator",
         "@com_google_absl//absl/base:no_destructor",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/hash",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/python/ifrt/mock.h b/third_party/xla/xla/python/ifrt/mock.h
index 6e73a68ba80917..7da107a1b79f01 100644
--- a/third_party/xla/xla/python/ifrt/mock.h
+++ b/third_party/xla/xla/python/ifrt/mock.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/no_destructor.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/hash/hash.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"

From 213685f0d45900480191a733b7cd480331f498d0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 01:03:06 -0800
Subject: [PATCH 313/753] Update GraphDef version to 2443.

PiperOrigin-RevId: 845135637
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 5d23b83edfa05e..a94b3e73fba6ba 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2442  // Updated: 2025/12/15
+#define TF_GRAPH_DEF_VERSION 2443  // Updated: 2025/12/16
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 185312b5bdef02a8f2a746207ea7c011d902b1fa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 01:03:07 -0800
Subject: [PATCH 314/753] compat: Update forward compatibility horizon to
 2025-12-16

PiperOrigin-RevId: 845135652
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index e7e8921efcb85a..397612746c514a 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 15)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 16)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From f934657f5bc40e5ab0abec81d6ab906c34c733b4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 01:33:47 -0800
Subject: [PATCH 315/753] Automated Code Change

PiperOrigin-RevId: 845145482
---
 .../passes/bridge/convert_tf_quant_to_mhlo_int_test.cc        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
index 0818c8013e534e..5ce7217927771b 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
@@ -180,7 +180,7 @@ class ConvertTfQuantToMhloIntTest : public Test {
             /*byte_strides=*/std::nullopt, host_buffer_semantics,
             /*on_done_with_host_buffer=*/nullptr,
             *device_->default_memory_space(), /*device_layout=*/nullptr));
-    return buffer->ToLiteralSync();
+    return buffer->ToLiteral().Await();
   }
 
   absl::StatusOr<std::unique_ptr<xla::PjRtLoadedExecutable>> CompileProgram(
@@ -220,7 +220,7 @@ class ConvertTfQuantToMhloIntTest : public Test {
     TF_ASSIGN_OR_RETURN(auto result,
                         executable->Execute({buffer_ptrs}, /*options=*/{}));
     CHECK(result.size() == 1 && result[0].size() == 1);
-    return result[0][0]->ToLiteralSync();
+    return result[0][0]->ToLiteral().Await();
   }
 
   void ExecuteAndCompareResultsWithTfKernel(

From 8e1157f4df64f86fedd7f6cca539de42e9116885 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 01:56:55 -0800
Subject: [PATCH 316/753] Automated Code Change

PiperOrigin-RevId: 845153877
---
 .../core/runtime_fallback/kernel/attr_util.cc | 10 +++----
 .../core/runtime_fallback/kernel/attr_util.h  |  4 +--
 .../runtime_fallback/kernel/attr_util_test.cc | 28 +++++++++----------
 .../kernel/kernel_fallback_execute_compat.cc  | 13 +++++----
 .../kernel/kernel_fallback_kernels.cc         |  2 +-
 .../runtime_fallback/kernel/tfrt_op_kernel.cc |  8 +++---
 .../runtime_fallback/kernel/tfrt_op_kernel.h  |  8 +++---
 .../kernel/tfrt_op_kernel_test.cc             | 18 ++++++------
 8 files changed, 46 insertions(+), 45 deletions(-)

diff --git a/tensorflow/core/runtime_fallback/kernel/attr_util.cc b/tensorflow/core/runtime_fallback/kernel/attr_util.cc
index 82bb7ce1b89b57..3c319e09e0e137 100644
--- a/tensorflow/core/runtime_fallback/kernel/attr_util.cc
+++ b/tensorflow/core/runtime_fallback/kernel/attr_util.cc
@@ -72,7 +72,7 @@ absl::Status ParseValue(absl::string_view input, bool* value) {
   return absl::OkStatus();
 }
 
-absl::Status ParseValue(absl::string_view input, int32* value) {
+absl::Status ParseValue(absl::string_view input, int32_t* value) {
   bool parse_result = absl::SimpleAtoi(input, value);
   if (!parse_result) {
     return errors::InvalidArgument("Could not parse int32 from ", input);
@@ -90,7 +90,7 @@ absl::Status ParseValue(absl::string_view input, std::string* value) {
   return absl::OkStatus();
 }
 
-absl::Status ParseValue(absl::string_view input, std::vector<int32>* value) {
+absl::Status ParseValue(absl::string_view input, std::vector<int32_t>* value) {
   std::vector<std::string> parts = str_util::Split(input, ",");
   value->reserve(parts.size());
   for (const auto& value_str : parts) {
@@ -123,7 +123,7 @@ absl::Status AddOpAttr(const std::string& name, const std::string& attr_value,
   } else if (type == "i32") {
     int32_t val;
     s = ParseValue(value, &val);
-    opattrs->Set<int32>(name, val);
+    opattrs->Set<int32_t>(name, val);
   } else if (type == "string" || type == "padding") {
     std::string val;
     s = ParseValue(value, &val);
@@ -133,9 +133,9 @@ absl::Status AddOpAttr(const std::string& name, const std::string& attr_value,
     s = ParseValue(value, &val);
     opattrs->Set<tfrt::OpAttrType>(name, tfd::ConvertFromTfDataType(val));
   } else if (type == "list(i32)") {
-    std::vector<int32> val;
+    std::vector<int32_t> val;
     s = ParseValue(value, &val);
-    opattrs->SetArray<int32>(name, val);
+    opattrs->SetArray<int32_t>(name, val);
   }
   return s;
 }
diff --git a/tensorflow/core/runtime_fallback/kernel/attr_util.h b/tensorflow/core/runtime_fallback/kernel/attr_util.h
index 4abbb4f8b31c58..41f0e657d6b1af 100644
--- a/tensorflow/core/runtime_fallback/kernel/attr_util.h
+++ b/tensorflow/core/runtime_fallback/kernel/attr_util.h
@@ -38,10 +38,10 @@ typedef llvm::StringMap<std::string> AttrMap;
 
 // Parse value from the given string input.
 absl::Status ParseValue(absl::string_view input, bool* value);
-absl::Status ParseValue(absl::string_view input, int32* value);
+absl::Status ParseValue(absl::string_view input, int32_t* value);
 absl::Status ParseValue(absl::string_view input, DataType* value);
 absl::Status ParseValue(absl::string_view input, std::string* value);
-absl::Status ParseValue(absl::string_view input, std::vector<int32>* value);
+absl::Status ParseValue(absl::string_view input, std::vector<int32_t>* value);
 absl::Status ParseValue(absl::string_view input, Padding* value);
 
 absl::Status AddOpAttr(const std::string& name, const std::string& attr_value,
diff --git a/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc b/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc
index 79d80b13ff501a..e6975350c55da4 100644
--- a/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc
+++ b/tensorflow/core/runtime_fallback/kernel/attr_util_test.cc
@@ -47,9 +47,9 @@ TEST(AttrUtilTest, TestGetIntAttr) {
   TF_ASSERT_OK(AddOpAttr("bar", "i32$0", &opattrs));
   TF_ASSERT_OK(AddOpAttr("baz", "i32$123", &opattrs));
 
-  ASSERT_EQ(opattrs.GetAsserting<int32>("foo"), -2);
-  ASSERT_EQ(opattrs.GetAsserting<int32>("bar"), 0);
-  ASSERT_EQ(opattrs.GetAsserting<int32>("baz"), 123);
+  ASSERT_EQ(opattrs.GetAsserting<int32_t>("foo"), -2);
+  ASSERT_EQ(opattrs.GetAsserting<int32_t>("bar"), 0);
+  ASSERT_EQ(opattrs.GetAsserting<int32_t>("baz"), 123);
 
   absl::Status s = AddOpAttr("invalid", "i32$4.5", &opattrs);
   ASSERT_FALSE(s.ok());
@@ -71,17 +71,17 @@ TEST(AttrUtilTest, TestGetIntListAttr) {
   TF_ASSERT_OK(AddOpAttr("baz", "list(i32)$1,2,3", &opattrs));
 
   // std::vector<int32> v1, v2, v3;
-  ArrayRef<int32> v1, v2, v3;
-  std::vector<int32> expected_v1;
-  std::vector<int32> expected_v2 = {1};
-  std::vector<int32> expected_v3 = {1, 2, 3};
-  ArrayRef<int32> expected_v1_ref(expected_v1);
-  ArrayRef<int32> expected_v2_ref(expected_v2);
-  ArrayRef<int32> expected_v3_ref(expected_v3);
-
-  ASSERT_TRUE(opattrs.GetArray<int32>("foo", &v1));
-  ASSERT_TRUE(opattrs.GetArray<int32>("bar", &v2));
-  ASSERT_TRUE(opattrs.GetArray<int32>("baz", &v3));
+  ArrayRef<int32_t> v1, v2, v3;
+  std::vector<int32_t> expected_v1;
+  std::vector<int32_t> expected_v2 = {1};
+  std::vector<int32_t> expected_v3 = {1, 2, 3};
+  ArrayRef<int32_t> expected_v1_ref(expected_v1);
+  ArrayRef<int32_t> expected_v2_ref(expected_v2);
+  ArrayRef<int32_t> expected_v3_ref(expected_v3);
+
+  ASSERT_TRUE(opattrs.GetArray<int32_t>("foo", &v1));
+  ASSERT_TRUE(opattrs.GetArray<int32_t>("bar", &v2));
+  ASSERT_TRUE(opattrs.GetArray<int32_t>("baz", &v3));
   ASSERT_EQ(v1, expected_v1_ref);
   ASSERT_EQ(v2, expected_v2_ref);
   ASSERT_EQ(v3, expected_v3_ref);
diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc
index b496e1924107d9..2bab64c6a02ac6 100644
--- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc
+++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc
@@ -427,8 +427,9 @@ TF_ATTRIBUTE_ALWAYS_INLINE static void KernelFallbackExecuteOpInternal(
       [&]() { return GetTracingMetadata(args, exec_ctx, kernel_runner); });
 
   if (fallback_request_state.log_device_placement() || VLOG_IS_ON(1)) {
-    string msg = absl::StrCat("Executing op ", frame.op_name().GetValue().str(),
-                              " in device ", frame.device().GetValue().str());
+    std::string msg =
+        absl::StrCat("Executing op ", frame.op_name().GetValue().str(),
+                     " in device ", frame.device().GetValue().str());
     if (!logging::LogToListeners(msg)) {
       LOG(INFO) << msg;
     }
@@ -865,10 +866,10 @@ llvm::Expected<bool> Predicate(
 
       CASE(float);
       CASE(double);
-      CASE(uint8);
-      CASE(int8);
-      CASE(int16);
-      CASE(int32);
+      CASE(uint8_t);
+      CASE(int8_t);
+      CASE(int16_t);
+      CASE(int32_t);
       CASE(int64_t);
       CASE(bool);
 #undef CASE
diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc
index da93625c5111c2..b93902e576ddd6 100644
--- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc
+++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_kernels.cc
@@ -77,7 +77,7 @@ static void TFDConstantTensor(tfrt::Argument<int32_t> value,
   // it causes a missing typeinfo error when using -fno-rtti. Investigate
   // if we can make it work with no-rtti.
   Tensor out(DT_INT32, TensorShape({}));
-  out.flat<int32>()(0) = value.get();
+  out.flat<int32_t>()(0) = value.get();
   tensor.Emplace(out);
 }
 
diff --git a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc
index 41e7cfae0637e7..c26dae601b69fe 100644
--- a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc
+++ b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.cc
@@ -81,9 +81,9 @@ absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name,
 
 template <>
 absl::Status TFRTOpKernelConstruction::GetAttr(
-    absl::string_view attr_name, std::vector<int32>* value) const {
-  llvm::ArrayRef<int32> arrayref;
-  bool success = attributes_.GetArray<int32>(
+    absl::string_view attr_name, std::vector<int32_t>* value) const {
+  llvm::ArrayRef<int32_t> arrayref;
+  bool success = attributes_.GetArray<int32_t>(
       llvm::StringRef(attr_name.data(), attr_name.size()), &arrayref);
   if (!success) {
     return MissingAttributeError(attr_name);
@@ -239,7 +239,7 @@ TFRTOpMetaBuilder& TFRTOpMetaBuilder::Attr(absl::string_view attr_spec) {
   return *this;
 }
 
-const string& TFRTOpMetaBuilder::op_name() const { return op_name_; }
+const std::string& TFRTOpMetaBuilder::op_name() const { return op_name_; }
 
 TFRTOpMeta TFRTOpMetaBuilder::BuildMeta() const {
   return TFRTOpMeta(output_types_);
diff --git a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h
index e370fde54e23db..e06a0f13f3ec2b 100644
--- a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h
+++ b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel.h
@@ -100,8 +100,8 @@ absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name,
                                                Padding* value) const;
 
 template <>
-absl::Status TFRTOpKernelConstruction::GetAttr(absl::string_view attr_name,
-                                               std::vector<int32>* value) const;
+absl::Status TFRTOpKernelConstruction::GetAttr(
+    absl::string_view attr_name, std::vector<int32_t>* value) const;
 
 absl::Status MissingAttributeError(absl::string_view attr_name);
 
@@ -207,11 +207,11 @@ class TFRTOpMetaBuilder {
   TFRTOpMetaBuilder& Input(absl::string_view input_spec);
   TFRTOpMetaBuilder& Attr(absl::string_view attr_spec);
 
-  const string& op_name() const;
+  const std::string& op_name() const;
   TFRTOpMeta BuildMeta() const;
 
  private:
-  string op_name_;
+  std::string op_name_;
   std::vector<DataType> output_types_;
 };
 
diff --git a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc
index 5c99d39745c519..3b96ce59d9335d 100644
--- a/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc
+++ b/tensorflow/core/runtime_fallback/kernel/tfrt_op_kernel_test.cc
@@ -56,7 +56,7 @@ TEST(TFRTOpKernelTest, TestGetBoolAttr) {
 
 TEST(TFRTOpKernelTest, TestGetIntAttr) {
   tfrt::OpAttrs attrs;
-  attrs.Set<int32>("foo", -2);
+  attrs.Set<int32_t>("foo", -2);
   tfrt::OpAttrsRef attrsref(attrs);
 
   TFRTOpKernelConstruction ctx(attrsref);
@@ -68,18 +68,18 @@ TEST(TFRTOpKernelTest, TestGetIntAttr) {
 
 TEST(TFRTOpKernelTest, TestGetIntListAttr) {
   tfrt::OpAttrs attrs;
-  attrs.SetArray<int32>("foo", {});
-  attrs.SetArray<int32>("bar", {1});
-  attrs.SetArray<int32>("baz", {1, 2, 3});
+  attrs.SetArray<int32_t>("foo", {});
+  attrs.SetArray<int32_t>("bar", {1});
+  attrs.SetArray<int32_t>("baz", {1, 2, 3});
   attrs.SetString("bar", "test");
   tfrt::OpAttrsRef attrsref(attrs);
 
   TFRTOpKernelConstruction ctx(attrsref);
 
-  std::vector<int32> v1, v2, v3;
-  std::vector<int32> expected_v1;
-  std::vector<int32> expected_v2 = {1};
-  std::vector<int32> expected_v3 = {1, 2, 3};
+  std::vector<int32_t> v1, v2, v3;
+  std::vector<int32_t> expected_v1;
+  std::vector<int32_t> expected_v2 = {1};
+  std::vector<int32_t> expected_v3 = {1, 2, 3};
   TF_ASSERT_OK(ctx.GetAttr("foo", &v1));
   ASSERT_EQ(v1, expected_v1);
   TF_ASSERT_OK(ctx.GetAttr("bar", &v2));
@@ -217,7 +217,7 @@ TEST(TFRTOpKernelTest, TestAllocateTemp) {
   ASSERT_EQ(out.AllocatedBytes(), 0);
   TF_EXPECT_OK(ctx.allocate_temp(DT_INT32, {}, &out));
   ASSERT_GT(out.AllocatedBytes(), 0);
-  out.scalar<int32>()() = 123;
+  out.scalar<int32_t>()() = 123;
   ASSERT_EQ(out.dtype(), DT_INT32);
   ASSERT_EQ(out.shape().dims(), 0);
 }

From 2b45283df5413f01a9f14832c3fe9d6520a9e852 Mon Sep 17 00:00:00 2001
From: Olli Lupton <olupton@nvidia.com>
Date: Tue, 16 Dec 2025 02:01:01 -0800
Subject: [PATCH 317/753] PR #35331: Make autotuner annotation more consistent
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35331

📝 Summary of Changes
Consistent annotations are now emitted in `autotuner.cc` and `gemm_fusion_autotuner.cc`:
https://github.com/openxla/xla/blob/a9e717e30ab1d7d87ce5aa9f04424af41da8c56c/xla/service/gpu/autotuning/gemm_fusion_autotuner.cc#L1050-L1052

🎯 Justification
This makes it easier to see autotuner compilation in the main thread timeline, as otherwise the compilation is hidden in worker threads.

🚀 Kind of Contribution
♻️ Cleanup

📊 Benchmark (for Performance Improvements)
n/a

🧪 Unit Tests:
n/a

🧪 Execution Tests:
n/a
Copybara import of the project:

--
4a05ad0df73cfc1da1219a511df2c2b93b5a0380 by Olli Lupton <olupton@nvidia.com>:

Make autotuner annotation more consistent

autotuner.cc now emits the same annotations as gemm_fusion_autotuner.cc.
This makes it easier to see autotuner compilation in the main thread
timeline, as otherwise the compilation is hidden in worker threads.

Merging this change closes #35331

PiperOrigin-RevId: 845155316
---
 third_party/xla/xla/backends/autotuner/BUILD        | 2 ++
 third_party/xla/xla/backends/autotuner/autotuner.cc | 5 +++++
 2 files changed, 7 insertions(+)

diff --git a/third_party/xla/xla/backends/autotuner/BUILD b/third_party/xla/xla/backends/autotuner/BUILD
index b9c1637e90d582..eedb86819b4f3b 100644
--- a/third_party/xla/xla/backends/autotuner/BUILD
+++ b/third_party/xla/xla/backends/autotuner/BUILD
@@ -63,6 +63,8 @@ cc_library(
         "@local_tsl//tsl/platform:blocking_counter",
         "@local_tsl//tsl/platform:fingerprint",
         "@local_tsl//tsl/platform:path",
+        "@local_tsl//tsl/profiler/lib:scoped_annotation",
+        "@local_tsl//tsl/profiler/lib:traceme",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/autotuner/autotuner.cc b/third_party/xla/xla/backends/autotuner/autotuner.cc
index 2578a24a614d8f..40a0f43608e097 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner.cc
@@ -57,6 +57,8 @@ limitations under the License.
 #include "xla/tsl/util/proto/proto_utils.h"
 #include "tsl/platform/blocking_counter.h"
 #include "tsl/platform/fingerprint.h"
+#include "tsl/profiler/lib/scoped_annotation.h"
+#include "tsl/profiler/lib/traceme.h"
 
 namespace xla {
 
@@ -439,6 +441,9 @@ absl::StatusOr<std::vector<Autotuner::Config>> Autotuner::GetSupportedConfigs(
 
 std::vector<absl::StatusOr<std::unique_ptr<Executable>>> Autotuner::CompileAll(
     HloInstruction* instr, std::vector<Config>& configs) {
+  XLA_SCOPED_LOGGING_TIMER_LEVEL("CompileAll", 5);
+  tsl::profiler::TraceMe traceme("CompileAll");
+  tsl::profiler::ScopedAnnotation annotation("XlaAutotunerCompilation");
   if (thread_pool_ == nullptr) {
     std::vector<absl::StatusOr<std::unique_ptr<Executable>>> executables;
     executables.reserve(configs.size());

From a3de89535cf7f0bf9562864c05b59569ae9638b2 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 16 Dec 2025 02:01:04 -0800
Subject: [PATCH 318/753] PR #35309: [xla:gpu] Include NCCL version into log
 messages for debugging

Imported from GitHub PR https://github.com/openxla/xla/pull/35309

Include NCCL version into log messages for debugging
Copybara import of the project:

--
949bdacdcd970f18f7a60c1a8952fa50bf7d4e85 by Eugene Zhulenev <ezv@amazon.com>:

[xla:gpu] Include NCCL version into log messages for debugging:

Merging this change closes #35309

PiperOrigin-RevId: 845155328
---
 .../xla/xla/backends/gpu/collectives/nccl_collectives.cc      | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
index bb9397f99df70c..65ad3cdb8cd54f 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
@@ -159,7 +159,9 @@ NcclCollectives::CreateCommunicatorsWithCancel(
     return InvalidArgument(
         "CliqueIds size must be 1 for NCCL communicator initialization");
   }
-  VLOG(1) << "Initialize NCCL communicator for " << ranks.size() << " devices"
+  VLOG(1) << "Initialize NCCL (version "
+          << absl::StrCat(NCCL_MAJOR, ".", NCCL_MINOR, ".", NCCL_PATCH)
+          << ") communicator for " << ranks.size() << " devices"
           << "; fingerprint(id)=" << clique_ids->fingerprint();
 
   const auto& gpu_config =

From 96ed126a594735c8106699b4f9c2ac8f74798242 Mon Sep 17 00:00:00 2001
From: Sevin Fide Varoglu <svaroglu@nvidia.com>
Date: Tue, 16 Dec 2025 02:02:34 -0800
Subject: [PATCH 319/753] PR #32297: [XLA:GPU] Add host offloading support to
 collective pipeliner
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/32297

📝 Summary of Changes
1. Add host offloading support to collective pipeliner to enable memory optimization for collective operations by offloading intermediate results to host memory.

2. Add dynamic variable detection for CollectivePipeliner-transformed loops to enable FusionDynamicMemcpyRewriter optimization, allowing better memory management for dynamically shaped computations.

🎯 Justification
When host memory offloading is enabled, activations are copied asynchronously; device to host in the forward pass and back for the backward pass. These copies often happen at loop boundaries, limiting overlap with compute. This PR adds host offloading support to collective pipeliner so that we can move these copies to nearby iterations to reduce dependencies and improve overlap. This feature is enabled via `xla_gpu_enable_pipelined_host_offloading` flag.

🚀 Kind of Contribution
⚡️ Performance Improvement, ✨ New Feature

📊 Benchmark (for Performance Improvements)
The changes in this PR are gated by a flag. The HLOs under `xla/tools/benchmarks/hlo` don't affect offloading behavior, so we shouldn't expect any impact.

🧪 Unit Tests:
Added unit tests.

🧪 Execution Tests:
Saw ~12% speedup on GB200 with llama3-8b, fsdp=8 using MLP offloading plus collective pipelining, compared to just offloading.
Copybara import of the project:

--
996c7cfc0aa847fd49b72d839a69195d4e88f3b4 by Sevin Varoglu <svaroglu@nvidia.com>:

[XLA:GPU] Add host offloading support to collective pipeliner

1. Add host offloading support to collective pipeliner to enable memory
   optimization for collective operations by offloading intermediate results
   to host memory.

2. Add dynamic variable detection for CollectivePipeliner-transformed loops
   to enable FusionDynamicMemcpyRewriter optimization, allowing better
   memory management for dynamically shaped computations.

--
2f402d6af17e0590620fdc74bbe5af55ab1be0ec by Sevin Varoglu <svaroglu@nvidia.com>:

Add HLO file

--
b98ed4c0194c4c387629053bd5d8608fd5846442 by Sevin Varoglu <svaroglu@nvidia.com>:

Add fix to preserve dynamic_variable_tuple_indices after loop unrolling

--
f599c596b4a53e6deb9adc7a01822c59092ec2aa by Sevin Varoglu <svaroglu@nvidia.com>:

Fix format

--
5e6f58087ad0561aff1a53bbad8267b1865447db by Sevin Varoglu <svaroglu@nvidia.com>:

Add review feedback

--
0b602e197b835dfc6e3520fcf5a0212f26e8cb37 by Sevin Varoglu <svaroglu@nvidia.com>:

Fix clang format error

--
506f3b29e70af57c0862bdefaca2f544fe44a333 by Sevin Varoglu <svaroglu@nvidia.com>:

Incorporate review feedback

--
9bdb3d1d60b38e9e0fd46d729a422f850e5ca32e by Sevin Varoglu <svaroglu@nvidia.com>:

Address review feedback

--
599ebe31aea7de80f7b647b9d0d5e7fe8b8beb69 by Sevin Varoglu <svaroglu@nvidia.com>:

Fix format

--
e0b67de9064f331a02640d9a8f78125c2ad35f1c by Sevin Varoglu <svaroglu@nvidia.com>:

Address feedback

--
190ae1ce1c0e69e6116f1b53dd5cdee83bf4e7f4 by Sevin Varoglu <svaroglu@nvidia.com>:

Add tests

--
d9a68771aa14ec880ef4065018c2cb949a192b00 by Sevin Varoglu <svaroglu@nvidia.com>:

Fix compile error

--
d4c033184fc95701154422853e3a4f4abf6a2cfb by Sevin Varoglu <svaroglu@nvidia.com>:

Add find_dynamic_slice_operand

--
f53d220213028673304a44027dfa96594546ab3d by Sevin Varoglu <svaroglu@nvidia.com>:

Add postprocess_transformed_while_loop

--
02b44c60bf6a91eb1938e43a6de7e7d9127fadaa by Sevin Varoglu <svaroglu@nvidia.com>:

Rename as additional_chain_start_op_finder, fix tests

--
10c716382cc95fff0bb60dfe4c6a5ba6083bf130 by Sevin Varoglu <svaroglu@nvidia.com>:

Remove visited_set

--
504af7c277bec4cc88e7f2d5940bc7fe3746369c by Sevin Varoglu <svaroglu@nvidia.com>:

Fix format

--
e2f0c909d098400a22466de56952476610b9d08c by Sevin Varoglu <svaroglu@nvidia.com>:

Update BUILD

--
73da8624fff1bda792e2f1c0360fb6fa50e9ec14 by Sevin Varoglu <svaroglu@nvidia.com>:

Move flag

--
d1e3ddbc9cb196685b3a3182db191ee6d11cce18 by Sevin Varoglu <svaroglu@nvidia.com>:

Increment

--
c410305c267203896d85aff9be2e08adf28916ac by Sevin Varoglu <svaroglu@nvidia.com>:

Remove headers

--
38e1d58aa53b96fc9e000bbbe3bb0b8c20adb9ec by Sevin Varoglu <svaroglu@nvidia.com>:

Add header

Merging this change closes #32297

PiperOrigin-RevId: 845155965
---
 third_party/xla/docs/flags_guidance.md        |  33 +--
 .../xla/xla/backends/gpu/codegen/copy_test.cc |  67 ++++++
 third_party/xla/xla/debug_options_flags.cc    |   7 +
 third_party/xla/xla/service/BUILD             |   1 +
 .../xla/xla/service/collective_pipeliner.cc   |  94 +++++---
 .../xla/xla/service/collective_pipeliner.h    |  12 +
 .../xla/service/collective_pipeliner_test.cc  | 216 +++++++++++++++++-
 third_party/xla/xla/service/gpu/BUILD         |   2 +
 .../xla/xla/service/gpu/gpu_compiler.cc       | 105 +++++++++
 .../xla/xla/service/gpu/ir_emission_utils.cc  |  52 ++++-
 .../xla/service/gpu/ir_emission_utils_test.cc | 159 +++++++++++++
 .../gpu_collective_combiner_utils_test.cc     |   2 +
 .../double_buffer_loop_unrolling.cc           |  15 +-
 .../double_buffer_loop_unrolling_test.cc      |  91 +++++++-
 .../xla/xla/service/host_offload_utils.cc     |  83 +++++++
 .../xla/xla/service/host_offload_utils.h      |   4 +
 third_party/xla/xla/xla.proto                 |   4 +-
 third_party/xla/xla/xla_data.proto            |   4 +
 18 files changed, 881 insertions(+), 70 deletions(-)

diff --git a/third_party/xla/docs/flags_guidance.md b/third_party/xla/docs/flags_guidance.md
index c973a1a665ca7c..e2fff15a4cdcb1 100644
--- a/third_party/xla/docs/flags_guidance.md
+++ b/third_party/xla/docs/flags_guidance.md
@@ -81,19 +81,20 @@ data-parallel collectives (`xla_gpu_enable_pipelined_all_gather`,
 Hopper/Blackwell (`xla_gpu_enable_analytical_sol_latency_estimator`). See
 [GPU Effort Levels](https://openxla.org/xla/effort_levels) for details.
 
-| Flag | Type | Notes |
-| :---- | :---- | :----- |
-| `xla_gpu_enable_latency_hiding_scheduler` | Boolean (true/false) |This flag enables latency hiding schedulers to overlap asynchronous communication with computation efficiently. The default value is False. |
-| `xla_gpu_enable_analytical_sol_latency_estimator` | Boolean (true/false) | Enables platform specific scheduling decisions, which in turn improve compute-communication overlap. The default value is true. |
-| `xla_gpu_analytical_latency_estimator_options` | Structured string | Configures parameters for the `xla_gpu_enable_analytical_sol_latency_estimator`. Adjust by setting `nic_speed_gbps=$NIC_SPEED,nccl_op_launch_us=$LAUNCH_OVERHEAD,chunk_prep_us=$CHUNK_PREP,rtt_us=$RTT,chunk_size_bytes=$CHUNK_SIZE,gpus_per_node=$GPUS_PER_NODE`. The default value depends on a detected platform. |
-| `xla_gpu_enable_triton_gemm` | Boolean (true/false) | Use Triton-based matrix multiplication. |
-| `xla_gpu_enable_command_buffer` | List of CommandBufferCmdType | Which kind of commands should be captured in command buffers. |
-| `xla_gpu_all_reduce_combine_threshold_bytes` | Integer (bytes) | These flags tune when to combine multiple small AllGather / ReduceScatter / AllReduce into one big AllGather / ReduceScatter / AllReduce to reduce time spent on cross-device communication. For example, for the AllGather / ReduceScatter thresholds on a Transformer-based workload, consider tuning them high enough so as to combine at least a Transformer Layer’s weight AllGather / ReduceScatter. By default, the combine_threshold_bytes is set to 256. |
-| `xla_gpu_all_gather_combine_threshold_bytes` | Integer (bytes) | See xla_gpu_all_reduce_combine_threshold_bytes above. |
-| `xla_gpu_reduce_scatter_combine_threshold_bytes` | Integer (bytes) | See xla_gpu_all_reduce_combine_threshold_bytes above. |
-| `xla_gpu_enable_pipelined_all_gather` | Boolean (true/false) | Enable pipelinling of all-gather instructions. |
-| `xla_gpu_enable_pipelined_reduce_scatter` | Boolean (true/false) | Enable pipelinling of reduce-scatter instructions. |
-| `xla_gpu_enable_pipelined_all_reduce` | Boolean (true/false) | Enable pipelinling of all-reduce instructions. |
-| `xla_gpu_enable_while_loop_double_buffering` | Boolean (true/false) | Enable double-buffering for while loop. |
-| `xla_gpu_enable_all_gather_combine_by_dim` | Boolean (true/false) | Combine all-gather ops with the same gather dimension or irrespective of their dimension. |
-| `xla_gpu_enable_reduce_scatter_combine_by_dim` | Boolean (true/false) | Combine reduce-scatter ops with the same dimension or irrespective of their dimension. |
+Flag                                              | Type                         | Notes
+:------------------------------------------------ | :--------------------------- | :----
+`xla_gpu_enable_latency_hiding_scheduler`         | Boolean (true/false)         | This flag enables latency hiding schedulers to overlap asynchronous communication with computation efficiently. The default value is False.
+`xla_gpu_enable_analytical_sol_latency_estimator` | Boolean (true/false)         | Enables platform specific scheduling decisions, which in turn improve compute-communication overlap. The default value is true.
+`xla_gpu_analytical_latency_estimator_options`    | Structured string            | Configures parameters for the `xla_gpu_enable_analytical_sol_latency_estimator`. Adjust by setting `nic_speed_gbps=$NIC_SPEED,nccl_op_launch_us=$LAUNCH_OVERHEAD,chunk_prep_us=$CHUNK_PREP,rtt_us=$RTT,chunk_size_bytes=$CHUNK_SIZE,gpus_per_node=$GPUS_PER_NODE`. The default value depends on a detected platform.
+`xla_gpu_enable_triton_gemm`                      | Boolean (true/false)         | Use Triton-based matrix multiplication.
+`xla_gpu_enable_command_buffer`                   | List of CommandBufferCmdType | Which kind of commands should be captured in command buffers.
+`xla_gpu_all_reduce_combine_threshold_bytes`      | Integer (bytes)              | These flags tune when to combine multiple small AllGather / ReduceScatter / AllReduce into one big AllGather / ReduceScatter / AllReduce to reduce time spent on cross-device communication. For example, for the AllGather / ReduceScatter thresholds on a Transformer-based workload, consider tuning them high enough so as to combine at least a Transformer Layer’s weight AllGather / ReduceScatter. By default, the combine_threshold_bytes is set to 256.
+`xla_gpu_all_gather_combine_threshold_bytes`      | Integer (bytes)              | See xla_gpu_all_reduce_combine_threshold_bytes above.
+`xla_gpu_reduce_scatter_combine_threshold_bytes`  | Integer (bytes)              | See xla_gpu_all_reduce_combine_threshold_bytes above.
+`xla_gpu_enable_pipelined_all_gather`             | Boolean (true/false)         | Enable pipelinling of all-gather instructions.
+`xla_gpu_enable_pipelined_reduce_scatter`         | Boolean (true/false)         | Enable pipelinling of reduce-scatter instructions.
+`xla_gpu_enable_pipelined_all_reduce`             | Boolean (true/false)         | Enable pipelinling of all-reduce instructions.
+`xla_gpu_enable_pipelined_host_offloading`        | Boolean (true/false)         | Enable pipelining of host offloading instructions.
+`xla_gpu_enable_while_loop_double_buffering`      | Boolean (true/false)         | Enable double-buffering for while loop.
+`xla_gpu_enable_all_gather_combine_by_dim`        | Boolean (true/false)         | Combine all-gather ops with the same gather dimension or irrespective of their dimension.
+`xla_gpu_enable_reduce_scatter_combine_by_dim`    | Boolean (true/false)         | Combine reduce-scatter ops with the same dimension or irrespective of their dimension.
diff --git a/third_party/xla/xla/backends/gpu/codegen/copy_test.cc b/third_party/xla/xla/backends/gpu/codegen/copy_test.cc
index 6da38eae63927d..01e47f38e0ab1d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/copy_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/copy_test.cc
@@ -315,6 +315,73 @@ TEST_F(CopyFusionTest, BuildUpdateSliceDescriptor) {
   EXPECT_EQ(offset.byte_stride, 8 * 8 * sizeof(float));
 }
 
+TEST_F(CopyFusionTest, BuildDescriptorWithDynamicVariable) {
+  constexpr char kModuleWithDynamicVariable[] = R"(
+    dynamic_slice {
+      p0 = s32[4,8,8] parameter(0)
+      p1 = s32[1,1,8] parameter(1)
+      p2 = s32[] parameter(2)
+      c1 = s32[] constant(1)
+
+      ROOT update-slice = s32[4,8,8] dynamic-update-slice(p0, p1, p2, c1, c1)
+    }
+
+    body {
+      p0 = (s32[], s32[4,8,8], s32[]) parameter(0)
+      ivar = s32[] get-tuple-element(p0), index=0
+      input = s32[4,8,8] get-tuple-element(p0), index=1
+      dynamic_idx = s32[] get-tuple-element(p0), index=2
+      val = s32[1,1,8] constant({{{1,2,3,4,5,6,7,8}}})
+
+      updated = s32[4,8,8] fusion(input, val, dynamic_idx), kind=kLoop, calls=dynamic_slice,
+          backend_config={"fusion_backend_config":{"kind":"__dynamic_memcpy"}}
+      c1 = s32[] constant(1)
+      next_ivar = s32[] add(ivar, c1)
+      next_dynamic_idx = s32[] add(dynamic_idx, c1)
+
+      ROOT result = (s32[], s32[4,8,8], s32[])
+          tuple(next_ivar, updated, next_dynamic_idx)
+    }
+
+    condition {
+      p0 = (s32[], s32[4,8,8], s32[]) parameter(0)
+      ivar = s32[] get-tuple-element(p0), index=0
+      c6 = s32[] constant(6)
+      ROOT cmp = pred[] compare(ivar, c6), direction=LT
+    }
+
+    ENTRY main {
+      input = s32[4,8,8] parameter(0)
+      c0 = s32[] constant(0)
+      tuple = (s32[], s32[4,8,8], s32[]) tuple(c0, input, c0)
+      ROOT while = (s32[], s32[4,8,8], s32[]) while(tuple),
+          condition=condition, body=body,
+          backend_config={"known_trip_count":{"n":"6"},
+                          "known_init_step":{"init":"0","step":"1"},
+                          "known_induction_variable":{"tuple_index":"0"},
+                          "dynamic_variable_tuple_indices":["2"]}
+    })";
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto module, ParseAndReturnVerifiedModule(kModuleWithDynamicVariable));
+
+  auto descriptor = DynamicMemcpyFusion::GetMemcpyDescriptorForFusion(
+      GetFusion(module.get()));
+
+  ASSERT_TRUE(descriptor.has_value());
+  EXPECT_THAT(descriptor->src_dynamic_offsets, ::testing::IsEmpty());
+  EXPECT_EQ(descriptor->src_byte_static_offset, 0);
+
+  ASSERT_THAT(descriptor->dst_dynamic_offsets, ::testing::SizeIs(1));
+  const auto& offset = descriptor->dst_dynamic_offsets[0];
+  EXPECT_EQ(descriptor->dst_byte_static_offset, 32);
+  EXPECT_EQ(offset.while_loop->name(), "while");
+  EXPECT_EQ(offset.induction_variable->name(), "dynamic_idx");
+  EXPECT_EQ(offset.offset->name(), "p2");
+  EXPECT_EQ(offset.dimension_size, 4);
+  EXPECT_EQ(offset.byte_stride, 8 * 8 * sizeof(float));
+}
+
 TEST_F(CopyFusionTest, PackedSubByteTypesAreNotSupported) {
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     dynamic_slice {
diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 5487a31836ecb3..e83d912282ac05 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -312,6 +312,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_enable_pipelined_all_reduce(false);
   opts.set_xla_gpu_enable_pipelined_all_gather(false);
   opts.set_xla_gpu_enable_pipelined_reduce_scatter(true);
+  opts.set_xla_gpu_enable_pipelined_host_offloading(false);
   opts.set_xla_gpu_enable_pipelined_p2p(false);
 
   opts.set_xla_gpu_collective_permute_decomposer_threshold(
@@ -1932,6 +1933,12 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
                     &DebugOptions::set_xla_gpu_enable_pipelined_reduce_scatter),
                 debug_options->xla_gpu_enable_pipelined_reduce_scatter(),
                 "[Stable] Enable pipelinling of reduce-scatter instructions."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_enable_pipelined_host_offloading",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_enable_pipelined_host_offloading),
+      debug_options->xla_gpu_enable_pipelined_host_offloading(),
+      "Enable pipelining of host offloading instructions."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_enable_pipelined_p2p",
       bool_setter_for(&DebugOptions::set_xla_gpu_enable_pipelined_p2p),
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 52716071f9a4ec..f8821201e34dc6 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -504,6 +504,7 @@ xla_cc_test(
         ":collective_pipeliner_utils",
         ":hlo_module_config",
         ":hlo_verifier",
+        ":host_offload_utils",
         ":legalize_scheduling_annotations",
         ":memory_annotations_hdr",
         ":scheduling_annotations_util",
diff --git a/third_party/xla/xla/service/collective_pipeliner.cc b/third_party/xla/xla/service/collective_pipeliner.cc
index 85a04feae59633..eec1f85305f70b 100644
--- a/third_party/xla/xla/service/collective_pipeliner.cc
+++ b/third_party/xla/xla/service/collective_pipeliner.cc
@@ -514,7 +514,9 @@ std::optional<std::vector<HloInstruction*>> CollectIndependentOperandChain(
     HloPredicate should_allow_loop_variant_parameter_in_chain,
     const absl::flat_hash_set<const HloInstruction*>&
         loop_invariant_instructions,
-    bool should_add_loop_invariant_op_in_chain) {
+    bool should_add_loop_invariant_op_in_chain,
+    CollectivePipeliner::AdditionalChainStartOpFinder
+        additional_chain_start_op_finder) {
   std::vector<HloInstruction*> chain;
   absl::flat_hash_set<const HloInstruction*> visited_set({instr});
   std::vector<std::pair<HloInstruction*, int>> stack(1, {instr, 0});
@@ -527,6 +529,16 @@ std::optional<std::vector<HloInstruction*>> CollectIndependentOperandChain(
         return !IsLoopIterator(instr, loop_iter) &&
                !loop_invariant_params.count(instr);
       };
+
+  if (additional_chain_start_op_finder) {
+    auto maybe_additional_op = additional_chain_start_op_finder(instr);
+    if (maybe_additional_op.has_value()) {
+      if (visited_set.insert(maybe_additional_op.value()).second) {
+        stack.emplace_back(maybe_additional_op.value(), 0);
+      }
+    }
+  }
+
   while (!stack.empty()) {
     auto& curr = stack.back();
     if (curr.second == curr.first->operand_count()) {
@@ -600,14 +612,16 @@ std::optional<std::vector<HloInstruction*>> CollectChainsToPushBackwards(
     bool should_allow_control_dependencies,
     const absl::flat_hash_set<const HloInstruction*>&
         loop_invariant_instructions,
-    bool should_add_loop_invariant_op_in_chain) {
+    bool should_add_loop_invariant_op_in_chain,
+    CollectivePipeliner::AdditionalChainStartOpFinder
+        additional_chain_start_op_finder) {
   if (instr->HasControlDependencies() && !should_allow_control_dependencies) {
     return std::nullopt;
   }
   return CollectIndependentOperandChain(
       instr, loop_iter, loop_invariant_params,
       should_allow_loop_variant_parameter_in_chain, loop_invariant_instructions,
-      should_add_loop_invariant_op_in_chain);
+      should_add_loop_invariant_op_in_chain, additional_chain_start_op_finder);
 }
 
 // Given a dynamic-update-slice find the output index of the loop we feed into.
@@ -910,7 +924,9 @@ class WhileLoopAnalysis {
       HloPredicate should_allow_loop_variant_parameter_in_chain =
           HloPredicateFalse,
       bool should_allow_control_dependencies = false,
-      bool should_add_loop_invariant_op_in_chain = false);
+      bool should_add_loop_invariant_op_in_chain = false,
+      CollectivePipeliner::AdditionalChainStartOpFinder
+          additional_chain_start_op_finder = nullptr);
   HloInstruction* while_loop_instruction() const { return while_; }
   void ExtractLoopInvariantOps();
 
@@ -1321,7 +1337,9 @@ void WhileLoopAnalysis::CollectCollectivesToMove(
     HloPredicate should_process, HloPredicate acceptable_formatting,
     HloPredicate should_allow_loop_variant_parameter_in_chain,
     bool should_allow_control_dependencies,
-    bool should_add_loop_invariant_op_in_chain) {
+    bool should_add_loop_invariant_op_in_chain,
+    CollectivePipeliner::AdditionalChainStartOpFinder
+        additional_chain_start_op_finder) {
   move_infos_.clear();
   HloComputation* while_body = while_->while_body();
   const HloInstruction* loop_parameter =
@@ -1498,7 +1516,8 @@ void WhileLoopAnalysis::CollectCollectivesToMove(
           invariant_loop_parameters_,
           should_allow_loop_variant_parameter_in_chain,
           should_allow_control_dependencies, invariant_loop_instructions_,
-          should_add_loop_invariant_op_in_chain);
+          should_add_loop_invariant_op_in_chain,
+          additional_chain_start_op_finder);
       if (!chain_collected.has_value()) {
         VLOG(5) << "Skipping " << instr->name()
                 << " because didn't find compatible slice of parameter";
@@ -1691,7 +1710,7 @@ HloInstruction* CreateZero(HloComputation* comp, const Shape& shape,
 // }
 // xg_last = all-reduce(x)
 // yg_last = all-reduce(y)
-absl::Status TransformLoopForward(
+absl::StatusOr<HloInstruction*> TransformLoopForward(
     const WhileLoopAnalysis& loop_analysis, bool insert_non_alias_custom_call,
     int64_t level_to_operate_on, bool pipeline_use_tree,
     bool process_different_sized_ops, HloPredicate should_process,
@@ -2179,7 +2198,7 @@ absl::Status TransformLoopForward(
         absl::MakeSpan(loop_output_to_replace), output_stacked_data));
   }
   TF_RETURN_IF_ERROR(loop_computation->parent()->RemoveUnusedComputations());
-  return absl::OkStatus();
+  return new_while_loop;
 }
 
 absl::Status TransformFormattingOp(
@@ -2412,7 +2431,7 @@ absl::Status TransformFormattingOp(
 // }
 // xg_all = all-reduce(x_all)
 // yg_all = all-reduce(y_all)
-absl::Status TransformLoopForwardSink(
+absl::StatusOr<HloInstruction*> TransformLoopForwardSink(
     const WhileLoopAnalysis& loop_analysis, bool insert_non_alias_custom_call,
     int64_t level_to_operate_on, bool pipeline_use_tree,
     bool process_different_sized_ops, HloPredicate should_process,
@@ -2767,7 +2786,7 @@ absl::Status TransformLoopForwardSink(
   TF_RETURN_IF_ERROR(
       loop_computation->RemoveInstructionAndUnusedOperands(while_loop));
   TF_RETURN_IF_ERROR(loop_computation->parent()->RemoveUnusedComputations());
-  return absl::OkStatus();
+  return new_while;
 }
 
 // Function that does the work of pushing backward instructions that have been
@@ -2793,7 +2812,7 @@ absl::Status TransformLoopForwardSink(
 //   x_ag = p0_ag_next
 // }
 // x_last = computation(p0_ag_next)
-static absl::Status TransformLoopBackward(
+static absl::StatusOr<HloInstruction*> TransformLoopBackward(
     const WhileLoopAnalysis& loop_analysis, bool insert_non_alias_custom_call,
     int64_t level_to_operate_on, bool process_different_sized_ops,
     HloPredicate acceptable_formatting,
@@ -3139,7 +3158,7 @@ static absl::Status TransformLoopBackward(
   TF_RETURN_IF_ERROR(
       loop_computation->RemoveInstructionAndUnusedOperands(while_loop));
   TF_RETURN_IF_ERROR(loop_computation->parent()->RemoveUnusedComputations());
-  return absl::OkStatus();
+  return new_while_loop;
 }
 
 absl::StatusOr<bool> CollectivePipeliner::RunPipeliner(
@@ -3162,6 +3181,7 @@ absl::StatusOr<bool> CollectivePipeliner::RunPipeliner(
       if (instruction->opcode() != HloOpcode::kWhile) {
         continue;
       }
+
       if (std::none_of(instruction->while_body()->instructions().begin(),
                        instruction->while_body()->instructions().end(),
                        config_.should_process)) {
@@ -3195,7 +3215,8 @@ absl::StatusOr<bool> CollectivePipeliner::RunPipeliner(
         config_.should_process, config_.acceptable_formatting,
         config_.should_allow_loop_variant_parameter_in_chain,
         config_.should_allow_control_dependencies,
-        config_.should_add_loop_invariant_op_in_chain);
+        config_.should_add_loop_invariant_op_in_chain,
+        config_.additional_chain_start_op_finder);
     if (loop_analysis->GetMoveInfos().empty()) {
       continue;
     }
@@ -3207,31 +3228,44 @@ absl::StatusOr<bool> CollectivePipeliner::RunPipeliner(
         VLOG(1) << "MoveInfo #" << id++ << "\n" << ToString(to_move);
       }
     }
+    HloInstruction* transformed_while_loop;
     if (config_.pipelining_direction ==
         collective_pipeliner_utils::PipeliningDirection::kForward) {
       CHECK(config_.reuse_pipelined_op_buffer);
-      TF_RETURN_IF_ERROR(TransformLoopForward(
-          *loop_analysis, !config_.last_run, config_.level_to_operate_on,
-          config_.pipeline_use_tree, config_.process_different_sized_ops,
-          config_.should_process, config_.acceptable_formatting,
-          config_.reuse_pipelined_op_buffer, next_channel_id,
-          config_.unique_channel_id, config_.postprocess_pipelined_ops));
+      TF_ASSIGN_OR_RETURN(
+          transformed_while_loop,
+          TransformLoopForward(
+              *loop_analysis, !config_.last_run, config_.level_to_operate_on,
+              config_.pipeline_use_tree, config_.process_different_sized_ops,
+              config_.should_process, config_.acceptable_formatting,
+              config_.reuse_pipelined_op_buffer, next_channel_id,
+              config_.unique_channel_id, config_.postprocess_pipelined_ops));
     } else if (config_.pipelining_direction ==
                collective_pipeliner_utils::PipeliningDirection::kForwardSink) {
-      TF_RETURN_IF_ERROR(TransformLoopForwardSink(
-          *loop_analysis, !config_.last_run, config_.level_to_operate_on,
-          config_.pipeline_use_tree, config_.process_different_sized_ops,
-          config_.should_process, next_channel_id, config_.unique_channel_id));
+      TF_ASSIGN_OR_RETURN(
+          transformed_while_loop,
+          TransformLoopForwardSink(
+              *loop_analysis, !config_.last_run, config_.level_to_operate_on,
+              config_.pipeline_use_tree, config_.process_different_sized_ops,
+              config_.should_process, next_channel_id,
+              config_.unique_channel_id));
     } else {
       CHECK_EQ(config_.pipelining_direction,
                collective_pipeliner_utils::PipeliningDirection::kBackward);
-      TF_RETURN_IF_ERROR(TransformLoopBackward(
-          *loop_analysis, !config_.last_run, config_.level_to_operate_on,
-          config_.process_different_sized_ops, config_.acceptable_formatting,
-          config_.postprocess_backward_peeled_op,
-          config_.postprocess_backward_rotated_op,
-          config_.postprocess_backward_peeled_trailing_op, next_channel_id,
-          config_.unique_channel_id, config_.postprocess_pipelined_ops));
+      TF_ASSIGN_OR_RETURN(
+          transformed_while_loop,
+          TransformLoopBackward(
+              *loop_analysis, !config_.last_run, config_.level_to_operate_on,
+              config_.process_different_sized_ops,
+              config_.acceptable_formatting,
+              config_.postprocess_backward_peeled_op,
+              config_.postprocess_backward_rotated_op,
+              config_.postprocess_backward_peeled_trailing_op, next_channel_id,
+              config_.unique_channel_id, config_.postprocess_pipelined_ops));
+    }
+    if (config_.postprocess_transformed_while_loop) {
+      TF_RETURN_IF_ERROR(
+          config_.postprocess_transformed_while_loop(transformed_while_loop));
     }
     ++transformed_loops;
     changed = true;
diff --git a/third_party/xla/xla/service/collective_pipeliner.h b/third_party/xla/xla/service/collective_pipeliner.h
index 7988daa085caaf..427702e3fe7287 100644
--- a/third_party/xla/xla/service/collective_pipeliner.h
+++ b/third_party/xla/xla/service/collective_pipeliner.h
@@ -67,6 +67,10 @@ class CollectivePipeliner : public HloModulePass {
   // created.
   using HloPostprocessor = std::function<absl::Status(
       HloInstruction* instr, HloInstruction* new_while_instr)>;
+  using WhileLoopPostprocessor =
+      std::function<absl::Status(HloInstruction* while_loop)>;
+  using AdditionalChainStartOpFinder =
+      std::function<std::optional<HloInstruction*>(HloInstruction*)>;
 
   struct Config {
     int64_t level_to_operate_on = 0;
@@ -99,6 +103,11 @@ class CollectivePipeliner : public HloModulePass {
     // pipelined. The control dependencies will be dropped when the operation is
     // pipelined. This is currently only used to support kBackward pipelining.
     bool should_allow_control_dependencies = false;
+    // Function to find an additional operation to start the operand chain from.
+    // If set, this function will be called to discover additional starting
+    // points for the operand chain (e.g., DynamicSlice operations through
+    // formatting ops).
+    AdditionalChainStartOpFinder additional_chain_start_op_finder = nullptr;
     // TODO(b/399476667): Consolidate these postprocessing functions.
     HloPostprocessor postprocess_backward_peeled_op;
     HloPostprocessor postprocess_backward_rotated_op;
@@ -112,6 +121,9 @@ class CollectivePipeliner : public HloModulePass {
     bool delay_sinking_large_collectives = true;
     // When cloning collectives, use a unique channel id for each clone.
     bool unique_channel_id = true;
+    // Postprocessing hook which runs for every successfully transformed while
+    // loop.
+    WhileLoopPostprocessor postprocess_transformed_while_loop;
   };
   static const char* const kInsertedByPreviousStep;
   static const char* const kSunkByPreviousStep;
diff --git a/third_party/xla/xla/service/collective_pipeliner_test.cc b/third_party/xla/xla/service/collective_pipeliner_test.cc
index 7b01e0d9fdd61e..a8fb19690f1592 100644
--- a/third_party/xla/xla/service/collective_pipeliner_test.cc
+++ b/third_party/xla/xla/service/collective_pipeliner_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <memory>
 #include <optional>
 #include <queue>
+#include <set>
 #include <string>
 #include <vector>
 
@@ -52,6 +53,7 @@ limitations under the License.
 #include "xla/service/collective_pipeliner_utils.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_verifier.h"
+#include "xla/service/host_offload_utils.h"
 #include "xla/service/legalize_scheduling_annotations.h"
 #include "xla/service/memory_annotations.h"
 #include "xla/service/scheduling_annotations_util.h"
@@ -107,7 +109,9 @@ absl::StatusOr<bool> RunOptimizer(
         {},
     bool should_add_loop_invariant_op_in_chain = false,
     int64_t collective_size_threshold_to_delay_sinking = INT64_MAX,
-    bool unique_channel_id = true) {
+    bool unique_channel_id = true,
+    CollectivePipeliner::WhileLoopPostprocessor
+        postprocess_transformed_while_loop = {}) {
   CollectivePipeliner::Config config = {
       /*level_to_operate_on=*/level_to_operate_on,
       /*max_pipelining_per_loop=*/INT64_MAX,
@@ -120,12 +124,14 @@ absl::StatusOr<bool> RunOptimizer(
       /*acceptable_formatting=*/acceptable_formatting,
       /*reuse_pipelined_op_buffer=*/reuse_pipelined_op_buffer,
       should_allow_loop_variant_parameter_in_chain,
-      /*should_allow_control_dependencies=*/false, postprocess_backward_peeled,
+      /*should_allow_control_dependencies=*/false,
+      /*additional_chain_start_op_finder=*/nullptr, postprocess_backward_peeled,
       postprocess_backward_rotated, postprocess_backward_peeled_trailing,
       should_add_loop_invariant_op_in_chain,
       /*postprocess_pipelined_ops=*/{},
       collective_size_threshold_to_delay_sinking,
-      /*delay_sinking_large_collectives=*/true, unique_channel_id};
+      /*delay_sinking_large_collectives=*/true, unique_channel_id,
+      postprocess_transformed_while_loop};
   HloPassPipeline pass("optimizer");
   pass.AddPass<HloVerifier>(/*layout_sensitive=*/false,
                             /*allow_mixed_precision=*/false);
@@ -5625,5 +5631,209 @@ ENTRY entry {
   EXPECT_EQ(fusion_count, 4);
 }
 
+TEST_F(CollectivePipelinerTest, HostOffloadingForward) {
+  constexpr absl::string_view hlo_string = R"(
+HloModule jit_scanned
+
+%region_0.40 (arg_tuple.13: (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000])) -> (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000]) {
+  %arg_tuple.13 = (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000]) parameter(0)
+  %get-tuple-element.14 = s32[] get-tuple-element(%arg_tuple.13), index=0
+  %constant.19 = s32[] constant(1)
+  %add.38 = s32[] add(%get-tuple-element.14, %constant.19)
+  %get-tuple-element.15 = f32[1000,1000] get-tuple-element(%arg_tuple.13), index=1
+  %get-tuple-element.17 = f32[10,1000,8000] get-tuple-element(%arg_tuple.13), index=3
+  %constant.20 = s32[] constant(0)
+  %dynamic-slice.21 = f32[1,1000,8000] dynamic-slice(%get-tuple-element.17, %get-tuple-element.14, %constant.20, %constant.20), dynamic_slice_sizes={1,1000,8000}
+  %reshape.22 = f32[1000,8000] reshape(%dynamic-slice.21)
+  %dot.0 = f32[1000,8000] dot(%get-tuple-element.15, %reshape.22), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  %get-tuple-element.18 = f32[10,8000,1000] get-tuple-element(%arg_tuple.13), index=4
+  %dynamic-slice.23 = f32[1,8000,1000] dynamic-slice(%get-tuple-element.18, %get-tuple-element.14, %constant.20, %constant.20), dynamic_slice_sizes={1,8000,1000}
+  %reshape.24 = f32[8000,1000] reshape(%dynamic-slice.23)
+  %dot.1 = f32[1000,1000] dot(%dot.0, %reshape.24), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  %get-tuple-element.16 = f32[10,1000,1000] get-tuple-element(%arg_tuple.13), index=2
+  %custom-call.2 = f32[1000,1000] custom-call(%dot.1), custom_call_target="MoveToHost"
+  %reshape.36 = f32[1,1000,1000] reshape(%custom-call.2)
+  %dynamic-update-slice.37 = f32[10,1000,1000] dynamic-update-slice(%get-tuple-element.16, %reshape.36, %get-tuple-element.14, %constant.20, %constant.20)
+  ROOT %tuple.39 = (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000]) tuple(%add.38, %dot.1, %dynamic-update-slice.37, %get-tuple-element.17, %get-tuple-element.18)
+}
+
+%region_1.49 (arg_tuple.41: (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000])) -> pred[] {
+  %arg_tuple.41 = (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000]) parameter(0)
+  %get-tuple-element.42 = s32[] get-tuple-element(%arg_tuple.41), index=0
+  %constant.47 = s32[] constant(10)
+  ROOT %compare.48 = pred[] compare(%get-tuple-element.42, %constant.47), direction=LT
+}
+
+ENTRY %main.117 (Arg_0.1: f32[10,1000,8000], Arg_1.2: f32[10,8000,1000], Arg_2.3: f32[1000,1000]) -> (f32[10,1000,8000], f32[10,8000,1000]) {
+  %constant.10 = s32[] constant(0)
+  %constant.4 = f32[] constant(0)
+  %Arg_2.3 = f32[1000,1000] parameter(2)
+  %broadcast.12 = f32[10,1000,1000] broadcast(%constant.4), dimensions={}
+  %Arg_0.1 = f32[10,1000,8000] parameter(0)
+  %Arg_1.2 = f32[10,8000,1000] parameter(1)
+  %tuple.50 = (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000]) tuple(%constant.10, %Arg_2.3, %broadcast.12, %Arg_0.1, %Arg_1.2)
+  %while.51 = (s32[], f32[1000,1000], f32[10,1000,1000], f32[10,1000,8000], f32[10,8000,1000]) while(%tuple.50), condition=%region_1.49, body=%region_0.40
+  %get-tuple-element.52 = f32[10,1000,8000] get-tuple-element(%while.51), index=3
+  %get-tuple-element.53 = f32[10,8000,1000] get-tuple-element(%while.51), index=4
+  ROOT %tuple.116 = (f32[10,1000,8000], f32[10,8000,1000]) tuple(%get-tuple-element.52, %get-tuple-element.53)
+}
+)";
+
+  auto module = ParseAndReturnUnverifiedModule(hlo_string, config_).value();
+
+  EXPECT_TRUE(
+      RunOptimizer(
+          module.get(), /*last_run=*/true, 0,
+          /*pipeline_use_tree=*/true,
+          /*process_different_sized_ops=*/true,
+          /*direction=*/
+          collective_pipeliner_utils::PipeliningDirection::kForward,
+          /*should_process=*/
+          host_offload_utils::IsMoveToHostWithDynamicUpdateSlice,
+          /*acceptable_formatting=*/HloPredicateTrue,
+          /*reuse_pipelined_op_buffer=*/HloPredicateTrue,
+          /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
+          /*postprocess_backward_peeled=*/{},
+          /*postprocess_backward_rotated=*/{},
+          /*postprocess_backward_peeled_trailing=*/{},
+          /*should_add_loop_invariant_op_in_chain=*/false,
+          /*collective_size_threshold_to_delay_sinking=*/INT64_MAX,
+          /*unique_channel_id=*/true,
+          /*postprocess_transformed_while_loop=*/
+          host_offload_utils::MarkDynamicVariables)
+          .value());
+
+  std::vector<HloInstruction*> while_loops;
+  for (auto* instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      while_loops.push_back(instr);
+    }
+  }
+  ASSERT_EQ(while_loops.size(), 1) << "Expected 1 while loop in the module";
+
+  XLA_VLOG_LINES(1, "Transformed while body:\n" +
+                        while_loops[0]->while_body()->ToString());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileLoopBackendConfig config,
+      while_loops[0]->backend_config<WhileLoopBackendConfig>());
+
+  std::set<int64_t> dynamic_indices(
+      config.dynamic_variable_tuple_indices().begin(),
+      config.dynamic_variable_tuple_indices().end());
+
+  std::set<int64_t> expected_indices = {0, 5};
+  EXPECT_EQ(dynamic_indices, expected_indices);
+}
+
+TEST_F(CollectivePipelinerTest, HostOffloadingBackward) {
+  constexpr absl::string_view hlo_string = R"(
+HloModule jit_scanned
+
+%region_2.98 (arg_tuple.55: (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000])) -> (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000]) {
+  %arg_tuple.55 = (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000]) parameter(0)
+  %get-tuple-element.56 = s32[] get-tuple-element(%arg_tuple.55), index=0
+  %constant.64 = s32[] constant(1)
+  %add.96 = s32[] add(%get-tuple-element.56, %constant.64)
+  %get-tuple-element.57 = f32[1000,1000] get-tuple-element(%arg_tuple.55), index=1
+  %get-tuple-element.62 = f32[10,8000,1000] get-tuple-element(%arg_tuple.55), index=6
+  %constant.1 = s32[] constant(9)
+  %subtract = s32[] subtract(%constant.1, %get-tuple-element.56)
+  %constant.63 = s32[] constant(0)
+  %dynamic-slice.72 = f32[1,8000,1000] dynamic-slice(%get-tuple-element.62, %subtract, %constant.63, %constant.63), dynamic_slice_sizes={1,8000,1000}
+  %reshape.73 = f32[8000,1000] reshape(%dynamic-slice.72)
+  %dot.2 = f32[1000,8000] dot(%get-tuple-element.57, %reshape.73), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  %get-tuple-element.61 = f32[10,1000,8000] get-tuple-element(%arg_tuple.55), index=5
+  %dynamic-slice.70 = f32[1,1000,8000] dynamic-slice(%get-tuple-element.61, %subtract, %constant.63, %constant.63), dynamic_slice_sizes={1,1000,8000}
+  %reshape.71 = f32[1000,8000] reshape(%dynamic-slice.70)
+  %dot.3 = f32[1000,1000] dot(%dot.2, %reshape.71), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  %get-tuple-element.58 = f32[10,1000,8000] get-tuple-element(%arg_tuple.55), index=2
+  %get-tuple-element.60 = f32[10,1000,1000] get-tuple-element(%arg_tuple.55), index=4
+  %dynamic-slice.68 = f32[1,1000,1000] dynamic-slice(%get-tuple-element.60, %subtract, %constant.63, %constant.63), dynamic_slice_sizes={1,1000,1000}
+  %reshape.69 = f32[1000,1000] reshape(%dynamic-slice.68)
+  %custom-call.3 = f32[1000,1000] custom-call(%reshape.69), custom_call_target="MoveToDevice"
+  %dot.7 = f32[1000,8000] dot(%custom-call.3, %dot.2), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  %reshape.92 = f32[1,1000,8000] reshape(%dot.7)
+  %dynamic-update-slice.93 = f32[10,1000,8000] dynamic-update-slice(%get-tuple-element.58, %reshape.92, %subtract, %constant.63, %constant.63)
+  %get-tuple-element.59 = f32[10,8000,1000] get-tuple-element(%arg_tuple.55), index=3
+  %dot.5 = f32[1000,8000] dot(%custom-call.3, %reshape.71), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  %dot.8 = f32[8000,1000] dot(%dot.5, %get-tuple-element.57), lhs_contracting_dims={0}, rhs_contracting_dims={0}
+  %reshape.94 = f32[1,8000,1000] reshape(%dot.8)
+  %dynamic-update-slice.95 = f32[10,8000,1000] dynamic-update-slice(%get-tuple-element.59, %reshape.94, %subtract, %constant.63, %constant.63)
+  ROOT %tuple.97 = (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000]) tuple(%add.96, %dot.3, %dynamic-update-slice.93, %dynamic-update-slice.95, %get-tuple-element.60, /*index=5*/%get-tuple-element.61, %get-tuple-element.62)
+}
+
+%region_3.109 (arg_tuple.99: (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000])) -> pred[] {
+  %arg_tuple.99 = (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000]) parameter(0)
+  %get-tuple-element.100 = s32[] get-tuple-element(%arg_tuple.99), index=0
+  %constant.107 = s32[] constant(10)
+  ROOT %compare.108 = pred[] compare(%get-tuple-element.100, %constant.107), direction=LT
+}
+
+ENTRY %main.117 (Arg_0.1: f32[10,1000,8000], Arg_1.2: f32[10,8000,1000], Arg_2.3: f32[10,1000,1000]) -> (f32[10,1000,8000], f32[10,8000,1000]) {
+  %constant.10 = s32[] constant(0)
+  %constant.8 = f32[] constant(1)
+  %broadcast.9 = f32[1000,1000] broadcast(%constant.8), dimensions={}
+  %constant.4 = f32[] constant(0)
+  %broadcast.7 = f32[10,1000,8000] broadcast(%constant.4), dimensions={}
+  %broadcast.5 = f32[10,8000,1000] broadcast(%constant.4), dimensions={}
+  %Arg_2.3 = f32[10,1000,1000] parameter(2)
+  %Arg_0.1 = f32[10,1000,8000] parameter(0)
+  %Arg_1.2 = f32[10,8000,1000] parameter(1)
+  %tuple.110 = (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000]) tuple(%constant.10, %broadcast.9, %broadcast.7, %broadcast.5, %Arg_2.3, /*index=5*/%Arg_0.1, %Arg_1.2)
+  %while.111 = (s32[], f32[1000,1000], f32[10,1000,8000], f32[10,8000,1000], f32[10,1000,1000], /*index=5*/f32[10,1000,8000], f32[10,8000,1000]) while(%tuple.110), condition=%region_3.109, body=%region_2.98
+  %get-tuple-element.114 = f32[10,1000,8000] get-tuple-element(%while.111), index=2
+  %get-tuple-element.115 = f32[10,8000,1000] get-tuple-element(%while.111), index=3
+  ROOT %tuple.116 = (f32[10,1000,8000], f32[10,8000,1000]) tuple(%get-tuple-element.114, %get-tuple-element.115)
+}
+)";
+
+  auto module = ParseAndReturnUnverifiedModule(hlo_string, config_).value();
+
+  EXPECT_TRUE(
+      RunOptimizer(
+          module.get(), /*last_run=*/true, 0,
+          /*pipeline_use_tree=*/true,
+          /*process_different_sized_ops=*/true,
+          /*direction=*/
+          collective_pipeliner_utils::PipeliningDirection::kBackward,
+          /*should_process=*/
+          host_offload_utils::IsMoveToDeviceWithDynamicSlice,
+          /*acceptable_formatting=*/HloPredicateTrue,
+          /*reuse_pipelined_op_buffer=*/HloPredicateTrue,
+          /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
+          /*postprocess_backward_peeled=*/{},
+          /*postprocess_backward_rotated=*/{},
+          /*postprocess_backward_peeled_trailing=*/{},
+          /*should_add_loop_invariant_op_in_chain=*/false,
+          /*collective_size_threshold_to_delay_sinking=*/INT64_MAX,
+          /*unique_channel_id=*/true,
+          /*postprocess_transformed_while_loop=*/
+          host_offload_utils::MarkDynamicVariables)
+          .value());
+
+  std::vector<HloInstruction*> while_loops;
+  for (auto* instr : module->entry_computation()->instructions()) {
+    if (instr->opcode() == HloOpcode::kWhile) {
+      while_loops.push_back(instr);
+    }
+  }
+  ASSERT_EQ(while_loops.size(), 1) << "Expected 1 while loop in the module";
+
+  XLA_VLOG_LINES(1, "Transformed while body:\n" +
+                        while_loops[0]->while_body()->ToString());
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      WhileLoopBackendConfig config,
+      while_loops[0]->backend_config<WhileLoopBackendConfig>());
+
+  std::set<int64_t> dynamic_indices(
+      config.dynamic_variable_tuple_indices().begin(),
+      config.dynamic_variable_tuple_indices().end());
+
+  std::set<int64_t> expected_indices = {0, 8};
+  EXPECT_EQ(dynamic_indices, expected_indices);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index e20aee3d7818af..fa1687089d29d2 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1728,11 +1728,13 @@ cc_library(
         "//xla/service:hlo_proto_cc",
         "//xla/service:hlo_value",
         "//xla/service:hlo_verifier",
+        "//xla/service:host_offload_utils",
         "//xla/service:layout_assignment",
         "//xla/service:layout_normalization",
         "//xla/service:llvm_compiler",
         "//xla/service:logical_buffer",
         "//xla/service:loop_schedule_linearizer",
+        "//xla/service:memory_annotations_hdr",
         "//xla/service:reduce_scatter_combiner",
         "//xla/service:reduce_scatter_reassociate",
         "//xla/service:scatter_determinism_expander",
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 4f3ca9e1cc7e67..dd90b654eace23 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 #include <optional>
+#include <set>
 #include <string>
 #include <utility>
 #include <variant>
@@ -274,10 +275,12 @@ limitations under the License.
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/hlo_verifier.h"
+#include "xla/service/host_offload_utils.h"
 #include "xla/service/layout_assignment.h"
 #include "xla/service/layout_normalization.h"
 #include "xla/service/llvm_ir/llvm_command_line_options.h"
 #include "xla/service/llvm_ir/llvm_util.h"
+#include "xla/service/memory_annotations.h"
 #include "xla/service/reduce_scatter_reassociate.h"
 #include "xla/service/scatter_determinism_expander.h"
 #include "xla/service/scatter_expander.h"
@@ -882,6 +885,7 @@ absl::Status RunCollectiveOptimizationPasses(
         /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
         /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
         /*should_allow_control_dependencies=*/false,
+        /*additional_chain_start_op_finder=*/nullptr,
         /*postprocess_backward_peeled_op=*/{},
         /*postprocess_backward_rotated_op=*/{},
         /*postprocess_backward_peeled_trailing_op=*/{},
@@ -905,6 +909,7 @@ absl::Status RunCollectiveOptimizationPasses(
         /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
         /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
         /*should_allow_control_dependencies=*/false,
+        /*additional_chain_start_op_finder=*/nullptr,
         /*postprocess_backward_peeled_op=*/{},
         /*postprocess_backward_rotated_op=*/{},
         /*postprocess_backward_peeled_trailing_op=*/{},
@@ -928,6 +933,7 @@ absl::Status RunCollectiveOptimizationPasses(
         /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
         /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
         /*should_allow_control_dependencies=*/false,
+        /*additional_chain_start_op_finder=*/nullptr,
         /*postprocess_backward_peeled_op=*/{},
         /*postprocess_backward_rotated_op=*/{},
         /*postprocess_backward_peeled_trailing_op=*/{},
@@ -937,6 +943,105 @@ absl::Status RunCollectiveOptimizationPasses(
     collectives_pipeline.AddPass<CollectivePipeliner>(config);
   }
 
+  if (debug_options.xla_gpu_enable_pipelined_host_offloading() ||
+      IsPassEnabledAtOptimizationEffort<CollectivePipeliner>(*hlo_module)) {
+    // Forward pass host offloading pipelining
+    CollectivePipeliner::Config config{
+        /*level_to_operate_on=*/0,
+        /*max_pipelining_per_loop=*/INT64_MAX,
+        /*last_run=*/true,
+        /*pipeline_use_tree=*/true,
+        /*process_different_sized_ops=*/true,
+        /*pipelining_direction=*/
+        collective_pipeliner_utils::PipeliningDirection::kForward,
+        /*should_process=*/
+        host_offload_utils::IsMoveToHostWithDynamicUpdateSlice,
+        /*acceptable_formatting=*/HloPredicateTrue,
+        /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
+        /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
+        /*should_allow_control_dependencies=*/false,
+        /*additional_chain_start_op_finder=*/nullptr,
+        /*postprocess_backward_peeled_op=*/{},
+        /*postprocess_backward_rotated_op=*/{},
+        /*postprocess_backward_peeled_trailing_op=*/{},
+        /*should_add_loop_invariant_op_in_chain=*/false,
+        /*postprocess_pipelined_ops=*/AppendPipelinedInstruction,
+        /*collective_size_threshold_to_delay_sinking=*/INT64_MAX,
+        /*delay_sinking_large_collectives=*/true,
+        /*unique_channel_id=*/true,
+        /*postprocess_transformed_while_loop=*/
+        host_offload_utils::MarkDynamicVariables,
+    };
+    collectives_pipeline.AddPass<CollectivePipeliner>(config);
+  }
+
+  if (debug_options.xla_gpu_enable_pipelined_host_offloading() ||
+      IsPassEnabledAtOptimizationEffort<CollectivePipeliner>(*hlo_module)) {
+    // Backward pass host offloading pipelining
+    auto acceptable_formatting = [](const HloInstruction* instr) {
+      return instr->opcode() == HloOpcode::kReshape ||
+             instr->opcode() == HloOpcode::kBroadcast ||
+             instr->opcode() == HloOpcode::kTranspose;
+    };
+    CollectivePipeliner::Config config_backward{
+        /*level_to_operate_on=*/0,
+        /*max_pipelining_per_loop=*/INT64_MAX,
+        /*last_run=*/true,
+        /*pipeline_use_tree=*/true,
+        /*process_different_sized_ops=*/true,
+        /*pipelining_direction=*/
+        collective_pipeliner_utils::PipeliningDirection::kBackward,
+        /*should_process=*/host_offload_utils::IsMoveToDeviceWithDynamicSlice,
+        /*acceptable_formatting=*/acceptable_formatting,
+        /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
+        /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
+        /*should_allow_control_dependencies=*/false,
+        /*additional_chain_start_op_finder=*/
+        [acceptable_formatting](
+            HloInstruction* instr) -> std::optional<HloInstruction*> {
+          if (!instr->IsCustomCall(
+                  memory_annotations::kMoveToDeviceCustomCallTarget)) {
+            return std::nullopt;
+          }
+          if (instr->operand_count() == 0) {
+            return std::nullopt;
+          }
+
+          std::vector<HloInstruction*> to_check = {instr->mutable_operand(0)};
+          std::set<HloInstruction*> visited;
+
+          while (!to_check.empty()) {
+            HloInstruction* current = to_check.back();
+            to_check.pop_back();
+
+            if (visited.insert(current).second) {
+              if (current->opcode() == HloOpcode::kDynamicSlice) {
+                return current;
+              }
+              if (acceptable_formatting(current)) {
+                for (HloInstruction* operand : current->operands()) {
+                  to_check.push_back(operand);
+                }
+              }
+            }
+          }
+          return std::nullopt;
+        },
+        /*postprocess_backward_peeled_op=*/{},
+        /*postprocess_backward_rotated_op=*/{},
+        /*postprocess_backward_peeled_trailing_op=*/{},
+        /*should_add_loop_invariant_op_in_chain=*/true,
+        /*postprocess_pipelined_ops=*/AppendPipelinedInstruction,
+        /*collective_size_threshold_to_delay_sinking=*/INT64_MAX,
+        /*delay_sinking_large_collectives=*/true,
+        /*unique_channel_id=*/true,
+        /*postprocess_transformed_while_loop=*/
+        host_offload_utils::MarkDynamicVariables,
+    };
+
+    collectives_pipeline.AddPass<CollectivePipeliner>(config_backward);
+  }
+
   DebugOptions::PipelineParallelismOptLevel pipeline_parallelism_opt_level =
       debug_options.xla_gpu_experimental_pipeline_parallelism_opt_level();
   if (debug_options.xla_gpu_enable_pipelined_p2p()) {
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.cc b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
index b22a642419c531..022edaee8653c3 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
@@ -668,6 +668,23 @@ bool IsInductionVariable(const HloInstruction* maybe_variable,
          maybe_variable->tuple_index() == loop.induction_variable_index;
 }
 
+// Returns true if `variable` is marked as a dynamic variable.
+bool IsDynamicVariable(const HloInstruction* variable,
+                       const VerifiedLoop& loop) {
+  auto config = loop.loop->backend_config<xla::WhileLoopBackendConfig>();
+  if (!config.ok()) {
+    return false;
+  }
+
+  int64_t tuple_idx = variable->tuple_index();
+  for (int64_t dynamic_idx : config->dynamic_variable_tuple_indices()) {
+    if (dynamic_idx == tuple_idx) {
+      return true;
+    }
+  }
+  return false;
+}
+
 // Attempts to find the induction variable of `loop` in `dependencies`. If there
 // are any dependencies on non-induction variable loop-carried variables,
 // returns nullopt.
@@ -675,25 +692,38 @@ std::optional<const HloInstruction*> VerifyInductionVariable(
     const Dependencies& dependencies, const VerifiedLoop& loop) {
   const HloInstruction* induction_var = nullptr;
   for (const HloInstruction* gte : dependencies.get_tuple_elements) {
-    if (IsInductionVariable(gte, loop)) {
-      if (induction_var) {
-        // This should never happen.
-        VLOG(5) << "Found non-unique GTEs for the induction variable. Did "
-                   "HloCSE run?";
+    if (IsLoopCarriedVariable(gte, loop)) {
+      if (IsInductionVariable(gte, loop)) {
+        if (induction_var) {
+          // This should never happen.
+          VLOG(5) << "Found non-unique GTEs for the induction variable. Did "
+                     "HloCSE run?";
+          return std::nullopt;
+        }
+        induction_var = gte;
+      } else if (IsDynamicVariable(gte, loop)) {
+        // Dynamic variables are also acceptable because they represent tuple
+        // indices used in DS/DUS that can be optimized by
+        // FusionDynamicMemcpyRewriter.
+        if (induction_var) {
+          // This should never happen.
+          VLOG(5) << "Found non-unique GTEs for the dynamic variable. Did "
+                     "HloCSE run?";
+          return std::nullopt;
+        }
+        induction_var = gte;
+      } else {
+        // Other dependencies on loop-carried variables are not allowed.
+        VLOG(5) << "Found illegal dependency on loop-carried variable.";
         return std::nullopt;
       }
-      induction_var = gte;
-    } else if (IsLoopCarriedVariable(gte, loop)) {
-      // Other dependencies on loop-carried variables are not allowed.
-      VLOG(5) << "Found illegal dependency on loop-carried variable.";
-      return std::nullopt;
     }
     // Other GTEs are OK, as long as their tuples are ultimately just derived
     // from the loop's induction variable. We already verified that there are no
     // side-effecting dependencies in GetLeafDependencies.
   }
   if (!induction_var) {
-    VLOG(5) << "Did not find an induction variable.";
+    VLOG(5) << "Did not find an induction variable or dynamic variable.";
     return std::nullopt;
   }
   return induction_var;
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
index 49345bd960ec58..14762eb4a410fc 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
@@ -957,6 +957,165 @@ TEST_F(IrEmissionUtilsTest, NonInductionVariableLoopCarriedVariable) {
                    .has_value());
 }
 
+TEST_F(IrEmissionUtilsTest, DynamicVariableLoopCarriedVariable) {
+  constexpr absl::string_view kHlo = R"(
+      while_body {
+        p0 = (s32[], s32[], s32[]) parameter(0)
+        ivar = s32[] get-tuple-element(p0), index=0
+        dynamic_var = s32[] get-tuple-element(p0), index=1
+        other_var = s32[] get-tuple-element(p0), index=2
+
+        c1 = s32[] constant(1)
+        next_ivar = s32[] add(ivar, c1)
+        next_dynamic_var = s32[] add(dynamic_var, c1)
+        next_other = s32[] add(other_var, c1)
+
+        ROOT result = (s32[], s32[], s32[]) tuple(next_ivar, next_dynamic_var, next_other)
+      }
+
+      condition {
+        p0 = (s32[], s32[], s32[]) parameter(0)
+        ivar = s32[] get-tuple-element(p0), index=0
+        c5 = s32[] constant(5)
+        ROOT cmp = pred[] compare(ivar, c5), direction=LT
+      }
+
+      ENTRY main {
+        c0 = s32[] constant(0)
+        tuple = (s32[], s32[], s32[]) tuple(c0, c0, c0)
+        ROOT while = (s32[], s32[], s32[]) while(tuple),
+            condition=condition, body=while_body,
+            backend_config={"known_induction_variable":{"tuple_index":"0"},"dynamic_variable_tuple_indices":[1]}
+      }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHlo));
+  HloComputation* while_body = module->GetComputationWithName("while_body");
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("next_ivar"))
+                  .has_value());
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("next_dynamic_var"))
+                  .has_value());
+
+  ASSERT_FALSE(ResolveFunctionalDependencyOnInductionVariable(
+                   while_body->GetInstructionWithName("next_other"))
+                   .has_value());
+}
+
+TEST_F(IrEmissionUtilsTest, DynamicVariableWithIrrelevantGTE) {
+  constexpr absl::string_view kHlo = R"(
+      while_body {
+        p0 = (s32[], s32[], s32[], s32[]) parameter(0)
+        ivar = s32[] get-tuple-element(p0), index=0
+        dynamic_var = s32[] get-tuple-element(p0), index=1
+        irrelevant_var = s32[] get-tuple-element(p0), index=2
+        other_var = s32[] get-tuple-element(p0), index=3
+
+        c1 = s32[] constant(1)
+        next_ivar = s32[] add(ivar, c1)
+        
+        dynamic_computation = s32[] add(ivar, c1)
+        
+        irrelevant_computation = s32[] add(irrelevant_var, c1)
+        
+        next_other = s32[] add(other_var, c1)
+
+        ROOT result = (s32[], s32[], s32[], s32[]) tuple(next_ivar, dynamic_computation, irrelevant_computation, next_other)
+      }
+
+      condition {
+        p0 = (s32[], s32[], s32[], s32[]) parameter(0)
+        ivar = s32[] get-tuple-element(p0), index=0
+        c5 = s32[] constant(5)
+        ROOT cmp = pred[] compare(ivar, c5), direction=LT
+      }
+
+      ENTRY main {
+        c0 = s32[] constant(0)
+        tuple = (s32[], s32[], s32[], s32[]) tuple(c0, c0, c0, c0)
+        ROOT while = (s32[], s32[], s32[], s32[]) while(tuple),
+            condition=condition, body=while_body,
+            backend_config={"known_induction_variable":{"tuple_index":"0"},"dynamic_variable_tuple_indices":[1, 2]}
+      }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHlo));
+  HloComputation* while_body = module->GetComputationWithName("while_body");
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("next_ivar"))
+                  .has_value());
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("dynamic_computation"))
+                  .has_value());
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("irrelevant_computation"))
+                  .has_value());
+
+  ASSERT_FALSE(ResolveFunctionalDependencyOnInductionVariable(
+                   while_body->GetInstructionWithName("next_other"))
+                   .has_value());
+}
+
+TEST_F(IrEmissionUtilsTest, MultipleDynamicVariables) {
+  constexpr absl::string_view kHlo = R"(
+      while_body {
+        p0 = (s32[], s32[], s32[], s32[]) parameter(0)
+        ivar = s32[] get-tuple-element(p0), index=0
+        dynamic_var1 = s32[] get-tuple-element(p0), index=1
+        dynamic_var2 = s32[] get-tuple-element(p0), index=2
+        regular_var = s32[] get-tuple-element(p0), index=3
+
+        c1 = s32[] constant(1)
+        next_ivar = s32[] add(ivar, c1)
+        
+        compute1 = s32[] add(dynamic_var1, c1)
+        compute2 = s32[] add(dynamic_var2, c1)
+        compute_regular = s32[] add(regular_var, c1)
+
+        ROOT result = (s32[], s32[], s32[], s32[]) tuple(next_ivar, compute1, compute2, compute_regular)
+      }
+
+      condition {
+        p0 = (s32[], s32[], s32[], s32[]) parameter(0)
+        ivar = s32[] get-tuple-element(p0), index=0
+        c5 = s32[] constant(5)
+        ROOT cmp = pred[] compare(ivar, c5), direction=LT
+      }
+
+      ENTRY main {
+        c0 = s32[] constant(0)
+        tuple = (s32[], s32[], s32[], s32[]) tuple(c0, c0, c0, c0)
+        ROOT while = (s32[], s32[], s32[], s32[]) while(tuple),
+            condition=condition, body=while_body,
+            backend_config={"known_induction_variable":{"tuple_index":"0"},"dynamic_variable_tuple_indices":[1, 2]}
+      }
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kHlo));
+  HloComputation* while_body = module->GetComputationWithName("while_body");
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("compute1"))
+                  .has_value());
+
+  ASSERT_TRUE(ResolveFunctionalDependencyOnInductionVariable(
+                  while_body->GetInstructionWithName("compute2"))
+                  .has_value());
+
+  ASSERT_FALSE(ResolveFunctionalDependencyOnInductionVariable(
+                   while_body->GetInstructionWithName("compute_regular"))
+                   .has_value());
+}
+
 TEST_F(IrEmissionUtilsTest, Transpose_10) {
   auto spec = GetTransposeSpecFromRoot(R"(ENTRY entry {
     p0 = f32[8, 32] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/collectives/gpu_collective_combiner_utils_test.cc b/third_party/xla/xla/service/gpu/transforms/collectives/gpu_collective_combiner_utils_test.cc
index 6478bac78a9f6d..cae46d2980bfbb 100644
--- a/third_party/xla/xla/service/gpu/transforms/collectives/gpu_collective_combiner_utils_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/collectives/gpu_collective_combiner_utils_test.cc
@@ -275,6 +275,7 @@ TEST_F(CollectiveCombinerUtilsTest,
       /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
       /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
       /*should_allow_control_dependencies=*/false,
+      /*additional_chain_start_op_finder=*/nullptr,
       /*postprocess_backward_peeled_op=*/{},
       /*postprocess_backward_rotated_op=*/{},
       /*postprocess_backward_peeled_trailing_op=*/{},
@@ -370,6 +371,7 @@ TEST_F(CollectiveCombinerUtilsTest,
       /*reuse_pipelined_op_buffer=*/HloPredicateFalse,
       /*should_allow_loop_variant_parameter_in_chain=*/HloPredicateFalse,
       /*should_allow_control_dependencies=*/false,
+      /*additional_chain_start_op_finder=*/nullptr,
       /*postprocess_backward_peeled_op=*/{},
       /*postprocess_backward_rotated_op=*/{},
       /*postprocess_backward_peeled_trailing_op=*/{},
diff --git a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
index 589633cef66afd..fa58739643633a 100644
--- a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
+++ b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
@@ -256,8 +256,13 @@ absl::StatusOr<bool> FullyUnroll(HloInstruction* while_instr,
     changed = true;
   }
 
-  WhileLoopBackendConfig new_config;
+  WhileLoopBackendConfig old_config;
+  TF_ASSIGN_OR_RETURN(old_config,
+                      while_instr->backend_config<WhileLoopBackendConfig>());
+
+  WhileLoopBackendConfig new_config = old_config;
   new_config.mutable_known_trip_count()->set_n(1);
+
   TF_RETURN_IF_ERROR(while_instr->set_backend_config(new_config));
 
   return changed;
@@ -394,15 +399,9 @@ absl::StatusOr<bool> DoubleBufferingUnroll(HloInstruction* while_instr,
                                                &old_loop_roots, input_parameter,
                                                skip_control_dep_injection));
 
-  WhileLoopBackendConfig new_config;
+  WhileLoopBackendConfig new_config = config;
   new_config.mutable_known_trip_count()->set_n(exact_trip_count / 2);
 
-  // Keep known induction variable metadata if it was present before.
-  if (config.has_known_induction_variable()) {
-    *new_config.mutable_known_induction_variable() =
-        config.known_induction_variable();
-  }
-
   // Update the init/step metadata if it was present before.
   if (config.has_known_init_step()) {
     int64_t step = config.known_init_step().step();
diff --git a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc
index 3c643a3b83a110..dbe914ce6037d3 100644
--- a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling_test.cc
@@ -18,6 +18,8 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <optional>
+#include <set>
+#include <vector>
 
 #include <gtest/gtest.h>
 #include "absl/container/flat_hash_set.h"
@@ -42,7 +44,6 @@ namespace xla {
 namespace gpu {
 namespace {
 
-
 int64_t CountInstructions(HloComputation& computation, HloOpcode opcode) {
   int64_t count = 0;
   hlo_query::ForEachInstructionWithOpcode(
@@ -1498,6 +1499,94 @@ TEST_F(GpuLoopDoubleBufferTransformerTest, UpdateInitStepEvenTripCount) {
   EXPECT_EQ(config.known_init_step().step(), 4);
 }
 
+TEST_F(GpuLoopDoubleBufferTransformerTest,
+       PreserveDynamicVariableIndicesAfterDoubleBuffering) {
+  absl::string_view kModuleString = R"(
+HloModule test
+
+condition {
+  input_tuple = (s32[], f32[2,8]{1,0:S(5)}, f32[1,8]{1,0}, s32[]) parameter(0)
+  cond = s32[] get-tuple-element(input_tuple), index=0
+  trip_count = s32[] constant(10)
+  ROOT done = pred[] compare(cond, trip_count), direction=LT
+}
+
+body {
+  input_tuple = (s32[], f32[2,8]{1,0:S(5)}, f32[1,8]{1,0}, s32[]) parameter(0)
+  idx = s32[] get-tuple-element(input_tuple), index=0
+  buffer = f32[2,8]{1,0:S(5)} get-tuple-element(input_tuple), index=1
+  update = f32[1,8]{1,0} get-tuple-element(input_tuple), index=2
+  counter = s32[] get-tuple-element(input_tuple), index=3
+
+  c0 = s32[] constant(0)
+  dus = f32[2,8]{1,0:S(5)} dynamic-update-slice(buffer, update, idx, c0)
+
+  c1 = s32[] constant(1)
+  idx_plus_1 = s32[] add(idx, c1)
+  counter_plus_1 = s32[] add(counter, c1)
+  ROOT output = (s32[], f32[2,8]{1,0:S(5)}, f32[1,8]{1,0}, s32[]) tuple(idx_plus_1, dus, update, counter_plus_1)
+}
+
+ENTRY main {
+  c0 = s32[] constant(0)
+  buffer_init = f32[2,8]{1,0:S(5)} parameter(0)
+  update_init = f32[1,8]{1,0} parameter(1)
+  input_tuple = (s32[], f32[2,8]{1,0:S(5)}, f32[1,8]{1,0}, s32[]) tuple(c0, buffer_init, update_init, c0)
+  ROOT while = (s32[], f32[2,8]{1,0:S(5)}, f32[1,8]{1,0}, s32[]) while(input_tuple), condition=condition, body=body, backend_config={"known_trip_count":{"n":"10"},"known_induction_variable":{"tuple_index":"0"},"dynamic_variable_tuple_indices":["3","0"]}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(kModuleString));
+
+  DoubleBufferLoopUnrolling double_buffer(
+      DoubleBufferLoopUnrolling::UnrollStrategy::kDoubleBuffer);
+  TupleSimplifier tuple_simplifier;
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, double_buffer.Run(module.get()));
+  ASSERT_TRUE(changed);
+  TF_ASSERT_OK_AND_ASSIGN(changed, tuple_simplifier.Run(module.get()));
+
+  std::vector<HloInstruction*> while_loops;
+  for (HloComputation* comp : module->computations()) {
+    for (HloInstruction* instr : comp->instructions()) {
+      if (instr->opcode() == HloOpcode::kWhile) {
+        while_loops.push_back(instr);
+      }
+    }
+  }
+
+  ASSERT_FALSE(while_loops.empty())
+      << "Expected at least one while loop after double buffering";
+
+  for (HloInstruction* while_loop : while_loops) {
+    TF_ASSERT_OK_AND_ASSIGN(
+        WhileLoopBackendConfig config,
+        while_loop->backend_config<WhileLoopBackendConfig>());
+
+    std::set<int64_t> dynamic_indices(
+        config.dynamic_variable_tuple_indices().begin(),
+        config.dynamic_variable_tuple_indices().end());
+
+    EXPECT_FALSE(dynamic_indices.empty())
+        << "Expected dynamic_variable_tuple_indices to be preserved for while "
+           "loop: "
+        << while_loop->name()
+        << ". Double buffering should not erase indices set by "
+           "CollectivePipeliner.";
+
+    EXPECT_NE(dynamic_indices.find(0), dynamic_indices.end())
+        << "Expected tuple index 0 (induction variable) to be preserved as "
+           "dynamic for while loop: "
+        << while_loop->name();
+
+    EXPECT_NE(dynamic_indices.find(3), dynamic_indices.end())
+        << "Expected tuple index 3 (additional counter) to be preserved as "
+           "dynamic for while loop: "
+        << while_loop->name();
+  }
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/host_offload_utils.cc b/third_party/xla/xla/service/host_offload_utils.cc
index 13eef41d1453d2..1473db05c250f9 100644
--- a/third_party/xla/xla/service/host_offload_utils.cc
+++ b/third_party/xla/xla/service/host_offload_utils.cc
@@ -353,5 +353,88 @@ bool IsMoveToDeviceWithDynamicSlice(const HloInstruction* instr) {
   return false;
 }
 
+namespace {
+
+// Recursively finds GTE indices used in DS/DUS index operands.
+absl::flat_hash_set<int64_t> FindTupleIndicesInOperand(
+    const HloInstruction* operand) {
+  absl::flat_hash_set<int64_t> indices;
+
+  if (operand->opcode() == HloOpcode::kGetTupleElement) {
+    indices.insert(operand->tuple_index());
+  } else if (operand->opcode() == HloOpcode::kCopy &&
+             operand->operand_count() == 1) {
+    auto copy_indices = FindTupleIndicesInOperand(operand->operand(0));
+    indices.insert(copy_indices.begin(), copy_indices.end());
+  } else if (operand->opcode() == HloOpcode::kAdd ||
+             operand->opcode() == HloOpcode::kSubtract ||
+             operand->opcode() == HloOpcode::kMultiply ||
+             operand->opcode() == HloOpcode::kDivide) {
+    for (int i = 0; i < operand->operand_count(); ++i) {
+      auto op_indices = FindTupleIndicesInOperand(operand->operand(i));
+      indices.insert(op_indices.begin(), op_indices.end());
+    }
+  }
+
+  return indices;
+}
+
+}  // namespace
+
+absl::Status MarkDynamicVariables(HloInstruction* while_loop) {
+  if (while_loop->opcode() != HloOpcode::kWhile) {
+    return absl::OkStatus();
+  }
+
+  if (!while_loop->while_body()) {
+    return absl::OkStatus();
+  }
+
+  bool has_host_offloading = false;
+  for (const HloInstruction* instr : while_loop->while_body()->instructions()) {
+    if (IsMoveToHostWithDynamicUpdateSlice(instr) ||
+        IsMoveToDeviceWithDynamicSlice(instr)) {
+      has_host_offloading = true;
+      break;
+    }
+  }
+  if (!has_host_offloading) {
+    return absl::OkStatus();
+  }
+
+  WhileLoopBackendConfig config;
+  TF_ASSIGN_OR_RETURN(config,
+                      while_loop->backend_config<WhileLoopBackendConfig>());
+
+  config.clear_dynamic_variable_tuple_indices();
+
+  std::set<int64_t> dynamic_slice_indices;
+
+  for (auto* instr : while_loop->while_body()->instructions()) {
+    if (instr->opcode() == HloOpcode::kDynamicUpdateSlice ||
+        instr->opcode() == HloOpcode::kDynamicSlice) {
+      int first_index_operand =
+          (instr->opcode() == HloOpcode::kDynamicUpdateSlice)
+              ? Cast<HloDynamicUpdateSliceInstruction>(instr)
+                    ->first_index_operand_number()
+              : Cast<HloDynamicSliceInstruction>(instr)
+                    ->first_index_operand_number();
+
+      for (int i = first_index_operand; i < instr->operand_count(); ++i) {
+        auto* index_op = instr->operand(i);
+        auto op_indices = FindTupleIndicesInOperand(index_op);
+        dynamic_slice_indices.insert(op_indices.begin(), op_indices.end());
+      }
+    }
+  }
+
+  for (int64_t tuple_idx : dynamic_slice_indices) {
+    config.add_dynamic_variable_tuple_indices(tuple_idx);
+  }
+
+  TF_RETURN_IF_ERROR(while_loop->set_backend_config(config));
+  return absl::OkStatus();
+}
+
 }  // namespace host_offload_utils
 }  // namespace xla
diff --git a/third_party/xla/xla/service/host_offload_utils.h b/third_party/xla/xla/service/host_offload_utils.h
index 1edac1898c7b4b..e49b9b64bd423b 100644
--- a/third_party/xla/xla/service/host_offload_utils.h
+++ b/third_party/xla/xla/service/host_offload_utils.h
@@ -111,6 +111,10 @@ bool IsMoveToHostWithDynamicUpdateSlice(const HloInstruction* instr);
 
 bool IsMoveToDeviceWithDynamicSlice(const HloInstruction* instr);
 
+// Scans while loop body for DS/DUS, traces their index operands back to GTEs
+// and marks corresponding tuple indices as dynamic variables.
+absl::Status MarkDynamicVariables(HloInstruction* while_loop);
+
 }  // namespace host_offload_utils
 }  // namespace xla
 
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index bb3123dfef0137..054a2de5446d64 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -550,6 +550,8 @@ message DebugOptions {
 
   optional bool xla_gpu_enable_pipelined_collectives = 239 [deprecated = true];
 
+  optional bool xla_gpu_enable_pipelined_host_offloading = 440;
+
   optional bool xla_gpu_enable_pipelined_p2p = 246;
 
   optional bool xla_gpu_enable_pipelined_reduce_scatter = 231;
@@ -1328,7 +1330,7 @@ message DebugOptions {
   // Note: when adding a new flag, please add it to one of the hardware-specific
   // or hardware-agnostic sections at the top of this proto message.
 
-  // Next id: 440
+  // Next id: 441
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
diff --git a/third_party/xla/xla/xla_data.proto b/third_party/xla/xla/xla_data.proto
index 9557c9526209a1..92add3f8d46b57 100644
--- a/third_party/xla/xla/xla_data.proto
+++ b/third_party/xla/xla/xla_data.proto
@@ -1383,6 +1383,10 @@ message WhileLoopBackendConfig {
   // This lets us distinguish between an unknown induction variable (or none)
   // and tuple index 0.
   KnownInductionVariable known_induction_variable = 3;
+
+  // Variables that should be treated as induction variables for dynamic memcpy
+  // analysis, even though they are not the primary induction variable.
+  repeated int64 dynamic_variable_tuple_indices = 4;
 }
 
 // Specifies a pair of output/operand buffers that alias each other for

From c62696e7df38065ca35364ae5c5edc22a77fa795 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 02:04:41 -0800
Subject: [PATCH 320/753] Remove `xla::ifrt::kIsSdyPartitioned` from mesh
 existance check.

We can then remove xla::ifrt::kIsSdyPartitioned from LowerToIfrt after the 3
week forward compatability window has passed. This should be safe since all
exports that would have had kIsSdyPartitioned also would have had
sdy_meshes_round_trip_attr.

PiperOrigin-RevId: 845156885
---
 .../python/ifrt/ir/transforms/ifrt_compile_atom_program_pass.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_atom_program_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_atom_program_pass.cc
index fbd4f41c157e0f..0669c2599d5566 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_atom_program_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_compile_atom_program_pass.cc
@@ -156,7 +156,7 @@ void IfrtCompileAtomProgramPass::runOnOperation() {
       }
 
       // TODO(b/433244129) - remove after 6 months bwd compatibility window.
-      if (sdy_meshes_round_trip_attr && call_op->hasAttr(kIsSdyPartitioned)) {
+      if (sdy_meshes_round_trip_attr) {
         // Add the meshes roundtrip attribute to the callee module if the
         // atom program was partitioned with sdy.
         xla::sdy::setFrontendAttribute(callee_module,

From fb160028edb308bf14489e33c80823cb2bae813c Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Tue, 16 Dec 2025 02:40:53 -0800
Subject: [PATCH 321/753] [XLA:GPU] Move gpu_specs/* to
 xla/backends/gpu/specs/*.

PiperOrigin-RevId: 845168853
---
 third_party/xla/docs/test_hlo_passes.md       |  2 +-
 third_party/xla/xla/backends/gpu/BUILD        |  9 +++
 .../gpu/specs}/README.md                      |  0
 .../gpu/specs}/a100_pcie_80.txtpb             |  0
 .../gpu/specs}/a100_sxm_40.txtpb              |  0
 .../gpu/specs}/a100_sxm_80.txtpb              |  0
 .../gpu/specs}/a6000.txtpb                    |  0
 .../gpu/specs}/b200.txtpb                     |  0
 .../gpu/specs}/b300.txtpb                     | 59 +++++++++----------
 .../gpu/specs}/h100_pcie.txtpb                |  0
 .../gpu/specs}/h100_sxm.txtpb                 |  0
 .../gpu/specs}/mi200.txtpb                    |  0
 .../gpu/specs}/p100.txtpb                     |  0
 .../gpu/specs}/v100.txtpb                     |  0
 third_party/xla/xla/lit.bzl                   | 14 ++---
 third_party/xla/xla/service/BUILD             | 10 ++--
 .../xla/xla/service/gpu/autotuning/BUILD      | 12 ++--
 .../gpu/autotuning/autotune_cache_key_test.cc |  4 +-
 .../xla/service/gpu/gpu_spmd_pipeline_test.cc |  2 +-
 .../xla/service/gpu/tests/bitcast-convert.hlo |  2 +-
 .../service/gpu/tests/calling_convention.hlo  |  2 +-
 .../xla/xla/service/gpu/tests/dot_bf16.hlo    |  6 +-
 .../xla/service/gpu/tests/kernel_reuse.hlo    |  2 +-
 .../service/gpu/tests/offload_scan_output.hlo |  2 +-
 .../xla/service/gpu/tests/pad_to_static.hlo   |  2 +-
 .../service/gpu/tests/reduce-precision.hlo    |  2 +-
 .../gpu/tests/reduce_fold_zero_add.hlo        |  2 +-
 .../gpu/tests/rng_get_and_update_state.hlo    |  2 +-
 .../service/gpu/tests/single_instruction.hlo  |  6 +-
 .../service/gpu/tests/slice_to_dynamic.hlo    |  2 +-
 .../xla/xla/service/gpu/tests/sorting.hlo     |  2 +-
 .../gpu/tests/sub_byte_collectives.hlo        |  2 +-
 .../gpu/tests/triton_calling_convention.hlo   |  2 +-
 .../xla/service/gpu/tests/triton_naming.hlo   |  2 +-
 .../gpu/tests/zero_clamp_abs_index.hlo        |  2 +-
 .../xla/xla/service/gpu/transforms/BUILD      |  6 +-
 .../gpu/transforms/layout_assignment_a100.hlo |  2 +-
 .../gpu/transforms/layout_assignment_h100.hlo |  2 +-
 .../gpu/transforms/layout_assignment_v100.hlo |  2 +-
 third_party/xla/xla/stream_executor/gpu/BUILD | 20 +++----
 .../gpu/gpu_device_info_test.cc               |  4 +-
 third_party/xla/xla/tools/BUILD               |  2 +-
 third_party/xla/xla/tools/hlo_opt/BUILD       | 12 +---
 .../xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo   |  2 +-
 .../tools/hlo_opt/tests/gpu_hlo_backend.hlo   |  2 +-
 .../tools/hlo_opt/tests/gpu_hlo_buffers.hlo   |  2 +-
 .../hlo_opt/tests/gpu_hlo_collective_cse.hlo  |  2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_html.hlo  |  2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo  |  2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo  |  2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo   |  2 +-
 .../tests/gpu_hlo_unoptimized_llvm.hlo        |  2 +-
 .../xla/xla/tools/xla_gpu_compile_lib_test.cc |  2 +-
 53 files changed, 110 insertions(+), 112 deletions(-)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/README.md (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/a100_pcie_80.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/a100_sxm_40.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/a100_sxm_80.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/a6000.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/b200.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/b300.txtpb (95%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/h100_pcie.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/h100_sxm.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/mi200.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/p100.txtpb (100%)
 rename third_party/xla/xla/{tools/hlo_opt/gpu_specs => backends/gpu/specs}/v100.txtpb (100%)

diff --git a/third_party/xla/docs/test_hlo_passes.md b/third_party/xla/docs/test_hlo_passes.md
index 8afcf6bf773aaf..723406e74d991a 100644
--- a/third_party/xla/docs/test_hlo_passes.md
+++ b/third_party/xla/docs/test_hlo_passes.md
@@ -51,7 +51,7 @@ For example, some
 be written as follows:
 
 ```
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
 
 HloModule Test, is_scheduled=true
 fused_computation {
diff --git a/third_party/xla/xla/backends/gpu/BUILD b/third_party/xla/xla/backends/gpu/BUILD
index b0a85c6ca377fc..385fb6153ff023 100644
--- a/third_party/xla/xla/backends/gpu/BUILD
+++ b/third_party/xla/xla/backends/gpu/BUILD
@@ -32,3 +32,12 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
     ],
 )
+
+filegroup(
+    name = "all_gpu_specs",
+    data = glob(["specs/*.txtpb"]),
+)
+
+exports_files(glob([
+    "specs/*.txtpb",
+]))
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/README.md b/third_party/xla/xla/backends/gpu/specs/README.md
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/README.md
rename to third_party/xla/xla/backends/gpu/specs/README.md
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/a100_pcie_80.txtpb b/third_party/xla/xla/backends/gpu/specs/a100_pcie_80.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/a100_pcie_80.txtpb
rename to third_party/xla/xla/backends/gpu/specs/a100_pcie_80.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/a100_sxm_40.txtpb b/third_party/xla/xla/backends/gpu/specs/a100_sxm_40.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/a100_sxm_40.txtpb
rename to third_party/xla/xla/backends/gpu/specs/a100_sxm_40.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/a100_sxm_80.txtpb b/third_party/xla/xla/backends/gpu/specs/a100_sxm_80.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/a100_sxm_80.txtpb
rename to third_party/xla/xla/backends/gpu/specs/a100_sxm_80.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/a6000.txtpb b/third_party/xla/xla/backends/gpu/specs/a6000.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/a6000.txtpb
rename to third_party/xla/xla/backends/gpu/specs/a6000.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/b200.txtpb b/third_party/xla/xla/backends/gpu/specs/b200.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/b200.txtpb
rename to third_party/xla/xla/backends/gpu/specs/b200.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/b300.txtpb b/third_party/xla/xla/backends/gpu/specs/b300.txtpb
similarity index 95%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/b300.txtpb
rename to third_party/xla/xla/backends/gpu/specs/b300.txtpb
index a4d2f9de3aea3b..76d77b2cda6089 100644
--- a/third_party/xla/xla/tools/hlo_opt/gpu_specs/b300.txtpb
+++ b/third_party/xla/xla/backends/gpu/specs/b300.txtpb
@@ -11,33 +11,32 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-gpu_device_info {
-  threads_per_block_limit: 1024
-  threads_per_warp: 32
-  shared_memory_per_block: 49152
-  shared_memory_per_core: 233472
-  threads_per_core_limit: 2048
-  core_count: 158
-  fpus_per_core: 128
-  block_dim_limit_x: 2147483647
-  block_dim_limit_y: 65535
-  block_dim_limit_z: 65535
-  memory_bandwidth: 7936000000000
-  l2_cache_size: 135528448
-  clock_rate_ghz: 1.965
-  device_memory_size: 297021865984
-  shared_memory_per_block_optin: 232448
-  cuda_compute_capability {
-    major: 10
-    minor: 3
-  }
-  registers_per_core_limit: 65536
-  registers_per_block_limit: 65536
-}
-platform_name: "CUDA"
-dnn_version_info {
-  major: 9
-  minor: 9
-}
-device_description_str: "NVIDIA B300"
+{
+  threads_per_block_limit: 1024
+  threads_per_warp: 32
+  shared_memory_per_block: 49152
+  shared_memory_per_core: 233472
+  threads_per_core_limit: 2048
+  core_count: 158
+  fpus_per_core: 128
+  block_dim_limit_x: 2147483647
+  block_dim_limit_y: 65535
+  block_dim_limit_z: 65535
+  memory_bandwidth: 7936000000000
+  l2_cache_size: 135528448
+  clock_rate_ghz: 1.965
+  device_memory_size: 297021865984
+  shared_memory_per_block_optin: 232448
+  cuda_compute_capability {
+    major: 10
+    minor: 3
+  }
+  registers_per_core_limit: 65536
+  registers_per_block_limit: 65536
+}
+platform_name: "CUDA"
+dnn_version_info {
+  major: 9
+  minor: 9
+}
+device_description_str: "NVIDIA B300"
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/h100_pcie.txtpb b/third_party/xla/xla/backends/gpu/specs/h100_pcie.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/h100_pcie.txtpb
rename to third_party/xla/xla/backends/gpu/specs/h100_pcie.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/h100_sxm.txtpb b/third_party/xla/xla/backends/gpu/specs/h100_sxm.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/h100_sxm.txtpb
rename to third_party/xla/xla/backends/gpu/specs/h100_sxm.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/mi200.txtpb b/third_party/xla/xla/backends/gpu/specs/mi200.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/mi200.txtpb
rename to third_party/xla/xla/backends/gpu/specs/mi200.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/p100.txtpb b/third_party/xla/xla/backends/gpu/specs/p100.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/p100.txtpb
rename to third_party/xla/xla/backends/gpu/specs/p100.txtpb
diff --git a/third_party/xla/xla/tools/hlo_opt/gpu_specs/v100.txtpb b/third_party/xla/xla/backends/gpu/specs/v100.txtpb
similarity index 100%
rename from third_party/xla/xla/tools/hlo_opt/gpu_specs/v100.txtpb
rename to third_party/xla/xla/backends/gpu/specs/v100.txtpb
diff --git a/third_party/xla/xla/lit.bzl b/third_party/xla/xla/lit.bzl
index 3f252f4653d64a..75bd84958410bd 100644
--- a/third_party/xla/xla/lit.bzl
+++ b/third_party/xla/xla/lit.bzl
@@ -206,13 +206,13 @@ def lit_test_suite_for_gpus(
             "--param=GPU=%s" % (gpu),
         ]
         gpu_data = data + [
-            "//xla/tools/hlo_opt:gpu_specs/a100_pcie_80.txtpb",
-            "//xla/tools/hlo_opt:gpu_specs/a6000.txtpb",
-            "//xla/tools/hlo_opt:gpu_specs/b200.txtpb",
-            "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
-            "//xla/tools/hlo_opt:gpu_specs/mi200.txtpb",
-            "//xla/tools/hlo_opt:gpu_specs/p100.txtpb",
-            "//xla/tools/hlo_opt:gpu_specs/v100.txtpb",
+            "//xla/backends/gpu:specs/a100_pcie_80.txtpb",
+            "//xla/backends/gpu:specs/a6000.txtpb",
+            "//xla/backends/gpu:specs/b200.txtpb",
+            "//xla/backends/gpu:specs/h100_sxm.txtpb",
+            "//xla/backends/gpu:specs/mi200.txtpb",
+            "//xla/backends/gpu:specs/p100.txtpb",
+            "//xla/backends/gpu:specs/v100.txtpb",
         ]
         lit_test_suite(
             "%s_%s" % (name, gpu),
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index f8821201e34dc6..681ca64d46e3e6 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -5817,34 +5817,34 @@ xla_aot_compile_cpu(
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test.mlir",
 )
 
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_hlo",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test.hlo",
 )
 
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_constant",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test_constant.mlir",
 )
 
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_convolution",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test_convolution.mlir",
 )
 
 xla_aot_compile_gpu_runtime_autotuning(
     name = "xla_aot_compile_test_gpu_executable_convolution_runtime_autotuning",
-    gpu_target_config = "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test_convolution.mlir",
 )
 
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 10c0b530b33f79..ddbf483fc5a710 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -411,9 +411,9 @@ xla_cc_test(
     name = "autotune_cache_key_test",
     srcs = ["autotune_cache_key_test.cc"],
     data = [
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_40.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/mi200.txtpb",
+        "//xla/backends/gpu:specs/a100_sxm_40.txtpb",
+        "//xla/backends/gpu:specs/a100_sxm_80.txtpb",
+        "//xla/backends/gpu:specs/mi200.txtpb",
     ],
     deps = [
         ":autotune_cache_key",
@@ -678,9 +678,9 @@ xla_cc_test(
     name = "autotuner_util_test",
     srcs = ["autotuner_util_test.cc"],
     data = [
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_40.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/mi200.txtpb",
+        "//xla/backends/gpu:specs/a100_sxm_40.txtpb",
+        "//xla/backends/gpu:specs/a100_sxm_80.txtpb",
+        "//xla/backends/gpu:specs/mi200.txtpb",
     ],
     tags = [
         "gpu",
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc
index 35c5669ade26df..dcb44e7f8cec60 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc
@@ -66,8 +66,8 @@ TEST(AutotuneCacheKeyTest, DeviceDescriptionToCacheKey) {
     std::string spec_string;
     CHECK_OK(tsl::ReadFileToString(
         tsl::Env::Default(),
-        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "tools", "hlo_opt",
-                          "gpu_specs", spec_file_name),
+        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "backends", "gpu",
+                          "specs", spec_file_name),
         &spec_string));
     EXPECT_TRUE(
         tsl::protobuf::TextFormat::ParseFromString(spec_string, &proto));
diff --git a/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc b/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc
index 961e503f06fcb2..a99ce566bf8681 100644
--- a/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc
@@ -60,7 +60,7 @@ class GpuSpmdPartitioningTest : public HloHardwareIndependentTestBase,
     HloPassPipeline spmd_pipeline("spmd-partitioner");
     se::CudaComputeCapability ampere(8, 0);
     AlgebraicSimplifierOptions alg_simplifier_options;
-    // Ampere Core_count from tensorflow/compiler/xla/tools/hlo_opt/gpu_specs/.
+    // Ampere Core_count from tensorflow/compiler/xla/backends/gpu/specs/.
     AddSPMDPasses(module.get(), alg_simplifier_options, ampere, spmd_pipeline,
                   std::nullopt);
     TF_RETURN_IF_ERROR(spmd_pipeline.Run(module.get()).status());
diff --git a/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo b/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo
index a70d34d60e935a..831e2cce5977ce 100644
--- a/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo
+++ b/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck  %s
+// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck  %s
 
 e {
   a = s4[8,2]{1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
index dca775f862228b..6ba3138d32f91c 100644
--- a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
+++ b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // Arguments are passed separately.
 // Even constant arguments are passed as arguments.
diff --git a/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo b/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
index 982d34501f4730..9c2a3ffe406710 100644
--- a/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
+++ b/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
@@ -1,6 +1,6 @@
-// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 %}
-// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_pcie_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
-// RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
+// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 %}
+// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/a100_pcie_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
+// RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
 
 
 // CHECK-SM70: %[[convert1:.+]] = f32[1536,6144]{1,0} fusion(%{{.+}})
diff --git a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
index 26fa9db336994b..a0a40ea5c8d9e1 100644
--- a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
+++ b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // All fusions must reuse the same kernel:
 // CHECK-LABEL: target triple
diff --git a/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo b/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo
index 76c221666c840a..6ed7b42bdd369a 100644
--- a/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo
+++ b/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK %s
 
 HloModule jit_f, entry_computation_layout={()->(f32[4]{0:S(5)}, f32[4]{0})}, allow_spmd_sharding_propagation_to_output={true,true}
 
diff --git a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
index 6e147df3928c09..4681d8bb6293b3 100644
--- a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
+++ b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
diff --git a/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo b/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo
index 94752f8871c81e..2686645ffa76b0 100644
--- a/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo
+++ b/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck  %s
+// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck  %s
 
 e {
   a = bf16[] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo b/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo
index e5772500423492..a351a41eeb4212 100644
--- a/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo
+++ b/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-after-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-after-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
 
 HloModule test
 
diff --git a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
index e140b56af9d60c..f218bfd5f46c40 100644
--- a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
+++ b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule TestModule, is_scheduled=true
 
diff --git a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
index 893619567093d3..25b1ddbd10d262 100644
--- a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
+++ b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
@@ -1,6 +1,6 @@
-// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s
-// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM80
-// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/h100_sxm.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM90
+// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM80
+// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/h100_sxm.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM90
 
 // CHECK-DAG: sqrt.approx.f32
 
diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
index 79fdfb950966e4..c353ad741a4dee 100644
--- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
+++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
diff --git a/third_party/xla/xla/service/gpu/tests/sorting.hlo b/third_party/xla/xla/service/gpu/tests/sorting.hlo
index 4ecf3fcb847da4..868666512e6b2f 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting.hlo
+++ b/third_party/xla/xla/service/gpu/tests/sorting.hlo
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 HloModule TestModule, is_scheduled=true
 
diff --git a/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo b/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
index fd858818c3dbcd..acd8b35b1f5252 100644
--- a/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
+++ b/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --split-input-file --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck  %s
+// RUN: hlo-opt %s --platform=gpu --split-input-file --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck  %s
 
 e {
   a = s4[4,16]{1,0:E(4)} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
index 6a83c444793d47..37674565a81d50 100644
--- a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
+++ b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
 
 // Verify that Triton kernels have the correct calling convention:
 // - PTX_KERNEL (71) for NVIDIA targets
diff --git a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
index 0578a2b058fcde..f042371e435f10 100644
--- a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
+++ b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
 
 // CHECK-PTX: define ptx_kernel void @triton_gemm_r(
 // CHECK-GCN: define amdgpu_kernel void @triton_gemm_r(
diff --git a/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo b/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo
index 59f448644172d4..39294a000c2d8a 100644
--- a/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo
+++ b/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 e {
   p0 = s32[8,9] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 6bd2d3f396eccf..abd291790b3331 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1740,9 +1740,9 @@ lit_test_suite(
     ),
     cfg = "//xla:lit.cfg.py",
     data = [
-        "//xla/tools/hlo_opt:gpu_specs/a100_pcie_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/v100.txtpb",
+        "//xla/backends/gpu:specs/a100_pcie_80.txtpb",
+        "//xla/backends/gpu:specs/h100_sxm.txtpb",
+        "//xla/backends/gpu:specs/v100.txtpb",
     ],
     default_tags = tf_gpu_tests_tags(),
     tools = [
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
index f5f35fca937004..8230d28582b039 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s
 
 // CHECK: fused_transpose
 // CHECK-NEXT: bf16[3,3,16,32]{3,2,1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
index 07b3a04afabf74..ab91085ea9b00f 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/h100_sxm.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/h100_sxm.txtpb --split-input-file | FileCheck %s
 
 // CHECK: fused_transpose
 // CHECK-NEXT: f8e4m3fn[3,3,16,32]{3,2,1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
index f91bea89a7cce5..a627b6551cf7ed 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/v100.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/v100.txtpb --split-input-file | FileCheck %s
 
 // CHECK: fused_transpose
 // CHECK-NEXT: f16[3,3,16,32]{3,2,1,0} parameter(0)
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 52e926f2befb72..09200e423bd1b8 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -707,16 +707,16 @@ xla_test(
     srcs = ["gpu_device_info_test.cc"],
     backends = ["gpu"],
     data = [
-        "//xla/tools/hlo_opt:gpu_specs/a100_pcie_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_40.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a6000.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/b200.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/h100_pcie.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/mi200.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/p100.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/v100.txtpb",
+        "//xla/backends/gpu:specs/a100_pcie_80.txtpb",
+        "//xla/backends/gpu:specs/a100_sxm_40.txtpb",
+        "//xla/backends/gpu:specs/a100_sxm_80.txtpb",
+        "//xla/backends/gpu:specs/a6000.txtpb",
+        "//xla/backends/gpu:specs/b200.txtpb",
+        "//xla/backends/gpu:specs/h100_pcie.txtpb",
+        "//xla/backends/gpu:specs/h100_sxm.txtpb",
+        "//xla/backends/gpu:specs/mi200.txtpb",
+        "//xla/backends/gpu:specs/p100.txtpb",
+        "//xla/backends/gpu:specs/v100.txtpb",
     ],
     deps = [
         "//xla/service:platform_util",
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
index 14bf3ddd285fe4..3470b878182ab7 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
@@ -45,8 +45,8 @@ TEST(DeviceInfoTest, DeviceInfoMatches) {
     std::string spec_string;
     TF_ASSERT_OK(tsl::ReadFileToString(
         tsl::Env::Default(),
-        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "tools", "hlo_opt",
-                          "gpu_specs", absl::StrCat(file_name, ".txtpb")),
+        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "backends", "gpu",
+                          "specs", absl::StrCat(file_name, ".txtpb")),
         &spec_string));
     ASSERT_TRUE(
         tsl::protobuf::TextFormat::ParseFromString(spec_string, &proto));
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 999ad17ad36944..7cfbccb52aba71 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -1103,8 +1103,8 @@ xla_test(
     ],
     data = [
         ":data/add.hlo",
+        "//xla/backends/gpu:specs/h100_sxm.txtpb",
         "//xla/service/gpu:gpu_compiler_test_autotune_db.textproto",
-        "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
     ],
     deps = [
         ":xla_compile_lib",
diff --git a/third_party/xla/xla/tools/hlo_opt/BUILD b/third_party/xla/xla/tools/hlo_opt/BUILD
index 2d5d86eb434b59..4eeffad9c2b681 100644
--- a/third_party/xla/xla/tools/hlo_opt/BUILD
+++ b/third_party/xla/xla/tools/hlo_opt/BUILD
@@ -258,18 +258,8 @@ filegroup(
     name = "test_utilities",
     testonly = True,
     data = [
-        "gpu_specs/a100_pcie_80.txtpb",
-        "gpu_specs/mi200.txtpb",
+        "//xla/backends/gpu:all_gpu_specs",
         "//xla/tools:hlo-opt",
         "@llvm-project//llvm:FileCheck",
     ],
 )
-
-filegroup(
-    name = "all_gpu_specs",
-    data = glob(["gpu_specs/*.txtpb"]),
-)
-
-exports_files(glob([
-    "gpu_specs/*.txtpb",
-]))
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo
index 71eed4621791ac..cb45f800808a59 100755
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule module
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo
index c12803692ffefc..9a60d938a9562f 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo-backend --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo-backend --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule module
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo
index 94fea4361d0516..65abb9b3ab2c09 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=buffer-assignment --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=buffer-assignment --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo
index fa7b25f68abea6..f2d9127e5bff21 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --passes=schedule-aware-collective-cse --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb --xla_gpu_experimental_collective_cse_distance_threshold=100 | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --passes=schedule-aware-collective-cse --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --xla_gpu_experimental_collective_cse_distance_threshold=100 | FileCheck %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo
index 6e482da819805b..74675fddafa9ed 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=html --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=html --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 // CHECK: <!DOCTYPE html>
 // CHECK: bitcast
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
index 2dac74fda61516..dc1ca2693ffb6e 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
index 8df2cdf621eb15..60fe903cee5326 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --passes=dot-algorithm-rewriter --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --passes=dot-algorithm-rewriter --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule Algorithm3xBF16
 // CHECK-LABEL: HloModule Algorithm3xBF16, entry_computation_layout={(f32[128,128]{1,0}, f32[128,128]{1,0})->f32[128,128]{1,0}}
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo
index b01f867b96483a..20a86530be50cb 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=CUDA --stage=ptx --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=CUDA --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo
index 7e47a652c452f0..b8888a5d362944 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // CHECK-PTX:     define ptx_kernel void @fusion
 // CHECK-GCN:     define amdgpu_kernel void @fusion
diff --git a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
index ce26278c7293a7..d3eec58a3de994 100644
--- a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
+++ b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
@@ -72,7 +72,7 @@ TEST_F(XlaCompileLibTest, CompilesForGpuWithDevice) {
 
 TEST_F(XlaCompileLibTest, CompilesForGpuWithoutDevice) {
   const std::string target_config_path = tsl::io::JoinPath(
-      tsl::testing::XlaSrcRoot(), "tools/hlo_opt/gpu_specs", "h100_sxm.txtpb");
+      tsl::testing::XlaSrcRoot(), "backends/gpu/specs", "h100_sxm.txtpb");
   stream_executor::GpuTargetConfigProto target_config;
   TF_ASSERT_OK(tsl::ReadTextProto(tsl::Env::Default(), target_config_path,
                                   &target_config));

From 79d7c14950e39ae5c3339e7c2b26461103bb51fa Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Tue, 16 Dec 2025 02:44:36 -0800
Subject: [PATCH 322/753] Reverts f5047b172a3141ae75aae1ee843f5b1d44f7af09

PiperOrigin-RevId: 845169962
---
 third_party/xla/xla/service/compiler.h                     | 7 -------
 .../xla/xla/service/cpu/cpu_aot_compilation_result.h       | 2 --
 2 files changed, 9 deletions(-)

diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index 5de2f37f65a9b4..ee2fac51790803 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -69,7 +69,6 @@ namespace xla {
 // computation.
 using ObjectFileData = std::vector<char>;
 
-class Compiler;
 class AotCompilationOptions;
 
 // Abstract superclass describing the result of an ahead-of-time compilation.
@@ -89,12 +88,6 @@ class AotCompilationResult {
     return Unimplemented("LoadExecutable unimplemented.");
   }
 
-  ABSL_DEPRECATE_AND_INLINE()
-  absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      Compiler*, const se::StreamExecutor* executor) && {
-    return std::move(*this).LoadExecutable(executor);
-  }
-
   virtual absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
       const {
     return Unimplemented("buffer_assignment unimplemented.");
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index 2ca82de23ae14f..0a2b8cf2a17378 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -116,8 +116,6 @@ class CpuAotCompilationResult : public AotCompilationResult {
     return proto_.SerializeAsString();
   }
 
-  using AotCompilationResult::LoadExecutable;
-
   absl::StatusOr<std::unique_ptr<Executable>>
       LoadExecutable(const se::StreamExecutor* stream_exec) && override;
 

From 99ddc261a68f179af9f9273ba46d45041034576b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 03:18:35 -0800
Subject: [PATCH 323/753] Automated Code Change

PiperOrigin-RevId: 845180767
---
 third_party/xla/xla/service/spmd/custom_call_handler.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/spmd/custom_call_handler.cc b/third_party/xla/xla/service/spmd/custom_call_handler.cc
index 7dd17a745e6231..007657b5986a1a 100644
--- a/third_party/xla/xla/service/spmd/custom_call_handler.cc
+++ b/third_party/xla/xla/service/spmd/custom_call_handler.cc
@@ -482,7 +482,7 @@ absl::Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) {
   // Block-scaled dot with MX operands.
   if (hlo->custom_call_target() == "__op$block_scaled_dot") {
     // Evaluate the dimension numbers of the block-scaled dot.
-    int dimensions_size = hlo->operand(0)->shape().dimensions_size();
+    int dimensions_size = hlo->operand(0)->shape().dimensions().size();
     TF_RET_CHECK(dimensions_size == 2 || dimensions_size == 3);
     DotDimensionNumbers dimension_numbers;
     dimension_numbers.add_lhs_contracting_dimensions(dimensions_size - 1);

From 2f93881580df8ba8aa13cb2cb11ddc24226cd0e0 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Tue, 16 Dec 2025 03:32:08 -0800
Subject: [PATCH 324/753] [XLA] Skip printing StackFrameIndex if print_metadata
 is false.

PiperOrigin-RevId: 845184915
---
 third_party/xla/xla/hlo/ir/hlo_module.cc      |   8 +-
 third_party/xla/xla/hlo/ir/hlo_module.h       |   3 +-
 .../xla/xla/hlo/parser/hlo_parser_test.cc     | 272 ++++++++++--------
 3 files changed, 155 insertions(+), 128 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_module.cc b/third_party/xla/xla/hlo/ir/hlo_module.cc
index d67349ea44e393..aa64124e7d6bdb 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_module.cc
@@ -406,7 +406,7 @@ void HloModule::Print(
         },
         value);
   }
-  PrintStackFrameIndex(printer);
+  PrintStackFrameIndex(printer, options);
   printer->Append("\n\n");
   PrintComputations(printer, options);
 }
@@ -479,8 +479,10 @@ void HloModule::PrintComputations(Printer* printer,
   }
 }
 
-void HloModule::PrintStackFrameIndex(Printer* printer) const {
-  if (!stack_frame_index_.has_value()) {
+void HloModule::PrintStackFrameIndex(Printer* printer,
+                                     const HloPrintOptions& options) const {
+  if (!stack_frame_index_.has_value() ||
+      stack_frame_index_->file_names().empty() || !options.print_metadata()) {
     return;
   }
   printer->Append("\n\nFileNames\n");
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.h b/third_party/xla/xla/hlo/ir/hlo_module.h
index 25f0ba3ac2d967..584fd38d1e8979 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.h
+++ b/third_party/xla/xla/hlo/ir/hlo_module.h
@@ -448,7 +448,8 @@ class HloModule {
   void PrintComputations(Printer* printer,
                          const HloPrintOptions& options) const;
   void PrintConfig(Printer* printer, const HloModuleConfig& config) const;
-  void PrintStackFrameIndex(Printer* printer) const;
+  void PrintStackFrameIndex(Printer* printer,
+                            const HloPrintOptions& options) const;
 
  public:
   // Prints a string representation of the module.
diff --git a/third_party/xla/xla/hlo/parser/hlo_parser_test.cc b/third_party/xla/xla/hlo/parser/hlo_parser_test.cc
index 2d2cbf31651e68..3edde58537a8fd 100644
--- a/third_party/xla/xla/hlo/parser/hlo_parser_test.cc
+++ b/third_party/xla/xla/hlo/parser/hlo_parser_test.cc
@@ -2820,15 +2820,14 @@ class HloParameterizedParserTest
     VLOG(3) << "Running HloParameterizedParserTest with short_form = "
             << short_form << ", proto_round_trip = " << proto_round_trip;
     const std::string& original = GetParam().module_string;
-    TF_ASSERT_OK_AND_ASSIGN(auto module,
-                            ParseAndReturnVerifiedModule(original));
-    TF_ASSERT_OK_AND_ASSIGN(
+    ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(original));
+    ASSERT_OK_AND_ASSIGN(
         module, ParseAndReturnVerifiedModule(module->ToString(
                     HloPrintOptions().set_print_large_constants(true))));
 
     if (proto_round_trip) {
-      TF_ASSERT_OK_AND_ASSIGN(module, HloModule::CreateFromProto(
-                                          module->ToProto(), module->config()));
+      ASSERT_OK_AND_ASSIGN(module, HloModule::CreateFromProto(
+                                       module->ToProto(), module->config()));
     }
     if (short_form) {
       EXPECT_EQ(original, module->ToString(HloPrintOptions::ShortParsable()));
@@ -2879,7 +2878,7 @@ INSTANTIATE_TEST_SUITE_P(HloParserTestSuccessInstantiation,
 class HloNonRoundtripParserTest
     : public ::testing::TestWithParam<NonRoundtripTestData> {};
 TEST_P(HloNonRoundtripParserTest, Run) {
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto module, ParseAndReturnVerifiedModule(GetParam().test_name,
                                                 GetParam().input_module_string,
                                                 HloModuleConfig()));
@@ -3020,7 +3019,7 @@ ENTRY %configuration_test() -> s32[] {
   %constant = s32[] constant(42), backend_config="foo bar"
 })";
   auto result = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(result.status());
+  ASSERT_OK(result.status());
   EXPECT_EQ("foo bar", result.value()
                            ->entry_computation()
                            ->root_instruction()
@@ -3481,7 +3480,7 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
 })";
 
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   auto program_layout = module.value()->entry_computation_layout();
   ASSERT_EQ(program_layout.parameter_count(), 1);
   auto param_layout = program_layout.parameter_layout(0).layout();
@@ -3516,7 +3515,7 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
                                      HloParserOptions()
                                          .set_fill_missing_layouts(false)
                                          .set_keep_module_auto_layouts(false));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   // Do not set the default layout.
   EXPECT_FALSE(module.value()->entry_computation_layout().AnyLayoutSet());
 }
@@ -3542,7 +3541,7 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
                                      HloParserOptions()
                                          .set_fill_missing_layouts(true)
                                          .set_keep_module_auto_layouts(true));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   // Do not set the default layout.
   EXPECT_FALSE(module.value()->entry_computation_layout().AnyLayoutSet());
 }
@@ -3568,7 +3567,7 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
                                      HloParserOptions()
                                          .set_fill_missing_layouts(true)
                                          .set_keep_module_auto_layouts(false));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_THAT(module.value()
                   ->entry_computation_layout()
                   .parameter_layout(0)
@@ -3598,7 +3597,7 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
                                      HloParserOptions()
                                          .set_fill_missing_layouts(true)
                                          .set_keep_module_auto_layouts(false));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_THAT(module.value()
                   ->entry_computation_layout()
                   .parameter_layout(0)
@@ -3620,7 +3619,7 @@ ENTRY main {
   absl::StatusOr<std::unique_ptr<HloModule>> module =
       ParseAndReturnUnverifiedModule(
           original, {}, HloParserOptions().set_fill_missing_layouts(false));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_FALSE(module.value()
                    ->entry_computation()
                    ->root_instruction()
@@ -3641,7 +3640,7 @@ ENTRY main {
   absl::StatusOr<std::unique_ptr<HloModule>> module =
       ParseAndReturnUnverifiedModule(
           original, {}, HloParserOptions().set_fill_missing_layouts(true));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_THAT(module.value()
                   ->entry_computation()
                   ->root_instruction()
@@ -3664,7 +3663,7 @@ ENTRY main {
   absl::StatusOr<std::unique_ptr<HloModule>> module =
       ParseAndReturnUnverifiedModule(
           original, {}, HloParserOptions().set_fill_missing_layouts(true));
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_THAT(module.value()
                   ->entry_computation()
                   ->root_instruction()
@@ -3683,7 +3682,7 @@ c2 {
   const2 = f32[1]{0} constant({67890})
 })";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_EQ(module.value()->entry_computation()->name(), "c2");
 }
 
@@ -3694,7 +3693,7 @@ ENTRY consts {
   last = f32[1]{0} constant({67890})
 })";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   EXPECT_EQ(module.value()->entry_computation()->root_instruction()->name(),
             "last");
 }
@@ -3713,7 +3712,7 @@ ENTRY /*comment*/ c1 {
 
 )";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
 }
 
 TEST_F(HloParserTest, MultilineComments) {
@@ -3732,7 +3731,7 @@ d
 */
 })";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
 }
 
 TEST_F(HloParserTest, UnterminatedComment) {
@@ -3755,7 +3754,7 @@ ENTRY c1 {
   ROOT const1 = f32[1]{0} constant({12345}) // Something else
 })";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
 }
 
 TEST_F(HloParserTest, SlashSlashCommentMsDosEolFormat) {
@@ -3763,7 +3762,7 @@ TEST_F(HloParserTest, SlashSlashCommentMsDosEolFormat) {
       "HloModule slash_slash_comment:\r\n// Garbage\r\nENTRY c1 {\r\n// Foo "
       "bar\r\nROOT const1 = f32[1]{0} constant({12345}) // Something else\r\n}";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
 }
 
 TEST_F(HloParserTest, SlashSlashCommentMacEolFormat) {
@@ -3771,7 +3770,7 @@ TEST_F(HloParserTest, SlashSlashCommentMacEolFormat) {
       "HloModule slash_slash_comment:\r// Garbage\rENTRY c1 {\r// Foo "
       "bar\rROOT const1 = f32[1]{0} constant({12345}) // Something else\r}";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
 }
 
 TEST_F(HloParserTest, MultipleEntries) {
@@ -3798,7 +3797,7 @@ ENTRY entry {
 }
   )";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   std::unique_ptr<HloModule> parsed_module = std::move(module).value();
   EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {0}),
             ShapeIndex{0});
@@ -3825,7 +3824,7 @@ ENTRY entry {
 }
   )";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   std::unique_ptr<HloModule> parsed_module = std::move(module).value();
   EXPECT_EQ(parsed_module->input_output_alias_config().GetAliasedOutput(0, {0}),
             ShapeIndex({0, 0}));
@@ -3920,7 +3919,7 @@ ENTRY entry {
 }
   )";
   auto module = ParseAndReturnVerifiedModule(original);
-  TF_ASSERT_OK(module.status());
+  ASSERT_OK(module.status());
   std::unique_ptr<HloModule> parsed_module = std::move(module).value();
   EXPECT_TRUE(
       parsed_module->buffer_donor_config().ParameterIsBufferDonor(0, {0}));
@@ -4019,7 +4018,7 @@ ENTRY ReduceR3ToR2 {
   ROOT result = f32[8,16]{1,0} reduce(p0, p1), dimensions={2}, to_apply=add
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(original));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(original));
   ASSERT_NE(module->entry_computation(), nullptr);
   EXPECT_THAT(module->entry_computation()->root_instruction(),
               GmockMatch(m::Reduce()));
@@ -4027,13 +4026,13 @@ ENTRY ReduceR3ToR2 {
 
 TEST_F(HloParserTest, ParseSharding) {
   const std::string original = "{maximal device=42}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
 }
 
 TEST_F(HloParserTest, ParseShardingPartialReplication) {
   const std::string original = "{devices=[2,2]0,1,2,3 last_tile_dim_replicate}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   Array<int64_t> tiling_last_dim_replicated({{0, 1}, {2, 3}});
   EXPECT_EQ(HloSharding::PartialTile(tiling_last_dim_replicated).ToString(),
@@ -4044,7 +4043,7 @@ TEST_F(HloParserTest, ParseShardingSubGroup) {
   const std::string original =
       "{devices=[2,2,2,2]0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 "
       "last_tile_dims={manual, replicated}}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   Array<int64_t> tile_assignment({2, 2, 2, 2});
   tile_assignment.FillIota(0);
@@ -4056,7 +4055,7 @@ TEST_F(HloParserTest, ParseShardingSubGroup) {
 
 TEST_F(HloParserTest, ParseTrivialIotaShardingPartialReplication) {
   const std::string original = "{devices=[2,2]<=[4] last_tile_dim_replicate}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   TileAssignment tiling_last_dim_replicated((absl::Span<const int64_t>){2, 2});
   EXPECT_EQ(HloSharding::PartialTile(tiling_last_dim_replicated).ToString(),
@@ -4066,7 +4065,7 @@ TEST_F(HloParserTest, ParseTrivialIotaShardingPartialReplication) {
 TEST_F(HloParserTest, ParseTrivialIotaShardingSubGroup) {
   const std::string original =
       "{devices=[2,2,2,2]<=[16] last_tile_dims={manual, replicated}}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   TileAssignment tile_assignment({2, 2, 2, 2});
   std::vector<OpSharding::Type> subgroup_types = {OpSharding::MANUAL,
@@ -4078,7 +4077,7 @@ TEST_F(HloParserTest, ParseTrivialIotaShardingSubGroup) {
 TEST_F(HloParserTest, ParseTransposedIotaShardingPartialReplication) {
   const std::string original =
       "{devices=[2,2]<=[2,2]T(1,0) last_tile_dim_replicate}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   TileAssignment tiling_last_dim_replicated({2, 2}, {2, 2}, {1, 0});
   EXPECT_EQ(HloSharding::PartialTile(tiling_last_dim_replicated).ToString(),
@@ -4089,7 +4088,7 @@ TEST_F(HloParserTest, ParseTransposedIotaShardingSubGroup) {
   const std::string original =
       "{devices=[2,2,2,2]<=[2,2,4]T(2,1,0) last_tile_dims={manual, "
       "replicated}}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   TileAssignment tile_assignment({2, 2, 2, 2}, {2, 2, 4}, {2, 1, 0});
   std::vector<OpSharding::Type> subgroup_types = {OpSharding::MANUAL,
@@ -4100,7 +4099,7 @@ TEST_F(HloParserTest, ParseTransposedIotaShardingSubGroup) {
 
 TEST_F(HloParserTest, ParseShardAs) {
   const std::string original = "{manual shard_as 1}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   EXPECT_EQ(
       HloSharding::Manual().SetShardGroup(HloSharding::ShardAs(1)).ToString(),
@@ -4111,7 +4110,7 @@ TEST_F(HloParserTest, ParseShardLike) {
   const std::string original =
       "{devices=[2,2,2,2]<=[16] last_tile_dims={manual, replicated} shard_like "
       "1}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   TileAssignment tile_assignment({2, 2, 2, 2});
   std::vector<OpSharding::Type> subgroup_types = {OpSharding::MANUAL,
@@ -4124,7 +4123,7 @@ TEST_F(HloParserTest, ParseShardLike) {
 
 TEST_F(HloParserTest, ParseUnknownSharding) {
   const std::string original = "{unknown}";
-  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
   EXPECT_EQ(sharding.ToString(), original);
   EXPECT_EQ(HloSharding::Unknown().ToString(), original);
 }
@@ -4132,53 +4131,53 @@ TEST_F(HloParserTest, ParseUnknownSharding) {
 TEST_F(HloParserTest, ParseFrontendAttributes) {
   const std::string original =
       R"({attr_a="test_a",attr_b="b",attr_c={type="s64"},attr_d="a=\"b/c\""})";
-  TF_ASSERT_OK_AND_ASSIGN(FrontendAttributes frontend_attributes,
-                          ParseFrontendAttributes(original));
+  ASSERT_OK_AND_ASSIGN(FrontendAttributes frontend_attributes,
+                       ParseFrontendAttributes(original));
   EXPECT_EQ(FrontendAttributesToString(frontend_attributes), original);
 }
 
 TEST_F(HloParserTest, ParseWindow) {
   Window original = window_util::MakeWindow({1, 2, 3});
-  TF_ASSERT_OK_AND_ASSIGN(Window parsed,
-                          ParseWindow(window_util::ToString(original)));
+  ASSERT_OK_AND_ASSIGN(Window parsed,
+                       ParseWindow(window_util::ToString(original)));
   EXPECT_EQ(window_util::ToString(original), window_util::ToString(parsed));
 }
 
 TEST_F(HloParserTest, ParseConvolutionDimensionNumbers) {
   const std::string original = "b0f_0io->b0f";
-  TF_ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums,
-                          ParseConvolutionDimensionNumbers(original));
+  ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums,
+                       ParseConvolutionDimensionNumbers(original));
   EXPECT_EQ(original, ConvolutionDimensionNumbersToString(dnums));
 }
 
 TEST_F(HloParserTest, ParseConvolutionDimensionNumbersWithUnknownDims) {
   const std::string original = "b0?f_?0?io->?b?0?f";
-  TF_ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums,
-                          ParseConvolutionDimensionNumbers(original));
+  ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums,
+                       ParseConvolutionDimensionNumbers(original));
   EXPECT_EQ(original, ConvolutionDimensionNumbersToString(dnums));
 }
 
 TEST_F(HloParserTest, ParseReplicaGroups) {
   const std::string original = "{{0,1},{2,3}}";
-  TF_ASSERT_OK_AND_ASSIGN(std::vector<ReplicaGroup> replica_groups,
-                          ParseReplicaGroupsOnly(original));
+  ASSERT_OK_AND_ASSIGN(std::vector<ReplicaGroup> replica_groups,
+                       ParseReplicaGroupsOnly(original));
   EXPECT_EQ(original, ReplicaGroupsToString(replica_groups));
 }
 
 TEST_F(HloParserTest, ParsePaddingConfigNoInteriorPadding) {
   const std::string original = "0_1x2_3";
-  TF_ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig(original));
+  ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig(original));
   EXPECT_EQ(original, PaddingConfigToString(dnums));
 }
 
 TEST_F(HloParserTest, ParsePaddingConfigInteriorPadding) {
   const std::string original = "0_1_0x2_3_4";
-  TF_ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig(original));
+  ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig(original));
   EXPECT_EQ(original, PaddingConfigToString(dnums));
 }
 
 TEST_F(HloParserTest, ParsePaddingConfigInteriorPaddingImplicitZeroDim) {
-  TF_ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig("0_1x2_3_4"));
+  ASSERT_OK_AND_ASSIGN(PaddingConfig dnums, ParsePaddingConfig("0_1x2_3_4"));
   // The extra "_0" gets added to the canonical string because the other dim has
   // interior padding.
   EXPECT_EQ("0_1_0x2_3_4", PaddingConfigToString(dnums));
@@ -4198,7 +4197,7 @@ TEST(HloParserSingleOpTest, SingleOp) {
   const std::string text =
       "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0} %broadcast, "
       "f32[2,4]{1,0} %x)";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -4229,17 +4228,55 @@ TEST(HloParserSingleOpTest, SingleOpNoOperandShapesProducesError) {
 TEST(HloParserSingleOpTest, SingleOpNoNames) {
   const std::string text =
       "%multiply = f32[2,4]{1,0} multiply(f32[2,4]{1,0}, f32[2,4]{1,0})";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
               GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
 }
 
+TEST(HloParserSingleOpTest, SkipStackFrameIndex) {
+  const std::string text = R"(HloModule m, entry_computation_layout={()->pred[]}
+
+FileNames
+1 "<embedded module>"
+2 "experimental/module.py"
+3 "yet/another/test.py"
+
+FunctionNames
+1 "main"
+2 "method"
+
+FileLocations
+1 {file_name_id=1 function_name_id=1 line=153 end_line=153 column=2 end_column=31}
+2 {file_name_id=3 function_name_id=2 line=35 end_line=35 column=2 end_column=24}
+3 {file_name_id=2 function_name_id=2 line=83 end_line=83 column=2 end_column=15}
+
+StackFrames
+1 {file_location_id=1 parent_frame_id=1}
+2 {file_location_id=2 parent_frame_id=2}
+
+
+ENTRY %constant_pred () -> pred[] {
+  ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="opname" stack_frame_id=1}
+})";
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  HloPrintOptions options = HloPrintOptions::Canonical();
+  options.set_print_metadata(false);
+  EXPECT_EQ(module->ToString(options),
+            R"(HloModule m, entry_computation_layout={()->pred[]}
+
+ENTRY constant_pred {
+  ROOT tmp_0 = pred[] constant(true)
+}
+
+)");
+}
+
 TEST(HloParserSingleOpTest, CanonicalOp) {
   const std::string text =
       "f32[2,4]{1,0} multiply(f32[2,4]{1,0}, f32[2,4]{1,0})";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -4275,7 +4312,7 @@ TEST(HloParserSingleOpTest, CanonicalOpWithNested) {
   }
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_EQ(
@@ -4300,7 +4337,7 @@ TEST(HloParserSingleOpTest, CanonicalOpIndexedConditionalInlinedBranches) {
 }
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_EQ(
@@ -4318,7 +4355,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested) {
   ROOT %subtract = f32[3,2,1,1]{3,2,1,0} subtract(f32[3,2,1,1]{3,2,1,0} %param_0, f32[3,2,1,1]{3,2,1,0} %broadcast)
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -4365,7 +4402,7 @@ TEST(HloParserSingleOpTest, SingleOpWithNested_NoOperandName) {
 TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) {
   const std::string text =
       R"(%convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), window={size=1}, dim_labels=b0f_0io->b0f)";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(text));
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
@@ -4398,7 +4435,7 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   ASSERT_FALSE(module->has_schedule());
 }
 
@@ -4415,7 +4452,7 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   ASSERT_FALSE(module->has_schedule());
 }
 
@@ -4432,9 +4469,9 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   ASSERT_TRUE(module->has_schedule());
-  TF_ASSERT_OK(module->schedule().Verify());
+  ASSERT_OK(module->schedule().Verify());
   EXPECT_EQ(module->schedule().sequences().size(), 1);
   ASSERT_TRUE(
       module->schedule().is_computation_scheduled(module->entry_computation()));
@@ -4459,9 +4496,9 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
   ROOT %add = f32[2,4]{1,0} add(f32[2,4]{1,0} %multiply, f32[2,4]{1,0} %y)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   ASSERT_TRUE(module->has_schedule());
-  TF_ASSERT_OK(module->schedule().Verify());
+  ASSERT_OK(module->schedule().Verify());
   EXPECT_EQ(module->schedule().sequences().size(), 1);
   ASSERT_TRUE(
       module->schedule().is_computation_scheduled(module->entry_computation()));
@@ -4536,7 +4573,7 @@ ENTRY entry {
   ROOT root = f32[ 1, 2,3, 4, 5]{0, 1, 2,3, 4 } parameter(0)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
 }
 
 TEST_F(HloParserTest, ShapeMismatchInOperand) {
@@ -4557,7 +4594,7 @@ ENTRY %entrycomp (p: f32[2,2]) -> f32[2,2] {
 
 TEST_F(HloParserTest, ParseShapeStringR2F32) {
   std::string shape_string = "f32[123,456]";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShape(F32, {123, 456});
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -4566,7 +4603,7 @@ TEST_F(HloParserTest, ParseShapeStringR2F32) {
 
 TEST_F(HloParserTest, ParseShapeStringUnbounded) {
   std::string shape_string = "f32[?,784]";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected =
       ShapeUtil::MakeShape(F32, {Shape::kUnboundedSize, 784}, {true, false});
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
@@ -4576,7 +4613,7 @@ TEST_F(HloParserTest, ParseShapeStringUnbounded) {
 
 TEST_F(HloParserTest, ParseShapeStringTupleOfArrays) {
   std::string shape_string = "(f32[1572864],s8[5120,1024])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected =
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {1572864}),
                                  ShapeUtil::MakeShape(S8, {5120, 1024})});
@@ -4587,7 +4624,7 @@ TEST_F(HloParserTest, ParseShapeStringTupleOfArrays) {
 
 TEST_F(HloParserTest, ParseShapeStringNestedTuple) {
   std::string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeTupleShape({
       ShapeUtil::MakeShape(F32, {1}),
       ShapeUtil::MakeTupleShape(
@@ -4602,7 +4639,7 @@ TEST_F(HloParserTest, ParseShapeStringNestedTuple) {
 
 TEST_F(HloParserTest, ParseShapeStringWithLayout) {
   std::string shape_string = "f32[123,456]{0,1}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShapeWithDenseLayout(F32, {123, 456}, {0, 1});
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -4612,7 +4649,7 @@ TEST_F(HloParserTest, ParseShapeStringWithLayout) {
 TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
   // One tile.
   std::string shape_string = "f32[123,456]{0,1:T(2,128)}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShapeWithDenseLayout(F32, {123, 456}, {0, 1},
                                                        {Tile({2, 128})});
   EXPECT_EQ(expected, actual)
@@ -4621,7 +4658,7 @@ TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
 
   // Tile with negative dimension size for combining dimensions.
   shape_string = "f32[123,456,789]{0,1,2:T(2, * , 128)}";
-  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
   expected = ShapeUtil::MakeShapeWithDenseLayout(
       F32, {123, 456, 789}, {0, 1, 2},
       {Tile({2, Tile::kCombineDimension, 128})});
@@ -4631,7 +4668,7 @@ TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
 
   // Two tiles.
   shape_string = "bf16[123,456,789]{2,1,0:T(2,*,128)(2,1)}";
-  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
   expected = ShapeUtil::MakeShapeWithDenseLayout(
       BF16, {123, 456, 789}, {2, 1, 0},
       {Tile({2, Tile::kCombineDimension, 128}), Tile({2, 1})});
@@ -4649,7 +4686,7 @@ TEST_F(HloParserTest, ParseShapeStringWithTilingLayout) {
 TEST_F(HloParserTest, ParseShapeStringWithElementSizeInBits) {
   // Tile, element size, and memory space.
   std::string shape_string = "s4[123,456]{1,0:T(2,128)E(4)}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShapeWithDenseLayout(S4, {123, 456}, {1, 0},
                                                        {Tile({2, 128})}, 1, 4);
   EXPECT_EQ(expected, actual)
@@ -4660,7 +4697,7 @@ TEST_F(HloParserTest, ParseShapeStringWithElementSizeInBits) {
 TEST_F(HloParserTest, ParseShapeStringWithMemorySpaceLayout) {
   // Tile, element size, and memory space.
   std::string shape_string = "pred[123,456]{1,0:T(2,128)S(3)}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShapeWithDenseLayout(
       PRED, {123, 456}, {1, 0}, {Tile({2, 128})}, 1, 0, 3);
   EXPECT_EQ(expected, actual)
@@ -4669,7 +4706,7 @@ TEST_F(HloParserTest, ParseShapeStringWithMemorySpaceLayout) {
 
   // Element size and memory space.
   shape_string = "pred[123,456]{1,0:S(3)}";
-  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
   expected = ShapeUtil::MakeShapeWithDenseLayout(PRED, {123, 456}, {1, 0}, {},
                                                  1, 0, 3);
   EXPECT_EQ(expected, actual)
@@ -4678,7 +4715,7 @@ TEST_F(HloParserTest, ParseShapeStringWithMemorySpaceLayout) {
 
   // Memory space only.
   shape_string = "pred[123,456]{1,0:S(3)}";
-  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
   expected = ShapeUtil::MakeShapeWithDenseLayout(PRED, {123, 456}, {1, 0}, {},
                                                  1, 0, 3);
   EXPECT_EQ(expected, actual)
@@ -4689,7 +4726,7 @@ TEST_F(HloParserTest, ParseShapeStringWithMemorySpaceLayout) {
 TEST_F(HloParserTest, ParseShapeStringWithDynamicShapeMetadataPrefix) {
   // Tile, element size, and memory space.
   std::string shape_string = "f32[123,456]{1,0:T(16,128)M(1024)}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShapeWithDenseLayout(F32, {123, 456}, {1, 0},
                                                        {Tile({16, 128})});
   expected.mutable_layout()->set_dynamic_shape_metadata_prefix_bytes(1024);
@@ -4701,7 +4738,7 @@ TEST_F(HloParserTest, ParseShapeStringWithDynamicShapeMetadataPrefix) {
 TEST_F(HloParserTest, ParseShapeStringWithSplitConfigLayout) {
   // Tile, memory space, and split config.
   std::string shape_string = "pred[123,456]{1,0:T(2,128)S(3)SC(1:200)}";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShapeWithDenseLayout(
       PRED, {123, 456}, {1, 0}, {Tile({2, 128})}, 1, 0, 3,
       {SplitConfig(1, {200})});
@@ -4711,7 +4748,7 @@ TEST_F(HloParserTest, ParseShapeStringWithSplitConfigLayout) {
 
   // Memory space and split config.
   shape_string = "pred[123,456]{1,0:S(3)SC(0:10)(1:4,5)}";
-  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
   expected = ShapeUtil::MakeShapeWithDenseLayout(
       PRED, {123, 456}, {1, 0}, {}, 1, 0, 3,
       {SplitConfig(0, {10}), SplitConfig(1, {4, 5})});
@@ -4721,7 +4758,7 @@ TEST_F(HloParserTest, ParseShapeStringWithSplitConfigLayout) {
 
   // Split config only.
   shape_string = "pred[123,456]{1,0:SC(1:50,200)}";
-  TF_ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(actual, ParseShape(shape_string));
   expected = ShapeUtil::MakeShapeWithDenseLayout(
       PRED, {123, 456}, {1, 0}, {}, 1, 0, 0, {SplitConfig(1, {50, 200})});
   EXPECT_EQ(expected, actual)
@@ -4730,7 +4767,7 @@ TEST_F(HloParserTest, ParseShapeStringWithSplitConfigLayout) {
 }
 
 TEST_F(HloParserTest, ParseOpaqueType) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("opaque[]"));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("opaque[]"));
   Shape expected = ShapeUtil::MakeOpaqueShape();
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -4738,7 +4775,7 @@ TEST_F(HloParserTest, ParseOpaqueType) {
 }
 
 TEST_F(HloParserTest, ParseTokenType) {
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("token[]"));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape("token[]"));
   Shape expected = ShapeUtil::MakeTokenShape();
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -4756,7 +4793,7 @@ TEST_F(HloParserTest, ParseInvalidShapeString) {
 
 TEST_F(HloParserTest, ParseDynamicArray) {
   std::string shape_string = "f32[123,<=456]";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeShape(F32, {123, 456}, {false, true});
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -4765,7 +4802,7 @@ TEST_F(HloParserTest, ParseDynamicArray) {
 
 TEST_F(HloParserTest, ParseDynamicTuple) {
   std::string shape_string = "(f32[42], u32[<=123,<=456])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeTupleShape(
       {ShapeUtil::MakeShape(F32, {42}),
        ShapeUtil::MakeShape(U32, {123, 456}, {true, true})});
@@ -4897,7 +4934,7 @@ ENTRY InferUnaryShape {
   ROOT v = abs(a)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
 }
 
 TEST_F(HloParserTest, InferBinaryShape) {
@@ -4908,7 +4945,7 @@ ENTRY InferBinaryShape {
   ROOT sum = add(a, b)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   EXPECT_TRUE(ShapeUtil::Equal(
       module->entry_computation()->ComputeProgramShape().result(),
       ShapeUtil::MakeShapeWithDenseLayout(F32, {2, 10}, {1, 0})));
@@ -4923,7 +4960,7 @@ ENTRY InferTernaryShape {
   ROOT select = select(p, f, t)
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   EXPECT_TRUE(ShapeUtil::Equal(
       module->entry_computation()->ComputeProgramShape().result(),
       ShapeUtil::MakeScalarShape(S32)));
@@ -4950,7 +4987,7 @@ ENTRY InferDotShape {
   ROOT dot = dot(a, b), lhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_batch_dims={1}, rhs_contracting_dims={0}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   EXPECT_TRUE(ShapeUtil::Equal(
       module->entry_computation()->ComputeProgramShape().result(),
       ShapeUtil::MakeShape(F32, {2}, {0})));
@@ -4965,7 +5002,7 @@ ENTRY InferTupleShape () -> s32[2,3] {
   ROOT get = get-tuple-element(tuple), index=1, sharding={maximal device=0}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   EXPECT_TRUE(ShapeUtil::Equal(
       module->entry_computation()->ComputeProgramShape().result(),
       ShapeUtil::MakeShapeWithDenseLayout(S32, {2, 3}, {1, 0})));
@@ -4991,7 +5028,7 @@ ENTRY InferUnaryShape {
   ROOT conditional = conditional(p, a, c), true_computation=Negate, false_computation=Identity
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(text));
   EXPECT_TRUE(ShapeUtil::Equal(
       module->entry_computation()->ComputeProgramShape().result(),
       ShapeUtil::MakeScalarShape(F32)));
@@ -5189,8 +5226,7 @@ TEST_F(HloParserTest, ParseSingleComputation) {
 test {
   ROOT root =  f32[1,64,10,128]{1,0,2,3} parameter(0)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(original));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(original));
   EXPECT_TRUE(module->entry_computation()
                   ->ComputeProgramShape()
                   .parameters()[0]
@@ -5221,8 +5257,7 @@ TEST_F(HloParserTest, ParseSingleEntryComputation) {
 ENTRY test {
   ROOT root =  f32[1,64,10,128]{1,0,2,3} parameter(0)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(original));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(original));
   EXPECT_TRUE(module->entry_computation()
                   ->ComputeProgramShape()
                   .parameters()[0]
@@ -5247,8 +5282,7 @@ comp1 {
 comp2 {
   ROOT root =  f32[1,64,10,128]{1,0,2,3} parameter(0)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(original));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(original));
   EXPECT_TRUE(module->entry_computation()
                   ->ComputeProgramShape()
                   .parameters()[0]
@@ -5273,8 +5307,7 @@ ENTRY comp1 {
 comp2 {
   ROOT root =  f32[1,64,10,128]{3,2,1,0} parameter(0)
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(original));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(original));
   EXPECT_TRUE(module->entry_computation()
                   ->ComputeProgramShape()
                   .parameters()[0]
@@ -5314,9 +5347,8 @@ ENTRY %main {
   ROOT %async-done = s32[1024]{0} async-done(((s32[1024]{0}, s32[256]{0}, s32[]), s32[1024]{0}, u32[]) %async-start), calls=%async_wrapped
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(original));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(original));
+  ASSERT_OK_AND_ASSIGN(
       auto roundtrip_module,
       ParseAndReturnUnverifiedModule(module->ToString(
           HloPrintOptions().set_syntax_sugar_async_ops(true))));
@@ -5698,8 +5730,7 @@ TEST_F(HloParserTest, ReplicaIdWithLayout) {
     ROOT replica-id.18600 = u32[]{:T(128)} replica-id()
   }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
   EXPECT_TRUE(
       module->entry_computation()->root_instruction()->shape().has_layout());
   EXPECT_FALSE(module->entry_computation()
@@ -5720,8 +5751,7 @@ ENTRY %test {
 
 
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
 
   ExpectHasSubstr(module->ToString(HloPrintOptions::ShortParsable()),
                   "origin={{\"v\"}}");
@@ -5734,8 +5764,7 @@ ENTRY %test {
   ROOT op = ((f32[], f32[3]{0}), f32[2,3]) parameter(0),  origin={(({}, {"v2"}), {"v3"})}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
 
   ExpectHasSubstr(module->ToString(HloPrintOptions::ShortParsable()),
                   "origin={(({}, {\"v2\"}), {\"v3\"})}");
@@ -5755,8 +5784,7 @@ ENTRY %test (Arg_0: s32[]) -> s32[] {
   %Arg_0 = s32[] parameter(0), origin={{"Arg_0"}}
   ROOT %pad_add_fusion = s32[] fusion(%Arg_0), kind=kLoop, calls=%fused_computation, origin={{"concatenate"}}
 })";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
 
   auto fusion_inst = static_cast<HloFusionInstruction*>(
       module->entry_computation()->root_instruction());
@@ -5777,8 +5805,7 @@ TEST_F(HloParserTest, TranscendentalAccuracyMode) {
   )";
   ResultAccuracy expected_result_accuracy = ResultAccuracy();
   expected_result_accuracy.set_mode(ResultAccuracy::HIGHEST);
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
   auto* unary = module->entry_computation()->root_instruction();
   EXPECT_THAT(unary->result_accuracy(), EqualsProto(expected_result_accuracy));
 }
@@ -5813,8 +5840,7 @@ TEST_F(HloParserTest, TranscendentalAccuracyRtol) {
   tolerance.set_atol(1.0);  // NOLINT
   tolerance.set_ulps(2);
   *expected_result_accuracy.mutable_tolerance() = tolerance;
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
   auto* unary = module->entry_computation()->root_instruction();
   EXPECT_THAT(unary->result_accuracy(), EqualsProto(expected_result_accuracy));
 }
@@ -5856,8 +5882,7 @@ TEST_F(HloParserTest, TranscendentalAccuracyNoConfig) {
     ROOT %exponential = f32[] exponential(f32[] %exponent)
   }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
   ResultAccuracy default_result_accuracy;
   default_result_accuracy.set_mode(ResultAccuracy::DEFAULT);
   EXPECT_THAT(
@@ -5894,8 +5919,8 @@ TEST_F(HloParserTest,
                 statistics={visualizing_index=1,stat-1=33,stat-2=44}
     ROOT add-done = s32[] add-done(add-start), origin={{"v3"}}
   })";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
-                          ParseAndReturnVerifiedModule(hlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> m,
+                       ParseAndReturnVerifiedModule(hlo));
   // Check the wrapped instruction.
   HloInstruction* wrapped_instr =
       m->entry_computation()->root_instruction()->async_wrapped_instruction();
@@ -5942,8 +5967,7 @@ TEST_F(HloParserTest, ResultAccuracyToProto) {
     ROOT %exponential = f32[] exponential(f32[] %exponent), result_accuracy={tolerance={rtol=0.5, atol=1.0, ulps=2}}
   }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module,
-                          ParseAndReturnUnverifiedModule(hlo_string));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule(hlo_string));
   HloInstruction* exp_hlo_instruction =
       module->entry_computation()->root_instruction();
   HloInstructionProto exp_hlo_inst_proto = exp_hlo_instruction->ToProto();
@@ -5969,7 +5993,7 @@ TEST_F(HloParserTest, ParseBufferMoreThanOneElement) {
 
 TEST_F(HloParserTest, ParseBufferScalar) {
   std::string shape_string = "b(s32[])";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeValidatedBufferShape(S32, {}).value();
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -5978,7 +6002,7 @@ TEST_F(HloParserTest, ParseBufferScalar) {
 
 TEST_F(HloParserTest, ParseBufferArray) {
   std::string shape_string = "b(f32[8,16]{1,0})";
-  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
+  ASSERT_OK_AND_ASSIGN(Shape actual, ParseShape(shape_string));
   Shape expected = ShapeUtil::MakeValidatedBufferShape(F32, {8, 16}).value();
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
       << "expected: " << ShapeUtil::HumanString(expected)
@@ -6006,10 +6030,10 @@ ENTRY entry {
       absl::StrFormat(hlo_template, "mode=cross_replica,");
   const std::string hlo_without_mode = absl::StrFormat(hlo_template, "");
 
-  TF_ASSERT_OK_AND_ASSIGN(auto module_with_mode,
-                          ParseAndReturnVerifiedModule(hlo_with_mode));
-  TF_ASSERT_OK_AND_ASSIGN(auto module_without_mode,
-                          ParseAndReturnVerifiedModule(hlo_without_mode));
+  ASSERT_OK_AND_ASSIGN(auto module_with_mode,
+                       ParseAndReturnVerifiedModule(hlo_with_mode));
+  ASSERT_OK_AND_ASSIGN(auto module_without_mode,
+                       ParseAndReturnVerifiedModule(hlo_without_mode));
   EXPECT_EQ(*module_with_mode->entry_computation(),
             *module_without_mode->entry_computation());
 }

From 09c97bbcf667f5325107c82ff35df9f82a547a1c Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Tue, 16 Dec 2025 03:38:47 -0800
Subject: [PATCH 325/753] [XLA:GPU] Move layout checks into
 GetNormalizedLogicalTransposeShape

Required to fail gracefully in `GetDescriptionForTiledTransposeEmitter` in cl/841759079.

PiperOrigin-RevId: 845187244
---
 .../xla/xla/service/gpu/transforms/BUILD      |  2 +-
 .../transforms/transpose_dimension_grouper.cc | 27 +++---
 third_party/xla/xla/shape_util.cc             | 16 ++--
 third_party/xla/xla/shape_util.h              |  8 +-
 third_party/xla/xla/shape_util_test.cc        | 85 +++++++++++++------
 5 files changed, 87 insertions(+), 51 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index abd291790b3331..9f017b8ad97df5 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -2799,7 +2799,7 @@ cc_library(
         "//xla:util",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
-        "//xla/tsl/platform:statusor",
+        "//xla/tsl/platform:status_macros",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
diff --git a/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc b/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
index 26d9a8f87049c0..de243e47343e98 100644
--- a/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
+++ b/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
@@ -30,12 +30,11 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/layout_util.h"
 #include "xla/permutation_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -46,19 +45,14 @@ class TransposeDimensionGroupVisitor : public DfsHloRewriteVisitor {
  public:
   absl::Status HandleTranspose(HloInstruction* transpose) override {
     VLOG(4) << "Input: " << transpose->ToString();
-    if (!LayoutUtil::IsMonotonicWithDim0Major(transpose->shape().layout()) ||
-        !LayoutUtil::IsMonotonicWithDim0Major(
-            transpose->operand(0)->shape().layout())) {
-      // TransposeDimensionGrouper runs almost immediately after
-      // LayoutNormalization. The passes in between have been verified to not
-      // introduce transposes with non-default layout.
-      return FailedPrecondition(
-          "Layout normalization should have assigned the default layout to "
-          "transpose and its operand");
-    }
     absl::InlinedVector<int64_t, 3> permutation;
-    auto normalized_dims = ShapeUtil::GetNormalizedLogicalTransposeShape(
-        transpose->shape(), transpose->dimensions(), permutation);
+    // TransposeDimensionGrouper runs almost immediately after
+    // LayoutNormalization. The passes in between have been verified to not
+    // introduce transposes with non-default layout.
+    ASSIGN_OR_RETURN(auto normalized_dims,
+                     ShapeUtil::GetNormalizedLogicalTransposeShape(
+                         transpose->operand(0)->shape(), transpose->shape(),
+                         transpose->dimensions(), permutation));
     if (normalized_dims.size() == 1 ||
         normalized_dims == transpose->shape().dimensions()) {
       return absl::OkStatus();
@@ -85,9 +79,8 @@ class TransposeDimensionGroupVisitor : public DfsHloRewriteVisitor {
 absl::StatusOr<bool> TransposeDimensionGrouper::RunImpl(
     HloModule* module,
     const absl::flat_hash_set<absl::string_view>& execution_threads) {
-  TF_ASSIGN_OR_RETURN(
-      bool changed,
-      TransposeDimensionGroupVisitor().RunOnModule(module, execution_threads));
+  ASSIGN_OR_RETURN(bool changed, TransposeDimensionGroupVisitor().RunOnModule(
+                                     module, execution_threads));
   return changed;
 }
 
diff --git a/third_party/xla/xla/shape_util.cc b/third_party/xla/xla/shape_util.cc
index 1fbea10079413b..a9e0f6dee030c2 100644
--- a/third_party/xla/xla/shape_util.cc
+++ b/third_party/xla/xla/shape_util.cc
@@ -2403,19 +2403,25 @@ absl::InlinedVector<int64_t, 3> GetNormalizedTransposeShapeHelper(
 
 }  // namespace
 
-/*static*/ absl::InlinedVector<int64_t, 3>
+/*static*/ absl::StatusOr<absl::InlinedVector<int64_t, 3>>
 ShapeUtil::GetNormalizedLogicalTransposeShape(
-    const Shape& output_shape, absl::Span<int64_t const> dimensions,
+    const Shape& input_shape, const Shape& output_shape,
+    absl::Span<int64_t const> dimensions,
     absl::InlinedVector<int64_t, 3>& permutation) {
+  if (!LayoutUtil::IsMonotonicWithDim0Major(input_shape.layout()) ||
+      !LayoutUtil::IsMonotonicWithDim0Major(output_shape.layout())) {
+    return FailedPrecondition(
+        "Transpose normalization requires monotonic layouts. Layout "
+        "normalization should have assigned the default layout.");
+  }
+
   permutation.clear();
   // Drop degenerate dimensions.
   absl::InlinedVector<int64_t, 3> delta(output_shape.dimensions().size() + 1,
                                         0);
-  auto input_dimensions =
-      Permute(output_shape.dimensions(), InversePermutation(dimensions));
   for (int i = 0; i < output_shape.dimensions().size(); ++i) {
     delta[i + 1] = delta[i];
-    if (input_dimensions[i] == static_cast<int64_t>(1)) {
+    if (input_shape.dimensions(i) == static_cast<int64_t>(1)) {
       ++delta[i + 1];
     }
   }
diff --git a/third_party/xla/xla/shape_util.h b/third_party/xla/xla/shape_util.h
index 12cd8e59bd58c7..545b247fbc2a2c 100644
--- a/third_party/xla/xla/shape_util.h
+++ b/third_party/xla/xla/shape_util.h
@@ -463,8 +463,12 @@ class ShapeUtil {
   // 'dimensions' is set to {3, 1, 0, 2}. This means the corresponding input
   // shape is [10, 1, 11, 32]. The normalized output shape is [32, 110] with
   // 'permutation' set to {1,0}.
-  static absl::InlinedVector<int64_t, 3> GetNormalizedLogicalTransposeShape(
-      const Shape& output_shape, absl::Span<int64_t const> dimensions,
+  // Note: the method fails if the input shape or the output shape has a
+  // non-monotonic layout.
+  static absl::StatusOr<absl::InlinedVector<int64_t, 3>>
+  GetNormalizedLogicalTransposeShape(
+      const Shape& input_shape, const Shape& output_shape,
+      absl::Span<int64_t const> dimensions,
       absl::InlinedVector<int64_t, 3>& permutation);
 
   // Returns an empty tuple shape. Can be used as a sentinel Shape value.
diff --git a/third_party/xla/xla/shape_util_test.cc b/third_party/xla/xla/shape_util_test.cc
index 265e36b839b289..8d3cedb6d1d21a 100644
--- a/third_party/xla/xla/shape_util_test.cc
+++ b/third_party/xla/xla/shape_util_test.cc
@@ -1305,7 +1305,7 @@ TEST(ShapeUtilTest, B_250640044) {
              is_dynamic_dimension: false
            })pb",
       &proto));
-  TF_ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
+  ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
   EXPECT_FALSE(ShapeUtil::ValidateShape(shape).ok());
 }
 
@@ -1339,7 +1339,7 @@ TEST(ShapeUtilTest, B_251055887) {
           physical_shape { element_type: -562 }
         })pb",
       &proto));
-  TF_ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
+  ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
   EXPECT_FALSE(ShapeUtil::ValidateShape(shape).ok());
 }
 
@@ -1350,14 +1350,14 @@ TEST(ShapeUtilTest, B_385192799) {
   {
     EXPECT_TRUE(tsl::protobuf::TextFormat::ParseFromString(
         R"pb(element_type: 2000)pb", &proto));
-    TF_ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
+    ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
     EXPECT_FALSE(ShapeUtil::ValidateShape(shape).ok());
   }
 
   {
     EXPECT_TRUE(tsl::protobuf::TextFormat::ParseFromString(
         R"pb(element_type: -1)pb", &proto));
-    TF_ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
+    ASSERT_OK_AND_ASSIGN(Shape shape, Shape::FromProto(proto));
     EXPECT_FALSE(ShapeUtil::ValidateShape(shape).ok());
   }
 }
@@ -1781,10 +1781,12 @@ BENCHMARK(BM_ForEachIndexNoStatus)->Arg(0)->Arg(1)->Arg(2);
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {32, 1, 10, 11});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {10, 1, 11, 32});
   absl::InlinedVector<int64_t, 3> dimensions = {3, 1, 0, 2};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(32, 110));
   EXPECT_THAT(permutation, ElementsAre(1, 0));
@@ -1792,10 +1794,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape2) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {20, 30, 50});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {50, 20, 30});
   absl::InlinedVector<int64_t, 3> dimensions = {1, 2, 0};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(600, 50));
   EXPECT_THAT(permutation, ElementsAre(1, 0));
@@ -1803,10 +1807,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape2) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NoTranspose) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {64, 1, 128});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {64, 128, 1});
   absl::InlinedVector<int64_t, 3> dimensions = {0, 2, 1};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(8192));
   EXPECT_THAT(permutation, IsEmpty());
@@ -1814,10 +1820,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NoTranspose) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple2D) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {64, 128});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {128, 64});
   absl::InlinedVector<int64_t, 3> dimensions = {1, 0};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(64, 128));
   EXPECT_THAT(permutation, ElementsAre(1, 0));
@@ -1825,10 +1833,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple2D) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple3D_021) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 32768});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {8, 32768, 16});
   absl::InlinedVector<int64_t, 3> dimensions = {0, 2, 1};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(8, 16, 32768));
   EXPECT_THAT(permutation, ElementsAre(0, 2, 1));
@@ -1836,10 +1846,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple3D_021) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple3D_210) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {16, 32768, 8});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {8, 32768, 16});
   absl::InlinedVector<int64_t, 3> dimensions = {2, 1, 0};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(16, 32768, 8));
   EXPECT_THAT(permutation, ElementsAre(2, 1, 0));
@@ -1847,10 +1859,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple3D_210) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple4D) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {16, 32768, 8, 4});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {32768, 4, 16, 8});
   absl::InlinedVector<int64_t, 3> dimensions = {2, 0, 3, 1};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(16, 32768, 8, 4));
   EXPECT_THAT(permutation, ElementsAre(2, 0, 3, 1));
@@ -1858,10 +1872,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple4D) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NormalizeTo3D) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 32, 32, 32});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {8, 32, 32, 32, 16});
   absl::InlinedVector<int64_t, 3> dimensions = {0, 4, 1, 2, 3};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(8, 16, 32768));
   EXPECT_THAT(permutation, ElementsAre(0, 2, 1));
@@ -1869,10 +1885,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NormalizeTo3D) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_LargeShapeSizeOverflow) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {16, 4096, 4096, 128});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {4096, 4096, 128, 16});
   absl::InlinedVector<int64_t, 3> dimensions = {3, 0, 1, 2};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(16, 2147483648));
   EXPECT_THAT(permutation, ElementsAre(1, 0));
@@ -1880,10 +1898,12 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_LargeShapeSizeOverflow) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_DegenerateDims) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {1, 32, 1, 64, 1, 3, 1});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 32, 1, 3, 1, 64, 1});
   absl::InlinedVector<int64_t, 3> dimensions = {6, 1, 4, 5, 2, 3, 0};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(32, 64, 3));
   EXPECT_THAT(permutation, ElementsAre(0, 2, 1));
@@ -1891,14 +1911,27 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_DegenerateDims) {
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_TransposeWithGrouping) {
   Shape output_shape = ShapeUtil::MakeShape(F32, {10, 1, 32, 100, 2});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {100, 1, 10, 32, 2});
   absl::InlinedVector<int64_t, 3> dimensions = {2, 1, 3, 0, 4};
   absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_shape = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      output_shape, dimensions, permutation);
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(320, 100, 2));
   EXPECT_THAT(permutation, ElementsAre(1, 0, 2));
 }
 
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_InvalidLayout) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {32, 10});
+  *output_shape.mutable_layout() = LayoutUtil::MakeLayout({0, 1});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {10, 32});
+  absl::InlinedVector<int64_t, 3> dimensions = {1, 0};
+  absl::InlinedVector<int64_t, 3> permutation;
+  EXPECT_FALSE(ShapeUtil::GetNormalizedLogicalTransposeShape(
+                   input_shape, output_shape, dimensions, permutation)
+                   .ok());
+}
+
 }  // namespace
 }  // namespace xla

From eed469c33ed2d07752573cc0b3c6c096a4506944 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Tue, 16 Dec 2025 03:53:11 -0800
Subject: [PATCH 326/753] Remove undefined support of tuple shardings from
 `NumTiles()` and `TiledDataRank()` methods. These methods should only be
 defined for non-tuple shardings, when sharding is tuple `tile_assignment_` is
 not populated therefore these would be returning undefined results.

PiperOrigin-RevId: 845191404
---
 third_party/xla/xla/hlo/ir/hlo_sharding.cc         | 13 +------------
 third_party/xla/xla/hlo/ir/hlo_sharding.h          | 14 ++------------
 third_party/xla/xla/hlo/utils/hlo_sharding_util.cc |  7 +++++--
 3 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_sharding.cc b/third_party/xla/xla/hlo/ir/hlo_sharding.cc
index b6376ad4f88aca..e958a1fc966bf8 100644
--- a/third_party/xla/xla/hlo/ir/hlo_sharding.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_sharding.cc
@@ -1099,23 +1099,12 @@ int64_t HloSharding::TotalNumTiles() const {
 }
 
 int64_t HloSharding::NumTiles() const {
-  if (IsTileMaximal()) {
-    return 1;
-  }
-  CHECK(!IsManual());
-  CHECK(!IsUnknown());
-  return Product(absl::Span<const int64_t>(tile_assignment_.dimensions())
-                     .subspan(0, TiledDataRank()));
-}
-
-int64_t HloSharding::NumTilesLeaf() const {
-  DCHECK(!IsTuple());
   if (IsTileMaximalLeaf()) {
     return 1;
   }
   CHECK(!IsManualLeaf() && !IsUnknownLeaf());
   return Product(absl::Span<const int64_t>(tile_assignment_.dimensions())
-                     .subspan(0, TiledDataRankLeaf()));
+                     .subspan(0, TiledDataRank()));
 }
 
 int64_t HloSharding::NumTiles(absl::Span<const int64_t> dims) const {
diff --git a/third_party/xla/xla/hlo/ir/hlo_sharding.h b/third_party/xla/xla/hlo/ir/hlo_sharding.h
index 02673f27e7ff1e..93a58a4cdfb4d6 100644
--- a/third_party/xla/xla/hlo/ir/hlo_sharding.h
+++ b/third_party/xla/xla/hlo/ir/hlo_sharding.h
@@ -545,9 +545,8 @@ class HloSharding {
   // Gets the total number of tiles including subgroups and partial replication.
   int64_t TotalNumTiles() const;
   // Gets the number of tiles. If it has partial replication, this will not
-  // equal the device count.
+  // equal the device count. This method is not defined for tuple shardings.
   int64_t NumTiles() const;
-  int64_t NumTilesLeaf() const;
   // Like NumTiles() but considers only some specific dimensions passed as
   // argument
   int64_t NumTiles(absl::Span<const int64_t> dims) const;
@@ -587,17 +586,8 @@ class HloSharding {
   }
 
   // Returns the data rank for tiled sharding. It doesn't include subgroup dims.
+  // This method is not defined for tuple shardings.
   int64_t TiledDataRank() const {
-    CHECK(IsTiled());
-    int64_t rank = tile_assignment_.num_dimensions();
-    if (ReplicateOnLastTileDim()) {
-      rank--;
-    }
-    rank -= subgroup_types_.size();
-    return rank;
-  }
-  int64_t TiledDataRankLeaf() const {
-    DCHECK(!IsTuple());
     CHECK(IsTiledLeaf());
     int64_t rank = tile_assignment_.num_dimensions();
     if (ReplicateOnLastTileDim()) {
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index 5bf1e9cdb42b23..a2ce251f7ab382 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -361,7 +361,7 @@ static bool IsLeafShardingMoreSpecific(const HloSharding& lhs,
     return false;
   }
   if (!rhs.IsTileMaximalLeaf()) {
-    return lhs.NumTilesLeaf() > rhs.NumTilesLeaf();
+    return lhs.NumTiles() > rhs.NumTiles();
   }
   // If we are not replicated then only tiled (not tile maximal) shardings
   // can improve us.
@@ -2864,7 +2864,10 @@ std::optional<HloSharding> ReturnImprovedShardingImpl(
   if (from.IsManual()) {
     return std::nullopt;
   }
-  int64_t sharding_tiles = from.NumTiles();
+  int64_t sharding_tiles;
+  if (!from.IsTuple()) {
+    sharding_tiles = from.NumTiles();
+  }
   if (MergeSharding(*to_improved, &from, may_combine_partial_sharding)) {
     // Override existing tiled sharding only when the new sharding is compatible
     // with the existing one. This avoids unexpected resharding when `sharding`

From 2c44d6471b5c7432b526517e96d156bcc8ae948f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 04:00:34 -0800
Subject: [PATCH 327/753] Automated Code Change

PiperOrigin-RevId: 845193499
---
 third_party/xla/xla/backends/cpu/onednn_emitter.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/third_party/xla/xla/backends/cpu/onednn_emitter.cc b/third_party/xla/xla/backends/cpu/onednn_emitter.cc
index 0ea1f6a916ffc5..0e51804ff92743 100644
--- a/third_party/xla/xla/backends/cpu/onednn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/onednn_emitter.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
+#include <utility>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
@@ -37,6 +38,7 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla::cpu {
 

From c8951b02a3b5598ad98cb6c35605c680e277056f Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Tue, 16 Dec 2025 04:04:51 -0800
Subject: [PATCH 328/753] Factorize XNNPack delegate macros in a common header.

PiperOrigin-RevId: 845195313
---
 tensorflow/lite/delegates/xnnpack/BUILD       | 14 +++++-
 tensorflow/lite/delegates/xnnpack/macros.h    | 48 +++++++++++++++++++
 .../lite/delegates/xnnpack/mmap_handle.cc     | 26 ++++------
 .../lite/delegates/xnnpack/weight_cache.cc    | 36 +++++---------
 4 files changed, 80 insertions(+), 44 deletions(-)
 create mode 100644 tensorflow/lite/delegates/xnnpack/macros.h

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index a2715c519798a5..10d401078d0fae 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -333,6 +333,16 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "macros",
+    hdrs = ["macros.h"],
+    compatible_with = get_compatible_with_portable(),
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/lite:minimal_logging",
+    ],
+)
+
 flatbuffer_cc_library(
     name = "weight_cache_schema",
     srcs = ["weight_cache_schema.fbs"],
@@ -350,8 +360,10 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         ":file_util",
+        ":macros",
         ":mmap_handle",
         ":weight_cache_schema",
+        "//tensorflow/lite:logger",
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite/c:common",
         "@XNNPACK",
@@ -387,8 +399,8 @@ cc_library(
     compatible_with = get_compatible_with_portable(),
     deps = [
         ":file_util",
+        ":macros",
         ":windows_util",
-        "//tensorflow/lite:minimal_logging",
     ],
 )
 
diff --git a/tensorflow/lite/delegates/xnnpack/macros.h b/tensorflow/lite/delegates/xnnpack/macros.h
new file mode 100644
index 00000000000000..ef2218ec621107
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/macros.h
@@ -0,0 +1,48 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_MACROS_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_MACROS_H_
+
+#include <cstdio>
+
+#include "tensorflow/lite/minimal_logging.h"
+
+#define XNNPACK_LOG_LIMIT 4048
+
+#define XNNPACK_ABORT_CHECK(TEST, ...)                                   \
+  if (!(TEST)) {                                                         \
+    char msg[XNNPACK_LOG_LIMIT] = {0};                                   \
+    int bytes =                                                          \
+        snprintf(msg, XNNPACK_LOG_LIMIT, "%s:%d: ", __FILE__, __LINE__); \
+    snprintf(msg + bytes, XNNPACK_LOG_LIMIT - bytes, "" __VA_ARGS__);    \
+    TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, msg);                      \
+    std::abort();                                                        \
+  }
+
+#define XNNPACK_VAR_ARG_HEAD(FIRST, ...) FIRST
+
+#define XNNPACK_RETURN_CHECK(TEST, ...)                                    \
+  if (!(TEST)) {                                                           \
+    if (sizeof(XNNPACK_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) {       \
+      char msg[XNNPACK_LOG_LIMIT] = {0};                                   \
+      int bytes =                                                          \
+          snprintf(msg, XNNPACK_LOG_LIMIT, "%s:%d: ", __FILE__, __LINE__); \
+      snprintf(msg + bytes, XNNPACK_LOG_LIMIT - bytes, "" __VA_ARGS__);    \
+      TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, msg);                      \
+    }                                                                      \
+    return false;                                                          \
+  }
+
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_MACROS_H_
diff --git a/tensorflow/lite/delegates/xnnpack/mmap_handle.cc b/tensorflow/lite/delegates/xnnpack/mmap_handle.cc
index 169e284de47f46..92caf07fac811e 100644
--- a/tensorflow/lite/delegates/xnnpack/mmap_handle.cc
+++ b/tensorflow/lite/delegates/xnnpack/mmap_handle.cc
@@ -32,20 +32,8 @@ limitations under the License.
 #include <cstring>
 
 #include "tensorflow/lite/delegates/xnnpack/file_util.h"
+#include "tensorflow/lite/delegates/xnnpack/macros.h"
 #include "tensorflow/lite/delegates/xnnpack/windows_util.h"
-#include "tensorflow/lite/logger.h"
-#include "tensorflow/lite/minimal_logging.h"
-
-#define XNNPACK_VAR_ARG_HEAD(FIRST, ...) FIRST
-
-#define XNNPACK_RETURN_CHECK(TEST, ...)                              \
-  if (!(TEST)) {                                                     \
-    if (sizeof(XNNPACK_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \
-      TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR,                      \
-                      "XNNPack weight cache: " __VA_ARGS__);         \
-    }                                                                \
-    return false;                                                    \
-  }
 
 namespace tflite::xnnpack {
 
@@ -100,9 +88,10 @@ bool MMapHandle::Map(const FileDescriptorView& fd, const size_t offset,
                        safe_path, strerror(errno));
 #else
   struct stat file_stats;
-  XNNPACK_RETURN_CHECK(fstat(fd.Value(), &file_stats) == 0,
-                       "could not access file stats to get size ('%s'): %s.",
-                       safe_path, strerror(errno));
+  XNNPACK_RETURN_CHECK(
+      fstat(fd.Value(), &file_stats) == 0,
+      "could not access file descriptor %d stats to get size ('%s'): %s.",
+      fd.Value(), safe_path, strerror(errno));
 #endif
 
   // This will reset data_ and size_ on return until it is deactivated.
@@ -149,8 +138,9 @@ bool MMapHandle::Map(const FileDescriptorView& fd, const size_t offset,
   data_ = static_cast<uint8_t*>(
       mmap(/*addr=*/nullptr, size_ + offset_page_adjustment_, PROT_READ,
            MAP_SHARED, fd.Value(), offset_ - offset_page_adjustment_));
-  XNNPACK_RETURN_CHECK(data_ != MAP_FAILED, "could not mmap file (%s): %s.",
-                       safe_path, strerror(errno));
+  XNNPACK_RETURN_CHECK(data_ != MAP_FAILED,
+                       "could not mmap file descriptor %d (%s): %s.",
+                       fd.Value(), safe_path, strerror(errno));
 #endif
   unmap_on_error.Deactivate();
   return true;
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index 94b0b76ab4b4cd..516a2ebbdeda9f 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -15,6 +15,9 @@ limitations under the License.
 #include "tensorflow/lite/delegates/xnnpack/weight_cache.h"
 
 #include <fcntl.h>
+
+#include "tensorflow/lite/logger.h"
+#include "tensorflow/lite/minimal_logging.h"
 #if defined(_MSC_VER)
 #include <io.h>
 #define F_OK 0
@@ -23,6 +26,7 @@ limitations under the License.
 #endif
 
 #include <cerrno>  // IWYU pragma: keep
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
@@ -38,27 +42,9 @@ limitations under the License.
 #include "flatbuffers/verifier.h"  // from @flatbuffers
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/delegates/xnnpack/file_util.h"
+#include "tensorflow/lite/delegates/xnnpack/macros.h"
 #include "tensorflow/lite/delegates/xnnpack/mmap_handle.h"
 #include "tensorflow/lite/delegates/xnnpack/weight_cache_schema_generated.h"
-#include "tensorflow/lite/logger.h"
-#include "tensorflow/lite/minimal_logging.h"
-
-#define XNNPACK_ABORT_CHECK(TEST, ...)                      \
-  if (!(TEST)) {                                            \
-    TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR, __VA_ARGS__); \
-    std::abort();                                           \
-  }
-
-#define XNNPACK_VAR_ARG_HEAD(FIRST, ...) FIRST
-
-#define XNNPACK_RETURN_CHECK(TEST, ...)                              \
-  if (!(TEST)) {                                                     \
-    if (sizeof(XNNPACK_VAR_ARG_HEAD("" __VA_ARGS__)) > sizeof("")) { \
-      TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR,                      \
-                      "XNNPack weight cache: " __VA_ARGS__);         \
-    }                                                                \
-    return false;                                                    \
-  }
 
 namespace tflite::xnnpack {
 
@@ -409,8 +395,8 @@ bool MMapWeightCacheProvider::Load() {
   }();
 
   XNNPACK_RETURN_CHECK(header.version == XNNPackCacheHeader::kVersion,
-                       "incompatible header version. Got %zd, expected %zd. "
-                       "Cache needs to be built again.",
+                       "incompatible header version. Got %" PRIu64
+                       ", expected %" PRIu64 ". Cache needs to be built again.",
                        header.version, XNNPackCacheHeader::kVersion);
 
   XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
@@ -695,10 +681,10 @@ bool IsCompatibleCacheFile(const FileDescriptor& fd) {
   XNNPackCacheHeader header;
   XNNPACK_RETURN_CHECK(fd.Read(&header, sizeof(header)),
                        "Couldn't read file header.");
-  XNNPACK_RETURN_CHECK(
-      header.version == XNNPackCacheHeader::kVersion,
-      "Cache header version is incompatible. Expected %llu, got %llu.",
-      XNNPackCacheHeader::kVersion, header.version);
+  XNNPACK_RETURN_CHECK(header.version == XNNPackCacheHeader::kVersion,
+                       "Cache header version is incompatible. Expected %" PRIu64
+                       ", got %" PRIu64 ".",
+                       XNNPackCacheHeader::kVersion, header.version);
   XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
                            header.xnnpack_build_identifier,
                            sizeof(header.xnnpack_build_identifier)),

From 293e0043b194416623b6480f58d912414c137f44 Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Tue, 16 Dec 2025 04:06:55 -0800
Subject: [PATCH 329/753] [XLA:GPU] DotMerger refactoring: Compare dots with
 the same queue_id only.

To do so, use the queue_id as part of the key of equivalence classes. This way, dots on different stream (with different queue_ids) do not need to be considered pair-wise for merging. This replaces the previous pairwise `can_merge` function.

PiperOrigin-RevId: 845196209
---
 .../hlo/transforms/simplifiers/dot_merger.cc  | 32 +++++++++++--------
 .../hlo/transforms/simplifiers/dot_merger.h   | 13 ++++----
 .../transforms/simplifiers/dot_merger_test.cc | 15 +++++----
 .../xla/xla/service/gpu/gpu_compiler.cc       | 11 +++----
 4 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
index dc55e09dbdb953..be61f6be2a96c6 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
@@ -415,10 +415,9 @@ absl::StatusOr<HloInstruction*> TryMergeOperand(HloInstruction* a,
   return TryMergeLHSWithRHSOperand(b, a);
 }
 
-absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
-                               std::function<bool(const HloInstruction* dot_a,
-                                                  const HloInstruction* dot_b)>
-                                   can_merge) {
+absl::StatusOr<bool> MergeDots(
+    HloComputation* comp, int64_t max_size_to_merge,
+    std::function<int64_t(const HloInstruction* dot)> queue_id) {
   auto is_merge_candidate = [&](HloInstruction* instr) {
     int64_t bytes = ShapeUtil::ByteSizeOfElements(instr->shape());
     for (const HloInstruction* operand : instr->operands()) {
@@ -429,13 +428,17 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
 
   // Collect equivalence classes.  Specifically, create the map
   //
-  //   instruction -> [canonical dots that use the instruction].
+  //   instruction, queue_id -> [canonical dots that use the instruction].
+  //
+  // queue_id is backend-specific. Dots with different queue_ids may run
+  // concurrently on different streams and will not be merged.
   //
   // We'll then try to merge dots within each equivalence class.  A dot will be
   // a member of two equivalence classes (because it has two operands), but if
   // it's merged with a dot from one equivalence class, it won't also be merged
   // in another class.
-  absl::flat_hash_map<HloInstruction*, absl::flat_hash_set<HloInstruction*>>
+  absl::flat_hash_map<std::pair<HloInstruction*, int64_t>,
+                      absl::flat_hash_set<HloInstruction*>>
       equivalence_classes;
   for (HloInstruction* instr : comp->instructions()) {
     // Cowardly skip instructions with control dependencies.
@@ -445,11 +448,12 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
       continue;
     }
     for (HloInstruction* operand : instr->operands()) {
-      equivalence_classes[operand].insert(instr);
+      equivalence_classes[{operand, queue_id(instr)}].insert(instr);
       // DotDecomposer inserts transposes to establish a normal form. Transposed
       // operands still count as equivalent.
       if (operand->opcode() == HloOpcode::kTranspose) {
-        equivalence_classes[operand->mutable_operand(0)].insert(instr);
+        equivalence_classes[{operand->mutable_operand(0), queue_id(instr)}]
+            .insert(instr);
       }
     }
   }
@@ -462,7 +466,7 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
   //    us to merge.)
   absl::erase_if(
       equivalence_classes,
-      [&](const std::pair<const HloInstruction*,
+      [&](const std::pair<std::pair<const HloInstruction*, int64_t>,
                           absl::flat_hash_set<HloInstruction*>>& kv) {
         const auto& v = kv.second;
         return v.size() < 2 || absl::c_none_of(v, is_merge_candidate);
@@ -508,13 +512,14 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
   // them earlier because removing an instruction deletes it; we'd then have
   // dangling pointers in our hashtable!)
   absl::flat_hash_set<HloInstruction*> dead_instrs;
-  std::vector<HloInstruction*> keys;
+  std::vector<std::pair<HloInstruction*, int64_t>> keys;
   keys.reserve(equivalence_classes.size());
   for (auto& kv : equivalence_classes) {
     keys.push_back(kv.first);
   }
-  absl::c_sort(keys, [](const HloInstruction* a, const HloInstruction* b) {
-    return a->unique_id() < b->unique_id();
+  absl::c_sort(keys, [](std::pair<const HloInstruction*, int64_t> a,
+                        std::pair<const HloInstruction*, int64_t> b) {
+    return a.first->unique_id() < b.first->unique_id();
   });
   for (auto key : keys) {
     const auto& values = equivalence_classes[key];
@@ -540,7 +545,6 @@ absl::StatusOr<bool> MergeDots(HloComputation* comp, int64_t max_size_to_merge,
 
         if (dead_instrs.contains(a) || dead_instrs.contains(b) ||
             (!is_merge_candidate(a) && !is_merge_candidate(b)) ||
-            !can_merge(a, b) ||
             // Perform reachability checks last since they can be expensive.
             graph.IsReachableNonConst(a_id, b_id) ||
             graph.IsReachableNonConst(b_id, a_id)) {
@@ -599,7 +603,7 @@ absl::StatusOr<bool> DotMerger::RunImpl(
   for (HloComputation* comp :
        module->MakeNonfusionComputations(execution_threads)) {
     TF_ASSIGN_OR_RETURN(bool changed_computation,
-                        MergeDots(comp, max_size_to_merge_, can_merge_));
+                        MergeDots(comp, max_size_to_merge_, queue_id_));
     changed |= changed_computation;
   }
   return changed;
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.h b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.h
index c89ba6153f2feb..e0da3f4f1952b8 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.h
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
 
@@ -59,10 +60,9 @@ class DotMerger : public HloModulePass {
  public:
   explicit DotMerger(
       int64_t max_size_to_merge,
-      std::function<bool(const HloInstruction* a, const HloInstruction* b)>
-          can_merge = [](const HloInstruction* dot_a,
-                         const HloInstruction* dot_b) -> bool { return true; })
-      : max_size_to_merge_(max_size_to_merge), can_merge_(can_merge) {}
+      std::function<int64_t(const HloInstruction* dot)> queue_id =
+          [](const HloInstruction* dot) -> int64_t { return 0; })
+      : max_size_to_merge_(max_size_to_merge), queue_id_(queue_id) {}
 
   absl::string_view name() const override { return "dot-merger"; }
 
@@ -73,9 +73,8 @@ class DotMerger : public HloModulePass {
 
  private:
   int64_t max_size_to_merge_;
-  // Predicate function for backend-specific compatibility check.
-  std::function<bool(const HloInstruction* dot_a, const HloInstruction* dot_b)>
-      can_merge_;
+  // Predicate function for backend-specific operation queue mapping.
+  std::function<int64_t(const HloInstruction* dot)> queue_id_;
 };
 
 }  // namespace xla
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger_test.cc b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger_test.cc
index 3e6d3e109c6196..73ba972f33f3cc 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger_test.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger_test.cc
@@ -831,18 +831,21 @@ TEST_F(DotMergerTest, NoMergeWithFalseCompatibility) {
     lhs1 = f32[2,4,300,200] parameter(1)
     rhs  = f32[2,4,200, 50] parameter(2)
     dot0 = f32[2,4,100, 50] dot(lhs0, rhs), lhs_batch_dims={0,1}, rhs_batch_dims={0,1},
-                                            lhs_contracting_dims={3}, rhs_contracting_dims={2}
+        lhs_contracting_dims={3}, rhs_contracting_dims={2}, backend_config={"operation_queue_id":"0"}
     dot1 = f32[2,4,300, 50] dot(lhs1, rhs), lhs_batch_dims={0,1}, rhs_batch_dims={0,1},
-                                            lhs_contracting_dims={3}, rhs_contracting_dims={2}
+        lhs_contracting_dims={3}, rhs_contracting_dims={2}, backend_config={"operation_queue_id":"1"}
     ROOT tuple = (f32[2,4,100,50], f32[2,4,300,50]) tuple(dot0, dot1)
   })";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(module_string));
-  std::function<bool(const HloInstruction* dot_a, const HloInstruction* dot_b)>
-      can_merge = [&](const HloInstruction* dot_a,
-                      const HloInstruction* dot_b) -> bool { return false; };
+  std::function<int64_t(const HloInstruction* dot)> queue_id =
+      [&](const HloInstruction* dot) -> int64_t {
+    // The queue_id will typically be taken from the backend_config, but deps on
+    // backend-specific protos is avoided for testing.
+    return dot->name() == "dot1" ? 1 : 0;
+  };
   DotMerger pass(/*max_size_to_merge=*/std::numeric_limits<int64_t>::max(),
-                 can_merge);
+                 queue_id);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_FALSE(changed);
 }
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index dd90b654eace23..f864daf9e86df2 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -757,18 +757,15 @@ absl::Status RunOptimizationPasses(
     // Only merge "smallish" dots.  This threshold defaults to 32MB today, with
     // a flag to override.
     // Do not merge dots when they are assigned different stream ids.
-    std::function<bool(const HloInstruction* dot_a,
-                       const HloInstruction* dot_b)>
-        can_merge = [&](const HloInstruction* dot_a,
-                        const HloInstruction* dot_b) -> bool {
-      return dot_a->backend_config<GpuBackendConfig>()->operation_queue_id() ==
-             dot_b->backend_config<GpuBackendConfig>()->operation_queue_id();
+    std::function<int64_t(const HloInstruction* dot)> queue_id =
+        [&](const HloInstruction* dot) -> int64_t {
+      return dot->backend_config<GpuBackendConfig>()->operation_queue_id();
     };
     pipeline.AddPass<DotMerger>(
         /*max_size_to_merge=*/int64_t{debug_options
                                           .xla_gpu_dot_merger_threshold_mb()}
             << 20,
-        can_merge);
+        queue_id);
     pipeline.AddPass<SortSimplifier>();
     pipeline.AddPass<TupleSimplifier>();
     pipeline.AddPass<WhileLoopConstantSinking>();

From b47342de0a5911fee447c7cbadd6420e3072fc6d Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Tue, 16 Dec 2025 04:10:50 -0800
Subject: [PATCH 330/753] [XLA:GPU] Remove RaggedAllToAll from command buffer
 conversion pass.

RaggedAllToAll is not yet supported by command buffers. Added a test to verify that RaggedAllToAll executes correctly when command buffers are enabled for collectives.

PiperOrigin-RevId: 845197490
---
 .../runtime/command_buffer_conversion_pass.cc |  1 -
 .../xla/tests/ragged_all_to_all_e2e_test.cc   | 48 +++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass.cc
index f6ef881b0966e3..e25b81c61bfbfe 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_conversion_pass.cc
@@ -144,7 +144,6 @@ std::optional<DebugOptions::CommandBufferCmdType> GetCommandBufferCmdType(
     case Thunk::kAllToAllStart:
     case Thunk::kCollectiveBroadcastStart:
     case Thunk::kCollectivePermuteStart:
-    case Thunk::kRaggedAllToAllStart:
     case Thunk::kRecv:
     case Thunk::kSend:
       return DebugOptions::COLLECTIVES;
diff --git a/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc b/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
index 257d2b9c2625b2..d229e0b0fa4745 100644
--- a/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
+++ b/third_party/xla/xla/tests/ragged_all_to_all_e2e_test.cc
@@ -361,6 +361,54 @@ TEST_P(RaggedAllToAllTest, RaggedAllToAll_2GPUs) {
   EXPECT_TRUE(LiteralTestUtil::Equal(expected_outputs_[1], results[1]));
 }
 
+TEST_P(RaggedAllToAllTest, RaggedAllToAll_2GPUs_CommandBuffer) {
+  absl::string_view kModuleReplicatedStr = R"(
+  HloModule module, num_partitions=1
+
+  ENTRY entry {
+    input = f32[4] parameter(0)
+    output = f32[4] parameter(1)
+    input_offsets = s32[2] parameter(2)
+    send_sizes = s32[2] parameter(3)
+    output_offsets = s32[2] parameter(4)
+    recv_sizes = s32[2] parameter(5)
+    ROOT ra2a = f32[4] ragged-all-to-all(input, output, input_offsets,
+    send_sizes, output_offsets, recv_sizes), replica_groups={{0,1}}
+  })";
+
+  const int64_t kNumReplicas = 2;
+  ASSERT_GE(hlo_runner_->device_count(), kNumReplicas)
+      << "Test requires at least " << kNumReplicas << " devices ("
+      << hlo_runner_->device_count() << " available)";
+
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(
+                                        kModuleReplicatedStr, kNumReplicas));
+
+  // Verify correctness of ragged-all-to-all when command buffers for
+  // collectives are enabled.
+  // As of Dec 2025, ragged-all-to-all command is not implemented, so this test
+  // verifies that we don't try to accidentally create a command buffer and
+  // crash.
+  DebugOptions& debug_options =
+      module->mutable_config().mutable_debug_options();
+  debug_options.add_xla_gpu_enable_command_buffer(DebugOptions::COLLECTIVES);
+  debug_options.set_xla_gpu_graph_min_graph_size(1);
+
+  ASSERT_OK(CreateRandomTestData(module.get(),
+                                 /*input_sizes=*/{/*replica_0=*/{1, 1},
+                                                  /*replica_1=*/{3, 1}}));
+
+  ASSERT_OK_AND_ASSIGN(
+      ExecutionResult execution_result,
+      ExecuteReplicated(std::move(module), GetInputLiteralPtrs()));
+
+  const std::vector<Literal>& results = execution_result.results;
+
+  ASSERT_EQ(results.size(), kNumReplicas);
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected_outputs_[0], results[0]));
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected_outputs_[1], results[1]));
+}
+
 TEST_P(RaggedAllToAllTest, RaggedAllToAll_2GPUs_S4) {
   absl::string_view kModuleReplicatedStr = R"(
   HloModule module, num_partitions=1

From a8ad7b1565df6a2e737756f7fa55002be3d58b98 Mon Sep 17 00:00:00 2001
From: Nikita Putikhin <nputikhin@google.com>
Date: Tue, 16 Dec 2025 04:28:13 -0800
Subject: [PATCH 331/753] Reverts deb256b360c624f8d476d888eb5dc1f902210fac

PiperOrigin-RevId: 845202732
---
 .../gpu/transforms/gemm_fusion_test.cc        | 52 -------------------
 .../xla/service/gpu/triton_fusion_analysis.cc | 30 +++--------
 .../service/gpu/triton_tiling_propagation.cc  | 28 ++--------
 .../service/gpu/triton_tiling_propagation.h   |  2 -
 4 files changed, 10 insertions(+), 102 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
index 8036bc09ab2ce6..c77d76d6954aed 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
@@ -227,58 +227,6 @@ ENTRY e {
   EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
 }
 
-TEST_F(GemmFusionTest, FuseSliceWithOtherUsersWhenDotHasSmallK) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(R"(
-ENTRY e {
-  p0 = bf16[512,3584]{1,0} parameter(0)
-  p1 = bf16[3584,14400]{0,1} parameter(1)
-  p2 = bf16[64,14336]{1,0} parameter(2)
-
-  d0 = bf16[512,14400]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  sl0 = bf16[512,14336]{1,0} slice(d0), slice={[0:512], [0:14336]}
-
-  sl1 = bf16[512,64]{1,0} slice(d0), slice={[0:512], [14336:14400]}
-  d1 = bf16[512,14336]{1,0} dot(sl1, p2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  ROOT a0 = bf16[512,14336]{1,0} add(sl0, d1)
-})"));
-
-  const se::CudaComputeCapability cc{se::CudaComputeCapability::kHopper, 0};
-  EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
-
-  // Check that the second dot is fused and the fusion contains sl1.
-  // We make no assumptions about other fusions.
-  constexpr absl::string_view kExpectedHloText = R"(
-    CHECK: %[[FUSION_DOT:.*]] (
-    CHECK:   %[[SLICE:.*]] = bf16[512,64]{1,0} slice(%parameter_0), slice={[0:512], [14336:14400]}
-    CHECK:   ROOT {{.*}} = bf16[512,14336]{1,0} dot(%[[SLICE]], %parameter_1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-    CHECK: ENTRY
-    CHECK-DAG: %[[FUSION_D1:.*]] = bf16[512,14336]{1,0} fusion({{.*}}, {{.*}}), kind=kCustom, calls=%[[FUSION_DOT]]
-    CHECK-DAG: ROOT %a0 = bf16[512,14336]{1,0} add({{.*}}, %[[FUSION_D1]])
-  )";
-  MatchHloModule(*module, kExpectedHloText);
-}
-
-TEST_F(GemmFusionTest, DoNotFuseSliceWithOtherUsersWhenDotHasLargeK) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
-                          ParseAndReturnVerifiedModule(R"(
-ENTRY e {
-  p0 = bf16[512,3584]{1,0} parameter(0)
-  p1 = bf16[3584,14400]{0,1} parameter(1)
-  p2 = bf16[1400,14336]{1,0} parameter(2)
-
-  d0 = bf16[512,14400]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  sl0 = bf16[512,14336]{1,0} slice(d0), slice={[0:512], [0:14336]}
-  sl1 = bf16[512,1400]{1,0} slice(d0), slice={[0:512], [13000:14400]}
-
-  d1 = bf16[512,14336]{1,0} dot(sl1, p2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  ROOT a0 = bf16[512,14336]{1,0} add(sl0, d1)
-})"));
-
-  const se::CudaComputeCapability cc{se::CudaComputeCapability::kHopper, 0};
-  EXPECT_FALSE(GemmFusion(cc).Run(module.get()).value());
-}
-
 TEST_F(GemmFusionTest, DoNotFuseSliceOfMixedDimensions) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
                           ParseAndReturnVerifiedModule(R"(
diff --git a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
index 9f7fbcba0aa98b..cfc47a333955d0 100644
--- a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
@@ -57,16 +57,6 @@ using triton_fusion::GetPropagatedDimOrdersAndRequirements;
 using triton_fusion::kNoSplitRequirement;
 using triton_fusion::TransformDirection;
 
-int64_t GetContractingDimSize(const HloInstruction& dot) {
-  const auto& contracting_dims =
-      ContractingDimensionsForOperand(dot, /*operand_number=*/0);
-  int64_t contracting_dim_size = 1;
-  for (int64_t dim : contracting_dims) {
-    contracting_dim_size *= dot.operand(0)->shape().dimensions(dim);
-  }
-  return contracting_dim_size;
-}
-
 }  // namespace
 
 namespace triton_fusion {
@@ -91,13 +81,9 @@ namespace triton_fusion {
           0) {
     splittable_dimension_index = non_contracting_dimension_index;
   }
-
-  int64_t contracting_size = GetContractingDimSize(dot);
-
-  FusionContext context(
-      DotProperties{non_contracting_dimension_index, splittable_dimension_index,
-                    contracting_size},
-      DotRequirements(kNoSplitRequirement));
+  FusionContext context(DotProperties{non_contracting_dimension_index,
+                                      splittable_dimension_index},
+                        DotRequirements(kNoSplitRequirement));
   context.dim_orders_[dot.operand(operand_number)] =
       DimensionOrder::FromDotOperandOrOutput(*dot.operand(operand_number),
                                              split_k_dimension_index);
@@ -116,13 +102,9 @@ namespace triton_fusion {
     // LHS non-contracting follows (batch is absent in this case).
     splittable_dimension_index = (split_k > 1) ? 1 : 0;
   }
-
-  int64_t contracting_size = GetContractingDimSize(dot);
-
-  FusionContext context(
-      DotProperties{/*noncontracting_dimension=*/-1, splittable_dimension_index,
-                    contracting_size},
-      std::move(requirements));
+  FusionContext context(DotProperties{/*noncontracting_dimension=*/-1,
+                                      splittable_dimension_index},
+                        std::move(requirements));
   context.dim_orders_[&dot] = DimensionOrder::FromDotOperandOrOutput(dot);
   return context;
 }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
index 24b5e4879240c4..926307d6f0c0e7 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
@@ -992,16 +992,11 @@ bool CanNotBeFusedIntoAUser(const HloInstruction& hlo) {
                           hlo.users()[0]->opcode() == HloOpcode::kTuple);
 }
 
-// Maximum contracting dimension size for which slice fusion is allowed when
-// the operand has multiple users.
-constexpr int kMaxContractingDimSizeForSliceFusion = 1024;
-
 // Let input and output data volumes of a fusion grow by small amounts.
 constexpr int kIoToleranceBytes = 1024;
 
 // Tells that fusing an instruction as an input is efficient.
-bool IsInputWorthFusing(const HloInstruction& hlo,
-                        const DotProperties& properties) {
+bool IsInputWorthFusing(const HloInstruction& hlo) {
   std::optional<int64_t> input_minus_output_bytes = InputMinusOutputBytes(hlo);
   if (!input_minus_output_bytes.has_value()) {
     return false;
@@ -1016,21 +1011,6 @@ bool IsInputWorthFusing(const HloInstruction& hlo,
       hlo_query::AllOperandsAreParametersOrConstants(hlo)) {
     return true;
   }
-  // Explanation:
-  // * Operand user count > 1 - if the producer of the slice has a single user
-  //   the slice can be fused into the producer instead of here.
-  // * contracting_dim_size < 1024 - fusing slices disables split-K rewriter,
-  //   which may outweigh the benefit of fusing it in the first place. Small
-  //   contracting dimension almost never benefits from splitting it, so we
-  //   allow the fusion.
-
-  // TODO: b/393299275 - Remove the contracting dim size restriction once the
-  // new emitter lands and we can support slices in contracting dimension with
-  // splits.
-  if (hlo.opcode() == HloOpcode::kSlice && hlo.operand(0)->user_count() > 1 &&
-      properties.contracting_dim_size <= kMaxContractingDimSizeForSliceFusion) {
-    return true;
-  }
   const bool enable_subchannel_dequantisation_fusion =
       hlo.GetModule()
           ->config()
@@ -1038,8 +1018,8 @@ bool IsInputWorthFusing(const HloInstruction& hlo,
           .xla_gpu_experimental_enable_subchannel_dequantisation_fusion();
   if (hlo.opcode() == HloOpcode::kMultiply) {
     return enable_subchannel_dequantisation_fusion &&
-           IsInputWorthFusing(*hlo.operand(0), properties) &&
-           IsInputWorthFusing(*hlo.operand(1), properties);
+           IsInputWorthFusing(*hlo.operand(0)) &&
+           IsInputWorthFusing(*hlo.operand(1));
   }
   return hlo_query::AllOperandsAreParametersOrConstantsWithSingleUser(hlo);
 }
@@ -1159,7 +1139,7 @@ GetPropagatedDimOrdersAndRequirementsIfProfitablyFusible(
         }
       }
     }
-    if (!accepted && !IsInputWorthFusing(hlo, properties)) {
+    if (!accepted && !IsInputWorthFusing(hlo)) {
       return FusionDecision::Forbid(
           "Not obviously profitable to fuse as input.");
     }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
index df09b35a1f0ffc..a83dd9c976f8c4 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
@@ -249,8 +249,6 @@ struct DotProperties {
   // Index of dot dimension that can be split.
   // Currently typically LHS non-contracting one.
   const int splittable_dimension_index;
-  // Size of the contracting dimension (K).
-  const int64_t contracting_dim_size;
 };
 
 // A special value for splittable_dimension_major_part_size.

From 62020f04ae210ab456ed7c88b3079861522a2f6b Mon Sep 17 00:00:00 2001
From: Ilya Tikhonovskiy <loislo@google.com>
Date: Tue, 16 Dec 2025 04:33:34 -0800
Subject: [PATCH 332/753] [XLA:GPU] Refine scaled_dot composite rewriting
 logic.

The composite rewriter now supports two types of scaled_dot patterns: FP8 inputs with FP8 scales where the scale factor is a multiple of 32, and BF16 inputs with BF16 scales that are constant tensors of all ones. The test suite has been updated to cover these different scenarios.

PiperOrigin-RevId: 845204253
---
 .../xla/xla/service/gpu/transforms/BUILD      |   3 +
 .../gpu/transforms/composite_rewriter.cc      |  64 +++--
 .../gpu/transforms/composite_rewriter_test.cc | 263 +++++++++++++++---
 3 files changed, 278 insertions(+), 52 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 9f017b8ad97df5..89b60223b66765 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -2371,6 +2371,7 @@ cc_library(
     srcs = ["composite_rewriter.cc"],
     hdrs = ["composite_rewriter.h"],
     deps = [
+        "//xla:literal",
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
@@ -2397,7 +2398,9 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:status_matchers",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
     ],
 )
diff --git a/third_party/xla/xla/service/gpu/transforms/composite_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/composite_rewriter.cc
index 7afd2c38666760..dc8e754bf52ae2 100644
--- a/third_party/xla/xla/service/gpu/transforms/composite_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/transforms/composite_rewriter.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/literal.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
@@ -143,28 +144,51 @@ absl::StatusOr<bool> CompositeRewriter::RewriteComputation(
     const HloInstruction* lhs_scale = call->operand(2);
     const HloInstruction* rhs_scale = call->operand(3);
 
-    if (lhs->shape().element_type() != BF16) {
-      int64_t contracting_dim =
-          dot_dimension_numbers.lhs_contracting_dimensions(0);
-      int64_t scale_factor = lhs->shape().dimensions(contracting_dim) /
-                             lhs_scale->shape().dimensions(contracting_dim);
-      if (scale_factor != 32) {
-        VLOG(2) << "LHS scale_factor is not 32: " << scale_factor
-                << " ignore such scaled_dot. It will be inlined later.";
-        continue;
+    int64_t lhs_contracting_dim =
+        dot_dimension_numbers.lhs_contracting_dimensions(0);
+    int64_t rhs_contracting_dim =
+        dot_dimension_numbers.rhs_contracting_dimensions(0);
+
+    auto is_supported = [&](const HloInstruction* operand,
+                            const HloInstruction* scale,
+                            int64_t contracting_dim) {
+      auto op_type = operand->shape().element_type();
+      auto scale_type = scale->shape().element_type();
+      if ((op_type == F8E4M3FN || op_type == F8E5M2) &&
+          scale_type == F8E8M0FNU) {
+        if (contracting_dim >= scale->shape().dimensions_size()) {
+          return false;
+        }
+        int64_t operand_dim_size = operand->shape().dimensions(contracting_dim);
+        int64_t scale_dim_size = scale->shape().dimensions(contracting_dim);
+
+        if (scale_dim_size == 0 || operand_dim_size % scale_dim_size != 0) {
+          return false;
+        }
+        int64_t scale_factor = operand_dim_size / scale_dim_size;
+        return scale_factor % 32 == 0;
       }
-    }
-
-    if (rhs->shape().element_type() != BF16) {
-      int64_t contracting_dim =
-          dot_dimension_numbers.rhs_contracting_dimensions(0);
-      int64_t scale_factor = rhs->shape().dimensions(contracting_dim) /
-                             rhs_scale->shape().dimensions(contracting_dim);
-      if (scale_factor != 32) {
-        VLOG(2) << "RHS scale_factor is not 32: " << scale_factor
-                << " ignore such scaled_dot for now. It will be inlined later.";
-        continue;
+      if (op_type == BF16 && scale_type == BF16) {
+        if (scale->shape().dimensions_size() !=
+            operand->shape().dimensions_size()) {
+          return false;
+        }
+        for (int64_t dim : scale->shape().dimensions()) {
+          if (dim != 1) {
+            return false;
+          }
+        }
+        if (scale->opcode() != HloOpcode::kConstant) {
+          return false;
+        }
+        return scale->literal().IsAllFloat(1.0);
       }
+      return false;
+    };
+
+    if (!is_supported(lhs, lhs_scale, lhs_contracting_dim) ||
+        !is_supported(rhs, rhs_scale, rhs_contracting_dim)) {
+      continue;
     }
 
     PrecisionConfig precision{};
diff --git a/third_party/xla/xla/service/gpu/transforms/composite_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/composite_rewriter_test.cc
index a2935374ef2c6b..e81a5f13a9e842 100644
--- a/third_party/xla/xla/service/gpu/transforms/composite_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/composite_rewriter_test.cc
@@ -15,11 +15,16 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/composite_rewriter.h"
 
+#include <optional>
 #include <string>
+#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/log/log.h"
 #include "absl/status/status_matchers.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/substitute.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/tsl/platform/statusor.h"
@@ -27,39 +32,109 @@ limitations under the License.
 namespace xla::gpu {
 namespace {
 
-TEST(CompositeRewriterTest, ScaledDotCompositeRewrite) {
-  const std::string hlo_string = R"(
-    HloModule jit_my_dot
+struct TestCase {
+  std::string test_name;
+  std::string lhs_type;
+  std::string rhs_type;
+  std::string lhs_scale_type;
+  std::string rhs_scale_type;
+  std::string lhs_scale_shape;
+  std::string rhs_scale_shape;
+  std::optional<float> lhs_scale_const_val;
+  std::optional<float> rhs_scale_const_val;
+  bool expected_rewrite;
+};
+
+std::string GenerateHlo(const TestCase& test_case) {
+  // Helper to generate scale definition (either parameter or constant)
+  // and maintain the list of main parameters.
+  std::string main_params_decl;
+  std::vector<std::string> call_operands;
+  int param_idx = 0;
+
+  // LHS operand (always param 0)
+  main_params_decl +=
+      absl::Substitute("  %lhs = $0[3,128,256]{2,1,0} parameter($1)\n",
+                       test_case.lhs_type, param_idx++);
+  call_operands.push_back("%lhs");
+
+  // RHS operand (always param 1)
+  main_params_decl +=
+      absl::Substitute("  %rhs = $0[3,256,128]{2,1,0} parameter($1)\n",
+                       test_case.rhs_type, param_idx++);
+  call_operands.push_back("%rhs");
+
+  // LHS Scale
+  if (test_case.lhs_scale_const_val.has_value()) {
+    std::string val_str = std::to_string(*test_case.lhs_scale_const_val);
+    // Remove trailing zeros for cleanliness
+    val_str.erase(val_str.find_last_not_of('0') + 1, std::string::npos);
+    if (val_str.back() == '.') {
+      val_str.pop_back();
+    }
+
+    std::string literal;
+    if (test_case.lhs_scale_shape == "3,1,1") {
+      literal = absl::Substitute("{{{$0}}, {{$0}}, {{$0}}}", val_str);
+    } else {
+      // Assume rank 3 scalar for 1,1,1 or others
+      literal = absl::Substitute("{{{$0}}}", val_str);
+    }
+
+    main_params_decl += absl::Substitute(
+        "  %lhs_scales = $0[$1]{2,1,0} constant($2)\n",
+        test_case.lhs_scale_type, test_case.lhs_scale_shape, literal);
+  } else {
+    main_params_decl += absl::Substitute(
+        "  %lhs_scales = $0[$1]{2,1,0} parameter($2)\n",
+        test_case.lhs_scale_type, test_case.lhs_scale_shape, param_idx++);
+  }
+  call_operands.push_back("%lhs_scales");
+
+  // RHS Scale
+  if (test_case.rhs_scale_const_val.has_value()) {
+    std::string val_str = std::to_string(*test_case.rhs_scale_const_val);
+    val_str.erase(val_str.find_last_not_of('0') + 1, std::string::npos);
+    if (val_str.back() == '.') {
+      val_str.pop_back();
+    }
+
+    std::string literal;
+    if (test_case.rhs_scale_shape == "3,1,1") {
+      literal = absl::Substitute("{{{$0}}, {{$0}}, {{$0}}}", val_str);
+    } else {
+      literal = absl::Substitute("{{{$0}}}", val_str);
+    }
+
+    main_params_decl += absl::Substitute(
+        "  %rhs_scales = $0[$1]{2,1,0} constant($2)\n",
+        test_case.rhs_scale_type, test_case.rhs_scale_shape, literal);
+  } else {
+    main_params_decl += absl::Substitute(
+        "  %rhs_scales = $0[$1]{2,1,0} parameter($2)\n",
+        test_case.rhs_scale_type, test_case.rhs_scale_shape, param_idx++);
+  }
+  call_operands.push_back("%rhs_scales");
+
+  // Construct the HLO string
+  // Note: We use a dummy body for xla.scaled_dot.1 because the rewriter
+  // currently doesn't inspect it, only the call site.
+  // We match the parameter types to avoid parser errors.
+  std::string hlo_template = R"(
+    HloModule test_module
 
     %xla.scaled_dot.1 {
-      %lhs = f8e4m3fn[3,128,256]{2,1,0} parameter(0)
-      %lhs_bf16 = bf16[3,128,256]{2,1,0} convert(%lhs)
-      %lhs_scales = f8e8m0fnu[3,128,8]{2,1,0} parameter(2)
-      %lhs_scales_bf16 = bf16[3,128,8]{2,1,0} convert(%lhs_scales)
-      %lhs_scales_bf16_broadcasted = bf16[3,128,8,32]{3,2,1,0} broadcast(%lhs_scales_bf16), dimensions={0,1,2}
-      %lhs_scales_broadcasted = bf16[3,128,256]{2,1,0} reshape(%lhs_scales_bf16_broadcasted)
-      %lhs_scaled = bf16[3,128,256]{2,1,0} multiply(%lhs_bf16, %lhs_scales_broadcasted)
-      %rhs = f8e4m3fn[3,128,256]{2,1,0} parameter(1)
-      %rhs_bf16 = bf16[3,128,256]{2,1,0} convert(%rhs)
-      %rhs_scales = f8e8m0fnu[3,128,8]{2,1,0} parameter(3)
-      %rhs_scales_bf16 = bf16[3,128,8]{2,1,0} convert(%rhs_scales)
-      %rhs_scales_bf16_broadcasted = bf16[3,128,8,32]{3,2,1,0} broadcast(%rhs_scales_bf16), dimensions={0,1,2}
-      %rhs_scales_broadcasted = bf16[3,128,256]{2,1,0} reshape(%rhs_scales_bf16_broadcasted)
-      %rhs_scaled = bf16[3,128,256]{2,1,0} multiply(%rhs_bf16, %rhs_scales_broadcasted)
-      %rhs_scaled_transposed = bf16[3,256,128]{1,2,0} transpose(%rhs_scaled), dimensions={0,2,1}
-      ROOT %dot_general.1 = bf16[3,128,128]{2,1,0} dot(%lhs_scaled, %rhs_scaled_transposed),
-          lhs_batch_dims={0},
-          lhs_contracting_dims={2},
-          rhs_batch_dims={0},
-          rhs_contracting_dims={1}
+      %p0 = $0[3,128,256]{2,1,0} parameter(0)
+      %p1 = $1[3,256,128]{2,1,0} parameter(1)
+      %p2 = $2[$4]{2,1,0} parameter(2)
+      %p3 = $3[$5]{2,1,0} parameter(3)
+      // Dummy root with correct shape
+      ROOT %dummy = bf16[3,128,128]{2,1,0} constant({...})
     }
 
     ENTRY %main {
-      %lhs = f8e4m3fn[3,128,256]{2,1,0} parameter(0)
-      %rhs = f8e4m3fn[3,256,128]{2,1,0} parameter(1)
-      %lhs_scales = f8e8m0fnu[3,128,8]{2,1,0} parameter(2)
-      %rhs_scales = f8e8m0fnu[3,8,128]{2,1,0} parameter(3)
-      ROOT %call.1 = bf16[3,128,128]{2,1,0} call(%lhs, %rhs, %lhs_scales, %rhs_scales),
+      $6
+      ROOT %call = bf16[3,128,128]{2,1,0} call($7),
           to_apply=%xla.scaled_dot.1,
           is_composite=true,
           frontend_attributes={
@@ -67,14 +142,138 @@ TEST(CompositeRewriterTest, ScaledDotCompositeRewrite) {
             composite.name="xla.scaled_dot",
             composite.version="1"
           }
-    })";
+    }
+  )";
+
+  std::string call_operands_str = absl::StrJoin(call_operands, ", ");
+
+  return absl::Substitute(hlo_template, test_case.lhs_type, test_case.rhs_type,
+                          test_case.lhs_scale_type, test_case.rhs_scale_type,
+                          test_case.lhs_scale_shape, test_case.rhs_scale_shape,
+                          main_params_decl, call_operands_str);
+}
+
+class CompositeRewriterParameterizedTest
+    : public ::testing::TestWithParam<TestCase> {};
+
+TEST_P(CompositeRewriterParameterizedTest, Run) {
+  const TestCase& test_case = GetParam();
+  std::string hlo_string = GenerateHlo(test_case);
+  LOG(INFO) << "HLO string: \n" << hlo_string;
+
   CompositeRewriter rewriter;
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnUnverifiedModule(hlo_string));
-  EXPECT_THAT(rewriter.Run(module.get()), absl_testing::IsOkAndHolds(true));
-  EXPECT_THAT(module->entry_computation()->root_instruction()->opcode(),
-              HloOpcode::kScaledDot);
+
+  auto result = rewriter.Run(module.get());
+
+  if (test_case.expected_rewrite) {
+    EXPECT_THAT(result, absl_testing::IsOkAndHolds(true));
+    EXPECT_THAT(module->entry_computation()->root_instruction()->opcode(),
+                HloOpcode::kScaledDot);
+  } else {
+    // If it didn't rewrite, it should either be OkAndHolds(false)
+    // or arguably just check that the opcode is still Call.
+    // The current implementation returns OkAndHolds(false) if no change.
+    EXPECT_THAT(result, absl_testing::IsOkAndHolds(false));
+    EXPECT_THAT(module->entry_computation()->root_instruction()->opcode(),
+                HloOpcode::kCall);
+  }
 }
 
+INSTANTIATE_TEST_SUITE_P(
+
+    ScaledDotTests, CompositeRewriterParameterizedTest,
+
+    ::testing::Values(
+        TestCase{
+            /*test_name=*/"FP8_Standard_Case",
+            /*lhs_type=*/"f8e4m3fn",
+            /*rhs_type=*/"f8e4m3fn",
+            /*lhs_scale_type=*/"f8e8m0fnu",
+            /*rhs_scale_type=*/"f8e8m0fnu",
+            /*lhs_scale_shape=*/"3,128,8",
+            /*rhs_scale_shape=*/"3,8,128",
+            /*lhs_scale_const_val=*/std::nullopt,
+            /*rhs_scale_const_val=*/std::nullopt,
+            /*expected_rewrite=*/true,
+        },
+        TestCase{
+            /*test_name=*/"BF16_Identity_Case",
+            /*lhs_type=*/"bf16",
+            /*rhs_type=*/"bf16",
+            /*lhs_scale_type=*/"bf16",
+            /*rhs_scale_type=*/"bf16",
+            /*lhs_scale_shape=*/"1,1,1",
+            /*rhs_scale_shape=*/"1,1,1",
+            /*lhs_scale_const_val=*/1.0f,
+            /*rhs_scale_const_val=*/1.0f,
+            /*expected_rewrite=*/true,
+        },
+        TestCase{
+            /*test_name=*/"BF16_Invalid_Scale_Value",
+            /*lhs_type=*/"bf16",
+            /*rhs_type=*/"bf16",
+            /*lhs_scale_type=*/"bf16",
+            /*rhs_scale_type=*/"bf16",
+            /*lhs_scale_shape=*/"1,1,1",
+            /*rhs_scale_shape=*/"1,1,1",
+            /*lhs_scale_const_val=*/1.0f,
+            /*rhs_scale_const_val=*/2.0f,
+            /*expected_rewrite=*/false,
+        },
+        TestCase{
+            /*test_name=*/"BF16_Invalid_Scale_Shape",
+            /*lhs_type=*/"bf16",
+            /*rhs_type=*/"bf16",
+            /*lhs_scale_type=*/"bf16",
+            /*rhs_scale_type=*/"bf16",
+            /*lhs_scale_shape=*/"3,128,1",
+            /*rhs_scale_shape=*/"1,1,1",
+            /*lhs_scale_const_val=*/std::nullopt,
+            /*rhs_scale_const_val=*/1.0f,
+            /*expected_rewrite=*/false,
+        },
+        TestCase{
+            /*test_name=*/"Mixed_Type_Fail_BF16_Scale_With_FP8_Op",
+            /*lhs_type=*/"f8e4m3fn",
+            /*rhs_type=*/"f8e4m3fn",
+            /*lhs_scale_type=*/"bf16",
+            /*rhs_scale_type=*/"f8e8m0fnu",
+            /*lhs_scale_shape=*/"3,128,8",
+            /*rhs_scale_shape=*/"3,8,128",
+            /*lhs_scale_const_val=*/std::nullopt,
+            /*rhs_scale_const_val=*/std::nullopt,
+            /*expected_rewrite=*/false,
+        },
+        TestCase{
+            /*test_name=*/"FP8_ScaleFactor_16",
+            /*lhs_type=*/"f8e4m3fn",
+            /*rhs_type=*/"f8e4m3fn",
+            /*lhs_scale_type=*/"f8e8m0fnu",
+            /*rhs_scale_type=*/"f8e8m0fnu",
+            /*lhs_scale_shape=*/"3,128,16",  // 256 / 16 = 16 (not divisible by
+                                             // 32)
+            /*rhs_scale_shape=*/"3,8,128",
+            /*lhs_scale_const_val=*/std::nullopt,
+            /*rhs_scale_const_val=*/std::nullopt,
+            /*expected_rewrite=*/false,
+        },
+        TestCase{
+            /*test_name=*/"FP8_ScaleFactor_64",
+            /*lhs_type=*/"f8e4m3fn",
+            /*rhs_type=*/"f8e4m3fn",
+            /*lhs_scale_type=*/"f8e8m0fnu",
+            /*rhs_scale_type=*/"f8e8m0fnu",
+            /*lhs_scale_shape=*/"3,128,4",  // 256 / 4 = 64 (divisible by 32)
+            /*rhs_scale_shape=*/"3,8,128",
+            /*lhs_scale_const_val=*/std::nullopt,
+            /*rhs_scale_const_val=*/std::nullopt,
+            /*expected_rewrite=*/true,
+        }),
+    [](const ::testing::TestParamInfo<TestCase>& info) {
+      return info.param.test_name;
+    });
+
 }  // namespace
 }  // namespace xla::gpu

From 3bff94579443c830b19fe0a176fb4b1d8b12a4ce Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Tue, 16 Dec 2025 04:51:00 -0800
Subject: [PATCH 333/753] Remove AsGpuStreamValue and split NCCL/RCCL targets

This splits the NCCL collective targets into NCCL and RCCL implementations. It's the last little bit of code that has not yet been separated.

It's also the last user of gpu_stream.h which can also be deleted.

API Changes:
- `GpuCollectives::Default` now takes a platform as an argument. Previously we've been using "gpu" as the platform name, but I'm now migrating this to CUDA and ROCM which are the platform names that we use anywhere else.
- I also moved NvshmemCollectives onto the CUDA platform

PiperOrigin-RevId: 845209298
---
 .../xla/xla/backends/gpu/collectives/BUILD    | 254 ++++--
 .../gpu/collectives/gpu_collectives.cc        |   5 +-
 .../gpu/collectives/gpu_collectives.h         |   5 +-
 .../gpu/collectives/nccl_collectives.cc       |  25 +-
 .../gpu/collectives/nccl_communicator.cc      |  57 +-
 .../gpu/collectives/nccl_communicator.h       |  14 +-
 .../gpu/collectives/nccl_communicator_test.cc |  14 +-
 .../backends/gpu/collectives/nccl_errors.cc   |  12 +-
 .../backends/gpu/collectives/nccl_errors.h    |  14 +-
 .../gpu/collectives/nvshmem_collectives.cc    |   4 +-
 .../gpu/collectives/nvshmem_communicator.cc   |  66 +-
 .../gpu/collectives/rccl_collectives.cc       | 407 +++++++++
 .../gpu/collectives/rccl_collectives.h        |  93 ++
 .../gpu/collectives/rccl_communicator.cc      | 831 ++++++++++++++++++
 .../gpu/collectives/rccl_communicator.h       | 254 ++++++
 .../gpu/collectives/rccl_communicator_test.cc | 162 ++++
 .../backends/gpu/collectives/rccl_errors.cc   |  59 ++
 .../backends/gpu/collectives/rccl_errors.h    |  83 ++
 .../backends/gpu/runtime/collective_params.cc |   5 +-
 third_party/xla/xla/pjrt/gpu/BUILD            |   1 -
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |   6 +-
 third_party/xla/xla/service/gpu/BUILD         |   6 +-
 .../xla/xla/stream_executor/cuda/BUILD        |   5 +-
 .../xla/stream_executor/cuda/cuda_timer.cc    |   7 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |  13 -
 .../xla/xla/stream_executor/gpu/gpu_stream.cc |  33 -
 .../xla/xla/stream_executor/gpu/gpu_stream.h  |  33 -
 27 files changed, 2168 insertions(+), 300 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_collectives.cc
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_collectives.h
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_communicator.h
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_communicator_test.cc
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_errors.cc
 create mode 100644 third_party/xla/xla/backends/gpu/collectives/rccl_errors.h
 delete mode 100644 third_party/xla/xla/stream_executor/gpu/gpu_stream.cc
 delete mode 100644 third_party/xla/xla/stream_executor/gpu/gpu_stream.h

diff --git a/third_party/xla/xla/backends/gpu/collectives/BUILD b/third_party/xla/xla/backends/gpu/collectives/BUILD
index f0f7ee78a657e3..291f156d5000d5 100644
--- a/third_party/xla/xla/backends/gpu/collectives/BUILD
+++ b/third_party/xla/xla/backends/gpu/collectives/BUILD
@@ -19,6 +19,17 @@ package_group(
     ],
 )
 
+alias(
+    # Since RCCL reimplements the NCCL API and there is no way to disable that,
+    # we need to make sure that we only link either NCCL or RCCL.
+    name = "nccl_or_rccl_collectives",
+    actual = if_rocm_is_configured(
+        ":rccl_collectives",
+        ":nccl_collectives",
+    ),
+    tags = ["manual"],
+)
+
 # Build target that registers all available GPU collectives implementations with the collectives
 # registry at link time.
 cc_library(
@@ -26,7 +37,7 @@ cc_library(
     deps = [
         ":gpu_collectives_stub",
     ] + if_cuda_or_rocm_is_configured([
-        ":nccl_collectives",
+        ":nccl_or_rccl_collectives",
     ]) + if_cuda_is_configured([
         ":nvshmem_collectives_if_supported",
     ]),
@@ -192,6 +203,7 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
     ],
@@ -237,13 +249,30 @@ cc_library(
     name = "nccl_errors",
     srcs = ["nccl_errors.cc"],
     hdrs = ["nccl_errors.h"],
-    local_defines =
-        if_rocm_is_configured([
-            "TENSORFLOW_USE_ROCM=1",
-        ]),
+    tags = [
+        "cuda-only",
+        "gpu",
+        "no-oneapi",
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        "//xla:util",
+        "//xla/tsl/cuda:nccl",
+        "//xla/tsl/platform:logging",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/time",
+    ],
+)
+
+cc_library(
+    name = "rccl_errors",
+    srcs = ["rccl_errors.cc"],
+    hdrs = ["rccl_errors.h"],
     tags = [
         "gpu",
         "no-oneapi",
+        "rocm-only",
     ],
     visibility = ["//visibility:private"],
     deps = [
@@ -253,23 +282,17 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/time",
-    ] + if_cuda_is_configured([
-        "//xla/tsl/cuda:nccl",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rocm_headers",
         "@local_config_rocm//rocm:rccl",
-    ]),
+        "@local_config_rocm//rocm:rocm_headers",
+    ],
 )
 
 cc_library(
     name = "nccl_collectives",
     srcs = ["nccl_collectives.cc"],
     hdrs = ["nccl_collectives.h"],
-    local_defines =
-        if_rocm_is_configured([
-            "TENSORFLOW_USE_ROCM=1",
-        ]),
     tags = [
+        "cuda-only",
         "gpu",
         "no-oneapi",
     ],
@@ -277,11 +300,9 @@ cc_library(
     deps = [
         ":gpu_clique_key",
         ":gpu_collectives",
-        ":gpu_communicator",
         ":nccl_communicator",
         ":nccl_errors",
         "//xla:debug_options_flags",
-        "//xla:shape_util",
         "//xla:status_macros",
         "//xla:util",
         "//xla/core/collectives",
@@ -292,12 +313,10 @@ cc_library(
         "//xla/core/collectives:rank_id",
         "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/runtime:device_id",
-        "//xla/service:collective_ops_utils",
         "//xla/service/gpu:gpu_executable_run_options",
-        "//xla/stream_executor:device_address",
-        "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor/gpu:gpu_stream",
+        "//xla/stream_executor/cuda:nccl_memory_allocator",  # buildcleaner: keep (static registration)
+        "//xla/tsl/cuda:nccl",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
@@ -305,27 +324,70 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/debugging:leak_check",
-        "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
         "@local_tsl//tsl/platform:numbers",
-    ] + if_cuda_is_configured([
-        "//xla/tsl/cuda:nccl",
-        "//xla/stream_executor/cuda:nccl_memory_allocator",  # buildcleaner: keep (static registration)
-    ]) + if_rocm_is_configured([
+    ],
+    alwayslink = True,  # registers collectives implementation
+)
+
+cc_library(
+    name = "rccl_collectives",
+    srcs = ["rccl_collectives.cc"],
+    hdrs = ["rccl_collectives.h"],
+    tags = [
+        "gpu",
+        "no-oneapi",
+        "rocm-only",
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":gpu_clique_key",
+        ":gpu_collectives",
+        ":rccl_communicator",
+        ":rccl_errors",
+        "//xla:debug_options_flags",
+        "//xla:status_macros",
+        "//xla:util",
+        "//xla/core/collectives",
+        "//xla/core/collectives:clique_id",
+        "//xla/core/collectives:clique_key",
+        "//xla/core/collectives:collectives_registry",
+        "//xla/core/collectives:communicator",
+        "//xla/core/collectives:rank_id",
+        "//xla/pjrt/distributed:key_value_store_interface",
+        "//xla/runtime:device_id",
+        "//xla/service/gpu:gpu_executable_run_options",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/platform:env",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
+        "@com_google_absl//absl/types:span",
+        "@local_config_rocm//rocm:rccl",  # buildcleaner: keep
         "@local_config_rocm//rocm:rocm_headers",
-        "@local_config_rocm//rocm:rccl",
-    ]),
+        "@local_tsl//tsl/platform:casts",
+        "@local_tsl//tsl/platform:numbers",
+    ],
     alwayslink = True,  # registers collectives implementation
 )
 
@@ -333,10 +395,8 @@ cc_library(
     name = "nccl_communicator",
     srcs = ["nccl_communicator.cc"],
     hdrs = ["nccl_communicator.h"],
-    local_defines = if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]),
     tags = [
+        "cuda-only",
         "gpu",
         "no-oneapi",
     ],
@@ -348,35 +408,23 @@ cc_library(
         ":single_threaded_executor",
         "//xla:future",
         "//xla:shape_util",
-        "//xla:status_macros",
         "//xla:util",
-        "//xla/core/collectives",
-        "//xla/core/collectives:clique_id",
-        "//xla/core/collectives:clique_key",
-        "//xla/core/collectives:collectives_registry",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
-        "//xla/pjrt/distributed:key_value_store_interface",
-        "//xla/runtime:device_id",
-        "//xla/service:collective_ops_utils",
-        "//xla/service/gpu:gpu_executable_run_options",
+        "//xla/core/collectives:reduction_kind",
         "//xla/stream_executor:device_address",
-        "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor/gpu:gpu_stream",
-        "//xla/tsl/concurrency:async_value",
         "//xla/tsl/concurrency:executor",
+        "//xla/tsl/cuda:nccl",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
-        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
-        "@com_google_absl//absl/cleanup",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/debugging:leak_check",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/memory",
@@ -384,17 +432,59 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
-        "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
+        "@local_config_cuda//cuda:cuda_headers",
         "@local_tsl//tsl/platform:casts",
-        "@local_tsl//tsl/platform:numbers",
-    ] + if_cuda_is_configured([
-        "//xla/tsl/cuda:nccl",
-    ]) + if_rocm_is_configured([
+    ],
+)
+
+cc_library(
+    name = "rccl_communicator",
+    srcs = ["rccl_communicator.cc"],
+    hdrs = ["rccl_communicator.h"],
+    tags = [
+        "gpu",
+        "no-oneapi",
+        "rocm-only",
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":gpu_collectives",
+        ":gpu_communicator",
+        ":rccl_errors",
+        ":single_threaded_executor",
+        "//xla:future",
+        "//xla:shape_util",
+        "//xla:util",
+        "//xla/core/collectives:communicator",
+        "//xla/core/collectives:rank_id",
+        "//xla/core/collectives:reduction_kind",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:stream",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/concurrency:executor",
+        "//xla/tsl/platform:env",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/functional:any_invocable",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/types:span",
+        "@local_config_rocm//rocm:rccl",  # buildcleaner: keep
         "@local_config_rocm//rocm:rocm_headers",
-        "@local_config_rocm//rocm:rccl",
-    ]),
+        "@local_tsl//tsl/platform:casts",
+    ],
 )
 
 cc_library(
@@ -424,15 +514,16 @@ cc_library(
         "//xla/core/collectives:collectives_registry",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
+        "//xla/core/collectives:reduction_kind",
         "//xla/service:collective_ops_utils",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/stream_executor/cuda:nvshmem",
         "//xla/stream_executor/cuda:nvshmem_memory_allocator",  # buildcleaner: keep (static registration)
-        "//xla/stream_executor/gpu:gpu_stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
@@ -479,10 +570,6 @@ xla_test(
     name = "nccl_communicator_test",
     srcs = ["nccl_communicator_test.cc"],
     backends = ["gpu"],
-    local_defines =
-        if_rocm_is_configured([
-            "TENSORFLOW_USE_ROCM=1",
-        ]),
     tags = [
         # Stop chloroxylenol from running this test with msan because msan does
         # not work with CUDA.
@@ -493,6 +580,7 @@ xla_test(
         "no-oneapi",
         # TODO(b/435404154): Reenable once this is fixed.
         "no_oss",
+        "cuda-only",
     ],
     visibility = ["//visibility:private"],
     deps = [
@@ -501,8 +589,40 @@ xla_test(
         ":nccl_communicator",
         ":nccl_errors",
         "//xla:future",
+        "//xla/core/collectives:rank_id",
+        "//xla/core/collectives:reduction_kind",
+        "//xla/stream_executor:device_address",
+        "//xla/tsl/cuda:nccl",
+        "//xla/tsl/platform:errors",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:status_matchers",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+xla_test(
+    name = "rccl_communicator_test",
+    srcs = ["rccl_communicator_test.cc"],
+    backends = ["gpu"],
+    tags = [
+        "no-oneapi",
+        # TODO(b/435404154): Reenable once this is fixed.
+        "no_oss",
+        "rocm-only",
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":gpu_collectives",
+        ":rccl_collectives",
+        ":rccl_communicator",
+        ":rccl_errors",
+        "//xla:future",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
+        "//xla/core/collectives:reduction_kind",
         "//xla/service:collective_ops_utils",
         "//xla/stream_executor:device_address",
         "//xla/tsl/concurrency:async_value",
@@ -515,12 +635,9 @@ xla_test(
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/utility",
         "@com_google_googletest//:gtest_main",
-    ] + if_cuda_is_configured([
-        "//xla/tsl/cuda:nccl",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rocm_headers",
         "@local_config_rocm//rocm:rccl",
-    ]),
+        "@local_config_rocm//rocm:rocm_headers",
+    ],
 )
 
 xla_test(
@@ -549,12 +666,14 @@ xla_test(
     },
     tags = ["cuda-only"],
     deps = [
+        ":nvshmem_collectives",
         "//xla:debug_options_flags",
         "//xla:status_macros",
         "//xla/core/collectives:communicator",
         "//xla/pjrt/distributed",
         "//xla/pjrt/distributed:client",
         "//xla/pjrt/distributed:service",
+        "//xla/stream_executor/cuda:nvshmem",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status",
         "//xla/tsl/platform:statusor",
@@ -566,10 +685,7 @@ xla_test(
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest",
         "@local_config_cuda//cuda:cuda_headers",
-    ] + if_cuda_is_configured([
-        ":nvshmem_collectives",
-        "//xla/stream_executor/cuda:nvshmem",
-    ]),
+    ],
 )
 
 cc_library(
diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.cc
index 05c54e2cf1e022..ca98b036a35dd8 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "xla/core/collectives/collectives.h"
 #include "xla/core/collectives/collectives_registry.h"
 #include "xla/shape_util.h"
@@ -31,9 +32,9 @@ limitations under the License.
 
 namespace xla::gpu {
 
-GpuCollectives* GpuCollectives::Default() {
+GpuCollectives* GpuCollectives::Default(absl::string_view platform_name) {
   absl::StatusOr<Collectives*> collectives =
-      CollectivesRegistry::Default("gpu");
+      CollectivesRegistry::Default(platform_name);
   CHECK_OK(collectives) << "Failed to get GPU collectives";  // Crash OK
 
   if (auto* gpu_collectives = tsl::down_cast<GpuCollectives*>(*collectives)) {
diff --git a/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.h b/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.h
index 5c7c51f8f7068d..7297b98d6e2741 100644
--- a/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.h
+++ b/third_party/xla/xla/backends/gpu/collectives/gpu_collectives.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/core/collectives/clique_id.h"
 #include "xla/core/collectives/clique_key.h"
@@ -46,8 +47,8 @@ namespace xla::gpu {
 // XLA:GPU extension of the Collectives interface with GPU-specific APIs.
 class GpuCollectives : public Collectives {
  public:
-  // Returns the default collectives implementation for GPU backend.
-  static GpuCollectives* Default();
+  // Returns the default collectives implementation for the given platform.
+  static GpuCollectives* Default(absl::string_view platform_name);
 
   // A callback to get a unique clique id.
   using CliqueIdCallback =  // NOLINT
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
index 65ad3cdb8cd54f..b887bbb24e4e78 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
@@ -35,7 +35,9 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
 #include "absl/types/span.h"
+#include "third_party/nccl/nccl.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/collectives/nccl_communicator.h"
@@ -61,17 +63,6 @@ limitations under the License.
 #include "tsl/platform/casts.h"
 #include "tsl/platform/numbers.h"
 
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#if (TF_ROCM_VERSION >= 50200)
-#include "rocm/include/rccl/rccl.h"
-#else
-#include "rocm/include/rccl.h"
-#endif  // TF_ROCM_VERSION >= 50200
-#else
-#include "third_party/nccl/nccl.h"
-#endif  // TENSORFLOW_USE_ROCM
-
 namespace xla::gpu {
 
 static ncclComm_t Cast(const Communicator* comm) {
@@ -343,7 +334,8 @@ class NcclIdStore {
         device_to_node_(std::move(device_to_node)),
         kv_store_(std::move(kv_store)) {}
 
-  absl::StatusOr<CliqueId> GetNcclUniqueId(const CliqueKey& key) {
+  absl::StatusOr<CliqueId> GetNcclUniqueId(const CliqueKey& key,
+                                           NcclCollectives& nccl_collectives) {
     auto* gpu_key = tsl::down_cast<const gpu::GpuCliqueKey*>(&key);
     if (gpu_key == nullptr) {
       return InvalidArgument("Expected GPU clique key");
@@ -362,8 +354,7 @@ class NcclIdStore {
     CliqueId clique_id;
     int primary_node_id = device_to_node_.at(gpu_key->root_device());
     if (node_id_ == primary_node_id) {
-      TF_ASSIGN_OR_RETURN(
-          clique_id, gpu::GpuCollectives::Default()->CreateUniqueCliqueId());
+      TF_ASSIGN_OR_RETURN(clique_id, nccl_collectives.CreateUniqueCliqueId());
       TF_RETURN_IF_ERROR(
           kv_store_->Set(gpu_key->ToString(), clique_id.ToString()));
     } else {
@@ -399,13 +390,13 @@ absl::Status NcclCollectives::InitializeTopology(
         topology.node_id, topology.device_id_to_node_id,
         std::move(topology.kv_store));
     topology.gpu_executable_run_options->set_clique_id_callback(
-        [nccl_id_store](const CliqueKey& key) {
-          return nccl_id_store->GetNcclUniqueId(key);
+        [nccl_id_store, this](const CliqueKey& key) {
+          return nccl_id_store->GetNcclUniqueId(key, *this);
         });
   }
   return absl::OkStatus();
 }
 }  // namespace xla::gpu
 
-XLA_COLLECTIVES_REGISTER("gpu", "nccl", 1,
+XLA_COLLECTIVES_REGISTER("CUDA", "nccl", 1,
                          std::make_unique<xla::gpu::NcclCollectives>());
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc
index 1ed56cb153b71e..3c4b043ad1287d 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/memory/memory.h"
@@ -33,17 +34,18 @@ limitations under the License.
 #include "absl/strings/str_join.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "third_party/gpus/cuda/include/cuda.h"
+#include "third_party/nccl/nccl.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/collectives/gpu_communicator.h"
 #include "xla/backends/gpu/collectives/nccl_errors.h"
 #include "xla/backends/gpu/collectives/single_threaded_executor.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
 #include "xla/primitive_util.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/gpu/gpu_stream.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/concurrency/executor.h"
@@ -54,20 +56,13 @@ limitations under the License.
 #include "xla/util.h"
 #include "tsl/platform/casts.h"
 
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#if (TF_ROCM_VERSION >= 50200)
-#include "rocm/include/rccl/rccl.h"
-#else
-#include "rocm/include/rccl.h"
-#endif  // TF_ROCM_VERSION >= 50200
-#else
-#include "third_party/nccl/nccl.h"
-#endif  // TENSORFLOW_USE_ROCM
-
 namespace xla::gpu {
 namespace {
 
+CUstream AsCudaStream(se::Stream* stream) {
+  return absl::bit_cast<CUstream>(stream->platform_specific_handle().stream);
+}
+
 se::Stream* ToStream(const Communicator::Executor& executor) {
   return tsl::down_cast<const GpuCollectives::Executor&>(executor).stream();
 }
@@ -556,7 +551,7 @@ absl::Status NcclCommunicator::LaunchAllReduce(
   TF_RETURN_IF_ERROR(XLA_NCCL_STATUS(ncclAllReduce(
       send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
       nccl_dtype, ToNcclReduction(reduction_kind), comm_,
-      se::gpu::AsGpuStreamValue(stream))));
+      AsCudaStream(stream))));
   if (group_nesting_level_ == 0) {
     TF_RETURN_IF_ERROR(PollUntilDone());
   }
@@ -583,7 +578,7 @@ absl::Status NcclCommunicator::LaunchBroadcast(
 
   TF_RETURN_IF_ERROR(XLA_NCCL_STATUS(ncclBroadcast(
       send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
-      nccl_dtype, root.value(), comm_, se::gpu::AsGpuStreamValue(stream))));
+      nccl_dtype, root.value(), comm_, AsCudaStream(stream))));
   if (group_nesting_level_ == 0) {
     TF_RETURN_IF_ERROR(PollUntilDone());
   }
@@ -612,7 +607,7 @@ absl::Status NcclCommunicator::LaunchReduceScatter(
   TF_RETURN_IF_ERROR(XLA_NCCL_STATUS(ncclReduceScatter(
       send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
       nccl_dtype, ToNcclReduction(reduction_kind), comm_,
-      se::gpu::AsGpuStreamValue(stream))));
+      AsCudaStream(stream))));
   if (group_nesting_level_ == 0) {
     TF_RETURN_IF_ERROR(PollUntilDone());
   }
@@ -638,7 +633,7 @@ absl::Status NcclCommunicator::LaunchAllGather(
 
   TF_RETURN_IF_ERROR(XLA_NCCL_STATUS(ncclAllGather(
       send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
-      nccl_dtype, comm_, se::gpu::AsGpuStreamValue(stream))));
+      nccl_dtype, comm_, AsCudaStream(stream))));
   if (group_nesting_level_ == 0) {
     TF_RETURN_IF_ERROR(PollUntilDone());
   }
@@ -688,13 +683,13 @@ absl::Status NcclCommunicator::LaunchAllToAll(
     se::DeviceAddressBase send_buffer = send_buffers[i];
     se::DeviceAddressBase recv_buffer = recv_buffers[i];
 
-    XLA_NCCL_RETURN_IF_ERROR(
-        ncclSend(send_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype, i,
-                 comm_, se::gpu::AsGpuStreamValue(stream)));
+    XLA_NCCL_RETURN_IF_ERROR(ncclSend(send_buffer.opaque(),
+                                      ToNcclCount(dtype, count), nccl_dtype, i,
+                                      comm_, AsCudaStream(stream)));
 
-    XLA_NCCL_RETURN_IF_ERROR(
-        ncclRecv(recv_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype, i,
-                 comm_, se::gpu::AsGpuStreamValue(stream)));
+    XLA_NCCL_RETURN_IF_ERROR(ncclRecv(recv_buffer.opaque(),
+                                      ToNcclCount(dtype, count), nccl_dtype, i,
+                                      comm_, AsCudaStream(stream)));
   }
   TF_RETURN_IF_ERROR(GroupEnd());
   return absl::OkStatus();
@@ -732,15 +727,15 @@ absl::Status NcclCommunicator::LaunchCollectivePermute(
   TF_RETURN_IF_ERROR(GroupStart());
 
   if (source_rank) {
-    XLA_NCCL_RETURN_IF_ERROR(ncclRecv(
-        recv_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
-        source_rank->value(), comm_, se::gpu::AsGpuStreamValue(stream)));
+    XLA_NCCL_RETURN_IF_ERROR(
+        ncclRecv(recv_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
+                 source_rank->value(), comm_, AsCudaStream(stream)));
   }
 
   for (auto target_rank : target_ranks) {
-    XLA_NCCL_RETURN_IF_ERROR(ncclSend(
-        send_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
-        target_rank.value(), comm_, se::gpu::AsGpuStreamValue(stream)));
+    XLA_NCCL_RETURN_IF_ERROR(
+        ncclSend(send_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
+                 target_rank.value(), comm_, AsCudaStream(stream)));
   }
 
   TF_RETURN_IF_ERROR(GroupEnd());
@@ -768,7 +763,7 @@ absl::Status NcclCommunicator::LaunchSend(se::DeviceAddressBase send_buffer,
 
   TF_RETURN_IF_ERROR(XLA_NCCL_STATUS(
       ncclSend(send_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
-               peer.value(), comm_, se::gpu::AsGpuStreamValue(stream))));
+               peer.value(), comm_, AsCudaStream(stream))));
   if (group_nesting_level_ == 0) {
     TF_RETURN_IF_ERROR(PollUntilDone());
   }
@@ -795,7 +790,7 @@ absl::Status NcclCommunicator::LaunchRecv(se::DeviceAddressBase recv_buffer,
 
   TF_RETURN_IF_ERROR(XLA_NCCL_STATUS(
       ncclRecv(recv_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
-               peer.value(), comm_, se::gpu::AsGpuStreamValue(stream))));
+               peer.value(), comm_, AsCudaStream(stream))));
   if (group_nesting_level_ == 0) {
     TF_RETURN_IF_ERROR(PollUntilDone());
   }
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.h b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.h
index f23b21682c200e..3c1d5cbd805f07 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.h
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.h
@@ -32,27 +32,15 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "third_party/nccl/nccl.h"
 #include "xla/backends/gpu/collectives/gpu_communicator.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/future.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/device_address.h"
-#include "xla/tsl/concurrency/async_value_ref.h"
 #include "xla/tsl/concurrency/executor.h"
 #include "xla/tsl/platform/env.h"
 
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#if (TF_ROCM_VERSION >= 50200)
-#include "rocm/include/rccl/rccl.h"
-#else
-#include "rocm/include/rccl.h"
-#endif  // TF_ROCM_VERSION >= 50200
-#else
-#include "third_party/nccl/nccl.h"
-#endif  // TENSORFLOW_USE_ROCM
-
 namespace xla::gpu {
 
 // XLA collectives communicator wrapping an NCCL communicator.
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator_test.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator_test.cc
index 988081b1c94423..bc0e49a2a90795 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator_test.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator_test.cc
@@ -26,25 +26,15 @@ limitations under the License.
 #include "absl/status/status_matchers.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "third_party/nccl/nccl.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/collectives/nccl_errors.h"
 #include "xla/core/collectives/rank_id.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/tsl/platform/errors.h"
 
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#if (TF_ROCM_VERSION >= 50200)
-#include "rocm/include/rccl/rccl.h"
-#else
-#include "rocm/include/rccl.h"
-#endif  // TF_ROCM_VERSION >= 50200
-#else
-#include "third_party/nccl/nccl.h"
-#endif  // TENSORFLOW_USE_ROCM
-
 namespace xla::gpu {
 namespace {
 
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_errors.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_errors.cc
index bb3714843bcf95..4716280de16b6b 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_errors.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_errors.cc
@@ -20,18 +20,8 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
-#include "xla/util.h"
-
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#if (TF_ROCM_VERSION >= 50200)
-#include "rocm/include/rccl/rccl.h"
-#else
-#include "rocm/include/rccl.h"
-#endif  // TF_ROCM_VERSION >= 50200
-#else
 #include "third_party/nccl/nccl.h"
-#endif  // TENSORFLOW_USE_ROCM
+#include "xla/util.h"
 
 namespace xla::gpu {
 
diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_errors.h b/third_party/xla/xla/backends/gpu/collectives/nccl_errors.h
index 5968a3961d0c4e..5806ebf2e39582 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_errors.h
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_errors.h
@@ -19,20 +19,8 @@ limitations under the License.
 #include <atomic>
 
 #include "absl/status/status.h"
-#include "absl/strings/str_format.h"  // IWYU pragma: keep
-#include "xla/tsl/platform/logging.h"  // IWYU pragma: keep
-#include "xla/util.h"  // IWYU pragma: keep
-                                                       //
-#if TENSORFLOW_USE_ROCM
-#include "rocm/rocm_config.h"
-#if (TF_ROCM_VERSION >= 50200)
-#include "rocm/include/rccl/rccl.h"
-#else
-#include "rocm/include/rccl.h"
-#endif  // TF_ROCM_VERSION >= 50200
-#else
 #include "third_party/nccl/nccl.h"
-#endif  // TENSORFLOW_USE_ROCM
+#include "xla/tsl/platform/logging.h"
 
 //===----------------------------------------------------------------------===//
 // Collection of helper macros for handling NCCL errors.
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
index 2398a7cc37bec5..df73c5b8b0e892 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nvshmem_collectives.cc
@@ -48,7 +48,7 @@ bool NvshmemCollectives::IsInitialized() const {
 
 NvshmemCollectives* NvshmemCollectives::Default() {
   absl::StatusOr<Collectives*> collectives =
-      CollectivesRegistry::Get("gpu", "nvshmem");
+      CollectivesRegistry::Get("CUDA", "nvshmem");
   CHECK_OK(collectives) << "Failed to get NVSHMEM collectives";  // Crash OK
 
   if (auto* nvshmem_collectives =
@@ -98,5 +98,5 @@ NvshmemCollectives::CreateCommunicator() {
 
 // NvshmemCollectives currently does not implement GpuCollectives, so it cannot
 // be used as a host-side collectives library. Therefore, set priority to -100.
-XLA_COLLECTIVES_REGISTER("gpu", "nvshmem", -100,
+XLA_COLLECTIVES_REGISTER("CUDA", "nvshmem", -100,
                          std::make_unique<xla::gpu::NvshmemCollectives>());
diff --git a/third_party/xla/xla/backends/gpu/collectives/nvshmem_communicator.cc b/third_party/xla/xla/backends/gpu/collectives/nvshmem_communicator.cc
index fd14b490160e59..585302fa25fb3e 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nvshmem_communicator.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nvshmem_communicator.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include <cstdint>
 #include <string>
 
+#include "absl/base/casts.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cuda_bf16.h"
 #include "third_party/gpus/cuda/include/cuda_fp16.h"
 #include "third_party/nvshmem/nvshmem.h"   // IWYU pragma: keep
@@ -29,11 +31,10 @@ limitations under the License.
 #include "xla/backends/gpu/collectives/nvshmem_collectives.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
 #include "xla/primitive_util.h"
-#include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/gpu/gpu_stream.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -47,10 +48,14 @@ namespace xla::gpu {
 // NVSHMEM Utility Functions
 //==-----------------------------------------------------------------------===//
 
-size_t ToRealCount(PrimitiveType dtype, size_t count) {
+static size_t ToRealCount(PrimitiveType dtype, size_t count) {
   return primitive_util::IsComplexType(dtype) ? count * 2 : count;
 }
 
+static CUstream AsCudaStream(se::Stream* stream) {
+  return absl::bit_cast<CUstream>(stream->platform_specific_handle().stream);
+}
+
 //==-----------------------------------------------------------------------===//
 // NVSHMEM Templated APIs
 //==-----------------------------------------------------------------------===//
@@ -130,7 +135,7 @@ size_t ToRealCount(PrimitiveType dtype, size_t count) {
                          num_elements, stream)                            \
   nvshmemx_##TYPENAME##_##op##_nbi_on_stream(                             \
       (TYPE*)dest_ptr, (const TYPE*)source_ptr, num_elements, pe.value(), \
-      se::gpu::AsGpuStreamValue(stream))
+      AsCudaStream(stream))
 
 //==-----------------------------------------------------------------------===//
 // NVSHMEM Communicator
@@ -169,7 +174,7 @@ absl::Status NvshmemCommunicator::Barrier(
 
   TF_ASSIGN_OR_RETURN(se::Stream * stream, ToStream(executor));
 
-  auto gpu_stream = se::gpu::AsGpuStreamValue(stream);
+  auto gpu_stream = AsCudaStream(stream);
 
   if (nvshmemx_barrier_on_stream(NVSHMEM_TEAM_SHARED, gpu_stream) != 0) {
     return absl::InternalError("Nvshmem team barrier failed.");
@@ -239,69 +244,62 @@ Future<> NvshmemCommunicator::AllReduce(
   switch (dtype) {
     case PrimitiveType::F64: {
       CALL_NVSHMEM_REDUCTION_DATATYPE(double, double, NVSHMEM_TEAM_SHARED,
-                                      se::gpu::AsGpuStreamValue(stream),
-                                      reduction_kind, source_ptr, dest_ptr,
-                                      count);
+                                      AsCudaStream(stream), reduction_kind,
+                                      source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::F16: {
-      CALL_NVSHMEM_REDUCTION_DATATYPE(
-          half, __half, NVSHMEM_TEAM_SHARED, se::gpu::AsGpuStreamValue(stream),
-          reduction_kind, source_ptr, dest_ptr, count);
+      CALL_NVSHMEM_REDUCTION_DATATYPE(half, __half, NVSHMEM_TEAM_SHARED,
+                                      AsCudaStream(stream), reduction_kind,
+                                      source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::F32: {
-      CALL_NVSHMEM_REDUCTION_DATATYPE(
-          float, float, NVSHMEM_TEAM_SHARED, se::gpu::AsGpuStreamValue(stream),
-          reduction_kind, source_ptr, dest_ptr, count);
+      CALL_NVSHMEM_REDUCTION_DATATYPE(float, float, NVSHMEM_TEAM_SHARED,
+                                      AsCudaStream(stream), reduction_kind,
+                                      source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::BF16: {
       CALL_NVSHMEM_REDUCTION_DATATYPE(
-          bfloat16, __nv_bfloat16, NVSHMEM_TEAM_SHARED,
-          se::gpu::AsGpuStreamValue(stream), reduction_kind, source_ptr,
-          dest_ptr, count);
+          bfloat16, __nv_bfloat16, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
+          reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::S32: {
       CALL_NVSHMEM_BITWISE_REDUCTION_DATATYPE(
-          int32, int32_t, NVSHMEM_TEAM_SHARED,
-          se::gpu::AsGpuStreamValue(stream), reduction_kind, source_ptr,
-          dest_ptr, count);
+          int32, int32_t, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
+          reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::S64: {
       CALL_NVSHMEM_BITWISE_REDUCTION_DATATYPE(
-          int64, int64_t, NVSHMEM_TEAM_SHARED,
-          se::gpu::AsGpuStreamValue(stream), reduction_kind, source_ptr,
-          dest_ptr, count);
+          int64, int64_t, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
+          reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::U32: {
       CALL_NVSHMEM_BITWISE_REDUCTION_DATATYPE(
-          uint32, uint32_t, NVSHMEM_TEAM_SHARED,
-          se::gpu::AsGpuStreamValue(stream), reduction_kind, source_ptr,
-          dest_ptr, count);
+          uint32, uint32_t, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
+          reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::U64: {
       CALL_NVSHMEM_BITWISE_REDUCTION_DATATYPE(
-          uint64, uint64_t, NVSHMEM_TEAM_SHARED,
-          se::gpu::AsGpuStreamValue(stream), reduction_kind, source_ptr,
-          dest_ptr, count);
+          uint64, uint64_t, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
+          reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::PRED:
     case PrimitiveType::U8: {
       CALL_NVSHMEM_BITWISE_REDUCTION_DATATYPE(
-          uint8, uint8_t, NVSHMEM_TEAM_SHARED,
-          se::gpu::AsGpuStreamValue(stream), reduction_kind, source_ptr,
-          dest_ptr, count);
+          uint8, uint8_t, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
+          reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
     case PrimitiveType::S8: {
       CALL_NVSHMEM_BITWISE_REDUCTION_DATATYPE(
-          int8, int8_t, NVSHMEM_TEAM_SHARED, se::gpu::AsGpuStreamValue(stream),
+          int8, int8_t, NVSHMEM_TEAM_SHARED, AsCudaStream(stream),
           reduction_kind, source_ptr, dest_ptr, count);
       break;
     }
@@ -492,7 +490,7 @@ absl::Status NvshmemCommunicator::Quiet(const Executor& executor) {
   }
 
   TF_ASSIGN_OR_RETURN(se::Stream * stream, ToStream(executor));
-  nvshmemx_quiet_on_stream(se::gpu::AsGpuStreamValue(stream));
+  nvshmemx_quiet_on_stream(AsCudaStream(stream));
   return absl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/rccl_collectives.cc
new file mode 100644
index 00000000000000..974e36d9b4916a
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_collectives.cc
@@ -0,0 +1,407 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/collectives/rccl_collectives.h"
+
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/base/call_once.h"
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/time/time.h"
+#include "absl/types/span.h"
+#include "rocm/rocm_config.h"
+#include "xla/backends/gpu/collectives/gpu_clique_key.h"
+#include "xla/backends/gpu/collectives/gpu_collectives.h"
+#include "xla/backends/gpu/collectives/rccl_communicator.h"
+#include "xla/backends/gpu/collectives/rccl_errors.h"
+#include "xla/core/collectives/clique_id.h"
+#include "xla/core/collectives/clique_key.h"
+#include "xla/core/collectives/collectives.h"
+#include "xla/core/collectives/collectives_registry.h"
+#include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/rank_id.h"
+#include "xla/debug_options_flags.h"
+#include "xla/pjrt/distributed/key_value_store_interface.h"
+#include "xla/runtime/device_id.h"
+#include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/status_macros.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/env.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/logging.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/platform/threadpool.h"
+#include "xla/util.h"
+#include "tsl/platform/casts.h"
+#include "tsl/platform/numbers.h"
+
+#if (TF_ROCM_VERSION >= 50200)
+#include "rocm/include/rccl/rccl.h"
+#else
+#include "rocm/include/rccl.h"
+#endif  // TF_ROCM_VERSION >= 50200
+
+namespace xla::gpu {
+
+static ncclComm_t Cast(const Communicator* comm) {
+  auto* nccl_communicator = tsl::down_cast<const RcclCommunicator*>(comm);
+  CHECK(nccl_communicator != nullptr) << "Unsupported XLA communicator";
+  return nccl_communicator->comm();
+}
+
+absl::StatusOr<CliqueId> RcclCollectives::CreateUniqueCliqueId() const {
+  VLOG(3) << "Create NCCL unique clique id";
+  ncclUniqueId id;
+  XLA_RCCL_RETURN_IF_ERROR(ncclGetUniqueId(&id));
+  return CliqueId(absl::string_view(id.internal, NCCL_UNIQUE_ID_BYTES));
+}
+
+bool RcclCollectives::IsGlobalConfig() const {
+  static const char* const nccl_comm_id = std::getenv("NCCL_COMM_ID");
+  return nccl_comm_id != nullptr;
+}
+
+absl::StatusOr<const RcclCollectives::CliqueIdCallback*>
+RcclCollectives::GetCliqueIdCallback(const CliqueIdCallback* clique_id_callback,
+                                     bool is_local) {
+  if (clique_id_callback != nullptr) {
+    return clique_id_callback;
+  }
+
+  TF_RET_CHECK(is_local || IsGlobalConfig())
+      << "If non-local devices are taking part of a collective API on "
+         "GPU, the clique_id_callback must be provided by the client.";
+
+  static auto* const local_callback = new CliqueIdCallback(
+      [this](const CliqueKey&) { return CreateUniqueCliqueId(); });
+  return local_callback;
+}
+
+static absl::StatusOr<ncclConfig_t> AsRcclConfig(
+    const GpuCollectives::Config& config,
+    const se::StreamExecutor* stream_executor) {
+  ncclConfig_t comm_config = NCCL_CONFIG_INITIALIZER;
+  comm_config.blocking = config.blocking_communicators ? 1 : 0;
+#if !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION > 50700
+  comm_config.splitShare = config.split_share;
+#endif
+  int nccl_version;
+  XLA_RCCL_RETURN_IF_ERROR(ncclGetVersion(&nccl_version));
+  if (config.max_nchannels > 0) {
+    VLOG(1) << "Maximum number of channels is set to: " << comm_config.maxCTAs;
+    comm_config.maxCTAs = config.max_nchannels;
+  } else if (stream_executor->GetDeviceDescription()
+                 .cuda_compute_capability()
+                 .IsBlackwell() &&
+             nccl_version >= NCCL_VERSION(2, 28, 0)) {
+    // Future NCCL versions will reduce the default max number of channels on
+    // Blackwell to 16. We need to manually set it to 32 here to avoid surprise
+    // perf regressions.
+    VLOG(1) << "Setting max number of channels to 32 on Blackwell.";
+    comm_config.maxCTAs = 32;
+  }
+  return comm_config;
+}
+
+static absl::StatusOr<ncclUniqueId> AsRcclUniqueId(const CliqueId& clique_id) {
+  if (clique_id.size() != NCCL_UNIQUE_ID_BYTES) {
+    return Internal(
+        "CliqueId size is not equal to NCCL_UNIQUE_ID_BYTES: %d vs %d",
+        clique_id.size(), NCCL_UNIQUE_ID_BYTES);
+  }
+  ncclUniqueId id;
+  absl::c_copy(clique_id.data(), id.internal);
+  return id;
+}
+
+absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
+RcclCollectives::CreateCommunicatorsWithCancel(
+    const CliqueKey& clique_key, const std::optional<CliqueIds>& clique_ids,
+    absl::Span<const DeviceRank> ranks, const Collectives::Config& config,
+    std::atomic_bool* cancel) {
+  // Validate clique ids. With the NCCL backend, we rely on the host to exchange
+  // unique clique ids.
+  if (!clique_ids.has_value() || clique_ids->data().empty()) {
+    return InvalidArgument("CliqueId is required to create NCCL communicators");
+  }
+  if (clique_ids->data().size() != 1) {
+    return InvalidArgument(
+        "CliqueIds size must be 1 for NCCL communicator initialization");
+  }
+  VLOG(1) << "Initialize NCCL communicator for " << ranks.size() << " devices"
+          << "; fingerprint(id)=" << clique_ids->fingerprint();
+
+  const auto& gpu_config =
+      tsl::down_cast<const GpuCollectives::Config&>(config);
+  if (!gpu_config.blocking_communicators && !gpu_config.async_execution) {
+    return FailedPrecondition(
+        "GpuCollectives::Config blocking_communicators is false, but "
+        "async_execution is false. Non-blocking communicators require "
+        "asynchronous execution.");
+  }
+
+  // make_comm returns a new ncclComm_t.
+  auto make_comm = [&](int i) -> absl::StatusOr<ncclComm_t> {
+    VLOG(1) << "Initialize NCCL communicator for rank #" << ranks[i].rank
+            << " of " << clique_key.num_devices()
+            << "; fingerprint(id)=" << clique_ids->fingerprint()
+            << "; size(id)=" << clique_ids->data().size();
+    auto* device = tsl::down_cast<GpuCollectives::Device*>(ranks[i].device);
+    TF_RET_CHECK(device != nullptr);
+    auto activate_context = device->stream_executor()->Activate();
+
+    TF_ASSIGN_OR_RETURN(ncclConfig_t comm_config,
+                        AsRcclConfig(gpu_config, device->stream_executor()));
+
+    TF_ASSIGN_OR_RETURN(auto nccl_unique_id, AsRcclUniqueId(clique_ids->at(0)));
+    ncclComm_t comm;
+    XLA_RCCL_RETURN_IF_ERROR(
+        ncclCommInitRankConfig(&comm, clique_key.num_devices(), nccl_unique_id,
+                               ranks[i].rank.value(), &comm_config));
+    return comm;
+  };
+
+  // Create all communicators. Each communicator is created on its own thread.
+  std::vector<std::unique_ptr<Communicator>> comms(ranks.size());
+  absl::Status status;
+  absl::once_flag once;
+  {
+    tsl::thread::ThreadPool pool(tsl::Env::Default(), "CreateCommunicators",
+                                 ranks.size());
+    for (size_t i = 0; i < ranks.size(); ++i) {
+      pool.Schedule([&, i]() {
+        absl::StatusOr<std::unique_ptr<RcclCommunicator>> comm =
+            RcclCommunicator::Create(std::bind(make_comm, i),
+                                     gpu_config.async_execution, cancel);
+        if (!comm.ok()) {
+          absl::call_once(once, [&] { status = comm.status(); });
+          return;
+        }
+        comms[i] = *std::move(comm);
+      });
+    }
+  }  // pool's destructor blocks until all scheduled work is done.
+  TF_RETURN_IF_ERROR(status);
+  return comms;
+}
+
+absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
+RcclCollectives::SplitCommunicatorsWithCancel(
+    absl::Span<const Communicator* const> comms, int32_t color,
+    absl::Span<const RankId> keys, const Collectives::Config& config,
+    absl::Span<const DeviceRank> ranks, std::atomic_bool* cancel) {
+  auto rank_formatter = [](std::string* str, RankId rank) {
+    absl::StrAppend(str, rank.value());
+  };
+
+  VLOG(1) << absl::StreamFormat(
+      "Split %d NCCL communicators using color %d and keys: [%s]", comms.size(),
+      color, absl::StrJoin(keys, ",", rank_formatter));
+
+  if (keys.size() != comms.size()) {
+    return absl::InvalidArgumentError(
+        absl::StrFormat("Comms and keys must have the same size, but %d != %d",
+                        comms.size(), keys.size()));
+  }
+
+  const auto& gpu_config =
+      tsl::down_cast<const GpuCollectives::Config&>(config);
+
+#if !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION >= 60000
+  auto make_comm = [&](int i) -> absl::StatusOr<ncclComm_t> {
+    auto* device = tsl::down_cast<GpuCollectives::Device*>(ranks[i].device);
+    TF_RET_CHECK(device != nullptr);
+
+    TF_ASSIGN_OR_RETURN(ncclConfig_t comm_config,
+                        AsRcclConfig(gpu_config, device->stream_executor()));
+
+    VLOG(1) << "Split NCCL communicator " << comms[i] << " with color " << color
+            << " and key " << keys[i];
+    ncclComm_t split_comm;
+    XLA_RCCL_RETURN_IF_ERROR(ncclCommSplit(
+        Cast(comms[i]), color, keys[i].value(), &split_comm, &comm_config));
+    return split_comm;
+  };
+
+  std::vector<std::unique_ptr<Communicator>> split_comms(comms.size());
+  absl::Status status;
+  absl::once_flag once;
+  {
+    tsl::thread::ThreadPool pool(tsl::Env::Default(), "SplitCommunicators",
+                                 comms.size());
+    for (size_t i = 0; i < comms.size(); ++i) {
+      pool.Schedule([&, i]() {
+        absl::StatusOr<std::unique_ptr<RcclCommunicator>> comm =
+            RcclCommunicator::Create(std::bind(make_comm, i),
+                                     gpu_config.async_execution, cancel);
+        if (!comm.ok()) {
+          absl::call_once(once, [&] { status = comm.status(); });
+          return;
+        }
+        split_comms[i] = *std::move(comm);
+      });
+    }
+  }  // pool's destructor blocks until all scheduled work is done.
+  TF_RETURN_IF_ERROR(status);
+  return split_comms;
+#else
+  return absl::UnimplementedError(
+      absl::StrFormat("%s:%d: NCCL operation ncclCommSplit not implemented",
+                      __FILE__, __LINE__));
+#endif  // !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION >= 60000
+}
+
+static absl::StatusOr<xla::gpu::GpuCollectives*> GetNvshmemCollectives() {
+  TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
+                      xla::CollectivesRegistry::Get("gpu", "nvshmem"));
+  xla::gpu::GpuCollectives* nvshmem_collectives =
+      tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
+  if (nvshmem_collectives == nullptr) {
+    return absl::InternalError("Failed to get NVSHMEM collectives");
+  }
+
+  return nvshmem_collectives;
+}
+
+absl::StatusOr<void*> RcclCollectives::Allocate(uint64_t bytes) {
+  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
+    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
+    return nvshmem_collectives->Allocate(bytes);
+  }
+
+  void* ptr = nullptr;
+  ncclResult_t res = ncclMemAlloc(&ptr, bytes);
+  if (res != ncclSuccess) {
+    return absl::InternalError(absl::StrFormat(
+        "failed to allocate %s (%llu bytes) from device collective memory: %s, "
+        "Last NCCL warning(error) log entry (may be unrelated): %s",
+        tsl::strings::HumanReadableNumBytes(bytes), bytes,
+        ncclGetErrorString(res), ncclGetLastError(nullptr)));
+  }
+  VLOG(2) << "Allocated collective memory " << ptr << " of " << bytes
+          << " bytes";
+  return ptr;
+}
+
+absl::Status RcclCollectives::Deallocate(void* location) {
+  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
+    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
+    return nvshmem_collectives->Deallocate(location);
+  }
+
+  ncclResult_t res = ncclMemFree(location);
+  if (res != ncclSuccess) {
+    return absl::InternalError(absl::StrFormat(
+        "failed to free device collective memory at %p; result: %s, Last NCCL "
+        "warning(error) log entry (may be unrelated): %s",
+        location, ncclGetErrorString(res), ncclGetLastError(nullptr)));
+  }
+
+  VLOG(2) << "Deallocated collective memory " << location;
+  return absl::OkStatus();
+}
+
+class RcclIdStore {
+ public:
+  RcclIdStore(int node_id,
+              absl::flat_hash_map<GlobalDeviceId, int> device_to_node,
+              std::shared_ptr<KeyValueStoreInterface> kv_store)
+      : node_id_(node_id),
+        device_to_node_(std::move(device_to_node)),
+        kv_store_(std::move(kv_store)) {}
+
+  absl::StatusOr<CliqueId> GetRcclUniqueId(const CliqueKey& key,
+                                           RcclCollectives& rccl_collectives) {
+    auto* gpu_key = tsl::down_cast<const gpu::GpuCliqueKey*>(&key);
+    if (gpu_key == nullptr) {
+      return InvalidArgument("Expected GPU clique key");
+    }
+
+    // The caller must ensure that threads calling this method concurrently have
+    // unique keys, otherwise the global key-value store may hold the wrong
+    // value.
+    {
+      absl::MutexLock lock(mu_);
+      auto it = cache_.find(*gpu_key);
+      if (it != cache_.end()) {
+        return it->second;
+      }
+    }
+    CliqueId clique_id;
+    int primary_node_id = device_to_node_.at(gpu_key->root_device());
+    if (node_id_ == primary_node_id) {
+      TF_ASSIGN_OR_RETURN(clique_id, rccl_collectives.CreateUniqueCliqueId());
+      TF_RETURN_IF_ERROR(
+          kv_store_->Set(gpu_key->ToString(), clique_id.ToString()));
+    } else {
+      TF_ASSIGN_OR_RETURN(
+          std::string id_str,
+          kv_store_->Get(gpu_key->ToString(), absl::Minutes(10)));
+      clique_id = CliqueId(id_str);
+    }
+    absl::MutexLock lock(mu_);
+    auto result = cache_.emplace(*gpu_key, std::move(clique_id));
+    TF_RET_CHECK(result.second) << "Unique ID already in cache.";
+    return result.first->second;
+  }
+
+ private:
+  const int node_id_;
+  const absl::flat_hash_map<GlobalDeviceId, int> device_to_node_;
+  const std::shared_ptr<KeyValueStoreInterface> kv_store_;
+
+  absl::Mutex mu_;
+  absl::flat_hash_map<gpu::GpuCliqueKey, CliqueId> cache_ ABSL_GUARDED_BY(mu_);
+};
+
+absl::Status RcclCollectives::InitializeTopology(
+    RcclCollectives::Topology topology) {
+  if (xla::GetDebugOptionsFromFlags().xla_gpu_experimental_enable_nvshmem()) {
+    TF_ASSIGN_OR_RETURN(auto* nvshmem_collectives, GetNvshmemCollectives());
+    TF_RETURN_IF_ERROR(nvshmem_collectives->InitializeTopology(topology));
+  }
+
+  if (topology.num_nodes > 1) {
+    auto nccl_id_store = std::make_shared<RcclIdStore>(
+        topology.node_id, topology.device_id_to_node_id,
+        std::move(topology.kv_store));
+    topology.gpu_executable_run_options->set_clique_id_callback(
+        [nccl_id_store, this](const CliqueKey& key) {
+          return nccl_id_store->GetRcclUniqueId(key, *this);
+        });
+  }
+  return absl::OkStatus();
+}
+}  // namespace xla::gpu
+
+XLA_COLLECTIVES_REGISTER("ROCM", "nccl", 1,
+                         std::make_unique<xla::gpu::RcclCollectives>());
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_collectives.h b/third_party/xla/xla/backends/gpu/collectives/rccl_collectives.h
new file mode 100644
index 00000000000000..0b7f274686a061
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_collectives.h
@@ -0,0 +1,93 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_GPU_COLLECTIVES_RCCL_COLLECTIVES_H_
+#define XLA_BACKENDS_GPU_COLLECTIVES_RCCL_COLLECTIVES_H_
+
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <vector>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/types/span.h"
+#include "xla/backends/gpu/collectives/gpu_collectives.h"
+#include "xla/core/collectives/clique_id.h"
+#include "xla/core/collectives/clique_key.h"
+#include "xla/core/collectives/collectives.h"
+#include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/rank_id.h"
+
+namespace xla::gpu {
+
+// XLA host-initiated collectives implemented on top of NCCL.
+class RcclCollectives : public GpuCollectives {
+ public:
+  bool IsImplemented() const final { return true; }
+
+  bool IsGlobalConfig() const final;
+
+  absl::StatusOr<const CliqueIdCallback*> GetCliqueIdCallback(
+      const CliqueIdCallback* clique_id_callback, bool is_local) final;
+
+  absl::StatusOr<CliqueId> CreateUniqueCliqueId() const final;
+
+  absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
+  CreateCommunicators(const CliqueKey& clique_key,
+                      const std::optional<CliqueIds>& clique_ids,
+                      absl::Span<const DeviceRank> ranks,
+                      const Collectives::Config& config) final {
+    return CreateCommunicatorsWithCancel(clique_key, clique_ids, ranks, config,
+                                         nullptr);
+  }
+
+  absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
+  CreateCommunicatorsWithCancel(const CliqueKey& clique_key,
+                                const std::optional<CliqueIds>& clique_ids,
+                                absl::Span<const DeviceRank> ranks,
+                                const Collectives::Config& config,
+                                std::atomic_bool* cancel) final;
+
+  absl::StatusOr<std::vector<std::unique_ptr<Communicator>>> SplitCommunicators(
+      absl::Span<const Communicator* const> comms, int32_t color,
+      absl::Span<const RankId> keys, const Collectives::Config& config,
+      absl::Span<const DeviceRank> ranks) final {
+    return SplitCommunicatorsWithCancel(comms, color, keys, config, ranks,
+                                        nullptr);
+  }
+
+  absl::StatusOr<std::vector<std::unique_ptr<Communicator>>>
+  SplitCommunicatorsWithCancel(absl::Span<const Communicator* const> comms,
+                               int32_t color, absl::Span<const RankId> keys,
+                               const Collectives::Config& config,
+                               absl::Span<const DeviceRank> ranks,
+                               std::atomic_bool* cancel) final;
+
+  absl::StatusOr<std::unique_ptr<Communicator>> CreateCommunicator() final {
+    return absl::UnimplementedError("Not implemented.");
+  }
+
+  absl::StatusOr<void*> Allocate(uint64_t bytes) final;
+
+  absl::Status Deallocate(void* location) final;
+
+  absl::Status InitializeTopology(Topology topology) final;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_BACKENDS_GPU_COLLECTIVES_RCCL_COLLECTIVES_H_
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc
new file mode 100644
index 00000000000000..1135391d0c4370
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc
@@ -0,0 +1,831 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/collectives/rccl_communicator.h"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/base/casts.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/functional/any_invocable.h"
+#include "absl/memory/memory.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
+#include "rocm/include/hip/hip_runtime.h"
+#include "rocm/rocm_config.h"  // IWYU pragma: keep
+#include "xla/backends/gpu/collectives/gpu_collectives.h"
+#include "xla/backends/gpu/collectives/gpu_communicator.h"
+#include "xla/backends/gpu/collectives/rccl_errors.h"
+#include "xla/backends/gpu/collectives/single_threaded_executor.h"
+#include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/rank_id.h"
+#include "xla/core/collectives/reduction_kind.h"
+#include "xla/future.h"
+#include "xla/primitive_util.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/concurrency/executor.h"
+#include "xla/tsl/platform/env.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/logging.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/util.h"
+#include "tsl/platform/casts.h"
+
+#if (TF_ROCM_VERSION >= 50200)
+#include "rocm/include/rccl/rccl.h"
+#else
+#include "rocm/include/rccl.h"
+#endif  // TF_ROCM_VERSION >= 50200
+
+namespace xla::gpu {
+namespace {
+
+hipStream_t AsHipStream(se::Stream* stream) {
+  return absl::bit_cast<hipStream_t>(stream->platform_specific_handle().stream);
+}
+
+se::Stream* ToStream(const Communicator::Executor& executor) {
+  return tsl::down_cast<const GpuCollectives::Executor&>(executor).stream();
+}
+
+//==-----------------------------------------------------------------------===//
+// Conversions between XLA and RCCL data types
+//==-----------------------------------------------------------------------===//
+
+static size_t ToNcclCount(PrimitiveType dtype, size_t count) {
+  return primitive_util::IsComplexType(dtype) ? count * 2 : count;
+}
+
+static absl::StatusOr<ncclDataType_t> ToNcclDataType(PrimitiveType dtype,
+                                                     bool is_reduction_op) {
+  switch (dtype) {
+    case S8:
+    case F8E5M2:
+    case F8E4M3FN:
+    case F8E5M2FNUZ:
+    case F8E4M3FNUZ:
+    case F8E8M0FNU:
+      return ncclInt8;
+    case PRED:
+    case U8:
+      return ncclUint8;
+    case S32:
+      return ncclInt32;
+    case U32:
+      return ncclUint32;
+    case S64:
+      return ncclInt64;
+    case U64:
+      return ncclUint64;
+    case F16:
+      return ncclFloat16;
+    case F32:
+    case C64:
+      return ncclFloat32;
+    case F64:
+    case C128:
+      return ncclFloat64;
+    case S16:
+    case U16:
+      // For reductions we expect 16 bit integer types to be promoted to 32-bit.
+      if (is_reduction_op) {
+        return absl::InvalidArgumentError(
+            absl::StrFormat("Unsupported data type for reduction operation: %s",
+                            primitive_util::LowercasePrimitiveTypeName(dtype)));
+      }
+      // For collectives that just move data around, we can use ncclFloat16 for
+      // 16-bit integer data types.
+      return ncclFloat16;
+    case BF16:
+      return ncclBfloat16;
+    default:
+      return absl::InvalidArgumentError(
+          absl::StrFormat("Unsupported data type: %s",
+                          primitive_util::LowercasePrimitiveTypeName(dtype)));
+  }
+}
+
+static ncclRedOp_t ToNcclReduction(ReductionKind kind) {
+  switch (kind) {
+    case ReductionKind::SUM:
+      return ncclSum;
+    case ReductionKind::PRODUCT:
+      return ncclProd;
+    case ReductionKind::MIN:
+      return ncclMin;
+    case ReductionKind::MAX:
+      return ncclMax;
+  }
+}
+
+}  // namespace
+
+//==-----------------------------------------------------------------------===//
+// RCCL Registered Buffer Handle
+//==-----------------------------------------------------------------------===//
+
+// An RAII handle for user buffers registered with an RCCL communicator.
+class RcclCommunicator::RcclRegisteredBufferHandle
+    : public Communicator::RegisteredBufferHandle {
+ public:
+  RcclRegisteredBufferHandle(RcclCommunicator& comm, void* handle,
+                             tsl::Executor* executor, bool symmetric_handle,
+                             int device_ordinal)
+      : comm_(comm),
+        handle_(handle),
+        symmetric_handle_(symmetric_handle),
+        device_ordinal_(device_ordinal) {}
+
+  ~RcclRegisteredBufferHandle() override {
+    if (auto status = Unregister(); !status.ok()) {
+      LOG(ERROR) << status.message();
+    }
+  }
+
+  absl::Status Unregister() final {
+    VLOG(3) << absl::StreamFormat(
+        "[%d] Deregister buffer for RCCL communicator; handle=%p; comm=%p",
+        device_ordinal_, handle_, comm_.comm_);
+    if (!symmetric_handle_) {
+#if (NCCL_VERSION_CODE >= 21901)
+      auto f = [this]() -> absl::Status {
+        if (comm_.canceling_.load()) {
+          return FailedPrecondition("[%d] RcclCommunicator aborted",
+                                    device_ordinal_);
+        }
+        XLA_RCCL_RETURN_IF_ERROR(ncclCommDeregister(comm_.comm_, handle_));
+        return comm_.PollUntilDone();
+      };
+      return executor_ ? Future<>::MakeOn(*executor_, f).Await() : f();
+#else
+      return Unimplemented(
+          "[%d] RCCL version does not support ncclCommDeregister",
+          device_ordinal_);
+#endif  // NCCL_VERSION_CODE >= 21901
+    } else {
+      VLOG(3) << absl::StreamFormat(
+          "[%d] Deregister symmetric buffer for RCCL communicator; handle=%p; "
+          "comm=%p",
+          device_ordinal_, handle_, comm_.comm());
+#if (NCCL_VERSION_CODE >= 22700)
+      auto f = [this]() -> absl::Status {
+        if (comm_.canceling_.load()) {
+          return FailedPrecondition("[%d] RcclCommunicator aborted",
+                                    device_ordinal_);
+        }
+        XLA_RCCL_RETURN_IF_ERROR(
+            ncclCommWindowDeregister(comm_.comm_, *(ncclWindow_t*)(handle_)));
+        return comm_.PollUntilDone();
+      };
+      return executor_ ? Future<>::MakeOn(*executor_, f).Await() : f();
+#else
+      return Unimplemented(
+          "[%d] RCCL version does not support ncclCommWindowDeregister",
+          device_ordinal_);
+#endif  // NCCL_VERSION_CODE >= 22700
+    }
+  }
+
+ private:
+  RcclCommunicator& comm_;
+  void* handle_;
+  bool symmetric_handle_;
+  tsl::Executor* executor_;
+  int device_ordinal_;
+};
+
+//==-----------------------------------------------------------------------===//
+// RCCL Communicator
+//==-----------------------------------------------------------------------===//
+
+absl::StatusOr<std::unique_ptr<RcclCommunicator>> RcclCommunicator::Create(
+    absl::AnyInvocable<absl::StatusOr<ncclComm_t>()> make_comm, bool is_async,
+    std::atomic_bool* cancel, tsl::Env& env) {
+  auto f = [cancel, &make_comm]() -> absl::StatusOr<ncclComm_t> {
+    TF_ASSIGN_OR_RETURN(ncclComm_t comm, make_comm());
+    if (cancel) {
+      TF_RETURN_IF_ERROR(::xla::gpu::PollUntilDone(comm, *cancel));
+    } else {
+      std::atomic_bool never_cancelled;
+      TF_RETURN_IF_ERROR(::xla::gpu::PollUntilDone(comm, never_cancelled));
+    }
+    return comm;
+  };
+
+  if (!is_async) {
+    // If this RcclCommunicator is synchronous, construct ncclComm_t in the
+    // calling thread.
+    TF_ASSIGN_OR_RETURN(ncclComm_t comm, f());
+    return absl::WrapUnique(new RcclCommunicator(comm, nullptr));
+  }
+
+  // If this RcclCommunicator is asynchronous, then all operations on the
+  // underlying ncclComm_t, including its creation, must take place on the
+  // single threaded executor.
+  auto executor = std::make_unique<SingleThreadedExecutor>(env);
+  TF_ASSIGN_OR_RETURN(ncclComm_t comm,
+                      Future<ncclComm_t>::MakeOn(*executor, f).Await());
+  return absl::WrapUnique(new RcclCommunicator(comm, std::move(executor)));
+}
+
+RcclCommunicator::~RcclCommunicator() {
+  auto f = [this]() -> absl::Status {
+    if (comm_ == nullptr) {
+      VLOG(1) << "Skipping destruction; null comm_ " << *this;
+      return absl::OkStatus();
+    }
+
+    if (aborted_) {
+      VLOG(1) << "Skipping destruction; already aborted " << *this;
+      return absl::OkStatus();
+    }
+
+    // Note that we intentionally don't call PollUntilDone. Once comm_ has been
+    // destroyed, we can no longer safely touch it.
+    VLOG(1) << "Destroy " << *this;
+    return XLA_RCCL_STATUS(ncclCommDestroy(comm_));
+  };
+
+  if (absl::Status s = Execute(f).Await(); !s.ok()) {
+    LOG(ERROR) << "RcclCommunicator::~RcclCommunicator: " << s;
+  }
+}
+
+absl::Status RcclCommunicator::Abort() {
+  // By setting canceling_ to true, all pending collectives scheduled on
+  // executor_ will cancel. This will allow the aborting lambda below to run.
+  canceling_.store(true);
+
+  return ExecuteAwait([this]() -> absl::Status {
+    VLOG(1) << "Abort RCCL communicator: " << *this;
+    if (aborted_) {
+      return FailedPrecondition("RcclCommunicator already aborted");
+    }
+    aborted_ = true;
+    // Note that we intentionally don't call PollUntilDone. Once comm_
+    // has been aborted, we can no longer safely touch it.
+    return XLA_RCCL_STATUS(ncclCommAbort(comm_));
+  });
+}
+
+absl::Status RcclCommunicator::HealthCheck() const {
+  return ExecuteAwait([this]() -> absl::Status {
+    VLOG(5) << "Get last async error for RCCL communicator: " << *this;
+    if (canceling_.load()) {
+      return absl::FailedPreconditionError("RcclCommunicator aborted");
+    }
+
+    ncclResult_t async_err;
+    XLA_RCCL_RETURN_IF_ERROR(ncclCommGetAsyncError(comm_, &async_err));
+    if (async_err == ncclSuccess) {
+      return absl::OkStatus();
+    }
+
+    return Internal("%s. Last RCCL error (maybe unrelated): %s",
+                    ncclGetLastError(comm_), ncclGetErrorString(async_err));
+  });
+}
+
+absl::StatusOr<size_t> RcclCommunicator::NumRanks() const {
+  return ExecuteAwait<size_t>([this]() -> absl::StatusOr<size_t> {
+    VLOG(5) << "Get the number of ranks in RCCL communicator: " << *this;
+    if (canceling_.load()) {
+      return absl::FailedPreconditionError("RcclCommunicator aborted");
+    }
+
+    // We intentionally don't call PollUntilDone. ncclCommCount is
+    // blocking.
+    int32_t count = 0;
+    XLA_RCCL_RETURN_IF_ERROR(ncclCommCount(comm_, &count));
+    return count;
+  });
+}
+
+absl::Status RcclCommunicator::RegisterBufferOnce(
+    se::DeviceAddressBase buffer_range, int device_ordinal,
+    bool use_symmetric_buffer) {
+  bool need_reg = false;
+  {
+    absl::MutexLock lock(registered_buffers_.mu);
+    if (!registered_buffers_.range_to_handle.contains(buffer_range.opaque())) {
+      need_reg = true;
+    } else {
+      XLA_VLOG_DEVICE(5, device_ordinal)
+          << "Buffer range: " << buffer_range.opaque()
+          << " with size: " << buffer_range.size() << " is already registered.";
+    }
+  }
+  if (need_reg) {
+    XLA_VLOG_DEVICE(5, device_ordinal)
+        << "Registering " << buffer_range.opaque()
+        << " with size: " << buffer_range.size()
+        << ", is symmetric: " << (use_symmetric_buffer ? "true" : "false");
+    // Symmetric buffer registration is a collective operation,
+    // we need to do that before locking on a global.
+    TF_ASSIGN_OR_RETURN(
+        auto handle,
+        RegisterBuffer(buffer_range, device_ordinal, use_symmetric_buffer));
+    absl::MutexLock lock(registered_buffers_.mu);
+    registered_buffers_.range_to_handle[buffer_range.opaque()] =
+        std::move(handle);
+  }
+  return absl::OkStatus();
+}
+
+absl::StatusOr<std::unique_ptr<Communicator::RegisteredBufferHandle>>
+RcclCommunicator::RegisterBuffer(stream_executor::DeviceAddressBase buffer,
+                                 int device_ordinal,
+                                 bool use_symmetric_buffer) {
+#if (NCCL_VERSION_CODE >= 21901)
+  using Handle = std::unique_ptr<Communicator::RegisteredBufferHandle>;
+
+  if (!use_symmetric_buffer) {
+    return ExecuteAwait<Handle>(
+        [&buffer, device_ordinal, this]() -> absl::StatusOr<Handle> {
+          VLOG(3) << absl::StreamFormat(
+              "[%d] Register buffer for RCCL communicator; buffer=%p; "
+              "size=%d; "
+              "comm=%p",
+              device_ordinal, buffer.opaque(), buffer.size(), comm_);
+          if (canceling_.load()) {
+            return absl::FailedPreconditionError("RcclCommunicator aborted");
+          }
+          void* handle = nullptr;
+          XLA_RCCL_RETURN_IF_ERROR(
+              ncclCommRegister(comm_, buffer.opaque(), buffer.size(), &handle));
+          if (group_nesting_level_ == 0) {
+            TF_RETURN_IF_ERROR(PollUntilDone());
+          }
+          return std::make_unique<RcclRegisteredBufferHandle>(
+              *this, handle, executor_.get(), /*symmetric_buffer= */ false,
+              device_ordinal);
+        });
+#else
+  return Unimplemented("[%d] RCCL version does not support ncclCommRegister",
+                       device_ordinal);
+#endif  // RCCL_VERSION_CODE >= 21901
+  } else {
+#if (NCCL_VERSION_CODE >= 22700)
+    return ExecuteAwait<Handle>(
+        [&buffer, device_ordinal, this]() -> absl::StatusOr<Handle> {
+          VLOG(3) << absl::StreamFormat(
+              "[%d] Register symmetric buffer for RCCL communicator; "
+              "buffer=%p; size=%d; comm=%p",
+              device_ordinal, buffer.opaque(), buffer.size(), comm_);
+          void* handle = nullptr;
+          XLA_RCCL_RETURN_IF_ERROR(ncclGroupStart());
+          XLA_RCCL_RETURN_IF_ERROR(ncclCommWindowRegister(
+              comm_, buffer.opaque(), buffer.size(), (ncclWindow_t*)&handle,
+              RCCL_WIN_COLL_SYMMETRIC));
+          XLA_RCCL_RETURN_IF_ERROR(ncclGroupEnd());
+          if (group_nesting_level_ == 0) {
+            TF_RETURN_IF_ERROR(PollUntilDone());
+          }
+          return std::make_unique<RcclRegisteredBufferHandle>(
+              *this, handle, executor_.get(),
+              /*symmetric_buffer= */ true, device_ordinal);
+        });
+#else
+  return Unimplemented(
+      "[%d] RCCL version does not support ncclCommWindowRegister",
+      device_ordinal);
+#endif  // RCCL_VERSION_CODE >= 22700
+  }
+}
+
+Future<> RcclCommunicator::GroupExecute(
+    absl::AnyInvocable<absl::Status(GpuCommunicator*)> f) {
+  return Execute([f = std::move(f), this]() mutable -> absl::Status {
+    TF_RETURN_IF_ERROR(GroupStart());
+    TF_RETURN_IF_ERROR(f(this));
+    TF_RETURN_IF_ERROR(GroupEnd());
+    return absl::OkStatus();
+  });
+}
+
+Future<> RcclCommunicator::AllReduce(se::DeviceAddressBase send_buffer,
+                                     se::DeviceAddressBase recv_buffer,
+                                     PrimitiveType dtype, size_t count,
+                                     ReductionKind reduction_kind,
+                                     const Communicator::Executor& executor) {
+  return Execute([send_buffer, recv_buffer, dtype, count, reduction_kind,
+                  &executor, this]() -> absl::Status {
+    return LaunchAllReduce(send_buffer, recv_buffer, dtype, count,
+                           reduction_kind, executor);
+  });
+}
+
+Future<> RcclCommunicator::Broadcast(se::DeviceAddressBase send_buffer,
+                                     se::DeviceAddressBase recv_buffer,
+                                     PrimitiveType dtype, size_t count,
+                                     RankId root, const Executor& executor) {
+  return Execute(
+      [send_buffer, recv_buffer, dtype, count, root, &executor, this]() {
+        return LaunchBroadcast(send_buffer, recv_buffer, dtype, count, root,
+                               executor);
+      });
+}
+
+Future<> RcclCommunicator::ReduceScatter(se::DeviceAddressBase send_buffer,
+                                         se::DeviceAddressBase recv_buffer,
+                                         PrimitiveType dtype, size_t count,
+                                         ReductionKind reduction_kind,
+                                         const Executor& executor) {
+  return Execute([send_buffer, recv_buffer, dtype, count, reduction_kind,
+                  &executor, this]() {
+    return LaunchReduceScatter(send_buffer, recv_buffer, dtype, count,
+                               reduction_kind, executor);
+  });
+}
+
+Future<> RcclCommunicator::AllGather(se::DeviceAddressBase send_buffer,
+                                     se::DeviceAddressBase recv_buffer,
+                                     PrimitiveType dtype, size_t count,
+                                     const Executor& executor) {
+  return Execute([send_buffer, recv_buffer, dtype, count, &executor, this]() {
+    return LaunchAllGather(send_buffer, recv_buffer, dtype, count, executor);
+  });
+}
+
+Future<> RcclCommunicator::AllToAll(
+    absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
+    absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
+    PrimitiveType dtype, size_t count, const Executor& executor) {
+  return Execute([send_buffers, recv_buffers, dtype, count, &executor, this]() {
+    return LaunchAllToAll(send_buffers, recv_buffers, dtype, count, executor);
+  });
+}
+
+Future<> RcclCommunicator::CollectivePermute(
+    se::DeviceAddressBase send_buffer, se::DeviceAddressBase recv_buffer,
+    PrimitiveType dtype, size_t count, std::optional<RankId> source_rank,
+    absl::Span<const RankId> target_ranks, const Executor& executor) {
+  std::vector<RankId> owned_target_ranks(target_ranks.begin(),
+                                         target_ranks.end());
+  return Execute([send_buffer, recv_buffer, dtype, count, source_rank,
+                  owned_target_ranks = std::move(owned_target_ranks), &executor,
+                  this]() {
+    return LaunchCollectivePermute(send_buffer, recv_buffer, dtype, count,
+                                   source_rank, owned_target_ranks, executor);
+  });
+}
+
+Future<> RcclCommunicator::Send(se::DeviceAddressBase send_buffer,
+                                PrimitiveType dtype, size_t count, RankId peer,
+                                const Executor& executor) {
+  return Execute([send_buffer, dtype, count, peer, &executor, this]() {
+    return LaunchSend(send_buffer, dtype, count, peer, executor);
+  });
+}
+
+Future<> RcclCommunicator::Recv(se::DeviceAddressBase recv_buffer,
+                                PrimitiveType dtype, size_t count, RankId peer,
+                                const Executor& executor) {
+  return Execute([recv_buffer, dtype, count, peer, &executor, this]() {
+    return LaunchRecv(recv_buffer, dtype, count, peer, executor);
+  });
+}
+
+absl::Status RcclCommunicator::GroupStart() {
+  VLOG(5) << "Start RCCL group";
+  XLA_RCCL_RETURN_IF_ERROR(ncclGroupStart());
+  group_nesting_level_++;
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::GroupEnd() {
+  VLOG(5) << "End RCCL group";
+  XLA_RCCL_RETURN_IF_ERROR(ncclGroupEnd());
+  group_nesting_level_--;
+  if (group_nesting_level_ > 0) {
+    // Though NCCL allows groups to be nested, no operations are actually
+    // performed until the outermost group ends. The inner calls to
+    // GroupStart() and GroupEnd() are effectively noops.
+    //
+    // https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/groups.html
+    return absl::OkStatus();
+  }
+  // Wait for the communicator to finish.
+  return PollUntilDone();
+}
+
+absl::Status RcclCommunicator::LaunchAllReduce(
+    se::DeviceAddressBase send_buffer, se::DeviceAddressBase recv_buffer,
+    PrimitiveType dtype, size_t count, ReductionKind reduction_kind,
+    const Communicator::Executor& executor) {
+  if (canceling_.load()) {
+    return FailedPrecondition("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL AllReduce operation; send_buffer=%p; "
+      "recv_buffer=%p; dtype=%s; count=%d; reduction_kind=%v; comm=%p; "
+      "stream=%p",
+      stream->parent()->device_ordinal(), send_buffer.opaque(),
+      recv_buffer.opaque(), primitive_util::LowercasePrimitiveTypeName(dtype),
+      count, reduction_kind, comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(ncclAllReduce(
+      send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
+      nccl_dtype, ToNcclReduction(reduction_kind), comm_,
+      AsHipStream(stream))));
+  if (group_nesting_level_ == 0) {
+    TF_RETURN_IF_ERROR(PollUntilDone());
+  }
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchBroadcast(
+    se::DeviceAddressBase send_buffer, se::DeviceAddressBase recv_buffer,
+    PrimitiveType dtype, size_t count, RankId root, const Executor& executor) {
+  if (canceling_.load()) {
+    return absl::FailedPreconditionError("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL Broadcast operation; send_buffer=%p; "
+      "recv_buffer=%p; dtype=%s; count=%d; root=%d; comm=%p; "
+      "stream=%p",
+      stream->parent()->device_ordinal(), send_buffer.opaque(),
+      recv_buffer.opaque(), primitive_util::LowercasePrimitiveTypeName(dtype),
+      count, root.value(), comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(ncclBroadcast(
+      send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
+      nccl_dtype, root.value(), comm_, AsHipStream(stream))));
+  if (group_nesting_level_ == 0) {
+    TF_RETURN_IF_ERROR(PollUntilDone());
+  }
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchReduceScatter(
+    se::DeviceAddressBase send_buffer, se::DeviceAddressBase recv_buffer,
+    PrimitiveType dtype, size_t count, ReductionKind reduction_kind,
+    const Executor& executor) {
+  if (canceling_.load()) {
+    return absl::FailedPreconditionError("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL ReduceScatter operation; send_buffer=%p; "
+      "recv_buffer=%p; dtype=%s; count=%d; reduction_kind=%v; comm=%p; "
+      "stream=%p",
+      stream->parent()->device_ordinal(), send_buffer.opaque(),
+      recv_buffer.opaque(), primitive_util::LowercasePrimitiveTypeName(dtype),
+      count, reduction_kind, comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(ncclReduceScatter(
+      send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
+      nccl_dtype, ToNcclReduction(reduction_kind), comm_,
+      AsHipStream(stream))));
+  if (group_nesting_level_ == 0) {
+    TF_RETURN_IF_ERROR(PollUntilDone());
+  }
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchAllGather(
+    se::DeviceAddressBase send_buffer, se::DeviceAddressBase recv_buffer,
+    PrimitiveType dtype, size_t count, const Executor& executor) {
+  if (canceling_.load()) {
+    return absl::FailedPreconditionError("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL AllGather operation; send_buffer=%p; "
+      "recv_buffer=%p; dtype=%s; count=%d; comm=%p; stream=%p",
+      stream->parent()->device_ordinal(), send_buffer.opaque(),
+      recv_buffer.opaque(), primitive_util::LowercasePrimitiveTypeName(dtype),
+      count, comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(ncclAllGather(
+      send_buffer.opaque(), recv_buffer.opaque(), ToNcclCount(dtype, count),
+      nccl_dtype, comm_, AsHipStream(stream))));
+  if (group_nesting_level_ == 0) {
+    TF_RETURN_IF_ERROR(PollUntilDone());
+  }
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchAllToAll(
+    absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
+    absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
+    PrimitiveType dtype, size_t count, const Executor& executor) {
+  if (canceling_.load()) {
+    return absl::FailedPreconditionError("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  auto buffer_formatter = [](std::string* out, se::DeviceAddressBase buffer) {
+    absl::StrAppendFormat(out, "%p", buffer.opaque());
+  };
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL AllToAll operation; send_buffers=[%s]; "
+      "recv_buffers=[%s]; dtype=%s; count=%d; comm=%p; stream=%p",
+      stream->parent()->device_ordinal(),
+      absl::StrJoin(send_buffers, ", ", buffer_formatter),
+      absl::StrJoin(recv_buffers, ", ", buffer_formatter),
+      primitive_util::LowercasePrimitiveTypeName(dtype), count, comm_, stream);
+
+  if (send_buffers.size() != recv_buffers.size()) {
+    return InvalidArgument(
+        "Number of send buffers must match number of recv buffers: %d != %d",
+        send_buffers.size(), recv_buffers.size());
+  }
+
+  int32_t num_ranks;
+  XLA_RCCL_RETURN_IF_ERROR(ncclCommCount(comm_, &num_ranks));
+
+  if (send_buffers.size() != num_ranks) {
+    return InvalidArgument(
+        "Number of send buffers must match number of ranks: %d != %d",
+        send_buffers.size(), num_ranks);
+  }
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(GroupStart());
+  for (size_t i = 0; i < send_buffers.size(); ++i) {
+    se::DeviceAddressBase send_buffer = send_buffers[i];
+    se::DeviceAddressBase recv_buffer = recv_buffers[i];
+
+    XLA_RCCL_RETURN_IF_ERROR(ncclSend(send_buffer.opaque(),
+                                      ToNcclCount(dtype, count), nccl_dtype, i,
+                                      comm_, AsHipStream(stream)));
+
+    XLA_RCCL_RETURN_IF_ERROR(ncclRecv(recv_buffer.opaque(),
+                                      ToNcclCount(dtype, count), nccl_dtype, i,
+                                      comm_, AsHipStream(stream)));
+  }
+  TF_RETURN_IF_ERROR(GroupEnd());
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchCollectivePermute(
+    se::DeviceAddressBase send_buffer, se::DeviceAddressBase recv_buffer,
+    PrimitiveType dtype, size_t count, std::optional<RankId> source_rank,
+    absl::Span<const RankId> target_ranks, const Executor& executor) {
+  if (canceling_.load()) {
+    return FailedPrecondition("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  auto rank_formatter = [](std::string* out, RankId rank) {
+    absl::StrAppendFormat(out, "%d", rank.value());
+  };
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL CollectivePermute operation; send_buffer=%p; "
+      "recv_buffer=%p; dtype=%s; source_rank=%s; target_ranks=[%s]; count=%d; "
+      "comm=%p; stream=%p",
+      stream->parent()->device_ordinal(), send_buffer.opaque(),
+      recv_buffer.opaque(), primitive_util::LowercasePrimitiveTypeName(dtype),
+      source_rank ? absl::StrCat(source_rank->value()) : "<empty>",
+      absl::StrJoin(target_ranks, ", ", rank_formatter), count, comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  // Short-circuit if there is no source or target rank.
+  if (!source_rank && target_ranks.empty()) {
+    return absl::OkStatus();
+  }
+
+  TF_RETURN_IF_ERROR(GroupStart());
+
+  if (source_rank) {
+    XLA_RCCL_RETURN_IF_ERROR(
+        ncclRecv(recv_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
+                 source_rank->value(), comm_, AsHipStream(stream)));
+  }
+
+  for (auto target_rank : target_ranks) {
+    XLA_RCCL_RETURN_IF_ERROR(
+        ncclSend(send_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
+                 target_rank.value(), comm_, AsHipStream(stream)));
+  }
+
+  TF_RETURN_IF_ERROR(GroupEnd());
+
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchSend(se::DeviceAddressBase send_buffer,
+                                          PrimitiveType dtype, size_t count,
+                                          RankId peer,
+                                          const Executor& executor) {
+  if (canceling_.load()) {
+    return absl::FailedPreconditionError("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL Send operation; send_buffer=%p; dtype=%s; "
+      "count=%d; peer=%d; comm=%p; stream=%p",
+      stream->parent()->device_ordinal(), send_buffer.opaque(),
+      primitive_util::LowercasePrimitiveTypeName(dtype), count, peer.value(),
+      comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(
+      ncclSend(send_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
+               peer.value(), comm_, AsHipStream(stream))));
+  if (group_nesting_level_ == 0) {
+    TF_RETURN_IF_ERROR(PollUntilDone());
+  }
+  return absl::OkStatus();
+}
+
+absl::Status RcclCommunicator::LaunchRecv(se::DeviceAddressBase recv_buffer,
+                                          PrimitiveType dtype, size_t count,
+                                          RankId peer,
+                                          const Executor& executor) {
+  if (canceling_.load()) {
+    return absl::FailedPreconditionError("RcclCommunicator aborted");
+  }
+  se::Stream* stream = ToStream(executor);
+
+  VLOG(3) << absl::StreamFormat(
+      "[%d] Launch RCCL Recv operation; recv_buffer=%p; dtype=%s; "
+      "count=%d; peer=%d; comm=%p; stream=%p",
+      stream->parent()->device_ordinal(), recv_buffer.opaque(),
+      primitive_util::LowercasePrimitiveTypeName(dtype), count, peer.value(),
+      comm_, stream);
+
+  TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
+
+  TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(
+      ncclRecv(recv_buffer.opaque(), ToNcclCount(dtype, count), nccl_dtype,
+               peer.value(), comm_, AsHipStream(stream))));
+  if (group_nesting_level_ == 0) {
+    TF_RETURN_IF_ERROR(PollUntilDone());
+  }
+  return absl::OkStatus();
+}
+
+std::string RcclCommunicator::ToString() const {
+  // comm_ should not be "touched" outside of executor_, but we are printing the
+  // pointer itself and not touching the value, so this is safe.
+  return absl::StrFormat("RcclCommunicator(ncclComm_t=%p)", comm_);
+}
+
+absl::Status RcclCommunicator::PollUntilDone() const {
+  if (canceling_.load()) {
+    return FailedPrecondition("RcclCommunicator aborted");
+  }
+  return ::xla::gpu::PollUntilDone(comm_, canceling_);
+}
+
+Future<> RcclCommunicator::Execute(
+    absl::AnyInvocable<absl::Status() &&> f) const {
+  return executor_ ? Future<>::MakeOn(*executor_, std::move(f))
+                   : Future<>(std::move(f)());
+}
+
+template <typename T>
+Future<T> RcclCommunicator::Execute(
+    absl::AnyInvocable<absl::StatusOr<T>() &&> f) const {
+  return executor_ ? Future<T>::MakeOn(*executor_, std::move(f))
+                   : Future<T>(std::move(f)());
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.h b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.h
new file mode 100644
index 00000000000000..d9b853fc918ec5
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.h
@@ -0,0 +1,254 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_GPU_COLLECTIVES_RCCL_COMMUNICATOR_H_
+#define XLA_BACKENDS_GPU_COLLECTIVES_RCCL_COMMUNICATOR_H_
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "absl/base/thread_annotations.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/functional/any_invocable.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
+#include "rocm/rocm_config.h"  // IWYU pragma: keep
+#include "xla/backends/gpu/collectives/gpu_communicator.h"
+#include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/rank_id.h"
+#include "xla/core/collectives/reduction_kind.h"
+#include "xla/future.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/tsl/concurrency/executor.h"
+#include "xla/tsl/platform/env.h"
+
+#if (TF_ROCM_VERSION >= 50200)
+#include "rocm/include/rccl/rccl.h"
+#else
+#include "rocm/include/rccl.h"
+#endif  // TF_ROCM_VERSION >= 50200
+
+namespace xla::gpu {
+
+// XLA collectives communicator wrapping an RCCL communicator.
+class RcclCommunicator : public GpuCommunicator {
+ public:
+  // Creates a RCCL communicator.
+  //
+  // make_comm should construct and return a new ncclComm_t. For example, it
+  // could call ncclCommInitRank. make_comm should not return a ncclComm_t that
+  // was created by a different thread.
+  //
+  // If is_async is true, all collective methods (e.g., AllReduce) are performed
+  // asynchronously on a separate thread. Otherwise, they are performed
+  // synchronously on the calling thread.
+  static absl::StatusOr<std::unique_ptr<RcclCommunicator>> Create(
+      absl::AnyInvocable<absl::StatusOr<ncclComm_t>()> make_comm,
+      bool is_async = false, std::atomic_bool* cancel = nullptr,
+      tsl::Env& env = *tsl::Env::Default());
+
+  ~RcclCommunicator() override;
+
+  // RcclCommunicator is not copyable or movable.
+  RcclCommunicator(const RcclCommunicator&) = delete;
+  RcclCommunicator(RcclCommunicator&&) = delete;
+  RcclCommunicator& operator=(const RcclCommunicator&) = delete;
+  RcclCommunicator& operator=(RcclCommunicator&&) = delete;
+
+  absl::Status Abort() final;
+  absl::Status HealthCheck() const final;
+  absl::StatusOr<size_t> NumRanks() const final;
+
+  // Since each XLA buffer is a slice into a larger BFCAllocator chunk, first
+  // get the base address of buffer. We will use the base address to keep track
+  // of which chunks we have registered.
+  absl::Status RegisterBufferOnce(se::DeviceAddressBase buffer_range,
+                                  int device_ordinal,
+                                  bool use_symmetric_buffer) final;
+
+  Future<> GroupExecute(
+      absl::AnyInvocable<absl::Status(GpuCommunicator*)> f) final;
+
+  Future<> AllReduce(se::DeviceAddressBase send_buffer,
+                     se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
+                     size_t count, ReductionKind reduction_kind,
+                     const Executor& executor) final;
+
+  Future<> Broadcast(se::DeviceAddressBase send_buffer,
+                     se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
+                     size_t count, RankId root, const Executor& executor) final;
+
+  Future<> ReduceScatter(se::DeviceAddressBase send_buffer,
+                         se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
+                         size_t count, ReductionKind reduction_kind,
+                         const Executor& executor) final;
+
+  Future<> AllGather(se::DeviceAddressBase send_buffer,
+                     se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
+                     size_t count, const Executor& executor) final;
+
+  Future<> AllToAll(absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
+                    absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
+                    PrimitiveType dtype, size_t count,
+                    const Executor& executor) final;
+
+  Future<> CollectivePermute(se::DeviceAddressBase send_buffer,
+                             se::DeviceAddressBase recv_buffer,
+                             PrimitiveType dtype, size_t count,
+                             std::optional<RankId> source_rank,
+                             absl::Span<const RankId> target_ranks,
+                             const Executor& executor) final;
+
+  Future<> Send(se::DeviceAddressBase send_buffer, PrimitiveType dtype,
+                size_t count, RankId peer, const Executor& executor) final;
+
+  Future<> Recv(se::DeviceAddressBase recv_buffer, PrimitiveType dtype,
+                size_t count, RankId peer, const Executor& executor) final;
+
+  std::string ToString() const final;
+
+  ncclComm_t comm() const { return comm_; }
+
+ private:
+  absl::StatusOr<std::unique_ptr<RegisteredBufferHandle>> RegisterBuffer(
+      se::DeviceAddressBase buffer, int device_ordinal,
+      bool use_symmetric_buffer);
+
+  class RcclRegisteredBufferHandle;
+
+  explicit RcclCommunicator(ncclComm_t comm,
+                            std::unique_ptr<tsl::Executor> executor)
+      : comm_(comm), executor_(std::move(executor)) {
+    VLOG(1) << "Created " << *this;
+  }
+
+  absl::Status GroupStart();
+  absl::Status GroupEnd();
+
+  absl::Status LaunchAllReduce(se::DeviceAddressBase send_buffer,
+                               se::DeviceAddressBase recv_buffer,
+                               PrimitiveType dtype, size_t count,
+                               ReductionKind reduction_kind,
+                               const Executor& executor) final;
+
+  absl::Status LaunchBroadcast(se::DeviceAddressBase send_buffer,
+                               se::DeviceAddressBase recv_buffer,
+                               PrimitiveType dtype, size_t count, RankId root,
+                               const Executor& executor) final;
+
+  absl::Status LaunchReduceScatter(se::DeviceAddressBase send_buffer,
+                                   se::DeviceAddressBase recv_buffer,
+                                   PrimitiveType dtype, size_t count,
+                                   ReductionKind reduction_kind,
+                                   const Executor& executor) final;
+
+  absl::Status LaunchAllGather(se::DeviceAddressBase send_buffer,
+                               se::DeviceAddressBase recv_buffer,
+                               PrimitiveType dtype, size_t count,
+                               const Executor& executor) final;
+
+  absl::Status LaunchAllToAll(
+      absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
+      absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
+      PrimitiveType dtype, size_t count, const Executor& executor) final;
+
+  absl::Status LaunchCollectivePermute(se::DeviceAddressBase send_buffer,
+                                       se::DeviceAddressBase recv_buffer,
+                                       PrimitiveType dtype, size_t count,
+                                       std::optional<RankId> source_rank,
+                                       absl::Span<const RankId> target_ranks,
+                                       const Executor& executor) final;
+
+  absl::Status LaunchSend(se::DeviceAddressBase send_buffer,
+                          PrimitiveType dtype, size_t count, RankId peer,
+                          const Executor& executor) final;
+
+  absl::Status LaunchRecv(se::DeviceAddressBase recv_buffer,
+                          PrimitiveType dtype, size_t count, RankId peer,
+                          const Executor& executor) final;
+
+  // Polls the communicator until any pending non-blocking operations are "done"
+  // or aborted.
+  absl::Status PollUntilDone() const;
+
+  // Executes f on executor_, or calls f directly if executor_ is null.
+  Future<> Execute(absl::AnyInvocable<absl::Status() &&> f) const;
+
+  // Executes f on executor_, or calls f directly if executor_ is null.
+  template <typename T>
+  Future<T> Execute(absl::AnyInvocable<absl::StatusOr<T>() &&> f) const;
+
+  absl::Status ExecuteAwait(absl::AnyInvocable<absl::Status() &&> f) const {
+    return Execute(std::move(f)).Await();
+  }
+
+  template <typename T>
+  absl::StatusOr<T> ExecuteAwait(
+      absl::AnyInvocable<absl::StatusOr<T>() &&> f) const {
+    return Execute<T>(std::move(f)).Await();
+  }
+
+  // Underlying RCCL communicator.
+  ncclComm_t comm_;
+
+  // If not null, used to execute methods.
+  //
+  // RCCL communicators (instances of ncclComm_t) are not thread safe. Thus,
+  // multiple threads cannot concurrently access the same ncclComm_t. This is
+  // not surprising. What is very surprising is that multiple threads cannot
+  // serially access the same ncclComm_t. In fact, a ncclComm_t must be created
+  // by, live on, and be destroyed by a single thread. A ncclComm_t cannot be
+  // accessed by any thread except the one that created it. To accomplish this,
+  // we perform all comm_ operations on executor_, if it is not null.
+  //
+  // Concretely, the lack of thread safety comes from the fact that the RCCL
+  // code uses thread-local variables that do not work properly when a
+  // ncclComm_t is accessed from multiple threads. Emperically, the lack of
+  // thread safety only manifests as buggy behavior when using non-blocking
+  // communicators.
+  std::unique_ptr<tsl::Executor> executor_;
+
+  // Should all pending collectives cancel?
+  std::atomic_bool canceling_ = false;
+
+  // Has comm_ been aborted?
+  bool aborted_ = false;
+
+  // Nesting level of current RCCL group
+  int group_nesting_level_ = 0;
+
+  // Keep track of which communicators we have registered for already.
+  // Each ncclMemAlloc'd buffer needs to be registered once per comm.
+  struct RegisteredBuffers {
+    absl::Mutex mu;
+    // Buffer range to the registered buffer handle.
+    absl::flat_hash_map<void*,
+                        std::unique_ptr<Communicator::RegisteredBufferHandle>>
+        range_to_handle ABSL_GUARDED_BY(mu);
+  };
+  RegisteredBuffers registered_buffers_;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_BACKENDS_GPU_COLLECTIVES_RCCL_COMMUNICATOR_H_
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_communicator_test.cc b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator_test.cc
new file mode 100644
index 00000000000000..1d21082326f747
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator_test.cc
@@ -0,0 +1,162 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/collectives/rccl_communicator.h"
+
+#include <cstddef>
+#include <memory>
+#include <optional>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/status_matchers.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/backends/gpu/collectives/gpu_collectives.h"
+#include "xla/backends/gpu/collectives/rccl_errors.h"
+#include "xla/core/collectives/rank_id.h"
+#include "xla/core/collectives/reduction_kind.h"
+#include "xla/future.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/tsl/platform/errors.h"
+
+#if (TF_ROCM_VERSION >= 50200)
+#include "rocm/include/rccl/rccl.h"
+#else
+#include "rocm/include/rccl.h"
+#endif  // TF_ROCM_VERSION >= 50200
+
+namespace xla::gpu {
+namespace {
+
+using ::testing::HasSubstr;
+
+constexpr absl::string_view kCudaError = "unhandled cuda error";
+
+void AssertAborted(absl::Status s) {
+  ASSERT_THAT(s, absl_testing::StatusIs(absl::StatusCode::kFailedPrecondition,
+                                        HasSubstr("aborted")));
+};
+
+void AssertEventAborted(Future<> future) {
+  ASSERT_THAT(future.Await(),
+              absl_testing::StatusIs(absl::StatusCode::kFailedPrecondition,
+                                     HasSubstr("aborted")));
+};
+
+// Creates a non-blocking NCCL communicator.
+absl::StatusOr<std::unique_ptr<RcclCommunicator>> CreateCommunicator(
+    bool blocking) {
+  auto f = [blocking]() -> absl::StatusOr<ncclComm_t> {
+    // Create a unique NCCL Id.
+    ncclUniqueId id;
+    TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(ncclGetUniqueId(&id)));
+
+    // Initialize a communicator.
+    ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
+    config.blocking = blocking ? 1 : 0;
+    ncclComm_t comm;
+    ncclResult_t r =
+        ncclCommInitRankConfig(&comm, /*nranks=*/1, id, /*rank=*/0, &config);
+    if (r == ncclUnhandledCudaError) {
+      // If this test runs on a machine without any CUDA-capable devices
+      // available, we get a ncclUnhandledCudaError. We return a specific error
+      // and skip the test.
+      LOG(ERROR) << XLA_RCCL_STATUS(r);
+      return absl::FailedPreconditionError(kCudaError);
+    }
+    if (r != ncclSuccess && r != ncclInProgress) {
+      return XLA_RCCL_STATUS(r);
+    }
+
+    // Wait for the communicator to finish initializing.
+    ncclResult_t state = ncclInProgress;
+    while (state == ncclInProgress) {
+      TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(ncclCommGetAsyncError(comm, &state)));
+    }
+    TF_RETURN_IF_ERROR(XLA_RCCL_STATUS(state));
+    return comm;
+  };
+  bool is_async = !blocking;
+  return RcclCommunicator::Create(f, is_async);
+}
+
+TEST(RcclCommunicator, AbortSucceeds) {
+  for (const bool blocking : {true, false}) {
+    absl::StatusOr<std::unique_ptr<RcclCommunicator>> comm =
+        CreateCommunicator(blocking);
+    if (comm.status().message() == kCudaError) {
+      GTEST_SKIP() << "unhandled cuda error";
+    }
+    ASSERT_THAT(comm, absl_testing::IsOk());
+    ASSERT_THAT((*comm)->Abort(), absl_testing::IsOk());
+  }
+}
+
+TEST(RcclCommunicator, DoubleAbortFails) {
+  for (const bool blocking : {true, false}) {
+    absl::StatusOr<std::unique_ptr<RcclCommunicator>> comm =
+        CreateCommunicator(blocking);
+    if (comm.status().message() == kCudaError) {
+      GTEST_SKIP() << "unhandled cuda error";
+    }
+    ASSERT_THAT(comm.status(), absl_testing::IsOk());
+    ASSERT_THAT((*comm)->Abort(), absl_testing::IsOk());
+    ASSERT_THAT((*comm)->Abort(),
+                absl_testing::StatusIs(absl::StatusCode::kFailedPrecondition,
+                                       HasSubstr("aborted")));
+  }
+}
+
+TEST(RcclCommunicator, OperationsFailAfterAbort) {
+  for (const bool blocking : {true, false}) {
+    // Declare placeholder variables to make the operations below compile.
+    se::DeviceAddressBase buf;
+    PrimitiveType dtype = PrimitiveType::U64;
+    size_t count = 0;
+    ReductionKind rk = ReductionKind::SUM;
+    GpuCollectives::Executor executor(nullptr);
+
+    // Execute RcclCommunicator operations. They should all immediately fail
+    // because the communicator has been aborted.
+    absl::StatusOr<std::unique_ptr<RcclCommunicator>> comm =
+        CreateCommunicator(blocking);
+    if (comm.status().message() == kCudaError) {
+      GTEST_SKIP() << "unhandled cuda error";
+    }
+    ASSERT_THAT(comm.status(), absl_testing::IsOk());
+    ASSERT_THAT((*comm)->Abort(), absl_testing::IsOk());
+    AssertAborted((*comm)->HealthCheck());
+    AssertAborted((*comm)->NumRanks().status());
+    AssertAborted((*comm)->RegisterBufferOnce(buf, 0, false));
+    AssertEventAborted(
+        (*comm)->AllReduce(buf, buf, dtype, count, rk, executor));
+    AssertEventAborted(
+        (*comm)->Broadcast(buf, buf, dtype, count, RankId(0), executor));
+    AssertEventAborted(
+        (*comm)->ReduceScatter(buf, buf, dtype, count, rk, executor));
+    AssertEventAborted((*comm)->AllGather(buf, buf, dtype, count, executor));
+    AssertEventAborted((*comm)->AllToAll({}, {}, dtype, count, executor));
+    AssertEventAborted(
+        (*comm)->CollectivePermute(buf, buf, dtype, count, {}, {}, executor));
+    AssertEventAborted((*comm)->Send(buf, dtype, count, RankId(0), executor));
+    AssertEventAborted((*comm)->Recv(buf, dtype, count, RankId(0), executor));
+  }
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_errors.cc b/third_party/xla/xla/backends/gpu/collectives/rccl_errors.cc
new file mode 100644
index 00000000000000..c79483e1126fa6
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_errors.cc
@@ -0,0 +1,59 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/collectives/rccl_errors.h"
+
+#include <atomic>
+
+#include "absl/log/log.h"
+#include "absl/time/clock.h"
+#include "absl/time/time.h"
+#include "rocm/rocm_config.h"  // IWYU pragma: keep
+#include "xla/util.h"
+
+#if (TF_ROCM_VERSION >= 50200)
+#include "rocm/include/rccl/rccl.h"
+#else
+#include "rocm/include/rccl.h"
+#endif  // TF_ROCM_VERSION >= 50200
+
+namespace xla::gpu {
+
+absl::Status PollUntilDone(ncclComm_t comm, const std::atomic_bool& aborted) {
+  auto poll = [](ncclComm_t comm,
+                 const std::atomic_bool& aborted) -> absl::Status {
+    ncclResult_t state = ncclInProgress;
+    while (state == ncclInProgress && !aborted.load()) {
+      XLA_RCCL_RETURN_IF_ERROR(ncclCommGetAsyncError(comm, &state));
+    }
+    if (aborted.load()) {
+      return Cancelled("NcclCommunicator aborted");
+    }
+    return XLA_RCCL_STATUS(state);
+  };
+
+  if (!VLOG_IS_ON(1)) {
+    return poll(comm, aborted);
+  }
+
+  absl::Time start = absl::Now();
+  absl::Status s = poll(comm, aborted);
+  absl::Time stop = absl::Now();
+  VLOG(1) << "Polled RCCL communicator " << comm << " for " << (stop - start)
+          << ": " << s;
+  return s;
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_errors.h b/third_party/xla/xla/backends/gpu/collectives/rccl_errors.h
new file mode 100644
index 00000000000000..90384b5700f0db
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_errors.h
@@ -0,0 +1,83 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_GPU_COLLECTIVES_RCCL_ERRORS_H_
+#define XLA_BACKENDS_GPU_COLLECTIVES_RCCL_ERRORS_H_
+
+#include <atomic>
+
+#include "absl/status/status.h"
+#include "rocm/rocm_config.h"  // IWYU pragma: keep
+#include "xla/tsl/platform/logging.h"
+
+#if (TF_ROCM_VERSION >= 50200)
+#include "rocm/include/rccl/rccl.h"
+#else
+#include "rocm/include/rccl.h"
+#endif  // TF_ROCM_VERSION >= 50200
+
+//===----------------------------------------------------------------------===//
+// Collection of helper macros for handling RCCL errors.
+//===----------------------------------------------------------------------===//
+
+#define XLA_RCCL_STATUS(expr)                                         \
+  [](ncclResult_t s, absl::string_view str) -> absl::Status {         \
+    if (s == ncclSuccess || s == ncclInProgress) {                    \
+      return absl::OkStatus();                                        \
+    }                                                                 \
+    return xla::Internal(                                             \
+        "RCCL operation %s failed: %s. Last RCCL warning(error) log " \
+        "entry (may be unrelated) '%s'.",                             \
+        str, ncclGetErrorString(s), ncclGetLastError(nullptr));       \
+  }(expr, #expr)
+
+#define XLA_RCCL_RETURN_IF_ERROR(expr)      \
+  do {                                      \
+    absl::Status s = XLA_RCCL_STATUS(expr); \
+    if (!s.ok()) {                          \
+      return s;                             \
+    }                                       \
+  } while (0)
+
+#define XLA_RCCL_LOG_IF_ERROR(expr)         \
+  do {                                      \
+    absl::Status s = XLA_RCCL_STATUS(expr); \
+    if (!s.ok()) {                          \
+      LOG(ERROR) << s.ToString();           \
+    }                                       \
+  } while (0)
+
+#define XLA_RCCL_CHECK(expr) CHECK(XLA_RCCL_STATUS(expr).ok())
+
+namespace xla::gpu {
+
+// Polls the provided communicator until it is "done" or aborted.
+//
+// RCCL communicators can be blocking or non-blocking. Operations performed on
+// non-blocking communicators return immediately, and it is the responsibility
+// of the programmer to repeatedly call ncclCommGetAsyncError on the
+// communicator until ncclCommGetAsyncError no long returns inProgress. That is
+// what PollUntilDone does.
+//
+// Note, however, that the semantics of RCCL collectives are a bit subtle. For
+// example, a collective operation may report itself as done when it is
+// scheduled on the GPU but has not yet executed. Refer to the RCCL
+// documentation and exercise caution when reasoning about whether an operation
+// is really "done".
+absl::Status PollUntilDone(ncclComm_t comm, const std::atomic_bool& aborted);
+
+}  // namespace xla::gpu
+
+#endif  // XLA_BACKENDS_GPU_COLLECTIVES_RCCL_ERRORS_H_
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_params.cc b/third_party/xla/xla/backends/gpu/runtime/collective_params.cc
index ec60f01138d2db..7a1f1791d3d911 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_params.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_params.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/collective_params.h"
 
 #include <cstdint>
+#include <string>
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/statusor.h"
@@ -60,9 +61,11 @@ absl::StatusOr<CollectiveParams> CollectiveParams::Create(
   const GpuExecutableRunOptions* gpu_options =
       run_options.run_options().gpu_executable_run_options();
 
+  const std::string& platform_name =
+      run_options.run_options().stream()->parent()->GetPlatform()->Name();
   auto* collectives = gpu_options && gpu_options->collectives()
                           ? gpu_options->collectives()
-                          : GpuCollectives::Default();
+                          : GpuCollectives::Default(platform_name);
 
   auto* device_id_map = gpu_options && gpu_options->gpu_global_device_ids()
                             ? &*gpu_options->gpu_global_device_ids()
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index be2ae8bb7e80e1..d4a9ae9085c66e 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -178,7 +178,6 @@ cc_library(
     ]) + if_cuda([
         # keep sorted
         "//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
-        "//xla/stream_executor/gpu:gpu_stream",
         "@local_config_cuda//cuda:cuda_headers",
     ]) + if_rocm([
         # keep sorted
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index b0b2b83841c0c1..be168f00082456 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -759,7 +759,8 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
   auto setup_sends = [&]() -> absl::Status {
     TF_ASSIGN_OR_RETURN(local_device_state, GetLocalDeviceState(device));
     stream = local_device_state->GetDeviceToDeviceStream();
-    gpu::GpuCollectives* gpu_collectives = gpu::GpuCollectives::Default();
+    gpu::GpuCollectives* gpu_collectives =
+        gpu::GpuCollectives::Default(stream->parent()->GetPlatform()->Name());
     usage_event = tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(
         BufferSequencingEvent::Create(this->thread_pool()));
 
@@ -974,7 +975,8 @@ StreamExecutorGpuClient::CrossHostReceiveBuffers(
     stream = local_device_state->GetDeviceToDeviceStream();
     TF_ASSIGN_OR_RETURN(PjRtMemorySpace * memory_space,
                         device->default_memory_space());
-    gpu::GpuCollectives* gpu_collectives = gpu::GpuCollectives::Default();
+    gpu::GpuCollectives* gpu_collectives =
+        gpu::GpuCollectives::Default(stream->parent()->GetPlatform()->Name());
     definition_event = tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(
         BufferSequencingEvent::Create(this->thread_pool()));
 
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index fa1687089d29d2..33f0ab94a82627 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2322,9 +2322,7 @@ xla_cc_test(
         "no_oss",
         "nomsan",  # Pulls in precompiled NVIDIA libraries which cause false positives in msan.
         "requires-gpu-nvidia",
-    ] + if_google([
-        "ignore_for_dep=third_party/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h",
-    ]),
+    ],
     deps = if_cuda_is_configured([
         ":nvptx_compiler_impl",
         "//xla/stream_executor:cuda_platform",
@@ -2365,7 +2363,6 @@ cc_library(
     ],
     tags = [
         "gpu",
-        "manual",
         "rocm-only",
     ],
     deps = [
@@ -2386,7 +2383,6 @@ cc_library(
     ],
     tags = [
         "gpu",
-        "manual",
         "rocm-only",
     ],
     deps = [
diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index a74214ce0f1f73..f2126b3e9ad1ba 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1594,14 +1594,13 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/gpu:gpu_semaphore",
-        "//xla/stream_executor/gpu:gpu_stream",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/time",
         "@local_config_cuda//cuda:cuda_headers",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_timer.cc b/third_party/xla/xla/stream_executor/cuda/cuda_timer.cc
index b33fda0dc59317..89c2018dfeaf12 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_timer.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_timer.cc
@@ -28,9 +28,10 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_status.h"
 #include "xla/stream_executor/cuda/delay_kernel.h"
 #include "xla/stream_executor/gpu/gpu_semaphore.h"
-#include "xla/stream_executor/gpu/gpu_stream.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/statusor.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
 
 namespace stream_executor::gpu {
 
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 09200e423bd1b8..0529fdcdcf0e89 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -241,19 +241,6 @@ tsl_gpu_library(
     alwayslink = True,
 )
 
-cc_library(
-    name = "gpu_stream",
-    srcs = ["gpu_stream.cc"],
-    hdrs = ["gpu_stream.h"],
-    tags = ["gpu"],
-    deps = [
-        ":gpu_types_header",
-        "//xla/stream_executor:stream",
-        "@com_google_absl//absl/base",
-        "@com_google_absl//absl/log:check",
-    ],
-)
-
 cc_library(
     name = "gpu_semaphore",
     srcs = ["gpu_semaphore.cc"],
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_stream.cc b/third_party/xla/xla/stream_executor/gpu/gpu_stream.cc
deleted file mode 100644
index ee9b15487bab65..00000000000000
--- a/third_party/xla/xla/stream_executor/gpu/gpu_stream.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2019 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "xla/stream_executor/gpu/gpu_stream.h"
-
-#include "absl/base/casts.h"
-#include "absl/log/check.h"
-#include "xla/stream_executor/gpu/gpu_types.h"
-#include "xla/stream_executor/stream.h"
-
-namespace stream_executor {
-namespace gpu {
-
-GpuStreamHandle AsGpuStreamValue(Stream* stream) {
-  DCHECK(stream != nullptr);
-  return absl::bit_cast<GpuStreamHandle>(
-      stream->platform_specific_handle().stream);
-}
-
-}  // namespace gpu
-}  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_stream.h b/third_party/xla/xla/stream_executor/gpu/gpu_stream.h
deleted file mode 100644
index ec95ec50e25226..00000000000000
--- a/third_party/xla/xla/stream_executor/gpu/gpu_stream.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2019 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Defines the GpuStream type - the CUDA-specific implementation of the generic
-// StreamExecutor Stream interface.
-
-#ifndef XLA_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
-#define XLA_STREAM_EXECUTOR_GPU_GPU_STREAM_H_
-
-#include "xla/stream_executor/gpu/gpu_types.h"
-#include "xla/stream_executor/stream.h"
-
-namespace stream_executor {
-namespace gpu {
-
-// Extracts a GpuStreamHandle from a GpuStream-backed Stream object.
-GpuStreamHandle AsGpuStreamValue(Stream* stream);
-}  // namespace gpu
-}  // namespace stream_executor
-
-#endif  // XLA_STREAM_EXECUTOR_GPU_GPU_STREAM_H_

From f46e0c47d3ae99a7a5859521d19db1593c4bc76c Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Tue, 16 Dec 2025 04:52:37 -0800
Subject: [PATCH 334/753] Check a weight cache file size before trying to load
 it.

This avoid an error message when trying to load a file that is empty because it
needs to be built.

This also adds a `Size()` member to `tflite::xnnpack::FileDescriptorView`

PiperOrigin-RevId: 845209923
---
 tensorflow/lite/delegates/xnnpack/file_util.cc    | 2 +-
 tensorflow/lite/delegates/xnnpack/file_util.h     | 8 ++++++++
 tensorflow/lite/delegates/xnnpack/weight_cache.cc | 3 ++-
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/file_util.cc b/tensorflow/lite/delegates/xnnpack/file_util.cc
index b475080480ecb4..8a24eb9568b884 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.cc
+++ b/tensorflow/lite/delegates/xnnpack/file_util.cc
@@ -57,7 +57,7 @@ FileDescriptor FileDescriptor::Duplicate() const {
   if (!IsValid()) {
     return FileDescriptor(-1);
   }
-  return FileDescriptor(dup(fd_));
+  return FileDescriptor::Duplicate(fd_);
 }
 
 void FileDescriptor::Reset(int new_fd) {
diff --git a/tensorflow/lite/delegates/xnnpack/file_util.h b/tensorflow/lite/delegates/xnnpack/file_util.h
index cddc0a4c615f06..113a378007506b 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.h
+++ b/tensorflow/lite/delegates/xnnpack/file_util.h
@@ -76,6 +76,14 @@ class FileDescriptorView {
   // WARNING: the file descriptor must be valid and the file must be opened.
   Offset MovePos(Offset offset) const;
 
+  // Returns the size of the file.
+  Offset Size() const {
+    Offset pos = GetPos();
+    Offset size = SetPosFromEnd(0);
+    SetPos(pos);
+    return size;
+  }
+
   // Reads `count` bytes from the file at the current position to `dst`.
   //
   // Returns true if all the data available in the file was read to the buffer
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index 516a2ebbdeda9f..cbd3ac2ca29e2e 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -325,7 +325,8 @@ bool MMapWeightCacheProvider::LoadOrStartBuild(const char* path,
   }
   const char* const safe_path = Sanitize(path);
   FileDescriptor build_fd = fd.Duplicate();
-  if (!IsInMemoryCachePath(safe_path) && Load(safe_path, std::move(fd))) {
+  if (!IsInMemoryCachePath(safe_path) && fd.Size() &&
+      Load(safe_path, std::move(fd))) {
     TFLITE_LOG_PROD(tflite::TFLITE_LOG_VERBOSE,
                     "XNNPack weight cache loaded from '%s'.", safe_path);
     return true;

From 6e7f0eba0d87e0144d0a1795d91bb600d9a8ba54 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 05:31:39 -0800
Subject: [PATCH 335/753] Automated Code Change

PiperOrigin-RevId: 845221386
---
 .../transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc
index 7042c5786437c6..7f3c6b3e24f8cf 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // This file implements logic for bufferizing HLO dialect to memref dialect.
 
-#include <memory>
+#include <cstdint>
 #include <optional>
 #include <utility>
 

From 190d2db8d813e9e8a120852ab9ad7ba34a130d6f Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Tue, 16 Dec 2025 06:06:23 -0800
Subject: [PATCH 336/753] Enable CublasLt via GemmRewriter configuration. Moves
 DebugOptions flag `xla_gpu_enable_cublas_lt` out of GemmRewriter pass. This
 change is required to enable autotuning both: Cublas and CublasLt kernels.

PiperOrigin-RevId: 845232642
---
 .../runtime/gpublas_lt_matmul_thunk_test.cc   | 10 +++---
 .../xla/xla/service/gpu/gpu_compiler.cc       |  3 +-
 .../xla/xla/service/gpu/transforms/BUILD      |  2 --
 .../transforms/cublas_gemm_rewriter_test.cc   | 20 ++++++++---
 .../service/gpu/transforms/gemm_rewriter.cc   |  5 +--
 .../service/gpu/transforms/gemm_rewriter.h    |  4 +++
 .../gpu/transforms/gemm_rewriter_test.cc      | 34 +++++++++++++------
 7 files changed, 52 insertions(+), 26 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
index 77a6ac88f8ff70..c0d7d088081e52 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
@@ -191,12 +191,14 @@ void GpuBlasLtMatmulThunkTest::CreateExecuteThunksFromHLO(
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           this->ParseAndReturnVerifiedModule(hlo_string));
 
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
   TF_ASSERT_OK_AND_ASSIGN(
       bool changed,
-      RunHloPass(
-          GemmRewriter(gpu_comp(executor),
-                       /*toolkit_version=*/se::SemanticVersion{12, 4, 0}),
-          module.get()));
+      RunHloPass(GemmRewriter(gpu_comp(executor),
+                              /*toolkit_version=*/se::SemanticVersion{12, 4, 0},
+                              options),
+                 module.get()));
   ASSERT_TRUE(changed);
 
   GpuBlasLtThunkBuilder builder(executor, gpu_comp(executor));
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index f864daf9e86df2..ca28137784bfdf 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1638,7 +1638,8 @@ void AddGemmRewriterPasses(HloPassPipeline& pipeline,
       GemmRewriterOptions{GemmRewriterOptions::DType::kFp8Only, bias_mode});
   pipeline.AddPass<GemmRewriter>(
       gpu_version, toolkit_version,
-      GemmRewriterOptions{GemmRewriterOptions::DType::kNonFp8Only, bias_mode});
+      GemmRewriterOptions{GemmRewriterOptions::DType::kNonFp8Only, bias_mode,
+                          debug_options.xla_gpu_enable_cublaslt()});
 }
 }  // namespace
 
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 89b60223b66765..12c6abf3a46270 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1546,7 +1546,6 @@ xla_test(
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:pattern_matcher_gmock",
         "//xla/hlo/testlib:test",
-        "//xla/service:buffer_assignment",
         "//xla/service:executable",
         "//xla/service:hlo_module_config",
         "//xla/service:pattern_matcher",
@@ -1557,7 +1556,6 @@ xla_test(
         "//xla/stream_executor:semantic_version",
         "//xla/stream_executor:stream_executor_memory_allocator",
         "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status:statusor",
diff --git a/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
index 4c412e16ade5e0..a877a45991fb78 100644
--- a/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
@@ -2467,7 +2467,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = true;
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -2544,7 +2546,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = true;
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -2827,7 +2831,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -3145,7 +3151,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = true;
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   SCOPED_TRACE(module->ToString());
   EXPECT_TRUE(changed);
@@ -3434,7 +3442,9 @@ ENTRY %test (x: f32[2,3,4], y: f32[4,5,7], z: f32[7]) -> f32[2,3,5,7] {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text, config));
 
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = true;
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc
index d420d471002a79..1481eebc29a044 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc
@@ -2047,10 +2047,7 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
   absl::StatusOr<absl::string_view> GetNonFp8GemmCustomCallTarget(
       const HloInstruction& instr,
       const GemmBackendConfig& gemm_backend_config) const {
-    if (!instr.GetModule()
-             ->config()
-             .debug_options()
-             .xla_gpu_enable_cublaslt()) {
+    if (!options_.enable_cublaslt) {
       // cublasLt is not enabled.
       return absl::string_view(kGemmCallTarget);
     }
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.h b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.h
index f49f7ed3a1da84..e4b9bf8cc9d9f8 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.h
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.h
@@ -72,6 +72,10 @@ struct GemmRewriterOptions {
   // In this case, the two GEMMs can be scheduled in parallel.
   enum class BiasMode { kBias, kNoBias };
   BiasMode bias_mode = BiasMode::kBias;
+
+  // Enables the use of cublasLt for non-FP8 GEMMs.
+  // FP8 GEMMs are always rewritten to use cublasLt.
+  bool enable_cublaslt = false;
 };
 
 class GemmRewriter : public HloModulePass {
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
index 449dfe481c19a6..9249133b3ad766 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "xla/hlo/testlib/filecheck.h"
 #include "xla/hlo/testlib/pattern_matcher_gmock.h"
 #include "xla/hlo/testlib/test.h"
-#include "xla/service/buffer_assignment.h"
 #include "xla/service/executable.h"
 #include "xla/service/gpu/gpu_executable.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
@@ -47,7 +46,6 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/semantic_version.h"
 #include "xla/stream_executor/stream_executor_memory_allocator.h"
-#include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/xla.pb.h"
 
@@ -1273,7 +1271,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -1297,7 +1297,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -1321,7 +1323,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
   EXPECT_THAT(
@@ -1342,7 +1346,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -1366,7 +1372,9 @@ ENTRY test {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -1394,7 +1402,9 @@ ENTRY main {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -1422,7 +1432,9 @@ ENTRY main {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 
@@ -1452,7 +1464,9 @@ ENTRY main {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(hlo_text));
-  GemmRewriter pass(Capability(), GetToolkitVersion());
+  GemmRewriterOptions options;
+  options.enable_cublaslt = GetDebugOptionsForTest().xla_gpu_enable_cublaslt();
+  GemmRewriter pass(Capability(), GetToolkitVersion(), options);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, this->RunHloPass(&pass, module.get()));
   EXPECT_TRUE(changed);
 

From d33f8e36c55a13f502fa5ea66ce97f854147cb6e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 06:57:46 -0800
Subject: [PATCH 337/753] Automated Code Change

PiperOrigin-RevId: 845251896
---
 tensorflow/core/util/stat_summarizer.cc     |  2 +-
 tensorflow/core/util/stat_summarizer.h      |  2 +-
 tensorflow/core/util/stream_executor_util.h |  3 ++-
 tensorflow/core/util/strided_slice_op.cc    | 22 ++++++++++-----------
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index 26a06bbb6ff129..a9f1675544a2f2 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -82,7 +82,7 @@ void StatSummarizer::Validate(const std::vector<TensorDescription>* outputs,
 }
 
 void StatSummarizer::PrintStepStats() const {
-  string output = GetOutputString();
+  std::string output = GetOutputString();
   std::istringstream iss(output);
   for (std::string line; std::getline(iss, line);) {
     LOG(INFO) << line;
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
index 3eae427f548475..62d192ab5193d2 100644
--- a/tensorflow/core/util/stat_summarizer.h
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -89,7 +89,7 @@ class StatSummarizer {
     return stats_calculator_->GetStatsByNodeType();
   }
 
-  std::string GetStatsByMetric(const string& title,
+  std::string GetStatsByMetric(const std::string& title,
                                StatsCalculator::SortingMetric sorting_metric,
                                int num_stats) const {
     return stats_calculator_->GetStatsByMetric(title, sorting_metric,
diff --git a/tensorflow/core/util/stream_executor_util.h b/tensorflow/core/util/stream_executor_util.h
index 4787bcf6ded5d2..63d6a3f1a9f146 100644
--- a/tensorflow/core/util/stream_executor_util.h
+++ b/tensorflow/core/util/stream_executor_util.h
@@ -32,7 +32,8 @@ class StreamExecutorUtil {
   template <typename T>
   static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<char*>(t.tensor_data().data()));
-    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
+    return se::DeviceMemory<T>(
+        stream_executor::DeviceAddressBase(ptr, t.TotalBytes()));
   }
 };
 
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index 93c5a7e9818ae2..3984d78e1b90bc 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -33,13 +33,13 @@ constexpr int32_t kShrinkAxis = -1, kNewAxis = -2;
 // if one does foo[3:5, ..., -3], this will have 3 length tensors
 struct StridedSliceSparseSpec {
   int64_t dims;
-  int32 num_add_axis_after_ellipsis;
+  int32_t num_add_axis_after_ellipsis;
   const Tensor* begin_tensor;
   const Tensor* end_tensor;
   const Tensor& strides_tensor;
-  const int32 begin_mask, end_mask;
-  int32 ellipsis_mask;
-  const int32 new_axis_mask, shrink_axis_mask;
+  const int32_t begin_mask, end_mask;
+  int32_t ellipsis_mask;
+  const int32_t new_axis_mask, shrink_axis_mask;
 };
 
 // Dense slicing specification
@@ -49,8 +49,8 @@ struct StridedSliceSparseSpec {
 // sparse had 3 length tensors.
 struct StridedSliceDenseSpec {
   const int64_t dims;
-  int32 begin_mask;
-  int32 end_mask;
+  int32_t begin_mask;
+  int32_t end_mask;
   bool begin_valid;
   bool end_valid;
   absl::InlinedVector<int64_t, 4UL>& begin;
@@ -62,18 +62,18 @@ struct StridedSliceDenseSpec {
   // entries. If an index in this array is positive, the size of the dimension
   // is obtained from canonical end-begin. Otherwise, if it is a kNewAxis,
   // it will be 1. A shrunk dimension is skipped.
-  absl::InlinedVector<int32, 4UL> final_shape_gather_indices;
+  absl::InlinedVector<int32_t, 4UL> final_shape_gather_indices;
   // This vector has the same size as final_shape_gather_indices, but it
   // remembers the sparse index that a dimension comes from, instead of dense
   // index. A -1 in this vector means there the index is not from the sparse
   // input.
-  absl::InlinedVector<int32, 4UL> final_shape_gather_indices_sparse;
-  absl::InlinedVector<int32, 4UL> input_shape_gather_indices_sparse;
+  absl::InlinedVector<int32_t, 4UL> final_shape_gather_indices_sparse;
+  absl::InlinedVector<int32_t, 4UL> input_shape_gather_indices_sparse;
   // The dense indexed shrink mask is which processing dimensions
   // should be shrunk. For example, if foo.shape = (10,10,10,10)
   // foo[3, ..., 5] has sparse_shrink_axis_mask of 0x5 and
   // dense_shrink_axis_mask of 0x9, yielding a final shape (10,10).
-  int32 shrink_axis_mask;
+  int32_t shrink_axis_mask;
 };
 
 }  // namespace
@@ -281,7 +281,7 @@ absl::Status ValidateStridedSliceOp(
                                       *strides};
 
   if (strides_tensor.dtype() == DT_INT32) {
-    TF_RETURN_IF_ERROR(BuildDenseSpec<int32>(sparse_spec, &dense_spec));
+    TF_RETURN_IF_ERROR(BuildDenseSpec<int32_t>(sparse_spec, &dense_spec));
   } else if (strides_tensor.dtype() == DT_INT64) {
     TF_RETURN_IF_ERROR(BuildDenseSpec<int64_t>(sparse_spec, &dense_spec));
   } else if (strides_tensor.dtype() == DT_INT16) {

From a64fa6b481e0ffd2f81b0fed2392649cee919b28 Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Tue, 16 Dec 2025 07:19:26 -0800
Subject: [PATCH 338/753] [XLA:GPU] Create PackedTransposeDescription using
 TransposeDescription rather than hlo

So that it benefits from any OTF normalization that we do we creating the TransposeDescription (see child CL)

Otherwise, in the current state we re-use the hlo directly and would miss any normalization applied to transposedescription

I don't think that we should entirely merge them though, given that the Packed description does some transformation only relevant for the packed emitter.

PiperOrigin-RevId: 845260501
---
 .../gpu/codegen/emitters/transpose.cc         | 16 ++--
 .../backends/gpu/codegen/emitters/transpose.h | 10 +--
 .../xla/xla/service/gpu/ir_emission_utils.cc  | 56 ++++++------
 .../xla/xla/service/gpu/ir_emission_utils.h   | 21 +++--
 .../xla/service/gpu/ir_emission_utils_test.cc | 89 +++++++++++++++++--
 5 files changed, 135 insertions(+), 57 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc
index 7059f439a844e3..531dc6ff0fe71d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.cc
@@ -543,7 +543,7 @@ std::vector<int64_t> GetBlockCounts(absl::Span<const int64_t> shape,
 }
 
 PackedTranspose::PackedTranspose(const HloFusionAnalysis& analysis,
-                                 const TransposeSpec& spec,
+                                 const PackedTransposeDescription& spec,
                                  absl::Span<const int64_t> output_block_tile,
                                  int64_t num_shmem_groups,
                                  MLIRContext* mlir_context)
@@ -676,8 +676,8 @@ PackedTranspose::WriteResult PackedTranspose::EmitWriteToShMemMlir(
     auto* root_tuple = fusion.fused_expression_root();
     for (auto root : side_output_roots_) {
       auto indexing = ComposeIndexingMaps(
-          input_indexing,
-          GetBitcastMap(spec_.input_shape(), root->shape(), mlir_context_));
+          input_indexing, GetBitcastMap(spec_.original_input_shape(),
+                                        root->shape(), mlir_context_));
       indexing.Simplify();
       side_output_indices.push_back(ApplyIndexing(
           indexing, thread_and_block_ids, symbol_values, nested_b));
@@ -864,7 +864,7 @@ IndexingMap PackedTranspose::GetInputIndexing(MLIRContext* mlir_context) const {
 
   // Actual indexing.
   auto canonical_input_shape_to_real_shape = GetBitcastMap(
-      spec_.canonical_input_shape, spec_.input_shape(), mlir_context);
+      spec_.canonical_input_shape, spec_.original_input_shape(), mlir_context);
   // When we compose, the constraints w.r.t. to the input dimension sizes will
   // be added.
   auto input_indexing = ComposeIndexingMaps(
@@ -1000,8 +1000,9 @@ IndexingMap PackedTranspose::GetOutputIndexing(
   canonical_output_indexing.Simplify();
 
   // Actual indexing.
-  auto canonical_output_shape_to_real_shape = GetBitcastMap(
-      spec_.canonical_output_shape, spec_.output_shape(), mlir_context);
+  auto canonical_output_shape_to_real_shape =
+      GetBitcastMap(spec_.canonical_output_shape, spec_.original_output_shape(),
+                    mlir_context);
   // When we compose, the constraints w.r.t. to the output dimension sizes will
   // be added.
   auto output_indexing = ComposeIndexingMaps(
@@ -1012,8 +1013,7 @@ IndexingMap PackedTranspose::GetOutputIndexing(
 
 std::unique_ptr<EmitterBase> CreateTransposeFusion(
     const HloFusionAnalysis& analysis, MLIRContext* mlir_context) {
-  auto spec = GetTransposeSpec(
-      Cast<HloTransposeInstruction>(analysis.tiled_transpose().instr));
+  PackedTransposeDescription spec(analysis.tiled_transpose());
   auto packed_transpose_tile = GetPackedTransposeTileSizes(spec);
   if (packed_transpose_tile.ok()) {
     return std::make_unique<PackedTranspose>(
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h
index ad3948f09a4b99..7c92bef8af1d2d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transpose.h
@@ -164,9 +164,9 @@ class TransposeFusion : public TransposeFusionBase {
 };
 
 // Packed transpose is a more advanced version of the transpose emitter.
-// It considers the canonical transpose described by TransposeSpec class,
-// i.e. [T2, A, T1, B] -> [T1, A, T2, B] and tries to pack as many T1 rows into
-// shared memory as possible.
+// It considers the canonical transpose described by PackedTransposeDescription
+// class, i.e. [T2, A, T1, B] -> [T1, A, T2, B] and tries to pack as many T1
+// rows into shared memory as possible.
 //
 // Let's describe the algorithm for a concrete example.
 //   bf16 [640,100,6,1] - > bf16 [6,100,640,1]
@@ -237,7 +237,7 @@ class TransposeFusion : public TransposeFusionBase {
 class PackedTranspose : public TransposeFusionBase {
  public:
   explicit PackedTranspose(const HloFusionAnalysis& analysis,
-                           const TransposeSpec& spec,
+                           const PackedTransposeDescription& spec,
                            absl::Span<const int64_t> output_block_tile,
                            int64_t num_shmem_groups,
                            mlir::MLIRContext* mlir_context);
@@ -273,7 +273,7 @@ class PackedTranspose : public TransposeFusionBase {
   IndexingMap GetShmemReadIndexing(mlir::MLIRContext* ctx) const;
   IndexingMap GetOutputIndexing(mlir::MLIRContext* ctx) const;
 
-  TransposeSpec spec_;
+  PackedTransposeDescription spec_;
 
   // Tile sizes for the canonical input shape.
   std::vector<int64_t> output_tile_;
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.cc b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
index 022edaee8653c3..72c74c7b8ea8d2 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
@@ -220,8 +220,8 @@ absl::StatusOr<BufferAllocation::Slice> GetAllocationSlice(
   return buffer_assignment.GetUniqueSlice(instr, index);
 }
 
-bool IsNormalized(const HloTransposeInstruction& transpose) {
-  const auto& permutation = transpose.dimensions();
+bool IsNormalized(const TransposeDescription& desc) {
+  const auto& permutation = desc.permutation;
   for (int i = 0; i < permutation.size() - 1; ++i) {
     if (permutation[i] + 1 == permutation[i + 1]) {
       return false;
@@ -230,12 +230,12 @@ bool IsNormalized(const HloTransposeInstruction& transpose) {
   return true;
 }
 
-bool CanEmitPackedTranspose(const HloTransposeInstruction& transpose) {
+bool CanEmitPackedTranspose(const TransposeDescription& desc) {
   // Support only normalized transposes.
-  if (!IsNormalized(transpose)) {
+  if (!IsNormalized(desc)) {
     return false;
   }
-  const auto& spec = GetTransposeSpec(&transpose);
+  PackedTransposeDescription spec(desc);
   return GetPackedTransposeTileSizes(spec).ok();
 }
 
@@ -257,13 +257,15 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
   absl::InlinedVector<int64_t, 3> dimensions(hero.shape().dimensions().begin(),
                                              hero.shape().dimensions().end());
   int64_t operand_most_minor_dim = hero.operand(0)->shape().dimensions().back();
-  if (CanEmitPackedTranspose(*Cast<HloTransposeInstruction>(&hero))) {
+
+  TransposeDescription desc{&hero, dimensions, permutation,
+                            /*shmem_usage=*/0};
+  if (CanEmitPackedTranspose(desc)) {
     int64_t vector_size =
         kBankBitwidth / GetBitwidth(hero.shape().element_type());
-    int64_t shmem_usage_bytes =
+    desc.shmem_usage =
         kNumShmemBanks * (kBankBitwidth / 8) * kNumShmemBanks * vector_size;
-    return TransposeDescription{&hero, dimensions, permutation,
-                                shmem_usage_bytes};
+    return desc;
   }
   if (permutation.back() == dimensions.size() - 1) {
     operand_most_minor_dim =
@@ -301,17 +303,17 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
   return std::nullopt;
 }
 
-TransposeSpec GetTransposeSpec(const HloTransposeInstruction* transpose) {
-  auto inv_permutation = InversePermutation(transpose->dimensions());
-  auto& output_shape = transpose->shape();
-  llvm::SmallVector<int64_t, 3> canonical_output_shape =
-      llvm::to_vector<3>(output_shape.dimensions());
-  llvm::SmallVector<int64_t, 3> canonical_permutation =
-      llvm::to_vector<3>(transpose->dimensions());
+PackedTransposeDescription::PackedTransposeDescription(
+    const TransposeDescription& description)
+    : transpose(Cast<HloTransposeInstruction>(description.instr)) {
+  permutation = llvm::to_vector<3>(description.permutation);
+  inv_permutation = llvm::to_vector<3>(InversePermutation(permutation));
+  canonical_output_shape = llvm::to_vector<3>(description.dimensions);
+  canonical_permutation = llvm::to_vector<3>(description.permutation);
 
   // If the last dimension is transposed, add a size-1 B dimension.
   if (canonical_permutation.back() != canonical_output_shape.size() - 1) {
-    canonical_permutation.push_back(output_shape.dimensions().size());
+    canonical_permutation.push_back(canonical_output_shape.size());
     canonical_output_shape.push_back(1);
   }
   int64_t dim_t1 = -1;
@@ -333,21 +335,13 @@ TransposeSpec GetTransposeSpec(const HloTransposeInstruction* transpose) {
     canonical_permutation.insert(canonical_permutation.begin() + dim_t1,
                                  dim_t1);
   }
-  auto canonical_inv_permutation = InversePermutation(canonical_permutation);
-  auto canonical_input_shape =
-      Permute(canonical_output_shape, canonical_inv_permutation);
-  return TransposeSpec{
-      transpose,
-      llvm::to_vector<3>(transpose->dimensions()),
-      llvm::to_vector<3>(inv_permutation),
-      canonical_output_shape,
-      canonical_permutation,
-      llvm::to_vector<3>(canonical_inv_permutation),
-      llvm::to_vector<3>(canonical_input_shape),
-  };
+  canonical_inv_permutation =
+      llvm::to_vector<3>(InversePermutation(canonical_permutation));
+  canonical_input_shape = llvm::to_vector<3>(
+      Permute(canonical_output_shape, canonical_inv_permutation));
 }
 
-std::string TransposeSpec::ToString() const {
+std::string PackedTransposeDescription::ToString() const {
   return absl::Substitute(R"(
 transpose: $0
 canonical_input_shape: $1
@@ -365,7 +359,7 @@ canonical_inv_permutation: $4
 }
 
 absl::StatusOr<absl::InlinedVector<int64_t, 3>> GetPackedTransposeTileSizes(
-    const TransposeSpec& spec) {
+    const PackedTransposeDescription& spec) {
   // Check the side outputs, etc.
   int64_t bits_per_element = GetBitwidth(spec.elem_type());
   if (bits_per_element >= kBankBitwidth) {
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.h b/third_party/xla/xla/service/gpu/ir_emission_utils.h
index 012716126d0943..510940703bc650 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.h
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.h
@@ -224,13 +224,19 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
 // 3. <8x2x32x7x6> -> <6x32x2x7x8> becomes <8x2x32x7x6x1> -> <6x32x2x7x8x1>.
 
 // TODO(b/370690811): Unify this with TransposeDescription.
-struct TransposeSpec {
-  PrimitiveType elem_type() const { return input_shape().element_type(); }
+struct PackedTransposeDescription {
+  explicit PackedTransposeDescription(const TransposeDescription& description);
 
-  const Shape& input_shape() const { return transpose->operand(0)->shape(); }
-  const Shape& output_shape() const { return transpose->shape(); }
+  PrimitiveType elem_type() const {
+    return original_input_shape().element_type();
+  }
+
+  const Shape& original_input_shape() const {
+    return transpose->operand(0)->shape();
+  }
+  const Shape& original_output_shape() const { return transpose->shape(); }
 
-  int64_t rank() const { return input_shape().dimensions().size(); }
+  int64_t rank() const { return original_input_shape().dimensions().size(); }
   int64_t canonical_rank() const { return canonical_input_shape.size(); }
 
   int64_t dim_A() const { return canonical_input_shape[dim_A_id()]; }
@@ -264,11 +270,12 @@ struct TransposeSpec {
   llvm::SmallVector<int64_t, 3> canonical_input_shape;
 };
 
-TransposeSpec GetTransposeSpec(const HloTransposeInstruction* transpose);
+// Returns true if the given transpose can be emitted using the packed emitter.
+bool CanEmitPackedTranspose(const TransposeDescription& desc);
 
 // Returns the default tile sizes for the packed transpose emitter.
 absl::StatusOr<absl::InlinedVector<int64_t, 3>> GetPackedTransposeTileSizes(
-    const TransposeSpec& spec);
+    const PackedTransposeDescription& spec);
 
 // Verify the given module, and crash if it failed.
 void VerifyModule(const llvm::Module& module);
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
index 14762eb4a410fc..f14ff91e330107 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <array>
 #include <cstdint>
 #include <memory>
+#include <optional>
 #include <string>
 #include <vector>
 
@@ -50,10 +51,27 @@ using ::testing::SizeIs;
 
 class IrEmissionUtilsTest : public HloHardwareIndependentTestBase {
  public:
-  TransposeSpec GetTransposeSpecFromRoot(absl::string_view hlo_text) {
+  PackedTransposeDescription GetTransposeSpecFromTransposeDescription(
+      absl::string_view hlo_text,
+      std::optional<absl::InlinedVector<int64_t, 3>> permutation = std::nullopt,
+      std::optional<absl::InlinedVector<int64_t, 3>> dimensions =
+          std::nullopt) {
     auto module = ParseAndReturnVerifiedModule(hlo_text).value();
-    auto* root = module->entry_computation()->root_instruction();
-    return GetTransposeSpec(Cast<HloTransposeInstruction>(root));
+    auto* root = Cast<HloTransposeInstruction>(
+        module->entry_computation()->root_instruction());
+
+    if (!permutation.has_value()) {
+      permutation = absl::InlinedVector<int64_t, 3>(root->dimensions().begin(),
+                                                    root->dimensions().end());
+    }
+    if (!dimensions.has_value()) {
+      dimensions = absl::InlinedVector<int64_t, 3>(
+          root->shape().dimensions().begin(), root->shape().dimensions().end());
+    }
+
+    TransposeDescription description{root, *dimensions, *permutation,
+                                     /*shmem_usage=*/0};
+    return PackedTransposeDescription(description);
   }
 };
 
@@ -1117,7 +1135,7 @@ TEST_F(IrEmissionUtilsTest, MultipleDynamicVariables) {
 }
 
 TEST_F(IrEmissionUtilsTest, Transpose_10) {
-  auto spec = GetTransposeSpecFromRoot(R"(ENTRY entry {
+  auto spec = GetTransposeSpecFromTransposeDescription(R"(ENTRY entry {
     p0 = f32[8, 32] parameter(0)
     ROOT transpose_p0 = f32[32, 8] transpose(p0), dimensions={1, 0}
   })");
@@ -1130,7 +1148,7 @@ TEST_F(IrEmissionUtilsTest, Transpose_10) {
 }
 
 TEST_F(IrEmissionUtilsTest, Transpose_210) {
-  auto spec = GetTransposeSpecFromRoot(R"(ENTRY entry {
+  auto spec = GetTransposeSpecFromTransposeDescription(R"(ENTRY entry {
     p0 = f32[8, 2, 32] parameter(0)
     ROOT transpose_p0 = f32[32, 2, 8] transpose(p0), dimensions={2, 1, 0}
   })");
@@ -1141,7 +1159,7 @@ TEST_F(IrEmissionUtilsTest, Transpose_210) {
 }
 
 TEST_F(IrEmissionUtilsTest, Transpose_102) {
-  auto spec = GetTransposeSpecFromRoot(R"(ENTRY entry {
+  auto spec = GetTransposeSpecFromTransposeDescription(R"(ENTRY entry {
     p0 = f32[8, 2, 32, 7, 6] parameter(0)
     ROOT transpose_p0 = f32[6, 32, 2, 7, 8] transpose(p0),
       dimensions={4, 2, 1, 3, 0}
@@ -1152,6 +1170,65 @@ TEST_F(IrEmissionUtilsTest, Transpose_102) {
   EXPECT_THAT(spec.canonical_inv_permutation, ElementsAre(4, 2, 1, 3, 0, 5));
 }
 
+TEST_F(IrEmissionUtilsTest,
+       PackedTransposeDescriptionUsesProvidedDims_Grouping) {
+  auto spec = GetTransposeSpecFromTransposeDescription(
+      R"(ENTRY entry {
+    p = f32[32,32,64]{2,1,0} parameter(0)
+    ROOT t = f32[64,32,32]{2,1,0} transpose(p), dimensions={2,0,1}
+  })",
+      /*permutation=*/InlinedVector({1, 0}),
+      /*dimensions=*/InlinedVector({64, 1024}));
+
+  EXPECT_THAT(spec.canonical_output_shape, ElementsAre(64, 1, 1024, 1));
+}
+
+TEST_F(IrEmissionUtilsTest, PackedTransposeDescriptionUsesProvidedDims_10) {
+  auto spec = GetTransposeSpecFromTransposeDescription(
+      R"(ENTRY entry {
+    p0 = f32[8, 4, 8] parameter(0)
+    ROOT transpose_p0 = f32[4, 8, 8] transpose(p0), dimensions={1, 2, 0}
+  })",
+      /*permutation=*/InlinedVector({1, 0}),
+      /*dimensions=*/InlinedVector({32, 8}));
+  EXPECT_THAT(spec.permutation, ElementsAre(1, 0));
+  EXPECT_THAT(spec.inv_permutation, ElementsAre(1, 0));
+  EXPECT_THAT(spec.canonical_input_shape, ElementsAre(8, 1, 32, 1));
+  EXPECT_THAT(spec.canonical_output_shape, ElementsAre(32, 1, 8, 1));
+  EXPECT_THAT(spec.canonical_permutation, ElementsAre(2, 1, 0, 3));
+  EXPECT_THAT(spec.canonical_inv_permutation, ElementsAre(2, 1, 0, 3));
+}
+
+TEST_F(IrEmissionUtilsTest, PackedTransposeDescriptionUsesProvidedDims_210) {
+  auto spec = GetTransposeSpecFromTransposeDescription(
+      R"(ENTRY entry {
+    p0 = f32[8, 2, 4, 8] parameter(0)
+    ROOT transpose_p0 = f32[4, 8, 2, 8] transpose(p0),
+      dimensions={2, 3, 1, 0}
+  })",
+      /*permutation=*/InlinedVector({2, 1, 0}),
+      /*dimensions=*/InlinedVector({32, 2, 8}));
+  EXPECT_THAT(spec.canonical_input_shape, ElementsAre(8, 2, 32, 1));
+  EXPECT_THAT(spec.canonical_output_shape, ElementsAre(32, 2, 8, 1));
+  EXPECT_THAT(spec.canonical_permutation, ElementsAre(2, 1, 0, 3));
+  EXPECT_THAT(spec.canonical_inv_permutation, ElementsAre(2, 1, 0, 3));
+}
+
+TEST_F(IrEmissionUtilsTest, PackedTransposeDescriptionUsesProvidedDims_102) {
+  auto spec = GetTransposeSpecFromTransposeDescription(
+      R"(ENTRY entry {
+    p0 = f32[8, 2, 32, 7, 2, 3] parameter(0)
+    ROOT transpose_p0 = f32[2, 3, 32, 2, 7, 8] transpose(p0),
+      dimensions={4, 5, 2, 1, 3, 0}
+  })",
+      /*permutation=*/InlinedVector({4, 2, 1, 3, 0}),
+      /*dimensions=*/InlinedVector({6, 32, 2, 7, 8}));
+  EXPECT_THAT(spec.canonical_input_shape, ElementsAre(8, 2, 32, 7, 6, 1));
+  EXPECT_THAT(spec.canonical_output_shape, ElementsAre(6, 32, 2, 7, 8, 1));
+  EXPECT_THAT(spec.canonical_permutation, ElementsAre(4, 2, 1, 3, 0, 5));
+  EXPECT_THAT(spec.canonical_inv_permutation, ElementsAre(4, 2, 1, 3, 0, 5));
+}
+
 TEST(DenseDataIntermediateTest, OwnedDataToProto) {
   const std::vector<uint8_t> data = {1, 2, 3, 4};
   DenseDataIntermediate constant = DenseDataIntermediate::Own(data);

From 7e946e16f50b385ee4bc5efaf837a6c3abc9e81b Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Tue, 16 Dec 2025 08:02:16 -0800
Subject: [PATCH 339/753] [XLA] The slice based simplification seems to be
 imprecise somehow. bail out of the transformation instead.

PiperOrigin-RevId: 845276118
---
 third_party/xla/xla/service/all_gather_simplifier.cc      | 6 +-----
 third_party/xla/xla/service/all_gather_simplifier_test.cc | 4 +---
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/service/all_gather_simplifier.cc b/third_party/xla/xla/service/all_gather_simplifier.cc
index 479784ee969a9d..f56bf656b9d72d 100644
--- a/third_party/xla/xla/service/all_gather_simplifier.cc
+++ b/third_party/xla/xla/service/all_gather_simplifier.cc
@@ -66,11 +66,7 @@ absl::StatusOr<bool> AllGatherSimplifier::RunImpl(
           HloInstruction* ds = all_gather->users().front();
           HloInstruction* ag_operand = all_gather->mutable_operand(0);
           if (!ShapeUtil::Compatible(ds->shape(), ag_operand->shape())) {
-            ag_operand = ag_operand->AddInstruction(HloInstruction::CreateSlice(
-                ds->shape(), ag_operand,
-                DimensionVector(ds->shape().dimensions().size(), 0),
-                ds->shape().dimensions(),
-                DimensionVector(ds->shape().dimensions().size(), 1)));
+            continue;
           }
           changed = true;
           TF_RETURN_IF_ERROR(ds->ReplaceAllUsesWith(ag_operand));
diff --git a/third_party/xla/xla/service/all_gather_simplifier_test.cc b/third_party/xla/xla/service/all_gather_simplifier_test.cc
index 928b749d9b2ba4..418fa8f9134763 100644
--- a/third_party/xla/xla/service/all_gather_simplifier_test.cc
+++ b/third_party/xla/xla/service/all_gather_simplifier_test.cc
@@ -80,9 +80,7 @@ test {
   AllGatherSimplifier ag_simplifier;
   auto result = ag_simplifier.Run(module.get());
   ASSERT_TRUE(result.ok()) << result.status();
-  ASSERT_TRUE(result.value());
-  EXPECT_THAT(module->entry_computation()->root_instruction(),
-              GmockMatch(m::Slice(m::Parameter(0))));
+  ASSERT_FALSE(result.value());
 }
 
 }  // namespace

From 0abb6854fd82a664852426806412fc45f412a422 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Tue, 16 Dec 2025 08:03:19 -0800
Subject: [PATCH 340/753] Update `rules_ml_toolchain` version.

PiperOrigin-RevId: 845276670
---
 third_party/xla/MODULE.bazel   | 6 +++---
 third_party/xla/WORKSPACE      | 6 +++---
 third_party/xla/workspace0.bzl | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel
index 9b372e6b9d9ad0..ae3daa05e8f78f 100644
--- a/third_party/xla/MODULE.bazel
+++ b/third_party/xla/MODULE.bazel
@@ -45,9 +45,9 @@ bazel_dep(name = "rules_ml_toolchain")
 # echo "sha256-${HASH}"
 archive_override(
     module_name = "rules_ml_toolchain",
-    integrity = "sha256-seXjBtixED5zubd438Op4GnSBmRDegMkaiNXJJYrXJQ=",
-    strip_prefix = "rules_ml_toolchain-484235be45e6843db962c45d08fe4b2b65a6a24c",
-    urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/484235be45e6843db962c45d08fe4b2b65a6a24c.tar.gz"],
+    integrity = "sha256-6YQt4/77WhINOxZH06CebnBx6N+NHNLf5vZu4x/SWV4=",
+    strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
+    urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz"],
 )
 
 # TODO: Upstream the patch?
diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE
index cf7afb583b2199..f7786569d234ab 100644
--- a/third_party/xla/WORKSPACE
+++ b/third_party/xla/WORKSPACE
@@ -9,10 +9,10 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18",
-    strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9",
+    sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
+    strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
     urls = tf_mirror_urls(
-        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
     ),
 )
 
diff --git a/third_party/xla/workspace0.bzl b/third_party/xla/workspace0.bzl
index 4e66ac333f0e23..89a7cc6454943e 100644
--- a/third_party/xla/workspace0.bzl
+++ b/third_party/xla/workspace0.bzl
@@ -140,10 +140,10 @@ def workspace():
     if "rules_ml_toolchain" not in native.existing_rules():
         tf_http_archive(
             name = "rules_ml_toolchain",
-            sha256 = "7f00b3e94bbca1a4737ded6b9ed5358f6d1c86430c2ec97c90081343c0482f18",
-            strip_prefix = "rules_ml_toolchain-29d54c875da37e74b8548924ed30e78cb28126b9",
+            sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
+            strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
             urls = tf_mirror_urls(
-                "https://github.com/google-ml-infra/rules_ml_toolchain/archive/29d54c875da37e74b8548924ed30e78cb28126b9.tar.gz",
+                "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
             ),
         )
 

From da0b8c8771ceae1a2a3a0c0f3dc5e72153a2e0fe Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Tue, 16 Dec 2025 08:31:28 -0800
Subject: [PATCH 341/753] [PJRT] Simplify and broaden the interface of
 TransposePlan.

* allow both an input striding and an input tiling. There's no reason these are mutually exclusive. If both a striding and a tiling are provided, the striding is the stride between tiles.
* use std::optional<> for input and output stridings and tilings.
* refactor test code so it supports input striding and tiling, and make a few small cleanup.

PiperOrigin-RevId: 845287645
---
 .../xla/backends/cpu/runtime/copy_thunk.cc    |   2 +-
 third_party/xla/xla/pjrt/BUILD                |  13 +-
 third_party/xla/xla/pjrt/cpu/raw_buffer.cc    |   2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc  |   2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc  |   2 +-
 .../xla/pjrt/pjrt_stream_executor_client.cc   |   2 +-
 third_party/xla/xla/pjrt/se_raw_buffer.cc     |   2 +-
 third_party/xla/xla/pjrt/transpose.cc         | 171 +++++++------
 third_party/xla/xla/pjrt/transpose.h          |  35 ++-
 third_party/xla/xla/pjrt/transpose_test.cc    | 229 ++++++++++++------
 third_party/xla/xla/python/version.h          |   3 +-
 11 files changed, 295 insertions(+), 168 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/runtime/copy_thunk.cc b/third_party/xla/xla/backends/cpu/runtime/copy_thunk.cc
index 106d945ddc3981..129a2a7dd32df3 100644
--- a/third_party/xla/xla/backends/cpu/runtime/copy_thunk.cc
+++ b/third_party/xla/xla/backends/cpu/runtime/copy_thunk.cc
@@ -84,7 +84,7 @@ CopyThunk::CopyThunk(Info info, BufferAllocation::Slice src_buffer,
 
     auto byte_strides = ShapeUtil::ByteStrides(src_shape_);
     CHECK(byte_strides.has_value());
-    options.input_layout = TransposePlan::Striding{*byte_strides};
+    options.input_striding = TransposePlan::Striding{*byte_strides};
 
     absl::InlinedVector<int64_t, 4> permutation(options.dims.size());
     absl::c_reverse_copy(dst_shape_.layout().minor_to_major(),
diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index 1821bf0aff7057..a0bdf45320c23e 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -951,6 +951,7 @@ cc_library(
         "//xla:ef57",
         "//xla:permutation_util",
         "//xla:util",
+        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
@@ -975,24 +976,26 @@ xla_cc_test(
         ":transpose",
         "//xla:array",
         "//xla:permutation_util",
-        "//xla:shape_util",
         "//xla:util",
         "//xla/hlo/testlib:test",
         "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:env",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "//xla/tsl/platform:test_benchmark",
+        "//xla/tsl/platform:test_main",
         "//xla/tsl/protobuf:error_codes_proto_impl_cc",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/numeric:int128",
+        "@com_google_absl//absl/random",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
         "@eigen_archive//:eigen3",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test_benchmark",
-        "@local_tsl//tsl/platform:test_main",
     ],
 )
 
diff --git a/third_party/xla/xla/pjrt/cpu/raw_buffer.cc b/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
index ab453cceaba7c4..04a00dc9b7d776 100644
--- a/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
@@ -232,7 +232,7 @@ CpuRawBuffer::CopyFromHostBuffer(
       options.dims = dims;
       options.permutation = permutation;
       if (byte_strides) {
-        options.input_layout = TransposePlan::Striding{*byte_strides};
+        options.input_striding = TransposePlan::Striding{*byte_strides};
       }
       if (thread_pool) {
         options.num_threads =
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
index 1aceca82c59b84..40e62765ffd351 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
@@ -384,7 +384,7 @@ Future<> TfrtGpuBuffer::ToLiteralHelper(Future<MutableLiteralBase*> literal) {
                 primitive_util::ByteWidth(on_device_shape.element_type());
             options.dims = on_device_shape.dimensions();
             options.permutation = permutation;
-            options.input_layout = TransposePlan::Striding{byte_strides};
+            options.input_striding = TransposePlan::Striding{byte_strides};
             {
               absl::MutexLock lock(client->transpose_mu_);
               absl::StatusOr<std::shared_ptr<TransposePlan>> t =
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
index 377d27976a6f87..14c44cf1c53925 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
@@ -887,7 +887,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtGpuClient::BufferFromHostBuffer(
     options.elem_size_in_bytes = primitive_util::ByteWidth(type);
     options.dims = dims;
     options.permutation = permutation;
-    options.input_layout = TransposePlan::Striding{*byte_strides};
+    options.input_striding = TransposePlan::Striding{*byte_strides};
     absl::MutexLock lock(transpose_mu_);
     TF_ASSIGN_OR_RETURN(transpose, transpose_cache_.GetOrCreate(options));
   }
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index 391ed56fc386c0..b47aaf7f446ea0 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -644,7 +644,7 @@ PjRtStreamExecutorClient::LinearizeHostBufferInto(
     options.elem_size_in_bytes = primitive_util::ByteWidth(type);
     options.dims = dims;
     options.permutation = permutation;
-    options.input_layout = TransposePlan::Striding{*byte_strides};
+    options.input_striding = TransposePlan::Striding{*byte_strides};
     absl::MutexLock lock(transpose_mu_);
     TF_ASSIGN_OR_RETURN(transpose, transpose_cache_.GetOrCreate(options));
   }
diff --git a/third_party/xla/xla/pjrt/se_raw_buffer.cc b/third_party/xla/xla/pjrt/se_raw_buffer.cc
index 95a5e499f8e3bc..641e892394bfa8 100644
--- a/third_party/xla/xla/pjrt/se_raw_buffer.cc
+++ b/third_party/xla/xla/pjrt/se_raw_buffer.cc
@@ -308,7 +308,7 @@ void PjRtStreamExecutorRawBuffer::CopyToLiteralAsync(
                 primitive_util::ByteWidth(on_device_shape.element_type());
             options.dims = on_device_shape.dimensions();
             options.permutation = permutation;
-            options.input_layout = TransposePlan::Striding{byte_strides};
+            options.input_striding = TransposePlan::Striding{byte_strides};
             {
               absl::MutexLock lock(client->transpose_mu_);
               absl::StatusOr<std::shared_ptr<TransposePlan>> t =
diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index 734f05b3d9c50c..ab20440d7105d8 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -98,6 +98,7 @@ limitations under the License.
 #include "xla/ef57.h"
 #include "xla/permutation_util.h"
 #include "xla/pjrt/transpose_kernels.h"
+#include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/logging.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
@@ -682,9 +683,13 @@ int64_t TransposePlan::OutputNumElems() const {
 
 // Parses and validates a tiling specification, and populates `tiling`.
 static absl::Status ParseTilingSpecification(
-    int ndim, absl::Span<int64_t const> tiling_spec,
+    int ndim, const std::optional<TransposePlan::Tiling>& tiling_opt,
     absl::InlinedVector<int64_t, 4>& tiling) {
   tiling.resize(ndim, 1);
+  if (!tiling_opt) {
+    return absl::OkStatus();
+  }
+  absl::Span<int64_t const> tiling_spec = tiling_opt->tiling;
   if (tiling_spec.size() > ndim) {
     return InvalidArgument(
         "Tiling (%s) must have at most as many dimensions as the array (%d)",
@@ -909,7 +914,15 @@ void TransposePlan::BuildPlanNodes(
 }
 
 absl::StatusOr<std::unique_ptr<TransposePlan>> TransposePlan::Create(
-    const Options& o) {
+    Options o) {
+  if (o.input_layout.has_value()) {
+    if (const auto* t = std::get_if<Tiling>(&*o.input_layout)) {
+      o.input_tiling = *t;
+    } else if (const auto* s = std::get_if<Striding>(&*o.input_layout)) {
+      o.input_striding = *s;
+    }
+  }
+
   auto is_negative = [](int64_t d) { return d < 0; };
   if (absl::c_find_if(o.dims, is_negative) != o.dims.end()) {
     return InvalidArgument("dims must be non-negative, got %s",
@@ -952,68 +965,82 @@ absl::StatusOr<std::unique_ptr<TransposePlan>> TransposePlan::Create(
   plan->original_b_dims_ = Permute(o.dims, o.permutation);
 
   TF_RETURN_IF_ERROR(
-      ParseTilingSpecification(ndim, o.output_tiling.tiling, plan->b_tiling_));
+      ParseTilingSpecification(ndim, o.output_tiling, plan->b_tiling_));
 
-  // Handles strides.
-  if (std::holds_alternative<Striding>(o.input_layout)) {
+  // Temporary vectors to hold un-permuted attributes
+  absl::InlinedVector<int64_t, 4> temp_lda, temp_lda_tile, temp_a_tiling;
+
+  // Parse the tile and stride specifications.
+  TF_RETURN_IF_ERROR(
+      ParseTilingSpecification(ndim, o.input_tiling, temp_a_tiling));
+  ComputeStrides(plan->elem_size_in_bytes_, o.dims, temp_a_tiling, temp_lda,
+                 temp_lda_tile);
+
+  // Determine tile (outer) strides
+  absl::InlinedVector<int64_t, 4> input_outer_strides;
+  if (o.input_striding) {
     absl::Span<int64_t const> input_strides_in_bytes =
-        std::get<Striding>(o.input_layout).strides_in_bytes;
+        o.input_striding->strides_in_bytes;
     if (input_strides_in_bytes.size() != o.dims.size()) {
       return InvalidArgument(
-          "dims and input_strides_in_bytes must have equal sizes, got %d "
-          "and %d",
+          "dims and input_striding must have equal sizes, "
+          "got %d and %d",
           o.dims.size(), input_strides_in_bytes.size());
     }
+    input_outer_strides.assign(input_strides_in_bytes.begin(),
+                               input_strides_in_bytes.end());
+    // Also save original strides if explicit
     plan->original_a_strides_.resize(ndim);
     absl::c_copy(input_strides_in_bytes, plan->original_a_strides_.begin());
-    // Sort the dimensions from slowest-varying (largest strides) to
-    // fastest-varying (smallest strides).
-    std::vector<int64_t> dim_order(ndim);
-    absl::c_iota(dim_order, 0);
-
-    auto cost = [&](int k) {
-      int64_t stride = input_strides_in_bytes.at(k);
-      // If there is a dimension with size equal to the element size, sort it
-      // last. This ensures that we place any stride-1 dimension last.
-      bool is_stride1 = stride == o.elem_size_in_bytes;
-      // If there are multiple stride-1 dimensions, we'd prefer the one that
-      // matches the stride-1 dimension of the output.
-      // Failing that, we'd just prefer the largest stride-1 dimension last.
-      bool is_trailing_dim_in_b = o.permutation.back() == k;
-
-      // If we are applying ef57 conversion, we want a size-2 stride-1
-      // dimension last.
-      bool ef57_even =
-          (is_stride1 && o.transformation == Transformation::kF64ToEf57 &&
-           o.dims[k] == 2);
-
-      return std::make_tuple(is_stride1, -std::abs(stride), ef57_even,
-                             is_trailing_dim_in_b, o.dims[k]);
-    };
-    absl::c_stable_sort(dim_order,
-                        [&cost](int i, int j) { return cost(i) < cost(j); });
-    // dim_order maps new input dim -> old input dim, we need its inverse to
-    // compute the new permutation.
-    auto inv_dim_order = InversePermutation(dim_order);
-    plan->lda_.reserve(ndim);
-    plan->a_dims_.reserve(ndim);
-    plan->permutation_.reserve(ndim);
-    for (int i = 0; i < ndim; ++i) {
-      plan->lda_.push_back(input_strides_in_bytes.at(dim_order[i]));
-      plan->a_dims_.push_back(o.dims[dim_order[i]]);
-      plan->permutation_.push_back(inv_dim_order[o.permutation[i]]);
-    }
-    plan->lda_tile_.resize(ndim, 1);
-    plan->a_tiling_.resize(ndim, 1);
   } else {
-    TF_RETURN_IF_ERROR(ParseTilingSpecification(
-        ndim, std::get<Tiling>(o.input_layout).tiling, plan->a_tiling_));
-
-    plan->a_dims_ = plan->original_a_dims_;
-    plan->permutation_.resize(ndim);
-    absl::c_copy(o.permutation, plan->permutation_.begin());
-    ComputeStrides(plan->elem_size_in_bytes_, plan->a_dims_, plan->a_tiling_,
-                   plan->lda_, plan->lda_tile_);
+    input_outer_strides = temp_lda;
+  }
+
+  // Sort the dimensions from slowest-varying (largest strides) to
+  // fastest-varying (smallest strides).
+  // Maps new input dim -> old input dim
+  std::vector<int64_t> dim_order(ndim);
+  absl::c_iota(dim_order, 0);
+
+  auto cost = [&](int k) {
+    int64_t stride = input_outer_strides.at(k);
+    // If there is a dimension with size equal to the element size, sort it
+    // last. This ensures that we place any stride-1 dimension last.
+    bool is_stride1 = stride == o.elem_size_in_bytes;
+    // If there are multiple stride-1 dimensions, we'd prefer the one that
+    // matches the stride-1 dimension of the output.
+    // Failing that, we'd just prefer the largest stride-1 dimension last.
+    bool is_trailing_dim_in_b = o.permutation.back() == k;
+
+    // If we are applying ef57 conversion, we want a size-2 stride-1
+    // dimension last.
+    bool ef57_even =
+        (is_stride1 && o.transformation == Transformation::kF64ToEf57 &&
+         o.dims[k] == 2);
+
+    return std::make_tuple(is_stride1, -std::abs(stride), ef57_even,
+                           is_trailing_dim_in_b, o.dims[k]);
+  };
+  absl::c_stable_sort(dim_order,
+                      [&cost](int i, int j) { return cost(i) < cost(j); });
+
+  // Apply permutation to all plan attributes
+  // dim_order maps new input dim -> old input dim, we need its inverse to
+  // compute the new permutation.
+  auto inv_dim_order = InversePermutation(dim_order);
+  plan->lda_.reserve(ndim);
+  plan->lda_tile_.reserve(ndim);
+  plan->a_dims_.reserve(ndim);
+  plan->permutation_.reserve(ndim);
+  plan->a_tiling_.reserve(ndim);
+
+  for (int i = 0; i < ndim; ++i) {
+    int old_idx = dim_order[i];
+    plan->lda_.push_back(input_outer_strides.at(old_idx));
+    plan->lda_tile_.push_back(temp_lda_tile.at(old_idx));
+    plan->a_dims_.push_back(o.dims[old_idx]);
+    plan->permutation_.push_back(inv_dim_order[o.permutation[i]]);
+    plan->a_tiling_.push_back(temp_a_tiling[old_idx]);
   }
 
   auto is_not_one = [](int64_t x) { return x != 1; };
@@ -1354,8 +1381,8 @@ bool TransposePlanCacheKey::operator==(
     const TransposePlanCacheKey& other) const {
   return elem_size_in_bytes == other.elem_size_in_bytes && dims == other.dims &&
          permutation == other.permutation &&
-         input_layout_is_tiling == other.input_layout_is_tiling &&
-         input_layout == other.input_layout &&
+         input_tiling == other.input_tiling &&
+         input_striding == other.input_striding &&
          output_tiling == other.output_tiling &&
          transformation == other.transformation &&
          num_threads == other.num_threads;
@@ -1363,10 +1390,9 @@ bool TransposePlanCacheKey::operator==(
 
 template <typename H>
 H AbslHashValue(H h, const TransposePlanCacheKey& key) {
-  return H::combine(std::move(h), key.elem_size_in_bytes,
-                    key.input_layout_is_tiling, key.num_threads,
+  return H::combine(std::move(h), key.elem_size_in_bytes, key.num_threads,
                     key.transformation, key.dims, key.permutation,
-                    key.input_layout, key.output_tiling);
+                    key.input_tiling, key.input_striding, key.output_tiling);
 }
 
 TransposePlanCache::TransposePlanCache(int capacity)
@@ -1382,21 +1408,18 @@ absl::StatusOr<std::shared_ptr<TransposePlan>> TransposePlanCache::GetOrCreate(
   absl::c_copy(o.dims, key.dims.begin());
   key.permutation.resize(o.permutation.size());
   absl::c_copy(o.permutation, key.permutation.begin());
-  if (std::holds_alternative<TransposePlan::Striding>(o.input_layout)) {
-    absl::Span<int64_t const> input_strides_in_bytes =
-        std::get<TransposePlan::Striding>(o.input_layout).strides_in_bytes;
-    key.input_layout = absl::InlinedVector<int64_t, 4>(
-        input_strides_in_bytes.begin(), input_strides_in_bytes.end());
-    key.input_layout_is_tiling = false;
-  } else {
-    absl::Span<int64_t const> input_tiling =
-        std::get<TransposePlan::Tiling>(o.input_layout).tiling;
-    key.input_layout = absl::InlinedVector<int64_t, 4>(input_tiling.begin(),
-                                                       input_tiling.end());
-    key.input_layout_is_tiling = true;
+  if (o.input_tiling) {
+    key.input_tiling.emplace(o.input_tiling->tiling.begin(),
+                             o.input_tiling->tiling.end());
+  }
+  if (o.input_striding) {
+    key.input_striding.emplace(o.input_striding->strides_in_bytes.begin(),
+                               o.input_striding->strides_in_bytes.end());
+  }
+  if (o.output_tiling) {
+    key.output_tiling.emplace(o.output_tiling->tiling.begin(),
+                              o.output_tiling->tiling.end());
   }
-  key.output_tiling.resize(o.output_tiling.tiling.size());
-  absl::c_copy(o.output_tiling.tiling, key.output_tiling.begin());
   key.transformation = o.transformation;
   key.num_threads = o.num_threads;
   return cache_.GetOrCreateIfAbsent(
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index 714db857b7da2d..975e2bccc22c0c 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -50,7 +50,9 @@ class TransposePlan {
   // dims: the input shape, in elements.
   // permutation: for each output dimension, gives the number of the
   //   corresponding input dimension. Must be a permutation of [0..dims.size())
-  // input_layout: either byte strides or an input tiling.
+  // input_tiling: optional input tiling.
+  // input_striding: optional input byte strides.
+  // output_tiling: optional output tiling.
   //
   // A Striding represents the strides of the input array in bytes. (N.B. not
   // elements).
@@ -71,7 +73,9 @@ class TransposePlan {
   // tiled dimensions. This is acceptable because in the intended use case for
   // this code we expect at most 2 tiled dimensions on input and output.
   //
-  // The input may have either a striding or a tiling but not both.
+  // The input may have both a tiling and a striding. If both are present,
+  // the striding determines the strides between tiles (in bytes).
+  //
   //
   // num_threads: is the number of threads requested. The actual number of
   //   threads used may be smaller if there isn't enough work per thread.
@@ -94,14 +98,19 @@ class TransposePlan {
     size_t elem_size_in_bytes;
     absl::Span<int64_t const> dims;
     absl::Span<int64_t const> permutation;
-    std::variant<Tiling, Striding> input_layout = Tiling{};
-    Tiling output_tiling;
+    std::optional<Tiling> input_tiling = std::nullopt;
+    std::optional<Striding> input_striding = std::nullopt;
+    std::optional<Tiling> output_tiling = std::nullopt;
     Transformation transformation = Transformation::kNone;
     int num_threads = 1;
+
+    // DEPRECATED: Use input_tiling or input_striding instead.
+    // This field is only present for backward compatibility.
+    // TODO(phawkins): remove me.
+    std::optional<std::variant<Tiling, Striding>> input_layout = std::nullopt;
   };
 
-  static absl::StatusOr<std::unique_ptr<TransposePlan>> Create(
-      const Options& options);
+  static absl::StatusOr<std::unique_ptr<TransposePlan>> Create(Options options);
 
   TransposePlan();
   ~TransposePlan();
@@ -199,10 +208,10 @@ class TransposePlan {
   absl::InlinedVector<int64_t, 4> permutation_;
 
   // Leading-dimension sizes (byte strides) of each dimension.
-  absl::InlinedVector<int64_t, 4> lda_;
-  absl::InlinedVector<int64_t, 4> lda_tile_;
-  absl::InlinedVector<int64_t, 4> ldb_;
-  absl::InlinedVector<int64_t, 4> ldb_tile_;
+  absl::InlinedVector<int64_t, 4> lda_;       // Strides for tiles
+  absl::InlinedVector<int64_t, 4> lda_tile_;  // Strides for tile interiors
+  absl::InlinedVector<int64_t, 4> ldb_;       // Strides for tiles
+  absl::InlinedVector<int64_t, 4> ldb_tile_;  // Strides for tile interiors
 
   // Tile sizes in each dimension. Has size equal to the number of dimensions.
   // A 1 entry means that dimension is not tiled.
@@ -257,9 +266,9 @@ struct TransposePlanCacheKey {
   size_t elem_size_in_bytes;
   absl::InlinedVector<int64_t, 4> dims;
   absl::InlinedVector<int64_t, 4> permutation;
-  bool input_layout_is_tiling;
-  absl::InlinedVector<int64_t, 4> input_layout;
-  absl::InlinedVector<int64_t, 4> output_tiling;
+  std::optional<absl::InlinedVector<int64_t, 4>> input_tiling;
+  std::optional<absl::InlinedVector<int64_t, 4>> input_striding;
+  std::optional<absl::InlinedVector<int64_t, 4>> output_tiling;
   TransposePlan::Transformation transformation;
   int num_threads;
 
diff --git a/third_party/xla/xla/pjrt/transpose_test.cc b/third_party/xla/xla/pjrt/transpose_test.cc
index dc16d7b01073d9..4ba51f5bcff8f5 100644
--- a/third_party/xla/xla/pjrt/transpose_test.cc
+++ b/third_party/xla/xla/pjrt/transpose_test.cc
@@ -23,14 +23,17 @@ limitations under the License.
 #include <ostream>
 #include <string>
 #include <tuple>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/base/casts.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/numeric/int128.h"
+#include "absl/random/random.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
@@ -39,13 +42,13 @@ limitations under the License.
 #include "xla/array.h"
 #include "xla/hlo/testlib/test.h"
 #include "xla/permutation_util.h"
-#include "xla/shape_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/platform/test_benchmark.h"
+#include "xla/tsl/platform/threadpool.h"
 #include "xla/tsl/protobuf/error_codes.pb.h"
 #include "xla/util.h"
-#include "tsl/platform/statusor.h"
-#include "tsl/platform/test_benchmark.h"
-#include "tsl/platform/threadpool.h"
 
 namespace xla {
 
@@ -138,7 +141,7 @@ TEST(TransposeTest, InvalidTilings) {
   options.permutation = perm;
   std::vector<int64_t> input_tiling = {8, 128};
   std::vector<int64_t> output_tiling = {4};
-  options.input_layout = TransposePlan::Tiling{input_tiling};
+  options.input_tiling = TransposePlan::Tiling{input_tiling};
   options.output_tiling = TransposePlan::Tiling{output_tiling};
   auto plan = TransposePlan::Create(options);
   EXPECT_EQ(plan.status().code(), tsl::error::UNIMPLEMENTED);
@@ -156,8 +159,6 @@ TEST(TransposeTest, LargeDimensions) {
   options.elem_size_in_bytes = 8;
   options.dims = dims;
   options.permutation = permutation;
-  options.input_layout = TransposePlan::Tiling{};
-  options.output_tiling = TransposePlan::Tiling{};
   options.transformation = TransposePlan::Transformation::kNone;
   TF_EXPECT_OK(TransposePlan::Create(options).status());
 }
@@ -192,52 +193,60 @@ bool BumpIndices(absl::Span<int64_t const> shape, absl::Span<int64_t> indices) {
   return false;
 }
 
+// Helper to pad tiling to match shape rank. (Suffix alignment).
+std::vector<int64_t> PadTiling(absl::Span<int64_t const> shape,
+                               absl::Span<int64_t const> tiling) {
+  CHECK_LE(tiling.size(), shape.size());
+  std::vector<int64_t> full_tiling(shape.size(), 1);
+  absl::c_copy(tiling, full_tiling.end() - tiling.size());
+  return full_tiling;
+}
+
+std::vector<int64_t> ComputeDefaultStrides(
+    absl::Span<int64_t const> shape, absl::Span<int64_t const> full_tiling,
+    int elem_size_bytes) {
+  CHECK_EQ(full_tiling.size(), shape.size());
+  std::vector<int64_t> strides(shape.size());
+  int64_t stride = elem_size_bytes;
+  for (int64_t t : full_tiling) {
+    stride *= t;
+  }
+
+  for (int i = shape.size() - 1; i >= 0; --i) {
+    strides[i] = stride;
+    stride *= CeilOfRatio(shape[i], full_tiling[i]);
+  }
+  return strides;
+}
+
 // Converts a multidimensional index `indices` into an array with `shape` and
 // tiling `tiling` into a linear offset into a buffer.
+// `striding` is the stride between tiles in bytes.
 int64_t IndexToLinearIndex(absl::Span<int64_t const> shape,
-                           absl::Span<int64_t const> tiling,
-                           absl::Span<int64_t const> indices) {
-  CHECK_LE(tiling.size(), shape.size());
+                           absl::Span<int64_t const> full_tiling,
+                           absl::Span<int64_t const> indices,
+                           absl::Span<int64_t const> striding,
+                           int elem_size_bytes) {
+  CHECK_EQ(full_tiling.size(), shape.size());
   CHECK_EQ(shape.size(), indices.size());
+  CHECK_EQ(shape.size(), striding.size());
+
   int64_t stride = 1;
   int64_t offset = 0;
 
-  auto index_it = indices.rbegin();
-  auto tile_it = tiling.rbegin();
-  for (; tile_it != tiling.rend(); ++index_it, ++tile_it) {
-    offset += (*index_it % *tile_it) * stride;
-    stride *= *tile_it;
-  }
-  index_it = indices.rbegin();
-  tile_it = tiling.rbegin();
-  auto shape_it = shape.rbegin();
-  for (; tile_it != tiling.rend(); ++index_it, ++shape_it, ++tile_it) {
-    offset += (*index_it / *tile_it) * stride;
-    stride *= CeilOfRatio(*shape_it, *tile_it);
+  // Strides within a tiling are always the default strides.
+  for (int i = shape.size() - 1; i >= 0; --i) {
+    offset += (indices[i] % full_tiling[i]) * stride;
+    stride *= full_tiling[i];
   }
-  for (; shape_it != shape.rend(); ++index_it, ++shape_it) {
-    offset += *index_it * stride;
-    stride *= *shape_it;
+  // Strides outside a tiling are the input strides.
+  for (size_t i = 0; i < shape.size(); ++i) {
+    int64_t outer_idx = indices[i] / full_tiling[i];
+    offset += outer_idx * (striding[i] / elem_size_bytes);
   }
   return offset;
 }
 
-// Slow reference code that converts an array from an untiled layout into a
-// tiled layout.
-template <typename T>
-std::vector<T> TileArray(const Array<T>& in, absl::Span<int64_t const> tiling) {
-  std::vector<T> out(SizeOfTiledArray(in.dimensions(), tiling), -1);
-  if (in.num_elements() == 0) {
-    return out;
-  }
-  std::vector<int64_t> indices(in.num_dimensions(), 0);
-  do {
-    int64_t i = IndexToLinearIndex(in.dimensions(), tiling, indices);
-    out.at(i) = in(indices);
-  } while (BumpIndices(in.dimensions(), absl::MakeSpan(indices)));
-  return out;
-}
-
 // Reference implementation: transpose using Eigen.
 template <typename T, int NDIMS>
 void TransposeUsingEigenNd(const T* input, T* output,
@@ -291,25 +300,70 @@ void TransposeUsingEigen(const T* input, T* output,
   }
 }
 
+template <typename T>
+void FillRandom(absl::Span<T> input) {
+  absl::BitGen gen;
+  for (auto& val : input) {
+    if constexpr (std::is_same_v<T, absl::int128>) {
+      val = absl::MakeInt128(absl::Uniform<uint64_t>(gen),
+                             absl::Uniform<uint64_t>(gen));
+    } else {
+      using U = std::make_unsigned_t<T>;
+      val = absl::bit_cast<T>(absl::Uniform<U>(gen));
+    }
+  }
+}
+
+// Reference implementation of transpose that handles tiling and striding.
+template <typename T>
+void ReferenceTranspose(absl::Span<int64_t const> dims,
+                        absl::Span<int64_t const> permutation,
+                        absl::Span<int64_t const> input_tiling,
+                        absl::Span<int64_t const> input_striding,
+                        absl::Span<int64_t const> output_tiling,
+                        absl::Span<int64_t const> output_striding,
+                        absl::Span<const T> input, absl::Span<T> output) {
+  std::vector<int64_t> output_dims = Permute(dims, permutation);
+  std::vector<int64_t> indices(dims.size(), 0);
+  std::vector<int64_t> output_indices(dims.size());
+
+  do {
+    int64_t input_linear_idx = IndexToLinearIndex(dims, input_tiling, indices,
+                                                  input_striding, sizeof(T));
+    T val = input[input_linear_idx];
+
+    for (size_t i = 0; i < dims.size(); ++i) {
+      output_indices[i] = indices[permutation[i]];
+    }
+    int64_t output_linear_idx = IndexToLinearIndex(
+        output_dims, output_tiling, output_indices, output_striding, sizeof(T));
+    output[output_linear_idx] = val;
+  } while (BumpIndices(dims, absl::MakeSpan(indices)));
+}
+
 struct TransposeTestCase {
   TransposeTestCase(std::vector<int64_t> dims, std::vector<int64_t> permutation,
                     std::vector<int64_t> input_tiling = {},
-                    std::vector<int64_t> output_tiling = {})
+                    std::vector<int64_t> output_tiling = {},
+                    std::vector<int64_t> input_striding = {})
       : dims(std::move(dims)),
         permutation(std::move(permutation)),
         input_tiling(std::move(input_tiling)),
+        input_striding(std::move(input_striding)),
         output_tiling(std::move(output_tiling)) {}
 
   std::vector<int64_t> dims;
   std::vector<int64_t> permutation;
   std::vector<int64_t> input_tiling;
+  std::vector<int64_t> input_striding;
   std::vector<int64_t> output_tiling;
 
   std::string ToString() const {
     return absl::StrFormat(
-        "[%s],perm=[%s],tiling=[%s]/[%s]", absl::StrJoin(dims, ","),
-        absl::StrJoin(permutation, ","), absl::StrJoin(input_tiling, ","),
-        absl::StrJoin(output_tiling, ","));
+        "[%s],perm=[%s],tiling=[%s]/[%s],striding=[%s]",
+        absl::StrJoin(dims, ","), absl::StrJoin(permutation, ","),
+        absl::StrJoin(input_tiling, ","), absl::StrJoin(output_tiling, ","),
+        input_striding.empty() ? "none" : absl::StrJoin(input_striding, ","));
   }
 };
 
@@ -320,6 +374,7 @@ std::ostream& operator<<(std::ostream& os, const TransposeTestCase& test) {
 
 std::vector<TransposeTestCase> GetTransposeTestCases() {
   std::vector<TransposeTestCase> cases = {
+      TransposeTestCase(/*dims=*/{}, /*permutation=*/{}),
       TransposeTestCase(/*dims=*/{1}, /*permutation=*/{0}),
       TransposeTestCase(/*dims=*/{4}, /*permutation=*/{0}),
       TransposeTestCase(/*dims=*/{27}, /*permutation=*/{0}),
@@ -376,6 +431,12 @@ std::vector<TransposeTestCase> GetTransposeTestCases() {
                         /*input_tiling=*/{2, 4}),
       TransposeTestCase(/*dims=*/{12, 7}, /*permutation=*/{1, 0},
                         /*input_tiling=*/{}, /*output_tiling=*/{5, 2}),
+      TransposeTestCase(/*dims=*/{4, 6}, /*permutation=*/{1, 0},
+                        /*input_tiling=*/{2, 3},
+                        /*output_tiling=*/{}, /*input_striding=*/{512, 128}),
+      TransposeTestCase(/*dims=*/{13, 9}, /*permutation=*/{1, 0},
+                        /*input_tiling=*/{2, 3},
+                        /*output_tiling=*/{}, /*input_striding=*/{0, 0}),
       TransposeTestCase(/*dims=*/{128, 224, 224, 3},
                         /*permutation=*/{3, 1, 2, 0},
                         /*input_tiling=*/{},
@@ -396,29 +457,63 @@ class TransposeTest : public ::testing::TestWithParam<TransposeTestCase> {
     options.elem_size_in_bytes = sizeof(T);
     options.dims = test.dims;
     options.permutation = test.permutation;
-    options.input_layout = TransposePlan::Tiling{test.input_tiling};
-    options.output_tiling = TransposePlan::Tiling{test.output_tiling};
+    if (!test.input_striding.empty()) {
+      options.input_striding = TransposePlan::Striding{test.input_striding};
+    }
+    if (!test.input_tiling.empty()) {
+      options.input_tiling = TransposePlan::Tiling{test.input_tiling};
+    }
+    if (!test.output_tiling.empty()) {
+      options.output_tiling = TransposePlan::Tiling{test.output_tiling};
+    }
     options.transformation = TransposePlan::Transformation::kNone;
     options.num_threads = parallelism;
     TF_ASSERT_OK_AND_ASSIGN(auto plan, TransposePlan::Create(options));
     VLOG(1) << plan->ToString();
-    xla::Array<T> untiled_input(test.dims);
-    untiled_input.FillIota(0);
-    xla::Array<T> expected_untiled_output(output_dims);
-    TransposeUsingEigen(untiled_input.data(), expected_untiled_output.data(),
-                        test.dims, output_dims, test.permutation);
-
-    auto tiled_input = TileArray(untiled_input, test.input_tiling);
-    auto expected_tiled_output =
-        TileArray(expected_untiled_output, test.output_tiling);
-
-    std::vector<T> output(
-        SizeOfTiledArray(plan->OutputDims(), test.output_tiling), -1);
-    plan->Execute(
-        tiled_input.data(), output.data(),
-        [&](std::function<void()> fn) { threadpool.Schedule(std::move(fn)); });
-
-    EXPECT_EQ(expected_tiled_output, output);
+
+    // Allocate sufficiently large buffers.
+    // We can use SizeOfTiledArray for output which is always tiled/dense.
+    int64_t output_size =
+        SizeOfTiledArray(plan->OutputDims(), test.output_tiling);
+    std::vector<T> output(output_size, -1);
+
+    std::vector<int64_t> input_striding = test.input_striding;
+    std::vector<int64_t> input_tiling = PadTiling(test.dims, test.input_tiling);
+    if (input_striding.empty()) {
+      input_striding =
+          ComputeDefaultStrides(test.dims, input_tiling, sizeof(T));
+    }
+    int64_t input_tile_size = absl::c_accumulate(input_tiling, int64_t{1},
+                                                 std::multiplies<int64_t>());
+
+    std::vector<int64_t> output_tiling =
+        PadTiling(output_dims, test.output_tiling);
+    std::vector<int64_t> output_striding =
+        ComputeDefaultStrides(output_dims, output_tiling, sizeof(T));
+
+    int64_t input_size = 1;
+    if (!test.dims.empty()) {
+      std::vector<int64_t> max_indices = test.dims;
+      for (int i = 0; i < test.dims.size(); ++i) {
+        max_indices[i] = RoundDownTo(max_indices[i], input_tiling[i]);
+      }
+      input_size = IndexToLinearIndex(test.dims, input_tiling, max_indices,
+                                      input_striding, sizeof(T)) +
+                   input_tile_size;
+    }
+    std::vector<T> input(input_size);
+    FillRandom<T>(absl::MakeSpan(input));
+    std::vector<T> expected_output(output_size, -1);
+
+    ReferenceTranspose<T>(test.dims, test.permutation, input_tiling,
+                          input_striding, output_tiling, output_striding, input,
+                          absl::MakeSpan(expected_output));
+
+    plan->Execute(input.data(), output.data(), [&](std::function<void()> fn) {
+      threadpool.Schedule(std::move(fn));
+    });
+
+    EXPECT_EQ(output, expected_output);
   }
 };
 
@@ -448,7 +543,7 @@ TEST(TransposeTest, NegativeStrides1D) {
   options.dims = dims;
   options.permutation = permutation;
   std::vector<int64_t> strides = {-int64_t{sizeof(int32_t)}};
-  options.input_layout = TransposePlan::Striding{strides};
+  options.input_striding = TransposePlan::Striding{strides};
   TF_ASSERT_OK_AND_ASSIGN(auto plan, TransposePlan::Create(options));
   plan->Execute(input.data() + (n - 1), output.data());
   EXPECT_EQ(expected, output);
@@ -475,7 +570,7 @@ TEST(TransposeTest, NegativeStrides2D) {
   options.permutation = permutation;
   std::vector<int64_t> strides = {4 * sizeof(int16_t),
                                   -int64_t{sizeof(int16_t)}};
-  options.input_layout = TransposePlan::Striding{strides};
+  options.input_striding = TransposePlan::Striding{strides};
   TF_ASSERT_OK_AND_ASSIGN(auto plan, TransposePlan::Create(options));
   plan->Execute(input.data() + 3, output.data());
   EXPECT_EQ(expected, output);
@@ -546,8 +641,6 @@ void BM_Transpose(const TransposeTestCase& bm, int parallelism,
   options.elem_size_in_bytes = sizeof(T);
   options.dims = bm.dims;
   options.permutation = bm.permutation;
-  options.input_layout = TransposePlan::Tiling{};
-  options.output_tiling = TransposePlan::Tiling{};
   options.transformation = TransposePlan::Transformation::kNone;
   options.num_threads = parallelism;
   TF_ASSERT_OK_AND_ASSIGN(auto plan, TransposePlan::Create(options));
diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 8c3c21a0f9f178..6ee37c65188467 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER \
-  42  // PjRtExecutable is created using IFRT Compiler::Compile() API.
+#define JAX_IFRT_VERSION_NUMBER 43  // Transpose API update
 
 #endif  // XLA_PYTHON_VERSION_H_

From 6e338a3da3768588fe69705b46a06d6e8fb56211 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Tue, 16 Dec 2025 10:15:27 -0800
Subject: [PATCH 342/753] Migrate `hlo_reduce_test` to PjRt

PiperOrigin-RevId: 845329928
---
 third_party/xla/xla/tests/BUILD              | 10 +++++----
 third_party/xla/xla/tests/reduce_hlo_test.cc | 22 ++++++++++----------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 18becb5c2cb638..77955f587ee64c 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -2212,20 +2212,22 @@ xla_test(
         "cpu",
         "interpreter",
     ],
+    tags = ["test_migrated_to_hlo_runner_pjrt"],
     deps = [
-        ":hlo_test_base",
-        ":test_utils",
+        ":hlo_pjrt_interpreter_reference_mixin",
+        ":hlo_pjrt_test_base",
         ":xla_internal_test_main",
         "//xla:error_spec",
         "//xla:literal",
         "//xla:literal_util",
+        "//xla:shape_layout",
         "//xla:shape_util",
         "//xla/hlo/ir:hlo",
+        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test",
+        "@com_google_googletest//:gtest",
     ],
 )
 
diff --git a/third_party/xla/xla/tests/reduce_hlo_test.cc b/third_party/xla/xla/tests/reduce_hlo_test.cc
index 9e5c5ed2bb9f54..f69438baa48268 100644
--- a/third_party/xla/xla/tests/reduce_hlo_test.cc
+++ b/third_party/xla/xla/tests/reduce_hlo_test.cc
@@ -16,11 +16,10 @@ limitations under the License.
 #include <array>
 #include <cstdint>
 #include <memory>
-#include <ostream>
 #include <string>
 #include <utility>
-#include <vector>
 
+#include <gtest/gtest.h>
 #include "absl/log/log.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
@@ -33,10 +32,10 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/shape.h"
-#include "xla/tests/hlo_test_base.h"
-#include "xla/tests/test_utils.h"
-#include "tsl/platform/statusor.h"
-#include "tsl/platform/test.h"
+#include "xla/shape_layout.h"
+#include "xla/tests/hlo_pjrt_interpreter_reference_mixin.h"
+#include "xla/tests/hlo_pjrt_test_base.h"
+#include "xla/tsl/platform/statusor.h"
 
 // Tests the Reduce HLO in ways that can't be done using the ComputationBuilder
 // API.
@@ -59,12 +58,8 @@ std::string PrintReduceLayout(
   return reduce_layout_param.param.ToString();
 }
 
-void PrintTo(const ReduceLayout& reduce_layout, ::std::ostream* os) {
-  *os << reduce_layout.ToString();
-}
-
 class ReduceWithLayoutTest
-    : public HloTestBase,
+    : public HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>,
       public ::testing::WithParamInterface<ReduceLayout> {
  public:
   absl::StatusOr<std::unique_ptr<HloModule>> GetParsedModule() {
@@ -127,6 +122,11 @@ TEST_P(ReduceWithLayoutTest, Reduce) {
 
   Literal reduce_input_relaid =
       reduce_input.Relayout(reduce_input_shape->layout());
+
+  // Strict layout check in PjRt requires entry computation layout to match.
+  *module->mutable_entry_computation_layout()->mutable_parameter_layout(0) =
+      ShapeLayout(*reduce_input_shape);
+
   EXPECT_TRUE(RunAndCompareNoHloPasses(
       std::move(module), {&reduce_input_relaid}, ErrorSpec(1e-5)));
 }

From b96a7e312785c7a4b4b41f508e95c3b6d4a5dd1a Mon Sep 17 00:00:00 2001
From: Shawn Lu <xiaoxlu@google.com>
Date: Tue, 16 Dec 2025 10:33:54 -0800
Subject: [PATCH 343/753] Minor internal change to save repeated proto copies.

PiperOrigin-RevId: 845338656
---
 tensorflow/core/tfrt/ifrt/BUILD               |  1 -
 .../core/tfrt/ifrt/ifrt_serving_executable.cc | 43 ++++++++++---------
 .../core/tfrt/ifrt/ifrt_serving_executable.h  | 10 ++---
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/tensorflow/core/tfrt/ifrt/BUILD b/tensorflow/core/tfrt/ifrt/BUILD
index 4533b3c2e102de..7d0d005b0474dc 100644
--- a/tensorflow/core/tfrt/ifrt/BUILD
+++ b/tensorflow/core/tfrt/ifrt/BUILD
@@ -120,7 +120,6 @@ cc_library(
         ":ifrt_persistent_compilation_cache",
         ":ifrt_restore_tensor_registry",
         ":ifrt_serving_core_selector",
-        ":ifrt_tensor_utils",
         ":sharding_utils",
         ":tf_host_callback",
         "//tensorflow/compiler/mlir/tensorflow",
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
index 2976f38f71e352..bc376e94d09962 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
+++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.cc
@@ -97,7 +97,6 @@ limitations under the License.
 #include "tensorflow/core/tfrt/ifrt/ifrt_persistent_compilation_cache.h"
 #include "tensorflow/core/tfrt/ifrt/ifrt_restore_tensor_registry.h"
 #include "tensorflow/core/tfrt/ifrt/ifrt_serving_core_selector.h"
-#include "tensorflow/core/tfrt/ifrt/ifrt_tensor_utils.h"
 #include "tensorflow/core/tfrt/ifrt/sharding_utils.h"
 #include "tensorflow/core/tfrt/ifrt/tf_host_callback.h"
 #include "tsl/platform/tstring.h"
@@ -504,7 +503,7 @@ IfrtServingExecutable::CreateExecutableSynchronously(
       compile_metadata.use_shardy_partitioner());
   xla_compile_options.parameter_is_tupled_arguments = false;
   // Use portable execution for single device + core selection.
-  if (UsePortableExecution(compile_metadata)) {
+  if (UsePortableExecution()) {
     xla_compile_options.compile_portable_executable = true;
   } else {
     TF_ASSIGN_OR_RETURN(
@@ -555,9 +554,8 @@ IfrtServingExecutable::CreateExecutableSynchronously(
   return executable_bundle;
 }
 
-tsl::Future<IfrtServingExecutable::SharedCachedExecutableBundle>
+absl::StatusOr<tsl::Future<IfrtServingExecutable::SharedCachedExecutableBundle>>
 IfrtServingExecutable::LookUpOrCreateExecutable(
-    const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata,
     absl::Span<const DtypeAndShape> dtypes_and_shapes,
     absl::Span<const int> variable_arg_indices) {
   std::vector<tensorflow::TensorShape> input_shapes;
@@ -597,7 +595,18 @@ IfrtServingExecutable::LookUpOrCreateExecutable(
     // compilation.
     module_copy = mlir::OwningOpRef<mlir::ModuleOp>(module_->clone());
   }
+  tensorflow::tpu::TPUCompileMetadataProto compile_metadata =
+      original_compile_metadata_;
+
+  // b/469105465: Add test coverage for core selection in execution.
+  if (UsePortableExecution()) {
+    // Clear device_assignment because portable execution doesn't allow device
+    // assignment.
+    compile_metadata.clear_device_assignment();
+  }
 
+  TF_RETURN_IF_ERROR(
+      UpdateCompileMetadata(compile_metadata, dtypes_and_shapes));
   LOG(INFO) << "Cache missed. Building executable";
   absl::StatusOr<SharedCachedExecutableBundle> executable_bundle =
       CreateExecutableSynchronously(std::move(module_copy), compile_metadata,
@@ -613,11 +622,11 @@ void IfrtServingExecutable::Freeze() {
   module_ = nullptr;
 }
 
-bool IfrtServingExecutable::UsePortableExecution(
-    const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata) {
+bool IfrtServingExecutable::UsePortableExecution() {
   // TODO(b/335247101) Add a check that the core selector must be non-null if
   // it is a single-device program after core selection in Ifrt is stable.
-  return IsSingleDevice(compile_metadata) && ifrt_serving_core_selector_;
+  return IsSingleDevice(original_compile_metadata_) &&
+         ifrt_serving_core_selector_;
 }
 
 absl::StatusOr<std::vector<tensorflow::Tensor>> IfrtServingExecutable::Execute(
@@ -657,20 +666,12 @@ absl::StatusOr<std::vector<tensorflow::Tensor>> IfrtServingExecutable::Execute(
                       BuildDtypeAndShape(inputs, variable_arg_indices,
                                          ifrt_restore_tensor_registry_));
 
-  tensorflow::tpu::TPUCompileMetadataProto compile_metadata =
-      original_compile_metadata_;
-  TF_RETURN_IF_ERROR(
-      UpdateCompileMetadata(compile_metadata, dtypes_and_shapes));
-
   // `device_reservation` should be alive before the end of the execution.
   tsl::DeviceReservation device_reservation(kNoCoreSelectedIndex, nullptr);
   xla::ifrt::DeviceListRef device_list;
-  if (UsePortableExecution(compile_metadata)) {
+  if (UsePortableExecution()) {
     device_reservation =
         ifrt_serving_core_selector_->ReserveDevice(program_id_);
-    // Clear device_assignment because portable execution doesn't allow device
-    // assignment.
-    compile_metadata.clear_device_assignment();
     TF_ASSIGN_OR_RETURN(xla::ifrt::Device * device,
                         ifrt_client_->LookupDevice(xla::ifrt::DeviceId(
                             device_reservation.device_index())));
@@ -679,10 +680,10 @@ absl::StatusOr<std::vector<tensorflow::Tensor>> IfrtServingExecutable::Execute(
     device_list = assigned_device_list_;
   }
   TF_ASSIGN_OR_RETURN(
-      SharedCachedExecutableBundle executable_bundle,
-      LookUpOrCreateExecutable(compile_metadata, dtypes_and_shapes,
-                               variable_arg_indices)
-          .Await());
+      tsl::Future<SharedCachedExecutableBundle> executable_bundle_future,
+      LookUpOrCreateExecutable(dtypes_and_shapes, variable_arg_indices));
+  TF_ASSIGN_OR_RETURN(SharedCachedExecutableBundle executable_bundle,
+                      executable_bundle_future.Await());
 
   if (executable_bundle->compile_metadata.args().size() !=
       dtypes_and_shapes.size()) {
@@ -775,7 +776,7 @@ absl::StatusOr<std::vector<tensorflow::Tensor>> IfrtServingExecutable::Execute(
   VLOG(2) << "Start Execution";
 
   std::optional<xla::ifrt::DeviceListRef> execution_device_list;
-  if (UsePortableExecution(compile_metadata)) {
+  if (UsePortableExecution()) {
     execution_device_list = device_list;
   }
 
diff --git a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h
index 8e29544fd01d78..ac772ae89d89be 100644
--- a/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h
+++ b/tensorflow/core/tfrt/ifrt/ifrt_serving_executable.h
@@ -230,10 +230,9 @@ class IfrtServingExecutable {
       const CachedExecutableBundle& executable_bundle,
       const xla::ifrt::DeviceListRef& devices);
 
-  tsl::Future<SharedCachedExecutableBundle> LookUpOrCreateExecutable(
-      const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata,
-      absl::Span<const DtypeAndShape> dtypes_and_shapes,
-      absl::Span<const int> variable_arg_indices);
+  absl::StatusOr<tsl::Future<SharedCachedExecutableBundle>>
+  LookUpOrCreateExecutable(absl::Span<const DtypeAndShape> dtypes_and_shapes,
+                           absl::Span<const int> variable_arg_indices);
   absl::StatusOr<IfrtServingExecutable::SharedCachedExecutableBundle>
   CreateExecutableSynchronously(
       mlir::OwningOpRef<mlir::ModuleOp> module_copy,
@@ -248,8 +247,7 @@ class IfrtServingExecutable {
   std::vector<xla::ifrt::Shape> GetArgShape(
       int arg_index, const CachedExecutableBundle& entry);
 
-  bool UsePortableExecution(
-      const tensorflow::tpu::TPUCompileMetadataProto& compile_metadata);
+  bool UsePortableExecution();
 };
 
 }  // namespace ifrt_serving

From 85fba99e27d9f89e300bb54d24638ee435ac91a8 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 16 Dec 2025 10:46:41 -0800
Subject: [PATCH 344/753] Add Shape to While(Thunk|Cmd) buffer_uses

PiperOrigin-RevId: 845344696
---
 third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc | 2 +-
 third_party/xla/xla/backends/gpu/runtime/while_thunk.h         | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
index f5cbe9d43370fc..a99e25ac16bf21 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
@@ -1701,7 +1701,7 @@ bool WhileCmd::force_update() {
 
 CommandBufferCmd::BufferUseVector WhileCmd::buffers() const {
   absl::flat_hash_set<BufferUse> buffers;
-  buffers.emplace(BufferUse::Write(pred_));
+  buffers.emplace(BufferUse::Read(pred_, ShapeUtil::MakeShape(PRED, {})));
   buffers.insert(cond_commands_.buffers().begin(),
                  cond_commands_.buffers().end());
   buffers.insert(body_commands_.buffers().begin(),
diff --git a/third_party/xla/xla/backends/gpu/runtime/while_thunk.h b/third_party/xla/xla/backends/gpu/runtime/while_thunk.h
index 51d0cf5ad532f4..c8af5cf26bf378 100644
--- a/third_party/xla/xla/backends/gpu/runtime/while_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/while_thunk.h
@@ -103,7 +103,8 @@ class WhileThunk : public Thunk {
 
   BufferUses buffer_uses() const override {
     return {
-        BufferUse::Read(condition_result_buffer_index_),
+        BufferUse::Read(condition_result_buffer_index_,
+                        ShapeUtil::MakeShape(PRED, {})),
     };
   }
 

From d740c6a294c07ed6cf462ccc09dded01b7e433d7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 11:43:16 -0800
Subject: [PATCH 345/753] Report mixed_priority_batching_policy metric with
 policy name string.

Add `GetMixedPriorityBatchingPolicyString` to convert the `MixedPriorityBatchingPolicy` enum to its string attribute value. This function is used when recording the policy in metrics, ensuring that the string representation (e.g., "priority_isolation") is used instead of the enum's integer value.

PiperOrigin-RevId: 845370645
---
 tensorflow/core/kernels/batching_util/BUILD    |  1 +
 .../batching_util/batch_resource_base.cc       |  8 ++++++--
 .../batching_util/batch_resource_base_test.cc  |  7 ++++++-
 .../kernels/batching_util/batch_scheduler.cc   | 18 ++++++++++++++++++
 .../kernels/batching_util/batch_scheduler.h    |  3 +++
 .../batching_util/batch_scheduler_test.cc      |  8 ++++++++
 6 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index cdd9af962f346e..6448e5ae36fead 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -535,6 +535,7 @@ tf_cc_test(
         "@local_xla//xla/tsl/lib/monitoring:cell_reader",
         "@local_xla//xla/tsl/lib/monitoring:test_utils",
         "@local_xla//xla/tsl/platform:criticality",
+        "@local_xla//xla/tsl/platform:statusor",
     ],
 )
 
diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.cc b/tensorflow/core/kernels/batching_util/batch_resource_base.cc
index abc5fa34f3a3ab..8c53a2299110a5 100644
--- a/tensorflow/core/kernels/batching_util/batch_resource_base.cc
+++ b/tensorflow/core/kernels/batching_util/batch_resource_base.cc
@@ -284,8 +284,12 @@ static auto* mixed_priority_batching_policy_value =
 void RecordBatchParamMixedPriorityBatchingPolicy(
     MixedPriorityBatchingPolicy mixed_priority_batching_policy,
     const std::string& model_name, const std::string& op_name) {
-  mixed_priority_batching_policy_value->GetCell(model_name, op_name)
-      ->Set(absl::StrCat(mixed_priority_batching_policy));
+  auto policy_str =
+      GetMixedPriorityBatchingPolicyString(mixed_priority_batching_policy);
+  if (policy_str.ok()) {
+    mixed_priority_batching_policy_value->GetCell(model_name, op_name)
+        ->Set(std::string(*policy_str));
+  }
 }
 
 void RecordBatchParamMaxEnqueuedBatches(int64_t max_enqueued_batches,
diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc b/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc
index 0e781747bcf170..a3b813ec6b7a4f 100644
--- a/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc
+++ b/tensorflow/core/kernels/batching_util/batch_resource_base_test.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "xla/tsl/lib/monitoring/cell_reader.h"
 #include "xla/tsl/lib/monitoring/test_utils.h"
 #include "xla/tsl/platform/criticality.h"
+#include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/common_runtime/cost_constants.h"
 #include "tensorflow/core/common_runtime/cost_measurement.h"
 #include "tensorflow/core/common_runtime/cost_measurement_registry.h"
@@ -282,9 +283,13 @@ TEST_P(BatchResourceBaseWithPriorityTest, BatchingWithMixedPriorityPolicy) {
         /*forced_warmup_batch_size=*/0));
   }
   blocking_counter.Wait();
+
+  TF_ASSERT_OK_AND_ASSIGN(absl::string_view policy_str,
+                          GetMixedPriorityBatchingPolicyString(
+                              GetParam().mixed_priority_batching_policy));
   EXPECT_EQ(
       mixed_priority_policy_reader_->Read("my_model_name", "my_batch_node"),
-      absl::StrCat(GetParam().mixed_priority_batching_policy));
+      policy_str);
 
   for (const auto& [batch_size, expected_count] :
        GetParam().expected_batch_size_count) {
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.cc b/tensorflow/core/kernels/batching_util/batch_scheduler.cc
index 91bfad8642ecd8..e74f6dfe9ddc08 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler.cc
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler.cc
@@ -40,5 +40,23 @@ absl::StatusOr<MixedPriorityBatchingPolicy> GetMixedPriorityBatchingPolicy(
       "Unknown mixed priority batching policy: %s", attr_value));
 }
 
+absl::StatusOr<absl::string_view> GetMixedPriorityBatchingPolicyString(
+    MixedPriorityBatchingPolicy policy) {
+  switch (policy) {
+    case MixedPriorityBatchingPolicy::kLowPriorityPaddingWithMaxBatchSize:
+      return kLowPriorityPaddingWithMaxBatchSizeAttrValue;
+    case MixedPriorityBatchingPolicy::
+        kLowPriorityPaddingWithNextAllowedBatchSize:
+      return kLowPriorityPaddingWithNextAllowedBatchSizeAttrValue;
+    case MixedPriorityBatchingPolicy::kPriorityIsolation:
+      return kPriorityIsolationAttrValue;
+    case MixedPriorityBatchingPolicy::kPriorityMerge:
+      return kPriorityMergeAttrValue;
+    default:
+      return absl::InvalidArgumentError(absl::StrFormat(
+          "Unknown mixed priority batching policy: %d", policy));
+  }
+}
+
 }  // namespace serving
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler.h b/tensorflow/core/kernels/batching_util/batch_scheduler.h
index 936473a1884dc9..4060a8b15fbd96 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler.h
@@ -70,6 +70,9 @@ enum class MixedPriorityBatchingPolicy {
 absl::StatusOr<MixedPriorityBatchingPolicy> GetMixedPriorityBatchingPolicy(
     absl::string_view attr_value);
 
+absl::StatusOr<absl::string_view> GetMixedPriorityBatchingPolicyString(
+    MixedPriorityBatchingPolicy policy);
+
 // The abstract superclass for a unit of work to be done as part of a batch.
 //
 // An implementing subclass typically contains (or points to):
diff --git a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
index d587f482763fde..fce07c171b8e85 100644
--- a/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
+++ b/tensorflow/core/kernels/batching_util/batch_scheduler_test.cc
@@ -49,6 +49,12 @@ TEST(MixedPriorityBatchingPolicyTest, InvalidAttrValueError) {
           absl::StatusCode::kInvalidArgument,
           ::testing::HasSubstr(
               "Unknown mixed priority batching policy: invalid_attr_value")));
+  EXPECT_THAT(
+      GetMixedPriorityBatchingPolicyString(
+          static_cast<MixedPriorityBatchingPolicy>(4)),
+      absl_testing::StatusIs(
+          absl::StatusCode::kInvalidArgument,
+          ::testing::HasSubstr("Unknown mixed priority batching policy: 4")));
 }
 
 using MixedPriorityBatchingPolicyParameterizedTest = ::testing::TestWithParam<
@@ -59,6 +65,8 @@ TEST_P(MixedPriorityBatchingPolicyParameterizedTest,
   auto [attr_name, policy] = GetParam();
   EXPECT_THAT(GetMixedPriorityBatchingPolicy(attr_name),
               absl_testing::IsOkAndHolds(Eq(policy)));
+  EXPECT_THAT(GetMixedPriorityBatchingPolicyString(policy),
+              absl_testing::IsOkAndHolds(Eq(attr_name)));
 }
 
 INSTANTIATE_TEST_SUITE_P(

From 002edd9267f35db699885c644d7445e883d59c83 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Tue, 16 Dec 2025 12:42:10 -0800
Subject: [PATCH 346/753] Fix `IsCompatibleCacheFile()`

- Change the overload signature to take a `FileDescriptorView` instead of a `FileDescriptor`.
- Move to the beginning of the file before attempting to read it.

PiperOrigin-RevId: 845393565
---
 .../lite/delegates/xnnpack/weight_cache.cc    |  4 +-
 .../lite/delegates/xnnpack/weight_cache.h     |  2 +-
 .../delegates/xnnpack/weight_cache_test.cc    | 45 ++++++++++++++++---
 3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index cbd3ac2ca29e2e..d92060ad2357d2 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -672,12 +672,14 @@ bool IsCompatibleCacheFile(const char* path) {
   return IsCompatibleCacheFile(std::move(fd));
 }
 
-bool IsCompatibleCacheFile(const FileDescriptor& fd) {
+bool IsCompatibleCacheFile(FileDescriptorView fd) {
   XNNPACK_RETURN_CHECK(fd.IsValid(), "Invalid file descriptor: %d.",
                        fd.Value());
   const size_t current_pos = fd.GetPos();
   ScopeGuard reset_pos_on_return(
       [current_pos, &fd] { fd.SetPos(current_pos); });
+  XNNPACK_RETURN_CHECK(fd.SetPos(0) != -1,
+                       "Couldn't move to the start of the file.");
 
   XNNPackCacheHeader header;
   XNNPACK_RETURN_CHECK(fd.Read(&header, sizeof(header)),
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.h b/tensorflow/lite/delegates/xnnpack/weight_cache.h
index 270b48bb4092af..a7c8654df4f7ec 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.h
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.h
@@ -74,7 +74,7 @@ bool IsCompatibleCacheFile(const char* path);
 // restored upon exiting.
 //
 // Note: the file descriptor must be open and valid.
-bool IsCompatibleCacheFile(const FileDescriptor& fd);
+bool IsCompatibleCacheFile(FileDescriptorView fd);
 
 struct PackIdentifier {
   enum { kNoId = SIZE_MAX };
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
index a74e40018e1eba..dd3093b2736517 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
@@ -972,8 +972,13 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackRebuildOnVersionMismatch) {
   ASSERT_TRUE(cache_provider.StartBuildStep());
 }
 
-class IsCompatibleCacheFileTest : public testing::Test {
+enum class IsCompatibleCacheFileTestOverload { kPath, kDescriptor };
+
+class IsCompatibleCacheFileTest
+    : public testing::TestWithParam<IsCompatibleCacheFileTestOverload> {
  public:
+  using Param = IsCompatibleCacheFileTestOverload;
+
   void SetUp() override {
     header_.version = XNNPackCacheHeader::kVersion;
     memcpy(header_.xnnpack_build_identifier,
@@ -982,28 +987,54 @@ class IsCompatibleCacheFileTest : public testing::Test {
   }
 
   bool WriteHeaderAndReturnIsCompatibleCacheFile() {
-    const bool res = fd_.Write(&header_, sizeof(header_));
-    fd_.Close();
-    return res && IsCompatibleCacheFile(fd_.GetCPath());
+    if (!fd_.Write(&header_, sizeof(header_))) {
+      return false;
+    }
+    if (GetParam() == Param::kPath) {
+      fd_.Close();
+      return IsCompatibleCacheFile(fd_.GetCPath());
+    } else {
+      const FileDescriptor::Offset pos = fd_.GetPos();
+      EXPECT_NE(pos, 0);  // Ensure that we are testing with a non 0 position.
+      const bool compatible = IsCompatibleCacheFile(fd_);
+      EXPECT_EQ(pos, fd_.GetPos());
+      return compatible;
+    }
   }
 
   XNNPackCacheHeader header_{};
   TempFileDesc fd_;
 };
 
-TEST_F(IsCompatibleCacheFileTest, ReturnsTrueForACorrectHeader) {
+std::string Name(
+    const testing::TestParamInfo<IsCompatibleCacheFileTestOverload>& info) {
+  switch (info.param) {
+    case IsCompatibleCacheFileTestOverload::kPath:
+      return "WithPathOverload";
+    case IsCompatibleCacheFileTestOverload::kDescriptor:
+      return "WithFileDescriptorOverload";
+  }
+}
+
+TEST_P(IsCompatibleCacheFileTest, ReturnsTrueForACorrectHeader) {
   EXPECT_TRUE(WriteHeaderAndReturnIsCompatibleCacheFile());
 }
 
-TEST_F(IsCompatibleCacheFileTest, ReturnsFalseForWrongHeaderVersion) {
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongHeaderVersion) {
   header_.version += 1;
   EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
 }
 
-TEST_F(IsCompatibleCacheFileTest, ReturnsFalseForWrongBuildIdentifier) {
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongBuildIdentifier) {
   header_.xnnpack_build_identifier[0] += 1;
   EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
 }
 
+INSTANTIATE_TEST_SUITE_P(
+    Test, IsCompatibleCacheFileTest,
+    testing::Values(IsCompatibleCacheFileTest::Param::kPath,
+                    IsCompatibleCacheFileTest::Param::kDescriptor),
+    Name);
+
 }  // namespace
 }  // namespace tflite::xnnpack

From 7d2b8cb49c16734e5c3d9a73be48aefd4e697fe0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 12:50:28 -0800
Subject: [PATCH 347/753] Suppress LocalRendezvous abort warning for
 OUT_OF_RANGE errors.

Change the log level for LocalRendezvous aborts from INFO to WARNING. Additionally, do not log a warning when the status is OUT_OF_RANGE, as this typically indicates a normal end of sequence (e.g., in tf.data), reducing log spam.

PiperOrigin-RevId: 845396351
---
 tensorflow/core/framework/local_rendezvous.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/framework/local_rendezvous.cc b/tensorflow/core/framework/local_rendezvous.cc
index 6a56c1695d35b9..36e87d36d594fd 100644
--- a/tensorflow/core/framework/local_rendezvous.cc
+++ b/tensorflow/core/framework/local_rendezvous.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "absl/status/status.h"
 #include "absl/strings/str_format.h"
 #include "xla/tsl/platform/logging.h"
 #include "tensorflow/core/activity_watcher/activity.h"
@@ -404,8 +405,13 @@ void LocalRendezvous::DoAbort(const absl::Status& status) {
     mutex_lock l(mu_);
     status_.Update(status);
   }
-  LOG_EVERY_POW_2(INFO) << "Local rendezvous is aborting with status: "
-                        << status;
+
+  // OUT_OF_RANGE implies a normal end of sequence (e.g. for tf.data),
+  // so we suppress the warning to avoid log noise.
+  if (status.code() != absl::StatusCode::kOutOfRange) {
+    LOG_EVERY_POW_2(WARNING)
+        << "Local rendezvous is aborting with status: " << status;
+  }
 
   // Keeps one Item to make sure the current rendezvous won't be destructed.
   std::unique_ptr<Item> to_delete;

From f4c2d0b25fa1bfa1971add1a35c195aeb3945bf0 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Tue, 16 Dec 2025 13:09:55 -0800
Subject: [PATCH 348/753] Update `rules_ml_toolchain` version.

PiperOrigin-RevId: 845403838
---
 WORKSPACE                 | 6 +++---
 tensorflow/workspace0.bzl | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index a334cf6080074a..33c2a4a4ac691d 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -21,10 +21,10 @@ tf_http_archive(
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "1a911c79fc734c39538781a7a4672b06aab8354c1ddb985c98e3df78f430bcde",
-    strip_prefix = "rules_ml_toolchain-f13852164b6fe240f8a989a744221a51e0d485cd",
+    sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
+    strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
     urls = tf_mirror_urls(
-        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/f13852164b6fe240f8a989a744221a51e0d485cd.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
     ),
 )
 
diff --git a/tensorflow/workspace0.bzl b/tensorflow/workspace0.bzl
index e6507c60a4090b..3385041e0c76da 100644
--- a/tensorflow/workspace0.bzl
+++ b/tensorflow/workspace0.bzl
@@ -108,10 +108,10 @@ def workspace():
     # Details: https://github.com/google-ml-infra/rules_ml_toolchain
     tf_http_archive(
         name = "rules_ml_toolchain",
-        sha256 = "b1e5e306d8b1103e73b9b778dfc3a9e069d20664437a03246a235724962b5c94",
-        strip_prefix = "rules_ml_toolchain-484235be45e6843db962c45d08fe4b2b65a6a24c",
+        sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
+        strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
         urls = tf_mirror_urls(
-            "https://github.com/google-ml-infra/rules_ml_toolchain/archive/484235be45e6843db962c45d08fe4b2b65a6a24c.tar.gz",
+            "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
         ),
     )
 

From 989142678572ce991aa0ee2078adbc34aa5a44fb Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Tue, 16 Dec 2025 16:00:29 -0800
Subject: [PATCH 349/753] Migrate triangular_solve_test to PjRt.

PiperOrigin-RevId: 845468800
---
 third_party/xla/xla/tests/BUILD               |  6 +++--
 .../xla/xla/tests/triangular_solve_test.cc    | 25 ++++++-------------
 2 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 77955f587ee64c..e3f85c05b090a6 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -3856,13 +3856,15 @@ xla_test(
     name = "triangular_solve_test",
     srcs = ["triangular_solve_test.cc"],
     real_hardware_only = True,
-    shard_count = 3,
+    shard_count = 50,
     tags = [
         "optonly",
+        "test_migrated_to_hlo_runner_pjrt",
     ],
     deps = [
         ":client_library_test_runner_mixin",
-        ":hlo_test_base",
+        ":hlo_pjrt_interpreter_reference_mixin",
+        ":hlo_pjrt_test_base",
         ":xla_internal_test_main",  # fixdeps: keep
         "//xla:array",
         "//xla:array2d",
diff --git a/third_party/xla/xla/tests/triangular_solve_test.cc b/third_party/xla/xla/tests/triangular_solve_test.cc
index d5e95eca8c7345..2060edcc668b49 100644
--- a/third_party/xla/xla/tests/triangular_solve_test.cc
+++ b/third_party/xla/xla/tests/triangular_solve_test.cc
@@ -34,7 +34,8 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/tests/client_library_test_runner_mixin.h"
-#include "xla/tests/hlo_test_base.h"
+#include "xla/tests/hlo_pjrt_interpreter_reference_mixin.h"
+#include "xla/tests/hlo_pjrt_test_base.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/types.h"
 #include "xla/xla_data.pb.h"
@@ -45,9 +46,10 @@ namespace {
 constexpr float kNan = std::numeric_limits<float>::quiet_NaN();
 constexpr complex64 kNanC64 = complex64(kNan, kNan);
 
-using TriangularSolveTest = ClientLibraryTestRunnerMixin<HloTestBase>;
-using TriangularSolveLeftLookingTest =
-    ClientLibraryTestRunnerMixin<HloTestBase>;
+using TriangularSolveTest = ClientLibraryTestRunnerMixin<
+    HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>>;
+using TriangularSolveLeftLookingTest = ClientLibraryTestRunnerMixin<
+    HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>>;
 
 Array2D<float> AValsLower() {
   return {{2, kNan, kNan, kNan},
@@ -448,24 +450,13 @@ struct TriangularSolveTestSpec {
 };
 
 class TriangularSolveParametricTest
-    : public ClientLibraryTestRunnerMixin<HloTestBase>,
+    : public ClientLibraryTestRunnerMixin<
+          HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>>,
       public ::testing::WithParamInterface<TriangularSolveTestSpec> {};
 
 TEST_P(TriangularSolveParametricTest, Random) {
   TriangularSolveTestSpec spec = GetParam();
 
-  if (backend()
-          .default_stream_executor()
-          ->GetDeviceDescription()
-          .cuda_compute_capability()
-          .major == 6) {
-    if (spec.dims.size() == 3 && spec.dims[0] > 1 && spec.dims[1] == 150 &&
-        (spec.dims[2] == 150 || spec.dims[2] == 5) &&
-        (!spec.left_side || spec.dims[2] == 150)) {
-      GTEST_SKIP() << "triggers a bug in cuda 12. b/287345077";
-    }
-  }
-
   XlaBuilder builder(TestName());
 
   CHECK_GE(spec.dims.size(), 2);

From 041ab8d8a0667bb6b621f2386df7aa6631d8ad0e Mon Sep 17 00:00:00 2001
From: Gregory Pataky <gregpataky@google.com>
Date: Tue, 16 Dec 2025 16:04:14 -0800
Subject: [PATCH 350/753] Handle edge case around float to int conversion when
 float type does not have a max value greater than the destination

Due to the lovely design of C++, floating point to integer conversions are undefined when the float value can not exactly fit into the destination type (source attribution appears on line 1740 in literal.cc). Integer to floating point conversions are defined to be able to round however the implementation sees fit, and only values outside of the `float` (finite) representatble range are undefined behavior. To get around the undefined behavoire for float->int, we pre-map floating point values to integer values before doing the `static_cast`.

The previous logic would cast the largest S32 to F4E2M1FN and then check if the input was >= to that, but this ends up having the F4E2M1FN max of `6` mapped to S32::MAX. The logic now only saturates to destination max/lowest if the src is truly out-of-bound for the destination type as defined by C++'s float-to-int mapping.

PiperOrigin-RevId: 845470175
---
 third_party/xla/xla/literal.cc      | 17 ++++++--
 third_party/xla/xla/literal_test.cc | 60 +++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/literal.cc b/third_party/xla/xla/literal.cc
index f93f6784af1aaf..3e76da9f4608aa 100644
--- a/third_party/xla/xla/literal.cc
+++ b/third_party/xla/xla/literal.cc
@@ -1747,18 +1747,27 @@ void ConvertBetweenNativeTypes(absl::Span<const NativeSrcT> src_data,
     if constexpr (!std::is_same_v<NativeDestT, bool> &&
                   !std::numeric_limits<NativeSrcT>::is_integer &&
                   std::numeric_limits<NativeDestT>::is_integer) {
+      // NaN check.
       if (src != src) {
         return NativeDestT{0};
       }
-      if (src >=
-          static_cast<NativeSrcT>(std::numeric_limits<NativeDestT>::max())) {
+
+      // Clamp values that cannot fit in the destination type to avoid undefined
+      // behavior.
+      // An N-bit integer has a max of 2^N - 1 so max() + 1 is 2^N. Ensure
+      // double can losslessly hold.
+      static_assert((std::numeric_limits<double>::max_exponent - 1) >=
+                    std::numeric_limits<NativeDestT>::digits);
+      if (static_cast<double>(src) >=
+          static_cast<double>(std::numeric_limits<NativeDestT>::max())) {
         return std::numeric_limits<NativeDestT>::max();
       }
-      if (src <=
-          static_cast<NativeSrcT>(std::numeric_limits<NativeDestT>::lowest())) {
+      if (static_cast<double>(src) <=
+          static_cast<double>(std::numeric_limits<NativeDestT>::lowest())) {
         return std::numeric_limits<NativeDestT>::lowest();
       }
     }
+
     // TODO(b/370786669): Once ml_dtypes is updated to include
     // https://github.com/jax-ml/ml_dtypes/pull/205, do not special-case e3m4 by
     // casting to half first.
diff --git a/third_party/xla/xla/literal_test.cc b/third_party/xla/xla/literal_test.cc
index 0c55b2cb54e10c..fd86910256d471 100644
--- a/third_party/xla/xla/literal_test.cc
+++ b/third_party/xla/xla/literal_test.cc
@@ -1912,6 +1912,66 @@ TEST_F(LiteralUtilTest, ConvertIfTypesMatch) {
   EXPECT_EQ(c128.Convert(S32).status().code(), tsl::error::UNIMPLEMENTED);
 }
 
+TEST_F(LiteralUtilTest, ConvertFromF4E2M1FN) {
+  // Raw F4 inputs.
+  Literal f4 = LiteralUtil::CreateR1<tsl::float4_e2m1fn>({
+      static_cast<tsl::float4_e2m1fn>(0.0f),
+      static_cast<tsl::float4_e2m1fn>(0.5f),
+      static_cast<tsl::float4_e2m1fn>(1.0f),
+      static_cast<tsl::float4_e2m1fn>(1.5f),
+      static_cast<tsl::float4_e2m1fn>(2.0f),
+      static_cast<tsl::float4_e2m1fn>(3.0f),
+      static_cast<tsl::float4_e2m1fn>(4.0f),
+      static_cast<tsl::float4_e2m1fn>(6.0f),
+      static_cast<tsl::float4_e2m1fn>(-0.0f),
+      static_cast<tsl::float4_e2m1fn>(-0.5f),
+      static_cast<tsl::float4_e2m1fn>(-1.0f),
+      static_cast<tsl::float4_e2m1fn>(-1.5f),
+      static_cast<tsl::float4_e2m1fn>(-2.0f),
+      static_cast<tsl::float4_e2m1fn>(-3.0f),
+      static_cast<tsl::float4_e2m1fn>(-4.0f),
+      static_cast<tsl::float4_e2m1fn>(-6.0f),
+  });
+  // We assert these are our expectations.
+  Literal f32 = LiteralUtil::CreateR1<float>({
+      0.0f,
+      0.5f,
+      1.0f,
+      1.5f,
+      2.0f,
+      3.0f,
+      4.0f,
+      6.0f,
+      -0.0f,
+      -0.5f,
+      -1.0f,
+      -1.5f,
+      -2.0f,
+      -3.0f,
+      -4.0f,
+      -6.0f,
+  });
+
+  // From F4E2M1FN.
+  EXPECT_EQ(f4.Convert(U2), f32.Convert(U2));
+  EXPECT_EQ(f4.Convert(S2), f32.Convert(S2));
+  EXPECT_EQ(f4.Convert(U4), f32.Convert(U4));
+  EXPECT_EQ(f4.Convert(S4), f32.Convert(S4));
+  EXPECT_EQ(f4.Convert(F4E2M1FN), f32.Convert(F4E2M1FN));
+  EXPECT_EQ(f4.Convert(U8), f32.Convert(U8));
+  EXPECT_EQ(f4.Convert(S8), f32.Convert(S8));
+  EXPECT_EQ(f4.Convert(F8E4M3FN), f32.Convert(F8E4M3FN));
+  EXPECT_EQ(f4.Convert(F8E4M3B11FNUZ), f32.Convert(F8E4M3B11FNUZ));
+  EXPECT_EQ(f4.Convert(F8E5M2), f32.Convert(F8E5M2));
+  EXPECT_EQ(f4.Convert(U16), f32.Convert(U16));
+  EXPECT_EQ(f4.Convert(S16), f32.Convert(S16));
+  EXPECT_EQ(f4.Convert(F16), f32.Convert(F16));
+  EXPECT_EQ(f4.Convert(BF16), f32.Convert(BF16));
+  EXPECT_EQ(f4.Convert(U32), f32.Convert(U32));
+  EXPECT_EQ(f4.Convert(S32), f32.Convert(S32));
+  EXPECT_EQ(f4.Convert(F32).value(), f32);
+}
+
 TYPED_TEST(LiteralUtilFloatTest, ConvertIfTypesMatchF8) {
   constexpr auto ptype = primitive_util::NativeToPrimitiveType<TypeParam>();
   if (!primitive_util::IsF8Type(ptype)) {

From d5573b2ff32c383a1a47fa607897f4106a8a2276 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 16:04:59 -0800
Subject: [PATCH 351/753] Add method to get stack info so that users could copy
 it to the new model.

PiperOrigin-RevId: 845470437
---
 third_party/xla/xla/hlo/ir/hlo_module.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/xla/hlo/ir/hlo_module.h b/third_party/xla/xla/hlo/ir/hlo_module.h
index 584fd38d1e8979..8a7dfdca9792ff 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.h
+++ b/third_party/xla/xla/hlo/ir/hlo_module.h
@@ -825,6 +825,11 @@ class HloModule {
     stack_frame_index_ = std::move(stack_frame_index);
   }
 
+  // Getter for the stack frame index.
+  const std::optional<StackFrameIndexProto>& stack_frame_index() const {
+    return stack_frame_index_;
+  }
+
   // Finalizes this module by destroying internal data structures that might be
   // used for building or modifying the module. It is undefined behavior to
   // modify the module (add computations or instructions) after the call. Should

From eca48b1b6fb331fc48f5e056f76c6b061a410733 Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Tue, 16 Dec 2025 16:08:50 -0800
Subject: [PATCH 352/753] Add `PJRT_Buffer_CopyRawToHostFuture` to the PJRT C
 API.

PiperOrigin-RevId: 845471805
---
 third_party/xla/xla/pjrt/c/CHANGELOG.md       |  4 ++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 44 ++++++++++++++++--
 third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc |  6 +++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 46 +++++++++++++++++++
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 42 +++++++++++++++++
 .../xla/pjrt/c_api_client/pjrt_c_api_client.h |  3 ++
 .../c_api_client/pjrt_c_api_client_test.cc    | 33 +++++++++++++
 7 files changed, 175 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/CHANGELOG.md b/third_party/xla/xla/pjrt/c/CHANGELOG.md
index 25c3fbd29a0d3d..c47d05a264835b 100644
--- a/third_party/xla/xla/pjrt/c/CHANGELOG.md
+++ b/third_party/xla/xla/pjrt/c/CHANGELOG.md
@@ -1,5 +1,9 @@
 # PJRT C API changelog
 
+## 0.84
+
+* Add `PJRT_Buffer_CopyRawToHostFuture`.
+
 ## 0.83
 
 * Add `PJRT_AsyncHostToDeviceTransferManager_TransferLiteral`.
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index 8a34314ff037b3..7dd3139bd7658c 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -104,7 +104,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 83
+#define PJRT_API_MINOR 84
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -2210,6 +2210,43 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyRawToHost_Args, event);
 typedef PJRT_Error* PJRT_Buffer_CopyRawToHost(
     PJRT_Buffer_CopyRawToHost_Args* args);
 
+struct PJRT_Buffer_CopyRawToHostFuture_Callback_Args {
+  size_t struct_size;
+
+  // callback_data should be set to the one returned by
+  // PJRT_Buffer_CopyRawToHostFuture.
+  void* callback_data;
+
+  PJRT_Error_Code error_code;
+  // error_message and error_message_size are only valid if error_code is not
+  // PJRT_ERROR_CODE_OK.
+  const char* error_message;
+  size_t error_message_size;
+  // dst is only valid if error_code is PJRT_ERROR_CODE_OK.
+  void* dst;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyRawToHostFuture_Callback_Args, dst);
+
+struct PJRT_Buffer_CopyRawToHostFuture_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Buffer* buffer;
+  int64_t offset;
+  int64_t transfer_size;
+  PJRT_Event* event;  // out
+  // callback_data should be sent to the future_ready, when dst is ready.
+  void* callback_data;  // out
+  void (*future_ready_callback)(
+      PJRT_Buffer_CopyRawToHostFuture_Callback_Args* args);  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_CopyRawToHostFuture_Args,
+                          future_ready_callback);
+
+// Similar to PJRT_Buffer_CopyRawToHost, but the transfer will not happen until
+// `future_ready_callback` is invoked.
+typedef PJRT_Error* PJRT_Buffer_CopyRawToHostFuture(
+    PJRT_Buffer_CopyRawToHostFuture_Args* args);
+
 struct PJRT_Buffer_CopyToDevice_Args {
   size_t struct_size;
   PJRT_Extension_Base* extension_start;
@@ -2734,11 +2771,12 @@ typedef struct PJRT_Api {
   _PJRT_API_STRUCT_FIELD(PJRT_LoadedExecutable_GetDeviceAssignment);
   _PJRT_API_STRUCT_FIELD(PJRT_Client_CreateErrorBuffer);
   _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_TransferLiteral);
+  _PJRT_API_STRUCT_FIELD(PJRT_Buffer_CopyRawToHostFuture);
 } PJRT_Api;
 
 enum {
-  PJRT_Api_STRUCT_SIZE = PJRT_STRUCT_SIZE(
-      PJRT_Api, PJRT_AsyncHostToDeviceTransferManager_TransferLiteral)
+  PJRT_Api_STRUCT_SIZE =
+      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Buffer_CopyRawToHostFuture)
 };
 
 #undef _PJRT_API_STRUCT_FIELD
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
index 0a089ae0f34e38..ce115dd3958adb 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
@@ -950,6 +950,9 @@ FieldOffsetsAndSizesForVersion(int major_version, int minor_version) {
       add_field("PJRT_AsyncHostToDeviceTransferManager_TransferLiteral",
                 kFnPtrSize);
     }
+    if (minor_version >= 84) {
+      add_field("PJRT_Buffer_CopyRawToHostFuture", kFnPtrSize);
+    }
     return version_offsets_and_sizes;
   }
   LOG(FATAL) << "Unsupported API version: " << major_version << "."
@@ -1349,6 +1352,9 @@ TEST_F(PjrtCAbiTestBase, FieldOffsetsAndSizes) {
                      PJRT_AsyncHostToDeviceTransferManager_TransferLiteral),
             sizeof(PJRT_Api::
                        PJRT_AsyncHostToDeviceTransferManager_TransferLiteral)}},
+          {"PJRT_Buffer_CopyRawToHostFuture",
+           {offsetof(PJRT_Api, PJRT_Buffer_CopyRawToHostFuture),
+            sizeof(PJRT_Api::PJRT_Buffer_CopyRawToHostFuture)}},
       };
   ASSERT_EQ(api_->pjrt_api_version.major_version, PJRT_API_MAJOR);
   ASSERT_EQ(api_->pjrt_api_version.minor_version, PJRT_API_MINOR);
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 078f16b27087f1..801328d8288165 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -2248,6 +2248,50 @@ PJRT_Error* PJRT_Buffer_CopyRawToHost(PJRT_Buffer_CopyRawToHost_Args* args) {
   return nullptr;
 }
 
+PJRT_Error* PJRT_Buffer_CopyRawToHostFuture(
+    PJRT_Buffer_CopyRawToHostFuture_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Buffer_CopyRawToHostFuture_Args",
+      PJRT_Buffer_CopyRawToHostFuture_Args_STRUCT_SIZE, args->struct_size));
+
+  auto [promise, future] = xla::Future<void*>::MakePromise();
+  xla::Future<> wrapped_promise = args->buffer->buffer->CopyRawToHostFuture(
+      future, args->offset, args->transfer_size);
+  args->event = new PJRT_Event{std::move(wrapped_promise)};
+
+  typedef absl::AnyInvocable<void(
+      PJRT_Buffer_CopyRawToHostFuture_Callback_Args*) &&>
+      Callback;
+  auto callback = new Callback(
+      [promise = std::move(promise)](
+          PJRT_Buffer_CopyRawToHostFuture_Callback_Args* args) mutable {
+        absl::Status status = ActualStructSizeIsGreaterOrEqual(
+            "PJRT_Buffer_CopyRawToHostFuture_Callback_Args",
+            PJRT_Buffer_CopyRawToHostFuture_Callback_Args_STRUCT_SIZE,
+            args->struct_size);
+        if (!status.ok()) {
+          promise.Set(status);
+          return;
+        }
+        if (args->error_code != PJRT_Error_Code_OK) {
+          absl::Status error = absl::Status(
+              pjrt::PjrtErrorCodeToStatusCode(args->error_code),
+              absl::string_view(args->error_message, args->error_message_size));
+          promise.Set(std::move(error));
+          return;
+        }
+        promise.Set(args->dst);
+      });
+  args->callback_data = callback;
+  args->future_ready_callback =
+      +[](PJRT_Buffer_CopyRawToHostFuture_Callback_Args* args) {
+        auto* callback = reinterpret_cast<Callback*>(args->callback_data);
+        std::move (*callback)(args);
+        delete callback;
+      };
+  return nullptr;
+}
+
 PJRT_Error* PJRT_Buffer_CopyToDevice(PJRT_Buffer_CopyToDevice_Args* args) {
   PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
       "PJRT_Buffer_CopyToDevice_Args",
@@ -3181,6 +3225,8 @@ PJRT_Api CreatePjrtApi(PJRT_Client_Create* create_fn,
       pjrt::PJRT_Client_CreateErrorBuffer,
       /*PJRT_AsyncHostToDeviceTransferManager_TransferLiteral=*/
       pjrt::PJRT_AsyncHostToDeviceTransferManager_TransferLiteral,
+      /*PJRT_Buffer_CopyRawToHostFuture=*/
+      pjrt::PJRT_Buffer_CopyRawToHostFuture,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 08592d5a9c7776..8fc556ae3c7150 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -3096,6 +3096,48 @@ Future<> PjRtCApiBuffer::CopyRawToHost(void* dst, int64_t offset,
   return pjrt::ConvertCEventToCppFuture(args.event, api);
 }
 
+Future<> PjRtCApiBuffer::CopyRawToHostFuture(Future<void*> dst, int64_t offset,
+                                             int64_t transfer_size) {
+  if (pjrt_c_api()->pjrt_api_version.major_version == 0 &&
+      pjrt_c_api()->pjrt_api_version.minor_version < 84) {
+    return Future<>(absl::UnimplementedError(
+        "PJRT_Buffer_CopyRawToHostFuture requires PJRT C API version 0.84 or "
+        "higher."));
+  }
+
+  PJRT_Buffer_CopyRawToHostFuture_Args args;
+  args.struct_size = PJRT_Buffer_CopyRawToHostFuture_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.buffer = buffer_.get();
+  args.offset = offset;
+  args.transfer_size = transfer_size;
+  const PJRT_Api* api = pjrt_c_api();
+  RETURN_FUTURE_IF_ERROR(api->PJRT_Buffer_CopyRawToHostFuture(&args), api);
+  dst.OnReady(
+      [callback_data = args.callback_data,
+       callback = args.future_ready_callback](absl::StatusOr<void*> dst) {
+        PJRT_Buffer_CopyRawToHostFuture_Callback_Args callback_args;
+        callback_args.struct_size =
+            PJRT_Buffer_CopyRawToHostFuture_Callback_Args_STRUCT_SIZE;
+        if (dst.ok()) {
+          callback_args.dst = *dst;
+          callback_args.error_code = PJRT_Error_Code_OK;
+          callback_args.error_message = nullptr;
+          callback_args.error_message_size = 0;
+        } else {
+          callback_args.dst = nullptr;
+          callback_args.error_code =
+              pjrt::StatusCodeToPjrtErrorCode(dst.status().code());
+          callback_args.error_message = dst.status().message().data();
+          callback_args.error_message_size = dst.status().message().size();
+        }
+        callback_args.callback_data = callback_data;
+        callback(&callback_args);
+      });
+  CHECK(args.event != nullptr);
+  return pjrt::ConvertCEventToCppFuture(args.event, api);
+}
+
 absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCApiBuffer::CopyToMemorySpace(
     PjRtMemorySpace* dst_memory) {
   const PJRT_Api* api = pjrt_c_api();
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
index ea773cddabf832..42291bc32e0e7c 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
@@ -549,6 +549,9 @@ class PjRtCApiBuffer : public PjRtBuffer {
   Future<> CopyRawToHost(void* dst, int64_t offset,
                          int64_t transfer_size) override;
 
+  Future<> CopyRawToHostFuture(Future<void*> dst, int64_t offset,
+                               int64_t transfer_size) override;
+
   void Delete() override;
 
   absl::StatusOr<std::unique_ptr<ExternalReference>>
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
index 3d74a142ac77f4..cfe59018a80413 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
@@ -639,5 +639,38 @@ TEST(PjRtCApiClientTest, AsyncHostToDeviceTransferManagerTransferLiteral) {
   EXPECT_TRUE(LiteralTestUtil::Equal(literal, *result_literal));
 }
 
+TEST(PjRtCApiClientTest, CopyRawToHostFuture) {
+  SetUpCpuPjRtApi();
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
+                       GetCApiClient("cpu"));
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
+  std::vector<float> recv_data(4);
+  Shape shape = ShapeUtil::MakeShape(F32, {4});
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<PjRtBuffer> buffer,
+      client->BufferFromHostBuffer(
+          data.data(), shape.element_type(), shape.dimensions(),
+          /*byte_strides=*/std::nullopt,
+          PjRtClient::HostBufferSemantics::kImmutableOnlyDuringCall, nullptr,
+          client->memory_spaces()[0], /*device_layout=*/nullptr));
+  auto [dst_promise, dst_future] = Future<void*>::MakePromise();
+  ASSERT_OK_AND_ASSIGN(int64_t size, buffer->GetOnDeviceSizeInBytes());
+  auto result = buffer->CopyRawToHostFuture(dst_future, 0, size);
+
+  // Fulfill the promise with a valid host buffer.
+  dst_promise.Set(recv_data.data());
+  EXPECT_OK(result.Await());
+  ASSERT_EQ(recv_data.size(), data.size());
+  EXPECT_THAT(recv_data, ElementsAreArray(data));
+
+  // Test error case.
+  auto [error_dst_promise, error_dst_future] = Future<void*>::MakePromise();
+  result = buffer->CopyRawToHostFuture(error_dst_future, 0, size);
+  error_dst_promise.Set(absl::InternalError("Future error"));
+  absl::Status status = result.Await();
+  EXPECT_EQ(status.code(), absl::StatusCode::kInternal);
+  EXPECT_EQ(status.message(), "Future error");
+}
+
 }  // namespace
 }  // namespace xla

From 832e64a062ffcf5042a59b493ec4feb37883f618 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Tue, 16 Dec 2025 16:10:47 -0800
Subject: [PATCH 353/753] Move ExecutePrepare and ExecuteLaunch to
 CommonPjRtClient.

PiperOrigin-RevId: 845472618
---
 .../xla/xla/pjrt/common_pjrt_client.cc        | 120 ++++++++++++++++++
 third_party/xla/xla/pjrt/common_pjrt_client.h | 107 ++++++++++++++++
 2 files changed, 227 insertions(+)

diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.cc b/third_party/xla/xla/pjrt/common_pjrt_client.cc
index d3756104d08943..86833819c6c356 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.cc
@@ -747,6 +747,126 @@ std::vector<std::unique_ptr<PjRtBuffer>> CommonPjRtClient::CreateOutputs(
   return res;
 }
 
+absl::Status CommonPjRtLoadedExecutable::ExecutePrepare(
+    ExecuteLaunchArgs& launch_args,
+    absl::Span<PjRtBuffer* const> argument_handles, int replica, int partition,
+    const ExecuteOptions& options, size_t host_callback_idx,
+    PjRtDevice* device) const {
+  tsl::profiler::TraceMe traceme("CommonPjRtLoadedExecutable::ExecutePrepare");
+  TF_ASSIGN_OR_RETURN(auto executable,
+                      StartRawExecutable(options, replica, partition, device));
+  // Fill in device to launch_args so it will be present even if ExecutePrepare
+  // fails with OOM.
+  device = executable->device();
+  launch_args.device = device;
+
+  // Execute takes `extra_deps` and waits for those to be
+  // fulfilled before executing the program and returning an available
+  // `execute_event` signaling that the program execution is complete. To avoid
+  // clobbering inputs, we must ensure that
+  //   `extra_deps` = inputs' definition events + donated inputs' usage events.
+  // This also ensures that the returned `execute_event` dominates all inputs'
+  // events, and thus output buffer only need to contain `execute_event` as the
+  // single definition event.
+  launch_args.extra_deps =
+      client()->CreateDeviceEventSet(argument_handles.size());
+  launch_args.control_deps =
+      client()->CreateDeviceEventSet(argument_handles.size());
+
+  bool is_error = false;
+  TF_RETURN_IF_ERROR(CommonPjRtClient::PrepareArguments(
+      options, argument_handles, ParametersThatMustBeDonated(),
+      *launch_args.extra_deps, *launch_args.control_deps,
+      launch_args.input_buffers, launch_args.device_buffers, device, replica,
+      partition, parameter_device_shapes_, is_error));
+
+  absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>
+      output_leaf_buffers;
+  if (!is_error) {
+    // Allocate output with input reuse. Any allocation errors are returned
+    // immediately. Derived classes may use custom logic for allocation.
+    TF_ASSIGN_OR_RETURN(output_leaf_buffers,
+                        client()->AllocateOutputBuffersWithInputReuse(
+                            output_device_shape_, launch_args.device_buffers,
+                            input_output_alias_config(), device,
+                            output_memory_space_kind_ids_));
+    VLOG(3) << "Created output buffer: " << output_device_shape_.ToString();
+
+    TF_RETURN_IF_ERROR(CheckBufferCompatibilities(
+        options, launch_args.input_buffers, argument_handles));
+  }
+
+  TF_RETURN_IF_ERROR(executable->Load(options, host_callback_idx));
+
+  launch_args.executable = std::move(executable);
+  launch_args.options = &options;
+  launch_args.is_predetermined_error = is_error;
+  launch_args.output_leaf_buffers = std::move(output_leaf_buffers);
+  return absl::OkStatus();
+}
+
+absl::Span<int const> CommonPjRtLoadedExecutable::ParametersThatMustBeDonated()
+    const {
+  return parameters_that_must_be_donated_;
+}
+
+absl::Status CommonPjRtLoadedExecutable::CheckBufferCompatibilities(
+    const ExecuteOptions& options,
+    absl::Span<const tsl::RCReference<CommonPjRtRawBuffer>> input_buffers,
+    absl::Span<PjRtBuffer* const> argument_handles) const {
+  if (input_buffers.size() != input_buffer_sizes_in_bytes_.size()) {
+    return InvalidArgument(
+        "Execution supplied %lld buffers but compiled program expected %lld "
+        "buffers",
+        input_buffers.size(), input_buffer_sizes_in_bytes_.size());
+  }
+  for (int i = 0; i < input_buffers.size(); ++i) {
+    size_t buffer_size = input_buffers[i]->GetOnDeviceSizeInBytes();
+    if (input_buffer_sizes_in_bytes_[i] != buffer_size) {
+      const auto& expected_shape = parameter_device_shapes_[i];
+      const auto& actual_shape = argument_handles[i]->on_device_shape();
+      return InvalidArgument(
+          "Executable(%s) expected parameter %d of size %lld (%s) but got "
+          "buffer with incompatible size %lld (%s)",
+          name(), i, input_buffer_sizes_in_bytes_[i],
+          expected_shape.ToString(true), buffer_size,
+          actual_shape.ToString(true));
+    }
+  }
+  return absl::OkStatus();
+}
+
+PjRtLoadedExecutable::Result CommonPjRtLoadedExecutable::ExecuteLaunch(
+    ExecuteLaunchArgs& launch_args, bool fill_future) const {
+  CHECK(launch_args.extra_deps.get()) << "extra_deps is nullptr";
+  CHECK(launch_args.control_deps.get()) << "control_deps is nullptr";
+  auto results =
+      std::move(*launch_args.executable)
+          .Execute(*launch_args.options, launch_args.input_buffers,
+                   launch_args.output_leaf_buffers, *launch_args.extra_deps,
+                   *launch_args.control_deps,
+                   launch_args.is_predetermined_error, fill_future);
+  {
+    tsl::profiler::TraceMe t3("Handle input event recording");
+    // Handle input event recording.
+    for (CommonPjRtBuffer::ScopedHold& b : launch_args.device_buffers) {
+      if (b.type() == CommonPjRtBuffer::ScopedHold::kUsage) {
+        b.ConvertUsageHold(results.primary_execute_event);
+      } else {
+        CHECK(b.type() == CommonPjRtBuffer::ScopedHold::kDonation);
+        b.ConfirmDonation();
+      }
+    }
+  }
+  return PjRtLoadedExecutable::Result(
+      {/*future=*/std::move(results.future),
+       /*buffers=*/client()->CreateOutputs(
+           output_device_shape_, results.primary_execute_event,
+           launch_args.device, output_memory_space_kind_ids_,
+           std::move(launch_args.output_leaf_buffers),
+           launch_args.is_predetermined_error)});
+}
+
 absl::StatusOr<std::unique_ptr<PjRtBuffer>>
 CommonPjRtBufferImpl::CopyToCpuMemorySpace(const xla::Shape& dst_shape,
                                            PjRtMemorySpace* dst_memory_space) {
diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.h b/third_party/xla/xla/pjrt/common_pjrt_client.h
index 5470a087376444..839403ce4e7a67 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.h
@@ -152,6 +152,11 @@ class CommonPjRtClient : public PjRtClient {
     return CreateLinkedEventPromise(memory_space, "CreateLinkedEventPromise");
   }
 
+  virtual std::unique_ptr<PjRtDeviceEventSet> CreateDeviceEventSet(
+      size_t preallocated_size) const {
+    LOG(FATAL) << "Implement";
+  }
+
   // Registers the necessary debug information for an allocation event.
   // TODO(parkers): Once everything is unified this should be controlled
   // by a non-device-specific config instead of delegating this control
@@ -263,6 +268,108 @@ class CommonPjRtClient : public PjRtClient {
       bool is_predetermined_error);
 };
 
+// Represents the launch state for a loaded executable. This state must be
+// reconstructed each time we want to launch the executable.
+class PjRtRawLoadedExecutable {
+ public:
+  virtual ~PjRtRawLoadedExecutable() = default;
+
+  virtual PjRtDevice* device() = 0;
+
+  virtual absl::Status Load(const ExecuteOptions& options,
+                            size_t host_callback_idx) = 0;
+
+  struct RawExecuteResult {
+    std::optional<tsl::Future<>> future;
+    tsl::RCReference<PjRtDeviceEvent> primary_execute_event;
+  };
+  virtual RawExecuteResult Execute(
+      const ExecuteOptions& options,
+      absl::Span<const tsl::RCReference<CommonPjRtRawBuffer>> inputs,
+      absl::Span<const tsl::RCReference<CommonPjRtRawBuffer>> results,
+      PjRtDeviceEventSet& extra_deps, PjRtDeviceEventSet& control_deps,
+      bool is_predetermined_error, bool fill_future) && = 0;
+};
+
+class CommonPjRtLoadedExecutable : public PjRtLoadedExecutable {
+ public:
+  CommonPjRtLoadedExecutable(CommonPjRtClient* client,
+                             std::vector<Shape> parameter_device_shapes,
+                             Shape output_device_shape,
+                             std::vector<int> output_memory_space_kind_ids,
+                             std::vector<PjRtDevice*> addressable_devices)
+      : parameter_device_shapes_(std::move(parameter_device_shapes)),
+        output_device_shape_(std::move(output_device_shape)),
+        output_memory_space_kind_ids_(std::move(output_memory_space_kind_ids)),
+        addressable_devices_(std::move(addressable_devices)) {}
+
+  CommonPjRtClient* client() const override = 0;
+
+  absl::Span<PjRtDevice* const> addressable_devices() const override {
+    return addressable_devices_;
+  }
+
+ protected:
+  // Execute is split into Prepare and Launch.
+  // Prepare can fail and be retried, while Launch is guaranteed to succeed.
+  struct ExecuteLaunchArgs {
+    PjRtDevice* device;
+    std::unique_ptr<PjRtRawLoadedExecutable> executable;
+    absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4> input_buffers;
+    absl::InlinedVector<CommonPjRtBuffer::ScopedHold, 4> device_buffers;
+    std::unique_ptr<PjRtDeviceEventSet> extra_deps;
+    std::unique_ptr<PjRtDeviceEventSet> control_deps;
+    absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>
+        output_leaf_buffers;
+    bool is_predetermined_error;
+    const ExecuteOptions* options;
+  };
+
+  virtual absl::StatusOr<std::unique_ptr<PjRtRawLoadedExecutable>>
+  StartRawExecutable(const ExecuteOptions& options, int replica, int partition,
+                     PjRtDevice* device) const = 0;
+
+  // Returns a sorted list of the parameters that must be donated as a
+  // side-effect of the execution. Derived classes may use custom logic.
+  absl::Span<int const> ParametersThatMustBeDonated() const;
+
+  virtual const HloInputOutputAliasConfig& input_output_alias_config()
+      const = 0;
+
+  // Checks that the input buffers passed in by the user have the correct size
+  // on device for the compiled program.
+  absl::Status CheckBufferCompatibilities(
+      const ExecuteOptions& options,
+      absl::Span<const tsl::RCReference<CommonPjRtRawBuffer>> input_buffers,
+      absl::Span<PjRtBuffer* const> argument_handles) const;
+
+  absl::Status ExecutePrepare(ExecuteLaunchArgs& launch_args,
+                              absl::Span<PjRtBuffer* const> argument_handles,
+                              int replica, int partition,
+                              const ExecuteOptions& options,
+                              size_t host_callback_idx,
+                              PjRtDevice* device) const;
+
+  Result ExecuteLaunch(ExecuteLaunchArgs& launch_args, bool fill_future) const;
+
+  // Parameter shapes.
+  std::vector<Shape> parameter_device_shapes_;
+  // A sorted vector of parameters that have any aliased buffers and thus must
+  // be donated when executing the computation.
+  std::vector<int> parameters_that_must_be_donated_;
+  // Result layouts (device shapes).
+  Shape output_device_shape_;
+  // memory_space()->kind_id() for each output buffer.
+  std::vector<int> output_memory_space_kind_ids_;
+  // Size on device of each leaf buffer of the compiled program, cached here
+  // for performance reasons.
+  std::vector<int64_t> input_buffer_sizes_in_bytes_;
+  // addressable_devices_[i] is the Device to which
+  // addressable_device_logical_ids_[i] is assigned. shared_ptrs instead of
+  // unique_ptrs to play well with the Python bindings (see xla.cc).
+  std::vector<PjRtDevice*> addressable_devices_;
+};
+
 // TODO(parkers): Merge everything here into CommonPjRtBuffer.
 class CommonPjRtBufferImpl : public CommonPjRtBuffer {
  public:

From fc70d01ee09e7140ad031d4d2e7710e4f7367190 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 16 Dec 2025 16:28:44 -0800
Subject: [PATCH 354/753] Add proto serialization for AllGatherStartThunk

PiperOrigin-RevId: 845478753
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  21 +++
 .../backends/gpu/runtime/all_gather_thunk.cc  |  92 ++++++++++---
 .../backends/gpu/runtime/all_gather_thunk.h   |  13 ++
 .../gpu/runtime/all_gather_thunk_test.cc      |  75 ++++++++++
 .../backends/gpu/runtime/collective_thunk.cc  | 128 ++++++++++++++----
 .../backends/gpu/runtime/collective_thunk.h   |  12 ++
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  22 +++
 .../runtime/thunk_proto_deserialization.cc    |   5 +
 8 files changed, 322 insertions(+), 46 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/all_gather_thunk_test.cc

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 53daeea846b443..69b1b5d0f589c1 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1246,11 +1246,13 @@ cc_library(
         "//xla/backends/gpu/collectives:gpu_communicator",
         "//xla/core/collectives:communicator",
         "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu/transforms/collectives:collective_ops_utils",
         "//xla/stream_executor:stream",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
@@ -1260,6 +1262,23 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "all_gather_thunk_test",
+    srcs = ["all_gather_thunk_test.cc"],
+    deps = [
+        ":all_gather_thunk",
+        ":collective_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla/service:buffer_assignment",
+        "//xla/tsl/platform:statusor",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_absl//absl/log:check",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "collective_kernel_thunk",
     srcs = ["collective_kernel_thunk.cc"],
@@ -1802,6 +1821,7 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
@@ -2842,6 +2862,7 @@ cc_library(
     srcs = ["thunk_proto_deserialization.cc"],
     hdrs = ["thunk_proto_deserialization.h"],
     deps = [
+        ":all_gather_thunk",
         ":collective_thunk",
         ":conditional_thunk",
         ":convolution_reorder_thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc
index 7ab65e853aebc1..f4d8617abaf50f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc
@@ -16,11 +16,14 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/all_gather_thunk.h"
 
 #include <cstdint>
+#include <memory>
+#include <optional>
 #include <utility>
 #include <vector>
 
 #include "absl/status/status.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/collectives/gpu_communicator.h"
@@ -30,6 +33,7 @@ limitations under the License.
 #include "xla/future.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/transforms/collectives/collective_ops_utils.h"
 #include "xla/shape.h"
@@ -40,11 +44,12 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "tsl/platform/casts.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
 
-namespace impl {
+namespace {
 AllGatherConfig GetAllGatherConfig(const HloAllGatherInstruction* inst) {
   AllGatherConfig config;
   config.config = GetCollectiveConfig(inst, inst->use_global_device_ids());
@@ -55,7 +60,7 @@ absl::Status CheckImplementableInst(const HloAllGatherInstruction* inst) {
   for (HloInstruction* operand : inst->operands()) {
     const Shape& shape = operand->shape();
 
-    TF_RETURN_IF_ERROR(IsValidOperand(shape, Thunk::kAllGather));
+    RETURN_IF_ERROR(IsValidOperand(shape, Thunk::kAllGather));
 
     if (!ShapeUtil::IsEffectivelyMostMajorDimension(
             shape, inst->all_gather_dimension())) {
@@ -67,7 +72,16 @@ absl::Status CheckImplementableInst(const HloAllGatherInstruction* inst) {
 
   return absl::OkStatus();
 }
-}  // namespace impl
+}  // namespace
+
+AllGatherStartThunk::AllGatherStartThunk(
+    ThunkInfo thunk_info,
+    std::shared_ptr<CollectiveThunk::AsyncEvents> async_events,
+    CollectiveConfig config, std::vector<Buffer> buffers)
+    : CollectiveThunk(Thunk::kAllGatherStart, thunk_info, async_events,
+                      AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE),
+      config_(AllGatherConfig{config}),
+      buffers_(std::move(buffers)) {}
 
 AllGatherStartThunk::AllGatherStartThunk(ThunkInfo thunk_info,
                                          const HloAllGatherInstruction* inst,
@@ -76,7 +90,7 @@ AllGatherStartThunk::AllGatherStartThunk(ThunkInfo thunk_info,
     : CollectiveThunk(Thunk::kAllGatherStart, thunk_info,
                       IsGPUSyncCollective(*inst),
                       AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE),
-      config_(impl::GetAllGatherConfig(inst)),
+      config_(GetAllGatherConfig(inst)),
       buffers_(std::move(buffers)) {
   CHECK_EQ(config_.config.operand_element_type.size(), buffers_.size());
 }
@@ -85,23 +99,69 @@ AllGatherStartThunk::AllGatherStartThunk(ThunkInfo thunk_info,
     const HloAllGatherInstruction* inst, int64_t replica_count,
     int64_t partition_count) {
   return AddOpDescription<AllGatherStartThunk>(
-      impl::CheckImplementableInst(inst), inst, replica_count, partition_count);
+      CheckImplementableInst(inst), inst, replica_count, partition_count);
 }
 
 /*static*/ CollectiveOpGroupMode AllGatherStartThunk::GetGroupMode(
     const HloAllGatherInstruction* inst) {
-  return impl::GetAllGatherConfig(inst).config.group_mode;
+  return GetAllGatherConfig(inst).config.group_mode;
+}
+
+absl::StatusOr<std::unique_ptr<AllGatherStartThunk>>
+AllGatherStartThunk::FromProto(
+    ThunkInfo thunk_info, const AllGatherStartThunkProto& thunk_proto,
+    absl::Span<const BufferAllocation> buffer_allocations,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::vector<CollectiveThunk::Buffer> buffers;
+  buffers.reserve(thunk_proto.buffers_size());
+  for (const CollectiveBufferProto& proto : thunk_proto.buffers()) {
+    ASSIGN_OR_RETURN(
+        CollectiveThunk::Buffer buffer,
+        CollectiveThunk::Buffer::FromProto(proto, buffer_allocations));
+    buffers.push_back(buffer);
+  }
+
+  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
+      async_events_map[AsyncEventsUniqueId{
+          thunk_proto.async_events_unique_id()}];
+  if (!async_events) {
+    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  }
+
+  return std::make_unique<AllGatherStartThunk>(
+      std::move(thunk_info), async_events,
+      CollectiveConfig::FromProto(thunk_proto.collective_config()),
+      std::move(buffers));
+}
+
+absl::StatusOr<ThunkProto> AllGatherStartThunk::ToProto() const {
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  AllGatherStartThunkProto* thunk_proto =
+      proto.mutable_all_gather_start_thunk();
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (!async_events_id.has_value()) {
+    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  }
+  thunk_proto->set_async_events_unique_id(async_events_id->value());
+
+  for (const Buffer& buffer : buffers_) {
+    ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
+  }
+  *thunk_proto->mutable_collective_config() = config_.config.ToProto();
+  return proto;
 }
 
 absl::StatusOr<bool> AllGatherStartThunk::RunCollective(
     const ExecuteParams& params, const GpuCliqueKey& clique_key,
     se::Stream& stream, Communicator& comm) {
-  TF_ASSIGN_OR_RETURN(
-      std::vector<DeviceBufferPair> device_buffers,
-      ConvertToDeviceBuffers(params, buffers_,
-                             config_.config.operand_element_type));
-  TF_RETURN_IF_ERROR(xla::gpu::RunAllGather(
-      device_buffers, stream, comm, config_.config.use_symmetric_buffer));
+  ASSIGN_OR_RETURN(std::vector<DeviceBufferPair> device_buffers,
+                   ConvertToDeviceBuffers(params, buffers_,
+                                          config_.config.operand_element_type));
+  RETURN_IF_ERROR(xla::gpu::RunAllGather(device_buffers, stream, comm,
+                                         config_.config.use_symmetric_buffer));
   return true;
 }
 
@@ -110,13 +170,13 @@ absl::Status RunAllGather(std::vector<DeviceBufferPair>& buffers,
                           bool use_symmetric_buffer) {
   int device_ordinal = stream.parent()->device_ordinal();
   XLA_VLOG_DEVICE(3, device_ordinal) << "Performing all-gather";
-  TF_RETURN_IF_ERROR(MaybeRegisterBuffers(stream.parent(), buffers, &comm,
-                                          use_symmetric_buffer));
+  RETURN_IF_ERROR(MaybeRegisterBuffers(stream.parent(), buffers, &comm,
+                                       use_symmetric_buffer));
   auto* gpu_comm = tsl::down_cast<GpuCommunicator*>(&comm);
   Future<> future = gpu_comm->GroupExecute(
       [&buffers, &stream](GpuCommunicator* comm) -> absl::Status {
         for (DeviceBufferPair& buffer : buffers) {
-          TF_RETURN_IF_ERROR(comm->LaunchAllGather(
+          RETURN_IF_ERROR(comm->LaunchAllGather(
               buffer.source_buffer, buffer.destination_buffer,
               buffer.element_type, buffer.element_count,
               GpuCollectives::On(stream)));
@@ -124,7 +184,7 @@ absl::Status RunAllGather(std::vector<DeviceBufferPair>& buffers,
         return absl::OkStatus();
       });
 
-  TF_RETURN_IF_ERROR(future.Await());
+  RETURN_IF_ERROR(future.Await());
   XLA_VLOG_DEVICE(3, device_ordinal) << "Done performing all-gather";
   return absl::OkStatus();
 }
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.h b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.h
index 422781194dc72c..2c3c9beb571448 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define XLA_BACKENDS_GPU_RUNTIME_ALL_GATHER_THUNK_H_
 
 #include <cstdint>
+#include <memory>
 #include <vector>
 
 #include "absl/status/status.h"
@@ -26,6 +27,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/stream.h"
 
 namespace xla {
@@ -41,6 +43,10 @@ class AllGatherStartThunk : public CollectiveThunk {
   AllGatherStartThunk(ThunkInfo thunk_info, const HloAllGatherInstruction* inst,
                       std::vector<Buffer> buffers,
                       bool p2p_memcpy_enabled = false);
+  AllGatherStartThunk(
+      ThunkInfo thunk_info,
+      std::shared_ptr<CollectiveThunk::AsyncEvents> async_events,
+      CollectiveConfig config, std::vector<Buffer> buffers);
 
   static const char* GetHloOpName() { return "all-gather-start"; }
 
@@ -54,6 +60,13 @@ class AllGatherStartThunk : public CollectiveThunk {
   const CollectiveConfig& config() const override { return config_.config; }
   absl::Span<const Buffer> buffers() const { return buffers_; }
 
+  static absl::StatusOr<std::unique_ptr<AllGatherStartThunk>> FromProto(
+      ThunkInfo thunk_info, const AllGatherStartThunkProto& thunk_proto,
+      absl::Span<const BufferAllocation> buffer_allocations,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
  protected:
   absl::StatusOr<bool> RunCollective(const ExecuteParams& params,
                                      const GpuCliqueKey& clique_key,
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk_test.cc
new file mode 100644
index 00000000000000..38a08a872fe0f4
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/all_gather_thunk.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/log/check.h"
+#include "xla/backends/gpu/runtime/collective_thunk.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        all_gather_start_thunk {
+          async_events_unique_id: 3
+          collective_config {}
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<AllGatherStartThunk> thunk,
+      AllGatherStartThunk::FromProto(thunk_info, proto.all_gather_start_thunk(),
+                                     buffer_allocations, async_events_map));
+  ASSERT_NE(thunk->async_events(), nullptr);
+
+  ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_all_gather_start_thunk()->set_async_events_unique_id(
+      round_trip_proto.all_gather_start_thunk().async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
index 1d94ec8c5c8ce0..c421e0323641d8 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstdint>
 #include <cstdlib>
+#include <iterator>
 #include <memory>
 #include <optional>
 #include <string>
@@ -45,6 +46,7 @@ limitations under the License.
 #include "xla/hlo/ir/collective_op_group_mode.h"
 #include "xla/primitive_util.h"
 #include "xla/runtime/device_id.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/rendezvous.h"
@@ -59,6 +61,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla::gpu {
 namespace {
@@ -148,6 +151,41 @@ bool CollectiveConfig::IsDegenerate(int64_t replica_count,
   }
 }
 
+CollectiveConfigProto CollectiveConfig::ToProto() const {
+  CollectiveConfigProto proto;
+
+  proto.mutable_operand_element_type()->Assign(operand_element_type.begin(),
+                                               operand_element_type.end());
+  proto.mutable_replica_groups()->Assign(replica_groups.begin(),
+                                         replica_groups.end());
+
+  proto.set_group_mode(group_mode);
+  proto.set_use_symmetric_buffer(use_symmetric_buffer);
+
+  return proto;
+}
+
+CollectiveConfig CollectiveConfig::FromProto(
+    const CollectiveConfigProto& proto) {
+  CollectiveConfig config;
+
+  config.operand_element_type.reserve(proto.operand_element_type_size());
+  for (int element_type : proto.operand_element_type()) {
+    config.operand_element_type.push_back(
+        static_cast<PrimitiveType>(element_type));
+  }
+
+  config.replica_groups.assign(proto.replica_groups().begin(),
+                               proto.replica_groups().end());
+
+  absl::c_copy(proto.replica_groups(),
+               std::back_inserter(config.replica_groups));
+
+  config.group_mode = proto.group_mode();
+  config.use_symmetric_buffer = proto.use_symmetric_buffer();
+  return config;
+}
+
 CollectiveConfig GetCollectiveConfig(
     const HloInstruction* hlo, std::optional<bool> use_global_device_ids) {
   CollectiveConfig config;
@@ -176,6 +214,13 @@ CollectiveThunk::CollectiveThunk(Kind kind, ThunkInfo thunk_info, bool is_sync,
       stream_kind_(stream_kind),
       async_events_(is_sync ? nullptr : std::make_shared<AsyncEvents>()) {}
 
+CollectiveThunk::CollectiveThunk(Kind kind, ThunkInfo thunk_info,
+                                 std::shared_ptr<AsyncEvents> async_events,
+                                 AsyncStreamKind stream_kind)
+    : Thunk(kind, thunk_info),
+      stream_kind_(stream_kind),
+      async_events_(async_events) {}
+
 absl::StatusOr<GpuCliqueKey> GetCollectiveGpuCliqueKey(
     const CollectiveParams& params, const CollectiveConfig& collective_config,
     bool include_participant_groups) {
@@ -217,7 +262,7 @@ absl::Status MaybeRegisterBuffer(se::StreamExecutor* executor,
                                  const se::DeviceAddressBase& buffer,
                                  Communicator* comm,
                                  bool use_symmetric_buffer) {
-  TF_ASSIGN_OR_RETURN(auto range, executor->GetMemoryRange(buffer));
+  ASSIGN_OR_RETURN(auto range, executor->GetMemoryRange(buffer));
   XLA_VLOG_DEVICE(1, executor->device_ordinal())
       << "Registering range: " << range.opaque()
       << " with size: " << range.size() << " for buffer: " << buffer.opaque()
@@ -235,17 +280,44 @@ absl::Status MaybeRegisterBuffers(se::StreamExecutor* executor,
                                   bool use_symmetric_buffer) {
   for (int i = 0; i < buffers.size(); ++i) {
     if (buffers[i].source_memory_space == kCollectiveMemorySpaceColor) {
-      TF_RETURN_IF_ERROR(MaybeRegisterBuffer(executor, buffers[i].source_buffer,
-                                             comm, use_symmetric_buffer));
+      RETURN_IF_ERROR(MaybeRegisterBuffer(executor, buffers[i].source_buffer,
+                                          comm, use_symmetric_buffer));
     }
     if (buffers[i].destination_memory_space == kCollectiveMemorySpaceColor) {
-      TF_RETURN_IF_ERROR(MaybeRegisterBuffer(
+      RETURN_IF_ERROR(MaybeRegisterBuffer(
           executor, buffers[i].destination_buffer, comm, use_symmetric_buffer));
     }
   }
   return absl::OkStatus();
 }
 
+absl::StatusOr<CollectiveBufferProto> CollectiveThunk::Buffer::ToProto() const {
+  CollectiveBufferProto proto;
+  proto.set_element_count(element_count);
+  ASSIGN_OR_RETURN(*proto.mutable_source_buffer(), source_buffer.ToProto());
+  ASSIGN_OR_RETURN(*proto.mutable_destination_buffer(),
+                   destination_buffer.ToProto());
+  proto.set_source_memory_space(source_memory_space);
+  proto.set_destination_memory_space(destination_memory_space);
+  return proto;
+}
+
+absl::StatusOr<CollectiveThunk::Buffer> CollectiveThunk::Buffer::FromProto(
+    const CollectiveBufferProto& buffer_proto,
+    absl::Span<const BufferAllocation> buffer_allocations) {
+  CollectiveThunk::Buffer res;
+  res.element_count = buffer_proto.element_count();
+  ASSIGN_OR_RETURN(res.source_buffer,
+                   BufferAllocation::Slice::FromProto(
+                       buffer_proto.source_buffer(), buffer_allocations));
+  ASSIGN_OR_RETURN(res.destination_buffer,
+                   BufferAllocation::Slice::FromProto(
+                       buffer_proto.destination_buffer(), buffer_allocations));
+  res.source_memory_space = buffer_proto.source_memory_space();
+  res.destination_memory_space = buffer_proto.destination_memory_space();
+  return res;
+}
+
 absl::Status CollectiveThunk::AsyncEvents::Initialize(
     se::StreamExecutor* executor) {
   absl::MutexLock lock(mu_);
@@ -253,7 +325,7 @@ absl::Status CollectiveThunk::AsyncEvents::Initialize(
     return absl::OkStatus();
   }
 
-  TF_ASSIGN_OR_RETURN(auto event, executor->CreateEvent());
+  ASSIGN_OR_RETURN(auto event, executor->CreateEvent());
 
   events_.try_emplace(executor, std::move(event));
   return absl::OkStatus();
@@ -274,7 +346,7 @@ absl::StatusOr<se::Event*> CollectiveThunk::AsyncEvents::GetEvent(
 
 absl::Status CollectiveThunk::Prepare(const PrepareParams& params) {
   TF_RET_CHECK(params.collective_params != nullptr);
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       GpuCliqueKey clique_key,
       GetGpuCliqueKey(*params.collective_params, config().replica_groups,
                       config().group_mode, GetAsyncStreamKind()));
@@ -283,7 +355,7 @@ absl::Status CollectiveThunk::Prepare(const PrepareParams& params) {
 
 absl::Status CollectiveThunk::Initialize(const InitializeParams& params) {
   if (async_events_) {
-    TF_RETURN_IF_ERROR(async_events_->Initialize(params.executor));
+    RETURN_IF_ERROR(async_events_->Initialize(params.executor));
   }
   return absl::OkStatus();
 }
@@ -294,15 +366,14 @@ absl::Status CollectiveThunk::ExecuteOnStream(const ExecuteParams& params) {
       IsAsync() ? "async" : "sync", Thunk::KindToString(kind()));
   AsyncStreamKind stream_kind = GetAsyncStreamKind();
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       GpuCliqueKey clique_key,
       GetGpuCliqueKey(*params.collective_params, config().replica_groups,
                       config().group_mode, stream_kind));
 
-  TF_ASSIGN_OR_RETURN(
-      Communicator * comm,
-      params.collective_cliques->GetComm(
-          clique_key, params.collective_params->global_device_id));
+  ASSIGN_OR_RETURN(Communicator * comm,
+                   params.collective_cliques->GetComm(
+                       clique_key, params.collective_params->global_device_id));
   DCHECK(comm) << "Failed to get communicator for collective operation";
 
   se::StreamExecutor* executor = params.stream->parent();
@@ -315,20 +386,18 @@ absl::Status CollectiveThunk::ExecuteOnStream(const ExecuteParams& params) {
         *params.collective_params->async_streams.at(async_stream_idx);
 
     // Wait for main compute stream to make sure all buffers are ready.
-    TF_RETURN_IF_ERROR(async_stream.WaitFor(params.stream));
+    RETURN_IF_ERROR(async_stream.WaitFor(params.stream));
 
-    TF_ASSIGN_OR_RETURN(is_first_rendezvous_needed,
-                        RunCollective(params, clique_key, async_stream, *comm));
+    ASSIGN_OR_RETURN(is_first_rendezvous_needed,
+                     RunCollective(params, clique_key, async_stream, *comm));
 
     // Record collective operation completion.
-    TF_ASSIGN_OR_RETURN(se::Event * event, async_events_->GetEvent(executor));
-    TF_RETURN_IF_ERROR(async_stream.RecordEvent(event));
-
+    ASSIGN_OR_RETURN(se::Event * event, async_events_->GetEvent(executor));
+    RETURN_IF_ERROR(async_stream.RecordEvent(event));
   } else {
     // Launch collective operation on a main stream.
-    TF_ASSIGN_OR_RETURN(
-        is_first_rendezvous_needed,
-        RunCollective(params, clique_key, *params.stream, *comm));
+    ASSIGN_OR_RETURN(is_first_rendezvous_needed,
+                     RunCollective(params, clique_key, *params.stream, *comm));
   }
 
   // After a first execution of this instance of collective operation do a
@@ -356,7 +425,7 @@ absl::Status CollectiveThunk::ExecuteOnStream(const ExecuteParams& params) {
 
     const xla::DebugOptions debug_options = xla::GetDebugOptionsFromFlags();
 
-    TF_RETURN_IF_ERROR(Rendezvous(
+    RETURN_IF_ERROR(Rendezvous(
         first_call_rendezvous_flag_, rendezvous_name, rendezvous_key,
         num_local_participants,
         /*warn_stuck_timeout=*/
@@ -378,14 +447,13 @@ absl::StatusOr<std::vector<Communicator*>> CollectiveThunk::GetCommunicators(
     const ExecuteParams& params) const {
   AsyncStreamKind stream_kind = GetAsyncStreamKind();
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       GpuCliqueKey clique_key,
       GetGpuCliqueKey(*params.collective_params, config().replica_groups,
                       config().group_mode, stream_kind));
-  TF_ASSIGN_OR_RETURN(
-      Communicator * comm,
-      params.collective_cliques->GetComm(
-          clique_key, params.collective_params->global_device_id));
+  ASSIGN_OR_RETURN(Communicator * comm,
+                   params.collective_cliques->GetComm(
+                       clique_key, params.collective_params->global_device_id));
   return std::vector<Communicator*>{comm};
 }
 
@@ -436,7 +504,7 @@ CollectiveDoneThunk::CollectiveDoneThunk(
 
 absl::Status CollectiveDoneThunk::ExecuteOnStream(const ExecuteParams& params) {
   se::StreamExecutor* executor = params.stream->parent();
-  TF_ASSIGN_OR_RETURN(se::Event * event, async_events_->GetEvent(executor));
+  ASSIGN_OR_RETURN(se::Event * event, async_events_->GetEvent(executor));
   return params.stream->WaitFor(event);
 }
 
@@ -490,8 +558,8 @@ CollectiveDoneThunk::FromProto(
     async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
   }
 
-  TF_ASSIGN_OR_RETURN(Thunk::Kind kind,
-                      Thunk::KindFromProto(thunk_proto.thunk_kind()));
+  ASSIGN_OR_RETURN(Thunk::Kind kind,
+                   Thunk::KindFromProto(thunk_proto.thunk_kind()));
   return std::make_unique<CollectiveDoneThunk>(kind, std::move(thunk_info),
                                                async_events,
                                                thunk_proto.async_stream_kind());
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
index be08146f9a95c5..702db0ab6e5cfd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
@@ -57,6 +57,9 @@ struct CollectiveConfig {
   std::vector<ReplicaGroup> replica_groups;
   CollectiveOpGroupMode group_mode;
   bool use_symmetric_buffer;
+
+  CollectiveConfigProto ToProto() const;
+  static CollectiveConfig FromProto(const CollectiveConfigProto& proto);
 };
 
 CollectiveConfig GetCollectiveConfig(const HloInstruction* hlo,
@@ -96,6 +99,11 @@ class CollectiveThunk : public Thunk {
     BufferAllocation::Slice destination_buffer;
     int64_t source_memory_space;
     int64_t destination_memory_space;
+
+    absl::StatusOr<CollectiveBufferProto> ToProto() const;
+    static absl::StatusOr<Buffer> FromProto(
+        const CollectiveBufferProto& buffer_proto,
+        absl::Span<const BufferAllocation> buffer_allocations);
   };
 
   // Completion events for asynchronous collective operations (operations
@@ -120,6 +128,10 @@ class CollectiveThunk : public Thunk {
   using AsyncEventsMap =
       absl::flat_hash_map<AsyncEventsUniqueId, std::shared_ptr<AsyncEvents>>;
 
+  CollectiveThunk(Kind kind, ThunkInfo thunk_info,
+                  std::shared_ptr<AsyncEvents> async_events,
+                  AsyncStreamKind stream_kind);
+
   // Logging support.
   static std::string GetDeviceString(const CollectiveParams& params);
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 0cd18e571009c5..36b2e69ac96c4c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -377,12 +377,33 @@ message CustomKernelThunkProto {
   CustomKernelProto custom_kernel = 3;
 }
 
+message CollectiveBufferProto {
+  int64 element_count = 1;
+  xla.buffer_assignment.BufferAllocationSliceProto source_buffer = 2;
+  xla.buffer_assignment.BufferAllocationSliceProto destination_buffer = 3;
+  int64 source_memory_space = 4;
+  int64 destination_memory_space = 5;
+}
+
+message CollectiveConfigProto {
+  repeated PrimitiveType operand_element_type = 1;
+  repeated ReplicaGroup replica_groups = 2;
+  CollectiveOpGroupMode group_mode = 3;
+  bool use_symmetric_buffer = 4;
+}
+
 message CollectiveThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
   uint64 async_events_unique_id = 3;
 }
 
+message AllGatherStartThunkProto {
+  uint64 async_events_unique_id = 1;
+  CollectiveConfigProto collective_config = 2;
+  repeated CollectiveBufferProto buffers = 3;
+}
+
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
@@ -428,6 +449,7 @@ message ThunkProto {
     HostRecvDoneThunkProto host_recv_done_thunk = 35;
     CustomKernelThunkProto custom_kernel_thunk = 36;
     CollectiveDoneThunkProto collective_done_thunk = 37;
+    AllGatherStartThunkProto all_gather_start_thunk = 38;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index a34814163fe4be..4b1a965a41a449 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "google/protobuf/descriptor.h"
 #include "google/protobuf/message.h"
+#include "xla/backends/gpu/runtime/all_gather_thunk.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_reorder_thunk.h"
@@ -242,6 +243,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return CollectiveDoneThunk::FromProto(std::move(thunk_info),
                                             thunk_proto.collective_done_thunk(),
                                             collective_async_events_map);
+    case ThunkProto::kAllGatherStartThunk:
+      return AllGatherStartThunk::FromProto(
+          std::move(thunk_info), thunk_proto.all_gather_start_thunk(),
+          buffer_allocations, collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);

From 1b7be6132ce305b8d5bd8c5f5613c2c0a00aa959 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Tue, 16 Dec 2025 16:31:25 -0800
Subject: [PATCH 355/753] Make `is_sdy_partitioned` warning less verbose

`Operation::emitWarning()` by default attaches the entire op (module in this case) as a note, which is rarely useful for this particular warning.

PiperOrigin-RevId: 845479681
---
 .../transforms/ifrt_lower_atom_program_metadata_to_xla_pass.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_lower_atom_program_metadata_to_xla_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_lower_atom_program_metadata_to_xla_pass.cc
index 66c8aa2d6a3eb9..063423317fbbfe 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_lower_atom_program_metadata_to_xla_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_lower_atom_program_metadata_to_xla_pass.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Pass/Pass.h"
 #include "xla/hlo/ir/hlo_sharding.h"
@@ -94,7 +95,7 @@ void IfrtLowerAtomProgramMetadataToXlaPass::runOnOperation() {
   // after 6 month bwd compatibility window.
   bool is_sdy = module_op->hasAttr(kIsSdyPartitioned);
   if (is_sdy) {
-    module_op.emitWarning()
+    mlir::emitWarning(module_op->getLoc())
         << "`" << kIsSdyPartitioned
         << "` attribute is deprecated and will be removed. See b/433244129."
            " Please use `compile_options_override` to specify sharding.";

From 91e0e5c4e0934182e61ed5c3484b3c11b6a9c5e8 Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Tue, 16 Dec 2025 17:10:02 -0800
Subject: [PATCH 356/753] [ReplicaGroupV3][Refactor][1/n] Add virtual base
 class which all replica groups inherit from.

PiperOrigin-RevId: 845493049
---
 third_party/xla/xla/hlo/ir/replica_group.cc   |  27 +--
 third_party/xla/xla/hlo/ir/replica_group.h    | 162 +++++++++++++-----
 .../xla/xla/service/spmd/spmd_partitioner.cc  |  34 ++--
 3 files changed, 147 insertions(+), 76 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/replica_group.cc b/third_party/xla/xla/hlo/ir/replica_group.cc
index e78141fc8fee05..c7a8ecc2eb8f25 100644
--- a/third_party/xla/xla/hlo/ir/replica_group.cc
+++ b/third_party/xla/xla/hlo/ir/replica_group.cc
@@ -189,7 +189,8 @@ std::vector<std::vector<int64_t>> get_replica_groups_for_full_axes(
   return replica_groups;
 }
 
-void MeshAxesReplicaGroupList::InitializeDimToReshapeAndAggregateAxes() {
+absl::flat_hash_map<int64_t, MeshAxesReplicaGroupList::ReshapeAndAggregateAxes>
+MeshAxesReplicaGroupList::GetDimToReshapeAndAggregateAxes() const {
   absl::flat_hash_map<int64_t, std::vector<AxisRef>> dim_to_axes;
   for (const AxisRef& axis : axes_) {
     dim_to_axes[axis.mesh_axis_index()].push_back(axis);
@@ -216,17 +217,14 @@ void MeshAxesReplicaGroupList::InitializeDimToReshapeAndAggregateAxes() {
     }
     dim_map[dim] = reshape_and_aggregate_axes;
   }
-  dim_to_reshape_and_aggregate_axes_ = dim_map;
+  return dim_map;
 }
 
 std::pair<std::vector<int64_t>, std::vector<int64_t>>
-MeshAxesReplicaGroupList::ComputeReindexedAxes() {
-  if (!dim_to_reshape_and_aggregate_axes_.has_value()) {
-    InitializeDimToReshapeAndAggregateAxes();
-  }
+MeshAxesReplicaGroupList::ComputeReindexedAxes() const {
   std::vector<int64_t> reindex_axis_sizes, reindexed_grouped_axes;
   absl::flat_hash_map<int64_t, ReshapeAndAggregateAxes> dim_map =
-      dim_to_reshape_and_aggregate_axes_.value();
+      GetDimToReshapeAndAggregateAxes();
   for (int64_t i = 0; i < mesh_.axis_sizes().size(); ++i) {
     int64_t axis_size = mesh_.axis_size(i);
     auto it = dim_map.find(i);
@@ -247,7 +245,7 @@ MeshAxesReplicaGroupList::ComputeReindexedAxes() {
 }
 
 std::vector<std::vector<int64_t>>
-MeshAxesReplicaGroupList::flattened_replica_groups() {
+MeshAxesReplicaGroupList::flattened_replica_groups() const {
   std::vector<int64_t> reindex_axis_sizes, reindexed_grouped_axes;
   std::tie(reindex_axis_sizes, reindexed_grouped_axes) = ComputeReindexedAxes();
   return get_replica_groups_for_full_axes(
@@ -292,7 +290,7 @@ MeshAxesReplicaGroupList MeshAxesReplicaGroupList::FromProto(
   return MeshAxesReplicaGroupList(mesh, axes);
 }
 
-IotaReplicaGroupList MeshAxesReplicaGroupList::ToIotaReplicaGroupList() {
+IotaReplicaGroupList MeshAxesReplicaGroupList::ToIotaReplicaGroupList() const {
   CHECK(mesh_.device_assignment().iota().has_value());
   std::vector<int64_t> reshape_dims, reindexed_grouped_axes;
   std::tie(reshape_dims, reindexed_grouped_axes) = ComputeReindexedAxes();
@@ -312,7 +310,7 @@ IotaReplicaGroupList MeshAxesReplicaGroupList::ToIotaReplicaGroupList() {
                               reshape_dims, transpose_perm);
 }
 
-CollectiveDeviceList MeshAxesReplicaGroupList::ToCollectiveDeviceList() {
+CollectiveDeviceList MeshAxesReplicaGroupList::ToCollectiveDeviceList() const {
   return CollectiveDeviceList(flattened_replica_groups());
 }
 
@@ -413,15 +411,22 @@ CollectiveDeviceList::flattened_replica_groups() const {
   return result;
 }
 
+std::string CollectiveDeviceList::ToString() const {
+  return ToString(/*print_full_replica_group_list=*/false);
+}
+
 std::string CollectiveDeviceList::ToString(
     bool print_full_replica_group_list) const {
   if (iota_replica_group_list_.has_value() && !print_full_replica_group_list) {
     return iota_replica_group_list_->ToString();
   }
-
   return ReplicaGroupsToString(replica_groups());
 }
 
+void CollectiveDeviceList::Print(Printer* printer) const {
+  return Print(printer, /*print_full_replica_group_list=*/false);
+}
+
 void CollectiveDeviceList::Print(Printer* printer,
                                  bool print_full_replica_group_list) const {
   if (iota_replica_group_list_.has_value() && !print_full_replica_group_list) {
diff --git a/third_party/xla/xla/hlo/ir/replica_group.h b/third_party/xla/xla/hlo/ir/replica_group.h
index b6e30d24071ef3..92e23d1310dde5 100644
--- a/third_party/xla/xla/hlo/ir/replica_group.h
+++ b/third_party/xla/xla/hlo/ir/replica_group.h
@@ -41,7 +41,57 @@ namespace xla {
 class IotaReplicaGroupList;
 class CollectiveDeviceList;
 
-class MeshAxesReplicaGroupList {
+enum class CollectiveDeviceListVersion { kListOfLists, kIota, kMeshAxes };
+
+class CollectiveDeviceListBase {
+ public:
+  virtual ~CollectiveDeviceListBase() = default;
+  CollectiveDeviceListBase() = default;
+  CollectiveDeviceListBase(const CollectiveDeviceListBase&) = default;
+  CollectiveDeviceListBase& operator=(const CollectiveDeviceListBase&) =
+      default;
+  CollectiveDeviceListBase(CollectiveDeviceListBase&&) = default;
+  CollectiveDeviceListBase& operator=(CollectiveDeviceListBase&&) = default;
+
+  virtual int64_t num_replica_groups() const = 0;
+  virtual int64_t num_devices_per_group() const = 0;
+  int64_t num_total_devices() const {
+    return num_replica_groups() * num_devices_per_group();
+  }
+  virtual std::vector<std::vector<int64_t>> flattened_replica_groups()
+      const = 0;
+  virtual const std::vector<ReplicaGroup>& replica_groups() const {
+    if (replica_groups_ != nullptr) {
+      return *replica_groups_;
+    }
+    replica_groups_ = std::make_shared<std::vector<ReplicaGroup>>();
+    replica_groups_->reserve(num_replica_groups());
+    for (const auto& group : flattened_replica_groups()) {
+      ReplicaGroup replica_group;
+      replica_group.mutable_replica_ids()->Add(group.begin(), group.end());
+      replica_groups_->push_back(std::move(replica_group));
+    }
+    return *replica_groups_;
+  };
+  virtual void Print(Printer* printer) const = 0;
+  virtual void Print(Printer* printer,
+                     bool print_full_replica_group_list) const {
+    return Print(printer);
+  };
+  virtual std::string ToString() const = 0;
+  virtual std::string ToString(bool print_full_replica_group_list) const {
+    return ToString();
+  };
+
+  virtual std::unique_ptr<CollectiveDeviceListBase> Clone() const = 0;
+
+  virtual CollectiveDeviceListVersion version() const = 0;
+
+  // shared_ptr for fast copy.
+  mutable std::shared_ptr<std::vector<ReplicaGroup>> replica_groups_ = nullptr;
+};
+
+class MeshAxesReplicaGroupList : public CollectiveDeviceListBase {
   struct ReshapeAndAggregateAxes {
     std::vector<int64_t> reshape_dims;
     std::vector<int64_t> aggregate_axes;
@@ -59,30 +109,33 @@ class MeshAxesReplicaGroupList {
     return H::combine(std::move(h), c.mesh_, c.axes_);
   }
 
-  int64_t num_replica_groups() const;
-  int64_t num_devices_per_group() const;
-  std::vector<std::vector<int64_t>> flattened_replica_groups();
-
-  void Print(Printer* printer) const;
-
-  std::string ToString() const;
-
+  int64_t num_replica_groups() const override;
+  int64_t num_devices_per_group() const override;
+  std::vector<std::vector<int64_t>> flattened_replica_groups() const override;
+  void Print(Printer* printer) const override;
+  std::string ToString() const override;
   MeshAxesReplicaGroupListProto ToProto() const;
+  std::unique_ptr<CollectiveDeviceListBase> Clone() const override {
+    return std::make_unique<MeshAxesReplicaGroupList>(*this);
+  };
+  CollectiveDeviceListVersion version() const override {
+    return CollectiveDeviceListVersion::kMeshAxes;
+  }
 
   static MeshAxesReplicaGroupList FromProto(
       const MeshAxesReplicaGroupListProto& proto);
 
   // Methods for converting to V2 and V1 representations.
-  IotaReplicaGroupList ToIotaReplicaGroupList();
-  CollectiveDeviceList ToCollectiveDeviceList();
+  IotaReplicaGroupList ToIotaReplicaGroupList() const;
+  CollectiveDeviceList ToCollectiveDeviceList() const;
 
  private:
-  void InitializeDimToReshapeAndAggregateAxes();
-  std::pair<std::vector<int64_t>, std::vector<int64_t>> ComputeReindexedAxes();
+  absl::flat_hash_map<int64_t, ReshapeAndAggregateAxes>
+  GetDimToReshapeAndAggregateAxes() const;
+  std::pair<std::vector<int64_t>, std::vector<int64_t>> ComputeReindexedAxes()
+      const;
   Mesh mesh_;
   std::vector<AxisRef> axes_;
-  std::optional<absl::flat_hash_map<int64_t, ReshapeAndAggregateAxes>>
-      dim_to_reshape_and_aggregate_axes_;
 };
 
 std::string ReplicaGroupsToString(
@@ -92,7 +145,7 @@ std::string ReplicaGroupsToString(
 // reshaping and transposing an iota array (iota tile assignment). Can be used
 // to represent certain common patterns of device lists in a compact, scalable
 // format.
-class IotaReplicaGroupList {
+class IotaReplicaGroupList : public CollectiveDeviceListBase {
  public:
   explicit IotaReplicaGroupList(int64_t num_replica_groups,
                                 int64_t num_devices_per_group)
@@ -125,8 +178,8 @@ class IotaReplicaGroupList {
                       c.transpose_perm());
   }
 
-  int64_t num_replica_groups() const;
-  int64_t num_devices_per_group() const;
+  int64_t num_replica_groups() const override;
+  int64_t num_devices_per_group() const override;
   absl::Span<const int64_t> reshape_dims() const {
     return iota_tile_assignment_.reshape_dims();
   }
@@ -134,11 +187,17 @@ class IotaReplicaGroupList {
     return iota_tile_assignment_.transpose_perm();
   }
   Array<int64_t> ToArray() const { return iota_tile_assignment_.ToArray(); }
-  std::vector<std::vector<int64_t>> flattened_replica_groups() const;
+  std::vector<std::vector<int64_t>> flattened_replica_groups() const override;
 
-  void Print(Printer* printer) const;
+  void Print(Printer* printer) const override;
 
-  std::string ToString() const;
+  std::string ToString() const override;
+  std::unique_ptr<CollectiveDeviceListBase> Clone() const override {
+    return std::make_unique<IotaReplicaGroupList>(*this);
+  };
+  CollectiveDeviceListVersion version() const override {
+    return CollectiveDeviceListVersion::kIota;
+  }
 
   IotaReplicaGroupListProto ToProto() const;
 
@@ -153,22 +212,26 @@ class IotaReplicaGroupList {
 // Represents a series of devices participating in a collective operation
 // (all-gather, all-reduce, etc.). While this directly translates to a list of
 // replica groups, it may be used to represent these lists in compact forms.
-class CollectiveDeviceList {
+class CollectiveDeviceList : public CollectiveDeviceListBase {
  public:
-  explicit CollectiveDeviceList()
-      : replica_groups_(std::make_shared<std::vector<ReplicaGroup>>()) {};
+  explicit CollectiveDeviceList() {
+    replica_groups_ = std::make_shared<std::vector<ReplicaGroup>>();
+  };
 
-  explicit CollectiveDeviceList(std::vector<ReplicaGroup> replica_groups)
-      : replica_groups_(std::make_shared<std::vector<ReplicaGroup>>(
-            std::move(replica_groups))) {};
+  explicit CollectiveDeviceList(std::vector<ReplicaGroup> replica_groups) {
+    replica_groups_ =
+        std::make_shared<std::vector<ReplicaGroup>>(std::move(replica_groups));
+  };
 
-  explicit CollectiveDeviceList(absl::Span<const ReplicaGroup> replica_groups)
-      : replica_groups_(std::make_shared<std::vector<ReplicaGroup>>(
-            replica_groups.begin(), replica_groups.end())) {};
+  explicit CollectiveDeviceList(absl::Span<const ReplicaGroup> replica_groups) {
+    replica_groups_ = std::make_shared<std::vector<ReplicaGroup>>(
+        replica_groups.begin(), replica_groups.end());
+  };
 
   explicit CollectiveDeviceList(
-      absl::Span<const std::vector<int64_t>> replica_groups)
-      : replica_groups_(ToReplicaGroupVector(replica_groups)) {};
+      absl::Span<const std::vector<int64_t>> replica_groups) {
+    replica_groups_ = ToReplicaGroupVector(replica_groups);
+  };
 
   // Replica groups are materialized lazily upon first access.
   explicit CollectiveDeviceList(
@@ -206,41 +269,51 @@ class CollectiveDeviceList {
   }
 
   // Lazyly explands iota if applicable.
-  const std::vector<ReplicaGroup>& replica_groups() const;
-  std::vector<std::vector<int64_t>> flattened_replica_groups() const;
+  const std::vector<ReplicaGroup>& replica_groups() const override;
+  std::vector<std::vector<int64_t>> flattened_replica_groups() const override;
   const std::optional<IotaReplicaGroupList>& iota_replica_group_list() const {
     return iota_replica_group_list_;
   }
 
-  int64_t num_replica_groups() const {
+  int64_t num_replica_groups() const override {
     return iota_replica_group_list_.has_value()
                ? iota_replica_group_list_->num_replica_groups()
                : replica_groups_->size();
   }
 
-  int64_t num_devices_per_group() const {
+  int64_t num_devices_per_group() const override {
     return iota_replica_group_list_.has_value()
                ? iota_replica_group_list_->num_devices_per_group()
                : replica_groups_->begin()->replica_ids_size();
   }
 
+  void Print(Printer* printer) const override;
   void Print(Printer* printer,
-             bool print_full_replica_group_list = false) const;
-
-  std::string ToString(bool print_full_replica_group_list = false) const;
+             bool print_full_replica_group_list) const override;
+  std::string ToString() const override;
+  std::string ToString(bool print_full_replica_group_list) const override;
+  CollectiveDeviceListVersion version() const override {
+    if (iota_replica_group_list_.has_value()) {
+      return CollectiveDeviceListVersion::kIota;
+    }
+    return CollectiveDeviceListVersion::kListOfLists;
+  }
 
   CollectiveDeviceListProto ToProto() const;
   static CollectiveDeviceList FromProto(const CollectiveDeviceListProto& proto);
   static CollectiveDeviceList FromProto(const HloInstructionProto& proto);
+  std::unique_ptr<CollectiveDeviceListBase> Clone() const override {
+    return std::make_unique<CollectiveDeviceList>(*this);
+  };
 
  private:
   // Construct collective device list from protobuf replica group start and end
   // iterators.
   CollectiveDeviceList(
       tsl::protobuf::RepeatedPtrField<ReplicaGroup>::const_iterator start,
-      tsl::protobuf::RepeatedPtrField<ReplicaGroup>::const_iterator end)
-      : replica_groups_(
-            std::make_shared<std::vector<ReplicaGroup>>(start, end)) {};
+      tsl::protobuf::RepeatedPtrField<ReplicaGroup>::const_iterator end) {
+    replica_groups_ = std::make_shared<std::vector<ReplicaGroup>>(start, end);
+  };
 
   static std::shared_ptr<std::vector<ReplicaGroup>> ToReplicaGroupVector(
       absl::Span<const std::vector<int64_t>> replica_groups) {
@@ -258,10 +331,11 @@ class CollectiveDeviceList {
   void MaybeMaterializeFullReplicaGroupList() const;
 
   std::optional<IotaReplicaGroupList> iota_replica_group_list_;
-  // shared_ptr for fast copy.
-  mutable std::shared_ptr<std::vector<ReplicaGroup>> replica_groups_ = nullptr;
 };
 
+std::optional<CollectiveDeviceList> ConvertToV1CollectiveDeviceList(
+    const CollectiveDeviceListBase& device_list);
+
 }  // namespace xla
 
 #endif  // XLA_HLO_IR_REPLICA_GROUP_H_
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index 1f4c6f84b6b968..4f031265de029f 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -4953,12 +4953,6 @@ absl::Status SpmdPartitioningVisitor::HandleRaggedDot(HloInstruction* hlo) {
 
 SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                                                         int64_t num_replicas) {
-  auto uses_all_partitions =
-      [num_partitions](const IotaReplicaGroupList& partition_group_list) {
-        return partition_group_list.num_replica_groups() *
-                   partition_group_list.num_devices_per_group() ==
-               num_partitions;
-      };
   auto create_all_reduce_lists_of_lists =
       [num_replicas, num_partitions](
           SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
@@ -5053,14 +5047,14 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 b, operand, reduction, partition_subgroups, channel_id);
           },
       .create_cross_partition_all_reduce_with_iota_device_list =
-          [create_all_reduce_lists_of_lists, uses_all_partitions, num_replicas,
-           num_partitions](SpmdBuilder* b, HloInstruction* operand,
-                           HloComputation* reduction,
-                           const IotaReplicaGroupList& partition_group_list,
-                           int64_t channel_id) {
+          [create_all_reduce_lists_of_lists, num_replicas, num_partitions](
+              SpmdBuilder* b, HloInstruction* operand,
+              HloComputation* reduction,
+              const IotaReplicaGroupList& partition_group_list,
+              int64_t channel_id) {
             // Fallback to list of lists collective creation if the partition
             // group list does not utilize all the partitions.
-            if (!uses_all_partitions(partition_group_list)) {
+            if (partition_group_list.num_total_devices() != num_partitions) {
               return create_all_reduce_lists_of_lists(
                   b, operand, reduction,
                   partition_group_list.flattened_replica_groups(), channel_id);
@@ -5111,14 +5105,13 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 b, operands, partition_subgroups, channel_id, split_dimension);
           },
       .create_cross_partition_all_to_all_with_iota_device_list =
-          [create_all_to_all_list_of_lists, uses_all_partitions, num_replicas,
-           num_partitions](
+          [create_all_to_all_list_of_lists, num_replicas, num_partitions](
               SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
               const IotaReplicaGroupList& partition_group_list,
               int64_t channel_id, std::optional<int64_t> split_dimension) {
             // Fallback back to list of lists collective creation if the
             // partition group list does not utilize all the partitions.
-            if (!uses_all_partitions(partition_group_list)) {
+            if (partition_group_list.num_total_devices() != num_partitions) {
               return create_all_to_all_list_of_lists(
                   b, operands, partition_group_list.flattened_replica_groups(),
                   channel_id, split_dimension);
@@ -5143,14 +5136,13 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 all_gather_dimension);
           },
       .create_cross_partition_all_gather_with_iota_device_list =
-          [create_all_gather_list_of_lists, uses_all_partitions, num_replicas,
-           num_partitions](SpmdBuilder* b, HloInstruction* operand,
-                           const Shape& ag_shape,
-                           const IotaReplicaGroupList& partition_group_list,
-                           int64_t channel_id, int64_t all_gather_dimension) {
+          [create_all_gather_list_of_lists, num_replicas, num_partitions](
+              SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
+              const IotaReplicaGroupList& partition_group_list,
+              int64_t channel_id, int64_t all_gather_dimension) {
             // Fallback to list of lists collective creation if the partition
             // group list does not utilize all the partitions.
-            if (!uses_all_partitions(partition_group_list)) {
+            if (partition_group_list.num_total_devices() != num_partitions) {
               return create_all_gather_list_of_lists(
                   b, operand, ag_shape,
                   partition_group_list.flattened_replica_groups(), channel_id,

From c744da664cbc3d319688e1d760e4020e2ec27266 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Tue, 16 Dec 2025 17:10:44 -0800
Subject: [PATCH 357/753] Make miscellaneous clang-suggested fixes to
 coordination service.

PiperOrigin-RevId: 845493331
---
 .../coordination/coordination_service.cc      | 66 ++++++++++++-------
 .../coordination_service_agent.cc             | 11 +++-
 2 files changed, 49 insertions(+), 28 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
index a3ec50ee8f5971..d524685fc760c1 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
@@ -98,7 +98,9 @@ absl::Status MakeShutdownBarrierError(const absl::Status& error) {
 
 void CoordinationService::ErrorPollingState::SetError(
     const absl::Status& error) {
-  if (responded_) return;
+  if (responded_) {
+    return;
+  }
   responded_ = true;
   error_ = error;
   for (auto& [_, done_cb] : done_callbacks_) {
@@ -119,7 +121,9 @@ void CoordinationService::ErrorPollingState::RemoveTask(
 void CoordinationService::ErrorPollingState::AddTask(
     const CoordinatedTask& task, tsl::StatusCallback&& done) {
   // Do not allow to insert a task if the service has already responded.
-  if (Responded()) return;
+  if (Responded()) {
+    return;
+  }
   polling_task_names_.insert(GetTaskName(task));
   RemoveTask(task, "new request from the same task");
   done_callbacks_[task] = done;
@@ -143,7 +147,9 @@ void CoordinationService::TaskState::Disconnect(
 }
 
 bool CoordinationService::TaskState::SetError(const absl::Status& status) {
-  if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) return false;
+  if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) {
+    return false;
+  }
   state_ = CoordinatedTaskState::TASKSTATE_ERROR;
   status_ = status;
   return true;
@@ -151,7 +157,9 @@ bool CoordinationService::TaskState::SetError(const absl::Status& status) {
 
 absl::Status CoordinationService::TaskState::RecordHeartbeat(
     IncarnationId task_incarnation) {
-  if (!status_.ok()) return status_;
+  if (!status_.ok()) {
+    return status_;
+  }
   // Record heartbeat.
   if (task_incarnation_ == task_incarnation) {
     absl::MutexLock l(last_heartbeat_mu_);
@@ -161,14 +169,13 @@ absl::Status CoordinationService::TaskState::RecordHeartbeat(
   // Task incarnation mismatch!
   if (IsRecoverable()) {
     return absl::OkStatus();  // Ignore, but don't record new heartbeat.
-  } else {
-    return MakeCoordinationError(absl::AbortedError(absl::StrCat(
-        task_name_, " Heartbeat: Incarnation ID mismatch: expecting ",
-        task_incarnation_.value(), " but got ", task_incarnation.value(),
-        ". The task has restarted and likely crashed earlier - check for any "
-        "earlier errors or any scheduler events (e.g. preemption, eviction) to "
-        "debug further.")));
   }
+  return MakeCoordinationError(absl::AbortedError(absl::StrCat(
+      task_name_, " Heartbeat: Incarnation ID mismatch: expecting ",
+      task_incarnation_.value(), " but got ", task_incarnation.value(),
+      ". The task has restarted and likely crashed earlier - check for any "
+      "earlier errors or any scheduler events (e.g. preemption, eviction) to "
+      "debug further.")));
 }
 
 int64_t CoordinationService::TaskState::TimeSinceLastHeartbeatMs() {
@@ -625,7 +632,8 @@ void CoordinationService::RegisterTaskAsync(const CoordinatedTask& task,
     done(absl::OkStatus());
     ClusterStateUpdated();
     return;
-  } else if (task_state == CoordinatedTaskState::TASKSTATE_CONNECTED) {
+  }
+  if (task_state == CoordinatedTaskState::TASKSTATE_CONNECTED) {
     // This may happen if the service processes the initial RegisterTask(),
     // but the agent did not receive the response so the agent retries again.
     if (task_cluster_state->GetTaskIncarnation() == incarnation ||
@@ -639,12 +647,11 @@ void CoordinationService::RegisterTaskAsync(const CoordinatedTask& task,
       done(absl::OkStatus());
       ClusterStateUpdated();
       return;
-    } else {
-      error_message =
-          absl::StrCat(task_name,
-                       " unexpectedly tried to connect with a different "
-                       "incarnation. It has likely restarted.");
     }
+    error_message =
+        absl::StrCat(task_name,
+                     " unexpectedly tried to connect with a different "
+                     "incarnation. It has likely restarted.");
   } else {
     // This task is already in error, which implies it has registered
     // previously.
@@ -744,7 +751,8 @@ absl::Status CoordinationService::DisconnectTask(const CoordinatedTask& task) {
         absl::StrCat("Coordination service has stopped. DisconnectTask() "
                      "failed for task_name=",
                      task_name)));
-  } else if (!cluster_state_.contains(task_name)) {
+  }
+  if (!cluster_state_.contains(task_name)) {
     return MakeCoordinationError(absl::InvalidArgumentError(absl::StrCat(
         "Unexpected disconnect request with task_name=", task_name)));
   }
@@ -782,11 +790,13 @@ absl::Status CoordinationService::ReportTaskError(const CoordinatedTask& task,
   if (ServiceHasStopped()) {
     return MakeCoordinationError(absl::InternalError(
         "Coordination service has stopped. ReportTaskError() failed."));
-  } else if (!cluster_state_.contains(task_name)) {
+  }
+  if (!cluster_state_.contains(task_name)) {
     return MakeCoordinationError(absl::InvalidArgumentError(
         absl::StrCat("Unexpected request from task ", task_name)));
-  } else if (cluster_state_[task_name]->GetState() !=
-             CoordinatedTaskState::TASKSTATE_CONNECTED) {
+  }
+  if (cluster_state_[task_name]->GetState() !=
+      CoordinatedTaskState::TASKSTATE_CONNECTED) {
     return MakeCoordinationError(absl::FailedPreconditionError(
         "The task is not connected or already has an error."));
   }
@@ -878,7 +888,8 @@ absl::Status CoordinationService::RecordHeartbeat(const CoordinatedTask& task,
         "gracefully. Check the task leader's logs for an earlier error or "
         "scheduler events (e.g. preemption, eviction) to debug the root "
         "cause.")));
-  } else if (!cluster_state_.contains(task_name)) {
+  }
+  if (!cluster_state_.contains(task_name)) {
     return MakeCoordinationError(absl::InvalidArgumentError(
         absl::StrCat("Unexpected heartbeat request from task: ", task_name,
                      ". This usually implies a configuration error.")));
@@ -889,7 +900,8 @@ absl::Status CoordinationService::RecordHeartbeat(const CoordinatedTask& task,
         "Unexpected heartbeat request from an already-in-error task: ",
         task_name,
         " with existing error: ", task_state->GetStatus().ToString())));
-  } else if (task_state->IsDisconnectedBeyondGracePeriod()) {
+  }
+  if (task_state->IsDisconnectedBeyondGracePeriod()) {
     // We accept heartbeats for a short grace period to account for the lag
     // time between the service recording the state change and the agent
     // stopping heartbeats.
@@ -960,7 +972,9 @@ std::string NormalizeKey(absl::string_view orig_key) {
   // Parse all characters
   while (*src) {
     // Skip leading slashes
-    while (*src == '/') src++;
+    while (*src == '/') {
+      src++;
+    }
     // Copy over all non-slash characters
     while (*src && *src != '/') {
       *dst++ = *src++;
@@ -971,7 +985,9 @@ std::string NormalizeKey(absl::string_view orig_key) {
     }
   }
   // If ending with slash, remove the trailing slash
-  if (dst > norm_key.begin() && *(dst - 1) == '/') dst--;
+  if (dst > norm_key.begin() && *(dst - 1) == '/') {
+    dst--;
+  }
   norm_key.resize(dst - norm_key.begin());
   return norm_key;
 }
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index 5ab814869e4b96..fe6c625e91c910 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -461,7 +461,8 @@ absl::Status CoordinationServiceAgent::ReportError(const absl::Status& error) {
       return MakeCoordinationError(absl::FailedPreconditionError(
           "Coordination service agent must be initialized first before "
           "reporting error."));
-    } else if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) {
+    }
+    if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) {
       return MakeCoordinationError(absl::FailedPreconditionError(
           "Coordination service agent is already in error state."));
     }
@@ -837,7 +838,9 @@ absl::Status CoordinationServiceAgent::StopWatchKey(absl::string_view key) {
 void CoordinationServiceAgent::SetError(const absl::Status& error) {
   assert(!error.ok());
   absl::MutexLock l(state_mu_);
-  if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) return;
+  if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) {
+    return;
+  }
   absl::Status trimmed_error = TrimCoordinationErrorMessage(error);
 
   state_ = CoordinatedTaskState::TASKSTATE_ERROR;
@@ -1042,7 +1045,9 @@ absl::Status CoordinationServiceAgent::ValidateRunningAgent(
           "Agent must be in CONNECTED state. It is currently UNINITIALIZED."));
 
     case CoordinatedTaskState::TASKSTATE_DISCONNECTED:
-      if (allow_disconnected) return absl::OkStatus();
+      if (allow_disconnected) {
+        return absl::OkStatus();
+      }
       return MakeCoordinationError(absl::FailedPreconditionError(
           "Agent must be in CONNECTED state. It is currently DISCONNECTED."));
 

From ced4314ee641572aa3c16ba963ba03ec3eb2c20f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 17:37:16 -0800
Subject: [PATCH 358/753] Internal visibility only change.

PiperOrigin-RevId: 845501210
---
 tensorflow/core/framework/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD
index b6d0a1bee44ad3..8a10b3d5557f42 100644
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@@ -739,6 +739,7 @@ cc_library(
         "//waymo/ml/compiler/frontend/kernels:__pkg__",
         "//waymo/ml/compiler/runtime/alpine/core:__pkg__",
         "//waymo/ml/woodshed/ops:__pkg__",
+        "//waymo/perception/training/point_lens/unified_dataset/python/tensorflow:__pkg__",
     ],
     deps = [
         "//tensorflow/core/lib/core:refcount",

From dc6314fc0eee658b39f74e85418c3a8dc762315a Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Tue, 16 Dec 2025 17:42:43 -0800
Subject: [PATCH 359/753] [ReplicaGroupV3][Refactor][2/n] Store
 `CollectiveDeviceListBase` in HLOCollectiveInstruction proto. Add fields for
 different versions of collective device lists and assert a oneof constraint
 that only one field can be set.

PiperOrigin-RevId: 845502926
---
 .../xla/xla/hlo/ir/hlo_instructions.cc        | 24 +++++++++++++++----
 third_party/xla/xla/hlo/ir/hlo_instructions.h | 15 +++++++++---
 third_party/xla/xla/service/hlo.proto         |  8 +++++--
 3 files changed, 38 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.cc b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
index 526165754c9794..c8a09b07ce03c1 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
@@ -921,7 +921,7 @@ HloCollectiveInstruction::HloCollectiveInstruction(
     const CollectiveDeviceList& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id)
     : HloChannelInstruction(opcode, shape, channel_id),
-      device_list_(device_list),
+      device_list_(std::make_shared<CollectiveDeviceList>(device_list)),
       constrain_layout_(constrain_layout) {
   for (auto operand : operands) {
     AppendOperand(operand);
@@ -930,7 +930,23 @@ HloCollectiveInstruction::HloCollectiveInstruction(
 
 HloInstructionProto HloCollectiveInstruction::ToProto() const {
   HloInstructionProto proto = HloChannelInstruction::ToProto();
-  *proto.mutable_collective_device_list() = device_list_.ToProto();
+
+  if (const CollectiveDeviceList* device_list_v1 =
+          dynamic_cast<const CollectiveDeviceList*>(device_list_.get())) {
+    *proto.mutable_collective_device_list() = device_list_v1->ToProto();
+  } else if (const IotaReplicaGroupList* device_list_v2 =
+                 dynamic_cast<const IotaReplicaGroupList*>(
+                     device_list_.get())) {
+    *proto.mutable_iota_collective_device_list() = device_list_v2->ToProto();
+  } else if (const MeshAxesReplicaGroupList* device_list_v3 =
+                 dynamic_cast<const MeshAxesReplicaGroupList*>(
+                     device_list_.get())) {
+    *proto.mutable_mesh_axes_replica_group_list() = device_list_v3->ToProto();
+  } else {
+    LOG(FATAL) << "Unknown or missing CollectiveDeviceList type in "
+                  "HloCollectiveInstruction";
+  }
+
   proto.set_constrain_layout(constrain_layout_);
   return proto;
 }
@@ -940,10 +956,10 @@ void HloCollectiveInstruction::PrintExtraAttributesImpl(
   HloChannelInstruction::PrintExtraAttributesImpl(printer, options);
   printer.Next([this, &options](Printer* printer) {
     VLOG(4) << name() << " replica_groups="
-            << device_list_.ToString(options.print_full_replica_group_list());
+            << device_list_->ToString(options.print_full_replica_group_list());
 
     printer->Append("replica_groups=");
-    device_list_.Print(printer, options.print_full_replica_group_list());
+    device_list_->Print(printer, options.print_full_replica_group_list());
   });
   if (constrain_layout_) {
     printer.Next(
diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.h b/third_party/xla/xla/hlo/ir/hlo_instructions.h
index b73f54e0b830f9..5b87d2636778be 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.h
@@ -648,10 +648,19 @@ class HloRecvDoneInstruction : public HloSendRecvInstruction {
 class HloCollectiveInstruction : public HloChannelInstruction {
  public:
   const std::vector<ReplicaGroup>& replica_groups() const {
-    return device_list_.replica_groups();
+    return device_list_->replica_groups();
   }
 
-  const CollectiveDeviceList& device_list() const { return device_list_; }
+  const CollectiveDeviceList& device_list() const {
+    const CollectiveDeviceList* device_list_v1 =
+        dynamic_cast<const CollectiveDeviceList*>(device_list_.get());
+    // TODO(b/468442352): After XLA codebase is genericized to utilize
+    // CollectiveDeviceListBase instead of CollectiveDeviceList remove this
+    // check and return CollectiveDeviceListBase instead.
+    CHECK(device_list_v1 != nullptr)
+        << "Failed to cast device_list_ to CollectiveDeviceList";
+    return *device_list_v1;
+  }
 
   // Returns true if the layout of the AllReduce is enforced by XLA client (as
   // the layout set in the shape). The only reason for the client to set the
@@ -686,7 +695,7 @@ class HloCollectiveInstruction : public HloChannelInstruction {
       absl::FunctionRef<bool(const HloComputation*, const HloComputation*)>
           eq_computations) const override;
 
-  CollectiveDeviceList device_list_;
+  std::shared_ptr<CollectiveDeviceListBase> device_list_;
   bool constrain_layout_;
 };
 
diff --git a/third_party/xla/xla/service/hlo.proto b/third_party/xla/xla/service/hlo.proto
index 6ea3da98fd7cba..01610575d714cf 100644
--- a/third_party/xla/xla/service/hlo.proto
+++ b/third_party/xla/xla/service/hlo.proto
@@ -113,7 +113,7 @@ enum CustomCallApiVersion {
 }
 
 // Serialization of HloInstruction.
-// Next ID: 92
+// Next ID: 94
 message HloInstructionProto {
   reserved 10;
   reserved "parameter_name";
@@ -385,7 +385,11 @@ message HloInstructionProto {
   reserved 86;
 
   // Represents the list of devices that participate in a collective operation.
-  xla.CollectiveDeviceListProto collective_device_list = 87;
+  oneof replica_group_list {
+    xla.CollectiveDeviceListProto collective_device_list = 87;
+    xla.IotaReplicaGroupListProto iota_collective_device_list = 92;
+    xla.MeshAxesReplicaGroupListProto mesh_axes_replica_group_list = 93;
+  }
 
   // For HLO value tracking.
   xla.OriginalValueProto original_value = 88;

From 4f5e199a87e11ef4bb44992a3ccb22ea7e9fe983 Mon Sep 17 00:00:00 2001
From: Grant Jensen <grantjensen@google.com>
Date: Tue, 16 Dec 2025 17:48:33 -0800
Subject: [PATCH 360/753] Fix cmakes build of tflite when
 TFLITE_WITH_XNNPACK=OFF and tflite is a dependency

PiperOrigin-RevId: 845504524
---
 tensorflow/lite/CMakeLists.txt                 | 18 +++++++++++-------
 .../cmake/modules/ml_dtypes/CMakeLists.txt     |  5 +++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt
index 1249b5c01e321f..7e877cd92f1cd1 100644
--- a/tensorflow/lite/CMakeLists.txt
+++ b/tensorflow/lite/CMakeLists.txt
@@ -749,9 +749,11 @@ add_library(tensorflow-lite
 set(_ALL_TFLITE_HDRS ${_ALL_TFLITE_SRCS})
 list(FILTER _ALL_TFLITE_HDRS INCLUDE REGEX ".*\\.h$")
 target_include_directories(tensorflow-lite
-  PUBLIC $<BUILD_INTERFACE:${TENSORFLOW_SOURCE_DIR}> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
-  PUBLIC ${CMAKE_CURRENT_BINARY_DIR}
-  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/..
+  PUBLIC
+    $<BUILD_INTERFACE:${TENSORFLOW_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/..>
 )
 target_link_libraries(tensorflow-lite
   PUBLIC
@@ -879,7 +881,9 @@ target_compile_options(_pywrap_tensorflow_interpreter_wrapper
   PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS}
 )
 
-target_compile_options(xnnpack-delegate
-  PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS}
-  PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS}
-)
\ No newline at end of file
+if(TFLITE_ENABLE_XNNPACK)
+  target_compile_options(xnnpack-delegate
+    PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS}
+    PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS}
+  )
+endif()
\ No newline at end of file
diff --git a/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt b/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt
index 91e893ee377048..8be897f54d728f 100644
--- a/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt
+++ b/tensorflow/lite/tools/cmake/modules/ml_dtypes/CMakeLists.txt
@@ -24,8 +24,9 @@ endif()
 
 add_library(ml_dtypes INTERFACE)
 target_include_directories(ml_dtypes INTERFACE
-  "${ML_DTYPES_SOURCE_DIR}"
-  "${ML_DTYPES_SOURCE_DIR}/ml_dtypes")
+  "$<BUILD_INTERFACE:${ML_DTYPES_SOURCE_DIR}>"
+  "$<BUILD_INTERFACE:${ML_DTYPES_SOURCE_DIR}/ml_dtypes>"
+  "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
 file(GLOB ML_DTYPES_PUBLIC_HEADERS
   ${ML_DTYPES_SOURCE_DIR}/ml_dtypes/include/*.h)
 set_target_properties(ml_dtypes PROPERTIES

From ee3e4e37bea35192119cc7c72633028024e5b7e1 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Tue, 16 Dec 2025 19:50:38 -0800
Subject: [PATCH 361/753] Make c-api topology and PjRtClient versions produce
 identical platform_verison strings to improve cache reuse between aot and
 actual runtime uses.

Remove runtime_type as a cache key (compilation shouldn't depend on the runtime).

PiperOrigin-RevId: 845541086
---
 third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc      | 11 +++++++++++
 third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h       |  2 ++
 .../xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc    | 10 +++-------
 .../xla/xla/pjrt/c_api_client/pjrt_c_api_client.h     |  1 +
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc
index 9f072392bffadd..00b653824a1654 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.cc
@@ -707,6 +707,17 @@ absl::string_view GetPlatformVersion(PJRT_Client* client, const PJRT_Api* api) {
   return platform_version;
 }
 
+absl::string_view GetPlatformVersion(PJRT_TopologyDescription* c_topology,
+                                     const PJRT_Api* api) {
+  PJRT_TopologyDescription_PlatformVersion_Args args;
+  args.struct_size = PJRT_TopologyDescription_PlatformVersion_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.topology = c_topology;
+  pjrt::LogFatalIfPjrtError(
+      api->PJRT_TopologyDescription_PlatformVersion(&args), api);
+  return absl::string_view(args.platform_version, args.platform_version_size);
+}
+
 absl::string_view GetPlatformName(PJRT_Client* client, const PJRT_Api* api) {
   PJRT_Client_PlatformName_Args args;
   args.client = client;
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h
index a585e00719694b..5c0ae4a1ac2d7b 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_helpers.h
@@ -199,6 +199,8 @@ absl::Status ActualStructSizeIsGreaterOrEqual(absl::string_view struct_name,
                                               size_t actual_size);
 
 absl::string_view GetPlatformVersion(PJRT_Client* client, const PJRT_Api* api);
+absl::string_view GetPlatformVersion(PJRT_TopologyDescription* c_topology,
+                                     const PJRT_Api* api);
 absl::string_view GetPlatformName(PJRT_Client* client, const PJRT_Api* api);
 
 absl::StatusOr<PJRT_TopologyDescription*> GetTopologyDescription(
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 8fc556ae3c7150..b0f25add8a9ff3 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -3352,6 +3352,8 @@ PjRtCApiTopologyDescription::PjRtCApiTopologyDescription(
       tpu_topology_extension_(pjrt::FindExtension<PJRT_TpuTopology_Extension>(
           c_api, PJRT_Extension_Type::PJRT_Extension_Type_TpuTopology)),
       c_topology_(c_topology),
+      platform_version_(absl::StrCat(
+          "PJRT C API\n", ::pjrt::GetPlatformVersion(c_topology, c_api))),
       platform_name_(::pjrt::PlatformName(c_api, c_topology)),
       platform_id_(tsl::Fingerprint64(platform_name_)) {
   if (owned) {
@@ -3363,13 +3365,7 @@ PjRtCApiTopologyDescription::PjRtCApiTopologyDescription(
 }
 
 absl::string_view PjRtCApiTopologyDescription::platform_version() const {
-  PJRT_TopologyDescription_PlatformVersion_Args args;
-  args.struct_size = PJRT_TopologyDescription_PlatformVersion_Args_STRUCT_SIZE;
-  args.extension_start = nullptr;
-  args.topology = c_topology_;
-  pjrt::LogFatalIfPjrtError(
-      c_api_->PJRT_TopologyDescription_PlatformVersion(&args), c_api_);
-  return absl::string_view(args.platform_version, args.platform_version_size);
+  return platform_version_;
 }
 
 std::vector<std::unique_ptr<const PjRtDeviceDescription>>
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
index 42291bc32e0e7c..24e781396cefbe 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
@@ -313,6 +313,7 @@ class PjRtCApiTopologyDescription : public PjRtTopologyDescription {
   // Device specific attributes with corresponding values.
   absl::flat_hash_map<std::string, xla::PjRtDeviceAttribute> attributes_;
 
+  const std::string platform_version_;
   const std::string platform_name_;
   const PjRtPlatformId platform_id_;
 

From 795c2f0907ed777642c13719e0da733a3b07e3a6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 19:53:58 -0800
Subject: [PATCH 362/753] Automated Code Change

PiperOrigin-RevId: 845541854
---
 third_party/xla/xla/pjrt/pjrt_client_test.cc | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/pjrt/pjrt_client_test.cc b/third_party/xla/xla/pjrt/pjrt_client_test.cc
index 4fddbe0dc5d331..c3cd93ab5e8faa 100644
--- a/third_party/xla/xla/pjrt/pjrt_client_test.cc
+++ b/third_party/xla/xla/pjrt/pjrt_client_test.cc
@@ -132,7 +132,7 @@ TEST_P(PjRtClientTest, Execute) {
                           executable->Execute({{buffer.get()}}, options));
   ASSERT_EQ(results.size(), 1);
   ASSERT_EQ(results[0].size(), 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteral().Await());
 
   std::vector<int32_t> expected(4, 1);
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -161,7 +161,7 @@ TEST_P(PjRtClientTest, ExecuteWithImmutableUntilTransferCompletes) {
                           executable->Execute({{buffer.get()}}, options));
   ASSERT_EQ(results.size(), 1);
   ASSERT_EQ(results[0].size(), 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteral().Await());
 
   std::vector<int32_t> expected(4, 1);
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -203,7 +203,7 @@ TEST_P(PjRtClientTest, ExecuteWithTupleZeroCopy) {
 
   ASSERT_EQ(results.size(), 1);
   ASSERT_EQ(results[0].size(), 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteral().Await());
 
   std::vector<int32_t> expected(4, 1);
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -232,7 +232,7 @@ TEST_P(PjRtClientTest, ExecuteWithDonation) {
                           executable->Execute({{buffer.get()}}, options));
   ASSERT_EQ(results.size(), 1);
   ASSERT_EQ(results[0].size(), 1);
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, results[0][0]->ToLiteral().Await());
 
   std::vector<int32_t> expected(4, 1);
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -312,7 +312,7 @@ TEST_P(PjRtClientTest, ExecuteWithConcurrentUsage) {
 
   std::vector<int32_t> expected(4, 1);
   for (const auto& result : results) {
-    TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+    TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
     EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
                                        *literal));
   }
@@ -357,7 +357,7 @@ TEST_P(PjRtClientTest, ExecuteWithConcurrentUsageAndDonation) {
         auto& results = *results_or;
         CHECK_EQ(results.size(), 1);
         CHECK_EQ(results[0].size(), 1);
-        auto literal_or = results[0][0]->ToLiteralSync();
+        auto literal_or = results[0][0]->ToLiteral().Await();
         if (literal_or.ok()) {
           CHECK(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
                                        *literal_or.value()));
@@ -380,7 +380,7 @@ TEST_P(PjRtClientTest, ExecuteWithConcurrentUsageAndDonation) {
 
   blocking_counter.Wait();
 
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
                                      *literal));
 }
@@ -409,7 +409,7 @@ TEST(PjRtClientTest, CopyToDevice) {
   TF_ASSERT_OK_AND_ASSIGN(auto result, buffer->CopyToMemorySpace(
                                            *device_1->default_memory_space()));
 
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
 
   std::vector<int32_t> expected(4, 0);
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -448,7 +448,7 @@ TEST(PjRtClientTest, CopyToDeviceAsync) {
 
   for (const auto& result : results) {
     ASSERT_TRUE(result);
-    TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+    TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
 
     std::vector<int32_t> expected(4, 0);
     EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -495,7 +495,7 @@ TEST(PjRtClientTest, CopyToDeviceAsyncExternalCpuOnly) {
 
   for (const auto& result : results) {
     ASSERT_TRUE(result);
-    TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+    TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
 
     std::vector<int32_t> expected(4, 0);
     EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),

From e522862e3324b136c714148ecb42e05640fea5c6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 20:03:52 -0800
Subject: [PATCH 363/753] Automated Code Change

PiperOrigin-RevId: 845545983
---
 third_party/xla/xla/hlo/transforms/expanders/BUILD               | 1 +
 .../xla/hlo/transforms/expanders/permutation_sort_expander.cc    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/hlo/transforms/expanders/BUILD b/third_party/xla/xla/hlo/transforms/expanders/BUILD
index f175a711fa1e57..315802b86326c1 100644
--- a/third_party/xla/xla/hlo/transforms/expanders/BUILD
+++ b/third_party/xla/xla/hlo/transforms/expanders/BUILD
@@ -381,6 +381,7 @@ cc_library(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:util",
+        "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
diff --git a/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc b/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc
index cfb796aef6c898..f2d015bc3f98c8 100644
--- a/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc
+++ b/third_party/xla/xla/hlo/transforms/expanders/permutation_sort_expander.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "xla/literal_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace {

From 6d857f3af8ae039137c218364ef4b74fc64ac25d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 20:23:27 -0800
Subject: [PATCH 364/753] Automated Code Change

PiperOrigin-RevId: 845553728
---
 tensorflow/cc/training/queue_runner.cc |  6 +++---
 tensorflow/cc/training/queue_runner.h  | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/cc/training/queue_runner.cc b/tensorflow/cc/training/queue_runner.cc
index 1d23f9d87e2d7d..87b696edc39681 100644
--- a/tensorflow/cc/training/queue_runner.cc
+++ b/tensorflow/cc/training/queue_runner.cc
@@ -118,7 +118,7 @@ absl::Status QueueRunner::StartAndCollectCostGraph(
 
 absl::Status QueueRunner::Start(Session* sess, int wait_for) {
   counter_ = std::make_unique<BlockingCounter>(runs_);
-  for (const string& enqueue_op : enqueue_op_names_) {
+  for (const std::string& enqueue_op : enqueue_op_names_) {
     thread_pool_->Schedule(
         std::bind(&QueueRunner::Run, this, sess, enqueue_op));
   }
@@ -182,7 +182,7 @@ void QueueRunner::UpdateStatus(const absl::Status& status) {
   }
 }
 
-void QueueRunner::Run(Session* sess, const string& enqueue_op) {
+void QueueRunner::Run(Session* sess, const std::string& enqueue_op) {
   bool first_iteration = true;
   absl::Status status;
   while (status.ok()) {
@@ -245,7 +245,7 @@ void QueueRunner::SetRunArgumentsAndCostGraph(const RunOptions& run_options) {
   run_options_ = run_options;
 }
 
-absl::Status QueueRunner::RealRun(Session* sess, const string& op,
+absl::Status QueueRunner::RealRun(Session* sess, const std::string& op,
                                   bool update_costs) {
   absl::Status s;
   if (update_costs && cg_mu_) {
diff --git a/tensorflow/cc/training/queue_runner.h b/tensorflow/cc/training/queue_runner.h
index d5d6ca10a56821..ffba8987c6d518 100644
--- a/tensorflow/cc/training/queue_runner.h
+++ b/tensorflow/cc/training/queue_runner.h
@@ -97,7 +97,7 @@ class QueueRunner : public RunnerInterface {
   absl::Status Init(const QueueRunnerDef& queue_runner_def);
 
   // The Run function for each thread.
-  void Run(Session* sess, const string& enqueue_op);
+  void Run(Session* sess, const std::string& enqueue_op);
 
   // Updates the internal status; it only keeps OK or the first unexpected error
   // status.
@@ -112,12 +112,12 @@ class QueueRunner : public RunnerInterface {
 
   void SetRunArgumentsAndCostGraph(const RunOptions& run_options);
 
-  absl::Status RealRun(Session* sess, const string& op, bool update_costs);
+  absl::Status RealRun(Session* sess, const std::string& op, bool update_costs);
 
-  string queue_name_;
-  std::vector<string> enqueue_op_names_;
-  string close_op_name_;
-  string cancel_op_name_;
+  std::string queue_name_;
+  std::vector<std::string> enqueue_op_names_;
+  std::string close_op_name_;
+  std::string cancel_op_name_;
   // code::Code casted to int to avoid a hash function.
   std::unordered_set<int> queue_closed_exception_types_;
 

From e9e8ae875c3ed4b23c590bc2d7d6aedf7893fbea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 22:38:19 -0800
Subject: [PATCH 365/753] Automated Code Change

PiperOrigin-RevId: 845597209
---
 .../xla/xla/service/gpu/tests/gpu_too_many_blocks_test.cc       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/tests/gpu_too_many_blocks_test.cc b/third_party/xla/xla/service/gpu/tests/gpu_too_many_blocks_test.cc
index cc33051cdb219a..5557feb257ceb3 100644
--- a/third_party/xla/xla/service/gpu/tests/gpu_too_many_blocks_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/gpu_too_many_blocks_test.cc
@@ -52,7 +52,7 @@ ENTRY primitive_computation_mul.8 {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> optimized_module,
                           GetOptimizedModule(hlo_text));
 
-  se::StreamExecutorMemoryAllocator allocator(
+  stream_executor::StreamExecutorAddressAllocator allocator(
       backend().default_stream_executor());
   absl::StatusOr<std::unique_ptr<Executable>> failed_executable =
       backend().compiler()->RunBackend(std::move(optimized_module),

From 3ac2cb42d80f2bd4f675460e32fd1977020ff53f Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Tue, 16 Dec 2025 22:41:30 -0800
Subject: [PATCH 366/753] [xla:cpu] hlo_benchmark_runner: add
 RunHloBenchmarkOnce

This will allow users to unit-test their benchmarks.

PiperOrigin-RevId: 845598230
---
 .../xla/xla/backends/cpu/benchmarks/BUILD     |  1 +
 .../cpu/benchmarks/hlo_benchmark_runner.cc    | 33 +++++++++++++++----
 .../cpu/benchmarks/hlo_benchmark_runner.h     |  7 ++++
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/benchmarks/BUILD b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
index c9a377c6ecb010..418a5e3e9027d2 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/BUILD
+++ b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
@@ -70,6 +70,7 @@ cc_library(
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test_benchmark",
+        "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
diff --git a/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc b/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc
index 0aeeb6d69a12bc..97708cacc365c5 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc
+++ b/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/nullability.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/status/status.h"
@@ -152,10 +153,10 @@ absl::Status RunHloBenchmark(benchmark::State& state,
 }
 
 // NOLINTNEXTLINE(readability-function-cognitive-complexity)
-absl::Status RunHloBenchmark(benchmark::State& state,
-                             std::unique_ptr<HloModule> module,
-                             absl::Span<const Literal* const> args,
-                             const HloBenchmarkOptions& benchmark_options) {
+absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
+                                 std::unique_ptr<HloModule> module,
+                                 absl::Span<const Literal* const> args,
+                                 const HloBenchmarkOptions& benchmark_options) {
   xla::CpuClientOptions client_options;
   TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtClient> client,
                       xla::GetXlaPjrtCpuClient(client_options));
@@ -290,17 +291,35 @@ absl::Status RunHloBenchmark(benchmark::State& state,
     return absl::OkStatus();
   };
 
-  // Warm up executable.
+  // Run once. For a regular benchmark this will serve as a warm-up;
+  // for RunHloBenchmarkOnce this will be the only run.
   TF_RETURN_IF_ERROR(run_benchmark_once());
 
   // Benchmark executable.
-  for (auto _ : state) {
-    TF_RETURN_IF_ERROR(run_benchmark_once());
+  if (state) {
+    for (auto _ : *state) {
+      TF_RETURN_IF_ERROR(run_benchmark_once());
+    }
   }
 
   return absl::OkStatus();
 }
 
+absl::Status RunHloBenchmark(benchmark::State& state,
+                             std::unique_ptr<HloModule> module,
+                             absl::Span<const Literal* const> args,
+                             const HloBenchmarkOptions& benchmark_options) {
+  return RunHloBenchmarkImpl(&state, std::move(module), args,
+                             benchmark_options);
+}
+
+absl::Status RunHloBenchmarkOnce(std::unique_ptr<HloModule> module,
+                                 absl::Span<const Literal* const> args,
+                                 const HloBenchmarkOptions& benchmark_options) {
+  return RunHloBenchmarkImpl(nullptr, std::move(module), args,
+                             benchmark_options);
+}
+
 absl::Status CompileHloBenchmark(benchmark::State& state,
                                  absl::string_view hlo_module,
                                  StrToStrMapping replacements,
diff --git a/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.h b/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.h
index 87fac7b2cbb1cb..562853d29e9c2b 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.h
+++ b/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.h
@@ -70,6 +70,13 @@ absl::Status RunHloBenchmark(benchmark::State& state,
                              absl::Span<const Literal* const> args,
                              const HloBenchmarkOptions& benchmark_options = {});
 
+// Same as above, except that it runs the module exactly once and does not
+// have a benchmark::State parameter, which makes it suitable for unit tests.
+absl::Status RunHloBenchmarkOnce(
+    std::unique_ptr<HloModule> hlo_module,
+    absl::Span<const Literal* const> args,
+    const HloBenchmarkOptions& benchmark_options = {});
+
 // Benchmarks the given HLO's compilation time.
 //
 // Takes the same options as RunHloBenchmark, except no arguments since the

From 3ed66e80c401cdfb40b229644192880048aad9ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 22:42:08 -0800
Subject: [PATCH 367/753] Automated Code Change

PiperOrigin-RevId: 845598430
---
 .../tpu/ops/sparse_core_preprocess_ops.cc     | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc
index 4985cea9558993..ed7ff78c77da57 100644
--- a/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/ops/sparse_core_preprocess_ops.cc
@@ -73,14 +73,14 @@ REGISTER_OP("GetMinibatchesInCsrWithPhysicalReplica")
       for (int i = 0; i < c->num_inputs(); ++i) {
         TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &rank));
       }
-      int32 max_minibatches_per_sc;
+      int32_t max_minibatches_per_sc;
       TF_RETURN_IF_ERROR(
           c->GetAttr("max_minibatches_per_sc", &max_minibatches_per_sc));
-      int32 num_replica;
+      int32_t num_replica;
       TF_RETURN_IF_ERROR(c->GetAttr("num_replica", &num_replica));
-      int32 sample_count;
+      int32_t sample_count;
       TF_RETURN_IF_ERROR(c->GetAttr("sample_count", &sample_count));
-      int32 max_ids_per_chip_per_sample;
+      int32_t max_ids_per_chip_per_sample;
       TF_RETURN_IF_ERROR(c->GetAttr("max_ids_per_chip_per_sample",
                                     &max_ids_per_chip_per_sample));
 
@@ -88,7 +88,7 @@ REGISTER_OP("GetMinibatchesInCsrWithPhysicalReplica")
       // will be run as part of the graph generation which might not have the
       // tpu system available.
       const int xla_pad_size = 8;
-      int32 num_sc_per_chip;
+      int32_t num_sc_per_chip;
       TF_RETURN_IF_ERROR(c->GetAttr("num_sc_per_chip", &num_sc_per_chip));
 
       const int num_physical_replica = num_replica * num_sc_per_chip;
@@ -253,22 +253,22 @@ REGISTER_OP("ConvertToSparseCoreCsrWrappedCooTensor")
     .Attr("table_name: string")
     .Attr("allow_id_dropping: bool")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      int32 max_minibatches_per_sc;
+      int32_t max_minibatches_per_sc;
       TF_RETURN_IF_ERROR(
           c->GetAttr("max_minibatches_per_sc", &max_minibatches_per_sc));
-      int32 num_replica;
+      int32_t num_replica;
       TF_RETURN_IF_ERROR(c->GetAttr("num_replica", &num_replica));
-      int32 sample_count_per_sc;
+      int32_t sample_count_per_sc;
       TF_RETURN_IF_ERROR(
           c->GetAttr("sample_count_per_sc", &sample_count_per_sc));
-      int32 max_ids_per_chip_per_sample;
+      int32_t max_ids_per_chip_per_sample;
       TF_RETURN_IF_ERROR(c->GetAttr("max_ids_per_chip_per_sample",
                                     &max_ids_per_chip_per_sample));
       // We can't get this number programmatically since the shape inference
       // will be run as part of the graph generation which might not have the
       // tpu system available.
       const int xla_pad_size = 8;
-      int32 num_sc_per_chip;
+      int32_t num_sc_per_chip;
       TF_RETURN_IF_ERROR(c->GetAttr("num_sc_per_chip", &num_sc_per_chip));
 
       const int num_physical_replica = num_replica * num_sc_per_chip;

From 4d3d330f90e038c94a86d9f0f1ac4d216024825a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 22:42:21 -0800
Subject: [PATCH 368/753] Automated Code Change

PiperOrigin-RevId: 845598510
---
 .../kernels/tpu_reshard_variables_op_util.cc  | 21 ++++++++++++-------
 .../kernels/tpu_reshard_variables_op_util.h   |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc
index f50652f8b5e81c..73214c817eaf04 100644
--- a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc
+++ b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.cc
@@ -99,7 +99,7 @@ absl::Status GetComputationCacheEntry(
 }
 
 // Builds an InputBuffers object that describes the inputs to the computation.
-absl::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceMemory>> BuildInputBuffers(
+absl::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceAddress>> BuildInputBuffers(
     OpKernelContext* context, const std::vector<VariableInfo>& variables,
     const xla::Shape& input_host_shape, xla::Backend* backend,
     int device_ordinal, se::Stream* stream) {
@@ -150,10 +150,11 @@ absl::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceMemory>> BuildInputBuffers(
         validate_shape(variables[i].index(), *variables[i].var()->tensor()));
   }
 
-  se::DeviceMemoryAllocator* const allocator = backend->memory_allocator();
+  stream_executor::DeviceAddressAllocator* const allocator =
+      backend->memory_allocator();
   xla::TransferManager* const transfer_manager = backend->transfer_manager();
 
-  xla::ShapeTree<xla::MaybeOwningDeviceMemory> input_buffers(
+  xla::ShapeTree<xla::MaybeOwningDeviceAddress> input_buffers(
       transfer_manager->HostShapeToDeviceShape(input_host_shape));
 
   // Allocates a buffer for the root tuple.
@@ -165,15 +166,17 @@ absl::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceMemory>> BuildInputBuffers(
   auto set_input_buffers_helper = [&](int arg_index, xla::ShapedBuffer* buffers,
                                       bool owning = false) {
     buffers->buffers().ForEachMutableElement(
-        [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
+        [&](const xla::ShapeIndex& index,
+            stream_executor::DeviceAddressBase* buffer) {
           xla::ShapeIndex in_index = {arg_index};
           for (int64_t j : index) {
             in_index.push_back(j);
           }
           if (owning) {
             *input_buffers.mutable_element(in_index) =
-                se::OwningDeviceMemory(*buffer, device_ordinal, allocator);
-            *buffer = se::DeviceMemoryBase();
+                stream_executor::ScopedDeviceAddress<uint8_t>(
+                    *buffer, device_ordinal, allocator);
+            *buffer = stream_executor::DeviceAddressBase();
           } else {
             *input_buffers.mutable_element(in_index) = *buffer;
           }
@@ -268,7 +271,8 @@ absl::Status UpdateOutputVariables(
   TF_RET_CHECK(result_buffers.on_host_shape().IsTuple());
   TF_RET_CHECK(!xla::ShapeUtil::IsNestedTuple(result_buffers.on_host_shape()));
 
-  se::DeviceMemoryAllocator* const allocator = backend->memory_allocator();
+  stream_executor::DeviceAddressAllocator* const allocator =
+      backend->memory_allocator();
 
   auto output_buffers = result_buffers.release();
   const xla::Shape& output_host_shape = output_buffers.on_host_shape();
@@ -285,7 +289,8 @@ absl::Status UpdateOutputVariables(
       xla::ScopedShapedBuffer shaped_buffer(host_shape, device_shape, allocator,
                                             device_ordinal);
       shaped_buffer.buffers().ForEachMutableElement(
-          [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
+          [&](const xla::ShapeIndex& index,
+              stream_executor::DeviceAddressBase* buffer) {
             xla::ShapeIndex out_index = {i};
             for (int64_t j : index) {
               out_index.push_back(j);
diff --git a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h
index c731cc10ec70ce..ab44f7788fbf50 100644
--- a/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h
+++ b/tensorflow/core/tpu/kernels/tpu_reshard_variables_op_util.h
@@ -42,7 +42,7 @@ absl::Status GetComputationCacheEntry(
     std::unique_ptr<tpu::CompilationCacheEntryRef>* entry,
     tpu::CompilationCacheFetchTarget fetch_target);
 
-absl::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceMemory>> BuildInputBuffers(
+absl::StatusOr<xla::ShapeTree<xla::MaybeOwningDeviceAddress>> BuildInputBuffers(
     OpKernelContext* context, const std::vector<VariableInfo>& variables,
     const xla::Shape& input_host_shape, xla::Backend* backend,
     int device_ordinal, se::Stream* stream);

From 4de7fc71dde2f77425a00947a27fb9d9950a8554 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 22:44:59 -0800
Subject: [PATCH 369/753] Automated Code Change

PiperOrigin-RevId: 845599471
---
 third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc b/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
index f3731f64821230..6f5e41e3391830 100644
--- a/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
+++ b/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
@@ -509,7 +509,8 @@ class NanoArray final : public NanoValue<NanoArray, ifrt::Array> {
   // Allocates an aligned buffer of the given size.
   static absl::StatusOr<OwnedDataPtr> AllocateData(size_t size) {
     OwnedDataPtr owned_data(
-        tsl::port::AlignedMalloc(std::max<size_t>(size, Align()), Align()),
+        tsl::port::AlignedMalloc(std::max<size_t>(size, Align()),
+                                 static_cast<std::align_val_t>(Align())),
         [](void* ptr) { tsl::port::AlignedFree(ptr); });
     if (ABSL_PREDICT_FALSE(owned_data == nullptr)) {
       return Internal("Failed to allocate memory for NanoArray. Errno: %s",

From 3b7afe039e43e0fd6255463a9f65e9301e3244c2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 22:47:12 -0800
Subject: [PATCH 370/753] Automated Code Change

PiperOrigin-RevId: 845600263
---
 .../experimental/microfrontend/ops/audio_microfrontend_op.cc  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
index 8cba5779565223..56692cbcaeecb7 100644
--- a/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
+++ b/tensorflow/lite/experimental/microfrontend/ops/audio_microfrontend_op.cc
@@ -292,8 +292,8 @@ class AudioMicrofrontendOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("AudioMicrofrontend")
                             .Device(tensorflow::DEVICE_CPU)
-                            .TypeConstraint<uint16>("out_type"),
-                        AudioMicrofrontendOp<uint16>);
+                            .TypeConstraint<uint16_t>("out_type"),
+                        AudioMicrofrontendOp<uint16_t>);
 REGISTER_KERNEL_BUILDER(Name("AudioMicrofrontend")
                             .Device(tensorflow::DEVICE_CPU)
                             .TypeConstraint<float>("out_type"),

From 22b9f488a6a43a815152fdda05510b8046147f38 Mon Sep 17 00:00:00 2001
From: Grant Jensen <grantjensen@google.com>
Date: Tue, 16 Dec 2025 22:49:49 -0800
Subject: [PATCH 371/753] Use relative paths instead of absolute paths for
 tflite cmake files, allowing for cmake build of tflite with tflite as a
 dependency to build.

PiperOrigin-RevId: 845601004
---
 tensorflow/lite/profiling/proto/CMakeLists.txt       | 12 ++++++------
 tensorflow/lite/tools/benchmark/proto/CMakeLists.txt |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/lite/profiling/proto/CMakeLists.txt b/tensorflow/lite/profiling/proto/CMakeLists.txt
index 0bfa81a41476f3..5738c992fc2839 100644
--- a/tensorflow/lite/profiling/proto/CMakeLists.txt
+++ b/tensorflow/lite/profiling/proto/CMakeLists.txt
@@ -17,8 +17,8 @@ find_package(Protobuf REQUIRED)
 add_library(profiling_info_proto profiling_info.proto)
 
 list(APPEND profiling_info_generated_files
-    ${CMAKE_BINARY_DIR}/tflite/profiling/proto/profiling_info.pb.cc
-    ${CMAKE_BINARY_DIR}/tflite/profiling/proto/profiling_info.pb.h)
+    ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/profiling_info.pb.cc
+    ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/profiling_info.pb.h)
 
 # Generate profiling_info.pb.cc and profiling_info.pb.h from
 # profiling_info.proto using protoc. Once the protobuf package version is
@@ -26,7 +26,7 @@ list(APPEND profiling_info_generated_files
 add_custom_command(
     OUTPUT ${profiling_info_generated_files}
     COMMAND ${Protobuf_PROTOC_EXECUTABLE}
-    ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR}/../../.. tflite/profiling/proto/profiling_info.proto
+    ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${TENSORFLOW_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/profiling_info.proto
     DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/profiling_info.proto
 )
 
@@ -37,8 +37,8 @@ target_include_directories(profiling_info_proto PUBLIC ${CMAKE_BINARY_DIR})
 
 add_library(model_runtime_info_proto model_runtime_info.proto)
 list(APPEND model_runtime_info_generated_files
-    ${CMAKE_BINARY_DIR}/tflite/profiling/proto/model_runtime_info.pb.cc
-    ${CMAKE_BINARY_DIR}/tflite/profiling/proto/model_runtime_info.pb.h
+    ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/model_runtime_info.pb.cc
+    ${CMAKE_BINARY_DIR}/tensorflow/lite/profiling/proto/model_runtime_info.pb.h
 )
 
 # Generate model_runtime_info.pb.cc and model_runtime_info.pb.h from
@@ -47,7 +47,7 @@ list(APPEND model_runtime_info_generated_files
 add_custom_command(
     OUTPUT ${model_runtime_info_generated_files}
     COMMAND ${Protobuf_PROTOC_EXECUTABLE}
-    ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR}/../../.. tflite/profiling/proto/model_runtime_info.proto
+    ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${TENSORFLOW_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/model_runtime_info.proto
     DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/model_runtime_info.proto ${profiling_info_generated_files}
 )
 
diff --git a/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt b/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt
index 12a7e577bd3277..6a39f06e03c32b 100644
--- a/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt
+++ b/tensorflow/lite/tools/benchmark/proto/CMakeLists.txt
@@ -17,8 +17,8 @@ find_package(Protobuf REQUIRED)
 add_library(benchmark_result_proto benchmark_result.proto)
 
 list(APPEND benchmark_result_generated_files
-    ${CMAKE_BINARY_DIR}/tflite/tools/benchmark/proto/benchmark_result.pb.cc
-    ${CMAKE_BINARY_DIR}/tflite/tools/benchmark/proto/benchmark_result.pb.h)
+    ${CMAKE_BINARY_DIR}/tensorflow/lite/tools/benchmark/proto/benchmark_result.pb.cc
+    ${CMAKE_BINARY_DIR}/tensorflow/lite/tools/benchmark/proto/benchmark_result.pb.h)
 
 # Generate benchmark_result.pb.cc and benchmark_result.pb.h from
 # benchmark_result.proto using protoc. Once the protobuf package version is
@@ -26,7 +26,7 @@ list(APPEND benchmark_result_generated_files
 add_custom_command(
     OUTPUT ${benchmark_result_generated_files}
     COMMAND ${Protobuf_PROTOC_EXECUTABLE}
-    ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${CMAKE_CURRENT_SOURCE_DIR}/../../../.. tflite/tools/benchmark/proto/benchmark_result.proto
+    ARGS --cpp_out=${CMAKE_BINARY_DIR} --proto_path=${TENSORFLOW_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_result.proto
     DEPENDS ${Protobuf_PROTOC_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/benchmark_result.proto
 )
 

From b73ff25f8ce5de512391e07858bd982e5a68d9b6 Mon Sep 17 00:00:00 2001
From: Shaogang Wang <shawnw@nvidia.com>
Date: Tue, 16 Dec 2025 22:49:54 -0800
Subject: [PATCH 372/753] PR #35132: [XLA:GPU] Update HLO cublas workspace size
 after autotuner select the algorithm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35132

📝 Summary of Changes

This PR introduces a pass that updates the workspace size for cuBLAS/cuBLASLt GEMM operations after autotuning has selected a specific algorithm. The GemmRewriter pass conservatively allocates workspace before autotuning. After autotuning,we know the exact algorithm selected and can query its actual workspace requirement, potentially reducing memory usage.

🎯 Justification
Potentially reducing memory usage.

🚀 Kind of Contribution
Please remove what does not apply: ⚡️ Performance Improvement,

🧪 Unit Tests:
Existing gemm tests should cover the workspace size config.

Copybara import of the project:

--
a6ed2653e758a2a57e9bf2ce994549c5bc3e72d3 by Shawn Wang <shawnw@nvidia.com>:

Update cublas workspace size with the exact size extracted from algorithm

--
d67a48ae705069e68c854f403abc7f1c1a07ef47 by Shawn Wang <shawnw@nvidia.com>:

fix comments

--
613e0909ff390c5c1962345fc5f36f174e45393f by Shawn Wang <shawnw@nvidia.com>:

add unittest

Merging this change closes #35132

PiperOrigin-RevId: 845601031
---
 third_party/xla/xla/autotuning.proto          |   1 +
 .../xla/xla/backends/gpu/autotuner/cublas.cc  |   2 +
 .../xla/backends/gpu/autotuner/cublaslt.cc    |   3 +
 .../gpu/runtime/command_buffer_thunk_test.cc  |   5 +-
 .../gpu/runtime/gpublas_lt_matmul_thunk.cc    |  23 +-
 .../gpu/runtime/gpublas_lt_matmul_thunk.h     |   2 +
 .../runtime/gpublas_lt_matmul_thunk_test.cc   |   6 +-
 .../xla/xla/backends/gpu/runtime/thunk.proto  |   1 +
 third_party/xla/xla/service/gpu/BUILD         |   2 +
 .../xla/xla/service/gpu/backend_configs.proto |   3 +
 .../xla/xla/service/gpu/nvptx_compiler.cc     |   6 +
 .../xla/xla/service/gpu/thunk_emitter.cc      |   8 +-
 .../xla/xla/service/gpu/transforms/BUILD      |  38 +++
 .../gpu/transforms/gemm_workspace_rewriter.cc | 240 ++++++++++++++++++
 .../gpu/transforms/gemm_workspace_rewriter.h  |  55 ++++
 .../gemm_workspace_rewriter_test.cc           |  69 +++++
 16 files changed, 446 insertions(+), 18 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.cc
 create mode 100644 third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.h
 create mode 100644 third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter_test.cc

diff --git a/third_party/xla/xla/autotuning.proto b/third_party/xla/xla/autotuning.proto
index b3ded36658852c..ceac0079baaaa3 100644
--- a/third_party/xla/xla/autotuning.proto
+++ b/third_party/xla/xla/autotuning.proto
@@ -64,6 +64,7 @@ message AutotuneResult {
 
   message GemmKey {
     int64 algorithm = 1;
+    int64 autotune_workspace_size = 2;
   }
 
   // Legacy and unused in new data; superseded by AlgorithmProto.
diff --git a/third_party/xla/xla/backends/gpu/autotuner/cublas.cc b/third_party/xla/xla/backends/gpu/autotuner/cublas.cc
index 425343686beeaf..4f80eff3317db6 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/cublas.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/cublas.cc
@@ -158,6 +158,8 @@ absl::Status CublasBackend::ApplyConfig(HloInstruction& instr,
                       instr.backend_config<GpuBackendConfig>());
   GemmBackendConfig& backend_config = *gpu_config.mutable_gemm_backend_config();
   backend_config.set_selected_algorithm(gemm_key.algorithm());
+  backend_config.set_autotune_workspace_size(
+      gemm_key.autotune_workspace_size());
   TF_RETURN_IF_ERROR(instr.set_backend_config(std::move(gpu_config)));
   return absl::OkStatus();
 }
diff --git a/third_party/xla/xla/backends/gpu/autotuner/cublaslt.cc b/third_party/xla/xla/backends/gpu/autotuner/cublaslt.cc
index 54c5b0e50a7bd6..5235c0646073f5 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/cublaslt.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/cublaslt.cc
@@ -124,6 +124,7 @@ CublasLtBackend::GetSupportedConfigs(const HloInstruction& instr) {
   for (int i = 0; i < num_algorithms; ++i) {
     CublasLtBackendConfig gemm_key;
     gemm_key.set_algorithm(i);
+    gemm_key.set_autotune_workspace_size(workspace_size);
     auto any = std::make_unique<google::protobuf::Any>();
     any->PackFrom(gemm_key);
     configs.push_back(std::move(any));
@@ -157,6 +158,8 @@ absl::Status CublasLtBackend::ApplyConfig(HloInstruction& instr,
                       instr.backend_config<GpuBackendConfig>());
   GemmBackendConfig& backend_config = *gpu_config.mutable_gemm_backend_config();
   backend_config.set_selected_algorithm(gemm_key.algorithm());
+  backend_config.set_autotune_workspace_size(
+      gemm_key.autotune_workspace_size());
   TF_RETURN_IF_ERROR(instr.set_backend_config(std::move(gpu_config)));
   return absl::OkStatus();
 }
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
index b67163dc794508..35e1ab249d4489 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
@@ -1132,8 +1132,9 @@ TEST(CommandBufferThunkTest, CublasLtCmd) {
   CommandBufferCmdSequence commands;
   commands.Emplace<CublasLtCmd>(CublasLtMatmulThunk(
       Thunk::ThunkInfo(), /*canonical_hlo=*/"", config.value(),
-      se::gpu::BlasLt::Epilogue::kDefault, 0, slice_a, slice_b, slice_c,
-      slice_d, BufferAllocation::Slice(), BufferAllocation::Slice(),
+      se::gpu::BlasLt::Epilogue::kDefault, /*algorithm_idx=*/0,
+      /*autotune_workspace_size=*/0, slice_a, slice_b, slice_c, slice_d,
+      BufferAllocation::Slice(), BufferAllocation::Slice(),
       BufferAllocation::Slice(), BufferAllocation::Slice(),
       BufferAllocation::Slice(), BufferAllocation::Slice(),
       BufferAllocation::Slice(), slice_workspace));
diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.cc
index fdc7d63891d797..fd0176489df957 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.cc
@@ -44,6 +44,7 @@ CublasLtMatmulThunk::CublasLtMatmulThunk(const CublasLtMatmulThunk& rhs)
       gemm_config_(rhs.gemm_config_),
       epilogue_(rhs.epilogue_),
       algorithm_idx_(rhs.algorithm_idx_),
+      autotune_workspace_size_(rhs.autotune_workspace_size_),
       canonical_hlo_(rhs.canonical_hlo_),
       a_(rhs.a_),
       b_(rhs.b_),
@@ -61,7 +62,8 @@ CublasLtMatmulThunk::CublasLtMatmulThunk(const CublasLtMatmulThunk& rhs)
 CublasLtMatmulThunk::CublasLtMatmulThunk(
     Thunk::ThunkInfo thunk_info, std::string canonical_hlo,
     GemmConfig gemm_config, se::gpu::BlasLt::Epilogue epilogue,
-    int64_t algorithm_idx, BufferAllocation::Slice a, BufferAllocation::Slice b,
+    int64_t algorithm_idx, int64_t autotune_workspace_size,
+    BufferAllocation::Slice a, BufferAllocation::Slice b,
     BufferAllocation::Slice c, BufferAllocation::Slice d,
     BufferAllocation::Slice bias, BufferAllocation::Slice aux,
     BufferAllocation::Slice a_scale, BufferAllocation::Slice b_scale,
@@ -72,6 +74,7 @@ CublasLtMatmulThunk::CublasLtMatmulThunk(
       gemm_config_(std::move(gemm_config)),
       epilogue_(epilogue),
       algorithm_idx_(algorithm_idx),
+      autotune_workspace_size_(autotune_workspace_size),
       canonical_hlo_(std::move(canonical_hlo)),
       a_(a),
       b_(b),
@@ -135,10 +138,11 @@ CublasLtMatmulThunk::GetCachedMatmulPlan(const ExecuteParams& params) {
 
     TF_ASSIGN_OR_RETURN(auto plan,
                         blas_lt->GetMatmulPlan(gemm_config_, epilogue_));
-    // if workspace buffer is not provided, consider only the algorithms which
-    // do not require a scratch space
-    int64_t max_workspace =
-        workspace_.has_value() ? workspace_.value().size() : 0;
+
+    // Set the workspace size to the size that was used for autotuning, so
+    // algorithm index will be the same as returned by GetAlgorithms called
+    // during autotuning.
+    int64_t max_workspace = autotune_workspace_size_;
 
     // If autotuning is disabled, there is no point on retrieving all
     // algorithms, it's enough to get the default one only.
@@ -182,6 +186,7 @@ absl::StatusOr<ThunkProto> CublasLtMatmulThunk::ToProto() const {
   cublas_lt_matmul_thunk->set_epilogue(
       stream_executor::gpu::BlasLt::EpilogueToProto(epilogue_));
   cublas_lt_matmul_thunk->set_algorithm_idx(algorithm_idx_);
+  cublas_lt_matmul_thunk->set_autotune_workspace_size(autotune_workspace_size_);
   cublas_lt_matmul_thunk->set_canonical_hlo(canonical_hlo_);
   TF_ASSIGN_OR_RETURN(*cublas_lt_matmul_thunk->mutable_a(), a_.ToProto());
   TF_ASSIGN_OR_RETURN(*cublas_lt_matmul_thunk->mutable_b(), b_.ToProto());
@@ -286,10 +291,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> CublasLtMatmulThunk::FromProto(
   return std::make_unique<CublasLtMatmulThunk>(
       std::move(thunk_info), std::move(proto.canonical_hlo()),
       xla::gpu::GemmConfig(std::move(gemm_config)), std::move(epilogue),
-      proto.algorithm_idx(), std::move(a), std::move(b), std::move(c),
-      std::move(d), std::move(bias), std::move(aux), std::move(a_scale),
-      std::move(b_scale), std::move(c_scale), std::move(d_scale),
-      std::move(d_amax), std::move(workspace));
+      proto.algorithm_idx(), proto.autotune_workspace_size(), std::move(a),
+      std::move(b), std::move(c), std::move(d), std::move(bias), std::move(aux),
+      std::move(a_scale), std::move(b_scale), std::move(c_scale),
+      std::move(d_scale), std::move(d_amax), std::move(workspace));
 }
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h
index 26c35ccca59902..efd1276d2c9b1d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk.h
@@ -38,6 +38,7 @@ class CublasLtMatmulThunk : public Thunk {
   CublasLtMatmulThunk(Thunk::ThunkInfo thunk_info, std::string canonical_hlo,
                       GemmConfig gemm_config,
                       se::gpu::BlasLt::Epilogue epilogue, int64_t algorithm_idx,
+                      int64_t autotune_workspace_size,
                       BufferAllocation::Slice a, BufferAllocation::Slice b,
                       BufferAllocation::Slice c, BufferAllocation::Slice d,
                       BufferAllocation::Slice bias /* may be null */,
@@ -75,6 +76,7 @@ class CublasLtMatmulThunk : public Thunk {
   GemmConfig gemm_config_;
   se::gpu::BlasLt::Epilogue epilogue_;
   int64_t algorithm_idx_;
+  int64_t autotune_workspace_size_;
   std::string canonical_hlo_;
   BufferAllocation::Slice a_;
   BufferAllocation::Slice b_;
diff --git a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
index c0d7d088081e52..8fa69ad0f10b36 100644
--- a/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/gpublas_lt_matmul_thunk_test.cc
@@ -159,9 +159,9 @@ class GpuBlasLtThunkBuilder {
     return std::make_unique<CublasLtMatmulThunk>(
         std::move(thunk_info), std::move(canonical_hlo), std::move(gemm_config),
         epilogue,
-        /*algorithm_idx*/ 0, slices[0], slices[1],
-        has_matrix_bias ? slices[2] : slices.back(), slices.back(), bias,
-        BufferAllocation::Slice{} /* aux */,
+        /*algorithm_idx*/ 0, backend_config.autotune_workspace_size(),
+        slices[0], slices[1], has_matrix_bias ? slices[2] : slices.back(),
+        slices.back(), bias, BufferAllocation::Slice{} /* aux */,
         BufferAllocation::Slice{} /* a_scale */,
         BufferAllocation::Slice{} /* b_scale */,
         BufferAllocation::Slice{} /* c_scale */,
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 36b2e69ac96c4c..6d464b253e4409 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -309,6 +309,7 @@ message CublasLtMatmulThunkProto {
   optional xla.buffer_assignment.BufferAllocationSliceProto d_scale = 14;
   optional xla.buffer_assignment.BufferAllocationSliceProto d_amax = 15;
   optional xla.buffer_assignment.BufferAllocationSliceProto workspace = 16;
+  int64 autotune_workspace_size = 17;
 }
 
 message CubSortThunkProto {
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 33f0ab94a82627..79e13fb1726a28 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1780,6 +1780,7 @@ cc_library(
         "//xla/service/gpu/transforms:gemm_fusion",
         "//xla/service/gpu/transforms:gemm_fusion_swap_operands",
         "//xla/service/gpu/transforms:gemm_rewriter",
+        "//xla/service/gpu/transforms:gemm_workspace_rewriter",
         "//xla/service/gpu/transforms:gemv_rewriter",
         "//xla/service/gpu/transforms:hoist_fused_bitcasts",
         "//xla/service/gpu/transforms:layout_assignment",
@@ -2184,6 +2185,7 @@ cc_library(
         "//xla/service/gpu/transforms:cudnn_norm_rewriter",
         "//xla/service/gpu/transforms:cudnn_pad_for_convolutions",
         "//xla/service/gpu/transforms:cudnn_simplify_padding",
+        "//xla/service/gpu/transforms:gemm_workspace_rewriter",
         "//xla/service/gpu/transforms:triangular_solve_rewriter",
         "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor:device_description",
diff --git a/third_party/xla/xla/service/gpu/backend_configs.proto b/third_party/xla/xla/service/gpu/backend_configs.proto
index 4d3c77a3a63c38..ff1799169002b9 100644
--- a/third_party/xla/xla/service/gpu/backend_configs.proto
+++ b/third_party/xla/xla/service/gpu/backend_configs.proto
@@ -109,6 +109,9 @@ message GemmBackendConfig {
   bool damax_output = 18;
 
   reserved 19;
+
+  // The workspace size used during autotuning when the algorithm was selected.
+  int64 autotune_workspace_size = 20;
 }
 
 // Backend config for bitcast operation generated from MLIR MHLO dialect.
diff --git a/third_party/xla/xla/service/gpu/nvptx_compiler.cc b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
index d48523abc3da12..56c680f4de81d7 100644
--- a/third_party/xla/xla/service/gpu/nvptx_compiler.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_compiler.cc
@@ -91,6 +91,7 @@ limitations under the License.
 #include "xla/service/gpu/transforms/cudnn_norm_rewriter.h"
 #include "xla/service/gpu/transforms/cudnn_pad_for_convolutions.h"
 #include "xla/service/gpu/transforms/cudnn_simplify_padding.h"
+#include "xla/service/gpu/transforms/gemm_workspace_rewriter.h"
 #include "xla/service/gpu/transforms/triangular_solve_rewriter.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_module_config.h"
@@ -375,6 +376,11 @@ absl::Status NVPTXCompiler::AddConvAndGemmAutotuningPasses(
                             thread_pool, should_autotune, target_config,
                             options.device_allocator));
   pipeline->AddPass(std::move(autotuner_pass));
+
+  // After autotuning, update GEMM workspace sizes to match the exact
+  // requirements of the selected algorithms, potentially reducing memory usage.
+  pipeline->AddPass<GemmWorkspaceRewriter>(gpu_version, stream_exec);
+
   return absl::OkStatus();
 }
 
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 2dc34ce63ca7ea..d058a7276b1b96 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -672,8 +672,8 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCublasLtMatmulThunk(
       HloPrintOptions::Fingerprint().set_print_backend_config(true));
   auto thunk = std::make_unique<CublasLtMatmulThunk>(
       std::move(thunk_info), std::move(canonical_hlo), std::move(gemm_config),
-      blas_lt_epilogue, algorithm, a, b, c, d, bias, aux, a_scale, b_scale,
-      c_scale, d_scale, d_amax, workspace_buffer);
+      blas_lt_epilogue, algorithm, config.autotune_workspace_size(), a, b, c, d,
+      bias, aux, a_scale, b_scale, c_scale, d_scale, d_amax, workspace_buffer);
   return GetThunkSequence(std::move(thunk));
 }
 
@@ -767,8 +767,8 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCublasLtMatmulThunkF8(
       HloPrintOptions::Fingerprint().set_print_backend_config(true));
   auto thunk = std::make_unique<CublasLtMatmulThunk>(
       std::move(thunk_info), std::move(canonical_hlo), std::move(gemm_config),
-      blas_lt_epilogue, algorithm, a, b, c, d, bias, aux, a_scale, b_scale,
-      c_scale, d_scale, d_amax, workspace_buffer);
+      blas_lt_epilogue, algorithm, config.autotune_workspace_size(), a, b, c, d,
+      bias, aux, a_scale, b_scale, c_scale, d_scale, d_amax, workspace_buffer);
   return GetThunkSequence(std::move(thunk));
 }
 
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 12c6abf3a46270..9802b9648076d5 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1376,6 +1376,44 @@ xla_test(
     ],
 )
 
+cc_library(
+    name = "gemm_workspace_rewriter",
+    srcs = ["gemm_workspace_rewriter.cc"],
+    hdrs = ["gemm_workspace_rewriter.h"],
+    deps = [
+        "//xla:shape_util",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/pass:hlo_pass",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:cublas_cudnn",
+        "//xla/service/gpu:matmul_utils",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:stream",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor/gpu:gpu_blas_lt",
+        "//xla/tsl/platform:status_macros",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
+xla_test(
+    name = "gemm_workspace_rewriter_test",
+    srcs = ["gemm_workspace_rewriter_test.cc"],
+    backends = ["gpu"],
+    deps = [
+        ":gemm_workspace_rewriter",
+        "//xla/hlo/ir:hlo",
+        "//xla/service/gpu/tests:gpu_codegen_test",
+        "//xla/stream_executor:stream_executor_h",
+        "//xla/tsl/platform:statusor",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "gemm_fusion",
     srcs = ["gemm_fusion.cc"],
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.cc
new file mode 100644
index 00000000000000..31a17257a3ccd7
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.cc
@@ -0,0 +1,240 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/gemm_workspace_rewriter.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/cublas_cudnn.h"
+#include "xla/service/gpu/matmul_utils.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/gpu/gpu_blas_lt.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/status_macros.h"
+
+namespace xla {
+namespace gpu {
+
+namespace se = ::stream_executor;
+using se::gpu::BlasLt;
+
+namespace {
+
+absl::StatusOr<BlasLt::Epilogue> AsBlasLtEpilogue(
+    GemmBackendConfig_Epilogue epilogue) {
+  switch (epilogue) {
+    case GemmBackendConfig::DEFAULT:
+      return BlasLt::Epilogue::kDefault;
+    case GemmBackendConfig::RELU:
+      return BlasLt::Epilogue::kReLU;
+    case GemmBackendConfig::GELU:
+      return BlasLt::Epilogue::kGELU;
+    case GemmBackendConfig::GELU_AUX:
+      return BlasLt::Epilogue::kGELUWithAux;
+    case GemmBackendConfig::BIAS:
+      return BlasLt::Epilogue::kBias;
+    case GemmBackendConfig::BIAS_RELU:
+      return BlasLt::Epilogue::kBiasThenReLU;
+    case GemmBackendConfig::BIAS_GELU:
+      return BlasLt::Epilogue::kBiasThenGELU;
+    case GemmBackendConfig::BIAS_GELU_AUX:
+      return BlasLt::Epilogue::kBiasThenGELUWithAux;
+    default:
+      return absl::InternalError("Unsupported Epilogue.");
+  }
+}
+
+// Visitor that updates workspace sizes for cuBLASLt GEMM operations
+// based on the selected algorithm's actual workspace requirement.
+class GemmWorkspaceRewriteVisitor : public DfsHloRewriteVisitor {
+ public:
+  explicit GemmWorkspaceRewriteVisitor(
+      const se::GpuComputeCapability& gpu_version,
+      se::StreamExecutor* stream_exec)
+      : gpu_version_(gpu_version), stream_exec_(stream_exec) {}
+
+  absl::Status HandleCustomCall(HloInstruction* instr) override {
+    // Only handle cuBLASLt matmul calls
+    if (instr->custom_call_target() != kCublasLtMatmulCallTarget &&
+        instr->custom_call_target() != kCublasLtMatmulF8CallTarget) {
+      return absl::OkStatus();
+    }
+
+    // Skip if stream executor is not available
+    if (stream_exec_ == nullptr) {
+      return absl::OkStatus();
+    }
+
+    // Get the backend config
+    ASSIGN_OR_RETURN(auto gpu_config,
+                     instr->backend_config<GpuBackendConfig>());
+    const GemmBackendConfig& config = gpu_config.gemm_backend_config();
+
+    // Skip if no algorithm has been selected (not autotuned yet)
+    if (config.algorithm_case() != GemmBackendConfig::kSelectedAlgorithm) {
+      return absl::OkStatus();
+    }
+
+    int64_t selected_algorithm = config.selected_algorithm();
+
+    // Get the current output shape - must be a tuple with workspace as last
+    // element
+    if (!instr->shape().IsTuple() || instr->shape().tuple_shapes().empty()) {
+      return absl::OkStatus();
+    }
+
+    // Get the current workspace size
+    const Shape& current_workspace_shape = instr->shape().tuple_shapes().back();
+    if (current_workspace_shape.element_type() != S8) {
+      return absl::OkStatus();
+    }
+    int64_t current_workspace_size =
+        ShapeUtil::ByteSizeOf(current_workspace_shape);
+
+    // Create GemmConfig to get the matmul plan
+    ASSIGN_OR_RETURN(GemmConfig gemm_config,
+                     GemmConfig::For(instr, gpu_version_));
+
+    // Get the epilogue
+    ASSIGN_OR_RETURN(BlasLt::Epilogue epilogue,
+                     AsBlasLtEpilogue(config.epilogue()));
+
+    // Create a stream to query algorithms
+    ASSIGN_OR_RETURN(std::unique_ptr<se::Stream> stream,
+                     stream_exec_->CreateStream());
+
+    // Get the matmul plan
+    ASSIGN_OR_RETURN(
+        std::unique_ptr<BlasLt::MatmulPlan> plan,
+        se::gpu::BlasLt::GetMatmulPlan(stream.get(), gemm_config, epilogue));
+
+    // Query algorithms with the current workspace size limit
+    ASSIGN_OR_RETURN(
+        std::vector<BlasLt::MatmulAlgorithm> algorithms,
+        plan->GetAlgorithms(stream.get(), GemmConfig::kNumAlgorithms,
+                            current_workspace_size));
+
+    // Verify that the selected algorithm index is valid
+    if (selected_algorithm < 0 ||
+        selected_algorithm >= static_cast<int64_t>(algorithms.size())) {
+      VLOG(3) << "Selected algorithm index " << selected_algorithm
+              << " is out of range for " << instr->name()
+              << ", skipping workspace update.";
+      return absl::OkStatus();
+    }
+
+    // Get the actual workspace size for the selected algorithm
+    int64_t actual_workspace_size =
+        static_cast<int64_t>(algorithms[selected_algorithm].workspace_size);
+
+    // If the workspace size is already optimal, nothing to do
+    if (actual_workspace_size == current_workspace_size) {
+      return absl::OkStatus();
+    }
+
+    // Ensure we're not increasing the workspace size
+    if (actual_workspace_size > current_workspace_size) {
+      VLOG(3) << "Algorithm workspace size (" << actual_workspace_size
+              << ") exceeds current allocation (" << current_workspace_size
+              << ") for " << instr->name() << ", skipping update.";
+      return absl::OkStatus();
+    }
+
+    VLOG(2) << "Updating workspace size for " << instr->name() << " from "
+            << current_workspace_size << " to " << actual_workspace_size;
+
+    // Build the new output shape with updated workspace size
+    Shape new_output_shape = instr->shape();
+    *new_output_shape.mutable_tuple_shapes(
+        new_output_shape.tuple_shapes().size() - 1) =
+        ShapeUtil::MakeShape(S8, {actual_workspace_size});
+
+    // Clone the instruction with the new shape
+    HloInstruction* new_call = instr->AddInstruction(
+        instr->CloneWithNewOperands(new_output_shape, instr->operands()));
+
+    // Update operand aliasing if present
+    auto* custom_call = Cast<HloCustomCallInstruction>(new_call);
+    if (!custom_call->output_to_operand_aliasing().empty()) {
+      custom_call->set_output_to_operand_aliasing(
+          Cast<HloCustomCallInstruction>(instr)->output_to_operand_aliasing());
+    }
+
+    // Collect users first to avoid modifying during iteration
+    std::vector<HloInstruction*> users(instr->users().begin(),
+                                       instr->users().end());
+
+    // Replace all users of the old instruction
+    for (HloInstruction* user : users) {
+      HloGetTupleElementInstruction* user_get_tuple =
+          DynCast<HloGetTupleElementInstruction>(user);
+      if (user_get_tuple == nullptr) {
+        continue;
+      }
+      HloInstruction* get_output =
+          instr->AddInstruction(HloInstruction::CreateGetTupleElement(
+              new_call, user_get_tuple->tuple_index()));
+      RETURN_IF_ERROR(ReplaceInstruction(user_get_tuple, get_output));
+    }
+
+    MarkAsChanged();
+    return absl::OkStatus();
+  }
+
+ private:
+  se::GpuComputeCapability gpu_version_;
+  se::StreamExecutor* stream_exec_;
+};
+
+}  // namespace
+
+absl::StatusOr<bool> GemmWorkspaceRewriter::RunImpl(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  // Skip if stream executor is not available
+  if (stream_exec_ == nullptr) {
+    VLOG(2) << "Stream executor not available, skipping workspace rewrite.";
+    return false;
+  }
+
+  bool changed = false;
+  for (HloComputation* computation :
+       module->MakeNonfusionComputations(execution_threads)) {
+    GemmWorkspaceRewriteVisitor visitor(gpu_version_, stream_exec_);
+    RETURN_IF_ERROR(computation->Accept(&visitor));
+    changed |= visitor.changed();
+  }
+  return changed;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.h b/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.h
new file mode 100644
index 00000000000000..b7c5e3e5f47feb
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter.h
@@ -0,0 +1,55 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_GPU_TRANSFORMS_GEMM_WORKSPACE_REWRITER_H_
+#define XLA_SERVICE_GPU_TRANSFORMS_GEMM_WORKSPACE_REWRITER_H_
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/pass/hlo_pass_interface.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/stream_executor.h"
+
+namespace xla {
+namespace gpu {
+
+// This pass updates the workspace size for cuBLAS/cuBLASLt GEMM operations
+// after autotuning has selected a specific algorithm. The GemmRewriter pass
+// conservatively allocates workspace before autotuning. After autotuning,
+// we know the exact algorithm selected and can query its actual workspace
+// requirement, potentially reducing memory usage.
+class GemmWorkspaceRewriter : public HloModulePass {
+ public:
+  explicit GemmWorkspaceRewriter(const se::GpuComputeCapability& gpu_version,
+                                 stream_executor::StreamExecutor* stream_exec)
+      : gpu_version_(gpu_version), stream_exec_(stream_exec) {}
+
+  absl::string_view name() const override { return "gemm-workspace-rewriter"; }
+
+ protected:
+  absl::StatusOr<bool> RunImpl(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+
+ private:
+  se::GpuComputeCapability gpu_version_;
+  stream_executor::StreamExecutor* stream_exec_;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_SERVICE_GPU_TRANSFORMS_GEMM_WORKSPACE_REWRITER_H_
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter_test.cc
new file mode 100644
index 00000000000000..e8028932f2ad5c
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_workspace_rewriter_test.cc
@@ -0,0 +1,69 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/gemm_workspace_rewriter.h"
+
+#include <memory>
+
+#include <gtest/gtest.h>
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/gpu/tests/gpu_codegen_test.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+namespace se = ::stream_executor;
+
+class GemmWorkspaceRewriterTest : public GpuCodegenTest {};
+
+// Tests that cuBLASLt calls with a selected algorithm and large workspace
+// are rewritten to use a smaller workspace.
+TEST_F(GemmWorkspaceRewriterTest,
+       CublasLtCallWithSelectedAlgorithmIsRewritten) {
+  // This HLO simulates a cuBLASLt matmul after autotuning - it has
+  // selected_algorithm set and a conservatively large workspace.
+  const char* hlo_text = R"(
+HloModule TestModule
+
+ENTRY main {
+  lhs = f32[32,64] parameter(0)
+  rhs = f32[64,128] parameter(1)
+  custom_call = (f32[32,128], s8[4194304]) custom-call(lhs, rhs),
+    custom_call_target="__cublas$lt$matmul",
+    backend_config={"operation_queue_id":"0","wait_on_operation_queues":[],"gemm_backend_config":{"alpha_real":1,"alpha_imag":0,"beta":0,"dot_dimension_numbers":{"lhs_contracting_dimensions":["1"],"rhs_contracting_dimensions":["0"],"lhs_batch_dimensions":[],"rhs_batch_dimensions":[]},"precision_config":{"operand_precision":["DEFAULT","DEFAULT"]},"epilogue":"DEFAULT","selected_algorithm":"0"}}
+  ROOT result = f32[32,128] get-tuple-element(custom_call), index=0
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_text));
+
+  se::StreamExecutor* stream_exec = backend().default_stream_executor();
+  GemmWorkspaceRewriter pass(
+      stream_exec->GetDeviceDescription().gpu_compute_capability(),
+      stream_exec);
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, pass.Run(module.get()));
+
+  // The pass should reduce the workspace size from 4MB to the algorithm's
+  // actual requirement (typically much smaller).
+  EXPECT_TRUE(changed);
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla

From 4c7b99b876eb7a7b73552511b49daafb368dc3f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 22:58:57 -0800
Subject: [PATCH 373/753] Automated Code Change

PiperOrigin-RevId: 845603681
---
 third_party/xla/xla/service/generic_transfer_manager_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/generic_transfer_manager_test.cc b/third_party/xla/xla/service/generic_transfer_manager_test.cc
index 4347dded08428f..6c394b0886618c 100644
--- a/third_party/xla/xla/service/generic_transfer_manager_test.cc
+++ b/third_party/xla/xla/service/generic_transfer_manager_test.cc
@@ -65,7 +65,8 @@ class GenericTransferManagerTest : public ::testing::Test {
     TF_ASSERT_OK_AND_ASSIGN(stream_executor_, platform->ExecutorForDevice(0));
     TF_ASSERT_OK_AND_ASSIGN(stream_, stream_executor_->CreateStream());
     allocator_ =
-        std::make_unique<se::StreamExecutorMemoryAllocator>(stream_executor_);
+        std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
+            stream_executor_);
   }
 
   ScopedShapedBuffer AllocateBuffer(const Shape& shape) {

From 9804d21251e7143b91d1fabfb40815b8de848008 Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Tue, 16 Dec 2025 23:00:30 -0800
Subject: [PATCH 374/753] [xla:cpu] hlo_benchmark_runner: NFC: drop TF_ prefix
 from RETURN_IF_ERROR/ASSIGN_OR_RETURN

PiperOrigin-RevId: 845604102
---
 .../xla/xla/backends/cpu/benchmarks/BUILD     |  1 +
 .../cpu/benchmarks/hlo_benchmark_runner.cc    | 79 ++++++++++---------
 2 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/benchmarks/BUILD b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
index 418a5e3e9027d2..8ad37162b2c78f 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/BUILD
+++ b/third_party/xla/xla/backends/cpu/benchmarks/BUILD
@@ -68,6 +68,7 @@ cc_library(
         "//xla/tools:run_hlo_module_proto_cc",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test_benchmark",
         "@com_google_absl//absl/base:nullability",
diff --git a/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc b/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc
index 97708cacc365c5..4d5d25d78d5fdc 100644
--- a/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc
+++ b/third_party/xla/xla/backends/cpu/benchmarks/hlo_benchmark_runner.cc
@@ -57,6 +57,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "tsl/platform/casts.h"
 #include "tsl/platform/path.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla::cpu {
 
@@ -131,10 +132,10 @@ absl::Status RunHloBenchmark(benchmark::State& state,
                              absl::Span<const Literal* const> args,
                              StrToStrMapping replacements,
                              const HloBenchmarkOptions& benchmark_options) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      ParseAndReturnUnverifiedModule(
-                          absl::StrReplaceAll(hlo_module, replacements),
-                          HloModuleConfig() /* unused */));
+  ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                   ParseAndReturnUnverifiedModule(
+                       absl::StrReplaceAll(hlo_module, replacements),
+                       HloModuleConfig() /* unused */));
   return RunHloBenchmark(state, std::move(module), args, benchmark_options);
 }
 
@@ -158,11 +159,11 @@ absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
                                  absl::Span<const Literal* const> args,
                                  const HloBenchmarkOptions& benchmark_options) {
   xla::CpuClientOptions client_options;
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtClient> client,
-                      xla::GetXlaPjrtCpuClient(client_options));
+  ASSIGN_OR_RETURN(std::unique_ptr<PjRtClient> client,
+                   xla::GetXlaPjrtCpuClient(client_options));
   PjRtDevice* device = client->devices().front();
-  TF_ASSIGN_OR_RETURN(PjRtMemorySpace * memory_space,
-                      device->default_memory_space());
+  ASSIGN_OR_RETURN(PjRtMemorySpace * memory_space,
+                   device->default_memory_space());
 
   XlaComputation computation(module->ToProto());
 
@@ -176,12 +177,12 @@ absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
   std::unique_ptr<PjRtLoadedExecutable> executable;
   if (benchmark_options.aot_options) {
     auto* cpu_client = tsl::down_cast<PjRtCpuClient*>(client.get());
-    TF_ASSIGN_OR_RETURN(executable, cpu_client->CompileAheadOfTimeAndLoad(
-                                        computation, compile_options,
-                                        *benchmark_options.aot_options));
+    ASSIGN_OR_RETURN(executable, cpu_client->CompileAheadOfTimeAndLoad(
+                                     computation, compile_options,
+                                     *benchmark_options.aot_options));
   } else {
-    TF_ASSIGN_OR_RETURN(executable,
-                        client->CompileAndLoad(computation, compile_options));
+    ASSIGN_OR_RETURN(executable,
+                     client->CompileAndLoad(computation, compile_options));
   }
 
   CHECK_GE(benchmark_options.num_executions, 1);
@@ -198,14 +199,14 @@ absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
   // If the user has not passed any arguments we need to generate
   // fake arguments based on the number of inputs to the hlo module.
   if (args.empty()) {
-    TF_ASSIGN_OR_RETURN(std::vector<Literal> fake_args,
-                        MakeFakeArguments(module.get()));
+    ASSIGN_OR_RETURN(std::vector<Literal> fake_args,
+                     MakeFakeArguments(module.get()));
     for (auto& args_buffers : execution_args_buffers) {
       args_buffers.reserve(fake_args.size());
       for (const Literal& arg : fake_args) {
-        TF_ASSIGN_OR_RETURN(args_buffers.emplace_back(),
-                            client->BufferFromHostLiteral(arg, memory_space));
-        TF_RETURN_IF_ERROR(args_buffers.back()->GetReadyFuture().Await());
+        ASSIGN_OR_RETURN(args_buffers.emplace_back(),
+                         client->BufferFromHostLiteral(arg, memory_space));
+        RETURN_IF_ERROR(args_buffers.back()->GetReadyFuture().Await());
       }
     }
   } else {
@@ -218,9 +219,9 @@ absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
     for (auto& args_buffers : execution_args_buffers) {
       args_buffers.reserve(args.size());
       for (const Literal* arg : args) {
-        TF_ASSIGN_OR_RETURN(args_buffers.emplace_back(),
-                            client->BufferFromHostLiteral(*arg, memory_space));
-        TF_RETURN_IF_ERROR(args_buffers.back()->GetReadyFuture().Await());
+        ASSIGN_OR_RETURN(args_buffers.emplace_back(),
+                         client->BufferFromHostLiteral(*arg, memory_space));
+        RETURN_IF_ERROR(args_buffers.back()->GetReadyFuture().Await());
       }
     }
   }
@@ -284,7 +285,7 @@ absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
       std::vector<std::unique_ptr<PjRtBuffer>>& args_buffers =
           execution_args_buffers[i];
       std::vector<PjRtBuffer*>& args_ptrs = execution_args_ptrs[i];
-      TF_RETURN_IF_ERROR(alias_helper.SwapOutputAliasedBuffersToArgumentBuffers(
+      RETURN_IF_ERROR(alias_helper.SwapOutputAliasedBuffersToArgumentBuffers(
           execution_results[i], args_buffers, args_ptrs));
     }
 
@@ -293,12 +294,12 @@ absl::Status RunHloBenchmarkImpl(benchmark::State* absl_nullable state,
 
   // Run once. For a regular benchmark this will serve as a warm-up;
   // for RunHloBenchmarkOnce this will be the only run.
-  TF_RETURN_IF_ERROR(run_benchmark_once());
+  RETURN_IF_ERROR(run_benchmark_once());
 
   // Benchmark executable.
   if (state) {
     for (auto _ : *state) {
-      TF_RETURN_IF_ERROR(run_benchmark_once());
+      RETURN_IF_ERROR(run_benchmark_once());
     }
   }
 
@@ -324,10 +325,10 @@ absl::Status CompileHloBenchmark(benchmark::State& state,
                                  absl::string_view hlo_module,
                                  StrToStrMapping replacements,
                                  const HloBenchmarkOptions& benchmark_options) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      ParseAndReturnUnverifiedModule(
-                          absl::StrReplaceAll(hlo_module, replacements),
-                          HloModuleConfig() /* unused */));
+  ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
+                   ParseAndReturnUnverifiedModule(
+                       absl::StrReplaceAll(hlo_module, replacements),
+                       HloModuleConfig() /* unused */));
 
   return CompileHloBenchmark(state, std::move(module), benchmark_options);
 }
@@ -336,8 +337,8 @@ absl::Status CompileHloBenchmark(benchmark::State& state,
                                  std::unique_ptr<HloModule> module,
                                  const HloBenchmarkOptions& benchmark_options) {
   xla::CpuClientOptions client_options;
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtClient> client,
-                      xla::GetXlaPjrtCpuClient(client_options));
+  ASSIGN_OR_RETURN(std::unique_ptr<PjRtClient> client,
+                   xla::GetXlaPjrtCpuClient(client_options));
 
   XlaComputation computation(module->ToProto());
 
@@ -348,8 +349,8 @@ absl::Status CompileHloBenchmark(benchmark::State& state,
   }
 
   for (auto _ : state) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtLoadedExecutable> executable,
-                        client->CompileAndLoad(computation, compile_options));
+    ASSIGN_OR_RETURN(std::unique_ptr<PjRtLoadedExecutable> executable,
+                     client->CompileAndLoad(computation, compile_options));
     tsl::testing::DoNotOptimize(executable);
   }
 
@@ -365,11 +366,11 @@ LoadFromHloSnapshotOrHloModuleProto(absl::string_view hlo_data,
   auto iteration_literals_proto =
       std::make_unique<RunHloModuleIterationLiterals>();
   if (extension == "pb" || extension == "pbtxt") {
-    TF_ASSIGN_OR_RETURN(iteration_literals_proto,
-                        LoadInputFromData(hlo_data, extension));
+    ASSIGN_OR_RETURN(iteration_literals_proto,
+                     LoadInputFromData(hlo_data, extension));
   }
 
-  TF_ASSIGN_OR_RETURN(auto hlo_module, LoadModuleFromData(hlo_data, extension));
+  ASSIGN_OR_RETURN(auto hlo_module, LoadModuleFromData(hlo_data, extension));
 
   return std::make_pair(std::move(hlo_module),
                         std::move(iteration_literals_proto));
@@ -379,12 +380,12 @@ absl::StatusOr<std::pair<std::unique_ptr<HloModule>,
                          std::unique_ptr<RunHloModuleIterationLiterals>>>
 LoadFromHloUnoptimizedSnapshot(
     const HloUnoptimizedSnapshot& unoptimized_snapshot) {
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       HloModuleConfig config,
       HloModule::CreateModuleConfigFromProto(unoptimized_snapshot.hlo_module(),
                                              xla::GetDebugOptionsFromFlags()));
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       std::unique_ptr<HloModule> hlo_module,
       HloModule::CreateFromProto(unoptimized_snapshot.hlo_module(), config));
 
@@ -410,8 +411,8 @@ absl::StatusOr<std::pair<std::unique_ptr<HloModule>,
                          std::unique_ptr<RunHloModuleIterationLiterals>>>
 LoadHloModuleAndMaybeIterationLiterals(absl::string_view hlo_path) {
   std::string hlo_data;
-  TF_RETURN_IF_ERROR(tsl::ReadFileToString(tsl::Env::Default(),
-                                           std::string(hlo_path), &hlo_data));
+  RETURN_IF_ERROR(tsl::ReadFileToString(tsl::Env::Default(),
+                                        std::string(hlo_path), &hlo_data));
 
   HloUnoptimizedSnapshot unoptimized_snapshot;
   if (unoptimized_snapshot.ParseFromString(hlo_data)) {

From 55a073938815c44dca4289be0b5ff3f0d1f1b964 Mon Sep 17 00:00:00 2001
From: Alex <alexandros.theodoridis@amd.com>
Date: Tue, 16 Dec 2025 23:21:44 -0800
Subject: [PATCH 375/753] PR #35191: [ROCm] Fix jax build with rocm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35191

📝 Summary of Changes
While building jax and running its test it complaints about missing hipsolver_potrf_ffi
FFI handler. This change ensures that libs located in data are linked against the targets
where they are set as a dependency.

🎯 Justification
Fixing this issue:
```
jax.errors.JaxRuntimeError: NOT_FOUND: No FFI handler registered for hipsolver_potrf_ffi on a platform ROCM (canonical rocm)
--------------------
For simplicity, JAX has removed its internal frames from the traceback of the following exception. Set JAX_TRACEBACK_FILTERING=off to include these.
```

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
Not relevant

🧪 Unit Tests:
CI U-Tests

🧪 Execution Tests:
Not relevant

Copybara import of the project:

--
1cf513b957650bc7e1510cf8bef6bc0285b9d8c9 by Alexandros Theodoridis <atheodor@amd.com>:

Fix jax build with rocm

--
3c7dee35701d8fc1e7fe040389c4ec8fcc1fe475 by Alexandros Theodoridis <atheodor@amd.com>:

Add more libs to data

--
90192b07dd5d0a8ba6acd9f51b3110db70095db4 by Alexandros Theodoridis <atheodor@amd.com>:

Put everhting to data

--
cfa46dd5ce30181b268d0338c901834699988838 by Alexandros Theodoridis <atheodor@amd.com>:

Switch hermetic build to release version of rocm

Merging this change closes #35191

PiperOrigin-RevId: 845610364
---
 .../xla/third_party/gpus/rocm/BUILD.tpl       | 26 ++++++++++++++-----
 .../gpu/collectives/rccl_communicator.cc      |  2 +-
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/third_party/gpus/rocm/BUILD.tpl b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
index d3b6f87a4adf18..f14780e2b4a194 100644
--- a/third_party/xla/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
@@ -105,6 +105,7 @@ cc_library(
         ":hip",
         ":hipblas",
         ":hipblaslt",
+        ":hipfft",
         ":hiprand",
         ":hipsolver",
         ":hipsparse",
@@ -116,7 +117,6 @@ cc_library(
         ":rocsolver",
         ":rocsparse",
         ":roctracer",
-        ":hipfft",
     ],
 )
 
@@ -408,11 +408,15 @@ cc_library(
 cc_library(
     name = "rocsolver",
     hdrs = glob(["%{rocm_root}/include/rocsolver/**"]),
-    data = glob(["%{rocm_root}/lib/librocsolver*.so*"]),
+    data = glob([
+        "%{rocm_root}/lib/librocsolver*.so*",
+        "%{rocm_root}/lib/host-math/lib/*.so*",
+    ]),
     include_prefix = "rocm",
     includes = [
         "%{rocm_root}/include/",
     ],
+    linkopts = ["-lrocsolver"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
     deps = [
@@ -423,14 +427,18 @@ cc_library(
 
 cc_library(
     name = "rocsparse",
-    srcs = glob(["%{rocm_root}/lib/librocsparse*.so*"]),
+    data = glob(["%{rocm_root}/lib/librocsparse*.so*"]),
     include_prefix = "rocm",
     includes = [
         "%{rocm_root}/include/",
     ],
+    linkopts = ["-lrocsparse"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
-    deps = [":rocm_config"],
+    deps = [
+        ":rocm_config",
+        ":rocm_rpath",
+    ],
 )
 
 cc_library(
@@ -441,9 +449,14 @@ cc_library(
     includes = [
         "%{rocm_root}/include/",
     ],
+    linkopts = ["-lhipsolver"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
-    deps = [":rocm_config"],
+    deps = [
+        ":rocm_config",
+        ":rocm_rpath",
+        ":rocsparse",
+    ],
 )
 
 cc_library(
@@ -454,6 +467,7 @@ cc_library(
     includes = [
         "%{rocm_root}/include/",
     ],
+    linkopts = ["-lhipblas"],
     strip_include_prefix = "%{rocm_root}",
     visibility = ["//visibility:public"],
     deps = [
@@ -532,8 +546,8 @@ cc_library(
 
 cc_library(
     name = "amd_comgr_dynamic",
-    hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]),
     srcs = ["%{rocm_root}/lib/libamd_comgr_stub.a"],
+    hdrs = glob(["%{rocm_root}/include/amd_comgr/**"]),
     data = glob([
         "%{rocm_root}/lib/libamd_comgr_loader.so*",
         "%{rocm_root}/lib/libamd_comgr.so*",
diff --git a/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc
index 1135391d0c4370..2b54d5fdcfe495 100644
--- a/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/rccl_communicator.cc
@@ -401,7 +401,7 @@ RcclCommunicator::RegisterBuffer(stream_executor::DeviceAddressBase buffer,
           XLA_RCCL_RETURN_IF_ERROR(ncclGroupStart());
           XLA_RCCL_RETURN_IF_ERROR(ncclCommWindowRegister(
               comm_, buffer.opaque(), buffer.size(), (ncclWindow_t*)&handle,
-              RCCL_WIN_COLL_SYMMETRIC));
+              NCCL_WIN_COLL_SYMMETRIC));
           XLA_RCCL_RETURN_IF_ERROR(ncclGroupEnd());
           if (group_nesting_level_ == 0) {
             TF_RETURN_IF_ERROR(PollUntilDone());

From 7deb4e7851c3ec14ed8b4b0d948610bd5308ccb1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 23:33:13 -0800
Subject: [PATCH 376/753] Automated Code Change

PiperOrigin-RevId: 845614233
---
 .../xla/xla/service/gpu/transforms/gemm_rewriter_test.cc   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
index 9249133b3ad766..d401c9900a65d5 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_test.cc
@@ -129,7 +129,7 @@ ENTRY AddDotsFunc {
     return ParseAndReturnVerifiedModule(hlo_text, config);
   };
 
-  se::StreamExecutorMemoryAllocator allocator(
+  stream_executor::StreamExecutorAddressAllocator allocator(
       backend().default_stream_executor());
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<HloModule> optimized_module,
@@ -1488,8 +1488,9 @@ class GemmRewriteAllocationTest : public GpuCodegenTest {
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> optimized_module,
                             GetOptimizedModule(hlo));
     if (allocator_ == nullptr) {
-      allocator_ = std::make_unique<se::StreamExecutorMemoryAllocator>(
-          backend().default_stream_executor());
+      allocator_ =
+          std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
+              backend().default_stream_executor());
     }
     TF_ASSERT_OK_AND_ASSIGN(
         std::unique_ptr<Executable> executable,

From 991ed1011d841cd14a193571a9ac2df68d62efe9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 23:43:47 -0800
Subject: [PATCH 377/753] Automated Code Change

PiperOrigin-RevId: 845617932
---
 .../buffer_debug_float_check_kernel_cuda_test.cc |  4 ++--
 ...buffer_debug_xor_checksum_kernel_cuda_test.cc |  4 ++--
 .../cuda/cub_prefix_sum_kernel_cuda_test.cc      |  4 ++--
 .../stream_executor/cuda/cuda_executor_test.cc   | 16 ++++++++--------
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc b/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc
index a1ab9cbb610482..053dc05642c88f 100644
--- a/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/buffer_debug_float_check_kernel_cuda_test.cc
@@ -64,7 +64,7 @@ class FloatCheckKernelTest : public ::testing::Test {
     TF_ASSERT_OK_AND_ASSIGN(executor_, platform_->ExecutorForDevice(0));
     TF_ASSERT_OK_AND_ASSIGN(stream_, executor_->CreateStream(std::nullopt));
     allocator_ =
-        std::make_unique<se::StreamExecutorMemoryAllocator>(stream_->parent());
+        std::make_unique<StreamExecutorAddressAllocator>(stream_->parent());
 
     if (!executor_->GetDeviceDescription()
              .cuda_compute_capability()
@@ -140,7 +140,7 @@ class FloatCheckKernelTest : public ::testing::Test {
   se::Platform* platform_;
   se::StreamExecutor* executor_;
   std::unique_ptr<se::Stream> stream_;
-  std::unique_ptr<se::StreamExecutorMemoryAllocator> allocator_;
+  std::unique_ptr<StreamExecutorAddressAllocator> allocator_;
 };
 
 TEST_F(FloatCheckKernelTest, ChecksFloatsForF32) {
diff --git a/third_party/xla/xla/stream_executor/cuda/buffer_debug_xor_checksum_kernel_cuda_test.cc b/third_party/xla/xla/stream_executor/cuda/buffer_debug_xor_checksum_kernel_cuda_test.cc
index 26638e74e65d8e..b0583d8000dc1f 100644
--- a/third_party/xla/xla/stream_executor/cuda/buffer_debug_xor_checksum_kernel_cuda_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/buffer_debug_xor_checksum_kernel_cuda_test.cc
@@ -59,7 +59,7 @@ class ChecksumKernelTest : public ::testing::Test {
     TF_ASSERT_OK_AND_ASSIGN(executor_, platform_->ExecutorForDevice(0));
     TF_ASSERT_OK_AND_ASSIGN(stream_, executor_->CreateStream(std::nullopt));
     allocator_ =
-        std::make_unique<se::StreamExecutorMemoryAllocator>(stream_->parent());
+        std::make_unique<StreamExecutorAddressAllocator>(stream_->parent());
 
     if (!executor_->GetDeviceDescription()
              .cuda_compute_capability()
@@ -117,7 +117,7 @@ class ChecksumKernelTest : public ::testing::Test {
   se::Platform* platform_;
   se::StreamExecutor* executor_;
   std::unique_ptr<se::Stream> stream_;
-  std::unique_ptr<se::StreamExecutorMemoryAllocator> allocator_;
+  std::unique_ptr<StreamExecutorAddressAllocator> allocator_;
 };
 
 TEST_F(ChecksumKernelTest, ComputesCorrectChecksumForMultipleOf32Bit) {
diff --git a/third_party/xla/xla/stream_executor/cuda/cub_prefix_sum_kernel_cuda_test.cc b/third_party/xla/xla/stream_executor/cuda/cub_prefix_sum_kernel_cuda_test.cc
index d85c55114372c4..3a4b19a46996d3 100644
--- a/third_party/xla/xla/stream_executor/cuda/cub_prefix_sum_kernel_cuda_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cub_prefix_sum_kernel_cuda_test.cc
@@ -63,7 +63,7 @@ class CubPrefixSumKernelCudaTest
     TF_ASSERT_OK_AND_ASSIGN(executor_, platform_->ExecutorForDevice(0));
     TF_ASSERT_OK_AND_ASSIGN(stream_, executor_->CreateStream(std::nullopt));
     allocator_ =
-        std::make_unique<se::StreamExecutorMemoryAllocator>(stream_->parent());
+        std::make_unique<StreamExecutorAddressAllocator>(stream_->parent());
   }
 
   template <typename T>
@@ -148,7 +148,7 @@ class CubPrefixSumKernelCudaTest
   se::Platform* platform_;
   se::StreamExecutor* executor_;
   std::unique_ptr<se::Stream> stream_;
-  std::unique_ptr<se::StreamExecutorMemoryAllocator> allocator_;
+  std::unique_ptr<StreamExecutorAddressAllocator> allocator_;
 };
 
 TEST_P(CubPrefixSumKernelCudaTest, TestPrefixSum) {
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
index 8b6c6ea3491fe9..86e4198a59b4d0 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor_test.cc
@@ -118,8 +118,8 @@ TEST(CudaExecutorTest, CreateUnifiedMemoryAllocatorWorks) {
       executor->CreateMemoryAllocator(MemorySpace::kUnified));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
 }
 
 TEST(CudaExecutorTest, CreateHostMemoryAllocatorWorks) {
@@ -131,8 +131,8 @@ TEST(CudaExecutorTest, CreateHostMemoryAllocatorWorks) {
                           executor->CreateMemoryAllocator(MemorySpace::kHost));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
 }
 
 TEST(CudaExecutorTest, CreateCollectiveMemoryAllocatorWorks) {
@@ -145,8 +145,8 @@ TEST(CudaExecutorTest, CreateCollectiveMemoryAllocatorWorks) {
       executor->CreateMemoryAllocator(MemorySpace::kCollective));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
 }
 
 // TODO: b/420735471 - Enable test once fixed.
@@ -189,7 +189,7 @@ TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithUnifiedMemory) {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           unified_memory_allocator->Allocate(256));
-  EXPECT_THAT(executor->GetPointerMemorySpace(allocation->opaque()),
+  EXPECT_THAT(executor->GetPointerMemorySpace(allocation->address().opaque()),
               absl_testing::IsOkAndHolds(MemorySpace::kUnified));
 }
 
@@ -201,7 +201,7 @@ TEST(CudaExecutorTest, GetPointerMemorySpaceWorksWithHostMemory) {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           executor->HostMemoryAllocate(256));
-  EXPECT_THAT(executor->GetPointerMemorySpace(allocation->opaque()),
+  EXPECT_THAT(executor->GetPointerMemorySpace(allocation->address().opaque()),
               absl_testing::IsOkAndHolds(MemorySpace::kHost));
 }
 

From 49a1442e81898db33c575e115f758b47c5c2bf26 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 16 Dec 2025 23:48:20 -0800
Subject: [PATCH 378/753] Automated Code Change

PiperOrigin-RevId: 845619279
---
 .../xla/tsl/lib/random/philox_random_test.cc  |  6 +-
 .../tsl/lib/random/philox_random_test_utils.h |  2 +-
 .../tsl/lib/random/random_distributions.cc    |  2 +-
 .../xla/tsl/lib/random/random_distributions.h | 88 ++++++++++---------
 .../lib/random/random_distributions_test.cc   | 26 +++---
 .../xla/xla/tsl/lib/random/simple_philox.cc   | 13 +--
 .../xla/xla/tsl/lib/random/simple_philox.h    | 18 ++--
 .../xla/tsl/lib/random/simple_philox_test.cc  | 18 ++--
 .../xla/xla/tsl/lib/random/weighted_picker.cc | 26 +++---
 .../xla/xla/tsl/lib/random/weighted_picker.h  | 12 +--
 .../tsl/lib/random/weighted_picker_test.cc    | 10 +--
 11 files changed, 113 insertions(+), 108 deletions(-)

diff --git a/third_party/xla/xla/tsl/lib/random/philox_random_test.cc b/third_party/xla/xla/tsl/lib/random/philox_random_test.cc
index 3a4cc70d9f6ba8..c25cd3f1bea3dc 100644
--- a/third_party/xla/xla/tsl/lib/random/philox_random_test.cc
+++ b/third_party/xla/xla/tsl/lib/random/philox_random_test.cc
@@ -50,15 +50,15 @@ TEST(PhiloxRandomTest, SkipMatchTest) {
   constexpr int count = 1024;
   constexpr int skip_count = 2048;
 
-  uint64 test_seed = GetTestSeed();
-  std::vector<uint32> v1(count);
+  uint64_t test_seed = GetTestSeed();
+  std::vector<uint32_t> v1(count);
   {
     PhiloxRandom gen(test_seed);
     gen.Skip(skip_count / 4);
     FillRandoms<TrivialPhiloxDistribution>(gen, &v1[0], v1.size());
   }
 
-  std::vector<uint32> v2(count + skip_count);
+  std::vector<uint32_t> v2(count + skip_count);
   {
     PhiloxRandom gen(test_seed);
     FillRandoms<TrivialPhiloxDistribution>(gen, &v2[0], v2.size());
diff --git a/third_party/xla/xla/tsl/lib/random/philox_random_test_utils.h b/third_party/xla/xla/tsl/lib/random/philox_random_test_utils.h
index 3c76e1553774f3..dce28404322b95 100644
--- a/third_party/xla/xla/tsl/lib/random/philox_random_test_utils.h
+++ b/third_party/xla/xla/tsl/lib/random/philox_random_test_utils.h
@@ -26,7 +26,7 @@ namespace tsl {
 namespace random {
 
 // Return a random seed.
-inline uint64 GetTestSeed() { return New64(); }
+inline uint64_t GetTestSeed() { return New64(); }
 
 // A utility function to fill the given array with samples from the given
 // distribution.
diff --git a/third_party/xla/xla/tsl/lib/random/random_distributions.cc b/third_party/xla/xla/tsl/lib/random/random_distributions.cc
index ab8930008f8c8b..46763c7c63196c 100644
--- a/third_party/xla/xla/tsl/lib/random/random_distributions.cc
+++ b/third_party/xla/xla/tsl/lib/random/random_distributions.cc
@@ -19,7 +19,7 @@ limitations under the License.
 namespace tsl {
 namespace random {
 template <>
-void SingleSampleAdapter<PhiloxRandom>::SkipFromGenerator(uint64 num_skips) {
+void SingleSampleAdapter<PhiloxRandom>::SkipFromGenerator(uint64_t num_skips) {
   // Use the O(1) PhiloxRandom::Skip instead of the default O(N) impl.
   generator_->Skip(num_skips);
 }
diff --git a/third_party/xla/xla/tsl/lib/random/random_distributions.h b/third_party/xla/xla/tsl/lib/random/random_distributions.h
index 72ee2ae49aa875..7c77797688276f 100644
--- a/third_party/xla/xla/tsl/lib/random/random_distributions.h
+++ b/third_party/xla/xla/tsl/lib/random/random_distributions.h
@@ -29,9 +29,9 @@ namespace tsl {
 namespace random {
 
 // Helper function to convert a 16-bit integer to a half between [0..1).
-PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x);
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x);
 // Helper function to convert a 16-bit integer to a bfloat16 between [0..1).
-PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
+PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16_t x);
 
 // Computes a + b. Requires that the result is representable in the destination
 // type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
@@ -158,7 +158,7 @@ class UniformDistribution<Generator, double> {
 };
 
 template <class Generator>
-class UniformDistribution<Generator, int32> {
+class UniformDistribution<Generator, int32_t> {
  public:
   // The number of elements that will be returned.
   static constexpr int kResultElementCount = Generator::kResultElementCount;
@@ -167,12 +167,13 @@ class UniformDistribution<Generator, int32> {
   // Indicate that this distribution may take variable number of samples
   // during the runtime.
   static constexpr bool kVariableSamplesPerOutput = false;
-  typedef Array<int32, kResultElementCount> ResultType;
-  typedef int32 ResultElementType;
+  typedef Array<int32_t, kResultElementCount> ResultType;
+  typedef int32_t ResultElementType;
 
   // Must have lo < hi
   UniformDistribution(int32_t lo, int32_t hi)
-      : lo_(lo), range_(static_cast<uint32>(hi) - static_cast<uint32>(lo)) {}
+      : lo_(lo),
+        range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
@@ -188,8 +189,8 @@ class UniformDistribution<Generator, int32> {
   // Note that lo_ is intentionally signed while range_ is intentionally
   // unsigned.  This is because hi - lo can overflow signed integers if
   // lo < 0 < hi, but always fits in unsigned.
-  int32 lo_;
-  uint32 range_;
+  int32_t lo_;
+  uint32_t range_;
 };
 
 template <class Generator>
@@ -207,14 +208,16 @@ class UniformDistribution<Generator, int64_t> {
 
   // Must have lo < hi
   UniformDistribution(int64_t lo, int64_t hi)
-      : lo_(lo), range_(static_cast<uint64>(hi) - static_cast<uint64>(lo)) {}
+      : lo_(lo),
+        range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo)) {}
 
   PHILOX_DEVICE_INLINE
   ResultType operator()(Generator* gen) {
     typename Generator::ResultType sample = (*gen)();
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
-      auto bits = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
+      auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1])
+                                      << 32;
       result[i] = SignedAdd(lo_, bits % range_);
     }
     return result;
@@ -225,7 +228,7 @@ class UniformDistribution<Generator, int64_t> {
   // unsigned.  This is because hi - lo can overflow signed integers if
   // lo < 0 < hi, but always fits in unsigned.
   int64_t lo_;
-  uint64 range_;
+  uint64_t range_;
 };
 
 // Similar to `UniformDistribution`, except that instead of generating numbers
@@ -276,24 +279,25 @@ class UniformFullIntDistribution64 {
     typename Generator::ResultType sample = (*gen)();
     ResultType result;
     for (int i = 0; i < kResultElementCount; ++i) {
-      result[i] = sample[2 * i] | static_cast<uint64>(sample[2 * i + 1]) << 32;
+      result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1])
+                                      << 32;
     }
     return result;
   }
 };
 
 template <typename Generator>
-class UniformFullIntDistribution<Generator, int32>
-    : public UniformFullIntDistribution32<Generator, int32> {};
+class UniformFullIntDistribution<Generator, int32_t>
+    : public UniformFullIntDistribution32<Generator, int32_t> {};
 template <typename Generator>
-class UniformFullIntDistribution<Generator, uint32>
-    : public UniformFullIntDistribution32<Generator, uint32> {};
+class UniformFullIntDistribution<Generator, uint32_t>
+    : public UniformFullIntDistribution32<Generator, uint32_t> {};
 template <typename Generator>
 class UniformFullIntDistribution<Generator, int64_t>
     : public UniformFullIntDistribution64<Generator, int64_t> {};
 template <typename Generator>
-class UniformFullIntDistribution<Generator, uint64>
-    : public UniformFullIntDistribution64<Generator, uint64> {};
+class UniformFullIntDistribution<Generator, uint64_t>
+    : public UniformFullIntDistribution64<Generator, uint64_t> {};
 
 // A class that adapts the underlying native multiple samples to return a single
 // sample at a time.
@@ -322,7 +326,7 @@ class SingleSampleAdapter {
   }
 
   PHILOX_DEVICE_INLINE
-  void Skip(uint64 num_skips) {
+  void Skip(uint64_t num_skips) {
     if (!num_skips) {
       return;
     }
@@ -346,7 +350,7 @@ class SingleSampleAdapter {
   // from `generator_`. There is an O(1) implementation for PhiloxRandom
   // in random_distributions.cc.
   PHILOX_DEVICE_INLINE
-  void SkipFromGenerator(uint64 num_skips) {
+  void SkipFromGenerator(uint64_t num_skips) {
     while (num_skips--) {
       (*generator_)();
     }
@@ -372,8 +376,8 @@ template <class Generator, typename RealType>
 class NormalDistribution;
 
 PHILOX_DEVICE_INLINE
-void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
-                     double* d1);
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+                     double* d0, double* d1);
 
 // Exactly like the float version, except that we convert to half afterwards;
 // since we don't have half-precision sin/cos even on GPUs, there's nothing to
@@ -527,8 +531,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half> {
       // Repeatedly take samples from the normal distribution, until we have
       // the desired number of elements that fall within the pre-defined cutoff
       // threshold.
-      const uint32 x0 = (*gen)();
-      const uint32 x1 = (*gen)();
+      const uint32_t x0 = (*gen)();
+      const uint32_t x1 = (*gen)();
       float f[2];
       BoxMullerFloat(x0, x1, &f[0], &f[1]);
 
@@ -573,8 +577,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, bfloat16> {
       // Repeatedly take samples from the normal distribution, until we have
       // the desired number of elements that fall within the pre-defined cutoff
       // threshold.
-      const uint32 x0 = (*gen)();
-      const uint32 x1 = (*gen)();
+      const uint32_t x0 = (*gen)();
+      const uint32_t x1 = (*gen)();
       float f[2];
       BoxMullerFloat(x0, x1, &f[0], &f[1]);
 
@@ -620,8 +624,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, float> {
       // Repeatedly take samples from the normal distribution, until we have
       // the desired number of elements that fall within the pre-defined cutoff
       // threshold.
-      const uint32 x0 = (*gen)();
-      const uint32 x1 = (*gen)();
+      const uint32_t x0 = (*gen)();
+      const uint32_t x1 = (*gen)();
       float f[2];
       BoxMullerFloat(x0, x1, &f[0], &f[1]);
 
@@ -664,10 +668,10 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double> {
     ResultType results;
     int index = 0;
     while (true) {
-      const uint32 x0 = (*gen)();
-      const uint32 x1 = (*gen)();
-      const uint32 x2 = (*gen)();
-      const uint32 x3 = (*gen)();
+      const uint32_t x0 = (*gen)();
+      const uint32_t x1 = (*gen)();
+      const uint32_t x2 = (*gen)();
+      const uint32_t x3 = (*gen)();
       double d[2];
       BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
 
@@ -690,8 +694,8 @@ class TruncatedNormalDistribution<SingleSampleGenerator, double> {
 // Helper function to convert four 32-bit uniform integers to two doubles
 // under the unit normal distribution.
 PHILOX_DEVICE_INLINE
-void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
-                     double* d1) {
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+                     double* d0, double* d1) {
   // This function implements the Box-Muller transform:
   // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
   // Do not send a really small number to log().
@@ -714,16 +718,16 @@ void BoxMullerDouble(uint32 x0, uint32 x1, uint32 x2, uint32 x3, double* d0,
 }
 
 // Helper function to convert an 16-bit integer to a half between [0..1).
-PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x) {
   // IEEE754 halfs are formatted as follows (MSB first):
   //    sign(1) exponent(5) mantissa(10)
   // Conceptually construct the following:
   //    sign == 0
   //    exponent == 15  -- an excess 15 representation of a zero exponent
   //    mantissa == 10 random bits
-  const uint16 man = x & 0x3ffu;  // 10 bit mantissa
-  const uint16 exp = static_cast<uint16>(15);
-  const uint16 val = (exp << 10) | man;
+  const uint16_t man = x & 0x3ffu;  // 10 bit mantissa
+  const uint16_t exp = static_cast<uint16_t>(15);
+  const uint16_t val = (exp << 10) | man;
 
   Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(val);
   return result - Eigen::half(1.0);
@@ -731,16 +735,16 @@ PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16 x) {
 
 // Helper function to convert an 16-bit integer to a bfloat16 between [0..1).
 // This can create a uniform distribution of values between [0..1).
-PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x) {
+PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16_t x) {
   // bfloat are formatted as follows (MSB first):
   //    sign(1) exponent(8) mantissa(7)
   // Conceptually construct the following:
   //    sign == 0
   //    exponent == 127  -- an excess 127 representation of a zero exponent
   //    mantissa == 7 random bits
-  const uint16 man = x & 0x7fu;  // 7 bit mantissa
-  const uint16 exp = static_cast<uint16>(127);
-  const uint16 val = (exp << 7) | man;
+  const uint16_t man = x & 0x7fu;  // 7 bit mantissa
+  const uint16_t exp = static_cast<uint16_t>(127);
+  const uint16_t val = (exp << 7) | man;
 
   bfloat16 result;
   memcpy(&result, &val, sizeof(val));
diff --git a/third_party/xla/xla/tsl/lib/random/random_distributions_test.cc b/third_party/xla/xla/tsl/lib/random/random_distributions_test.cc
index 5203daac1a04c0..4b69232418e05b 100644
--- a/third_party/xla/xla/tsl/lib/random/random_distributions_test.cc
+++ b/third_party/xla/xla/tsl/lib/random/random_distributions_test.cc
@@ -151,7 +151,7 @@ void UniformMomentsTest(int count, int max_moments,
   auto uniform_moments = [](int n) -> double { return 1. / (n + 1); };
 
   std::vector<T> v1(count);
-  uint64 seed = GetTestSeed();
+  uint64_t seed = GetTestSeed();
   PhiloxRandom gen(seed);
   FillRandoms<UniformDistribution<PhiloxRandom, T> >(gen, &v1[0], v1.size());
   for (int stride : strides) {
@@ -181,7 +181,7 @@ void NormalMomentsTest(int count, int max_moments,
   };
 
   std::vector<T> v1(count);
-  uint64 seed = GetTestSeed();
+  uint64_t seed = GetTestSeed();
   PhiloxRandom gen(seed);
   FillRandoms<NormalDistribution<PhiloxRandom, T> >(gen, &v1[0], v1.size());
 
@@ -241,7 +241,7 @@ template <typename T>
 void RandomParametersMomentsTest(int count, int max_moments,
                                  const std::vector<int>& strides, T z_limit) {
   std::vector<T> v1(count);
-  uint64 seed = GetTestSeed();
+  uint64_t seed = GetTestSeed();
   PhiloxRandom gen(seed);
   FillRandomsWithSingles<
       TruncatedNormalDistribution<SingleSampleAdapter<PhiloxRandom>, T> >(
@@ -302,9 +302,9 @@ TEST(PhiloxRandomTest, RandomParametersDoubleMomentsTest) {
 
 class MockGenerator {
  public:
-  explicit MockGenerator(uint64 seed) : counter_(seed) {}
-  using ResultType = std::vector<uint32>;
-  using ResultElementType = uint32;
+  explicit MockGenerator(uint64_t seed) : counter_(seed) {}
+  using ResultType = std::vector<uint32_t>;
+  using ResultElementType = uint32_t;
   static constexpr int kResultElementCount = 1;
   ResultType operator()() {
     ResultType result;
@@ -313,20 +313,20 @@ class MockGenerator {
   }
 
  private:
-  uint32 counter_;
+  uint32_t counter_;
 };
 
 template <typename T>
 void SingleSampleAdapterSkipTest() {
-  std::vector<uint64> skips(10);
-  std::vector<uint64> skip_afters(10);
+  std::vector<uint64_t> skips(10);
+  std::vector<uint64_t> skip_afters(10);
   absl::c_iota(skips, 0);
   absl::c_iota(skip_afters, 0);
-  uint64 total_samples = 100;
-  uint64 seed = GetTestSeed();
+  uint64_t total_samples = 100;
+  uint64_t seed = GetTestSeed();
 
-  for (uint64 skip : skips) {
-    for (uint64 skip_after : skip_afters) {
+  for (uint64_t skip : skips) {
+    for (uint64_t skip_after : skip_afters) {
       // Baseline rngs.
       T parent_gen(seed);
       SingleSampleAdapter<T> gen(&parent_gen);
diff --git a/third_party/xla/xla/tsl/lib/random/simple_philox.cc b/third_party/xla/xla/tsl/lib/random/simple_philox.cc
index 8b3481ac7c4f39..81c553a7d0fcef 100644
--- a/third_party/xla/xla/tsl/lib/random/simple_philox.cc
+++ b/third_party/xla/xla/tsl/lib/random/simple_philox.cc
@@ -21,18 +21,19 @@ limitations under the License.
 namespace tsl {
 namespace random {
 
-uint32 SimplePhilox::Uniform(uint32 n) {
-  return ExactUniformInt<uint32>(n, [this]() { return Rand32(); });
+uint32_t SimplePhilox::Uniform(uint32_t n) {
+  return ExactUniformInt<uint32_t>(n, [this]() { return Rand32(); });
 }
 
-uint64 SimplePhilox::Uniform64(uint64 n) {
-  return ExactUniformInt<uint64>(n, [this]() { return Rand64(); });
+uint64_t SimplePhilox::Uniform64(uint64_t n) {
+  return ExactUniformInt<uint64_t>(n, [this]() { return Rand64(); });
 }
 
-uint32 SimplePhilox::Skewed(int max_log) {
+uint32_t SimplePhilox::Skewed(int max_log) {
   CHECK(0 <= max_log && max_log <= 32);
   const int shift = Rand32() % (max_log + 1);
-  const uint32 mask = shift == 32 ? ~static_cast<uint32>(0) : (1 << shift) - 1;
+  const uint32_t mask =
+      shift == 32 ? ~static_cast<uint32_t>(0) : (1 << shift) - 1;
   return Rand32() & mask;
 }
 
diff --git a/third_party/xla/xla/tsl/lib/random/simple_philox.h b/third_party/xla/xla/tsl/lib/random/simple_philox.h
index 736bec4d84d238..c8b7005968b0b8 100644
--- a/third_party/xla/xla/tsl/lib/random/simple_philox.h
+++ b/third_party/xla/xla/tsl/lib/random/simple_philox.h
@@ -34,12 +34,12 @@ class SimplePhilox {
   explicit SimplePhilox(PhiloxRandom* gen) : single_(gen) {}
 
   // 32 random bits
-  PHILOX_DEVICE_INLINE uint32 Rand32() { return single_(); }
+  PHILOX_DEVICE_INLINE uint32_t Rand32() { return single_(); }
 
   // 64 random bits
-  PHILOX_DEVICE_INLINE uint64 Rand64() {
-    const uint32 lo = single_(), hi = single_();
-    return lo | static_cast<uint64>(hi) << 32;
+  PHILOX_DEVICE_INLINE uint64_t Rand64() {
+    const uint32_t lo = single_(), hi = single_();
+    return lo | static_cast<uint64_t>(hi) << 32;
   }
 
   // Uniform float in [0, 1)
@@ -47,25 +47,25 @@ class SimplePhilox {
 
   // Uniform double in [0, 1)
   PHILOX_DEVICE_INLINE double RandDouble() {
-    const uint32 x0 = single_(), x1 = single_();
+    const uint32_t x0 = single_(), x1 = single_();
     return Uint64ToDouble(x0, x1);
   }
 
   // Uniform integer in [0, n).
   // Uses rejection sampling, so may need more than one 32-bit sample.
-  uint32 Uniform(uint32 n);
+  uint32_t Uniform(uint32_t n);
 
   // Approximately uniform integer in [0, n).
   // Uses rejection sampling, so may need more than one 64-bit sample.
-  uint64 Uniform64(uint64 n);
+  uint64_t Uniform64(uint64_t n);
 
   // True with probability 1/n.
-  bool OneIn(uint32 n) { return Uniform(n) == 0; }
+  bool OneIn(uint32_t n) { return Uniform(n) == 0; }
 
   // Skewed: pick "base" uniformly from range [0,max_log] and then
   // return "base" random bits.  The effect is to pick a number in the
   // range [0,2^max_log-1] with bias towards smaller numbers.
-  uint32 Skewed(int max_log);
+  uint32_t Skewed(int max_log);
 
  private:
   SingleSampleAdapter<PhiloxRandom> single_;
diff --git a/third_party/xla/xla/tsl/lib/random/simple_philox_test.cc b/third_party/xla/xla/tsl/lib/random/simple_philox_test.cc
index 7a20dbeccf56c0..4433351c295d6f 100644
--- a/third_party/xla/xla/tsl/lib/random/simple_philox_test.cc
+++ b/third_party/xla/xla/tsl/lib/random/simple_philox_test.cc
@@ -76,10 +76,10 @@ TEST(SimplePhiloxTest, Regression_CloseSeedsAreDifferent) {
   PhiloxRandom philox1(0, 1), philox2(1, 1);
   SimplePhilox gen1(&philox1), gen2(&philox2);
 
-  std::set<uint32> first;
-  std::set<uint32> all;
+  std::set<uint32_t> first;
+  std::set<uint32_t> all;
   for (int i = 0; i < kCount; ++i) {
-    uint32 v = gen1.Rand32();
+    uint32_t v = gen1.Rand32();
     first.insert(v);
     all.insert(v);
     all.insert(gen2.Rand32());
@@ -96,13 +96,13 @@ TEST(SimplePhiloxTest, TestUniform) {
   PhiloxRandom philox(17, 17);
   SimplePhilox gen(&philox);
 
-  uint32 range = 3 * (1L << 29);
-  uint32 threshold = 1L << 30;
+  uint32_t range = 3 * (1L << 29);
+  uint32_t threshold = 1L << 30;
 
   size_t count = 0;
   static const int kTrials = 100000;
   for (int i = 0; i < kTrials; ++i) {
-    uint32 rnd = gen.Uniform(range);
+    uint32_t rnd = gen.Uniform(range);
     if (rnd < threshold) {
       ++count;
     }
@@ -115,13 +115,13 @@ TEST(SimplePhiloxTest, TestUniform64) {
   PhiloxRandom philox(17, 17);
   SimplePhilox gen(&philox);
 
-  uint64 range = 3 * (1LL << 59);
-  uint64 threshold = 1LL << 60;
+  uint64_t range = 3 * (1LL << 59);
+  uint64_t threshold = 1LL << 60;
 
   size_t count = 0;
   static const int kTrials = 100000;
   for (int i = 0; i < kTrials; ++i) {
-    uint64 rnd = gen.Uniform64(range);
+    uint64_t rnd = gen.Uniform64(range);
     if (rnd < threshold) {
       ++count;
     }
diff --git a/third_party/xla/xla/tsl/lib/random/weighted_picker.cc b/third_party/xla/xla/tsl/lib/random/weighted_picker.cc
index 911f0f4d300616..8dc3edd659c97b 100644
--- a/third_party/xla/xla/tsl/lib/random/weighted_picker.cc
+++ b/third_party/xla/xla/tsl/lib/random/weighted_picker.cc
@@ -35,9 +35,9 @@ WeightedPicker::WeightedPicker(int N) {
   }
 
   // Initialize the levels
-  level_ = new int32*[num_levels_];
+  level_ = new int32_t*[num_levels_];
   for (int l = 0; l < num_levels_; l++) {
-    level_[l] = new int32[LevelSize(l)];
+    level_[l] = new int32_t[LevelSize(l)];
   }
 
   SetAllWeights(1);
@@ -50,9 +50,9 @@ WeightedPicker::~WeightedPicker() {
   delete[] level_;
 }
 
-static int32 UnbiasedUniform(SimplePhilox* r, int32_t n) {
+static int32_t UnbiasedUniform(SimplePhilox* r, int32_t n) {
   CHECK_LE(0, n);
-  const uint32 range = ~static_cast<uint32>(0);
+  const uint32_t range = ~static_cast<uint32_t>(0);
   if (n == 0) {
     return r->Rand32() * n;
   } else if (0 == (n & (n - 1))) {
@@ -64,8 +64,8 @@ static int32 UnbiasedUniform(SimplePhilox* r, int32_t n) {
     // Rand32's output is uniform in the half-open interval [0, 2^{32}).
     // For any interval [m,n), the number of elements in it is n-m.
 
-    uint32 rem = (range % n) + 1;
-    uint32 rnd;
+    uint32_t rem = (range % n) + 1;
+    uint32_t rnd;
 
     // rem = ((2^{32}-1) \bmod n) + 1
     // 1 <= rem <= n
@@ -145,7 +145,7 @@ void WeightedPicker::set_weight(int index, int32_t weight) {
 
 void WeightedPicker::SetAllWeights(int32_t weight) {
   // Initialize leaves
-  int32* leaves = level_[num_levels_ - 1];
+  int32_t* leaves = level_[num_levels_ - 1];
   for (int i = 0; i < N_; i++) leaves[i] = weight;
   for (int i = N_; i < LevelSize(num_levels_ - 1); i++) leaves[i] = 0;
 
@@ -153,11 +153,11 @@ void WeightedPicker::SetAllWeights(int32_t weight) {
   RebuildTreeWeights();
 }
 
-void WeightedPicker::SetWeightsFromArray(int N, const int32* weights) {
+void WeightedPicker::SetWeightsFromArray(int N, const int32_t* weights) {
   Resize(N);
 
   // Initialize leaves
-  int32* leaves = level_[num_levels_ - 1];
+  int32_t* leaves = level_[num_levels_ - 1];
   for (int i = 0; i < N_; i++) leaves[i] = weights[i];
   for (int i = N_; i < LevelSize(num_levels_ - 1); i++) leaves[i] = 0;
 
@@ -167,8 +167,8 @@ void WeightedPicker::SetWeightsFromArray(int N, const int32* weights) {
 
 void WeightedPicker::RebuildTreeWeights() {
   for (int l = num_levels_ - 2; l >= 0; l--) {
-    int32* level = level_[l];
-    int32* children = level_[l + 1];
+    int32_t* level = level_[l];
+    int32_t* children = level_[l + 1];
     for (int i = 0; i < LevelSize(l); i++) {
       level[i] = children[2 * i] + children[2 * i + 1];
     }
@@ -202,8 +202,8 @@ void WeightedPicker::Resize(int new_size) {
   // O(N) regardless.
   assert(new_size > N_);
   WeightedPicker new_picker(new_size);
-  int32* dst = new_picker.level_[new_picker.num_levels_ - 1];
-  int32* src = this->level_[this->num_levels_ - 1];
+  int32_t* dst = new_picker.level_[new_picker.num_levels_ - 1];
+  int32_t* src = this->level_[this->num_levels_ - 1];
   memcpy(dst, src, sizeof(dst[0]) * N_);
   memset(dst + N_, 0, sizeof(dst[0]) * (new_size - N_));
   new_picker.RebuildTreeWeights();
diff --git a/third_party/xla/xla/tsl/lib/random/weighted_picker.h b/third_party/xla/xla/tsl/lib/random/weighted_picker.h
index 1300fba858d881..e2060f35b43eac 100644
--- a/third_party/xla/xla/tsl/lib/random/weighted_picker.h
+++ b/third_party/xla/xla/tsl/lib/random/weighted_picker.h
@@ -58,7 +58,7 @@ class WeightedPicker {
 
   // Get the weight associated with an element
   // REQUIRES 0 <= index < N
-  int32 get_weight(int index) const;
+  int32_t get_weight(int index) const;
 
   // Set the weight associated with an element
   // REQUIRES weight >= 0.0f
@@ -66,7 +66,7 @@ class WeightedPicker {
   void set_weight(int index, int32_t weight);
 
   // Get the total combined weight of all elements
-  int32 total_weight() const;
+  int32_t total_weight() const;
 
   // Get the number of elements in the picker
   int num_elements() const;
@@ -78,7 +78,7 @@ class WeightedPicker {
   // sets the weight of each element i to weight[i].
   // The sum of the weights should not exceed 2^31 - 2
   // Complexity O(N).
-  void SetWeightsFromArray(int N, const int32* weights);
+  void SetWeightsFromArray(int N, const int32_t* weights);
 
   // REQUIRES   N >= 0
   //
@@ -106,7 +106,7 @@ class WeightedPicker {
   // the sum of the weights of its children.
   int N_;           // Number of elements
   int num_levels_;  // Number of levels in tree (level-0 is root)
-  int32** level_;   // Array that holds nodes per level
+  int32_t** level_;  // Array that holds nodes per level
 
   // Size of each level
   static int LevelSize(int level) { return 1 << level; }
@@ -118,13 +118,13 @@ class WeightedPicker {
   void operator=(const WeightedPicker&) = delete;
 };
 
-inline int32 WeightedPicker::get_weight(int index) const {
+inline int32_t WeightedPicker::get_weight(int index) const {
   DCHECK_GE(index, 0);
   DCHECK_LT(index, N_);
   return level_[num_levels_ - 1][index];
 }
 
-inline int32 WeightedPicker::total_weight() const { return level_[0][0]; }
+inline int32_t WeightedPicker::total_weight() const { return level_[0][0]; }
 
 inline int WeightedPicker::num_elements() const { return N_; }
 
diff --git a/third_party/xla/xla/tsl/lib/random/weighted_picker_test.cc b/third_party/xla/xla/tsl/lib/random/weighted_picker_test.cc
index c4ae1bb4a1b036..34b004ece20ec6 100644
--- a/third_party/xla/xla/tsl/lib/random/weighted_picker_test.cc
+++ b/third_party/xla/xla/tsl/lib/random/weighted_picker_test.cc
@@ -32,7 +32,7 @@ namespace random {
 static void TestPicker(SimplePhilox* rnd, int size);
 static void CheckUniform(SimplePhilox* rnd, WeightedPicker* picker, int trials);
 static void CheckSkewed(SimplePhilox* rnd, WeightedPicker* picker, int trials);
-static void TestPickAt(int items, const int32* weights);
+static void TestPickAt(int items, const int32_t* weights);
 
 TEST(WeightedPicker, Simple) {
   PhiloxRandom philox(testing::RandomSeed(), 17);
@@ -101,7 +101,7 @@ TEST(WeightedPicker, BigWeights) {
 
 TEST(WeightedPicker, Deterministic) {
   VLOG(0) << "======= Testing deterministic pick";
-  static const int32 weights[] = {1, 0, 200, 5, 42};
+  static const int32_t weights[] = {1, 0, 200, 5, 42};
   TestPickAt(TF_ARRAYSIZE(weights), weights);
 }
 
@@ -130,7 +130,7 @@ static void TestPicker(SimplePhilox* rnd, int size) {
   }
 
   // Create zero weights array
-  std::vector<int32> weights(size);
+  std::vector<int32_t> weights(size);
   for (int elem = 0; elem < size; elem++) {
     weights[elem] = 0;
   }
@@ -221,7 +221,7 @@ static void CheckSkewed(SimplePhilox* rnd, WeightedPicker* picker, int trials) {
   delete[] count;
 }
 
-static void TestPickAt(int items, const int32* weights) {
+static void TestPickAt(int items, const int32_t* weights) {
   WeightedPicker picker(items);
   picker.SetWeightsFromArray(items, weights);
   int weight_index = 0;
@@ -245,7 +245,7 @@ BENCHMARK(BM_Create)->Range(1, 1024);
 
 static void BM_CreateAndSetWeights(::testing::benchmark::State& state) {
   int arg = state.range(0);
-  std::vector<int32> weights(arg);
+  std::vector<int32_t> weights(arg);
   for (int i = 0; i < arg; i++) {
     weights[i] = i * 10;
   }

From 37b3bb1e3dc5eb0fbc63492deef815bea4082cdf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:03:51 -0800
Subject: [PATCH 379/753] Automated Code Change

PiperOrigin-RevId: 845624259
---
 tensorflow/compiler/mlir/tensorflow/BUILD                | 9 +++++++++
 .../compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc   | 2 ++
 tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc      | 1 +
 tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc    | 1 +
 .../compiler/mlir/tensorflow/ir/tf_op_interfaces.cc      | 2 ++
 tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc         | 8 +-------
 tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc     | 1 +
 .../compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc  | 4 ++++
 tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc     | 2 ++
 .../compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc  | 2 ++
 .../compiler/mlir/tensorflow/ir/tf_remaining_ops.cc      | 8 ++------
 tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc | 3 +++
 .../compiler/mlir/tensorflow/ir/tf_saved_model_test.cc   | 1 +
 tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc   | 2 ++
 .../mlir/tensorflow/ir/tpu_embedding_ops_registry.cc     | 2 --
 15 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 094070fa86a602..588bb1caa75e8b 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -298,9 +298,11 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_canonicalize_inc_gen",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:CallOpInterfaces",
@@ -350,6 +352,8 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_canonicalize_inc_gen",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:BytecodeOpInterface",
         "@llvm-project//mlir:CallOpInterfaces",
@@ -399,6 +403,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/common_runtime:inline_function_utils",
         "//tensorflow/core/common_runtime:lower_function_call_inline_policy",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:BytecodeOpInterface",
@@ -502,6 +507,9 @@ cc_library(
         "//tensorflow/core/ir:Dialect",
         "//tensorflow/core/ir/types:Dialect",
         "//tensorflow/core/platform:logging",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithDialect",
@@ -535,6 +543,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/platform:test",
+        "@com_google_googletest//:gtest_main",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc
index 9a78a1a83ae214..a41e81b0bda21a 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h"
 
+#include <cstdint>
+
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/IR/Attributes.h"  // from @llvm-project
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
index 19a988827bdf42..f6ce8d327a8874 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <optional>
 #include <utility>
 
+#include "absl/log/check.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc
index e8d0ea525943fd..db85471f6ed6aa 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 
 #include <algorithm>
+#include <cstdint>
 #include <iterator>
 
 #include "llvm/ADT/ArrayRef.h"
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc
index 2cc385794122a2..60a3ea3abdc10c 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h"
 
+#include <cstdint>
+
 namespace mlir {
 namespace TF {
 
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc
index f432b6b1f612f8..160413009efb3a 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc
@@ -15,16 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 
-#include <algorithm>
-#include <cstdint>
 #include <functional>
-#include <limits>
-#include <numeric>
-#include <string>
-#include <tuple>
-#include <type_traits>
 #include <utility>
 
+#include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
index 6382f325a47505..a0fefadca96559 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
@@ -29,6 +29,7 @@ limitations under the License.
 
 #include "absl/log/check.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc
index b3ce501c1c08d1..02105ad8cfc210 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.cc
@@ -15,6 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h"
 
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 
 namespace mlir {
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
index 23683673fe189a..1d9a4fecfab4cf 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
+#include "absl/algorithm/container.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -90,6 +91,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/side_effect_analysis_util.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace mlir {
 namespace TF {
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc
index ca8f27a1489c06..0b13f1791c7717 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_tensor_helper.h"
 
+#include <cstdint>
+
 #include "mlir/Dialect/Traits.h"  // from @llvm-project
 #include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/Matchers.h"  // from @llvm-project
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc
index 7419149074fb8a..1764aa1124059c 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc
@@ -15,16 +15,12 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h"
 
-#include <algorithm>
 #include <cstdint>
-#include <functional>
-#include <limits>
-#include <numeric>
 #include <optional>
 #include <string>
-#include <tuple>
-#include <type_traits>
 
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
index 45717471e373a2..74af6e58fb2294 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc
@@ -16,7 +16,10 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 
 #include <algorithm>
+#include <cassert>
 
+#include "absl/algorithm/container.h"
+#include "absl/log/log.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc
index 48cfb26d6802b9..1c53c8296a4a17 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
 
+#include <gmock/gmock.h>
 #include "mlir/Dialect/Func/IR/FuncOps.h"  // from @llvm-project
 #include "mlir/IR/Block.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc
index d6d22098666ffe..adf055365a9c56 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h"
 
+#include <cstdint>
+
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h"
 
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc b/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc
index 5921efa20969b2..93c33e9799a298 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.cc
@@ -15,8 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/ir/tpu_embedding_ops_registry.h"
 
-#include <vector>
-
 namespace mlir {
 namespace TF {
 

From e51b72b3723216fafb36f0eb6d8297314f0bb5da Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:09:08 -0800
Subject: [PATCH 380/753] Automated Code Change

PiperOrigin-RevId: 845626242
---
 third_party/xla/xla/python/ifrt_proxy/client/executable.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/python/ifrt_proxy/client/executable.cc b/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
index 957b9fd5e4841d..9204aa35e25ac9 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/executable.cc
@@ -135,7 +135,8 @@ absl::StatusOr<absl::Cord> ExecuteLoadedHostCallback(
   absl::CordReader reader(operand_buffer);
   for (const auto& spec : xla_host_callback.operands) {
     const int64_t size = xla::ShapeUtil::ByteSizeOf(spec.shape);
-    void* p = tsl::port::AlignedMalloc(size, kAlignment);
+    void* p = tsl::port::AlignedMalloc(
+        size, static_cast<std::align_val_t>(kAlignment));
     CHECK(p != nullptr);
     std::unique_ptr<char, Deleter> buffer(reinterpret_cast<char*>(p));
 
@@ -163,7 +164,8 @@ absl::StatusOr<absl::Cord> ExecuteLoadedHostCallback(
 
   for (const auto& spec : xla_host_callback.results) {
     const int64_t size = xla::ShapeUtil::ByteSizeOf(spec.shape);
-    void* data = tsl::port::AlignedMalloc(size, kAlignment);
+    void* data = tsl::port::AlignedMalloc(
+        size, static_cast<std::align_val_t>(kAlignment));
     CHECK(data != nullptr);
 
     result_ptrs.push_back(data);

From daddf59ea22c309877229e4d39b61be15b1a4307 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:09:09 -0800
Subject: [PATCH 381/753] Automated Code Change

PiperOrigin-RevId: 845626245
---
 third_party/xla/xla/tools/xla_compile_lib.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/tools/xla_compile_lib.cc b/third_party/xla/xla/tools/xla_compile_lib.cc
index 01f7feac6b8f80..60c82a6b173013 100644
--- a/third_party/xla/xla/tools/xla_compile_lib.cc
+++ b/third_party/xla/xla/tools/xla_compile_lib.cc
@@ -112,7 +112,7 @@ static absl::StatusOr<std::string> CompileGpuExecutable(
   TF_ASSIGN_OR_RETURN(stream_executor::StreamExecutor * stream_executor,
                       platform->ExecutorForDevice(0));
   auto allocator =
-      std::make_unique<stream_executor::StreamExecutorMemoryAllocator>(
+      std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
           stream_executor);
   compile_options.device_allocator = allocator.get();
 

From 22f353e125fd0c625dfaaa7ea74b99a0d9cc788f Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 17 Dec 2025 00:09:48 -0800
Subject: [PATCH 382/753] [XLA:GPU] Simplify Copy Fusions.

They don't need a separate reference to BufferAssignment. It is available via
IrEmitterContext.

PiperOrigin-RevId: 845626420
---
 .../xla/xla/backends/gpu/codegen/copy.cc      | 28 +++++++++++--------
 .../xla/xla/backends/gpu/codegen/copy.h       | 13 +++------
 .../xla/xla/backends/gpu/codegen/fusions.cc   |  5 ++--
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/copy.cc b/third_party/xla/xla/backends/gpu/codegen/copy.cc
index 2997e07de0e0ef..54d13fb70e059a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/copy.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/copy.cc
@@ -70,8 +70,9 @@ absl::StatusOr<FusionEmissionResult> MemcpyFusion::Emit(
     const HloInstruction* root = &root_adaptor.instruction();
     const HloInstruction* src_instr =
         fusion.operand(root->operand(0)->parameter_number());
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
-                        buffer_assignment_->GetUniqueSlice(src_instr, {}));
+    TF_ASSIGN_OR_RETURN(
+        BufferAllocation::Slice slice,
+        ir_emitter_context.buffer_assignment().GetUniqueSlice(src_instr, {}));
     src_buffers.push_back(slice);
     src_shapes.push_back(root->operand(0)->shape());
   }
@@ -82,8 +83,10 @@ absl::StatusOr<FusionEmissionResult> MemcpyFusion::Emit(
         if (!subshape.IsArray()) {
           return absl::OkStatus();
         }
-        TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
-                            buffer_assignment_->GetUniqueSlice(&fusion, index));
+        TF_ASSIGN_OR_RETURN(
+            BufferAllocation::Slice slice,
+            ir_emitter_context.buffer_assignment().GetUniqueSlice(&fusion,
+                                                                  index));
         dst_buffers.push_back(slice);
         return absl::OkStatus();
       }));
@@ -122,10 +125,11 @@ absl::StatusOr<FusionEmissionResult> DynamicMemcpyFusion::Emit(
     // implemented: we only support dynamic offsets, no dynamic sizes.
     TF_ASSIGN_OR_RETURN(
         BufferAllocation::Slice input_slice,
-        buffer_assignment_->GetUniqueSlice(
+        ir_emitter_context.buffer_assignment().GetUniqueSlice(
             &SkipOptionalBitcast(root.GetOperand(0)).instruction(), {}));
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dst_slice,
-                        buffer_assignment_->GetUniqueSlice(&fusion, {}));
+    TF_ASSIGN_OR_RETURN(
+        BufferAllocation::Slice dst_slice,
+        ir_emitter_context.buffer_assignment().GetUniqueSlice(&fusion, {}));
     CHECK_EQ(input_slice, dst_slice);
 
     source_operand_index = 1;
@@ -138,10 +142,12 @@ absl::StatusOr<FusionEmissionResult> DynamicMemcpyFusion::Emit(
 
   const auto* src_instr =
       &SkipOptionalBitcast(root.GetOperand(source_operand_index)).instruction();
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice src_buffer,
-                      buffer_assignment_->GetUniqueSlice(src_instr, {}));
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dst_buffer,
-                      buffer_assignment_->GetUniqueSlice(&fusion, {}));
+  TF_ASSIGN_OR_RETURN(
+      BufferAllocation::Slice src_buffer,
+      ir_emitter_context.buffer_assignment().GetUniqueSlice(src_instr, {}));
+  TF_ASSIGN_OR_RETURN(
+      BufferAllocation::Slice dst_buffer,
+      ir_emitter_context.buffer_assignment().GetUniqueSlice(&fusion, {}));
 
   FusionEmissionResult result;
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/copy.h b/third_party/xla/xla/backends/gpu/codegen/copy.h
index abd2e73e222842..7ea70ff9ef3072 100644
--- a/third_party/xla/xla/backends/gpu/codegen/copy.h
+++ b/third_party/xla/xla/backends/gpu/codegen/copy.h
@@ -22,7 +22,6 @@ limitations under the License.
 #include "xla/backends/gpu/codegen/fusion_emitter.h"
 #include "xla/backends/gpu/runtime/copy_thunk.h"
 #include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/hlo_fusion_analysis.h"
 #include "xla/service/gpu/ir_emitter_context.h"
 
@@ -33,9 +32,8 @@ namespace gpu {
 // implemented using `memcpy`s.
 class MemcpyFusion : public FusionInterface {
  public:
-  MemcpyFusion(const HloFusionAnalysis& analysis,
-               const BufferAssignment* buffer_assignment)
-      : analysis_(analysis), buffer_assignment_(buffer_assignment) {}
+  explicit MemcpyFusion(const HloFusionAnalysis& analysis)
+      : analysis_(analysis) {}
 
   absl::StatusOr<FusionEmissionResult> Emit(
       IrEmitterContext& ir_emitter_context,
@@ -43,7 +41,6 @@ class MemcpyFusion : public FusionInterface {
 
  private:
   const HloFusionAnalysis& analysis_;
-  const BufferAssignment* buffer_assignment_;
 };
 
 // Special case of a fusion consisting only of instructions that can be
@@ -52,9 +49,8 @@ class MemcpyFusion : public FusionInterface {
 // (e.g. dynamic-slice in a while loop).
 class DynamicMemcpyFusion : public FusionInterface {
  public:
-  DynamicMemcpyFusion(const HloFusionAnalysis& analysis,
-                      const BufferAssignment* buffer_assignment)
-      : analysis_(analysis), buffer_assignment_(buffer_assignment) {}
+  explicit DynamicMemcpyFusion(const HloFusionAnalysis& analysis)
+      : analysis_(analysis) {}
 
   absl::StatusOr<FusionEmissionResult> Emit(
       IrEmitterContext& ir_emitter_context,
@@ -70,7 +66,6 @@ class DynamicMemcpyFusion : public FusionInterface {
 
  private:
   const HloFusionAnalysis& analysis_;
-  const BufferAssignment* buffer_assignment_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/backends/gpu/codegen/fusions.cc b/third_party/xla/xla/backends/gpu/codegen/fusions.cc
index d8e082f8b0a3a8..d5528bf9eb6d5d 100644
--- a/third_party/xla/xla/backends/gpu/codegen/fusions.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/fusions.cc
@@ -53,8 +53,7 @@ std::optional<std::unique_ptr<FusionInterface>> HloFusionInfo::GetCopyFusion()
       return std::nullopt;
     }
 
-    return std::make_unique<DynamicMemcpyFusion>(analysis(),
-                                                 buffer_assignment_);
+    return std::make_unique<DynamicMemcpyFusion>(analysis());
   }
 
   for (const HloInstructionAdaptor& root_adaptor : analysis().fusion_roots()) {
@@ -67,7 +66,7 @@ std::optional<std::unique_ptr<FusionInterface>> HloFusionInfo::GetCopyFusion()
     }
   }
 
-  return std::make_unique<MemcpyFusion>(analysis(), buffer_assignment_);
+  return std::make_unique<MemcpyFusion>(analysis());
 }
 
 bool HloFusionInfo::CanEmitDynamicUpdateSliceInPlace() const {

From 50fc42bd3f887dbb4859c2fcaed42a1bf3416072 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:26:11 -0800
Subject: [PATCH 383/753] Automated Code Change

PiperOrigin-RevId: 845631520
---
 .../core/data/service/client/data_service_client.cc       | 8 ++++----
 tensorflow/core/data/service/client/data_service_client.h | 2 +-
 .../core/data/service/client/data_service_client_test.cc  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/data/service/client/data_service_client.cc b/tensorflow/core/data/service/client/data_service_client.cc
index 1a79089fbccc0f..e99277b79f8752 100644
--- a/tensorflow/core/data/service/client/data_service_client.cc
+++ b/tensorflow/core/data/service/client/data_service_client.cc
@@ -226,16 +226,16 @@ TraceMeMetadata DataServiceClient::GetTraceMeMetadata() const {
       "num_tasks",
       num_tasks == -1
           ? kTraceInfoUnavailable
-          : strings::Printf("%lld", static_cast<long long>(num_tasks))));
+          : absl::StrFormat("%lld", static_cast<long long>(num_tasks))));
   result.push_back(std::make_pair("job_name", params_.job_name));
   result.push_back(std::make_pair(
       "max_outstanding_requests",
-      strings::Printf(
+      absl::StrFormat(
           "%lld", static_cast<long long>(params_.max_outstanding_requests))));
   if (params_.max_outstanding_requests == model::kAutotune) {
     result.push_back(std::make_pair(
         "autotuned_max_outstanding_requests",
-        strings::Printf("%lld", static_cast<long long>(
+        absl::StrFormat("%lld", static_cast<long long>(
                                     autotuned_max_outstanding_requests))));
   }
   return result;
@@ -295,7 +295,7 @@ void DataServiceClient::TaskThreadManager() TF_LOCKS_EXCLUDED(mu_) {
   auto cleanup =
       gtl::MakeCleanup([] { VLOG(1) << "Task thread manager exiting"; });
   VLOG(1) << "Starting task thread manager";
-  uint64 next_check = Env::Default()->NowMicros();
+  uint64_t next_check = Env::Default()->NowMicros();
   while (true) {
     {
       mutex_lock l(mu_);
diff --git a/tensorflow/core/data/service/client/data_service_client.h b/tensorflow/core/data/service/client/data_service_client.h
index 7c211d5551c46e..ecaecc841573e5 100644
--- a/tensorflow/core/data/service/client/data_service_client.h
+++ b/tensorflow/core/data/service/client/data_service_client.h
@@ -48,7 +48,7 @@ namespace data {
 class DataServiceContext {
  public:
   virtual ~DataServiceContext() = default;
-  virtual std::unique_ptr<Thread> StartThread(const string& name,
+  virtual std::unique_ptr<Thread> StartThread(const std::string& name,
                                               std::function<void()> fn) = 0;
   virtual void RecordBufferEnqueue(const std::vector<Tensor>& element) = 0;
   virtual void RecordBufferDequeue(const std::vector<Tensor>& element) = 0;
diff --git a/tensorflow/core/data/service/client/data_service_client_test.cc b/tensorflow/core/data/service/client/data_service_client_test.cc
index 9af455d11a0201..0baca60d285a8f 100644
--- a/tensorflow/core/data/service/client/data_service_client_test.cc
+++ b/tensorflow/core/data/service/client/data_service_client_test.cc
@@ -82,7 +82,7 @@ class TestDataServiceContext : public DataServiceContext {
   TestDataServiceContext() = default;
   ~TestDataServiceContext() override = default;
 
-  std::unique_ptr<Thread> StartThread(const string& name,
+  std::unique_ptr<Thread> StartThread(const std::string& name,
                                       std::function<void()> fn) override {
     return absl::WrapUnique(
         Env::Default()->StartThread({}, name, std::move(fn)));

From 136038ca4cc2d9745271f40d01366eb7a90eb27f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:29:50 -0800
Subject: [PATCH 384/753] Automated Code Change

PiperOrigin-RevId: 845632463
---
 tensorflow/core/common_runtime/gradients.cc   |   8 +-
 tensorflow/core/common_runtime/gradients.h    |   2 +-
 .../core/common_runtime/graph_constructor.cc  |  68 +++++------
 .../core/common_runtime/graph_constructor.h   |   8 +-
 .../common_runtime/graph_constructor_fuzz.cc  |  26 ++---
 .../common_runtime/graph_constructor_test.cc  | 107 +++++++++---------
 .../common_runtime/graph_execution_state.cc   |  38 +++----
 .../common_runtime/graph_execution_state.h    |  15 +--
 .../core/common_runtime/graph_optimizer.h     |   4 +-
 .../core/common_runtime/graph_runner.cc       |  18 +--
 tensorflow/core/common_runtime/graph_runner.h |   4 +-
 .../core/common_runtime/graph_runner_test.cc  |   8 +-
 tensorflow/core/common_runtime/graph_view.cc  |  30 ++---
 tensorflow/core/common_runtime/graph_view.h   |  26 ++---
 .../hierarchical_tree_broadcaster.cc          |  24 ++--
 .../hierarchical_tree_broadcaster_test.cc     |   8 +-
 .../immutable_executor_state.cc               |  18 +--
 .../common_runtime/immutable_executor_state.h |  18 +--
 .../common_runtime/inline_function_utils.cc   |  82 +++++++-------
 .../common_runtime/inline_function_utils.h    |  15 ++-
 .../inline_function_utils_test.cc             |   5 +-
 .../input_colocation_exemption_registry.cc    |   2 +-
 .../input_colocation_exemption_registry.h     |   8 +-
 .../core/common_runtime/inspecting_placer.cc  |  22 ++--
 .../core/common_runtime/inspecting_placer.h   |   2 +-
 .../core/common_runtime/int32_fulltype.h      |   4 +-
 .../common_runtime/int32_fulltype_test.cc     |   4 +-
 ...lacer_inspection_required_ops_pass_test.cc |   4 +-
 .../kernel_benchmark_testlib.cc               |  25 ++--
 .../common_runtime/kernel_benchmark_testlib.h |  10 +-
 .../core/common_runtime/local_device.cc       |   2 +-
 .../core/common_runtime/lower_case_op.cc      |  18 +--
 .../core/common_runtime/lower_case_op_test.cc |   4 +-
 .../lower_function_call_op_test.cc            |   4 +-
 .../common_runtime/lower_functional_ops.cc    |   2 +-
 .../lower_functional_ops_test.cc              |   4 +-
 tensorflow/core/common_runtime/lower_if_op.cc |   8 +-
 .../core/common_runtime/lower_if_op_test.cc   |   6 +-
 .../core/common_runtime/lower_while_op.cc     |  16 +--
 .../common_runtime/lower_while_op_test.cc     |  15 +--
 .../core/common_runtime/memory_types.cc       |  22 ++--
 tensorflow/core/common_runtime/memory_types.h |   2 +-
 .../core/common_runtime/memory_types_test.cc  |   4 +-
 43 files changed, 369 insertions(+), 351 deletions(-)

diff --git a/tensorflow/core/common_runtime/gradients.cc b/tensorflow/core/common_runtime/gradients.cc
index 466977ecf772d6..ede2bed5eced15 100644
--- a/tensorflow/core/common_runtime/gradients.cc
+++ b/tensorflow/core/common_runtime/gradients.cc
@@ -40,18 +40,18 @@ namespace tensorflow {
 static const char* const kGradientOp = "SymbolicGradient";
 static const char* const kNodeLabel = "Func";
 
-string NodeOut::name() const {
+std::string NodeOut::name() const {
   if (index == 0) {
     return node->name();
   } else {
-    return strings::StrCat(node->name(), ":", index);
+    return absl::StrCat(node->name(), ":", index);
   }
 }
 
 DataType NodeOut::dtype() const { return node->output_type(index); }
 
 struct NodeOutHash {
-  uint64 operator()(const NodeOut& x) const {
+  uint64_t operator()(const NodeOut& x) const {
     return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*),
                   x.index);
   }
@@ -334,7 +334,7 @@ NodeOut SymbolicGradientBuilder::SumGradients(const NodeOut& src) {
   return {add, 0};
 }
 
-static bool IsPrimitiveOpWithNoGrad(const string& func) {
+static bool IsPrimitiveOpWithNoGrad(const std::string& func) {
   gradient::Creator creator;
   absl::Status s = gradient::GetOpGradientCreator(func, &creator);
   return s.ok() && (creator == nullptr);
diff --git a/tensorflow/core/common_runtime/gradients.h b/tensorflow/core/common_runtime/gradients.h
index aaa9cad80ad691..6eb32e450e1dcf 100644
--- a/tensorflow/core/common_runtime/gradients.h
+++ b/tensorflow/core/common_runtime/gradients.h
@@ -28,7 +28,7 @@ struct NodeOut {
   int index;
 
   // Returns the string name that represents the output of this node.
-  string name() const;
+  std::string name() const;
   // Returns the data type of the output of this node.
   DataType dtype() const;
 };
diff --git a/tensorflow/core/common_runtime/graph_constructor.cc b/tensorflow/core/common_runtime/graph_constructor.cc
index cbbbee60ee7c6f..5fb43daa1c0b8d 100644
--- a/tensorflow/core/common_runtime/graph_constructor.cc
+++ b/tensorflow/core/common_runtime/graph_constructor.cc
@@ -137,14 +137,14 @@ class GraphConstructor {
     bool expect_device_spec;
     bool propagate_device_spec;
 
-    string prefix;
+    std::string prefix;
     bool uniquify_names;
     bool uniquify_prefix;
     std::map<TensorId, TensorId> input_map;
     bool skip_mapped_nodes;
-    std::vector<string> control_dependencies;
+    std::vector<std::string> control_dependencies;
     std::vector<TensorId> return_tensors;
-    std::vector<string> return_nodes;
+    std::vector<std::string> return_nodes;
 
     // TODO(ashankar): This bool exists to separate out functionality required
     // to make ImportGraphDef a close equivalent of Python's import_graph_def
@@ -166,7 +166,7 @@ class GraphConstructor {
     // value to the Node when they are missing from the NodeDef.
     bool add_default_attributes = true;
 
-    string default_device;
+    std::string default_device;
   };
 
   typedef absl::Span<const NodeDef* const> NodeDefSlice;
@@ -288,7 +288,7 @@ class GraphConstructor {
 
   // Returns a unique version of `original_name`, or `original_name` if it's
   // already unique in the graph.
-  string FindUniqueName(absl::string_view original_name);
+  std::string FindUniqueName(absl::string_view original_name);
 
   // Decrement pending count for users of `processed` and add the ones that now
   // have all of their pending inputs satisfied to `ready_`.
@@ -321,7 +321,7 @@ class GraphConstructor {
   const VersionDef original_versions_;
 
   // A copy of opts_.prefix, possibly uniquified.
-  string prefix_;
+  std::string prefix_;
 
   StackTracesMap traces_;
 
@@ -364,7 +364,7 @@ class GraphConstructor {
 
   // Imported node names that have been uniquified. The key is the original
   // name, the value is the new unique name.
-  gtl::FlatMap<string, string> uniquified_names_;
+  gtl::FlatMap<std::string, std::string> uniquified_names_;
 
   // Index of NodeDefs in node_defs_ with all inputs already converted. We use a
   // (sorted) set so nodes are created in the order defined in the GraphDef.
@@ -381,10 +381,10 @@ class GraphConstructor {
   // Used in the conversion from node_defs_ to g_ to represent the ith input
   // of a node.
   struct InputInfo {
-    explicit InputInfo(const string& node_name, Node* n, int i)
+    explicit InputInfo(const std::string& node_name, Node* n, int i)
         : name(node_name), node(n), index(i) {}
     // Use string instead of StringPiece so we don't have to manage lifetime
-    string name;
+    std::string name;
     Node* node;
     int index;
 
@@ -402,10 +402,10 @@ class GraphConstructor {
   // Used in the conversion from node_defs_ to g_ to represent an edge from
   // the node named 'name' to node 'n'.
   struct EdgeInfo {
-    explicit EdgeInfo(const string& name, int i1, Node* n, int i2)
+    explicit EdgeInfo(const std::string& name, int i1, Node* n, int i2)
         : src_name(name), src_index(i1), dst_node(n), dst_index(i2) {}
     // Use string instead of StringPiece so we don't have to manage lifetime
-    string src_name;
+    std::string src_name;
     int src_index;
     Node* dst_node;
     int dst_index;
@@ -594,7 +594,7 @@ bool NodeNameInValues(const std::map<TensorId, TensorId>& input_map,
   return false;
 }
 
-bool NodeNameInValues(const std::vector<string>& control_dependencies,
+bool NodeNameInValues(const std::vector<std::string>& control_dependencies,
                       const absl::string_view& node_name) {
   return std::find(control_dependencies.begin(), control_dependencies.end(),
                    node_name) != control_dependencies.end();
@@ -632,7 +632,7 @@ absl::Status GraphConstructor::EnsureNoNameCollisions() {
   }
   if (prefix_.empty() && opts_.importing && !opts_.uniquify_names) {
     for (size_t i = 0; i < node_def_count(); ++i) {
-      const string& name = get_node_def(i).name();
+      const std::string& name = get_node_def(i).name();
       if (NameExistsInGraph(name)) {
         return errors::InvalidArgument("Node name '", name,
                                        "' already exists in the Graph");
@@ -646,7 +646,7 @@ absl::Status GraphConstructor::EnsureNoNameCollisions() {
                                      "' would lead to invalid node names");
     }
     if (NameExistsInGraph(prefix_no_slash) && opts_.uniquify_prefix) {
-      prefix_ = strings::StrCat(FindUniqueName(prefix_no_slash), "/");
+      prefix_ = absl::StrCat(FindUniqueName(prefix_no_slash), "/");
     }
   }
   return absl::OkStatus();
@@ -668,7 +668,7 @@ absl::Status GraphConstructor::ValidateInputMapAndControlDependencies() {
                                      "control edge and non-control edge");
     }
   }
-  for (const string& node : opts_.control_dependencies) {
+  for (const std::string& node : opts_.control_dependencies) {
     if (existing_nodes_.count(node) == 0) {
       return errors::InvalidArgument(
           "node '", node,
@@ -727,7 +727,7 @@ absl::Status GraphConstructor::InitFromEdges() {
   const int num_nodes = node_def_count();
   pending_count_.reserve(num_nodes);
   outputs_.resize(num_nodes);
-  gtl::FlatSet<string> next_iteration_nodes;
+  gtl::FlatSet<std::string> next_iteration_nodes;
   for (int n = 0; n < node_def_count(); ++n) {
     const NodeDef& node_def = get_node_def(n);
     if (IsNextIteration(node_def)) {
@@ -752,7 +752,7 @@ absl::Status GraphConstructor::InitFromEdges() {
           num_control_edges++;
         } else {
           TensorId id(ParseTensorName(input_name));
-          if (next_iteration_nodes.find(string(id.first)) !=
+          if (next_iteration_nodes.find(std::string(id.first)) !=
               next_iteration_nodes.end()) {
             has_loop_back_edge = true;
           }
@@ -796,7 +796,7 @@ absl::Status GraphConstructor::ValidateColocationConstraints(
     return absl::OkStatus();
   const auto iter = node_def.attr().find(kColocationAttrName);
   if (iter == node_def.attr().end()) return absl::OkStatus();
-  for (const string& c : iter->second.list().s()) {
+  for (const std::string& c : iter->second.list().s()) {
     absl::string_view s(c);
     if (absl::ConsumePrefix(&s, kColocationGroupPrefix) &&
         gdef_nodes_.find(s) == gdef_nodes_.end()) {
@@ -957,11 +957,11 @@ void GraphConstructor::AddControlDependencies(
 
   // node_def either has no inputs or all remapped inputs, add the control
   // dependencies
-  for (const string& control_dep : opts_.control_dependencies) {
-    string input = TensorId(control_dep, Graph::kControlSlot).ToString();
+  for (const std::string& control_dep : opts_.control_dependencies) {
+    std::string input = TensorId(control_dep, Graph::kControlSlot).ToString();
     bool found = false;
     for (int i = node_def->input_size() - 1; i >= 0; --i) {
-      const string& node_input = node_def->input(i);
+      const std::string& node_input = node_def->input(i);
       if (node_input[0] != '^') {
         // Control inputs are at the end. Break when we reach the non-control
         // inputs.
@@ -984,7 +984,7 @@ void GraphConstructor::AddControlDependencies(
 void GraphConstructor::AddPrefixToNodeDef(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
   if (prefix_.empty()) return;
-  node_def->set_name(strings::StrCat(prefix_, node_def->name()));
+  node_def->set_name(absl::StrCat(prefix_, node_def->name()));
   // Update names of input nodes
   for (int i = 0; i < node_def->input_size(); ++i) {
     // Skip remapped inputs (which already exist in g_ and are not being
@@ -992,9 +992,9 @@ void GraphConstructor::AddPrefixToNodeDef(
     if (input_already_exists[i]) continue;
     absl::string_view input(node_def->input(i));
     if (absl::ConsumePrefix(&input, "^")) {
-      node_def->set_input(i, strings::StrCat("^", prefix_, input));
+      node_def->set_input(i, absl::StrCat("^", prefix_, input));
     } else {
-      node_def->set_input(i, strings::StrCat(prefix_, input));
+      node_def->set_input(i, absl::StrCat(prefix_, input));
     }
   }
   // Update names of colocation groups
@@ -1004,7 +1004,7 @@ void GraphConstructor::AddPrefixToNodeDef(
     for (int i = 0; i < list->s_size(); ++i) {
       absl::string_view v(list->s(i));
       if (absl::ConsumePrefix(&v, kColocationGroupPrefix)) {
-        list->set_s(i, strings::StrCat(kColocationGroupPrefix, prefix_, v));
+        list->set_s(i, absl::StrCat(kColocationGroupPrefix, prefix_, v));
       }
     }
   }
@@ -1013,7 +1013,7 @@ void GraphConstructor::AddPrefixToNodeDef(
 void GraphConstructor::UniquifyNames(
     const std::vector<bool>& input_already_exists, NodeDef* node_def) {
   if (NameExistsInGraph(node_def->name())) {
-    string old_name = node_def->name();
+    std::string old_name = node_def->name();
     node_def->set_name(FindUniqueName(node_def->name()));
     uniquified_names_[old_name] = node_def->name();
     // Note that we don't have to update gdef_nodes_ or gdef_prefixes_ with
@@ -1028,7 +1028,7 @@ void GraphConstructor::UniquifyNames(
     // We require that UniquifyNames() is called on all NodeDefs in topological
     // order. This guarantees that node_def's inputs will already be uniquified
     // if necessary.
-    auto iter = uniquified_names_.find(string(id.first));
+    auto iter = uniquified_names_.find(std::string(id.first));
     if (iter == uniquified_names_.end()) continue;
     id.first = iter->second;
     node_def->set_input(i, id.ToString());
@@ -1039,18 +1039,18 @@ void GraphConstructor::UpdateUniquifiedColocationNames() {
   for (const auto& pair : gdef_nodes_) {
     Node* node = pair.second.node;
     if (node == nullptr) continue;
-    std::vector<string> coloc_values;
+    std::vector<std::string> coloc_values;
     if (!TryGetNodeAttr(node->attrs(), kColocationAttrName, &coloc_values))
       continue;
     bool updated = false;
     for (size_t i = 0; i < coloc_values.size(); ++i) {
       absl::string_view val(coloc_values[i]);
       if (absl::ConsumePrefix(&val, kColocationGroupPrefix)) {
-        auto name_pair = uniquified_names_.find(string(val));
+        auto name_pair = uniquified_names_.find(std::string(val));
         if (name_pair == uniquified_names_.end()) continue;
         updated = true;
         coloc_values[i] =
-            strings::StrCat(kColocationGroupPrefix, name_pair->second);
+            absl::StrCat(kColocationGroupPrefix, name_pair->second);
       }
     }
     if (updated) {
@@ -1071,13 +1071,13 @@ bool GraphConstructor::NameExistsInGraphDef(absl::string_view name) {
   return false;
 }
 
-string GraphConstructor::FindUniqueName(absl::string_view original_name) {
-  string name(original_name);
+std::string GraphConstructor::FindUniqueName(absl::string_view original_name) {
+  std::string name(original_name);
   int count = 0;
   // Check that any generated names don't collide with imported NodeDefs (as
   // well as nodes in g_).
   while (NameExistsInGraph(name) || (count > 0 && NameExistsInGraphDef(name))) {
-    name = strings::StrCat(original_name, "_", ++count);
+    name = absl::StrCat(original_name, "_", ++count);
   }
   return name;
 }
@@ -1280,7 +1280,7 @@ absl::Status GraphConstructor::Convert() {
         return errors::InvalidArgument(out.str());
       }
 
-      inputs.emplace_back(string(tensor_id.node()), src_node, src_index);
+      inputs.emplace_back(std::string(tensor_id.node()), src_node, src_index);
     }
 
     if (has_data_back_edge && !IsMerge(node_def)) {
diff --git a/tensorflow/core/common_runtime/graph_constructor.h b/tensorflow/core/common_runtime/graph_constructor.h
index d0764903eb0931..e527801ea9f426 100644
--- a/tensorflow/core/common_runtime/graph_constructor.h
+++ b/tensorflow/core/common_runtime/graph_constructor.h
@@ -89,7 +89,7 @@ struct ImportGraphDefOptions {
   // prefix="animals" and GraphDef contains a node "bunny" then the node will be
   // named "animals/bunny" in *g. Must not be already used as a node name or
   // prefix in the graph.
-  string prefix;
+  std::string prefix;
 
   // If true, imported node names will be modified if their name already exists
   // in the graph. If false, conflicting names will be treated as an error. Note
@@ -125,7 +125,7 @@ struct ImportGraphDefOptions {
   // Note that to avoid creating many redundant control edges, ImportGraphDef()
   // won't add control edges to nodes that will inherit the dependencies from
   // other nodes in `gdef`.
-  std::vector<string> control_dependencies;
+  std::vector<std::string> control_dependencies;
 
   // Tensors in `gdef` that will be returned via the ImportGraphDefResults
   // output parameter of `ImportGraphDef()`. If this list is non-empty, the
@@ -151,7 +151,7 @@ struct ImportGraphDefOptions {
   // Unlike `return_tensors`, `input_map` has no effect on the nodes
   // returned. `return_nodes` must be empty if `skip_mapped_nodes` is true.
   // TODO(skyewm): make this work with `skip_mapped_nodes` if there's a need.
-  std::vector<string> return_nodes;
+  std::vector<std::string> return_nodes;
 
   // If true, checks that all colocation constraints are nodes in the GraphDef.
   bool validate_colocation_constraints = true;
@@ -165,7 +165,7 @@ struct ImportGraphDefOptions {
   // python API.
 
   // Try to set default execution device for this grapth.
-  string default_device;
+  std::string default_device;
 
   // If true, propagates a node's assigned device. By default the runtime
   // will recompute the assigned device every time.
diff --git a/tensorflow/core/common_runtime/graph_constructor_fuzz.cc b/tensorflow/core/common_runtime/graph_constructor_fuzz.cc
index fa92230d4dcdc2..df0c63473b849d 100644
--- a/tensorflow/core/common_runtime/graph_constructor_fuzz.cc
+++ b/tensorflow/core/common_runtime/graph_constructor_fuzz.cc
@@ -63,10 +63,10 @@ void FuzzGraphEndToEndSimpleFixedInput(const GraphDef& graph_def) {
   p1.scalar<float>()() = 1.0;
   Tensor p2(DT_FLOAT, TensorShape({1}));
   p2.scalar<float>()() = 2.0;
-  std::vector<std::pair<string, Tensor>> inputs = {{"Placeholder", p1},
-                                                   {"Placeholder_1", p2}};
-  std::vector<string> output_names = {"O_FUZZ"};
-  std::vector<string> target_names;
+  std::vector<std::pair<std::string, Tensor>> inputs = {{"Placeholder", p1},
+                                                        {"Placeholder_1", p2}};
+  std::vector<std::string> output_names = {"O_FUZZ"};
+  std::vector<std::string> target_names;
   std::vector<Tensor> outputs;
   status = sess->Run(inputs, output_names, target_names, &outputs);
 }
@@ -93,22 +93,22 @@ void FuzzGraphEndToEndAllStatic(const GraphDef& graph_def) {
     return;
   }
 
-  std::vector<std::pair<string, Tensor>> inputs = {};
-  std::vector<string> output_names = {};
-  std::vector<string> target_names = {};
+  std::vector<std::pair<std::string, Tensor>> inputs = {};
+  std::vector<std::string> output_names = {};
+  std::vector<std::string> target_names = {};
   std::vector<Tensor> outputs = {};
   status = sess->Run(inputs, output_names, target_names, &outputs);
 }
 FUZZ_TEST(GraphDefFuzz, FuzzGraphEndToEndAllStatic);
 
-Node* FindNode(const string& name, Graph* graph) {
+Node* FindNode(const std::string& name, Graph* graph) {
   for (Node* n : graph->nodes()) {
     if (n->name() == name) return n;
   }
   return nullptr;
 }
 
-bool HasNode(const string& name, Graph* graph) {
+bool HasNode(const std::string& name, Graph* graph) {
   return FindNode(name, graph) != nullptr;
 }
 
@@ -399,10 +399,10 @@ void FuzzGraphEndToEndFDP(std::vector<uint8_t> data) {
     input_tensors.push_back(input_tensor);
   }
 
-  std::vector<std::pair<string, Tensor>> inputs = {{"N0", input_tensors[0]},
-                                                   {"N1", input_tensors[1]}};
-  std::vector<string> output_names = {last_node};
-  std::vector<string> target_names;
+  std::vector<std::pair<std::string, Tensor>> inputs = {
+      {"N0", input_tensors[0]}, {"N1", input_tensors[1]}};
+  std::vector<std::string> output_names = {last_node};
+  std::vector<std::string> target_names;
   std::vector<Tensor> outputs;
   s = sess->Run(inputs, output_names, target_names, &outputs);
   if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/graph_constructor_test.cc b/tensorflow/core/common_runtime/graph_constructor_test.cc
index 9494bf48f9a74f..036ee63a354f89 100644
--- a/tensorflow/core/common_runtime/graph_constructor_test.cc
+++ b/tensorflow/core/common_runtime/graph_constructor_test.cc
@@ -53,22 +53,22 @@ class GraphConstructorTest : public ::testing::Test {
  protected:
   GraphConstructorTest() : graph_(OpRegistry::Global()) {}
 
-  void Convert(const string& gdef_ascii) {
+  void Convert(const std::string& gdef_ascii) {
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef_));
   }
 
-  void ExpectError(const string& gdef_ascii,
-                   const std::vector<string>& expected_error_strs,
-                   string not_expected_error_str = "") {
+  void ExpectError(const std::string& gdef_ascii,
+                   const std::vector<std::string>& expected_error_strs,
+                   std::string not_expected_error_str = "") {
     // Used to verify that errors don't change graph
-    const string original_graph_description = GraphDebugString();
+    const std::string original_graph_description = GraphDebugString();
 
     Convert(gdef_ascii);
     GraphConstructorOptions opts;
     absl::Status status = ConvertGraphDefToGraph(opts, gdef_, &graph_);
     EXPECT_FALSE(status.ok());
 
-    for (const string& error : expected_error_strs) {
+    for (const std::string& error : expected_error_strs) {
       EXPECT_TRUE(absl::StrContains(status.message(), error))
           << "Expected to find '" << error << "' in " << status;
     }
@@ -82,19 +82,20 @@ class GraphConstructorTest : public ::testing::Test {
     EXPECT_EQ(original_graph_description, GraphDebugString());
   }
 
-  void ExpectError(const string& gdef_ascii, const ImportGraphDefOptions& opts,
-                   const std::vector<string>& expected_error_strs,
+  void ExpectError(const std::string& gdef_ascii,
+                   const ImportGraphDefOptions& opts,
+                   const std::vector<std::string>& expected_error_strs,
                    ShapeRefiner* refiner = nullptr,
                    ImportGraphDefResults* results = nullptr) {
     // Used to verify that errors don't change graph
-    const string original_graph_description = GraphDebugString();
+    const std::string original_graph_description = GraphDebugString();
 
     Convert(gdef_ascii);
     absl::Status status =
         ImportGraphDef(opts, gdef_, &graph_, refiner, results);
     EXPECT_FALSE(status.ok());
 
-    for (const string& error : expected_error_strs) {
+    for (const std::string& error : expected_error_strs) {
       EXPECT_TRUE(absl::StrContains(status.message(), error))
           << "Expected to find '" << error << "' in " << status;
     }
@@ -102,13 +103,14 @@ class GraphConstructorTest : public ::testing::Test {
     EXPECT_EQ(original_graph_description, GraphDebugString());
   }
 
-  void ExpectOK(const string& gdef_ascii) {
+  void ExpectOK(const std::string& gdef_ascii) {
     Convert(gdef_ascii);
     GraphConstructorOptions opts;
     TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef_, &graph_));
   }
 
-  void ExpectOK(const string& gdef_ascii, const ImportGraphDefOptions& opts,
+  void ExpectOK(const std::string& gdef_ascii,
+                const ImportGraphDefOptions& opts,
                 ShapeRefiner* refiner = nullptr,
                 ImportGraphDefResults* results = nullptr) {
     Convert(gdef_ascii);
@@ -125,16 +127,17 @@ class GraphConstructorTest : public ::testing::Test {
         << graph_.versions().producer();
   }
 
-  Node* FindNode(const string& name) {
+  Node* FindNode(const std::string& name) {
     for (Node* n : graph_.nodes()) {
       if (n->name() == name) return n;
     }
     return nullptr;
   }
 
-  bool HasNode(const string& name) { return FindNode(name) != nullptr; }
+  bool HasNode(const std::string& name) { return FindNode(name) != nullptr; }
 
-  bool HasEdge(const string& src, int src_out, const string& dst, int dst_in) {
+  bool HasEdge(const std::string& src, int src_out, const std::string& dst,
+               int dst_in) {
     for (const Edge* e : graph_.edges()) {
       if (e->src()->name() == src && e->src_output() == src_out &&
           e->dst()->name() == dst && e->dst_input() == dst_in) {
@@ -144,11 +147,11 @@ class GraphConstructorTest : public ::testing::Test {
     return false;
   }
 
-  bool HasControlEdge(const string& src, const string& dst) {
+  bool HasControlEdge(const std::string& src, const std::string& dst) {
     return HasEdge(src, Graph::kControlSlot, dst, Graph::kControlSlot);
   }
 
-  string ColocationGroup(const string& node) {
+  std::string ColocationGroup(const std::string& node) {
     Node* n = nullptr;
     for (Node* ni : graph_.nodes()) {
       if (ni->name() == node) {
@@ -159,7 +162,7 @@ class GraphConstructorTest : public ::testing::Test {
     if (n == nullptr) {
       return "";
     }
-    std::vector<string> value;
+    std::vector<std::string> value;
     absl::Status s = GetNodeAttr(n->attrs(), kColocationAttrName, &value);
     if (!s.ok()) {
       return "";
@@ -171,10 +174,11 @@ class GraphConstructorTest : public ::testing::Test {
       return "";
     }
     absl::string_view loc(value[0]);
-    return absl::ConsumePrefix(&loc, kColocationGroupPrefix) ? string(loc) : "";
+    return absl::ConsumePrefix(&loc, kColocationGroupPrefix) ? std::string(loc)
+                                                             : "";
   }
 
-  string GraphDebugString() const {
+  std::string GraphDebugString() const {
     return graph_.ToGraphDefDebug().DebugString();
   }
 
@@ -232,7 +236,7 @@ REGISTER_OP("RequiresCurrentGraphVersion")
 
 TEST_F(GraphConstructorTest, InvalidNodeName) {
   auto expect_invalid_name = [this](const char* name) {
-    ExpectError(strings::StrCat("node { name: '", name, "' op: 'ABC' }"),
+    ExpectError(absl::StrCat("node { name: '", name, "' op: 'ABC' }"),
                 {"Node name contains invalid characters"});
   };
 
@@ -504,7 +508,7 @@ TEST_F(GraphConstructorTest, ImportGraphThatUsesConstantValueFromInsideLoop) {
         f.write(str(tf.get_default_graph().as_graph_def()))
 
   */
-  const string pb_ascii = R"EOF(
+  const std::string pb_ascii = R"EOF(
 node {
   name: "Const"
   op: "Const"
@@ -862,7 +866,7 @@ TEST_F(GraphConstructorTest, NoForwardCompatError) {
 }
 
 TEST_F(GraphConstructorTest, LowVersion) {
-  ExpectError(strings::StrCat("versions { producer: ", -1, " }"),
+  ExpectError(absl::StrCat("versions { producer: ", -1, " }"),
               {strings::StrCat("GraphDef producer version -1 below min "
                                "producer ",
                                TF_GRAPH_DEF_VERSION_MIN_PRODUCER,
@@ -872,7 +876,7 @@ TEST_F(GraphConstructorTest, LowVersion) {
 
 TEST_F(GraphConstructorTest, HighVersion) {
   const int version = TF_GRAPH_DEF_VERSION + 1;
-  ExpectError(strings::StrCat("versions { min_consumer: ", version, " }"),
+  ExpectError(absl::StrCat("versions { min_consumer: ", version, " }"),
               {strings::StrCat("GraphDef min consumer version ", version,
                                " above current version ", TF_GRAPH_DEF_VERSION,
                                " for TensorFlow ", TF_VERSION_STRING,
@@ -885,7 +889,7 @@ TEST_F(GraphConstructorTest, BadVersion) {
   ExpectError(
       strings::StrCat("versions { producer: ", version, " bad_consumers: ", bad,
                       " }"),
-      {strings::StrCat(
+      {absl::StrCat(
           "GraphDef disallows consumer version ", bad,
           ".  Please upgrade TensorFlow: this version is likely buggy.")});
 }
@@ -932,8 +936,8 @@ TEST_F(GraphConstructorTest, Error_ControlEdgeBeforeRealInput) {
 TEST_F(GraphConstructorTest, ImportGraphDef) {
   GraphDef def;
   ImportGraphDefOptions opts;
-  const string& source = graph_.FindNodeId(Graph::kSourceId)->name();
-  const string& sink = graph_.FindNodeId(Graph::kSinkId)->name();
+  const std::string& source = graph_.FindNodeId(Graph::kSourceId)->name();
+  const std::string& sink = graph_.FindNodeId(Graph::kSinkId)->name();
 
   // Importing an empty graph is fine.
   absl::Status s = ImportGraphDef(opts, def, &graph_, nullptr);
@@ -2447,8 +2451,8 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) {
   TF_EXPECT_OK(
       NodeDefBuilder("scope/A", "TestParams").Finalize(def.add_node()));
   ImportGraphDefOptions opts;
-  const string& source = graph_.FindNodeId(Graph::kSourceId)->name();
-  const string& sink = graph_.FindNodeId(Graph::kSinkId)->name();
+  const std::string& source = graph_.FindNodeId(Graph::kSourceId)->name();
+  const std::string& sink = graph_.FindNodeId(Graph::kSinkId)->name();
 
   absl::Status s = ImportGraphDef(opts, def, &graph_, nullptr);
   ASSERT_EQ(absl::OkStatus(), s) << s;
@@ -2457,7 +2461,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ErrorsDoNoChangeTheGraph) {
   EXPECT_TRUE(HasControlEdge(source, "scope/A"));
   EXPECT_TRUE(HasControlEdge("scope/A", sink));
   EXPECT_EQ(3, graph_.num_edges());
-  const string original_graph_description = GraphDebugString();
+  const std::string original_graph_description = GraphDebugString();
 
 #define EXPECT_IMPORT_FAILURE(graph_def, options, expected_err)       \
   do {                                                                \
@@ -2663,10 +2667,10 @@ TEST_F(GraphConstructorTest, ImportGraphDef_FunctionDefs) {
   p1.scalar<float>()() = 1.0;
   Tensor p2(DT_FLOAT, TensorShape({1}));
   p2.scalar<float>()() = 2.0;
-  std::vector<std::pair<string, Tensor>> inputs = {{"Placeholder", p1},
-                                                   {"Placeholder_1", p2}};
-  std::vector<string> output_names = {"Foo_d03c39a3"};
-  std::vector<string> target_names;
+  std::vector<std::pair<std::string, Tensor>> inputs = {{"Placeholder", p1},
+                                                        {"Placeholder_1", p2}};
+  std::vector<std::string> output_names = {"Foo_d03c39a3"};
+  std::vector<std::string> target_names;
   std::vector<Tensor> outputs;
   TF_ASSERT_OK(sess->Run(inputs, output_names, target_names, &outputs));
 
@@ -2756,10 +2760,10 @@ TEST_F(GraphConstructorTest, ImportGraphDef_NestedFunctionDefs) {
   p1.scalar<float>()() = 1.0;
   Tensor p2(DT_FLOAT, TensorShape({1}));
   p2.scalar<float>()() = 2.0;
-  std::vector<std::pair<string, Tensor>> inputs = {{"Placeholder", p1},
-                                                   {"Placeholder_1", p2}};
-  std::vector<string> output_names = {"Outer_966fa13d"};
-  std::vector<string> target_names;
+  std::vector<std::pair<std::string, Tensor>> inputs = {{"Placeholder", p1},
+                                                        {"Placeholder_1", p2}};
+  std::vector<std::string> output_names = {"Outer_966fa13d"};
+  std::vector<std::string> target_names;
   std::vector<Tensor> outputs;
   s = sess->Run(inputs, output_names, target_names, &outputs);
   ASSERT_TRUE(s.ok()) << s.message();
@@ -2835,16 +2839,16 @@ TEST_F(GraphConstructorTest, CopyGraph) {
 // Confirms that graph def version in the graph reaches the shape inference
 // function.
 TEST_F(GraphConstructorTest, GraphDefVersionUsedForShapeInference) {
-  string gdef_ascii = strings::StrCat(R"EOF(
+  std::string gdef_ascii = absl::StrCat(R"EOF(
       node{ name:"A" op:"RequiresCurrentGraphVersion" }
       versions { producer: )EOF",
-                                      TF_GRAPH_DEF_VERSION - 1, "}");
+                                        TF_GRAPH_DEF_VERSION - 1, "}");
   ImportGraphDefOptions opts;
   ExpectError(gdef_ascii, opts, {"Wrong graph version for shape"});
-  gdef_ascii = strings::StrCat(R"EOF(
+  gdef_ascii = absl::StrCat(R"EOF(
       node{ name:"A" op:"RequiresCurrentGraphVersion" }
       versions { producer: )EOF",
-                               TF_GRAPH_DEF_VERSION, "}");
+                            TF_GRAPH_DEF_VERSION, "}");
   ExpectOK(gdef_ascii, opts);
 }
 
@@ -2887,7 +2891,7 @@ TEST_F(GraphConstructorTest, ImportGraphDefProvidedShapeRefinerVersions) {
   ImportGraphDefOptions opts;
   // A valid graph at producer version 20, but one
   // that would not import if the graph_def_version were 21.
-  string gdef_ascii;
+  std::string gdef_ascii;
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
   gdef_ascii = strings::StrCat(R"EOF(
 node {
@@ -2973,7 +2977,7 @@ versions {
 })EOF");
 
 #else
-  gdef_ascii = strings::StrCat(R"EOF(
+  gdef_ascii = R"EOF(
 node {
   name: "Sum/input"
   op: "Const"
@@ -3054,7 +3058,7 @@ node {
 }
 versions {
   producer: 20
-})EOF");
+})EOF";
 #endif
   // Create a shape refiner with the latest TF_GRAPH_DEF_VERSION.
   // Importing the graphdef with an existing refiner should
@@ -3098,7 +3102,7 @@ versions {
 })EOF");
 
 #else
-  gdef_ascii = strings::StrCat(R"EOF(
+  gdef_ascii = R"EOF(
 node {
   name: "RandomConst"
   op: "Const"
@@ -3128,7 +3132,7 @@ node {
 }
 versions {
   producer: 21
-})EOF");
+})EOF";
 #endif
 
   ExpectOK(gdef_ascii, opts, &refiner);
@@ -3171,7 +3175,7 @@ versions {
 })EOF");
 
 #else
-  gdef_ascii = strings::StrCat(R"EOF(
+  gdef_ascii = R"EOF(
 node {
   name: "RandomConst2"
   op: "Const"
@@ -3201,7 +3205,7 @@ node {
 }
 versions {
   producer: 17
-})EOF");
+})EOF";
 #endif
   ExpectOK(gdef_ascii, opts, &refiner);
 
@@ -3242,7 +3246,7 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ValidateDefaultDevice) {
   ImportGraphDefResults res;
 
   TF_ASSERT_OK(ImportGraphDef(options, gdef, &graph_, nullptr, &res));
-  std::map<string, string> node2dev;
+  std::map<std::string, std::string> node2dev;
   for (Node* n : graph_.nodes()) {
     node2dev[n->name()] = n->requested_device();
   }
@@ -3253,7 +3257,8 @@ TEST_F(GraphConstructorTest, ImportGraphDef_ValidateDefaultDevice) {
 }
 
 TEST_F(GraphConstructorTest, ImportGraphDef_UnknownOps) {
-  const string pb_ascii = "node { name: 'op_from_contrib' op: 'OpFromContrib'}";
+  const std::string pb_ascii =
+      "node { name: 'op_from_contrib' op: 'OpFromContrib'}";
   // Try load twice to check for two parts of the error message. We cannot check
   // for the whole thing in one go because the message includes the hostname.
   ExpectError(pb_ascii, {"Op type not registered 'OpFromContrib'"});
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index d7a9462e387d2d..a3c1d024babae0 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -64,7 +64,7 @@ limitations under the License.
 namespace tensorflow {
 
 namespace {
-bool IsCollectiveV2(const string& op) {
+bool IsCollectiveV2(const std::string& op) {
   return op == "CollectiveReduceV2" || op == "CollectiveGatherV2" ||
          op == "CollectiveBcastRecvV2" || op == "CollectiveBcastSendV2" ||
          op == "ColectiveReduceScatterV2" || op == "ColectiveAllToAllV2";
@@ -199,7 +199,7 @@ absl::Status GraphExecutionState::Extend(
   *gdef.mutable_library() = flib_def_->ToProto();
 
   // 2. Build an index of the new node names.
-  std::unordered_set<string> new_names;
+  std::unordered_set<std::string> new_names;
   for (const NodeDef& node : extension_def.node()) {
     new_names.insert(node.name());
   }
@@ -315,7 +315,7 @@ namespace {
 
 class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
  public:
-  TensorConnectionPruneRewrite(const string* endpoint_name,
+  TensorConnectionPruneRewrite(const std::string* endpoint_name,
                                NodeBuilder::NodeOut from_tensor)
       : subgraph::PruneRewrite(endpoint_name, nullptr /* device_info */),
         from_tensor_(std::move(from_tensor)) {}
@@ -336,8 +336,8 @@ class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
     TF_RETURN_IF_ERROR(s);
 
     TF_RETURN_IF_ERROR(
-        NodeBuilder(strings::StrCat("_identity_", feed_tensor.node->name(), "_",
-                                    feed_tensor.index),
+        NodeBuilder(absl::StrCat("_identity_", feed_tensor.node->name(), "_",
+                                 feed_tensor.index),
                     "Identity")
             .Input(from_tensor_)
             .Attr("T",
@@ -355,7 +355,7 @@ class TensorConnectionPruneRewrite : public subgraph::PruneRewrite {
 
 template <class Map>
 absl::Status LookupDevice(
-    const DeviceSet& device_set, const string& tensor_name,
+    const DeviceSet& device_set, const std::string& tensor_name,
     const Map& tensor2device,
     const tensorflow::DeviceAttributes** out_device_attrs) {
   *out_device_attrs = nullptr;
@@ -394,7 +394,7 @@ struct TensorAndDevice {
 
 // Tensors of some DataTypes cannot placed in device memory as feeds or
 // fetches. Validate against a allowlist of those known to work.
-bool IsFeedAndFetchSupported(DataType dtype, const string& device_type) {
+bool IsFeedAndFetchSupported(DataType dtype, const std::string& device_type) {
   // The mechanism for supporting feeds of device-backed Tensors requires
   // the _Arg kernel to be registered for the corresponding type (and that
   // the input to the kernel be in device and not host memory).
@@ -474,8 +474,8 @@ absl::Status ValidateFeedAndFetchDevices(
 absl::Status GetFeedShapeAndTypeFromAttribute(const NodeDef& node,
                                               PartialTensorShape* shape,
                                               DataType* type) {
-  static const gtl::FlatSet<string>* const kHasExplicitShapeAttribute =
-      CHECK_NOTNULL((new gtl::FlatSet<string>{
+  static const gtl::FlatSet<std::string>* const kHasExplicitShapeAttribute =
+      CHECK_NOTNULL((new gtl::FlatSet<std::string>{
           "Placeholder", "PlaceholderV2", "PlaceholderWithDefault",
           "ParallelConcat", "ImmutableConst", "_ParallelConcatStart",
           "InfeedDequeue", "OutfeedDequeue", "CollectiveBcastSend",
@@ -520,7 +520,7 @@ absl::Status GraphExecutionState::PruneGraph(
     for (int i = 0; i < options.callable_options.feed_size(); ++i) {
       // WARNING: feed MUST be a reference, since ArgFeedRewrite and
       // tensors_and_devices holds on to its address.
-      const string& feed = options.callable_options.feed(i);
+      const std::string& feed = options.callable_options.feed(i);
       const DeviceAttributes* device_info;
       TF_RETURN_IF_ERROR(LookupDevice(*device_set_, feed,
                                       options.callable_options.feed_devices(),
@@ -540,7 +540,7 @@ absl::Status GraphExecutionState::PruneGraph(
     for (int i = 0; i < options.callable_options.fetch_size(); ++i) {
       // WARNING: fetch MUST be a reference, since RetvalFetchRewrite and
       // tensors_and_devices holds on to its address.
-      const string& fetch = options.callable_options.fetch(i);
+      const std::string& fetch = options.callable_options.fetch(i);
       const DeviceAttributes* device_info;
       TF_RETURN_IF_ERROR(LookupDevice(*device_set_, fetch,
                                       options.callable_options.fetch_devices(),
@@ -561,11 +561,11 @@ absl::Status GraphExecutionState::PruneGraph(
     }
     const DeviceAttributes* device_info =
         &device_set_->client_device()->attributes();
-    for (const string& feed : options.callable_options.feed()) {
+    for (const std::string& feed : options.callable_options.feed()) {
       feed_rewrites.emplace_back(
           new subgraph::RecvFeedRewrite(&feed, device_info));
     }
-    for (const string& fetch : options.callable_options.fetch()) {
+    for (const std::string& fetch : options.callable_options.fetch()) {
       fetch_rewrites.emplace_back(
           new subgraph::SendFetchRewrite(&fetch, device_info));
     }
@@ -598,7 +598,7 @@ absl::Status GraphExecutionState::PruneGraph(
         &tensor_connection.to_tensor(), {from_node, from_id.second}));
   }
 
-  std::vector<string> target_node_names(
+  std::vector<std::string> target_node_names(
       options.callable_options.target().begin(),
       options.callable_options.target().end());
   TF_RETURN_IF_ERROR(subgraph::RewriteGraphForExecution(
@@ -699,7 +699,7 @@ absl::Status GraphExecutionState::OptimizeGraph(
           options.callable_options.tensor_connection().empty())) {
       std::vector<SafeTensorId> feeds;
 
-      for (const string& feed : options.callable_options.feed()) {
+      for (const std::string& feed : options.callable_options.feed()) {
         feeds.emplace_back(ParseTensorName(feed));
       }
       for (const TensorConnection& tensor_connection :
@@ -830,7 +830,7 @@ absl::Status GraphExecutionState::OptimizeGraph(
     *optimized_flib = std::make_unique<FunctionLibraryDefinition>(*flib_def);
 
     for (const FunctionDef& fdef : new_graph.library().function()) {
-      const string& func_name = fdef.signature().name();
+      const std::string& func_name = fdef.signature().name();
 
       if ((*optimized_flib)->Contains(func_name)) {
         VLOG(3) << "Replace function: name=" << func_name;
@@ -864,7 +864,7 @@ absl::Status GraphExecutionState::OptimizeGraph(
 absl::Status GraphExecutionState::BuildGraph(
     const BuildGraphOptions& options, std::unique_ptr<ClientGraph>* out) {
   VLOG(1) << "BuildGraph";
-  const uint64 start_time_usecs = Env::Default()->NowMicros();
+  const uint64_t start_time_usecs = Env::Default()->NowMicros();
   if (!graph_) {
     // It is only valid to call this method directly when the original graph
     // was created with the option `place_pruned_graph == false`.
@@ -922,7 +922,7 @@ absl::Status GraphExecutionState::BuildGraph(
     // nodes in the Graph and FunctionLibraryDefinition for collective ops and
     // if found, initialize a collective_graph_key as a hash of the ordered set
     // of instance keys.
-    std::set<int32> instance_key_set;
+    std::set<int32_t> instance_key_set;
     bool has_collective_v2 = false;
     for (Node* node : optimized_graph->nodes()) {
       if (node->IsCollective()) {
@@ -952,7 +952,7 @@ absl::Status GraphExecutionState::BuildGraph(
       }
     }
     if (!instance_key_set.empty()) {
-      uint64 hash = 0x8774aa605c729c72ULL;
+      uint64_t hash = 0x8774aa605c729c72ULL;
       for (int32_t instance_key : instance_key_set) {
         hash = Hash64Combine(instance_key, hash);
       }
diff --git a/tensorflow/core/common_runtime/graph_execution_state.h b/tensorflow/core/common_runtime/graph_execution_state.h
index 4f713ae922f12d..a718b57063f10d 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.h
+++ b/tensorflow/core/common_runtime/graph_execution_state.h
@@ -43,10 +43,10 @@ struct GraphExecutionStateOptions {
   const DeviceSet* device_set = nullptr;
   const SessionOptions* session_options = nullptr;
   // Unique session identifier. Can be empty.
-  string session_handle;
+  std::string session_handle;
   // A map from node name to device name, representing the unchangeable
   // placement of stateful nodes.
-  std::unordered_map<string, string> stateful_placements;
+  std::unordered_map<std::string, std::string> stateful_placements;
   // Whether to run Placer on the graph.
   bool run_placer = true;
 
@@ -166,7 +166,7 @@ class GraphExecutionState {
   const FunctionLibraryDefinition& flib_def() const { return *flib_def_; }
 
   // Returns the node with the given name, or null if it does not exist.
-  const Node* get_node_by_name(const string& name) const {
+  const Node* get_node_by_name(const std::string& name) const {
     NodeNameToCostIdMap::const_iterator iter =
         node_name_to_cost_id_map_.find(name);
     if (iter != node_name_to_cost_id_map_.end()) {
@@ -178,7 +178,7 @@ class GraphExecutionState {
 
   // Returns the map of stateful placements as a map of
   // node name to placement string.
-  std::unordered_map<string, string> GetStatefulPlacements() const {
+  std::unordered_map<std::string, std::string> GetStatefulPlacements() const {
     return stateful_placements_;
   }
 
@@ -194,8 +194,9 @@ class GraphExecutionState {
   // is true, such as "params" and "queue" nodes.  Once placed these
   // nodes can not be moved to a different device.  Maps node names to
   // device names.
-  std::unordered_map<string, string> stateful_placements_;  // Immutable after
-                                                            // ctor.
+  std::unordered_map<std::string, std::string>
+      stateful_placements_;  // Immutable after
+                             // ctor.
   void SaveStatefulNodes(Graph* graph);
   void RestoreStatefulNodes(Graph* graph);
 
@@ -215,7 +216,7 @@ class GraphExecutionState {
   const DeviceSet* device_set_;            // Not owned
   const SessionOptions* session_options_;  // Not owned
   // Unique session identifier. Can be empty.
-  string session_handle_;
+  std::string session_handle_;
 
   // Map from name to Node for the full graph in placed_.
   NodeNameToCostIdMap node_name_to_cost_id_map_;
diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h
index f8322cfe7213a2..746c080e4d3f66 100644
--- a/tensorflow/core/common_runtime/graph_optimizer.h
+++ b/tensorflow/core/common_runtime/graph_optimizer.h
@@ -36,8 +36,8 @@ class GraphOptimizer {
     // pass may replace a node with a different node of the same name that has a
     // different number of outputs, or outputs with different known shapes.
     // TODO(b/65453533) introduce a unique way to name nodes in a graph.
-    std::unordered_map<string, std::vector<PartialTensorShape>>* shape_map =
-        nullptr;
+    std::unordered_map<std::string, std::vector<PartialTensorShape>>*
+        shape_map = nullptr;
 
     // If not null then only nodes for which cse_consider_fn returns true will
     // be considered for CSE.
diff --git a/tensorflow/core/common_runtime/graph_runner.cc b/tensorflow/core/common_runtime/graph_runner.cc
index 90052d68873c6a..8379c126e22711 100644
--- a/tensorflow/core/common_runtime/graph_runner.cc
+++ b/tensorflow/core/common_runtime/graph_runner.cc
@@ -58,7 +58,7 @@ class SimpleRendezvous : public RendezvousInterface {
     }
 
     mutex_lock l(mu_);
-    string edge_name(parsed.edge_name);
+    std::string edge_name(parsed.edge_name);
     if (table_.count(edge_name) > 0) {
       return errors::Internal("Send of an already sent tensor");
     }
@@ -71,7 +71,7 @@ class SimpleRendezvous : public RendezvousInterface {
     Tensor tensor;
     absl::Status status = absl::OkStatus();
     {
-      string key(parsed.edge_name);
+      std::string key(parsed.edge_name);
       mutex_lock l(mu_);
       if (table_.count(key) <= 0) {
         status = errors::Internal("Did not find key ", key);
@@ -85,7 +85,7 @@ class SimpleRendezvous : public RendezvousInterface {
   void StartAbort(const absl::Status& status) override {}
 
  private:
-  typedef std::unordered_map<string, Tensor> Table;
+  typedef std::unordered_map<std::string, Tensor> Table;
 
   mutex mu_;
   Table table_ TF_GUARDED_BY(mu_);
@@ -103,7 +103,7 @@ GraphRunner::~GraphRunner() {}
 absl::Status GraphRunner::Run(Graph* graph,
                               FunctionLibraryRuntime* function_library,
                               const NamedTensorList& inputs,
-                              const std::vector<string>& output_names,
+                              const std::vector<std::string>& output_names,
                               std::vector<Tensor>* outputs) {
   if (device_ == nullptr) {
     return errors::NotFound("Cannot find a device for GraphRunner.");
@@ -130,12 +130,12 @@ absl::Status GraphRunner::Run(Graph* graph,
   SimpleRendezvous rendez;
 
   // Extract the input names and keys, and feed in the inputs.
-  std::vector<string> input_names;
+  std::vector<std::string> input_names;
   for (const auto& in : inputs) {
-    const string& tensor_name = in.first;
+    const std::string& tensor_name = in.first;
     input_names.emplace_back(tensor_name);
-    string full_key = Rendezvous::CreateKey("/device:CPU:0", 1, "/device:CPU:1",
-                                            tensor_name, FrameAndIter(0, 0));
+    std::string full_key = Rendezvous::CreateKey(
+        "/device:CPU:0", 1, "/device:CPU:1", tensor_name, FrameAndIter(0, 0));
     Rendezvous::ParsedKey parsed;
     TF_RETURN_IF_ERROR(Rendezvous::ParseKey(full_key, &parsed));
     TF_RETURN_IF_ERROR(rendez.Send(parsed, Rendezvous::Args(), in.second,
@@ -194,7 +194,7 @@ absl::Status GraphRunner::Run(Graph* graph,
 
   outputs->resize(output_names.size());
   for (size_t i = 0; i < output_names.size(); ++i) {
-    const string& output_key =
+    const std::string& output_key =
         Rendezvous::CreateKey("/device:CPU:0", 1, "/device:CPU:1",
                               output_names[i], FrameAndIter(0, 0));
     Rendezvous::ParsedKey parsed;
diff --git a/tensorflow/core/common_runtime/graph_runner.h b/tensorflow/core/common_runtime/graph_runner.h
index a40d17b862b0af..3f651727db5923 100644
--- a/tensorflow/core/common_runtime/graph_runner.h
+++ b/tensorflow/core/common_runtime/graph_runner.h
@@ -58,10 +58,10 @@ class GraphRunner {
   //
   // REQUIRES: `graph`, `env`, and `outputs` are not nullptr.
   // `function_library` may be nullptr.
-  typedef std::vector<std::pair<string, Tensor>> NamedTensorList;
+  typedef std::vector<std::pair<std::string, Tensor>> NamedTensorList;
   absl::Status Run(Graph* graph, FunctionLibraryRuntime* function_library,
                    const NamedTensorList& inputs,
-                   const std::vector<string>& output_names,
+                   const std::vector<std::string>& output_names,
                    std::vector<Tensor>* outputs);
 
  private:
diff --git a/tensorflow/core/common_runtime/graph_runner_test.cc b/tensorflow/core/common_runtime/graph_runner_test.cc
index fa9798b929f79e..2d41bc455d5322 100644
--- a/tensorflow/core/common_runtime/graph_runner_test.cc
+++ b/tensorflow/core/common_runtime/graph_runner_test.cc
@@ -64,8 +64,8 @@ TEST(GraphRunnerTest, DeepCopy) {
   Tensor p2_data(DT_FLOAT, TensorShape({}));
   p1_data.scalar<float>()() = 1.0f;
   p2_data.scalar<float>()() = 2.0f;
-  std::vector<std::pair<string, Tensor>> inputs = {{"p1:0", p1_data},
-                                                   {"p2:0", p2_data}};
+  std::vector<std::pair<std::string, Tensor>> inputs = {{"p1:0", p1_data},
+                                                        {"p2:0", p2_data}};
 
   // Create and destroy the GraphRunner, and ensure that the outputs are
   // consumable beyond the lifetime of GraphRunner.
@@ -102,8 +102,8 @@ TEST(GraphRunnerTest, FeedAndFetch) {
   Tensor p2_data(DT_FLOAT, TensorShape({}));
   p1_data.scalar<float>()() = 1.0f;
   p2_data.scalar<float>()() = 2.0f;
-  std::vector<std::pair<string, Tensor>> inputs = {{"p1:0", p1_data},
-                                                   {"p2:0", p2_data}};
+  std::vector<std::pair<std::string, Tensor>> inputs = {{"p1:0", p1_data},
+                                                        {"p2:0", p2_data}};
 
   GraphRunner graph_runner(Env::Default());
   std::vector<Tensor> outputs;
diff --git a/tensorflow/core/common_runtime/graph_view.cc b/tensorflow/core/common_runtime/graph_view.cc
index f84dbfac0d3f6d..65359febf97937 100644
--- a/tensorflow/core/common_runtime/graph_view.cc
+++ b/tensorflow/core/common_runtime/graph_view.cc
@@ -40,12 +40,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-string NodeItem::DebugString() const {
-  string ret = strings::StrCat("{name:'", kernel->name(), "' id:", node_id);
+std::string NodeItem::DebugString() const {
+  std::string ret = absl::StrCat("{name:'", kernel->name(), "' id:", node_id);
   if (is_source) {
-    strings::StrAppend(&ret, " source}");
+    absl::StrAppend(&ret, " source}");
   } else {
-    strings::StrAppend(&ret, " def:{", SummarizeNodeDef(kernel->def()), "}}");
+    absl::StrAppend(&ret, " def:{", SummarizeNodeDef(kernel->def()), "}}");
   }
   return ret;
 }
@@ -67,7 +67,7 @@ GraphView::~GraphView() {
 }
 
 namespace {
-typedef std::tuple<int32, int32> OutputAndControlEdges;
+typedef std::tuple<int32_t, int32_t> OutputAndControlEdges;
 
 OutputAndControlEdges CountOutputEdges(const Node* n) {
   DCHECK_LE(n->out_edges().size(), std::numeric_limits<int32_t>::max());
@@ -102,8 +102,8 @@ size_t GraphView::NodeItemBytes(const Node* n) {
             sizeof(ControlEdgeInfo)                // output_control_edges[...]
       + num_outputs * sizeof(AllocatorAttributes)  // output_attr[...]
       + num_outputs * sizeof(int)                  // forward_from[num_outputs]
-      + num_inputs * sizeof(uint8)                 // input_type[num_inputs]
-      + num_outputs * sizeof(uint8);               // output_type[num_outputs]
+      + num_inputs * sizeof(uint8_t)               // input_type[num_inputs]
+      + num_outputs * sizeof(uint8_t);             // output_type[num_outputs]
   static constexpr size_t kItemAlignment = sizeof(NodeItem*);
   static_assert(kItemAlignment % alignof(NodeItem) == 0,
                 "NodeItem must be aligned with kItemAlignment");
@@ -141,7 +141,7 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   // values as "int" vs "size_t" in CHECK_LE.
   CHECK_LE(static_cast<int64_t>(ptr - space_),
            std::numeric_limits<uint32_t>::max());
-  const uint32 offset = static_cast<uint32>(ptr - space_);
+  const uint32_t offset = static_cast<uint32_t>(ptr - space_);
   node_offsets_[id] = offset;
   ptr += bytes;
 
@@ -197,10 +197,10 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
   }
 
   DCHECK_LT(DataType_MAX, 255);  // Must fit in uint8
-  uint8* input_types = item->input_type_base();
+  uint8_t* input_types = item->input_type_base();
   item->is_any_input_ref_typed = false;
   for (int i = 0; i < num_inputs; i++) {
-    input_types[i] = static_cast<uint8>(n->input_type(i));
+    input_types[i] = static_cast<uint8_t>(n->input_type(i));
     DCHECK_EQ(item->input_type(i), n->input_type(i));
     item->is_any_input_ref_typed |= IsRefType(n->input_type(i));
   }
@@ -215,9 +215,9 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) {
         GetNodeAttr(n->attrs(), "_scoped_allocator", &scoped_allocator_attrs);
 
     int* forward_from = item->forward_from_base();
-    uint8* output_types = item->output_type_base();
+    uint8_t* output_types = item->output_type_base();
     for (int i = 0; i < num_outputs; ++i) {
-      output_types[i] = static_cast<uint8>(n->output_type(i));
+      output_types[i] = static_cast<uint8_t>(n->output_type(i));
       DCHECK_EQ(item->output_type(i), n->output_type(i));
 
       forward_from[i] = OpKernelContext::Params::kNoReservation;
@@ -264,7 +264,7 @@ absl::Status GraphView::Initialize(const Graph* g) {
     total_bytes += NodeItemBytes(n);
   }
 
-  node_offsets_ = new uint32[num_nodes];
+  node_offsets_ = new uint32_t[num_nodes];
   for (int i = 0; i < num_nodes; i++) {
     node_offsets_[i] = std::numeric_limits<uint32_t>::max();
   }
@@ -363,7 +363,7 @@ absl::Status InferAllocAttr(const Node* n, const Node* dst,
   // Note that it's possible for *n to be a Recv and *dst to be a Send,
   // so these two cases are not mutually exclusive.
   if (IsRecv(n)) {
-    string src_name;
+    std::string src_name;
     s = GetNodeAttr(n->attrs(), "send_device", &src_name);
     if (!s.ok()) return s;
     DeviceNameUtils::ParsedName parsed_src_name;
@@ -388,7 +388,7 @@ absl::Status InferAllocAttr(const Node* n, const Node* dst,
     }
   }
   if (IsSend(dst)) {
-    string dst_name;
+    std::string dst_name;
     s = GetNodeAttr(dst->attrs(), "recv_device", &dst_name);
     if (!s.ok()) return s;
     DeviceNameUtils::ParsedName parsed_dst_name;
diff --git a/tensorflow/core/common_runtime/graph_view.h b/tensorflow/core/common_runtime/graph_view.h
index 3864df8a6ce165..32df420842d657 100644
--- a/tensorflow/core/common_runtime/graph_view.h
+++ b/tensorflow/core/common_runtime/graph_view.h
@@ -102,10 +102,10 @@ struct NodeItem {
   int input_start = 0;
 
   // Number of output edges, excluding control edges.
-  int32 num_output_edges;
+  int32_t num_output_edges;
 
   // Number of output control edges.
-  int32 num_output_control_edges;
+  int32_t num_output_control_edges;
 
   // If non-null, contains an array of num_outputs bools, where the ith bool
   // is true if and only if the ith output is consumed by another node.
@@ -143,7 +143,7 @@ struct NodeItem {
   // 0... for forward from that input.
   const int* forward_from() const { return forward_from_base(); }
 
-  string DebugString() const;
+  std::string DebugString() const;
 
  private:
   friend class GraphView;
@@ -185,18 +185,18 @@ struct NodeItem {
                                       num_output_control_edges +
                                   sizeof(AllocatorAttributes) * num_outputs);
   }
-  uint8* input_type_base() const {
-    return reinterpret_cast<uint8*>(
+  uint8_t* input_type_base() const {
+    return reinterpret_cast<uint8_t*>(
         var() + sizeof(EdgeInfo) * num_output_edges +
         sizeof(ControlEdgeInfo) * num_output_control_edges +
         sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs);
   }
-  uint8* output_type_base() const {
-    return reinterpret_cast<uint8*>(
+  uint8_t* output_type_base() const {
+    return reinterpret_cast<uint8_t*>(
         var() + sizeof(EdgeInfo) * num_output_edges +
         sizeof(ControlEdgeInfo) * num_output_control_edges +
         sizeof(AllocatorAttributes) * num_outputs + sizeof(int) * num_outputs +
-        sizeof(uint8) * num_inputs);
+        sizeof(uint8_t) * num_inputs);
   }
 
   NodeItem(const NodeItem&) = delete;
@@ -220,7 +220,7 @@ class GraphView {
   NodeItem* node(int32_t id) const {
     DCHECK_GE(id, 0);
     DCHECK_LT(id, num_nodes_);
-    uint32 offset = node_offsets_[id];
+    uint32_t offset = node_offsets_[id];
     return ((offset == std::numeric_limits<uint32_t>::max())
                 ? nullptr
                 : reinterpret_cast<NodeItem*>(space_ + node_offsets_[id]));
@@ -232,19 +232,19 @@ class GraphView {
   const NodeItem& node_ref(int32_t id) const {
     DCHECK_GE(id, 0);
     DCHECK_LT(id, num_nodes_);
-    uint32 offset = node_offsets_[id];
+    uint32_t offset = node_offsets_[id];
     DCHECK_NE(offset, std::numeric_limits<uint32_t>::max());
     return *reinterpret_cast<NodeItem*>(space_ + node_offsets_[id]);
   }
 
-  int32 num_nodes() const { return num_nodes_; }
+  int32_t num_nodes() const { return num_nodes_; }
 
  private:
   char* InitializeNode(char* ptr, const Node* n);
   size_t NodeItemBytes(const Node* n);
 
-  int32 num_nodes_ = 0;
-  uint32* node_offsets_ = nullptr;  // array of size "num_nodes_"
+  int32_t num_nodes_ = 0;
+  uint32_t* node_offsets_ = nullptr;  // array of size "num_nodes_"
   // node_offsets_[id] holds the byte offset for node w/ "id" in space_
 
   char* space_;  // NodeItem objects are allocated here
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
index eccea063ad5abf..ebbdfde177da79 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster.cc
@@ -43,8 +43,8 @@ namespace tensorflow {
 
 namespace {
 // Key to be used for BufRendezvous by Broadcaster.
-string BroadcastBufKey(const string& exec_key, int subdiv, int src_rank,
-                       int dst_rank) {
+std::string BroadcastBufKey(const std::string& exec_key, int subdiv,
+                            int src_rank, int dst_rank) {
   if (READABLE_KEYS) {
     return strings::StrCat("broadcast(", exec_key, "):subdiv(", subdiv,
                            "):src(", src_rank, "):dst(", dst_rank, ")");
@@ -81,13 +81,13 @@ absl::Status HierarchicalTreeBroadcaster::InitializeCollectiveParams(
   CHECK_EQ(col_params->instance.type, BROADCAST_COLLECTIVE);
   CHECK_EQ(col_params->instance.impl_details.collective_name,
            "HierarchicalTreeBroadcast");
-  const string& device_name =
+  const std::string& device_name =
       col_params->group.members[col_params->default_rank].device.name();
   // Start by counting the devices in each task.
   // Precondition: device_names must be sorted so that all devices in
   // the same task are adjacent.
   std::vector<int> dev_per_task;
-  const string* prior_task_name = &col_params->group.members[0].task;
+  const std::string* prior_task_name = &col_params->group.members[0].task;
   int dev_count = 1;
   for (int di = 1; di < col_params->group.group_size; ++di) {
     if (col_params->group.members[di].task != *prior_task_name) {
@@ -102,8 +102,8 @@ absl::Status HierarchicalTreeBroadcaster::InitializeCollectiveParams(
   CHECK_EQ(col_params->group.num_tasks, dev_per_task.size());
 
   if (VLOG_IS_ON(2)) {
-    string dpt_buf;
-    for (int dpt : dev_per_task) strings::StrAppend(&dpt_buf, dpt, ";");
+    std::string dpt_buf;
+    for (int dpt : dev_per_task) absl::StrAppend(&dpt_buf, dpt, ";");
     VLOG(2) << "HierarchicalTreeBroadcaster::InitializeCollectiveParams device="
             << device_name << " source_rank=" << col_params->source_rank
             << " dev_per_task=" << dpt_buf;
@@ -302,9 +302,9 @@ void HierarchicalTreeBroadcaster::RunTree() {
     if (-1 == my_rank) continue;
     int source_rank = col_params_->instance.impl_details.subdiv_source_rank[si];
     if (VLOG_IS_ON(1)) {
-      string subdiv_buf;
+      std::string subdiv_buf;
       for (int r : col_params_->instance.impl_details.subdiv_permutations[si]) {
-        strings::StrAppend(&subdiv_buf, r, ",");
+        absl::StrAppend(&subdiv_buf, r, ",");
       }
       VLOG(1) << "Running Broadcast tree device=" << col_ctx_->device_name
               << " subdiv=" << si << " perm=" << subdiv_buf
@@ -318,7 +318,7 @@ void HierarchicalTreeBroadcaster::RunTree() {
     if (my_rank >= 0 && my_rank != source_rank) {
       // Begin by receiving the value.
       tsl::profiler::TraceMe activity(
-          [&] { return strings::StrCat("ReceiveValue:", si); },
+          [&] { return absl::StrCat("ReceiveValue:", si); },
           tsl::profiler::TraceMeLevel::kInfo);
       int recv_from_rank = TreeRecvFrom(*col_params_, si);
       absl::Notification note;
@@ -334,7 +334,7 @@ void HierarchicalTreeBroadcaster::RunTree() {
     // Then forward value to all descendent devices.
     {
       tsl::profiler::TraceMe activity(
-          [&] { return strings::StrCat("ForwardValue:", si); },
+          [&] { return absl::StrCat("ForwardValue:", si); },
           tsl::profiler::TraceMeLevel::kInfo);
       if (my_rank >= 0 && status_.ok()) {
         std::vector<int> send_to_ranks;
@@ -413,7 +413,7 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank,
   tsl::profiler::ScopedMemoryDebugAnnotation op_annotation(
       col_params_->name, col_ctx_->step_id, "dynamic", src_tensor->dtype(),
       [src_tensor]() { return src_tensor->shape().DebugString(); });
-  string send_buf_key =
+  std::string send_buf_key =
       BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank);
   int dst_idx =
       col_params_->instance.impl_details.subdiv_permutations[subdiv][dst_rank];
@@ -434,7 +434,7 @@ void HierarchicalTreeBroadcaster::DispatchSend(int subdiv, int dst_rank,
 void HierarchicalTreeBroadcaster::DispatchRecv(int subdiv, int src_rank,
                                                int dst_rank, Tensor* dst_tensor,
                                                const StatusCallback& done) {
-  string recv_buf_key =
+  std::string recv_buf_key =
       BroadcastBufKey(col_ctx_->exec_key, subdiv, src_rank, dst_rank);
   int src_idx =
       col_params_->instance.impl_details.subdiv_permutations[subdiv][src_rank];
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index ba419077d2774e..408d8cb65b3682 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -191,7 +191,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
       if (!instances_[di]->status_.ok()) {
         ASSERT_GT(fail_after, 0);
         ASSERT_NE(instances_[di]->status_.message().find("Deliberate failure"),
-                  string::npos);
+                  std::string::npos);
         ++failure_count_;
         continue;
       }
@@ -221,7 +221,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
       // In the test we always broadcast from rank 0.
       col_params_->is_source = (rank == 0);
       col_params_->source_rank = 0;
-      string dev_name = col_params_->group.members[rank].device.name();
+      std::string dev_name = col_params_->group.members[rank].device.name();
       TF_CHECK_OK(test_env_->device_mgr->LookupDevice(dev_name, &device_))
           << "Couldn't find device " << dev_name
           << " existing devices: " << test_env_->device_mgr->DebugString();
@@ -356,10 +356,10 @@ TEST_F(HierarchicalTreeBroadcasterInitParamsTest,
   cp->instance.impl_details.collective_name = "HierarchicalTreeBroadcast";
   std::vector<int> dev_per_task = {4, 4, 6, 8};
   for (int ti = 0; ti < cp->group.num_tasks; ti++) {
-    string task_name = strings::StrCat("/job:worker/replica:0/task:", ti);
+    std::string task_name = absl::StrCat("/job:worker/replica:0/task:", ti);
     for (int di = 0; di < dev_per_task[ti]; di++) {
       CollGroupMember member;
-      member.device.set_name(strings::StrCat(task_name, "/device:GPU:", di));
+      member.device.set_name(absl::StrCat(task_name, "/device:GPU:", di));
       member.task = task_name;
       cp->group.members.push_back(member);
       cp->group.group_size++;
diff --git a/tensorflow/core/common_runtime/immutable_executor_state.cc b/tensorflow/core/common_runtime/immutable_executor_state.cc
index 6eef9e802d862e..64ded72c5e0d4e 100644
--- a/tensorflow/core/common_runtime/immutable_executor_state.cc
+++ b/tensorflow/core/common_runtime/immutable_executor_state.cc
@@ -68,7 +68,7 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending,
 }  // namespace
 
 ImmutableExecutorState::FrameInfo* ImmutableExecutorState::EnsureFrameInfo(
-    const string& fname) {
+    const std::string& fname) {
   auto iter = frame_info_.find(fname);
   if (iter != frame_info_.end()) {
     return iter->second.get();
@@ -110,8 +110,8 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) {
       // TODO(mrry): Track whether control flow was present in the
       // pre-partitioned graph, and enable the caller (e.g.
       // `DirectSession`) to relax this constraint.
-      string send_device;
-      string recv_device;
+      std::string send_device;
+      std::string recv_device;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "send_device", &send_device));
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "recv_device", &recv_device));
       if (send_device != recv_device) {
@@ -120,7 +120,7 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) {
     }
 
     const int id = n->id();
-    const string& frame_name = cf_info.frame_names[id];
+    const std::string& frame_name = cf_info.frame_names[id];
     FrameInfo* frame_info = EnsureFrameInfo(frame_name);
 
     NodeItem* item = gview_.node(id);
@@ -162,7 +162,7 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) {
           GetNodeAttr(n->attrs(), "is_constant", &is_constant_enter));
       item->is_constant_enter = is_constant_enter;
 
-      string frame_name;
+      std::string frame_name;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &frame_name));
       FrameInfo* frame_info = frame_info_[frame_name].get();
 
@@ -214,7 +214,7 @@ absl::Status ImmutableExecutorState::Initialize(const Graph& graph) {
     // Initialize static information about the frames in the graph.
     frame_info->nodes->push_back(item);
     if (item->is_enter) {
-      string enter_name;
+      std::string enter_name;
       TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &enter_name));
       EnsureFrameInfo(enter_name)->input_count++;
     }
@@ -291,7 +291,7 @@ absl::Status ImmutableExecutorState::BuildControlFlowInfo(
   std::vector<bool> visited;
   visited.resize(num_nodes);
 
-  string frame_name;
+  std::string frame_name;
   std::deque<Node*> ready;
 
   // Initialize with the root nodes.
@@ -360,7 +360,7 @@ void ImmutableExecutorState::InitializePending(const Graph* graph,
   }
 
   if (!requires_control_flow_) {
-    atomic_pending_counts_.reset(new std::atomic<int32>[gview_.num_nodes()]);
+    atomic_pending_counts_.reset(new std::atomic<int32_t>[gview_.num_nodes()]);
     std::fill(atomic_pending_counts_.get(),
               atomic_pending_counts_.get() + gview_.num_nodes(), 0);
   }
@@ -368,7 +368,7 @@ void ImmutableExecutorState::InitializePending(const Graph* graph,
   for (const Node* n : graph->nodes()) {
     if (IsSink(n)) continue;
     const int id = n->id();
-    const string& name = cf_info.frame_names[id];
+    const std::string& name = cf_info.frame_names[id];
     size_t max_pending, max_dead;
     GetMaxPendingCounts(n, &max_pending, &max_dead);
     auto& counts = EnsureFrameInfo(name)->pending_counts;
diff --git a/tensorflow/core/common_runtime/immutable_executor_state.h b/tensorflow/core/common_runtime/immutable_executor_state.h
index 6a12bc1fb0b0c0..7e7437c5311d20 100644
--- a/tensorflow/core/common_runtime/immutable_executor_state.h
+++ b/tensorflow/core/common_runtime/immutable_executor_state.h
@@ -42,7 +42,7 @@ class Graph;
 class ImmutableExecutorState {
  public:
   struct FrameInfo {
-    explicit FrameInfo(string name)
+    explicit FrameInfo(std::string name)
         : name(std::move(name)),
           input_count(0),
           total_inputs(0),
@@ -51,7 +51,7 @@ class ImmutableExecutorState {
           parallel_iterations(-1) {}
 
     // The name of the frame.
-    string name;
+    std::string name;
 
     // The total number of inputs to a frame.
     int input_count;
@@ -71,7 +71,7 @@ class ImmutableExecutorState {
     std::unique_ptr<std::vector<const NodeItem*>> nodes;
 
     // The number of iterations of this frame that can execute concurrently.
-    int32 parallel_iterations;
+    int32_t parallel_iterations;
   };
 
   explicit ImmutableExecutorState(const LocalExecutorParams& p)
@@ -109,24 +109,24 @@ class ImmutableExecutorState {
   //
   // REQUIRES: `!requires_control_flow_support && len(dest) ==
   // graph_view().num_nodes()`.
-  void copy_pending_counts(std::atomic<int32>* dest) const {
+  void copy_pending_counts(std::atomic<int32_t>* dest) const {
     DCHECK(!requires_control_flow_);
     memcpy(dest, atomic_pending_counts_.get(),
-           graph_view().num_nodes() * sizeof(std::atomic<int32>));
+           graph_view().num_nodes() * sizeof(std::atomic<int32_t>));
     std::atomic_thread_fence(std::memory_order_release);
   }
 
  private:
   struct ControlFlowInfo {
-    gtl::FlatSet<string> unique_frame_names;
-    std::vector<string> frame_names;
+    gtl::FlatSet<std::string> unique_frame_names;
+    std::vector<std::string> frame_names;
   };
 
   static absl::Status BuildControlFlowInfo(const Graph* graph,
                                            ControlFlowInfo* cf_info);
   void InitializePending(const Graph* graph, const ControlFlowInfo& cf_info);
 
-  FrameInfo* EnsureFrameInfo(const string& fname);
+  FrameInfo* EnsureFrameInfo(const std::string& fname);
 
   // Owned.
   LocalExecutorParams params_;
@@ -150,7 +150,7 @@ class ImmutableExecutorState {
 
   // If `requires_control_flow_` is false, this points to an array of initial
   // pending counts for the nodes in the graph, indexed by node ID.
-  std::unique_ptr<std::atomic<int32>[]> atomic_pending_counts_;
+  std::unique_ptr<std::atomic<int32_t>[]> atomic_pending_counts_;
 
   // Shallow copies of the constant tensors used in the graph.
   std::vector<Tensor> const_tensors_;
diff --git a/tensorflow/core/common_runtime/inline_function_utils.cc b/tensorflow/core/common_runtime/inline_function_utils.cc
index 1e8a85207fa0b1..a627e9e8aff9c9 100644
--- a/tensorflow/core/common_runtime/inline_function_utils.cc
+++ b/tensorflow/core/common_runtime/inline_function_utils.cc
@@ -70,11 +70,11 @@ struct Endpoint {
   int index;
 
   // Returns the string name represents this endpoint.
-  string name() const {
+  std::string name() const {
     if (index == 0) {
       return node->name();
     } else {
-      return strings::StrCat(node->name(), ":", index);
+      return absl::StrCat(node->name(), ":", index);
     }
   }
 
@@ -82,7 +82,7 @@ struct Endpoint {
 };
 
 struct EndpointHash {
-  uint64 operator()(const Endpoint& x) const {
+  uint64_t operator()(const Endpoint& x) const {
     return Hash64(reinterpret_cast<const char*>(&x.node), sizeof(Node*),
                   x.index);
   }
@@ -120,15 +120,15 @@ static Node* AddIdentity(absl::string_view name, Graph* g, Endpoint input) {
   return ret;
 }
 
-std::vector<string> InputDevices(const Node& caller) {
-  std::vector<string> input_devices(caller.in_edges().size());
-  std::vector<string> input_tensors(caller.in_edges().size());
+std::vector<std::string> InputDevices(const Node& caller) {
+  std::vector<std::string> input_devices(caller.in_edges().size());
+  std::vector<std::string> input_tensors(caller.in_edges().size());
 
   for (const Edge* edge : caller.in_edges()) {
     if (edge->IsControlEdge()) continue;
-    const string& input_device = edge->src()->has_assigned_device_name()
-                                     ? edge->src()->assigned_device_name()
-                                     : edge->src()->requested_device();
+    const std::string& input_device = edge->src()->has_assigned_device_name()
+                                          ? edge->src()->assigned_device_name()
+                                          : edge->src()->requested_device();
     input_devices[edge->dst_input()] = input_device;
     input_tensors[edge->dst_input()] =
         absl::StrCat(edge->src()->name(), ":", edge->src_output());
@@ -154,22 +154,24 @@ class DefaultFunctionBodyPlacer : public InlinedFunctionBodyPlacer {
   explicit DefaultFunctionBodyPlacer(const Node& caller)
       : input_devices_(InputDevices(caller)) {}
 
-  absl::optional<string> InputNodeDevice(int input_index) const override {
+  absl::optional<std::string> InputNodeDevice(int input_index) const override {
     return input_devices_[input_index];
   }
-  absl::optional<string> OutputNodeDevice(int output_index) const override {
+  absl::optional<std::string> OutputNodeDevice(
+      int output_index) const override {
     return absl::nullopt;
   }
   bool ColocateInputOutputIdentities() const override { return false; }
-  absl::optional<string> ControlNodeDevice() const override {
+  absl::optional<std::string> ControlNodeDevice() const override {
     return absl::nullopt;
   }
-  absl::optional<string> BodyNodeDevice(const NodeDef& ndef) const override {
+  absl::optional<std::string> BodyNodeDevice(
+      const NodeDef& ndef) const override {
     return absl::nullopt;
   }
 
  private:
-  const std::vector<string> input_devices_;
+  const std::vector<std::string> input_devices_;
 };
 
 // Place all nodes on the same device as caller node.
@@ -178,22 +180,24 @@ class SingleDeviceFunctionBodyPlacer : public InlinedFunctionBodyPlacer {
   explicit SingleDeviceFunctionBodyPlacer(const Node& caller)
       : caller_device_(caller.def().device()) {}
 
-  absl::optional<string> InputNodeDevice(int input_index) const override {
+  absl::optional<std::string> InputNodeDevice(int input_index) const override {
     return caller_device_;
   }
-  absl::optional<string> OutputNodeDevice(int output_index) const override {
+  absl::optional<std::string> OutputNodeDevice(
+      int output_index) const override {
     return caller_device_;
   }
   bool ColocateInputOutputIdentities() const override { return false; }
-  absl::optional<string> ControlNodeDevice() const override {
+  absl::optional<std::string> ControlNodeDevice() const override {
     return caller_device_;
   }
-  absl::optional<string> BodyNodeDevice(const NodeDef& ndef) const override {
+  absl::optional<std::string> BodyNodeDevice(
+      const NodeDef& ndef) const override {
     return caller_device_;
   }
 
  private:
-  const string caller_device_;
+  const std::string caller_device_;
 };
 
 // Place input nodes on the same device as the corresponding caller input
@@ -209,17 +213,19 @@ class MultiDeviceFunctionBodyPlacer : public InlinedFunctionBodyPlacer {
         DeviceNameUtils::ParseFullName(caller_device_, &caller_parsed_device_);
   }
 
-  absl::optional<string> InputNodeDevice(int input_index) const override {
+  absl::optional<std::string> InputNodeDevice(int input_index) const override {
     return input_devices_[input_index];
   }
-  absl::optional<string> OutputNodeDevice(int output_index) const override {
+  absl::optional<std::string> OutputNodeDevice(
+      int output_index) const override {
     return absl::nullopt;
   }
   bool ColocateInputOutputIdentities() const override { return true; }
-  absl::optional<string> ControlNodeDevice() const override {
+  absl::optional<std::string> ControlNodeDevice() const override {
     return caller_device_;
   }
-  absl::optional<string> BodyNodeDevice(const NodeDef& ndef) const override {
+  absl::optional<std::string> BodyNodeDevice(
+      const NodeDef& ndef) const override {
     // LINT.IfChange
     // TODO(ezhulenev): If function would have been instantiated as a
     // multi-device function and executed via FunctionLibraryRuntime, it could
@@ -240,10 +246,10 @@ class MultiDeviceFunctionBodyPlacer : public InlinedFunctionBodyPlacer {
   }
 
  private:
-  string caller_device_;
+  std::string caller_device_;
   bool has_parsed_caller_device_;
   DeviceNameUtils::ParsedName caller_parsed_device_;
-  std::vector<string> input_devices_;
+  std::vector<std::string> input_devices_;
 };
 
 }  // namespace
@@ -286,7 +292,7 @@ using OutputControlSrc = InlineFunctionBodyOptions::OutputControlSource;
 // Propagate the debug info of `nodes` in function `func` to the `target` node.
 // If the debug info of any node is missing, its node name and function name
 // is used.
-void PropagateDebugInfoToNode(const string& func,
+void PropagateDebugInfoToNode(const std::string& func,
                               const std::vector<const Node*>& nodes,
                               NodeDef* target) {
   if (nodes.empty() || target->has_experimental_debug_info()) {
@@ -306,10 +312,10 @@ void PropagateDebugInfoToNode(const string& func,
 }
 }  // namespace
 
-string InlineFunctionBodyOptions::DebugString() const {
+std::string InlineFunctionBodyOptions::DebugString() const {
   const auto true_false = [](bool b) { return b ? "true" : "false"; };
 
-  const auto keep_caller_node_str = [this]() -> string {
+  const auto keep_caller_node_str = [this]() -> std::string {
     switch (keep_caller_node) {
       case KeepCallerNode::kDoNotKeep:
         return "DoNotKeep";
@@ -508,7 +514,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
   // Add a NoOp node for function control inputs/outputs.
   const auto no_op = [&](absl::string_view name) -> Node* {
     Node* node = AddNoOp(absl::StrCat(caller->name(), "/", name), g);
-    const absl::optional<string> device = placer->ControlNodeDevice();
+    const absl::optional<std::string> device = placer->ControlNodeDevice();
     if (device.has_value()) node->set_requested_device(*device);
     return node;
   };
@@ -517,13 +523,13 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
   const auto input_identity = [&](absl::string_view name, Endpoint input,
                                   int index) -> Node* {
     Node* node = AddIdentity(absl::StrCat(caller->name(), "/", name), g, input);
-    const absl::optional<string> device = placer->InputNodeDevice(index);
+    const absl::optional<std::string> device = placer->InputNodeDevice(index);
     if (device.has_value()) node->set_requested_device(*device);
     bool colocate_identity = placer->ColocateInputOutputIdentities();
     if (colocate_identity) {
       node->AddAttr(kColocationAttrName,
-                    std::vector<string>{absl::StrCat(kColocationGroupPrefix,
-                                                     input.node->name())});
+                    std::vector<std::string>{absl::StrCat(
+                        kColocationGroupPrefix, input.node->name())});
     }
     return node;
   };
@@ -532,13 +538,13 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
   const auto output_identity = [&](absl::string_view name, Endpoint input,
                                    int index) -> Node* {
     Node* node = AddIdentity(absl::StrCat(caller->name(), "/", name), g, input);
-    const absl::optional<string> device = placer->OutputNodeDevice(index);
+    const absl::optional<std::string> device = placer->OutputNodeDevice(index);
     if (device.has_value()) node->set_requested_device(*device);
     bool colocate_identity = placer->ColocateInputOutputIdentities();
     if (colocate_identity) {
       node->AddAttr(kColocationAttrName,
-                    std::vector<string>{absl::StrCat(kColocationGroupPrefix,
-                                                     input.node->name())});
+                    std::vector<std::string>{absl::StrCat(
+                        kColocationGroupPrefix, input.node->name())});
     }
     return node;
   };
@@ -597,7 +603,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
   //
   // If 'x' is a node in fbody->graph and its copy in 'g' is 'y', we
   // remember 'y' in node_map[x->id()].
-  std::unordered_set<string> fn_nodes;
+  std::unordered_set<std::string> fn_nodes;
   for (Node* n : fbody->graph->op_nodes()) {
     fn_nodes.insert(n->name());
   }
@@ -606,7 +612,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
     NodeDef ndef = n->def();
 
     // Maybe override requested node device assignment.
-    const absl::optional<string> device = placer->BodyNodeDevice(ndef);
+    const absl::optional<std::string> device = placer->BodyNodeDevice(ndef);
     if (device.has_value()) ndef.set_device(*device);
 
     // Add inlined function name to inlined node debug information.
@@ -617,7 +623,7 @@ absl::Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def,
     //  1) to node name to avoid collisions
     //  2) to frame name to avoid multiple LoopCond nodes in one frame
     //  3) to colocation attribute
-    const string prefix = strings::StrCat(caller->name(), "/");
+    const std::string prefix = absl::StrCat(caller->name(), "/");
     TF_RETURN_IF_ERROR(AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &ndef,
                                                 options.uniquify_frame_names));
 
diff --git a/tensorflow/core/common_runtime/inline_function_utils.h b/tensorflow/core/common_runtime/inline_function_utils.h
index 94c118fe882a20..7ffafe13e5df03 100644
--- a/tensorflow/core/common_runtime/inline_function_utils.h
+++ b/tensorflow/core/common_runtime/inline_function_utils.h
@@ -41,13 +41,16 @@ class InlinedFunctionBodyPlacer {
  public:
   virtual ~InlinedFunctionBodyPlacer() = default;
 
-  virtual absl::optional<string> InputNodeDevice(int input_index) const = 0;
-  virtual absl::optional<string> OutputNodeDevice(int output_index) const = 0;
+  virtual absl::optional<std::string> InputNodeDevice(
+      int input_index) const = 0;
+  virtual absl::optional<std::string> OutputNodeDevice(
+      int output_index) const = 0;
   // Returns true if the added input/output identity nodes should be colocated
   // with the corresponding input/output from the function body.
   virtual bool ColocateInputOutputIdentities() const = 0;
-  virtual absl::optional<string> ControlNodeDevice() const = 0;
-  virtual absl::optional<string> BodyNodeDevice(const NodeDef& ndef) const = 0;
+  virtual absl::optional<std::string> ControlNodeDevice() const = 0;
+  virtual absl::optional<std::string> BodyNodeDevice(
+      const NodeDef& ndef) const = 0;
 
   // LINT.IfChange
   // Place input nodes on the same device as the corresponding caller input
@@ -72,7 +75,7 @@ class InlinedFunctionBodyPlacer {
       const Graph&, const Node&)>;
 
   struct Config {
-    string name;
+    std::string name;
     Factory get;
   };
 
@@ -147,7 +150,7 @@ struct InlineFunctionBodyOptions {
   bool uniquify_frame_names = true;
 
   // A human-readable debug string for this options.
-  string DebugString() const;
+  std::string DebugString() const;
 };
 
 // Returns 'OkStatus()' iff the function '*fbody' can be inlined at 'node'
diff --git a/tensorflow/core/common_runtime/inline_function_utils_test.cc b/tensorflow/core/common_runtime/inline_function_utils_test.cc
index 0d726ade656f21..1e20e6da535a16 100644
--- a/tensorflow/core/common_runtime/inline_function_utils_test.cc
+++ b/tensorflow/core/common_runtime/inline_function_utils_test.cc
@@ -50,7 +50,7 @@ TEST(InlineFunctionBody, ColocationConstraintPropagation) {
           {{"z"},
            "AddV2",
            {"x", "y"},
-           {{"T", DT_FLOAT}, {"_class", std::vector<string>({"loc:@x"})}}},
+           {{"T", DT_FLOAT}, {"_class", std::vector<std::string>({"loc:@x"})}}},
       });
   TF_ASSERT_OK(flib_def.AddFunctionDef(fdef));
 
@@ -98,7 +98,8 @@ TEST(InlineFunctionBody, ColocationConstraintPropagation) {
           // Func/call/input/_0.
           NDef("call/z", "AddV2", {"Func/call/input/_0", "Func/call/input/_1"},
                {{"T", DT_FLOAT},
-                {"_class", std::vector<string>({"loc:@Func/call/input/_0"})}}),
+                {"_class",
+                 std::vector<std::string>({"loc:@Func/call/input/_0"})}}),
           NDef("Func/call/output/_2", "Identity", {"call/z"},
                {{"T", DT_FLOAT}}),
       },
diff --git a/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc b/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc
index 7b0fa4af464fe9..4edf42ff812b8d 100644
--- a/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc
+++ b/tensorflow/core/common_runtime/input_colocation_exemption_registry.cc
@@ -27,7 +27,7 @@ InputColocationExemptionRegistry* InputColocationExemptionRegistry::Global() {
   return registry;
 }
 
-void InputColocationExemptionRegistry::Register(const string& op) {
+void InputColocationExemptionRegistry::Register(const std::string& op) {
   auto it = ops_.find(op);
   if (it != ops_.end()) {
     LOG(WARNING) << "Input colocation exemption for op: " << op
diff --git a/tensorflow/core/common_runtime/input_colocation_exemption_registry.h b/tensorflow/core/common_runtime/input_colocation_exemption_registry.h
index c393fe7498b696..9e4bbc9e77f4af 100644
--- a/tensorflow/core/common_runtime/input_colocation_exemption_registry.h
+++ b/tensorflow/core/common_runtime/input_colocation_exemption_registry.h
@@ -40,20 +40,20 @@ class InputColocationExemptionRegistry {
   static InputColocationExemptionRegistry* Global();
 
   // Returns the set of ops exempt from the input colocation constraints.
-  const gtl::FlatSet<string>& Get() { return ops_; }
+  const gtl::FlatSet<std::string>& Get() { return ops_; }
 
   // Registers an op to be excluded from the input colocation constraints.
-  void Register(const string& op);
+  void Register(const std::string& op);
 
  private:
-  gtl::FlatSet<string> ops_;
+  gtl::FlatSet<std::string> ops_;
 };
 
 namespace input_colocation_exemption_registration {
 
 class InputColocationExemptionRegistration {
  public:
-  explicit InputColocationExemptionRegistration(const string& op) {
+  explicit InputColocationExemptionRegistration(const std::string& op) {
     InputColocationExemptionRegistry::Global()->Register(op);
   }
 };
diff --git a/tensorflow/core/common_runtime/inspecting_placer.cc b/tensorflow/core/common_runtime/inspecting_placer.cc
index 96799bcf1e4be8..816d3dcae487a9 100644
--- a/tensorflow/core/common_runtime/inspecting_placer.cc
+++ b/tensorflow/core/common_runtime/inspecting_placer.cc
@@ -34,21 +34,21 @@ limitations under the License.
 
 namespace tensorflow {
 
-string IOColocationGroups::DebugString() const {
-  std::unordered_map<int, std::vector<string>> group_members;
+std::string IOColocationGroups::DebugString() const {
+  std::unordered_map<int, std::vector<std::string>> group_members;
   for (int arg_index = 0; arg_index < input_groups.size(); ++arg_index) {
     int group_id = input_groups[arg_index];
-    group_members[group_id].push_back(strings::StrCat("i:", arg_index));
+    group_members[group_id].push_back(absl::StrCat("i:", arg_index));
   }
   for (int ret_index = 0; ret_index < output_groups.size(); ++ret_index) {
     int group_id = output_groups[ret_index];
-    group_members[group_id].push_back(strings::StrCat("o:", ret_index));
+    group_members[group_id].push_back(absl::StrCat("o:", ret_index));
   }
 
-  std::vector<string> group_strings;
+  std::vector<std::string> group_strings;
   for (const auto& it : group_members) {
     int group_id = it.first;
-    const std::vector<string>& members = it.second;
+    const std::vector<std::string>& members = it.second;
     const PossibleDevices& devices = group_devices[group_id];
     group_strings.push_back(strings::StrCat(
         "Group(", group_id, " members = [", absl::StrJoin(members, ", "),
@@ -57,11 +57,11 @@ string IOColocationGroups::DebugString() const {
         "\" resource_device_name = \"",
         DeviceNameUtils::ParsedNameToString(devices.resource_device_name),
         "\" device_types = [",
-        absl::StrJoin(
-            devices.device_types, ", ",
-            [](string* out, const std::pair<DeviceType, int32>& type_and_pref) {
-              out->append(DeviceTypeString(type_and_pref.first));
-            }),
+        absl::StrJoin(devices.device_types, ", ",
+                      [](std::string* out,
+                         const std::pair<DeviceType, int32_t>& type_and_pref) {
+                        out->append(DeviceTypeString(type_and_pref.first));
+                      }),
         "])"));
   }
 
diff --git a/tensorflow/core/common_runtime/inspecting_placer.h b/tensorflow/core/common_runtime/inspecting_placer.h
index 90df36c58139fd..27e45dacadad8b 100644
--- a/tensorflow/core/common_runtime/inspecting_placer.h
+++ b/tensorflow/core/common_runtime/inspecting_placer.h
@@ -59,7 +59,7 @@ struct IOColocationGroups {
   // group_devices[i] contains possible devices for group with id i.
   std::vector<PossibleDevices> group_devices;
 
-  string DebugString() const;
+  std::string DebugString() const;
 };
 
 class InspectingPlacer {
diff --git a/tensorflow/core/common_runtime/int32_fulltype.h b/tensorflow/core/common_runtime/int32_fulltype.h
index 1a55e0bc6a1e7c..8e89b0bec2f6d9 100644
--- a/tensorflow/core/common_runtime/int32_fulltype.h
+++ b/tensorflow/core/common_runtime/int32_fulltype.h
@@ -29,7 +29,7 @@ namespace tensorflow {
 class Int32FulltypePass {
  public:
   Int32FulltypePass() = default;
-  explicit Int32FulltypePass(string debug_location)
+  explicit Int32FulltypePass(std::string debug_location)
       : debug_location_(debug_location) {}
 
   // For each node in this graph that outputs int32 tensors, set full
@@ -57,7 +57,7 @@ class Int32FulltypePass {
 
  private:
   // Location of where annotations were added for debug messages.
-  string debug_location_;
+  std::string debug_location_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/int32_fulltype_test.cc b/tensorflow/core/common_runtime/int32_fulltype_test.cc
index 8cfb991cdacd38..ed8587667e9bcc 100644
--- a/tensorflow/core/common_runtime/int32_fulltype_test.cc
+++ b/tensorflow/core/common_runtime/int32_fulltype_test.cc
@@ -96,14 +96,14 @@ class Int32FulltypeTest : public ::testing::Test {
   // Returns the node in "graph" with the given name.
   //
   // REQUIRES: "graph" was produced by the most recent call to BuildGraph.
-  Node* GetNodeByName(const Graph& graph, const string& name) {
+  Node* GetNodeByName(const Graph& graph, const std::string& name) {
     const auto search = nodes_by_name_.find(name);
     CHECK(search != nodes_by_name_.end()) << "Unknown node name: " << name;
     return graph.FindNodeId(search->second);
   }
 
  protected:
-  std::unordered_map<string, int> nodes_by_name_;
+  std::unordered_map<std::string, int> nodes_by_name_;
 
  private:
   void RebuildNodeNameMap(const Graph& graph) {
diff --git a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc
index 5afdc072fcc1ae..be10cd744f35f1 100644
--- a/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc
+++ b/tensorflow/core/common_runtime/isolate_placer_inspection_required_ops_pass_test.cc
@@ -67,11 +67,11 @@ void RunPassAndCompare(const GraphDef& original,
   GraphDef rewritten;
   RunPass(original, &rewritten);
 
-  std::vector<string> errors;
+  std::vector<std::string> errors;
   errors.push_back(absl::StrCat("Graphs did not match.\n  Rewritten graph:\n",
                                 SummarizeGraphDef(rewritten)));
   for (const GraphDef& alternative : expected_alternatives) {
-    string diff;
+    std::string diff;
     bool graphs_equal = EqualGraphDef(rewritten, alternative, &diff);
     if (graphs_equal) {
       return;
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 1e17e24df37677..78f2d219505341 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -45,7 +45,7 @@ namespace tensorflow {
 namespace test {
 
 // TODO(hongm): Convert `g` and `init` to using std::unique_ptr.
-Benchmark::Benchmark(const string& device, Graph* g,
+Benchmark::Benchmark(const std::string& device, Graph* g,
                      const SessionOptions* options, Graph* init,
                      Rendezvous* rendez, const char* executor_type,
                      bool old_benchmark_api) {
@@ -61,7 +61,7 @@ Benchmark::Benchmark(const string& device, Graph* g,
 
   CHECK(!old_benchmark_api) << "Expected new API only";
 
-  string t = absl::AsciiStrToUpper(device);
+  std::string t = absl::AsciiStrToUpper(device);
   // Allow NewDevice to allocate a new threadpool with different number of
   // threads for each new benchmark.
   LocalDevice::set_use_global_threadpool(false);
@@ -121,7 +121,8 @@ Benchmark::Benchmark(const string& device, Graph* g,
   TF_CHECK_OK(NewExecutor(executor_type, params, *g, &exec_));
 }
 
-Benchmark::Benchmark(const string& device, Graph* g, bool old_benchmark_api)
+Benchmark::Benchmark(const std::string& device, Graph* g,
+                     bool old_benchmark_api)
     : Benchmark(device, g, nullptr, nullptr, nullptr, "", old_benchmark_api) {}
 
 Benchmark::~Benchmark() {
@@ -141,14 +142,14 @@ void Benchmark::Run(benchmark::State& state) {
   RunWithRendezvousArgs({}, {}, state);
 }
 
-string GetRendezvousKey(const Node* node) {
-  string send_device;
+std::string GetRendezvousKey(const Node* node) {
+  std::string send_device;
   TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device", &send_device));
-  string recv_device;
+  std::string recv_device;
   TF_CHECK_OK(GetNodeAttr(node->attrs(), "recv_device", &recv_device));
-  string tensor_name;
+  std::string tensor_name;
   TF_CHECK_OK(GetNodeAttr(node->attrs(), "tensor_name", &tensor_name));
-  uint64 send_device_incarnation;
+  uint64_t send_device_incarnation;
   TF_CHECK_OK(
       GetNodeAttr(node->attrs(), "send_device_incarnation",
                   reinterpret_cast<int64_t*>(&send_device_incarnation)));
@@ -157,8 +158,8 @@ string GetRendezvousKey(const Node* node) {
 }
 
 void Benchmark::RunWithRendezvousArgs(
-    const std::vector<std::pair<string, Tensor>>& inputs,
-    const std::vector<string>& outputs, benchmark::State& state) {
+    const std::vector<std::pair<std::string, Tensor>>& inputs,
+    const std::vector<std::string>& outputs, benchmark::State& state) {
   if (!device_ || state.max_iterations == 0) {
     return;
   }
@@ -179,7 +180,7 @@ void Benchmark::RunWithRendezvousArgs(
       TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false));
     }
     TF_CHECK_OK(exec_->Run(args));
-    for (const string& key : outputs) {
+    for (const std::string& key : outputs) {
       Rendezvous::ParsedKey parsed;
       TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed));
       TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead));
@@ -197,7 +198,7 @@ void Benchmark::RunWithRendezvousArgs(
       TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false));
     }
     TF_CHECK_OK(exec_->Run(args));
-    for (const string& key : outputs) {
+    for (const std::string& key : outputs) {
       Rendezvous::ParsedKey parsed;
       TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed));
       TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead));
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index fcab9a65bc586a..a0e5486b96c120 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -48,20 +48,20 @@ class Benchmark {
   //   * In the new API, the timer starts automatically at the first
   //     iteration of the loop and stops after the last iteration.
   // TODO(vyng) Remove this once we have migrated all code to newer API.
-  Benchmark(const string& device, Graph* g,
+  Benchmark(const std::string& device, Graph* g,
             const SessionOptions* options = nullptr, Graph* init = nullptr,
             Rendezvous* rendez = nullptr, const char* executor_type = "",
             bool old_benchmark_api = false);
 
-  Benchmark(const string& device, Graph* g, bool old_benchmark_api);
+  Benchmark(const std::string& device, Graph* g, bool old_benchmark_api);
 
   ~Benchmark();
 
   void Run(benchmark::State& state);
 
   void RunWithRendezvousArgs(
-      const std::vector<std::pair<string, Tensor>>& inputs,
-      const std::vector<string>& outputs, benchmark::State& state);
+      const std::vector<std::pair<std::string, Tensor>>& inputs,
+      const std::vector<std::string>& outputs, benchmark::State& state);
 
  private:
   thread::ThreadPool* pool_ = nullptr;  // Not owned.
@@ -78,7 +78,7 @@ class Benchmark {
 };
 
 // Returns the rendezvous key associated with the given Send/Recv node.
-string GetRendezvousKey(const Node* node);
+std::string GetRendezvousKey(const Node* node);
 
 }  // end namespace test
 }  // end namespace tensorflow
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 63fd2f1b59c223..9997ff2a30c008 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -85,7 +85,7 @@ struct LocalDevice::EigenThreadPoolInfo {
     thread_opts.numa_node = numa_node;
     eigen_worker_threads_.num_threads = intra_op_parallelism_threads;
     eigen_worker_threads_.workers = new thread::ThreadPool(
-        options.env, thread_opts, strings::StrCat("numa_", numa_node, "_Eigen"),
+        options.env, thread_opts, absl::StrCat("numa_", numa_node, "_Eigen"),
         intra_op_parallelism_threads,
         !options.config.experimental().disable_thread_spinning(),
         /*allocator=*/nullptr);
diff --git a/tensorflow/core/common_runtime/lower_case_op.cc b/tensorflow/core/common_runtime/lower_case_op.cc
index 39d1d150fa8a1b..88c169bc4a80d3 100644
--- a/tensorflow/core/common_runtime/lower_case_op.cc
+++ b/tensorflow/core/common_runtime/lower_case_op.cc
@@ -38,7 +38,7 @@ class CaseBuilder {
  public:
   // Create a CaseBuilder to create the lowered form of `case` with branch
   // functions identified by `branch_fn_names` in the `graph`.
-  CaseBuilder(Node* case_op, const std::vector<string>& branch_fn_names,
+  CaseBuilder(Node* case_op, const std::vector<std::string>& branch_fn_names,
               bool keep_node_fetchable, Graph* graph);
 
   // Constructs the basic conditional control flow using switch and merge nodes.
@@ -58,7 +58,7 @@ class CaseBuilder {
  private:
   // Returns unique name containing the name of the Case op being rewritten
   // (name_), infix and a suffix to ensure it is unique within the graph.
-  string NewName(const string& infix);
+  std::string NewName(const std::string& infix);
 
   // Adds input to both the then and else nodes from src:src_output.
   absl::Status AddInput(Node* src, int src_output);
@@ -88,7 +88,7 @@ class CaseBuilder {
   // for the side effects.
   Node* branch_executed_node_;
   Graph* graph_;
-  string name_;
+  std::string name_;
   bool keep_node_fetchable_;
 
   NodeDebugInfo debug_info_;
@@ -96,7 +96,7 @@ class CaseBuilder {
 };
 
 CaseBuilder::CaseBuilder(Node* case_op,
-                         const std::vector<string>& branch_fn_names,
+                         const std::vector<std::string>& branch_fn_names,
                          bool keep_node_fetchable, Graph* graph)
     : case_op_(case_op),
       num_branches_(branch_fn_names.size()),
@@ -106,7 +106,7 @@ CaseBuilder::CaseBuilder(Node* case_op,
       debug_info_(*case_op_) {
   branch_call_builders_.reserve(num_branches_);
   for (int b = 0; b < num_branches_; b++) {
-    branch_call_builders_.emplace_back(NewName(strings::StrCat("branch", b)),
+    branch_call_builders_.emplace_back(NewName(absl::StrCat("branch", b)),
                                        branch_fn_names[b], graph->op_registry(),
                                        &debug_info_);
     branch_call_builders_[b].Device(case_op_->requested_device());
@@ -129,7 +129,7 @@ absl::Status CaseBuilder::CreatePivotNodes() {
   control_predecessor_ = branch_index;
   pivots_.resize(num_branches_, nullptr);
   for (int b = 0; b < num_branches_; b++) {
-    TF_RETURN_IF_ERROR(NodeBuilder(NewName(strings::StrCat("pivot_", b)),
+    TF_RETURN_IF_ERROR(NodeBuilder(NewName(absl::StrCat("pivot_", b)),
                                    "Identity", graph_->op_registry(),
                                    &debug_info_)
                            .Input(branch_index, b)
@@ -139,8 +139,8 @@ absl::Status CaseBuilder::CreatePivotNodes() {
   return absl::OkStatus();
 }
 
-string CaseBuilder::NewName(const string& infix) {
-  return graph_->NewName(strings::StrCat(name_, "/", infix));
+std::string CaseBuilder::NewName(const std::string& infix) {
+  return graph_->NewName(absl::StrCat(name_, "/", infix));
 }
 
 absl::Status CaseBuilder::AddInput(Node* src, int src_output) {
@@ -276,7 +276,7 @@ absl::Status RewriteCaseNode(Node* n, Graph* g, bool keep_node_fetchable) {
   }
 
   int num_branches = branches_attr->list().func_size();
-  std::vector<string> branch_fn_names;
+  std::vector<std::string> branch_fn_names;
   branch_fn_names.reserve(num_branches);
   for (int b = 0; b < num_branches; b++) {
     branch_fn_names.emplace_back(branches_attr->list().func(b).name());
diff --git a/tensorflow/core/common_runtime/lower_case_op_test.cc b/tensorflow/core/common_runtime/lower_case_op_test.cc
index eb5033cd75b000..d460d761fc646d 100644
--- a/tensorflow/core/common_runtime/lower_case_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_case_op_test.cc
@@ -184,8 +184,8 @@ TEST(LowerCaseOpTest, BranchFunctionsWithoutOutputs) {
   using FDH = ::tensorflow::FunctionDefHelper;
 
   // Wrap AssignAddVariable + Const into a function.
-  const auto assign_add = [](const string& fn_name, int v) {
-    const Tensor tensor = test::AsScalar<int32>(v);
+  const auto assign_add = [](const std::string& fn_name, int v) {
+    const Tensor tensor = test::AsScalar<int32_t>(v);
     return FDH::Create(
         fn_name, {"v: resource"}, {}, {},
         {
diff --git a/tensorflow/core/common_runtime/lower_function_call_op_test.cc b/tensorflow/core/common_runtime/lower_function_call_op_test.cc
index d276c7c43abbb7..3a2de9036df433 100644
--- a/tensorflow/core/common_runtime/lower_function_call_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_function_call_op_test.cc
@@ -36,13 +36,13 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-AttrValue FuncAttr(const string& name) {
+AttrValue FuncAttr(const std::string& name) {
   AttrValue attr;
   attr.mutable_func()->set_name(name);
   return attr;
 }
 
-AttrValue FuncAttr(const string& name, const DataType type) {
+AttrValue FuncAttr(const std::string& name, const DataType type) {
   AttrValue attr;
   attr.mutable_func()->set_name(name);
   (*attr.mutable_func()->mutable_attr())["T"].set_type(type);
diff --git a/tensorflow/core/common_runtime/lower_functional_ops.cc b/tensorflow/core/common_runtime/lower_functional_ops.cc
index 49885ba8129e8e..a2c2b6986a5e8b 100644
--- a/tensorflow/core/common_runtime/lower_functional_ops.cc
+++ b/tensorflow/core/common_runtime/lower_functional_ops.cc
@@ -52,7 +52,7 @@ bool CheckBoolAttr(const Node* n, absl::string_view attr_name) {
 
 // Checks if string attribute is defined and it's not empty.
 bool CheckStringAttr(const Node* n, absl::string_view attr_name) {
-  string match;
+  std::string match;
   bool found = TryGetNodeAttr(n->attrs(), attr_name, &match);
   return found && !match.empty();
 }
diff --git a/tensorflow/core/common_runtime/lower_functional_ops_test.cc b/tensorflow/core/common_runtime/lower_functional_ops_test.cc
index 2f16c6fef7e308..2d47ac5d70bd3c 100644
--- a/tensorflow/core/common_runtime/lower_functional_ops_test.cc
+++ b/tensorflow/core/common_runtime/lower_functional_ops_test.cc
@@ -66,7 +66,7 @@ absl::Status Rewrite(std::unique_ptr<Graph>* graph) {
 
 // (counter:int32, pred:bool, x:int32) -> counter < N
 FunctionDef WhileWithIfCond(int32_t N) {
-  const Tensor kN = test::AsScalar<int32>(N);
+  const Tensor kN = test::AsScalar<int32_t>(N);
   return FDH::Define(
       // Name
       "WhileWithIfCond",
@@ -90,7 +90,7 @@ FunctionDef WhileWithIfBody() {
   then_func.set_name("XTimesTwo");
   NameAttrList else_func;
   else_func.set_name("XTimesFour");
-  const Tensor kOne = test::AsScalar<int32>(1);
+  const Tensor kOne = test::AsScalar<int32_t>(1);
   std::vector<DataType> input_types = {DT_INT32};
   std::vector<DataType> output_types = {DT_INT32};
   return FDH::Define(
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index e46ef4ff3de543..01beef8fc2328d 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -60,7 +60,7 @@ class CondBuilder {
  private:
   // Returns unique name containing the name of the If op being rewritten
   // (name_), infix and a suffix to ensure it is unique within the graph.
-  string NewName(const string& infix);
+  std::string NewName(const std::string& infix);
 
   // Adds input to both the then and else nodes from src:src_output.
   absl::Status AddInput(Node* src, int src_output);
@@ -102,7 +102,7 @@ class CondBuilder {
   // executed for the side effects.
   Node* branch_executed_node_;
   Graph* graph_;
-  string name_;
+  std::string name_;
   bool keep_node_fetchable_;
 
   NodeDebugInfo debug_info_;
@@ -172,8 +172,8 @@ absl::Status CondBuilder::CreatePivotNodes() {
   return absl::OkStatus();
 }
 
-string CondBuilder::NewName(const string& infix) {
-  return graph_->NewName(strings::StrCat(name_, "/", infix));
+std::string CondBuilder::NewName(const std::string& infix) {
+  return graph_->NewName(absl::StrCat(name_, "/", infix));
 }
 
 absl::Status CondBuilder::AddInput(Node* src, int src_output) {
diff --git a/tensorflow/core/common_runtime/lower_if_op_test.cc b/tensorflow/core/common_runtime/lower_if_op_test.cc
index 91bddb27b452be..68c55d27d16433 100644
--- a/tensorflow/core/common_runtime/lower_if_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_if_op_test.cc
@@ -35,7 +35,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-AttrValue FuncAttr(const string& name) {
+AttrValue FuncAttr(const std::string& name) {
   AttrValue attr;
   attr.mutable_func()->set_name(name);
   return attr;
@@ -153,8 +153,8 @@ TEST(LowerIfOpTest, BranchFunctionsWithoutOutputs) {
   using FDH = ::tensorflow::FunctionDefHelper;
 
   // Wrap AssignAddVariable + Const into a function.
-  const auto assign_add = [](const string& fn_name, int v) {
-    const Tensor tensor = test::AsScalar<int32>(v);
+  const auto assign_add = [](const std::string& fn_name, int v) {
+    const Tensor tensor = test::AsScalar<int32_t>(v);
     return FDH::Create(
         fn_name, {"v: resource"}, {}, {},
         {
diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc
index 8a8c3c075dd235..84f03444a93972 100644
--- a/tensorflow/core/common_runtime/lower_while_op.cc
+++ b/tensorflow/core/common_runtime/lower_while_op.cc
@@ -132,7 +132,7 @@ class LowerWhileHelper {
 
   // Returns unique name containing the name of the While op being rewritten
   // (name_), infix and a suffix to ensure it is unique within the graph.
-  string NewName(const string& infix);
+  std::string NewName(const std::string& infix);
 
   // Returns true if the input at index is a resource and the same resource is
   // returned as an output.
@@ -156,7 +156,7 @@ class LowerWhileHelper {
   Graph* graph_;
   const FunctionLibraryDefinition* flib_def_;
   // Name of the `while_op_`.
-  string name_;
+  std::string name_;
   // Max number of parallel_iterations for the while loop.
   const int parallel_iterations_;
   bool keep_node_fetchable_;
@@ -363,15 +363,15 @@ absl::Status LowerWhileHelper::CreateSwitchNodes() {
     if (IsLoopCarriedResource(i)) {
       continue;
     }
-    string op_name;
+    std::string op_name;
     {
       const Node* input_node;
       TF_RETURN_IF_ERROR(while_op_->input_node(i, &input_node));
-      op_name = strings::StrCat(input_node->name(), "_switch");
+      op_name = absl::StrCat(input_node->name(), "_switch");
     }
     Node* merge_node = merge_nodes_[op_input_output_to_lowered_node_[i]];
     Node* switch_node;
-    string op_type = "Switch";
+    std::string op_type = "Switch";
     if (IsRefType(merge_node->output_type(0))) {
       op_type = "RefSwitch";
     }
@@ -413,7 +413,7 @@ absl::Status LowerWhileHelper::CreateBodyFuncCallNode() {
   // node is not the first one to be ready? Can we speed that case up using some
   // sort of multi-input Merge?
   Node* body_control_node_;
-  string op_type = "Identity";
+  std::string op_type = "Identity";
   if (IsRefType(switch_nodes_[0]->output_type(1))) {
     op_type = "RefIdentity";
   }
@@ -569,8 +569,8 @@ absl::Status LowerWhileHelper::UpdateConsumers() {
   return absl::OkStatus();
 }
 
-string LowerWhileHelper::NewName(const string& infix) {
-  return graph_->NewName(strings::StrCat(name_, "/", infix));
+std::string LowerWhileHelper::NewName(const std::string& infix) {
+  return graph_->NewName(absl::StrCat(name_, "/", infix));
 }
 
 bool LowerWhileHelper::IsLoopCarriedResource(int index) {
diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc
index 4fe9337c942766..eb19c84c04dd44 100644
--- a/tensorflow/core/common_runtime/lower_while_op_test.cc
+++ b/tensorflow/core/common_runtime/lower_while_op_test.cc
@@ -253,7 +253,8 @@ TEST(LowerWhileOpTest, ForwardAssignedInputDevice) {
   TF_CHECK_OK(NodeBuilder("placed_node", "Placeholder")
                   .Attr("dtype", type)
                   .Finalize(graph.get(), &placeholder));
-  const string assigned_device_name = "/job:localhost/replica:0/task:0/gpu:0";
+  const std::string assigned_device_name =
+      "/job:localhost/replica:0/task:0/gpu:0";
   placeholder->set_assigned_device_name(assigned_device_name);
   Node* while_node;
   std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(placeholder)});
@@ -343,11 +344,11 @@ TEST(LowerWhileOpTest, ForwardRequestedInputDevice) {
   TF_ASSERT_OK(graph->AddFunctionLibrary(f_lib_proto));
   auto type = DT_FLOAT;
   // We will place the loop var on the gpu:0.
-  const string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0";
+  const std::string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0";
   // We will place loop's control input on the gpu:1.
-  const string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1";
+  const std::string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1";
   // We will place While op on gpu:2.
-  const string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2";
+  const std::string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2";
   Node* gpu_0_ph;
   TF_CHECK_OK(NodeBuilder("placed_node", "Placeholder")
                   .Attr("dtype", type)
@@ -483,11 +484,11 @@ TEST(LowerWhileOpTest, ForwardColocationKeyAttribute) {
   TF_ASSERT_OK(graph->AddFunctionLibrary(f_lib_proto));
   auto type = DT_FLOAT;
   // We will place the loop var on the gpu:0.
-  const string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0";
+  const std::string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0";
   // We will place loop's control input on the gpu:1.
-  const string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1";
+  const std::string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1";
   // We will place While op on gpu:2.
-  const string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2";
+  const std::string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2";
   Node* gpu_0_ph;
   AttrValue gpu_0_colocation_attr;
   gpu_0_colocation_attr.mutable_list()->add_s("loc@:some_op_on_gpu_0_device");
diff --git a/tensorflow/core/common_runtime/memory_types.cc b/tensorflow/core/common_runtime/memory_types.cc
index d22d72f1a57019..216fdfd6d239c4 100644
--- a/tensorflow/core/common_runtime/memory_types.cc
+++ b/tensorflow/core/common_runtime/memory_types.cc
@@ -34,14 +34,14 @@ struct Endpoint {
 };
 
 struct EndpointHash {
-  uint32 operator()(const Endpoint& x) const {
+  uint32_t operator()(const Endpoint& x) const {
     return Hash32(reinterpret_cast<const char*>(&x.node_id), sizeof(int),
                   x.output_index);
   }
 };
 
 struct EndpointEq {
-  uint32 operator()(const Endpoint& x, const Endpoint& y) const {
+  uint32_t operator()(const Endpoint& x, const Endpoint& y) const {
     return (x.node_id == y.node_id) && (x.output_index == y.output_index);
   }
 };
@@ -116,14 +116,14 @@ absl::Status ValidateMemoryTypes(const DeviceType& device_type,
 // within this process. That is sufficient because EnsureMemoryTypes
 // is only used on a TensorFlow graph that is gonna to be executed in
 // a single tf device (hence within a single process).
-static string GetTensorName(const Edge* edge) {
+static std::string GetTensorName(const Edge* edge) {
   static std::atomic<int64_t> counter(0);
-  return strings::StrCat("memtype_", counter.fetch_add(1), "_",
-                         edge->src()->name());
+  return absl::StrCat("memtype_", counter.fetch_add(1), "_",
+                      edge->src()->name());
 }
 
-static Node* Send(Graph* g, const string& tensor_name,
-                  const string& device_name, bool host, const Edge* edge) {
+static Node* Send(Graph* g, const std::string& tensor_name,
+                  const std::string& device_name, bool host, const Edge* edge) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), host ? "_HostSend" : "_Send")
                   .Input(edge->src(), edge->src_output())
@@ -138,8 +138,8 @@ static Node* Send(Graph* g, const string& tensor_name,
   return ret;
 }
 
-static Node* Recv(Graph* g, const string& tensor_name,
-                  const string& device_name, bool host, const Edge* edge) {
+static Node* Recv(Graph* g, const std::string& tensor_name,
+                  const std::string& device_name, bool host, const Edge* edge) {
   Node* ret;
   TF_CHECK_OK(
       NodeBuilder(g->NewName("n"), host ? "_HostRecv" : "_Recv")
@@ -156,7 +156,7 @@ static Node* Recv(Graph* g, const string& tensor_name,
 }
 
 absl::Status EnsureMemoryTypes(const DeviceType& device_type,
-                               const string& device_name, Graph* g) {
+                               const std::string& device_name, Graph* g) {
   struct Item {
     const Edge* edge;
     MemoryType sm;
@@ -191,7 +191,7 @@ absl::Status EnsureMemoryTypes(const DeviceType& device_type,
       Endpoint key{e->src()->id(), e->src_output()};
       auto iter = recv_nodes.find(key);
       if (iter == recv_nodes.end()) {
-        const string tensor_name = GetTensorName(e);
+        const std::string tensor_name = GetTensorName(e);
         Node* send =
             Send(g, tensor_name, device_name, (item.sm == HOST_MEMORY), e);
         recv = Recv(g, tensor_name, device_name, (item.dm == HOST_MEMORY), e);
diff --git a/tensorflow/core/common_runtime/memory_types.h b/tensorflow/core/common_runtime/memory_types.h
index 46a943c0a3836e..bbadfe24e156c8 100644
--- a/tensorflow/core/common_runtime/memory_types.h
+++ b/tensorflow/core/common_runtime/memory_types.h
@@ -36,7 +36,7 @@ absl::Status ValidateMemoryTypes(const DeviceType& device_type, const Graph* g);
 // be OK). Otherwise, returns an error and '*g' may be in an
 // invalidate state and the caller should discard it.
 absl::Status EnsureMemoryTypes(const DeviceType& device_type,
-                               const string& device_name, Graph* g);
+                               const std::string& device_name, Graph* g);
 
 // Get the memory type for 'index'th output of node 'n' in graph 'g', when
 // running on 'device_type'.
diff --git a/tensorflow/core/common_runtime/memory_types_test.cc b/tensorflow/core/common_runtime/memory_types_test.cc
index 26f414c14204ce..0be98557679406 100644
--- a/tensorflow/core/common_runtime/memory_types_test.cc
+++ b/tensorflow/core/common_runtime/memory_types_test.cc
@@ -30,7 +30,7 @@ namespace tensorflow {
 TEST(MemoryTypeChecker, Int32OK) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor v(DT_INT32, {});
-  v.scalar<int32>().setZero();
+  v.scalar<int32_t>().setZero();
   auto in0 = test::graph::Constant(g, v);
   auto in1 = test::graph::Constant(g, v);
   test::graph::Add(g, in0, in1);
@@ -45,7 +45,7 @@ TEST(MemoryTypeChecker, Int32OK) {
 TEST(MemoryTypeChecker, Int32NotOk) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor v(DT_INT32, {});
-  v.scalar<int32>().setZero();
+  v.scalar<int32_t>().setZero();
   auto x = test::graph::Constant(g, v);
   test::graph::Cast(g, x, DT_FLOAT);
   TF_EXPECT_OK(ValidateMemoryTypes(DEVICE_CPU, g));

From 220e7dd0b68ff1ee58db7d15041ec0f86edc4162 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:32:03 -0800
Subject: [PATCH 385/753] Automated Code Change

PiperOrigin-RevId: 845633049
---
 third_party/xla/xla/stream_executor/scratch_allocator.h     | 4 ++--
 third_party/xla/xla/stream_executor/stream_executor_test.cc | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/scratch_allocator.h b/third_party/xla/xla/stream_executor/scratch_allocator.h
index 0c50a4686dc1dc..86830d2c01a229 100644
--- a/third_party/xla/xla/stream_executor/scratch_allocator.h
+++ b/third_party/xla/xla/stream_executor/scratch_allocator.h
@@ -72,7 +72,7 @@ class OwningScratchAllocator : public ScratchAllocator {
 
   absl::StatusOr<DeviceAddress<uint8_t>> AllocateBytes(
       int64_t byte_size) override {
-    TF_ASSIGN_OR_RETURN(OwningDeviceAddress buffer,
+    TF_ASSIGN_OR_RETURN(ScopedDeviceAddress<uint8_t> buffer,
                         allocator_->Allocate(device_ordinal_, byte_size,
                                              /*retry_on_failure=*/false));
     buffers_.push_back(std::move(buffer));
@@ -82,7 +82,7 @@ class OwningScratchAllocator : public ScratchAllocator {
  private:
   int device_ordinal_;
   DeviceAddressAllocator* allocator_;
-  absl::InlinedVector<OwningDeviceAddress, N> buffers_;
+  absl::InlinedVector<ScopedDeviceAddress<uint8_t>, N> buffers_;
 };
 
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/stream_executor_test.cc b/third_party/xla/xla/stream_executor/stream_executor_test.cc
index 98c34e3ff6a1c4..74f6421952b8cd 100644
--- a/third_party/xla/xla/stream_executor/stream_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/stream_executor_test.cc
@@ -42,8 +42,8 @@ static absl::StatusOr<StreamExecutor*> NewStreamExecutor() {
 TEST(StreamExecutorTest, HostMemoryAllocate) {
   TF_ASSERT_OK_AND_ASSIGN(auto executor, NewStreamExecutor());
   TF_ASSERT_OK_AND_ASSIGN(auto allocation, executor->HostMemoryAllocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
 }
 
 TEST(StreamExecutorTest, GetOrCreateResource) {

From a05dfbfc3226832f26f24daecaf46412aac182ce Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 17 Dec 2025 10:06:38 +0000
Subject: [PATCH 386/753] [mlir][tosa] Use `getTosaConstShape` implementation
 from MLIR (#106002)

MLIR `ConversionUtils.h` defines an implementation of
`getTosaConstShape`. In some cases here, but not all,
the implementation in `legalization_utils` was being used
instead. This commit removes the implementation in
`legalization_utils` and updates the relevant call sites
to use the MLIR version.

Change-Id: I4595c5294e26f1d67b398446025c624d88e0ce18
---
 .../compiler/mlir/tosa/transforms/legalize_common.cc     | 4 ++--
 tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc  | 2 +-
 tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc | 2 +-
 .../compiler/mlir/tosa/transforms/legalize_utils.cc      | 9 ---------
 .../compiler/mlir/tosa/transforms/legalize_utils.h       | 5 -----
 5 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc
index 803061fe56adaf..a2aaa3b905f87f 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc
@@ -4774,7 +4774,7 @@ std::optional<Value> convertOneHotOp(PatternRewriter& rewriter, Operation* op,
       tensorflow::GetTypeFromTFTensorShape({N, W, C},
                                            on_value_type.getElementType()),
       op1_reshape_on_value.getResult(),
-      getTosaConstShape(rewriter, op, {N, W, C}));
+      getTosaConstShape(rewriter, op->getLoc(), {N, W, C}));
 
   // Reshape off_value to [1, 1, 1]
   auto op3_reshape_off_value = CreateOpAndInfer<tosa::ReshapeOp>(
@@ -4789,7 +4789,7 @@ std::optional<Value> convertOneHotOp(PatternRewriter& rewriter, Operation* op,
       tensorflow::GetTypeFromTFTensorShape({N, K, C},
                                            on_value_type.getElementType()),
       op3_reshape_off_value.getResult(),
-      getTosaConstShape(rewriter, op, {N, K, C}));
+      getTosaConstShape(rewriter, op->getLoc(), {N, K, C}));
 
   // Reshape indices to [N, W]
   shape_value =
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc
index 9d227f75bad616..43a22266bcb0c6 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc
@@ -1558,7 +1558,7 @@ LogicalResult ConvertTFTileOp::matchAndRewrite(
     multiples_vals.push_back(
         multiples_elems.getValues<IntegerAttr>()[i].getInt());
 
-  auto multiples = getTosaConstShape(rewriter, op, multiples_vals);
+  auto multiples = getTosaConstShape(rewriter, op->getLoc(), multiples_vals);
 
   CreateReplaceOpAndInfer<tosa::TileOp>(rewriter, op, output_type,
                                         tf_tile_op.getInput(), multiples);
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
index c7e49c5703fcd7..d9c20cb9ab67b7 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
@@ -3009,7 +3009,7 @@ LogicalResult ConvertTFLTileOp::matchAndRewrite(
     multiples_vals.push_back(
         multiples_elems.getValues<APInt>()[i].getSExtValue());
 
-  auto multiples = getTosaConstShape(rewriter, op, multiples_vals);
+  auto multiples = getTosaConstShape(rewriter, op->getLoc(), multiples_vals);
 
   CreateReplaceOpAndInfer<tosa::TileOp>(rewriter, op, output_type,
                                         tfl_tile_op.getInput(), multiples);
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
index b1bde08cf929eb..dcfff41af1f1d7 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc
@@ -991,15 +991,6 @@ Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type,
   return const_op.getResult();
 }
 
-Value getTosaConstShape(PatternRewriter& rewriter, Operation* op,
-                        llvm::ArrayRef<int64_t> values) {
-  auto attr = rewriter.getIndexTensorAttr(values);
-  auto type =
-      tosa::shapeType::get(rewriter.getContext(), /* rank = */ values.size());
-  return CreateOpAndInfer<tosa::ConstShapeOp>(rewriter, op->getLoc(), type,
-                                              attr);
-}
-
 // Create a vector from a 32-bit value tensor.  Returns the size of
 // the new vector or -1 on error.
 // Populate a int32_t vector from a val tensor
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h
index 20908312f40718..b22db1b0963278 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h
@@ -144,11 +144,6 @@ Value getTosaConstTensorSingleI32(PatternRewriter& rewriter, Operation* op,
 Value getTosaConstTensorScalarInt(ImplicitLocOpBuilder& builder, Type type,
                                   int64_t val, int rank);
 
-// Create a tosa::ConstShape based on the specified values
-Value getTosaConstShape(PatternRewriter& rewriter, Operation* op,
-                        llvm::ArrayRef<int64_t> values);
-
-
 // Populate a int32_t vector from a val tensor
 // return failure if val is not a constant value
 // return success otherwise

From 8e0f4b64e6075ffd271a84423cbd539d0b82f6a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:36:01 -0800
Subject: [PATCH 387/753] Automated Code Change

PiperOrigin-RevId: 845634459
---
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc     | 23 +++++++++++--------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
index 49f90941489596..f839c61ed1a304 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -212,8 +212,8 @@ TEST_F(PjrtCApiGpuBufferTest, CopyRawToHost) {
   args.struct_size = PJRT_Buffer_CopyRawToHost_Args_STRUCT_SIZE;
   args.extension_start = nullptr;
   args.buffer = buffer.get();
-  args.dst =
-      tsl::port::AlignedMalloc(size, tsl::Allocator::kAllocatorAlignment);
+  args.dst = tsl::port::AlignedMalloc(
+      size, static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
   args.offset = 0;
   args.transfer_size = size;
   PJRT_Error* error = api_->PJRT_Buffer_CopyRawToHost(&args);
@@ -221,8 +221,9 @@ TEST_F(PjrtCApiGpuBufferTest, CopyRawToHost) {
   xla::Future<> copy_to_host_event = ConvertCEventToCppFuture(args.event, api_);
   TF_EXPECT_OK(copy_to_host_event.Await());
   EXPECT_EQ(*(static_cast<float*>(args.dst)), 41);
-  tsl::port::AlignedSizedFree(args.dst, tsl::Allocator::kAllocatorAlignment,
-                              size);
+  tsl::port::AlignedSizedFree(
+      args.dst, size,
+      static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 }
 
 TEST_F(PjrtCApiGpuBufferTest, CopyRawToHostWithInvalidOffset) {
@@ -231,8 +232,8 @@ TEST_F(PjrtCApiGpuBufferTest, CopyRawToHostWithInvalidOffset) {
   args.struct_size = PJRT_Buffer_CopyRawToHost_Args_STRUCT_SIZE;
   args.extension_start = nullptr;
   args.buffer = buffer_.get();
-  args.dst =
-      tsl::port::AlignedMalloc(size, tsl::Allocator::kAllocatorAlignment);
+  args.dst = tsl::port::AlignedMalloc(
+      size, static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
   args.offset = size + 1;  // offset is invalid
   args.transfer_size = size;
   PJRT_Error* error = api_->PJRT_Buffer_CopyRawToHost(&args);
@@ -376,10 +377,12 @@ TEST_F(PjrtCApiGpuTest, CreateAndDestroyExecuteContext) {
 TEST_F(PjrtCApiGpuTest, DmaMapAndUnmap) {
   size_t dma_size = 1024 * 1024;
   size_t alignment = 1024 * 1024;
-  void* host_dma_ptr = tsl::port::AlignedMalloc(dma_size, alignment);
+  void* host_dma_ptr = tsl::port::AlignedMalloc(
+      dma_size, static_cast<std::align_val_t>(alignment));
   auto host_dma_ptr_deleter =
       absl::Cleanup([host_dma_ptr, dma_size, alignment] {
-        tsl::port::AlignedSizedFree(host_dma_ptr, alignment, dma_size);
+        tsl::port::AlignedSizedFree(host_dma_ptr, dma_size,
+                                    static_cast<std::align_val_t>(alignment));
       });
 
   PJRT_Client_DmaMap_Args dma_args;
@@ -468,7 +471,7 @@ TEST_F(PjrtCApiGpuTransferManagerTest, SetBufferError) {
           &set_buffer_error_args);
   ASSERT_EQ(set_buffer_error_error, nullptr);
 
-  EXPECT_THAT(buffer_out->buffer->ToLiteralSync(),
+  EXPECT_THAT(buffer_out->buffer->ToLiteral().Await(),
               absl_testing::StatusIs(absl::StatusCode::kInternal,
                                      HasSubstr(error_message)));
 
@@ -538,7 +541,7 @@ TEST_F(PjrtCApiGpuTransferManagerTest, TransferRawDataToBufferIsSuccessful) {
       transfer_args.done_with_h2d_transfer, MakeEventDeleter(api_));
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> literal,
-                          buffer_out->buffer->ToLiteralSync());
+                          buffer_out->buffer->ToLiteral().Await());
   EXPECT_EQ(literal->element_count(), 8);
   EXPECT_THAT(literal->data<uint32_t>(), ElementsAreArray(data));
 

From 79e54e1d40fd40cd174b21b7307cd14a82706239 Mon Sep 17 00:00:00 2001
From: Emilio Cota <ecg@google.com>
Date: Wed, 17 Dec 2025 00:41:22 -0800
Subject: [PATCH 388/753] [xla:codegen] deduplicate LowerToLLVM passes

Follow-up work to "349a1d4e00c: [xla:codegen] split LowerToLLVMPass
into CPU and GPU versions".

PiperOrigin-RevId: 845636364
---
 .../xla/xla/codegen/emitters/transforms/BUILD |  30 +++--
 .../transforms/lower_to_llvm_common.cc        |  95 ++++++++++++++
 .../transforms/lower_to_llvm_common.h         |  40 ++++++
 .../emitters/transforms/lower_to_llvm_cpu.cc  |  66 +---------
 .../emitters/transforms/lower_to_llvm_gpu.cc  | 117 ++++++------------
 .../emitters/transforms/lower_to_llvm_gpu.h   |   1 -
 6 files changed, 203 insertions(+), 146 deletions(-)
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.cc
 create mode 100644 third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.h

diff --git a/third_party/xla/xla/codegen/emitters/transforms/BUILD b/third_party/xla/xla/codegen/emitters/transforms/BUILD
index 9e07eef2ba0f25..e39d3565a5edbf 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/BUILD
+++ b/third_party/xla/xla/codegen/emitters/transforms/BUILD
@@ -43,31 +43,46 @@ gentbl_cc_library(
 )
 
 cc_library(
-    name = "lower_to_llvm_cpu_pass",
-    srcs = ["lower_to_llvm_cpu.cc"],
-    hdrs = ["lower_to_llvm_cpu.h"],
+    name = "lower_to_llvm_common",
+    srcs = ["lower_to_llvm_common.cc"],
+    hdrs = ["lower_to_llvm_common.h"],
     deps = [
-        ":lower_to_llvm_cpu_inc_gen",
+        "@com_google_absl//absl/functional:function_ref",
         "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:ArithToLLVM",
         "@llvm-project//mlir:ArithTransforms",
         "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ComplexToLLVM",
         "@llvm-project//mlir:ControlFlowToLLVM",
+        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncToLLVM",
+        "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMCommonConversion",
         "@llvm-project//mlir:MathDialect",
         "@llvm-project//mlir:MathToLLVM",
         "@llvm-project//mlir:MemRefToLLVM",
-        "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:SCFToControlFlow",
+        "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:UBToLLVM",
         "@llvm-project//mlir:VectorToLLVM",
     ],
 )
 
+cc_library(
+    name = "lower_to_llvm_cpu_pass",
+    srcs = ["lower_to_llvm_cpu.cc"],
+    hdrs = ["lower_to_llvm_cpu.h"],
+    deps = [
+        ":lower_to_llvm_common",
+        ":lower_to_llvm_cpu_inc_gen",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:Support",
+    ],
+)
+
 gentbl_cc_library(
     name = "lower_to_llvm_gpu_inc_gen",
     compatible_with = get_compatible_with_portable(),
@@ -86,6 +101,7 @@ cc_library(
     srcs = ["lower_to_llvm_gpu.cc"],
     hdrs = ["lower_to_llvm_gpu.h"],
     deps = [
+        ":lower_to_llvm_common",
         ":lower_to_llvm_gpu_inc_gen",
         "//xla/codegen:device_spec",
         "//xla/stream_executor:device_description",
@@ -95,13 +111,10 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AMDGPUUtils",
         "@llvm-project//mlir:AffineToStandard",
-        "@llvm-project//mlir:ArithDialect",
         "@llvm-project//mlir:ArithToLLVM",
         "@llvm-project//mlir:ArithTransforms",
-        "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:ComplexToLLVM",
         "@llvm-project//mlir:ControlFlowToLLVM",
-        "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncToLLVM",
         "@llvm-project//mlir:GPUToLLVMSPVTransforms",
@@ -110,7 +123,6 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LLVMCommonConversion",
         "@llvm-project//mlir:LLVMDialect",
-        "@llvm-project//mlir:MathDialect",
         "@llvm-project//mlir:MathToLLVM",
         "@llvm-project//mlir:MemRefToLLVM",
         "@llvm-project//mlir:NVVMDialect",
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.cc
new file mode 100644
index 00000000000000..bf1174b2eb930a
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.cc
@@ -0,0 +1,95 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/codegen/emitters/transforms/lower_to_llvm_common.h"
+
+#include <utility>
+
+#include "absl/functional/function_ref.h"
+#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
+#include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
+#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
+#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
+#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
+#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
+#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Transforms/Passes.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace xla {
+namespace emitters {
+
+mlir::LogicalResult LowerToLLVM(
+    mlir::ModuleOp op,
+    absl::FunctionRef<mlir::LogicalResult(mlir::LLVMTypeConverter&,
+                                          mlir::RewritePatternSet&,
+                                          mlir::ConversionTarget&)>
+        populate_platform_patterns) {
+  // Populate type conversions.
+  mlir::LowerToLLVMOptions llvm_opts(op.getContext(), mlir::DataLayout(op));
+  mlir::LLVMTypeConverter type_converter(op.getContext(), llvm_opts);
+  mlir::LLVMConversionTarget target(*op.getContext());
+
+  // Populate patterns.
+  mlir::RewritePatternSet patterns(op.getContext());
+  mlir::arith::populateArithExpandOpsPatterns(patterns);
+  mlir::arith::populateArithToLLVMConversionPatterns(type_converter, patterns);
+  if (mlir::failed(
+          populate_platform_patterns(type_converter, patterns, target))) {
+    return mlir::failure();
+  }
+  mlir::populateFuncToLLVMConversionPatterns(type_converter, patterns);
+  mlir::populateFinalizeMemRefToLLVMConversionPatterns(type_converter,
+                                                       patterns);
+  mlir::ub::populateUBToLLVMConversionPatterns(type_converter, patterns);
+  mlir::populateVectorToLLVMConversionPatterns(type_converter, patterns);
+  mlir::cf::populateControlFlowToLLVMConversionPatterns(type_converter,
+                                                        patterns);
+  mlir::populateComplexToLLVMConversionPatterns(type_converter, patterns);
+
+  //  Set up target.
+  target.addIllegalDialect<mlir::arith::ArithDialect, mlir::func::FuncDialect,
+                           mlir::complex::ComplexDialect>();
+  target.addLegalOp<mlir::ModuleOp>();
+
+  if (mlir::failed(applyPartialConversion(op, target, std::move(patterns)))) {
+    return mlir::failure();
+  }
+
+  // Clean up any leftover math ops.
+  mlir::RewritePatternSet mathPatterns(op.getContext());
+  mlir::populateMathToLLVMConversionPatterns(type_converter, mathPatterns,
+                                             /*approximateLog1p=*/false);
+  target.addIllegalDialect<mlir::math::MathDialect>();
+
+  if (mlir::failed(applyFullConversion(op, target, std::move(mathPatterns)))) {
+    return mlir::failure();
+  }
+  return mlir::success();
+}
+
+}  // namespace emitters
+}  // namespace xla
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.h b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.h
new file mode 100644
index 00000000000000..5aec57c5e65cc6
--- /dev/null
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_common.h
@@ -0,0 +1,40 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_COMMON_H_
+#define XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_COMMON_H_
+
+#include "absl/functional/function_ref.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Transforms/DialectConversion.h"
+
+namespace xla {
+namespace emitters {
+
+mlir::LogicalResult LowerToLLVM(
+    mlir::ModuleOp op,
+    absl::FunctionRef<mlir::LogicalResult(mlir::LLVMTypeConverter&,
+                                          mlir::RewritePatternSet&,
+                                          mlir::ConversionTarget&)>
+        populate_platform_patterns =
+            [](mlir::LLVMTypeConverter&, mlir::RewritePatternSet&,
+               mlir::ConversionTarget&) { return mlir::success(); });
+
+}  // namespace emitters
+}  // namespace xla
+
+#endif  // XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_COMMON_H_
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc
index d98dc0cd8e7055..cbe400674ea2e5 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_cpu.cc
@@ -13,27 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "xla/codegen/emitters/transforms/lower_to_llvm_cpu.h"
+
 #include <memory>
-#include <utility>
 
-#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
-#include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h"
-#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
-#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
-#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
-#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
-#include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
-#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
-#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
-#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
-#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"  // IWYU pragma: keep, needed by lower_to_llvm_cpu.h.inc.
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Support/LLVM.h"
+#include "xla/codegen/emitters/transforms/lower_to_llvm_common.h"
 
 namespace xla {
 namespace emitters {
@@ -45,49 +32,8 @@ namespace {
 class LowerToLLVMCPUPass
     : public impl::LowerToLLVMCPUPassBase<LowerToLLVMCPUPass> {
  public:
-  LowerToLLVMCPUPass() : LowerToLLVMCPUPassBase() {}
-
   void runOnOperation() override {
-    // Populate type conversions.
-    mlir::LowerToLLVMOptions llvm_opts(&getContext(),
-                                       mlir::DataLayout(getOperation()));
-    mlir::LLVMTypeConverter type_converter(getOperation().getContext(),
-                                           llvm_opts);
-    mlir::LLVMConversionTarget target(*getOperation().getContext());
-
-    // Populate patterns.
-    mlir::RewritePatternSet patterns(&getContext());
-    mlir::arith::populateArithExpandOpsPatterns(patterns);
-    mlir::arith::populateArithToLLVMConversionPatterns(type_converter,
-                                                       patterns);
-    mlir::populateFuncToLLVMConversionPatterns(type_converter, patterns);
-    mlir::populateFinalizeMemRefToLLVMConversionPatterns(type_converter,
-                                                         patterns);
-    mlir::ub::populateUBToLLVMConversionPatterns(type_converter, patterns);
-    mlir::populateVectorToLLVMConversionPatterns(type_converter, patterns);
-    mlir::cf::populateControlFlowToLLVMConversionPatterns(type_converter,
-                                                          patterns);
-    mlir::populateComplexToLLVMConversionPatterns(type_converter, patterns);
-
-    //  Set up target.
-    target.addIllegalDialect<mlir::arith::ArithDialect, mlir::func::FuncDialect,
-                             mlir::complex::ComplexDialect>();
-    target.addLegalOp<mlir::ModuleOp>();
-
-    if (failed(applyPartialConversion(getOperation(), target,
-                                      std::move(patterns)))) {
-      signalPassFailure();
-      return;
-    }
-
-    // Clean up any leftover math ops.
-    mlir::RewritePatternSet mathPatterns(&getContext());
-    mlir::populateMathToLLVMConversionPatterns(type_converter, mathPatterns,
-                                               /*approximateLog1p=*/false);
-    target.addIllegalDialect<mlir::math::MathDialect>();
-
-    if (failed(applyFullConversion(getOperation(), target,
-                                   std::move(mathPatterns)))) {
+    if (mlir::failed(LowerToLLVM(getOperation()))) {
       signalPassFailure();
     }
   }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
index 3c22e505ff63ef..3a9091747ab33c 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <string>
-#include <utility>
 
 #include "llvm/Support/LogicalResult.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
@@ -29,7 +28,6 @@ limitations under the License.
 #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
-#include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
@@ -37,22 +35,20 @@ limitations under the License.
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/Passes.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"  // IWYU pragma: keep
 #include "mlir/Dialect/LLVMIR/NVVMDialect.h"  // IWYU pragma: keep
-#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Diagnostics.h"
 #include "mlir/IR/Location.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Interfaces/DataLayoutInterfaces.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "google/protobuf/text_format.h"
 #include "xla/codegen/device_spec.h"
+#include "xla/codegen/emitters/transforms/lower_to_llvm_common.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/tsl/platform/logging.h"
@@ -86,78 +82,47 @@ class LowerToLLVMGPUPass
       CHECK_OK(device_description.status());
       *device_spec_.mutable_type() = *device_description;
     }
-    // Populate type conversions.
-    mlir::LowerToLLVMOptions llvm_opts(&getContext(),
-                                       mlir::DataLayout(getOperation()));
-    mlir::LLVMTypeConverter type_converter(getOperation().getContext(),
-                                           llvm_opts);
-    mlir::LLVMConversionTarget target(*getOperation().getContext());
-
-    // Populate patterns.
-    mlir::RewritePatternSet patterns(&getContext());
-    mlir::arith::populateArithExpandOpsPatterns(patterns);
-    mlir::arith::populateArithToLLVMConversionPatterns(type_converter,
-                                                       patterns);
-    if (device_spec_.IsAmdGpu()) {
-      std::string chipset =
-          device_spec_.gpu().rocm_compute_capability().gfx_version();
-      llvm::FailureOr<mlir::amdgpu::Chipset> maybeChipset =
-          mlir::amdgpu::Chipset::parse(chipset);
-      if (failed(maybeChipset)) {
-        mlir::emitError(mlir::UnknownLoc::get(&getContext()),
-                        "Invalid chipset name: " + chipset);
-        return signalPassFailure();
-      }
-      mlir::populateGpuToROCDLConversionPatterns(
-          type_converter, patterns, mlir::gpu::amd::Runtime::Unknown,
-          *maybeChipset);
-      mlir::configureGpuToROCDLConversionLegality(target);
-    } else if (device_spec_.IsIntelGpu()) {
-      // Add sub-group-size attribute to functions.
-      int32_t sub_group_size = device_spec_.gpu().threads_per_warp();
-      if (auto module_op = mlir::dyn_cast<mlir::ModuleOp>(getOperation())) {
-        module_op.walk([sub_group_size](mlir::func::FuncOp func) {
-          if (!func.getBody().empty()) {
-            mlir::OpBuilder b(func.getContext());
-            auto sub_group_attr = b.getI32IntegerAttr(sub_group_size);
-            func->setAttr("intel_reqd_sub_group_size", sub_group_attr);
-          }
-        });
-      }
-      populateGpuToLLVMSPVConversionPatterns(type_converter, patterns);
-      populateGpuMemorySpaceAttributeConversions(type_converter);
-    } else {
-      mlir::populateGpuToNVVMConversionPatterns(type_converter, patterns);
-      mlir::configureGpuToNVVMConversionLegality(target);
-    }
-    mlir::populateFuncToLLVMConversionPatterns(type_converter, patterns);
-    mlir::populateFinalizeMemRefToLLVMConversionPatterns(type_converter,
-                                                         patterns);
-    mlir::ub::populateUBToLLVMConversionPatterns(type_converter, patterns);
-    mlir::populateVectorToLLVMConversionPatterns(type_converter, patterns);
-    mlir::cf::populateControlFlowToLLVMConversionPatterns(type_converter,
-                                                          patterns);
-    mlir::populateComplexToLLVMConversionPatterns(type_converter, patterns);
-
-    // Set up target.
-    target.addIllegalDialect<mlir::arith::ArithDialect, mlir::func::FuncDialect,
-                             mlir::complex::ComplexDialect>();
-    target.addLegalOp<mlir::ModuleOp>();
-
-    if (failed(applyPartialConversion(getOperation(), target,
-                                      std::move(patterns)))) {
-      signalPassFailure();
-      return;
-    }
 
-    // Clean up any leftover math ops not handled NVVM or ROCDL lowering.
-    mlir::RewritePatternSet mathPatterns(&getContext());
-    mlir::populateMathToLLVMConversionPatterns(type_converter, mathPatterns,
-                                               /*approximateLog1p=*/false);
-    target.addIllegalDialect<mlir::math::MathDialect>();
+    auto populate_patterns =
+        [&](mlir::LLVMTypeConverter& converter,
+            mlir::RewritePatternSet& patterns,
+            mlir::ConversionTarget& target) -> mlir::LogicalResult {
+      if (device_spec_.IsAmdGpu()) {
+        std::string chipset =
+            device_spec_.gpu().rocm_compute_capability().gfx_version();
+        llvm::FailureOr<mlir::amdgpu::Chipset> maybeChipset =
+            mlir::amdgpu::Chipset::parse(chipset);
+        if (mlir::failed(maybeChipset)) {
+          mlir::emitError(mlir::UnknownLoc::get(&getContext()),
+                          "Invalid chipset name: " + chipset);
+          return mlir::failure();
+        }
+        mlir::populateGpuToROCDLConversionPatterns(
+            converter, patterns, mlir::gpu::amd::Runtime::Unknown,
+            *maybeChipset);
+        mlir::configureGpuToROCDLConversionLegality(target);
+      } else if (device_spec_.IsIntelGpu()) {
+        // Add sub-group-size attribute to functions.
+        int32_t sub_group_size = device_spec_.gpu().threads_per_warp();
+        if (auto module_op = mlir::dyn_cast<mlir::ModuleOp>(getOperation())) {
+          module_op.walk([sub_group_size](mlir::func::FuncOp func) {
+            if (!func.getBody().empty()) {
+              mlir::OpBuilder b(func.getContext());
+              auto sub_group_attr = b.getI32IntegerAttr(sub_group_size);
+              func->setAttr("intel_reqd_sub_group_size", sub_group_attr);
+            }
+          });
+        }
+        populateGpuToLLVMSPVConversionPatterns(converter, patterns);
+        populateGpuMemorySpaceAttributeConversions(converter);
+      } else {
+        mlir::populateGpuToNVVMConversionPatterns(converter, patterns);
+        mlir::configureGpuToNVVMConversionLegality(target);
+      }
+      return mlir::success();
+    };
 
-    if (failed(applyFullConversion(getOperation(), target,
-                                   std::move(mathPatterns)))) {
+    if (mlir::failed(LowerToLLVM(getOperation(), populate_patterns))) {
       signalPassFailure();
     }
   }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h
index 35bd6a0d33766b..0bed290d078d4f 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_to_llvm_gpu.h
@@ -15,7 +15,6 @@ limitations under the License.
 #ifndef XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_H_
 #define XLA_CODEGEN_EMITTERS_TRANSFORMS_LOWER_TO_LLVM_GPU_H_
 
-#include <cstdint>
 #include <memory>
 #include <string>
 

From 89b883eecf5130b2b353a051e687abe9a12e6662 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:46:31 -0800
Subject: [PATCH 389/753] Automated Code Change

PiperOrigin-RevId: 845638609
---
 tensorflow/core/kernels/list_kernels.cc       | 18 +++---
 tensorflow/core/kernels/list_kernels.h        | 35 ++++++-----
 tensorflow/core/kernels/listdiff_op.cc        |  2 +-
 .../core/kernels/load_and_remap_matrix_op.cc  |  4 +-
 tensorflow/core/kernels/logging_ops.cc        | 26 ++++----
 tensorflow/core/kernels/logging_ops.h         |  2 +-
 tensorflow/core/kernels/logging_ops_test.cc   | 44 ++++++-------
 tensorflow/core/kernels/lookup_ops_test.cc    |  4 +-
 .../core/kernels/lookup_table_init_op.cc      |  5 +-
 .../core/kernels/lookup_table_init_op.h       |  2 +-
 tensorflow/core/kernels/lookup_table_op.cc    | 62 +++++++++---------
 tensorflow/core/kernels/lookup_table_op.h     |  2 +-
 tensorflow/core/kernels/lookup_util.cc        | 42 +++++++------
 tensorflow/core/kernels/lookup_util.h         |  6 +-
 tensorflow/core/kernels/lrn_op_test.cc        |  4 +-
 tensorflow/core/kernels/map_kernels.h         |  2 +-
 tensorflow/core/kernels/map_stage_op.cc       |  6 +-
 tensorflow/core/kernels/matching_files_op.cc  |  2 +-
 tensorflow/core/kernels/matmul_op_real.cc     | 22 +++----
 tensorflow/core/kernels/matmul_op_test.cc     | 21 ++++---
 tensorflow/core/kernels/maxpooling_op.cc      | 62 +++++++++---------
 .../kernels/merge_v2_checkpoints_op_test.cc   | 14 +++--
 tensorflow/core/kernels/mfcc_op.cc            |  6 +-
 .../core/kernels/multinomial_op_test.cc       |  2 +-
 tensorflow/core/kernels/mutex_ops.cc          |  8 ++-
 tensorflow/core/kernels/nn_ops_test.cc        | 63 ++++++++++---------
 tensorflow/core/kernels/nth_element_op.cc     |  2 +-
 tensorflow/core/kernels/one_hot_op.cc         |  4 +-
 tensorflow/core/kernels/ops_testutil.cc       |  2 +-
 tensorflow/core/kernels/ops_testutil.h        |  2 +-
 tensorflow/core/kernels/padding_fifo_queue.cc |  3 +-
 tensorflow/core/kernels/padding_fifo_queue.h  |  2 +-
 .../parameterized_truncated_normal_op.cc      |  4 +-
 .../parameterized_truncated_normal_op_test.cc |  6 +-
 tensorflow/core/kernels/parse_tensor_test.cc  | 47 +++++++-------
 .../core/kernels/partitioned_function_ops.cc  | 10 +--
 .../core/kernels/partitioned_function_ops.h   |  2 +-
 tensorflow/core/kernels/pooling_ops_3d.cc     | 58 ++++++++---------
 tensorflow/core/kernels/pooling_ops_3d.h      |  4 +-
 tensorflow/core/kernels/pooling_ops_common.cc |  6 +-
 tensorflow/core/kernels/pooling_ops_common.h  | 24 +++----
 .../core/kernels/population_count_op.cc       | 21 ++++---
 tensorflow/core/kernels/population_count_op.h |  2 +-
 tensorflow/core/kernels/priority_queue.cc     |  2 +-
 tensorflow/core/kernels/priority_queue.h      |  4 +-
 tensorflow/core/kernels/quantization_utils.h  |  8 +--
 .../core/kernels/quantization_utils_test.cc   | 12 ++--
 .../kernels/quantize_and_dequantize_op.cc     |  4 +-
 .../quantize_and_dequantize_op_test.cc        | 20 +++---
 .../kernels/quantize_down_and_shrink_range.cc |  4 +-
 50 files changed, 369 insertions(+), 350 deletions(-)

diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 51c0d4b6654034..3919cb763171c7 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -48,7 +48,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 
 absl::Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
   if (t.shape() == TensorShape({})) {
-    if ((t.dtype() == DT_INT32 && t.scalar<int32>()() == -1) ||
+    if ((t.dtype() == DT_INT32 && t.scalar<int32_t>()() == -1) ||
         (t.dtype() == DT_INT64 && t.scalar<int64_t>()() == -1)) {
       *out = PartialTensorShape();
       return absl::OkStatus();
@@ -61,7 +61,7 @@ absl::Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out) {
                                    t.shape().dims());
   }
   if (t.dtype() == DT_INT32) {
-    return PartialTensorShape::MakePartialShape(t.vec<int32>().data(),
+    return PartialTensorShape::MakePartialShape(t.vec<int32_t>().data(),
                                                 t.NumElements(), out);
   } else if (t.dtype() == DT_INT64) {
     return PartialTensorShape::MakePartialShape(t.vec<int64_t>().data(),
@@ -157,7 +157,7 @@ class EmptyTensorList : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result, attr));
     TensorList empty;
     empty.element_dtype = element_dtype_;
-    empty.max_num_elements = max_num_elements_t.scalar<int32>()();
+    empty.max_num_elements = max_num_elements_t.scalar<int32_t>()();
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(ctx, TensorShapeFromTensor(ctx->input(0), &element_shape));
     empty.element_shape = element_shape;
@@ -257,7 +257,7 @@ class TensorListLength : public OpKernel {
     OP_REQUIRES_OK(c, GetInputList(c, 0, &l));
     Tensor* result;
     OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result));
-    result->scalar<int32>()() = l->tensors().size();
+    result->scalar<int32_t>()() = l->tensors().size();
   }
 };
 
@@ -287,7 +287,7 @@ class TensorListElementShape : public OpKernel {
     if (l->element_shape.unknown_rank()) {
       OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape({}), &result));
       if (result->dtype() == DT_INT32) {
-        result->scalar<int32>()() = -1;
+        result->scalar<int32_t>()() = -1;
       } else {
         result->scalar<int64_t>()() = -1;
       }
@@ -296,7 +296,7 @@ class TensorListElementShape : public OpKernel {
                             0, TensorShape{l->element_shape.dims()}, &result));
       for (int i = 0; i < l->element_shape.dims(); ++i) {
         if (result->dtype() == DT_INT32) {
-          result->flat<int32>()(i) = l->element_shape.dim_size(i);
+          result->flat<int32_t>()(i) = l->element_shape.dim_size(i);
         } else {
           result->flat<int64_t>()(i) = l->element_shape.dim_size(i);
         }
@@ -336,7 +336,7 @@ class TensorListReserve : public OpKernel {
         errors::InvalidArgument(
             "The num_elements to reserve must be a tensor size 1, but got ",
             c->input(1).shape()));
-    int32_t num_elements = c->input(1).scalar<int32>()();
+    int32_t num_elements = c->input(1).scalar<int32_t>()();
     OP_REQUIRES(c, num_elements >= 0,
                 errors::InvalidArgument("The num_elements to reserve must be a "
                                         "non negative number, but got ",
@@ -384,7 +384,7 @@ class TensorListResize : public OpKernel {
     OP_REQUIRES_OK(c, GetInputList(c, 0, &input_list));
     OP_REQUIRES(c, TensorShapeUtils::IsScalar(c->input(1).shape()),
                 errors::InvalidArgument("size must be a scalar"));
-    int32_t size = c->input(1).scalar<int32>()();
+    int32_t size = c->input(1).scalar<int32_t>()();
     OP_REQUIRES(
         c, size >= 0,
         errors::InvalidArgument(
@@ -473,7 +473,7 @@ class TensorListSetItem : public OpKernel {
                     " list shape: ", l->element_shape.DebugString()));
     TensorList* output_list = nullptr;
     OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
-    int32_t index = c->input(1).scalar<int32>()();
+    int32_t index = c->input(1).scalar<int32_t>()();
     if (!resize_if_index_out_of_bounds_) {
       OP_REQUIRES(c, index < l->tensors().size(),
                   errors::InvalidArgument("Trying to modify element ", index,
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index 9837b08716afae..5af26a518f0b18 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -80,8 +80,8 @@ template <typename Device, typename T>
 inline void SetZero(OpKernelContext* ctx, Tensor& tensor) {
 #ifdef PLUGGABLE_DEVICE_SUPPORTED
   if (IsPluggableDevice(ctx)) {
-    auto ptr =
-        se::DeviceMemoryBase(tensor.flat<T>().data(), tensor.TotalBytes());
+    auto ptr = stream_executor::DeviceAddressBase(tensor.flat<T>().data(),
+                                                  tensor.TotalBytes());
     auto stream = ctx->op_device_context()->stream();
     auto result = stream->MemZero(&ptr, tensor.TotalBytes()).ok();
     DCHECK_EQ(true, result);
@@ -101,8 +101,10 @@ inline void CopyTensorPluggableDevice(OpKernelContext* ctx, Tensor& src,
   auto src_t = src.unaligned_flat<T>();
   auto dst_t = dst.flat<T>();
   DCHECK(DataTypeCanUseMemcpy(DataTypeToEnum<T>::v()));
-  auto src_ptr = se::DeviceMemoryBase(src_t.data(), src.TotalBytes());
-  auto dst_ptr = se::DeviceMemoryBase(dst_t.data(), dst.TotalBytes());
+  auto src_ptr =
+      stream_executor::DeviceAddressBase(src_t.data(), src.TotalBytes());
+  auto dst_ptr =
+      stream_executor::DeviceAddressBase(dst_t.data(), dst.TotalBytes());
   auto stream = ctx->op_device_context()->stream();
   auto result = stream->Memcpy(&dst_ptr, src_ptr, src.TotalBytes()).ok();
   DCHECK_EQ(true, result);
@@ -133,7 +135,7 @@ void ConcatPluggableDevice(
   size_t num_inputs = inputs.size();
   std::vector<ptrdiff_t> sizes;
   sizes.reserve(num_inputs);
-  int64 row_size = 0;
+  int64_t row_size = 0;
   for (const auto& input : inputs) {
     sizes.push_back(input->dimension(1));
     row_size += sizes.back();
@@ -145,12 +147,13 @@ void ConcatPluggableDevice(
   for (const auto& input : inputs) {
     inp.push_back(&(*input)(0, 0));
   }
-  const int64 dim0 = output->dimension(0);
-  for (int64 i = 0; i < dim0; ++i) {
-    for (int64 j = 0; j < num_inputs; ++j) {
+  const int64_t dim0 = output->dimension(0);
+  for (int64_t i = 0; i < dim0; ++i) {
+    for (int64_t j = 0; j < num_inputs; ++j) {
       auto size = sizes[j];
-      se::DeviceMemoryBase out_base{out, size * sizeof(T)};
-      se::DeviceMemoryBase inp_base{const_cast<T*>(inp[j]), size * sizeof(T)};
+      stream_executor::DeviceAddressBase out_base{out, size * sizeof(T)};
+      stream_executor::DeviceAddressBase inp_base{const_cast<T*>(inp[j]),
+                                                  size * sizeof(T)};
       OP_REQUIRES_OK(context,
                      stream->Memcpy(&out_base, inp_base, size * sizeof(T)));
       out += size;
@@ -284,7 +287,7 @@ class TensorListGetItem : public OpKernel {
                                         DataTypeString(element_dtype_),
                                         " but list elements ",
                                         DataTypeString(l->element_dtype)));
-    int32_t index = c->input(1).scalar<int32>()();
+    int32_t index = c->input(1).scalar<int32_t>()();
     OP_REQUIRES(c, index < l->tensors().size(),
                 errors::InvalidArgument("Trying to access element ", index,
                                         " in a list with ", l->tensors().size(),
@@ -693,7 +696,7 @@ class TensorListGather : public OpKernel {
     // element tensors.
     if (!tensor_list->element_shape.IsFullyDefined()) {
       for (int index = 0; index < indices.NumElements(); ++index) {
-        const int i = indices.flat<int32>()(index);
+        const int i = indices.flat<int32_t>()(index);
 
         OP_REQUIRES(c, 0 <= i && i < tensor_list->tensors().size(),
                     absl::InvalidArgumentError(absl::StrCat(
@@ -728,7 +731,7 @@ class TensorListGather : public OpKernel {
     inputs_flat.reserve(indices.NumElements());
     Tensor zeros;
     for (int index = 0; index < indices.NumElements(); ++index) {
-      const int i = indices.flat<int32>()(index);
+      const int i = indices.flat<int32_t>()(index);
       OP_REQUIRES(
           c, i < tensor_list->tensors().size(),
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
@@ -832,7 +835,7 @@ absl::Status Scatter(OpKernelContext* c, const Tensor& value,
   const auto copy_tensor = IsPluggableDevice(c) ? &CopyTensorPluggableDevice<T>
                                                 : &CopyTensor<Device, T>;
   for (int index = 0; index < indices.NumElements(); ++index) {
-    const int i = indices.flat<int32>()(index);
+    const int i = indices.flat<int32_t>()(index);
     Tensor tmp = value.Slice(index, index + 1);
     TensorShape tmp_shape = tmp.shape();
     tmp_shape.RemoveDim(0);
@@ -885,7 +888,7 @@ class TensorListScatterIntoExistingList : public OpKernel {
     // Resize the list if needed to accommodate all indices.
     TensorList* output_list = nullptr;
     OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list));
-    const auto indices_vec = indices.vec<int32>();
+    const auto indices_vec = indices.vec<int32_t>();
     int32_t max_index =
         (indices.NumElements() == 0)
             ? -1
@@ -956,7 +959,7 @@ class TensorListScatter : public OpKernel {
     {
       int highest_index = -1;
       for (int index = 0; index < indices.NumElements(); ++index) {
-        const int i = indices.flat<int32>()(index);
+        const int i = indices.flat<int32_t>()(index);
         OP_REQUIRES(
             c, i >= 0,
             errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/listdiff_op.cc b/tensorflow/core/kernels/listdiff_op.cc
index 92d461aba58c8e..eb0a6eec9345aa 100644
--- a/tensorflow/core/kernels/listdiff_op.cc
+++ b/tensorflow/core/kernels/listdiff_op.cc
@@ -48,7 +48,7 @@ class ListDiffOp : public OpKernel {
     const auto Ty = y.vec<T>();
     const size_t y_size = Ty.size();
 
-    OP_REQUIRES(context, x_size < std::numeric_limits<int32>::max(),
+    OP_REQUIRES(context, x_size < std::numeric_limits<int32_t>::max(),
                 errors::InvalidArgument("x too large for int32 indexing"));
 
     std::unordered_set<T> y_set;
diff --git a/tensorflow/core/kernels/load_and_remap_matrix_op.cc b/tensorflow/core/kernels/load_and_remap_matrix_op.cc
index c746fec71d5e4d..a952da3595ccda 100644
--- a/tensorflow/core/kernels/load_and_remap_matrix_op.cc
+++ b/tensorflow/core/kernels/load_and_remap_matrix_op.cc
@@ -133,11 +133,11 @@ class LoadAndRemapMatrixOp : public OpKernel {
         errors::InvalidArgument("The `ckpt_path` tensor must have exactly one "
                                 "element, got tensor of shape ",
                                 ckpt_path_t->shape().DebugString()));
-    const string& ckpt_path = ckpt_path_t->scalar<tstring>()();
+    const std::string& ckpt_path = ckpt_path_t->scalar<tstring>()();
     const Tensor* old_tensor_name_t;
     OP_REQUIRES_OK(context,
                    context->input("old_tensor_name", &old_tensor_name_t));
-    const string& old_tensor_name = old_tensor_name_t->scalar<tstring>()();
+    const std::string& old_tensor_name = old_tensor_name_t->scalar<tstring>()();
 
     LOG(INFO) << "Processing checkpoint : " << ckpt_path;
     BundleReader reader(context->env(), ckpt_path);
diff --git a/tensorflow/core/kernels/logging_ops.cc b/tensorflow/core/kernels/logging_ops.cc
index 904e84d21778aa..b589d918626f1d 100644
--- a/tensorflow/core/kernels/logging_ops.cc
+++ b/tensorflow/core/kernels/logging_ops.cc
@@ -64,7 +64,7 @@ void AssertOp::Compute(OpKernelContext* ctx) {
   if (cond.scalar<bool>()()) {
     return;
   }
-  string msg = "assertion failed: ";
+  std::string msg = "assertion failed: ";
   for (int i = 1; i < ctx->num_inputs(); ++i) {
     absl::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_), "]");
     if (i < ctx->num_inputs() - 1) absl::StrAppend(&msg, " ");
@@ -98,7 +98,7 @@ class PrintOp : public OpKernel {
       if (call_counter_ >= first_n_) return;
       call_counter_++;
     }
-    string msg;
+    std::string msg;
     absl::StrAppend(&msg, message_);
     for (int i = 1; i < ctx->num_inputs(); ++i) {
       absl::StrAppend(&msg, "[", ctx->input(i).SummarizeValue(summarize_), "]");
@@ -110,8 +110,8 @@ class PrintOp : public OpKernel {
   mutex mu_;
   int64_t call_counter_ TF_GUARDED_BY(mu_) = 0;
   int64_t first_n_ = 0;
-  int32 summarize_ = 0;
-  string message_;
+  int32_t summarize_ = 0;
+  std::string message_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("Print").Device(DEVICE_CPU), PrintOp);
@@ -130,8 +130,8 @@ class PrintV2Op : public OpKernel {
                   std::end(valid_output_streams_), output_stream_);
 
     if (output_stream_index == std::end(valid_output_streams_)) {
-      string error_msg = absl::StrCat("Unknown output stream: ", output_stream_,
-                                      ", Valid streams are:");
+      std::string error_msg = absl::StrCat(
+          "Unknown output stream: ", output_stream_, ", Valid streams are:");
       for (auto valid_stream : valid_output_streams_) {
         absl::StrAppend(&error_msg, " ", valid_stream);
       }
@@ -146,9 +146,9 @@ class PrintV2Op : public OpKernel {
         ctx, TensorShapeUtils::IsScalar(input_->shape()),
         errors::InvalidArgument("Input is expected to be scalar, but got ",
                                 input_->shape()));
-    const string& msg = input_->scalar<tstring>()();
+    const std::string& msg = input_->scalar<tstring>()();
 
-    string ended_msg = absl::StrCat(msg, end_);
+    std::string ended_msg = absl::StrCat(msg, end_);
 
     if (!file_path_.empty()) {
       // Outputs to a file at the specified path.
@@ -172,8 +172,8 @@ class PrintV2Op : public OpKernel {
     } else if (output_stream_ == "log(error)") {
       LOG(ERROR) << ended_msg << std::flush;
     } else {
-      string error_msg = absl::StrCat("Unknown output stream: ", output_stream_,
-                                      ", Valid streams are:");
+      std::string error_msg = absl::StrCat(
+          "Unknown output stream: ", output_stream_, ", Valid streams are:");
       for (auto valid_stream : valid_output_streams_) {
         absl::StrAppend(&error_msg, " ", valid_stream);
       }
@@ -186,10 +186,10 @@ class PrintV2Op : public OpKernel {
                                           "log(warning)", "log(error)"};
 
  private:
-  string end_;
+  std::string end_;
   // Either output_stream_ or file_path_ (but not both) will be non-empty.
-  string output_stream_;
-  string file_path_;
+  std::string output_stream_;
+  std::string file_path_;
 
   // If output_stream_ is a file path, extracts it to file_path_ and clears
   // output_stream_; otherwise sets file_paths_ to "".
diff --git a/tensorflow/core/kernels/logging_ops.h b/tensorflow/core/kernels/logging_ops.h
index 5cb1213998f499..f5a58643d8e1a3 100644
--- a/tensorflow/core/kernels/logging_ops.h
+++ b/tensorflow/core/kernels/logging_ops.h
@@ -25,7 +25,7 @@ class AssertOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  private:
-  int32 summarize_ = 0;
+  int32_t summarize_ = 0;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/logging_ops_test.cc b/tensorflow/core/kernels/logging_ops_test.cc
index 7efdeac7d1db9f..fbce44642938db 100644
--- a/tensorflow/core/kernels/logging_ops_test.cc
+++ b/tensorflow/core/kernels/logging_ops_test.cc
@@ -34,7 +34,7 @@ namespace {
 
 class PrintingV2GraphTest : public OpsTestBase {
  protected:
-  absl::Status Init(const string& output_stream = "log(warning)") {
+  absl::Status Init(const std::string& output_stream = "log(warning)") {
     TF_CHECK_OK(NodeDefBuilder("op", "PrintV2")
                     .Input(FakeInput(DT_STRING))
                     .Attr("output_stream", output_stream)
@@ -61,8 +61,8 @@ TEST_F(PrintingV2GraphTest, InvalidInputRank) {
 
 class PrintingGraphTest : public OpsTestBase {
  protected:
-  absl::Status Init(DataType input_type1, DataType input_type2, string msg = "",
-                    int first_n = -1, int summarize = 3) {
+  absl::Status Init(DataType input_type1, DataType input_type2,
+                    std::string msg = "", int first_n = -1, int summarize = 3) {
     TF_CHECK_OK(NodeDefBuilder("op", "Print")
                     .Input(FakeInput(input_type1))
                     .Input(FakeInput(2, input_type2))
@@ -76,58 +76,58 @@ class PrintingGraphTest : public OpsTestBase {
 
 TEST_F(PrintingGraphTest, Int32Success_6) {
   TF_ASSERT_OK(Init(DT_INT32, DT_INT32));
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_INT32, TensorShape({6}));
-  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
-  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+  test::FillValues<int32_t>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32_t>(expected, *GetOutput(0));
 }
 
 TEST_F(PrintingGraphTest, Int32Success_Summarize6) {
   TF_ASSERT_OK(Init(DT_INT32, DT_INT32, "", -1, 6));
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_INT32, TensorShape({6}));
-  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
-  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+  test::FillValues<int32_t>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32_t>(expected, *GetOutput(0));
 }
 
 TEST_F(PrintingGraphTest, StringSuccess) {
   TF_ASSERT_OK(Init(DT_INT32, DT_STRING));
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
   AddInputFromArray<tstring>(TensorShape({}), {"foo"});
   AddInputFromArray<tstring>(TensorShape({}), {"bar"});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_INT32, TensorShape({6}));
-  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
-  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+  test::FillValues<int32_t>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32_t>(expected, *GetOutput(0));
 }
 
 TEST_F(PrintingGraphTest, MsgSuccess) {
   TF_ASSERT_OK(Init(DT_INT32, DT_STRING, "Message: "));
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
   AddInputFromArray<tstring>(TensorShape({}), {"foo"});
   AddInputFromArray<tstring>(TensorShape({}), {"bar"});
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_INT32, TensorShape({6}));
-  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
-  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+  test::FillValues<int32_t>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32_t>(expected, *GetOutput(0));
 }
 
 TEST_F(PrintingGraphTest, FirstNSuccess) {
   TF_ASSERT_OK(Init(DT_INT32, DT_STRING, "", 3));
-  AddInputFromArray<int32>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
+  AddInputFromArray<int32_t>(TensorShape({6}), {1, 2, 3, 4, 5, 6});
   AddInputFromArray<tstring>(TensorShape({}), {"foo"});
   AddInputFromArray<tstring>(TensorShape({}), {"bar"});
   // run 4 times but we only print 3 as intended
   for (int i = 0; i < 4; i++) TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_INT32, TensorShape({6}));
-  test::FillValues<int32>(&expected, {1, 2, 3, 4, 5, 6});
-  test::ExpectTensorEqual<int32>(expected, *GetOutput(0));
+  test::FillValues<int32_t>(&expected, {1, 2, 3, 4, 5, 6});
+  test::ExpectTensorEqual<int32_t>(expected, *GetOutput(0));
 }
 
 class TimestampTest : public OpsTestBase {
diff --git a/tensorflow/core/kernels/lookup_ops_test.cc b/tensorflow/core/kernels/lookup_ops_test.cc
index 2a57a46cf165f0..fb13ccc162eb90 100644
--- a/tensorflow/core/kernels/lookup_ops_test.cc
+++ b/tensorflow/core/kernels/lookup_ops_test.cc
@@ -51,8 +51,8 @@ class MockHashTable : public lookup::HashTable<K, V> {
   ~MockHashTable() override { alive = false; }
 };
 
-typedef int32 key_dtype;
-typedef int32 value_dtype;
+typedef int32_t key_dtype;
+typedef int32_t value_dtype;
 
 REGISTER_KERNEL_BUILDER(
     Name("MockAnonymousHashTable")
diff --git a/tensorflow/core/kernels/lookup_table_init_op.cc b/tensorflow/core/kernels/lookup_table_init_op.cc
index 27cc76ee11b945..c936cad9addd6d 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.cc
+++ b/tensorflow/core/kernels/lookup_table_init_op.cc
@@ -111,7 +111,7 @@ class InitializeTableFromTextFileOp : public OpKernel {
     if (ctx->HasAttr("offset")) {
       OP_REQUIRES_OK(ctx, ctx->GetAttr("offset", &offset_));
     }
-    string delimiter;
+    std::string delimiter;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("delimiter", &delimiter));
     OP_REQUIRES(ctx, delimiter.size() == 1,
                 errors::InvalidArgument("delimiter should be only 1 char"));
@@ -137,7 +137,8 @@ class InitializeTableFromTextFileOp : public OpKernel {
         errors::InvalidArgument("filename should be a single string, but got ",
                                 vocab_filename_tensor.shape().DebugString()));
 
-    const string& vocab_filename = vocab_filename_tensor.scalar<tstring>()();
+    const std::string& vocab_filename =
+        vocab_filename_tensor.scalar<tstring>()();
     OP_REQUIRES(ctx, !vocab_filename.empty(),
                 errors::InvalidArgument("filename cannot be empty."));
 
diff --git a/tensorflow/core/kernels/lookup_table_init_op.h b/tensorflow/core/kernels/lookup_table_init_op.h
index e94db921bfd237..f6e246486a4532 100644
--- a/tensorflow/core/kernels/lookup_table_init_op.h
+++ b/tensorflow/core/kernels/lookup_table_init_op.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 namespace lookup {
 
 // Helper function to initialize an InitializableLookupTable from a text file.
-absl::Status InitializeTableFromTextFile(const string& filename,
+absl::Status InitializeTableFromTextFile(const std::string& filename,
                                          int64_t vocab_size, char delimiter,
                                          int32_t key_index, int32_t value_index,
                                          Env* env,
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 49a28dc324b9fb..54d2c8cca1669e 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -411,11 +411,11 @@ class MutableHashTableOfTensors final : public LookupInterface {
 namespace {
 
 template <typename T>
-inline uint64 HashScalar(const T& key) {
-  return static_cast<uint64>(key);
+inline uint64_t HashScalar(const T& key) {
+  return static_cast<uint64_t>(key);
 }
 
-inline uint64 HashScalar(const tstring& key) { return Hash64(key); }
+inline uint64_t HashScalar(const tstring& key) { return Hash64(key); }
 
 // If the given shape is a scalar return {1} instead. Otherwise leave it alone.
 TensorShape MaybeVectorizeShape(const TensorShape& shape) {
@@ -523,7 +523,7 @@ class MutableDenseHashTable final : public LookupInterface {
     const int64_t bit_mask = num_buckets_ - 1;
     // TODO(andreasst): parallelize using work_sharder
     for (int64_t i = 0; i < num_elements; ++i) {
-      const uint64 key_hash = HashKey(key_matrix, i);
+      const uint64_t key_hash = HashKey(key_matrix, i);
       if (empty_key_hash_ == key_hash &&
           IsEqualKey(empty_key_matrix, 0, key_matrix, i)) {
         return errors::InvalidArgument(
@@ -693,7 +693,7 @@ class MutableDenseHashTable final : public LookupInterface {
         deleted_key_.template shaped<K, 2>({1, key_size});
     const int64_t bit_mask = num_buckets_ - 1;
     for (int64_t i = 0; i < num_elements; ++i) {
-      const uint64 key_hash = HashKey(key_matrix, i);
+      const uint64_t key_hash = HashKey(key_matrix, i);
       if (empty_key_hash_ == key_hash &&
           IsEqualKey(empty_key_tensor, 0, key_matrix, i)) {
         if (ignore_empty_and_deleted_key) {
@@ -760,7 +760,7 @@ class MutableDenseHashTable final : public LookupInterface {
     const auto deleted_key_flat = deleted_key_.template flat<K>();
     const int64_t bit_mask = num_buckets_ - 1;
     for (int64_t i = 0; i < num_elements; ++i) {
-      const uint64 key_hash = HashKey(key_matrix, i);
+      const uint64_t key_hash = HashKey(key_matrix, i);
       if (empty_key_hash_ == key_hash &&
           IsEqualKey(empty_key_tensor, 0, key_matrix, i)) {
         return errors::InvalidArgument(
@@ -843,11 +843,11 @@ class MutableDenseHashTable final : public LookupInterface {
     return DoInsert(ctx, old_key_buckets, old_value_buckets, true);
   }
 
-  uint64 HashKey(typename TTypes<K>::ConstMatrix key, int64_t index) const {
+  uint64_t HashKey(typename TTypes<K>::ConstMatrix key, int64_t index) const {
     if (key_shape_.num_elements() == 1) {
       return HashScalar(key(index, 0));
     }
-    uint64 result = 0;
+    uint64_t result = 0;
     for (int64_t i = 0; i < key_shape_.num_elements(); ++i) {
       result = Hash64Combine(result, HashScalar(key(index, i)));
     }
@@ -876,9 +876,9 @@ class MutableDenseHashTable final : public LookupInterface {
   Tensor key_buckets_ TF_GUARDED_BY(mu_);
   Tensor value_buckets_ TF_GUARDED_BY(mu_);
   Tensor empty_key_;
-  uint64 empty_key_hash_;
+  uint64_t empty_key_hash_;
   Tensor deleted_key_;
-  uint64 deleted_key_hash_;
+  uint64_t deleted_key_hash_;
 };
 
 }  // namespace lookup
@@ -1103,19 +1103,19 @@ REGISTER_KERNEL_BUILDER(Name("LookupTableImportV2").Device(DEVICE_CPU),
       AnonymousLookupTableOp<lookup::HashTable<key_dtype, value_dtype>,   \
                              key_dtype, value_dtype>)
 
-REGISTER_KERNEL(int32, double);
-REGISTER_KERNEL(int32, float);
-REGISTER_KERNEL(int32, int32);
-REGISTER_KERNEL(int32, tstring);
+REGISTER_KERNEL(int32_t, double);
+REGISTER_KERNEL(int32_t, float);
+REGISTER_KERNEL(int32_t, int32_t);
+REGISTER_KERNEL(int32_t, tstring);
 REGISTER_KERNEL(int64_t, double);
 REGISTER_KERNEL(int64_t, float);
-REGISTER_KERNEL(int64_t, int32);
+REGISTER_KERNEL(int64_t, int32_t);
 REGISTER_KERNEL(int64_t, int64_t);
 REGISTER_KERNEL(int64_t, tstring);
 REGISTER_KERNEL(tstring, bool);
 REGISTER_KERNEL(tstring, double);
 REGISTER_KERNEL(tstring, float);
-REGISTER_KERNEL(tstring, int32);
+REGISTER_KERNEL(tstring, int32_t);
 REGISTER_KERNEL(tstring, int64_t);
 REGISTER_KERNEL(tstring, tstring);
 
@@ -1146,19 +1146,19 @@ REGISTER_KERNEL(tstring, tstring);
           lookup::MutableHashTableOfScalars<key_dtype, value_dtype>,           \
           key_dtype, value_dtype>)
 
-REGISTER_KERNEL(int32, double);
-REGISTER_KERNEL(int32, float);
-REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int32_t, double);
+REGISTER_KERNEL(int32_t, float);
+REGISTER_KERNEL(int32_t, int32_t);
 REGISTER_KERNEL(int64_t, double);
 REGISTER_KERNEL(int64_t, float);
-REGISTER_KERNEL(int64_t, int32);
+REGISTER_KERNEL(int64_t, int32_t);
 REGISTER_KERNEL(int64_t, int64_t);
 REGISTER_KERNEL(int64_t, tstring);
 REGISTER_KERNEL(int64_t, Variant);
 REGISTER_KERNEL(tstring, bool);
 REGISTER_KERNEL(tstring, double);
 REGISTER_KERNEL(tstring, float);
-REGISTER_KERNEL(tstring, int32);
+REGISTER_KERNEL(tstring, int32_t);
 REGISTER_KERNEL(tstring, int64_t);
 
 #undef REGISTER_KERNEL
@@ -1188,18 +1188,18 @@ REGISTER_KERNEL(tstring, int64_t);
           lookup::MutableHashTableOfTensors<key_dtype, value_dtype>,           \
           key_dtype, value_dtype>)
 
-REGISTER_KERNEL(int32, double);
-REGISTER_KERNEL(int32, float);
-REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int32_t, double);
+REGISTER_KERNEL(int32_t, float);
+REGISTER_KERNEL(int32_t, int32_t);
 REGISTER_KERNEL(int64_t, double);
 REGISTER_KERNEL(int64_t, float);
-REGISTER_KERNEL(int64_t, int32);
+REGISTER_KERNEL(int64_t, int32_t);
 REGISTER_KERNEL(int64_t, int64_t);
 REGISTER_KERNEL(int64_t, tstring);
 REGISTER_KERNEL(tstring, bool);
 REGISTER_KERNEL(tstring, double);
 REGISTER_KERNEL(tstring, float);
-REGISTER_KERNEL(tstring, int32);
+REGISTER_KERNEL(tstring, int32_t);
 REGISTER_KERNEL(tstring, int64_t);
 
 #undef REGISTER_KERNEL
@@ -1229,19 +1229,19 @@ REGISTER_KERNEL(tstring, int64_t);
           lookup::MutableDenseHashTable<key_dtype, value_dtype>, key_dtype, \
           value_dtype>)
 
-REGISTER_KERNEL(int32, double);
-REGISTER_KERNEL(int32, float);
-REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int32_t, double);
+REGISTER_KERNEL(int32_t, float);
+REGISTER_KERNEL(int32_t, int32_t);
 REGISTER_KERNEL(int64_t, bool);
 REGISTER_KERNEL(int64_t, double);
 REGISTER_KERNEL(int64_t, float);
-REGISTER_KERNEL(int64_t, int32);
+REGISTER_KERNEL(int64_t, int32_t);
 REGISTER_KERNEL(int64_t, int64_t);
 REGISTER_KERNEL(int64_t, Variant);
 REGISTER_KERNEL(tstring, bool);
 REGISTER_KERNEL(tstring, double);
 REGISTER_KERNEL(tstring, float);
-REGISTER_KERNEL(tstring, int32);
+REGISTER_KERNEL(tstring, int32_t);
 REGISTER_KERNEL(tstring, int64_t);
 REGISTER_KERNEL(tstring, ResourceHandle);
 
diff --git a/tensorflow/core/kernels/lookup_table_op.h b/tensorflow/core/kernels/lookup_table_op.h
index daa7f6e32dc9dd..840720d2e3e61d 100644
--- a/tensorflow/core/kernels/lookup_table_op.h
+++ b/tensorflow/core/kernels/lookup_table_op.h
@@ -300,7 +300,7 @@ class HashTable : public InitializableLookupTable {
     return absl::OkStatus();
   };
 
-  absl::Status DoLazyPrepare(std::function<int64(void)> size_fn) override {
+  absl::Status DoLazyPrepare(std::function<int64_t(void)> size_fn) override {
     return DoPrepare(size_fn());
   }
 
diff --git a/tensorflow/core/kernels/lookup_util.cc b/tensorflow/core/kernels/lookup_util.cc
index 3576b6c7339bd1..744b2e9c21b5ac 100644
--- a/tensorflow/core/kernels/lookup_util.cc
+++ b/tensorflow/core/kernels/lookup_util.cc
@@ -37,13 +37,13 @@ static const int kInputBufferSize = 1 * 1024 * 1024; /* bytes */
 static const int kLineNumber = -1;
 static const int kWholeLine = -2;
 
-absl::Status GetNumLinesInTextFile(Env* env, const string& vocab_file,
+absl::Status GetNumLinesInTextFile(Env* env, const std::string& vocab_file,
                                    int64_t* num_lines) {
   std::unique_ptr<RandomAccessFile> file;
   TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file));
 
   io::InputBuffer input_buffer(file.get(), kInputBufferSize);
-  string line;
+  std::string line;
   absl::Status s = input_buffer.ReadLine(&line);
   int64_t next_id = 0;
   while (s.ok()) {
@@ -81,9 +81,10 @@ class TextFileLineIterator
   // - Index -1 means the line number stored in int64.
   // - Index >= 0 represent index (starting at zero) of the split line based on
   //   delimiter.
-  absl::Status Init(const string& filename, int64_t vocab_size, char delimiter,
-                    DataType key_dtype, int64_t key_index, DataType value_dtype,
-                    int64_t value_index, int64_t offset, Env* env) {
+  absl::Status Init(const std::string& filename, int64_t vocab_size,
+                    char delimiter, DataType key_dtype, int64_t key_index,
+                    DataType value_dtype, int64_t value_index, int64_t offset,
+                    Env* env) {
     filename_ = filename;
     vocab_size_ = vocab_size;
     delimiter_ = delimiter;
@@ -108,7 +109,7 @@ class TextFileLineIterator
   void Next() override {
     if (!valid_) return;
 
-    string line;
+    std::string line;
     status_ = input_buffer_->ReadLine(&line);
     if (!status_.ok()) {
       if (absl::IsOutOfRange(status_) && vocab_size_ != -1 &&
@@ -137,7 +138,7 @@ class TextFileLineIterator
       return;
     }
 
-    std::vector<string> tokens;
+    std::vector<std::string> tokens;
     if (!ignore_split_) {
       tokens = str_util::Split(line, delimiter_);
       const auto expected_size =
@@ -197,7 +198,7 @@ class TextFileLineIterator
   int64_t next_id_;
   int64_t offset_;
   int64_t vocab_size_;
-  string filename_;
+  std::string filename_;
   char delimiter_;
   absl::Status status_;
   bool ignore_split_;
@@ -206,13 +207,14 @@ class TextFileLineIterator
 
   // Set the corresponding value from line or tokens based on 'index' into the
   // tensor 't'. The value is transformed to the given data type 'dtype'.
-  absl::Status SetValue(const string& line, const std::vector<string>& tokens,
-                        int64_t index, Tensor* tensor) {
+  absl::Status SetValue(const std::string& line,
+                        const std::vector<std::string>& tokens, int64_t index,
+                        Tensor* tensor) {
     if (index == kLineNumber) {
       tensor->flat<int64_t>()(0) = next_id_ + offset_;
       return absl::OkStatus();
     }
-    const string& token = (index == kWholeLine) ? line : tokens[index];
+    const std::string& token = (index == kWholeLine) ? line : tokens[index];
     const DataType& dtype = tensor->dtype();
     switch (dtype) {
       case DT_INT32: {
@@ -222,7 +224,7 @@ class TextFileLineIterator
           return errors::InvalidArgument("Field ", token, " in line ", next_id_,
                                          " is not a valid int32.");
         }
-        tensor->flat<int32>()(0) = value + offset_;
+        tensor->flat<int32_t>()(0) = value + offset_;
       } break;
       case DT_INT64: {
         int64_t value;
@@ -267,7 +269,7 @@ class TextFileLineIterator
 };
 
 absl::Status GetTableHandle(absl::string_view input_name, OpKernelContext* ctx,
-                            string* container, string* table_handle) {
+                            std::string* container, std::string* table_handle) {
   {
     mutex* mu;
     TF_RETURN_IF_ERROR(ctx->input_ref_mutex(input_name, &mu));
@@ -300,8 +302,8 @@ absl::Status GetResourceLookupTable(absl::string_view input_name,
 absl::Status GetReferenceLookupTable(absl::string_view input_name,
                                      OpKernelContext* ctx,
                                      LookupInterface** table) {
-  string container;
-  string table_handle;
+  std::string container;
+  std::string table_handle;
   TF_RETURN_IF_ERROR(
       GetTableHandle(input_name, ctx, &container, &table_handle));
   return ctx->resource_manager()->Lookup(container, table_handle, table);
@@ -335,8 +337,8 @@ absl::Status GetInitializableLookupTable(absl::string_view input_name,
                                      handle.name(), " is not initializable");
     }
   } else {
-    string container;
-    string table_handle;
+    std::string container;
+    std::string table_handle;
     TF_RETURN_IF_ERROR(
         GetTableHandle(input_name, ctx, &container, &table_handle));
     TF_RETURN_IF_ERROR(ctx->resource_manager()->Lookup(container, table_handle,
@@ -353,7 +355,7 @@ absl::Status GetInitializableLookupTable(absl::string_view input_name,
 
 absl::Status CheckTableDataTypes(const LookupInterface& table,
                                  DataType key_dtype, DataType value_dtype,
-                                 const string& table_name) {
+                                 const std::string& table_name) {
   if (table.key_dtype() != key_dtype || table.value_dtype() != value_dtype) {
     return errors::InvalidArgument(
         "Conflicting key/value dtypes ", DataTypeString(key_dtype), "->",
@@ -365,7 +367,7 @@ absl::Status CheckTableDataTypes(const LookupInterface& table,
 }
 
 // Helper function to initialize an InitializableLookupTable from a text file.
-absl::Status InitializeTableFromTextFile(const string& filename,
+absl::Status InitializeTableFromTextFile(const std::string& filename,
                                          int64_t vocab_size, char delimiter,
                                          int32_t key_index, int32_t value_index,
                                          int64_t offset, Env* env,
@@ -376,7 +378,7 @@ absl::Status InitializeTableFromTextFile(const string& filename,
 }
 
 absl::Status InitializeTableFromTextFile(
-    const string& filename, int64_t vocab_size, char delimiter,
+    const std::string& filename, int64_t vocab_size, char delimiter,
     int32_t key_index, int32_t value_index, int64_t offset, Env* env,
     std::unique_ptr<InitializableLookupTable::InitializerSerializer> serializer,
     InitializableLookupTable* table) {
diff --git a/tensorflow/core/kernels/lookup_util.h b/tensorflow/core/kernels/lookup_util.h
index 677c6a5659fc23..e48718ad805bdb 100644
--- a/tensorflow/core/kernels/lookup_util.h
+++ b/tensorflow/core/kernels/lookup_util.h
@@ -53,10 +53,10 @@ absl::Status GetInitializableLookupTable(absl::string_view input_name,
 // table's data types.
 absl::Status CheckTableDataTypes(const LookupInterface& table,
                                  DataType key_dtype, DataType value_dtype,
-                                 const string& table_name);
+                                 const std::string& table_name);
 
 // Initializes `table` from `filename`.
-absl::Status InitializeTableFromTextFile(const string& filename,
+absl::Status InitializeTableFromTextFile(const std::string& filename,
                                          int64_t vocab_size, char delimiter,
                                          int32_t key_index, int32_t value_index,
                                          int64_t offset, Env* env,
@@ -65,7 +65,7 @@ absl::Status InitializeTableFromTextFile(const string& filename,
 // Initializes `table` from `filename`. `func` may specify how to represent the
 // initializer as a graphdef, so that the table can be serialized as metadata.
 absl::Status InitializeTableFromTextFile(
-    const string& filename, int64_t vocab_size, char delimiter,
+    const std::string& filename, int64_t vocab_size, char delimiter,
     int32_t key_index, int32_t value_index, int64_t offset, Env* env,
     std::unique_ptr<InitializableLookupTable::InitializerSerializer> serializer,
     InitializableLookupTable* table);
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
index a4843b04d84b1b..3c8515d522501b 100644
--- a/tensorflow/core/kernels/lrn_op_test.cc
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -40,13 +40,13 @@ class LRNFloatTest : public OpsTestBase {
  protected:
   LRNFloatTest() : philox_(123, 17), rand_(&philox_) {}
 
-  int GetIntAttr(const string& name) {
+  int GetIntAttr(const std::string& name) {
     int value;
     TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value));
     return value;
   }
 
-  float GetFloatAttr(const string& name) {
+  float GetFloatAttr(const std::string& name) {
     float value;
     TF_CHECK_OK(GetNodeAttr(*node_def(), name, &value));
     return value;
diff --git a/tensorflow/core/kernels/map_kernels.h b/tensorflow/core/kernels/map_kernels.h
index 6949ff554a286b..ab57ba02dccbc4 100644
--- a/tensorflow/core/kernels/map_kernels.h
+++ b/tensorflow/core/kernels/map_kernels.h
@@ -102,7 +102,7 @@ class TensorMapSize : public OpKernel {
     OP_REQUIRES_OK(ctx, GetInputMap(ctx, 0, &map));
     Tensor* result;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape{}, &result));
-    result->scalar<int32>()() = map->tensors().size();
+    result->scalar<int32_t>()() = map->tensors().size();
   }
 };
 
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index 14787c38e72502..12e018dfdd311d 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -489,7 +489,7 @@ class StagingMap : public ResourceBase {
     return map_.size();
   }
 
-  string DebugString() const override { return "StagingMap"; }
+  std::string DebugString() const override { return "StagingMap"; }
 };
 
 template <bool Ordered>
@@ -736,7 +736,7 @@ class MapSizeOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size));
 
     // Set it to the actual size
-    size->scalar<int32>().setConstant(map->size());
+    size->scalar<int32_t>().setConstant(map->size());
   }
 };
 
@@ -766,7 +766,7 @@ class MapIncompleteSizeOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &size));
 
     // Set it to the actual size
-    size->scalar<int32>().setConstant(map->incomplete_size());
+    size->scalar<int32_t>().setConstant(map->incomplete_size());
   }
 };
 
diff --git a/tensorflow/core/kernels/matching_files_op.cc b/tensorflow/core/kernels/matching_files_op.cc
index 515e58d518a129..c48e6aeeab3bad 100644
--- a/tensorflow/core/kernels/matching_files_op.cc
+++ b/tensorflow/core/kernels/matching_files_op.cc
@@ -43,7 +43,7 @@ class MatchingFilesOp : public OpKernel {
     const auto patterns = patterns_t->flat<tstring>();
     int num_patterns = patterns.size();
     int num_files = 0;
-    std::vector<std::vector<string>> all_fnames(num_patterns);
+    std::vector<std::vector<std::string>> all_fnames(num_patterns);
     for (int i = 0; i < num_patterns; i++) {
       OP_REQUIRES_OK(context, context->env()->GetMatchingPaths(patterns(i),
                                                                &all_fnames[i]));
diff --git a/tensorflow/core/kernels/matmul_op_real.cc b/tensorflow/core/kernels/matmul_op_real.cc
index 46fbf83a53e067..54049fb852c008 100644
--- a/tensorflow/core/kernels/matmul_op_real.cc
+++ b/tensorflow/core/kernels/matmul_op_real.cc
@@ -29,18 +29,18 @@ TF_CALL_int64(REGISTER_BATCH_MATMUL_CPU);
 REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, bfloat16, bfloat16);
 REGISTER_BATCH_MATMUL_TOUT_CPU(float, float, float);
 REGISTER_BATCH_MATMUL_TOUT_CPU(double, double, double);
-REGISTER_BATCH_MATMUL_TOUT_CPU(int16, int16, int16);
-REGISTER_BATCH_MATMUL_TOUT_CPU(int32, int32, int32);
+REGISTER_BATCH_MATMUL_TOUT_CPU(int16_t, int16_t, int16_t);
+REGISTER_BATCH_MATMUL_TOUT_CPU(int32_t, int32_t, int32_t);
 REGISTER_BATCH_MATMUL_TOUT_CPU(int64_t, int64_t, int64_t);
-REGISTER_BATCH_MATMUL_TOUT_CPU(int8, int8, int32);
-REGISTER_BATCH_MATMUL_TOUT_CPU(uint8, int8, int32);
-REGISTER_BATCH_MATMUL_TOUT_CPU(int8, uint8, int32);
-REGISTER_BATCH_MATMUL_TOUT_CPU(uint8, uint8, int32);
-
-REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, int8, bfloat16);
-REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, uint8, bfloat16);
-REGISTER_BATCH_MATMUL_TOUT_CPU(int8, bfloat16, bfloat16);
-REGISTER_BATCH_MATMUL_TOUT_CPU(uint8, bfloat16, bfloat16);
+REGISTER_BATCH_MATMUL_TOUT_CPU(int8_t, int8_t, int32_t);
+REGISTER_BATCH_MATMUL_TOUT_CPU(uint8_t, int8_t, int32_t);
+REGISTER_BATCH_MATMUL_TOUT_CPU(int8_t, uint8_t, int32_t);
+REGISTER_BATCH_MATMUL_TOUT_CPU(uint8_t, uint8_t, int32_t);
+
+REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, int8_t, bfloat16);
+REGISTER_BATCH_MATMUL_TOUT_CPU(bfloat16, uint8_t, bfloat16);
+REGISTER_BATCH_MATMUL_TOUT_CPU(int8_t, bfloat16, bfloat16);
+REGISTER_BATCH_MATMUL_TOUT_CPU(uint8_t, bfloat16, bfloat16);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_BATCH_MATMUL_GPU);
diff --git a/tensorflow/core/kernels/matmul_op_test.cc b/tensorflow/core/kernels/matmul_op_test.cc
index 4562998b2848aa..e755ceb2beed1a 100644
--- a/tensorflow/core/kernels/matmul_op_test.cc
+++ b/tensorflow/core/kernels/matmul_op_test.cc
@@ -52,7 +52,7 @@ class FusedMatMulOpTest : public OpsTestBase {
   // of 'fetch' node into the output Tensor. Optional `fetch_node` parameter
   // allows to define a fetch node directly using a NodeDef for the ops that are
   // not supported by the C++ Api.
-  void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
+  void RunAndFetch(const tensorflow::Scope& root, const std::string& fetch,
                    Tensor* output, bool allow_gpu_device,
                    const NodeDef* fetch_node = nullptr,
                    absl::Status* last_status = nullptr) {
@@ -97,7 +97,8 @@ class FusedMatMulOpTest : public OpsTestBase {
     // to compare GPU vs CPU numbers, so place all nodes on CPU in this case.
     const bool place_all_on_gpu = allow_gpu_device && has_gpu_device;
 
-    const string device = place_all_on_gpu ? "/device:GPU:0" : "/device:CPU:0";
+    const std::string device =
+        place_all_on_gpu ? "/device:GPU:0" : "/device:CPU:0";
     for (NodeDef& mutable_node : *graph.mutable_node()) {
       mutable_node.set_device(device);
     }
@@ -137,7 +138,7 @@ class FusedMatMulOpTest : public OpsTestBase {
 
   void RunMatMulWithBiasAndActivation(
       const Tensor& lhs_data, const Tensor& rhs_data, const Tensor& bias_data,
-      bool transpose_a, bool transpose_b, const string& activation_type,
+      bool transpose_a, bool transpose_b, const std::string& activation_type,
       Tensor* output, bool allow_gpu_device = false) {
     Scope root = tensorflow::Scope::NewRootScope();
 
@@ -175,8 +176,8 @@ class FusedMatMulOpTest : public OpsTestBase {
 
   void RunFusedMatMulOp(const Tensor& lhs_data, const Tensor& rhs_data,
                         const std::vector<Tensor>& args_data,
-                        const std::vector<string>& fused_ops, bool transpose_a,
-                        bool transpose_b, Tensor* output,
+                        const std::vector<std::string>& fused_ops,
+                        bool transpose_a, bool transpose_b, Tensor* output,
                         bool allow_gpu_device = false,
                         bool* test_skipped = nullptr) {
     Scope root = tensorflow::Scope::NewRootScope();
@@ -295,7 +296,7 @@ class FusedMatMulOpTest : public OpsTestBase {
   // to FusedMatMul.
   void VerifyConv2DWithBiasAndActivation(int m, int k, int n, bool transpose_a,
                                          bool transpose_b,
-                                         const string& activation) {
+                                         const std::string& activation) {
     bool use_gpu_device =
         activation == "Relu" || (this->kTValueType == DT_HALF);
     const BiasAddGraphRunner run_default =
@@ -372,7 +373,7 @@ static auto GetActivations(DataType dtype) {
 }
 
 TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x128x64WithActivation) {
-  for (const string& activation : GetActivations(this->kTValueType)) {
+  for (const std::string& activation : GetActivations(this->kTValueType)) {
     this->VerifyConv2DWithBiasAndActivation(256, 128, 64, false, false,
                                             activation);
     this->VerifyConv2DWithBiasAndActivation(256, 128, 64, true, false,
@@ -385,21 +386,21 @@ TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x128x64WithActivation) {
 }
 
 TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x256WithActivation) {
-  for (const string& activation : GetActivations(this->kTValueType)) {
+  for (const std::string& activation : GetActivations(this->kTValueType)) {
     this->VerifyConv2DWithBiasAndActivation(1, 256, 256, false, false,
                                             activation);
   }
 }
 
 TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul256x256x1WithActivation) {
-  for (const string& activation : GetActivations(this->kTValueType)) {
+  for (const std::string& activation : GetActivations(this->kTValueType)) {
     this->VerifyConv2DWithBiasAndActivation(256, 256, 1, false, false,
                                             activation);
   }
 }
 
 TYPED_TEST_P(FusedMatMulWithBiasOpTest, MatMul1x256x1WithActivation) {
-  for (const string& activation : GetActivations(this->kTValueType)) {
+  for (const std::string& activation : GetActivations(this->kTValueType)) {
     this->VerifyConv2DWithBiasAndActivation(1, 256, 1, false, false,
                                             activation);
   }
diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index bc99ad59db4543..c20e9a957be25d 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -227,7 +227,7 @@ template <class Device, class T>
 class MaxPoolingGradOp : public OpKernel {
  public:
   explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -289,16 +289,16 @@ class MaxPoolingGradOp : public OpKernel {
     OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum<int64_t>::v(),
                                                    tensor_out.shape(),
                                                    &tensor_out_arg_max));
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
     if (context->num_inputs() == 5) {
       const Tensor& tensor_ksize = context->input(3);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(4);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -351,8 +351,8 @@ class MaxPoolingGradOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
@@ -473,7 +473,7 @@ class MaxPoolingGradGradOp : public OpKernel {
  public:
   explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -518,16 +518,16 @@ class MaxPoolingGradGradOp : public OpKernel {
         context, out_grad_backprop.dims() == 4,
         errors::InvalidArgument("out_grad_backprop must be 4-dimensional"));
 
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
     if (context->num_inputs() == 5) {
       const Tensor& tensor_ksize = context->input(3);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(4);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -683,8 +683,8 @@ class MaxPoolingGradGradOp : public OpKernel {
           params.tensor_in_batch, shard_cost, shard);
   }
 
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
@@ -815,7 +815,7 @@ class MaxPoolingNoMaskOp : public OpKernel {
  public:
   explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -866,8 +866,8 @@ class MaxPoolingNoMaskOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
@@ -877,7 +877,7 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
  public:
   explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -912,17 +912,17 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& tensor_in = context->input(0);
 
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
 
     if (context->num_inputs() != 1) {
       const Tensor& tensor_ksize = context->input(1);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(2);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -956,8 +956,8 @@ class MaxPoolingNoMaskV2Op : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
@@ -1036,8 +1036,8 @@ class MaxPoolingWithArgmaxOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   bool propagate_nans_;
   bool include_batch_in_index_;
@@ -1109,7 +1109,7 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
  public:
   explicit MaxPoolingGradWithArgmaxOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format_str;
+    std::string data_format_str;
     if (std::is_same<Device, GPUDevice>::value) {
       OP_REQUIRES(context, !tensorflow::OpDeterminismRequired(),
                   errors::Unimplemented("Determinism is not yet supported "
@@ -1187,8 +1187,8 @@ class MaxPoolingGradWithArgmaxOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
   bool include_batch_in_index_;
@@ -1257,8 +1257,8 @@ class MaxPoolingGradGradWithArgmaxOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   bool include_batch_in_index_;
 };
diff --git a/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc b/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc
index cc838aace88f33..d1185f0d5d7998 100644
--- a/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc
+++ b/tensorflow/core/kernels/merge_v2_checkpoints_op_test.cc
@@ -34,7 +34,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-void WriteCheckpoint(const string& prefix, absl::Span<const string> names,
+void WriteCheckpoint(const std::string& prefix,
+                     absl::Span<const std::string> names,
                      absl::Span<const Tensor> tensors) {
   BundleWriter writer(Env::Default(), prefix);
   ASSERT_TRUE(names.size() == tensors.size());
@@ -65,12 +66,12 @@ class MergeV2CheckpointsOpTest : public OpsTestBase {
 
   void RunMergeTest(bool delete_old_dirs, bool allow_missing_files) {
     // Writes two checkpoints.
-    const std::vector<string> prefixes = {
+    const std::vector<std::string> prefixes = {
         io::JoinPath(testing::TmpDir(), "worker0/ckpt0"),
         io::JoinPath(testing::TmpDir(), "worker1/ckpt1"),
         io::JoinPath(testing::TmpDir(), "merged/ckpt") /* merged prefix */};
     // In a different directory, to exercise "delete_old_dirs".
-    const string& kMergedPrefix = prefixes[2];
+    const std::string& kMergedPrefix = prefixes[2];
 
     // Only write this particular checkpoint if we do not allow missing files.
     if (!allow_missing_files) {
@@ -123,9 +124,10 @@ class MergeV2CheckpointsOpTest : public OpsTestBase {
     for (int i = 0; i < 2; ++i) {
       // If we allow missing files, the first checkpoint file did not exist.
       if (allow_missing_files && i == 0) continue;
-      int directory_found = Env::Default()
-                                ->IsDirectory(string(io::Dirname(prefixes[i])))
-                                .raw_code();
+      int directory_found =
+          Env::Default()
+              ->IsDirectory(std::string(io::Dirname(prefixes[i])))
+              .raw_code();
       if (delete_old_dirs) {
         EXPECT_EQ(error::NOT_FOUND, directory_found);
       } else {
diff --git a/tensorflow/core/kernels/mfcc_op.cc b/tensorflow/core/kernels/mfcc_op.cc
index 2c5f9560aaa31c..760781605239fb 100644
--- a/tensorflow/core/kernels/mfcc_op.cc
+++ b/tensorflow/core/kernels/mfcc_op.cc
@@ -49,7 +49,7 @@ class MfccOp : public OpKernel {
                 errors::InvalidArgument(
                     "Input sample_rate should be a scalar tensor, got ",
                     sample_rate_tensor.shape().DebugString(), " instead."));
-    const int32_t sample_rate = sample_rate_tensor.scalar<int32>()();
+    const int32_t sample_rate = sample_rate_tensor.scalar<int32_t>()();
 
     const int spectrogram_channels = spectrogram.dim_size(2);
     const int spectrogram_samples = spectrogram.dim_size(1);
@@ -105,8 +105,8 @@ class MfccOp : public OpKernel {
  private:
   float upper_frequency_limit_;
   float lower_frequency_limit_;
-  int32 filterbank_channel_count_;
-  int32 dct_coefficient_count_;
+  int32_t filterbank_channel_count_;
+  int32_t dct_coefficient_count_;
 };
 REGISTER_KERNEL_BUILDER(Name("Mfcc").Device(DEVICE_CPU), MfccOp);
 
diff --git a/tensorflow/core/kernels/multinomial_op_test.cc b/tensorflow/core/kernels/multinomial_op_test.cc
index df2d0af01c7bea..e7ce0bddbd6119 100644
--- a/tensorflow/core/kernels/multinomial_op_test.cc
+++ b/tensorflow/core/kernels/multinomial_op_test.cc
@@ -29,7 +29,7 @@ static Graph* Multinomial(int batch_size, int num_classes, int num_samples) {
   Tensor logits_t(DT_FLOAT, TensorShape({batch_size, num_classes}));
   Tensor num_samples_t(DT_INT32, TensorShape());
   logits_t.flat<float>().setRandom();
-  num_samples_t.scalar<int32>().setConstant(num_samples);
+  num_samples_t.scalar<int32_t>().setConstant(num_samples);
 
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("multinomial"), "Multinomial")
diff --git a/tensorflow/core/kernels/mutex_ops.cc b/tensorflow/core/kernels/mutex_ops.cc
index 8fa7170c3c0c59..61a745df498cdd 100644
--- a/tensorflow/core/kernels/mutex_ops.cc
+++ b/tensorflow/core/kernels/mutex_ops.cc
@@ -36,7 +36,7 @@ namespace {
 
 class Mutex : public ResourceBase {
  public:
-  explicit Mutex(OpKernelContext* c, const string& name)
+  explicit Mutex(OpKernelContext* c, const std::string& name)
       : locked_(false),
         thread_pool_(new thread::ThreadPool(
             c->env(), ThreadOptions(),
@@ -46,7 +46,9 @@ class Mutex : public ResourceBase {
     VLOG(2) << "Creating mutex with name " << name << ": " << this;
   }
 
-  string DebugString() const override { return absl::StrCat("Mutex ", name_); }
+  std::string DebugString() const override {
+    return absl::StrCat("Mutex ", name_);
+  }
 
   class LockReleaser {
    public:
@@ -127,7 +129,7 @@ class Mutex : public ResourceBase {
   condition_variable cv_ TF_GUARDED_BY(mu_);
   bool locked_ TF_GUARDED_BY(mu_);
   std::unique_ptr<thread::ThreadPool> thread_pool_;
-  string name_;
+  std::string name_;
 };
 
 }  // namespace
diff --git a/tensorflow/core/kernels/nn_ops_test.cc b/tensorflow/core/kernels/nn_ops_test.cc
index 87b3f4d98d344f..dfaad0122c6e57 100644
--- a/tensorflow/core/kernels/nn_ops_test.cc
+++ b/tensorflow/core/kernels/nn_ops_test.cc
@@ -57,8 +57,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-static void SetConstOp(const string& name, std::initializer_list<int64_t> dims,
-                       DataType data_type, NodeDef* node) {
+static void SetConstOp(const std::string& name,
+                       std::initializer_list<int64_t> dims, DataType data_type,
+                       NodeDef* node) {
   Tensor tensor(data_type, TensorShape(dims));
   for (int64_t i = 0; i < tensor.NumElements(); ++i) {
     switch (data_type) {
@@ -81,13 +82,13 @@ static void SetConstOp(const string& name, std::initializer_list<int64_t> dims,
                   .Finalize(node));
 }
 
-static void SetConstSizesOp(const string& name, const std::vector<int32>& sizes,
-                            NodeDef* node) {
+static void SetConstSizesOp(const std::string& name,
+                            const std::vector<int32_t>& sizes, NodeDef* node) {
   TensorShape shape;
   shape.AddDim(sizes.size());
   Tensor tensor(DT_INT32, shape);
   for (int64_t i = 0; i < tensor.NumElements(); ++i) {
-    tensor.flat<int32>()(i) = sizes[i];
+    tensor.flat<int32_t>()(i) = sizes[i];
   }
   TF_CHECK_OK(NodeDefBuilder(name, "Const")
                   .Attr("dtype", DT_INT32)
@@ -112,7 +113,7 @@ static void BM_ConvFloat(::testing::benchmark::State& state, int batch,
                          int filter_rows, int filter_cols, CONV_OP op,
                          int num_threads, int stride, Padding padding,
                          bool use_gpu, DataType data_type,
-                         const string& label) {
+                         const std::string& label) {
   if (!IsGoogleCudaEnabled() && use_gpu) {
     state.SkipWithError(
         absl::StrCat("Skipping GPU test (no --config=cuda): ", label));
@@ -159,19 +160,19 @@ static void BM_ConvFloat(::testing::benchmark::State& state, int batch,
   SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth},
              data_type, graph.add_node());
   SetConstSizesOp("input_sizes",
-                  std::vector<int32>({batch, rows, cols, in_depth}),
+                  std::vector<int32_t>({batch, rows, cols, in_depth}),
                   graph.add_node());
   SetConstSizesOp(
       "filter_sizes",
-      std::vector<int32>({filter_rows, filter_cols, in_depth, out_depth}),
+      std::vector<int32_t>({filter_rows, filter_cols, in_depth, out_depth}),
       graph.add_node());
-  SetConstSizesOp("resize_size", std::vector<int32>({rows, cols}),
+  SetConstSizesOp("resize_size", std::vector<int32_t>({rows, cols}),
                   graph.add_node());
 
   TensorShape paddings_shape({4, 2});
   Tensor paddings_tensor(DT_INT32, paddings_shape);
   for (int64_t i = 0; i < paddings_tensor.NumElements(); ++i) {
-    paddings_tensor.flat<int32>()(i) = 0;
+    paddings_tensor.flat<int32_t>()(i) = 0;
   }
   TF_CHECK_OK(NodeDefBuilder("paddings", "Const")
                   .Attr("dtype", DT_INT32)
@@ -234,7 +235,7 @@ static void BM_ConvFloat(::testing::benchmark::State& state, int batch,
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g));
 
-  string device = use_gpu ? "gpu" : "cpu";
+  std::string device = use_gpu ? "gpu" : "cpu";
   test::Benchmark(device, g, &options, nullptr, nullptr, "",
                   /*old_benchmark_api*/ false)
       .Run(state);
@@ -540,7 +541,7 @@ static void BM_ConvFloatDepthwise(::testing::benchmark::State& state, int batch,
                                   int filter_rows, int filter_cols,
                                   DEPTHWISE_CONV_OP op, int num_threads,
                                   int stride, Padding padding, bool use_gpu,
-                                  const string& label) {
+                                  const std::string& label) {
   if (!IsGoogleCudaEnabled() && use_gpu) {
     state.SkipWithError(
         absl::StrCat("Skipping GPU test (no --config=cuda): ", label));
@@ -594,10 +595,10 @@ static void BM_ConvFloatDepthwise(::testing::benchmark::State& state, int batch,
   SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, dtype,
              graph.add_node());
   SetConstSizesOp("input_sizes",
-                  std::vector<int32>({batch, rows, cols, in_depth}),
+                  std::vector<int32_t>({batch, rows, cols, in_depth}),
                   graph.add_node());
   SetConstSizesOp("filter_sizes",
-                  std::vector<int32>(
+                  std::vector<int32_t>(
                       {filter_rows, filter_cols, in_depth, depth_multiplier}),
                   graph.add_node());
 
@@ -637,7 +638,7 @@ static void BM_ConvFloatDepthwise(::testing::benchmark::State& state, int batch,
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g));
 
-  string device = use_gpu ? "gpu" : "cpu";
+  std::string device = use_gpu ? "gpu" : "cpu";
   test::Benchmark(device, g, &options, nullptr, nullptr, "",
                   /*old_benchmark_api=*/false)
       .Run(state);
@@ -788,7 +789,7 @@ BM_ConvFloatDepthwiseBk_All(bfloat16);
 
 static void BM_LRNFloat(::testing::benchmark::State& state, int depth, int cols,
                         int rows, int batch_size, int range, int num_threads,
-                        const string& label) {
+                        const std::string& label) {
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
 
@@ -869,7 +870,7 @@ AvgPooling Op
 static void BM_AvgPool(::testing::benchmark::State& state, int batch_size,
                        int rows, int cols, int depth, int kernel_rows,
                        int kernel_cols, int stride, Padding padding,
-                       int num_threads, const string& label) {
+                       int num_threads, const std::string& label) {
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
 
@@ -960,7 +961,7 @@ BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME");
 static void BM_AvgPoolBk(::testing::benchmark::State& state, int batch_size,
                          int rows, int cols, int depth, int kernel_rows,
                          int kernel_cols, int stride, Padding padding,
-                         int num_threads, const string& label) {
+                         int num_threads, const std::string& label) {
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
 
@@ -979,9 +980,9 @@ static void BM_AvgPoolBk(::testing::benchmark::State& state, int batch_size,
   TensorShape output_shape({batch_size, out_height, out_width, depth});
   TensorShape shape2({4});
   Tensor input_shape_tensor(DT_INT32, shape2);
-  int32 input_dims[] = {batch_size, rows, cols, depth};
+  int32_t input_dims[] = {batch_size, rows, cols, depth};
   for (int i = 0; i < 4; i++) {
-    input_shape_tensor.flat<int32>()(i) = input_dims[i];
+    input_shape_tensor.flat<int32_t>()(i) = input_dims[i];
   }
   inputs.push_back({nullptr, &input_shape_tensor});
 
@@ -1063,7 +1064,7 @@ MaxPooling Op
 static void BM_MaxPool(::testing::benchmark::State& state, int batch_size,
                        int rows, int cols, int depth, int kernel_rows,
                        int kernel_cols, int stride, Padding padding,
-                       int num_threads, const string& label) {
+                       int num_threads, const std::string& label) {
   SessionOptions options;
   options.config.set_intra_op_parallelism_threads(num_threads);
 
@@ -1158,7 +1159,8 @@ BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME");
 static void BM_MaxPoolBk(::testing::benchmark::State& state, int batch_size,
                          int rows, int cols, int depth, int kernel_rows,
                          int kernel_cols, int stride, Padding padding,
-                         int num_threads, bool use_gpu, const string& label) {
+                         int num_threads, bool use_gpu,
+                         const std::string& label) {
   if (!IsGoogleCudaEnabled() && use_gpu) {
     state.SkipWithError(
         absl::StrCat("Skipping GPU test (no --config=cuda): ", label));
@@ -1192,7 +1194,7 @@ static void BM_MaxPoolBk(::testing::benchmark::State& state, int batch_size,
   TF_CHECK_OK(root.status());
   Graph* g = new Graph(OpRegistry::Global());
   TF_CHECK_OK(root.ToGraph(g));
-  string device = use_gpu ? "gpu" : "cpu";
+  std::string device = use_gpu ? "gpu" : "cpu";
   test::Benchmark(device, g, /*old_benchmark_api*/ false).Run(state);
 
   state.SetItemsProcessed(batch_size * rows * cols * depth *
@@ -1252,7 +1254,7 @@ Run benchmark with:
 */
 static void BM_ReluFloat(::testing::benchmark::State& state, int batch_size,
                          int rows, int cols, int depth, int num_threads,
-                         const string& label) {
+                         const std::string& label) {
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
 
@@ -1323,7 +1325,7 @@ Run benchmark with:
 */
 static void BM_SoftplusFloat(::testing::benchmark::State& state, int batch_size,
                              int rows, int cols, int depth, int num_threads,
-                             const string& label) {
+                             const std::string& label) {
   std::unique_ptr<Device> device(
       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
 
@@ -1392,7 +1394,7 @@ BM_Softplus(32, 14, 14, 576, 4, "softplus10");
 static void BM_ImageNetSoftmaxFwd(::testing::benchmark::State& state,
                                   int batch_size, int node_depth,
                                   int num_threads, bool use_gpu,
-                                  const string& label) {
+                                  const std::string& label) {
   if (!IsGoogleCudaEnabled() && use_gpu) {
     state.SkipWithError(
         absl::StrCat("Skipping GPU test (no --config=cuda): ", label));
@@ -1409,7 +1411,7 @@ static void BM_ImageNetSoftmaxFwd(::testing::benchmark::State& state,
   TF_CHECK_OK(root.status());
   Graph* g = new Graph(OpRegistry::Global());
   TF_CHECK_OK(root.ToGraph(g));
-  string device = use_gpu ? "gpu" : "cpu";
+  std::string device = use_gpu ? "gpu" : "cpu";
   SessionOptions opts;
   opts.config.set_inter_op_parallelism_threads(1);
   opts.config.set_intra_op_parallelism_threads(num_threads);
@@ -1444,7 +1446,8 @@ BM_ImageNetSoftmaxFwd(8192, 1024, 1, true, "softmax32");
 BM_ImageNetSoftmaxFwd(8192, 32768, 1, true, "softmax128");
 
 static void BM_TopK(::testing::benchmark::State& state, int rows, int cols,
-                    int k, int num_threads, bool use_gpu, const string& label) {
+                    int k, int num_threads, bool use_gpu,
+                    const std::string& label) {
   if (!IsGoogleCudaEnabled() && use_gpu) {
     state.SkipWithError(
         absl::StrCat("Skipping GPU test (no --config=cuda): ", label));
@@ -1458,14 +1461,14 @@ static void BM_TopK(::testing::benchmark::State& state, int rows, int cols,
   input.flat<float>().setRandom();
 
   Tensor input_k(DT_INT32, TensorShape({}));
-  input_k.scalar<int32>()() = k;
+  input_k.scalar<int32_t>()() = k;
 
   auto top_k = ops::TopK(root, input, input_k, ops::TopK::Sorted(true));
 
   TF_CHECK_OK(root.status());
   Graph* g = new Graph(OpRegistry::Global());
   TF_CHECK_OK(root.ToGraph(g));
-  string device = use_gpu ? "gpu" : "cpu";
+  std::string device = use_gpu ? "gpu" : "cpu";
   SessionOptions opts;
   opts.config.set_inter_op_parallelism_threads(1);
   opts.config.set_intra_op_parallelism_threads(num_threads);
diff --git a/tensorflow/core/kernels/nth_element_op.cc b/tensorflow/core/kernels/nth_element_op.cc
index 104a4c9421d188..12db3b63d8cdad 100644
--- a/tensorflow/core/kernels/nth_element_op.cc
+++ b/tensorflow/core/kernels/nth_element_op.cc
@@ -43,7 +43,7 @@ class NthElementOp : public OpKernel {
     OP_REQUIRES(
         context, TensorShapeUtils::IsScalar(n_in.shape()),
         errors::InvalidArgument("N must be scalar but has rank ", n_in.dims()));
-    int n = n_in.scalar<int32>()();
+    int n = n_in.scalar<int32_t>()();
     OP_REQUIRES(context, n >= 0,
                 errors::InvalidArgument("n must be non-negative but is ", n));
 
diff --git a/tensorflow/core/kernels/one_hot_op.cc b/tensorflow/core/kernels/one_hot_op.cc
index 1a7ef6a9a46d0f..4a205ac3503f2e 100644
--- a/tensorflow/core/kernels/one_hot_op.cc
+++ b/tensorflow/core/kernels/one_hot_op.cc
@@ -77,7 +77,7 @@ class OneHotOp : public OpKernel {
     const int axis = (axis_ == -1) ? indices_dims : axis_;
 
     // The one-hot dimension.
-    const int32_t depth_v = depth.scalar<int32>()();
+    const int32_t depth_v = depth.scalar<int32_t>()();
     OP_REQUIRES(
         ctx, depth_v >= 0,
         errors::InvalidArgument("depth must be non-negative, got: ", depth_v));
@@ -122,7 +122,7 @@ class OneHotOp : public OpKernel {
   }
 
  private:
-  int32 axis_;
+  int32_t axis_;
 
   OneHotOp(const OneHotOp&) = delete;
   void operator=(const OneHotOp&) = delete;
diff --git a/tensorflow/core/kernels/ops_testutil.cc b/tensorflow/core/kernels/ops_testutil.cc
index 4efbac731bcaf2..ec0c6a1adcadf5 100644
--- a/tensorflow/core/kernels/ops_testutil.cc
+++ b/tensorflow/core/kernels/ops_testutil.cc
@@ -176,7 +176,7 @@ void OpsTestBase::CreateContext() {
   params_->frame_iter = FrameAndIter(0, 0);
   params_->inputs = inputs_;
   params_->op_kernel = kernel_.get();
-  step_container_.reset(new ScopedStepContainer(0, [](const string&) {}));
+  step_container_.reset(new ScopedStepContainer(0, [](const std::string&) {}));
   params_->step_container = step_container_.get();
   test::SetOutputAttrs(params_.get(), &out_alloc_attrs_);
   params_->slice_reader_cache = &slice_reader_cache_wrapper_;
diff --git a/tensorflow/core/kernels/ops_testutil.h b/tensorflow/core/kernels/ops_testutil.h
index ef4a7cd5142cde..da2ccad9cbba72 100644
--- a/tensorflow/core/kernels/ops_testutil.h
+++ b/tensorflow/core/kernels/ops_testutil.h
@@ -119,7 +119,7 @@ class OpsTestBase : public ::testing::Test {
   // Adds a Resource type as input. If <container> is empty, uses the default
   // container name.
   template <typename T>
-  void AddResourceInput(const string& container, const string& name,
+  void AddResourceInput(const std::string& container, const std::string& name,
                         T* resource) {
     CHECK_GT(input_types_.size(), inputs_.size())
         << "Adding more inputs than types; perhaps you need to call MakeOp";
diff --git a/tensorflow/core/kernels/padding_fifo_queue.cc b/tensorflow/core/kernels/padding_fifo_queue.cc
index 3b50099fb9997c..bd9a07006a8870 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.cc
+++ b/tensorflow/core/kernels/padding_fifo_queue.cc
@@ -36,7 +36,8 @@ namespace tensorflow {
 
 PaddingFIFOQueue::PaddingFIFOQueue(
     int capacity, const DataTypeVector& component_dtypes,
-    const std::vector<PartialTensorShape>& component_shapes, const string& name)
+    const std::vector<PartialTensorShape>& component_shapes,
+    const std::string& name)
     : FIFOQueue(capacity, component_dtypes,
                 ConvertShapesPartialDimensionsToZero(component_shapes), name),
       partial_shapes_(component_shapes) {}
diff --git a/tensorflow/core/kernels/padding_fifo_queue.h b/tensorflow/core/kernels/padding_fifo_queue.h
index 74107e80b1977b..f05862ff9b3bdd 100644
--- a/tensorflow/core/kernels/padding_fifo_queue.h
+++ b/tensorflow/core/kernels/padding_fifo_queue.h
@@ -36,7 +36,7 @@ class PaddingFIFOQueue : public FIFOQueue {
  public:
   PaddingFIFOQueue(int32_t capacity, const DataTypeVector& component_dtypes,
                    const std::vector<PartialTensorShape>& component_shapes,
-                   const string& name);
+                   const std::string& name);
 
   absl::Status Initialize() override;
 
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index 782c22c5efd43c..66ec30bc4a2136 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -777,8 +777,8 @@ class StatelessParameterizedTruncatedNormal : public OpKernel {
                                 shape_tensor.shape().DebugString()));
     TensorShape output_shape;
     if (shape_tensor.dtype() == DataType::DT_INT32) {
-      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec<int32>(),
-                                                      &output_shape));
+      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                              shape_tensor.vec<int32_t>(), &output_shape));
     } else {
       OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
                               shape_tensor.vec<int64_t>(), &output_shape));
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc
index 1d5865587e0c13..1257b8da742ce2 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc
@@ -27,7 +27,7 @@ namespace tensorflow {
 static Graph* PTruncatedNormal(int num_batches, int samples_per_batch) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor shape_t(DT_INT32, TensorShape({2}));
-  shape_t.flat<int32>().setValues({num_batches, samples_per_batch});
+  shape_t.flat<int32_t>().setValues({num_batches, samples_per_batch});
 
   // Use mean 0 and stdev 1
   Tensor means_t(DT_FLOAT, TensorShape({num_batches}));
@@ -56,7 +56,7 @@ static Graph* PTruncatedNormal(int num_batches, int samples_per_batch) {
 static Graph* PTruncatedNormal2SD(int num_batches, int samples_per_batch) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor shape_t(DT_INT32, TensorShape({2}));
-  shape_t.flat<int32>().setValues({num_batches, samples_per_batch});
+  shape_t.flat<int32_t>().setValues({num_batches, samples_per_batch});
 
   Tensor means_t(DT_FLOAT, TensorShape({num_batches}));
   means_t.flat<float>().setConstant(0.0);
@@ -83,7 +83,7 @@ static Graph* PTruncatedNormal2SD(int num_batches, int samples_per_batch) {
 static Graph* PTruncatedNormalOneTail(int num_batches, int samples_per_batch) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor shape_t(DT_INT32, TensorShape({2}));
-  shape_t.flat<int32>().setValues({num_batches, samples_per_batch});
+  shape_t.flat<int32_t>().setValues({num_batches, samples_per_batch});
 
   Tensor means_t(DT_FLOAT, TensorShape({num_batches}));
   means_t.flat<float>().setConstant(0.0);
diff --git a/tensorflow/core/kernels/parse_tensor_test.cc b/tensorflow/core/kernels/parse_tensor_test.cc
index 1473eff064e3ea..d5a40489b64fd3 100644
--- a/tensorflow/core/kernels/parse_tensor_test.cc
+++ b/tensorflow/core/kernels/parse_tensor_test.cc
@@ -106,8 +106,9 @@ TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_double) {
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int64) {
-  MakeOp<int64_t>(TensorShape({2, 3, 4}),
-                  [](int x) -> int64 { return static_cast<int64_t>(x - 10); });
+  MakeOp<int64_t>(TensorShape({2, 3, 4}), [](int x) -> int64_t {
+    return static_cast<int64_t>(x - 10);
+  });
   TF_ASSERT_OK(RunOpKernel());
   Tensor parse_output;
   ParseSerializedOutput<int64_t>(GetOutput(0), &parse_output);
@@ -115,48 +116,50 @@ TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int64) {
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int32) {
-  MakeOp<int32>(TensorShape({4, 2}),
-                [](int x) -> int32 { return static_cast<int32>(x + 7); });
+  MakeOp<int32_t>(TensorShape({4, 2}),
+                  [](int x) -> int32_t { return static_cast<int32_t>(x + 7); });
   TF_ASSERT_OK(RunOpKernel());
   Tensor parse_output;
-  ParseSerializedOutput<int32>(GetOutput(0), &parse_output);
-  test::ExpectTensorEqual<int32>(parse_output, GetInput(0));
+  ParseSerializedOutput<int32_t>(GetOutput(0), &parse_output);
+  test::ExpectTensorEqual<int32_t>(parse_output, GetInput(0));
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int16) {
-  MakeOp<int16>(TensorShape({8}),
-                [](int x) -> int16 { return static_cast<int16>(x + 18); });
+  MakeOp<int16_t>(TensorShape({8}), [](int x) -> int16_t {
+    return static_cast<int16_t>(x + 18);
+  });
   TF_ASSERT_OK(RunOpKernel());
   Tensor parse_output;
-  ParseSerializedOutput<int16>(GetOutput(0), &parse_output);
-  test::ExpectTensorEqual<int16>(parse_output, GetInput(0));
+  ParseSerializedOutput<int16_t>(GetOutput(0), &parse_output);
+  test::ExpectTensorEqual<int16_t>(parse_output, GetInput(0));
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_int8) {
-  MakeOp<int8>(TensorShape({2}),
-               [](int x) -> int8 { return static_cast<int8>(x + 8); });
+  MakeOp<int8_t>(TensorShape({2}),
+                 [](int x) -> int8_t { return static_cast<int8_t>(x + 8); });
   TF_ASSERT_OK(RunOpKernel());
   Tensor parse_output;
-  ParseSerializedOutput<int8>(GetOutput(0), &parse_output);
-  test::ExpectTensorEqual<int8>(parse_output, GetInput(0));
+  ParseSerializedOutput<int8_t>(GetOutput(0), &parse_output);
+  test::ExpectTensorEqual<int8_t>(parse_output, GetInput(0));
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint16) {
-  MakeOp<uint16>(TensorShape({1, 3}),
-                 [](int x) -> uint16 { return static_cast<uint16>(x + 2); });
+  MakeOp<uint16_t>(TensorShape({1, 3}), [](int x) -> uint16_t {
+    return static_cast<uint16_t>(x + 2);
+  });
   TF_ASSERT_OK(RunOpKernel());
   Tensor parse_output;
-  ParseSerializedOutput<uint16>(GetOutput(0), &parse_output);
-  test::ExpectTensorEqual<uint16>(parse_output, GetInput(0));
+  ParseSerializedOutput<uint16_t>(GetOutput(0), &parse_output);
+  test::ExpectTensorEqual<uint16_t>(parse_output, GetInput(0));
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_uint8) {
-  MakeOp<uint8>(TensorShape({2, 1, 1}),
-                [](int x) -> uint8 { return static_cast<uint8>(x + 1); });
+  MakeOp<uint8_t>(TensorShape({2, 1, 1}),
+                  [](int x) -> uint8_t { return static_cast<uint8_t>(x + 1); });
   TF_ASSERT_OK(RunOpKernel());
   Tensor parse_output;
-  ParseSerializedOutput<uint8>(GetOutput(0), &parse_output);
-  test::ExpectTensorEqual<uint8>(parse_output, GetInput(0));
+  ParseSerializedOutput<uint8_t>(GetOutput(0), &parse_output);
+  test::ExpectTensorEqual<uint8_t>(parse_output, GetInput(0));
 }
 
 TEST_F(SerializeTensorOpTest, SerializeTensorOpTest_complex64) {
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 97b08ce6fd2982..bbff2dc35654ad 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -43,9 +43,9 @@ PartitionedCallOp::PartitionedCallOp(OpKernelConstruction* ctx)
       shared_rendezvous_(false) {
   OP_REQUIRES_OK(
       ctx, ctx->GetAttr(FunctionLibraryDefinition::kFuncAttr, func_.get()));
-  string deprecated_config_serialized;
+  std::string deprecated_config_serialized;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized));
-  string config_proto_serialized;
+  std::string config_proto_serialized;
   OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized));
   OP_REQUIRES(
       ctx,
@@ -232,7 +232,7 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle,
   FunctionLibraryRuntime::Options run_opts;
   ResourceMgr* resource_mgr = lib->device()->resource_manager();
   ScopedStepContainer* step_container = new ScopedStepContainer(
-      run_opts.step_id, [resource_mgr](const string& name) {
+      run_opts.step_id, [resource_mgr](const std::string& name) {
         resource_mgr->Cleanup(name).IgnoreError();
       });
   run_opts.step_container = step_container;
@@ -251,13 +251,13 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle,
   }
 
   std::vector<Tensor>* rets = new std::vector<Tensor>;
-  const string& func_name = func_->name();
+  const std::string& func_name = func_->name();
   tsl::profiler::TraceMe trace_me("PartitionedCallOp");
   lib->Run(run_opts, handle, inputs, rets,
            [rets, done = std::move(done), ctx, func_name,
             step_container](const absl::Status& status) {
              if (!status.ok()) {
-               const string function_and_msg =
+               const std::string function_and_msg =
                    absl::StrCat(errors::FormatFunctionForError(func_name), " ",
                                 status.message());
                ctx->SetStatus(
diff --git a/tensorflow/core/kernels/partitioned_function_ops.h b/tensorflow/core/kernels/partitioned_function_ops.h
index 2b2ec8ea959f7c..f38ad56e8a9f73 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.h
+++ b/tensorflow/core/kernels/partitioned_function_ops.h
@@ -57,7 +57,7 @@ class PartitionedCallOp : public AsyncOpKernel {
   // Using unique pointers to avoid including proto headers in kernel headers
   std::unique_ptr<NameAttrList> func_;
   std::unique_ptr<ConfigProto> config_proto_;
-  string executor_type_;
+  std::string executor_type_;
   bool shared_rendezvous_;
   mutex mu_;
   // Cache the handle per FLR because this kernel may be instantiated for
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 28e24e79fe0bcf..42e00c52a8c814 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -46,8 +46,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
 Pool3dParameters::Pool3dParameters(OpKernelContext* context,
-                                   const std::vector<int32>& ksize,
-                                   const std::vector<int32>& stride,
+                                   const std::vector<int32_t>& ksize,
+                                   const std::vector<int32_t>& stride,
                                    Padding padding, TensorFormat data_format,
                                    const TensorShape& tensor_in_shape) {
   // For maxpooling, tensor_in should have 4 dimensions.
@@ -97,9 +97,9 @@ absl::Status Pool3dParameters::forward_output_shape(TensorShape* shape) {
 template <typename T>
 struct LaunchPoolingOp<CPUDevice, T, AVG> {
   static void launch(OpKernelContext* context, const Tensor& tensor_in,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
     output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
@@ -112,9 +112,9 @@ struct LaunchPoolingOp<CPUDevice, T, AVG> {
 template <typename T>
 struct LaunchPoolingOp<CPUDevice, T, MAX> {
   static void launch(OpKernelContext* context, const Tensor& tensor_in,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
     output->tensor<T, 5>().device(context->eigen_device<CPUDevice>()) =
@@ -128,7 +128,7 @@ template <typename Device, typename T, PoolingType Type>
 class Pooling3DOp : public UnaryOp<T> {
  public:
   explicit Pooling3DOp(OpKernelConstruction* context) : UnaryOp<T>(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -204,8 +204,8 @@ class Pooling3DOp : public UnaryOp<T> {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
@@ -214,10 +214,10 @@ template <typename T>
 struct LaunchMaxPooling3dGradOp<CPUDevice, T> {
   static void launch(OpKernelContext* context, const Tensor& tensor_in,
                      const Tensor& tensor_out, const Tensor& out_backprop,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& out,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& out,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Tensor* output) {
     output->flat<T>().setZero();
     for (int64_t p = 0; p < out_backprop.dim_size(3); ++p) {
@@ -307,7 +307,7 @@ class MaxPooling3dGradOp : public OpKernel {
  public:
   explicit MaxPooling3dGradOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -391,8 +391,8 @@ class MaxPooling3dGradOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
@@ -402,10 +402,10 @@ struct LaunchAvgPooling3dGradOp<CPUDevice, T> {
   static void launch(OpKernelContext* context,
                      const TensorShape& tensor_in_shape,
                      const Tensor& out_backprop,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& output_shape,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& output_shape,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Tensor* output) {
     OP_REQUIRES(
         context, tensor_in_shape.dim_size(0) == out_backprop.dim_size(0),
@@ -487,7 +487,7 @@ class AvgPooling3dGradOp : public OpKernel {
  public:
   explicit AvgPooling3dGradOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -536,7 +536,7 @@ class AvgPooling3dGradOp : public OpKernel {
                 errors::InvalidArgument("out_backprop must be 5-dimensional"));
 
     TensorShape output_shape;
-    auto shape_vec = tensor_in_shape.vec<int32>();
+    auto shape_vec = tensor_in_shape.vec<int32_t>();
     for (int64_t i = 0; i < tensor_in_shape.NumElements(); ++i) {
       OP_REQUIRES_OK(context, output_shape.AddDimWithStatus(shape_vec(i)));
     }
@@ -568,8 +568,8 @@ class AvgPooling3dGradOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
@@ -693,7 +693,7 @@ class MaxPooling3dGradGradOp : public OpKernel {
  public:
   explicit MaxPooling3dGradGradOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -779,8 +779,8 @@ class MaxPooling3dGradGradOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
diff --git a/tensorflow/core/kernels/pooling_ops_3d.h b/tensorflow/core/kernels/pooling_ops_3d.h
index c0a589ff95092a..edc59f89f760bb 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.h
+++ b/tensorflow/core/kernels/pooling_ops_3d.h
@@ -39,8 +39,8 @@ struct LaunchMaxPooling3dGradGradOp;
 // A helper class to manage sizes and shapes for 3d pooling operations.
 struct Pool3dParameters {
   // Updates context->status if there is an invalid input.
-  Pool3dParameters(OpKernelContext* context, const std::vector<int32>& ksize,
-                   const std::vector<int32>& stride, Padding padding,
+  Pool3dParameters(OpKernelContext* context, const std::vector<int32_t>& ksize,
+                   const std::vector<int32_t>& stride, Padding padding,
                    TensorFormat data_format,
                    const TensorShape& tensor_in_shape);
 
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index 4ccca647c154aa..ac0cd5df525b90 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -49,7 +49,7 @@ struct RawType {
 
 template <>
 struct RawType<qint8> {
-  using type = int8;
+  using type = int8_t;
 };
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -117,8 +117,8 @@ absl::Status CheckPaddingSize(int64_t window_rows, int64_t window_cols,
 }
 
 PoolParameters::PoolParameters(OpKernelContext* context,
-                               const std::vector<int32>& ksize,
-                               const std::vector<int32>& stride,
+                               const std::vector<int32_t>& ksize,
+                               const std::vector<int32_t>& stride,
                                Padding padding,
                                std::vector<int64_t> explicit_paddings,
                                TensorFormat data_format,
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index bb5dda562af672..71cddc32bbb3c5 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -47,8 +47,8 @@ struct PoolParameters {
   // Updates context->status if there is an invalid input.
   // explicit_paddings has eight elements if padding==EXPLIICT, and zero
   // elements otherwise.
-  PoolParameters(OpKernelContext* context, const std::vector<int32>& ksize,
-                 const std::vector<int32>& stride, Padding padding,
+  PoolParameters(OpKernelContext* context, const std::vector<int32_t>& ksize,
+                 const std::vector<int32_t>& stride, Padding padding,
                  std::vector<int64_t> explicit_paddings,
                  TensorFormat data_format, const TensorShape& tensor_in_shape);
 
@@ -90,7 +90,7 @@ template <typename Device, typename T>
 class MaxPoolingOp : public OpKernel {
  public:
   explicit MaxPoolingOp(OpKernelConstruction* context) : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     auto status = context->GetAttr("data_format", &data_format);
     if (status.ok()) {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
@@ -297,8 +297,8 @@ class MaxPoolingOp : public OpKernel {
     }
   }
 
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
@@ -338,7 +338,7 @@ template <typename Device, typename T>
 class MaxPoolingV2Op : public OpKernel {
  public:
   explicit MaxPoolingV2Op(OpKernelConstruction* context) : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     auto status = context->GetAttr("data_format", &data_format);
     if (status.ok()) {
       OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
@@ -375,17 +375,17 @@ class MaxPoolingV2Op : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& tensor_in = context->input(0);
 
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
 
     if (context->num_inputs() != 1) {
       const Tensor& tensor_ksize = context->input(1);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(2);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -572,8 +572,8 @@ class MaxPoolingV2Op : public OpKernel {
     }
   }
 
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
 };
diff --git a/tensorflow/core/kernels/population_count_op.cc b/tensorflow/core/kernels/population_count_op.cc
index 9d0fc7530ae889..c43415982f257a 100644
--- a/tensorflow/core/kernels/population_count_op.cc
+++ b/tensorflow/core/kernels/population_count_op.cc
@@ -49,7 +49,7 @@ class PopulationCountOp : public OpKernel {
     OP_REQUIRES_OK(c, c->allocate_output(0, input_t.shape(), &output_t));
 
     auto input = input_t.flat<T>();
-    auto output = output_t->flat<uint8>();
+    auto output = output_t->flat<uint8_t>();
 
     functor::PopulationCount<Device, T> popcnt;
     popcnt(c, input, output);
@@ -77,7 +77,7 @@ namespace functor {
 namespace {
 
 template <typename T>
-inline uint8 PopCnt(const T v);
+inline uint8_t PopCnt(const T v);
 
 #define POPCNT(T, N)                  \
   template <>                         \
@@ -86,13 +86,13 @@ inline uint8 PopCnt(const T v);
   }
 
 POPCNT(int8_t, 8);
-POPCNT(uint8, 8);
+POPCNT(uint8_t, 8);
 POPCNT(int16_t, 16);
-POPCNT(uint16, 16);
+POPCNT(uint16_t, 16);
 POPCNT(int32_t, 32);
-POPCNT(uint32, 32);
+POPCNT(uint32_t, 32);
 POPCNT(int64_t, 64);
-POPCNT(uint64, 64);
+POPCNT(uint64_t, 64);
 
 #undef POPCNT
 
@@ -101,9 +101,9 @@ POPCNT(uint64, 64);
 template <typename T>
 struct PopulationCount<CPUDevice, T> {
   void operator()(OpKernelContext* c, typename TTypes<T>::ConstFlat input,
-                  TTypes<uint8>::Flat output) {
+                  TTypes<uint8_t>::Flat output) {
     const T* input_ptr = input.data();
-    uint8* output_ptr = output.data();
+    uint8_t* output_ptr = output.data();
     auto shard = [input_ptr, output_ptr](int64_t start, int64_t limit) {
       for (int64_t i = start; i < limit; ++i) {
         output_ptr[i] = PopCnt<T>(input_ptr[i]);
@@ -113,8 +113,9 @@ struct PopulationCount<CPUDevice, T> {
     // Approximating cost of popcnt: convert T to int64
     // (std::bitset constructor) and convert int64 to uint8
     // (bitset.count() -> output).  The .count() itself is relatively cheap.
-    const double total_cost = (Eigen::TensorOpCost::CastCost<T, uint8>() +
-                               Eigen::TensorOpCost::CastCost<int64_t, uint8>());
+    const double total_cost =
+        (Eigen::TensorOpCost::CastCost<T, uint8_t>() +
+         Eigen::TensorOpCost::CastCost<int64_t, uint8_t>());
     const int64_t shard_cost =
         (total_cost >= static_cast<double>(std::numeric_limits<int64_t>::max()))
             ? std::numeric_limits<int64_t>::max()
diff --git a/tensorflow/core/kernels/population_count_op.h b/tensorflow/core/kernels/population_count_op.h
index 2c9812967366d8..b9811e59c3ea38 100644
--- a/tensorflow/core/kernels/population_count_op.h
+++ b/tensorflow/core/kernels/population_count_op.h
@@ -28,7 +28,7 @@ namespace functor {
 template <typename Device, typename T>
 struct PopulationCount {
   void operator()(OpKernelContext* c, typename TTypes<T>::ConstFlat input,
-                  TTypes<uint8>::Flat output);
+                  TTypes<uint8_t>::Flat output);
 };
 
 }  // namespace functor
diff --git a/tensorflow/core/kernels/priority_queue.cc b/tensorflow/core/kernels/priority_queue.cc
index 56ea77fdbcf2ca..490cc338ddb99c 100644
--- a/tensorflow/core/kernels/priority_queue.cc
+++ b/tensorflow/core/kernels/priority_queue.cc
@@ -37,7 +37,7 @@ namespace tensorflow {
 PriorityQueue::PriorityQueue(int32_t capacity,
                              const DataTypeVector& component_dtypes,
                              const std::vector<TensorShape>& component_shapes,
-                             const string& name)
+                             const std::string& name)
     : TypedQueue(capacity, component_dtypes, component_shapes, name) {}
 
 absl::Status PriorityQueue::Initialize() {
diff --git a/tensorflow/core/kernels/priority_queue.h b/tensorflow/core/kernels/priority_queue.h
index f7ca800a66bf7a..46408300778673 100644
--- a/tensorflow/core/kernels/priority_queue.h
+++ b/tensorflow/core/kernels/priority_queue.h
@@ -50,7 +50,7 @@ class PriorityQueue
  public:
   PriorityQueue(int32_t capacity, const DataTypeVector& component_dtypes,
                 const std::vector<TensorShape>& component_shapes,
-                const string& name);
+                const std::string& name);
 
   absl::Status Initialize()
       override;  // Must be called before any other method.
@@ -69,7 +69,7 @@ class PriorityQueue
   absl::Status MatchesPriorityNodeDefTypes(const NodeDef& node_def) const;
   absl::Status MatchesPriorityNodeDefShapes(const NodeDef& node_def) const;
 
-  int32 size() const override {
+  int32_t size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index 88bee91121641a..f10b5e823d4143 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -84,7 +84,7 @@ T FloatToQuantized(float input, float range_min, float range_max) {
       static_cast<int64_t>(Eigen::NumTraits<T>::highest());
   quantized = std::max(quantized, lowest_quantized);
   quantized = std::min(quantized, highest_quantized);
-  return static_cast<T>(static_cast<int32>(quantized));
+  return static_cast<T>(static_cast<int32_t>(quantized));
 }
 
 template <class T>
@@ -284,7 +284,7 @@ inline void RequantizeManyInNewRangeReference(const qint32* input,
     int64_t quantized_int64 = round_intermediate >> fp_shift;
     quantized_int64 = std::max(quantized_int64, int64_t{0});
     quantized_int64 = std::min(quantized_int64, int64_t{255});
-    output[index] = static_cast<quint8>(static_cast<int32>(quantized_int64));
+    output[index] = static_cast<quint8>(static_cast<int32_t>(quantized_int64));
   }
 }
 
@@ -310,7 +310,7 @@ inline void RequantizeManyInNewRange8To32BitReference(
     int64_t output_value = code_0_int64 + (input_value * mult_int32);
     output_value = std::max(output_value, lowest_quantized);
     output_value = std::min(output_value, highest_quantized);
-    output[i] = static_cast<int32>(output_value);
+    output[i] = static_cast<int32_t>(output_value);
   }
 }
 
@@ -725,7 +725,7 @@ inline void RequantizeManyInNewRangeUsingEigen<qint32, quint8>(
   auto intermediate = fp_value.unaryExpr(int64_right_shift_op<fp_shift>());
   auto input_requantized = intermediate.cwiseMax(int64_t{0})
                                .cwiseMin(int64_t{255})
-                               .template cast<int32>()
+                               .template cast<int32_t>()
                                .template cast<quint8>();
   output->flat<quint8>().device(device) = input_requantized;
 }
diff --git a/tensorflow/core/kernels/quantization_utils_test.cc b/tensorflow/core/kernels/quantization_utils_test.cc
index 689e98cfebb2de..6c0251b7249484 100644
--- a/tensorflow/core/kernels/quantization_utils_test.cc
+++ b/tensorflow/core/kernels/quantization_utils_test.cc
@@ -60,7 +60,7 @@ void TestRequantizeMany(Eigen::ThreadPoolDevice* eigen_device, float input_min,
         &o_tensor);
   }
 
-  const string tolerance_str = absl::StrCat("+-", tolerance);
+  const std::string tolerance_str = absl::StrCat("+-", tolerance);
   for (size_t value_index = 0; value_index < values_count; ++value_index) {
     int e = expected_values[value_index];
     int v = output_values(value_index);
@@ -96,7 +96,7 @@ void TestRequantizeMany8To32Bit(float input_min, float input_max,
                            input_max, output_min, output_max,
                            output_values.data());
 
-  const string tolerance_str = absl::StrCat("+-", tolerance);
+  const std::string tolerance_str = absl::StrCat("+-", tolerance);
   for (int value_index = 0; value_index < values_count; ++value_index) {
     const qint32 e = expected_values[value_index];
     const qint32 v = output_values(value_index);
@@ -143,7 +143,7 @@ void TestRequantizeManyInNewRange32To8Bit(
     qint32 high = Eigen::NumTraits<qint32>::highest();
     std::vector<qint32> vals{low, high};
     int num_steps = 14419;
-    qint32 step = static_cast<int32>((1LL << 32) / num_steps);
+    qint32 step = static_cast<int32_t>((1LL << 32) / num_steps);
     qint32 v = low + static_cast<qint32>(1);
     for (int i = 0; i < num_steps; ++i) {
       vals.push_back(v);
@@ -405,7 +405,7 @@ void TestQuantizedToFloatInPlaceUsingEigen(
         input_array(i) = Eigen::NumTraits<T>::lowest() + i;
       } else {
         int64_t offset = static_cast<int64_t>(q_range / values_count * i);
-        input_array(i) = static_cast<int32>(
+        input_array(i) = static_cast<int32_t>(
             std::min<int64_t>(Eigen::NumTraits<T>::lowest() + offset,
                               Eigen::NumTraits<T>::highest()));
       }
@@ -662,8 +662,8 @@ void TestOverflowWithEigen() {
   // because the implementation does a bounds check using float, not int32.
   test::FillValues<qint32>(
       &expected,
-      {static_cast<int32>(-2147483648), static_cast<int32>(-2147483648),
-       static_cast<int32>(2147483520), static_cast<int32>(2147483520)});
+      {static_cast<int32_t>(-2147483648), static_cast<int32_t>(-2147483648),
+       static_cast<int32_t>(2147483520), static_cast<int32_t>(2147483520)});
 
   FloatToQuantizedStruct<qint32> f2q(input_min, input_max);
   Tensor output(DT_QINT32, shape);
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
index e34601a86b6b77..64e7ec09c46eed 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
@@ -57,7 +57,7 @@ class QuantizeAndDequantizeV2Op : public OpKernel {
                                 " with signed_input_ ", signed_input_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
 
-    string round_mode_string;
+    std::string round_mode_string;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
     OP_REQUIRES(
         ctx,
@@ -284,7 +284,7 @@ class QuantizeAndDequantizeV3Op : public OpKernel {
                                 "be a scalar. Got dimensions: ",
                                 num_bits_tensor.dims()));
 
-    const int num_bits_val = num_bits_tensor.scalar<int32>()();
+    const int num_bits_val = num_bits_tensor.scalar<int32_t>()();
     OP_REQUIRES(ctx,
                 num_bits_val > 0 && num_bits_val < (signed_input_ ? 62 : 63),
                 InvalidArgument("num_bits is out of range: ", num_bits_val,
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index 0d5b923ecbd0e7..b93292f83d677a 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -86,7 +86,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_scalar_tensor_V3) {
   AddInputFromArray<float>(TensorShape({1}), {-3.5});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});  // num_bits
 
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({1}));
@@ -103,7 +103,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_scalar_tensor_V3) {
 template <typename T>
 std::vector<T> ScalePerSliceAlongAxis(std::vector<int64_t> dims, int axis,
                                       const std::vector<T>& data) {
-  uint32 seed = 123;
+  uint32_t seed = 123;
   int64_t out_size = 1;
   for (int dim : dims) {
     out_size *= dim;
@@ -292,7 +292,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_V3) {
   AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});  // num_bits
 
   // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
   // Scale is: 1/128
@@ -337,7 +337,7 @@ TEST_P(ParameterizedQuantizeAndDequantizeTest,
   std::vector<float> init_value(num_slices, 0.0f);
   AddInputFromArray<float>(range_shape, init_value);  // Min
   AddInputFromArray<float>(range_shape, init_value);  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});     // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});   // num_bits
 
   // With int8, the values in the tensor are quantized to
   // {-127, -63, 0, 38, 102, 70, 64}.
@@ -490,7 +490,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
   AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {4});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {4});  // num_bits
 
   // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
   // Scale is: 1/8
@@ -583,7 +583,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) {
                            {-0.8, -0.5, 0, 0.3, 0.8, 0.555, -2, 33});
   AddInputFromArray<float>(TensorShape({}), {-1.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {1.0});   // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});     // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});   // num_bits
 
   // Note that the range is given as [-1, 1].
   // With int8, the tensor is quantized to {-102, -64, 0, 38, 102, 70, -128,
@@ -664,7 +664,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given_V3) {
   AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {1.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});  // num_bits
 
   // Note that the range is given as [0, 1].
   // With int8, the tensor is quantized to {0, 0, 76, 204}
@@ -712,7 +712,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_tensor_with_all_0_V3) {
   AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {0, 0, 0, 0});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});  // num_bits
 
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 1, 1}));
@@ -755,7 +755,7 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_range_given_V3) {
   AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
   AddInputFromArray<float>(TensorShape({}), {1.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});  // num_bits
 
   absl::Status s = RunOpKernel();
   EXPECT_TRUE(absl::StrContains(s.ToString(),
@@ -778,7 +778,7 @@ TEST_F(QuantizeAndDequantizeTest, Invalid_axis_given_V3) {
   AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
   AddInputFromArray<float>(TensorShape({}), {1.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
-  AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
+  AddInputFromArray<int32_t>(TensorShape({}), {8});  // num_bits
 
   EXPECT_THAT(
       RunOpKernel(),
diff --git a/tensorflow/core/kernels/quantize_down_and_shrink_range.cc b/tensorflow/core/kernels/quantize_down_and_shrink_range.cc
index 02ca323b991f68..9a49f96d4c6024 100644
--- a/tensorflow/core/kernels/quantize_down_and_shrink_range.cc
+++ b/tensorflow/core/kernels/quantize_down_and_shrink_range.cc
@@ -64,9 +64,9 @@ class QuantizeDownAndShrinkRangeOp : public OpKernel {
     // See QuantizationRangeOp as well, which has a copy of this logic.
     auto input_array = input.flat<T1>();
     const int32_t input_lowest_quantized =
-        static_cast<int32>(Eigen::NumTraits<T1>::lowest());
+        static_cast<int32_t>(Eigen::NumTraits<T1>::lowest());
     const int32_t input_highest_quantized =
-        static_cast<int32>(Eigen::NumTraits<T1>::highest());
+        static_cast<int32_t>(Eigen::NumTraits<T1>::highest());
     T1 actual_min_quantized = input_highest_quantized;
     T1 actual_max_quantized = input_lowest_quantized;
     for (int i = 0; i < input_array.size(); ++i) {

From a967808638b7256d777722630be808f801c08291 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 00:56:13 -0800
Subject: [PATCH 390/753] Automated Code Change

PiperOrigin-RevId: 845641559
---
 .../eager/cluster_function_library_runtime.cc |   7 +-
 .../eager/cluster_function_library_runtime.h  |  16 +--
 .../eager/destroy_tensor_handle_node.h        |   6 +-
 .../distributed_runtime/eager/eager_client.h  |   2 +-
 .../eager/eager_service_impl.cc               |  31 +++---
 .../eager/eager_service_impl.h                |  14 +--
 .../eager/eager_service_impl_test.cc          | 102 +++++++++---------
 .../eager/remote_copy_node.cc                 |  22 ++--
 .../eager/remote_copy_node.h                  |  10 +-
 .../eager/remote_execute_node.cc              |   6 +-
 .../eager/remote_execute_node.h               |   8 +-
 .../distributed_runtime/eager/remote_mgr.cc   |   8 +-
 .../distributed_runtime/eager/remote_mgr.h    |  12 +--
 .../eager/remote_mgr_test.cc                  |  12 +--
 .../eager/remote_tensor_handle.h              |   2 +-
 .../eager/remote_tensor_handle_data.cc        |  17 +--
 .../eager/remote_tensor_handle_data.h         |  22 ++--
 17 files changed, 153 insertions(+), 144 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc
index 5688c30275eb2e..f62268f3a40d3f 100644
--- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc
@@ -54,7 +54,7 @@ void StripDefaultAttributesInRegisterFunctionOp(
 }  // namespace
 
 void EagerClusterFunctionLibraryRuntime::Instantiate(
-    const string& function_name, const FunctionLibraryDefinition& lib_def,
+    const std::string& function_name, const FunctionLibraryDefinition& lib_def,
     AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::LocalHandle* handle,
     FunctionLibraryRuntime::DoneCallback done) {
@@ -281,7 +281,7 @@ void EagerClusterFunctionLibraryRuntime::Run(
 }
 
 void EagerClusterFunctionLibraryRuntime::CleanUp(
-    uint64 step_id, FunctionLibraryRuntime::LocalHandle handle,
+    uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle,
     FunctionLibraryRuntime::DoneCallback done) {
   FunctionData* function_data = nullptr;
   {
@@ -312,7 +312,8 @@ void EagerClusterFunctionLibraryRuntime::CleanUp(
 }
 
 DistributedFunctionLibraryRuntime* CreateClusterFLR(
-    const uint64 context_id, EagerContext* ctx, WorkerSession* worker_session) {
+    const uint64_t context_id, EagerContext* ctx,
+    WorkerSession* worker_session) {
   return new EagerClusterFunctionLibraryRuntime(
       context_id, ctx, worker_session->remote_device_mgr());
 }
diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h
index 58af5ed93ae8ac..6fb1fc280f0638 100644
--- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.h
@@ -37,7 +37,8 @@ namespace eager {
 class EagerClusterFunctionLibraryRuntime
     : public DistributedFunctionLibraryRuntime {
  public:
-  EagerClusterFunctionLibraryRuntime(const uint64 context_id, EagerContext* ctx,
+  EagerClusterFunctionLibraryRuntime(const uint64_t context_id,
+                                     EagerContext* ctx,
                                      DeviceMgr* remote_device_mgr)
       : context_id_(context_id),
         ctx_(ctx),
@@ -49,7 +50,7 @@ class EagerClusterFunctionLibraryRuntime
   // on the remote target specified in `options.target`. This should be
   // triggered as part of instantiating a multi-device function in
   // ProcessFunctionLibraryRuntime.
-  void Instantiate(const string& function_name,
+  void Instantiate(const std::string& function_name,
                    const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
                    const FunctionLibraryRuntime::InstantiateOptions& options,
                    FunctionLibraryRuntime::LocalHandle* handle,
@@ -75,23 +76,23 @@ class EagerClusterFunctionLibraryRuntime
            absl::Span<const FunctionArg> args, std::vector<FunctionRet>* rets,
            FunctionLibraryRuntime::DoneCallback done) override;
 
-  void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle,
+  void CleanUp(uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle,
                FunctionLibraryRuntime::DoneCallback done) override;
 
   DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; }
 
  private:
-  const uint64 context_id_;
+  const uint64_t context_id_;
   EagerContext* ctx_;
   DeviceMgr* remote_device_mgr_;  // not owned.
 
   struct FunctionData {
-    const string target;
+    const std::string target;
     const absl::optional<std::vector<int>> ret_indices;
     core::RefCountPtr<EagerClient> eager_client;
     std::unique_ptr<EagerOperation> op;
 
-    FunctionData(const string& target,
+    FunctionData(const std::string& target,
                  const absl::optional<std::vector<int>>& ret_indices,
                  EagerClient* eager_client, std::unique_ptr<EagerOperation> op)
         : target(target),
@@ -107,7 +108,8 @@ class EagerClusterFunctionLibraryRuntime
 };
 
 DistributedFunctionLibraryRuntime* CreateClusterFLR(
-    const uint64 context_id, EagerContext* ctx, WorkerSession* worker_session);
+    const uint64_t context_id, EagerContext* ctx,
+    WorkerSession* worker_session);
 
 }  // namespace eager
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h
index ade7260cc9fb74..a0991dc601be4e 100644
--- a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h
+++ b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h
@@ -71,8 +71,8 @@ class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode {
   // Remote node deletions are best effort
   bool Fatal() const override { return false; }
 
-  string DebugString() const override {
-    string out = "[DestroyTensorHandleNode]";
+  std::string DebugString() const override {
+    std::string out = "[DestroyTensorHandleNode]";
     absl::StrAppend(&out, " request: ", request_->DebugString());
     return out;
   }
@@ -80,7 +80,7 @@ class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode {
  private:
   std::unique_ptr<EnqueueRequest> request_;
   core::RefCountPtr<EagerClient> eager_client_;
-  const string remote_task_;
+  const std::string remote_task_;
   bool ready_;
 };
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h
index 6fc956014ab666..a2a3d596bff10a 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_client.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_client.h
@@ -92,7 +92,7 @@ class EagerClientCache {
   // increment the refcount of the client. The reference ownership is
   // transferred to the caller, and the unref should automatically happen when
   // destructing the RefCountPtr object from the caller's side.
-  virtual absl::Status GetClient(const string& target,
+  virtual absl::Status GetClient(const std::string& target,
                                  core::RefCountPtr<EagerClient>* client) = 0;
 };
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index 2bb45a8ed53d67..abae4bdce1d23a 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -59,8 +59,9 @@ namespace eager {
 
 namespace {
 absl::Status GetNumRetvals(
-    FunctionLibraryDefinition* func_lib_def, const string& op_name,
-    const google::protobuf::Map<string, tensorflow::AttrValue>& attrs, int* num_retvals) {
+    FunctionLibraryDefinition* func_lib_def, const std::string& op_name,
+    const google::protobuf::Map<std::string, tensorflow::AttrValue>& attrs,
+    int* num_retvals) {
   const tensorflow::OpRegistrationData* op_reg_data = nullptr;
   auto status = tensorflow::OpRegistry::Global()->LookUp(op_name, &op_reg_data);
   if (absl::IsNotFound(status)) {
@@ -189,10 +190,10 @@ absl::Status TensorHandleShape(TensorHandle* handle, TensorShapeProto* proto) {
 
 absl::Status AddOpRetvalsToResponse(
     EagerContext* eager_context, int op_id, int num_retvals,
-    const std::vector<int32>& output_nums, TensorHandle** retvals,
+    const std::vector<int32_t>& output_nums, TensorHandle** retvals,
     std::function<TensorProto*()> add_tensor_proto_fn,
     std::function<TensorShapeProto*()> add_shape_proto_fn,
-    std::function<string*()> add_device_fn = nullptr) {
+    std::function<std::string*()> add_device_fn = nullptr) {
   // retvals hold references to the allocated output tensor handles. If errors
   // happen with adding some results to the response, aggregate the status in sg
   // instead of directly returning the error, to make sure unref or ownership
@@ -291,7 +292,7 @@ absl::Status EagerServiceImpl::CreateContext(
     TF_RETURN_IF_ERROR(env_->session_mgr->DeleteAllSessions());
 
     // Cleanup existing contexts if any.
-    std::unordered_map<uint64, ServerContext*> tmp_contexts;
+    std::unordered_map<uint64_t, ServerContext*> tmp_contexts;
     {
       mutex_lock l(contexts_mu_);
       if (!contexts_.empty()) {
@@ -372,7 +373,7 @@ absl::Status EagerServiceImpl::CreateContext(
   // case ctx will be deleted by this unref.
   core::ScopedUnref unref_ctx(ctx);
 
-  std::vector<string> remote_workers;
+  std::vector<std::string> remote_workers;
   worker_session->worker_cache()->ListWorkers(&remote_workers);
   remote_workers.erase(std::remove(remote_workers.begin(), remote_workers.end(),
                                    worker_session->worker_name()),
@@ -500,7 +501,7 @@ absl::Status EagerServiceImpl::UpdateContext(
 
   const tensorflow::DeviceMgr* device_mgr = worker_session->device_mgr();
 
-  std::vector<string> remote_workers;
+  std::vector<std::string> remote_workers;
   worker_session->worker_cache()->ListWorkers(&remote_workers);
   remote_workers.erase(std::remove(remote_workers.begin(), remote_workers.end(),
                                    worker_session->worker_name()),
@@ -508,7 +509,7 @@ absl::Status EagerServiceImpl::UpdateContext(
   VLOG(1) << "On existing server " << worker_session->worker_name()
           << " updating remote workers";
   if (VLOG_IS_ON(2)) {
-    for (const string& rw : remote_workers) {
+    for (const std::string& rw : remote_workers) {
       VLOG(2) << "Remote worker " << rw;
     }
   }
@@ -546,8 +547,8 @@ absl::Status EagerServiceImpl::UpdateContext(
   return absl::OkStatus();
 }
 
-absl::Status EagerServiceImpl::CreateMasterContext(
-    const tensorflow::uint64 context_id, EagerContext* context) {
+absl::Status EagerServiceImpl::CreateMasterContext(const uint64_t context_id,
+                                                   EagerContext* context) {
   {
     mutex_lock l(contexts_mu_);
     auto iter = contexts_.find(context_id);
@@ -616,7 +617,7 @@ void EagerServiceImpl::RunComponentFunction(
   auto* retvals = new absl::FixedArray<TensorHandle*>(*num_retvals);
   VLOG(3) << "ServerContext: Calling EagerLocalExecuteAsync for op "
           << operation.id();
-  std::vector<int32> output_nums;
+  std::vector<int32_t> output_nums;
   for (const int32_t output_num : request->output_num()) {
     output_nums.push_back(output_num);
   }
@@ -676,7 +677,7 @@ absl::Status EagerServiceImpl::ExecuteOp(CallOptions* call_opts,
           num_retvals),
       &num_retvals));
 
-  std::function<string*()> add_device_fn = nullptr;
+  std::function<std::string*()> add_device_fn = nullptr;
   // Send the output devices of a function back to let a client know where the
   // outputs are. For a primitive op, an output devics is the op device which is
   // known on a client.
@@ -694,7 +695,7 @@ absl::Status EagerServiceImpl::ExecuteOp(CallOptions* call_opts,
 absl::Status EagerServiceImpl::Enqueue(CallOptions* call_opts,
                                        const EnqueueRequest* request,
                                        EnqueueResponse* response,
-                                       uint64 stream_id) {
+                                       uint64_t stream_id) {
   tsl::profiler::TraceMe activity(
       [&] {
         return absl::StrCat(
@@ -901,12 +902,12 @@ absl::Status EagerServiceImpl::SendPackedHandle(
 }
 
 absl::Status EagerServiceImpl::GetServerContext(
-    uint64 context_id, ServerContext** server_context) {
+    uint64_t context_id, ServerContext** server_context) {
   tf_shared_lock l(contexts_mu_);
   auto iter = contexts_.find(context_id);
   if (iter == contexts_.end()) {
     *server_context = nullptr;
-    return errors::Aborted(strings::Printf(
+    return errors::Aborted(absl::StrFormat(
         "Unable to find a context_id matching the specified one "
         "(%llu). Perhaps the worker was restarted, or the context was GC'd?",
         static_cast<unsigned long long>(context_id)));
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
index 329f60cf583ef7..90d49cc7a64e19 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -83,15 +83,15 @@ class EagerServiceImpl {
                              UpdateContextResponse* response);
 
   // Create a ServerContext for master eager context.
-  absl::Status CreateMasterContext(const tensorflow::uint64 context_id,
+  absl::Status CreateMasterContext(const uint64_t context_id,
                                    EagerContext* context);
 
-  static constexpr uint64 kInvalidStreamId = 0;
+  static constexpr uint64_t kInvalidStreamId = 0;
 
   // Used by both Enqueue and StreamingEnqueue RPCs.
   absl::Status Enqueue(CallOptions* call_opts, const EnqueueRequest* request,
                        EnqueueResponse* response,
-                       uint64 stream_id = kInvalidStreamId);
+                       uint64_t stream_id = kInvalidStreamId);
 
   absl::Status WaitQueueDone(const WaitQueueDoneRequest* request,
                              WaitQueueDoneResponse* response);
@@ -166,7 +166,7 @@ class EagerServiceImpl {
     const bool is_master_;
   };
   // The returned ServerContext will need to be Unrefed.
-  absl::Status GetServerContext(uint64, ServerContext**);
+  absl::Status GetServerContext(uint64_t, ServerContext**);
 
   class ClientTensorHandleDeleteNode : public EagerNode {
    public:
@@ -194,8 +194,8 @@ class EagerServiceImpl {
     // Remote node deletions are best effort
     bool Fatal() const override { return false; }
 
-    string DebugString() const override {
-      string out = "[ClientTensorHandleDeleteNode]";
+    std::string DebugString() const override {
+      std::string out = "[ClientTensorHandleDeleteNode]";
       absl::StrAppend(&out, " op_id: ", handle_to_delete_->op_id);
       absl::StrAppend(&out, ", output_num: ", handle_to_delete_->output_num);
       return out;
@@ -225,7 +225,7 @@ class EagerServiceImpl {
   WorkerEnv* const env_;  // Not owned.
 
   mutex contexts_mu_;
-  std::unordered_map<uint64, ServerContext*> contexts_
+  std::unordered_map<uint64_t, ServerContext*> contexts_
       TF_GUARDED_BY(contexts_mu_);
 
   std::unique_ptr<Thread> gc_thread_;
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index a4b1f6552b4b33..e9be274d4fea19 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -56,14 +56,14 @@ namespace {
 class TestEagerServiceImpl : public EagerServiceImpl {
  public:
   explicit TestEagerServiceImpl(WorkerEnv* env) : EagerServiceImpl(env) {}
-  absl::Status GetEagerContext(const uint64 context_id, EagerContext** ctx) {
+  absl::Status GetEagerContext(const uint64_t context_id, EagerContext** ctx) {
     ServerContext* context = nullptr;
     TF_RETURN_IF_ERROR(GetServerContext(context_id, &context));
     core::ScopedUnref context_unref(context);
     *ctx = context->Context();
     return absl::OkStatus();
   }
-  absl::Status GetTensorHandle(const uint64 context_id,
+  absl::Status GetTensorHandle(const uint64_t context_id,
                                const RemoteTensorHandleInternal& remote_handle,
                                tensorflow::TensorHandle** handle) {
     ServerContext* context = nullptr;
@@ -136,7 +136,7 @@ class FakeEagerClient : public EagerClient {
 class DummyEagerClientCache : public EagerClientCache {
  public:
   DummyEagerClientCache() : client_(new FakeEagerClient) {}
-  absl::Status GetClient(const string& target,
+  absl::Status GetClient(const std::string& target,
                          core::RefCountPtr<EagerClient>* client) override {
     client->reset(client_.get());
     client_->Ref();
@@ -154,7 +154,7 @@ class FakeCache : public TestWorkerCache {
     return absl::OkStatus();
   }
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<std::string>* workers) const override {
     workers->push_back("/job:localhost/replica:0/task:0");
   }
 };
@@ -202,10 +202,11 @@ void SetTensorProto(TensorProto* tensor_proto) {
 }
 
 void BuildOperation(
-    Operation* operation, int64_t id, const string& name,
-    const std::vector<std::variant<TensorProto, std::pair<int64_t, int32>>>&
+    Operation* operation, int64_t id, const std::string& name,
+    const std::vector<std::variant<TensorProto, std::pair<int64_t, int32_t>>>&
         inputs,
-    const std::unordered_map<string, AttrValue>& attrs, const string& device) {
+    const std::unordered_map<std::string, AttrValue>& attrs,
+    const std::string& device) {
   operation->set_id(id);
   operation->set_name(name);
   operation->set_device(device);
@@ -216,7 +217,7 @@ void BuildOperation(
           std::get<TensorProto>(input);
     } else {
       const auto& tensor_handle_pair =
-          std::get<std::pair<int64_t, int32>>(input);
+          std::get<std::pair<int64_t, int32_t>>(input);
       auto* input = operation->add_op_inputs()->mutable_remote_handle();
       input->set_op_id(tensor_handle_pair.first);
       input->set_output_num(tensor_handle_pair.second);
@@ -231,21 +232,22 @@ void BuildOperation(
 }
 
 void AddOperationToEnqueueRequest(
-    int64_t id, const string& name,
-    const std::vector<std::variant<TensorProto, std::pair<int64_t, int32>>>&
+    int64_t id, const std::string& name,
+    const std::vector<std::variant<TensorProto, std::pair<int64_t, int32_t>>>&
         inputs,
-    const std::unordered_map<string, AttrValue>& attrs, const string& device,
-    EnqueueRequest* request) {
+    const std::unordered_map<std::string, AttrValue>& attrs,
+    const std::string& device, EnqueueRequest* request) {
   auto* operation = request->add_queue()->mutable_operation();
   BuildOperation(operation, id, name, inputs, attrs, device);
 }
 
 void AddOperationToRunComponentFunctionRequest(
-    int64_t id, const string& name,
-    const std::vector<std::variant<TensorProto, std::pair<int64_t, int32>>>&
+    int64_t id, const std::string& name,
+    const std::vector<std::variant<TensorProto, std::pair<int64_t, int32_t>>>&
         inputs,
-    const std::unordered_map<string, AttrValue>& attrs, const string& device,
-    const int output_num, RunComponentFunctionRequest* request) {
+    const std::unordered_map<std::string, AttrValue>& attrs,
+    const std::string& device, const int output_num,
+    RunComponentFunctionRequest* request) {
   auto* operation = request->mutable_operation();
   operation->set_is_function(true);
   operation->set_is_component_function(true);
@@ -450,7 +452,7 @@ tensorflow::FunctionDef SingleRecvNodeFunction() {
 TEST_F(EagerServiceImplTest, BasicTest) {
   TestEagerServiceImpl eager_service_impl(&worker_env_);
 
-  uint64 context_id = random::New64();
+  uint64_t context_id = random::New64();
 
   CreateContextRequest request;
   request.mutable_server_def()->set_job_name("localhost");
@@ -464,7 +466,7 @@ TEST_F(EagerServiceImplTest, BasicTest) {
   remote_enqueue_request.set_context_id(context_id);
   EnqueueResponse remote_enqueue_response;
 
-  std::unordered_map<string, AttrValue> const_attrs;
+  std::unordered_map<std::string, AttrValue> const_attrs;
   AttrValue val;
   val.set_type(tensorflow::DataType::DT_FLOAT);
   const_attrs.insert({"dtype", val});
@@ -476,7 +478,7 @@ TEST_F(EagerServiceImplTest, BasicTest) {
                                "/job:localhost/replica:0/task:0/device:CPU:0",
                                &remote_enqueue_request);
 
-  std::unordered_map<string, AttrValue> attrs;
+  std::unordered_map<std::string, AttrValue> attrs;
   val.Clear();
   val.set_type(tensorflow::DataType::DT_FLOAT);
   attrs.insert({"T", val});
@@ -529,12 +531,12 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest {
 
   // Creates a context and attempts to execute a function.
   void TestFunction(const RegisterFunctionOp& register_op,
-                    const string& function_name,
+                    const std::string& function_name,
                     const bool local_inputs = false,
                     const bool test_cancel = false) {
     TestEagerServiceImpl eager_service_impl(&worker_env_);
 
-    uint64 context_id = random::New64();
+    uint64_t context_id = random::New64();
 
     CreateContextRequest request;
     request.mutable_server_def()->set_job_name("localhost");
@@ -561,12 +563,12 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest {
       SetTensorProto(&tensor_proto);
       AddOperationToEnqueueRequest(
           2, function_name, {tensor_proto},
-          std::unordered_map<string, AttrValue>(),
+          std::unordered_map<std::string, AttrValue>(),
           "/job:localhost/replica:0/task:0/device:CPU:0",
           &remote_enqueue_request);
 
     } else {
-      std::unordered_map<string, AttrValue> const_attrs;
+      std::unordered_map<std::string, AttrValue> const_attrs;
       AttrValue val;
       val.set_type(tensorflow::DataType::DT_FLOAT);
       const_attrs.insert({"dtype", val});
@@ -581,7 +583,7 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest {
           &remote_enqueue_request);
       AddOperationToEnqueueRequest(
           2, function_name, {std::make_pair(1, 0)},
-          std::unordered_map<string, AttrValue>(),
+          std::unordered_map<std::string, AttrValue>(),
           "/job:localhost/replica:0/task:0/device:CPU:0",
           &remote_enqueue_request);
     }
@@ -629,10 +631,10 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest {
 
   // Creates a context and attempts to execute a component function.
   void TestComponentFunction(const RegisterFunctionOp& register_op,
-                             const string& function_name,
+                             const std::string& function_name,
                              const bool test_cancel) {
     TestEagerServiceImpl eager_service_impl(&worker_env_);
-    uint64 context_id = random::New64();
+    uint64_t context_id = random::New64();
 
     // Create context.
     CreateContextRequest request;
@@ -655,7 +657,7 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest {
     remote_enqueue_request.set_context_id(context_id);
     EnqueueResponse remote_enqueue_response;
 
-    std::unordered_map<string, AttrValue> const_attrs;
+    std::unordered_map<std::string, AttrValue> const_attrs;
     AttrValue val;
     val.set_type(tensorflow::DataType::DT_FLOAT);
     const_attrs.insert({"dtype", val});
@@ -675,7 +677,7 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest {
     const int output_num = 5;
     AddOperationToRunComponentFunctionRequest(
         2, function_name, {std::make_pair(1, 0)},
-        std::unordered_map<string, AttrValue>(),
+        std::unordered_map<std::string, AttrValue>(),
         "/job:localhost/replica:0/task:0/device:CPU:0", output_num,
         &run_comp_func_request);
 
@@ -772,7 +774,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionTest) {
 
 TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) {
   TestEagerServiceImpl eager_service_impl(&worker_env_);
-  uint64 context_id = random::New64();
+  uint64_t context_id = random::New64();
 
   // Create context.
   CreateContextRequest request;
@@ -820,7 +822,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) {
   remote_enqueue_request.set_context_id(context_id);
   EnqueueResponse remote_enqueue_response;
 
-  std::unordered_map<string, AttrValue> const_attrs;
+  std::unordered_map<std::string, AttrValue> const_attrs;
   AttrValue val;
   val.set_type(tensorflow::DataType::DT_FLOAT);
   const_attrs.insert({"dtype", val});
@@ -841,7 +843,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) {
     const int output_num = 5;
     AddOperationToRunComponentFunctionRequest(
         2, "MatMulNestedFunction", {std::make_pair(1, 0)},
-        std::unordered_map<string, AttrValue>(),
+        std::unordered_map<std::string, AttrValue>(),
         "/job:localhost/replica:0/task:0/device:CPU:0", output_num,
         &run_comp_func_request);
 
@@ -883,7 +885,7 @@ TEST_F(EagerServiceImplFunctionTest, ComponentNestedFunctionWithNameClashTest) {
     const int output_num = 5;
     AddOperationToRunComponentFunctionRequest(
         3, "MatMulNestedTransposeFunction", {std::make_pair(1, 0)},
-        std::unordered_map<string, AttrValue>(),
+        std::unordered_map<std::string, AttrValue>(),
         "/job:localhost/replica:0/task:0/device:CPU:0", output_num,
         &run_comp_func_request);
 
@@ -984,7 +986,7 @@ class FunctionWithRemoteInputsTest : public EagerServiceImplTest {
     EnqueueRequest remote_enqueue_request;
     remote_enqueue_request.set_context_id(context_id_);
     EnqueueResponse remote_enqueue_response;
-    std::unordered_map<string, AttrValue> const_attrs;
+    std::unordered_map<std::string, AttrValue> const_attrs;
     AttrValue val;
     val.set_type(tensorflow::DataType::DT_FLOAT);
     const_attrs.insert({"dtype", val});
@@ -1045,11 +1047,13 @@ class FunctionWithRemoteInputsTest : public EagerServiceImplTest {
   }
 
  protected:
-  const string local_device_ = "/job:localhost/replica:0/task:0/device:CPU:0";
-  const string remote_device_ = "/job:localhost/replica:0/task:1/device:CPU:0";
+  const std::string local_device_ =
+      "/job:localhost/replica:0/task:0/device:CPU:0";
+  const std::string remote_device_ =
+      "/job:localhost/replica:0/task:1/device:CPU:0";
   TestEagerServiceImpl eager_service_impl_;
   std::unique_ptr<DeviceMgr> remote_device_mgr_;
-  uint64 context_id_;
+  uint64_t context_id_;
   tensorflow::FunctionDef fdef_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> eager_pflr_;
   std::unique_ptr<EagerClusterFunctionLibraryRuntime> eager_cluster_flr_;
@@ -1072,7 +1076,7 @@ TEST_F(FunctionWithRemoteInputsTest, EagerPFLRTest) {
       fdef_.signature().name(), AttrSlice(&fdef_.attr()), options, &handle));
   EagerContext* ctx = nullptr;
   TF_ASSERT_OK(eager_service_impl_.GetEagerContext(context_id_, &ctx));
-  for (const string& func_name : ctx->FuncLibDef()->ListFunctionNames()) {
+  for (const std::string& func_name : ctx->FuncLibDef()->ListFunctionNames()) {
     const FunctionDef* fdef = ctx->FuncLibDef()->Find(func_name);
     EXPECT_TRUE(fdef != nullptr);
     if (absl::StartsWith(func_name, "MatMulFunction")) {
@@ -1085,7 +1089,7 @@ TEST_F(FunctionWithRemoteInputsTest, EagerPFLRTest) {
 
   // Run MatMulFunction on remote_device.
   FunctionLibraryRuntime::Options opts;
-  const uint64 op_id = 2;
+  const uint64_t op_id = 2;
   opts.op_id = op_id;
   absl::Notification done;
   absl::Status status;
@@ -1133,7 +1137,7 @@ TEST_F(FunctionWithRemoteInputsTest,
   TF_ASSERT_OK(status);
   EagerContext* ctx = nullptr;
   TF_ASSERT_OK(eager_service_impl_.GetEagerContext(context_id_, &ctx));
-  for (const string& func_name : ctx->FuncLibDef()->ListFunctionNames()) {
+  for (const std::string& func_name : ctx->FuncLibDef()->ListFunctionNames()) {
     const FunctionDef* fdef = ctx->FuncLibDef()->Find(func_name);
     EXPECT_TRUE(fdef != nullptr);
     if (absl::StartsWith(func_name, "MatMulFunction")) {
@@ -1288,7 +1292,7 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncAsyncTest) {
 TEST_F(EagerServiceImplTest, SendTensorTest) {
   TestEagerServiceImpl eager_service_impl(&worker_env_);
 
-  uint64 context_id = random::New64();
+  uint64_t context_id = random::New64();
 
   CreateContextRequest request;
   request.mutable_server_def()->set_job_name("localhost");
@@ -1306,7 +1310,7 @@ TEST_F(EagerServiceImplTest, SendTensorTest) {
   send_tensor->set_op_id(1);
   SetTensorProto(send_tensor->add_tensors());
 
-  std::unordered_map<string, AttrValue> attrs;
+  std::unordered_map<std::string, AttrValue> attrs;
   AttrValue val;
   val.Clear();
   val.set_type(tensorflow::DataType::DT_FLOAT);
@@ -1351,13 +1355,13 @@ TEST_F(EagerServiceImplTest, SendTensorTest) {
 TEST_F(EagerServiceImplTest, SendPackedHandleTest) {
   TestEagerServiceImpl eager_service_impl(&worker_env_);
 
-  const string device0 = "/job:localhost/replica:0/task:0/device:CPU:0";
-  const string device1 = "/job:localhost/replica:0/task:1/device:CPU:0";
-  const string device2 = "/job:localhost/replica:0/task:2/device:CPU:0";
-  const string composite_device =
+  const std::string device0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  const std::string device1 = "/job:localhost/replica:0/task:1/device:CPU:0";
+  const std::string device2 = "/job:localhost/replica:0/task:2/device:CPU:0";
+  const std::string composite_device =
       "/job:localhost/replica:0/task:0/device:COMPOSITE:0";
 
-  uint64 context_id = random::New64();
+  uint64_t context_id = random::New64();
   CreateContextRequest request;
   auto* server_def = request.mutable_server_def();
   server_def->set_job_name("localhost");
@@ -1465,7 +1469,7 @@ TEST_F(EagerServiceImplTest, RequestsToMasterTest) {
       /*async=*/false, device_mgr_.get(), false, std::move(rendezvous), nullptr,
       nullptr,
       /*run_eager_op_as_function=*/true);
-  const uint64 context_id = random::New64();
+  const uint64_t context_id = random::New64();
 
   // Set RemoteMgr to ctx.
   auto remote_mgr =
@@ -1506,7 +1510,7 @@ TEST_F(EagerServiceImplTest, RequestsToMasterTest) {
 TEST_F(EagerServiceImplTest, KeepAliveTest) {
   TestEagerServiceImpl eager_service_impl(&worker_env_);
 
-  uint64 context_id = random::New64();
+  uint64_t context_id = random::New64();
   CreateContextRequest request;
   request.mutable_server_def()->set_job_name("localhost");
   request.mutable_server_def()->set_task_index(0);
@@ -1531,7 +1535,7 @@ TEST_F(EagerServiceImplTest, KeepAliveTest) {
   EXPECT_PRED_FORMAT2(::testing::IsSubstring, "Unable to find a context_id",
                       std::string(status.message()));
 
-  uint64 new_context_id = random::New64();
+  uint64_t new_context_id = random::New64();
   // Create a new context.
   request.set_context_id(new_context_id);
   TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc
index 8066664cd0e456..e532bdff5e657a 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc
+++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc
@@ -66,8 +66,8 @@ absl::Status CreateUncachedKernelAndDeviceOp(
 
 // This gets a unique wire ID. We add a random identifier so that if the
 // worker has other clients that it is servicing, we don't have any collision.
-string GetUniqueWireID() {
-  static tensorflow::uint64 random_seed = random::New64();
+std::string GetUniqueWireID() {
+  static uint64_t random_seed = random::New64();
   static tensorflow::mutex wireid_mutex(tensorflow::LINKER_INITIALIZED);
   static std::atomic<int64_t> wire_id;
   return absl::StrCat(random_seed, "_", wire_id++);
@@ -77,7 +77,7 @@ string GetUniqueWireID() {
 
 RemoteCopyNode::RemoteCopyNode(EagerContext* ctx, EagerExecutor* executor,
                                TensorHandle* src, TensorHandle* dst,
-                               Device* recv_device, uint64 recv_op_id)
+                               Device* recv_device, uint64_t recv_op_id)
     : AsyncEagerNode(),
       src_(src),
       ctx_(ctx),
@@ -220,12 +220,12 @@ absl::Status RemoteCopyNode::RunLocalRecv(EagerOperation* op,
 
 void RemoteCopyNode::RunRemoteRecv(EagerOperation* op, StatusCallback done) {
   EnqueueRequest request;
-  uint64 context_id = ctx_->GetContextId();
+  uint64_t context_id = ctx_->GetContextId();
   request.set_context_id(context_id);
   auto* remote_op = request.add_queue()->mutable_operation();
   PrepareRemoteOp(remote_op, op);
   remote_op->set_id(recv_op_id_);
-  uint64 context_view_id = ctx_->GetContextViewId();
+  uint64_t context_view_id = ctx_->GetContextViewId();
 
   core::RefCountPtr<eager::EagerClient> eager_client;
   absl::Status status = ctx_->GetClient(recv_device_, &eager_client);
@@ -316,7 +316,7 @@ void RemoteCopyNode::StartRecv(StatusCallback done) {
   }
 }
 
-absl::Status SerializePackedHandle(const uint64 op_id,
+absl::Status SerializePackedHandle(const uint64_t op_id,
                                    TensorHandle* packed_handle,
                                    const Device* target_device,
                                    EagerContext* ctx, SendPackedHandleOp* op) {
@@ -362,7 +362,7 @@ absl::Status SerializePackedHandle(const uint64 op_id,
 
 void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) {
   absl::Status s;
-  const uint64 context_view_id = ctx_->GetContextViewId();
+  const uint64_t context_view_id = ctx_->GetContextViewId();
   if (!send_device_->IsLocal()) {
     s = errors::InvalidArgument(
         "Copy a packed handle from a remote device is not supported");
@@ -372,7 +372,7 @@ void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) {
   }
 
   EnqueueRequest request;
-  uint64 context_id = ctx_->GetContextId();
+  uint64_t context_id = ctx_->GetContextId();
   request.set_context_id(context_id);
   s = SerializePackedHandle(recv_op_id_, src_, recv_device_, ctx_,
                             request.add_queue()->mutable_send_packed_handle());
@@ -426,12 +426,12 @@ void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) {
 void RemoteCopyNode::StartRemoteSendTensor(StatusCallback done) {
   absl::Status s;
   EnqueueRequest request;
-  uint64 context_id = ctx_->GetContextId();
+  uint64_t context_id = ctx_->GetContextId();
   request.set_context_id(context_id);
   auto* send_tensor = request.add_queue()->mutable_send_tensor();
   send_tensor->set_op_id(recv_op_id_);
   send_tensor->set_device_name(recv_device_->name());
-  uint64 context_view_id = ctx_->GetContextViewId();
+  uint64_t context_view_id = ctx_->GetContextViewId();
 
   // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence
   // copy it to the CPU before copying it out.
@@ -515,7 +515,7 @@ void RemoteCopyNode::RunAsync(StatusCallback done) {
 
 void RemoteCopyNode::Abort(absl::Status status) {
   if (!started_) {
-    uint64 context_view_id = ctx_->GetContextViewId();
+    uint64_t context_view_id = ctx_->GetContextViewId();
     captured_state_->dst()->PoisonRemote(status, recv_device_, context_view_id);
   }
 }
diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h
index 572b650651b0c3..a8dc387d9a7dbf 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h
@@ -63,7 +63,7 @@ namespace eager {
 class RemoteCopyNode : public AsyncEagerNode {
  public:
   RemoteCopyNode(EagerContext* ctx, EagerExecutor* executor, TensorHandle* src,
-                 TensorHandle* dst, Device* recv_device, uint64 recv_op_id);
+                 TensorHandle* dst, Device* recv_device, uint64_t recv_op_id);
 
   ~RemoteCopyNode() override;
 
@@ -73,8 +73,8 @@ class RemoteCopyNode : public AsyncEagerNode {
 
   void Abort(absl::Status status) override;
 
-  string DebugString() const override {
-    string out = "[RemoteCopyNode]";
+  std::string DebugString() const override {
+    std::string out = "[RemoteCopyNode]";
     absl::StrAppend(&out, " send_device: ", send_device_->name());
     absl::StrAppend(&out, ", recv_device: ", recv_device_->name());
     absl::StrAppend(&out, ", send_tensor: ", src_->DebugString());
@@ -167,8 +167,8 @@ class RemoteCopyNode : public AsyncEagerNode {
   EagerExecutor* const executor_;
   Device* const send_device_;
   Device* const recv_device_;
-  const string wire_id_;
-  const uint64 recv_op_id_;
+  const std::string wire_id_;
+  const uint64_t recv_op_id_;
 
   std::shared_ptr<CapturedSharedState> captured_state_;
   bool started_;
diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc b/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc
index f118ecaeb2bbad..3c526f2904d34c 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc
+++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.cc
@@ -32,9 +32,9 @@ void RemoteExecuteNode::RunAsync(StatusCallback done) {
   Device* device = device_;
 
   // Filled and used only when VLOG(3) is on.
-  string rpc_description;
+  std::string rpc_description;
   if (VLOG_IS_ON(3)) {
-    std::vector<string> ops;
+    std::vector<std::string> ops;
     ops.reserve(request_->queue_size());
     for (const QueueItem& item : request_->queue()) {
       if (item.has_operation()) {
@@ -96,7 +96,7 @@ void RemoteExecuteNode::RunAsync(StatusCallback done) {
         }
         for (size_t i = 0; i < retvals.size(); ++i) {
           if (status.ok()) {
-            const string output_device =
+            const std::string output_device =
                 response->queue_response(0).device().empty()
                     ? ""
                     : response->queue_response(0).device(i);
diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
index e29d8d1c187f31..8cc9501efb06d4 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
@@ -40,7 +40,7 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode {
  public:
   RemoteExecuteNode(EagerContext* eager_context,
                     std::unique_ptr<EnqueueRequest> request, Device* device,
-                    uint64 context_view_id, EagerClient* eager_client,
+                    uint64_t context_view_id, EagerClient* eager_client,
                     CancellationManager* cancellation_manager,
                     const NodeDef& ndef,
                     const FunctionLibraryDefinition* lib_def,
@@ -118,8 +118,8 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode {
     return eager_client_->allow_multiple_pending_requests();
   }
 
-  string DebugString() const override {
-    string out = "[RemoteExecuteNode]";
+  std::string DebugString() const override {
+    std::string out = "[RemoteExecuteNode]";
     absl::StrAppend(&out, " request: ", request_->DebugString());
     absl::StrAppend(&out, ", target_device: ", device_->name());
     return out;
@@ -129,7 +129,7 @@ class RemoteExecuteNode : public AsyncRemoteExecuteNode {
   EagerContext* eager_context_;  // Not owned, and must outlive this node.
   std::unique_ptr<EnqueueRequest> request_;
   Device* device_;             // Not owned
-  uint64 context_view_id_;
+  uint64_t context_view_id_;
   bool needs_remote_inputs_;
   EagerClient* eager_client_;  // Not owned, and must outlive this node.
   CancellationManager* cancellation_manager_;
diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc
index acd34fd9ccbc86..5cec8424c2e14d 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc
+++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc
@@ -127,7 +127,7 @@ absl::Status RemoteMgr::GetMirroredResourceShape(
 
 absl::Status RemoteMgr::GetRemoteTensorHandle(
     const tensorflow::TensorHandle* handle, const bool wait_until_ready,
-    int64_t* op_id, int32* output_num) {
+    int64_t* op_id, int32_t* output_num) {
   TF_RETURN_IF_ERROR(handle->RemoteAddress(handle->device(), wait_until_ready,
                                            op_id, output_num));
   tensorflow::TensorHandle* h;
@@ -213,7 +213,7 @@ absl::Status RemoteMgr::DeserializeRemoteTensorHandle(
   } else {
     // Create a remote TensorHandle for remote tensors which have not been
     // copied to the local worker yet (e.g. remote function inputs).
-    const string& device_name =
+    const std::string& device_name =
         in.op_device().empty() ? in.device() : in.op_device();
     TF_RETURN_IF_ERROR(
         parent_->FindDeviceFromName(device_name.c_str(), &device));
@@ -241,7 +241,7 @@ absl::Status RemoteMgr::DeserializeRemoteTensorHandle(
   return absl::OkStatus();
 }
 
-EagerExecutor& RemoteMgr::GetOrCreateExecutorForStream(uint64 stream_id) {
+EagerExecutor& RemoteMgr::GetOrCreateExecutorForStream(uint64_t stream_id) {
   mutex_lock l(executor_map_mu_);
   auto it = executor_map_.find(stream_id);
   if (it == executor_map_.end()) {
@@ -254,7 +254,7 @@ EagerExecutor& RemoteMgr::GetOrCreateExecutorForStream(uint64 stream_id) {
   return it->second;
 }
 
-void RemoteMgr::DeleteExecutorForStream(uint64 stream_id) {
+void RemoteMgr::DeleteExecutorForStream(uint64_t stream_id) {
   mutex_lock l(executor_map_mu_);
   auto it = executor_map_.find(stream_id);
   if (it == executor_map_.end()) {
diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.h b/tensorflow/core/distributed_runtime/eager/remote_mgr.h
index b62134cd6e5860..975cfa13e45ef7 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_mgr.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.h
@@ -58,7 +58,7 @@ class RemoteMgr {
 
   // Helper function to create monotonically increasing ids unique to this
   // context.
-  uint64 NextOpId() {
+  uint64_t NextOpId() {
     DCHECK(is_master_);
     mutex_lock l(next_id_mutex_);
     return next_op_id_++;
@@ -77,20 +77,20 @@ class RemoteMgr {
   absl::Status DeserializeRemoteTensorHandle(const RemoteTensorHandle& in,
                                              TensorHandle** out);
 
-  EagerExecutor& GetOrCreateExecutorForStream(uint64 stream_id);
+  EagerExecutor& GetOrCreateExecutorForStream(uint64_t stream_id);
 
-  void DeleteExecutorForStream(uint64 stream_id);
+  void DeleteExecutorForStream(uint64_t stream_id);
 
  protected:
   mutex next_id_mutex_;
-  uint64 next_op_id_ TF_GUARDED_BY(next_id_mutex_) = 1;
+  uint64_t next_op_id_ TF_GUARDED_BY(next_id_mutex_) = 1;
 
  private:
   // Returns the op_id and output_num if the given local TensorHandle exists in
   // remote_tensor_handle_map_.
   absl::Status GetRemoteTensorHandle(const tensorflow::TensorHandle* handle,
                                      const bool wait_until_ready,
-                                     int64_t* op_id, int32* output_num)
+                                     int64_t* op_id, int32_t* output_num)
       TF_SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_);
 
   absl::Status GetTensorHandleImpl(
@@ -129,7 +129,7 @@ class RemoteMgr {
   EagerContext* parent_;  // not owned.
 
   mutex executor_map_mu_;
-  std::unordered_map<uint64, EagerExecutor> executor_map_
+  std::unordered_map<uint64_t, EagerExecutor> executor_map_
       TF_GUARDED_BY(executor_map_mu_);
 };
 
diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc
index ae05ce640cf0dc..89901367b49b2d 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc
@@ -35,7 +35,7 @@ class TestRemoteMgr : public RemoteMgr {
   TestRemoteMgr(bool is_master, EagerContext* ctx)
       : RemoteMgr(is_master, ctx) {}
 
-  uint64 OpId() {
+  uint64_t OpId() {
     tf_shared_lock l(next_id_mutex_);
     return next_op_id_;
   }
@@ -75,7 +75,7 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) {
 
   TensorHandle* handle = TensorHandle::CreateLocalHandle(
       std::move(t), local_device_, local_device_, ctx_);
-  const uint64 op_id = 2;
+  const uint64_t op_id = 2;
   const int output_num = 3;
   TF_ASSERT_OK(handle->AddUnshapedRemoteMirror(remote_device_, op_id,
                                                output_num, "", ctx_));
@@ -94,7 +94,7 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) {
 TEST_F(RemoteMgrTest, SerializeRemoteTensorHandle) {
   RemoteMgr remote_mgr(false, ctx_);
 
-  const uint64 op_id = 3;
+  const uint64_t op_id = 3;
   const int output_num = 1;
   TensorHandle* handle = TensorHandle::CreateLazyRemoteHandle(
       op_id, output_num, DT_FLOAT, remote_device_, /*is_ready=*/true, ctx_);
@@ -113,7 +113,7 @@ TEST_F(RemoteMgrTest, InvalidateRemoteMirrorWithClusterUpdate) {
 
   TensorHandle* handle = TensorHandle::CreateLocalHandle(
       std::move(t), local_device_, local_device_, ctx_);
-  const uint64 op_id = 2;
+  const uint64_t op_id = 2;
   const int output_num = 3;
   TF_ASSERT_OK(handle->AddUnshapedRemoteMirror(remote_device_, op_id,
                                                output_num, "", ctx_));
@@ -134,7 +134,7 @@ TEST_F(RemoteMgrTest, InvalidateRemoteMirrorWithClusterUpdate) {
 TEST_F(RemoteMgrTest, SetRemoteShapeWithClusterUpdate) {
   RemoteMgr remote_mgr(false, ctx_);
 
-  const uint64 op_id = 3;
+  const uint64_t op_id = 3;
   const int output_num = 1;
   TensorHandle* handle = TensorHandle::CreateUnshapedRemoteHandle(
       op_id, output_num,
@@ -157,7 +157,7 @@ TEST_F(RemoteMgrTest, SetRemoteShapeWithClusterUpdate) {
 TEST_F(RemoteMgrTest, ErrorSourcesShouldExist) {
   RemoteMgr remote_mgr(false, ctx_);
 
-  const uint64 op_id = 3;
+  const uint64_t op_id = 3;
   const int output_num = 1;
   TensorHandle* handle = TensorHandle::CreateLazyRemoteHandle(
       op_id, output_num, DT_FLOAT, remote_device_, /*is_ready=*/true, ctx_);
diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h
index 903d019172a457..51f8d97e6ce6f8 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle.h
@@ -28,7 +28,7 @@ struct RemoteTensorHandleInternal {
   RemoteTensorHandleInternal(int64_t op_id, int32_t output_num)
       : op_id(op_id), output_num(output_num) {}
   int64_t op_id;
-  int32 output_num;
+  int32_t output_num;
 };
 
 struct RemoteTensorHandleInternalHash {
diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc
index 73427ed1372ed8..32ec58774d99cb 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc
+++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc
@@ -29,9 +29,10 @@ namespace tensorflow {
 
 namespace {
 
-void DestroyRemoteTensorHandle(EagerContext* ctx, const string& remote_task,
-                               uint64 context_id, uint64 op_id, int output_num,
-                               bool ready) {
+void DestroyRemoteTensorHandle(EagerContext* ctx,
+                               const std::string& remote_task,
+                               uint64_t context_id, uint64_t op_id,
+                               int output_num, bool ready) {
   if (ctx->GetContextId() != context_id) {
     // This means that this tensor was pointing to a remote device, which
     // has been changed out from under us. Simply return since there is
@@ -89,7 +90,7 @@ void DestroyRemoteTensorHandle(EagerContext* ctx, const string& remote_task,
 }  // namespace
 
 RemoteTensorHandleData::RemoteTensorHandleData(int64_t op_id, int output_num,
-                                               uint64 context_view_id,
+                                               uint64_t context_view_id,
                                                bool is_ready)
     : is_ready_(is_ready),
       op_id_(op_id),
@@ -102,7 +103,7 @@ RemoteTensorHandleData::RemoteTensorHandleData(int64_t op_id, int output_num,
 }
 
 RemoteTensorHandleData::RemoteTensorHandleData(int64_t op_id, int output_num,
-                                               const string& remote_task,
+                                               const std::string& remote_task,
                                                EagerContext* ctx)
     : is_ready_(false),
       op_id_(op_id),
@@ -182,7 +183,7 @@ absl::Status RemoteTensorHandleData::SetShape(const TensorShape& shape) {
 }
 
 absl::Status RemoteTensorHandleData::SetShapeAndRemoteTask(
-    const TensorShape& shape, const string& remote_task) {
+    const TensorShape& shape, const std::string& remote_task) {
   // If `is_ready_` is set previously due to poisoning, return the original
   // error that poisoned this tensor.
   TF_RETURN_IF_ERROR(IsPoisoned());
@@ -216,13 +217,13 @@ absl::Status RemoteTensorHandleData::SetShapeAndRemoteTask(
   return absl::OkStatus();
 }
 
-string RemoteTensorHandleData::DebugString() const {
+std::string RemoteTensorHandleData::DebugString() const {
   return absl::StrCat("RemoteTensorHandleData:", " op_id: ", op_id_,
                       " output_num: ", output_num_);
 }
 
 absl::Status RemoteTensorHandleData::OpIdAndOutputNum(
-    const bool wait_until_ready, int64_t* op_id, int32* output_num) const {
+    const bool wait_until_ready, int64_t* op_id, int32_t* output_num) const {
   if (wait_until_ready) {
     TF_RETURN_IF_ERROR(WaitReady("OpIdAndOutputNumUntilReady"));
   }
diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h
index 892d82bd5f7efe..1c7099cc66b1a4 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h
@@ -31,12 +31,12 @@ class RemoteTensorHandleData {
   // the corresponding remote tensor is ready. So the remote tensor should be
   // ready when we create a lazy remote handle. If it refers to a remote output,
   // it's not ready until the shape is set.
-  RemoteTensorHandleData(int64_t op_id, int output_num, uint64 context_view_id,
-                         bool is_ready);
+  RemoteTensorHandleData(int64_t op_id, int output_num,
+                         uint64_t context_view_id, bool is_ready);
   // Constructor for unshaped remote handles. It controls the lifetime of a
   // remote handle that it refers to.
   RemoteTensorHandleData(int64_t op_id, int output_num,
-                         const string& remote_task, EagerContext* ctx);
+                         const std::string& remote_task, EagerContext* ctx);
   ~RemoteTensorHandleData();
 
   // A remote tensor handle does not have a Tensor object, hence it can only
@@ -51,18 +51,18 @@ class RemoteTensorHandleData {
   absl::Status WaitReady(const char* caller) const;
   absl::Status SetShape(const TensorShape& shape);
   absl::Status SetShapeAndRemoteTask(const TensorShape& shape,
-                                     const string& remote_task);
+                                     const std::string& remote_task);
   void Poison(absl::Status status);
   absl::Status IsPoisoned() const;
 
-  string DebugString() const;
+  std::string DebugString() const;
 
   // Return the op id and output num. If wait_until_ready is true, block until
   // the remote tensor is ready on a remote worker.
   absl::Status OpIdAndOutputNum(bool wait_until_ready, int64_t* op_id,
-                                int32* output_num) const;
+                                int32_t* output_num) const;
 
-  uint64 context_view_id() const { return context_view_id_; }
+  uint64_t context_view_id() const { return context_view_id_; }
 
  private:
   mutable mutex mu_;
@@ -72,10 +72,10 @@ class RemoteTensorHandleData {
 
   // IDs required when this class is representing a remote tensor handle.
   const int64_t op_id_;
-  const int32 output_num_;
-  string remote_task_ TF_GUARDED_BY(mu_);
-  uint64 context_id_;
-  uint64 context_view_id_;
+  const int32_t output_num_;
+  std::string remote_task_ TF_GUARDED_BY(mu_);
+  uint64_t context_id_;
+  uint64_t context_view_id_;
   EagerContext* ctx_;
 };
 

From e12987ed2dbf446de3f6dbabf67e90ecf609f02f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:04:23 -0800
Subject: [PATCH 391/753] Update GraphDef version to 2444.

PiperOrigin-RevId: 845643947
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index a94b3e73fba6ba..607ccceb50bda9 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2443  // Updated: 2025/12/16
+#define TF_GRAPH_DEF_VERSION 2444  // Updated: 2025/12/17
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From e7d96bda2e2d1087e10e450694cdb04aad5a80da Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:04:34 -0800
Subject: [PATCH 392/753] compat: Update forward compatibility horizon to
 2025-12-17

PiperOrigin-RevId: 845643992
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 397612746c514a..67a1d81ddb3b58 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 16)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 17)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 22dda6f4ea1d075c655fc8d133a6c33f23dfa574 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:07:06 -0800
Subject: [PATCH 393/753] Automated Code Change

PiperOrigin-RevId: 845644872
---
 .../xla/xla/backends/gpu/profiler/kernel_name_tracer_test.cc    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/profiler/kernel_name_tracer_test.cc b/third_party/xla/xla/backends/gpu/profiler/kernel_name_tracer_test.cc
index 1a94b5e1be242b..a40f44c0008fc7 100644
--- a/third_party/xla/xla/backends/gpu/profiler/kernel_name_tracer_test.cc
+++ b/third_party/xla/xla/backends/gpu/profiler/kernel_name_tracer_test.cc
@@ -172,7 +172,7 @@ void LaunchCommandBufferThunk(stream_executor::StreamExecutor* executor,
   CommandBufferThunk thunk(std::move(cmd_buffer_executor), Thunk::ThunkInfo());
 
   ServiceExecutableRunOptions run_options;
-  stream_executor::StreamExecutorMemoryAllocator allocator(executor);
+  stream_executor::StreamExecutorAddressAllocator allocator(executor);
   BufferAllocations allocations({a, b, c}, 0, &allocator);
 
   Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(

From 348ca28805eec9ba06d6d8dc3e7be63a0aab9f8a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:12:25 -0800
Subject: [PATCH 394/753] Automated Code Change

PiperOrigin-RevId: 845646502
---
 .../xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc  | 7 ++++---
 .../gpu/kernels/cutlass_gemm_custom_kernel_benchmarks.cc   | 2 +-
 .../service/gpu/kernels/cutlass_gemm_custom_kernel_test.cc | 2 +-
 .../xla/xla/service/gpu/kernels/ptx_custom_kernel_test.cc  | 6 +++---
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc
index e56efa0be3a17f..41b0698075c502 100644
--- a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc
+++ b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel.cc
@@ -85,8 +85,8 @@ static std::optional<Dim> As(std::optional<Dim3> dim3) {
 }
 
 // Returns a pointer to device memory holding a slice offset.
-static int32_t* SlicePtr(const se::KernelArgsDeviceMemoryArray* args,
-                         int64_t index) {
+static int32_t* SlicePtr(
+    const stream_executor::KernelArgsDeviceAddressArray* args, int64_t index) {
   const void* opaque = args->device_memory_ptr(index);
   return static_cast<int32_t*>(const_cast<void*>(opaque));
 }
@@ -111,7 +111,8 @@ KernelArgsPacking ArgsPacking(GemmMode mode, int32_t batch_count, int32_t m,
   };
 
   return [=](const se::Kernel& kernel, const se::KernelArgs& args) -> Packed {
-    auto* mem_args = se::Cast<se::KernelArgsDeviceMemoryArray>(&args);
+    auto* mem_args =
+        se::Cast<stream_executor::KernelArgsDeviceAddressArray>(&args);
 
     Arguments arguments = {mode, batch_count, m, n, k};
     arguments.lhs = const_cast<void*>(mem_args->device_memory_ptr(indices.lhs));
diff --git a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_benchmarks.cc b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_benchmarks.cc
index 7c45a2664ee8f0..1b88d1847de619 100644
--- a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_benchmarks.cc
+++ b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_benchmarks.cc
@@ -72,7 +72,7 @@ static void BM_RowMajorGemm(benchmark::State& state) {
   CHECK_OK(stream->Memset32(&b, BitPattern(1.2f), b.size()));
   CHECK_OK(stream->MemZero(&c, c.size()));
 
-  se::KernelArgsDeviceMemoryArray args(
+  stream_executor::KernelArgsDeviceAddressArray args(
       std::vector<se::DeviceAddressBase>({a, b, c}),
       custom_kernel.shared_memory_bytes());
 
diff --git a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_test.cc b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_test.cc
index 592b8fd7731b6c..15355017c55b1a 100644
--- a/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_test.cc
+++ b/third_party/xla/xla/service/gpu/kernels/cutlass_gemm_custom_kernel_test.cc
@@ -71,7 +71,7 @@ TEST(CutlassGemmKernelTest, SimpleGemm) {
   TF_ASSERT_OK(stream->MemZero(&c, byte_length));
 
   // Launch gemm kernel with device memory arguments.
-  se::KernelArgsDeviceMemoryArray arr(
+  stream_executor::KernelArgsDeviceAddressArray arr(
       std::vector<se::DeviceAddressBase>({a, b, c}),
       custom_kernel.shared_memory_bytes());
   TF_ASSERT_OK(gemm->Launch(custom_kernel.thread_dims(),
diff --git a/third_party/xla/xla/service/gpu/kernels/ptx_custom_kernel_test.cc b/third_party/xla/xla/service/gpu/kernels/ptx_custom_kernel_test.cc
index c37e0ff701464d..983fdd7728851b 100644
--- a/third_party/xla/xla/service/gpu/kernels/ptx_custom_kernel_test.cc
+++ b/third_party/xla/xla/service/gpu/kernels/ptx_custom_kernel_test.cc
@@ -103,7 +103,7 @@ TEST(PtxCustomKernelTest, GetPtxCustomKernel) {
   CHECK_OK(stream->Memset32(&b, 2, byte_length));
   CHECK_OK(stream->MemZero(&c, byte_length));
 
-  se::KernelArgsDeviceMemoryArray args(
+  stream_executor::KernelArgsDeviceAddressArray args(
       std::vector<se::DeviceAddressBase>({a, b, c}),
       custom_kernel.shared_memory_bytes());
   CHECK_OK(kernel->Launch(custom_kernel.thread_dims(),
@@ -143,7 +143,7 @@ TEST(PtxCustomKernelTest, GetPtxCustomKernelWithClusterDim) {
   CHECK_OK(stream->Memset32(&b, 2, byte_length));
   CHECK_OK(stream->MemZero(&c, byte_length));
 
-  se::KernelArgsDeviceMemoryArray args(
+  stream_executor::KernelArgsDeviceAddressArray args(
       std::vector<se::DeviceAddressBase>({a, b, c}),
       custom_kernel.shared_memory_bytes());
   CHECK_OK(kernel->Launch(custom_kernel.thread_dims(),
@@ -222,7 +222,7 @@ TEST(PtxCustomKernelTest, GetOwnedPtxCustomKernel) {
   CHECK_OK(stream->Memset32(&b, 2, byte_length));
   CHECK_OK(stream->MemZero(&c, byte_length));
 
-  se::KernelArgsDeviceMemoryArray args(
+  stream_executor::KernelArgsDeviceAddressArray args(
       std::vector<se::DeviceAddressBase>({a, b, c}),
       custom_kernel.shared_memory_bytes());
   CHECK_OK(kernel->Launch(custom_kernel.thread_dims(),

From 7addf9852a1472f9fadc87de8ec29f0b355f5c63 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:12:29 -0800
Subject: [PATCH 395/753] Automated Code Change

PiperOrigin-RevId: 845646520
---
 tensorflow/core/graph/benchmark_testlib.h     |  20 +--
 tensorflow/core/graph/collective_order.cc     |  18 +--
 tensorflow/core/graph/control_flow.cc         |  11 +-
 tensorflow/core/graph/control_flow.h          |   4 +-
 tensorflow/core/graph/costmodel.cc            |   6 +-
 tensorflow/core/graph/costmodel.h             |  12 +-
 tensorflow/core/graph/costmodel_test.cc       |  12 +-
 tensorflow/core/graph/edgeset.h               |   4 +-
 tensorflow/core/graph/graph.cc                |  12 +-
 tensorflow/core/graph/graph.h                 |  30 ++---
 .../graph/graph_debug_info_builder_test.cc    |   2 +-
 tensorflow/core/graph/graph_def_builder.cc    |  15 ++-
 tensorflow/core/graph/graph_def_builder.h     |  23 ++--
 tensorflow/core/graph/graph_node_util.cc      |  13 +-
 tensorflow/core/graph/graph_node_util.h       |   4 +-
 tensorflow/core/graph/graph_partition.cc      | 120 ++++++++++--------
 tensorflow/core/graph/graph_partition.h       |  17 +--
 tensorflow/core/graph/graph_partition_test.cc |  78 ++++++------
 tensorflow/core/graph/node_builder.cc         |   2 +-
 tensorflow/core/graph/node_builder.h          |  10 +-
 tensorflow/core/graph/optimizer_cse_test.cc   |  33 ++---
 tensorflow/core/graph/subgraph.cc             |  39 +++---
 tensorflow/core/graph/subgraph.h              |  23 ++--
 tensorflow/core/graph/tensor_id.h             |  12 +-
 tensorflow/core/graph/tensor_id_test.cc       |  16 ++-
 tensorflow/core/graph/testlib.cc              |  39 +++---
 tensorflow/core/graph/testlib.h               |  35 ++---
 tensorflow/core/graph/validate.cc             |   2 +-
 tensorflow/core/graph/validate_test.cc        |  22 ++--
 tensorflow/core/graph/while_context.h         |   4 +-
 30 files changed, 334 insertions(+), 304 deletions(-)

diff --git a/tensorflow/core/graph/benchmark_testlib.h b/tensorflow/core/graph/benchmark_testlib.h
index 54716405fd2a6a..98a488d4209a9b 100644
--- a/tensorflow/core/graph/benchmark_testlib.h
+++ b/tensorflow/core/graph/benchmark_testlib.h
@@ -73,7 +73,7 @@ inline GraphDef CreateGraphDef(int num_nodes, int num_edges_per_node) {
   const int kNumInNodes = 10 * num_edges_per_node;
   GraphDef graph_def;
 
-  auto create_node = [](const string& name, const string& op) {
+  auto create_node = [](const std::string& name, const std::string& op) {
     NodeDef node;
     node.set_name(name);
     node.set_op(op);
@@ -115,17 +115,17 @@ inline GraphDef CreateRandomGraph(int size) {
   random::PhiloxRandom philox(0x12345);
   random::SimplePhilox rnd(&philox);
 
-  string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
+  std::string prefix = "long_node_name_prefix_to_measure_string_copy_overhead";
 
   GraphDef graph;
   for (int i = 0; i < size; ++i) {
-    const string name = absl::StrCat(prefix, i);
-    const uint32 num_inputs = rnd.Uniform(std::min(i, 5));
+    const std::string name = absl::StrCat(prefix, i);
+    const uint32_t num_inputs = rnd.Uniform(std::min(i, 5));
 
     NodeDef node;
     node.set_name(name);
     for (int n = 0; n < num_inputs; ++n) {
-      const uint32 input_node = rnd.Uniform(i);
+      const uint32_t input_node = rnd.Uniform(i);
       node.add_input(absl::StrCat(prefix, input_node));
     }
 
@@ -142,7 +142,7 @@ inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins,
                                            bool fanout_unique_index) {
   GraphDef graph;
 
-  auto create_node = [](const string& name) {
+  auto create_node = [](const std::string& name) {
     NodeDef node;
     node.set_name(name);
     return node;
@@ -151,14 +151,14 @@ inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins,
   NodeDef node = create_node(/*name=*/"node");
 
   for (int i = 0; i < num_regular_fanins; ++i) {
-    const string input_node_name = absl::StrFormat("in%05d", i);
+    const std::string input_node_name = absl::StrFormat("in%05d", i);
     NodeDef input_node = create_node(/*name=*/input_node_name);
     *graph.add_node() = std::move(input_node);
     node.add_input(input_node_name);
   }
 
   for (int i = 0; i < num_controlling_fanins; ++i) {
-    const string input_node_name = absl::StrFormat("control_in%05d", i);
+    const std::string input_node_name = absl::StrFormat("control_in%05d", i);
     NodeDef input_node = create_node(/*name=*/input_node_name);
     *graph.add_node() = std::move(input_node);
     node.add_input(absl::StrCat("^", input_node_name));
@@ -166,13 +166,13 @@ inline GraphDef CreateFaninFanoutNodeGraph(int num_regular_fanins,
 
   for (int i = 0; i < num_regular_fanouts; ++i) {
     NodeDef output_node = create_node(/*name=*/absl::StrFormat("out%05d", i));
-    const string input_node_index =
+    const std::string input_node_index =
         fanout_unique_index ? absl::StrCat(node.name(), ":", i) : node.name();
     output_node.add_input(input_node_index);
     *graph.add_node() = std::move(output_node);
   }
 
-  const string controlled_fanout_input = absl::StrCat("^", node.name());
+  const std::string controlled_fanout_input = absl::StrCat("^", node.name());
   for (int i = 0; i < num_controlled_fanouts; ++i) {
     NodeDef output_node =
         create_node(/*name=*/absl::StrFormat("control_out%05d", i));
diff --git a/tensorflow/core/graph/collective_order.cc b/tensorflow/core/graph/collective_order.cc
index 9f8a498d88b47e..3ca3748eeb18be 100644
--- a/tensorflow/core/graph/collective_order.cc
+++ b/tensorflow/core/graph/collective_order.cc
@@ -25,8 +25,9 @@ namespace {
 // them.
 absl::Status DiscoverDataDependencies(
     const Graph* graph, std::vector<Node*>* collective_nodes,
-    std::vector<int32>* instance_keys,
-    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>>* data_dependencies) {
+    std::vector<int32_t>* instance_keys,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32_t>>*
+        data_dependencies) {
   absl::Status s;
   // Algorithm: do Reverse DFS starting at sink.  `node_leave` is called when
   // all parents of `node` have been visited.  At that point,
@@ -69,8 +70,8 @@ absl::Status DiscoverDataDependencies(
 // If there exists an edge a -> b then `dependency_edges[a]` contains `b`
 absl::Status CreateControlDependencies(
     const std::vector<Node*>& collective_nodes,
-    const std::vector<int32>& instance_keys,
-    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>>* data_dependencies,
+    const std::vector<int32_t>& instance_keys,
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32_t>>* data_dependencies,
     absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>>* dependency_edges) {
   // If there exists some path a -> ... -> b then `all_paths[a]` contains `b`
   absl::flat_hash_map<Node*, absl::flat_hash_set<Node*>> all_paths;
@@ -158,7 +159,7 @@ absl::Status InsertControlDependencies(
   } else if (order_type == GraphCollectiveOrder::kAttrs) {
     // `wait_for` is the inverse of `dependency_edges`, i.e. `wait_for[node]`
     // contains the list of instance keys for which `node` must wait.
-    absl::flat_hash_map<Node*, absl::flat_hash_set<int32>> wait_for;
+    absl::flat_hash_map<Node*, absl::flat_hash_set<int32_t>> wait_for;
     for (const auto& pair : dependency_edges) {
       int32_t src_instance;
       TF_RETURN_IF_ERROR(
@@ -168,7 +169,8 @@ absl::Status InsertControlDependencies(
       }
     }
     for (const auto& pair : wait_for) {
-      std::vector<int32> wait_for_list(pair.second.begin(), pair.second.end());
+      std::vector<int32_t> wait_for_list(pair.second.begin(),
+                                         pair.second.end());
       pair.first->ClearAttr("wait_for");
       pair.first->AddAttr("wait_for", wait_for_list);
     }
@@ -184,9 +186,9 @@ absl::Status InsertControlDependencies(
 absl::Status OrderCollectives(Graph* graph, GraphCollectiveOrder order_type) {
   // `instance_keys[i]` corresponds to `collective_nodes[i]`
   std::vector<Node*> collective_nodes;
-  std::vector<int32> instance_keys;
+  std::vector<int32_t> instance_keys;
   // node -> set of collectives on which node depends.
-  absl::flat_hash_map<Node*, absl::flat_hash_set<int32>> data_dependencies;
+  absl::flat_hash_map<Node*, absl::flat_hash_set<int32_t>> data_dependencies;
   TF_RETURN_IF_ERROR(DiscoverDataDependencies(
       graph, &collective_nodes, &instance_keys, &data_dependencies));
 
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 4cd9316a4607e3..e443dadc678c26 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -27,7 +27,7 @@ namespace tensorflow {
 namespace {
 // Information about a loop frame structure.
 struct Frame {
-  string name;
+  std::string name;
 
   // Pointer to the parent frame. The root frame has a pointer to itself.
   Frame* parent = nullptr;
@@ -40,7 +40,7 @@ struct Frame {
 // Verify that the ControlFlowInfo of the graph has valid loop structure.
 absl::Status ValidateControlFlowInfo(
     const Graph* graph, const std::vector<ControlFlowInfo>& cf_info) {
-  std::unordered_map<string, Frame> frames;
+  std::unordered_map<std::string, Frame> frames;
   for (const Node* node : graph->op_nodes()) {
     const ControlFlowInfo& cf = cf_info[node->id()];
     if (!cf.frame || !cf.parent_frame) {
@@ -85,7 +85,7 @@ absl::Status ValidateControlFlowInfo(
 
 absl::Status BuildControlFlowInfo(const Graph* g,
                                   std::vector<ControlFlowInfo>* info,
-                                  std::vector<string>* unreachable_nodes) {
+                                  std::vector<std::string>* unreachable_nodes) {
   info->clear();
   info->resize(g->num_node_ids());
 
@@ -97,7 +97,7 @@ absl::Status BuildControlFlowInfo(const Graph* g,
   src_info.frame = src_node;
   src_info.parent_frame = src_node;
 
-  string frame_name;
+  std::string frame_name;
   std::deque<const Node*> ready;
   ready.push_back(src_node);
   while (!ready.empty()) {
@@ -135,7 +135,8 @@ absl::Status BuildControlFlowInfo(const Graph* g,
       // Process the node 'out'.
       if (IsEnter(out)) {
         if (is_visited) {
-          const string& parent_frame = (*info)[out_parent->id()].frame_name;
+          const std::string& parent_frame =
+              (*info)[out_parent->id()].frame_name;
           if (parent_frame != frame_name) {
             return errors::InvalidArgument(
                 FormatNodeForError(*out),
diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h
index c1e2db339122df..b15bb671f7e1ce 100644
--- a/tensorflow/core/graph/control_flow.h
+++ b/tensorflow/core/graph/control_flow.h
@@ -36,7 +36,7 @@ struct ControlFlowInfo {
 
   const Node* frame = nullptr;         // frame of a node
   const Node* parent_frame = nullptr;  // parent frame of a node
-  string frame_name;                   // frame name of a node
+  std::string frame_name;              // frame name of a node
 };
 
 // Clear and populate `info` with each node's frame and the level it belongs to.
@@ -54,7 +54,7 @@ struct ControlFlowInfo {
 // which all sane front-ends should satisfy.
 absl::Status BuildControlFlowInfo(
     const Graph* g, std::vector<ControlFlowInfo>* info,
-    std::vector<string>* unreachable_nodes = nullptr);
+    std::vector<std::string>* unreachable_nodes = nullptr);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/costmodel.cc b/tensorflow/core/graph/costmodel.cc
index 37d1e69c5b3c66..6026522f28cfb0 100644
--- a/tensorflow/core/graph/costmodel.cc
+++ b/tensorflow/core/graph/costmodel.cc
@@ -35,7 +35,7 @@ void CostModel::SuppressInfrequent() {
   // Find the median of the non-zero counts, and use half of its value
   // as the cutoff for a "normal" execution mode node.
   if (count_.empty()) return;
-  std::vector<int32> non_zero;
+  std::vector<int32_t> non_zero;
   for (auto v : count_) {
     if (v > 0) non_zero.push_back(v);
   }
@@ -192,7 +192,7 @@ void CostModel::RecordCount(const Node* node, int count) {
   count_[id] += count;
 }
 
-int32 CostModel::TotalCount(const Node* node) const {
+int32_t CostModel::TotalCount(const Node* node) const {
   const int id = Id(node);
   if (id < 0) return 0;
   return (static_cast<size_t>(id) < slot_bytes_.size()) ? count_[id] : 0;
@@ -419,7 +419,7 @@ Microseconds CostModel::ComputationTimeEstimate(int64_t math_ops) {
 
 void CostModel::IncrementUpdateTimes() { update_times_++; }
 
-int32 CostModel::GetUpdateTimes() const { return update_times_; }
+int32_t CostModel::GetUpdateTimes() const { return update_times_; }
 
 // ----------------------------------------------------------------------------
 // InitCostModel
diff --git a/tensorflow/core/graph/costmodel.h b/tensorflow/core/graph/costmodel.h
index 795d94720415b5..9bfd9b2a60ce1b 100644
--- a/tensorflow/core/graph/costmodel.h
+++ b/tensorflow/core/graph/costmodel.h
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 
 namespace tensorflow {
-typedef std::unordered_map<absl::string_view, int32, StringPieceHasher>
+typedef std::unordered_map<absl::string_view, int32_t, StringPieceHasher>
     NodeNameToCostIdMap;
 
 class StepStats;
@@ -95,7 +95,7 @@ class CostModel {
   void RecordCount(const Node* node, int num_count);
 
   // Returns how many times "node" has been executed.
-  int32 TotalCount(const Node* node) const;
+  int32_t TotalCount(const Node* node) const;
 
   // Records that "output_slot" of "node" has produced tensors of
   // aggregated "bytes".
@@ -184,7 +184,7 @@ class CostModel {
   void IncrementUpdateTimes();
 
   // Get the times that the cost model is updated.
-  int32 GetUpdateTimes() const;
+  int32_t GetUpdateTimes() const;
 
  private:
   static Bytes MinTensorMemoryUsage(const TensorShapeProto& tensor_shape,
@@ -197,13 +197,13 @@ class CostModel {
 
   // Nodes and Edges whose count is < this value
   // get type/byte estimates of 0.
-  int32 min_count_ = 0;
+  int32_t min_count_ = 0;
 
   // The number of times the cost model is updated.
-  int32 update_times_ = 0;
+  int32_t update_times_ = 0;
 
   // Number of times each Node has been executed.
-  std::vector<int32> count_;
+  std::vector<int32_t> count_;
   // Cumulative execution time.
   std::vector<Microseconds> time_;
   // Cumulative Bytes output on each channel.
diff --git a/tensorflow/core/graph/costmodel_test.cc b/tensorflow/core/graph/costmodel_test.cc
index 0e5c2273f53b20..c062f58856523b 100644
--- a/tensorflow/core/graph/costmodel_test.cc
+++ b/tensorflow/core/graph/costmodel_test.cc
@@ -56,7 +56,7 @@ MATCHER_P(ShapeProtoEquals, other, "") {
   return true;
 }
 
-static void InitGraph(const string& s, Graph* graph) {
+static void InitGraph(const std::string& s, Graph* graph) {
   GraphDef graph_def;
 
   auto parser = protobuf::TextFormat::Parser();
@@ -97,8 +97,8 @@ Node* FindNode(const Graph& graph, std::string name) {
   return nullptr;
 }
 
-Node* AddNode(Graph& graph, const string& name, const string& node_type,
-              int num_inputs) {
+Node* AddNode(Graph& graph, const std::string& name,
+              const std::string& node_type, int num_inputs) {
   auto builder = NodeDefBuilder(name, node_type);
   for (int i = 0; i < num_inputs; ++i) {
     builder = builder.Input(absl::StrCat("node_", i), i, DT_FLOAT);
@@ -114,7 +114,7 @@ Node* AddNode(Graph& graph, const string& name, const string& node_type,
 }
 
 static void GenerateStepStats(Graph* graph, StepStats* step_stats,
-                              const string& device_name) {
+                              const std::string& device_name) {
   // Fill RunMetadata's step_stats and partition_graphs fields.
   DeviceStepStats* device_stepstats = step_stats->add_dev_stats();
   device_stepstats->set_device(device_name);
@@ -150,7 +150,7 @@ TEST(CostModelTest, WorksWithManager) {
   GenerateStepStats(graph1.get(), &step_stats, "DummyDevice1");
   GenerateStepStats(graph2.get(), &step_stats, "DummyDevice2");
   StepStatsCollector collector(&step_stats);
-  std::unordered_map<string, const Graph*> device_map;
+  std::unordered_map<std::string, const Graph*> device_map;
   device_map["DummyDevice1"] = graph1.get();
   device_map["DummyDevice2"] = graph2.get();
   CostModelManager cost_model_manager;
@@ -161,7 +161,7 @@ TEST(CostModelTest, WorksWithManager) {
   TF_ASSERT_OK(
       cost_model_manager.AddToCostGraphDef(graph2.get(), &cost_graph_def));
   ASSERT_EQ(cost_graph_def.node_size(), 12);
-  absl::flat_hash_map<int32, const CostGraphDef::Node> ids;
+  absl::flat_hash_map<int32_t, const CostGraphDef::Node> ids;
   for (auto node : cost_graph_def.node()) {
     int32_t index = node.id();
     auto result = ids.insert({index, node});
diff --git a/tensorflow/core/graph/edgeset.h b/tensorflow/core/graph/edgeset.h
index 6d6cb3ff630591..e3f50ef59484ea 100644
--- a/tensorflow/core/graph/edgeset.h
+++ b/tensorflow/core/graph/edgeset.h
@@ -82,7 +82,7 @@ class EdgeSet {
 #ifdef NDEBUG
   void RegisterMutation() {}
 #else
-  uint32 mutations_ = 0;
+  uint32_t mutations_ = 0;
   void RegisterMutation() { mutations_++; }
 #endif
 
@@ -127,7 +127,7 @@ class EdgeSet::const_iterator {
     CHECK_EQ(init_mutations_, owner_->mutations_);
   }
   const EdgeSet* owner_ = nullptr;
-  uint32 init_mutations_ = 0;
+  uint32_t init_mutations_ = 0;
 #endif
 };
 
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index a3e14eac396859..c7acee2bd056eb 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -190,7 +190,7 @@ void Node::ClearTypeInfo() {
 
 absl::Status Node::ShrinkTypeInfo(
     const absl::flat_hash_map<int, int>& index_mapping,
-    const string& type_attr_name, bool update_full_type) {
+    const std::string& type_attr_name, bool update_full_type) {
   std::vector<DataType> dtypes;
   TF_RETURN_IF_ERROR(GetNodeAttr(def(), type_attr_name, &dtypes));
 
@@ -239,11 +239,11 @@ const OpDef& Node::op_def() const { return *props_->op_def; }
 
 NodeDef* Node::mutable_def() { return &props_->node_def; }
 
-int32 Node::num_inputs() const { return props_->input_types.size(); }
+int32_t Node::num_inputs() const { return props_->input_types.size(); }
 DataType Node::input_type(int32_t i) const { return props_->input_types[i]; }
 const DataTypeVector& Node::input_types() const { return props_->input_types; }
 
-int32 Node::num_outputs() const { return props_->output_types.size(); }
+int32_t Node::num_outputs() const { return props_->output_types.size(); }
 DataType Node::output_type(int32_t o) const { return props_->output_types[o]; }
 const DataTypeVector& Node::output_types() const {
   return props_->output_types;
@@ -416,7 +416,7 @@ bool InputTensor::operator==(const InputTensor& other) const {
   return node == other.node && index == other.index;
 }
 
-uint64 InputTensor::Hash::operator()(InputTensor const& s) const {
+uint64_t InputTensor::Hash::operator()(InputTensor const& s) const {
   return Hash64Combine(std::hash<const Node*>()(s.node),
                        std::hash<int>()(s.index));
 }
@@ -427,7 +427,7 @@ bool OutputTensor::operator==(const OutputTensor& other) const {
   return node == other.node && index == other.index;
 }
 
-uint64 OutputTensor::Hash::operator()(OutputTensor const& s) const {
+uint64_t OutputTensor::Hash::operator()(OutputTensor const& s) const {
   return Hash64Combine(std::hash<const Node*>()(s.node),
                        std::hash<int>()(s.index));
 }
@@ -1086,7 +1086,7 @@ GraphDebugInfo Graph::BuildDebugInfo() const {
 std::string Edge::DebugString() const {
   auto src_name = src_ ? src_->name().c_str() : "<NULL>";
   auto dst_name = dst_ ? dst_->name().c_str() : "<NULL>";
-  return strings::Printf("[id=%d %s:%d -> %s:%d]", id_, src_name, src_output_,
+  return absl::StrFormat("[id=%d %s:%d -> %s:%d]", id_, src_name, src_output_,
                          dst_name, dst_input_);
 }
 
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 6e70b0cdfa8322..10b29e0975625f 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -107,11 +107,11 @@ class Node {
   NodeDef* mutable_def();
 
   // input and output types
-  int32 num_inputs() const;
+  int32_t num_inputs() const;
   DataType input_type(int32_t i) const;
   const DataTypeVector& input_types() const;
 
-  int32 num_outputs() const;
+  int32_t num_outputs() const;
   DataType output_type(int32_t o) const;
   const DataTypeVector& output_types() const;
 
@@ -139,14 +139,14 @@ class Node {
 
   // Sets 'original_node_names' field of this node's DebugInfo proto to
   // 'names'.
-  void set_original_node_names(const std::vector<string>& names);
-  void set_original_func_names(const std::vector<string>& names);
+  void set_original_node_names(const std::vector<std::string>& names);
+  void set_original_func_names(const std::vector<std::string>& names);
 
   // Read only access to attributes
   AttrSlice attrs() const;
 
   // Inputs requested by the NodeDef.  For the actual inputs, use in_edges.
-  const protobuf::RepeatedPtrField<string>& requested_inputs() const;
+  const protobuf::RepeatedPtrField<std::string>& requested_inputs() const;
 
   // Get the neighboring nodes via edges either in or out of this node.  This
   // includes control edges.
@@ -220,7 +220,7 @@ class Node {
     UpdateProperties();
   }
 
-  void AddAttr(const std::string& name, std::vector<string>&& val) {
+  void AddAttr(const std::string& name, std::vector<std::string>&& val) {
     MoveAttrValue(std::move(val), AddAttrHelper(name));
     UpdateProperties();
   }
@@ -278,7 +278,7 @@ class Node {
   // update the node's full type information (if present).
   absl::Status ShrinkTypeInfo(
       const absl::flat_hash_map<int, int>& index_mapping,
-      const string& type_attr_name, bool update_full_type);
+      const std::string& type_attr_name, bool update_full_type);
 
   // Called after an incident non-control edge has changed. Does nothing if not
   // all input edges are defined.
@@ -383,8 +383,8 @@ class Node {
 // Stores debug information associated with the Node.
 struct NodeDebugInfo {
   const std::string name;
-  std::vector<string> original_node_names;
-  std::vector<string> original_func_names;
+  std::vector<std::string> original_node_names;
+  std::vector<std::string> original_func_names;
 
   NodeDebugInfo(const Node& n);
   NodeDebugInfo(const NodeDef& ndef);
@@ -407,7 +407,7 @@ struct InputTensor {
   // A hash function for InputTensors. Nodes are hashed based on their pointer
   // value.
   struct Hash {
-    uint64 operator()(InputTensor const& s) const;
+    uint64_t operator()(InputTensor const& s) const;
   };
 };
 
@@ -428,7 +428,7 @@ struct OutputTensor {
   // A hash function for OutputTensors. Nodes are hashed based on their pointer
   // value.
   struct Hash {
-    uint64 operator()(OutputTensor const& s) const;
+    uint64_t operator()(OutputTensor const& s) const;
   };
 };
 
@@ -803,7 +803,7 @@ class Graph {
                                WhileContext** result);
 
   // Builds a node name to node pointer index for all nodes in the graph.
-  std::unordered_map<string, Node*> BuildNodeNameIndex() const;
+  std::unordered_map<std::string, Node*> BuildNodeNameIndex() const;
 
   absl::optional<std::vector<bool>>& GetConstArgIndicesCache() const {
     return const_arg_indices_cache_;
@@ -906,16 +906,16 @@ class Graph {
 
   // A table of the unique assigned device names.  Indices do NOT correspond
   // to node IDs.  Index 0 is always the empty string.
-  std::vector<string> device_names_;
+  std::vector<std::string> device_names_;
 
   // Maps unique device names to indices within device_names_[i].
-  std::unordered_map<string, int> device_names_map_;
+  std::unordered_map<std::string, int> device_names_map_;
 
   // All the while contexts owned by this graph, keyed by frame name,
   // corresponding to all the while loops contained in this graph (including
   // nested loops). The stored contexts are usually accessed via
   // AddWhileContext() or Node::while_ctx(), but this manages the lifetime.
-  std::map<string, WhileContext> while_ctxs_;
+  std::map<std::string, WhileContext> while_ctxs_;
 
   // Cache of the indices of the arguments which need to be constant for the XLA
   // compilation.
diff --git a/tensorflow/core/graph/graph_debug_info_builder_test.cc b/tensorflow/core/graph/graph_debug_info_builder_test.cc
index cbe4a8a8ae9287..5680800a5592c5 100644
--- a/tensorflow/core/graph/graph_debug_info_builder_test.cc
+++ b/tensorflow/core/graph/graph_debug_info_builder_test.cc
@@ -47,7 +47,7 @@ class TestStackTrace : public AbstractStackTrace {
 
   StackFrame LastUserFrame() const override { return frames_.back(); }
 
-  string ToString(const TracePrintingOptions& opts) const override {
+  std::string ToString(const TracePrintingOptions& opts) const override {
     auto frame = LastUserFrame();
     return absl::StrCat(frame.file_name, ":", frame.line_number, ":",
                         frame.function_name);
diff --git a/tensorflow/core/graph/graph_def_builder.cc b/tensorflow/core/graph/graph_def_builder.cc
index 168fc1a0da3da7..a4f08eab66b090 100644
--- a/tensorflow/core/graph/graph_def_builder.cc
+++ b/tensorflow/core/graph/graph_def_builder.cc
@@ -44,12 +44,12 @@ GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputs(
 }
 GraphDefBuilder::Options GraphDefBuilder::Options::WithNameImpl(
     absl::string_view name) {
-  name_ = string(name);
+  name_ = std::string(name);
   return *this;
 }
 GraphDefBuilder::Options GraphDefBuilder::Options::WithDeviceImpl(
     absl::string_view device) {
-  device_ = string(device);
+  device_ = std::string(device);
   return *this;
 }
 GraphDefBuilder::Options GraphDefBuilder::Options::WithControlInputImpl(
@@ -72,7 +72,7 @@ absl::Status GraphDefBuilder::ToGraphDef(GraphDef* graph_def) const {
   return status_;
 }
 
-string GraphDefBuilder::Options::GetNameForOp(absl::string_view op) const {
+std::string GraphDefBuilder::Options::GetNameForOp(absl::string_view op) const {
   if (name_.empty()) return graph_->NewName(op);
   return name_;
 }
@@ -99,14 +99,15 @@ void GraphDefBuilder::Options::UpdateStatus(const absl::Status& status) const {
 
 namespace ops {
 
-Node* SourceOp(const string& op_name, const GraphDefBuilder::Options& opts) {
+Node* SourceOp(const std::string& op_name,
+               const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name,
                            opts.op_registry());
   return opts.FinalizeBuilder(&node_builder);
 }
 
-Node* UnaryOp(const string& op_name, NodeOut input,
+Node* UnaryOp(const std::string& op_name, NodeOut input,
               const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name,
@@ -115,7 +116,7 @@ Node* UnaryOp(const string& op_name, NodeOut input,
   return opts.FinalizeBuilder(&node_builder);
 }
 
-Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b,
+Node* BinaryOp(const std::string& op_name, NodeOut a, NodeOut b,
                const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name,
@@ -124,7 +125,7 @@ Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b,
   return opts.FinalizeBuilder(&node_builder);
 }
 
-Node* TernaryOp(const string& op_name, NodeOut a, NodeOut b, NodeOut c,
+Node* TernaryOp(const std::string& op_name, NodeOut a, NodeOut b, NodeOut c,
                 const GraphDefBuilder::Options& opts) {
   if (opts.HaveError()) return nullptr;
   NodeBuilder node_builder(opts.GetNameForOp(op_name), op_name,
diff --git a/tensorflow/core/graph/graph_def_builder.h b/tensorflow/core/graph/graph_def_builder.h
index b635ece0eab707..afe3aebe55d62c 100644
--- a/tensorflow/core/graph/graph_def_builder.h
+++ b/tensorflow/core/graph/graph_def_builder.h
@@ -104,14 +104,14 @@ class GraphDefBuilder {
 
     // Returns a string representation of the status associated with *this.
     // Returns the string `"OK"` if the status doesn't have any error.
-    string StatusToString() const {
+    std::string StatusToString() const {
       return status_->ok() ? "OK" : std::string(status_->message());
     }
 
     // Given the Op type name, return a name for a node of that type.
     // Uses the value set in WithName() if that has been called.  Otherwise,
     // returns a name built out of the Op type name.
-    string GetNameForOp(absl::string_view op) const;
+    std::string GetNameForOp(absl::string_view op) const;
 
     // Sets the device, adds control inputs, adds attrs, and calls Finalize().
     // If Finalize returns an error, it is saved and this function returns
@@ -133,17 +133,17 @@ class GraphDefBuilder {
     Options WithControlInputsImpl(absl::Span<Node* const> control_inputs);
     template <class T>
     Options WithAttrImpl(absl::string_view name, T&& value) {
-      attrs_.emplace_back(string(name), AttrValue());
+      attrs_.emplace_back(std::string(name), AttrValue());
       SetAttrValue(std::forward<T>(value), &attrs_.back().second);
       return *this;
     }
 
     Graph* const graph_;
     absl::Status* const status_;
-    string name_;
-    string device_;
+    std::string name_;
+    std::string device_;
     std::vector<Node*> control_inputs_;
-    std::vector<std::pair<string, AttrValue>> attrs_;
+    std::vector<std::pair<std::string, AttrValue>> attrs_;
   };
 
   // Start building a new graph.
@@ -176,7 +176,7 @@ class GraphDefBuilder {
 
   // Returns whether a user-defined function with `name` already exists in the
   // graph.
-  bool HasFunction(const string& name) {
+  bool HasFunction(const std::string& name) {
     return flib_def_.Find(name) != nullptr;
   }
 
@@ -196,18 +196,19 @@ namespace ops {
 typedef NodeBuilder::NodeOut NodeOut;
 
 // For adding an Op with no inputs to a GraphDefBuilder.
-Node* SourceOp(const string& op_name, const GraphDefBuilder::Options& opts);
+Node* SourceOp(const std::string& op_name,
+               const GraphDefBuilder::Options& opts);
 
 // For adding an Op with one input to a GraphDefBuilder.
-Node* UnaryOp(const string& op_name, NodeOut input,
+Node* UnaryOp(const std::string& op_name, NodeOut input,
               const GraphDefBuilder::Options& opts);
 
 // For adding an Op with two inputs to a GraphDefBuilder.
-Node* BinaryOp(const string& op_name, NodeOut a, NodeOut b,
+Node* BinaryOp(const std::string& op_name, NodeOut a, NodeOut b,
                const GraphDefBuilder::Options& opts);
 
 // For adding an Op with three inputs to a GraphDefBuilder.
-Node* TernaryOp(const string& op_name, NodeOut a, NodeOut b, NodeOut c,
+Node* TernaryOp(const std::string& op_name, NodeOut a, NodeOut b, NodeOut c,
                 const GraphDefBuilder::Options& opts);
 
 }  // namespace ops
diff --git a/tensorflow/core/graph/graph_node_util.cc b/tensorflow/core/graph/graph_node_util.cc
index 3bf14ed2944394..ed6a23e3813d80 100644
--- a/tensorflow/core/graph/graph_node_util.cc
+++ b/tensorflow/core/graph/graph_node_util.cc
@@ -25,9 +25,11 @@ limitations under the License.
 
 namespace tensorflow {
 
-string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); }
+std::string SummarizeNode(const Node& node) {
+  return SummarizeNodeDef(node.def());
+}
 
-string FormatNodeForError(const Node& node) {
+std::string FormatNodeForError(const Node& node) {
   return FormatNodeDefForError(node.def());
 }
 
@@ -41,9 +43,10 @@ absl::Status AttachDef(const absl::Status& status, const Node& node,
   return AttachDef(status, node.def(), allow_multiple_formatted_node);
 }
 
-absl::btree_set<string> GetMergedNames(const std::vector<string>& from_names,
-                                       const std::vector<string>& to_names) {
-  absl::btree_set<string> merged_names;
+absl::btree_set<std::string> GetMergedNames(
+    const std::vector<std::string>& from_names,
+    const std::vector<std::string>& to_names) {
+  absl::btree_set<std::string> merged_names;
   merged_names.insert(from_names.begin(), from_names.end());
   merged_names.insert(to_names.begin(), to_names.end());
   return merged_names;
diff --git a/tensorflow/core/graph/graph_node_util.h b/tensorflow/core/graph/graph_node_util.h
index 146c4c07ca833a..8d7a44c5fed2e0 100644
--- a/tensorflow/core/graph/graph_node_util.h
+++ b/tensorflow/core/graph/graph_node_util.h
@@ -29,12 +29,12 @@ class OpDef;
 
 // Produce a human-readable version of a Node or NodeDef that is more concise
 // than a text-format proto.
-string SummarizeNode(const Node& node);
+std::string SummarizeNode(const Node& node);
 
 // Produces a formatted string pattern from the node which can uniquely identify
 // this node upstream to produce an informative error message. The pattern
 // followed is: {{node <node_name>}}
-string FormatNodeForError(const Node& node);
+std::string FormatNodeForError(const Node& node);
 
 // Merges the original node names from the debug information of 'from' to the
 // debug information of 'to'.
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index be5a5423ae57c6..1328c5c8b57b4c 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -160,16 +160,17 @@ bool IsDstInputOnHost(const Edge* edge, const GraphInfo& info) {
 
 // Add a control edge from each input to each recv.
 void AddReadControl(const std::vector<NodeDef*>& recvs,
-                    const std::vector<string>& inputs) {
+                    const std::vector<std::string>& inputs) {
   for (NodeDef* recv : recvs) {
-    for (const string& input : inputs) {
+    for (const std::string& input : inputs) {
       recv->add_input(absl::StrCat("^", input));
     }
   }
 }
 
 void SetSendRecvAttrs(const PartitionOptions& opts, const Edge* edge,
-                      const string& tensor_name_attr, NodeDefBuilder* builder) {
+                      const std::string& tensor_name_attr,
+                      NodeDefBuilder* builder) {
   builder->Attr("tensor_name", tensor_name_attr);
   builder->Attr("send_device", edge->src()->assigned_device_name());
   builder->Attr("send_device_incarnation",
@@ -184,7 +185,7 @@ void SetSendRecvAttrs(const PartitionOptions& opts, const Edge* edge,
 NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
                  GraphDef* gdef, const Edge* edge,
                  NodeDefBuilder::NodeOut send_from, int64_t start_time,
-                 const string& tensor_name_attr, absl::Status* status) {
+                 const std::string& tensor_name_attr, absl::Status* status) {
   const DataType dtype = send_from.data_type;
   const DataType cast_dtype = opts.should_cast ? opts.should_cast(edge) : dtype;
   const Node* src = edge->src();
@@ -201,7 +202,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
   // Add a cast node that casts dtype to cast_dtype.
   // NOTE(yuanbyu): Only cast for cross-device send/recv.
   if (dtype != cast_dtype && !NeedSameDeviceSendRecv(edge, g_info)) {
-    const string cast_op = (host_memory) ? "_HostCast" : "Cast";
+    const std::string cast_op = (host_memory) ? "_HostCast" : "Cast";
     NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op,
                                 NodeDebugInfo(*src));
     cast_builder.Device(src->assigned_device_name()).Input(send_from);
@@ -226,7 +227,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
   }
 
   // Add the send node.
-  const string send_op = (host_memory) ? "_HostSend" : "_Send";
+  const std::string send_op = (host_memory) ? "_HostSend" : "_Send";
   NodeDefBuilder send_builder(opts.new_name(src->name()), send_op,
                               NodeDebugInfo(*src));
   SetSendRecvAttrs(opts, edge, tensor_name_attr, &send_builder);
@@ -241,7 +242,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
 
 NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
                  GraphDef* gdef, const Edge* edge, NodeDef** real_recv,
-                 const string& tensor_name_attr, absl::Status* status) {
+                 const std::string& tensor_name_attr, absl::Status* status) {
   const DataType dtype = EdgeType(edge);
   const Node* src = edge->src();
   const Node* dst = edge->dst();
@@ -285,7 +286,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
   }
 
   // Add the recv node.
-  const string recv_op = (host_memory) ? "_HostRecv" : "_Recv";
+  const std::string recv_op = (host_memory) ? "_HostRecv" : "_Recv";
   NodeDefBuilder recv_builder(opts.new_name(src->name()), recv_op,
                               NodeDebugInfo(*src));
   SetSendRecvAttrs(opts, edge, tensor_name_attr, &recv_builder);
@@ -298,7 +299,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info,
 
   // Add the cast node (from cast_dtype to dtype) or an Identity node.
   if (dtype != cast_dtype) {
-    const string cast_op = (host_memory) ? "_HostCast" : "Cast";
+    const std::string cast_op = (host_memory) ? "_HostCast" : "Cast";
     NodeDefBuilder cast_builder(opts.new_name(src->name()), cast_op,
                                 NodeDebugInfo(*src));
     cast_builder.Attr("DstT", dtype);
@@ -339,8 +340,9 @@ NodeDef* AddDummyConst(const PartitionOptions& opts, GraphDef* gdef,
 
 // A dummy node for scheduling.
 NodeDef* AddControlTrigger(const PartitionOptions& opts, GraphDef* gdef,
-                           const string& assigned_device_name, int64_t epoch,
-                           int64_t starttime, absl::Status* status) {
+                           const std::string& assigned_device_name,
+                           int64_t epoch, int64_t starttime,
+                           absl::Status* status) {
   NodeDef* result = gdef->add_node();
   *status = NodeDefBuilder(opts.new_name(absl::StrCat("synch_", epoch)),
                            "ControlTrigger")
@@ -398,18 +400,19 @@ void OptimizeControlFlowColocation(Graph* graph) {
   DFS(*graph, visit, {});
 }
 
-string ControlLoopName(const string& name) {
+std::string ControlLoopName(const std::string& name) {
   return absl::StrCat("_cloop", name);
 }
 
 bool IsControlLoop(const Node* node) {
-  const string& name = node->name();
+  const std::string& name = node->name();
   return absl::StartsWith(name, "_cloop");
 }
 
 // An enter node for control flow.
-Node* AddControlEnter(Graph* g, const string& node_name,
-                      const string& device_name, const string& frame_name,
+Node* AddControlEnter(Graph* g, const std::string& node_name,
+                      const std::string& device_name,
+                      const std::string& frame_name,
                       const int parallel_iterations, absl::Status* status) {
   NodeBuilder node_builder(node_name, "Enter", g->op_registry());
   node_builder.Input({"dummy", 0, DT_FLOAT});
@@ -423,9 +426,9 @@ Node* AddControlEnter(Graph* g, const string& node_name,
 }
 
 // A merge node for control flow.
-Node* AddControlMerge(const string& in_name1, const string& in_name2, Graph* g,
-                      const string& node_name, const string& device_name,
-                      absl::Status* status) {
+Node* AddControlMerge(const std::string& in_name1, const std::string& in_name2,
+                      Graph* g, const std::string& node_name,
+                      const std::string& device_name, absl::Status* status) {
   NodeBuilder node_builder(node_name, "Merge", g->op_registry());
   node_builder.Input({{in_name1, 0, DT_FLOAT}, {in_name2, 0, DT_FLOAT}});
   Node* res_node;
@@ -437,7 +440,7 @@ Node* AddControlMerge(const string& in_name1, const string& in_name2, Graph* g,
 
 // A switch node for control flow.
 Node* AddControlSwitch(NodeBuilder::NodeOut input1, NodeBuilder::NodeOut input2,
-                       const string& device_name,
+                       const std::string& device_name,
                        const GraphDefBuilder::Options& bopts) {
   Node* res_node =
       ops::BinaryOp("Switch", std::move(input1), std::move(input2), bopts);
@@ -447,7 +450,7 @@ Node* AddControlSwitch(NodeBuilder::NodeOut input1, NodeBuilder::NodeOut input2,
 }
 
 // A next_iteration node for control flow.
-Node* AddControlNext(NodeBuilder::NodeOut input, const string& device_name,
+Node* AddControlNext(NodeBuilder::NodeOut input, const std::string& device_name,
                      const GraphDefBuilder::Options& bopts) {
   Node* res_node = ops::UnaryOp("NextIteration", std::move(input), bopts);
   if (bopts.HaveError()) return nullptr;
@@ -469,7 +472,7 @@ Node* EmptyConst(const GraphDefBuilder::Options& options) {
 }
 
 // A dummy const node for control flow.
-Node* AddControlConst(const string& device_name,
+Node* AddControlConst(const std::string& device_name,
                       const GraphDefBuilder::Options& bopts) {
   Node* res_node = EmptyConst(bopts);
   if (bopts.HaveError()) return nullptr;
@@ -513,21 +516,22 @@ absl::Status AddControlLoop(const PartitionOptions& opts, Graph* g,
   absl::Status status;
   GraphDefBuilder::Options bopts(g, &status);
   const ControlFlowInfo& src_info = (*cf_info)[src->id()];
-  const string& device_name = edge->dst()->assigned_device_name();
-  const string& frame_name = src_info.frame_name;
+  const std::string& device_name = edge->dst()->assigned_device_name();
+  const std::string& frame_name = src_info.frame_name;
   int parallel_iterations;
   status = GetNodeAttr(src_info.frame->attrs(), "parallel_iterations",
                        &parallel_iterations);
   if (!status.ok()) return status;
 
   // The names of the nodes to be added.
-  const string& enter_name =
+  const std::string& enter_name =
       ControlLoopName(opts.new_name(edge->dst()->name()));
-  const string& merge_name =
+  const std::string& merge_name =
       ControlLoopName(opts.new_name(edge->dst()->name()));
-  const string& switch_name =
+  const std::string& switch_name =
+      ControlLoopName(opts.new_name(edge->dst()->name()));
+  const std::string& next_name =
       ControlLoopName(opts.new_name(edge->dst()->name()));
-  const string& next_name = ControlLoopName(opts.new_name(edge->dst()->name()));
 
   // Add the nodes to the graph g.
   Node* enter = AddControlEnter(g, enter_name, device_name, frame_name,
@@ -634,14 +638,14 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g,
   OptimizeControlFlowColocation(g);
 
   // The map from frames to their LoopCond nodes.
-  std::unordered_map<string, Node*> frame_cond_map;
+  std::unordered_map<std::string, Node*> frame_cond_map;
   int num_node_ids = g->num_node_ids();
   for (int i = 0; i < num_node_ids; ++i) {
     Node* node = g->FindNodeId(i);
     if (node == nullptr) continue;
 
     if (IsLoopCond(node)) {
-      const string& frame_name = cf_info[node->id()].frame_name;
+      const std::string& frame_name = cf_info[node->id()].frame_name;
       DCHECK(!frame_name.empty());
       frame_cond_map[frame_name] = node;
     }
@@ -655,7 +659,7 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g,
   // the merge of the outer loop to the enter of the inner loop.
   //
   // A map from <frame_name, device_name> to ControlLoop.
-  std::unordered_map<string, ControlLoop> control_loops;
+  std::unordered_map<std::string, ControlLoop> control_loops;
   int num_edge_ids = g->num_edge_ids();
   for (int i = 0; i < num_edge_ids; ++i) {
     const Edge* edge = g->FindEdgeId(i);
@@ -666,15 +670,15 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g,
     // Skip Sink/Source nodes.
     if (!src->IsOp() || !dst->IsOp()) continue;
 
-    const string& src_device = src->assigned_device_name();
-    const string& dst_device = dst->assigned_device_name();
+    const std::string& src_device = src->assigned_device_name();
+    const std::string& dst_device = dst->assigned_device_name();
     // Skip local edges.
     if (src_device == dst_device) continue;
 
     const Node* src_frame = OutputFrame(src, cf_info);
     const Node* dst_frame = InputFrame(dst, cf_info);
-    const string& src_frame_name = cf_info[src_frame->id()].frame_name;
-    const string& dst_frame_name = cf_info[dst_frame->id()].frame_name;
+    const std::string& src_frame_name = cf_info[src_frame->id()].frame_name;
+    const std::string& dst_frame_name = cf_info[dst_frame->id()].frame_name;
     // Skip if src and dst are not in the same frame.
     if (src_frame_name.empty() || src_frame_name != dst_frame_name) {
       continue;
@@ -685,12 +689,12 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g,
     // for its outer frame when nested.
     ControlLoop child_loop;
     while (true) {
-      const string& curr_frame_name = cf_info[src_frame->id()].frame_name;
+      const std::string& curr_frame_name = cf_info[src_frame->id()].frame_name;
       if (curr_frame_name.empty()) {
         // We have reached the root frame.
         if (child_loop.merge != nullptr) {
-          const string& node_name = opts.new_name(edge->dst()->name());
-          const string& device_name = edge->dst()->assigned_device_name();
+          const std::string& node_name = opts.new_name(edge->dst()->name());
+          const std::string& device_name = edge->dst()->assigned_device_name();
           Node* const_node =
               AddControlConst(device_name, bopts.WithName(node_name));
           if (!status.ok()) return status;
@@ -700,7 +704,8 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g,
         break;
       }
 
-      const string& cl_key = absl::StrCat(curr_frame_name, "$$", dst_device);
+      const std::string& cl_key =
+          absl::StrCat(curr_frame_name, "$$", dst_device);
       auto it = control_loops.find(cl_key);
       if (it != control_loops.end()) {
         if (child_loop.enter != nullptr) {
@@ -748,15 +753,16 @@ absl::Status AddControlFlow(const PartitionOptions& opts, Graph* g,
     // Skip Sink/Source nodes.
     if (!src->IsOp() || !dst->IsOp()) continue;
 
-    const string& src_device = src->assigned_device_name();
-    const string& dst_device = dst->assigned_device_name();
+    const std::string& src_device = src->assigned_device_name();
+    const std::string& dst_device = dst->assigned_device_name();
     if (src_device != dst_device) {
       const Node* src_frame = OutputFrame(src, cf_info);
       const Node* dst_frame = InputFrame(dst, cf_info);
-      const string& src_frame_name = cf_info[src_frame->id()].frame_name;
-      const string& dst_frame_name = cf_info[dst_frame->id()].frame_name;
+      const std::string& src_frame_name = cf_info[src_frame->id()].frame_name;
+      const std::string& dst_frame_name = cf_info[dst_frame->id()].frame_name;
       if (!src_frame_name.empty() && src_frame_name == dst_frame_name) {
-        const string& cl_key = absl::StrCat(dst_frame_name, "$$", dst_device);
+        const std::string& cl_key =
+            absl::StrCat(dst_frame_name, "$$", dst_device);
         ControlLoop loop = control_loops[cl_key];
         DCHECK(loop.enter != nullptr);
         // Note that we'll create multiple duplicate edges if dst has multiple
@@ -812,12 +818,13 @@ absl::Status TopologicalSortNodesWithTimePriority(
   };
 
   // Build initial structures, initial contents of queue.
-  std::unordered_map<string, std::vector<const NodeDef*>> node_to_output_nodes;
+  std::unordered_map<std::string, std::vector<const NodeDef*>>
+      node_to_output_nodes;
   std::unordered_map<const NodeDef*, int> inputs_needed;
   for (int n = 0; n < gdef->node_size(); ++n) {
     const NodeDef* ndef = &gdef->node(n);
     for (int i = 0; i < ndef->input_size(); ++i) {
-      node_to_output_nodes[string(ParseTensorName(ndef->input(i)).first)]
+      node_to_output_nodes[std::string(ParseTensorName(ndef->input(i)).first)]
           .push_back(ndef);
     }
     int64_t start_time;
@@ -872,8 +879,9 @@ absl::Status TopologicalSortNodesWithTimePriority(
   return absl::OkStatus();
 }
 
-absl::Status AddControlEdges(const PartitionOptions& opts,
-                             std::unordered_map<string, GraphDef>* partitions) {
+absl::Status AddControlEdges(
+    const PartitionOptions& opts,
+    std::unordered_map<std::string, GraphDef>* partitions) {
   absl::Status status;
   // TODO(yuanbyu): Very naive for now. To be improved.
   const int num_epochs = 100;
@@ -891,7 +899,7 @@ absl::Status AddControlEdges(const PartitionOptions& opts,
 
     // Add a dummy node for every epoch, and add a control edge from the
     // "last" node in the preceding epoch to the dummy node.
-    string device_name = gdef->node(0).device();
+    std::string device_name = gdef->node(0).device();
     int64_t makespan = start_times.back().second;
     int64_t resolution = (makespan / num_epochs) + 1;
 
@@ -909,7 +917,7 @@ absl::Status AddControlEdges(const PartitionOptions& opts,
         }
         dummys.push_back(dummy);
         if (j > 0) {
-          string src_name = start_times[j - 1].first->name();
+          std::string src_name = start_times[j - 1].first->name();
           Graph::AddInput(dummy, src_name, Graph::kControlSlot);
         }
         i++;
@@ -940,7 +948,7 @@ void SetIncarnation(const PartitionOptions& opts, NodeDef* ndef) {
     // Not related to send/recv.
     return;
   }
-  const string& send_device = GetNodeAttrString(*ndef, "send_device");
+  const std::string& send_device = GetNodeAttrString(*ndef, "send_device");
   if (send_device.empty()) {
     // No known send_device. The runtime will detect it later.
     return;
@@ -968,10 +976,10 @@ void SetIncarnation(const PartitionOptions& opts, GraphDef* gdef) {
 }
 
 absl::Status Partition(const PartitionOptions& opts, Graph* g,
-                       std::unordered_map<string, GraphDef>* partitions) {
+                       std::unordered_map<std::string, GraphDef>* partitions) {
   // TODO(b/290689453) Refactor this into smaller functions
   absl::Status status;
-  absl::flat_hash_map<string, std::unique_ptr<GraphDebugInfoBuilder>>
+  absl::flat_hash_map<std::string, std::unique_ptr<GraphDebugInfoBuilder>>
       debug_info_builders;
   partitions->clear();
 
@@ -991,7 +999,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g,
   status = BuildMemoryDeviceInfo(*g, &g_info);
   if (!status.ok()) return status;
 
-  string dstp;
+  std::string dstp;
   std::vector<const Edge*> inputs;
   DupRecvTable dup_recv(3);
   // For a node dst, 'ref_recvs' remembers the recvs introduced by a ref
@@ -999,7 +1007,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g,
   // edge to dst. We will add a control edge for every pair in
   // (ref_recvs x ref_control_inputs).
   std::vector<NodeDef*> ref_recvs;
-  std::vector<string> ref_control_inputs;
+  std::vector<std::string> ref_control_inputs;
 
   int32_t num_data = 0;
   int32_t num_control = 0;
@@ -1121,7 +1129,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g,
       auto iter = dup_recv.find(key);
       if (iter != dup_recv.end()) {
         // We found one. Reuse the data/control transferred already.
-        const string& recv_node_name = iter->second.recv->name();
+        const std::string& recv_node_name = iter->second.recv->name();
         if (edge->IsControlEdge()) {
           Graph::AddInput(dst_def, recv_node_name, Graph::kControlSlot);
         } else {
@@ -1157,7 +1165,7 @@ absl::Status Partition(const PartitionOptions& opts, Graph* g,
         send_from.Reset(src->name(), edge->src_output(), EdgeType(edge));
       }
 
-      string tensor_name_attr;
+      std::string tensor_name_attr;
       if (opts.get_tensor_name_attr) {
         tensor_name_attr = opts.get_tensor_name_attr(edge);
       } else {
diff --git a/tensorflow/core/graph/graph_partition.h b/tensorflow/core/graph/graph_partition.h
index 59e9fe0e61c35d..c1d9493c76c6b5 100644
--- a/tensorflow/core/graph/graph_partition.h
+++ b/tensorflow/core/graph/graph_partition.h
@@ -31,19 +31,19 @@ namespace tensorflow {
 struct PartitionOptions {
   // A function that returns a location for the execution of a given
   // Node.
-  typedef std::function<string(const Node*)> NodeToLocFunc;
+  typedef std::function<std::string(const Node*)> NodeToLocFunc;
   NodeToLocFunc node_to_loc = nullptr;
 
   // A function that returns a unique graph node name with the given
   // prefix.
-  typedef std::function<string(const string&)> NewNameFunc;
+  typedef std::function<std::string(const std::string&)> NewNameFunc;
   NewNameFunc new_name = nullptr;
 
   // A function that returns the incarnation of a device given the
   // device's fullname. If not found, GetIncarnationFunc should return
   // kIllegalIncarnation.
-  static constexpr uint64 kIllegalIncarnation = 0;
-  typedef std::function<uint64(const string&)> GetIncarnationFunc;
+  static constexpr uint64_t kIllegalIncarnation = 0;
+  typedef std::function<uint64_t(const std::string&)> GetIncarnationFunc;
   GetIncarnationFunc get_incarnation = nullptr;
 
   // If specified, flib_def defines a function library that should be
@@ -79,7 +79,7 @@ struct PartitionOptions {
 
   // Optional customized function to compute the "tensor_name" attr value of
   // Send/Recv ops inserted during partitioning.
-  std::function<string(const Edge*)> get_tensor_name_attr = nullptr;
+  std::function<std::string(const Edge*)> get_tensor_name_attr = nullptr;
 
   // If true, the `Partition()` function can make destructive changes to the
   // passed-in `Graph`.
@@ -96,13 +96,14 @@ struct PartitionOptions {
 //
 // Stores the partitions in *partitions.
 absl::Status Partition(const PartitionOptions& opts, Graph* input,
-                       std::unordered_map<string, GraphDef>* partitions);
+                       std::unordered_map<std::string, GraphDef>* partitions);
 
 // Add control edges to the partitions to control the ordering
 // and timing of the recv nodes based on the start times calculated
 // using some scheduling algorithm.
-absl::Status AddControlEdges(const PartitionOptions& opts,
-                             std::unordered_map<string, GraphDef>* partitions);
+absl::Status AddControlEdges(
+    const PartitionOptions& opts,
+    std::unordered_map<std::string, GraphDef>* partitions);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 5f3d0a1b4117f2..4f5e431b87df50 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -68,21 +68,23 @@ using ::testing::Ne;
 
 const char gpu_device[] = "/job:a/replica:0/task:0/device:GPU:0";
 
-string SplitByDevice(const Node* node) { return node->assigned_device_name(); }
+std::string SplitByDevice(const Node* node) {
+  return node->assigned_device_name();
+}
 
-string DeviceName(const Node* node) {
+std::string DeviceName(const Node* node) {
   char first = node->name()[0];
   if (first == 'G') {
     return gpu_device;
   } else {
-    const string cpu_prefix = "/job:a/replica:0/task:0/cpu:";
+    const std::string cpu_prefix = "/job:a/replica:0/task:0/cpu:";
     int index = first - 'A';
     return absl::StrCat(cpu_prefix, index);
   }
 }
 
 void Partition(const GraphDef& graph_def,
-               std::unordered_map<string, GraphDef>* partitions) {
+               std::unordered_map<std::string, GraphDef>* partitions) {
   Graph g(OpRegistry::Global());
   GraphConstructorOptions opts;
   TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &g));
@@ -90,16 +92,18 @@ void Partition(const GraphDef& graph_def,
   // Assigns devices to each node. Uses 1st letter of the node name as the
   // device index if no device is specified.
   for (Node* node : g.nodes()) {
-    string device_name = !node->requested_device().empty()
-                             ? node->requested_device()
-                             : DeviceName(node);
+    std::string device_name = !node->requested_device().empty()
+                                  ? node->requested_device()
+                                  : DeviceName(node);
     node->set_assigned_device_name(device_name);
   }
 
   PartitionOptions popts;
   popts.node_to_loc = SplitByDevice;
-  popts.new_name = [&g](const string& prefix) { return g.NewName(prefix); };
-  popts.get_incarnation = [](const string& name) {
+  popts.new_name = [&g](const std::string& prefix) {
+    return g.NewName(prefix);
+  };
+  popts.get_incarnation = [](const std::string& name) {
     return (name[0] - 'A') + 100;
   };
   absl::Status s = Partition(popts, &g, partitions);
@@ -116,7 +120,7 @@ void Partition(const GraphDef& graph_def,
 }
 
 void CheckLoopConstruction(const GraphDef& graph_def) {
-  std::unordered_map<string, GraphDef> partitions;
+  std::unordered_map<std::string, GraphDef> partitions;
   Partition(graph_def, &partitions);
   for (const auto& kv : partitions) {
     const GraphDef& gdef = kv.second;
@@ -128,7 +132,7 @@ void CheckLoopConstruction(const GraphDef& graph_def) {
       // _recvs must have a control input
       if (ndef.op() == "_Recv") {
         bool has_control = false;
-        for (const string& input_name : ndef.input()) {
+        for (const std::string& input_name : ndef.input()) {
           if (absl::StartsWith(input_name, "^")) {
             has_control = true;
             break;
@@ -171,10 +175,10 @@ REGISTER_OP("Combine")
     .Output("o: float")
     .SetShapeFn(shape_inference::UnknownShape);
 
-Output ConstructOp(const Scope& scope, const string& op_type,
+Output ConstructOp(const Scope& scope, const std::string& op_type,
                    const absl::Span<const Input> inputs) {
   if (!scope.ok()) return Output();
-  const string unique_name = scope.GetUniqueNameForOp(op_type);
+  const std::string unique_name = scope.GetUniqueNameForOp(op_type);
   auto builder =
       NodeBuilder(unique_name, op_type, scope.graph()->op_registry());
   for (auto const& input : inputs) {
@@ -230,20 +234,20 @@ class GraphPartitionTest : public ::testing::Test {
   void ExpectMatchA() {
     GraphDef graph_def;
     TF_EXPECT_OK(scope_a_.ToGraphDef(&graph_def));
-    string a = "/job:a/replica:0/task:0/cpu:0";
+    std::string a = "/job:a/replica:0/task:0/cpu:0";
     TF_EXPECT_GRAPH_EQ(graph_def, partitions_[a]);
   }
 
   void ExpectMatchB() {
     GraphDef graph_def;
     TF_EXPECT_OK(scope_b_.ToGraphDef(&graph_def));
-    string b = "/job:a/replica:0/task:0/cpu:1";
+    std::string b = "/job:a/replica:0/task:0/cpu:1";
     TF_EXPECT_GRAPH_EQ(graph_def, partitions_[b]);
   }
 
   void ExpectFunctions(const FunctionDefLibrary& library,
-                       const std::set<string>& expected_names) {
-    std::set<string> actual_names;
+                       const std::set<std::string>& expected_names) {
+    std::set<std::string> actual_names;
     for (const FunctionDef& fdef : library.function()) {
       actual_names.insert(fdef.signature().name());
     }
@@ -254,7 +258,7 @@ class GraphPartitionTest : public ::testing::Test {
   GraphDef in_graph_def_;
   Scope scope_a_;
   Scope scope_b_;
-  std::unordered_map<string, GraphDef> partitions_;
+  std::unordered_map<std::string, GraphDef> partitions_;
 };
 
 TEST_F(GraphPartitionTest, SingleDevice) {
@@ -277,8 +281,8 @@ TEST_F(GraphPartitionTest, CrossDeviceData) {
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  string a = "/job:a/replica:0/task:0/cpu:0";
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
   _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b);
   ExpectMatchA();
@@ -298,8 +302,8 @@ TEST_F(GraphPartitionTest, CrossDeviceControl) {
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  string a = "/job:a/replica:0/task:0/cpu:0";
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
   auto c =
       Const(scope_a_.WithOpName("A1/ctrl/_0").WithControlDependencies(a1), {});
@@ -323,8 +327,8 @@ TEST_F(GraphPartitionTest, CrossDeviceData_MultiUse) {
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  string a = "/job:a/replica:0/task:0/cpu:0";
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
   _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b);
   ExpectMatchA();
@@ -346,8 +350,8 @@ TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) {
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  string a = "/job:a/replica:0/task:0/cpu:0";
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
   auto c =
       Const(scope_a_.WithOpName("A1/ctrl/_0").WithControlDependencies(a1), {});
@@ -372,8 +376,8 @@ TEST_F(GraphPartitionTest, CrossDevice_DataControl) {
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  string a = "/job:a/replica:0/task:0/cpu:0";
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
   _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b);
   auto c =
@@ -417,7 +421,7 @@ TEST_F(GraphPartitionTest, CrossDeviceLoopSimple1) {
   auto b1 = Identity(in_.WithOpName("B1"), a3);
   NextIteration(in_.WithOpName("B5"), b1);
 
-  std::unordered_map<string, GraphDef> partitions;
+  std::unordered_map<std::string, GraphDef> partitions;
   Partition(ToGraphDef(), &partitions);
   for (const auto& kv : partitions) {
     const GraphDef& gdef = kv.second;
@@ -471,10 +475,12 @@ TEST_F(GraphPartitionTest, PartitionIncompleteGraph) {
 
   PartitionOptions popts;
   popts.node_to_loc = SplitByDevice;
-  popts.new_name = [&g](const string& prefix) { return g.NewName(prefix); };
-  popts.get_incarnation = [](const string&) { return 1; };
+  popts.new_name = [&g](const std::string& prefix) {
+    return g.NewName(prefix);
+  };
+  popts.get_incarnation = [](const std::string&) { return 1; };
 
-  std::unordered_map<string, GraphDef> partitions;
+  std::unordered_map<std::string, GraphDef> partitions;
   status = Partition(popts, &g, &partitions);
   // Partitioning should fail, but not crash like it did before the
   // changes that accompanied the addition of this test.
@@ -498,8 +504,8 @@ TEST_F(GraphPartitionTest, Functions) {
   EXPECT_EQ(2, partitions_.size());
 
   // Test that partition graphs inherit function library from original graph.
-  string a = "/job:a/replica:0/task:0/cpu:0";
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
 
   // Node "A2" is placed in part `a`, and uses only "XTimesTwo".
   ExpectFunctions(partitions_[a].library(), {"XTimesTwo"});
@@ -602,7 +608,7 @@ TEST_F(GraphPartitionTest, GraphDebugInfo) {
 
   // Expect each partitioned graph to contain the stack traces for its nodes.
   // A stack trace for A1 should be in the A partition (".../cpu:0").
-  string a = "/job:a/replica:0/task:0/cpu:0";
+  std::string a = "/job:a/replica:0/task:0/cpu:0";
   const GraphDebugInfo& a_debug_info = partitions_[a].debug_info();
   StackTracesMap traces = LoadTracesFromDebugInfo(a_debug_info);
   const auto& a_it = traces.find("A1");
@@ -611,7 +617,7 @@ TEST_F(GraphPartitionTest, GraphDebugInfo) {
               ::testing::ContainsRegex("alpha.cc.*30"));
 
   // Stack traces for B1 and B2 should be in the B partition (".../cpu:1").
-  string b = "/job:a/replica:0/task:0/cpu:1";
+  std::string b = "/job:a/replica:0/task:0/cpu:1";
   const GraphDebugInfo& b_debug_info = partitions_[b].debug_info();
   traces = LoadTracesFromDebugInfo(b_debug_info);
   const auto& b1_it = traces.find("B1");
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index e2fe533ce4b238..e29d2d92d4c597 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -108,7 +108,7 @@ NodeBuilder& NodeBuilder::Device(absl::string_view device_spec) {
 }
 
 NodeBuilder& NodeBuilder::AssignedDevice(absl::string_view device) {
-  assigned_device_ = string(device);
+  assigned_device_ = std::string(device);
   return *this;
 }
 
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index 6f249371606b3e..476393cae8166b 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -67,8 +67,8 @@ class NodeBuilder {
     // * a nullptr Node* was passed to the NodeOut constructor, or
     // * an out-of-range index was passed to the NodeOut constructor.
     bool error;
-    string name;
-    int32 index;
+    std::string name;
+    int32_t index;
     DataType dt;
   };
 
@@ -132,7 +132,7 @@ class NodeBuilder {
   absl::StatusOr<Node*> Finalize(Graph* graph, bool consume = false);
 
   // Accessors for the values set in the constructor.
-  const string& node_name() const { return def_builder_.node_name(); }
+  const std::string& node_name() const { return def_builder_.node_name(); }
   const OpDef& op_def() const { return def_builder_.op_def(); }
 
  private:
@@ -157,8 +157,8 @@ class NodeBuilder {
   const OpRegistryInterface* op_registry_;
   std::vector<NodeOut> inputs_;
   std::vector<Node*> control_inputs_;
-  std::vector<string> errors_;
-  string assigned_device_;
+  std::vector<std::string> errors_;
+  std::string assigned_device_;
 };
 
 // IMPLEMENTATION -------------------------------------------------------------
diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc
index 94b4cabb2fd884..bac15370ae039e 100644
--- a/tensorflow/core/graph/optimizer_cse_test.cc
+++ b/tensorflow/core/graph/optimizer_cse_test.cc
@@ -36,7 +36,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static void InitGraph(const string& s, Graph* graph) {
+static void InitGraph(const std::string& s, Graph* graph) {
   GraphDef graph_def;
 
   auto parser = protobuf::TextFormat::Parser();
@@ -50,14 +50,14 @@ class OptimizerCSETest : public ::testing::Test {
  public:
   OptimizerCSETest() : graph_(OpRegistry::Global()) {}
 
-  void InitGraph(const string& s) {
+  void InitGraph(const std::string& s) {
     ::tensorflow::InitGraph(s, &graph_);
     original_ = CanonicalGraphString(&graph_);
   }
 
   static bool IncludeNode(const Node* n) { return n->IsOp(); }
 
-  static string EdgeId(const Node* n, int index) {
+  static std::string EdgeId(const Node* n, int index) {
     if (index == 0) {
       return n->name();
     } else if (index == Graph::kControlSlot) {
@@ -67,9 +67,9 @@ class OptimizerCSETest : public ::testing::Test {
     }
   }
 
-  string CanonicalGraphString(Graph* g) {
-    std::vector<string> nodes;
-    std::vector<string> edges;
+  std::string CanonicalGraphString(Graph* g) {
+    std::vector<std::string> nodes;
+    std::vector<std::string> edges;
     for (const Node* n : g->nodes()) {
       if (IncludeNode(n)) {
         nodes.push_back(absl::StrCat(n->name(), "(", n->type_string(), ")"));
@@ -88,21 +88,22 @@ class OptimizerCSETest : public ::testing::Test {
                         absl::StrJoin(edges, ";"));
   }
 
-  string DoCSE(const std::function<bool(const Node*)>& consider_fn = nullptr) {
-    string before = CanonicalGraphString(&graph_);
+  std::string DoCSE(
+      const std::function<bool(const Node*)>& consider_fn = nullptr) {
+    std::string before = CanonicalGraphString(&graph_);
     LOG(ERROR) << "Before rewrites: " << before;
 
     OptimizeCSE(&graph_, consider_fn);
 
-    string result = CanonicalGraphString(&graph_);
+    std::string result = CanonicalGraphString(&graph_);
     LOG(ERROR) << "After rewrites:  " << result;
     return result;
   }
 
-  const string& OriginalGraph() const { return original_; }
+  const std::string& OriginalGraph() const { return original_; }
 
   Graph graph_;
-  string original_;
+  std::string original_;
 };
 
 REGISTER_OP("Input").Output("o: float").SetIsStateful();
@@ -339,8 +340,8 @@ TEST_F(OptimizerCSETest, Constant_Dedup) {
   EXPECT_EQ(OriginalGraph(),
             "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const);"
             "n/_4(Const);n/_5(Const);n/_6(Const);n/_7(Const)|");
-  std::vector<string> nodes = str_util::Split(DoCSE(), ";|");
-  std::set<string> node_set(nodes.begin(), nodes.end());
+  std::vector<std::string> nodes = str_util::Split(DoCSE(), ";|");
+  std::set<std::string> node_set(nodes.begin(), nodes.end());
   // Expect exactly one of each type of node to be retained after CSE.
   EXPECT_EQ(node_set.count("n/_0(Const)") + node_set.count("n/_7(Const)"), 1);
   EXPECT_EQ(node_set.count("n/_1(Const)") + node_set.count("n/_6(Const)"), 1);
@@ -350,14 +351,14 @@ TEST_F(OptimizerCSETest, Constant_Dedup) {
 
 void BM_CSE(::testing::benchmark::State& state) {
   const int op_nodes = state.range(0);
-  string s;
+  std::string s;
   for (int in = 0; in < 10; in++) {
-    s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in);
+    s += absl::StrFormat("node { name: 'in%04d' op: 'Input'}", in);
   }
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
   for (int op = 0; op < op_nodes; op++) {
-    s += strings::Printf(
+    s += absl::StrFormat(
         "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { "
         "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }",
         op, rnd.Uniform(10), rnd.Uniform(10));
diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc
index 1d03877c02583c..697defb2ef2558 100644
--- a/tensorflow/core/graph/subgraph.cc
+++ b/tensorflow/core/graph/subgraph.cc
@@ -61,7 +61,7 @@ absl::Status FeedInputs(
   out_feed_types->clear();
   out_feed_types->reserve(feed_rewrites.size());
   for (size_t i = 0; i < feed_rewrites.size(); ++i) {
-    const string& t = feed_rewrites[i]->endpoint_name();
+    const std::string& t = feed_rewrites[i]->endpoint_name();
     TensorId id(ParseTensorName(t));
 
     auto iter = name_index->find(id.first);
@@ -127,7 +127,7 @@ absl::Status FetchOutputs(
   out_fetch_nodes->clear();
   out_fetch_nodes->reserve(fetch_rewrites.size());
   for (size_t i = 0; i < fetch_rewrites.size(); ++i) {
-    const string& t = fetch_rewrites[i]->endpoint_name();
+    const std::string& t = fetch_rewrites[i]->endpoint_name();
 
     // Parse t into node_name and output_index.
     TensorId id(ParseTensorName(t));
@@ -174,7 +174,7 @@ absl::Status FetchOutputs(
   return absl::OkStatus();
 }
 
-bool AddNodeToTargets(const string& node_or_tensor_name,
+bool AddNodeToTargets(const std::string& node_or_tensor_name,
                       const NameIndex& name_index,
                       std::unordered_set<const Node*>* targets) {
   TensorId id = ParseTensorName(node_or_tensor_name);
@@ -188,17 +188,18 @@ bool AddNodeToTargets(const string& node_or_tensor_name,
   return true;
 }
 
-absl::Status PruneForTargets(Graph* g, const NameIndex& name_index,
-                             const std::vector<Node*>& fetch_nodes,
-                             const absl::Span<const string>& target_nodes) {
-  string not_found;
+absl::Status PruneForTargets(
+    Graph* g, const NameIndex& name_index,
+    const std::vector<Node*>& fetch_nodes,
+    const absl::Span<const std::string>& target_nodes) {
+  std::string not_found;
   std::unordered_set<const Node*> targets;
   for (Node* n : fetch_nodes) {
     if (!AddNodeToTargets(n->name(), name_index, &targets)) {
       absl::StrAppend(&not_found, n->name(), " ");
     }
   }
-  for (const string& s : target_nodes) {
+  for (const std::string& s : target_nodes) {
     if (!AddNodeToTargets(s, name_index, &targets)) {
       absl::StrAppend(&not_found, s, " ");
     }
@@ -295,9 +296,9 @@ absl::Status SendFetchRewrite::AddNode(Graph* g,
 }
 
 absl::Status RewriteGraphForExecution(
-    Graph* g, const absl::Span<const string>& fed_outputs,
-    const absl::Span<const string>& fetch_outputs,
-    const absl::Span<const string>& target_node_names,
+    Graph* g, const absl::Span<const std::string>& fed_outputs,
+    const absl::Span<const std::string>& fetch_outputs,
+    const absl::Span<const std::string>& target_node_names,
     const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata) {
   std::vector<std::unique_ptr<PruneRewrite>> feed_rewrites;
@@ -305,10 +306,10 @@ absl::Status RewriteGraphForExecution(
   if (use_function_convention) {
     for (size_t i = 0; i < fed_outputs.size(); ++i) {
       feed_rewrites.emplace_back(new ArgFeedRewrite(
-          &fed_outputs[i], &device_info, static_cast<int32>(i)));
+          &fed_outputs[i], &device_info, static_cast<int32_t>(i)));
     }
   } else {
-    for (const string& fed_output : fed_outputs) {
+    for (const std::string& fed_output : fed_outputs) {
       feed_rewrites.emplace_back(
           new RecvFeedRewrite(&fed_output, &device_info));
     }
@@ -319,10 +320,10 @@ absl::Status RewriteGraphForExecution(
   if (use_function_convention) {
     for (size_t i = 0; i < fetch_outputs.size(); ++i) {
       fetch_rewrites.emplace_back(new RetvalFetchRewrite(
-          &fetch_outputs[i], &device_info, static_cast<int32>(i)));
+          &fetch_outputs[i], &device_info, static_cast<int32_t>(i)));
     }
   } else {
-    for (const string& fetch_output : fetch_outputs) {
+    for (const std::string& fetch_output : fetch_outputs) {
       fetch_rewrites.emplace_back(
           new SendFetchRewrite(&fetch_output, &device_info));
     }
@@ -334,22 +335,22 @@ absl::Status RewriteGraphForExecution(
 
 namespace {
 template <typename StringContainer>
-std::vector<string> ConvertToVector(StringContainer field) {
-  return std::vector<string>(field.begin(), field.end());
+std::vector<std::string> ConvertToVector(StringContainer field) {
+  return std::vector<std::string>(field.begin(), field.end());
 }
 }  // namespace
 
 absl::Status RewriteGraphForExecution(
     Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
     const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
-    const absl::Span<const string>& target_node_names,
+    const absl::Span<const std::string>& target_node_names,
     RewriteGraphMetadata* out_metadata) {
   if (fetch_rewrites.empty() && target_node_names.empty()) {
     return errors::InvalidArgument(
         "Must specify at least one target to fetch or execute.");
   }
 
-  std::unordered_set<string> endpoints;
+  std::unordered_set<std::string> endpoints;
   for (const auto& feed_rewrite : feed_rewrites) {
     auto result = endpoints.insert(feed_rewrite->endpoint_name());
     if (!result.second) {
diff --git a/tensorflow/core/graph/subgraph.h b/tensorflow/core/graph/subgraph.h
index 37013b8f7d09ee..c8843a37d58fa9 100644
--- a/tensorflow/core/graph/subgraph.h
+++ b/tensorflow/core/graph/subgraph.h
@@ -50,7 +50,8 @@ struct RewriteGraphMetadata {
 class PruneRewrite {
  public:
   // `endpoint_name` and `device_info` must outlive this object.
-  PruneRewrite(const string* endpoint_name, const DeviceAttributes* device_info)
+  PruneRewrite(const std::string* endpoint_name,
+               const DeviceAttributes* device_info)
       : endpoint_name_(endpoint_name), device_info_(device_info) {}
   virtual ~PruneRewrite() {}
 
@@ -60,14 +61,14 @@ class PruneRewrite {
                                Node** out_node) = 0;
 
   // Returns the name of the tensor to which this rewrite applies.
-  const string& endpoint_name() { return *endpoint_name_; }
+  const std::string& endpoint_name() { return *endpoint_name_; }
 
  protected:
   // The device on which the new node will be created.
   const DeviceAttributes& device_info() { return *device_info_; }
 
  private:
-  const string* const endpoint_name_;          // Not owned.
+  const std::string* const endpoint_name_;     // Not owned.
   const DeviceAttributes* const device_info_;  // Not owned.
 };
 
@@ -98,9 +99,9 @@ class PruneRewrite {
 //    - fetch output "node:output_index" does not exist in "*g"
 //    - target node "node" does not exist in "*g"
 absl::Status RewriteGraphForExecution(
-    Graph* g, const absl::Span<const string>& fed_outputs,
-    const absl::Span<const string>& fetch_outputs,
-    const absl::Span<const string>& target_node_names,
+    Graph* g, const absl::Span<const std::string>& fed_outputs,
+    const absl::Span<const std::string>& fetch_outputs,
+    const absl::Span<const std::string>& target_node_names,
     const DeviceAttributes& device_info, bool use_function_convention,
     RewriteGraphMetadata* out_metadata);
 
@@ -109,7 +110,7 @@ absl::Status RewriteGraphForExecution(
 absl::Status RewriteGraphForExecution(
     Graph* g, const std::vector<std::unique_ptr<PruneRewrite>>& feed_rewrites,
     const std::vector<std::unique_ptr<PruneRewrite>>& fetch_rewrites,
-    const absl::Span<const string>& target_node_names,
+    const absl::Span<const std::string>& target_node_names,
     RewriteGraphMetadata* out_metadata);
 
 /////////////////////////////////////////////////////////
@@ -119,14 +120,14 @@ absl::Status RewriteGraphForExecution(
 // A rewrite action that adds an _Arg node for a fed tensor.
 class ArgFeedRewrite : public PruneRewrite {
  public:
-  ArgFeedRewrite(const string* endpoint_name,
+  ArgFeedRewrite(const std::string* endpoint_name,
                  const DeviceAttributes* device_info, int32_t arg_index)
       : PruneRewrite(endpoint_name, device_info), arg_index_(arg_index) {}
   absl::Status AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor,
                        Node** out_node) override;
 
  private:
-  const int32 arg_index_;
+  const int32_t arg_index_;
 };
 
 // A rewrite action that adds a client-terminated _Recv node for a fed tensor.
@@ -140,14 +141,14 @@ class RecvFeedRewrite : public PruneRewrite {
 // A rewrite action that adds a _Retval node for a fetched tensor.
 class RetvalFetchRewrite : public PruneRewrite {
  public:
-  RetvalFetchRewrite(const string* endpoint_name,
+  RetvalFetchRewrite(const std::string* endpoint_name,
                      const DeviceAttributes* device_info, int32_t retval_index)
       : PruneRewrite(endpoint_name, device_info), retval_index_(retval_index) {}
   absl::Status AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor,
                        Node** out_node) override;
 
  private:
-  const int32 retval_index_;
+  const int32_t retval_index_;
 };
 
 // A rewrite action that adds a client-terminated _Send node for a
diff --git a/tensorflow/core/graph/tensor_id.h b/tensorflow/core/graph/tensor_id.h
index 31b30fa14af463..30caf3857e303c 100644
--- a/tensorflow/core/graph/tensor_id.h
+++ b/tensorflow/core/graph/tensor_id.h
@@ -44,7 +44,7 @@ struct TensorId : public std::pair<absl::string_view, int> {
   const absl::string_view node() const { return first; }
   int index() const { return second; }
 
-  string ToString() const {
+  std::string ToString() const {
     if (second == Graph::kControlSlot) return absl::StrCat("^", first);
     return absl::StrCat(first, ":", second);
   }
@@ -63,19 +63,19 @@ bool IsTensorIdControl(const TensorId& tensor_id);
 
 // Same as TensorId, except owns the backing storage for the op name. This makes
 // the memory management simpler at the expense of a copy.
-struct SafeTensorId : public std::pair<string, int> {
-  typedef std::pair<string, int> Base;
+struct SafeTensorId : public std::pair<std::string, int> {
+  typedef std::pair<std::string, int> Base;
 
   // NOTE(skyewm): this is required on some platforms. I'm not sure why the
   // using "using Base::pair;" isn't always sufficient.
   SafeTensorId() : Base() {}
-  SafeTensorId(const string& str, int idx) : Base(str, idx) {}
+  SafeTensorId(const std::string& str, int idx) : Base(str, idx) {}
   SafeTensorId(const TensorId& id);
 
-  const string& node() const { return first; }
+  const std::string& node() const { return first; }
   int index() const { return second; }
 
-  string ToString() const {
+  std::string ToString() const {
     if (second == Graph::kControlSlot) return absl::StrCat("^", first);
     return absl::StrCat(first, ":", second);
   }
diff --git a/tensorflow/core/graph/tensor_id_test.cc b/tensorflow/core/graph/tensor_id_test.cc
index 15bffd170642c8..4bec9298680b78 100644
--- a/tensorflow/core/graph/tensor_id_test.cc
+++ b/tensorflow/core/graph/tensor_id_test.cc
@@ -23,7 +23,9 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-string ParseHelper(const string& n) { return ParseTensorName(n).ToString(); }
+std::string ParseHelper(const std::string& n) {
+  return ParseTensorName(n).ToString();
+}
 
 TEST(TensorIdTest, ParseTensorName) {
   EXPECT_EQ(ParseHelper("W1"), "W1:0");
@@ -35,8 +37,8 @@ TEST(TensorIdTest, ParseTensorName) {
   EXPECT_EQ(ParseHelper("^foo"), "^foo");
 }
 
-uint32 Skewed(random::SimplePhilox* rnd, int max_log) {
-  const uint32 space = 1 << (rnd->Rand32() % (max_log + 1));
+uint32_t Skewed(random::SimplePhilox* rnd, int max_log) {
+  const uint32_t space = 1 << (rnd->Rand32() % (max_log + 1));
   return rnd->Rand32() % space;
 }
 
@@ -44,9 +46,9 @@ void BM_ParseTensorName(::testing::benchmark::State& state) {
   const int arg = state.range(0);
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
-  std::vector<string> names;
+  std::vector<std::string> names;
   for (int i = 0; i < 100; i++) {
-    string name;
+    std::string name;
     switch (arg) {
       case 0: {  // Generate random names
         size_t len = Skewed(&rnd, 4);
@@ -92,7 +94,7 @@ void BM_ParseTensorName(::testing::benchmark::State& state) {
 BENCHMARK(BM_ParseTensorName)->Arg(0)->Arg(1)->Arg(2)->Arg(3)->Arg(4)->Arg(5);
 
 TEST(TensorIdTest, IsTensorIdControl) {
-  string input = "^foo";
+  std::string input = "^foo";
   TensorId tensor_id = ParseTensorName(input);
   EXPECT_TRUE(IsTensorIdControl(tensor_id));
 
@@ -106,7 +108,7 @@ TEST(TensorIdTest, IsTensorIdControl) {
 }
 
 TEST(TensorIdTest, PortZero) {
-  for (string input : {"foo", "foo:0"}) {
+  for (std::string input : {"foo", "foo:0"}) {
     TensorId tensor_id = ParseTensorName(input);
     EXPECT_EQ("foo", tensor_id.node());
     EXPECT_EQ(0, tensor_id.index());
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index f83bf238cde9d1..b882361aa8093e 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -32,8 +32,9 @@ namespace tensorflow {
 namespace test {
 namespace graph {
 
-Node* Send(Graph* g, Node* input, const string& tensor, const string& sender,
-           const uint64 sender_incarnation, const string& receiver) {
+Node* Send(Graph* g, Node* input, const std::string& tensor,
+           const std::string& sender, const uint64_t sender_incarnation,
+           const std::string& receiver) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "_Send")
                   .Input(input, 0)
@@ -46,9 +47,9 @@ Node* Send(Graph* g, Node* input, const string& tensor, const string& sender,
   return ret;
 }
 
-Node* Recv(Graph* g, const string& tensor, const string& type,
-           const string& sender, const uint64 sender_incarnation,
-           const string& receiver) {
+Node* Recv(Graph* g, const std::string& tensor, const std::string& type,
+           const std::string& sender, const uint64_t sender_incarnation,
+           const std::string& receiver) {
   Node* ret;
   DataType dtype;
   CHECK(DataTypeFromString(type, &dtype));
@@ -72,7 +73,7 @@ Node* Constant(Graph* g, const Tensor& tensor) {
   return ret;
 }
 
-Node* Constant(Graph* g, const Tensor& tensor, const string& name) {
+Node* Constant(Graph* g, const Tensor& tensor, const std::string& name) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(name, "Const")
                   .Attr("dtype", tensor.dtype())
@@ -85,7 +86,7 @@ Node* HostConstant(Graph* g, const Tensor& tensor) {
   return HostConstant(g, tensor, g->NewName("n"));
 }
 
-Node* HostConstant(Graph* g, const Tensor& tensor, const string& name) {
+Node* HostConstant(Graph* g, const Tensor& tensor, const std::string& name) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(name, "HostConst")
                   .Attr("dtype", tensor.dtype())
@@ -104,7 +105,7 @@ Node* Var(Graph* g, const DataType dtype, const TensorShape& shape) {
 }
 
 Node* Var(Graph* g, const DataType dtype, const TensorShape& shape,
-          const string& name) {
+          const std::string& name) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(name, "Variable")
                   .Attr("dtype", dtype)
@@ -134,7 +135,7 @@ Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive, bool reverse) {
   return ret;
 }
 
-Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
+Node* Reduce(Graph* g, const std::string& reduce, Node* data, Node* axes,
              bool keep_dims) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), reduce, g->op_registry())
@@ -179,7 +180,7 @@ Node* BatchMatmul(Graph* g, Node* in0, Node* in1, bool adj_x, bool adj_y) {
   return ret;
 }
 
-Node* RandomNumberGenerator(const string& op, Graph* g, Node* input,
+Node* RandomNumberGenerator(const std::string& op, Graph* g, Node* input,
                             DataType dtype) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), op, g->op_registry())
@@ -222,7 +223,7 @@ Node* RandomPoisson(Graph* g, Node* shape, Node* lam) {
   return ret;
 }
 
-Node* Unary(Graph* g, const string& func, Node* input, int index) {
+Node* Unary(Graph* g, const std::string& func, Node* input, int index) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), func, g->op_registry())
                   .Input(input, index)
@@ -230,7 +231,7 @@ Node* Unary(Graph* g, const string& func, Node* input, int index) {
   return ret;
 }
 
-Node* Binary(Graph* g, const string& func, Node* in0, Node* in1) {
+Node* Binary(Graph* g, const std::string& func, Node* in0, Node* in1) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), func, g->op_registry())
                   .Input(in0)
@@ -239,7 +240,7 @@ Node* Binary(Graph* g, const string& func, Node* in0, Node* in1) {
   return ret;
 }
 
-Node* Multi(Graph* g, const string& func, absl::Span<Node* const> ins) {
+Node* Multi(Graph* g, const std::string& func, absl::Span<Node* const> ins) {
   Node* ret;
   auto b = NodeBuilder(g->NewName("n"), func, g->op_registry());
   for (Node* n : ins) b = b.Input(n);
@@ -271,7 +272,7 @@ Node* Roll(Graph* g, Node* input, Node* shift, Node* axis) {
   return ret;
 }
 
-Node* Error(Graph* g, Node* input, const string& errmsg, bool log_error) {
+Node* Error(Graph* g, Node* input, const std::string& errmsg, bool log_error) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Error")
                   .Input(input)
@@ -317,7 +318,7 @@ Node* Switch(Graph* g, Node* in0, Node* in1) {
   return ret;
 }
 
-Node* Enter(Graph* g, Node* input, const string& frame_name) {
+Node* Enter(Graph* g, Node* input, const std::string& frame_name) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Enter")
                   .Input(input)
@@ -341,11 +342,11 @@ Node* Merge(Graph* g, Node* in0, Node* in1) {
   return ret;
 }
 
-Node* Merge(Graph* g, Node* in0, absl::Span<const string> remaining_in) {
+Node* Merge(Graph* g, Node* in0, absl::Span<const std::string> remaining_in) {
   std::vector<NodeBuilder::NodeOut> inputs;
   inputs.reserve(remaining_in.size() + 1);
   inputs.emplace_back(in0);
-  for (const string& in_name : remaining_in) {
+  for (const std::string& in_name : remaining_in) {
     inputs.emplace_back(in_name, 0, inputs[0].dt);
   }
 
@@ -383,7 +384,7 @@ Node* ConcatV2(Graph* g, absl::Span<Node* const> tensors, Node* concat_dim) {
   return ret;
 }
 
-Node* Next(Graph* g, const string& name, Node* input) {
+Node* Next(Graph* g, const std::string& name, Node* input) {
   Node* ret;
   TF_CHECK_OK(
       NodeBuilder(name, "NextIteration").Input(input).Finalize(g, &ret));
@@ -497,7 +498,7 @@ Node* DiagPart(Graph* g, Node* in, DataType type) {
   return ret;
 }
 
-Node* CheckNumerics(Graph* g, Node* in, const string& message) {
+Node* CheckNumerics(Graph* g, Node* in, const std::string& message) {
   Node* ret;
   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "CheckNumerics")
                   .Input(in)
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index df7843f884b17d..f4df5a4ed4d038 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -39,7 +39,7 @@ void ToGraphDef(Graph* g, GraphDef* def);
 
 // Adds a node in "g" producing a constant "tensor".
 Node* Constant(Graph* g, const Tensor& tensor);
-Node* Constant(Graph* g, const Tensor& tensor, const string& name);
+Node* Constant(Graph* g, const Tensor& tensor, const std::string& name);
 
 // Adds a node in "g" producing a constant "tensor" on the host.
 // The given node which, unlike the regular Constant above, always
@@ -47,26 +47,27 @@ Node* Constant(Graph* g, const Tensor& tensor, const string& name);
 // in GPU tests where the test Op in question runs on the device
 // but requires some arguments to be pinned to the host.
 Node* HostConstant(Graph* g, const Tensor& tensor);
-Node* HostConstant(Graph* g, const Tensor& tensor, const string& name);
+Node* HostConstant(Graph* g, const Tensor& tensor, const std::string& name);
 
 // Adds a variable in "g" of the given "shape" and "dtype".
 Node* Var(Graph* g, DataType dtype, const TensorShape& shape);
 Node* Var(Graph* g, DataType dtype, const TensorShape& shape,
-          const string& name);
+          const std::string& name);
 
 // Adds an assign node in "g" which assigns "val" into "var".
 Node* Assign(Graph* g, Node* var, Node* val);
 
 // Adds a send node "g" sending "input" as a named "tensor" from
 // "sender" to "receiver".
-Node* Send(Graph* g, Node* input, const string& tensor, const string& sender,
-           uint64 sender_incarnation, const string& receiver);
+Node* Send(Graph* g, Node* input, const std::string& tensor,
+           const std::string& sender, uint64_t sender_incarnation,
+           const std::string& receiver);
 
 // Adds a recv node in "g" receiving a named "tensor" from "sender"
 // to "receiver".
-Node* Recv(Graph* g, const string& tensor, const string& type,
-           const string& sender, uint64 sender_incarnation,
-           const string& receiver);
+Node* Recv(Graph* g, const std::string& tensor, const std::string& type,
+           const std::string& sender, uint64_t sender_incarnation,
+           const std::string& receiver);
 
 // Adds a cumsum "node" in "g" doing cumsum(data, axes).
 Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false,
@@ -74,7 +75,7 @@ Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false,
 
 // Adds a reduction "node" in "g" doing sum(data, axes).  "reduce" is
 // a reduction, e.g., Sum, Max, Min, Mean, etc.
-Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
+Node* Reduce(Graph* g, const std::string& reduce, Node* data, Node* axes,
              bool keep_dims = false);
 
 // Adds a Matmul node in g doing in0.contract(in1).
@@ -89,17 +90,17 @@ Node* BatchMatmul(Graph* g, Node* in0, Node* in1, bool adj_x, bool adj_y);
 Node* QuantizeToUINT8(Graph* g, Node* data);
 
 // Adds a unary function "func" "node" in "g" taking "input".
-Node* Unary(Graph* g, const string& func, Node* input, int index = 0);
+Node* Unary(Graph* g, const std::string& func, Node* input, int index = 0);
 
 // Adds an identity node in "g" taking "input" and producing an
 // identity copy.
 Node* Identity(Graph* g, Node* input, int index = 0);
 
 // Adds a binary function "func" node in "g" taking "in0" and "in1".
-Node* Binary(Graph* g, const string& func, Node* in0, Node* in1);
+Node* Binary(Graph* g, const std::string& func, Node* in0, Node* in1);
 
 // Adds a function "func" node in "g" taking inputs "ins".
-Node* Multi(Graph* g, const string& func, absl::Span<Node* const> ins);
+Node* Multi(Graph* g, const std::string& func, absl::Span<Node* const> ins);
 
 // Adds a binary add node in "g" doing in0 + in1.
 Node* Add(Graph* g, Node* in0, Node* in1);
@@ -131,7 +132,7 @@ Node* TruncatedNormal(Graph* g, Node* input, DataType dtype);
 
 // Adds an error node in "g". The node's computation always
 // generates an error with the given error message "errmsg".
-Node* Error(Graph* g, Node* input, const string& errmsg,
+Node* Error(Graph* g, Node* input, const std::string& errmsg,
             bool log_error = false);
 
 // Adds a node that generates a invalid ref output.
@@ -150,7 +151,7 @@ Node* NoOp(Graph* g, const std::vector<Node*>& control_inputs);
 Node* Switch(Graph* g, Node* in0, Node* in1);
 
 // Adds an Enter node in "g", which enters a new frame.
-Node* Enter(Graph* g, Node* input, const string& frame_name);
+Node* Enter(Graph* g, Node* input, const std::string& frame_name);
 
 // Adds an Exit node in "g", which exits a frame.
 Node* Exit(Graph* g, Node* input);
@@ -160,11 +161,11 @@ Node* Merge(Graph* g, Node* in0, Node* in1);
 
 // Adds a Merge node in "g". The first input is "in0", the remaining
 // inputs are only given by their names in remaining_in.
-Node* Merge(Graph* g, Node* in0, absl::Span<const string> remaining_in);
+Node* Merge(Graph* g, Node* in0, absl::Span<const std::string> remaining_in);
 
 // Adds a NextIteration node in "g", which makes its input available
 // to the next iteration.
-Node* Next(Graph* g, const string& name, Node* input);
+Node* Next(Graph* g, const std::string& name, Node* input);
 
 // Adds a LoopCond node in "g", representing the "pivot" termination
 // condition of a loop.
@@ -215,7 +216,7 @@ Node* Diag(Graph* g, Node* in, DataType type);
 Node* DiagPart(Graph* g, Node* in, DataType type);
 
 // Add a CheckNumerics node in "g".
-Node* CheckNumerics(Graph* g, Node* in, const string& message);
+Node* CheckNumerics(Graph* g, Node* in, const std::string& message);
 
 // Add an _Arg node in "g".
 Node* Arg(Graph* g, int64_t index, DataType type);
diff --git a/tensorflow/core/graph/validate.cc b/tensorflow/core/graph/validate.cc
index 154d9f26c80cf5..4572ceb9de7897 100644
--- a/tensorflow/core/graph/validate.cc
+++ b/tensorflow/core/graph/validate.cc
@@ -100,7 +100,7 @@ absl::Status ValidateGraphHasNoCycle(const Graph& graph) {
   }
 
   if (processed < graph.num_nodes()) {
-    std::vector<string> nodes_in_cycle;
+    std::vector<std::string> nodes_in_cycle;
     for (int i = 0; i < pending_count.size() && nodes_in_cycle.size() < 3;
          ++i) {
       if (pending_count[i] != 0) {
diff --git a/tensorflow/core/graph/validate_test.cc b/tensorflow/core/graph/validate_test.cc
index b593a2c9b63c7e..35e7ebb4cff6e0 100644
--- a/tensorflow/core/graph/validate_test.cc
+++ b/tensorflow/core/graph/validate_test.cc
@@ -38,7 +38,7 @@ REGISTER_OP("FloatInput").Output("o: float");
 REGISTER_OP("Int32Input").Output("o: int32");
 
 TEST(ValidateGraphDefTest, TestValidGraph) {
-  const string graph_def_str =
+  const std::string graph_def_str =
       "node { name: 'A' op: 'FloatInput' }"
       "node { name: 'B' op: 'FloatInput' }"
       "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
@@ -50,7 +50,7 @@ TEST(ValidateGraphDefTest, TestValidGraph) {
 }
 
 TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) {
-  const string graph_def_str =
+  const std::string graph_def_str =
       "node { name: 'A' op: 'FloatInput' }"
       "node { name: 'B' op: 'Int32Input' }"
       "node { "
@@ -74,7 +74,7 @@ TEST(ValidateGraphDefTest, GraphWithUnspecifiedDefaultAttr) {
 
 TEST(ValidateGraphDefTest, GraphWithUnspecifiedRequiredAttr) {
   // "DstT" attribute is missing.
-  const string graph_def_str =
+  const std::string graph_def_str =
       "node { name: 'A' op: 'FloatInput' }"
       "node { "
       "       name: 'B' op: 'Cast' "
@@ -102,7 +102,7 @@ TEST(ValidateGraphDefAgainstOpListTest, GraphWithOpOnlyInOpList) {
   TF_ASSERT_OK(OpDefBuilder("UniqueSnowflake").Finalize(&op_reg_data));
   OpList op_list;
   *op_list.add_op() = op_reg_data.op_def;
-  const string graph_def_str = "node { name: 'A' op: 'UniqueSnowflake' }";
+  const std::string graph_def_str = "node { name: 'A' op: 'UniqueSnowflake' }";
   GraphDef graph_def;
   auto parser = protobuf::TextFormat::Parser();
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
@@ -114,7 +114,7 @@ TEST(ValidateGraphDefAgainstOpListTest, GraphWithGlobalOpNotInOpList) {
   TF_ASSERT_OK(OpDefBuilder("NotAnywhere").Finalize(&op_reg_data));
   OpList op_list;
   *op_list.add_op() = op_reg_data.op_def;
-  const string graph_def_str = "node { name: 'A' op: 'FloatInput' }";
+  const std::string graph_def_str = "node { name: 'A' op: 'FloatInput' }";
   GraphDef graph_def;
   auto parser = protobuf::TextFormat::Parser();
   CHECK(parser.MergeFromString(graph_def_str, &graph_def)) << graph_def_str;
@@ -150,7 +150,7 @@ TEST(GetOpListForValidationTest, ShouldStripDocs) {
 }
 
 TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) {
-  const string graph_def_str =
+  const std::string graph_def_str =
       "node { name: 'A' op: 'FloatInput' }"
       "node { name: 'B' op: 'Int32Input' }"
       "node { "
@@ -165,7 +165,7 @@ TEST(VerifyNoDuplicateNodeNames, NoDuplicateNodeNames) {
 }
 
 TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) {
-  const string graph_def_str =
+  const std::string graph_def_str =
       "node { name: 'A' op: 'FloatInput' }"
       "node { name: 'A' op: 'Int32Input' }"
       "node { "
@@ -181,7 +181,7 @@ TEST(VerifyNoDuplicateNodeNames, DuplicateNodeNames) {
 }
 
 TEST(ValidateGraphHasNoCycleTest, NoCyclePasses) {
-  const string graph_def_str =
+  const std::string graph_def_str =
       "node { name: 'A' op: 'FloatInput' }"
       "node { name: 'B' op: 'FloatInput' }"
       "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
@@ -198,7 +198,7 @@ TEST(ValidateGraphHasNoCycleTest, NoCyclePasses) {
 }
 
 TEST(ValidateGraphHasNoCycleTest, NoCycleWithMergePasses) {
-  const string graph_def_str =
+  const std::string graph_def_str =
       R"EOF(
       node { name: 'A' op: 'FloatInput' }
       node { name: 'merge' op: 'Merge' input: [ 'A:0', 'next:0' ]
@@ -221,8 +221,8 @@ TEST(ValidateGraphHasNoCycleTest, NoCycleWithMergePasses) {
   TF_EXPECT_OK(graph::ValidateGraphHasNoCycle(graph));
 }
 
-Node* AddNodeFromNodeDef(Graph& graph, const string& name,
-                         const string& node_type, int num_inputs) {
+Node* AddNodeFromNodeDef(Graph& graph, const std::string& name,
+                         const std::string& node_type, int num_inputs) {
   auto builder = NodeDefBuilder(name, node_type);
   for (int i = 0; i < num_inputs; ++i) {
     builder = builder.Input(absl::StrCat("node_", i), i, DT_FLOAT);
diff --git a/tensorflow/core/graph/while_context.h b/tensorflow/core/graph/while_context.h
index e23e9df90afd2d..4f15b7d37c7b18 100644
--- a/tensorflow/core/graph/while_context.h
+++ b/tensorflow/core/graph/while_context.h
@@ -39,7 +39,7 @@ class WhileContext {
                std::vector<OutputTensor> body_inputs,
                std::vector<OutputTensor> body_outputs);
 
-  const string& frame_name() const { return frame_name_; }
+  const std::string& frame_name() const { return frame_name_; }
   const std::vector<Node*>& enter_nodes() const { return enter_nodes_; }
   const std::vector<Node*>& exit_nodes() const { return exit_nodes_; }
   const OutputTensor& cond_output() const { return cond_output_; }
@@ -53,7 +53,7 @@ class WhileContext {
   // uniquely identified by its frame name. Frames are used by the executor to
   // manage the iterations of a loop. See the FrameState comment in
   // core/common_runtime/executor.cc for more details.
-  const string frame_name_;
+  const std::string frame_name_;
 
   // The enter nodes defining the input loop variables to the while loop. This
   // vector defines the order of the loop variables.

From 0be4d53ad546022f396792d269f779534c6231fa Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:17:12 -0800
Subject: [PATCH 396/753] Automated Code Change

PiperOrigin-RevId: 845648351
---
 tensorflow/compiler/jit/kernels/xla_ops.cc | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index 325f79b95e3a5e..54d6276c05cc32 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -166,7 +166,7 @@ class ExecutableClosureStore {
  public:
   ExecutableClosureStore() : key_counter_(0) {}
 
-  using KeyT = string;
+  using KeyT = std::string;
 
   KeyT Produce(ExecutableClosure<ExecutableType, ClientType> result) {
     mutex_lock l(mutex_);
@@ -217,7 +217,8 @@ se::Stream* GetStream(OpKernelContext* ctx) {
 
 XlaComputationLaunchContext GetLaunchContext(
     const XlaPlatformInfo& platform_info, OpKernelContext* ctx,
-    xla::LocalClient* client, se::DeviceMemoryAllocator* allocator) {
+    xla::LocalClient* client,
+    stream_executor::DeviceAddressAllocator* allocator) {
   se::Stream* stream = GetStream(ctx);
   int device_ordinal = stream ? stream->parent()->device_ordinal()
                               : client->default_device_ordinal();
@@ -230,7 +231,7 @@ XlaComputationLaunchContext GetLaunchContext(
 
 absl::Status GetTaskName(const absl::string_view device_name,
                          std::string* task_name) {
-  string ignored;
+  std::string ignored;
   if (!DeviceNameUtils::SplitDeviceName(device_name, task_name, &ignored)) {
     return errors::InvalidArgument("Unable to parse device name: ",
                                    device_name);
@@ -246,7 +247,7 @@ xla::SendDeviceMemoryFunction GetSendDeviceMemoryFunction(
   return
       [ctx, program_key](
           int64_t channel_id, se::Stream* stream, const xla::Shape& shape,
-          const se::DeviceMemoryBase& device_memory_base,
+          const stream_executor::DeviceAddressBase& device_memory_base,
           const absl::flat_hash_map<std::string, std::string>& frontend_attrs)
           -> absl::StatusOr<tsl::AsyncValueRef<std::unique_ptr<se::Event>>> {
         auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous");
@@ -293,7 +294,7 @@ xla::RecvDeviceMemoryFunction GetRecvDeviceMemoryFunction(
   return
       [ctx, program_key](
           int64_t channel_id, se::Stream* stream, const xla::Shape& shape,
-          se::DeviceMemoryBase* device_memory_base,
+          stream_executor::DeviceAddressBase* device_memory_base,
           const absl::flat_hash_map<std::string, std::string>& frontend_attrs)
           -> absl::StatusOr<tsl::AsyncValueRef<std::unique_ptr<se::Event>>> {
         auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous");
@@ -339,7 +340,7 @@ absl::StatusOr<xla::ExecutionOutput> RunExecutable(
     const XlaComputationLaunchContext& launch_context,
     std::vector<xla::ExecutionInput> execution_inputs,
     xla::ExecutableRunOptions run_options, xla::LocalExecutable* executable,
-    OpKernelContext* ctx, se::DeviceMemoryAllocator* allocator) {
+    OpKernelContext* ctx, stream_executor::DeviceAddressAllocator* allocator) {
   VLOG(2) << "Executing Xla Computation.";
   Env* env = Env::Default();
   auto start_time = env->NowMicros();
@@ -620,7 +621,7 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         resource_var_ptrs[resources[i]] = variable_infos[i].var()->tensor();
       }
 
-      std::shared_ptr<se::DeviceMemoryAllocator> allocator =
+      std::shared_ptr<stream_executor::DeviceAddressAllocator> allocator =
           GetAllocator(ctx->device(), GetStream(ctx), platform_info);
       XlaComputationLaunchContext launch_context =
           GetLaunchContext(platform_info, ctx, client, allocator.get());
@@ -928,7 +929,7 @@ void XlaRunOp::Compute(OpKernelContext* ctx) {
 
   XlaExecutableClosure closure =
       XlaExecutableClosureStore::Global()->Consume(key);
-  std::shared_ptr<se::DeviceMemoryAllocator> allocator =
+  std::shared_ptr<stream_executor::DeviceAddressAllocator> allocator =
       GetAllocator(ctx->device(), GetStream(ctx), platform_info_);
   XlaComputationLaunchContext launch_context =
       GetLaunchContext(platform_info_, ctx, closure.client(), allocator.get());

From c3f05a1f6ffca60dd699bc9ae48481011c9ad571 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:19:13 -0800
Subject: [PATCH 397/753] Automated Code Change

PiperOrigin-RevId: 845648967
---
 .../base_rendezvous_mgr.cc                    |   5 +-
 .../distributed_runtime/cancellable_call.h    |   6 +-
 .../cluster_function_library_runtime.cc       |  24 +--
 .../cluster_function_library_runtime.h        |  20 +-
 .../cluster_function_library_runtime_test.cc  |  17 +-
 .../collective_param_resolver_distributed.cc  |  16 +-
 .../collective_param_resolver_distributed.h   |   7 +-
 ...lective_param_resolver_distributed_test.cc |  79 +++----
 .../collective_rma_distributed.cc             |  21 +-
 .../collective_rma_distributed.h              |  12 +-
 .../collective_rma_distributed_test.cc        |  63 +++---
 .../device_resolver_distributed.cc            |   6 +-
 .../device_resolver_distributed.h             |  10 +-
 .../device_resolver_distributed_test.cc       |   3 +-
 .../core/distributed_runtime/graph_mgr.cc     |   2 +-
 .../core/distributed_runtime/local_master.cc  |   6 +-
 .../core/distributed_runtime/local_master.h   |   4 +-
 tensorflow/core/distributed_runtime/master.cc |  57 ++---
 tensorflow/core/distributed_runtime/master.h  |   4 +-
 .../core/distributed_runtime/master_env.h     |   4 +-
 .../distributed_runtime/master_session.cc     | 189 ++++++++--------
 .../core/distributed_runtime/master_session.h |  39 ++--
 .../core/distributed_runtime/master_test.cc   |  34 +--
 .../distributed_runtime/message_wrappers.cc   | 129 +++++------
 .../distributed_runtime/message_wrappers.h    | 202 +++++++++---------
 .../message_wrappers_test.cc                  |  20 +-
 .../distributed_runtime/recent_request_ids.cc |   2 +-
 .../distributed_runtime/recent_request_ids.h  |   6 +-
 .../core/distributed_runtime/remote_device.cc |   5 +-
 .../core/distributed_runtime/remote_device.h  |   3 +-
 30 files changed, 515 insertions(+), 480 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index 9f0ef2cf3c2886..495743c8d64c0c 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -257,9 +257,10 @@ void BaseRemoteRendezvous::SameWorkerRecvDone(
                           recv_args.alloc_attrs.gpu_compatible());
   Allocator* out_allocator = dst_device->GetAllocator(attr);
   AllocationAttributes allocation_attr;
-  uint64 safe_alloc_frontier = dst_device->SafeAllocFrontier(0);
+  uint64_t safe_alloc_frontier = dst_device->SafeAllocFrontier(0);
   bool sync_dst_compute = (safe_alloc_frontier == 0);
-  std::function<uint64()> freed_by_func = [dst_device, &safe_alloc_frontier]() {
+  std::function<uint64_t()> freed_by_func = [dst_device,
+                                             &safe_alloc_frontier]() {
     safe_alloc_frontier = dst_device->SafeAllocFrontier(safe_alloc_frontier);
     return safe_alloc_frontier;
   };
diff --git a/tensorflow/core/distributed_runtime/cancellable_call.h b/tensorflow/core/distributed_runtime/cancellable_call.h
index 7311c8e3a44f42..3a2691b7cff22f 100644
--- a/tensorflow/core/distributed_runtime/cancellable_call.h
+++ b/tensorflow/core/distributed_runtime/cancellable_call.h
@@ -27,8 +27,8 @@ namespace tensorflow {
 // registration with a CancellationManager.
 class CancellableCall {
  public:
-  CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker,
-                  WorkerCacheInterface* wc)
+  CancellableCall(CancellationManager* cancel_mgr,
+                  const std::string& remote_worker, WorkerCacheInterface* wc)
       : is_cancelled_(false),
         cancel_mgr_(cancel_mgr),
         remote_worker_(remote_worker),
@@ -51,7 +51,7 @@ class CancellableCall {
   mutex mu_;
   bool is_cancelled_;
   CancellationManager* const cancel_mgr_;  // Not owned
-  const string remote_worker_;
+  const std::string remote_worker_;
   WorkerCacheInterface* const wc_;  // Not owned
   WorkerInterface* const wi_;       // Owned by wc_, must be released.
   CallOptions opts_;
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
index 966a281c1d2b66..c974bb4c520655 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc
@@ -39,9 +39,9 @@ absl::Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
     const OpDef& sig, AttrSlice attrs,
     const FunctionLibraryRuntime::InstantiateOptions& options,
     const FunctionLibraryDefinition& flib_def, GraphDef* gdef,
-    std::vector<string>* send_keys, std::vector<string>* recv_keys) {
-  const string& target = options.target;
-  const string& func_name = sig.name();
+    std::vector<std::string>* send_keys, std::vector<std::string>* recv_keys) {
+  const std::string& target = options.target;
+  const std::string& func_name = sig.name();
   const FunctionDef* func_def = flib_def.Find(sig.name());
   if (func_def == nullptr) {
     return errors::InvalidArgument("Function ", func_name,
@@ -90,7 +90,7 @@ absl::Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
 
     // src_incarnation = 1 works because the transfer is across the same device.
     // TODO(rohanj): Find the src_incarnation for the remote device and set it.
-    const string& key = Rendezvous::CreateKey(
+    const std::string& key = Rendezvous::CreateKey(
         target, 1 /* src_incarnation */, target, in.name(), FrameAndIter(0, 0));
     send_keys->push_back(key);
     ++i;
@@ -140,7 +140,7 @@ absl::Status ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
 
     g.AddEdge(function_node, i, output_node, 0);
 
-    const string& key =
+    const std::string& key =
         Rendezvous::CreateKey(target, 1 /* src_incarnation */, target,
                               out.name(), FrameAndIter(0, 0));
     recv_keys->push_back(key);
@@ -180,7 +180,7 @@ ClusterFunctionLibraryRuntime::~ClusterFunctionLibraryRuntime() {
 }
 
 void ClusterFunctionLibraryRuntime::Instantiate(
-    const string& function_name, const FunctionLibraryDefinition& lib_def,
+    const std::string& function_name, const FunctionLibraryDefinition& lib_def,
     AttrSlice attrs, const FunctionLibraryRuntime::InstantiateOptions& options,
     FunctionLibraryRuntime::LocalHandle* handle,
     FunctionLibraryRuntime::DoneCallback done) {
@@ -192,7 +192,7 @@ void ClusterFunctionLibraryRuntime::Instantiate(
   WorkerInterface* wi = worker_cache->GetOrCreateWorker(target);
 
   if (wi == nullptr) {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     worker_session_->worker_cache()->ListWorkers(&workers);
     done(errors::InvalidArgument(
         "Could not find worker with target: ", target,
@@ -202,8 +202,8 @@ void ClusterFunctionLibraryRuntime::Instantiate(
 
   // Make RPC and obtain a graph handle.
   GraphDef gdef;
-  auto* send_keys = new std::vector<string>;
-  auto* recv_keys = new std::vector<string>;
+  auto* send_keys = new std::vector<std::string>;
+  auto* recv_keys = new std::vector<std::string>;
   auto construct_graph_fn = [&](const FunctionLibraryDefinition* lib_def) {
     const FunctionDef* fdef = lib_def->Find(function_name);
     const OpDef& sig = fdef->signature();
@@ -285,7 +285,7 @@ void ClusterFunctionLibraryRuntime::Run(
     args[i].AsProtoTensorContent(send->mutable_tensor());
     i++;
   }
-  const std::vector<string>& recv_keys = function_data->recv_keys;
+  const std::vector<std::string>& recv_keys = function_data->recv_keys;
   for (const auto& recv_key : recv_keys) {
     req->add_recv_key(recv_key);
   }
@@ -308,7 +308,7 @@ void ClusterFunctionLibraryRuntime::Run(
         if (!local_status->ok()) {
           return;
         }
-        std::map<string, TensorProto*> mapped_recvs;
+        std::map<std::string, TensorProto*> mapped_recvs;
         for (auto& recv : *resp->mutable_recv()) {
           mapped_recvs[recv.name()] = recv.mutable_tensor();
         }
@@ -363,7 +363,7 @@ void ClusterFunctionLibraryRuntime::Run(
 }
 
 void ClusterFunctionLibraryRuntime::CleanUp(
-    uint64 step_id, FunctionLibraryRuntime::LocalHandle handle,
+    uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle,
     FunctionLibraryRuntime::DoneCallback done) {
   FunctionData* function_data = nullptr;
   {
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
index a016a5eea418df..2d66854ec8c2ca 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.h
@@ -41,7 +41,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
 
   ~ClusterFunctionLibraryRuntime() override;
 
-  void Instantiate(const string& function_name,
+  void Instantiate(const std::string& function_name,
                    const FunctionLibraryDefinition& lib_def, AttrSlice attrs,
                    const FunctionLibraryRuntime::InstantiateOptions& options,
                    FunctionLibraryRuntime::LocalHandle* handle,
@@ -57,7 +57,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
            absl::Span<const FunctionArg> args, std::vector<FunctionRet>* rets,
            FunctionLibraryRuntime::DoneCallback done) override;
 
-  void CleanUp(uint64 step_id, FunctionLibraryRuntime::LocalHandle handle,
+  void CleanUp(uint64_t step_id, FunctionLibraryRuntime::LocalHandle handle,
                FunctionLibraryRuntime::DoneCallback done) override;
 
   DeviceMgr* remote_device_mgr() const override { return remote_device_mgr_; }
@@ -67,7 +67,7 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
       const OpDef& sig, AttrSlice attrs,
       const FunctionLibraryRuntime::InstantiateOptions& options,
       const FunctionLibraryDefinition& flib_def, GraphDef* g,
-      std::vector<string>* send_keys, std::vector<string>* recv_keys);
+      std::vector<std::string>* send_keys, std::vector<std::string>* recv_keys);
   friend class ClusterFunctionLibraryRuntimeTest;
 
   mutable mutex mu_;
@@ -77,19 +77,19 @@ class ClusterFunctionLibraryRuntime : public DistributedFunctionLibraryRuntime {
   DeviceMgr* remote_device_mgr_;  // not owned.
 
   struct FunctionData {
-    const string graph_handle;
-    const string target;
+    const std::string graph_handle;
+    const std::string target;
     // Hold a shared pointer to the underlying worker cache to avoid it being
     // deleted in potential cluster update.
     const std::shared_ptr<WorkerCacheInterface> worker_cache;
     WorkerInterface* wi = nullptr;
-    const std::vector<string> send_keys;
-    const std::vector<string> recv_keys;
+    const std::vector<std::string> send_keys;
+    const std::vector<std::string> recv_keys;
 
-    FunctionData(const string& graph_handle, const string& target,
+    FunctionData(const std::string& graph_handle, const std::string& target,
                  std::shared_ptr<WorkerCacheInterface> worker_cache,
-                 WorkerInterface* wi, const std::vector<string>& send_keys,
-                 const std::vector<string>& recv_keys)
+                 WorkerInterface* wi, const std::vector<std::string>& send_keys,
+                 const std::vector<std::string>& recv_keys)
         : graph_handle(graph_handle),
           target(target),
           worker_cache(std::move(worker_cache)),
diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
index 40290ef3e4f54e..9be587fb48880c 100644
--- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
+++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime_test.cc
@@ -42,7 +42,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
         &cluster_));
     GrpcChannelSpec spec;
 
-    std::map<int, string> host_ports;
+    std::map<int, std::string> host_ports;
     int i = 0;
     for (const auto& target : cluster_->targets("localhost")) {
       host_ports[i++] = target;
@@ -72,12 +72,13 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
       const OpDef& sig, test::function::Attrs attrs,
       const FunctionLibraryRuntime::InstantiateOptions& options,
       const FunctionLibraryDefinition& lib_def, GraphDef* g,
-      std::vector<string>* send_keys, std::vector<string>* recv_keys) {
+      std::vector<std::string>* send_keys,
+      std::vector<std::string>* recv_keys) {
     return ClusterFunctionLibraryRuntime::ConstructFunctionGraph(
         sig, attrs, options, lib_def, g, send_keys, recv_keys);
   }
 
-  void Instantiate(const string& function_name,
+  void Instantiate(const std::string& function_name,
                    const FunctionLibraryDefinition& lib_def,
                    test::function::Attrs attrs,
                    const FunctionLibraryRuntime::InstantiateOptions& options,
@@ -88,8 +89,8 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
   }
 
   absl::Status InstantiateAndRun(
-      const string& function_name, const FunctionLibraryDefinition& lib_def,
-      test::function::Attrs attrs,
+      const std::string& function_name,
+      const FunctionLibraryDefinition& lib_def, test::function::Attrs attrs,
       const FunctionLibraryRuntime::InstantiateOptions& options,
       const std::vector<Tensor>& args, std::vector<Tensor*> rets) {
     FunctionLibraryRuntime::LocalHandle handle;
@@ -135,7 +136,7 @@ class ClusterFunctionLibraryRuntimeTest : public ::testing::Test {
 
 TEST_F(ClusterFunctionLibraryRuntimeTest, ConstructFunctionGraph) {
   GraphDef actual;
-  std::vector<string> send_keys, recv_keys;
+  std::vector<std::string> send_keys, recv_keys;
   FunctionDefLibrary proto;
   *(proto.add_function()) = test::function::Swap();
   FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
@@ -402,10 +403,10 @@ TEST_F(ClusterFunctionLibraryRuntimeTest, DISABLED_InstantiateAndRun) {
   instantiate_opts.target = "/job:localhost/replica:0/task:1/cpu:0";
 
   Tensor y;
-  auto x = test::AsTensor<int32>({1, 2, 3, 4});
+  auto x = test::AsTensor<int32_t>({1, 2, 3, 4});
   TF_EXPECT_OK(InstantiateAndRun("XTimesTwoInt32", lib_def, {},
                                  instantiate_opts, {x}, {&y}));
-  test::ExpectTensorEqual<int32>(y, test::AsTensor<int32>({2, 4, 6, 8}));
+  test::ExpectTensorEqual<int32_t>(y, test::AsTensor<int32_t>({2, 4, 6, 8}));
 }
 
 TEST_F(ClusterFunctionLibraryRuntimeTest,
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index ab13146b73bbbd..5acf12ccea0f69 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -34,7 +34,7 @@ class CompleteGroupCall : public CancellableCall {
   CompleteGroupCall(const CollGroupParams& group,
                     const DeviceAttributes& device,
                     CancellationManager* cancel_mgr,
-                    const string& remote_worker, WorkerCacheInterface* wc)
+                    const std::string& remote_worker, WorkerCacheInterface* wc)
       : CancellableCall(cancel_mgr, remote_worker, wc) {
     req_.set_group_key(group.group_key);
     req_.set_group_size(group.group_size);
@@ -55,9 +55,11 @@ class CompleteInstanceCall : public CancellableCall {
  public:
   CompleteInstanceCall(const CollGroupParams& group,
                        const CollInstanceParams& instance,
-                       const string& node_name, const string& device_name,
-                       bool is_source, CancellationManager* cancel_mgr,
-                       const string& remote_worker, WorkerCacheInterface* wc)
+                       const std::string& node_name,
+                       const std::string& device_name, bool is_source,
+                       CancellationManager* cancel_mgr,
+                       const std::string& remote_worker,
+                       WorkerCacheInterface* wc)
       : CancellableCall(cancel_mgr, remote_worker, wc) {
     req_.set_name(node_name);
     req_.set_type(instance.type);
@@ -91,7 +93,7 @@ CollectiveParamResolverDistributed::CollectiveParamResolverDistributed(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
     DeviceResolverDistributed* dev_resolver,
     NcclCommunicatorInterface* nccl_communicator,
-    WorkerCacheInterface* worker_cache, const string& task_name)
+    WorkerCacheInterface* worker_cache, const std::string& task_name)
     : CollectiveParamResolverLocal(config, dev_mgr, dev_resolver,
                                    nccl_communicator, task_name),
       worker_cache_(worker_cache),
@@ -364,8 +366,8 @@ absl::Status CollectiveParamResolverDistributed::UpdateInstanceCache(
 }
 
 void CollectiveParamResolverDistributed::CompleteInstanceDistributed(
-    const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
-    const StatusCallback& done) {
+    const std::string& device, CollectiveParams* cp,
+    CancellationManager* cancel_mgr, const StatusCallback& done) {
   if (group_leader_.empty()) {
     // This is the group leader so resolution is local.
     return CompleteInstanceLocal(device, cp, done);
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h
index 63006c1253547e..d885fe0bb81a0e 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h
@@ -32,7 +32,7 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal {
       const ConfigProto& config, const DeviceMgr* dev_mgr,
       DeviceResolverDistributed* dev_resolver,
       NcclCommunicatorInterface* nccl_communicator,
-      WorkerCacheInterface* worker_cache, const string& task_name);
+      WorkerCacheInterface* worker_cache, const std::string& task_name);
 
   void CompleteParamsAsync(const DeviceAttributes& device, CollectiveParams* cp,
                            CancellationManager* cancel_mgr,
@@ -82,13 +82,14 @@ class CollectiveParamResolverDistributed : public CollectiveParamResolverLocal {
   // Finish populating *cp.  Semantics are like those of
   // CompleteInstanceLocal but will make a remote call to the group
   // leader if necessary.
-  void CompleteInstanceDistributed(const string& device, CollectiveParams* cp,
+  void CompleteInstanceDistributed(const std::string& device,
+                                   CollectiveParams* cp,
                                    CancellationManager* cancel_mgr,
                                    const StatusCallback& done)
       TF_LOCKS_EXCLUDED(instance_mu_, group_mu_);
 
   WorkerCacheInterface* worker_cache_;  // Not owned
-  const string group_leader_;
+  const std::string group_leader_;
   CancellationManager abortion_cancel_mgr_;
 };
 
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 31140bf0755740..2880d722f0efbf 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -34,8 +34,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static std::unique_ptr<Device> NewDevice(const string& type,
-                                         const string& name) {
+static std::unique_ptr<Device> NewDevice(const std::string& type,
+                                         const std::string& name) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -54,15 +54,16 @@ class FakeCache : public TestWorkerCache {
  public:
   // Override the Locality methods to actually pass through to the
   // worker.
-  bool GetDeviceLocalityNonBlocking(const string& device,
+  bool GetDeviceLocalityNonBlocking(const std::string& device,
                                     DeviceLocality* locality) override {
     return false;
   }
 
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+  void GetDeviceLocalityAsync(const std::string& device,
+                              DeviceLocality* locality,
                               StatusCallback done) override {
-    string task_name;
-    string dev_part;
+    std::string task_name;
+    std::string dev_part;
     if (!DeviceNameUtils::SplitDeviceName(device, &task_name, &dev_part)) {
       done(errors::Internal("failed to parse device name"));
       return;
@@ -94,7 +95,9 @@ class FakeCache : public TestWorkerCache {
 class FakeNcclCommunicator : public NcclCommunicatorInterface {
  public:
   // We only need to define GenerateCommunicatorKey().
-  string GenerateCommunicatorKey() override { return "mock-communicator-key"; }
+  std::string GenerateCommunicatorKey() override {
+    return "mock-communicator-key";
+  }
 
   void Enqueue(std::shared_ptr<CollectiveContext> col_ctx,
                StatusCallback done) override {
@@ -114,15 +117,16 @@ class DeviceResDistTest : public ::testing::Test {
 
  protected:
   void DefineWorkers(int num_workers, int num_devices,
-                     const string& device_type, bool nccl) {
+                     const std::string& device_type, bool nccl) {
     for (int w = 0; w < num_workers; ++w) {
-      string name = absl::StrCat("/job:worker/replica:0/task:", w);
+      std::string name = absl::StrCat("/job:worker/replica:0/task:", w);
       DefineWorker(name, device_type, num_devices, nccl);
     }
   }
 
-  void DefineWorker(const string& worker_name, const string& device_type,
-                    int num_devices, bool nccl) {
+  void DefineWorker(const std::string& worker_name,
+                    const std::string& device_type, int num_devices,
+                    bool nccl) {
     ConfigProto config;
     config.mutable_experimental()->set_collective_group_leader(
         "/job:worker/replica:0/task:0");
@@ -136,7 +140,7 @@ class DeviceResDistTest : public ::testing::Test {
     }
     device_mgrs_[worker_name] =
         std::make_unique<StaticDeviceMgr>(std::move(devices));
-    std::vector<string>* dv = &dev_by_task_[worker_name];
+    std::vector<std::string>* dv = &dev_by_task_[worker_name];
     dv->clear();
     for (auto* d : device_mgrs_[worker_name]->ListDevices()) {
       dv->push_back(d->name());
@@ -160,14 +164,14 @@ class DeviceResDistTest : public ::testing::Test {
   }
 
   void DefineCollectiveParams(int num_workers, int num_devices,
-                              const string& device_type,
+                              const std::string& device_type,
                               CollectiveType coll_type = REDUCTION_COLLECTIVE,
                               int source_rank = 0) {
     for (int wi = 0; wi < num_workers; ++wi) {
-      string task_name = absl::StrCat("/job:worker/replica:0/task:", wi);
+      std::string task_name = absl::StrCat("/job:worker/replica:0/task:", wi);
       for (int di = 0; di < num_devices; ++di) {
         int idx = wi * num_devices + di;
-        string device_name =
+        std::string device_name =
             strings::StrCat(task_name, "/device:", device_type, ":", di);
         cp_[device_name] =
             CreateCollectiveParams(num_workers, num_devices, device_type,
@@ -177,7 +181,7 @@ class DeviceResDistTest : public ::testing::Test {
   }
 
   CollectiveParams* CreateCollectiveParams(int num_workers, int num_devices,
-                                           const string& device_type,
+                                           const std::string& device_type,
                                            CollectiveType coll_type,
                                            bool is_source) {
     const int kGroupKey = 5;
@@ -203,16 +207,16 @@ class DeviceResDistTest : public ::testing::Test {
     }
     int group_size = num_workers * num_devices;
     for (int wi = 0; wi < num_workers; ++wi) {
-      string task_name = absl::StrCat("/job:worker/replica:0/task:", wi);
+      std::string task_name = absl::StrCat("/job:worker/replica:0/task:", wi);
       for (int di = 0; di < num_devices; ++di) {
-        string device_name = absl::StrCat(task_name, "/device:CPU:", di);
+        std::string device_name = absl::StrCat(task_name, "/device:CPU:", di);
         IssueRequest(task_name, device_name, group_size);
       }
     }
   }
 
-  void IssueRequest(const string& task_name, const string& device_name,
-                    int group_size) {
+  void IssueRequest(const std::string& task_name,
+                    const std::string& device_name, int group_size) {
     Device* device = nullptr;
     TF_CHECK_OK(device_mgrs_[task_name]->LookupDevice(device_name, &device));
     CollectiveParams* cp = cp_[device_name];
@@ -243,11 +247,11 @@ class DeviceResDistTest : public ::testing::Test {
     // Verify that all cp_ values get the same set of task and device
     // names, with unique default_rank in the expected order.
     const int dev_count = num_workers * num_devices;
-    string dev0 = "/job:worker/replica:0/task:0/device:CPU:0";
+    std::string dev0 = "/job:worker/replica:0/task:0/device:CPU:0";
     for (int wi = 0; wi < num_workers; ++wi) {
-      string task_name = absl::StrCat("/job:worker/replica:0/task:", wi);
+      std::string task_name = absl::StrCat("/job:worker/replica:0/task:", wi);
       for (int di = 0; di < num_devices; ++di) {
-        string device_name = absl::StrCat(task_name, "/device:CPU:", di);
+        std::string device_name = absl::StrCat(task_name, "/device:CPU:", di);
         int idx = wi * num_devices + di;
         TF_ASSERT_OK(status_[device_name]);
         EXPECT_EQ(cp_[device_name]->default_rank, idx);
@@ -270,7 +274,8 @@ class DeviceResDistTest : public ::testing::Test {
     }
   }
 
-  void ValidateDeviceResolver(const CollectiveParams& cp, const string& task) {
+  void ValidateDeviceResolver(const CollectiveParams& cp,
+                              const std::string& task) {
     for (const CollGroupMember& member : cp.group.members) {
       DeviceAttributes attributes;
       TF_ASSERT_OK(dev_resolvers_[task]->GetDeviceAttributes(
@@ -279,14 +284,14 @@ class DeviceResDistTest : public ::testing::Test {
   }
 
   void RestartWorker(int worker_idx, int num_workers, int num_devices,
-                     const string& device_type, bool nccl,
+                     const std::string& device_type, bool nccl,
                      CollectiveType coll_type = REDUCTION_COLLECTIVE,
                      bool is_source = false) {
-    string worker_name =
+    std::string worker_name =
         absl::StrCat("/job:worker/replica:0/task:", worker_idx);
     DefineWorker(worker_name, device_type, num_devices, nccl);
     for (int i = 0; i < num_devices; ++i) {
-      string device_name =
+      std::string device_name =
           strings::StrCat(worker_name, "/device:", device_type, ":", i);
       if (cp_.find(device_name) != cp_.end()) {
         cp_[device_name]->Unref();
@@ -301,18 +306,18 @@ class DeviceResDistTest : public ::testing::Test {
   FakeNcclCommunicator nccl_communicator_;
   CancellationManager cm_;
   // Below are keyed by task names.
-  absl::flat_hash_map<string, std::unique_ptr<DeviceMgr>> device_mgrs_;
-  absl::flat_hash_map<string, std::unique_ptr<DeviceResolverDistributed>>
+  absl::flat_hash_map<std::string, std::unique_ptr<DeviceMgr>> device_mgrs_;
+  absl::flat_hash_map<std::string, std::unique_ptr<DeviceResolverDistributed>>
       dev_resolvers_;
-  absl::flat_hash_map<string,
+  absl::flat_hash_map<std::string,
                       std::unique_ptr<CollectiveParamResolverDistributed>>
       cp_resolvers_;
-  absl::flat_hash_map<string, std::vector<string>> dev_by_task_;
-  absl::flat_hash_map<string, std::unique_ptr<WorkerEnv>> worker_envs_;
-  absl::flat_hash_map<string, std::unique_ptr<Worker>> workers_;
+  absl::flat_hash_map<std::string, std::vector<std::string>> dev_by_task_;
+  absl::flat_hash_map<std::string, std::unique_ptr<WorkerEnv>> worker_envs_;
+  absl::flat_hash_map<std::string, std::unique_ptr<Worker>> workers_;
   // Below are keyed by device names;
-  absl::flat_hash_map<string, CollectiveParams*> cp_;
-  absl::flat_hash_map<string, absl::Status> status_;
+  absl::flat_hash_map<std::string, CollectiveParams*> cp_;
+  absl::flat_hash_map<std::string, absl::Status> status_;
   mutex mu_;
   int num_done_ TF_GUARDED_BY(mu_);
   condition_variable done_;
@@ -343,8 +348,8 @@ TEST_F(DeviceResDistTest, DifferentIncarnation) {
   DefineCollectiveParams(num_workers, num_devices, "CPU");
   IssueRequests(num_workers, num_devices);
   RestartWorker(1, num_workers, num_devices, "CPU", /*nccl*/ false);
-  const string task_name = "/job:worker/replica:0/task:1";
-  const string device_name = absl::StrCat(task_name, "/device:CPU:0");
+  const std::string task_name = "/job:worker/replica:0/task:1";
+  const std::string device_name = absl::StrCat(task_name, "/device:CPU:0");
   IssueRequest(task_name, device_name, num_workers * num_devices);
   EXPECT_TRUE(absl::IsFailedPrecondition(status_[device_name]));
 }
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
index 1b4ba6296f4978..afab5707e58e4e 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
@@ -39,9 +39,9 @@ namespace {
 
 class RecvBufCall : public CancellableCall {
  public:
-  RecvBufCall(int64_t step_id, const string& peer_device,
-              const string& peer_task, const string& key, Device* to_device,
-              DeviceContext* to_device_ctx,
+  RecvBufCall(int64_t step_id, const std::string& peer_device,
+              const std::string& peer_task, const std::string& key,
+              Device* to_device, DeviceContext* to_device_ctx,
               const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
               const DeviceLocality& client_locality,
               const DeviceAttributes& server_attributes,
@@ -107,11 +107,12 @@ absl::Status PopulateTensorFromResponse(const RecvBufResponse& response,
 }  // namespace
 
 void CollectiveRemoteAccessDistributed::RecvFromPeer(
-    const string& peer_device, const string& peer_task, bool peer_is_local,
-    const string& key, Device* to_device, DeviceContext* to_device_ctx,
-    const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
-    const DeviceLocality& client_locality, int dev_to_dev_stream_index,
-    CancellationManager* cancellation_manager, const StatusCallback& done) {
+    const std::string& peer_device, const std::string& peer_task,
+    bool peer_is_local, const std::string& key, Device* to_device,
+    DeviceContext* to_device_ctx, const AllocatorAttributes& to_alloc_attr,
+    Tensor* to_tensor, const DeviceLocality& client_locality,
+    int dev_to_dev_stream_index, CancellationManager* cancellation_manager,
+    const StatusCallback& done) {
   if (peer_is_local) {
     CollectiveRemoteAccessLocal::RecvFromPeer(
         peer_device, peer_task, peer_is_local, key, to_device, to_device_ctx,
@@ -232,7 +233,7 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
 }
 
 void CollectiveRemoteAccessDistributed::CheckPeerHealth(
-    const string& peer_task, int64_t timeout_in_ms,
+    const std::string& peer_task, int64_t timeout_in_ms,
     const StatusCallback& done) {
   if (peer_task == task_name_) {
     // Fast path if the peer is the worker itself.
@@ -265,7 +266,7 @@ void CollectiveRemoteAccessDistributed::CheckPeerHealth(
           s = dev_resolver_->GetAllDeviceAttributes(peer_task, &cached_attrs);
         }
         if (s.ok()) {
-          absl::flat_hash_set<uint64> remote_incarnations;
+          absl::flat_hash_set<uint64_t> remote_incarnations;
           for (const DeviceAttributes& da : resp->device_attributes()) {
             remote_incarnations.insert(da.incarnation());
           }
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.h b/tensorflow/core/distributed_runtime/collective_rma_distributed.h
index 22d4d6f5a119e6..4557e9b36ac206 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed.h
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.h
@@ -29,7 +29,8 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
   CollectiveRemoteAccessDistributed(
       const DeviceMgr* dev_mgr, DeviceResolverInterface* dev_resolver,
       std::shared_ptr<UnboundedWorkQueue> work_queue,
-      WorkerCacheInterface* worker_cache, int64_t step_id, string task_name)
+      WorkerCacheInterface* worker_cache, int64_t step_id,
+      std::string task_name)
       : CollectiveRemoteAccessLocal(dev_mgr, dev_resolver, step_id),
         worker_cache_(worker_cache),
         work_queue_(std::move(work_queue)),
@@ -37,8 +38,9 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
 
   ~CollectiveRemoteAccessDistributed() override {}
 
-  void RecvFromPeer(const string& peer_device, const string& peer_task,
-                    bool peer_is_local, const string& key, Device* to_device,
+  void RecvFromPeer(const std::string& peer_device,
+                    const std::string& peer_task, bool peer_is_local,
+                    const std::string& key, Device* to_device,
                     DeviceContext* to_device_ctx,
                     const AllocatorAttributes& to_alloc_attr, Tensor* to_tensor,
                     const DeviceLocality& client_locality,
@@ -46,7 +48,7 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
                     CancellationManager* cancellation_manager,
                     const StatusCallback& done) override;
 
-  void CheckPeerHealth(const string& peer_task, int64_t timeout_in_ms,
+  void CheckPeerHealth(const std::string& peer_task, int64_t timeout_in_ms,
                        const StatusCallback& done) override;
 
   void StartAbort(const absl::Status& s) override;
@@ -57,7 +59,7 @@ class CollectiveRemoteAccessDistributed : public CollectiveRemoteAccessLocal {
   // `CollectiveExecutorMgr`.
   std::shared_ptr<UnboundedWorkQueue> work_queue_;
   CancellationManager abortion_cancel_mgr_;
-  string task_name_;
+  std::string task_name_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
index a2ec3b1aff2834..4d626cb9f49a9c 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
@@ -50,14 +50,16 @@ namespace {
 
 class FakeAllocator : public Allocator {
  public:
-  string Name() override { return "fake"; }
+  std::string Name() override { return "fake"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    return port::AlignedMalloc(num_bytes, alignment);
+    return tsl::port::AlignedMalloc(num_bytes,
+                                    static_cast<std::align_val_t>(alignment));
   }
   void DeallocateRaw(void* ptr) override { return port::AlignedFree(ptr); }
 };
 
-static std::unique_ptr<Device> NewDevice(const string& type, const string& name,
+static std::unique_ptr<Device> NewDevice(const std::string& type,
+                                         const std::string& name,
                                          Allocator* allocator) {
   class FakeDevice : public Device {
    public:
@@ -81,7 +83,7 @@ static int64_t kStepId = 123;
 
 class FakeWorker : public TestWorkerInterface {
  public:
-  FakeWorker(const string& name, DeviceMgr* dev_mgr,
+  FakeWorker(const std::string& name, DeviceMgr* dev_mgr,
              DeviceResolverDistributed* dres, bool is_failed,
              bool set_tensor_in_extra)
       : name_(name),
@@ -144,7 +146,7 @@ class FakeWorker : public TestWorkerInterface {
               // Since this is not really RDMA into pre-allocated memory send
               // the bytes in the response.
               RecvBufRespExtra extra;
-              extra.add_tensor_content(string(
+              extra.add_tensor_content(std::string(
                   reinterpret_cast<const char*>(DMAHelper::base(h->prod_value)),
                   num_bytes));
               response->mutable_transport_options()->PackFrom(extra);
@@ -164,7 +166,7 @@ class FakeWorker : public TestWorkerInterface {
   }
 
  private:
-  string name_;
+  std::string name_;
   DeviceMgr* device_mgr_;
   DeviceResolverDistributed* device_resolver_;
   BufRendezvous buf_rendezvous_;
@@ -176,15 +178,16 @@ class FakeCache : public TestWorkerCache {
  public:
   // Override the Locality methods to actually pass through to the
   // worker.
-  bool GetDeviceLocalityNonBlocking(const string& device,
+  bool GetDeviceLocalityNonBlocking(const std::string& device,
                                     DeviceLocality* locality) override {
     return false;
   }
 
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+  void GetDeviceLocalityAsync(const std::string& device,
+                              DeviceLocality* locality,
                               StatusCallback done) override {
-    string task_name;
-    string dev_part;
+    std::string task_name;
+    std::string dev_part;
     if (!DeviceNameUtils::SplitDeviceName(device, &task_name, &dev_part)) {
       done(errors::Internal("failed to parse device name"));
       return;
@@ -246,10 +249,10 @@ class CollRMADistTest
   void SetUp() override {
     const int num_workers = 2;
     const int num_devices = 1;
-    string device_type = "CPU";
-    string dev0_worker_name;
+    std::string device_type = "CPU";
+    std::string dev0_worker_name;
     for (int w = 0; w < num_workers; ++w) {
-      string name = absl::StrCat("/job:worker/replica:0/task:", w);
+      std::string name = absl::StrCat("/job:worker/replica:0/task:", w);
       if (w == 0) {
         dev0_worker_name = name;
       }
@@ -288,8 +291,9 @@ class CollRMADistTest
     }
   }
 
-  void DefineWorker(const string& worker_name, const string& device_type,
-                    int num_devices, bool is_failed = false) {
+  void DefineWorker(const std::string& worker_name,
+                    const std::string& device_type, int num_devices,
+                    bool is_failed = false) {
     std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
@@ -316,8 +320,9 @@ class CollRMADistTest
     wc_.AddWorker(worker_name, fw);
   }
 
-  void RestartWorker(const string& worker_name, const string& device_type,
-                     int num_devices, bool is_failed = false) {
+  void RestartWorker(const std::string& worker_name,
+                     const std::string& device_type, int num_devices,
+                     bool is_failed = false) {
     auto it = dev_resolvers_.find(worker_name);
     if (it != dev_resolvers_.end()) {
       delete it->second;
@@ -354,8 +359,8 @@ class CollRMADistTest
   FakeCache wc_;
   CancellationManager cm_;
   std::vector<DeviceMgr*> device_mgrs_;
-  std::unordered_map<string, DeviceResolverDistributed*> dev_resolvers_;
-  std::unordered_map<string, std::vector<DeviceAttributes>> dev_by_task_;
+  std::unordered_map<std::string, DeviceResolverDistributed*> dev_resolvers_;
+  std::unordered_map<std::string, std::vector<DeviceAttributes>> dev_by_task_;
   std::shared_ptr<UnboundedWorkQueue> work_queue_;
   std::vector<FakeWorker*> workers_;
   std::unique_ptr<CollectiveRemoteAccessDistributed> rma_;
@@ -379,7 +384,7 @@ TEST_P(CollRMADistTest, ProdFirstOK) {
   absl::Status consumer_status;
   absl::Status producer_status;
   FakeWorker* wi = workers_[1];
-  const string kBufKey = "fake_buf_key";
+  const std::string kBufKey = "fake_buf_key";
   wi->buf_rendezvous()->ProvideBuf(
       kBufKey, nullptr /*device*/, nullptr /*dev_ctx*/, &expected_value_,
       AllocatorAttributes(),
@@ -389,7 +394,7 @@ TEST_P(CollRMADistTest, ProdFirstOK) {
       },
       nullptr /*cancellation_manager*/);
   Device* dst_device = nullptr;
-  string dev_name = "CPU:0";
+  std::string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
   DeviceContext* to_device_ctx = nullptr;
   MaybeSetGPUDevice(dst_device);
@@ -418,9 +423,9 @@ TEST_P(CollRMADistTest, ConsFirstOK) {
   absl::Status consumer_status;
   absl::Status producer_status;
   FakeWorker* wi = workers_[1];
-  const string kBufKey = "fake_buf_key";
+  const std::string kBufKey = "fake_buf_key";
   Device* dst_device = nullptr;
-  string dev_name = "CPU:0";
+  std::string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
   MaybeSetGPUDevice(dst_device);
   DeviceContext* to_device_ctx = nullptr;
@@ -454,9 +459,9 @@ TEST_P(CollRMADistTest, ConsFirstAbort) {
   ResolveDeviceAttributes();
   absl::Notification consumer_note;
   absl::Status consumer_status;
-  const string kBufKey = "fake_buf_key";
+  const std::string kBufKey = "fake_buf_key";
   Device* dst_device = nullptr;
-  string dev_name = "CPU:0";
+  std::string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
   MaybeSetGPUDevice(dst_device);
   DeviceContext* to_device_ctx = nullptr;
@@ -483,7 +488,7 @@ TEST_P(CollRMADistTest, ResponseTooLarge) {
   absl::Status consumer_status;
   absl::Status producer_status;
   FakeWorker* wi = workers_[1];
-  const string kBufKey = "fake_buf_key";
+  const std::string kBufKey = "fake_buf_key";
   wi->buf_rendezvous()->ProvideBuf(
       kBufKey, nullptr /*device*/, nullptr /*dev_ctx*/, &large_response_,
       AllocatorAttributes(),
@@ -493,7 +498,7 @@ TEST_P(CollRMADistTest, ResponseTooLarge) {
       },
       nullptr /*cancellation_manager*/);
   Device* dst_device = nullptr;
-  string dev_name = "CPU:0";
+  std::string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
   DeviceContext* to_device_ctx = nullptr;
   MaybeSetGPUDevice(dst_device);
@@ -523,9 +528,9 @@ TEST_P(CollRMADistTest, WorkerRestart) {
   absl::Status consumer_status;
   absl::Status producer_status;
   FakeWorker* wi = workers_[1];
-  const string buf_key = "fake_buf_key";
+  const std::string buf_key = "fake_buf_key";
   Device* dst_device = nullptr;
-  string dev_name = "CPU:0";
+  std::string dev_name = "CPU:0";
   TF_EXPECT_OK(device_mgrs_[0]->LookupDevice(dev_name, &dst_device));
   MaybeSetGPUDevice(dst_device);
   DeviceContext* to_device_ctx = nullptr;
diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed.cc
index f0f8c50b2fd50a..3de97cc08726ff 100644
--- a/tensorflow/core/distributed_runtime/device_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/device_resolver_distributed.cc
@@ -28,7 +28,7 @@ DeviceResolverDistributed::DeviceResolverDistributed(const DeviceMgr* dev_mgr) {
 }
 
 absl::Status DeviceResolverDistributed::GetDeviceAttributes(
-    const string& device, DeviceAttributes* attributes) {
+    const std::string& device, DeviceAttributes* attributes) {
   mutex_lock l(mu_);
   auto it = attr_table_.find(device);
   if (it == attr_table_.end()) {
@@ -39,11 +39,11 @@ absl::Status DeviceResolverDistributed::GetDeviceAttributes(
 }
 
 absl::Status DeviceResolverDistributed::GetAllDeviceAttributes(
-    const string& task, std::vector<DeviceAttributes>* attributes) {
+    const std::string& task, std::vector<DeviceAttributes>* attributes) {
   mutex_lock l(mu_);
   attributes->clear();
   for (const auto& it : attr_table_) {
-    const string& device_name = it.first;
+    const std::string& device_name = it.first;
     if (DeviceNameUtils::IsSameAddressSpace(task, device_name)) {
       attributes->push_back(it.second);
     }
diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed.h b/tensorflow/core/distributed_runtime/device_resolver_distributed.h
index b46c288cb3a456..3bf6cfa813fe2f 100644
--- a/tensorflow/core/distributed_runtime/device_resolver_distributed.h
+++ b/tensorflow/core/distributed_runtime/device_resolver_distributed.h
@@ -31,19 +31,21 @@ class DeviceResolverDistributed : public DeviceResolverInterface {
  public:
   explicit DeviceResolverDistributed(const DeviceMgr* dev_mgr);
 
-  absl::Status GetDeviceAttributes(const string& device,
+  absl::Status GetDeviceAttributes(const std::string& device,
                                    DeviceAttributes* attributes) override;
 
   absl::Status GetAllDeviceAttributes(
-      const string& task, std::vector<DeviceAttributes>* attributes) override;
+      const std::string& task,
+      std::vector<DeviceAttributes>* attributes) override;
 
   absl::Status UpdateDeviceAttributes(
       const std::vector<DeviceAttributes>& attributes) override;
 
  protected:
-  const string task_name_;
+  const std::string task_name_;
   mutex mu_;
-  absl::flat_hash_map<string, DeviceAttributes> attr_table_ TF_GUARDED_BY(mu_);
+  absl::flat_hash_map<std::string, DeviceAttributes> attr_table_
+      TF_GUARDED_BY(mu_);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
index 0c2bdba1da59d4..8a3245ce2ee3e5 100644
--- a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
@@ -34,7 +34,8 @@ using ::testing::UnorderedElementsAre;
 
 // Create a fake 'Device' whose only interesting attribute is a non-default
 // DeviceLocality and incarnation.
-std::unique_ptr<Device> NewDevice(const string& type, const string& name) {
+std::unique_ptr<Device> NewDevice(const std::string& type,
+                                  const std::string& name) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 13d130d289418c..507915a74152be 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -337,7 +337,7 @@ absl::Status GraphMgr::Register(const std::string& handle, const GraphDef& gdef,
   {
     mutex_lock l(mu_);
     *graph_handle =
-        strings::Printf("%016llx", static_cast<long long>(++next_id_));
+        absl::StrFormat("%016llx", static_cast<long long>(++next_id_));
     item->handle = *graph_handle;
     CHECK(table_.insert({*graph_handle, item}).second);
   }
diff --git a/tensorflow/core/distributed_runtime/local_master.cc b/tensorflow/core/distributed_runtime/local_master.cc
index 54a50da2ace799..af41d4ad1d4b49 100644
--- a/tensorflow/core/distributed_runtime/local_master.cc
+++ b/tensorflow/core/distributed_runtime/local_master.cc
@@ -223,7 +223,7 @@ struct MasterInfo {
       : master(master), default_timeout_in_ms(default_timeout_in_ms) {}
 };
 
-typedef std::unordered_map<string, MasterInfo> LocalMasterRegistry;
+typedef std::unordered_map<std::string, MasterInfo> LocalMasterRegistry;
 LocalMasterRegistry* local_master_registry() {
   static LocalMasterRegistry* local_master_registry_ = new LocalMasterRegistry;
   return local_master_registry_;
@@ -231,7 +231,7 @@ LocalMasterRegistry* local_master_registry() {
 }  // namespace
 
 /* static */
-void LocalMaster::Register(const string& target, Master* master,
+void LocalMaster::Register(const std::string& target, Master* master,
                            int64_t default_timeout_in_ms) {
   mutex_lock l(*get_local_master_registry_lock());
   local_master_registry()->insert(
@@ -239,7 +239,7 @@ void LocalMaster::Register(const string& target, Master* master,
 }
 
 /* static */
-std::unique_ptr<LocalMaster> LocalMaster::Lookup(const string& target) {
+std::unique_ptr<LocalMaster> LocalMaster::Lookup(const std::string& target) {
   std::unique_ptr<LocalMaster> ret;
   mutex_lock l(*get_local_master_registry_lock());
   auto iter = local_master_registry()->find(target);
diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index e4fc37e4f60f50..b9fe78e8591f17 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -89,12 +89,12 @@ class LocalMaster : public MasterInterface {
   // any LocalMaster objects that may wrap this master. There is no
   // corresponding deregister method, since clean server shutdown is
   // not currently implemented for any server type.
-  static void Register(const string& target, Master* master,
+  static void Register(const std::string& target, Master* master,
                        int64_t default_timeout_in_ms);
 
   // Returns a pointer to the local master associated with the given
   // `target`, or nullptr if none exists.
-  static std::unique_ptr<LocalMaster> Lookup(const string& target);
+  static std::unique_ptr<LocalMaster> Lookup(const std::string& target);
 
  private:
   Master* master_impl_;  // Not owned.
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 9a2c553f841faf..bc7fa3c80bb678 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -102,7 +102,7 @@ void Master::GC() {
     if (shutdown_) {
       break;
     }
-    std::vector<string> handles;
+    std::vector<std::string> handles;
     const int64_t num_micros =
         static_cast<int64_t>(session_gc_seconds_ * 1000000);
     for (const auto& entry : sessions_) {
@@ -124,7 +124,7 @@ void Master::GC() {
   }
 }
 
-MasterSession* Master::FindMasterSession(const string& handle) {
+MasterSession* Master::FindMasterSession(const std::string& handle) {
   MasterSession* session = nullptr;
   {
     mutex_lock l(mu_);
@@ -139,8 +139,8 @@ MasterSession* Master::FindMasterSession(const string& handle) {
 class DeviceFinder {
  public:
   static absl::Status GetRemoteDevices(
-      const protobuf::RepeatedPtrField<string>& device_filters, MasterEnv* env,
-      WorkerCacheInterface* worker_cache,
+      const protobuf::RepeatedPtrField<std::string>& device_filters,
+      MasterEnv* env, WorkerCacheInterface* worker_cache,
       std::vector<std::unique_ptr<Device>>* out_remote) {
     DeviceFinder finder(device_filters, env, worker_cache);
     finder.Start();
@@ -150,19 +150,20 @@ class DeviceFinder {
   }
 
   static void GetRemoteWorkers(
-      const protobuf::RepeatedPtrField<string>& device_filters, MasterEnv* env,
-      WorkerCacheInterface* worker_cache, std::vector<string>* workers) {
+      const protobuf::RepeatedPtrField<std::string>& device_filters,
+      MasterEnv* env, WorkerCacheInterface* worker_cache,
+      std::vector<std::string>* workers) {
     DeviceFinder finder(device_filters, env, worker_cache);
     *workers = finder.targets_;
   }
 
  private:
   explicit DeviceFinder(
-      const protobuf::RepeatedPtrField<string>& device_filters, MasterEnv* env,
-      WorkerCacheInterface* worker_cache)
+      const protobuf::RepeatedPtrField<std::string>& device_filters,
+      MasterEnv* env, WorkerCacheInterface* worker_cache)
       : env_(env), worker_cache_(worker_cache) {
     CHECK(worker_cache) << "Worker cache was null!";
-    auto process_filter = [this](const string& filter) {
+    auto process_filter = [this](const std::string& filter) {
       DeviceNameUtils::ParsedName parsed;
       if (DeviceNameUtils::ParseFullName(filter, &parsed)) {
         filters_.push_back(parsed);
@@ -170,7 +171,7 @@ class DeviceFinder {
         LOG(FATAL) << "Skipping invalid filter: " << filter;
       }
     };
-    for (const string& filter : device_filters) {
+    for (const std::string& filter : device_filters) {
       process_filter(filter);
     }
     // Enumerates all known workers' target. A target name is a
@@ -178,19 +179,19 @@ class DeviceFinder {
     if (filters_.empty()) {
       // If no filters were specified, we list all known workers in
       // `worker_cache`.
-      std::vector<string> workers;
+      std::vector<std::string> workers;
       worker_cache->ListWorkers(&workers);
       std::swap(workers, targets_);
     } else {
       // When applying filters, we must include the local worker, even if it
       // does not match any of the filters.
       CHECK_GT(env_->local_devices.size(), 0) << "No local devices provided.";
-      const string& local_device_name = env_->local_devices[0]->name();
+      const std::string& local_device_name = env_->local_devices[0]->name();
       DeviceNameUtils::ParsedName local_parsed_name;
       CHECK(DeviceNameUtils::ParseFullName(local_device_name,
                                            &local_parsed_name));
       bool all_filters_have_job = true;
-      std::unordered_set<string> filter_job_names({local_parsed_name.job});
+      std::unordered_set<std::string> filter_job_names({local_parsed_name.job});
       for (const DeviceNameUtils::ParsedName& filter : filters_) {
         all_filters_have_job = all_filters_have_job && filter.has_job;
         if (filter.has_job) {
@@ -198,14 +199,14 @@ class DeviceFinder {
         }
       }
 
-      std::vector<string> workers;
+      std::vector<std::string> workers;
       if (all_filters_have_job) {
         // If all of the device filters have a job specified, then we only need
         // to list the workers in the jobs named in the filter, because a worker
         // in any other job would not match any filter.
-        for (const string& job_name : filter_job_names) {
+        for (const std::string& job_name : filter_job_names) {
           VLOG(2) << "Selectively listing workers in job: " << job_name;
-          std::vector<string> workers_in_job;
+          std::vector<std::string> workers_in_job;
           worker_cache->ListWorkersInJob(job_name, &workers_in_job);
           workers.insert(workers.end(), workers_in_job.begin(),
                          workers_in_job.end());
@@ -218,13 +219,13 @@ class DeviceFinder {
         if (device_filters.empty()) {
           VLOG(2) << "- <NO FILTERS>";
         } else {
-          for (const string& filter : device_filters) {
+          for (const std::string& filter : device_filters) {
             VLOG(2) << "- " << filter;
           }
         }
         worker_cache->ListWorkers(&workers);
       }
-      for (const string& name : workers) {
+      for (const std::string& name : workers) {
         if (MatchFilters(name) ||
             DeviceNameUtils::IsSameAddressSpace(name, local_device_name)) {
           targets_.push_back(name);
@@ -263,7 +264,7 @@ class DeviceFinder {
   // Every `kLoggingPeriodMs`, while the DeviceFinder is still waiting
   // to hear from workers, log a list of the workers who have not
   // responded.
-  const int32 kLoggingPeriodMs = 10 * 1000;
+  const int32_t kLoggingPeriodMs = 10 * 1000;
 
   absl::Status Wait() {
     mutex_lock l(mu_);
@@ -287,11 +288,11 @@ class DeviceFinder {
   // The caller takes the ownership of returned remote devices.
   void GetRemoteDevices(const std::vector<Device*>& local,
                         std::vector<std::unique_ptr<Device>>* remote) {
-    std::unordered_set<string> names(local.size());
+    std::unordered_set<std::string> names(local.size());
     for (Device* dev : local) names.insert(dev->name());
     mutex_lock l(mu_);
     for (Device* dev : found_) {
-      const string& name = dev->name();
+      const std::string& name = dev->name();
       if (names.insert(name).second && MatchFilters(name)) {
         remote->push_back(std::unique_ptr<Device>(dev));
       } else {
@@ -313,7 +314,7 @@ class DeviceFinder {
   // List of targets to be contacted by this DeviceFinder. The
   // respective `bool` in `seen_targets_` indicates whether we have
   // heard from this target or not.
-  std::vector<string> targets_;
+  std::vector<std::string> targets_;
   std::vector<bool> seen_targets_ TF_GUARDED_BY(mu_);
   absl::Status status_;
 
@@ -347,7 +348,7 @@ class DeviceFinder {
   }
 
   // Returns true iff 'name' matches one of the filters_.
-  bool MatchFilters(const string& name) {
+  bool MatchFilters(const std::string& name) {
     if (filters_.empty()) return true;
     DeviceNameUtils::ParsedName x;
     if (DeviceNameUtils::ParseFullName(name, &x)) {
@@ -386,7 +387,7 @@ void Master::CreateSession(const CreateSessionRequest* req,
     if (!cluster_def.job().empty()) {
       worker_cache_factory_options.cluster_def = cluster_def;
       // If the target starts with gRPC protocol prefix, remove the prefix
-      string normalized_string(req->target());
+      std::string normalized_string(req->target());
       RE2::Replace(&normalized_string, kGrpcPrefixRegex, "");
 
       // Set the server_def's job_name and task_index fields.
@@ -472,7 +473,7 @@ void Master::CreateSession(const CreateSessionRequest* req,
     options.config.mutable_experimental()
         ->set_disable_optimize_for_static_graph(true);
 
-    std::vector<string> filtered_worker_list;
+    std::vector<std::string> filtered_worker_list;
     DeviceFinder::GetRemoteWorkers(req->config().device_filters(), env_,
                                    worker_cache, &filtered_worker_list);
 
@@ -555,7 +556,7 @@ void Master::RunStep(CallOptions* opts, const RunStepRequestWrapper* req,
   SchedClosure([this, start_time, session, opts, req, resp, done]() {
     absl::Status status = session->Run(opts, *req, resp);
     session->Unref();
-    uint64 done_time = env_->env->NowMicros();
+    uint64_t done_time = env_->env->NowMicros();
     done(status);
     mutex_lock l(mu_);
     last_1000_steps_.AddValue((done_time - start_time) / 1e9);
@@ -624,7 +625,7 @@ void Master::ListDevices(const ListDevicesRequest* req,
 }
 
 void Master::CleanupWorkers(const ResetRequest& reset) {
-  std::vector<string> worker_names;
+  std::vector<std::string> worker_names;
   DeviceFinder::GetRemoteWorkers(reset.device_filters(), env_,
                                  env_->worker_cache, &worker_names);
   if (!worker_names.empty()) {
@@ -635,7 +636,7 @@ void Master::CleanupWorkers(const ResetRequest& reset) {
     std::vector<CleanupAllResponse> resp(num_workers);
     int c = 0;
     for (int i = 0; i < num_workers; ++i) {
-      const string& worker_name = worker_names[i];
+      const std::string& worker_name = worker_names[i];
       auto worker = env_->worker_cache->GetOrCreateWorker(worker_name);
       if (worker) {
         worker->CleanupAllAsync(
diff --git a/tensorflow/core/distributed_runtime/master.h b/tensorflow/core/distributed_runtime/master.h
index a3930249b629ee..f39fd34d0a5900 100644
--- a/tensorflow/core/distributed_runtime/master.h
+++ b/tensorflow/core/distributed_runtime/master.h
@@ -84,7 +84,7 @@ class Master {
   Thread* gc_thread_;
 
   // Maps session handles to sessions.
-  std::unordered_map<string, MasterSession*> sessions_ TF_GUARDED_BY(mu_);
+  std::unordered_map<std::string, MasterSession*> sessions_ TF_GUARDED_BY(mu_);
 
   // Moving average of step times.
   MovingAverage last_1000_steps_ TF_GUARDED_BY(mu_);
@@ -107,7 +107,7 @@ class Master {
 
   // Find master session by session handle, and increments the reference count
   // on the returned MasterSession if not null.
-  MasterSession* FindMasterSession(const string& handle);
+  MasterSession* FindMasterSession(const std::string& handle);
 
   Master(const Master&) = delete;
   void operator=(const Master&) = delete;
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index b8dcf1963df50d..5845a96836f913 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -41,7 +41,7 @@ class OpRegistryInterface;
 // Options passed to the worker_cache_factory function.
 struct WorkerCacheFactoryOptions {
   ClusterDef cluster_def;
-  string job_name;
+  std::string job_name;
   int task_index;
   int replica_index = 0;
   RPCOptions rpc_options;
@@ -96,7 +96,7 @@ struct MasterEnv {
       std::unique_ptr<std::vector<std::unique_ptr<Device>>>,
       std::unique_ptr<WorkerCacheInterface>,
       std::unique_ptr<DeviceSet> device_set,
-      std::vector<string> filtered_worker_list)>
+      std::vector<std::string> filtered_worker_list)>
       master_session_factory;
 
   std::function<absl::Status(const WorkerCacheFactoryOptions&,
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index 87eea4ec558c2d..b24bdc24a765c8 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -75,7 +75,7 @@ namespace tensorflow {
 // TODO(zhifengc): Cleanup this class. It's becoming messy.
 class MasterSession::ReffedClientGraph : public core::RefCounted {
  public:
-  ReffedClientGraph(const string& handle, const BuildGraphOptions& bopts,
+  ReffedClientGraph(const std::string& handle, const BuildGraphOptions& bopts,
                     std::unique_ptr<ClientGraph> client_graph,
                     const SessionOptions& session_opts,
                     const StatsPublisherFactory& stats_publisher_factory,
@@ -122,7 +122,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   int64_t collective_graph_key() { return collective_graph_key_; }
 
-  std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
+  std::unique_ptr<ProfileHandler> GetProfileHandler(uint64_t step,
                                                     int64_t execution_count,
                                                     const RunOptions& ropts) {
     return stats_publisher_->GetProfileHandler(step, execution_count, ropts);
@@ -239,7 +239,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                             GraphExecutionState* execution_state);
 
  private:
-  const string session_handle_;
+  const std::string session_handle_;
   const BuildGraphOptions bg_opts_;
 
   // NOTE(mrry): This pointer will be null after `RegisterPartitions()` returns.
@@ -250,13 +250,13 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   WorkerCacheInterface* const worker_cache_;  // Not owned.
 
   struct NodeDetails {
-    explicit NodeDetails(string type_string, string detail_text)
+    explicit NodeDetails(std::string type_string, std::string detail_text)
         : type_string(std::move(type_string)),
           detail_text(std::move(detail_text)) {}
-    const string type_string;
-    const string detail_text;
+    const std::string type_string;
+    const std::string detail_text;
   };
-  std::unordered_map<string, NodeDetails> name_to_node_details_;
+  std::unordered_map<std::string, NodeDetails> name_to_node_details_;
 
   const bool should_deregister_;
   const int64_t collective_graph_key_;
@@ -265,20 +265,20 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   // Graph partitioned into per-location subgraphs.
   struct Part {
     // Worker name.
-    string name;
+    std::string name;
 
     // Maps feed names to rendezvous keys. Empty most of the time.
-    std::unordered_map<string, string> feed_key;
+    std::unordered_map<std::string, std::string> feed_key;
 
     // Maps rendezvous keys to fetch names. Empty most of the time.
-    std::unordered_map<string, string> key_fetch;
+    std::unordered_map<std::string, std::string> key_fetch;
 
     // The interface to the worker. Owned.
     WorkerInterface* worker = nullptr;
 
     // After registration with the worker, graph_handle identifies
     // this partition on the worker.
-    string graph_handle;
+    std::string graph_handle;
 
     Part() : feed_key(3), key_fetch(3) {}
   };
@@ -300,14 +300,15 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   std::unique_ptr<StatsPublisherInterface> stats_publisher_;
 
-  string DetailText(const NodeDetails& details, const NodeExecStats& stats) {
+  std::string DetailText(const NodeDetails& details,
+                         const NodeExecStats& stats) {
     int64_t tot = 0;
     for (auto& no : stats.output()) {
       tot += no.tensor_description().allocation_description().requested_bytes();
     }
-    string bytes;
+    std::string bytes;
     if (tot >= 0.1 * 1048576.0) {
-      bytes = strings::Printf("[%.1fMB] ", tot / 1048576.0);
+      bytes = absl::StrFormat("[%.1fMB] ", tot / 1048576.0);
     }
     return strings::StrCat(bytes, stats.node_name(), " = ", details.type_string,
                            details.detail_text);
@@ -322,10 +323,10 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
   // The actual graph partitioning and registration implementation.
   absl::Status DoBuildPartitions(
       PartitionOptions popts, ClientGraph* client_graph,
-      std::unordered_map<string, GraphDef>* out_partitions);
+      std::unordered_map<std::string, GraphDef>* out_partitions);
   absl::Status DoRegisterPartitions(
       const PartitionOptions& popts,
-      std::unordered_map<string, GraphDef> graph_partitions);
+      std::unordered_map<std::string, GraphDef> graph_partitions);
 
   // Prepares a number of calls to workers. One call per partition.
   // This is a generic method that handles Run, PartialRun, and RunCallable.
@@ -359,7 +360,7 @@ absl::Status MasterSession::ReffedClientGraph::RegisterPartitions(
       std::unique_ptr<ClientGraph> client_graph;
       std::swap(client_graph_before_register_, client_graph);
       mu_.unlock();
-      std::unordered_map<string, GraphDef> graph_defs;
+      std::unordered_map<std::string, GraphDef> graph_defs;
       popts.flib_def = client_graph->flib_def.get();
       absl::Status s =
           DoBuildPartitions(popts, client_graph.get(), &graph_defs);
@@ -390,9 +391,9 @@ absl::Status MasterSession::ReffedClientGraph::RegisterPartitions(
   }
 }
 
-static string SplitByWorker(const Node* node) {
-  string task;
-  string device;
+static std::string SplitByWorker(const Node* node) {
+  std::string task;
+  std::string device;
   CHECK(DeviceNameUtils::SplitDeviceName(node->assigned_device_name(), &task,
                                          &device))
       << "node: " << node->name() << " dev: " << node->assigned_device_name();
@@ -413,17 +414,17 @@ void MasterSession::ReffedClientGraph::TrackFeedsAndFetches(
       bool client_terminated;
       TF_CHECK_OK(GetNodeAttr(ndef, "client_terminated", &client_terminated));
       if (client_terminated) {
-        string name;
+        std::string name;
         TF_CHECK_OK(GetNodeAttr(ndef, "tensor_name", &name));
-        string send_device;
+        std::string send_device;
         TF_CHECK_OK(GetNodeAttr(ndef, "send_device", &send_device));
-        string recv_device;
+        std::string recv_device;
         TF_CHECK_OK(GetNodeAttr(ndef, "recv_device", &recv_device));
-        uint64 send_device_incarnation;
+        uint64_t send_device_incarnation;
         TF_CHECK_OK(
             GetNodeAttr(ndef, "send_device_incarnation",
                         reinterpret_cast<int64_t*>(&send_device_incarnation)));
-        const string& key =
+        const std::string& key =
             Rendezvous::CreateKey(send_device, send_device_incarnation,
                                   recv_device, name, FrameAndIter(0, 0));
 
@@ -439,7 +440,7 @@ void MasterSession::ReffedClientGraph::TrackFeedsAndFetches(
 
 absl::Status MasterSession::ReffedClientGraph::DoBuildPartitions(
     PartitionOptions popts, ClientGraph* client_graph,
-    std::unordered_map<string, GraphDef>* out_partitions) {
+    std::unordered_map<std::string, GraphDef>* out_partitions) {
   if (popts.need_to_record_start_times) {
     CostModel cost_model(true);
     cost_model.InitFromGraph(client_graph->graph);
@@ -455,7 +456,7 @@ absl::Status MasterSession::ReffedClientGraph::DoBuildPartitions(
 
 absl::Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     const PartitionOptions& popts,
-    std::unordered_map<string, GraphDef> graph_partitions) {
+    std::unordered_map<std::string, GraphDef> graph_partitions) {
   partitions_.reserve(graph_partitions.size());
   absl::Status s;
   for (auto& name_def : graph_partitions) {
@@ -524,7 +525,7 @@ class RunManyGraphs {
   // Returns the index-th call.
   struct Call {
     CallOptions opts;
-    const string* worker_name;
+    const std::string* worker_name;
     std::atomic<bool> done{false};
     std::unique_ptr<MutableRunGraphRequestWrapper> req;
     std::unique_ptr<MutableRunGraphResponseWrapper> resp;
@@ -625,13 +626,15 @@ class RunManyGraphs {
 
 absl::Status AddSendFromClientRequest(const RunStepRequestWrapper& client_req,
                                       MutableRunGraphRequestWrapper* worker_req,
-                                      size_t index, const string& send_key) {
+                                      size_t index,
+                                      const std::string& send_key) {
   return worker_req->AddSendFromRunStepRequest(client_req, index, send_key);
 }
 
 absl::Status AddSendFromClientRequest(const RunCallableRequest& client_req,
                                       MutableRunGraphRequestWrapper* worker_req,
-                                      size_t index, const string& send_key) {
+                                      size_t index,
+                                      const std::string& send_key) {
   return worker_req->AddSendFromRunCallableRequest(client_req, index, send_key);
 }
 
@@ -639,13 +642,13 @@ absl::Status AddSendFromClientRequest(const RunCallableRequest& client_req,
 // in-process messages.
 struct RunCallableResponseWrapper {
   RunCallableResponse* resp;  // Not owned.
-  std::unordered_map<string, TensorProto> fetch_key_to_protos;
+  std::unordered_map<std::string, TensorProto> fetch_key_to_protos;
 
   RunMetadata* mutable_metadata() { return resp->mutable_metadata(); }
 
   absl::Status AddTensorFromRunGraphResponse(
-      const string& tensor_name, MutableRunGraphResponseWrapper* worker_resp,
-      size_t index) {
+      const std::string& tensor_name,
+      MutableRunGraphResponseWrapper* worker_resp, size_t index) {
     return worker_resp->RecvValue(index, &fetch_key_to_protos[tensor_name]);
   }
 };
@@ -709,18 +712,18 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
     // inadvertently slowing down the normal run path.
     if (is_partial_) {
       for (const auto& name_index : feeds) {
-        const auto iter = part.feed_key.find(string(name_index.first));
+        const auto iter = part.feed_key.find(std::string(name_index.first));
         if (iter == part.feed_key.end()) {
           // The provided feed must be for a different partition.
           continue;
         }
-        const string& key = iter->second;
+        const std::string& key = iter->second;
         TF_RETURN_IF_ERROR(AddSendFromClientRequest(req, c->req.get(),
                                                     name_index.second, key));
       }
       // TODO(suharshs): Make a map from feed to fetch_key to make this faster.
       // For now, we just iterate through partitions to find the matching key.
-      for (const string& req_fetch : fetches) {
+      for (const std::string& req_fetch : fetches) {
         for (const auto& key_fetch : part.key_fetch) {
           if (key_fetch.second == req_fetch) {
             c->req->add_recv_key(key_fetch.first);
@@ -730,8 +733,8 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
       }
     } else {
       for (const auto& feed_key : part.feed_key) {
-        const string& feed = feed_key.first;
-        const string& key = feed_key.second;
+        const std::string& feed = feed_key.first;
+        const std::string& key = feed_key.second;
         auto iter = feeds.find(feed);
         if (iter == feeds.end()) {
           return errors::Internal("No feed index found for feed: ", feed);
@@ -741,7 +744,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
             AddSendFromClientRequest(req, c->req.get(), feed_index, key));
       }
       for (const auto& key_fetch : part.key_fetch) {
-        const string& key = key_fetch.first;
+        const std::string& key = key_fetch.first;
         c->req->add_recv_key(key);
       }
     }
@@ -790,7 +793,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitionsHelper(
                                        run_graph_resp->recv_key(j)));
         break;
       }
-      const string& fetch = iter->second;
+      const std::string& fetch = iter->second;
       status.Update(
           resp->AddTensorFromRunGraphResponse(fetch, run_graph_resp, j));
       if (!status.ok()) {
@@ -834,7 +837,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitions(
     }
   }
 
-  std::vector<string> fetches;
+  std::vector<std::string> fetches;
   fetches.reserve(req.num_fetches());
   for (size_t i = 0; i < req.num_fetches(); ++i) {
     fetches.push_back(req.fetch_name(i));
@@ -870,7 +873,7 @@ absl::Status MasterSession::ReffedClientGraph::RunPartitions(
       call_opts, req, &wrapped_resp, cm, false /* is_last_partial_run */));
 
   // Collects fetches.
-  for (const string& fetch : callable_opts_.fetch()) {
+  for (const std::string& fetch : callable_opts_.fetch()) {
     TensorProto* fetch_proto = resp->mutable_fetch()->Add();
     auto iter = wrapped_resp.fetch_key_to_protos.find(fetch);
     if (iter == wrapped_resp.fetch_key_to_protos.end()) {
@@ -1001,7 +1004,7 @@ void MasterSession::ReffedClientGraph::ProcessStats(int64_t step_id,
 
 void MasterSession::ReffedClientGraph::ProcessDeviceStats(
     ProfileHandler* ph, const DeviceStepStats& ds, bool is_rpc) {
-  const string& dev_name = ds.device();
+  const std::string& dev_name = ds.device();
   VLOG(1) << "Device " << dev_name << " reports stats for "
           << ds.node_stats_size() << " nodes";
   for (const auto& ns : ds.node_stats()) {
@@ -1026,9 +1029,9 @@ void MasterSession::ReffedClientGraph::ProcessDeviceStats(
         }
         continue;
       }
-      const string& optype =
+      const std::string& optype =
           found_node_in_graph ? iter->second.type_string : ns.node_name();
-      string details;
+      std::string details;
       if (!ns.timeline_label().empty()) {
         details = ns.timeline_label();
       } else if (found_node_in_graph) {
@@ -1055,7 +1058,7 @@ absl::Status MasterSession::ReffedClientGraph::CheckFetches(
     // Skip if already fed.
     if (input.second) continue;
     TensorId id(ParseTensorName(input.first));
-    const Node* n = execution_state->get_node_by_name(string(id.first));
+    const Node* n = execution_state->get_node_by_name(std::string(id.first));
     if (n == nullptr) {
       return errors::NotFound("Feed ", input.first, ": not found");
     }
@@ -1069,9 +1072,9 @@ absl::Status MasterSession::ReffedClientGraph::CheckFetches(
   // Initialize the stack with the fetch nodes.
   std::vector<const Node*> stack;
   for (size_t i = 0; i < req.num_fetches(); ++i) {
-    const string& fetch = req.fetch_name(i);
+    const std::string& fetch = req.fetch_name(i);
     const TensorId id(ParseTensorName(fetch));
-    const Node* n = execution_state->get_node_by_name(string(id.first));
+    const Node* n = execution_state->get_node_by_name(std::string(id.first));
     if (n == nullptr) {
       return errors::NotFound("Fetch ", fetch, ": not found");
     }
@@ -1120,7 +1123,7 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
       // NOTE(mrry): We must capture `worker_cache_` since `this`
       // could be deleted before the callback is called.
       WorkerCacheInterface* worker_cache = worker_cache_;
-      const string name = part.name;
+      const std::string name = part.name;
       WorkerInterface* w = part.worker;
       CHECK_NOTNULL(w);
       auto cb = [worker_cache, c, name, w](const absl::Status& s) {
@@ -1138,10 +1141,10 @@ void MasterSession::ReffedClientGraph::DeregisterPartitions() {
 }
 
 namespace {
-void CopyAndSortStrings(size_t size,
-                        const std::function<string(size_t)>& input_accessor,
-                        protobuf::RepeatedPtrField<string>* output) {
-  std::vector<string> temp;
+void CopyAndSortStrings(
+    size_t size, const std::function<std::string(size_t)>& input_accessor,
+    protobuf::RepeatedPtrField<std::string>* output) {
+  std::vector<std::string> temp;
   temp.reserve(size);
   for (size_t i = 0; i < size; ++i) {
     output->Add(input_accessor(i));
@@ -1194,22 +1197,22 @@ void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
   // TODO(cais): Add TFDBG support to partial runs.
 }
 
-uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
-  uint64 h = 0x2b992ddfa23249d6ull;
-  for (const string& name : opts.callable_options.feed()) {
+uint64_t HashBuildGraphOptions(const BuildGraphOptions& opts) {
+  uint64_t h = 0x2b992ddfa23249d6ull;
+  for (const std::string& name : opts.callable_options.feed()) {
     h = Hash64(name.c_str(), name.size(), h);
   }
-  for (const string& name : opts.callable_options.target()) {
+  for (const std::string& name : opts.callable_options.target()) {
     h = Hash64(name.c_str(), name.size(), h);
   }
-  for (const string& name : opts.callable_options.fetch()) {
+  for (const std::string& name : opts.callable_options.fetch()) {
     h = Hash64(name.c_str(), name.size(), h);
   }
 
   const DebugOptions& debug_options =
       opts.callable_options.run_options().debug_options();
   if (!debug_options.debug_tensor_watch_opts().empty()) {
-    const string watch_summary =
+    const std::string watch_summary =
         SummarizeDebugTensorWatches(debug_options.debug_tensor_watch_opts());
     h = Hash64(watch_summary.c_str(), watch_summary.size(), h);
   }
@@ -1217,17 +1220,17 @@ uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
   return h;
 }
 
-string BuildGraphOptionsString(const BuildGraphOptions& opts) {
-  string buf;
-  for (const string& name : opts.callable_options.feed()) {
+std::string BuildGraphOptionsString(const BuildGraphOptions& opts) {
+  std::string buf;
+  for (const std::string& name : opts.callable_options.feed()) {
     absl::StrAppend(&buf, " FdE: ", name);
   }
   absl::StrAppend(&buf, "\n");
-  for (const string& name : opts.callable_options.target()) {
+  for (const std::string& name : opts.callable_options.target()) {
     absl::StrAppend(&buf, " TN: ", name);
   }
   absl::StrAppend(&buf, "\n");
-  for (const string& name : opts.callable_options.fetch()) {
+  for (const std::string& name : opts.callable_options.fetch()) {
     absl::StrAppend(&buf, " FeE: ", name);
   }
   if (opts.collective_graph_key != BuildGraphOptions::kNoCollectiveGraphKey) {
@@ -1242,7 +1245,7 @@ MasterSession::MasterSession(
     std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     std::unique_ptr<DeviceSet> device_set,
-    std::vector<string> filtered_worker_list,
+    std::vector<std::string> filtered_worker_list,
     StatsPublisherFactory stats_publisher_factory)
     : session_opts_(opt),
       env_(env),
@@ -1301,12 +1304,12 @@ absl::Status MasterSession::Create(GraphDef&& graph_def,
 
 absl::Status MasterSession::CreateWorkerSessions(
     const ClusterDef& cluster_def) {
-  const std::vector<string> worker_names = filtered_worker_list_;
+  const std::vector<std::string> worker_names = filtered_worker_list_;
   WorkerCacheInterface* worker_cache = get_worker_cache();
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
-    const string* name;
+    const std::string* name;
 
     // The worker referenced by name. (Not owned.)
     WorkerInterface* worker = nullptr;
@@ -1328,8 +1331,8 @@ absl::Status MasterSession::CreateWorkerSessions(
     }
   });
 
-  string task_name;
-  string local_device_name;
+  std::string task_name;
+  std::string local_device_name;
   DeviceNameUtils::SplitDeviceName(devices_->client_device()->name(),
                                    &task_name, &local_device_name);
   const int64_t client_device_incarnation =
@@ -1435,11 +1438,11 @@ absl::Status MasterSession::CreateWorkerSessions(
 
 absl::Status MasterSession::DeleteWorkerSessions() {
   WorkerCacheInterface* worker_cache = get_worker_cache();
-  const std::vector<string>& worker_names = filtered_worker_list_;
+  const std::vector<std::string>& worker_names = filtered_worker_list_;
 
   struct WorkerGroup {
     // The worker name. (Not owned.)
-    const string* name;
+    const std::string* name;
 
     // The worker referenced by name. (Not owned.)
     WorkerInterface* worker = nullptr;
@@ -1554,7 +1557,7 @@ absl::Status MasterSession::StartStep(const BuildGraphOptions& opts,
                                       bool is_partial,
                                       ReffedClientGraph** out_rcg,
                                       int64_t* out_count) {
-  const uint64 hash = HashBuildGraphOptions(opts);
+  const uint64_t hash = HashBuildGraphOptions(opts);
   {
     mutex_lock l(mu_);
     // TODO(suharshs): We cache partial run graphs and run graphs separately
@@ -1599,12 +1602,12 @@ void MasterSession::ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
   rcg_map->clear();
 }
 
-uint64 MasterSession::NewStepId(int64_t graph_key) {
+uint64_t MasterSession::NewStepId(int64_t graph_key) {
   if (graph_key == BuildGraphOptions::kNoCollectiveGraphKey) {
     // StepId must leave the most-significant 7 bits empty for future use.
     return random::New64() & (((1uLL << 56) - 1) | (1uLL << 56));
   } else {
-    uint64 step_id = env_->collective_executor_mgr->NextStepId(graph_key);
+    uint64_t step_id = env_->collective_executor_mgr->NextStepId(graph_key);
     int32_t retry_count = 0;
     while (static_cast<int64_t>(step_id) == CollectiveExecutor::kInvalidId) {
       absl::Notification note;
@@ -1631,7 +1634,7 @@ uint64 MasterSession::NewStepId(int64_t graph_key) {
 
 absl::Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
                                             PartialRunSetupResponse* resp) {
-  std::vector<string> inputs, outputs, targets;
+  std::vector<std::string> inputs, outputs, targets;
   for (const auto& feed : req->feed()) {
     inputs.push_back(feed);
   }
@@ -1642,7 +1645,7 @@ absl::Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
     targets.push_back(target);
   }
 
-  string handle = std::to_string(partial_run_handle_counter_.fetch_add(1));
+  std::string handle = std::to_string(partial_run_handle_counter_.fetch_add(1));
 
   ReffedClientGraph* rcg = nullptr;
 
@@ -1706,11 +1709,11 @@ absl::Status MasterSession::BuildAndRegisterPartitions(ReffedClientGraph* rcg) {
   // The closures popts.{new_name,get_incarnation} are called synchronously in
   // RegisterPartitions() below, so do not need a Ref()/Unref() pair to keep
   // "this" alive during the closure.
-  popts.new_name = [this](const string& prefix) {
+  popts.new_name = [this](const std::string& prefix) {
     mutex_lock l(mu_);
     return absl::StrCat(prefix, "_S", next_node_id_++);
   };
-  popts.get_incarnation = [this](const string& name) -> int64 {
+  popts.get_incarnation = [this](const std::string& name) -> int64_t {
     Device* d = devices_->FindDeviceByName(name);
     if (d == nullptr) {
       return PartitionOptions::kIllegalIncarnation;
@@ -1746,7 +1749,7 @@ absl::Status MasterSession::DoPartialRun(CallOptions* opts,
                                          const RunStepRequestWrapper& req,
                                          MutableRunStepResponseWrapper* resp) {
   auto cleanup = gtl::MakeCleanup([this] { MarkRunCompletion(); });
-  const string& prun_handle = req.partial_run_handle();
+  const std::string& prun_handle = req.partial_run_handle();
   RunState* run_state = nullptr;
   {
     mutex_lock l(mu_);
@@ -1802,7 +1805,7 @@ absl::Status MasterSession::DoPartialRun(CallOptions* opts,
 
   // Make sure that this is a new set of feeds that are still pending.
   for (size_t i = 0; i < req.num_feeds(); ++i) {
-    const string& feed = req.feed_name(i);
+    const std::string& feed = req.feed_name(i);
     auto it = run_state->pending_inputs.find(feed);
     if (it == run_state->pending_inputs.end()) {
       return errors::InvalidArgument(
@@ -1814,7 +1817,7 @@ absl::Status MasterSession::DoPartialRun(CallOptions* opts,
   }
   // Check that this is a new set of fetches that are still pending.
   for (size_t i = 0; i < req.num_fetches(); ++i) {
-    const string& fetch = req.fetch_name(i);
+    const std::string& fetch = req.fetch_name(i);
     auto it = run_state->pending_outputs.find(fetch);
     if (it == run_state->pending_outputs.end()) {
       return errors::InvalidArgument(
@@ -1879,17 +1882,17 @@ absl::Status MasterSession::CreateDebuggerState(
   TF_RETURN_IF_ERROR(
       DebuggerStateRegistry::CreateState(debug_options, debugger_state));
 
-  std::vector<string> input_names;
+  std::vector<std::string> input_names;
   input_names.reserve(req.num_feeds());
   for (size_t i = 0; i < req.num_feeds(); ++i) {
     input_names.push_back(req.feed_name(i));
   }
-  std::vector<string> output_names;
+  std::vector<std::string> output_names;
   output_names.reserve(req.num_fetches());
   for (size_t i = 0; i < req.num_fetches(); ++i) {
     output_names.push_back(req.fetch_name(i));
   }
-  std::vector<string> target_names;
+  std::vector<std::string> target_names;
   target_names.reserve(req.num_targets());
   for (size_t i = 0; i < req.num_targets(); ++i) {
     target_names.push_back(req.target_name(i));
@@ -1908,7 +1911,7 @@ absl::Status MasterSession::CreateDebuggerState(
 
 void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg,
                                      const RunOptions& run_options,
-                                     uint64 step_id, int64_t count,
+                                     uint64_t step_id, int64_t count,
                                      PerStepState* out_pss,
                                      std::unique_ptr<ProfileHandler>* out_ph) {
   out_pss->collect_timeline =
@@ -1935,7 +1938,7 @@ void MasterSession::FillPerStepState(MasterSession::ReffedClientGraph* rcg,
 }
 
 absl::Status MasterSession::PostRunCleanup(
-    MasterSession::ReffedClientGraph* rcg, uint64 step_id,
+    MasterSession::ReffedClientGraph* rcg, uint64_t step_id,
     const RunOptions& run_options, PerStepState* pss,
     const std::unique_ptr<ProfileHandler>& ph, const absl::Status& run_status,
     RunMetadata* out_run_metadata) {
@@ -2004,7 +2007,7 @@ absl::Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  uint64 step_id = NewStepId(rcg->collective_graph_key());
+  uint64_t step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);
 
   std::unique_ptr<ProfileHandler> ph;
@@ -2054,7 +2057,7 @@ absl::Status MasterSession::MakeCallable(const MakeCallableRequest& req,
     return s;
   }
 
-  uint64 handle;
+  uint64_t handle;
   {
     mutex_lock l(mu_);
     handle = next_callable_handle_++;
@@ -2077,7 +2080,7 @@ absl::Status MasterSession::DoRunCallable(CallOptions* opts,
   // Prepare.
   int64_t count = rcg->get_and_increment_execution_count();
 
-  const uint64 step_id = NewStepId(rcg->collective_graph_key());
+  const uint64_t step_id = NewStepId(rcg->collective_graph_key());
   TRACEPRINTF("stepid %llu", step_id);
 
   const RunOptions& run_options = rcg->callable_options().run_options();
@@ -2176,10 +2179,10 @@ void MasterSession::GarbageCollect() {
   Unref();
 }
 
-MasterSession::RunState::RunState(const std::vector<string>& input_names,
-                                  const std::vector<string>& output_names,
-                                  ReffedClientGraph* rcg, const uint64 step_id,
-                                  const int64_t count)
+MasterSession::RunState::RunState(const std::vector<std::string>& input_names,
+                                  const std::vector<std::string>& output_names,
+                                  ReffedClientGraph* rcg,
+                                  const uint64_t step_id, const int64_t count)
     : rcg(rcg), step_id(step_id), count(count) {
   // Initially all the feeds and fetches are pending.
   for (auto& name : input_names) {
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index f7016518bca5a9..b22953547b8f7c 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -52,7 +52,7 @@ class MasterSession : public core::RefCounted {
       std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
       std::unique_ptr<DeviceSet> device_set,
-      std::vector<string> filtered_worker_list,
+      std::vector<std::string> filtered_worker_list,
       StatsPublisherFactory stats_publisher_factory);
 
   // Initialize the MasterSession for "def".  Must be called before Extend(),
@@ -60,11 +60,13 @@ class MasterSession : public core::RefCounted {
   absl::Status Create(GraphDef&& def, const ClusterDef& cluster_def);
 
   // Returns the session handle.
-  const string& handle() const { return handle_; }
+  const std::string& handle() const { return handle_; }
 
   // Returns the last access time (the number of micro-seconds since
   // some fixed point in time) of this session.
-  uint64 last_access_time_usec() const { return last_access_time_usec_.load(); }
+  uint64_t last_access_time_usec() const {
+    return last_access_time_usec_.load();
+  }
 
   // Attempt to extend the graph according to the given "req".
   // (See master.proto for details of valid extensions.)
@@ -117,7 +119,7 @@ class MasterSession : public core::RefCounted {
   const MasterEnv* env_;
 
   // The opaque session handle.
-  const string handle_;
+  const std::string handle_;
 
   std::unique_ptr<std::vector<std::unique_ptr<Device>>> remote_devs_;
 
@@ -132,7 +134,7 @@ class MasterSession : public core::RefCounted {
 
   // The (partial device) names of remote worker tasks that this
   // session will contact.
-  const std::vector<string> filtered_worker_list_;
+  const std::vector<std::string> filtered_worker_list_;
 
   StatsPublisherFactory stats_publisher_factory_;
 
@@ -140,7 +142,7 @@ class MasterSession : public core::RefCounted {
 
   std::atomic<int64_t> partial_run_handle_counter_ = {0};
 
-  uint64 NewStepId(int64_t graph_key);
+  uint64_t NewStepId(int64_t graph_key);
 
   mutex mu_;
   std::unique_ptr<GraphExecutionState> execution_state_ TF_GUARDED_BY(mu_);
@@ -152,7 +154,7 @@ class MasterSession : public core::RefCounted {
   // before a new substitute has been created, Variables can go out of
   // scope and lose their state.
   class ReffedClientGraph;
-  typedef std::unordered_map<uint64, ReffedClientGraph*> RCGMap;
+  typedef std::unordered_map<uint64_t, ReffedClientGraph*> RCGMap;
   RCGMap run_graphs_ TF_GUARDED_BY(mu_);
   RCGMap partial_run_graphs_ TF_GUARDED_BY(mu_);
   int64_t next_callable_handle_ TF_GUARDED_BY(mu_) = 0;
@@ -172,35 +174,36 @@ class MasterSession : public core::RefCounted {
   };
 
   struct RunState {
-    std::unordered_map<string, bool> pending_inputs;   // true if fed
-    std::unordered_map<string, bool> pending_outputs;  // true if fetched
+    std::unordered_map<std::string, bool> pending_inputs;   // true if fed
+    std::unordered_map<std::string, bool> pending_outputs;  // true if fetched
     ReffedClientGraph* rcg = nullptr;
-    uint64 step_id;
+    uint64_t step_id;
     int64_t collective_graph_key;
     int64_t count = 0;
     PerStepState pss;
     std::unique_ptr<ProfileHandler> ph;
     bool step_started = false;
 
-    RunState(const std::vector<string>& input_names,
-             const std::vector<string>& output_names, ReffedClientGraph* rcg,
-             const uint64 step_id, const int64_t count);
+    RunState(const std::vector<std::string>& input_names,
+             const std::vector<std::string>& output_names,
+             ReffedClientGraph* rcg, const uint64_t step_id,
+             const int64_t count);
 
     bool PendingDone() const;
 
     ~RunState();
   };
-  std::unordered_map<string, std::unique_ptr<RunState>> partial_runs_
+  std::unordered_map<std::string, std::unique_ptr<RunState>> partial_runs_
       TF_GUARDED_BY(mu_);
 
   // Active RunStep calls.
   condition_variable num_running_is_zero_;
-  int32 num_running_ TF_GUARDED_BY(mu_) = 0;
+  int32_t num_running_ TF_GUARDED_BY(mu_) = 0;
 
   bool closed_ TF_GUARDED_BY(mu_) = false;
   bool garbage_collected_ TF_GUARDED_BY(mu_) = false;
 
-  std::unordered_map<uint64, int64_t> subgraph_execution_counts_
+  std::unordered_map<uint64_t, int64_t> subgraph_execution_counts_
       TF_GUARDED_BY(mu_);
 
   // We need to ensure that certain nodes added (e.g., send and recv
@@ -228,7 +231,7 @@ class MasterSession : public core::RefCounted {
   void ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
                       RCGMap* rcg_map) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
   void FillPerStepState(MasterSession::ReffedClientGraph* rcg,
-                        const RunOptions& run_options, uint64 step_id,
+                        const RunOptions& run_options, uint64_t step_id,
                         int64_t count, PerStepState* out_pss,
                         std::unique_ptr<ProfileHandler>* out_ph);
   absl::Status DoRunWithLocalExecution(CallOptions* opts,
@@ -240,7 +243,7 @@ class MasterSession : public core::RefCounted {
                              const RunCallableRequest& req,
                              RunCallableResponse* resp);
   absl::Status PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
-                              uint64 step_id, const RunOptions& run_options,
+                              uint64_t step_id, const RunOptions& run_options,
                               PerStepState* pss,
                               const std::unique_ptr<ProfileHandler>& ph,
                               const absl::Status& run_status,
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index ed6461c63b07ac..8269f1dca201cd 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -44,7 +44,7 @@ namespace tensorflow {
 class MasterTest : public ::testing::Test {
  protected:
   MasterTest() {
-    std::vector<string> targets;
+    std::vector<std::string> targets;
     SessionOptions options;
     (*options.config.mutable_device_count())["CPU"] = 1;
     (*options.config.mutable_device_count())["GPU"] = 0;
@@ -64,7 +64,7 @@ class MasterTest : public ::testing::Test {
   // Helpers for MasterService.{CreateSession,RunStep,CloseSession}
   // rpc calls.
 
-  absl::Status CreateSession(const GraphDef& def, string* handle,
+  absl::Status CreateSession(const GraphDef& def, std::string* handle,
                              int64_t* initial_version) {
     ::grpc::ClientContext ctx;
     CreateSessionRequest req;
@@ -81,7 +81,7 @@ class MasterTest : public ::testing::Test {
     return s;
   }
 
-  absl::Status ExtendSession(const string& handle, const GraphDef& def,
+  absl::Status ExtendSession(const std::string& handle, const GraphDef& def,
                              int64_t current_version, int64_t* new_version) {
     ::grpc::ClientContext ctx;
     ExtendSessionRequest req;
@@ -98,21 +98,21 @@ class MasterTest : public ::testing::Test {
   }
 
   absl::Status RunStep(
-      const string& handle,
-      const std::vector<std::pair<string, const Tensor*> >& feed,
-      const std::map<string, Tensor*>& fetch) {
+      const std::string& handle,
+      const std::vector<std::pair<std::string, const Tensor*> >& feed,
+      const std::map<std::string, Tensor*>& fetch) {
     ::grpc::ClientContext ctx;
     RunStepRequest req;
     req.set_session_handle(handle);
     for (const auto& p : feed) {
-      const string& feed_name = p.first;
+      const std::string& feed_name = p.first;
       const Tensor* feed_tensor = p.second;
       auto f = req.add_feed();
       f->set_name(feed_name);
       feed_tensor->AsProtoTensorContent(f->mutable_tensor());
     }
     for (const auto& p : fetch) {
-      const string& fetch_name = p.first;
+      const std::string& fetch_name = p.first;
       req.add_fetch(fetch_name);
     }
     RunStepResponse resp;
@@ -127,7 +127,7 @@ class MasterTest : public ::testing::Test {
     return s;
   }
 
-  absl::Status CloseSession(const string& handle) {
+  absl::Status CloseSession(const std::string& handle) {
     ::grpc::ClientContext ctx;
     CloseSessionRequest req;
     req.set_session_handle(handle);
@@ -145,7 +145,7 @@ class MasterTest : public ::testing::Test {
 
 TEST_F(MasterTest, CreateClose) {
   GraphDef def;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def, &handle, &initial_version));
   EXPECT_TRUE(absl::IsAborted(CloseSession("randombits")));
@@ -164,7 +164,7 @@ TEST_F(MasterTest, ListDevices) {
 
 TEST_F(MasterTest, Reset) {
   GraphDef def;  // Empty.
-  string s1, s2;
+  std::string s1, s2;
   int64_t initial_version1, initial_version2;
   TF_ASSERT_OK(CreateSession(def, &s1, &initial_version1));
   TF_ASSERT_OK(CreateSession(def, &s2, &initial_version2));
@@ -175,7 +175,7 @@ TEST_F(MasterTest, Reset) {
 
 TEST_F(MasterTest, Extend) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));
 
@@ -216,7 +216,7 @@ TEST_F(MasterTest, Extend) {
 
 TEST_F(MasterTest, ExtendUpdateStatefulFails) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));
 
@@ -235,7 +235,7 @@ TEST_F(MasterTest, ExtendUpdateStatefulFails) {
 
 TEST_F(MasterTest, ExtendTwiceFails) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));
 
@@ -254,7 +254,7 @@ TEST_F(MasterTest, ExtendTwiceFails) {
 
 TEST_F(MasterTest, ConcurrentExtendOnlyOneSucceeds) {
   GraphDef def_0;  // Empty.
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));
 
@@ -306,7 +306,7 @@ TEST_F(MasterTest, ConcurrentExtendAndRun) {
   GraphDef def_0;
   test::graph::ToGraphDef(&graph_0, &def_0);
 
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_ASSERT_OK(CreateSession(def_0, &handle, &initial_version));
 
@@ -388,7 +388,7 @@ TEST_F(MasterTest, EigenProblem) {
   GraphDef def;
   test::graph::ToGraphDef(&graph, &def);
 
-  string handle;
+  std::string handle;
   int64_t initial_version;
   TF_CHECK_OK(CreateSession(def, &handle, &initial_version));
 
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.cc b/tensorflow/core/distributed_runtime/message_wrappers.cc
index 60a264565dbb61..7eabcadcc173bf 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers.cc
@@ -38,24 +38,24 @@ bool ParseTensorProtoToTensor(const TensorProto& tensor_proto,
   return false;
 }
 
-const string& InMemoryRunStepRequest::session_handle() const {
+const std::string& InMemoryRunStepRequest::session_handle() const {
   return session_handle_;
 }
 
-void InMemoryRunStepRequest::set_session_handle(const string& handle) {
+void InMemoryRunStepRequest::set_session_handle(const std::string& handle) {
   session_handle_ = handle;
 }
 
-const string& InMemoryRunStepRequest::partial_run_handle() const {
+const std::string& InMemoryRunStepRequest::partial_run_handle() const {
   return partial_run_handle_;
 }
 
-void InMemoryRunStepRequest::set_partial_run_handle(const string& handle) {
+void InMemoryRunStepRequest::set_partial_run_handle(const std::string& handle) {
   partial_run_handle_ = handle;
 }
 
 size_t InMemoryRunStepRequest::num_feeds() const { return feeds_.size(); }
-const string& InMemoryRunStepRequest::feed_name(size_t i) const {
+const std::string& InMemoryRunStepRequest::feed_name(size_t i) const {
   return feeds_[i].first;
 }
 
@@ -71,23 +71,24 @@ absl::Status InMemoryRunStepRequest::FeedValue(size_t i,
   return absl::OkStatus();
 }
 
-void InMemoryRunStepRequest::add_feed(const string& name, const Tensor& value) {
+void InMemoryRunStepRequest::add_feed(const std::string& name,
+                                      const Tensor& value) {
   feeds_.emplace_back(name, value);
 }
 
 size_t InMemoryRunStepRequest::num_fetches() const { return fetches_.size(); }
-const string& InMemoryRunStepRequest::fetch_name(size_t i) const {
+const std::string& InMemoryRunStepRequest::fetch_name(size_t i) const {
   return fetches_[i];
 }
-void InMemoryRunStepRequest::add_fetch(const string& name) {
+void InMemoryRunStepRequest::add_fetch(const std::string& name) {
   fetches_.push_back(name);
 }
 
 size_t InMemoryRunStepRequest::num_targets() const { return targets_.size(); }
-const string& InMemoryRunStepRequest::target_name(size_t i) const {
+const std::string& InMemoryRunStepRequest::target_name(size_t i) const {
   return targets_[i];
 }
-void InMemoryRunStepRequest::add_target(const string& name) {
+void InMemoryRunStepRequest::add_target(const std::string& name) {
   targets_.push_back(name);
 }
 
@@ -108,7 +109,7 @@ void InMemoryRunStepRequest::set_store_errors_in_response_body(
   store_errors_in_response_body_ = store_errors;
 }
 
-string InMemoryRunStepRequest::DebugString() const {
+std::string InMemoryRunStepRequest::DebugString() const {
   return ToProto().DebugString();
 }
 
@@ -133,24 +134,25 @@ const RunStepRequest& InMemoryRunStepRequest::ToProto() const {
   return *proto_version_;
 }
 
-const string& MutableProtoRunStepRequest::session_handle() const {
+const std::string& MutableProtoRunStepRequest::session_handle() const {
   return request_.session_handle();
 }
-void MutableProtoRunStepRequest::set_session_handle(const string& handle) {
+void MutableProtoRunStepRequest::set_session_handle(const std::string& handle) {
   request_.set_session_handle(handle);
 }
 
-const string& MutableProtoRunStepRequest::partial_run_handle() const {
+const std::string& MutableProtoRunStepRequest::partial_run_handle() const {
   return request_.partial_run_handle();
 }
-void MutableProtoRunStepRequest::set_partial_run_handle(const string& handle) {
+void MutableProtoRunStepRequest::set_partial_run_handle(
+    const std::string& handle) {
   request_.set_partial_run_handle(handle);
 }
 
 size_t MutableProtoRunStepRequest::num_feeds() const {
   return request_.feed_size();
 }
-const string& MutableProtoRunStepRequest::feed_name(size_t i) const {
+const std::string& MutableProtoRunStepRequest::feed_name(size_t i) const {
   return request_.feed(i).name();
 }
 absl::Status MutableProtoRunStepRequest::FeedValue(size_t i,
@@ -168,7 +170,7 @@ absl::Status MutableProtoRunStepRequest::FeedValue(
   return absl::OkStatus();
 }
 
-void MutableProtoRunStepRequest::add_feed(const string& name,
+void MutableProtoRunStepRequest::add_feed(const std::string& name,
                                           const Tensor& value) {
   NamedTensorProto* feed = request_.add_feed();
   feed->set_name(name);
@@ -180,10 +182,10 @@ size_t MutableProtoRunStepRequest::num_fetches() const {
   return request_.fetch_size();
 }
 
-const string& MutableProtoRunStepRequest::fetch_name(size_t i) const {
+const std::string& MutableProtoRunStepRequest::fetch_name(size_t i) const {
   return request_.fetch(i);
 }
-void MutableProtoRunStepRequest::add_fetch(const string& name) {
+void MutableProtoRunStepRequest::add_fetch(const std::string& name) {
   request_.add_fetch(name);
 }
 
@@ -191,11 +193,11 @@ size_t MutableProtoRunStepRequest::num_targets() const {
   return request_.target_size();
 }
 
-const string& MutableProtoRunStepRequest::target_name(size_t i) const {
+const std::string& MutableProtoRunStepRequest::target_name(size_t i) const {
   return request_.target(i);
 }
 
-void MutableProtoRunStepRequest::add_target(const string& name) {
+void MutableProtoRunStepRequest::add_target(const std::string& name) {
   request_.add_target(name);
 }
 
@@ -220,7 +222,7 @@ int64_t MutableProtoRunStepRequest::request_id() const {
   return request_.request_id();
 }
 
-string MutableProtoRunStepRequest::DebugString() const {
+std::string MutableProtoRunStepRequest::DebugString() const {
   return request_.DebugString();
 }
 
@@ -231,17 +233,17 @@ const RunStepRequest& MutableProtoRunStepRequest::ToProto() const {
 ProtoRunStepRequest::ProtoRunStepRequest(const RunStepRequest* request)
     : request_(request) {}
 
-const string& ProtoRunStepRequest::session_handle() const {
+const std::string& ProtoRunStepRequest::session_handle() const {
   return request_->session_handle();
 }
 
-const string& ProtoRunStepRequest::partial_run_handle() const {
+const std::string& ProtoRunStepRequest::partial_run_handle() const {
   return request_->partial_run_handle();
 }
 
 size_t ProtoRunStepRequest::num_feeds() const { return request_->feed_size(); }
 
-const string& ProtoRunStepRequest::feed_name(size_t i) const {
+const std::string& ProtoRunStepRequest::feed_name(size_t i) const {
   return request_->feed(i).name();
 }
 
@@ -264,7 +266,7 @@ size_t ProtoRunStepRequest::num_fetches() const {
   return request_->fetch_size();
 }
 
-const string& ProtoRunStepRequest::fetch_name(size_t i) const {
+const std::string& ProtoRunStepRequest::fetch_name(size_t i) const {
   return request_->fetch(i);
 }
 
@@ -272,7 +274,7 @@ size_t ProtoRunStepRequest::num_targets() const {
   return request_->target_size();
 }
 
-const string& ProtoRunStepRequest::target_name(size_t i) const {
+const std::string& ProtoRunStepRequest::target_name(size_t i) const {
   return request_->target(i);
 }
 
@@ -288,13 +290,13 @@ int64_t ProtoRunStepRequest::request_id() const {
   return request_->request_id();
 }
 
-string ProtoRunStepRequest::DebugString() const {
+std::string ProtoRunStepRequest::DebugString() const {
   return request_->DebugString();
 }
 
 const RunStepRequest& ProtoRunStepRequest::ToProto() const { return *request_; }
 
-const string& InMemoryRunGraphRequest::session_handle() const {
+const std::string& InMemoryRunGraphRequest::session_handle() const {
   return session_handle_;
 }
 
@@ -302,7 +304,7 @@ bool InMemoryRunGraphRequest::create_worker_session_called() const {
   return create_worker_session_called_;
 }
 
-void InMemoryRunGraphRequest::set_session_handle(const string& handle) {
+void InMemoryRunGraphRequest::set_session_handle(const std::string& handle) {
   session_handle_ = handle;
 }
 
@@ -310,11 +312,11 @@ void InMemoryRunGraphRequest::set_create_worker_session_called(bool called) {
   create_worker_session_called_ = called;
 }
 
-const string& InMemoryRunGraphRequest::graph_handle() const {
+const std::string& InMemoryRunGraphRequest::graph_handle() const {
   return graph_handle_;
 }
 
-void InMemoryRunGraphRequest::set_graph_handle(const string& handle) {
+void InMemoryRunGraphRequest::set_graph_handle(const std::string& handle) {
   graph_handle_ = handle;
 }
 
@@ -334,7 +336,7 @@ ExecutorOpts* InMemoryRunGraphRequest::mutable_exec_opts() {
 
 size_t InMemoryRunGraphRequest::num_sends() const { return sends_.size(); }
 
-const string& InMemoryRunGraphRequest::send_key(size_t i) const {
+const std::string& InMemoryRunGraphRequest::send_key(size_t i) const {
   return sends_[i].first;
 }
 
@@ -346,7 +348,7 @@ absl::Status InMemoryRunGraphRequest::SendValue(size_t i,
 
 absl::Status InMemoryRunGraphRequest::AddSendFromRunStepRequest(
     const RunStepRequestWrapper& run_step_request, size_t i,
-    const string& send_key) {
+    const std::string& send_key) {
   Tensor tensor;
   TF_RETURN_IF_ERROR(run_step_request.FeedValue(i, &tensor));
   sends_.emplace_back(send_key, std::move(tensor));
@@ -355,7 +357,7 @@ absl::Status InMemoryRunGraphRequest::AddSendFromRunStepRequest(
 
 absl::Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest(
     const RunCallableRequest& run_callable_request, size_t i,
-    const string& send_key) {
+    const std::string& send_key) {
   Tensor tensor;
   if (!ParseTensorProtoToTensor(run_callable_request.feed(i), &tensor)) {
     return errors::InvalidArgument("Invalid TensorProto for feed value ", i);
@@ -366,11 +368,11 @@ absl::Status InMemoryRunGraphRequest::AddSendFromRunCallableRequest(
 
 size_t InMemoryRunGraphRequest::num_recvs() const { return recvs_.size(); }
 
-const string& InMemoryRunGraphRequest::recv_key(size_t i) const {
+const std::string& InMemoryRunGraphRequest::recv_key(size_t i) const {
   return recvs_[i];
 }
 
-void InMemoryRunGraphRequest::add_recv_key(const string& recv_key) {
+void InMemoryRunGraphRequest::add_recv_key(const std::string& recv_key) {
   recvs_.push_back(recv_key);
 }
 
@@ -430,11 +432,12 @@ const RunGraphRequest& InMemoryRunGraphRequest::ToProto() const {
   return *proto_version_;
 }
 
-const string& MutableProtoRunGraphRequest::session_handle() const {
+const std::string& MutableProtoRunGraphRequest::session_handle() const {
   return request_.session_handle();
 }
 
-void MutableProtoRunGraphRequest::set_session_handle(const string& handle) {
+void MutableProtoRunGraphRequest::set_session_handle(
+    const std::string& handle) {
   request_.set_session_handle(handle);
 }
 
@@ -447,11 +450,11 @@ void MutableProtoRunGraphRequest::set_create_worker_session_called(
   request_.set_create_worker_session_called(called);
 }
 
-const string& MutableProtoRunGraphRequest::graph_handle() const {
+const std::string& MutableProtoRunGraphRequest::graph_handle() const {
   return request_.graph_handle();
 }
 
-void MutableProtoRunGraphRequest::set_graph_handle(const string& handle) {
+void MutableProtoRunGraphRequest::set_graph_handle(const std::string& handle) {
   request_.set_graph_handle(handle);
 }
 
@@ -475,7 +478,7 @@ size_t MutableProtoRunGraphRequest::num_sends() const {
   return request_.send_size();
 }
 
-const string& MutableProtoRunGraphRequest::send_key(size_t i) const {
+const std::string& MutableProtoRunGraphRequest::send_key(size_t i) const {
   return request_.send(i).name();
 }
 
@@ -490,7 +493,7 @@ absl::Status MutableProtoRunGraphRequest::SendValue(size_t i,
 
 absl::Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest(
     const RunStepRequestWrapper& run_step_request, size_t i,
-    const string& send_key) {
+    const std::string& send_key) {
   NamedTensorProto* send = request_.add_send();
   send->set_name(send_key);
   TF_RETURN_IF_ERROR(run_step_request.FeedValue(i, send->mutable_tensor()));
@@ -499,7 +502,7 @@ absl::Status MutableProtoRunGraphRequest::AddSendFromRunStepRequest(
 
 absl::Status MutableProtoRunGraphRequest::AddSendFromRunCallableRequest(
     const RunCallableRequest& run_callable_request, size_t i,
-    const string& send_key) {
+    const std::string& send_key) {
   NamedTensorProto* send = request_.add_send();
   send->set_name(send_key);
   *send->mutable_tensor() = run_callable_request.feed(i);
@@ -510,11 +513,11 @@ size_t MutableProtoRunGraphRequest::num_recvs() const {
   return request_.recv_key_size();
 }
 
-const string& MutableProtoRunGraphRequest::recv_key(size_t i) const {
+const std::string& MutableProtoRunGraphRequest::recv_key(size_t i) const {
   return request_.recv_key(i);
 }
 
-void MutableProtoRunGraphRequest::add_recv_key(const string& recv_key) {
+void MutableProtoRunGraphRequest::add_recv_key(const std::string& recv_key) {
   request_.add_recv_key(recv_key);
 }
 
@@ -559,7 +562,7 @@ const RunGraphRequest& MutableProtoRunGraphRequest::ToProto() const {
 ProtoRunGraphRequest::ProtoRunGraphRequest(const RunGraphRequest* request)
     : request_(request) {}
 
-const string& ProtoRunGraphRequest::session_handle() const {
+const std::string& ProtoRunGraphRequest::session_handle() const {
   return request_->session_handle();
 }
 
@@ -567,7 +570,7 @@ bool ProtoRunGraphRequest::create_worker_session_called() const {
   return request_->create_worker_session_called();
 }
 
-const string& ProtoRunGraphRequest::graph_handle() const {
+const std::string& ProtoRunGraphRequest::graph_handle() const {
   return request_->graph_handle();
 }
 
@@ -579,7 +582,7 @@ const ExecutorOpts& ProtoRunGraphRequest::exec_opts() const {
 
 size_t ProtoRunGraphRequest::num_sends() const { return request_->send_size(); }
 
-const string& ProtoRunGraphRequest::send_key(size_t i) const {
+const std::string& ProtoRunGraphRequest::send_key(size_t i) const {
   return request_->send(i).name();
 }
 
@@ -596,7 +599,7 @@ size_t ProtoRunGraphRequest::num_recvs() const {
   return request_->recv_key_size();
 }
 
-const string& ProtoRunGraphRequest::recv_key(size_t i) const {
+const std::string& ProtoRunGraphRequest::recv_key(size_t i) const {
   return request_->recv_key(i);
 }
 
@@ -620,7 +623,7 @@ const RunGraphRequest& ProtoRunGraphRequest::ToProto() const {
 
 size_t InMemoryRunGraphResponse::num_recvs() const { return recvs_.size(); }
 
-const string& InMemoryRunGraphResponse::recv_key(size_t i) const {
+const std::string& InMemoryRunGraphResponse::recv_key(size_t i) const {
   return recvs_[i].first;
 }
 
@@ -635,7 +638,8 @@ absl::Status InMemoryRunGraphResponse::RecvValue(size_t i, Tensor* out_tensor) {
   return absl::OkStatus();
 }
 
-void InMemoryRunGraphResponse::AddRecv(const string& key, const Tensor& value) {
+void InMemoryRunGraphResponse::AddRecv(const std::string& key,
+                                       const Tensor& value) {
   recvs_.emplace_back(key, value);
 }
 
@@ -679,7 +683,7 @@ size_t OwnedProtoRunGraphResponse::num_recvs() const {
   return response_.recv_size();
 }
 
-const string& OwnedProtoRunGraphResponse::recv_key(size_t i) const {
+const std::string& OwnedProtoRunGraphResponse::recv_key(size_t i) const {
   return response_.recv(i).name();
 }
 
@@ -698,7 +702,7 @@ absl::Status OwnedProtoRunGraphResponse::RecvValue(size_t i,
   }
 }
 
-void OwnedProtoRunGraphResponse::AddRecv(const string& key,
+void OwnedProtoRunGraphResponse::AddRecv(const std::string& key,
                                          const Tensor& value) {
   NamedTensorProto* recv = response_.add_recv();
   recv->set_name(key);
@@ -752,7 +756,7 @@ size_t NonOwnedProtoRunGraphResponse::num_recvs() const {
   return response_->recv_size();
 }
 
-const string& NonOwnedProtoRunGraphResponse::recv_key(size_t i) const {
+const std::string& NonOwnedProtoRunGraphResponse::recv_key(size_t i) const {
   return response_->recv(i).name();
 }
 
@@ -771,7 +775,7 @@ absl::Status NonOwnedProtoRunGraphResponse::RecvValue(size_t i,
   }
 }
 
-void NonOwnedProtoRunGraphResponse::AddRecv(const string& key,
+void NonOwnedProtoRunGraphResponse::AddRecv(const std::string& key,
                                             const Tensor& value) {
   NamedTensorProto* recv = response_->add_recv();
   recv->set_name(key);
@@ -823,7 +827,7 @@ MutableRunStepResponseWrapper::~MutableRunStepResponseWrapper() {}
 
 size_t InMemoryRunStepResponse::num_tensors() const { return tensors_.size(); }
 
-const string& InMemoryRunStepResponse::tensor_name(size_t i) const {
+const std::string& InMemoryRunStepResponse::tensor_name(size_t i) const {
   return tensors_[i].first;
 }
 
@@ -838,7 +842,8 @@ const RunMetadata& InMemoryRunStepResponse::metadata() const {
 }
 
 absl::Status InMemoryRunStepResponse::AddTensorFromRunGraphResponse(
-    const string& name, MutableRunGraphResponseWrapper* wrapper, size_t i) {
+    const std::string& name, MutableRunGraphResponseWrapper* wrapper,
+    size_t i) {
   Tensor tensor;
   TF_RETURN_IF_ERROR(wrapper->RecvValue(i, &tensor));
   tensors_.emplace_back(name, tensor);
@@ -866,7 +871,7 @@ size_t OwnedProtoRunStepResponse::num_tensors() const {
   return response_.tensor_size();
 }
 
-const string& OwnedProtoRunStepResponse::tensor_name(size_t i) const {
+const std::string& OwnedProtoRunStepResponse::tensor_name(size_t i) const {
   return response_.tensor(i).name();
 }
 
@@ -884,7 +889,7 @@ const RunMetadata& OwnedProtoRunStepResponse::metadata() const {
 }
 
 absl::Status OwnedProtoRunStepResponse::AddTensorFromRunGraphResponse(
-    const string& name, MutableRunGraphResponseWrapper* run_graph_response,
+    const std::string& name, MutableRunGraphResponseWrapper* run_graph_response,
     size_t i) {
   NamedTensorProto* response_tensor = response_.add_tensor();
   response_tensor->set_name(name);
@@ -919,7 +924,7 @@ size_t NonOwnedProtoRunStepResponse::num_tensors() const {
   return response_->tensor_size();
 }
 
-const string& NonOwnedProtoRunStepResponse::tensor_name(size_t i) const {
+const std::string& NonOwnedProtoRunStepResponse::tensor_name(size_t i) const {
   return response_->tensor(i).name();
 }
 
@@ -937,7 +942,7 @@ const RunMetadata& NonOwnedProtoRunStepResponse::metadata() const {
 }
 
 absl::Status NonOwnedProtoRunStepResponse::AddTensorFromRunGraphResponse(
-    const string& name, MutableRunGraphResponseWrapper* run_graph_response,
+    const std::string& name, MutableRunGraphResponseWrapper* run_graph_response,
     size_t i) {
   NamedTensorProto* response_tensor = response_->add_tensor();
   response_tensor->set_name(name);
diff --git a/tensorflow/core/distributed_runtime/message_wrappers.h b/tensorflow/core/distributed_runtime/message_wrappers.h
index d4b07fb51ce4a3..b911d23245b4ad 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers.h
+++ b/tensorflow/core/distributed_runtime/message_wrappers.h
@@ -53,15 +53,15 @@ class RunStepRequestWrapper {
 
   // REQUIRED: session_handle must be returned by a CreateSession call
   // to the same master service.
-  virtual const string& session_handle() const = 0;
+  virtual const std::string& session_handle() const = 0;
 
   // Partial run handle (optional). If specified, this will be a partial run
   // execution, run up to the specified fetches.
-  virtual const string& partial_run_handle() const = 0;
+  virtual const std::string& partial_run_handle() const = 0;
 
   // Tensors to be fed in the step. Each feed is a named tensor.
   virtual size_t num_feeds() const = 0;
-  virtual const string& feed_name(size_t i) const = 0;
+  virtual const std::string& feed_name(size_t i) const = 0;
 
   // Stores the content of the feed value at index `i` in `tensor`.
   virtual absl::Status FeedValue(size_t i, Tensor* out_tensor) const = 0;
@@ -71,12 +71,12 @@ class RunStepRequestWrapper {
   // be returned for each fetch[i] (see RunStepResponse.tensor). The
   // order of specified fetches does not change the execution order.
   virtual size_t num_fetches() const = 0;
-  virtual const string& fetch_name(size_t i) const = 0;
+  virtual const std::string& fetch_name(size_t i) const = 0;
 
   // Target Nodes. A list of node names. The named nodes will be run
   // to but their outputs will not be fetched.
   virtual size_t num_targets() const = 0;
-  virtual const string& target_name(size_t i) const = 0;
+  virtual const std::string& target_name(size_t i) const = 0;
 
   // Options for the run call.
   virtual const RunOptions& options() const = 0;
@@ -94,7 +94,7 @@ class RunStepRequestWrapper {
   virtual int64_t request_id() const = 0;
 
   // Returns a human-readable representation of this message for debugging.
-  virtual string DebugString() const = 0;
+  virtual std::string DebugString() const = 0;
 
   // Returns the wrapped data as a protocol buffer message.
   virtual const RunStepRequest& ToProto() const = 0;
@@ -105,11 +105,11 @@ class RunStepRequestWrapper {
 // See `RunStepRequestWrapper` above for a description of the fields.
 class MutableRunStepRequestWrapper : public RunStepRequestWrapper {
  public:
-  virtual void set_session_handle(const string& handle) = 0;
-  virtual void set_partial_run_handle(const string& handle) = 0;
-  virtual void add_feed(const string& name, const Tensor& value) = 0;
-  virtual void add_fetch(const string& name) = 0;
-  virtual void add_target(const string& name) = 0;
+  virtual void set_session_handle(const std::string& handle) = 0;
+  virtual void set_partial_run_handle(const std::string& handle) = 0;
+  virtual void add_feed(const std::string& name, const Tensor& value) = 0;
+  virtual void add_fetch(const std::string& name) = 0;
+  virtual void add_target(const std::string& name) = 0;
   virtual RunOptions* mutable_options() = 0;
   virtual void set_store_errors_in_response_body(bool store_errors) = 0;
 };
@@ -119,37 +119,37 @@ class MutableRunStepRequestWrapper : public RunStepRequestWrapper {
 class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
  public:
   // RunStepRequestWrapper methods.
-  const string& session_handle() const override;
-  const string& partial_run_handle() const override;
+  const std::string& session_handle() const override;
+  const std::string& partial_run_handle() const override;
   size_t num_feeds() const override;
-  const string& feed_name(size_t i) const override;
+  const std::string& feed_name(size_t i) const override;
   absl::Status FeedValue(size_t i, Tensor* out_tensor) const override;
   absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override;
   size_t num_fetches() const override;
-  const string& fetch_name(size_t i) const override;
+  const std::string& fetch_name(size_t i) const override;
   size_t num_targets() const override;
-  const string& target_name(size_t i) const override;
+  const std::string& target_name(size_t i) const override;
   const RunOptions& options() const override;
-  string DebugString() const override;
+  std::string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
   int64_t request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
-  void set_session_handle(const string& handle) override;
-  void set_partial_run_handle(const string& handle) override;
-  void add_feed(const string& name, const Tensor& value) override;
-  void add_fetch(const string& name) override;
-  void add_target(const string& name) override;
+  void set_session_handle(const std::string& handle) override;
+  void set_partial_run_handle(const std::string& handle) override;
+  void add_feed(const std::string& name, const Tensor& value) override;
+  void add_fetch(const std::string& name) override;
+  void add_target(const std::string& name) override;
   RunOptions* mutable_options() override;
   void set_store_errors_in_response_body(bool store_errors) override;
 
  private:
-  string session_handle_;
-  string partial_run_handle_;
-  absl::InlinedVector<std::pair<string, Tensor>, 4UL> feeds_;
-  absl::InlinedVector<string, 4UL> fetches_;
-  absl::InlinedVector<string, 4UL> targets_;
+  std::string session_handle_;
+  std::string partial_run_handle_;
+  absl::InlinedVector<std::pair<std::string, Tensor>, 4UL> feeds_;
+  absl::InlinedVector<std::string, 4UL> fetches_;
+  absl::InlinedVector<std::string, 4UL> targets_;
   RunOptions options_;
   bool store_errors_in_response_body_ = false;
 
@@ -170,28 +170,28 @@ class InMemoryRunStepRequest : public MutableRunStepRequestWrapper {
 class MutableProtoRunStepRequest : public MutableRunStepRequestWrapper {
  public:
   // RunStepRequestWrapper methods.
-  const string& session_handle() const override;
-  const string& partial_run_handle() const override;
+  const std::string& session_handle() const override;
+  const std::string& partial_run_handle() const override;
   size_t num_feeds() const override;
-  const string& feed_name(size_t i) const override;
+  const std::string& feed_name(size_t i) const override;
   absl::Status FeedValue(size_t i, Tensor* out_tensor) const override;
   absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override;
   size_t num_fetches() const override;
-  const string& fetch_name(size_t i) const override;
+  const std::string& fetch_name(size_t i) const override;
   size_t num_targets() const override;
-  const string& target_name(size_t i) const override;
+  const std::string& target_name(size_t i) const override;
   const RunOptions& options() const override;
-  string DebugString() const override;
+  std::string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
   int64_t request_id() const override;
 
   // MutableRunStepRequestWrapper methods.
-  void set_session_handle(const string& handle) override;
-  void set_partial_run_handle(const string& handle) override;
-  void add_feed(const string& name, const Tensor& value) override;
-  void add_fetch(const string& name) override;
-  void add_target(const string& name) override;
+  void set_session_handle(const std::string& handle) override;
+  void set_partial_run_handle(const std::string& handle) override;
+  void add_feed(const std::string& name, const Tensor& value) override;
+  void add_fetch(const std::string& name) override;
+  void add_target(const std::string& name) override;
   RunOptions* mutable_options() override;
   void set_store_errors_in_response_body(bool store_errors) override;
 
@@ -211,18 +211,18 @@ class ProtoRunStepRequest : public RunStepRequestWrapper {
   ProtoRunStepRequest(const RunStepRequest* request);
 
   // RunStepRequestWrapper methods.
-  const string& session_handle() const override;
-  const string& partial_run_handle() const override;
+  const std::string& session_handle() const override;
+  const std::string& partial_run_handle() const override;
   size_t num_feeds() const override;
-  const string& feed_name(size_t i) const override;
+  const std::string& feed_name(size_t i) const override;
   absl::Status FeedValue(size_t i, Tensor* out_tensor) const override;
   absl::Status FeedValue(size_t i, TensorProto* out_tensor) const override;
   size_t num_fetches() const override;
-  const string& fetch_name(size_t i) const override;
+  const std::string& fetch_name(size_t i) const override;
   size_t num_targets() const override;
-  const string& target_name(size_t i) const override;
+  const std::string& target_name(size_t i) const override;
   const RunOptions& options() const override;
-  string DebugString() const override;
+  std::string DebugString() const override;
   const RunStepRequest& ToProto() const override;
   bool store_errors_in_response_body() const override;
   int64_t request_id() const override;
@@ -254,14 +254,14 @@ class RunGraphRequestWrapper {
 
   // The session handle used to register the graph. If empty, a single global
   // namespace is used.
-  virtual const string& session_handle() const = 0;
+  virtual const std::string& session_handle() const = 0;
 
   // Set to true if `CreateWorkerSession` was called for `session_handle`.
   virtual bool create_worker_session_called() const = 0;
 
   // REQUIRED: graph_handle must be returned by a RegisterGraph call
   // to the same WorkerService.
-  virtual const string& graph_handle() const = 0;
+  virtual const std::string& graph_handle() const = 0;
 
   // A unique ID to distinguish different runs of the same graph.
   //
@@ -276,12 +276,12 @@ class RunGraphRequestWrapper {
 
   // Sends the tensors in "send" into the graph before the run.
   virtual size_t num_sends() const = 0;
-  virtual const string& send_key(size_t i) const = 0;
+  virtual const std::string& send_key(size_t i) const = 0;
   virtual absl::Status SendValue(size_t i, Tensor* out_tensor) const = 0;
 
   // Fetches the keys into `RunGraphResponse.recv` after the run.
   virtual size_t num_recvs() const = 0;
-  virtual const string& recv_key(size_t i) const = 0;
+  virtual const std::string& recv_key(size_t i) const = 0;
 
   // True if the RunGraphRequest is a partial run request.
   virtual bool is_partial() const = 0;
@@ -307,9 +307,9 @@ class RunGraphRequestWrapper {
 // See `RunGraphRequestWrapper` above for a description of the fields.
 class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
  public:
-  virtual void set_session_handle(const string& handle) = 0;
+  virtual void set_session_handle(const std::string& handle) = 0;
   virtual void set_create_worker_session_called(bool called) = 0;
-  virtual void set_graph_handle(const string& handle) = 0;
+  virtual void set_graph_handle(const std::string& handle) = 0;
   virtual void set_step_id(int64_t step_id) = 0;
   virtual ExecutorOpts* mutable_exec_opts() = 0;
 
@@ -317,12 +317,12 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
   // request with the given `send_key`.
   virtual absl::Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
-      const string& send_key) = 0;
+      const std::string& send_key) = 0;
   virtual absl::Status AddSendFromRunCallableRequest(
       const RunCallableRequest& run_callable_request, size_t i,
-      const string& send_key) = 0;
+      const std::string& send_key) = 0;
 
-  virtual void add_recv_key(const string& recv_key) = 0;
+  virtual void add_recv_key(const std::string& recv_key) = 0;
   virtual void set_is_partial(bool is_partial) = 0;
   virtual void set_is_last_partial_run(bool is_last_partial_run) = 0;
   virtual void set_store_errors_in_response_body(bool store_errors) = 0;
@@ -332,16 +332,16 @@ class MutableRunGraphRequestWrapper : public RunGraphRequestWrapper {
 class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
-  const string& session_handle() const override;
-  const string& graph_handle() const override;
+  const std::string& session_handle() const override;
+  const std::string& graph_handle() const override;
   bool create_worker_session_called() const override;
   int64_t step_id() const override;
   const ExecutorOpts& exec_opts() const override;
   size_t num_sends() const override;
-  const string& send_key(size_t i) const override;
+  const std::string& send_key(size_t i) const override;
   absl::Status SendValue(size_t i, Tensor* out_tensor) const override;
   size_t num_recvs() const override;
-  const string& recv_key(size_t i) const override;
+  const std::string& recv_key(size_t i) const override;
   bool is_partial() const override;
   bool is_last_partial_run() const override;
   const RunGraphRequest& ToProto() const override;
@@ -349,31 +349,31 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
   int64_t request_id() const override;
 
   // MutableRunGraphRequestWrapper methods.
-  void set_session_handle(const string& handle) override;
+  void set_session_handle(const std::string& handle) override;
   void set_create_worker_session_called(bool called) override;
-  void set_graph_handle(const string& handle) override;
+  void set_graph_handle(const std::string& handle) override;
   void set_step_id(int64_t step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
   absl::Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
-      const string& send_key) override;
+      const std::string& send_key) override;
   absl::Status AddSendFromRunCallableRequest(
       const RunCallableRequest& run_callable_request, size_t i,
-      const string& send_key) override;
-  void add_recv_key(const string& recv_key) override;
+      const std::string& send_key) override;
+  void add_recv_key(const std::string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
   void set_store_errors_in_response_body(bool store_errors) override;
   void set_request_id(int64_t request_id) override;
 
  private:
-  string session_handle_;
+  std::string session_handle_;
   bool create_worker_session_called_ = false;
-  string graph_handle_;
+  std::string graph_handle_;
   int64_t step_id_;
   ExecutorOpts exec_opts_;
-  absl::InlinedVector<std::pair<string, Tensor>, 4UL> sends_;
-  absl::InlinedVector<string, 4UL> recvs_;
+  absl::InlinedVector<std::pair<std::string, Tensor>, 4UL> sends_;
+  absl::InlinedVector<std::string, 4UL> recvs_;
   bool is_partial_ = false;
   bool is_last_partial_run_ = false;
   bool store_errors_in_response_body_ = false;
@@ -392,16 +392,16 @@ class InMemoryRunGraphRequest : public MutableRunGraphRequestWrapper {
 class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
  public:
   // RunGraphRequestWrapper methods.
-  const string& session_handle() const override;
+  const std::string& session_handle() const override;
   bool create_worker_session_called() const override;
-  const string& graph_handle() const override;
+  const std::string& graph_handle() const override;
   int64_t step_id() const override;
   const ExecutorOpts& exec_opts() const override;
   size_t num_sends() const override;
-  const string& send_key(size_t i) const override;
+  const std::string& send_key(size_t i) const override;
   absl::Status SendValue(size_t i, Tensor* out_tensor) const override;
   size_t num_recvs() const override;
-  const string& recv_key(size_t i) const override;
+  const std::string& recv_key(size_t i) const override;
   bool is_partial() const override;
   bool is_last_partial_run() const override;
   bool store_errors_in_response_body() const override;
@@ -409,18 +409,18 @@ class MutableProtoRunGraphRequest : public MutableRunGraphRequestWrapper {
   const RunGraphRequest& ToProto() const override;
 
   // MutableRunGraphRequestWrapper methods.
-  void set_session_handle(const string& handle) override;
+  void set_session_handle(const std::string& handle) override;
   void set_create_worker_session_called(bool called) override;
-  void set_graph_handle(const string& handle) override;
+  void set_graph_handle(const std::string& handle) override;
   void set_step_id(int64_t step_id) override;
   ExecutorOpts* mutable_exec_opts() override;
   absl::Status AddSendFromRunStepRequest(
       const RunStepRequestWrapper& run_step_request, size_t i,
-      const string& send_key) override;
+      const std::string& send_key) override;
   absl::Status AddSendFromRunCallableRequest(
       const RunCallableRequest& run_callable_request, size_t i,
-      const string& send_key) override;
-  void add_recv_key(const string& recv_key) override;
+      const std::string& send_key) override;
+  void add_recv_key(const std::string& recv_key) override;
   void set_is_partial(bool is_partial) override;
   void set_is_last_partial_run(bool is_last_partial_run) override;
   void set_store_errors_in_response_body(bool store_errors) override;
@@ -435,16 +435,16 @@ class ProtoRunGraphRequest : public RunGraphRequestWrapper {
   ProtoRunGraphRequest(const RunGraphRequest* request);
 
   // RunGraphRequestWrapper methods.
-  const string& session_handle() const override;
+  const std::string& session_handle() const override;
   bool create_worker_session_called() const override;
-  const string& graph_handle() const override;
+  const std::string& graph_handle() const override;
   int64_t step_id() const override;
   const ExecutorOpts& exec_opts() const override;
   size_t num_sends() const override;
-  const string& send_key(size_t i) const override;
+  const std::string& send_key(size_t i) const override;
   absl::Status SendValue(size_t i, Tensor* out_tensor) const override;
   size_t num_recvs() const override;
-  const string& recv_key(size_t i) const override;
+  const std::string& recv_key(size_t i) const override;
   bool is_partial() const override;
   bool is_last_partial_run() const override;
   bool store_errors_in_response_body() const override;
@@ -480,12 +480,12 @@ class MutableRunGraphResponseWrapper {
   // A list of tensors corresponding to those requested by
   // `RunGraphRequest.recv_key`.
   virtual size_t num_recvs() const = 0;
-  virtual const string& recv_key(size_t i) const = 0;
+  virtual const std::string& recv_key(size_t i) const = 0;
   // NOTE: The following methods may perform a destructive read, for
   // efficiency.
   virtual absl::Status RecvValue(size_t i, TensorProto* out_tensor) = 0;
   virtual absl::Status RecvValue(size_t i, Tensor* out_tensor) = 0;
-  virtual void AddRecv(const string& key, const Tensor& value) = 0;
+  virtual void AddRecv(const std::string& key, const Tensor& value) = 0;
 
   // Submessages that store performance statistics about the subgraph
   // execution, if necessary.
@@ -520,10 +520,10 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper {
  public:
   // MutableRunGraphResponseWrapper methods.
   size_t num_recvs() const override;
-  const string& recv_key(size_t i) const override;
+  const std::string& recv_key(size_t i) const override;
   absl::Status RecvValue(size_t i, TensorProto* out_tensor) override;
   absl::Status RecvValue(size_t i, Tensor* out_tensor) override;
-  void AddRecv(const string& key, const Tensor& value) override;
+  void AddRecv(const std::string& key, const Tensor& value) override;
   StepStats* mutable_step_stats() override;
   CostGraphDef* mutable_cost_graph() override;
   size_t num_partition_graphs() const override;
@@ -539,7 +539,7 @@ class InMemoryRunGraphResponse : public MutableRunGraphResponseWrapper {
   RunGraphResponse* get_proto() override;
 
  private:
-  absl::InlinedVector<std::pair<string, Tensor>, 4UL> recvs_;
+  absl::InlinedVector<std::pair<std::string, Tensor>, 4UL> recvs_;
   StepStats step_stats_;
   CostGraphDef cost_graph_;
   std::vector<GraphDef> partition_graphs_;
@@ -553,10 +553,10 @@ class OwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper {
  public:
   // MutableRunGraphResponseWrapper methods.
   size_t num_recvs() const override;
-  const string& recv_key(size_t i) const override;
+  const std::string& recv_key(size_t i) const override;
   absl::Status RecvValue(size_t i, TensorProto* out_tensor) override;
   absl::Status RecvValue(size_t i, Tensor* out_tensor) override;
-  void AddRecv(const string& key, const Tensor& value) override;
+  void AddRecv(const std::string& key, const Tensor& value) override;
   StepStats* mutable_step_stats() override;
   CostGraphDef* mutable_cost_graph() override;
   size_t num_partition_graphs() const override;
@@ -580,10 +580,10 @@ class NonOwnedProtoRunGraphResponse : public MutableRunGraphResponseWrapper {
 
   // MutableRunGraphResponseWrapper methods.
   size_t num_recvs() const override;
-  const string& recv_key(size_t i) const override;
+  const std::string& recv_key(size_t i) const override;
   absl::Status RecvValue(size_t i, TensorProto* out_tensor) override;
   absl::Status RecvValue(size_t i, Tensor* out_tensor) override;
-  void AddRecv(const string& key, const Tensor& value) override;
+  void AddRecv(const std::string& key, const Tensor& value) override;
   StepStats* mutable_step_stats() override;
   CostGraphDef* mutable_cost_graph() override;
   size_t num_partition_graphs() const override;
@@ -628,14 +628,14 @@ class MutableRunStepResponseWrapper {
   // NOTE: The order of the returned tensors may or may not match
   // the fetch order specified in RunStepRequest.
   virtual size_t num_tensors() const = 0;
-  virtual const string& tensor_name(size_t i) const = 0;
+  virtual const std::string& tensor_name(size_t i) const = 0;
   virtual absl::Status TensorValue(size_t i, Tensor* out_tensor) const = 0;
 
   // Stores the i^{th} recv value in `run_graph_response` in this
   // response with the given `name`.
   virtual absl::Status AddTensorFromRunGraphResponse(
-      const string& name, MutableRunGraphResponseWrapper* run_graph_response,
-      size_t i) = 0;
+      const std::string& name,
+      MutableRunGraphResponseWrapper* run_graph_response, size_t i) = 0;
 
   // Returned metadata if requested in the options.
   virtual const RunMetadata& metadata() const = 0;
@@ -666,11 +666,11 @@ class InMemoryRunStepResponse : public MutableRunStepResponseWrapper {
  public:
   // MutableRunStepResponseWrapper methods.
   size_t num_tensors() const override;
-  const string& tensor_name(size_t i) const override;
+  const std::string& tensor_name(size_t i) const override;
   absl::Status TensorValue(size_t i, Tensor* out_tensor) const override;
   absl::Status AddTensorFromRunGraphResponse(
-      const string& name, MutableRunGraphResponseWrapper* run_graph_response,
-      size_t i) override;
+      const std::string& name,
+      MutableRunGraphResponseWrapper* run_graph_response, size_t i) override;
   const RunMetadata& metadata() const override;
   RunMetadata* mutable_metadata() override;
   absl::Status status() const override;
@@ -683,7 +683,7 @@ class InMemoryRunStepResponse : public MutableRunStepResponseWrapper {
   RunStepResponse* get_proto() override;
 
  private:
-  absl::InlinedVector<std::pair<string, Tensor>, 4UL> tensors_;
+  absl::InlinedVector<std::pair<std::string, Tensor>, 4UL> tensors_;
   RunMetadata metadata_;
   // Store the code and message separately so that they can be updated
   // independently by setters.
@@ -695,11 +695,11 @@ class OwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
  public:
   // MutableRunStepResponseWrapper methods.
   size_t num_tensors() const override;
-  const string& tensor_name(size_t i) const override;
+  const std::string& tensor_name(size_t i) const override;
   absl::Status TensorValue(size_t i, Tensor* out_tensor) const override;
   absl::Status AddTensorFromRunGraphResponse(
-      const string& name, MutableRunGraphResponseWrapper* run_graph_response,
-      size_t i) override;
+      const std::string& name,
+      MutableRunGraphResponseWrapper* run_graph_response, size_t i) override;
   const RunMetadata& metadata() const override;
   RunMetadata* mutable_metadata() override;
   absl::Status status() const override;
@@ -720,11 +720,11 @@ class NonOwnedProtoRunStepResponse : public MutableRunStepResponseWrapper {
 
   // MutableRunStepResponseWrapper methods.
   size_t num_tensors() const override;
-  const string& tensor_name(size_t i) const override;
+  const std::string& tensor_name(size_t i) const override;
   absl::Status TensorValue(size_t i, Tensor* out_tensor) const override;
   absl::Status AddTensorFromRunGraphResponse(
-      const string& name, MutableRunGraphResponseWrapper* run_graph_response,
-      size_t i) override;
+      const std::string& name,
+      MutableRunGraphResponseWrapper* run_graph_response, size_t i) override;
   const RunMetadata& metadata() const override;
   RunMetadata* mutable_metadata() override;
   absl::Status status() const override;
diff --git a/tensorflow/core/distributed_runtime/message_wrappers_test.cc b/tensorflow/core/distributed_runtime/message_wrappers_test.cc
index f64d476d6dde3e..9f0af827eee591 100644
--- a/tensorflow/core/distributed_runtime/message_wrappers_test.cc
+++ b/tensorflow/core/distributed_runtime/message_wrappers_test.cc
@@ -27,13 +27,13 @@ namespace {
 
 Tensor TensorA() {
   Tensor a_tensor(DT_INT32, TensorShape({2, 2}));
-  test::FillValues<int32>(&a_tensor, {3, 2, -1, 0});
+  test::FillValues<int32_t>(&a_tensor, {3, 2, -1, 0});
   return a_tensor;
 }
 
 Tensor TensorB() {
   Tensor b_tensor(DT_INT32, TensorShape({1, 2}));
-  test::FillValues<int32>(&b_tensor, {1, 2});
+  test::FillValues<int32_t>(&b_tensor, {1, 2});
   return b_tensor;
 }
 
@@ -57,9 +57,9 @@ void CheckRunStepRequest(const RunStepRequestWrapper& request) {
   EXPECT_EQ("feed_b:0", request.feed_name(1));
   Tensor val;
   TF_EXPECT_OK(request.FeedValue(0, &val));
-  test::ExpectTensorEqual<int32>(TensorA(), val);
+  test::ExpectTensorEqual<int32_t>(TensorA(), val);
   TF_EXPECT_OK(request.FeedValue(1, &val));
-  test::ExpectTensorEqual<int32>(TensorB(), val);
+  test::ExpectTensorEqual<int32_t>(TensorB(), val);
 
   EXPECT_EQ(2, request.num_fetches());
   EXPECT_EQ("fetch_x:0", request.fetch_name(0));
@@ -92,9 +92,9 @@ void CheckRunGraphRequest(const RunGraphRequestWrapper& request) {
   EXPECT_EQ(2, request.num_sends());
   Tensor val;
   TF_EXPECT_OK(request.SendValue(0, &val));
-  test::ExpectTensorEqual<int32>(TensorA(), val);
+  test::ExpectTensorEqual<int32_t>(TensorA(), val);
   TF_EXPECT_OK(request.SendValue(1, &val));
-  test::ExpectTensorEqual<int32>(TensorB(), val);
+  test::ExpectTensorEqual<int32_t>(TensorB(), val);
   EXPECT_TRUE(request.is_partial());
   EXPECT_FALSE(request.is_last_partial_run());
 }
@@ -117,9 +117,9 @@ void CheckRunGraphResponse(MutableRunGraphResponseWrapper* response) {
   EXPECT_EQ("recv_3", response->recv_key(1));
   Tensor val;
   TF_EXPECT_OK(response->RecvValue(0, &val));
-  test::ExpectTensorEqual<int32>(TensorA(), val);
+  test::ExpectTensorEqual<int32_t>(TensorA(), val);
   TF_EXPECT_OK(response->RecvValue(1, &val));
-  test::ExpectTensorEqual<int32>(TensorB(), val);
+  test::ExpectTensorEqual<int32_t>(TensorB(), val);
   ASSERT_EQ(1, response->mutable_step_stats()->dev_stats_size());
   EXPECT_EQ("/cpu:0", response->mutable_step_stats()->dev_stats(0).device());
   ASSERT_EQ(1, response->mutable_cost_graph()->node_size());
@@ -152,9 +152,9 @@ void CheckRunStepResponse(const MutableRunStepResponseWrapper& response) {
   EXPECT_EQ("fetch_y:0", response.tensor_name(1));
   Tensor val;
   TF_EXPECT_OK(response.TensorValue(0, &val));
-  test::ExpectTensorEqual<int32>(TensorA(), val);
+  test::ExpectTensorEqual<int32_t>(TensorA(), val);
   TF_EXPECT_OK(response.TensorValue(1, &val));
-  test::ExpectTensorEqual<int32>(TensorB(), val);
+  test::ExpectTensorEqual<int32_t>(TensorB(), val);
   ASSERT_EQ(1, response.metadata().step_stats().dev_stats_size());
   EXPECT_EQ("/cpu:0", response.metadata().step_stats().dev_stats(0).device());
   ASSERT_EQ(1, response.metadata().partition_graphs_size());
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.cc b/tensorflow/core/distributed_runtime/recent_request_ids.cc
index f75390b26bd338..f98da9aa19629e 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.cc
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.cc
@@ -61,7 +61,7 @@ bool RecentRequestIds::Insert(int64_t request_id) {
 }
 
 absl::Status RecentRequestIds::TrackUnique(int64_t request_id,
-                                           const string& method_name,
+                                           const std::string& method_name,
                                            const protobuf::Message& request) {
   if (Insert(request_id)) {
     return absl::OkStatus();
diff --git a/tensorflow/core/distributed_runtime/recent_request_ids.h b/tensorflow/core/distributed_runtime/recent_request_ids.h
index 2eb35ac7266c6c..0299d3d9289118 100644
--- a/tensorflow/core/distributed_runtime/recent_request_ids.h
+++ b/tensorflow/core/distributed_runtime/recent_request_ids.h
@@ -60,11 +60,11 @@ class RecentRequestIds {
   // num_tracked_request_ids insertions. For backwards compatibility, this
   // always returns OK for request_id 0. The method_name and the request's
   // ShortDebugString are added to returned errors.
-  absl::Status TrackUnique(int64_t request_id, const string& method_name,
+  absl::Status TrackUnique(int64_t request_id, const std::string& method_name,
                            const protobuf::Message& request);
   // Overloaded version of the above function for wrapped protos.
   template <typename RequestWrapper>
-  absl::Status TrackUnique(int64_t request_id, const string& method_name,
+  absl::Status TrackUnique(int64_t request_id, const std::string& method_name,
                            const RequestWrapper* wrapper);
 
  private:
@@ -88,7 +88,7 @@ class RecentRequestIds {
 
 template <typename RequestWrapper>
 absl::Status RecentRequestIds::TrackUnique(int64_t request_id,
-                                           const string& method_name,
+                                           const std::string& method_name,
                                            const RequestWrapper* wrapper) {
   if (Insert(request_id)) {
     return absl::OkStatus();
diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc
index ad8ac2080ab833..5bcf27d54abd1c 100644
--- a/tensorflow/core/distributed_runtime/remote_device.cc
+++ b/tensorflow/core/distributed_runtime/remote_device.cc
@@ -53,7 +53,7 @@ class RemoteDevice : public Device {
   bool IsRemoteCallAllowed() const override { return true; }
 
  private:
-  const string local_dev_name_;
+  const std::string local_dev_name_;
 
   RemoteDevice(const RemoteDevice&) = delete;
   void operator=(const RemoteDevice&) = delete;
@@ -78,7 +78,8 @@ void AsRemoteDevices(
 }
 
 void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache,
-                      const string& worker_name, NewRemoteDevicesDone done) {
+                      const std::string& worker_name,
+                      NewRemoteDevicesDone done) {
   WorkerInterface* wi = worker_cache->GetOrCreateWorker(worker_name);
   if (wi == nullptr) {
     std::vector<Device*> empty;
diff --git a/tensorflow/core/distributed_runtime/remote_device.h b/tensorflow/core/distributed_runtime/remote_device.h
index 591531f94d567f..806123ed71b205 100644
--- a/tensorflow/core/distributed_runtime/remote_device.h
+++ b/tensorflow/core/distributed_runtime/remote_device.h
@@ -62,7 +62,8 @@ void AsRemoteDevices(
 typedef std::function<void(const absl::Status&, std::vector<Device*>*)>
     NewRemoteDevicesDone;
 void NewRemoteDevices(Env* env, WorkerCacheInterface* worker_cache,
-                      const string& worker_name, NewRemoteDevicesDone done);
+                      const std::string& worker_name,
+                      NewRemoteDevicesDone done);
 
 // Create Remote Device based on the given attributes.
 std::unique_ptr<Device> NewRemoteDevice(Env* env,

From efddb34c27796bfefda22ddaaceb66e2c6091d2d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:19:16 -0800
Subject: [PATCH 398/753] Automated Code Change

PiperOrigin-RevId: 845648979
---
 .../common_runtime/gpu/gpu_debug_allocator.cc | 15 +++++++------
 .../core/common_runtime/gpu/gpu_util.cc       | 21 ++++++++++---------
 tensorflow/core/common_runtime/gpu/gpu_util.h | 13 ++++++------
 3 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
index 0a2d98e840a849..1bebcc31d45c76 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator.cc
@@ -44,7 +44,8 @@ int64_t* before_mask = NewMask(0xabababababababab);
 int64_t* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
 
 bool CheckMask(se::StreamExecutor* exec, void* ptr, int64_t* mask) {
-  se::DeviceMemory<int64_t> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
+  stream_executor::DeviceAddress<int64_t> gpu_ptr{
+      stream_executor::DeviceAddressBase{ptr, MASK_BYTES}};
   int64_t tmp[MASK_WORDS];
 
   absl::Status result = exec->SynchronousMemcpyD2H(gpu_ptr, MASK_BYTES, tmp);
@@ -66,7 +67,8 @@ bool CheckMask(se::StreamExecutor* exec, void* ptr, int64_t* mask) {
 }
 
 void InitMask(se::StreamExecutor* exec, void* ptr, int64_t* mask) {
-  se::DeviceMemory<int64_t> gpu_ptr{se::DeviceMemoryBase{ptr, MASK_BYTES}};
+  stream_executor::DeviceAddress<int64_t> gpu_ptr{
+      stream_executor::DeviceAddressBase{ptr, MASK_BYTES}};
   absl::Status result = exec->SynchronousMemcpyH2D(mask, MASK_BYTES, &gpu_ptr);
   if (!result.ok()) {
     LOG(FATAL) << "Could not copy debug mask, " << result;
@@ -175,8 +177,9 @@ void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
   size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
   std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                           std::nanf(""));
-  se::DeviceMemory<float> nan_ptr{
-      se::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};
+  stream_executor::DeviceAddress<float> nan_ptr{
+      stream_executor::DeviceAddressBase{static_cast<float*>(allocated_ptr),
+                                         req_size}};
 
   absl::Status result =
       stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
@@ -192,8 +195,8 @@ void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
     size_t req_size = base_allocator_->RequestedSize(ptr);
     std::vector<float> nans((req_size + sizeof(float) - 1) / sizeof(float),
                             std::nanf(""));
-    se::DeviceMemory<float> nan_ptr{
-        se::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
+    stream_executor::DeviceAddress<float> nan_ptr{
+        stream_executor::DeviceAddressBase{static_cast<float*>(ptr), req_size}};
     absl::Status result =
         stream_exec_->SynchronousMemcpyH2D(&nans[0], req_size, &nan_ptr);
     if (!result.ok()) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.cc b/tensorflow/core/common_runtime/gpu/gpu_util.cc
index 4d192d8af9fab4..6fb3a800d0ab60 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.cc
@@ -548,34 +548,35 @@ absl::Status GPUUtil::SyncAll(Device* gpu_device) {
   return absl::OkStatus();
 }
 
-string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
-  string ret;
+std::string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
+  std::string ret;
   CHECK(tensor);
   const int64_t num_bytes = std::min<int64_t>(
       FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
   void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr;
-  strings::Appendf(&ret, "%p:", ptr);
+  void* arg1 = ptr;
+  absl::StrAppendFormat(&ret, "%p:", arg1);
   if (num_bytes > 0) {
     auto* dev_info = device->tensorflow_accelerator_device_info();
     if (!dev_info) {
-      strings::StrAppend(
+      absl::StrAppend(
           &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes));
     } else {
-      string buf;
+      std::string buf;
       buf.resize(num_bytes);
       DeviceMemoryBase gpu_ptr(ptr, num_bytes);
       auto s = dev_info->stream->parent()->SynchronousMemcpyD2H(
           gpu_ptr, num_bytes, &*buf.begin());
-      strings::StrAppend(&ret, PrintMemory(&*buf.begin(), num_bytes));
+      absl::StrAppend(&ret, PrintMemory(&*buf.begin(), num_bytes));
     }
   }
   return ret;
 }
 
 // TODO(pbar) Checksum is called from places without a valid device context.
-uint64 GPUUtil::Checksum(Device* gpu_device,
-                         const DeviceContext* device_context,
-                         const Tensor& tensor) {
+uint64_t GPUUtil::Checksum(Device* gpu_device,
+                           const DeviceContext* device_context,
+                           const Tensor& tensor) {
   Tensor copy(tensor.dtype(), tensor.shape());
   absl::Status s;
   absl::Notification n;
@@ -589,7 +590,7 @@ uint64 GPUUtil::Checksum(Device* gpu_device,
   return Checksum(copy);
 }
 
-uint64 GPUUtil::Checksum(const Tensor& tensor) {
+uint64_t GPUUtil::Checksum(const Tensor& tensor) {
   const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor));
   size_t num_bytes = tensor.TotalBytes();
   size_t num_floats = num_bytes / sizeof(float);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_util.h b/tensorflow/core/common_runtime/gpu/gpu_util.h
index 0b650ad9804343..6675aa3802c081 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_util.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_util.h
@@ -62,7 +62,7 @@ class GPUUtil {
   // For debugging purpose, given a "device" and a "tensor" allocated
   // on the device, return a string printing each byte in the tensor
   // (up to a limit).  "device" can be either a CPU or a GPU device.
-  static string MemoryDebugString(const Device* device, Tensor* tensor);
+  static std::string MemoryDebugString(const Device* device, Tensor* tensor);
 
   // Map a Tensor as a DeviceMemory object wrapping the given typed
   // buffer.
@@ -72,18 +72,19 @@ class GPUUtil {
   template <typename T>
   static se::DeviceMemory<T> AsDeviceMemory(const Tensor& t) {
     T* ptr = reinterpret_cast<T*>(const_cast<void*>(DMAHelper::base(&t)));
-    return se::DeviceMemory<T>(se::DeviceMemoryBase(ptr, t.TotalBytes()));
+    return se::DeviceMemory<T>(
+        stream_executor::DeviceAddressBase(ptr, t.TotalBytes()));
   }
 
   // Computes a checksum over the contents of "tensor", which is allocated
   // on "gpu_device".
-  static uint64 Checksum(Device* gpu_device,
-                         const DeviceContext* device_context,
-                         const Tensor& tensor);
+  static uint64_t Checksum(Device* gpu_device,
+                           const DeviceContext* device_context,
+                           const Tensor& tensor);
 
   // Computes a checksum over the contents of "tensor", which is allocated
   // in local CPU RAM.
-  static uint64 Checksum(const Tensor& tensor);
+  static uint64_t Checksum(const Tensor& tensor);
 
   static void CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                  const DeviceContext* device_context,

From db8b9de813d3c1fb18f498532cc1ce1371339ab7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:19:16 -0800
Subject: [PATCH 399/753] Automated Code Change

PiperOrigin-RevId: 845648984
---
 .../core/kernels/data/batch_dataset_op.cc     | 10 +++---
 .../core/kernels/data/cache_dataset_ops.cc    | 34 +++++++++----------
 .../kernels/data/cache_dataset_ops_test.cc    | 19 ++++++-----
 tensorflow/core/kernels/data/cache_ops.cc     |  2 +-
 tensorflow/core/kernels/data/cache_ops.h      |  4 +--
 .../kernels/data/concatenate_dataset_op.cc    |  6 ++--
 tensorflow/core/kernels/data/dataset_ops.cc   |  4 +--
 7 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 62122f5d50987f..1813f7e9e02005 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -73,7 +73,7 @@ class BatchDatasetOp::Dataset : public DatasetBase {
         op_version_(op_version),
         traceme_metadata_(
             {{"batch_size",
-              strings::Printf("%lld", static_cast<long long>(batch_size))},
+              absl::StrFormat("%lld", static_cast<long long>(batch_size))},
              {"drop_remainder", drop_remainder ? "true" : "false"},
              {"parallel_copy", parallel_copy ? "true" : "false"}}) {
     input_->Ref();
@@ -106,7 +106,7 @@ class BatchDatasetOp::Dataset : public DatasetBase {
   ~Dataset() override { input_->Unref(); }
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
+      const std::string& prefix) const override {
     name_utils::IteratorPrefixParams params;
     params.op_version = op_version_;
     return std::make_unique<Iterator>(Iterator::Params{
@@ -121,7 +121,7 @@ class BatchDatasetOp::Dataset : public DatasetBase {
     return output_shapes_;
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     name_utils::DatasetDebugStringParams params;
     params.op_version = op_version_;
     params.set_args(batch_size_);
@@ -146,9 +146,9 @@ class BatchDatasetOp::Dataset : public DatasetBase {
     return input_->CheckExternalState();
   }
 
-  absl::Status Get(OpKernelContext* ctx, int64 index,
+  absl::Status Get(OpKernelContext* ctx, int64_t index,
                    std::vector<Tensor>* out_tensors) const override {
-    const int64 cardinality = Cardinality();
+    const int64_t cardinality = Cardinality();
     if (index < 0 || index >= cardinality) {
       return errors::OutOfRange("Index out of range [0, ", cardinality,
                                 "):", index);
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index 1cc826e8c17b3d..ad5ba2464ce9c3 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -86,7 +86,7 @@ class DatasetRandomAccessCache {
 
   // Extends the temporary cache up to a given index and then updates
   // out_tensors with the element at that index.
-  absl::Status Get(OpKernelContext* ctx, int64 index,
+  absl::Status Get(OpKernelContext* ctx, int64_t index,
                    std::vector<Tensor>* out_tensors) {
     if (!iter_resource_) {
       TF_ASSIGN_OR_RETURN(iter_resource_,
@@ -104,7 +104,7 @@ class DatasetRandomAccessCache {
   std::vector<std::vector<Tensor>> GetCacheData() { return cache_; }
 
  private:
-  absl::Status ExtendTempCacheToIndex(int64 index, OpKernelContext* ctx) {
+  absl::Status ExtendTempCacheToIndex(int64_t index, OpKernelContext* ctx) {
     bool end_of_sequence;
     while (cache_.size() <= index) {
       std::vector<Tensor> out_tensors;
@@ -169,7 +169,7 @@ class IteratorRandomAccessCache {
 class CacheDatasetOp::FileDatasetBase : public DatasetBase {
  public:
   FileDatasetBase(OpKernelContext* ctx, const DatasetBase* input,
-                  string filename, Env* env)
+                  std::string filename, Env* env)
       : DatasetBase(DatasetContext(ctx)),
         input_(input),
         filename_(std::move(filename)),
@@ -184,7 +184,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase {
   ~FileDatasetBase() override { input_->Unref(); }
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
+      const std::string& prefix) const override {
     name_utils::IteratorPrefixParams params;
     params.dataset_prefix = kFileDatasetPrefix;
     return std::make_unique<FileIterator>(FileIterator::Params{
@@ -199,7 +199,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase {
     return input_->output_shapes();
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     name_utils::DatasetDebugStringParams params;
     params.dataset_prefix = kFileDatasetPrefix;
     return name_utils::DatasetDebugString(kDatasetType, params);
@@ -225,7 +225,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase {
 
  private:
   static size_t StringPaddingSize(size_t num_tensors) {
-    return strings::Printf(kPaddingSizeStrFormat, num_tensors - 1).size();
+    return absl::StrFormat(kPaddingSizeStrFormat, num_tensors - 1).size();
   }
 
   std::string FormatName(size_t item_index, size_t tensor_index) const {
@@ -328,14 +328,14 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase {
       ~FileWriterIterator() override {
         if (!dataset()->env_->FileExists(MetaFilename(filename_)).ok()) {
           LOG(WARNING) << kIncompleteCacheErrorMessage;
-          std::vector<string> cache_files;
+          std::vector<std::string> cache_files;
           absl::Status s = dataset()->env_->GetMatchingPaths(
               absl::StrCat(filename_, "*"), &cache_files);
           if (!s.ok()) {
             LOG(WARNING) << "Failed to get matching files on " << filename_
                          << "* : " << s.ToString();
           }
-          for (const string& path : cache_files) {
+          for (const std::string& path : cache_files) {
             s = dataset()->env_->DeleteFile(path);
             if (!s.ok()) {
               LOG(WARNING) << "Failed to delete " << path << " : "
@@ -387,7 +387,7 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase {
         size_t tensor_index = 0;
         for (const Tensor& t : *out_tensors) {
           DCHECK_LT(tensor_index, dataset()->num_tensors_);
-          string key = dataset()->FormatName(cur_index_, tensor_index++);
+          std::string key = dataset()->FormatName(cur_index_, tensor_index++);
           TF_RETURN_IF_ERROR(writer_->Add(key, t));
         }
         if (*end_of_sequence) {
@@ -576,9 +576,9 @@ class CacheDatasetOp::FileDatasetBase : public DatasetBase {
       std::unique_ptr<IteratorBase> input_impl_ TF_GUARDED_BY(mu_);
       // The current prefix for the cache file. This is equal to
       // `StrCat(dataset()->filename_, "_", shard_id_)`.
-      string filename_;
+      std::string filename_;
       std::unique_ptr<BundleWriter> writer_ TF_GUARDED_BY(mu_);
-      string lockfile_ TF_GUARDED_BY(mu_);
+      std::string lockfile_ TF_GUARDED_BY(mu_);
       bool lockfile_created_ TF_GUARDED_BY(mu_);
       bool iteration_completed_ TF_GUARDED_BY(mu_);
     };  // FileWriterIterator
@@ -730,7 +730,7 @@ class CacheDatasetOp::FileDataset : public CacheDatasetOp::FileDatasetBase {
 class CacheDatasetOp::FileDatasetV2 : public CacheDatasetOp::FileDatasetBase {
  public:
   explicit FileDatasetV2(OpKernelContext* ctx, const DatasetBase* input,
-                         string filename, Env* env,
+                         std::string filename, Env* env,
                          const Tensor& resource_handle)
       : FileDatasetBase(ctx, input, filename, env),
         resource_handle_(resource_handle) {}
@@ -768,7 +768,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase {
   ~MemoryDatasetBase() override { input_->Unref(); }
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
+      const std::string& prefix) const override {
     name_utils::IteratorPrefixParams params;
     params.dataset_prefix = kMemoryDatasetPrefix;
     return std::make_unique<MemoryIterator>(
@@ -785,7 +785,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase {
     return input_->output_shapes();
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     name_utils::DatasetDebugStringParams params;
     params.dataset_prefix = kMemoryDatasetPrefix;
     return name_utils::DatasetDebugString(kDatasetType, params);
@@ -795,7 +795,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase {
     return input_->Cardinality(options);
   };
 
-  absl::Status Get(OpKernelContext* ctx, int64 index,
+  absl::Status Get(OpKernelContext* ctx, int64_t index,
                    std::vector<Tensor>* out_tensors) const override {
     mutex_lock l(mu_);
 
@@ -815,7 +815,7 @@ class CacheDatasetOp::MemoryDatasetBase : public DatasetBase {
     return dataset_random_access_cache_->Get(ctx, index, out_tensors);
   }
 
-  absl::Status Get(AnyContext ctx, int64 index,
+  absl::Status Get(AnyContext ctx, int64_t index,
                    std::vector<Tensor>* out_tensors) const override {
     mutex_lock l(mu_);
     if (!iterator_random_access_cache_) {
@@ -1182,7 +1182,7 @@ void CacheDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
   OP_REQUIRES_OK(ctx, ParseScalarArgument<tstring>(ctx, kFileName, &filename));
   if (filename.empty()) {
     static std::atomic<int64_t> resource_id_counter(0);
-    const string& container = ctx->resource_manager()->default_container();
+    const std::string& container = ctx->resource_manager()->default_container();
     auto name = strings::StrCat(ctx->op_kernel().name(), "/", kMemoryCache, "_",
                                 resource_id_counter.fetch_add(1));
     if (op_version_ == 2) {
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc
index 2ccf09149c4c34..ec4067c15110c3 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc
@@ -32,10 +32,10 @@ constexpr char kMemoryDatasetPrefix[] = "Memory";
 class CacheDatasetParams : public DatasetParams {
  public:
   template <typename T>
-  CacheDatasetParams(T input_dataset_params, string filename,
+  CacheDatasetParams(T input_dataset_params, std::string filename,
                      DataTypeVector output_dtypes,
                      std::vector<PartialTensorShape> output_shapes,
-                     string node_name)
+                     std::string node_name)
       : DatasetParams(std::move(output_dtypes), std::move(output_shapes),
                       std::move(node_name)),
         filename_(filename) {
@@ -51,7 +51,8 @@ class CacheDatasetParams : public DatasetParams {
     return {filename_tensor};
   }
 
-  absl::Status GetInputNames(std::vector<string>* input_names) const override {
+  absl::Status GetInputNames(
+      std::vector<std::string>* input_names) const override {
     *input_names = {CacheDatasetOp::kInputDataset, CacheDatasetOp::kFileName};
     return absl::OkStatus();
   }
@@ -63,12 +64,14 @@ class CacheDatasetParams : public DatasetParams {
     return absl::OkStatus();
   }
 
-  string dataset_type() const override { return CacheDatasetOp::kDatasetType; }
+  std::string dataset_type() const override {
+    return CacheDatasetOp::kDatasetType;
+  }
 
-  string filename() const { return filename_; }
+  std::string filename() const { return filename_; }
 
  private:
-  string filename_;
+  std::string filename_;
 };
 
 class CacheDatasetOpTest : public DatasetOpsTestBase {
@@ -82,14 +85,14 @@ class CacheDatasetOpTest : public DatasetOpsTestBase {
 
   ~CacheDatasetOpTest() override {
     if (!cache_filename_.empty()) {
-      std::vector<string> cache_files;
+      std::vector<std::string> cache_files;
       absl::Status s = device_->env()->GetMatchingPaths(
           absl::StrCat(cache_filename_, "*"), &cache_files);
       if (!s.ok()) {
         LOG(WARNING) << "Failed to get matching files on " << cache_filename_
                      << "* : " << s;
       }
-      for (const string& path : cache_files) {
+      for (const std::string& path : cache_files) {
         s = device_->env()->DeleteFile(path);
         if (!s.ok()) {
           LOG(WARNING) << "Failed to delete " << path << " : " << s;
diff --git a/tensorflow/core/kernels/data/cache_ops.cc b/tensorflow/core/kernels/data/cache_ops.cc
index 0dce7f73215f92..0338ca1b3fcfc8 100644
--- a/tensorflow/core/kernels/data/cache_ops.cc
+++ b/tensorflow/core/kernels/data/cache_ops.cc
@@ -80,7 +80,7 @@ AnonymousMemoryCacheHandleOp::AnonymousMemoryCacheHandleOp(
                                               /* ref_counting */ true,
                                               /* return_deleter */ true) {}
 
-string AnonymousMemoryCacheHandleOp::name() { return kMemoryCache; }
+std::string AnonymousMemoryCacheHandleOp::name() { return kMemoryCache; }
 
 absl::Status AnonymousMemoryCacheHandleOp::CreateResource(
     OpKernelContext* ctx, std::unique_ptr<FunctionLibraryDefinition> flib_def,
diff --git a/tensorflow/core/kernels/data/cache_ops.h b/tensorflow/core/kernels/data/cache_ops.h
index e1e58ae9c1df89..f91f261ea79bec 100644
--- a/tensorflow/core/kernels/data/cache_ops.h
+++ b/tensorflow/core/kernels/data/cache_ops.h
@@ -62,7 +62,7 @@ class MemoryCacheManager : public ResourceBase {
  public:
   MemoryCacheManager() : cache_(std::make_shared<MemoryCache>()) {}
 
-  string DebugString() const override;
+  std::string DebugString() const override;
 
   std::shared_ptr<MemoryCache> get() { return cache_; }
 
@@ -77,7 +77,7 @@ class AnonymousMemoryCacheHandleOp
   explicit AnonymousMemoryCacheHandleOp(OpKernelConstruction* ctx);
 
  private:
-  string name() override;
+  std::string name() override;
   absl::Status CreateResource(
       OpKernelContext* ctx, std::unique_ptr<FunctionLibraryDefinition> flib_def,
       std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 6d4bfc88504a7e..d9fed39b07ba88 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -105,7 +105,7 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase {
   }
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
+      const std::string& prefix) const override {
     return std::make_unique<Iterator>(Iterator::Params{
         this, name_utils::IteratorPrefix(kDatasetType, prefix)});
   }
@@ -124,7 +124,7 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase {
     return output_shapes_;
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     return name_utils::DatasetDebugString(kDatasetType);
   }
 
@@ -155,7 +155,7 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase {
     return to_concatenate_->CheckExternalState();
   }
 
-  absl::Status Get(OpKernelContext* ctx, int64 index,
+  absl::Status Get(OpKernelContext* ctx, int64_t index,
                    std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
     if (index < input_cardinality_) {
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
index b3c114ce833a08..cafd1d4880b379 100644
--- a/tensorflow/core/kernels/data/dataset_ops.cc
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -120,7 +120,7 @@ void DatasetToGraphOp::Compute(OpKernelContext* ctx) {
 DatasetCardinalityOp::DatasetCardinalityOp(OpKernelConstruction* ctx)
     : OpKernel(ctx), cardinality_options_(new CardinalityOptions) {
   if (ctx->HasAttr(kCardinalityOptions)) {
-    string options_serialized;
+    std::string options_serialized;
     OP_REQUIRES_OK(ctx, ctx->GetAttr(kCardinalityOptions, &options_serialized));
     if (!options_serialized.empty())
       cardinality_options_->ParseFromString(options_serialized);
@@ -141,7 +141,7 @@ void DatasetFromGraphOp::Compute(OpKernelContext* ctx) {
   GraphDef graph_def;
   OP_REQUIRES(ctx, graph_def.ParseFromString(graph_def_string),
               errors::InvalidArgument("Could not parse GraphDef"));
-  string output_node;
+  std::string output_node;
   for (const auto& node : graph_def.node()) {
     if (node.op() == FunctionLibraryDefinition::kRetOp) {
       output_node = node.input(0);

From 17428e9c7986e3b98c0d1cd094e6a2b40f546c8a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:23:11 -0800
Subject: [PATCH 400/753] Automated Code Change

PiperOrigin-RevId: 845650168
---
 .../core/common_runtime/eager/attr_builder.cc |  25 ++--
 .../core/common_runtime/eager/attr_builder.h  |  18 +--
 .../common_runtime/eager/attr_builder_test.cc |   4 +-
 .../core/common_runtime/eager/context.cc      | 119 +++++++++---------
 .../core/common_runtime/eager/context.h       |  90 ++++++-------
 .../eager/context_distributed_manager.cc      | 116 ++++++++---------
 .../core/common_runtime/eager/context_test.cc |  10 +-
 .../eager/copy_to_device_node.h               |   4 +-
 .../core/common_runtime/eager/custom_device.h |   5 +-
 .../eager/custom_device_op_handler.cc         |   4 +-
 .../eager/custom_device_op_handler.h          |   7 +-
 .../common_runtime/eager/eager_executor.cc    |   6 +-
 .../common_runtime/eager/eager_executor.h     |  15 +--
 .../eager/eager_executor_test.cc              |   4 +-
 .../eager/eager_op_rewrite_registry.h         |   6 +-
 .../eager/eager_op_rewrite_registry_test.cc   |   2 +-
 .../common_runtime/eager/eager_operation.cc   |   8 +-
 .../common_runtime/eager/eager_operation.h    |  12 +-
 .../eager/eager_operation_test.cc             |   2 +-
 .../core/common_runtime/eager/execute.cc      |  93 +++++++-------
 .../core/common_runtime/eager/execute_node.cc |   2 +-
 .../core/common_runtime/eager/execute_test.cc |  16 +--
 .../common_runtime/eager/kernel_and_device.h  |  20 +--
 .../common_runtime/eager/placement_utils.cc   |  25 ++--
 .../eager/placement_utils_test.cc             |   4 +-
 .../common_runtime/eager/tensor_handle.cc     |  40 +++---
 .../core/common_runtime/eager/tensor_handle.h |  45 ++++---
 .../eager/tensor_handle_data.cc               |   2 +-
 .../common_runtime/eager/tensor_handle_data.h |   2 +-
 .../eager/tensor_handle_test.cc               |  14 +--
 30 files changed, 371 insertions(+), 349 deletions(-)

diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index 9852cce5ee3413..e7700d1076c132 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -35,13 +35,14 @@ namespace {
 
 mutex g_op_name_to_attr_type_map_lock(LINKER_INITIALIZED);
 
-tensorflow::gtl::FlatMap<string, const AttrTypeMap*>* OpNameToAttrTypeMap() {
+tensorflow::gtl::FlatMap<std::string, const AttrTypeMap*>*
+OpNameToAttrTypeMap() {
   static auto* const m =
-      new tensorflow::gtl::FlatMap<string, const AttrTypeMap*>;
+      new tensorflow::gtl::FlatMap<std::string, const AttrTypeMap*>;
   return m;
 }
 
-const uint32 kIsList = 1U << 31;
+const uint32_t kIsList = 1U << 31;
 
 AttrTypeMap* DefaultFunctionAttrTypeMap() {
   AttrTypeMap* map = new AttrTypeMap();
@@ -57,7 +58,7 @@ const AttrTypeMap* GetDefaultFunctionAttrTypeMap() {
 
 }  // namespace
 
-absl::Status OpDefForOp(const string& op_name, const OpDef** op_def) {
+absl::Status OpDefForOp(const std::string& op_name, const OpDef** op_def) {
   const OpRegistrationData* op_reg_data = nullptr;
   absl::Status s = OpRegistry::Global()->LookUp(op_name, &op_reg_data);
   if (s.ok()) {
@@ -102,12 +103,12 @@ absl::Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
   // TODO(agarwal): Avoid having to create this "registry" at runtime,
   // perhaps can be done at op registration time?
   for (const auto& attr : op_def->attr()) {
-    string type = attr.type();
+    std::string type = attr.type();
     const bool is_list = (type.length() > 6 && type.compare(0, 4, "list") == 0);
     if (is_list) {
       type = type.substr(5, type.length() - 6);
     }
-    uint32 t = is_list ? kIsList : 0;
+    uint32_t t = is_list ? kIsList : 0;
     if (type == "string") {
       t |= TF_ATTR_STRING;
     } else if (type == "int") {
@@ -163,7 +164,7 @@ DEFINE_GET_ATTR(tensorflow::DataType, type, "type");
 template <>
 absl::Status AttrBuilder::Get(absl::string_view attr_name,
                               absl::InlinedVector<DataType, 4>* value) const {
-  auto it = encoded_attrs_.find(string(attr_name));
+  auto it = encoded_attrs_.find(std::string(attr_name));
   if (it == encoded_attrs_.end()) {
     return errors::NotFound("No attr named '", attr_name,
                             "' found in AttrBuilder for ", op_name_);
@@ -207,7 +208,7 @@ void AttrBuilder::FillAttrValueMap(AttrValueMap* m) const {
 
 namespace {
 
-bool ValueMatchesDefault(const OpDef* op_def, const string& attr_name,
+bool ValueMatchesDefault(const OpDef* op_def, const std::string& attr_name,
                          const AttrValue& attr_value) {
   // TODO(iga): It might make sense to augment OpRegistrationData with a
   // {attr_name -> default_attr_value} FlatMap to avoid the loop here.
@@ -238,7 +239,7 @@ void AttrBuilder::FillAttrValueMapWithoutDefaults(AttrValueMap* m) const {
 
 void AttrBuilder::AddAttrIfNotPresent(absl::string_view attr_name,
                                       const AttrValue& value) {
-  encoded_attrs_.emplace(string(attr_name), value.SerializeAsString());
+  encoded_attrs_.emplace(std::string(attr_name), value.SerializeAsString());
 }
 
 const NodeDef& AttrBuilder::BuildNodeDef() {
@@ -260,7 +261,7 @@ void AttrBuilder::CopyAttributes(const AttrBuilder& other) {
                         other.encoded_attrs_.end());
 }
 
-absl::Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
+absl::Status AttrTypeByName(const AttrTypeMap& m, const std::string& attr_name,
                             TF_AttrType* out, unsigned char* is_list) {
   auto* t = gtl::FindOrNull(m, attr_name);
   if (t == nullptr) {
@@ -290,7 +291,7 @@ inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s,
   return FingerprintCat128(a, b);
 }
 
-inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, uint64 b) {
+inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, uint64_t b) {
   return CacheKeyHelper(s, {b, b});
 }
 
@@ -299,7 +300,7 @@ inline tensorflow::Fprint128 CacheKeyHelper(absl::string_view s, uint64 b) {
 tensorflow::Fprint128 AttrBuilder::CacheKey(const absl::string_view device) {
   if (!cached_cache_key_ || device != device_for_cached_cache_key_) {
     cached_cache_key_ = BuildCacheKeyForDevice(device);
-    device_for_cached_cache_key_ = string(device);
+    device_for_cached_cache_key_ = std::string(device);
   }
 
   return *cached_cache_key_;
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index 9dc480d8c8187a..bdd644a6331ca6 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -40,10 +40,10 @@ namespace tensorflow {
 // If the type is not a list type, the value is the same as the TF_AttrType type
 // of the value. Else, the highest order bit is on, and the rest of the bits
 // represent the TF_AttrType type of the values in the list.
-typedef std::unordered_map<string, uint32> AttrTypeMap;
+typedef std::unordered_map<std::string, uint32_t> AttrTypeMap;
 
 // Look up OpDef for `op_name`.
-absl::Status OpDefForOp(const string& op_name, const OpDef** op_def);
+absl::Status OpDefForOp(const std::string& op_name, const OpDef** op_def);
 
 // Returns the AttrTypeMap for the TensorFlow operation named op_name.
 // If op_name is not registered in global op registry, AttrTypeMapForOp assumes
@@ -53,7 +53,7 @@ absl::Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
                               bool* is_function);
 
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
-absl::Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
+absl::Status AttrTypeByName(const AttrTypeMap& m, const std::string& attr_name,
                             TF_AttrType* out, unsigned char* is_list);
 
 // KernelAndDevice::Init needs a NodeDef only to pass the attribute map through.
@@ -111,8 +111,8 @@ class AttrBuilder : public AbstractOpAttrs {
     device_for_cached_cache_key_.clear();
   }
 
-  const string& op_name() const { return op_name_; }
-  void set_op_name(const string& name) { op_name_ = name; }
+  const std::string& op_name() const { return op_name_; }
+  void set_op_name(const std::string& name) { op_name_ = name; }
 
   // Needed to work around call to ValidateNodeDef in CreateOpKernel.
   AttrBuilder& NumInputs(int n);
@@ -186,7 +186,7 @@ class AttrBuilder : public AbstractOpAttrs {
   tensorflow::Fprint128 BuildCacheKeyForDevice(absl::string_view device) const;
 
   template <class T>
-  void SetInAttrValueMap(AttrValueMap* m, const string& attr_name,
+  void SetInAttrValueMap(AttrValueMap* m, const std::string& attr_name,
                          T&& value) const {
     DCHECK(!node_def_finalized_)
         << "Calling SetInAttrValueMap after BuildNodeDef.";
@@ -196,17 +196,17 @@ class AttrBuilder : public AbstractOpAttrs {
 
   void AddAttrIfNotPresent(absl::string_view attr_name, const AttrValue& value);
 
-  gtl::FlatMap<string, string> encoded_attrs_;
+  gtl::FlatMap<std::string, std::string> encoded_attrs_;
   mutable AttrValue attr_tmp_;  // For encoding
 
-  string op_name_;
+  std::string op_name_;
   int num_inputs_;
   NodeDef node_def_;
   bool node_def_initialized_;
   bool node_def_finalized_;
 
   std::optional<tensorflow::Fprint128> cached_cache_key_;
-  string device_for_cached_cache_key_;
+  std::string device_for_cached_cache_key_;
 };
 
 template <>
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 77462842f493a2..e0a35cfc59c524 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -85,8 +85,8 @@ TEST(AttrTypeMap, CacheKey) {
   ASSERT_FALSE(cache_key == a.CacheKey("cpu:0"));
 }
 
-string ToString(const AttrValueMap& m) {
-  std::vector<string> strs;
+std::string ToString(const AttrValueMap& m) {
+  std::vector<std::string> strs;
   for (const auto& e : m) {
     strs.push_back(absl::StrCat(e.first, " -> ", e.second.DebugString()));
   }
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index fe649546530f3c..358c51f22c098e 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -150,7 +150,8 @@ EagerContext::EagerContext(
       allow_soft_placement_(opts.config.allow_soft_placement()),
       num_active_steps_(0),
       step_container_(std::make_unique<ScopedStepContainer>(
-          0, [this](const string& name) { ClearResourceContainer(name); })),
+          0,
+          [this](const std::string& name) { ClearResourceContainer(name); })),
       default_executor_(async,
                         /*enable_streaming_enqueue=*/!opts.config.experimental()
                             .disable_eager_executor_streaming_enqueue()),
@@ -198,7 +199,7 @@ AbstractTensorInterface* EagerContext::CreateInt64Scalar(int64_t value) {
   return new TensorInterface(Tensor(value));
 }
 
-AbstractTensorInterface* EagerContext::CreateUint64Scalar(uint64 value) {
+AbstractTensorInterface* EagerContext::CreateUint64Scalar(uint64_t value) {
   return new TensorInterface(Tensor(value));
 }
 
@@ -285,8 +286,9 @@ void EagerContext::InitPrioritizedDeviceTypeList() {
 namespace {
 // Using absl::StrJoin with lambda does not work in tf-lite builds.
 // TODO(b/148160441): Replace with absl::StrJoin once DeviceBase has operator<<.
-std::vector<string> DevicesToString(const PrioritizedDeviceVector& devices) {
-  std::vector<string> v;
+std::vector<std::string> DevicesToString(
+    const PrioritizedDeviceVector& devices) {
+  std::vector<std::string> v;
   v.reserve(devices.size());
   for (const auto& p : devices) {
     v.push_back(p.first->name());
@@ -294,9 +296,9 @@ std::vector<string> DevicesToString(const PrioritizedDeviceVector& devices) {
   return v;
 }
 
-std::vector<string> DeviceTypesToString(
+std::vector<std::string> DeviceTypesToString(
     const PrioritizedDeviceTypeVector& types) {
-  std::vector<string> v;
+  std::vector<std::string> v;
   v.reserve(types.size());
   for (const auto& p : types) {
     v.push_back(p.first.type_string());
@@ -316,8 +318,8 @@ std::vector<string> DeviceTypesToString(
 Device* SelectBestMatchingDevice(const DeviceNameUtils::ParsedName& pattern,
                                  const PrioritizedDeviceVector& existing,
                                  const PrioritizedDeviceTypeVector& supported) {
-  for (const std::pair<DeviceType, int32>& prioritized_type : supported) {
-    for (const std::pair<Device*, int32>& prioritized_device : existing) {
+  for (const std::pair<DeviceType, int32_t>& prioritized_type : supported) {
+    for (const std::pair<Device*, int32_t>& prioritized_device : existing) {
       Device* dev = prioritized_device.first;
       if (DeviceType(dev->attributes().device_type()) ==
               prioritized_type.first &&
@@ -485,7 +487,7 @@ void EagerContext::ClearCachesAndDefaultExecutor() {
   {
     mutex_lock ml(metadata_mu_);
     step_container_ = std::make_unique<ScopedStepContainer>(
-        0, [this](const string& name) { ClearResourceContainer(name); });
+        0, [this](const std::string& name) { ClearResourceContainer(name); });
   }
 }
 
@@ -509,7 +511,7 @@ ContextDevicePlacementPolicy EagerContext::GetDevicePlacementPolicy() const {
 }
 
 #if !defined(IS_MOBILE_PLATFORM)
-std::vector<string> EagerContext::GetRemoteContexts() {
+std::vector<std::string> EagerContext::GetRemoteContexts() {
   tf_shared_lock l(remote_state_mu_);
   return remote_contexts_;
 }
@@ -520,9 +522,9 @@ bool EagerContext::IsRemoteContextsEmpty() {
 }
 
 void EagerContext::CloseAndClearAllRemoteContexts() {
-  uint64 context_id;
-  uint64 context_view_id;
-  std::vector<string> remote_contexts_copy;
+  uint64_t context_id;
+  uint64_t context_view_id;
+  std::vector<std::string> remote_contexts_copy;
   {
     mutex_lock l(remote_state_mu_);
     if (!is_master_) return;
@@ -541,8 +543,8 @@ void EagerContext::CloseAndClearAllRemoteContexts() {
 }
 
 void EagerContext::CloseRemoteContexts(
-    const std::vector<string>& remote_contexts, uint64 context_id,
-    uint64 context_view_id) {
+    const std::vector<std::string>& remote_contexts, uint64_t context_id,
+    uint64_t context_view_id) {
   // Close all remote contexts.
   eager::CloseContextRequest request;
   request.set_context_id(context_id);
@@ -689,21 +691,22 @@ EagerContext::~EagerContext() {
   }
 }
 
-bool EagerContext::FindFunctionByName(const string& name) const {
+bool EagerContext::FindFunctionByName(const std::string& name) const {
   return func_lib_def_.Find(name) != nullptr;
 }
 
 absl::Status EagerContext::FindFunctionOpData(
-    const string& name, const tensorflow::OpRegistrationData** op_data) {
+    const std::string& name, const tensorflow::OpRegistrationData** op_data) {
   return func_lib_def_.LookUp(name, op_data);
 }
 
-const FunctionDef* EagerContext::FindFunctionDef(const string& name) const {
+const FunctionDef* EagerContext::FindFunctionDef(
+    const std::string& name) const {
   return func_lib_def_.Find(name);
 }
 
 core::RefCountPtr<FunctionRecord> EagerContext::FindRecord(
-    const string& name) const {
+    const std::string& name) const {
   return func_lib_def_.FindRecord(name);
 }
 
@@ -763,7 +766,7 @@ std::vector<Device*> EagerContext::ListAllTfDevices() {
   // Since remote_device_mgr may also contain local devices, make sure no
   // duplicated device is returned.
   std::vector<Device*> devices;
-  std::unordered_set<string> dev_names;
+  std::unordered_set<std::string> dev_names;
 
   if (local_device_mgr()) {
     for (const auto& dev : local_device_mgr()->ListDevices()) {
@@ -832,7 +835,7 @@ void EagerContext::EndStep() {
     // TODO(b/139809335): This does not properly clean up remote resources
     // Clean up the previous step container and create a new one.
     step_container_ = std::make_unique<ScopedStepContainer>(
-        0, [this](const string& name) { ClearResourceContainer(name); });
+        0, [this](const std::string& name) { ClearResourceContainer(name); });
   }
 }
 
@@ -880,7 +883,7 @@ absl::Status EagerContext::MaybeRegisterFunctionRemotely(
 }
 
 absl::Status EagerContext::MaybeRemoveFunctionRemotely(
-    const string& function_name) {
+    const std::string& function_name) {
   // Only client context can remove function on remote worker context.
   if (!remote_device_manager_.Owned()) {
     return absl::OkStatus();
@@ -917,10 +920,10 @@ absl::Status EagerContext::MaybeRemoveFunctionRemotely(
 }
 
 absl::Status EagerContext::RegisterExistingFunctionsOnRemoteWorkers(
-    const std::vector<string>& remote_workers) {
+    const std::vector<std::string>& remote_workers) {
 #if !defined(IS_MOBILE_PLATFORM)
   // Register multiple functions on selected remote workers.
-  uint64 context_id = GetContextId();
+  uint64_t context_id = GetContextId();
   FunctionDefLibrary function_defs = func_lib_def_.ToProto();
   std::vector<std::shared_ptr<eager::EnqueueRequest>> requests(
       function_defs.function_size());
@@ -1079,16 +1082,17 @@ absl::Status EagerContext::AddComponentFunction(
   return absl::OkStatus();
 }
 
-const FunctionDef* EagerContext::GetFunctionDef(const string& function_name) {
+const FunctionDef* EagerContext::GetFunctionDef(
+    const std::string& function_name) {
   return func_lib_def_.Find(function_name);
 }
 
-std::vector<string> EagerContext::ListFunctionNames() {
+std::vector<std::string> EagerContext::ListFunctionNames() {
   return func_lib_def_.ListFunctionNames();
 }
 
 absl::Status EagerContext::AddRemoveFunctionNotifier(
-    const string& func, std::function<void()> notifier) {
+    const std::string& func, std::function<void()> notifier) {
   mutex_lock l(remove_function_notifiers_mu_);
   auto iter = remove_function_notifiers_.find(func);
   if (iter != remove_function_notifiers_.end()) {
@@ -1122,7 +1126,7 @@ EagerContext::GetCacheStats() {
   return stats;
 }
 
-absl::Status EagerContext::RemoveFunction(const string& func) {
+absl::Status EagerContext::RemoveFunction(const std::string& func) {
   // TODO(mdan): The context owns these functions. Why check refcount then?
   std::vector<std::function<void()>> notifiers;
   bool is_last_ref = false;
@@ -1308,14 +1312,14 @@ absl::Status EagerContext::FindCompositeDeviceFromName(
   return errors::NotFound("Unknown composite device: ", device_name);
 }
 
-bool EagerContext::IsCustomDevice(const string& device_name) {
+bool EagerContext::IsCustomDevice(const std::string& device_name) {
   CustomDevice* device = nullptr;
   return custom_device_op_handler_.FindCustomDeviceFromName(device_name,
                                                             &device);
 }
 
 absl::Status EagerContext::RegisterCustomDevice(
-    const string& device_name, std::unique_ptr<CustomDevice> device) {
+    const std::string& device_name, std::unique_ptr<CustomDevice> device) {
   Device* existing_physical_device = nullptr;
   if (FindDeviceFromName(device_name.c_str(), &existing_physical_device).ok()) {
     return errors::AlreadyExists(device_name,
@@ -1326,14 +1330,15 @@ absl::Status EagerContext::RegisterCustomDevice(
 }
 
 absl::Status EagerContext::FindOrCreateCompositeDevice(
-    const std::vector<string>& underlying_devices, const string& device_name,
-    CompositeDevice** composite_device) {
+    const std::vector<std::string>& underlying_devices,
+    const std::string& device_name, CompositeDevice** composite_device) {
   if (!device_name.empty() &&
       FindCompositeDeviceFromName(device_name, composite_device).ok()) {
     return absl::OkStatus();
   }
 
-  const uint64 hash_key = Fingerprint64(absl::StrJoin(underlying_devices, ","));
+  const uint64_t hash_key =
+      Fingerprint64(absl::StrJoin(underlying_devices, ","));
 
   mutex_lock l(composite_devices_mu_);
   auto iter = composite_devices_.find(hash_key);
@@ -1371,14 +1376,14 @@ bool EagerContext::OnSameTask(const Device* first, const Device* second) const {
 // Gets the CPU device on the task of device.
 absl::Status EagerContext::CPUDeviceOnTask(const Device* device,
                                            Device** cpu_device) const {
-  string cpu_device_name;
+  std::string cpu_device_name;
   TF_RETURN_IF_ERROR(DeviceNameUtils::DeviceNameToCpuDeviceName(
       device->name(), &cpu_device_name));
 
   return FindDeviceFromName(cpu_device_name.c_str(), cpu_device);
 }
 
-void EagerContext::ClearResourceContainer(const string& name) {
+void EagerContext::ClearResourceContainer(const std::string& name) {
   // TODO(b/139809335): This does not properly clean up remote resources
   auto local_devices = local_device_mgr()->ListDevices();
   for (Device* device : local_devices) {
@@ -1406,8 +1411,8 @@ void EagerContext::UpdateGlobalRendezvousDeviceManager(
 }
 
 namespace {
-absl::Status GetTaskName(Device* d, string* task_name) {
-  string ignored;
+absl::Status GetTaskName(Device* d, std::string* task_name) {
+  std::string ignored;
   if (!DeviceNameUtils::SplitDeviceName(d->name(), task_name, &ignored)) {
     return errors::InvalidArgument("Unable to parse device name: ", d->name());
   }
@@ -1425,7 +1430,7 @@ absl::Status EagerContext::GetClient(
 absl::Status EagerContext::GetClient(
     const DeviceNameUtils::ParsedName& device_name,
     core::RefCountPtr<eager::EagerClient>* client) {
-  string device_task_name;
+  std::string device_task_name;
   if (!DeviceNameUtils::GetTaskName(device_name, &device_task_name)) {
     return errors::InvalidArgument(
         "Task is not fully specified in device name: ",
@@ -1457,7 +1462,8 @@ absl::Status EagerContext::GetClient(
 }
 
 absl::Status EagerContext::GetClient(
-    const string& remote_task, core::RefCountPtr<eager::EagerClient>* client) {
+    const std::string& remote_task,
+    core::RefCountPtr<eager::EagerClient>* client) {
   {
     tf_shared_lock l(remote_state_mu_);
     if (remote_eager_workers_ == nullptr) {
@@ -1474,12 +1480,12 @@ absl::Status EagerContext::GetClient(
   return absl::OkStatus();
 }
 
-uint64 EagerContext::GetContextId() const {
+uint64_t EagerContext::GetContextId() const {
   tf_shared_lock l(remote_state_mu_);
   return context_id_;
 }
 
-uint64 EagerContext::GetContextViewId() const {
+uint64_t EagerContext::GetContextViewId() const {
   tf_shared_lock l(remote_state_mu_);
   return context_view_id_;
 }
@@ -1544,9 +1550,10 @@ absl::Status EagerContext::StoreCollectiveOpsServer(
 }
 
 absl::Status EagerContext::SetRemoteDeviceFilters(
-    const string& remote_worker, const std::vector<string>& device_filters) {
+    const std::string& remote_worker,
+    const std::vector<std::string>& device_filters) {
   // Get fully specified task name for remote worker
-  string remote_worker_task_name;
+  std::string remote_worker_task_name;
   DeviceNameUtils::ParsedName pw;
   if (!DeviceNameUtils::ParseFullName(remote_worker, &pw)) {
     return tensorflow::errors::InvalidArgument(
@@ -1583,7 +1590,7 @@ absl::Status EagerContext::SetRemoteDeviceFilters(
 }
 
 void EagerContext::FilterDevicesForRemoteWorkers(
-    const string& remote_worker,
+    const std::string& remote_worker,
     const protobuf::RepeatedPtrField<DeviceAttributes>& device_attrs,
     std::vector<bool>* filtered_device_mask) {
   filtered_device_mask->resize(device_attrs.size());
@@ -1634,7 +1641,7 @@ absl::Status EagerContext::InitializeRemoteMaster(
     std::shared_ptr<WorkerSession> worker_session,
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
     std::unique_ptr<DynamicDeviceMgr> remote_device_manager,
-    const std::vector<string>& remote_contexts, uint64 context_id,
+    const std::vector<std::string>& remote_contexts, uint64_t context_id,
     tsl::core::RefCountPtr<Rendezvous> r, DeviceMgr* local_device_mgr,
     int keep_alive_secs, DistributedFunctionLibraryRuntime* cluster_flr,
     std::unique_ptr<eager::RemoteMgr, std::function<void(eager::RemoteMgr*)>>
@@ -1661,10 +1668,10 @@ absl::Status EagerContext::InitializeRemoteMaster(
 }
 
 absl::Status EagerContext::UpdateRemoteMaster(
-    uint64 context_id,
+    uint64_t context_id,
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
-    const std::vector<string>& add_remote_contexts,
-    const std::vector<string>& remove_remote_contexts) {
+    const std::vector<std::string>& add_remote_contexts,
+    const std::vector<std::string>& remove_remote_contexts) {
   {
     tf_shared_lock l(remote_state_mu_);
     if (context_id != context_id_) {
@@ -1682,7 +1689,7 @@ absl::Status EagerContext::UpdateRemoteMaster(
     // a larger view id and ignores this request.
     CloseRemoteContexts(remove_remote_contexts, context_id, GetContextViewId());
     mutex_lock l(remote_state_mu_);
-    for (const string& remote_context : remove_remote_contexts) {
+    for (const std::string& remote_context : remove_remote_contexts) {
       remote_contexts_.erase(
           std::remove(remote_contexts_.begin(), remote_contexts_.end(),
                       remote_context),
@@ -1731,10 +1738,10 @@ absl::Status EagerContext::SetMasterContextState(
     std::unique_ptr<ServerInterface> server, WorkerEnv* worker_env,
     std::shared_ptr<WorkerSession> worker_session,
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
-    std::unique_ptr<DynamicDeviceMgr> remote_device_manager, uint64 context_id,
-    uint64 context_view_id, tsl::core::RefCountPtr<Rendezvous> r,
-    DeviceMgr* local_device_mgr, int keep_alive_secs,
-    DistributedFunctionLibraryRuntime* cluster_flr,
+    std::unique_ptr<DynamicDeviceMgr> remote_device_manager,
+    uint64_t context_id, uint64_t context_view_id,
+    tsl::core::RefCountPtr<Rendezvous> r, DeviceMgr* local_device_mgr,
+    int keep_alive_secs, DistributedFunctionLibraryRuntime* cluster_flr,
     std::unique_ptr<eager::RemoteMgr, std::function<void(eager::RemoteMgr*)>>
         remote_mgr) {
   mutex_lock l(remote_state_mu_);
@@ -1852,8 +1859,8 @@ absl::Status EagerContext::SetMasterContextState(
 absl::Status EagerContext::InitializeRemoteWorker(
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
     DynamicDeviceMgr* remote_device_mgr,
-    const std::vector<string>& remote_contexts, uint64 context_id,
-    uint64 context_view_id,
+    const std::vector<std::string>& remote_contexts, uint64_t context_id,
+    uint64_t context_view_id,
     std::function<tsl::core::RefCountPtr<Rendezvous>(const int64_t)>
         rendezvous_creator,
     DistributedFunctionLibraryRuntime* cluster_flr,
@@ -1908,7 +1915,7 @@ absl::Status EagerContext::InitializeRemoteWorker(
 
 absl::Status EagerContext::UpdateRemoteWorker(
     std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
-    const std::vector<string>& remote_contexts, uint64 context_id) {
+    const std::vector<std::string>& remote_contexts, uint64_t context_id) {
   {
     mutex_lock l(remote_state_mu_);
     if (context_id != context_id_) {
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 98fa2e7e31b9a7..1013cc17bf95fe 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -86,10 +86,10 @@ bool SkipRemoteHandleWaitReady();
 
 class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
  public:
-  static constexpr uint64 kInvalidContextId = 0;
+  static constexpr uint64_t kInvalidContextId = 0;
 
-  static uint64 NewContextId() {
-    uint64 context_id = random::New64();
+  static uint64_t NewContextId() {
+    uint64_t context_id = random::New64();
     while (context_id == kInvalidContextId) {
       context_id = random::New64();
     }
@@ -108,7 +108,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   void Release() override { Unref(); }
 
   AbstractTensorInterface* CreateInt64Scalar(int64_t value) override;
-  AbstractTensorInterface* CreateUint64Scalar(uint64 value) override;
+  AbstractTensorInterface* CreateUint64Scalar(uint64_t value) override;
   AbstractTensorInterface* CreateInt32Scalar(int32_t value) override;
   AbstractTensorInterface* CreateFloatScalar(float value) override;
   AbstractTensorInterface* CreateDoubleScalar(double value) override;
@@ -208,14 +208,14 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
                             const NodeDef& ndef, Device** out) const;
 
   // TODO(mdan): Rename to ContainsFunction.
-  bool FindFunctionByName(const string& name) const;
+  bool FindFunctionByName(const std::string& name) const;
 
   absl::Status FindFunctionOpData(
-      const string& name, const tensorflow::OpRegistrationData** op_data);
+      const std::string& name, const tensorflow::OpRegistrationData** op_data);
 
-  const FunctionDef* FindFunctionDef(const string& name) const override;
+  const FunctionDef* FindFunctionDef(const std::string& name) const override;
   core::RefCountPtr<FunctionRecord> FindRecord(
-      const string& name) const override;
+      const std::string& name) const override;
 
   Device* HostCPU() const { return host_cpu_device_; }
   Device* CanonicalDevice(Device* d) const {
@@ -225,7 +225,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
     return HostCPU()->parsed_name();
   }
 
-  const string& HostCPUName() const override { return HostCPU()->name(); }
+  const std::string& HostCPUName() const override { return HostCPU()->name(); }
 
   GraphCollector* GetGraphCollector() { return &graph_collector_; }
 
@@ -263,14 +263,14 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   absl::Status AddComponentFunction(const FunctionDef& fdef,
                                     const FunctionDefLibrary& library);
 
-  const FunctionDef* GetFunctionDef(const string& function_name);
+  const FunctionDef* GetFunctionDef(const std::string& function_name);
 
-  std::vector<string> ListFunctionNames() override;
+  std::vector<std::string> ListFunctionNames() override;
   tensorflow::ImmediateExecutionContext::CacheStats GetCacheStats() override;
 
-  absl::Status RemoveFunction(const string& func) override;
+  absl::Status RemoveFunction(const std::string& func) override;
   absl::Status AddRemoveFunctionNotifier(
-      const string& func, std::function<void()> notifier) override;
+      const std::string& func, std::function<void()> notifier) override;
 
   // Wait for pending nodes to be finished in local executors (including context
   // default executor and thread executors) and executors on remote workers.
@@ -401,7 +401,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   const FunctionLibraryDefinition* FuncLibDef() const { return &func_lib_def_; }
 
   FunctionLibraryDefinition* GetComponentFunctionFunctionLibraryDefinition(
-      const string& function_name) {
+      const std::string& function_name) {
     tf_shared_lock lock(cache_mu_);
     auto iter = component_function_libraries_.find(function_name);
     if (iter != component_function_libraries_.end()) {
@@ -421,11 +421,11 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
                          core::RefCountPtr<eager::EagerClient>* client);
   absl::Status GetClient(const DeviceNameUtils::ParsedName& device_name,
                          core::RefCountPtr<eager::EagerClient>* client);
-  absl::Status GetClient(const string& remote_task,
+  absl::Status GetClient(const std::string& remote_task,
                          core::RefCountPtr<eager::EagerClient>* client);
 
-  uint64 GetContextId() const;
-  uint64 GetContextViewId() const;
+  uint64_t GetContextId() const;
+  uint64_t GetContextViewId() const;
   void IncrementContextViewId();
 
   absl::Status EnableCollectiveOps(const ServerDef& server_def) override;
@@ -450,7 +450,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
       std::shared_ptr<WorkerSession> worker_session,
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
       std::unique_ptr<DynamicDeviceMgr> remote_device_manager,
-      const std::vector<string>& remote_contexts, uint64 context_id,
+      const std::vector<std::string>& remote_contexts, uint64_t context_id,
       tsl::core::RefCountPtr<Rendezvous> r,
       /*const*/ DeviceMgr* local_device_mgr, int keep_alive_secs,
       DistributedFunctionLibraryRuntime* cluster_flr,
@@ -464,18 +464,18 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   // can still be accessed, and will automatically register existing functions
   // if there are newly added hosts.
   absl::Status UpdateRemoteMaster(
-      uint64 context_id,
+      uint64_t context_id,
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
-      const std::vector<string>& add_remote_contexts,
-      const std::vector<string>& remove_remote_contexts);
+      const std::vector<std::string>& add_remote_contexts,
+      const std::vector<std::string>& remove_remote_contexts);
 
   // Similar with InitializeRemoteMaster but this context will not kill remote
   // contexts in shutdown.
   absl::Status InitializeRemoteWorker(
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
       DynamicDeviceMgr* remote_device_mgr,
-      const std::vector<string>& remote_contexts, uint64 context_id,
-      uint64 context_view_id,
+      const std::vector<std::string>& remote_contexts, uint64_t context_id,
+      uint64_t context_view_id,
       std::function<tsl::core::RefCountPtr<Rendezvous>(const int64_t)>
           rendezvous_creator,
       DistributedFunctionLibraryRuntime* cluster_flr,
@@ -487,7 +487,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   // increment context_view_id.
   absl::Status UpdateRemoteWorker(
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
-      const std::vector<string>& remote_contexts, uint64 context_id);
+      const std::vector<std::string>& remote_contexts, uint64_t context_id);
 
   absl::Status StoreCollectiveOpsServer(
       std::unique_ptr<ServerInterface> new_server, DeviceMgr* device_mgr,
@@ -495,7 +495,8 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
 
   // For the specified remote worker, preprocess and set its device filters.
   absl::Status SetRemoteDeviceFilters(
-      const string& remote_worker, const std::vector<string>& device_filters);
+      const std::string& remote_worker,
+      const std::vector<std::string>& device_filters);
 
   // For the specified remote worker, apply the stored device filters to the
   // list of device attributes following these rules:
@@ -507,7 +508,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   // filtered_device_mask) indicating whether each of the devices is visible to
   // the remote worker.
   void FilterDevicesForRemoteWorkers(
-      const string& remote_worker,
+      const std::string& remote_worker,
       const protobuf::RepeatedPtrField<DeviceAttributes>& device_attrs,
       std::vector<bool>* filtered_device_mask);
 
@@ -567,10 +568,10 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   absl::Status FindCompositeDeviceFromName(absl::string_view device_name,
                                            CompositeDevice** device) const;
 
-  bool IsCustomDevice(const string& device_name) override;
+  bool IsCustomDevice(const std::string& device_name) override;
 
   absl::Status RegisterCustomDevice(
-      const string& name, std::unique_ptr<CustomDevice> device) override;
+      const std::string& name, std::unique_ptr<CustomDevice> device) override;
 
   CustomDeviceOpHandler& GetCustomDeviceOpHandler() override {
     return custom_device_op_handler_;
@@ -579,8 +580,8 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   // Find or create a composite device with the given `underlying_devices` and
   // `device_name` (if not empty).
   absl::Status FindOrCreateCompositeDevice(
-      const std::vector<string>& underlying_devices, const string& device_name,
-      CompositeDevice** composite_device);
+      const std::vector<std::string>& underlying_devices,
+      const std::string& device_name, CompositeDevice** composite_device);
 
   bool OnSameTask(const Device* first, const Device* second) const;
   // Gets the CPU device on the task of device.
@@ -667,9 +668,9 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   ~EagerContext() override;
 
   absl::Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef);
-  absl::Status MaybeRemoveFunctionRemotely(const string& function_name);
+  absl::Status MaybeRemoveFunctionRemotely(const std::string& function_name);
   absl::Status RegisterExistingFunctionsOnRemoteWorkers(
-      const std::vector<string>& remote_workers);
+      const std::vector<std::string>& remote_workers);
 
   void ResetPFLR(const DeviceMgr* device_mgr, Env* env,
                  const ConfigProto* config, int graph_def_version,
@@ -681,7 +682,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   void ResetClusterFLR(DistributedFunctionLibraryRuntime* cluster_flr);
   void UpdateGlobalRendezvousDeviceManager(tensorflow::DeviceMgr* device_mgr);
 
-  void ClearResourceContainer(const string& name);
+  void ClearResourceContainer(const std::string& name);
 
   template <typename T>
   struct OwnedOrUnownedHelper {
@@ -750,7 +751,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   // Maps from the fingerprint of a set of device names to a virtual
   // CompositeDevice.
   // TODO(b/145922293): Consider taking device names as keys.
-  absl::flat_hash_map<uint64, std::unique_ptr<CompositeDevice>>
+  absl::flat_hash_map<uint64_t, std::unique_ptr<CompositeDevice>>
       composite_devices_ ABSL_GUARDED_BY(composite_devices_mu_);
 
   FunctionLibraryDefinition func_lib_def_{OpRegistry::Global(),
@@ -780,10 +781,10 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   std::unordered_map<Fprint128, core::RefCountPtr<KernelAndDevice>,
                      Fprint128Hasher>
       kernel_cache_ TF_GUARDED_BY(cache_mu_);
-  std::unordered_map<string, RegisteredFunction*> registered_functions_
+  std::unordered_map<std::string, RegisteredFunction*> registered_functions_
       TF_GUARDED_BY(cache_mu_);
 
-  std::unordered_map<string, std::unique_ptr<FunctionLibraryDefinition>>
+  std::unordered_map<std::string, std::unique_ptr<FunctionLibraryDefinition>>
       component_function_libraries_ TF_GUARDED_BY(cache_mu_);
   absl::flat_hash_map<Fprint128, Device*, Fprint128Hasher> device_cache_
       TF_GUARDED_BY(device_cache_mu_);
@@ -830,11 +831,12 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   OwnedOrUnownedHelper<CollectiveExecutorMgrInterface> collective_executor_mgr_;
 
 #if !defined(IS_MOBILE_PLATFORM)
-  std::vector<string> GetRemoteContexts() TF_LOCKS_EXCLUDED(remote_state_mu_);
+  std::vector<std::string> GetRemoteContexts()
+      TF_LOCKS_EXCLUDED(remote_state_mu_);
   bool IsRemoteContextsEmpty() TF_LOCKS_EXCLUDED(remote_state_mu_);
   void CloseAndClearAllRemoteContexts();
-  void CloseRemoteContexts(const std::vector<string>& remote_contexts,
-                           uint64 context_id, uint64 context_view_id);
+  void CloseRemoteContexts(const std::vector<std::string>& remote_contexts,
+                           uint64_t context_id, uint64_t context_view_id);
 
   // TODO(b/184375824): clean up parameter order for better readability.
   absl::Status SetMasterContextState(
@@ -842,7 +844,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
       std::shared_ptr<WorkerSession> worker_session,
       std::unique_ptr<eager::EagerClientCache> remote_eager_workers,
       std::unique_ptr<DynamicDeviceMgr> remote_device_manager,
-      uint64 context_id, uint64 context_view_id,
+      uint64_t context_id, uint64_t context_view_id,
       tsl::core::RefCountPtr<Rendezvous> r,
       /*const*/ DeviceMgr* local_device_mgr, int keep_alive_secs,
       DistributedFunctionLibraryRuntime* cluster_flr,
@@ -858,12 +860,12 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
 
   mutable mutex remote_state_mu_;
 
-  uint64 context_id_ TF_GUARDED_BY(remote_state_mu_);
+  uint64_t context_id_ TF_GUARDED_BY(remote_state_mu_);
   // The view id of an eager context should be set to 0 when context is created,
   // and continuously incremented when context with the same context_id gets
   // updated. The view id should be consistent between master and workers.
-  uint64 context_view_id_ TF_GUARDED_BY(remote_state_mu_);
-  std::vector<string> remote_contexts_ TF_GUARDED_BY(remote_state_mu_);
+  uint64_t context_view_id_ TF_GUARDED_BY(remote_state_mu_);
+  std::vector<std::string> remote_contexts_ TF_GUARDED_BY(remote_state_mu_);
   std::unique_ptr<eager::EagerClientCache> remote_eager_workers_
       TF_GUARDED_BY(remote_state_mu_);
 
@@ -880,7 +882,7 @@ class EagerContext : public ImmediateExecutionContext, public core::RefCounted {
   bool is_master_ TF_GUARDED_BY(remote_state_mu_);
 
   // Maps from a remote worker to a list of parsed device filters.
-  std::unordered_map<string, std::vector<DeviceNameUtils::ParsedName>>
+  std::unordered_map<std::string, std::vector<DeviceNameUtils::ParsedName>>
       cluster_device_filters_ TF_GUARDED_BY(remote_state_mu_);
 
   // A distributed manager that helps setup, update, and check liveness of
diff --git a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
index d51031b78b7387..deeab20af15aea 100644
--- a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
+++ b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
@@ -362,7 +362,7 @@ bool AreLocalDevicesCompatible(const EagerContext* context,
 }
 
 absl::Status AddRemoteDevicesToMgr(
-    const std::vector<string>& added_remote_workers,
+    const std::vector<std::string>& added_remote_workers,
     WorkerCacheInterface* worker_cache, DynamicDeviceMgr* remote_device_mgr) {
   std::vector<std::unique_ptr<Device>> remote_devices;
   mutex remote_devices_mu;
@@ -394,7 +394,7 @@ absl::Status AddRemoteDevicesToMgr(
 }
 
 absl::Status GetAllRemoteDevices(
-    const std::vector<string>& remote_workers,
+    const std::vector<std::string>& remote_workers,
     WorkerCacheInterface* worker_cache,
     std::unique_ptr<DynamicDeviceMgr>* device_mgr) {
   auto remote_device_mgr = std::make_unique<DynamicDeviceMgr>();
@@ -405,13 +405,13 @@ absl::Status GetAllRemoteDevices(
 }
 
 absl::Status RemoveRemoteDevicesFromMgr(
-    const std::vector<string>& removed_remote_workers,
+    const std::vector<std::string>& removed_remote_workers,
     DynamicDeviceMgr* remote_device_mgr) {
   const std::vector<Device*> remote_devices =
       (remote_device_mgr->ListDevices());
   std::vector<Device*> devices_to_remove;
   for (Device* d : remote_devices) {
-    for (const string& remote_worker : removed_remote_workers) {
+    for (const std::string& remote_worker : removed_remote_workers) {
       if (DeviceNameUtils::IsSameAddressSpace(remote_worker, d->name())) {
         devices_to_remove.emplace_back(d);
         break;
@@ -423,8 +423,8 @@ absl::Status RemoveRemoteDevicesFromMgr(
 }
 
 absl::Status ListRemoteWorkers(ServerInterface* server,
-                               const string& local_worker,
-                               std::vector<string>* remote_workers) {
+                               const std::string& local_worker,
+                               std::vector<std::string>* remote_workers) {
   server->master_env()->worker_cache->ListWorkers(remote_workers);
   remote_workers->erase(
       std::remove(remote_workers->begin(), remote_workers->end(), local_worker),
@@ -432,22 +432,22 @@ absl::Status ListRemoteWorkers(ServerInterface* server,
   return absl::OkStatus();
 }
 
-void DifferentiateWorkerLists(const std::vector<string>* current_list,
-                              const std::vector<string>* new_list,
-                              std::vector<string>* added,
-                              std::vector<string>* removed,
-                              std::vector<string>* existing) {
+void DifferentiateWorkerLists(const std::vector<std::string>* current_list,
+                              const std::vector<std::string>* new_list,
+                              std::vector<std::string>* added,
+                              std::vector<std::string>* removed,
+                              std::vector<std::string>* existing) {
   // Get STL set_difference and set_intersection with one list traversal.
   // Similar to the set_difference library function, the input lists
   // (`current_list` and `new_list`) must be sorted before calling the function.
   added->resize(new_list->size());
   removed->resize(current_list->size());
   existing->resize(current_list->size());
-  std::vector<string>::const_iterator curr_it = current_list->begin();
-  std::vector<string>::const_iterator new_it = new_list->begin();
-  std::vector<string>::iterator added_it = added->begin();
-  std::vector<string>::iterator removed_it = removed->begin();
-  std::vector<string>::iterator existing_it = existing->begin();
+  std::vector<std::string>::const_iterator curr_it = current_list->begin();
+  std::vector<std::string>::const_iterator new_it = new_list->begin();
+  std::vector<std::string>::iterator added_it = added->begin();
+  std::vector<std::string>::iterator removed_it = removed->begin();
+  std::vector<std::string>::iterator existing_it = existing->begin();
   while (curr_it != current_list->end() && new_it != new_list->end()) {
     if (*curr_it < *new_it) {
       *removed_it++ = *curr_it++;
@@ -466,10 +466,10 @@ void DifferentiateWorkerLists(const std::vector<string>* current_list,
 }
 
 absl::Status GetReplacedFromExistingWorkers(
-    const std::vector<string>* existing_workers, uint64 context_id,
-    uint64 context_view_id, const ServerDef& server_def,
+    const std::vector<std::string>* existing_workers, uint64_t context_id,
+    uint64_t context_view_id, const ServerDef& server_def,
     eager::EagerClientCache* client_cache,
-    std::vector<string>* replaced_workers) {
+    std::vector<std::string>* replaced_workers) {
   BlockingCounter counter(existing_workers->size());
   std::vector<absl::Status> statuses(existing_workers->size());
   eager::KeepAliveRequest request;
@@ -505,8 +505,8 @@ absl::Status GetReplacedFromExistingWorkers(
 }
 
 absl::Status CreateRemoteContexts(
-    EagerContext* context, const std::vector<string>& remote_workers,
-    uint64 context_id, uint64 context_view_id, int keep_alive_secs,
+    EagerContext* context, const std::vector<std::string>& remote_workers,
+    uint64_t context_id, uint64_t context_view_id, int keep_alive_secs,
     const ServerDef& server_def, eager::EagerClientCache* remote_eager_workers,
     bool async, const eager::CreateContextRequest& base_request,
     int64_t init_timeout_in_ms, int retries, bool clear_existing_contexts) {
@@ -514,7 +514,7 @@ absl::Status CreateRemoteContexts(
   BlockingCounter counter(num_remote_workers);
   std::vector<absl::Status> statuses(num_remote_workers);
   for (int i = 0; i < num_remote_workers; i++) {
-    const string& remote_worker = remote_workers[i];
+    const std::string& remote_worker = remote_workers[i];
     DeviceNameUtils::ParsedName parsed_name;
     if (!DeviceNameUtils::ParseFullName(remote_worker, &parsed_name)) {
       statuses[i] = errors::InvalidArgument("Unable to parse ", remote_worker,
@@ -583,10 +583,10 @@ absl::Status CreateRemoteContexts(
 }
 
 absl::Status UpdateRemoteContexts(
-    EagerContext* context, const std::vector<string>& remote_workers,
-    const std::vector<string>& added_workers,
-    const std::vector<string>& removed_workers, uint64 context_id,
-    uint64 context_view_id, const ServerDef& server_def,
+    EagerContext* context, const std::vector<std::string>& remote_workers,
+    const std::vector<std::string>& added_workers,
+    const std::vector<std::string>& removed_workers, uint64_t context_id,
+    uint64_t context_view_id, const ServerDef& server_def,
     eager::EagerClientCache* remote_eager_workers,
     const eager::CreateContextRequest& base_request) {
   int num_remote_workers = remote_workers.size();
@@ -594,8 +594,8 @@ absl::Status UpdateRemoteContexts(
   std::vector<absl::Status> statuses(num_remote_workers);
 
   int cluster_device_count = base_request.cluster_device_attributes_size();
-  std::unordered_set<string> added_or_removed(added_workers.begin(),
-                                              added_workers.end());
+  std::unordered_set<std::string> added_or_removed(added_workers.begin(),
+                                                   added_workers.end());
   std::copy(removed_workers.begin(), removed_workers.end(),
             std::inserter(added_or_removed, added_or_removed.end()));
   // Whether each device is in the updated (added or removed) workers
@@ -604,7 +604,7 @@ absl::Status UpdateRemoteContexts(
     const auto& da = base_request.cluster_device_attributes().at(i);
     DeviceNameUtils::ParsedName pn;
     DeviceNameUtils::ParseFullName(da.name(), &pn);
-    string task_name;
+    std::string task_name;
     DeviceNameUtils::GetTaskName(pn, &task_name);
     if (added_or_removed.find(task_name) != added_or_removed.end()) {
       device_added_or_removed[i] = true;
@@ -612,7 +612,7 @@ absl::Status UpdateRemoteContexts(
   }
 
   for (int i = 0; i < num_remote_workers; i++) {
-    const string& remote_worker = remote_workers[i];
+    const std::string& remote_worker = remote_workers[i];
     DeviceNameUtils::ParsedName parsed_name;
     if (!DeviceNameUtils::ParseFullName(remote_worker, &parsed_name)) {
       statuses[i] = errors::InvalidArgument("Unable to parse ", remote_worker,
@@ -689,15 +689,15 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
                                         bool reset_context, int keep_alive_secs,
                                         int64_t init_timeout_in_ms, int retries,
                                         bool clear_existing_contexts = false) {
-  string worker_name =
-      strings::StrCat("/job:", server_def.job_name(),
-                      "/replica:0/task:", server_def.task_index());
+  std::string worker_name =
+      absl::StrCat("/job:", server_def.job_name(),
+                   "/replica:0/task:", server_def.task_index());
 
   // List of current remote workers before updating server_def. Unused if
   // resetting the server_def.
-  std::vector<string> curr_remote_workers;
+  std::vector<std::string> curr_remote_workers;
   // List of updated remote workers.
-  std::vector<string> remote_workers;
+  std::vector<std::string> remote_workers;
 
   // New server created for new server_def. Unused if updating server_def.
   std::unique_ptr<ServerInterface> new_server;
@@ -722,10 +722,10 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
         ListRemoteWorkers(server, worker_name, &remote_workers));
   }
 
-  uint64 context_id = context->GetContextId();
+  uint64_t context_id = context->GetContextId();
   // TODO(b/291142876) Check for invalid context id here (instead of in the C
   // API).
-  uint64 context_view_id = context->GetContextViewId();
+  uint64_t context_view_id = context->GetContextViewId();
   if (reset_context) {
     context_id = EagerContext::NewContextId();
     context_view_id = 0;
@@ -757,10 +757,10 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
   // * existing_workers: set(curr_remote_workers) intersect set(remote_workers)
   // * replaced_workers: workers with the same task names and potentially the
   //     same `hostname:port`s, but replaced by different processes
-  std::vector<string> added_workers;
-  std::vector<string> removed_workers;
-  std::vector<string> existing_workers;
-  std::vector<string> replaced_workers;
+  std::vector<std::string> added_workers;
+  std::vector<std::string> removed_workers;
+  std::vector<std::string> existing_workers;
+  std::vector<std::string> replaced_workers;
 
   // New remote device manager created for new server_def. Unused if updating
   // server_def.
@@ -791,10 +791,11 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
         remote_eager_workers.get(), &replaced_workers));
     if (VLOG_IS_ON(1)) {
       VLOG(1) << "Updating cluster with following changes";
-      for (const string& w : added_workers) VLOG(1) << "  Added worker " << w;
-      for (const string& w : removed_workers)
+      for (const std::string& w : added_workers)
+        VLOG(1) << "  Added worker " << w;
+      for (const std::string& w : removed_workers)
         VLOG(1) << "  Removed worker " << w;
-      for (const string& w : replaced_workers)
+      for (const std::string& w : replaced_workers)
         VLOG(1) << "  Replaced worker " << w;
     }
     if (!replaced_workers.empty()) {
@@ -804,7 +805,7 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
                              replaced_workers.end());
       added_workers.insert(added_workers.end(), replaced_workers.begin(),
                            replaced_workers.end());
-      for (const string& w : replaced_workers) {
+      for (const std::string& w : replaced_workers) {
         existing_workers.erase(
             std::remove(existing_workers.begin(), existing_workers.end(), w),
             existing_workers.end());
@@ -868,7 +869,7 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
     }
     if (!existing_workers.empty()) {
       if (VLOG_IS_ON(1)) {
-        for (const string& w : existing_workers) {
+        for (const std::string& w : existing_workers) {
           VLOG(1) << "Updating cluster with existing worker " << w;
         }
       }
@@ -883,7 +884,7 @@ absl::Status UpdateContextWithServerDef(EagerContext* context,
     }
   }
 
-  auto session_name = strings::StrCat("eager_", context_id);
+  auto session_name = absl::StrCat("eager_", context_id);
   auto* session_mgr = server->worker_env()->session_mgr;
   if (reset_context) {
     tsl::core::RefCountPtr<RemoteRendezvous> r =
@@ -937,15 +938,16 @@ absl::Status EagerContextDistributedManager::SetOrUpdateServerDef(
     if (reset_context) {
       const auto& cdf = server_def.cluster_device_filters();
       for (const auto& jdf : cdf.jobs()) {
-        const string remote_prefix = "/job:" + jdf.name() + "/task:";
+        const std::string remote_prefix = "/job:" + jdf.name() + "/task:";
         for (const auto& tdf : jdf.tasks()) {
           const int32_t task_index = tdf.first;
-          std::vector<string> device_filters(tdf.second.device_filters_size());
+          std::vector<std::string> device_filters(
+              tdf.second.device_filters_size());
           for (int i = 0; i < tdf.second.device_filters_size(); i++) {
             device_filters[i] = tdf.second.device_filters(i);
           }
-          const string remote_worker =
-              strings::StrCat(remote_prefix, task_index);
+          const std::string remote_worker =
+              absl::StrCat(remote_prefix, task_index);
           TF_RETURN_IF_ERROR(
               context_->SetRemoteDeviceFilters(remote_worker, device_filters));
         }
@@ -973,9 +975,9 @@ absl::Status EagerContextDistributedManager::SetOrUpdateServerDef(
 
 absl::Status EagerContextDistributedManager::InitializeLocalOnlyContext(
     const ServerDef& server_def, int keep_alive_secs) {
-  string worker_name =
-      strings::StrCat("/job:", server_def.job_name(),
-                      "/replica:0/task:", server_def.task_index());
+  std::string worker_name =
+      absl::StrCat("/job:", server_def.job_name(),
+                   "/replica:0/task:", server_def.task_index());
   // New server created for new server_def. Unused if updating server_def.
   std::unique_ptr<ServerInterface> new_server;
   ServerInterface* server;
@@ -985,7 +987,7 @@ absl::Status EagerContextDistributedManager::InitializeLocalOnlyContext(
   LOG_AND_RETURN_IF_ERROR(
       NewServerWithOptions(server_def, {device_mgr}, &new_server));
   server = new_server.get();
-  uint64 context_id = EagerContext::NewContextId();
+  uint64_t context_id = EagerContext::NewContextId();
   // Make master eager context accessible by local eager service, which might
   // receive send tensor requests from remote workers.
   LOG_AND_RETURN_IF_ERROR(
@@ -995,7 +997,7 @@ absl::Status EagerContextDistributedManager::InitializeLocalOnlyContext(
   server->worker_env()->device_mgr->ListDeviceAttributes(
       &local_device_attributes);
 
-  auto session_name = strings::StrCat("eager_", context_id);
+  auto session_name = absl::StrCat("eager_", context_id);
   auto* session_mgr = server->worker_env()->session_mgr;
   tsl::core::RefCountPtr<RemoteRendezvous> r =
       server->worker_env()->rendezvous_mgr->Find(context_id);
@@ -1054,7 +1056,7 @@ absl::Status EagerContextDistributedManager::EnableCollectiveOps(
     const bool enable_coordination =
         !config.experimental().coordination_config().service_type().empty();
     if (enable_coordination) {
-      auto session_name = strings::StrCat("eager_", context_->GetContextId());
+      auto session_name = absl::StrCat("eager_", context_->GetContextId());
       std::shared_ptr<WorkerSession> worker_session;
       auto* session_mgr = server->worker_env()->session_mgr;
       // Start coordination service within session if this is the leader.
diff --git a/tensorflow/core/common_runtime/eager/context_test.cc b/tensorflow/core/common_runtime/eager/context_test.cc
index 56cdcf5c5fa746..590abf83871f67 100644
--- a/tensorflow/core/common_runtime/eager/context_test.cc
+++ b/tensorflow/core/common_runtime/eager/context_test.cc
@@ -50,7 +50,7 @@ using ::testing::HasSubstr;
 typedef FunctionDefHelper FDH;
 
 // Return a fake device.
-static Device* CreateDevice(const string& type, int n) {
+static Device* CreateDevice(const std::string& type, int n) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -99,7 +99,7 @@ class EagerContextTest : public ::testing::Test {
 
 TEST_F(EagerContextTest, CompositeDevice) {
   InitContext(SessionOptions(), DEVICE_PLACEMENT_EXPLICIT);
-  std::vector<string> underlying_devices = {
+  std::vector<std::string> underlying_devices = {
       "/job:worker/replica:0/task:0/device:CPU:0",
       "/job:worker/replica:0/task:0/device:CPU:1"};
   CompositeDevice* composite_device_0 = nullptr;
@@ -134,10 +134,10 @@ TEST_F(EagerContextTest, CompositeDevice) {
 
 TEST_F(EagerContextTest, CompositeDeviceWithGivenName) {
   InitContext(SessionOptions(), DEVICE_PLACEMENT_EXPLICIT);
-  const std::vector<string> underlying_devices_0 = {
+  const std::vector<std::string> underlying_devices_0 = {
       "/job:worker/replica:0/task:0/device:CPU:0",
       "/job:worker/replica:0/task:0/device:CPU:1"};
-  const string composite_device_name =
+  const std::string composite_device_name =
       "/job:worker1/replica:0/task:0/device:COMPOSITE:5";
   // Create a CompositeDevice with the given name.
   CompositeDevice* composite_device_0 = nullptr;
@@ -150,7 +150,7 @@ TEST_F(EagerContextTest, CompositeDeviceWithGivenName) {
       context()->FindCompositeDeviceFromName(composite_device_name, &device));
   EXPECT_EQ(device, composite_device_0);
 
-  std::vector<string> underlying_devices_1 = {
+  std::vector<std::string> underlying_devices_1 = {
       "/job:worker/replica:0/task:0/device:CPU:1",
       "/job:worker/replica:0/task:0/device:CPU:2"};
   // Find a CompositeDevice with the given name.
diff --git a/tensorflow/core/common_runtime/eager/copy_to_device_node.h b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
index 34fe7f2b122de0..d12f4965e1fded 100644
--- a/tensorflow/core/common_runtime/eager/copy_to_device_node.h
+++ b/tensorflow/core/common_runtime/eager/copy_to_device_node.h
@@ -71,8 +71,8 @@ class CopyToDeviceNode : public EagerNode {
 
   void Abort(absl::Status status) override { dst_->Poison(status, dstd_); }
 
-  string DebugString() const override {
-    string out = "[CopyToDeviceNode]";
+  std::string DebugString() const override {
+    std::string out = "[CopyToDeviceNode]";
     absl::StrAppend(&out, " src_tensor: ", src_->DebugString());
     absl::StrAppend(&out, ", dst_tensor: ", dst_->DebugString());
     absl::StrAppend(&out, ", dst_device: ", dstd_ ? dstd_->name() : "[]");
diff --git a/tensorflow/core/common_runtime/eager/custom_device.h b/tensorflow/core/common_runtime/eager/custom_device.h
index 2f4f5acc95549f..f72f76b0f5a7ca 100644
--- a/tensorflow/core/common_runtime/eager/custom_device.h
+++ b/tensorflow/core/common_runtime/eager/custom_device.h
@@ -37,13 +37,14 @@ class CustomDeviceTensorHandle;
 class CustomDevice {
  public:
   virtual ~CustomDevice() = default;
-  virtual const string& name() = 0;
+  virtual const std::string& name() = 0;
   virtual absl::Status CopyTensorToDevice(
       ImmediateExecutionTensorHandle* tensor,
       ImmediateExecutionTensorHandle** result) = 0;
 
   virtual absl::Status CopyTensorFromDevice(
-      ImmediateExecutionTensorHandle* tensor, const string& target_device_name,
+      ImmediateExecutionTensorHandle* tensor,
+      const std::string& target_device_name,
       ImmediateExecutionTensorHandle** result) = 0;
 
   virtual absl::Status Execute(const ImmediateExecutionOperation* op,
diff --git a/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc b/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc
index 426930f04b8cda..2a736e67bae789 100644
--- a/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc
+++ b/tensorflow/core/common_runtime/eager/custom_device_op_handler.cc
@@ -26,7 +26,7 @@ namespace tensorflow {
 void CustomDeviceOpHandler::Clear() { custom_devices_.clear(); }
 
 absl::Status CustomDeviceOpHandler::RegisterCustomDevice(
-    const string& device_name, std::unique_ptr<CustomDevice> device) {
+    const std::string& device_name, std::unique_ptr<CustomDevice> device) {
   DeviceNameUtils::ParsedName parsed;
   if (!DeviceNameUtils::ParseFullName(device_name, &parsed) ||
       !parsed.has_job || !parsed.has_replica || !parsed.has_task ||
@@ -46,7 +46,7 @@ absl::Status CustomDeviceOpHandler::RegisterCustomDevice(
 }
 
 bool CustomDeviceOpHandler::FindCustomDeviceFromName(
-    const string& name, CustomDevice** device) const {
+    const std::string& name, CustomDevice** device) const {
   auto dev_it = custom_devices_.find(name);
   if (dev_it == custom_devices_.end()) {
     return false;
diff --git a/tensorflow/core/common_runtime/eager/custom_device_op_handler.h b/tensorflow/core/common_runtime/eager/custom_device_op_handler.h
index 6c38e50d458dcd..66d186014b2176 100644
--- a/tensorflow/core/common_runtime/eager/custom_device_op_handler.h
+++ b/tensorflow/core/common_runtime/eager/custom_device_op_handler.h
@@ -29,11 +29,11 @@ class CustomDeviceOpHandler {
  public:
   ~CustomDeviceOpHandler() = default;
   // Register a new custom device.
-  absl::Status RegisterCustomDevice(const string& device_name,
+  absl::Status RegisterCustomDevice(const std::string& device_name,
                                     std::unique_ptr<CustomDevice> device);
 
   // Find the custom device from given name. Return true if it finds one.
-  bool FindCustomDeviceFromName(const string& name,
+  bool FindCustomDeviceFromName(const std::string& name,
                                 CustomDevice** device) const;
 
   absl::Status Execute(ImmediateExecutionOperation* op,
@@ -53,7 +53,8 @@ class CustomDeviceOpHandler {
   void Clear();
 
  private:
-  std::unordered_map<string, std::unique_ptr<CustomDevice>> custom_devices_;
+  std::unordered_map<std::string, std::unique_ptr<CustomDevice>>
+      custom_devices_;
 };
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc
index fc552f3127576d..02f8eae99fb80a 100644
--- a/tensorflow/core/common_runtime/eager/eager_executor.cc
+++ b/tensorflow/core/common_runtime/eager/eager_executor.cc
@@ -117,7 +117,7 @@ absl::Status EagerExecutor::SyncExecute(EagerNode* node) {
   }
   // NOTE: SyncExecute runs every node regardless of error status in executor.
 
-  uint64 id = next_node_id_++;
+  uint64_t id = next_node_id_++;
 
   absl::Status s = node->Prepare();
   if (!s.ok()) {
@@ -312,9 +312,9 @@ void EagerExecutor::NodeDone(const core::RefCountPtr<NodeItem>& item,
   // a deadlock.
 }
 
-void EagerExecutor::NotifyWaiters(uint64 id) {
+void EagerExecutor::NotifyWaiters(uint64_t id) {
   if (!node_done_notifications_.empty()) {
-    uint64 upperbound_id = 0;
+    uint64_t upperbound_id = 0;
     if (!unfinished_nodes_.empty()) {
       upperbound_id = unfinished_nodes_.begin()->first - 1;
     } else if (!node_queue_.empty()) {
diff --git a/tensorflow/core/common_runtime/eager/eager_executor.h b/tensorflow/core/common_runtime/eager/eager_executor.h
index 7826b271ec98f1..ff8ce9cbc7322c 100644
--- a/tensorflow/core/common_runtime/eager/eager_executor.h
+++ b/tensorflow/core/common_runtime/eager/eager_executor.h
@@ -76,7 +76,7 @@ class EagerNode {
   virtual AsyncEagerNode* AsAsync() { return nullptr; }
   virtual AsyncRemoteExecuteNode* AsAsyncRemoteExecuteNode() { return nullptr; }
 
-  virtual string DebugString() const = 0;
+  virtual std::string DebugString() const = 0;
 
   // Indicates whether a node failure should make the executor unusable.
   virtual bool Fatal() const { return true; }
@@ -193,7 +193,7 @@ class EagerExecutor {
   struct NodeItem : core::RefCounted {
     // Unique id generated in EagerExecutor::Add(). If item1.id < item2.id, it
     // means item1.node is added before item2.node.
-    uint64 id;
+    uint64_t id;
     std::unique_ptr<EagerNode> node;
     NodeState state;
   };
@@ -203,7 +203,8 @@ class EagerExecutor {
 
   void NodeDone(const core::RefCountPtr<NodeItem>& item,
                 const absl::Status& status, bool from_queue);
-  void NotifyWaiters(uint64 id) TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_);
+  void NotifyWaiters(uint64_t id)
+      TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_);
 
   // Starts execution of pending EagerNodes. This function loops till executor
   // state_ is set to kShutDown. If any errors are encountered, these are set
@@ -220,9 +221,9 @@ class EagerExecutor {
   absl::Status WaitForAllPendingNodesLocked(mutex_lock* lock)
       TF_EXCLUSIVE_LOCKS_REQUIRED(node_queue_mutex_);
 
-  absl::Status WaitImpl(bool wait_all, uint64 node_id);
+  absl::Status WaitImpl(bool wait_all, uint64_t node_id);
 
-  std::atomic<uint64> next_node_id_;
+  std::atomic<uint64_t> next_node_id_;
 
   mutable mutex node_queue_mutex_;
 
@@ -236,7 +237,7 @@ class EagerExecutor {
       TF_GUARDED_BY(node_queue_mutex_);
 
   // Ordered by NodeItem::id.
-  std::map<uint64, core::RefCountPtr<NodeItem>, std::less<uint64>>
+  std::map<uint64_t, core::RefCountPtr<NodeItem>, std::less<uint64_t>>
       unfinished_nodes_ TF_GUARDED_BY(node_queue_mutex_);
 
   // `status_` is set based on any errors raised during execution of a
@@ -248,7 +249,7 @@ class EagerExecutor {
   // These condition_variables are notified and removed when that EagerNode is
   // done executing, or if an error is found in execution of any EagerNode.
   // The map is ordered by id.
-  std::multimap<uint64, condition_variable*, std::less<uint64>>
+  std::multimap<uint64_t, condition_variable*, std::less<uint64_t>>
       node_done_notifications_ TF_GUARDED_BY(node_queue_mutex_);
 
   // thread_exited_notification_ is notified by the `thread_` right before it
diff --git a/tensorflow/core/common_runtime/eager/eager_executor_test.cc b/tensorflow/core/common_runtime/eager/eager_executor_test.cc
index 3fc6f3860085f0..acaba8320ed871 100644
--- a/tensorflow/core/common_runtime/eager/eager_executor_test.cc
+++ b/tensorflow/core/common_runtime/eager/eager_executor_test.cc
@@ -63,7 +63,7 @@ class TestEagerNode : public EagerNode {
   };
 
   void Abort(absl::Status status) override {}
-  string DebugString() const override { return "testEagerNode"; }
+  std::string DebugString() const override { return "testEagerNode"; }
 
  private:
   TestState* state_;
@@ -94,7 +94,7 @@ class TestAsyncEagerNode : public AsyncEagerNode {
   };
 
   void Abort(absl::Status status) override {}
-  string DebugString() const override { return "testAsyncEagerNode"; }
+  std::string DebugString() const override { return "testAsyncEagerNode"; }
 
  private:
   TestState* state_;
diff --git a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h
index bd7098473d7532..221d30d98518f6 100644
--- a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h
+++ b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h
@@ -28,7 +28,7 @@ namespace tensorflow {
 // implement the Run method.
 class EagerOpRewrite {
  public:
-  EagerOpRewrite(string name, string file, string line) {
+  EagerOpRewrite(std::string name, std::string file, std::string line) {
     debug_info_.name = name;
     debug_info_.file = file;
     debug_info_.line = line;
@@ -43,7 +43,7 @@ class EagerOpRewrite {
 
   // Holds information about the rewrite registration.
   struct DebugInfo {
-    string name, file, line;
+    std::string name, file, line;
   };
 
   // Returns information about the registered Eager op rewrite.
@@ -75,7 +75,7 @@ class EagerOpRewriteRegistry {
  private:
   static constexpr int32_t kNumPhases = 2;
   // Holds all the registered Eager op rewrites and their ordinal numbers.
-  std::array<std::list<std::pair<std::unique_ptr<EagerOpRewrite>, int32>>,
+  std::array<std::list<std::pair<std::unique_ptr<EagerOpRewrite>, int32_t>>,
              kNumPhases>
       rewrites_;
 };
diff --git a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc
index d50f3e0a4ec411..e76627a3680daf 100644
--- a/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc
+++ b/tensorflow/core/common_runtime/eager/eager_op_rewrite_registry_test.cc
@@ -23,7 +23,7 @@ namespace tensorflow {
 
 class TestEagerOpRewrite : public EagerOpRewrite {
  public:
-  TestEagerOpRewrite(string name, string file, string line)
+  TestEagerOpRewrite(std::string name, std::string file, std::string line)
       : EagerOpRewrite(name, file, line),
         executor_(/*async=*/false, /*enable_streaming_enqueue=*/true) {}
   static int count_;
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.cc b/tensorflow/core/common_runtime/eager/eager_operation.cc
index b14cbeeba9bb81..d730df6b608b06 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.cc
+++ b/tensorflow/core/common_runtime/eager/eager_operation.cc
@@ -473,7 +473,7 @@ absl::Status EagerOperation::MutableTensorHandleInputs(
 }
 
 absl::Status EagerOperation::SetDeviceName(const char* c_name) {
-  string name(c_name != nullptr ? c_name : "");
+  std::string name(c_name != nullptr ? c_name : "");
   if (name != last_set_device_name_) {
     if (!DeviceNameUtils::ParseFullName(name, &device_parsed_name_)) {
       return errors::InvalidArgument("Malformed device specification '", name,
@@ -498,7 +498,7 @@ bool EagerOperation::IsLocal() const {
          device_parsed_name_.task == host_cpu_name.task;
 }
 
-string VariantDeviceDebugString(VariantDevice device) {
+std::string VariantDeviceDebugString(VariantDevice device) {
   if (device == kVariantDeviceNull) {
     return "[]";
   } else if (std::holds_alternative<CustomDevice*>(device)) {
@@ -513,8 +513,8 @@ void EagerOperation::AddAttrs(const AbstractOpAttrs* op_attrs) {
   attrs_.CopyAttributes(*(down_cast<const AttrBuilder*>(op_attrs)));
 }
 
-string EagerOperation::DebugString() const {
-  string out;
+std::string EagerOperation::DebugString() const {
+  std::string out;
   VLOG(1) << "EagerOperation::DebugString() over " << this;
 
   absl::StrAppend(&out, "Name: ", Name(), "\n");
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index 04cefa00861198..b51e098413685d 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -57,9 +57,9 @@ class EagerOperation : public ImmediateExecutionOperation {
     return Reset(op, raw_device_name, false, nullptr);
   }
 
-  const string& Name() const override { return attrs_.op_name(); }
+  const std::string& Name() const override { return attrs_.op_name(); }
 
-  const string& DeviceName() const override { return device_name_; }
+  const std::string& DeviceName() const override { return device_name_; }
 
   ImmediateExecutionContext* GetContext() const override { return &ctx_; }
 
@@ -196,7 +196,7 @@ class EagerOperation : public ImmediateExecutionOperation {
 
   // This is useful if we want the EagerOperation to point to a different
   // function.
-  void UpdateName(const string& name) {
+  void UpdateName(const std::string& name) {
     attrs_.set_op_name(name);
     op_name_ = attrs_.op_name();
   }
@@ -242,7 +242,7 @@ class EagerOperation : public ImmediateExecutionOperation {
 
   EagerExecutor& Executor() { return *executor_; }
 
-  string DebugString() const;
+  std::string DebugString() const;
 
   const absl::optional<EagerFunctionParams>& eager_func_params() const {
     return eager_func_params_;
@@ -289,12 +289,12 @@ class EagerOperation : public ImmediateExecutionOperation {
   // The last device name given to SetDeviceName.
   // This is used to avoid having to re-process the same device in repeated
   // calls to SetDeviceName.
-  string last_set_device_name_;
+  std::string last_set_device_name_;
 
   // The operation's device name.
   // This contains the named passed to SetDeviceName until device_ is set,
   // at which point it contains the device_ name.
-  string device_name_;
+  std::string device_name_;
 
   // The parsed device name.
   // This will always contain the result of
diff --git a/tensorflow/core/common_runtime/eager/eager_operation_test.cc b/tensorflow/core/common_runtime/eager/eager_operation_test.cc
index 499d2ef110bfd9..2ff6952eb0d17e 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation_test.cc
+++ b/tensorflow/core/common_runtime/eager/eager_operation_test.cc
@@ -68,7 +68,7 @@ TEST(EagerOperationTest, EagerFunctionParamsAndStepId) {
   auto op = new EagerOperation(ctx);
   EXPECT_FALSE(op->eager_func_params().has_value());
 
-  string device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
+  std::string device_name = "/job:localhost/replica:0/task:0/device:CPU:0";
   TF_ASSERT_OK(op->SetDeviceName(device_name.c_str()));
   TF_ASSERT_OK(op->Reset("DummyFunction", device_name.c_str()));
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index d12d51db3907f9..547336cdeb6d76 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -132,8 +132,8 @@ bool SendAsProtosWhenPossible() {
   return send_as_protos_when_possible;
 }
 
-const string& DeviceNameOrUnspecified(Device* device) {
-  static string* unspecified_string = new string("<unspecified>");
+const std::string& DeviceNameOrUnspecified(Device* device) {
+  static std::string* unspecified_string = new std::string("<unspecified>");
   return (device == nullptr) ? *unspecified_string : device->name();
 }
 
@@ -158,7 +158,7 @@ absl::Status CopyInputToExpectedDevice(EagerContext* ctx, EagerOperation* op,
   // Should only be called when these don't match
   DCHECK(expected_input_device != handle_device);
   *result = nullptr;
-  const string& op_device_name = DeviceNameOrUnspecified(op_device);
+  const std::string& op_device_name = DeviceNameOrUnspecified(op_device);
 
   switch (ctx->GetDevicePlacementPolicy()) {
     case DEVICE_PLACEMENT_SILENT_FOR_INT32:
@@ -314,7 +314,7 @@ absl::Status GetDeviceForInput(const EagerOperation& op,
                                const bool is_host_memory_arg,
                                TensorHandle* tensor_handle, Device** result) {
   Device* cpu_device = ctx.HostCPU();
-  string device_name;
+  std::string device_name;
   if (tensor_handle->Type() != TensorHandle::LOCAL) {
     Device* device = tensor_handle->device();
     device_name = device != nullptr ? device->name() : cpu_device->name();
@@ -473,7 +473,7 @@ absl::Status MustCompileWithXLA(const EagerOperation* op,
 // `has_jit_compile` and `device`.
 absl::Status HasNestedJitCompile(const EagerOperation& op,
                                  const EagerContext& ctx, bool* has_jit_compile,
-                                 string* device) {
+                                 std::string* device) {
   *has_jit_compile = false;
 
   const std::string kStatefulPartitionedCallOp = "StatefulPartitionedCall";
@@ -488,7 +488,7 @@ absl::Status HasNestedJitCompile(const EagerOperation& op,
   const FunctionLibraryDefinition* func_lib_def = op.FuncLibDef();
 
   while (!function_names.empty()) {
-    const string& function_name = function_names.front();
+    const std::string& function_name = function_names.front();
 
     const FunctionDef* function_def = func_lib_def->Find(function_name);
     if (function_def == nullptr) {
@@ -518,8 +518,8 @@ absl::Status HasNestedJitCompile(const EagerOperation& op,
   return absl::OkStatus();
 }
 
-string CanonicalizeDeviceType(std::string_view device_type) {
-  string canonical_device_type = "Unknown";
+std::string CanonicalizeDeviceType(std::string_view device_type) {
+  std::string canonical_device_type = "Unknown";
   if (device_type == "XLA_CPU" || device_type == tensorflow::DEVICE_CPU) {
     canonical_device_type = tensorflow::DEVICE_CPU;
   }
@@ -542,11 +542,12 @@ absl::Status UpdateCompileCounter(const EagerOperation* op,
     return absl::OkStatus();
   }
 
-  string device_type = CanonicalizeDeviceType(op->GetDeviceParsedName().type);
-  string compilation_option = kDisabled;
+  std::string device_type =
+      CanonicalizeDeviceType(op->GetDeviceParsedName().type);
+  std::string compilation_option = kDisabled;
   if (!compile_with_xla) {
     bool nested_jit_compile = false;
-    string device;
+    std::string device;
     if (!ctx.FuncLibDef()->HasOptimizedFunctionGraph(op->Name())) {
       TF_RETURN_IF_ERROR(
           HasNestedJitCompile(*op, ctx, &nested_jit_compile, &device));
@@ -586,14 +587,14 @@ absl::Status UpdateCompileCounter(const EagerOperation* op,
 
 using ProtoArgListType = protobuf::RepeatedPtrField<OpDef_ArgDef>;
 
-string EscapeOrigName(const string& orig_name) {
+std::string EscapeOrigName(const std::string& orig_name) {
   // Replace _ with __ in the original name to avoid name conflicts.
   return absl::StrReplaceAll(orig_name, {{"_", "__"}});
 }
 
 // Variadic args are flattened during wrapping. This utility returns the name
 // of a flattened arg/attr.
-string GetFlatName(const string orig_name, int index) {
+std::string GetFlatName(const std::string orig_name, int index) {
   return absl::StrCat(EscapeOrigName(orig_name), "_", index);
 }
 
@@ -607,13 +608,14 @@ string GetFlatName(const string orig_name, int index) {
 // IdentityN[T:[DT_FLOAT, DT_INT64]] -> __wrapped__IdentityN_T_2
 // Concat[N:2, T:DT_FLOAT] -> __wrapped__Concat_N_2
 absl::Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef,
-                                const AbstractOpAttrs* op_attrs, string* name) {
-  string fname = absl::StrCat("__wrapped__", EscapeOrigName(op->Name()));
+                                const AbstractOpAttrs* op_attrs,
+                                std::string* name) {
+  std::string fname = absl::StrCat("__wrapped__", EscapeOrigName(op->Name()));
   // For every variadic arg in `args`, populates `attr_to_len` with
   // (attr_name, len(arg)).
   auto FillAttrToLen = [op_attrs, op](
                            const ProtoArgListType& args,
-                           absl::btree_map<string, int>* attr_to_len) {
+                           absl::btree_map<std::string, int>* attr_to_len) {
     for (const auto& arg : args) {
       if (!arg.type_list_attr().empty()) {
         absl::InlinedVector<DataType, 4UL> type_list;
@@ -631,7 +633,7 @@ absl::Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef,
     }
     return absl::OkStatus();
   };
-  absl::btree_map<string, int> attr_to_len;
+  absl::btree_map<std::string, int> attr_to_len;
   TF_RETURN_IF_ERROR(FillAttrToLen(opdef.input_arg(), &attr_to_len));
   TF_RETURN_IF_ERROR(FillAttrToLen(opdef.output_arg(), &attr_to_len));
   for (auto& name_len : attr_to_len) {
@@ -768,7 +770,8 @@ absl::Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef,
 // Note that the N attr is preserved so that it can get copied to the
 // inner op via a placeholder. This allows additional verification.
 absl::Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef,
-                                     const string& fname, OpDef& signature) {
+                                     const std::string& fname,
+                                     OpDef& signature) {
   signature = opdef;
   signature.clear_input_arg();
   signature.clear_output_arg();
@@ -777,7 +780,7 @@ absl::Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef,
   auto FillSignatureArgs = [op_attrs, op](
                                const ProtoArgListType& opdef_args,
                                ProtoArgListType* sig_args,
-                               absl::flat_hash_set<string>& new_attrs) {
+                               absl::flat_hash_set<std::string>& new_attrs) {
     for (const auto& arg : opdef_args) {
       if (!arg.type_list_attr().empty()) {
         absl::InlinedVector<DataType, 4UL> type_list;
@@ -817,7 +820,7 @@ absl::Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef,
     }
     return absl::OkStatus();
   };
-  absl::flat_hash_set<string> new_attrs;
+  absl::flat_hash_set<std::string> new_attrs;
   TF_RETURN_IF_ERROR(FillSignatureArgs(
       opdef.input_arg(), signature.mutable_input_arg(), new_attrs));
   TF_RETURN_IF_ERROR(FillSignatureArgs(
@@ -838,7 +841,7 @@ absl::Status AddMixedTypeListAttrs(EagerOperation* wrapped_op,
                                    const OpDef& opdef) {
   auto FillAttrsToAdd =
       [op_attrs](const ProtoArgListType& opdef_args,
-                 absl::flat_hash_map<string, DataType>* attrs_to_add) {
+                 absl::flat_hash_map<std::string, DataType>* attrs_to_add) {
         for (const auto& arg : opdef_args) {
           if (!arg.type_list_attr().empty()) {
             absl::InlinedVector<DataType, 4UL> type_list;
@@ -852,7 +855,7 @@ absl::Status AddMixedTypeListAttrs(EagerOperation* wrapped_op,
         }
         return absl::OkStatus();
       };
-  absl::flat_hash_map<string, DataType> attrs_to_add;
+  absl::flat_hash_map<std::string, DataType> attrs_to_add;
   TF_RETURN_IF_ERROR(FillAttrsToAdd(opdef.input_arg(), &attrs_to_add));
   TF_RETURN_IF_ERROR(FillAttrsToAdd(opdef.output_arg(), &attrs_to_add));
   for (auto& name_type : attrs_to_add) {
@@ -867,7 +870,8 @@ absl::Status AddMixedTypeListAttrs(EagerOperation* wrapped_op,
 // outputs which need to be flattened.
 absl::Status PopulateRetMap(FunctionDef* fdef, const AbstractOpAttrs* op_attrs,
                             const EagerOperation* op, const OpDef& opdef,
-                            const OpDef& signature, const string& node_name) {
+                            const OpDef& signature,
+                            const std::string& node_name) {
   int next_sig_output = 0;
   for (size_t i = 0; i < opdef.output_arg_size(); i++) {
     const auto& output_arg = opdef.output_arg(i);
@@ -916,7 +920,7 @@ absl::Status WrapInCallOp(EagerOperation* op, EagerOperation** wrapped_op) {
   // TODO(srbs): Support list inputs/outputs.
   auto verify_wrappable_in_call_op = [](const OpDef& opdef,
                                         EagerOperation* op) -> absl::Status {
-    absl::flat_hash_set<string> opdef_attrs;
+    absl::flat_hash_set<std::string> opdef_attrs;
     for (const auto& attr : opdef.attr()) {
       opdef_attrs.insert(attr.name());
     }
@@ -941,7 +945,7 @@ absl::Status WrapInCallOp(EagerOperation* op, EagerOperation** wrapped_op) {
   // This can be avoided by introducing a dict in EagerContext that stores a
   // mapping from the eager op's name to its unique FunctionDef name.
   auto op_attrs = op->GetOpAttrs();
-  string fname;
+  std::string fname;
   TF_RETURN_IF_ERROR(BuildWrappedOpName(op, opdef, op_attrs, &fname));
   if (!op->EagerContext().GetFunctionDef(fname)) {
     FunctionDef fdef;
@@ -1168,7 +1172,8 @@ absl::StatusOr<Fprint128> GetKernelCacheKey(
 absl::Status ExtractFunctionInputInfo(
     EagerOperation* op, const KernelDef* kernel_def,
     std::vector<Device*>& input_device_ptrs,
-    absl::flat_hash_map<string, const std::vector<string>*>& composite_devices,
+    absl::flat_hash_map<std::string, const std::vector<std::string>*>&
+        composite_devices,
     std::unordered_map<int, DtypeAndPartialTensorShape>&
         input_resource_variable_dtypes_and_shapes) {
   tsl::profiler::TraceMe activity("EagerCopyToDevice",
@@ -1268,7 +1273,7 @@ absl::Status GetOrCreateKernelAndDevice(
   if (is_small_constant_optimization_enabled(*op)) {
     TF_ASSIGN_OR_RETURN(BoolTensorInputs bool_inputs,
                         GetBoolInputs(op, /*delete_inputs=*/false));
-    string folded_name = op->Name();
+    std::string folded_name = op->Name();
     for (const auto& [input_name, input_value] : bool_inputs) {
       folded_name = small_constants_optimizer::FoldedFunctionName(
           folded_name, input_name, input_value);
@@ -1320,7 +1325,8 @@ absl::Status GetOrCreateKernelAndDevice(
       (ctx.RunEagerOpAsFunction() && !op->is_function());
 
   std::vector<Device*> input_device_ptrs;
-  absl::flat_hash_map<string, const std::vector<string>*> composite_devices;
+  absl::flat_hash_map<std::string, const std::vector<std::string>*>
+      composite_devices;
   std::unordered_map<int, DtypeAndPartialTensorShape>
       input_resource_variable_dtypes_and_shapes;
   const KernelDef* kernel_def = nullptr;
@@ -1380,7 +1386,7 @@ absl::Status GetOrCreateKernelAndDevice(
     bool run_function_with_flr = false;
     bool function_runs_at_most_once = FunctionRunsAtMostOnce(op, ctx);
 
-    std::optional<string> xla_compile_device_type;
+    std::optional<std::string> xla_compile_device_type;
     if (op->is_function()) {
       bool compile_with_xla;
       // By default we should run functions with FunctionLibraryRuntime.
@@ -1474,7 +1480,8 @@ absl::Status GetOrCreateKernelAndDevice(
         // Check if any of the Op's output_arg(s) are pinned to Host.
         if (kernel_def == nullptr) return false;
         const OpDef& op_def = OpRegistry::Global()->LookUp(op->Name())->op_def;
-        for (const string& host_memory_arg : kernel_def->host_memory_arg()) {
+        for (const std::string& host_memory_arg :
+             kernel_def->host_memory_arg()) {
           for (const auto& output_arg : op_def.output_arg()) {
             if (output_arg.name() == host_memory_arg) {
               return false;
@@ -1613,7 +1620,7 @@ absl::Status CreateUnshapedOutput(
     return errors::InvalidArgument(
         "Unable to find a remote op id for a remote output of ", kernel.name());
   }
-  string remote_task;
+  std::string remote_task;
   if (!DeviceNameUtils::GetTaskName(output_device->parsed_name(),
                                     &remote_task)) {
     return errors::InvalidArgument(
@@ -1762,8 +1769,8 @@ absl::Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
   TF_RETURN_IF_ERROR(ValidateInputTypeAndPlacement(&ctx, op, kernel));
 
   if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) {
-    string msg = absl::StrCat("Executing op ", op->Name(), " in device ",
-                              kernel->device()->name());
+    std::string msg = absl::StrCat("Executing op ", op->Name(), " in device ",
+                                   kernel->device()->name());
     if (!logging::LogToListeners(msg)) {
       LOG(INFO) << msg;
     }
@@ -1828,15 +1835,15 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   // TODO(fishx): Remove following code when lazy tensor copy is ready.
   if (op->Device() == kVariantDeviceNull) {
     tensorflow::Device* device = nullptr;
-    string device_name = op->DeviceName();
+    std::string device_name = op->DeviceName();
     TF_RETURN_IF_ERROR(ctx.FindDeviceFromName(device_name.c_str(), &device));
     op->SetDevice(device);
   }
 
   core::RefCountPtr<eager::EagerClient> eager_client;
-  uint64 context_id = ctx.GetContextId();
+  uint64_t context_id = ctx.GetContextId();
   TF_RETURN_IF_ERROR(ctx.GetClient(op->GetDeviceParsedName(), &eager_client));
-  string remote_task;
+  std::string remote_task;
   if (!DeviceNameUtils::GetTaskName(op->GetDeviceParsedName(), &remote_task)) {
     return errors::InvalidArgument(
         "Unable to find remote task corresponding to device ",
@@ -1859,7 +1866,7 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
       tensorflow::TensorHandle* input = (*inputs)[i];
       tensorflow::Device* input_device = input->device();
       tensorflow::Device* input_device_or_cpu = input->DeviceOrHostCPU(ctx);
-      const string* input_device_name = &input_device_or_cpu->name();
+      const std::string* input_device_name = &input_device_or_cpu->name();
       bool serialize_resource_dtype_and_shape = false;
       if (op_device != input_device &&
           // If the expected and actual devices are on the same task, don't
@@ -1986,7 +1993,7 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   }
   *num_retvals = num_outputs;
 
-  const tensorflow::uint64 id = remote_op->id();
+  const uint64_t id = remote_op->id();
   for (size_t i = 0; i < num_outputs; ++i) {
     // TODO(nareshmodi): Change the callback to instead add the decref to a
     // list of pending decrefs that we can send as a batch with the next
@@ -2048,7 +2055,7 @@ absl::Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
       {retvals, num_outputs}));
 
   if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) {
-    string msg = absl::StrCat(
+    std::string msg = absl::StrCat(
         "Executing op ", op->Name(), " on task ",
         DeviceNameUtils::ParsedNameToString(op->GetDeviceParsedName()));
     if (!logging::LogToListeners(msg)) {
@@ -2362,7 +2369,7 @@ absl::Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
     return errors::Unimplemented(
         "Eager's remote execution is not available on mobile devices.");
 #else   // !IS_MOBILE_PLATFORM
-    uint64 recv_op_id = 0;
+    uint64_t recv_op_id = 0;
     if (receiver_is_local) {
       Device* d = ctx->CanonicalDevice(device);
       // TODO(gjn): Need to add support for async execution. Note if receiver
@@ -2403,7 +2410,7 @@ absl::Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
           return absl::OkStatus();
         }
       }
-      string remote_task;
+      std::string remote_task;
       if (!DeviceNameUtils::GetTaskName(device->parsed_name(), &remote_task)) {
         return errors::InvalidArgument(
             "Unable to find remote task corresponding to device ",
@@ -2523,8 +2530,8 @@ void EagerLocalExecuteAsync(EagerOperation* op, TensorHandle** retvals,
   }
 
   if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) {
-    string msg = absl::StrCat("Executing op ", op->Name(), " in device ",
-                              kernel->device()->name());
+    std::string msg = absl::StrCat("Executing op ", op->Name(), " in device ",
+                                   kernel->device()->name());
     if (!logging::LogToListeners(msg)) {
       LOG(INFO) << msg;
     }
diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc
index 09bebd3e1f7cf2..a8fb4fc308affe 100644
--- a/tensorflow/core/common_runtime/eager/execute_node.cc
+++ b/tensorflow/core/common_runtime/eager/execute_node.cc
@@ -23,7 +23,7 @@ namespace tensorflow {
 #if !defined(IS_MOBILE_PLATFORM)
 bool ExecuteNodeArgs::IsRemote(EagerContext* ctx, Device* input_device,
                                TensorHandle* handle) {
-  uint64 context_view_id = ctx->GetContextViewId();
+  uint64_t context_view_id = ctx->GetContextViewId();
   if (handle->Type() == TensorHandle::REMOTE ||
       handle->HasRemoteMirror(input_device, context_view_id)) {
     if (!has_remote_inputs_) {
diff --git a/tensorflow/core/common_runtime/eager/execute_test.cc b/tensorflow/core/common_runtime/eager/execute_test.cc
index ea174fd22f76a2..5427851f3d8b3f 100644
--- a/tensorflow/core/common_runtime/eager/execute_test.cc
+++ b/tensorflow/core/common_runtime/eager/execute_test.cc
@@ -70,7 +70,7 @@ TEST(ExecuteTest, SimpleFunction) {
       false, &device_mgr, false, nullptr, nullptr);
 
   const Tensor kTwo = test::AsScalar<int64_t>(2);
-  const string function_name = "XTimesTwo";
+  const std::string function_name = "XTimesTwo";
   const FunctionDef x_times_two = FunctionDefHelper::Define(
       // Name
       function_name,
@@ -125,7 +125,7 @@ TEST(ExecuteTest, SimpleFunctionInt32BadFullType) {
       /*run_eager_op_as_function=*/true);
 
   const Tensor kTwo = test::AsScalar<int32_t>(2);
-  const string function_name = "XTimesTwo";
+  const std::string function_name = "XTimesTwo";
   const FunctionDef x_times_two = FunctionDefHelper::Define(
       // Name
       function_name,
@@ -188,7 +188,7 @@ TEST(ExecuteTest, CompiledFunction) {
       false, &device_mgr, false, nullptr, nullptr);
 
   const Tensor kTwo = test::AsScalar<int64_t>(2);
-  const string function_name = "XTimesTwo";
+  const std::string function_name = "XTimesTwo";
   const FunctionDef x_times_two = FunctionDefHelper::Define(
       // Name
       function_name,
@@ -245,7 +245,7 @@ TEST(ExecuteTest, NestedCompiledFunction) {
       false, &device_mgr, false, nullptr, nullptr);
 
   const Tensor kTwo = test::AsScalar<int64_t>(2);
-  const string function_name = "XTimesTwo";
+  const std::string function_name = "XTimesTwo";
   const FunctionDef x_times_two = FunctionDefHelper::Define(
       // Name
       function_name,
@@ -266,7 +266,7 @@ TEST(ExecuteTest, NestedCompiledFunction) {
       });
   TF_ASSERT_OK(ctx->AddFunctionDef(x_times_two));
 
-  const string call_function_name = "FunctionCall";
+  const std::string call_function_name = "FunctionCall";
   const FunctionDef function_call = FunctionDefHelper::Define(
       // Name
       call_function_name,
@@ -325,7 +325,7 @@ TEST(ExecuteTest, MultipleNestedCompiledFunction) {
       false, &device_mgr, false, nullptr, nullptr);
 
   const Tensor kTwo = test::AsScalar<int64_t>(2);
-  const string function_name = "XTimesTwo";
+  const std::string function_name = "XTimesTwo";
   const FunctionDef x_times_two = FunctionDefHelper::Define(
       // Name
       function_name,
@@ -346,7 +346,7 @@ TEST(ExecuteTest, MultipleNestedCompiledFunction) {
       });
   TF_ASSERT_OK(ctx->AddFunctionDef(x_times_two));
 
-  const string call_function_name = "FunctionCall";
+  const std::string call_function_name = "FunctionCall";
   FunctionDef function_call = FunctionDefHelper::Define(
       // Name
       call_function_name,
@@ -379,7 +379,7 @@ TEST(ExecuteTest, MultipleNestedCompiledFunction) {
 
   TF_ASSERT_OK(ctx->AddFunctionDef(function_call));
 
-  const string call_function_name2 = "FunctionCall2";
+  const std::string call_function_name2 = "FunctionCall2";
   const FunctionDef function_call2 = FunctionDefHelper::Define(
       // Name
       call_function_name2,
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 790151d92129a4..ba437b5df5e37d 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -193,7 +193,7 @@ class KernelAndDevice : public core::RefCounted {
 
   virtual int num_inputs() const = 0;
   virtual int num_outputs() const = 0;
-  virtual const string& name() const = 0;
+  virtual const std::string& name() const = 0;
 
  protected:
   std::function<void(std::function<void()>)>* get_runner() const;
@@ -262,7 +262,7 @@ class KernelAndDeviceOp final : public KernelAndDevice {
   }
   int num_inputs() const override { return kernel_->num_inputs(); }
   int num_outputs() const override { return kernel_->num_outputs(); }
-  const string& name() const override { return kernel_->name(); }
+  const std::string& name() const override { return kernel_->name(); }
 
  private:
   std::unique_ptr<OpKernel> kernel_;
@@ -286,19 +286,20 @@ class KernelAndDeviceFunc : public KernelAndDevice {
   KernelAndDeviceFunc(
       FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr,
       std::vector<Device*> input_devices,
-      absl::flat_hash_map<string, const std::vector<string>*> composite_devices,
+      absl::flat_hash_map<std::string, const std::vector<std::string>*>
+          composite_devices,
       std::unordered_map<int, DtypeAndPartialTensorShape>
           input_resource_dtypes_and_shapes,
       std::function<void(std::function<void()>)>* runner,
       std::unique_ptr<CollectiveExecutor::Handle> collective_executor,
-      Device* host_cpu_device, const string& name,
+      Device* host_cpu_device, const std::string& name,
       const bool outputs_on_op_device,
       const bool allow_small_function_optimizations,
       const bool allow_control_flow_sync_execution,
       const bool shape_inference_on_tfe_dialect_import,
       const bool int_args_and_retvals_on_device,
       const bool function_runs_at_most_once,
-      std::optional<string> xla_compile_device_type,
+      std::optional<std::string> xla_compile_device_type,
       const bool allow_soft_placement, Rendezvous::Factory rendezvous_factory,
       std::function<int64_t()> get_op_id)
       : KernelAndDevice(flr, runner, std::move(collective_executor),
@@ -366,7 +367,7 @@ class KernelAndDeviceFunc : public KernelAndDevice {
   }
   int num_inputs() const override { return input_dtypes_.size(); }
   int num_outputs() const override { return output_dtypes_.size(); }
-  const string& name() const override { return name_; };
+  const std::string& name() const override { return name_; };
 
  private:
   std::shared_ptr<FunctionLibraryRuntime::Options> PrepareForRun(
@@ -402,7 +403,7 @@ class KernelAndDeviceFunc : public KernelAndDevice {
 
   const bool function_runs_at_most_once_;
 
-  const absl::optional<string> xla_compile_device_type_;
+  const absl::optional<std::string> xla_compile_device_type_;
 
   const bool allow_soft_placement_;
 
@@ -413,13 +414,14 @@ class KernelAndDeviceFunc : public KernelAndDevice {
   // devices.
   std::vector<Device*> input_devices_;
   // Maps from a CompositeDevice name to a list of physical device names.
-  absl::flat_hash_map<string, const std::vector<string>*> composite_devices_;
+  absl::flat_hash_map<std::string, const std::vector<std::string>*>
+      composite_devices_;
   std::unordered_map<int, DtypeAndPartialTensorShape>
       input_resource_dtypes_and_shapes_;
 
   DataTypeVector input_dtypes_;
   DataTypeVector output_dtypes_;
-  string name_;
+  std::string name_;
 
   Rendezvous::Factory rendezvous_factory_;
   std::function<int64_t()> get_op_id_;
diff --git a/tensorflow/core/common_runtime/eager/placement_utils.cc b/tensorflow/core/common_runtime/eager/placement_utils.cc
index e6d547d1e9832b..9b6e0e66a72a64 100644
--- a/tensorflow/core/common_runtime/eager/placement_utils.cc
+++ b/tensorflow/core/common_runtime/eager/placement_utils.cc
@@ -34,19 +34,20 @@ namespace eager {
 // generate and then copy the data instead of just generating the data on the
 // device directly.
 static bool IsPinnableOp(absl::string_view op_name) {
-  static const gtl::FlatSet<string>* unpinnable_ops = new gtl::FlatSet<string>({
-      "RandomUniform",
-      "RandomUniformInt",
-      "RandomStandardNormal",
-      "StatelessRandomUniform",
-      "StatelessRandomUniformInt",
-      "StatelessRandomUniformFullInt",
-      "StatelessRandomNormal",
-  });
+  static const gtl::FlatSet<std::string>* unpinnable_ops =
+      new gtl::FlatSet<std::string>({
+          "RandomUniform",
+          "RandomUniformInt",
+          "RandomStandardNormal",
+          "StatelessRandomUniform",
+          "StatelessRandomUniformInt",
+          "StatelessRandomUniformFullInt",
+          "StatelessRandomNormal",
+      });
 
   // XRT ops refer to per-device handles that are not safe to move between
   // devices.
-  return unpinnable_ops->find(string(op_name)) == unpinnable_ops->end() &&
+  return unpinnable_ops->find(std::string(op_name)) == unpinnable_ops->end() &&
          !absl::StartsWith(op_name, "XRT");
 }
 // Validate if the remote device with the given incarnation is valid in the
@@ -64,12 +65,12 @@ static absl::Status ValidateTensorHandleRemoteDevice(
 
 bool IsColocationExempt(absl::string_view op_name) {
   const auto& exempt_ops = InputColocationExemptionRegistry::Global()->Get();
-  return exempt_ops.find(string(op_name)) != exempt_ops.end();
+  return exempt_ops.find(std::string(op_name)) != exempt_ops.end();
 }
 
 bool IsFunction(absl::string_view op_name) {
   const OpDef* op_def = nullptr;
-  absl::Status s = OpDefForOp(string(op_name), &op_def);
+  absl::Status s = OpDefForOp(std::string(op_name), &op_def);
   if (!s.ok()) {
     if (!absl::IsNotFound(s)) {
       LOG(WARNING) << "Looking up OpDef failed with error: " << s;
diff --git a/tensorflow/core/common_runtime/eager/placement_utils_test.cc b/tensorflow/core/common_runtime/eager/placement_utils_test.cc
index c543b9475a072c..aadec6deab8eb8 100644
--- a/tensorflow/core/common_runtime/eager/placement_utils_test.cc
+++ b/tensorflow/core/common_runtime/eager/placement_utils_test.cc
@@ -94,7 +94,7 @@ struct MaybePinSmallOpsToCpuTestCase {
   std::string test_name;
   DataType dtype;
   TensorShape shape;
-  string op_name;
+  std::string op_name;
   const char* device;
   bool expect;
 };
@@ -152,7 +152,7 @@ INSTANTIATE_TEST_SUITE_P(
 struct MaybePinToResourceDeviceTestCase {
   std::string test_name;
   DataType dtype;
-  string op_name;
+  std::string op_name;
   const char* device;
   bool expect;
 };
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index d4faba6415579f..583a8f15a657f4 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -60,7 +60,7 @@ int64_t GetRemoteDeviceIncarnation(Device* device) {
   return device->attributes().incarnation();
 }
 
-string SafeDeviceDebugString(Device* device) {
+std::string SafeDeviceDebugString(Device* device) {
   if (device == nullptr) {
     return "[]";
   } else {
@@ -150,8 +150,8 @@ void TensorHandle::PackedTensorHandleData::Poison(absl::Status status) {
   is_poisoned_ = status;
 }
 
-string TensorHandle::PackedTensorHandleData::DebugString() const {
-  string debug_str = "PackedTensorHandleData: ";
+std::string TensorHandle::PackedTensorHandleData::DebugString() const {
+  std::string debug_str = "PackedTensorHandleData: ";
   for (const auto* handle : handles_) {
     debug_str.append(
         absl::StrCat(std::visit([](auto& data) { return data.DebugString(); },
@@ -308,7 +308,7 @@ TensorHandle::TensorHandle(Device* d, Device* op_device,
 
 absl::Status TensorHandle::CreatePackedHandle(
     std::vector<TensorHandle*>&& handles, const tensorflow::DataType dtype,
-    const tensorflow::TensorShape& shape, const string& device_name,
+    const tensorflow::TensorShape& shape, const std::string& device_name,
     EagerContext* ctx, TensorHandle** packed_handle) {
   if (handles.empty()) {
     return errors::InvalidArgument("Handles should not be empty.");
@@ -319,7 +319,7 @@ absl::Status TensorHandle::CreatePackedHandle(
     TF_RETURN_IF_ERROR(
         handles.at(0)->GetResourceHandleDtypesAndShapes(&dtypes_and_shapes));
   }
-  std::vector<string> devices;
+  std::vector<std::string> devices;
   devices.reserve(handles.size());
   for (auto* handle : handles) {
     devices.push_back(handle->op_device() ? handle->op_device()->name()
@@ -372,7 +372,7 @@ TensorHandle::TensorHandle(std::vector<TensorHandle*>&& handles, Device* device,
 
 #if !defined(IS_MOBILE_PLATFORM)
 TensorHandle* TensorHandle::CreateUnshapedRemoteHandle(
-    int64_t op_id, int32_t output_num, const string& remote_task,
+    int64_t op_id, int32_t output_num, const std::string& remote_task,
     tensorflow::DataType dtype, Device* d, EagerContext* ctx,
     const bool unknown_device) {
   return new TensorHandle(op_id, output_num, remote_task, dtype, d, ctx,
@@ -380,7 +380,7 @@ TensorHandle* TensorHandle::CreateUnshapedRemoteHandle(
 }
 
 TensorHandle::TensorHandle(int64_t op_id, int32_t output_num,
-                           const string& remote_task,
+                           const std::string& remote_task,
                            tensorflow::DataType dtype, Device* d,
                            EagerContext* ctx, const bool unknown_device)
     : ImmediateExecutionTensorHandle(kEager),
@@ -450,7 +450,7 @@ TensorHandle::HandleType TensorHandle::Type() const {
   }
 }
 
-string TensorHandle::TypeString() const {
+std::string TensorHandle::TypeString() const {
   if (data_.index() == 0) {
     return "LOCAL";
   } else if (data_.index() == 1) {
@@ -713,7 +713,7 @@ absl::Status TensorHandle::AddEmptyLocalMirror(const Device* d) {
 absl::Status TensorHandle::RemoteAddress(const Device* d,
                                          const bool wait_until_ready,
                                          int64_t* op_id,
-                                         int32* output_num) const {
+                                         int32_t* output_num) const {
   DVLOG(3) << "RemoteAddress on TensorHandle: " << this << " device: " << d
            << " " << d->name();
 
@@ -759,7 +759,7 @@ absl::Status TensorHandle::RemoteAddress(const Device* d,
 }
 
 bool TensorHandle::HasRemoteMirror(const Device* d,
-                                   uint64 context_view_id) const {
+                                   uint64_t context_view_id) const {
   DVLOG(3) << "HasRemoteMirror on TensorHandle: " << this << " device: " << d
            << " " << d->name();
 
@@ -777,7 +777,7 @@ bool TensorHandle::HasRemoteMirror(const Device* d,
 }
 
 bool TensorHandle::HasResourceShapeMirror(const Device* d,
-                                          uint64 context_view_id) const {
+                                          uint64_t context_view_id) const {
   DVLOG(3) << "HasResourceShapeMirror on TensorHandle: " << this
            << " device: " << d << " " << d->name();
 
@@ -793,11 +793,9 @@ bool TensorHandle::HasResourceShapeMirror(const Device* d,
   return false;
 }
 
-absl::Status TensorHandle::AddUnshapedRemoteMirror(const Device* d,
-                                                   int64_t op_id,
-                                                   int output_num,
-                                                   const string& remote_task,
-                                                   EagerContext* ctx) {
+absl::Status TensorHandle::AddUnshapedRemoteMirror(
+    const Device* d, int64_t op_id, int output_num,
+    const std::string& remote_task, EagerContext* ctx) {
   DVLOG(3) << "AddUnshapedRemoteMirror on TensorHandle: " << this
            << " device: " << d << " " << d->name() << " op_id: " << op_id
            << " output_num: " << output_num;
@@ -856,14 +854,14 @@ absl::Status TensorHandle::AddResourceShapeMirror(const Device* d,
 
 absl::Status TensorHandle::SetRemoteShape(const TensorShape& shape,
                                           const Device* d,
-                                          uint64 context_view_id) {
+                                          uint64_t context_view_id) {
   return SetRemoteShapeAndDevice(shape, d, context_view_id, /*op_device=*/"");
 }
 
 absl::Status TensorHandle::SetRemoteShapeAndDevice(const TensorShape& shape,
                                                    const Device* d,
-                                                   uint64 context_view_id,
-                                                   string op_device) {
+                                                   uint64_t context_view_id,
+                                                   std::string op_device) {
   DVLOG(3) << "SetRemoteShape on TensorHandle: " << this << " device: " << d
            << " " << d->name();
 
@@ -930,7 +928,7 @@ absl::Status TensorHandle::SetRemoteShapeAndDevice(const TensorShape& shape,
     resource_device_ = dtype == DT_RESOURCE ? device : nullptr;
     resource_remote_device_incarnation_ =
         GetRemoteDeviceIncarnation(resource_device_);
-    string remote_task;
+    std::string remote_task;
     if (!DeviceNameUtils::GetTaskName(device->parsed_name(), &remote_task)) {
       return errors::InvalidArgument(
           "Unable to find remote task corresponding to device ",
@@ -948,7 +946,7 @@ absl::Status TensorHandle::SetRemoteShapeAndDevice(const TensorShape& shape,
 }
 
 void TensorHandle::PoisonRemote(absl::Status status, const Device* d,
-                                uint64 context_view_id) {
+                                uint64_t context_view_id) {
   DVLOG(3) << "PoisonRemote on TensorHandle: " << this << " device: " << d
            << " " << d->name();
 
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h
index ca60815d76ec9e..e2fdb872c317a2 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.h
@@ -66,9 +66,9 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
                tensorflow::DataType dtype, EagerContext* ctx);
 
 #if !defined(IS_MOBILE_PLATFORM)
-  TensorHandle(int64_t op_id, int32_t output_num, const string& remote_task,
-               tensorflow::DataType dtype, Device* device, EagerContext* ctx,
-               bool unknown_device);
+  TensorHandle(int64_t op_id, int32_t output_num,
+               const std::string& remote_task, tensorflow::DataType dtype,
+               Device* device, EagerContext* ctx, bool unknown_device);
   TensorHandle(int64_t op_id, int32_t output_num, tensorflow::DataType dtype,
                Device* device, bool is_ready, EagerContext* ctx);
 #endif  // IS_MOBILE_PLATFORM
@@ -97,7 +97,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   static absl::Status CreatePackedHandle(std::vector<TensorHandle*>&& handles,
                                          tensorflow::DataType dtype,
                                          const tensorflow::TensorShape& shape,
-                                         const string& device_name,
+                                         const std::string& device_name,
                                          EagerContext* ctx,
                                          TensorHandle** packed_handle);
   static absl::Status CreatePackedHandle(std::vector<TensorHandle*>&& handles,
@@ -108,12 +108,10 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   // An unshaped remote handle refers to a tensor on a remote worker. It's not
   // ready until the shape is set. It controls the lifetime of the remote
   // tensor.
-  static TensorHandle* CreateUnshapedRemoteHandle(int64_t op_id,
-                                                  int32_t output_num,
-                                                  const string& remote_task,
-                                                  tensorflow::DataType dtype,
-                                                  Device* d, EagerContext* ctx,
-                                                  bool unknown_device = false);
+  static TensorHandle* CreateUnshapedRemoteHandle(
+      int64_t op_id, int32_t output_num, const std::string& remote_task,
+      tensorflow::DataType dtype, Device* d, EagerContext* ctx,
+      bool unknown_device = false);
   // A lazy remote handle refers to a tensor on a remote worker. The lifetime of
   // the remote tensor is controlled by the remote worker, but not by the lazy
   // remote handle. Lazy handles are normally created on a default function
@@ -189,12 +187,12 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   absl::Status AddLocalMirror(tensorflow::Tensor&& tensor, const Device* d);
 
 #if !defined(IS_MOBILE_PLATFORM)
-  bool HasRemoteMirror(const Device* d, uint64 context_view_id) const;
-  bool HasResourceShapeMirror(const Device* d, uint64 context_view_id) const;
+  bool HasRemoteMirror(const Device* d, uint64_t context_view_id) const;
+  bool HasResourceShapeMirror(const Device* d, uint64_t context_view_id) const;
 
   absl::Status AddUnshapedRemoteMirror(const Device* d, int64_t op_id,
                                        int output_num,
-                                       const string& remote_task,
+                                       const std::string& remote_task,
                                        EagerContext* ctx);
   absl::Status AddResourceShapeMirror(const Device* d, int64_t op_id,
                                       int output_num, EagerContext* ctx);
@@ -203,7 +201,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   // If wait_until_ready is true, block until the remote tensor is ready on the
   // given remote worker.
   absl::Status RemoteAddress(const Device* d, bool wait_until_ready,
-                             int64_t* op_id, int32* output_num) const;
+                             int64_t* op_id, int32_t* output_num) const;
 
   // Called on an async remote tensor once it's shape has been determined. This
   // transitions the tensor handle from a non-ready to a ready state by
@@ -213,12 +211,13 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   // This method or Poison must be called exactly once for remote tensors that
   // were created without a known shape.
   absl::Status SetRemoteShape(const TensorShape& shape, const Device* d,
-                              uint64 context_view_id);
+                              uint64_t context_view_id);
   // If op_device is not empty, reset the devices of a remote tensor which is
   // created without known devices (e.g. function outputs).
   absl::Status SetRemoteShapeAndDevice(const TensorShape& shape,
-                                       const Device* d, uint64 context_view_id,
-                                       string op_device);
+                                       const Device* d,
+                                       uint64_t context_view_id,
+                                       std::string op_device);
 
   // Poisons either this handle or a remote mirror with error `status`.
   // Poisoning means that the handle will become ready and methods trying
@@ -226,7 +225,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   // Exactly one of SetRemoteShape or PoisonRemote methods must be called on a
   // unshaped handle on a remote device.
   void PoisonRemote(absl::Status status, const Device* d,
-                    uint64 context_view_id);
+                    uint64_t context_view_id);
 #endif
 
   // Sets the `tensor` for this async non-ready handle making it ready.
@@ -260,7 +259,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   enum HandleType { LOCAL = 0, PACKED = 1, REMOTE = 2 };
 
   HandleType Type() const;
-  string TypeString() const;
+  std::string TypeString() const;
 
   void SetResourceHandleDtypeAndShape(
       std::vector<DtypeAndPartialTensorShape> dtypes_and_shapes);
@@ -330,9 +329,9 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
   // TODO(yujingzhang): Remove resource_shape_mirrors_ once scalable per-replica
   // variable is ready, since we could get the shape locally without remote copy
   // then.
-  std::unordered_map<string, RemoteTensorHandleData> resource_shape_mirrors_
-      TF_GUARDED_BY(mu_);
-  std::unordered_map<string, RemoteTensorHandleData> remote_mirrors_
+  std::unordered_map<std::string, RemoteTensorHandleData>
+      resource_shape_mirrors_ TF_GUARDED_BY(mu_);
+  std::unordered_map<std::string, RemoteTensorHandleData> remote_mirrors_
       TF_GUARDED_BY(mu_);
 #endif
 
@@ -371,7 +370,7 @@ class TensorHandle : public ImmediateExecutionTensorHandle {
     bool IsReady() const;
     absl::Status WaitReady(const char* caller) const;
     void Poison(absl::Status status);
-    string DebugString() const;
+    std::string DebugString() const;
 
     // Number of packed handles.
     int NumPackedHandles() const;
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_data.cc b/tensorflow/core/common_runtime/eager/tensor_handle_data.cc
index 2212b19db9c683..b0a089874dd744 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle_data.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle_data.cc
@@ -96,7 +96,7 @@ absl::Status LocalTensorHandleData::SetTensor(tensorflow::Tensor&& t) {
   return absl::OkStatus();
 }
 
-string LocalTensorHandleData::DebugString() const {
+std::string LocalTensorHandleData::DebugString() const {
   if (IsReady()) {
     return tensor_.DeviceSafeDebugString();
   } else {
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_data.h b/tensorflow/core/common_runtime/eager/tensor_handle_data.h
index ed58e83a183bfe..73a20425871156 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle_data.h
+++ b/tensorflow/core/common_runtime/eager/tensor_handle_data.h
@@ -60,7 +60,7 @@ class LocalTensorHandleData {
 
   absl::Status SetTensor(tensorflow::Tensor&& t);
 
-  string DebugString() const;
+  std::string DebugString() const;
 
  private:
   tensorflow::Tensor tensor_;
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc
index 209222d33f1185..0bd94f635f0f00 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc
@@ -44,7 +44,7 @@ TEST(TensorHandle_ShapeTest, AsyncShape) {
   EXPECT_TRUE(t.shape().IsSameSize(TensorShape({2, 2})));
   for (int64_t a = 0; a < t.shape().dim_size(0); a++) {
     for (int64_t b = 0; b < t.shape().dim_size(1); b++) {
-      t.matrix<uint16>()(a, b) = uint16(a * b);
+      t.matrix<uint16_t>()(a, b) = uint16_t(a * b);
     }
   }
 
@@ -181,7 +181,7 @@ TEST_F(PackedTensorHandleTest, PackedHandle) {
   handles.push_back(h1);
 
   // Create 2 remote TensorHandles (not ready).
-  const string remote_task = "/job:worker/replica:0/task:1";
+  const std::string remote_task = "/job:worker/replica:0/task:1";
   Device* d2 = ListGPUDevices().at(2);
   TensorHandle* h2 = TensorHandle::CreateUnshapedRemoteHandle(
       /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d2, context());
@@ -439,7 +439,7 @@ TEST_F(RemoteTensorHandleTest, UnknownRemoteDevice) {
   tensorflow::DataType dtype = DT_FLOAT;
   TensorShape shape = {};
 
-  const string remote_task = "/job:worker/replica:0/task:1";
+  const std::string remote_task = "/job:worker/replica:0/task:1";
   Device* d1 = device_mgr.ListDevices().at(1);
   TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle(
       /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context,
@@ -478,7 +478,7 @@ TEST_F(RemoteTensorHandleTest, PoisonRemote) {
   tensorflow::DataType dtype = DT_FLOAT;
   TensorShape shape = {};
 
-  const string remote_task = "/job:worker/replica:0/task:1";
+  const std::string remote_task = "/job:worker/replica:0/task:1";
   Device* d1 = device_mgr.ListDevices().at(1);
   TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle(
       /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context,
@@ -519,7 +519,7 @@ TEST_F(RemoteTensorHandleTest, PoisonRemoteMirror) {
   tensorflow::DataType dtype = DT_FLOAT;
   TensorShape shape = {};
 
-  const string remote_task = "/job:worker/replica:0/task:1";
+  const std::string remote_task = "/job:worker/replica:0/task:1";
   Device* d1 = device_mgr.ListDevices().at(1);
   TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle(
       /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context,
@@ -565,7 +565,7 @@ TEST_F(RemoteTensorHandleTest, SetRemoteTensorHandleShapeTwice) {
   tensorflow::DataType dtype = DT_FLOAT;
   TensorShape shape = {};
 
-  const string remote_task = "/job:worker/replica:0/task:1";
+  const std::string remote_task = "/job:worker/replica:0/task:1";
   Device* d1 = device_mgr.ListDevices().at(1);
   TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle(
       /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context,
@@ -623,7 +623,7 @@ TEST_F(RemoteTensorHandleTest, SetRemoteMirrorShapeTwice) {
   tensorflow::DataType dtype = DT_FLOAT;
   TensorShape shape = {};
 
-  const string remote_task = "/job:worker/replica:0/task:1";
+  const std::string remote_task = "/job:worker/replica:0/task:1";
   Device* d1 = device_mgr.ListDevices().at(1);
   TensorHandle* h = TensorHandle::CreateUnshapedRemoteHandle(
       /*op_id=*/0, /*output_num=*/0, remote_task, dtype, d1, context,

From 70a7d0e1421c8d19841f5dc38d96a20808d17ace Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:23:34 -0800
Subject: [PATCH 401/753] Automated Code Change

PiperOrigin-RevId: 845650270
---
 third_party/xla/xla/tests/buffer_donation_test.cc      | 3 ++-
 third_party/xla/xla/tests/cpu_gpu_fusion_test.cc       | 3 ++-
 third_party/xla/xla/tests/dot_operation_test.cc        | 3 ++-
 third_party/xla/xla/tests/dynamic_ops_test.cc          | 3 ++-
 third_party/xla/xla/tests/hlo_test_base.cc             | 5 +++--
 third_party/xla/xla/tests/local_client_execute_test.cc | 3 ++-
 third_party/xla/xla/tests/local_client_test_base.cc    | 5 +++--
 third_party/xla/xla/tests/local_client_test_base.h     | 4 ++--
 third_party/xla/xla/tests/while_test.cc                | 3 ++-
 9 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/third_party/xla/xla/tests/buffer_donation_test.cc b/third_party/xla/xla/tests/buffer_donation_test.cc
index 870a7b659bcb27..27400cc261c601 100644
--- a/third_party/xla/xla/tests/buffer_donation_test.cc
+++ b/third_party/xla/xla/tests/buffer_donation_test.cc
@@ -105,7 +105,8 @@ class BufferDonationTest : public HloTestBase {
     TF_ASSERT_OK_AND_ASSIGN(auto stream, executor_->CreateStream());
 
     auto& executors = backend_->stream_executors();
-    se::StreamExecutorMemoryAllocator memory_allocator(platform_, executors);
+    stream_executor::StreamExecutorAddressAllocator memory_allocator(platform_,
+                                                                     executors);
     ExecutableRunOptions run_options;
     run_options.set_stream(stream.get());
     run_options.set_allocator(&memory_allocator);
diff --git a/third_party/xla/xla/tests/cpu_gpu_fusion_test.cc b/third_party/xla/xla/tests/cpu_gpu_fusion_test.cc
index 5163b9842d8ba5..2c1b2f7fa52cec 100644
--- a/third_party/xla/xla/tests/cpu_gpu_fusion_test.cc
+++ b/third_party/xla/xla/tests/cpu_gpu_fusion_test.cc
@@ -951,7 +951,8 @@ void BM_ParallelFusion(::testing::benchmark::State& state) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   auto executors = PlatformUtil::GetStreamExecutors(platform).value();
-  se::StreamExecutorMemoryAllocator allocator(platform, executors);
+  stream_executor::StreamExecutorAddressAllocator allocator(platform,
+                                                            executors);
 
   const int64_t intra_op_parallelism_threads = 24;
   xla::LocalClientOptions client_options;
diff --git a/third_party/xla/xla/tests/dot_operation_test.cc b/third_party/xla/xla/tests/dot_operation_test.cc
index 28bfe038cb97f7..5965f66b18c571 100644
--- a/third_party/xla/xla/tests/dot_operation_test.cc
+++ b/third_party/xla/xla/tests/dot_operation_test.cc
@@ -2275,7 +2275,8 @@ ENTRY main {
 void DOT_ReorderContracting(::testing::benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   auto executors = PlatformUtil::GetStreamExecutors(platform).value();
-  se::StreamExecutorMemoryAllocator allocator(platform, executors);
+  stream_executor::StreamExecutorAddressAllocator allocator(platform,
+                                                            executors);
 
   xla::LocalClientOptions client_options;
   client_options.set_platform(platform);
diff --git a/third_party/xla/xla/tests/dynamic_ops_test.cc b/third_party/xla/xla/tests/dynamic_ops_test.cc
index 74930092306a27..9bb74a2d505c7a 100644
--- a/third_party/xla/xla/tests/dynamic_ops_test.cc
+++ b/third_party/xla/xla/tests/dynamic_ops_test.cc
@@ -1003,7 +1003,8 @@ ENTRY main {
 void BM_DynamicSlice(::testing::benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   auto executors = PlatformUtil::GetStreamExecutors(platform).value();
-  se::StreamExecutorMemoryAllocator allocator(platform, executors);
+  stream_executor::StreamExecutorAddressAllocator allocator(platform,
+                                                            executors);
   LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).value();
   auto* transfer_manager = TransferManager::GetForPlatform(platform).value();
   int device_ordinal = client->default_device_ordinal();
diff --git a/third_party/xla/xla/tests/hlo_test_base.cc b/third_party/xla/xla/tests/hlo_test_base.cc
index dce925c25e28d0..9291c2073c3140 100644
--- a/third_party/xla/xla/tests/hlo_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_test_base.cc
@@ -176,8 +176,9 @@ ::testing::AssertionResult HloTestBase::RunAndCompareNoHloPassesFromFile(
 
 se::DeviceAddressAllocator* HloTestBase::GetAllocator() {
   if (allocator_ == nullptr) {
-    allocator_ = std::make_unique<se::StreamExecutorMemoryAllocator>(
-        backend().default_stream_executor());
+    allocator_ =
+        std::make_unique<stream_executor::StreamExecutorAddressAllocator>(
+            backend().default_stream_executor());
   }
   return allocator_.get();
 }
diff --git a/third_party/xla/xla/tests/local_client_execute_test.cc b/third_party/xla/xla/tests/local_client_execute_test.cc
index cb0675c889c052..01279073d4cf48 100644
--- a/third_party/xla/xla/tests/local_client_execute_test.cc
+++ b/third_party/xla/xla/tests/local_client_execute_test.cc
@@ -920,7 +920,8 @@ TEST_F(LocalClientExecuteTest, InfeedOutfeedTest) {
 void BM_LocalClientOverhead(::testing::benchmark::State& state) {
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   auto executors = PlatformUtil::GetStreamExecutors(platform).value();
-  se::StreamExecutorMemoryAllocator allocator(platform, executors);
+  stream_executor::StreamExecutorAddressAllocator allocator(platform,
+                                                            executors);
   LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).value();
   auto* transfer_manager = TransferManager::GetForPlatform(platform).value();
   int device_ordinal = client->default_device_ordinal();
diff --git a/third_party/xla/xla/tests/local_client_test_base.cc b/third_party/xla/xla/tests/local_client_test_base.cc
index 957b24fc150f8e..1db48c62865aed 100644
--- a/third_party/xla/xla/tests/local_client_test_base.cc
+++ b/third_party/xla/xla/tests/local_client_test_base.cc
@@ -66,7 +66,7 @@ absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> TestAllocator::Allocate(
     allocation_count_++;
     device_allocation_count_[device_ordinal]++;
   }
-  return se::StreamExecutorMemoryAllocator::Allocate(
+  return stream_executor::StreamExecutorAddressAllocator::Allocate(
       device_ordinal, size, retry_on_failure, memory_space);
 }
 
@@ -78,7 +78,8 @@ absl::Status TestAllocator::Deallocate(int device_ordinal,
     deallocation_count_++;
     device_deallocation_count_[device_ordinal]++;
   }
-  return se::StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
+  return stream_executor::StreamExecutorAddressAllocator::Deallocate(
+      device_ordinal, mem);
 }
 
 int64_t TestAllocator::allocation_count() const {
diff --git a/third_party/xla/xla/tests/local_client_test_base.h b/third_party/xla/xla/tests/local_client_test_base.h
index 3afeae8c003d8c..0c813dac236408 100644
--- a/third_party/xla/xla/tests/local_client_test_base.h
+++ b/third_party/xla/xla/tests/local_client_test_base.h
@@ -47,10 +47,10 @@ limitations under the License.
 
 namespace xla {
 
-class TestAllocator : public se::StreamExecutorMemoryAllocator {
+class TestAllocator : public stream_executor::StreamExecutorAddressAllocator {
  public:
   explicit TestAllocator(se::Platform* platform)
-      : se::StreamExecutorMemoryAllocator(
+      : stream_executor::StreamExecutorAddressAllocator(
             platform, PlatformUtil::GetStreamExecutors(platform).value()) {}
 
   absl::StatusOr<se::ScopedDeviceAddress<uint8_t>> Allocate(
diff --git a/third_party/xla/xla/tests/while_test.cc b/third_party/xla/xla/tests/while_test.cc
index dbbf0d80bcd9f6..ebe82ad75740f4 100644
--- a/third_party/xla/xla/tests/while_test.cc
+++ b/third_party/xla/xla/tests/while_test.cc
@@ -1311,7 +1311,8 @@ void BM_WhileLoop(::testing::benchmark::State& state) {
 
   se::Platform* platform = PlatformUtil::GetDefaultPlatform().value();
   auto executors = PlatformUtil::GetStreamExecutors(platform).value();
-  se::StreamExecutorMemoryAllocator allocator(platform, executors);
+  stream_executor::StreamExecutorAddressAllocator allocator(platform,
+                                                            executors);
   LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).value();
 
   const int64_t seq_len = 100;

From e9333c03e800c9ff451f01e1743b8fa0aa29c9bc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:24:24 -0800
Subject: [PATCH 402/753] Automated Code Change

PiperOrigin-RevId: 845650533
---
 .../xla/xla/stream_executor/tpu/c_api_conversions.cc   | 10 ++++++----
 .../xla/xla/stream_executor/tpu/c_api_conversions.h    |  3 ++-
 .../xla/xla/stream_executor/tpu/noncopyable_buffer.h   |  6 +++---
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
index b4aefe96cc6995..43ce77e155b57f 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.cc
@@ -183,9 +183,10 @@ xla::MaybeOwningDeviceAddress FromC(
     SE_MaybeOwningDeviceAddress* se_mem,
     stream_executor::DeviceAddressAllocator* allocator) {
   if (se_mem->owned) {
-    return xla::MaybeOwningDeviceAddress(stream_executor::OwningDeviceAddress(
-        ApiConverter::FromC(se_mem->memory), se_mem->device_ordinal,
-        allocator));
+    return xla::MaybeOwningDeviceAddress(
+        stream_executor::ScopedDeviceAddress<uint8_t>(
+            ApiConverter::FromC(se_mem->memory), se_mem->device_ordinal,
+            allocator));
   } else {
     return xla::MaybeOwningDeviceAddress(ApiConverter::FromC(se_mem->memory));
   }
@@ -243,7 +244,8 @@ stream_executor::DeviceAddressAllocator* FromC(
       c_allocator.ctx);
 }
 
-SE_MaybeOwningDeviceAddress ToC(stream_executor::OwningDeviceAddress* mem) {
+SE_MaybeOwningDeviceAddress ToC(
+    stream_executor::ScopedDeviceAddress<uint8_t>* mem) {
   SE_MaybeOwningDeviceAddress se_mem;
   se_mem.device_ordinal = mem->device_ordinal();
   se_mem.memory = ApiConverter::ToC(mem->Release());
diff --git a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
index cdfcab80fabb69..1ce5c3121f4cca 100644
--- a/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
+++ b/third_party/xla/xla/stream_executor/tpu/c_api_conversions.h
@@ -128,7 +128,8 @@ stream_executor::DeviceAddressAllocator* FromC(
     const SE_DeviceAddressAllocator& c_allocator);
 
 // OwningDeviceAddress
-SE_MaybeOwningDeviceAddress ToC(stream_executor::OwningDeviceAddress* mem);
+SE_MaybeOwningDeviceAddress ToC(
+    stream_executor::ScopedDeviceAddress<uint8_t>* mem);
 // mem.HasOwnership() may be true if the buffer is aliased and shouldn't be
 // released. 'aliased' should be true in this case. 'aliased' has no effect if
 // 'mem' is unowned.
diff --git a/third_party/xla/xla/stream_executor/tpu/noncopyable_buffer.h b/third_party/xla/xla/stream_executor/tpu/noncopyable_buffer.h
index 8fb31232886372..4ae63ada43c19c 100644
--- a/third_party/xla/xla/stream_executor/tpu/noncopyable_buffer.h
+++ b/third_party/xla/xla/stream_executor/tpu/noncopyable_buffer.h
@@ -141,9 +141,9 @@ class NoncopyableBuffer {
   }
 
   static OwnedDataPtr AlignedAlloc(size_t size, size_t alignment) {
-    return OwnedDataPtr(
-        static_cast<uint8_t*>(tsl::port::AlignedMalloc(size, alignment)),
-        tsl::port::AlignedFree);
+    return OwnedDataPtr(static_cast<uint8_t*>(tsl::port::AlignedMalloc(
+                            size, static_cast<std::align_val_t>(alignment))),
+                        tsl::port::AlignedFree);
   }
 
  private:

From 42652a45990e3c5be8a8a56936dd0b265b7ff3d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 01:25:34 -0800
Subject: [PATCH 403/753] Automated Code Change

PiperOrigin-RevId: 845650865
---
 .../backends/interpreter/executable_base.cc    |  2 +-
 .../xla/xla/backends/interpreter/executor.cc   | 12 ++++++------
 .../xla/xla/backends/interpreter/executor.h    | 18 +++++++++---------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/backends/interpreter/executable_base.cc b/third_party/xla/xla/backends/interpreter/executable_base.cc
index eb7fa5d4c07832..8299384cf6838b 100644
--- a/third_party/xla/xla/backends/interpreter/executable_base.cc
+++ b/third_party/xla/xla/backends/interpreter/executable_base.cc
@@ -79,7 +79,7 @@ absl::StatusOr<ExecutionOutput> InterpreterExecutableBase::ExecuteAsyncOnStream(
     auto in_it = buffers.begin();
     auto out_it = argument_buffers.back().buffers().begin();
     for (; in_it != buffers.end(); ++in_it, ++out_it) {
-      out_it->second = in_it->second.AsDeviceMemoryBase();
+      out_it->second = in_it->second.AsDeviceAddress();
     }
   }
 
diff --git a/third_party/xla/xla/backends/interpreter/executor.cc b/third_party/xla/xla/backends/interpreter/executor.cc
index 93ac837ab069fd..59857bc1190f47 100644
--- a/third_party/xla/xla/backends/interpreter/executor.cc
+++ b/third_party/xla/xla/backends/interpreter/executor.cc
@@ -37,23 +37,23 @@ host::HostStream *AsExecutorStream(Stream *stream) {
   return dynamic_cast<host::HostStream *>(stream);
 }
 
-DeviceMemoryBase XlaInterpreterExecutor::Allocate(uint64_t size,
-                                                  int64_t memory_space) {
-  return DeviceMemoryBase(new char[size], size);
+DeviceAddressBase XlaInterpreterExecutor::Allocate(uint64_t size,
+                                                   int64_t memory_space) {
+  return DeviceAddressBase(new char[size], size);
 }
 
-void XlaInterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
+void XlaInterpreterExecutor::Deallocate(DeviceAddressBase* mem) {
   delete[] static_cast<char *>(mem->opaque());
 }
 
 absl::Status XlaInterpreterExecutor::SynchronousMemcpy(
-    DeviceMemoryBase *dev_dst, const void *host_src, uint64_t size) {
+    DeviceAddressBase* dev_dst, const void* host_src, uint64_t size) {
   memcpy(dev_dst->opaque(), host_src, size);
   return absl::OkStatus();
 }
 
 absl::Status XlaInterpreterExecutor::SynchronousMemcpy(
-    void *host_dst, const DeviceMemoryBase &dev_src, uint64_t size) {
+    void* host_dst, const DeviceAddressBase& dev_src, uint64_t size) {
   memcpy(host_dst, dev_src.opaque(), size);
   return absl::OkStatus();
 }
diff --git a/third_party/xla/xla/backends/interpreter/executor.h b/third_party/xla/xla/backends/interpreter/executor.h
index 0171dc692be4c8..0ab41357230986 100644
--- a/third_party/xla/xla/backends/interpreter/executor.h
+++ b/third_party/xla/xla/backends/interpreter/executor.h
@@ -60,14 +60,14 @@ class InterpreterStream : public host::HostStream {
     return absl::UnimplementedError("Not implemented.");
   }
 
-  absl::Status Memcpy(void *host_dst, const DeviceMemoryBase &gpu_src,
+  absl::Status Memcpy(void* host_dst, const DeviceAddressBase& gpu_src,
                       uint64_t size) override {
     void *src_mem = gpu_src.opaque();
     memcpy(host_dst, src_mem, size);
     return absl::OkStatus();
   }
 
-  absl::Status Memcpy(DeviceMemoryBase *gpu_dst, const void *host_src,
+  absl::Status Memcpy(DeviceAddressBase* gpu_dst, const void* host_src,
                       uint64_t size) override {
     void *dst_mem = gpu_dst->opaque();
     memcpy(dst_mem, host_src, size);
@@ -84,8 +84,8 @@ class XlaInterpreterExecutor : public StreamExecutorCommon {
 
   int device_ordinal() const override { return device_ordinal_; };
 
-  DeviceMemoryBase Allocate(uint64_t size, int64_t memory_space) override;
-  void Deallocate(DeviceMemoryBase *mem) override;
+  DeviceAddressBase Allocate(uint64_t size, int64_t memory_space) override;
+  void Deallocate(DeviceAddressBase* mem) override;
 
   absl::StatusOr<std::unique_ptr<MemoryAllocation>> HostMemoryAllocate(
       uint64_t size) override {
@@ -97,15 +97,15 @@ class XlaInterpreterExecutor : public StreamExecutorCommon {
 
   // No "synchronize all activity" implemented for this platform at the moment.
   bool SynchronizeAllActivity() override { return true; }
-  absl::Status SynchronousMemZero(DeviceMemoryBase *location,
+  absl::Status SynchronousMemZero(DeviceAddressBase* location,
                                   uint64_t size) override {
     return absl::InternalError("Interpreter can not memzero");
   }
 
-  absl::Status SynchronousMemcpy(DeviceMemoryBase *dev_dst,
-                                 const void *host_src, uint64_t size) override;
-  absl::Status SynchronousMemcpy(void *host_dst,
-                                 const DeviceMemoryBase &dev_src,
+  absl::Status SynchronousMemcpy(DeviceAddressBase* dev_dst,
+                                 const void* host_src, uint64_t size) override;
+  absl::Status SynchronousMemcpy(void* host_dst,
+                                 const DeviceAddressBase& dev_src,
                                  uint64_t size) override;
 
   void DeallocateStream(Stream *stream) override {}

From d0cf19c0fa1f9044d0bd5b5fccdf875c38a21753 Mon Sep 17 00:00:00 2001
From: Shaogang Wang <shawnw@nvidia.com>
Date: Wed, 17 Dec 2025 01:52:48 -0800
Subject: [PATCH 404/753] PR #34734: [XLA:GPU] make DYNAMIC_SLICE_COPY_FUSION
 command default lowered to cuda-graph
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34734

📝 Summary of Changes

* Added `DebugOptions::DYNAMIC_SLICE_COPY_FUSION` to the list of enabled GPU command buffers in the default debug options.

🚀 Kind of Contribution
⚡️ Performance Improvement

🧪 Unit Tests:
change the default setting, unittest has already been added.

Copybara import of the project:

--
bdca8aae0307ff41188b6481b878e42b9ec90612 by Shawn Wang <shawnw@nvidia.com>:

make DYNAMIC_SLICE_COPY_FUSION command default lowered to cuda-graph

Merging this change closes #34734

PiperOrigin-RevId: 845659522
---
 third_party/xla/xla/debug_options_flags.cc     |  2 ++
 .../xla/xla/debug_options_parsers_test.cc      | 18 +++++++++++-------
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index e83d912282ac05..a8cbbe96789a02 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -244,6 +244,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::CUSTOM_CALL);
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::CUDNN);
   opts.add_xla_gpu_enable_command_buffer(DebugOptions::DYNAMIC_SLICE_FUSION);
+  opts.add_xla_gpu_enable_command_buffer(
+      DebugOptions::DYNAMIC_SLICE_COPY_FUSION);
   opts.set_xla_gpu_graph_min_graph_size(5);
   opts.set_xla_gpu_command_buffer_scheduling_mode(DebugOptions::LHS);
   opts.set_xla_gpu_command_buffer_unroll_loops(false);
diff --git a/third_party/xla/xla/debug_options_parsers_test.cc b/third_party/xla/xla/debug_options_parsers_test.cc
index 1fea535da5e44b..7bb39cc0915e9c 100644
--- a/third_party/xla/xla/debug_options_parsers_test.cc
+++ b/third_party/xla/xla/debug_options_parsers_test.cc
@@ -393,12 +393,13 @@ TEST(ParseRepeatedEnumFlagsTest, CommandBufferCmdType) {
 
   // Check that the default setting has 6 types.
   const auto& enabled_types = debug_options.xla_gpu_enable_command_buffer();
-  ASSERT_EQ(enabled_types.size(), 6);
+  ASSERT_EQ(enabled_types.size(), 7);
   ASSERT_THAT(
       enabled_types,
       ElementsAre(DebugOptions::FUSION, DebugOptions::CUBLAS,
                   DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION));
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION,
+                  DebugOptions::DYNAMIC_SLICE_COPY_FUSION));
 
   // Initialize the flag objects.
   std::vector<tsl::Flag> flag_objects;
@@ -407,29 +408,32 @@ TEST(ParseRepeatedEnumFlagsTest, CommandBufferCmdType) {
   // Removing options from the existing setting.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=-fusion,-cublas");
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
-  EXPECT_EQ(enabled_types.size(), 4);
+  EXPECT_EQ(enabled_types.size(), 5);
   EXPECT_THAT(
       enabled_types,
       ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION));
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION,
+                  DebugOptions::DYNAMIC_SLICE_COPY_FUSION));
 
   // Removing an option that isn't there and adding a duplicate.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=+cublaslt,-fusion");
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
-  EXPECT_EQ(enabled_types.size(), 4);
+  EXPECT_EQ(enabled_types.size(), 5);
   EXPECT_THAT(
       enabled_types,
       ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
-                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION));
+                  DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION,
+                  DebugOptions::DYNAMIC_SLICE_COPY_FUSION));
 
   // Adding an option.
   SetXlaFlagsEnvVar("--xla_gpu_enable_command_buffer=+cublas");
   ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
-  EXPECT_EQ(enabled_types.size(), 5);
+  EXPECT_EQ(enabled_types.size(), 6);
   EXPECT_THAT(
       enabled_types,
       ElementsAre(DebugOptions::CUBLASLT, DebugOptions::CUSTOM_CALL,
                   DebugOptions::CUDNN, DebugOptions::DYNAMIC_SLICE_FUSION,
+                  DebugOptions::DYNAMIC_SLICE_COPY_FUSION,
                   DebugOptions::CUBLAS));
 
   // Overwriting the default setting.

From 3ffe7a698b2a2f078ec0a2f6a91f08e8424a07c8 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 17 Dec 2025 01:58:40 -0800
Subject: [PATCH 405/753] [XLA:GPU] Increment autotuning cache key version and
 adapt autotune db.

A recent change has affected the autotuning key, so we need to update the
version.

PiperOrigin-RevId: 845661452
---
 .../xla/service/gpu/autotuning/autotune_cache_key.h    |  2 +-
 .../gpu/gpu_compiler_test_autotune_db.textproto        | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h
index 83a360383f2d60..1c5cf3a2298d40 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h
+++ b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key.h
@@ -32,7 +32,7 @@ class AutotuneCacheKey {
   // Tie a version to the cache key in order to invalidate the cache when
   // necessary. This should be incremented on triton upgrades or any other
   // changes that may affect the autotuning results.
-  static constexpr int kCurrentVersion = 20;
+  static constexpr int kCurrentVersion = 21;
 
   AutotuneCacheKey(const se::DeviceDescription& device_description,
                    const HloInstruction& instruction,
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto b/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
index c44b9350d9dc19..67d67b9594af5f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
@@ -27,7 +27,7 @@ results {
 }
 results {
   device: "CUDA: 8.0, Cores: 108, GPU clock: 1.41 GHz, Memory bandwidth: 1555 GB/s, L2 cache: 40 MB, DNN version: 1.2.3"
-  hlo: "(bf16[128,1024,1024]{2,1,0}, s8[4194304]{0}) custom-call(bf16[128,1024,1024]{2,1,0}, bf16[128,1024,1024]{2,1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":1,\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[\"0\"],\"lhs_contracting_dimensions\":[\"2\"],\"rhs_batch_dimensions\":[\"0\"],\"rhs_contracting_dimensions\":[\"1\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"1048576\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"1048576\"},\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[]}"
+  hlo: "(bf16[128,1024,1024]{2,1,0}, s8[33554432]{0}) custom-call(bf16[128,1024,1024]{2,1,0}, bf16[128,1024,1024]{2,1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":1,\"autotune_workspace_size\":\"0\",\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[\"0\"],\"lhs_contracting_dimensions\":[\"2\"],\"rhs_batch_dimensions\":[\"0\"],\"rhs_contracting_dimensions\":[\"1\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"1048576\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"1048576\"},\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[]}"
   result {
     run_time {
       nanos: 1
@@ -51,7 +51,7 @@ results {
 }
 results {
   device: "CUDA: 8.0, Cores: 108, GPU clock: 1.41 GHz, Memory bandwidth: 2039 GB/s, L2 cache: 40 MB, DNN version: 1.2.3"
-  hlo: "(bf16[128,1024,1024]{2,1,0}, s8[4194304]{0}) custom-call(bf16[128,1024,1024]{2,1,0}, bf16[128,1024,1024]{2,1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":1,\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[\"0\"],\"lhs_contracting_dimensions\":[\"2\"],\"rhs_batch_dimensions\":[\"0\"],\"rhs_contracting_dimensions\":[\"1\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"1048576\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"1048576\"},\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[]}"
+  hlo: "(bf16[128,1024,1024]{2,1,0}, s8[33554432]{0}) custom-call(bf16[128,1024,1024]{2,1,0}, bf16[128,1024,1024]{2,1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":1,\"autotune_workspace_size\":\"0\",\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[\"0\"],\"lhs_contracting_dimensions\":[\"2\"],\"rhs_batch_dimensions\":[\"0\"],\"rhs_contracting_dimensions\":[\"1\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"1048576\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"1048576\"},\"operation_queue_id\":\"0\",\"wait_on_operation_queues\":[]}"
   result {
     run_time {
       nanos: 1
@@ -75,7 +75,7 @@ results {
 }
 results {
   device: "CUDA: 9.0, Cores: 132, GPU clock: 1.98 GHz, Memory bandwidth: 3352 GB/s, L2 cache: 50 MB, DNN version: 1.2.3"
-  hlo: "(bf16[128,1024,1024]{2,1,0}, s8[33554432]{0}) custom-call(bf16[128,1024,1024]{2,1,0}, bf16[128,1024,1024]{2,1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"device_type\":\"DEVICE_TYPE_INVALID\",\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":1,\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[\"0\"],\"lhs_contracting_dimensions\":[\"2\"],\"rhs_batch_dimensions\":[\"0\"],\"rhs_contracting_dimensions\":[\"1\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"1048576\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"1048576\"},\"operation_queue_id\":\"0\",\"reification_cost\":[],\"wait_on_operation_queues\":[]}"
+  hlo: "(bf16[128,1024,1024]{2,1,0}, s8[33554432]{0}) custom-call(bf16[128,1024,1024]{2,1,0}, bf16[128,1024,1024]{2,1,0}), custom_call_target=\"__cublas$gemm\", backend_config={\"device_type\":\"DEVICE_TYPE_INVALID\",\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":1,\"autotune_workspace_size\":\"0\",\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[\"0\"],\"lhs_contracting_dimensions\":[\"2\"],\"rhs_batch_dimensions\":[\"0\"],\"rhs_contracting_dimensions\":[\"1\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"1048576\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"1048576\"},\"operation_queue_id\":\"0\",\"reification_cost\":[],\"wait_on_operation_queues\":[]}"
   result {
     gemm {
       algorithm: -1
@@ -87,7 +87,7 @@ results {
 }
 results {
   device: "CUDA: 9.0, Cores: 132, GPU clock: 1.98 GHz, Memory bandwidth: 3352 GB/s, L2 cache: 50 MB, DNN version: 1.2.3"
-  hlo: "(bf16[12288,16384]{1,0}, s8[33554432]{0}) custom-call(f8e4m3fn[12288,4096]{1,0}, f8e4m3fn[4096,16384]{0,1}, f32[], f32[]), custom_call_target=\"__cublas$lt$matmul$f8\", backend_config={\"device_type\":\"DEVICE_TYPE_INVALID\",\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":0.95703125,\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[],\"lhs_contracting_dimensions\":[\"1\"],\"rhs_batch_dimensions\":[],\"rhs_contracting_dimensions\":[\"0\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"50331648\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"67108864\"},\"operation_queue_id\":\"0\",\"reification_cost\":[],\"wait_on_operation_queues\":[]}"
+  hlo: "(bf16[12288,16384]{1,0}, s8[33554432]{0}) custom-call(f8e4m3fn[12288,4096]{1,0}, f8e4m3fn[4096,16384]{0,1}, f32[], f32[]), custom_call_target=\"__cublas$lt$matmul$f8\", backend_config={\"device_type\":\"DEVICE_TYPE_INVALID\",\"force_earliest_schedule\":false,\"gemm_backend_config\":{\"alpha_imag\":0,\"alpha_real\":0.95703125,\"autotune_workspace_size\":\"0\",\"beta\":0,\"damax_output\":false,\"dot_dimension_numbers\":{\"lhs_batch_dimensions\":[],\"lhs_contracting_dimensions\":[\"1\"],\"rhs_batch_dimensions\":[],\"rhs_contracting_dimensions\":[\"0\"]},\"epilogue\":\"DEFAULT\",\"grad_x\":false,\"grad_y\":false,\"lhs_stride\":\"50331648\",\"precision_config\":{\"algorithm\":\"ALG_UNSET\",\"operand_precision\":[\"DEFAULT\",\"DEFAULT\"]},\"rhs_stride\":\"67108864\"},\"operation_queue_id\":\"0\",\"reification_cost\":[],\"wait_on_operation_queues\":[]}"
   result {
     gemm {
     }
@@ -192,4 +192,4 @@ results {
       }
     }
   }
-}
\ No newline at end of file
+}

From 8b4daac3472992cbb34e2bf97a56b0a1f06ecc75 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:14:40 -0800
Subject: [PATCH 406/753] Automated Code Change

PiperOrigin-RevId: 845666754
---
 third_party/xla/xla/service/hlo_runner_pjrt.cc    | 3 ++-
 third_party/xla/xla/service/shaped_buffer_test.cc | 3 ++-
 third_party/xla/xla/service/transfer_manager.cc   | 4 ++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index c85e137fe88825..5f326727735a4a 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -827,7 +827,8 @@ absl::StatusOr<Literal> HloRunnerPjRt::TransferLiteralFromDevice(
       on_device_shape.IsTuple() && on_device_shape.tuple_shapes().size() == 0) {
     return LiteralUtil::MakeTuple({});
   }
-  TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal, buffer.ToLiteralSync());
+  TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal,
+                      buffer.ToLiteral().Await());
   return std::move(*literal);
 }
 
diff --git a/third_party/xla/xla/service/shaped_buffer_test.cc b/third_party/xla/xla/service/shaped_buffer_test.cc
index 4d3828a55eb56e..63cdc7f9abc3d3 100644
--- a/third_party/xla/xla/service/shaped_buffer_test.cc
+++ b/third_party/xla/xla/service/shaped_buffer_test.cc
@@ -42,7 +42,8 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
                           xla::PlatformUtil::GetDefaultPlatform());
   TF_ASSERT_OK_AND_ASSIGN(auto executors,
                           xla::PlatformUtil::GetStreamExecutors(platform));
-  xla::se::StreamExecutorMemoryAllocator allocator(platform, executors);
+  stream_executor::StreamExecutorAddressAllocator allocator(platform,
+                                                            executors);
   const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {});
   const int kDeviceOrdinal = 0;
   auto scoped_buffer = std::make_unique<xla::ScopedShapedBuffer>(
diff --git a/third_party/xla/xla/service/transfer_manager.cc b/third_party/xla/xla/service/transfer_manager.cc
index 4fbdcdc58ce116..7c0fc92a23bc77 100644
--- a/third_party/xla/xla/service/transfer_manager.cc
+++ b/third_party/xla/xla/service/transfer_manager.cc
@@ -295,7 +295,7 @@ absl::Status TransferManager::WriteRootTupleIndexTable(
     return absl::OkStatus();
   }
   se::DeviceAddressBase device_memory =
-      buffer_tree.element({}).AsDeviceMemoryBase();
+      buffer_tree.element({}).AsDeviceAddress();
   TF_RET_CHECK(GetByteSizeRequirement(buffer_tree.shape()) ==
                device_memory.size());
 
@@ -303,7 +303,7 @@ absl::Status TransferManager::WriteRootTupleIndexTable(
   elements.reserve(ShapeUtil::TupleElementCount(buffer_tree.shape()));
   for (int64_t i = 0; i < ShapeUtil::TupleElementCount(buffer_tree.shape());
        ++i) {
-    elements.push_back(buffer_tree.element({i}).AsDeviceMemoryBase());
+    elements.push_back(buffer_tree.element({i}).AsDeviceAddress());
   }
   return WriteSingleTupleIndexTable(stream, elements, buffer_tree.shape(),
                                     &device_memory);

From 6c69aa0e0de1498ce1510e7e7a99082fc7f4bde3 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 17 Dec 2025 02:19:15 -0800
Subject: [PATCH 407/753] PR #35269: Bump actions/upload-artifact from 5.0.0 to
 6.0.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35269

Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 5.0.0 to 6.0.0.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a href="https://github.com/actions/upload-artifact/releases">actions/upload-artifact's releases</a>.</em></p>
<blockquote>
<h2>v6.0.0</h2>
<h2>v6 - What's new</h2>
<blockquote>
<p>[!IMPORTANT]
actions/upload-artifact@v6 now runs on Node.js 24 (<code>runs.using: node24</code>) and requires a minimum Actions Runner version of 2.327.1. If you are using self-hosted runners, ensure they are updated before upgrading.</p>
</blockquote>
<h3>Node.js 24</h3>
<p>This release updates the runtime to Node.js 24. v5 had preliminary support for Node.js 24, however this action was by default still running on Node.js 20. Now this action by default will run on Node.js 24.</p>
<h2>What's Changed</h2>
<ul>
<li>Upload Artifact Node 24 support by <a href="https://github.com/salmanmkc"><code>@​salmanmkc</code></a> in <a href="https://redirect.github.com/actions/upload-artifact/pull/719">actions/upload-artifact#719</a></li>
<li>fix: update <code>@​actions/artifact</code> for Node.js 24 punycode deprecation by <a href="https://github.com/salmanmkc"><code>@​salmanmkc</code></a> in <a href="https://redirect.github.com/actions/upload-artifact/pull/744">actions/upload-artifact#744</a></li>
<li>prepare release v6.0.0 for Node.js 24 support by <a href="https://github.com/salmanmkc"><code>@​salmanmkc</code></a> in <a href="https://redirect.github.com/actions/upload-artifact/pull/745">actions/upload-artifact#745</a></li>
</ul>
<p><strong>Full Changelog</strong>: <a href="https://github.com/actions/upload-artifact/compare/v5.0.0...v6.0.0">https://github.com/actions/upload-artifact/compare/v5.0.0...v6.0.0</a></p>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="https://github.com/actions/upload-artifact/commit/b7c566a772e6b6bfb58ed0dc250532a479d7789f"><code>b7c566a</code></a> Merge pull request <a href="https://redirect.github.com/actions/upload-artifact/issues/745">#745</a> from actions/upload-artifact-v6-release</li>
<li><a href="https://github.com/actions/upload-artifact/commit/e516bc8500aaf3d07d591fcd4ae6ab5f9c391d5b"><code>e516bc8</code></a> docs: correct description of Node.js 24 support in README</li>
<li><a href="https://github.com/actions/upload-artifact/commit/ddc45ed9bca9b38dbd643978d88e3981cdc91415"><code>ddc45ed</code></a> docs: update README to correct action name for Node.js 24 support</li>
<li><a href="https://github.com/actions/upload-artifact/commit/615b319bd27bb32c3d64dca6b6ed6974d5fbe653"><code>615b319</code></a> chore: release v6.0.0 for Node.js 24 support</li>
<li><a href="https://github.com/actions/upload-artifact/commit/017748b48f8610ca8e6af1222f4a618e84a9c703"><code>017748b</code></a> Merge pull request <a href="https://redirect.github.com/actions/upload-artifact/issues/744">#744</a> from actions/fix-storage-blob</li>
<li><a href="https://github.com/actions/upload-artifact/commit/38d4c7997f5510fcc41fc4aae2a6b97becdbe7fc"><code>38d4c79</code></a> chore: rebuild dist</li>
<li><a href="https://github.com/actions/upload-artifact/commit/7d27270e0cfd253e666c44abac0711308d2d042f"><code>7d27270</code></a> chore: add missing license cache files for <code>@​actions/core</code>, <code>@​actions/io</code>, and mi...</li>
<li><a href="https://github.com/actions/upload-artifact/commit/5f643d3c9475505ccaf26d686ffbfb71a8387261"><code>5f643d3</code></a> chore: update license files for <code>@​actions/artifact</code><a href="https://github.com/5"><code>@​5</code></a>.0.1 dependencies</li>
<li><a href="https://github.com/actions/upload-artifact/commit/1df1684032c88614064493e1a0478fcb3583e1d0"><code>1df1684</code></a> chore: update package-lock.json with <code>@​actions/artifact</code><a href="https://github.com/5"><code>@​5</code></a>.0.1</li>
<li><a href="https://github.com/actions/upload-artifact/commit/b5b1a918401ee270935b6b1d857ae66c85f3be6f"><code>b5b1a91</code></a> fix: update <code>@​actions/artifact</code> to ^5.0.0 for Node.js 24 punycode fix</li>
<li>Additional commits viewable in <a href="https://github.com/actions/upload-artifact/compare/v5...v6">compare view</a></li>
</ul>
</details>
<br />

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/upload-artifact&package-manager=github_actions&previous-version=5.0.0&new-version=6.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

</details>
Copybara import of the project:

--
273f78ea9a3ef2ff528533d2fc613ecc149e4c47 by dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>:

Bump actions/upload-artifact from 5.0.0 to 6.0.0

Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 5.0.0 to 6.0.0.
- [Release notes](https://github.com/actions/upload-artifact/releases)
- [Commits](https://github.com/actions/upload-artifact/compare/v5...v6)

---
updated-dependencies:
- dependency-name: actions/upload-artifact
  dependency-version: 6.0.0
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>

Merging this change closes #35269

PiperOrigin-RevId: 845668175
---
 third_party/xla/.github/workflows/benchmark_postsubmit.yml   | 2 +-
 third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml | 2 +-
 third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml | 2 +-
 third_party/xla/.github/workflows/nightly_benchmarks.yml     | 2 +-
 third_party/xla/.github/workflows/postsubmit_benchmark.yml   | 2 +-
 third_party/xla/.github/workflows/presubmit_benchmark.yml    | 2 +-
 third_party/xla/.github/workflows/scorecards-analysis.yml    | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/.github/workflows/benchmark_postsubmit.yml b/third_party/xla/.github/workflows/benchmark_postsubmit.yml
index c7ecb2bbdb6075..014f71761c2527 100644
--- a/third_party/xla/.github/workflows/benchmark_postsubmit.yml
+++ b/third_party/xla/.github/workflows/benchmark_postsubmit.yml
@@ -235,7 +235,7 @@ jobs:
           gsutil cp "$OUTPUT_FILE" "$GCS_BUCKET/$GCS_OBJECT_NAME"
 
       - name: Upload XSpace artifacts
-        uses: actions/upload-artifact@v5.0.0
+        uses: actions/upload-artifact@v6.0.0
         with:
           name: xspace-artifacts-${{ matrix.job_info.pool }}-${{ matrix.job_info.platform }}
           path: ${{ env.XSPACE_FILE }}
\ No newline at end of file
diff --git a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
index ae0f471ecfcbfb..060249354d15d9 100644
--- a/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
+++ b/third_party/xla/.github/workflows/cpu_benchmarks_nightly.yml
@@ -207,7 +207,7 @@ jobs:
           gsutil cp "$OUTPUT_DIR/$FILENAME_GEMMA3" "$GCS_BUCKET/$GEMMA3_GCS_OBJECT_NAME"
 
       - name: Upload XSpace artifacts
-        uses: actions/upload-artifact@v5.0.0
+        uses: actions/upload-artifact@v6.0.0
         with:
           name: cpu-xla-benchmarks-xspace-${{ matrix.job_info.pool }}
           path: ${{ github.workspace }}/${{ matrix.job_info.output_dir }}/*_xspace.pb
diff --git a/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml b/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml
index 55ffd9ad5c1efe..8ab4fb1d5dba14 100644
--- a/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml
+++ b/third_party/xla/.github/workflows/gpu_benchmarks_nightly.yml
@@ -198,7 +198,7 @@ jobs:
           upload_to_gcs "$GEMMA3_SAMPLE_LOOP_BASE_NAME"
 
       - name: Upload XSpace artifacts
-        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
         with:
           name: gpu-xla-benchmarks-xspace-${{ matrix.job_info.os }}
           path: ${{ github.workspace }}/output/*_xspace.pb
diff --git a/third_party/xla/.github/workflows/nightly_benchmarks.yml b/third_party/xla/.github/workflows/nightly_benchmarks.yml
index 5f33b283f13a52..e65fd69daf6944 100644
--- a/third_party/xla/.github/workflows/nightly_benchmarks.yml
+++ b/third_party/xla/.github/workflows/nightly_benchmarks.yml
@@ -182,7 +182,7 @@ jobs:
           gsutil cp "$RESULTS_JSON_FILE_PATH" "$GCS_BUCKET/$GCS_OBJECT_NAME"
       - name: Upload Benchmark Artifacts
         if: always()
-        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
         with:
           name: results-${{ env.CONFIG_ID }}
           path: ${{ env.RESOLVED_OUTPUT_DIR }}
diff --git a/third_party/xla/.github/workflows/postsubmit_benchmark.yml b/third_party/xla/.github/workflows/postsubmit_benchmark.yml
index 2d899ae24284cd..a4f249366b6ea9 100644
--- a/third_party/xla/.github/workflows/postsubmit_benchmark.yml
+++ b/third_party/xla/.github/workflows/postsubmit_benchmark.yml
@@ -224,7 +224,7 @@ jobs:
 
       - name: Upload Benchmark Artifacts
         if: always()
-        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
         with:
           name: results-${{ env.CONFIG_ID }}
           path: ${{ env.RESOLVED_OUTPUT_DIR }}
diff --git a/third_party/xla/.github/workflows/presubmit_benchmark.yml b/third_party/xla/.github/workflows/presubmit_benchmark.yml
index 02483e6158091b..4259667c73dad0 100644
--- a/third_party/xla/.github/workflows/presubmit_benchmark.yml
+++ b/third_party/xla/.github/workflows/presubmit_benchmark.yml
@@ -199,7 +199,7 @@ jobs:
 
       - name: Upload Benchmark Artifacts
         if: always()
-        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
         with:
           name: results-${{ env.CONFIG_ID }} 
           path: ${{ env.RESOLVED_OUTPUT_DIR }}
diff --git a/third_party/xla/.github/workflows/scorecards-analysis.yml b/third_party/xla/.github/workflows/scorecards-analysis.yml
index 0751547d4aea89..0ec69c216d7aaf 100644
--- a/third_party/xla/.github/workflows/scorecards-analysis.yml
+++ b/third_party/xla/.github/workflows/scorecards-analysis.yml
@@ -58,7 +58,7 @@ jobs:
       # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
       # format to the repository Actions tab.
       - name: "Upload artifact"
-        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v3.pre.node20
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v3.pre.node20
         with:
           name: SARIF file
           path: results.sarif

From 854592931c404838dd7cf5e48d6969df354781c1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:21:24 -0800
Subject: [PATCH 408/753] Automated Code Change

PiperOrigin-RevId: 845668948
---
 third_party/xla/xla/examples/axpy/stablehlo_compile_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/examples/axpy/stablehlo_compile_test.cc b/third_party/xla/xla/examples/axpy/stablehlo_compile_test.cc
index fe4de7c6776cb9..55e87a248c48da 100644
--- a/third_party/xla/xla/examples/axpy/stablehlo_compile_test.cc
+++ b/third_party/xla/xla/examples/axpy/stablehlo_compile_test.cc
@@ -176,7 +176,7 @@ TEST_F(StableHloAxpyTest, CompileAndExecuteCPUTestProgram) {
 
   // Convert result buffer back to literal.
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> axpy_result_literal,
-                          axpy_result[0][0]->ToLiteralSync());
+                          axpy_result[0][0]->ToLiteral().Await());
 
   // Check to make sure that our results match what we expect.
   xla::LiteralTestUtil::ExpectR1Near<float>({13.64f, 26.78f, 39.92f, 53.06f},

From aea2e3d2827274a95467a810102d231762200d29 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:21:33 -0800
Subject: [PATCH 409/753] Automated Code Change

PiperOrigin-RevId: 845668992
---
 .../rpc_collective_executor_mgr.cc            |  5 ++--
 .../rpc_collective_executor_mgr.h             |  8 +++---
 .../rpc_collective_executor_mgr_test.cc       |  2 +-
 .../core/distributed_runtime/rpcbench_test.cc |  8 +++---
 .../core/distributed_runtime/scheduler.h      |  2 +-
 .../core/distributed_runtime/server_lib.cc    |  6 ++---
 .../core/distributed_runtime/server_lib.h     |  6 ++---
 .../core/distributed_runtime/tensor_coding.cc | 16 ++++++------
 .../distributed_runtime/tensor_coding_test.cc | 26 +++++++++----------
 .../core/distributed_runtime/test_utils.h     | 25 ++++++++++--------
 tensorflow/core/distributed_runtime/worker.cc | 10 +++----
 .../core/distributed_runtime/worker_cache.h   | 15 ++++++-----
 .../worker_cache_logger.cc                    | 26 +++++++++----------
 .../distributed_runtime/worker_cache_logger.h | 18 +++++++------
 .../worker_cache_partial.cc                   | 10 +++----
 .../worker_cache_partial.h                    |  9 ++++---
 .../worker_cache_wrapper.h                    | 16 +++++++-----
 .../distributed_runtime/worker_session.cc     | 24 +++++++++--------
 .../core/distributed_runtime/worker_session.h | 14 +++++-----
 19 files changed, 128 insertions(+), 118 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
index 1af67bdb51b3ca..b80045c28f08cf 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
@@ -30,7 +30,7 @@ RpcCollectiveExecutorMgr::RpcCollectiveExecutorMgr(
     std::unique_ptr<DeviceResolverDistributed> dev_resolver,
     std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
     std::unique_ptr<NcclCommunicatorInterface> nccl_communicator,
-    WorkerCacheInterface* worker_cache, const string& task_name)
+    WorkerCacheInterface* worker_cache, const std::string& task_name)
     : CollectiveExecutorMgr(config, dev_mgr, std::move(dev_resolver),
                             std::move(param_resolver),
                             std::move(nccl_communicator)),
@@ -172,7 +172,8 @@ void RpcCollectiveExecutorMgr::RetireStepId(int64_t graph_key,
 std::unique_ptr<RpcCollectiveExecutorMgr> CreateProdRpcCollectiveExecutorMgr(
     const ConfigProto& config, const DeviceMgr* device_mgr,
     std::unique_ptr<NcclCommunicatorInterface> nccl_communicator,
-    WorkerCacheInterface* worker_cache, const string& default_worker_name) {
+    WorkerCacheInterface* worker_cache,
+    const std::string& default_worker_name) {
   auto dev_resolver = std::make_unique<DeviceResolverDistributed>(device_mgr);
   auto param_resolver = std::make_unique<CollectiveParamResolverDistributed>(
       config, device_mgr, dev_resolver.get(), nccl_communicator.get(),
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
index 6836204cc1a289..aadbaf33796437 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
@@ -39,7 +39,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
       std::unique_ptr<DeviceResolverDistributed> dev_resolver,
       std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
       std::unique_ptr<NcclCommunicatorInterface> nccl_communicator,
-      WorkerCacheInterface* worker_cache, const string& task_name);
+      WorkerCacheInterface* worker_cache, const std::string& task_name);
 
   virtual ~RpcCollectiveExecutorMgr();
 
@@ -60,8 +60,8 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
   virtual CollectiveExecutor* Create(int64_t step_id) override;
 
   WorkerCacheInterface* const worker_cache_;  // Not owned.
-  const string task_name_;
-  string group_leader_;
+  const std::string task_name_;
+  std::string group_leader_;
   friend class RpcCollectiveExecutorMgrTest;
 
  private:
@@ -88,7 +88,7 @@ class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
 std::unique_ptr<RpcCollectiveExecutorMgr> CreateProdRpcCollectiveExecutorMgr(
     const ConfigProto& config, const DeviceMgr* device_mgr,
     std::unique_ptr<NcclCommunicatorInterface> nccl_communicator,
-    WorkerCacheInterface* worker_cache, const string& default_worker_name);
+    WorkerCacheInterface* worker_cache, const std::string& default_worker_name);
 
 }  // namespace tensorflow
 #endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
index f830fd96110456..55eebf621e5882 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
@@ -39,7 +39,7 @@ namespace tensorflow {
 class RpcCollectiveExecutorMgrTest : public ::testing::Test {
  protected:
   RpcCollectiveExecutorMgrTest() {
-    string task_name = "/job:localhost/replica:0/task:0";
+    std::string task_name = "/job:localhost/replica:0/task:0";
     SessionOptions options;
     options.config.mutable_experimental()->set_collective_group_leader(
         task_name);
diff --git a/tensorflow/core/distributed_runtime/rpcbench_test.cc b/tensorflow/core/distributed_runtime/rpcbench_test.cc
index 666800294ac003..70816cc8a7b556 100644
--- a/tensorflow/core/distributed_runtime/rpcbench_test.cc
+++ b/tensorflow/core/distributed_runtime/rpcbench_test.cc
@@ -42,7 +42,7 @@ static const int kWorkers = 60;
 static thread::ThreadPool* worker_threads;
 
 void MakeGRPCCluster(const SessionOptions& options, int n,
-                     std::vector<string>* workers,
+                     std::vector<std::string>* workers,
                      std::vector<DeviceAttributes>* devices) {
   CHECK_GE(n, 1);
 
@@ -100,7 +100,7 @@ void MakeGRPCCluster(const SessionOptions& options, int n,
 
 struct Cluster {
   SessionOptions options;
-  std::vector<string> workers;
+  std::vector<std::string> workers;
   std::vector<DeviceAttributes> devices;  // One per process
 
   Cluster() {
@@ -153,14 +153,14 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size,
   return def;
 }
 
-string DebugString(const Tensor& x, const Tensor& y, int tensor_size) {
+std::string DebugString(const Tensor& x, const Tensor& y, int tensor_size) {
   CHECK_EQ(x.NumElements(), tensor_size);
   CHECK_EQ(y.NumElements(), tensor_size);
   auto x_flat = x.flat<float>();
   auto y_flat = y.flat<float>();
   // Just print the first couple of elements of each tensor
   CHECK_GE(tensor_size, 2);
-  return strings::Printf("x = [%8.6f %8.6f] y = [%8.6f %8.6f]", x_flat(0),
+  return absl::StrFormat("x = [%8.6f %8.6f] y = [%8.6f %8.6f]", x_flat(0),
                          x_flat(1), y_flat(0), y_flat(1));
 }
 
diff --git a/tensorflow/core/distributed_runtime/scheduler.h b/tensorflow/core/distributed_runtime/scheduler.h
index 4385db786ff38a..d277bdab74e835 100644
--- a/tensorflow/core/distributed_runtime/scheduler.h
+++ b/tensorflow/core/distributed_runtime/scheduler.h
@@ -86,7 +86,7 @@ class GreedyScheduler {
   const CostModel* cost_model_;
   const Graph* graph_;
   std::vector<int64_t>* priority_;
-  std::unordered_map<string, Sim*> device_states_;
+  std::unordered_map<std::string, Sim*> device_states_;
 
   GreedyScheduler(const GreedyScheduler&) = delete;
   void operator=(const GreedyScheduler&) = delete;
diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc
index 2f7cc4184662f4..527dd49507c607 100644
--- a/tensorflow/core/distributed_runtime/server_lib.cc
+++ b/tensorflow/core/distributed_runtime/server_lib.cc
@@ -28,7 +28,7 @@ mutex* get_server_factory_lock() {
   return &server_factory_lock;
 }
 
-typedef std::unordered_map<string, ServerFactory*> ServerFactories;
+typedef std::unordered_map<std::string, ServerFactory*> ServerFactories;
 ServerFactories* server_factories() {
   static ServerFactories* factories = new ServerFactories;
   return factories;
@@ -36,7 +36,7 @@ ServerFactories* server_factories() {
 }  // namespace
 
 /* static */
-void ServerFactory::Register(const string& server_type,
+void ServerFactory::Register(const std::string& server_type,
                              ServerFactory* factory) {
   mutex_lock l(*get_server_factory_lock());
   if (!server_factories()->insert({server_type, factory}).second) {
@@ -56,7 +56,7 @@ absl::Status ServerFactory::GetFactory(const ServerDef& server_def,
     }
   }
 
-  std::vector<string> server_names;
+  std::vector<std::string> server_names;
   for (const auto& server_factory : *server_factories()) {
     server_names.push_back(server_factory.first);
   }
diff --git a/tensorflow/core/distributed_runtime/server_lib.h b/tensorflow/core/distributed_runtime/server_lib.h
index cc92d0bae12b17..c49d47970b4ca0 100644
--- a/tensorflow/core/distributed_runtime/server_lib.h
+++ b/tensorflow/core/distributed_runtime/server_lib.h
@@ -64,7 +64,7 @@ class ServerInterface {
 
   // Returns a target string that can be used to connect to this server using
   // `tensorflow::NewSession()`.
-  virtual const string target() const = 0;
+  virtual const std::string target() const = 0;
 
   virtual WorkerEnv* worker_env() = 0;
   virtual MasterEnv* master_env() = 0;
@@ -77,7 +77,7 @@ class ServerInterface {
   // Add master eager context to local eager service in order to handle enqueue
   // requests from remote workers.
   virtual absl::Status AddMasterEagerContextToEagerService(
-      const tensorflow::uint64 context_id, EagerContext* context) = 0;
+      const uint64_t context_id, EagerContext* context) = 0;
   // Set coordination service agent instance to coordination service RPC handler
   virtual absl::Status SetCoordinationServiceAgentInstance(
       tsl::CoordinationServiceAgent* agent) = 0;
@@ -113,7 +113,7 @@ class ServerFactory {
   // be registered by calling this method.
   //
   // The `server_type` must be unique to the server factory.
-  static void Register(const string& server_type, ServerFactory* factory);
+  static void Register(const std::string& server_type, ServerFactory* factory);
 
   // Looks up a factory that can create a server based on the given
   // `server_def`, and stores it in `*out_factory`. Returns OK on
diff --git a/tensorflow/core/distributed_runtime/tensor_coding.cc b/tensorflow/core/distributed_runtime/tensor_coding.cc
index 1990f0c17c66a4..43524d19a35788 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding.cc
@@ -114,14 +114,14 @@ enum WireType {
   WIRETYPE_VARINT = 0,
   WIRETYPE_LENGTH_DELIMITED = 2,
 };
-inline int GetTagFieldNumber(uint32 tag) { return tag >> 3; }
-inline WireType GetTagWireType(uint32 tag) {
+inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; }
+inline WireType GetTagWireType(uint32_t tag) {
   return static_cast<WireType>(tag & 0x7);
 }
 
 bool ReadVarintSizeAsInt(protobuf::io::CodedInputStream* input, int* result) {
   protobuf_uint64 v;
-  if (input->ReadVarint64(&v) && v <= static_cast<uint64>(INT_MAX)) {
+  if (input->ReadVarint64(&v) && v <= static_cast<uint64_t>(INT_MAX)) {
     *result = static_cast<int>(v);
     return true;
   } else {
@@ -162,7 +162,7 @@ bool TensorResponse::ParseTensorSubmessage(
     }
     switch (tag) {
       case TensorProto::kDtypeFieldNumber: {
-        uint32 v;
+        uint32_t v;
         if ((wt != WIRETYPE_VARINT) || !input->ReadVarint32(&v)) return false;
         if (seen_tensor_content) return false;
         tensor_meta->set_dtype(static_cast<DataType>(static_cast<int>(v)));
@@ -177,10 +177,10 @@ bool TensorResponse::ParseTensorSubmessage(
         break;
       }
       case TensorProto::kVersionNumberFieldNumber: {
-        uint32 v;
+        uint32_t v;
         if ((wt != WIRETYPE_VARINT) || !input->ReadVarint32(&v)) return false;
         if (seen_tensor_content) return false;
-        tensor_meta->set_version_number(static_cast<int32>(v));
+        tensor_meta->set_version_number(static_cast<int32_t>(v));
         break;
       }
       case TensorProto::kTensorContentFieldNumber: {
@@ -242,7 +242,7 @@ bool TensorResponse::ParseFast(Source* source) {
         break;
       }
       case RecvTensorResponse::kIsDeadFieldNumber: {
-        uint32 v;
+        uint32_t v;
         if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) return false;
         meta_.set_is_dead(v != 0);
         break;
@@ -260,7 +260,7 @@ bool TensorResponse::ParseFast(Source* source) {
         break;
       }
       case RecvTensorResponse::kRequireAckFieldNumber: {
-        uint32 v;
+        uint32_t v;
         if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) return false;
         meta_.set_require_ack(v != 0);
         break;
diff --git a/tensorflow/core/distributed_runtime/tensor_coding_test.cc b/tensorflow/core/distributed_runtime/tensor_coding_test.cc
index 9ef513f70392e4..66ba2bdce86b3a 100644
--- a/tensorflow/core/distributed_runtime/tensor_coding_test.cc
+++ b/tensorflow/core/distributed_runtime/tensor_coding_test.cc
@@ -48,7 +48,7 @@ class DummyDevice : public DeviceBase {
 
 class StringSource : public TensorResponse::Source {
  public:
-  explicit StringSource(const string* s, int block_size)
+  explicit StringSource(const std::string* s, int block_size)
       : s_(s), stream_(nullptr), block_size_(block_size) {}
   ~StringSource() override { DeleteStream(); }
 
@@ -66,7 +66,7 @@ class StringSource : public TensorResponse::Source {
   }
 
  private:
-  const string* s_;
+  const std::string* s_;
   protobuf::io::ArrayInputStream* stream_;
   char space_[sizeof(protobuf::io::ArrayInputStream)];
   int block_size_;
@@ -83,7 +83,7 @@ class TensorResponseTest : public ::testing::Test {
     } else {
       src.AsProtoField(proto.mutable_tensor());
     }
-    string encoded;
+    std::string encoded;
     proto.AppendToString(&encoded);
 
     StringSource source(&encoded, 1024);
@@ -136,11 +136,11 @@ class TensorResponseTest : public ::testing::Test {
 TEST_F(TensorResponseTest, Simple) {
   DoTest<float>(DT_FLOAT);
   DoTest<double>(DT_DOUBLE);
-  DoTest<int32>(DT_INT32);
-  DoTest<uint16>(DT_UINT16);
-  DoTest<uint8>(DT_UINT8);
-  DoTest<int16>(DT_INT16);
-  DoTest<int8>(DT_INT8);
+  DoTest<int32_t>(DT_INT32);
+  DoTest<uint16_t>(DT_UINT16);
+  DoTest<uint8_t>(DT_UINT8);
+  DoTest<int16_t>(DT_INT16);
+  DoTest<int8_t>(DT_INT8);
   DoTest<complex64>(DT_COMPLEX64);
   DoTest<complex128>(DT_COMPLEX128);
   DoTest<int64_t>(DT_INT64);
@@ -156,19 +156,19 @@ TEST_F(TensorResponseTest, Simple) {
 
 TEST_F(TensorResponseTest, StringTensor) { DoTestForStrings(DT_STRING); }
 
-string MakeFloatTensorTestCase(int num_elems) {
-  std::vector<int8> v(num_elems);
+std::string MakeFloatTensorTestCase(int num_elems) {
+  std::vector<int8_t> v(num_elems);
   for (int i = 0; i < num_elems; i++) {
     v[i] = i % 10;
   }
   Tensor src(DT_INT8, TensorShape({1, static_cast<int64_t>(v.size())}));
-  test::FillValues<int8>(&src, v);
+  test::FillValues<int8_t>(&src, v);
 
   RecvTensorResponse proto;
   proto.set_is_dead(false);
   proto.set_send_start_micros(123456);
   src.AsProtoTensorContent(proto.mutable_tensor());
-  string encoded;
+  std::string encoded;
   proto.AppendToString(&encoded);
   return encoded;
 }
@@ -176,7 +176,7 @@ string MakeFloatTensorTestCase(int num_elems) {
 static void BM_TensorResponse(::testing::benchmark::State& state) {
   const int arg = state.range(0);
 
-  string encoded = MakeFloatTensorTestCase(arg);
+  std::string encoded = MakeFloatTensorTestCase(arg);
   DummyDevice cpu_device(Env::Default());
   size_t bytes = 0;
   for (auto i : state) {
diff --git a/tensorflow/core/distributed_runtime/test_utils.h b/tensorflow/core/distributed_runtime/test_utils.h
index e7ad1041dd73ff..b7316299e051c3 100644
--- a/tensorflow/core/distributed_runtime/test_utils.h
+++ b/tensorflow/core/distributed_runtime/test_utils.h
@@ -124,23 +124,24 @@ class TestWorkerCache : public WorkerCacheInterface {
  public:
   virtual ~TestWorkerCache() {}
 
-  void AddWorker(const string& target, WorkerInterface* wi) {
+  void AddWorker(const std::string& target, WorkerInterface* wi) {
     workers_[target] = wi;
   }
 
-  void AddDevice(const string& device_name, const DeviceLocality& dev_loc) {
+  void AddDevice(const std::string& device_name,
+                 const DeviceLocality& dev_loc) {
     localities_[device_name] = dev_loc;
   }
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<std::string>* workers) const override {
     workers->clear();
     for (auto it : workers_) {
       workers->push_back(it.first);
     }
   }
 
-  void ListWorkersInJob(const string& job_name,
-                        std::vector<string>* workers) const override {
+  void ListWorkersInJob(const std::string& job_name,
+                        std::vector<std::string>* workers) const override {
     workers->clear();
     for (auto it : workers_) {
       DeviceNameUtils::ParsedName device_name;
@@ -152,7 +153,7 @@ class TestWorkerCache : public WorkerCacheInterface {
     }
   }
 
-  WorkerInterface* GetOrCreateWorker(const string& target) override {
+  WorkerInterface* GetOrCreateWorker(const std::string& target) override {
     auto it = workers_.find(target);
     if (it != workers_.end()) {
       return it->second;
@@ -160,7 +161,8 @@ class TestWorkerCache : public WorkerCacheInterface {
     return nullptr;
   }
 
-  void ReleaseWorker(const string& target, WorkerInterface* worker) override {}
+  void ReleaseWorker(const std::string& target,
+                     WorkerInterface* worker) override {}
 
   absl::Status GetEagerClientCache(
       std::unique_ptr<eager::EagerClientCache>* eager_client_cache) override {
@@ -172,7 +174,7 @@ class TestWorkerCache : public WorkerCacheInterface {
     return errors::Unimplemented("Unimplemented.");
   }
 
-  bool GetDeviceLocalityNonBlocking(const string& device,
+  bool GetDeviceLocalityNonBlocking(const std::string& device,
                                     DeviceLocality* locality) override {
     auto it = localities_.find(device);
     if (it != localities_.end()) {
@@ -182,7 +184,8 @@ class TestWorkerCache : public WorkerCacheInterface {
     return false;
   }
 
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+  void GetDeviceLocalityAsync(const std::string& device,
+                              DeviceLocality* locality,
                               StatusCallback done) override {
     auto it = localities_.find(device);
     if (it != localities_.end()) {
@@ -194,8 +197,8 @@ class TestWorkerCache : public WorkerCacheInterface {
   }
 
  protected:
-  std::unordered_map<string, WorkerInterface*> workers_;
-  std::unordered_map<string, DeviceLocality> localities_;
+  std::unordered_map<std::string, WorkerInterface*> workers_;
+  std::unordered_map<std::string, DeviceLocality> localities_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 9fb0a76ad866f9..04b0ee20d2cc8f 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -251,7 +251,7 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
 
         if (s.ok()) {
           for (const auto& p : *out) {
-            const string& key = p.first;
+            const std::string& key = p.first;
             const Tensor& val = p.second;
             response->AddRecv(key, val);
           }
@@ -271,7 +271,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
                                MutableRunGraphResponseWrapper* response,
                                StatusCallback done) {
   const int64_t step_id = request->step_id();
-  const string& graph_handle = request->graph_handle();
+  const std::string& graph_handle = request->graph_handle();
   TRACEPRINTF("PartialRunGraph: %lld", step_id);
   absl::Status s = recent_request_ids_.TrackUnique(
       request->request_id(), "PartialRunGraph (Worker)", request);
@@ -345,7 +345,7 @@ void Worker::DoPartialRunGraph(CallOptions* opts,
         if (s.ok()) {
           // Construct and return the resp.
           for (const auto& p : *out) {
-            const string& key = p.first;
+            const std::string& key = p.first;
             const Tensor& val = p.second;
             response->AddRecv(key, val);
           }
@@ -378,7 +378,7 @@ void Worker::CleanupGraphAsync(const CleanupGraphRequest* request,
 void Worker::CleanupAllAsync(const CleanupAllRequest* request,
                              CleanupAllResponse* response,
                              StatusCallback done) {
-  std::vector<string> containers;
+  std::vector<std::string> containers;
   for (const auto& c : request->container()) containers.push_back(c);
   env_->device_mgr->ClearContainers(containers);
   done(absl::OkStatus());
@@ -474,7 +474,7 @@ void Worker::GetStepSequenceAsync(const GetStepSequenceRequest* request,
 absl::Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
                                        Device** src_dev) {
   // Figures out which device the tensor is hosted on.
-  string local_name = DeviceNameUtils::LocalName(parsed.src_device);
+  std::string local_name = DeviceNameUtils::LocalName(parsed.src_device);
   TF_RETURN_IF_ERROR(env_->device_mgr->LookupDevice(local_name, src_dev));
 
   // Does the device have the right incarnation number we expect?
diff --git a/tensorflow/core/distributed_runtime/worker_cache.h b/tensorflow/core/distributed_runtime/worker_cache.h
index 1ac4de35d9788f..0612a8321d3aac 100644
--- a/tensorflow/core/distributed_runtime/worker_cache.h
+++ b/tensorflow/core/distributed_runtime/worker_cache.h
@@ -37,22 +37,23 @@ class WorkerCacheInterface {
 
   // Updates *workers with strings naming the remote worker tasks to
   // which open channels have been established.
-  virtual void ListWorkers(std::vector<string>* workers) const = 0;
-  virtual void ListWorkersInJob(const string& job_name,
-                                std::vector<string>* workers) const = 0;
+  virtual void ListWorkers(std::vector<std::string>* workers) const = 0;
+  virtual void ListWorkersInJob(const std::string& job_name,
+                                std::vector<std::string>* workers) const = 0;
 
   // If "target" names a remote task for which an RPC channel exists
   // or can be constructed, returns a pointer to a WorkerInterface object
   // wrapping that channel. The returned value must be destroyed by
   // calling `this->ReleaseWorker(target, ret)`
-  virtual WorkerInterface* GetOrCreateWorker(const string& target) = 0;
+  virtual WorkerInterface* GetOrCreateWorker(const std::string& target) = 0;
 
   // Release a worker previously returned by this->GetOrCreateWorker(target).
   //
   // TODO(jeff,sanjay): Consider moving target into WorkerInterface.
   // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a
   //                    per-rpc-subsystem WorkerInterface creator.
-  virtual void ReleaseWorker(const string& target, WorkerInterface* worker) {
+  virtual void ReleaseWorker(const std::string& target,
+                             WorkerInterface* worker) {
     // Subclasses may override to reuse worker objects.
     delete worker;
   }
@@ -61,13 +62,13 @@ class WorkerCacheInterface {
   // within its local environment.  Returns true if *locality
   // was set, using only locally cached data.  Returns false
   // if status data for that device was not available.  Never blocks.
-  virtual bool GetDeviceLocalityNonBlocking(const string& device,
+  virtual bool GetDeviceLocalityNonBlocking(const std::string& device,
                                             DeviceLocality* locality) = 0;
 
   // Set *locality with the DeviceLocality of the specified remote device
   // within its local environment.  Callback gets Status::OK if *locality
   // was set.
-  virtual void GetDeviceLocalityAsync(const string& device,
+  virtual void GetDeviceLocalityAsync(const std::string& device,
                                       DeviceLocality* locality,
                                       StatusCallback done) = 0;
 
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.cc b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
index 2936c3b2667e18..5a1d3d02d4eceb 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.cc
@@ -68,7 +68,7 @@ bool WorkerCacheLogger::RetrieveLogs(int64_t step_id, StepStats* ss) {
   return false;
 }
 
-void WorkerCacheLogger::Save(const string& device, int64_t step_id,
+void WorkerCacheLogger::Save(const std::string& device, int64_t step_id,
                              NodeExecStats* ns) {
   mutex_lock l(mu_);
   StepLog* sl = &log_map_[step_id];
@@ -84,33 +84,31 @@ void WorkerCacheLogger::Save(const string& device, int64_t step_id,
 
 void WorkerCacheLogger::RecordRecvTensor(int64_t step_id, int64_t start_usecs,
                                          int64_t end_usecs,
-                                         const string& tensor_name,
-                                         const string& src_device,
-                                         const string& dst_device,
+                                         const std::string& tensor_name,
+                                         const std::string& src_device,
+                                         const std::string& dst_device,
                                          int64_t bytes) {
   RecordDataTransfer(step_id, start_usecs, end_usecs, tensor_name, src_device,
                      dst_device, bytes, "", "RecvTensor");
 }
 
-void WorkerCacheLogger::RecordDataTransfer(int64_t step_id, int64_t start_usecs,
-                                           int64_t end_usecs,
-                                           const string& tensor_name,
-                                           const string& src_device,
-                                           const string& dst_device,
-                                           int64_t bytes, const string& details,
-                                           const string& transfer_method_name) {
+void WorkerCacheLogger::RecordDataTransfer(
+    int64_t step_id, int64_t start_usecs, int64_t end_usecs,
+    const std::string& tensor_name, const std::string& src_device,
+    const std::string& dst_device, int64_t bytes, const std::string& details,
+    const std::string& transfer_method_name) {
   NodeExecStats* ns = new NodeExecStats;
   ns->set_node_name(transfer_method_name);
   int64_t elapsed_usecs = end_usecs - start_usecs;
   if (details.empty()) {
     auto byte_string = absl::StrCat("[", bytes, "B] ");
     if (bytes >= 0.1 * 1048576.0) {
-      byte_string = strings::Printf("[%.1fMB] ", bytes / 1048576.0);
+      byte_string = absl::StrFormat("[%.1fMB] ", bytes / 1048576.0);
     }
     float mbs_rate = (8.0 * static_cast<float>(bytes)) / elapsed_usecs;
     auto rate_string = (mbs_rate >= 1000.0)
-                           ? strings::Printf("[%.1fGb/s] ", mbs_rate / 1000.0)
-                           : strings::Printf("[%fMb/s] ", mbs_rate);
+                           ? absl::StrFormat("[%.1fGb/s] ", mbs_rate / 1000.0)
+                           : absl::StrFormat("[%fMb/s] ", mbs_rate);
     auto label = strings::StrCat(byte_string, rate_string, tensor_name,
                                  " from ", src_device, " to ", dst_device);
     ns->set_timeline_label(label);
diff --git a/tensorflow/core/distributed_runtime/worker_cache_logger.h b/tensorflow/core/distributed_runtime/worker_cache_logger.h
index f5ef19bf6646f7..e7a1ebf0c40708 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_logger.h
+++ b/tensorflow/core/distributed_runtime/worker_cache_logger.h
@@ -57,20 +57,22 @@ class WorkerCacheLogger {
   // Generates a NodeExecStats record with the given data, and saves for
   // later retrieval by RetrieveLogs().
   void RecordRecvTensor(int64_t step_id, int64_t start_usecs, int64_t end_usecs,
-                        const string& tensor_name, const string& src_device,
-                        const string& dst_device, int64_t bytes);
+                        const std::string& tensor_name,
+                        const std::string& src_device,
+                        const std::string& dst_device, int64_t bytes);
 
   // Generates a NodeExecStats record with the given data, and saves for
   // later retrieval by RetrieveLogs().
   void RecordDataTransfer(int64_t step_id, int64_t start_usecs,
-                          int64_t end_usecs, const string& tensor_name,
-                          const string& src_device, const string& dst_device,
-                          int64_t bytes, const string& details,
-                          const string& transfer_method_name);
+                          int64_t end_usecs, const std::string& tensor_name,
+                          const std::string& src_device,
+                          const std::string& dst_device, int64_t bytes,
+                          const std::string& details,
+                          const std::string& transfer_method_name);
 
  private:
   mutex count_mu_;
-  int32 want_logging_count_ TF_GUARDED_BY(count_mu_) = 0;
+  int32_t want_logging_count_ TF_GUARDED_BY(count_mu_) = 0;
 
   struct StepLog {
     StepStats step_stats;
@@ -81,7 +83,7 @@ class WorkerCacheLogger {
   LogMap log_map_ TF_GUARDED_BY(mu_);
 
   // Records "ns" in log_map_ under the given device and step.
-  void Save(const string& device, int64_t step_id, NodeExecStats* ns);
+  void Save(const std::string& device, int64_t step_id, NodeExecStats* ns);
 
   void ClearLogsWithLock() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 };
diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.cc b/tensorflow/core/distributed_runtime/worker_cache_partial.cc
index 58b130228e00dd..47fdcce387297d 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_partial.cc
+++ b/tensorflow/core/distributed_runtime/worker_cache_partial.cc
@@ -27,7 +27,7 @@ limitations under the License.
 namespace tensorflow {
 
 bool WorkerCachePartial::GetDeviceLocalityNonBlocking(
-    const string& device_name, DeviceLocality* locality) {
+    const std::string& device_name, DeviceLocality* locality) {
   mutex_lock lock(mu_);  // could use reader lock
   auto iter = device_status_cache_.find(device_name);
   if (iter != device_status_cache_.end()) {
@@ -37,7 +37,7 @@ bool WorkerCachePartial::GetDeviceLocalityNonBlocking(
   return false;
 }
 
-void WorkerCachePartial::GetDeviceLocalityAsync(const string& device_name,
+void WorkerCachePartial::GetDeviceLocalityAsync(const std::string& device_name,
                                                 DeviceLocality* locality,
                                                 StatusCallback done) {
   if (!GetDeviceLocalityNonBlocking(device_name, locality)) {
@@ -55,9 +55,9 @@ void WorkerCachePartial::GetDeviceLocalityAsync(const string& device_name,
 }
 
 absl::Status WorkerCachePartial::RefreshDeviceStatus(
-    const string& device_name) {
-  string task;
-  string device;
+    const std::string& device_name) {
+  std::string task;
+  std::string device;
   absl::Status s;
   if (!DeviceNameUtils::SplitDeviceName(device_name, &task, &device)) {
     s = errors::InvalidArgument("Bad device name to RefreshDeviceStatus: ",
diff --git a/tensorflow/core/distributed_runtime/worker_cache_partial.h b/tensorflow/core/distributed_runtime/worker_cache_partial.h
index b5a500b86dae00..08e272a3bb6db6 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_partial.h
+++ b/tensorflow/core/distributed_runtime/worker_cache_partial.h
@@ -31,10 +31,11 @@ namespace tensorflow {
 // device status attributes.
 class WorkerCachePartial : public WorkerCacheInterface {
  public:
-  bool GetDeviceLocalityNonBlocking(const string& device,
+  bool GetDeviceLocalityNonBlocking(const std::string& device,
                                     DeviceLocality* locality) override;
 
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+  void GetDeviceLocalityAsync(const std::string& device,
+                              DeviceLocality* locality,
                               StatusCallback) override;
 
   ~WorkerCachePartial() override {}
@@ -47,9 +48,9 @@ class WorkerCachePartial : public WorkerCacheInterface {
 
   // Initiate a GetStatusAsync to the remote task named by "task", and
   // update the cache with all the DeviceAttributes reported.
-  absl::Status RefreshDeviceStatus(const string& device_name);
+  absl::Status RefreshDeviceStatus(const std::string& device_name);
 
-  typedef std::unordered_map<string, DeviceAttributes> StatusMap;
+  typedef std::unordered_map<std::string, DeviceAttributes> StatusMap;
   StatusMap device_status_cache_ TF_GUARDED_BY(mu_);
 };
 
diff --git a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
index 7f709b4fb5c1bb..8917da3825773b 100644
--- a/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
+++ b/tensorflow/core/distributed_runtime/worker_cache_wrapper.h
@@ -29,11 +29,11 @@ class WorkerCacheWrapper : public WorkerCacheInterface {
 
   // Updates *workers with strings naming the remote worker tasks to
   // which open channels have been established.
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<std::string>* workers) const override {
     return wrapped_->ListWorkers(workers);
   }
-  void ListWorkersInJob(const string& job_name,
-                        std::vector<string>* workers) const override {
+  void ListWorkersInJob(const std::string& job_name,
+                        std::vector<std::string>* workers) const override {
     return wrapped_->ListWorkersInJob(job_name, workers);
   }
 
@@ -41,7 +41,7 @@ class WorkerCacheWrapper : public WorkerCacheInterface {
   // or can be constructed, returns a pointer to a WorkerInterface object
   // wrapping that channel. The returned value must be destroyed by
   // calling `this->ReleaseWorker(target, ret)`
-  WorkerInterface* GetOrCreateWorker(const string& target) override {
+  WorkerInterface* GetOrCreateWorker(const std::string& target) override {
     return wrapped_->GetOrCreateWorker(target);
   }
 
@@ -50,7 +50,8 @@ class WorkerCacheWrapper : public WorkerCacheInterface {
   // TODO(jeff,sanjay): Consider moving target into WorkerInterface.
   // TODO(jeff,sanjay): Unify all worker-cache impls and factor out a
   //                    per-rpc-subsystem WorkerInterface creator.
-  void ReleaseWorker(const string& target, WorkerInterface* worker) override {
+  void ReleaseWorker(const std::string& target,
+                     WorkerInterface* worker) override {
     return wrapped_->ReleaseWorker(target, worker);
   }
 
@@ -69,7 +70,7 @@ class WorkerCacheWrapper : public WorkerCacheInterface {
   // within its local environment.  Returns true if *locality
   // was set, using only locally cached data.  Returns false
   // if status data for that device was not available.  Never blocks.
-  bool GetDeviceLocalityNonBlocking(const string& device,
+  bool GetDeviceLocalityNonBlocking(const std::string& device,
                                     DeviceLocality* locality) override {
     return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
   }
@@ -77,7 +78,8 @@ class WorkerCacheWrapper : public WorkerCacheInterface {
   // Set *locality with the DeviceLocality of the specified remote device
   // within its local environment.  Callback gets Status::OK if *locality
   // was set.
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+  void GetDeviceLocalityAsync(const std::string& device,
+                              DeviceLocality* locality,
                               StatusCallback done) override {
     return wrapped_->GetDeviceLocalityAsync(device, locality, std::move(done));
   }
diff --git a/tensorflow/core/distributed_runtime/worker_session.cc b/tensorflow/core/distributed_runtime/worker_session.cc
index d9286d0d148843..cb66a4f845f5b7 100644
--- a/tensorflow/core/distributed_runtime/worker_session.cc
+++ b/tensorflow/core/distributed_runtime/worker_session.cc
@@ -43,16 +43,16 @@ class WorkerFreeListCache : public WorkerCacheInterface {
     }
   }
 
-  void ListWorkers(std::vector<string>* workers) const override {
+  void ListWorkers(std::vector<std::string>* workers) const override {
     wrapped_->ListWorkers(workers);
   }
 
-  void ListWorkersInJob(const string& job_name,
-                        std::vector<string>* workers) const override {
+  void ListWorkersInJob(const std::string& job_name,
+                        std::vector<std::string>* workers) const override {
     wrapped_->ListWorkersInJob(job_name, workers);
   }
 
-  WorkerInterface* GetOrCreateWorker(const string& target) override {
+  WorkerInterface* GetOrCreateWorker(const std::string& target) override {
     {
       // Fast path if worker has been created.
       tf_shared_lock l(mu_);
@@ -88,16 +88,18 @@ class WorkerFreeListCache : public WorkerCacheInterface {
     return wrapped_->GetCoordinationClientCache(coordination_client_cache);
   }
 
-  void ReleaseWorker(const string& target, WorkerInterface* worker) override {
+  void ReleaseWorker(const std::string& target,
+                     WorkerInterface* worker) override {
     // TODO(jeff,sanjay): Should decrement ref-count when we implement eviction.
   }
 
-  bool GetDeviceLocalityNonBlocking(const string& device,
+  bool GetDeviceLocalityNonBlocking(const std::string& device,
                                     DeviceLocality* locality) override {
     return wrapped_->GetDeviceLocalityNonBlocking(device, locality);
   }
 
-  void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality,
+  void GetDeviceLocalityAsync(const std::string& device,
+                              DeviceLocality* locality,
                               StatusCallback done) override {
     wrapped_->GetDeviceLocalityAsync(device, locality, done);
   }
@@ -121,13 +123,13 @@ class WorkerFreeListCache : public WorkerCacheInterface {
 
   // TODO(jeff,sanjay): Eviction when the map becomes too big.
   mutex mu_;
-  std::unordered_map<string, WorkerState> workers_ TF_GUARDED_BY(mu_);
+  std::unordered_map<std::string, WorkerState> workers_ TF_GUARDED_BY(mu_);
 };
 
 }  // namespace
 
 WorkerSession::WorkerSession(
-    const string& session_name, const string& worker_name,
+    const std::string& session_name, const std::string& worker_name,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     std::unique_ptr<DeviceMgr> device_mgr, std::unique_ptr<GraphMgr> graph_mgr,
     std::unique_ptr<DynamicDeviceMgr> remote_device_mgr,
@@ -165,7 +167,7 @@ absl::Status WorkerSession::UpdateWorkerCacheAndDevices(
 
 /* static */
 std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr(
-    const string& session_name, const string& worker_name,
+    const std::string& session_name, const std::string& worker_name,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr,
     std::unique_ptr<DynamicDeviceMgr> remote_device_mgr,
@@ -177,7 +179,7 @@ std::shared_ptr<WorkerSession> WorkerSession::CreateWithBorrowedDeviceMgr(
 }
 
 WorkerSession::WorkerSession(
-    const string& session_name, const string& worker_name,
+    const std::string& session_name, const std::string& worker_name,
     std::unique_ptr<WorkerCacheInterface> worker_cache,
     DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr,
     std::unique_ptr<DynamicDeviceMgr> remote_device_mgr,
diff --git a/tensorflow/core/distributed_runtime/worker_session.h b/tensorflow/core/distributed_runtime/worker_session.h
index e366accf18075b..5f8d66d93b6c69 100644
--- a/tensorflow/core/distributed_runtime/worker_session.h
+++ b/tensorflow/core/distributed_runtime/worker_session.h
@@ -51,8 +51,8 @@ class WorkerSession {
 
   DynamicDeviceMgr* remote_device_mgr() { return remote_device_mgr_.get(); }
 
-  const string& session_name() const { return session_name_; }
-  const string& worker_name() const { return worker_name_; }
+  const std::string& session_name() const { return session_name_; }
+  const std::string& worker_name() const { return worker_name_; }
 
   WorkerCacheInterface* worker_cache() const {
     tf_shared_lock l(worker_session_state_mu_);
@@ -64,7 +64,7 @@ class WorkerSession {
     return cluster_flr_.get();
   }
 
-  WorkerSession(const string& session_name, const string& worker_name,
+  WorkerSession(const std::string& session_name, const std::string& worker_name,
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 std::unique_ptr<DeviceMgr> device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr,
@@ -72,7 +72,7 @@ class WorkerSession {
                 DistributedFunctionLibraryRuntimeCreator cluster_flr_creator);
 
   static std::shared_ptr<WorkerSession> CreateWithBorrowedDeviceMgr(
-      const string& session_name, const string& worker_name,
+      const std::string& session_name, const std::string& worker_name,
       std::unique_ptr<WorkerCacheInterface> worker_cache,
       DeviceMgr* borrowed_device_mgr, std::unique_ptr<GraphMgr> graph_mgr,
       std::unique_ptr<DynamicDeviceMgr> remote_device_mgr,
@@ -98,7 +98,7 @@ class WorkerSession {
   ~WorkerSession();
 
  private:
-  WorkerSession(const string& session_name, const string& worker_name,
+  WorkerSession(const std::string& session_name, const std::string& worker_name,
                 std::unique_ptr<WorkerCacheInterface> worker_cache,
                 DeviceMgr* borrowed_device_mgr,
                 std::unique_ptr<GraphMgr> graph_mgr,
@@ -106,10 +106,10 @@ class WorkerSession {
                 DistributedFunctionLibraryRuntimeCreator cluster_flr_creator);
 
   // The name of the session.
-  const string session_name_;
+  const std::string session_name_;
 
   // The name of the worker. E.g., /job:mnist/replica:0/task:1.
-  const string worker_name_;
+  const std::string worker_name_;
 
   mutable mutex worker_session_state_mu_;
   // Object from which WorkerInterface instances can be obtained.

From e6e6f0f6bc6076ee77875b16fdb7614059900465 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:24:27 -0800
Subject: [PATCH 410/753] Automated Code Change

PiperOrigin-RevId: 845669944
---
 .../xla/xla/tsl/platform/numa_hwloc.cc        |  2 +-
 third_party/xla/xla/tsl/platform/numa_noop.cc |  2 +-
 .../xla/xla/tsl/platform/ram_file_system.h    | 10 +++----
 .../xla/xla/tsl/platform/rocm_rocdl_path.h    |  4 +--
 third_party/xla/xla/tsl/platform/status.cc    |  2 +-
 third_party/xla/xla/tsl/platform/subprocess.h |  3 ++-
 .../xla/xla/tsl/platform/subprocess_test.cc   | 27 +++++++++----------
 .../xla/xla/tsl/platform/threadpool.cc        |  6 ++---
 8 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/third_party/xla/xla/tsl/platform/numa_hwloc.cc b/third_party/xla/xla/tsl/platform/numa_hwloc.cc
index 971363aea7d1c1..e8163ab7eb5616 100644
--- a/third_party/xla/xla/tsl/platform/numa_hwloc.cc
+++ b/third_party/xla/xla/tsl/platform/numa_hwloc.cc
@@ -162,7 +162,7 @@ void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
       LOG(ERROR) << "Failed to find hwloc NUMA node " << node;
     }
   }
-  return ::tsl::port::AlignedMalloc(size, minimum_alignment);
+  return AlignedMalloc(size, static_cast<std::align_val_t>(minimum_alignment));
 }
 
 void NUMAFree(void* ptr, size_t size) {
diff --git a/third_party/xla/xla/tsl/platform/numa_noop.cc b/third_party/xla/xla/tsl/platform/numa_noop.cc
index 616c3ae57c5ded..2f50ad7504b260 100644
--- a/third_party/xla/xla/tsl/platform/numa_noop.cc
+++ b/third_party/xla/xla/tsl/platform/numa_noop.cc
@@ -30,7 +30,7 @@ void NUMASetThreadNodeAffinity(int node) {}
 int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
 
 void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
-  return ::tsl::port::AlignedMalloc(size, minimum_alignment);
+  return AlignedMalloc(size, static_cast<std::align_val_t>(minimum_alignment));
 }
 
 void NUMAFree(void* ptr, size_t size) { ::tsl::port::Free(ptr); }
diff --git a/third_party/xla/xla/tsl/platform/ram_file_system.h b/third_party/xla/xla/tsl/platform/ram_file_system.h
index 74c1e19a2273a4..892d5ed3fd7d68 100644
--- a/third_party/xla/xla/tsl/platform/ram_file_system.h
+++ b/third_party/xla/xla/tsl/platform/ram_file_system.h
@@ -66,13 +66,13 @@ class RamRandomAccessFile : public RandomAccessFile, public WritableFile {
     return absl::OkStatus();
   }
 
-  absl::Status Read(uint64 offset, size_t n, absl::string_view* result,
+  absl::Status Read(uint64_t offset, size_t n, absl::string_view* result,
                     char* scratch) const override {
     if (offset >= data_->size()) {
       return absl::OutOfRangeError("");
     }
 
-    uint64 left = std::min(static_cast<uint64>(n), data_->size() - offset);
+    uint64_t left = std::min(static_cast<uint64_t>(n), data_->size() - offset);
     auto start = data_->begin() + offset;
     auto end = data_->begin() + offset + left;
 
@@ -305,7 +305,7 @@ class RamFileSystem : public FileSystem {
   }
 
   absl::Status GetFileSize(const std::string& fname_, TransactionToken* token,
-                           uint64* file_size) override {
+                           uint64_t* file_size) override {
     absl::MutexLock m(mu_);
     auto fname = StripRamFsPrefix(fname_);
 
@@ -355,14 +355,14 @@ class RamFileSystem : public FileSystem {
     return absl::StartsWith(s, prefix);
   }
 
-  string StripPrefix(std::string s, std::string prefix) {
+  std::string StripPrefix(std::string s, std::string prefix) {
     if (absl::StartsWith(s, prefix)) {
       return s.erase(0, prefix.size());
     }
     return s;
   }
 
-  string StripRamFsPrefix(std::string name) {
+  std::string StripRamFsPrefix(std::string name) {
     std::string s = StripPrefix(name, "ram://");
     if (*(s.rbegin()) == '/') {
       s.pop_back();
diff --git a/third_party/xla/xla/tsl/platform/rocm_rocdl_path.h b/third_party/xla/xla/tsl/platform/rocm_rocdl_path.h
index a766492f0a95b2..62764273e11081 100644
--- a/third_party/xla/xla/tsl/platform/rocm_rocdl_path.h
+++ b/third_party/xla/xla/tsl/platform/rocm_rocdl_path.h
@@ -22,10 +22,10 @@ namespace tsl {
 
 // Returns the root directory of the ROCM SDK, which contains sub-folders such
 // as bin, lib, and rocdl.
-string RocmRoot();
+std::string RocmRoot();
 
 // Returns the directory that contains ROCm-Device-Libs files in the ROCm SDK.
-string RocdlRoot();
+std::string RocdlRoot();
 
 }  // namespace tsl
 
diff --git a/third_party/xla/xla/tsl/platform/status.cc b/third_party/xla/xla/tsl/platform/status.cc
index ab23dc080c9942..7e469216605715 100644
--- a/third_party/xla/xla/tsl/platform/status.cc
+++ b/third_party/xla/xla/tsl/platform/status.cc
@@ -322,7 +322,7 @@ absl::Status StatusGroup::as_concatenated_status() const {
   }
 
   if (!non_derived_.empty()) {
-    std::vector<string> fmt;
+    std::vector<std::string> fmt;
     fmt.emplace_back("\n=====================");
     for (const auto& s : non_derived_) {
       fmt.emplace_back(MakeString(s));
diff --git a/third_party/xla/xla/tsl/platform/subprocess.h b/third_party/xla/xla/tsl/platform/subprocess.h
index 8702b7795a8062..317b17da239575 100644
--- a/third_party/xla/xla/tsl/platform/subprocess.h
+++ b/third_party/xla/xla/tsl/platform/subprocess.h
@@ -52,7 +52,8 @@ class SubProcess;
 // launched with the given command-line arguments `argv`. The process
 // must be explicitly started by calling the Start() method on the
 // returned object.
-std::unique_ptr<SubProcess> CreateSubProcess(const std::vector<string>& argv);
+std::unique_ptr<SubProcess> CreateSubProcess(
+    const std::vector<std::string>& argv);
 
 }  // namespace tsl
 
diff --git a/third_party/xla/xla/tsl/platform/subprocess_test.cc b/third_party/xla/xla/tsl/platform/subprocess_test.cc
index 5bcf7824177964..0c5f9e99097a61 100644
--- a/third_party/xla/xla/tsl/platform/subprocess_test.cc
+++ b/third_party/xla/xla/tsl/platform/subprocess_test.cc
@@ -36,26 +36,25 @@ limitations under the License.
 namespace tsl {
 namespace {
 
-
-string EchoProgram() {
+std::string EchoProgram() {
   std::string path = io::JoinPath(testing::XlaSrcRoot(), "tsl", "platform",
                                   "testdata", "test_echo");
   return tsl::io::AppendDotExeIfWindows(path);
 }
 
-string EchoArgv1Program() {
+std::string EchoArgv1Program() {
   std::string path = io::JoinPath(testing::XlaSrcRoot(), "tsl", "platform",
                                   "testdata", "test_echo_argv_1");
   return tsl::io::AppendDotExeIfWindows(path);
 }
 
-string NoopProgram() {
+std::string NoopProgram() {
   std::string path = io::JoinPath(testing::XlaSrcRoot(), "tsl", "platform",
                                   "testdata", "test_noop");
   return tsl::io::AppendDotExeIfWindows(path);
 }
 
-string StdErrProgram() {
+std::string StdErrProgram() {
   std::string path = io::JoinPath(testing::XlaSrcRoot(), "tsl", "platform",
                                   "testdata", "test_stderr");
   return tsl::io::AppendDotExeIfWindows(path);
@@ -77,7 +76,7 @@ TEST_F(SubProcessTest, NoOutput) {
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
-  string out, err;
+  std::string out, err;
   int status = proc.Communicate(nullptr, &out, &err);
   EXPECT_TRUE(WIFEXITED(status));
   EXPECT_EQ(0, WEXITSTATUS(status));
@@ -94,7 +93,7 @@ TEST_F(SubProcessTest, Stdout) {
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
-  string out, err;
+  std::string out, err;
   int status = proc.Communicate(nullptr, &out, &err);
   EXPECT_TRUE(WIFEXITED(status));
   EXPECT_EQ(0, WEXITSTATUS(status));
@@ -124,7 +123,7 @@ TEST_F(SubProcessTest, Stderr) {
   proc.SetChannelAction(CHAN_STDERR, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
-  string out, err;
+  std::string out, err;
   int status = proc.Communicate(nullptr, &out, &err);
   EXPECT_TRUE(WIFEXITED(status));
   EXPECT_NE(0, WEXITSTATUS(status));
@@ -151,7 +150,7 @@ TEST_F(SubProcessTest, Stdin) {
   proc.SetChannelAction(CHAN_STDIN, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
-  string in = "foobar\nbarfoo\nhaha\n";
+  std::string in = "foobar\nbarfoo\nhaha\n";
   int status = proc.Communicate(&in, nullptr, nullptr);
   EXPECT_TRUE(WIFEXITED(status));
   EXPECT_EQ(0, WEXITSTATUS(status));
@@ -164,8 +163,8 @@ TEST_F(SubProcessTest, StdinStdout) {
   proc.SetChannelAction(CHAN_STDOUT, ACTION_PIPE);
   EXPECT_TRUE(proc.Start());
 
-  string in = "foobar\nbarfoo\nhaha\n";
-  string out;
+  std::string in = "foobar\nbarfoo\nhaha\n";
+  std::string out;
   int status = proc.Communicate(&in, &out, nullptr);
   EXPECT_TRUE(WIFEXITED(status));
   EXPECT_EQ(0, WEXITSTATUS(status));
@@ -182,7 +181,7 @@ TEST_F(SubProcessTest, StdinChildExit) {
 
   // Verify that the parent handles the child exiting immediately as the
   // parent is trying to write a large string to the child's stdin.
-  string in;
+  std::string in;
   in.reserve(1000000);
   for (int i = 0; i < 100000; i++) {
     in += "hello xyz\n";
@@ -202,13 +201,13 @@ TEST_F(SubProcessTest, StdinStdoutOverlap) {
 
   // Verify that the parent handles multiplexed reading/writing to the child
   // process.  The string is large enough to exceed the buffering of the pipes.
-  string in;
+  std::string in;
   in.reserve(1000000);
   for (int i = 0; i < 100000; i++) {
     in += "hello xyz\n";
   }
 
-  string out;
+  std::string out;
   int status = proc.Communicate(&in, &out, nullptr);
   EXPECT_TRUE(WIFEXITED(status));
   EXPECT_EQ(0, WEXITSTATUS(status));
diff --git a/third_party/xla/xla/tsl/platform/threadpool.cc b/third_party/xla/xla/tsl/platform/threadpool.cc
index 5d176deaaf945f..9d05c94a1cb816 100644
--- a/third_party/xla/xla/tsl/platform/threadpool.cc
+++ b/third_party/xla/xla/tsl/platform/threadpool.cc
@@ -60,13 +60,13 @@ struct EigenEnvironment {
   struct TaskImpl {
     std::function<void()> fn;
     Context context;
-    uint64 trace_id;
+    uint64_t trace_id;
   };
 
   struct Task {
     Task() = default;
 
-    Task(std::function<void()> fn, Context context, uint64 trace_id)
+    Task(std::function<void()> fn, Context context, uint64_t trace_id)
         : f(TaskImpl{std::move(fn), std::move(context), trace_id}) {}
 
     Task(Task&&) = default;
@@ -97,7 +97,7 @@ struct EigenEnvironment {
   }
 
   Task CreateTask(std::function<void()> f) {
-    uint64 id = 0;
+    uint64_t id = 0;
     if (ABSL_PREDICT_FALSE(tracing::EventCollector::IsEnabled())) {
       id = tracing::GetUniqueArg();
       tracing::RecordEvent(tracing::EventCategory::kScheduleClosure, id);

From 46b13f4f274ca4b582144f031aeed359d81b77f9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:33:18 -0800
Subject: [PATCH 411/753] Automated Code Change

PiperOrigin-RevId: 845673018
---
 .../grappler/inputs/file_input_yielder.cc     | 13 +++++-----
 .../core/grappler/inputs/file_input_yielder.h |  4 ++--
 .../trivial_test_graph_input_yielder.cc       |  4 ++--
 tensorflow/core/grappler/inputs/utils.cc      | 13 +++++-----
 tensorflow/core/grappler/inputs/utils.h       | 10 ++++----
 tensorflow/core/grappler/inputs/utils_test.cc | 24 +++++++++----------
 6 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.cc b/tensorflow/core/grappler/inputs/file_input_yielder.cc
index 5d3e91d8dccee1..87fc1d1f141b2e 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.cc
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.cc
@@ -38,7 +38,7 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-FileInputYielder::FileInputYielder(const std::vector<string>& filenames,
+FileInputYielder::FileInputYielder(const std::vector<std::string>& filenames,
                                    size_t max_iterations)
     : filenames_(filenames),
       current_file_(0),
@@ -64,7 +64,7 @@ bool FileInputYielder::NextItem(GrapplerItem* item) {
     }
   }
 
-  const string& filename = filenames_[current_file_];
+  const std::string& filename = filenames_[current_file_];
   ++current_file_;
 
   if (!Env::Default()->FileExists(filename).ok()) {
@@ -97,12 +97,12 @@ bool FileInputYielder::NextItem(GrapplerItem* item) {
     metagraph = MetaGraphDef();
     return NextItem(item);
   } else {
-    std::unordered_set<string> train_ops;
-    for (const string& val :
+    std::unordered_set<std::string> train_ops;
+    for (const std::string& val :
          metagraph.collection_def().at("train_op").node_list().value()) {
       train_ops.insert(NodeName(val));
     }
-    std::unordered_set<string> train_ops_found;
+    std::unordered_set<std::string> train_ops_found;
     for (auto& node : metagraph.graph_def().node()) {
       if (train_ops.find(node.name()) != train_ops.end()) {
         train_ops_found.insert(node.name());
@@ -120,7 +120,8 @@ bool FileInputYielder::NextItem(GrapplerItem* item) {
     }
   }
 
-  const string id = absl::StrCat(Fingerprint64(metagraph.SerializeAsString()));
+  const std::string id =
+      absl::StrCat(Fingerprint64(metagraph.SerializeAsString()));
 
   ItemConfig cfg;
   std::unique_ptr<GrapplerItem> new_item =
diff --git a/tensorflow/core/grappler/inputs/file_input_yielder.h b/tensorflow/core/grappler/inputs/file_input_yielder.h
index f3e9ecb677fdf8..ac1fdb7ac604bc 100644
--- a/tensorflow/core/grappler/inputs/file_input_yielder.h
+++ b/tensorflow/core/grappler/inputs/file_input_yielder.h
@@ -37,12 +37,12 @@ class FileInputYielder : public InputYielder {
   // Iterates over the files specified in the list of 'filename' up to
   // 'max_iterations' times.
   explicit FileInputYielder(
-      const std::vector<string>& filenames,
+      const std::vector<std::string>& filenames,
       size_t max_iterations = std::numeric_limits<size_t>::max());
   bool NextItem(GrapplerItem* item) override;
 
  private:
-  const std::vector<string> filenames_;
+  const std::vector<std::string> filenames_;
   size_t current_file_;
   size_t current_iteration_;
   size_t max_iterations_;
diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
index 7f39582ba663f0..f496d48e28af82 100644
--- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
+++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.cc
@@ -44,7 +44,7 @@ namespace grappler {
 namespace {
 GraphDef CreateGraphDef(int num_stages, int width, int tensor_size,
                         bool use_multiple_devices, bool insert_queue,
-                        const std::vector<string>& device_names) {
+                        const std::vector<std::string>& device_names) {
   using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -102,7 +102,7 @@ GraphDef CreateGraphDef(int num_stages, int width, int tensor_size,
 
 TrivialTestGraphInputYielder::TrivialTestGraphInputYielder(
     int num_stages, int width, int tensor_size, bool insert_queue,
-    const std::vector<string>& device_names)
+    const std::vector<std::string>& device_names)
     : num_stages_(num_stages),
       width_(width),
       tensor_size_(tensor_size),
diff --git a/tensorflow/core/grappler/inputs/utils.cc b/tensorflow/core/grappler/inputs/utils.cc
index 294bb2cead1111..6c6d3be7a25515 100644
--- a/tensorflow/core/grappler/inputs/utils.cc
+++ b/tensorflow/core/grappler/inputs/utils.cc
@@ -28,21 +28,22 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-bool FilesExist(const std::vector<string>& files,
+bool FilesExist(const std::vector<std::string>& files,
                 std::vector<absl::Status>* status) {
   return Env::Default()->FilesExist(files, status);
 }
 
-bool FilesExist(const std::set<string>& files) {
-  return FilesExist(std::vector<string>(files.begin(), files.end()), nullptr);
+bool FilesExist(const std::set<std::string>& files) {
+  return FilesExist(std::vector<std::string>(files.begin(), files.end()),
+                    nullptr);
 }
 
-bool FileExists(const string& file, absl::Status* status) {
+bool FileExists(const std::string& file, absl::Status* status) {
   *status = Env::Default()->FileExists(file);
   return status->ok();
 }
 
-absl::Status ReadGraphDefFromFile(const string& graph_def_path,
+absl::Status ReadGraphDefFromFile(const std::string& graph_def_path,
                                   GraphDef* result) {
   absl::Status status;
   if (!ReadBinaryProto(Env::Default(), graph_def_path, result).ok()) {
@@ -51,7 +52,7 @@ absl::Status ReadGraphDefFromFile(const string& graph_def_path,
   return status;
 }
 
-absl::Status ReadMetaGraphDefFromFile(const string& graph_def_path,
+absl::Status ReadMetaGraphDefFromFile(const std::string& graph_def_path,
                                       MetaGraphDef* result) {
   absl::Status status;
   if (!ReadBinaryProto(Env::Default(), graph_def_path, result).ok()) {
diff --git a/tensorflow/core/grappler/inputs/utils.h b/tensorflow/core/grappler/inputs/utils.h
index 9caefcd836c171..50a35211149f15 100644
--- a/tensorflow/core/grappler/inputs/utils.h
+++ b/tensorflow/core/grappler/inputs/utils.h
@@ -29,18 +29,18 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-bool FilesExist(const std::vector<string>& files,
+bool FilesExist(const std::vector<std::string>& files,
                 std::vector<absl::Status>* status = nullptr);
-bool FilesExist(const std::set<string>& files);
+bool FilesExist(const std::set<std::string>& files);
 
-bool FileExists(const string& file, absl::Status* status);
+bool FileExists(const std::string& file, absl::Status* status);
 
 // Reads GraphDef from file in either text or raw serialized format.
-absl::Status ReadGraphDefFromFile(const string& graph_def_path,
+absl::Status ReadGraphDefFromFile(const std::string& graph_def_path,
                                   GraphDef* result);
 
 // Reads MetaGraphDef from file in either text or raw serialized format.
-absl::Status ReadMetaGraphDefFromFile(const string& meta_graph_def_path,
+absl::Status ReadMetaGraphDefFromFile(const std::string& meta_graph_def_path,
                                       MetaGraphDef* result);
 
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/inputs/utils_test.cc b/tensorflow/core/grappler/inputs/utils_test.cc
index b32229a051fa86..ff2e14fc930244 100644
--- a/tensorflow/core/grappler/inputs/utils_test.cc
+++ b/tensorflow/core/grappler/inputs/utils_test.cc
@@ -34,7 +34,7 @@ namespace {
 
 class UtilsTest : public ::testing::Test {
  protected:
-  string BaseDir() { return io::JoinPath(testing::TmpDir(), "base_dir"); }
+  std::string BaseDir() { return io::JoinPath(testing::TmpDir(), "base_dir"); }
 
   void SetUp() override {
     TF_CHECK_OK(env_->CreateDir(BaseDir()));
@@ -70,24 +70,24 @@ class UtilsTest : public ::testing::Test {
 
   GraphDef graph_def_;
   MetaGraphDef meta_graph_def_;
-  string non_existent_file_;
-  string actual_file_;
-  string text_graph_def_file_;
-  string binary_graph_def_file_;
-  string text_meta_graph_def_file_;
-  string binary_meta_graph_def_file_;
+  std::string non_existent_file_;
+  std::string actual_file_;
+  std::string text_graph_def_file_;
+  std::string binary_graph_def_file_;
+  std::string text_meta_graph_def_file_;
+  std::string binary_meta_graph_def_file_;
   Env* env_ = Env::Default();
 };
 
 TEST_F(UtilsTest, FilesExist) {
-  EXPECT_FALSE(FilesExist(std::vector<string>{{non_existent_file_}}));
-  EXPECT_FALSE(
-      FilesExist(std::vector<string>{{non_existent_file_}, {actual_file_}}));
-  EXPECT_TRUE(FilesExist(std::vector<string>{{actual_file_}}));
+  EXPECT_FALSE(FilesExist(std::vector<std::string>{{non_existent_file_}}));
+  EXPECT_FALSE(FilesExist(
+      std::vector<std::string>{{non_existent_file_}, {actual_file_}}));
+  EXPECT_TRUE(FilesExist(std::vector<std::string>{{actual_file_}}));
 
   std::vector<absl::Status> status;
   EXPECT_FALSE(FilesExist(
-      std::vector<string>{{non_existent_file_}, {actual_file_}}, &status));
+      std::vector<std::string>{{non_existent_file_}, {actual_file_}}, &status));
   EXPECT_EQ(status.size(), 2);
   EXPECT_FALSE(status[0].ok());
   EXPECT_TRUE(status[1].ok());

From fa3e8b7b4b34846c63164a585e19be0e32cbbb11 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:37:24 -0800
Subject: [PATCH 412/753] Automated Code Change

PiperOrigin-RevId: 845674461
---
 tensorflow/core/lib/png/png_io.cc | 23 ++++++++++++-----------
 tensorflow/core/lib/png/png_io.h  |  2 +-
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index b16584be2f3da5..82bff12556d89c 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -53,8 +53,8 @@ namespace {
                                        (del)))
 
 // Convert from 8 bit components to 16. This works in-place.
-static void Convert8to16(const uint8* p8, int num_comps, int p8_row_bytes,
-                         int width, int height_in, uint16* p16,
+static void Convert8to16(const uint8_t* p8, int num_comps, int p8_row_bytes,
+                         int width, int height_in, uint16_t* p16,
                          int p16_row_bytes) {
   // Force height*row_bytes computations to use 64 bits. Height*width is
   // enforced to < 29 bits in decode_png_op.cc, but height*row_bytes is
@@ -64,17 +64,18 @@ static void Convert8to16(const uint8* p8, int num_comps, int p8_row_bytes,
 
   // Adjust pointers to copy backwards
   width *= num_comps;
-  CPTR_INC(uint8, p8, (height - 1) * p8_row_bytes + (width - 1) * sizeof(*p8));
-  PTR_INC(uint16, p16,
+  CPTR_INC(uint8_t, p8,
+           (height - 1) * p8_row_bytes + (width - 1) * sizeof(*p8));
+  PTR_INC(uint16_t, p16,
           (height - 1) * p16_row_bytes + (width - 1) * sizeof(*p16));
   int bump8 = width * sizeof(*p8) - p8_row_bytes;
   int bump16 = width * sizeof(*p16) - p16_row_bytes;
   for (; height-- != 0;
-       CPTR_INC(uint8, p8, bump8), PTR_INC(uint16, p16, bump16)) {
+       CPTR_INC(uint8_t, p8, bump8), PTR_INC(uint16_t, p16, bump16)) {
     for (int w = width; w-- != 0; --p8, --p16) {
-      uint32 pix = *p8;
+      uint32_t pix = *p8;
       pix |= pix << 8;
-      *p16 = static_cast<uint16>(pix);
+      *p16 = static_cast<uint16_t>(pix);
     }
   }
 }
@@ -229,7 +230,7 @@ bool CommonInitDecode(absl::string_view png_string, int desired_channels,
     CommonFreeDecode(context);
     return false;
   }
-  context->data = absl::bit_cast<const uint8*>(png_string.data());
+  context->data = absl::bit_cast<const uint8_t*>(png_string.data());
   context->data_left = png_string.size();
   png_set_read_fn(context->png_ptr, context, StringReader);
   png_read_info(context->png_ptr, context->info_ptr);
@@ -342,9 +343,9 @@ bool CommonFinishDecode(png_bytep data, int row_bytes, DecodeContext* context) {
 
   // Synthesize 16 bits from 8 if requested.
   if (context->need_to_synthesize_16)
-    Convert8to16(absl::bit_cast<uint8*>(data), context->channels, row_bytes,
-                 context->width, context->height, absl::bit_cast<uint16*>(data),
-                 row_bytes);
+    Convert8to16(absl::bit_cast<uint8_t*>(data), context->channels, row_bytes,
+                 context->width, context->height,
+                 absl::bit_cast<uint16_t*>(data), row_bytes);
   return ok;
 }
 
diff --git a/tensorflow/core/lib/png/png_io.h b/tensorflow/core/lib/png/png_io.h
index a7fff84c1961ef..71d14546613328 100644
--- a/tensorflow/core/lib/png/png_io.h
+++ b/tensorflow/core/lib/png/png_io.h
@@ -45,7 +45,7 @@ namespace png {
 
 // Handy container for decoding information and struct pointers
 struct DecodeContext {
-  const uint8* data;
+  const uint8_t* data;
   int data_left;
   png_structp png_ptr;
   png_infop info_ptr;

From d536c1c3fb09a60ed721ff2770bdb3ce37328678 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:38:50 -0800
Subject: [PATCH 413/753] Automated Code Change

PiperOrigin-RevId: 845674996
---
 .../xla/xla/tsl/util/device_name_utils.cc     | 43 +++++++--------
 .../xla/xla/tsl/util/device_name_utils.h      |  5 +-
 .../xla/tsl/util/device_name_utils_test.cc    | 53 +++++++++++--------
 third_party/xla/xla/tsl/util/env_var.cc       | 18 +++----
 third_party/xla/xla/tsl/util/reporter.h       | 20 +++----
 5 files changed, 74 insertions(+), 65 deletions(-)

diff --git a/third_party/xla/xla/tsl/util/device_name_utils.cc b/third_party/xla/xla/tsl/util/device_name_utils.cc
index a79551b6b7f080..549bf3f9f7c933 100644
--- a/third_party/xla/xla/tsl/util/device_name_utils.cc
+++ b/third_party/xla/xla/tsl/util/device_name_utils.cc
@@ -43,7 +43,7 @@ static bool IsJobName(absl::string_view in) {
          std::all_of(in.begin(), in.end(), IsAlphaNumOrUnderscore);
 }
 
-static bool ConsumePrefix(absl::string_view* in, string* out,
+static bool ConsumePrefix(absl::string_view* in, std::string* out,
                           absl::string_view prefix_terminators) {
   if (in->empty() || !absl::ascii_isalpha(in->front())) {
     return false;
@@ -60,13 +60,13 @@ static bool ConsumePrefix(absl::string_view* in, string* out,
 }
 
 // Returns true and fills in "*job" iff "*in" starts with a job name.
-static bool ConsumeJobName(absl::string_view* in, string* job) {
+static bool ConsumeJobName(absl::string_view* in, std::string* job) {
   return ConsumePrefix(in, job, "/");
 }
 
 // Returns true and fills in "*device_type" iff "*in" starts with a device type
 // name.
-static bool ConsumeDeviceType(absl::string_view* in, string* device_type) {
+static bool ConsumeDeviceType(absl::string_view* in, std::string* device_type) {
   return ConsumePrefix(in, device_type, "/:");
 }
 
@@ -95,14 +95,15 @@ static std::string DeviceName(absl::string_view job, int replica, int task,
 }
 
 /* static */
-string DeviceNameUtils::FullName(const string& job, int replica, int task,
-                                 const string& type, int id) {
+std::string DeviceNameUtils::FullName(const std::string& job, int replica,
+                                      int task, const std::string& type,
+                                      int id) {
   return DeviceName(job, replica, task, "/device:", type, id);
 }
 
 namespace {
-string LegacyName(const string& job, int replica, int task, const string& type,
-                  int id) {
+std::string LegacyName(const std::string& job, int replica, int task,
+                       const std::string& type, int id) {
   return DeviceName(job, replica, task, "/", absl::AsciiStrToLower(type), id);
 }
 }  // anonymous namespace
@@ -214,9 +215,9 @@ void CompleteName(const DeviceNameUtils::ParsedName& parsed_basename,
 }  // namespace
 
 /* static */
-absl::Status DeviceNameUtils::CanonicalizeDeviceName(absl::string_view fullname,
-                                                     absl::string_view basename,
-                                                     string* canonical_name) {
+absl::Status DeviceNameUtils::CanonicalizeDeviceName(
+    absl::string_view fullname, absl::string_view basename,
+    std::string* canonical_name) {
   *canonical_name = "";
   ParsedName parsed_basename;
   if (!ParseFullName(basename, &parsed_basename)) {
@@ -247,8 +248,8 @@ absl::Status DeviceNameUtils::CanonicalizeDeviceName(absl::string_view fullname,
 }
 
 /* static */
-string DeviceNameUtils::ParsedNameToString(const ParsedName& pn) {
-  string buf;
+std::string DeviceNameUtils::ParsedNameToString(const ParsedName& pn) {
+  std::string buf;
   if (pn.has_job) {
     absl::StrAppend(&buf, "/job:", pn.job);
   }
@@ -533,20 +534,20 @@ const DeviceNameUtils::ParsedName DeviceNameUtils::AddressSpace(
 }
 
 /* static */
-string DeviceNameUtils::LocalName(absl::string_view type, int id) {
+std::string DeviceNameUtils::LocalName(absl::string_view type, int id) {
   return absl::StrCat("/device:", type, ":", id);
 }
 
 namespace {
 // Returns the legacy local device name given its "type" and "id" (which is
 // '/device:type:id').
-string LegacyLocalName(absl::string_view type, int id) {
+std::string LegacyLocalName(absl::string_view type, int id) {
   return absl::StrCat(type, ":", id);
 }
 }  // anonymous namespace
 
 /* static */
-string DeviceNameUtils::LocalName(absl::string_view fullname) {
+std::string DeviceNameUtils::LocalName(absl::string_view fullname) {
   ParsedName x;
   CHECK(ParseFullName(fullname, &x)) << fullname;
   return LocalName(x.type, x.id);
@@ -569,8 +570,8 @@ bool DeviceNameUtils::ParseLocalName(absl::string_view name, ParsedName* p) {
 }
 
 /* static */
-bool DeviceNameUtils::SplitDeviceName(absl::string_view name, string* task,
-                                      string* device) {
+bool DeviceNameUtils::SplitDeviceName(absl::string_view name, std::string* task,
+                                      std::string* device) {
   ParsedName pn;
   if (ParseFullName(name, &pn) && pn.has_type && pn.has_id) {
     task->clear();
@@ -595,7 +596,7 @@ bool DeviceNameUtils::SplitDeviceName(absl::string_view name, string* task,
 }
 
 /* static */
-bool DeviceNameUtils::GetTaskName(const ParsedName& pn, string* task) {
+bool DeviceNameUtils::GetTaskName(const ParsedName& pn, std::string* task) {
   if (pn.has_job && pn.has_replica && pn.has_task) {
     task->clear();
     task->reserve((5 + pn.job.size()) +
@@ -609,7 +610,7 @@ bool DeviceNameUtils::GetTaskName(const ParsedName& pn, string* task) {
   return false;
 }
 
-std::vector<string> DeviceNameUtils::GetNamesForDeviceMappings(
+std::vector<std::string> DeviceNameUtils::GetNamesForDeviceMappings(
     const ParsedName& pn) {
   if (pn.has_job && pn.has_replica && pn.has_task && pn.has_type && pn.has_id) {
     return {
@@ -619,7 +620,7 @@ std::vector<string> DeviceNameUtils::GetNamesForDeviceMappings(
   return {};
 }
 
-std::vector<string> DeviceNameUtils::GetLocalNamesForDeviceMappings(
+std::vector<std::string> DeviceNameUtils::GetLocalNamesForDeviceMappings(
     const ParsedName& pn) {
   if (pn.has_type && pn.has_id) {
     return {DeviceNameUtils::LocalName(pn.type, pn.id),
@@ -629,7 +630,7 @@ std::vector<string> DeviceNameUtils::GetLocalNamesForDeviceMappings(
 }
 
 /*static*/ absl::Status DeviceNameUtils::DeviceNameToCpuDeviceName(
-    const string& device_name, string* host_device_name) {
+    const std::string& device_name, std::string* host_device_name) {
   DeviceNameUtils::ParsedName device;
   if (!DeviceNameUtils::ParseFullName(device_name, &device)) {
     return errors::Internal("Could not parse device name ", device_name);
diff --git a/third_party/xla/xla/tsl/util/device_name_utils.h b/third_party/xla/xla/tsl/util/device_name_utils.h
index ae012263571962..b431037c038848 100644
--- a/third_party/xla/xla/tsl/util/device_name_utils.h
+++ b/third_party/xla/xla/tsl/util/device_name_utils.h
@@ -273,12 +273,13 @@ class DeviceNameUtils {
   // Returns canonical and legacy full names for the given parsed
   // device name 'pn'. The returned string names are often useful to
   // look up devices from a mapping.
-  static std::vector<string> GetNamesForDeviceMappings(const ParsedName& pn);
+  static std::vector<std::string> GetNamesForDeviceMappings(
+      const ParsedName& pn);
 
   // Returns canonical and legacy local names for the given parsed device name
   // 'pn'. The returned string names are often useful to look up devices from a
   // mapping.
-  static std::vector<string> GetLocalNamesForDeviceMappings(
+  static std::vector<std::string> GetLocalNamesForDeviceMappings(
       const ParsedName& pn);
 
   // Returns name of the CPU:0 device on the same host as the device
diff --git a/third_party/xla/xla/tsl/util/device_name_utils_test.cc b/third_party/xla/xla/tsl/util/device_name_utils_test.cc
index 756c1635a18c8a..3dd25347fd0342 100644
--- a/third_party/xla/xla/tsl/util/device_name_utils_test.cc
+++ b/third_party/xla/xla/tsl/util/device_name_utils_test.cc
@@ -29,20 +29,22 @@ namespace tsl {
 
 namespace {
 
-bool RoundTripParsedName(const string& original, const string& expected) {
+bool RoundTripParsedName(const std::string& original,
+                         const std::string& expected) {
   DeviceNameUtils::ParsedName p;
   if (!DeviceNameUtils::ParseFullName(original, &p)) {
     return false;
   }
-  string round_tripped = DeviceNameUtils::ParsedNameToString(p);
+  std::string round_tripped = DeviceNameUtils::ParsedNameToString(p);
   return (round_tripped == expected);
 }
 
 enum NamePart { kJob = 0x01, kReplica = 0x02, kTask = 0x04, kDevice = 0x08 };
 
-bool RoundTripPartialName(int parts_to_test, const std::vector<string>& parts,
+bool RoundTripPartialName(int parts_to_test,
+                          const std::vector<std::string>& parts,
                           bool explicitDevice) {
-  string original, expected;
+  std::string original, expected;
   if (parts_to_test & kJob) {
     absl::StrAppend(&original, "/job:", parts[0]);
     absl::StrAppend(&expected, "/job:", parts[0]);
@@ -370,8 +372,8 @@ TEST(DeviceNameUtilsTest, IsSpecification) {
 }
 
 TEST(DeviceNameUtilsTest, SplitDeviceName) {
-  string task;
-  string device;
+  std::string task;
+  std::string device;
   EXPECT_TRUE(DeviceNameUtils::SplitDeviceName(
       "/job:foo/replica:1/task:2/cpu:1", &task, &device));
   EXPECT_EQ("/job:foo/replica:1/task:2", task);
@@ -393,14 +395,15 @@ TEST(DeviceNameUtilsTest, SplitDeviceName) {
   EXPECT_EQ("myspecialdevice:3", device);
 }
 
-static DeviceNameUtils::ParsedName Name(const string& str) {
+static DeviceNameUtils::ParsedName Name(const std::string& str) {
   DeviceNameUtils::ParsedName ret;
   CHECK(DeviceNameUtils::ParseFullName(str, &ret)) << "Invalid name: " << str;
   return ret;
 }
 
-static void MergeDevNamesHelperImpl(const string& name_a, const string& name_b,
-                                    const string& expected_merge_name,
+static void MergeDevNamesHelperImpl(const std::string& name_a,
+                                    const std::string& name_b,
+                                    const std::string& expected_merge_name,
                                     bool allow_soft_placement) {
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   TF_EXPECT_OK(DeviceNameUtils::MergeDevNames(&target_a, Name(name_b),
@@ -413,27 +416,30 @@ static void MergeDevNamesHelperImpl(const string& name_a, const string& name_b,
   EXPECT_EQ(target_b, Name(expected_merge_name));
 }
 
-static void MergeDevNamesHelper(const string& name_a, const string& name_b,
-                                const string& expected_merge_name) {
+static void MergeDevNamesHelper(const std::string& name_a,
+                                const std::string& name_b,
+                                const std::string& expected_merge_name) {
   MergeDevNamesHelperImpl(name_a, name_b, expected_merge_name, false);
 }
 
 static void MergeDevNamesHelperAllowSoftPlacement(
-    const string& name_a, const string& name_b,
-    const string& expected_merge_name) {
+    const std::string& name_a, const std::string& name_b,
+    const std::string& expected_merge_name) {
   MergeDevNamesHelperImpl(name_a, name_b, expected_merge_name, true);
 }
 
-static void MergeDevNamesError(const string& name_a, const string& name_b,
-                               const string& expected_error_substr) {
+static void MergeDevNamesError(const std::string& name_a,
+                               const std::string& name_b,
+                               const std::string& expected_error_substr) {
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   absl::Status s = DeviceNameUtils::MergeDevNames(&target_a, Name(name_b));
   EXPECT_EQ(s.code(), error::INVALID_ARGUMENT);
   EXPECT_TRUE(absl::StrContains(s.message(), expected_error_substr)) << s;
 }
 
-static void MergeOverrideHelper(const string& target, const string& name,
-                                const string& expected_merge_name) {
+static void MergeOverrideHelper(const std::string& target,
+                                const std::string& name,
+                                const std::string& expected_merge_name) {
   DeviceNameUtils::ParsedName parsed_target = Name(target);
   TF_EXPECT_OK(
       DeviceNameUtils::MergeOverrideDevNames(&parsed_target, Name(name)));
@@ -445,9 +451,10 @@ static void MergeOverrideHelper(const string& target, const string& name,
       << DeviceNameUtils::ParsedNameToString(parsed_expected);
 }
 
-static void MergeUnsetDevNamesHelper(const string& name_a, const string& name_b,
-                                     const string& expected_merge_name_ab,
-                                     const string& expected_merge_name_ba) {
+static void MergeUnsetDevNamesHelper(
+    const std::string& name_a, const std::string& name_b,
+    const std::string& expected_merge_name_ab,
+    const std::string& expected_merge_name_ba) {
   DeviceNameUtils::ParsedName target_a = Name(name_a);
   DeviceNameUtils::MergeUnsetDevNames(&target_a, Name(name_b));
   EXPECT_EQ(target_a, Name(expected_merge_name_ab));
@@ -592,10 +599,10 @@ TEST(DeviceNameUtilsTest, GetNamesForDeviceMappings) {
 }
 
 TEST(DeviceNameUtilsTest, CanonicalizeDeviceName) {
-  string canonical_name;
+  std::string canonical_name;
   {
     // Good basename.
-    string basename = "/job:foo/replica:10/task:0/device:CPU:0";
+    std::string basename = "/job:foo/replica:10/task:0/device:CPU:0";
     TF_EXPECT_OK(DeviceNameUtils::CanonicalizeDeviceName(
         "/job:foo/replica:10/task:0/device:CPU:1", basename, &canonical_name));
     EXPECT_EQ("/job:foo/replica:10/task:0/device:CPU:1", canonical_name);
@@ -616,7 +623,7 @@ TEST(DeviceNameUtilsTest, CanonicalizeDeviceName) {
 
   {
     // Try out malformed basenames.
-    string fullname = "/device:CPU:0";
+    std::string fullname = "/device:CPU:0";
 
     absl::Status s = DeviceNameUtils::CanonicalizeDeviceName(
         fullname, "/device:CPU:0", &canonical_name);
diff --git a/third_party/xla/xla/tsl/util/env_var.cc b/third_party/xla/xla/tsl/util/env_var.cc
index a92cc27d3365ea..351a8e3d3c3bae 100644
--- a/third_party/xla/xla/tsl/util/env_var.cc
+++ b/third_party/xla/xla/tsl/util/env_var.cc
@@ -30,11 +30,11 @@ namespace tsl {
 absl::Status ReadBoolFromEnvVar(absl::string_view env_var_name,
                                 bool default_val, bool* value) {
   *value = default_val;
-  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
+  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
   if (tf_env_var_val == nullptr) {
     return absl::OkStatus();
   }
-  string str_value = absl::AsciiStrToLower(tf_env_var_val);
+  std::string str_value = absl::AsciiStrToLower(tf_env_var_val);
   if (str_value == "0" || str_value == "false") {
     *value = false;
     return absl::OkStatus();
@@ -51,7 +51,7 @@ absl::Status ReadBoolFromEnvVar(absl::string_view env_var_name,
 absl::Status ReadInt64FromEnvVar(absl::string_view env_var_name,
                                  int64_t default_val, int64_t* value) {
   *value = default_val;
-  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
+  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
   if (tf_env_var_val == nullptr) {
     return absl::OkStatus();
   }
@@ -66,7 +66,7 @@ absl::Status ReadInt64FromEnvVar(absl::string_view env_var_name,
 absl::Status ReadFloatFromEnvVar(absl::string_view env_var_name,
                                  float default_val, float* value) {
   *value = default_val;
-  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
+  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
   if (tf_env_var_val == nullptr) {
     return absl::OkStatus();
   }
@@ -80,20 +80,20 @@ absl::Status ReadFloatFromEnvVar(absl::string_view env_var_name,
 
 absl::Status ReadStringFromEnvVar(absl::string_view env_var_name,
                                   absl::string_view default_val,
-                                  string* value) {
-  const char* tf_env_var_val = getenv(string(env_var_name).c_str());
+                                  std::string* value) {
+  const char* tf_env_var_val = getenv(std::string(env_var_name).c_str());
   if (tf_env_var_val != nullptr) {
     *value = tf_env_var_val;
   } else {
-    *value = string(default_val);
+    *value = std::string(default_val);
   }
   return absl::OkStatus();
 }
 
 absl::Status ReadStringsFromEnvVar(absl::string_view env_var_name,
                                    absl::string_view default_val,
-                                   std::vector<string>* value) {
-  string str_val;
+                                   std::vector<std::string>* value) {
+  std::string str_val;
   TF_RETURN_IF_ERROR(ReadStringFromEnvVar(env_var_name, default_val, &str_val));
   *value = str_util::Split(str_val, ',');
   return absl::OkStatus();
diff --git a/third_party/xla/xla/tsl/util/reporter.h b/third_party/xla/xla/tsl/util/reporter.h
index 25896589ca61ba..e5eae477f141f5 100644
--- a/third_party/xla/xla/tsl/util/reporter.h
+++ b/third_party/xla/xla/tsl/util/reporter.h
@@ -32,14 +32,14 @@ namespace tsl {
 class TestReportFile {
  public:
   // Create a TestReportFile with the test name 'test_name'.
-  TestReportFile(const string& fname, const string& test_name);
+  TestReportFile(const std::string& fname, const std::string& test_name);
 
   // Initialize the TestReportFile.  If the reporting env flag is set,
   // try to create the reporting file.  Fails if the file already exists.
   absl::Status Initialize();
 
   // Append the report file w/ 'content'.
-  absl::Status Append(const string& content);
+  absl::Status Append(const std::string& content);
 
   // Close the report file.
   absl::Status Close();
@@ -50,8 +50,8 @@ class TestReportFile {
 
  private:
   bool closed_;
-  string fname_;
-  string test_name_;
+  std::string fname_;
+  std::string test_name_;
   std::unique_ptr<WritableFile> log_file_;
   TestReportFile(const TestReportFile&) = delete;
   void operator=(const TestReportFile&) = delete;
@@ -82,11 +82,11 @@ class TestReporter {
   static constexpr const char* kTestReporterEnv = "TEST_REPORT_FILE_PREFIX";
 
   // Create a TestReporter with the test name 'test_name'.
-  explicit TestReporter(const string& test_name)
+  explicit TestReporter(const std::string& test_name)
       : TestReporter(GetLogEnv(), test_name) {}
 
   // Provide a prefix filename, mostly used for testing this class.
-  TestReporter(const string& fname, const string& test_name);
+  TestReporter(const std::string& fname, const std::string& test_name);
 
   // Initialize the TestReporter.  If the reporting env flag is set,
   // try to create the reporting file.  Fails if the file already exists.
@@ -106,19 +106,19 @@ class TestReporter {
                          double throughput);
 
   // Set property on Benchmark to the given value.
-  absl::Status SetProperty(const string& name, double value);
+  absl::Status SetProperty(const std::string& name, double value);
 
   // Set property on Benchmark to the given value.
-  absl::Status SetProperty(const string& name, const string& value);
+  absl::Status SetProperty(const std::string& name, const std::string& value);
 
   // Add the given value to the metrics on the Benchmark.
-  absl::Status AddMetric(const string& name, double value);
+  absl::Status AddMetric(const std::string& name, double value);
 
   // TODO(b/32704451): Don't just ignore the ::tensorflow::Status object!
   ~TestReporter() { Close().IgnoreError(); }  // Autoclose in destructor.
 
  private:
-  static string GetLogEnv() {
+  static std::string GetLogEnv() {
     const char* fname_ptr = getenv(kTestReporterEnv);
     return (fname_ptr != nullptr) ? fname_ptr : "";
   }

From 145b45691c3cb0fce088187872c3852ba0a9247b Mon Sep 17 00:00:00 2001
From: Olli Lupton <olupton@nvidia.com>
Date: Wed, 17 Dec 2025 02:41:12 -0800
Subject: [PATCH 414/753] PR #35330: Respect print_metadata option when dumping
 HLO
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35330

📝 Summary of Changes
https://github.com/openxla/xla/pull/34060 caused more stack trace metadata to be included in the text representation that is used when calculating HLO module fingerprints. This trickles through into the `fingerprint_before_lhs` attribute, which is used by profiler tooling to group executions of the same program across different processes.

🎯 Justification
Without this change, getting fingerprints to match across processes is unnecessarily difficult, requiring that (for JAX programs) all absolute Python paths and stack frames strictly match - including the parts outside the JIT.

🚀 Kind of Contribution
🐛 Bug Fix, 🧪 Tests

📊 Benchmark (for Performance Improvements)
n/a

🧪 Unit Tests:
New unit test in `xla/service/gpu/gpu_hlo_schedule_test.cc` verifying that the fingerprint is independent of the metadata.

🧪 Execution Tests:
n/a
Copybara import of the project:

--
9fd2ef7084f1152ebdb15757c37c0bd9e0e9e3ad by Olli Lupton <olupton@nvidia.com>:

Test fingerprint_before_lhs is independent of metadata

Otherwise an identical computation will get a different fingerprint if
any part of the metadata differs. For JAX programs, this includes the
full Python stack trace, with full path names, above the @jax.jit-ed
function. This fingerprint is used by profiler tooling to group
executions of the same program across different processes.

Merging this change closes #35330

PiperOrigin-RevId: 845675780
---
 .../xla/service/gpu/gpu_hlo_schedule_test.cc  | 49 ++++++++++++++++++-
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
index c4669b077b1f42..4b78ae2e0fe441 100644
--- a/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -124,12 +124,19 @@ class GpuHloScheduleTest : public HloTestBase {
                                        GetModuleConfig(test_config));
   }
 
-  static bool HasValidFingerprint(HloModule* module) {
+  static std::optional<std::string> ValidFingerprint(HloModule* module) {
     // Verify that the fingerprint of HLO prior to LHS is present.
     const FrontendAttributes& attrs = module->frontend_attributes();
     auto it = attrs.map().find(kFingerprintBeforeLHS);
     // The fingerprint is 128 bits stored as a hex string (128/4 hex digits).
-    return it != attrs.map().end() && it->second.size() == 128 / 4;
+    if (it != attrs.map().end() && it->second.size() == 128 / 4) {
+      return it->second;
+    }
+    return std::nullopt;
+  }
+
+  static bool HasValidFingerprint(HloModule* module) {
+    return ValidFingerprint(module).has_value();
   }
 };
 
@@ -1792,6 +1799,44 @@ TEST_F(GpuHloScheduleTest, AsyncOps) {
                           HloOpcode::kAsyncDone, HloOpcode::kAdd));
 }
 
+TEST_F(GpuHloScheduleTest, MetadataIgnoredInFingerprint) {
+  absl::string_view hlo = R"(
+HloModule test
+
+FileNames
+1 "$0"
+
+FunctionNames
+1 "<module>"
+
+FileLocations
+1 {file_name_id=1 function_name_id=1 line=1 end_line=2 column=0 end_column=1}
+
+StackFrames
+1 {file_location_id=1 parent_frame_id=1}
+
+fused_computation {
+  param_0 = f32[1024,1024]{1,0} parameter(0)
+  ROOT exponential.1 = f32[1024,1024]{1,0} exponential(param_0), metadata={stack_frame_id=1}
+}
+
+ENTRY e {
+  p = f32[1024,1024]{1,0} parameter(0)
+  ROOT wrapped_exp = f32[1024,1024]{1,0} fusion(p), kind=kLoop, calls=fused_computation
+})";
+  ASSERT_OK_AND_ASSIGN(auto mod1, ParseAndReturnVerifiedModule(
+                                      absl::Substitute(hlo, "filename1.py")));
+  ASSERT_OK_AND_ASSIGN(auto mod2, ParseAndReturnVerifiedModule(
+                                      absl::Substitute(hlo, "filename2.py")));
+  CHECK_OK(ScheduleGpuModule(mod1.get()).status());
+  CHECK_OK(ScheduleGpuModule(mod2.get()).status());
+  const std::optional<std::string> fp1 = ValidFingerprint(mod1.get());
+  const std::optional<std::string> fp2 = ValidFingerprint(mod2.get());
+  EXPECT_TRUE(fp1.has_value());
+  EXPECT_TRUE(fp2.has_value());
+  EXPECT_EQ(*fp1, *fp2);
+}
+
 // This test verifies that the latency hiding scheduler overlaps host memory
 // offloading (copy-start/copy-done) with computation.
 TEST_P(GpuHloScheduleParameterizedTest, CopyStartDoneScheduled) {

From 87cc79dddaacb9a2a3d8fabf4dd5181334cbb47b Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Wed, 17 Dec 2025 02:41:54 -0800
Subject: [PATCH 415/753] [XLA:GPU] Prepare to fuse iota into sort.

As a first step, codegen the iota operands in the first bitonic sort iteration.
Later we will introduce a sort fusion type that allows to fuse iota operands into
sort. The new codegen capabilities will not be used yet, as single iotas would
be wrapped into a fusion, so we would never find iota operands of sort.

PiperOrigin-RevId: 845676002
---
 .../xla/xla/backends/gpu/codegen/llvm/BUILD   |   4 +
 .../backends/gpu/codegen/llvm/llvm_emitter.cc |  11 +-
 .../backends/gpu/codegen/llvm/sort_util.cc    | 149 ++++++-----
 .../xla/backends/gpu/codegen/llvm/sort_util.h |   8 +-
 .../xla/xla/service/elemental_ir_emitter.cc   | 237 ++++++++++--------
 .../xla/xla/service/elemental_ir_emitter.h    |   9 +-
 6 files changed, 244 insertions(+), 174 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
index 45c42e67aadd6d..30a3ac1a965948 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
@@ -79,6 +79,8 @@ cc_library(
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:elemental_ir_emitter",
         "//xla/service/gpu:ir_emission_utils",
         "//xla/service/gpu:launch_dimensions",
         "//xla/service/gpu:target_util",
@@ -87,7 +89,9 @@ cc_library(
         "//xla/service/llvm_ir:llvm_util",
         "//xla/service/llvm_ir:loop_emitter",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Core",
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
index 4192e2898a6c84..1fcbeb98e9d9ff 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
@@ -251,6 +251,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
 
   llvm::IRBuilderBase* builder() { return &b_; }
 
+  llvm::Module* module() { return module_; }
+
   // Generate the code for the computation passed in the constructor, if it
   // wasn't already generated previously.
   // As well as generting the code for the function, emits code for global
@@ -855,6 +857,7 @@ absl::StatusOr<ThunkSequence> EmitBitonicSortLLVMIR(
   VLOG(2) << absl::StreamFormat("%s launch dims: %d blocks, %d threads/block",
                                 op_name, num_blocks, kThreadsPerBlock);
   ThunkSequence thunks;
+  bool emit_iota_operands = true;
   auto emit_kernel = [&](absl::Span<const int64_t> xor_masks) {
     VLOG(2) << absl::StreamFormat(
         "%s uses kernel for xor masks [%s]", op_name,
@@ -883,9 +886,9 @@ absl::StatusOr<ThunkSequence> EmitBitonicSortLLVMIR(
 
     auto* comparator = sort->called_computations().front();
     auto* builder = ir_emitter.builder();
-    return llvm_ir::EmitSortInPlace(
-        dimension_to_sort, output_arrays_span, llvm_ir::IrName(op_name),
-        xor_masks, ir_emitter.builder(), launch_dimensions,
+    auto result = llvm_ir::EmitSortInPlace(
+        sort, output_arrays_span, emit_iota_operands, llvm_ir::IrName(op_name),
+        xor_masks, ir_emitter.module(), ir_emitter.builder(), launch_dimensions,
         xor_masks.size() > 1 ? num_iterations_in_sort_dim
                              : standard_num_iterations_in_sort_dim,
         tile_size, kUnrollFactor,
@@ -894,6 +897,8 @@ absl::StatusOr<ThunkSequence> EmitBitonicSortLLVMIR(
                                        llvm_module, *comparator, operands,
                                        output);
         });
+    emit_iota_operands = false;
+    return result;
   };
   std::vector<int64_t> xor_masks;
   for (int64_t stage = 0; stage < num_stages; ++stage) {
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
index 9332d9fa5ce937..40d88f803841db 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
@@ -34,8 +35,12 @@ limitations under the License.
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "xla/backends/gpu/codegen/llvm/parallel_loop_emitter.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/layout_util.h"
 #include "xla/primitive_util.h"
+#include "xla/service/elemental_ir_emitter.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/gpu/target_util.h"
@@ -48,6 +53,7 @@ limitations under the License.
 #include "xla/tsl/platform/errors.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace llvm_ir {
@@ -59,7 +65,8 @@ absl::Status EmitCompareLoopBody(
     int64_t iteration_bound, int64_t num_threads, int64_t unroll_factor,
     int64_t num_values, llvm::Value* element_pair_index, int64_t xor_mask,
     llvm::Type* index_type,
-    std::function<llvm::Value*(int64_t operand, llvm::Value* index)>
+    std::function<absl::StatusOr<llvm::Value*>(int64_t operand,
+                                               llvm::Value* index)>
         element_address,
     std::function<llvm::Type*(int64_t operand, llvm::Value* index)>
         element_address_pointee_type,
@@ -173,16 +180,21 @@ absl::Status EmitCompareLoopBody(
 
     // if (index_is_inbounds)
     KernelSupportLibrary ksl(b);
-    TF_RETURN_IF_ERROR(
-        ksl.IfWithStatus("smaller_comparison_index", index_is_inbounds, [&]() {
+    RETURN_IF_ERROR(ksl.IfWithStatus(
+        "smaller_comparison_index", index_is_inbounds, [&]() -> absl::Status {
           std::vector<llvm::Value*> values_to_compare;
           std::vector<llvm::Type*> values_to_compare_types;
+          values_to_compare.reserve(num_values * 2);
+          values_to_compare_types.reserve(num_values * 2);
           for (int i = 0; i < num_values; ++i) {
-            values_to_compare.push_back(element_address(i, compare_keys_index));
+            ASSIGN_OR_RETURN(llvm::Value * address,
+                             element_address(i, compare_keys_index));
+            values_to_compare.push_back(address);
             values_to_compare_types.push_back(
                 element_address_pointee_type(i, compare_keys_index));
 
-            values_to_compare.push_back(element_address(i, current_keys_index));
+            ASSIGN_OR_RETURN(address, element_address(i, current_keys_index));
+            values_to_compare.push_back(address);
             values_to_compare_types.push_back(
                 element_address_pointee_type(i, current_keys_index));
           }
@@ -191,7 +203,7 @@ absl::Status EmitCompareLoopBody(
           llvm::Value* compare_return_buffer =
               llvm_ir::EmitAllocaAtFunctionEntry(pred_type,
                                                  "compare_return_buffer", b);
-          TF_RETURN_IF_ERROR(
+          RETURN_IF_ERROR(
               emit_compare_callback(values_to_compare, compare_return_buffer));
           llvm::Value* result = b->CreateLoad(pred_type, compare_return_buffer);
 
@@ -217,13 +229,15 @@ absl::Status EmitCompareLoopBody(
 }
 
 absl::Status EmitTiledCompareLoop(
-    const IrArray::Index& tiled_keys_index, int64_t dimension_to_sort,
+    const IrArray::Index& tiled_keys_index, const HloSortInstruction* sort,
     int64_t dimension_to_sort_bound, int64_t num_threads,
     absl::Span<const int64_t> xor_masks, absl::Span<const IrArray> params,
+    bool emit_iota_operands,
     const std::vector<llvm::GlobalVariable*>& param_shmem_buffers,
     int64_t tile_size, int64_t unroll_factor,
     const EmitCallToNestedComputationCallback& emit_compare_callback,
-    llvm::IRBuilderBase* b) {
+    llvm::Module* module, llvm::IRBuilderBase* b) {
+  int64_t dimension_to_sort = sort->sort_dimension();
   KernelSupportLibrary ksl(b);
   llvm::Value* thread_id = gpu::EmitCallToTargetIntrinsic(
       gpu::TargetIntrinsicID::kThreadIdx, {}, {}, b);
@@ -233,50 +247,61 @@ absl::Status EmitTiledCompareLoop(
   thread_id = b->CreateIntCast(thread_id, tiled_keys_index.GetType(),
                                /*isSigned=*/true, "thread.id.x");
 
-  auto copy_loop_body =
-      [&](std::function<void(llvm::Value * cache_index, llvm::Value * index)>
-              read_or_write) {
-        auto unroll = tiled_keys_index.GetConstantWithIndexType(unroll_factor);
-        auto base_keys_index =
-            b->CreateMul(tiled_keys_index[dimension_to_sort], unroll,
-                         "base_keys_index", /*HasNUW=*/true, /*HasNSW=*/true);
-        auto base_cache_index =
-            b->CreateMul(thread_id, unroll, "base_cache_index", /*HasNUW=*/true,
-                         /*HasNSW=*/true);
-        // We want to copy `unroll_factor` many adjacent elements.
-        for (int i = 0; i < unroll_factor; ++i) {
-          auto offset = tiled_keys_index.GetConstantWithIndexType(i);
-          auto current_keys_index =
-              b->CreateAdd(base_keys_index, offset, "current_keys_index",
-                           /*HasNUW=*/true, /*HasNSW=*/true);
-          // We check whether the index position is within bounds.
-          ksl.If("smaller_keys_index",
-                 b->CreateICmpSLT(current_keys_index,
-                                  tiled_keys_index.GetConstantWithIndexType(
-                                      dimension_to_sort_bound)),
-                 [&]() {
-                   auto cache_index =
-                       b->CreateAdd(base_cache_index, offset, "cache_index",
-                                    /*HasNUW=*/true, /*HasNSW=*/true);
-                   read_or_write(cache_index, current_keys_index);
-                 });
-        }
-      };
+  auto copy_loop_body = [&](std::function<absl::Status(
+                                llvm::Value * cache_index, llvm::Value * index)>
+                                read_or_write) -> absl::Status {
+    auto unroll = tiled_keys_index.GetConstantWithIndexType(unroll_factor);
+    auto base_keys_index =
+        b->CreateMul(tiled_keys_index[dimension_to_sort], unroll,
+                     "base_keys_index", /*HasNUW=*/true, /*HasNSW=*/true);
+    auto base_cache_index =
+        b->CreateMul(thread_id, unroll, "base_cache_index", /*HasNUW=*/true,
+                     /*HasNSW=*/true);
+    // We want to copy `unroll_factor` many adjacent elements.
+    for (int i = 0; i < unroll_factor; ++i) {
+      auto offset = tiled_keys_index.GetConstantWithIndexType(i);
+      auto current_keys_index =
+          b->CreateAdd(base_keys_index, offset, "current_keys_index",
+                       /*HasNUW=*/true, /*HasNSW=*/true);
+      // We check whether the index position is within bounds.
+      RETURN_IF_ERROR(ksl.IfWithStatus(
+          "smaller_keys_index",
+          b->CreateICmpSLT(current_keys_index,
+                           tiled_keys_index.GetConstantWithIndexType(
+                               dimension_to_sort_bound)),
+          [&]() {
+            auto cache_index =
+                b->CreateAdd(base_cache_index, offset, "cache_index",
+                             /*HasNUW=*/true, /*HasNSW=*/true);
+            return read_or_write(cache_index, current_keys_index);
+          }));
+    }
+    return absl::OkStatus();
+  };
 
   // Copy operand tiles from the operand buffers to shared memory.
   std::vector<llvm::Value*> keys_multi_index = tiled_keys_index.multidim();
   for (int64_t i = 0; i < params.size(); ++i) {
-    copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
+    RETURN_IF_ERROR(copy_loop_body([&](llvm::Value* cache_index,
+                                       llvm::Value* index) {
       keys_multi_index[dimension_to_sort] = index;
       IrArray::Index keys_index(keys_multi_index, params[i].GetShape(),
                                 tiled_keys_index.GetType());
-      auto value = params[i].EmitReadArrayElement(keys_index, b);
+      llvm::Value* value;
+      if (emit_iota_operands &&
+          HloPredicateIsOp<HloOpcode::kIota>(sort->operand(i))) {
+        ASSIGN_OR_RETURN(value,
+                         EmitIota(sort->operand(i), keys_index, module, b));
+      } else {
+        value = params[i].EmitReadArrayElement(keys_index, b);
+      }
       b->CreateStore(
           value,
           b->CreateGEP(
               param_shmem_buffers[i]->getValueType(), param_shmem_buffers[i],
               {tiled_keys_index.GetConstantWithIndexType(0), cache_index}));
-    });
+      return absl::OkStatus();
+    }));
   }
   // Wait until all reads have happened.
   gpu::EmitCallToTargetIntrinsic(gpu::TargetIntrinsicID::kBarrierId, {}, {}, b);
@@ -320,7 +345,7 @@ absl::Status EmitTiledCompareLoop(
     if (dimension_to_sort_bound % tile_size) {
       // Otherwise we need a bounds check for the last tile. The last tile has
       // size 'dimension_to_sort_bound' % 'tile_size'.
-      TF_RETURN_IF_ERROR(ksl.IfWithStatus(
+      RETURN_IF_ERROR(ksl.IfWithStatus(
           "is_last_tile",
           b->CreateICmpUGE(
               b->CreateMul(
@@ -345,7 +370,7 @@ absl::Status EmitTiledCompareLoop(
                 /*needs_bounds_checks=*/false);
           }));
     } else {
-      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+      RETURN_IF_ERROR(EmitCompareLoopBody(
           tile_size, num_threads, unroll_factor / 2, params.size(),
           element_pair_index, xor_mask, tiled_keys_index.GetType(),
           element_address, element_address_pointee_type, write_element,
@@ -359,7 +384,8 @@ absl::Status EmitTiledCompareLoop(
 
   // Copy the operand tiles back from shared memory to the operand buffers.
   for (int64_t i = 0; i < params.size(); ++i) {
-    copy_loop_body([&](llvm::Value* cache_index, llvm::Value* index) {
+    RETURN_IF_ERROR(copy_loop_body([&](llvm::Value* cache_index,
+                                       llvm::Value* index) {
       keys_multi_index[dimension_to_sort] = index;
       IrArray::Index keys_index(keys_multi_index, params[i].GetShape(),
                                 tiled_keys_index.GetType());
@@ -371,7 +397,8 @@ absl::Status EmitTiledCompareLoop(
           {tiled_keys_index.GetConstantWithIndexType(0), cache_index});
       auto value = b->CreateLoad(gep_type, gep);
       params[i].EmitWriteArrayElement(keys_index, value, b);
-    });
+      return absl::OkStatus();
+    }));
   }
   // We should normally synchronize here to make sure all writes have happened.
   // However the very next thing each thread does is reading `unroll_factor`
@@ -387,8 +414,9 @@ absl::Status EmitTiledCompareLoop(
 }  // namespace
 
 absl::Status EmitSortInPlace(
-    int64_t dimension_to_sort, absl::Span<const IrArray> values_arrays,
-    absl::string_view name, absl::Span<const int64_t> xor_masks,
+    const HloSortInstruction* sort, absl::Span<const IrArray> values_arrays,
+    bool emit_iota_operands, absl::string_view name,
+    absl::Span<const int64_t> xor_masks, llvm::Module* module,
     llvm::IRBuilderBase* b, const gpu::LaunchDimensions& launch_dimensions,
     int64_t num_iterations_in_sort_dim, int64_t tile_size,
     int64_t unroll_factor,
@@ -405,6 +433,7 @@ absl::Status EmitSortInPlace(
   const Shape& keys_shape = values_arrays[0].GetShape();
   int64_t rank = keys_shape.dimensions().size();
   int64_t num_threads = std::max(int64_t{1}, tile_size / unroll_factor);
+  int64_t dimension_to_sort = sort->sort_dimension();
   int64_t dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   std::vector<int64_t> dimensions_in_iteration_order(rank);
   std::vector<int64_t> iteration_order_to_logical_order(rank);
@@ -460,23 +489,31 @@ absl::Status EmitSortInPlace(
     if (xor_masks.size() > 1) {
       IrArray::Index keys_index(keys_multi_index, values_arrays[0].GetShape(),
                                 tiles_index.GetType());
-      TF_RETURN_IF_ERROR(EmitTiledCompareLoop(
-          keys_index, dimension_to_sort, dimension_to_sort_bound, num_threads,
-          xor_masks, values_arrays, param_shmem_buffers, tile_size,
-          unroll_factor, emit_compare_callback, b));
+      RETURN_IF_ERROR(EmitTiledCompareLoop(
+          keys_index, sort, dimension_to_sort_bound, num_threads, xor_masks,
+          values_arrays, emit_iota_operands, param_shmem_buffers, tile_size,
+          unroll_factor, emit_compare_callback, module, b));
     } else {
-      auto element_address = [&](int64_t operand, llvm::Value* index) {
+      auto element_address =
+          [&](int64_t operand,
+              llvm::Value* index) -> absl::StatusOr<llvm::Value*> {
         keys_multi_index[dimension_to_sort] = index;
         IrArray::Index keys_index(keys_multi_index,
                                   values_arrays[operand].GetShape(),
                                   tiles_index.GetType());
         PrimitiveType element_type =
             values_arrays[operand].GetShape().element_type();
-        if (!primitive_util::IsSubByteNonPredType(element_type)) {
-          return values_arrays[operand].EmitArrayElementAddress(keys_index, b);
+        llvm::Value* element;
+        if (emit_iota_operands &&
+            HloPredicateIsOp<HloOpcode::kIota>(sort->operand(operand))) {
+          ASSIGN_OR_RETURN(element, EmitIota(sort, keys_index, module, b));
+        } else {
+          if (!primitive_util::IsSubByteNonPredType(element_type)) {
+            return values_arrays[operand].EmitArrayElementAddress(keys_index,
+                                                                  b);
+          }
+          element = values_arrays[operand].EmitReadArrayElement(keys_index, b);
         }
-        auto element =
-            values_arrays[operand].EmitReadArrayElement(keys_index, b);
         auto llvm_element_type =
             llvm_ir::PrimitiveTypeToIrType(element_type, b->getContext());
         llvm::Value* element_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
@@ -495,7 +532,7 @@ absl::Status EmitSortInPlace(
                                   tiles_index.GetType());
         values_arrays[operand].EmitWriteArrayElement(keys_index, value, b);
       };
-      TF_RETURN_IF_ERROR(EmitCompareLoopBody(
+      RETURN_IF_ERROR(EmitCompareLoopBody(
           dimension_to_sort_bound, /*num_threads=*/1, unroll_factor / 2,
           values_arrays.size(), tiles_index[rank - 1], xor_masks[0],
           tiles_index.GetType(), element_address, element_address_pointee_type,
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.h b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.h
index 7486d218bc3f6b..13179f189f87f9 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.h
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
+#include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/gpu/launch_dimensions.h"
 #include "xla/service/llvm_ir/ir_array.h"
 
@@ -35,9 +36,12 @@ using EmitCallToNestedComputationCallback =
 // dimension of each array in 'values_arrays'. All other dimensions are kept
 // as-is. This implements the inner loop of BitonicSort. It is assumed that
 // 'xor_masks' contains only powers of 2, or values 2^k - 1 (k > 0).
+// `emit_iota_operands` should be set to true in the first call to
+// EmitSortInPlace.
 absl::Status EmitSortInPlace(
-    int64_t dimension_to_sort, absl::Span<const IrArray> values_arrays,
-    absl::string_view name, absl::Span<const int64_t> xor_masks,
+    const HloSortInstruction* sort, absl::Span<const IrArray> values_arrays,
+    bool emit_iota_operands, absl::string_view name,
+    absl::Span<const int64_t> xor_masks, llvm::Module* module,
     llvm::IRBuilderBase* b, const gpu::LaunchDimensions& launch_dimensions,
     int64_t num_iterations_in_sort_dim, int64_t tile_size,
     int64_t unroll_factor,
diff --git a/third_party/xla/xla/service/elemental_ir_emitter.cc b/third_party/xla/xla/service/elemental_ir_emitter.cc
index f7814ea7fe330a..6a6defabe974e8 100644
--- a/third_party/xla/xla/service/elemental_ir_emitter.cc
+++ b/third_party/xla/xla/service/elemental_ir_emitter.cc
@@ -513,8 +513,79 @@ llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
   }
 }
 
+llvm::Value* EmitComposeComplex(const HloInstruction* op, llvm::Value* real,
+                                llvm::Value* imag, llvm::Module* module,
+                                llvm::IRBuilderBase* b) {
+  auto cplx_type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(),
+                                                  module->getContext());
+  auto complex = b->CreateInsertValue(
+      llvm::ConstantAggregateZero::get(cplx_type), real, {0});
+  if (imag != nullptr) {
+    complex = b->CreateInsertValue(complex, imag, {1});
+  }
+  return complex;
+}
+
 }  // namespace
 
+absl::StatusOr<llvm::Value*> EmitIota(const HloInstruction* hlo,
+                                      const IrArray::Index& target_index,
+                                      llvm::Module* module,
+                                      llvm::IRBuilderBase* b) {
+  auto* iota = Cast<HloIotaInstruction>(hlo);
+  PrimitiveType element_type = iota->shape().element_type();
+  IrArray::Index elem_index =
+      iota->shape().dimensions().size() > 1
+          ? target_index.SourceIndexOfBroadcast(
+                iota->shape(),
+                ShapeUtil::MakeShapeWithDescendingLayout(
+                    element_type,
+                    {iota->shape().dimensions(iota->iota_dimension())}),
+                {iota->iota_dimension()}, b)
+          : target_index;
+  llvm::Value* elem_index_linear = elem_index.linear();
+  if (elem_index_linear == nullptr) {
+    std::vector<int64_t> iota_bound = {
+        iota->shape().dimensions(iota->iota_dimension())};
+    elem_index_linear = elem_index.Linearize(iota_bound, b);
+  }
+  Shape component_shape = ShapeUtil::ElementIsComplex(iota->shape())
+                              ? ShapeUtil::ComplexComponentShape(iota->shape())
+                              : iota->shape();
+  PrimitiveType component_element_type = component_shape.element_type();
+  llvm::Value* iota_result;
+  if (primitive_util::IsIntegralType(component_element_type)) {
+    iota_result =
+        b->CreateIntCast(elem_index_linear,
+                         llvm_ir::PrimitiveTypeToIrType(component_element_type,
+                                                        module->getContext()),
+                         /*isSigned=*/false);
+  } else {
+    TF_RET_CHECK(primitive_util::IsFloatingPointType(component_element_type))
+        << component_element_type;
+    llvm::Type* float_ir_type;
+    if (component_element_type == F8E4M3FNUZ ||
+        component_element_type == F8E5M2FNUZ) {
+      float_ir_type = llvm_ir::PrimitiveTypeToIrType(F16, module->getContext());
+    } else {
+      float_ir_type = llvm_ir::PrimitiveTypeToIrType(component_element_type,
+                                                     module->getContext());
+    }
+    llvm::Value* float_val = b->CreateUIToFP(elem_index_linear, float_ir_type);
+    if (component_element_type == F8E4M3FNUZ ||
+        component_element_type == F8E5M2FNUZ) {
+      iota_result =
+          EmitFxToF8e(module, F16, component_element_type, float_val, b);
+    } else {
+      iota_result = float_val;
+    }
+  }
+  if (ShapeUtil::ElementIsComplex(iota->shape())) {
+    return EmitComposeComplex(iota, iota_result, nullptr, module, b);
+  }
+  return iota_result;
+}
+
 /*static*/ bool ElementalIrEmitter::OpInvalidatesCache(
     const HloInstruction* hlo) {
   switch (hlo->opcode()) {
@@ -620,13 +691,15 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
             primitive_util::ComplexComponentType(to_type),
             module_->getContext());
         if (primitive_util::IsSignedIntegralType(from_type)) {
-          return EmitComposeComplex(
-              op, SIToFP(operand_value, to_ir_component_type), nullptr);
+          return EmitComposeComplex(op,
+                                    SIToFP(operand_value, to_ir_component_type),
+                                    nullptr, module_, b_);
         }
         if (primitive_util::IsUnsignedIntegralType(from_type) ||
             from_type == PRED) {
-          return EmitComposeComplex(
-              op, UIToFP(operand_value, to_ir_component_type), nullptr);
+          return EmitComposeComplex(op,
+                                    UIToFP(operand_value, to_ir_component_type),
+                                    nullptr, module_, b_);
         }
       }
       return Unimplemented("conversion from primitive type %s to %s",
@@ -837,14 +910,14 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
         PrimitiveType to_component_type =
             primitive_util::ComplexComponentType(to_type);
         if (from_type == to_component_type) {
-          return EmitComposeComplex(op, operand_value, nullptr);
+          return EmitComposeComplex(op, operand_value, nullptr, module_, b_);
         }
         return EmitComposeComplex(
             op,
             FPCast(operand_value,
                    llvm_ir::PrimitiveTypeToIrType(to_component_type,
                                                   module_->getContext())),
-            nullptr);
+            nullptr, module_, b_);
       }
       if (to_type == BF16) {
         // F16 to BF16 has to go through an intermediate F32.
@@ -1181,7 +1254,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
                               r);  // handles nan and inf values correctly
 
       TF_ASSIGN_OR_RETURN(auto imag_part, EmitAtan2(component_type, b, a1, ""));
-      return EmitComposeComplex(op, real_part, imag_part);
+      return EmitComposeComplex(op, real_part, imag_part, module_, b_);
     }
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
@@ -1197,7 +1270,8 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           to_component_type, module_->getContext());
       return EmitComposeComplex(
           op, FPCast(EmitExtractReal(operand_value), to_ir_component_type),
-          FPCast(EmitExtractImag(operand_value), to_ir_component_type));
+          FPCast(EmitExtractImag(operand_value), to_ir_component_type), module_,
+          b_);
     }
     case HloOpcode::kExp: {
       // e^(a+bi) = e^a*(cos(b)+sin(b)i)
@@ -1232,7 +1306,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto imag_nonzero = Select(exp_a_is_inf, imag_overflow, imag_normal);
       auto imag_result = Select(b_is_zero, zero, imag_nonzero);
 
-      return EmitComposeComplex(op, real_result, imag_result);
+      return EmitComposeComplex(op, real_result, imag_result, module_, b_);
     }
     case HloOpcode::kExpm1: {
       // e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i
@@ -1251,7 +1325,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto cos_b = FAdd(cos_b_minus_one, one);
       auto real_result = FAdd(FMul(expm1_a, cos_b), cos_b_minus_one);
       auto imag_result = Select(b_is_zero, zero, FMul(exp_a, sin_b));
-      return EmitComposeComplex(op, real_result, imag_result);
+      return EmitComposeComplex(op, real_result, imag_result, module_, b_);
     }
     case HloOpcode::kCos:
     case HloOpcode::kSin: {
@@ -1281,7 +1355,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
         real_result = FMul(cos_x, cosh_y);
         imag_result = FNeg(FMul(sin_x, sinh_y));
       }
-      return EmitComposeComplex(op, real_result, imag_result);
+      return EmitComposeComplex(op, real_result, imag_result, module_, b_);
     }
     case HloOpcode::kTan:
       // tan(x+yi) = -i*tanh(-y + xi)
@@ -1429,8 +1503,9 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
         imag = Select(imag_is_zero, zero, imag);
       }
 
-      return op_is_tan ? EmitComposeComplex(op, imag, FMul(neg_one, real))
-                       : EmitComposeComplex(op, real, imag);
+      return op_is_tan ? EmitComposeComplex(op, imag, FMul(neg_one, real),
+                                            module_, b_)
+                       : EmitComposeComplex(op, real, imag, module_, b_);
     }
     case HloOpcode::kAbs: {
       return EmitComplexAbs(component_type, operand_value);
@@ -1442,9 +1517,10 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto oeq = FCmpOEQ(cplx_abs, zero);
       return Select(
-          oeq, EmitComposeComplex(op, zero, zero),
+          oeq, EmitComposeComplex(op, zero, zero, module_, b_),
           EmitComposeComplex(op, FDiv(EmitExtractReal(operand_value), cplx_abs),
-                             FDiv(EmitExtractImag(operand_value), cplx_abs)));
+                             FDiv(EmitExtractImag(operand_value), cplx_abs),
+                             module_, b_));
     }
     case HloOpcode::kSqrt: {
       return EmitComplexSqrt(op, component_type, operand_value);
@@ -1454,7 +1530,8 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
     }
     case HloOpcode::kNegate:
       return EmitComposeComplex(op, FNeg(EmitExtractReal(operand_value)),
-                                FNeg(EmitExtractImag(operand_value)));
+                                FNeg(EmitExtractImag(operand_value)), module_,
+                                b_);
     case HloOpcode::kReal:
       return EmitExtractReal(operand_value);
     case HloOpcode::kImag:
@@ -1485,7 +1562,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) {
   switch (op->opcode()) {
     case HloOpcode::kComplex:
-      return EmitComposeComplex(op, lhs_value, rhs_value);
+      return EmitComposeComplex(op, lhs_value, rhs_value, module_, b_);
     case HloOpcode::kAdd:
       return FAdd(lhs_value, rhs_value, op->name());
     case HloOpcode::kSubtract:
@@ -1668,14 +1745,16 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexAdd(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) {
   return EmitComposeComplex(
       op, FAdd(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)),
-      FAdd(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value)));
+      FAdd(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value)), module_,
+      b_);
 }
 
 absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexSubtract(
     const HloInstruction* op, llvm::Value* lhs_value, llvm::Value* rhs_value) {
   return EmitComposeComplex(
       op, FSub(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)),
-      FSub(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value)));
+      FSub(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value)), module_,
+      b_);
 }
 
 absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexMultiply(
@@ -1685,7 +1764,8 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexMultiply(
       FSub(FMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)),
            FMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value))),
       FAdd(FMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)),
-           FMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value))));
+           FMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value))),
+      module_, b_);
 }
 
 absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexDivide(
@@ -1734,7 +1814,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexDivide(
   auto c_i = Select(b_r_lt_b_i,
                     FDiv(FSub(FMul(b_r_b_i_ratio, a_i), a_r), b_r_b_i_denom),
                     FDiv(FSub(a_i, FMul(b_i_b_r_ratio, a_r)), b_i_b_r_denom));
-  auto result = EmitComposeComplex(op, c_r, c_i);
+  auto result = EmitComposeComplex(op, c_r, c_i, module_, b_);
 
   // Consider corner cases, if the result is (NaN, NaN).
   auto zero = llvm::ConstantFP::get(type, 0.0);
@@ -1747,8 +1827,9 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexDivide(
           Or(Not(FCmpUNO(a_r, zero)), Not(FCmpUNO(a_i, zero))));
   auto inf_with_sign_of_b_r = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::copysign, {inf, b_r}, {type}, b_);
-  auto zero_denominator_result = EmitComposeComplex(
-      op, FMul(inf_with_sign_of_b_r, a_r), FMul(inf_with_sign_of_b_r, a_i));
+  auto zero_denominator_result =
+      EmitComposeComplex(op, FMul(inf_with_sign_of_b_r, a_r),
+                         FMul(inf_with_sign_of_b_r, a_i), module_, b_);
 
   // Case 2. Infinite numerator, finite denominator.
   auto b_r_finite = FCmpONE(b_r_abs, inf);
@@ -1773,7 +1854,8 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexDivide(
       FMul(inf,
            FAdd(FMul(a_r_inf_with_sign, b_r), FMul(a_i_inf_with_sign, b_i))),
       FMul(inf,
-           FSub(FMul(a_i_inf_with_sign, b_r), FMul(a_r_inf_with_sign, b_i))));
+           FSub(FMul(a_i_inf_with_sign, b_r), FMul(a_r_inf_with_sign, b_i))),
+      module_, b_);
 
   // Case 3. Finite numerator, infinite denominator.
   auto a_r_finite = FCmpONE(a_r_abs, inf);
@@ -1794,7 +1876,8 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexDivide(
       FMul(zero,
            FAdd(FMul(a_r, b_r_inf_with_sign), FMul(a_i, b_i_inf_with_sign))),
       FMul(zero,
-           FSub(FMul(a_i, b_r_inf_with_sign), FMul(a_r, b_i_inf_with_sign))));
+           FSub(FMul(a_i, b_r_inf_with_sign), FMul(a_r, b_i_inf_with_sign))),
+      module_, b_);
 
   auto c_nan = And(FCmpUNO(c_r, zero), FCmpUNO(c_i, zero));
   return Select(c_nan,
@@ -1816,7 +1899,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexLog(
   TF_ASSIGN_OR_RETURN(llvm::Value * abs,
                       EmitComplexAbs(component_type, operand_value));
   TF_ASSIGN_OR_RETURN(llvm::Value * log_abs, EmitLog(component_type, abs));
-  return EmitComposeComplex(op, log_abs, angle);
+  return EmitComposeComplex(op, log_abs, angle, module_, b_);
 }
 
 // Using our EmitComplexPower formula, but setting c=0.5 and d=0, we get:
@@ -1871,8 +1954,9 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexSqrt(
     imag_part = Select(FCmpOEQ(sin, zero), sin, FMul(r, sin));
   }
 
-  return Select(FCmpOEQ(r, zero), EmitComposeComplex(op, zero, zero),
-                EmitComposeComplex(op, real_part, imag_part));
+  return Select(FCmpOEQ(r, zero),
+                EmitComposeComplex(op, zero, zero, module_, b_),
+                EmitComposeComplex(op, real_part, imag_part, module_, b_));
 }
 
 // Similar to Sqrt, we can use our EmitComplexPower formula, but set
@@ -1938,7 +2022,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexRsqrt(
     imag_part = Select(is_zero_zero, nan, FMul(r, sin));
   }
 
-  return EmitComposeComplex(op, real_part, imag_part);
+  return EmitComposeComplex(op, real_part, imag_part, module_, b_);
 }
 
 //   lhs_value^rhs_value
@@ -1980,34 +2064,38 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexPower(
   // Nothing's Sign Bit, W. Kahan, Section 10.
   auto cutoff_0 =
       Select(And(And(FCmpOEQ(abs, zero), FCmpOEQ(d, zero)), FCmpOLE(zero, c)),
-             EmitComposeComplex(op, Select(FCmpOEQ(zero, c), one, zero), zero),
-             EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q)));
+             EmitComposeComplex(op, Select(FCmpOEQ(zero, c), one, zero), zero,
+                                module_, b_),
+             EmitComposeComplex(op, FMul(coeff, cos_q), FMul(coeff, sin_q),
+                                module_, b_));
 
   // Case 1:
   // x^0 is defined to be 1 for any x, see
   // Branch Cuts for Complex Elementary Functions or Much Ado About
   // Nothing's Sign Bit, W. Kahan, Section 10.
-  auto cutoff_1 = Select(And(FCmpOEQ(zero, c), FCmpOEQ(d, zero)),
-                         EmitComposeComplex(op, one, zero), cutoff_0);
+  auto cutoff_1 =
+      Select(And(FCmpOEQ(zero, c), FCmpOEQ(d, zero)),
+             EmitComposeComplex(op, one, zero, module_, b_), cutoff_0);
 
   // Case 2:
   // 1^(c + d*i) = 1 + 0*i
-  auto cutoff_2 = Select(And(FCmpOEQ(a, one), FCmpOEQ(b, zero)),
-                         EmitComposeComplex(op, one, zero), cutoff_1);
+  auto cutoff_2 =
+      Select(And(FCmpOEQ(a, one), FCmpOEQ(b, zero)),
+             EmitComposeComplex(op, one, zero, module_, b_), cutoff_1);
 
   // Case 3:
   // inf^(c + 0*i) = inf + 0*i, c > 0
   auto cutoff_3 = Select(
       And(FCmpOEQ(a, inf),
           And(FCmpOEQ(b, zero), And(FCmpOEQ(d, zero), FCmpOGT(c, zero)))),
-      EmitComposeComplex(op, inf, zero), cutoff_2);
+      EmitComposeComplex(op, inf, zero, module_, b_), cutoff_2);
 
   // Case 4:
   // inf^(c + 0*i) = 0 + 0*i, c < 0
   auto cutoff_4 = Select(
       And(FCmpOEQ(a, inf),
           And(FCmpOEQ(b, zero), And(FCmpOEQ(d, zero), FCmpOLT(c, zero)))),
-      EmitComposeComplex(op, zero, zero), cutoff_3);
+      EmitComposeComplex(op, zero, zero, module_, b_), cutoff_3);
 
   return cutoff_4;
 }
@@ -2073,7 +2161,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
           llvm_ir::PrimitiveTypeToIrType(component_type, module_->getContext());
       auto zero = llvm::ConstantFP::get(type, 0.0);
       auto one = llvm::ConstantFP::get(type, 1.0);
-      auto i = EmitComposeComplex(op, zero, one);
+      auto i = EmitComposeComplex(op, zero, one, module_, b_);
       TF_ASSIGN_OR_RETURN(auto i_times_y, EmitComplexMultiply(op, i, y));
       TF_ASSIGN_OR_RETURN(auto x_plus_iy, EmitComplexAdd(op, x, i_times_y));
       TF_ASSIGN_OR_RETURN(
@@ -2081,7 +2169,7 @@ absl::StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
           EmitComplexDivide(op, x_plus_iy, sqrt_x_squared_plus_y_squared));
       TF_ASSIGN_OR_RETURN(auto log_result, EmitComplexLog(op, div_result));
       auto negative_one = llvm::ConstantFP::get(type, -1.0);
-      auto negative_i = EmitComposeComplex(op, zero, negative_one);
+      auto negative_i = EmitComposeComplex(op, zero, negative_one, module_, b_);
       return EmitComplexMultiply(op, negative_i, log_result);
     }
     default:
@@ -3345,63 +3433,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
     case HloOpcode::kIota:
       return [this, hlo](const IrArray::Index& target_index)
                  -> absl::StatusOr<llvm::Value*> {
-        auto* iota = Cast<HloIotaInstruction>(hlo);
-        PrimitiveType element_type = iota->shape().element_type();
-        IrArray::Index elem_index =
-            iota->shape().dimensions().size() > 1
-                ? target_index.SourceIndexOfBroadcast(
-                      iota->shape(),
-                      ShapeUtil::MakeShapeWithDescendingLayout(
-                          element_type,
-                          {iota->shape().dimensions(iota->iota_dimension())}),
-                      {iota->iota_dimension()}, b_)
-                : target_index;
-        llvm::Value* elem_index_linear = elem_index.linear();
-        if (elem_index_linear == nullptr) {
-          std::vector<int64_t> iota_bound = {
-              iota->shape().dimensions(iota->iota_dimension())};
-          elem_index_linear = elem_index.Linearize(iota_bound, b_);
-        }
-        Shape component_shape =
-            ShapeUtil::ElementIsComplex(iota->shape())
-                ? ShapeUtil::ComplexComponentShape(iota->shape())
-                : iota->shape();
-        PrimitiveType component_element_type = component_shape.element_type();
-        llvm::Value* iota_result;
-        if (primitive_util::IsIntegralType(component_element_type)) {
-          iota_result = b_->CreateIntCast(
-              elem_index_linear,
-              llvm_ir::PrimitiveTypeToIrType(component_element_type,
-                                             module_->getContext()),
-              /*isSigned=*/false);
-        } else {
-          TF_RET_CHECK(
-              primitive_util::IsFloatingPointType(component_element_type))
-              << component_element_type;
-          llvm::Type* float_ir_type;
-          if (component_element_type == F8E4M3FNUZ ||
-              component_element_type == F8E5M2FNUZ) {
-            float_ir_type =
-                llvm_ir::PrimitiveTypeToIrType(F16, module_->getContext());
-          } else {
-            float_ir_type = llvm_ir::PrimitiveTypeToIrType(
-                component_element_type, module_->getContext());
-          }
-          llvm::Value* float_val =
-              b_->CreateUIToFP(elem_index_linear, float_ir_type);
-          if (component_element_type == F8E4M3FNUZ ||
-              component_element_type == F8E5M2FNUZ) {
-            iota_result = EmitFxToF8e(module_, F16, component_element_type,
-                                      float_val, b_);
-          } else {
-            iota_result = float_val;
-          }
-        }
-        if (ShapeUtil::ElementIsComplex(iota->shape())) {
-          return EmitComposeComplex(iota, iota_result, nullptr);
-        } else {
-          return iota_result;
-        }
+        return EmitIota(hlo, target_index, module_, b_);
       };
     case HloOpcode::kSlice:
       return [this, hlo, &operand_to_generator](
@@ -3537,19 +3569,6 @@ llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) {
   return ExtractValue(value, {1});
 }
 
-llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op,
-                                                    llvm::Value* real,
-                                                    llvm::Value* imag) {
-  auto cplx_type = llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(),
-                                                  module_->getContext());
-  auto complex =
-      InsertValue(llvm::ConstantAggregateZero::get(cplx_type), real, {0});
-  if (imag != nullptr) {
-    complex = InsertValue(complex, imag, {1});
-  }
-  return complex;
-}
-
 llvm::Value* ElementalIrEmitter::EmitMulAdd(llvm::Value* lhs, llvm::Value* rhs,
                                             llvm::Value* accumulator,
                                             xla::PrimitiveType primitive_type) {
diff --git a/third_party/xla/xla/service/elemental_ir_emitter.h b/third_party/xla/xla/service/elemental_ir_emitter.h
index a64dc039542b43..43deb25a8dfb7f 100644
--- a/third_party/xla/xla/service/elemental_ir_emitter.h
+++ b/third_party/xla/xla/service/elemental_ir_emitter.h
@@ -256,10 +256,6 @@ class ElementalIrEmitter : public IrBuilderMixin<ElementalIrEmitter> {
       absl::Span<llvm::Value* const> accumulator_addrs,
       llvm::ArrayRef<llvm::Type*> accumulator_types, bool is_variadic);
 
-  // Composes a complex struct. imag may be nullptr for simple cast operations.
-  llvm::Value* EmitComposeComplex(const HloInstruction* op, llvm::Value* real,
-                                  llvm::Value* imag);
-
   // Emit `accumulator + lhs * rhs` for the given primitive type.
   llvm::Value* EmitMulAdd(llvm::Value* lhs, llvm::Value* rhs,
                           llvm::Value* accumulator,
@@ -371,6 +367,11 @@ class ElementalIrEmitterForTests : public ElementalIrEmitter {
 
   HloToElementGeneratorMap generator_map_;
 };
+
+absl::StatusOr<llvm::Value*> EmitIota(
+    const HloInstruction* hlo, const llvm_ir::IrArray::Index& target_index,
+    llvm::Module* module, llvm::IRBuilderBase* b);
+
 }  // namespace xla
 
 #endif  // XLA_SERVICE_ELEMENTAL_IR_EMITTER_H_

From 2a55fe03a622b1c6e89f3aeecc23f400ff546372 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 02:53:32 -0800
Subject: [PATCH 416/753] Automated Code Change

PiperOrigin-RevId: 845679376
---
 .../xla/xla/backends/profiler/gpu/cupti_collector.cc     | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
index 8b8fb84fd092fb..da44744967eff8 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_collector.cc
@@ -285,11 +285,10 @@ class PerDeviceCollector {
                           occ_stats.occupancy_pct);
       xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                               GetStatTypeStr(StatType::kOccupancyMinGridSize)),
-                          static_cast<tsl::int32>(occ_stats.min_grid_size));
-      xevent.AddStatValue(
-          *plane->GetOrCreateStatMetadata(
-              GetStatTypeStr(StatType::kOccupancySuggestedBlockSize)),
-          static_cast<tsl::int32>(occ_stats.suggested_block_size));
+                          static_cast<int32_t>(occ_stats.min_grid_size));
+      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
+                              StatType::kOccupancySuggestedBlockSize)),
+                          static_cast<int32_t>(occ_stats.suggested_block_size));
       xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                               GetStatTypeStr(StatType::kKernelDetails)),
                           *plane->GetOrCreateStatMetadata(ToXStat(

From 8dbc66efecdffe52a4725ba6a48c8a24c1f4a20f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 03:03:48 -0800
Subject: [PATCH 417/753] Automated Code Change

PiperOrigin-RevId: 845682254
---
 .../xla/xla/stream_executor/gpu/buffer_debug_log_test.cc | 4 ++--
 .../xla/xla/stream_executor/gpu/gpu_executor_test.cc     | 9 +++++----
 third_party/xla/xla/stream_executor/gpu/gpu_semaphore.cc | 2 +-
 third_party/xla/xla/stream_executor/gpu/gpu_semaphore.h  | 2 +-
 third_party/xla/xla/stream_executor/gpu/memcpy_test.cc   | 2 +-
 .../xla/xla/stream_executor/gpu/redzone_allocator.cc     | 6 +++---
 .../xla/xla/stream_executor/gpu/redzone_allocator.h      | 3 ++-
 .../xla/stream_executor/gpu/redzone_allocator_test.cc    | 4 ++--
 8 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/gpu/buffer_debug_log_test.cc b/third_party/xla/xla/stream_executor/gpu/buffer_debug_log_test.cc
index 87f4ad79ef016e..6b888ab759ddcd 100644
--- a/third_party/xla/xla/stream_executor/gpu/buffer_debug_log_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/buffer_debug_log_test.cc
@@ -52,13 +52,13 @@ class BufferDebugLogTest : public ::testing::Test {
     TF_ASSERT_OK_AND_ASSIGN(executor_, platform_->ExecutorForDevice(0));
     TF_ASSERT_OK_AND_ASSIGN(stream_, executor_->CreateStream(std::nullopt));
     allocator_ =
-        std::make_unique<StreamExecutorMemoryAllocator>(stream_->parent());
+        std::make_unique<StreamExecutorAddressAllocator>(stream_->parent());
   }
 
   Platform* platform_;
   StreamExecutor* executor_;
   std::unique_ptr<Stream> stream_;
-  std::unique_ptr<StreamExecutorMemoryAllocator> allocator_;
+  std::unique_ptr<StreamExecutorAddressAllocator> allocator_;
 };
 
 TEST_F(BufferDebugLogTest, CreateBufferDebugLogOnDevice_InitializesEmptyLog) {
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
index fabea8c509c04f..b038ecc139d921 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
@@ -42,8 +42,8 @@ using GetPointerMemorySpaceTest = GpuExecutorTest;
 TEST_F(GetPointerMemorySpaceTest, Host) {
   StreamExecutor* executor = GetPlatform()->ExecutorForDevice(0).value();
   TF_ASSERT_OK_AND_ASSIGN(auto host_ptr, executor->HostMemoryAllocate(64));
-  TF_ASSERT_OK_AND_ASSIGN(auto memory_space,
-                          executor->GetPointerMemorySpace(host_ptr->opaque()));
+  TF_ASSERT_OK_AND_ASSIGN(auto memory_space, executor->GetPointerMemorySpace(
+                                                 host_ptr->address().opaque()));
   EXPECT_EQ(memory_space, MemorySpace::kHost);
 }
 
@@ -82,8 +82,9 @@ TEST_F(HostMemoryAllocateTest, Numa) {
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> host_ptr,
                             executor->HostMemoryAllocate(kSize));
     ASSERT_TRUE(host_ptr);
-    EXPECT_NE(host_ptr->opaque(), nullptr);
-    const int numa_node = tsl::port::NUMAGetMemAffinity(host_ptr->opaque());
+    EXPECT_NE(host_ptr->address().opaque(), nullptr);
+    const int numa_node =
+        tsl::port::NUMAGetMemAffinity(host_ptr->address().opaque());
     if (numa_node == tsl::port::kNUMANoAffinity) {
       // Could be because `executor` could not determine its own NUMA node, in
       // which case numa_node() will be -1 or 0, depending on the failure mode.
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.cc b/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.cc
index 0294d3dfbeb21f..87dec701c5b2de 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.cc
@@ -35,6 +35,6 @@ DeviceAddress<GpuSemaphoreState> GpuSemaphore::device() {
   // This assumes unified addressing, as we do not explicitly translate the
   // host pointer into a device pointer.
   return DeviceAddress<GpuSemaphoreState>::MakeFromByteSize(
-      ptr_->opaque(), sizeof(GpuSemaphoreState));
+      ptr_->address().opaque(), sizeof(GpuSemaphoreState));
 }
 }  // namespace stream_executor
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.h b/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.h
index 010bde955cfd2f..e7f34b00264434 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_semaphore.h
@@ -42,7 +42,7 @@ class GpuSemaphore {
   explicit operator bool() const { return bool{ptr_}; }
 
   GpuSemaphoreState& operator*() {
-    return *static_cast<GpuSemaphoreState*>(ptr_->opaque());
+    return *static_cast<GpuSemaphoreState*>(ptr_->address().opaque());
   }
   DeviceAddress<GpuSemaphoreState> device();
 
diff --git a/third_party/xla/xla/stream_executor/gpu/memcpy_test.cc b/third_party/xla/xla/stream_executor/gpu/memcpy_test.cc
index 641b96adaa51a1..4a94fe0d821fef 100644
--- a/third_party/xla/xla/stream_executor/gpu/memcpy_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/memcpy_test.cc
@@ -33,7 +33,7 @@ TEST(MemcpyTest, PinnedHostMemory) {
 
   TF_ASSERT_OK_AND_ASSIGN(auto d_ptr,
                           executor->HostMemoryAllocate(sizeof(int)));
-  DeviceAddressBase d_mem(d_ptr->opaque(), sizeof(int));
+  DeviceAddressBase d_mem(d_ptr->address().opaque(), sizeof(int));
 
   int h_ptr;
   TF_ASSERT_OK(stream->Memcpy(&h_ptr, d_mem, d_mem.size()));
diff --git a/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc b/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc
index 60559910852bed..9d6fc02bdeb14e 100644
--- a/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc
+++ b/third_party/xla/xla/stream_executor/gpu/redzone_allocator.cc
@@ -85,7 +85,7 @@ absl::StatusOr<DeviceAddress<uint8_t>> RedzoneAllocator::AllocateBytes(
 
   int64_t rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size;
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceAddress allocated_buffer,
+      ScopedDeviceAddress<uint8_t> allocated_buffer,
       memory_allocator_->Allocate(device_ordinal_,
                                   byte_size + 2 * redzone_size_ + rhs_slop,
                                   /*retry_on_failure=*/false));
@@ -277,13 +277,13 @@ absl::StatusOr<RedzoneCheckStatus> RedzoneAllocator::CheckRedzones() const {
 
   DeviceAddressHandle out_param(executor, executor->AllocateScalar<uint64_t>());
   TF_RETURN_IF_ERROR(
-      stream_->MemZero(out_param.memory_ptr(), sizeof(uint64_t)));
+      stream_->MemZero(out_param.address_ptr(), sizeof(uint64_t)));
 
   for (const auto& buf_and_size : allocated_buffers_) {
     TF_ASSIGN_OR_RETURN(
         RedzoneCheckStatus redzone_status,
         CheckRedzonesForBuffer(stream_, *buf_and_size.first,
-                               DeviceAddress<uint64_t>(out_param.memory()),
+                               DeviceAddress<uint64_t>(out_param.address()),
                                kernel, buf_and_size.second, redzone_size_,
                                redzone_pattern_));
     if (!redzone_status.ok()) {
diff --git a/third_party/xla/xla/stream_executor/gpu/redzone_allocator.h b/third_party/xla/xla/stream_executor/gpu/redzone_allocator.h
index ab8ccb8d2d94ad..22e6953225fbfe 100644
--- a/third_party/xla/xla/stream_executor/gpu/redzone_allocator.h
+++ b/third_party/xla/xla/stream_executor/gpu/redzone_allocator.h
@@ -136,7 +136,8 @@ class RedzoneAllocator : public ScratchAllocator {
   //
   // ScratchAllocators need to free all allocated memory on destruction so we
   // use `OwningDeviceAddress` here.
-  std::vector<std::pair<OwningDeviceAddress, int64_t>> allocated_buffers_;
+  std::vector<std::pair<ScopedDeviceAddress<uint8_t>, int64_t>>
+      allocated_buffers_;
 
   int64_t allocated_bytes_excluding_redzones_ = 0;
 };
diff --git a/third_party/xla/xla/stream_executor/gpu/redzone_allocator_test.cc b/third_party/xla/xla/stream_executor/gpu/redzone_allocator_test.cc
index b18c5025d68391..c610d2304a0261 100644
--- a/third_party/xla/xla/stream_executor/gpu/redzone_allocator_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/redzone_allocator_test.cc
@@ -60,7 +60,7 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {
   Platform* platform =
       PlatformManager::PlatformWithName(GpuPlatformName()).value();
   StreamExecutor* stream_exec = platform->ExecutorForDevice(0).value();
-  StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
+  StreamExecutorAddressAllocator se_allocator(platform, {stream_exec});
 
   TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec->CreateStream());
   RedzoneAllocator allocator(stream.get(), &se_allocator,
@@ -134,7 +134,7 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
   Platform* platform =
       PlatformManager::PlatformWithName(GpuPlatformName()).value();
   StreamExecutor* stream_exec = platform->ExecutorForDevice(0).value();
-  StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
+  StreamExecutorAddressAllocator se_allocator(platform, {stream_exec});
   TF_ASSERT_OK_AND_ASSIGN(auto stream, stream_exec->CreateStream());
   RedzoneAllocator allocator(stream.get(), &se_allocator,
                              /*memory_limit=*/(1LL << 32),

From 1e94b02c1bd3f66cbdb8e72e30e654fabed95e68 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 03:37:32 -0800
Subject: [PATCH 418/753] Automated Code Change

PiperOrigin-RevId: 845692025
---
 third_party/xla/xla/literal.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/literal.cc b/third_party/xla/xla/literal.cc
index 3e76da9f4608aa..5b14ae306c3326 100644
--- a/third_party/xla/xla/literal.cc
+++ b/third_party/xla/xla/literal.cc
@@ -636,8 +636,8 @@ absl::Status LiteralBase::Piece::AllocateBuffers() {
   const int64_t bytes = total_bytes_dense();
   if (bytes > kMaxInlinedBytes) {
     CHECK_EQ(buffer(), nullptr);
-    storage_.Emplace<DenseRep>(
-        static_cast<char*>(tsl::port::AlignedMalloc(bytes, kMinimumAlignment)));
+    storage_.Emplace<DenseRep>(static_cast<char*>(tsl::port::AlignedMalloc(
+        bytes, static_cast<std::align_val_t>(kMinimumAlignment))));
     if (buffer() == nullptr) {
       return absl::ResourceExhaustedError(
           "Failed to allocate buffer for Literal");

From 8a32f11234dac8c49f1b352e1eb9a79238da778f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 03:57:36 -0800
Subject: [PATCH 419/753] Automated Code Change

PiperOrigin-RevId: 845697432
---
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc  |  6 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc | 74 +++++++++++--------
 2 files changed, 48 insertions(+), 32 deletions(-)

diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
index 40e62765ffd351..506fae3cfe85ba 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
@@ -505,7 +505,8 @@ Future<> TfrtGpuBuffer::ToLiteralHelper(Future<MutableLiteralBase*> literal) {
             int64_t unpacked_size = ShapeUtil::ElementsIn(on_device_shape);
             if (transpose != nullptr) {
               buffer = tsl::port::AlignedMalloc(
-                  unpacked_size, tsl::Allocator::kAllocatorAlignment);
+                  unpacked_size, static_cast<std::align_val_t>(
+                                     tsl::Allocator::kAllocatorAlignment));
             } else {
               buffer = literal->untyped_data();
             }
@@ -747,7 +748,8 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtGpuBuffer::CopyToMemorySpace(
 
   // Copying across PjRtClients involves a copy through the host.
   if (dst_device->client() != client_) {
-    TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal, ToLiteralSync());
+    TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal,
+                        PjRtBuffer::ToLiteral().Await());
     // Avoid use-after-free on `literal` due to unsequenced move and use.
     Literal* literal_pointer = literal.get();
     absl::InlinedVector<int64_t, 4> byte_strides(
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
index c078751882b00c..32a04ec6b5115a 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
@@ -139,7 +139,7 @@ absl::StatusOr<std::shared_ptr<xla::Literal>> ExtractSingleResult(
   TF_RET_CHECK(result->size() == 1);
   std::vector<std::unique_ptr<xla::PjRtBuffer>>& result_buffers = (*result)[0];
   TF_RET_CHECK(result_buffers.size() == 1);
-  TF_ASSIGN_OR_RETURN(auto literal, result_buffers[0]->ToLiteralSync());
+  TF_ASSIGN_OR_RETURN(auto literal, result_buffers[0]->ToLiteral().Await());
   return literal;
 }
 
@@ -563,7 +563,7 @@ TEST(TfrtGpuClientTest, ShouldStageHostToDeviceTransfersSetToTrue) {
           /*device_layout=*/nullptr));
   TF_EXPECT_OK(buffer->GetReadyFuture().Await());
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> literal,
-                          buffer->ToLiteralSync());
+                          buffer->ToLiteral().Await());
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*literal, LiteralUtil::CreateR1<int32_t>(data)));
 }
@@ -588,7 +588,7 @@ TEST(TfrtGpuClientTest, ShouldStageHostToDeviceTransfersSetToFalse) {
           /*device_layout=*/nullptr));
   TF_EXPECT_OK(buffer->GetReadyFuture().Await());
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> literal,
-                          buffer->ToLiteralSync());
+                          buffer->ToLiteral().Await());
   EXPECT_TRUE(
       LiteralTestUtil::Equal(*literal, LiteralUtil::CreateR1<int32_t>(data)));
 }
@@ -612,7 +612,7 @@ TEST(TfrtGpuClientTest, BufferFromHostBufferPinnedMemory) {
   EXPECT_EQ(buffer->memory_space()->kind(), "pinned_host");
   EXPECT_TRUE(buffer->IsOnCpu());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteral().Await());
   std::vector<int32_t> expected{1, 2, 3, 4};
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
                                      *literal));
@@ -641,7 +641,7 @@ TEST(TfrtGpuClientTest, CopyToPinnedHostMemorySpace) {
   EXPECT_EQ(result->memory_space()->kind(), "pinned_host");
   EXPECT_TRUE(result->IsOnCpu());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
   std::vector<int32_t> expected{1, 2, 3, 4};
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
                                      *literal));
@@ -664,7 +664,7 @@ TEST(TfrtGpuClientTest, CopyToPinnedHostMemorySpaceInt4) {
 
   TF_EXPECT_OK(buffer->GetReadyFuture().Await());
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> device_literal,
-                          buffer->ToLiteralSync());
+                          buffer->ToLiteral().Await());
   std::vector<xla::s4> expected{xla::s4(1), xla::s4(2), xla::s4(3), xla::s4(4)};
   Literal expected_literal = LiteralUtil::CreateR1<xla::s4>(expected);
   EXPECT_TRUE(LiteralTestUtil::Equal(expected_literal, *device_literal));
@@ -677,7 +677,7 @@ TEST(TfrtGpuClientTest, CopyToPinnedHostMemorySpaceInt4) {
   EXPECT_EQ(result->memory_space()->kind(), "pinned_host");
   EXPECT_TRUE(result->IsOnCpu());
 
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(expected_literal, *literal));
 }
 
@@ -1027,7 +1027,8 @@ TEST(TfrtGpuClientTest, FromHostAsyncPinnedHostChunked) {
     }
     offset = end;
   }
-  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> lit, buf->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> lit,
+                          buf->ToLiteral().Await());
   EXPECT_THAT(lit->data<float>(), ElementsAreArray(data));
 }
 
@@ -1182,15 +1183,17 @@ TEST(TfrtGpuClientTest, CopyRawToHostFullBuffer) {
       std::unique_ptr<PjRtBuffer> buffer,
       client->BufferFromHostLiteral(literal, client->memory_spaces()[0]));
   TF_ASSERT_OK_AND_ASSIGN(int64_t size, buffer->GetOnDeviceSizeInBytes());
-  void* dst =
-      tsl::port::AlignedMalloc(size, tsl::Allocator::kAllocatorAlignment);
+  void* dst = tsl::port::AlignedMalloc(
+      size, static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 
   auto result = buffer->CopyRawToHost(dst, 0, size);
   TF_EXPECT_OK(result.Await());
   EXPECT_EQ(*(static_cast<float*>(dst)), 41.0f);
   EXPECT_EQ(*(static_cast<float*>(dst) + 1), 42.0f);
 
-  tsl::port::AlignedSizedFree(dst, tsl::Allocator::kAllocatorAlignment, size);
+  tsl::port::AlignedSizedFree(
+      dst, size,
+      static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 }
 
 TEST(TfrtGpuClientTest, CopyRawToHostSubBuffer) {
@@ -1201,14 +1204,16 @@ TEST(TfrtGpuClientTest, CopyRawToHostSubBuffer) {
       std::unique_ptr<PjRtBuffer> buffer,
       client->BufferFromHostLiteral(literal, client->memory_spaces()[0]));
   TF_ASSERT_OK_AND_ASSIGN(int64_t size, buffer->GetOnDeviceSizeInBytes());
-  void* dst =
-      tsl::port::AlignedMalloc(size, tsl::Allocator::kAllocatorAlignment);
+  void* dst = tsl::port::AlignedMalloc(
+      size, static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 
   auto result = buffer->CopyRawToHost(dst, 0, sizeof(float));
   TF_EXPECT_OK(result.Await());
   EXPECT_EQ(*(static_cast<float*>(dst)), 41.0f);
 
-  tsl::port::AlignedSizedFree(dst, tsl::Allocator::kAllocatorAlignment, size);
+  tsl::port::AlignedSizedFree(
+      dst, size,
+      static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 }
 
 TEST(TfrtGpuClientTest, CopyRawToHostOutOfRange) {
@@ -1219,13 +1224,15 @@ TEST(TfrtGpuClientTest, CopyRawToHostOutOfRange) {
       std::unique_ptr<PjRtBuffer> buffer,
       client->BufferFromHostLiteral(literal, client->memory_spaces()[0]));
   TF_ASSERT_OK_AND_ASSIGN(int64_t size, buffer->GetOnDeviceSizeInBytes());
-  void* dst =
-      tsl::port::AlignedMalloc(size, tsl::Allocator::kAllocatorAlignment);
+  void* dst = tsl::port::AlignedMalloc(
+      size, static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 
   auto result = buffer->CopyRawToHost(dst, 1, size);
   EXPECT_THAT(result.Await(), StatusIs(absl::StatusCode::kInvalidArgument,
                                        HasSubstr("invalid offset 1")));
-  tsl::port::AlignedSizedFree(dst, tsl::Allocator::kAllocatorAlignment, size);
+  tsl::port::AlignedSizedFree(
+      dst, size,
+      static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 }
 
 TEST(TfrtGpuClientTest, CopyRawToHostFuture) {
@@ -1246,8 +1253,9 @@ TEST(TfrtGpuClientTest, CopyRawToHostFuture) {
   buffer.reset();
   ready.OnReady([dst_promise = std::move(dst_promise),
                  size](absl::Status status) mutable {
-    void* dst =
-        tsl::port::AlignedMalloc(size, tsl::Allocator::kAllocatorAlignment);
+    void* dst = tsl::port::AlignedMalloc(
+        size,
+        static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
     dst_promise.Set(dst);
   });
 
@@ -1256,7 +1264,9 @@ TEST(TfrtGpuClientTest, CopyRawToHostFuture) {
   EXPECT_EQ(*(static_cast<float*>(dst)), 41.0f);
   EXPECT_EQ(*(static_cast<float*>(dst) + 1), 42.0f);
 
-  tsl::port::AlignedSizedFree(dst, tsl::Allocator::kAllocatorAlignment, size);
+  tsl::port::AlignedSizedFree(
+      dst, size,
+      static_cast<std::align_val_t>(tsl::Allocator::kAllocatorAlignment));
 }
 
 TEST(GpuTopology, FromProto) {
@@ -1413,7 +1423,7 @@ TEST(TfrtGpuClientTest, ExecutePinnedHostOutputTest) {
   EXPECT_GT(memory_stats.peak_memory_in_bytes, 0);
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> literal,
-                          result_buffers[0]->ToLiteralSync());
+                          result_buffers[0]->ToLiteral().Await());
   EXPECT_THAT(literal->data<int32_t>(), ElementsAreArray(kData));
 }
 
@@ -1450,10 +1460,10 @@ TEST(TfrtGpuClientTest, ExecutePinnedHostOutputTupleTest) {
   EXPECT_EQ(result_buffers[1]->memory_space()->kind(), "pinned_host");
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> literal,
-                          result_buffers[0]->ToLiteralSync());
+                          result_buffers[0]->ToLiteral().Await());
   EXPECT_THAT(literal->data<int32_t>(), ElementsAreArray(kData));
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> another_literal,
-                          result_buffers[1]->ToLiteralSync());
+                          result_buffers[1]->ToLiteral().Await());
   EXPECT_THAT(another_literal->data<int32_t>(), ElementsAreArray(kData));
 }
 
@@ -1622,7 +1632,7 @@ TEST(TfrtGpuClientTest, CopyToMemorySpace) {
     TF_ASSERT_OK_AND_ASSIGN(buffer,
                             buffer->CopyToMemorySpace(buffer->memory_space()));
     TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<Literal> received_literal,
-                            buffer->ToLiteralSync());
+                            buffer->ToLiteral().Await());
     EXPECT_THAT(received_literal->data<int32_t>(),
                 ElementsAreArray(literal.data<int32_t>()));
   }
@@ -1740,10 +1750,12 @@ TEST(TfrtGpuClientTest, DmaMapUnmap) {
   auto client = tensorflow::down_cast<TfrtGpuClient*>(gpu_client.get());
   size_t dma_size = 8192;
   size_t alignment = 4096;
-  auto host_dma_ptr = tsl::port::AlignedMalloc(dma_size, alignment);
+  auto host_dma_ptr = tsl::port::AlignedMalloc(
+      dma_size, static_cast<std::align_val_t>(alignment));
   auto host_dma_ptr_deleter =
       absl::Cleanup([host_dma_ptr, dma_size, alignment] {
-        tsl::port::AlignedSizedFree(host_dma_ptr, alignment, dma_size);
+        tsl::port::AlignedSizedFree(host_dma_ptr, dma_size,
+                                    static_cast<std::align_val_t>(alignment));
       });
 
   // DmaMap the first half of the buffer.
@@ -1817,10 +1829,12 @@ TEST(TfrtGpuClientTest, MultipleDeviceShareDmaMapping) {
 
   size_t dma_size = 2 * 1024 * 1024;
   size_t alignment = 1024;
-  auto host_dma_ptr = tsl::port::AlignedMalloc(dma_size, alignment);
+  auto host_dma_ptr = tsl::port::AlignedMalloc(
+      dma_size, static_cast<std::align_val_t>(alignment));
   auto host_dma_ptr_deleter =
       absl::Cleanup([host_dma_ptr, dma_size, alignment] {
-        tsl::port::AlignedSizedFree(host_dma_ptr, alignment, dma_size);
+        tsl::port::AlignedSizedFree(host_dma_ptr, dma_size,
+                                    static_cast<std::align_val_t>(alignment));
       });
 
   TF_EXPECT_OK(client->DmaMap(host_dma_ptr, dma_size));
@@ -1837,7 +1851,7 @@ TEST(TfrtGpuClientTest, MultipleDeviceShareDmaMapping) {
 
   TF_EXPECT_OK(transfer_manager->TransferRawDataToSubBuffer(
       0, host_dma_ptr, 0, size, true, []() {}));
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, second_buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, second_buffer->ToLiteral().Await());
   EXPECT_EQ(literal->element_count(), test_length);
   EXPECT_THAT(literal->data<int32_t>(), ElementsAreArray(data));
 
@@ -1944,7 +1958,7 @@ TEST(TfrtGpuClientTest, CreateAliasBuffer) {
   ASSERT_NE(alias_buffer.second, nullptr);
   TF_ASSERT_OK(std::move(alias_buffer.second)(result_buffer.get()));
   TF_ASSERT_OK_AND_ASSIGN(auto alias_literal,
-                          alias_buffer.first->ToLiteralSync());
+                          alias_buffer.first->ToLiteral().Await());
 
   // Expected result: data + 1
   EXPECT_TRUE(LiteralTestUtil::Equal(

From 960635397c7be3d84fd4afa9b876b5d824435769 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 03:58:27 -0800
Subject: [PATCH 420/753] Automated Code Change

PiperOrigin-RevId: 845697709
---
 tensorflow/core/kernels/quantize_op.cc        |   4 +-
 tensorflow/core/kernels/quantize_op_test.cc   |  16 +-
 tensorflow/core/kernels/quantized_add_op.cc   |  12 +-
 .../core/kernels/quantized_concat_op.cc       |   2 +-
 .../core/kernels/quantized_concat_op_test.cc  |  10 +-
 tensorflow/core/kernels/quantized_conv_ops.cc |  19 +--
 .../core/kernels/quantized_matmul_op.cc       |   6 +-
 tensorflow/core/kernels/quantized_mul_op.cc   |  12 +-
 .../core/kernels/quantized_pooling_ops.cc     |  17 +-
 .../core/kernels/quantized_reshape_op_test.cc |   2 +-
 .../kernels/quantized_resize_bilinear_op.cc   |  38 ++---
 .../quantized_resize_bilinear_op_test.cc      |   6 +-
 tensorflow/core/kernels/queue_base.h          |  17 +-
 tensorflow/core/kernels/queue_op.h            |   2 +-
 .../core/kernels/ragged_range_op_test.cc      |   8 +-
 .../kernels/ragged_tensor_to_sparse_kernel.cc |   4 +-
 .../kernels/ragged_tensor_to_tensor_op.cc     |   8 +-
 .../ragged_tensor_to_tensor_op_test.cc        | 147 ++++++++++--------
 .../kernels/ragged_tensor_to_variant_op.cc    |   2 +-
 .../ragged_tensor_to_variant_op_test.cc       |  10 +-
 .../ragged_tensor_to_variant_op_test.h        |   6 +-
 .../core/kernels/ragged_tensor_variant.cc     |   6 +-
 .../core/kernels/ragged_tensor_variant.h      |   4 +-
 tensorflow/core/kernels/random_binomial_op.cc |  27 ++--
 .../core/kernels/random_binomial_op_test.cc   |   2 +-
 .../core/kernels/random_index_shuffle_test.cc |   4 +-
 tensorflow/core/kernels/random_op.cc          |   2 +-
 tensorflow/core/kernels/random_op.h           |   4 +-
 tensorflow/core/kernels/random_op_cpu.h       |   8 +-
 tensorflow/core/kernels/random_op_test.cc     |   4 +-
 tensorflow/core/kernels/random_ops_util.h     |  25 +--
 tensorflow/core/kernels/random_poisson_op.cc  |   2 +-
 .../core/kernels/random_poisson_op_test.cc    |   4 +-
 .../core/kernels/random_shuffle_queue_op.cc   |  10 +-
 tensorflow/core/kernels/range_sampler.cc      |   8 +-
 tensorflow/core/kernels/range_sampler.h       |   8 +-
 tensorflow/core/kernels/range_sampler_test.cc |  14 +-
 tensorflow/core/kernels/record_input_op.cc    |   4 +-
 tensorflow/core/kernels/record_yielder.cc     |  21 +--
 tensorflow/core/kernels/record_yielder.h      |  14 +-
 tensorflow/core/kernels/reduce_join_op.cc     |  22 +--
 tensorflow/core/kernels/reduction_ops.h       |  12 +-
 tensorflow/core/kernels/reduction_ops_all.cc  |   6 +-
 tensorflow/core/kernels/reduction_ops_any.cc  |   6 +-
 .../core/kernels/reduction_ops_common.cc      |   6 +-
 .../core/kernels/reduction_ops_common.h       |   2 +-
 tensorflow/core/kernels/reduction_ops_max.cc  |  36 ++---
 tensorflow/core/kernels/reduction_ops_min.cc  |  37 +++--
 tensorflow/core/kernels/reduction_ops_test.cc |  42 ++---
 tensorflow/core/kernels/reference_gemm.h      |   8 +-
 50 files changed, 363 insertions(+), 333 deletions(-)

diff --git a/tensorflow/core/kernels/quantize_op.cc b/tensorflow/core/kernels/quantize_op.cc
index c6cdbed7c0d5f6..c63c07a394b6c6 100644
--- a/tensorflow/core/kernels/quantize_op.cc
+++ b/tensorflow/core/kernels/quantize_op.cc
@@ -67,7 +67,7 @@ class QuantizeV2Op : public OpKernel {
             : (static_cast<double>(std::numeric_limits<T>::max()) -
                static_cast<double>(std::numeric_limits<T>::min()) + 1) /
                   2.0f;
-    string mode_string;
+    std::string mode_string;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("mode", &mode_string));
     OP_REQUIRES(ctx,
                 (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST" ||
@@ -83,7 +83,7 @@ class QuantizeV2Op : public OpKernel {
       mode_ = QUANTIZE_MODE_SCALED;
     }
 
-    string round_mode_string;
+    std::string round_mode_string;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
     OP_REQUIRES(ctx,
                 (round_mode_string == "HALF_AWAY_FROM_ZERO" ||
diff --git a/tensorflow/core/kernels/quantize_op_test.cc b/tensorflow/core/kernels/quantize_op_test.cc
index 76fe2e9f963bef..ec486ba87dc990 100644
--- a/tensorflow/core/kernels/quantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_op_test.cc
@@ -62,7 +62,7 @@ TEST_F(QuantizedOpTest, QuantizeV2) {
 template <typename T>
 std::vector<T> ScalePerSliceAlongAxis(std::vector<int64_t> dims, int axis,
                                       const std::vector<T>& data) {
-  uint32 seed = 123;
+  uint32_t seed = 123;
   std::minstd_rand rng(seed);
   int64_t out_size = 1;
   for (int dim : dims) {
@@ -373,14 +373,14 @@ TEST_F(QuantizedOpTest, QuantizeV2_32Bit) {
   Tensor expected(allocator(), DT_QINT32, TensorShape({element_count}));
   test::FillValues<qint32>(&expected,
                            {
-                               std::numeric_limits<int32>::min(),
+                               std::numeric_limits<int32_t>::min(),
                                0,
-                               static_cast<int32>(1.0f * (1 << 23)),
-                               static_cast<int32>(1.25f * (1 << 23)),
-                               static_cast<int32>(1.75f * (1 << 23)),
-                               static_cast<int32>(127.0f * (1 << 23)),
-                               static_cast<int32>(255.0f * (1 << 23)),
-                               std::numeric_limits<int32>::max(),
+                               static_cast<int32_t>(1.0f * (1 << 23)),
+                               static_cast<int32_t>(1.25f * (1 << 23)),
+                               static_cast<int32_t>(1.75f * (1 << 23)),
+                               static_cast<int32_t>(127.0f * (1 << 23)),
+                               static_cast<int32_t>(255.0f * (1 << 23)),
+                               std::numeric_limits<int32_t>::max(),
                            });
   // We expect there will be some fuzziness in the lower bits, since this is
   // converting from float.
diff --git a/tensorflow/core/kernels/quantized_add_op.cc b/tensorflow/core/kernels/quantized_add_op.cc
index 5cf7ed1456034e..e8904e8a088395 100644
--- a/tensorflow/core/kernels/quantized_add_op.cc
+++ b/tensorflow/core/kernels/quantized_add_op.cc
@@ -149,7 +149,7 @@ void ScalarAddition(OpKernelContext* context, const quint8* full_input,
     full_input_in_output_range_64 =
         std::min(full_input_in_output_range_64, highest_quantized);
     const int32_t full_input_in_output_range =
-        static_cast<int32>(full_input_in_output_range_64);
+        static_cast<int32_t>(full_input_in_output_range_64);
     output[i] = full_input_in_output_range + scalar_in_output_range;
   }
 }
@@ -272,13 +272,15 @@ void VectorAddition(OpKernelContext* context, const quint8* x_data, float min_x,
     int64_t x_in_output_range_64 = x_0_int64 + (x_value * x_mult_int32);
     x_in_output_range_64 = std::max(x_in_output_range_64, lowest_quantized);
     x_in_output_range_64 = std::min(x_in_output_range_64, highest_quantized);
-    const int32_t x_in_output_range = static_cast<int32>(x_in_output_range_64);
+    const int32_t x_in_output_range =
+        static_cast<int32_t>(x_in_output_range_64);
 
     const int64_t y_value = static_cast<int64_t>(y_data[i]);
     int64_t y_in_output_range_64 = y_0_int64 + (y_value * y_mult_int32);
     y_in_output_range_64 = std::max(y_in_output_range_64, lowest_quantized);
     y_in_output_range_64 = std::min(y_in_output_range_64, highest_quantized);
-    const int32_t y_in_output_range = static_cast<int32>(y_in_output_range_64);
+    const int32_t y_in_output_range =
+        static_cast<int32_t>(y_in_output_range_64);
 
     output[i] = x_in_output_range + y_in_output_range;
   }
@@ -430,7 +432,7 @@ void VectorTensorAddition(const quint8* vector_data, float min_vector,
     vector_in_output_range_64 =
         std::min(vector_in_output_range_64, highest_quantized);
     const int32_t vector_in_output_range =
-        static_cast<int32>(vector_in_output_range_64);
+        static_cast<int32_t>(vector_in_output_range_64);
 
     const int64_t tensor_value = static_cast<int64_t>(tensor_data[i]);
     int64_t tensor_in_output_range_64 =
@@ -440,7 +442,7 @@ void VectorTensorAddition(const quint8* vector_data, float min_vector,
     tensor_in_output_range_64 =
         std::min(tensor_in_output_range_64, highest_quantized);
     const int32_t tensor_in_output_range =
-        static_cast<int32>(tensor_in_output_range_64);
+        static_cast<int32_t>(tensor_in_output_range_64);
 
     output[i] = vector_in_output_range + tensor_in_output_range;
   }
diff --git a/tensorflow/core/kernels/quantized_concat_op.cc b/tensorflow/core/kernels/quantized_concat_op.cc
index 7f7c59e2f40fc5..613fef99ea67c9 100644
--- a/tensorflow/core/kernels/quantized_concat_op.cc
+++ b/tensorflow/core/kernels/quantized_concat_op.cc
@@ -183,7 +183,7 @@ class QuantizedConcatOp : public OpKernel {
         errors::InvalidArgument(
             "Concat dim tensor should be a scalar integer, but got shape ",
             concat_dim_tensor->shape().DebugString()));
-    const int32_t concat_dim = concat_dim_tensor->scalar<int32>()();
+    const int32_t concat_dim = concat_dim_tensor->scalar<int32_t>()();
     OpInputList values;
     OP_REQUIRES_OK(context, context->input_list("values", &values));
     const size_t N = values.size();
diff --git a/tensorflow/core/kernels/quantized_concat_op_test.cc b/tensorflow/core/kernels/quantized_concat_op_test.cc
index 81f8b718d2b41e..cebe247f77f460 100644
--- a/tensorflow/core/kernels/quantized_concat_op_test.cc
+++ b/tensorflow/core/kernels/quantized_concat_op_test.cc
@@ -88,7 +88,7 @@ void QuantizedConcatTest::TestInvalidMinMax(const Tensor& first_min,
   Tensor second_quantized(DT_QUINT8, {1});
   test::FillValues<quint8>(&second_quantized, {1});
 
-  AddInputFromArray<int32>(TensorShape({}), {0});
+  AddInputFromArray<int32_t>(TensorShape({}), {0});
   AddInputFromArray<quint8>(first_quantized.shape(),
                             first_quantized.flat<quint8>());
   AddInputFromArray<quint8>(second_quantized.shape(),
@@ -144,7 +144,7 @@ void QuantizedConcatTest::TestSmall8Bit(float first_min, float first_max,
                           {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12,
                            13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24});
 
-  AddInputFromArray<int32>(TensorShape({}), {0});
+  AddInputFromArray<int32_t>(TensorShape({}), {0});
   AddInputFromArray<quint8>(first_quantized.shape(),
                             first_quantized.flat<quint8>());
   AddInputFromArray<quint8>(second_quantized.shape(),
@@ -210,7 +210,7 @@ void QuantizedConcatTest::TestSmall32Bit(float first_min, float first_max,
       {100,  200,  300,  400,  500,  600,  700,  800,  900,  1000, 1100, 1200,
        1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400});
 
-  AddInputFromArray<int32>(TensorShape({}), {0});
+  AddInputFromArray<int32_t>(TensorShape({}), {0});
   AddInputFromArray<qint32>(first_quantized.shape(),
                             first_quantized.flat<qint32>());
   AddInputFromArray<qint32>(second_quantized.shape(),
@@ -272,7 +272,7 @@ void QuantizedConcatTest::TestSecondDim8Bit(float first_min, float first_max,
                           {1, 2, 3, 4,  5,  6,  13, 14, 15, 16, 17, 18,
                            7, 8, 9, 10, 11, 12, 19, 20, 21, 22, 23, 24});
 
-  AddInputFromArray<int32>(TensorShape({}), {1});
+  AddInputFromArray<int32_t>(TensorShape({}), {1});
   AddInputFromArray<quint8>(first_quantized.shape(),
                             first_quantized.flat<quint8>());
   AddInputFromArray<quint8>(second_quantized.shape(),
@@ -303,7 +303,7 @@ static void ConcatHelper(::testing::benchmark::State& state,
   const int kDim1 = 100;
   TensorShape shape({kDim1, dim2});
 
-  Tensor concat_dim = test::AsScalar<int32>(concat_dimension);
+  Tensor concat_dim = test::AsScalar<int32_t>(concat_dimension);
   Tensor in0(dt, shape);
   in0.flat<T>().setRandom();
   Tensor in1(dt, shape);
diff --git a/tensorflow/core/kernels/quantized_conv_ops.cc b/tensorflow/core/kernels/quantized_conv_ops.cc
index 3f3e2743d674f4..14072547b310e7 100644
--- a/tensorflow/core/kernels/quantized_conv_ops.cc
+++ b/tensorflow/core/kernels/quantized_conv_ops.cc
@@ -62,8 +62,9 @@ class ReferenceConvFunctor {
                   int output_shift, int output_offset, int output_mult) {
     // Set up some constants we need for the output down-shifting and
     // saturation.
-    const int32_t highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
-    const int32_t lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
+    const int32_t highest =
+        static_cast<int32_t>(Eigen::NumTraits<T3>::highest());
+    const int32_t lowest = static_cast<int32_t>(Eigen::NumTraits<T3>::lowest());
 
     // When we're converting the 32 bit accumulator to a lower bit depth, we
     // need to add on 0.5 in fixed-point terms to make the operation round half
@@ -150,7 +151,7 @@ class ReferenceConvFunctor {
                     // We're promoting the T1 type to a higher bit depth here as
                     // we do the subtraction.
                     input_value =
-                        static_cast<int32>(input_source_value) - input_offset;
+                        static_cast<int32_t>(input_source_value) - input_offset;
                   } else {
                     input_value = 0;
                   }
@@ -161,7 +162,7 @@ class ReferenceConvFunctor {
                                   (in_channel * filter_count) + out_channel];
                   // Another promotion to 32 bit, as above.
                   const int32_t filter_value =
-                      static_cast<int32>(filter_source_value) - filter_offset;
+                      static_cast<int32_t>(filter_source_value) - filter_offset;
                   total += (input_value * filter_value);
                 }
               }
@@ -406,9 +407,9 @@ class Im2ColConvFunctor {
         // The gemmlowp optimized library only works for a particular set of
         // data types, so check if we meet those requirements and fall back to a
         // slower reference implementation if not.
-        const uint8* im2col_data_as_uint8 = &(im2col_buffer->value);
-        const uint8* filter_data_as_uint8 = &(filter_data->value);
-        int32* output_data_as_int32 = &(chunk_output_data->value);
+        const uint8_t* im2col_data_as_uint8 = &(im2col_buffer->value);
+        const uint8_t* filter_data_as_uint8 = &(filter_data->value);
+        int32_t* output_data_as_int32 = &(chunk_output_data->value);
         // All of the transpose_* variables are currently compile-time consts,
         // so we could just hard-code these values too, but that would break if
         // anybody changed those values in the future (e.g. to match the ability
@@ -472,7 +473,7 @@ class QuantizedConv2DOp : public OpKernel {
         context, (strides_[0] == 1 && strides_[3] == 1),
         errors::InvalidArgument("Current implementation does not yet support "
                                 "strides in the batch and depth dimensions."));
-    std::vector<int32> dilations;
+    std::vector<int32_t> dilations;
     OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations));
     OP_REQUIRES(context, dilations.size() == 4,
                 errors::InvalidArgument("Dilations field must "
@@ -612,7 +613,7 @@ class QuantizedConv2DOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> strides_;
+  std::vector<int32_t> strides_;
   Padding padding_;
 };
 
diff --git a/tensorflow/core/kernels/quantized_matmul_op.cc b/tensorflow/core/kernels/quantized_matmul_op.cc
index ae65dc3b5e38ce..5f7143e183991a 100644
--- a/tensorflow/core/kernels/quantized_matmul_op.cc
+++ b/tensorflow/core/kernels/quantized_matmul_op.cc
@@ -38,9 +38,9 @@ template <bool TransposeA, bool TransposeB, bool TransposeC>
 void GemmlowpMultiply(OpKernelContext* op_context, const quint8* a_data,
                       const quint8* b_data, qint32* c_data, int m, int n, int k,
                       int offset_a, int offset_b, int lda, int ldb, int ldc) {
-  const uint8* a_data_as_uint8 = &(a_data->value);
-  const uint8* b_data_as_uint8 = &(b_data->value);
-  int32* c_data_as_int32 = &(c_data->value);
+  const uint8_t* a_data_as_uint8 = &(a_data->value);
+  const uint8_t* b_data_as_uint8 = &(b_data->value);
+  int32_t* c_data_as_int32 = &(c_data->value);
   static const gemmlowp::MapOrder ResultOrder =
       !TransposeC ? gemmlowp::MapOrder::RowMajor : gemmlowp::MapOrder::ColMajor;
   static const gemmlowp::MapOrder LhsOrder =
diff --git a/tensorflow/core/kernels/quantized_mul_op.cc b/tensorflow/core/kernels/quantized_mul_op.cc
index fed18e3a6f917d..9028137e49949d 100644
--- a/tensorflow/core/kernels/quantized_mul_op.cc
+++ b/tensorflow/core/kernels/quantized_mul_op.cc
@@ -38,9 +38,9 @@ void ScalarMultiply(OpKernelContext* context, const T* full_input,
                     T scalar_input, int32_t scalar_input_offset,
                     Toutput* output) {
   const int32_t scalar_minus_offset =
-      static_cast<int32>(scalar_input) - scalar_input_offset;
+      static_cast<int32_t>(scalar_input) - scalar_input_offset;
   for (int i = 0; i < num_elements; ++i) {
-    output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
+    output[i] = (static_cast<int32_t>(full_input[i]) - full_input_offset) *
                 scalar_minus_offset;
   }
 }
@@ -115,8 +115,8 @@ void VectorMultiply(OpKernelContext* context, const T* x_data, int32_t offset_x,
                     const T* y_data, int32_t offset_y, int64_t num_elements,
                     Toutput* output) {
   for (int i = 0; i < num_elements; ++i) {
-    output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
-                (static_cast<int32>(y_data[i]) - offset_y);
+    output[i] = (static_cast<int32_t>(x_data[i]) - offset_x) *
+                (static_cast<int32_t>(y_data[i]) - offset_y);
   }
 }
 
@@ -193,8 +193,8 @@ void VectorTensorMultiply(const T* vector_data, int32_t vector_offset,
                           Toutput* output) {
   for (int i = 0; i < tensor_num_elements; ++i) {
     const int64_t vector_i = i % vector_num_elements;
-    output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
-                (static_cast<int32>(tensor_data[i]) - tensor_offset);
+    output[i] = (static_cast<int32_t>(vector_data[vector_i]) - vector_offset) *
+                (static_cast<int32_t>(tensor_data[i]) - tensor_offset);
   }
 }
 
diff --git a/tensorflow/core/kernels/quantized_pooling_ops.cc b/tensorflow/core/kernels/quantized_pooling_ops.cc
index 5efedd082c4aea..5a05d1635c1d6b 100644
--- a/tensorflow/core/kernels/quantized_pooling_ops.cc
+++ b/tensorflow/core/kernels/quantized_pooling_ops.cc
@@ -95,8 +95,9 @@ class QuantizedAvgPoolingOp : public OpKernel {
                    params.forward_output_shape(&params_forward_output_shape));
     OP_REQUIRES_OK(context, context->allocate_output(
                                 0, params_forward_output_shape, &output));
-    const int32_t highest = static_cast<int32>(Eigen::NumTraits<T>::highest());
-    const int32_t lowest = static_cast<int32>(Eigen::NumTraits<T>::lowest());
+    const int32_t highest =
+        static_cast<int32_t>(Eigen::NumTraits<T>::highest());
+    const int32_t lowest = static_cast<int32_t>(Eigen::NumTraits<T>::lowest());
 
     // TODO(vrv): Switch this to the Eigen::Tensor version of
     // SpatialAvgPooling once that version is running quickly.
@@ -105,12 +106,12 @@ class QuantizedAvgPoolingOp : public OpKernel {
     Tensor int32_output(DT_INT32, params_forward_output_shape);
     // Cast input to int32 tensor and call SpatialAvgPool.
     Tensor int32_input(DT_INT32, tensor_in.shape());
-    int32_input.flat<int32>() = tensor_in.flat<T>().template cast<int32>();
-    SpatialAvgPool<Device, int32>(context, &int32_output, int32_input, params,
-                                  padding_);
+    int32_input.flat<int32_t>() = tensor_in.flat<T>().template cast<int32_t>();
+    SpatialAvgPool<Device, int32_t>(context, &int32_output, int32_input, params,
+                                    padding_);
 
     // Clamp the int32 output back into quantized space.
-    output->flat<T>() = int32_output.flat<int32>()
+    output->flat<T>() = int32_output.flat<int32_t>()
                             .cwiseMax(lowest)
                             .cwiseMin(highest)
                             .template cast<T>();
@@ -124,8 +125,8 @@ class QuantizedAvgPoolingOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
 };
 
diff --git a/tensorflow/core/kernels/quantized_reshape_op_test.cc b/tensorflow/core/kernels/quantized_reshape_op_test.cc
index a7066f98f39e99..a2c7b60bbc71db 100644
--- a/tensorflow/core/kernels/quantized_reshape_op_test.cc
+++ b/tensorflow/core/kernels/quantized_reshape_op_test.cc
@@ -56,7 +56,7 @@ TEST_F(QuantizedReshapeTest, Reshape) {
     expected.flat<quint8>()(i) = quint8(i);
   }
   AddInputFromArray<quint8>(input.shape(), input.flat<quint8>());
-  AddInputFromList<int32>({3}, {5, 10, 4});  // shape
+  AddInputFromList<int32_t>({3}, {5, 10, 4});  // shape
   AddInputFromArray<float>(TensorShape({1}), {-10});
   AddInputFromArray<float>(TensorShape({1}), {20});
   TF_ASSERT_OK(RunOpKernel());
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
index 2efdd38dc6ef45..4e6f072973b3e1 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op.cc
@@ -132,7 +132,7 @@ inline T ComputeLerp(const T top_left, const T top_right, const T bottom_left,
       MulOffset<T, T_SCALE, T_CALC>(bottom_right, bottom_left, x_lerp);
   const T_CALC out = top + (bottom - top) / RESOLUTION_MULT * y_lerp;
   return static_cast<T>(
-      static_cast<int32>((out + RESOLUTION_MULT / 2) / RESOLUTION_MULT));
+      static_cast<int32_t>((out + RESOLUTION_MULT / 2) / RESOLUTION_MULT));
 }
 
 #ifdef QUANTIZED_RESIZE_BILINEAR_USE_NEON
@@ -266,7 +266,7 @@ inline void OutputLerpForChannels(const InterpolationCache<T_SCALE>& xs,
 }
 
 template <int RES>
-inline void OutputLerp8x8x1(const InterpolationCache<int16>& xs,
+inline void OutputLerp8x8x1(const InterpolationCache<int16_t>& xs,
                             const int64_t x_start, const int16_t ys_ilerp,
                             const float min, const float max,
                             const quint8* const ys_input_lower_ptr,
@@ -284,7 +284,7 @@ inline void OutputLerp8x8x1(const InterpolationCache<int16>& xs,
 
 #else
   for (int x = x_start; x < x_start + 8; ++x) {
-    OutputLerpForChannels<RES, quint8, int16, int16>(
+    OutputLerpForChannels<RES, quint8, int16_t, int16_t>(
         xs, x, ys_ilerp, 1, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
         output_y_ptr);
   }
@@ -292,7 +292,7 @@ inline void OutputLerp8x8x1(const InterpolationCache<int16>& xs,
 }
 
 template <int RES>
-inline void OutputLerp8x8x3(const InterpolationCache<int16>& xs,
+inline void OutputLerp8x8x3(const InterpolationCache<int16_t>& xs,
                             const int64_t x_start, const int16_t ys_ilerp,
                             const float min, const float max,
                             const quint8* const ys_input_lower_ptr,
@@ -325,7 +325,7 @@ inline void OutputLerp8x8x3(const InterpolationCache<int16>& xs,
 
 #else
   for (int x = x_start; x < x_start + 8; ++x) {
-    OutputLerpForChannels<RES, quint8, int16, int16>(
+    OutputLerpForChannels<RES, quint8, int16_t, int16_t>(
         xs, x, ys_ilerp, 3, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
         output_y_ptr);
   }
@@ -333,7 +333,7 @@ inline void OutputLerp8x8x3(const InterpolationCache<int16>& xs,
 }
 
 template <int RESOLUTION>
-inline void OutputLerp32x4x1(const InterpolationCache<int32>& xs,
+inline void OutputLerp32x4x1(const InterpolationCache<int32_t>& xs,
                              const int64_t x_start, const int32_t ys_ilerp,
                              const float min, const float max,
                              const qint32* const ys_input_lower_ptr,
@@ -373,7 +373,7 @@ inline void OutputLerp32x4x1(const InterpolationCache<int32>& xs,
 
 #else
   for (int x = x_start; x < x_start + 4; ++x) {
-    OutputLerpForChannels<RESOLUTION, qint32, int32, int64_t>(
+    OutputLerpForChannels<RESOLUTION, qint32, int32_t, int64_t>(
         xs, x, ys_ilerp, 1, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
         output_y_ptr);
   }
@@ -381,7 +381,7 @@ inline void OutputLerp32x4x1(const InterpolationCache<int32>& xs,
 }
 
 template <int RESOLUTION>
-inline void OutputLerp32x4x3(const InterpolationCache<int32>& xs,
+inline void OutputLerp32x4x3(const InterpolationCache<int32_t>& xs,
                              const int64_t x_start, const int32_t ys_ilerp,
                              const float min, const float max,
                              const qint32* const ys_input_lower_ptr,
@@ -458,7 +458,7 @@ inline void OutputLerp32x4x3(const InterpolationCache<int32>& xs,
 
 #else
   for (int x = x_start; x < x_start + 4; ++x) {
-    OutputLerpForChannels<RESOLUTION, qint32, int32, int64_t>(
+    OutputLerpForChannels<RESOLUTION, qint32, int32_t, int64_t>(
         xs, x, ys_ilerp, 3, min, max, ys_input_lower_ptr, ys_input_upper_ptr,
         output_y_ptr);
   }
@@ -543,10 +543,10 @@ void ResizeImage<qint32>(typename TTypes<qint32, 4>::ConstTensor images,
 
   CHECK_NOTNULL(output);
 
-  const InterpolationCache<int32> xs =
-      BuildLerpCache<int32>(out_width, in_width, width_scale, channels,
-                            RESOLUTION, half_pixel_centers);
-  const InterpolationCache<int32> ys = BuildLerpCache<int32>(
+  const InterpolationCache<int32_t> xs =
+      BuildLerpCache<int32_t>(out_width, in_width, width_scale, channels,
+                              RESOLUTION, half_pixel_centers);
+  const InterpolationCache<int32_t> ys = BuildLerpCache<int32_t>(
       out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers);
 
   const int64_t in_row_size = in_width * channels;
@@ -581,7 +581,7 @@ void ResizeImage<qint32>(typename TTypes<qint32, 4>::ConstTensor images,
         }
       }
       for (; x < out_width; ++x) {
-        OutputLerpForChannels<RESOLUTION, qint32, int32, int64_t>(
+        OutputLerpForChannels<RESOLUTION, qint32, int32_t, int64_t>(
             xs, x, ys_ilerp, channels, in_min, in_max, ys_input_lower_ptr,
             ys_input_upper_ptr, output_y_ptr);
       }
@@ -606,10 +606,10 @@ void ResizeImage<quint8>(typename TTypes<quint8, 4>::ConstTensor images,
 
   CHECK_NOTNULL(output);
 
-  const InterpolationCache<int16> xs =
-      BuildLerpCache<int16>(out_width, in_width, width_scale, channels,
-                            RESOLUTION, half_pixel_centers);
-  const InterpolationCache<int16> ys = BuildLerpCache<int16>(
+  const InterpolationCache<int16_t> xs =
+      BuildLerpCache<int16_t>(out_width, in_width, width_scale, channels,
+                              RESOLUTION, half_pixel_centers);
+  const InterpolationCache<int16_t> ys = BuildLerpCache<int16_t>(
       out_height, in_height, height_scale, 1, RESOLUTION, half_pixel_centers);
 
   const int64_t in_row_size = in_width * channels;
@@ -646,7 +646,7 @@ void ResizeImage<quint8>(typename TTypes<quint8, 4>::ConstTensor images,
         }
       }
       for (; x < out_width; ++x) {
-        OutputLerpForChannels<RESOLUTION, quint8, int16, int16>(
+        OutputLerpForChannels<RESOLUTION, quint8, int16_t, int16_t>(
             xs, x, ys_ilerp, channels, in_min, in_max, ys_input_lower_ptr,
             ys_input_upper_ptr, output_y_ptr);
       }
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
index 52c66efd890ea6..8c2426ee6621b7 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
@@ -171,8 +171,8 @@ void CheckTensorValue(const T* in_data, const T* out_data, const int batch_size,
           const float val = QuantizedToFloat<T>(qval, min, max);
           if (!relative) {
             const int q_tolerance = std::round(tolerance);
-            EXPECT_TRUE(std::abs(static_cast<int32>(ref_qval) -
-                                 static_cast<int32>(qval)) <= q_tolerance)
+            EXPECT_TRUE(std::abs(static_cast<int32_t>(ref_qval) -
+                                 static_cast<int32_t>(qval)) <= q_tolerance)
                 << "ref = " << ref_val << ", val = " << val << ", " << b << ", "
                 << y << ", " << x << ", " << c << ", qval = " << qval
                 << ", ref qval = " << ref_qval << ", " << q_tolerance;
@@ -197,7 +197,7 @@ void TestResizeBilinear(const Tensor& image_tensor, const DataType dt,
   Scope root = Scope::NewRootScope();
 
   Output placeholder = ops::Placeholder(root.WithOpName("placeholder"), dt);
-  Output size = ops::Const<int32>(root.WithOpName("size"), new_size);
+  Output size = ops::Const<int32_t>(root.WithOpName("size"), new_size);
   Output in_min = ops::Const<float>(root.WithOpName("min"), min);
   Output in_max = ops::Const<float>(root.WithOpName("max"), max);
 
diff --git a/tensorflow/core/kernels/queue_base.h b/tensorflow/core/kernels/queue_base.h
index d39ab45498b843..e55693b4d540d4 100644
--- a/tensorflow/core/kernels/queue_base.h
+++ b/tensorflow/core/kernels/queue_base.h
@@ -46,7 +46,7 @@ class QueueBase : public QueueInterface {
   //   name: A name to use for the queue.
   QueueBase(int32_t capacity, const DataTypeVector& component_dtypes,
             const std::vector<TensorShape>& component_shapes,
-            const string& name);
+            const std::string& name);
 
   // Implementations of QueueInterface methods --------------------------------
   const DataTypeVector& component_dtypes() const override {
@@ -64,7 +64,7 @@ class QueueBase : public QueueInterface {
     return component_shapes_;
   }
 
-  int32 capacity() const { return capacity_; }
+  int32_t capacity() const { return capacity_; }
 
   bool is_closed() const override {
     mutex_lock lock(mu_);
@@ -103,7 +103,7 @@ class QueueBase : public QueueInterface {
   };
 
   // Returns the number of components in a queue-element tuple.
-  int32 num_components() const { return component_dtypes_.size(); }
+  int32_t num_components() const { return component_dtypes_.size(); }
 
   // True if shapes were specified.  If so, inputs will be validated
   // against them, etc.
@@ -135,26 +135,27 @@ class QueueBase : public QueueInterface {
   ~QueueBase() override;
 
   // Helpers for implementing MatchesNodeDef().
-  static string ShapeListString(const absl::Span<const TensorShape>& shapes);
+  static std::string ShapeListString(
+      const absl::Span<const TensorShape>& shapes);
   absl::Status MatchesNodeDefOp(const NodeDef& node_def,
-                                const string& op) const;
+                                const std::string& op) const;
   absl::Status MatchesNodeDefCapacity(const NodeDef& node_def,
                                       int32_t capacity) const;
   absl::Status MatchesNodeDefTypes(const NodeDef& node_def) const;
   absl::Status MatchesNodeDefShapes(const NodeDef& node_def) const;
 
  protected:
-  const int32 capacity_;
+  const int32_t capacity_;
   const DataTypeVector component_dtypes_;
   const std::vector<TensorShape> component_shapes_;
-  const string name_;
+  const std::string name_;
   mutable mutex mu_;
   bool closed_ TF_GUARDED_BY(mu_);
 
   struct Attempt;
   typedef std::function<RunResult(Attempt*)> RunCallback;
   struct Attempt {
-    int32 elements_requested;
+    int32_t elements_requested;
     DoneCallback done_callback;  // must be run outside mu_
     OpKernelContext* context;
     CancellationManager* cancellation_manager;  // not owned
diff --git a/tensorflow/core/kernels/queue_op.h b/tensorflow/core/kernels/queue_op.h
index 57a771d91fcb50..4c5c1ee10b0433 100644
--- a/tensorflow/core/kernels/queue_op.h
+++ b/tensorflow/core/kernels/queue_op.h
@@ -39,7 +39,7 @@ class QueueOp : public ResourceOpKernel<QueueInterface> {
 
  protected:
   // Variables accessible by subclasses
-  int32 capacity_;
+  int32_t capacity_;
   DataTypeVector component_types_;
 
  private:
diff --git a/tensorflow/core/kernels/ragged_range_op_test.cc b/tensorflow/core/kernels/ragged_range_op_test.cc
index 699531a8d3647c..9a951af9017a36 100644
--- a/tensorflow/core/kernels/ragged_range_op_test.cc
+++ b/tensorflow/core/kernels/ragged_range_op_test.cc
@@ -90,10 +90,10 @@ TEST_F(RaggedRangeOpTest, RangeSizeOverflow) {
 }
 
 TEST_F(RaggedRangeOpTest, RangeSizeOverflow2) {
-  BuildRaggedRangeGraph<int64>();
-  AddInputFromArray<int64>(TensorShape({}), {static_cast<int64_t>(5e18)});
-  AddInputFromArray<int64>(TensorShape({}), {static_cast<int64_t>(-5e18)});
-  AddInputFromArray<int64>(TensorShape({}), {-1});
+  BuildRaggedRangeGraph<int64_t>();
+  AddInputFromArray<int64_t>(TensorShape({}), {static_cast<int64_t>(5e18)});
+  AddInputFromArray<int64_t>(TensorShape({}), {static_cast<int64_t>(-5e18)});
+  AddInputFromArray<int64_t>(TensorShape({}), {-1});
 
   EXPECT_EQ(absl::StrCat("Requires ((limit - start) / delta) <= ",
                          std::numeric_limits<int64_t>::max()),
diff --git a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
index 7f92a50133ce99..ffb186af87ece4 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_sparse_kernel.cc
@@ -228,8 +228,8 @@ class RaggedTensorToSparseOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse")
                             .Device(DEVICE_CPU)
-                            .TypeConstraint<int32>("Tsplits"),
-                        RaggedTensorToSparseOp<int32>);
+                            .TypeConstraint<int32_t>("Tsplits"),
+                        RaggedTensorToSparseOp<int32_t>);
 
 REGISTER_KERNEL_BUILDER(Name("RaggedTensorToSparse")
                             .Device(DEVICE_CPU)
diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
index 516a0cddcb6acc..28820593a4b5c5 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op.cc
@@ -445,8 +445,8 @@ void copy_array<tstring, int64_t>(tstring* dst, const tstring* src,
 }
 
 template <>
-void copy_array<tstring, int32>(tstring* dst, const tstring* src,
-                                int32_t size) {
+void copy_array<tstring, int32_t>(tstring* dst, const tstring* src,
+                                  int32_t size) {
   slow_copy_array(dst, src, size);
 }
 
@@ -460,8 +460,8 @@ void copy_array<Eigen::half, int64_t>(Eigen::half* dst, const Eigen::half* src,
 }
 
 template <>
-void copy_array<Eigen::half, int32>(Eigen::half* dst, const Eigen::half* src,
-                                    int32_t size) {
+void copy_array<Eigen::half, int32_t>(Eigen::half* dst, const Eigen::half* src,
+                                      int32_t size) {
   slow_copy_array(dst, src, size);
 }
 
diff --git a/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc
index b0f53598d32de9..e23a2c07ed861b 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_tensor_op_test.cc
@@ -51,7 +51,8 @@ class RaggedTensorToTensorOpTest : public ::tensorflow::OpsTestBase {
   // Builds the tensorflow test graph for RaggedTensorToTensor.
   template <typename VALUE_TYPE, typename INDEX_TYPE>
   void BuildRaggedTensorToTensorGraph(
-      const TensorShape& shape, const std::vector<string>& row_partition_types,
+      const TensorShape& shape,
+      const std::vector<std::string>& row_partition_types,
       const ShapeAndValues<VALUE_TYPE>& values,
       const ShapeAndValues<VALUE_TYPE>& default_value,
       const std::vector<ShapeAndValues<INDEX_TYPE>>& row_partition_tensors) {
@@ -95,12 +96,13 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor) {
   // indices = [2, 1, 0, 3]
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
   // params.shape = [4, None]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({4, 4}),                 // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS"},  // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
       createScalar<float>(1.5),  // default_value
-      {createScalar<int32>(4), createVector<int32>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
+      {createScalar<int32_t>(4),
+       createVector<int32_t>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
       // row_partition_tensors
   );
 
@@ -117,12 +119,12 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor) {
 TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorRowSplits) {
   // indices = [2, 1, 0, 3]
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({4, 4}),  // shape
       {"ROW_SPLITS"},       // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
-      createScalar<float>(1.5),               // default_value
-      {createVector<int32>({0, 3, 3, 7, 9})}  // row_partition_tensors
+      createScalar<float>(1.5),                 // default_value
+      {createVector<int32_t>({0, 3, 3, 7, 9})}  // row_partition_tensors
   );
 
   TF_ASSERT_OK(RunOpKernel());
@@ -143,16 +145,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParams) {
   //           [[.4, .5], [.6, .7, .8]],
   //           [[.9]]
   //          ]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({5, 2, 3}),  // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS",
        "VALUE_ROWIDS"},  // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
       createScalar<float>(1.5),  // default_value
       {
-          createScalar<int32>(5),
-          createVector<int32>({0, 1, 1, 3, 3, 4}),
-          createVector<int32>({1, 1, 2, 3, 3, 4, 4, 4, 5}),
+          createScalar<int32_t>(5),
+          createVector<int32_t>({0, 1, 1, 3, 3, 4}),
+          createVector<int32_t>({1, 1, 2, 3, 3, 4, 4, 4, 5}),
       }  // row_partition_tensors
   );
   TF_ASSERT_OK(RunOpKernel());
@@ -181,14 +183,14 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParamsRowSplits) {
   //           [[.4, .5], [.6, .7, .8]],
   //           [[.9]]
   //          ]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({5, 2, 3}),        // shape
       {"ROW_SPLITS", "ROW_SPLITS"},  // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
       createScalar<float>(1.5),  // default_value
       {
-          createVector<int32>({0, 1, 3, 3, 5, 6}),
-          createVector<int32>({0, 0, 2, 3, 5, 8, 9}),
+          createVector<int32_t>({0, 1, 3, 3, 5, 6}),
+          createVector<int32_t>({0, 0, 2, 3, 5, 8, 9}),
       }  // row_partition_tensors
   );
   TF_ASSERT_OK(RunOpKernel());
@@ -249,15 +251,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParams) {
   //            []
   // ]
   // params.shape = [3, 2, 3, 2]
-  BuildRaggedTensorToTensorGraph<int32, int32>(
+  BuildRaggedTensorToTensorGraph<int32_t, int32_t>(
       TensorShape({4, 2, 3, 2}),  // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS",
-       "VALUE_ROWIDS"},                               // row_partition_types
-      createVector<int32>({1, 2, 3, 4, 5, 6, 7, 8}),  // values
-      createScalar<int32>(15),                        // default_value
-      {createScalar<int32>(5), createVector<int32>({0, 1, 1}),
-       createVector<int32>({1, 1, 1, 2}),
-       createVector<int32>({0, 0, 1, 1, 2, 2, 3, 3})}  // row_partition_tensors
+       "VALUE_ROWIDS"},                                 // row_partition_types
+      createVector<int32_t>({1, 2, 3, 4, 5, 6, 7, 8}),  // values
+      createScalar<int32_t>(15),                        // default_value
+      {createScalar<int32_t>(5), createVector<int32_t>({0, 1, 1}),
+       createVector<int32_t>({1, 1, 1, 2}),
+       createVector<int32_t>({0, 0, 1, 1, 2, 2, 3, 3})}
+      // row_partition_tensors
   );
 
   TF_ASSERT_OK(RunOpKernel());
@@ -277,9 +280,9 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParams) {
   //             [[15,15],[15,15],[15,15]],
   //           ]
   // params.shape = [3, 2, 3, 2]
-  test::ExpectTensorEqual<int32>(
+  test::ExpectTensorEqual<int32_t>(
       *GetOutput(0),
-      test::AsTensor<int32>(
+      test::AsTensor<int32_t>(
           {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1,  2,  3,  4,
            5,  6,  7,  8,  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15},
@@ -296,14 +299,14 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsRowSplit) {
   //            []
   // ]
   // params.shape = [3, 2, 3, 2]
-  BuildRaggedTensorToTensorGraph<int32, int32>(
+  BuildRaggedTensorToTensorGraph<int32_t, int32_t>(
       TensorShape({4, 2, 3, 2}),  // shape
       {"ROW_SPLITS", "ROW_SPLITS", "ROW_SPLITS"},
       // row_partition_types
-      createVector<int32>({1, 2, 3, 4, 5, 6, 7, 8}),  // values
-      createScalar<int32>(15),                        // default_value
-      {createVector<int32>({0, 1, 3}), createVector<int32>({0, 0, 3, 4}),
-       createVector<int32>({0, 2, 4, 6, 8})}  // row_partition_tensors
+      createVector<int32_t>({1, 2, 3, 4, 5, 6, 7, 8}),  // values
+      createScalar<int32_t>(15),                        // default_value
+      {createVector<int32_t>({0, 1, 3}), createVector<int32_t>({0, 0, 3, 4}),
+       createVector<int32_t>({0, 2, 4, 6, 8})}  // row_partition_tensors
   );
 
   TF_ASSERT_OK(RunOpKernel());
@@ -323,9 +326,9 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsRowSplit) {
   //             [[15,15],[15,15],[15,15]],
   //           ]
   // params.shape = [3, 2, 3, 2]
-  test::ExpectTensorEqual<int32>(
+  test::ExpectTensorEqual<int32_t>(
       *GetOutput(0),
-      test::AsTensor<int32>(
+      test::AsTensor<int32_t>(
           {15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1,  2,  3,  4,
            5,  6,  7,  8,  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
            15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15},
@@ -334,12 +337,13 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsRowSplit) {
 
 TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorContractExpanded) {
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({3, 5}),                 // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS"},  // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
       createScalar<float>(1.5),  // default_value
-      {createScalar<int32>(4), createVector<int32>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
+      {createScalar<int32_t>(4),
+       createVector<int32_t>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
       // row_partition_tensors
   );
 
@@ -357,14 +361,15 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorContractExpanded) {
 // Adds a dense dimension.
 TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorContractExpandedDense) {
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({3, 5, 2}),              // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS"},  // row_partition_types
       ShapeAndValues<float>{TensorShape({9, 2}),
                             {.1, 1.1, .2, 1.2, .3, 1.3, .4, 1.4, .5, 1.5, .6,
                              1.6, .7, 1.7, .8, 1.8, .9, 1.9}},  // values
       createScalar<float>(1.5),                                 // default_value
-      {createScalar<int32>(4), createVector<int32>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
+      {createScalar<int32_t>(4),
+       createVector<int32_t>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
       // row_partition_tensors
   );
 
@@ -386,12 +391,13 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensorConstrained) {
   //           [.4, .5, .6, .7],
   //           [.8, .9]]
   // constrained to (3, 3)
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({3, 3}),                 // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS"},  // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
       createScalar<float>(1.5),  // default_value
-      {createScalar<int32>(4), createVector<int32>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
+      {createScalar<int32_t>(4),
+       createVector<int32_t>({0, 0, 0, 2, 2, 2, 2, 3, 3})}
       // row_partition_tensors
   );
 
@@ -418,16 +424,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_3DParamsConstrained) {
   //           [[.9]]
   //          ]
   // params.shape = [5, None, None]
-  BuildRaggedTensorToTensorGraph<float, int32>(
+  BuildRaggedTensorToTensorGraph<float, int32_t>(
       TensorShape({4, 1, 2}),  // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS",
        "VALUE_ROWIDS"},  // row_partition_types
       createVector<float>({.1, .2, .3, .4, .5, .6, .7, .8, .9}),  // values
       createScalar<float>(1.5),  // default_value
       {
-          createScalar<int32>(5),
-          createVector<int32>({0, 1, 1, 3, 3, 4}),
-          createVector<int32>({1, 1, 2, 3, 3, 4, 4, 4, 5}),
+          createScalar<int32_t>(5),
+          createVector<int32_t>({0, 1, 1, 3, 3, 4}),
+          createVector<int32_t>({1, 1, 2, 3, 3, 4, 4, 4, 5}),
       }  // row_partition_tensors
   );
   TF_ASSERT_OK(RunOpKernel());
@@ -457,15 +463,16 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsConstrained) {
   //            []
   // ]
   // params.shape = [3, 2, 3, 2]
-  BuildRaggedTensorToTensorGraph<int32, int32>(
+  BuildRaggedTensorToTensorGraph<int32_t, int32_t>(
       TensorShape({2, 2, 2, 2}),  // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS", "VALUE_ROWIDS",
-       "VALUE_ROWIDS"},                               // row_partition_types
-      createVector<int32>({1, 2, 3, 4, 5, 6, 7, 8}),  // values
-      createScalar<int32>(15),                        // default_value
-      {createScalar<int32>(5), createVector<int32>({0, 1, 1}),
-       createVector<int32>({1, 1, 1, 2}),
-       createVector<int32>({0, 0, 1, 1, 2, 2, 3, 3})}  // row_partition_tensors
+       "VALUE_ROWIDS"},                                 // row_partition_types
+      createVector<int32_t>({1, 2, 3, 4, 5, 6, 7, 8}),  // values
+      createScalar<int32_t>(15),                        // default_value
+      {createScalar<int32_t>(5), createVector<int32_t>({0, 1, 1}),
+       createVector<int32_t>({1, 1, 1, 2}),
+       createVector<int32_t>({0, 0, 1, 1, 2, 2, 3, 3})}
+      // row_partition_tensors
   );
 
   TF_ASSERT_OK(RunOpKernel());
@@ -480,25 +487,38 @@ TEST_F(RaggedTensorToTensorOpTest, RaggedTensorToTensor_4DParamsConstrained) {
   //           ],
   //          ]
   // params.shape = [3, 2, 3, 2]
-  test::ExpectTensorEqual<int32>(*GetOutput(0), test::AsTensor<int32>(
-                                                    {
-                                                        15, 15, 15, 15,  //
-                                                        15, 15, 15, 15,  //
-                                                        1, 2, 3, 4,      //
-                                                        7, 8, 15, 15,    //
-                                                    },
-                                                    TensorShape({2, 2, 2, 2})));
+  test::ExpectTensorEqual<int32_t>(*GetOutput(0),
+                                   test::AsTensor<int32_t>(
+                                       {
+                                           15,
+                                           15,
+                                           15,
+                                           15,  //
+                                           15,
+                                           15,
+                                           15,
+                                           15,  //
+                                           1,
+                                           2,
+                                           3,
+                                           4,  //
+                                           7,
+                                           8,
+                                           15,
+                                           15,  //
+                                       },
+                                       TensorShape({2, 2, 2, 2})));
 }
 
 TEST_F(RaggedTensorToTensorOpTest, ShapeWrongDimensions) {
-  BuildRaggedTensorToTensorGraph<int32, int32>(
+  BuildRaggedTensorToTensorGraph<int32_t, int32_t>(
       TensorShape({10, 7, 10, 20}),  // shape
       {"FIRST_DIM_SIZE", "VALUE_ROWIDS",
-       "VALUE_ROWIDS"},                   // row_partition_types
-      createVector<int32>({1, 2, 3, 4}),  // values
-      createScalar<int32>(15),            // default_value
-      {createScalar<int32>(5), createVector<int32>({0, 1, 1}),
-       createVector<int32>({1, 1, 1, 2})}  // row_partition_tensors
+       "VALUE_ROWIDS"},                     // row_partition_types
+      createVector<int32_t>({1, 2, 3, 4}),  // values
+      createScalar<int32_t>(15),            // default_value
+      {createScalar<int32_t>(5), createVector<int32_t>({0, 1, 1}),
+       createVector<int32_t>({1, 1, 1, 2})}  // row_partition_tensors
   );
   // Fails with an invalid argument.
   EXPECT_EQ(absl::IsInvalidArgument(RunOpKernel()), true);
@@ -508,7 +528,7 @@ class RaggedTensorToTensorOpUnknownShapeTest
     : public ::tensorflow::OpsTestBase {
  protected:
   std::unique_ptr<ShapeInferenceTestOp> op_;
-  void SetAttributes(const absl::Span<const string> row_partition_types,
+  void SetAttributes(const absl::Span<const std::string> row_partition_types,
                      int num_row_partition_tensors) {
     op_ = std::make_unique<ShapeInferenceTestOp>("RaggedTensorToTensor");
     SetAttrValue(row_partition_types,
@@ -519,7 +539,8 @@ class RaggedTensorToTensorOpUnknownShapeTest
 };
 
 TEST_F(RaggedTensorToTensorOpUnknownShapeTest, ValueRowIDs) {
-  SetAttributes(absl::Span<const string>{"FIRST_DIM_SIZE", "VALUE_ROWIDS"}, 2);
+  SetAttributes(absl::Span<const std::string>{"FIRST_DIM_SIZE", "VALUE_ROWIDS"},
+                2);
 
   INFER_OK(*op_, "?;?;?;?;?", "?");
   INFER_OK(*op_, "?;[6];[];[];[6]", "[?,?]");
@@ -544,7 +565,7 @@ TEST_F(RaggedTensorToTensorOpUnknownShapeTest, ValueRowIDs) {
 TEST_F(RaggedTensorToTensorOpUnknownShapeTest, RowSplits) {
   // RaggedTensorToTensor(param_splits+, param_values, indices) -> [splits+,
   // values]
-  SetAttributes(absl::Span<const string>{"ROW_SPLITS"}, 1);
+  SetAttributes(absl::Span<const std::string>{"ROW_SPLITS"}, 1);
 
   // value, default_value, ROW_SPLITS
   INFER_OK(*op_, "?;?;?;?", "?");
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
index b4d7fc8395b614..a46f40d177778c 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op.cc
@@ -256,7 +256,7 @@ class RaggedTensorToVariantGradientOp : public OpKernel {
     auto flat_row_splits = row_splits.flat<SPLIT_TYPE>();
     TensorShape dense_values_shape;
     OP_REQUIRES_OK(context,
-                   TensorShapeUtils::MakeShape(context->input(2).vec<int32>(),
+                   TensorShapeUtils::MakeShape(context->input(2).vec<int32_t>(),
                                                &dense_values_shape));
 
     // Validate row_splits.
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc
index 95bd16bbbcbafe..f25f8b34198702 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.cc
@@ -232,7 +232,7 @@ TEST_F(RaggedTensorToVariantKernelTest, NonEmptyBatchedInputInt32Splits) {
   const std::vector<int> batched_values = {0, 1, 1, 2, 2, 3, 4,
                                            5, 6, 7, 8, 9, 8, 9};
 
-  BuildEncodeRaggedTensorGraph<int, int32>(
+  BuildEncodeRaggedTensorGraph<int, int32_t>(
       {batched_splits_1, batched_splits_2, batched_splits_3}, TensorShape({14}),
       batched_values, true);
   TF_ASSERT_OK(RunOpKernel());
@@ -240,12 +240,12 @@ TEST_F(RaggedTensorToVariantKernelTest, NonEmptyBatchedInputInt32Splits) {
   const auto& encoded_list = GetOutput(0)->vec<Variant>();
   EXPECT_EQ(encoded_list.size(), 2);
 
-  ExpectRaggedTensorVariantEqual<int, int32>(
-      CreateVariantFromRagged<int, int32>(
+  ExpectRaggedTensorVariantEqual<int, int32_t>(
+      CreateVariantFromRagged<int, int32_t>(
           {{0, 1, 3, 4, 5, 6}, {0, 2, 3, 4, 5, 6, 7}}, {0, 1, 1, 2, 2, 3, 4}),
       *encoded_list(0).get<RaggedTensorVariant>());
-  ExpectRaggedTensorVariantEqual<int, int32>(
-      CreateVariantFromRagged<int, int32>(
+  ExpectRaggedTensorVariantEqual<int, int32_t>(
+      CreateVariantFromRagged<int, int32_t>(
           {{0, 1, 2, 3, 4, 5}, {0, 1, 2, 5, 6, 7}}, {5, 6, 7, 8, 9, 8, 9}),
       *encoded_list(1).get<RaggedTensorVariant>());
 }
diff --git a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h
index 7dc63ac8fbf7f8..87cfc50f8a268a 100644
--- a/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h
+++ b/tensorflow/core/kernels/ragged_tensor_to_variant_op_test.h
@@ -140,7 +140,7 @@ class RaggedTensorToVariantGradientKernelTest
   void BuildEncodeRaggedTensorGradientGraph(
       const std::vector<Variant>& encoded_ragged_grad,
       const std::vector<SPLIT_TYPE>& row_splits,
-      const std::vector<int32>& dense_values_shape) {
+      const std::vector<int32_t>& dense_values_shape) {
     const auto values_dtype = DataTypeToEnum<VALUE_TYPE>::v();
     const auto splits_dtype = DataTypeToEnum<SPLIT_TYPE>::v();
 
@@ -161,8 +161,8 @@ class RaggedTensorToVariantGradientKernelTest
     AddInputFromArray<SPLIT_TYPE>(TensorShape({splits_size}), row_splits);
 
     int64_t dense_values_shape_size = dense_values_shape.size();
-    AddInputFromArray<int32>(TensorShape({dense_values_shape_size}),
-                             dense_values_shape);
+    AddInputFromArray<int32_t>(TensorShape({dense_values_shape_size}),
+                               dense_values_shape);
   }
 
   template <typename VALUE_TYPE, typename SPLIT_TYPE>
diff --git a/tensorflow/core/kernels/ragged_tensor_variant.cc b/tensorflow/core/kernels/ragged_tensor_variant.cc
index b6b70a283c7c48..5608888b5500d1 100644
--- a/tensorflow/core/kernels/ragged_tensor_variant.cc
+++ b/tensorflow/core/kernels/ragged_tensor_variant.cc
@@ -22,9 +22,11 @@ limitations under the License.
 
 namespace tensorflow {
 
-string RaggedTensorVariant::TypeName() const { return "RaggedTensorVariant"; }
+std::string RaggedTensorVariant::TypeName() const {
+  return "RaggedTensorVariant";
+}
 
-string RaggedTensorVariant::DebugString() const {
+std::string RaggedTensorVariant::DebugString() const {
   return absl::StrCat(
       "RaggedTensorVariant(dtype=", DataTypeString(values_.dtype()),
       ", ragged_rank=", nested_splits_.size(), ", splits_dtype=",
diff --git a/tensorflow/core/kernels/ragged_tensor_variant.h b/tensorflow/core/kernels/ragged_tensor_variant.h
index 1d2066b0dcf457..c75505a603c531 100644
--- a/tensorflow/core/kernels/ragged_tensor_variant.h
+++ b/tensorflow/core/kernels/ragged_tensor_variant.h
@@ -41,8 +41,8 @@ class RaggedTensorVariant {
       : values_(std::move(values)), nested_splits_(nested_splits) {}
 
   // Variant support methods.
-  string TypeName() const;
-  string DebugString() const;
+  std::string TypeName() const;
+  std::string DebugString() const;
   void Encode(VariantTensorData* data) const;
   bool Decode(const VariantTensorData& data);
 
diff --git a/tensorflow/core/kernels/random_binomial_op.cc b/tensorflow/core/kernels/random_binomial_op.cc
index 98118b78eb5b58..875744b86ecf47 100644
--- a/tensorflow/core/kernels/random_binomial_op.cc
+++ b/tensorflow/core/kernels/random_binomial_op.cc
@@ -360,8 +360,8 @@ class RandomBinomialOp : public OpKernel {
     TensorShape bcast_shape = BCast::ToShape(bcast.output_shape());
     TensorShape output_shape;
     if (shape_tensor.dtype() == DataType::DT_INT32) {
-      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec<int32>(),
-                                                      &output_shape));
+      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                              shape_tensor.vec<int32_t>(), &output_shape));
     } else {
       OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
                               shape_tensor.vec<int64_t>(), &output_shape));
@@ -380,11 +380,11 @@ class RandomBinomialOp : public OpKernel {
     const int64_t num_sample_dims =
         (shape_tensor.dim_size(0) - bcast.output_shape().size());
     for (int64_t i = 0; i < num_sample_dims; ++i) {
-      samples_per_batch *= shape_tensor.flat<int32>()(i);
+      samples_per_batch *= shape_tensor.flat<int32_t>()(i);
     }
     int64_t num_batches = 1;
     for (int64_t i = num_sample_dims; i < shape_tensor.dim_size(0); ++i) {
-      num_batches *= shape_tensor.flat<int32>()(i);
+      num_batches *= shape_tensor.flat<int32_t>()(i);
     }
     const int64_t num_elements = num_batches * samples_per_batch;
 
@@ -409,8 +409,9 @@ class RandomBinomialOp : public OpKernel {
                 errors::InvalidArgument("Unsupported algorithm id: ", alg));
     static_assert(std::is_same<StateElementType, int64_t>::value,
                   "StateElementType must be int64");
-    static_assert(std::is_same<PhiloxRandom::ResultElementType, uint32>::value,
-                  "PhiloxRandom::ResultElementType must be uint32");
+    static_assert(
+        std::is_same<PhiloxRandom::ResultElementType, uint32_t>::value,
+        "PhiloxRandom::ResultElementType must be uint32");
     OP_REQUIRES(ctx, var_tensor_flat.size() >= PHILOX_MIN_STATE_SIZE,
                 errors::InvalidArgument(
                     "For Philox algorithm, the size of state must be at least ",
@@ -478,8 +479,8 @@ class StatelessRandomBinomialOp : public OpKernel {
     TensorShape bcast_shape = BCast::ToShape(bcast.output_shape());
     TensorShape output_shape;
     if (shape_tensor.dtype() == DataType::DT_INT32) {
-      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(shape_tensor.vec<int32>(),
-                                                      &output_shape));
+      OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
+                              shape_tensor.vec<int32_t>(), &output_shape));
     } else {
       OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
                               shape_tensor.vec<int64_t>(), &output_shape));
@@ -494,14 +495,14 @@ class StatelessRandomBinomialOp : public OpKernel {
         (shape_tensor.dim_size(0) - bcast.output_shape().size());
     for (int64_t i = 0; i < num_sample_dims; ++i) {
       samples_per_batch *= shape_tensor.dtype() == DataType::DT_INT32
-                               ? shape_tensor.flat<int32>()(i)
-                               : shape_tensor.flat<int64>()(i);
+                               ? shape_tensor.flat<int32_t>()(i)
+                               : shape_tensor.flat<int64_t>()(i);
     }
     int64_t num_batches = 1;
     for (int64_t i = num_sample_dims; i < shape_tensor.dim_size(0); ++i) {
       num_batches *= shape_tensor.dtype() == DataType::DT_INT32
-                         ? shape_tensor.flat<int32>()(i)
-                         : shape_tensor.flat<int64>()(i);
+                         ? shape_tensor.flat<int32_t>()(i)
+                         : shape_tensor.flat<int64_t>()(i);
     }
     const int64_t num_elements = num_batches * samples_per_batch;
 
@@ -557,7 +558,7 @@ class StatelessRandomBinomialOp : public OpKernel {
 REGISTER_ALL(Eigen::half);
 REGISTER_ALL(float);
 REGISTER_ALL(double);
-REGISTER_ALL(int32);
+REGISTER_ALL(int32_t);
 REGISTER_ALL(int64_t);
 
 #undef REGISTER
diff --git a/tensorflow/core/kernels/random_binomial_op_test.cc b/tensorflow/core/kernels/random_binomial_op_test.cc
index 80af07f13a4083..9e715b5afccf92 100644
--- a/tensorflow/core/kernels/random_binomial_op_test.cc
+++ b/tensorflow/core/kernels/random_binomial_op_test.cc
@@ -28,7 +28,7 @@ static Graph* RandomBinomialGraph(double count, double prob, int num_batches,
                                   int samples_per_batch) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor shape_t(DT_INT32, TensorShape({2}));
-  shape_t.flat<int32>().setValues({num_batches, samples_per_batch});
+  shape_t.flat<int32_t>().setValues({num_batches, samples_per_batch});
 
   Tensor counts_t(DT_FLOAT, TensorShape({num_batches}));
   counts_t.flat<float>().setConstant(count);
diff --git a/tensorflow/core/kernels/random_index_shuffle_test.cc b/tensorflow/core/kernels/random_index_shuffle_test.cc
index 259f484cc344ca..02458f4aa99f49 100644
--- a/tensorflow/core/kernels/random_index_shuffle_test.cc
+++ b/tensorflow/core/kernels/random_index_shuffle_test.cc
@@ -32,11 +32,11 @@ class RandomIndexShuffleTest : public ::testing::TestWithParam<uint64_t> {
 
 // Check that we do a correct bijection.
 TEST_P(RandomIndexShuffleTest, Bijection) {
-  const std::array<uint32, 3>& key = {42, 73, 1991};
+  const std::array<uint32_t, 3>& key = {42, 73, 1991};
   const uint64_t max_value = GetMaxValue();
   std::vector<bool> seen(max_value + 1, false);
   for (uint64_t value = 0; value <= max_value; ++value) {
-    const uint64 output_value =
+    const uint64_t output_value =
         index_shuffle(value, key, max_value, /* rounds= */ 4);
     EXPECT_GE(output_value, 0);
     EXPECT_LE(output_value, max_value);
diff --git a/tensorflow/core/kernels/random_op.cc b/tensorflow/core/kernels/random_op.cc
index 7624b56b50b587..87179f9fef5e8f 100644
--- a/tensorflow/core/kernels/random_op.cc
+++ b/tensorflow/core/kernels/random_op.cc
@@ -157,7 +157,7 @@ class RandomGammaOp : public OpKernel {
                     shape_t.DebugString()));
     TensorShape samples_shape;
     if (shape_t.dtype() == DataType::DT_INT32) {
-      auto vec = shape_t.flat<int32>();
+      auto vec = shape_t.flat<int32_t>();
       OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(vec.data(), vec.size(),
                                                       &samples_shape));
     } else if (shape_t.dtype() == DataType::DT_INT64) {
diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h
index ea16f54ec9acb4..1d6299802f21c7 100644
--- a/tensorflow/core/kernels/random_op.h
+++ b/tensorflow/core/kernels/random_op.h
@@ -40,8 +40,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 // nullptr, they provide the input; otherwise `gen` provides the input.
 template <class Distribution>
 struct FillPhiloxRandom<CPUDevice, Distribution> {
-  void operator()(OpKernelContext* ctx, const CPUDevice& d, const uint64* key,
-                  const uint64* counter, random::PhiloxRandom gen,
+  void operator()(OpKernelContext* ctx, const CPUDevice& d, const uint64_t* key,
+                  const uint64_t* counter, random::PhiloxRandom gen,
                   typename Distribution::ResultElementType* data, int64_t size,
                   Distribution dist);
 };
diff --git a/tensorflow/core/kernels/random_op_cpu.h b/tensorflow/core/kernels/random_op_cpu.h
index cfa927c1e539ea..7d7a16dcc6a3fc 100644
--- a/tensorflow/core/kernels/random_op_cpu.h
+++ b/tensorflow/core/kernels/random_op_cpu.h
@@ -60,8 +60,8 @@ using random::SingleSampleAdapter;
 template <typename Device, class Distribution>
 struct FillPhiloxRandom {
   typedef typename Distribution::ResultElementType T;
-  void operator()(OpKernelContext* ctx, const Device&, const uint64* key,
-                  const uint64* counter, random::PhiloxRandom gen, T* data,
+  void operator()(OpKernelContext* ctx, const Device&, const uint64_t* key,
+                  const uint64_t* counter, random::PhiloxRandom gen, T* data,
                   int64_t size, Distribution dist) {
     OP_REQUIRES(
         ctx, false,
@@ -156,8 +156,8 @@ struct FillPhiloxRandomTask<Distribution, true> {
 // It splits the work into several tasks and run them in parallel
 template <class Distribution>
 void FillPhiloxRandom<CPUDevice, Distribution>::operator()(
-    OpKernelContext* ctx, const CPUDevice&, const uint64* key,
-    const uint64* counter, random::PhiloxRandom gen,
+    OpKernelContext* ctx, const CPUDevice&, const uint64_t* key,
+    const uint64_t* counter, random::PhiloxRandom gen,
     typename Distribution::ResultElementType* data, int64_t size,
     Distribution dist) {
   if (key != nullptr && counter != nullptr) {
diff --git a/tensorflow/core/kernels/random_op_test.cc b/tensorflow/core/kernels/random_op_test.cc
index 7292ce4ec8a3f0..5abe81f27f31e2 100644
--- a/tensorflow/core/kernels/random_op_test.cc
+++ b/tensorflow/core/kernels/random_op_test.cc
@@ -26,13 +26,13 @@ namespace tensorflow {
 namespace {
 
 Tensor VecShape(int64_t v) {
-  if (v >= std::numeric_limits<int32>::max()) {
+  if (v >= std::numeric_limits<int32_t>::max()) {
     Tensor shape(DT_INT64, TensorShape({1}));
     shape.vec<int64_t>()(0) = v;
     return shape;
   } else {
     Tensor shape(DT_INT32, TensorShape({1}));
-    shape.vec<int32>()(0) = v;
+    shape.vec<int32_t>()(0) = v;
     return shape;
   }
 }
diff --git a/tensorflow/core/kernels/random_ops_util.h b/tensorflow/core/kernels/random_ops_util.h
index b990456965ff59..c203181d575818 100644
--- a/tensorflow/core/kernels/random_ops_util.h
+++ b/tensorflow/core/kernels/random_ops_util.h
@@ -26,20 +26,21 @@ using random::PhiloxRandom;
 // The following 2 functions use the contract "lower 32 bits for the first
 // uint32, higher 32 bits for the second". Note that this is endian-neutral,
 // unlike a direct memory copy `memcpy(output, &input, 8)`.
-PHILOX_DEVICE_INLINE void Uint64ToUint32s(uint64 input, uint32* output1,
-                                          uint32* output2) {
-  *output1 = static_cast<uint32>(input);
-  *output2 = static_cast<uint32>(input >> 32);
+PHILOX_DEVICE_INLINE void Uint64ToUint32s(uint64_t input, uint32_t* output1,
+                                          uint32_t* output2) {
+  *output1 = static_cast<uint32_t>(input);
+  *output2 = static_cast<uint32_t>(input >> 32);
 }
 
-PHILOX_DEVICE_INLINE uint64 Uint32sToUint64(uint32 input1, uint32 input2) {
-  auto u64_1 = static_cast<uint64>(input1);
-  auto u64_2 = static_cast<uint64>(input2);
+PHILOX_DEVICE_INLINE uint64_t Uint32sToUint64(uint32_t input1,
+                                              uint32_t input2) {
+  auto u64_1 = static_cast<uint64_t>(input1);
+  auto u64_2 = static_cast<uint64_t>(input2);
   return u64_1 | (u64_2 << 32);
 }
 
 PHILOX_DEVICE_INLINE PhiloxRandom::ResultType GetCounterFromMem(
-    uint64 const* ptr) {
+    const uint64_t* ptr) {
   PhiloxRandom::ResultType counter;
   Uint64ToUint32s(ptr[0], &counter[0], &counter[1]);
   Uint64ToUint32s(ptr[1], &counter[2], &counter[3]);
@@ -47,24 +48,24 @@ PHILOX_DEVICE_INLINE PhiloxRandom::ResultType GetCounterFromMem(
 }
 
 PHILOX_DEVICE_INLINE void WriteCounterToMem(
-    PhiloxRandom::ResultType const& counter, uint64* ptr) {
+    PhiloxRandom::ResultType const& counter, uint64_t* ptr) {
   ptr[0] = Uint32sToUint64(counter[0], counter[1]);
   ptr[1] = Uint32sToUint64(counter[2], counter[3]);
 }
 
-PHILOX_DEVICE_INLINE PhiloxRandom::Key GetKeyFromMem(uint64 const* ptr) {
+PHILOX_DEVICE_INLINE PhiloxRandom::Key GetKeyFromMem(const uint64_t* ptr) {
   PhiloxRandom::Key key;
   Uint64ToUint32s(ptr[0], &key[0], &key[1]);
   return key;
 }
 
 PHILOX_DEVICE_INLINE void WriteKeyToMem(PhiloxRandom::Key const& key,
-                                        uint64* ptr) {
+                                        uint64_t* ptr) {
   *ptr = Uint32sToUint64(key[0], key[1]);
 }
 
 PHILOX_DEVICE_INLINE PhiloxRandom GetPhiloxRandomFromCounterKeyMem(
-    uint64 const* counter_ptr, uint64 const* key_ptr) {
+    const uint64_t* counter_ptr, const uint64_t* key_ptr) {
   return PhiloxRandom(GetCounterFromMem(counter_ptr), GetKeyFromMem(key_ptr));
 }
 
diff --git a/tensorflow/core/kernels/random_poisson_op.cc b/tensorflow/core/kernels/random_poisson_op.cc
index 3c703f5d0ca0d7..9b1f93584ad86b 100644
--- a/tensorflow/core/kernels/random_poisson_op.cc
+++ b/tensorflow/core/kernels/random_poisson_op.cc
@@ -351,7 +351,7 @@ TF_CALL_double(REGISTER);
 REGISTER_ALL(Eigen::half);
 REGISTER_ALL(float);
 REGISTER_ALL(double);
-REGISTER_ALL(int32);
+REGISTER_ALL(int32_t);
 REGISTER_ALL(int64_t);
 
 #undef REGISTER_ALL
diff --git a/tensorflow/core/kernels/random_poisson_op_test.cc b/tensorflow/core/kernels/random_poisson_op_test.cc
index ea2541d8bdf1b2..4d8f62a2e142d8 100644
--- a/tensorflow/core/kernels/random_poisson_op_test.cc
+++ b/tensorflow/core/kernels/random_poisson_op_test.cc
@@ -24,13 +24,13 @@ namespace tensorflow {
 namespace {
 
 Tensor VecShape(int64_t v) {
-  if (v >= std::numeric_limits<int32>::max()) {
+  if (v >= std::numeric_limits<int32_t>::max()) {
     Tensor shape(DT_INT64, TensorShape({1}));
     shape.vec<int64_t>()(0) = v;
     return shape;
   } else {
     Tensor shape(DT_INT32, TensorShape({1}));
-    shape.vec<int32>()(0) = v;
+    shape.vec<int32_t>()(0) = v;
     return shape;
   }
 }
diff --git a/tensorflow/core/kernels/random_shuffle_queue_op.cc b/tensorflow/core/kernels/random_shuffle_queue_op.cc
index 856357489bdfab..c9c83d381e6ff9 100644
--- a/tensorflow/core/kernels/random_shuffle_queue_op.cc
+++ b/tensorflow/core/kernels/random_shuffle_queue_op.cc
@@ -45,7 +45,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<Tensor> > {
   RandomShuffleQueue(int32_t capacity, int32_t min_after_dequeue, int64_t seed,
                      int64_t seed2, const DataTypeVector& component_dtypes,
                      const std::vector<TensorShape>& component_shapes,
-                     const string& name);
+                     const std::string& name);
 
   absl::Status Initialize()
       override;  // Must be called before any other method.
@@ -61,7 +61,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<Tensor> > {
                       CallbackWithTuple callback) override;
   absl::Status MatchesNodeDef(const NodeDef& node_def) override;
 
-  int32 size() const override {
+  int32_t size() const override {
     mutex_lock lock(mu_);
     return queues_[0].size();
   }
@@ -78,7 +78,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<Tensor> > {
                                                    OpKernelContext* ctx,
                                                    Tensor* out_tensor);
 
-  const int32 min_after_dequeue_;
+  const int32_t min_after_dequeue_;
   const int64_t original_seed_;
   const int64_t original_seed2_;
 
@@ -93,7 +93,7 @@ class RandomShuffleQueue : public TypedQueue<std::vector<Tensor> > {
 RandomShuffleQueue::RandomShuffleQueue(
     int32_t capacity, int32_t min_after_dequeue, int64_t seed, int64_t seed2,
     const DataTypeVector& component_dtypes,
-    const std::vector<TensorShape>& component_shapes, const string& name)
+    const std::vector<TensorShape>& component_shapes, const std::string& name)
     : TypedQueue(capacity, component_dtypes, component_shapes, name),
       min_after_dequeue_(min_after_dequeue),
       original_seed_(seed),
@@ -503,7 +503,7 @@ class RandomShuffleQueueOp : public TypedQueueOp {
     return CreateTypedQueue(queue, ret);
   }
 
-  int32 min_after_dequeue_;
+  int32_t min_after_dequeue_;
   int64_t seed_;
   int64_t seed2_;
   std::vector<TensorShape> component_shapes_;
diff --git a/tensorflow/core/kernels/range_sampler.cc b/tensorflow/core/kernels/range_sampler.cc
index db4f97c3e925de..2f8fb60c3b9f44 100644
--- a/tensorflow/core/kernels/range_sampler.cc
+++ b/tensorflow/core/kernels/range_sampler.cc
@@ -248,7 +248,7 @@ FixedUnigramSampler::FixedUnigramSampler(int64_t range, float distortion,
 }
 
 absl::Status FixedUnigramSampler::SetDistributionSampler(
-    Env* env, const string& vocab_file) {
+    Env* env, const std::string& vocab_file) {
   TF_RETURN_IF_ERROR(LoadFromFile(env, vocab_file, distortion_));
   if (!TF_PREDICT_TRUE(FixedUnigramSampler::range() == weights_.size()))
     return (errors::InvalidArgument("range is ", FixedUnigramSampler::range(),
@@ -287,18 +287,18 @@ void FixedUnigramSampler::FillReservedIds(int32_t num_reserved_ids) {
 }
 
 absl::Status FixedUnigramSampler::LoadFromFile(Env* env,
-                                               const string& vocab_file,
+                                               const std::string& vocab_file,
                                                float distortion) {
   std::unique_ptr<RandomAccessFile> file;
   TF_RETURN_IF_ERROR(env->NewRandomAccessFile(vocab_file, &file));
 
   io::InputBuffer in(file.get(), 262144 /*bytes*/);
-  string line;
+  std::string line;
   int32_t word_id = weights_.size();
   while (in.ReadLine(&line).ok()) {
     // The vocabulary file should be in csv like format, with the last
     // field the weight associated with the word.
-    std::vector<string> cols = str_util::Split(line, ',');
+    std::vector<std::string> cols = str_util::Split(line, ',');
     if (cols.empty()) continue;
     // Skip entries that do not belong to this shard.
     if (word_id % num_shards_ == shard_) {
diff --git a/tensorflow/core/kernels/range_sampler.h b/tensorflow/core/kernels/range_sampler.h
index c49bbcc5b1eede..cecb681cd4e973 100644
--- a/tensorflow/core/kernels/range_sampler.h
+++ b/tensorflow/core/kernels/range_sampler.h
@@ -208,7 +208,7 @@ class FixedUnigramSampler : public RangeSampler {
                       int32_t num_shards, int32_t shard);
   // The vocab_file is assumed to be a CSV, with the last entry of each row a
   // value representing the counts or probabilities for the corresponding ID.
-  absl::Status SetDistributionSampler(Env* env, const string& vocab_file);
+  absl::Status SetDistributionSampler(Env* env, const std::string& vocab_file);
   absl::Status SetDistributionSampler(const std::vector<float>& unigrams);
   float Probability(int64_t value) const override;
 
@@ -225,14 +225,14 @@ class FixedUnigramSampler : public RangeSampler {
   // Sharding information of the sampler. The whole vocabulary is sharded
   // into num_shards_ smaller ranges and each sampler is responsible for one
   // such smaller range, identified by the shard number.
-  int32 num_shards_;
-  int32 shard_;
+  int32_t num_shards_;
+  int32_t shard_;
   float distortion_;
   // Fill the sampler with the appropriate number of reserved IDs.
   void FillReservedIds(int32_t num_reserved_ids);
   // Load IDs to sample from a CSV file. It is assumed that the last item of
   // each row contains a count or probability for the corresponding ID.
-  absl::Status LoadFromFile(Env* env, const string& vocab_file,
+  absl::Status LoadFromFile(Env* env, const std::string& vocab_file,
                             float distortion);
   // Load from an in-memory array.
   void LoadFromUnigrams(const std::vector<float>& unigrams, float distortion);
diff --git a/tensorflow/core/kernels/range_sampler_test.cc b/tensorflow/core/kernels/range_sampler_test.cc
index 1aeadc634ccea3..93891f10446311 100644
--- a/tensorflow/core/kernels/range_sampler_test.cc
+++ b/tensorflow/core/kernels/range_sampler_test.cc
@@ -157,7 +157,7 @@ static const char kVocabContent[] =
     "w9,256";
 TEST_F(RangeSamplerTest, FixedUnigramProbabilities) {
   Env* env = Env::Default();
-  string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+  std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
   TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0);
   TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname));
@@ -169,7 +169,7 @@ TEST_F(RangeSamplerTest, FixedUnigramProbabilities) {
 }
 TEST_F(RangeSamplerTest, FixedUnigramNoExistingFilename) {
   Env* env = Env::Default();
-  string fname = "NoExistingFile";
+  std::string fname = "NoExistingFile";
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0);
   absl::Status s = test_sampler->SetDistributionSampler(env, fname);
   sampler_.reset(test_sampler);
@@ -177,7 +177,7 @@ TEST_F(RangeSamplerTest, FixedUnigramNoExistingFilename) {
 }
 TEST_F(RangeSamplerTest, FixedUnigramNoMatchingRangeWeights) {
   Env* env = Env::Default();
-  string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+  std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
   TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(8, 0.8, 0, 1, 0);
   absl::Status s = test_sampler->SetDistributionSampler(env, fname);
@@ -186,7 +186,7 @@ TEST_F(RangeSamplerTest, FixedUnigramNoMatchingRangeWeights) {
 }
 TEST_F(RangeSamplerTest, FixedUnigramChecksum) {
   Env* env = Env::Default();
-  string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+  std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
   TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0);
   TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname));
@@ -195,7 +195,7 @@ TEST_F(RangeSamplerTest, FixedUnigramChecksum) {
 }
 TEST_F(RangeSamplerTest, FixedUnigramHistogram) {
   Env* env = Env::Default();
-  string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+  std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
   TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(9, 0.8, 0, 1, 0);
   TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname));
@@ -204,7 +204,7 @@ TEST_F(RangeSamplerTest, FixedUnigramHistogram) {
 }
 TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) {
   Env* env = Env::Default();
-  string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+  std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
   TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(10, 0.8, 1, 1, 0);
   TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname));
@@ -217,7 +217,7 @@ TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve1) {
 }
 TEST_F(RangeSamplerTest, FixedUnigramProbabilitiesReserve2) {
   Env* env = Env::Default();
-  string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
+  std::string fname = io::JoinPath(testing::TmpDir(), "vocab_file");
   TF_CHECK_OK(WriteStringToFile(env, fname, kVocabContent));
   FixedUnigramSampler* test_sampler = new FixedUnigramSampler(11, 0.8, 2, 1, 0);
   TF_CHECK_OK(test_sampler->SetDistributionSampler(env, fname));
diff --git a/tensorflow/core/kernels/record_input_op.cc b/tensorflow/core/kernels/record_input_op.cc
index 1fae7e40af9abd..d1c3fbd1f70cb9 100644
--- a/tensorflow/core/kernels/record_input_op.cc
+++ b/tensorflow/core/kernels/record_input_op.cc
@@ -30,13 +30,13 @@ class RecordInputOp : public OpKernel {
   TYPE FIELD;                \
   OP_REQUIRES_OK(ctx, ctx->GetAttr(#FIELD, &FIELD));
 
-    GETATTR(string, file_pattern);
+    GETATTR(std::string, file_pattern);
     GETATTR(int64_t, file_random_seed);
     GETATTR(float, file_shuffle_shift_ratio);
     GETATTR(int64_t, file_buffer_size);
     GETATTR(int64_t, file_parallelism);
     GETATTR(int64_t, batch_size);
-    GETATTR(string, compression_type);
+    GETATTR(std::string, compression_type);
 #undef GETATTR
 
     OP_REQUIRES_OK(ctx, ctx->GetAttr("compression_type", &compression_type));
diff --git a/tensorflow/core/kernels/record_yielder.cc b/tensorflow/core/kernels/record_yielder.cc
index db8d59515c2f43..e186c92e7c3b30 100644
--- a/tensorflow/core/kernels/record_yielder.cc
+++ b/tensorflow/core/kernels/record_yielder.cc
@@ -82,10 +82,10 @@ bool RecordYielder::ShouldFinish(const absl::Status& s) {
   return stop_ || !status_.ok();
 }
 
-static absl::Status MatchFiles(const string& patterns,
-                               std::vector<string>* filenames) {
+static absl::Status MatchFiles(const std::string& patterns,
+                               std::vector<std::string>* filenames) {
   for (const auto& file_pattern : str_util::Split(patterns, ',')) {
-    std::vector<string> tmp_filenames;
+    std::vector<std::string> tmp_filenames;
     TF_RETURN_IF_ERROR(
         Env::Default()->GetMatchingPaths(file_pattern, &tmp_filenames));
     filenames->insert(filenames->end(),
@@ -102,7 +102,7 @@ void RecordYielder::MainLoop() {
     num_records_added_in_epoch_ = 0;
 
     // Finds all files.
-    std::vector<string> filenames;
+    std::vector<std::string> filenames;
     absl::Status s = MatchFiles(opts_.file_pattern, &filenames);
 
     if (filenames.empty()) {
@@ -121,7 +121,7 @@ void RecordYielder::MainLoop() {
     std::shuffle(filenames.begin(), filenames.end(), shuffle_rnd);
 
     // Left-shift the filename list.
-    const std::vector<string>::size_type num = filenames.size();
+    const std::vector<std::string>::size_type num = filenames.size();
     int64_t shift;
     if (0 <= opts_.file_shuffle_shift_ratio &&
         opts_.file_shuffle_shift_ratio < 1) {
@@ -136,7 +136,8 @@ void RecordYielder::MainLoop() {
     for (int i = 0; i < N; ++i) {
       Shard* shard = &shards[i];
       shard->index = i;
-      for (std::vector<string>::size_type j = i; j < filenames.size(); j += N) {
+      for (std::vector<std::string>::size_type j = i; j < filenames.size();
+           j += N) {
         shard->filenames.push_back(filenames[j]);
       }
       thread_->Schedule([this, shard]() { ShardLoop(shard); });
@@ -172,7 +173,7 @@ void RecordYielder::MainLoop() {
   main_loop_done_.Notify();
 }
 
-bool RecordYielder::Add(std::vector<string>* values) {
+bool RecordYielder::Add(std::vector<std::string>* values) {
   mutex_lock l(mu_);
   while (!BufNotFull()) {
     buf_not_full_.wait(l);
@@ -197,9 +198,9 @@ bool RecordYielder::Add(std::vector<string>* values) {
 }
 
 void RecordYielder::ShardLoop(Shard* shard) {
-  std::vector<string> values;
+  std::vector<std::string> values;
   const int64_t kRecords = 16;
-  for (const string& filename : shard->filenames) {
+  for (const std::string& filename : shard->filenames) {
     std::unique_ptr<RandomAccessFile> file;
     if (ShouldFinish(absl::OkStatus())) break;
     absl::Status s = Env::Default()->NewRandomAccessFile(filename, &file);
@@ -211,7 +212,7 @@ void RecordYielder::ShardLoop(Shard* shard) {
         io::RecordReaderOptions::CreateRecordReaderOptions(
             opts_.compression_type);
     io::RecordReader rdr(file.get(), options);
-    uint64 offset = 0;
+    uint64_t offset = 0;
     tstring record;
     while (true) {
       absl::Status s = rdr.ReadRecord(&offset, &record);
diff --git a/tensorflow/core/kernels/record_yielder.h b/tensorflow/core/kernels/record_yielder.h
index 6184a283ecece1..8f201082eac5f4 100644
--- a/tensorflow/core/kernels/record_yielder.h
+++ b/tensorflow/core/kernels/record_yielder.h
@@ -59,7 +59,7 @@ class RecordYielder {
  public:
   struct Options {
     // Glob pattern for tfrecords.
-    string file_pattern;
+    std::string file_pattern;
 
     // Random seed. It determines how data files are shuffled and how
     // records are shuffled.
@@ -73,13 +73,13 @@ class RecordYielder {
     float file_shuffle_shift_ratio = 0;
 
     // Randomization buffer keeps these many records.
-    uint64 bufsize = 1;
+    uint64_t bufsize = 1;
 
     // Uses these many concurrent tfrecord iterators to iterate through
     // tfrecords.
-    int32 parallelism = 1;
+    int32_t parallelism = 1;
 
-    string compression_type;
+    std::string compression_type;
   };
 
   explicit RecordYielder(OpKernelConstruction* context,
@@ -116,7 +116,7 @@ class RecordYielder {
   std::mt19937_64 rnd_ TF_GUARDED_BY(mu_);
 
   // Randomization buffer.
-  std::vector<string> buf_ TF_GUARDED_BY(mu_);
+  std::vector<std::string> buf_ TF_GUARDED_BY(mu_);
 
   // True iff we are draining an epoch.
   bool epoch_end_ = false;
@@ -145,14 +145,14 @@ class RecordYielder {
     // any.
     return stop_ || !status_.ok() || (epoch_end_ && !buf_.empty()) ||
            (!epoch_end_ &&
-            buf_.size() >= std::max<uint64>(1, opts_.bufsize / 2));
+            buf_.size() >= std::max<uint64_t>(1, opts_.bufsize / 2));
   }
 
   void MainLoop();
   struct Shard;
   void ShardLoop(Shard* shard);
   bool ShouldFinish(const absl::Status& s);
-  bool Add(std::vector<string>* values);
+  bool Add(std::vector<std::string>* values);
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduce_join_op.cc b/tensorflow/core/kernels/reduce_join_op.cc
index 6ee2ef0139a427..e05e4c3b4d6030 100644
--- a/tensorflow/core/kernels/reduce_join_op.cc
+++ b/tensorflow/core/kernels/reduce_join_op.cc
@@ -47,7 +47,7 @@ const absl::InlinedVector<int64_t, 8> GetStrides(const TensorShape& shape) {
 // nonspecified dimensions set to 0.  Dimensions must be ordered from outer-most
 // to inner-most with respect to the subset linear index.
 inline int64_t LinearSubIndexToFullIndex(
-    int64_t output_index, const absl::InlinedVector<int32, 8>& dim_list,
+    int64_t output_index, const absl::InlinedVector<int32_t, 8>& dim_list,
     const TensorShape& input_shape,
     const absl::InlinedVector<int64_t, 8>& strides) {
   int64_t result = 0;
@@ -63,7 +63,7 @@ inline int64_t LinearSubIndexToFullIndex(
 
 // Computes the number of input elements reduced per output element.
 int64_t GetReductionIterSize(
-    const absl::InlinedVector<int32, 8>& reduced_indices,
+    const absl::InlinedVector<int32_t, 8>& reduced_indices,
     const TensorShape& input_shape) {
   int64_t result = 1;
   for (int32_t reduce_dim : reduced_indices) {
@@ -74,12 +74,12 @@ int64_t GetReductionIterSize(
 
 // Computes a list of all true reduced indices, accounting for negative
 // indices.
-absl::InlinedVector<int32, 8> GetReducedIndices(const Tensor& reduction_indices,
-                                                int32_t input_dims) {
-  const auto reduction_indices_flat = reduction_indices.flat<int32>();
+absl::InlinedVector<int32_t, 8> GetReducedIndices(
+    const Tensor& reduction_indices, int32_t input_dims) {
+  const auto reduction_indices_flat = reduction_indices.flat<int32_t>();
   const int32_t reduction_dims = reduction_indices_flat.size();
 
-  absl::InlinedVector<int32, 8> reduced_indices(reduction_dims);
+  absl::InlinedVector<int32_t, 8> reduced_indices(reduction_dims);
   for (int32_t i = 0; i < reduction_dims; ++i) {
     reduced_indices[i] = reduction_indices_flat(reduction_dims - i - 1);
     reduced_indices[i] += reduced_indices[i] < 0 ? input_dims : 0;
@@ -91,7 +91,7 @@ absl::InlinedVector<int32, 8> GetReducedIndices(const Tensor& reduction_indices,
 // Appends all unreduced dimensions to the given vector.
 void MakeUnreducedIndices(absl::InlinedVector<bool, 8> index_is_reduced,
                           int32_t input_dims,
-                          absl::InlinedVector<int32, 8>* unreduced_indices) {
+                          absl::InlinedVector<int32_t, 8>* unreduced_indices) {
   for (int32_t index = 0; index < input_dims; ++index) {
     if (!index_is_reduced[index]) unreduced_indices->push_back(index);
   }
@@ -128,7 +128,7 @@ class ReduceJoinOp : public OpKernel {
     const int32_t input_dims = input_shape.dims();
 
     const Tensor& reduction_indices = context->input(1);
-    const auto reduction_indices_flat = reduction_indices.flat<int32>();
+    const auto reduction_indices_flat = reduction_indices.flat<int32_t>();
     const int32_t reduction_dims = reduction_indices_flat.size();
 
     absl::InlinedVector<bool, 8> index_is_reduced(input_dims, false);
@@ -146,9 +146,9 @@ class ReduceJoinOp : public OpKernel {
       index_is_reduced[true_reduce_index] = true;
     }
 
-    absl::InlinedVector<int32, 8> reduced_indices =
+    absl::InlinedVector<int32_t, 8> reduced_indices =
         GetReducedIndices(reduction_indices, input_dims);
-    absl::InlinedVector<int32, 8> unreduced_indices;
+    absl::InlinedVector<int32_t, 8> unreduced_indices;
     MakeUnreducedIndices(index_is_reduced, input_dims, &unreduced_indices);
     const auto strides = GetStrides(input_shape);
 
@@ -179,7 +179,7 @@ class ReduceJoinOp : public OpKernel {
 
  private:
   bool keep_dims_;
-  string separator_;
+  std::string separator_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("ReduceJoin").Device(DEVICE_CPU), ReduceJoinOp);
diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h
index 510fbc9326d324..34f559704ed521 100644
--- a/tensorflow/core/kernels/reduction_ops.h
+++ b/tensorflow/core/kernels/reduction_ops.h
@@ -114,12 +114,12 @@ struct ReduceEigenImpl<Device, OUT_T, IN_T, ReductionAxes,
     }                                                                         \
   }
 
-CASTING_SPECIALIZATION(uint8, uint64);
-CASTING_SPECIALIZATION(uint16, uint64);
-CASTING_SPECIALIZATION(uint32, uint64);
-CASTING_SPECIALIZATION(int8, int64_t);
-CASTING_SPECIALIZATION(int16, int64_t);
-CASTING_SPECIALIZATION(int32, int64_t);
+CASTING_SPECIALIZATION(uint8_t, uint64_t);
+CASTING_SPECIALIZATION(uint16_t, uint64_t);
+CASTING_SPECIALIZATION(uint32_t, uint64_t);
+CASTING_SPECIALIZATION(int8_t, int64_t);
+CASTING_SPECIALIZATION(int16_t, int64_t);
+CASTING_SPECIALIZATION(int32_t, int64_t);
 CASTING_SPECIALIZATION(bfloat16, float);
 #undef CASTING_SPECIALIZATION
 
diff --git a/tensorflow/core/kernels/reduction_ops_all.cc b/tensorflow/core/kernels/reduction_ops_all.cc
index 02ad8662cdc014..54c8e4969717c7 100644
--- a/tensorflow/core/kernels/reduction_ops_all.cc
+++ b/tensorflow/core/kernels/reduction_ops_all.cc
@@ -19,16 +19,16 @@ namespace tensorflow {
 
 REGISTER_KERNEL_BUILDER(
     Name("All")
-        .TypeConstraint<int32>("Tidx")
+        .TypeConstraint<int32_t>("Tidx")
         .Device(DEVICE_CPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, bool, int32, Eigen::internal::AndReducer>);
+    ReductionOp<CPUDevice, bool, int32_t, Eigen::internal::AndReducer>);
 REGISTER_KERNEL_BUILDER(
     Name("All")
         .TypeConstraint<int64_t>("Tidx")
         .Device(DEVICE_CPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, bool, int64, Eigen::internal::AndReducer>);
+    ReductionOp<CPUDevice, bool, int64_t, Eigen::internal::AndReducer>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/reduction_ops_any.cc b/tensorflow/core/kernels/reduction_ops_any.cc
index 0d5b531b6b9d22..9675bbccc0f7e2 100644
--- a/tensorflow/core/kernels/reduction_ops_any.cc
+++ b/tensorflow/core/kernels/reduction_ops_any.cc
@@ -19,16 +19,16 @@ namespace tensorflow {
 
 REGISTER_KERNEL_BUILDER(
     Name("Any")
-        .TypeConstraint<int32>("Tidx")
+        .TypeConstraint<int32_t>("Tidx")
         .Device(DEVICE_CPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, bool, int32, Eigen::internal::OrReducer>);
+    ReductionOp<CPUDevice, bool, int32_t, Eigen::internal::OrReducer>);
 REGISTER_KERNEL_BUILDER(
     Name("Any")
         .TypeConstraint<int64_t>("Tidx")
         .Device(DEVICE_CPU)
         .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, bool, int64, Eigen::internal::OrReducer>);
+    ReductionOp<CPUDevice, bool, int64_t, Eigen::internal::OrReducer>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/reduction_ops_common.cc b/tensorflow/core/kernels/reduction_ops_common.cc
index 60f5b9462f8366..028743cf9c3d18 100644
--- a/tensorflow/core/kernels/reduction_ops_common.cc
+++ b/tensorflow/core/kernels/reduction_ops_common.cc
@@ -44,10 +44,10 @@ TensorShape ReductionHelper::shuffled_shape() {
   return shape;
 }
 
-absl::InlinedVector<int32, 8> ReductionHelper::permutation() {
+absl::InlinedVector<int32_t, 8> ReductionHelper::permutation() {
   const int dims = data_reshape_.size();
   const int unreduced_dims = (dims + !reduce_first_axis_) / 2;
-  absl::InlinedVector<int32, 8> perm(dims);
+  absl::InlinedVector<int32_t, 8> perm(dims);
   for (int i = 0; i < unreduced_dims; i++) {
     perm[i] = 2 * i + reduce_first_axis_;
   }
@@ -84,7 +84,7 @@ absl::Status ReductionHelper::Simplify(const Tensor& data, const Tensor& axis,
   // bitmap[i] indicates whether to reduce data along i-th axis.
   absl::InlinedVector<bool, 4> bitmap(data.dims(), false);
   if (axis.dtype() == DT_INT32) {
-    TF_RETURN_IF_ERROR(SimplifyHelper<int32>(data, axis, bitmap));
+    TF_RETURN_IF_ERROR(SimplifyHelper<int32_t>(data, axis, bitmap));
   } else {
     TF_RETURN_IF_ERROR(SimplifyHelper<int64_t>(data, axis, bitmap));
   }
diff --git a/tensorflow/core/kernels/reduction_ops_common.h b/tensorflow/core/kernels/reduction_ops_common.h
index 6ce777f748a777..daab208f725bec 100644
--- a/tensorflow/core/kernels/reduction_ops_common.h
+++ b/tensorflow/core/kernels/reduction_ops_common.h
@@ -114,7 +114,7 @@ class ReductionHelper {
   TensorShape shuffled_shape();
 
   // Permutation of reduced dims needed to put reduction dimensions at the end
-  absl::InlinedVector<int32, 8> permutation();
+  absl::InlinedVector<int32_t, 8> permutation();
 
  private:
   bool reduce_first_axis_;  // True if need to reduce the 0-th dimension.
diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc
index 59d7c89b7795be..54025c4e612fe2 100644
--- a/tensorflow/core/kernels/reduction_ops_max.cc
+++ b/tensorflow/core/kernels/reduction_ops_max.cc
@@ -67,23 +67,23 @@ REGISTER_GPU_KERNELS(int64_t);
 // A special DEVICE_DEFAULT kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(
-    Name("Max")
-        .Device(DEVICE_DEFAULT)
-        .HostMemory("reduction_indices")
-        .HostMemory("input")
-        .HostMemory("output")
-        .TypeConstraint<int32>("T")
-        .TypeConstraint<int32>("Tidx"),
-    ReductionOp<CPUDevice, int32, int32, Eigen::internal::MaxReducer<int32>>);
-REGISTER_KERNEL_BUILDER(
-    Name("Max")
-        .Device(DEVICE_DEFAULT)
-        .HostMemory("reduction_indices")
-        .HostMemory("input")
-        .HostMemory("output")
-        .TypeConstraint<int32>("T")
-        .TypeConstraint<int64_t>("Tidx"),
-    ReductionOp<CPUDevice, int32, int64, Eigen::internal::MaxReducer<int32>>);
+REGISTER_KERNEL_BUILDER(Name("Max")
+                            .Device(DEVICE_DEFAULT)
+                            .HostMemory("reduction_indices")
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int32_t>("Tidx"),
+                        ReductionOp<CPUDevice, int32_t, int32_t,
+                                    Eigen::internal::MaxReducer<int32_t>>);
+REGISTER_KERNEL_BUILDER(Name("Max")
+                            .Device(DEVICE_DEFAULT)
+                            .HostMemory("reduction_indices")
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int64_t>("Tidx"),
+                        ReductionOp<CPUDevice, int32_t, int64_t,
+                                    Eigen::internal::MaxReducer<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc
index d493cc7514b5d1..b81cd549373d2e 100644
--- a/tensorflow/core/kernels/reduction_ops_min.cc
+++ b/tensorflow/core/kernels/reduction_ops_min.cc
@@ -65,24 +65,23 @@ REGISTER_GPU_KERNELS(double);
 // A special DEVICE_DEFAULT kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(
-    Name("Min")
-        .Device(DEVICE_DEFAULT)
-        .HostMemory("reduction_indices")
-        .HostMemory("input")
-        .HostMemory("output")
-        .TypeConstraint<int32>("T")
-        .TypeConstraint<int32>("Tidx"),
-    ReductionOp<CPUDevice, int32, int32, Eigen::internal::MinReducer<int32>>);
-REGISTER_KERNEL_BUILDER(
-    Name("Min")
-        .Device(DEVICE_DEFAULT)
-        .HostMemory("reduction_indices")
-        .HostMemory("input")
-        .HostMemory("output")
-        .TypeConstraint<int32>("T")
-        .TypeConstraint<int64_t>("Tidx"),
-    ReductionOp<CPUDevice, int32, int64, Eigen::internal::MinReducer<int32>>);
-
+REGISTER_KERNEL_BUILDER(Name("Min")
+                            .Device(DEVICE_DEFAULT)
+                            .HostMemory("reduction_indices")
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int32_t>("Tidx"),
+                        ReductionOp<CPUDevice, int32_t, int32_t,
+                                    Eigen::internal::MinReducer<int32_t>>);
+REGISTER_KERNEL_BUILDER(Name("Min")
+                            .Device(DEVICE_DEFAULT)
+                            .HostMemory("reduction_indices")
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int64_t>("Tidx"),
+                        ReductionOp<CPUDevice, int32_t, int64_t,
+                                    Eigen::internal::MinReducer<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/reduction_ops_test.cc b/tensorflow/core/kernels/reduction_ops_test.cc
index cb5fda312752ac..4c77592f5dbf36 100644
--- a/tensorflow/core/kernels/reduction_ops_test.cc
+++ b/tensorflow/core/kernels/reduction_ops_test.cc
@@ -24,58 +24,58 @@ namespace tensorflow {
 // Creates a Graph which "reduce"s a 3D float tensor of "num" elements
 // into a scalar.
 template <typename T>
-static Graph* ToScalar(const string& reduce, int num_x, int num_y) {
+static Graph* ToScalar(const std::string& reduce, int num_x, int num_y) {
   auto* g = new Graph(OpRegistry::Global());
   Tensor data(DataTypeToEnum<T>::value, TensorShape({num_x, num_y}));
   data.flat<T>().setRandom();
   Tensor axes(DT_INT32, TensorShape({2}));
-  axes.flat<int32>()(0) = 0;
-  axes.flat<int32>()(1) = 1;
+  axes.flat<int32_t>()(0) = 0;
+  axes.flat<int32_t>()(1) = 1;
   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                       test::graph::Constant(g, axes));
   return g;
 }
 
-static Graph* ColReduce(const string& reduce, int num_x, int num_y) {
+static Graph* ColReduce(const std::string& reduce, int num_x, int num_y) {
   auto* g = new Graph(OpRegistry::Global());
   Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
   data.flat<float>().setRandom();
   Tensor axes(DT_INT32, TensorShape({1}));
-  axes.flat<int32>()(0) = 0;
+  axes.flat<int32_t>()(0) = 0;
   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                       test::graph::Constant(g, axes));
   return g;
 }
 
-static Graph* RowReduce(const string& reduce, int num_x, int num_y) {
+static Graph* RowReduce(const std::string& reduce, int num_x, int num_y) {
   auto* g = new Graph(OpRegistry::Global());
   Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
   data.flat<float>().setRandom();
   Tensor axes(DT_INT32, TensorShape({1}));
-  axes.flat<int32>()(0) = 1;
+  axes.flat<int32_t>()(0) = 1;
   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                       test::graph::Constant(g, axes));
   return g;
 }
 
-static Graph* ThreeDYReduce(const string& reduce, int num_y, int num_z) {
+static Graph* ThreeDYReduce(const std::string& reduce, int num_y, int num_z) {
   auto* g = new Graph(OpRegistry::Global());
   Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z}));
   data.flat<float>().setRandom();
   Tensor axes(DT_INT32, TensorShape({1}));
-  axes.flat<int32>()(0) = 1;
+  axes.flat<int32_t>()(0) = 1;
   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                       test::graph::Constant(g, axes));
   return g;
 }
 
-static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
+static Graph* ThreeDXZReduce(const std::string& reduce, int num_y, int num_z) {
   auto* g = new Graph(OpRegistry::Global());
   Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z}));
   data.flat<float>().setRandom();
   Tensor axes(DT_INT32, TensorShape({2}));
-  axes.flat<int32>()(0) = 0;
-  axes.flat<int32>()(1) = 2;
+  axes.flat<int32_t>()(0) = 0;
+  axes.flat<int32_t>()(1) = 2;
   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
                       test::graph::Constant(g, axes));
   return g;
@@ -85,7 +85,7 @@ static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
 // into a scalar on a "device". Runs the bench for "iters" times.
 template <typename T>
 static void ReduceToScalar(::testing::benchmark::State& state,
-                           const string& device, const string& reduce,
+                           const std::string& device, const std::string& reduce,
                            int num_x, int num_y) {
   test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y),
                   /*old_benchmark_api*/ false)
@@ -97,8 +97,8 @@ static void ReduceToScalar(::testing::benchmark::State& state,
 }
 
 static void DoRowReduce(::testing::benchmark::State& state,
-                        const string& device, const string& reduce, int num_x,
-                        int num_y) {
+                        const std::string& device, const std::string& reduce,
+                        int num_x, int num_y) {
   test::Benchmark(device, RowReduce(reduce, num_x, num_y),
                   /*old_benchmark_api*/ false)
       .Run(state);
@@ -109,8 +109,8 @@ static void DoRowReduce(::testing::benchmark::State& state,
 }
 
 static void DoColReduce(::testing::benchmark::State& state,
-                        const string& device, const string& reduce, int num_x,
-                        int num_y) {
+                        const std::string& device, const std::string& reduce,
+                        int num_x, int num_y) {
   test::Benchmark(device, ColReduce(reduce, num_x, num_y),
                   /*old_benchmark_api*/ false)
       .Run(state);
@@ -121,8 +121,8 @@ static void DoColReduce(::testing::benchmark::State& state,
 }
 
 static void Do3DYReduce(::testing::benchmark::State& state,
-                        const string& device, const string& reduce, int num_x,
-                        int num_y) {
+                        const std::string& device, const std::string& reduce,
+                        int num_x, int num_y) {
   test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y),
                   /*old_benchmark_api*/ false)
       .Run(state);
@@ -133,8 +133,8 @@ static void Do3DYReduce(::testing::benchmark::State& state,
 }
 
 static void Do3DXZReduce(::testing::benchmark::State& state,
-                         const string& device, const string& reduce, int num_x,
-                         int num_y) {
+                         const std::string& device, const std::string& reduce,
+                         int num_x, int num_y) {
   test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y),
                   /*old_benchmark_api*/ false)
       .Run(state);
diff --git a/tensorflow/core/kernels/reference_gemm.h b/tensorflow/core/kernels/reference_gemm.h
index 9d0bb60ed436b4..e90656fd36b298 100644
--- a/tensorflow/core/kernels/reference_gemm.h
+++ b/tensorflow/core/kernels/reference_gemm.h
@@ -64,8 +64,8 @@ void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
     c_j_stride = 1;
   }
 
-  const int32_t highest = static_cast<int32>(Eigen::NumTraits<T3>::highest());
-  const int32_t lowest = static_cast<int32>(Eigen::NumTraits<T3>::lowest());
+  const int32_t highest = static_cast<int32_t>(Eigen::NumTraits<T3>::highest());
+  const int32_t lowest = static_cast<int32_t>(Eigen::NumTraits<T3>::lowest());
   const int32_t rounding = (shift_c < 1) ? 0 : (1 << (shift_c - 1));
 
   int i, j, l;
@@ -74,9 +74,9 @@ void ReferenceGemm(bool transpose_a, bool transpose_b, bool transpose_c,
       int32_t total = 0;
       for (l = 0; l < k; l++) {
         const size_t a_index = ((i * a_i_stride) + (l * a_l_stride));
-        const int32_t a_value = static_cast<int32>(a[a_index]) - offset_a;
+        const int32_t a_value = static_cast<int32_t>(a[a_index]) - offset_a;
         const size_t b_index = ((j * b_j_stride) + (l * b_l_stride));
-        const int32_t b_value = static_cast<int32>(b[b_index]) - offset_b;
+        const int32_t b_value = static_cast<int32_t>(b[b_index]) - offset_b;
         total += (a_value * b_value);
       }
       const size_t c_index = ((i * c_i_stride) + (j * c_j_stride));

From 0301475b0e056024d7b32533b18aeb6fdb2c609b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:03:29 -0800
Subject: [PATCH 421/753] Automated Code Change

PiperOrigin-RevId: 845699350
---
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    |  9 ++--
 .../xla/xla/pjrt/cpu/cpu_client_test.cc       | 43 ++++++++++---------
 .../xla/pjrt/cpu/tracked_cpu_device_buffer.cc |  6 ++-
 3 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 125ef05f15c00a..1e56ee3735163c 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -1920,17 +1920,18 @@ static void MaybeDumpHloSnapshot(
   *hlo_snapshot.mutable_hlo()->mutable_hlo_module() = module.ToProto();
 
   for (auto* argument : arguments) {
-    *hlo_snapshot.add_arguments() = (*argument->ToLiteralSync())->ToProto();
+    *hlo_snapshot.add_arguments() = (*argument->ToLiteral().Await())->ToProto();
   }
 
   // If there are multiple results, wrap them in a tuple.
   if (results.size() == 1) {
-    *hlo_snapshot.mutable_result() = (*results[0]->ToLiteralSync())->ToProto();
+    *hlo_snapshot.mutable_result() =
+        (*results[0]->ToLiteral().Await())->ToProto();
   } else {
     std::vector<Literal> result_literals;
     result_literals.reserve(results.size());
     for (auto& result : results) {
-      result_literals.push_back(std::move(**result->ToLiteralSync()));
+      result_literals.push_back(std::move(**result->ToLiteral().Await()));
     }
     *hlo_snapshot.mutable_result() =
         LiteralUtil::MakeTupleOwned(std::move(result_literals)).ToProto();
@@ -1987,7 +1988,7 @@ PjRtCpuExecutable::Execute(
       for (const auto& argument_handle : argument_handles) {
         HloInputs hlo_inputs;
         for (const auto& buffer : argument_handle) {
-          TF_ASSIGN_OR_RETURN(auto literal, buffer->ToLiteralSync());
+          TF_ASSIGN_OR_RETURN(auto literal, buffer->ToLiteral().Await());
           *hlo_inputs.add_arguments() = literal->ToProto();
         }
         *hlo_snapshot.add_partitions() = std::move(hlo_inputs);
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc b/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc
index c7b6b231e5a7db..b1d1bd1ce8e27f 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client_test.cc
@@ -445,7 +445,7 @@ TEST(PjRtCpuClientTest, AsyncTransferRawData) {
   absl::string_view raw_data_view(raw_data, raw_data_size);
   TF_ASSERT_OK(transfer_manager->TransferRawDataToBuffer(
       0, absl::string_view(raw_data, raw_data_size), []() {}));
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteral().Await());
   ASSERT_EQ(literal->element_count(), 3 * 2);
   EXPECT_THAT(literal->data<uint32_t>(), Each(0x42424242));
 }
@@ -466,7 +466,7 @@ TEST(PjRtCpuClientTest, AsyncTransferWithSpecs) {
   absl::string_view raw_data_view(raw_data, raw_data_size);
   TF_ASSERT_OK(transfer_manager->TransferRawDataToBuffer(
       0, absl::string_view(raw_data, raw_data_size), []() {}));
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteral().Await());
   ASSERT_EQ(literal->element_count(), 3 * 2);
   EXPECT_THAT(literal->data<uint32_t>(), Each(0x42424242));
 }
@@ -482,7 +482,7 @@ TEST(PjRtCpuClientTest, AsyncTransferLiteral) {
   EXPECT_THAT(ready_future.IsReady(), IsFalse());
   TF_ASSERT_OK_AND_ASSIGN(auto literal, xla::MakeFakeLiteral(shape));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToBuffer(0, literal, []() {}));
-  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteral().Await());
   EXPECT_THAT(received_literal->data<float>(),
               ElementsAreArray(literal.data<float>()));
 }
@@ -498,7 +498,7 @@ TEST(PjRtCpuClientTest, AsyncTransferLiteralInt4) {
   EXPECT_THAT(ready_future.IsReady(), IsFalse());
   TF_ASSERT_OK_AND_ASSIGN(auto literal, xla::MakeFakeLiteral(shape));
   TF_ASSERT_OK(transfer_manager->TransferLiteralToBuffer(0, literal, []() {}));
-  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteral().Await());
   EXPECT_THAT(received_literal->data<s4>(),
               ElementsAreArray(literal.data<s4>()));
 }
@@ -510,7 +510,7 @@ TEST(PjRtCpuClientTest, BufferFromLiteralInt4) {
   TF_ASSERT_OK_AND_ASSIGN(
       auto buffer,
       client->BufferFromHostLiteral(literal, client->memory_spaces()[0]));
-  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteral().Await());
   EXPECT_THAT(received_literal->data<s4>(),
               ElementsAreArray(literal.data<s4>()));
 }
@@ -524,7 +524,7 @@ TEST(PjRtCpuClientTest, CopyToMemorySpace) {
       client->BufferFromHostLiteral(literal, client->memory_spaces()[0]));
   TF_ASSERT_OK_AND_ASSIGN(buffer,
                           buffer->CopyToMemorySpace(buffer->memory_space()));
-  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteral().Await());
   EXPECT_THAT(received_literal->data<int32_t>(),
               ElementsAreArray(literal.data<int32_t>()));
 }
@@ -556,7 +556,7 @@ TEST(PjRtCpuClientTest, AsyncTransferNeverTransferred) {
   auto buffer = transfer_manager->RetrieveBuffer(0);
   transfer_manager.reset();
   EXPECT_THAT(
-      buffer->ToLiteralSync(),
+      buffer->ToLiteral().Await(),
       absl_testing::StatusIs(tsl::error::INTERNAL,
                              HasSubstr("Async transfer object was deleted "
                                        "before transfers completed.")));
@@ -603,7 +603,7 @@ TEST(PjRtCpuClientTest, AsyncTransferSetBufferError) {
   auto buffer = transfer_manager->RetrieveBuffer(0);
   transfer_manager->SetBufferError(0, Internal("foobar"));
   EXPECT_THAT(
-      buffer->ToLiteralSync(),
+      buffer->ToLiteral().Await(),
       absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar")));
 }
 
@@ -615,7 +615,7 @@ TEST(PjRtCpuClientTest, CreateErrorBuffer) {
         auto buffer,
         client->CreateErrorBuffer(Internal("foobar"), shape, memory_space));
     EXPECT_THAT(
-        buffer->ToLiteralSync(),
+        buffer->ToLiteral().Await(),
         absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar")));
     EXPECT_EQ(buffer->memory_space(), memory_space);
   }
@@ -640,7 +640,7 @@ TEST(PjRtCpuClientTest, AsyncTransferRawDataToSubBuffer) {
   TF_ASSERT_OK(transfer_manager->TransferRawDataToSubBuffer(
       0, raw_data_view.data(), raw_data_size - 1, 1, /*is_last_transfer=*/true,
       []() {}));
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, buffer->ToLiteral().Await());
   ASSERT_EQ(literal->element_count(), 3 * 2);
   EXPECT_THAT(literal->data<uint32_t>(), Each(0x42424242));
 }
@@ -679,7 +679,7 @@ ENTRY Identity() -> f32[2, 2] {
   ASSERT_THAT(result, absl_testing::StatusIs(tsl::error::OK));
   // However, the buffer is expected to be poisoned.
   EXPECT_THAT(
-      result->at(0).at(0)->ToLiteralSync(),
+      result->at(0).at(0)->ToLiteral().Await(),
       absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar")));
 }
 
@@ -718,7 +718,7 @@ ENTRY Identity() -> f32[2, 2] {
   ASSERT_EQ(result->size(), 1);
   ASSERT_EQ(result->at(0).size(), 1);
   EXPECT_THAT(
-      result->at(0).at(0)->ToLiteralSync(),
+      result->at(0).at(0)->ToLiteral().Await(),
       absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar")));
 }
 
@@ -776,11 +776,11 @@ ENTRY Identity() -> f32[2, 2] {
   }
   for (int i = 0; i < output_buffers.size(); ++i) {
     if (i % 2 == 0) {
-      EXPECT_THAT(output_buffers[i]->ToLiteralSync(),
+      EXPECT_THAT(output_buffers[i]->ToLiteral().Await(),
                   absl_testing::StatusIs(tsl::error::OK));
     } else {
       EXPECT_THAT(
-          output_buffers[i]->ToLiteralSync(),
+          output_buffers[i]->ToLiteral().Await(),
           absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar")));
     }
   }
@@ -833,7 +833,7 @@ ENTRY Identity() -> f32[2, 2] {
   ASSERT_EQ(result->size(), 1);
   ASSERT_EQ(result->at(0).size(), 1);
   EXPECT_THAT(
-      result->at(0).at(0)->ToLiteralSync(),
+      result->at(0).at(0)->ToLiteral().Await(),
       absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar1")));
 
   // A later error (propagated from the input buffer) would not affect the
@@ -841,7 +841,7 @@ ENTRY Identity() -> f32[2, 2] {
   transfer_manager->SetBufferError(0, Internal("foobar2"));
 
   EXPECT_THAT(
-      result->at(0).at(0)->ToLiteralSync(),
+      result->at(0).at(0)->ToLiteral().Await(),
       absl_testing::StatusIs(tsl::error::INTERNAL, HasSubstr("foobar1")));
 
   // Attempting to poison a non-existent execution should fail.
@@ -900,7 +900,7 @@ TEST(PjRtCpuClientTest, ForwardUserDataToFfiHandler) {
   auto result = executable->Execute(/*argument_handles=*/{{}}, opts);
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal,
-                          result->at(0).at(0)->ToLiteralSync());
+                          result->at(0).at(0)->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR1<float>({42.0f, 42.0f, 42.0f, 42.0f}),
       *result_literal));
@@ -944,7 +944,7 @@ TEST(PjRtCpuClientTest, PassAttrToFfiHandler) {
   auto result = executable->Execute(/*argument_handles=*/{{}}, opts);
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal,
-                          result->at(0).at(0)->ToLiteralSync());
+                          result->at(0).at(0)->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR1<float>({3.0f, 3.0f, 3.0f, 3.0f}), *result_literal));
 }
@@ -1026,7 +1026,7 @@ TEST(PjRtCpuClientTest, CustomAllocator) {
   TF_ASSERT_OK_AND_ASSIGN(
       auto buffer,
       client->BufferFromHostLiteral(literal, client->memory_spaces()[0]));
-  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto received_literal, buffer->ToLiteral().Await());
 
   // Check that buffer was constructed in the data array provided by the custom
   // allocator.
@@ -1074,7 +1074,7 @@ TEST(PjRtCpuClientTest, SerializeYnnFusions) {
   auto result = executable->Execute({{buf.get(), buf.get()}}, opts);
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal,
-                          result->at(0).at(0)->ToLiteralSync());
+                          result->at(0).at(0)->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR1<float>(literal_data_x2_squared), *result_literal));
 
@@ -1087,7 +1087,8 @@ TEST(PjRtCpuClientTest, SerializeYnnFusions) {
       client->LoadSerializedExecutable(serialized, std::nullopt, {}));
 
   result = executable->Execute({{buf.get(), buf.get()}}, opts);
-  TF_ASSERT_OK_AND_ASSIGN(result_literal, result->at(0).at(0)->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(result_literal,
+                          result->at(0).at(0)->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR1<float>(literal_data_x2_squared), *result_literal));
 }
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
index 78d53986c7db66..3e2f8aafc0000b 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
@@ -81,7 +81,8 @@ class AlignedMemory final : public CpuDeviceMemory::RawMemory {
       : base_(base), size_bytes_(size_bytes) {}
 
   ~AlignedMemory() final {
-    tsl::port::AlignedSizedFree(base_, cpu::MinAlign(), size_bytes_);
+    tsl::port::AlignedSizedFree(base_, size_bytes_,
+                                static_cast<std::align_val_t>(cpu::MinAlign()));
   }
 
   void* base() const final { return base_; }
@@ -96,7 +97,8 @@ class AlignedAllocator final : public CpuDeviceMemory::Allocator {
  public:
   absl::StatusOr<std::unique_ptr<CpuDeviceMemory::RawMemory>> Allocate(
       size_t size_bytes, size_t alignment) const final {
-    if (void* base = tsl::port::AlignedMalloc(size_bytes, alignment)) {
+    if (void* base = tsl::port::AlignedMalloc(
+            size_bytes, static_cast<std::align_val_t>(alignment))) {
       return std::make_unique<AlignedMemory>(base, size_bytes);
     }
     return ResourceExhausted("Out of memory allocating %d bytes.", size_bytes);

From eae231d4402acc107809b1254038ff16b6473091 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:12:49 -0800
Subject: [PATCH 422/753] Automated Code Change

PiperOrigin-RevId: 845702513
---
 .../xla/xla/stream_executor/generic_memory_allocation_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/generic_memory_allocation_test.cc b/third_party/xla/xla/stream_executor/generic_memory_allocation_test.cc
index cd3fe22ea1c34b..023cc5b3917938 100644
--- a/third_party/xla/xla/stream_executor/generic_memory_allocation_test.cc
+++ b/third_party/xla/xla/stream_executor/generic_memory_allocation_test.cc
@@ -34,8 +34,8 @@ TEST(GenericMemoryAllocationTest, DeleterIsCalledWithCorrectArguments) {
   };
   {
     GenericMemoryAllocation allocation(array.data(), array.size(), deleter);
-    EXPECT_EQ(allocation.opaque(), array.data());
-    EXPECT_EQ(allocation.size(), array.size());
+    EXPECT_EQ(allocation.address().opaque(), array.data());
+    EXPECT_EQ(allocation.address().size(), array.size());
     EXPECT_FALSE(deleter_called);
   }
   EXPECT_TRUE(deleter_called);

From 4999f8e8e7479f5cc270496504cd2f2b5e3f0835 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:41:45 -0800
Subject: [PATCH 423/753] Automated Code Change

PiperOrigin-RevId: 845710509
---
 .../xla/tsl/platform/cloud/gcs_file_system.cc  | 18 +++++++++---------
 .../tsl/platform/cloud/google_auth_provider.cc |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/tsl/platform/cloud/gcs_file_system.cc b/third_party/xla/xla/tsl/platform/cloud/gcs_file_system.cc
index 0f625395e0489d..5b81602719b6f6 100644
--- a/third_party/xla/xla/tsl/platform/cloud/gcs_file_system.cc
+++ b/third_party/xla/xla/tsl/platform/cloud/gcs_file_system.cc
@@ -405,7 +405,7 @@ class BufferedGcsRandomAccessFile : public RandomAccessFile {
       return read_fn_(filename_, offset, n, result, scratch);
     }
     {
-      absl::MutexLock l(&buffer_mutex_);
+      absl::MutexLock l(buffer_mutex_);
       size_t buffer_end = buffer_start_ + buffer_.size();
       size_t copy_size = 0;
       if (offset < buffer_end && offset >= buffer_start_) {
@@ -1061,7 +1061,7 @@ absl::Status GcsFileSystem::NewRandomAccessFile(
                                                      uint64 offset, size_t n,
                                                      absl::string_view* result,
                                                      char* scratch) {
-      absl::ReaderMutexLock l(&block_cache_lock_);
+      absl::ReaderMutexLock l(block_cache_lock_);
       GcsFileStat stat;
       TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(
           fname, &stat,
@@ -1110,7 +1110,7 @@ absl::Status GcsFileSystem::NewRandomAccessFile(
 void GcsFileSystem::ResetFileBlockCache(size_t block_size_bytes,
                                         size_t max_bytes,
                                         uint64 max_staleness_secs) {
-  absl::MutexLock l(&block_cache_lock_);
+  absl::MutexLock l(block_cache_lock_);
   file_block_cache_ =
       MakeFileBlockCache(block_size_bytes, max_bytes, max_staleness_secs);
   if (stats_ != nullptr) {
@@ -1346,7 +1346,7 @@ absl::Status GcsFileSystem::ParseGcsPath(absl::string_view fname,
 }
 
 void GcsFileSystem::ClearFileCaches(const string& fname) {
-  absl::ReaderMutexLock l(&block_cache_lock_);
+  absl::ReaderMutexLock l(block_cache_lock_);
   file_block_cache_->RemoveFile(fname);
   stat_cache_->Delete(fname);
   // TODO(rxsang): Remove the patterns that match the file in
@@ -2337,7 +2337,7 @@ absl::Status GcsFileSystem::RenameFolderHns(const string& src,
 // reclaiming memory once filesystem operations are done (e.g. model is loaded),
 // or for resetting the filesystem to a consistent state.
 void GcsFileSystem::FlushCaches(TransactionToken* token) {
-  absl::ReaderMutexLock l(&block_cache_lock_);
+  absl::ReaderMutexLock l(block_cache_lock_);
   file_block_cache_->Flush();
   stat_cache_->Clear();
   matching_paths_cache_->Clear();
@@ -2348,13 +2348,13 @@ void GcsFileSystem::FlushCaches(TransactionToken* token) {
 void GcsFileSystem::SetStats(GcsStatsInterface* stats) {
   CHECK(stats_ == nullptr) << "SetStats() has already been called.";
   CHECK(stats != nullptr);
-  absl::MutexLock l(&block_cache_lock_);
+  absl::MutexLock l(block_cache_lock_);
   stats_ = stats;
   stats_->Configure(this, &throttle_, file_block_cache_.get());
 }
 
 void GcsFileSystem::SetCacheStats(FileBlockCacheStatsInterface* cache_stats) {
-  absl::ReaderMutexLock l(&block_cache_lock_);
+  absl::ReaderMutexLock l(block_cache_lock_);
   if (file_block_cache_ == nullptr) {
     LOG(ERROR) << "Tried to set cache stats of non-initialized file block "
                   "cache object. This may result in not exporting the intended "
@@ -2366,7 +2366,7 @@ void GcsFileSystem::SetCacheStats(FileBlockCacheStatsInterface* cache_stats) {
 
 void GcsFileSystem::SetAuthProvider(
     std::unique_ptr<AuthProvider> auth_provider) {
-  absl::MutexLock l(&mu_);
+  absl::MutexLock l(mu_);
   auth_provider_ = std::move(auth_provider);
 }
 
@@ -2382,7 +2382,7 @@ absl::Status GcsFileSystem::CreateHttpRequest(
 
   string auth_token;
   {
-    absl::ReaderMutexLock l(&mu_);
+    absl::ReaderMutexLock l(mu_);
     TF_RETURN_IF_ERROR(
         AuthProvider::GetToken(auth_provider_.get(), &auth_token));
   }
diff --git a/third_party/xla/xla/tsl/platform/cloud/google_auth_provider.cc b/third_party/xla/xla/tsl/platform/cloud/google_auth_provider.cc
index df762f15acb831..c3c1ada70a5df4 100644
--- a/third_party/xla/xla/tsl/platform/cloud/google_auth_provider.cc
+++ b/third_party/xla/xla/tsl/platform/cloud/google_auth_provider.cc
@@ -151,7 +151,7 @@ GoogleAuthProvider::GoogleAuthProvider(
       env_(env) {}
 
 absl::Status GoogleAuthProvider::GetToken(string* t) {
-  absl::MutexLock lock(&mu_);
+  absl::MutexLock lock(mu_);
   const uint64 now_sec = env_->NowSeconds();
 
   if (now_sec + kExpirationTimeMarginSec < expiration_timestamp_sec_) {

From 42d635292653b59dfb21386e65e21b5f6b295b24 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:41:45 -0800
Subject: [PATCH 424/753] Automated Code Change

PiperOrigin-RevId: 845710511
---
 tensorflow/core/lib/strings/base64_test.cc    |   2 +-
 tensorflow/core/lib/strings/ordered_code.cc   |  40 ++--
 tensorflow/core/lib/strings/ordered_code.h    |  12 +-
 .../core/lib/strings/ordered_code_test.cc     | 173 +++++++++---------
 4 files changed, 117 insertions(+), 110 deletions(-)

diff --git a/tensorflow/core/lib/strings/base64_test.cc b/tensorflow/core/lib/strings/base64_test.cc
index df4a4bcf59353c..ce68ee2b4dea72 100644
--- a/tensorflow/core/lib/strings/base64_test.cc
+++ b/tensorflow/core/lib/strings/base64_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 namespace tensorflow {
 
 TEST(Base64, EncodeDecode) {
-  const string original = "a simple test message!";
+  const std::string original = "a simple test message!";
   tstring encoded;
   TF_EXPECT_OK(Base64Encode(original, &encoded));
   EXPECT_EQ("YSBzaW1wbGUgdGVzdCBtZXNzYWdlIQ", encoded);
diff --git a/tensorflow/core/lib/strings/ordered_code.cc b/tensorflow/core/lib/strings/ordered_code.cc
index 31b08152f963e2..8e69dbe7fc6809 100644
--- a/tensorflow/core/lib/strings/ordered_code.cc
+++ b/tensorflow/core/lib/strings/ordered_code.cc
@@ -134,7 +134,7 @@ static const char kFFCharacter = '\000';  // Combined with kEscape2
 static const char kEscape1_Separator[2] = {kEscape1, kSeparator};
 
 // Append to "*dest" the "len" bytes starting from "*src".
-inline static void AppendBytes(string* dest, const char* src, size_t len) {
+inline static void AppendBytes(std::string* dest, const char* src, size_t len) {
   dest->append(src, len);
 }
 
@@ -164,7 +164,8 @@ const char* OrderedCode::TEST_SkipToNextSpecialByte(const char* start,
 
 // Helper routine to encode "s" and append to "*dest", escaping special
 // characters.
-inline static void EncodeStringFragment(string* dest, absl::string_view s) {
+inline static void EncodeStringFragment(std::string* dest,
+                                        absl::string_view s) {
   const char* p = s.data();
   const char* limit = p + s.size();
   const char* copy_start = p;
@@ -191,12 +192,12 @@ inline static void EncodeStringFragment(string* dest, absl::string_view s) {
   }
 }
 
-void OrderedCode::WriteString(string* dest, absl::string_view s) {
+void OrderedCode::WriteString(std::string* dest, absl::string_view s) {
   EncodeStringFragment(dest, s);
   AppendBytes(dest, kEscape1_Separator, 2);
 }
 
-void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) {
+void OrderedCode::WriteNumIncreasing(std::string* dest, uint64_t val) {
   // Values are encoded with a single byte length prefix, followed
   // by the actual value in big-endian format with leading 0 bytes
   // dropped.
@@ -216,7 +217,8 @@ void OrderedCode::WriteNumIncreasing(string* dest, uint64 val) {
 // If parse succeeds, return true, consume encoding from
 // "*src", and if result != NULL append the decoded string to "*result".
 // Otherwise, return false and leave both undefined.
-inline static bool ReadStringInternal(absl::string_view* src, string* result) {
+inline static bool ReadStringInternal(absl::string_view* src,
+                                      std::string* result) {
   const char* start = src->data();
   const char* string_limit = src->data() + src->size();
 
@@ -271,11 +273,11 @@ inline static bool ReadStringInternal(absl::string_view* src, string* result) {
   return false;
 }
 
-bool OrderedCode::ReadString(absl::string_view* src, string* result) {
+bool OrderedCode::ReadString(absl::string_view* src, std::string* result) {
   return ReadStringInternal(src, result);
 }
 
-bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64* result) {
+bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64_t* result) {
   if (src->empty()) {
     return false;  // Not enough bytes
   }
@@ -294,7 +296,7 @@ bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64* result) {
   }
 
   if (result) {
-    uint64 tmp = 0;
+    uint64_t tmp = 0;
     for (size_t i = 0; i < len; i++) {
       tmp <<= 8;
       tmp |= static_cast<unsigned char>((*src)[1 + i]);
@@ -305,7 +307,7 @@ bool OrderedCode::ReadNumIncreasing(absl::string_view* src, uint64* result) {
   return true;
 }
 
-void OrderedCode::TEST_Corrupt(string* str, int k) {
+void OrderedCode::TEST_Corrupt(std::string* str, int k) {
   int seen_seps = 0;
   for (size_t i = 0; i + 1 < str->size(); i++) {
     if ((*str)[i] == kEscape1 && (*str)[i + 1] == kSeparator) {
@@ -389,7 +391,7 @@ static const char kLengthToHeaderBits[1 + kMaxSigned64Length][2] = {
 
 // This array maps encoding lengths to the header bits that overlap with
 // the payload and need fixing when reading.
-static const uint64 kLengthToMask[1 + kMaxSigned64Length] = {
+static const uint64_t kLengthToMask[1 + kMaxSigned64Length] = {
     0ULL,
     0x80ULL,
     0xc000ULL,
@@ -408,7 +410,7 @@ static const uint64 kLengthToMask[1 + kMaxSigned64Length] = {
 // bit position (the highest bit position in a positive int64 is 63).
 // For a negative number n, we count the bits in ~n.
 // That is, length = kBitsToLength[tsl::Log2Floor64(n < 0 ? ~n : n) + 1].
-static const int8 kBitsToLength[1 + 63] = {
+static const int8_t kBitsToLength[1 + 63] = {
     1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4,
     4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 7, 7,
     7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 10};
@@ -418,23 +420,23 @@ static inline int SignedEncodingLength(int64_t n) {
   return kBitsToLength[tsl::Log2Floor64(n < 0 ? ~n : n) + 1];
 }
 
-static void StoreBigEndian64(char* dst, uint64 v) {
+static void StoreBigEndian64(char* dst, uint64_t v) {
   for (int i = 0; i < 8; i++) {
     dst[i] = (v >> (56 - 8 * i)) & 0xff;
   }
 }
 
-static uint64 LoadBigEndian64(const char* src) {
-  uint64 result = 0;
+static uint64_t LoadBigEndian64(const char* src) {
+  uint64_t result = 0;
   for (int i = 0; i < 8; i++) {
     unsigned char c = static_cast<unsigned char>(src[i]);
-    result |= static_cast<uint64>(c) << (56 - 8 * i);
+    result |= static_cast<uint64_t>(c) << (56 - 8 * i);
   }
   return result;
 }
 
-void OrderedCode::WriteSignedNumIncreasing(string* dest, int64_t val) {
-  const uint64 x = val < 0 ? ~val : val;
+void OrderedCode::WriteSignedNumIncreasing(std::string* dest, int64_t val) {
+  const uint64_t x = val < 0 ? ~val : val;
   if (x < 64) {  // fast path for encoding length == 1
     *dest += kLengthToHeaderBits[1][0] ^ val;
     return;
@@ -458,12 +460,12 @@ void OrderedCode::WriteSignedNumIncreasing(string* dest, int64_t val) {
 bool OrderedCode::ReadSignedNumIncreasing(absl::string_view* src,
                                           int64_t* result) {
   if (src->empty()) return false;
-  const uint64 xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL;
+  const uint64_t xor_mask = (!((*src)[0] & 0x80)) ? ~0ULL : 0ULL;
   const unsigned char first_byte = (*src)[0] ^ (xor_mask & 0xff);
 
   // now calculate and test length, and set x to raw (unmasked) result
   int len;
-  uint64 x;
+  uint64_t x;
   if (first_byte != 0xff) {
     len = 7 - tsl::Log2Floor64(first_byte ^ 0xff);
     if (src->size() < static_cast<size_t>(len)) return false;
diff --git a/tensorflow/core/lib/strings/ordered_code.h b/tensorflow/core/lib/strings/ordered_code.h
index e7485bd57f7e15..2d2811ac5af30f 100644
--- a/tensorflow/core/lib/strings/ordered_code.h
+++ b/tensorflow/core/lib/strings/ordered_code.h
@@ -54,9 +54,9 @@ class OrderedCode {
   // Encoding routines: each one of the following routines append
   // one item to "*dest" in an encoding where larger values are
   // ordered lexicographically after smaller values.
-  static void WriteString(string* dest, absl::string_view str);
-  static void WriteNumIncreasing(string* dest, uint64 num);
-  static void WriteSignedNumIncreasing(string* dest, int64_t num);
+  static void WriteString(std::string* dest, absl::string_view str);
+  static void WriteNumIncreasing(std::string* dest, uint64_t num);
+  static void WriteSignedNumIncreasing(std::string* dest, int64_t num);
 
   // -------------------------------------------------------------------
   // Decoding routines: these extract an item earlier encoded using
@@ -66,13 +66,13 @@ class OrderedCode {
   // result.  In case of string result, the decoded string is appended to
   // "*result".  Returns true if the next item was read successfully, false
   // otherwise.
-  static bool ReadString(absl::string_view* src, string* result);
-  static bool ReadNumIncreasing(absl::string_view* src, uint64* result);
+  static bool ReadString(absl::string_view* src, std::string* result);
+  static bool ReadNumIncreasing(absl::string_view* src, uint64_t* result);
   static bool ReadSignedNumIncreasing(absl::string_view* src, int64_t* result);
 
   // Helper for testing: corrupt "*str" by changing the kth item separator
   // in the string.
-  static void TEST_Corrupt(string* str, int k);
+  static void TEST_Corrupt(std::string* str, int k);
 
   // Helper for testing.
   // SkipToNextSpecialByte is an internal routine defined in the .cc file
diff --git a/tensorflow/core/lib/strings/ordered_code_test.cc b/tensorflow/core/lib/strings/ordered_code_test.cc
index 4717007fc27fc2..9ef3a8dafd2138 100644
--- a/tensorflow/core/lib/strings/ordered_code_test.cc
+++ b/tensorflow/core/lib/strings/ordered_code_test.cc
@@ -32,8 +32,8 @@ namespace tensorflow {
 namespace strings {
 namespace {
 
-string RandomString(random::SimplePhilox* rnd, size_t len) {
-  string x;
+std::string RandomString(random::SimplePhilox* rnd, size_t len) {
+  std::string x;
   for (size_t i = 0; i < len; i++) {
     x += rnd->Uniform(256);
   }
@@ -45,33 +45,34 @@ string RandomString(random::SimplePhilox* rnd, size_t len) {
 
 // Read/WriteIncreasing are defined for string, uint64, int64 below.
 template <typename T>
-void OCWriteIncreasing(string* dest, const T& val);
+void OCWriteIncreasing(std::string* dest, const T& val);
 template <typename T>
 bool OCReadIncreasing(absl::string_view* src, T* result);
 
 // Read/WriteIncreasing<string>
 template <>
-void OCWriteIncreasing<string>(string* dest, const string& val) {
+void OCWriteIncreasing<std::string>(std::string* dest, const std::string& val) {
   OrderedCode::WriteString(dest, val);
 }
 template <>
-bool OCReadIncreasing<string>(absl::string_view* src, string* result) {
+bool OCReadIncreasing<std::string>(absl::string_view* src,
+                                   std::string* result) {
   return OrderedCode::ReadString(src, result);
 }
 
 // Read/WriteIncreasing<uint64>
 template <>
-void OCWriteIncreasing<uint64>(string* dest, const uint64& val) {
+void OCWriteIncreasing<uint64_t>(std::string* dest, const uint64_t& val) {
   OrderedCode::WriteNumIncreasing(dest, val);
 }
 template <>
-bool OCReadIncreasing<uint64>(absl::string_view* src, uint64* result) {
+bool OCReadIncreasing<uint64_t>(absl::string_view* src, uint64_t* result) {
   return OrderedCode::ReadNumIncreasing(src, result);
 }
 
 // Read/WriteIncreasing<int64_t>
 template <>
-void OCWriteIncreasing<int64_t>(string* dest, const int64_t& val) {
+void OCWriteIncreasing<int64_t>(std::string* dest, const int64_t& val) {
   OrderedCode::WriteSignedNumIncreasing(dest, val);
 }
 template <>
@@ -80,14 +81,14 @@ bool OCReadIncreasing<int64_t>(absl::string_view* src, int64_t* result) {
 }
 
 template <typename T>
-string OCWrite(T val) {
-  string result;
+std::string OCWrite(T val) {
+  std::string result;
   OCWriteIncreasing<T>(&result, val);
   return result;
 }
 
 template <typename T>
-void OCWriteToString(string* result, T val) {
+void OCWriteToString(std::string* result, T val) {
   OCWriteIncreasing<T>(result, val);
 }
 
@@ -100,7 +101,7 @@ bool OCRead(absl::string_view* s, T* val) {
 // Numbers
 
 template <typename T>
-T TestRead(const string& a) {
+T TestRead(const std::string& a) {
   // gracefully reject any proper prefix of an encoding
   for (int i = 0; i < a.size() - 1; ++i) {
     absl::string_view s(a.data(), i);
@@ -124,9 +125,9 @@ void TestWriteRead(T expected) {
 // output.
 template <typename T, typename U>
 void TestWriteAppends(T first, U second) {
-  string encoded;
+  std::string encoded;
   OCWriteToString<T>(&encoded, first);
-  string encoded_first_only = encoded;
+  std::string encoded_first_only = encoded;
   OCWriteToString<U>(&encoded, second);
   EXPECT_NE(encoded, encoded_first_only);
   EXPECT_TRUE(absl::StartsWith(encoded, encoded_first_only));
@@ -149,7 +150,7 @@ void TestNumbers(T multiplier) {
   random::SimplePhilox rnd(&philox);
   for (int bits = 1; bits <= std::numeric_limits<T>().digits; ++bits) {
     // test random non-negative numbers with given number of significant bits
-    const uint64 mask = (~0ULL) >> (64 - bits);
+    const uint64_t mask = (~0ULL) >> (64 - bits);
     for (int i = 0; i < 1000; i++) {
       T x = rnd.Rand64() & mask;
       TestWriteRead(multiplier * x);
@@ -160,16 +161,18 @@ void TestNumbers(T multiplier) {
 }
 
 // Return true iff 'a' is "before" 'b'
-bool CompareStrings(const string& a, const string& b) { return (a < b); }
+bool CompareStrings(const std::string& a, const std::string& b) {
+  return (a < b);
+}
 
 template <typename T>
 void TestNumberOrdering() {
   // first the negative numbers (if T is signed, otherwise no-op)
-  string laststr = OCWrite<T>(std::numeric_limits<T>().min());
+  std::string laststr = OCWrite<T>(std::numeric_limits<T>().min());
   for (T num = std::numeric_limits<T>().min() / 2; num != 0; num /= 2) {
-    string strminus1 = OCWrite<T>(num - 1);
-    string str = OCWrite<T>(num);
-    string strplus1 = OCWrite<T>(num + 1);
+    std::string strminus1 = OCWrite<T>(num - 1);
+    std::string str = OCWrite<T>(num);
+    std::string strplus1 = OCWrite<T>(num + 1);
 
     CHECK(CompareStrings(strminus1, str));
     CHECK(CompareStrings(str, strplus1));
@@ -185,9 +188,9 @@ void TestNumberOrdering() {
   T num = 1;
   while (num < std::numeric_limits<T>().max() / 2) {
     num *= 2;
-    string strminus1 = OCWrite<T>(num - 1);
-    string str = OCWrite<T>(num);
-    string strplus1 = OCWrite<T>(num + 1);
+    std::string strminus1 = OCWrite<T>(num - 1);
+    std::string str = OCWrite<T>(num);
+    std::string strplus1 = OCWrite<T>(num + 1);
 
     CHECK(CompareStrings(strminus1, str));
     CHECK(CompareStrings(str, strplus1));
@@ -199,7 +202,7 @@ void TestNumberOrdering() {
 }
 
 // Helper routine for testing TEST_SkipToNextSpecialByte
-size_t FindSpecial(const string& x) {
+size_t FindSpecial(const std::string& x) {
   const char* p = x.data();
   const char* limit = p + x.size();
   const char* result = OrderedCode::TEST_SkipToNextSpecialByte(p, limit);
@@ -209,15 +212,15 @@ size_t FindSpecial(const string& x) {
 // Helper function template to create strings from string literals (excluding
 // the terminal zero byte of the underlying character array).
 template <size_t N>
-string ByteSequence(const char (&arr)[N]) {
-  return string(arr, N - 1);
+std::string ByteSequence(const char (&arr)[N]) {
+  return std::string(arr, N - 1);
 }
 
 TEST(OrderedCode, SkipToNextSpecialByte) {
   for (size_t len = 0; len < 256; len++) {
     random::PhiloxRandom philox(301, 17);
     random::SimplePhilox rnd(&philox);
-    string x;
+    std::string x;
     while (x.size() < len) {
       char c = 1 + rnd.Uniform(254);
       ASSERT_NE(c, 0);
@@ -228,7 +231,7 @@ TEST(OrderedCode, SkipToNextSpecialByte) {
     for (size_t special_pos = 0; special_pos < len; special_pos++) {
       for (size_t special_test = 0; special_test < 2; special_test++) {
         const char special_byte = (special_test == 0) ? 0 : 255;
-        string y = x;
+        std::string y = x;
         y[special_pos] = special_byte;
         EXPECT_EQ(FindSpecial(y), special_pos);
         if (special_pos < 16) {
@@ -283,9 +286,9 @@ TEST(OrderedCode, ExhaustiveFindSpecial) {
   EXPECT_EQ(count, 256 * 256 * 256 * 2);
 }
 
-TEST(Uint64, EncodeDecode) { TestNumbers<uint64>(1); }
+TEST(Uint64, EncodeDecode) { TestNumbers<uint64_t>(1); }
 
-TEST(Uint64, Ordering) { TestNumberOrdering<uint64>(); }
+TEST(Uint64, Ordering) { TestNumberOrdering<uint64_t>(); }
 
 TEST(Int64, EncodeDecode) {
   TestNumbers<int64_t>(1);
@@ -295,15 +298,15 @@ TEST(Int64, EncodeDecode) {
 TEST(Int64, Ordering) { TestNumberOrdering<int64_t>(); }
 
 // Returns the bitwise complement of s.
-inline string StrNot(const string& s) {
-  string result;
-  for (string::const_iterator it = s.begin(); it != s.end(); ++it)
+inline std::string StrNot(const std::string& s) {
+  std::string result;
+  for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
     result.push_back(~*it);
   return result;
 }
 
 template <typename T>
-void TestInvalidEncoding(const string& s) {
+void TestInvalidEncoding(const std::string& s) {
   absl::string_view p(s);
   EXPECT_FALSE(OCRead<T>(&p, nullptr));
   EXPECT_EQ(s, p);
@@ -311,11 +314,11 @@ void TestInvalidEncoding(const string& s) {
 
 TEST(OrderedCodeInvalidEncodingsTest, Overflow) {
   // 1U << 64, increasing and decreasing
-  const string k2xx64U = "\x09\x01" + string(8, 0);
-  TestInvalidEncoding<uint64>(k2xx64U);
+  const std::string k2xx64U = "\x09\x01" + std::string(8, 0);
+  TestInvalidEncoding<uint64_t>(k2xx64U);
 
   // 1 << 63 and ~(1 << 63), increasing and decreasing
-  const string k2xx63 = "\xff\xc0\x80" + string(7, 0);
+  const std::string k2xx63 = "\xff\xc0\x80" + std::string(7, 0);
   TestInvalidEncoding<int64_t>(k2xx63);
   TestInvalidEncoding<int64_t>(StrNot(k2xx63));
 }
@@ -332,11 +335,11 @@ TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) {
 
   for (int n = 2; n <= 9; ++n) {
     // The zero in non_minimal[1] is "redundant".
-    string non_minimal =
-        string(1, n - 1) + string(1, 0) + RandomString(&rnd, n - 2);
+    std::string non_minimal =
+        std::string(1, n - 1) + std::string(1, 0) + RandomString(&rnd, n - 2);
     EXPECT_EQ(n, non_minimal.length());
 
-    EXPECT_NE(OCWrite<uint64>(0), non_minimal);
+    EXPECT_NE(OCWrite<uint64_t>(0), non_minimal);
 #ifndef NDEBUG
     absl::string_view s(non_minimal);
     EXPECT_DEATH(OrderedCode::ReadNumIncreasing(&s, nullptr),
@@ -348,11 +351,12 @@ TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) {
 
   for (int n = 2; n <= 10; ++n) {
     // Header with 1 sign bit and n-1 size bits.
-    string header = string(n / 8, 0xff) + string(1, 0xff << (8 - (n % 8)));
+    std::string header =
+        std::string(n / 8, 0xff) + std::string(1, 0xff << (8 - (n % 8)));
     // There are more than 7 zero bits between header bits and "payload".
-    string non_minimal = header +
-                         string(1, rnd.Uniform(256) & ~*header.rbegin()) +
-                         RandomString(&rnd, n - header.length() - 1);
+    std::string non_minimal =
+        header + std::string(1, rnd.Uniform(256) & ~*header.rbegin()) +
+        RandomString(&rnd, n - header.length() - 1);
     EXPECT_EQ(n, non_minimal.length());
 
     EXPECT_NE(OCWrite<int64_t>(0), non_minimal);
@@ -369,7 +373,7 @@ TEST(OrderedCodeInvalidEncodingsDeathTest, NonCanonical) {
 
 // Returns random number with specified number of bits,
 // i.e., in the range [2^(bits-1),2^bits).
-uint64 NextBits(random::SimplePhilox* rnd, int bits) {
+uint64_t NextBits(random::SimplePhilox* rnd, int bits) {
   return (bits != 0)
              ? (rnd->Rand64() % (1LL << (bits - 1))) + (1LL << (bits - 1))
              : 0;
@@ -385,7 +389,7 @@ void BM_WriteNum(::testing::benchmark::State& state, T multiplier) {
   for (int i = 0; i < kValues; i++) {
     values[i] = NextBits(&rnd, state.max_iterations % 64) * multiplier;
   }
-  string result;
+  std::string result;
   int index = 0;
   for (auto i : state) {
     result.clear();
@@ -400,12 +404,12 @@ void BM_ReadNum(::testing::benchmark::State& state, T multiplier) {
   random::SimplePhilox rnd(&philox);
   // Use enough distinct values to confuse the branch predictor
   constexpr int kValues = 64;
-  string values[kValues];
+  std::string values[kValues];
   for (int i = 0; i < kValues; i++) {
     T val = NextBits(&rnd, i % 64) * multiplier;
     values[i] = OCWrite<T>(val);
   }
-  uint32 index = 0;
+  uint32_t index = 0;
   for (auto i : state) {
     T val;
     absl::string_view s = values[index++ % kValues];
@@ -423,7 +427,7 @@ void BM_ReadNum(::testing::benchmark::State& state, T multiplier) {
   }                                                         \
   BENCHMARK(BM_Read##name)
 
-BENCHMARK_NUM(NumIncreasing, uint64, 1);
+BENCHMARK_NUM(NumIncreasing, uint64_t, 1);
 BENCHMARK_NUM(SignedNum, int64_t, 1);
 BENCHMARK_NUM(SignedNumNegative, int64_t, -1);
 
@@ -437,30 +441,30 @@ TEST(String, EncodeDecode) {
   random::SimplePhilox rnd(&philox);
 
   for (int len = 0; len < 256; len++) {
-    const string a = RandomString(&rnd, len);
+    const std::string a = RandomString(&rnd, len);
     TestWriteRead(a);
     for (int len2 = 0; len2 < 64; len2++) {
-      const string b = RandomString(&rnd, len2);
+      const std::string b = RandomString(&rnd, len2);
 
       TestWriteAppends(a, b);
 
-      string out;
-      OCWriteToString<string>(&out, a);
-      OCWriteToString<string>(&out, b);
+      std::string out;
+      OCWriteToString<std::string>(&out, a);
+      OCWriteToString<std::string>(&out, b);
 
-      string a2, b2, dummy;
+      std::string a2, b2, dummy;
       absl::string_view s = out;
       absl::string_view s2 = out;
-      CHECK(OCRead<string>(&s, &a2));
-      CHECK(OCRead<string>(&s2, nullptr));
+      CHECK(OCRead<std::string>(&s, &a2));
+      CHECK(OCRead<std::string>(&s2, nullptr));
       CHECK_EQ(s, s2);
 
-      CHECK(OCRead<string>(&s, &b2));
-      CHECK(OCRead<string>(&s2, nullptr));
+      CHECK(OCRead<std::string>(&s, &b2));
+      CHECK(OCRead<std::string>(&s2, nullptr));
       CHECK_EQ(s, s2);
 
-      CHECK(!OCRead<string>(&s, &dummy));
-      CHECK(!OCRead<string>(&s2, nullptr));
+      CHECK(!OCRead<std::string>(&s, &dummy));
+      CHECK(!OCRead<std::string>(&s2, nullptr));
       CHECK_EQ(a, a2);
       CHECK_EQ(b, b2);
       CHECK(s.empty());
@@ -472,8 +476,8 @@ TEST(String, EncodeDecode) {
 // 'str' is a string literal that may contain '\0'.
 #define STATIC_STR(str) StringPiece((str), sizeof(str) - 1)
 
-string EncodeStringIncreasing(absl::string_view value) {
-  string encoded;
+std::string EncodeStringIncreasing(absl::string_view value) {
+  std::string encoded;
   OrderedCode::WriteString(&encoded, value);
   return encoded;
 }
@@ -515,19 +519,20 @@ TEST(String, Increasing) {
 }
 
 TEST(EncodingIsExpected, String) {
-  std::vector<std::pair<string, string>> data = {
-      {"", string("\x00\x01", 2)},
-      {"foo", string("foo\x00\x01", 5)},
-      {"hello", string("hello\x00\x01", 7)},
-      {string("\x00\x01\xff", 3), string("\x00\xff\x01\xff\x00\x00\x01", 7)},
+  std::vector<std::pair<std::string, std::string>> data = {
+      {"", std::string("\x00\x01", 2)},
+      {"foo", std::string("foo\x00\x01", 5)},
+      {"hello", std::string("hello\x00\x01", 7)},
+      {std::string("\x00\x01\xff", 3),
+       std::string("\x00\xff\x01\xff\x00\x00\x01", 7)},
   };
   for (const auto& t : data) {
-    string result;
+    std::string result;
     OrderedCode::WriteString(&result, t.first);
     EXPECT_EQ(t.second, result);
 
     absl::string_view in = result;
-    string decoded;
+    std::string decoded;
     EXPECT_TRUE(OrderedCode::ReadString(&in, &decoded));
     EXPECT_EQ(t.first, decoded);
     EXPECT_EQ("", in);
@@ -535,7 +540,7 @@ TEST(EncodingIsExpected, String) {
 }
 
 TEST(EncodingIsExpected, Unsigned) {
-  std::vector<std::pair<uint64, string>> data = {
+  std::vector<std::pair<uint64_t, std::string>> data = {
       {0x0ull, ByteSequence("\000")},
       {0x1ull, ByteSequence("\001\001")},
       {0x2ull, ByteSequence("\001\002")},
@@ -753,13 +758,13 @@ TEST(EncodingIsExpected, Unsigned) {
        ByteSequence("\010\200\000\000\000\000\000\000\001")},
   };
   for (const auto& t : data) {
-    uint64 num = t.first;
-    string result;
+    uint64_t num = t.first;
+    std::string result;
     OrderedCode::WriteNumIncreasing(&result, num);
     EXPECT_EQ(t.second, result) << std::hex << num;
 
     absl::string_view in = result;
-    uint64 decoded;
+    uint64_t decoded;
     EXPECT_TRUE(OrderedCode::ReadNumIncreasing(&in, &decoded));
     EXPECT_EQ(num, decoded);
     EXPECT_EQ("", in);
@@ -767,7 +772,7 @@ TEST(EncodingIsExpected, Unsigned) {
 }
 
 TEST(EncodingIsExpected, Signed) {
-  std::vector<std::pair<int64_t, string>> data = {
+  std::vector<std::pair<int64_t, std::string>> data = {
       {0ll, ByteSequence("\200")},
       {1ll, ByteSequence("\201")},
       {2ll, ByteSequence("\202")},
@@ -1201,7 +1206,7 @@ TEST(EncodingIsExpected, Signed) {
   };
   for (const auto& t : data) {
     int64_t num = t.first;
-    string result;
+    std::string result;
     OrderedCode::WriteSignedNumIncreasing(&result, num);
     EXPECT_EQ(t.second, result) << std::hex << num;
 
@@ -1216,15 +1221,15 @@ TEST(EncodingIsExpected, Signed) {
 void BM_WriteString(::testing::benchmark::State& state, int len) {
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
-  string x;
+  std::string x;
   for (int i = 0; i < len; i++) {
     x += rnd.Uniform(256);
   }
-  string y;
+  std::string y;
 
   for (auto s : state) {
     y.clear();
-    OCWriteToString<string>(&y, x);
+    OCWriteToString<std::string>(&y, x);
   }
   state.SetBytesProcessed(state.iterations() * len);
 }
@@ -1232,18 +1237,18 @@ void BM_WriteString(::testing::benchmark::State& state, int len) {
 void BM_ReadString(::testing::benchmark::State& state, int len) {
   random::PhiloxRandom philox(301, 17);
   random::SimplePhilox rnd(&philox);
-  string x;
+  std::string x;
   for (int i = 0; i < len; i++) {
     x += rnd.Uniform(256);
   }
-  string data;
-  OCWriteToString<string>(&data, x);
-  string result;
+  std::string data;
+  OCWriteToString<std::string>(&data, x);
+  std::string result;
 
   for (auto i : state) {
     result.clear();
     absl::string_view s = data;
-    OCRead<string>(&s, &result);
+    OCRead<std::string>(&s, &result);
   }
   state.SetBytesProcessed(state.iterations() * len);
 }

From ee8cc68fc7697db01c7270345d6c89d315e0d4ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:45:34 -0800
Subject: [PATCH 425/753] Automated Code Change

PiperOrigin-RevId: 845711498
---
 tensorflow/core/kernels/sparse/add_op.cc      | 14 ++--
 .../sparse/csr_sparse_matrix_to_dense_op.cc   | 18 ++---
 .../csr_sparse_matrix_to_sparse_tensor_op.cc  | 18 ++---
 .../sparse/dense_to_csr_sparse_matrix_op.cc   | 11 ++--
 tensorflow/core/kernels/sparse/kernels.cc     |  4 +-
 tensorflow/core/kernels/sparse/kernels.h      | 34 +++++-----
 .../core/kernels/sparse/kernels_test.cc       | 65 ++++++++++---------
 tensorflow/core/kernels/sparse/nnz_op.cc      |  2 +-
 .../core/kernels/sparse/sparse_cholesky_op.cc |  8 +--
 .../core/kernels/sparse/sparse_mat_mul_op.cc  | 14 ++--
 .../core/kernels/sparse/sparse_matrix.h       | 46 ++++++-------
 .../sparse/sparse_matrix_components_op.cc     | 14 ++--
 .../sparse_tensor_to_csr_sparse_matrix_op.cc  | 10 +--
 .../core/kernels/sparse/transpose_op.cc       |  4 +-
 tensorflow/core/kernels/sparse/zeros_op.h     |  6 +-
 15 files changed, 136 insertions(+), 132 deletions(-)

diff --git a/tensorflow/core/kernels/sparse/add_op.cc b/tensorflow/core/kernels/sparse/add_op.cc
index c454241c1574c2..24e9a8cc5fb98e 100644
--- a/tensorflow/core/kernels/sparse/add_op.cc
+++ b/tensorflow/core/kernels/sparse/add_op.cc
@@ -93,19 +93,19 @@ class CSRSparseMatrixAddFunctor {
 
     Tensor c_batch_ptr_t(cpu_allocator(), DT_INT32,
                          TensorShape({batch_size + 1}));
-    auto c_batch_ptr = c_batch_ptr_t.vec<int32>();
+    auto c_batch_ptr = c_batch_ptr_t.vec<int32_t>();
     c_batch_ptr(0) = 0;
 
     Tensor c_row_ptr_t;
     TF_RETURN_IF_ERROR(ctx_->allocate_temp(
         DT_INT32, TensorShape({batch_size * (rows + 1)}), &c_row_ptr_t));
-    auto c_row_ptr = c_row_ptr_t.vec<int32>();
+    auto c_row_ptr = c_row_ptr_t.vec<int32_t>();
 
     // Set the output row pointers to zero, in case we hit any empty
     // combinations of rows in a and b.
-    functor::SetZeroFunctor<Device, int32> set_zero;
+    functor::SetZeroFunctor<Device, int32_t> set_zero;
     const Device& d = ctx_->eigen_device<Device>();
-    set_zero(d, c_row_ptr_t.flat<int32>());
+    set_zero(d, c_row_ptr_t.flat<int32_t>());
 
     size_t maxWorkspaceSize = 0;
     for (int i = 0; i < batch_size; ++i) {
@@ -125,7 +125,7 @@ class CSRSparseMatrixAddFunctor {
     Tensor temp;
     TF_RETURN_IF_ERROR(ctx_->allocate_temp(
         DT_INT8, TensorShape({static_cast<int64_t>(maxWorkspaceSize)}), &temp));
-    void* workspace = temp.flat<int8>().data();
+    void* workspace = temp.flat<int8_t>().data();
 
     for (int i = 0; i < batch_size; ++i) {
       // Calculate output sizes for all minibatch entries.
@@ -138,8 +138,8 @@ class CSRSparseMatrixAddFunctor {
                                   a.values_vec<T>(i), a_dense_shape};
       ConstCSRComponent<T> b_comp{b.row_pointers_vec(i), b.col_indices_vec(i),
                                   b.values_vec<T>(i), b_dense_shape};
-      TTypes<int32>::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)),
-                                              rows + 1);
+      TTypes<int32_t>::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)),
+                                                rows + 1);
       int c_nnz_i;
       TF_RETURN_IF_ERROR(csr_geam.GetOutputStructure(
           a_comp, b_comp, c_row_ptr_i, &c_nnz_i, workspace));
diff --git a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc
index 2991f7bad9af89..6829145263baa5 100644
--- a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc
+++ b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc
@@ -73,9 +73,9 @@ class CSRSparseMatrixToDenseCPUOp : public OpKernel {
     const int64_t num_rows = dense_shape((rank == 2) ? 0 : 1);
     const int64_t num_cols = dense_shape((rank == 2) ? 1 : 2);
 
-    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32>();
-    auto row_ptr = csr_sparse_matrix->row_pointers().vec<int32>();
-    auto col_ind = csr_sparse_matrix->col_indices().vec<int32>();
+    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32_t>();
+    auto row_ptr = csr_sparse_matrix->row_pointers().vec<int32_t>();
+    auto col_ind = csr_sparse_matrix->col_indices().vec<int32_t>();
     auto values = csr_sparse_matrix->values().vec<T>();
 
     TensorShape dense_tensor_shape;
@@ -159,14 +159,14 @@ class CSRSparseMatrixToDenseGPUOp : public OpKernel {
     functor::CSRSparseMatrixToCOOSparseMatrix<Device> csr_to_coo;
     auto indices = indices_t.matrix<int64_t>();
 
-    auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec<int32>();
-    auto coo_col_ind = csr_sparse_matrix->col_indices().vec<int32>();
-    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32>();
+    auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec<int32_t>();
+    auto coo_col_ind = csr_sparse_matrix->col_indices().vec<int32_t>();
+    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32_t>();
 
     Tensor coo_row_ind_t;
     OP_REQUIRES_OK(c, c->allocate_temp(DT_INT32, TensorShape({total_nnz}),
                                        &coo_row_ind_t));
-    auto coo_row_ind = coo_row_ind_t.vec<int32>();
+    auto coo_row_ind = coo_row_ind_t.vec<int32_t>();
 
     // TODO(ebrevdo): just write a custom kernel that converts from
     // csr to dense.
@@ -176,9 +176,9 @@ class CSRSparseMatrixToDenseGPUOp : public OpKernel {
         // No copying required.  Avoid failure case below.
         continue;
       }
-      const TTypes<int32>::UnalignedConstVec csr_row_ptr_i(
+      const TTypes<int32_t>::UnalignedConstVec csr_row_ptr_i(
           &csr_row_ptr((rows + 1) * i), rows + 1);
-      const TTypes<int32>::UnalignedVec coo_row_ind_i(
+      const TTypes<int32_t>::UnalignedVec coo_row_ind_i(
           &coo_row_ind(csr_sparse_matrix->batch_offset(i)), nnz_i);
       OP_REQUIRES_OK(c, csr_to_coo(c, csr_row_ptr_i, coo_row_ind_i));
     }
diff --git a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc
index 403af12bb8fb52..903d3acbc67966 100644
--- a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc
+++ b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc
@@ -91,9 +91,9 @@ class CSRSparseMatrixToSparseTensorCPUOp : public OpKernel {
         c, c->allocate_output(0, TensorShape({total_nnz, rank}), &indices));
     auto indices_flat = indices->template flat<int64_t>();
 
-    auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec<int32>();
-    auto csr_col_ind = csr_sparse_matrix->col_indices().vec<int32>();
-    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32>();
+    auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec<int32_t>();
+    auto csr_col_ind = csr_sparse_matrix->col_indices().vec<int32_t>();
+    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32_t>();
 
     // Process the individual batches in parallel using a threadpool.
     auto shard = [&](int64_t batch_begin, int64_t batch_end) {
@@ -165,14 +165,14 @@ class CSRSparseMatrixToSparseTensorGPUOp : public OpKernel {
     functor::CSRSparseMatrixToCOOSparseMatrix<Device> csr_to_coo;
     auto indices = indices_t->matrix<int64_t>();
 
-    auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec<int32>();
-    auto coo_col_ind = csr_sparse_matrix->col_indices().vec<int32>();
-    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32>();
+    auto csr_row_ptr = csr_sparse_matrix->row_pointers().vec<int32_t>();
+    auto coo_col_ind = csr_sparse_matrix->col_indices().vec<int32_t>();
+    auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32_t>();
 
     Tensor coo_row_ind_t;
     OP_REQUIRES_OK(c, c->allocate_temp(DT_INT32, TensorShape({total_nnz}),
                                        &coo_row_ind_t));
-    auto coo_row_ind = coo_row_ind_t.vec<int32>();
+    auto coo_row_ind = coo_row_ind_t.vec<int32_t>();
 
     // TODO(ebrevdo): Convert to one or two single kernel calls,
     // where the kernels are batch-friendly.
@@ -182,9 +182,9 @@ class CSRSparseMatrixToSparseTensorGPUOp : public OpKernel {
         // No copying required.  Avoid failure case below.
         continue;
       }
-      const TTypes<int32>::UnalignedConstVec csr_row_ptr_i(
+      const TTypes<int32_t>::UnalignedConstVec csr_row_ptr_i(
           &csr_row_ptr((rows + 1) * i), rows + 1);
-      const TTypes<int32>::UnalignedVec coo_row_ind_i(
+      const TTypes<int32_t>::UnalignedVec coo_row_ind_i(
           &coo_row_ind(csr_sparse_matrix->batch_offset(i)), nnz_i);
       OP_REQUIRES_OK(c, csr_to_coo(c, csr_row_ptr_i, coo_row_ind_i));
     }
diff --git a/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc b/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc
index 6e635d140ad7df..11601d8cf3b6ef 100644
--- a/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc
+++ b/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc
@@ -99,15 +99,16 @@ class DenseToCSRSparseMatrixCPUOp : public OpKernel {
                        TensorShape({(num_rows + 1) * batch_size}));
 
     // Fill the row pointers with zeros.
-    functor::SetZeroFunctor<Device, int32> set_zero;
-    set_zero(ctx->eigen_device<Device>(), csr_row_ptr.flat<int32>());
+    functor::SetZeroFunctor<Device, int32_t> set_zero;
+    set_zero(ctx->eigen_device<Device>(), csr_row_ptr.flat<int32_t>());
 
     // Convert from COO to CSR format.
     functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
     OP_REQUIRES_OK(
-        ctx, coo_to_csr(batch_size, num_rows, num_cols,
-                        indices.matrix<int64_t>(), batch_ptr.vec<int32>(),
-                        csr_row_ptr.vec<int32>(), csr_col_ind.vec<int32>()));
+        ctx,
+        coo_to_csr(batch_size, num_rows, num_cols, indices.matrix<int64_t>(),
+                   batch_ptr.vec<int32_t>(), csr_row_ptr.vec<int32_t>(),
+                   csr_col_ind.vec<int32_t>()));
 
     CSRSparseMatrix output_csr_matrix;
     OP_REQUIRES_OK(ctx, CSRSparseMatrix::CreateCSRSparseMatrix(
diff --git a/tensorflow/core/kernels/sparse/kernels.cc b/tensorflow/core/kernels/sparse/kernels.cc
index ca7009f942112f..dd84b556e002ab 100644
--- a/tensorflow/core/kernels/sparse/kernels.cc
+++ b/tensorflow/core/kernels/sparse/kernels.cc
@@ -31,8 +31,8 @@ namespace functor {
 
 absl::Status SparseTensorToCSRSparseMatrixCPUFunctor::operator()(
     int64_t batch_size, int num_rows, int num_cols,
-    TTypes<int64_t>::ConstMatrix indices, TTypes<int32>::Vec batch_ptr,
-    TTypes<int32>::Vec csr_row_ptr, TTypes<int32>::Vec csr_col_ind) {
+    TTypes<int64_t>::ConstMatrix indices, TTypes<int32_t>::Vec batch_ptr,
+    TTypes<int32_t>::Vec csr_row_ptr, TTypes<int32_t>::Vec csr_col_ind) {
   // Validate inputs.
   if (batch_ptr.size() != batch_size + 1) {
     return errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/sparse/kernels.h b/tensorflow/core/kernels/sparse/kernels.h
index aff14ca07910fa..14441de5d3cad3 100644
--- a/tensorflow/core/kernels/sparse/kernels.h
+++ b/tensorflow/core/kernels/sparse/kernels.h
@@ -42,7 +42,7 @@ template <typename Device>
 struct CalculateNNZPerBatchMatrixFromIndices {
   absl::Status operator()(OpKernelContext* c,
                           TTypes<int64_t>::ConstMatrix indices,
-                          TTypes<int32>::Vec nnz_per_batch);
+                          TTypes<int32_t>::Vec nnz_per_batch);
 };
 
 // Split a subset of a SparseTensors' indices into two vectors:
@@ -63,8 +63,8 @@ template <typename Device>
 struct SparseTensorToCOOSparseMatrix {
   void operator()(const Device& d, TTypes<int64_t>::ConstVec host_dense_shape,
                   TTypes<int64_t>::ConstMatrix indices,
-                  TTypes<int32>::Vec coo_row_ind,
-                  TTypes<int32>::Vec coo_col_ind);
+                  TTypes<int32_t>::Vec coo_row_ind,
+                  TTypes<int32_t>::Vec coo_col_ind);
 };
 
 // Write coo batch, row, and column vectors to output matrix indices:
@@ -89,9 +89,9 @@ template <typename Device>
 struct COOSparseMatrixToSparseTensor {
   absl::Status operator()(OpKernelContext* c,
                           TTypes<int64_t>::ConstVec host_dense_shape,
-                          TTypes<int32>::ConstVec host_batch_ptrs,
-                          TTypes<int32>::Vec coo_row_ind,
-                          TTypes<int32>::ConstVec coo_col_ind,
+                          TTypes<int32_t>::ConstVec host_batch_ptrs,
+                          TTypes<int32_t>::Vec coo_row_ind,
+                          TTypes<int32_t>::ConstVec coo_col_ind,
                           TTypes<int64_t>::Matrix indices);
 };
 
@@ -105,8 +105,8 @@ struct COOSparseMatrixToSparseTensor {
 template <typename Device>
 struct COOSparseMatrixToCSRSparseMatrix {
   absl::Status operator()(OpKernelContext* c, const int rows, const int cols,
-                          TTypes<int32>::UnalignedVec coo_row_ind,
-                          TTypes<int32>::UnalignedVec csr_row_ptr);
+                          TTypes<int32_t>::UnalignedVec coo_row_ind,
+                          TTypes<int32_t>::UnalignedVec csr_row_ptr);
 };
 
 // Convert a matrix of (batched) coo row and column indices to CSR SparseMatrix
@@ -126,9 +126,9 @@ struct COOSparseMatrixToCSRSparseMatrix {
 struct SparseTensorToCSRSparseMatrixCPUFunctor {
   absl::Status operator()(int64_t batch_size, int num_rows, int num_cols,
                           TTypes<int64_t>::ConstMatrix indices,
-                          TTypes<int32>::Vec batch_ptr,
-                          TTypes<int32>::Vec csr_row_ptr,
-                          TTypes<int32>::Vec csr_col_ind);
+                          TTypes<int32_t>::Vec batch_ptr,
+                          TTypes<int32_t>::Vec csr_row_ptr,
+                          TTypes<int32_t>::Vec csr_col_ind);
 };
 
 // Convert a vector of csr row pointers to coo row indices.
@@ -141,8 +141,8 @@ struct SparseTensorToCSRSparseMatrixCPUFunctor {
 template <typename Device>
 struct CSRSparseMatrixToCOOSparseMatrix {
   absl::Status operator()(OpKernelContext* c,
-                          TTypes<int32>::UnalignedConstVec csr_row_ptr,
-                          TTypes<int32>::UnalignedVec coo_row_ind);
+                          TTypes<int32_t>::UnalignedConstVec csr_row_ptr,
+                          TTypes<int32_t>::UnalignedVec coo_row_ind);
 };
 
 // Calculates C = matmul(A, B) or C = matmul(A, B)^T, where A is in CSR format
@@ -176,10 +176,10 @@ struct CSRStructureModifyingFunctor {
                                         const ConstCSRComponent<T>& b,
                                         size_t* bufferSize) = 0;
 
-  virtual absl::Status GetOutputStructure(const ConstCSRComponent<T>& a,
-                                          const ConstCSRComponent<T>& b,
-                                          TTypes<int32>::UnalignedVec c_row_ptr,
-                                          int* output_nnz, void* workspace) = 0;
+  virtual absl::Status GetOutputStructure(
+      const ConstCSRComponent<T>& a, const ConstCSRComponent<T>& b,
+      TTypes<int32_t>::UnalignedVec c_row_ptr, int* output_nnz,
+      void* workspace) = 0;
 
   virtual absl::Status Compute(const ConstCSRComponent<T>& a,
                                const ConstCSRComponent<T>& b,
diff --git a/tensorflow/core/kernels/sparse/kernels_test.cc b/tensorflow/core/kernels/sparse/kernels_test.cc
index 018b8b77a81e34..dc13ef62256357 100644
--- a/tensorflow/core/kernels/sparse/kernels_test.cc
+++ b/tensorflow/core/kernels/sparse/kernels_test.cc
@@ -38,19 +38,19 @@ TEST(SparseTensorToCSRSparseMatrix, SingleBatchConversion) {
       test::AsTensor<int64_t>({0, 0, 2, 3, 2, 4, 3, 0}, TensorShape({4, 2}));
   Tensor batch_ptr(DT_INT32, {2});
   Tensor csr_col_ind(DT_INT32, {4});
-  auto csr_row_ptr = test::AsTensor<int32>({0, 0, 0, 0, 0});
+  auto csr_row_ptr = test::AsTensor<int32_t>({0, 0, 0, 0, 0});
 
   functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
   TF_EXPECT_OK(coo_to_csr(/*batch_size=*/1, /*num_rows=*/4, /*num_cols=*/5,
                           indices.template matrix<int64_t>(),
-                          batch_ptr.vec<int32>(), csr_row_ptr.vec<int32>(),
-                          csr_col_ind.vec<int32>()));
-
-  test::ExpectTensorEqual<int32>(batch_ptr, test::AsTensor<int32>({0, 4}));
-  test::ExpectTensorEqual<int32>(csr_row_ptr,
-                                 test::AsTensor<int32>({0, 1, 1, 3, 4}));
-  test::ExpectTensorEqual<int32>(csr_col_ind,
-                                 test::AsTensor<int32>({0, 3, 4, 0}));
+                          batch_ptr.vec<int32_t>(), csr_row_ptr.vec<int32_t>(),
+                          csr_col_ind.vec<int32_t>()));
+
+  test::ExpectTensorEqual<int32_t>(batch_ptr, test::AsTensor<int32_t>({0, 4}));
+  test::ExpectTensorEqual<int32_t>(csr_row_ptr,
+                                   test::AsTensor<int32_t>({0, 1, 1, 3, 4}));
+  test::ExpectTensorEqual<int32_t>(csr_col_ind,
+                                   test::AsTensor<int32_t>({0, 3, 4, 0}));
 }
 
 TEST(SparseTensorToCSRSparseMatrix, BatchConversion) {
@@ -63,21 +63,22 @@ TEST(SparseTensorToCSRSparseMatrix, BatchConversion) {
   Tensor csr_col_ind(DT_INT32, {3});
   // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12
   Tensor csr_row_ptr(DT_INT32, {12});
-  test::FillFn<int32>(&csr_row_ptr, [](int unused) { return 0; });
+  test::FillFn<int32_t>(&csr_row_ptr, [](int unused) { return 0; });
 
   functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
   TF_EXPECT_OK(coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4,
                           indices.template matrix<int64_t>(),
-                          batch_ptr.vec<int32>(), csr_row_ptr.vec<int32>(),
-                          csr_col_ind.vec<int32>()));
-
-  test::ExpectTensorEqual<int32>(batch_ptr,
-                                 test::AsTensor<int32>({0, 2, 2, 3}));
-  test::ExpectTensorEqual<int32>(csr_row_ptr,
-                                 test::AsTensor<int32>({0, 1, 1, 2,  //
-                                                        0, 0, 0, 0,  //
-                                                        0, 1, 1, 1}));
-  test::ExpectTensorEqual<int32>(csr_col_ind, test::AsTensor<int32>({0, 3, 1}));
+                          batch_ptr.vec<int32_t>(), csr_row_ptr.vec<int32_t>(),
+                          csr_col_ind.vec<int32_t>()));
+
+  test::ExpectTensorEqual<int32_t>(batch_ptr,
+                                   test::AsTensor<int32_t>({0, 2, 2, 3}));
+  test::ExpectTensorEqual<int32_t>(csr_row_ptr,
+                                   test::AsTensor<int32_t>({0, 1, 1, 2,  //
+                                                            0, 0, 0, 0,  //
+                                                            0, 1, 1, 1}));
+  test::ExpectTensorEqual<int32_t>(csr_col_ind,
+                                   test::AsTensor<int32_t>({0, 3, 1}));
 }
 
 TEST(SparseTensorToCSRSparseMatrix, InvalidBatchThrowsIllegalArgument) {
@@ -90,13 +91,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidBatchThrowsIllegalArgument) {
   Tensor csr_col_ind(DT_INT32, {3});
   // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12
   Tensor csr_row_ptr(DT_INT32, {12});
-  test::FillFn<int32>(&csr_row_ptr, [](int unused) { return 0; });
+  test::FillFn<int32_t>(&csr_row_ptr, [](int unused) { return 0; });
 
   functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
   EXPECT_THAT(
       coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4,
-                 indices.template matrix<int64_t>(), batch_ptr.vec<int32>(),
-                 csr_row_ptr.vec<int32>(), csr_col_ind.vec<int32>()),
+                 indices.template matrix<int64_t>(), batch_ptr.vec<int32_t>(),
+                 csr_row_ptr.vec<int32_t>(), csr_col_ind.vec<int32_t>()),
       absl_testing::StatusIs(tsl::error::Code::INVALID_ARGUMENT,
                              ::testing::ContainsRegex(
                                  "Batch index .* is outside of valid range")));
@@ -111,13 +112,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidRowThrowsIllegalArgument) {
   Tensor csr_col_ind(DT_INT32, {3});
   // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12
   Tensor csr_row_ptr(DT_INT32, {12});
-  test::FillFn<int32>(&csr_row_ptr, [](int unused) { return 0; });
+  test::FillFn<int32_t>(&csr_row_ptr, [](int unused) { return 0; });
 
   functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
   EXPECT_THAT(
       coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4,
-                 indices.template matrix<int64_t>(), batch_ptr.vec<int32>(),
-                 csr_row_ptr.vec<int32>(), csr_col_ind.vec<int32>()),
+                 indices.template matrix<int64_t>(), batch_ptr.vec<int32_t>(),
+                 csr_row_ptr.vec<int32_t>(), csr_col_ind.vec<int32_t>()),
       absl_testing::StatusIs(
           tsl::error::Code::INVALID_ARGUMENT,
           ::testing::ContainsRegex("Row index .* is outside of valid range")));
@@ -132,13 +133,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidColThrowsIllegalArgument) {
   Tensor csr_col_ind(DT_INT32, {3});
   // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12
   Tensor csr_row_ptr(DT_INT32, {12});
-  test::FillFn<int32>(&csr_row_ptr, [](int unused) { return 0; });
+  test::FillFn<int32_t>(&csr_row_ptr, [](int unused) { return 0; });
 
   functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
   EXPECT_THAT(
       coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4,
-                 indices.template matrix<int64_t>(), batch_ptr.vec<int32>(),
-                 csr_row_ptr.vec<int32>(), csr_col_ind.vec<int32>()),
+                 indices.template matrix<int64_t>(), batch_ptr.vec<int32_t>(),
+                 csr_row_ptr.vec<int32_t>(), csr_col_ind.vec<int32_t>()),
       absl_testing::StatusIs(tsl::error::Code::INVALID_ARGUMENT,
                              ::testing::ContainsRegex(
                                  "Column index .* is outside of valid range")));
@@ -154,13 +155,13 @@ TEST(SparseTensorToCSRSparseMatrix, InvalidRankIllegalArgument) {
   Tensor csr_col_ind(DT_INT32, {3});
   // row pointers have size = batch_size * (num_rows + 1) = 3 * 4 = 12
   Tensor csr_row_ptr(DT_INT32, {12});
-  test::FillFn<int32>(&csr_row_ptr, [](int unused) { return 0; });
+  test::FillFn<int32_t>(&csr_row_ptr, [](int unused) { return 0; });
 
   functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
   EXPECT_THAT(
       coo_to_csr(/*batch_size=*/3, /*num_rows=*/3, /*num_cols=*/4,
-                 indices.template matrix<int64_t>(), batch_ptr.vec<int32>(),
-                 csr_row_ptr.vec<int32>(), csr_col_ind.vec<int32>()),
+                 indices.template matrix<int64_t>(), batch_ptr.vec<int32_t>(),
+                 csr_row_ptr.vec<int32_t>(), csr_col_ind.vec<int32_t>()),
       absl_testing::StatusIs(tsl::error::Code::INVALID_ARGUMENT,
                              ::testing::ContainsRegex(
                                  "Indices must have either 2 or 3 columns.")));
diff --git a/tensorflow/core/kernels/sparse/nnz_op.cc b/tensorflow/core/kernels/sparse/nnz_op.cc
index 2006abfe4459b1..ad8095b1cdc925 100644
--- a/tensorflow/core/kernels/sparse/nnz_op.cc
+++ b/tensorflow/core/kernels/sparse/nnz_op.cc
@@ -53,7 +53,7 @@ class CSRNNZOp : public OpKernel {
           c, nnz_shape.AddDimWithStatus(csr_sparse_matrix->batch_size()));
     }
     OP_REQUIRES_OK(c, c->allocate_output(0, nnz_shape, &nnz_t));
-    auto nnz = nnz_t->flat<int32>();
+    auto nnz = nnz_t->flat<int32_t>();
     for (int i = 0; i < csr_sparse_matrix->batch_size(); ++i) {
       nnz(i) = csr_sparse_matrix->nnz(i);
     }
diff --git a/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc b/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc
index 51f867277c6a55..afe0a1322ba866 100644
--- a/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_cholesky_op.cc
@@ -93,7 +93,7 @@ class CSRSparseCholeskyCPUOp : public OpKernel {
 
     // Allocate batch pointers.
     Tensor batch_ptr(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1}));
-    auto batch_ptr_vec = batch_ptr.vec<int32>();
+    auto batch_ptr_vec = batch_ptr.vec<int32_t>();
     batch_ptr_vec(0) = 0;
 
     // Temporary vector of Eigen SparseMatrices to store the Sparse Cholesky
@@ -130,7 +130,7 @@ class CSRSparseCholeskyCPUOp : public OpKernel {
                                    Eigen::NaturalOrdering<int>>
                   solver;
               auto permutation_indices_flat =
-                  input_permutation_indices.flat<int32>().data();
+                  input_permutation_indices.flat<int32_t>().data();
 
               // Invert the fill-in reducing ordering and apply it to the input
               // sparse matrix.
@@ -183,8 +183,8 @@ class CSRSparseCholeskyCPUOp : public OpKernel {
     Tensor output_col_ind(cpu_allocator(), DT_INT32, TensorShape({total_nnz}));
     Tensor output_values(cpu_allocator(), DataTypeToEnum<T>::value,
                          TensorShape({total_nnz}));
-    auto output_row_ptr_ptr = output_row_ptr.flat<int32>().data();
-    auto output_col_ind_ptr = output_col_ind.flat<int32>().data();
+    auto output_row_ptr_ptr = output_row_ptr.flat<int32_t>().data();
+    auto output_col_ind_ptr = output_col_ind.flat<int32_t>().data();
     auto output_values_ptr = output_values.flat<T>().data();
 
     // Copy the output matrices from each batch into the CSRSparseMatrix
diff --git a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
index c961ec282b4ed0..0455fa374538fc 100644
--- a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
@@ -188,7 +188,7 @@ class CSRSparseMatMulCPUOp : public OpKernel {
 
     // Set batch pointers.
     Tensor batch_ptr(cpu_allocator(), DT_INT32, TensorShape({batch_size + 1}));
-    auto batch_ptr_vec = batch_ptr.vec<int32>();
+    auto batch_ptr_vec = batch_ptr.vec<int32_t>();
     batch_ptr_vec(0) = 0;
 
     // Store intermediate matrix products for each batch.
@@ -248,8 +248,8 @@ class CSRSparseMatMulCPUOp : public OpKernel {
     Tensor output_col_ind(cpu_allocator(), DT_INT32, TensorShape({total_nnz}));
     Tensor output_values(cpu_allocator(), DataTypeToEnum<T>::value,
                          TensorShape({total_nnz}));
-    auto output_row_ptr_ptr = output_row_ptr.flat<int32>().data();
-    auto output_col_ind_ptr = output_col_ind.flat<int32>().data();
+    auto output_row_ptr_ptr = output_row_ptr.flat<int32_t>().data();
+    auto output_col_ind_ptr = output_col_ind.flat<int32_t>().data();
     auto output_values_ptr = output_values.flat<T>().data();
 
     // Copy the output matrices from each batch into the CSRSparseMatrix
@@ -411,14 +411,14 @@ class CSRSparseMatMulGPUOp : public OpKernel {
 
     Tensor c_batch_ptr_t(cpu_allocator(), DT_INT32,
                          TensorShape({batch_size + 1}));
-    auto c_batch_ptr = c_batch_ptr_t.vec<int32>();
+    auto c_batch_ptr = c_batch_ptr_t.vec<int32_t>();
     c_batch_ptr(0) = 0;
 
     Tensor c_row_ptr_t;
     OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                             DT_INT32, TensorShape({batch_size * (rows + 1)}),
                             &c_row_ptr_t));
-    auto c_row_ptr = c_row_ptr_t.vec<int32>();
+    auto c_row_ptr = c_row_ptr_t.vec<int32_t>();
 
     // Possibly transpose a.
     const CSRSparseMatrix* a_input_matrix;
@@ -643,8 +643,8 @@ class CSRSparseMatMulGPUOp : public OpKernel {
                                   b_input_matrix->values_vec<T>(b_batch),
                                   b_input_dense_shape};
 
-      TTypes<int32>::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)),
-                                              rows + 1);
+      TTypes<int32_t>::UnalignedVec c_row_ptr_i(&c_row_ptr(i * (rows + 1)),
+                                                rows + 1);
 
       int c_nnz_i;
       OP_REQUIRES_OK(ctx,
diff --git a/tensorflow/core/kernels/sparse/sparse_matrix.h b/tensorflow/core/kernels/sparse/sparse_matrix.h
index 8e5ff45f57d30a..d3db1f29871d80 100644
--- a/tensorflow/core/kernels/sparse/sparse_matrix.h
+++ b/tensorflow/core/kernels/sparse/sparse_matrix.h
@@ -217,40 +217,40 @@ class CSRSparseMatrix {
     return dense_shape_;
   }
 
-  inline TTypes<int32>::UnalignedVec row_pointers_vec(int batch) {
+  inline TTypes<int32_t>::UnalignedVec row_pointers_vec(int batch) {
     DCHECK(valid());
     DCHECK_LT(batch, batch_size());
     const int64_t rows = dense_shape().vec<int64_t>()((dims() == 2) ? 0 : 1);
     const int offset = batch * (rows + 1);
-    return TTypes<int32>::UnalignedVec(row_pointers_vec_->data() + offset,
-                                       rows + 1);
+    return TTypes<int32_t>::UnalignedVec(row_pointers_vec_->data() + offset,
+                                         rows + 1);
   }
 
-  inline TTypes<int32>::UnalignedConstVec row_pointers_vec(int batch) const {
+  inline TTypes<int32_t>::UnalignedConstVec row_pointers_vec(int batch) const {
     DCHECK(valid());
     DCHECK_LT(batch, batch_size());
     const int64_t rows = dense_shape().vec<int64_t>()((dims() == 2) ? 0 : 1);
     const int offset = batch * (rows + 1);
-    return TTypes<int32>::UnalignedConstVec(row_pointers_vec_->data() + offset,
-                                            rows + 1);
+    return TTypes<int32_t>::UnalignedConstVec(
+        row_pointers_vec_->data() + offset, rows + 1);
   }
 
-  inline TTypes<int32>::UnalignedVec col_indices_vec(int batch) {
+  inline TTypes<int32_t>::UnalignedVec col_indices_vec(int batch) {
     DCHECK(valid());
     DCHECK_LT(batch, batch_size());
     const int offset = (*batch_pointers_vec_)(batch);
     const int nnz_in_batch = nnz(batch);
-    return TTypes<int32>::UnalignedVec(col_indices_vec_->data() + offset,
-                                       nnz_in_batch);
+    return TTypes<int32_t>::UnalignedVec(col_indices_vec_->data() + offset,
+                                         nnz_in_batch);
   }
 
-  inline TTypes<int32>::UnalignedConstVec col_indices_vec(int batch) const {
+  inline TTypes<int32_t>::UnalignedConstVec col_indices_vec(int batch) const {
     DCHECK(valid());
     DCHECK_LT(batch, batch_size());
     const int offset = (*batch_pointers_vec_)(batch);
     const int nnz_in_batch = nnz(batch);
-    return TTypes<int32>::UnalignedConstVec(col_indices_vec_->data() + offset,
-                                            nnz_in_batch);
+    return TTypes<int32_t>::UnalignedConstVec(col_indices_vec_->data() + offset,
+                                              nnz_in_batch);
   }
 
   template <typename T>
@@ -411,9 +411,11 @@ class CSRSparseMatrix {
   void SetupVecs() {
     if (!metadata_.validated) return;
     batch_pointers_vec_.reset(
-        new TTypes<int32>::Vec(batch_pointers_.vec<int32>()));
-    row_pointers_vec_.reset(new TTypes<int32>::Vec(row_pointers_.vec<int32>()));
-    col_indices_vec_.reset(new TTypes<int32>::Vec(col_indices_.vec<int32>()));
+        new TTypes<int32_t>::Vec(batch_pointers_.vec<int32_t>()));
+    row_pointers_vec_.reset(
+        new TTypes<int32_t>::Vec(row_pointers_.vec<int32_t>()));
+    col_indices_vec_.reset(
+        new TTypes<int32_t>::Vec(col_indices_.vec<int32_t>()));
   }
 
   void ClearVecs() {
@@ -537,9 +539,9 @@ class CSRSparseMatrix {
   Tensor row_pointers_;
   Tensor col_indices_;
   Tensor values_;
-  std::unique_ptr<TTypes<int32>::Vec> batch_pointers_vec_;
-  std::unique_ptr<TTypes<int32>::Vec> row_pointers_vec_;
-  std::unique_ptr<TTypes<int32>::Vec> col_indices_vec_;
+  std::unique_ptr<TTypes<int32_t>::Vec> batch_pointers_vec_;
+  std::unique_ptr<TTypes<int32_t>::Vec> row_pointers_vec_;
+  std::unique_ptr<TTypes<int32_t>::Vec> col_indices_vec_;
 };
 
 // Call BinaryFunctor<Device, T>()(ctx, a, b, c)
@@ -616,16 +618,16 @@ absl::Status CSRSparseMatrixUnaryHelper(OpKernelContext* ctx,
 
 template <typename T>
 struct ConstCSRComponent {
-  TTypes<int32>::UnalignedConstVec row_ptr;
-  TTypes<int32>::UnalignedConstVec col_ind;
+  TTypes<int32_t>::UnalignedConstVec row_ptr;
+  TTypes<int32_t>::UnalignedConstVec col_ind;
   typename TTypes<T>::UnalignedConstVec values;
   TTypes<int64_t>::ConstVec dense_shape_host;
 };
 
 template <typename T>
 struct CSRComponent {
-  TTypes<int32>::UnalignedVec row_ptr;
-  TTypes<int32>::UnalignedVec col_ind;
+  TTypes<int32_t>::UnalignedVec row_ptr;
+  TTypes<int32_t>::UnalignedVec col_ind;
   typename TTypes<T>::UnalignedVec values;
   TTypes<int64_t>::Vec dense_shape_host;
 };
diff --git a/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc b/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc
index 5c1a0f007ed656..353a8ecb0aa86b 100644
--- a/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc
@@ -57,7 +57,7 @@ class CSRSparseMatrixComponentsOp : public OpKernel {
     OP_REQUIRES(c, index_t.dims() == 0,
                 errors::InvalidArgument("index should be a scalar, but saw: ",
                                         index_t.DebugString()));
-    int32_t index = index_t.scalar<int32>()();
+    int32_t index = index_t.scalar<int32_t>()();
     OP_REQUIRES(c, index >= 0 && index < csr_sparse_matrix->batch_size(),
                 errors::InvalidArgument("index (", index, ") not in [0, ",
                                         csr_sparse_matrix->batch_size(), ")"));
@@ -67,7 +67,7 @@ class CSRSparseMatrixComponentsOp : public OpKernel {
       c->set_output(1, csr_sparse_matrix->col_indices());
       c->set_output(2, csr_sparse_matrix->values());
     } else {
-      auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32>();
+      auto batch_ptrs = csr_sparse_matrix->batch_pointers().vec<int32_t>();
       auto dense_shape = csr_sparse_matrix->dense_shape().vec<int64_t>();
       int64_t rows = dense_shape(1);
       int nnz = batch_ptrs(index + 1) - batch_ptrs(index);
@@ -78,23 +78,23 @@ class CSRSparseMatrixComponentsOp : public OpKernel {
           c, c->allocate_output(0, TensorShape({rows + 1}), &row_ptrs_t));
       OP_REQUIRES_OK(c, c->allocate_output(1, TensorShape({nnz}), &col_inds_t));
       OP_REQUIRES_OK(c, c->allocate_output(2, TensorShape({nnz}), &values_t));
-      auto row_ptrs = row_ptrs_t->vec<int32>();
-      auto col_inds = col_inds_t->vec<int32>();
+      auto row_ptrs = row_ptrs_t->vec<int32_t>();
+      auto col_inds = col_inds_t->vec<int32_t>();
       auto values = values_t->vec<T>();
 
-      functor::Slice<Device, int32, 1> slice_int;
+      functor::Slice<Device, int32_t, 1> slice_int;
       functor::Slice<Device, T, 1> slice_t;
       typedef Eigen::DSizes<Eigen::DenseIndex, 1> EVec;
       const Device& d = c->eigen_device<Device>();
       slice_int(d,
                 /*output*/ row_ptrs,
-                /*input*/ csr_sparse_matrix->row_pointers().vec<int32>(),
+                /*input*/ csr_sparse_matrix->row_pointers().vec<int32_t>(),
                 /*slice_indices*/
                 EVec{static_cast<Eigen::DenseIndex>(index * (rows + 1))},
                 /*slice_sizes*/ EVec{static_cast<Eigen::DenseIndex>(rows + 1)});
       slice_int(d,
                 /*output*/ col_inds,
-                /*input*/ csr_sparse_matrix->col_indices().vec<int32>(),
+                /*input*/ csr_sparse_matrix->col_indices().vec<int32_t>(),
                 /*slice_indices*/ EVec{batch_ptrs(index)},
                 /*slice_sizes*/ EVec{nnz});
       slice_t(d,
diff --git a/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc b/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc
index e93e2b0a018845..259e9a97cd2ff3 100644
--- a/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc
@@ -74,7 +74,7 @@ class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel {
     const int64_t num_cols = dense_shape_vec((rank == 2) ? 1 : 2);
     const int64_t total_nnz = values.NumElements();
 
-    static constexpr int64_t kInt32Max = std::numeric_limits<int32>::max();
+    static constexpr int64_t kInt32Max = std::numeric_limits<int32_t>::max();
     OP_REQUIRES(
         ctx, batch_size < kInt32Max,
         errors::InvalidArgument("dense_shape batch_size must be < Int32Max,"
@@ -106,16 +106,16 @@ class SparseTensorToCSRSparseMatrixCPUOp : public OpKernel {
     Tensor csr_row_ptr(cpu_allocator(), DT_INT32, csr_row_ind_shape);
 
     // Fill the row pointers with zeros.
-    functor::SetZeroFunctor<CPUDevice, int32> set_zero;
-    set_zero(ctx->eigen_device<CPUDevice>(), csr_row_ptr.flat<int32>());
+    functor::SetZeroFunctor<CPUDevice, int32_t> set_zero;
+    set_zero(ctx->eigen_device<CPUDevice>(), csr_row_ptr.flat<int32_t>());
 
     // Convert from COO to CSR format.
     functor::SparseTensorToCSRSparseMatrixCPUFunctor coo_to_csr;
     OP_REQUIRES_OK(
         ctx,
         coo_to_csr(batch_size, num_rows, num_cols,
-                   indices.template matrix<int64_t>(), batch_ptr.vec<int32>(),
-                   csr_row_ptr.vec<int32>(), csr_col_ind.vec<int32>()));
+                   indices.template matrix<int64_t>(), batch_ptr.vec<int32_t>(),
+                   csr_row_ptr.vec<int32_t>(), csr_col_ind.vec<int32_t>()));
 
     // Create the CSRSparseMatrix object from its component Tensors and prepare
     // the Variant output Tensor.
diff --git a/tensorflow/core/kernels/sparse/transpose_op.cc b/tensorflow/core/kernels/sparse/transpose_op.cc
index 74e0b85f393e40..d81c80672b31bc 100644
--- a/tensorflow/core/kernels/sparse/transpose_op.cc
+++ b/tensorflow/core/kernels/sparse/transpose_op.cc
@@ -182,9 +182,9 @@ absl::Status CSRSparseMatrixTranspose<Device, T>::operator()(
 
   // Set the output row pointers to zero, in case we hit any empty
   // input batches.
-  functor::SetZeroFunctor<Device, int32> set_zero;
+  functor::SetZeroFunctor<Device, int32_t> set_zero;
   const Device& d = ctx->eigen_device<Device>();
-  set_zero(d, output_row_ptr_t.flat<int32>());
+  set_zero(d, output_row_ptr_t.flat<int32_t>());
 
   functor::CSRSparseMatrixTransposeComponent<Device, T> transpose_component;
   for (int i = 0; i < batch_size; ++i) {
diff --git a/tensorflow/core/kernels/sparse/zeros_op.h b/tensorflow/core/kernels/sparse/zeros_op.h
index 2a86089e04e62e..8f6c09fdb0fa68 100644
--- a/tensorflow/core/kernels/sparse/zeros_op.h
+++ b/tensorflow/core/kernels/sparse/zeros_op.h
@@ -54,7 +54,7 @@ struct CSRSparseMatrixZeros {
 
     Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                        TensorShape({batch_size + 1}));
-    batch_ptr_t.vec<int32>().setZero();  // On host.
+    batch_ptr_t.vec<int32_t>().setZero();  // On host.
 
     Allocator* allocator = c->device()->GetAllocator(AllocatorAttributes());
     // An all-zeros CSR matrix is composed of an empty set of column
@@ -66,10 +66,10 @@ struct CSRSparseMatrixZeros {
     Tensor coo_col_ind_t(allocator, DT_INT32, TensorShape({0}));
     Tensor csr_values_t(allocator, dtype, TensorShape({0}));
     const Device& d = c->eigen_device<Device>();
-    functor::SetZeroFunctor<Device, int32> set_zero;
+    functor::SetZeroFunctor<Device, int32_t> set_zero;
     TF_RETURN_IF_ERROR(c->allocate_temp(
         DT_INT32, TensorShape({batch_size * (rows + 1)}), &csr_row_ptr_t));
-    set_zero(d, csr_row_ptr_t.flat<int32>());
+    set_zero(d, csr_row_ptr_t.flat<int32_t>());
 
     TF_RETURN_IF_ERROR(CSRSparseMatrix::CreateCSRSparseMatrix(
         dtype, dense_shape_t, batch_ptr_t, csr_row_ptr_t, coo_col_ind_t,

From 4f02ba06660ddd909021904ae18e0b6e19643c6d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:45:39 -0800
Subject: [PATCH 426/753] Automated Code Change

PiperOrigin-RevId: 845711508
---
 tensorflow/core/lib/wav/wav_io.h       |  10 +-
 tensorflow/core/lib/wav/wav_io_test.cc | 323 +++++++++++++++++--------
 2 files changed, 224 insertions(+), 109 deletions(-)

diff --git a/tensorflow/core/lib/wav/wav_io.h b/tensorflow/core/lib/wav/wav_io.h
index 99a3df5038e68b..4ffe789dd282d7 100644
--- a/tensorflow/core/lib/wav/wav_io.h
+++ b/tensorflow/core/lib/wav/wav_io.h
@@ -65,9 +65,9 @@ extern template Status EncodeAudioAsS16LEWav<tstring>(const float* audio,
 // The results are output as floats within the range -1 to 1,
 absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string,
                                           std::vector<float>* float_values,
-                                          uint32* sample_count,
-                                          uint16* channel_count,
-                                          uint32* sample_rate);
+                                          uint32_t* sample_count,
+                                          uint16_t* channel_count,
+                                          uint32_t* sample_rate);
 
 // Everything below here is only exposed publicly for testing purposes.
 
@@ -88,8 +88,8 @@ absl::Status ReadValue(const std::string& data, T* value, int* offset) {
     memcpy(value, data.data() + *offset, sizeof(T));
   } else {
     *value = 0;
-    const uint8* data_buf =
-        reinterpret_cast<const uint8*>(data.data() + *offset);
+    const uint8_t* data_buf =
+        reinterpret_cast<const uint8_t*>(data.data() + *offset);
     int shift = 0;
     for (int i = 0; i < sizeof(T); ++i, shift += 8) {
       *value = *value | (data_buf[i] << shift);
diff --git a/tensorflow/core/lib/wav/wav_io_test.cc b/tensorflow/core/lib/wav/wav_io_test.cc
index dfc75257cc85f5..68d0c0fa35fbe7 100644
--- a/tensorflow/core/lib/wav/wav_io_test.cc
+++ b/tensorflow/core/lib/wav/wav_io_test.cc
@@ -31,10 +31,10 @@ namespace wav {
 
 // These are defined in wav_io.cc, and the signatures are here so we don't have
 // to expose them in the public header.
-absl::Status ExpectText(const string& data, const string& expected_text,
-                        int* offset);
-absl::Status ReadString(const string& data, int expected_length, string* value,
-                        int* offset);
+absl::Status ExpectText(const std::string& data,
+                        const std::string& expected_text, int* offset);
+absl::Status ReadString(const std::string& data, int expected_length,
+                        std::string* value, int* offset);
 
 TEST(WavIO, BadArguments) {
   float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
@@ -76,7 +76,7 @@ TEST(WavIO, BadArguments) {
 
 TEST(WavIO, BasicEven) {
   float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
-  string result;
+  std::string result;
   TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 3, &result));
   EXPECT_EQ(56, result.size());
   TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 22050, 1, 6, &result));
@@ -87,19 +87,19 @@ TEST(WavIO, BasicEven) {
 
 TEST(WavIO, BasicOdd) {
   float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f};
-  string result;
+  std::string result;
   TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 22050, 1, 5, &result));
   EXPECT_EQ(54, result.size());
 }
 
 TEST(WavIO, EncodeThenDecode) {
   float audio[] = {0.0f, 0.1f, 0.2f, 0.3f, 0.4f, 0.5f};
-  string wav_data;
+  std::string wav_data;
   TF_ASSERT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 3, &wav_data));
   std::vector<float> decoded_audio;
-  uint32 decoded_sample_count;
-  uint16 decoded_channel_count;
-  uint32 decoded_sample_rate;
+  uint32_t decoded_sample_count;
+  uint16_t decoded_channel_count;
+  uint32_t decoded_sample_rate;
   TF_ASSERT_OK(DecodeLin16WaveAsFloatVector(
       wav_data, &decoded_audio, &decoded_sample_count, &decoded_channel_count,
       &decoded_sample_rate));
@@ -112,59 +112,129 @@ TEST(WavIO, EncodeThenDecode) {
 }
 
 TEST(WavIO, BasicMono) {
-  std::vector<uint8> wav_data = {
-      'R', 'I', 'F', 'F',  // ChunkID
-      44, 0, 0, 0,         // ChunkSize: 36 + SubChunk2Size
-      'W', 'A', 'V', 'E',  // Format
-      'f', 'm', 't', ' ',  // Subchunk1ID
-      16, 0, 0, 0,         // Subchunk1Size
-      1, 0,                // AudioFormat: 1=PCM
-      1, 0,                // NumChannels
-      0x44, 0xac, 0, 0,    // SampleRate: 44100
-      0x88, 0x58, 0x1, 0,  // BytesPerSecond: SampleRate * NumChannels *
-                           //                 BitsPerSample/8
-      2, 0,                // BytesPerSample: NumChannels * BitsPerSample/8
-      16, 0,               // BitsPerSample
-      'd', 'a', 't', 'a',  // Subchunk2ID
-      8, 0, 0, 0,          // Subchunk2Size: NumSamples * NumChannels *
-                           //                BitsPerSample/8
-      0, 0,                // Sample 1: 0
-      0xff, 0x7f,          // Sample 2: 32767 (saturated)
-      0, 0,                // Sample 3: 0
-      0x00, 0x80,          // Sample 4: -32768 (saturated)
+  std::vector<uint8_t> wav_data = {
+      'R',
+      'I',
+      'F',
+      'F',  // ChunkID
+      44,
+      0,
+      0,
+      0,  // ChunkSize: 36 + SubChunk2Size
+      'W',
+      'A',
+      'V',
+      'E',  // Format
+      'f',
+      'm',
+      't',
+      ' ',  // Subchunk1ID
+      16,
+      0,
+      0,
+      0,  // Subchunk1Size
+      1,
+      0,  // AudioFormat: 1=PCM
+      1,
+      0,  // NumChannels
+      0x44,
+      0xac,
+      0,
+      0,  // SampleRate: 44100
+      0x88,
+      0x58,
+      0x1,
+      0,  // BytesPerSecond: SampleRate * NumChannels *
+          //                 BitsPerSample/8
+      2,
+      0,  // BytesPerSample: NumChannels * BitsPerSample/8
+      16,
+      0,  // BitsPerSample
+      'd',
+      'a',
+      't',
+      'a',  // Subchunk2ID
+      8,
+      0,
+      0,
+      0,  // Subchunk2Size: NumSamples * NumChannels *
+          //                BitsPerSample/8
+      0,
+      0,  // Sample 1: 0
+      0xff,
+      0x7f,  // Sample 2: 32767 (saturated)
+      0,
+      0,  // Sample 3: 0
+      0x00,
+      0x80,  // Sample 4: -32768 (saturated)
   };
-  string expected(wav_data.begin(), wav_data.end());
+  std::string expected(wav_data.begin(), wav_data.end());
   float audio[] = {0.0f, 1.0f, 0.0f, -1.0f};
-  string result;
+  std::string result;
   TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 1, 4, &result));
   EXPECT_EQ(expected, result);
 }
 
 TEST(WavIO, BasicStereo) {
-  std::vector<uint8> wav_data = {
-      'R', 'I', 'F', 'F',  // ChunkID
-      44, 0, 0, 0,         // ChunkSize: 36 + SubChunk2Size
-      'W', 'A', 'V', 'E',  // Format
-      'f', 'm', 't', ' ',  // Subchunk1ID
-      16, 0, 0, 0,         // Subchunk1Size
-      1, 0,                // AudioFormat: 1=PCM
-      2, 0,                // NumChannels
-      0x44, 0xac, 0, 0,    // SampleRate: 44100
-      0x10, 0xb1, 0x2, 0,  // BytesPerSecond: SampleRate * NumChannels *
-                           //                 BitsPerSample/8
-      4, 0,                // BytesPerSample: NumChannels * BitsPerSample/8
-      16, 0,               // BitsPerSample
-      'd', 'a', 't', 'a',  // Subchunk2ID
-      8, 0, 0, 0,          // Subchunk2Size: NumSamples * NumChannels *
-                           //                BitsPerSample/8
-      0, 0,                // Sample 1: 0
-      0xff, 0x7f,          // Sample 2: 32767 (saturated)
-      0, 0,                // Sample 3: 0
-      0x00, 0x80,          // Sample 4: -32768 (saturated)
+  std::vector<uint8_t> wav_data = {
+      'R',
+      'I',
+      'F',
+      'F',  // ChunkID
+      44,
+      0,
+      0,
+      0,  // ChunkSize: 36 + SubChunk2Size
+      'W',
+      'A',
+      'V',
+      'E',  // Format
+      'f',
+      'm',
+      't',
+      ' ',  // Subchunk1ID
+      16,
+      0,
+      0,
+      0,  // Subchunk1Size
+      1,
+      0,  // AudioFormat: 1=PCM
+      2,
+      0,  // NumChannels
+      0x44,
+      0xac,
+      0,
+      0,  // SampleRate: 44100
+      0x10,
+      0xb1,
+      0x2,
+      0,  // BytesPerSecond: SampleRate * NumChannels *
+          //                 BitsPerSample/8
+      4,
+      0,  // BytesPerSample: NumChannels * BitsPerSample/8
+      16,
+      0,  // BitsPerSample
+      'd',
+      'a',
+      't',
+      'a',  // Subchunk2ID
+      8,
+      0,
+      0,
+      0,  // Subchunk2Size: NumSamples * NumChannels *
+          //                BitsPerSample/8
+      0,
+      0,  // Sample 1: 0
+      0xff,
+      0x7f,  // Sample 2: 32767 (saturated)
+      0,
+      0,  // Sample 3: 0
+      0x00,
+      0x80,  // Sample 4: -32768 (saturated)
   };
-  string expected(wav_data.begin(), wav_data.end());
+  std::string expected(wav_data.begin(), wav_data.end());
   float audio[] = {0.0f, 1.0f, 0.0f, -1.0f};
-  string result;
+  std::string result;
   TF_EXPECT_OK(EncodeAudioAsS16LEWav(audio, 44100, 2, 2, &result));
   EXPECT_EQ(expected, result);
 }
@@ -175,38 +245,83 @@ TEST(WavIO, BasicStereo) {
 // large WAV files are not common, and are unsupported by many readers.
 // See b/72655902.
 TEST(WavIO, ChunkSizeOverflow) {
-  std::vector<uint8> wav_data = {
-      'R', 'I', 'F', 'F',      // ChunkID
-      60, 0, 0, 0,             // ChunkSize: 36 + SubChunk2Size
-      'W', 'A', 'V', 'E',      // Format
-      'f', 'm', 't', ' ',      // Subchunk1ID
-      16, 0, 0, 0,             // Subchunk1Size
-      1, 0,                    // AudioFormat: 1=PCM
-      1, 0,                    // NumChannels
-      0x44, 0xac, 0, 0,        // SampleRate: 44100
-      0x88, 0x58, 0x1, 0,      // BytesPerSecond: SampleRate * NumChannels *
-                               //                 BitsPerSample/8
-      2, 0,                    // BytesPerSample: NumChannels * BitsPerSample/8
-      16, 0,                   // BitsPerSample
-      'd', 'a', 't', 'a',      // Subchunk2ID
-      8, 0, 0, 0,              // Subchunk2Size: NumSamples * NumChannels *
-                               //                BitsPerSample/8
-      0, 0,                    // Sample 1: 0
-      0xff, 0x7f,              // Sample 2: 32767 (saturated)
-      0, 0,                    // Sample 3: 0
-      0x00, 0x80,              // Sample 4: -32768 (saturated)
-      'f', 'o', 'o', 'o',      // Subchunk2ID
-      0xff, 0xff, 0xff, 0xf8,  // Chunk size that could cause an infinite loop.
-      0, 0,                    // Sample 1: 0
-      0xff, 0x7f,              // Sample 2: 32767 (saturated)
-      0, 0,                    // Sample 3: 0
-      0x00, 0x80,              // Sample 4: -32768 (saturated)
+  std::vector<uint8_t> wav_data = {
+      'R',
+      'I',
+      'F',
+      'F',  // ChunkID
+      60,
+      0,
+      0,
+      0,  // ChunkSize: 36 + SubChunk2Size
+      'W',
+      'A',
+      'V',
+      'E',  // Format
+      'f',
+      'm',
+      't',
+      ' ',  // Subchunk1ID
+      16,
+      0,
+      0,
+      0,  // Subchunk1Size
+      1,
+      0,  // AudioFormat: 1=PCM
+      1,
+      0,  // NumChannels
+      0x44,
+      0xac,
+      0,
+      0,  // SampleRate: 44100
+      0x88,
+      0x58,
+      0x1,
+      0,  // BytesPerSecond: SampleRate * NumChannels *
+          //                 BitsPerSample/8
+      2,
+      0,  // BytesPerSample: NumChannels * BitsPerSample/8
+      16,
+      0,  // BitsPerSample
+      'd',
+      'a',
+      't',
+      'a',  // Subchunk2ID
+      8,
+      0,
+      0,
+      0,  // Subchunk2Size: NumSamples * NumChannels *
+          //                BitsPerSample/8
+      0,
+      0,  // Sample 1: 0
+      0xff,
+      0x7f,  // Sample 2: 32767 (saturated)
+      0,
+      0,  // Sample 3: 0
+      0x00,
+      0x80,  // Sample 4: -32768 (saturated)
+      'f',
+      'o',
+      'o',
+      'o',  // Subchunk2ID
+      0xff,
+      0xff,
+      0xff,
+      0xf8,  // Chunk size that could cause an infinite loop.
+      0,
+      0,  // Sample 1: 0
+      0xff,
+      0x7f,  // Sample 2: 32767 (saturated)
+      0,
+      0,  // Sample 3: 0
+      0x00,
+      0x80,  // Sample 4: -32768 (saturated)
   };
-  string wav_data_string(wav_data.begin(), wav_data.end());
+  std::string wav_data_string(wav_data.begin(), wav_data.end());
   std::vector<float> decoded_audio;
-  uint32 decoded_sample_count;
-  uint16 decoded_channel_count;
-  uint32 decoded_sample_rate;
+  uint32_t decoded_sample_count;
+  uint16_t decoded_channel_count;
+  uint32_t decoded_sample_rate;
   absl::Status decode_status = DecodeLin16WaveAsFloatVector(
       wav_data_string, &decoded_audio, &decoded_sample_count,
       &decoded_channel_count, &decoded_sample_rate);
@@ -244,10 +359,10 @@ TEST(WavIO, IncrementOffset) {
 }
 
 TEST(WavIO, ExpectText) {
-  std::vector<uint8> test_data = {
+  std::vector<uint8_t> test_data = {
       'E', 'x', 'p', 'e', 'c', 't', 'e', 'd',
   };
-  string test_string(test_data.begin(), test_data.end());
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
   TF_EXPECT_OK(ExpectText(test_string, "Expected", &offset));
@@ -267,13 +382,13 @@ TEST(WavIO, ExpectText) {
 }
 
 TEST(WavIO, ReadString) {
-  std::vector<uint8> test_data = {
+  std::vector<uint8_t> test_data = {
       'E', 'x', 'p', 'e', 'c', 't', 'e', 'd',
   };
-  string test_string(test_data.begin(), test_data.end());
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
-  string read_value;
+  std::string read_value;
   TF_EXPECT_OK(ReadString(test_string, 2, &read_value, &offset));
   EXPECT_EQ("Ex", read_value);
   EXPECT_EQ(2, offset);
@@ -287,8 +402,8 @@ TEST(WavIO, ReadString) {
 }
 
 TEST(WavIO, ReadValueInt8) {
-  std::vector<uint8> test_data = {0x00, 0x05, 0xff, 0x80};
-  string test_string(test_data.begin(), test_data.end());
+  std::vector<uint8_t> test_data = {0x00, 0x05, 0xff, 0x80};
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
   int8_t read_value;
@@ -313,11 +428,11 @@ TEST(WavIO, ReadValueInt8) {
 }
 
 TEST(WavIO, ReadValueUInt8) {
-  std::vector<uint8> test_data = {0x00, 0x05, 0xff, 0x80};
-  string test_string(test_data.begin(), test_data.end());
+  std::vector<uint8_t> test_data = {0x00, 0x05, 0xff, 0x80};
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
-  uint8 read_value;
+  uint8_t read_value;
   TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
   EXPECT_EQ(0, read_value);
   EXPECT_EQ(1, offset);
@@ -339,14 +454,14 @@ TEST(WavIO, ReadValueUInt8) {
 }
 
 TEST(WavIO, ReadValueInt16) {
-  std::vector<uint8> test_data = {
+  std::vector<uint8_t> test_data = {
       0x00, 0x00,  // 0
       0xff, 0x00,  // 255
       0x00, 0x01,  // 256
       0xff, 0xff,  // -1
       0x00, 0x80,  // -32768
   };
-  string test_string(test_data.begin(), test_data.end());
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
   int16_t read_value;
@@ -375,17 +490,17 @@ TEST(WavIO, ReadValueInt16) {
 }
 
 TEST(WavIO, ReadValueUInt16) {
-  std::vector<uint8> test_data = {
+  std::vector<uint8_t> test_data = {
       0x00, 0x00,  // 0
       0xff, 0x00,  // 255
       0x00, 0x01,  // 256
       0xff, 0xff,  // 65535
       0x00, 0x80,  // 32768
   };
-  string test_string(test_data.begin(), test_data.end());
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
-  uint16 read_value;
+  uint16_t read_value;
   TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
   EXPECT_EQ(0, read_value);
   EXPECT_EQ(2, offset);
@@ -411,14 +526,14 @@ TEST(WavIO, ReadValueUInt16) {
 }
 
 TEST(WavIO, ReadValueInt32) {
-  std::vector<uint8> test_data = {
+  std::vector<uint8_t> test_data = {
       0x00, 0x00, 0x00, 0x00,  // 0
       0xff, 0x00, 0x00, 0x00,  // 255
       0x00, 0xff, 0x00, 0x00,  // 65280
       0x00, 0x00, 0xff, 0x00,  // 16,711,680
       0xff, 0xff, 0xff, 0xff,  // -1
   };
-  string test_string(test_data.begin(), test_data.end());
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
   int32_t read_value;
@@ -447,17 +562,17 @@ TEST(WavIO, ReadValueInt32) {
 }
 
 TEST(WavIO, ReadValueUInt32) {
-  std::vector<uint8> test_data = {
+  std::vector<uint8_t> test_data = {
       0x00, 0x00, 0x00, 0x00,  // 0
       0xff, 0x00, 0x00, 0x00,  // 255
       0x00, 0xff, 0x00, 0x00,  // 65280
       0x00, 0x00, 0xff, 0x00,  // 16,711,680
       0xff, 0xff, 0xff, 0xff,  // 4,294,967,295
   };
-  string test_string(test_data.begin(), test_data.end());
+  std::string test_string(test_data.begin(), test_data.end());
 
   int offset = 0;
-  uint32 read_value;
+  uint32_t read_value;
   TF_EXPECT_OK(ReadValue(test_string, &read_value, &offset));
   EXPECT_EQ(0, read_value);
   EXPECT_EQ(4, offset);

From a9954dfdd2371bddf90191018601bc621e5dc115 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 04:45:49 -0800
Subject: [PATCH 427/753] Automated Code Change

PiperOrigin-RevId: 845711551
---
 .../lite/tools/optimize/calibration/calibrator_test.cc     | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
index f73bbfa1288754..d80391b3967130 100644
--- a/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
+++ b/tensorflow/lite/tools/optimize/calibration/calibrator_test.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #include "tensorflow/lite/tools/optimize/calibration/calibration_reader.h"
 
 namespace {
-tensorflow::string* g_test_model_dir = nullptr;
+std::string* g_test_model_dir = nullptr;
 }  // namespace
 
 namespace tflite {
@@ -716,7 +716,7 @@ TEST(CalibratorTest, CalibrationWithCallOnce) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  tensorflow::string model_file;
+  std::string model_file;
   const std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("test_model_file", &model_file,
                        "Path to test tflite model file."),
@@ -727,8 +727,7 @@ int main(int argc, char** argv) {
     std::cerr << "Required test_model_file\n";
     std::abort();
   }
-  g_test_model_dir =
-      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file));
   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }

From 10228fc8791496123088f7ea25b064878eb0d744 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 05:11:30 -0800
Subject: [PATCH 428/753] Automated Code Change

PiperOrigin-RevId: 845718981
---
 tensorflow/c/kernels_test.cc | 13 +++++++------
 tensorflow/c/python_api.cc   |  2 +-
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
index b8b8b2f29cfe13..3064224e9b12bf 100644
--- a/tensorflow/c/kernels_test.cc
+++ b/tensorflow/c/kernels_test.cc
@@ -405,7 +405,7 @@ TEST_F(TestKernelAttr, String) {
                                           /*max_length*/ 5, status);
 
     EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
-    EXPECT_EQ("bunny", string(static_cast<const char*>(val.get()), 5));
+    EXPECT_EQ("bunny", std::string(static_cast<const char*>(val.get()), 5));
     TF_DeleteStatus(status);
     return static_cast<void*>(s);
   };
@@ -421,7 +421,7 @@ TEST_F(TestKernelAttr, StringList) {
     s->created = true;
     s->compute_called = false;
 
-    std::vector<string> list = {"bugs", "bunny", "duck"};
+    std::vector<std::string> list = {"bugs", "bunny", "duck"};
     int list_total_size = 0;
     for (const auto& s : list) {
       list_total_size += s.size();
@@ -440,7 +440,8 @@ TEST_F(TestKernelAttr, StringList) {
 
     for (size_t i = 0; i < list.size(); ++i) {
       EXPECT_EQ(list[i].size(), lens[i]) << i;
-      EXPECT_EQ(list[i], string(static_cast<const char*>(values[i]), lens[i]))
+      EXPECT_EQ(list[i],
+                std::string(static_cast<const char*>(values[i]), lens[i]))
           << i;
     }
     TF_DeleteStatus(status);
@@ -823,7 +824,7 @@ TEST(TestKernel, TestInputAndOutputCount) {
     TF_Status* s = TF_NewStatus();
     TF_GetInput(ctx, 0, &input, s);
     EXPECT_EQ(TF_OK, TF_GetCode(s)) << "Failed to get input: " << TF_Message(s);
-    EXPECT_EQ(123, *static_cast<tensorflow::uint8*>(TF_TensorData(input)));
+    EXPECT_EQ(123, *static_cast<uint8_t*>(TF_TensorData(input)));
     TF_GetInput(ctx, -1, &input, s);
     EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
     TF_GetInput(ctx, 3, &input, s);
@@ -866,7 +867,7 @@ TEST(TestKernel, TestInputAndOutputCount) {
     p.device = &dummy_device;
     p.step_id = 43;
 
-    Tensor t(tensorflow::uint8(123));
+    Tensor t(uint8_t(123));
 
     absl::InlinedVector<TensorValue, 4UL> inputs;
     // Simulate 2 inputs
@@ -886,7 +887,7 @@ TEST(TestKernel, TestInputAndOutputCount) {
 
     ASSERT_EQ(2, num_inputs);
     ASSERT_EQ(1, num_outputs);
-    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<tensorflow::uint8>()());
+    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<uint8_t>()());
   }
 }
 
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index c2a4d73f8ad620..e49f5a099ee72d 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -84,7 +84,7 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output) {
       *out_shape_and_type->mutable_type() = p.type;
     }
   }
-  string result;
+  std::string result;
   handle_data.SerializeToString(&result);
   return result;
 }

From 2b9d8450d26cc2f41560781c9267dc0bc7487f99 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 05:13:33 -0800
Subject: [PATCH 429/753] Automated Code Change

PiperOrigin-RevId: 845719651
---
 .../core/kernels/immutable_constant_op.cc     |  4 +--
 .../core/kernels/immutable_constant_op.h      |  2 +-
 .../kernels/immutable_constant_op_test.cc     | 27 ++++++++++---------
 tensorflow/core/kernels/in_topk_op.cc         | 12 ++++-----
 tensorflow/core/kernels/in_topk_op.h          |  2 +-
 tensorflow/core/kernels/inplace_ops.cc        | 18 ++++++-------
 6 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/tensorflow/core/kernels/immutable_constant_op.cc b/tensorflow/core/kernels/immutable_constant_op.cc
index be0194413a3b81..4fbd1edfba920a 100644
--- a/tensorflow/core/kernels/immutable_constant_op.cc
+++ b/tensorflow/core/kernels/immutable_constant_op.cc
@@ -26,7 +26,7 @@ class MemmappedTensorAllocator : public Allocator {
  public:
   MemmappedTensorAllocator() {}
 
-  absl::Status InitializeFromRegion(const string& name, Env* env) {
+  absl::Status InitializeFromRegion(const std::string& name, Env* env) {
     const auto status =
         env->NewReadOnlyMemoryRegionFromFile(name, &memory_region_);
     if (!status.ok()) {
@@ -34,7 +34,7 @@ class MemmappedTensorAllocator : public Allocator {
     }
     return absl::OkStatus();
   }
-  string Name() override { return "MemmappedTensorAllocator"; }
+  std::string Name() override { return "MemmappedTensorAllocator"; }
 
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
     if ((reinterpret_cast<intptr_t>(memory_region_->data())) % alignment != 0) {
diff --git a/tensorflow/core/kernels/immutable_constant_op.h b/tensorflow/core/kernels/immutable_constant_op.h
index 264abc8401b3b4..cd645686bddcfa 100644
--- a/tensorflow/core/kernels/immutable_constant_op.h
+++ b/tensorflow/core/kernels/immutable_constant_op.h
@@ -38,7 +38,7 @@ class ImmutableConstantOp : public OpKernel {
   static constexpr char const* kMemoryRegionNameAttr = "memory_region_name";
 
  private:
-  string region_name_;
+  std::string region_name_;
   DataType dtype_;
   TensorShape shape_;
   ImmutableConstantOp(const ImmutableConstantOp&) = delete;
diff --git a/tensorflow/core/kernels/immutable_constant_op_test.cc b/tensorflow/core/kernels/immutable_constant_op_test.cc
index 1cfed79bf3318e..955d3f8751c12a 100644
--- a/tensorflow/core/kernels/immutable_constant_op_test.cc
+++ b/tensorflow/core/kernels/immutable_constant_op_test.cc
@@ -40,7 +40,7 @@ constexpr size_t kTestTensorSizeBytes = kTestTensorSize * sizeof(float);
 class TestReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
  public:
   TestReadOnlyMemoryRegion() = delete;
-  explicit TestReadOnlyMemoryRegion(uint64 length)
+  explicit TestReadOnlyMemoryRegion(uint64_t length)
       : memptr_(cpu_allocator()->AllocateRaw(kTestAlignment, length)),
         length_(length) {}
   ~TestReadOnlyMemoryRegion() override {
@@ -48,11 +48,11 @@ class TestReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
   }
   const void* data() override { return memptr_; }
   float* GetWritableDataStart() { return reinterpret_cast<float*>(memptr_); }
-  uint64 length() override { return length_; }
+  uint64_t length() override { return length_; }
 
  protected:
   void* memptr_;
-  uint64 length_;
+  uint64_t length_;
 };
 
 // A mock file system and environment class that creates ReadOnlyMemoryRegion
@@ -65,7 +65,7 @@ class TestFileSystem : public NullFileSystem {
   using NullFileSystem::NewReadOnlyMemoryRegionFromFile;
 
   absl::Status NewReadOnlyMemoryRegionFromFile(
-      const string& fname, TransactionToken* token,
+      const std::string& fname, TransactionToken* token,
       std::unique_ptr<ReadOnlyMemoryRegion>* result) override {
     float val = 0;
     absl::string_view scheme, host, path;
@@ -146,13 +146,13 @@ TEST(ImmutableConstantOpTest, ExecutionError) {
       error::INTERNAL);
 }
 
-absl::Status CreateTempFileFloat(Env* env, float value, uint64 size,
-                                 string* filename) {
-  const string dir = testing::TmpDir();
+absl::Status CreateTempFileFloat(Env* env, float value, uint64_t size,
+                                 std::string* filename) {
+  const std::string dir = testing::TmpDir();
   *filename = io::JoinPath(dir, absl::StrCat("file_", value));
   std::unique_ptr<WritableFile> file;
   TF_RETURN_IF_ERROR(env->NewWritableFile(*filename, &file));
-  for (uint64 i = 0; i < size; ++i) {
+  for (uint64_t i = 0; i < size; ++i) {
     absl::string_view sp(static_cast<char*>(static_cast<void*>(&value)),
                          sizeof(value));
     TF_RETURN_IF_ERROR(file->Append(sp));
@@ -166,7 +166,7 @@ TEST(ImmutableConstantOpTest, FromFile) {
   Env* env = Env::Default();
   auto root = Scope::NewRootScope().ExitOnError();
 
-  string two_file, three_file;
+  std::string two_file, three_file;
   TF_ASSERT_OK(CreateTempFileFloat(env, 2.0f, 1000, &two_file));
   TF_ASSERT_OK(CreateTempFileFloat(env, 3.0f, 1000, &three_file));
   auto node1 = ops::ImmutableConst(root, DT_FLOAT, kFileTensorShape, two_file);
@@ -191,9 +191,10 @@ TEST(ImmutableConstantOpTest, FromFile) {
   EXPECT_EQ(outputs.front().flat<float>()(2), 2.0f * 3.0f);
 }
 
-absl::Status CreateTempFileBadString(Env* env, char value, uint64 size,
-                                     const string suffix, string* filename) {
-  const string dir = testing::TmpDir();
+absl::Status CreateTempFileBadString(Env* env, char value, uint64_t size,
+                                     const std::string suffix,
+                                     std::string* filename) {
+  const std::string dir = testing::TmpDir();
   *filename = io::JoinPath(dir, absl::StrCat("file_", suffix));
   std::unique_ptr<WritableFile> file;
   TF_RETURN_IF_ERROR(env->NewWritableFile(*filename, &file));
@@ -207,7 +208,7 @@ TEST(ImmutableConstantOpTest, FromFileStringUnimplmented) {
   Env* env = Env::Default();
   auto root = Scope::NewRootScope().ExitOnError();
 
-  string bad_file;
+  std::string bad_file;
   TF_ASSERT_OK(CreateTempFileBadString(env, '\xe2', 128, "bad_e2", &bad_file));
   auto result =
       ops::ImmutableConst(root, DT_STRING, kFileTensorShape, bad_file);
diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index 169d331ad24487..e5d03f902eb58f 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -89,15 +89,15 @@ REGISTER_KERNEL_BUILDER(Name("InTopK")
                             .HostMemory("predictions")
                             .HostMemory("targets")
                             .HostMemory("precision")
-                            .TypeConstraint<int32>("T"),
-                        InTopK<CPUDevice, float, int32>);
+                            .TypeConstraint<int32_t>("T"),
+                        InTopK<CPUDevice, float, int32_t>);
 REGISTER_KERNEL_BUILDER(Name("InTopK")
                             .Device(DEVICE_CPU)
                             .HostMemory("predictions")
                             .HostMemory("targets")
                             .HostMemory("precision")
                             .TypeConstraint<int64_t>("T"),
-                        InTopK<CPUDevice, float, int64>);
+                        InTopK<CPUDevice, float, int64_t>);
 
 REGISTER_KERNEL_BUILDER(Name("InTopKV2")
                             .Device(DEVICE_CPU)
@@ -105,8 +105,8 @@ REGISTER_KERNEL_BUILDER(Name("InTopKV2")
                             .HostMemory("targets")
                             .HostMemory("k")
                             .HostMemory("precision")
-                            .TypeConstraint<int32>("T"),
-                        InTopK<CPUDevice, float, int32>);
+                            .TypeConstraint<int32_t>("T"),
+                        InTopK<CPUDevice, float, int32_t>);
 REGISTER_KERNEL_BUILDER(Name("InTopKV2")
                             .Device(DEVICE_CPU)
                             .HostMemory("predictions")
@@ -114,7 +114,7 @@ REGISTER_KERNEL_BUILDER(Name("InTopKV2")
                             .HostMemory("k")
                             .HostMemory("precision")
                             .TypeConstraint<int64_t>("T"),
-                        InTopK<CPUDevice, float, int64>);
+                        InTopK<CPUDevice, float, int64_t>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
diff --git a/tensorflow/core/kernels/in_topk_op.h b/tensorflow/core/kernels/in_topk_op.h
index 877777642ebeb6..ad10dad72bf717 100644
--- a/tensorflow/core/kernels/in_topk_op.h
+++ b/tensorflow/core/kernels/in_topk_op.h
@@ -62,7 +62,7 @@ struct InTopKFunctor<CPUDevice, T, TargetT> {
     int64_t k_val = k.k_value;
     if (k.k_tensor != nullptr) {
       if (k.k_tensor->dtype() == DT_INT32) {
-        k_val = k.k_tensor->scalar<int32>()();
+        k_val = k.k_tensor->scalar<int32_t>()();
       } else {
         k_val = k.k_tensor->scalar<int64_t>()();
       }
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 45db7d3b2d3f49..1380406a8a33cc 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -106,7 +106,7 @@ class ParallelConcatUpdate : public OpKernel {
   }
 
  private:
-  int32 loc_;
+  int32_t loc_;
 };
 
 template <typename Device, typename T>
@@ -251,7 +251,7 @@ namespace functor {
 template <typename T>
 void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i,
                  const Tensor& v, Tensor* y) {
-  auto Ti = i.flat<int32>();
+  auto Ti = i.flat<int32_t>();
   auto Tv = v.flat_outer_dims<T>();
   auto Ty = y->flat_outer_dims<T>();
   auto nrows = Ty.dimension(0);
@@ -274,7 +274,7 @@ void DoInplaceOp(const CPUDevice& d, InplaceOpType op, const Tensor& i,
 // String type only supports inplace update.
 void DoInplaceStringUpdateOp(const CPUDevice& d, const Tensor& i,
                              const Tensor& v, Tensor* y) {
-  auto Ti = i.flat<int32>();
+  auto Ti = i.flat<int32_t>();
   auto Tv = v.flat_outer_dims<tstring>();
   auto Ty = y->flat_outer_dims<tstring>();
   auto nrows = Ty.dimension(0);
@@ -398,10 +398,10 @@ class EmptyOp : public OpKernel {
         ctx, TensorShapeUtils::IsVector(shape.shape()),
         errors::InvalidArgument("shape must be a vector of int32, got shape ",
                                 shape.shape().DebugString()));
-    auto dims = shape.flat<int32>();
+    auto dims = shape.flat<int32_t>();
     TensorShape out_shape;
     OP_REQUIRES_OK(ctx, TensorShapeUtils::MakeShape(
-                            reinterpret_cast<const int32*>(dims.data()),
+                            reinterpret_cast<const int32_t*>(dims.data()),
                             dims.size(), &out_shape));
     Tensor* out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
@@ -474,7 +474,7 @@ REGISTER_KERNEL_BUILDER(Name("InplaceUpdate")
                             .HostMemory("i")
                             .HostMemory("v")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
+                            .TypeConstraint<int32_t>("T"),
                         InplaceOp<CPUDevice, functor::I_UPDATE>);
 REGISTER_KERNEL_BUILDER(Name("InplaceAdd")
                             .Device(DEVICE_DEFAULT)
@@ -482,7 +482,7 @@ REGISTER_KERNEL_BUILDER(Name("InplaceAdd")
                             .HostMemory("i")
                             .HostMemory("v")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
+                            .TypeConstraint<int32_t>("T"),
                         InplaceOp<CPUDevice, functor::I_ADD>);
 REGISTER_KERNEL_BUILDER(Name("InplaceSub")
                             .Device(DEVICE_DEFAULT)
@@ -490,14 +490,14 @@ REGISTER_KERNEL_BUILDER(Name("InplaceSub")
                             .HostMemory("i")
                             .HostMemory("v")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
+                            .TypeConstraint<int32_t>("T"),
                         InplaceOp<CPUDevice, functor::I_SUB>);
 
 REGISTER_KERNEL_BUILDER(Name("DeepCopy")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
+                            .TypeConstraint<int32_t>("T"),
                         CopyOp<CPUDevice>);
 
 }  // end namespace

From 8f2afda3704da057a5e1ab1cd31991015099ae1b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 05:27:23 -0800
Subject: [PATCH 430/753] Automated Code Change

PiperOrigin-RevId: 845723345
---
 .../core/kernels/conv_grad_shape_utils.cc     | 25 +++++++++--------
 .../core/kernels/conv_grad_shape_utils.h      |  6 ++--
 tensorflow/core/kernels/conv_ops.cc           |  2 +-
 tensorflow/core/kernels/conv_ops.h            | 10 +++----
 tensorflow/core/kernels/conv_ops_3d.cc        |  6 ++--
 .../core/kernels/conv_ops_benchmark_test.cc   | 15 +++++-----
 .../kernels/conv_ops_fused_image_transform.cc |  4 +--
 tensorflow/core/kernels/conv_ops_fused_impl.h |  4 +--
 .../core/kernels/conv_ops_fused_int8.cc       |  5 ++--
 tensorflow/core/kernels/conv_ops_impl.h       | 10 +++----
 tensorflow/core/kernels/conv_ops_int32.cc     |  8 +++---
 tensorflow/core/kernels/conv_ops_test.cc      | 28 ++++++++++---------
 .../core/kernels/conv_ops_using_gemm.cc       |  4 +--
 tensorflow/core/kernels/count_up_to_op.cc     |  2 +-
 tensorflow/core/kernels/ctc_decoder_ops.cc    |  6 ++--
 tensorflow/core/kernels/ctc_loss_op.cc        |  4 +--
 tensorflow/core/kernels/cwise_op_abs.cc       |  6 ++--
 tensorflow/core/kernels/cwise_op_acos.cc      |  6 ++--
 tensorflow/core/kernels/cwise_op_add_1.cc     | 14 +++++-----
 tensorflow/core/kernels/cwise_op_add_2.cc     |  8 +++---
 .../core/kernels/cwise_op_bitwise_and.cc      |  4 +--
 .../core/kernels/cwise_op_bitwise_or.cc       |  4 +--
 .../core/kernels/cwise_op_bitwise_xor.cc      |  4 +--
 tensorflow/core/kernels/cwise_op_clip.cc      | 10 +++----
 tensorflow/core/kernels/cwise_op_div.cc       | 12 ++++----
 .../core/kernels/cwise_op_equal_to_1.cc       | 10 +++----
 .../core/kernels/cwise_op_equal_to_2.cc       |  4 +--
 tensorflow/core/kernels/cwise_op_floor_div.cc |  8 +++---
 tensorflow/core/kernels/cwise_op_floor_mod.cc |  8 +++---
 tensorflow/core/kernels/cwise_op_greater.cc   |  9 +++---
 .../core/kernels/cwise_op_greater_equal.cc    | 11 ++++----
 tensorflow/core/kernels/cwise_op_invert.cc    |  4 +--
 .../core/kernels/cwise_op_left_shift.cc       |  4 +--
 tensorflow/core/kernels/cwise_op_less.cc      | 10 +++----
 .../core/kernels/cwise_op_less_equal.cc       | 10 +++----
 tensorflow/core/kernels/cwise_op_maximum.cc   |  8 +++---
 tensorflow/core/kernels/cwise_op_minimum.cc   |  8 +++---
 tensorflow/core/kernels/cwise_op_mod.cc       | 12 ++++----
 tensorflow/core/kernels/cwise_op_mul_1.cc     |  8 +++---
 tensorflow/core/kernels/cwise_op_mul_2.cc     |  4 +--
 tensorflow/core/kernels/cwise_op_neg_1.cc     |  6 ++--
 .../core/kernels/cwise_op_not_equal_to_1.cc   | 10 +++----
 .../core/kernels/cwise_op_not_equal_to_2.cc   |  2 +-
 tensorflow/core/kernels/cwise_op_pow.cc       |  3 +-
 .../core/kernels/cwise_op_right_shift.cc      |  4 +--
 tensorflow/core/kernels/cwise_op_round.cc     |  2 +-
 tensorflow/core/kernels/cwise_op_sign.cc      |  7 +++--
 tensorflow/core/kernels/cwise_op_square.cc    | 10 +++----
 .../kernels/cwise_op_squared_difference.cc    | 10 +++----
 tensorflow/core/kernels/cwise_op_sub.cc       | 12 ++++----
 50 files changed, 198 insertions(+), 193 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_shape_utils.cc b/tensorflow/core/kernels/conv_grad_shape_utils.cc
index 42e114ad33581d..a7e53647b72bf9 100644
--- a/tensorflow/core/kernels/conv_grad_shape_utils.cc
+++ b/tensorflow/core/kernels/conv_grad_shape_utils.cc
@@ -53,10 +53,10 @@ namespace {
 absl::Status ConvBackpropExtractAndVerifyDimension(
     absl::string_view label, const TensorShape& input_shape,
     const TensorShape& filter_shape, const TensorShape& output_shape,
-    const absl::Span<const int32> dilations, const std::vector<int32>& strides,
-    Padding padding, int64_t padding_before, int64_t padding_after,
-    int spatial_dim, int filter_spatial_dim,
-    ConvBackpropSpatialDimension* dim) {
+    const absl::Span<const int32_t> dilations,
+    const std::vector<int32_t>& strides, Padding padding,
+    int64_t padding_before, int64_t padding_after, int spatial_dim,
+    int filter_spatial_dim, ConvBackpropSpatialDimension* dim) {
   dim->input_size = input_shape.dim_size(spatial_dim);
   dim->filter_size = filter_shape.dim_size(filter_spatial_dim);
   dim->output_size = output_shape.dim_size(spatial_dim);
@@ -96,9 +96,10 @@ absl::Status ConvBackpropComputeDimensionsV2(
     absl::string_view label, int num_spatial_dims,
     const TensorShape& input_shape, const TensorShape& filter_shape,
     const TensorShape& out_backprop_shape,
-    const absl::Span<const int32> dilations, const std::vector<int32>& strides,
-    Padding padding, absl::Span<const int64_t> explicit_paddings,
-    TensorFormat data_format, ConvBackpropDimensions* dims) {
+    const absl::Span<const int32_t> dilations,
+    const std::vector<int32_t>& strides, Padding padding,
+    absl::Span<const int64_t> explicit_paddings, TensorFormat data_format,
+    ConvBackpropDimensions* dims) {
   // The + 2 in the following line is for the batch and feature dimensions.
   const int num_dims = num_spatial_dims + 2;
   if (input_shape.dims() != num_dims) {
@@ -161,9 +162,9 @@ absl::Status ConvBackpropComputeDimensionsV2(
 absl::Status ConvBackpropComputeDimensions(
     absl::string_view label, int num_spatial_dims,
     const TensorShape& input_shape, const TensorShape& filter_shape,
-    const TensorShape& out_backprop_shape, const std::vector<int32>& strides,
+    const TensorShape& out_backprop_shape, const std::vector<int32_t>& strides,
     Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims) {
-  static constexpr std::array<int32, 5> one_dilations = {{1, 1, 1, 1, 1}};
+  static constexpr std::array<int32_t, 5> one_dilations = {{1, 1, 1, 1, 1}};
   return ConvBackpropComputeDimensionsV2(
       label, num_spatial_dims, input_shape, filter_shape, out_backprop_shape,
       one_dilations, strides, padding, /*explicit_paddings=*/{}, data_format,
@@ -181,13 +182,13 @@ absl::Status Conv2DBackpropComputeInputShape(
   }
 
   if (input_sizes.dim_size(0) == 4) {
-    return TensorShapeUtils::MakeShape(input_sizes.vec<int32>(), input_shape);
+    return TensorShapeUtils::MakeShape(input_sizes.vec<int32_t>(), input_shape);
   }
 
   if (input_sizes.dim_size(0) == 2) {
     const int batch_size = GetTensorDim(out_backprop_shape, data_format, 'N');
-    const int output_height = input_sizes.vec<int32>()(0);
-    const int output_width = input_sizes.vec<int32>()(1);
+    const int output_height = input_sizes.vec<int32_t>()(0);
+    const int output_width = input_sizes.vec<int32_t>()(1);
     const int output_depth = filter_shape.dim_size(2);
     if (output_height < 0 || output_width < 0) {
       return errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/conv_grad_shape_utils.h b/tensorflow/core/kernels/conv_grad_shape_utils.h
index d83c1bb25ee02f..cc0708c4fe4f74 100644
--- a/tensorflow/core/kernels/conv_grad_shape_utils.h
+++ b/tensorflow/core/kernels/conv_grad_shape_utils.h
@@ -69,7 +69,7 @@ struct ConvBackpropDimensions {
 absl::Status ConvBackpropComputeDimensions(
     absl::string_view label, int num_spatial_dims,
     const TensorShape& input_shape, const TensorShape& filter_shape,
-    const TensorShape& out_backprop_shape, const std::vector<int32>& strides,
+    const TensorShape& out_backprop_shape, const std::vector<int32_t>& strides,
     Padding padding, TensorFormat data_format, ConvBackpropDimensions* dims);
 
 // The V2 version computes the same outputs with arbitrary dilation rate and
@@ -78,8 +78,8 @@ absl::Status ConvBackpropComputeDimensions(
 absl::Status ConvBackpropComputeDimensionsV2(
     absl::string_view label, int num_spatial_dims,
     const TensorShape& input_shape, const TensorShape& filter_shape,
-    const TensorShape& out_backprop_shape, absl::Span<const int32> dilations,
-    const std::vector<int32>& strides, Padding padding,
+    const TensorShape& out_backprop_shape, absl::Span<const int32_t> dilations,
+    const std::vector<int32_t>& strides, Padding padding,
     absl::Span<const int64_t> explicit_paddings, TensorFormat data_format,
     ConvBackpropDimensions* dims);
 
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 33b114df328cef..51003ace32475c 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -48,7 +48,7 @@ absl::Status InitConv2DParameters(const OpKernelConstruction* context,
     TF_RETURN_IF_ERROR(
         context->GetAttr("explicit_paddings", &params->explicit_paddings));
   }
-  string data_format_string;
+  std::string data_format_string;
   TF_RETURN_IF_ERROR(context->GetAttr("data_format", &data_format_string));
   TF_REQUIRES(FormatFromString(data_format_string, &params->data_format),
               errors::InvalidArgument("Invalid data format"));
diff --git a/tensorflow/core/kernels/conv_ops.h b/tensorflow/core/kernels/conv_ops.h
index 65c63fec1e439f..199cd94c99cbaa 100644
--- a/tensorflow/core/kernels/conv_ops.h
+++ b/tensorflow/core/kernels/conv_ops.h
@@ -45,8 +45,8 @@ template <typename Device, typename T>
 struct LaunchConvOp {
   void operator()(OpKernelContext* context, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter,
-                  const std::vector<int64>& dilations,
-                  const std::vector<int64>& strides, Padding padding,
+                  const std::vector<int64_t>& dilations,
+                  const std::vector<int64_t>& strides, Padding padding,
                   const std::vector<int64_t>& explicit_paddings,
                   TensorFormat data_format, Tensor* output);
 };
@@ -85,13 +85,13 @@ struct Im2ColBufferResource : public ResourceBase {
   // the buffer memory held by this resource.
   mutex mu;
   T* data;
-  string DebugString() const { return "Im2ColBufferResource"; }
+  std::string DebugString() const { return "Im2ColBufferResource"; }
 };
 
 // Convolution parameters specified by Op attributes.
 struct Conv2DParameters {
-  std::vector<int32> dilations;
-  std::vector<int32> strides;
+  std::vector<int32_t> dilations;
+  std::vector<int32_t> strides;
   Padding padding;
   TensorFormat data_format;
   std::vector<int64_t> explicit_paddings;
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 72bad756b4d0fd..00c02ccd51c711 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -65,7 +65,7 @@ template <typename Device, typename T>
 class Conv3DOp : public BinaryOp<T> {
  public:
   explicit Conv3DOp(OpKernelConstruction* context) : BinaryOp<T>(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -175,8 +175,8 @@ class Conv3DOp : public BinaryOp<T> {
   }
 
  private:
-  std::vector<int32> dilation_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> dilation_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
   bool cudnn_use_autotune_;
diff --git a/tensorflow/core/kernels/conv_ops_benchmark_test.cc b/tensorflow/core/kernels/conv_ops_benchmark_test.cc
index 183372705aa3df..779fbb7a50bcd6 100644
--- a/tensorflow/core/kernels/conv_ops_benchmark_test.cc
+++ b/tensorflow/core/kernels/conv_ops_benchmark_test.cc
@@ -189,7 +189,7 @@ static int64_t Conv2DWithPostOpsFlops(int batch, int height, int width,
 template <typename T>
 static Conv2DWithBiasAndActivationGraph Conv2DWithBiasAndActivation(
     int batch, int height, int width, int in_depth, int filter_w, int filter_h,
-    int out_depth, const string& activation_type,
+    int out_depth, const std::string& activation_type,
     TensorFormat data_format = FORMAT_NHWC) {
   Conv2DWithBiasGraph conv_graph =
       Conv2DWithBias<T>(batch, height, width, in_depth, filter_w, filter_h,
@@ -249,7 +249,7 @@ static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(
 template <typename T>
 static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation(
     int batch, int height, int width, int in_depth, int filter_w, int filter_h,
-    int out_depth, const string& activation_type,
+    int out_depth, const std::string& activation_type,
     TensorFormat data_format = FORMAT_NHWC) {
   Conv2DWithBatchNormGraph conv_graph =
       Conv2DWithBatchNorm<T>(batch, height, width, in_depth, filter_w, filter_h,
@@ -271,11 +271,10 @@ static Conv2DWithBatchNormAndActivationGraph Conv2DWithBatchNormAndActivation(
 // Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and
 // fuses into it additional computations (e.g. Relu).
 template <typename T>
-static Graph* FusedConv2DWithBias(int batch, int height, int width,
-                                  int in_depth, int filter_w, int filter_h,
-                                  int out_depth,
-                                  const std::vector<string>& fused_ops = {},
-                                  TensorFormat data_format = FORMAT_NHWC) {
+static Graph* FusedConv2DWithBias(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth, const std::vector<std::string>& fused_ops = {},
+    TensorFormat data_format = FORMAT_NHWC) {
   Graph* graph = new Graph(OpRegistry::Global());
 
   Tensor images_t = data_format == FORMAT_NHWC
@@ -341,7 +340,7 @@ static Graph* FusedConv2DWithBias(int batch, int height, int width,
 template <typename T>
 static Graph* FusedConv2DWithBatchNorm(
     int batch, int height, int width, int in_depth, int filter_w, int filter_h,
-    int out_depth, const std::vector<string>& fused_ops = {},
+    int out_depth, const std::vector<std::string>& fused_ops = {},
     TensorFormat data_format = FORMAT_NHWC) {
   Graph* graph = new Graph(OpRegistry::Global());
 
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
index 8887103240c9d7..ef031685c4093e 100644
--- a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -711,7 +711,7 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
 
     // Compute the shape of the output tensor, and allocate it.
     TensorShape padded_shape;
-    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
+    TTypes<int32_t>::ConstMatrix paddings_matrix = paddings.matrix<int32_t>();
     for (int d = 0; d < dims; ++d) {
       const int32_t before =
           paddings_matrix(d, 0);  // Pad before existing elements.
@@ -867,7 +867,7 @@ class FusedResizeConv2DUsingGemmOp : public OpKernel {
   }
 
  private:
-  std::vector<int32> strides_;
+  std::vector<int32_t> strides_;
   Padding padding_;
   bool align_corners_;
   int offset_;
diff --git a/tensorflow/core/kernels/conv_ops_fused_impl.h b/tensorflow/core/kernels/conv_ops_fused_impl.h
index 51a33288c8e8bb..154f43a226cfdb 100644
--- a/tensorflow/core/kernels/conv_ops_fused_impl.h
+++ b/tensorflow/core/kernels/conv_ops_fused_impl.h
@@ -307,7 +307,7 @@ struct LaunchFusedConv2DOp<CPUDevice, T> {
 };
 
 template <>
-struct LaunchFusedConv2DOp<CPUDevice, int8>;
+struct LaunchFusedConv2DOp<CPUDevice, int8_t>;
 
 template <>
 struct LaunchFusedConv2DOp<CPUDevice, qint8>;
@@ -732,7 +732,7 @@ class FusedConv2DOp : public OpKernel {
     // convolution with BiasAdd, but in practice it doesn't work, cuDNN ignores
     // this parameter and always does Relu activation.
     if (std::is_same<Device, GPUDevice>::value) {
-      if (std::is_same<T, int8>::value || std::is_same<T, qint8>::value) {
+      if (std::is_same<T, int8_t>::value || std::is_same<T, qint8>::value) {
         patterns = {{FCT::kBiasAdd, {"BiasAdd"}},
                     {FCT::kBiasAddWithRelu, {"BiasAdd", "Relu"}}};
       } else {
diff --git a/tensorflow/core/kernels/conv_ops_fused_int8.cc b/tensorflow/core/kernels/conv_ops_fused_int8.cc
index 7f919d5087dbbe..e23864960c1568 100644
--- a/tensorflow/core/kernels/conv_ops_fused_int8.cc
+++ b/tensorflow/core/kernels/conv_ops_fused_int8.cc
@@ -300,9 +300,8 @@ struct LaunchFusedConv2DOpCpuInt8Helper {
 };
 
 template <>
-struct LaunchFusedConv2DOp<CPUDevice, int8>
-    : LaunchFusedConv2DOpCpuInt8Helper<int8> {
-};
+struct LaunchFusedConv2DOp<CPUDevice, int8_t>
+    : LaunchFusedConv2DOpCpuInt8Helper<int8_t> {};
 
 template <>
 struct LaunchFusedConv2DOp<CPUDevice, qint8>
diff --git a/tensorflow/core/kernels/conv_ops_impl.h b/tensorflow/core/kernels/conv_ops_impl.h
index 0d3fc798bbe3c2..3d5a0ac76e5b5b 100644
--- a/tensorflow/core/kernels/conv_ops_impl.h
+++ b/tensorflow/core/kernels/conv_ops_impl.h
@@ -178,13 +178,13 @@ struct LaunchGrouped {
     std::array<int64_t, 5> shuffle({3, 0, 1, 2, 4});
 
     // Compute pre shuffle dimemnsions.
-    auto pre_shuffle = [&](const Tensor& tensor) -> std::array<int64, 5> {
+    auto pre_shuffle = [&](const Tensor& tensor) -> std::array<int64_t, 5> {
       return {tensor.dim_size(0), tensor.dim_size(1), tensor.dim_size(2),
               num_groups, tensor.dim_size(3) / num_groups};
     };
 
     // Compute post shuffle dimemnsions.
-    auto post_shuffle = [&](const Tensor& tensor) -> std::array<int64, 5> {
+    auto post_shuffle = [&](const Tensor& tensor) -> std::array<int64_t, 5> {
       return {num_groups, tensor.dim_size(0), tensor.dim_size(1),
               tensor.dim_size(2), tensor.dim_size(3) / num_groups};
     };
@@ -262,8 +262,8 @@ template <typename T>
 struct LaunchConvOp<CPUDevice, T> {
   void operator()(OpKernelContext* context, bool cudnn_use_autotune,
                   const Tensor& input, const Tensor& filter,
-                  const std::vector<int64>& dilations,
-                  const std::vector<int64>& strides, const Padding padding,
+                  const std::vector<int64_t>& dilations,
+                  const std::vector<int64_t>& strides, const Padding padding,
                   const std::vector<int64_t>& explicit_paddings,
                   TensorFormat data_format, Tensor* output) {
     // For now just calling existing launchers based on spatial dimensions.
@@ -292,7 +292,7 @@ class ConvOp : public BinaryOp<T> {
     OP_REQUIRES(context, groups_ == 1,
                 absl::UnimplementedError(
                     "Grouped/Depthwise Convolutions are not supported yet."));
-    string data_format_str;
+    std::string data_format_str;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str));
     OP_REQUIRES(context,
                 data_format_str == "CHANNELS_LAST" ||
diff --git a/tensorflow/core/kernels/conv_ops_int32.cc b/tensorflow/core/kernels/conv_ops_int32.cc
index 46320bded04997..a582aeb4b7277c 100644
--- a/tensorflow/core/kernels/conv_ops_int32.cc
+++ b/tensorflow/core/kernels/conv_ops_int32.cc
@@ -30,12 +30,12 @@ template struct Conv2DOp<CPUDevice, int32>;
 // CPU implementation, don't register this EigenTensor-based version.
 #if !defined(USE_GEMM_FOR_CONV)
 REGISTER_KERNEL_BUILDER(
-    Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
-    Conv2DOp<CPUDevice, int32>);
+    Name("Conv2D").Device(DEVICE_CPU).TypeConstraint<int32_t>("T"),
+    Conv2DOp<CPUDevice, int32_t>);
 #endif  // USE_GEMM_FOR_CONV
 REGISTER_KERNEL_BUILDER(
-    Name("Conv").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
-    ConvOp<CPUDevice, int32>);
+    Name("Conv").Device(DEVICE_CPU).TypeConstraint<int32_t>("T"),
+    ConvOp<CPUDevice, int32_t>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 template <>
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 929d5cb51b4c08..caff583b570092 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -86,8 +86,9 @@ class FusedResizePadConvOpTest : public OpsTestBase {
     const int right_padding = 0;
 
     AddInputFromArray<T>(image.shape(), image.flat<T>());
-    AddInputFromArray<int32>(TensorShape({2}), {resized_height, resized_width});
-    AddInputFromArray<int32>(
+    AddInputFromArray<int32_t>(TensorShape({2}),
+                               {resized_height, resized_width});
+    AddInputFromArray<int32_t>(
         TensorShape({4, 2}),
         {0, 0, top_padding, bottom_padding, left_padding, right_padding, 0, 0});
     AddInputFromArray<T>(filter.shape(), filter.flat<T>());
@@ -128,8 +129,8 @@ class FusedResizePadConvOpTest : public OpsTestBase {
                                int resize_height, int y_padding, int x_padding,
                                int filter_size, int filter_count,
                                bool resize_align_corners,
-                               const string& pad_mode, int stride,
-                               const string& padding, DataType dtype) {
+                               const std::string& pad_mode, int stride,
+                               const std::string& padding, DataType dtype) {
     Scope root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
 
@@ -188,8 +189,9 @@ class FusedResizePadConvOpTest : public OpsTestBase {
   void CompareFusedPadOnlyAndSeparate(int input_width, int input_height,
                                       int input_depth, int y_padding,
                                       int x_padding, int filter_size,
-                                      int filter_count, const string& pad_mode,
-                                      int stride, const string& padding,
+                                      int filter_count,
+                                      const std::string& pad_mode, int stride,
+                                      const std::string& padding,
                                       DataType dtype) {
     Scope root = tensorflow::Scope::NewRootScope();
     using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
@@ -488,7 +490,7 @@ class FusedConv2DOpTest : public OpsTestBase {
   static constexpr int kImageBatchCount = 8;
 
   static constexpr bool kIsInt8 =
-      std::is_same<T, int8>::value || std::is_same<T, qint8>::value;
+      std::is_same<T, int8_t>::value || std::is_same<T, qint8>::value;
 
   using BiasAddGraphRunner =
       std::function<void(const Tensor& input_data, const Tensor& filter_data,
@@ -680,7 +682,7 @@ class FusedConv2DOpTest : public OpsTestBase {
       const Tensor& input_data, const Tensor& filter_data,
       const Tensor& scale_data, const Tensor& offset_data,
       const Tensor& mean_data, const Tensor& variance_data,
-      const string& activation_type, const std::string& padding,
+      const std::string& activation_type, const std::string& padding,
       const std::vector<int>& explicit_paddings, Tensor* output,
       bool allow_gpu_device = false, int stride = 1) {
     Scope root = tensorflow::Scope::NewRootScope();
@@ -780,7 +782,7 @@ class FusedConv2DOpTest : public OpsTestBase {
         TensorShape shape = arg_data.shape();
         Tensor arg_data_float = Tensor(dtype_args, shape);
         for (int index = 0; index < arg_data.NumElements(); index++) {
-          int8 v = *(reinterpret_cast<int8*>(arg_data.data()) + index);
+          int8_t v = *(reinterpret_cast<int8_t*>(arg_data.data()) + index);
           *(reinterpret_cast<float*>(arg_data_float.data()) + index) =
               static_cast<float>(v);
         }
@@ -886,7 +888,7 @@ class FusedConv2DOpTest : public OpsTestBase {
 
   void ExpectMatch(const Tensor& x, const Tensor& y, double atol) {
     constexpr bool exact_match =
-        std::is_same<T, int8>::value || std::is_same<T, qint8>::value;
+        std::is_same<T, int8_t>::value || std::is_same<T, qint8>::value;
     if (exact_match) {
       test::ExpectEqual(x, y);
     } else {
@@ -903,7 +905,7 @@ class FusedConv2DOpTest : public OpsTestBase {
 
     constexpr int int8_scale = 80;
 
-    using ConvT = typename std::conditional<kIsInt8, int8, T>::type;
+    using ConvT = typename std::conditional<kIsInt8, int8_t, T>::type;
     DataType dtype_conv = DataTypeToEnum<ConvT>::v();
 
     TensorShape image_shape{image_batch_count, image_height, image_width,
@@ -1120,7 +1122,7 @@ class FusedConv2DOpTest : public OpsTestBase {
   // Verifies that computing Conv2D+FusedBatchNorm+{Activation} in a graph is
   // identical to FusedConv2D.
   void VerifyConv2DWithBatchNormAndActivation(
-      const string& activation, int filter_size, int filter_count,
+      const std::string& activation, int filter_size, int filter_count,
       const std::vector<int>& explicit_paddings = {}, int depth = kDepth,
       int image_width = kImageWidth, int image_height = kImageHeight,
       int image_batch_count = kImageBatchCount) {
@@ -1353,7 +1355,7 @@ REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest,     //
                             SpatialConvolutionAndActivation);
 #endif
 
-using FusedBiasAddDataTypes = ::testing::Types<float, double, int8, qint8>;
+using FusedBiasAddDataTypes = ::testing::Types<float, double, int8_t, qint8>;
 INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest,
                                FusedBiasAddDataTypes);
 
diff --git a/tensorflow/core/kernels/conv_ops_using_gemm.cc b/tensorflow/core/kernels/conv_ops_using_gemm.cc
index 3ebd3a4fa76d93..531b6377b2ff64 100644
--- a/tensorflow/core/kernels/conv_ops_using_gemm.cc
+++ b/tensorflow/core/kernels/conv_ops_using_gemm.cc
@@ -433,7 +433,7 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
   explicit Conv2DUsingGemmOp(OpKernelConstruction* context)
       : BinaryOp<T>(context) {
     OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -557,7 +557,7 @@ class Conv2DUsingGemmOp : public BinaryOp<T> {
   }
 
  private:
-  std::vector<int32> strides_;
+  std::vector<int32_t> strides_;
   Padding padding_;
   TensorFormat data_format_;
 
diff --git a/tensorflow/core/kernels/count_up_to_op.cc b/tensorflow/core/kernels/count_up_to_op.cc
index 5abc17a8aa2aaf..fe0709186c6809 100644
--- a/tensorflow/core/kernels/count_up_to_op.cc
+++ b/tensorflow/core/kernels/count_up_to_op.cc
@@ -102,7 +102,7 @@ class ResourceCountUpToOp : public OpKernel {
       Name("ResourceCountUpTo").TypeConstraint<TYPE>("T").Device(DEVICE_CPU), \
       ResourceCountUpToOp<TYPE>)
 
-REGISTER(int32);
+REGISTER(int32_t);
 REGISTER(int64_t);
 
 #undef REGISTER
diff --git a/tensorflow/core/kernels/ctc_decoder_ops.cc b/tensorflow/core/kernels/ctc_decoder_ops.cc
index 401f1572298d9b..7c6d9132dd2142 100644
--- a/tensorflow/core/kernels/ctc_decoder_ops.cc
+++ b/tensorflow/core/kernels/ctc_decoder_ops.cc
@@ -91,7 +91,7 @@ class CTCDecodeHelper {
           " batch_size: ", batch_size);
     }
 
-    auto seq_len_t = (*seq_len)->vec<int32>();
+    auto seq_len_t = (*seq_len)->vec<int32_t>();
 
     for (int b = 0; b < batch_size; ++b) {
       if (!(seq_len_t(b) <= max_time)) {
@@ -220,7 +220,7 @@ class CTCGreedyDecoderOp : public OpKernel {
       input_list_t.emplace_back(inputs_t.data() + t * batch_size * num_classes,
                                 batch_size, num_classes);
     }
-    auto seq_len_t = seq_len->vec<int32>();
+    auto seq_len_t = seq_len->vec<int32_t>();
     auto log_prob_t = log_prob->matrix<T>();
 
     log_prob_t.setZero();
@@ -309,7 +309,7 @@ class CTCBeamSearchDecoderOp : public OpKernel {
                             &decoded_values, &decoded_shape));
 
     auto inputs_t = inputs->tensor<T, 3>();
-    auto seq_len_t = seq_len->vec<int32>();
+    auto seq_len_t = seq_len->vec<int32_t>();
     auto log_prob_t = log_prob->matrix<T>();
 
     const TensorShape& inputs_shape = inputs->shape();
diff --git a/tensorflow/core/kernels/ctc_loss_op.cc b/tensorflow/core/kernels/ctc_loss_op.cc
index 63d31fcf62d46d..a1b851feb206db 100644
--- a/tensorflow/core/kernels/ctc_loss_op.cc
+++ b/tensorflow/core/kernels/ctc_loss_op.cc
@@ -127,7 +127,7 @@ class CTCLossOp : public OpKernel {
         errors::InvalidArgument("len(sequence_length) != batch_size.  ",
                                 "len(sequence_length):  ", seq_len->dim_size(0),
                                 " batch_size: ", batch_size));
-    auto seq_len_t = seq_len->vec<int32>();
+    auto seq_len_t = seq_len->vec<int32_t>();
 
     OP_REQUIRES(ctx, labels_indices->dim_size(0) == labels_values->dim_size(0),
                 errors::InvalidArgument(
@@ -166,7 +166,7 @@ class CTCLossOp : public OpKernel {
                                           0, " and ", batch_size,
                                           " but saw: ", batch_indices));
 
-      auto values = g.values<int32>();
+      auto values = g.values<int32_t>();
       std::vector<int>* b_values = &labels_t[batch_indices];
       b_values->resize(values.size());
       for (int i = 0; i < values.size(); ++i) (*b_values)[i] = values(i);
diff --git a/tensorflow/core/kernels/cwise_op_abs.cc b/tensorflow/core/kernels/cwise_op_abs.cc
index 0f32478dcc7dc6..4f3e04d7cd4c7f 100644
--- a/tensorflow/core/kernels/cwise_op_abs.cc
+++ b/tensorflow/core/kernels/cwise_op_abs.cc
@@ -18,7 +18,7 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER8(UnaryOp, CPU, "Abs", functor::abs, Eigen::half, bfloat16, float,
-          double, int8, int16, int32, int64_t);
+          double, int8_t, int16_t, int32_t, int64_t);
 
 REGISTER2(UnaryOp, CPU, "ComplexAbs", functor::abs, complex64, complex128);
 
@@ -44,7 +44,7 @@ REGISTER_KERNEL_BUILDER(Name("Abs")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
-                        UnaryOp<CPUDevice, functor::abs<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        UnaryOp<CPUDevice, functor::abs<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_acos.cc b/tensorflow/core/kernels/cwise_op_acos.cc
index f2ca5677469f18..50fce03c1e0f95 100644
--- a/tensorflow/core/kernels/cwise_op_acos.cc
+++ b/tensorflow/core/kernels/cwise_op_acos.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER5(UnaryOp, CPU, "Acos", functor::acos, Eigen::half, bfloat16, float,
-          double, int8);
-REGISTER5(UnaryOp, CPU, "Acos", functor::acos, int16, int32, int64_t, complex64,
-          complex128);
+          double, int8_t);
+REGISTER5(UnaryOp, CPU, "Acos", functor::acos, int16_t, int32_t, int64_t,
+          complex64, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_add_1.cc b/tensorflow/core/kernels/cwise_op_add_1.cc
index fdb3de69b65033..35483f244836fa 100644
--- a/tensorflow/core/kernels/cwise_op_add_1.cc
+++ b/tensorflow/core/kernels/cwise_op_add_1.cc
@@ -16,11 +16,11 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER6(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double, int32,
-          int64_t, bfloat16);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, float, Eigen::half, double,
+          int32_t, int64_t, bfloat16);
 
 REGISTER6(BinaryOp, CPU, "AddV2", functor::add, float, Eigen::half, double,
-          int32, int64_t, bfloat16);
+          int32_t, int64_t, bfloat16);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -55,14 +55,14 @@ REGISTER_KERNEL_BUILDER(Name("Add")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::add<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::add<int32_t>>);
 REGISTER_KERNEL_BUILDER(Name("AddV2")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::add<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::add<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_add_2.cc b/tensorflow/core/kernels/cwise_op_add_2.cc
index 088a10fcb738f7..bb897eedca48b0 100644
--- a/tensorflow/core/kernels/cwise_op_add_2.cc
+++ b/tensorflow/core/kernels/cwise_op_add_2.cc
@@ -22,13 +22,13 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Add", functor::add, int8, int16, complex64, uint8,
-          complex128, tstring);
+REGISTER6(BinaryOp, CPU, "Add", functor::add, int8_t, int16_t, complex64,
+          uint8_t, complex128, tstring);
 
 // Notice: String is excluded to allow marking AddV2 is_commutative and
 // is_aggregate.
-REGISTER8(BinaryOp, CPU, "AddV2", functor::add, int8, int16, complex64, uint8,
-          uint16, uint32, uint64, complex128);
+REGISTER8(BinaryOp, CPU, "AddV2", functor::add, int8_t, int16_t, complex64,
+          uint8_t, uint16_t, uint32_t, uint64_t, complex128);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_and.cc b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
index 6509665e89864d..927c017cbabb82 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_and.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_and.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8, int16, int32,
-          int64_t, uint8, uint16, uint32, uint64);
+REGISTER8(BinaryOp, CPU, "BitwiseAnd", functor::bitwise_and, int8_t, int16_t,
+          int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_or.cc b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
index d974d249fac06f..aab01711419c2c 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_or.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_or.cc
@@ -19,8 +19,8 @@ namespace tensorflow {
 
 #if !defined(MLIR_GENERATED_CPU_KERNELS_ENABLED) || \
     !defined(MLIR_GENERATED_EXPERIMENTAL_KERNELS_ENABLED)
-REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8, int16, int32,
-          int64_t, uint8, uint16, uint32, uint64);
+REGISTER8(BinaryOp, CPU, "BitwiseOr", functor::bitwise_or, int8_t, int16_t,
+          int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 #endif
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
index 831f4d86c48277..a7a7c91fde59f0 100644
--- a/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
+++ b/tensorflow/core/kernels/cwise_op_bitwise_xor.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8, int16, int32,
-          int64_t, uint8, uint16, uint32, uint64);
+REGISTER8(BinaryOp, CPU, "BitwiseXor", functor::bitwise_xor, int8_t, int16_t,
+          int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_clip.cc b/tensorflow/core/kernels/cwise_op_clip.cc
index 949b162509ecff..f8cbd536b24731 100644
--- a/tensorflow/core/kernels/cwise_op_clip.cc
+++ b/tensorflow/core/kernels/cwise_op_clip.cc
@@ -269,12 +269,12 @@ REGISTER_CPU_KERNEL(Eigen::half);
 REGISTER_CPU_KERNEL(float);
 REGISTER_CPU_KERNEL(double);
 REGISTER_CPU_KERNEL(bfloat16);
-REGISTER_CPU_KERNEL(int8);
-REGISTER_CPU_KERNEL(int16);
-REGISTER_CPU_KERNEL(int32);
+REGISTER_CPU_KERNEL(int8_t);
+REGISTER_CPU_KERNEL(int16_t);
+REGISTER_CPU_KERNEL(int32_t);
 REGISTER_CPU_KERNEL(int64_t);
-REGISTER_CPU_KERNEL(uint8);
-REGISTER_CPU_KERNEL(uint16);
+REGISTER_CPU_KERNEL(uint8_t);
+REGISTER_CPU_KERNEL(uint16_t);
 REGISTER_CPU_KERNEL(std::complex<float>);
 REGISTER_CPU_KERNEL(std::complex<double>);
 #undef REGISTER_CPU_KERNEL
diff --git a/tensorflow/core/kernels/cwise_op_div.cc b/tensorflow/core/kernels/cwise_op_div.cc
index 8840579bdeccee..d537a7f39e0753 100644
--- a/tensorflow/core/kernels/cwise_op_div.cc
+++ b/tensorflow/core/kernels/cwise_op_div.cc
@@ -19,10 +19,10 @@ namespace tensorflow {
 
 REGISTER6(BinaryOp, CPU, "Div", functor::div, float, Eigen::half, double,
           bfloat16, complex64, complex128);
-REGISTER8(BinaryOp, CPU, "Div", functor::safe_div, uint8, uint16, uint32,
-          uint64, int8, int16, int32, int64_t);
-REGISTER8(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8, uint16,
-          uint32, uint64, int8, int16, int32, int64_t);
+REGISTER8(BinaryOp, CPU, "Div", functor::safe_div, uint8_t, uint16_t, uint32_t,
+          uint64_t, int8_t, int16_t, int32_t, int64_t);
+REGISTER8(BinaryOp, CPU, "TruncateDiv", functor::safe_div, uint8_t, uint16_t,
+          uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t);
 REGISTER4(BinaryOp, CPU, "TruncateDiv", functor::truncate_div_real, Eigen::half,
           bfloat16, float, double);
 REGISTER6(BinaryOp, CPU, "RealDiv", functor::div, float, Eigen::half, double,
@@ -35,8 +35,8 @@ REGISTER_KERNEL_BUILDER(Name("Div")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::safe_div<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::safe_div<int32_t>>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
index 87b499c96fdd05..7aecd4f62b2bf7 100644
--- a/tensorflow/core/kernels/cwise_op_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_equal_to_1.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER7(BinaryOp, CPU, "Equal", functor::equal_to, float, Eigen::half, double,
-          uint8, int8, int16, bfloat16);
-REGISTER8(BinaryOp, CPU, "Equal", functor::equal_to, uint16, uint32, uint64,
-          qint8, qint16, quint8, quint16, qint32);
+          uint8_t, int8_t, int16_t, bfloat16);
+REGISTER8(BinaryOp, CPU, "Equal", functor::equal_to, uint16_t, uint32_t,
+          uint64_t, qint8, qint16, quint8, quint16, qint32);
 REGISTER_KERNEL_BUILDER(
     Name("ApproximateEqual").Device(DEVICE_CPU).TypeConstraint<float>("T"),
     ApproximateEqualOp<CPUDevice, float>);
@@ -32,8 +32,8 @@ REGISTER_KERNEL_BUILDER(Name("Equal")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::equal_to<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::equal_to<int32_t>>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_equal_to_2.cc
index 29f15d7f7d9a67..e17cda8f2fbab6 100644
--- a/tensorflow/core/kernels/cwise_op_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_equal_to_2.cc
@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "Equal", functor::equal_to, int32, int64_t, complex64,
-          complex128, tstring, bool);
+REGISTER6(BinaryOp, CPU, "Equal", functor::equal_to, int32_t, int64_t,
+          complex64, complex128, tstring, bool);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER6(BinaryOp, GPU, "Equal", functor::equal_to, int8, int16, int64,
diff --git a/tensorflow/core/kernels/cwise_op_floor_div.cc b/tensorflow/core/kernels/cwise_op_floor_div.cc
index 0b4584988ad526..95c5652548004a 100644
--- a/tensorflow/core/kernels/cwise_op_floor_div.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_div.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER8(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
-          uint32, uint64, int8, int16, int32, int64_t);
+REGISTER8(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8_t, uint16_t,
+          uint32_t, uint64_t, int8_t, int16_t, int32_t, int64_t);
 REGISTER4(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
           Eigen::half, bfloat16, double);
 
@@ -49,7 +49,7 @@ REGISTER_KERNEL_BUILDER(Name("FloorDiv")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::safe_floor_div<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::safe_floor_div<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_floor_mod.cc b/tensorflow/core/kernels/cwise_op_floor_mod.cc
index 590f6e592df7d3..9cc40508e1adce 100644
--- a/tensorflow/core/kernels/cwise_op_floor_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_floor_mod.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER8(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int8, int16,
-          int32, int64_t, uint8, uint16, uint32, uint64);
+REGISTER8(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int8_t, int16_t,
+          int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 REGISTER4(BinaryOp, CPU, "FloorMod", functor::floor_fmod, Eigen::half, bfloat16,
           float, double);
 
@@ -39,7 +39,7 @@ REGISTER_KERNEL_BUILDER(Name("FloorMod")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::safe_floor_mod<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::safe_floor_mod<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater.cc b/tensorflow/core/kernels/cwise_op_greater.cc
index b05e875e19b3d9..1cd27097ce66fe 100644
--- a/tensorflow/core/kernels/cwise_op_greater.cc
+++ b/tensorflow/core/kernels/cwise_op_greater.cc
@@ -17,8 +17,9 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER9(BinaryOp, CPU, "Greater", functor::greater, float, Eigen::half,
-          double, int32, int64_t, uint8, uint16, uint32, uint64);
-REGISTER3(BinaryOp, CPU, "Greater", functor::greater, int8, int16, bfloat16);
+          double, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
+REGISTER3(BinaryOp, CPU, "Greater", functor::greater, int8_t, int16_t,
+          bfloat16);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER6(BinaryOp, GPU, "Greater", functor::greater, float, Eigen::half,
@@ -44,6 +45,6 @@ REGISTER_KERNEL_BUILDER(Name("Greater")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::greater<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::greater<int32_t>>);
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_greater_equal.cc b/tensorflow/core/kernels/cwise_op_greater_equal.cc
index 7ccfc5501a6fda..1c9e7df836deb7 100644
--- a/tensorflow/core/kernels/cwise_op_greater_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_greater_equal.cc
@@ -17,9 +17,10 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER9(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, float,
-          Eigen::half, double, int32, int64_t, uint8, uint16, uint32, uint64);
-REGISTER3(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, int8, int16,
-          bfloat16);
+          Eigen::half, double, int32_t, int64_t, uint8_t, uint16_t, uint32_t,
+          uint64_t);
+REGISTER3(BinaryOp, CPU, "GreaterEqual", functor::greater_equal, int8_t,
+          int16_t, bfloat16);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER9(BinaryOp, GPU, "GreaterEqual", functor::greater_equal, float,
@@ -45,7 +46,7 @@ REGISTER_KERNEL_BUILDER(Name("GreaterEqual")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::greater_equal<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::greater_equal<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_invert.cc b/tensorflow/core/kernels/cwise_op_invert.cc
index 5e5e827217ca1b..2f54bd8292b3b6 100644
--- a/tensorflow/core/kernels/cwise_op_invert.cc
+++ b/tensorflow/core/kernels/cwise_op_invert.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER8(UnaryOp, CPU, "Invert", functor::invert, int8, int16, int32, int64_t,
-          uint8, uint16, uint32, uint64);
+REGISTER8(UnaryOp, CPU, "Invert", functor::invert, int8_t, int16_t, int32_t,
+          int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_left_shift.cc b/tensorflow/core/kernels/cwise_op_left_shift.cc
index 19c9f84ead760e..76632030feec8f 100644
--- a/tensorflow/core/kernels/cwise_op_left_shift.cc
+++ b/tensorflow/core/kernels/cwise_op_left_shift.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER8(BinaryOp, CPU, "LeftShift", functor::left_shift, int8, int16, int32,
-          int64_t, uint8, uint16, uint32, uint64);
+REGISTER8(BinaryOp, CPU, "LeftShift", functor::left_shift, int8_t, int16_t,
+          int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc
index 9c7535fae31365..62dd9a18a5d86e 100644
--- a/tensorflow/core/kernels/cwise_op_less.cc
+++ b/tensorflow/core/kernels/cwise_op_less.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER5(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double,
-          bfloat16, int32);
-REGISTER7(BinaryOp, CPU, "Less", functor::less, uint8, uint16, uint32, uint64,
-          int8, int16, int64_t);
+          bfloat16, int32_t);
+REGISTER7(BinaryOp, CPU, "Less", functor::less, uint8_t, uint16_t, uint32_t,
+          uint64_t, int8_t, int16_t, int64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
@@ -45,6 +45,6 @@ REGISTER_KERNEL_BUILDER(Name("Less")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::less<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::less<int32_t>>);
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_less_equal.cc b/tensorflow/core/kernels/cwise_op_less_equal.cc
index b6acec213cb6ff..e17272986381fb 100644
--- a/tensorflow/core/kernels/cwise_op_less_equal.cc
+++ b/tensorflow/core/kernels/cwise_op_less_equal.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER5(BinaryOp, CPU, "LessEqual", functor::less_equal, float, Eigen::half,
-          bfloat16, double, int32);
-REGISTER7(BinaryOp, CPU, "LessEqual", functor::less_equal, int64_t, uint8,
-          uint16, uint32, uint64, int8, int16);
+          bfloat16, double, int32_t);
+REGISTER7(BinaryOp, CPU, "LessEqual", functor::less_equal, int64_t, uint8_t,
+          uint16_t, uint32_t, uint64_t, int8_t, int16_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
@@ -45,7 +45,7 @@ REGISTER_KERNEL_BUILDER(Name("LessEqual")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::less_equal<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::less_equal<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_maximum.cc b/tensorflow/core/kernels/cwise_op_maximum.cc
index 9be2a3a0fc9062..74db589e7783d2 100644
--- a/tensorflow/core/kernels/cwise_op_maximum.cc
+++ b/tensorflow/core/kernels/cwise_op_maximum.cc
@@ -18,8 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER4(BinaryOp, CPU, "Maximum", functor::maximum, float, Eigen::half,
           bfloat16, double);
-REGISTER8(BinaryOp, CPU, "Maximum", functor::maximum, int8, uint8, int16,
-          uint16, int32, uint32, int64_t, uint64);
+REGISTER8(BinaryOp, CPU, "Maximum", functor::maximum, int8_t, uint8_t, int16_t,
+          uint16_t, int32_t, uint32_t, int64_t, uint64_t);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER6(BinaryOp, GPU, "Maximum", functor::maximum, float, Eigen::half,
@@ -44,7 +44,7 @@ REGISTER_KERNEL_BUILDER(Name("Maximum")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::maximum<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::maximum<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_minimum.cc b/tensorflow/core/kernels/cwise_op_minimum.cc
index 67d1c6a8452517..5a101acf5e47ce 100644
--- a/tensorflow/core/kernels/cwise_op_minimum.cc
+++ b/tensorflow/core/kernels/cwise_op_minimum.cc
@@ -18,8 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER4(BinaryOp, CPU, "Minimum", functor::minimum, float, Eigen::half,
           bfloat16, double);
-REGISTER8(BinaryOp, CPU, "Minimum", functor::minimum, int8, uint8, int16,
-          uint16, int32, uint32, int64_t, uint64);
+REGISTER8(BinaryOp, CPU, "Minimum", functor::minimum, int8_t, uint8_t, int16_t,
+          uint16_t, int32_t, uint32_t, int64_t, uint64_t);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER6(BinaryOp, GPU, "Minimum", functor::minimum, float, Eigen::half,
@@ -45,7 +45,7 @@ REGISTER_KERNEL_BUILDER(Name("Minimum")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::minimum<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::minimum<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mod.cc b/tensorflow/core/kernels/cwise_op_mod.cc
index 32fd740a38ccca..51b91ceb85c2fd 100644
--- a/tensorflow/core/kernels/cwise_op_mod.cc
+++ b/tensorflow/core/kernels/cwise_op_mod.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER2(BinaryOp, CPU, "Mod", functor::safe_mod, int32, int64_t);
+REGISTER2(BinaryOp, CPU, "Mod", functor::safe_mod, int32_t, int64_t);
 REGISTER2(BinaryOp, CPU, "Mod", functor::fmod, float, double);
-REGISTER2(BinaryOp, CPU, "TruncateMod", functor::safe_mod, int32, int64_t);
+REGISTER2(BinaryOp, CPU, "TruncateMod", functor::safe_mod, int32_t, int64_t);
 REGISTER2(BinaryOp, CPU, "TruncateMod", functor::fmod, float, double);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -45,13 +45,13 @@ REGISTER_KERNEL_BUILDER(Name("Mod")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::safe_mod<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::safe_mod<int32_t>>);
 REGISTER_KERNEL_BUILDER(Name("TruncateMod")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::safe_mod<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::safe_mod<int32_t>>);
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_mul_1.cc b/tensorflow/core/kernels/cwise_op_mul_1.cc
index 9af3108676258b..cc6fd91248766c 100644
--- a/tensorflow/core/kernels/cwise_op_mul_1.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_1.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double, uint8,
-          int32, bfloat16);
+REGISTER6(BinaryOp, CPU, "Mul", functor::mul, float, Eigen::half, double,
+          uint8_t, int32_t, bfloat16);
 REGISTER6(BinaryOp, CPU, "MulNoNan", functor::mul_no_nan, Eigen::half, float,
           double, complex64, complex128, bfloat16);
 
@@ -53,8 +53,8 @@ REGISTER_KERNEL_BUILDER(Name("Mul")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::mul<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::mul<int32_t>>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_mul_2.cc b/tensorflow/core/kernels/cwise_op_mul_2.cc
index 66ed75f469ebb5..31080a3e01bc0a 100644
--- a/tensorflow/core/kernels/cwise_op_mul_2.cc
+++ b/tensorflow/core/kernels/cwise_op_mul_2.cc
@@ -22,8 +22,8 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER8(BinaryOp, CPU, "Mul", functor::mul, int8, uint16, uint32, uint64,
-          int16, int64_t, complex64, complex128);
+REGISTER8(BinaryOp, CPU, "Mul", functor::mul, int8_t, uint16_t, uint32_t,
+          uint64_t, int16_t, int64_t, complex64, complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER8(BinaryOp, GPU, "Mul", functor::mul, int8, uint16, uint32, uint64,
diff --git a/tensorflow/core/kernels/cwise_op_neg_1.cc b/tensorflow/core/kernels/cwise_op_neg_1.cc
index f3bd66c8986e5c..7f589ece2e313f 100644
--- a/tensorflow/core/kernels/cwise_op_neg_1.cc
+++ b/tensorflow/core/kernels/cwise_op_neg_1.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER4(UnaryOp, CPU, "Neg", functor::neg, int8, int16, int32, int64_t);
+REGISTER4(UnaryOp, CPU, "Neg", functor::neg, int8_t, int16_t, int32_t, int64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
@@ -37,6 +37,6 @@ REGISTER_KERNEL_BUILDER(Name("Neg")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
-                        UnaryOp<CPUDevice, functor::neg<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        UnaryOp<CPUDevice, functor::neg<int32_t>>);
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
index 22b8ff992ce37b..6e787b88bb1694 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_1.cc
@@ -17,17 +17,17 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER7(BinaryOp, CPU, "NotEqual", functor::not_equal_to, float, Eigen::half,
-          double, uint8, int8, int16, bfloat16);
-REGISTER8(BinaryOp, CPU, "NotEqual", functor::not_equal_to, uint16, uint32,
-          uint64, qint8, qint16, quint8, quint16, qint32);
+          double, uint8_t, int8_t, int16_t, bfloat16);
+REGISTER8(BinaryOp, CPU, "NotEqual", functor::not_equal_to, uint16_t, uint32_t,
+          uint64_t, qint8, qint16, quint8, quint16, qint32);
 
 REGISTER_KERNEL_BUILDER(Name("NotEqual")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::not_equal_to<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::not_equal_to<int32_t>>);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
index 9f5a2508733ebe..537a8c4c0b8bf9 100644
--- a/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
+++ b/tensorflow/core/kernels/cwise_op_not_equal_to_2.cc
@@ -22,7 +22,7 @@ namespace tensorflow {
 // sharded files, only make its register calls when not __ANDROID_TYPES_SLIM__.
 #if !defined(__ANDROID_TYPES_SLIM__)
 
-REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, int32, int64_t,
+REGISTER6(BinaryOp, CPU, "NotEqual", functor::not_equal_to, int32_t, int64_t,
           complex64, complex128, tstring, bool);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_pow.cc b/tensorflow/core/kernels/cwise_op_pow.cc
index d052c6021faf37..ae21c4613f1bc4 100644
--- a/tensorflow/core/kernels/cwise_op_pow.cc
+++ b/tensorflow/core/kernels/cwise_op_pow.cc
@@ -18,7 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER6(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, bfloat16,
           double, complex64, complex128);
-REGISTER4(BinaryOp, CPU, "Pow", functor::safe_pow, int8, int16, int32, int64_t);
+REGISTER4(BinaryOp, CPU, "Pow", functor::safe_pow, int8_t, int16_t, int32_t,
+          int64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_right_shift.cc b/tensorflow/core/kernels/cwise_op_right_shift.cc
index 7386c3674ba9c0..cc960b023390a1 100644
--- a/tensorflow/core/kernels/cwise_op_right_shift.cc
+++ b/tensorflow/core/kernels/cwise_op_right_shift.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER8(BinaryOp, CPU, "RightShift", functor::right_shift, int8, int16, int32,
-          int64_t, uint8, uint16, uint32, uint64);
+REGISTER8(BinaryOp, CPU, "RightShift", functor::right_shift, int8_t, int16_t,
+          int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_round.cc b/tensorflow/core/kernels/cwise_op_round.cc
index fa3289f9e57f32..bab42c5b58f5cc 100644
--- a/tensorflow/core/kernels/cwise_op_round.cc
+++ b/tensorflow/core/kernels/cwise_op_round.cc
@@ -18,7 +18,7 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER6(UnaryOp, CPU, "Round", functor::round, Eigen::half, float, double,
-          bfloat16, int32, int64_t);
+          bfloat16, int32_t, int64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
diff --git a/tensorflow/core/kernels/cwise_op_sign.cc b/tensorflow/core/kernels/cwise_op_sign.cc
index 895280a22ab890..632e4a8cce12d5 100644
--- a/tensorflow/core/kernels/cwise_op_sign.cc
+++ b/tensorflow/core/kernels/cwise_op_sign.cc
@@ -18,7 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER6(UnaryOp, CPU, "Sign", functor::sign, float, double, Eigen::half,
           bfloat16, complex64, complex128);
-REGISTER4(UnaryOp, CPU, "Sign", functor::sign, int8, int16, int32, int64_t);
+REGISTER4(UnaryOp, CPU, "Sign", functor::sign, int8_t, int16_t, int32_t,
+          int64_t);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
 REGISTER6(UnaryOp, GPU, "Sign", functor::sign, float, Eigen::half, double,
@@ -41,7 +42,7 @@ REGISTER_KERNEL_BUILDER(Name("Sign")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
-                        UnaryOp<CPUDevice, functor::sign<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        UnaryOp<CPUDevice, functor::sign<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_square.cc b/tensorflow/core/kernels/cwise_op_square.cc
index e8122ba19a2632..ddca86ae25b7c9 100644
--- a/tensorflow/core/kernels/cwise_op_square.cc
+++ b/tensorflow/core/kernels/cwise_op_square.cc
@@ -18,9 +18,9 @@ limitations under the License.
 namespace tensorflow {
 
 REGISTER7(UnaryOp, CPU, "Square", functor::square, float, Eigen::half, double,
-          int32, int64_t, complex64, complex128);
-REGISTER7(UnaryOp, CPU, "Square", functor::square, bfloat16, int8, int16, uint8,
-          uint16, uint32, uint64);
+          int32_t, int64_t, complex64, complex128);
+REGISTER7(UnaryOp, CPU, "Square", functor::square, bfloat16, int8_t, int16_t,
+          uint8_t, uint16_t, uint32_t, uint64_t);
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
@@ -45,7 +45,7 @@ REGISTER_KERNEL_BUILDER(Name("Square")
                             .Device(DEVICE_DEFAULT)
                             .HostMemory("x")
                             .HostMemory("y")
-                            .TypeConstraint<int32>("T"),
-                        UnaryOp<CPUDevice, functor::square<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        UnaryOp<CPUDevice, functor::square<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc
index 2a34dd2c5290aa..c6f3fe2567afea 100644
--- a/tensorflow/core/kernels/cwise_op_squared_difference.cc
+++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 REGISTER8(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
-          float, Eigen::half, double, bfloat16, int32, int64_t, complex64,
+          float, Eigen::half, double, bfloat16, int32_t, int64_t, complex64,
           complex128);
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
@@ -37,8 +37,8 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("x")
         .HostMemory("y")
         .HostMemory("z")
-        .TypeConstraint<int32>("T"),
-    BinaryOp<CPUDevice, functor::squared_difference<int32>>);
+        .TypeConstraint<int32_t>("T"),
+    BinaryOp<CPUDevice, functor::squared_difference<int32_t>>);
 
 REGISTER_KERNEL_BUILDER(
     Name("SquaredDifference")
@@ -46,7 +46,7 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("x")
         .HostMemory("y")
         .HostMemory("z")
-        .TypeConstraint<int32>("T"),
-    BinaryOp<CPUDevice, functor::squared_difference<int32>>);
+        .TypeConstraint<int32_t>("T"),
+    BinaryOp<CPUDevice, functor::squared_difference<int32_t>>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_sub.cc b/tensorflow/core/kernels/cwise_op_sub.cc
index db8c81db3cf460..b4eb0447115d22 100644
--- a/tensorflow/core/kernels/cwise_op_sub.cc
+++ b/tensorflow/core/kernels/cwise_op_sub.cc
@@ -16,12 +16,12 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER8(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double, int32,
-          int64_t, bfloat16, complex64, complex128);
+REGISTER8(BinaryOp, CPU, "Sub", functor::sub, float, Eigen::half, double,
+          int32_t, int64_t, bfloat16, complex64, complex128);
 #if !defined(__ANDROID_TYPES_SLIM__)
 // Sub op for int8, uint8, int16, uint16
-REGISTER6(BinaryOp, CPU, "Sub", functor::sub, int8, uint8, int16, uint16,
-          uint32, uint64);
+REGISTER6(BinaryOp, CPU, "Sub", functor::sub, int8_t, uint8_t, int16_t,
+          uint16_t, uint32_t, uint64_t);
 #else
 // We only register the first type when we have multi-argument calls in the
 // case where we're trying to reduce executable size, but it turns out that the
@@ -53,7 +53,7 @@ REGISTER_KERNEL_BUILDER(Name("Sub")
                             .HostMemory("x")
                             .HostMemory("y")
                             .HostMemory("z")
-                            .TypeConstraint<int32>("T"),
-                        BinaryOp<CPUDevice, functor::sub<int32>>);
+                            .TypeConstraint<int32_t>("T"),
+                        BinaryOp<CPUDevice, functor::sub<int32_t>>);
 
 }  // namespace tensorflow

From 3649532053c9b9e2ceffa28e60494a8171fcabc3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 05:43:46 -0800
Subject: [PATCH 431/753] Automated Code Change

PiperOrigin-RevId: 845728042
---
 .../xla/stream_executor/rocm/rocm_executor_test.cc   | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc b/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
index 60e1e72cd4b657..0bfef481d9f4f4 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
@@ -93,8 +93,8 @@ TEST(RocmExecutorTest, CreateUnifiedMemoryAllocatorWorks) {
       executor->CreateMemoryAllocator(MemorySpace::kUnified));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
   allocation.reset();
 }
 
@@ -107,8 +107,8 @@ TEST(RocmExecutorTest, CreateHostMemoryAllocatorWorks) {
                           executor->CreateMemoryAllocator(MemorySpace::kHost));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
   allocation.reset();
 }
 
@@ -122,8 +122,8 @@ TEST(RocmExecutorTest, CreateCollectiveMemoryAllocatorWorks) {
       executor->CreateMemoryAllocator(MemorySpace::kCollective));
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<MemoryAllocation> allocation,
                           allocator->Allocate(1024));
-  EXPECT_NE(allocation->opaque(), nullptr);
-  EXPECT_EQ(allocation->size(), 1024);
+  EXPECT_NE(allocation->address().opaque(), nullptr);
+  EXPECT_EQ(allocation->address().size(), 1024);
   allocation.reset();
 }
 

From 7e784e899a791ba931889078b6dea9a6c3d41978 Mon Sep 17 00:00:00 2001
From: Nikita Putikhin <nputikhin@google.com>
Date: Wed, 17 Dec 2025 05:44:44 -0800
Subject: [PATCH 432/753] Reverts a8ad7b1565df6a2e737756f7fa55002be3d58b98

PiperOrigin-RevId: 845728341
---
 .../gpu/transforms/gemm_fusion_test.cc        | 52 +++++++++++++++++++
 .../xla/service/gpu/triton_fusion_analysis.cc | 30 ++++++++---
 .../service/gpu/triton_tiling_propagation.cc  | 46 ++++++++++++++--
 .../service/gpu/triton_tiling_propagation.h   |  2 +
 4 files changed, 120 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
index c77d76d6954aed..8036bc09ab2ce6 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_fusion_test.cc
@@ -227,6 +227,58 @@ ENTRY e {
   EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
 }
 
+TEST_F(GemmFusionTest, FuseSliceWithOtherUsersWhenDotHasSmallK) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = bf16[512,3584]{1,0} parameter(0)
+  p1 = bf16[3584,14400]{0,1} parameter(1)
+  p2 = bf16[64,14336]{1,0} parameter(2)
+
+  d0 = bf16[512,14400]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  sl0 = bf16[512,14336]{1,0} slice(d0), slice={[0:512], [0:14336]}
+
+  sl1 = bf16[512,64]{1,0} slice(d0), slice={[0:512], [14336:14400]}
+  d1 = bf16[512,14336]{1,0} dot(sl1, p2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT a0 = bf16[512,14336]{1,0} add(sl0, d1)
+})"));
+
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::kHopper, 0};
+  EXPECT_TRUE(GemmFusion(cc).Run(module.get()).value());
+
+  // Check that the second dot is fused and the fusion contains sl1.
+  // We make no assumptions about other fusions.
+  constexpr absl::string_view kExpectedHloText = R"(
+    CHECK: %[[FUSION_DOT:.*]] (
+    CHECK:   %[[SLICE:.*]] = bf16[512,64]{1,0} slice(%parameter_0), slice={[0:512], [14336:14400]}
+    CHECK:   ROOT {{.*}} = bf16[512,14336]{1,0} dot(%[[SLICE]], %parameter_1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+    CHECK: ENTRY
+    CHECK-DAG: %[[FUSION_D1:.*]] = bf16[512,14336]{1,0} fusion({{.*}}, {{.*}}), kind=kCustom, calls=%[[FUSION_DOT]]
+    CHECK-DAG: ROOT %a0 = bf16[512,14336]{1,0} add({{.*}}, %[[FUSION_D1]])
+  )";
+  MatchHloModule(*module, kExpectedHloText);
+}
+
+TEST_F(GemmFusionTest, DoNotFuseSliceWithOtherUsersWhenDotHasLargeK) {
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                          ParseAndReturnVerifiedModule(R"(
+ENTRY e {
+  p0 = bf16[512,3584]{1,0} parameter(0)
+  p1 = bf16[3584,14400]{0,1} parameter(1)
+  p2 = bf16[1400,14336]{1,0} parameter(2)
+
+  d0 = bf16[512,14400]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  sl0 = bf16[512,14336]{1,0} slice(d0), slice={[0:512], [0:14336]}
+  sl1 = bf16[512,1400]{1,0} slice(d0), slice={[0:512], [13000:14400]}
+
+  d1 = bf16[512,14336]{1,0} dot(sl1, p2), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT a0 = bf16[512,14336]{1,0} add(sl0, d1)
+})"));
+
+  const se::CudaComputeCapability cc{se::CudaComputeCapability::kHopper, 0};
+  EXPECT_FALSE(GemmFusion(cc).Run(module.get()).value());
+}
+
 TEST_F(GemmFusionTest, DoNotFuseSliceOfMixedDimensions) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
                           ParseAndReturnVerifiedModule(R"(
diff --git a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
index cfc47a333955d0..9f7fbcba0aa98b 100644
--- a/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/triton_fusion_analysis.cc
@@ -57,6 +57,16 @@ using triton_fusion::GetPropagatedDimOrdersAndRequirements;
 using triton_fusion::kNoSplitRequirement;
 using triton_fusion::TransformDirection;
 
+int64_t GetContractingDimSize(const HloInstruction& dot) {
+  const auto& contracting_dims =
+      ContractingDimensionsForOperand(dot, /*operand_number=*/0);
+  int64_t contracting_dim_size = 1;
+  for (int64_t dim : contracting_dims) {
+    contracting_dim_size *= dot.operand(0)->shape().dimensions(dim);
+  }
+  return contracting_dim_size;
+}
+
 }  // namespace
 
 namespace triton_fusion {
@@ -81,9 +91,13 @@ namespace triton_fusion {
           0) {
     splittable_dimension_index = non_contracting_dimension_index;
   }
-  FusionContext context(DotProperties{non_contracting_dimension_index,
-                                      splittable_dimension_index},
-                        DotRequirements(kNoSplitRequirement));
+
+  int64_t contracting_size = GetContractingDimSize(dot);
+
+  FusionContext context(
+      DotProperties{non_contracting_dimension_index, splittable_dimension_index,
+                    contracting_size},
+      DotRequirements(kNoSplitRequirement));
   context.dim_orders_[dot.operand(operand_number)] =
       DimensionOrder::FromDotOperandOrOutput(*dot.operand(operand_number),
                                              split_k_dimension_index);
@@ -102,9 +116,13 @@ namespace triton_fusion {
     // LHS non-contracting follows (batch is absent in this case).
     splittable_dimension_index = (split_k > 1) ? 1 : 0;
   }
-  FusionContext context(DotProperties{/*noncontracting_dimension=*/-1,
-                                      splittable_dimension_index},
-                        std::move(requirements));
+
+  int64_t contracting_size = GetContractingDimSize(dot);
+
+  FusionContext context(
+      DotProperties{/*noncontracting_dimension=*/-1, splittable_dimension_index,
+                    contracting_size},
+      std::move(requirements));
   context.dim_orders_[&dot] = DimensionOrder::FromDotOperandOrOutput(dot);
   return context;
 }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
index 926307d6f0c0e7..9cb21bb0f6710b 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.cc
@@ -992,11 +992,29 @@ bool CanNotBeFusedIntoAUser(const HloInstruction& hlo) {
                           hlo.users()[0]->opcode() == HloOpcode::kTuple);
 }
 
+// Maximum contracting dimension size for which slice fusion is allowed when
+// the operand has multiple users.
+constexpr int kMaxContractingDimSizeForSliceFusion = 1024;
+
 // Let input and output data volumes of a fusion grow by small amounts.
 constexpr int kIoToleranceBytes = 1024;
 
+// Returns true if all users of the given operand are kSlice operations
+// with the same shape as `slice_shape`.
+bool AllUsersAreSlicesWithSameShape(const HloInstruction& operand,
+                                    const Shape& slice_shape) {
+  for (const HloInstruction* user : operand.users()) {
+    if (user->opcode() != HloOpcode::kSlice ||
+        !ShapeUtil::SameDimensions(user->shape(), slice_shape)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Tells that fusing an instruction as an input is efficient.
-bool IsInputWorthFusing(const HloInstruction& hlo) {
+bool IsInputWorthFusing(const HloInstruction& hlo,
+                        const DotProperties& properties) {
   std::optional<int64_t> input_minus_output_bytes = InputMinusOutputBytes(hlo);
   if (!input_minus_output_bytes.has_value()) {
     return false;
@@ -1011,6 +1029,26 @@ bool IsInputWorthFusing(const HloInstruction& hlo) {
       hlo_query::AllOperandsAreParametersOrConstants(hlo)) {
     return true;
   }
+
+  // Explanation:
+  // * Operand user count > 1 - if the producer of the slice has a single user
+  //   the slice can be fused into the producer instead of here.
+  // * contracting_dim_size < 1024 - fusing slices disables split-K rewriter,
+  //   which may outweigh the benefit of fusing it in the first place. Small
+  //   contracting dimension almost never benefits from splitting it, so we
+  //   allow the fusion.
+  // * AllUsersAreSlicesWithSameShape - slices of the same shape can be
+  //   fused into the producer by the multi output fusion pass.
+  //
+  // TODO: b/393299275 - Remove the contracting dim size restriction once the
+  // new emitter lands and we can support slices in contracting dimension with
+  // splits.
+  if (hlo.opcode() == HloOpcode::kSlice && hlo.operand(0)->user_count() > 1 &&
+      properties.contracting_dim_size <= kMaxContractingDimSizeForSliceFusion &&
+      !AllUsersAreSlicesWithSameShape(*hlo.operand(0), hlo.shape())) {
+    return true;
+  }
+
   const bool enable_subchannel_dequantisation_fusion =
       hlo.GetModule()
           ->config()
@@ -1018,8 +1056,8 @@ bool IsInputWorthFusing(const HloInstruction& hlo) {
           .xla_gpu_experimental_enable_subchannel_dequantisation_fusion();
   if (hlo.opcode() == HloOpcode::kMultiply) {
     return enable_subchannel_dequantisation_fusion &&
-           IsInputWorthFusing(*hlo.operand(0)) &&
-           IsInputWorthFusing(*hlo.operand(1));
+           IsInputWorthFusing(*hlo.operand(0), properties) &&
+           IsInputWorthFusing(*hlo.operand(1), properties);
   }
   return hlo_query::AllOperandsAreParametersOrConstantsWithSingleUser(hlo);
 }
@@ -1139,7 +1177,7 @@ GetPropagatedDimOrdersAndRequirementsIfProfitablyFusible(
         }
       }
     }
-    if (!accepted && !IsInputWorthFusing(hlo)) {
+    if (!accepted && !IsInputWorthFusing(hlo, properties)) {
       return FusionDecision::Forbid(
           "Not obviously profitable to fuse as input.");
     }
diff --git a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
index a83dd9c976f8c4..df09b35a1f0ffc 100644
--- a/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
+++ b/third_party/xla/xla/service/gpu/triton_tiling_propagation.h
@@ -249,6 +249,8 @@ struct DotProperties {
   // Index of dot dimension that can be split.
   // Currently typically LHS non-contracting one.
   const int splittable_dimension_index;
+  // Size of the contracting dimension (K).
+  const int64_t contracting_dim_size;
 };
 
 // A special value for splittable_dimension_major_part_size.

From c660aef0d5c2b73f3e97684a335c8ab94a629825 Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Wed, 17 Dec 2025 05:58:16 -0800
Subject: [PATCH 433/753] [XLA:GPU] Run DotMerger after the simplification
 fix-point pass pipeline.

The simplification passes alg-simplifier, constant_folding, cse, and dot-strength-reduction are cheap compared to dot-merger. Running these passes to a fix-point first can greatly reduce the number of merge candidates that dot-merger needs to consider. The dot-merger pass itself is designed to do all merging in one go. Hence, there is no need to run it to a fix point.

Also clarify the semantic of the dot-merger threshold in xla.proto and remove an early exit in dot_merger.cc that was not in-line with it.

PiperOrigin-RevId: 845731882
---
 .../hlo/transforms/simplifiers/dot_merger.cc  | 14 +++++---
 .../xla/xla/service/gpu/gpu_compiler.cc       | 35 ++++++++++++-------
 third_party/xla/xla/xla.proto                 |  3 ++
 3 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
index be61f6be2a96c6..21611ee0129540 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dot_merger.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstdint>
 #include <functional>
+#include <numeric>
 #include <set>
 #include <string>
 #include <utility>
@@ -478,7 +479,14 @@ absl::StatusOr<bool> MergeDots(
   }
 
   VLOG(0) << "Merging Dots in computation: " << comp->name();
-  VLOG(1) << "Found " << equivalence_classes.size() << " equivalence classes.";
+  VLOG(1) << "Found " << equivalence_classes.size()
+          << " equivalence classes with "
+          << std::accumulate(equivalence_classes.begin(),
+                             equivalence_classes.end(), std::uint64_t{0},
+                             [](std::uint64_t total, auto const& values) {
+                               return values.second.size() + total;
+                             })
+          << " dots in total.";
 
   // Build a dependency graph representing the whole computation.
   GraphCycles graph;
@@ -567,10 +575,6 @@ absl::StatusOr<bool> MergeDots(
           dead_instrs.insert(b);
           dots[i] = merged;
           dots[j] = nullptr;
-          if (!is_merge_candidate(merged)) {
-            // The merged dot is not a candidate for futher merging.
-            break;
-          }
         }
       }
     }
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index ca28137784bfdf..36471b34d7a2ac 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -754,18 +754,6 @@ absl::Status RunOptimizationPasses(
         gpu_target_config.device_description.gpu_compute_capability());
     pipeline.AddPass<GpuAlgebraicSimplifier>(layout_insensitive_algsimp_opts,
                                              gpu_version);
-    // Only merge "smallish" dots.  This threshold defaults to 32MB today, with
-    // a flag to override.
-    // Do not merge dots when they are assigned different stream ids.
-    std::function<int64_t(const HloInstruction* dot)> queue_id =
-        [&](const HloInstruction* dot) -> int64_t {
-      return dot->backend_config<GpuBackendConfig>()->operation_queue_id();
-    };
-    pipeline.AddPass<DotMerger>(
-        /*max_size_to_merge=*/int64_t{debug_options
-                                          .xla_gpu_dot_merger_threshold_mb()}
-            << 20,
-        queue_id);
     pipeline.AddPass<SortSimplifier>();
     pipeline.AddPass<TupleSimplifier>();
     pipeline.AddPass<WhileLoopConstantSinking>();
@@ -778,11 +766,32 @@ absl::Status RunOptimizationPasses(
     pipeline.AddPass<HloConstantFolding>();
     pipeline.AddPass<ConditionalSimplifier>();
     pipeline.AddPass<RealImagExpander>();
-    pipeline.AddPass<TransposeFolding>(CanFoldTransposeOperandIntoDot);
+    // Do not fold transpose operands into dots yet. This can undo the normal
+    // form established by DotDecomposer, which the DotMerger pass requires.
+    pipeline.AddPass<TransposeFolding>(
+        /*dot_can_fold_transpose_operand=*/
+        [&](const HloInstruction& dot,
+            int64_t operand) -> absl::StatusOr<bool> { return false; });
     pipeline.AddPass<HloCSE>(/*is_layout_sensitive=*/false);
     pipeline.AddPass<HloDCE>();
   }();
 
+  // Do not merge dots when they are assigned different stream ids.
+  std::function<int64_t(const HloInstruction* dot)> queue_id =
+      [&](const HloInstruction* dot) -> int64_t {
+    return dot->backend_config<GpuBackendConfig>()->operation_queue_id();
+  };
+  // Only merge "smallish" dots. This threshold defaults to 32MB today, with
+  // a flag to override.
+  pipeline.AddPass<DotMerger>(
+      /*max_size_to_merge=*/int64_t{debug_options
+                                        .xla_gpu_dot_merger_threshold_mb()}
+          << 20,
+      queue_id);
+  // Folding transpose operands into dots can undo the normal form established
+  // by DotDecomposer. Subsequent passes must not rely on it from this point on.
+  pipeline.AddPass<TransposeFolding>(CanFoldTransposeOperandIntoDot);
+
   // ConvertMover and ReshapeMover fight with each other: ConvertMover wants
   // to move some converts down the graph, but ReshapeMover wants to move them
   // up the graph.  As a compromise, let ReshapeMover run to a fixed point,
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 054a2de5446d64..6575b4803029e2 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -468,6 +468,9 @@ message DebugOptions {
   optional bool xla_gpu_disable_gpuasm_optimizations = 103;
 
   // DotMerger pass threshold size to be used in MB.
+  // This pass merges dots that are too small to achieve good occupancy with
+  // other dots. Dots are considered for merging when the size of their
+  // inputs+output is within the threshold.
   optional int32 xla_gpu_dot_merger_threshold_mb = 331;
 
   // File to write autotune logs to. It will stored in txt format.

From 36c07984c2dbbd9561106a2666b8d892fa3a9bea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 06:01:07 -0800
Subject: [PATCH 434/753] Automated Code Change

PiperOrigin-RevId: 845732556
---
 .../core/grappler/graph_analyzer/gen_node.cc  |  8 +--
 .../core/grappler/graph_analyzer/gen_node.h   |  8 +--
 .../grappler/graph_analyzer/graph_analyzer.cc |  8 +--
 .../grappler/graph_analyzer/graph_analyzer.h  |  4 +-
 .../graph_analyzer/graph_analyzer_test.cc     | 12 ++--
 .../graph_analyzer/graph_analyzer_tool.cc     |  6 +-
 .../graph_analyzer/graph_analyzer_tool.h      |  2 +-
 .../core/grappler/graph_analyzer/sig_node.cc  | 12 ++--
 .../core/grappler/graph_analyzer/sig_node.h   |  8 +--
 .../grappler/graph_analyzer/sig_node_test.cc  | 16 ++---
 .../core/grappler/graph_analyzer/subgraph.cc  |  4 +-
 .../core/grappler/graph_analyzer/subgraph.h   |  2 +-
 .../grappler/graph_analyzer/subgraph_test.cc  |  6 +-
 .../grappler/graph_analyzer/test_tools.cc     | 58 ++++++++++---------
 .../core/grappler/graph_analyzer/test_tools.h | 53 +++++++++--------
 15 files changed, 109 insertions(+), 98 deletions(-)

diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node.cc b/tensorflow/core/grappler/graph_analyzer/gen_node.cc
index 42dffe79ecabe3..1e355d45a91ec1 100644
--- a/tensorflow/core/grappler/graph_analyzer/gen_node.cc
+++ b/tensorflow/core/grappler/graph_analyzer/gen_node.cc
@@ -34,7 +34,7 @@ GenNode::GenNode(const NodeDef* node) : node_(node), op_(nullptr) {}
 
 absl::Status GenNode::BuildGraphInMap(const GraphDef& source, GenNodeMap* map) {
   for (const auto& n : source.node()) {
-    const string& name = n.name();
+    const std::string& name = n.name();
     if (map->find(name) != map->end()) {
       // This error code looks more meaningful than ALREADY_EXISTS.
       return absl::Status(absl::StatusCode::kInvalidArgument,
@@ -95,7 +95,7 @@ absl::Status GenNode::ParseInputs(const GenNodeMap* map) {
 
   for (int i = 0; i < n_inputs; ++i) {
     int other_position;
-    string other_name = ParseNodeName(node_->input(i), &other_position);
+    std::string other_name = ParseNodeName(node_->input(i), &other_position);
     auto other_it = map->find(other_name);
     if (other_it == map->end()) {
       return absl::Status(
@@ -138,8 +138,8 @@ bool GenNode::IsMultiInput(Port port) const {
   return (it->second.size() > 1);
 }
 
-GenNode::Port::operator string() const {
-  string result = this->IsInbound() ? "i" : "o";
+GenNode::Port::operator std::string() const {
+  std::string result = this->IsInbound() ? "i" : "o";
   if (this->IsControl()) {
     result.append("C");
   } else {
diff --git a/tensorflow/core/grappler/graph_analyzer/gen_node.h b/tensorflow/core/grappler/graph_analyzer/gen_node.h
index 57d5f59ec2ccd7..7194a48a6a2538 100644
--- a/tensorflow/core/grappler/graph_analyzer/gen_node.h
+++ b/tensorflow/core/grappler/graph_analyzer/gen_node.h
@@ -35,7 +35,7 @@ namespace graph_analyzer {
 class GenNode;
 
 // To find nodes by name.
-using GenNodeMap = std::unordered_map<string, std::unique_ptr<GenNode>>;
+using GenNodeMap = std::unordered_map<std::string, std::unique_ptr<GenNode>>;
 
 // One node in the graph, in the form convenient for traversal and generation of
 // subgraphs. It refers to the original NodeDef protobuf for most information
@@ -51,8 +51,8 @@ class GenNode {
   explicit GenNode(const NodeDef* node);
 
   // Access wrappers.
-  const string& name() const { return node_->name(); }
-  const string& opcode() const { return node_->op(); }
+  const std::string& name() const { return node_->name(); }
+  const std::string& opcode() const { return node_->op(); }
   const NodeDef* node_def() const { return node_; }
 
   // Parse the inputs of this node and update the map accordingly, creating the
@@ -111,7 +111,7 @@ class GenNode {
 
     // Convenient for printing. I've really wanted it to be implicit but
     // ClangTidy insists on making it explicit.
-    explicit operator string() const;
+    explicit operator std::string() const;
 
    private:
     explicit Port(IntPort value) : value_(value) {}
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc
index 2d47abda617615..dde0fb720c0170 100644
--- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.cc
@@ -315,16 +315,16 @@ absl::Status GraphAnalyzer::CollateResult() {
   return absl::OkStatus();
 }
 
-std::vector<string> GraphAnalyzer::DumpRawSubgraphs() {
-  std::vector<string> result;
+std::vector<std::string> GraphAnalyzer::DumpRawSubgraphs() {
+  std::vector<std::string> result;
   for (const auto& it : result_) {
     result.emplace_back(it->Dump());
   }
   return result;
 }
 
-std::vector<string> GraphAnalyzer::DumpSubgraphs() {
-  std::vector<string> result;
+std::vector<std::string> GraphAnalyzer::DumpSubgraphs() {
+  std::vector<std::string> result;
   for (auto ptr : ordered_collation_) {
     result.emplace_back(
         absl::StrFormat("%d %s", ptr->count, ptr->sig->ToString()));
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h
index 9a321e69b531fb..be46b6843225a6 100644
--- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer.h
@@ -46,7 +46,7 @@ class GraphAnalyzer {
   absl::Status Run();
 
   // Returns the subgraphs found in Run() printed to text.
-  std::vector<string> DumpSubgraphs();
+  std::vector<std::string> DumpSubgraphs();
 
   // Prints the subgraphs found in Run() to stdout.
   absl::Status OutputSubgraphs();
@@ -78,7 +78,7 @@ class GraphAnalyzer {
   absl::Status CollateResult();
 
   // Returns the raw subgraphs found in FindSubgraphs() printed to text.
-  std::vector<string> DumpRawSubgraphs();
+  std::vector<std::string> DumpRawSubgraphs();
 
   // Finds and adds appropriately to either partial_ or result_ all the
   // subgraphs that can be created by extending the parent subgraph by one node.
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc
index 9822f5446f4b39..4e9220d3a5c7e5 100644
--- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_test.cc
@@ -56,10 +56,12 @@ class GraphAnalyzerTest : public ::testing::Test, protected TestGraphs {
     gran_->ExtendSubgraphAllOrNone(parent, node);
   }
 
-  std::vector<string> DumpRawSubgraphs() { return gran_->DumpRawSubgraphs(); }
+  std::vector<std::string> DumpRawSubgraphs() {
+    return gran_->DumpRawSubgraphs();
+  }
 
-  std::vector<string> DumpPartials() {
-    std::vector<string> result;
+  std::vector<std::string> DumpPartials() {
+    std::vector<std::string> result;
     for (const auto& it : gran_->partial_) {
       result.emplace_back(it->Dump());
     }
@@ -68,7 +70,9 @@ class GraphAnalyzerTest : public ::testing::Test, protected TestGraphs {
 
   const GenNodeMap& GetNodes() { return gran_->nodes_; }
 
-  GenNode* GetNode(const string& name) { return gran_->nodes_.at(name).get(); }
+  GenNode* GetNode(const std::string& name) {
+    return gran_->nodes_.at(name).get();
+  }
 
   SubgraphPtrSet& GetResult() { return gran_->result_; }
   SubgraphPtrSet& GetPartial() { return gran_->partial_; }
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc
index 72662005ecdec7..0b1b3af2ea5571 100644
--- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.cc
@@ -31,7 +31,7 @@ namespace grappler {
 namespace graph_analyzer {
 
 // Dies on failure.
-static void LoadModel(const string& filename,
+static void LoadModel(const std::string& filename,
                       tensorflow::MetaGraphDef* metagraph) {
   LOG(INFO) << "Loading model from " << filename;
   absl::Status st;
@@ -49,7 +49,7 @@ static void LoadModel(const string& filename,
 // of train ops (if provided).
 void MaybePruneGraph(const tensorflow::MetaGraphDef& metagraph,
                      tensorflow::GraphDef* graph) {
-  std::vector<string> fetch_nodes;
+  std::vector<std::string> fetch_nodes;
   for (const auto& fetch :
        metagraph.collection_def().at("train_op").node_list().value()) {
     LOG(INFO) << "Fetch node: " << fetch;
@@ -72,7 +72,7 @@ void MaybePruneGraph(const tensorflow::MetaGraphDef& metagraph,
   }
 }
 
-void GraphAnalyzerTool(const string& file_name, int n) {
+void GraphAnalyzerTool(const std::string& file_name, int n) {
   if (n < 1) {
     LOG(FATAL) << "Invalid subgraph size " << n << ", must be at least 1";
   }
diff --git a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h
index 5a91fe7dc8eb7d..85f75706acf4cb 100644
--- a/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h
+++ b/tensorflow/core/grappler/graph_analyzer/graph_analyzer_tool.h
@@ -22,7 +22,7 @@ namespace tensorflow {
 namespace grappler {
 namespace graph_analyzer {
 
-void GraphAnalyzerTool(const string& file_name, int n);
+void GraphAnalyzerTool(const std::string& file_name, int n);
 
 }  // end namespace graph_analyzer
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.cc b/tensorflow/core/grappler/graph_analyzer/sig_node.cc
index 9210bf56b8047b..123bd0f060bccf 100644
--- a/tensorflow/core/grappler/graph_analyzer/sig_node.cc
+++ b/tensorflow/core/grappler/graph_analyzer/sig_node.cc
@@ -99,7 +99,7 @@ void SigNode::ComputeTopoHash0() {
   last_hashed_nodes_ = next_hashed_nodes_ = node_mask_;
 
   // TODO(babkin): include the attributes too, as an option.
-  size_t hval = std::hash<string>()(opcode());
+  size_t hval = std::hash<std::string>()(opcode());
 
   // Getting the topology of the links in to the hash early should get more
   // conflicts resolved early.
@@ -208,8 +208,8 @@ bool SigNode::operator==(const SigNode& other) const {
 
 constexpr int Signature::kMaxGraphSize;
 
-string Signature::ToString() const {
-  string result;
+std::string Signature::ToString() const {
+  std::string result;
   for (size_t n = 0; n < nodes.size(); ++n) {
     // TODO(babkin): add attributes too.
     result += absl::StrFormat("%d:%s", n, nodes[n]->opcode());
@@ -219,9 +219,9 @@ string Signature::ToString() const {
       // The link entries are already sorted, by tags and then by the
       // node ranks.
       if (link.tag.local.IsInbound()) {
-        result +=
-            absl::StrFormat("[%s:%s:%d]", string(link.tag.local),
-                            string(link.tag.remote), entry.peer->unique_rank_);
+        result += absl::StrFormat("[%s:%s:%d]", std::string(link.tag.local),
+                                  std::string(link.tag.remote),
+                                  entry.peer->unique_rank_);
       }
     }
     result.push_back(',');
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.h b/tensorflow/core/grappler/graph_analyzer/sig_node.h
index 6c0731ebb92b54..2caaf605615796 100644
--- a/tensorflow/core/grappler/graph_analyzer/sig_node.h
+++ b/tensorflow/core/grappler/graph_analyzer/sig_node.h
@@ -42,7 +42,7 @@ class SigNode;
 // To find nodes by name. Having the map ordered makes the tests easier,
 // and it isn't used in production code often enough to get any win from
 // using an unordered map.
-using SigNodeMap = std::map<string, std::unique_ptr<SigNode>>;
+using SigNodeMap = std::map<std::string, std::unique_ptr<SigNode>>;
 
 // One node in the graph, in the form convenient for generation of the signature
 // of the graph, and comparison of two (sub)graphs for equivalence. It refers to
@@ -61,8 +61,8 @@ class SigNode {
   explicit SigNode(const NodeDef* node);
 
   // Access wrappers.
-  const string& name() const { return node_->name(); }
-  const string& opcode() const { return node_->op(); }
+  const std::string& name() const { return node_->name(); }
+  const std::string& opcode() const { return node_->op(); }
   const NodeDef* node_def() const { return node_; }
 
   // For extraction of subgraphs into a separate SigNodeMap, copies the links
@@ -261,7 +261,7 @@ struct Signature {
   absl::Status Compute();
 
   // Convert the computed signature to a string representation.
-  string ToString() const;
+  std::string ToString() const;
 
   SigNodeMap map;        // The nodes in the graph, accessible by name.
   size_t sig_short = 0;  // Hash of the signature, for the quick equality check.
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc
index 6f38b4dc8b075c..56980ccedf459c 100644
--- a/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc
+++ b/tensorflow/core/grappler/graph_analyzer/sig_node_test.cc
@@ -312,7 +312,7 @@ TEST_F(SigNodeTest, ComputeTopoHash0) {
   EXPECT_THAT(RefNextHashedNodes(&sn1), Eq(0x02));
   EXPECT_THAT(RefTopoHash(&sn1), SizeIs(1));
 
-  size_t exp_hval = std::hash<string>()(sn1.opcode());
+  size_t exp_hval = std::hash<std::string>()(sn1.opcode());
   CombineHash(1, &exp_hval);
   CombineHash(1, &exp_hval);
   CombineHash(2, &exp_hval);
@@ -640,14 +640,14 @@ class SignatureTest : public SigBaseTest {
     std::vector<size_t> countdown;
     InitPermutation(graph_size, &plain_permutation, &countdown);
 
-    std::set<string> signatures;
+    std::set<std::string> signatures;
     std::vector<size_t> permutation;
     do {
       BuildPermutation(plain_permutation, countdown, &permutation);
 
       constexpr bool kDebugPermutation = false;
       if (kDebugPermutation) {
-        string p;
+        std::string p;
         for (int i = 0; i < permutation.size(); ++i) {
           p.push_back('0' + permutation[i]);
         }
@@ -1070,7 +1070,7 @@ TEST_F(SignatureTest, OrderLinks) {
   }
 
   // How it was ordered in the original graph.
-  string before = sig_.ToString();
+  std::string before = sig_.ToString();
   // clang-format off
   EXPECT_THAT(before, Eq(
     "0:Mul[i0:o0:5][i0:o0:4][i0:o1:4][i0:o2:3][i0:o2:2][i0:o3:2],"
@@ -1084,7 +1084,7 @@ TEST_F(SignatureTest, OrderLinks) {
 
   OrderLinks(&sig_);
 
-  string after = sig_.ToString();
+  std::string after = sig_.ToString();
   // clang-format off
   EXPECT_THAT(after, Eq(
       "0:Mul[i0:o0:4][i0:o0:5][i0:o1:4][i0:o2:2][i0:o2:3][i0:o3:2],"
@@ -1132,7 +1132,7 @@ TEST_F(SignatureTest, ToString) {
     RefHashIsFinal(sig_.nodes[i]) = true;
   }
 
-  string result = sig_.ToString();
+  std::string result = sig_.ToString();
 
   // clang-format off
   ASSERT_THAT(result, Eq(
@@ -1151,14 +1151,14 @@ TEST_F(SignatureTest, Permutation) {
   std::vector<size_t> countdown;
   InitPermutation(5, &plain_permutation, &countdown);
 
-  std::set<string> results;
+  std::set<std::string> results;
 
   std::vector<size_t> permutation;
   do {
     BuildPermutation(plain_permutation, countdown, &permutation);
     EXPECT_THAT(permutation, SizeIs(5));
 
-    string p;
+    std::string p;
     for (int i = 0; i < permutation.size(); ++i) {
       p.push_back('0' + permutation[i]);
     }
diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph.cc b/tensorflow/core/grappler/graph_analyzer/subgraph.cc
index cfa26f243b20df..c08f23d97468cf 100644
--- a/tensorflow/core/grappler/graph_analyzer/subgraph.cc
+++ b/tensorflow/core/grappler/graph_analyzer/subgraph.cc
@@ -81,9 +81,9 @@ size_t Subgraph::Identity::Hash() const {
   return result;
 }
 
-string Subgraph::Dump() {
+std::string Subgraph::Dump() {
   // TODO(babkin): this is simplified for now.
-  std::vector<string> nodes;
+  std::vector<std::string> nodes;
   for (const auto& n : id_) {
     if (specific_) {
       nodes.emplace_back(absl::StrFormat("%s(%s)", n->opcode(), n->name()));
diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph.h b/tensorflow/core/grappler/graph_analyzer/subgraph.h
index 7d3494cdc43540..140d7d626d8030 100644
--- a/tensorflow/core/grappler/graph_analyzer/subgraph.h
+++ b/tensorflow/core/grappler/graph_analyzer/subgraph.h
@@ -64,7 +64,7 @@ class Subgraph {
   size_t Hash() const { return hash_; }
 
   // Dump the subgraph information to a string.
-  string Dump();
+  std::string Dump();
 
   // Extract this subgraph into a separate graph representation for signature
   // building, that includes only the links between the nodes in the subgraph
diff --git a/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc b/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc
index da29e6cff5d803..2d6849cafbcb57 100644
--- a/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc
+++ b/tensorflow/core/grappler/graph_analyzer/subgraph_test.cc
@@ -120,15 +120,15 @@ TEST(SubgraphTest, Iteration) {
   {
     SubgraphIterator sit(&sg);
     SubgraphIterator sit2(&sg);
-    std::vector<string> links;
+    std::vector<std::string> links;
     for (; !sit.AtEnd(); sit.Next()) {
       EXPECT_TRUE(sit == sit2);
       sit2.Next();
       EXPECT_FALSE(sit == sit2);
 
-      links.push_back(absl::StrFormat("[%s,%s,%s]", string(sit.GetPort()),
+      links.push_back(absl::StrFormat("[%s,%s,%s]", std::string(sit.GetPort()),
                                       sit.GetNeighbor().node->name(),
-                                      string(sit.GetNeighbor().port)));
+                                      std::string(sit.GetNeighbor().port)));
     }
     EXPECT_TRUE(sit == sit2);
 
diff --git a/tensorflow/core/grappler/graph_analyzer/test_tools.cc b/tensorflow/core/grappler/graph_analyzer/test_tools.cc
index fe24424d81cd1b..f9cc5cda65a40d 100644
--- a/tensorflow/core/grappler/graph_analyzer/test_tools.cc
+++ b/tensorflow/core/grappler/graph_analyzer/test_tools.cc
@@ -33,15 +33,15 @@ namespace test {
 
 //=== Helper methods to construct the nodes.
 
-NodeDef MakeNodeConst(const string& name) {
+NodeDef MakeNodeConst(const std::string& name) {
   NodeDef n;
   n.set_name(name);
   n.set_op("Const");
   return n;
 }
 
-NodeDef MakeNode2Arg(const string& name, const string& opcode,
-                     const string& arg1, const string& arg2) {
+NodeDef MakeNode2Arg(const std::string& name, const std::string& opcode,
+                     const std::string& arg1, const std::string& arg2) {
   NodeDef n;
   n.set_name(name);
   n.set_op(opcode);
@@ -50,9 +50,9 @@ NodeDef MakeNode2Arg(const string& name, const string& opcode,
   return n;
 }
 
-NodeDef MakeNode4Arg(const string& name, const string& opcode,
-                     const string& arg1, const string& arg2, const string& arg3,
-                     const string& arg4) {
+NodeDef MakeNode4Arg(const std::string& name, const std::string& opcode,
+                     const std::string& arg1, const std::string& arg2,
+                     const std::string& arg3, const std::string& arg4) {
   NodeDef n;
   n.set_name(name);
   n.set_op(opcode);
@@ -64,45 +64,47 @@ NodeDef MakeNode4Arg(const string& name, const string& opcode,
 }
 
 // Not really a 2-argument but convenient to construct.
-NodeDef MakeNodeShapeN(const string& name, const string& arg1,
-                       const string& arg2) {
+NodeDef MakeNodeShapeN(const std::string& name, const std::string& arg1,
+                       const std::string& arg2) {
   // This opcode is multi-input but not commutative.
   return MakeNode2Arg(name, "ShapeN", arg1, arg2);
 }
 
 // Not really a 2-argument but convenient to construct.
-NodeDef MakeNodeIdentityN(const string& name, const string& arg1,
-                          const string& arg2) {
+NodeDef MakeNodeIdentityN(const std::string& name, const std::string& arg1,
+                          const std::string& arg2) {
   // The argument is of a list type.
   return MakeNode2Arg(name, "IdentityN", arg1, arg2);
 }
 
-NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1,
-                                const string& arg2, const string& arg3,
-                                const string& arg4) {
+NodeDef MakeNodeQuantizedConcat(const std::string& name,
+                                const std::string& arg1,
+                                const std::string& arg2,
+                                const std::string& arg3,
+                                const std::string& arg4) {
   // This opcode has multiple multi-inputs.
   return MakeNode4Arg(name, "QuantizedConcat", arg1, arg2, arg3, arg4);
 }
 
 //=== Helper methods for analysing the structures.
 
-std::vector<string> DumpLinkMap(const GenNode::LinkMap& link_map) {
+std::vector<std::string> DumpLinkMap(const GenNode::LinkMap& link_map) {
   // This will order the entries first.
-  std::map<string, string> ordered;
+  std::map<std::string, std::string> ordered;
   for (const auto& link : link_map) {
-    string key = string(link.first);
+    std::string key = std::string(link.first);
 
     // Order the other sides too. They may be repeating, so store them
     // in a multiset.
-    std::multiset<string> others;
+    std::multiset<std::string> others;
     for (const auto& other : link.second) {
-      others.emplace(
-          absl::StrFormat("%s[%s]", other.node->name(), string(other.port)));
+      others.emplace(absl::StrFormat("%s[%s]", other.node->name(),
+                                     std::string(other.port)));
     }
     ordered[key] = absl::StrJoin(others, ", ");
   }
   // Now dump the result in a predictable order.
-  std::vector<string> result;
+  std::vector<std::string> result;
   result.reserve(ordered.size());
   for (const auto& link : ordered) {
     result.emplace_back(link.first + ": " + link.second);
@@ -110,7 +112,8 @@ std::vector<string> DumpLinkMap(const GenNode::LinkMap& link_map) {
   return result;
 }
 
-std::vector<string> DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map) {
+std::vector<std::string> DumpLinkHashMap(
+    const SigNode::LinkHashMap& link_hash_map) {
   // The entries in this map are ordered by hash value which might change
   // at any point. Re-order them by the link tag.
   std::map<SigNode::LinkTag, size_t> tags;
@@ -118,23 +121,24 @@ std::vector<string> DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map) {
     tags[entry.second.tag] = entry.first;
   }
 
-  std::vector<string> result;
+  std::vector<std::string> result;
   for (const auto& id : tags) {
     // For predictability, the nodes need to be sorted.
-    std::vector<string> nodes;
+    std::vector<std::string> nodes;
     for (const auto& peer : link_hash_map.at(id.second).peers) {
       nodes.emplace_back(peer->name());
     }
     std::sort(nodes.begin(), nodes.end());
-    result.emplace_back(string(id.first.local) + ":" + string(id.first.remote) +
-                        ": " + absl::StrJoin(nodes, ", "));
+    result.emplace_back(std::string(id.first.local) + ":" +
+                        std::string(id.first.remote) + ": " +
+                        absl::StrJoin(nodes, ", "));
   }
   return result;
 }
 
-std::vector<string> DumpHashedPeerVector(
+std::vector<std::string> DumpHashedPeerVector(
     const SigNode::HashedPeerVector& hashed_peers) {
-  std::vector<string> result;
+  std::vector<std::string> result;
 
   // Each subset of nodes with the same hash has to be sorted by name.
   // Other than that, the vector is already ordered by full tags.
diff --git a/tensorflow/core/grappler/graph_analyzer/test_tools.h b/tensorflow/core/grappler/graph_analyzer/test_tools.h
index e53c8e9b198cb0..89c6f146e0ab01 100644
--- a/tensorflow/core/grappler/graph_analyzer/test_tools.h
+++ b/tensorflow/core/grappler/graph_analyzer/test_tools.h
@@ -33,47 +33,49 @@ namespace test {
 
 //=== Helper methods to construct the nodes.
 
-NodeDef MakeNodeConst(const string& name);
+NodeDef MakeNodeConst(const std::string& name);
 
-NodeDef MakeNode2Arg(const string& name, const string& opcode,
-                     const string& arg1, const string& arg2);
+NodeDef MakeNode2Arg(const std::string& name, const std::string& opcode,
+                     const std::string& arg1, const std::string& arg2);
 
-NodeDef MakeNode4Arg(const string& name, const string& opcode,
-                     const string& arg1, const string& arg2, const string& arg3,
-                     const string& arg4);
+NodeDef MakeNode4Arg(const std::string& name, const std::string& opcode,
+                     const std::string& arg1, const std::string& arg2,
+                     const std::string& arg3, const std::string& arg4);
 
-inline NodeDef MakeNodeMul(const string& name, const string& arg1,
-                           const string& arg2) {
+inline NodeDef MakeNodeMul(const std::string& name, const std::string& arg1,
+                           const std::string& arg2) {
   return MakeNode2Arg(name, "Mul", arg1, arg2);
 }
 
 // Not really a 2-argument but convenient to construct.
-inline NodeDef MakeNodeAddN(const string& name, const string& arg1,
-                            const string& arg2) {
+inline NodeDef MakeNodeAddN(const std::string& name, const std::string& arg1,
+                            const std::string& arg2) {
   return MakeNode2Arg(name, "AddN", arg1, arg2);
 }
 
-inline NodeDef MakeNodeSub(const string& name, const string& arg1,
-                           const string& arg2) {
+inline NodeDef MakeNodeSub(const std::string& name, const std::string& arg1,
+                           const std::string& arg2) {
   return MakeNode2Arg(name, "Sub", arg1, arg2);
 }
 
 // Has 2 honest outputs.
-inline NodeDef MakeNodeBroadcastGradientArgs(const string& name,
-                                             const string& arg1,
-                                             const string& arg2) {
+inline NodeDef MakeNodeBroadcastGradientArgs(const std::string& name,
+                                             const std::string& arg1,
+                                             const std::string& arg2) {
   return MakeNode2Arg(name, "BroadcastGradientArgs", arg1, arg2);
 }
 
-NodeDef MakeNodeShapeN(const string& name, const string& arg1,
-                       const string& arg2);
+NodeDef MakeNodeShapeN(const std::string& name, const std::string& arg1,
+                       const std::string& arg2);
 
-NodeDef MakeNodeIdentityN(const string& name, const string& arg1,
-                          const string& arg2);
+NodeDef MakeNodeIdentityN(const std::string& name, const std::string& arg1,
+                          const std::string& arg2);
 
-NodeDef MakeNodeQuantizedConcat(const string& name, const string& arg1,
-                                const string& arg2, const string& arg3,
-                                const string& arg4);
+NodeDef MakeNodeQuantizedConcat(const std::string& name,
+                                const std::string& arg1,
+                                const std::string& arg2,
+                                const std::string& arg3,
+                                const std::string& arg4);
 
 //=== A container of pre-constructed graphs.
 
@@ -106,12 +108,13 @@ class TestGraphs {
 
 //=== Helper methods for analysing the structures.
 
-std::vector<string> DumpLinkMap(const GenNode::LinkMap& link_map);
+std::vector<std::string> DumpLinkMap(const GenNode::LinkMap& link_map);
 
 // Also checks for the consistency of hash values.
-std::vector<string> DumpLinkHashMap(const SigNode::LinkHashMap& link_hash_map);
+std::vector<std::string> DumpLinkHashMap(
+    const SigNode::LinkHashMap& link_hash_map);
 
-std::vector<string> DumpHashedPeerVector(
+std::vector<std::string> DumpHashedPeerVector(
     const SigNode::HashedPeerVector& hashed_peers);
 
 }  // end namespace test

From a72e016e020f84ebc1857ec96c8101ee7f4ccd06 Mon Sep 17 00:00:00 2001
From: Aleksei Nurmukhametov <anurmukh@amd.com>
Date: Wed, 17 Dec 2025 06:35:19 -0800
Subject: [PATCH 435/753] PR #35211: [ROCm] Reimplement register spilling
 detection

Imported from GitHub PR https://github.com/openxla/xla/pull/35211

Replace amd_comgr library with LLVM's native API to find NT_AMDGPU_METADATA note sections and extract the stack usage and register spill counts from there.

Add detection for dynamic stack usage.

Add VLOG(2) dumps for per-kernel stats as well as register counts.

Change the logic of discarding the module. The module is discarded only if the stack is used, i.e., either .private_segment_fixed_size is not zero or .uses_dynamic_stack is true. There are examples where there are SGPR spills, but they are saved to VGPRs and not to the stack.

Add tests in amdgpu_register_spilling_test.cc which cover cases where no spills, VGPR-only spills, SGPR-only spills, or dynamic stack usage occur. For that, the following LLVM IR inputs are added:
- amdgpu_no_spills.ll: Simple kernel with minimal register usage
- amdgpu_vgpr_spills.ll: High VGPR pressure with limited VGPRs (64)
- amdgpu_sgpr_spills.ll: High SGPR pressure with limited SGPRs (32)
- amdgpu_dynamic_stack.ll: Indirect function call requiring dynamic stack
Copybara import of the project:

--
b83efc6a7addcfe617459280d1cea22cd8d0c4c8 by Aleksei Nurmukhametov <anurmukh@amd.com>:

[ROCm] Reimplement register spilling detection

Replace amd_comgr library with LLVM's native API to find
NT_AMDGPU_METADATA note sections and extract the stack usage and
register spill counts from there.

Add detection for dynamic stack usage.

Add VLOG(2) dumps for per-kernel stats as well as register counts.

Change the logic of discarding the module. The module is discarded only
if the stack is used, i.e., either .private_segment_fixed_size is not
zero or .uses_dynamic_stack is true. There are examples where there are
SGPR spills, but they are saved to VGPRs and not to the stack.

Add tests in amdgpu_register_spilling_test.cc which cover cases where no
spills, VGPR-only spills, SGPR-only spills, or dynamic stack usage
occur. For that, the following LLVM IR inputs are added:
- amdgpu_no_spills.ll: Simple kernel with minimal register usage
- amdgpu_vgpr_spills.ll: High VGPR pressure with limited VGPRs (64)
- amdgpu_sgpr_spills.ll: High SGPR pressure with limited SGPRs (32)
- amdgpu_dynamic_stack.ll: Indirect function call requiring dynamic
  stack

Merging this change closes #35211

PiperOrigin-RevId: 845742402
---
 .../xla/third_party/gpus/rocm/BUILD.tpl       |   1 -
 .../xla/service/gpu/llvm_gpu_backend/BUILD    |  32 +-
 .../gpu/llvm_gpu_backend/amdgpu_backend.cc    | 369 ++++++++++++------
 .../amdgpu_register_spilling_test.cc          | 127 ++++++
 .../tests_data/amdgpu_dynamic_stack.ll        |  26 ++
 .../tests_data/amdgpu_no_spills.ll            |  29 ++
 .../tests_data/amdgpu_sgpr_spills.ll          | 166 ++++++++
 .../tests_data/amdgpu_vgpr_spills.ll          | 145 +++++++
 8 files changed, 776 insertions(+), 119 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc
 create mode 100644 third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_dynamic_stack.ll
 create mode 100644 third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_no_spills.ll
 create mode 100644 third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_sgpr_spills.ll
 create mode 100644 third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_vgpr_spills.ll

diff --git a/third_party/xla/third_party/gpus/rocm/BUILD.tpl b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
index f14780e2b4a194..de7d5421af6ffa 100644
--- a/third_party/xla/third_party/gpus/rocm/BUILD.tpl
+++ b/third_party/xla/third_party/gpus/rocm/BUILD.tpl
@@ -593,7 +593,6 @@ alias(
         threshold = 71000,
         value = rocm_version_number(),
     ),
-    visibility = ["//visibility:public"],
 )
 
 cc_library(
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
index f70483614dd8c9..fbd35f6609e597 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD
@@ -199,6 +199,7 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@llvm-project//llvm:AMDGPUAsmParser",  # buildcleaner: keep
         "@llvm-project//llvm:Analysis",
+        "@llvm-project//llvm:BinaryFormat",
         "@llvm-project//llvm:BitReader",
         "@llvm-project//llvm:BitWriter",
         "@llvm-project//llvm:CodeGen",
@@ -207,13 +208,12 @@ cc_library(
         "@llvm-project//llvm:Linker",
         "@llvm-project//llvm:MC",
         "@llvm-project//llvm:ObjCARC",  # buildcleaner: keep
+        "@llvm-project//llvm:Object",
         "@llvm-project//llvm:Passes",
         "@llvm-project//llvm:Scalar",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:Target",
         "@llvm-project//llvm:TargetParser",
-        "@local_config_rocm//rocm:amd_comgr",
-        "@local_config_rocm//rocm:rocm_headers",
         "@local_tsl//tsl/platform:path",
         "@local_tsl//tsl/platform:random",
         "@local_tsl//tsl/profiler/lib:traceme",
@@ -313,6 +313,34 @@ xla_cc_test(
     ],
 )
 
+xla_cc_test(
+    name = "amdgpu_register_spilling_test",
+    size = "small",
+    srcs = ["amdgpu_register_spilling_test.cc"],
+    data = [
+        "tests_data/amdgpu_dynamic_stack.ll",
+        "tests_data/amdgpu_no_spills.ll",
+        "tests_data/amdgpu_sgpr_spills.ll",
+        "tests_data/amdgpu_vgpr_spills.ll",
+    ],
+    tags = [
+        "gpu",
+        "rocm-only",
+    ],
+    deps = [
+        ":amdgpu_backend",
+        ":load_ir_module",
+        "//xla:xla_proto_cc",
+        "//xla/stream_executor:device_description",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_googletest//:gtest_main",
+        "@llvm-project//llvm:ir_headers",
+        "@local_tsl//tsl/platform:path",
+        "@local_tsl//tsl/platform:test",
+    ],
+)
+
 xla_cc_test(
     name = "load_ir_module_test",
     size = "small",
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc
index 536216735c9adf..fe72d821755106 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc
@@ -40,7 +40,6 @@ limitations under the License.
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
-#include "amd_comgr/amd_comgr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
@@ -48,6 +47,8 @@ limitations under the License.
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/CodeGen/CommandFlags.h"
@@ -63,9 +64,13 @@ limitations under the License.
 #include "llvm/InitializePasses.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/MC/TargetRegistry.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Passes/StandardInstrumentations.h"
+#include "llvm/Support/AMDGPUMetadata.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/FileSystem.h"
@@ -151,6 +156,232 @@ struct HsacoCache {
 
 static HsacoCache g_hsacoCache;  // NOLINT: static/global vars forbidden
 
+// Structure to hold register spilling and stack information from HSACO metadata
+struct RegisterSpillInfo {
+  uint64_t sgpr_spill_count = 0;
+  uint64_t vgpr_spill_count = 0;
+  uint64_t private_segment_size = 0;
+  bool uses_dynamic_stack = false;
+
+  bool HasSpilling() const {
+    return sgpr_spill_count > 0 || vgpr_spill_count > 0;
+  }
+
+  bool HasStackUsage() const {
+    return private_segment_size > 0 || uses_dynamic_stack;
+  }
+};
+
+// Parse NT_AMDGPU_METADATA note contents and extract register spill counts.
+// The metadata is in MessagePack format containing kernel information.
+RegisterSpillInfo ParseAMDGPUMetadataForSpills(llvm::StringRef metadata) {
+  RegisterSpillInfo spill_info;
+
+  // Parse the MsgPack metadata
+  llvm::msgpack::Document doc;
+  if (!doc.readFromBlob(metadata, /*Multi=*/false)) {
+    VLOG(2) << "Could not parse MsgPack metadata from NT_AMDGPU_METADATA note";
+    return spill_info;
+  }
+
+  llvm::msgpack::DocNode root = doc.getRoot();
+  if (!root.isMap()) {
+    VLOG(2) << "AMDGPU metadata root is not a map (unexpected format)";
+    return spill_info;
+  }
+
+  // Look for "amdhsa.kernels" array
+  llvm::msgpack::MapDocNode root_map = root.getMap();
+  auto kernels_it = root_map.find("amdhsa.kernels");
+
+  if (kernels_it == root_map.end() || !kernels_it->second.isArray()) {
+    VLOG(2) << "NT_AMDGPU_METADATA found but missing 'amdhsa.kernels' array";
+    return spill_info;
+  }
+
+  llvm::msgpack::ArrayDocNode kernels_array = kernels_it->second.getArray();
+
+  // Iterate through each kernel
+  for (auto& kernel_node : kernels_array) {
+    uint64_t kernel_sgpr_spill = 0;
+    uint64_t kernel_vgpr_spill = 0;
+    uint64_t kernel_sgpr_count = 0;
+    uint64_t kernel_vgpr_count = 0;
+    uint64_t kernel_private_size = 0;
+    bool kernel_uses_dynamic = false;
+
+    if (!kernel_node.isMap()) continue;
+
+    llvm::msgpack::MapDocNode kernel_map = kernel_node.getMap();
+
+    // Look for ".sgpr_spill_count"
+    auto sgpr_it = kernel_map.find(".sgpr_spill_count");
+    if (sgpr_it != kernel_map.end() &&
+        sgpr_it->second.getKind() == llvm::msgpack::Type::UInt) {
+      kernel_sgpr_spill = sgpr_it->second.getUInt();
+      spill_info.sgpr_spill_count =
+          std::max(spill_info.sgpr_spill_count, kernel_sgpr_spill);
+    }
+
+    // Look for ".vgpr_spill_count"
+    auto vgpr_it = kernel_map.find(".vgpr_spill_count");
+    if (vgpr_it != kernel_map.end() &&
+        vgpr_it->second.getKind() == llvm::msgpack::Type::UInt) {
+      kernel_vgpr_spill = vgpr_it->second.getUInt();
+      spill_info.vgpr_spill_count =
+          std::max(spill_info.vgpr_spill_count, kernel_vgpr_spill);
+    }
+
+    // Look for ".private_segment_fixed_size"
+    auto priv_it = kernel_map.find(".private_segment_fixed_size");
+    if (priv_it != kernel_map.end() &&
+        priv_it->second.getKind() == llvm::msgpack::Type::UInt) {
+      kernel_private_size = priv_it->second.getUInt();
+      spill_info.private_segment_size =
+          std::max(spill_info.private_segment_size, kernel_private_size);
+    }
+
+    // Look for ".uses_dynamic_stack"
+    auto dyn_it = kernel_map.find(".uses_dynamic_stack");
+    if (dyn_it != kernel_map.end() &&
+        dyn_it->second.getKind() == llvm::msgpack::Type::Boolean) {
+      kernel_uses_dynamic = dyn_it->second.getBool();
+      spill_info.uses_dynamic_stack =
+          spill_info.uses_dynamic_stack || kernel_uses_dynamic;
+    }
+
+    // Helper to get kernel name for logging (only when needed)
+    auto get_kernel_name = [&kernel_map]() -> std::string {
+      auto name_it = kernel_map.find(".name");
+      if (name_it != kernel_map.end() &&
+          name_it->second.getKind() == llvm::msgpack::Type::String) {
+        return name_it->second.getString().str();
+      }
+      return "unknown";
+    };
+
+    // Log per-kernel spill information with register usage
+    if (kernel_sgpr_spill > 0 || kernel_vgpr_spill > 0) {
+      // Look for ".sgpr_count" (total SGPRs used)
+      auto sgpr_count_it = kernel_map.find(".sgpr_count");
+      if (sgpr_count_it != kernel_map.end() &&
+          sgpr_count_it->second.getKind() == llvm::msgpack::Type::UInt) {
+        kernel_sgpr_count = sgpr_count_it->second.getUInt();
+      }
+
+      // Look for ".vgpr_count" (total VGPRs used)
+      auto vgpr_count_it = kernel_map.find(".vgpr_count");
+      if (vgpr_count_it != kernel_map.end() &&
+          vgpr_count_it->second.getKind() == llvm::msgpack::Type::UInt) {
+        kernel_vgpr_count = vgpr_count_it->second.getUInt();
+      }
+
+      VLOG(2) << "Kernel '" << get_kernel_name() << "' has register spilling: "
+              << "SGPR=" << kernel_sgpr_spill << ", VGPR=" << kernel_vgpr_spill
+              << ". Register count: SGPR=" << kernel_sgpr_count
+              << ", VGPR=" << kernel_vgpr_count;
+    }
+
+    // Log per-kernel stack usage
+    if (kernel_private_size > 0 || kernel_uses_dynamic) {
+      VLOG(2) << "Kernel '" << get_kernel_name() << "' stack usage: "
+              << "private=" << kernel_private_size
+              << ", dynamic=" << (kernel_uses_dynamic ? "true" : "false");
+    }
+  }
+
+  return spill_info;
+}
+
+// ELF note descriptor alignment per ELF specification
+constexpr int kElfNoteDescAlignment = 4;
+
+// Returns spill counts by parsing AMDGPU metadata from note sections of HSACO
+// ELF binary.
+//
+// HSACO file (ELF binary)
+//   -- .note section(s)
+//       -- ELF Note with type=NT_AMDGPU_METADATA
+//           -- MessagePack data
+//               -- Root map
+//                   -- "amdhsa.kernels" array
+//                       -- Each kernel object
+//                           - ".sgpr_spill_count"
+//                           - ".vgpr_spill_count"
+//                           - ... (other kernel properties)
+RegisterSpillInfo ExtractRegisterSpillingFromHsaco(
+    const std::vector<uint8_t>& hsaco) {
+  RegisterSpillInfo spill_info;
+
+  // Create memory buffer from HSACO data
+  std::unique_ptr<llvm::MemoryBuffer> mem_buffer =
+      llvm::MemoryBuffer::getMemBuffer(
+          llvm::StringRef(reinterpret_cast<const char*>(hsaco.data()),
+                          hsaco.size()),
+          "", /*RequiresNullTerminator=*/false);
+
+  // Parse as ELF object file
+  llvm::Expected<std::unique_ptr<llvm::object::ObjectFile>> obj_or_err =
+      llvm::object::ObjectFile::createObjectFile(mem_buffer->getMemBufferRef());
+
+  if (!obj_or_err) {
+    VLOG(2) << "Could not parse HSACO as ELF object file: "
+            << llvm::toString(obj_or_err.takeError());
+    return spill_info;
+  }
+
+  llvm::object::ObjectFile* obj = obj_or_err->get();
+
+  // Cast to ELF64LE object file (AMDGPU uses 64-bit little-endian ELF)
+  auto* elf_obj = llvm::dyn_cast<llvm::object::ELF64LEObjectFile>(obj);
+  if (!elf_obj) {
+    VLOG(2) << "HSACO is not a 64-bit little-endian ELF file";
+    return spill_info;
+  }
+
+  // Get the underlying ELFFile to access the notes() API
+  const auto& elf_file = elf_obj->getELFFile();
+
+  for (const auto& section : elf_obj->sections()) {
+    llvm::Expected<const typename llvm::object::ELF64LEObjectFile::Elf_Shdr*>
+        shdr_or_err = elf_obj->getSection(section.getRawDataRefImpl());
+
+    if (!shdr_or_err) {
+      continue;  // Skip sections we can't access
+    }
+
+    const auto* shdr = *shdr_or_err;
+
+    if (shdr->sh_type != llvm::ELF::SHT_NOTE) {
+      continue;
+    }
+
+    llvm::Error err = llvm::Error::success();
+    for (const auto& note : elf_file.notes(*shdr, err)) {
+      if (note.getType() == llvm::ELF::NT_AMDGPU_METADATA) {
+        llvm::StringRef metadata =
+            note.getDescAsStringRef(kElfNoteDescAlignment);
+
+        if (metadata.empty()) {
+          VLOG(2) << "Found NT_AMDGPU_METADATA note but it contains no data";
+          continue;
+        }
+
+        // Parse the metadata and extract spill counts, return immediately
+        return ParseAMDGPUMetadataForSpills(metadata);
+      }
+    }
+
+    if (err) {
+      VLOG(2) << "Error parsing notes: " << llvm::toString(std::move(err));
+    }
+  }
+
+  // If we reach here, no metadata was found
+  VLOG(2) << "No AMDGPU metadata found in HSACO";
+  return spill_info;
+}
+
 bool HsacoCache::Find(const std::string& ir, uint64_t& hash,
                       const std::string& gfx, std::vector<uint8_t>& hsaco) {
   absl::MutexLock lock(g_hsacoCache.mutex);
@@ -332,136 +563,42 @@ absl::StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
   hsaco_file.close();
 
   // Check for register spilling using HSACO metadata
-  // Use amd_comgr library for fast in-process metadata extraction
   VLOG(2) << "Checking for register spilling in: "
           << module->getModuleIdentifier();
 
-  bool has_spilling = false;
-  int sgpr_spill_count = 0;
-  int vgpr_spill_count = 0;
-  int private_segment_size = 0;
+  RegisterSpillInfo spill_info = ExtractRegisterSpillingFromHsaco(hsaco);
 
-  // Use already-loaded HSACO data for amd_comgr parsing
-  {
-    // Create amd_comgr data object from HSACO
-    amd_comgr_data_t comgr_data;
-    amd_comgr_status_t status =
-        amd_comgr_create_data(AMD_COMGR_DATA_KIND_EXECUTABLE, &comgr_data);
-
-    if (status == AMD_COMGR_STATUS_SUCCESS) {
-      status = amd_comgr_set_data(comgr_data, hsaco.size(),
-                                  reinterpret_cast<const char*>(hsaco.data()));
-
-      if (status == AMD_COMGR_STATUS_SUCCESS) {
-        // Get metadata from the executable
-        amd_comgr_metadata_node_t metadata;
-        status = amd_comgr_get_data_metadata(comgr_data, &metadata);
-
-        if (status == AMD_COMGR_STATUS_SUCCESS) {
-          // Helper lambda to lookup integer value from metadata map
-          auto lookup_int_value = [](amd_comgr_metadata_node_t root,
-                                     const char* key) -> int {
-            amd_comgr_metadata_node_t value_node;
-            amd_comgr_status_t s =
-                amd_comgr_metadata_lookup(root, key, &value_node);
-            if (s != AMD_COMGR_STATUS_SUCCESS) {
-              return 0;
-            }
-
-            size_t size = 0;
-            s = amd_comgr_get_metadata_string(value_node, &size, nullptr);
-            if (s != AMD_COMGR_STATUS_SUCCESS || size == 0) {
-              amd_comgr_destroy_metadata(value_node);
-              return 0;
-            }
-
-            std::string str_value(size, '\0');
-            s = amd_comgr_get_metadata_string(value_node, &size,
-                                              str_value.data());
-            amd_comgr_destroy_metadata(value_node);
-
-            if (s != AMD_COMGR_STATUS_SUCCESS) {
-              return 0;
-            }
-
-            // Parse the integer value
-            try {
-              return std::stoi(str_value);
-            } catch (...) {
-              return 0;
-            }
-          };
-
-          // Navigate to amdhsa.kernels array and check each kernel
-          amd_comgr_metadata_node_t kernels_node;
-          if (amd_comgr_metadata_lookup(metadata, "amdhsa.kernels",
-                                        &kernels_node) ==
-              AMD_COMGR_STATUS_SUCCESS) {
-            size_t kernel_count = 0;
-            amd_comgr_get_metadata_list_size(kernels_node, &kernel_count);
-
-            for (size_t i = 0; i < kernel_count; ++i) {
-              amd_comgr_metadata_node_t kernel_node;
-              if (amd_comgr_index_list_metadata(kernels_node, i,
-                                                &kernel_node) ==
-                  AMD_COMGR_STATUS_SUCCESS) {
-                // Get spill counts for this kernel
-                int kernel_sgpr_spill =
-                    lookup_int_value(kernel_node, ".sgpr_spill_count");
-                int kernel_vgpr_spill =
-                    lookup_int_value(kernel_node, ".vgpr_spill_count");
-                int kernel_private_size = lookup_int_value(
-                    kernel_node, ".private_segment_fixed_size");
-
-                // Aggregate max values across all kernels
-                sgpr_spill_count =
-                    std::max(sgpr_spill_count, kernel_sgpr_spill);
-                vgpr_spill_count =
-                    std::max(vgpr_spill_count, kernel_vgpr_spill);
-                private_segment_size =
-                    std::max(private_segment_size, kernel_private_size);
-
-                amd_comgr_destroy_metadata(kernel_node);
-              }
-            }
-            amd_comgr_destroy_metadata(kernels_node);
-          }
-
-          amd_comgr_destroy_metadata(metadata);
-        } else {
-          VLOG(2) << "Could not get HSACO metadata via amd_comgr";
-        }
-      }
-      amd_comgr_release_data(comgr_data);
-    } else {
-      VLOG(2) << "Could not create amd_comgr data object";
-    }
-
-    if (sgpr_spill_count > 0 || vgpr_spill_count > 0 ||
-        private_segment_size > 0) {
-      has_spilling = true;
-    }
+  if (spill_info.HasSpilling()) {
+    // We can have SGPR spills without stack being used. They are saved to
+    // VGPRs. In that case, we don't want to discard such kernel, so just
+    // report such cases.
+    VLOG(1) << "Register spilling (SGPR: " << spill_info.sgpr_spill_count
+            << ", VGPR: " << spill_info.vgpr_spill_count << ") detected in "
+            << module->getModuleIdentifier();
+  } else {
+    VLOG(2) << "No register spilling detected in "
+            << module->getModuleIdentifier();
   }
 
-  if (has_spilling) {
-    VLOG(0) << "====== REGISTER SPILLING DETECTED ======";
-    VLOG(0) << "Module: " << module->getModuleIdentifier();
-    VLOG(0) << "SGPR spill count: " << sgpr_spill_count;
-    VLOG(0) << "VGPR spill count: " << vgpr_spill_count;
-    VLOG(0) << "Private segment size: " << private_segment_size << " bytes";
-    VLOG(0) << "Performance may be degraded due to register pressure";
-    VLOG(0) << "========================================";
+  if (spill_info.HasStackUsage()) {
+    VLOG(1) << "Stack usage (private: " << spill_info.private_segment_size
+            << ", dynamic: "
+            << (spill_info.uses_dynamic_stack ? "true" : "false")
+            << ") detected in " << module->getModuleIdentifier();
 
     // Filter out kernels with register spilling during autotuning
     // This matches NVIDIA's behavior in ptx_compiler_impl.cc
     // TODO: remove ptx from xla_gpu_fail_ptx_compilation_on_register_spilling
     // to make the flag more general
     if (debug_options.xla_gpu_fail_ptx_compilation_on_register_spilling()) {
+      VLOG(0) << "Discard module " << module->getModuleIdentifier()
+              << " due register spilling or stack usage";
       return xla::Cancelled(
-          "Compilation result discarded due to register spilling");
+          "Compilation result discarded due to register spilling or stack "
+          "usage");
     }
   } else {
-    VLOG(2) << "No register spilling detected";
+    VLOG(2) << "No stack usage detected in " << module->getModuleIdentifier();
   }
 
   // Clean up temp files
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc
new file mode 100644
index 00000000000000..74b1c94feffa47
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc
@@ -0,0 +1,127 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <string>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h"
+#include "xla/service/gpu/llvm_gpu_backend/load_ir_module.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/xla.pb.h"
+#include "tsl/platform/path.h"
+#include "tsl/platform/test.h"
+
+namespace xla::gpu {
+namespace {
+
+namespace se = ::stream_executor;
+
+static std::string RemoveLLExtension(const std::string& filename) {
+  return filename.substr(0, filename.find(".ll"));
+}
+
+// Test parameter structure
+struct SpillingTestParam {
+  std::string ir_filename;         // IR file to compile
+  bool fail_on_spilling;           // Flag value
+  absl::StatusCode expected_code;  // Expected status code
+  std::string expected_substring;  // Expected substring in error (if any)
+};
+
+class AMDGPURegisterSpillingTest
+    : public ::testing::TestWithParam<SpillingTestParam> {
+ protected:
+  // Helper to load IR module from test data
+  std::unique_ptr<llvm::Module> LoadTestModule(llvm::LLVMContext* context,
+                                               const std::string& filename) {
+    return LoadIRModule(
+        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "service", "gpu",
+                          "llvm_gpu_backend", "tests_data", filename),
+        context);
+  }
+
+  // Helper to compile with given debug options
+  absl::StatusOr<std::vector<uint8_t>> CompileModule(
+      llvm::Module* module, const std::string& module_id,
+      bool fail_on_spilling) {
+    DebugOptions debug_options;
+    debug_options.set_xla_gpu_fail_ptx_compilation_on_register_spilling(
+        fail_on_spilling);
+
+    module->setModuleIdentifier(module_id);
+
+    return amdgpu::CompileToHsaco(
+        module, se::GpuComputeCapability{se::RocmComputeCapability{"gfx1100"}},
+        debug_options, module_id);
+  }
+};
+
+TEST_P(AMDGPURegisterSpillingTest, CompileTest) {
+  const SpillingTestParam& param = GetParam();
+  llvm::LLVMContext context;
+
+  auto module = LoadTestModule(&context, param.ir_filename);
+  ASSERT_NE(module, nullptr);
+
+  // Generate module ID from filename and flag state
+  std::string module_id =
+      RemoveLLExtension(param.ir_filename) +
+      (param.fail_on_spilling ? "_fail_on_spilling" : "_allow_spilling");
+
+  auto result = CompileModule(module.get(), module_id, param.fail_on_spilling);
+
+  EXPECT_EQ(result.status().code(), param.expected_code)
+      << "IR: " << param.ir_filename
+      << ", Flag: " << (param.fail_on_spilling ? "enabled" : "disabled")
+      << ", Status: " << result.status().message();
+
+  if (!param.expected_substring.empty()) {
+    EXPECT_THAT(result.status().message(),
+                ::testing::HasSubstr(param.expected_substring))
+        << "IR: " << param.ir_filename;
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    RegisterSpillingTests, AMDGPURegisterSpillingTest,
+    ::testing::Values(
+        SpillingTestParam{"amdgpu_no_spills.ll",
+                          /*fail_on_spilling=*/true, absl::StatusCode::kOk, ""},
+        SpillingTestParam{"amdgpu_vgpr_spills.ll",
+                          /*fail_on_spilling=*/false, absl::StatusCode::kOk,
+                          ""},
+        SpillingTestParam{"amdgpu_vgpr_spills.ll",
+                          /*fail_on_spilling=*/true,
+                          absl::StatusCode::kCancelled, "register spilling"},
+        SpillingTestParam{"amdgpu_sgpr_spills.ll",
+                          /*fail_on_spilling=*/false, absl::StatusCode::kOk,
+                          ""},
+        SpillingTestParam{"amdgpu_sgpr_spills.ll",
+                          /*fail_on_spilling=*/true, absl::StatusCode::kOk, ""},
+        SpillingTestParam{"amdgpu_dynamic_stack.ll",
+                          /*fail_on_spilling=*/true,
+                          absl::StatusCode::kCancelled, "stack usage"}),
+    [](const ::testing::TestParamInfo<SpillingTestParam>& info) {
+      return RemoveLLExtension(info.param.ir_filename) +
+             (info.param.fail_on_spilling ? "_fail_on_spilling"
+                                          : "_allow_spilling");
+    });
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_dynamic_stack.ll b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_dynamic_stack.ll
new file mode 100644
index 00000000000000..5a8b76446c3e55
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_dynamic_stack.ll
@@ -0,0 +1,26 @@
+; AMDGPU kernel with dynamic stack usage (indirect function call)
+; Based on real HIP code that uses function pointers
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+@__hip_cuid_40fa47637d275275 = addrspace(1) global i8 0
+
+@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_40fa47637d275275 to ptr)], section "llvm.metadata"
+
+; Kernel that uses indirect function call requiring dynamic stack
+define protected amdgpu_kernel void @_Z4TestPDF16bS_S_(ptr addrspace(1) noundef %dst.coerce, ptr addrspace(1) noundef %ptr1.coerce, ptr addrspace(1) noundef %ptr2.coerce) local_unnamed_addr {
+entry:
+  %0 = ptrtoint ptr addrspace(1) %dst.coerce to i64
+  %1 = inttoptr i64 %0 to ptr
+  %2 = ptrtoint ptr addrspace(1) %ptr1.coerce to i64
+  %3 = inttoptr i64 %2 to ptr
+  %4 = ptrtoint ptr addrspace(1) %ptr2.coerce to i64
+  %5 = inttoptr i64 %4 to ptr
+  %6 = tail call ptr asm "", "=s"() #1
+  tail call void %6(ptr noundef %1, ptr noundef %3, ptr noundef %5) #2
+  ret void
+}
+
+attributes #1 = { nounwind }
+attributes #2 = { nounwind }
+
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_no_spills.ll b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_no_spills.ll
new file mode 100644
index 00000000000000..4ab9829a36f90d
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_no_spills.ll
@@ -0,0 +1,29 @@
+; Simple AMDGPU kernel for testing register spilling detection
+; This module has no external dependencies and minimal module flags
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Simple kernel that adds two arrays
+define amdgpu_kernel void @simple_add(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c) {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidx = zext i32 %tid to i64
+
+  %a_ptr = getelementptr float, ptr addrspace(1) %a, i64 %tidx
+  %b_ptr = getelementptr float, ptr addrspace(1) %b, i64 %tidx
+  %c_ptr = getelementptr float, ptr addrspace(1) %c, i64 %tidx
+
+  %a_val = load float, ptr addrspace(1) %a_ptr, align 4
+  %b_val = load float, ptr addrspace(1) %b_ptr, align 4
+
+  %sum = fadd float %a_val, %b_val
+
+  store float %sum, ptr addrspace(1) %c_ptr, align 4
+  ret void
+}
+
+; Intrinsic declaration
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }
+
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_sgpr_spills.ll b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_sgpr_spills.ll
new file mode 100644
index 00000000000000..51dbc634d680c9
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_sgpr_spills.ll
@@ -0,0 +1,166 @@
+; AMDGPU kernel with high SGPR pressure to force scalar register spilling
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Kernel using many scalar operations with limited SGPRs
+; We use readfirstlane to force values into SGPRs
+define amdgpu_kernel void @sgpr_pressure(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidx = zext i32 %tid to i64
+
+  ; Load many scalar values from memory
+  ; Using readfirstlane forces values into SGPRs (uniform across wavefront)
+  %ptr0 = getelementptr i32, ptr addrspace(1) %in, i64 0
+  %v0_vec = load i32, ptr addrspace(1) %ptr0, align 4
+  %v0 = call i32 @llvm.amdgcn.readfirstlane(i32 %v0_vec)
+
+  %ptr1 = getelementptr i32, ptr addrspace(1) %in, i64 1
+  %v1_vec = load i32, ptr addrspace(1) %ptr1, align 4
+  %v1 = call i32 @llvm.amdgcn.readfirstlane(i32 %v1_vec)
+
+  %ptr2 = getelementptr i32, ptr addrspace(1) %in, i64 2
+  %v2_vec = load i32, ptr addrspace(1) %ptr2, align 4
+  %v2 = call i32 @llvm.amdgcn.readfirstlane(i32 %v2_vec)
+
+  %ptr3 = getelementptr i32, ptr addrspace(1) %in, i64 3
+  %v3_vec = load i32, ptr addrspace(1) %ptr3, align 4
+  %v3 = call i32 @llvm.amdgcn.readfirstlane(i32 %v3_vec)
+
+  %ptr4 = getelementptr i32, ptr addrspace(1) %in, i64 4
+  %v4_vec = load i32, ptr addrspace(1) %ptr4, align 4
+  %v4 = call i32 @llvm.amdgcn.readfirstlane(i32 %v4_vec)
+
+  %ptr5 = getelementptr i32, ptr addrspace(1) %in, i64 5
+  %v5_vec = load i32, ptr addrspace(1) %ptr5, align 4
+  %v5 = call i32 @llvm.amdgcn.readfirstlane(i32 %v5_vec)
+
+  %ptr6 = getelementptr i32, ptr addrspace(1) %in, i64 6
+  %v6_vec = load i32, ptr addrspace(1) %ptr6, align 4
+  %v6 = call i32 @llvm.amdgcn.readfirstlane(i32 %v6_vec)
+
+  %ptr7 = getelementptr i32, ptr addrspace(1) %in, i64 7
+  %v7_vec = load i32, ptr addrspace(1) %ptr7, align 4
+  %v7 = call i32 @llvm.amdgcn.readfirstlane(i32 %v7_vec)
+
+  %ptr8 = getelementptr i32, ptr addrspace(1) %in, i64 8
+  %v8_vec = load i32, ptr addrspace(1) %ptr8, align 4
+  %v8 = call i32 @llvm.amdgcn.readfirstlane(i32 %v8_vec)
+
+  %ptr9 = getelementptr i32, ptr addrspace(1) %in, i64 9
+  %v9_vec = load i32, ptr addrspace(1) %ptr9, align 4
+  %v9 = call i32 @llvm.amdgcn.readfirstlane(i32 %v9_vec)
+
+  %ptr10 = getelementptr i32, ptr addrspace(1) %in, i64 10
+  %v10_vec = load i32, ptr addrspace(1) %ptr10, align 4
+  %v10 = call i32 @llvm.amdgcn.readfirstlane(i32 %v10_vec)
+
+  %ptr11 = getelementptr i32, ptr addrspace(1) %in, i64 11
+  %v11_vec = load i32, ptr addrspace(1) %ptr11, align 4
+  %v11 = call i32 @llvm.amdgcn.readfirstlane(i32 %v11_vec)
+
+  %ptr12 = getelementptr i32, ptr addrspace(1) %in, i64 12
+  %v12_vec = load i32, ptr addrspace(1) %ptr12, align 4
+  %v12 = call i32 @llvm.amdgcn.readfirstlane(i32 %v12_vec)
+
+  %ptr13 = getelementptr i32, ptr addrspace(1) %in, i64 13
+  %v13_vec = load i32, ptr addrspace(1) %ptr13, align 4
+  %v13 = call i32 @llvm.amdgcn.readfirstlane(i32 %v13_vec)
+
+  %ptr14 = getelementptr i32, ptr addrspace(1) %in, i64 14
+  %v14_vec = load i32, ptr addrspace(1) %ptr14, align 4
+  %v14 = call i32 @llvm.amdgcn.readfirstlane(i32 %v14_vec)
+
+  %ptr15 = getelementptr i32, ptr addrspace(1) %in, i64 15
+  %v15_vec = load i32, ptr addrspace(1) %ptr15, align 4
+  %v15 = call i32 @llvm.amdgcn.readfirstlane(i32 %v15_vec)
+
+  ; Create many scalar computations - chain A
+  %a0 = add i32 %v0, %v1
+  %a1 = mul i32 %a0, %v2
+  %a2 = add i32 %a1, %v3
+  %a3 = mul i32 %a2, %v4
+  %a4 = add i32 %a3, %v5
+  %a5 = mul i32 %a4, %v6
+  %a6 = add i32 %a5, %v7
+  %a7 = mul i32 %a6, %v8
+  %a8 = add i32 %a7, %v9
+  %a9 = mul i32 %a8, %v10
+  %a10 = add i32 %a9, %v11
+  %a11 = mul i32 %a10, %v12
+  %a12 = add i32 %a11, %v13
+  %a13 = mul i32 %a12, %v14
+  %a14 = add i32 %a13, %v15
+
+  ; Chain B - reverse
+  %b0 = mul i32 %v15, %v14
+  %b1 = add i32 %b0, %v13
+  %b2 = mul i32 %b1, %v12
+  %b3 = add i32 %b2, %v11
+  %b4 = mul i32 %b3, %v10
+  %b5 = add i32 %b4, %v9
+  %b6 = mul i32 %b5, %v8
+  %b7 = add i32 %b6, %v7
+  %b8 = mul i32 %b7, %v6
+  %b9 = add i32 %b8, %v5
+  %b10 = mul i32 %b9, %v4
+  %b11 = add i32 %b10, %v3
+  %b12 = mul i32 %b11, %v2
+  %b13 = add i32 %b12, %v1
+  %b14 = mul i32 %b13, %v0
+
+  ; Chain C - subtraction
+  %c0 = sub i32 %v0, %v1
+  %c1 = mul i32 %c0, %v2
+  %c2 = sub i32 %c1, %v3
+  %c3 = mul i32 %c2, %v4
+  %c4 = sub i32 %c3, %v5
+  %c5 = mul i32 %c4, %v6
+  %c6 = sub i32 %c5, %v7
+  %c7 = mul i32 %c6, %v8
+  %c8 = sub i32 %c7, %v9
+  %c9 = mul i32 %c8, %v10
+  %c10 = sub i32 %c9, %v11
+  %c11 = mul i32 %c10, %v12
+  %c12 = sub i32 %c11, %v13
+  %c13 = mul i32 %c12, %v14
+  %c14 = sub i32 %c13, %v15
+
+  ; Chain D - cross dependencies
+  %d0 = add i32 %a0, %b0
+  %d1 = mul i32 %d0, %c0
+  %d2 = add i32 %a1, %b1
+  %d3 = mul i32 %d2, %c1
+  %d4 = add i32 %a2, %b2
+  %d5 = mul i32 %d4, %c2
+  %d6 = add i32 %a3, %b3
+  %d7 = mul i32 %d6, %c3
+  %d8 = add i32 %a4, %b4
+  %d9 = mul i32 %d8, %c4
+  %d10 = add i32 %a5, %b5
+  %d11 = mul i32 %d10, %c5
+  %d12 = add i32 %a6, %b6
+  %d13 = mul i32 %d12, %c6
+
+  ; Combine all chains
+  %r0 = add i32 %a14, %b14
+  %r1 = add i32 %r0, %c14
+  %r2 = add i32 %r1, %d1
+  %r3 = add i32 %r2, %d3
+  %r4 = add i32 %r3, %d5
+  %r5 = add i32 %r4, %d7
+  %r6 = add i32 %r5, %d9
+  %r7 = add i32 %r6, %d11
+  %result = add i32 %r7, %d13
+
+  %out_ptr = getelementptr i32, ptr addrspace(1) %out, i64 %tidx
+  store i32 %result, ptr addrspace(1) %out_ptr, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare i32 @llvm.amdgcn.readfirstlane(i32) #1
+
+; Limit SGPRs to 32, this should force SGPR spilling
+attributes #0 = { "amdgpu-num-sgpr"="32" "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_vgpr_spills.ll b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_vgpr_spills.ll
new file mode 100644
index 00000000000000..5634790c8e6eba
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/tests_data/amdgpu_vgpr_spills.ll
@@ -0,0 +1,145 @@
+; AMDGPU kernel with high register pressure to force spilling
+; This uses many vector operations to exhaust available VGPRs
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Kernel with many live values to force register spilling
+define amdgpu_kernel void @high_register_pressure(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tidx = zext i32 %tid to i64
+
+  ; Load many vectors from memory - using volatile to prevent optimization
+  %ptr0 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 %tidx
+  %v0 = load volatile <4 x float>, ptr addrspace(1) %ptr0, align 16
+
+  %ptr1 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 1
+  %v1 = load volatile <4 x float>, ptr addrspace(1) %ptr1, align 16
+
+  %ptr2 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 2
+  %v2 = load volatile <4 x float>, ptr addrspace(1) %ptr2, align 16
+
+  %ptr3 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 3
+  %v3 = load volatile <4 x float>, ptr addrspace(1) %ptr3, align 16
+
+  %ptr4 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 4
+  %v4 = load volatile <4 x float>, ptr addrspace(1) %ptr4, align 16
+
+  %ptr5 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 5
+  %v5 = load volatile <4 x float>, ptr addrspace(1) %ptr5, align 16
+
+  %ptr6 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 6
+  %v6 = load volatile <4 x float>, ptr addrspace(1) %ptr6, align 16
+
+  %ptr7 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 7
+  %v7 = load volatile <4 x float>, ptr addrspace(1) %ptr7, align 16
+
+  %ptr8 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 8
+  %v8 = load volatile <4 x float>, ptr addrspace(1) %ptr8, align 16
+
+  %ptr9 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 9
+  %v9 = load volatile <4 x float>, ptr addrspace(1) %ptr9, align 16
+
+  %ptr10 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 10
+  %v10 = load volatile <4 x float>, ptr addrspace(1) %ptr10, align 16
+
+  %ptr11 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 11
+  %v11 = load volatile <4 x float>, ptr addrspace(1) %ptr11, align 16
+
+  %ptr12 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 12
+  %v12 = load volatile <4 x float>, ptr addrspace(1) %ptr12, align 16
+
+  %ptr13 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 13
+  %v13 = load volatile <4 x float>, ptr addrspace(1) %ptr13, align 16
+
+  %ptr14 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 14
+  %v14 = load volatile <4 x float>, ptr addrspace(1) %ptr14, align 16
+
+  %ptr15 = getelementptr <4 x float>, ptr addrspace(1) %in, i64 15
+  %v15 = load volatile <4 x float>, ptr addrspace(1) %ptr15, align 16
+
+  ; Create many dependent calculations - chain A
+  %a0 = fadd <4 x float> %v0, %v1
+  %a1 = fmul <4 x float> %a0, %v2
+  %a2 = fadd <4 x float> %a1, %v3
+  %a3 = fmul <4 x float> %a2, %v4
+  %a4 = fadd <4 x float> %a3, %v5
+  %a5 = fmul <4 x float> %a4, %v6
+  %a6 = fadd <4 x float> %a5, %v7
+  %a7 = fmul <4 x float> %a6, %v8
+  %a8 = fadd <4 x float> %a7, %v9
+  %a9 = fmul <4 x float> %a8, %v10
+  %a10 = fadd <4 x float> %a9, %v11
+  %a11 = fmul <4 x float> %a10, %v12
+  %a12 = fadd <4 x float> %a11, %v13
+  %a13 = fmul <4 x float> %a12, %v14
+  %a14 = fadd <4 x float> %a13, %v15
+
+  ; Chain B - reverse direction
+  %b0 = fmul <4 x float> %v15, %v14
+  %b1 = fadd <4 x float> %b0, %v13
+  %b2 = fmul <4 x float> %b1, %v12
+  %b3 = fadd <4 x float> %b2, %v11
+  %b4 = fmul <4 x float> %b3, %v10
+  %b5 = fadd <4 x float> %b4, %v9
+  %b6 = fmul <4 x float> %b5, %v8
+  %b7 = fadd <4 x float> %b6, %v7
+  %b8 = fmul <4 x float> %b7, %v6
+  %b9 = fadd <4 x float> %b8, %v5
+  %b10 = fmul <4 x float> %b9, %v4
+  %b11 = fadd <4 x float> %b10, %v3
+  %b12 = fmul <4 x float> %b11, %v2
+  %b13 = fadd <4 x float> %b12, %v1
+  %b14 = fmul <4 x float> %b13, %v0
+
+  ; Chain C - subtraction chain
+  %c0 = fsub <4 x float> %v0, %v1
+  %c1 = fmul <4 x float> %c0, %v2
+  %c2 = fsub <4 x float> %c1, %v3
+  %c3 = fmul <4 x float> %c2, %v4
+  %c4 = fsub <4 x float> %c3, %v5
+  %c5 = fmul <4 x float> %c4, %v6
+  %c6 = fsub <4 x float> %c5, %v7
+  %c7 = fmul <4 x float> %c6, %v8
+  %c8 = fsub <4 x float> %c7, %v9
+  %c9 = fmul <4 x float> %c8, %v10
+  %c10 = fsub <4 x float> %c9, %v11
+  %c11 = fmul <4 x float> %c10, %v12
+  %c12 = fsub <4 x float> %c11, %v13
+  %c13 = fmul <4 x float> %c12, %v14
+  %c14 = fsub <4 x float> %c13, %v15
+
+  ; Chain D - cross dependencies
+  %d0 = fadd <4 x float> %a0, %b0
+  %d1 = fmul <4 x float> %d0, %c0
+  %d2 = fadd <4 x float> %a1, %b1
+  %d3 = fmul <4 x float> %d2, %c1
+  %d4 = fadd <4 x float> %a2, %b2
+  %d5 = fmul <4 x float> %d4, %c2
+  %d6 = fadd <4 x float> %a3, %b3
+  %d7 = fmul <4 x float> %d6, %c3
+  %d8 = fadd <4 x float> %a4, %b4
+  %d9 = fmul <4 x float> %d8, %c4
+  %d10 = fadd <4 x float> %a5, %b5
+  %d11 = fmul <4 x float> %d10, %c5
+
+  ; Final combination to keep all values live
+  %result0 = fadd <4 x float> %a14, %b14
+  %result1 = fadd <4 x float> %result0, %c14
+  %result2 = fadd <4 x float> %result1, %d1
+  %result3 = fadd <4 x float> %result2, %d3
+  %result4 = fadd <4 x float> %result3, %d5
+  %result5 = fadd <4 x float> %result4, %d7
+  %result6 = fadd <4 x float> %result5, %d9
+  %result = fadd <4 x float> %result6, %d11
+
+  %out_ptr = getelementptr <4 x float>, ptr addrspace(1) %out, i64 %tidx
+  store <4 x float> %result, ptr addrspace(1) %out_ptr, align 16
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+; Limit VGPRs to 64 to force spilling
+attributes #0 = { "amdgpu-num-vgpr"="64" "amdgpu-flat-work-group-size"="1,256" }
+attributes #1 = { nounwind readnone speculatable }

From 3ecff135cca46ba19478718938c7448e457b9226 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 06:36:54 -0800
Subject: [PATCH 436/753] [XLA:GPU] Precompute peer access capabilities in
 CudaExecutor initialization.

PiperOrigin-RevId: 845742912
---
 .../xla/stream_executor/cuda/cuda_executor.cc | 53 ++++++++-----------
 .../cuda/cuda_executor_multigpu_test.cc       | 16 +++++-
 2 files changed, 37 insertions(+), 32 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 1dbcf218156283..938f437ae57062 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -416,26 +416,6 @@ bool CanEnablePeerAccess(CUdevice from, CUdevice to) {
   return can_access_peer;
 }
 
-bool CanEnablePeerAccess(Context* from, Context* to) {
-  if (from == to) {
-    return true;  // A context can always access its own memory.
-  }
-
-  auto from_device = DeviceFromContext(from);
-  if (!from_device.ok()) {
-    LOG(ERROR) << "failed to resolve 'from' peer access context to a device: "
-               << from_device.status();
-    return false;
-  }
-  auto to_device = DeviceFromContext(to);
-  if (!to_device.ok()) {
-    LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
-               << to_device.status();
-    return false;
-  }
-  return CanEnablePeerAccess(from_device.value(), to_device.value());
-}
-
 absl::Status EnablePeerAccess(Context* from, Context* to) {
   if (from == to) {
     return absl::OkStatus();  // A context can always access its own
@@ -1082,6 +1062,17 @@ absl::Status CudaExecutor::Init() {
   if (numa_node_ == tsl::port::kNUMANoAffinity) {
     XLA_VLOG_DEVICE(2, device_ordinal()) << "Could not determine NUMA node";
   }
+
+  int cuda_device_count = 0;
+  TF_RETURN_IF_ERROR(cuda::ToStatus(cudaGetDeviceCount(&cuda_device_count)));
+  for (int i = 0; i < cuda_device_count; ++i) {
+    if (i == device_ordinal()) {
+      peer_access_cache_[i] = true;
+      continue;
+    }
+
+    peer_access_cache_[i] = CanEnablePeerAccess(device_, i);
+  }
   return absl::OkStatus();
 }
 
@@ -1603,27 +1594,27 @@ fft::FftSupport* CudaExecutor::AsFft() {
   return fft_.get();
 }
 
-// TODO(468297175): Precalculate peer access in stream executor constructor.
 bool CudaExecutor::CanEnablePeerAccessTo(StreamExecutor* other) {
   CudaExecutor* cuda_other = static_cast<CudaExecutor*>(other);
-  return CanEnablePeerAccess(cuda_context_, cuda_other->cuda_context_);
+  absl::StatusOr<int> to_device = DeviceFromContext(cuda_other->cuda_context_);
+  if (!to_device.ok()) {
+    LOG(ERROR) << "failed to resolve 'to' peer access context to a device: "
+               << to_device.status();
+    return false;
+  }
+  return CanEnablePeerAccessTo(*to_device);
 }
 
 bool CudaExecutor::CanEnablePeerAccessTo(int other_device_ordinal) {
-  if (other_device_ordinal == device_ordinal()) {
-    // Self-access is always allowed.
-    return true;
-  }
-
   auto it = peer_access_cache_.find(other_device_ordinal);
   if (it != peer_access_cache_.end()) {
     return it->second;
   }
 
-  const bool result =
-      CanEnablePeerAccess(device_ordinal(), other_device_ordinal);
-  peer_access_cache_[other_device_ordinal] = result;
-  return result;
+  LOG(WARNING) << "Attemping to enable peer access from: " << device_ordinal()
+               << " to: " << other_device_ordinal
+               << " which was not available during initialization.";
+  return false;
 }
 
 absl::Status CudaExecutor::EnablePeerAccessTo(StreamExecutor* other) {
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
index 9ad1336dc5343f..6d9d181703312f 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
@@ -81,6 +81,21 @@ StreamExecutor* GetGpuExecutor(int64_t device_ordinal) {
   return platform->ExecutorForDevice(device_ordinal).value();
 }
 
+TEST(CudaExecutorMultiGpuTest, PeerAccess) {
+  std::vector<CudaExecutor*> executors = {
+      static_cast<CudaExecutor*>(GetGpuExecutor(0)),
+      static_cast<CudaExecutor*>(GetGpuExecutor(1))};
+
+  if (!executors[0]->is_multicast_supported()) {
+    GTEST_SKIP() << "Test requires multicast support.";
+  }
+  EXPECT_TRUE(executors[0]->CanEnablePeerAccessTo(0));
+  EXPECT_TRUE(executors[0]->CanEnablePeerAccessTo(1));
+  EXPECT_TRUE(executors[1]->CanEnablePeerAccessTo(0));
+  EXPECT_TRUE(executors[1]->CanEnablePeerAccessTo(1));
+  EXPECT_FALSE(executors[0]->CanEnablePeerAccessTo(3));
+}
+
 TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryResubscriptionFails) {
   std::vector<CudaExecutor*> executors = {
       static_cast<CudaExecutor*>(GetGpuExecutor(0)),
@@ -130,7 +145,6 @@ TEST(CudaExecutorMultiGpuTest, CudaMulticastMemorySubscribeMoreDevices) {
   EXPECT_THAT(multicast_memory->SubscribeDevice(2),
               StatusIs(absl::StatusCode::kInvalidArgument,
                        "All devices are already subscribed."));
-  ;
 }
 
 TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryUsingNonVmmMemory) {

From cd5c0d06ba8ac08cdb97e1202762b825c6525bce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 06:45:08 -0800
Subject: [PATCH 437/753] Automated Code Change

PiperOrigin-RevId: 845745910
---
 .../gpu/host_offloading/gpu_host_offloading_allocator.cc     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/host_offloading/gpu_host_offloading_allocator.cc b/third_party/xla/xla/backends/gpu/host_offloading/gpu_host_offloading_allocator.cc
index cddd983bdaddbb..cc3d66dbff9d03 100644
--- a/third_party/xla/xla/backends/gpu/host_offloading/gpu_host_offloading_allocator.cc
+++ b/third_party/xla/xla/backends/gpu/host_offloading/gpu_host_offloading_allocator.cc
@@ -146,8 +146,9 @@ void* TransferBufferSubAllocator::Alloc(size_t alignment, size_t num_bytes,
     return nullptr;
   }
 
-  void* opaque = allocation.value()->opaque();
-  allocated_buffers_[allocation.value()->opaque()] = std::move(*allocation);
+  void* opaque = allocation.value()->address().opaque();
+  allocated_buffers_[allocation.value()->address().opaque()] =
+      std::move(*allocation);
   *bytes_received = num_bytes;
 
   return opaque;

From 44e7ac6a6bd57628ab96d4f5192b7d34d9a8d47f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 07:04:43 -0800
Subject: [PATCH 438/753] Automated Code Change

PiperOrigin-RevId: 845751970
---
 third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc  | 3 ++-
 .../xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc     | 6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index b0f25add8a9ff3..44841721f09a1f 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -3154,7 +3154,8 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCApiBuffer::CopyToMemorySpace(
         std::make_unique<PjRtCApiBuffer>(client_, args.dst_buffer));
   } else {
     // Copy across PjRtClients by copying through host
-    TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal, ToLiteralSync());
+    TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal,
+                        PjRtBuffer::ToLiteral().Await());
     absl::InlinedVector<int64_t, 4> byte_strides(
         literal->shape().dimensions().size());
     TF_RETURN_IF_ERROR(ShapeUtil::UnpackedByteStrides(
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
index cfe59018a80413..e9ac7f309dc6f3 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
@@ -125,7 +125,7 @@ TEST(PjRtCApiClientTest, FulfillAliasBuffer) {
   ASSERT_NE(alias_buffer.second, nullptr);
   TF_ASSERT_OK(std::move(alias_buffer.second)(result_buffer.get()));
   TF_ASSERT_OK_AND_ASSIGN(auto alias_literal,
-                          alias_buffer.first->ToLiteralSync());
+                          alias_buffer.first->ToLiteral().Await());
 
   // Expected result: data + 1
   EXPECT_TRUE(LiteralTestUtil::Equal(
@@ -370,7 +370,7 @@ TEST(PjRtClientTest, CreateViewAndCopyToDeviceAsyncExternalCpuOnly) {
       buffer->CopyToMemorySpace(client->memory_spaces()[1]));
   buffer.reset();
   ASSERT_TRUE(result);
-  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteralSync());
+  TF_ASSERT_OK_AND_ASSIGN(auto literal, result->ToLiteral().Await());
 
   std::vector<int32_t> expected(4, 0);
   EXPECT_TRUE(LiteralTestUtil::Equal(LiteralUtil::CreateR1<int32_t>(expected),
@@ -526,7 +526,7 @@ TEST(PjRtCApiClientTest, ForwardExecuteContext) {
   auto result = executable->Execute(/*argument_handles=*/{{}}, options);
 
   TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr<xla::Literal> result_literal,
-                          result->at(0).at(0)->ToLiteralSync());
+                          result->at(0).at(0)->ToLiteral().Await());
   EXPECT_TRUE(LiteralTestUtil::Equal(
       LiteralUtil::CreateR1<float>({42.0f, 42.0f, 42.0f, 42.0f}),
       *result_literal));

From 9c9429b376626d7cfc83e65420de467ffee90a57 Mon Sep 17 00:00:00 2001
From: Ashish Rao <asrao@nvidia.com>
Date: Wed, 17 Dec 2025 07:12:23 -0800
Subject: [PATCH 439/753] PR #35113: Enqueue cross-host send after send buffer
 definition events are recorded, not complete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35113

📝 Summary of Changes
This PR modifies `StreamExecutorGpuClient::ScheduleSendsOnLocalDevice` to enqueue cross-host sends as soon as the definition event of the buffer that needs to be sent has been recorded, instead of waiting for it to complete.

🎯 Justification
The original implementation blocks the execute thread until the send buffer is fully materialized before enqueuing the cross-host send. This prevents the execute thread from 'running ahead' and enqueuing multiple executables to launch on the device.

We can see this happening with [this toy program](https://gist.github.com/rao-ashish/0ecb3e3874328798e30412ab7e4870e3), which performs a matmul on devices 0-3 and transfers the result to devices 4-7. Without this fix, \~19 ms are spent blocking the execute thread between the `ncclGroupStart` and `ncclGroupEnd` for one batch of transfers:

<img width="2132" height="450" alt="cross_host_transfer_before_fix" src="https://github.com/user-attachments/assets/fff3f5c1-026f-4ead-9b64-837fc0f71d74" />

With this fix, this is reduced to \~88 us:

<img width="1195" height="636" alt="cross_host_transfer_after_fix" src="https://github.com/user-attachments/assets/b338632a-6a07-43dc-96f5-2ef26b12ef07" />

On this program, this ends up allowing us to launch matmul and data transfer kernels back-to-back, instead of incurring \~300 us of idle time on the device.

Profile screenshot before the fix, with the 300 us gap:
<img width="1587" height="76" alt="cross_host_transfer_device_before_fix" src="https://github.com/user-attachments/assets/3d4320cf-b922-463c-9b94-9e5462d9ed43" />

Profile screenshot after the fix, showing back-to-back launches:
<img width="2004" height="72" alt="cross_host_transfer_device_after_fix" src="https://github.com/user-attachments/assets/8a3d4463-afa7-449c-9f81-731f1b16993b" />

🚀 Kind of Contribution
🐛 Bug Fix

🧪 Unit Tests:
The previously added unit tests inside `se_gpu_pjrt_client_test.cc` continue to pass.

🧪 Execution Tests:
Verified that the implementation still works on these [four end-to-end tests](https://gist.github.com/rao-ashish/24ac0df0cb18243c649ac535964b31b8).
Copybara import of the project:

--
e9c4e7418b4ee22b43d60945fea0f2754b62873f by Ashish Rao <asrao@nvidia.com>:

Enqueue send after send buffer definition events are recorded, not when they are complete

Merging this change closes #35113

PiperOrigin-RevId: 845754674
---
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index be168f00082456..172806fbcb2128 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -793,7 +793,12 @@ void StreamExecutorGpuClient::ScheduleSendsOnLocalDevice(
     for (PreparedSend& prepared_send : prepared_sends) {
       // Wait until the buffer we want to send is fully materialized.
       for (const auto& event : prepared_send.definition_events_) {
-        tsl::BlockUntilReady(event.get());
+        if (event->IsType<BufferSequencingEvent>()) {
+          tsl::AsyncValueRef<BufferSequencingEvent> event_ref(event);
+          event_ref->WaitForEventOnStream(stream);
+        } else {
+          tsl::BlockUntilReady(event.get());
+        }
         if (auto* status = event->GetErrorIfPresent(); status != nullptr) {
           return *status;
         }

From f40e47ecab285fc5117a468e540ab09e4374b64d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 07:20:32 -0800
Subject: [PATCH 440/753] Automated Code Change

PiperOrigin-RevId: 845757249
---
 ...ht_outside_compilation_kernels_for_test.cc | 68 ++++++++++---------
 .../compiler/tf2xla/literal_util_test.cc      |  6 +-
 tensorflow/compiler/tf2xla/mlir_tf2xla.cc     |  2 +-
 .../compiler/tf2xla/xla_compilation_device.cc |  3 +-
 tensorflow/compiler/tf2xla/xla_compiler.h     |  2 +-
 5 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc
index dd493a5606b597..993b98e61dc0ed 100644
--- a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc
+++ b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc
@@ -64,14 +64,15 @@ class TestStaticTfOp : public OpKernel {
 
     // Just pass the value through.
     uint64_t size = input.AllocatedBytes();
-    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size};
+    stream_executor::DeviceAddressBase gpu_dst{out_tensor->data(), size};
     se::Stream* stream = ctx->op_device_context()->stream();
 
-    OP_REQUIRES_OK(ctx,
-                   stream->MemcpyD2D(
-                       /*gpu_dst=*/&gpu_dst,
-                       /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
-                       /*size=*/input.AllocatedBytes()));
+    OP_REQUIRES_OK(
+        ctx,
+        stream->MemcpyD2D(
+            /*gpu_dst=*/&gpu_dst,
+            /*gpu_src=*/stream_executor::DeviceAddressBase{input.data(), size},
+            /*size=*/input.AllocatedBytes()));
   }
 };
 
@@ -105,21 +106,23 @@ class TestStaticMultipleOutputTfOp : public OpKernel {
 
     // Just pass the value through.
     uint64_t size = input.AllocatedBytes();
-    se::DeviceMemoryBase gpu_dst1{out_tensor1->data(), size};
-    se::DeviceMemoryBase gpu_dst2{out_tensor2->data(), size};
+    stream_executor::DeviceAddressBase gpu_dst1{out_tensor1->data(), size};
+    stream_executor::DeviceAddressBase gpu_dst2{out_tensor2->data(), size};
     se::Stream* stream =
         ctx->device()->tensorflow_accelerator_device_info()->stream;
 
-    OP_REQUIRES_OK(ctx,
-                   stream->MemcpyD2D(
-                       /*gpu_dst=*/&gpu_dst1,
-                       /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
-                       /*size=*/input.AllocatedBytes()));
-    OP_REQUIRES_OK(ctx,
-                   stream->MemcpyD2D(
-                       /*gpu_dst=*/&gpu_dst2,
-                       /*gpu_src=*/se::DeviceMemoryBase{input.data(), size},
-                       /*size=*/input.AllocatedBytes()));
+    OP_REQUIRES_OK(
+        ctx,
+        stream->MemcpyD2D(
+            /*gpu_dst=*/&gpu_dst1,
+            /*gpu_src=*/stream_executor::DeviceAddressBase{input.data(), size},
+            /*size=*/input.AllocatedBytes()));
+    OP_REQUIRES_OK(
+        ctx,
+        stream->MemcpyD2D(
+            /*gpu_dst=*/&gpu_dst2,
+            /*gpu_src=*/stream_executor::DeviceAddressBase{input.data(), size},
+            /*size=*/input.AllocatedBytes()));
   }
 };
 
@@ -165,12 +168,12 @@ class TestDynamicTfOp : public OpKernel {
     se::Stream* stream =
         ctx->device()->tensorflow_accelerator_device_info()->stream;
 
-    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy};
+    stream_executor::DeviceAddressBase gpu_dst{out_tensor->data(), size_to_cpy};
     OP_REQUIRES_OK(ctx, stream->MemcpyD2D(
                             /*gpu_dst=*/&gpu_dst,
                             /*gpu_src=*/
-                            se::DeviceMemoryBase{input.data(),
-                                                 static_cast<uint64_t>(size)},
+                            stream_executor::DeviceAddressBase{
+                                input.data(), static_cast<uint64_t>(size)},
                             /*size=*/size_to_cpy));
   }
 
@@ -211,7 +214,7 @@ class DynamicMultidimOp : public OpKernel {
 
   void Compute(OpKernelContext* ctx) override {
     TensorShape output_shape;
-    auto vec = ctx->input(0).flat<int32>();
+    auto vec = ctx->input(0).flat<int32_t>();
     for (int i = 0; i < vec.size(); i++) {
       OP_REQUIRES_OK(ctx, output_shape.AddDimWithStatus(vec(i)));
     }
@@ -225,8 +228,8 @@ class DynamicMultidimOp : public OpKernel {
     for (int i = 0; i < output_shape.num_elements(); i++) {
       host_data[i] = 1.0;
     }
-    se::DeviceMemoryBase gpu_dst{out_tensor->data(),
-                                 static_cast<uint64_t>(num_elements)};
+    stream_executor::DeviceAddressBase gpu_dst{
+        out_tensor->data(), static_cast<uint64_t>(num_elements)};
 
     se::Stream* stream =
         ctx->device()->tensorflow_accelerator_device_info()->stream;
@@ -302,10 +305,10 @@ class TestTfMustBeConstantOp : public OpKernel {
     TF_CHECK_OK(ctx->allocate_temp(input.dtype(), input.shape(), &tmp,
                                    pinned_alloc_attrs));
 
-    OP_REQUIRES_OK(
-        ctx, stream->Memcpy(tmp.data(),
-                            se::DeviceMemoryBase{input.data(), allocated_size},
-                            allocated_size));
+    OP_REQUIRES_OK(ctx, stream->Memcpy(tmp.data(),
+                                       stream_executor::DeviceAddressBase{
+                                           input.data(), allocated_size},
+                                       allocated_size));
 
     OP_REQUIRES_OK(ctx, stream->BlockHostUntilDone());
 
@@ -316,8 +319,8 @@ class TestTfMustBeConstantOp : public OpKernel {
     Tensor* out_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output("output", ctx->input(0).shape(),
                                              &out_tensor));
-    se::DeviceMemoryBase gpu_dst{out_tensor->data(),
-                                 static_cast<uint64_t>(allocated_size)};
+    stream_executor::DeviceAddressBase gpu_dst{
+        out_tensor->data(), static_cast<uint64_t>(allocated_size)};
     OP_REQUIRES_OK(ctx, stream->Memcpy(&gpu_dst, tmp.data(), allocated_size));
   }
 };
@@ -361,11 +364,12 @@ class TestDynamicTfWithBoundOp : public OpKernel {
 
     se::Stream* stream =
         ctx->device()->tensorflow_accelerator_device_info()->stream;
-    se::DeviceMemoryBase gpu_dst{out_tensor->data(), size_to_cpy};
+    stream_executor::DeviceAddressBase gpu_dst{out_tensor->data(), size_to_cpy};
     OP_REQUIRES_OK(
         ctx, stream->MemcpyD2D(
                  /*gpu_dst=*/&gpu_dst,
-                 /*gpu_src=*/se::DeviceMemoryBase{input.data(), size_to_cpy},
+                 /*gpu_src=*/
+                 stream_executor::DeviceAddressBase{input.data(), size_to_cpy},
                  /*size=*/size_to_cpy));
   }
 
diff --git a/tensorflow/compiler/tf2xla/literal_util_test.cc b/tensorflow/compiler/tf2xla/literal_util_test.cc
index b7c9b5fd7bbf13..e8c7dc1a579b6b 100644
--- a/tensorflow/compiler/tf2xla/literal_util_test.cc
+++ b/tensorflow/compiler/tf2xla/literal_util_test.cc
@@ -52,9 +52,9 @@ TEST(LiteralUtil, LiteralToHostTensor) {
 template <class T>
 using LiteralUtilTest = ::testing::Test;
 using Types =
-    ::testing::Types<std::pair<int8, qint8>, std::pair<uint8, quint8>,
-                     std::pair<int16, qint16>, std::pair<uint16, quint16>,
-                     std::pair<int32, qint32>>;
+    ::testing::Types<std::pair<int8_t, qint8>, std::pair<uint8_t, quint8>,
+                     std::pair<int16_t, qint16>, std::pair<uint16_t, quint16>,
+                     std::pair<int32_t, qint32>>;
 
 TYPED_TEST_SUITE(LiteralUtilTest, Types);
 
diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc
index a9b2ead7b4d839..114905925cbf20 100644
--- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc
@@ -127,7 +127,7 @@ absl::Status ConvertGraphDefToXlaViaMlir(
   // with a placeholder node that contains a single output.
   FunctionLibraryDefinition flib_def(OpRegistry::Global(), graph_def.library());
   std::unique_ptr<Graph> graph(new Graph(flib_def));
-  std::unordered_map<string, string> feed_name_remap;
+  std::unordered_map<std::string, std::string> feed_name_remap;
   TF_RETURN_IF_ERROR(AddPlaceholdersForFeeds(config, graph->op_registry(),
                                              &feed_name_remap, &graph_def));
 
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index add79c369b69ef..e7925a011f9eb5 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -45,7 +45,8 @@ class XlaCompilationAllocator : public Allocator {
     // Regardless of the size requested, always allocates an XlaExpression.
     // Respects the alignment request because there is alignment checking even
     // for Tensors whose data is never accessed.
-    void* p = port::AlignedMalloc(sizeof(XlaExpression), alignment);
+    void* p = tsl::port::AlignedMalloc(
+        sizeof(XlaExpression), static_cast<std::align_val_t>(alignment));
     XlaExpression* expression = reinterpret_cast<XlaExpression*>(p);
     new (expression) XlaExpression();
     return expression;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 216125f9cb153e..b9abd5006a958a 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -226,7 +226,7 @@ class XlaCompiler {
     // This must be a shared_ptr, as this is passed all the way down to the
     // cluster compilation. This allows asynchronous compilation to hold a
     // reference until the compilation is finished.
-    std::shared_ptr<se::DeviceMemoryAllocator> device_allocator;
+    std::shared_ptr<stream_executor::DeviceAddressAllocator> device_allocator;
 
     // Alias input and output buffers for parameters that are passed-through XLA
     // modules without being changed.

From 114681a76c9723496bb7b7530ebcdf910cd528a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 07:22:41 -0800
Subject: [PATCH 441/753] Automated Code Change

PiperOrigin-RevId: 845757791
---
 .../distributed_runtime/rpc/grpc_channel.cc   | 74 ++++++++++---------
 .../distributed_runtime/rpc/grpc_channel.h    | 28 +++----
 .../rpc/grpc_channel_test.cc                  | 60 +++++++--------
 3 files changed, 83 insertions(+), 79 deletions(-)

diff --git a/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.cc b/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.cc
index b373d0a4a9bf0b..becff37233c630 100644
--- a/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.cc
+++ b/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.cc
@@ -46,20 +46,20 @@ namespace tsl {
 
 namespace {
 
-string MakeAddress(const string& job, int replica, int task) {
+std::string MakeAddress(const std::string& job, int replica, int task) {
   return strings::StrCat("/job:", job, "/replica:", replica, "/task:", task);
 }
 
 // Allows the host to be a raw IP (either v4 or v6).
-absl::Status ValidateHostPortPair(const string& host_port) {
-  string bns_prefix = "/bns/";
+absl::Status ValidateHostPortPair(const std::string& host_port) {
+  std::string bns_prefix = "/bns/";
   if (host_port.substr(0, bns_prefix.length()) == bns_prefix) {
     return absl::OkStatus();
   }
-  uint32 port;
+  uint32_t port;
   auto colon_index = host_port.find_last_of(':');
   if (!absl::SimpleAtoi(host_port.substr(colon_index + 1), &port) ||
-      host_port.substr(0, colon_index).find('/') != string::npos) {
+      host_port.substr(0, colon_index).find('/') != std::string::npos) {
     return absl::InvalidArgumentError(absl::StrCat(
         "Could not interpret \"", host_port, "\" as a host-port pair."));
   }
@@ -71,7 +71,7 @@ ::grpc::ChannelArguments* CreateDefaultChannelArguments() {
   const char* env = std::getenv("TF_GRPC_DEFAULT_OPTIONS");
   if (env != nullptr) {
     for (auto& grpc_option : absl::StrSplit(env, ',')) {
-      std::vector<string> name_value = absl::StrSplit(grpc_option, '=');
+      std::vector<std::string> name_value = absl::StrSplit(grpc_option, '=');
       if (name_value.size() != 2) {
         LOG(ERROR) << "Invalid GRPC options format: " << grpc_option;
         continue;
@@ -79,9 +79,10 @@ ::grpc::ChannelArguments* CreateDefaultChannelArguments() {
       VLOG(3) << "Setting GRPC default for '" << name_value[0] << "' to '"
               << name_value[1] << "'";
       if (name_value[1].size() >= 2 && name_value[1][0] == '"') {
-        string ue_value = name_value[1].substr(1, name_value[1].size() - 2);
-        string value;
-        string error;
+        std::string ue_value =
+            name_value[1].substr(1, name_value[1].size() - 2);
+        std::string value;
+        std::string error;
         if (!absl::CUnescape(ue_value, &value, &error)) {
           LOG(ERROR) << "Failed to parse escaped string for " << grpc_option
                      << ": " << error;
@@ -111,7 +112,7 @@ const ::grpc::ChannelArguments* GetDefaultChannelArguments() {
 ::grpc::ChannelArguments GetChannelArguments(const RPCOptions* rpc_options) {
   // TODO(mrry): Implement secure channels.
   ::grpc::ChannelArguments args = *GetDefaultChannelArguments();
-  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
+  args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32_t>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
   args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 1000);
@@ -142,7 +143,7 @@ ::grpc::ChannelArguments GetChannelArguments(const RPCOptions* rpc_options) {
   return args;
 }
 
-absl::Status NewHostPortGrpcChannel(const string& target,
+absl::Status NewHostPortGrpcChannel(const std::string& target,
                                     const RPCOptions* rpc_options,
                                     SharedGrpcChannelPtr* channel_pointer) {
   // Minimally ensure that the target is valid
@@ -155,10 +156,11 @@ absl::Status NewHostPortGrpcChannel(const string& target,
 }
 
 ChannelCreationFunction ConvertToChannelCreationFunction(
-    const std::function<absl::Status(string, const RPCOptions*,
+    const std::function<absl::Status(std::string, const RPCOptions*,
                                      SharedGrpcChannelPtr*)>&
         new_channel_func_ptr) {
-  return [new_channel_func_ptr](const string& target) -> SharedGrpcChannelPtr {
+  return [new_channel_func_ptr](
+             const std::string& target) -> SharedGrpcChannelPtr {
     SharedGrpcChannelPtr channel_ptr;
     if (new_channel_func_ptr(target, /*rpc_options=*/nullptr, &channel_ptr)
             .ok()) {
@@ -170,7 +172,7 @@ ChannelCreationFunction ConvertToChannelCreationFunction(
 }
 
 absl::Status GrpcChannelSpec::AddHostPortsJob(
-    const string& job_id, const std::map<int, string>& host_ports) {
+    const std::string& job_id, const std::map<int, std::string>& host_ports) {
   if (!job_ids_.insert(job_id).second) {
     return absl::InvalidArgumentError(
         absl::StrCat("Duplicate job ID in cluster specification: ", job_id));
@@ -201,25 +203,25 @@ class MultiGrpcChannelCache : public CachingGrpcChannelCache {
     }
   }
 
-  void ListWorkers(std::vector<string>* workers) override {
+  void ListWorkers(std::vector<std::string>* workers) override {
     for (GrpcChannelCache* cache : caches_) {
       cache->ListWorkers(workers);
     }
   }
 
-  void ListWorkersInJob(const string& job_name,
-                        std::vector<string>* workers) override {
+  void ListWorkersInJob(const std::string& job_name,
+                        std::vector<std::string>* workers) override {
     for (GrpcChannelCache* cache : caches_) {
       cache->ListWorkersInJob(job_name, workers);
     }
   }
 
-  string TranslateTask(const string& target) override {
+  std::string TranslateTask(const std::string& target) override {
     absl::MutexLock l(mu_);  // could use reader lock
     GrpcChannelCache* cache = gtl::FindPtrOrNull(target_caches_, target);
     if (cache == nullptr) {
       for (GrpcChannelCache* c : caches_) {
-        string r = c->TranslateTask(target);
+        std::string r = c->TranslateTask(target);
         if (!r.empty()) {
           target_caches_.insert({target, c});
           cache = c;
@@ -233,7 +235,7 @@ class MultiGrpcChannelCache : public CachingGrpcChannelCache {
   }
 
  protected:
-  SharedGrpcChannelPtr FindChannelOnce(const string& target) override {
+  SharedGrpcChannelPtr FindChannelOnce(const std::string& target) override {
     for (GrpcChannelCache* cache : caches_) {
       SharedGrpcChannelPtr ch(cache->FindWorkerChannel(target));
       if (ch) {
@@ -252,14 +254,14 @@ class MultiGrpcChannelCache : public CachingGrpcChannelCache {
   absl::Mutex mu_;
   // Cache of channels keyed by the target they are handling.
   // The same GrpcChannelCache can appear multiple times in the cache.
-  std::unordered_map<string, GrpcChannelCache*> target_caches_
+  std::unordered_map<std::string, GrpcChannelCache*> target_caches_
       TF_GUARDED_BY(mu_);
 };
 
 class SparseGrpcChannelCache : public CachingGrpcChannelCache {
  public:
-  SparseGrpcChannelCache(const string& job_id,
-                         const std::map<int, string>& host_ports,
+  SparseGrpcChannelCache(const std::string& job_id,
+                         const std::map<int, std::string>& host_ports,
                          ChannelCreationFunction channel_func,
                          int num_channels_per_target)
       : CachingGrpcChannelCache(num_channels_per_target),
@@ -270,7 +272,7 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   }
   ~SparseGrpcChannelCache() override {}
 
-  void ListWorkers(std::vector<string>* workers) override {
+  void ListWorkers(std::vector<std::string>* workers) override {
     workers->reserve(workers->size() + host_ports_.size());
     for (const auto& id_host_port : host_ports_) {
       std::vector<std::string> replicas =
@@ -282,14 +284,14 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
     }
   }
 
-  void ListWorkersInJob(const string& job_name,
-                        std::vector<string>* workers) override {
+  void ListWorkersInJob(const std::string& job_name,
+                        std::vector<std::string>* workers) override {
     if (job_name == job_id_) {
       ListWorkers(workers);
     }
   }
 
-  string TranslateTask(const string& target) override {
+  std::string TranslateTask(const std::string& target) override {
     DeviceNameUtils::ParsedName parsed;
     if (!DeviceNameUtils::ParseFullName(target, &parsed)) {
       LOG(WARNING) << "Invalid target: " << target;
@@ -319,8 +321,8 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   }
 
  protected:
-  SharedGrpcChannelPtr FindChannelOnce(const string& target) override {
-    const string host_port = TranslateTask(target);
+  SharedGrpcChannelPtr FindChannelOnce(const std::string& target) override {
+    const std::string host_port = TranslateTask(target);
     if (host_port.empty()) {
       return nullptr;
     }
@@ -332,19 +334,19 @@ class SparseGrpcChannelCache : public CachingGrpcChannelCache {
   }
 
  private:
-  string ToString() {
-    std::vector<string> task_strings;
+  std::string ToString() {
+    std::vector<std::string> task_strings;
     task_strings.reserve(host_ports_.size());
     for (const auto& id_host_port : host_ports_) {
       task_strings.emplace_back(
-          strings::StrCat(id_host_port.first, " -> ", id_host_port.second));
+          absl::StrCat(id_host_port.first, " -> ", id_host_port.second));
     }
-    return strings::StrCat(job_id_, " -> {", absl::StrJoin(task_strings, ", "),
-                           "}");
+    return absl::StrCat(job_id_, " -> {", absl::StrJoin(task_strings, ", "),
+                        "}");
   }
 
-  const string job_id_;
-  const std::map<int, string> host_ports_;
+  const std::string job_id_;
+  const std::map<int, std::string> host_ports_;
   const ChannelCreationFunction channel_func_;
   SparseGrpcChannelCache(const SparseGrpcChannelCache&) = delete;
   void operator=(const SparseGrpcChannelCache&) = delete;
diff --git a/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.h b/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.h
index e608b614704564..7611409f936424 100644
--- a/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.h
+++ b/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel.h
@@ -37,14 +37,15 @@ using tensorflow::RPCOptions;
 class GrpcChannelSpec {
  public:
   struct HostPortsJob {
-    HostPortsJob(const string& job_id, const std::map<int, string>& host_ports)
+    HostPortsJob(const std::string& job_id,
+                 const std::map<int, std::string>& host_ports)
         : job_id(job_id), host_ports(host_ports) {}
-    const string job_id;
-    const std::map<int, string> host_ports;
+    const std::string job_id;
+    const std::map<int, std::string> host_ports;
   };
 
-  absl::Status AddHostPortsJob(const string& job_id,
-                               const std::map<int, string>& host_ports);
+  absl::Status AddHostPortsJob(const std::string& job_id,
+                               const std::map<int, std::string>& host_ports);
 
   const std::vector<HostPortsJob>& host_ports_jobs() const {
     return host_ports_jobs_;
@@ -52,7 +53,7 @@ class GrpcChannelSpec {
 
  private:
   std::vector<HostPortsJob> host_ports_jobs_;
-  std::set<string> job_ids_;
+  std::set<std::string> job_ids_;
 };
 
 class GrpcChannelCache {
@@ -63,21 +64,22 @@ class GrpcChannelCache {
   // was created to handle.  Worker names are in the format
   //  /job:<job identifier>/task:<task id>
   // e.g. /job:mnist/task:2
-  virtual void ListWorkers(std::vector<string>* workers) = 0;
-  virtual void ListWorkersInJob(const string& job_name,
-                                std::vector<string>* workers) = 0;
+  virtual void ListWorkers(std::vector<std::string>* workers) = 0;
+  virtual void ListWorkersInJob(const std::string& job_name,
+                                std::vector<std::string>* workers) = 0;
 
   // If found, returns a gRPC channel that is connected to the remote
   // worker named by 'target'. 'target' is of the following
   // format: /job:<job identifier>/task:<task id>
   // E.g., /job:mnist/task:2
-  virtual SharedGrpcChannelPtr FindWorkerChannel(const string& target) = 0;
+  virtual SharedGrpcChannelPtr FindWorkerChannel(const std::string& target) = 0;
 
   // Translates a string in the form `/job:X/task:Z` into a host_port.
-  virtual string TranslateTask(const string& task) = 0;
+  virtual std::string TranslateTask(const std::string& task) = 0;
 };
 
-typedef std::function<SharedGrpcChannelPtr(string)> ChannelCreationFunction;
+typedef std::function<SharedGrpcChannelPtr(std::string)>
+    ChannelCreationFunction;
 
 GrpcChannelCache* NewGrpcChannelCache(
     const GrpcChannelSpec& channel_spec, ChannelCreationFunction channel_func,
@@ -92,7 +94,7 @@ ChannelCreationFunction ConvertToChannelCreationFunction(
                                      SharedGrpcChannelPtr*)>&
         new_channel_func_ptr);
 
-absl::Status NewHostPortGrpcChannel(const string& target,
+absl::Status NewHostPortGrpcChannel(const std::string& target,
                                     const RPCOptions* rpc_options,
                                     SharedGrpcChannelPtr* channel_pointer);
 
diff --git a/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel_test.cc b/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel_test.cc
index eae6d0a6c26169..5d4940cbccdfe0 100644
--- a/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/third_party/xla/xla/tsl/distributed_runtime/rpc/grpc_channel_test.cc
@@ -95,10 +95,10 @@ TEST(GrpcChannelTest, HostPorts) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkers(&workers);
     EXPECT_EQ(
-        std::vector<string>(
+        std::vector<std::string>(
             {"/job:mnist/replica:0/task:0", "/job:mnist/replica:0/task:1",
              "/job:mnist/replica:0/task:2", "/job:mnist/replica:0/task:3",
              "/job:mnist/replica:0/task:4", "/job:mnist/replica:0/task:5"}),
@@ -106,10 +106,10 @@ TEST(GrpcChannelTest, HostPorts) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("mnist", &workers);
     EXPECT_EQ(
-        std::vector<string>(
+        std::vector<std::string>(
             {"/job:mnist/replica:0/task:0", "/job:mnist/replica:0/task:1",
              "/job:mnist/replica:0/task:2", "/job:mnist/replica:0/task:3",
              "/job:mnist/replica:0/task:4", "/job:mnist/replica:0/task:5"}),
@@ -117,7 +117,7 @@ TEST(GrpcChannelTest, HostPorts) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("other", &workers);
     EXPECT_TRUE(workers.empty());
   }
@@ -179,25 +179,25 @@ TEST(GrpcChannelTest, HostPortsMultiChannelPerTarget) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkers(&workers);
-    EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
-                                   "/job:mnist/replica:0/task:1",
-                                   "/job:mnist/replica:0/task:2"}),
+    EXPECT_EQ(std::vector<std::string>({"/job:mnist/replica:0/task:0",
+                                        "/job:mnist/replica:0/task:1",
+                                        "/job:mnist/replica:0/task:2"}),
               workers);
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("mnist", &workers);
-    EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
-                                   "/job:mnist/replica:0/task:1",
-                                   "/job:mnist/replica:0/task:2"}),
+    EXPECT_EQ(std::vector<std::string>({"/job:mnist/replica:0/task:0",
+                                        "/job:mnist/replica:0/task:1",
+                                        "/job:mnist/replica:0/task:2"}),
               workers);
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("other", &workers);
     EXPECT_TRUE(workers.empty());
   }
@@ -262,10 +262,10 @@ TEST(GrpcChannelTest, HostPortsMultiGrpcMultiChannelPerTarget) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkers(&workers);
     EXPECT_EQ(
-        std::vector<string>(
+        std::vector<std::string>(
             {"/job:mnist/replica:0/task:0", "/job:mnist/replica:0/task:1",
              "/job:mnist/replica:0/task:2", "/job:mnist2/replica:0/task:0",
              "/job:mnist2/replica:0/task:1", "/job:mnist2/replica:0/task:2"}),
@@ -273,21 +273,21 @@ TEST(GrpcChannelTest, HostPortsMultiGrpcMultiChannelPerTarget) {
   }
 
   {
-    std::vector<string> workers, workers2;
+    std::vector<std::string> workers, workers2;
     cc->ListWorkersInJob("mnist", &workers);
-    EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
-                                   "/job:mnist/replica:0/task:1",
-                                   "/job:mnist/replica:0/task:2"}),
+    EXPECT_EQ(std::vector<std::string>({"/job:mnist/replica:0/task:0",
+                                        "/job:mnist/replica:0/task:1",
+                                        "/job:mnist/replica:0/task:2"}),
               workers);
     cc->ListWorkersInJob("mnist2", &workers2);
-    EXPECT_EQ(std::vector<string>({"/job:mnist2/replica:0/task:0",
-                                   "/job:mnist2/replica:0/task:1",
-                                   "/job:mnist2/replica:0/task:2"}),
+    EXPECT_EQ(std::vector<std::string>({"/job:mnist2/replica:0/task:0",
+                                        "/job:mnist2/replica:0/task:1",
+                                        "/job:mnist2/replica:0/task:2"}),
               workers2);
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("other", &workers);
     EXPECT_TRUE(workers.empty());
   }
@@ -332,17 +332,17 @@ TEST(GrpcChannelTest, SparseHostPorts) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkers(&workers);
     std::sort(workers.begin(), workers.end());
-    EXPECT_EQ(std::vector<string>({"/job:mnist/replica:0/task:0",
-                                   "/job:mnist/replica:0/task:3",
-                                   "/job:mnist/replica:0/task:4"}),
+    EXPECT_EQ(std::vector<std::string>({"/job:mnist/replica:0/task:0",
+                                        "/job:mnist/replica:0/task:3",
+                                        "/job:mnist/replica:0/task:4"}),
               workers);
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("mnist", &workers);
     EXPECT_EQ(std::vector<std::string>({"/job:mnist/replica:0/task:0",
                                         "/job:mnist/replica:0/task:3",
@@ -351,7 +351,7 @@ TEST(GrpcChannelTest, SparseHostPorts) {
   }
 
   {
-    std::vector<string> workers;
+    std::vector<std::string> workers;
     cc->ListWorkersInJob("other", &workers);
     EXPECT_TRUE(workers.empty());
   }

From df6b63e83e43e065fe90c639ce0f6754d5ab944c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 07:24:37 -0800
Subject: [PATCH 442/753] Automated Code Change

PiperOrigin-RevId: 845758361
---
 tensorflow/core/grappler/verifiers/graph_verifier.h           | 2 +-
 tensorflow/core/grappler/verifiers/structure_verifier.h       | 2 +-
 tensorflow/core/grappler/verifiers/structure_verifier_test.cc | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/grappler/verifiers/graph_verifier.h b/tensorflow/core/grappler/verifiers/graph_verifier.h
index 53d62e4c986d68..0e59d4ed3a28c7 100644
--- a/tensorflow/core/grappler/verifiers/graph_verifier.h
+++ b/tensorflow/core/grappler/verifiers/graph_verifier.h
@@ -41,7 +41,7 @@ class GraphVerifier {
   virtual ~GraphVerifier() {}
 
   // A name for the verifier.
-  virtual string name() const = 0;
+  virtual std::string name() const = 0;
 
   // Implement an algorithm to verify the specified graph.
   // The return value is a Status that represents a concatenation of Status of
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier.h b/tensorflow/core/grappler/verifiers/structure_verifier.h
index de77933fedac10..968f840b41c8a0 100644
--- a/tensorflow/core/grappler/verifiers/structure_verifier.h
+++ b/tensorflow/core/grappler/verifiers/structure_verifier.h
@@ -32,7 +32,7 @@ class StructureVerifier : public GraphVerifier {
   StructureVerifier() {}
   ~StructureVerifier() override {}
 
-  string name() const override { return "structure_verifier"; };
+  std::string name() const override { return "structure_verifier"; };
 
   absl::Status Verify(const GraphDef& graph) override;
 };
diff --git a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
index 562deb5367493c..d01a729d6c0796 100644
--- a/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
+++ b/tensorflow/core/grappler/verifiers/structure_verifier_test.cc
@@ -35,7 +35,7 @@ namespace {
 class StructureVerifierTest : public ::testing::Test {
  protected:
   StructureVerifierTest() { verifier_ = std::make_unique<StructureVerifier>(); }
-  void SetGraph(const string& gdef_ascii) {
+  void SetGraph(const std::string& gdef_ascii) {
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &graph_));
   }
   GraphDef graph_;

From a6d99fab4e6d6bfacd6b727967446c84fc0e6846 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 07:26:00 -0800
Subject: [PATCH 443/753] Automated Code Change

PiperOrigin-RevId: 845758758
---
 .../next_pluggable_device/next_pluggable_device.cc          | 3 ++-
 .../next_pluggable_device/next_pluggable_device.h           | 6 +++---
 .../next_pluggable_device/next_pluggable_device_factory.cc  | 4 ++--
 .../next_pluggable_device/next_pluggable_device_factory.h   | 2 +-
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc
index 79b1eebbb3c6c7..29c45068316914 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc
+++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.cc
@@ -44,7 +44,8 @@ namespace tensorflow {
 // TODO(chuanhao): implement an API to query device memory, and make
 // memory_limit a parameter instead of hard coding.
 static DeviceAttributes BuildNextPluggableDeviceAttributes(
-    const string& name_prefix, const string& device_name, int device_ordinal) {
+    const std::string& name_prefix, const std::string& device_name,
+    int device_ordinal) {
   return Device::BuildDeviceAttributes(
       absl::StrCat(name_prefix, "/device:", device_name, ":", device_ordinal),
       DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(),
diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h
index cb8ecf514101b0..8ad6c2051a87ac 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h
+++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device.h
@@ -36,13 +36,13 @@ class NextPluggableDevice : public PjRtBaseDevice {
  public:
   struct Options {
     // The device name's prefix (e.g., "/task:7")
-    string device_name_prefix;
+    std::string device_name_prefix;
 
     // The name of the  device (e.g., "GPU")
-    string device_name;
+    std::string device_name;
 
     // The name of the compilation device (e.g., "XLA_TPU_JIT");
-    string compilation_device_name;
+    std::string compilation_device_name;
 
     // The TfDeviceId.
     int device_ordinal = -1;
diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc
index 857d7f56a43355..f915ecdf47ce24 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc
+++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.cc
@@ -64,14 +64,14 @@ absl::StatusOr<xla::Shape> DeviceShapeRepresentation(
 }  // namespace
 
 absl::Status NextPluggableDeviceFactory::ListPhysicalDevices(
-    std::vector<string>* devices) {
+    std::vector<std::string>* devices) {
   TF_Status* c_status = TF_NewStatus();
   int32_t device_count = api_->TFNPD_GetDeviceCount(c_status);
   TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status));
   TF_DeleteStatus(c_status);
 
   for (int i = 0; i < device_count; ++i) {
-    const string device_name =
+    const std::string device_name =
         absl::StrCat("/physical_device:", device_type_, ":", i);
     devices->push_back(device_name);
   }
diff --git a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h
index 5ccfb6dd336848..f23e5cd00cd76d 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h
+++ b/tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h
@@ -36,7 +36,7 @@ class NextPluggableDeviceFactory : public DeviceFactory {
         device_type_(device_type),
         compilation_device_name_(compilation_device_name) {}
 
-  absl::Status ListPhysicalDevices(std::vector<string>* devices) override;
+  absl::Status ListPhysicalDevices(std::vector<std::string>* devices) override;
 
   absl::Status CreateDevices(
       const SessionOptions& session_options, const std::string& name_prefix,

From b295c98d0afad6ae7024227dd403764bc20cc115 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 07:28:19 -0800
Subject: [PATCH 444/753] Automated Code Change

PiperOrigin-RevId: 845759379
---
 third_party/xla/xla/service/gpu/BUILD                        | 5 +++++
 third_party/xla/xla/service/gpu/nvptx_alias_info_test.cc     | 2 --
 third_party/xla/xla/service/gpu/stream_executor_util_test.cc | 1 +
 third_party/xla/xla/service/gpu/thunk_emitter.cc             | 4 ++++
 4 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 79e13fb1726a28..5e47de0edd403d 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -468,6 +468,7 @@ cc_library(
         "//xla:status_macros",
         "//xla:util",
         "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
         "//xla/backends/gpu/codegen:fusion_emitter",
         "//xla/backends/gpu/codegen:fusions",
         "//xla/backends/gpu/codegen/llvm:llvm_emitter",
@@ -490,6 +491,7 @@ cc_library(
         "//xla/backends/gpu/runtime:command_buffer_cmd_emitter",
         "//xla/backends/gpu/runtime:command_buffer_thunk",
         "//xla/backends/gpu/runtime:conditional_thunk",
+        "//xla/backends/gpu/runtime:convolution_filter_thunk_proto_cc",
         "//xla/backends/gpu/runtime:convolution_reorder_thunk",
         "//xla/backends/gpu/runtime:convolution_thunk",
         "//xla/backends/gpu/runtime:copy_thunk",
@@ -526,6 +528,7 @@ cc_library(
         "//xla/backends/gpu/runtime:wait_for_streams_thunk",
         "//xla/backends/gpu/runtime:while_thunk",
         "//xla/codegen/emitters:kernel_arguments",
+        "//xla/core/host_offloading:host_offloading_executable_proto_cc",
         "//xla/ffi:attribute_map",
         "//xla/ffi:ffi_api",
         "//xla/ffi/api:c_api",
@@ -540,6 +543,7 @@ cc_library(
         "//xla/service:custom_call_status",
         "//xla/service:custom_call_target_registry",
         "//xla/service:hlo_creation_utils",
+        "//xla/service:hlo_proto_cc",
         "//xla/service:name_uniquer",
         "//xla/service:platform_util",
         "//xla/service/gpu/kernels:custom_kernel",
@@ -2713,6 +2717,7 @@ xla_cc_test(
         ":stream_executor_util",
         "//xla:autotuning_proto_cc",
         "//xla/service:hlo_module_config",
+        "//xla/tsl/protobuf:dnn_proto_cc",
         "//xla/tsl/util/proto:proto_utils",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_matchers",
diff --git a/third_party/xla/xla/service/gpu/nvptx_alias_info_test.cc b/third_party/xla/xla/service/gpu/nvptx_alias_info_test.cc
index 153e4b8676c970..1857526b3fc404 100644
--- a/third_party/xla/xla/service/gpu/nvptx_alias_info_test.cc
+++ b/third_party/xla/xla/service/gpu/nvptx_alias_info_test.cc
@@ -15,12 +15,10 @@ limitations under the License.
 
 #include "xla/service/gpu/nvptx_alias_info.h"
 
-#include <cstdint>
 #include <memory>
 #include <optional>
 
 #include "absl/log/check.h"
-#include "absl/log/log.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
diff --git a/third_party/xla/xla/service/gpu/stream_executor_util_test.cc b/third_party/xla/xla/service/gpu/stream_executor_util_test.cc
index 34cd3b7067eef4..cf22f97fea8042 100644
--- a/third_party/xla/xla/service/gpu/stream_executor_util_test.cc
+++ b/third_party/xla/xla/service/gpu/stream_executor_util_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/autotuning.pb.h"
 #include "xla/service/gpu/cublas_cudnn.h"
 #include "xla/service/hlo_module_config.h"
+#include "xla/tsl/protobuf/dnn.pb.h"
 #include "xla/tsl/util/proto/proto_utils.h"
 
 namespace xla::gpu {
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index d058a7276b1b96..72bc9096cc4d34 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -73,6 +73,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/command_buffer_cmd_emitter.h"
 #include "xla/backends/gpu/runtime/command_buffer_thunk.h"
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
+#include "xla/backends/gpu/runtime/convolution_filter_thunk.pb.h"
 #include "xla/backends/gpu/runtime/convolution_reorder_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_thunk.h"
 #include "xla/backends/gpu/runtime/copy_thunk.h"
@@ -108,6 +109,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/wait_for_streams_thunk.h"
 #include "xla/backends/gpu/runtime/while_thunk.h"
 #include "xla/codegen/emitters/kernel_arguments.h"
+#include "xla/core/host_offloading/host_offloading_executable.pb.h"
 #include "xla/ffi/attribute_map.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
@@ -146,6 +148,7 @@ limitations under the License.
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/service/gpu/transforms/collectives/collective_ops_utils.h"
 #include "xla/service/gpu/triton_call.h"
+#include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_creation_utils.h"
 #include "xla/service/llvm_ir/buffer_assignment_util.h"
 #include "xla/shape.h"
@@ -160,6 +163,7 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/protobuf/dnn.pb.h"
 #include "xla/util.h"
+#include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
 #include "tsl/platform/human_readable_json.h"

From 3b2c89f5ed2fe55c049b32fa77c8488a86d09ce4 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Wed, 17 Dec 2025 07:40:18 -0800
Subject: [PATCH 445/753] Always write a valid initial cache file when starting
 a cache build.

Before this change, if the XNNPack delegate is used but no operation is
delegated, then an invalid cache file is created.

PiperOrigin-RevId: 845763378
---
 tensorflow/lite/delegates/xnnpack/weight_cache.cc | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index d92060ad2357d2..4ceb2df985c989 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -120,12 +120,17 @@ bool WeightCacheBuilder::Start(const char* path, const FileDescriptor& fd) {
   XNNPackCacheHeader header{XNNPackCacheHeader::kInvalidHeader};
   header.buffer_list_offset = sizeof(header);
 
-  XNNPACK_RETURN_CHECK(fd_.Truncate(0), "could not truncate weight cache");
+  XNNPACK_RETURN_CHECK(fd_.Truncate(0), "could not truncate weight cache.");
+  XNNPACK_RETURN_CHECK(fd_.SetPos(0) == 0, "couldn't move to file start.");
   XNNPACK_RETURN_CHECK(fd_.Write(&header, sizeof(header)),
                        "could not write initial cache header in %s: %s.",
                        file_path_.c_str(), strerror(errno));
 
   schema_.base_offset = Align(sizeof(header), kMinAlignment);
+
+  XNNPACK_RETURN_CHECK(StartBuildStep(), "failed to start initial write step.");
+  XNNPACK_RETURN_CHECK(StopBuildStep(), "failed to write initial step.");
+
   return true;
 }
 

From b22ae073cde56e9043683784da17232238d1d8b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 08:23:47 -0800
Subject: [PATCH 446/753] Canonicalize convolutions with float inputs and
 non-float outputs by performing the convolution in F32 and casting the result
 back to the original output type.

PiperOrigin-RevId: 845778539
---
 .../xla/xla/hlo/transforms/expanders/BUILD    | 26 +++++
 .../convolution_type_canonicalizer.cc         | 62 ++++++++++++
 .../convolution_type_canonicalizer.h          | 44 +++++++++
 .../convolution_type_canonicalizer_test.cc    | 98 +++++++++++++++++++
 third_party/xla/xla/pjrt/interpreter/BUILD    |  1 +
 .../pjrt/interpreter/interpreter_client.cc    |  2 +
 6 files changed, 233 insertions(+)
 create mode 100644 third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.cc
 create mode 100644 third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.h
 create mode 100644 third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer_test.cc

diff --git a/third_party/xla/xla/hlo/transforms/expanders/BUILD b/third_party/xla/xla/hlo/transforms/expanders/BUILD
index 315802b86326c1..0e3524878643d0 100644
--- a/third_party/xla/xla/hlo/transforms/expanders/BUILD
+++ b/third_party/xla/xla/hlo/transforms/expanders/BUILD
@@ -638,6 +638,32 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "convolution_type_canonicalizer",
+    srcs = ["convolution_type_canonicalizer.cc"],
+    hdrs = ["convolution_type_canonicalizer.h"],
+    deps = [
+        ":op_expander_pass",
+        "//xla:shape_util",
+        "//xla/hlo/ir:hlo",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
+xla_cc_test(
+    name = "convolution_type_canonicalizer_test",
+    srcs = ["convolution_type_canonicalizer_test.cc"],
+    deps = [
+        ":convolution_type_canonicalizer",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/hlo/utils:hlo_matchers",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 xla_cc_test(
     name = "stochastic_convert_decomposer_test",
     srcs = ["stochastic_convert_decomposer_test.cc"],
diff --git a/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.cc b/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.cc
new file mode 100644
index 00000000000000..a2dd02ade08ed6
--- /dev/null
+++ b/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.cc
@@ -0,0 +1,62 @@
+// Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "xla/hlo/transforms/expanders/convolution_type_canonicalizer.h"
+
+#include "absl/status/statusor.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/primitive_util.h"
+#include "xla/shape_util.h"
+
+namespace xla {
+
+bool ConvolutionTypeCanonicalizer::InstructionMatchesPattern(
+    HloInstruction* instruction) {
+  return (instruction->opcode() == HloOpcode::kDot ||
+          instruction->opcode() == HloOpcode::kConvolution) &&
+         (primitive_util::IsFloatingPointType(
+              instruction->operand(0)->shape().element_type()) &&
+          primitive_util::IsFloatingPointType(
+              instruction->operand(1)->shape().element_type())) &&
+         primitive_util::IsIntegralType(instruction->shape().element_type());
+}
+
+absl::StatusOr<HloInstruction*> ConvolutionTypeCanonicalizer::ExpandInstruction(
+    HloInstruction* instruction) {
+  auto original_shape = instruction->shape();
+  auto new_shape = ShapeUtil::ChangeElementType(original_shape, F32);
+  HloInstruction* replacement_instruction;
+  if (instruction->opcode() == HloOpcode::kDot) {
+    replacement_instruction = instruction->parent()->AddInstruction(
+        HloInstruction::CreateDot(new_shape, instruction->mutable_operand(0),
+                                  instruction->mutable_operand(1),
+                                  instruction->dot_dimension_numbers(),
+                                  instruction->precision_config()));
+  } else {
+    replacement_instruction =
+        instruction->parent()->AddInstruction(HloInstruction::CreateConvolve(
+            new_shape, instruction->mutable_operand(0),
+            instruction->mutable_operand(1), instruction->feature_group_count(),
+            instruction->batch_group_count(), instruction->window(),
+            instruction->convolution_dimension_numbers(),
+            instruction->precision_config()));
+  }
+  HloInstruction* output_cast = instruction->parent()->AddInstruction(
+      HloInstruction::CreateConvert(original_shape, replacement_instruction));
+  return output_cast;
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.h b/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.h
new file mode 100644
index 00000000000000..6fea6e9b58a991
--- /dev/null
+++ b/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer.h
@@ -0,0 +1,44 @@
+// Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#ifndef XLA_HLO_TRANSFORMS_EXPANDERS_CONVOLUTION_TYPE_CANONICALIZER_H_
+#define XLA_HLO_TRANSFORMS_EXPANDERS_CONVOLUTION_TYPE_CANONICALIZER_H_
+
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/transforms/expanders/op_expander_pass.h"
+
+namespace xla {
+
+class ConvolutionTypeCanonicalizer : public OpExpanderPass {
+ public:
+  ConvolutionTypeCanonicalizer() = default;
+  absl::string_view name() const override {
+    return "ConvolutionTypeCanonicalizer";
+  }
+
+ private:
+  // Returns `true` if `instruction` should be expanded by this pass.
+  bool InstructionMatchesPattern(HloInstruction* instruction) override;
+  // Returns a replacement for `instruction`, or nullptr if no replacement is
+  // needed (e.g. only the to_apply subcomputation of the instruction was
+  // modified).
+  absl::StatusOr<HloInstruction*> ExpandInstruction(
+      HloInstruction* instruction) override;
+};
+
+}  // namespace xla
+#endif  // XLA_HLO_TRANSFORMS_EXPANDERS_CONVOLUTION_TYPE_CANONICALIZER_H_
diff --git a/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer_test.cc b/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer_test.cc
new file mode 100644
index 00000000000000..74092479fd07b8
--- /dev/null
+++ b/third_party/xla/xla/hlo/transforms/expanders/convolution_type_canonicalizer_test.cc
@@ -0,0 +1,98 @@
+// Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+
+#include "xla/hlo/transforms/expanders/convolution_type_canonicalizer.h"
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
+#include "xla/hlo/utils/hlo_matchers.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ConvolutionTypeCanonicalizerTest : public HloHardwareIndependentTestBase {
+};
+
+TEST_F(ConvolutionTypeCanonicalizerTest, DotBf16ToS32) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+ENTRY main {
+  p0 = bf16[10,10]{1,0} parameter(0)
+  p1 = bf16[10,10]{1,0} parameter(1)
+  ROOT dot = s32[10,10]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo_string));
+  ConvolutionTypeCanonicalizer pass;
+  ASSERT_OK_AND_ASSIGN(bool changed, pass.Run(module.get()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Convert(op::Dot(op::Parameter(0), op::Parameter(1))));
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              xla::testing::opcode_matchers::Shape("s32[10,10]{1,0}"));
+}
+
+TEST_F(ConvolutionTypeCanonicalizerTest, ConvolutionBf16ToS32) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+ENTRY main {
+  p0 = bf16[1,1024,1024,1]{3,2,1,0} parameter(0)
+  p1 = bf16[1,1,1,1]{3,2,1,0} parameter(1)
+  ROOT conv = s32[1,1024,1024,1]{3,2,1,0} convolution(p0, p1), window={size=1x1}, dim_labels=b01f_01io->b01f
+}
+)";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo_string));
+  ConvolutionTypeCanonicalizer pass;
+  ASSERT_OK_AND_ASSIGN(bool changed, pass.Run(module.get()));
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Convert(op::Convolution(op::Parameter(0), op::Parameter(1))));
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      xla::testing::opcode_matchers::Shape("s32[1,1024,1024,1]{3,2,1,0}"));
+}
+
+TEST_F(ConvolutionTypeCanonicalizerTest, NoChangeNeeded) {
+  absl::string_view hlo_string = R"(
+HloModule module
+
+ENTRY main {
+  p0 = f32[10,10]{1,0} parameter(0)
+  p1 = f32[10,10]{1,0} parameter(1)
+  ROOT dot = f32[10,10]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+}
+)";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo_string));
+  ConvolutionTypeCanonicalizer pass;
+  ASSERT_OK_AND_ASSIGN(bool changed, pass.Run(module.get()));
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/third_party/xla/xla/pjrt/interpreter/BUILD b/third_party/xla/xla/pjrt/interpreter/BUILD
index c8cb8170f82a97..2b8f71df9da43d 100644
--- a/third_party/xla/xla/pjrt/interpreter/BUILD
+++ b/third_party/xla/xla/pjrt/interpreter/BUILD
@@ -28,6 +28,7 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass_pipeline",
         "//xla/hlo/transforms/expanders:cholesky_expander",
+        "//xla/hlo/transforms/expanders:convolution_type_canonicalizer",
         "//xla/hlo/transforms/expanders:dynamic_index_splitter",
         "//xla/hlo/transforms/expanders:eigh_expander",
         "//xla/hlo/transforms/expanders:qr_expander",
diff --git a/third_party/xla/xla/pjrt/interpreter/interpreter_client.cc b/third_party/xla/xla/pjrt/interpreter/interpreter_client.cc
index ece4b088dda58c..757850fd2aa392 100644
--- a/third_party/xla/xla/pjrt/interpreter/interpreter_client.cc
+++ b/third_party/xla/xla/pjrt/interpreter/interpreter_client.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/pass/hlo_pass_pipeline.h"
 #include "xla/hlo/transforms/expanders/cholesky_expander.h"
+#include "xla/hlo/transforms/expanders/convolution_type_canonicalizer.h"
 #include "xla/hlo/transforms/expanders/dynamic_index_splitter.h"
 #include "xla/hlo/transforms/expanders/eigh_expander.h"
 #include "xla/hlo/transforms/expanders/qr_expander.h"
@@ -490,6 +491,7 @@ absl::StatusOr<std::unique_ptr<HloModule>> InterpreterClient::RunHloPasses(
       /*rewrite_grad_op=*/true);
   pipeline.AddPass<LayoutAssignment>(
       hlo_module->mutable_entry_computation_layout());
+  pipeline.AddPass<ConvolutionTypeCanonicalizer>();
 
   TF_RETURN_IF_ERROR(pipeline.Run(hlo_module.get()).status());
   return hlo_module;

From 95ef4892badd78a1480004bd918124e31a257321 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Wed, 17 Dec 2025 08:58:26 -0800
Subject: [PATCH 447/753] Remove unused gpu_types.h header and build target.

The gpu_types.h header is no longer included by any code and can be safely removed. This also allows for the removal of the associated build target.

PiperOrigin-RevId: 845790036
---
 third_party/xla/xla/service/gpu/BUILD         |  3 +-
 .../xla/xla/service/gpu/custom_call_test.cc   | 21 ++++---
 third_party/xla/xla/stream_executor/gpu/BUILD | 22 --------
 .../xla/xla/stream_executor/gpu/gpu_types.h   | 55 -------------------
 4 files changed, 11 insertions(+), 90 deletions(-)
 delete mode 100644 third_party/xla/xla/stream_executor/gpu/gpu_types.h

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 5e47de0edd403d..4796e6b3e31cbd 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -193,7 +193,7 @@ xla_test(
     name = "custom_call_test",
     srcs = ["custom_call_test.cc"],
     backends = ["gpu"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
+    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     tags = ["no-oneapi"],  # TODO(intel-tf): Remove it when macro substitutions for SYCL are available in xla/stream_executor/sycl/*.
     deps = [
         "//xla:debug_options_flags",
@@ -219,7 +219,6 @@ xla_test(
         "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream",
-        "//xla/stream_executor/gpu:gpu_types_header",
         "//xla/tests:client_library_test_runner_mixin",
         "//xla/tests:hlo_test_base",
         "//xla/tests:xla_internal_test_main",  # fixdeps: keep
diff --git a/third_party/xla/xla/service/gpu/custom_call_test.cc b/third_party/xla/xla/service/gpu/custom_call_test.cc
index 28ad1dd81231a8..4807574fcdbdb9 100644
--- a/third_party/xla/xla/service/gpu/custom_call_test.cc
+++ b/third_party/xla/xla/service/gpu/custom_call_test.cc
@@ -59,7 +59,6 @@ limitations under the License.
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/device_address.h"
-#include "xla/stream_executor/gpu/gpu_types.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/client_library_test_runner_mixin.h"
@@ -76,6 +75,7 @@ limitations under the License.
 #define gpuMemcpy cudaMemcpy
 #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStream CUstream
 #elif TENSORFLOW_USE_ROCM
 #define gpuSuccess hipSuccess
 #define gpuMemcpyAsync hipMemcpyAsync
@@ -83,6 +83,7 @@ limitations under the License.
 #define gpuMemcpy hipMemcpy
 #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStream hipStream_t
 #endif
 
 namespace xla {
@@ -123,8 +124,8 @@ struct TokenTestCase {
   std::string opaque;
 };
 
-void Callback_Tokens(se::gpu::GpuStreamHandle stream, void** buffers,
-                     const char* opaque, size_t opaque_len) {
+void Callback_Tokens(gpuStream stream, void** buffers, const char* opaque,
+                     size_t opaque_len) {
   for (int i = 0; i < opaque_len; ++i) {
     char c = opaque[i];
     ASSERT_TRUE(c == 'A' || c == 'T');
@@ -190,9 +191,8 @@ class CustomCallTokensTest
   }
 };
 
-void Callback_WithStatusSucceeded(se::gpu::GpuStreamHandle /*stream*/,
-                                  void** /*buffers*/, const char* /*opaque*/,
-                                  size_t /*opaque_len*/,
+void Callback_WithStatusSucceeded(gpuStream /*stream*/, void** /*buffers*/,
+                                  const char* /*opaque*/, size_t /*opaque_len*/,
                                   XlaCustomCallStatus* status) {
   XlaCustomCallStatusSetSuccess(status);
 }
@@ -210,9 +210,8 @@ TEST_F(CustomCallTest, WithStatusSucceeded) {
   TF_ASSERT_OK(ExecuteAndTransfer(&b, {}).status());
 }
 
-void Callback_WithStatusFailed(se::gpu::GpuStreamHandle /*stream*/,
-                               void** /*buffers*/, const char* /*opaque*/,
-                               size_t /*opaque_len*/,
+void Callback_WithStatusFailed(gpuStream /*stream*/, void** /*buffers*/,
+                               const char* /*opaque*/, size_t /*opaque_len*/,
                                XlaCustomCallStatus* status) {
   XlaCustomCallStatusSetFailure(status, "Failed", 6);
 }
@@ -875,8 +874,8 @@ TEST_F(CustomCallTest, AsyncCustomCalls) {
 
 class CustomCallHloTest : public HloTestBase {};
 
-void CallBack_AddOne(se::gpu::GpuStreamHandle stream, void** buffers,
-                     const char* /*opaque*/, size_t /*opaque_len*/) {
+void CallBack_AddOne(gpuStream stream, void** buffers, const char* /*opaque*/,
+                     size_t /*opaque_len*/) {
   // Expect that the input and output buffers are the same.
   if (buffers[0] != buffers[1]) {
     return;
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 0529fdcdcf0e89..bf76eb7cea13a4 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -5,10 +5,6 @@ load(
     "@local_config_rocm//rocm:build_defs.bzl",
     "if_rocm_is_configured",
 )
-load(
-    "@local_config_sycl//sycl:build_defs.bzl",
-    "if_sycl_is_configured",
-)
 load("//xla:xla.default.bzl", "xla_cc_test")
 load(
     "//xla/stream_executor:build_defs.bzl",
@@ -254,24 +250,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "gpu_types_header",
-    hdrs = ["gpu_types.h"],
-    defines = if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]) + if_sycl_is_configured([
-        "TENSORFLOW_USE_SYCL=1",
-    ]),
-    tags = ["gpu"],
-    deps = if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rocm_headers",
-    ]) + if_sycl_is_configured([
-        "@local_config_sycl//sycl:sycl_headers",
-    ]),
-)
-
 cc_library(
     name = "gpu_asm_opts",
     hdrs = ["gpu_asm_opts.h"],
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_types.h b/third_party/xla/xla/stream_executor/gpu/gpu_types.h
deleted file mode 100644
index 84c5d400c99106..00000000000000
--- a/third_party/xla/xla/stream_executor/gpu/gpu_types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2019 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// GPU (SYCL / ROCm / CUDA) specific type handle resolution
-
-#ifndef XLA_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
-#define XLA_STREAM_EXECUTOR_GPU_GPU_TYPES_H_
-
-#if TENSORFLOW_USE_SYCL
-
-#include "sycl/sycl.hpp"
-
-#elif TENSORFLOW_USE_ROCM
-
-#include "rocm/include/hip/hip_runtime.h"
-#include "rocm/include/hiprand/hiprand.h"
-
-#else  // CUDA
-
-#include "third_party/gpus/cuda/include/cuda.h"
-
-#endif
-
-namespace stream_executor {
-namespace gpu {
-
-#if TENSORFLOW_USE_SYCL
-
-using GpuStreamHandle = ::sycl::queue*;
-
-#elif TENSORFLOW_USE_ROCM
-
-using GpuStreamHandle = hipStream_t;
-#else  // CUDA
-
-using GpuStreamHandle = CUstream;
-
-#endif
-
-}  // namespace gpu
-}  // namespace stream_executor
-
-#endif  // XLA_STREAM_EXECUTOR_GPU_GPU_TYPES_H_

From 7375fe4a62ef9e3410e823eaa2c2c8e6ee32ce57 Mon Sep 17 00:00:00 2001
From: Jie Luo <jieluo@google.com>
Date: Wed, 17 Dec 2025 09:04:40 -0800
Subject: [PATCH 448/753] Automated Code Change

PiperOrigin-RevId: 845792411
---
 third_party/systemlibs/grpc.bazel.generate_cc.bzl | 2 +-
 third_party/systemlibs/grpc.bazel.protobuf.bzl    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/systemlibs/grpc.bazel.generate_cc.bzl b/third_party/systemlibs/grpc.bazel.generate_cc.bzl
index aa5d18eaa9a488..f396b1f853e71c 100644
--- a/third_party/systemlibs/grpc.bazel.generate_cc.bzl
+++ b/third_party/systemlibs/grpc.bazel.generate_cc.bzl
@@ -46,7 +46,7 @@ def generate_cc_impl(ctx):
     includes = [
         f
         for src in ctx.attr.srcs
-        for f in src[ProtoInfo].transitive_imports.to_list()
+        for f in src[ProtoInfo].transitive_sources.to_list()
     ]
     outs = []
     proto_root = get_proto_root(
diff --git a/third_party/systemlibs/grpc.bazel.protobuf.bzl b/third_party/systemlibs/grpc.bazel.protobuf.bzl
index cfb124ce43b1ef..9eeb4cb4475188 100644
--- a/third_party/systemlibs/grpc.bazel.protobuf.bzl
+++ b/third_party/systemlibs/grpc.bazel.protobuf.bzl
@@ -163,7 +163,7 @@ def includes_from_deps(deps):
     return [
         file
         for src in deps
-        for file in src[ProtoInfo].transitive_imports.to_list()
+        for file in src[ProtoInfo].transitive_sources.to_list()
     ]
 
 def get_proto_arguments(protos, genfiles_dir_path):

From e8cfd652f5d2dde56fe5e5042086fd91f5723152 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 09:11:20 -0800
Subject: [PATCH 449/753] [XLA:GPU] Temporary revert statically registered
 collectives allocators since they are breaking OSS JAX tests due dynamic
 linking.

PiperOrigin-RevId: 845795344
---
 .../xla/stream_executor/cuda/cuda_executor.cc | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 938f437ae57062..32d6ef67a058dd 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -1035,7 +1035,28 @@ CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
   }
 
   if (type == MemorySpace::kCollective) {
-    return CreateCollectiveMemoryAllocator(this, collective_allocator_type_);
+    // TODO(469289220): Use NCCL/NVSHMEM memory allocator here instead.
+    return std::make_unique<GenericMemoryAllocator>(
+        [this](uint64_t size)
+            -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
+          TF_ASSIGN_OR_RETURN(void* ptr, CollectiveMemoryAllocate(this, size));
+          XLA_VLOG_DEVICE(2, device_ordinal())
+              << "allocated " << ptr << " for context " << cuda_context_
+              << " of " << size << " bytes of collective memory";
+          return std::make_unique<GenericMemoryAllocation>(
+              ptr, size, [this](void* location, uint64_t size) {
+                auto status = CollectiveMemoryDeallocate(this, location);
+                if (!status.ok()) {
+                  XLA_LOG_DEVICE(ERROR, device_ordinal())
+                      << "failed to free collective memory at " << location
+                      << "; result: " << status;
+                } else {
+                  XLA_VLOG_DEVICE(2, device_ordinal())
+                      << "deallocated collective memory at " << location
+                      << " for context " << cuda_context_;
+                }
+              });
+        });
   }
 
   if (type == MemorySpace::kHost) {

From 55afa175b0d9970eaa3a6edfab27fd04d3bdf1dc Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Wed, 17 Dec 2025 09:48:33 -0800
Subject: [PATCH 450/753] [XLA:GPU] Add a utility to get GpuTargetConfig.

PiperOrigin-RevId: 845808764
---
 third_party/xla/xla/backends/gpu/BUILD        |   9 --
 .../backends/gpu/specs/gpu_target_config.cc   |  87 ++++++++++++++
 .../xla/xla/backends/gpu/target_config/BUILD  |  65 ++++++++++
 .../gpu/{specs => target_config}/README.md    |   0
 .../backends/gpu/target_config/build_defs.bzl | 112 ++++++++++++++++++
 .../specs/a100_pcie_80.txtpb                  |   0
 .../specs/a100_sxm_40.txtpb                   |   0
 .../specs/a100_sxm_80.txtpb                   |   0
 .../gpu/{ => target_config}/specs/a6000.txtpb |   0
 .../gpu/{ => target_config}/specs/b200.txtpb  |   0
 .../gpu/{ => target_config}/specs/b300.txtpb  |  28 ++---
 .../{ => target_config}/specs/h100_pcie.txtpb |   0
 .../{ => target_config}/specs/h100_sxm.txtpb  |   0
 .../gpu/{ => target_config}/specs/mi200.txtpb |   0
 .../gpu/{ => target_config}/specs/p100.txtpb  |   0
 .../gpu/{ => target_config}/specs/v100.txtpb  |   0
 .../gpu/target_config/target_config.cc        |  87 ++++++++++++++
 .../gpu/target_config/target_config.h         |  32 +++++
 .../gpu/target_config/target_config_test.cc   |  76 ++++++++++++
 third_party/xla/xla/lit.bzl                   |   8 +-
 third_party/xla/xla/service/BUILD             |  10 +-
 .../xla/xla/service/gpu/autotuning/BUILD      |  16 +--
 .../gpu/autotuning/autotune_cache_key_test.cc |   4 +-
 .../xla/service/gpu/gpu_spmd_pipeline_test.cc |   3 +-
 .../xla/service/gpu/tests/bitcast-convert.hlo |   2 +-
 .../service/gpu/tests/calling_convention.hlo  |   2 +-
 .../xla/xla/service/gpu/tests/dot_bf16.hlo    |   6 +-
 .../xla/service/gpu/tests/kernel_reuse.hlo    |   2 +-
 .../service/gpu/tests/offload_scan_output.hlo |   2 +-
 .../xla/service/gpu/tests/pad_to_static.hlo   |   2 +-
 .../service/gpu/tests/reduce-precision.hlo    |   2 +-
 .../gpu/tests/reduce_fold_zero_add.hlo        |   2 +-
 .../gpu/tests/rng_get_and_update_state.hlo    |   2 +-
 .../service/gpu/tests/single_instruction.hlo  |   6 +-
 .../service/gpu/tests/slice_to_dynamic.hlo    |   2 +-
 .../xla/xla/service/gpu/tests/sorting.hlo     |   2 +-
 .../gpu/tests/sub_byte_collectives.hlo        |   2 +-
 .../gpu/tests/triton_calling_convention.hlo   |   2 +-
 .../xla/service/gpu/tests/triton_naming.hlo   |   2 +-
 .../gpu/tests/zero_clamp_abs_index.hlo        |   2 +-
 .../xla/xla/service/gpu/transforms/BUILD      |   6 +-
 .../gpu/transforms/layout_assignment_a100.hlo |   2 +-
 .../gpu/transforms/layout_assignment_h100.hlo |   2 +-
 .../gpu/transforms/layout_assignment_v100.hlo |   2 +-
 third_party/xla/xla/stream_executor/gpu/BUILD |  13 +-
 .../gpu/gpu_device_info_test.cc               |   5 +-
 third_party/xla/xla/tools/BUILD               |   2 +-
 third_party/xla/xla/tools/hlo_opt/BUILD       |   2 +-
 .../xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo   |   2 +-
 .../tools/hlo_opt/tests/gpu_hlo_backend.hlo   |   2 +-
 .../tools/hlo_opt/tests/gpu_hlo_buffers.hlo   |   2 +-
 .../hlo_opt/tests/gpu_hlo_collective_cse.hlo  |   2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_html.hlo  |   4 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo  |   2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo  |   2 +-
 .../xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo   |   2 +-
 .../tests/gpu_hlo_unoptimized_llvm.hlo        |   2 +-
 .../xla/xla/tools/xla_gpu_compile_lib_test.cc |   5 +-
 58 files changed, 529 insertions(+), 107 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/specs/gpu_target_config.cc
 create mode 100644 third_party/xla/xla/backends/gpu/target_config/BUILD
 rename third_party/xla/xla/backends/gpu/{specs => target_config}/README.md (100%)
 create mode 100644 third_party/xla/xla/backends/gpu/target_config/build_defs.bzl
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/a100_pcie_80.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/a100_sxm_40.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/a100_sxm_80.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/a6000.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/b200.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/b300.txtpb (98%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/h100_pcie.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/h100_sxm.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/mi200.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/p100.txtpb (100%)
 rename third_party/xla/xla/backends/gpu/{ => target_config}/specs/v100.txtpb (100%)
 create mode 100644 third_party/xla/xla/backends/gpu/target_config/target_config.cc
 create mode 100644 third_party/xla/xla/backends/gpu/target_config/target_config.h
 create mode 100644 third_party/xla/xla/backends/gpu/target_config/target_config_test.cc

diff --git a/third_party/xla/xla/backends/gpu/BUILD b/third_party/xla/xla/backends/gpu/BUILD
index 385fb6153ff023..b0a85c6ca377fc 100644
--- a/third_party/xla/xla/backends/gpu/BUILD
+++ b/third_party/xla/xla/backends/gpu/BUILD
@@ -32,12 +32,3 @@ cc_library(
         "@com_google_absl//absl/base:core_headers",
     ],
 )
-
-filegroup(
-    name = "all_gpu_specs",
-    data = glob(["specs/*.txtpb"]),
-)
-
-exports_files(glob([
-    "specs/*.txtpb",
-]))
diff --git a/third_party/xla/xla/backends/gpu/specs/gpu_target_config.cc b/third_party/xla/xla/backends/gpu/specs/gpu_target_config.cc
new file mode 100644
index 00000000000000..056bbb9d6c1560
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/specs/gpu_target_config.cc
@@ -0,0 +1,87 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/specs/gpu_target_config.h"
+
+#include <string>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "google/protobuf/text_format.h"
+#include "xla/backends/gpu/specs/all_gpu_specs.h"
+#include "xla/stream_executor/device_description.pb.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla::gpu {
+namespace {
+
+absl::StatusOr<absl::string_view> GetEmbeddedGpuTargetConfigData(
+    const std::string& gpu_model) {
+  if (gpu_model == "a100_pcie_80") {
+    return get_a100_pcie_80();
+  }
+  if (gpu_model == "a100_sxm_40") {
+    return get_a100_sxm_40();
+  }
+  if (gpu_model == "a100_sxm_80") {
+    return get_a100_sxm_80();
+  }
+  if (gpu_model == "a6000") {
+    return get_a6000();
+  }
+  if (gpu_model == "b200") {
+    return get_b200();
+  }
+  if (gpu_model == "b300") {
+    return get_b300();
+  }
+  if (gpu_model == "h100_pcie") {
+    return get_h100_pcie();
+  }
+  if (gpu_model == "h100_sxm") {
+    return get_h100_sxm();
+  }
+  if (gpu_model == "mi200") {
+    return get_mi200();
+  }
+  if (gpu_model == "p100") {
+    return get_p100();
+  }
+  if (gpu_model == "v100") {
+    return get_v100();
+  }
+  return absl::NotFoundError(
+      absl::StrCat("Embedded file not found: ", gpu_model, ".txtpb"));
+}
+
+}  // namespace
+
+absl::StatusOr<stream_executor::GpuTargetConfigProto> GetGpuTargetConfig(
+    const std::string& gpu_model) {
+  TF_ASSIGN_OR_RETURN(absl::string_view gpu_spec,
+                      GetEmbeddedGpuTargetConfigData(gpu_model));
+
+  stream_executor::GpuTargetConfigProto config;
+  if (!google::protobuf::TextFormat::ParseFromString(std::string(gpu_spec), &config)) {
+    return absl::InternalError(absl::StrCat(
+        "Failed to parse GpuTargetConfigProto from embedded data for: ",
+        gpu_model));
+  }
+  return config;
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/target_config/BUILD b/third_party/xla/xla/backends/gpu/target_config/BUILD
new file mode 100644
index 00000000000000..251cf4717792c5
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/target_config/BUILD
@@ -0,0 +1,65 @@
+load("@bazel_skylib//:bzl_library.bzl", "bzl_library")
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
+load("//xla:xla.default.bzl", "xla_cc_test")
+load("//xla/backends/gpu/target_config:build_defs.bzl", "embed_files")
+
+package(
+    # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
+    default_visibility = [
+        "//xla:__subpackages__",
+    ],
+    licenses = ["notice"],
+)
+
+filegroup(
+    name = "all_gpu_specs",
+    data = glob(["specs/*.txtpb"]),
+)
+
+exports_files(glob([
+    "specs/*.txtpb",
+]))
+
+embed_files(
+    name = "embed_gpu_specs",
+    srcs = glob(["specs/*.txtpb"]),
+    cpp_namespace = "xla::gpu",
+)
+
+cc_library(
+    name = "target_config",
+    srcs = ["target_config.cc"],
+    hdrs = ["target_config.h"],
+    deps = [
+        ":embed_gpu_specs",
+        "//xla/stream_executor:device_description_proto_cc",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_protobuf//:protobuf",
+    ],
+)
+
+xla_cc_test(
+    name = "target_config_test",
+    srcs = ["target_config_test.cc"],
+    deps = [
+        ":target_config",
+        "//xla/stream_executor:device_description_proto_cc",
+        "//xla/tsl/platform:status_matchers",
+        "@com_google_absl//absl/status",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+bzl_library(
+    name = "build_defs_bzl",
+    srcs = ["build_defs.bzl"],
+    visibility = ["//visibility:private"],
+    deps = [
+        "//xla/tsl:package_groups_bzl",
+        "//xla/tsl:tsl_default_bzl",
+        "//xla/tsl/platform:rules_cc_bzl",
+    ],
+)
diff --git a/third_party/xla/xla/backends/gpu/specs/README.md b/third_party/xla/xla/backends/gpu/target_config/README.md
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/README.md
rename to third_party/xla/xla/backends/gpu/target_config/README.md
diff --git a/third_party/xla/xla/backends/gpu/target_config/build_defs.bzl b/third_party/xla/xla/backends/gpu/target_config/build_defs.bzl
new file mode 100644
index 00000000000000..1105d747dd961c
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/target_config/build_defs.bzl
@@ -0,0 +1,112 @@
+"""Contains embed_files build rule."""
+
+load("//xla/tsl:package_groups.bzl", "DEFAULT_LOAD_VISIBILITY")
+load("//xla/tsl:tsl.default.bzl", "get_compatible_with_portable")
+load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
+
+visibility(DEFAULT_LOAD_VISIBILITY)
+
+def embed_files(name, srcs, cpp_namespace = "", **kwargs):
+    """Compiles srcs into a cc_library with functions returning embedded file data.
+
+    Example:
+        embed_files(
+            name = "embed_some_file",
+            srcs = ["file1.txt", "file2.txt"],
+            cpp_namespace = "my_namespace",
+        )
+
+    will generate a cc_library with the following functions:
+
+        const std::string& get_file1();
+        const std::string& get_file2();
+
+    Args:
+        name: name for the generated cc_library target
+        srcs: files to embed
+        cpp_namespace: If set, the generated code will be wrapped in this namespace
+        **kwargs: keyword arguments passed onto the generated cc_library() rule.
+    """
+
+    namespace_open = ""
+    namespace_close = ""
+    if cpp_namespace:
+        namespace_open = "namespace " + cpp_namespace + " { "
+        namespace_close = "}  // namespace " + cpp_namespace + "\n"
+
+    native.genrule(
+        name = name + "_gen",
+        srcs = srcs,
+        outs = [
+            name + ".cc",
+            name + ".h",
+        ],
+        cmd = """
+            HDR_OUT=$(location {name}.h)
+            CC_OUT=$(location {name}.cc)
+            GUARD="{guard}"
+
+            # 1. Start Header File
+            echo "#ifndef $${{GUARD}}" > "$${{HDR_OUT}}"
+            echo "#define $${{GUARD}}" >> "$${{HDR_OUT}}"
+            echo "#include <string>" >> "$${{HDR_OUT}}"
+            echo "" >> "$${{HDR_OUT}}"
+            echo "{namespace_open}" >> "$${{HDR_OUT}}"
+
+            # 2. Start CC File
+            # Include standard headers FIRST to avoid namespace issues if header is malformed
+            echo "#include <cstddef>" > "$${{CC_OUT}}"
+            echo "#include <string>" >> "$${{CC_OUT}}"
+            echo '#include "{name}.h"' >> "$${{CC_OUT}}"
+            echo "" >> "$${{CC_OUT}}"
+            echo "{namespace_open}" >> "$${{CC_OUT}}"
+
+            # 3. Iterate over source files
+            for src in $(SRCS); do
+                # Extract filename without path
+                FILENAME=$$(basename "$${{src}}")
+                # Extract stem (filename without extension)
+                STEM=$$(echo "$${{FILENAME}}" | sed 's/\\.[^.]*$$//')
+                # Create C++ identifier safe names
+                SAFE_STEM=$$(echo "$${{STEM}}" | sed 's/[^a-zA-Z0-9_]/_/g')
+                FUNC_NAME="get_$${{SAFE_STEM}}"
+                VAR_NAME="$${{SAFE_STEM}}_data"
+
+                # Header: Add function declaration
+                echo "const std::string& $${{FUNC_NAME}}();" >> "$${{HDR_OUT}}"
+
+                # CC: Embed data using xxd
+                xxd -i "$${{src}}" | \
+                sed -e "s/^unsigned char [^[]*/static const unsigned char $${{VAR_NAME}}/" \
+                    -e "s/^unsigned int .*_len/static const size_t $${{VAR_NAME}}_size/" \
+                    >> "$${{CC_OUT}}"
+                echo "" >> "$${{CC_OUT}}"
+
+                # CC: Define the accessor function
+                echo "const std::string& $${{FUNC_NAME}}() {{" >> "$${{CC_OUT}}"
+                echo "  static const std::string* const kInstance = new std::string(" >> "$${{CC_OUT}}"
+                echo "      reinterpret_cast<const char*>($${{VAR_NAME}}), $${{VAR_NAME}}_size);" >> "$${{CC_OUT}}"
+                echo "  return *kInstance;" >> "$${{CC_OUT}}"
+                echo "}}" >> "$${{CC_OUT}}"
+                echo "" >> "$${{CC_OUT}}"
+            done
+
+            # 4. Finish Header File
+            echo "{namespace_close}" >> "$${{HDR_OUT}}"
+            echo "{namespace_close}" >> "$${{CC_OUT}}"
+            echo "#endif  // $${{GUARD}}" >> "$${{HDR_OUT}}"
+        """.format(
+            name = name,
+            guard = name.upper() + "_H_",
+            namespace_open = namespace_open,
+            namespace_close = namespace_close,
+        ),
+        compatible_with = get_compatible_with_portable(),
+    )
+
+    cc_library(
+        name = name,
+        srcs = [name + ".cc"],
+        hdrs = [name + ".h"],
+        **kwargs
+    )
diff --git a/third_party/xla/xla/backends/gpu/specs/a100_pcie_80.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/a100_pcie_80.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/a100_pcie_80.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/a100_pcie_80.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/a100_sxm_40.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/a100_sxm_40.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/a100_sxm_40.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/a100_sxm_40.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/a100_sxm_80.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/a100_sxm_80.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/a100_sxm_80.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/a100_sxm_80.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/a6000.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/a6000.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/a6000.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/a6000.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/b200.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/b200.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/b200.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/b200.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/b300.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/b300.txtpb
similarity index 98%
rename from third_party/xla/xla/backends/gpu/specs/b300.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/b300.txtpb
index 76d77b2cda6089..016292524680d4 100644
--- a/third_party/xla/xla/backends/gpu/specs/b300.txtpb
+++ b/third_party/xla/xla/backends/gpu/target_config/specs/b300.txtpb
@@ -1,17 +1,17 @@
-# Copyright 2025 The OpenXLA Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-{
+# Copyright 2025 The OpenXLA Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+gpu_device_info {
   threads_per_block_limit: 1024
   threads_per_warp: 32
   shared_memory_per_block: 49152
diff --git a/third_party/xla/xla/backends/gpu/specs/h100_pcie.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/h100_pcie.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/h100_pcie.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/h100_pcie.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/h100_sxm.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/h100_sxm.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/h100_sxm.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/h100_sxm.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/mi200.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/mi200.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/mi200.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/mi200.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/p100.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/p100.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/p100.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/p100.txtpb
diff --git a/third_party/xla/xla/backends/gpu/specs/v100.txtpb b/third_party/xla/xla/backends/gpu/target_config/specs/v100.txtpb
similarity index 100%
rename from third_party/xla/xla/backends/gpu/specs/v100.txtpb
rename to third_party/xla/xla/backends/gpu/target_config/specs/v100.txtpb
diff --git a/third_party/xla/xla/backends/gpu/target_config/target_config.cc b/third_party/xla/xla/backends/gpu/target_config/target_config.cc
new file mode 100644
index 00000000000000..778308bcbce48d
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/target_config/target_config.cc
@@ -0,0 +1,87 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/target_config/target_config.h"
+
+#include <string>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "google/protobuf/text_format.h"
+#include "xla/backends/gpu/target_config/embed_gpu_specs.h"
+#include "xla/stream_executor/device_description.pb.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla::gpu {
+namespace {
+
+absl::StatusOr<absl::string_view> GetEmbeddedGpuTargetConfigData(
+    const std::string& gpu_model) {
+  if (gpu_model == "a100_pcie_80") {
+    return get_a100_pcie_80();
+  }
+  if (gpu_model == "a100_sxm_40") {
+    return get_a100_sxm_40();
+  }
+  if (gpu_model == "a100_sxm_80") {
+    return get_a100_sxm_80();
+  }
+  if (gpu_model == "a6000") {
+    return get_a6000();
+  }
+  if (gpu_model == "b200") {
+    return get_b200();
+  }
+  if (gpu_model == "b300") {
+    return get_b300();
+  }
+  if (gpu_model == "h100_pcie") {
+    return get_h100_pcie();
+  }
+  if (gpu_model == "h100_sxm") {
+    return get_h100_sxm();
+  }
+  if (gpu_model == "mi200") {
+    return get_mi200();
+  }
+  if (gpu_model == "p100") {
+    return get_p100();
+  }
+  if (gpu_model == "v100") {
+    return get_v100();
+  }
+  return absl::NotFoundError(
+      absl::StrCat("Embedded file not found: ", gpu_model, ".txtpb"));
+}
+
+}  // namespace
+
+absl::StatusOr<stream_executor::GpuTargetConfigProto> GetGpuTargetConfig(
+    const std::string& gpu_model) {
+  TF_ASSIGN_OR_RETURN(absl::string_view gpu_spec,
+                      GetEmbeddedGpuTargetConfigData(gpu_model));
+
+  stream_executor::GpuTargetConfigProto config;
+  if (!google::protobuf::TextFormat::ParseFromString(std::string(gpu_spec), &config)) {
+    return absl::InternalError(absl::StrCat(
+        "Failed to parse GpuTargetConfigProto from embedded data for: ",
+        gpu_model));
+  }
+  return config;
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/target_config/target_config.h b/third_party/xla/xla/backends/gpu/target_config/target_config.h
new file mode 100644
index 00000000000000..eb4be618c2ce76
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/target_config/target_config.h
@@ -0,0 +1,32 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_BACKENDS_GPU_TARGET_CONFIG_TARGET_CONFIG_H_
+#define XLA_BACKENDS_GPU_TARGET_CONFIG_TARGET_CONFIG_H_
+
+#include <string>
+
+#include "absl/status/statusor.h"
+#include "xla/stream_executor/device_description.pb.h"
+
+namespace xla::gpu {
+
+// Returns the GpuTargetConfigProto for the given GPU model.
+absl::StatusOr<stream_executor::GpuTargetConfigProto> GetGpuTargetConfig(
+    const std::string& gpu_model);
+
+}  // namespace xla::gpu
+
+#endif  // XLA_BACKENDS_GPU_TARGET_CONFIG_TARGET_CONFIG_H_
diff --git a/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc b/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc
new file mode 100644
index 00000000000000..e94d8100f157f3
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/target_config/target_config.h"
+
+#include <string>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/status/status.h"
+#include "xla/stream_executor/device_description.pb.h"
+#include "xla/tsl/platform/status_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::testing::HasSubstr;
+using ::tsl::testing::IsOk;
+using ::tsl::testing::StatusIs;
+
+struct GpuTargetConfigTestCase {
+  std::string test_name;
+  std::string gpu_model;
+  bool expect_ok;
+};
+
+using GetGpuTargetConfigTest =
+    ::testing::TestWithParam<GpuTargetConfigTestCase>;
+
+TEST_P(GetGpuTargetConfigTest, TestProtoRetrieval) {
+  const GpuTargetConfigTestCase& test_case = GetParam();
+  auto config = GetGpuTargetConfig(test_case.gpu_model);
+
+  if (test_case.expect_ok) {
+    ASSERT_THAT(config, IsOk());
+    EXPECT_TRUE(config->has_gpu_device_info());
+    EXPECT_GT(config->gpu_device_info().threads_per_block_limit(), 0);
+  } else {
+    EXPECT_THAT(config, StatusIs(absl::StatusCode::kNotFound,
+                                 HasSubstr("Embedded file not found")));
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    GetGpuTargetConfigTests, GetGpuTargetConfigTest,
+    ::testing::ValuesIn<GpuTargetConfigTestCase>({
+        {"A100_PCIE_80", "a100_pcie_80", true},
+        {"A100_SXM_40", "a100_sxm_40", true},
+        {"A100_SXM_80", "a100_sxm_80", true},
+        {"A6000", "a6000", true},
+        {"B200", "b200", true},
+        {"B300", "b300", true},
+        {"H100_PCIE", "h100_pcie", true},
+        {"H100_SXM", "h100_sxm", true},
+        {"MI200", "mi200", true},
+        {"P100", "p100", true},
+        {"V100", "v100", true},
+        {"UnknownModel", "unknown_gpu", false},
+    }),
+    [](const ::testing::TestParamInfo<GetGpuTargetConfigTest::ParamType>&
+           info) { return info.param.test_name; });
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/lit.bzl b/third_party/xla/xla/lit.bzl
index 75bd84958410bd..6412cf872001fc 100644
--- a/third_party/xla/xla/lit.bzl
+++ b/third_party/xla/xla/lit.bzl
@@ -206,13 +206,7 @@ def lit_test_suite_for_gpus(
             "--param=GPU=%s" % (gpu),
         ]
         gpu_data = data + [
-            "//xla/backends/gpu:specs/a100_pcie_80.txtpb",
-            "//xla/backends/gpu:specs/a6000.txtpb",
-            "//xla/backends/gpu:specs/b200.txtpb",
-            "//xla/backends/gpu:specs/h100_sxm.txtpb",
-            "//xla/backends/gpu:specs/mi200.txtpb",
-            "//xla/backends/gpu:specs/p100.txtpb",
-            "//xla/backends/gpu:specs/v100.txtpb",
+            "//xla/backends/gpu/target_config:all_gpu_specs",
         ]
         lit_test_suite(
             "%s_%s" % (name, gpu),
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 681ca64d46e3e6..01f4c8fe62fd32 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -5817,34 +5817,34 @@ xla_aot_compile_cpu(
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu/target_config:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test.mlir",
 )
 
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_hlo",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu/target_config:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test.hlo",
 )
 
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_constant",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu/target_config:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test_constant.mlir",
 )
 
 xla_aot_compile_gpu(
     name = "xla_aot_compile_test_gpu_executable_convolution",
     autotune_results = "xla_aot_compile_test_autotune_results.txtpb",
-    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu/target_config:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test_convolution.mlir",
 )
 
 xla_aot_compile_gpu_runtime_autotuning(
     name = "xla_aot_compile_test_gpu_executable_convolution_runtime_autotuning",
-    gpu_target_config = "//xla/backends/gpu:specs/h100_sxm.txtpb",
+    gpu_target_config = "//xla/backends/gpu/target_config:specs/h100_sxm.txtpb",
     module = "xla_aot_compile_test_convolution.mlir",
 )
 
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index ddbf483fc5a710..5ee867bd06f211 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -410,11 +410,7 @@ cc_library(
 xla_cc_test(
     name = "autotune_cache_key_test",
     srcs = ["autotune_cache_key_test.cc"],
-    data = [
-        "//xla/backends/gpu:specs/a100_sxm_40.txtpb",
-        "//xla/backends/gpu:specs/a100_sxm_80.txtpb",
-        "//xla/backends/gpu:specs/mi200.txtpb",
-    ],
+    data = ["//xla/backends/gpu/target_config:all_gpu_specs"],
     deps = [
         ":autotune_cache_key",
         "//xla/hlo/ir:hlo",
@@ -677,14 +673,8 @@ tf_proto_library(
 xla_cc_test(
     name = "autotuner_util_test",
     srcs = ["autotuner_util_test.cc"],
-    data = [
-        "//xla/backends/gpu:specs/a100_sxm_40.txtpb",
-        "//xla/backends/gpu:specs/a100_sxm_80.txtpb",
-        "//xla/backends/gpu:specs/mi200.txtpb",
-    ],
-    tags = [
-        "gpu",
-    ],
+    data = ["//xla/backends/gpu/target_config:all_gpu_specs"],
+    tags = ["gpu"],
     deps = [
         ":autotune_cache_key",
         ":autotuner_status_key",
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc
index dcb44e7f8cec60..9033896ccae947 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotune_cache_key_test.cc
@@ -66,8 +66,8 @@ TEST(AutotuneCacheKeyTest, DeviceDescriptionToCacheKey) {
     std::string spec_string;
     CHECK_OK(tsl::ReadFileToString(
         tsl::Env::Default(),
-        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "backends", "gpu",
-                          "specs", spec_file_name),
+        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
+                          "backends/gpu/target_config/specs", spec_file_name),
         &spec_string));
     EXPECT_TRUE(
         tsl::protobuf::TextFormat::ParseFromString(spec_string, &proto));
diff --git a/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc b/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc
index a99ce566bf8681..54af8dd292eea6 100644
--- a/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_spmd_pipeline_test.cc
@@ -60,7 +60,8 @@ class GpuSpmdPartitioningTest : public HloHardwareIndependentTestBase,
     HloPassPipeline spmd_pipeline("spmd-partitioner");
     se::CudaComputeCapability ampere(8, 0);
     AlgebraicSimplifierOptions alg_simplifier_options;
-    // Ampere Core_count from tensorflow/compiler/xla/backends/gpu/specs/.
+    // Ampere Core_count from
+    // tensorflow/compiler/xla/backends/gpu/target_config/specs/.
     AddSPMDPasses(module.get(), alg_simplifier_options, ampere, spmd_pipeline,
                   std::nullopt);
     TF_RETURN_IF_ERROR(spmd_pipeline.Run(module.get()).status());
diff --git a/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo b/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo
index 831e2cce5977ce..df82c8c89526ba 100644
--- a/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo
+++ b/third_party/xla/xla/service/gpu/tests/bitcast-convert.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck  %s
+// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck  %s
 
 e {
   a = s4[8,2]{1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
index 6ba3138d32f91c..5e91d5fa997469 100644
--- a/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
+++ b/third_party/xla/xla/service/gpu/tests/calling_convention.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // Arguments are passed separately.
 // Even constant arguments are passed as arguments.
diff --git a/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo b/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
index 9c2a3ffe406710..da5ad41a3c177c 100644
--- a/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
+++ b/third_party/xla/xla/service/gpu/tests/dot_bf16.hlo
@@ -1,6 +1,6 @@
-// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 %}
-// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/a100_pcie_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
-// RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
+// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/v100.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM70 %}
+// RUN: %if !IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/a100_pcie_80.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
+// RUN: %if IS_ROCM %{ hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/mi200.txtpb --split-input-file --xla_gpu_autotune_level=0 --xla_gpu_enable_triton_gemm=false | FileCheck %s --check-prefixes=CHECK-SM80 %}
 
 
 // CHECK-SM70: %[[convert1:.+]] = f32[1536,6144]{1,0} fusion(%{{.+}})
diff --git a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
index a0a40ea5c8d9e1..d3e2fb2a8de1ab 100644
--- a/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
+++ b/third_party/xla/xla/service/gpu/tests/kernel_reuse.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // All fusions must reuse the same kernel:
 // CHECK-LABEL: target triple
diff --git a/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo b/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo
index 6ed7b42bdd369a..5e68f9853367e6 100644
--- a/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo
+++ b/third_party/xla/xla/service/gpu/tests/offload_scan_output.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK %s
 
 HloModule jit_f, entry_computation_layout={()->(f32[4]{0:S(5)}, f32[4]{0})}, allow_spmd_sharding_propagation_to_output={true,true}
 
diff --git a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
index 4681d8bb6293b3..0b8df6244c3b15 100644
--- a/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
+++ b/third_party/xla/xla/service/gpu/tests/pad_to_static.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
diff --git a/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo b/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo
index 2686645ffa76b0..e233780616d8a2 100644
--- a/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo
+++ b/third_party/xla/xla/service/gpu/tests/reduce-precision.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck  %s
+// RUN: hlo-opt %s --platform=gpu --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck  %s
 
 e {
   a = bf16[] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo b/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo
index a351a41eeb4212..48b73ef80dee86 100644
--- a/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo
+++ b/third_party/xla/xla/service/gpu/tests/reduce_fold_zero_add.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-after-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-after-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
 
 HloModule test
 
diff --git a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
index f218bfd5f46c40..30acdae0aef8e5 100644
--- a/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
+++ b/third_party/xla/xla/service/gpu/tests/rng_get_and_update_state.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule TestModule, is_scheduled=true
 
diff --git a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
index 25b1ddbd10d262..31b2c4e34689ce 100644
--- a/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
+++ b/third_party/xla/xla/service/gpu/tests/single_instruction.hlo
@@ -1,6 +1,6 @@
-// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
-// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM80
-// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/h100_sxm.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM90
+// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM80
+// RUN: hlo-opt %s --platform=gpu --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/h100_sxm.txtpb --split-input-file | FileCheck %s --check-prefixes=CHECK-SM90
 
 // CHECK-DAG: sqrt.approx.f32
 
diff --git a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
index c353ad741a4dee..cd0fabcf205468 100644
--- a/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
+++ b/third_party/xla/xla/service/gpu/tests/slice_to_dynamic.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
 
diff --git a/third_party/xla/xla/service/gpu/tests/sorting.hlo b/third_party/xla/xla/service/gpu/tests/sorting.hlo
index 868666512e6b2f..8585504b855764 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting.hlo
+++ b/third_party/xla/xla/service/gpu/tests/sorting.hlo
@@ -1,5 +1,5 @@
 // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 HloModule TestModule, is_scheduled=true
 
diff --git a/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo b/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
index acd8b35b1f5252..26fca0c7fb1d6a 100644
--- a/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
+++ b/third_party/xla/xla/service/gpu/tests/sub_byte_collectives.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --split-input-file --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck  %s
+// RUN: hlo-opt %s --platform=gpu --split-input-file --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck  %s
 
 e {
   a = s4[4,16]{1,0:E(4)} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
index 37674565a81d50..1722fd44a349ae 100644
--- a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
+++ b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
 
 // Verify that Triton kernels have the correct calling convention:
 // - PTX_KERNEL (71) for NVIDIA targets
diff --git a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
index f042371e435f10..3ccf5bb8327503 100644
--- a/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
+++ b/third_party/xla/xla/service/gpu/tests/triton_naming.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
 
 // CHECK-PTX: define ptx_kernel void @triton_gemm_r(
 // CHECK-GCN: define amdgpu_kernel void @triton_gemm_r(
diff --git a/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo b/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo
index 39294a000c2d8a..d0897c286192ca 100644
--- a/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo
+++ b/third_party/xla/xla/service/gpu/tests/zero_clamp_abs_index.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 e {
   p0 = s32[8,9] parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 9802b9648076d5..0b702ad9af2acb 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1775,11 +1775,7 @@ lit_test_suite(
         ],
     ),
     cfg = "//xla:lit.cfg.py",
-    data = [
-        "//xla/backends/gpu:specs/a100_pcie_80.txtpb",
-        "//xla/backends/gpu:specs/h100_sxm.txtpb",
-        "//xla/backends/gpu:specs/v100.txtpb",
-    ],
+    data = ["//xla/backends/gpu/target_config:all_gpu_specs"],
     default_tags = tf_gpu_tests_tags(),
     tools = [
         "//xla/tools:hlo-opt",
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
index 8230d28582b039..0281e68b03e4ba 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s
 
 // CHECK: fused_transpose
 // CHECK-NEXT: bf16[3,3,16,32]{3,2,1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
index ab91085ea9b00f..10cc948cf6a288 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/h100_sxm.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/h100_sxm.txtpb --split-input-file | FileCheck %s
 
 // CHECK: fused_transpose
 // CHECK-NEXT: f8e4m3fn[3,3,16,32]{3,2,1,0} parameter(0)
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
index a627b6551cf7ed..5ae06c318a1cf9 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/v100.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/v100.txtpb --split-input-file | FileCheck %s
 
 // CHECK: fused_transpose
 // CHECK-NEXT: f16[3,3,16,32]{3,2,1,0} parameter(0)
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index bf76eb7cea13a4..471b043735dcf8 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -671,18 +671,7 @@ xla_test(
     name = "gpu_device_info_test",
     srcs = ["gpu_device_info_test.cc"],
     backends = ["gpu"],
-    data = [
-        "//xla/backends/gpu:specs/a100_pcie_80.txtpb",
-        "//xla/backends/gpu:specs/a100_sxm_40.txtpb",
-        "//xla/backends/gpu:specs/a100_sxm_80.txtpb",
-        "//xla/backends/gpu:specs/a6000.txtpb",
-        "//xla/backends/gpu:specs/b200.txtpb",
-        "//xla/backends/gpu:specs/h100_pcie.txtpb",
-        "//xla/backends/gpu:specs/h100_sxm.txtpb",
-        "//xla/backends/gpu:specs/mi200.txtpb",
-        "//xla/backends/gpu:specs/p100.txtpb",
-        "//xla/backends/gpu:specs/v100.txtpb",
-    ],
+    data = ["//xla/backends/gpu/target_config:all_gpu_specs"],
     deps = [
         "//xla/service:platform_util",
         "//xla/stream_executor:device_description",
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
index 3470b878182ab7..e6d057659da911 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
@@ -45,8 +45,9 @@ TEST(DeviceInfoTest, DeviceInfoMatches) {
     std::string spec_string;
     TF_ASSERT_OK(tsl::ReadFileToString(
         tsl::Env::Default(),
-        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "backends", "gpu",
-                          "specs", absl::StrCat(file_name, ".txtpb")),
+        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
+                          "backends/gpu/target_config/specs",
+                          absl::StrCat(file_name, ".txtpb")),
         &spec_string));
     ASSERT_TRUE(
         tsl::protobuf::TextFormat::ParseFromString(spec_string, &proto));
diff --git a/third_party/xla/xla/tools/BUILD b/third_party/xla/xla/tools/BUILD
index 7cfbccb52aba71..de88faf0390424 100644
--- a/third_party/xla/xla/tools/BUILD
+++ b/third_party/xla/xla/tools/BUILD
@@ -1103,7 +1103,7 @@ xla_test(
     ],
     data = [
         ":data/add.hlo",
-        "//xla/backends/gpu:specs/h100_sxm.txtpb",
+        "//xla/backends/gpu/target_config:all_gpu_specs",
         "//xla/service/gpu:gpu_compiler_test_autotune_db.textproto",
     ],
     deps = [
diff --git a/third_party/xla/xla/tools/hlo_opt/BUILD b/third_party/xla/xla/tools/hlo_opt/BUILD
index 4eeffad9c2b681..27a4a69f7292c3 100644
--- a/third_party/xla/xla/tools/hlo_opt/BUILD
+++ b/third_party/xla/xla/tools/hlo_opt/BUILD
@@ -258,7 +258,7 @@ filegroup(
     name = "test_utilities",
     testonly = True,
     data = [
-        "//xla/backends/gpu:all_gpu_specs",
+        "//xla/backends/gpu/target_config:all_gpu_specs",
         "//xla/tools:hlo-opt",
         "@llvm-project//llvm:FileCheck",
     ],
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo
index cb45f800808a59..43ab1735faa499 100755
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule module
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo
index 9a60d938a9562f..f1e4e166d7fed6 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_backend.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo-backend --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo-backend --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule module
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo
index 65abb9b3ab2c09..267720369ba9d9 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_buffers.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=buffer-assignment --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=buffer-assignment --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo
index f2d9127e5bff21..ba200234183f5c 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_collective_cse.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=hlo --passes=schedule-aware-collective-cse --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --xla_gpu_experimental_collective_cse_distance_threshold=100 | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=hlo --passes=schedule-aware-collective-cse --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --xla_gpu_experimental_collective_cse_distance_threshold=100 | FileCheck %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo
index 74675fddafa9ed..f53ec227451880 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_html.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=html --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --stage=html --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 // CHECK: <!DOCTYPE html>
 // CHECK: bitcast
@@ -9,4 +9,4 @@ ENTRY computation {
     c = f32[6000,5000] transpose(p), dimensions={1,0}
     r = f32[300,20,5000] reshape(c)
     ROOT out = (f32[5000,6000], f32[300,20,5000]) tuple(e,r)
-}
\ No newline at end of file
+}
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
index dc1ca2693ffb6e..59416aa1821e6b 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_llvm.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
index 60fe903cee5326..95342aae192d09 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_pass.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --passes=dot-algorithm-rewriter --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck %s
+// RUN: hlo-opt %s --platform=gpu --passes=dot-algorithm-rewriter --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck %s
 
 HloModule Algorithm3xBF16
 // CHECK-LABEL: HloModule Algorithm3xBF16, entry_computation_layout={(f32[128,128]{1,0}, f32[128,128]{1,0})->f32[128,128]{1,0}}
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo
index 20a86530be50cb..b4ba7b9549c7d0 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_ptx.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=CUDA --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
+// RUN: hlo-opt %s --platform=CUDA --stage=ptx --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb --split-input-file | FileCheck %s
 
 HloModule m
 
diff --git a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo
index b8888a5d362944..b0e6e7115dc157 100644
--- a/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo
+++ b/third_party/xla/xla/tools/hlo_opt/tests/gpu_hlo_unoptimized_llvm.hlo
@@ -1,4 +1,4 @@
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
+// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK,CHECK-%{PTX} %s
 
 // CHECK-PTX:     define ptx_kernel void @fusion
 // CHECK-GCN:     define amdgpu_kernel void @fusion
diff --git a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
index d3eec58a3de994..e8ac6bc0a14bdc 100644
--- a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
+++ b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
@@ -71,8 +71,9 @@ TEST_F(XlaCompileLibTest, CompilesForGpuWithDevice) {
 }
 
 TEST_F(XlaCompileLibTest, CompilesForGpuWithoutDevice) {
-  const std::string target_config_path = tsl::io::JoinPath(
-      tsl::testing::XlaSrcRoot(), "backends/gpu/specs", "h100_sxm.txtpb");
+  const std::string target_config_path =
+      tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
+                        "backends/gpu/target_config/specs", "h100_sxm.txtpb");
   stream_executor::GpuTargetConfigProto target_config;
   TF_ASSERT_OK(tsl::ReadTextProto(tsl::Env::Default(), target_config_path,
                                   &target_config));

From 24d56183bf16fb2de62d270a6cd36397cc9cdcad Mon Sep 17 00:00:00 2001
From: Olli Lupton <olupton@nvidia.com>
Date: Wed, 17 Dec 2025 10:32:16 -0800
Subject: [PATCH 451/753] PR #35339: Improve memory allocation error message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35339

📝 Summary of Changes
Show the actual error message when allocator fails, and fix stringification of an enum.

🎯 Justification
Without this, the error message is vague.

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix

📊 Benchmark (for Performance Improvements)
n/a

🧪 Unit Tests:
n/a

🧪 Execution Tests:
n/a
Copybara import of the project:

--
d0bee7837f1c70a7494acc7e7088d211e554b981 by Olli Lupton <olupton@nvidia.com>:

Improve memory allocation error message

Merging this change closes #35339

PiperOrigin-RevId: 845826162
---
 .../xla/xla/stream_executor/cuda/cuda_memory_allocator.h  | 8 ++++----
 .../integrations/stream_executor_allocator.cc             | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h b/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h
index 6aef4027a7b77f..b6a705673e91f9 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_memory_allocator.h
@@ -29,14 +29,14 @@ namespace stream_executor::gpu {
 // A type of memory allocator for kCollective memory space.
 enum class CollectiveAllocatorType { kNccl, kNvshmem };
 
-template <typename T>
-void AbslStringify(std::string* str, CollectiveAllocatorType allocator_type) {
+template <typename Sink>
+void AbslStringify(Sink& sink, CollectiveAllocatorType allocator_type) {
   switch (allocator_type) {
     case CollectiveAllocatorType::kNccl:
-      *str = "NCCL";
+      sink.Append("NCCL");
       break;
     case CollectiveAllocatorType::kNvshmem:
-      *str = "NVSHMEM";
+      sink.Append("NVSHMEM");
       break;
   }
 }
diff --git a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc
index c49e1b17c61e49..cf2ad44a4b0b85 100644
--- a/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc
+++ b/third_party/xla/xla/stream_executor/integrations/stream_executor_allocator.cc
@@ -65,7 +65,8 @@ void* StreamExecutorAllocator::Alloc(size_t alignment, size_t num_bytes,
     auto allocation = memory_allocator_->Allocate(num_bytes);
     if (!allocation.ok()) {
       LOG(WARNING) << "could not allocate " << MemorySpaceToString(memory_type_)
-                   << " of size: " << num_bytes;
+                   << " of size: " << num_bytes << " (" << allocation.status()
+                   << ')';
       *bytes_received = 0;
       return nullptr;
     }

From 356223b183a386eb95e4206c6fb485388d7aee6c Mon Sep 17 00:00:00 2001
From: Alex <alexandros.theodoridis@amd.com>
Date: Wed, 17 Dec 2025 10:43:06 -0800
Subject: [PATCH 452/753] PR #34945: [ROCm] Add support for parametrized rocm
 hermetic dependency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/34945

📝 Summary of Changes
This change adds support of parametrized
hermetic rocm dependency through the environment variables.
One can set these 3 environment variables:
```
--repo_env="ROCM_DISTRO_URL=https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-gfx94X-dcgpu-7.10.0a20251107.tar.gz"
--repo_env="ROCM_DISTRO_HASH=486dbf647bcf9b78f21d7477f43addc7b2075b1a322a119045db9cdc5eb98380"
--repo_env="ROCM_DISTRO_LINKS=llvm/amdgcn:amdgcn"
```
Where ROCM_DISTRO_LINKS is a list of pairs formed like:
"target:link,target2:link2" that will create symlinks after the extraction of the tar file.

🎯 Justification
This change adds more flexibility to rocm hermetic builds and eliminates the need
to modify the rocm_redist.bzl file.

🚀 Kind of Contribution
Please remove what does not apply:
✨ New Feature

📊 Benchmark (for Performance Improvements)
Not relevant

🧪 Unit Tests:
Not relevant

🧪 Execution Tests:
Not relevant

Copybara import of the project:

--
8a9522d97fb0514ef4c16e205be75d0817a7f5d6 by Alex Theodoridis <alekstheod@gmail.com>:

Add support for parametrized rocm hermetic dependency

--
6dcf113b25ab4530372ed9b7d6c04124a1ce09ba by Alexandros Theodoridis <atheodor@amd.com>:

Clean up

Merging this change closes #34945

PiperOrigin-RevId: 845830426
---
 third_party/xla/tensorflow.bazelrc            |  4 +-
 .../xla/third_party/gpus/rocm/rocm_redist.bzl | 19 +++++++++
 .../xla/third_party/gpus/rocm_configure.bzl   | 40 ++++++++++++++-----
 3 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/tensorflow.bazelrc b/third_party/xla/tensorflow.bazelrc
index a08ecafdedae0a..c933ef4fb1072e 100644
--- a/third_party/xla/tensorflow.bazelrc
+++ b/third_party/xla/tensorflow.bazelrc
@@ -287,7 +287,9 @@ common:rocm_ci --config=rocm
 
 common:rocm_ci_hermetic --dynamic_mode=off
 common:rocm_ci_hermetic --config=rocm_clang_official
-common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_VERSION=rocm_7.10.0_gfx94X"
+common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_URL=https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-gfx94X-dcgpu-7.10.0a20251107.tar.gz"
+common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_HASH=486dbf647bcf9b78f21d7477f43addc7b2075b1a322a119045db9cdc5eb98380"
+common:rocm_ci_hermetic --repo_env="ROCM_DISTRO_LINKS=llvm/amdgcn:amdgcn"
 common:rocm_ci_hermetic --@local_config_rocm//rocm:rocm_path_type=hermetic
 
 # This config option is used for SYCL as GPU backend.
diff --git a/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl b/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl
index 0628122609f8a2..6f7db647259a84 100644
--- a/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl
+++ b/third_party/xla/third_party/gpus/rocm/rocm_redist.bzl
@@ -39,3 +39,22 @@ rocm_redist = {
         rocm_root = "_rocm_sdk_devel",
     ),
 }
+
+def _parse_rocm_distro_links(distro_links):
+    result = []
+    for pair in distro_links.split(","):
+        link = pair.split(":")
+        result.append(struct(target = link[0], link = link[1]))
+    return result
+
+def create_rocm_distro(distro_url, distro_hash, symlinks):
+    return struct(
+        packages = [
+            {
+                "url": distro_url,
+                "sha256": distro_hash,
+            },
+        ],
+        required_softlinks = _parse_rocm_distro_links(symlinks),
+        rocm_root = "",
+    )
diff --git a/third_party/xla/third_party/gpus/rocm_configure.bzl b/third_party/xla/third_party/gpus/rocm_configure.bzl
index 2679e2e0447a44..9415b7c86db5dd 100644
--- a/third_party/xla/third_party/gpus/rocm_configure.bzl
+++ b/third_party/xla/third_party/gpus/rocm_configure.bzl
@@ -17,6 +17,7 @@
 load("@bazel_skylib//lib:paths.bzl", "paths")
 load(
     "//third_party/gpus/rocm:rocm_redist.bzl",
+    "create_rocm_distro",
     "rocm_redist",
 )
 load(
@@ -54,6 +55,9 @@ _TF_ROCM_AMDGPU_TARGETS = "TF_ROCM_AMDGPU_TARGETS"
 _TF_ROCM_CONFIG_REPO = "TF_ROCM_CONFIG_REPO"
 _DISTRIBUTION_PATH = "rocm/rocm_dist"
 _ROCM_DISTRO_VERSION = "ROCM_DISTRO_VERSION"
+_ROCM_DISTRO_URL = "ROCM_DISTRO_URL"
+_ROCM_DISTRO_HASH = "ROCM_DISTRO_HASH"
+_ROCM_DISTRO_LINKS = "ROCM_DISTRO_LINKS"
 _TMPDIR = "TMPDIR"
 
 _DEFAULT_ROCM_TOOLKIT_PATH = "/opt/rocm"
@@ -545,23 +549,36 @@ def _remove_root_dir(path, root_dir):
         return path[len(root_dir) + 1:]
     return path
 
+def _setup_rocm_distro_dir_impl(repository_ctx, rocm_distro):
+    repository_ctx.file("rocm/.index")
+    for pkg in rocm_distro.packages:
+        _download_package(repository_ctx, pkg)
+
+    for entry in rocm_distro.required_softlinks:
+        repository_ctx.symlink(
+            "{}/{}".format(_DISTRIBUTION_PATH, entry.target),
+            "{}/{}".format(_DISTRIBUTION_PATH, entry.link),
+        )
+    bash_bin = get_bash_bin(repository_ctx)
+    return _get_rocm_config(repository_ctx, bash_bin, _canonical_path("{}/{}".format(_DISTRIBUTION_PATH, rocm_distro.rocm_root)), "")
+
 def _setup_rocm_distro_dir(repository_ctx):
     """Sets up the rocm hermetic installation directory to be used in hermetic build"""
     bash_bin = get_bash_bin(repository_ctx)
+    rocm_distro_url = repository_ctx.os.environ.get(_ROCM_DISTRO_URL)
+    if rocm_distro_url:
+        rocm_distro_hash = repository_ctx.os.environ.get(_ROCM_DISTRO_HASH)
+        if not rocm_distro_hash:
+            fail("{} environment variable is required", _ROCM_DISTRO_HASH)
+        rocm_distro_links = repository_ctx.os.environ.get(_ROCM_DISTRO_LINKS, "")
+        rocm_distro = create_rocm_distro(rocm_distro_url, rocm_distro_hash, rocm_distro_links)
+        return _setup_rocm_distro_dir_impl(repository_ctx, rocm_distro)
+
     rocm_distro = repository_ctx.os.environ.get(_ROCM_DISTRO_VERSION)
     multiple_paths = repository_ctx.os.environ.get(_TF_ROCM_MULTIPLE_PATHS)
     if rocm_distro:
         redist = rocm_redist[rocm_distro]
-        repository_ctx.file("rocm/.index")
-        for pkg in redist.packages:
-            _download_package(repository_ctx, pkg)
-
-        for entry in redist.required_softlinks:
-            repository_ctx.symlink(
-                "{}/{}".format(_DISTRIBUTION_PATH, entry.target),
-                "{}/{}".format(_DISTRIBUTION_PATH, entry.link),
-            )
-        return _get_rocm_config(repository_ctx, bash_bin, _canonical_path("{}/{}".format(_DISTRIBUTION_PATH, redist.rocm_root)), "")
+        return _setup_rocm_distro_dir_impl(repository_ctx, rocm_distro)
     elif multiple_paths:
         paths_list = multiple_paths.split(":")
         for rocm_custom_path in paths_list:
@@ -855,6 +872,9 @@ _ENVIRONS = [
     _ROCM_TOOLKIT_PATH,
     _TF_ROCM_AMDGPU_TARGETS,
     _ROCM_DISTRO_VERSION,
+    _ROCM_DISTRO_URL,
+    _ROCM_DISTRO_HASH,
+    _ROCM_DISTRO_LINKS,
     _TF_ROCM_RBE_DOCKER_IMAGE,
     _TF_ROCM_RBE_SINGLE_GPU_POOL,
     _TF_ROCM_RBE_MULTI_GPU_POOL,

From 413f4136c32312514f5cdf55a684f08ebef4248e Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Wed, 17 Dec 2025 11:04:18 -0800
Subject: [PATCH 453/753] Update XNNPack version

PiperOrigin-RevId: 845839475
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake | 2 +-
 tensorflow/workspace2.bzl                         | 6 +++---
 third_party/xla/third_party/xnnpack/workspace.bzl | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index ecbbf91866a8c0..03e94cf830ce29 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG 77468446ebfd9baab7fc4349c32608c9675cf6d9
+  GIT_TAG 2c1a512208d0481d6e6bd87c2bd5e23408febc3e
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 0b42230ec0f651..cf1f355df8bd3e 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,9 +168,9 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "a89879422c6da8240cffb8ff67f5cd11f0362cb2a174ee9cd96b450e53902ca3",
-        strip_prefix = "XNNPACK-77468446ebfd9baab7fc4349c32608c9675cf6d9",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/77468446ebfd9baab7fc4349c32608c9675cf6d9.zip"),
+        sha256 = "961965b04b0cee7c0ece34bb21dbdf69e483772ae7bdb275a08e6d457ed7e38b",
+        strip_prefix = "XNNPACK-2c1a512208d0481d6e6bd87c2bd5e23408febc3e",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/2c1a512208d0481d6e6bd87c2bd5e23408febc3e.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
diff --git a/third_party/xla/third_party/xnnpack/workspace.bzl b/third_party/xla/third_party/xnnpack/workspace.bzl
index f67720ec702007..a06db393d261a6 100644
--- a/third_party/xla/third_party/xnnpack/workspace.bzl
+++ b/third_party/xla/third_party/xnnpack/workspace.bzl
@@ -6,9 +6,9 @@ def repo():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "a89879422c6da8240cffb8ff67f5cd11f0362cb2a174ee9cd96b450e53902ca3",
-        strip_prefix = "XNNPACK-77468446ebfd9baab7fc4349c32608c9675cf6d9",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/77468446ebfd9baab7fc4349c32608c9675cf6d9.zip"),
+        sha256 = "961965b04b0cee7c0ece34bb21dbdf69e483772ae7bdb275a08e6d457ed7e38b",
+        strip_prefix = "XNNPACK-2c1a512208d0481d6e6bd87c2bd5e23408febc3e",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/2c1a512208d0481d6e6bd87c2bd5e23408febc3e.zip"),
         patch_file = ["//third_party/xnnpack:layering_check_fix.patch"],
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)

From 722bf3e739ba2d6347f1b4e435cc79486a174113 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Wed, 17 Dec 2025 11:06:48 -0800
Subject: [PATCH 454/753] Add ExecuteReplicatedWithExecutable to
 HloRunnerInterface.

Works just like ExecuteReplicated but lets us avoid creating redundant
executables and avoid using ExecuteReplicated with the `executable_provider` set
to return the same executable for each device.

PiperOrigin-RevId: 845840535
---
 third_party/xla/xla/service/BUILD             |  2 ++
 third_party/xla/xla/service/hlo_runner.cc     | 29 ++++++++++++++++++-
 third_party/xla/xla/service/hlo_runner.h      | 10 +++++++
 .../xla/xla/service/hlo_runner_interface.h    | 10 +++++++
 .../xla/xla/service/hlo_runner_pjrt.cc        | 29 +++++++++++++++++--
 third_party/xla/xla/service/hlo_runner_pjrt.h | 15 ++++++----
 6 files changed, 86 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 01f4c8fe62fd32..d049724ae6fe90 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -4491,6 +4491,7 @@ cc_library(
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:nullability",
@@ -4531,6 +4532,7 @@ cc_library(
         "//xla/pjrt:pjrt_executable",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:nullability",
diff --git a/third_party/xla/xla/service/hlo_runner.cc b/third_party/xla/xla/service/hlo_runner.cc
index 9ce2dec42dc218..4d071600c4e405 100644
--- a/third_party/xla/xla/service/hlo_runner.cc
+++ b/third_party/xla/xla/service/hlo_runner.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "xla/tsl/platform/logging.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/threadpool.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 
@@ -474,7 +475,15 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<OpaqueExecutable> executable,
       CreateExecutable(std::move(module), options.run_hlo_passes));
-  return ExecuteReplicated(executable.get(), options, device_assignment,
+  return ExecuteReplicatedWithExecutable(executable.get(), options,
+                                         device_assignment);
+}
+
+absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedWithExecutable(
+    OpaqueExecutable* const absl_nonnull executable,
+    const ReplicatedExecuteOptions& options,
+    DeviceAssignment* device_assignment) {
+  return ExecuteReplicated(executable, options, device_assignment,
                            /*profile=*/nullptr);
 }
 
@@ -620,6 +629,14 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedImpl(
 absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
     OpaqueExecutable* executable, const ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment, ExecutionProfile* profile) {
+  DeviceAssignment computation_device_assignment;
+  if (device_assignment == nullptr) {
+    ASSIGN_OR_RETURN(
+        computation_device_assignment,
+        backend().computation_placer()->AssignDevices(options.num_devices, 1));
+    device_assignment = &computation_device_assignment;
+  }
+  CHECK_NE(device_assignment, nullptr);
   TF_ASSIGN_OR_RETURN(HloRunnerExecutable* const wrapped_executable,
                       HloRunnerExecutable::TryUnwrap(*this, executable));
   return ExecuteReplicatedImpl(
@@ -736,6 +753,16 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
   return ExecuteReplicated(std::move(module), options, &device_assignment);
 }
 
+absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicatedWithExecutable(
+    OpaqueExecutable* const absl_nonnull executable,
+    const ReplicatedExecuteOptions& options) {
+  ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      backend().computation_placer()->AssignDevices(options.num_devices, 1));
+  return ExecuteReplicatedWithExecutable(executable, options,
+                                         &device_assignment);
+}
+
 absl::StatusOr<std::unique_ptr<OpaqueExecutable>> HloRunner::CreateExecutable(
     std::unique_ptr<HloModule> module, bool run_hlo_passes) {
   return CreateExecutableWithBufferAssignment(
diff --git a/third_party/xla/xla/service/hlo_runner.h b/third_party/xla/xla/service/hlo_runner.h
index b2b5feab32b491..122a4b6c766376 100644
--- a/third_party/xla/xla/service/hlo_runner.h
+++ b/third_party/xla/xla/service/hlo_runner.h
@@ -173,6 +173,16 @@ class HloRunner : public HloRunnerInterface {
       const ReplicatedExecuteOptions& options,
       DeviceAssignment* device_assignment) override;
 
+  absl::StatusOr<std::vector<Literal>> ExecuteReplicatedWithExecutable(
+      OpaqueExecutable* absl_nonnull executable,
+      const ReplicatedExecuteOptions& options) override;
+
+  // Same as above, but with specified device assignment.
+  absl::StatusOr<std::vector<Literal>> ExecuteReplicatedWithExecutable(
+      OpaqueExecutable* absl_nonnull executable,
+      const ReplicatedExecuteOptions& options,
+      DeviceAssignment* device_assignment) override;
+
   // Same as above, but with a reusable Executable.  This may update the profile
   // information in *executable.
   //
diff --git a/third_party/xla/xla/service/hlo_runner_interface.h b/third_party/xla/xla/service/hlo_runner_interface.h
index eb498d61185311..6956847ee83f72 100644
--- a/third_party/xla/xla/service/hlo_runner_interface.h
+++ b/third_party/xla/xla/service/hlo_runner_interface.h
@@ -290,6 +290,16 @@ class HloRunnerInterface {
       const ReplicatedExecuteOptions& options,
       DeviceAssignment* device_assignment) = 0;
 
+  virtual absl::StatusOr<std::vector<Literal>> ExecuteReplicatedWithExecutable(
+      OpaqueExecutable* absl_nonnull executable,
+      const ReplicatedExecuteOptions& options) = 0;
+
+  // Same as above, but with specified device assignment.
+  virtual absl::StatusOr<std::vector<Literal>> ExecuteReplicatedWithExecutable(
+      OpaqueExecutable* absl_nonnull executable,
+      const ReplicatedExecuteOptions& options,
+      DeviceAssignment* device_assignment) = 0;
+
   virtual absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
       absl::AnyInvocable<OpaqueExecutable*(int64_t)> executable_provider,
       absl::AnyInvocable<int64_t(int64_t)> argument_count_provider,
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index 5f326727735a4a..78404dade0960f 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -63,6 +63,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "tsl/platform/fingerprint.h"
 #include "tsl/platform/path.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 
@@ -551,13 +552,35 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<OpaqueExecutable> executable,
       CreateExecutable(std::move(module), options.run_hlo_passes));
-  return ExecuteReplicated(executable.get(), options, device_assignment);
+  return ExecuteReplicatedWithExecutable(executable.get(), options,
+                                         device_assignment);
 }
 
-absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
-    OpaqueExecutable* executable,
+absl::StatusOr<std::vector<Literal>>
+HloRunnerPjRt::ExecuteReplicatedWithExecutable(
+    OpaqueExecutable* const absl_nonnull executable,
+    const ReplicatedExecuteOptions& options) {
+  ASSIGN_OR_RETURN(const HloModule* const module,
+                   HloModuleFromWrapped(executable));
+  ASSIGN_OR_RETURN(
+      DeviceAssignment device_assignment,
+      GetStaticDeviceAssignmentOrComputeDefault(*module, *pjrt_client_));
+  return ExecuteReplicatedWithExecutable(executable, options,
+                                         &device_assignment);
+}
+
+absl::StatusOr<std::vector<Literal>>
+HloRunnerPjRt::ExecuteReplicatedWithExecutable(
+    OpaqueExecutable* const absl_nonnull executable,
     const HloRunnerInterface::ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
+  std::optional<DeviceAssignment> default_device_assignment = std::nullopt;
+  if (device_assignment == nullptr) {
+    ASSIGN_OR_RETURN(default_device_assignment,
+                     GetDefaultDeviceAssignment(options.num_devices, 1));
+    device_assignment = &*default_device_assignment;
+  }
+  CHECK_NE(device_assignment, nullptr);
   TF_ASSIGN_OR_RETURN(HloRunnerPjRtExecutable* const wrapped_executable,
                       HloRunnerPjRtExecutable::TryUnwrap(*this, executable));
 
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.h b/third_party/xla/xla/service/hlo_runner_pjrt.h
index 488a0777aadcf4..08b4f4b12fa227 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.h
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.h
@@ -101,6 +101,16 @@ class HloRunnerPjRt : public HloRunnerInterface {
       const ReplicatedExecuteOptions& options,
       DeviceAssignment* device_assignment) override;
 
+  absl::StatusOr<std::vector<Literal>> ExecuteReplicatedWithExecutable(
+      OpaqueExecutable* absl_nonnull executable,
+      const ReplicatedExecuteOptions& options) override;
+
+  // Same as above, but with specified device assignment.
+  absl::StatusOr<std::vector<Literal>> ExecuteReplicatedWithExecutable(
+      OpaqueExecutable* absl_nonnull executable,
+      const ReplicatedExecuteOptions& options,
+      DeviceAssignment* device_assignment) override;
+
   absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
       absl::AnyInvocable<OpaqueExecutable*(int64_t)> executable_provider,
       absl::AnyInvocable<int64_t(int64_t)> argument_count_provider,
@@ -108,11 +118,6 @@ class HloRunnerPjRt : public HloRunnerInterface {
       const ReplicatedExecuteOptions& options,
       DeviceAssignment* device_assignment) override;
 
-  absl::StatusOr<std::vector<Literal>> ExecuteReplicated(
-      OpaqueExecutable* executable,
-      const HloRunnerInterface::ReplicatedExecuteOptions& options,
-      DeviceAssignment* device_assignment);
-
   absl::string_view Name() const override;
 
   int device_count() const override { return pjrt_client_->device_count(); }

From a0aeb4fa6f1b31998b1b804231f8404afe7912d3 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Wed, 17 Dec 2025 11:27:42 -0800
Subject: [PATCH 455/753] Update `rules_ml_toolchain` version to remove
 redundant `fake_nvshmem_bootstrap_uid` library from hermetic CUDA deps.

PiperOrigin-RevId: 845848986
---
 third_party/xla/MODULE.bazel   | 6 +++---
 third_party/xla/WORKSPACE      | 6 +++---
 third_party/xla/workspace0.bzl | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel
index ae3daa05e8f78f..defeb37f2e1b10 100644
--- a/third_party/xla/MODULE.bazel
+++ b/third_party/xla/MODULE.bazel
@@ -45,9 +45,9 @@ bazel_dep(name = "rules_ml_toolchain")
 # echo "sha256-${HASH}"
 archive_override(
     module_name = "rules_ml_toolchain",
-    integrity = "sha256-6YQt4/77WhINOxZH06CebnBx6N+NHNLf5vZu4x/SWV4=",
-    strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
-    urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz"],
+    integrity = "sha256-U5Be3lDj7rx4ImbiDpuawdcWbvaLh3vqWT02ANz+A+Y=",
+    strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
+    urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz"],
 )
 
 # TODO: Upstream the patch?
diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE
index f7786569d234ab..e5e460105a164c 100644
--- a/third_party/xla/WORKSPACE
+++ b/third_party/xla/WORKSPACE
@@ -9,10 +9,10 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
-    strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
+    sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
+    strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
     urls = tf_mirror_urls(
-        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
     ),
 )
 
diff --git a/third_party/xla/workspace0.bzl b/third_party/xla/workspace0.bzl
index 89a7cc6454943e..52e509da92e800 100644
--- a/third_party/xla/workspace0.bzl
+++ b/third_party/xla/workspace0.bzl
@@ -140,10 +140,10 @@ def workspace():
     if "rules_ml_toolchain" not in native.existing_rules():
         tf_http_archive(
             name = "rules_ml_toolchain",
-            sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
-            strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
+            sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
+            strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
             urls = tf_mirror_urls(
-                "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
+                "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
             ),
         )
 

From 5841fece45bfb47d89815d4a3bde260106b5f2c0 Mon Sep 17 00:00:00 2001
From: Fengwu Yao <fengwuyao@google.com>
Date: Wed, 17 Dec 2025 12:31:37 -0800
Subject: [PATCH 456/753] Internal changes only.

PiperOrigin-RevId: 845873615
---
 tensorflow/lite/kernels/BUILD                 | 30 ++++--
 tensorflow/lite/kernels/activations_test.cc   | 43 ++++-----
 tensorflow/lite/kernels/atan2_test.cc         | 29 +++---
 tensorflow/lite/kernels/cast_test.cc          | 19 ++--
 tensorflow/lite/kernels/comparisons_test.cc   | 11 +--
 tensorflow/lite/kernels/concatenation_test.cc | 44 ++++-----
 .../lite/kernels/dynamic_update_slice_test.cc | 17 ++--
 tensorflow/lite/kernels/fill_test.cc          |  5 +-
 tensorflow/lite/kernels/floor_test.cc         | 31 +++---
 tensorflow/lite/kernels/gather_nd_test.cc     | 62 ++++++------
 tensorflow/lite/kernels/gather_test.cc        |  4 +-
 .../lite/kernels/maximum_minimum_test.cc      | 96 +++++++++----------
 tensorflow/lite/kernels/neg_test.cc           | 13 ++-
 tensorflow/lite/kernels/pad_test.cc           | 46 +++++----
 tensorflow/lite/kernels/reverse_test.cc       | 42 ++++----
 tensorflow/lite/kernels/round_test.cc         | 37 ++++---
 tensorflow/lite/kernels/slice_test.cc         | 39 ++++----
 tensorflow/lite/kernels/strided_slice_test.cc |  8 +-
 tensorflow/lite/kernels/test_util.cc          |  1 +
 tensorflow/lite/kernels/test_util.h           | 11 ++-
 tensorflow/lite/kernels/test_util_test.cc     |  9 ++
 21 files changed, 303 insertions(+), 294 deletions(-)

diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 6a3ec9f57e2a02..db2435b081d36b 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -167,7 +167,7 @@ cc_library(
         "//tensorflow/lite/tools/optimize:quantization_utils",
         "//tensorflow/lite/tools/serialization:writer_lib",
         "//tensorflow/lite/tools/versioning",
-        "@FP16",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
         "@com_google_absl//absl/base:core_headers",
@@ -574,6 +574,7 @@ cc_test(
         "//tensorflow/lite:array",
         "//tensorflow/lite:util",
         "//tensorflow/lite/core/c:common",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest_main",
     ],
 )
@@ -1118,6 +1119,7 @@ cc_test(
         "//tensorflow/lite/core:framework_stable",
         "//tensorflow/lite/core/api",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/memory",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
@@ -1499,6 +1501,7 @@ cc_test(
         "//tensorflow/lite/core/c:c_api_types",
         "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/random",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest",
@@ -1515,8 +1518,8 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
         "@flatbuffers",
     ],
 )
@@ -1709,6 +1712,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1739,6 +1743,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -1853,6 +1858,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -1913,8 +1919,8 @@ cc_test(
         ":test_util",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@flatbuffers",
     ],
 )
 
@@ -1972,6 +1978,7 @@ cc_test(
         "//tensorflow/lite:framework_stable",
         "//tensorflow/lite/core:framework_stable",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2082,12 +2089,12 @@ cc_test(
     deps = [
         ":test_main",
         ":test_util",
-        "//tensorflow/lite:string",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/kernels/internal:tensor_utils",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@flatbuffers",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2101,7 +2108,9 @@ cc_test(
         "//tensorflow/lite:string",
         "//tensorflow/lite/core/c:common",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@eigen_archive//:eigen3",
         "@flatbuffers",
     ],
 )
@@ -2485,7 +2494,9 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2530,6 +2541,7 @@ cc_test(
         ":test_util",
         "//tensorflow/lite:string",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2544,6 +2556,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@flatbuffers",
     ],
@@ -2581,8 +2594,8 @@ cc_test(
         "//tensorflow/lite/kernels/internal:tensor_ctypes",
         "//tensorflow/lite/kernels/internal:tensor_utils_no_eigen",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
-        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -2881,6 +2894,7 @@ cc_test(
         ":test_util",
         "//tensorflow/lite:string",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
     ],
@@ -2905,6 +2919,7 @@ cc_test(
         ":test_main",
         ":test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -3173,12 +3188,13 @@ cc_test(
     size = "small",
     srcs = ["dynamic_update_slice_test.cc"],
     deps = [
+        ":subgraph_test_util",
         ":test_main",
         ":test_util",
         "//tensorflow/lite:framework_stable",
         "//tensorflow/lite/core:framework_stable",
-        "//tensorflow/lite/kernels:subgraph_test_util",
         "//tensorflow/lite/schema:schema_fbs",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest",
         "@eigen_archive//:eigen3",
         "@flatbuffers",
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 42747a87e61b2a..96bb22ed76c431 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 
@@ -574,18 +575,17 @@ TEST_P(TanhOpTest, Tanh) {
 }
 
 TEST_P(TanhOpTest, TanhFloat16) {
-  FloatActivationsOpModel<Eigen::half> m(
-      GetRegistration(), BuiltinOperator_TANH,
-      /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
+  FloatActivationsOpModel<half> m(GetRegistration(), BuiltinOperator_TANH,
+                                  /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
   m.SetInput({
-      Eigen::half(0),
-      Eigen::half(-6),
-      Eigen::half(2),
-      Eigen::half(4),
-      Eigen::half(3),
-      Eigen::half(-2),
-      Eigen::half(10),
-      Eigen::half(1),
+      half(0),
+      half(-6),
+      half(2),
+      half(4),
+      half(3),
+      half(-2),
+      half(10),
+      half(1),
   });
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
@@ -1210,18 +1210,17 @@ TEST_P(LogisticOpTest, SigmoidFloat32) {
 }
 
 TEST_P(LogisticOpTest, SigmoidFloat16) {
-  FloatActivationsOpModel<Eigen::half> m(
-      GetRegistration(), BuiltinOperator_LOGISTIC,
-      /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
+  FloatActivationsOpModel<half> m(GetRegistration(), BuiltinOperator_LOGISTIC,
+                                  /*input=*/{TensorType_FLOAT16, {1, 2, 4, 1}});
   m.SetInput({
-      Eigen::half{-1.2f},
-      Eigen::half{-6.0f},
-      Eigen::half{2.0f},
-      Eigen::half{4.0f},
-      Eigen::half{3.0f},
-      Eigen::half{-2.0f},
-      Eigen::half{10.0f},
-      Eigen::half{1.0f},
+      half{-1.2f},
+      half{-6.0f},
+      half{2.0f},
+      half{4.0f},
+      half{3.0f},
+      half{-2.0f},
+      half{10.0f},
+      half{1.0f},
   });
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/atan2_test.cc b/tensorflow/lite/kernels/atan2_test.cc
index 309ba79f284f3f..0c3839361570a6 100644
--- a/tensorflow/lite/kernels/atan2_test.cc
+++ b/tensorflow/lite/kernels/atan2_test.cc
@@ -17,6 +17,7 @@
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -34,7 +35,7 @@ tflite::TensorType GetTTEnum<double>() {
 }
 
 template <>
-tflite::TensorType GetTTEnum<Eigen::half>() {
+tflite::TensorType GetTTEnum<half>() {
   return tflite::TensorType_FLOAT16;
 }
 
@@ -74,7 +75,7 @@ class Atan2Test : public ::testing::Test {
   using FloatType = Float;
 };
 
-using TestTypes = ::testing::Types<float, double, Eigen::half, Eigen::bfloat16>;
+using TestTypes = ::testing::Types<float, double, half, Eigen::bfloat16>;
 
 TYPED_TEST_SUITE(Atan2Test, TestTypes);
 
@@ -85,15 +86,15 @@ TYPED_TEST(Atan2Test, TestScalar) {
   tflite::TensorData output = {GetTTEnum<Float>(), {}};
   Atan2Model m(y, x, output);
 
-  auto got = m.GetOutput<Float>({Float(0.0)}, {Float(0.0)});
+  auto got = m.GetOutput<Float>({Float(0.0f)}, {Float(0.0f)});
   ASSERT_EQ(got.size(), 1);
   EXPECT_FLOAT_EQ(got[0], 0.0);
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(1.0)}, {Float(0.0)})[0],
-                  Float(M_PI / 2));
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(0.0)}, {Float(1.0)})[0],
-                  Float(0.0));
-  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(-1.0)}, {Float(0.0)})[0],
-                  Float(-M_PI / 2));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(1.0f)}, {Float(0.0f)})[0],
+                  Float(static_cast<float>(M_PI / 2)));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(0.0f)}, {Float(1.0f)})[0],
+                  Float(0.0f));
+  ASSERT_FLOAT_EQ(m.GetOutput<Float>({Float(-1.0f)}, {Float(0.0f)})[0],
+                  Float(-static_cast<float>(M_PI / 2)));
 }
 
 TYPED_TEST(Atan2Test, TestBatch) {
@@ -102,10 +103,12 @@ TYPED_TEST(Atan2Test, TestBatch) {
   tflite::TensorData x = {GetTTEnum<Float>(), {4, 2, 1}};
   tflite::TensorData output = {GetTTEnum<Float>(), {4, 2, 1}};
   Atan2Model m(y, x, output);
-  std::vector<Float> y_data = {Float(0.1), Float(0.2), Float(0.3), Float(0.4),
-                               Float(0.5), Float(0.6), Float(0.7), Float(0.8)};
-  std::vector<Float> x_data = {Float(0.8), Float(0.7), Float(0.6), Float(0.5),
-                               Float(0.4), Float(0.3), Float(0.2), Float(0.1)};
+  std::vector<Float> y_data = {Float(0.1f), Float(0.2f), Float(0.3f),
+                               Float(0.4f), Float(0.5f), Float(0.6f),
+                               Float(0.7f), Float(0.8f)};
+  std::vector<Float> x_data = {Float(0.8f), Float(0.7f), Float(0.6f),
+                               Float(0.5f), Float(0.4f), Float(0.3f),
+                               Float(0.2f), Float(0.1f)};
   auto got = m.GetOutput<Float>(y_data, x_data);
   ASSERT_EQ(got.size(), 8);
   for (int i = 0; i < 8; ++i) {
diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
index 77cc2f3442b1c2..bcc9b4bc058003 100644
--- a/tensorflow/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -413,11 +414,10 @@ TEST(CastOpModel, CastFloatToFloat16) {
   m.PopulateTensor<float>(m.input(), {100.f, 1.0f, 0.f, 0.4f, 1.999f, 1.1f});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
-      m.ExtractVector<Eigen::half>(m.output()),
-      ElementsAreArray(
-          {static_cast<Eigen::half>(100.f), static_cast<Eigen::half>(1.0f),
-           static_cast<Eigen::half>(0.f), static_cast<Eigen::half>(0.4f),
-           static_cast<Eigen::half>(1.999f), static_cast<Eigen::half>(1.1)}));
+      m.ExtractVector<half>(m.output()),
+      ElementsAreArray({static_cast<half>(100.f), static_cast<half>(1.0f),
+                        static_cast<half>(0.f), static_cast<half>(0.4f),
+                        static_cast<half>(1.999f), static_cast<half>(1.1f)}));
 }
 
 TEST(CastOpModel, CastFloatToBFloat16) {
@@ -435,11 +435,10 @@ TEST(CastOpModel, CastFloatToBFloat16) {
 
 TEST(CastOpModel, CastFloat16ToFloat) {
   CastOpModel m({TensorType_FLOAT16, {3, 2}}, {TensorType_FLOAT32, {3, 2}});
-  m.PopulateTensor<Eigen::half>(
-      m.input(),
-      {static_cast<Eigen::half>(100.f), static_cast<Eigen::half>(1.0f),
-       static_cast<Eigen::half>(0.f), static_cast<Eigen::half>(0.4f),
-       static_cast<Eigen::half>(1.999f), static_cast<Eigen::half>(1.1f)});
+  m.PopulateTensor<half>(m.input(),
+                         {static_cast<half>(100.f), static_cast<half>(1.0f),
+                          static_cast<half>(0.f), static_cast<half>(0.4f),
+                          static_cast<half>(1.999f), static_cast<half>(1.1f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.ExtractVector<float>(m.output()),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/comparisons_test.cc b/tensorflow/lite/kernels/comparisons_test.cc
index 10226bb60a8ed8..bc2091aa823832 100644
--- a/tensorflow/lite/kernels/comparisons_test.cc
+++ b/tensorflow/lite/kernels/comparisons_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -396,12 +397,10 @@ TEST(ComparisonsTest, LessFloat) {
 TEST(ComparisonsTest, LessFloat16) {
   ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT16,
                           BuiltinOperator_LESS);
-  model.PopulateTensor<Eigen::half>(
-      model.input1(),
-      {Eigen::half(0.1), Eigen::half(0.9), Eigen::half(0.7), Eigen::half(0.3)});
-  model.PopulateTensor<Eigen::half>(
-      model.input2(),
-      {Eigen::half(0.1), Eigen::half(0.2), Eigen::half(0.6), Eigen::half(0.5)});
+  model.PopulateTensor<half>(model.input1(),
+                             {half(0.1f), half(0.9f), half(0.7f), half(0.3f)});
+  model.PopulateTensor<half>(model.input2(),
+                             {half(0.1f), half(0.2f), half(0.6f), half(0.5f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true));
diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc
index 28692ae1528dd3..f9c765375cc20f 100644
--- a/tensorflow/lite/kernels/concatenation_test.cc
+++ b/tensorflow/lite/kernels/concatenation_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -121,12 +122,11 @@ TEST(ConcatenationOpTest, ThreeDimensionalOneInputBFloat16) {
 }
 
 TEST(ConcatenationOpTest, ThreeDimensionalOneInputFloat16) {
-  ConcatenationOpModel<Eigen::half> m({TensorType_FLOAT16, {2, 1, 2}},
-                                      /*axis=*/1,
-                                      /*num_inputs=*/1);
-  m.SetInput(0,
-             {static_cast<Eigen::half>(1.0f), static_cast<Eigen::half>(3.0f),
-              static_cast<Eigen::half>(4.0f), static_cast<Eigen::half>(7.0f)});
+  ConcatenationOpModel<half> m({TensorType_FLOAT16, {2, 1, 2}},
+                               /*axis=*/1,
+                               /*num_inputs=*/1);
+  m.SetInput(0, {static_cast<half>(1.0f), static_cast<half>(3.0f),
+                 static_cast<half>(4.0f), static_cast<half>(7.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
 }
@@ -206,23 +206,21 @@ TEST(ConcatenationOpTest, FiveDimensionalTwoInputBFloat16) {
 }
 
 TEST(ConcatenationOpTest, FiveDimensionalTwoInputFloat16) {
-  ConcatenationOpModel<Eigen::half> m({TensorType_FLOAT16, {2, 1, 2, 1, 3}},
-                                      /*axis=*/0,
-                                      /*num_inputs=*/2);
-  m.SetInput(
-      0, {static_cast<Eigen::half>(1.0f), static_cast<Eigen::half>(2.0f),
-          static_cast<Eigen::half>(3.0f), static_cast<Eigen::half>(4.0f),
-          static_cast<Eigen::half>(5.0f), static_cast<Eigen::half>(6.0f),
-          static_cast<Eigen::half>(7.0f), Eigen::half{8.0f},
-          static_cast<Eigen::half>(9.0f), static_cast<Eigen::half>(10.0f),
-          static_cast<Eigen::half>(11.0f), static_cast<Eigen::half>(12.0f)});
-  m.SetInput(
-      1, {static_cast<Eigen::half>(13.0f), static_cast<Eigen::half>(14.0f),
-          Eigen::half{15.0f}, static_cast<Eigen::half>(16.0f),
-          Eigen::half{17.0f}, static_cast<Eigen::half>(18.0f),
-          static_cast<Eigen::half>(19.0f), static_cast<Eigen::half>(20.0f),
-          static_cast<Eigen::half>(21.0f), static_cast<Eigen::half>(22.0f),
-          static_cast<Eigen::half>(23.0f), static_cast<Eigen::half>(24.0f)});
+  ConcatenationOpModel<half> m({TensorType_FLOAT16, {2, 1, 2, 1, 3}},
+                               /*axis=*/0,
+                               /*num_inputs=*/2);
+  m.SetInput(0, {static_cast<half>(1.0f), static_cast<half>(2.0f),
+                 static_cast<half>(3.0f), static_cast<half>(4.0f),
+                 static_cast<half>(5.0f), static_cast<half>(6.0f),
+                 static_cast<half>(7.0f), half{8.0f}, static_cast<half>(9.0f),
+                 static_cast<half>(10.0f), static_cast<half>(11.0f),
+                 static_cast<half>(12.0f)});
+  m.SetInput(1,
+             {static_cast<half>(13.0f), static_cast<half>(14.0f), half{15.0f},
+              static_cast<half>(16.0f), half{17.0f}, static_cast<half>(18.0f),
+              static_cast<half>(19.0f), static_cast<half>(20.0f),
+              static_cast<half>(21.0f), static_cast<half>(22.0f),
+              static_cast<half>(23.0f), static_cast<half>(24.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
       m.GetOutput(),
diff --git a/tensorflow/lite/kernels/dynamic_update_slice_test.cc b/tensorflow/lite/kernels/dynamic_update_slice_test.cc
index 373a719d5ac412..99aa637a068d23 100644
--- a/tensorflow/lite/kernels/dynamic_update_slice_test.cc
+++ b/tensorflow/lite/kernels/dynamic_update_slice_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/subgraph_test_util.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -112,10 +113,9 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) {
   DynamicUpdateSliceOpModel m({TensorType_FLOAT16, {3, 3}},
                               {TensorType_FLOAT16, {2, 1}},
                               {TensorType_INT32, {2}});
-  m.SetInput<Eigen::half>({Eigen::half(1), Eigen::half(2), Eigen::half(3),
-                           Eigen::half(4), Eigen::half(5), Eigen::half(6),
-                           Eigen::half(7), Eigen::half(8), Eigen::half(9)});
-  m.SetUpdate<Eigen::half>({Eigen::half(-1), Eigen::half(-2)});
+  m.SetInput<half>({half(1), half(2), half(3), half(4), half(5), half(6),
+                    half(7), half(8), half(9)});
+  m.SetUpdate<half>({half(-1), half(-2)});
   m.SetStartIndices<int32_t>({1, 1});
   const int kInplaceInputTensorIdx = 0;
   const int kInplaceOutputTensorIdx = 0;
@@ -123,11 +123,10 @@ TEST(DynamicUpdateSliceOpTest, SimpleTestF16InPlaceInput) {
   TfLiteTensor* output_tensor = m.GetOutputTensor(kInplaceOutputTensorIdx);
   output_tensor->data.data = input_tensor->data.data;
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput<Eigen::half>(),
-              ElementsAreArray(ArrayFloatNear(
-                  {Eigen::half(1), Eigen::half(2), Eigen::half(3),
-                   Eigen::half(4), Eigen::half(-1), Eigen::half(6),
-                   Eigen::half(7), Eigen::half(-2), Eigen::half(9)})));
+  EXPECT_THAT(m.GetOutput<half>(),
+              ElementsAreArray(
+                  ArrayFloatNear({half(1), half(2), half(3), half(4), half(-1),
+                                  half(6), half(7), half(-2), half(9)})));
   EXPECT_EQ(output_tensor->data.data, input_tensor->data.data);
 }
 
diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc
index 028623e3a0a321..a8e9815f30bc61 100644
--- a/tensorflow/lite/kernels/fill_test.cc
+++ b/tensorflow/lite/kernels/fill_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -139,8 +140,8 @@ TEST_P(FillOpTest, FillFloat) {
 }
 
 TEST_P(FillOpTest, FillFloat16) {
-  FillOpModel<int64_t, Eigen::half> m(TensorType_INT64, {3}, {2, 2, 2},
-                                      Eigen::half(4.0f), GetParam());
+  FillOpModel<int64_t, half> m(TensorType_INT64, {3}, {2, 2, 2}, half(4.0f),
+                               GetParam());
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(
       m.GetOutput(),
diff --git a/tensorflow/lite/kernels/floor_test.cc b/tensorflow/lite/kernels/floor_test.cc
index 86ea68ad39e599..13154175e334cc 100644
--- a/tensorflow/lite/kernels/floor_test.cc
+++ b/tensorflow/lite/kernels/floor_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -79,28 +80,28 @@ TEST(FloorOpTest, MultiDims) {
 
 TEST(FloorOpTest, SingleDimFloat16) {
   FloorOpModel model({2}, TensorType_FLOAT16);
-  model.PopulateTensor<>(model.input(), {Eigen::half(8.5), Eigen::half(0.0)});
+  model.PopulateTensor<>(model.input(), {half(8.5f), half(0.0f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput<Eigen::half>(), ElementsAreArray({8, 0}));
+  EXPECT_THAT(model.GetOutput<half>(), ElementsAreArray({8, 0}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
 }
 
 TEST(FloorOpTest, MultiDimsFloat16) {
   FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT16);
-  model.PopulateTensor<Eigen::half>(model.input(), {
-                                                       Eigen::half(0.75),
-                                                       Eigen::half(8.25),
-                                                       Eigen::half(0.49),
-                                                       Eigen::half(9.99),
-                                                       Eigen::half(0.5),
-                                                       Eigen::half(-0.25),
-                                                       Eigen::half(-8.75),
-                                                       Eigen::half(-0.99),
-                                                       Eigen::half(-9.49),
-                                                       Eigen::half(-0.5),
-                                                   });
+  model.PopulateTensor<half>(model.input(), {
+                                                half(0.75f),
+                                                half(8.25f),
+                                                half(0.49f),
+                                                half(9.99f),
+                                                half(0.5f),
+                                                half(-0.25f),
+                                                half(-8.75f),
+                                                half(-0.99f),
+                                                half(-9.49f),
+                                                half(-0.5f),
+                                            });
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(model.GetOutput<Eigen::half>(),
+  EXPECT_THAT(model.GetOutput<half>(),
               ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
 }
diff --git a/tensorflow/lite/kernels/gather_nd_test.cc b/tensorflow/lite/kernels/gather_nd_test.cc
index 2bd9a0235ebe2c..f4b9f65711fbdc 100644
--- a/tensorflow/lite/kernels/gather_nd_test.cc
+++ b/tensorflow/lite/kernels/gather_nd_test.cc
@@ -20,10 +20,12 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -244,21 +246,19 @@ TEST(GatherNdOpTest, BFloat16Int32) {
 TEST(GatherNdOpTest, Float16Int32) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT32, {2, 2}});
-  m.SetInput<Eigen::half>(
-      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
-       Eigen::half(2.2), Eigen::half(2.3),  //
-       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
-       Eigen::half(-4.2), Eigen::half(4.3),  //
-       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
-       Eigen::half(-6.2), Eigen::half(6.3)});
+  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
+                    half(2.2f), half(2.3f),  //
+                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
+                    half(-4.2f), half(4.3f),  //
+                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
+                    half(-6.2f), half(6.3f)});
   m.SetPositions<int32_t>({0, 1, 1, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<Eigen::half>(),
-      Pointwise(FloatingPointEq(),
-                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
-                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
+      m.GetOutput<half>(),
+      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
+                                    half(3.1f), half(3.2f), half(-3.3f)}));
 }
 
 TEST(GatherNdOpTest, Float32Int32) {
@@ -297,21 +297,19 @@ TEST(GatherNdOpTest, BFloat16Int64) {
 TEST(GatherNdOpTest, Float16Int64) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT64, {2, 2}});
-  m.SetInput<Eigen::half>(
-      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
-       Eigen::half(2.2), Eigen::half(2.3),  //
-       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
-       Eigen::half(-4.2), Eigen::half(4.3),  //
-       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
-       Eigen::half(-6.2), Eigen::half(6.3)});
+  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
+                    half(2.2f), half(2.3f),  //
+                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
+                    half(-4.2f), half(4.3f),  //
+                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
+                    half(-6.2f), half(6.3f)});
   m.SetPositions<int64_t>({0LL, 1LL, 1LL, 0LL});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<Eigen::half>(),
-      Pointwise(FloatingPointEq(),
-                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
-                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
+      m.GetOutput<half>(),
+      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
+                                    half(3.1f), half(3.2f), half(-3.3f)}));
 }
 
 TEST(GatherNdOpTest, Float32Int64) {
@@ -462,21 +460,19 @@ TEST(GatherNdOpTest, BFloat16Int16) {
 TEST(GatherNdOpTest, Float16Int16) {
   GatherNdOpModel m({TensorType_FLOAT16, {3, 2, 3}},
                     {TensorType_INT16, {2, 2}});
-  m.SetInput<Eigen::half>(
-      {Eigen::half(1.1), Eigen::half(-1.2), Eigen::half(1.3), Eigen::half(-2.1),
-       Eigen::half(2.2), Eigen::half(2.3),  //
-       Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3), Eigen::half(-4.1),
-       Eigen::half(-4.2), Eigen::half(4.3),  //
-       Eigen::half(5.1), Eigen::half(-5.2), Eigen::half(5.3), Eigen::half(6.1),
-       Eigen::half(-6.2), Eigen::half(6.3)});
+  m.SetInput<half>({half(1.1f), half(-1.2f), half(1.3f), half(-2.1f),
+                    half(2.2f), half(2.3f),  //
+                    half(3.1f), half(3.2f), half(-3.3f), half(-4.1f),
+                    half(-4.2f), half(4.3f),  //
+                    half(5.1f), half(-5.2f), half(5.3f), half(6.1f),
+                    half(-6.2f), half(6.3f)});
   m.SetPositions<int16_t>({0, 1, 1, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(
-      m.GetOutput<Eigen::half>(),
-      Pointwise(FloatingPointEq(),
-                {Eigen::half(-2.1), Eigen::half(2.2), Eigen::half(2.3),
-                 Eigen::half(3.1), Eigen::half(3.2), Eigen::half(-3.3)}));
+      m.GetOutput<half>(),
+      Pointwise(FloatingPointEq(), {half(-2.1f), half(2.2f), half(2.3f),
+                                    half(3.1f), half(3.2f), half(-3.3f)}));
 }
 
 TEST(GatherNdOpTest, Float32Int16) {
diff --git a/tensorflow/lite/kernels/gather_test.cc b/tensorflow/lite/kernels/gather_test.cc
index 23e30eb7867774..61ca1b654f6160 100644
--- a/tensorflow/lite/kernels/gather_test.cc
+++ b/tensorflow/lite/kernels/gather_test.cc
@@ -20,9 +20,11 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -252,7 +254,7 @@ TEST_P(GatherOpTest, LastAxis0DIndex) {
 }
 
 using TestTypes = testing::Types<int8_t, uint8_t, int16_t, int32_t, int64_t,
-                                 float, Eigen::half, Eigen::bfloat16>;
+                                 float, half, Eigen::bfloat16>;
 
 template <typename T>
 struct TypedGatherOpTest : public testing::Test {};
diff --git a/tensorflow/lite/kernels/maximum_minimum_test.cc b/tensorflow/lite/kernels/maximum_minimum_test.cc
index babdb4f69fad03..00e25ee9b86500 100644
--- a/tensorflow/lite/kernels/maximum_minimum_test.cc
+++ b/tensorflow/lite/kernels/maximum_minimum_test.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -247,24 +248,20 @@ TEST(MaximumOpTest, Int32WithBroadcastTest5D) {
 }
 
 TEST(MaximumOpTest, Float16Test) {
-  std::initializer_list<Eigen::half> data1 = {
-      Eigen::half(1.0),  Eigen::half(0.0),  Eigen::half(-1.0),
-      Eigen::half(11.0), Eigen::half(-2.0), Eigen::half(-1.44)};
-  std::initializer_list<Eigen::half> data2 = {
-      Eigen::half(-1.0), Eigen::half(0.0),  Eigen::half(1.0),
-      Eigen::half(12.0), Eigen::half(-3.0), Eigen::half(-1.43)};
-  TestModel<Eigen::half>(
-      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}},
-      {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1,
-      data2,
-      {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(1.0), Eigen::half(12.0),
-       Eigen::half(-2.0), Eigen::half(-1.43)});
-  TestModel<Eigen::half>(
-      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}},
-      {TensorType_FLOAT16, {3, 1, 2}}, {TensorType_FLOAT16, {3, 1, 2}}, data1,
-      data2,
-      {Eigen::half(-1.0), Eigen::half(0.0), Eigen::half(-1.0),
-       Eigen::half(11.0), Eigen::half(-3.0), Eigen::half(-1.44)});
+  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f),  half(-1.0f),
+                                       half(11.0f), half(-2.0f), half(-1.44f)};
+  std::initializer_list<half> data2 = {half(-1.0f), half(0.0f),  half(1.0f),
+                                       half(12.0f), half(-3.0f), half(-1.43f)};
+  TestModel<half>(BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}}, data1, data2,
+                  {half(1.0f), half(0.0f), half(1.0f), half(12.0f), half(-2.0f),
+                   half(-1.43f)});
+  TestModel<half>(BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}},
+                  {TensorType_FLOAT16, {3, 1, 2}}, data1, data2,
+                  {half(-1.0f), half(0.0f), half(-1.0f), half(11.0f),
+                   half(-3.0f), half(-1.44f)});
 }
 
 TEST(MaximumOpTest, BFloat16Test) {
@@ -308,42 +305,39 @@ TEST(MaximumOpTest, BFloat16WithBroadcastTest5DScalarY) {
 }
 
 TEST(MaximumOpTest, Float16WithBroadcastTest5DScalarY) {
-  std::initializer_list<Eigen::half> data1 = {
-      Eigen::half(1.0),  Eigen::half(0.0), Eigen::half(-1.0),
-      Eigen::half(-2.0), Eigen::half(3.0), Eigen::half(11.0)};
-  std::initializer_list<Eigen::half> data2 = {Eigen::half(2.0)};
-  TestModel<Eigen::half>(
-      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
-      {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1,
-      data2,
-      {Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0), Eigen::half(2.0),
-       Eigen::half(3.0), Eigen::half(11.0)});
-  TestModel<Eigen::half>(
-      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
-      {TensorType_FLOAT16, {1}}, {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1,
-      data2,
-      {Eigen::half(1.0), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0),
-       Eigen::half(2.0), Eigen::half(2.0)});
+  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f), half(-1.0f),
+                                       half(-2.0f), half(3.0f), half(11.0f)};
+  std::initializer_list<half> data2 = {half(2.0f)};
+  TestModel<half>(BuiltinOperator_MAXIMUM,
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
+                  {TensorType_FLOAT16, {1}},
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2,
+                  {half(2.0f), half(2.0f), half(2.0f), half(2.0f), half(3.0f),
+                   half(11.0f)});
+  TestModel<half>(BuiltinOperator_MINIMUM,
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}},
+                  {TensorType_FLOAT16, {1}},
+                  {TensorType_FLOAT16, {3, 1, 2, 1, 1}}, data1, data2,
+                  {half(1.0f), half(0.0f), half(-1.0f), half(-2.0f), half(2.0f),
+                   half(2.0f)});
 }
 
 TEST(MaximumOpTest, Float16WithBroadcastTest5D) {
-  std::initializer_list<Eigen::half> data1 = {
-      Eigen::half(1.0),  Eigen::half(0.0),   Eigen::half(-1.0),
-      Eigen::half(-2.0), Eigen::half(-1.44), Eigen::half(11.0)};
-  std::initializer_list<Eigen::half> data2 = {Eigen::half(0.5),
-                                              Eigen::half(2.0)};
-  TestModel<Eigen::half>(
-      BuiltinOperator_MAXIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
-      {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1,
-      data2,
-      {Eigen::half(1.0), Eigen::half(2.0), Eigen::half(0.5), Eigen::half(2.0),
-       Eigen::half(0.5), Eigen::half(11.0)});
-  TestModel<Eigen::half>(
-      BuiltinOperator_MINIMUM, {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
-      {TensorType_FLOAT16, {2}}, {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1,
-      data2,
-      {Eigen::half(0.5), Eigen::half(0.0), Eigen::half(-1.0), Eigen::half(-2.0),
-       Eigen::half(-1.44), Eigen::half(2.0)});
+  std::initializer_list<half> data1 = {half(1.0f),  half(0.0f),   half(-1.0f),
+                                       half(-2.0f), half(-1.44f), half(11.0f)};
+  std::initializer_list<half> data2 = {half(0.5f), half(2.0f)};
+  TestModel<half>(BuiltinOperator_MAXIMUM,
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
+                  {TensorType_FLOAT16, {2}},
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2,
+                  {half(1.0f), half(2.0f), half(0.5f), half(2.0f), half(0.5f),
+                   half(11.0f)});
+  TestModel<half>(BuiltinOperator_MINIMUM,
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}},
+                  {TensorType_FLOAT16, {2}},
+                  {TensorType_FLOAT16, {3, 1, 1, 1, 2}}, data1, data2,
+                  {half(0.5f), half(0.0f), half(-1.0f), half(-2.0f),
+                   half(-1.44f), half(2.0f)});
 }
 
 TEST(MaximumOpTest, BFloat16WithBroadcastTest5D) {
diff --git a/tensorflow/lite/kernels/neg_test.cc b/tensorflow/lite/kernels/neg_test.cc
index fe9cc68bdf8a4d..883f9182758412 100644
--- a/tensorflow/lite/kernels/neg_test.cc
+++ b/tensorflow/lite/kernels/neg_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "flatbuffers/flatbuffers.h"  // from @flatbuffers
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -67,14 +68,12 @@ TEST(NegOpModel, NegFloat32) {
 
 TEST(NegOpModel, NegFloat16) {
   NegOpModel m({TensorType_FLOAT16, {6}}, {TensorType_FLOAT16, {6}});
-  m.SetInput<Eigen::half>({Eigen::half(-2.0f), Eigen::half(-1.0f),
-                           Eigen::half(0.f), Eigen::half(1.0f),
-                           Eigen::half(2.0f), Eigen::half(3.0f)});
+  m.SetInput<half>({half(-2.0f), half(-1.0f), half(0.f), half(1.0f), half(2.0f),
+                    half(3.0f)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput<Eigen::half>(),
-              ElementsAreArray({Eigen::half(2.0f), Eigen::half(1.0f),
-                                Eigen::half(0.f), Eigen::half(-1.0f),
-                                Eigen::half(-2.0f), Eigen::half(-3.0f)}));
+  EXPECT_THAT(m.GetOutput<half>(),
+              ElementsAreArray({half(2.0f), half(1.0f), half(0.f), half(-1.0f),
+                                half(-2.0f), half(-3.0f)}));
 }
 
 TEST(NegOpModel, NegBfloat16) {
diff --git a/tensorflow/lite/kernels/pad_test.cc b/tensorflow/lite/kernels/pad_test.cc
index 971be96a915b4b..b985abccddcee7 100644
--- a/tensorflow/lite/kernels/pad_test.cc
+++ b/tensorflow/lite/kernels/pad_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/core/interpreter.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -927,19 +928,16 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleConstFloat32ValuedTestInt8) {
 
 template <typename padding_integer_type>
 void SimpleConstFloat16ValuedTest() {
-  PadV2OpConstModel<Eigen::half, padding_integer_type> m(
+  PadV2OpConstModel<half, padding_integer_type> m(
       {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, {0, 0, 1, 1, 1, 1, 0, 0},
-      Eigen::half{4.0f}, {TensorType_FLOAT16});
-  m.SetInput({Eigen::half{1.5f}, Eigen::half{2.5f}, Eigen::half{3.5f},
-              Eigen::half{4.5}});
+      half{4.0f}, {TensorType_FLOAT16});
+  m.SetInput({half{1.5f}, half{2.5f}, half{3.5f}, half{4.5f}});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray(ArrayFloatNear(
-          {Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4},
-           Eigen::half{4}, Eigen::half{1.5}, Eigen::half{2.5}, Eigen::half{4},
-           Eigen::half{4}, Eigen::half{3.5}, Eigen::half{4.5}, Eigen::half{4},
-           Eigen::half{4}, Eigen::half{4}, Eigen::half{4}, Eigen::half{4}})));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {half{4}, half{4}, half{4}, half{4}, half{4}, half{1.5f},
+                   half{2.5f}, half{4}, half{4}, half{3.5f}, half{4.5f},
+                   half{4}, half{4}, half{4}, half{4}, half{4}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
@@ -1050,12 +1048,15 @@ TEST_F(PadV2OpTest, Int16PaddingSimple4DConstFloat32ValuedTest) {
 
 template <typename padding_integer_type>
 void Simple4DConstFloat16ValuedTest() {
-  PadV2OpConstModel<Eigen::half, padding_integer_type> m(
+  PadV2OpConstModel<half, padding_integer_type> m(
       {TensorType_FLOAT16, {1, 1, 2, 1}}, {4, 2}, {0, 1, 0, 0, 0, 0, 0, 1},
-      Eigen::half{7.0}, {TensorType_FLOAT16});
-  m.SetInput({Eigen::half{3.0f}, Eigen::half{6.0f}});
+      half{7.0f}, {TensorType_FLOAT16});
+  m.SetInput({half{3.0f}, half{6.0f}});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({3, 7, 6, 7, 7, 7, 7, 7}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {half{3.0f}, half{7.0f}, half{6.0f}, half{7.0f}, half{7.0f},
+                   half{7.0f}, half{7.0f}, half{7.0f}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2, 2}));
 }
 
@@ -1167,15 +1168,18 @@ TEST_F(PadV2OpTest, Int16PaddingSimpleDynamicTest) {
 
 template <typename padding_integer_type>
 void SimpleDynamicTestV2Float16() {
-  PadV2OpDynamicModel<Eigen::half, padding_integer_type> m(
-      {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, Eigen::half{0.0},
+  PadV2OpDynamicModel<half, padding_integer_type> m(
+      {TensorType_FLOAT16, {1, 2, 2, 1}}, {4, 2}, half{0.0f},
       {TensorType_FLOAT16});
-  m.SetInput({Eigen::half{1.0f}, Eigen::half{2.0f}, Eigen::half{3.0f},
-              Eigen::half{4.0f}});
+  m.SetInput({half{1.0f}, half{2.0f}, half{3.0f}, half{4.0f}});
   m.SetPaddings({0, 0, 1, 1, 1, 1, 0, 0});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4,
-                                               0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f},
+                   half{1.0f}, half{2.0f}, half{0.0f}, half{0.0f}, half{3.0f},
+                   half{4.0f}, half{0.0f}, half{0.0f}, half{0.0f}, half{0.0f},
+                   half{0.0f}})));
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 4, 1}));
 }
 
diff --git a/tensorflow/lite/kernels/reverse_test.cc b/tensorflow/lite/kernels/reverse_test.cc
index 4301b0120f53c3..7e2d3df543ba28 100644
--- a/tensorflow/lite/kernels/reverse_test.cc
+++ b/tensorflow/lite/kernels/reverse_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -354,45 +355,38 @@ TEST(ReverseOpTest, Int16MultiDimensions) {
 
 // float16 tests.
 TEST(ReverseOpTest, Float16OneDimension) {
-  ReverseOpModel<Eigen::half> model({TensorType_FLOAT16, {4}},
-                                    {TensorType_INT32, {1}});
-  model.PopulateTensor<Eigen::half>(
-      model.input(),
-      {Eigen::half(1), Eigen::half(2), Eigen::half(3), Eigen::half(4)});
+  ReverseOpModel<half> model({TensorType_FLOAT16, {4}},
+                             {TensorType_INT32, {1}});
+  model.PopulateTensor<half>(model.input(),
+                             {half(1), half(2), half(3), half(4)});
   model.PopulateTensor<int32_t>(model.axis(), {0});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(4));
   EXPECT_THAT(model.GetOutput(),
-              ElementsAreArray({Eigen::half(4), Eigen::half(3), Eigen::half(2),
-                                Eigen::half(1)}));
+              ElementsAreArray({half(4), half(3), half(2), half(1)}));
 }
 
 TEST(ReverseOpTest, Float16MultiDimensions) {
-  ReverseOpModel<Eigen::half> model({TensorType_FLOAT16, {4, 3, 2}},
-                                    {TensorType_INT32, {1}});
-  model.PopulateTensor<Eigen::half>(
+  ReverseOpModel<half> model({TensorType_FLOAT16, {4, 3, 2}},
+                             {TensorType_INT32, {1}});
+  model.PopulateTensor<half>(
       model.input(),
-      {Eigen::half(1),  Eigen::half(2),  Eigen::half(3),  Eigen::half(4),
-       Eigen::half(5),  Eigen::half(6),  Eigen::half(7),  Eigen::half(8),
-       Eigen::half(9),  Eigen::half(10), Eigen::half(11), Eigen::half(12),
-       Eigen::half(13), Eigen::half(14), Eigen::half(15), Eigen::half(16),
-       Eigen::half(17), Eigen::half(18), Eigen::half(19), Eigen::half(20),
-       Eigen::half(21), Eigen::half(22), Eigen::half(23), Eigen::half(24)});
+      {half(1),  half(2),  half(3),  half(4),  half(5),  half(6),
+       half(7),  half(8),  half(9),  half(10), half(11), half(12),
+       half(13), half(14), half(15), half(16), half(17), half(18),
+       half(19), half(20), half(21), half(22), half(23), half(24)});
   model.PopulateTensor<int32_t>(model.axis(), {1});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
 
   EXPECT_THAT(model.GetOutputShape(), ElementsAre(4, 3, 2));
   EXPECT_THAT(
       model.GetOutput(),
-      ElementsAreArray({Eigen::half(5),  Eigen::half(6),  Eigen::half(3),
-                        Eigen::half(4),  Eigen::half(1),  Eigen::half(2),
-                        Eigen::half(11), Eigen::half(12), Eigen::half(9),
-                        Eigen::half(10), Eigen::half(7),  Eigen::half(8),
-                        Eigen::half(17), Eigen::half(18), Eigen::half(15),
-                        Eigen::half(16), Eigen::half(13), Eigen::half(14),
-                        Eigen::half(23), Eigen::half(24), Eigen::half(21),
-                        Eigen::half(22), Eigen::half(19), Eigen::half(20)}));
+      ElementsAreArray({half(5),  half(6),  half(3),  half(4),  half(1),
+                        half(2),  half(11), half(12), half(9),  half(10),
+                        half(7),  half(8),  half(17), half(18), half(15),
+                        half(16), half(13), half(14), half(23), half(24),
+                        half(21), half(22), half(19), half(20)}));
 }
 
 // bfloat16 tests.
diff --git a/tensorflow/lite/kernels/round_test.cc b/tensorflow/lite/kernels/round_test.cc
index c3752827f3e61c..e3fccf888c9815 100644
--- a/tensorflow/lite/kernels/round_test.cc
+++ b/tensorflow/lite/kernels/round_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -68,33 +69,29 @@ TEST(RoundOpTest, MultiDims) {
 }
 
 TEST(RoundOpTest, Float16SingleDim) {
-  RoundOpModel<Eigen::half> model({6});
-  model.PopulateTensor<Eigen::half>(
-      model.input(), {Eigen::half(8.5), Eigen::half(0.0), Eigen::half(3.5),
-                      Eigen::half(4.2), Eigen::half(-3.5), Eigen::half(-4.5)});
+  RoundOpModel<half> model({6});
+  model.PopulateTensor<half>(model.input(),
+                             {half(8.5f), half(0.0f), half(3.5f), half(4.2f),
+                              half(-3.5f), half(-4.5f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(
-      model.GetOutput(),
-      ElementsAreArray({Eigen::half(8), Eigen::half(0), Eigen::half(4),
-                        Eigen::half(4), Eigen::half(-4), Eigen::half(-4)}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray(
+                  {half(8), half(0), half(4), half(4), half(-4), half(-4)}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({6}));
 }
 
 TEST(RoundOpTest, Float16MultiDims) {
-  RoundOpModel<Eigen::half> model({2, 1, 1, 6});
-  model.PopulateTensor<Eigen::half>(
+  RoundOpModel<half> model({2, 1, 1, 6});
+  model.PopulateTensor<half>(
       model.input(),
-      {Eigen::half(0.0001), Eigen::half(8.0001), Eigen::half(0.9999),
-       Eigen::half(9.9999), Eigen::half(0.5), Eigen::half(-0.0001),
-       Eigen::half(-8.0001), Eigen::half(-0.9999), Eigen::half(-9.9999),
-       Eigen::half(-0.5), Eigen::half(-2.5), Eigen::half(1.5)});
+      {half(0.0001f), half(8.0001f), half(0.9999f), half(9.9999f), half(0.5f),
+       half(-0.0001f), half(-8.0001f), half(-0.9999f), half(-9.9999f),
+       half(-0.5f), half(-2.5f), half(1.5f)});
   ASSERT_EQ(model.Invoke(), kTfLiteOk);
-  EXPECT_THAT(
-      model.GetOutput(),
-      ElementsAreArray({Eigen::half(0), Eigen::half(8), Eigen::half(1),
-                        Eigen::half(10), Eigen::half(0), Eigen::half(0),
-                        Eigen::half(-8), Eigen::half(-1), Eigen::half(-10),
-                        Eigen::half(-0), Eigen::half(-2), Eigen::half(2)}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({half(0), half(8), half(1), half(10), half(0),
+                                half(0), half(-8), half(-1), half(-10),
+                                half(-0), half(-2), half(2)}));
   EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 6}));
 }
 
diff --git a/tensorflow/lite/kernels/slice_test.cc b/tensorflow/lite/kernels/slice_test.cc
index feb02c48d2f3aa..2f3430770f7b68 100644
--- a/tensorflow/lite/kernels/slice_test.cc
+++ b/tensorflow/lite/kernels/slice_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <type_traits>
 #include <vector>
 
-#include "Eigen/Core"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/lite/core/c/common.h"
@@ -29,6 +28,7 @@ limitations under the License.
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -338,20 +338,16 @@ TEST_P(SliceOpTest, SliceBool) {
 }
 
 TEST_P(SliceOpTest, SliceFloat16) {
-  SliceOpModel<Eigen::half, int32_t> m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4},
-                                       {2, 1, -1, 1}, TensorType_INT32,
-                                       TensorType_FLOAT16, GetParam());
-  m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(1), Eigen::half(2),
-              Eigen::half(2), Eigen::half(2), Eigen::half(3), Eigen::half(3),
-              Eigen::half(3), Eigen::half(4), Eigen::half(4), Eigen::half(4),
-              Eigen::half(5), Eigen::half(5), Eigen::half(5), Eigen::half(6),
-              Eigen::half(6), Eigen::half(6)});
+  SliceOpModel<half, int32_t> m({3, 2, 3, 1}, {4}, {1, 0, 0, 0}, {4},
+                                {2, 1, -1, 1}, TensorType_INT32,
+                                TensorType_FLOAT16, GetParam());
+  m.SetInput({half(1), half(1), half(1), half(2), half(2), half(2), half(3),
+              half(3), half(3), half(4), half(4), half(4), half(5), half(5),
+              half(5), half(6), half(6), half(6)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 3, 1}));
-  EXPECT_THAT(
-      m.GetOutput(),
-      ElementsAreArray({Eigen::half(3), Eigen::half(3), Eigen::half(3),
-                        Eigen::half(5), Eigen::half(5), Eigen::half(5)}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({half(3), half(3), half(3),
+                                               half(5), half(5), half(5)}));
 }
 
 TEST_P(SliceOpTest, SliceBFloat16) {
@@ -373,19 +369,16 @@ TEST_P(SliceOpTest, SliceBFloat16) {
 }
 
 TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1Float16) {
-  SliceOpModel<Eigen::half, int32_t> m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4},
-                                       {2, -1, 1, 1}, TensorType_INT32,
-                                       TensorType_FLOAT16, GetParam());
-  m.SetInput({Eigen::half(1), Eigen::half(1), Eigen::half(2), Eigen::half(2),
-              Eigen::half(3), Eigen::half(3), Eigen::half(4), Eigen::half(4),
-              Eigen::half(5), Eigen::half(5), Eigen::half(6), Eigen::half(6),
-              Eigen::half(7), Eigen::half(7), Eigen::half(8), Eigen::half(8),
-              Eigen::half(9), Eigen::half(9)});
+  SliceOpModel<half, int32_t> m({3, 3, 2, 1}, {4}, {1, 1, 0, 0}, {4},
+                                {2, -1, 1, 1}, TensorType_INT32,
+                                TensorType_FLOAT16, GetParam());
+  m.SetInput({half(1), half(1), half(2), half(2), half(3), half(3), half(4),
+              half(4), half(5), half(5), half(6), half(6), half(7), half(7),
+              half(8), half(8), half(9), half(9)});
   ASSERT_EQ(m.Invoke(), kTfLiteOk);
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1, 1}));
   EXPECT_THAT(m.GetOutput(),
-              ElementsAreArray({Eigen::half(5), Eigen::half(6), Eigen::half(8),
-                                Eigen::half(9)}));
+              ElementsAreArray({half(5), half(6), half(8), half(9)}));
 }
 
 TEST_P(SliceOpTest, BeginNonZeroSizeMinus1Axis1BFloat16) {
diff --git a/tensorflow/lite/kernels/strided_slice_test.cc b/tensorflow/lite/kernels/strided_slice_test.cc
index 6ba4ef3b78977f..f7c79680576fe1 100644
--- a/tensorflow/lite/kernels/strided_slice_test.cc
+++ b/tensorflow/lite/kernels/strided_slice_test.cc
@@ -22,8 +22,10 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive  // IWYU pragma: keep
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace {
@@ -152,7 +154,7 @@ class StridedSliceOpModel : public SingleOpModel {
 template <typename T>
 class StridedSliceOpTest : public ::testing::Test {};
 
-using DataTypes = ::testing::Types<float, Eigen::half, Eigen::bfloat16, uint8_t,
+using DataTypes = ::testing::Types<float, half, Eigen::bfloat16, uint8_t,
                                    uint32_t, int8_t, int16_t, int32_t>;
 TYPED_TEST_SUITE(StridedSliceOpTest, DataTypes);
 
@@ -347,7 +349,9 @@ TYPED_TEST(StridedSliceOpTest, In1D_Int32End) {
       continue;
     }
     std::vector<TypeParam> values(32768);
-    std::iota(values.begin(), values.end(), TypeParam(0));
+    for (int i = 0; i < 32768; ++i) {
+      values[i] = static_cast<TypeParam>(i);
+    }
 
     StridedSliceOpModel<TypeParam> m({32768}, {1}, {1}, {1}, values, {0},
                                      {32768}, {1}, 0, 0, 0, 0, 0,
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 2ebeb4a9457280..f792bd31529582 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "tensorflow/lite/tools/logging.h"
 #include "tensorflow/lite/tools/serialization/writer_lib.h"
 #include "tensorflow/lite/tools/versioning/op_version.h"
+#include "tensorflow/lite/types/fp16.h"  // IWYU pragma: keep
 #include "tensorflow/lite/version.h"
 #include "tsl/platform/logging.h"
 
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index cbdb74d29d04aa..69053a598785a2 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -38,7 +38,6 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "fp16/fp16.h"  // from @FP16
 #include "absl/algorithm/container.h"
 #include "absl/log/absl_check.h"
 #include "absl/log/absl_log.h"
@@ -57,6 +56,8 @@ limitations under the License.
 #include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/testing/util.h"  // IWYU pragma: keep
 #include "tensorflow/lite/tools/optimize/quantization_utils.h"
+#include "tensorflow/lite/types/fp16.h"
+#include "tensorflow/lite/types/half.h"
 #include "tensorflow/lite/util.h"
 #include "tsl/platform/logging.h"
 
@@ -134,7 +135,7 @@ inline std::vector<float> Dequantize(const std::vector<T>& data, float scale,
 }
 
 template <>
-constexpr TfLiteType typeToTfLiteType<Eigen::half>() {
+constexpr TfLiteType typeToTfLiteType<half>() {
   return kTfLiteFloat16;
 }
 
@@ -1362,7 +1363,7 @@ TFLITE_TENSOR_TYPE_ASSOC(uint16_t, TensorType_UINT16);
 TFLITE_TENSOR_TYPE_ASSOC(uint32_t, TensorType_UINT32);
 TFLITE_TENSOR_TYPE_ASSOC(uint64_t, TensorType_UINT64);
 TFLITE_TENSOR_TYPE_ASSOC(TfLiteFloat16, TensorType_FLOAT16);
-TFLITE_TENSOR_TYPE_ASSOC(Eigen::half, TensorType_FLOAT16);
+TFLITE_TENSOR_TYPE_ASSOC(half, TensorType_FLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(TfLiteBFloat16, TensorType_BFLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(Eigen::bfloat16, TensorType_BFLOAT16);
 TFLITE_TENSOR_TYPE_ASSOC(float, TensorType_FLOAT32);
@@ -1461,13 +1462,13 @@ struct TypeUnion<uint8_t> {
 };
 
 template <>
-struct TypeUnion<Eigen::half> {
+struct TypeUnion<half> {
  public:
   // NOLINTNEXTLINE
   static constexpr TensorType tensor_type = TensorType::TensorType_FLOAT16;
   // NOLINTNEXTLINE
   static constexpr TfLiteType tflite_type = TfLiteType::kTfLiteFloat16;
-  typedef Eigen::half ScalarType;
+  typedef half ScalarType;
 };
 
 template <>
diff --git a/tensorflow/lite/kernels/test_util_test.cc b/tensorflow/lite/kernels/test_util_test.cc
index ed9a679b4e4d33..01f514692b0616 100644
--- a/tensorflow/lite/kernels/test_util_test.cc
+++ b/tensorflow/lite/kernels/test_util_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/lite/array.h"
 #include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/kernels/test_delegate_providers.h"
+#include "tensorflow/lite/types/half.h"
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
@@ -197,6 +198,14 @@ TEST(TestUtilTest, QuantizeVectorScalingUp) {
   EXPECT_THAT(q_data, ElementsAreArray(expected));
 }
 
+TEST(TestUtilTest, DequantizeVectorFp16) {
+  std::vector<half> data = {half(-1.0f), half(-0.5f), half(0.0f), half(0.5f),
+                            half(1.0f)};
+  auto f_data = Dequantize<half>(data, /*scale=*/0.1f, /*zero_point=*/0);
+  std::vector<float> expected = {-0.1f, -0.05f, 0.0f, 0.05f, 0.1f};
+  EXPECT_THAT(f_data, ElementsAreArray(tflite::ArrayFloatNear(expected, 1e-7)));
+}
+
 TEST(DimsAreMatcherTestTensor, ValidOneD) {
   TensorUniquePtr t = BuildTfLiteTensor(kTfLiteInt32, {2}, kTfLiteDynamic);
   EXPECT_THAT(t.get(), DimsAre({2}));

From 667249ad7a114f057af2cfb879097a785e23b13a Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Wed, 17 Dec 2025 12:31:50 -0800
Subject: [PATCH 457/753] Update `rules_ml_toolchain` version to remove
 redundant `fake_nvshmem_bootstrap_uid` library from hermetic CUDA deps.

PiperOrigin-RevId: 845873716
---
 WORKSPACE                 | 6 +++---
 tensorflow/workspace0.bzl | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 33c2a4a4ac691d..c7944b7b28c0db 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -21,10 +21,10 @@ tf_http_archive(
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
-    strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
+    sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
+    strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
     urls = tf_mirror_urls(
-        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
     ),
 )
 
diff --git a/tensorflow/workspace0.bzl b/tensorflow/workspace0.bzl
index 3385041e0c76da..3ac7ef7b409c34 100644
--- a/tensorflow/workspace0.bzl
+++ b/tensorflow/workspace0.bzl
@@ -108,10 +108,10 @@ def workspace():
     # Details: https://github.com/google-ml-infra/rules_ml_toolchain
     tf_http_archive(
         name = "rules_ml_toolchain",
-        sha256 = "e9842de3fefb5a120d3b1647d3a09e6e7071e8df8d1cd2dfe6f66ee31fd2595e",
-        strip_prefix = "rules_ml_toolchain-cb79a8fc8dcf3f75743dcd9b3418a70c884a7269",
+        sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
+        strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
         urls = tf_mirror_urls(
-            "https://github.com/google-ml-infra/rules_ml_toolchain/archive/cb79a8fc8dcf3f75743dcd9b3418a70c884a7269.tar.gz",
+            "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
         ),
     )
 

From fe02aab3112b67dedd407d39e720741a8dcc1282 Mon Sep 17 00:00:00 2001
From: Misha Gutman <aelphy@google.com>
Date: Wed, 17 Dec 2025 12:37:25 -0800
Subject: [PATCH 458/753] Supported int2 in xnnpack_delegate.

PiperOrigin-RevId: 845875757
---
 .../delegates/xnnpack/xnnpack_delegate.cc     | 77 ++++++++++++++++---
 1 file changed, 67 insertions(+), 10 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
index a869ca74a4cc5e..f390b8065caac2 100644
--- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
+++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
@@ -211,7 +211,7 @@ bool CheckZeroPointForPerChannelQuantization(
   // be 8.
   for (int c = 0; c < quantization_zero_point.size; c++) {
     const int zero_point = quantization_zero_point.data[c];
-    if (zero_point != 0 && (tensor.type != kTfLiteInt4 && zero_point != 8)) {
+    if (zero_point != 0 && (tensor.type != kTfLiteInt4 || zero_point != 8)) {
       TF_LITE_KERNEL_LOG(context,
                          "unsupported zero-point value (%d) in channel %d of "
                          "%s tensor %d in XNNPACK delegate",
@@ -268,7 +268,8 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context,
       return xnn_datatype_quint8;
     }
     case kTfLiteInt8:
-    case kTfLiteInt4: {
+    case kTfLiteInt4:
+    case kTfLiteInt2: {
       switch (tensor.quantization.type) {
         case kTfLiteAffineQuantization: {
           const auto quantization_params =
@@ -320,6 +321,8 @@ xnn_datatype GetXNNPackDatatype(TfLiteContext* context,
               return xnn_datatype_qcint8;
             case kTfLiteInt4:
               return xnn_datatype_qcint4;
+            case kTfLiteInt2:
+              return xnn_datatype_qcint2;
             default:
               // Outermost switch prevents this
               TFL_UNREACHABLE();
@@ -528,6 +531,22 @@ TfLiteStatus DefineXNNPACKValue(TfLiteContext* context, xnn_subgraph_t subgraph,
           dims.size(), dims.data(), data, XNN_INVALID_VALUE_ID, flags,
           xnnpack_id);
     } break;
+    case xnn_datatype_qcint2: {
+      status = xnn_define_channelwise_quantized_tensor_value_v3(
+          subgraph, datatype,
+          static_cast<const TfLiteAffineQuantization*>(
+              tensor.quantization.params)
+              ->zero_point->data[0],
+          static_cast<const TfLiteAffineQuantization*>(
+              tensor.quantization.params)
+              ->scale->data,
+          dims.size(),
+          static_cast<const TfLiteAffineQuantization*>(
+              tensor.quantization.params)
+              ->quantized_dimension,
+          dims.data(), data, XNN_INVALID_VALUE_ID, flags, xnnpack_id,
+          /*channelwise_zero_point=*/nullptr);
+    } break;
     case xnn_datatype_qcint4:
     case xnn_datatype_qcint8:
     case xnn_datatype_qcint32:
@@ -2228,18 +2247,21 @@ class Subgraph {
     return kTfLiteError;
   }
 
-  static TfLiteStatus CheckTensorFloat32OrFloat16OrQCInt4OrQCInt8Type(
-      const Delegate& delegate, TfLiteContext* context,
-      const TfLiteTensor& tensor, int expected_quantized_dimension,
-      int tensor_index, int node_index) {
+  static TfLiteStatus CheckTensorFilterType(const Delegate& delegate,
+                                            TfLiteContext* context,
+                                            const TfLiteTensor& tensor,
+                                            int expected_quantized_dimension,
+                                            int tensor_index, int node_index) {
     switch (tensor.type) {
       case kTfLiteFloat32:
       case kTfLiteFloat16:
         return kTfLiteOk;
+      case kTfLiteInt2:
       case kTfLiteInt4:
       case kTfLiteInt8:
         if (delegate.support_signed_8bit_quantization() &&
-            (kTfLiteInt8 == tensor.type || kTfLiteInt4 == tensor.type)) {
+            (kTfLiteInt8 == tensor.type || kTfLiteInt4 == tensor.type ||
+             kTfLiteInt2 == tensor.type)) {
           switch (tensor.quantization.type) {
             case kTfLiteAffineQuantization: {
               const TfLiteAffineQuantization* quantization_params =
@@ -2277,6 +2299,20 @@ class Subgraph {
                     quantization_params->quantized_dimension, tensor_index,
                     node_index);
                 return kTfLiteError;
+              } else if (tensor.type == kTfLiteInt2 &&
+                         quantization_params->scale->size !=
+                             SizeOfDimension(
+                                 &tensor,
+                                 quantization_params->quantized_dimension)) {
+                // Only per channel quantized 2 bit weights are supported.
+                TF_LITE_MAYBE_KERNEL_LOG(
+                    context,
+                    "2 bit weights must be per channel and not per tensor "
+                    "quantized in channel #%" PRId32
+                    " in tensor #%d in node #%d",
+                    quantization_params->quantized_dimension, tensor_index,
+                    node_index);
+                return kTfLiteError;
               }
               break;
             }
@@ -4489,7 +4525,7 @@ class Subgraph {
     // Dynamic filter is supported, but only for FP32.
     if (!(delegate.support_dynamic_fully_connected_operator() &&
           filter_tensor.type == kTfLiteFloat32)) {
-      TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrFloat16OrQCInt4OrQCInt8Type(
+      TF_LITE_ENSURE_STATUS(CheckTensorFilterType(
           delegate, logging_context, filter_tensor,
           /*expected_quantized_dimension=*/0, filter_tensor_id, node_index));
       if (quasi_static_tensors.count(filter_tensor_id) == 0) {
@@ -4543,10 +4579,12 @@ class Subgraph {
     bool dynamically_quantized =
         (!delegate.disable_dynamically_quantized_ops() &&
          (input_tensor.type == kTfLiteFloat32 &&
-          (filter_tensor.type == kTfLiteInt4 ||
+          (filter_tensor.type == kTfLiteInt2 ||
+           filter_tensor.type == kTfLiteInt4 ||
            filter_tensor.type == kTfLiteInt8)));
     bool supported_srq = (input_tensor.type == kTfLiteInt8 &&
-                          (filter_tensor.type == kTfLiteInt4 ||
+                          (filter_tensor.type == kTfLiteInt2 ||
+                           filter_tensor.type == kTfLiteInt4 ||
                            filter_tensor.type == kTfLiteInt8));
     if (input_tensor.type != output_tensor.type ||
         ((input_tensor.type != filter_tensor.type) &&
@@ -4567,6 +4605,15 @@ class Subgraph {
       return kTfLiteError;
     }
 
+    if (filter_tensor.type == kTfLiteInt2 && input_channels % 4 != 0) {
+      TF_LITE_MAYBE_KERNEL_LOG(
+          logging_context,
+          "unsupported non-multiple of 4 number of inputs channels (%d) in"
+          " FULLY_CONNECTED operator #%d",
+          input_channels, node_index);
+      return kTfLiteError;
+    }
+
     float output_min = -std::numeric_limits<float>::infinity();
     float output_max = +std::numeric_limits<float>::infinity();
     TF_LITE_ENSURE_STATUS(ConvertActivationToOutputRange(
@@ -4644,6 +4691,16 @@ class Subgraph {
             &filter_tensor.dims->data[NumDimensions(&filter_tensor)]);
         uint32_t kernel_id = XNN_INVALID_VALUE_ID;
         switch (filter_datatype) {
+          case xnn_datatype_qcint2: {
+            int32_t zero_point_value = filter_params->zero_point->data[0];
+            status = xnn_define_channelwise_quantized_tensor_value_v3(
+                subgraph, filter_datatype, zero_point_value,
+                filter_params->scale->data, filter_dims.size(),
+                /*channel_dim=*/0, filter_dims.data(),
+                GetTensorData<int8_t>(&filter_tensor), XNN_INVALID_VALUE_ID,
+                /*flags=*/0, &kernel_id, /*channelwise_zero_point=*/nullptr);
+            break;
+          }
           case xnn_datatype_qcint4:
           case xnn_datatype_qcint8: {
             int32_t zero_point_value = filter_params->zero_point->data[0];

From 336910b0b3387e81c267d5a3be1474c2e38f8a9a Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Wed, 17 Dec 2025 12:45:56 -0800
Subject: [PATCH 459/753] Add PoisonExecution to PjRtCApiDevice.

PiperOrigin-RevId: 845879114
---
 third_party/xla/xla/pjrt/c/CHANGELOG.md       |  4 ++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 27 +++++++-
 third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc |  6 ++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 16 +++++
 third_party/xla/xla/pjrt/c_api_client/BUILD   |  3 +
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 22 +++++++
 .../xla/pjrt/c_api_client/pjrt_c_api_client.h |  3 +
 .../c_api_client/pjrt_c_api_client_test.cc    | 65 ++++++++++++++++++-
 8 files changed, 141 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/CHANGELOG.md b/third_party/xla/xla/pjrt/c/CHANGELOG.md
index c47d05a264835b..49e8c858d4d5a8 100644
--- a/third_party/xla/xla/pjrt/c/CHANGELOG.md
+++ b/third_party/xla/xla/pjrt/c/CHANGELOG.md
@@ -1,5 +1,9 @@
 # PJRT C API changelog
 
+## 0.85
+
+* Add `PJRT_Device_PoisonExecution`.
+
 ## 0.84
 
 * Add `PJRT_Buffer_CopyRawToHostFuture`.
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index 7dd3139bd7658c..eaeb72ce4164c3 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -104,7 +104,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 84
+#define PJRT_API_MINOR 85
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -1416,6 +1416,27 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Device_MemoryStats_Args, peak_pool_bytes_is_set);
 // also return PJRT_Error_Code_UNIMPLEMENTED. Intended for diagnostic purposes.
 typedef PJRT_Error* PJRT_Device_MemoryStats(PJRT_Device_MemoryStats_Args* args);
 
+struct PJRT_Device_PoisonExecution_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+
+  PJRT_Device* device;
+  int32_t launch_id;
+
+  // Status fields.
+  PJRT_Error_Code error_code;
+  const char* error_message;
+  size_t error_message_size;
+
+  bool poisoned;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Device_PoisonExecution_Args, poisoned);
+
+// Poisons the earliest execution on this device with given launch_id if it's
+// not finished yet, i.e. makes its output buffers error.
+typedef PJRT_Error* PJRT_Device_PoisonExecution(
+    PJRT_Device_PoisonExecution_Args* args);
+
 //-------------------------------- Memory --------------------------------------
 
 struct PJRT_Memory_Id_Args {
@@ -2772,11 +2793,11 @@ typedef struct PJRT_Api {
   _PJRT_API_STRUCT_FIELD(PJRT_Client_CreateErrorBuffer);
   _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_TransferLiteral);
   _PJRT_API_STRUCT_FIELD(PJRT_Buffer_CopyRawToHostFuture);
+  _PJRT_API_STRUCT_FIELD(PJRT_Device_PoisonExecution);
 } PJRT_Api;
 
 enum {
-  PJRT_Api_STRUCT_SIZE =
-      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Buffer_CopyRawToHostFuture)
+  PJRT_Api_STRUCT_SIZE = PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Device_PoisonExecution)
 };
 
 #undef _PJRT_API_STRUCT_FIELD
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
index ce115dd3958adb..fa74c035be033a 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
@@ -953,6 +953,9 @@ FieldOffsetsAndSizesForVersion(int major_version, int minor_version) {
     if (minor_version >= 84) {
       add_field("PJRT_Buffer_CopyRawToHostFuture", kFnPtrSize);
     }
+    if (minor_version >= 85) {
+      add_field("PJRT_Device_PoisonExecution", kFnPtrSize);
+    }
     return version_offsets_and_sizes;
   }
   LOG(FATAL) << "Unsupported API version: " << major_version << "."
@@ -1355,6 +1358,9 @@ TEST_F(PjrtCAbiTestBase, FieldOffsetsAndSizes) {
           {"PJRT_Buffer_CopyRawToHostFuture",
            {offsetof(PJRT_Api, PJRT_Buffer_CopyRawToHostFuture),
             sizeof(PJRT_Api::PJRT_Buffer_CopyRawToHostFuture)}},
+          {"PJRT_Device_PoisonExecution",
+           {offsetof(PJRT_Api, PJRT_Device_PoisonExecution),
+            sizeof(PJRT_Api::PJRT_Device_PoisonExecution)}},
       };
   ASSERT_EQ(api_->pjrt_api_version.major_version, PJRT_API_MAJOR);
   ASSERT_EQ(api_->pjrt_api_version.minor_version, PJRT_API_MINOR);
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 801328d8288165..223f6b0148db4e 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -774,6 +774,21 @@ PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_TransferLiteral(
   return nullptr;
 }
 
+PJRT_Error* PJRT_Device_PoisonExecution(
+    PJRT_Device_PoisonExecution_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Device_PoisonExecution_Args",
+      PJRT_Device_PoisonExecution_Args_STRUCT_SIZE, args->struct_size));
+
+  absl::Status error = absl::Status(
+      pjrt::PjrtErrorCodeToStatusCode(args->error_code),
+      absl::string_view(args->error_message, args->error_message_size));
+
+  PJRT_ASSIGN_OR_RETURN(args->poisoned, args->device->device->PoisonExecution(
+                                            args->launch_id, error));
+  return nullptr;
+}
+
 PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer(
     PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer_Args* args) {
   PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
@@ -3227,6 +3242,7 @@ PJRT_Api CreatePjrtApi(PJRT_Client_Create* create_fn,
       pjrt::PJRT_AsyncHostToDeviceTransferManager_TransferLiteral,
       /*PJRT_Buffer_CopyRawToHostFuture=*/
       pjrt::PJRT_Buffer_CopyRawToHostFuture,
+      /*PJRT_Device_PoisonExecution=*/pjrt::PJRT_Device_PoisonExecution,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/c_api_client/BUILD b/third_party/xla/xla/pjrt/c_api_client/BUILD
index ee111d84b4c8bb..263de8a8c3b868 100644
--- a/third_party/xla/xla/pjrt/c_api_client/BUILD
+++ b/third_party/xla/xla/pjrt/c_api_client/BUILD
@@ -116,6 +116,7 @@ xla_cc_test(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:types",
+        "//xla:util",
         "//xla/backends/cpu:alignment",
         "//xla/ffi",
         "//xla/ffi:ffi_api",
@@ -134,9 +135,11 @@ xla_cc_test(
         "//xla/tests:literal_test_util",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
+        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 44841721f09a1f..0e727c473edbc6 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -1720,6 +1720,28 @@ absl::StatusOr<std::intptr_t> PjRtCApiDevice::GetStreamForExternalReadyEvents()
   return args.stream;
 }
 
+absl::StatusOr<bool> PjRtCApiDevice::PoisonExecution(int32_t launch_id,
+                                                     absl::Status error) {
+  if (client_->pjrt_c_api()->pjrt_api_version.major_version == 0 &&
+      client_->pjrt_c_api()->pjrt_api_version.minor_version < 85) {
+    return absl::UnimplementedError(
+        "PJRT_Device_PoisonExecution requires PJRT C API version 0.85 or "
+        "higher.");
+  }
+  const PJRT_Api* c_api = client_->pjrt_c_api();
+  PJRT_Device_PoisonExecution_Args args;
+  args.struct_size = PJRT_Device_PoisonExecution_Args_STRUCT_SIZE;
+  args.device = device_;
+  args.launch_id = launch_id;
+
+  args.error_code = pjrt::StatusCodeToPjrtErrorCode(error.code());
+  args.error_message = error.message().data();
+  args.error_message_size = error.message().size();
+
+  RETURN_STATUS_IF_PJRT_ERROR(c_api->PJRT_Device_PoisonExecution(&args), c_api);
+  return args.poisoned;
+}
+
 // ------------------------------- Memory --------------------------------------
 
 const PJRT_Api* PjRtCApiMemorySpace::pjrt_c_api() const {
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
index 24e781396cefbe..d61987f8b8c23c 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
@@ -179,6 +179,9 @@ class PjRtCApiDevice : public PjRtDevice {
     return nullptr;
   }
 
+  absl::StatusOr<bool> PoisonExecution(int32_t launch_id,
+                                       absl::Status error) override;
+
   PJRT_Device* c_device() const { return device_; }
 
   const PjRtCApiDeviceDescription& description() const override {
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
index e9ac7f309dc6f3..caf4bdc409b6f9 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
+#include "absl/status/status_matchers.h"
 #include "absl/strings/str_format.h"
 #include "absl/synchronization/blocking_counter.h"
 #include "absl/types/span.h"
@@ -58,11 +59,15 @@ limitations under the License.
 #include "xla/tests/literal_test_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/env.h"
+#include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/threadpool.h"
 #include "xla/types.h"
+#include "xla/util.h"
 
+using ::absl_testing::IsOkAndHolds;
+using ::absl_testing::StatusIs;
 using ::testing::ElementsAreArray;
 using ::testing::HasSubstr;
 
@@ -668,8 +673,64 @@ TEST(PjRtCApiClientTest, CopyRawToHostFuture) {
   result = buffer->CopyRawToHostFuture(error_dst_future, 0, size);
   error_dst_promise.Set(absl::InternalError("Future error"));
   absl::Status status = result.Await();
-  EXPECT_EQ(status.code(), absl::StatusCode::kInternal);
-  EXPECT_EQ(status.message(), "Future error");
+  EXPECT_THAT(status, StatusIs(absl::StatusCode::kInternal, "Future error"));
+}
+
+TEST(PjRtCApiClientTest, PoisonExecution) {
+  SetUpCpuPjRtApi();
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
+                       GetCApiClient("cpu"));
+
+  ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseAndReturnUnverifiedModule(R"(
+HloModule Identity
+ENTRY Identity() -> f32[2, 2] {
+    ROOT %result = f32[2, 2] parameter(0)
+})",
+                                                                       {}));
+  XlaComputation xla_computation(hlo_module->ToProto());
+  ASSERT_OK_AND_ASSIGN(auto pjrt_executable,
+                       client->CompileAndLoad(xla_computation, {}));
+
+  Shape shape = ShapeUtil::MakeShape(F32, {2, 2});
+  ASSERT_OK_AND_ASSIGN(auto transfer_manager,
+                       client->CreateBuffersForAsyncHostToDevice(
+                           {shape}, client->memory_spaces()[0]));
+  auto buffer = transfer_manager->RetrieveBuffer(0);
+
+  const int32_t kLaunchId = 123;
+  ExecuteOptions opts;
+  opts.launch_id = kLaunchId;
+  // PoisonExecution only works for asynchronous executions. Synchronous
+  // executions are executed inline and will not be poisonable.
+  opts.execution_mode = ExecuteOptions::ExecutionMode::kAsynchronous;
+
+  auto result =
+      pjrt_executable->Execute(/*argument_handles=*/{{buffer.get()}}, opts);
+  ASSERT_OK(result);
+
+  // Poisoning the execution should succeed because the execution has not
+  // started with the input buffer not defined yet.
+  auto poison_result = client->addressable_devices().front()->PoisonExecution(
+      kLaunchId, Internal("foobar1"));
+  ASSERT_THAT(poison_result, IsOkAndHolds(true));
+
+  // The buffer is expected to be poisoned with the error.
+  ASSERT_EQ(result->size(), 1);
+  ASSERT_EQ(result->at(0).size(), 1);
+  EXPECT_THAT(result->at(0).at(0)->ToLiteral().Await(),
+              StatusIs(tsl::error::INTERNAL, HasSubstr("foobar1")));
+
+  // A later error (propagated from the input buffer) would not affect the
+  // already poisoned output buffer.
+  transfer_manager->SetBufferError(0, Internal("foobar2"));
+
+  EXPECT_THAT(result->at(0).at(0)->ToLiteral().Await(),
+              StatusIs(tsl::error::INTERNAL, HasSubstr("foobar1")));
+
+  // Attempting to poison a non-existent execution should fail.
+  poison_result = client->addressable_devices().front()->PoisonExecution(
+      kLaunchId + 12, Internal("foobar3"));
+  EXPECT_THAT(poison_result, IsOkAndHolds(false));
 }
 
 }  // namespace

From be344bbfb6be645abd697a2a589e8e32988f6659 Mon Sep 17 00:00:00 2001
From: Karlo Basioli <basioli@google.com>
Date: Wed, 17 Dec 2025 12:55:55 -0800
Subject: [PATCH 460/753] Breaking internal models in g3.

Reverts 578a4f83310db800aafa19d466028f75f98286b0

PiperOrigin-RevId: 845882935
---
 .../xla/backends/gpu/codegen/triton/dot_algorithms_test.cc  | 6 +++++-
 .../gpu/codegen/triton/fusion_emitter_device_test.cc        | 4 +++-
 .../xla/xla/hlo/translate/hlo_to_mhlo/attribute_importer.cc | 6 ------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms_test.cc
index a03aaa8fcd81ef..bdd679412e0261 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/dot_algorithms_test.cc
@@ -2086,7 +2086,11 @@ INSTANTIATE_TEST_SUITE_P(
     PrecisionTests, PrecisionTests,
     Combine(Values(PC::ALG_DOT_TF32_TF32_F32, PC::ALG_DOT_TF32_TF32_F32_X3,
                    PC::ALG_DOT_BF16_BF16_F32, PC::ALG_DOT_BF16_BF16_F32_X3,
-                   PC::ALG_DOT_BF16_BF16_F32_X6, PC::ALG_DOT_BF16_BF16_F32_X9,
+                   PC::ALG_DOT_BF16_BF16_F32_X6,
+                   // TODO(basioli): re-enable this algorithm testing once the
+                   // attribute
+                   // importer supports the conversion.
+                   //  PC::ALG_DOT_BF16_BF16_F32_X9,
                    PC::ALG_DOT_F32_F32_F32),
             Values(Backend::kTriton, Backend::kBlas)),
     AlgorithmAndBackendTestParamToString);
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
index 39c8a8afe129ee..e5df20ac1480d7 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/fusion_emitter_device_test.cc
@@ -4361,7 +4361,9 @@ constexpr std::array kMultiDotAlgorithms = {
     PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3,
     PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6,
     PrecisionConfig::ALG_DOT_TF32_TF32_F32_X3,
-    PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9,
+    // TODO(basioli): re-enable this algorithm testing once the attribute
+    // importer supports the conversion.
+    // PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9,
 };
 
 TEST_P(MultiDotAlgorithmEmitterTest, MultiDotAlgorithmIsEmittedCorrectly) {
diff --git a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/attribute_importer.cc b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/attribute_importer.cc
index b784f882ac546a..8b68c112604104 100644
--- a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/attribute_importer.cc
+++ b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/attribute_importer.cc
@@ -199,12 +199,6 @@ mlir::stablehlo::DotAlgorithmAttr ConvertDotAlgorithm(
       numPrimitiveOperations = 6;
       break;
     }
-    case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9: {
-      lhs = rhs = builder->getBF16Type();
-      accum = builder->getF32Type();
-      numPrimitiveOperations = 9;
-      break;
-    }
     case PrecisionConfig::ALG_DOT_TF32_TF32_F32: {
       lhs = rhs = builder->getTF32Type();
       accum = builder->getF32Type();

From 9f69553083930e42838fbadcc7f27dcbc0e65357 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Wed, 17 Dec 2025 13:00:38 -0800
Subject: [PATCH 461/753] Run `shardy/google/integrate_latest.sh`.

This cl unblock the submission of cl/845112049.

PiperOrigin-RevId: 845884574
---
 .../xla/third_party/shardy/temporary.patch        | 15 ---------------
 third_party/xla/third_party/shardy/workspace.bzl  |  4 ++--
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 3f2ad26d310aa0..e69de29bb2d1d6 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,15 +0,0 @@
-diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index 69a8c63..f2c3289 100644
---- a/third_party/llvm/workspace.bzl
-+++ b/third_party/llvm/workspace.bzl
-@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
- 
- def repo(name):
-     """Imports LLVM."""
--    LLVM_COMMIT = "43bfec29cbecc1ff2e5aa6f8908c4d63e9c896c5"
--    LLVM_SHA256 = "d9c35a7c3764666abcf464955530154d528b2e5edeb97bfa8890f02cb52d1f30"
-+    LLVM_COMMIT = "8f264586d7521b0e305ca7bb78825aa3382ffef7"
-+    LLVM_SHA256 = "5784c4af94caba66bc8c460e07e222f751e4f4c9db9c45b3a68ff55379cf587d"
- 
-     tf_http_archive(
-         name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index 504e4d6b2c6ce2..971064ea06b0c9 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "940091203da82097e358114a6622d81b73693698"
-    SHARDY_SHA256 = "fa4cdeda270efd2faf3bd957d0a11c2dca6a36a9f071423dcbcbbb6cee43af0d"
+    SHARDY_COMMIT = "e74939f4948986b2b5fe0e04cefb0afc2300672b"
+    SHARDY_SHA256 = "04243cb1d585b5d43cf0d8bd8e611bc732090859a0ab1370bc93dcec0efe8e9e"
 
     tf_http_archive(
         name = "shardy",

From a8f27858b8af4ddf767059ee152f28820c30c855 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Wed, 17 Dec 2025 13:15:31 -0800
Subject: [PATCH 462/753] Add Shape to DynamicSliceThunk buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 845890906
---
 .../xla/xla/backends/gpu/codegen/custom.cc    | 53 +++++++--------
 .../xla/xla/backends/gpu/runtime/BUILD        |  1 +
 .../gpu/runtime/command_buffer_cmd.cc         | 15 +++--
 .../backends/gpu/runtime/command_buffer_cmd.h |  2 +-
 .../gpu/runtime/command_buffer_cmd_emitter.cc |  2 +-
 .../gpu/runtime/command_buffer_thunk_test.cc  |  6 +-
 .../gpu/runtime/dynamic_slice_thunk.cc        | 64 +++++++++----------
 .../gpu/runtime/dynamic_slice_thunk.h         | 10 +--
 .../gpu/runtime/dynamic_slice_thunk.proto     |  4 +-
 .../gpu/runtime/dynamic_slice_thunk_test.cc   | 34 +++++-----
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  2 +-
 11 files changed, 98 insertions(+), 95 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/custom.cc b/third_party/xla/xla/backends/gpu/codegen/custom.cc
index 6cbc17cc246fe0..6bd21df61eb9cd 100644
--- a/third_party/xla/xla/backends/gpu/codegen/custom.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/custom.cc
@@ -308,7 +308,7 @@ absl::Status CollectSliceInfo(
     std::vector<std::optional<std::vector<DynamicSliceThunk::Offset>>>& offsets,
     std::vector<std::optional<Shape>>& orig_shapes,
     std::vector<std::optional<Shape>>& sliced_shapes,
-    std::vector<std::optional<uint64_t>>& offset_byte_sizes,
+    std::vector<std::optional<PrimitiveType>>& offset_primitive_types,
     std::vector<std::unique_ptr<HloModule>>& extracted_offset_modules,
     unsigned arg_idx, bool can_compute_indvar_on_host,
     std::optional<const HloInstruction*> while_op,
@@ -381,8 +381,8 @@ absl::Status CollectSliceInfo(
   sliced_shapes[arg_idx] = DynCast<HloDynamicSliceInstruction>(arg_slice_instr)
                                ? arg_slice_instr->shape()
                                : arg_slice_instr->operand(1)->shape();
-  offset_byte_sizes[arg_idx] = ShapeUtil::ByteSizeOfPrimitiveType(
-      arg_slice_instr->index_operands().front()->shape().element_type());
+  offset_primitive_types[arg_idx] =
+      arg_slice_instr->index_operands().front()->shape().element_type();
 
   return absl::OkStatus();
 }
@@ -556,7 +556,8 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
       offset_buffer_indices(4, std::nullopt);
   std::vector<std::optional<Shape>> orig_shapes(4, std::nullopt);
   std::vector<std::optional<Shape>> sliced_shapes(4, std::nullopt);
-  std::vector<std::optional<uint64_t>> offset_byte_sizes(4, std::nullopt);
+  std::vector<std::optional<PrimitiveType>> offset_primitive_types(
+      4, std::nullopt);
 
   std::vector<HloInstruction*> slice_instrs(4, nullptr);
 
@@ -594,7 +595,7 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
                       /*shape_idx=*/{}, arg_idx));
   TF_RETURN_IF_ERROR(CollectSliceInfo(
       buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
-      offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+      offset_buffer_indices, orig_shapes, sliced_shapes, offset_primitive_types,
       extracted_offset_modules, arg_idx++, can_compute_indvar_on_host, while_op,
       indvar_idx, inlined_module));
 
@@ -605,7 +606,7 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
                       /*shape_idx=*/{}, arg_idx));
   TF_RETURN_IF_ERROR(CollectSliceInfo(
       buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
-      offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
+      offset_buffer_indices, orig_shapes, sliced_shapes, offset_primitive_types,
       extracted_offset_modules, arg_idx++, can_compute_indvar_on_host, while_op,
       indvar_idx, inlined_module));
 
@@ -624,9 +625,9 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
                                slice_instrs, /*shape_idx=*/{}, arg_idx));
     TF_RETURN_IF_ERROR(CollectSliceInfo(
         buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
-        offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
-        extracted_offset_modules, arg_idx, can_compute_indvar_on_host, while_op,
-        indvar_idx, inlined_module));
+        offset_buffer_indices, orig_shapes, sliced_shapes,
+        offset_primitive_types, extracted_offset_modules, arg_idx,
+        can_compute_indvar_on_host, while_op, indvar_idx, inlined_module));
   } else {
     TF_ASSIGN_OR_RETURN(
         output,
@@ -635,9 +636,9 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
                        arg_idx));
     TF_RETURN_IF_ERROR(CollectSliceInfo(
         buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
-        offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
-        extracted_offset_modules, arg_idx++, can_compute_indvar_on_host,
-        while_op, indvar_idx, inlined_module));
+        offset_buffer_indices, orig_shapes, sliced_shapes,
+        offset_primitive_types, extracted_offset_modules, arg_idx++,
+        can_compute_indvar_on_host, while_op, indvar_idx, inlined_module));
 
     // TODO(vuson): If we want to support slices of workspace, we'd need to
     // start `HloFindIf` with `get-tuple-element` with the right index.
@@ -646,9 +647,9 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
                                       /*index=*/{kGEMMWorkspaceBufferIndex}));
     TF_RETURN_IF_ERROR(CollectSliceInfo(
         buffer_assignment, fusion, absl::Span<HloInstruction*>(slice_instrs),
-        offset_buffer_indices, orig_shapes, sliced_shapes, offset_byte_sizes,
-        extracted_offset_modules, arg_idx, can_compute_indvar_on_host, while_op,
-        indvar_idx, inlined_module));
+        offset_buffer_indices, orig_shapes, sliced_shapes,
+        offset_primitive_types, extracted_offset_modules, arg_idx,
+        can_compute_indvar_on_host, while_op, indvar_idx, inlined_module));
     fake_allocations[arg_idx] = BufferAllocation(
         /*index=*/arg_idx, workspace->size(), /*color=*/0);
     slice_workspace_fake = BufferAllocation::Slice(&fake_allocations[arg_idx],
@@ -723,7 +724,7 @@ absl::StatusOr<FusionEmissionResult> EmitGemm(
         thunk_info, std::make_unique<ThunkSequence>(std::move(seq)),
         std::move(arguments), std::move(fake_allocations),
         std::move(offset_buffer_indices), std::move(orig_shapes),
-        std::move(sliced_shapes), std::move(offset_byte_sizes),
+        std::move(sliced_shapes), std::move(offset_primitive_types),
         std::move(offset_modules_metadata));
   } else {
     thunk = std::make_unique<GemmThunk>(thunk_info, std::move(config),
@@ -777,8 +778,8 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
       num_args, std::nullopt);
   std::vector<std::optional<Shape>> orig_shapes(num_args, std::nullopt);
   std::vector<std::optional<Shape>> sliced_shapes(num_args, std::nullopt);
-  std::vector<std::optional<uint64_t>> offset_byte_sizes(num_args,
-                                                         std::nullopt);
+  std::vector<std::optional<PrimitiveType>> offset_primitive_types(
+      num_args, std::nullopt);
 
   std::vector<HloInstruction*> slice_instrs(num_args, nullptr);
   std::vector<std::optional<BufferAllocation::Slice>> arguments;
@@ -830,7 +831,7 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
           TF_RETURN_IF_ERROR(CollectSliceInfo(
               buffer_assignment, fusion,
               absl::Span<HloInstruction*>(slice_instrs), offsets, orig_shapes,
-              sliced_shapes, offset_byte_sizes, extracted_offset_modules,
+              sliced_shapes, offset_primitive_types, extracted_offset_modules,
               arg_idx++, can_compute_indvar_on_host, while_op, indvar_idx,
               inlined_module));
 
@@ -858,7 +859,7 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
         TF_RETURN_IF_ERROR(CollectSliceInfo(
             buffer_assignment, fusion,
             absl::Span<HloInstruction*>(slice_instrs), offsets, orig_shapes,
-            sliced_shapes, offset_byte_sizes, extracted_offset_modules,
+            sliced_shapes, offset_primitive_types, extracted_offset_modules,
             arg_idx++, can_compute_indvar_on_host, while_op, indvar_idx,
             inlined_module));
 
@@ -1040,7 +1041,7 @@ absl::StatusOr<FusionEmissionResult> EmitCustomCall(
         thunk_info, std::make_unique<ThunkSequence>(std::move(seq)),
         std::move(arguments), std::move(fake_allocations), std::move(offsets),
         std::move(orig_shapes), std::move(sliced_shapes),
-        std::move(offset_byte_sizes), std::move(offset_modules_metadata));
+        std::move(offset_primitive_types), std::move(offset_modules_metadata));
   } else {
     TF_ASSIGN_OR_RETURN(
         thunk, found_ffi_handler
@@ -1073,7 +1074,7 @@ struct SliceDataForCollectives {
   std::vector<std::optional<std::vector<DynamicSliceThunk::Offset>>>
       offset_buffer_indices;
   std::vector<std::optional<Shape>> orig_shapes, sliced_shapes;
-  std::vector<std::optional<uint64_t>> offset_byte_sizes;
+  std::vector<std::optional<PrimitiveType>> offset_primitive_types;
   std::vector<std::unique_ptr<HloModule>> extracted_offset_modules;
   std::unique_ptr<HloModule> init_module, update_module;
   bool isDynamic, can_compute_indvar_on_host;
@@ -1085,7 +1086,7 @@ struct SliceDataForCollectives {
         offset_buffer_indices(num_args, std::nullopt),
         orig_shapes(num_args, std::nullopt),
         sliced_shapes(num_args, std::nullopt),
-        offset_byte_sizes(num_args, std::nullopt),
+        offset_primitive_types(num_args, std::nullopt),
         init_module(nullptr),
         update_module(nullptr),
         isDynamic(false),
@@ -1138,7 +1139,7 @@ CollectSliceArgumentMetadataForCollectives(
         buffer_assignment, fusion_instr,
         /*slice_instrs=*/absl::Span<HloInstruction*>(slice_data.slice_instrs),
         /*offsets=*/slice_data.offset_buffer_indices, slice_data.orig_shapes,
-        slice_data.sliced_shapes, slice_data.offset_byte_sizes,
+        slice_data.sliced_shapes, slice_data.offset_primitive_types,
         slice_data.extracted_offset_modules, arg_idx,
         slice_data.can_compute_indvar_on_host, while_op, indvar_idx,
         inlined_module));
@@ -1165,7 +1166,7 @@ CollectSliceArgumentMetadataForCollectives(
         buffer_assignment, fusion_instr,
         /*slice_instrs=*/absl::Span<HloInstruction*>(slice_data.slice_instrs),
         /*offsets=*/slice_data.offset_buffer_indices, slice_data.orig_shapes,
-        slice_data.sliced_shapes, slice_data.offset_byte_sizes,
+        slice_data.sliced_shapes, slice_data.offset_primitive_types,
         slice_data.extracted_offset_modules, arg_idx,
         slice_data.can_compute_indvar_on_host, while_op, indvar_idx,
         inlined_module));
@@ -1359,7 +1360,7 @@ absl::StatusOr<FusionEmissionResult> EmitCollective(
         std::move(slice_data.arguments), std::move(slice_data.fake_allocations),
         std::move(slice_data.offset_buffer_indices),
         std::move(slice_data.orig_shapes), std::move(slice_data.sliced_shapes),
-        std::move(slice_data.offset_byte_sizes),
+        std::move(slice_data.offset_primitive_types),
         std::move(offset_modules_metadata));
     result.thunks.push_back(std::move(thunk));
   } else {
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 69b1b5d0f589c1..f0aad60b990ab0 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -245,6 +245,7 @@ cc_library(
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:status_macros",
+        "//xla:xla_data_proto_cc",
         "//xla/hlo/evaluator:hlo_evaluator",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:buffer_use",
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
index a99e25ac16bf21..0ede07cb6a30bd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.cc
@@ -2578,7 +2578,7 @@ DynamicSliceFusionCmd::DynamicSliceFusionCmd(
     std::vector<std::optional<std::vector<DynamicSliceThunk::Offset>>> offsets,
     std::vector<std::optional<Shape>> orig_shapes,
     std::vector<std::optional<Shape>> sliced_shapes,
-    std::vector<std::optional<uint64_t>> offset_byte_sizes,
+    std::vector<std::optional<PrimitiveType>> offset_primitive_types,
     std::optional<
         const DynamicSliceThunk::OffsetAsFunctionOfIndvarModulesMetadata*>
         offset_as_function_of_indvar_metadata)
@@ -2588,15 +2588,15 @@ DynamicSliceFusionCmd::DynamicSliceFusionCmd(
       offset_as_function_of_indvar_metadata_(
           std::move(offset_as_function_of_indvar_metadata)) {
   // Zip all arguments together to create a list of SliceDef.
-  for (auto [arg, offset, orig_shape, sliced_shape, offset_byte_size] :
+  for (auto [arg, offset, orig_shape, sliced_shape, offset_primitive_type] :
        llvm::zip_equal(arguments, offsets, orig_shapes, sliced_shapes,
-                       offset_byte_sizes)) {
+                       offset_primitive_types)) {
     slices_.push_back(DynamicSliceThunk::SliceDef{
         std::move(arg),
         std::move(offset),
         std::move(orig_shape),
         std::move(sliced_shape),
-        std::move(offset_byte_size),
+        std::move(offset_primitive_type),
     });
   }
 
@@ -2657,7 +2657,7 @@ absl::Status DynamicSliceFusionCmd::Prepare(
       TF_RET_CHECK(slice.embedded_thunk_argument.has_value());
       TF_RET_CHECK(slice.orig_shape.has_value());
       TF_RET_CHECK(slice.sliced_shape.has_value());
-      TF_RET_CHECK(slice.offset_byte_size.has_value());
+      TF_RET_CHECK(slice.offset_primitive_type.has_value());
       TF_RET_CHECK(slice.orig_shape->IsArray());
       TF_RET_CHECK(slice.sliced_shape->IsArray());
       TF_RET_CHECK(slice.offsets->size() ==
@@ -2771,8 +2771,9 @@ absl::StatusOr<const se::CommandBuffer::Command*> DynamicSliceFusionCmd::Record(
 
         // Copy the `offset_idx`-th component of the offset for the
         // `argument_idx`-th argument from device to host.
-        TF_RETURN_IF_ERROR(
-            stream.Memcpy(offset_dst, offset_src, *slice.offset_byte_size));
+        TF_RETURN_IF_ERROR(stream.Memcpy(
+            offset_dst, offset_src,
+            ShapeUtil::ByteSizeOfPrimitiveType(*slice.offset_primitive_type)));
         ++num_transfers;
       }
     }
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
index f6bc947262cde5..0b666803dcf058 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd.h
@@ -1284,7 +1284,7 @@ class DynamicSliceFusionCmd : public CommandBufferCmd {
           offsets,
       std::vector<std::optional<Shape>> orig_shapes,
       std::vector<std::optional<Shape>> sliced_shapes,
-      std::vector<std::optional<uint64_t>> offset_byte_sizes,
+      std::vector<std::optional<PrimitiveType>> offset_primitive_types,
       std::optional<
           const DynamicSliceThunk::OffsetAsFunctionOfIndvarModulesMetadata*>
           offset_as_function_of_indvar_metadata = std::nullopt);
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc
index 8865fdf5853060..1f2d499e5d6906 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_cmd_emitter.cc
@@ -213,7 +213,7 @@ static absl::StatusOr<Command> Convert(
   return std::make_unique<DynamicSliceFusionCmd>(
       std::move(embedded_cmds), thunk.get_arguments(),
       std::move(fake_allocations), thunk.get_offsets(), thunk.get_orig_shapes(),
-      thunk.get_sliced_shapes(), thunk.get_offset_byte_sizes(),
+      thunk.get_sliced_shapes(), thunk.offset_primitive_types(),
       thunk.get_offset_function());
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
index 35e1ab249d4489..9a6928a711ce55 100644
--- a/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/command_buffer_thunk_test.cc
@@ -1013,13 +1013,13 @@ TEST(CommandBufferThunkTest, DISABLED_DynamicSliceFusionCmd) {
   std::vector<std::optional<Shape>> sliced_shapes = {
       ShapeUtil::MakeShape(PrimitiveType::F32, {2, 4}), std::nullopt,
       std::nullopt, std::nullopt};
-  std::vector<std::optional<uint64_t>> offset_byte_sizes = {
-      sizeof(int64_t), std::nullopt, std::nullopt, std::nullopt};
+  std::vector<std::optional<PrimitiveType>> offset_primitive_types = {
+      S64, std::nullopt, std::nullopt, std::nullopt};
 
   CommandBufferCmdSequence commands;
   commands.Emplace<DynamicSliceFusionCmd>(
       std::move(embed_executor), arguments, std::move(fake_allocations),
-      offsets, orig_shapes, sliced_shapes, offset_byte_sizes);
+      offsets, orig_shapes, sliced_shapes, offset_primitive_types);
   TF_ASSERT_OK_AND_ASSIGN(
       CommandBufferCmdExecutor executor,
       CommandBufferCmdExecutor::Create(std::move(commands), serialize));
diff --git a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc
index 23227934cd3775..1feb1410564764 100644
--- a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/literal.h"
 #include "xla/literal_util.h"
+#include "xla/primitive_util.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
@@ -56,6 +57,7 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
@@ -149,25 +151,24 @@ std::string DynamicSliceThunk::SliceDef::ToString() const {
     absl::StrAppend(&result, ", offsets:null");
   }
 
-  // orig_shape
   if (orig_shape.has_value()) {
     absl::StrAppend(&result, ", orig_shape:", orig_shape->ToString());
   } else {
     absl::StrAppend(&result, ", orig_shape:null");
   }
 
-  // sliced_shape
   if (sliced_shape.has_value()) {
     absl::StrAppend(&result, ", sliced_shape:", sliced_shape->ToString());
   } else {
     absl::StrAppend(&result, ", sliced_shape:null");
   }
 
-  // offset_byte_size
-  if (offset_byte_size.has_value()) {
-    absl::StrAppend(&result, ", offset_byte_size:", *offset_byte_size);
+  if (offset_primitive_type.has_value()) {
+    absl::StrAppend(
+        &result, ", offset_primitive_type:",
+        primitive_util::LowercasePrimitiveTypeName(*offset_primitive_type));
   } else {
-    absl::StrAppend(&result, ", offset_byte_size:null");
+    absl::StrAppend(&result, ", offset_primitive_type:null");
   }
 
   absl::StrAppend(&result, "}");
@@ -181,7 +182,7 @@ DynamicSliceThunk::DynamicSliceThunk(
     std::vector<std::optional<std::vector<Offset>>> offsets,
     std::vector<std::optional<Shape>> orig_shapes,
     std::vector<std::optional<Shape>> sliced_shapes,
-    std::vector<std::optional<uint64_t>> offset_byte_sizes,
+    std::vector<std::optional<PrimitiveType>> offset_primitive_types,
     std::optional<OffsetAsFunctionOfIndvarModulesMetadata>
         offset_as_function_of_indvar_metadata)
     : Thunk(Kind::kDynamicSlice, thunk_info),
@@ -192,19 +193,19 @@ DynamicSliceThunk::DynamicSliceThunk(
       offsets_(offsets),
       orig_shapes_(orig_shapes),
       sliced_shapes_(sliced_shapes),
-      offset_byte_sizes_(offset_byte_sizes),
+      offset_primitive_types_(offset_primitive_types),
       offset_as_function_of_indvar_metadata_(
           std::move(offset_as_function_of_indvar_metadata)) {
   // Zip all arguments together to create a list of SliceDef.
-  for (auto [arg, offsets, orig_shape, sliced_shape, offset_byte_size] :
+  for (auto [arg, offsets, orig_shape, sliced_shape, offset_primitive_type] :
        llvm::zip_equal(arguments, offsets, orig_shapes, sliced_shapes,
-                       offset_byte_sizes)) {
+                       offset_primitive_types)) {
     slices_.push_back(SliceDef{
         std::move(arg),
         std::move(offsets),
         std::move(orig_shape),
         std::move(sliced_shape),
-        std::move(offset_byte_size),
+        std::move(offset_primitive_type),
     });
   }
 
@@ -226,7 +227,7 @@ absl::Status DynamicSliceThunk::Prepare(const PrepareParams& params) {
       TF_RET_CHECK(slice.embedded_thunk_argument.has_value());
       TF_RET_CHECK(slice.orig_shape.has_value());
       TF_RET_CHECK(slice.sliced_shape.has_value());
-      TF_RET_CHECK(slice.offset_byte_size.has_value());
+      TF_RET_CHECK(slice.offset_primitive_type.has_value());
 
       TF_RET_CHECK(slice.orig_shape->IsArray());
       TF_RET_CHECK(slice.sliced_shape->IsArray());
@@ -358,8 +359,9 @@ absl::Status DynamicSliceThunk::ExecuteOnStream(const ExecuteParams& params) {
 
         // Copy the `offset_idx`-th component of the offset for the
         // `argument_idx`-th argument from device to host.
-        TF_RETURN_IF_ERROR(
-            stream.Memcpy(offset_dst, offset_src, *slice.offset_byte_size));
+        TF_RETURN_IF_ERROR(stream.Memcpy(
+            offset_dst, offset_src,
+            ShapeUtil::ByteSizeOfPrimitiveType(*slice.offset_primitive_type)));
         ++num_transfers;
       }
     }
@@ -473,7 +475,9 @@ Thunk::BufferUses DynamicSliceThunk::buffer_uses() const {
       if (!alloc_slice) {
         continue;
       }
-      res.push_back(BufferUse::Read(*alloc_slice));
+      res.push_back(BufferUse::Read(
+          *alloc_slice,
+          ShapeUtil::MakeShape(*slice.offset_primitive_type, {})));
     }
   }
   return res;
@@ -630,10 +634,11 @@ absl::StatusOr<ThunkProto> DynamicSliceThunk::ToProto() const {
   }
 
   // offset_byte_sizes
-  for (const auto& size : offset_byte_sizes_) {
-    auto& proto_size = *dynamic_slice_proto->add_offset_byte_sizes();
-    if (size.has_value()) {
-      proto_size.set_value(size.value());
+  for (const std::optional<PrimitiveType>& primtive_type :
+       offset_primitive_types_) {
+    auto& proto_size = *dynamic_slice_proto->add_offset_primitive_types();
+    if (primtive_type.has_value()) {
+      proto_size.set_value(primtive_type.value());
     }
   }
 
@@ -667,7 +672,6 @@ absl::StatusOr<std::unique_ptr<DynamicSliceThunk>> DynamicSliceThunk::FromProto(
             proto.offset_as_function_of_indvar_modules_metadata()));
   }
 
-  // arguments
   std::vector<std::optional<BufferAllocation::Slice>> arguments;
   for (auto& arg_proto : proto.arguments()) {
     arguments.push_back(std::nullopt);
@@ -678,13 +682,11 @@ absl::StatusOr<std::unique_ptr<DynamicSliceThunk>> DynamicSliceThunk::FromProto(
     }
   }
 
-  // offsets
   TF_ASSIGN_OR_RETURN(
       std::vector<std::optional<std::vector<Offset>>> offsets,
       DeserializeOffsetsFromProto(proto, buffer_allocations,
                                   offset_as_function_of_indvar_metadata));
 
-  // orig_shapes
   std::vector<std::optional<Shape>> orig_shapes;
   for (auto& shape_proto : proto.orig_shapes()) {
     orig_shapes.push_back(std::nullopt);
@@ -694,7 +696,6 @@ absl::StatusOr<std::unique_ptr<DynamicSliceThunk>> DynamicSliceThunk::FromProto(
     }
   }
 
-  // sliced_shapes
   std::vector<std::optional<Shape>> sliced_shapes;
   for (auto& shape_proto : proto.sliced_shapes()) {
     sliced_shapes.push_back(std::nullopt);
@@ -704,23 +705,22 @@ absl::StatusOr<std::unique_ptr<DynamicSliceThunk>> DynamicSliceThunk::FromProto(
     }
   }
 
-  // offset_byte_sizes
-  std::vector<std::optional<uint64_t>> offset_byte_sizes;
-  for (auto& size_proto : proto.offset_byte_sizes()) {
-    offset_byte_sizes.push_back(std::nullopt);
-    if (size_proto.has_value()) {
-      offset_byte_sizes.back() = size_proto.value();
+  std::vector<std::optional<PrimitiveType>> offset_primtitive_types;
+  offset_primtitive_types.reserve(proto.offset_primitive_types_size());
+  for (const OptionalPrimitiveType& type_proto :
+       proto.offset_primitive_types()) {
+    offset_primtitive_types.push_back(std::nullopt);
+    if (type_proto.has_value()) {
+      offset_primtitive_types.back() = type_proto.value();
     }
   }
 
-  // fake_allocations
   std::vector<BufferAllocation> fake_allocations;
   for (const auto& fake_allocation_proto : proto.fake_allocations()) {
     fake_allocations.push_back(
         BufferAllocation::FromProto(fake_allocation_proto));
   }
 
-  // embedded_thunk
   std::vector<std::unique_ptr<Thunk>> embedded_thunks;
   for (const auto& thunk_proto : proto.embedded_thunk().thunks()) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Thunk> embedded_thunk,
@@ -732,7 +732,7 @@ absl::StatusOr<std::unique_ptr<DynamicSliceThunk>> DynamicSliceThunk::FromProto(
       thunk_info, std::make_unique<ThunkSequence>(std::move(embedded_thunks)),
       std::move(arguments), std::move(fake_allocations), std::move(offsets),
       std::move(orig_shapes), std::move(sliced_shapes),
-      std::move(offset_byte_sizes),
+      std::move(offset_primtitive_types),
       std::move(offset_as_function_of_indvar_metadata));
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.h b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.h
index 68d724566b4d85..10a620158e9654 100644
--- a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.h
@@ -118,7 +118,7 @@ class DynamicSliceThunk : public Thunk {
       std::vector<std::optional<std::vector<Offset>>> offsets,
       std::vector<std::optional<Shape>> orig_shapes,
       std::vector<std::optional<Shape>> sliced_shapes,
-      std::vector<std::optional<uint64_t>> offset_byte_sizes,
+      std::vector<std::optional<PrimitiveType>> offset_primitive_types,
       std::optional<OffsetAsFunctionOfIndvarModulesMetadata>
           offset_as_function_of_indvar_metadata = std::nullopt);
   DynamicSliceThunk(const DynamicSliceThunk&) = delete;
@@ -137,7 +137,7 @@ class DynamicSliceThunk : public Thunk {
     std::optional<std::vector<Offset>> offsets;
     std::optional<Shape> orig_shape;
     std::optional<Shape> sliced_shape;
-    std::optional<uint64_t> offset_byte_size;
+    std::optional<PrimitiveType> offset_primitive_type;
     std::string ToString() const;
   };
 
@@ -165,8 +165,8 @@ class DynamicSliceThunk : public Thunk {
     return sliced_shapes_;
   }
 
-  std::vector<std::optional<uint64_t>> get_offset_byte_sizes() const {
-    return offset_byte_sizes_;
+  std::vector<std::optional<PrimitiveType>> offset_primitive_types() const {
+    return offset_primitive_types_;
   }
 
   void ForAllThunks(absl::FunctionRef<void(const Thunk*)> fn) const override;
@@ -204,7 +204,7 @@ class DynamicSliceThunk : public Thunk {
   std::vector<std::optional<std::vector<Offset>>> offsets_;
   std::vector<std::optional<Shape>> orig_shapes_;
   std::vector<std::optional<Shape>> sliced_shapes_;
-  std::vector<std::optional<uint64_t>> offset_byte_sizes_;
+  std::vector<std::optional<PrimitiveType>> offset_primitive_types_;
 
   std::vector<SliceDef> slices_;
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.proto b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.proto
index 6fc504f10ca83e..98616dd2ca48ac 100644
--- a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk.proto
@@ -33,8 +33,8 @@ message OptionalShapeProto {
   optional xla.ShapeProto shape = 1;
 }
 
-message OptionalInt64Proto {
-  optional int64 value = 1;
+message OptionalPrimitiveType {
+  optional PrimitiveType value = 1;
 }
 
 // Reflects std::optional<std::vector<Offset>>
diff --git a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc
index 9668eda443f0b3..99790239ebee45 100644
--- a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc
@@ -238,8 +238,8 @@ absl::StatusOr<std::unique_ptr<DynamicSliceThunk>> CreateSlicedGemmThunk(
       std::vector<std::optional<Shape>>{
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
           std::nullopt, std::nullopt},
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), std::nullopt,
-                                           std::nullopt, std::nullopt});
+      std::vector<std::optional<PrimitiveType>>{S64, std::nullopt, std::nullopt,
+                                                std::nullopt});
 }
 
 TEST_F(DynamicSliceThunkTest, SlicedGemmProtoRoundTrip) {
@@ -411,8 +411,8 @@ CreateMultipleSlicedOperandsGemmThunk(
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}),
           ShapeUtil::MakeShape(PrimitiveType::F32, {3, 1}), std::nullopt,
           std::nullopt},
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), sizeof(int64_t),
-                                           std::nullopt, std::nullopt});
+      std::vector<std::optional<PrimitiveType>>{S64, S64, std::nullopt,
+                                                std::nullopt});
 }
 
 TEST_F(DynamicSliceThunkTest, MultipleSlicedOperandsGemmProtoRoundTrip) {
@@ -601,7 +601,7 @@ TEST_F(DynamicSliceThunkTest, SlicedMemcpy) {
       // Make sure to pass a dst shape with the same rank as src shape (i.e.
       // original slice result and not bitcasted one)
       {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 8, 8}), std::nullopt},
-      {sizeof(int64_t), std::nullopt});
+      {S64, std::nullopt});
 
   // Step 2:
   // Execute dynamic slice thunk.
@@ -767,7 +767,7 @@ TEST_F(DynamicSliceThunkTest, SlicedOutputMemcpy) {
       // original slice result and not bitcasted one)
       {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2}),
        ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})},
-      {sizeof(int64_t), sizeof(int64_t)});
+      {S64, S64});
 
   // Step 2:
   // Execute dynamic slice thunk.
@@ -945,8 +945,8 @@ CreateSlicedGemmArbitraryArgumentOrderThunk(
       std::vector<std::optional<Shape>>{
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
           std::nullopt, std::nullopt},
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), std::nullopt,
-                                           std::nullopt, std::nullopt});
+      std::vector<std::optional<PrimitiveType>>{S64, std::nullopt, std::nullopt,
+                                                std::nullopt});
 }
 
 TEST_F(DynamicSliceThunkTest, SlicedGemmArbitraryArgumentOrderProtoRoundTrip) {
@@ -1118,8 +1118,8 @@ CreateSlicedGemmArbitraryNumberOfArgumentsThunk(
       std::vector<std::optional<Shape>>{
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
           std::nullopt, std::nullopt},
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), std::nullopt,
-                                           std::nullopt, std::nullopt});
+      std::vector<std::optional<PrimitiveType>>{S64, std::nullopt, std::nullopt,
+                                                std::nullopt});
 }
 
 TEST_F(DynamicSliceThunkTest,
@@ -1282,8 +1282,8 @@ CreateSlicedTupledOperandGemmThunk(
       std::vector<std::optional<Shape>>{
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
           std::nullopt, std::nullopt},
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), std::nullopt,
-                                           std::nullopt, std::nullopt});
+      std::vector<std::optional<PrimitiveType>>{S64, std::nullopt, std::nullopt,
+                                                std::nullopt});
 }
 
 TEST_F(DynamicSliceThunkTest, SlicedTupledOperandGemmProtoRoundTrip) {
@@ -1475,7 +1475,7 @@ TEST_F(DynamicSliceThunkTest, SlicedMemcpyOOB) {
       // original slice result and not bitcasted one)
       {ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2}),
        ShapeUtil::MakeShape(PrimitiveType::S32, {1, 1, 2, 2})},
-      {sizeof(int64_t), sizeof(int64_t)});
+      {S64, S64});
 
   // Step 2:
   // Execute dynamic slice thunk.
@@ -1658,8 +1658,8 @@ CreateSlicedOperandsSameBufferGemmThunk(
       std::vector<std::optional<Shape>>{
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 3}), std::nullopt,
           std::nullopt, std::nullopt},
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), std::nullopt,
-                                           std::nullopt, std::nullopt});
+      std::vector<std::optional<PrimitiveType>>{S64, std::nullopt, std::nullopt,
+                                                std::nullopt});
 }
 
 TEST_F(DynamicSliceThunkTest, SlicedOperandsSameBufferGemmProtoRoundTrip) {
@@ -1876,8 +1876,8 @@ CreateHostInductionVariableAndOffsetEvaluationThunk(
           ShapeUtil::MakeShape(PrimitiveType::F32, {1, 4}), std::nullopt,
           std::nullopt, std::nullopt},
       /*offset_byte_sizes=*/
-      std::vector<std::optional<uint64_t>>{sizeof(int64_t), std::nullopt,
-                                           std::nullopt, std::nullopt},
+      std::vector<std::optional<PrimitiveType>>{S64, std::nullopt, std::nullopt,
+                                                std::nullopt},
       /*offset_as_function_of_indvar_metadata=*/
       std::move(offset_as_function_of_indvar_modules_metadata));
 }
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 6d464b253e4409..c60e3ea5c94e36 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -261,7 +261,7 @@ message DynamicSliceThunkProto {
   repeated OptionalDynamicSliceOffsetsProto offsets = 3;
   repeated OptionalShapeProto orig_shapes = 4;
   repeated OptionalShapeProto sliced_shapes = 5;
-  repeated OptionalInt64Proto offset_byte_sizes = 6;
+  repeated OptionalPrimitiveType offset_primitive_types = 6;
   optional OffsetAsFunctionOfIndvarModulesMetadataProto
       offset_as_function_of_indvar_modules_metadata = 7;
   repeated BufferAllocationProto fake_allocations = 8;

From 26ebc05f4f9498c99400a6c8e88378ef689255d4 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Wed, 17 Dec 2025 13:35:18 -0800
Subject: [PATCH 463/753] Implement CommonPjRtLoadedExecutable::Execute,
 CommonPjRtLoadedExecutable::ExecutePortable and
 CommonPjRtLoadedExecutable::ExecuteSharded.

PiperOrigin-RevId: 845898855
---
 .../xla/xla/pjrt/common_pjrt_client.cc        | 258 ++++++++++++++++++
 third_party/xla/xla/pjrt/common_pjrt_client.h |  70 ++++-
 2 files changed, 322 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.cc b/third_party/xla/xla/pjrt/common_pjrt_client.cc
index 86833819c6c356..8c187f326821f1 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
 #include "absl/functional/any_invocable.h"
@@ -867,6 +868,263 @@ PjRtLoadedExecutable::Result CommonPjRtLoadedExecutable::ExecuteLaunch(
            launch_args.is_predetermined_error)});
 }
 
+absl::Status CommonPjRtLoadedExecutable::ExecutePrepareWithOomRetries(
+    std::optional<ExecuteLaunchArgs>& launch_args,
+    absl::Span<PjRtBuffer* const> argument_handles, int replica, int partition,
+    const ExecuteOptions& options, size_t host_callback_idx,
+    PjRtDevice* device) const {
+  absl::Status prepare_status;
+  int attempts = 0;
+  while (true) {
+    launch_args.emplace();
+    prepare_status =
+        ExecutePrepare(*launch_args, argument_handles, replica, partition,
+                       options, host_callback_idx, device);
+    ++attempts;
+    if (!absl::IsResourceExhausted(prepare_status)) {
+      break;
+    }
+    if (!ShouldRetryOnOom(attempts, launch_args->device, prepare_status)) {
+      break;
+    }
+  }
+  return prepare_status;
+}
+
+static absl::Status ValidateHostTransferCallbacks(
+    absl::Span<const std::vector<SendCallback>> send_callbacks,
+    absl::Span<const std::vector<RecvCallback>> recv_callbacks,
+    size_t num_devices) {
+  if (!send_callbacks.empty() && send_callbacks.size() != num_devices) {
+    return InvalidArgument(
+        "The number of send callback vectors does not match the number of "
+        "devices");
+  }
+  if (!recv_callbacks.empty() && recv_callbacks.size() != num_devices) {
+    return InvalidArgument(
+        "The number of recv callback vectors does not match the number of "
+        "devices");
+  }
+  return absl::OkStatus();
+}
+
+absl::StatusOr<PjRtLoadedExecutable::Result>
+CommonPjRtLoadedExecutable::ExecuteHelperOnSingleDevice(
+    absl::Span<PjRtBuffer* const> argument_handles, int replica, int partition,
+    const ExecuteOptions& options, bool fill_future, PjRtDevice* device) const {
+  tsl::profiler::TraceMe traceme(
+      "CommonPjRtLoadedExecutable::ExecuteHelperOnSingleDevice");
+  std::optional<ExecuteLaunchArgs> launch_args;
+  TF_RETURN_IF_ERROR(ExecutePrepareWithOomRetries(
+      launch_args, argument_handles, replica, partition, options,
+      /*host_callback_idx=*/0, device));
+  return ExecuteLaunch(*launch_args, fill_future);
+}
+
+absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>
+CommonPjRtLoadedExecutable::ExecuteSharded(
+    absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
+    const ExecuteOptions& options,
+    std::optional<tsl::Future<void>>& returned_future, bool fill_future) const {
+  tsl::profiler::TraceMe traceme("CommonPjRtLoadedExecutable::ExecuteSharded");
+  for (int i = 0; i < addressable_devices_.size(); ++i) {
+    if (addressable_devices_[i] == device) {
+      TF_RETURN_IF_ERROR(ValidateHostTransferCallbacks(
+          options.send_callbacks, options.recv_callbacks, /*num_devices=*/1));
+      TF_ASSIGN_OR_RETURN(
+          auto result,
+          ExecuteHelperOnSingleDevice(
+              argument_handles, addressable_device_logical_ids_[i].replica,
+              addressable_device_logical_ids_[i].partition, options,
+              fill_future));
+      returned_future = std::move(result.future);
+      return std::move(result.buffers);
+    }
+  }
+  return InvalidArgument(
+      "ExecuteShard attempted to execute on device id %d which is not "
+      "addressable by this client",
+      device->global_device_id().value());
+}
+
+absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>>
+CommonPjRtLoadedExecutable::ExecutePortable(
+    absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
+    const ExecuteOptions& options,
+    std::optional<tsl::Future<void>>& returned_future, bool fill_future) const {
+  tsl::profiler::TraceMe traceme("CommonPjRtLoadedExecutable::ExecutePortable");
+  if (num_replicas() != 1 || num_partitions() != 1) {
+    return InvalidArgument(
+        "ExecutePortable expects a single-core executable but gets "
+        "one with %d replica %d partition",
+        num_replicas(), num_partitions());
+  }
+  if (device == nullptr) {
+    return InvalidArgument("ExecutePortable expects a device to be specified");
+  }
+
+  TF_RETURN_IF_ERROR(ValidateHostTransferCallbacks(
+      options.send_callbacks, options.recv_callbacks, /*num_devices=*/1));
+  VLOG(1) << "ExecutePortable executes single-core portable executable "
+          << name();
+  TF_ASSIGN_OR_RETURN(
+      auto result, ExecuteHelperOnSingleDevice(argument_handles, /*replica=*/0,
+                                               /*partition=*/0, options,
+                                               fill_future, device));
+  returned_future = std::move(result.future);
+  return std::move(result.buffers);
+}
+
+absl::StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>>
+CommonPjRtLoadedExecutable::Execute(
+    absl::Span<const std::vector<PjRtBuffer*>> argument_handles,
+    const ExecuteOptions& options,
+    std::optional<std::vector<tsl::Future<void>>>& returned_futures) const {
+  tsl::profiler::TraceMe traceme("CommonPjRtLoadedExecutable::Execute");
+  VLOG(1) << "CommonPjRtLoadedExecutable::Execute";
+  if (!client()->allows_recursion() && ThisThreadIsInsideHostCallback()) {
+    // Because TPU is single threaded, and the host callback currently blocking
+    // the TPU, we should not initiate any outstanding computations because that
+    // risks deadlocking the TPU.
+    return InvalidArgument("Execute() called from inside host callback.");
+  }
+
+  tsl::profiler::TraceMeProducer producer("CommonPjRtLoadedExecutable::Execute",
+                                          tsl::profiler::ContextType::kPjRt);
+
+  const int num_addressable_devices = addressable_devices_.size();
+
+  if (argument_handles.size() != num_addressable_devices) {
+    return InvalidArgument(
+        "Attempted to execute with %d argument lists when local device "
+        "count is %d (total replica count: %d, partition count: %d)",
+        argument_handles.size(), num_addressable_devices, num_replicas(),
+        num_partitions());
+  }
+
+  VLOG(1) << "Executing computation " << name()
+          << "; num_replicas=" << num_replicas()
+          << " num_partitions=" << num_partitions()
+          << " num_addressable_devices=" << num_addressable_devices;
+
+  TF_RETURN_IF_ERROR(ValidateHostTransferCallbacks(
+      options.send_callbacks, options.recv_callbacks,
+      addressable_devices_.size()));
+
+  std::vector<absl::StatusOr<Result>> results(num_addressable_devices);
+  if (num_addressable_devices == 1) {
+    // Fast-path if there is only one device — run the computation on the
+    // current thread.
+    const int replica = addressable_device_logical_ids_[0].replica;
+    const int partition = addressable_device_logical_ids_[0].partition;
+    results[0] =
+        ExecuteHelperOnSingleDevice(argument_handles[0], replica, partition,
+                                    options, returned_futures.has_value());
+  } else {
+    absl::Mutex mu;
+    int preparing = num_addressable_devices;
+    int launching = num_addressable_devices;
+    int failed = 0;
+    absl::Status first_failure_status;
+
+    {
+      // The gang_schedule mutex ensures that all calls to Schedule() happen
+      // atomically and cannot interleave with calls to Execute on other
+      // threads. If calls to Schedule are not atomic, then the threads can get
+      // stuck waiting for done_preparing to become true.
+      absl::MutexLock gang_schedule(client()->gang_scheduler());
+      auto context_id = producer.GetContextId();
+      for (int i = 0; i < num_addressable_devices; ++i) {
+        const int replica = addressable_device_logical_ids_[i].replica;
+        const int partition = addressable_device_logical_ids_[i].partition;
+        PjRtDevice* device = addressable_devices_[i];
+        LaunchOnDevice(device, [&, replica, partition, i, context_id] {
+          tsl::profiler::TraceMeConsumer consumer(
+              "Scheduled CommonPjRtLoadedExecutable::Execute",
+              tsl::profiler::ContextType::kPjRt, context_id);
+
+          // Two phase launch. Phase 1: Prepare on all cores. Abort
+          // launch on prepare failure.
+          std::optional<ExecuteLaunchArgs> launch_args;
+          absl::Status launch_status = ExecutePrepareWithOomRetries(
+              launch_args, argument_handles[i], replica, partition, options,
+              /*host_callback_idx=*/i);
+          // Wait for prepare to finish on all cores.
+          {
+            absl::MutexLock lock(mu);
+            preparing--;
+            auto done_preparing = [&]() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu) {
+              return preparing == 0;
+            };
+            mu.Await(absl::Condition(&done_preparing));
+            if (!launch_status.ok()) {
+              if (failed == 0) {
+                first_failure_status = launch_status;
+              }
+              failed++;
+            }
+            if (failed > 0) {
+              // Poison results for all cores.
+              results[i] = first_failure_status;
+              // Abort phase 2 if Prepare fails for any core.
+              --launching;
+              return;
+            }
+          }
+
+          // Phase 2: Launch. It cannot fail.
+          results[i] =
+              ExecuteLaunch(*launch_args, returned_futures.has_value());
+
+          absl::MutexLock lock(mu);
+          --launching;
+        });
+      }
+    }
+
+    // Wait until we either fail Phase 1 or completes two phases.
+    auto done = [&]() ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu) {
+      return launching == 0;
+    };
+    absl::MutexLock lock(mu);
+    mu.Await(absl::Condition(&done));
+  }
+  VLOG(3) << "Replicated execution complete.";
+
+  std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> wrapped_results(
+      num_addressable_devices);
+  if (returned_futures.has_value()) {
+    returned_futures->reserve(num_addressable_devices);
+  }
+  for (int i = 0; i < num_addressable_devices; ++i) {
+    const int replica = addressable_device_logical_ids_[i].replica;
+    const int partition = addressable_device_logical_ids_[i].partition;
+    auto& statusor = results[i];
+    if (!statusor.ok()) {
+      if (absl::IsResourceExhausted(statusor.status())) {
+        client()->CallOomHandlers();
+      }
+      if (returned_futures.has_value()) {
+        returned_futures->clear();
+      }
+      if (num_addressable_devices == 1) {
+        return statusor.status();
+      }
+      return AppendStatus(
+          statusor.status(),
+          absl::StrFormat("while running replica %d and partition %d of a "
+                          "replicated computation (other "
+                          "replicas may have failed as well).",
+                          replica, partition));
+    }
+    wrapped_results[i] = std::move(statusor->buffers);
+    if (returned_futures.has_value()) {
+      returned_futures->push_back(*std::move(statusor->future));
+    }
+  }
+  return wrapped_results;
+}
+
 absl::StatusOr<std::unique_ptr<PjRtBuffer>>
 CommonPjRtBufferImpl::CopyToCpuMemorySpace(const xla::Shape& dst_shape,
                                            PjRtMemorySpace* dst_memory_space) {
diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.h b/third_party/xla/xla/pjrt/common_pjrt_client.h
index 839403ce4e7a67..27084fbc94e1b6 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.h
@@ -62,6 +62,9 @@ class CommonPjRtClient : public PjRtClient {
   // callbacks. Those clients should return false here.
   virtual bool allows_recursion() const { return true; }
 
+  // Backend specific handlers for when an oom is detected during execute.
+  virtual void CallOomHandlers() const {}
+
   // Computes the memory requirements for storing shape on memory_space.
   // TODO(parkers): make pure virtual and update all clients.
   virtual absl::StatusOr<int64_t> GetOnDeviceBytesCount(
@@ -266,6 +269,11 @@ class CommonPjRtClient : public PjRtClient {
       absl::InlinedVector<tsl::RCReference<CommonPjRtRawBuffer>, 4>
           output_leaf_buffers,
       bool is_predetermined_error);
+
+  absl::Mutex& gang_scheduler() const { return gang_scheduler_mu_; }
+
+ private:
+  mutable absl::Mutex gang_scheduler_mu_;
 };
 
 // Represents the launch state for a loaded executable. This state must be
@@ -293,15 +301,17 @@ class PjRtRawLoadedExecutable {
 
 class CommonPjRtLoadedExecutable : public PjRtLoadedExecutable {
  public:
-  CommonPjRtLoadedExecutable(CommonPjRtClient* client,
-                             std::vector<Shape> parameter_device_shapes,
-                             Shape output_device_shape,
-                             std::vector<int> output_memory_space_kind_ids,
-                             std::vector<PjRtDevice*> addressable_devices)
+  CommonPjRtLoadedExecutable(
+      CommonPjRtClient* client, std::vector<Shape> parameter_device_shapes,
+      Shape output_device_shape, std::vector<int> output_memory_space_kind_ids,
+      std::vector<PjRtDevice*> addressable_devices,
+      std::vector<LogicalDeviceIds> addressable_device_logical_ids)
       : parameter_device_shapes_(std::move(parameter_device_shapes)),
         output_device_shape_(std::move(output_device_shape)),
         output_memory_space_kind_ids_(std::move(output_memory_space_kind_ids)),
-        addressable_devices_(std::move(addressable_devices)) {}
+        addressable_devices_(std::move(addressable_devices)),
+        addressable_device_logical_ids_(
+            std::move(addressable_device_logical_ids)) {}
 
   CommonPjRtClient* client() const override = 0;
 
@@ -309,6 +319,27 @@ class CommonPjRtLoadedExecutable : public PjRtLoadedExecutable {
     return addressable_devices_;
   }
 
+  using PjRtLoadedExecutable::Execute;
+  absl::StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> Execute(
+      absl::Span<const std::vector<PjRtBuffer*>> argument_handles,
+      const ExecuteOptions& options,
+      std::optional<std::vector<tsl::Future<void>>>& returned_futures)
+      const override;
+
+  using PjRtLoadedExecutable::ExecuteSharded;
+  absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecuteSharded(
+      absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
+      const ExecuteOptions& options,
+      std::optional<tsl::Future<void>>& returned_future,
+      bool fill_future) const override;
+
+  using PjRtLoadedExecutable::ExecutePortable;
+  absl::StatusOr<std::vector<std::unique_ptr<PjRtBuffer>>> ExecutePortable(
+      absl::Span<PjRtBuffer* const> argument_handles, PjRtDevice* device,
+      const ExecuteOptions& options,
+      std::optional<tsl::Future<void>>& returned_future,
+      bool fill_future) const override;
+
  protected:
   // Execute is split into Prepare and Launch.
   // Prepare can fail and be retried, while Launch is guaranteed to succeed.
@@ -350,6 +381,26 @@ class CommonPjRtLoadedExecutable : public PjRtLoadedExecutable {
                               size_t host_callback_idx,
                               PjRtDevice* device) const;
 
+  // Run Prepare and Launch phases on a single device.
+  absl::StatusOr<Result> ExecuteHelperOnSingleDevice(
+      absl::Span<PjRtBuffer* const> argument_handles, int replica,
+      int partition, const ExecuteOptions& options, bool fill_future,
+      PjRtDevice* device = nullptr) const;
+
+  absl::Status ExecutePrepareWithOomRetries(
+      std::optional<ExecuteLaunchArgs>& launch_args,
+      absl::Span<PjRtBuffer* const> argument_handles, int replica,
+      int partition, const ExecuteOptions& options, size_t host_callback_idx,
+      PjRtDevice* device = nullptr) const;
+
+  virtual void LaunchOnDevice(PjRtDevice* device,
+                              absl::AnyInvocable<void()> execute_fn) const = 0;
+
+  virtual bool ShouldRetryOnOom(int attempts, PjRtDevice* device,
+                                absl::Status perpare_status) const {
+    return false;
+  }
+
   Result ExecuteLaunch(ExecuteLaunchArgs& launch_args, bool fill_future) const;
 
   // Parameter shapes.
@@ -368,6 +419,13 @@ class CommonPjRtLoadedExecutable : public PjRtLoadedExecutable {
   // addressable_device_logical_ids_[i] is assigned. shared_ptrs instead of
   // unique_ptrs to play well with the Python bindings (see xla.cc).
   std::vector<PjRtDevice*> addressable_devices_;
+  // The replica and partition indices of device_assignment_ to be run by this
+  // client. On single-host platforms without partitioning, this is all
+  // replicas (i.e. addressable_device_logical_ids_[i] = (i, 0)), but this may
+  // not be the case on multi-host platforms. If there are 4 replicas and 2
+  // partitions on a single host platform, size of
+  // addressable_device_logical_ids_ is 4*2 = 8.
+  std::vector<LogicalDeviceIds> addressable_device_logical_ids_;
 };
 
 // TODO(parkers): Merge everything here into CommonPjRtBuffer.

From b788805f73714d15b6f5fb850ec60be6e8cf1a24 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Wed, 17 Dec 2025 13:36:40 -0800
Subject: [PATCH 464/753] Remove unused `ReportError` RPC from coordination
 service.

PiperOrigin-RevId: 845899374
---
 .../coordination/coordination_client.h        | 13 -----
 .../coordination/coordination_service.h       |  2 +
 .../coordination_service_agent.cc             | 44 ---------------
 .../coordination_service_agent_test.cc        | 56 -------------------
 .../coordination_service_rpc_handler.cc       | 40 -------------
 .../coordination_service_rpc_handler.h        | 10 ----
 .../coordination/coordination_service_test.cc | 11 ----
 .../coordination/grpc_coordination_client.cc  | 25 ---------
 .../grpc_coordination_service_impl.cc         |  2 -
 .../grpc_coordination_service_impl.h          |  2 -
 10 files changed, 2 insertions(+), 203 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
index 49313c4177e4bf..b9533934ab206f 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
@@ -48,10 +48,6 @@ using tensorflow::PollForErrorRequest;
 using tensorflow::PollForErrorResponse;
 using tensorflow::RegisterTaskRequest;
 using tensorflow::RegisterTaskResponse;
-using tensorflow::ReportErrorToServiceRequest;
-using tensorflow::ReportErrorToServiceResponse;
-using tensorflow::ReportErrorToTaskRequest;
-using tensorflow::ReportErrorToTaskResponse;
 using tensorflow::ResetTaskRequest;
 using tensorflow::ResetTaskResponse;
 using tensorflow::ShutdownTaskRequest;
@@ -92,15 +88,6 @@ class CoordinationClient {
                               ResetTaskResponse* response,
                               tsl::StatusCallback done) = 0;
 
-  virtual void ReportErrorToTaskAsync(tsl::CallOptions* call_opts,
-                                      const ReportErrorToTaskRequest* request,
-                                      ReportErrorToTaskResponse* response,
-                                      tsl::StatusCallback done) = 0;
-
-  virtual void ReportErrorToServiceAsync(
-      const ReportErrorToServiceRequest* request,
-      ReportErrorToServiceResponse* response, tsl::StatusCallback done) = 0;
-
   virtual void GetTaskStateAsync(const GetTaskStateRequest* request,
                                  GetTaskStateResponse* response,
                                  tsl::StatusCallback done) = 0;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
index 898f262691fbad..73550626494ca4 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
@@ -177,6 +177,8 @@ class CoordinationService {
                                IncarnationId incarnation);
 
   // Set a task in error state permanently.
+  //
+  // TODO: mwhittaker - Remove this. It's only used for testing.
   absl::Status ReportTaskError(const tensorflow::CoordinatedTask& task,
                                const absl::Status& error);
 
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index fe6c625e91c910..98e8a70bdd3b1c 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -454,50 +454,6 @@ CoordinationServiceAgent::WatchJobState(absl::string_view job_name,
   return response;
 }
 
-absl::Status CoordinationServiceAgent::ReportError(const absl::Status& error) {
-  {
-    absl::MutexLock l(state_mu_);
-    if (state_ == CoordinatedTaskState::TASKSTATE_UNINITIALIZED) {
-      return MakeCoordinationError(absl::FailedPreconditionError(
-          "Coordination service agent must be initialized first before "
-          "reporting error."));
-    }
-    if (state_ == CoordinatedTaskState::TASKSTATE_ERROR) {
-      return MakeCoordinationError(absl::FailedPreconditionError(
-          "Coordination service agent is already in error state."));
-    }
-  }
-  SetError(MakeCoordinationError(error, task_,
-                                 /*is_reported_error=*/true));
-  LOG(INFO) << "Reporting error to coordination service: " << error;
-  ReportErrorToServiceRequest request;
-  request.set_error_code(error.raw_code());
-  request.set_error_message(std::string(error.message()));
-  *request.mutable_error_origin() = task_;
-  VLOG(5) << "ReportErrorToServiceRequest: " << request.DebugString();
-  ReportErrorToServiceResponse response;
-
-  absl::Notification n;
-  leader_client_->ReportErrorToServiceAsync(
-      &request, &response, [&](const absl::Status& s) {
-        VLOG(5) << "ReportErrorToServiceResponse: " << s;
-        if (!s.ok()) {
-          LOG(ERROR)
-              << "Encountered another error when reporting error to "
-                 "coordination service: "
-              << s
-              << "\nThis is usually caused by an earlier error during "
-                 "execution. Check the logs of (a) this task, (b) the "
-                 "leader (usually slice 0 task 0) and (c) the scheduler "
-                 "(e.g. preemption, eviction) for an earlier error to debug "
-                 "further.";
-        }
-        n.Notify();
-      });
-  n.WaitForNotification();
-  return absl::OkStatus();
-}
-
 absl::Status CoordinationServiceAgent::Shutdown() { return ShutdownInternal(); }
 
 absl::Status CoordinationServiceAgent::ShutdownInternal() {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
index 0caa18b97102a1..6985fe7f71d7a9 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
@@ -108,10 +108,6 @@ class TestCoordinationClient : public CoordinationClient {
               (const ResetTaskRequest*, ResetTaskResponse*,
                tsl::StatusCallback),
               (override));
-  MOCK_METHOD(void, ReportErrorToServiceAsync,
-              (const ReportErrorToServiceRequest*,
-               ReportErrorToServiceResponse*, tsl::StatusCallback),
-              (override));
   MOCK_METHOD(void, BarrierAsync,
               (tsl::CallOptions * call_opts, const BarrierRequest*,
                BarrierResponse*, tsl::StatusCallback),
@@ -150,12 +146,6 @@ class TestCoordinationClient : public CoordinationClient {
 
   UNIMPLEMENTED(WaitForAllTasks);
 #undef UNIMPLEMENTED
-  void ReportErrorToTaskAsync(tsl::CallOptions* call_opts,
-                              const ReportErrorToTaskRequest* request,
-                              ReportErrorToTaskResponse* response,
-                              tsl::StatusCallback done) override {
-    done(absl::UnimplementedError("ReportErrorToTaskAsync"));
-  }
 };
 
 class CoordinationServiceAgentTest : public ::testing::Test {
@@ -167,8 +157,6 @@ class CoordinationServiceAgentTest : public ::testing::Test {
         .WillByDefault(InvokeArgument<3>(absl::OkStatus()));
     ON_CALL(*client_, ShutdownTaskAsync(_, _, _, _))
         .WillByDefault(InvokeArgument<3>(absl::OkStatus()));
-    ON_CALL(*client_, ReportErrorToServiceAsync(_, _, _))
-        .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
     ON_CALL(*client_, ResetTaskAsync(_, _, _))
         .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
     ON_CALL(*client_, BarrierAsync(_, _, _, _))
@@ -411,18 +399,6 @@ TEST_F(CoordinationServiceAgentTest, GetKeyValueDir_Simple_Success) {
   EXPECT_THAT(*result, UnorderedPointwise(KvEq(), test_values));
 }
 
-TEST_F(CoordinationServiceAgentTest, ShutdownInErrorShouldReturnError) {
-  // Connect coordination agent and set it to error.
-  InitializeAgent();
-  TF_ASSERT_OK(agent_->Connect());
-  TF_ASSERT_OK(agent_->ReportError(absl::InternalError("Test Error.")));
-
-  // Shutdown should return error.
-  absl::Status s = agent_->Shutdown();
-
-  EXPECT_TRUE(absl::IsFailedPrecondition(s));
-}
-
 TEST_F(CoordinationServiceAgentTest, Reset_ConnectedButNotInError_Fail) {
   // Connect agent.
   InitializeAgent();
@@ -434,18 +410,6 @@ TEST_F(CoordinationServiceAgentTest, Reset_ConnectedButNotInError_Fail) {
   EXPECT_TRUE(absl::IsFailedPrecondition(status));
 }
 
-TEST_F(CoordinationServiceAgentTest, ConnectAfterResetError) {
-  // Connect coordination agent and set it to error.
-  InitializeAgent();
-  TF_ASSERT_OK(agent_->Connect());
-  TF_ASSERT_OK(agent_->ReportError(absl::InternalError("Test Error.")));
-
-  // Reset error.
-  TF_ASSERT_OK(agent_->Reset());
-  // Agent should be able to reconnect to the service after resetting.
-  TF_EXPECT_OK(agent_->Connect());
-}
-
 TEST_F(CoordinationServiceAgentTest, ConnectAfterReset_WithErrorPolling) {
   // Connect coordination agent and set it to error.
   PollForErrorResponse mocked_response;
@@ -527,26 +491,6 @@ TEST_F(CoordinationServiceAgentTest,
   ASSERT_TRUE(agent_->IsError());
 }
 
-TEST_F(CoordinationServiceAgentTest, ResetCanBeRetried) {
-  // Mock reset error failing for the first time.
-  EXPECT_CALL(*GetClient(), ResetTaskAsync(_, _, _))
-      .WillOnce(InvokeArgument<2>(absl::InternalError("Reset error")))
-      .WillOnce(InvokeArgument<2>(absl::OkStatus()));
-  // Connect coordination agent and set it to error.
-  InitializeAgent();
-  TF_ASSERT_OK(agent_->Connect());
-  TF_ASSERT_OK(agent_->ReportError(absl::InternalError("Test Error.")));
-
-  // Reset error fails for the first time.
-  absl::Status reset_status = agent_->Reset();
-  EXPECT_TRUE(absl::IsInternal(reset_status));
-
-  // Agent should be able to attempt resetting again.
-  TF_ASSERT_OK(agent_->Reset());
-  // Agent should be able to reconnect to the service after resetting.
-  TF_EXPECT_OK(agent_->Connect());
-}
-
 TEST_F(CoordinationServiceAgentTest, GetOwnTask) {
   InitializeAgent();
 
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
index 1f95a806c7add1..2382aed192e709 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
@@ -136,46 +136,6 @@ void CoordinationServiceRpcHandler::ResetTaskAsync(
   done(service_->ResetTask(request->source_task()));
 }
 
-void CoordinationServiceRpcHandler::ReportErrorToTaskAsync(
-    const tensorflow::ReportErrorToTaskRequest* request,
-    tensorflow::ReportErrorToTaskResponse* response, tsl::StatusCallback done) {
-  absl::ReaderMutexLock l(mu_);
-  if (agent_ == nullptr) {
-    done(MakeCoordinationError(absl::InternalError(
-        "CoordinationServiceAgent is uninitialized or has already shutdown.")));
-    return;
-  }
-  const CoordinationServiceError& error_payload = request->error_payload();
-  absl::Status error(
-      static_cast<absl::StatusCode>(request->error_code()),
-      absl::StrCat(
-          "Error reported from /job:", error_payload.source_task().job_name(),
-          "/task:", error_payload.source_task().task_id(), ": ",
-          request->error_message()));
-  error = MakeCoordinationError(error, error_payload);
-  agent_->SetError(error);
-  done(absl::OkStatus());
-}
-
-void CoordinationServiceRpcHandler::ReportErrorToServiceAsync(
-    const tensorflow::ReportErrorToServiceRequest* request,
-    tensorflow::ReportErrorToServiceResponse* response,
-    tsl::StatusCallback done) {
-  absl::ReaderMutexLock l(mu_);
-  if (service_ == nullptr) {
-    done(MakeCoordinationError(
-        absl::InternalError("Coordination service is not enabled.")));
-    return;
-  }
-  done(service_->ReportTaskError(
-      request->error_origin(),
-      MakeCoordinationError(
-          absl::Status{static_cast<absl::StatusCode>(request->error_code()),
-                       request->error_message()},
-          request->error_origin(),
-          /*is_reported_error=*/true)));
-}
-
 void CoordinationServiceRpcHandler::GetTaskStateAsync(
     const tensorflow::GetTaskStateRequest* request,
     tensorflow::GetTaskStateResponse* response, tsl::StatusCallback done) {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
index 7e71594ddfb284..432db619703bbe 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
@@ -52,16 +52,6 @@ class CoordinationServiceRpcHandler {
                       tensorflow::ResetTaskResponse* response,
                       tsl::StatusCallback done);
 
-  void ReportErrorToTaskAsync(
-      const tensorflow::ReportErrorToTaskRequest* request,
-      tensorflow::ReportErrorToTaskResponse* response,
-      tsl::StatusCallback done);
-
-  void ReportErrorToServiceAsync(
-      const tensorflow::ReportErrorToServiceRequest* request,
-      tensorflow::ReportErrorToServiceResponse* response,
-      tsl::StatusCallback done);
-
   void GetTaskStateAsync(const tensorflow::GetTaskStateRequest* request,
                          tensorflow::GetTaskStateResponse* response,
                          tsl::StatusCallback done);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
index 7a08a8c0bf3100..5c2fdccbc7f9a8 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
@@ -103,16 +103,6 @@ class TestCoordinationClient : public CoordinationClient {
     done(absl::OkStatus());
   }
 
-  void ReportErrorToTaskAsync(tsl::CallOptions* call_opts,
-                              const ReportErrorToTaskRequest* request,
-                              ReportErrorToTaskResponse* response,
-                              tsl::StatusCallback done) override {
-    absl::MutexLock l(mu_);
-    status_ = absl::Status(static_cast<absl::StatusCode>(request->error_code()),
-                           request->error_message());
-    done(absl::OkStatus());
-  }
-
 #define UNIMPLEMENTED(method)                                              \
   void method##Async(const method##Request* request,                       \
                      method##Response* response, tsl::StatusCallback done) \
@@ -122,7 +112,6 @@ class TestCoordinationClient : public CoordinationClient {
 
   UNIMPLEMENTED(WaitForAllTasks);
   UNIMPLEMENTED(ResetTask);
-  UNIMPLEMENTED(ReportErrorToService);
   UNIMPLEMENTED(GetTaskState);
   UNIMPLEMENTED(InsertKeyValue);
   UNIMPLEMENTED(TryGetKeyValue);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
index ee5f5ebafc27cc..e6be2a6efde087 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
@@ -65,10 +65,6 @@ using tensorflow::PollForErrorRequest;
 using tensorflow::PollForErrorResponse;
 using tensorflow::RegisterTaskRequest;
 using tensorflow::RegisterTaskResponse;
-using tensorflow::ReportErrorToServiceRequest;
-using tensorflow::ReportErrorToServiceResponse;
-using tensorflow::ReportErrorToTaskRequest;
-using tensorflow::ReportErrorToTaskResponse;
 using tensorflow::ResetTaskRequest;
 using tensorflow::ResetTaskResponse;
 using tensorflow::ShutdownTaskRequest;
@@ -178,27 +174,6 @@ class GrpcCoordinationClient : public CoordinationClient {
         /*fail_fast=*/true, &target_);
   }
 
-  void ReportErrorToTaskAsync(tsl::CallOptions* call_opts,
-                              const ReportErrorToTaskRequest* request,
-                              ReportErrorToTaskResponse* response,
-                              tsl::StatusCallback done) override {
-    new tsl::RPCState<tsl::protobuf::Message>(
-        &stub_, cq_, "/tensorflow.CoordinationService/ReportErrorToTask",
-        *request, response, std::move(done), call_opts,
-        /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true,
-        &target_);
-  }
-
-  void ReportErrorToServiceAsync(const ReportErrorToServiceRequest* request,
-                                 ReportErrorToServiceResponse* response,
-                                 tsl::StatusCallback done) override {
-    new tsl::RPCState<tsl::protobuf::Message>(
-        &stub_, cq_, "/tensorflow.CoordinationService/ReportErrorToService",
-        *request, response, std::move(done), /*call_opts=*/nullptr,
-        /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true,
-        &target_);
-  }
-
   void GetTaskStateAsync(const GetTaskStateRequest* request,
                          GetTaskStateResponse* response,
                          tsl::StatusCallback done) override {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
index 27af5e9104ffb4..a9a8d614a3b207 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
@@ -48,8 +48,6 @@ void GrpcCoordinationServiceImpl::HandleRPCsLoop() {
   ENQUEUE_REQUEST(ShutdownTask);
   ENQUEUE_REQUEST(ResetTask);
   ENQUEUE_REQUEST(Heartbeat);
-  ENQUEUE_REQUEST(ReportErrorToTask);
-  ENQUEUE_REQUEST(ReportErrorToService);
   ENQUEUE_REQUEST(GetTaskState);
   ENQUEUE_REQUEST(WatchJobState);
   ENQUEUE_REQUEST(InsertKeyValue);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
index 3f699619273755..5dc9a46d56d743 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
@@ -89,8 +89,6 @@ class GrpcCoordinationServiceImpl : public tsl::AsyncServiceInterface {
   HANDLER(ShutdownTask);
   HANDLER(ResetTask);
   HANDLER(Heartbeat);
-  HANDLER(ReportErrorToTask);
-  HANDLER(ReportErrorToService);
   HANDLER(GetTaskState);
   HANDLER(WatchJobState);
   HANDLER(InsertKeyValue);

From 7585d543b0837c2f6a05087d3ba44d741fda92fe Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Wed, 17 Dec 2025 13:39:40 -0800
Subject: [PATCH 465/753] [PJRT] Change BuildPlanNodes,
 ChooseParallelizationStrategy, and the loop ordering code to look only at
 Loop objects, not other parts of the transpose plan.

This simplifies the code since we can compute the properties of a loop nest once from the transpose specification, and then subsequent phases can work on those loops only, rather than looking up properties of dimensions in other data structures.

Refactoring only, no behavior changes intended.

PiperOrigin-RevId: 845900348
---
 third_party/xla/xla/pjrt/transpose.cc | 195 +++++++++++++-------------
 third_party/xla/xla/pjrt/transpose.h  |  37 ++++-
 2 files changed, 125 insertions(+), 107 deletions(-)

diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index ab20440d7105d8..b352f02d531325 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -91,6 +91,7 @@ limitations under the License.
 #include "absl/functional/function_ref.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
 #include "absl/synchronization/blocking_counter.h"
@@ -712,15 +713,11 @@ static absl::Status ParseTilingSpecification(
 }
 
 // Helper function that builds a plan.
-void TransposePlan::BuildPlanNodes(
-    absl::Span<int64_t const> inverse_permutation, int thread_id,
-    std::vector<TransposePlan::Node>& nodes) {
+void TransposePlan::BuildPlanNodes(int thread_id,
+                                   std::vector<TransposePlan::Node>& nodes) {
   VLOG(8) << "Before plan build: " << ToString();
   const int ndim = a_dims_.size();
   DCHECK_GT(ndim, 0);
-  const int pos_stride1a = ndim - 1;
-  const int pos_stride1b_in_a = permutation_.back();
-  const int pos_stride1a_in_b = inverse_permutation[pos_stride1a];
 
   // We build plans in a depth-first order, visiting loops from outermost to
   // innermost. We use a stack (depth-first) order to handle trailing partial
@@ -745,8 +742,10 @@ void TransposePlan::BuildPlanNodes(
   };
   std::stack<Agendum> agenda;
 
-  int total_tasks =
-      absl::c_accumulate(loop_parallelism_, int{1}, std::multiplies<int>());
+  int total_tasks = 1;
+  for (const Loop& loop : loop_order_) {
+    total_tasks *= loop.parallelism;
+  }
 
   agenda.push(Agendum{/*loop_id=*/0, /*parent_node_id=*/-1,
                       /*num_tasks_at_loop=*/total_tasks,
@@ -777,12 +776,8 @@ void TransposePlan::BuildPlanNodes(
       if (!inner_kernel_is_memcpy_) {
         Node node;
         node.start = node.end = node.inc = -1;
-        node.lda = a_tiling_[pos_stride1b_in_a] > 1
-                       ? lda_tile_[pos_stride1b_in_a]
-                       : lda_[pos_stride1b_in_a];
-        node.ldb = b_tiling_[pos_stride1a_in_b] > 1
-                       ? ldb_tile_[pos_stride1a_in_b]
-                       : ldb_[pos_stride1a_in_b];
+        node.lda = sentinel_lda_;
+        node.ldb = sentinel_ldb_;
         nodes.push_back(node);
       }
       DCHECK(!(inner_kernel_is_memcpy_ && agendum.parent_node_id >= 0));
@@ -791,38 +786,34 @@ void TransposePlan::BuildPlanNodes(
 
     const Loop& loop = loop_order_[agendum.loop_id];
     int a_dim = loop.dim_in_a;
-    int b_dim = inverse_permutation[a_dim];
-    DCHECK(a_tiling_[a_dim] == 1 || b_tiling_[b_dim] == 1 ||
-           a_tiling_[a_dim] == b_tiling_[b_dim]);
-    int64_t tile_size = std::max(a_tiling_[a_dim], b_tiling_[b_dim]);
 
     // Compute the number of tasks for the next loop iteration.
     int task_id_at_loop = agendum.task_id_at_loop;
-    int num_tasks_at_loop =
-        agendum.num_tasks_at_loop / loop_parallelism_[agendum.loop_id];
+    int num_tasks_at_loop = agendum.num_tasks_at_loop / loop.parallelism;
     int task_id_at_next_loop = task_id_at_loop % num_tasks_at_loop;
 
+    Node node;
+    node.lda = loop.lda;
+    node.ldb = loop.ldb;
+    node.inc = 1;
+    node.is_inner_dim_in_a = loop.is_inner_dim_in_a;
+    node.is_inner_dim_in_b = loop.is_inner_dim_in_b;
+    if (node.is_inner_dim_in_a) {
+      node.inc = inner_block_elems_ * outer_block_elems_a_;
+    } else if (node.is_inner_dim_in_b) {
+      node.inc = inner_block_elems_ * outer_block_elems_b_;
+    }
+
+    int task_id = task_id_at_loop / num_tasks_at_loop;
+
     if (loop.tile_interior) {
       // We are visiting the tile interior of a tiled dimension.
       bool partial = agendum.partial_tiles[a_dim];
 
-      Node node;
-      node.lda = a_tiling_[a_dim] > 1 ? lda_tile_[a_dim] : lda_[a_dim];
-      node.ldb = b_tiling_[b_dim] > 1 ? ldb_tile_[b_dim] : ldb_[b_dim];
-      node.inc = 1;
-      node.is_inner_dim_in_a = (a_dim == pos_stride1a);
-      node.is_inner_dim_in_b = (a_dim == pos_stride1b_in_a);
-      if (node.is_inner_dim_in_a) {
-        node.inc = inner_block_elems_ * outer_block_elems_a_;
-      } else if (node.is_inner_dim_in_b) {
-        node.inc = inner_block_elems_ * outer_block_elems_b_;
-      }
-
-      int task_id = task_id_at_loop / num_tasks_at_loop;
-      int64_t size = partial ? a_dims_[a_dim] % tile_size : tile_size;
+      int64_t size = partial ? loop.dim_size % loop.tile_size : loop.tile_size;
       int64_t num_iterations = CeilOfRatio(size, node.inc);
-      int64_t num_iterations_per_task = CeilOfRatio<int64_t>(
-          num_iterations, loop_parallelism_[agendum.loop_id]);
+      int64_t num_iterations_per_task =
+          CeilOfRatio<int64_t>(num_iterations, loop.parallelism);
       node.start = std::min(size, task_id * num_iterations_per_task * node.inc);
       node.end =
           std::min(size, (task_id + 1) * num_iterations_per_task * node.inc);
@@ -845,15 +836,14 @@ void TransposePlan::BuildPlanNodes(
     } else {
       // We are either visiting an untiled dimension, or the loop that iterates
       // over tile exteriors.
-      int task_id = task_id_at_loop / num_tasks_at_loop;
-      int64_t num_complete_tiles = a_dims_[a_dim] / tile_size;
-      bool has_partial_tile = (a_dims_[a_dim] % tile_size != 0);
+      int64_t num_complete_tiles = loop.dim_size / loop.tile_size;
+      bool has_partial_tile = (loop.dim_size % loop.tile_size != 0);
 
       // If there is a trailing partial tile as well as complete tiles, handle
       // it as a trailer on the loop over complete tiles.
       bool has_trailing_plan_node = false;
       if (num_complete_tiles > 0 && has_partial_tile &&
-          task_id == loop_parallelism_[agendum.loop_id] - 1) {
+          task_id == loop.parallelism - 1) {
         Agendum new_agendum;
         new_agendum.loop_id = agendum.loop_id + 1;
         new_agendum.parent_node_id = node_id;
@@ -864,17 +854,6 @@ void TransposePlan::BuildPlanNodes(
         agenda.push(std::move(new_agendum));
         has_trailing_plan_node = true;
       }
-      Node node;
-      node.lda = lda_[a_dim] * tile_size / a_tiling_[a_dim];
-      node.ldb = ldb_[b_dim] * tile_size / b_tiling_[b_dim];
-      node.inc = 1;
-      node.is_inner_dim_in_a = (tile_size == 1 && a_dim == ndim - 1);
-      node.is_inner_dim_in_b = (tile_size == 1 && a_dim == pos_stride1b_in_a);
-      if (node.is_inner_dim_in_a) {
-        node.inc = inner_block_elems_ * outer_block_elems_a_;
-      } else if (node.is_inner_dim_in_b) {
-        node.inc = inner_block_elems_ * outer_block_elems_b_;
-      }
 
       // If this tiled dimension consists only of a single partial tile, handle
       // it here; there's no point emitting a degenerate loop and a separate
@@ -884,8 +863,8 @@ void TransposePlan::BuildPlanNodes(
       // Evenly divide the loop iterations amongst the threads.
       int64_t num_tiles = partial ? 1 : num_complete_tiles;
       int64_t num_iterations = CeilOfRatio(num_tiles, node.inc);
-      int64_t num_iterations_per_task = CeilOfRatio<int64_t>(
-          num_iterations, loop_parallelism_[agendum.loop_id]);
+      int64_t num_iterations_per_task =
+          CeilOfRatio<int64_t>(num_iterations, loop.parallelism);
       node.start =
           std::min(num_tiles, task_id * num_iterations_per_task * node.inc);
       node.end = std::min(num_tiles,
@@ -1123,11 +1102,45 @@ void TransposePlan::Initialize() {
   const int pos_stride1b_in_a = permutation_.back();
   inner_kernel_is_memcpy_ = (pos_stride1b_in_a == pos_stride1a);
 
+  // Calculate sentinel strides.
+  if (!inner_kernel_is_memcpy_) {
+    int pos_stride1a_in_b = inverse_permutation[ndim - 1];
+    sentinel_lda_ = a_tiling_[pos_stride1b_in_a] > 1
+                        ? lda_tile_[pos_stride1b_in_a]
+                        : lda_[pos_stride1b_in_a];
+    sentinel_ldb_ = b_tiling_[pos_stride1a_in_b] > 1
+                        ? ldb_tile_[pos_stride1a_in_b]
+                        : ldb_[pos_stride1a_in_b];
+  }
+
   loop_order_.reserve(ndim);
   for (int i = 0; i < ndim; ++i) {
-    loop_order_.push_back(Loop{i, /*tile_interior=*/false});
-    if (a_tiling_[i] != 1 || b_tiling_[inverse_permutation[i]] != 1) {
-      loop_order_.push_back(Loop{i, /*tile_interior=*/true});
+    Loop loop;
+    loop.dim_in_a = i;
+    loop.tile_interior = false;
+    loop.dim_size = a_dims_[i];
+    loop.tile_size = std::max(a_tiling_[i], b_tiling_[inverse_permutation[i]]);
+
+    loop.lda = lda_[i];
+    if (a_tiling_[i] == 1) {
+      loop.lda *= loop.tile_size;
+    }
+    loop.ldb = ldb_[inverse_permutation[i]];
+    if (b_tiling_[inverse_permutation[i]] == 1) {
+      loop.ldb *= loop.tile_size;
+    }
+    loop.is_inner_dim_in_a = (loop.tile_size == 1) && (i == pos_stride1a);
+    loop.is_inner_dim_in_b = (loop.tile_size == 1) && (i == pos_stride1b_in_a);
+    loop_order_.push_back(loop);
+
+    if (loop.tile_size > 1) {
+      loop.tile_interior = true;
+      loop.lda = a_is_tiled_ ? lda_tile_[i] : lda_[i];
+      loop.ldb = b_is_tiled_ ? ldb_tile_[inverse_permutation[i]]
+                             : ldb_[inverse_permutation[i]];
+      loop.is_inner_dim_in_a = (i == pos_stride1a);
+      loop.is_inner_dim_in_b = (i == pos_stride1b_in_a);
+      loop_order_.push_back(loop);
     }
   }
 
@@ -1196,21 +1209,12 @@ void TransposePlan::Initialize() {
 
   // Loop order heuristic: try to make loops with small strides innermost.
   auto cost = [&](const Loop& l) {
-    int64_t a_stride =
-        std::abs((l.tile_interior && a_is_tiled_) ? lda_tile_[l.dim_in_a]
-                                                  : lda_[l.dim_in_a]);
-    bool is_inner_dim_in_a =
-        (!a_is_tiled_ || l.tile_interior) && (l.dim_in_a == pos_stride1a);
-
-    if (!inner_kernel_is_memcpy_ && is_inner_dim_in_a) {
+    int64_t a_stride = std::abs(l.lda);
+    if (!inner_kernel_is_memcpy_ && l.is_inner_dim_in_a) {
       a_stride *= inner_block_elems_ * outer_block_elems_a_;
     }
-    int b_dim = inverse_permutation[l.dim_in_a];
-    int64_t b_stride =
-        (l.tile_interior && b_is_tiled_) ? ldb_tile_[b_dim] : ldb_[b_dim];
-    bool is_inner_dim_in_b =
-        (!b_is_tiled_ || l.tile_interior) && (l.dim_in_a == pos_stride1b_in_a);
-    if (!inner_kernel_is_memcpy_ && is_inner_dim_in_b) {
+    int64_t b_stride = std::abs(l.ldb);
+    if (!inner_kernel_is_memcpy_ && l.is_inner_dim_in_b) {
       b_stride *= inner_block_elems_ * outer_block_elems_b_;
     }
     // Add a small penalty to the input strides: given the choice between
@@ -1220,10 +1224,7 @@ void TransposePlan::Initialize() {
 
     // If the inner kernel is a memcpy make sure the innermost loop is the
     // stride-1 dimension. This is a requirement of the memcpy kernel.
-    bool dim_must_go_last =
-        inner_kernel_is_memcpy_ && l.dim_in_a == pos_stride1a &&
-        (l.tile_interior ||
-         (a_tiling_[l.dim_in_a] == 1 && b_tiling_[b_dim] == 1));
+    bool dim_must_go_last = inner_kernel_is_memcpy_ && l.is_inner_dim_in_a;
     return std::make_tuple(dim_must_go_last,
                            inner_kernel_is_memcpy_ && l.tile_interior,
                            -std::min<double>(a_stride * penalty, b_stride));
@@ -1237,15 +1238,13 @@ void TransposePlan::Initialize() {
   // both input and output.
 
   // The stride-1 loop must be innermost for a memcpy loop.
-  DCHECK(!inner_kernel_is_memcpy_ || loop_order_.back().dim_in_a == ndim - 1)
+  DCHECK(!inner_kernel_is_memcpy_ || loop_order_.back().is_inner_dim_in_a)
       << ToString();
 
-  loop_parallelism_ = ChooseParallelizationStrategy(inverse_permutation);
-  int num_threads =
-      absl::c_accumulate(loop_parallelism_, int{1}, std::multiplies<int>());
+  int num_threads = ChooseParallelizationStrategy();
   nodes_.resize(num_threads);
   for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
-    BuildPlanNodes(inverse_permutation, thread_id, nodes_[thread_id]);
+    BuildPlanNodes(thread_id, nodes_[thread_id]);
   }
 
   switch (transformation_) {
@@ -1260,28 +1259,20 @@ void TransposePlan::Initialize() {
   }
 }
 
-std::vector<int> TransposePlan::ChooseParallelizationStrategy(
-    absl::Span<int64_t const> inverse_permutation) {
-  std::vector<int> parallelism;
+int TransposePlan::ChooseParallelizationStrategy() {
   int available_parallelism = num_threads_requested_;
-  parallelism.reserve(loop_order_.size());
 
-  int ndim = permutation_.size();
-  const int pos_stride1a = ndim - 1;
-  const int pos_stride1b_in_a = permutation_.back();
   // Compute the number of iterations in `loop`.
   auto loop_iterations = [&](const Loop& loop) {
-    int a_dim = loop.dim_in_a;
-    int b_dim = inverse_permutation[a_dim];
-    int64_t tile_size = std::max(a_tiling_[a_dim], b_tiling_[b_dim]);
     int64_t size = loop.tile_interior
-                       ? tile_size
-                       : (CeilOfRatio(a_dims_[loop.dim_in_a], tile_size));
-    if (!inner_kernel_is_memcpy_ && (loop.tile_interior || tile_size == 1)) {
-      if (loop.dim_in_a == pos_stride1a) {
+                       ? loop.tile_size
+                       : (CeilOfRatio(loop.dim_size, loop.tile_size));
+    if (!inner_kernel_is_memcpy_ &&
+        (loop.tile_interior || loop.tile_size == 1)) {
+      if (loop.is_inner_dim_in_a) {
         size = CeilOfRatio<int64_t>(size,
                                     inner_block_elems_ * outer_block_elems_a_);
-      } else if (loop.dim_in_a == pos_stride1b_in_a) {
+      } else if (loop.is_inner_dim_in_b) {
         size = CeilOfRatio<int64_t>(size,
                                     inner_block_elems_ * outer_block_elems_b_);
       }
@@ -1306,8 +1297,9 @@ std::vector<int> TransposePlan::ChooseParallelizationStrategy(
 
   // Heuristic that attempts to parallelize the outermost loops, down to a
   // minimum per-thread number of bytes processed.
+  int num_threads = 1;
   for (size_t i = 0; i < loop_order_.size(); ++i) {
-    const Loop& loop = loop_order_[i];
+    Loop& loop = loop_order_[i];
     CHECK_GE(available_parallelism, 1);
     int64_t iterations = loop_iterations(loop);
     int kMinBytesPerThread = inner_kernel_is_memcpy_ ? (1 << 20) : (1 << 26);
@@ -1318,14 +1310,15 @@ std::vector<int> TransposePlan::ChooseParallelizationStrategy(
     VLOG(8) << "iterations=" << iterations << " parallel_work=" << parallel_work
             << " available_parallelism=" << available_parallelism;
     if (parallel_work >= available_parallelism) {
-      parallelism.push_back(available_parallelism);
+      loop.parallelism = available_parallelism;
       available_parallelism = 1;
     } else {
-      parallelism.push_back(parallel_work);
+      loop.parallelism = parallel_work;
       available_parallelism /= parallel_work;
     }
+    num_threads *= loop.parallelism;
   }
-  return parallelism;
+  return num_threads;
 }
 
 std::string TransposePlan::ToString() const {
@@ -1348,7 +1341,8 @@ std::string TransposePlan::ToString() const {
       });
   auto format_loop_order = [](std::string* out, const Loop& loop) {
     return absl::StrAppend(out, loop.dim_in_a,
-                           loop.tile_interior ? "[tile]" : "");
+                           loop.tile_interior ? "[tile]" : "", "(",
+                           loop.parallelism, ")");
   };
   std::string transformation_str;
   switch (transformation_) {
@@ -1362,7 +1356,7 @@ std::string TransposePlan::ToString() const {
   return absl::StrFormat(
       "elem_size=%d a_dims=%s b_dims=%s permutation=%s a_tiling=%s b_tiling=%s "
       "lda=%s lda_tile=%s ldb=%s ldb_tile=%s loop_order=%s "
-      "loop_parallelism=%s outer_bs=[%d,%d] inner_bs=%d "
+      "outer_bs=[%d,%d] inner_bs=%d "
       "transformation=%s scratch_size=%d\n"
       "nodes:\n%s",
       elem_size_in_bytes_, absl::StrJoin(a_dims_, ","),
@@ -1371,8 +1365,7 @@ std::string TransposePlan::ToString() const {
       absl::StrJoin(b_tiling_, ","), absl::StrJoin(lda_, ","),
       absl::StrJoin(lda_tile_, ","), absl::StrJoin(ldb_, ","),
       absl::StrJoin(ldb_tile_, ","),
-      absl::StrJoin(loop_order_, ",", format_loop_order),
-      absl::StrJoin(loop_parallelism_, ","), outer_block_elems_a_,
+      absl::StrJoin(loop_order_, ",", format_loop_order), outer_block_elems_a_,
       outer_block_elems_b_, inner_block_elems_, transformation_str,
       scratch_size_, nodes_str);
 }
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index 975e2bccc22c0c..c428d2df0f7f9d 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -169,11 +169,11 @@ class TransposePlan {
   // Performs plan initialization that cannot fail.
   void Initialize();
 
-  void BuildPlanNodes(absl::Span<int64_t const> inverse_permutation,
-                      int thread_id, std::vector<Node>& output_nodes);
+  void BuildPlanNodes(int thread_id, std::vector<Node>& output_nodes);
 
-  std::vector<int> ChooseParallelizationStrategy(
-      absl::Span<int64_t const> inverse_permutation);
+  // Chooses a parallelism for each loop. Returns the total number of parallel
+  // work units.
+  int ChooseParallelizationStrategy();
 
   // The signature of ExecuteTyped uses char* pointers because we perform
   // address calculations with strides in bytes; the strides need not be
@@ -222,13 +222,34 @@ class TransposePlan {
 
   // Order to traverse dimensions, from slowest-varying to fastest-varying.
   struct Loop {
-    // The integers are dimension numbers in A.
+    // Dimension number in A from which this loop originated. This is mostly
+    // for debugging the plan.
     int dim_in_a;
+
     // If true, the loop iterates over the interior of a tile.
+    // For an untiled dimension, this is always false. For a tiled dimension,
+    // we will have two loops: one over the tile exteriors and one over the tile
+    // interiors.
     bool tile_interior;
+
+    // Size of the iteration space.
+    int64_t dim_size;
+
+    // Size of the tiles, if this a tiled dimension.
+    int64_t tile_size;
+
+    int64_t lda;  // Stride in A for this loop.
+    int64_t ldb;  // Stride in B for this loop.
+
+    // Is this the innermost (stride 1) dimension in A or B? These dimensions
+    // are special for the kernels.
+    bool is_inner_dim_in_a;
+    bool is_inner_dim_in_b;
+
+    // Number of parallel threads to use for this loop.
+    int64_t parallelism;
   };
   std::vector<Loop> loop_order_;
-  std::vector<int> loop_parallelism_;
 
   // Root nodes of the plan, i.e., pointing to the outermost loops in the loop
   // nest. The outer vector is indexed on the thread ID.
@@ -246,6 +267,10 @@ class TransposePlan {
   int outer_block_elems_a_ = 4;
   int outer_block_elems_b_ = 4;
 
+  // Strides used by an inner transpose kernel. Unused for memcpy kernels.
+  int64_t sentinel_lda_ = -1;
+  int64_t sentinel_ldb_ = -1;
+
   // Transformations to apply to the input before transposition.
   // Currently the only supported transformation is EF57 conversion, which is
   // a pair-of-floats extended precision representation used on TPU. We

From 4350883de62491e22d27f4cb496e9a944377d2fc Mon Sep 17 00:00:00 2001
From: Ionel Gog <icgog@google.com>
Date: Wed, 17 Dec 2025 13:44:21 -0800
Subject: [PATCH 466/753] Add platform name to xla::ifrt::Device

PiperOrigin-RevId: 845901966
---
 .../xla/xla/backends/cpu/nanort/ifrt_client.cc        |  2 ++
 third_party/xla/xla/python/compile_only_ifrt/client.h | 11 ++++++++---
 third_party/xla/xla/python/ifrt/device.h              |  3 +++
 .../xla/python/ifrt/ir/compiled_ifrt_ir_program.cc    | 10 ++--------
 third_party/xla/xla/python/ifrt/mock.cc               |  3 +++
 third_party/xla/xla/python/ifrt/mock.h                |  1 +
 .../xla/xla/python/ifrt_proxy/client/client.cc        |  6 +++---
 .../xla/xla/python/ifrt_proxy/client/device.cc        |  7 +++++--
 third_party/xla/xla/python/ifrt_proxy/client/device.h |  6 ++++--
 .../xla/xla/python/ifrt_proxy/common/VERSION.md       |  6 ++++++
 .../xla/python/ifrt_proxy/common/ifrt_service.proto   |  1 +
 .../xla/xla/python/ifrt_proxy/common/versions.h       |  3 +++
 .../xla/xla/python/ifrt_proxy/server/ifrt_backend.cc  |  1 +
 .../xla/python/ifrt_proxy/server/ifrt_backend_test.cc |  2 ++
 third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc   |  3 ++-
 third_party/xla/xla/python/pjrt_ifrt/pjrt_device.cc   |  8 ++++++--
 third_party/xla/xla/python/pjrt_ifrt/pjrt_device.h    |  7 +++++--
 third_party/xla/xla/python/version.h                  |  3 ++-
 18 files changed, 59 insertions(+), 24 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc b/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
index 6f5e41e3391830..709f9f0f914f34 100644
--- a/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
+++ b/third_party/xla/xla/backends/cpu/nanort/ifrt_client.cc
@@ -1261,6 +1261,8 @@ class NanoDevice final : public llvm::RTTIExtends<NanoDevice, ifrt::Device> {
     return *attributes;
   }
 
+  absl::string_view PlatformName() const override { return "cpu"; }
+
   absl::string_view Kind() const override { return "cpu"; }
 
   absl::string_view ToString() const override { return "NanoRT CPU"; }
diff --git a/third_party/xla/xla/python/compile_only_ifrt/client.h b/third_party/xla/xla/python/compile_only_ifrt/client.h
index ef30a57d8cdd8a..621547a757b962 100644
--- a/third_party/xla/xla/python/compile_only_ifrt/client.h
+++ b/third_party/xla/xla/python/compile_only_ifrt/client.h
@@ -101,8 +101,10 @@ class CompileOnlyMemory
 class CompileOnlyDevice
     : public llvm::RTTIExtends<CompileOnlyDevice, ifrt::Device> {
  public:
-  explicit CompileOnlyDevice(const PjRtDeviceDescription* description)
+  explicit CompileOnlyDevice(const PjRtDeviceDescription* description,
+                             absl::string_view platform_name)
       : description_(std::move(description)),
+        platform_name_(platform_name),
         attributes_(ifrt::FromPjRtAttributeMap(description_->Attributes())) {}
 
   const PjRtDeviceDescription& description() const { return *description_; }
@@ -115,6 +117,8 @@ class CompileOnlyDevice
 
   int ProcessIndex() const override { return description_->process_index(); }
 
+  absl::string_view PlatformName() const override { return platform_name_; }
+
   absl::string_view Kind() const override {
     return description_->device_kind();
   }
@@ -150,6 +154,7 @@ class CompileOnlyDevice
 
  private:
   const PjRtDeviceDescription* description_;
+  const std::string platform_name_;
   ifrt::AttributeMap attributes_;
   ifrt::Memory* default_memory_ = nullptr;
   std::vector<ifrt::Memory*> unowned_memories_;
@@ -193,8 +198,8 @@ class CompileOnlyIfRtClient final
         attributes_(ifrt::AttributeMap::Map()) {
     int offset = 0;
     for (auto& description : descriptions_) {
-      owned_devices_.push_back(
-          std::make_unique<CompileOnlyDevice>(description.get()));
+      owned_devices_.push_back(std::make_unique<CompileOnlyDevice>(
+          description.get(), topology_->platform_name()));
       auto* device = owned_devices_.back().get();
       devices_.push_back(device);
       if (description->process_index() == process_index()) {
diff --git a/third_party/xla/xla/python/ifrt/device.h b/third_party/xla/xla/python/ifrt/device.h
index a9d86a7d17e3ed..0298701f54ba2f 100644
--- a/third_party/xla/xla/python/ifrt/device.h
+++ b/third_party/xla/xla/python/ifrt/device.h
@@ -57,6 +57,9 @@ class Device : public llvm::RTTIExtends<Device, llvm::RTTIRoot> {
   // reference will remain valid for the lifetime of the Device.
   virtual const AttributeMap& Attributes() const = 0;
 
+  // A string that uniquely identifies the platform, e.g., "tpu", "cuda", "cpu".
+  virtual absl::string_view PlatformName() const = 0;
+
   // A vendor-dependent string that uniquely identifies the kind of device,
   // e.g., "Tesla V100-SXM2-16GB". May be used to determine whether two GPUs are
   // compatible compilation.
diff --git a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
index 67c912b86cafc2..e1d96bff98046b 100644
--- a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
+++ b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
@@ -386,14 +386,8 @@ absl::StatusOr<CompiledIfrtIrProgram> CompiledIfrtIrProgram::Create(
     compile_pipeline_options.propagate_shardings =
         compile_options->propagate_shardings;
     for (const auto device : devices) {
-      auto platform_name =
-          device->Attributes().Get<std::string>("platform_name");
-      if (platform_name.ok()) {
-        compile_pipeline_options.platform_names.push_back(*platform_name);
-      } else {
-        compile_pipeline_options.platform_names.push_back(
-            std::string(client->platform_name()));
-      }
+      compile_pipeline_options.platform_names.push_back(
+          std::string(device->PlatformName()));
     }
     TF_RETURN_IF_ERROR(xla::ifrt::createOutlinedAtomProgramsToCompiledPipeline(
         pm, std::move(atom_program_compiler), compile_pipeline_options,
diff --git a/third_party/xla/xla/python/ifrt/mock.cc b/third_party/xla/xla/python/ifrt/mock.cc
index b71a27124a61df..3ecaef013dd39c 100644
--- a/third_party/xla/xla/python/ifrt/mock.cc
+++ b/third_party/xla/xla/python/ifrt/mock.cc
@@ -261,6 +261,9 @@ MockDevice::MockDevice(Device* delegated) : delegated_(delegated) {
   ON_CALL(*this, ProcessIndex).WillByDefault([this]() {
     return delegated_->ProcessIndex();
   });
+  ON_CALL(*this, PlatformName).WillByDefault([this]() {
+    return delegated_->PlatformName();
+  });
   ON_CALL(*this, Kind).WillByDefault([this]() { return delegated_->Kind(); });
   ON_CALL(*this, Attributes).WillByDefault([this]() -> const AttributeMap& {
     return delegated_->Attributes();
diff --git a/third_party/xla/xla/python/ifrt/mock.h b/third_party/xla/xla/python/ifrt/mock.h
index 7da107a1b79f01..ae686297484915 100644
--- a/third_party/xla/xla/python/ifrt/mock.h
+++ b/third_party/xla/xla/python/ifrt/mock.h
@@ -237,6 +237,7 @@ class MockDevice : public Device {
   MOCK_METHOD(bool, IsAddressable, (), (const, final));
   MOCK_METHOD(int, ProcessIndex, (), (const, final));
   MOCK_METHOD(DeviceId, Id, (), (const, final));
+  MOCK_METHOD(absl::string_view, PlatformName, (), (const, final));
   MOCK_METHOD(absl::string_view, Kind, (), (const, final));
   MOCK_METHOD((const AttributeMap&), Attributes, (), (const, final));
   MOCK_METHOD(absl::StatusOr<Memory*>, DefaultMemory, (), (const, final));
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/client.cc b/third_party/xla/xla/python/ifrt_proxy/client/client.cc
index 03c92861e6560f..e902620da7dd7c 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/client.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/client.cc
@@ -139,9 +139,9 @@ absl::StatusOr<std::unique_ptr<Client>> Client::Create(
     bool is_addressable = addressable_device_ids.contains(d.id());
     bool is_primary = primary_device_ids.contains(d.id());
 
-    auto device =
-        std::make_unique<Device>(std::move(desc), d.local_device_id(),
-                                 d.local_hardware_id(), is_addressable);
+    auto device = std::make_unique<Device>(
+        std::move(desc), d.platform_name(), d.local_device_id(),
+        d.local_hardware_id(), is_addressable);
     all_device_ptrs.push_back(device.get());
     if (is_primary) {
       primary_device_ptrs.push_back(device.get());
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/device.cc b/third_party/xla/xla/python/ifrt_proxy/client/device.cc
index 63a8b084512dc1..cb1019ae548705 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/device.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/device.cc
@@ -14,6 +14,7 @@
 
 #include "xla/python/ifrt_proxy/client/device.h"
 
+#include <string>
 #include <utility>
 
 #include "absl/status/status.h"
@@ -28,9 +29,10 @@ namespace xla {
 namespace ifrt {
 namespace proxy {
 
-Device::Device(DeviceDescription description, int local_device_id,
-               int local_hardware_id, bool is_addressable)
+Device::Device(DeviceDescription description, std::string platform_name,
+               int local_device_id, int local_hardware_id, bool is_addressable)
     : description_(std::move(description)),
+      platform_name_(std::move(platform_name)),
       attributes_(FromPjRtAttributeMap(description_.Attributes())),
       local_device_id_(local_device_id),
       local_hardware_id_(local_hardware_id),
@@ -42,6 +44,7 @@ DeviceId Device::Id() const { return DeviceId(description_.id()); }
 
 bool Device::IsAddressable() const { return is_addressable_; }
 
+absl::string_view Device::PlatformName() const { return platform_name_; }
 absl::string_view Device::Kind() const { return description_.device_kind(); }
 absl::string_view Device::ToString() const { return description_.ToString(); }
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/client/device.h b/third_party/xla/xla/python/ifrt_proxy/client/device.h
index 7e0c684d8b0e5a..c81b0359b7ba9d 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/device.h
+++ b/third_party/xla/xla/python/ifrt_proxy/client/device.h
@@ -76,13 +76,14 @@ class DeviceDescription final : public xla::PjRtDeviceDescription {
 
 class Device final : public llvm::RTTIExtends<Device, xla::ifrt::Device> {
  public:
-  Device(DeviceDescription description, int local_device_id,
-         int local_hardware_id, bool is_addressable);
+  Device(DeviceDescription description, std::string platform_name,
+         int local_device_id, int local_hardware_id, bool is_addressable);
 
   ifrt::Client* client() const override;
   bool IsAddressable() const override;
 
   DeviceId Id() const override;
+  absl::string_view PlatformName() const override;
   absl::string_view Kind() const override;
   absl::string_view ToString() const override;
   absl::string_view DebugString() const override;
@@ -100,6 +101,7 @@ class Device final : public llvm::RTTIExtends<Device, xla::ifrt::Device> {
 
   ifrt::Client* client_;
   const DeviceDescription description_;
+  const std::string platform_name_;
 
   const AttributeMap attributes_;
 
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md b/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md
index 84691a8c5de6bd..59147e46daf025 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md
+++ b/third_party/xla/xla/python/ifrt_proxy/common/VERSION.md
@@ -131,3 +131,9 @@
 *   Changes:
     *   Added a new op `LoadedExecutableFetchExecuteResult` for reading
         execution results.
+
+## Version kkDevicePlatformName
+
+*   Added date: 2025-12-13
+*   Changes:
+    *   Added `Device::PlatformName()` for getting the platform of a device.
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto b/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto
index d8638b40629de4..6b20e3bad369cc 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto
+++ b/third_party/xla/xla/python/ifrt_proxy/common/ifrt_service.proto
@@ -248,6 +248,7 @@ message InitResponse {
     int32 id = 1;
     int32 local_device_id = 9;
     int32 local_hardware_id = 2;
+    string platform_name = 11;
     string device_kind = 3;
     optional int32 default_memory_id = 7;
     repeated int32 memory_ids = 8;
diff --git a/third_party/xla/xla/python/ifrt_proxy/common/versions.h b/third_party/xla/xla/python/ifrt_proxy/common/versions.h
index fbbc0675084768..2b04ea6a778bf4 100644
--- a/third_party/xla/xla/python/ifrt_proxy/common/versions.h
+++ b/third_party/xla/xla/python/ifrt_proxy/common/versions.h
@@ -70,6 +70,9 @@ enum {
   // results to return extra information such as device time measurement.
   kExecuteResult = 21,
 
+  // kDevicePlatformName adds a PlatformName() method to Device.
+  kDevicePlatformName = 22,
+
   // kSentiel is used to derive kCurrent below. Keep this as the last value of
   // the enum.
   kSentiel,
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
index 21d8272662662a..b4408a2bcf8dbf 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend.cc
@@ -763,6 +763,7 @@ absl::StatusOr<BackendInterface::Response> IfrtBackend::HandleInit(
   for (auto* device : all_devices) {
     InitResponse::Device* d = init_resp->add_all_devices();
     d->set_id(device->Id().value());
+    d->set_platform_name(AsProtoStringData(device->PlatformName()));
     d->set_device_kind(AsProtoStringData(device->Kind()));
     if (auto default_memory = device->DefaultMemory(); default_memory.ok()) {
       d->set_default_memory_id((*default_memory)->Id().value());
diff --git a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
index 332c4c7e3b201c..9e8b511d18bc72 100644
--- a/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/server/ifrt_backend_test.cc
@@ -471,6 +471,7 @@ TEST_P(IfrtBackendHandlerTest, Init) {
 
     MockDevice& mock_device = *mock_devices_[i];
     // TODO(b/314368788): Clean up PJRT device ID APIs.
+    EXPECT_CALL(mock_device, PlatformName()).WillRepeatedly(Return("mock"));
     EXPECT_CALL(mock_device, Kind()).WillRepeatedly(Return("mock"));
     EXPECT_CALL(mock_device, Memories())
         .WillRepeatedly(Return(device_memories[i]));
@@ -500,6 +501,7 @@ TEST_P(IfrtBackendHandlerTest, Init) {
   EXPECT_EQ(init_response.all_devices().size(), 2);
   for (auto device : init_response.all_devices()) {
     int device_canonical_num = device.id();
+    EXPECT_EQ(device.platform_name(), "mock");
     EXPECT_EQ(device.device_kind(), "mock");
     EXPECT_EQ(device.default_memory_id(), device_canonical_num);
     EXPECT_EQ(device.memory_ids().size(), 1);
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc
index d906dd44f5bf78..e2cdde0b602b88 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_client.cc
@@ -498,6 +498,7 @@ MakePjRtDevicesFromGlobalTopology(PjRtClient* client,
       }
     }
 
+    std::string platform_name(pjrt_client->platform_name());
     const bool node_is_me = process_index == global_topology.my_process_index;
     for (const DeviceProto& device_proto : node.devices()) {
       absl::flat_hash_map<std::string, PjRtDeviceAttribute> attributes;
@@ -536,7 +537,7 @@ MakePjRtDevicesFromGlobalTopology(PjRtClient* client,
         }
       }
       auto ifrt_device = std::make_unique<PjRtDevice>(
-          client, ifrt_device_id, device_proto.device_kind(),
+          client, ifrt_device_id, platform_name, device_proto.device_kind(),
           std::move(to_string), std::move(debug_string), process_index,
           std::move(attributes), pjrt_device);
       devices.push_back(std::move(ifrt_device));
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.cc
index 5fb57bb8882f7e..fa8db5f6b9f2bc 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.cc
@@ -39,13 +39,15 @@ char PjRtCompatibleDevice::ID = 0;
 char PjRtDevice::ID = 0;
 
 PjRtDevice::PjRtDevice(
-    PjRtClient* client, DeviceId id, std::string kind, std::string to_string,
-    std::string debug_string, int process_index,
+    PjRtClient* client, DeviceId id, std::string platform_name,
+    std::string kind, std::string to_string, std::string debug_string,
+    int process_index,
     absl::flat_hash_map<std::string, PjRtDeviceAttribute> attributes,
     xla::PjRtDevice* pjrt_device)
     : client_(client),
       id_(id),
       attributes_(FromPjRtAttributeMap(std::move(attributes))),
+      platform_name_(std::move(platform_name)),
       kind_(std::move(kind)),
       to_string_(std::move(to_string)),
       debug_string_(std::move(debug_string)),
@@ -56,6 +58,8 @@ DeviceId PjRtDevice::Id() const { return id_; }
 
 const AttributeMap& PjRtDevice::Attributes() const { return attributes_; }
 
+absl::string_view PjRtDevice::PlatformName() const { return platform_name_; }
+
 absl::string_view PjRtDevice::Kind() const { return kind_; }
 
 absl::string_view PjRtDevice::ToString() const { return to_string_; }
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.h
index 596db196304df4..6a832174feaaa0 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_device.h
@@ -43,8 +43,9 @@ class PjRtCompatibleDevice : public llvm::RTTIExtends<PjRtDevice, Device> {
 class PjRtDevice final
     : public llvm::RTTIExtends<PjRtDevice, PjRtCompatibleDevice> {
  public:
-  PjRtDevice(PjRtClient* client, DeviceId id, std::string kind,
-             std::string to_string, std::string debug_string, int process_index,
+  PjRtDevice(PjRtClient* client, DeviceId id, std::string platform_name,
+             std::string kind, std::string to_string, std::string debug_string,
+             int process_index,
              absl::flat_hash_map<std::string, PjRtDeviceAttribute> attributes,
              xla::PjRtDevice* pjrt_device);
 
@@ -57,6 +58,7 @@ class PjRtDevice final
 
   DeviceId Id() const final;
   const AttributeMap& Attributes() const final;
+  absl::string_view PlatformName() const final;
   absl::string_view Kind() const final;
   absl::string_view ToString() const final;
   absl::string_view DebugString() const final;
@@ -74,6 +76,7 @@ class PjRtDevice final
 
   DeviceId id_;
   AttributeMap attributes_;
+  std::string platform_name_;
   std::string kind_;
   std::string to_string_;
   std::string debug_string_;
diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 6ee37c65188467..6460623314ce63 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER 43  // Transpose API update
+#define JAX_IFRT_VERSION_NUMBER \
+  44  // xla::ifrt::Device has a new PlatformName() API.
 
 #endif  // XLA_PYTHON_VERSION_H_

From b525b848e7b79f7583665ee82b0fced245871425 Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Wed, 17 Dec 2025 13:52:26 -0800
Subject: [PATCH 467/753] Implement `AbslStringify` for strong int types in TSL

PiperOrigin-RevId: 845904790
---
 third_party/xla/xla/tsl/lib/gtl/BUILD           |  2 ++
 third_party/xla/xla/tsl/lib/gtl/int_type.h      | 17 +++++++++++++++++
 .../xla/xla/tsl/lib/gtl/int_type_test.cc        |  7 +++++++
 3 files changed, 26 insertions(+)

diff --git a/third_party/xla/xla/tsl/lib/gtl/BUILD b/third_party/xla/xla/tsl/lib/gtl/BUILD
index 31e08d6c09686e..9f1ff1dc6723eb 100644
--- a/third_party/xla/xla/tsl/lib/gtl/BUILD
+++ b/third_party/xla/xla/tsl/lib/gtl/BUILD
@@ -98,6 +98,7 @@ cc_library(
     deps = [
         "//xla/tsl/platform:macros",
         "//xla/tsl/platform:types",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
@@ -247,6 +248,7 @@ tsl_cc_test(
         "//xla/tsl/platform:test",
         "//xla/tsl/platform:types",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:hash",
     ],
diff --git a/third_party/xla/xla/tsl/lib/gtl/int_type.h b/third_party/xla/xla/tsl/lib/gtl/int_type.h
index c0760d45cae7c0..7930484b948a83 100644
--- a/third_party/xla/xla/tsl/lib/gtl/int_type.h
+++ b/third_party/xla/xla/tsl/lib/gtl/int_type.h
@@ -154,11 +154,13 @@ limitations under the License.
 
 #include <stddef.h>
 
+#include <cstdint>
 #include <functional>
 #include <iosfwd>
 #include <ostream>  // NOLINT
 #include <unordered_map>
 
+#include "absl/strings/str_format.h"
 #include "xla/tsl/platform/macros.h"
 #include "xla/tsl/platform/types.h"
 
@@ -290,6 +292,21 @@ std::ostream& operator<<(std::ostream& os,  // NOLINT
   return os << arg.value();
 }
 
+template <typename Sink, typename... T>
+void AbslStringify(Sink& sink, IntType<T...> arg) {
+  using ValueType = typename decltype(arg)::ValueType;
+
+  // int8_t/uint8_t are not supported by the "%v" specifier due to it being
+  // ambiguous whether an integer or character should be printed.
+  if constexpr (std::is_same_v<ValueType, int8_t>) {
+    absl::Format(&sink, "%d", arg.value());
+  } else if constexpr (std::is_same_v<ValueType, uint8_t>) {
+    absl::Format(&sink, "%u", arg.value());
+  } else {
+    absl::Format(&sink, "%v", arg.value());
+  }
+}
+
 // -- NON-MEMBER ARITHMETIC OPERATORS ------------------------------------------
 // We support only the +, -, *, and / operators with the same IntType and
 // ValueType types.  The reason is to allow simple manipulation on these IDs
diff --git a/third_party/xla/xla/tsl/lib/gtl/int_type_test.cc b/third_party/xla/xla/tsl/lib/gtl/int_type_test.cc
index 1205cd7a3e3251..0120e2f96e1ecc 100644
--- a/third_party/xla/xla/tsl/lib/gtl/int_type_test.cc
+++ b/third_party/xla/xla/tsl/lib/gtl/int_type_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <memory>
 #include <unordered_map>
 
+#include "absl/strings/str_cat.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/types.h"
 
@@ -291,4 +292,10 @@ TYPED_TEST(IntTypeTest, TestMove) {
   EXPECT_EQ(321, *foo.ptr);
 }
 
+TYPED_TEST(IntTypeTest, TestAbslStringify) {
+  TypeParam a(1);
+
+  EXPECT_EQ(absl::StrCat(a), absl::StrCat(a.value()));
+}
+
 }  // namespace tsl

From a207484d9eb07cfa24da75c9428a20112b410fd2 Mon Sep 17 00:00:00 2001
From: Fengwu Yao <fengwuyao@google.com>
Date: Wed, 17 Dec 2025 13:58:35 -0800
Subject: [PATCH 468/753] Update to use half data type in split.

PiperOrigin-RevId: 845906959
---
 tensorflow/lite/testing/BUILD            |  4 ++++
 tensorflow/lite/testing/split.h          |  9 ++++-----
 tensorflow/lite/testing/split_test.cc    | 13 +++++++++++++
 tensorflow/lite/testing/tflite_driver.cc |  7 ++++---
 4 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index 958928db4663d8..ba3ce1c9e0b9e8 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -275,6 +275,7 @@ cc_library(
     hdrs = ["split.h"],
     deps = [
         "//tensorflow/lite:string",
+        "//tensorflow/lite/types:half",
         "@eigen_archive//:eigen3",
     ],
 )
@@ -286,7 +287,9 @@ cc_test(
     deps = [
         ":split",
         "//tensorflow/lite:string",
+        "//tensorflow/lite/types:half",
         "@com_google_googletest//:gtest_main",
+        "@eigen_archive//:eigen3",
     ],
 )
 
@@ -333,6 +336,7 @@ cc_library(
         "//tensorflow/lite/tools:logging",
         "//tensorflow/lite/tools/delegates:delegate_provider_hdr",
         "//tensorflow/lite/tools/evaluation:utils",
+        "//tensorflow/lite/types:half",
         "@com_google_absl//absl/strings",
         "@eigen_archive//:eigen3",
     ] + select({
diff --git a/tensorflow/lite/testing/split.h b/tensorflow/lite/testing/split.h
index ec932a8de8d68f..5431bccf1a72f6 100644
--- a/tensorflow/lite/testing/split.h
+++ b/tensorflow/lite/testing/split.h
@@ -25,6 +25,7 @@ limitations under the License.
 
 #include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace testing {
@@ -199,12 +200,10 @@ inline std::vector<std::complex<double>> Split(const string& s,
 }
 
 template <>
-inline std::vector<Eigen::half> Split(const string& s,
-                                      const string& delimiter) {
-  std::vector<Eigen::half> fields;
+inline std::vector<half> Split(const string& s, const string& delimiter) {
+  std::vector<half> fields;
   for (const auto& p : SplitToPos(s, delimiter)) {
-    fields.push_back(Eigen::half_impl::float_to_half_rtne(
-        strtof(s.data() + p.first, nullptr)));
+    fields.push_back(static_cast<half>(strtof(s.data() + p.first, nullptr)));
   }
   return fields;
 }
diff --git a/tensorflow/lite/testing/split_test.cc b/tensorflow/lite/testing/split_test.cc
index c8824395ea97dc..90b8276b3ed654 100644
--- a/tensorflow/lite/testing/split_test.cc
+++ b/tensorflow/lite/testing/split_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/string_type.h"
+#include "tensorflow/lite/types/half.h"
 
 namespace tflite {
 namespace testing {
@@ -45,6 +47,17 @@ TEST(SplitTest, SplitFloat) {
   EXPECT_THAT(Split<float>("1.0 B 1e-5", " "), ElementsAre(1.0, 0.0, 1e-5));
 }
 
+TEST(SplitTest, SplitHalf) {
+  EXPECT_THAT(Split<half>("1.0 2.5 1e-2", " "),
+              ElementsAre(half(1.0f), half(2.5f), half(0.01f)));
+}
+
+TEST(SplitTest, SplitBfloat16) {
+  EXPECT_THAT(Split<Eigen::bfloat16>("1.0 2.5 1e-2", " "),
+              ElementsAre(Eigen::bfloat16(1.0f), Eigen::bfloat16(2.5f),
+                          Eigen::bfloat16(0.01f)));
+}
+
 TEST(SplitTest, SplitInt) {
   EXPECT_THAT(Split<int>("1,-1,258", ","), ElementsAre(1, -1, 258));
 }
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index 89fed23bb7d2a8..5b15e6a6ed0ee5 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/lite/testing/result_expectations.h"
 #include "tensorflow/lite/tools/delegates/delegate_provider.h"
 #include "tensorflow/lite/tools/logging.h"
+#include "tensorflow/lite/types/half.h"
 #if !defined(__APPLE__)
 #include "tensorflow/lite/delegates/flex/delegate.h"
 #endif
@@ -405,11 +406,11 @@ void TfLiteDriver::SetInput(const std::string& name,
       break;
     }
     case kTfLiteFloat16: {
-      const auto& values = testing::Split<Eigen::half>(csv_values, ",");
+      const auto& values = testing::Split<half>(csv_values, ",");
       for (auto k : values) {
         TFLITE_LOG(INFO) << "input" << k;
       }
-      if (!CheckSizes<Eigen::half>(tensor->bytes, values.size())) return;
+      if (!CheckSizes<half>(tensor->bytes, values.size())) return;
       SetTensorData(values, tensor->data.raw);
       break;
     }
@@ -500,7 +501,7 @@ void TfLiteDriver::SetExpectation(const std::string& name,
       expected_output_[id]->SetData<std::complex<double>>(csv_values);
       break;
     case kTfLiteFloat16:
-      expected_output_[id]->SetData<Eigen::half>(csv_values);
+      expected_output_[id]->SetData<half>(csv_values);
       break;
     case kTfLiteBFloat16:
       expected_output_[id]->SetData<Eigen::bfloat16>(csv_values);

From 1262b408a282a7c1a98685eb5b7ed5198430f91f Mon Sep 17 00:00:00 2001
From: Hyeontaek Lim <hyeontaek@google.com>
Date: Wed, 17 Dec 2025 14:04:30 -0800
Subject: [PATCH 469/753] [PjRt-IFRT] Internally track the output spec of
 `ifrt::PjRtExecutable`

This change adds dtype/shape/sharding/layout discovery within `ifrt::PjRtExecutable`. This closely matches the internals of `ifrt::PjRtLoadedExecutable`.

This output spec information is not used at the moment, but will be used for implementing unified `Serialize()` methods that store IFRT-level metadata of serialized executables in the `SerializedXlaExecutableMetadata` proto format (and make this information preserved across serialization/deserialization roundtrip).

PiperOrigin-RevId: 845909500
---
 .../xla/python/pjrt_ifrt/pjrt_executable.cc   | 81 +++++++++++++++----
 .../xla/python/pjrt_ifrt/pjrt_executable.h    | 19 ++++-
 2 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
index d668d190ba5ce6..a700a495ca48a8 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.cc
@@ -33,6 +33,8 @@ limitations under the License.
 #include "llvm/Support/Casting.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Types.h"
 #include "xla/ffi/execution_context.h"
 #include "xla/ffi/type_registry.h"
 #include "xla/future.h"
@@ -345,6 +347,23 @@ absl::StatusOr<std::vector<xla::LayoutMode>> GetOutputLayoutModesFromHloModules(
                         output_dtypes.size());
 }
 
+// Returns a list of result shapes from the given MLIR module.
+absl::StatusOr<std::vector<xla::Shape>> ResultShapesOfModule(
+    mlir::ModuleOp module) {
+  mlir::func::FuncOp main = module.lookupSymbol<mlir::func::FuncOp>("main");
+  if (!main) {
+    return InvalidArgument("MLIR module has no main function");
+  }
+  mlir::FunctionType type = main.getFunctionType();
+  std::vector<xla::Shape> result_shapes;
+  result_shapes.reserve(type.getNumResults());
+  for (unsigned i = 0; i < type.getNumResults(); ++i) {
+    mlir::Type result_type = type.getResult(i);
+    result_shapes.push_back(xla::TypeToShape(result_type));
+  }
+  return result_shapes;
+}
+
 // Returns a new `DeviceListRef` that contains the addressable devices of the
 // PjRt executable if the supplied `executable_devices` has an incomplete set of
 // devices.
@@ -421,12 +440,56 @@ char PjRtLoadedExecutable::ID = 0;
 absl::StatusOr<ExecutableRef> PjRtExecutable::Create(
     mlir::ModuleOp module, xla::CompileOptions compile_options,
     const xla::PjRtTopologyDescription& topology) {
+  // We have to do process the MLIR before the compile call, since the latter
+  // will use the MLIR as scratch space, or possibly even deallocate it.
+  TF_ASSIGN_OR_RETURN(
+      const std::vector<xla::Shape> mlir_module_output_xla_shapes,
+      ResultShapesOfModule(module));
+  TF_ASSIGN_OR_RETURN(const std::vector<xla::LayoutMode> output_layout_modes,
+                      GetOutputLayoutModes(module));
+
   TF_ASSIGN_OR_RETURN(auto pjrt_executable,
                       PjRtCompile(std::move(compile_options), std::move(module),
                                   topology, /*client=*/nullptr));
-  return ExecutableRef(new PjRtExecutable(std::move(pjrt_executable)));
+
+  TF_ASSIGN_OR_RETURN(auto output_dtypes_and_shapes,
+                      GetDTypesAndShapes(mlir_module_output_xla_shapes));
+  std::vector<DType> output_dtypes = std::move(output_dtypes_and_shapes.first);
+  std::vector<Shape> output_shapes = std::move(output_dtypes_and_shapes.second);
+  TF_ASSIGN_OR_RETURN(
+      std::optional<std::vector<xla::HloSharding>> output_hlo_shardings,
+      GetHloShardings(pjrt_executable->GetOutputShardings(), output_dtypes,
+                      /*is_output=*/true));
+
+  TF_ASSIGN_OR_RETURN(
+      std::vector<absl::string_view> output_memory_kinds,
+      GetMemoryKinds(pjrt_executable->GetOutputMemoryKinds(), output_dtypes));
+
+  TF_ASSIGN_OR_RETURN(
+      std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
+          output_layouts,
+      GetLayouts(pjrt_executable->GetOutputLayouts(), output_layout_modes));
+
+  return ExecutableRef(new PjRtExecutable(
+      std::move(pjrt_executable), std::move(output_dtypes),
+      std::move(output_shapes), std::move(output_hlo_shardings),
+      std::move(output_memory_kinds), std::move(output_layouts)));
 }
 
+PjRtExecutable::PjRtExecutable(
+    std::shared_ptr<xla::PjRtExecutable> pjrt_executable,
+    std::vector<DType> output_dtypes, std::vector<Shape> output_shapes,
+    std::optional<std::vector<xla::HloSharding>> output_hlo_shardings,
+    std::vector<absl::string_view> output_memory_kinds,
+    std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
+        output_layouts)
+    : pjrt_executable_(std::move(pjrt_executable)),
+      output_dtypes_(std::move(output_dtypes)),
+      output_shapes_(std::move(output_shapes)),
+      output_hlo_shardings_(std::move(output_hlo_shardings)),
+      output_memory_kinds_(std::move(output_memory_kinds)),
+      output_layouts_(std::move(output_layouts)) {}
+
 absl::StatusOr<std::optional<std::string>> PjRtExecutable::Fingerprint() const {
   DCHECK(this);
   return pjrt_executable_->FingerprintExecutable();
@@ -507,22 +570,6 @@ absl::StatusOr<LoadedExecutableRef> PjRtLoadedExecutable::Create(
       std::move(output_layouts)));
 }
 
-static absl::StatusOr<std::vector<xla::Shape>> ResultShapesOfModule(
-    mlir::ModuleOp module) {
-  auto main = module.lookupSymbol<mlir::func::FuncOp>("main");
-  if (!main) {
-    return InvalidArgument("MLIR module has no main function");
-  }
-  auto type = main.getFunctionType();
-  std::vector<xla::Shape> result_shapes;
-  result_shapes.reserve(type.getNumResults());
-  for (unsigned i = 0; i < type.getNumResults(); ++i) {
-    auto result_type = type.getResult(i);
-    result_shapes.push_back(xla::TypeToShape(result_type));
-  }
-  return result_shapes;
-}
-
 absl::StatusOr<LoadedExecutableRef> PjRtLoadedExecutable::Create(
     PjRtClient* client, mlir::ModuleOp module,
     xla::CompileOptions compile_options,
diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
index 71a3fb944ca0be..976a905b7afefc 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_executable.h
@@ -23,7 +23,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "absl/base/attributes.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/log/check.h"
 #include "absl/status/status.h"
@@ -32,6 +31,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "llvm/Support/ExtensibleRTTI.h"
 #include "mlir/IR/BuiltinOps.h"
+#include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
@@ -175,10 +175,23 @@ class PjRtExecutable final
   static char ID;  // NOLINT
 
  protected:
-  explicit PjRtExecutable(std::shared_ptr<xla::PjRtExecutable> pjrt_executable)
-      : pjrt_executable_(std::move(pjrt_executable)) {}
+  PjRtExecutable(
+      std::shared_ptr<xla::PjRtExecutable> pjrt_executable,
+      std::vector<DType> output_dtypes, std::vector<Shape> output_shapes,
+      std::optional<std::vector<xla::HloSharding>> output_hlo_shardings,
+      std::vector<absl::string_view> output_memory_kinds,
+      std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
+          output_layouts);
 
   std::shared_ptr<xla::PjRtExecutable> pjrt_executable_;
+
+  // Output array specs.
+  std::vector<DType> output_dtypes_;
+  std::vector<Shape> output_shapes_;
+  std::optional<std::vector<xla::HloSharding>> output_hlo_shardings_;
+  std::vector<absl::string_view> output_memory_kinds_;
+  std::optional<std::vector<std::shared_ptr<const xla::PjRtLayout>>>
+      output_layouts_;
 };
 
 // `LoadedExecutable` implementation that wraps a `xla::PjRtLoadedExecutable`.

From 6d3c0f702ff1a90769541228ac10ba1fa5774aa8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 14:45:49 -0800
Subject: [PATCH 470/753] [stream_executor:cuda] Use
 Nccl/NvshmemMemoryAllocator to allocate collective memory

It is a layering violation to depend from SE to XLA:GPU collectives. All memory allocations should be done via correct se::MemoryAllocator instances. Prepare for removing memory allocation APIs from GPU collectives.

PiperOrigin-RevId: 845925468
---
 .../xla/xla/stream_executor/cuda/BUILD        | 63 +++++++++++++--
 .../xla/stream_executor/cuda/cuda_executor.cc | 77 +++----------------
 .../cuda/nvshmem_memory_allocator_stub.cc     | 29 +++++++
 3 files changed, 99 insertions(+), 70 deletions(-)
 create mode 100644 third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index f2126b3e9ad1ba..622c4fa354e72f 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1030,13 +1030,67 @@ cc_library(
 )
 
 cc_library(
-    name = "nvshmem_memory_allocator",
-    srcs = ["nvshmem_memory_allocator.cc"],
+    name = "nvshmem_memory_allocator_if_builtin_used",
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = select({
+        "//xla/stream_executor/cuda:no_builtin_used": [
+            ":nvshmem_memory_allocator_stub",
+        ],
+        "//conditions:default": [":nvshmem_memory_allocator"],
+    }),
+)
+
+cc_library(
+    name = "nvshmem_memory_allocator_if_supported",
     hdrs = ["nvshmem_memory_allocator.h"],
     tags = [
         "cuda-only",
         "gpu",
     ],
+    deps = select({
+        "//xla/stream_executor/cuda:nvshmem_supported": [
+            ":nvshmem_memory_allocator_if_builtin_used",
+        ],
+        "//conditions:default": [":nvshmem_memory_allocator_stub"],
+    }) + [
+        "//xla/stream_executor:memory_allocation",
+        "//xla/stream_executor:memory_allocator",
+        "@com_google_absl//absl/status:statusor",
+    ],
+)
+
+# Used when NVSHMEM is not linked or can't be used.
+cc_library(
+    name = "nvshmem_memory_allocator_stub",
+    srcs = [
+        "nvshmem_memory_allocator.h",
+        "nvshmem_memory_allocator_stub.cc",
+    ],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
+    deps = [
+        "//xla/stream_executor:memory_allocation",
+        "//xla/stream_executor:memory_allocator",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+    ],
+)
+
+cc_library(
+    name = "nvshmem_memory_allocator",
+    srcs = [
+        "nvshmem_memory_allocator.cc",
+        "nvshmem_memory_allocator.h",
+    ],
+    tags = [
+        "cuda-only",
+        "gpu",
+    ],
     deps = [
         ":cuda_memory_allocator",
         ":nvshmem",
@@ -1267,11 +1321,10 @@ cc_library(
         ":cuda_timer",
         ":cuda_version_parser",
         ":cudnn_api_wrappers",
+        ":nccl_memory_allocator",
+        ":nvshmem_memory_allocator_if_supported",
         ":tma_util",
         "//xla:util",
-        "//xla/backends/gpu/collectives:gpu_collectives",
-        "//xla/core/collectives",
-        "//xla/core/collectives:collectives_registry",
         "//xla/stream_executor:activate_context",
         "//xla/stream_executor:blas",
         "//xla/stream_executor:command_buffer",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 32d6ef67a058dd..7aa0c84b67ce53 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -51,9 +51,6 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/gpus/cuda/include/driver_types.h"
 #include "third_party/gpus/cuda/nvml/include/nvml.h"
-#include "xla/backends/gpu/collectives/gpu_collectives.h"
-#include "xla/core/collectives/collectives.h"
-#include "xla/core/collectives/collectives_registry.h"
 #include "xla/stream_executor/activate_context.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/command_buffer.h"
@@ -69,6 +66,8 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_timer.h"
 #include "xla/stream_executor/cuda/cuda_version_parser.h"
 #include "xla/stream_executor/cuda/cudnn_api_wrappers.h"
+#include "xla/stream_executor/cuda/nccl_memory_allocator.h"
+#include "xla/stream_executor/cuda/nvshmem_memory_allocator.h"
 #include "xla/stream_executor/cuda/tma_util.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_description.h"
@@ -841,14 +840,6 @@ CudaExecutor::~CudaExecutor() {
   CHECK(gpu_binary_to_module_.empty()) << "CudaExecutor has loaded modules.";
 }
 
-absl::StatusOr<xla::gpu::GpuCollectives*> GetGpuCollectives(
-    StreamExecutor* executor) {
-  std::unique_ptr<ActivateContext> activation = executor->Activate();
-  TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
-                      xla::CollectivesRegistry::Default("gpu"));
-  return tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
-}
-
 CudaExecutor::VmmMemoryHandle::~VmmMemoryHandle() { CHECK_OK(Release()); }
 
 absl::Status CudaExecutor::VmmMemoryHandle::Release() {
@@ -979,27 +970,6 @@ absl::StatusOr<bool> CudaExecutor::VmmDeallocateMemory(void* ptr) {
   return true;
 }
 
-absl::StatusOr<void*> CollectiveMemoryAllocate(StreamExecutor* executor,
-                                               uint64_t bytes) {
-  if (bytes == 0) {
-    return nullptr;
-  }
-
-  std::unique_ptr<ActivateContext> activation = executor->Activate();
-  TF_ASSIGN_OR_RETURN(xla::gpu::GpuCollectives * gpu_collectives,
-                      GetGpuCollectives(executor));
-  return gpu_collectives->Allocate(bytes);
-}
-
-absl::Status CollectiveMemoryDeallocate(StreamExecutor* executor,
-                                        void* location) {
-  std::unique_ptr<ActivateContext> activation = executor->Activate();
-
-  TF_ASSIGN_OR_RETURN(xla::gpu::GpuCollectives * gpu_collectives,
-                      GetGpuCollectives(executor));
-  return gpu_collectives->Deallocate(location);
-}
-
 absl::StatusOr<std::unique_ptr<MemoryAllocator>>
 CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
   if (type == MemorySpace::kUnified) {
@@ -1035,28 +1005,16 @@ CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
   }
 
   if (type == MemorySpace::kCollective) {
-    // TODO(469289220): Use NCCL/NVSHMEM memory allocator here instead.
-    return std::make_unique<GenericMemoryAllocator>(
-        [this](uint64_t size)
-            -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
-          TF_ASSIGN_OR_RETURN(void* ptr, CollectiveMemoryAllocate(this, size));
-          XLA_VLOG_DEVICE(2, device_ordinal())
-              << "allocated " << ptr << " for context " << cuda_context_
-              << " of " << size << " bytes of collective memory";
-          return std::make_unique<GenericMemoryAllocation>(
-              ptr, size, [this](void* location, uint64_t size) {
-                auto status = CollectiveMemoryDeallocate(this, location);
-                if (!status.ok()) {
-                  XLA_LOG_DEVICE(ERROR, device_ordinal())
-                      << "failed to free collective memory at " << location
-                      << "; result: " << status;
-                } else {
-                  XLA_VLOG_DEVICE(2, device_ordinal())
-                      << "deallocated collective memory at " << location
-                      << " for context " << cuda_context_;
-                }
-              });
-        });
+    switch (collective_allocator_type_) {
+      case CollectiveAllocatorType::kNvshmem:
+        return std::make_unique<NvshmemMemoryAllocator>();
+      case CollectiveAllocatorType::kNccl:
+        return std::make_unique<NcclMemoryAllocator>(this);
+      default:
+        return absl::UnimplementedError(
+            absl::StrCat("Unsupported collective allocator type: ",
+                         collective_allocator_type_));
+    }
   }
 
   if (type == MemorySpace::kHost) {
@@ -1405,17 +1363,6 @@ DeviceAddressBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
       << "CudaExecutor::Allocate size: " << size
       << " memory_space: " << memory_space;
 
-  if (memory_space == static_cast<int64_t>(MemorySpace::kCollective)) {
-    auto result = CollectiveMemoryAllocate(this, size);
-    if (!result.ok()) {
-      XLA_LOG_DEVICE(ERROR, device_ordinal())
-          << "CudaExecutor::Allocate returns " << result.value();
-    }
-    XLA_VLOG_DEVICE(1, device_ordinal())
-        << "CudaExecutor::Allocate returns " << result.value();
-    return DeviceAddressBase(result.value(), size);
-  }
-
   if (memory_space == static_cast<int64_t>(MemorySpace::kHost)) {
     auto result = HostAllocate(cuda_context_, numa_node_, size);
     if (!result.ok()) {
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc
new file mode 100644
index 00000000000000..d4d124b89af8db
--- /dev/null
+++ b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc
@@ -0,0 +1,29 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <memory>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/stream_executor/cuda/nvshmem_memory_allocator.h"
+#include "xla/stream_executor/memory_allocation.h"
+
+namespace stream_executor::gpu {
+absl::StatusOr<std::unique_ptr<MemoryAllocation>>
+NvshmemMemoryAllocator::Allocate(uint64_t size) {
+  return absl::UnimplementedError("NVSHMEM is not supported on this platform.");
+}
+}  // namespace stream_executor::gpu

From 5c0a168ea10caf7fbae3da7ea64deff6072ee11d Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Wed, 17 Dec 2025 15:13:44 -0800
Subject: [PATCH 471/753] Fix HloRunnerPjRt incorrectly not re-tupling results
 for replicated execution.

PiperOrigin-RevId: 845936810
---
 third_party/xla/xla/service/hlo_runner_pjrt.cc | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index 78404dade0960f..ab2d8958683683 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -699,6 +699,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
   std::vector<PjRtDevice*> replica_devices(options.num_devices, nullptr);
   std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> argument_buffer_slices;
   argument_buffer_slices.reserve(options.num_devices);
+  std::vector<bool> is_tuple_result(options.num_devices, false);
   for (int64_t i = 0; i < options.num_devices; ++i) {
     // Amortize device lookup.
     TF_ASSIGN_OR_RETURN(PjRtDevice* const device_ptr,
@@ -711,6 +712,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
     TF_ASSIGN_OR_RETURN(const HloModule* const module,
                         HloModuleFromWrapped(wrapped_executable));
     const ComputationLayout& ecl = module->entry_computation_layout();
+    is_tuple_result[i] = ecl.result_shape().IsTuple();
 
     // Transfer literals to device.
     const int64_t argument_count = argument_count_provider(i);
@@ -807,9 +809,9 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
   std::vector<Literal> result_literals;
   result_literals.reserve(options.num_devices);
   for (int64_t i = 0; i < options.num_devices; ++i) {
-    TF_ASSIGN_OR_RETURN(Literal literal,
-                        TransferLiteralsFromDevice(
-                            result_buffers[i], result_buffers[i].size() != 1));
+    TF_ASSIGN_OR_RETURN(
+        Literal literal,
+        TransferLiteralsFromDevice(result_buffers[i], is_tuple_result[i]));
     result_literals.push_back(std::move(literal));
   }
 

From e1f8fd4ccb9fe4d467f41eceaf62b84e5cf2594d Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Wed, 17 Dec 2025 15:21:48 -0800
Subject: [PATCH 472/753] Add Google-specific signal handling.

PiperOrigin-RevId: 845939788
---
 .../preemption/preemption_notifier.cc         | 14 +++++++++
 .../preemption/preemption_notifier_test.cc    | 31 +++++++++++++++----
 2 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.cc
index f09feeb05fcb03..e25a85a40e4d38 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.cc
@@ -28,6 +28,10 @@ limitations under the License.
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
 #include "xla/tsl/platform/env.h"
+#if defined(PLATFORM_GOOGLE)
+#include "thread/executor.h"
+#include "thread/signal.h"
+#endif
 
 namespace xla {
 
@@ -53,7 +57,17 @@ class SigtermNotifier : public PreemptionNotifier {
 SigtermNotifier::SigtermNotifier(tsl::Env* env) : PreemptionNotifier(env) {
   sigterm_received.store(false);
   StartListenerThread();
+#if defined(PLATFORM_GOOGLE)
+  thread::signal::Token unused_token;
+
+  thread::signal::AddHandler(
+      SIGTERM, thread::Executor::DefaultExecutor(),
+      []() { sigterm_received.store(true); },
+      /*flags=*/0,  // Don't override existing signal handlers.
+      &unused_token);
+#else
   std::signal(SIGTERM, [](int signal) { sigterm_received.store(true); });
+#endif
 }
 
 void SigtermNotifier::StartListenerThread() {
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc
index 7bf8216b338ef5..3a012e7632c776 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc
@@ -27,11 +27,30 @@ limitations under the License.
 #include "xla/tsl/platform/status.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
+#if defined(PLATFORM_GOOGLE)
+#include "thread/executor.h"
+#include "thread/signal.h"
+#endif
 
 namespace xla {
 namespace {
 
-TEST(PreemptNotifierTest, WillBePreemptedAt) {
+class PreemptNotifierTest : public ::testing::Test {
+ public:
+  PreemptNotifierTest() {
+#if defined(PLATFORM_GOOGLE)
+    // Override default test SIGTERM handler so that test does not exit
+    // prematurely.
+    thread::signal::Token unused_token;
+
+    thread::signal::AddHandler(
+        SIGTERM, thread::Executor::DefaultExecutor(), []() {},
+        thread::signal::kOverrideDefault, &unused_token);
+#endif
+  }
+};
+
+TEST_F(PreemptNotifierTest, WillBePreemptedAt) {
   auto env = tsl::Env::Default();
   std::unique_ptr<PreemptionNotifier> preempt_notifier =
       PreemptionNotifier::CreatePreemptionNotifier("sigterm", env);
@@ -52,8 +71,8 @@ TEST(PreemptNotifierTest, WillBePreemptedAt) {
   EXPECT_LT(time_diff, absl::Seconds(3));
 }
 
-TEST(PreemptNotifierTest,
-     WillBePreemptedAt_AlreadyPreempted_ReturnsImmediately) {
+TEST_F(PreemptNotifierTest,
+       WillBePreemptedAt_AlreadyPreempted_ReturnsImmediately) {
   auto env = tsl::Env::Default();
   std::unique_ptr<PreemptionNotifier> preempt_notifier =
       PreemptionNotifier::CreatePreemptionNotifier("sigterm", env);
@@ -78,7 +97,7 @@ TEST(PreemptNotifierTest,
   EXPECT_LT(time_diff, absl::Seconds(2));
 }
 
-TEST(PreemptNotifierTest, WillBePreemptedAtAsync_SameResultForAllCallbacks) {
+TEST_F(PreemptNotifierTest, WillBePreemptedAtAsync_SameResultForAllCallbacks) {
   auto env = tsl::Env::Default();
   std::unique_ptr<PreemptionNotifier> preempt_notifier =
       PreemptionNotifier::CreatePreemptionNotifier("sigterm", env);
@@ -109,7 +128,7 @@ TEST(PreemptNotifierTest, WillBePreemptedAtAsync_SameResultForAllCallbacks) {
   EXPECT_EQ(preempt_time.value(), preempt_time_2.value());
 }
 
-TEST(PreemptNotifierTest, Reset_TwoDifferentPreemptTimesRecorded) {
+TEST_F(PreemptNotifierTest, Reset_TwoDifferentPreemptTimesRecorded) {
   auto env = tsl::Env::Default();
   std::unique_ptr<PreemptionNotifier> preempt_notifier =
       PreemptionNotifier::CreatePreemptionNotifier("sigterm", env);
@@ -131,7 +150,7 @@ TEST(PreemptNotifierTest, Reset_TwoDifferentPreemptTimesRecorded) {
   EXPECT_NE(preempt_time, preempt_time_2);
 }
 
-TEST(PreemptNotifierTest, DestructorCancelsPendingCalls) {
+TEST_F(PreemptNotifierTest, DestructorCancelsPendingCalls) {
   auto env = tsl::Env::Default();
   std::unique_ptr<PreemptionNotifier> preempt_notifier =
       PreemptionNotifier::CreatePreemptionNotifier("sigterm", env);

From 9f44d0753c701e66f0484c39f934650531926aa1 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Wed, 17 Dec 2025 15:42:44 -0800
Subject: [PATCH 473/753] Export `sdy.replicated_to_unreduced` to a manual
 computation.

The `sdy.replicated_to_unreduced` operation is lowered to a `sdy.manual_computation`. The manual computation checks if the current device is the first one in the partition.
* If yes, the input tensor is used
* Otherwise, a tensor of zeros is produced.

PiperOrigin-RevId: 845947531
---
 .../export_manual_reduction_collectives.cc    | 91 +++++++++++++++----
 .../export_manual_reduction_collectives.h     |  9 +-
 ...o_export_manual_reduction_collectives.mlir | 35 +++++++
 3 files changed, 110 insertions(+), 25 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
index c0f7e612472574..edb5c3b54457df 100644
--- a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
+++ b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <cstdint>
 #include <functional>
 #include <memory>
+#include <utility>
 
 #include "absl/log/check.h"
 #include "absl/log/log.h"
@@ -259,6 +260,25 @@ int64_t convertReduceScatter(sdy::ReduceScatterOp op, int64_t nextChannelId,
   return nextChannelId;
 }
 
+std::pair<llvm::StringMap<Value>, llvm::StringMap<Value>>
+getAxesCoordinateAndSize(OpBuilder& builder, mlir::Location loc,
+                         MeshAttr mesh) {
+  Value partitionId = stablehlo::PartitionIdOp::create(builder, loc);
+  Value currentRem = stablehlo::ConvertOp::create(
+      builder, loc, RankedTensorType::get({}, builder.getIntegerType(32)),
+      partitionId);
+  llvm::StringMap<Value> axisSizes, axisCoordinates;
+  for (sdy::MeshAxisAttr axis : llvm::reverse(mesh.getAxes())) {
+    Value axisSize = stablehlo::ConstantOp::create(
+        builder, loc, builder.getI32IntegerAttr(axis.getSize()));
+    axisSizes[axis.getName()] = axisSize;
+    axisCoordinates[axis.getName()] =
+        stablehlo::RemOp::create(builder, loc, currentRem, axisSize);
+    currentRem = stablehlo::DivOp::create(builder, loc, currentRem, axisSize);
+  }
+  return {axisCoordinates, axisSizes};
+}
+
 void convertShardedToUnreduced(sdy::ShardedToUnreducedOp op,
                                mlir::IRRewriter& rewriter) {
   TensorShardingAttr outSharding = op.getOutSharding();
@@ -289,23 +309,8 @@ void convertShardedToUnreduced(sdy::ShardedToUnreducedOp op,
         Value broadcast = stablehlo::BroadcastOp::create(
             blockBuilder, loc, outputType, zero, outputType.getShape());
 
-        // Decompose partitionId into axis coordinates.
-        Value partitionId = stablehlo::PartitionIdOp::create(blockBuilder, loc);
-        Value currentRem = stablehlo::ConvertOp::create(
-            blockBuilder, loc,
-            RankedTensorType::get({}, blockBuilder.getIntegerType(32)),
-            partitionId);
-        llvm::StringMap<Value> axisSizes, axisCoordinates;
-        for (sdy::MeshAxisAttr axis : llvm::reverse(mesh.getAxes())) {
-          Value axisSize = stablehlo::ConstantOp::create(
-              blockBuilder, loc,
-              blockBuilder.getI32IntegerAttr(axis.getSize()));
-          axisSizes[axis.getName()] = axisSize;
-          axisCoordinates[axis.getName()] =
-              stablehlo::RemOp::create(blockBuilder, loc, currentRem, axisSize);
-          currentRem =
-              stablehlo::DivOp::create(blockBuilder, loc, currentRem, axisSize);
-        }
+        auto [axisCoordinates, axisSizes] =
+            getAxesCoordinateAndSize(blockBuilder, loc, mesh);
 
         SmallVector<Value> offsets;
         offsets.reserve(outputType.getRank());
@@ -348,6 +353,48 @@ void convertShardedToUnreduced(sdy::ShardedToUnreducedOp op,
   rewriter.replaceOp(op, manualComputation);
 }
 
+void convertReplicatedToUnreduced(sdy::ReplicatedToUnreducedOp op,
+                                  mlir::IRRewriter& rewriter) {
+  TensorShardingAttr outSharding = op.getOutSharding();
+  MeshAttr mesh = outSharding.getMesh(op);
+
+  mlir::Location loc = op.getLoc();
+  rewriter.setInsertionPoint(op);
+
+  ManualComputationOp manualComputation = createFullyManualComputation(
+      loc, op.getTensor(), outSharding, mesh, rewriter,
+      [&](mlir::BlockArgument arg, OpBuilder& blockBuilder) {
+        auto [axisCoordinates, axisSizes] =
+            getAxesCoordinateAndSize(blockBuilder, loc, mesh);
+        (void)axisSizes;
+
+        Value i32Zero = stablehlo::ConstantOp::create(
+            blockBuilder, loc, blockBuilder.getI32IntegerAttr(0));
+        Value pred = nullptr;
+        for (AxisRefAttr axis : op.getAxes()) {
+          CHECK(!axis.getSubAxisInfo()) << "Sub-axes not supported in "
+                                           "ReplicatedToUnreducedOp.";
+          Value coord = axisCoordinates[axis.getName()];
+          Value isZero =
+              stablehlo::CompareOp::create(blockBuilder, loc, coord, i32Zero,
+                                           stablehlo::ComparisonDirection::EQ);
+          pred = pred
+                     ? stablehlo::AndOp::create(blockBuilder, loc, pred, isZero)
+                     : isZero;
+        }
+        CHECK(pred != nullptr) << "No replicated-to-unreduced axes.";
+
+        RankedTensorType type = mlir::cast<RankedTensorType>(arg.getType());
+        Value zeroVal = stablehlo::ConstantOp::create(
+            blockBuilder, loc, blockBuilder.getZeroAttr(type.getElementType()));
+        Value zeroBroadcast = stablehlo::BroadcastOp::create(
+            blockBuilder, loc, type, zeroVal, type.getShape());
+        return stablehlo::SelectOp::create(blockBuilder, loc, pred, arg,
+                                           zeroBroadcast);
+      });
+  rewriter.replaceOp(op, manualComputation);
+}
+
 void syncInOutUnreducedAxes(mlir::Operation* op) {
   Value input = op->getOperand(0);
   TensorShardingAttr outSharding = sdy::getSharding(op->getResult(0));
@@ -415,6 +462,9 @@ class StablehloExportManualReductionCollectivesPass
       } else if (auto shardedToUnreduced =
                      mlir::dyn_cast<sdy::ShardedToUnreducedOp>(op)) {
         convertShardedToUnreduced(shardedToUnreduced, rewriter);
+      } else if (auto replicatedToUnreduced =
+                     mlir::dyn_cast<sdy::ReplicatedToUnreducedOp>(op)) {
+        convertReplicatedToUnreduced(replicatedToUnreduced, rewriter);
       }
     });
   }
@@ -424,9 +474,10 @@ class StablehloExportManualReductionCollectivesPass
   }
 
   StringRef getDescription() const override {
-    return "Exports `sdy.all_reduce`, that originate from user defined "
-           "shardings with unreduced axes, to `stablehlo.all_reduce` inside a "
-           "fully manual `sdy.manual_computation`";
+    return "Exports `sdy.all_reduce`, `sdy.reduce_scatter`, "
+           "`sdy.sharded_to_unreduced` and `sdy.replicated_to_unreduced` that "
+           "originate from user-defined shardings with unreduced axes. The "
+           "exported ops are inside a full manual `sdy.manual_computation`.";
   }
 
   void getDependentDialects(mlir::DialectRegistry& registry) const final {
diff --git a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.h b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.h
index 98575deebb9f3c..be473a371ca545 100644
--- a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.h
+++ b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.h
@@ -23,11 +23,10 @@ limitations under the License.
 namespace xla {
 namespace sdy {
 
-// TODO(tomnatan): mention reduce-scatter and how collectives are marked.
-// TODO(tomnatan): mention if the shard map is fully manual or not.
-
-// Exports `sdy.all_reduce`, that originate from user defined shardings with
-// unreduced axes, to `stablehlo.all_reduce` inside an `sdy.manual_computation`.
+// Exports `sdy.all_reduce`, `sdy.reduce_scatter`, `sdy.sharded_to_unreduced`
+// and `sdy.replicated_to_unreduced` that originate from user-defined shardings
+// with unreduced axes. The exported ops are inside a full manual
+// `sdy.manual_computation`.
 std::unique_ptr<mlir::Pass>
 createStablehloExportManualReductionCollectivesPass();
 
diff --git a/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir b/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
index 288dff8d476c67..be4e2a5243a255 100644
--- a/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
+++ b/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
@@ -371,3 +371,38 @@ func.func @sharded_to_unreduced(%arg0: tensor<16x16xf32> {sdy.sharding = #sdy.sh
   %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {"y"}], unreduced={"x"}> : tensor<16x16xf32>
   return %0 : tensor<16x16xf32>
 }
+
+// -----
+
+sdy.mesh @mesh = <["x"=4, "y"=2, "z"=3]>
+
+// CHECK-LABEL: func @replicated_to_unreduced
+func.func @replicated_to_unreduced(%arg0: tensor<16x16xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"y"}>}) -> tensor<16x16xf32> {
+  // CHECK-NEXT: %[[MANUAL_COMP:.*]] = sdy.manual_computation(%arg0)
+  // CHECK-SAME:     in_shardings=[<@mesh, [{}, {}], unreduced={"y"}>]
+  // CHECK-SAME:     out_shardings=[<@mesh, [{}, {}], unreduced={"x", "y", "z"}>]
+  // CHECK-SAME:     manual_axes={"x", "y", "z"} (%arg1: tensor<16x16xf32>) {
+  // CHECK-NEXT:   %[[PID:.*]] = stablehlo.partition_id : tensor<ui32>
+  // CHECK-NEXT:   %[[PID_I32:.*]] = stablehlo.convert %[[PID]] : (tensor<ui32>) -> tensor<i32>
+  // CHECK-NEXT:   %[[C3:.*]] = stablehlo.constant dense<3> : tensor<i32>
+  // CHECK-NEXT:   %[[REM_Z:.*]] = stablehlo.remainder %[[PID_I32]], %[[C3]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV_Z:.*]] = stablehlo.divide %[[PID_I32]], %[[C3]] : tensor<i32>
+  // CHECK-NEXT:   %[[C2:.*]] = stablehlo.constant dense<2> : tensor<i32>
+  // CHECK-NEXT:   %[[REM_Y:.*]] = stablehlo.remainder %[[DIV_Z]], %[[C2]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV_Y:.*]] = stablehlo.divide %[[DIV_Z]], %[[C2]] : tensor<i32>
+  // CHECK-NEXT:   %[[C4:.*]] = stablehlo.constant dense<4> : tensor<i32>
+  // CHECK-NEXT:   %[[REM_X:.*]] = stablehlo.remainder %[[DIV_Y]], %[[C4]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV_X:.*]] = stablehlo.divide %[[DIV_Y]], %[[C4]] : tensor<i32>
+  // CHECK-NEXT:   %[[C0:.*]] = stablehlo.constant dense<0> : tensor<i32>
+  // CHECK-NEXT:   %[[CMP_X:.*]] = stablehlo.compare  EQ, %[[REM_X]], %[[C0]] : (tensor<i32>, tensor<i32>) -> tensor<i1>
+  // CHECK-NEXT:   %[[CMP_Z:.*]] = stablehlo.compare  EQ, %[[REM_Z]], %[[C0]] : (tensor<i32>, tensor<i32>) -> tensor<i1>
+  // CHECK-NEXT:   %[[PRED:.*]] = stablehlo.and %[[CMP_X]], %[[CMP_Z]] : tensor<i1>
+  // CHECK-NEXT:   %[[ZERO:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK-NEXT:   %[[ZERO_BCAST:.*]] = stablehlo.broadcast %[[ZERO]], sizes = [16, 16] : (tensor<f32>) -> tensor<16x16xf32>
+  // CHECK-NEXT:   %[[SELECT:.*]] = stablehlo.select %[[PRED]], %arg1, %[[ZERO_BCAST]] : tensor<i1>, tensor<16x16xf32>
+  // CHECK-NEXT:   sdy.return %[[SELECT]] : tensor<16x16xf32>
+  // CHECK-NEXT: } : (tensor<16x16xf32>) -> tensor<16x16xf32>
+  // CHECK-NEXT: return %[[MANUAL_COMP]] : tensor<16x16xf32>
+  %0 = sdy.replicated_to_unreduced {"x", "z"} %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<16x16xf32>
+  return %0 : tensor<16x16xf32>
+}

From 3015ca53fe15b4ccb9b61db19ff7895533bfcb88 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Wed, 17 Dec 2025 16:43:33 -0800
Subject: [PATCH 474/753] Add proto serialization for AllReduceStartThunk

PiperOrigin-RevId: 845967844
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  20 +++
 .../backends/gpu/runtime/all_reduce_thunk.cc  | 131 +++++++++++++++++-
 .../backends/gpu/runtime/all_reduce_thunk.h   |  23 ++-
 .../gpu/runtime/all_reduce_thunk_test.cc      |  74 ++++++++++
 .../gpu/runtime/collective_kernel_thunk.h     |   9 ++
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  22 +++
 .../runtime/thunk_proto_deserialization.cc    |   5 +
 7 files changed, 279 insertions(+), 5 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index f0aad60b990ab0..c9221c3ffb8ce3 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1400,7 +1400,9 @@ cc_library(
         "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/backends/gpu/collectives:gpu_communicator",
         "//xla/core/collectives:communicator",
+        "//xla/core/collectives:reduction_kind",
         "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
         "//xla/service:collective_ops_utils",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu/transforms/collectives:collective_ops_utils",
@@ -1409,15 +1411,32 @@ cc_library(
         "//xla/stream_executor:stream_executor_h",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:casts",
     ],
 )
 
+xla_cc_test(
+    name = "all_reduce_thunk_test",
+    srcs = ["all_reduce_thunk_test.cc"],
+    deps = [
+        ":all_reduce_thunk",
+        ":collective_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla/service:buffer_assignment",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "all_to_all_thunk",
     srcs = ["all_to_all_thunk.cc"],
@@ -2864,6 +2883,7 @@ cc_library(
     hdrs = ["thunk_proto_deserialization.h"],
     deps = [
         ":all_gather_thunk",
+        ":all_reduce_thunk",
         ":collective_thunk",
         ":conditional_thunk",
         ":convolution_reorder_thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
index c8efc4c3c933cd..4347670507e220 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
@@ -22,7 +22,9 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/collectives/gpu_communicator.h"
@@ -30,9 +32,11 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/future.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/transforms/collectives/collective_ops_utils.h"
@@ -46,11 +50,42 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
+ReductionKindProto ToReductionKindProto(ReductionKind kind) {
+  switch (kind) {
+    case ReductionKind::SUM:
+      return REDUCTION_KIND_SUM;
+    case ReductionKind::PRODUCT:
+      return REDUCTION_KIND_PRODUCT;
+    case ReductionKind::MIN:
+      return REDUCTION_KIND_MIN;
+    case ReductionKind::MAX:
+      return REDUCTION_KIND_MAX;
+  }
+}
+
+absl::StatusOr<ReductionKind> FromReductionKindProto(
+    const ReductionKindProto& proto) {
+  switch (proto) {
+    case REDUCTION_KIND_SUM:
+      return ReductionKind::SUM;
+    case REDUCTION_KIND_PRODUCT:
+      return ReductionKind::PRODUCT;
+    case REDUCTION_KIND_MIN:
+      return ReductionKind::MIN;
+    case REDUCTION_KIND_MAX:
+      return ReductionKind::MAX;
+    default:
+      return absl::InvalidArgumentError(
+          absl::StrCat("Unknown ReductionKindProto: ", proto));
+  }
+}
+
 absl::Status CheckImplementableInst(const HloInstruction* inst,
                                     Thunk::Kind reduction_op) {
   for (HloInstruction* operand : inst->operands()) {
@@ -120,14 +155,36 @@ AllReduceReduceScatterThunkBase::AllReduceReduceScatterThunkBase(
   CHECK_EQ(config_.config.operand_element_type.size(), buffers_.size());
 }
 
+AllReduceReduceScatterThunkBase::AllReduceReduceScatterThunkBase(
+    Thunk::Kind kind, ThunkInfo thunk_info, AllReduceConfig config,
+    std::vector<Buffer> buffers,
+    std::shared_ptr<CollectiveThunk::AsyncEvents> async_events)
+    : CollectiveThunk(kind, thunk_info, async_events,
+                      AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE),
+      config_(std::move(config)),
+      buffers_(std::move(buffers)) {
+  CHECK_EQ(config_.config.operand_element_type.size(), buffers_.size());
+}
+
 AllReduceStartThunk::AllReduceStartThunk(
     ThunkInfo thunk_info, const HloAllReduceInstruction* inst,
     std::vector<Buffer> buffers,
     std::unique_ptr<CollectiveKernelThunk> collective_kernel_thunk,
     bool p2p_memcpy_enabled)
-    : AllReduceReduceScatterThunkBase(
-          Thunk::kAllReduceStart, thunk_info, GetAllReduceConfigInst(inst),
-          std::move(buffers), IsGPUSyncCollective(*inst)),
+    : AllReduceStartThunk(
+          thunk_info, GetAllReduceConfigInst(inst), std::move(buffers),
+          std::move(collective_kernel_thunk),
+          IsGPUSyncCollective(*inst)
+              ? nullptr
+              : std::make_shared<CollectiveThunk::AsyncEvents>()) {}
+
+AllReduceStartThunk::AllReduceStartThunk(
+    ThunkInfo thunk_info, const AllReduceConfig& config,
+    std::vector<Buffer> buffers,
+    std::unique_ptr<CollectiveKernelThunk> collective_kernel_thunk,
+    std::shared_ptr<CollectiveThunk::AsyncEvents> async_events)
+    : AllReduceReduceScatterThunkBase(Thunk::kAllReduceStart, thunk_info,
+                                      config, std::move(buffers), async_events),
       collective_kernel_thunk_(std::move(collective_kernel_thunk)) {}
 
 absl::Status AllReduceStartThunk::CheckImplementable(
@@ -188,6 +245,74 @@ absl::StatusOr<bool> AllReduceStartThunk::RunCollective(
   return true;
 }
 
+absl::StatusOr<std::unique_ptr<AllReduceStartThunk>>
+AllReduceStartThunk::FromProto(
+    ThunkInfo thunk_info, const AllReduceStartThunkProto& thunk_proto,
+    absl::Span<const BufferAllocation> buffer_allocations,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::vector<CollectiveThunk::Buffer> buffers;
+  buffers.reserve(thunk_proto.buffers_size());
+  for (const CollectiveBufferProto& proto : thunk_proto.buffers()) {
+    ASSIGN_OR_RETURN(
+        CollectiveThunk::Buffer buffer,
+        CollectiveThunk::Buffer::FromProto(proto, buffer_allocations));
+    buffers.push_back(buffer);
+  }
+
+  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
+      async_events_map[AsyncEventsUniqueId{
+          thunk_proto.async_events_unique_id()}];
+  if (!async_events) {
+    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  }
+
+  CollectiveConfig config =
+      CollectiveConfig::FromProto(thunk_proto.collective_config());
+
+  ASSIGN_OR_RETURN(ReductionKind reduction_kind,
+                   FromReductionKindProto(thunk_proto.reduction_kind()));
+
+  auto kernel_thunk = std::make_unique<CollectiveKernelThunk>(
+      thunk_info, config, reduction_kind, thunk_proto.is_async(), buffers,
+      thunk_proto.collective_kernel_enabled(), thunk_proto.kernel_name(),
+      thunk_proto.shmem_bytes(), thunk_proto.is_multimem_enabled());
+
+  return std::make_unique<AllReduceStartThunk>(
+      std::move(thunk_info), AllReduceConfig{config, reduction_kind},
+      std::move(buffers), std::move(kernel_thunk), async_events);
+}
+
+absl::StatusOr<ThunkProto> AllReduceStartThunk::ToProto() const {
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  AllReduceStartThunkProto* thunk_proto =
+      proto.mutable_all_reduce_start_thunk();
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (!async_events_id.has_value()) {
+    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  }
+  thunk_proto->set_async_events_unique_id(async_events_id->value());
+
+  for (const Buffer& buffer : buffers_) {
+    ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
+  }
+
+  *thunk_proto->mutable_collective_config() = config_.config.ToProto();
+  thunk_proto->set_reduction_kind(ToReductionKindProto(config_.reduction_kind));
+
+  thunk_proto->set_is_multimem_enabled(
+      collective_kernel_thunk_->is_multimem_enabled());
+  thunk_proto->set_shmem_bytes(collective_kernel_thunk_->shmem_bytes());
+  thunk_proto->set_kernel_name(collective_kernel_thunk_->kernel_name());
+  thunk_proto->set_collective_kernel_enabled(
+      collective_kernel_thunk_->collective_kernel_enabled());
+  thunk_proto->set_is_async(collective_kernel_thunk_->is_async());
+
+  return proto;
+}
+
 ReduceScatterStartThunk::ReduceScatterStartThunk(
     ThunkInfo thunk_info, const HloReduceScatterInstruction* inst,
     std::vector<Buffer> buffers, bool p2p_memcpy_enabled)
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.h b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.h
index 4825fc672ef669..88856dd6e608d5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.h
@@ -22,12 +22,15 @@ limitations under the License.
 
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/runtime/collective_kernel_thunk.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/core/collectives/communicator.h"
+#include "xla/core/collectives/reduction_kind.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/stream.h"
 
@@ -48,6 +51,10 @@ class AllReduceReduceScatterThunkBase : public CollectiveThunk {
   AllReduceReduceScatterThunkBase(Kind kind, ThunkInfo thunk_info,
                                   AllReduceConfig config,
                                   std::vector<Buffer> buffers, bool is_sync);
+  AllReduceReduceScatterThunkBase(
+      Kind kind, ThunkInfo thunk_info, AllReduceConfig config,
+      std::vector<Buffer> buffers,
+      std::shared_ptr<CollectiveThunk::AsyncEvents> async_events);
 
   const CollectiveConfig& config() const override { return config_.config; }
   ReductionKind reduction_kind() const { return config_.reduction_kind; }
@@ -70,8 +77,13 @@ class AllReduceStartThunk : public AllReduceReduceScatterThunkBase {
       std::vector<Buffer> buffers,
       std::unique_ptr<CollectiveKernelThunk> collective_kernel_thunk,
       bool p2p_memcpy_enabled = false);
+  AllReduceStartThunk(
+      ThunkInfo thunk_info, const AllReduceConfig& config,
+      std::vector<Buffer> buffers,
+      std::unique_ptr<CollectiveKernelThunk> collective_kernel_thunk,
+      std::shared_ptr<CollectiveThunk::AsyncEvents> async_events);
 
-  static const char* GetHloOpName() { return "all-reduce-start"; }
+  static absl::string_view GetHloOpName() { return "all-reduce-start"; }
 
   static absl::Status CheckImplementable(const HloAllReduceInstruction* inst,
                                          int64_t replica_count,
@@ -83,6 +95,13 @@ class AllReduceStartThunk : public AllReduceReduceScatterThunkBase {
   absl::Status Prepare(const PrepareParams& params) override;
   absl::Status Initialize(const InitializeParams& params) override;
 
+  static absl::StatusOr<std::unique_ptr<AllReduceStartThunk>> FromProto(
+      ThunkInfo thunk_info, const AllReduceStartThunkProto& thunk_proto,
+      absl::Span<const BufferAllocation> buffer_allocations,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
  protected:
   absl::StatusOr<bool> RunCollective(const ExecuteParams& params,
                                      const GpuCliqueKey& clique_key,
@@ -104,7 +123,7 @@ class ReduceScatterStartThunk : public AllReduceReduceScatterThunkBase {
                           std::vector<Buffer> buffers,
                           bool p2p_memcpy_enabled = false);
 
-  static const char* GetHloOpName() { return "reduce-scatter-start"; }
+  static absl::string_view GetHloOpName() { return "reduce-scatter-start"; }
 
   static absl::Status CheckImplementable(
       const HloReduceScatterInstruction* inst, int64_t replica_count,
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc
new file mode 100644
index 00000000000000..96c258b30d7623
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/all_reduce_thunk.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "xla/backends/gpu/runtime/collective_thunk.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        all_reduce_start_thunk {
+          async_events_unique_id: 3
+          collective_config {}
+          reduction_kind: 1
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<AllReduceStartThunk> thunk,
+      AllReduceStartThunk::FromProto(thunk_info, proto.all_reduce_start_thunk(),
+                                     buffer_allocations, async_events_map));
+  ASSERT_NE(thunk->async_events(), nullptr);
+
+  ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_all_reduce_start_thunk()->set_async_events_unique_id(
+      round_trip_proto.all_reduce_start_thunk().async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h
index 8476cf19e90f64..2d7de400a4cb1d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk.h
@@ -78,6 +78,15 @@ class CollectiveKernelThunk : public Thunk {
     per_stream_state_.reserve(kMaxNumExecutors);
   }
 
+  bool is_multimem_enabled() const { return is_multimem_enabled_; }
+
+  int32_t shmem_bytes() const { return shmem_bytes_; }
+
+  absl::string_view kernel_name() const { return kernel_name_; }
+
+  bool collective_kernel_enabled() const { return collective_kernel_enabled_; }
+  bool is_async() const { return is_async_; }
+
   // Returns true if the collective kernel is supported for the given clique.
   absl::StatusOr<bool> IsSupported(
       const GpuCliqueKey& clique_key, se::StreamExecutor& executor,
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index c60e3ea5c94e36..eb564b5e4eb376 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -405,6 +405,27 @@ message AllGatherStartThunkProto {
   repeated CollectiveBufferProto buffers = 3;
 }
 
+enum ReductionKindProto {
+  REDUCTION_KIND_UNSPECIFIED = 0;
+  REDUCTION_KIND_SUM = 1;
+  REDUCTION_KIND_PRODUCT = 2;
+  REDUCTION_KIND_MIN = 3;
+  REDUCTION_KIND_MAX = 4;
+}
+
+message AllReduceStartThunkProto {
+  uint64 async_events_unique_id = 1;
+  CollectiveConfigProto collective_config = 2;
+  repeated CollectiveBufferProto buffers = 3;
+
+  ReductionKindProto reduction_kind = 4;
+  bool is_multimem_enabled = 5;
+  int32 shmem_bytes = 6;
+  string kernel_name = 7;
+  bool collective_kernel_enabled = 8;
+  bool is_async = 9;
+}
+
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
@@ -451,6 +472,7 @@ message ThunkProto {
     CustomKernelThunkProto custom_kernel_thunk = 36;
     CollectiveDoneThunkProto collective_done_thunk = 37;
     AllGatherStartThunkProto all_gather_start_thunk = 38;
+    AllReduceStartThunkProto all_reduce_start_thunk = 39;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index 4b1a965a41a449..a4c8bbdd7e9e51 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "google/protobuf/descriptor.h"
 #include "google/protobuf/message.h"
 #include "xla/backends/gpu/runtime/all_gather_thunk.h"
+#include "xla/backends/gpu/runtime/all_reduce_thunk.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_reorder_thunk.h"
@@ -247,6 +248,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return AllGatherStartThunk::FromProto(
           std::move(thunk_info), thunk_proto.all_gather_start_thunk(),
           buffer_allocations, collective_async_events_map);
+    case ThunkProto::kAllReduceStartThunk:
+      return AllReduceStartThunk::FromProto(
+          std::move(thunk_info), thunk_proto.all_reduce_start_thunk(),
+          buffer_allocations, collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);

From 11ec4073f6e6dd69d688130edf5f9b2ac0e3e06b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 16:43:44 -0800
Subject: [PATCH 475/753] Handle cupti hardware trace correctly. Flip to true
 only once.

PiperOrigin-RevId: 845967897
---
 .../xla/backends/profiler/gpu/cupti_tracer.cc | 55 ++++++++++---------
 .../gpu/profile_with_cuda_kernels_test.cc     |  4 ++
 2 files changed, 33 insertions(+), 26 deletions(-)

diff --git a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc
index cb42984230bba6..b7f90db95f1b9c 100644
--- a/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/cupti_tracer.cc
@@ -1077,6 +1077,13 @@ const char* GetCuptiErrorString(CuptiInterface* cupti_interface,
   return err_str;
 }
 
+bool& IsCuptiHardwareEventSystemEnabled() {
+  // This flag can not flip to true once per process. Once enabled, it will stay
+  // enabled until the process is terminated.
+  static bool is_enabled = false;
+  return is_enabled;
+}
+
 }  // namespace
 
 CuptiTracer::CuptiTracer(CuptiInterface* cupti_interface)
@@ -1447,19 +1454,31 @@ absl::Status CuptiTracer::EnableActivityTracing() {
                    << err;
     }
     if (option_->enable_activity_hardware_tracing) {
-      auto err = cupti_interface_->ActivityEnableHWTrace(true);
-      if (err == CUPTI_ERROR_NOT_SUPPORTED) {
-        LOG(INFO)
-            << "CUPTI activity HW trace not enabled due to not supported on "
-               "this platform!";
-      } else if (err != CUPTI_SUCCESS) {
-        LOG(WARNING)
-            << "Fail to enable CUPTI activity HW trace, CUPTI ERROR CODE:"
-            << err << " (" << GetCuptiErrorString(cupti_interface_, err) << ")";
+      if (IsCuptiHardwareEventSystemEnabled()) {
+        LOG(INFO) << "CUPTI activity HW trace already enabled.";
       } else {
-        LOG(INFO) << "CUPTI activity HW trace successfully enabled.";
+        auto err = cupti_interface_->ActivityEnableHWTrace(true);
+        if (err == CUPTI_ERROR_NOT_SUPPORTED) {
+          LOG(INFO)
+              << "CUPTI activity HW trace not enabled due to not supported on "
+                 "this platform!";
+        } else if (err != CUPTI_SUCCESS) {
+          LOG(WARNING)
+              << "Fail to enable CUPTI activity HW trace, CUPTI ERROR CODE:"
+              << err << " (" << GetCuptiErrorString(cupti_interface_, err)
+              << ")";
+        } else {
+          LOG(INFO) << "CUPTI activity HW trace successfully enabled.";
+          IsCuptiHardwareEventSystemEnabled() = true;
+        }
+      }
+    } else {
+      if (IsCuptiHardwareEventSystemEnabled()) {
+        LOG(INFO)
+            << "CUPTI activity HW trace already enabled, continue with it.";
       }
     }
+
     RETURN_IF_CUPTI_ERROR(ActivityRegisterCallbacks(
         RequestCuptiActivityBuffer, ProcessCuptiActivityBuffer));
     VLOG(1) << "Enabling activity tracing for "
@@ -1497,22 +1516,6 @@ absl::Status CuptiTracer::DisableActivityTracing() {
     }
     option_->activities_selected.clear();
 
-    if (option_->enable_activity_hardware_tracing) {
-      auto err = cupti_interface_->ActivityEnableHWTrace(false);
-      // CUPTI_ERROR_NOT_SUPPORTED here is ok as it already handled/logged
-      // in EnableActivityTracing.
-      if (err == CUPTI_ERROR_NOT_SUPPORTED) {
-        LOG(INFO) << "CUPTI activity HW trace not disabled due to not "
-                     "supported on this platform!";
-      } else if (err != CUPTI_SUCCESS) {
-        LOG(WARNING)
-            << "Fail to disable CUPTI activity HW trace, CUPTI ERROR CODE:"
-            << err << " (" << GetCuptiErrorString(cupti_interface_, err) << ")";
-      } else {
-        LOG(INFO) << "CUPTI activity HW trace successfully disabled.";
-      }
-    }
-
     VLOG(1) << "Flushing CUPTI activity buffer";
     RETURN_IF_CUPTI_ERROR(ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
     LOG(INFO) << "CUPTI activity buffer flushed";
diff --git a/third_party/xla/xla/backends/profiler/gpu/profile_with_cuda_kernels_test.cc b/third_party/xla/xla/backends/profiler/gpu/profile_with_cuda_kernels_test.cc
index 3ca82f9d92161f..29b635c1b23887 100644
--- a/third_party/xla/xla/backends/profiler/gpu/profile_with_cuda_kernels_test.cc
+++ b/third_party/xla/xla/backends/profiler/gpu/profile_with_cuda_kernels_test.cc
@@ -187,6 +187,10 @@ void SimpleAddSubWithProfilerTest(bool enable_activity_hardware_tracing,
   EXPECT_EQ(vec.size(), kNumElements);
   EXPECT_THAT(vec, Each(DistanceFrom(0, Lt(0.001))));
 
+  auto space = std::make_unique<tensorflow::profiler::XSpace>();
+  collector->Export(space.get(), CuptiTracer::GetTimestamp());
+  EXPECT_GE(space->planes_size(), 1);
+
   if (enable_pm_sampling) {
     // Expect 4 * elems / (32 elemn / warp) +- 5% double instructions
     // (if they were sampled)

From 07660093c8b57a104784200b3b590f0e43f9fd0f Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Wed, 17 Dec 2025 16:59:47 -0800
Subject: [PATCH 476/753] Remove unused `WaitForAllTasks` from coordination
 service

PiperOrigin-RevId: 845973213
---
 .../coordination/coordination_client.h        |   6 -
 .../coordination/coordination_service.cc      |  56 ------
 .../coordination/coordination_service.h       |  19 --
 .../coordination_service_agent.cc             |  29 ---
 .../coordination/coordination_service_agent.h |   8 -
 .../coordination_service_agent_test.cc        |  10 -
 .../coordination_service_rpc_handler.cc       |  20 --
 .../coordination_service_rpc_handler.h        |   4 -
 .../coordination/coordination_service_test.cc | 187 ------------------
 .../coordination/grpc_coordination_client.cc  |  12 --
 .../grpc_coordination_service_impl.cc         |   1 -
 .../grpc_coordination_service_impl.h          |   1 -
 12 files changed, 353 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
index b9533934ab206f..39246d62754382 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
@@ -54,8 +54,6 @@ using tensorflow::ShutdownTaskRequest;
 using tensorflow::ShutdownTaskResponse;
 using tensorflow::TryGetKeyValueRequest;
 using tensorflow::TryGetKeyValueResponse;
-using tensorflow::WaitForAllTasksRequest;
-using tensorflow::WaitForAllTasksResponse;
 using tensorflow::WatchJobStateRequest;
 using tensorflow::WatchJobStateResponse;
 
@@ -75,10 +73,6 @@ class CoordinationClient {
                               HeartbeatResponse* response,
                               tsl::StatusCallback done) = 0;
 
-  virtual void WaitForAllTasksAsync(const WaitForAllTasksRequest* request,
-                                    WaitForAllTasksResponse* response,
-                                    tsl::StatusCallback done) = 0;
-
   virtual void ShutdownTaskAsync(tsl::CallOptions* call_opts,
                                  const ShutdownTaskRequest* request,
                                  ShutdownTaskResponse* response,
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
index d524685fc760c1..73fbf4c0e04b07 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
@@ -670,31 +670,6 @@ void CoordinationService::RegisterTaskAsync(const CoordinatedTask& task,
   done(error);
 }
 
-void CoordinationService::WaitForAllTasks(const CoordinatedTask& task,
-                                          const DeviceInfo& devices,
-                                          tsl::StatusCallback done) {
-  {
-    absl::MutexLock l(state_mu_);
-    if (ServiceHasStopped()) {
-      done(MakeCoordinationError(absl::InternalError(
-          "Coordination service has stopped. WaitForAllTasks() failed.")));
-      return;
-    }
-    const auto& task_state = cluster_state_.find(GetTaskName(task));
-    // Collect task device info for the first time that task
-    // has called WaitForAllTasks(). This will be aggregated when the barrier
-    // passes.
-    if (task_state != cluster_state_.end() &&
-        !task_state->second->DeviceInfoIsCollected()) {
-      task_state->second->CollectDeviceInfo(devices);
-    }
-  }
-  BarrierAsync(device_propagation_barrier_id_, kUniqueBarrierCounter,
-               kDevicePropagationTimeout, task, {},
-               [done = std::move(done)](const absl::Status& s,
-                                        int64_t unused_counter) { done(s); });
-}
-
 void CoordinationService::ShutdownTaskAsync(const CoordinatedTask& task,
                                             tsl::StatusCallback done) {
   VLOG(3) << "Task " << GetTaskName(task) << " invoked ShutdownTaskAsync()";
@@ -1278,7 +1253,6 @@ void CoordinationService::BarrierAsyncLocked(
       task.recoverable() && counter == 0 &&
       // Not a special once-only barrier.
       barrier_id != kClusterRegisterBarrierId &&
-      barrier_id != device_propagation_barrier_id_ &&
       barrier_id != shutdown_barrier_id_) {
     should_initialize_new_instance = true;
     // Use the service's counter to initialize the new barrier.
@@ -1415,9 +1389,6 @@ void CoordinationService::PassBarrier(BarrierState* barrier,
   LOG(INFO) << "Barrier(" << BarrierName(*barrier)
             << ") has passed with status: " << result;
   // Special hook for device propagation barrier to set global device ids.
-  if (barrier->id == device_propagation_barrier_id_) {
-    AggregateClusterDevices();
-  }
   for (const auto& task_at_barrier : barrier->tasks_at_barrier) {
     // Clean up task state (used as error hooks).
     const CoordinatedTask& task = task_at_barrier.first;
@@ -1738,33 +1709,6 @@ void CoordinationService::ReachBarrier(BarrierState* barrier,
   }
 };
 
-void CoordinationService::AggregateClusterDevices() {
-  assert(cluster_devices_.device_size() == 0);
-  std::vector<CoordinatedTask> ordered_tasks;
-  // Sort by task name to set deterministic order for cluster devices.
-  ordered_tasks.reserve(cluster_state_.size());
-  for (const auto& task : cluster_state_) {
-    ordered_tasks.push_back(GetTaskFromName(task.first));
-  }
-  std::sort(ordered_tasks.begin(), ordered_tasks.end(),
-            [](const CoordinatedTask& task1, const CoordinatedTask& task2) {
-              if (task1.job_name() != task2.job_name()) {
-                return task1.job_name() < task2.job_name();
-              }
-              return task1.task_id() < task2.task_id();
-            });
-
-  // Aggregate to global device list.
-  for (const auto& task : ordered_tasks) {
-    cluster_devices_.MergeFrom(
-        cluster_state_[GetTaskName(task)]->GetDeviceInfo());
-  }
-
-  if (post_aggregate_device_fn_ != nullptr) {
-    cluster_devices_ = post_aggregate_device_fn_(cluster_devices_);
-  }
-}
-
 void CoordinationService::DisconnectAllNonRecoverableTasks() {
   for (const auto& [task_name, state] : cluster_state_) {
     if (state->IsRecoverable()) {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
index 73550626494ca4..c10c90fbbd7588 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
@@ -144,15 +144,6 @@ class CoordinationService {
   void RegisterTaskAsync(const tensorflow::CoordinatedTask& task,
                          IncarnationId incarnation, tsl::StatusCallback done);
 
-  // Wait for all tasks to be up and running, and register local device
-  // info. The callback is invoked when all tasks are up and registered, or some
-  // error occurs.
-  // Each task's local devices will be appended in a deterministic order, and
-  // post-processed by the callback in SetDeviceAggregationFunction() (if set).
-  void WaitForAllTasks(const tensorflow::CoordinatedTask& task,
-                       const tensorflow::DeviceInfo& devices,
-                       tsl::StatusCallback done);
-
   // Disconnects task from the service. If `shutdown_barrier_timeout_in_ms` is
   // specified in the config, blocks until all tasks reach the barrier before
   // disconnecting together.
@@ -564,13 +555,6 @@ class CoordinationService {
     // Sets the error and returns true if the task state is not ERROR.
     // Otherwise, don't overwrite the error and return false.
     bool SetError(const absl::Status& status);
-    tensorflow::DeviceInfo GetDeviceInfo() { return devices_; }
-    void CollectDeviceInfo(const tensorflow::DeviceInfo& devices) {
-      devices_ = devices;
-    }
-    // Checks if task has called WaitForAllTasks() previously, which gathers the
-    // local device info.
-    bool DeviceInfoIsCollected() { return !devices_.device().empty(); }
 
     // This is used to propagate state changes (disconnect, error) to ongoing
     // barriers.
@@ -601,7 +585,6 @@ class CoordinationService {
     // accounts for the lag time between the service recording the state change
     // and the agent stopping heartbeats/error polling.
     uint64_t disconnect_grace_period_us_ = 0;
-    tensorflow::DeviceInfo devices_;
     // For now, we assume there won't be many simultaneous barriers so we simply
     // use a set.
     absl::flat_hash_set<std::string> ongoing_barriers_for_task_;
@@ -654,8 +637,6 @@ class CoordinationService {
   std::function<tensorflow::DeviceInfo(const tensorflow::DeviceInfo& devices)>
       post_aggregate_device_fn_;
 
-  const std::string device_propagation_barrier_id_ =
-      absl::StrCat("WaitForAllTasks::", service_incarnation_.value());
   const std::string shutdown_barrier_id_ =
       absl::StrCat("Shutdown::", service_incarnation_.value());
   std::vector<tensorflow::CoordinatedTask> shutdown_barrier_tasks_
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index 98e8a70bdd3b1c..a883a870dc028c 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -349,35 +349,6 @@ void CoordinationServiceAgent::PollForErrorAsync(tsl::StatusCallback done) {
       });
 }
 
-absl::Status CoordinationServiceAgent::WaitForAllTasks(
-    const DeviceInfo& local_devices) {
-  absl::Status agent_running_status = ValidateRunningAgent();
-  if (!agent_running_status.ok()) {
-    return agent_running_status;
-  }
-  WaitForAllTasksRequest request;
-  *request.mutable_source_task() = task_;
-  *request.mutable_device_info() = local_devices;
-  VLOG(3) << "WaitForAllTasksRequest: " << request.DebugString();
-  WaitForAllTasksResponse response;
-  absl::Status status;
-  absl::Notification n;
-  leader_client_->WaitForAllTasksAsync(&request, &response,
-                                       [&](const absl::Status& s) {
-                                         status = s;
-                                         n.Notify();
-                                       });
-  n.WaitForNotification();
-  if (!status.ok()) {
-    VLOG(3) << "WaitForAllTasksResponse: " << status;
-    SetError(status);
-    return status;
-  }
-  VLOG(3) << "WaitForAllTasksResponse: " << response.DebugString();
-  cluster_devices_ = response.device_info();
-  return absl::OkStatus();
-}
-
 const DeviceInfo& CoordinationServiceAgent::GetClusterDeviceInfo() {
   return cluster_devices_;
 }
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
index 77621a073b14cd..3671f98aa11007 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
@@ -149,14 +149,6 @@ class CoordinationServiceAgent {
   //              the configured timeout)
   absl::Status Connect();
 
-  // Wait for all tasks to be up and registered. The call blocks until all tasks
-  // in the cluster are up, or some error occurs.
-  // Possible service errors:
-  //   - Internal: Coordination service has shut down.
-  //   - FailedPrecondition: Agent is not in CONNECTED state.
-  //   - InvalidArgument: Unexpected task request
-  absl::Status WaitForAllTasks(const tensorflow::DeviceInfo& local_devices);
-
   // Get the device attributes of tasks from remote tasks in the cluster.
   const tensorflow::DeviceInfo& GetClusterDeviceInfo();
 
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
index 6985fe7f71d7a9..5e0f80b504d979 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
@@ -136,16 +136,6 @@ class TestCoordinationClient : public CoordinationClient {
               (tsl::CallOptions * call_opts, const PollForErrorRequest*,
                PollForErrorResponse*, tsl::StatusCallback),
               (override));
-
-#define UNIMPLEMENTED(method)                                              \
-  void method##Async(const method##Request* request,                       \
-                     method##Response* response, tsl::StatusCallback done) \
-      override {                                                           \
-    done(absl::UnimplementedError(#method "Async"));                       \
-  }
-
-  UNIMPLEMENTED(WaitForAllTasks);
-#undef UNIMPLEMENTED
 };
 
 class CoordinationServiceAgentTest : public ::testing::Test {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
index 2382aed192e709..d07dd613bb7667 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
@@ -91,26 +91,6 @@ void CoordinationServiceRpcHandler::HeartbeatAsync(
   done(absl::OkStatus());
 }
 
-void CoordinationServiceRpcHandler::WaitForAllTasksAsync(
-    const tensorflow::WaitForAllTasksRequest* request,
-    tensorflow::WaitForAllTasksResponse* response, tsl::StatusCallback done) {
-  absl::ReaderMutexLock l(mu_);
-  if (service_ == nullptr) {
-    done(MakeCoordinationError(
-        absl::InternalError("Coordination service is not enabled.")));
-    return;
-  }
-  service_->WaitForAllTasks(
-      request->source_task(), request->device_info(),
-      [response, service = service_, done = std::move(done)](absl::Status s) {
-        if (s.ok()) {
-          service->state_mu_.AssertHeld();
-          *response->mutable_device_info() = service->ListClusterDevices();
-        }
-        done(s);
-      });
-}
-
 void CoordinationServiceRpcHandler::ShutdownTaskAsync(
     const tensorflow::ShutdownTaskRequest* request,
     tensorflow::ShutdownTaskResponse* response, tsl::StatusCallback done) {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
index 432db619703bbe..a5cf6bfea97288 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
@@ -40,10 +40,6 @@ class CoordinationServiceRpcHandler {
                       tensorflow::HeartbeatResponse* response,
                       tsl::StatusCallback done);
 
-  void WaitForAllTasksAsync(const tensorflow::WaitForAllTasksRequest* request,
-                            tensorflow::WaitForAllTasksResponse* response,
-                            tsl::StatusCallback done);
-
   void ShutdownTaskAsync(const tensorflow::ShutdownTaskRequest* request,
                          tensorflow::ShutdownTaskResponse* response,
                          tsl::StatusCallback done);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
index 5c2fdccbc7f9a8..2a31a73e68cab4 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
@@ -110,7 +110,6 @@ class TestCoordinationClient : public CoordinationClient {
     done(absl::UnimplementedError(#method "Async"));                       \
   }
 
-  UNIMPLEMENTED(WaitForAllTasks);
   UNIMPLEMENTED(ResetTask);
   UNIMPLEMENTED(GetTaskState);
   UNIMPLEMENTED(InsertKeyValue);
@@ -270,19 +269,8 @@ TEST_F(CoordinateTwoTasksTest, TestStandaloneService) {
   task_2.set_task_id(2);
 
   ASSERT_OK(coord_service_->RegisterTask(task_0_, incarnation_0_));
-  absl::Notification wait_for_all;
-  coord_service_->WaitForAllTasks(task_0_, {}, [&](absl::Status s) {
-    ASSERT_OK(s);
-    wait_for_all.Notify();
-  });
   // Not all tasks have registered, so must not be notified here.
-  ASSERT_FALSE(wait_for_all.HasBeenNotified());
   ASSERT_OK(coord_service_->RegisterTask(task_1_, incarnation_1_));
-  coord_service_->WaitForAllTasks(task_1_, {},
-                                  [&](absl::Status s) { ASSERT_OK(s); });
-  // All tasks have registered.
-  wait_for_all.WaitForNotification();
-
   ASSERT_OK(coord_service_->RecordHeartbeat(task_0_, incarnation_0_));
   ASSERT_OK(coord_service_->RecordHeartbeat(task_1_, incarnation_1_));
   EXPECT_THAT(coord_service_->RecordHeartbeat(task_2, IncarnationId(0)),
@@ -295,64 +283,6 @@ TEST_F(CoordinateTwoTasksTest, TestStandaloneService) {
               StatusIs(absl::StatusCode::kAborted));
 }
 
-TEST(CoordinationServiceTest, TestCoordinatedJobs) {
-  CoordinatedTask chief;
-  chief.set_job_name("chief");
-  chief.set_task_id(0);
-  CoordinatedTask task_0;
-  task_0.set_job_name("worker");
-  task_0.set_task_id(0);
-  CoordinatedTask task_1;
-  task_1.set_job_name("worker");
-  task_1.set_task_id(1);
-  CoordinatedTask evaluator;
-  evaluator.set_job_name("evaluator");
-  evaluator.set_task_id(0);
-
-  CoordinationService::Config config;
-  CoordinatedJob chief_job;
-  chief_job.set_name("chief");
-  chief_job.set_num_tasks(1);
-  config.coordinated_job_list.push_back(chief_job);
-  CoordinatedJob worker_job;
-  worker_job.set_name("worker");
-  worker_job.set_num_tasks(2);
-  config.coordinated_job_list.push_back(worker_job);
-
-  auto coord_service =
-      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
-
-  // Each coordinated task registers and waits for other tasks.
-  absl::Notification register_chief;
-  ASSERT_OK(coord_service->RegisterTask(chief, IncarnationId(0)));
-  coord_service->WaitForAllTasks(chief, {}, [&](absl::Status s) {
-    ASSERT_OK(s);
-    register_chief.Notify();
-  });
-  absl::Notification register_task0;
-  ASSERT_OK(coord_service->RegisterTask(task_0, IncarnationId(0)));
-  coord_service->WaitForAllTasks(task_0, {}, [&](absl::Status s) {
-    ASSERT_OK(s);
-    register_task0.Notify();
-  });
-  absl::Notification register_task1;
-  ASSERT_OK(coord_service->RegisterTask(task_1, IncarnationId(0)));
-  coord_service->WaitForAllTasks(task_1, {}, [&](absl::Status s) {
-    ASSERT_OK(s);
-    register_task1.Notify();
-  });
-  // All tasks in the coordinated jobs have registered.
-  register_chief.WaitForNotification();
-  register_task0.WaitForNotification();
-  register_task1.WaitForNotification();
-
-  // Registering the evaluator task is unexpected
-  absl::Status status =
-      coord_service->RegisterTask(evaluator, IncarnationId(0));
-
-  EXPECT_THAT(status, StatusIs(absl::StatusCode::kInvalidArgument));
-}
-
 // RegisterTask() may succeed in the service, but the agent response times out.
 // In this case, the agent would retry Connect() and should succeed if it has
 // the same incarnation.
@@ -960,123 +890,6 @@ TEST_F(CoordinateTwoTasksTest,
 
 }  // namespace
 
-// Verify that coordination service can gather each task's device info and
-// propagate the aggregated cluster device info correctly.
-TEST(CoordinationServiceTest, ListClusterDevices_TfDevice) {
-  const CoordinationService::Config config =
-      GetCoordinationServiceConfig(/*num_tasks=*/3);
-  CoordinatedTask task_0;
-  task_0.set_job_name("worker");
-  task_0.set_task_id(0);
-  CoordinatedTask task_1;
-  task_1.set_job_name("worker");
-  task_1.set_task_id(1);
-  CoordinatedTask task_2;
-  task_2.set_job_name("worker");
-  task_2.set_task_id(2);
-  absl::Status status = absl::OkStatus();
-  std::unique_ptr<CoordinationService> coord_service =
-      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
-  absl::Notification n;
-  // Map fake devices to each task.
-  DeviceInfo local_devices_0;
-  DeviceInfo local_devices_1;
-  DeviceInfo local_devices_2;
-  local_devices_0.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task0_device0"));
-  local_devices_0.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task0_device1"));
-  local_devices_1.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task1_device0"));
-  local_devices_2.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task2_device0"));
-
-  // Each task sends its device info.
-  DeviceInfo cluster_devices;
-  coord_service->WaitForAllTasks(task_0, local_devices_0,
-                                 [&](absl::Status s) { ASSERT_OK(s); });
-  coord_service->WaitForAllTasks(task_1, local_devices_1,
-                                 [&](absl::Status s) { ASSERT_OK(s); });
-  coord_service->WaitForAllTasks(task_2, local_devices_2, [&](absl::Status s) {
-    ASSERT_OK(s);
-    // Gather the cluster device info.
-    coord_service->state_mu_.AssertHeld();
-    cluster_devices = coord_service->ListClusterDevices();
-    n.Notify();
-  });
-  n.WaitForNotification();
-
-  DeviceInfo expected_cluster_devices;
-  auto expected_devices = expected_cluster_devices.mutable_device();
-  expected_devices->Add(local_devices_0.device().begin(),
-                        local_devices_0.device().end());
-  expected_devices->Add(local_devices_1.device().begin(),
-                        local_devices_1.device().end());
-  expected_devices->Add(local_devices_2.device().begin(),
-                        local_devices_2.device().end());
-  EXPECT_THAT(cluster_devices, EqualsProto(expected_cluster_devices));
-}
-
-// Task devices should not be added twice if same task calls WaitForAllDevices()
-// twice.
-TEST(CoordinationServiceTest, ListClusterDevices_DevicesAreNotAddedTwice) {
-  const CoordinationService::Config config =
-      GetCoordinationServiceConfig(/*num_tasks=*/2);
-  CoordinatedTask task_0;
-  task_0.set_job_name("worker");
-  task_0.set_task_id(0);
-  CoordinatedTask task_1;
-  task_1.set_job_name("worker");
-  task_1.set_task_id(1);
-  absl::Status status = absl::OkStatus();
-  absl::Status initial_wait_for_all_tasks_status;
-  std::unique_ptr<CoordinationService> coord_service =
-      std::make_unique<CoordinationService>(tsl::Env::Default(), config);
-  absl::Notification n;
-  // Map fake devices to each task.
-  DeviceInfo local_devices_0;
-  DeviceInfo local_devices_1;
-  local_devices_0.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task0_device0"));
-  local_devices_0.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task0_device1"));
-  local_devices_1.mutable_device()->Add()->PackFrom(
-      CreateTestDevice("task1_device0"));
-  // Task0 sends device info.
-  DeviceInfo cluster_devices;
-  coord_service->WaitForAllTasks(
-      task_0, local_devices_0,
-      [&initial_wait_for_all_tasks_status](absl::Status s) {
-        initial_wait_for_all_tasks_status = s;
-      });
-
-  // Task0 sends device info again.
-  coord_service->WaitForAllTasks(task_0, local_devices_0,
-                                 [](absl::Status s) { ASSERT_OK(s); });
-  coord_service->WaitForAllTasks(task_1, local_devices_1,
-                                 [coord_service = coord_service.get(),
-                                  &cluster_devices, &n](absl::Status s) {
-                                   ASSERT_OK(s);
-                                   // Gather the cluster device info.
-                                   coord_service->state_mu_.AssertHeld();
-                                   cluster_devices =
-                                       coord_service->ListClusterDevices();
-                                   n.Notify();
-                                 });
-  n.WaitForNotification();
-
-  // No duplicates found.
-  DeviceInfo expected_cluster_devices;
-  auto expected_devices = expected_cluster_devices.mutable_device();
-  expected_devices->Add(local_devices_0.device().begin(),
-                        local_devices_0.device().end());
-  expected_devices->Add(local_devices_1.device().begin(),
-                        local_devices_1.device().end());
-  EXPECT_THAT(cluster_devices, EqualsProto(expected_cluster_devices));
-  EXPECT_THAT(initial_wait_for_all_tasks_status,
-              StatusIs(absl::StatusCode::kCancelled));
-}
-
 TEST_F(CoordinationBarrierTest, Barrier) {
   const std::string barrier_id = "barrier_id";
   absl::Duration timeout = absl::Seconds(5);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
index e6be2a6efde087..fda1f65f1c9947 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
@@ -71,8 +71,6 @@ using tensorflow::ShutdownTaskRequest;
 using tensorflow::ShutdownTaskResponse;
 using tensorflow::TryGetKeyValueRequest;
 using tensorflow::TryGetKeyValueResponse;
-using tensorflow::WaitForAllTasksRequest;
-using tensorflow::WaitForAllTasksResponse;
 using tensorflow::WatchJobStateRequest;
 using tensorflow::WatchJobStateResponse;
 
@@ -130,16 +128,6 @@ class GrpcCoordinationClient : public CoordinationClient {
         &target_);
   }
 
-  void WaitForAllTasksAsync(const WaitForAllTasksRequest* request,
-                            WaitForAllTasksResponse* response,
-                            tsl::StatusCallback done) override {
-    new tsl::RPCState<tsl::protobuf::Message>(
-        &stub_, cq_, "/tensorflow.CoordinationService/WaitForAllTasks",
-        *request, response, std::move(done), /*call_opts=*/nullptr,
-        /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true,
-        &target_);
-  }
-
   void ShutdownTaskAsync(tsl::CallOptions* call_opts,
                          const ShutdownTaskRequest* request,
                          ShutdownTaskResponse* response,
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
index a9a8d614a3b207..c92026bd6d7b84 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
@@ -44,7 +44,6 @@ void GrpcCoordinationServiceImpl::HandleRPCsLoop() {
                        &GrpcCoordinationServiceImpl::method##Handler, false); \
   } while (0)
   ENQUEUE_REQUEST(RegisterTask);
-  ENQUEUE_REQUEST(WaitForAllTasks);
   ENQUEUE_REQUEST(ShutdownTask);
   ENQUEUE_REQUEST(ResetTask);
   ENQUEUE_REQUEST(Heartbeat);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
index 5dc9a46d56d743..8a5e5a3e307022 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
@@ -85,7 +85,6 @@ class GrpcCoordinationServiceImpl : public tsl::AsyncServiceInterface {
                        /*supports_cancel=*/false);                            \
   }
   HANDLER(RegisterTask);
-  HANDLER(WaitForAllTasks);
   HANDLER(ShutdownTask);
   HANDLER(ResetTask);
   HANDLER(Heartbeat);

From ad2b228d6f9222ce49445d5af86528632c0f47d7 Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Wed, 17 Dec 2025 17:13:58 -0800
Subject: [PATCH 477/753] [XLA:CPU] Add initial support of grouped convolutions
 with YNNPACK enabled.

There are currently a few limitations:
* Only supports F32xF32->F32 and BF16xBF16->F32. S8xS8->S32 should work too, but there seem to be some rewrite in the middle which changes the op from supported convolution to unsupported -- will track it down separetely.

* Only stride=1 cases are supported until the limitation of ynn_split_dim is fixed.

PiperOrigin-RevId: 845977975
---
 .../xla/xla/backends/cpu/ynn_emitter.cc       | 121 ++++++++++++++++--
 .../xla/xla/backends/cpu/ynn_support.cc       |  37 ++++--
 2 files changed, 133 insertions(+), 25 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.cc b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
index 5229eadb70dbc9..fb5f1abbb33e3b 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
@@ -432,19 +432,87 @@ static ynn_status DefineBatchMatrixMultiply(ynn_subgraph_t subgraph,
 }
 
 static ynn_status DefineConvolution(
-    ynn_subgraph_t subgraph, ynn_type input1_id_type, uint32_t input1_id,
-    uint32_t input2_id, uint32_t output_id,
-    const std::vector<int32_t>& stencil_axes,
-    const std::vector<int32_t> new_axes,
+    ynn_subgraph_t subgraph, ynn_type input1_id_type, ynn_type output_id_type,
+    uint32_t input1_id, uint32_t input2_id, uint32_t output_id,
+    const std::vector<size_t>& filter_dims, const std::vector<size_t>& out_dims,
+    size_t feature_group_count, size_t input_channels,
+    size_t kernel_output_channels, const std::vector<int32_t>& stencil_axes,
+    const std::vector<int32_t>& new_axes,
     const std::vector<size_t>& stencil_dims,
     const std::vector<size_t>& stencil_strides,
     const std::vector<size_t>& stencil_dilations,
     const std::vector<int64_t>& padding_lows,
     const std::vector<int64_t>& padding_highs) {
-  uint32_t stencil_id = YNN_INVALID_VALUE_ID;
-
   ynn_status status;
 
+  // Make a copy in case we need to shift these for grouped convolution.
+  std::vector<int32_t> new_axes_shifted = new_axes;
+
+  // We will need to create an intermediate buffer for the output if it's
+  // grouped convolution.
+  uint32_t output_unfused_id =
+      feature_group_count != 1 ? YNN_INVALID_VALUE_ID : output_id;
+
+  if (feature_group_count != 1) {
+    uint32_t split_id = YNN_INVALID_VALUE_ID;
+
+    // [n, h, w, ci] -> [n, h, w, g, 1, ci/g].
+    size_t input_split[] = {feature_group_count, 1,
+                            input_channels / feature_group_count};
+    status =
+        ynn_define_split_dim(subgraph, /*axis=*/-1, /*num_splits=*/3,
+                             input_split, input1_id, &split_id, /*flags=*/0);
+    if (status != ynn_status_success) {
+      return status;
+    }
+    input1_id = split_id;
+    split_id = YNN_INVALID_VALUE_ID;
+    CHECK_EQ(filter_dims.size(), 4);
+    // [kh, kw, ci/g, co] -> [kh, kw, ci/g, g, co/g].
+    size_t filter_split[] = {feature_group_count,
+                             kernel_output_channels / feature_group_count};
+    status =
+        ynn_define_split_dim(subgraph, /*axis=*/-1, /*num_splits=*/2,
+                             filter_split, input2_id, &split_id, /*flags=*/0);
+    if (status != ynn_status_success) {
+      return status;
+    }
+    input2_id = split_id;
+
+    uint32_t transposed_filter_id = YNN_INVALID_VALUE_ID;
+    // [kh, kw, ci/g, g, co/g] -> [g, kh, kw, ci/g, co/g]
+    int32_t swap_co_ci[5] = {3, 0, 1, 2, 4};
+    status =
+        ynn_define_static_transpose(subgraph, /*rank=*/5, swap_co_ci, input2_id,
+                                    &transposed_filter_id, /*flags=*/0);
+
+    if (status != ynn_status_success) {
+      return status;
+    }
+    input2_id = transposed_filter_id;
+
+    // Create intermediate output buffer.
+    std::vector<size_t> unfused_dims(out_dims.begin(), out_dims.end() - 1);
+    unfused_dims.push_back(feature_group_count);
+    unfused_dims.push_back(1);
+    unfused_dims.push_back(kernel_output_channels / feature_group_count);
+    status = ynn_define_tensor_value(subgraph, output_id_type,
+                                     /*rank=*/out_dims.size() + 2,
+                                     /*dims=*/unfused_dims.data(),
+                                     /*data=*/nullptr,
+                                     /*zero_point_id=*/YNN_INVALID_VALUE_ID,
+                                     /*scale_id=*/YNN_INVALID_VALUE_ID,
+                                     /*flags=*/0, &output_unfused_id);
+    if (status != ynn_status_success) {
+      return status;
+    }
+
+    // Shift new stencil axes by two.
+    for (int i = 0; i < new_axes_shifted.size(); ++i) {
+      new_axes_shifted[i] += 2;
+    }
+  }
+
   // If any of paddings is not zero, define a padding value and pad the input.
   if (absl::c_any_of(padding_lows, [](int32_t i) { return i != 0; }) ||
       absl::c_any_of(padding_highs, [](int32_t i) { return i != 0; })) {
@@ -475,18 +543,38 @@ static ynn_status DefineConvolution(
     padding_id = YNN_INVALID_VALUE_ID;
   }
 
+  uint32_t stencil_id = YNN_INVALID_VALUE_ID;
   // Make a stenciled view of the input [n, h, w, ci] -> [n, h, w, kh, kw, ci].
   status = ynn_define_stencil_copy(
       subgraph, /*num_stencils=*/stencil_dims.size(), stencil_axes.data(),
-      new_axes.data(), stencil_dims.data(), stencil_strides.data(),
+      new_axes_shifted.data(), stencil_dims.data(), stencil_strides.data(),
       stencil_dilations.data(), input1_id, YNN_INVALID_VALUE_ID, &stencil_id,
       /*flags=*/0);
   if (status != ynn_status_success) {
     return status;
   }
-  return ynn_define_dot(subgraph, /*num_k_dims=*/stencil_dims.size() + 1,
-                        stencil_id, input2_id, YNN_INVALID_VALUE_ID, &output_id,
-                        /*flags=*/0);
+
+  status = ynn_define_dot(subgraph, /*num_k_dims=*/stencil_dims.size() + 1,
+                          stencil_id, input2_id, YNN_INVALID_VALUE_ID,
+                          &output_unfused_id,
+                          /*flags=*/0);
+
+  if (status != ynn_status_success) {
+    return status;
+  }
+
+  if (feature_group_count > 1) {
+    // The output of the grouped convolution is [n, h, w, g, 1, co/g], so we
+    // need to fuse three of the innermost dimensions.
+    status = ynn_define_fuse_dim(subgraph, /*axis=*/-3, /*axes_count=*/3,
+                                 output_unfused_id, &output_id,
+                                 /*flags=*/0);
+    if (status != ynn_status_success) {
+      return status;
+    }
+  }
+
+  return status;
 }
 
 static absl::StatusOr<YnnSubgraph> EmitYnnDotSubgraph(
@@ -644,10 +732,15 @@ static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
 
   std::iota(new_axes.begin(), new_axes.end(), lhs_dims.size() - 1);
 
-  YNN_RETURN_IF_ERROR(
-      DefineConvolution(subgraph.get(), ynn_lhs_type, lhs_id, rhs_id, out_id,
-                        stencil_axes, new_axes, stencil_dims, stencil_strides,
-                        stencil_dilations, padding_lows, padding_highs));
+  YNN_RETURN_IF_ERROR(DefineConvolution(
+      subgraph.get(), ynn_lhs_type, ynn_out_type, lhs_id, rhs_id, out_id,
+      rhs_dims, out_dims, conv->feature_group_count(),
+      conv->operand(0)->shape().dimensions(
+          conv_dimensions.input_feature_dimension()),
+      conv->operand(1)->shape().dimensions(
+          conv_dimensions.kernel_output_feature_dimension()),
+      stencil_axes, new_axes, stencil_dims, stencil_strides, stencil_dilations,
+      padding_lows, padding_highs));
 
   ynn_status status = ynn_optimize_subgraph(
       subgraph.get(), /*threadpool=*/nullptr, /*flags=*/0);
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index a4281088298088..1c7f427934a622 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -296,20 +296,42 @@ bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
   CHECK_EQ(instr->opcode(), HloOpcode::kConvolution);
   const HloConvolutionInstruction* conv =
       Cast<HloConvolutionInstruction>(instr);
+
+  ConvolutionDimensionNumbers conv_dimensions =
+      conv->convolution_dimension_numbers();
+  Window window = conv->window();
+
+  if (conv->batch_group_count() != 1) {
+    return false;
+  }
+
+  // Only support 2D convolution.
+  if (window.dimensions_size() != 2) {
+    return false;
+  }
+
   // Stores tuple of allowed (input, output) dtypes.
   static const absl::NoDestructor<absl::flat_hash_set<
       std::tuple<PrimitiveType, PrimitiveType, PrimitiveType>>>
-      kAllowedTypes({{F32, F32, F32}, {BF16, BF16, F32}, {S8, S8, S32}});
+      kAllowedTypesNonGrouped(
+          {{F32, F32, F32}, {BF16, BF16, F32}, {S8, S8, S32}});
+
+  static const absl::NoDestructor<absl::flat_hash_set<
+      std::tuple<PrimitiveType, PrimitiveType, PrimitiveType>>>
+      kAllowedTypesGrouped({{F32, F32, F32}, {BF16, BF16, F32}});
 
   PrimitiveType lhs_dtype = conv->operand(0)->shape().element_type();
   PrimitiveType rhs_dtype = conv->operand(1)->shape().element_type();
   PrimitiveType out_dtype = conv->shape().element_type();
-  if (!kAllowedTypes->contains({lhs_dtype, rhs_dtype, out_dtype})) {
+  if (conv->feature_group_count() == 1 &&
+      !kAllowedTypesNonGrouped->contains({lhs_dtype, rhs_dtype, out_dtype})) {
     return false;
   }
 
-  ConvolutionDimensionNumbers conv_dimensions =
-      conv->convolution_dimension_numbers();
+  if (conv->feature_group_count() > 1 &&
+      !kAllowedTypesGrouped->contains({lhs_dtype, rhs_dtype, out_dtype})) {
+    return false;
+  }
 
   // Make sure that this layout is supported.
   if (conv_dimensions.input_feature_dimension() != 3 ||
@@ -337,13 +359,6 @@ bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
     return false;
   }
 
-  Window window = conv->window();
-
-  // Only support 2D convolution.
-  if (window.dimensions_size() != 2) {
-    return false;
-  }
-
   // No dilation for now.
   if ((window.dimensions(0).window_dilation() != 1) ||
       (window.dimensions(1).window_dilation() != 1) ||

From 18c6bdf2e70bacd45bad0b1f936f32fae69952ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 17:49:31 -0800
Subject: [PATCH 478/753] Add mlir definition for `PostProcessPrediction`.

PiperOrigin-RevId: 845986632
---
 .../mlir/tensorflow/ir/tf_generated_ops.td    | 20 +++++++++++++++++++
 .../transforms/legalization_op_config_test.cc |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
index ecd4f7560c359a..931fb51426257b 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
@@ -11497,6 +11497,26 @@ representation of that entry.
   TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
 }
 
+def TF_PostProcessPredictionOp : TF_Op<"PostProcessPrediction", []> {
+  let summary = [{
+Performs post-processing on prediction inputs. This op has no tensor outputs.
+  }];
+
+  let description = [{
+Send an rpc to the external service that builds rpc payload based on prediction result.
+  }];
+
+  let arguments = (ins
+    Arg<TF_Int64Tensor, [{Scalar int64 tensor representing steps.}]>:$steps,
+    Arg<TF_Int64Tensor, [{Scalar int64 tensor representing the GAIA ID.}]>:$gaia_id,
+    Arg<TF_Uint64Tensor, [{1-D uint64 tensor representing a list of video IDs.}]>:$video_id,
+
+    StrAttr:$op_config
+  );
+
+  let results = (outs);
+}
+
 def TF_PowOp : TF_Op<"Pow", [Pure, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>,
                WithBroadcastableBinOpBuilder {
   let summary = "Computes the power of one value to another.";
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
index 7d00bc41716979..f40ada575d2f4a 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc
@@ -84,7 +84,7 @@ TEST(LegalizationOpConfigTest, CountLoweringsSet) {
   // a new op, we should expect these to change too.
   EXPECT_EQ(mlir_lowering_count, 67);
   EXPECT_EQ(tf2xla_fallback_count, 333);
-  EXPECT_EQ(non_categorized_count, 434);
+  EXPECT_EQ(non_categorized_count, 435);
 }
 
 // Just a counter test to see which ops have duplicate lowerings. This isn't a

From 0702b4623e1443de0ef4b62ea46532797ad2af31 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Wed, 17 Dec 2025 17:52:10 -0800
Subject: [PATCH 479/753] Add Shape to ConvolutionThunk buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 845987211
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  1 +
 .../backends/gpu/runtime/convolution_thunk.cc | 43 ++++++++++---------
 .../backends/gpu/runtime/convolution_thunk.h  | 21 ++++-----
 .../gpu/runtime/convolution_thunk_test.cc     | 39 +++++++++++++++--
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  4 +-
 .../xla/xla/service/buffer_assignment.cc      | 30 +++++++++++++
 .../xla/xla/service/buffer_assignment.h       |  4 ++
 .../xla/xla/service/buffer_assignment_test.cc |  4 ++
 third_party/xla/xla/service/gpu/BUILD         |  1 +
 .../xla/xla/service/gpu/thunk_emitter.cc      | 27 ++++++++----
 .../xla/xla/service/gpu/thunk_emitter.h       |  5 ++-
 11 files changed, 134 insertions(+), 45 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index c9221c3ffb8ce3..eff15f7bd6646c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -505,6 +505,7 @@ cc_library(
     srcs = ["convolution_thunk.cc"],
     hdrs = ["convolution_thunk.h"],
     deps = [
+        ":shaped_slice",
         ":thunk",
         "//xla:util",
         "//xla/runtime:buffer_use",
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.cc
index 3e020680be7d4c..d6d8da0379def0 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_conv_runner.h"
@@ -45,8 +46,8 @@ using buffer_assignment::BufferAllocationSliceProto;
 
 absl::StatusOr<std::unique_ptr<ConvolutionThunk>> ConvolutionThunk::Create(
     ThunkInfo thunk_info, GpuConvDescriptor descriptor,
-    std::vector<BufferAllocation::Slice> operand_slices,
-    std::vector<BufferAllocation::Slice> result_slices,
+    std::vector<ShapedSlice> operand_slices,
+    std::vector<ShapedSlice> result_slices,
     BufferAllocation::Slice scratch_slice) {
   TF_ASSIGN_OR_RETURN(GpuConvConfig config,
                       GetGpuConvConfig(descriptor, /*inst_as_string=*/""));
@@ -57,11 +58,12 @@ absl::StatusOr<std::unique_ptr<ConvolutionThunk>> ConvolutionThunk::Create(
       std::move(operand_slices), std::move(result_slices), scratch_slice));
 }
 
-ConvolutionThunk::ConvolutionThunk(
-    ThunkInfo thunk_info, GpuConvDescriptor descriptor, GpuConvConfig config,
-    std::vector<BufferAllocation::Slice> operand_slices,
-    std::vector<BufferAllocation::Slice> result_slices,
-    BufferAllocation::Slice scratch_slice)
+ConvolutionThunk::ConvolutionThunk(ThunkInfo thunk_info,
+                                   GpuConvDescriptor descriptor,
+                                   GpuConvConfig config,
+                                   std::vector<ShapedSlice> operand_slices,
+                                   std::vector<ShapedSlice> result_slices,
+                                   BufferAllocation::Slice scratch_slice)
     : Thunk(Kind::kConvolution, thunk_info),
       operand_buffers_(std::move(operand_slices)),
       result_buffers_(std::move(result_slices)),
@@ -87,13 +89,15 @@ absl::Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) {
 
   std::vector<se::DeviceAddressBase> operand_se_buffers, result_se_buffers;
   operand_se_buffers.reserve(operand_buffers_.size());
-  for (BufferAllocation::Slice buffer : operand_buffers_) {
-    operand_se_buffers.push_back(buffer_allocations.GetDeviceAddress(buffer));
+  for (const ShapedSlice& buffer : operand_buffers_) {
+    operand_se_buffers.push_back(
+        buffer_allocations.GetDeviceAddress(buffer.slice));
   }
 
   result_se_buffers.reserve(result_buffers_.size());
-  for (BufferAllocation::Slice buffer : result_buffers_) {
-    result_se_buffers.push_back(buffer_allocations.GetDeviceAddress(buffer));
+  for (const ShapedSlice& buffer : result_buffers_) {
+    result_se_buffers.push_back(
+        buffer_allocations.GetDeviceAddress(buffer.slice));
   }
 
   se::DeviceAddressBase scratch =
@@ -150,21 +154,20 @@ absl::StatusOr<std::unique_ptr<ConvolutionThunk>> ConvolutionThunk::FromProto(
   TF_ASSIGN_OR_RETURN(GpuConvDescriptor descriptor,
                       GpuConvDescriptor::FromProto(proto.conv_descriptor()));
 
-  std::vector<BufferAllocation::Slice> operand_slices;
+  std::vector<ShapedSlice> operand_slices;
   operand_slices.reserve(proto.operand_buffers_size());
-  for (const BufferAllocationSliceProto& slice_proto :
-       proto.operand_buffers()) {
+  for (const ShapedSliceProto& slice_proto : proto.operand_buffers()) {
     TF_ASSIGN_OR_RETURN(
         operand_slices.emplace_back(),
-        BufferAllocation::Slice::FromProto(slice_proto, buffer_allocations));
+        ShapedSlice::FromProto(slice_proto, buffer_allocations));
   }
 
-  std::vector<BufferAllocation::Slice> result_slices;
+  std::vector<ShapedSlice> result_slices;
   result_slices.reserve(proto.result_buffers_size());
-  for (const BufferAllocationSliceProto& slice_proto : proto.result_buffers()) {
+  for (const ShapedSliceProto& slice_proto : proto.result_buffers()) {
     TF_ASSIGN_OR_RETURN(
         result_slices.emplace_back(),
-        BufferAllocation::Slice::FromProto(slice_proto, buffer_allocations));
+        ShapedSlice::FromProto(slice_proto, buffer_allocations));
   }
 
   TF_ASSIGN_OR_RETURN(BufferAllocation::Slice scratch_slice,
@@ -183,10 +186,10 @@ absl::StatusOr<ThunkProto> ConvolutionThunk::ToProto() const {
   ConvolutionThunkProto* conv_proto = proto.mutable_convolution_thunk();
   *conv_proto->mutable_conv_descriptor() = descriptor_.ToProto();
 
-  for (const BufferAllocation::Slice& slice : operand_buffers_) {
+  for (const ShapedSlice& slice : operand_buffers_) {
     TF_ASSIGN_OR_RETURN(*conv_proto->add_operand_buffers(), slice.ToProto());
   }
-  for (const BufferAllocation::Slice& slice : result_buffers_) {
+  for (const ShapedSlice& slice : result_buffers_) {
     TF_ASSIGN_OR_RETURN(*conv_proto->add_result_buffers(), slice.ToProto());
   }
   TF_ASSIGN_OR_RETURN(*conv_proto->mutable_scratch_buffer(),
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.h b/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.h
index c13653ba69c8f2..72845ff25d4536 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_thunk.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
@@ -45,8 +46,8 @@ class ConvolutionThunk : public Thunk {
   // operand_slices should be in the same order as cudnn_call->operands().
   static absl::StatusOr<std::unique_ptr<ConvolutionThunk>> Create(
       ThunkInfo thunk_info, GpuConvDescriptor descriptor,
-      std::vector<BufferAllocation::Slice> operand_slices,
-      std::vector<BufferAllocation::Slice> result_slices,
+      std::vector<ShapedSlice> operand_slices,
+      std::vector<ShapedSlice> result_slices,
       BufferAllocation::Slice scratch_slice);
 
   ConvolutionThunk(const ConvolutionThunk&) = delete;
@@ -58,11 +59,11 @@ class ConvolutionThunk : public Thunk {
     BufferUses res;
     res.reserve(operand_buffers_.size() + result_buffers_.size() + 1);
 
-    for (const BufferAllocation::Slice& slice : operand_buffers_) {
-      res.push_back(BufferUse::Read(slice));
+    for (const ShapedSlice& slice : operand_buffers_) {
+      res.push_back(BufferUse::Read(slice.slice, slice.shape));
     }
-    for (const BufferAllocation::Slice& slice : result_buffers_) {
-      res.push_back(BufferUse::Write(slice));
+    for (const ShapedSlice& slice : result_buffers_) {
+      res.push_back(BufferUse::Write(slice.slice, slice.shape));
     }
     res.emplace_back(scratch_buffer_, BufferUse::MemoryAccess::kWrite,
                      BufferUse::ContentValidity::kUndefined);
@@ -78,12 +79,12 @@ class ConvolutionThunk : public Thunk {
  private:
   ConvolutionThunk(ThunkInfo thunk_info, GpuConvDescriptor descriptor,
                    GpuConvConfig config,
-                   std::vector<BufferAllocation::Slice> operand_slices,
-                   std::vector<BufferAllocation::Slice> result_slices,
+                   std::vector<ShapedSlice> operand_slices,
+                   std::vector<ShapedSlice> result_slices,
                    BufferAllocation::Slice scratch_slice);
 
-  std::vector<BufferAllocation::Slice> operand_buffers_;
-  std::vector<BufferAllocation::Slice> result_buffers_;
+  std::vector<ShapedSlice> operand_buffers_;
+  std::vector<ShapedSlice> result_buffers_;
   BufferAllocation::Slice scratch_buffer_;
   GenericConvRunner& GetOrCreateRunner(const stream_executor::Stream* stream,
                                        bool* runner_created);
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/convolution_thunk_test.cc
index 6ebd309f06e7af..acfdf2cd39209c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_thunk_test.cc
@@ -108,9 +108,42 @@ TEST(ConvolutionThunkTest, ProtoRoundTrip) {
           output_spatial_dimensions: [ 2, 3 ]
         }
       }
-      operand_buffers { offset: 0 size: 4 buffer_allocation_index: 0 }
-      operand_buffers { offset: 0 size: 4 buffer_allocation_index: 1 }
-      result_buffers { offset: 0 size: 4 buffer_allocation_index: 2 }
+      operand_buffers {
+        slice { offset: 0 size: 4 buffer_allocation_index: 0 }
+        shape {
+          element_type: F32
+          dimensions: [ 1, 1, 1, 1 ]
+          layout {
+            minor_to_major: [ 3, 2, 1, 0 ]
+            tail_padding_alignment_in_elements: 1
+          }
+          is_dynamic_dimension: [ false, false, false, false ]
+        }
+      }
+      operand_buffers {
+        slice { offset: 0 size: 4 buffer_allocation_index: 1 }
+        shape {
+          element_type: F32
+          dimensions: [ 1, 1, 1, 1 ]
+          layout {
+            minor_to_major: [ 3, 2, 1, 0 ]
+            tail_padding_alignment_in_elements: 1
+          }
+          is_dynamic_dimension: [ false, false, false, false ]
+        }
+      }
+      result_buffers {
+        slice { offset: 0 size: 4 buffer_allocation_index: 2 }
+        shape {
+          element_type: F32
+          dimensions: [ 1, 1, 1, 1 ]
+          layout {
+            minor_to_major: [ 3, 2, 1, 0 ]
+            tail_padding_alignment_in_elements: 1
+          }
+          is_dynamic_dimension: [ false, false, false, false ]
+        }
+      }
       scratch_buffer { offset: 0 size: 1024 buffer_allocation_index: 3 }
     }
   )pb");
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index eb564b5e4eb376..a4d08af3e7d4b3 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -338,8 +338,8 @@ message NormThunkProto {
 
 message ConvolutionThunkProto {
   GpuConvDescriptorProto conv_descriptor = 1;
-  repeated xla.buffer_assignment.BufferAllocationSliceProto operand_buffers = 2;
-  repeated xla.buffer_assignment.BufferAllocationSliceProto result_buffers = 3;
+  repeated ShapedSliceProto operand_buffers = 2;
+  repeated ShapedSliceProto result_buffers = 3;
   xla.buffer_assignment.BufferAllocationSliceProto scratch_buffer = 4;
 }
 
diff --git a/third_party/xla/xla/service/buffer_assignment.cc b/third_party/xla/xla/service/buffer_assignment.cc
index 5446e720cc2e81..5c1945d05b7343 100644
--- a/third_party/xla/xla/service/buffer_assignment.cc
+++ b/third_party/xla/xla/service/buffer_assignment.cc
@@ -578,6 +578,36 @@ absl::StatusOr<BufferAllocation::Slice> BufferAssignment::GetUniqueSlice(
   return result;
 }
 
+absl::StatusOr<Shape> BufferAssignment::GetShapeForUniqueSlice(
+    const HloInstruction* instruction, const ShapeIndex& index) const {
+  VLOG(3) << "Trying to find shape for unique slice for " << instruction->name()
+          << " [" << index << "]";
+  std::optional<Shape> result;
+  for (const HloValue* value :
+       dataflow_analysis().GetValueSet(instruction, index).values()) {
+    VLOG(3) << "Examining value " << *value;
+    if (HasAllocation(*value)) {
+      VLOG(3) << "Has allocation";
+      if (result == std::nullopt) {
+        result = value->shape();
+      } else if (result != value->shape()) {
+        return FailedPrecondition(
+            "Shape for instruction %s at index %s cannot "
+            "be determined at compile-time.",
+            instruction->name(), index.ToString());
+      }
+    } else {
+      VLOG(3) << "No allocation";
+    }
+  }
+  if (result == std::nullopt) {
+    return FailedPrecondition(
+        "BufferAllocation::Slice not assigned for instruction %s at index %s",
+        instruction->name(), index.ToString());
+  }
+  return *result;
+}
+
 absl::StatusOr<BufferAllocation::Slice>
 BufferAssignment::GetUniqueTopLevelSlice(
     const HloInstruction* instruction) const {
diff --git a/third_party/xla/xla/service/buffer_assignment.h b/third_party/xla/xla/service/buffer_assignment.h
index 709833efe98336..67f6cfabfde934 100644
--- a/third_party/xla/xla/service/buffer_assignment.h
+++ b/third_party/xla/xla/service/buffer_assignment.h
@@ -51,6 +51,7 @@ limitations under the License.
 #include "xla/service/hlo_value.h"
 #include "xla/service/logical_buffer.h"
 #include "xla/service/memory_space_assignment/memory_space_assignment.h"
+#include "xla/shape.h"
 #include "xla/shape_util.h"
 
 namespace xla {
@@ -493,6 +494,9 @@ class BufferAssignment {
   // the slice cannot be determined at compile time then an error is returned.
   absl::StatusOr<BufferAllocation::Slice> GetUniqueSlice(
       const HloInstruction* instruction, const ShapeIndex& index) const;
+  absl::StatusOr<Shape> GetShapeForUniqueSlice(
+      const HloInstruction* instruction, const ShapeIndex& index) const;
+
   // Like GetUniqueSlice but fixes the index to the top-level of the shape
   // (index = {}).
   absl::StatusOr<BufferAllocation::Slice> GetUniqueTopLevelSlice(
diff --git a/third_party/xla/xla/service/buffer_assignment_test.cc b/third_party/xla/xla/service/buffer_assignment_test.cc
index 7627b0afd76bab..56b5c5d34f8ab2 100644
--- a/third_party/xla/xla/service/buffer_assignment_test.cc
+++ b/third_party/xla/xla/service/buffer_assignment_test.cc
@@ -2504,6 +2504,10 @@ TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) {
   // Verify 'weights1' and read-only use while1{1} alias.
   EXPECT_EQ(assignment->GetUniqueSlice(weights1, {}).value(),
             assignment->GetUniqueSlice(while1, {1}).value());
+
+  TF_ASSERT_OK_AND_ASSIGN(Shape shape,
+                          assignment->GetShapeForUniqueSlice(while1, {1}));
+  EXPECT_EQ(shape, data_shape_);
 }
 
 // Tests that two colocated buffer sets are not merged if an entry parameter
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 4796e6b3e31cbd..daa3908e799508 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -567,6 +567,7 @@ cc_library(
         "//xla/tools:hlo_decomposer_lib",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/protobuf:dnn_proto_cc",
         "@com_google_absl//absl/container:flat_hash_map",
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 72bc9096cc4d34..2519719f9a727f 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -153,7 +153,6 @@ limitations under the License.
 #include "xla/service/llvm_ir/buffer_assignment_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
-#include "xla/status_macros.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/gpu_blas_lt.h"
 #include "xla/stream_executor/launch_dim.h"
@@ -168,6 +167,7 @@ limitations under the License.
 #include "tsl/platform/casts.h"
 #include "tsl/platform/human_readable_json.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla::gpu {
 namespace {
@@ -523,21 +523,20 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCommandBufferThunk(
 
 absl::StatusOr<ThunkSequence> ThunkEmitter::EmitConvolutionThunk(
     const HloCustomCallInstruction* instr) {
-  std::vector<BufferAllocation::Slice> operand_slices;
+  std::vector<ShapedSlice> operand_slices;
   operand_slices.reserve(instr->operand_count());
   for (const HloInstruction* operand : instr->operands()) {
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
-                        GetAllocationSliceForHlo(operand, {}));
+    ASSIGN_OR_RETURN(ShapedSlice slice, GetShapedSliceForHlo(operand, {}));
     operand_slices.push_back(slice);
   }
 
   // The first and the last element in the result tuple for a convolution are
   // always the result and the scratch buffer. It may have auxiliary results in
   // addition to the main result.
-  std::vector<BufferAllocation::Slice> result_slices;
+  std::vector<ShapedSlice> result_slices;
   for (int i = 0; i < instr->shape().tuple_shapes().size() - 1; i++) {
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice result_slice,
-                        GetAllocationSliceForHlo(instr, {i}));
+    ASSIGN_OR_RETURN(ShapedSlice result_slice,
+                     GetShapedSliceForHlo(instr, {i}));
     result_slices.push_back(result_slice);
   }
 
@@ -926,8 +925,18 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitPtxCustomCall(
 
 absl::StatusOr<BufferAllocation::Slice> ThunkEmitter::GetAllocationSliceForHlo(
     const HloInstruction* instr, const ShapeIndex& index) const {
-  return xla::gpu::GetAllocationSlice(ir_emitter_context_->buffer_assignment(),
-                                      instr, index);
+  return ir_emitter_context_->buffer_assignment().GetUniqueSlice(instr, index);
+}
+
+absl::StatusOr<ShapedSlice> ThunkEmitter::GetShapedSliceForHlo(
+    const HloInstruction* instr, const ShapeIndex& index) const {
+  ASSIGN_OR_RETURN(BufferAllocation::Slice slice,
+                   GetAllocationSliceForHlo(instr, index));
+  ASSIGN_OR_RETURN(
+      Shape shape,
+      ir_emitter_context_->buffer_assignment().GetShapeForUniqueSlice(instr,
+                                                                      index));
+  return ShapedSlice{slice, shape};
 }
 
 absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCubDeviceRadixSort(
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.h b/third_party/xla/xla/service/gpu/thunk_emitter.h
index a478a2aff0729a..0680fa2cd9c8c2 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.h
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/host_send_recv_thunk.h"
 #include "xla/backends/gpu/runtime/nvshmem_collective_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -214,7 +215,9 @@ class ThunkEmitter {
   absl::Status AssertNonDeterminismIsOkay(const std::string& op_name);
 
   absl::StatusOr<BufferAllocation::Slice> GetAllocationSliceForHlo(
-      const HloInstruction* hlo, const ShapeIndex& index = {}) const;
+      const HloInstruction* instr, const ShapeIndex& index = {}) const;
+  absl::StatusOr<ShapedSlice> GetShapedSliceForHlo(
+      const HloInstruction* instr, const ShapeIndex& index = {}) const;
 
   CollectivesAsyncEvents& GetCollectivesAsyncEvents() {
     return ir_emitter_context_->collectives_async_events();

From 39d92388316cbd73786efceb69e17a5ea4eaf1a4 Mon Sep 17 00:00:00 2001
From: Fengwu Yao <fengwuyao@google.com>
Date: Wed, 17 Dec 2025 17:58:54 -0800
Subject: [PATCH 480/753] Update to use half data type in Cast kernel.

PiperOrigin-RevId: 845989438
---
 tensorflow/lite/kernels/BUILD        |  1 +
 tensorflow/lite/kernels/cast.cc      | 39 +++++++------------
 tensorflow/lite/kernels/cast_test.cc | 56 +++++++++++++++++++++++++++-
 tensorflow/lite/types/BUILD          |  4 ++
 4 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index db2435b081d36b..5a47ace22d912b 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -834,6 +834,7 @@ cc_library(
         "@ruy//ruy/profiler:instrumentation",
         "//tensorflow/lite/c:c_api_types",
         "//tensorflow/lite/c:common",
+        "//tensorflow/lite/types:half",
         "//tensorflow/lite:array",
         "//tensorflow/lite:builtin_ops",
         "//tensorflow/lite:cc_api_stable",
diff --git a/tensorflow/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc
index 192a552bca4ea2..3560c21e5d498a 100644
--- a/tensorflow/lite/kernels/cast.cc
+++ b/tensorflow/lite/kernels/cast.cc
@@ -29,6 +29,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
+#include "tensorflow/lite/types/fp16.h"
+#include "tensorflow/lite/types/half.h"
 
 #ifdef __ARM_NEON
 #include <arm_neon.h>
@@ -99,17 +101,9 @@ void copyCast(const std::complex<float>* in, std::complex<float>* out,
 }
 
 template <typename ToT>
-void copyCast(const Eigen::half* in, ToT* out, int num_elements) {
-  std::transform(in, in + num_elements, out, [](Eigen::half a) {
-    return static_cast<ToT>(Eigen::half_impl::half_to_float(a));
-  });
-}
-
-template <>
-void copyCast(const Eigen::half* in, std::complex<float>* out,
-              int num_elements) {
-  std::transform(in, in + num_elements, out, [](Eigen::half a) {
-    return std::complex<float>(Eigen::half_impl::half_to_float(a));
+void copyCast(const half* in, ToT* out, int num_elements) {
+  std::transform(in, in + num_elements, out, [](half a) {
+    return static_cast<ToT>(fp16_ieee_to_fp32_value(a));
   });
 }
 
@@ -122,33 +116,26 @@ void copyCast(const Eigen::bfloat16* in, std::complex<float>* out,
 }
 
 template <typename FromT>
-void copyCastToFloat16(const FromT* in, Eigen::half* out, int num_elements) {
+void copyCastToFloat16(const FromT* in, half* out, int num_elements) {
   std::transform(in, in + num_elements, out, [](FromT a) {
-    return Eigen::half_impl::float_to_half_rtne(static_cast<float>(a));
+    return half::from_bits(fp16_ieee_from_fp32_value(static_cast<float>(a)));
   });
 }
 
 template <>
-void copyCastToFloat16(const std::complex<float>* in, Eigen::half* out,
+void copyCastToFloat16(const std::complex<float>* in, half* out,
                        int num_elements) {
   std::transform(in, in + num_elements, out, [](std::complex<float> a) {
-    return Eigen::half_impl::float_to_half_rtne(std::real(a));
+    return half::from_bits(fp16_ieee_from_fp32_value(std::real(a)));
   });
 }
 
 template <>
-void copyCastToFloat16(const Eigen::half* in, Eigen::half* out,
-                       int num_elements) {
-  std::transform(in, in + num_elements, out, [](Eigen::half a) { return a; });
-}
-
-template <>
-void copyCastToFloat16(const Eigen::bfloat16* in, Eigen::half* out,
-                       int num_elements) {
+void copyCastToFloat16(const Eigen::bfloat16* in, half* out, int num_elements) {
   // bfloat16 -> float -> half (fp16)
   std::transform(in, in + num_elements, out, [](Eigen::bfloat16 a) {
-    return Eigen::half_impl::float_to_half_rtne(
-        Eigen::bfloat16_impl::bfloat16_to_float(a));
+    return half::from_bits(
+        fp16_ieee_from_fp32_value(Eigen::bfloat16_impl::bfloat16_to_float(a)));
   });
 }
 
@@ -310,7 +297,7 @@ TfLiteStatus copyToTensor(TfLiteContext* context, const FromT* in,
       copyCast(in, out->data.int8, num_elements);
       break;
     case kTfLiteFloat16:
-      copyCastToFloat16(in, reinterpret_cast<Eigen::half*>(out->data.f16),
+      copyCastToFloat16(in, reinterpret_cast<half*>(out->data.f16),
                         num_elements);
       break;
     case kTfLiteBFloat16:
diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
index bcc9b4bc058003..09cc8fbfbda37c 100644
--- a/tensorflow/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/random/random.h"
 #include "absl/types/span.h"
-#include "Eigen/Core"  // from @eigen_archive
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/core/c/c_api_types.h"
 #include "tensorflow/lite/kernels/cast_test_common.h"
@@ -461,6 +460,61 @@ TEST(CastOpModel, CastBFloat16ToFloat) {
                   /*max_abs_err=*/0.05f)));
 }
 
+TEST(CastOpModel, CastFloat16ToInt32) {
+  CastOpModel m({TensorType_FLOAT16, {1, 6}}, {TensorType_INT32, {1, 6}});
+  m.PopulateTensor<half>(m.input(),
+                         {static_cast<half>(100.f), static_cast<half>(20.f),
+                          static_cast<half>(3.f), static_cast<half>(0.4f),
+                          static_cast<half>(0.999f), static_cast<half>(1.1f)});
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.ExtractVector<int32_t>(m.output()),
+              ElementsAreArray({100, 20, 3, 0, 0, 1}));
+}
+
+TEST(CastOpModel, CastInt32ToFloat16) {
+  CastOpModel m({TensorType_INT32, {1, 6}}, {TensorType_FLOAT16, {1, 6}});
+  m.PopulateTensor<int32_t>(m.input(), {100, 20, 3, 0, 1, -1});
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(
+      m.ExtractVector<half>(m.output()),
+      ElementsAreArray({static_cast<half>(100.f), static_cast<half>(20.f),
+                        static_cast<half>(3.f), static_cast<half>(0.f),
+                        static_cast<half>(1.f), static_cast<half>(-1.f)}));
+}
+
+TEST(CastOpModel, CastFloat16ToBFloat16) {
+  CastOpModel m({TensorType_FLOAT16, {1, 6}}, {TensorType_BFLOAT16, {1, 6}});
+  m.PopulateTensor<half>(m.input(),
+                         {static_cast<half>(100.f), static_cast<half>(20.f),
+                          static_cast<half>(3.f), static_cast<half>(0.4f),
+                          static_cast<half>(0.999f), static_cast<half>(1.1f)});
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.ExtractVector<Eigen::bfloat16>(m.output()),
+              ElementsAreArray({static_cast<Eigen::bfloat16>(100.f),
+                                static_cast<Eigen::bfloat16>(20.f),
+                                static_cast<Eigen::bfloat16>(3.f),
+                                static_cast<Eigen::bfloat16>(0.4f),
+                                static_cast<Eigen::bfloat16>(0.999f),
+                                static_cast<Eigen::bfloat16>(1.1f)}));
+}
+
+TEST(CastOpModel, CastBFloat16ToFloat16) {
+  CastOpModel m({TensorType_BFLOAT16, {1, 6}}, {TensorType_FLOAT16, {1, 6}});
+  m.PopulateTensor<Eigen::bfloat16>(
+      m.input(),
+      {static_cast<Eigen::bfloat16>(100.f), static_cast<Eigen::bfloat16>(20.f),
+       static_cast<Eigen::bfloat16>(3.f), static_cast<Eigen::bfloat16>(0.4f),
+       static_cast<Eigen::bfloat16>(0.999f),
+       static_cast<Eigen::bfloat16>(1.1f)});
+  ASSERT_EQ(m.Invoke(), kTfLiteOk);
+  EXPECT_THAT(m.ExtractVector<half>(m.output()),
+              ElementsAreArray(ArrayFloatNear(
+                  {static_cast<half>(100.f), static_cast<half>(20.f),
+                   static_cast<half>(3.f), static_cast<half>(0.4f),
+                   static_cast<half>(0.999f), static_cast<half>(1.1f)},
+                  /*max_abs_err=*/0.05f)));
+}
+
 TEST(CastOpModel, CastConstInputCachingWorks) {
   // This tests the implementation of a performance optimization. If that
   // optimization is changed, this test will likely break/need to be updated.
diff --git a/tensorflow/lite/types/BUILD b/tensorflow/lite/types/BUILD
index c00aadb6ae46e9..0bc596f7782e2a 100644
--- a/tensorflow/lite/types/BUILD
+++ b/tensorflow/lite/types/BUILD
@@ -28,4 +28,8 @@ cc_library(
         "fp16.h",
         "half.h",
     ],
+    # copybara:uncomment_begin(google-only)
+    # compatible_with = ["//buildenv/target:non_prod"],
+    # copybara:uncomment_end
+    deps = ["@FP16"],
 )

From ebdfe2dc372e10c856578b56e1e8c2fce52d3874 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Wed, 17 Dec 2025 18:46:25 -0800
Subject: [PATCH 481/753] Removed unused `GetTaskState` from coordination
 service.

PiperOrigin-RevId: 846002428
---
 .../coordination/coordination_client.h        |  6 -----
 .../coordination/coordination_service.cc      | 13 -----------
 .../coordination/coordination_service.h       |  4 ----
 .../coordination_service_agent.cc             | 23 -------------------
 .../coordination/coordination_service_agent.h |  4 ----
 .../coordination_service_agent_test.cc        |  6 -----
 .../coordination_service_rpc_handler.cc       | 16 -------------
 .../coordination_service_rpc_handler.h        |  4 ----
 .../coordination/coordination_service_test.cc |  1 -
 .../coordination/grpc_coordination_client.cc  | 12 ----------
 .../grpc_coordination_service_impl.cc         |  1 -
 .../grpc_coordination_service_impl.h          |  1 -
 12 files changed, 91 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
index 39246d62754382..c7e70795b97fba 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_client.h
@@ -36,8 +36,6 @@ using tensorflow::GetKeyValueDirRequest;
 using tensorflow::GetKeyValueDirResponse;
 using tensorflow::GetKeyValueRequest;
 using tensorflow::GetKeyValueResponse;
-using tensorflow::GetTaskStateRequest;
-using tensorflow::GetTaskStateResponse;
 using tensorflow::HeartbeatRequest;
 using tensorflow::HeartbeatResponse;
 using tensorflow::IncrementKeyValueRequest;
@@ -82,10 +80,6 @@ class CoordinationClient {
                               ResetTaskResponse* response,
                               tsl::StatusCallback done) = 0;
 
-  virtual void GetTaskStateAsync(const GetTaskStateRequest* request,
-                                 GetTaskStateResponse* response,
-                                 tsl::StatusCallback done) = 0;
-
   virtual void WatchJobStateAsync(tsl::CallOptions* call_opts,
                                   const WatchJobStateRequest* request,
                                   WatchJobStateResponse* response,
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
index 73fbf4c0e04b07..0326b3c5cbb270 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
@@ -796,19 +796,6 @@ CoordinatedTaskStateInfo CoordinationService::CreateTaskStateInfo(
   return info;
 }
 
-std::vector<CoordinatedTaskStateInfo> CoordinationService::GetTaskState(
-    const std::vector<CoordinatedTask>& tasks) {
-  std::vector<CoordinatedTaskStateInfo> states_info;
-  states_info.reserve(tasks.size());
-
-  absl::MutexLock l(state_mu_);
-  for (const auto& task : tasks) {
-    states_info.push_back(
-        CreateTaskStateInfo(task, *cluster_state_[GetTaskName(task)]));
-  }
-  return states_info;
-}
-
 std::vector<CoordinatedTaskStateInfo> CoordinationService::GetJobState(
     absl::string_view job_name) {
   std::vector<CoordinatedTaskStateInfo> states_info;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
index c10c90fbbd7588..1c302da245c90a 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
@@ -173,10 +173,6 @@ class CoordinationService {
   absl::Status ReportTaskError(const tensorflow::CoordinatedTask& task,
                                const absl::Status& error);
 
-  // Get the state and the error status of the tasks.
-  std::vector<tensorflow::CoordinatedTaskStateInfo> GetTaskState(
-      const std::vector<tensorflow::CoordinatedTask>& task);
-
   // Watches the state and the error status of the job.
   using WatchJobStateCallback = absl::AnyInvocable<void(
       std::vector<tensorflow::CoordinatedTaskStateInfo>, int64_t)>;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index a883a870dc028c..bc6788cb7ecabc 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -362,29 +362,6 @@ absl::StatusOr<CoordinatedTask> CoordinationServiceAgent::GetOwnTask() {
   return task_;
 }
 
-absl::StatusOr<std::vector<CoordinatedTaskStateInfo>>
-CoordinationServiceAgent::GetTaskState(
-    const std::vector<CoordinatedTask>& tasks) {
-  GetTaskStateRequest request;
-  *request.mutable_source_task() = {tasks.begin(), tasks.end()};
-  GetTaskStateResponse response;
-  absl::Notification n;
-  absl::StatusOr<std::vector<CoordinatedTaskStateInfo>> result;
-  leader_client_->GetTaskStateAsync(
-      &request, &response, [&](const absl::Status& s) {
-        if (s.ok()) {
-          result = std::vector<CoordinatedTaskStateInfo>(
-              std::make_move_iterator(response.task_state().begin()),
-              std::make_move_iterator(response.task_state().end()));
-        } else {
-          result = s;
-        }
-        n.Notify();
-      });
-  n.WaitForNotification();
-  return result;
-}
-
 std::shared_ptr<tsl::CallOptions> CoordinationServiceAgent::WatchJobStateAsync(
     absl::string_view job_name, std::optional<int64_t> version_number,
     std::function<void(absl::StatusOr<tensorflow::WatchJobStateResponse>)>
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
index 3671f98aa11007..cbec2894d15793 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
@@ -163,10 +163,6 @@ class CoordinationServiceAgent {
   // Get task associated with this agent.
   absl::StatusOr<tensorflow::CoordinatedTask> GetOwnTask();
 
-  // Get status of a remote task.
-  absl::StatusOr<std::vector<tensorflow::CoordinatedTaskStateInfo>>
-  GetTaskState(const std::vector<tensorflow::CoordinatedTask>& task);
-
   // Watches the status of a remote job.
   absl::StatusOr<tensorflow::WatchJobStateResponse> WatchJobState(
       absl::string_view job_name, std::optional<int64_t> version_number);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
index 5e0f80b504d979..43c59e7a611ce5 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
@@ -120,10 +120,6 @@ class TestCoordinationClient : public CoordinationClient {
               (const GetAliveTasksRequest*, GetAliveTasksResponse*,
                tsl::StatusCallback),
               (override));
-  MOCK_METHOD(void, GetTaskStateAsync,
-              (const GetTaskStateRequest*, GetTaskStateResponse*,
-               tsl::StatusCallback),
-              (override));
   MOCK_METHOD(void, WatchJobStateAsync,
               (tsl::CallOptions*, const WatchJobStateRequest*,
                WatchJobStateResponse*, tsl::StatusCallback),
@@ -153,8 +149,6 @@ class CoordinationServiceAgentTest : public ::testing::Test {
         .WillByDefault(InvokeArgument<3>(absl::OkStatus()));
     ON_CALL(*client_, CancelBarrierAsync(_, _, _))
         .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
-    ON_CALL(*client_, GetTaskStateAsync(_, _, _))
-        .WillByDefault(InvokeArgument<2>(absl::OkStatus()));
   }
 
   // Should be called after mocking service responses, before testing the agent.
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
index d07dd613bb7667..a94a4189881c4e 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.cc
@@ -116,22 +116,6 @@ void CoordinationServiceRpcHandler::ResetTaskAsync(
   done(service_->ResetTask(request->source_task()));
 }
 
-void CoordinationServiceRpcHandler::GetTaskStateAsync(
-    const tensorflow::GetTaskStateRequest* request,
-    tensorflow::GetTaskStateResponse* response, tsl::StatusCallback done) {
-  absl::ReaderMutexLock l(mu_);
-  if (service_ == nullptr) {
-    done(MakeCoordinationError(
-        absl::InternalError("Coordination service is not enabled.")));
-    return;
-  }
-  auto result = service_->GetTaskState(
-      {request->source_task().begin(), request->source_task().end()});
-  absl::c_move(result, tsl::protobuf::RepeatedFieldBackInserter(
-                           response->mutable_task_state()));
-  done(absl::OkStatus());
-}
-
 void CoordinationServiceRpcHandler::WatchJobStateAsync(
     const tensorflow::WatchJobStateRequest* request,
     tensorflow::WatchJobStateResponse* response, tsl::StatusCallback done) {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
index a5cf6bfea97288..04e7645cd9c766 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_rpc_handler.h
@@ -48,10 +48,6 @@ class CoordinationServiceRpcHandler {
                       tensorflow::ResetTaskResponse* response,
                       tsl::StatusCallback done);
 
-  void GetTaskStateAsync(const tensorflow::GetTaskStateRequest* request,
-                         tensorflow::GetTaskStateResponse* response,
-                         tsl::StatusCallback done);
-
   void WatchJobStateAsync(const tensorflow::WatchJobStateRequest* request,
                           tensorflow::WatchJobStateResponse* response,
                           tsl::StatusCallback done);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
index 2a31a73e68cab4..ec75076d4c6208 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
@@ -111,7 +111,6 @@ class TestCoordinationClient : public CoordinationClient {
   }
 
   UNIMPLEMENTED(ResetTask);
-  UNIMPLEMENTED(GetTaskState);
   UNIMPLEMENTED(InsertKeyValue);
   UNIMPLEMENTED(TryGetKeyValue);
   UNIMPLEMENTED(IncrementKeyValue);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
index fda1f65f1c9947..9e7a3269aa9655 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_client.cc
@@ -53,8 +53,6 @@ using tensorflow::GetKeyValueDirRequest;
 using tensorflow::GetKeyValueDirResponse;
 using tensorflow::GetKeyValueRequest;
 using tensorflow::GetKeyValueResponse;
-using tensorflow::GetTaskStateRequest;
-using tensorflow::GetTaskStateResponse;
 using tensorflow::HeartbeatRequest;
 using tensorflow::HeartbeatResponse;
 using tensorflow::IncrementKeyValueRequest;
@@ -162,16 +160,6 @@ class GrpcCoordinationClient : public CoordinationClient {
         /*fail_fast=*/true, &target_);
   }
 
-  void GetTaskStateAsync(const GetTaskStateRequest* request,
-                         GetTaskStateResponse* response,
-                         tsl::StatusCallback done) override {
-    new tsl::RPCState<tsl::protobuf::Message>(
-        &stub_, cq_, "/tensorflow.CoordinationService/GetTaskState", *request,
-        response, std::move(done), /*call_opts=*/nullptr,
-        /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true,
-        &target_);
-  }
-
   void WatchJobStateAsync(tsl::CallOptions* call_opts,
                           const WatchJobStateRequest* request,
                           WatchJobStateResponse* response,
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
index c92026bd6d7b84..1e4c9030f4aed9 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.cc
@@ -47,7 +47,6 @@ void GrpcCoordinationServiceImpl::HandleRPCsLoop() {
   ENQUEUE_REQUEST(ShutdownTask);
   ENQUEUE_REQUEST(ResetTask);
   ENQUEUE_REQUEST(Heartbeat);
-  ENQUEUE_REQUEST(GetTaskState);
   ENQUEUE_REQUEST(WatchJobState);
   ENQUEUE_REQUEST(InsertKeyValue);
   ENQUEUE_REQUEST(GetKeyValue);
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
index 8a5e5a3e307022..bf5941d78deadc 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/grpc_coordination_service_impl.h
@@ -88,7 +88,6 @@ class GrpcCoordinationServiceImpl : public tsl::AsyncServiceInterface {
   HANDLER(ShutdownTask);
   HANDLER(ResetTask);
   HANDLER(Heartbeat);
-  HANDLER(GetTaskState);
   HANDLER(WatchJobState);
   HANDLER(InsertKeyValue);
   HANDLER(GetKeyValue);

From b8d2866c3581883ad793be22949a1b1b19480efe Mon Sep 17 00:00:00 2001
From: "Jeffrey A. Dean" <jeff@google.com>
Date: Wed, 17 Dec 2025 19:28:15 -0800
Subject: [PATCH 482/753] Avoid redundant memset to clear the allocated backing
 store.

It turns out make_unique<...> default initializes the integer values
to 0, so this was showing up on profiles as two calls to memset (one inside
the inlined make_unique code, and then the explicit memset after the
make_unique).  For large compilations, the allocated memory can
be large.

PiperOrigin-RevId: 846017416
---
 third_party/xla/xla/hlo/analysis/hlo_reachability.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/third_party/xla/xla/hlo/analysis/hlo_reachability.cc b/third_party/xla/xla/hlo/analysis/hlo_reachability.cc
index 2cbf312f7f5a3b..54e09fa300dd9b 100644
--- a/third_party/xla/xla/hlo/analysis/hlo_reachability.cc
+++ b/third_party/xla/xla/hlo/analysis/hlo_reachability.cc
@@ -41,10 +41,8 @@ HloReachabilityMap::HloReachabilityMap(
   while (row < total_rows) {
     const int rows_to_allocate = std::min(kRowsPerAllocation, total_rows - row);
     size_t words_to_allocate = rows_to_allocate * words_per_bitset_;
+    // make_unique initializes the array of words to 0
     bit_storage_.push_back(std::make_unique<BitSet::Word[]>(words_to_allocate));
-    // Initialize all the bitsets to 0
-    memset(bit_storage_.back().get(), 0,
-           words_to_allocate * sizeof(BitSet::Word));
     row += rows_to_allocate;
   }
 

From af047e5a0ddfe475346a78a56e21ee881d259a51 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 19:38:06 -0800
Subject: [PATCH 483/753] Automated Code Change

PiperOrigin-RevId: 846020744
---
 third_party/xla/xla/service/gpu/model/BUILD                     | 2 ++
 .../xla/xla/service/gpu/model/sol_latency_estimator_test.cc     | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/model/BUILD b/third_party/xla/xla/service/gpu/model/BUILD
index 2b1781b7923784..00ba04b9c37463 100644
--- a/third_party/xla/xla/service/gpu/model/BUILD
+++ b/third_party/xla/xla/service/gpu/model/BUILD
@@ -89,12 +89,14 @@ xla_cc_test(
         ":sol_latency_estimator",
         "//xla:literal_util",
         "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_module_config",
         "//xla/service:latency_hiding_scheduler",
+        "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cuda_compute_capability",
diff --git a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
index 49756f5d8bb53e..b18d54daa25df2 100644
--- a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/utils/hlo_query.h"
 #include "xla/literal_util.h"
+#include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/service/gpu/model/collective_interpolator.h"
 #include "xla/service/gpu/model/sol_gpu_cost_model.h"
@@ -45,6 +46,7 @@ limitations under the License.
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla::gpu {
 namespace {

From ad13e0db77cfe530814233d620c8fe5d598560ab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 19:55:50 -0800
Subject: [PATCH 484/753] Automated Code Change

PiperOrigin-RevId: 846024893
---
 third_party/xla/xla/pjrt/distributed/preemption/BUILD         | 3 +++
 .../xla/xla/pjrt/distributed/preemption/preemption_notifier.h | 2 ++
 .../pjrt/distributed/preemption/preemption_notifier_test.cc   | 4 ++--
 .../distributed/preemption/preemption_sync_manager_test.cc    | 1 +
 4 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/preemption/BUILD b/third_party/xla/xla/pjrt/distributed/preemption/BUILD
index bdd4dff4e0eac1..d0cc66f91b4c70 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/preemption/BUILD
@@ -41,6 +41,8 @@ xla_cc_test(
         "//xla/tsl/platform:status",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/time",
         "@com_google_googletest//:gtest_main",
@@ -87,6 +89,7 @@ xla_cc_test(
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:test",
         "//xla/tsl/protobuf:coordination_config_proto_cc_impl",
+        "//xla/tsl/protobuf:coordination_service_proto_cc",
         "@com_github_grpc_grpc//:grpc++",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.h b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.h
index d5e073886fb41b..6080754a22f246 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.h
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier.h
@@ -22,6 +22,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/log/log.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_join.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc
index 3a012e7632c776..92da8a0acd020d 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_notifier_test.cc
@@ -15,10 +15,10 @@ limitations under the License.
 #include "xla/pjrt/distributed/preemption/preemption_notifier.h"
 
 #include <csignal>
-#include <functional>
 #include <memory>
-#include <utility>
 
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/synchronization/notification.h"
 #include "absl/time/clock.h"
 #include "absl/time/time.h"
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
index 3fed3b8b36cd1b..ab33fea7b2ab82 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/platform/threadpool.h"
 #include "xla/tsl/protobuf/coordination_config.pb.h"
+#include "xla/tsl/protobuf/coordination_service.pb.h"
 
 namespace xla {
 namespace {

From 0e07a3d2e3832807dde0b867ed7c507d572f04c3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:01:16 -0800
Subject: [PATCH 485/753] Automated Code Change

PiperOrigin-RevId: 846026272
---
 .../core/tpu/kernels/image_resize_ops.cc      |   2 +-
 tensorflow/core/tpu/kernels/infeed_ops.cc     |   4 +-
 .../core/tpu/kernels/sparse_core_ops_utils.cc |  10 +-
 .../core/tpu/kernels/sparse_core_ops_utils.h  |   6 +-
 .../tpu/kernels/sparse_core_ops_utils_test.cc |  12 +-
 .../tpu/kernels/sparse_core_preprocess_ops.cc | 194 +++++++++---------
 .../tpu/kernels/sparse_core_preprocess_ops.h  |  27 +--
 .../core/tpu/kernels/sparse_core_xla_ops.cc   |   4 +-
 tensorflow/core/tpu/kernels/topk_ops.cc       |  20 +-
 .../tpu_compilation_cache_rpc_lookup.h        |   8 +-
 .../core/tpu/kernels/tpu_compile_op_common.cc |   2 +-
 .../core/tpu/kernels/tpu_compile_op_common.h  |   4 +-
 .../tpu_embedding_engine_state_interface.h    |   2 +-
 .../tpu/kernels/tpu_embedding_enqueue_ops.h   |   3 +-
 .../core/tpu/kernels/tpu_functional_ops.h     |  36 ++--
 .../tpu/kernels/tpu_mesh_state_interface.h    |   2 +-
 tensorflow/core/tpu/kernels/tpu_op_util.cc    |  12 +-
 tensorflow/core/tpu/kernels/tpu_op_util.h     |   8 +-
 .../core/tpu/kernels/tpu_ordinal_selector.h   |   2 +-
 .../kernels/tpu_ordinal_selector_interface.h  |   2 +-
 tensorflow/core/tpu/kernels/tpu_pod_state.cc  |   2 +-
 tensorflow/core/tpu/kernels/tpu_pod_state.h   |   2 +-
 tensorflow/core/tpu/kernels/tpu_util.cc       |   2 +-
 tensorflow/core/tpu/kernels/transfer_ops.cc   |   8 +-
 24 files changed, 190 insertions(+), 184 deletions(-)

diff --git a/tensorflow/core/tpu/kernels/image_resize_ops.cc b/tensorflow/core/tpu/kernels/image_resize_ops.cc
index 7e255bab054550..dfc4077e8f10a1 100644
--- a/tensorflow/core/tpu/kernels/image_resize_ops.cc
+++ b/tensorflow/core/tpu/kernels/image_resize_ops.cc
@@ -57,7 +57,7 @@ class TpuCustomResizeOp : public XlaOpKernel {
     return output_shape;
   }
 
-  string OpaqueField() const {
+  std::string OpaqueField() const {
     return absl::StrCat("\"", align_corners_, half_pixel_centers_, "\"");
   }
 
diff --git a/tensorflow/core/tpu/kernels/infeed_ops.cc b/tensorflow/core/tpu/kernels/infeed_ops.cc
index d59c6c4b6d4683..2d13813db101cf 100644
--- a/tensorflow/core/tpu/kernels/infeed_ops.cc
+++ b/tensorflow/core/tpu/kernels/infeed_ops.cc
@@ -188,7 +188,9 @@ struct LinearizedBuffersWrapper {
   ~LinearizedBuffersWrapper() = default;
 
   // These functions are tensorflow::Variant requirements.
-  string TypeName() const { return "(anonymous)::LinearizedBuffersWrapper"; }
+  std::string TypeName() const {
+    return "(anonymous)::LinearizedBuffersWrapper";
+  }
   void Encode(tensorflow::VariantTensorData* data) const {
     LOG(ERROR) << "Encode() is not implemented for LinearizedBuffersWrapper "
                   "objects.";
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
index 182f5bf29ca32b..2fa5972f29af46 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.cc
@@ -44,10 +44,10 @@ limitations under the License.
 
 namespace tensorflow {
 
-std::vector<int> ConvertBinarySplitsToBucketSplits(int64 split,
+std::vector<int> ConvertBinarySplitsToBucketSplits(int64_t split,
                                                    int max_division_level) {
   std::vector<int> bucket_splits;
-  uint32 current_index = 0;
+  uint32_t current_index = 0;
   while (split > 0) {
     if (split % 2 == 1) {
       int split_level = absl::bit_width(current_index + 1) - 1;
@@ -62,9 +62,9 @@ std::vector<int> ConvertBinarySplitsToBucketSplits(int64 split,
   return bucket_splits;
 }
 
-int64 ConvertBucketSplitsToBinarySplits(std::vector<int> bucket_splits,
-                                        int max_division_level) {
-  int64 binary_splits = 0;
+int64_t ConvertBucketSplitsToBinarySplits(std::vector<int> bucket_splits,
+                                          int max_division_level) {
+  int64_t binary_splits = 0;
   for (auto& bucket_split : bucket_splits) {
     int split_level = max_division_level - 1;
     while (bucket_split > 0 && bucket_split % 2 == 0) {
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
index 72419504760aa6..cd958fc5d2218d 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils.h
@@ -33,11 +33,11 @@ namespace tensorflow {
 // Pad value used for SparseCore mini batching logic.
 const int32_t kXlaPadValue = std::numeric_limits<int32_t>::max();
 
-std::vector<int> ConvertBinarySplitsToBucketSplits(int64 split,
+std::vector<int> ConvertBinarySplitsToBucketSplits(int64_t split,
                                                    int max_division_level);
 
-int64 ConvertBucketSplitsToBinarySplits(std::vector<int> bucket_splits,
-                                        int max_division_level);
+int64_t ConvertBucketSplitsToBinarySplits(std::vector<int> bucket_splits,
+                                          int max_division_level);
 
 absl::Status ValidateInputCombiner(const std::string& combiner);
 
diff --git a/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc b/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc
index 9af20e1f2a540d..6a241cdb3a3795 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_ops_utils_test.cc
@@ -25,11 +25,11 @@ namespace {
 TEST(ConvertSplitsAndBackTest, Split0) {
   const int max_division_level = 6;
 
-  int64 original_split = 0;
+  int64_t original_split = 0;
   std::vector<int> actual_buckets =
       ConvertBinarySplitsToBucketSplits(original_split, max_division_level);
   std::vector<int> expected_buckets = {};
-  int64 re_split =
+  int64_t re_split =
       ConvertBucketSplitsToBinarySplits(expected_buckets, max_division_level);
   ASSERT_EQ(re_split, original_split);
 }
@@ -37,11 +37,11 @@ TEST(ConvertSplitsAndBackTest, Split0) {
 TEST(ConvertSplitsAndBackTest, Split2) {
   const int max_division_level = 6;
 
-  int64 original_split = 2;
+  int64_t original_split = 2;
   std::vector<int> actual_buckets =
       ConvertBinarySplitsToBucketSplits(original_split, max_division_level);
   std::vector<int> expected_buckets = {16};
-  int64 re_split =
+  int64_t re_split =
       ConvertBucketSplitsToBinarySplits(expected_buckets, max_division_level);
   ASSERT_EQ(re_split, original_split);
 }
@@ -49,11 +49,11 @@ TEST(ConvertSplitsAndBackTest, Split2) {
 TEST(ConvertSplitsAndBackTest, Split3) {
   const int max_division_level = 6;
 
-  int64 original_split = 3;
+  int64_t original_split = 3;
   std::vector<int> actual_buckets =
       ConvertBinarySplitsToBucketSplits(original_split, max_division_level);
   std::vector<int> expected_buckets = {16, 32};
-  int64 re_split =
+  int64_t re_split =
       ConvertBucketSplitsToBinarySplits(expected_buckets, max_division_level);
   ASSERT_EQ(re_split, original_split);
 }
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
index 0815f742b4e9e5..ddd47e0d53c701 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.cc
@@ -118,9 +118,9 @@ absl::Status ValidateInputs(const Tensor& indices_or_row_splits,
 }
 
 absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
-                                        const int32 total_id_count,
-                                        const int32 sample_count,
-                                        int32* row_ids_before_padding,
+                                        const int32_t total_id_count,
+                                        const int32_t sample_count,
+                                        int32_t* row_ids_before_padding,
                                         std::vector<int> shape_strides) {
   // The only difference between dense tensor, sparse tensor and ragged tensor
   // is the row ids output.
@@ -129,7 +129,7 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
     // Row ids are just the index ids.
     // Note: this path is also taken when the input is a ragged/sparse tensor
     // with 0 elements. In that case, the row_ids will just be empty as well.
-    for (int32 i = 0; i < total_id_count; ++i) {
+    for (int32_t i = 0; i < total_id_count; ++i) {
       *(row_ids_before_padding + i) = i;
     }
   } else if (indices_or_row_splits.dims() == 2 &&
@@ -140,12 +140,12 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
     // For 2D sparse tensor, as we always combine on the last dimension.
     // The row ids are just the sample ids which is the first dim of the
     // indices.
-    auto indices_matrix = indices_or_row_splits.matrix<int32>();
+    auto indices_matrix = indices_or_row_splits.matrix<int32_t>();
     // TODO(b/432045101): remove this once the bug is fixed.
     if (indices_matrix.dimension(1) == 2) {
-      int32 previous_row_id = -1;
-      for (int32 i = 0; i < total_id_count; ++i) {
-        int32 current_row_id = indices_matrix(i, 0);
+      int32_t previous_row_id = -1;
+      for (int32_t i = 0; i < total_id_count; ++i) {
+        int32_t current_row_id = indices_matrix(i, 0);
         if (current_row_id < previous_row_id) {
           return absl::InvalidArgumentError(
               "Invalid indices_or_row_splits input, indices of SparseTensor "
@@ -173,7 +173,7 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
             "Invalid shape_strides input, expected non-empty shape_strides for "
             "SparseTensor with rank > 2.");
       }
-      int32 previous_row_id = -1;
+      int32_t previous_row_id = -1;
       int32_t rank = indices_matrix.dimension(1) - 1;
       for (int32_t i = 0; i < total_id_count; ++i) {
         int32_t current_row_id = 0;
@@ -205,10 +205,10 @@ absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
   } else if (indices_or_row_splits.dims() == 1 &&
              indices_or_row_splits.NumElements() > 0) {
     // Ragged tensor to COO format.
-    const int32* indices_or_row_splits_ptr =
-        indices_or_row_splits.flat<int32>().data();
-    int32 current_row_id = -1;
-    for (int32 i = 0; i < total_id_count; ++i) {
+    const int32_t* indices_or_row_splits_ptr =
+        indices_or_row_splits.flat<int32_t>().data();
+    int32_t current_row_id = -1;
+    for (int32_t i = 0; i < total_id_count; ++i) {
       while (i == *(indices_or_row_splits_ptr + 1 + current_row_id)) {
         current_row_id += 1;
       }
@@ -308,7 +308,7 @@ absl::Status SortDedupAndCountStatsOfCooTensor(
     uint32_t previous_id_array_index = 0;
     for (int32_t index = 0; index < total_id_count; ++index) {
       uint64_t item = per_feature_col_ids_index_list[index];
-      int32 col_id = item >> 32;
+      int32_t col_id = item >> 32;
       uint32_t id_array_index = item & 0xffffffff;
       int32_t row_id = *(row_ids_ptr + id_array_index);
       // If the row ids and col ids are both same as the previous one,
@@ -362,9 +362,9 @@ class ConvertToCooTensorOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ValidateInputs(*indices_or_row_splits, *values,
                                        *weights, sample_count_));
 
-    const int32 total_id_count = values->NumElements();
+    const int32_t total_id_count = values->NumElements();
 
-    auto row_ids_before_dedup = std::make_unique<int32[]>(total_id_count);
+    auto row_ids_before_dedup = std::make_unique<int32_t[]>(total_id_count);
 
     OP_REQUIRES_OK(ctx, ComputeRowIdsBeforePadding(
                             *indices_or_row_splits, total_id_count,
@@ -382,14 +382,14 @@ class ConvertToCooTensorOp : public OpKernel {
     auto combiner_scale_transform_fn =
         GetCombinerScaleTransformFunction(combiner_);
 
-    const int32* row_ids_before_dedup_ptr = row_ids_before_dedup.get();
-    const int32* values_ptr = values->flat<int32>().data();
+    const int32_t* row_ids_before_dedup_ptr = row_ids_before_dedup.get();
+    const int32_t* values_ptr = values->flat<int32_t>().data();
     const float* weights_ptr = weights->flat<float>().data();
 
     // Dedup the ids within one sample by just checking the adjacent ids. This
     // will NOT result in a full deduplication.
-    std::vector<int32> row_ids;
-    std::vector<int32> col_ids;
+    std::vector<int32_t> row_ids;
+    std::vector<int32_t> col_ids;
     std::vector<float> gains;
     row_ids.reserve(total_id_count);
     col_ids.reserve(total_id_count);
@@ -400,8 +400,8 @@ class ConvertToCooTensorOp : public OpKernel {
       const float gain = *weights_ptr;
       const float rescaled_gain = combiner_scale_contribution_fn(gain);
       for (int token_id = 0; token_id < total_id_count; ++token_id) {
-        const int32 row_id = *(row_ids_before_dedup_ptr + token_id);
-        const int32 col_id = *(values_ptr + token_id);
+        const int32_t row_id = *(row_ids_before_dedup_ptr + token_id);
+        const int32_t col_id = *(values_ptr + token_id);
         if (gains_rescale.has_value()) {
           // Compute the gain rescale before doing the dedup.
           (*gains_rescale)[row_id] += rescaled_gain;
@@ -417,8 +417,8 @@ class ConvertToCooTensorOp : public OpKernel {
       }
     } else {
       for (int token_id = 0; token_id < total_id_count; ++token_id) {
-        const int32 row_id = *(row_ids_before_dedup_ptr + token_id);
-        const int32 col_id = *(values_ptr + token_id);
+        const int32_t row_id = *(row_ids_before_dedup_ptr + token_id);
+        const int32_t col_id = *(values_ptr + token_id);
         const float gain = *(weights_ptr + token_id);
         if (gains_rescale.has_value()) {
           // Compute the gain rescale before doing the dedup.
@@ -435,7 +435,7 @@ class ConvertToCooTensorOp : public OpKernel {
       }
     }
 
-    const int32 output_id_count = row_ids.size();
+    const int32_t output_id_count = row_ids.size();
 
     Tensor* gains_tensor;
     OP_REQUIRES_OK(ctx,
@@ -450,8 +450,8 @@ class ConvertToCooTensorOp : public OpKernel {
         ctx, ctx->allocate_output("col_ids", TensorShape({output_id_count}),
                                   &col_ids_tensor));
 
-    int32* row_ids_tensor_ptr = row_ids_tensor->flat<int32>().data();
-    int32* col_ids_tensor_ptr = col_ids_tensor->flat<int32>().data();
+    int32_t* row_ids_tensor_ptr = row_ids_tensor->flat<int32_t>().data();
+    int32_t* col_ids_tensor_ptr = col_ids_tensor->flat<int32_t>().data();
     float* gains_tensor_ptr = gains_tensor->flat<float>().data();
 
     if (gains_rescale.has_value()) {
@@ -535,11 +535,11 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
                           feature_width_, &max_ids_per_partition,
                           &max_unique_ids_per_partition));
 
-  const int32* row_ids_tensor_ptr = row_ids->flat<int32>().data();
-  const int32* col_ids_tensor_ptr = col_ids->flat<int32>().data();
+  const int32_t* row_ids_tensor_ptr = row_ids->flat<int32_t>().data();
+  const int32_t* col_ids_tensor_ptr = col_ids->flat<int32_t>().data();
   const float* gains_tensor_ptr = gains->flat<float>().data();
-  const int64* splits_tensor_ptr = splits->flat<int64>().data();
-  const int32* id_counts_tensor_ptr = id_counts->flat<int32>().data();
+  const int64_t* splits_tensor_ptr = splits->flat<int64_t>().data();
+  const int32_t* id_counts_tensor_ptr = id_counts->flat<int32_t>().data();
 
   const int32_t total_id_count = row_ids->NumElements();
 
@@ -556,9 +556,9 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
 
   const int max_division_level = GetMinibatchMaxDivisionLevel();
 
-  const int32 kMaxDivisions = 1 << max_division_level;
+  const int32_t kMaxDivisions = 1 << max_division_level;
 
-  int64 binary_splits = 0;
+  int64_t binary_splits = 0;
   for (int i = 0; i < splits->NumElements(); ++i) {
     binary_splits |= *(splits_tensor_ptr + i);
   }
@@ -566,7 +566,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   std::vector<int> bucket_splits =
       ConvertBinarySplitsToBucketSplits(binary_splits, max_division_level);
 
-  const int32 num_minibatch_per_sc = bucket_splits.size() + 1;
+  const int32_t num_minibatch_per_sc = bucket_splits.size() + 1;
   sparse_core_ops_stats_handler_->Record(StatsType::NUM_MINIBATCHES_PER_SC,
                                          num_minibatch_per_sc, device_name_,
                                          table_name_);
@@ -588,7 +588,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   bucket_splits.insert(bucket_splits.begin(), 0);
   bucket_splits.push_back(kMaxDivisions);
 
-  const int32 max_ids_per_chip = max_ids_per_chip_per_sample_ * sample_count_;
+  const int32_t max_ids_per_chip = max_ids_per_chip_per_sample_ * sample_count_;
 
   OP_REQUIRES(
       ctx, max_ids_per_chip % xla_pad_size == 0,
@@ -596,8 +596,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
           "The max_ids_per_chip is set to be ", max_ids_per_chip,
           " which is not divisible by the xla_pad_size ", xla_pad_size, " .")));
 
-  const int32 padded_row_pointers_size_per_sc =
-      xla::RoundUpTo<int32>(num_physical_replica, xla_pad_size);
+  const int32_t padded_row_pointers_size_per_sc =
+      xla::RoundUpTo<int32_t>(num_physical_replica, xla_pad_size);
 
   Tensor* row_pointers_tensor;
   OP_REQUIRES_OK(ctx,
@@ -619,11 +619,12 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(
       ctx, ctx->allocate_output("sorted_gains", TensorShape({max_ids_per_chip}),
                                 &sorted_gains_tensor));
-  int32* row_pointers_tensor_ptr = row_pointers_tensor->flat<int32>().data();
-  int32* sorted_sample_ids_tensor_ptr =
-      sorted_sample_ids_tensor->flat<int32>().data();
-  int32* sorted_token_ids_tensor_ptr =
-      sorted_token_ids_tensor->flat<int32>().data();
+  int32_t* row_pointers_tensor_ptr =
+      row_pointers_tensor->flat<int32_t>().data();
+  int32_t* sorted_sample_ids_tensor_ptr =
+      sorted_sample_ids_tensor->flat<int32_t>().data();
+  int32_t* sorted_token_ids_tensor_ptr =
+      sorted_token_ids_tensor->flat<int32_t>().data();
   float* sorted_gains_tensor_ptr = sorted_gains_tensor->flat<float>().data();
 
   // This packed id count is used to track how many ids we have packed into
@@ -631,8 +632,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   // dropped.
   int32_t packed_id_count = 0;
 
-  int32 global_index = 0;
-  int32 row_pointers_index = 0;
+  int32_t global_index = 0;
+  int32_t row_pointers_index = 0;
   for (int sc_id = 0; sc_id < num_sc_per_chip_; ++sc_id) {
     for (int i = 1; i < bucket_splits.size(); ++i) {
       for (int replica_id = 0; replica_id < num_physical_replica;
@@ -686,8 +687,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
         }
 
         *(row_pointers_tensor_ptr + row_pointers_index) = global_index;
-        int32 num_ids_to_pad_per_replica =
-            xla::RoundUpTo<int32>(global_index, xla_pad_size) - global_index;
+        int32_t num_ids_to_pad_per_replica =
+            xla::RoundUpTo<int32_t>(global_index, xla_pad_size) - global_index;
         std::fill_n(sorted_token_ids_tensor_ptr + global_index,
                     num_ids_to_pad_per_replica, kXlaPadValue);
         std::fill_n(sorted_sample_ids_tensor_ptr + global_index,
@@ -698,8 +699,8 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
         ++row_pointers_index;
       }
       // Pad the row_pointers to be memory aligned.
-      int32 num_row_pointers_to_pad =
-          xla::RoundUpTo<int32>(row_pointers_index, xla_pad_size) -
+      int32_t num_row_pointers_to_pad =
+          xla::RoundUpTo<int32_t>(row_pointers_index, xla_pad_size) -
           row_pointers_index;
       std::fill_n(row_pointers_tensor_ptr + row_pointers_index,
                   num_row_pointers_to_pad, global_index);
@@ -718,7 +719,7 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
                  << " . This could potentially impact the model quality.";
   }
 
-  int32 row_pointers_unpadded_size =
+  int32_t row_pointers_unpadded_size =
       total_num_minibatch * padded_row_pointers_size_per_sc;
 
   Tensor* num_minibatches_per_physical_sparse_core_tensor;
@@ -736,11 +737,11 @@ void GetMinibatchesInCsrWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(ctx, ctx->allocate_output("ids_unpadded_size", TensorShape({}),
                                            &ids_unpadded_size_tensor));
 
-  num_minibatches_per_physical_sparse_core_tensor->flat<int32>()(0) =
+  num_minibatches_per_physical_sparse_core_tensor->flat<int32_t>()(0) =
       num_minibatch_per_sc;
-  row_pointers_unpadded_size_tensor->flat<int32>()(0) =
+  row_pointers_unpadded_size_tensor->flat<int32_t>()(0) =
       row_pointers_unpadded_size;
-  ids_unpadded_size_tensor->flat<int32>()(0) = ids_unpadded_size;
+  ids_unpadded_size_tensor->flat<int32_t>()(0) = ids_unpadded_size;
 }
 
 #ifdef LIBTPU_ON_GCE
@@ -778,7 +779,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(ctx, ctx->input("program_key", &program_key_t));
   tstring program_key = program_key_t->vec<tstring>()(0);
 
-  int32 per_sc_sample_count = sample_count_ / num_sc_per_chip_;
+  int32_t per_sc_sample_count = sample_count_ / num_sc_per_chip_;
 
   int64_t max_ids_per_partition = -1;
   int64_t max_unique_ids_per_partition = -1;
@@ -802,10 +803,10 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   const Tensor* gains;
   OP_REQUIRES_OK(ctx, ctx->input("gains", &gains));
 
-  const int32 total_id_count = row_ids->NumElements();
+  const int32_t total_id_count = row_ids->NumElements();
 
-  const int32* row_ids_ptr = row_ids->flat<int32>().data();
-  const int32* col_ids_ptr = col_ids->flat<int32>().data();
+  const int32_t* row_ids_ptr = row_ids->flat<int32_t>().data();
+  const int32_t* col_ids_ptr = col_ids->flat<int32_t>().data();
   const float* gains_ptr = gains->flat<float>().data();
 
 #ifndef NDEBUG
@@ -829,7 +830,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
 
   const int max_division_level = GetMinibatchMaxDivisionLevel();
 
-  const int32 kMaxDivisions = 1 << max_division_level;
+  const int32_t kMaxDivisions = 1 << max_division_level;
 
   // The id counts tensor is the running sum of the number of ids for all
   // buckets for all the replicas on each SparseCore.
@@ -842,7 +843,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
           TensorShape(
               {kMaxDivisions * num_sc_per_chip_ * num_physical_replica + 1}),
           &id_counts_tensor));
-  int32* id_counts_tensor_ptr = id_counts_tensor->flat<int32>().data();
+  int32_t* id_counts_tensor_ptr = id_counts_tensor->flat<int32_t>().data();
   *id_counts_tensor_ptr = 0;
 
   const int32_t division_size =
@@ -855,8 +856,8 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   //                0001011 -> 0001 01 1
   //      which mean split at level 0 section 0, level 1 section 0 and level
   //      2 section 0. the split points are [128, 256, 512].
-  int64 pre_merge_splits = 0;
-  int64 after_merge_splits = 0;
+  int64_t pre_merge_splits = 0;
+  int64_t after_merge_splits = 0;
   // Vector of uint64_t storing the col ids in the upper 32 bit and the index
   // to the original id array in the lower 32 bit.
   std::vector<std::vector<uint64_t>> col_ids_index_list(
@@ -926,7 +927,7 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
     int32_t previous_row_id = -1;
     uint32_t previous_id_array_index = 0;
     for (uint64_t item : col_ids_index_list[sc_id]) {
-      int32 col_id = item >> 32;
+      int32_t col_id = item >> 32;
       uint32_t id_array_index = item & 0xffffffff;
       int32_t row_id = *(row_ids_ptr + id_array_index);
       // If the row ids and col ids are both same as the previous one,
@@ -1027,9 +1028,9 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
           if (level > 0 && (pre_merge_splits &
                             (1LL << (pre_start_bit_pos + (section >> 1)))) == 0)
             continue;
-          int32 id_count = id_counter[(section + 1) * section_size] -
-                           id_counter[section * section_size];
-          int32 unique_id_count =
+          int32_t id_count = id_counter[(section + 1) * section_size] -
+                             id_counter[section * section_size];
+          int32_t unique_id_count =
               unique_id_counter[(section + 1) * section_size] -
               unique_id_counter[section * section_size];
           // If the number of ids or unique ids exceeds the limit, We need to
@@ -1155,17 +1156,17 @@ void GetMinibatchSplitsWithPhysicalReplicaOp::Compute(OpKernelContext* ctx) {
   Tensor* splits_tensor;
   OP_REQUIRES_OK(
       ctx, ctx->allocate_output("splits", TensorShape({}), &splits_tensor));
-  splits_tensor->flat<int64>()(0) = after_merge_splits;
+  splits_tensor->flat<int64_t>()(0) = after_merge_splits;
 
   Tensor* max_ids_tensor;
   OP_REQUIRES_OK(
       ctx, ctx->allocate_output("max_ids", TensorShape({}), &max_ids_tensor));
-  max_ids_tensor->flat<int32>()(0) = this_max_ids;
+  max_ids_tensor->flat<int32_t>()(0) = this_max_ids;
 
   Tensor* max_uniques_tensor;
   OP_REQUIRES_OK(ctx, ctx->allocate_output("max_uniques", TensorShape({}),
                                            &max_uniques_tensor));
-  max_uniques_tensor->flat<int32>()(0) = this_max_uniques;
+  max_uniques_tensor->flat<int32_t>()(0) = this_max_uniques;
 }
 
 #ifdef LIBTPU_ON_GCE
@@ -1197,12 +1198,12 @@ void StoreMinibatchStatisticsInFdoOp::Compute(OpKernelContext* ctx) {
 
   const Tensor* max_ids_t;
   OP_REQUIRES_OK(ctx, ctx->input("max_ids", &max_ids_t));
-  int64_t max_ids = max_ids_t->scalar<int64>()();
+  int64_t max_ids = max_ids_t->scalar<int64_t>()();
   const Tensor* max_uniques_t;
   OP_REQUIRES_OK(ctx, ctx->input("max_uniques", &max_uniques_t));
-  int64_t max_uniques = max_uniques_t->scalar<int64>()();
+  int64_t max_uniques = max_uniques_t->scalar<int64_t>()();
 
-  int32 per_sc_sample_count = sample_count_ / num_sc_per_chip_;
+  int32_t per_sc_sample_count = sample_count_ / num_sc_per_chip_;
 
   int64_t max_ids_per_partition = -1;
   int64_t max_unique_ids_per_partition = -1;
@@ -1264,10 +1265,10 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(ctx, ValidateInputs(*indices_or_row_splits, *values, *weights,
                                      sample_count_));
 
-  const int32 total_id_count = values->NumElements();
+  const int32_t total_id_count = values->NumElements();
 
-  auto row_ids_before_dedup = std::unique_ptr<int32[]>(
-      new std::remove_extent_t<int32[]>[total_id_count]);
+  auto row_ids_before_dedup = std::unique_ptr<int32_t[]>(
+      new std::remove_extent_t<int32_t[]>[total_id_count]);
 
   OP_REQUIRES_OK(ctx, ComputeRowIdsBeforePadding(*indices_or_row_splits,
                                                  total_id_count, sample_count_,
@@ -1285,14 +1286,14 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) {
   auto combiner_scale_transform_fn =
       GetCombinerScaleTransformFunction(combiner_);
 
-  const int32* row_ids_before_dedup_ptr = row_ids_before_dedup.get();
-  const int32* values_ptr = values->flat<int32>().data();
+  const int32_t* row_ids_before_dedup_ptr = row_ids_before_dedup.get();
+  const int32_t* values_ptr = values->flat<int32_t>().data();
   const float* weights_ptr = weights->flat<float>().data();
 
   // Dedup the ids within one sample by just checking the adjacent ids. This
   // will NOT result in a full deduplication.
-  std::vector<int32> row_ids;
-  std::vector<int32> col_ids;
+  std::vector<int32_t> row_ids;
+  std::vector<int32_t> col_ids;
   std::vector<float> gains;
   row_ids.reserve(total_id_count);
   col_ids.reserve(total_id_count);
@@ -1306,8 +1307,8 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) {
     const float gain = *weights_ptr;
     const float rescaled_gain = combiner_scale_contribution_fn(gain);
     for (int token_id = 0; token_id < total_id_count; ++token_id) {
-      const int32 row_id = *(row_ids_before_dedup_ptr + token_id);
-      const int32 col_id = *(values_ptr + token_id);
+      const int32_t row_id = *(row_ids_before_dedup_ptr + token_id);
+      const int32_t col_id = *(values_ptr + token_id);
       if (gains_rescale.has_value()) {
         // Compute the gain rescale before doing the dedup.
         (*gains_rescale)[row_id] += rescaled_gain;
@@ -1324,8 +1325,8 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) {
     }
   } else {
     for (int token_id = 0; token_id < total_id_count; ++token_id) {
-      const int32 row_id = *(row_ids_before_dedup_ptr + token_id);
-      const int32 col_id = *(values_ptr + token_id);
+      const int32_t row_id = *(row_ids_before_dedup_ptr + token_id);
+      const int32_t col_id = *(values_ptr + token_id);
       const float gain = *(weights_ptr + token_id);
       if (gains_rescale.has_value()) {
         // Compute the gain rescale before doing the dedup.
@@ -1371,8 +1372,8 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) {
         ctx, col_ids_output_list.allocate(
                  i, TensorShape({per_sc_token_count[i]}), &col_ids_tensor));
 
-    int32* row_ids_tensor_ptr = row_ids_tensor->flat<int32>().data();
-    int32* col_ids_tensor_ptr = col_ids_tensor->flat<int32>().data();
+    int32_t* row_ids_tensor_ptr = row_ids_tensor->flat<int32_t>().data();
+    int32_t* col_ids_tensor_ptr = col_ids_tensor->flat<int32_t>().data();
     float* gains_tensor_ptr = gains_tensor->flat<float>().data();
 
     WriteToOutputTensor(
@@ -1384,10 +1385,10 @@ void ConvertToListOfSparseCoreCooTensorsOp::Compute(OpKernelContext* ctx) {
 }
 
 void ConvertToListOfSparseCoreCooTensorsOp::WriteToOutputTensor(
-    int32* row_ids, int32* col_ids, float* gains, int32* row_ids_tensor_ptr,
-    int32* col_ids_tensor_ptr, float* gains_tensor_ptr, int32_t begin_index,
-    int32_t end_index, int32_t sc_id,
-    std::optional<std::vector<float>> gains_rescale) {
+    int32_t* row_ids, int32_t* col_ids, float* gains,
+    int32_t* row_ids_tensor_ptr, int32_t* col_ids_tensor_ptr,
+    float* gains_tensor_ptr, int32_t begin_index, int32_t end_index,
+    int32_t sc_id, std::optional<std::vector<float>> gains_rescale) {
   tsl::profiler::TraceMe traceme(
       "ConvertToListOfSparseCoreCooTensorsOp::WriteToOutputTensor");
   if (gains_rescale.has_value()) {
@@ -1407,12 +1408,13 @@ void ConvertToListOfSparseCoreCooTensorsOp::WriteToOutputTensor(
     }
   } else {
     std::transform(row_ids + begin_index, row_ids + end_index,
-                   row_ids_tensor_ptr, [this, &sc_id](int32 row_id) -> int32 {
+                   row_ids_tensor_ptr,
+                   [this, &sc_id](int32_t row_id) -> int32_t {
                      return row_id % per_sc_sample_count_ + per_sc_row_offset_ +
                             per_sc_stacked_table_sample_count_ * sc_id;
                    });
     std::transform(col_ids + begin_index, col_ids + end_index,
-                   col_ids_tensor_ptr, [this](int32 col_id) -> int32 {
+                   col_ids_tensor_ptr, [this](int32_t col_id) -> int32_t {
                      return ((col_id + col_shift_) & num_sc_shards_bit_mod_) +
                             (col_id & num_sc_shards_bit_mod_inv_) + col_offset_;
                    });
@@ -1804,7 +1806,7 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) {
         }
 
         *(row_pointers_tensor_ptr + row_pointers_index) = global_index;
-        int32 num_ids_to_pad_per_replica =
+        int32_t num_ids_to_pad_per_replica =
             xla::RoundUpTo<int32_t>(global_index, xla_pad_size) - global_index;
 
         std::fill_n(sorted_token_ids_tensor_ptr + global_index,
@@ -1818,8 +1820,8 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) {
         ++row_pointers_index;
       }
       // Pad the row_pointers to be memory aligned.
-      int32 num_row_pointers_to_pad =
-          xla::RoundUpTo<int32>(row_pointers_index, xla_pad_size) -
+      int32_t num_row_pointers_to_pad =
+          xla::RoundUpTo<int32_t>(row_pointers_index, xla_pad_size) -
           row_pointers_index;
       std::fill_n(row_pointers_tensor_ptr + row_pointers_index,
                   num_row_pointers_to_pad, global_index);
@@ -1838,7 +1840,7 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) {
                  << " . This could potentially impact the model quality.";
   }
 
-  int32 row_pointers_unpadded_size =
+  int32_t row_pointers_unpadded_size =
       total_num_minibatch * padded_row_pointers_size_per_sc;
 
   Tensor* num_minibatches_per_sc_tensor;
@@ -1855,10 +1857,10 @@ void ConvertToSparseCoreCsrWrappedCooTensorOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(ctx, ctx->allocate_output("ids_unpadded_size", TensorShape({}),
                                            &ids_unpadded_size_tensor));
 
-  num_minibatches_per_sc_tensor->flat<int32>()(0) = num_minibatch_per_sc;
-  row_pointers_unpadded_size_tensor->flat<int32>()(0) =
+  num_minibatches_per_sc_tensor->flat<int32_t>()(0) = num_minibatch_per_sc;
+  row_pointers_unpadded_size_tensor->flat<int32_t>()(0) =
       row_pointers_unpadded_size;
-  ids_unpadded_size_tensor->flat<int32>()(0) = ids_unpadded_size;
+  ids_unpadded_size_tensor->flat<int32_t>()(0) = ids_unpadded_size;
 }
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
index 05bc79e416de8a..706622ae1dfbe4 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
+++ b/tensorflow/core/tpu/kernels/sparse_core_preprocess_ops.h
@@ -34,15 +34,15 @@ namespace tensorflow {
 // Struct to describe an embedding lookup input data.
 struct EmbeddingLookupInput {
   // Which replica it belongs.
-  int32 replica_id;
+  int32_t replica_id;
   // Token id.
-  int32 token_id;
+  int32_t token_id;
   // Sample id.
-  int32 sample_id;
+  int32_t sample_id;
   // Gain.
   float gain;
 
-  EmbeddingLookupInput(int32 replica_id, int32 token_id, int32 sample_id,
+  EmbeddingLookupInput(int32_t replica_id, int32_t token_id, int32_t sample_id,
                        float gain)
       : replica_id(replica_id),
         token_id(token_id),
@@ -56,9 +56,9 @@ absl::Status ValidateInputs(const Tensor& indices_or_row_splits,
 
 // Compute the row id list before padding.
 absl::Status ComputeRowIdsBeforePadding(const Tensor& indices_or_row_splits,
-                                        int32 total_id_count,
-                                        int32 sample_count,
-                                        int32* row_ids_before_padding,
+                                        int32_t total_id_count,
+                                        int32_t sample_count,
+                                        int32_t* row_ids_before_padding,
                                         std::vector<int> shape_strides = {});
 
 class GetMinibatchesInCsrWithPhysicalReplicaOp : public OpKernel {
@@ -101,7 +101,7 @@ class GetMinibatchSplitsWithPhysicalReplicaOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  protected:
-  virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques,
+  virtual void CalculateHeadroom(int32_t this_max_ids, int32_t this_max_uniques,
                                  tstring program_key,
                                  int64_t max_ids_per_partition,
                                  int64_t max_unique_ids_per_partition,
@@ -138,7 +138,7 @@ class StoreMinibatchStatisticsInFdoOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  protected:
-  virtual void CalculateHeadroom(int32 this_max_ids, int32 this_max_uniques,
+  virtual void CalculateHeadroom(int32_t this_max_ids, int32_t this_max_uniques,
                                  tstring program_key,
                                  int64_t max_ids_per_partition,
                                  int64_t max_unique_ids_per_partition) {}
@@ -165,10 +165,11 @@ class ConvertToListOfSparseCoreCooTensorsOp : public OpKernel {
   void Compute(OpKernelContext* ctx) override;
 
  private:
-  void WriteToOutputTensor(int32* row_ids, int32* col_ids, float* gains,
-                           int32* row_ids_tensor_ptr, int32* col_ids_tensor_ptr,
-                           float* gains_tensor_ptr, int32_t begin_index,
-                           int32_t end_index, int32_t sc_id,
+  void WriteToOutputTensor(int32_t* row_ids, int32_t* col_ids, float* gains,
+                           int32_t* row_ids_tensor_ptr,
+                           int32_t* col_ids_tensor_ptr, float* gains_tensor_ptr,
+                           int32_t begin_index, int32_t end_index,
+                           int32_t sc_id,
                            std::optional<std::vector<float>> gains_rescale);
   int sample_count_;
   int num_sc_per_chip_;
diff --git a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
index 50e86ba0198602..f3576628d048bc 100644
--- a/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
+++ b/tensorflow/core/tpu/kernels/sparse_core_xla_ops.cc
@@ -143,7 +143,7 @@ class XlaSparseDenseMatmulOp : public XlaOpKernel {
   void Compile(XlaOpKernelContext* ctx) override {
     xla::XlaBuilder* builder = ctx->builder();
 
-    const int32 num_physical_replica =
+    const int32_t num_physical_replica =
         stream_executor::tpu::OpsApiFn()->TpuTopology_AvailableCoreCountFn(
             /*mesh_state=*/nullptr,
             /*tpu_core_type=*/TpuCoreTypeEnum::kEmbeddingV2);
@@ -662,7 +662,7 @@ class XlaSparseDenseMatmulGradWithCsrInputBase : public XlaOpKernel {
                 errors::InvalidArgument(
                     "activations input has non static or non-rank 2 shape: ",
                     activation_shape.ToString()));
-    int64 num_samples_per_chip = activation_shape.dimensions(0);
+    int64_t num_samples_per_chip = activation_shape.dimensions(0);
     OP_REQUIRES(ctx, num_samples_per_chip % num_sparsecores_per_device_ == 0,
                 errors::InvalidArgument(
                     "num_samples_per_chip ", num_samples_per_chip,
diff --git a/tensorflow/core/tpu/kernels/topk_ops.cc b/tensorflow/core/tpu/kernels/topk_ops.cc
index 16334632946c25..22d18e39220146 100644
--- a/tensorflow/core/tpu/kernels/topk_ops.cc
+++ b/tensorflow/core/tpu/kernels/topk_ops.cc
@@ -51,21 +51,21 @@ xla::XlaOp CreateKthOrderStatisticComputation(xla::XlaBuilder* builder,
   const int64_t width = input_shape.dim_size(1);
 
   xla::XlaOp input_sm32 = xla::BitcastConvertType(input, xla::S32);
-  xla::XlaOp zero_r0 = xla::ConstantR0<int32>(builder, 0);
+  xla::XlaOp zero_r0 = xla::ConstantR0<int32_t>(builder, 0);
   xla::XlaOp zero_r1 = xla::Broadcast(zero_r0, {height});
   xla::XlaOp zero_r2 = xla::Broadcast(zero_r0, {height, width});
 
-  xla::XlaOp max_r0 = xla::ConstantR0<int32>(builder, 0x7FFFFFFF);
+  xla::XlaOp max_r0 = xla::ConstantR0<int32_t>(builder, 0x7FFFFFFF);
   xla::XlaOp max_r1 = xla::Broadcast(max_r0, {height});
 
   // Start at positive zero, so that pivot is always less than top.
-  xla::XlaOp negative_zero_r0 = xla::ConstantR0<int32>(builder, 0x80000000);
+  xla::XlaOp negative_zero_r0 = xla::ConstantR0<int32_t>(builder, 0x80000000);
   xla::XlaOp negative_zero_r1 = xla::Broadcast(negative_zero_r0, {height});
   xla::XlaOp top_r1 = zero_r1;
 
-  for (uint32 mask = 1U << 31; mask; mask >>= 1) {
+  for (uint32_t mask = 1U << 31; mask; mask >>= 1) {
     xla::XlaOp broadcast_mask_r1 =
-        xla::Broadcast(xla::ConstantR0<int32>(builder, mask), {height});
+        xla::Broadcast(xla::ConstantR0<int32_t>(builder, mask), {height});
 
     // The first iteration of the loop determines if the kth element
     // is positive or negative. If the kth element is negative, we
@@ -111,14 +111,14 @@ class KthOrderStatistic : public XlaOpKernel {
         ctx, input_shape.dims() == 2,
         InvalidArgument("input must be rank-2: ", input_shape.DebugString()));
 
-    xla::XlaOp k = xla::ConstantR0<int32>(builder, k_);
+    xla::XlaOp k = xla::ConstantR0<int32_t>(builder, k_);
     xla::XlaOp kth_order_statistics =
         CreateKthOrderStatisticComputation(builder, input_shape, input, k);
     ctx->SetOutput(0, kth_order_statistics);
   }
 
  private:
-  int32 k_;
+  int32_t k_;
 };
 
 REGISTER_XLA_OP(Name("KthOrderStatistic"), KthOrderStatistic);
@@ -269,21 +269,21 @@ xla::XlaOp CreateMakeUnique(xla::XlaBuilder* builder, const xla::XlaOp input,
   // count_mask is used to mask away the low order bits to ensure
   // that every element is distinct.
   uint32_t next_power_of_two = absl::bit_ceil<uint64_t>(width);
-  uint32 count_mask = ~(next_power_of_two - 1);
+  uint32_t count_mask = ~(next_power_of_two - 1);
   xla::XlaOp count_mask_r0 = xla::ConstantR0(builder, count_mask);
   xla::XlaOp count_mask_r2 = xla::Broadcast(count_mask_r0, {height, width});
 
   // smallest_normal is the bit representation of the smallest
   // positive normal floating point number. The sign is zero,
   // exponent is one, and the fraction is zero.
-  uint32 smallest_normal = 1U << 23;
+  uint32_t smallest_normal = 1U << 23;
   xla::XlaOp smallest_normal_r0 = xla::ConstantR0(builder, smallest_normal);
   xla::XlaOp smallest_normal_r2 =
       xla::Broadcast(smallest_normal_r0, {height, width});
 
   // Used to mask away the sign bit when computing the absolute
   // value.
-  uint32 low_bit_mask = ~(1U << 31);
+  uint32_t low_bit_mask = ~(1U << 31);
   xla::XlaOp low_bit_mask_r0 = xla::ConstantR0(builder, low_bit_mask);
   xla::XlaOp low_bit_mask_r2 = xla::Broadcast(low_bit_mask_r0, {height, width});
 
diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h
index e8666ec63e171a..06fde06bdcac84 100644
--- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h
+++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.h
@@ -37,11 +37,11 @@ class TpuCompilationCacheRpcLookup : public TpuCompilationCacheLookup {
  public:
   using StubType = tpu::grpc::TpuCompilationCacheService::Stub;
 
-  TpuCompilationCacheRpcLookup(const string& server_address,
+  TpuCompilationCacheRpcLookup(const std::string& server_address,
                                int64_t max_cache_size);
   ~TpuCompilationCacheRpcLookup() override = default;
 
-  absl::Status Lookup(const string& proto_key,
+  absl::Status Lookup(const std::string& proto_key,
                       std::unique_ptr<tpu::CompilationCacheEntryRef>* entry,
                       tpu::CompilationCacheFetchTarget fetch_target) override;
 
@@ -49,11 +49,11 @@ class TpuCompilationCacheRpcLookup : public TpuCompilationCacheLookup {
                       std::unique_ptr<tpu::CompilationCacheEntryRef>* entry,
                       tpu::CompilationCacheFetchTarget fetch_target) override;
 
-  string DebugString() const override;
+  std::string DebugString() const override;
 
  private:
   // Helper method to make the RPC request to the central cache.
-  absl::Status RemoteLookupLocked(const string& local_proto_key,
+  absl::Status RemoteLookupLocked(const std::string& local_proto_key,
                                   const tpu::GetTpuProgramRequest& request,
                                   std::shared_ptr<CacheEntry>* cache_entry)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc
index a456a473c1a836..4f7af33e8c1c35 100644
--- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc
+++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.cc
@@ -167,7 +167,7 @@ void TpuCompileOpKernelCommon::Compute(OpKernelContext* ctx) {
   });
 
   absl::Status compile_status = ComputeInternal(ctx);
-  string status_payload;
+  std::string status_payload;
   // Construct payload if compile_status is not ok and there's no payload for
   // compilation yet.
   if (!compile_status.ok() &&
diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h
index 66f7b02e6bc04d..56e2130495750c 100644
--- a/tensorflow/core/tpu/kernels/tpu_compile_op_common.h
+++ b/tensorflow/core/tpu/kernels/tpu_compile_op_common.h
@@ -178,7 +178,7 @@ class TpuCompileOpKernelCommon {
   std::string mlir_module_;
   // Fingerprint of the MLIR Module created once on construction to avoid paying
   // the cost on each invocation.
-  uint64 mlir_module_fingerprint_ = 0;
+  uint64_t mlir_module_fingerprint_ = 0;
 
   // Number of different programs to compile. This maps to number of cores in
   // each replica.
@@ -198,7 +198,7 @@ class TpuCompileOpKernelCommon {
 
   absl::Status RegisterXLAFingerprints(
       const std::vector<TensorShape>& arg_shapes,
-      TpuProgramGroupInterface* tpu_program_group, uint64 fingerprint);
+      TpuProgramGroupInterface* tpu_program_group, uint64_t fingerprint);
 };
 
 }  // namespace tpu
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h b/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h
index 73b0a492b3551c..a6bf93239dc3d4 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_engine_state_interface.h
@@ -61,7 +61,7 @@ class TpuEmbeddingEngineStateInterface : public ResourceBase {
     return new TpuEmbeddingEngineStateInterface(state);
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     return "TpuEmbeddingEngineStateInterface";
   }
 
diff --git a/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h b/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h
index e06c02c99b6cbb..46981718facdb4 100644
--- a/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h
+++ b/tensorflow/core/tpu/kernels/tpu_embedding_enqueue_ops.h
@@ -31,7 +31,8 @@ absl::Status ValidateCombiners(absl::Span<const std::string> combiners);
 // Validates the `mode_override` input of the TPUEnqueue* ops, and, if correct,
 // sets the `mode` to pass on to the TPU Embedding manager.
 absl::Status GetValidatedModeOverride(
-    const string& mode_override, tpu::TPUEmbeddingConfiguration::Mode* mode);
+    const std::string& mode_override,
+    tpu::TPUEmbeddingConfiguration::Mode* mode);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CORE_TPU_KERNELS_TPU_EMBEDDING_ENQUEUE_OPS_H_
diff --git a/tensorflow/core/tpu/kernels/tpu_functional_ops.h b/tensorflow/core/tpu/kernels/tpu_functional_ops.h
index 1d9e5cd57697ec..45c5fb52e1d9c9 100644
--- a/tensorflow/core/tpu/kernels/tpu_functional_ops.h
+++ b/tensorflow/core/tpu/kernels/tpu_functional_ops.h
@@ -84,18 +84,19 @@ GroupedEdges GroupTensorsForOutputPacking(Graph* graph,
                                           GraphShapeInfo* shape_info);
 
 absl::Status CreateConcatAndSplitNodesForInputTensor(
-    Graph* graph, const string& cluster_name, EdgeShapes* tpu_input_shapes,
+    Graph* graph, const std::string& cluster_name, EdgeShapes* tpu_input_shapes,
     const absl::flat_hash_map<std::string, std::vector<const Edge*>>&
         grouped_input_edges,
     int32_t minimum_input_tensors_packing, bool xla_spmd_input_sharded,
     const XlaShardingInfoMap& xla_sharding_info,
     const TpuReplicatedInputInfoMap& tpu_replicated_input_info);
 absl::Status CreateConcatAndSplitNodesForOutputTensor(
-    Graph* graph, const string& cluster_name, EdgeShapes* tpu_output_shapes,
-    GraphShapeInfo* tpu_inferred_info, GroupedEdges shape_to_output,
-    int32_t minimum_output_tensors_packing);
+    Graph* graph, const std::string& cluster_name,
+    EdgeShapes* tpu_output_shapes, GraphShapeInfo* tpu_inferred_info,
+    GroupedEdges shape_to_output, int32_t minimum_output_tensors_packing);
 
-absl::Status InsertReshapeNodePairs(Graph* graph, const string& cluster_name,
+absl::Status InsertReshapeNodePairs(Graph* graph,
+                                    const std::string& cluster_name,
                                     EdgeShapes* tpu_input_shapes,
                                     int num_cores_per_replica);
 
@@ -172,7 +173,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   };
 
   // This method is thread-safe.
-  absl::Status GetTpuCoreOrdinal(OpKernelContext* ctx, uint64 input_hash,
+  absl::Status GetTpuCoreOrdinal(OpKernelContext* ctx, uint64_t input_hash,
                                  int64_t* ordinal_selector_req_id,
                                  int32_t* core_ordinal);
 
@@ -196,11 +197,10 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   // device_ordinal: The index of the TPU core that is scheduled to run
   //   the computation. In the case of XLA SPMD, it is the "primary" core, which
   //   is the smallest index of all the cores.
-  absl::Status InitializeShardedVarOnTPU(OpKernelContext* ctx,
-                                         const core::RefCountPtr<Var>& var,
-                                         std::vector<NodeDef>& ndefs,
-                                         int split_dim,
-                                         const std::vector<string>& tpu_devices)
+  absl::Status InitializeShardedVarOnTPU(
+      OpKernelContext* ctx, const core::RefCountPtr<Var>& var,
+      std::vector<NodeDef>& ndefs, int split_dim,
+      const std::vector<std::string>& tpu_devices)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // Check if any of the immediate successors of node has attribute
@@ -250,7 +250,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   absl::Status PlacementHelper(
       const DeviceSet& device_set,
       const GraphOptimizationPassOptions& optimization_options,
-      const string& function_name);
+      const std::string& function_name);
   // Partitions `graph`, populates `subgraphs` with the partitions, and runs
   // the post-partitioning graph optimization passes.
   absl::Status PartitionHelper(
@@ -263,15 +263,15 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   // If `out_flib_def` is not null, it will be set to a copy of `flib_def_` and
   // used for instantiation.
   absl::Status InstantiatePartition(
-      const Graph& graph, const string& function_name,
-      const string& target_device, FHandle* handle,
+      const Graph& graph, const std::string& function_name,
+      const std::string& target_device, FHandle* handle,
       std::unique_ptr<FunctionLibraryDefinition>* out_flib_def)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
   // Adds and instantiates functions for each subgraph in `subgraphs` after
   // rewriting nodes' `device_ordinal` attributes to match `replica_id` when
   // num_cores_per_replica == 1.
   absl::Status InstantiateFunctionsFromSubgraphs(
-      const DeviceSet& device_set, int replica_id, uint64 cache_hash,
+      const DeviceSet& device_set, int replica_id, uint64_t cache_hash,
       int num_cores_per_replica,
       std::unordered_map<std::string, std::unique_ptr<Graph>> subgraphs)
       ABSL_EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -344,7 +344,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   const std::string local_device_name_;
   // Maps from cache key to their corresponding functions, which are
   // represented as (device, handle) pairs.
-  gtl::FlatMap<uint64, std::vector<DeviceAndFHandle>> partition_cache_
+  gtl::FlatMap<uint64_t, std::vector<DeviceAndFHandle>> partition_cache_
       ABSL_GUARDED_BY(mu_);
 
   // A set contains seen ordinals. Used by variable initialization on TPU.
@@ -362,7 +362,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   FunctionLibraryRuntime* library_runtime_;
 
   // Used to uniquify function names in `flib_def_`.
-  uint32 suffix_ = 0;
+  uint32_t suffix_ = 0;
 
   // Minimum number of run steps (batches) necessary to trigger xla autotuner.
   int autotuner_thresh_ = 0;
@@ -371,7 +371,7 @@ class TPUPartitionedCallOp : public AsyncOpKernel {
   std::shared_ptr<tpu::TPUOrdinalSelector> ordinal_selector_;
 
   // Maps input hash to TF fingerprint.
-  absl::flat_hash_map<uint64, uint64> inputs_to_fingerprint_;
+  absl::flat_hash_map<uint64_t, uint64_t> inputs_to_fingerprint_;
 
   // List of TPU devices
   std::vector<Device*> tpu_devices_;
diff --git a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h
index 6e84dde261bb24..1d50e75bb804b3 100644
--- a/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h
+++ b/tensorflow/core/tpu/kernels/tpu_mesh_state_interface.h
@@ -75,7 +75,7 @@ class TpuMeshStateInterface : public tensorflow::ResourceBase {
                mesh_state_, tpu_core_type);
   }
 
-  string DebugString() const override { return "TpuMeshStateInterface"; }
+  std::string DebugString() const override { return "TpuMeshStateInterface"; }
 
  private:
   XLA_TpuMeshState* mesh_state_;
diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.cc b/tensorflow/core/tpu/kernels/tpu_op_util.cc
index 8d1d4861b6fcca..6da81d1ffefabe 100644
--- a/tensorflow/core/tpu/kernels/tpu_op_util.cc
+++ b/tensorflow/core/tpu/kernels/tpu_op_util.cc
@@ -73,8 +73,8 @@ std::string CreateConfigPrefix(const TPUCompileMetadataProto& metadata) {
 }
 }  // namespace
 
-uint64 CreateFingerprintWithNameAndShapes(
-    uint64 name, const std::vector<tensorflow::TensorShape>& shapes) {
+uint64_t CreateFingerprintWithNameAndShapes(
+    uint64_t name, const std::vector<tensorflow::TensorShape>& shapes) {
   std::string shape_prefix = CreateShapePrefix(shapes);
   VLOG(2) << "CreateFingerprintWithNameAndShapes, name: " << name
           << ", shape_prefix: " << shape_prefix;
@@ -85,7 +85,7 @@ uint64 CreateFingerprintWithNameAndShapes(
 // Return fingerprint_in_metadata if it's not empty; otherwise read input tensor
 // data to compute the fingerprint.
 std::string GuaranteedConstFingerprint(
-    const string& fingerprint_in_metadata,
+    const std::string& fingerprint_in_metadata,
     const OpInputList& guaranteed_constants) {
   if (fingerprint_in_metadata.empty()) {
     uint64_t fingerprint = 0;
@@ -104,8 +104,8 @@ std::string GuaranteedConstFingerprint(
 // The `guaranteed_constants` must be passed as reference due to the lazy
 // evaluation of `guaranteed_const_fingerprint()` callback.
 TpuCompilationCacheKey CreateCompilationCacheKey(
-    absl::string_view function_name, uint64 function_library_fingerprint,
-    uint64 mlir_module_fingerprint, const OpInputList& guaranteed_constants,
+    absl::string_view function_name, uint64_t function_library_fingerprint,
+    uint64_t mlir_module_fingerprint, const OpInputList& guaranteed_constants,
     const std::vector<TensorShape>& dynamic_shapes,
     const TPUCompileMetadataProto& metadata,
     const TpuMeshStateInterface& mesh_state, uint64_t session_id,
@@ -151,7 +151,7 @@ TpuCompilationCacheKey CreateCompilationCacheKey(
     // reference based on the assumption that these variables lifetime is
     // managed through the `TPUCompileOpKernelImpl` that outlives the
     // lifetime of the compilation cache lookups.
-    string fingerprint;
+    std::string fingerprint;
     key.guaranteed_const_fingerprint = [&metadata, &guaranteed_constants,
                                         fingerprint]() mutable {
       if (fingerprint.empty()) {
diff --git a/tensorflow/core/tpu/kernels/tpu_op_util.h b/tensorflow/core/tpu/kernels/tpu_op_util.h
index d0ca805fec4757..df68fdaaff39e5 100644
--- a/tensorflow/core/tpu/kernels/tpu_op_util.h
+++ b/tensorflow/core/tpu/kernels/tpu_op_util.h
@@ -28,13 +28,13 @@ limitations under the License.
 namespace tensorflow {
 namespace tpu {
 // Creates a fingerprint given the name and the vector of shapes.
-uint64 CreateFingerprintWithNameAndShapes(
-    uint64 name, const std::vector<tensorflow::TensorShape>& shapes);
+uint64_t CreateFingerprintWithNameAndShapes(
+    uint64_t name, const std::vector<tensorflow::TensorShape>& shapes);
 
 // Creates a unique compilation cache `key`.
 TpuCompilationCacheKey CreateCompilationCacheKey(
-    absl::string_view function_name, uint64 function_library_fingerprint,
-    uint64 mlir_module_fingerprint, const OpInputList& guaranteed_constants,
+    absl::string_view function_name, uint64_t function_library_fingerprint,
+    uint64_t mlir_module_fingerprint, const OpInputList& guaranteed_constants,
     const std::vector<TensorShape>& dynamic_shapes,
     const TPUCompileMetadataProto& metadata,
     const TpuMeshStateInterface& mesh_state, uint64_t session_id = 0,
diff --git a/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h b/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h
index 9ea689b317f551..3bf1bfac3fe0bb 100644
--- a/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h
+++ b/tensorflow/core/tpu/kernels/tpu_ordinal_selector.h
@@ -39,7 +39,7 @@ class TPUOrdinalSelector : TPUOrdinalSelectorInterface {
     stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_DestroyFn(
         ordinal_selector_);
   }
-  int64_t GetOrdinal(std::optional<uint64> key, int64_t* req_id) override {
+  int64_t GetOrdinal(std::optional<uint64_t> key, int64_t* req_id) override {
     int64_t ordinal;
     stream_executor::tpu::OpsApiFn()->TfTpuOrdinalSelector_GetOrdinalFn(
         ordinal_selector_, key, req_id, &ordinal);
diff --git a/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h b/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h
index 040959d592a1bd..21ce7b393d6195 100644
--- a/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h
+++ b/tensorflow/core/tpu/kernels/tpu_ordinal_selector_interface.h
@@ -26,7 +26,7 @@ namespace tpu {
 class TPUOrdinalSelectorInterface {
  public:
   virtual ~TPUOrdinalSelectorInterface() = default;
-  virtual int64_t GetOrdinal(std::optional<uint64> key, int64_t* req_id) = 0;
+  virtual int64_t GetOrdinal(std::optional<uint64_t> key, int64_t* req_id) = 0;
   virtual void DequeueFromCoreSelector(int32_t device_ordinal,
                                        int64_t req_id) = 0;
 };
diff --git a/tensorflow/core/tpu/kernels/tpu_pod_state.cc b/tensorflow/core/tpu/kernels/tpu_pod_state.cc
index 1457ceac9b790b..73acdd65ef166c 100644
--- a/tensorflow/core/tpu/kernels/tpu_pod_state.cc
+++ b/tensorflow/core/tpu/kernels/tpu_pod_state.cc
@@ -123,7 +123,7 @@ TpuPodState::~TpuPodState() {
   VLOG(1) << "Shutting down Compilation Cache Service done.";
 }
 
-string TpuPodState::DebugString() const {
+std::string TpuPodState::DebugString() const {
   return "Wrapper for distributed TPU state";
 }
 
diff --git a/tensorflow/core/tpu/kernels/tpu_pod_state.h b/tensorflow/core/tpu/kernels/tpu_pod_state.h
index b24a512d341cbe..99e2cff3e1f948 100644
--- a/tensorflow/core/tpu/kernels/tpu_pod_state.h
+++ b/tensorflow/core/tpu/kernels/tpu_pod_state.h
@@ -38,7 +38,7 @@ class TpuPodState : public ResourceBase {
 
   ~TpuPodState() override;
 
-  string DebugString() const override;
+  std::string DebugString() const override;
 
  private:
   std::unique_ptr<TpuCompilationCacheService> cache_service_;
diff --git a/tensorflow/core/tpu/kernels/tpu_util.cc b/tensorflow/core/tpu/kernels/tpu_util.cc
index 14223164d1e1b2..25e57e71da8dbf 100644
--- a/tensorflow/core/tpu/kernels/tpu_util.cc
+++ b/tensorflow/core/tpu/kernels/tpu_util.cc
@@ -48,7 +48,7 @@ absl::StatusOr<TpuCompilationCacheKey> ParseCompilationCacheKey(
   TpuCompilationCacheKey parsed_key(splits.at(0));
   parsed_key.has_guaranteed_const = true;
   parsed_key.session_handle = splits.at(1);
-  const string fingerprint = splits.at(2);
+  const std::string fingerprint = splits.at(2);
   parsed_key.guaranteed_const_fingerprint = [fingerprint] {
     return fingerprint;
   };
diff --git a/tensorflow/core/tpu/kernels/transfer_ops.cc b/tensorflow/core/tpu/kernels/transfer_ops.cc
index 703dc3e7589134..1610d807411cdb 100644
--- a/tensorflow/core/tpu/kernels/transfer_ops.cc
+++ b/tensorflow/core/tpu/kernels/transfer_ops.cc
@@ -51,7 +51,7 @@ limitations under the License.
 namespace tensorflow {
 
 TpuTransferAsyncOpKernelBase::TpuTransferAsyncOpKernelBase(
-    OpKernelConstruction* ctx, const string& transfer_type,
+    OpKernelConstruction* ctx, const std::string& transfer_type,
     int number_of_threads, std::unique_ptr<TpuTransferOpInterface> transfer_op)
     : AsyncOpKernel(ctx),
       transfer_type_(transfer_type),
@@ -113,7 +113,7 @@ absl::Status TpuTransferAsyncOpKernelBase::RunTransferWithOrdinal(
 }
 
 TpuTransferAsyncOpKernel::TpuTransferAsyncOpKernel(
-    OpKernelConstruction* ctx, const string& transfer_type,
+    OpKernelConstruction* ctx, const std::string& transfer_type,
     int number_of_threads, std::unique_ptr<TpuTransferOpInterface> transfer_op)
     : TpuTransferAsyncOpKernelBase(ctx, transfer_type, number_of_threads,
                                    std::move(transfer_op)) {
@@ -132,7 +132,7 @@ absl::Status TpuTransferAsyncOpKernel::RunTransfer(OpKernelContext* ctx) {
 }
 
 TpuTransferAsyncDynamicOrdinalOpKernel::TpuTransferAsyncDynamicOrdinalOpKernel(
-    OpKernelConstruction* ctx, const string& transfer_type,
+    OpKernelConstruction* ctx, const std::string& transfer_type,
     int number_of_threads, std::unique_ptr<TpuTransferOpInterface> transfer_op)
     : TpuTransferAsyncOpKernelBase(ctx, transfer_type, number_of_threads,
                                    std::move(transfer_op)) {}
@@ -140,7 +140,7 @@ TpuTransferAsyncDynamicOrdinalOpKernel::TpuTransferAsyncDynamicOrdinalOpKernel(
 absl::Status TpuTransferAsyncDynamicOrdinalOpKernel::RunTransfer(
     OpKernelContext* ctx) {
   const Tensor& device_ordinal_tensor = ctx->input(0);
-  const int device_ordinal = device_ordinal_tensor.scalar<int32>()();
+  const int device_ordinal = device_ordinal_tensor.scalar<int32_t>()();
   XlaDevice* xla_device =
       dynamic_cast<XlaDevice*>(ctx->device()->UnderlyingDevice());
   if (((xla_device == nullptr) || (xla_device->device_type() == DEVICE_CPU)) &&

From 6a5e39020dd1d6c764c2a7eff18743f7fe21d119 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:01:20 -0800
Subject: [PATCH 486/753] Automated Code Change

PiperOrigin-RevId: 846026317
---
 .../c/tf_rendezvous_c_api_test.cc                  | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc b/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc
index 28f029350da1a2..9570b9407e1574 100644
--- a/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc
+++ b/tensorflow/core/common_runtime/next_pluggable_device/c/tf_rendezvous_c_api_test.cc
@@ -58,7 +58,7 @@ Tensor CreateTestTensor() {
   Tensor t(DT_INT8, TensorShape({10, 20}));
   for (int64_t a = 0; a < t.shape().dim_size(0); a++) {
     for (int64_t b = 0; b < t.shape().dim_size(1); b++) {
-      t.matrix<int8>()(a, b) = static_cast<int8>((a + 1) * (b + 1));
+      t.matrix<int8_t>()(a, b) = static_cast<int8_t>((a + 1) * (b + 1));
     }
   }
   return t;
@@ -68,7 +68,8 @@ class FakeAllocator : public Allocator {
  public:
   std::string Name() override { return "fake"; }
   void* AllocateRaw(size_t alignment, size_t num_bytes) override {
-    return port::AlignedMalloc(num_bytes, alignment);
+    return tsl::port::AlignedMalloc(num_bytes,
+                                    static_cast<std::align_val_t>(alignment));
   }
   void DeallocateRaw(void* ptr) override { return port::AlignedFree(ptr); }
 };
@@ -112,8 +113,9 @@ class FakeDeviceManager : public DeviceMgr {
   bool ContainsDevice(int64_t device_incarnation) const override {
     return false;
   }
-  void ClearContainers(absl::Span<const string> containers) const override {}
-  int NumDeviceType(const string& type) const override { return 0; }
+  void ClearContainers(
+      absl::Span<const std::string> containers) const override {}
+  int NumDeviceType(const std::string& type) const override { return 0; }
   int NumDevices() const override { return 0; }
   Device* HostCPU() const override { return nullptr; }
 
@@ -127,7 +129,7 @@ class TestDeviceContext : public DeviceContext {
                              Tensor* device_tensor, StatusCallback done,
                              bool sync_dst_compute) const override {
     Tensor test_tensor = CreateTestTensor();
-    test::ExpectTensorEqual<int8>(test_tensor, *cpu_tensor);
+    test::ExpectTensorEqual<int8_t>(test_tensor, *cpu_tensor);
     done(absl::OkStatus());
   }
 
@@ -191,7 +193,7 @@ TEST(RendezvousCAPI, DeviceToHost) {
                         });
   callback_done.WaitForNotification();
   Tensor test_tensor = CreateTestTensor();
-  test::ExpectTensorEqual<int8>(test_tensor, result);
+  test::ExpectTensorEqual<int8_t>(test_tensor, result);
 
   Destroy(thunk);
   delete thunk;

From ea54dd41da6871a3607aa3379a6e7fd6626462af Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:08:09 -0800
Subject: [PATCH 487/753] Automated Code Change

PiperOrigin-RevId: 846029274
---
 .../compiler/tf2xla/kernels/light_outside_compilation.cc       | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc
index 390bc09c33057d..a5322903a9fbc7 100644
--- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc
+++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc
@@ -548,7 +548,8 @@ absl::Status PopulateMetadataBufferIfNeeded(OpKernelContext& ctx,
               callback_data.outputs(i).buffer_description().shape()));
       void* location = static_cast<char*>(allocated->data()) +
                        xla::ShapeUtil::ByteSizeOf(xla_shape);
-      se::DeviceMemoryBase m{location, num_dimensions * sizeof(int32_t)};
+      stream_executor::DeviceAddressBase m{location,
+                                           num_dimensions * sizeof(int32_t)};
       TF_RETURN_IF_ERROR(stream->Memcpy(&m, shape_info.data(),
                                         num_dimensions * sizeof(int32_t)));
     }

From f14f3b94e7210a98e307d0d85088efcf45e91f2a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:09:15 -0800
Subject: [PATCH 488/753] Automated Code Change

PiperOrigin-RevId: 846029628
---
 tensorflow/core/kernels/cwise_ops_common.cc | 4 ++--
 tensorflow/core/kernels/cwise_ops_test.cc   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_ops_common.cc b/tensorflow/core/kernels/cwise_ops_common.cc
index 733fa8af2f9cae..a202e3717938e9 100644
--- a/tensorflow/core/kernels/cwise_ops_common.cc
+++ b/tensorflow/core/kernels/cwise_ops_common.cc
@@ -36,7 +36,7 @@ void BinaryOpShared::SetComputeError(OpKernelContext* ctx) {
   // associated information.  This is sufficient for now, since the only binary
   // ops that have compute errors are integer division and mod, and the only
   // error they produce is zero division.
-  const string& op = ctx->op_kernel().type_string();
+  const std::string& op = ctx->op_kernel().type_string();
   if ((op == "Div" || op == "Mod" || op == "FloorMod" || op == "FloorDiv") &&
       DataTypeIsInteger(ctx->op_kernel().input_type(0))) {
     ctx->CtxFailure(errors::InvalidArgument("Integer division by zero"));
@@ -62,7 +62,7 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
         TryGetNodeAttr(ctx->op_kernel().def(), "incompatible_shape_error",
                        &(incompatible_shape_error));
     if (has_attr && !incompatible_shape_error) {
-      const string& op = ctx->op_kernel().type_string();
+      const std::string& op = ctx->op_kernel().type_string();
       OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
       result = (op == "NotEqual");
       return;
diff --git a/tensorflow/core/kernels/cwise_ops_test.cc b/tensorflow/core/kernels/cwise_ops_test.cc
index 41b851fda99b4b..5c6663b666aea3 100644
--- a/tensorflow/core/kernels/cwise_ops_test.cc
+++ b/tensorflow/core/kernels/cwise_ops_test.cc
@@ -27,7 +27,7 @@ namespace {
 // Creates a Graph which applies a unary "func" on a 3D tensor of
 // type T with "num" elements.
 template <typename T>
-static Graph* Unary(const string& func, int num, DataType dtype) {
+static Graph* Unary(const std::string& func, int num, DataType dtype) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)}));
   CHECK_GT(data.NumElements(), 0);
@@ -97,7 +97,7 @@ BM_UNARY(gpu, Round, float, DT_FLOAT);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 // data func scalar.
-Graph* BinaryScalar(int num, const string& func) {
+Graph* BinaryScalar(int num, const std::string& func) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
   lhs.flat<float>().setRandom();

From 880c73f0a97671f21b0ca8f47f03edf14a84b17c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:30:13 -0800
Subject: [PATCH 489/753] Automated Code Change

PiperOrigin-RevId: 846037511
---
 tensorflow/core/lib/core/arena.h        |  4 +-
 tensorflow/core/lib/core/coding_test.cc | 90 ++++++++++++-------------
 2 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/tensorflow/core/lib/core/arena.h b/tensorflow/core/lib/core/arena.h
index 14d80422496bd2..d5f6c765e51dd0 100644
--- a/tensorflow/core/lib/core/arena.h
+++ b/tensorflow/core/lib/core/arena.h
@@ -62,7 +62,7 @@ class Arena {
 
  protected:
   bool SatisfyAlignment(const size_t alignment);
-  void MakeNewBlock(const uint32 alignment);
+  void MakeNewBlock(const uint32_t alignment);
   void* GetMemoryFallback(const size_t size, const int align);
   void* GetMemory(const size_t size, const int align) {
     assert(remaining_ <= block_size_);                  // an invariant
@@ -88,7 +88,7 @@ class Arena {
   // The returned AllocatedBlock* is valid until the next call to AllocNewBlock
   // or Reset (i.e. anything that might affect overflow_blocks_).
   AllocatedBlock* AllocNewBlock(const size_t block_size,
-                                const uint32 alignment);
+                                const uint32_t alignment);
 
   const size_t block_size_;
   char* freestart_;  // beginning of the free space in most recent block
diff --git a/tensorflow/core/lib/core/coding_test.cc b/tensorflow/core/lib/core/coding_test.cc
index 9efe3d8ec10f2c..4769cddaca0906 100644
--- a/tensorflow/core/lib/core/coding_test.cc
+++ b/tensorflow/core/lib/core/coding_test.cc
@@ -22,46 +22,46 @@ namespace tensorflow {
 namespace core {
 
 TEST(Coding, Fixed16) {
-  static const uint16 N = 50000;
+  static const uint16_t N = 50000;
 
-  string s;
-  for (uint16 v = 0; v < N; v++) {
-    char buf[sizeof(uint16)];
+  std::string s;
+  for (uint16_t v = 0; v < N; v++) {
+    char buf[sizeof(uint16_t)];
     EncodeFixed16(buf, v);
     s.append(buf, sizeof(buf));
   }
 
   const char* p = s.data();
-  for (uint16 v = 0; v < N; v++) {
-    uint16 actual = DecodeFixed16(p);
+  for (uint16_t v = 0; v < N; v++) {
+    uint16_t actual = DecodeFixed16(p);
     ASSERT_EQ(v, actual);
-    p += sizeof(uint16);
+    p += sizeof(uint16_t);
   }
 }
 
 TEST(Coding, Fixed32) {
-  static const uint32 N = 100000;
+  static const uint32_t N = 100000;
 
-  string s;
-  for (uint32 v = 0; v < N; v++) {
-    char buf[sizeof(uint32)];
+  std::string s;
+  for (uint32_t v = 0; v < N; v++) {
+    char buf[sizeof(uint32_t)];
     EncodeFixed32(buf, v);
     s.append(buf, sizeof(buf));
   }
 
   const char* p = s.data();
-  for (uint32 v = 0; v < N; v++) {
-    uint32 actual = DecodeFixed32(p);
+  for (uint32_t v = 0; v < N; v++) {
+    uint32_t actual = DecodeFixed32(p);
     ASSERT_EQ(v, actual);
-    p += sizeof(uint32);
+    p += sizeof(uint32_t);
   }
 }
 
 TEST(Coding, Fixed64) {
-  string s;
+  std::string s;
   for (int power = 0; power <= 63; power++) {
-    uint64 v = static_cast<uint64>(1) << power;
-    char buf[sizeof(uint64)];
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    char buf[sizeof(uint64_t)];
     EncodeFixed64(buf, v - 1);
     s.append(buf, sizeof(buf));
     EncodeFixed64(buf, v + 0);
@@ -72,19 +72,19 @@ TEST(Coding, Fixed64) {
 
   const char* p = s.data();
   for (int power = 0; power <= 63; power++) {
-    uint64 v = static_cast<uint64>(1) << power;
-    uint64 actual;
+    uint64_t v = static_cast<uint64_t>(1) << power;
+    uint64_t actual;
     actual = DecodeFixed64(p);
     ASSERT_EQ(v - 1, actual);
-    p += sizeof(uint64);
+    p += sizeof(uint64_t);
 
     actual = DecodeFixed64(p);
     ASSERT_EQ(v + 0, actual);
-    p += sizeof(uint64);
+    p += sizeof(uint64_t);
 
     actual = DecodeFixed64(p);
     ASSERT_EQ(v + 1, actual);
-    p += sizeof(uint64);
+    p += sizeof(uint64_t);
   }
 }
 
@@ -113,17 +113,17 @@ TEST(Coding, EncodingOutput) {
 }
 
 TEST(Coding, Varint32) {
-  string s;
-  for (uint32 i = 0; i < (32 * 32); i++) {
-    uint32 v = (i / 32) << (i % 32);
+  std::string s;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t v = (i / 32) << (i % 32);
     PutVarint32(&s, v);
   }
 
   const char* p = s.data();
   const char* limit = p + s.size();
-  for (uint32 i = 0; i < (32 * 32); i++) {
-    uint32 expected = (i / 32) << (i % 32);
-    uint32 actual;
+  for (uint32_t i = 0; i < (32 * 32); i++) {
+    uint32_t expected = (i / 32) << (i % 32);
+    uint32_t actual;
     p = GetVarint32Ptr(p, limit, &actual);
     ASSERT_TRUE(p != nullptr);
     ASSERT_EQ(expected, actual);
@@ -133,21 +133,21 @@ TEST(Coding, Varint32) {
 
 TEST(Coding, Varint64) {
   // Construct the list of values to check
-  std::vector<uint64> values;
+  std::vector<uint64_t> values;
   // Some special values
   values.push_back(0);
   values.push_back(100);
-  values.push_back(~static_cast<uint64>(0));
-  values.push_back(~static_cast<uint64>(0) - 1);
-  for (uint32 k = 0; k < 64; k++) {
+  values.push_back(~static_cast<uint64_t>(0));
+  values.push_back(~static_cast<uint64_t>(0) - 1);
+  for (uint32_t k = 0; k < 64; k++) {
     // Test values near powers of two
-    const uint64 power = 1ull << k;
+    const uint64_t power = 1ull << k;
     values.push_back(power);
     values.push_back(power - 1);
     values.push_back(power + 1);
   }
 
-  string s;
+  std::string s;
   for (size_t i = 0; i < values.size(); i++) {
     PutVarint64(&s, values[i]);
   }
@@ -156,7 +156,7 @@ TEST(Coding, Varint64) {
   const char* limit = p + s.size();
   for (size_t i = 0; i < values.size(); i++) {
     ASSERT_TRUE(p < limit);
-    uint64 actual;
+    uint64_t actual;
     p = GetVarint64Ptr(p, limit, &actual);
     ASSERT_TRUE(p != nullptr);
     ASSERT_EQ(values[i], actual);
@@ -165,17 +165,17 @@ TEST(Coding, Varint64) {
 }
 
 TEST(Coding, Varint32Overflow) {
-  uint32 result;
-  string input("\x81\x82\x83\x84\x85\x11");
+  uint32_t result;
+  std::string input("\x81\x82\x83\x84\x85\x11");
   ASSERT_TRUE(GetVarint32Ptr(input.data(), input.data() + input.size(),
                              &result) == nullptr);
 }
 
 TEST(Coding, Varint32Truncation) {
-  uint32 large_value = (1u << 31) + 100;
-  string s;
+  uint32_t large_value = (1u << 31) + 100;
+  std::string s;
   PutVarint32(&s, large_value);
-  uint32 result;
+  uint32_t result;
   for (size_t len = 0; len < s.size() - 1; len++) {
     ASSERT_TRUE(GetVarint32Ptr(s.data(), s.data() + len, &result) == nullptr);
   }
@@ -185,17 +185,17 @@ TEST(Coding, Varint32Truncation) {
 }
 
 TEST(Coding, Varint64Overflow) {
-  uint64 result;
-  string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
+  uint64_t result;
+  std::string input("\x81\x82\x83\x84\x85\x81\x82\x83\x84\x85\x11");
   ASSERT_TRUE(GetVarint64Ptr(input.data(), input.data() + input.size(),
                              &result) == nullptr);
 }
 
 TEST(Coding, Varint64Truncation) {
-  uint64 large_value = (1ull << 63) + 100ull;
-  string s;
+  uint64_t large_value = (1ull << 63) + 100ull;
+  std::string s;
   PutVarint64(&s, large_value);
-  uint64 result;
+  uint64_t result;
   for (size_t len = 0; len < s.size() - 1; len++) {
     ASSERT_TRUE(GetVarint64Ptr(s.data(), s.data() + len, &result) == nullptr);
   }

From 8cfef0f4ff3117744b42312efa33042ac7f4e534 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:31:21 -0800
Subject: [PATCH 490/753] Automated Code Change

PiperOrigin-RevId: 846037931
---
 tensorflow/lite/tools/optimize/quantization_utils_test.cc | 7 +++----
 tensorflow/lite/tools/optimize/quantize_model_test.cc     | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tensorflow/lite/tools/optimize/quantization_utils_test.cc b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
index 33f62f0c850363..e1494788dc45b4 100644
--- a/tensorflow/lite/tools/optimize/quantization_utils_test.cc
+++ b/tensorflow/lite/tools/optimize/quantization_utils_test.cc
@@ -36,7 +36,7 @@ limitations under the License.
 #include "tensorflow/lite/testing/util.h"
 
 namespace {
-tensorflow::string* g_test_model_dir = nullptr;
+std::string* g_test_model_dir = nullptr;
 }  // namespace
 
 namespace tflite {
@@ -901,7 +901,7 @@ TEST_F(QuantizationUtilsTest, ExtendToPowerOfTwo) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  tensorflow::string model_file;
+  std::string model_file;
   const std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("test_model_file", &model_file,
                        "Path to test tflite model file."),
@@ -912,8 +912,7 @@ int main(int argc, char** argv) {
     std::cerr << "Required test_model_file\n";
     std::abort();
   }
-  g_test_model_dir =
-      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file));
   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc
index 8a0013b09e6851..319da9523aea7e 100644
--- a/tensorflow/lite/tools/optimize/quantize_model_test.cc
+++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc
@@ -39,7 +39,7 @@ limitations under the License.
 // Note: More rigorous model tests can be found in subgraph_quantizer_test.cc
 
 namespace {
-tensorflow::string* g_test_model_dir = nullptr;
+std::string* g_test_model_dir = nullptr;
 }  // namespace
 
 namespace tflite {
@@ -2309,7 +2309,7 @@ TEST_P(BiasInputTest, QuantizationSucceeds) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  tensorflow::string model_file;
+  std::string model_file;
   const std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("test_model_file", &model_file,
                        "Path to test tflite model file."),
@@ -2320,8 +2320,7 @@ int main(int argc, char** argv) {
     std::cerr << "Required test_model_file\n";
     std::abort();
   }
-  g_test_model_dir =
-      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file));
   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }

From be173da8ef3c07f751308d096498fb6420f7d129 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:39:56 -0800
Subject: [PATCH 491/753] Automated Code Change

PiperOrigin-RevId: 846040757
---
 third_party/xla/xla/stream_executor/host/host_executor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/host/host_executor.cc b/third_party/xla/xla/stream_executor/host/host_executor.cc
index 151d7b51306d24..b854988eafdaf1 100644
--- a/third_party/xla/xla/stream_executor/host/host_executor.cc
+++ b/third_party/xla/xla/stream_executor/host/host_executor.cc
@@ -85,7 +85,7 @@ DeviceAddressBase HostExecutor::Allocate(uint64_t size, int64_t memory_space) {
   // This should probably be kept in sync with
   // tsl::Allocator::kAllocatorAlignment.
   return DeviceAddressBase(
-      tsl::port::AlignedMalloc(size, /*minimum_alignment=*/64), size);
+      tsl::port::AlignedMalloc(size, static_cast<std::align_val_t>(64)), size);
 }
 
 void HostExecutor::Deallocate(DeviceAddressBase* mem) {

From 702b294b4111100405c98812ac01598402f89829 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 20:40:29 -0800
Subject: [PATCH 492/753] Removal of tsl-specific integral types.

PiperOrigin-RevId: 846041013
---
 .../xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc   | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
index 88fc6bc4f2da29..1c21e3533f9986 100644
--- a/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
+++ b/third_party/xla/xla/hlo/translate/mhlo_to_hlo/mlir_hlo_to_hlo.cc
@@ -121,13 +121,6 @@ limitations under the License.
 #define DEBUG_TYPE "xla-translate"
 
 using ::int64_t;
-using ::tsl::int16;
-using ::tsl::int32;
-using ::tsl::int8;
-using ::tsl::uint16;
-using ::tsl::uint32;
-using ::tsl::uint64;
-using ::tsl::uint8;
 
 // All Module level and Function level attributes must be included in:
 //   xla/mlir_hlo/utils/unregistered_attributes.h

From 066852ef9177198bd9459a225ae8c7d2cbdb4d51 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 21:09:14 -0800
Subject: [PATCH 493/753] Automated Code Change

PiperOrigin-RevId: 846051105
---
 tensorflow/dtensor/mlir/collectives.h                       | 2 +-
 tensorflow/dtensor/mlir/collectives_common.cc               | 6 +++---
 tensorflow/dtensor/mlir/collectives_common.h                | 2 +-
 .../dtensor/mlir/dtensor_allreduce_combine_optimization.cc  | 2 +-
 .../dtensor/mlir/dtensor_allreduce_scatter_optimization.cc  | 6 +++---
 tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc   | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/dtensor/mlir/collectives.h b/tensorflow/dtensor/mlir/collectives.h
index fc0f8f0203d68a..101e944b84d813 100644
--- a/tensorflow/dtensor/mlir/collectives.h
+++ b/tensorflow/dtensor/mlir/collectives.h
@@ -84,7 +84,7 @@ StatusOr<mlir::Operation*> EmitAllReduce(
 StatusOr<mlir::Operation*> EmitBarrierWithConstValue(mlir::OpBuilder& builder,
                                                      mlir::Location loc,
                                                      const Mesh& mesh,
-                                                     int32 value);
+                                                     int32_t value);
 
 // Given input `tensor` that is sharded across spatial dimensions, conduct
 // halo exchange such that each spatially sharded input blocks exchange
diff --git a/tensorflow/dtensor/mlir/collectives_common.cc b/tensorflow/dtensor/mlir/collectives_common.cc
index fcda6c26d51988..37bdd53366af82 100644
--- a/tensorflow/dtensor/mlir/collectives_common.cc
+++ b/tensorflow/dtensor/mlir/collectives_common.cc
@@ -38,7 +38,7 @@ namespace dtensor {
 // a multi-host cluster will generate the same grouping, and therefore the same
 // XLA program fingerprint, independently. std::map guarantees the same
 // iteration order.
-using AllReducePartitions = std::map<DeviceLocation, std::vector<int32>>;
+using AllReducePartitions = std::map<DeviceLocation, std::vector<int32_t>>;
 
 // Computes AllReduce partitions using reduced mesh dimension names.
 //
@@ -60,11 +60,11 @@ StatusOr<AllReducePartitions> GetAllReducePartitionsFromReducedDims(
     const dtensor::Layout& output_layout,
     const absl::flat_hash_set<std::string>& reduced_dims) {
   AllReducePartitions partitions;
-  for (int64 device = 0; device < output_layout.num_devices(); ++device) {
+  for (int64_t device = 0; device < output_layout.num_devices(); ++device) {
     TF_ASSIGN_OR_RETURN(const DeviceLocation device_loc,
                         output_layout.mesh().device_location(device));
     DeviceLocation kept_dims;
-    for (int64 dim_idx = 0; dim_idx < device_loc.size(); ++dim_idx) {
+    for (int64_t dim_idx = 0; dim_idx < device_loc.size(); ++dim_idx) {
       if (!reduced_dims.contains(output_layout.mesh().dim_name(dim_idx))) {
         kept_dims.push_back(device_loc[dim_idx]);
       }
diff --git a/tensorflow/dtensor/mlir/collectives_common.h b/tensorflow/dtensor/mlir/collectives_common.h
index 6041eb4501de3f..fe8688ebc673af 100644
--- a/tensorflow/dtensor/mlir/collectives_common.h
+++ b/tensorflow/dtensor/mlir/collectives_common.h
@@ -29,7 +29,7 @@ namespace tensorflow {
 namespace dtensor {
 
 // Computes AllReduce partitions using reduced mesh dimension names.
-StatusOr<std::map<DeviceLocation, std::vector<int32>>>
+StatusOr<std::map<DeviceLocation, std::vector<int32_t>>>
 GetAllReducePartitionsFromReducedDims(
     const dtensor::Layout& output_layout,
     const absl::flat_hash_set<std::string>& reduced_dims);
diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc
index 9261255c304033..e4cea2348f3d09 100644
--- a/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc
+++ b/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc
@@ -72,7 +72,7 @@ namespace ops_util = ::mlir::TF::collection_ops_util;
 
 // Pad the merged tensor shape to multiples of 1024B, so delinearization
 // skipping optimization in XLA can get activated.
-constexpr int32 kAllReducePadding = 1024;
+constexpr int32_t kAllReducePadding = 1024;
 
 // Returns true if `successor` depends on `predecessor`.
 // TODO(jiawenhao): Repeatedly computing dependency sets for a large cluster can
diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc
index 682af5ae92b021..b16eeb8230f860 100644
--- a/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc
+++ b/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc
@@ -64,16 +64,16 @@ mlir::DenseIntElementsAttr GetScatterGroupAssignment(
   auto partitions =
       GetAllReducePartitionsFromReducedDims(original_layout, scattered_dims)
           .value();
-  const int32 num_partitions = partitions.size();
+  const int32_t num_partitions = partitions.size();
 
   // Construct a flattened list of scatter partitions.
-  std::vector<int32> partitions_flat;
+  std::vector<int32_t> partitions_flat;
   for (auto& p : partitions) {
     partitions_flat.insert(partitions_flat.end(), p.second.begin(),
                            p.second.end());
   }
 
-  int32 partition_size = partitions.begin()->second.size();
+  int32_t partition_size = partitions.begin()->second.size();
   mlir::OpBuilder builder(all_scatter);
   auto group_shaped_type = mlir::RankedTensorType::get(
       {num_partitions, partition_size},
diff --git a/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc b/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc
index f563dceb065671..b722e1bba45e0d 100644
--- a/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc
+++ b/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc
@@ -44,7 +44,7 @@ namespace {
 // the list of devices that are a part of the same reduction group.
 template <class ReduceOpType>
 mlir::LogicalResult GetAllReduceGroupSize(ReduceOpType reduce_op,
-                                          int32* group_size) {
+                                          int32_t* group_size) {
   mlir::DenseIntElementsAttr group_assignment_attr;
   if (!matchPattern(reduce_op.getGroupAssignment(),
                     m_Constant(&group_assignment_attr)))
@@ -80,7 +80,7 @@ mlir::LogicalResult MaybeUpcastForReduction(ReduceOpType reduce_op,
   mlir::OpBuilder builder(reduce_op);
   const mlir::Location loc = reduce_op.getLoc();
 
-  int32 group_size;
+  int32_t group_size;
   if (mlir::failed(GetAllReduceGroupSize(reduce_op, &group_size)))
     return mlir::failure();
   if (group_size <= ReduceInBfloat16MaxGroupSize())

From 1bbc0c679a38b4a0c708a0266e35925c71da8ef8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 21:16:01 -0800
Subject: [PATCH 494/753] Automated Code Change

PiperOrigin-RevId: 846053555
---
 ...embedding_optimization_parameters_utils.cc |  2 +-
 tensorflow/core/tpu/tpu_execute.cc            | 21 ++++++++++---------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
index 587d6341527a20..95044439f5b894 100644
--- a/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
+++ b/tensorflow/core/tpu/tpu_embedding_optimization_parameters_utils.cc
@@ -256,7 +256,7 @@ absl::Status UseGradientAccumulation(const OptimizationParameters& params,
     }
     case GradientAccumulationSupport::kNotSupported: {
       if (raw_gradient_accumulation_status) {
-        return errors::InvalidArgument(strings::Printf(
+        return errors::InvalidArgument(absl::StrFormat(
             "Optimization algorithm %s does not support gradient accumulation "
             "but parameters specify it.",
             GetOptimizationAlgorithmName(params.parameters_case()).c_str()));
diff --git a/tensorflow/core/tpu/tpu_execute.cc b/tensorflow/core/tpu/tpu_execute.cc
index a8edf650bc1718..251cde239bcf6c 100644
--- a/tensorflow/core/tpu/tpu_execute.cc
+++ b/tensorflow/core/tpu/tpu_execute.cc
@@ -115,16 +115,16 @@ absl::Status FixTupleTableAsync(se::Stream* stream,
         if (!element_shape.IsTuple()) {
           return absl::OkStatus();
         }
-        std::vector<se::DeviceMemoryBase> elements;
+        std::vector<stream_executor::DeviceAddressBase> elements;
         xla::ShapeIndex element_index = index;
         element_index.push_back(0);
         for (int i = 0; i < element_shape.tuple_shapes().size(); ++i) {
           // Gather all children of the tuple element.
           element_index.back() = i;
-          elements.push_back(mem->Buffer(element_index).AsDeviceMemoryBase());
+          elements.push_back(mem->Buffer(element_index).AsDeviceAddress());
         }
-        se::DeviceMemoryBase tuple_table_addr =
-            mem->Buffer(index).AsDeviceMemoryBase();
+        stream_executor::DeviceAddressBase tuple_table_addr =
+            mem->Buffer(index).AsDeviceAddress();
         return transfer_manager->WriteSingleTupleIndexTable(
             stream, elements, element_shape, &tuple_table_addr);
       });
@@ -160,7 +160,7 @@ bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape,
 // Metadata contains the sizes of shape without padding, eventually
 // representing the size of valid data.
 absl::Status UpdateDynamicInputs(
-    se::Stream* stream, se::DeviceMemoryAllocator* allocator,
+    se::Stream* stream, stream_executor::DeviceAddressAllocator* allocator,
     std::vector<xla::ExecutionInput>* runtime_inputs,
     const std::vector<xla::Shape>& compile_time_shapes) {
   TF_RET_CHECK(runtime_inputs->size() == compile_time_shapes.size());
@@ -193,14 +193,15 @@ absl::Status UpdateDynamicInputs(
           TF_RET_CHECK(
               DynamicShapeIsCompatible(runtime_shape, compile_time_shape));
 
-          xla::MaybeOwningDeviceMemory* mutable_input_mem =
+          xla::MaybeOwningDeviceAddress* mutable_input_mem =
               runtime_input.MutableBuffer(index);
           auto padded_data = std::make_shared<std::vector<int8_t>>(
               ShapeSizeCompact(compile_time_shape), -1);
           auto raw_input_runtime = std::make_shared<std::vector<uint32_t>>(
               ShapeSizeCompact(runtime_shape) / sizeof(uint32_t));
           TF_RETURN_IF_ERROR(stream->MemcpyD2H(
-              se::DeviceMemory<int8_t>(mutable_input_mem->AsDeviceMemoryBase()),
+              stream_executor::DeviceAddress<int8_t>(
+                  mutable_input_mem->AsDeviceAddress()),
               absl::MakeSpan(absl::bit_cast<int8_t*>(raw_input_runtime->data()),
                              ShapeSizeCompactRaw(runtime_shape))));
           TF_RETURN_IF_ERROR(stream->DoHostCallbackWithStatus(
@@ -239,7 +240,7 @@ absl::Status UpdateDynamicInputs(
               allocator->Allocate(stream->parent()->device_ordinal(),
                                   ShapeSizeCompact(compile_time_shape)));
           auto typed_new_input_memory =
-              se::DeviceMemory<int8_t>(new_input.cref());
+              stream_executor::DeviceAddress<int8_t>(new_input.cref());
           TF_RETURN_IF_ERROR(
               stream->MemcpyH2D<int8_t>(*padded_data, &typed_new_input_memory));
 
@@ -249,7 +250,7 @@ absl::Status UpdateDynamicInputs(
           // Modify the memory location in the input shape tree to point to the
           // new input.
           *mutable_input_mem =
-              xla::MaybeOwningDeviceMemory(std::move(new_input));
+              xla::MaybeOwningDeviceAddress(std::move(new_input));
           element_modified = true;
           return absl::OkStatus();
         }));
@@ -499,7 +500,7 @@ absl::StatusOr<xla::ExecutionOutput> TPUExecute(
   VLOG(1) << "TPUExecute: Adding " << device_memory_addrs_count
           << " TPUEmbedding memory addresses to HLO parameters.";
   for (int i = 0; i < device_memory_addrs_count; ++i) {
-    xla::ShapeTree<xla::MaybeOwningDeviceMemory> tree(
+    xla::ShapeTree<xla::MaybeOwningDeviceAddress> tree(
         xla::ShapeUtil::MakeOpaqueShape());
     const SE_DeviceAddressBase& addr = device_memory_addrs[i];
     VLOG(2) << absl::StrFormat("Device memory addr[%i] = {%p, %llu, %llu}", i,

From d5d3d8a86870c2129dba478ec41877fd754007fc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 21:22:29 -0800
Subject: [PATCH 495/753] Automated Code Change

PiperOrigin-RevId: 846055623
---
 third_party/xla/xla/tsl/profiler/utils/buffer_pool.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/tsl/profiler/utils/buffer_pool.cc b/third_party/xla/xla/tsl/profiler/utils/buffer_pool.cc
index 43f9c6f2c07bb0..f0a791c4fd84a9 100644
--- a/third_party/xla/xla/tsl/profiler/utils/buffer_pool.cc
+++ b/third_party/xla/xla/tsl/profiler/utils/buffer_pool.cc
@@ -50,8 +50,8 @@ uint8_t* BufferPool::GetOrCreateBuffer() {
 
   // Allocate and return a new buffer.
   constexpr size_t kBufferAlignSize = 8;
-  uint8_t* buffer = reinterpret_cast<uint8_t*>(
-      port::AlignedMalloc(buffer_size_in_bytes_, kBufferAlignSize));
+  uint8_t* buffer = reinterpret_cast<uint8_t*>(port::AlignedMalloc(
+      buffer_size_in_bytes_, static_cast<std::align_val_t>(kBufferAlignSize)));
   if (buffer == nullptr) {
     LOG(WARNING) << "Buffer not allocated.";
     return nullptr;

From 2bc359c7b12f4e1a5dd6e845574cf82bf73f50f7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 21:30:13 -0800
Subject: [PATCH 496/753] Automated Code Change

PiperOrigin-RevId: 846058829
---
 tensorflow/core/grappler/optimizers/static_schedule_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/static_schedule_test.cc b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
index b46afca62dc4a3..46ae83f1f04a3b 100644
--- a/tensorflow/core/grappler/optimizers/static_schedule_test.cc
+++ b/tensorflow/core/grappler/optimizers/static_schedule_test.cc
@@ -40,7 +40,7 @@ class StaticScheduleTest : public ::testing::Test {
     cpu_device.set_l1_cache_size(32 * 1024);
     cpu_device.set_l2_cache_size(256 * 1024);
     cpu_device.set_l3_cache_size(4 * 1024 * 1024);
-    std::unordered_map<string, DeviceProperties> devices;
+    std::unordered_map<std::string, DeviceProperties> devices;
     devices["/job:localhost/replica:0/task:0/cpu:0"] = cpu_device;
     return std::unique_ptr<VirtualCluster>(new VirtualCluster(devices));
   }

From 08c7eea5191d293487fef24eae8c9b53eebe4df6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 22:48:54 -0800
Subject: [PATCH 497/753] Automated Code Change

PiperOrigin-RevId: 846084506
---
 third_party/xla/xla/pjrt/partial_program_utils.cc       | 4 ++--
 third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/pjrt/partial_program_utils.cc b/third_party/xla/xla/pjrt/partial_program_utils.cc
index c7568af47e565c..c1b1febd34d288 100644
--- a/third_party/xla/xla/pjrt/partial_program_utils.cc
+++ b/third_party/xla/xla/pjrt/partial_program_utils.cc
@@ -46,8 +46,8 @@ ConvertCharBuffersToPjRtPartialProgramProtos(
   partial_programs.reserve(char_buffers.size());
   for (size_t i = 0; i < char_buffers.size(); ++i) {
     xla::PjRtPartialProgramProto partial_program;
-    bool success =
-        partial_program.ParseFromArray(char_buffers[i], char_buffer_sizes[i]);
+    bool success = partial_program.ParseFromString(
+        absl::string_view(char_buffers[i], char_buffer_sizes[i]));
     if (!success) {
       return absl::InvalidArgumentError(
           "Failed to deserialize PjRtPartialProgramProto");
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index b47aaf7f446ea0..56f631b46a9d0e 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -2843,7 +2843,7 @@ PjRtStreamExecutorClient::DeserializeToLocalExecutable(
   if (serialized.size() > std::numeric_limits<int>::max()) {
     return Internal("Proto is too large (>2GB)");
   }
-  if (!proto.ParseFromArray(serialized.data(), serialized.size())) {
+  if (!proto.ParseFromString(serialized)) {
     return Internal("Proto deserialization failed");
   }
   if (!proto.pjrt_client_name().empty() &&

From d0ac32c16b2e0f600137c5c0b3977ef79ee843df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 23:14:39 -0800
Subject: [PATCH 498/753] Automated Code Change

PiperOrigin-RevId: 846092700
---
 tensorflow/core/framework/allocator_test.cc        | 2 +-
 tensorflow/core/framework/function_handle_cache.cc | 2 +-
 tensorflow/core/framework/model_test.cc            | 4 ++--
 tensorflow/core/framework/resource_mgr.cc          | 2 +-
 tensorflow/core/framework/tensor.h                 | 7 ++++---
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/framework/allocator_test.cc b/tensorflow/core/framework/allocator_test.cc
index ba3f396b6c3ef0..76bfb059935786 100644
--- a/tensorflow/core/framework/allocator_test.cc
+++ b/tensorflow/core/framework/allocator_test.cc
@@ -210,7 +210,7 @@ TEST(CPUAllocatorTest, Sizes) {
 
 TEST(CPUAllocatorTest, ProfilerReporting) {
   // TODO(b/196611863): Make debugging work even without GetAllocatedSize.
-  void* p = port::AlignedMalloc(8, 1);
+  void* p = tsl::port::AlignedMalloc(8, static_cast<std::align_val_t>(1));
   const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p);
   port::AlignedFree(p);
   if (alloc_size == 0) {
diff --git a/tensorflow/core/framework/function_handle_cache.cc b/tensorflow/core/framework/function_handle_cache.cc
index d0d995cbcc3712..e26467011ac2dd 100644
--- a/tensorflow/core/framework/function_handle_cache.cc
+++ b/tensorflow/core/framework/function_handle_cache.cc
@@ -23,7 +23,7 @@ namespace tensorflow {
 FunctionHandleCache::FunctionHandleCache(FunctionLibraryRuntime* lib)
     : lib_(lib),
       state_handle_(
-          strings::Printf("%lld", static_cast<long long>(random::New64()))) {}
+          absl::StrFormat("%lld", static_cast<long long>(random::New64()))) {}
 
 FunctionHandleCache::~FunctionHandleCache() {
   absl::Status s = Clear();
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 6ad728f1a0de2c..16e9df7641753b 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -1657,7 +1657,7 @@ TEST_F(ModelTimingTest, TestDefaultParallelismInParallelInterleave) {
   const int32_t parallelism = 1;
   const int32_t deterministic = 1;
   const int32_t cycle_length = 3;
-  ComputeModelTiming(strings::Printf(
+  ComputeModelTiming(absl::StrFormat(
       R"pb(
         nodes: {
           key: 1
@@ -1841,7 +1841,7 @@ TEST_P(ParallelInterleaveTimingTest, ScenarioTest) {
   const int32_t parallelism = std::get<0>(GetParam());
   const int32_t deterministic = std::get<1>(GetParam());
   const int32_t cycle_length = std::get<2>(GetParam());
-  ComputeModelTiming(strings::Printf(
+  ComputeModelTiming(absl::StrFormat(
       R"pb(
         nodes: {
           key: 1
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index c83acfe5329311..0c59566c84261b 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -204,7 +204,7 @@ std::string ResourceMgr::DebugString() const {
   std::vector<std::string> text;
   text.reserve(lines.size());
   for (const Line& line : lines) {
-    text.push_back(strings::Printf(
+    text.push_back(absl::StrFormat(
         "%-20s | %-40s | %-40s | %-s", line.container->c_str(),
         line.type.c_str(), line.resource->c_str(), line.detail.c_str()));
   }
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 5db5b0bcd74e84..fa19396557bf0a 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -1095,9 +1095,10 @@ void Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer::operator delete(
 
 template <typename T>
 Tensor::Tensor(T value, host_scalar_tag tag) {
-  auto* value_and_buf = static_cast<Tensor::ValueAndTensorBuffer<T>*>(
-      port::AlignedMalloc(sizeof(typename Tensor::ValueAndTensorBuffer<T>),
-                          EIGEN_MAX_ALIGN_BYTES));
+  auto* value_and_buf =
+      static_cast<Tensor::ValueAndTensorBuffer<T>*>(tsl::port::AlignedMalloc(
+          sizeof(typename Tensor::ValueAndTensorBuffer<T>),
+          static_cast<std::align_val_t>(EIGEN_MAX_ALIGN_BYTES)));
   new (&value_and_buf->value) T(std::move(value));
   new (&value_and_buf->tensor_buffer)
       typename Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer(

From 5223bfde21ab3c59380060304fb089c52cdaa64d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 17 Dec 2025 23:46:40 -0800
Subject: [PATCH 499/753] Automated Code Change

PiperOrigin-RevId: 846104031
---
 .../runtime/fallback_batch_kernel.cc           |  5 +++--
 .../runtime/fallback_batch_kernel.h            | 18 +++++++++---------
 .../runtime_fallback_batch_tf_opkernels.cc     |  2 +-
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc
index 04149fd2b397b4..a35e77ae99776f 100644
--- a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc
+++ b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.cc
@@ -50,8 +50,9 @@ constexpr char kBatchesToAverageOverAttr[] = "_batches_to_average_over";
 
 }  // namespace
 
-int32 BatchFunctionFallbackKernelBase::
-    NumBatchThreadsFromEnvironmentWithDefault(int default_num_batch_threads) {
+int32_t
+BatchFunctionFallbackKernelBase::NumBatchThreadsFromEnvironmentWithDefault(
+    int default_num_batch_threads) {
   int32_t num;
   const char* val = std::getenv("TF_NUM_BATCH_THREADS");
 
diff --git a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h
index f053704fd50dcb..3b26516602d4d2 100644
--- a/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h
+++ b/tensorflow/core/runtime_fallback/runtime/fallback_batch_kernel.h
@@ -67,7 +67,7 @@ class BatchFunctionFallbackKernelBase : public AsyncOpKernel {
   void SetAdaptiveBatchSchedulerOptions(OpKernelConstruction* c,
                                         int32_t num_batch_threads);
 
-  static int32 NumBatchThreadsFromEnvironmentWithDefault(
+  static int32_t NumBatchThreadsFromEnvironmentWithDefault(
       int default_num_batch_threads);
   static thread::ThreadPool* GetOrCreateBatchThreadsPool();
   static constexpr int64_t kBatchThreadPoolSize = 128;
@@ -80,10 +80,10 @@ class BatchFunctionFallbackKernelBase : public AsyncOpKernel {
   int32_t batch_timeout_micros_;
   int32_t max_enqueued_batches_;
   std::vector<int32_t> allowed_batch_sizes_;
-  int32 low_priority_max_batch_size_;
-  int32 low_priority_batch_timeout_micros_;
-  int32 low_priority_max_enqueued_batches_;
-  std::vector<int32> low_priority_allowed_batch_sizes_;
+  int32_t low_priority_max_batch_size_;
+  int32_t low_priority_batch_timeout_micros_;
+  int32_t low_priority_max_enqueued_batches_;
+  std::vector<int32_t> low_priority_allowed_batch_sizes_;
   std::string mixed_priority_policy_;
   bool enable_large_batch_splitting_;
   bool has_attribute_enable_large_batch_splitting_;
@@ -100,10 +100,10 @@ class BatchFunctionFallbackKernelBase : public AsyncOpKernel {
   static constexpr int64_t kMaxInflightBatches = 64;
   bool enable_adaptive_batch_threads_ = false;
   struct AdaptiveBatchSchedulerOptions {
-    int32 min_in_flight_batches_limit = kMinInflightBatches;
-    int32 initial_in_flight_batches_limit = kInitialInflightBatches;
-    int32 max_in_flight_batches_limit = kMaxInflightBatches;
-    int32 batches_to_average_over = kBatchesToAverageOver;
+    int32_t min_in_flight_batches_limit = kMinInflightBatches;
+    int32_t initial_in_flight_batches_limit = kInitialInflightBatches;
+    int32_t max_in_flight_batches_limit = kMaxInflightBatches;
+    int32_t batches_to_average_over = kBatchesToAverageOver;
   };
   std::optional<AdaptiveBatchSchedulerOptions>
       adaptive_batch_scheduler_options_ = std::nullopt;
diff --git a/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc b/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc
index 100290da8bff1e..9e7e9678635db5 100644
--- a/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc
+++ b/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc
@@ -193,7 +193,7 @@ class FallbackBatchResource : public tensorflow::serving::BatchResourceBase {
     return absl::OkStatus();
   }
 
-  string DebugString() const final { return "FallbackBatchResource"; }
+  std::string DebugString() const final { return "FallbackBatchResource"; }
 
   const tsl::RCReference<const tfrt::Function>& batch_function() const {
     return bef_func_;

From cf9a56a83a718240191440e124b97de8857cd579 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 01:01:41 -0800
Subject: [PATCH 500/753] Automated Code Change

PiperOrigin-RevId: 846130775
---
 .../tensorflow/transforms/host_runtime/tpu_metadata_utils.cc  | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc
index ac9c18602804d7..a5bd582b7c2b5f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_metadata_utils.cc
@@ -224,8 +224,8 @@ LogicalResult SetMetadataProtoFromClusterFuncOp(
     tensorflow::tpu::TPUCompileMetadataProto* metadata) {
   if (auto options_attr =
           op->getAttrOfType<StringAttr>("tpu_compile_options_proto")) {
-    if (!metadata->mutable_compile_options()->ParseFromArray(
-            options_attr.data(), options_attr.size())) {
+    if (!metadata->mutable_compile_options()->ParseFromString(
+            absl::string_view(options_attr.data(), options_attr.size()))) {
       return failure();
     }
   }

From 4a142bc1a6451c6d2789ba4ca49605f42adb145b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 01:03:49 -0800
Subject: [PATCH 501/753] Update GraphDef version to 2445.

PiperOrigin-RevId: 846131536
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 607ccceb50bda9..d7d8fd441463ee 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2444  // Updated: 2025/12/17
+#define TF_GRAPH_DEF_VERSION 2445  // Updated: 2025/12/18
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 8c32a65652c2312c37dcaede1c11c5f0bca70dbc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 01:03:50 -0800
Subject: [PATCH 502/753] compat: Update forward compatibility horizon to
 2025-12-18

PiperOrigin-RevId: 846131540
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 67a1d81ddb3b58..a84c2af8863d2c 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 17)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 18)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From e560901dcd27f09dccdb8ec6c26ec4ad46d0ab79 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 01:04:32 -0800
Subject: [PATCH 503/753] Reverts 6d3c0f702ff1a90769541228ac10ba1fa5774aa8

PiperOrigin-RevId: 846131788
---
 .../xla/xla/stream_executor/cuda/BUILD        | 63 ++-------------
 .../xla/stream_executor/cuda/cuda_executor.cc | 77 ++++++++++++++++---
 .../cuda/nvshmem_memory_allocator_stub.cc     | 29 -------
 3 files changed, 70 insertions(+), 99 deletions(-)
 delete mode 100644 third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc

diff --git a/third_party/xla/xla/stream_executor/cuda/BUILD b/third_party/xla/xla/stream_executor/cuda/BUILD
index 622c4fa354e72f..f2126b3e9ad1ba 100644
--- a/third_party/xla/xla/stream_executor/cuda/BUILD
+++ b/third_party/xla/xla/stream_executor/cuda/BUILD
@@ -1029,64 +1029,10 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "nvshmem_memory_allocator_if_builtin_used",
-    tags = [
-        "cuda-only",
-        "gpu",
-    ],
-    deps = select({
-        "//xla/stream_executor/cuda:no_builtin_used": [
-            ":nvshmem_memory_allocator_stub",
-        ],
-        "//conditions:default": [":nvshmem_memory_allocator"],
-    }),
-)
-
-cc_library(
-    name = "nvshmem_memory_allocator_if_supported",
-    hdrs = ["nvshmem_memory_allocator.h"],
-    tags = [
-        "cuda-only",
-        "gpu",
-    ],
-    deps = select({
-        "//xla/stream_executor/cuda:nvshmem_supported": [
-            ":nvshmem_memory_allocator_if_builtin_used",
-        ],
-        "//conditions:default": [":nvshmem_memory_allocator_stub"],
-    }) + [
-        "//xla/stream_executor:memory_allocation",
-        "//xla/stream_executor:memory_allocator",
-        "@com_google_absl//absl/status:statusor",
-    ],
-)
-
-# Used when NVSHMEM is not linked or can't be used.
-cc_library(
-    name = "nvshmem_memory_allocator_stub",
-    srcs = [
-        "nvshmem_memory_allocator.h",
-        "nvshmem_memory_allocator_stub.cc",
-    ],
-    tags = [
-        "cuda-only",
-        "gpu",
-    ],
-    deps = [
-        "//xla/stream_executor:memory_allocation",
-        "//xla/stream_executor:memory_allocator",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/status:statusor",
-    ],
-)
-
 cc_library(
     name = "nvshmem_memory_allocator",
-    srcs = [
-        "nvshmem_memory_allocator.cc",
-        "nvshmem_memory_allocator.h",
-    ],
+    srcs = ["nvshmem_memory_allocator.cc"],
+    hdrs = ["nvshmem_memory_allocator.h"],
     tags = [
         "cuda-only",
         "gpu",
@@ -1321,10 +1267,11 @@ cc_library(
         ":cuda_timer",
         ":cuda_version_parser",
         ":cudnn_api_wrappers",
-        ":nccl_memory_allocator",
-        ":nvshmem_memory_allocator_if_supported",
         ":tma_util",
         "//xla:util",
+        "//xla/backends/gpu/collectives:gpu_collectives",
+        "//xla/core/collectives",
+        "//xla/core/collectives:collectives_registry",
         "//xla/stream_executor:activate_context",
         "//xla/stream_executor:blas",
         "//xla/stream_executor:command_buffer",
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
index 7aa0c84b67ce53..32d6ef67a058dd 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor.cc
@@ -51,6 +51,9 @@ limitations under the License.
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
 #include "third_party/gpus/cuda/include/driver_types.h"
 #include "third_party/gpus/cuda/nvml/include/nvml.h"
+#include "xla/backends/gpu/collectives/gpu_collectives.h"
+#include "xla/core/collectives/collectives.h"
+#include "xla/core/collectives/collectives_registry.h"
 #include "xla/stream_executor/activate_context.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/command_buffer.h"
@@ -66,8 +69,6 @@ limitations under the License.
 #include "xla/stream_executor/cuda/cuda_timer.h"
 #include "xla/stream_executor/cuda/cuda_version_parser.h"
 #include "xla/stream_executor/cuda/cudnn_api_wrappers.h"
-#include "xla/stream_executor/cuda/nccl_memory_allocator.h"
-#include "xla/stream_executor/cuda/nvshmem_memory_allocator.h"
 #include "xla/stream_executor/cuda/tma_util.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_description.h"
@@ -840,6 +841,14 @@ CudaExecutor::~CudaExecutor() {
   CHECK(gpu_binary_to_module_.empty()) << "CudaExecutor has loaded modules.";
 }
 
+absl::StatusOr<xla::gpu::GpuCollectives*> GetGpuCollectives(
+    StreamExecutor* executor) {
+  std::unique_ptr<ActivateContext> activation = executor->Activate();
+  TF_ASSIGN_OR_RETURN(xla::Collectives * collectives,
+                      xla::CollectivesRegistry::Default("gpu"));
+  return tsl::down_cast<xla::gpu::GpuCollectives*>(collectives);
+}
+
 CudaExecutor::VmmMemoryHandle::~VmmMemoryHandle() { CHECK_OK(Release()); }
 
 absl::Status CudaExecutor::VmmMemoryHandle::Release() {
@@ -970,6 +979,27 @@ absl::StatusOr<bool> CudaExecutor::VmmDeallocateMemory(void* ptr) {
   return true;
 }
 
+absl::StatusOr<void*> CollectiveMemoryAllocate(StreamExecutor* executor,
+                                               uint64_t bytes) {
+  if (bytes == 0) {
+    return nullptr;
+  }
+
+  std::unique_ptr<ActivateContext> activation = executor->Activate();
+  TF_ASSIGN_OR_RETURN(xla::gpu::GpuCollectives * gpu_collectives,
+                      GetGpuCollectives(executor));
+  return gpu_collectives->Allocate(bytes);
+}
+
+absl::Status CollectiveMemoryDeallocate(StreamExecutor* executor,
+                                        void* location) {
+  std::unique_ptr<ActivateContext> activation = executor->Activate();
+
+  TF_ASSIGN_OR_RETURN(xla::gpu::GpuCollectives * gpu_collectives,
+                      GetGpuCollectives(executor));
+  return gpu_collectives->Deallocate(location);
+}
+
 absl::StatusOr<std::unique_ptr<MemoryAllocator>>
 CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
   if (type == MemorySpace::kUnified) {
@@ -1005,16 +1035,28 @@ CudaExecutor::CreateMemoryAllocator(MemorySpace type) {
   }
 
   if (type == MemorySpace::kCollective) {
-    switch (collective_allocator_type_) {
-      case CollectiveAllocatorType::kNvshmem:
-        return std::make_unique<NvshmemMemoryAllocator>();
-      case CollectiveAllocatorType::kNccl:
-        return std::make_unique<NcclMemoryAllocator>(this);
-      default:
-        return absl::UnimplementedError(
-            absl::StrCat("Unsupported collective allocator type: ",
-                         collective_allocator_type_));
-    }
+    // TODO(469289220): Use NCCL/NVSHMEM memory allocator here instead.
+    return std::make_unique<GenericMemoryAllocator>(
+        [this](uint64_t size)
+            -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {
+          TF_ASSIGN_OR_RETURN(void* ptr, CollectiveMemoryAllocate(this, size));
+          XLA_VLOG_DEVICE(2, device_ordinal())
+              << "allocated " << ptr << " for context " << cuda_context_
+              << " of " << size << " bytes of collective memory";
+          return std::make_unique<GenericMemoryAllocation>(
+              ptr, size, [this](void* location, uint64_t size) {
+                auto status = CollectiveMemoryDeallocate(this, location);
+                if (!status.ok()) {
+                  XLA_LOG_DEVICE(ERROR, device_ordinal())
+                      << "failed to free collective memory at " << location
+                      << "; result: " << status;
+                } else {
+                  XLA_VLOG_DEVICE(2, device_ordinal())
+                      << "deallocated collective memory at " << location
+                      << " for context " << cuda_context_;
+                }
+              });
+        });
   }
 
   if (type == MemorySpace::kHost) {
@@ -1363,6 +1405,17 @@ DeviceAddressBase CudaExecutor::Allocate(uint64_t size, int64_t memory_space) {
       << "CudaExecutor::Allocate size: " << size
       << " memory_space: " << memory_space;
 
+  if (memory_space == static_cast<int64_t>(MemorySpace::kCollective)) {
+    auto result = CollectiveMemoryAllocate(this, size);
+    if (!result.ok()) {
+      XLA_LOG_DEVICE(ERROR, device_ordinal())
+          << "CudaExecutor::Allocate returns " << result.value();
+    }
+    XLA_VLOG_DEVICE(1, device_ordinal())
+        << "CudaExecutor::Allocate returns " << result.value();
+    return DeviceAddressBase(result.value(), size);
+  }
+
   if (memory_space == static_cast<int64_t>(MemorySpace::kHost)) {
     auto result = HostAllocate(cuda_context_, numa_node_, size);
     if (!result.ok()) {
diff --git a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc b/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc
deleted file mode 100644
index d4d124b89af8db..00000000000000
--- a/third_party/xla/xla/stream_executor/cuda/nvshmem_memory_allocator_stub.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-/* Copyright 2025 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdint>
-#include <memory>
-
-#include "absl/status/status.h"
-#include "absl/status/statusor.h"
-#include "xla/stream_executor/cuda/nvshmem_memory_allocator.h"
-#include "xla/stream_executor/memory_allocation.h"
-
-namespace stream_executor::gpu {
-absl::StatusOr<std::unique_ptr<MemoryAllocation>>
-NvshmemMemoryAllocator::Allocate(uint64_t size) {
-  return absl::UnimplementedError("NVSHMEM is not supported on this platform.");
-}
-}  // namespace stream_executor::gpu

From b79b6d8f75ed3e9fc8fc638fdf7b21a7e830b6be Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 01:14:34 -0800
Subject: [PATCH 504/753] Automated Code Change

PiperOrigin-RevId: 846135593
---
 .../mlir/lite/quantization/lite/quantize_weights_test.cc   | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc
index 1e1f79af16cbd6..b131a5f0e1060b 100644
--- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc
@@ -43,7 +43,7 @@ limitations under the License.
 // Note: branched from tensorflow/lite/tools/optimize/quantize_weights_test.cc
 
 namespace {
-tensorflow::string* g_test_model_dir = nullptr;
+std::string* g_test_model_dir = nullptr;
 }  // namespace
 
 namespace tflite {
@@ -766,7 +766,7 @@ TEST_F(QuantizeWeightsTest, DequantizeConvBlocklisted) {
 }  // namespace tflite
 
 int main(int argc, char** argv) {
-  tensorflow::string model_file;
+  std::string model_file;
   const std::vector<tensorflow::Flag> flag_list = {
       tensorflow::Flag("test_model_file", &model_file,
                        "Path to test tflite model file."),
@@ -777,8 +777,7 @@ int main(int argc, char** argv) {
     std::cerr << "Required test_model_file\n";
     std::abort();
   }
-  g_test_model_dir =
-      new tensorflow::string(tensorflow::io::Dirname(model_file));
+  g_test_model_dir = new std::string(tensorflow::io::Dirname(model_file));
   ::tensorflow::port::InitMain(argv[0], &argc, &argv);
   return RUN_ALL_TESTS();
 }

From 5174b1f74c877706498a605e6a6a75244e705dd4 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Thu, 18 Dec 2025 01:22:36 -0800
Subject: [PATCH 505/753] [XLA:GPU] Add Sort Fusion kind and corresponding
 FusionInterface.

PiperOrigin-RevId: 846138786
---
 .../xla/xla/backends/gpu/codegen/BUILD        |  25 ++++
 .../xla/xla/backends/gpu/codegen/fusions.cc   |   4 +
 .../xla/xla/backends/gpu/codegen/llvm/BUILD   |   2 +-
 .../backends/gpu/codegen/llvm/llvm_emitter.cc |  67 +++++-----
 .../xla/xla/backends/gpu/codegen/sort.cc      | 114 ++++++++++++++++++
 .../xla/xla/backends/gpu/codegen/sort.h       |  40 ++++++
 third_party/xla/xla/service/gpu/BUILD         |   1 -
 .../xla/service/gpu/hlo_fusion_analysis.cc    |   4 +
 .../xla/xla/service/gpu/hlo_fusion_analysis.h |   1 +
 .../service/gpu/hlo_fusion_analysis_test.cc   |  66 +++++++---
 .../xla/xla/service/gpu/tests/sorting_test.cc |  30 +++++
 .../service/gpu/transforms/priority_fusion.cc |   1 +
 12 files changed, 304 insertions(+), 51 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/codegen/sort.cc
 create mode 100644 third_party/xla/xla/backends/gpu/codegen/sort.h

diff --git a/third_party/xla/xla/backends/gpu/codegen/BUILD b/third_party/xla/xla/backends/gpu/codegen/BUILD
index edf523644fe7ba..702d1850efef92 100644
--- a/third_party/xla/xla/backends/gpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/BUILD
@@ -297,6 +297,7 @@ cc_library(
         ":cudnn",
         ":custom",
         ":fusion_emitter",
+        ":sort",
         "//xla:shape_util",
         "//xla/backends/gpu/codegen/emitters:concatenate",
         "//xla/backends/gpu/codegen/emitters:in_place_dynamic_update_slice",
@@ -316,3 +317,27 @@ cc_library(
         "@com_google_absl//absl/strings",
     ],
 )
+
+cc_library(
+    name = "sort",
+    srcs = ["sort.cc"],
+    hdrs = ["sort.h"],
+    deps = [
+        ":fusion_emitter",
+        "//xla:shape_util",
+        "//xla:status_macros",
+        "//xla:xla_data_proto_cc",
+        "//xla/backends/gpu/codegen/llvm:llvm_emitter",
+        "//xla/backends/gpu/runtime:copy_thunk",
+        "//xla/backends/gpu/runtime:shaped_slice",
+        "//xla/backends/gpu/runtime:thunk",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:ir_emitter_context",
+        "//xla/tsl/platform:status_macros",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:statusor",
+    ],
+)
diff --git a/third_party/xla/xla/backends/gpu/codegen/fusions.cc b/third_party/xla/xla/backends/gpu/codegen/fusions.cc
index d5528bf9eb6d5d..ebce9ff67d9355 100644
--- a/third_party/xla/xla/backends/gpu/codegen/fusions.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/fusions.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "xla/backends/gpu/codegen/emitters/scatter.h"
 #include "xla/backends/gpu/codegen/emitters/transpose.h"
 #include "xla/backends/gpu/codegen/fusion_emitter.h"
+#include "xla/backends/gpu/codegen/sort.h"
 #include "xla/backends/gpu/codegen/triton/fusion.h"
 #include "xla/codegen/ir_emission_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -121,6 +122,9 @@ std::unique_ptr<FusionInterface> GetFusionEmitter(
     case HloFusionAnalysis::EmitterFusionKind::kConcatenate: {
       return std::make_unique<ConcatenateFusion>(analysis);
     }
+    case HloFusionAnalysis::EmitterFusionKind::kSort: {
+      return std::make_unique<SortFusion>();
+    }
     case HloFusionAnalysis::EmitterFusionKind::kTriton:
       return std::make_unique<TritonFusion>(analysis);
     case HloFusionAnalysis::EmitterFusionKind::kCuDnn:
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
index 30a3ac1a965948..00d63c372342ef 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/BUILD
@@ -54,7 +54,7 @@ cc_library(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/gpu:tma_metadata",
         "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:statusor",
+        "//xla/tsl/platform:status_macros",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc b/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
index 1fcbeb98e9d9ff..3837599da053fb 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/llvm_emitter.cc
@@ -89,10 +89,10 @@ limitations under the License.
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/tma_metadata.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/fingerprint.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla::gpu {
 namespace {
@@ -363,7 +363,7 @@ absl::Status CallNestedComputation(llvm::IRBuilderBase* builder,
                                    llvm::Value* output) {
   TF_RET_CHECK(computation.num_parameters() > 0);
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       llvm::Function * emitted_function,
       IrEmitter(&ir_emitter_context, llvm_module, /*is_nested=*/true)
           .CodegenNestedComputation(computation));
@@ -461,7 +461,7 @@ absl::StatusOr<llvm::Function*> IrEmitter::CodegenNestedComputation(
     return function;
   }
 
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       EmitConstants(module_, ir_emitter_context_, nested_computation));
   std::vector<const HloInstruction*> io_hlos;
   std::vector<llvm::Type*> argument_types;
@@ -522,7 +522,7 @@ absl::StatusOr<llvm::Function*> IrEmitter::CodegenNestedComputation(
   }
   bindings_.EmitBasePointersForHlos(io_hlos, non_io_hlos);
 
-  TF_RETURN_IF_ERROR(nested_computation.root_instruction()->Accept(this));
+  RETURN_IF_ERROR(nested_computation.root_instruction()->Accept(this));
   b_.SetInsertPoint(ret_instr);
 
   // Function epilogue: copy the output value back.
@@ -576,7 +576,7 @@ absl::Status IrEmitter::EmitTargetElementLoop(
   if (hlo.shape().IsTuple()) {
     std::vector<llvm_ir::IrArray> target_arrays =
         ConstructIrArrayForOutputs(hlo);
-    TF_RETURN_IF_ERROR(
+    RETURN_IF_ERROR(
         llvm_ir::LoopEmitter(element_generator, target_arrays, &b_).EmitLoop());
     llvm_ir::EmitTuple(GetIrArray(hlo, hlo), target_arrays, &b_);
     return absl::OkStatus();
@@ -599,13 +599,13 @@ absl::StatusOr<KernelThunkInfo> BuildKernelThunkForNonFusionOp(
     IrEmitter& ir_emitter, const LaunchDimensions& launch_dimensions) {
   std::string suggested_kernel_name(hlo->name());
 
-  TF_ASSIGN_OR_RETURN(auto kernel_arguments,
-                      emitters::KernelArguments::Create(
-                          buffer_assignment, GetDefaultBufferAlignment(), hlo));
+  ASSIGN_OR_RETURN(auto kernel_arguments,
+                   emitters::KernelArguments::Create(
+                       buffer_assignment, GetDefaultBufferAlignment(), hlo));
 
   VLOG(3) << "Generating (without reuse check): " << suggested_kernel_name;
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       llvm::Function * kernel,
       BuildKernelPrototype(llvm_module, gpu_device_info, suggested_kernel_name,
                            sanitized_kernel_name, kernel_arguments,
@@ -867,14 +867,17 @@ absl::StatusOr<ThunkSequence> EmitBitonicSortLLVMIR(
     LaunchDimensions launch_dimensions = xor_masks.size() > 1
                                              ? tiled_launch_dimensions
                                              : standard_launch_dimensions;
-    TF_ASSIGN_OR_RETURN(
-        KernelThunkInfo kernel_thunk_info,
-        BuildKernelThunkForNonFusionOp(
-            llvm_module, sort, ir_emitter_context->buffer_assignment(),
-            ir_emitter_context->GetNextThunkId(),
-            ir_emitter_context->gpu_device_info(),
-            ir_emitter_context->GetSanitizedUniqueName(op_name), ir_emitter,
-            launch_dimensions));
+    bool is_fusion = sort->parent()->IsFusionComputation();
+    const HloInstruction* hlo_with_buffers =
+        is_fusion ? sort->parent()->FusionInstruction() : sort;
+    ASSIGN_OR_RETURN(KernelThunkInfo kernel_thunk_info,
+                     BuildKernelThunkForNonFusionOp(
+                         llvm_module, hlo_with_buffers,
+                         ir_emitter_context->buffer_assignment(),
+                         ir_emitter_context->GetNextThunkId(),
+                         ir_emitter_context->gpu_device_info(),
+                         ir_emitter_context->GetSanitizedUniqueName(op_name),
+                         ir_emitter, launch_dimensions));
     thunks.push_back(std::move(kernel_thunk_info.thunk));
 
     // The first `operand_count()` elements of `ir_arrays` are the input
@@ -882,7 +885,7 @@ absl::StatusOr<ThunkSequence> EmitBitonicSortLLVMIR(
     // outputs, so we need to pass only the outputs to the in-place sort kernel.
     auto output_arrays_span =
         absl::Span<const llvm_ir::IrArray>(kernel_thunk_info.ir_arrays)
-            .subspan(sort->operand_count());
+            .subspan(hlo_with_buffers->operand_count());
 
     auto* comparator = sort->called_computations().front();
     auto* builder = ir_emitter.builder();
@@ -911,17 +914,17 @@ absl::StatusOr<ThunkSequence> EmitBitonicSortLLVMIR(
       }
       if (xor_mask >= tile_size) {
         if (!xor_masks.empty()) {
-          TF_RETURN_IF_ERROR(emit_kernel(xor_masks));
+          RETURN_IF_ERROR(emit_kernel(xor_masks));
           xor_masks.clear();
         }
-        TF_RETURN_IF_ERROR(emit_kernel({xor_mask}));
+        RETURN_IF_ERROR(emit_kernel({xor_mask}));
       } else {
         xor_masks.push_back(xor_mask);
       }
     }
   }
   if (!xor_masks.empty()) {
-    TF_RETURN_IF_ERROR(emit_kernel(xor_masks));
+    RETURN_IF_ERROR(emit_kernel(xor_masks));
   }
   return thunks;
 }
@@ -941,7 +944,7 @@ absl::StatusOr<ThunkSequence> EmitPadToStaticLLVMIR(
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       input_shape, ir_emitter_context->gpu_device_info(), {kUnrollFactor});
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       KernelThunkInfo kernel_thunk_info,
       BuildKernelThunkForNonFusionOp(
           llvm_module, hlo, ir_emitter_context->buffer_assignment(),
@@ -1067,10 +1070,10 @@ absl::StatusOr<ThunkSequence> EmitPadToStaticLLVMIR(
   };
 
   const Shape& data_shape = hlo->shape().tuple_shapes(0);
-  TF_RETURN_IF_ERROR(ParallelLoopEmitter(body_generator, data_shape,
-                                         launch_dimensions,
-                                         ir_emitter.builder(), {kUnrollFactor})
-                         .EmitLoop(ir_name, index_ty));
+  RETURN_IF_ERROR(ParallelLoopEmitter(body_generator, data_shape,
+                                      launch_dimensions, ir_emitter.builder(),
+                                      {kUnrollFactor})
+                      .EmitLoop(ir_name, index_ty));
   return thunk_sequence;
 }
 
@@ -1089,7 +1092,7 @@ absl::StatusOr<ThunkSequence> EmitSliceToDynamicLLVMIR(
       input_shape, ir_emitter_context->gpu_device_info(), {kUnrollFactor});
   llvm::Type* index_ty = GetIndexTypeForKernel(
       hlo, launch_dimensions.launch_bound(), ir_emitter.builder());
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       KernelThunkInfo kernel_thunk_info,
       BuildKernelThunkForNonFusionOp(
           llvm_module, hlo, ir_emitter_context->buffer_assignment(),
@@ -1205,10 +1208,10 @@ absl::StatusOr<ThunkSequence> EmitSliceToDynamicLLVMIR(
     return absl::OkStatus();
   };
 
-  TF_RETURN_IF_ERROR(ParallelLoopEmitter(body_generator, data_shape,
-                                         launch_dimensions,
-                                         ir_emitter.builder(), {kUnrollFactor})
-                         .EmitLoop(ir_name, index_ty));
+  RETURN_IF_ERROR(ParallelLoopEmitter(body_generator, data_shape,
+                                      launch_dimensions, ir_emitter.builder(),
+                                      {kUnrollFactor})
+                      .EmitLoop(ir_name, index_ty));
   return thunk_sequence;
 }
 
@@ -1222,7 +1225,7 @@ absl::StatusOr<ThunkSequence> EmitRngGetAndUpdateStateLLVMIR(
   auto& b = *ir_emitter.builder();
   // Emit a kernel to increment the global state for Philox RNG
   // algorithm.
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       KernelThunkInfo kernel_thunk_info,
       BuildKernelThunkForNonFusionOp(
           llvm_module, hlo, ir_emitter_context->buffer_assignment(),
diff --git a/third_party/xla/xla/backends/gpu/codegen/sort.cc b/third_party/xla/xla/backends/gpu/codegen/sort.cc
new file mode 100644
index 00000000000000..35c0049f09486c
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/codegen/sort.cc
@@ -0,0 +1,114 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/backends/gpu/codegen/sort.h"
+
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/statusor.h"
+#include "xla/backends/gpu/codegen/fusion_emitter.h"
+#include "xla/backends/gpu/codegen/llvm/llvm_emitter.h"
+#include "xla/backends/gpu/runtime/copy_thunk.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/layout.h"
+#include "xla/layout_util.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/service/gpu/backend_configs.pb.h"
+#include "xla/service/gpu/ir_emitter_context.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/status_macros.h"
+#include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
+
+namespace xla {
+namespace gpu {
+
+absl::StatusOr<FusionEmissionResult> SortFusion::Emit(
+    IrEmitterContext& ir_emitter_context,
+    const HloFusionInstruction& fusion) const {
+  std::vector<BufferAllocation::Slice> src_buffers;
+  std::vector<BufferAllocation::Slice> dst_buffers;
+  std::vector<Shape> src_shapes;
+  src_buffers.reserve(fusion.operand_count());
+  dst_buffers.reserve(fusion.operand_count());
+  src_shapes.reserve(fusion.operand_count());
+  const HloSortInstruction* sort =
+      Cast<HloSortInstruction>(fusion.fused_expression_root());
+  Shape keys_shape = sort->operand(0)->shape();
+  for (int64_t i = 0; i < sort->operand_count(); ++i) {
+    // We assume that the layout of all involved operands and
+    // outputs is the same.
+    TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+        keys_shape, sort->operand(i)->shape(),
+        Layout::Equal().IgnoreMemorySpace().IgnoreElementSize()));
+    ShapeIndex shape_index =
+        sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
+    TF_RET_CHECK(LayoutUtil::LayoutsInShapesEqual(
+        keys_shape, ShapeUtil::GetSubshape(sort->shape(), shape_index),
+        Layout::Equal().IgnoreMemorySpace().IgnoreElementSize()));
+    // We expect only parameters or iotas as operand of sort.
+    if (HloPredicateIsOp<HloOpcode::kParameter>(sort->operand(i))) {
+      const HloInstruction* src_instr =
+          fusion.operand(sort->operand(i)->parameter_number());
+      ASSIGN_OR_RETURN(
+          BufferAllocation::Slice slice,
+          ir_emitter_context.buffer_assignment().GetUniqueSlice(src_instr, {}));
+      src_buffers.push_back(slice);
+      src_shapes.push_back(sort->operand(i)->shape());
+      ASSIGN_OR_RETURN(slice,
+                       ir_emitter_context.buffer_assignment().GetUniqueSlice(
+                           &fusion, shape_index));
+      dst_buffers.push_back(slice);
+    } else {
+      TF_RET_CHECK(HloPredicateIsOp<HloOpcode::kIota>(sort->operand(i)));
+    }
+  }
+
+  FusionEmissionResult result;
+  for (int i = 0; i < src_buffers.size(); ++i) {
+    if (src_buffers[i] != dst_buffers[i]) {
+      result.thunks.emplace_back(std::make_unique<DeviceToDeviceCopyThunk>(
+          Thunk::ThunkInfo::WithProfileAnnotation(
+              &fusion, ir_emitter_context.GetNextThunkId()),
+          /*source_buffer=*/ShapedSlice{src_buffers[i], src_shapes[i]},
+          /*destination_buffer=*/ShapedSlice{dst_buffers[i], src_shapes[i]},
+          /*mem_size=*/src_buffers[i].size()));
+    }
+  }
+  std::string op_name(sort->name());
+  result.module = ir_emitter_context.CreateLLVMModule(op_name);
+  ASSIGN_OR_RETURN(
+      ThunkSequence sort_thunks,
+      EmitBitonicSortLLVMIR(sort, result.module.get(), &ir_emitter_context));
+  result.thunks.insert(result.thunks.end(),
+                       std::make_move_iterator(sort_thunks.begin()),
+                       std::make_move_iterator(sort_thunks.end()));
+  return result;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/third_party/xla/xla/backends/gpu/codegen/sort.h b/third_party/xla/xla/backends/gpu/codegen/sort.h
new file mode 100644
index 00000000000000..3125278f529c9e
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/codegen/sort.h
@@ -0,0 +1,40 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_BACKENDS_GPU_CODEGEN_SORT_H_
+#define XLA_BACKENDS_GPU_CODEGEN_SORT_H_
+
+#include "absl/status/statusor.h"
+#include "xla/backends/gpu/codegen/fusion_emitter.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/gpu/ir_emitter_context.h"
+
+namespace xla {
+namespace gpu {
+
+// A fusion consisting of a sort op with operands which are either parameters or
+// iotas.
+class SortFusion : public FusionInterface {
+ public:
+  SortFusion() = default;
+
+  absl::StatusOr<FusionEmissionResult> Emit(
+      IrEmitterContext& ir_emitter_context,
+      const HloFusionInstruction& fusion) const final;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // XLA_BACKENDS_GPU_CODEGEN_SORT_H_
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index daa3908e799508..5f81a412401377 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2788,7 +2788,6 @@ xla_cc_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/tests:xla_internal_test_main",
-        "//xla/tsl/platform:statusor",
         "//xla/tsl/util/proto:proto_matchers",
         "@com_google_googletest//:gtest",
     ],
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
index 3566a20ac1c4a3..feedcbdf5a4e6b 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.cc
@@ -182,6 +182,10 @@ HloFusionAnalysis::EmitterFusionKind GetEmitterFusionKind(
     return HloFusionAnalysis::EmitterFusionKind::kScatter;
   }
 
+  if (fusion_roots[0].opcode() == HloOpcode::kSort) {
+    return HloFusionAnalysis::EmitterFusionKind::kSort;
+  }
+
   if (UseConcatenateFusion(fusion_roots, fusion_heroes)) {
     return HloFusionAnalysis::EmitterFusionKind::kConcatenate;
   }
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
index a6bcd04e371213..78dba963b95ac1 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis.h
@@ -50,6 +50,7 @@ class HloFusionAnalysis {
     kScatter,
     kCuDnn,
     kDynamicMemcpy,
+    kSort,
   };
 
   // Precomputed information about inputs (arguments) and outputs (roots) of the
diff --git a/third_party/xla/xla/service/gpu/hlo_fusion_analysis_test.cc b/third_party/xla/xla/service/gpu/hlo_fusion_analysis_test.cc
index 93914ae3232637..7a02fcddf8ecf6 100644
--- a/third_party/xla/xla/service/gpu/hlo_fusion_analysis_test.cc
+++ b/third_party/xla/xla/service/gpu/hlo_fusion_analysis_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
-#include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
 
 namespace xla::gpu {
@@ -34,7 +33,7 @@ using ::tsl::proto_testing::EqualsProto;
 class HloFusionAnalysisTest : public HloHardwareIndependentTestBase {};
 
 TEST_F(HloFusionAnalysisTest, DoesNotPeekOutsideBoundary) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -64,7 +63,7 @@ TEST_F(HloFusionAnalysisTest, DoesNotPeekOutsideBoundary) {
 }
 
 TEST_F(HloFusionAnalysisTest, ReductionWithMultipleUsers) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -100,7 +99,7 @@ TEST_F(HloFusionAnalysisTest, ReductionWithMultipleUsers) {
 }
 
 TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusion) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -133,7 +132,7 @@ TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusion) {
 }
 
 TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusionPartiallyFused) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -166,7 +165,7 @@ TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusionPartiallyFused) {
 }
 
 TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusionPartiallyFusedInConsumer) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -197,7 +196,7 @@ TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusionPartiallyFusedInConsumer) {
 }
 
 TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusionPartiallyFusedInBoth) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -234,7 +233,7 @@ TEST_F(HloFusionAnalysisTest, ReductionEpilogueFusionPartiallyFusedInBoth) {
 }
 
 TEST_F(HloFusionAnalysisTest, ReduceMultiOutputFusionWithTransposeBitcast) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -266,7 +265,7 @@ TEST_F(HloFusionAnalysisTest, ReduceMultiOutputFusionWithTransposeBitcast) {
 }
 
 TEST_F(HloFusionAnalysisTest, InvalidReduceMultiOutputFusion) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -303,7 +302,7 @@ TEST_F(HloFusionAnalysisTest, InvalidDevice) {
   // Verifies that an analysis can be created even with an invalid/empty device
   // info, and that the emitter type is determined correctly.
   // Don't rely on this behavior.
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     add {
@@ -320,7 +319,7 @@ TEST_F(HloFusionAnalysisTest, InvalidDevice) {
     })"));
 
   stream_executor::GpuDeviceInfoProto device_info_proto;
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto device_info,
       stream_executor::DeviceDescription::FromProto(device_info_proto));
   device_info.set_threads_per_warp(32);
@@ -333,7 +332,7 @@ TEST_F(HloFusionAnalysisTest, InvalidDevice) {
 }
 
 TEST_F(HloFusionAnalysisTest, ConcatFusion) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule test_module
 
     fused_computation {
@@ -360,8 +359,41 @@ TEST_F(HloFusionAnalysisTest, ConcatFusion) {
             HloFusionAnalysis::EmitterFusionKind::kConcatenate);
 }
 
+TEST_F(HloFusionAnalysisTest, SortFusion) {
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+    HloModule test_module
+
+    less_than {
+      lhs.0 = f32[] parameter(0)
+      rhs.0 = f32[] parameter(1)
+      lhs.1 = s32[] parameter(2)
+      rhs.1 = s32[] parameter(3)
+      ROOT lt = pred[] compare(lhs.0, rhs.0), direction=LT
+    }
+
+    fused_computation {
+      p0 = f32[256] parameter(0)
+      iota = s32[256] iota(), iota_dimension=0
+      ROOT sort = (f32[256], s32[256]) sort(p0, iota), dimensions={0}, to_apply=less_than, is_stable=false
+    }
+
+    ENTRY main {
+      p = f32[256] parameter(0)
+      ROOT fusion = (f32[256], s32[256]) fusion(p), kind=kInput, calls=fused_computation
+    })"));
+
+  auto device_info = TestGpuDeviceInfo::RTXA6000DeviceInfo();
+
+  auto* root = module->entry_computation()->root_instruction();
+  auto analysis = HloFusionAnalysis::Create(
+      FusionBackendConfig::default_instance(),
+      HloFusionAdaptor::ForInstruction(root), &device_info);
+  EXPECT_EQ(analysis.emitter_fusion_kind(),
+            HloFusionAnalysis::EmitterFusionKind::kSort);
+}
+
 TEST_F(HloFusionAnalysisTest, ExtractValidGpuBackendConfig) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule module
 
     fused_computation.1 {
@@ -399,7 +431,7 @@ TEST_F(HloFusionAnalysisTest, ExtractValidGpuBackendConfig) {
 
 TEST_F(HloFusionAnalysisTest,
        InvalidGpuBackendConfig_SingleInstruction_Ignored) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule module
 
     ENTRY entry {
@@ -418,7 +450,7 @@ TEST_F(HloFusionAnalysisTest,
 
 TEST_F(HloFusionAnalysisTest,
        InvalidGpuBackendConfig_ProducerConsumer_Ignored) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule module
 
     fused_computation {
@@ -444,7 +476,7 @@ TEST_F(HloFusionAnalysisTest,
 }
 
 TEST_F(HloFusionAnalysisTest, ConcatenateFusion) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule module
 
     fusion {
@@ -475,7 +507,7 @@ TEST_F(HloFusionAnalysisTest, ConcatenateFusion) {
 }
 
 TEST_F(HloFusionAnalysisTest, ConcatenateFusionFallbackToLoop) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
     HloModule module
 
     fusion {
diff --git a/third_party/xla/xla/service/gpu/tests/sorting_test.cc b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
index a36504161cae5e..e8b6fad5b6d45c 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
@@ -146,6 +146,36 @@ TEST_F(SortingTest, PackedElementType) {
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
 }
 
+TEST_F(SortingTest, SortFusionWithIotaOperand) {
+  const char* hlo_text = R"(
+    HloModule module
+
+    sorting_computation {
+      %lhs_key = s32[] parameter(0)
+      %rhs_key = s32[] parameter(1)
+      %lhs_index = s32[] parameter(2)
+      %rhs_index = s32[] parameter(3)
+      %lt_key = pred[] compare(%lhs_key, %rhs_key), direction=LT
+      %gt_key = pred[] compare(%rhs_key, %lhs_key), direction=LT
+      %eq_key = pred[] compare(%lt_key, %gt_key), direction=EQ
+      %lt_index = pred[] compare(%lhs_index, %rhs_index), direction=LT
+      ROOT res = pred[] select(%eq_key, %lt_index, %lt_key)
+    }
+
+    sort_fusion {
+      p0 = s32[16384]{0} parameter(0)
+      iota = s32[16384]{0} iota(), iota_dimension=0
+      ROOT sort = (s32[16384]{0}, s32[16384]{0}) sort(p0, iota), dimensions={0}, is_stable=true, to_apply=sorting_computation
+    }
+
+    ENTRY main {
+      p = s32[16384]{0} parameter(0)
+      ROOT fusion = (s32[16384]{0}, s32[16384]{0}) fusion(p), kind=kInput, calls=sort_fusion
+    }
+  )";
+  EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5}));
+}
+
 // Test that verifies the IgnoreMemorySpace option works correctly
 TEST_F(SortingTest, LayoutsInShapesEqualWithIgnoreMemorySpace) {
   const char* hlo_text = R"(
diff --git a/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc b/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc
index e5669c1fa4440f..3630635a713d2e 100644
--- a/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc
@@ -1300,6 +1300,7 @@ HloInstruction::FusionKind PriorityFusion::ChooseKind(
     case HloFusionAnalysis::EmitterFusionKind::kReduction:
     case HloFusionAnalysis::EmitterFusionKind::kTranspose:
     case HloFusionAnalysis::EmitterFusionKind::kScatter:
+    case HloFusionAnalysis::EmitterFusionKind::kSort:
       return HloInstruction::FusionKind::kInput;
   }
 }

From 3d8a8b3367d27329e37d983c2ce9242b6cfae15d Mon Sep 17 00:00:00 2001
From: Aleksei Nurmukhametov <anurmukh@amd.com>
Date: Thu, 18 Dec 2025 01:43:15 -0800
Subject: [PATCH 506/753] PR #35353: [WIP ROCm] Fix flaky
 PersistedAutotuningTest.SingleOperationGetsAutotuned

Imported from GitHub PR https://github.com/openxla/xla/pull/35353

Clear the shared autotune cache in PersistedAutotuningTest::SetUp.

The test was randomly passing or failing depending on test execution order due to a shared global autotune_cache that persists across test executions.

With this fix, the test now consistently fails for ROCm/AMDGPU, which is the expected behavior since transpose autotuning (implemented in #35098) is not yet supported on ROCm/AMDGPU. The test was occasionally and incorrectly passing when it inherited autotune results from other tests via the shared cache.

Copybara import of the project:

--
b993d4130398474ac6d30b94a73f2e78ac26119c by Aleksei Nurmukhametov <anurmukh@amd.com>:

[ROCm] Fix flaky PersistedAutotuningTest.SingleOperationGetsAutotuned

Clear the shared autotune cache in PersistedAutotuningTest::SetUp.

The test was randomly passing or failing depending on test execution
order due to a shared global autotune_cache that persists across test
executions.

With this fix, the test now consistently fails for ROCm/AMDGPU, which is
the expected behavior since transpose autotuning (implemented in #35098)
is not yet supported on ROCm/AMDGPU. The test was occasionally and
incorrectly passing when it inherited autotune results from other tests
via the shared cache.

Merging this change closes #35353

PiperOrigin-RevId: 846146297
---
 third_party/xla/xla/service/gpu/gpu_compiler_test.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 61c501b9fa8b3f..855267bd792c9a 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -395,6 +395,13 @@ ENTRY e {
 
 class PersistedAutotuningTest : public HloTestBase {
  protected:
+  void SetUp() override {
+    AutotunerUtil::ClearAutotuneResults();
+    xla_gpu_dump_autotune_results_to_ = GetUniqueTempFilePath(".txt");
+  }
+
+  void TearDown() override { AutotunerUtil::ClearAutotuneResults(); }
+
   static constexpr absl::string_view kHloText = R"(
 HloModule t
 
@@ -428,7 +435,6 @@ ENTRY e {
 
 TEST_F(PersistedAutotuningTest, WriteResultsOnEachCompilation) {
   constexpr absl::string_view kInvalidTextProto = "Invalid!";
-  xla_gpu_dump_autotune_results_to_ = GetUniqueTempFilePath(".txt");
 
   // Check that it writes the results on the first compilation.
   TF_EXPECT_OK(GetOptimizedModule(kHloText).status());
@@ -459,8 +465,6 @@ TEST_F(PersistedAutotuningTest, WriteResultsOnEachCompilation) {
 }
 
 TEST_F(PersistedAutotuningTest, SingleOperationGetsAutotuned) {
-  xla_gpu_dump_autotune_results_to_ = GetUniqueTempFilePath(".txt");
-
   TF_EXPECT_OK(GetOptimizedModule(R"(
 e {
   a = f32[64,128] parameter(0)

From 2df2c4fac75e6792b3f304d1a6ec7e9a23e3c25b Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Thu, 18 Dec 2025 01:53:53 -0800
Subject: [PATCH 507/753] [XLA] Extend reshape-transpose chain removal to
 include bitcasts.

+ Allow the chain to start from <transpose, reshape, bitcast> instead of only reshape
+ Add a layout sensitive mode to the simplification

PiperOrigin-RevId: 846150097
---
 .../xla/xla/hlo/transforms/simplifiers/BUILD  |   1 +
 .../simplifiers/algebraic_simplifier.cc       | 113 ++++++++++++++----
 .../simplifiers/algebraic_simplifier.h        |   8 +-
 .../simplifiers/algebraic_simplifier_test.cc  | 108 ++++++++++++++++-
 4 files changed, 199 insertions(+), 31 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/BUILD b/third_party/xla/xla/hlo/transforms/simplifiers/BUILD
index 3bc314542c2b85..85d50f4c030665 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/BUILD
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/BUILD
@@ -405,6 +405,7 @@ cc_library(
         "//xla/service:pattern_matcher",
         "//xla/service:shape_inference",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:nullability",
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc
index 413520c0f2ab48..53d73d88643a5e 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.cc
@@ -74,6 +74,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/window_util.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 
@@ -1300,6 +1301,7 @@ absl::Status AlgebraicSimplifierVisitor::HandleBitcast(
     VLOG(3) << bitcast->ToString() << " has control predecessors, skipping.";
     return absl::OkStatus();
   }
+
   // If a bitcast feeds a bitcast, make it a single bitcast.
   // Make sure the whole chain of bitcasts is optimized.
   if (bitcast->operand(0)->opcode() == HloOpcode::kBitcast) {
@@ -1325,6 +1327,12 @@ absl::Status AlgebraicSimplifierVisitor::HandleBitcast(
     bitcast = new_bitcast;
   }
 
+  ASSIGN_OR_RETURN(bool transpose_chain_removed,
+                   TryRemovingBitcastOrReshapeTransposeChain(bitcast));
+  if (transpose_chain_removed) {
+    return absl::OkStatus();
+  }
+
   // Check whether we can potentially simplify the bitcast into a broadcast
   // operand.
   if (bitcast->opcode() == HloOpcode::kBitcast &&
@@ -6138,12 +6146,12 @@ absl::Status AlgebraicSimplifierVisitor::HandleRemainder(
 }
 
 absl::StatusOr<bool>
-AlgebraicSimplifierVisitor::TryRemovingReshapeTransposeChain(
-    HloInstruction* reshape) {
-  // Detect a chain of transposes and reshapes that can be replaced with a
-  // nop. All reshapes only add, remove or shuffle degenerate dimensions, such
-  // as [x,y,z]->[x,y,1,z] or its reverse, [x,y,1,z]->[x,1,y,z], etc. And all
-  // the shapes in the chain have at most one degenerate dimension. Then all
+AlgebraicSimplifierVisitor::TryRemovingBitcastOrReshapeTransposeChain(
+    HloInstruction* instruction) {
+  // Detect a chain of transposes and reshapes/bitcasts that can be replaced
+  // with a nop. All reshapes only add, remove or shuffle degenerate dimensions,
+  // such as [x,y,z]->[x,y,1,z] or its reverse, [x,y,1,z]->[x,1,y,z], etc. And
+  // all the shapes in the chain have at most one degenerate dimension. Then all
   // the transposes in the chain effectively permute x,y,z, while the
   // degenerate dimension is ignored. As long as all transposes compose to
   // identity permutation, the chain can be replaced with a nop if the
@@ -6157,12 +6165,19 @@ AlgebraicSimplifierVisitor::TryRemovingReshapeTransposeChain(
                            });
   };
 
-  auto is_valid_reshape = [&](const HloInstruction* reshape) {
-    CHECK(reshape->opcode() == HloOpcode::kReshape);
-    return get_num_of_degenerate_dimensions(reshape->shape()) <= 1 &&
-           get_num_of_degenerate_dimensions(reshape->operand(0)->shape()) <=
-               1 &&
-           reshape->ReshapeMerelyInsertsOrDeletes1SizedDimensions();
+  auto is_valid_reshape_or_bitcast = [&](const HloInstruction* inst) {
+    if (inst->opcode() != HloOpcode::kReshape &&
+        inst->opcode() != HloOpcode::kBitcast) {
+      return false;
+    }
+    if (inst->opcode() == HloOpcode::kBitcast &&
+        !options_.ReshapeIsBitcast(inst->operand(0)->shape(), inst->shape())) {
+      return false;
+    }
+    return get_num_of_degenerate_dimensions(inst->shape()) <= 1 &&
+           get_num_of_degenerate_dimensions(inst->operand(0)->shape()) <= 1 &&
+           ShapeUtil::InsertedOrDeleted1SizedDimensions(
+               inst->operand(0)->shape(), inst->shape());
   };
 
   auto get_degenerate_dimension = [](const Shape& shape) {
@@ -6204,19 +6219,34 @@ AlgebraicSimplifierVisitor::TryRemovingReshapeTransposeChain(
     return DimensionVector(permutation.begin(), permutation.end());
   };
 
-  if (!options_.is_layout_sensitive() && is_valid_reshape(reshape)) {
-    int64_t effective_size = ShapeUtil::TrueNumDimensions(reshape->shape());
+  bool is_valid_start = false;
+  if (instruction->opcode() == HloOpcode::kTranspose) {
+    is_valid_start = !IsIdentityPermutation(instruction->dimensions());
+  } else {
+    is_valid_start = is_valid_reshape_or_bitcast(instruction);
+  }
+
+  if (is_valid_start) {
+    int64_t effective_size = ShapeUtil::TrueNumDimensions(instruction->shape());
     std::vector<int64_t> permutation(effective_size);
     // Init with identity permutation.
     std::iota(permutation.begin(), permutation.end(), 0);
 
+    if (instruction->opcode() == HloOpcode::kTranspose) {
+      auto effective_perm = get_effective_permutation(
+          instruction->dimensions(), instruction->operand(0)->shape(),
+          instruction->shape());
+      permutation.assign(effective_perm.begin(), effective_perm.end());
+    }
+
     bool is_nop = true;
     HloInstruction* starting_instruction = nullptr;
-    HloInstruction* current = reshape->mutable_operand(0);
+    HloInstruction* current = instruction->mutable_operand(0);
     while (current->opcode() == HloOpcode::kReshape ||
-           current->opcode() == HloOpcode::kTranspose) {
-      if (current->opcode() == HloOpcode::kReshape &&
-          !is_valid_reshape(current)) {
+           current->opcode() == HloOpcode::kTranspose ||
+           current->opcode() == HloOpcode::kBitcast) {
+      if (current->opcode() != HloOpcode::kTranspose &&
+          !is_valid_reshape_or_bitcast(current)) {
         is_nop = false;
         break;
       }
@@ -6238,13 +6268,38 @@ AlgebraicSimplifierVisitor::TryRemovingReshapeTransposeChain(
     }
 
     if (is_nop && starting_instruction != nullptr &&
-        Shape::Equal().IgnoreLayout()(
-            reshape->shape(), starting_instruction->operand(0)->shape()) &&
         IsIdentityPermutation(permutation)) {
-      VLOG(2) << "Deleting reshape-transpose chain: " << reshape->ToString();
-      TF_RETURN_IF_ERROR(ReplaceInstruction(
-          reshape, starting_instruction->mutable_operand(0)));
-      return true;
+      HloInstruction* new_operand = starting_instruction->mutable_operand(0);
+      bool replace_success = false;
+      if (options_.is_layout_sensitive()) {
+        if (ShapeUtil::Equal(instruction->shape(), new_operand->shape())) {
+          RETURN_IF_ERROR(ReplaceInstruction(instruction, new_operand));
+          replace_success = true;
+        } else if (options_.ReshapeIsBitcast(new_operand->shape(),
+                                             instruction->shape())) {
+          // If ReshapeIsBitcast is true, the shapes are guaranteed to have the
+          // same in-memory representation, including padding and tiling
+          // effects. Therefore, their byte sizes must be equal.
+          DCHECK_EQ(ShapeUtil::ByteSizeOf(new_operand->shape()),
+                    ShapeUtil::ByteSizeOf(instruction->shape()))
+              << "ReshapeIsBitcast is true, but byte sizes differ.";
+          RETURN_IF_ERROR(ReplaceWithNewInstruction(
+              instruction, HloInstruction::CreateBitcast(instruction->shape(),
+                                                         new_operand)));
+          replace_success = true;
+        }
+      } else {  // Non-layout sensitive.
+        if (Shape::Equal().IgnoreLayout()(instruction->shape(),
+                                          new_operand->shape())) {
+          RETURN_IF_ERROR(ReplaceInstruction(instruction, new_operand));
+          replace_success = true;
+        }
+      }
+      if (replace_success) {
+        VLOG(2) << "Deleting bitcast-or-reshape-transpose chain: "
+                << instruction->ToString();
+        return true;
+      }
     }
   }
   return false;
@@ -6287,8 +6342,8 @@ absl::Status AlgebraicSimplifierVisitor::HandleReshape(
     return ReplaceInstruction(reshape, operand);
   }
 
-  TF_ASSIGN_OR_RETURN(bool reshape_transpose_chain_removed,
-                      TryRemovingReshapeTransposeChain(reshape));
+  ASSIGN_OR_RETURN(bool reshape_transpose_chain_removed,
+                   TryRemovingBitcastOrReshapeTransposeChain(reshape));
   if (reshape_transpose_chain_removed) {
     return absl::OkStatus();
   }
@@ -9058,6 +9113,12 @@ absl::Status AlgebraicSimplifierVisitor::HandleTranspose(
                                            transpose->dimensions())));
   }
 
+  ASSIGN_OR_RETURN(bool chain_removed,
+                   TryRemovingBitcastOrReshapeTransposeChain(transpose));
+  if (chain_removed) {
+    return absl::OkStatus();
+  }
+
   const auto consider_swapping_dot_operands = [&](HloInstruction* dot) {
     // If the RHS is a parameter-like, and the LHS is not, do not swap the
     // operands, since the dot operands are in a convenient order for layout
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.h b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.h
index da0eb0f8612d7c..87848713101343 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.h
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier.h
@@ -855,10 +855,10 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor {
                                         bool multi_output_reduce,
                                         HloReduceInstruction* reduce);
 
-  // Detects a chain of transposes and reshapes that can be replaced with a
-  // nop.
-  absl::StatusOr<bool> TryRemovingReshapeTransposeChain(
-      HloInstruction* reshape);
+  // Detects a chain of transposes and reshapes (or bitcasts) that can be
+  // replaced with a nop.
+  absl::StatusOr<bool> TryRemovingBitcastOrReshapeTransposeChain(
+      HloInstruction* instruction);
 
   // Helper function for HandleReduce. Reorders reduce dot
   // to a dot reduce. reduce(dot(A, B)) to dot(A, reduce(B))
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc
index 34bea124e05379..377ec2c76a81f8 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/algebraic_simplifier_test.cc
@@ -338,13 +338,91 @@ TEST_F(AlgebraicSimplifierTest, EliminateReshapeTransposeChain) {
   ROOT %reshape.96336 = f32[224,4,1,4096] reshape(%transpose.8665)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
   AlgebraicSimplifier(default_options_).Run(m.get()).value();
   VLOG(2) << "Module after: " << m->ToString();
   EXPECT_THAT(m->entry_computation()->root_instruction(),
               GmockMatch(m::Parameter(0)));
 }
 
+TEST_F(AlgebraicSimplifierTest, EliminateBitcastTransposeChain) {
+  constexpr absl::string_view kModuleStr = R"(
+    HloModule m
+    test {
+      param = f32[10, 20] parameter(0)
+      transpose = f32[20, 10] transpose(param), dimensions={1, 0}
+      bitcast = f32[1, 20, 10] reshape(transpose)
+      transpose2 = f32[1, 10, 20] transpose(bitcast), dimensions={0, 2, 1}
+      ROOT bitcast2 = f32[10, 20] reshape(transpose2)
+    }
+  )";
+  ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  AlgebraicSimplifierOptions options = default_options_;
+  options.set_is_layout_sensitive(false);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).value());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Parameter(0)));
+}
+
+TEST_F(AlgebraicSimplifierTest, EliminateBitcastTransposeChain_DifferentTypes) {
+  constexpr absl::string_view kModuleStr = R"(
+    HloModule m
+    test {
+      param = f32[10, 20] parameter(0)
+      transpose = f32[20, 10] transpose(param), dimensions={1, 0}
+      bitcast = s32[1, 20, 10] bitcast(transpose)
+      transpose2 = s32[1, 10, 20] transpose(bitcast), dimensions={0, 2, 1}
+      ROOT bitcast2 = f32[10, 20] bitcast(transpose2)
+    }
+  )";
+  ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  AlgebraicSimplifierOptions options = default_options_;
+  options.set_is_layout_sensitive(false);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_FALSE(simplifier.Run(m.get()).value());
+}
+
+TEST_F(AlgebraicSimplifierTest, BitcastTransposeChainReshapeIsBitcast) {
+  const std::string hlo_string = R"(
+    HloModule m
+    ENTRY test {
+      p0 = bf16[512,16,3072]{2,1,0} parameter(0)
+      transpose.3 = bf16[512,3072,16]{2,1,0} transpose(p0), dimensions={0,2,1}
+      bitcast = bf16[1,512,3072,16]{3,2,1,0} bitcast(transpose.3)
+      transpose.2 = bf16[1,512,16,3072]{3,2,1,0} transpose(bitcast), dimensions={0,1,3,2}
+      ROOT bitcast.1 = bf16[8192,3072]{1,0} bitcast(transpose.2)
+    }
+  )";
+  ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  AlgebraicSimplifierOptions options = default_options_;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).value());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
+}
+
+TEST_F(AlgebraicSimplifierTest, LayoutSensitive_EqualShapes_StartTranspose) {
+  const std::string hlo_string = R"(
+    HloModule m
+    ENTRY test {
+      p0 = f32[2,3]{1,0} parameter(0)
+      t1 = f32[3,2]{1,0} transpose(p0), dimensions={1,0}
+      b1 = f32[1,3,2]{2,1,0} bitcast(t1)
+      t2 = f32[1,2,3]{2,1,0} transpose(b1), dimensions={0,2,1}
+      ROOT b2 = f32[2,3]{1,0} bitcast(t2)
+    }
+  )";
+  ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+  AlgebraicSimplifierOptions options = default_options_;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  ASSERT_TRUE(simplifier.Run(m.get()).value());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Parameter(0)));
+}
+
 // Reshape-transpose chain is not eliminated since effective transposes
 // do not compose to identity permutation.
 TEST_F(AlgebraicSimplifierTest, NotEliminateReshapeTransposeChain) {
@@ -13090,5 +13168,33 @@ TEST_F(AlgebraicSimplifierTest, ConditionalWithConvert) {
                   )));
 }
 
+TEST_F(AlgebraicSimplifierTest,
+       BitcastTransposeChain_InvalidBitcastLayoutChange) {
+  // This test ensures that a bitcast which effectively acts as a transpose (due
+  // to layout change) prevents the removal of the transpose chain.
+  //
+  // Buggy behavior: Simplifier sees Transpose(1,0) ... Transpose(1,0), thinks
+  // they cancel out, ignores the Bitcast's layout effect, and simplifies to p0.
+  const std::string hlo_string = R"(
+    HloModule m
+    ENTRY test {
+      p0 = f32[10,10]{0,1} parameter(0)
+      t1 = f32[10,10]{0,1} transpose(p0), dimensions={1,0}
+      b1 = f32[10,10]{1,0} bitcast(t1)
+      ROOT t2 = f32[10,10]{0,1} transpose(b1), dimensions={1,0}
+    }
+  )";
+  ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options = default_options_;
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
+  simplifier.Run(m.get()).value();
+
+  // Ensure it didn't incorrectly simplify to the parameter.
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              Not(GmockMatch(m::Parameter(0))));
+}
+
 }  // namespace
 }  // namespace xla

From 69f8ca2e2887556bd18098d7f45d9dbf4ddf864b Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 18 Dec 2025 02:13:37 -0800
Subject: [PATCH 508/753] PR #35479: Add clangd files and directories to
 .gitignore

Imported from GitHub PR https://github.com/openxla/xla/pull/35479

Add clangd files and directories to .gitignore
Copybara import of the project:

--
2999b064c6b756dfc0355d863b863aff1bdea2fa by Eugene Zhulenev <ezv@amazon.com>:

Add clangd files and directories to .gitignore

Add clangd files and directories to .gitignore

Merging this change closes #35479

PiperOrigin-RevId: 846156873
---
 third_party/xla/.gitignore | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/.gitignore b/third_party/xla/.gitignore
index 619ec239a7809c..734c302636dbb4 100644
--- a/third_party/xla/.gitignore
+++ b/third_party/xla/.gitignore
@@ -28,3 +28,8 @@ tools/python_bin_path.sh
 *.VC.opendb
 *.suo
 *.user
+
+# Ignore clangd files and directories: https://openxla.org/xla/lsp
+.cache
+compile_commands.json
+external

From fe216f0f45b330cd2a9cdb1b59621fd15ba085db Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 02:44:57 -0800
Subject: [PATCH 509/753] Automated Code Change

PiperOrigin-RevId: 846167560
---
 tensorflow/core/tfrt/utils/graph_partition.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/tfrt/utils/graph_partition.cc b/tensorflow/core/tfrt/utils/graph_partition.cc
index 08f5dce6d5734d..ddf50ab8c7ef4d 100644
--- a/tensorflow/core/tfrt/utils/graph_partition.cc
+++ b/tensorflow/core/tfrt/utils/graph_partition.cc
@@ -436,7 +436,7 @@ absl::StatusOr<std::unique_ptr<Graph>> InsertTransferOps(
   auto new_graph = std::make_unique<Graph>(graph->flib_def());
   FunctionDefLibrary flib = graph->flib_def().ToProto();
 
-  std::unordered_map<string, std::unique_ptr<Graph>> partitions;
+  std::unordered_map<std::string, std::unique_ptr<Graph>> partitions;
   TF_RETURN_IF_ERROR(
       PartitionFunctionGraph(device_set, std::move(graph), &partitions));
 
@@ -447,7 +447,7 @@ absl::StatusOr<std::unique_ptr<Graph>> InsertTransferOps(
   std::map<std::string, OutputNodeInfo> device_to_output_info_map;
 
   for (auto& partition : partitions) {
-    const string& device = partition.first;
+    const std::string& device = partition.first;
     VLOG(1) << "Process the partitioin on device: " << device;
 
     Graph* subgraph = partition.second.get();

From 90f6a0227660e4cf7997607f71c033ec36a5234c Mon Sep 17 00:00:00 2001
From: Greg Olechwierowicz <olechwierowicz@google.com>
Date: Thu, 18 Dec 2025 02:48:01 -0800
Subject: [PATCH 510/753] [XLA:GPU] Add method for printing unsatisfied
 Constraints for ConstraintExpression.

Helps with narrowing down which constraints are unsat. There can be many constraints (e.g. WGMMA in Mosaic), and while debugging it's unclear which one is violated at a glance.

As a follow up, we can also introduce names to each Constraint to make the identification even easier.

PiperOrigin-RevId: 846168559
---
 third_party/xla/xla/codegen/tiling/BUILD      |  1 -
 .../codegen/tiling/constraint_expression.cc   | 31 ++++++++++++++++--
 .../codegen/tiling/constraint_expression.h    |  5 +++
 .../tiling/constraint_expression_test.cc      | 32 ++++++++++++++++++-
 4 files changed, 64 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/codegen/tiling/BUILD b/third_party/xla/xla/codegen/tiling/BUILD
index e974b0c17d2eb6..612dee3a700cb2 100644
--- a/third_party/xla/xla/codegen/tiling/BUILD
+++ b/third_party/xla/xla/codegen/tiling/BUILD
@@ -208,7 +208,6 @@ xla_cc_test(
     srcs = ["constraint_expression_test.cc"],
     deps = [
         ":constraint_expression",
-        "//xla/hlo/analysis:indexing_analysis",
         "//xla/hlo/analysis:indexing_test_utils",
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/types:span",
diff --git a/third_party/xla/xla/codegen/tiling/constraint_expression.cc b/third_party/xla/xla/codegen/tiling/constraint_expression.cc
index 1c3a558970d2c9..8226932ba80bbb 100644
--- a/third_party/xla/xla/codegen/tiling/constraint_expression.cc
+++ b/third_party/xla/xla/codegen/tiling/constraint_expression.cc
@@ -184,6 +184,31 @@ bool ConstraintExpression::IsSatisfiedBy(
       });
 }
 
+void ConstraintExpression::PrintUnsatisfiedConstraints(
+    absl::Span<const int64_t> dim_values, std::ostream& out) const {
+  auto is_conjunction_satisfied = [&](const auto& conjunction) {
+    return absl::c_all_of(conjunction, [&](const Constraint& constraint) {
+      int64_t value = EvaluateAffineExpr(constraint.expr, dim_values);
+      return constraint.interval.Contains(value);
+    });
+  };
+
+  for (const auto& [i, conjunction] :
+       llvm::enumerate(disjoint_conjoint_constraints_)) {
+    if (is_conjunction_satisfied(conjunction)) {
+      continue;
+    }
+    out << "Unsatisfied conjunction: #" << i << "\n";
+    for (const Constraint& constraint : conjunction) {
+      int64_t value = EvaluateAffineExpr(constraint.expr, dim_values);
+      if (!constraint.interval.Contains(value)) {
+        out << " -- " << constraint.expr << " in "
+            << constraint.interval.ToString() << ". Value: " << value << "\n";
+      }
+    }
+  }
+}
+
 std::string ConstraintExpression::ToString() const {
   std::stringstream ss;
   Print(ss);
@@ -204,10 +229,10 @@ void ConstraintExpression::Print(std::ostream& out) const {
   // order and to get deterministic output.
   std::vector<std::string> conjunction_strings;
   conjunction_strings.reserve(disjoint_conjoint_constraints_.size());
-  for (const auto& disjunction : disjoint_conjoint_constraints_) {
+  for (const auto& conjunction : disjoint_conjoint_constraints_) {
     std::vector<std::string> constraint_strings;
-    constraint_strings.reserve(disjunction.size());
-    for (const auto& [expr, interval] : disjunction) {
+    constraint_strings.reserve(conjunction.size());
+    for (const auto& [expr, interval] : conjunction) {
       constraint_strings.push_back(
           absl::StrCat(xla::ToString(expr), " in ", interval.ToString()));
     }
diff --git a/third_party/xla/xla/codegen/tiling/constraint_expression.h b/third_party/xla/xla/codegen/tiling/constraint_expression.h
index 553c44ee9b1a7b..86c180695b21c1 100644
--- a/third_party/xla/xla/codegen/tiling/constraint_expression.h
+++ b/third_party/xla/xla/codegen/tiling/constraint_expression.h
@@ -99,6 +99,11 @@ class ConstraintExpression {
   // constraints.
   bool IsSatisfiedBy(absl::Span<const int64_t> dim_values) const;
 
+  // Prints unsatisfied constraints which are not satisfied by the provided
+  // `dim_values`.
+  void PrintUnsatisfiedConstraints(absl::Span<const int64_t> dim_values,
+                                   std::ostream& out) const;
+
   std::string ToString() const;
 
   void Print(std::ostream& out) const;
diff --git a/third_party/xla/xla/codegen/tiling/constraint_expression_test.cc b/third_party/xla/xla/codegen/tiling/constraint_expression_test.cc
index 72152b9cee4444..d671d6aad7f516 100644
--- a/third_party/xla/xla/codegen/tiling/constraint_expression_test.cc
+++ b/third_party/xla/xla/codegen/tiling/constraint_expression_test.cc
@@ -16,19 +16,21 @@ limitations under the License.
 #include "xla/codegen/tiling/constraint_expression.h"
 
 #include <cstdint>
+#include <sstream>
 #include <string>
 #include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/types/span.h"
-#include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_test_utils.h"
 
 namespace xla {
 namespace {
 
 using ::testing::ExplainMatchResult;
+using ::testing::HasSubstr;
+using ::testing::Not;
 
 using Constraint = ConstraintExpression::Constraint;
 
@@ -96,6 +98,34 @@ TEST_F(ConstraintExpressionTest, PrettyPrintingTest) {
                                "d0 in [1, 2] && d1 in [3, 4] || d2 in [5, 6]"));
 }
 
+TEST_F(ConstraintExpressionTest, PrintUnsatisfiedConstraints) {
+  Constraint c0 = GetConstraint("d0 mod 6", 0, 0);
+  Constraint c1 = GetConstraint("d1 mod 8", 0, 0);
+  Constraint c2 = GetConstraint("d0 mod 13", 0, 0);
+  ConstraintExpression constraints = (c0 && c1) || (c1 && c2);
+
+  // (c0 && c1) is satisfied.
+  std::vector<int64_t> lhs_satisfied({6, 8});
+  ASSERT_TRUE(constraints.IsSatisfiedBy(lhs_satisfied));
+  std::stringstream ss;
+  constraints.PrintUnsatisfiedConstraints(lhs_satisfied, ss);
+
+  EXPECT_THAT(ss.str(), HasSubstr("Unsatisfied conjunction: #1"));
+  EXPECT_THAT(ss.str(), HasSubstr("d0 mod 13 in [0, 0]. Value: 6"));
+  EXPECT_THAT(ss.str(), Not(HasSubstr("d1 mod 8 in [0, 0]")));
+  EXPECT_THAT(ss.str(), Not(HasSubstr("d0 mod 6 in [0, 0]")));
+
+  // (c1 && c2) is satisfied.
+  std::vector<int64_t> rhs_satisfied({13, 8});
+  ASSERT_TRUE(constraints.IsSatisfiedBy(rhs_satisfied));
+  ss.str("");
+  constraints.PrintUnsatisfiedConstraints(rhs_satisfied, ss);
+  EXPECT_THAT(ss.str(), HasSubstr("Unsatisfied conjunction: #0"));
+  EXPECT_THAT(ss.str(), HasSubstr("d0 mod 6 in [0, 0]. Value: 1"));
+  EXPECT_THAT(ss.str(), Not(HasSubstr("d0 mod 13 in [0, 0]")));
+  EXPECT_THAT(ss.str(), Not(HasSubstr("d1 mod 8 in [0, 0]")));
+}
+
 TEST_F(ConstraintExpressionTest,
        ConjunctionOfConstraintsOnTheSameExpressionAreIntersected) {
   ConstraintExpression constraints{GetConstraint("d0", 0, 5)};

From 35808079a20f1473f08b1278ae521256e2f651dc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 02:58:06 -0800
Subject: [PATCH 511/753] Automated Code Change

PiperOrigin-RevId: 846171859
---
 third_party/xla/xla/hlo/builder/lib/BUILD               | 1 +
 third_party/xla/xla/hlo/builder/lib/comparators_test.cc | 1 +
 third_party/xla/xla/hlo/builder/lib/math.h              | 1 +
 3 files changed, 3 insertions(+)

diff --git a/third_party/xla/xla/hlo/builder/lib/BUILD b/third_party/xla/xla/hlo/builder/lib/BUILD
index 997525cec41e5b..597ac38c912fe3 100644
--- a/third_party/xla/xla/hlo/builder/lib/BUILD
+++ b/third_party/xla/xla/hlo/builder/lib/BUILD
@@ -96,6 +96,7 @@ xla_test(
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
+        "@com_google_protobuf//:protobuf_lite",
         "@local_tsl//tsl/platform:protobuf",
     ],
 )
diff --git a/third_party/xla/xla/hlo/builder/lib/comparators_test.cc b/third_party/xla/xla/hlo/builder/lib/comparators_test.cc
index 974ae4899046b9..523c11b479d614 100644
--- a/third_party/xla/xla/hlo/builder/lib/comparators_test.cc
+++ b/third_party/xla/xla/hlo/builder/lib/comparators_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
+#include "google/protobuf/repeated_ptr_field.h"
 #include "xla/hlo/builder/lib/constants.h"
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/hlo/builder/xla_computation.h"
diff --git a/third_party/xla/xla/hlo/builder/lib/math.h b/third_party/xla/xla/hlo/builder/lib/math.h
index 921e7cd3f4a0f2..4614c0442aafac 100644
--- a/third_party/xla/xla/hlo/builder/lib/math.h
+++ b/third_party/xla/xla/hlo/builder/lib/math.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <optional>
 
 #include "xla/hlo/builder/xla_builder.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 

From 9024ef1e4ccb9ad859154ca641cf16fe9595162f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 03:03:08 -0800
Subject: [PATCH 512/753] Automated Code Change

PiperOrigin-RevId: 846173555
---
 third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h b/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h
index d91f41dcec389a..a0d11f61c53d42 100644
--- a/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h
+++ b/third_party/xla/xla/backends/cpu/runtime/buffer_allocations.h
@@ -86,7 +86,7 @@ inline BufferAllocations::BufferAllocations(
       buffers_data_(buffers_.data()),
       num_buffers_(buffers_.size()) {
   for (size_t i = 0; i < buffers.size(); ++i) {
-    buffers_[i] = buffers[i].AsDeviceMemoryBase();
+    buffers_[i] = buffers[i].AsDeviceAddress();
   }
 }
 

From 408bf09796590bc66233afff288bf926e2736a9d Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Thu, 18 Dec 2025 03:59:57 -0800
Subject: [PATCH 513/753] [XLA:GPU]Disable TransposeDimensionGrouper pass and
 replace it with OTF normalization in emitters

0) Fix a bug (?) in normalization util when normalized dim contains a single dimension
1) Perform normalization OTF for Transpose emitter selection
2) Use normalized shape for unrolling decision in kLoop emitter
3) Use normalized shape to detect slow transposes in triton fusion rewriter

PiperOrigin-RevId: 846191206
---
 .../codegen/triton/triton_gemm_fusion_test.cc | 21 +++--
 .../xla/xla/service/gpu/gpu_compiler.cc       |  3 -
 .../xla/xla/service/gpu/gpu_compiler_test.cc  | 28 -------
 .../gpu_compiler_test_autotune_db.textproto   |  4 +-
 .../xla/xla/service/gpu/gpu_fusible.cc        | 19 ++++-
 .../xla/xla/service/gpu/gpu_fusible_test.cc   | 44 +++++++---
 .../xla/xla/service/gpu/ir_emission_utils.cc  | 42 +++++-----
 .../xla/service/gpu/ir_emission_utils_test.cc | 81 ++++++++++++++-----
 .../gpu/model/coalescing_analysis_test.cc     | 18 ++---
 .../xla/xla/service/gpu/transforms/BUILD      |  2 +-
 .../transforms/cudnn_norm_rewriter_test.cc    | 30 +++----
 .../transforms/fusion_block_level_rewriter.cc | 27 ++++---
 .../fusion_block_level_rewriter_test.cc       | 32 ++++++++
 .../gpu/transforms/layout_assignment_a100.hlo |  7 +-
 .../gpu/transforms/layout_assignment_h100.hlo |  7 +-
 .../gpu/transforms/layout_assignment_v100.hlo |  7 +-
 third_party/xla/xla/shape_util.cc             |  1 +
 third_party/xla/xla/shape_util_test.cc        | 15 +++-
 18 files changed, 247 insertions(+), 141 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
index 52e84704781010..127b0d60a72832 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
@@ -562,7 +562,10 @@ ENTRY e {
                                ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 
-TEST_F(TritonGemmTest, SplitLhsNoncontractingTransposeRhs) {
+// TODO: b/422676780 - Enable the tests once the indexing maps-based tiling is
+// deprecated. The test is disabled after we remove TransposeDimensionGrouper
+// pass, because the infra currently requires grouping of adjacent dimensions.
+TEST_F(TritonGemmTest, DISABLED_SplitLhsNoncontractingTransposeRhs) {
   constexpr absl::string_view kHloText = R"(
 HloModule t
 
@@ -587,7 +590,10 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/0, /*arel=*/0}));
 }
 
-TEST_F(TritonGemmTest, SplitLhsNoncontracting) {
+// TODO: b/422676780 - Enable the tests once the indexing maps-based tiling is
+// deprecated. The test is disabled after we remove TransposeDimensionGrouper
+// pass, because the infra currently requires grouping of adjacent dimensions.
+TEST_F(TritonGemmTest, DISABLED_SplitLhsNoncontracting) {
   constexpr absl::string_view kHloText = R"(
 ENTRY e {
   p0 = f32[72,72] parameter(0)
@@ -1776,12 +1782,17 @@ ENTRY e {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           GetOptimizedModule(kHloText));
+  const HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_THAT(
-      module->entry_computation()->root_instruction(),
-      GmockMatch(m::Bitcast(
+      root,
+      GmockMatch(
           m::Fusion(m::Fusion(m::Parameter(), m::Parameter())
                         .WithFusionKind(HloInstruction::FusionKind::kCustom))
-              .WithFusionKind(HloInstruction::FusionKind::kInput))));
+              .WithFusionKind(HloInstruction::FusionKind::kInput)));
+
+  const HloFusionInstruction* root_fusion = Cast<HloFusionInstruction>(root);
+  EXPECT_EQ(root_fusion->fused_expression_root()->opcode(),
+            HloOpcode::kTranspose);
 
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 36471b34d7a2ac..45185f41fab4d0 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1751,7 +1751,6 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
       // introduced the nested fusions. We also want to keep it close to the
       // gemm rewriter to avoid the possibility of new passes to rewrite the
       // transpose.
-      pipeline.AddPass<TransposeDimensionGrouper>();
       pipeline.AddPass<GemmFusion>(gpu_version);
       pipeline.AddPass<GemmFusionSwapOperands>();
     } else if (cuda_cc != nullptr &&
@@ -1779,8 +1778,6 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     // also have unsorted update_window_dims.
     pipeline.AddPass<ScatterSimplifier>();
     pipeline.AddPass<BroadcastCanonicalizer>();
-    // BroadcastCanonicalizer can create transposes.
-    pipeline.AddPass<TransposeDimensionGrouper>();
     pipeline.AddPass<ReductionDegenerateDimRemover>();
     pipeline.AddPass<ReductionLayoutNormalizer>();
     // Run Softmax fusion after layout normalization. We expect a default layout
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 855267bd792c9a..33abfb08f9faec 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -1855,34 +1855,6 @@ TEST_F(PassOrderTest, NestGemmFusionRunsAfterHoistFusedBitcasts) {
   VerifyPassOrder("hoist-fused-bitcasts", "nest_gemm_fusion");
 }
 
-TEST_F(PassOrderTest, TransposeDimensionGrouperRunsBeforeGemmRewriter) {
-  if (!get_cuda_cc().IsAtLeastAmpere()) {
-    GTEST_SKIP() << "triton-gemm-rewriter requires at least Ampere to run.";
-  }
-  if (!optimized_module_) {
-    CompileModule(GetModuleConfigForTest());
-  }
-  // DebugOptions options = GetDebugOptionsForTest();
-  // options.set_xla_gpu_enable_triton_gemm(true);
-  // SetDebugOptions(options);
-  // Verify that transpose-dimension-grouper runs immediately before
-  // triton-gemm-rewriter. We want to keep them close together to avoid the
-  // possibility of new passes to rewrite the transpose and make it
-  // not compatible with the generic triton emitter.
-  // Simple VerifyPassOrder does not work here as we want to check that passes
-  // are run next to each other, also transpose-dimension-grouper runs one more
-  // time after the gemm rewriter.
-  CHECK(optimized_module_);
-  std::string previous_pass_name;
-  for (const HloPassMetadata& pass_metadata :
-       optimized_module_->metadata().proto().pass_metadata()) {
-    if (pass_metadata.pass_name() == "triton-gemm-rewriter") {
-      EXPECT_EQ(previous_pass_name, "transpose-dimension-grouper");
-    }
-    previous_pass_name = pass_metadata.pass_name();
-  }
-}
-
 TEST_F(PassOrderTest,
        ReducePrecisionIsRemovedAfterAllCallsToSimplifyFPConversions) {
   // Because of an issue with JAX remat and `SimplifyFPConversions` (see PR:
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto b/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
index 67d67b9594af5f..e5b7f65cab6f69 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
@@ -63,7 +63,7 @@ results {
 }
 results {
   device: "CUDA: 9.0, Cores: 132, GPU clock: 1.98 GHz, Memory bandwidth: 3352 GB/s, L2 cache: 50 MB, DNN version: 1.2.3"
-  hlo: "{\n  tmp_0 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[] constant({...})\n  tmp_2 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_1), dimensions={}\n  tmp_3 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_0, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_2)\n  tmp_4 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_3)\n  tmp_5 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[4,32,1024,1024]{3,2,1,0} tmp_4)\n  tmp_6 = bf16[128,1024,1024]{2,1,0} transpose(bf16[128,1024,1024]{2,1,0} tmp_5), dimensions={0,2,1}\n  tmp_7 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[128,1024,1024]{2,1,0} tmp_6)\n  tmp_8 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[4,32,1024,1024]{3,2,1,0} tmp_7)\n  tmp_9 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(1)\n  tmp_10 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_9)\n  tmp_11 = bf16[128,1024,1024]{2,1,0} dot(bf16[128,1024,1024]{2,1,0} tmp_8, bf16[128,1024,1024]{2,1,0} tmp_10), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}\n  ROOT tmp_12 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[128,1024,1024]{2,1,0} tmp_11)\n}"
+  hlo: "{\n  tmp_0 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[] constant({...})\n  tmp_2 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_1), dimensions={}\n  tmp_3 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_0, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_2)\n  tmp_4 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_3)\n  tmp_5 = bf16[4,32,1024,1024]{3,2,1,0} transpose(bf16[4,32,1024,1024]{3,2,1,0} tmp_4), dimensions={0,1,3,2}\n  tmp_6 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[4,32,1024,1024]{3,2,1,0} tmp_5)\n  tmp_7 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(1)\n  tmp_8 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_7)\n  tmp_9 = bf16[128,1024,1024]{2,1,0} dot(bf16[128,1024,1024]{2,1,0} tmp_6, bf16[128,1024,1024]{2,1,0} tmp_8), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}\n  ROOT tmp_10 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[128,1024,1024]{2,1,0} tmp_9)\n}"
   result {
     gemm {
       algorithm: -1
@@ -183,7 +183,7 @@ results {
 }
 results {
   device: "CUDA: 9.0, Cores: 132, GPU clock: 1.98 GHz, Memory bandwidth: 3352 GB/s, L2 cache: 50 MB, DNN version: 1.2.3"
-  hlo: "{\n  tmp_0 = bf16[3,32,1024,4,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[3,32768,4,1024]{3,2,1,0} bitcast(bf16[3,32,1024,4,1024]{4,3,2,1,0} tmp_0)\n  tmp_2 = bf16[3,4,32768,1024]{3,2,1,0} transpose(bf16[3,32768,4,1024]{3,2,1,0} tmp_1), dimensions={0,2,1,3}\n  tmp_3 = bf16[3,4,32,1024,1024]{4,3,2,1,0} bitcast(bf16[3,4,32768,1024]{3,2,1,0} tmp_2)\n  tmp_4 = bf16[1,3,32,1024]{3,2,1,0} parameter(1)\n  tmp_5 = bf16[3,32,1024]{2,1,0} bitcast(bf16[1,3,32,1024]{3,2,1,0} tmp_4)\n  tmp_6 = bf16[3,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[3,32,1024]{2,1,0} tmp_5), dimensions={0,2,3}\n  tmp_7 = bf16[3,4,32,1024,1024]{4,3,2,1,0} add(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_3, bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_6)\n  tmp_8 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_7), slice={[1:2], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_9 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_7), slice={[0:1], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_10 = bf16[] constant({...})\n  tmp_11 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_10), dimensions={}\n  tmp_12 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_9, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_11)\n  tmp_13 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_12)\n  tmp_14 = bf16[128,1024,1024]{2,1,0} transpose(bf16[128,1024,1024]{2,1,0} tmp_13), dimensions={0,2,1}\n  ROOT tmp_15 = (bf16[1,4,32,1024,1024]{4,3,2,1,0}, bf16[128,1024,1024]{2,1,0}) tuple(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_8, bf16[128,1024,1024]{2,1,0} tmp_14)\n}"
+  hlo: "{\n  tmp_0 = bf16[3,32,1024,4,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[3,4,32,1024,1024]{4,3,2,1,0} transpose(bf16[3,32,1024,4,1024]{4,3,2,1,0} tmp_0), dimensions={0,3,1,2,4}\n  tmp_2 = bf16[1,3,32,1024]{3,2,1,0} parameter(1)\n  tmp_3 = bf16[3,32,1024]{2,1,0} bitcast(bf16[1,3,32,1024]{3,2,1,0} tmp_2)\n  tmp_4 = bf16[3,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[3,32,1024]{2,1,0} tmp_3), dimensions={0,2,3}\n  tmp_5 = bf16[3,4,32,1024,1024]{4,3,2,1,0} add(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_1, bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_4)\n  tmp_6 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_5), slice={[1:2], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_7 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_5), slice={[0:1], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_8 = bf16[] constant({...})\n  tmp_9 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_8), dimensions={}\n  tmp_10 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_7, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_9)\n  tmp_11 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_10)\n  tmp_12 = bf16[4,32,1024,1024]{3,2,1,0} transpose(bf16[4,32,1024,1024]{3,2,1,0} tmp_11), dimensions={0,1,3,2}\n  ROOT tmp_13 = (bf16[1,4,32,1024,1024]{4,3,2,1,0}, bf16[4,32,1024,1024]{3,2,1,0}) tuple(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_6, bf16[4,32,1024,1024]{3,2,1,0} tmp_12)\n}"
   result {
     other {
       name: "NativeEmitter"
diff --git a/third_party/xla/xla/service/gpu/gpu_fusible.cc b/third_party/xla/xla/service/gpu/gpu_fusible.cc
index 5f665644c5719e..eab17134475244 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible.cc
+++ b/third_party/xla/xla/service/gpu/gpu_fusible.cc
@@ -64,9 +64,22 @@ bool ContainsTransposeWithSmallMostMinorDim(const HloFusionAdaptor& fusion,
       return false;
     }
     const HloInstruction& transpose = instr.instruction();
-    // We can assume that TransposeDimensionGrouper pass has run, so no need
-    // to try to combine adjacent dimensions.
-    return transpose.shape().dimensions().back() < unroll_factor;
+    // The kLoop emitter operates on the original transpose, but it handles the
+    // index calculation. The critical factor for performance (coalescing) is
+    // the size of the contiguous memory block being accessed in the minor
+    // dimension. Normalization reveals this true physical dimension size by
+    // merging adjacent logical dimensions. If this normalized dimension is
+    // large enough, the unrolled accesses will be coalesced, justifying the
+    // unroll factor.
+    absl::InlinedVector<int64_t, 3> permutation;
+    auto normalized_dims_or = ShapeUtil::GetNormalizedLogicalTransposeShape(
+        transpose.operand(0)->shape(), transpose.shape(),
+        transpose.dimensions(), permutation);
+    if (normalized_dims_or.ok()) {
+      return normalized_dims_or.value().back() < unroll_factor;
+    } else {
+      return transpose.shape().dimensions().back() < unroll_factor;
+    }
   });
 }
 
diff --git a/third_party/xla/xla/service/gpu/gpu_fusible_test.cc b/third_party/xla/xla/service/gpu/gpu_fusible_test.cc
index d1ad49adfd1278..b19b554d0f4a3a 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_fusible_test.cc
@@ -603,9 +603,7 @@ TEST_F(GpuFusibleTest, FusionHeroesAreCompatible_TransposeFusionNotCompatible) {
     fused_computation_1 {
       p0.1 = f32[64,32]{1,0} parameter(0)
       neg = f32[64,32]{1,0} negate(p0.1)
-      bc = f32[1,64,32]{2,1,0} bitcast(neg)
-      transpose = f32[1,32,64]{2,1,0} transpose(bc), dimensions={0,2,1}
-      ROOT bc2 = f32[32,64]{1,0} bitcast(transpose)
+      ROOT transpose = f32[32,64]{1,0} transpose(neg), dimensions={1,0}
     }
 
     fused_computation_2 {
@@ -623,12 +621,10 @@ TEST_F(GpuFusibleTest, FusionHeroesAreCompatible_TransposeFusionNotCompatible) {
   const HloInstruction* fusion_1 =
       module->entry_computation()->root_instruction();
   const HloInstruction* fusion_2 = fusion_1->operand(0);
-  EXPECT_FALSE(
-      FusionHeroesAreCompatible(fusion_1->fused_expression_root(),
-                                fusion_2->fused_expression_root()->operand(0)));
-  EXPECT_FALSE(
-      FusionHeroesAreCompatible(fusion_2->fused_expression_root()->operand(0),
-                                fusion_1->fused_expression_root()));
+  EXPECT_FALSE(FusionHeroesAreCompatible(fusion_1->fused_expression_root(),
+                                         fusion_2->fused_expression_root()));
+  EXPECT_FALSE(FusionHeroesAreCompatible(fusion_2->fused_expression_root(),
+                                         fusion_1->fused_expression_root()));
 }
 
 TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_LoopFusions) {
@@ -1310,9 +1306,9 @@ TEST_F(GpuFusibleTest, ChooseFusionKind) {
 HloModule module
 
 ENTRY computation {
-    p = f32[1,5000,6000]{2,1,0} parameter(0)
-    c = f32[1,6000,5000]{2,1,0} transpose(p), dimensions={0,2,1}
-    ROOT r = f32[300,20,5000]{2,1,0} reshape(c)
+    p = f32[5000,6000]{1,0} parameter(0)
+    c = f32[6000,5000] transpose(p), dimensions={1,0}
+    ROOT r = f32[300,20,5000] reshape(c)
 }
 )")
                     .value();
@@ -1802,6 +1798,30 @@ ENTRY main {
   EXPECT_EQ(config.unroll_factor, 8);
 }
 
+TEST_F(GpuFusibleTest,
+       ComputeLoopFusionConfigForLoopTransposeEffectiveLargerMinorDim) {
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
+HloModule m
+
+ENTRY main {
+  p0 = f16[256,2048,4,2]{3,2,1,0} parameter(0)
+  ROOT res = f16[2048,256,4,2]{3,2,1,0} transpose(p0), dimensions={1,0,2,3}
+}
+)"));
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  se::DeviceDescription device_info_h100{
+      TestGpuDeviceInfo::RTXH100SXMDeviceInfo()};
+  auto analysis = HloFusionAnalysis::Create(*root, device_info_h100);
+  auto config = ComputeLoopFusionConfig(analysis, root->shape());
+  EXPECT_EQ(config.unroll_factor, 4);
+
+  se::DeviceDescription device_info_b200{
+      TestGpuDeviceInfo::RTXB200SXMDeviceInfo()};
+  analysis = HloFusionAnalysis::Create(*root, device_info_b200);
+  config = ComputeLoopFusionConfig(analysis, root->shape());
+  EXPECT_EQ(config.unroll_factor, 8);
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.cc b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
index 72c74c7b8ea8d2..6eca640b573d94 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
@@ -245,20 +245,23 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
     return std::nullopt;
   }
 
-  // We can assume that TransposeDimensionGrouper pass has run, so no need to
-  // call GetNormalizedLogicalTransposeShape here.
-  absl::InlinedVector<int64_t, 3> permutation(hero.dimensions().begin(),
-                                              hero.dimensions().end());
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_dims_or = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      hero.operand(0)->shape(), hero.shape(), hero.dimensions(), permutation);
+  if (!normalized_dims_or.ok()) {
+    return std::nullopt;
+  }
+  auto normalized_dims = normalized_dims_or.value();
+  auto normalized_operand_dims =
+      Permute(normalized_dims, InversePermutation(permutation));
   // A real transpose needs at least 2 transpose dimensions.
   if (permutation.size() < 2) {
     return std::nullopt;
   }
   auto bit_width = GetBitwidth(hero.shape().element_type());
-  absl::InlinedVector<int64_t, 3> dimensions(hero.shape().dimensions().begin(),
-                                             hero.shape().dimensions().end());
-  int64_t operand_most_minor_dim = hero.operand(0)->shape().dimensions().back();
+  int64_t operand_most_minor_dim = normalized_operand_dims.back();
 
-  TransposeDescription desc{&hero, dimensions, permutation,
+  TransposeDescription desc{&hero, normalized_dims, permutation,
                             /*shmem_usage=*/0};
   if (CanEmitPackedTranspose(desc)) {
     int64_t vector_size =
@@ -267,27 +270,28 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
         kNumShmemBanks * (kBankBitwidth / 8) * kNumShmemBanks * vector_size;
     return desc;
   }
-  if (permutation.back() == dimensions.size() - 1) {
+  // Minor dimension is preserved.
+  if (permutation.back() == normalized_dims.size() - 1) {
     operand_most_minor_dim =
-        hero.operand(0)->shape().dimensions(dimensions.size() - 2);
-    if (bit_width * dimensions.back() <= kMaxBitsInMostMinorDimension &&
-        bit_width * dimensions.back() *
+        normalized_operand_dims[normalized_dims.size() - 2];
+    if (bit_width * normalized_dims.back() <= kMaxBitsInMostMinorDimension &&
+        bit_width * normalized_dims.back() *
                 std::min(operand_most_minor_dim,
-                         dimensions[dimensions.size() - 2]) >=
+                         normalized_dims[normalized_dims.size() - 2]) >=
             8 * kMinDimensionToTransposeTiled) {
       // Tile size for transposition.
       int64_t shmem_usage_bytes =
           CeilOfRatio(kNumShmemBanks * (kNumShmemBanks + 1LL) * bit_width *
-                          dimensions.back(),
+                          normalized_dims.back(),
                       8LL);
-      return TransposeDescription{&hero, dimensions, permutation,
+      return TransposeDescription{&hero, normalized_dims, permutation,
                                   shmem_usage_bytes};
     }
   } else if ((operand_most_minor_dim >= kMinDimensionToTransposeTiled &&
-              dimensions.back() >= kMinDimensionToTransposeTiled) ||
+              normalized_dims.back() >= kMinDimensionToTransposeTiled) ||
              (operand_most_minor_dim >= kMinDimensionToTransposeTiled2 &&
-              dimensions.back() >= kMinDimensionToTransposeTiled2 &&
-              operand_most_minor_dim * dimensions.back() >=
+              normalized_dims.back() >= kMinDimensionToTransposeTiled2 &&
+              operand_most_minor_dim * normalized_dims.back() >=
                   kMinTotalDimensionsToTransposeTiled)) {
     // TODO(b/415741994): TransposeEmitter is regressing for S4 when the last
     // dimension is being transposed. The issue seems to be related to bank
@@ -297,7 +301,7 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
     }
     int64_t shmem_usage_bytes =
         CeilOfRatio(kNumShmemBanks * (kNumShmemBanks + 1LL) * bit_width, 8LL);
-    return TransposeDescription{&hero, dimensions, permutation,
+    return TransposeDescription{&hero, normalized_dims, permutation,
                                 shmem_usage_bytes};
   }
   return std::nullopt;
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
index f14ff91e330107..dea30196de473e 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
@@ -82,12 +82,13 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogicalTranspose) {
 HloModule module
 
 ENTRY entry {
-  p = f32[1536,64]{1,0} parameter(0)
-  ROOT t = f32[64,1536]{1,0} transpose(p), dimensions={1,0}
+  p = f32[32,48,64]{2,1,0} parameter(0)
+  ROOT t = f32[64,32,48]{2,1,0} transpose(p), dimensions={2,0,1}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(hlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo));
+
   HloInstruction* tr = module->entry_computation()->root_instruction();
 
   auto result = GetDescriptionForTiledTransposeEmitter(*tr);
@@ -102,12 +103,12 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogical102Transpose) {
 HloModule module
 
 ENTRY entry {
-  p = f32[32,48,2]{2,1,0} parameter(0)
-  ROOT t = f32[48,32,2]{2,1,0} transpose(p), dimensions={1,0,2}
+  p = f32[32,48,1,2]{3,2,1,0} parameter(0)
+  ROOT t = f32[48,32,1,2]{3,2,1,0} transpose(p), dimensions={1,0,2,3}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(hlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo));
   HloInstruction* tr = module->entry_computation()->root_instruction();
 
   auto result = GetDescriptionForTiledTransposeEmitter(*tr);
@@ -412,10 +413,8 @@ fusion {
   p = f32[32,48,64]{2,1,0} parameter(0)
   p2 = f32[48,32,64]{2,1,0} parameter(1)
   t = f32[64,48,32]{2,1,0} transpose(p), dimensions={2,1,0}
-  bc = f32[1,1536,64]{2,1,0} bitcast(p2)
-  t2 = f32[1,64,1536]{2,1,0} transpose(bc), dimensions={0,2,1}
-  bc2 = f32[64,48,32]{2,1,0} bitcast(t2)
-  ROOT add = f32[64,48,32]{2,1,0} add(t, bc2)
+  t2 = f32[64,48,32]{2,1,0} transpose(p2), dimensions={2,0,1}
+  ROOT add = f32[64,48,32]{2,1,0} add(t, t2)
 }
 
 ENTRY main {
@@ -434,6 +433,26 @@ ENTRY main {
   EXPECT_EQ(&FindNonTrivialHero(*r), r);
 }
 
+TEST_F(IrEmissionUtilsTest, FindTiledLogicalTransposeWithGrouping) {
+  const char* hlo = R"(
+HloModule module
+
+ENTRY entry {
+  p = f32[32,32,64]{2,1,0} parameter(0)
+  ROOT t = f32[64,32,32]{2,1,0} transpose(p), dimensions={2,0,1}
+}
+)";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo));
+  HloInstruction* tr = module->entry_computation()->root_instruction();
+
+  auto result = GetDescriptionForTiledTransposeEmitter(*tr);
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(result->instr, tr);
+  EXPECT_EQ(result->dimensions, InlinedVector({64, 1024}));
+  EXPECT_EQ(result->permutation, InlinedVector({1, 0}));
+}
+
 TEST_F(IrEmissionUtilsTest, FindNonTrivialHeroOutsideFusion) {
   const char* hlo = R"(
 HloModule module
@@ -533,13 +552,13 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogicalTransposeOneSwapDimIsSmall) {
 HloModule module
 
 fusion {
-  p = f32[1100,12,8]{2,1,0} parameter(0)
-  ROOT t = f32[8,12,1100]{2,1,0} transpose(p), dimensions={2,1,0}
+  p = f32[100,11,12,8]{3,2,1,0} parameter(0)
+  ROOT t = f32[8,12,100,11]{3,2,1,0} transpose(p), dimensions={3,2,0,1}
 }
 
 ENTRY main {
-  param = f32[1100,12,8]{2,1,0} parameter(0)
-  ROOT fusion = f32[8,12,1100]{2,1,0} fusion(param), kind=kInput, calls=fusion
+  param = f32[100,11,12,8]{3,2,1,0} parameter(0)
+  ROOT fusion = f32[8,12,100,11]{3,2,1,0} fusion(param), kind=kInput, calls=fusion
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -559,13 +578,13 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogicalTransposeOtherSwapDimIsSmall) {
 HloModule module
 
 fusion {
-  p = f32[8,12,1100]{2,1,0} parameter(0)
-  ROOT t = f32[1100,12,8]{2,1,0} transpose(p), dimensions={2,1,0}
+  p = f32[8,12,100,11]{3,2,1,0} parameter(0)
+  ROOT t = f32[100,11,12,8]{3,2,1,0} transpose(p), dimensions={2,3,1,0}
 }
 
 ENTRY main {
-  param = f32[8,12,1100]{2,1,0} parameter(0)
-  ROOT fusion = f32[1100,12,8]{2,1,0} fusion(param), kind=kInput, calls=fusion
+  param = f32[8,12,100,11]{3,2,1,0} parameter(0)
+  ROOT fusion = f32[100,11,12,8]{3,2,1,0} fusion(param), kind=kInput, calls=fusion
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -580,6 +599,28 @@ ENTRY main {
   EXPECT_EQ(result->permutation, InlinedVector({2, 1, 0}));
 }
 
+TEST_F(IrEmissionUtilsTest,
+       FindTiledLogicalTransposeWithSize1DimensionInRawShape) {
+  const char* hlo = R"(
+HloModule module
+
+ENTRY entry {
+  p = f32[32,1,16,2]{3,2,1,0} parameter(0)
+  ROOT t = f32[16,1,32,2]{3,2,1,0} transpose(p), dimensions={2,1,0,3}
+}
+)";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo));
+
+  HloInstruction* tr = module->entry_computation()->root_instruction();
+
+  auto result = GetDescriptionForTiledTransposeEmitter(*tr);
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(result->instr, tr);
+  EXPECT_EQ(result->dimensions, InlinedVector({16, 32, 2}));
+  EXPECT_EQ(result->permutation, InlinedVector({1, 0, 2}));
+}
+
 TEST_F(IrEmissionUtilsTest, IsContiguousSlice) {
   const char* hlo = R"(
 HloModule module
diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
index 6d81b8c935ec17..098b8f94fd04c0 100644
--- a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
+++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
@@ -177,13 +177,13 @@ TEST_F(CoalescingTest, Transpose) {
     HloModule module
 
     fusion {
-      %input = f32[1, 6400, 32] parameter(0)
-      ROOT transpose = f32[1, 32, 6400] transpose(%input), dimensions={0, 2, 1}
+      %input = f32[100, 64, 32] parameter(0)
+      ROOT transpose = f32[32, 100, 64] transpose(%input), dimensions={2, 0, 1}
     }
 
     ENTRY entry {
-      %input = f32[1, 6400, 32] parameter(0)
-      ROOT %fusion = f32[1, 32, 6400] fusion(%input), kind=kLoop, calls=fusion
+      %input = f32[100, 64, 32] parameter(0)
+      ROOT %fusion = f32[32, 100, 64] fusion(%input), kind=kLoop, calls=fusion
   })";
   // thread_x to linearized input mapping for thread_x in [0, 31]:
   // Operand 1:  (thread_x)[s0] -> (thread_x + s0 * 128) for s0 in [0, 7]
@@ -258,15 +258,15 @@ TEST_F(CoalescingTest, TransposeOfBroadcastHeuristic) {
     HloModule module
 
     fusion {
-      input = f32[1, 32, 6400] parameter(0)
-      ROOT slice = f32[1, 32, 100] slice(input), slice={[0:1:1], [0:32:1], [0:6400:64]}
+      input = f32[32, 100, 64] parameter(0)
+      ROOT slice = f32[32, 100, 1] slice(input), slice={[0:32:1], [0:100:1], [0:1:1]}
     }
 
     ENTRY entry {
       p0 = f32[32] parameter(0)
-      broadcast = f32[1, 6400, 32] broadcast(p0), dimensions={2}
-      transpose = f32[1, 32, 6400] transpose(broadcast), dimensions={0, 2, 1}
-      ROOT %fusion = f32[1, 32, 100] fusion(transpose), kind=kLoop, calls=fusion
+      broadcast = f32[100, 64, 32] broadcast(p0), dimensions={2}
+      transpose = f32[32, 100, 64] transpose(broadcast), dimensions={2, 0, 1}
+      ROOT %fusion = f32[32, 100, 1] fusion(transpose), kind=kLoop, calls=fusion
   })";
   EXPECT_TRUE(IsReadCoalescedHeuristic(ir));
 }
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 0b702ad9af2acb..08a13b2bb05b9b 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -210,11 +210,11 @@ cc_library(
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
     ],
diff --git a/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
index 24a469ff6c3d8f..2073fc9f90b858 100644
--- a/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
@@ -287,7 +287,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[6], {{.*}}: f32[6]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[8,8,6]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,4,8,6]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,6,1,1]{3,2,1,0} bitcast([[P1]])
@@ -299,8 +299,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[8,6,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} bitcast([[FUSION]])
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -348,7 +347,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2Degenerate1) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,1,6,8], {{.*}}: f32[6], {{.*}}: f32[6]) -> f32[2,1,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,1,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,6]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,6]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,6,1,1]{3,2,1,0} bitcast([[P1]])
@@ -360,8 +359,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2Degenerate1) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,6,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[2,6,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,1,6,8]{3,2,1,0} bitcast([[FUSION]])
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,1,6,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -409,7 +407,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,24]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,6]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,6,1]{3,2,1,0} bitcast([[P1]])
@@ -421,8 +419,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[2,24,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} bitcast([[FUSION]])
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -470,7 +467,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12Degenerate2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1]) -> f32[2,4,1,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,1,1]{3,2,1,0} bitcast([[P1]])
@@ -482,8 +479,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12Degenerate2) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[2,4,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,1,8]{3,2,1,0} bitcast([[FUSION]])
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,1,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -825,7 +821,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6]) -> (f32[2,4,6,8], f32[2,8], f32[2,8], f32[2,8]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,24]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,6]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,6,1]{3,2,1,0} bitcast([[P1]])
@@ -885,7 +881,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12Degenerate2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1]) -> (f32[2,4,1,8], f32[2,8], f32[2,8], f32[2,8]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,1,1]{3,2,1,0} bitcast([[P1]])
@@ -1181,7 +1177,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[6], {{.*}}: f32[6], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[6], f32[6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[8,8,6]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,4,8,6]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,6,1,1]{3,2,1,0} bitcast([[P1]])
@@ -1274,7 +1270,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4,6], f32[4,6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,24]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,6]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,6,1]{3,2,1,0} bitcast([[P1]])
@@ -1367,7 +1363,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12Degenerate2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1], {{.*}}: f32[2,4,1,8]) -> (f32[2,4,1,8], f32[2,4,1,8], f32[4,1], f32[4,1]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,4]{2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,1,1]{3,2,1,0} bitcast([[P1]])
diff --git a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc
index 3f510f05645814..06d92020417493 100644
--- a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc
@@ -20,13 +20,13 @@ limitations under the License.
 #include <variant>
 
 #include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
-#include "absl/types/span.h"
 #include "llvm/Support/MathExtras.h"
 #include "xla/backends/gpu/codegen/triton/support.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -44,6 +44,7 @@ limitations under the License.
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/instruction_fusion.h"
 #include "xla/service/pattern_matcher.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -78,20 +79,28 @@ bool ShouldRewriteLoopTransposeFusion(
   // is neither the minormost nor the second minormost dimension in the output,
   // and the output minormost dimension is swapped with the new minormost
   // dimension.
-  int64_t rank = root->shape().dimensions().size();
 
-  // The transpose dimension grouper has run, so it should be enough to check
-  // that the minormost dimension's index within the result is smaller than
-  // rank - 2, and that the new minormost dimension is swapped with it.
+  // We use the normalized logical transpose shape so it should be enough to
+  // check that the minormost dimension's index within the result is smaller
+  // than rank - 2, and that the new minormost dimension is swapped with it.
+  absl::InlinedVector<int64_t, 3> permutation;
+  auto normalized_dims_or = ShapeUtil::GetNormalizedLogicalTransposeShape(
+      root->operand(0)->shape(), root->shape(), root->dimensions(),
+      permutation);
+  if (!normalized_dims_or.ok()) {
+    return false;
+  }
+  auto normalized_dims = normalized_dims_or.value();
+  int64_t rank = normalized_dims.size();
+
   // This only triggers for transposes with major-to-minor layout.
   bool has_major_to_minor_layout =
       LayoutUtil::IsMonotonicWithDim0Major(root->shape().layout());
-  absl::Span<int64_t const> transpose_dimensions = root->dimensions();
-  int64_t result_minormost_dim_in_operand = transpose_dimensions.back();
+  int64_t result_minormost_dim_in_operand = permutation.back();
 
   if (!(has_major_to_minor_layout &&
-        transpose_dimensions[result_minormost_dim_in_operand] == rank - 1 &&
-        transpose_dimensions[rank - 1] < rank - 2)) {
+        permutation[result_minormost_dim_in_operand] == rank - 1 &&
+        permutation[rank - 1] < rank - 2)) {
     return false;
   }
 
diff --git a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc
index dfe161f86da219..8fc1960a1a628d 100644
--- a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc
@@ -211,6 +211,38 @@ ENTRY entry  {
   EXPECT_TRUE(HasTritonBlockLevelFusionConfig(root));
 }
 
+TEST_F(FusionBlockLevelRewriterTest,
+       RewritesLoopTransposeFusionWithSplitDimensions) {
+  // This test checks if the rewriter can handle a transpose where dimensions
+  // are split in the HLO but logically contiguous.
+  // Logical shape: [100, 200, 300] -> [300, 200, 100] (Swap dim 0 and 2).
+  // Physical shape: [100, 200, 10, 30] -> [10, 30, 200, 100].
+  // The normalized logical transpose shape should recover the logical swap.
+  const absl::string_view hlo_text = R"(
+fusion_computation {
+  p0 = f32[100,200,10,30] parameter(0)
+  ROOT transpose = f32[10,30,200,100] transpose(p0), dimensions={2,3,1,0}
+}
+
+ENTRY entry {
+  p0 = f32[100,200,10,30] parameter(0)
+  ROOT fusion = f32[10,30,200,100] fusion(p0), kind=kLoop,
+    calls=fusion_computation
+})";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo_text));
+
+  EXPECT_THAT(
+      FusionBlockLevelRewriter(device_info_, HloCostAnalysis::DefaultShapeSize,
+                               &mlir_context_)
+          .Run(module.get()),
+      absl_testing::IsOkAndHolds(true));
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kFusion);
+  EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kCustom);
+  EXPECT_TRUE(HasTritonBlockLevelFusionConfig(root));
+}
+
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
index 0281e68b03e4ba..3b5c92a19388d2 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
@@ -1,10 +1,9 @@
 // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s
 
-// CHECK: fused_transpose
+// CHECK: %wrapped_transpose_computation
 // CHECK-NEXT: bf16[3,3,16,32]{3,2,1,0} parameter(0)
-// CHECK-NEXT: bf16[144,32]{1,0} bitcast
-// CHECK-NEXT: bf16[32,144]{1,0} transpose
-// CHECK-SAME: dimensions={1,0}
+// CHECK-NEXT: bf16[32,3,3,16]{3,2,1,0} transpose
+// CHECK-SAME: dimensions={3,0,1,2}
 // CHECK: (bf16[1,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call
 // CHECK-SAME: window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_o01i->b01f, custom_call_target="__cudnn$convForward
 
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
index 10cc948cf6a288..1b82bb55c80b2b 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
@@ -1,10 +1,9 @@
 // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/h100_sxm.txtpb --split-input-file | FileCheck %s
 
-// CHECK: fused_transpose
+// CHECK: %wrapped_transpose_computation
 // CHECK-NEXT: f8e4m3fn[3,3,16,32]{3,2,1,0} parameter(0)
-// CHECK-NEXT: f8e4m3fn[144,32]{1,0} bitcast
-// CHECK-NEXT: f8e4m3fn[32,144]{1,0} transpose
-// CHECK-SAME: dimensions={1,0}
+// CHECK-NEXT: f8e4m3fn[32,3,3,16]{3,2,1,0} transpose
+// CHECK-SAME: dimensions={3,0,1,2}
 // CHECK: (f8e4m3fn[1,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call
 // CHECK-SAME: window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_o01i->b01f, custom_call_target="__cudnn$convForward
 
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
index 5ae06c318a1cf9..d5baeb8a42af7d 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
@@ -1,10 +1,9 @@
 // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/v100.txtpb --split-input-file | FileCheck %s
 
-// CHECK: fused_transpose
+// CHECK: %wrapped_transpose_computation
 // CHECK-NEXT: f16[3,3,16,32]{3,2,1,0} parameter(0)
-// CHECK-NEXT: f16[144,32]{1,0} bitcast
-// CHECK-NEXT: f16[32,144]{1,0} transpose
-// CHECK-SAME: dimensions={1,0}
+// CHECK-NEXT: f16[32,3,3,16]{3,2,1,0} transpose
+// CHECK-SAME: dimensions={3,0,1,2}
 // CHECK: (f16[1,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call
 // CHECK-SAME: window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_o01i->b01f, custom_call_target="__cudnn$convForward
 
diff --git a/third_party/xla/xla/shape_util.cc b/third_party/xla/xla/shape_util.cc
index a9e0f6dee030c2..5d29d0c2b76fb3 100644
--- a/third_party/xla/xla/shape_util.cc
+++ b/third_party/xla/xla/shape_util.cc
@@ -2379,6 +2379,7 @@ absl::InlinedVector<int64_t, 3> GetNormalizedTransposeShapeHelper(
       normalized_shape.dimensions().begin(),
       normalized_shape.dimensions().end());
   if (segments.size() == 1) {
+    permutation.push_back(0);
     return normalized_dims;
   }
   // Derive the permutation from the segments.
diff --git a/third_party/xla/xla/shape_util_test.cc b/third_party/xla/xla/shape_util_test.cc
index 8d3cedb6d1d21a..5af96d0805c84d 100644
--- a/third_party/xla/xla/shape_util_test.cc
+++ b/third_party/xla/xla/shape_util_test.cc
@@ -1815,7 +1815,20 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NoTranspose) {
                            input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(8192));
-  EXPECT_THAT(permutation, IsEmpty());
+  EXPECT_THAT(permutation, ElementsAre(0));
+}
+
+TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_IdentityWithMerges) {
+  Shape output_shape = ShapeUtil::MakeShape(F32, {10, 20});
+  Shape input_shape = ShapeUtil::MakeShape(F32, {20, 10});
+  // Identity transpose that allows merging dimensions.
+  absl::InlinedVector<int64_t, 3> dimensions = {0, 1};
+  absl::InlinedVector<int64_t, 3> permutation;
+  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
+                       ShapeUtil::GetNormalizedLogicalTransposeShape(
+                           input_shape, output_shape, dimensions, permutation));
+  EXPECT_THAT(normalized_shape, ElementsAre(200));
+  EXPECT_THAT(permutation, ElementsAre(0));
 }
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple2D) {

From 6d5546d597e9bec3a25c7a53942dd050554156b9 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 18 Dec 2025 04:13:39 -0800
Subject: [PATCH 514/753] Refactor: Dynamically register custom call targets in
 custom_call_test.cc

This change updates custom_call_test.cc to dynamically register custom call targets and FFI handlers using the runtime-determined platform name (CUDA or ROCM). This replaces the use of static registration macros, allowing the tests to run correctly across different GPU platforms and the reference interpreter.

This way we can avoid compile time branches like `#ifdef GOOGLE_CUDA` and similar.

Also:

1. Converts usage of raw CUDA driver API functions to StreamExecutor functionality
2. Replaces some legacy CustomCalls by FFI
3. Converts the while test target to HloRunnerPjRt
4. Removes a test case from the Token tests with a nested type in the output type, since that's not supported by our PjRt implementation.

PiperOrigin-RevId: 846196106
---
 third_party/xla/xla/service/gpu/BUILD         |  32 +-
 .../xla/xla/service/gpu/custom_call_test.cc   | 352 +++++++++++-------
 2 files changed, 227 insertions(+), 157 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 5f81a412401377..6c9258ca18a55c 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -193,14 +193,15 @@ xla_test(
     name = "custom_call_test",
     srcs = ["custom_call_test.cc"],
     backends = ["gpu"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
-    tags = ["no-oneapi"],  # TODO(intel-tf): Remove it when macro substitutions for SYCL are available in xla/stream_executor/sycl/*.
+    tags = [
+        # TODO(intel-tf): Remove it when macro substitutions for SYCL are available in xla/stream_executor/sycl/*.
+        "no-oneapi",
+        "test_migrated_to_hlo_runner_pjrt",
+    ],
     deps = [
-        "//xla:debug_options_flags",
         "//xla:literal",
         "//xla:literal_util",
         "//xla:shape_util",
-        "//xla:status_macros",
         "//xla:xla_data_proto_cc",
         "//xla/backends/gpu:ffi",
         "//xla/ffi",
@@ -210,37 +211,32 @@ xla_test(
         "//xla/hlo/builder/lib:constants",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
-        "//xla/hlo/testlib:test_helpers",
         "//xla/service:custom_call_status",
         "//xla/service:custom_call_target_registry",
-        "//xla/service:executable",
         "//xla/service:hlo_module_config",
+        "//xla/service:hlo_runner_interface",
         "//xla/stream_executor:device_address",
-        "//xla/stream_executor:device_address_allocator",
         "//xla/stream_executor:scratch_allocator",
         "//xla/stream_executor:stream",
         "//xla/tests:client_library_test_runner_mixin",
-        "//xla/tests:hlo_test_base",
-        "//xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//xla/tests:hlo_pjrt_interpreter_reference_mixin",
+        "//xla/tests:hlo_pjrt_test_base",
         "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
-        "//xla/tsl/platform:test",
-        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:no_destructor",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test",
-    ] + if_cuda_is_configured([
-        "@local_config_cuda//cuda:cuda_headers",
-    ]) + if_rocm_is_configured([
-        "@local_config_rocm//rocm:rocm_headers",
-    ]),
+        "@com_google_googletest//:gtest_main",
+    ],
 )
 
 xla_cc_test(
diff --git a/third_party/xla/xla/service/gpu/custom_call_test.cc b/third_party/xla/xla/service/gpu/custom_call_test.cc
index 4807574fcdbdb9..d29efd3a727dbd 100644
--- a/third_party/xla/xla/service/gpu/custom_call_test.cc
+++ b/third_party/xla/xla/service/gpu/custom_call_test.cc
@@ -16,31 +16,26 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 #include <memory>
+#include <optional>
 #include <sstream>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/base/const_init.h"
 #include "absl/base/no_destructor.h"
+#include "absl/base/thread_annotations.h"
 #include "absl/container/flat_hash_map.h"
-#include "xla/literal_util.h"
-
-#if GOOGLE_CUDA
-#include "third_party/gpus/cuda/include/cuda.h"  // IWYU pragma: keep
-#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
-#include "third_party/gpus/cuda/include/driver_types.h"
-#define PLATFORM "CUDA"
-#elif TENSORFLOW_USE_ROCM
-#include "rocm/include/hip/hip_runtime.h"
-#define PLATFORM "ROCM"
-#endif
-
+#include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/status_matchers.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/ffi.h"
 #include "xla/ffi/execution_context.h"
@@ -53,39 +48,24 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/literal.h"
+#include "xla/literal_util.h"
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_target_registry.h"
 #include "xla/service/hlo_module_config.h"
+#include "xla/service/hlo_runner_interface.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/tests/client_library_test_runner_mixin.h"
-#include "xla/tests/hlo_test_base.h"
+#include "xla/tests/hlo_pjrt_interpreter_reference_mixin.h"
+#include "xla/tests/hlo_pjrt_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
-#include "xla/tsl/platform/test.h"
 #include "xla/xla_data.pb.h"
 
-#if GOOGLE_CUDA
-#define gpuSuccess cudaSuccess
-#define gpuMemcpyAsync cudaMemcpyAsync
-#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
-#define gpuMemcpy cudaMemcpy
-#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
-#define gpuStream CUstream
-#elif TENSORFLOW_USE_ROCM
-#define gpuSuccess hipSuccess
-#define gpuMemcpyAsync hipMemcpyAsync
-#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define gpuMemcpy hipMemcpy
-#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
-#define gpuStream hipStream_t
-#endif
-
 namespace xla {
 
 struct Range {
@@ -105,7 +85,19 @@ namespace {
 using ::absl_testing::StatusIs;
 using ::testing::HasSubstr;
 
-using CustomCallTest = ClientLibraryTestRunnerMixin<HloTestBase>;
+class CustomCallTest : public ClientLibraryTestRunnerMixin<
+                           HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>> {
+ public:
+  std::string PlatformName() {
+    if (test_runner().HasProperty(HloRunnerPropertyTag::kUsingGpuCuda)) {
+      return "CUDA";
+    }
+    if (test_runner().HasProperty(HloRunnerPropertyTag::kUsingGpuRocm)) {
+      return "ROCM";
+    }
+    LOG(FATAL) << TestName() << " was executed on an unsupported platform.";
+  }
+};
 
 // The test case for custom call with tokens encodes the arguments and result
 // type using a string with A(=Array), T(=Token) and {} for Tuples. It also
@@ -124,32 +116,17 @@ struct TokenTestCase {
   std::string opaque;
 };
 
-void Callback_Tokens(gpuStream stream, void** buffers, const char* opaque,
-                     size_t opaque_len) {
-  for (int i = 0; i < opaque_len; ++i) {
-    char c = opaque[i];
-    ASSERT_TRUE(c == 'A' || c == 'T');
-    if (c == 'A') {
-      ASSERT_NE(buffers[i], nullptr);
-    } else {
-      ASSERT_EQ(buffers[i], nullptr);
-    }
-  }
-}
-
-XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_Tokens, PLATFORM);
-
 std::vector<TokenTestCase> GetTokenTestCases() {
-  return {{"{AT}{AT}", "{A{AT}A}", "ATATAATA"},  // tokens in input and output
-          {"{A}", "T", "AT"},                    // single token as output
-          {"{{T}}", "A", "TA"},                  // single token as input
+  return {{"{AT}{AT}", "{AATA}", "ATATAATA"},  // tokens in input and output
+          {"{A}", "T", "AT"},                  // single token as output
+          {"{{T}}", "A", "TA"},                // single token as input
           {"AA", "{TA}", "AATA"},
           {"TA{TA{TA}}", "{AA}", "TATATAAA"}};
 }
 
 class CustomCallTokensTest
     : public ::testing::WithParamInterface<TokenTestCase>,
-      public ClientLibraryTestRunnerMixin<HloTestBase> {
+      public CustomCallTest {
  public:
   static std::vector<XlaOp> BuildInputs(XlaBuilder& b,
                                         std::istringstream& str) {
@@ -191,14 +168,23 @@ class CustomCallTokensTest
   }
 };
 
-void Callback_WithStatusSucceeded(gpuStream /*stream*/, void** /*buffers*/,
+void Callback_WithStatusSucceeded(void* /*stream*/, void** /*buffers*/,
                                   const char* /*opaque*/, size_t /*opaque_len*/,
                                   XlaCustomCallStatus* status) {
   XlaCustomCallStatusSetSuccess(status);
 }
-XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_WithStatusSucceeded, PLATFORM);
+
+void Callback_WithStatusFailed(void* /*stream*/, void** /*buffers*/,
+                               const char* /*opaque*/, size_t /*opaque_len*/,
+                               XlaCustomCallStatus* status) {
+  XlaCustomCallStatusSetFailure(status, "Failed", 6);
+}
 
 TEST_F(CustomCallTest, WithStatusSucceeded) {
+  CustomCallTargetRegistry::Global()->Register(
+      "Callback_WithStatusSucceeded",
+      reinterpret_cast<void*>(Callback_WithStatusSucceeded), PlatformName());
+
   XlaBuilder b(TestName());
   CustomCall(
       &b, "Callback_WithStatusSucceeded", /*operands=*/{},
@@ -210,14 +196,11 @@ TEST_F(CustomCallTest, WithStatusSucceeded) {
   TF_ASSERT_OK(ExecuteAndTransfer(&b, {}).status());
 }
 
-void Callback_WithStatusFailed(gpuStream /*stream*/, void** /*buffers*/,
-                               const char* /*opaque*/, size_t /*opaque_len*/,
-                               XlaCustomCallStatus* status) {
-  XlaCustomCallStatusSetFailure(status, "Failed", 6);
-}
-XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_WithStatusFailed, PLATFORM);
-
 TEST_F(CustomCallTest, WithStatusFailed) {
+  CustomCallTargetRegistry::Global()->Register(
+      "Callback_WithStatusFailed",
+      reinterpret_cast<void*>(Callback_WithStatusFailed), PlatformName());
+
   XlaBuilder b(TestName());
   CustomCall(
       &b, "Callback_WithStatusFailed", /*operands=*/{},
@@ -244,10 +227,12 @@ XLA_FFI_DEFINE_HANDLER(kAlwaysFail, AlwaysFail,
                            .Ret<ffi::AnyBuffer>()   //
                            .Attr<int32_t>("value")  // value
 );
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$always_fail",
-                         PLATFORM, kAlwaysFail);
 
 TEST_F(CustomCallTest, RuntimeCustomCallAlwaysFail) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "__xla_test$$always_fail",
+                                       PlatformName(), kAlwaysFail);
+
   XlaBuilder b(TestName());
   CustomCall(&b, "__xla_test$$always_fail", /*operands=*/{},
              ShapeUtil::MakeShape(F32, {}), /*opaque=*/"{value = 42 : i32}",
@@ -263,6 +248,10 @@ TEST_F(CustomCallTest, RuntimeCustomCallAlwaysFail) {
 // Same as the above test but just pass attribute through
 // the backend config proto string instead.
 TEST_F(CustomCallTest, PassAttributesByBackendConfig) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "__xla_test$$always_fail",
+                                       PlatformName(), kAlwaysFail);
+
   XlaBuilder b(TestName());
   CustomCall(
       &b, "__xla_test$$always_fail", /*operands=*/{},
@@ -291,10 +280,10 @@ XLA_FFI_DEFINE_HANDLER(kMemcpy, Memcpy,
                            .Ret<ffi::AnyBuffer>(),  // dst
                        {ffi::Traits::kCmdBufferCompatible});
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$memcpy", PLATFORM,
-                         kMemcpy);
-
 TEST_F(CustomCallTest, ExportedFfiMemcpy) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "__xla_test$$memcpy", PlatformName(), kMemcpy);
+
   XlaBuilder b(TestName());
   CustomCall(&b, "__xla_test$$memcpy",
              /*operands=*/{Broadcast(ConstantR0WithType(&b, F32, 42.0), {128})},
@@ -317,10 +306,11 @@ XLA_FFI_DEFINE_HANDLER(kHandleUserPointer, HandleUserPointer,
                            .Ret<ffi::AnyBuffer>()  // buffer for result
                            .Attr<ffi::Pointer<std::string>>("message"));
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$user_data", PLATFORM,
-                         kHandleUserPointer);
-
 TEST_F(CustomCallTest, PassUserPointerWithAttrs) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "__xla_test$$user_data", PlatformName(),
+                                       kHandleUserPointer);
+
   std::string message = "User-defined message";
   auto ptr = reinterpret_cast<uintptr_t>(&message);
 
@@ -347,10 +337,10 @@ XLA_FFI_DEFINE_HANDLER(
     kIsInvoked, IsInvoked,
     ffi::Ffi::Bind().Ret<ffi::AnyBuffer>());  // Buffer for result (unused).
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$isinvoked", PLATFORM,
-                         kIsInvoked);
-
 TEST_F(CustomCallTest, ExportedFfiIsInvoked) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "__xla_test$$isinvoked", PlatformName(), kIsInvoked);
+
   XlaBuilder b(TestName());
   CustomCall(&b, "__xla_test$$isinvoked", /*operands=*/{},
              ShapeUtil::MakeShape(F32, {}), /*opaque=*/"",
@@ -400,10 +390,10 @@ XLA_FFI_DEFINE_HANDLER(kOpaque, Opaque,
                            .Ret<ffi::AnyBuffer>()  // Dummy result buffer.
                            .Attr<ffi::Pointer<std::string>>("opaque"));
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$opaque", PLATFORM,
-                         kOpaque);
-
 TEST_F(CustomCallTest, ExportedFfiOpaque) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "__xla_test$$opaque", PlatformName(), kOpaque);
+
   XlaBuilder b(TestName());
   const std::string opaque = absl::StrFormat(
       "{opaque = %d : i64}", reinterpret_cast<uintptr_t>(&kExpectedOpaque));
@@ -461,10 +451,10 @@ XLA_FFI_DEFINE_HANDLER(
     ffi::Ffi::Bind().RemainingArgs().RemainingRets().Attr<absl::string_view>(
         "pattern"));
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$tokens", PLATFORM,
-                         kFfiTokens);
-
 TEST_P(CustomCallTokensTest, ExportedTokensTest) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "__xla_test$$tokens", PlatformName(), kFfiTokens);
+
   const TokenTestCase& tc = GetParam();
   XlaBuilder b(TestName());
   std::istringstream input(tc.input);
@@ -498,10 +488,11 @@ static absl::Status AlwaysSucceed(ffi::Result<ffi::AnyBuffer>) {
 XLA_FFI_DEFINE_HANDLER(kAlwaysSucceed, AlwaysSucceed,
                        ffi::Ffi::Bind().Ret<ffi::AnyBuffer>());
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "__xla_test$$always_succeed",
-                         PLATFORM, kAlwaysSucceed);
-
 TEST_F(CustomCallTest, ExportedFfiWithStatusSucceeded) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "__xla_test$$always_succeed",
+                                       PlatformName(), kAlwaysSucceed);
+
   XlaBuilder b(TestName());
   CustomCall(&b, "__xla_test$$always_succeed", /*operands=*/{},
              ShapeUtil::MakeShape(F32, {}), /*opaque=*/"",
@@ -541,10 +532,11 @@ XLA_FFI_DEFINE_HANDLER(kFfiAttributes, FfiAttributes,
                            .Attr<absl::Span<const int32_t>>("i32_arr")
                            .Attr<Range>("range"));
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "xla.gpu.ffi_attributes",
-                         PLATFORM, kFfiAttributes);
-
 TEST_F(CustomCallTest, FfiAttributes) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "xla.gpu.ffi_attributes", PlatformName(),
+                                       kFfiAttributes);
+
   XlaBuilder b(TestName());
   CustomCall(&b, "xla.gpu.ffi_attributes", /*operands=*/{},
              ShapeUtil::MakeShape(F32, {}),
@@ -598,11 +590,11 @@ XLA_FFI_DEFINE_HANDLER(kMemcpyWithCalledComputation,
                            .Ret<ffi::AnyBuffer>()         // dst
                            .Ctx<ffi::CalledComputation>());
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(),
-                         "xla.gpu.ext.memcpy_with_called_computation", PLATFORM,
-                         kMemcpyWithCalledComputation);
-
 TEST_F(CustomCallTest, WithCalledComputation) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "xla.gpu.ext.memcpy_with_called_computation",
+      PlatformName(), kMemcpyWithCalledComputation);
+
   auto shape = ShapeUtil::MakeShape(F32, {128});
 
   // Build a called computation which is just a copy instruction.
@@ -625,6 +617,10 @@ TEST_F(CustomCallTest, WithCalledComputation) {
 }
 
 TEST_F(CustomCallTest, WithCalledComputationAndLayouts) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "xla.gpu.ext.memcpy_with_called_computation",
+      PlatformName(), kMemcpyWithCalledComputation);
+
   auto shape = ShapeUtil::MakeShapeWithDenseLayout(F32, {128, 128}, {0, 1});
   // Build a called computation which is just a copy instruction.
   XlaBuilder copy("copy");
@@ -643,10 +639,43 @@ TEST_F(CustomCallTest, WithCalledComputationAndLayouts) {
   TF_ASSERT_OK_AND_ASSIGN(auto result, ExecuteAndTransfer(&b, {}, &shape));
   EXPECT_THAT(result.data<float>(), ::testing::Each(42));
 }
+
 //===----------------------------------------------------------------------===//
 // XLA:FFI handler with execution context
 //===----------------------------------------------------------------------===//
 
+// HloRunnerPjRt doesn't offer a way to provide the execution context for the
+// execution. Therefore we use a global static variable to pass the execution
+// context to the custom call handler.
+absl::Mutex execution_context_mutex(absl::kConstInit);
+ffi::ExecutionContext* global_execution_context
+    ABSL_GUARDED_BY(execution_context_mutex) = nullptr;
+absl::NoDestructor<std::optional<ffi::internal::ScopedExecutionContext>>
+    scoped_execution_context;
+
+template <ffi::ExecutionStage stage>
+absl::Status ExecutionContextRegister(ffi::Result<ffi::AnyBuffer>) {
+  if constexpr (stage != ffi::ExecutionStage::kPrepare) {
+    return absl::OkStatus();
+  }
+
+  absl::MutexLock lock(execution_context_mutex);
+  // ScopedExecutionContext needs to be constructed on the same thread as the
+  // execution context is used. Therefore we use the prepare callback to
+  // create the execution context.
+  scoped_execution_context->emplace(global_execution_context);
+  return absl::OkStatus();
+};
+
+XLA_FFI_DEFINE_HANDLER(
+    kExecutionContextRegisterPrepare,
+    ExecutionContextRegister<ffi::ExecutionStage::kPrepare>,
+    ffi::Ffi::Bind<ffi::ExecutionStage::kPrepare>().Ret<ffi::AnyBuffer>());
+XLA_FFI_DEFINE_HANDLER(
+    kExecutionContextRegisterExecute,
+    ExecutionContextRegister<ffi::ExecutionStage::kExecute>,
+    ffi::Ffi::Bind<ffi::ExecutionStage::kExecute>().Ret<ffi::AnyBuffer>());
+
 // Arbitrary user-defined context passed via the execution context side channel
 // to a custom call handlers.
 struct SomeExtraContext {
@@ -658,7 +687,8 @@ struct SomeExtraContext {
 };
 
 template <ffi::ExecutionStage stage>
-static absl::Status ExecutionContext(ffi::Result<ffi::AnyBuffer>,
+static absl::Status ExecutionContext(ffi::AnyBuffer,
+                                     ffi::Result<ffi::AnyBuffer>,
                                      SomeExtraContext* ctx) {
   if (ctx->value != 42) {
     return absl::InternalError("Unexpected value");
@@ -679,33 +709,59 @@ static absl::Status ExecutionContext(ffi::Result<ffi::AnyBuffer>,
 XLA_FFI_DEFINE_HANDLER(kExecutionContextPrepare,
                        ExecutionContext<ffi::ExecutionStage::kPrepare>,
                        ffi::Ffi::Bind<ffi::ExecutionStage::kPrepare>()
+                           .Arg<ffi::AnyBuffer>()
                            .Ret<ffi::AnyBuffer>()
                            .Ctx<ffi::UserData<SomeExtraContext>>());
 
 XLA_FFI_DEFINE_HANDLER(kExecutionContextInitialize,
                        ExecutionContext<ffi::ExecutionStage::kInitialize>,
                        ffi::Ffi::Bind<ffi::ExecutionStage::kInitialize>()
+                           .Arg<ffi::AnyBuffer>()
                            .Ret<ffi::AnyBuffer>()
                            .Ctx<ffi::UserData<SomeExtraContext>>());
 
 XLA_FFI_DEFINE_HANDLER(kExecutionContextExecute,
                        ExecutionContext<ffi::ExecutionStage::kExecute>,
                        ffi::Ffi::Bind<ffi::ExecutionStage::kExecute>()
+                           .Arg<ffi::AnyBuffer>()
                            .Ret<ffi::AnyBuffer>()
                            .Ctx<ffi::UserData<SomeExtraContext>>());
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "xla.gpu.ffi_execution_context",
-                         PLATFORM,
-                         {
-                             /*instantiate=*/nullptr,
-                             /*prepare=*/kExecutionContextPrepare,
-                             /*initialize=*/kExecutionContextInitialize,
-                             /*execute=*/kExecutionContextExecute,
-                         });
-
 TEST_F(CustomCallTest, FfiExecutionContext) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "xla.gpu.register_ffi_execution_context",
+      PlatformName(),
+      {
+          /*instantiate=*/nullptr,
+          /*prepare=*/kExecutionContextRegisterPrepare,
+          /*initialize=*/nullptr,
+          /*execute=*/kExecutionContextRegisterExecute,
+      });
+
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "xla.gpu.ffi_execution_context", PlatformName(),
+      {
+          /*instantiate=*/nullptr,
+          /*prepare=*/kExecutionContextPrepare,
+          /*initialize=*/kExecutionContextInitialize,
+          /*execute=*/kExecutionContextExecute,
+      });
+
   XlaBuilder b(TestName());
-  CustomCall(&b, "xla.gpu.ffi_execution_context", /*operands=*/{},
+
+  // This custom call users ScopedExecutionContext to register the execution
+  // context for the duration of the current XLA computation.
+  // Usually the execution context is passed in via ExecutionOptions, but that's
+  // not supported in HloRunnerPjRt.
+  XlaOp output =
+      CustomCall(&b, "xla.gpu.register_ffi_execution_context",
+                 /*operands=*/{}, ShapeUtil::MakeShape(F32, {}),
+                 /*opaque=*/"",
+                 /*has_side_effect=*/true,
+                 /*output_operand_aliasing=*/{}, /*literal=*/nullptr,
+                 /*schedule=*/CustomCallSchedule::SCHEDULE_NONE,
+                 /*api_version=*/CustomCallApiVersion::API_VERSION_TYPED_FFI);
+  CustomCall(&b, "xla.gpu.ffi_execution_context", /*operands=*/{output},
              ShapeUtil::MakeShape(F32, {}),
              /*opaque=*/"",
              /*has_side_effect=*/false,
@@ -715,9 +771,10 @@ TEST_F(CustomCallTest, FfiExecutionContext) {
 
   ffi::ExecutionContext execution_context;
   TF_ASSERT_OK(execution_context.Emplace<SomeExtraContext>(42));
-
-  ffi::internal::ScopedExecutionContext scoped_execution_context(
-      &execution_context);
+  {
+    absl::MutexLock lock(execution_context_mutex);
+    global_execution_context = &execution_context;
+  }
 
   TF_ASSERT_OK(ExecuteAndTransfer(&b, {}).status());
 
@@ -760,16 +817,16 @@ XLA_FFI_DEFINE_HANDLER(
     kGetState, GetState,
     ffi::Ffi::Bind().Ret<ffi::AnyBuffer>().Ctx<ffi::State<SomeState>>());
 
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "xla.gpu.ffi_execution_state",
-                         PLATFORM,
-                         {
-                             /*instantiate=*/kInstantiateState,
-                             /*prepare=*/nullptr,
-                             /*initialize=*/nullptr,
-                             /*execute=*/kGetState,
-                         });
-
 TEST_F(CustomCallTest, FfiExecutionState) {
+  xla::ffi::Ffi::RegisterStaticHandler(
+      ffi::GetXlaFfiApi(), "xla.gpu.ffi_execution_state", PlatformName(),
+      {
+          /*instantiate=*/kInstantiateState,
+          /*prepare=*/nullptr,
+          /*initialize=*/nullptr,
+          /*execute=*/kGetState,
+      });
+
   XlaBuilder b(TestName());
   CustomCall(&b, "xla.gpu.ffi_execution_state", /*operands=*/{},
              ShapeUtil::MakeShape(F32, {}),
@@ -831,17 +888,20 @@ XLA_FFI_DEFINE_HANDLER(
     kAsyncStartCustomCall, AsyncStartCustomCall,
     ffi::Ffi::Bind().Arg<ffi::AnyBuffer>().Ret<ffi::AnyBuffer>().Attr<int32_t>(
         "channel"));
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "xla.gpu.async_start_custom_call",
-                         PLATFORM, kAsyncStartCustomCall);
 
 XLA_FFI_DEFINE_HANDLER(
     kAsyncDoneCustomCall, AsyncDoneCustomCall,
     ffi::Ffi::Bind().Arg<ffi::AnyBuffer>().Ret<ffi::AnyBuffer>().Attr<int32_t>(
         "channel"));
-XLA_FFI_REGISTER_HANDLER(ffi::GetXlaFfiApi(), "xla.gpu.async_done_custom_call",
-                         PLATFORM, kAsyncDoneCustomCall);
 
 TEST_F(CustomCallTest, AsyncCustomCalls) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "xla.gpu.async_start_custom_call",
+                                       PlatformName(), kAsyncStartCustomCall);
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(),
+                                       "xla.gpu.async_done_custom_call",
+                                       PlatformName(), kAsyncDoneCustomCall);
+
   auto shape = ShapeUtil::MakeShape(F32, {});
 
   XlaBuilder b(TestName());
@@ -872,40 +932,51 @@ TEST_F(CustomCallTest, AsyncCustomCalls) {
 // Testing the use of buffers in custom calls.
 //===----------------------------------------------------------------------===//
 
-class CustomCallHloTest : public HloTestBase {};
+using CustomCallHloTest = CustomCallTest;
 
-void CallBack_AddOne(gpuStream stream, void** buffers, const char* /*opaque*/,
-                     size_t /*opaque_len*/) {
-  // Expect that the input and output buffers are the same.
-  if (buffers[0] != buffers[1]) {
-    return;
+static absl::Status AddOne(se::Stream* stream, ffi::AnyBuffer src,
+                           ffi::Result<ffi::AnyBuffer> ret) {
+  if (src.untyped_data() != ret->untyped_data()) {
+    return absl::InternalError("Input and output buffers must be the same.");
   }
-  int32_t dst[2];
-  auto err = gpuMemcpy(dst, buffers[0], /*count=*/sizeof(int32_t) * 2,
-                       gpuMemcpyDeviceToHost);
-  ASSERT_EQ(err, gpuSuccess);
-  dst[0] += 1;
-  dst[1] += 1;
-  err = gpuMemcpy(buffers[1], dst, /*count=*/sizeof(int32_t) * 2,
-                  gpuMemcpyHostToDevice);
+
+  int32_t data[2];
+  se::DeviceAddressBase buffer_mem = ret->device_memory();
+  TF_RETURN_IF_ERROR(stream->Memcpy(data, buffer_mem, sizeof(data)));
+  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+
+  data[0] += 1;
+  data[1] += 1;
+
+  TF_RETURN_IF_ERROR(stream->Memcpy(&buffer_mem, data, sizeof(data)));
+  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+
+  return absl::OkStatus();
 }
-XLA_REGISTER_CUSTOM_CALL_TARGET(CallBack_AddOne, PLATFORM);
+
+XLA_FFI_DEFINE_HANDLER(kAddOne, AddOne,
+                       ffi::Ffi::Bind()
+                           .Ctx<ffi::Stream>()
+                           .Arg<ffi::AnyBuffer>()
+                           .Ret<ffi::AnyBuffer>());
 
 TEST_F(CustomCallHloTest, HloBufferStraightLine) {
-  const char* const kModuleStr = R"(
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(), "xla.gpu.add_one",
+                                       PlatformName(), kAddOne);
 
+  const char* const kModuleStr = R"(
   HloModule test
   ENTRY test_computation {
     c1 = s32[] constant(1)
     init = s32[2] broadcast(c1), dimensions={}
     b0 = b(s32[2]) custom-call(init), custom_call_target="Pin",
       output_to_operand_aliasing={{}: (0, {})}
-    b1 = b(s32[2]) custom-call(b0), custom_call_target="CallBack_AddOne",
+    b1 = b(s32[2]) custom-call(b0), custom_call_target="xla.gpu.add_one",
       output_to_operand_aliasing={{}: (0, {})},
-      api_version=API_VERSION_STATUS_RETURNING
-    b2 = b(s32[2]) custom-call(b1), custom_call_target="CallBack_AddOne",
+      api_version=API_VERSION_TYPED_FFI
+    b2 = b(s32[2]) custom-call(b1), custom_call_target="xla.gpu.add_one",
       output_to_operand_aliasing={{}: (0, {})},
-      api_version=API_VERSION_STATUS_RETURNING
+      api_version=API_VERSION_TYPED_FFI
     ROOT v = s32[2] custom-call(b2), custom_call_target="Unpin",
       output_to_operand_aliasing={{}: (0, {})}
   })";
@@ -925,6 +996,9 @@ TEST_F(CustomCallHloTest, HloBufferStraightLine) {
 }
 
 TEST_F(CustomCallHloTest, HloBufferRotated) {
+  xla::ffi::Ffi::RegisterStaticHandler(ffi::GetXlaFfiApi(), "xla.gpu.add_one",
+                                       PlatformName(), kAddOne);
+
   const char* const kModuleStr = R"(
 
   HloModule test
@@ -942,12 +1016,12 @@ TEST_F(CustomCallHloTest, HloBufferRotated) {
 
     c1 = s32[] constant(1)
     new_count = s32[] add(count, c1)
-    b4 = b(s32[2]) custom-call(b3), custom_call_target="CallBack_AddOne",
+    b4 = b(s32[2]) custom-call(b3), custom_call_target="xla.gpu.add_one",
       output_to_operand_aliasing={{}: (0, {})},
-      api_version=API_VERSION_STATUS_RETURNING
-    b5 = b(s32[2]) custom-call(b4), custom_call_target="CallBack_AddOne",
+      api_version=API_VERSION_TYPED_FFI
+    b5 = b(s32[2]) custom-call(b4), custom_call_target="xla.gpu.add_one",
       output_to_operand_aliasing={{}: (0, {})},
-      api_version=API_VERSION_STATUS_RETURNING
+      api_version=API_VERSION_TYPED_FFI
     v0 = s32[2] custom-call(b5), custom_call_target="Unpin",
       output_to_operand_aliasing={{}: (0, {})}
     c1_broadcast = s32[2] broadcast(c1), dimensions={}

From 69cd9be8994ef3ff67e2e30ff48faed67d1df93e Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 18 Dec 2025 04:47:10 -0800
Subject: [PATCH 515/753] Add a function to check for empty/non existing files.

The `fd.Size()` check doesn't work when the file descriptor is invalid and only
the path was given.

PiperOrigin-RevId: 846207406
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  1 +
 .../lite/delegates/xnnpack/file_util.cc       | 23 ++++++++++
 tensorflow/lite/delegates/xnnpack/file_util.h |  5 +++
 .../lite/delegates/xnnpack/file_util_test.cc  | 45 +++++++++++++++++++
 .../lite/delegates/xnnpack/weight_cache.cc    |  2 +-
 5 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 10d401078d0fae..227537a79f1454 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -421,6 +421,7 @@ cc_library(
     hdrs = ["file_util.h"],
     compatible_with = get_compatible_with_portable(),
     deps = [
+        ":macros",
         "//tensorflow/lite:minimal_logging",
     ],
 )
diff --git a/tensorflow/lite/delegates/xnnpack/file_util.cc b/tensorflow/lite/delegates/xnnpack/file_util.cc
index 8a24eb9568b884..268aebc50468e3 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.cc
+++ b/tensorflow/lite/delegates/xnnpack/file_util.cc
@@ -39,7 +39,13 @@ limitations under the License.
 #endif  // TFLITE_XNNPACK_IN_MEMORY_FILE_ENABLED
 #endif  // defined(__linux__) || defined(__ANDROID__)
 
+#include <sys/stat.h>
+
+#include <cerrno>
 #include <cstdio>
+#include <cstring>
+
+#include "tensorflow/lite/delegates/xnnpack/macros.h"
 
 #if !TFLITE_XNNPACK_IN_MEMORY_FILE_ENABLED
 #include "tensorflow/lite/logger.h"
@@ -154,5 +160,22 @@ FileDescriptor CreateInMemoryFileDescriptor(const char* path) {
 #endif
 }
 
+bool IsFileEmpty(const char* path, const FileDescriptor& fd) {
+#if defined(_WIN32)
+  struct _stat64 file_stats{};
+  const int res = fd.IsValid() ? _fstat64(fd.Value(), &file_stats)
+                               : _stat64(path, &file_stats);
+#else
+  struct stat file_stats{};
+  const int res =
+      fd.IsValid() ? fstat(fd.Value(), &file_stats) : stat(path, &file_stats);
+#endif
+  XNNPACK_RETURN_CHECK(
+      res == 0 || errno == ENOENT,
+      "could not access file descriptor %d stats to get size ('%s'): %s.",
+      fd.Value(), path, strerror(errno));
+  return file_stats.st_size == 0;
+}
+
 }  // namespace xnnpack
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/file_util.h b/tensorflow/lite/delegates/xnnpack/file_util.h
index 113a378007506b..9817c74d9f7ee6 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.h
+++ b/tensorflow/lite/delegates/xnnpack/file_util.h
@@ -175,6 +175,11 @@ class FileDescriptor : public FileDescriptorView {
 // descriptor.
 bool InMemoryFileDescriptorAvailable();
 
+// Returns true if the file is empty (the file may exist)
+//
+// Note: if `fd` is valid, then `path` is ignored.
+bool IsFileEmpty(const char* path, const FileDescriptor& fd);
+
 // Creates a new file descriptor that isn't backed by a file system. The file
 // will be automatically cleaned up when the last file descriptor pointing to it
 // is closed.
diff --git a/tensorflow/lite/delegates/xnnpack/file_util_test.cc b/tensorflow/lite/delegates/xnnpack/file_util_test.cc
index 69196fefa28f52..c7f204befd4776 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/file_util_test.cc
@@ -84,5 +84,50 @@ TEST(FileDescriptorTest, ReadFailureReturnsFalse) {
   EXPECT_FALSE(fd.Read(dst_data.data(), dst_data.size()));
 }
 
+TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnEmptyFileThatExists) {
+  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
+                                           O_CREAT | O_TRUNC | O_WRONLY, 0644);
+  fd.Close();
+  EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), FileDescriptor()));
+}
+
+TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnNonExistingFile) {
+  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), FileDescriptor()));
+}
+
+TEST(FileDescriptorTest,
+     IsFileEmptyReturnTrueForAnNonExistingFileWithFileDescriptor) {
+  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
+                                           O_CREAT | O_TRUNC | O_WRONLY, 0644);
+  EXPECT_TRUE(IsFileEmpty("asdfasdf", FileDescriptor()));
+}
+
+TEST(FileDescriptorTest, IsFileEmptyReturnFalseForAFileThatHasContents) {
+  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
+                                           O_CREAT | O_TRUNC | O_WRONLY, 0644);
+  const std::string src_data = "The quick brown fox jumps over the lazy dog.";
+  EXPECT_TRUE(fd.Write(src_data.data(), src_data.size()));
+  EXPECT_FALSE(IsFileEmpty(tmp_file.c_str(), fd));
+}
+
+TEST(FileDescriptorTest, IsFileEmptyPrioritizesTheFileDescriptor) {
+  // We open 2 files, put some data only in one and then pass the file name of
+  // the one that has data and the file descriptor of the empty one.
+  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file2 = testing::TempDir() + __FUNCTION__ + "2";
+  FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
+                                           O_CREAT | O_TRUNC | O_WRONLY, 0644);
+  FileDescriptor fd2 = FileDescriptor::Open(tmp_file2.c_str(),
+                                            O_CREAT | O_TRUNC | O_WRONLY, 0644);
+  const std::string src_data = "The quick brown fox jumps over the lazy dog.";
+  EXPECT_TRUE(fd.Write(src_data.data(), src_data.size()));
+  fd.Close();
+  EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), fd2));
+}
+
 }  // namespace
 }  // namespace tflite::xnnpack
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index 4ceb2df985c989..a8c86ff5a25529 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -330,7 +330,7 @@ bool MMapWeightCacheProvider::LoadOrStartBuild(const char* path,
   }
   const char* const safe_path = Sanitize(path);
   FileDescriptor build_fd = fd.Duplicate();
-  if (!IsInMemoryCachePath(safe_path) && fd.Size() &&
+  if (!IsInMemoryCachePath(safe_path) && !IsFileEmpty(safe_path, fd) &&
       Load(safe_path, std::move(fd))) {
     TFLITE_LOG_PROD(tflite::TFLITE_LOG_VERBOSE,
                     "XNNPack weight cache loaded from '%s'.", safe_path);

From 08d6df5eeaa9cde001fcae1b97021ae18a6799a0 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 18 Dec 2025 05:06:04 -0800
Subject: [PATCH 516/753] Update XNNPack version

PiperOrigin-RevId: 846213195
---
 tensorflow/lite/tools/cmake/modules/xnnpack.cmake | 2 +-
 tensorflow/workspace2.bzl                         | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index 03e94cf830ce29..aa11394dd86d9e 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG 2c1a512208d0481d6e6bd87c2bd5e23408febc3e
+  GIT_TAG 183297df5c945236cbc4bb1f625f9f2008bfc564
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index cf1f355df8bd3e..583ef31b4f61c0 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -168,9 +168,9 @@ def _tf_repositories():
     # LINT.IfChange(xnnpack)
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "961965b04b0cee7c0ece34bb21dbdf69e483772ae7bdb275a08e6d457ed7e38b",
-        strip_prefix = "XNNPACK-2c1a512208d0481d6e6bd87c2bd5e23408febc3e",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/2c1a512208d0481d6e6bd87c2bd5e23408febc3e.zip"),
+        sha256 = "08976c0ba6495775f78d738adbcc60a567b5826774f23d3c403486c70ff79772",
+        strip_prefix = "XNNPACK-183297df5c945236cbc4bb1f625f9f2008bfc564",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/183297df5c945236cbc4bb1f625f9f2008bfc564.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 

From f17984d352176497f49154abcd2ebba0918017fb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 05:10:43 -0800
Subject: [PATCH 517/753] Automated Code Change

PiperOrigin-RevId: 846214738
---
 tensorflow/core/util/proto/decode.h           | 218 +++++++++---------
 .../util/proto/descriptor_pool_registry.cc    |   6 +-
 .../util/proto/descriptor_pool_registry.h     |   8 +-
 tensorflow/core/util/proto/descriptors.cc     |   8 +-
 tensorflow/core/util/proto/descriptors.h      |   2 +-
 tensorflow/core/util/proto/proto_utils.cc     |   8 +-
 tensorflow/core/util/proto/proto_utils.h      |   6 +-
 .../core/util/proto/proto_utils_test.cc       |  10 +-
 8 files changed, 136 insertions(+), 130 deletions(-)

diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
index 7d43e34b35ce50..a3a5c5a72c2f01 100644
--- a/tensorflow/core/util/proto/decode.h
+++ b/tensorflow/core/util/proto/decode.h
@@ -42,7 +42,7 @@ using tensorflow::protobuf::io::StringOutputStream;
 // Converts an uint64 to an int64 without loss of information.
 // Unsigned values greater than INT64_MAX are represented as
 // negative numbers by wrapping (same as twos-complement bit equivalence).
-inline int64_t WrapUnsignedAsSigned64(uint64 unsigned_value) {
+inline int64_t WrapUnsignedAsSigned64(uint64_t unsigned_value) {
   // For a detailed explanation of why this works to wrap unsigned ints, see
   // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
   // Both if tests should be optimized out.
@@ -59,16 +59,16 @@ inline int64_t WrapUnsignedAsSigned64(uint64 unsigned_value) {
 // Converts an uint32 to an int32 without loss of information.
 // Unsigned values greater than INT_MAX are represented as
 // negative numbers by wrapping (same as twos-complement bit equivalence).
-inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) {
+inline int32_t WrapUnsignedAsSigned32(uint32_t unsigned_value) {
   // For a detailed explanation of why this works to wrap unsigned ints, see
   // http://stackoverflow.com/questions/13150449/efficient-unsigned-to-signed-cast-avoiding-implementation-defined-behavior
   // Both if tests should be optimized out.
   if (unsigned_value <= INT_MAX) {
-    return static_cast<int32>(unsigned_value);
+    return static_cast<int32_t>(unsigned_value);
   }
   // The C++ spec allows an architecture where this test is required.
   if (unsigned_value >= INT_MIN) {
-    return static_cast<int32>(unsigned_value - INT_MIN) + INT_MIN;
+    return static_cast<int32_t>(unsigned_value - INT_MIN) + INT_MIN;
   }
   return 0;  // This should never occur.
 }
@@ -78,8 +78,8 @@ inline int32 WrapUnsignedAsSigned32(uint32 unsigned_value) {
 // space in the buffer.
 // The ok value will be set to false if the buffer does not contain
 // a valid varint.
-inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
-                                          uint64* value);
+inline const uint8_t* ReadVarint64FromArray(const uint8_t* buffer, bool* ok,
+                                            uint64_t* value);
 
 // Reads a single varint32 from a byte array.
 // It is the caller's responsibility to ensure that there is enough
@@ -89,10 +89,10 @@ inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
 // This is slightly less efficient than the private version in
 // coded_stream.cc but we duplicate less code by calling
 // the 64 bit version instead of copying the code.
-inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
-                                          uint32* value) {
-  uint64 tmp = 0;
-  const uint8* buf = ReadVarint64FromArray(buffer, ok, &tmp);
+inline const uint8_t* ReadVarint32FromArray(const uint8_t* buffer, bool* ok,
+                                            uint32_t* value) {
+  uint64_t tmp = 0;
+  const uint8_t* buf = ReadVarint64FromArray(buffer, ok, &tmp);
   *value = tmp & 0xffffffff;
   return buf;
 }
@@ -101,12 +101,12 @@ inline const uint8* ReadVarint32FromArray(const uint8* buffer, bool* ok,
 // The array is part of a Tensor that was allocated by the caller
 // with type TensorType, while DeclaredType is the proto field type.
 template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
-const uint8* ReadFromArray(const uint8* buf, TensorType* value);
+const uint8_t* ReadFromArray(const uint8_t* buf, TensorType* value);
 
 template <>
-inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_INT32>(
-    const uint8* buf, int64_t* value) {
-  uint32 temp = 0;
+inline const uint8_t* ReadFromArray<int64_t, WireFormatLite::TYPE_INT32>(
+    const uint8_t* buf, int64_t* value) {
+  uint32_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int64_t>(temp);
@@ -114,19 +114,19 @@ inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_INT32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_INT32>(
-    const uint8* buf, int32* value) {
-  uint32 temp = 0;
+inline const uint8_t* ReadFromArray<int32_t, WireFormatLite::TYPE_INT32>(
+    const uint8_t* buf, int32_t* value) {
+  uint32_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
-  *value = static_cast<int32>(temp);
+  *value = static_cast<int32_t>(temp);
   return buf;
 }
 
 template <>
-inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_INT64>(
-    const uint8* buf, int64_t* value) {
-  uint64 temp = 0;
+inline const uint8_t* ReadFromArray<int64_t, WireFormatLite::TYPE_INT64>(
+    const uint8_t* buf, int64_t* value) {
+  uint64_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WrapUnsignedAsSigned64(temp);
@@ -134,9 +134,9 @@ inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_INT64>(
 }
 
 template <>
-inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT32>(
-    const uint8* buf, uint64* value) {
-  uint32 temp = 0;
+inline const uint8_t* ReadFromArray<uint64_t, WireFormatLite::TYPE_UINT32>(
+    const uint8_t* buf, uint64_t* value) {
+  uint32_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = temp;
@@ -144,23 +144,23 @@ inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<uint32, WireFormatLite::TYPE_UINT32>(
-    const uint8* buf, uint32* value) {
+inline const uint8_t* ReadFromArray<uint32_t, WireFormatLite::TYPE_UINT32>(
+    const uint8_t* buf, uint32_t* value) {
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   return ReadVarint32FromArray(buf, &unused_ok, value);
 }
 
 template <>
-inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_UINT64>(
-    const uint8* buf, uint64* value) {
+inline const uint8_t* ReadFromArray<uint64_t, WireFormatLite::TYPE_UINT64>(
+    const uint8_t* buf, uint64_t* value) {
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   return ReadVarint64FromArray(buf, &unused_ok, value);
 }
 
 template <>
-inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SINT32>(
-    const uint8* buf, int64_t* value) {
-  uint64 temp = 0;
+inline const uint8_t* ReadFromArray<int64_t, WireFormatLite::TYPE_SINT32>(
+    const uint8_t* buf, int64_t* value) {
+  uint64_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode32(temp);
@@ -168,9 +168,9 @@ inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SINT32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
-    const uint8* buf, int32* value) {
-  uint32 temp = 0;
+inline const uint8_t* ReadFromArray<int32_t, WireFormatLite::TYPE_SINT32>(
+    const uint8_t* buf, int32_t* value) {
+  uint32_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode32(temp);
@@ -178,9 +178,9 @@ inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SINT32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SINT64>(
-    const uint8* buf, int64_t* value) {
-  uint64 temp = 0;
+inline const uint8_t* ReadFromArray<int64_t, WireFormatLite::TYPE_SINT64>(
+    const uint8_t* buf, int64_t* value) {
+  uint64_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = WireFormatLite::ZigZagDecode64(temp);
@@ -188,10 +188,10 @@ inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SINT64>(
 }
 
 template <>
-inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_FIXED32>(
-    const uint8* buf, uint64* value) {
-  uint32 temp;
-  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+inline const uint8_t* ReadFromArray<uint64_t, WireFormatLite::TYPE_FIXED32>(
+    const uint8_t* buf, uint64_t* value) {
+  uint32_t temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32_t,
                                                WireFormatLite::TYPE_FIXED32>(
       buf, &temp);
   *value = temp;
@@ -199,10 +199,10 @@ inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_FIXED32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<uint32, WireFormatLite::TYPE_FIXED32>(
-    const uint8* buf, uint32* value) {
-  uint32 temp;
-  buf = WireFormatLite::ReadPrimitiveFromArray<uint32,
+inline const uint8_t* ReadFromArray<uint32_t, WireFormatLite::TYPE_FIXED32>(
+    const uint8_t* buf, uint32_t* value) {
+  uint32_t temp;
+  buf = WireFormatLite::ReadPrimitiveFromArray<uint32_t,
                                                WireFormatLite::TYPE_FIXED32>(
       buf, &temp);
   *value = WrapUnsignedAsSigned32(temp);
@@ -210,8 +210,8 @@ inline const uint8* ReadFromArray<uint32, WireFormatLite::TYPE_FIXED32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_FIXED64>(
-    const uint8* buf, uint64* value) {
+inline const uint8_t* ReadFromArray<uint64_t, WireFormatLite::TYPE_FIXED64>(
+    const uint8_t* buf, uint64_t* value) {
   protobuf_uint64 temp;
   buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_uint64,
                                                WireFormatLite::TYPE_FIXED64>(
@@ -221,10 +221,10 @@ inline const uint8* ReadFromArray<uint64, WireFormatLite::TYPE_FIXED64>(
 }
 
 template <>
-inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SFIXED32>(
-    const uint8* buf, int64_t* value) {
+inline const uint8_t* ReadFromArray<int64_t, WireFormatLite::TYPE_SFIXED32>(
+    const uint8_t* buf, int64_t* value) {
   int32_t temp;
-  buf = WireFormatLite::ReadPrimitiveFromArray<int32,
+  buf = WireFormatLite::ReadPrimitiveFromArray<int32_t,
                                                WireFormatLite::TYPE_SFIXED32>(
       buf, &temp);
   *value = temp;
@@ -232,16 +232,16 @@ inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SFIXED32>(
 }
 
 template <>
-inline const uint8* ReadFromArray<int32, WireFormatLite::TYPE_SFIXED32>(
-    const uint8* buf, int32* value) {
-  return WireFormatLite::ReadPrimitiveFromArray<int32,
+inline const uint8_t* ReadFromArray<int32_t, WireFormatLite::TYPE_SFIXED32>(
+    const uint8_t* buf, int32_t* value) {
+  return WireFormatLite::ReadPrimitiveFromArray<int32_t,
                                                 WireFormatLite::TYPE_SFIXED32>(
       buf, value);
 }
 
 template <>
-inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SFIXED64>(
-    const uint8* buf, int64_t* value) {
+inline const uint8_t* ReadFromArray<int64_t, WireFormatLite::TYPE_SFIXED64>(
+    const uint8_t* buf, int64_t* value) {
   protobuf_int64 temp;
   buf = WireFormatLite::ReadPrimitiveFromArray<protobuf_int64,
                                                WireFormatLite::TYPE_SFIXED64>(
@@ -251,16 +251,16 @@ inline const uint8* ReadFromArray<int64_t, WireFormatLite::TYPE_SFIXED64>(
 }
 
 template <>
-inline const uint8* ReadFromArray<float, WireFormatLite::TYPE_FLOAT>(
-    const uint8* buf, float* value) {
+inline const uint8_t* ReadFromArray<float, WireFormatLite::TYPE_FLOAT>(
+    const uint8_t* buf, float* value) {
   return WireFormatLite::ReadPrimitiveFromArray<float,
                                                 WireFormatLite::TYPE_FLOAT>(
       buf, value);
 }
 
 template <>
-inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_FLOAT>(
-    const uint8* buf, double* value) {
+inline const uint8_t* ReadFromArray<double, WireFormatLite::TYPE_FLOAT>(
+    const uint8_t* buf, double* value) {
   float temp;
   buf =
       WireFormatLite::ReadPrimitiveFromArray<float, WireFormatLite::TYPE_FLOAT>(
@@ -270,17 +270,17 @@ inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_FLOAT>(
 }
 
 template <>
-inline const uint8* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
-    const uint8* buf, double* value) {
+inline const uint8_t* ReadFromArray<double, WireFormatLite::TYPE_DOUBLE>(
+    const uint8_t* buf, double* value) {
   return WireFormatLite::ReadPrimitiveFromArray<double,
                                                 WireFormatLite::TYPE_DOUBLE>(
       buf, value);
 }
 
 template <>
-inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
-    const uint8* buf, bool* value) {
-  uint64 temp = 0;
+inline const uint8_t* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
+    const uint8_t* buf, bool* value) {
+  uint64_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint64FromArray(buf, &unused_ok, &temp);
   *value = temp != 0;
@@ -288,9 +288,9 @@ inline const uint8* ReadFromArray<bool, WireFormatLite::TYPE_BOOL>(
 }
 
 template <>
-inline const uint8* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
-    const uint8* buf, int* value) {
-  uint32 temp = 0;
+inline const uint8_t* ReadFromArray<int, WireFormatLite::TYPE_ENUM>(
+    const uint8_t* buf, int* value) {
+  uint32_t temp = 0;
   bool unused_ok;  // The Counting pass would have failed if this were corrupt.
   buf = ReadVarint32FromArray(buf, &unused_ok, &temp);
   *value = static_cast<int>(temp);
@@ -304,8 +304,8 @@ template <class TensorType, enum WireFormatLite::FieldType DeclaredType>
 inline int ReadPackedPrimitives(const void* bufp, const size_t len,
                                 const int index, const int stride,
                                 void* datap) {
-  const uint8* buf = reinterpret_cast<const uint8*>(bufp);
-  const uint8* bound = buf + len;
+  const uint8_t* buf = reinterpret_cast<const uint8_t*>(bufp);
+  const uint8_t* bound = buf + len;
   TensorType* data = reinterpret_cast<TensorType*>(datap) + index;
   int count;
 
@@ -340,7 +340,7 @@ inline absl::Status ReadPrimitive(CodedInputStream* input, int index,
 inline absl::Status ReadBytes(CodedInputStream* input, int index, void* datap) {
   tstring* data = reinterpret_cast<tstring*>(datap) + index;
 
-  uint32 length;
+  uint32_t length;
   if (!input->ReadVarint32(&length)) {
     return errors::DataLoss("Failed reading bytes");
   }
@@ -370,7 +370,7 @@ inline absl::Status ReadGroupBytes(CodedInputStream* input, int field_number,
   // TYPE_GROUP is deprecated and currently no tests in
   // tensorflow/python/kernel_tests/proto:decode_proto_op_test target a
   // TYPE_GROUP tag, we use std::string as a read buffer.
-  string buf;
+  std::string buf;
   StringOutputStream string_stream(&buf);
   {
     CodedOutputStream out(&string_stream);
@@ -412,31 +412,33 @@ inline absl::Status ReadValue(CodedInputStream* input,
       return ReadPrimitive<protobuf_int64, int64_t, WireFormatLite::TYPE_INT64>(
           input, index, datap);
     case WireFormatLite::TYPE_UINT64:
-      return ReadPrimitive<protobuf_uint64, uint64,
+      return ReadPrimitive<protobuf_uint64, uint64_t,
                            WireFormatLite::TYPE_UINT64>(input, index, datap);
     case WireFormatLite::TYPE_INT32:
       switch (dtype) {
         case DataType::DT_INT64:
-          return ReadPrimitive<int32, int64_t, WireFormatLite::TYPE_INT32>(
+          return ReadPrimitive<int32_t, int64_t, WireFormatLite::TYPE_INT32>(
               input, index, datap);
         case DataType::DT_INT32:
-          return ReadPrimitive<int32, int32, WireFormatLite::TYPE_INT32>(
+          return ReadPrimitive<int32_t, int32_t, WireFormatLite::TYPE_INT32>(
               input, index, datap);
         default:
           return errors::DataLoss("Failed reading TYPE_INT32 for ",
                                   DataTypeString(dtype));
       }
     case WireFormatLite::TYPE_FIXED64:
-      return ReadPrimitive<protobuf_uint64, uint64,
+      return ReadPrimitive<protobuf_uint64, uint64_t,
                            WireFormatLite::TYPE_FIXED64>(input, index, datap);
     case WireFormatLite::TYPE_FIXED32:
       switch (dtype) {
         case DataType::DT_UINT64:
-          return ReadPrimitive<uint32, uint64, WireFormatLite::TYPE_FIXED32>(
-              input, index, datap);
+          return ReadPrimitive<uint32_t, uint64_t,
+                               WireFormatLite::TYPE_FIXED32>(input, index,
+                                                             datap);
         case DataType::DT_UINT32:
-          return ReadPrimitive<uint32, uint32, WireFormatLite::TYPE_FIXED32>(
-              input, index, datap);
+          return ReadPrimitive<uint32_t, uint32_t,
+                               WireFormatLite::TYPE_FIXED32>(input, index,
+                                                             datap);
         default:
           return errors::DataLoss("Failed reading TYPE_FIXED32 for ",
                                   DataTypeString(dtype));
@@ -455,25 +457,25 @@ inline absl::Status ReadValue(CodedInputStream* input,
     case WireFormatLite::TYPE_UINT32:
       switch (dtype) {
         case DataType::DT_UINT64:
-          return ReadPrimitive<uint32, uint64, WireFormatLite::TYPE_UINT32>(
+          return ReadPrimitive<uint32_t, uint64_t, WireFormatLite::TYPE_UINT32>(
               input, index, datap);
         case DataType::DT_UINT32:
-          return ReadPrimitive<uint32, uint32, WireFormatLite::TYPE_UINT32>(
+          return ReadPrimitive<uint32_t, uint32_t, WireFormatLite::TYPE_UINT32>(
               input, index, datap);
         default:
           return errors::DataLoss("Failed reading TYPE_UINT32 for ",
                                   DataTypeString(dtype));
       }
     case WireFormatLite::TYPE_ENUM:
-      return ReadPrimitive<int32, int32, WireFormatLite::TYPE_ENUM>(
+      return ReadPrimitive<int32_t, int32_t, WireFormatLite::TYPE_ENUM>(
           input, index, datap);
     case WireFormatLite::TYPE_SFIXED32:
       switch (dtype) {
         case DataType::DT_INT64:
-          return ReadPrimitive<int32, int64_t, WireFormatLite::TYPE_SFIXED32>(
+          return ReadPrimitive<int32_t, int64_t, WireFormatLite::TYPE_SFIXED32>(
               input, index, datap);
         case DataType::DT_INT32:
-          return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SFIXED32>(
+          return ReadPrimitive<int32_t, int32_t, WireFormatLite::TYPE_SFIXED32>(
               input, index, datap);
         default:
           return errors::DataLoss("Failed reading TYPE_SFIXED32 for ",
@@ -485,10 +487,10 @@ inline absl::Status ReadValue(CodedInputStream* input,
     case WireFormatLite::TYPE_SINT32:
       switch (dtype) {
         case DataType::DT_INT64:
-          return ReadPrimitive<int32, int64_t, WireFormatLite::TYPE_SINT32>(
+          return ReadPrimitive<int32_t, int64_t, WireFormatLite::TYPE_SINT32>(
               input, index, datap);
         case DataType::DT_INT32:
-          return ReadPrimitive<int32, int32, WireFormatLite::TYPE_SINT32>(
+          return ReadPrimitive<int32_t, int32_t, WireFormatLite::TYPE_SINT32>(
               input, index, datap);
         default:
           return errors::DataLoss("Failed reading TYPE_SINT32 for ",
@@ -533,7 +535,7 @@ inline absl::Status ReadPackedFromArray(
           buf, buf_size, *index, stride, data);
       return absl::OkStatus();
     case WireFormatLite::TYPE_UINT64:
-      *index += ReadPackedPrimitives<uint64, WireFormatLite::TYPE_UINT64>(
+      *index += ReadPackedPrimitives<uint64_t, WireFormatLite::TYPE_UINT64>(
           buf, buf_size, *index, stride, data);
       return absl::OkStatus();
     case WireFormatLite::TYPE_INT32:
@@ -543,7 +545,7 @@ inline absl::Status ReadPackedFromArray(
               buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         case DataType::DT_INT32:
-          *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_INT32>(
+          *index += ReadPackedPrimitives<int32_t, WireFormatLite::TYPE_INT32>(
               buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         default:
@@ -551,18 +553,20 @@ inline absl::Status ReadPackedFromArray(
                                   DataTypeString(dtype));
       }
     case WireFormatLite::TYPE_FIXED64:
-      *index += ReadPackedPrimitives<uint64, WireFormatLite::TYPE_FIXED64>(
+      *index += ReadPackedPrimitives<uint64_t, WireFormatLite::TYPE_FIXED64>(
           buf, buf_size, *index, stride, data);
       return absl::OkStatus();
     case WireFormatLite::TYPE_FIXED32:
       switch (dtype) {
         case DataType::DT_UINT64:
-          *index += ReadPackedPrimitives<uint64, WireFormatLite::TYPE_FIXED32>(
-              buf, buf_size, *index, stride, data);
+          *index +=
+              ReadPackedPrimitives<uint64_t, WireFormatLite::TYPE_FIXED32>(
+                  buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         case DataType::DT_UINT32:
-          *index += ReadPackedPrimitives<uint32, WireFormatLite::TYPE_FIXED32>(
-              buf, buf_size, *index, stride, data);
+          *index +=
+              ReadPackedPrimitives<uint32_t, WireFormatLite::TYPE_FIXED32>(
+                  buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         default:
           return errors::DataLoss("Failed reading TYPE_FIXED32 for ",
@@ -580,11 +584,11 @@ inline absl::Status ReadPackedFromArray(
     case WireFormatLite::TYPE_UINT32:
       switch (dtype) {
         case DataType::DT_UINT64:
-          *index += ReadPackedPrimitives<uint64, WireFormatLite::TYPE_UINT32>(
+          *index += ReadPackedPrimitives<uint64_t, WireFormatLite::TYPE_UINT32>(
               buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         case DataType::DT_UINT32:
-          *index += ReadPackedPrimitives<uint32, WireFormatLite::TYPE_UINT32>(
+          *index += ReadPackedPrimitives<uint32_t, WireFormatLite::TYPE_UINT32>(
               buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         default:
@@ -592,7 +596,7 @@ inline absl::Status ReadPackedFromArray(
                                   DataTypeString(dtype));
       }
     case WireFormatLite::TYPE_ENUM:
-      *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_ENUM>(
+      *index += ReadPackedPrimitives<int32_t, WireFormatLite::TYPE_ENUM>(
           buf, buf_size, *index, stride, data);
       return absl::OkStatus();
     case WireFormatLite::TYPE_SFIXED32:
@@ -603,8 +607,9 @@ inline absl::Status ReadPackedFromArray(
                   buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         case DataType::DT_INT32:
-          *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SFIXED32>(
-              buf, buf_size, *index, stride, data);
+          *index +=
+              ReadPackedPrimitives<int32_t, WireFormatLite::TYPE_SFIXED32>(
+                  buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         default:
           return errors::DataLoss("Failed reading TYPE_INT32 for ",
@@ -622,7 +627,7 @@ inline absl::Status ReadPackedFromArray(
               buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         case DataType::DT_INT32:
-          *index += ReadPackedPrimitives<int32, WireFormatLite::TYPE_SINT32>(
+          *index += ReadPackedPrimitives<int32_t, WireFormatLite::TYPE_SINT32>(
               buf, buf_size, *index, stride, data);
           return absl::OkStatus();
         default:
@@ -645,14 +650,14 @@ inline absl::Status ReadPackedFromArray(
 // Important: This routine may read as much as kMaxVarintBytes from
 // the buffer. It is the caller's responsibility to make sure that there is
 // enough space in the buffer.
-inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
-                                          uint64* value) {
-  const uint8* ptr = buffer;
-  uint32 b;
+inline const uint8_t* ReadVarint64FromArray(const uint8_t* buffer, bool* ok,
+                                            uint64_t* value) {
+  const uint8_t* ptr = buffer;
+  uint32_t b;
 
   // Splitting into 32-bit pieces gives better performance on 32-bit
   // processors.
-  uint32 part0 = 0, part1 = 0, part2 = 0;
+  uint32_t part0 = 0, part1 = 0, part2 = 0;
 
   b = *(ptr++);
   part0 = b;
@@ -702,8 +707,9 @@ inline const uint8* ReadVarint64FromArray(const uint8* buffer, bool* ok,
 
 done:
   *ok = true;
-  *value = (static_cast<uint64>(part0)) | (static_cast<uint64>(part1) << 28) |
-           (static_cast<uint64>(part2) << 56);
+  *value = (static_cast<uint64_t>(part0)) |
+           (static_cast<uint64_t>(part1) << 28) |
+           (static_cast<uint64_t>(part2) << 56);
   return ptr;
 }
 
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.cc b/tensorflow/core/util/proto/descriptor_pool_registry.cc
index 5f0423f76b74c2..e8184f6b2fabfc 100644
--- a/tensorflow/core/util/proto/descriptor_pool_registry.cc
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.cc
@@ -27,19 +27,19 @@ DescriptorPoolRegistry* DescriptorPoolRegistry::Global() {
 }
 
 DescriptorPoolRegistry::DescriptorPoolFn* DescriptorPoolRegistry::Get(
-    const string& source) {
+    const std::string& source) {
   auto found = fns_.find(source);
   if (found == fns_.end()) return nullptr;
   return &found->second;
 }
 
 void DescriptorPoolRegistry::Register(
-    const string& source,
+    const std::string& source,
     const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
   auto existing = Get(source);
   CHECK_EQ(existing, nullptr)
       << "descriptor pool for source: " << source << " already registered";
-  fns_.insert(std::pair<const string&, DescriptorPoolFn>(source, pool_fn));
+  fns_.insert(std::pair<const std::string&, DescriptorPoolFn>(source, pool_fn));
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/proto/descriptor_pool_registry.h b/tensorflow/core/util/proto/descriptor_pool_registry.h
index 59c709ea150e87..5718243c15cbab 100644
--- a/tensorflow/core/util/proto/descriptor_pool_registry.h
+++ b/tensorflow/core/util/proto/descriptor_pool_registry.h
@@ -39,13 +39,13 @@ class DescriptorPoolRegistry {
   static DescriptorPoolRegistry* Global();
 
   // Returns a pointer to a descriptor pool function for the given source.
-  DescriptorPoolFn* Get(const string& source);
+  DescriptorPoolFn* Get(const std::string& source);
 
   // Registers a descriptor pool factory.
-  void Register(const string& source, const DescriptorPoolFn& pool_fn);
+  void Register(const std::string& source, const DescriptorPoolFn& pool_fn);
 
  private:
-  std::map<string, DescriptorPoolFn> fns_;
+  std::map<std::string, DescriptorPoolFn> fns_;
 };
 
 namespace descriptor_pool_registration {
@@ -53,7 +53,7 @@ namespace descriptor_pool_registration {
 class DescriptorPoolRegistration {
  public:
   DescriptorPoolRegistration(
-      const string& source,
+      const std::string& source,
       const DescriptorPoolRegistry::DescriptorPoolFn& pool_fn) {
     DescriptorPoolRegistry::Global()->Register(source, pool_fn);
   }
diff --git a/tensorflow/core/util/proto/descriptors.cc b/tensorflow/core/util/proto/descriptors.cc
index 31942145fe32fa..e485499c94d5f7 100644
--- a/tensorflow/core/util/proto/descriptors.cc
+++ b/tensorflow/core/util/proto/descriptors.cc
@@ -45,7 +45,7 @@ absl::Status CreatePoolFromSet(
 // The file must contain a serialized `FileDescriptorSet`. See
 // `GetDescriptorPool()` for more information.
 absl::Status GetDescriptorPoolFromFile(
-    tensorflow::Env* env, const string& filename,
+    tensorflow::Env* env, const std::string& filename,
     std::unique_ptr<protobuf::DescriptorPool>* owned_desc_pool) {
   absl::Status st = env->FileExists(filename);
   if (!st.ok()) {
@@ -66,7 +66,7 @@ absl::Status GetDescriptorPoolFromFile(
 }
 
 absl::Status GetDescriptorPoolFromBinary(
-    const string& source,
+    const std::string& source,
     std::unique_ptr<protobuf::DescriptorPool>* owned_desc_pool) {
   if (!absl::StartsWith(source, "bytes://")) {
     return errors::InvalidArgument(absl::StrCat(
@@ -76,7 +76,7 @@ absl::Status GetDescriptorPoolFromBinary(
   }
   // Parse the FileDescriptorSet.
   protobuf::FileDescriptorSet proto;
-  if (!proto.ParseFromString(string(absl::StripPrefix(source, "bytes://")))) {
+  if (!proto.ParseFromString(absl::StripPrefix(source, "bytes://"))) {
     return errors::InvalidArgument(absl::StrCat(
         "Source does not represent serialized file descriptor set proto. ",
         "This may be due to a missing dependency on the file containing ",
@@ -88,7 +88,7 @@ absl::Status GetDescriptorPoolFromBinary(
 }  // namespace
 
 absl::Status GetDescriptorPool(
-    Env* env, string const& descriptor_source,
+    Env* env, const std::string& descriptor_source,
     protobuf::DescriptorPool const** desc_pool,
     std::unique_ptr<protobuf::DescriptorPool>* owned_desc_pool) {
   // Attempt to lookup the pool in the registry.
diff --git a/tensorflow/core/util/proto/descriptors.h b/tensorflow/core/util/proto/descriptors.h
index 3402ed0504410e..7b6ce3b97b5053 100644
--- a/tensorflow/core/util/proto/descriptors.h
+++ b/tensorflow/core/util/proto/descriptors.h
@@ -46,7 +46,7 @@ using tsl::Env;
 // Custom schemas can be supported by registering a handler with the
 // `DescriptorPoolRegistry`.
 absl::Status GetDescriptorPool(
-    Env* env, string const& descriptor_source,
+    Env* env, const std::string& descriptor_source,
     protobuf::DescriptorPool const** desc_pool,
     std::unique_ptr<protobuf::DescriptorPool>* owned_desc_pool);
 
diff --git a/tensorflow/core/util/proto/proto_utils.cc b/tensorflow/core/util/proto/proto_utils.cc
index 0833352bf431d7..f0a103eaa2823c 100644
--- a/tensorflow/core/util/proto/proto_utils.cc
+++ b/tensorflow/core/util/proto/proto_utils.cc
@@ -79,20 +79,20 @@ absl::Status ParseTextFormatFromString(absl::string_view input,
     return absl::Status(absl::StatusCode::kInvalidArgument,
                         "output must be non NULL");
   }
-  string err;
+  std::string err;
   StringErrorCollector err_collector(&err, /*one-indexing=*/true);
   protobuf::TextFormat::Parser parser;
   parser.RecordErrorsTo(&err_collector);
-  if (!parser.ParseFromString(string(input), output)) {
+  if (!parser.ParseFromString(input, output)) {
     return absl::Status(absl::StatusCode::kInvalidArgument, err);
   }
   return absl::OkStatus();
 }
 
-StringErrorCollector::StringErrorCollector(string* error_text)
+StringErrorCollector::StringErrorCollector(std::string* error_text)
     : StringErrorCollector(error_text, false) {}
 
-StringErrorCollector::StringErrorCollector(string* error_text,
+StringErrorCollector::StringErrorCollector(std::string* error_text,
                                            bool one_indexing)
     : error_text_(error_text), index_offset_(one_indexing ? 1 : 0) {
   DCHECK(error_text_ != nullptr) << "error_text must be non NULL";
diff --git a/tensorflow/core/util/proto/proto_utils.h b/tensorflow/core/util/proto/proto_utils.h
index 8a94a832fec58c..65c73e35c15f8b 100644
--- a/tensorflow/core/util/proto/proto_utils.h
+++ b/tensorflow/core/util/proto/proto_utils.h
@@ -44,11 +44,11 @@ class StringErrorCollector : public protobuf::io::ErrorCollector {
  public:
   // String error_text is unowned and must remain valid during the use of
   // StringErrorCollector.
-  explicit StringErrorCollector(string* error_text);
+  explicit StringErrorCollector(std::string* error_text);
   // If one_indexing is set to true, all line and column numbers will be
   // increased by one for cases when provided indices are 0-indexed and
   // 1-indexed error messages are desired
-  StringErrorCollector(string* error_text, bool one_indexing);
+  StringErrorCollector(std::string* error_text, bool one_indexing);
   StringErrorCollector(const StringErrorCollector&) = delete;
   StringErrorCollector& operator=(const StringErrorCollector&) = delete;
 
@@ -61,7 +61,7 @@ class StringErrorCollector : public protobuf::io::ErrorCollector {
                      absl::string_view message) override;
 
  private:
-  string* const error_text_;
+  std::string* const error_text_;
   const int index_offset_;
 };
 
diff --git a/tensorflow/core/util/proto/proto_utils_test.cc b/tensorflow/core/util/proto/proto_utils_test.cc
index 8632c2a5e29d52..460e41ad770c31 100644
--- a/tensorflow/core/util/proto/proto_utils_test.cc
+++ b/tensorflow/core/util/proto/proto_utils_test.cc
@@ -61,21 +61,21 @@ TEST(ParseTextFormatFromStringTest, DiesOnNullOutputPointer) {
 }
 
 TEST(StringErrorCollectorTest, AppendsError) {
-  string err;
+  std::string err;
   StringErrorCollector collector(&err);
   collector.RecordError(1, 2, "foo");
   EXPECT_EQ("1(2): foo\n", err);
 }
 
 TEST(StringErrorCollectorTest, AppendsWarning) {
-  string err;
+  std::string err;
   StringErrorCollector collector(&err);
   collector.RecordWarning(1, 2, "foo");
   EXPECT_EQ("1(2): foo\n", err);
 }
 
 TEST(StringErrorCollectorTest, AppendsMultipleError) {
-  string err;
+  std::string err;
   StringErrorCollector collector(&err);
   collector.RecordError(1, 2, "foo");
   collector.RecordError(3, 4, "bar");
@@ -83,7 +83,7 @@ TEST(StringErrorCollectorTest, AppendsMultipleError) {
 }
 
 TEST(StringErrorCollectorTest, AppendsMultipleWarning) {
-  string err;
+  std::string err;
   StringErrorCollector collector(&err);
   collector.RecordWarning(1, 2, "foo");
   collector.RecordWarning(3, 4, "bar");
@@ -91,7 +91,7 @@ TEST(StringErrorCollectorTest, AppendsMultipleWarning) {
 }
 
 TEST(StringErrorCollectorTest, OffsetWorks) {
-  string err;
+  std::string err;
   StringErrorCollector collector(&err, true);
   collector.RecordError(1, 2, "foo");
   collector.RecordWarning(3, 4, "bar");

From d5820b300095cb9df734451e04ca4071c23fdf66 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 05:20:19 -0800
Subject: [PATCH 518/753] Automated Code Change

PiperOrigin-RevId: 846217449
---
 tensorflow/core/tfrt/common/async_value_tensor.h | 2 +-
 tensorflow/core/tfrt/common/pjrt_state.cc        | 2 +-
 tensorflow/core/tfrt/common/pjrt_state.h         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/tfrt/common/async_value_tensor.h b/tensorflow/core/tfrt/common/async_value_tensor.h
index 06e99f8f7bcc48..83d0efcb5cc63a 100644
--- a/tensorflow/core/tfrt/common/async_value_tensor.h
+++ b/tensorflow/core/tfrt/common/async_value_tensor.h
@@ -64,7 +64,7 @@ class AsyncValueAllocator : public Allocator {
   void DeallocateRaw(void* ptr) override;
 
   bool AllocatesOpaqueHandle() const override { return true; }
-  string Name() override { return "async-value"; }
+  std::string Name() override { return "async-value"; }
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/common/pjrt_state.cc b/tensorflow/core/tfrt/common/pjrt_state.cc
index 9a6ec5bba211e5..e20e2ca0790586 100644
--- a/tensorflow/core/tfrt/common/pjrt_state.cc
+++ b/tensorflow/core/tfrt/common/pjrt_state.cc
@@ -99,6 +99,6 @@ PjRtGpuClientCreationInfo* PjRtState::GetPjRtGpuClientCreationInfo() {
   return pjrt_gpu_client_creation_info_.get();
 }
 
-string PjRtState::DebugString() const { return "PjRtState"; }
+std::string PjRtState::DebugString() const { return "PjRtState"; }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/common/pjrt_state.h b/tensorflow/core/tfrt/common/pjrt_state.h
index 3da5fb930a9e1b..e0e9f8657bb8a8 100644
--- a/tensorflow/core/tfrt/common/pjrt_state.h
+++ b/tensorflow/core/tfrt/common/pjrt_state.h
@@ -63,7 +63,7 @@ class PjRtState : public ResourceBase {
   // Moves PJRT client to `unused_`. The PJRT client moved to `unused_` will not
   // be returned by `GetPjRtClient`.
   absl::Status MovePjRtClientToUnused(const DeviceType& device_type);
-  string DebugString() const override;
+  std::string DebugString() const override;
 
   // Saves information needed to create a PJRT client (to enable creating a
   // client with remote devices).

From d3933721c1ee613590da77d76b27f27532394e7d Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 18 Dec 2025 05:31:45 -0800
Subject: [PATCH 519/753] When opening a file, check that the file path is not
 null.

PiperOrigin-RevId: 846221230
---
 tensorflow/lite/delegates/xnnpack/file_util.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/lite/delegates/xnnpack/file_util.cc b/tensorflow/lite/delegates/xnnpack/file_util.cc
index 268aebc50468e3..7fbb917c850e4e 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util.cc
+++ b/tensorflow/lite/delegates/xnnpack/file_util.cc
@@ -96,6 +96,9 @@ FileDescriptor::Offset FileDescriptorView::MovePos(
 }
 
 FileDescriptor FileDescriptor::Open(const char* path, int flags, mode_t mode) {
+  if (!path) {
+    return {};
+  }
 #if defined(_WIN32)
   if (!(flags & O_TEXT)) {
     flags |= O_BINARY;

From 6286fccd8f40a1782922d4672a648ede614de799 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 05:33:23 -0800
Subject: [PATCH 520/753] Automated Code Change

PiperOrigin-RevId: 846221752
---
 third_party/xla/xla/service/llvm_ir/BUILD             | 1 +
 third_party/xla/xla/service/llvm_ir/llvm_loop.cc      | 1 -
 third_party/xla/xla/service/llvm_ir/llvm_util_test.cc | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/llvm_ir/BUILD b/third_party/xla/xla/service/llvm_ir/BUILD
index a625cd355bdbe0..5043fe1ab34097 100644
--- a/third_party/xla/xla/service/llvm_ir/BUILD
+++ b/third_party/xla/xla/service/llvm_ir/BUILD
@@ -311,6 +311,7 @@ xla_cc_test(
         "//xla:error_spec",
         "//xla:literal",
         "//xla:shape_util",
+        "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:hlo_module_config",
         "//xla/tests:hlo_test_base",
diff --git a/third_party/xla/xla/service/llvm_ir/llvm_loop.cc b/third_party/xla/xla/service/llvm_ir/llvm_loop.cc
index 70e952bdcf961e..7845c54786f22a 100644
--- a/third_party/xla/xla/service/llvm_ir/llvm_loop.cc
+++ b/third_party/xla/xla/service/llvm_ir/llvm_loop.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <cstdint>
 #include <memory>
 #include <numeric>
-#include <optional>
 #include <string>
 #include <vector>
 
diff --git a/third_party/xla/xla/service/llvm_ir/llvm_util_test.cc b/third_party/xla/xla/service/llvm_ir/llvm_util_test.cc
index 5e57393dc95cf2..b39afac44ff9e9 100644
--- a/third_party/xla/xla/service/llvm_ir/llvm_util_test.cc
+++ b/third_party/xla/xla/service/llvm_ir/llvm_util_test.cc
@@ -38,6 +38,7 @@ limitations under the License.
 #include "xla/service/hlo_module_config.h"
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla::llvm_ir {
 namespace {

From b64c84f2c3b2e5c577509d28f711777ee5d28b2c Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 18 Dec 2025 05:50:11 -0800
Subject: [PATCH 521/753] Remove forgotten ROCM version checks from
 NcclCollectives

The ROCm code path doesn't go through NcclCollectives anymore. Therefore these checks are obsolete.

PiperOrigin-RevId: 846226180
---
 .../xla/xla/backends/gpu/collectives/nccl_collectives.cc | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
index b887bbb24e4e78..d4990d32193f10 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_collectives.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "xla/backends/gpu/collectives/nccl_collectives.h"
 
+#include <atomic>
 #include <cstdint>
 #include <cstdlib>
 #include <functional>
@@ -104,9 +105,7 @@ static absl::StatusOr<ncclConfig_t> AsNcclConfig(
     const se::StreamExecutor* stream_executor) {
   ncclConfig_t comm_config = NCCL_CONFIG_INITIALIZER;
   comm_config.blocking = config.blocking_communicators ? 1 : 0;
-#if !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION > 50700
   comm_config.splitShare = config.split_share;
-#endif
   int nccl_version;
   XLA_NCCL_RETURN_IF_ERROR(ncclGetVersion(&nccl_version));
   if (config.max_nchannels > 0) {
@@ -231,7 +230,6 @@ NcclCollectives::SplitCommunicatorsWithCancel(
   const auto& gpu_config =
       tsl::down_cast<const GpuCollectives::Config&>(config);
 
-#if !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION >= 60000
   auto make_comm = [&](int i) -> absl::StatusOr<ncclComm_t> {
     auto* device = tsl::down_cast<GpuCollectives::Device*>(ranks[i].device);
     TF_RET_CHECK(device != nullptr);
@@ -268,11 +266,6 @@ NcclCollectives::SplitCommunicatorsWithCancel(
   }  // pool's destructor blocks until all scheduled work is done.
   TF_RETURN_IF_ERROR(status);
   return split_comms;
-#else
-  return absl::UnimplementedError(
-      absl::StrFormat("%s:%d: NCCL operation ncclCommSplit not implemented",
-                      __FILE__, __LINE__));
-#endif  // !defined(TENSORFLOW_USE_ROCM) || TF_ROCM_VERSION >= 60000
 }
 
 static absl::StatusOr<xla::gpu::GpuCollectives*> GetNvshmemCollectives() {

From 17fa72acdeb3a8bc0931267f6bb866a8e7e67d31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 05:50:44 -0800
Subject: [PATCH 522/753] Automated Code Change

PiperOrigin-RevId: 846226345
---
 third_party/xla/xla/stream_executor/rocm/BUILD         | 10 ++++++++--
 third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h |  3 +++
 .../xla/stream_executor/rocm/rocm_command_buffer.cc    |  2 --
 .../rocm/rocm_compute_capability_test.cc               |  1 +
 third_party/xla/xla/stream_executor/rocm/rocm_dnn.h    |  2 ++
 .../xla/xla/stream_executor/rocm/rocm_executor_test.cc |  1 -
 third_party/xla/xla/stream_executor/rocm/rocm_stream.h |  2 ++
 7 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index ab3b73b0fc8fa9..885d88ec2812b0 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -43,6 +43,7 @@ xla_cc_test(
     srcs = ["rocm_compute_capability_test.cc"],
     deps = [
         ":rocm_compute_capability",
+        "//xla/stream_executor:device_description_proto_cc",
         "//xla/tsl/util/proto:proto_matchers",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
@@ -530,11 +531,13 @@ cc_library(
         "//xla/tsl/platform:logging",
         "//xla/tsl/platform:macros",
         "//xla/tsl/platform:statusor",
+        "//xla/tsl/protobuf:dnn_proto_cc",
         "//xla/tsl/util:determinism_for_kernels",
         "//xla/tsl/util:env_var",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/synchronization",
@@ -676,6 +679,7 @@ cc_library(
     deps = [
         ":hip_blas_utils",
         ":hipblas_lt_header",
+        ":hipblaslt_if_static",
         ":rocblas_plugin",
         ":rocm_executor",
         ":rocm_platform_id",
@@ -696,6 +700,7 @@ cc_library(
         "//xla/tsl/platform:status",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
@@ -705,8 +710,6 @@ cc_library(
         "@local_config_rocm//rocm:rocm_headers",
         "@local_tsl//tsl/platform:dso_loader",
         "@local_tsl//tsl/platform:ml_dtypes",
-    ] + [
-        ":hipblaslt_if_static",
     ],
     alwayslink = True,
 )
@@ -732,7 +735,10 @@ cc_library(
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/synchronization",
         "@local_config_rocm//rocm:rocm_headers",
         "@local_tsl//tsl/platform:dso_loader",
     ],
diff --git a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h
index efe8b84b674168..71b79826d0b7cb 100644
--- a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h
+++ b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.h
@@ -16,7 +16,10 @@ limitations under the License.
 #include <cstddef>
 #include <utility>
 
+#include "absl/base/thread_annotations.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/synchronization/mutex.h"
 #include "rocm/rocm_config.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_address.h"
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_command_buffer.cc b/third_party/xla/xla/stream_executor/rocm/rocm_command_buffer.cc
index 528f4febfcd2e4..0bb07079f362ba 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_command_buffer.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_command_buffer.cc
@@ -17,13 +17,11 @@ limitations under the License.
 
 #include <cstddef>
 #include <cstdint>
-#include <iterator>
 #include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 
-#include "absl/algorithm/container.h"
 #include "absl/base/casts.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/log/check.h"
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_compute_capability_test.cc b/third_party/xla/xla/stream_executor/rocm/rocm_compute_capability_test.cc
index ed9eba6c545e17..5ecb0db327988d 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_compute_capability_test.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_compute_capability_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/strings/string_view.h"
+#include "xla/stream_executor/device_description.pb.h"
 #include "xla/tsl/util/proto/proto_matchers.h"
 
 namespace stream_executor::rocm {
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
index 1922c006b60deb..e1dbd07db3a95a 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_dnn.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/types/span.h"
 #include "rocm/include/miopen/miopen.h"
 #include "xla/stream_executor/device_address.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "xla/stream_executor/plugin_registry.h"
 #include "xla/stream_executor/scratch_allocator.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/protobuf/dnn.pb.h"
 
 namespace stream_executor {
 namespace gpu {
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc b/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
index 0bfef481d9f4f4..342c3ca5952b40 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_executor_test.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "xla/stream_executor/rocm/rocm_executor.h"
 
 #include <memory>
-#include <variant>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_stream.h b/third_party/xla/xla/stream_executor/rocm/rocm_stream.h
index a16a3bb5305559..4d8cfbb7acf90c 100644
--- a/third_party/xla/xla/stream_executor/rocm/rocm_stream.h
+++ b/third_party/xla/xla/stream_executor/rocm/rocm_stream.h
@@ -23,6 +23,8 @@ limitations under the License.
 
 #include "absl/functional/any_invocable.h"
 #include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "rocm/include/hip/hip_runtime.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/event.h"

From 6457884a9bdca545f06cebb79b9bde691ce7f7f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 06:07:02 -0800
Subject: [PATCH 523/753] Automated Code Change

PiperOrigin-RevId: 846231902
---
 .../gpu/transforms/double_buffer_loop_unrolling.cc        | 2 --
 .../xla/service/gpu/transforms/nest_gemm_fusion_test.cc   | 8 --------
 .../service/gpu/transforms/transpose_dimension_grouper.cc | 3 ---
 3 files changed, 13 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
index fa58739643633a..c509f381ecfe87 100644
--- a/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
+++ b/third_party/xla/xla/service/gpu/transforms/double_buffer_loop_unrolling.cc
@@ -14,8 +14,6 @@ limitations under the License.
 ==============================================================================*/
 #include "xla/service/gpu/transforms/double_buffer_loop_unrolling.h"
 
-#include <algorithm>
-#include <cmath>
 #include <cstdint>
 #include <iterator>
 #include <optional>
diff --git a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
index da4c0d8adc6eb1..bc37e69dcb8489 100644
--- a/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/nest_gemm_fusion_test.cc
@@ -15,21 +15,13 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/nest_gemm_fusion.h"
 
-#include <cstdint>
 #include <memory>
-#include <string>
-#include <utility>
-#include <vector>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "absl/container/inlined_vector.h"
 #include "absl/log/log.h"
 #include "absl/status/status_matchers.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
-#include "absl/strings/substitute.h"
 #include "mlir/IR/MLIRContext.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
diff --git a/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc b/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
index de243e47343e98..d3eb4c56e28c78 100644
--- a/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
+++ b/third_party/xla/xla/service/gpu/transforms/transpose_dimension_grouper.cc
@@ -15,10 +15,7 @@ limitations under the License.
 
 #include "xla/service/gpu/transforms/transpose_dimension_grouper.h"
 
-#include <cstddef>
 #include <cstdint>
-#include <functional>
-#include <vector>
 
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"

From 5e49ee5ed358bd9b18e4f9484224085576229bfb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 06:15:09 -0800
Subject: [PATCH 524/753] Automated Code Change

PiperOrigin-RevId: 846234559
---
 .../core/runtime_fallback/test/forwarding_test_kernels.cc   | 6 +++---
 .../core/runtime_fallback/test/tfrt_forwarding_kernels.cc   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc b/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc
index 4454a04cc1ab34..758a9074637aa2 100644
--- a/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc
+++ b/tensorflow/core/runtime_fallback/test/forwarding_test_kernels.cc
@@ -39,8 +39,8 @@ class ScalarAdd : public OpKernelT {
     const Tensor& input1 = ctx->input(1);
 
     Tensor output(input0);
-    output.scalar<int32>()() =
-        input0.scalar<int32>()() + input1.scalar<int32>()();
+    output.scalar<int32_t>()() =
+        input0.scalar<int32_t>()() + input1.scalar<int32_t>()();
 
     ctx->set_output(0, output);
   }
@@ -54,7 +54,7 @@ REGISTER_OP("ScalarAdd") SCALAR_ADD_PROPERTIES;
 
 // When calling ScalarAdd from TF, use the standard OpKernel* types.
 REGISTER_KERNEL_BUILDER(
-    Name("ScalarAdd").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
+    Name("ScalarAdd").Device(DEVICE_CPU).TypeConstraint<int32_t>("T"),
     ScalarAdd<OpKernel, OpKernelConstruction, OpKernelContext>)
 #endif
 
diff --git a/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc b/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc
index 6d45437dae4625..5193167366ac35 100644
--- a/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc
+++ b/tensorflow/core/runtime_fallback/test/tfrt_forwarding_kernels.cc
@@ -28,7 +28,7 @@ namespace tensorflow {
 static void TFDConstantTensor5D(tfrt::Argument<int32_t> value,
                                 tfrt::Result<Tensor> tensor) {
   Tensor out(DT_INT32, TensorShape({1, 1, 1, 1, 1}));
-  out.flat<int32>()(0) = value.get();
+  out.flat<int32_t>()(0) = value.get();
   tensor.Emplace(out);
 }
 

From 4e34cc6fb7c23a58f5a3b94436ba8b45321bcb0e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 06:28:57 -0800
Subject: [PATCH 525/753] [XLA:GPU] Support partitioned across replicas modules

PiperOrigin-RevId: 846238886
---
 .../gpu/runtime/collective_metadata_thunk.cc  |  4 +-
 .../xla/xla/tests/collective_metadata_test.cc | 41 +++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
index d6fb24c4b59581..e845aed3cc5b42 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_metadata_thunk.cc
@@ -72,8 +72,8 @@ CollectiveConfig CollectiveMetadataThunk::GetCollectiveConfig(
     }
   }
 
-  config.group_mode =
-      CollectiveOpGroupMode::COLLECTIVE_OP_GROUP_MODE_CROSS_REPLICA;
+  config.group_mode = CollectiveOpGroupMode::
+      COLLECTIVE_OP_GROUP_MODE_CROSS_REPLICA_AND_PARTITION;
 
   return config;
 }
diff --git a/third_party/xla/xla/tests/collective_metadata_test.cc b/third_party/xla/xla/tests/collective_metadata_test.cc
index b5655496b18eb8..836b0157f1bba4 100644
--- a/third_party/xla/xla/tests/collective_metadata_test.cc
+++ b/third_party/xla/xla/tests/collective_metadata_test.cc
@@ -107,6 +107,47 @@ TEST_F(CollectiveMetadataTest, ConstructCollectiveMetadata) {
   }
 }
 
+TEST_F(CollectiveMetadataTest, ConstructCollectiveMetadataForPartitions) {
+  const absl::string_view kModuleStr = R"(
+  HloModule test, allow_spmd_sharding_propagation_to_parameters={true}, allow_spmd_sharding_propagation_to_output={true}, num_partitions=2
+
+  ENTRY test_computation {
+    param_0 = f32[4] parameter(0)
+    param_1 = f32[4] parameter(1)
+
+    const_0 = f32[1] constant({10})
+
+    result_tuple = (f32[4], f32[4]{0}, f32[1], u64[9]) custom-call(param_0, param_1, const_0), custom_call_target="CollectiveMetadata", output_to_operand_aliasing={{0}: (0, {}), {1}: (1, {})}
+    ROOT get_tuple_element = u64[9] get-tuple-element(result_tuple), index=3
+  })";
+
+  constexpr int kNumPartitions = 2;
+  ASSERT_GE(hlo_runner_->device_count(), kNumPartitions)
+      << "Test requires at least " << kNumPartitions << " devices ("
+      << hlo_runner_->device_count() << " available)";
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto unoptimized_module,
+      ParseAndReturnVerifiedModule(kModuleStr, /*replica_count=*/1,
+                                   /*num_partitions=*/kNumPartitions));
+
+  Literal input_0 = LiteralUtil::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f});
+  Literal input_1 = LiteralUtil::CreateR1<float>({1.0f, 2.0f, 3.0f, 4.0f});
+  TF_ASSERT_OK_AND_ASSIGN(
+      ExecutionResult execution_result,
+      ExecuteReplicated(std::move(unoptimized_module),
+                        /*arguments=*/std::vector<Literal*>{&input_0, &input_1},
+                        /*run_hlo_passes=*/false));
+  const std::vector<Literal>& result = execution_result.results;
+  ASSERT_EQ(result.size(), kNumPartitions);
+
+  absl::Span<const uint64_t> first_result_data = result[0].data<uint64_t>();
+  absl::Span<const uint64_t> second_result_data = result[1].data<uint64_t>();
+  constexpr int kNumElements = 9;
+  ASSERT_EQ(first_result_data.size(), kNumElements);
+  ASSERT_EQ(second_result_data.size(), kNumElements);
+}
+
 TEST_F(CollectiveMetadataTest, BuildMultimemOnlyOncePerModuleExecution) {
   const absl::string_view kModuleStr = R"(
   HloModule test, replica_count=2

From 50c19ba0223294678c289f822db9d4bcaa640428 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 06:47:02 -0800
Subject: [PATCH 526/753] Apply llvm-use-new-mlir-op-builder fixes

This migrates `builder.create<Op>()` => `Op::create()`

PiperOrigin-RevId: 846246070
---
 .../deallocation/transforms/buffer_reuse.cc   |   4 +-
 .../xla/mlir_hlo/deallocation/utils/util.cc   |  20 +-
 .../xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc       | 290 +++++++++---------
 .../expand_hlo_tuples/expand_hlo_tuples.cc    |   6 +-
 .../hlo_legalize_to_memref.cc                 |  35 ++-
 .../hlo_legalize_to_stablehlo.cc              |  10 +-
 .../legalize_einsum_to_dot_general.cc         |   6 +-
 ...legalize_trigonometric_to_approximation.cc |  85 ++---
 .../mhlo_flatten_tuple/mhlo_flatten_tuple.cc  |  12 +-
 .../transforms/optimize_mhlo/optimize_mhlo.cc |  22 +-
 .../prepare_for_export/prepare_for_export.cc  |  10 +-
 .../stablehlo_legalize_to_hlo.cc              |   6 +-
 .../unfuse_batch_norm/unfuse_batch_norm.cc    |  98 +++---
 .../transforms/sdy_refine_shapes.cpp          |   4 +-
 .../stablehlo_add_quant_dequant_conv.cpp      |  11 +-
 .../stablehlo_canonicalize_dynamism.cpp       |   6 +-
 ...stablehlo_canonicalize_from_hlo_import.cpp |  20 +-
 .../stablehlo_legalize_quant_composite.cpp    |   4 +-
 .../stablehlo_prepare_for_hlo_export.cpp      |  14 +-
 .../mlir_hlo/transforms/alloc_to_arg_pass.cc  |   4 +-
 .../xla/xla/mlir_hlo/transforms/bufferize.cc  | 285 +++++++++--------
 .../xla/mlir_hlo/transforms/bufferize_pass.cc |   6 +-
 .../transforms/detensorize_scf_ops.cc         |  10 +-
 .../transforms/lower_index_cast_pass.cc       |   8 +-
 .../mlir_hlo/transforms/tile_loops_pass.cc    |   2 +-
 .../xla/mlir_hlo/transforms/vectorize_copy.cc |  22 +-
 .../open_while_free_vars_sharding.cc          |   5 +-
 .../export_callback_custom_calls.cc           |   4 +-
 .../xla/xla/service/spmd/shardy/utils.cc      |  12 +-
 29 files changed, 520 insertions(+), 501 deletions(-)

diff --git a/third_party/xla/xla/mlir_hlo/deallocation/transforms/buffer_reuse.cc b/third_party/xla/xla/mlir_hlo/deallocation/transforms/buffer_reuse.cc
index 6b598b2371cd30..95c9934d4c7bc6 100644
--- a/third_party/xla/xla/mlir_hlo/deallocation/transforms/buffer_reuse.cc
+++ b/third_party/xla/xla/mlir_hlo/deallocation/transforms/buffer_reuse.cc
@@ -394,8 +394,8 @@ bool hoistAllocs(Block& block) {
 void promoteToStack(memref::DeallocOp dealloc) {
   auto alloc = dealloc.getMemref().getDefiningOp<memref::AllocOp>();
   OpBuilder b(alloc);
-  auto alloca = b.create<memref::AllocaOp>(
-      alloc->getLoc(), mlir::cast<MemRefType>(alloc->getResultTypes()[0]),
+  auto alloca = memref::AllocaOp::create(
+      b, alloc->getLoc(), mlir::cast<MemRefType>(alloc->getResultTypes()[0]),
       alloc.getAlignmentAttr());
   alloc->replaceAllUsesWith(ValueRange{alloca.getResult()});
   alloc->erase();
diff --git a/third_party/xla/xla/mlir_hlo/deallocation/utils/util.cc b/third_party/xla/xla/mlir_hlo/deallocation/utils/util.cc
index dc5014afc09bd0..b470ad53c61a63 100644
--- a/third_party/xla/xla/mlir_hlo/deallocation/utils/util.cc
+++ b/third_party/xla/xla/mlir_hlo/deallocation/utils/util.cc
@@ -85,25 +85,25 @@ RegionBranchOpInterface moveRegionsToNewOpButKeepOldOp(
   OpBuilder b(op);
   RegionBranchOpInterface newOp;
   if (llvm::isa<scf::ForOp>(op)) {
-    newOp = b.create<scf::ForOp>(op.getLoc(), op->getOperands()[0],
-                                 op->getOperands()[1], op->getOperands()[2],
-                                 op->getOperands().drop_front(3));
+    newOp = scf::ForOp::create(b, op.getLoc(), op->getOperands()[0],
+                               op->getOperands()[1], op->getOperands()[2],
+                               op->getOperands().drop_front(3));
   } else if (llvm::isa<scf::WhileOp>(op)) {
-    newOp = b.create<scf::WhileOp>(
-        op.getLoc(),
+    newOp = scf::WhileOp::create(
+        b, op.getLoc(),
         TypeRange{op->getRegion(0).front().getTerminator()->getOperands()}
             .drop_front(),
         op->getOperands());
   } else if (llvm::isa<scf::IfOp>(op)) {
-    newOp = b.create<scf::IfOp>(
-        op.getLoc(),
+    newOp = scf::IfOp::create(
+        b, op.getLoc(),
         TypeRange{op->getRegion(0).front().getTerminator()->getOperands()},
         op->getOperands()[0], op->getNumRegions() > 1);
   } else if (llvm::isa<scf::ParallelOp>(op)) {
     auto parallel = llvm::cast<scf::ParallelOp>(op);
-    newOp = b.create<scf::ParallelOp>(
-        op.getLoc(), parallel.getLowerBound(), parallel.getUpperBound(),
-        parallel.getStep(), parallel.getInitVals());
+    newOp = scf::ParallelOp::create(b, op.getLoc(), parallel.getLowerBound(),
+                                    parallel.getUpperBound(),
+                                    parallel.getStep(), parallel.getInitVals());
   } else {
     llvm_unreachable("unsupported");
   }
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
index 6e91413b1149cb..48d15aaafbe12c 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/IR/hlo_ops.cc
@@ -239,7 +239,7 @@ static void replaceOpWithRegion(PatternRewriter& rewriter, Operation* op,
 Value maybeCastTo(OpBuilder& b, Location loc, Value value, Type type) {
   if (type == value.getType()) return value;
   assert(type.isIndex() || value.getType().isIndex());
-  return b.create<arith::IndexCastOp>(loc, type, value);
+  return arith::IndexCastOp::create(b, loc, type, value);
 }
 
 DenseElementsAttr reshape(DenseElementsAttr attr, ShapedType newType) {
@@ -941,26 +941,26 @@ LogicalResult DotGeneralOp::reifyReturnTypeShapes(
   SmallVector<Value> dimensions;
   for (const int64_t lhsDim : dimNumbers.getLhsBatchingDimensions()) {
     dimensions.push_back(
-        builder.create<tensor::DimOp>(getLoc(), adaptor.getLhs(), lhsDim));
+        tensor::DimOp::create(builder, getLoc(), adaptor.getLhs(), lhsDim));
   }
 
   for (int64_t i = 0; i < lhsType.getRank(); i++) {
     if (!llvm::is_contained(dimNumbers.getLhsContractingDimensions(), i) &&
         !llvm::is_contained(dimNumbers.getLhsBatchingDimensions(), i)) {
       dimensions.push_back(
-          builder.create<tensor::DimOp>(getLoc(), adaptor.getLhs(), i));
+          tensor::DimOp::create(builder, getLoc(), adaptor.getLhs(), i));
     }
   }
   for (int64_t i = 0; i < rhsType.getRank(); i++) {
     if (!llvm::is_contained(dimNumbers.getRhsContractingDimensions(), i) &&
         !llvm::is_contained(dimNumbers.getRhsBatchingDimensions(), i)) {
       dimensions.push_back(
-          builder.create<tensor::DimOp>(getLoc(), adaptor.getRhs(), i));
+          tensor::DimOp::create(builder, getLoc(), adaptor.getRhs(), i));
     }
   }
 
   reifiedReturnShapes.push_back(
-      builder.create<tensor::FromElementsOp>(getLoc(), dimensions));
+      tensor::FromElementsOp::create(builder, getLoc(), dimensions));
   return success();
 }
 
@@ -1491,11 +1491,11 @@ struct GatherSlice : public OpRewritePattern<GatherOp> {
     }
     Type elementType = cast<TensorType>(gather.getType()).getElementType();
     auto sliceType = RankedTensorType::get(sliceShape, elementType);
-    Value result = rewriter.create<SliceOp>(
-        gather.getLoc(), sliceType, gather.getOperand(),
-        rewriter.getI64TensorAttr(sliceStart),
-        rewriter.getI64TensorAttr(sliceEnd),
-        rewriter.getI64TensorAttr(sliceStride));
+    Value result = SliceOp::create(rewriter, gather.getLoc(), sliceType,
+                                   gather.getOperand(),
+                                   rewriter.getI64TensorAttr(sliceStart),
+                                   rewriter.getI64TensorAttr(sliceEnd),
+                                   rewriter.getI64TensorAttr(sliceStride));
 
     auto collapsedSliceDims = dnums.getCollapsedSliceDims();
     if (!collapsedSliceDims.empty()) {
@@ -1506,7 +1506,8 @@ struct GatherSlice : public OpRewritePattern<GatherOp> {
         }
       }
       auto reshapeType = RankedTensorType::get(reshapeShape, elementType);
-      result = rewriter.create<ReshapeOp>(gather.getLoc(), reshapeType, result);
+      result =
+          ReshapeOp::create(rewriter, gather.getLoc(), reshapeType, result);
     }
 
     result.setType(gather.getType());
@@ -1541,7 +1542,7 @@ void getSliceSizeValues(GatherOp* gather, OpBuilder& builder, Location loc,
                         ValueRange operands,
                         SmallVectorImpl<Value>& sliceSizes) {
   for (int64_t val : gather->getSliceSizes().getValues<int64_t>()) {
-    sliceSizes.push_back(builder.create<arith::ConstantIndexOp>(loc, val));
+    sliceSizes.push_back(arith::ConstantIndexOp::create(builder, loc, val));
   }
 }
 
@@ -1552,9 +1553,9 @@ void getSliceSizeValues(DynamicGatherOp* /*dGather*/, OpBuilder& builder,
   Value sliceSizes = adaptor.getSliceSizes();
   auto sliceSizesTy = cast<ShapedType>(sliceSizes.getType());
   for (int64_t i = 0; i < sliceSizesTy.getDimSize(0); ++i) {
-    Value idx = builder.create<arith::ConstantIndexOp>(loc, i);
+    Value idx = arith::ConstantIndexOp::create(builder, loc, i);
     sliceSizeValues.push_back(
-        builder.create<tensor::ExtractOp>(loc, sliceSizes, idx));
+        tensor::ExtractOp::create(builder, loc, sliceSizes, idx));
   }
 }
 
@@ -1582,7 +1583,7 @@ LogicalResult reifyGatherShape(Op* op, OpBuilder& builder, ValueRange operands,
 
   auto getStartIndicesDim = [&](int64_t index) {
     return toShapeElType(
-        builder.create<tensor::DimOp>(loc, startIndices, index));
+        tensor::DimOp::create(builder, loc, startIndices, index));
   };
   SmallVector<Value, 4> shapeValues;
   auto getSliceDim = [&sliceSizes](int64_t index) -> Value {
@@ -1596,8 +1597,9 @@ LogicalResult reifyGatherShape(Op* op, OpBuilder& builder, ValueRange operands,
                            op->getDimensionNumbers().getIndexVectorDim(),
                            shapeValues);
 
-  Value outputShape = builder.create<tensor::FromElementsOp>(
-      loc, RankedTensorType::get({resultRank}, shapeElTy), shapeValues);
+  Value outputShape = tensor::FromElementsOp::create(
+      builder, loc, RankedTensorType::get({resultRank}, shapeElTy),
+      shapeValues);
   reifiedReturnShapes.push_back(outputShape);
 
   return success();
@@ -1742,8 +1744,8 @@ struct IotaBroadcast : public OpRewritePattern<IotaOp> {
     auto iotaType = RankedTensorType::get({resultTy.getDimSize(iotaDimension)},
                                           resultTy.getElementType());
 
-    auto newIota = rewriter.create<IotaOp>(iota.getLoc(), iotaType,
-                                           rewriter.getI64IntegerAttr(0));
+    auto newIota = IotaOp::create(rewriter, iota.getLoc(), iotaType,
+                                  rewriter.getI64IntegerAttr(0));
 
     auto broadcastAttr = DenseIntElementsAttr::get(
         RankedTensorType::get({1}, rewriter.getIntegerType(64)),
@@ -1808,21 +1810,21 @@ struct DynamicIotaBroadcast : public OpRewritePattern<DynamicIotaOp> {
     auto iotaDimension = iota.getIotaDimension();
     auto iotaDimensionInt = iotaDimension;
 
-    auto convertedShape = rewriter.create<arith::IndexCastOp>(
-        iota.getLoc(),
+    auto convertedShape = arith::IndexCastOp::create(
+        rewriter, iota.getLoc(),
         RankedTensorType::get(
             cast<ShapedType>(iota.getOutputShape().getType()).getShape(),
             rewriter.getI64Type()),
         iota.getOutputShape());
 
-    auto slicedShape = rewriter.create<SliceOp>(
-        iota.getLoc(), convertedShape,
-        rewriter.getI64TensorAttr(iotaDimensionInt),
-        rewriter.getI64TensorAttr(iotaDimensionInt + 1),
-        rewriter.getI64TensorAttr(1));
+    auto slicedShape =
+        SliceOp::create(rewriter, iota.getLoc(), convertedShape,
+                        rewriter.getI64TensorAttr(iotaDimensionInt),
+                        rewriter.getI64TensorAttr(iotaDimensionInt + 1),
+                        rewriter.getI64TensorAttr(1));
 
-    auto convertedSlicedShape = rewriter.create<arith::IndexCastOp>(
-        iota.getLoc(),
+    auto convertedSlicedShape = arith::IndexCastOp::create(
+        rewriter, iota.getLoc(),
         RankedTensorType::get(
             {1},
             cast<ShapedType>(iota.getOutputShape().getType()).getElementType()),
@@ -1831,9 +1833,9 @@ struct DynamicIotaBroadcast : public OpRewritePattern<DynamicIotaOp> {
     auto iotaType = RankedTensorType::get(
         {resultTy.getDimSize(iotaDimensionInt)}, resultTy.getElementType());
 
-    auto newIota = rewriter.create<DynamicIotaOp>(
-        iota.getLoc(), iotaType, convertedSlicedShape,
-        rewriter.getI64IntegerAttr(0));
+    auto newIota = DynamicIotaOp::create(rewriter, iota.getLoc(), iotaType,
+                                         convertedSlicedShape,
+                                         rewriter.getI64IntegerAttr(0));
 
     auto broadcastAttr = DenseIntElementsAttr::get(
         RankedTensorType::get({1}, rewriter.getIntegerType(64)),
@@ -1857,7 +1859,7 @@ static Value castToIndexTensor(OpBuilder& builder, Location loc,
   ShapedType resultTy = shape::getExtentTensorType(
       builder.getContext(), cast<ShapedType>(shapeOp.getType()).getDimSize(0));
   if (shapeOp.getType() == resultTy) return shapeOp;  // Nothing to do.
-  return builder.create<arith::IndexCastOp>(loc, resultTy, shapeOp);
+  return arith::IndexCastOp::create(builder, loc, resultTy, shapeOp);
 }
 
 LogicalResult DynamicIotaOp::reifyReturnTypeShapes(
@@ -2046,8 +2048,8 @@ struct ConvolutionIsDot : public OpRewritePattern<mhlo::ConvolutionOp> {
 
       auto dotNums = DotDimensionNumbersAttr::get(
           op.getContext(), {}, {}, {lhsContractDim}, {rhsContractDim});
-      auto dotOp = rewriter.create<mhlo::DotGeneralOp>(
-          op.getLoc(), op.getType(), lhs, rhs, dotNums,
+      auto dotOp = mhlo::DotGeneralOp::create(
+          rewriter, op.getLoc(), op.getType(), lhs, rhs, dotNums,
           op.getPrecisionConfig().value_or(nullptr), DotAlgorithmAttr{});
 
       rewriter.replaceOp(op, dotOp.getResult());
@@ -2072,8 +2074,8 @@ struct ConvolutionIsDot : public OpRewritePattern<mhlo::ConvolutionOp> {
     lhsTy = RankedTensorType::get(lhsShape, lhsTy.getElementType());
     rhsTy = RankedTensorType::get(rhsShape, rhsTy.getElementType());
 
-    lhs = rewriter.create<mhlo::ReshapeOp>(op.getLoc(), lhsTy, lhs);
-    rhs = rewriter.create<mhlo::ReshapeOp>(op.getLoc(), rhsTy, rhs);
+    lhs = mhlo::ReshapeOp::create(rewriter, op.getLoc(), lhsTy, lhs);
+    rhs = mhlo::ReshapeOp::create(rewriter, op.getLoc(), rhsTy, rhs);
 
     auto dotTy = RankedTensorType::get(
         {featureGroupCount, lhsBatchSize, rhsBatchSize / featureGroupCount},
@@ -2082,8 +2084,8 @@ struct ConvolutionIsDot : public OpRewritePattern<mhlo::ConvolutionOp> {
     auto dotNums = DotDimensionNumbersAttr::get(
         op.getContext(), {lhsContractDim}, {rhsContractDim},
         {lhsContractDim + 1}, {rhsContractDim == 0 ? 2 : 0});
-    auto dotOp = rewriter.create<mhlo::DotGeneralOp>(
-        op.getLoc(), dotTy, lhs, rhs, dotNums,
+    auto dotOp = mhlo::DotGeneralOp::create(
+        rewriter, op.getLoc(), dotTy, lhs, rhs, dotNums,
         op.getPrecisionConfig().value_or(nullptr), DotAlgorithmAttr{});
 
     llvm::SmallVector<int64_t> perms;
@@ -2095,8 +2097,9 @@ struct ConvolutionIsDot : public OpRewritePattern<mhlo::ConvolutionOp> {
         {dotTy.getDimSize(perms[0]), dotTy.getDimSize(perms[1]),
          dotTy.getDimSize(perms[2])},
         dotTy.getElementType());
-    auto transposeOp = rewriter.create<mhlo::TransposeOp>(
-        op.getLoc(), transposeTy, dotOp, rewriter.getI64TensorAttr(perms));
+    auto transposeOp =
+        mhlo::TransposeOp::create(rewriter, op.getLoc(), transposeTy, dotOp,
+                                  rewriter.getI64TensorAttr(perms));
 
     rewriter.replaceOpWithNewOp<mhlo::ReshapeOp>(op, resultTy, transposeOp);
     return success();
@@ -2290,8 +2293,8 @@ struct EliminateRedundantConvert : public OpRewritePattern<ConvertOp> {
       // like fp16 -> fp32 -> fp64, bf16 -> fp32 -> fp16
       if (cast<FloatType>(secondType).getWidth() >
           cast<FloatType>(firstType).getWidth()) {
-        Value result = rewriter.create<ConvertOp>(loc, op.getResult().getType(),
-                                                  convertOp.getOperand());
+        Value result = ConvertOp::create(
+            rewriter, loc, op.getResult().getType(), convertOp.getOperand());
         rewriter.replaceOp(op, result);
         return success();
       }
@@ -2301,8 +2304,8 @@ struct EliminateRedundantConvert : public OpRewritePattern<ConvertOp> {
       // like i16 -> i32 -> i64, u16 -> i32 -> u32
       if (cast<IntegerType>(secondType).getWidth() >
           cast<IntegerType>(firstType).getWidth()) {
-        Value result = rewriter.create<ConvertOp>(loc, op.getResult().getType(),
-                                                  convertOp.getOperand());
+        Value result = ConvertOp::create(
+            rewriter, loc, op.getResult().getType(), convertOp.getOperand());
         rewriter.replaceOp(op, result);
         return success();
       }
@@ -2702,7 +2705,7 @@ LogicalResult BroadcastOp::reifyReturnTypeShapes(
   // Collect the broadcast sizes.
   for (const auto& size : getBroadcastSizes()) {
     shapeValues.push_back(
-        builder.create<arith::ConstantIndexOp>(loc, size.getZExtValue()));
+        arith::ConstantIndexOp::create(builder, loc, size.getZExtValue()));
   }
 
   // Collect the operand sizes.
@@ -2711,8 +2714,8 @@ LogicalResult BroadcastOp::reifyReturnTypeShapes(
         builder.createOrFold<tensor::DimOp>(loc, operand, index));
   }
 
-  reifiedReturnShapes.push_back(builder.create<tensor::FromElementsOp>(
-      loc,
+  reifiedReturnShapes.push_back(tensor::FromElementsOp::create(
+      builder, loc,
       RankedTensorType::get({static_cast<int64_t>(shapeValues.size())},
                             builder.getIndexType()),
       shapeValues));
@@ -2874,7 +2877,8 @@ namespace {
 template <typename OpTy, typename... Args>
 OpTy refineOpWithNewOp(PatternRewriter& rewriter, Operation* op,
                        Args&&... args) {
-  auto newOp = rewriter.create<OpTy>(op->getLoc(), std::forward<Args>(args)...);
+  auto newOp =
+      OpTy::create(rewriter, op->getLoc(), std::forward<Args>(args)...);
 
   llvm::SmallVector<Value> replacementResults;
   assert(op->getNumResults() == newOp->getNumResults() &&
@@ -2885,8 +2889,8 @@ OpTy refineOpWithNewOp(PatternRewriter& rewriter, Operation* op,
     if (llvm::any_of(opResult.getUsers(), [&](Operation* user) {
           return user->getDialect() != op->getDialect();
         })) {
-      replacementResult = rewriter.create<tensor::CastOp>(
-          op->getLoc(), opResult.getType(), newOpResult);
+      replacementResult = tensor::CastOp::create(
+          rewriter, op->getLoc(), opResult.getType(), newOpResult);
     }
     replacementResults.push_back(replacementResult);
   }
@@ -3274,7 +3278,7 @@ LogicalResult ConcatenateOp::reifyReturnTypeShapes(
     SmallVector<Value, 4> shapeVals;
     for (const auto& element : llvm::enumerate(operandType.getShape())) {
       Value valueDim = toShapeScalarType(
-          builder.create<tensor::DimOp>(loc, operand, element.index()));
+          tensor::DimOp::create(builder, loc, operand, element.index()));
       shapeVals.push_back(valueDim);
     }
     allShapeValues.emplace_back(std::move(shapeVals));
@@ -3289,12 +3293,12 @@ LogicalResult ConcatenateOp::reifyReturnTypeShapes(
           << "Concatenate expects all operands must be of the same rank";
       return failure();
     }
-    shapeValues[axis] = builder.create<arith::AddIOp>(loc, shapeValues[axis],
-                                                      otherShapeValues[axis]);
+    shapeValues[axis] = arith::AddIOp::create(builder, loc, shapeValues[axis],
+                                              otherShapeValues[axis]);
   }
 
-  Value outputShape = builder.create<tensor::FromElementsOp>(
-      loc,
+  Value outputShape = tensor::FromElementsOp::create(
+      builder, loc,
       RankedTensorType::get({static_cast<int64_t>(shapeValues.size())},
                             shapeScalarType),
       shapeValues);
@@ -3489,8 +3493,8 @@ struct DynamicSliceToSlice : public OpRewritePattern<DynamicSliceOp> {
         sliceStartIndices, dynamicSlice.getSliceSizes(), &rewriter);
     DenseIntElementsAttr sliceStrides =
         rewriter.getI64TensorAttr(SmallVector<int64_t, 4>(inputRank, 1));
-    auto result = rewriter.create<SliceOp>(loc, input, sliceStartIndices,
-                                           sliceLimits, sliceStrides);
+    auto result = SliceOp::create(rewriter, loc, input, sliceStartIndices,
+                                  sliceLimits, sliceStrides);
     rewriter.replaceOp(dynamicSlice, result);
     return success();
   }
@@ -3568,14 +3572,15 @@ struct RealDSliceToDSlice : public OpRewritePattern<RealDynamicSliceOp> {
     // Adapt accordingly in order to be compatible with DynamicSliceOp.
     SmallVector<Value> startIndices;
     for (auto i = 0; i < static_cast<int64_t>(sliceSizes.size()); ++i) {
-      auto startIndex1D = rewriter.create<SliceOp>(
-          op.getLoc(), op.getStartIndices(), rewriter.getI64TensorAttr(i),
-          rewriter.getI64TensorAttr(i + 1), rewriter.getI64TensorAttr(1));
+      auto startIndex1D = SliceOp::create(
+          rewriter, op.getLoc(), op.getStartIndices(),
+          rewriter.getI64TensorAttr(i), rewriter.getI64TensorAttr(i + 1),
+          rewriter.getI64TensorAttr(1));
       auto startIndex0DType = RankedTensorType::get(
           {},
           cast<ShapedType>(op.getStartIndices().getType()).getElementType());
-      auto startIndex0D = rewriter.create<ReshapeOp>(
-          op.getLoc(), startIndex0DType, startIndex1D);
+      auto startIndex0D = ReshapeOp::create(rewriter, op.getLoc(),
+                                            startIndex0DType, startIndex1D);
       startIndices.push_back(startIndex0D);
     }
 
@@ -3610,29 +3615,31 @@ LogicalResult RealDynamicSliceOp::reifyReturnTypeShapes(
   shapeValues.reserve(operandType.getRank());
   Type shapeScalarType =
       cast<ShapedType>(startIndices.getType()).getElementType();
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  Value one = arith::ConstantIndexOp::create(builder, loc, 1);
   one = maybeCastTo(builder, loc, one, shapeScalarType);
   for (const auto& element : llvm::enumerate(operandType.getShape())) {
-    Value offset = builder.create<arith::ConstantIndexOp>(loc, element.index());
+    Value offset =
+        arith::ConstantIndexOp::create(builder, loc, element.index());
     Value valueStart =
-        builder.create<tensor::ExtractOp>(loc, startIndices, offset);
+        tensor::ExtractOp::create(builder, loc, startIndices, offset);
     Value valueLimit =
-        builder.create<tensor::ExtractOp>(loc, limitIndices, offset);
-    Value valueStride = builder.create<tensor::ExtractOp>(loc, strides, offset);
+        tensor::ExtractOp::create(builder, loc, limitIndices, offset);
+    Value valueStride =
+        tensor::ExtractOp::create(builder, loc, strides, offset);
     // size = (limit - start + stride - 1) / stride
-    shapeValues.push_back(builder.create<arith::DivSIOp>(
-        loc,
-        builder.create<arith::SubIOp>(
-            loc,
-            builder.create<arith::AddIOp>(
-                loc, valueStride,
-                builder.create<arith::SubIOp>(loc, valueLimit, valueStart)),
+    shapeValues.push_back(arith::DivSIOp::create(
+        builder, loc,
+        arith::SubIOp::create(
+            builder, loc,
+            arith::AddIOp::create(
+                builder, loc, valueStride,
+                arith::SubIOp::create(builder, loc, valueLimit, valueStart)),
             one),
         valueStride));
   }
 
-  reifiedReturnShapes.push_back(builder.create<tensor::FromElementsOp>(
-      loc,
+  reifiedReturnShapes.push_back(tensor::FromElementsOp::create(
+      builder, loc,
       RankedTensorType::get({static_cast<int64_t>(shapeValues.size())},
                             shapeScalarType),
       shapeValues));
@@ -4208,8 +4215,8 @@ struct LowerBoolSplatConstantsIntoRegion : public OpRewritePattern<ReduceOp> {
     // Create new splat constants to replace block arguments.
     for (BlockArgument barg : bb.getArguments()) {
       int argIdx = barg.getArgNumber();
-      mhlo::ConstantOp newCst = rewriter.create<mhlo::ConstantOp>(
-          bb.front().getLoc(), barg.getType(), bargCstAttrs[argIdx]);
+      mhlo::ConstantOp newCst = mhlo::ConstantOp::create(
+          rewriter, bb.front().getLoc(), barg.getType(), bargCstAttrs[argIdx]);
       barg.replaceAllUsesWith(newCst);
     }
     return success();
@@ -4230,8 +4237,8 @@ static LogicalResult convertEmptyReduces(ReduceOp op,
     auto empty = rewriter.getI64TensorAttr({});
     if (t.hasStaticShape()) {
       for (auto [init, out] : llvm::zip(op.getInitValues(), op.getResults())) {
-        out.replaceAllUsesWith(rewriter.create<BroadcastInDimOp>(
-            op.getLoc(), out.getType(), init, empty));
+        out.replaceAllUsesWith(BroadcastInDimOp::create(
+            rewriter, op.getLoc(), out.getType(), init, empty));
       }
       return success();
     }
@@ -4241,8 +4248,8 @@ static LogicalResult convertEmptyReduces(ReduceOp op,
       return failure();
     for (auto [init, shape, out] :
          llvm::zip(op.getInitValues(), shapes, op.getResults())) {
-      out.replaceAllUsesWith(rewriter.create<DynamicBroadcastInDimOp>(
-          op.getLoc(), out.getType(), init, shape, empty));
+      out.replaceAllUsesWith(DynamicBroadcastInDimOp::create(
+          rewriter, op.getLoc(), out.getType(), init, shape, empty));
     }
     return success();
   }
@@ -4282,12 +4289,12 @@ LogicalResult ReduceOp::reifyReturnTypeShapes(
       continue;
     }
     Value valueDim = toShapeScalarType(
-        builder.create<tensor::DimOp>(loc, inputs[0], element.index()));
+        tensor::DimOp::create(builder, loc, inputs[0], element.index()));
     shapeValues.push_back(valueDim);
   }
 
-  Value outputShape = builder.create<tensor::FromElementsOp>(
-      loc,
+  Value outputShape = tensor::FromElementsOp::create(
+      builder, loc,
       RankedTensorType::get({static_cast<int64_t>(shapeValues.size())},
                             shapeScalarType),
       shapeValues);
@@ -4614,36 +4621,37 @@ LogicalResult PadOp::reifyReturnTypeShapes(
   for (const APInt& val : padInteriorAttr.getValues<APInt>())
     padInterior.push_back(val.getSExtValue());
 
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1).getResult();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0).getResult();
+  Value one = arith::ConstantIndexOp::create(builder, loc, 1).getResult();
+  Value zero = arith::ConstantIndexOp::create(builder, loc, 0).getResult();
 
   llvm::SmallVector<Value> dimensions;
   dimensions.reserve(operandTy.getRank());
   for (int i = 0, s = operandTy.getRank(); i < s; ++i) {
     Value padEdge =
-        builder.create<arith::ConstantIndexOp>(loc, padHigh[i] + padLow[i]);
+        arith::ConstantIndexOp::create(builder, loc, padHigh[i] + padLow[i]);
 
     // First we grab the initial interior size.
-    Value dim = builder.create<tensor::DimOp>(loc, operand, i).getResult();
+    Value dim = tensor::DimOp::create(builder, loc, operand, i).getResult();
 
     // Compute the interior of the tensor and determine padding size.
     if (padInterior[i] > 0) {
       Value padInter =
-          builder.create<arith::ConstantIndexOp>(loc, padInterior[i])
+          arith::ConstantIndexOp::create(builder, loc, padInterior[i])
               .getResult();
-      Value interior = builder.create<arith::SubIOp>(loc, dim, one).getResult();
-      interior = builder.create<arith::MaxSIOp>(loc, interior, zero);
-      interior = builder.create<arith::MulIOp>(loc, interior, padInter);
-      dim = builder.create<arith::AddIOp>(loc, dim, interior).getResult();
+      Value interior =
+          arith::SubIOp::create(builder, loc, dim, one).getResult();
+      interior = arith::MaxSIOp::create(builder, loc, interior, zero);
+      interior = arith::MulIOp::create(builder, loc, interior, padInter);
+      dim = arith::AddIOp::create(builder, loc, dim, interior).getResult();
     }
 
     // Then we add the padding on the edge of the tensor.
-    dim = builder.create<arith::AddIOp>(loc, dim, padEdge).getResult();
+    dim = arith::AddIOp::create(builder, loc, dim, padEdge).getResult();
     dimensions.push_back(dim);
   }
 
   Value dimensionTensor =
-      builder.create<tensor::FromElementsOp>(loc, dimensions).getResult();
+      tensor::FromElementsOp::create(builder, loc, dimensions).getResult();
   reifiedReturnShapes.push_back(dimensionTensor);
   return success();
 }
@@ -4767,38 +4775,40 @@ LogicalResult DynamicPadOp::reifyReturnTypeShapes(
   };
 
   Value zero =
-      toShapeScalarType(builder.create<arith::ConstantIndexOp>(loc, 0));
-  Value one = toShapeScalarType(builder.create<arith::ConstantIndexOp>(loc, 1));
+      toShapeScalarType(arith::ConstantIndexOp::create(builder, loc, 0));
+  Value one =
+      toShapeScalarType(arith::ConstantIndexOp::create(builder, loc, 1));
 
   for (int idx : llvm::seq<int>(0, operandType.getShape().size())) {
     Value valueDim =
-        toShapeScalarType(builder.create<tensor::DimOp>(loc, operand, idx));
-    Value offset = builder.create<arith::ConstantIndexOp>(loc, idx);
+        toShapeScalarType(tensor::DimOp::create(builder, loc, operand, idx));
+    Value offset = arith::ConstantIndexOp::create(builder, loc, idx);
     Value valueLow =
-        builder.create<tensor::ExtractOp>(loc, edgePaddingLow, offset);
+        tensor::ExtractOp::create(builder, loc, edgePaddingLow, offset);
     Value valueHigh =
-        builder.create<tensor::ExtractOp>(loc, edgePaddingHigh, offset);
+        tensor::ExtractOp::create(builder, loc, edgePaddingHigh, offset);
     Value valueInterior =
-        builder.create<tensor::ExtractOp>(loc, interiorPadding, offset);
+        tensor::ExtractOp::create(builder, loc, interiorPadding, offset);
     // output_size = input_size + padding_low + padding_high + interior *
     // max(input_size - 1, 0)
-    Value valueDimLessThanOne = builder.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::slt, valueDim, one);
-    Value interiorSize = builder.create<arith::MulIOp>(
-        loc, valueInterior,
-        builder.create<mlir::arith::SelectOp>(
-            loc, valueDimLessThanOne, zero,
-            builder.create<arith::SubIOp>(loc, valueDim, one)));
-    shapeValues.push_back(builder.create<arith::AddIOp>(
-        loc,
-        builder.create<arith::AddIOp>(
-            loc, builder.create<arith::AddIOp>(loc, interiorSize, valueDim),
+    Value valueDimLessThanOne = arith::CmpIOp::create(
+        builder, loc, arith::CmpIPredicate::slt, valueDim, one);
+    Value interiorSize = arith::MulIOp::create(
+        builder, loc, valueInterior,
+        mlir::arith::SelectOp::create(
+            builder, loc, valueDimLessThanOne, zero,
+            arith::SubIOp::create(builder, loc, valueDim, one)));
+    shapeValues.push_back(arith::AddIOp::create(
+        builder, loc,
+        arith::AddIOp::create(
+            builder, loc,
+            arith::AddIOp::create(builder, loc, interiorSize, valueDim),
             valueLow),
         valueHigh));
   }
 
-  reifiedReturnShapes.push_back(builder.create<tensor::FromElementsOp>(
-      loc,
+  reifiedReturnShapes.push_back(tensor::FromElementsOp::create(
+      builder, loc,
       RankedTensorType::get({static_cast<int64_t>(shapeValues.size())},
                             shapeScalarType),
       shapeValues));
@@ -5684,8 +5694,8 @@ struct SimplifyConcatSlice : public OpRewritePattern<SliceOp> {
     }
 
     auto concatRange = OperandRange(subsetStart, subsetEnd);
-    auto newConcat = rewriter.create<ConcatenateOp>(
-        concat.getLoc(), concatRange, concat.getDimension());
+    auto newConcat = ConcatenateOp::create(rewriter, concat.getLoc(),
+                                           concatRange, concat.getDimension());
 
     llvm::SmallVector<APInt, 6> newStart(start);
     llvm::SmallVector<APInt, 6> newLimit(limit);
@@ -5693,10 +5703,10 @@ struct SimplifyConcatSlice : public OpRewritePattern<SliceOp> {
     newLimit[dimension] -= frontOffset;
 
     auto attrType = cast<ShapedType>(slice.getStartIndices().getType());
-    auto create = rewriter.create<SliceOp>(
-        slice.getLoc(), newConcat,
-        DenseIntElementsAttr::get(attrType, newStart),
-        DenseIntElementsAttr::get(attrType, newLimit), slice.getStrides());
+    auto create = SliceOp::create(rewriter, slice.getLoc(), newConcat,
+                                  DenseIntElementsAttr::get(attrType, newStart),
+                                  DenseIntElementsAttr::get(attrType, newLimit),
+                                  slice.getStrides());
     rewriter.replaceOp(slice, create.getResult());
     return success();
   }
@@ -5763,8 +5773,8 @@ static LogicalResult sortDropEmptyUseArgs(SortOp op,
     }
   }
 
-  auto newOp = rewriter.create<SortOp>(op.getLoc(), newOperands,
-                                       op.getDimension(), op.getIsStable());
+  auto newOp = SortOp::create(rewriter, op.getLoc(), newOperands,
+                              op.getDimension(), op.getIsStable());
   Region& region = newOp.getComparator();
   rewriter.inlineRegionBefore(op.getComparator(), region, region.end());
   region.front().eraseArguments(erasedBlockArgs);
@@ -5795,9 +5805,8 @@ static LogicalResult sortOpInferDefaultDimension(SortOp op,
   }
 
   IntegerAttr dim = rewriter.getI64IntegerAttr(ty.getRank() - 1);
-  auto newOp =
-      rewriter.create<SortOp>(op.getLoc(), op.getResultTypes(), op.getInputs(),
-                              dim, op.getIsStableAttr());
+  auto newOp = SortOp::create(rewriter, op.getLoc(), op.getResultTypes(),
+                              op.getInputs(), dim, op.getIsStableAttr());
   Region& region = newOp.getComparator();
   rewriter.inlineRegionBefore(op.getComparator(), region, region.end());
   rewriter.replaceOp(op, newOp.getResults());
@@ -5964,8 +5973,8 @@ LogicalResult TransposeOp::reifyReturnTypeShapes(
     shapeValues[std::distance(permutation.begin(), it)] = valueDim;
   }
 
-  Value outputShape = builder.create<tensor::FromElementsOp>(
-      loc,
+  Value outputShape = tensor::FromElementsOp::create(
+      builder, loc,
       RankedTensorType::get({static_cast<int64_t>(shapeValues.size())},
                             shapeScalarType),
       shapeValues);
@@ -6432,8 +6441,8 @@ struct ScatterFullReplace : public OpRewritePattern<ScatterOp> {
 
     auto dimensions =
         llvm::to_vector(llvm::seq<int64_t>(0, baseType.getRank()));
-    auto map = rewriter.create<mhlo::MapOp>(
-        scatter.getLoc(), scatter->getResultTypes(),
+    auto map = mhlo::MapOp::create(
+        rewriter, scatter.getLoc(), scatter->getResultTypes(),
         ValueRange{scatter.getOperands()[0], scatter.getUpdates()[0]},
         rewriter.getI64TensorAttr(dimensions));
     rewriter.inlineRegionBefore(scatter.getRegion(), map.getRegion(),
@@ -6536,9 +6545,9 @@ static LogicalResult whileCanonicalization(WhileOp whileOp,
   for (int idx : llvm::reverse(invariantArgIdxs))
     bodyReturnOp->eraseOperand(idx);
 
-  WhileOp newWhileOp = rewriter.create<WhileOp>(
-      whileOp.getLoc(), bodyReturnOp->getOperandTypes(), newOperands,
-      whileOp->getAttrs());
+  WhileOp newWhileOp = WhileOp::create(rewriter, whileOp.getLoc(),
+                                       bodyReturnOp->getOperandTypes(),
+                                       newOperands, whileOp->getAttrs());
   newWhileOp.getBodyRegion(0).takeBody(whileOp.getBodyRegion(0));
   newWhileOp.getBodyRegion(1).takeBody(whileOp.getBodyRegion(1));
   for (auto results : llvm::zip(resultsToReplace, newWhileOp->getResults()))
@@ -7546,10 +7555,11 @@ static void buildSortComparisonBody(llvm::ArrayRef<Type> elementTypes,
     typeAttr = symbolizeComparisonType(*compareType).value();
   else
     typeAttr = ComparisonType::NOTYPE;
-  Value compare = builder->create<mhlo::CompareOp>(
-      loc, block->getArgument(0), block->getArgument(1), direction, typeAttr);
+  Value compare =
+      mhlo::CompareOp::create(*builder, loc, block->getArgument(0),
+                              block->getArgument(1), direction, typeAttr);
 
-  builder->create<mhlo::ReturnOp>(loc, compare);
+  mhlo::ReturnOp::create(*builder, loc, compare);
 }
 
 SortOp createSortOp(PatternRewriter* rewriter, const Location& loc,
@@ -7559,7 +7569,7 @@ SortOp createSortOp(PatternRewriter* rewriter, const Location& loc,
   assert(!operands.empty() && "No operands to sort");
   // Create the sort op.
   auto sortOp =
-      rewriter->create<mhlo::SortOp>(loc, operands, dimension, isStable);
+      mhlo::SortOp::create(*rewriter, loc, operands, dimension, isStable);
 
   // Use TOTALORDER comparison type instead of the default comparison if the
   // element type is of type float.
@@ -7595,13 +7605,13 @@ Operation* MhloDialect::materializeConstant(OpBuilder& builder, Attribute value,
           (attrShapedType.getShape() != resultShapedType.getShape()))
         return nullptr;
     }
-    return builder.create<mhlo::ConstantOp>(loc, type, elementsAttr);
+    return mhlo::ConstantOp::create(builder, loc, type, elementsAttr);
   }
   // HLO dialect constants require the type of value and result to match for
   // non-quantized tensors.
   if (type != elementsAttr.getType()) return nullptr;
 
-  return builder.create<mhlo::ConstantOp>(loc, type, elementsAttr);
+  return mhlo::ConstantOp::create(builder, loc, type, elementsAttr);
 }
 
 static int64_t getNumLeafBuffers(Type type) {
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/expand_hlo_tuples/expand_hlo_tuples.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/expand_hlo_tuples/expand_hlo_tuples.cc
index 8d3378e9c0d453..561d072ca3a672 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/expand_hlo_tuples/expand_hlo_tuples.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/expand_hlo_tuples/expand_hlo_tuples.cc
@@ -93,7 +93,7 @@ class ExpandHloTuplesPass
         OpBuilder builder(func.getBody());
         builder.setInsertionPointToStart(&func.getBody().front());
         auto newTuple =
-            builder.create<mhlo::TupleOp>(loc, tupleType, flattenedOperands);
+            mhlo::TupleOp::create(builder, loc, tupleType, flattenedOperands);
         func.getArgument(originalArgumentIndex).replaceAllUsesWith(newTuple);
 
         // Now the original argument has been rewired, we should be able to
@@ -129,8 +129,8 @@ class ExpandHloTuplesPass
       return success();
     }
 
-    builder.create<mlir::func::ReturnOp>(returnOp.getLoc(),
-                                         expandedReturnOperands);
+    mlir::func::ReturnOp::create(builder, returnOp.getLoc(),
+                                 expandedReturnOperands);
     returnOp.erase();
     auto newFuncType = FunctionType::get(
         oldFuncType.getContext(), expandedInputTypes, expandedResultTypes);
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc
index 7f3c6b3e24f8cf..88d1c06712800c 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_memref/hlo_legalize_to_memref.cc
@@ -141,8 +141,8 @@ struct DynamicReshapeOpInterface
       if (failed(tensorAlloc)) return failure();
       auto memrefType =
           MemRefType::get(bufferType.getShape(), bufferType.getElementType());
-      operand = rewriter.create<bufferization::ToBufferOp>(
-          op->getLoc(), memrefType, *tensorAlloc);
+      operand = bufferization::ToBufferOp::create(rewriter, op->getLoc(),
+                                                  memrefType, *tensorAlloc);
     }
     bufferization::replaceOpWithNewBufferizedOp<memref::ReshapeOp>(
         rewriter, op, resultType, operand, *outputShapeBuffer);
@@ -165,8 +165,8 @@ FailureOr<Value> insertDynamicMemrefCastOp(
   auto resultType = mlir::cast<RankedTensorType>(op.getType());
   auto resultRank = resultType.getRank();
 
-  Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
+  Value one = arith::ConstantIndexOp::create(rewriter, loc, 1);
 
   // Compute a reversed scan product. Compute the stride for the dimensions so
   // far, working from minor to major dimensions. Additionally, save the
@@ -177,15 +177,15 @@ FailureOr<Value> insertDynamicMemrefCastOp(
   for (int i = operandRank - 1; i >= 0; --i) {
     Value operandDimSize =
         ShapedType::isDynamic(operandShape[i])
-            ? rewriter.create<memref::DimOp>(loc, operand, i).getResult()
-            : rewriter.create<arith::ConstantIndexOp>(loc, operandShape[i])
+            ? memref::DimOp::create(rewriter, loc, operand, i).getResult()
+            : arith::ConstantIndexOp::create(rewriter, loc, operandShape[i])
                   .getResult();
     operandSizes[i] = operandDimSize;
 
     operandStrides[i] = strideSoFar;
     if (i > 0) {
       strideSoFar =
-          rewriter.create<arith::MulIOp>(loc, strideSoFar, operandDimSize);
+          arith::MulIOp::create(rewriter, loc, strideSoFar, operandDimSize);
     }
   }
 
@@ -198,15 +198,15 @@ FailureOr<Value> insertDynamicMemrefCastOp(
     outputToInputDim[dim.value().getSExtValue()] = dim.index();
   }
   for (int i = 0; i < resultRank; ++i) {
-    Value iVal = rewriter.create<arith::ConstantIndexOp>(loc, i);
+    Value iVal = arith::ConstantIndexOp::create(rewriter, loc, i);
     FailureOr<Value> outputDimsBuffer =
         getBuffer(rewriter, op.getOutputDimensions(), options, state);
     if (failed(outputDimsBuffer)) return failure();
     Value resultDimSize =
-        rewriter.create<memref::LoadOp>(loc, *outputDimsBuffer, iVal);
+        memref::LoadOp::create(rewriter, loc, *outputDimsBuffer, iVal);
     if (!resultDimSize.getType().isIndex()) {
-      resultDimSize = rewriter.create<arith::IndexCastOp>(
-          loc, rewriter.getIndexType(), resultDimSize);
+      resultDimSize = arith::IndexCastOp::create(
+          rewriter, loc, rewriter.getIndexType(), resultDimSize);
     }
     if (resultType.isDynamicDim(i)) {
       sizes.push_back(resultDimSize);
@@ -229,10 +229,11 @@ FailureOr<Value> insertDynamicMemrefCastOp(
     //    => stride flattened buffer stride
     // 2) Operand dim < result dim => expansion is needed => stride := 0.
     int dim = it->second;
-    Value isExpansion = rewriter.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::slt, operandSizes[dim], resultDimSize);
-    Value select = rewriter.create<mlir::arith::SelectOp>(
-        loc, isExpansion, zero, operandStrides[dim]);
+    Value isExpansion =
+        arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::slt,
+                              operandSizes[dim], resultDimSize);
+    Value select = mlir::arith::SelectOp::create(rewriter, loc, isExpansion,
+                                                 zero, operandStrides[dim]);
     strides.push_back(select);
   }
 
@@ -243,8 +244,8 @@ FailureOr<Value> insertDynamicMemrefCastOp(
       makeStridedLinearLayoutMap(dynamicLayout,
                                  /*offset=*/0, rewriter.getContext()));
 
-  auto transformedOperand = rewriter.create<memref::ReinterpretCastOp>(
-      loc, typeErasedMemrefType, operand,
+  auto transformedOperand = memref::ReinterpretCastOp::create(
+      rewriter, loc, typeErasedMemrefType, operand,
       /*offset=*/rewriter.getI64IntegerAttr(0), sizes, strides);
   return transformedOperand.getResult();
 }
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo.cc
index e52bab960be48f..0362efa19df89e 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/hlo_legalize_to_stablehlo/hlo_legalize_to_stablehlo.cc
@@ -442,8 +442,8 @@ FailureOr<func::FuncOp> rewriteMhloRegionAsFunc(
   auto& block = region.getBlocks().front();
   auto type = rewriter.getFunctionType(
       block.getArgumentTypes(), block.getTerminator()->getOperandTypes());
-  auto funcOp = rewriter.create<func::FuncOp>(
-      region.getLoc(), op->getName().stripDialect(), type);
+  auto funcOp = func::FuncOp::create(rewriter, region.getLoc(),
+                                     op->getName().stripDialect(), type);
   symTable.insert(funcOp);
 
   // Move region into new function
@@ -685,9 +685,9 @@ class HloToStablehloOpConverter
     // for the generic builder.
     HloToStablehloOp<HloOpTy> stablehloOp;
     if constexpr (std::is_same<HloOpTy, mhlo::CaseOp>::value) {
-      stablehloOp = rewriter.create<stablehlo::CaseOp>(
-          hloOp.getLoc(), stablehloTypes, stablehloOperands, stablehloAttrs,
-          hloOp.getBranches().size());
+      stablehloOp = stablehlo::CaseOp::create(
+          rewriter, hloOp.getLoc(), stablehloTypes, stablehloOperands,
+          stablehloAttrs, hloOp.getBranches().size());
     } else {
       stablehloOp = rewriter.create<HloToStablehloOp<HloOpTy>>(
           hloOp.getLoc(), stablehloTypes, stablehloOperands, stablehloAttrs);
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc
index 7502231cc5814b..9c5a34351d8ef9 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_einsum_to_dot_general/legalize_einsum_to_dot_general.cc
@@ -159,9 +159,9 @@ struct EinsumToDotGeneralPattern : public OpRewritePattern<EinsumOp> {
     auto dimNumbers = mhlo::DotDimensionNumbersAttr::get(
         rewriter.getContext(), lhsBatchingDims, rhsBatchingDims,
         lhsContractingDims, rhsContractingDims);
-    auto dotGeneralOp = rewriter.create<DotGeneralOp>(
-        einsum.getLoc(), dotGeneralResultType, einsum.getLhs(), einsum.getRhs(),
-        dimNumbers,
+    auto dotGeneralOp = DotGeneralOp::create(
+        rewriter, einsum.getLoc(), dotGeneralResultType, einsum.getLhs(),
+        einsum.getRhs(), dimNumbers,
         /*precision_config=*/ArrayAttr{}, /*dot_algorithm=*/DotAlgorithmAttr{});
 
     if (isNaturalOrder) {
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_trigonometric_to_approximation/legalize_trigonometric_to_approximation.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_trigonometric_to_approximation/legalize_trigonometric_to_approximation.cc
index 4e2f9c247d2dd8..071efb37345760 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_trigonometric_to_approximation/legalize_trigonometric_to_approximation.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/legalize_trigonometric_to_approximation/legalize_trigonometric_to_approximation.cc
@@ -63,7 +63,7 @@ class ApproximateOnExtendedF32Lowering : public OpRewritePattern<OpTy> {
       if (argTy.isF64()) return failure();
 
       if (argTy.isF16())
-        arg = rewriter.create<arith::ExtFOp>(loc, rewriter.getF32Type(), arg);
+        arg = arith::ExtFOp::create(rewriter, loc, rewriter.getF32Type(), arg);
 
       // If we still do not have f32, fail.
       if (!arg.getType().isF32()) return failure();
@@ -77,7 +77,7 @@ class ApproximateOnExtendedF32Lowering : public OpRewritePattern<OpTy> {
     // Truncate back if needed.
     if (op.getType().isF16()) {
       result =
-          rewriter.create<arith::TruncFOp>(loc, rewriter.getF16Type(), result);
+          arith::TruncFOp::create(rewriter, loc, rewriter.getF16Type(), result);
     }
 
     rewriter.replaceOp(op, {result});
@@ -108,59 +108,62 @@ class ApproximateTanhLowering
         4.89352518554385e-03f};
 
     // Materialize polynomial approximation.
-    Value inputSquared = rewriter.create<arith::MulFOp>(loc, input, input);
-    Value numerator = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getF32FloatAttr(numeratorCoeffs[0]));
+    Value inputSquared = arith::MulFOp::create(rewriter, loc, input, input);
+    Value numerator = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getF32FloatAttr(numeratorCoeffs[0]));
     for (int64_t i = 1; i < static_cast<int64_t>(numeratorCoeffs.size()); i++) {
-      numerator = rewriter.create<arith::AddFOp>(
-          loc, rewriter.create<arith::MulFOp>(loc, inputSquared, numerator),
-          rewriter.create<arith::ConstantOp>(
-              loc, rewriter.getF32FloatAttr(numeratorCoeffs[i])));
+      numerator = arith::AddFOp::create(
+          rewriter, loc,
+          arith::MulFOp::create(rewriter, loc, inputSquared, numerator),
+          arith::ConstantOp::create(
+              rewriter, loc, rewriter.getF32FloatAttr(numeratorCoeffs[i])));
     }
-    numerator = rewriter.create<arith::MulFOp>(loc, input, numerator);
-    Value denominator = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getF32FloatAttr(denominatorCoeffs[0]));
+    numerator = arith::MulFOp::create(rewriter, loc, input, numerator);
+    Value denominator = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getF32FloatAttr(denominatorCoeffs[0]));
     for (int64_t i = 1; i < static_cast<int64_t>(denominatorCoeffs.size());
          i++) {
-      denominator = rewriter.create<arith::AddFOp>(
-          loc, rewriter.create<arith::MulFOp>(loc, inputSquared, denominator),
-          rewriter.create<arith::ConstantOp>(
-              loc, rewriter.getF32FloatAttr(denominatorCoeffs[i])));
+      denominator = arith::AddFOp::create(
+          rewriter, loc,
+          arith::MulFOp::create(rewriter, loc, inputSquared, denominator),
+          arith::ConstantOp::create(
+              rewriter, loc, rewriter.getF32FloatAttr(denominatorCoeffs[i])));
     }
-    Value approx = rewriter.create<arith::DivFOp>(loc, numerator, denominator);
+    Value approx = arith::DivFOp::create(rewriter, loc, numerator, denominator);
 
     // For small values of |x|, we can approximate tanh(x) = x. For extremely
     // small values of x (|x| < 1e-37), the other approximation would evaluate
     // tanh(x) = 0.
     constexpr float kUseIdentityApprox = 0.0004;
-    Value absInput = rewriter.create<math::AbsFOp>(loc, input);
-    Value useIdentityApprox = rewriter.create<arith::CmpFOp>(
-        loc, arith::CmpFPredicate::OLT, absInput,
-        rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getF32FloatAttr(kUseIdentityApprox)));
-    approx =
-        rewriter.create<arith::SelectOp>(loc, useIdentityApprox, input, approx);
+    Value absInput = math::AbsFOp::create(rewriter, loc, input);
+    Value useIdentityApprox = arith::CmpFOp::create(
+        rewriter, loc, arith::CmpFPredicate::OLT, absInput,
+        arith::ConstantOp::create(
+            rewriter, loc, rewriter.getF32FloatAttr(kUseIdentityApprox)));
+    approx = arith::SelectOp::create(rewriter, loc, useIdentityApprox, input,
+                                     approx);
 
     // For very small/large values, use a constant approximation -1/1.
-    Value tooLargeInput = rewriter.create<arith::CmpFOp>(
-        loc, arith::CmpFPredicate::UGT, input,
-        rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getF32FloatAttr(7.90531110763549805f)));
-    Value tooSmallInput = rewriter.create<arith::CmpFOp>(
-        loc, arith::CmpFPredicate::ULT, input,
-        rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getF32FloatAttr(-7.90531110763549805f)));
-    Value inputIsNan = rewriter.create<arith::CmpFOp>(
-        loc, arith::CmpFPredicate::UNE, input, input);
-    approx = rewriter.create<arith::SelectOp>(
-        loc, tooLargeInput,
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(1.0)),
+    Value tooLargeInput = arith::CmpFOp::create(
+        rewriter, loc, arith::CmpFPredicate::UGT, input,
+        arith::ConstantOp::create(
+            rewriter, loc, rewriter.getF32FloatAttr(7.90531110763549805f)));
+    Value tooSmallInput = arith::CmpFOp::create(
+        rewriter, loc, arith::CmpFPredicate::ULT, input,
+        arith::ConstantOp::create(
+            rewriter, loc, rewriter.getF32FloatAttr(-7.90531110763549805f)));
+    Value inputIsNan = arith::CmpFOp::create(
+        rewriter, loc, arith::CmpFPredicate::UNE, input, input);
+    approx = arith::SelectOp::create(
+        rewriter, loc, tooLargeInput,
+        arith::ConstantOp::create(rewriter, loc, rewriter.getF32FloatAttr(1.0)),
         approx);
-    approx = rewriter.create<arith::SelectOp>(
-        loc, tooSmallInput,
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(-1.0)),
+    approx = arith::SelectOp::create(
+        rewriter, loc, tooSmallInput,
+        arith::ConstantOp::create(rewriter, loc,
+                                  rewriter.getF32FloatAttr(-1.0)),
         approx);
-    approx = rewriter.create<arith::SelectOp>(loc, inputIsNan, input, approx);
+    approx = arith::SelectOp::create(rewriter, loc, inputIsNan, input, approx);
 
     return approx;
   }
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_flatten_tuple/mhlo_flatten_tuple.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_flatten_tuple/mhlo_flatten_tuple.cc
index b86038624c4c24..46449eb5cbaadb 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_flatten_tuple/mhlo_flatten_tuple.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/mhlo_flatten_tuple/mhlo_flatten_tuple.cc
@@ -66,7 +66,7 @@ Value createTupleValue(OpBuilder &builder, Location loc,
 
   assert(mlir::cast<TupleType>(tupleType).getTypes().size() ==
          flattenValues.size());
-  return builder.create<mhlo::TupleOp>(loc, flattenValues);
+  return mhlo::TupleOp::create(builder, loc, flattenValues);
 }
 
 void flattenTupleValue(OpBuilder &builder, Location loc, Value value,
@@ -78,8 +78,9 @@ void flattenTupleValue(OpBuilder &builder, Location loc, Value value,
   }
   int flattenIdx = 0;
   for (auto innerType : tupleType.getTypes()) {
-    auto innerValue = builder.create<mhlo::GetTupleElementOp>(
-        loc, innerType, value, builder.getI32IntegerAttr(flattenIdx++));
+    auto innerValue = mhlo::GetTupleElementOp::create(
+        builder, loc, innerType, value,
+        builder.getI32IntegerAttr(flattenIdx++));
     flattenTupleValue(builder, loc, innerValue, flattenedValues);
   }
 }
@@ -114,8 +115,9 @@ struct FlattenCustomCallOp : public OpRewritePattern<CustomCallOp> {
         flattenTupleType(result, flattenedResultTypes);
     }
 
-    auto flattenedCall = rewriter.create<mhlo::CustomCallOp>(
-        op->getLoc(), flattenedResultTypes, flattenedOperands, op->getAttrs());
+    auto flattenedCall =
+        mhlo::CustomCallOp::create(rewriter, op->getLoc(), flattenedResultTypes,
+                                   flattenedOperands, op->getAttrs());
 
     rewriter.replaceOp(op, flattenResult
                                ? createTupleValue(rewriter, op->getLoc(),
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/optimize_mhlo/optimize_mhlo.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/optimize_mhlo/optimize_mhlo.cc
index bd45785c2c5bec..35d3379583437d 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/optimize_mhlo/optimize_mhlo.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/optimize_mhlo/optimize_mhlo.cc
@@ -120,10 +120,11 @@ class GatherIsSlice : public OpRewritePattern<GatherOp> {
         auto start = getI64ElementsAttr({i}, &rewriter);
         auto limit = getI64ElementsAttr({i + 1}, &rewriter);
         auto stride = getI64ElementsAttr({1}, &rewriter);
-        auto indicesSlice = rewriter.create<SliceOp>(
-            gather.getLoc(), gatherStartIndices, start, limit, stride);
-        auto reshaped = rewriter.create<ReshapeOp>(
-            gather.getLoc(),
+        auto indicesSlice =
+            SliceOp::create(rewriter, gather.getLoc(), gatherStartIndices,
+                            start, limit, stride);
+        auto reshaped = ReshapeOp::create(
+            rewriter, gather.getLoc(),
             RankedTensorType::get({},
                                   mlir::cast<ShapedType>(indicesSlice.getType())
                                       .getElementType()),
@@ -139,9 +140,10 @@ class GatherIsSlice : public OpRewritePattern<GatherOp> {
     // Start indices have implicit zeros when not specified. This is because
     // Gather occurs similar to slicing where full slices are inferred. Add any
     // missing zeros as necessary.
-    auto zero = rewriter.create<ConstantOp>(
-        gather.getLoc(), rewriter.getZeroAttr(RankedTensorType::get(
-                             {}, gatherStartIndicesTy.getElementType())));
+    auto zero =
+        ConstantOp::create(rewriter, gather.getLoc(),
+                           rewriter.getZeroAttr(RankedTensorType::get(
+                               {}, gatherStartIndicesTy.getElementType())));
     while (static_cast<int64_t>(sliceStartIndices.size()) <
            sliceSizesTy.getDimSize(0)) {
       sliceStartIndices.push_back(zero);
@@ -153,9 +155,9 @@ class GatherIsSlice : public OpRewritePattern<GatherOp> {
     }
 
     auto sliceTy = RankedTensorType::get(sliceShape, resultTy.getElementType());
-    auto slice = rewriter.create<DynamicSliceOp>(
-        gather.getLoc(), sliceTy, gather.getOperand(), sliceStartIndices,
-        gather.getSliceSizes());
+    auto slice = DynamicSliceOp::create(rewriter, gather.getLoc(), sliceTy,
+                                        gather.getOperand(), sliceStartIndices,
+                                        gather.getSliceSizes());
 
     rewriter.replaceOpWithNewOp<ReshapeOp>(gather, gather.getType(), slice);
 
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/prepare_for_export/prepare_for_export.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/prepare_for_export/prepare_for_export.cc
index 265bb8f4cac0ea..bf92c0636b759f 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/prepare_for_export/prepare_for_export.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/prepare_for_export/prepare_for_export.cc
@@ -71,12 +71,12 @@ void prepareConstantOp(Operation* op, SplatElementsAttr attr) {
     assert(mlir::isa<FloatType>(complexTy.getElementType()) &&
            "unexpected int complex in MHLO");
     auto complexVal = attr.getSplatValue<std::complex<APFloat>>();
-    cst = b.create<ConstantOp>(DenseElementsAttr::get(tensorType, complexVal));
+    cst = ConstantOp::create(b, DenseElementsAttr::get(tensorType, complexVal));
   } else {
-    cst = b.create<ConstantOp>(attr.getSplatValue<Attribute>());
+    cst = ConstantOp::create(b, attr.getSplatValue<Attribute>());
   }
   auto broadcast =
-      b.create<BroadcastInDimOp>(returnType, cst, b.getI64TensorAttr({}));
+      BroadcastInDimOp::create(b, returnType, cst, b.getI64TensorAttr({}));
   if (auto sharding = op->getAttrOfType<mlir::StringAttr>(kShardingAttr)) {
     // The added broadcast inherits the kShardingAttr from op.
     broadcast->setAttr(kShardingAttr, sharding);
@@ -103,8 +103,8 @@ void prepareBroadcastInDim(BroadcastInDimOp bcast) {
     return rawDims[lhs] < rawDims[rhs];
   });
   OpBuilder builder(bcast);
-  bcast.setOperand(builder.create<TransposeOp>(
-      bcast.getLoc(), bcast.getOperand(),
+  bcast.setOperand(TransposeOp::create(
+      builder, bcast.getLoc(), bcast.getOperand(),
       DenseIntElementsAttr::get(dims.getType(), transposedDim)));
   // Now reuse the original broadcast_dimensions and sort it.
   transposedDim.assign(rawDims.begin(), rawDims.end());
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/stablehlo_legalize_to_hlo/stablehlo_legalize_to_hlo.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/stablehlo_legalize_to_hlo/stablehlo_legalize_to_hlo.cc
index 536d09200dad39..cb7e6793f19f5c 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/stablehlo_legalize_to_hlo/stablehlo_legalize_to_hlo.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/stablehlo_legalize_to_hlo/stablehlo_legalize_to_hlo.cc
@@ -404,9 +404,9 @@ class StablehloToHloOpConverter : public OpConversionPattern<StablehloOpTy> {
     // for the generic builder.
     StablehloToHloOp<StablehloOpTy> hloOp;
     if constexpr (std::is_same<StablehloOpTy, stablehlo::CaseOp>::value) {
-      hloOp = rewriter.create<mhlo::CaseOp>(stablehloOp.getLoc(), hloTypes,
-                                            hloOperands, hloAttrs,
-                                            stablehloOp.getBranches().size());
+      hloOp = mhlo::CaseOp::create(rewriter, stablehloOp.getLoc(), hloTypes,
+                                   hloOperands, hloAttrs,
+                                   stablehloOp.getBranches().size());
     } else {
       hloOp = rewriter.create<StablehloToHloOp<StablehloOpTy>>(
           stablehloOp.getLoc(), hloTypes, hloOperands, hloAttrs);
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/unfuse_batch_norm/unfuse_batch_norm.cc b/third_party/xla/xla/mlir_hlo/mhlo/transforms/unfuse_batch_norm/unfuse_batch_norm.cc
index d07f3178552f76..47791f5ec751c4 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/unfuse_batch_norm/unfuse_batch_norm.cc
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/unfuse_batch_norm/unfuse_batch_norm.cc
@@ -50,8 +50,8 @@ Value broadcastToFeatureDim(Location loc, RankedTensorType resultType,
         loc, resultType, value1d, shapeValue, dims);
   }
   assert(resultType.hasStaticShape());
-  return rewriter.create<mhlo::BroadcastInDimOp>(loc, resultType, value1d,
-                                                 dims);
+  return mhlo::BroadcastInDimOp::create(rewriter, loc, resultType, value1d,
+                                        dims);
 }
 
 // Get the shape of operand, assuming it is a dynamic shape with static rank.
@@ -59,8 +59,8 @@ Value getShapeValue(Location loc, Value operand,
                     PatternRewriter &rewriter) {  // NOLINT
   RankedTensorType resultType =
       mlir::dyn_cast<RankedTensorType>(operand.getType());
-  return rewriter.create<mlir::shape::ShapeOfOp>(
-      loc,
+  return mlir::shape::ShapeOfOp::create(
+      rewriter, loc,
       RankedTensorType::get({resultType.getRank()}, rewriter.getIndexType()),
       operand);
 }
@@ -90,12 +90,12 @@ Value materializeEpsilon(Operation *op, FloatAttr epsilonAttr, FloatType fpType,
   auto scalarType = RankedTensorType::get({}, fpType);
   auto epsilonTensorAttr =
       DenseElementsAttr::get(scalarType, {mlir::cast<Attribute>(epsilonAttr)});
-  Value epsilon = b.create<mhlo::ConstantOp>(epsilonTensorAttr);
+  Value epsilon = mhlo::ConstantOp::create(b, epsilonTensorAttr);
   auto dimsType = RankedTensorType::get({0}, b.getIntegerType(64));
   auto dims = DenseIntElementsAttr::get(dimsType, SmallVector<int64_t, 1>{});
   if (broadcastToType.hasStaticShape()) {
-    return b.create<mhlo::BroadcastInDimOp>(broadcastToType, epsilon,
-                                            /*broadcast_dims=*/dims);
+    return mhlo::BroadcastInDimOp::create(b, broadcastToType, epsilon,
+                                          /*broadcast_dims=*/dims);
   }
   Value shapeValue = getShapeValue(op->getLoc(), broadcastTo, rewriter);
   return b.createOrFold<mhlo::DynamicBroadcastInDimOp>(broadcastToType, epsilon,
@@ -134,9 +134,9 @@ class UnfuseBatchNormInferencePattern
     if (!epsilon) {
       return failure();
     }
-    Value stddev = rewriter.create<mhlo::AddOp>(bnOp.getLoc(),
-                                                bnOp.getVariance(), epsilon);
-    stddev = rewriter.create<mhlo::SqrtOp>(bnOp.getLoc(), stddev);
+    Value stddev = mhlo::AddOp::create(rewriter, bnOp.getLoc(),
+                                       bnOp.getVariance(), epsilon);
+    stddev = mhlo::SqrtOp::create(rewriter, bnOp.getLoc(), stddev);
 
     // Broadcast all terms.
     Value shapeValue;
@@ -157,12 +157,12 @@ class UnfuseBatchNormInferencePattern
 
     // Compute:
     // scale * (input - mean) / stddev + offset
-    Value result = rewriter.create<mhlo::SubtractOp>(
-        bnOp.getLoc(), bnOp.getOperand(), broadcastMean);
+    Value result = mhlo::SubtractOp::create(rewriter, bnOp.getLoc(),
+                                            bnOp.getOperand(), broadcastMean);
     result =
-        rewriter.create<mhlo::MulOp>(bnOp.getLoc(), result, broadcastScale);
+        mhlo::MulOp::create(rewriter, bnOp.getLoc(), result, broadcastScale);
     result =
-        rewriter.create<mhlo::DivOp>(bnOp.getLoc(), result, broadcastStddev);
+        mhlo::DivOp::create(rewriter, bnOp.getLoc(), result, broadcastStddev);
     rewriter.replaceOpWithNewOp<mhlo::AddOp>(bnOp, result, broadcastOffset);
 
     return success();
@@ -178,8 +178,8 @@ Value createReduce(Location loc, Value operand, Value zero,
   Type reduceResultType = RankedTensorType::get(
       {operandType.getDimSize(featureIndex)}, operandType.getElementType());
   mhlo::ReduceOp reduce =
-      rewriter.create<mhlo::ReduceOp>(loc, reduceResultType, operand, zero,
-                                      rewriter.getI64TensorAttr(reduceDims));
+      mhlo::ReduceOp::create(rewriter, loc, reduceResultType, operand, zero,
+                             rewriter.getI64TensorAttr(reduceDims));
 
   // setup "mhlo.reduce"'s body
   Region &region = reduce.getBody();
@@ -194,8 +194,8 @@ Value createReduce(Location loc, Value operand, Value zero,
     OpBuilder::InsertionGuard guard(rewriter);
     rewriter.setInsertionPointToStart(&block);
     Value addResult =
-        rewriter.create<mhlo::AddOp>(loc, *firstArgument, *secondArgument);
-    rewriter.create<mhlo::ReturnOp>(loc, addResult);
+        mhlo::AddOp::create(rewriter, loc, *firstArgument, *secondArgument);
+    mhlo::ReturnOp::create(rewriter, loc, addResult);
   }
 
   return reduce.getResult(0);
@@ -214,17 +214,18 @@ Value calculateReduceSize(Operation *op, Value operand,
     Value operandShape = getShapeValue(op->getLoc(), operand, rewriter);
     Value scaleShape = getShapeValue(op->getLoc(), scale, rewriter);
     Value operandTotalSize =
-        b.create<shape::NumElementsOp>(indexType, operandShape);
+        shape::NumElementsOp::create(b, indexType, operandShape);
     Value scaleTotalSize =
-        b.create<shape::NumElementsOp>(indexType, scaleShape);
+        shape::NumElementsOp::create(b, indexType, scaleShape);
     Value reduceSize =
-        b.create<shape::DivOp>(indexType, operandTotalSize, scaleTotalSize);
-    reduceSize = b.create<arith::IndexCastOp>(b.getI64Type(), reduceSize);
-    reduceSize = b.create<tensor::FromElementsOp>(reduceSize);
-    reduceSize = b.create<mhlo::ConvertOp>(
-        RankedTensorType::get({1}, operandType.getElementType()), reduceSize);
-    reduceSize = b.create<mhlo::ReshapeOp>(
-        RankedTensorType::get({}, operandType.getElementType()), reduceSize);
+        shape::DivOp::create(b, indexType, operandTotalSize, scaleTotalSize);
+    reduceSize = arith::IndexCastOp::create(b, b.getI64Type(), reduceSize);
+    reduceSize = tensor::FromElementsOp::create(b, reduceSize);
+    reduceSize = mhlo::ConvertOp::create(
+        b, RankedTensorType::get({1}, operandType.getElementType()),
+        reduceSize);
+    reduceSize = mhlo::ReshapeOp::create(
+        b, RankedTensorType::get({}, operandType.getElementType()), reduceSize);
     return b.createOrFold<mhlo::DynamicBroadcastInDimOp>(
         scaleType, reduceSize, scaleShape, b.getI64TensorAttr({}));
   }
@@ -244,8 +245,8 @@ Value calculateReduceSize(Operation *op, Value operand,
   if (losesInfo) {
     op->emitWarning("Conversion of reduce_dims_size loses precision");
   }
-  Value reduceSize = b.create<mhlo::ConstantOp>(
-      DenseFPElementsAttr::get(scaleType, floatValue));
+  Value reduceSize = mhlo::ConstantOp::create(
+      b, DenseFPElementsAttr::get(scaleType, floatValue));
   return reduceSize;
 }
 
@@ -278,8 +279,8 @@ class UnfuseBatchNormTrainingPattern
     }
 
     // zero constant
-    Value constZero = rewriter.create<mhlo::ConstantOp>(
-        bnOp.getLoc(),
+    Value constZero = mhlo::ConstantOp::create(
+        rewriter, bnOp.getLoc(),
         DenseFPElementsAttr::get(RankedTensorType::get({}, fpType),
                                  APFloat::getZero(fpType.getFloatSemantics())));
     // epsilon
@@ -300,27 +301,28 @@ class UnfuseBatchNormTrainingPattern
     Value sum = createReduce(bnOp.getLoc(), bnOp.getOperand(), constZero,
                              dimensionsWithoutFeature, featureIndex, rewriter);
     // X^2
-    Value operandSquare = rewriter.create<mhlo::MulOp>(
-        bnOp.getLoc(), bnOp.getOperand(), bnOp.getOperand());
+    Value operandSquare = mhlo::MulOp::create(
+        rewriter, bnOp.getLoc(), bnOp.getOperand(), bnOp.getOperand());
     // Sum[X^2]
     Value squareSum =
         createReduce(bnOp.getLoc(), operandSquare, constZero,
                      dimensionsWithoutFeature, featureIndex, rewriter);
     // E[X]
-    Value mean = rewriter.create<mhlo::DivOp>(bnOp.getLoc(), sum, reduceSize);
+    Value mean = mhlo::DivOp::create(rewriter, bnOp.getLoc(), sum, reduceSize);
     // E[X^2]
     Value squareMean =
-        rewriter.create<mhlo::DivOp>(bnOp.getLoc(), squareSum, reduceSize);
+        mhlo::DivOp::create(rewriter, bnOp.getLoc(), squareSum, reduceSize);
     // E^2[X]
-    Value meanSquare = rewriter.create<mhlo::MulOp>(bnOp.getLoc(), mean, mean);
+    Value meanSquare = mhlo::MulOp::create(rewriter, bnOp.getLoc(), mean, mean);
     // Var[X]
-    Value var = rewriter.create<mhlo::SubtractOp>(bnOp.getLoc(), squareMean,
-                                                  meanSquare);
+    Value var = mhlo::SubtractOp::create(rewriter, bnOp.getLoc(), squareMean,
+                                         meanSquare);
     // Var[X] + epsilon
     Value varAddEpsilon =
-        rewriter.create<mhlo::AddOp>(bnOp.getLoc(), var, epsilon);
+        mhlo::AddOp::create(rewriter, bnOp.getLoc(), var, epsilon);
     // Sqrt(Var[X] + epsilon)
-    Value sqrtVar = rewriter.create<mhlo::SqrtOp>(bnOp.getLoc(), varAddEpsilon);
+    Value sqrtVar =
+        mhlo::SqrtOp::create(rewriter, bnOp.getLoc(), varAddEpsilon);
 
     Value shapeValue;
     if (!operandType.hasStaticShape()) {
@@ -329,27 +331,27 @@ class UnfuseBatchNormTrainingPattern
     // X - E[X]
     Value meanBroadcast = broadcastToFeatureDim(
         bnOp.getLoc(), operandType, mean, shapeValue, featureIndex, rewriter);
-    Value operandMinusMean = rewriter.create<mhlo::SubtractOp>(
-        bnOp.getLoc(), bnOp.getOperand(), meanBroadcast);
+    Value operandMinusMean = mhlo::SubtractOp::create(
+        rewriter, bnOp.getLoc(), bnOp.getOperand(), meanBroadcast);
     // (X - E[X]) / Sqrt(Var[X] + epsilon)
     Value sqrtVarBroadcast =
         broadcastToFeatureDim(bnOp.getLoc(), operandType, sqrtVar, shapeValue,
                               featureIndex, rewriter);
-    Value normalized = rewriter.create<mhlo::DivOp>(
-        bnOp.getLoc(), operandMinusMean, sqrtVarBroadcast);
+    Value normalized = mhlo::DivOp::create(rewriter, bnOp.getLoc(),
+                                           operandMinusMean, sqrtVarBroadcast);
 
     // ((X - E[X]) / Sqrt(Var[X] + epsilon)) * scale
     Value scaleBroadcast =
         broadcastToFeatureDim(bnOp.getLoc(), operandType, bnOp.getScale(),
                               shapeValue, featureIndex, rewriter);
-    Value scaledNormalized =
-        rewriter.create<mhlo::MulOp>(bnOp.getLoc(), normalized, scaleBroadcast);
+    Value scaledNormalized = mhlo::MulOp::create(rewriter, bnOp.getLoc(),
+                                                 normalized, scaleBroadcast);
     // ((X - E[X]) / Sqrt(Var[X] + epsilon)) * scale + offset.
     Value offsetBroadcast =
         broadcastToFeatureDim(bnOp.getLoc(), operandType, bnOp.getOffset(),
                               shapeValue, featureIndex, rewriter);
-    Value shiftedNormalized = rewriter.create<mhlo::AddOp>(
-        bnOp.getLoc(), scaledNormalized, offsetBroadcast);
+    Value shiftedNormalized = mhlo::AddOp::create(
+        rewriter, bnOp.getLoc(), scaledNormalized, offsetBroadcast);
 
     // results
     SmallVector<Value> results = {shiftedNormalized, mean, var};
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/sdy_refine_shapes.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/sdy_refine_shapes.cpp
index b4879df7c20ffd..ed921647946dc3 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/sdy_refine_shapes.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/sdy_refine_shapes.cpp
@@ -176,8 +176,8 @@ LogicalResult refineValues(
     };
     if (llvm::none_of(value.getUses(), isFuncReturn)) continue;
     rewriter.setInsertionPointAfter(manualComputation);
-    auto castToUnrefinedType = rewriter.create<UnrealizedConversionCastOp>(
-        manualComputation->getLoc(), unrefinedType, value);
+    auto castToUnrefinedType = UnrealizedConversionCastOp::create(
+        rewriter, manualComputation->getLoc(), unrefinedType, value);
     value.replaceUsesWithIf(castToUnrefinedType.getOutputs()[0], isFuncReturn);
   }
 
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_add_quant_dequant_conv.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_add_quant_dequant_conv.cpp
index 8fae3ddee4f585..bd49a0434e7c00 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_add_quant_dequant_conv.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_add_quant_dequant_conv.cpp
@@ -83,12 +83,11 @@ struct AddQuantDeQuantAfterConvolutionOp final
         cast<ShapedType>(clonedConvOp->getResult(0).getType());
     auto loc = clonedConvOp->getLoc();
     auto quantizedType = getQuantizedType(loc, rewriter, convResultType);
-    auto stablehloQuantizeOp = rewriter.create<stablehlo::UniformQuantizeOp>(
-        op.getLoc(), quantizedType, clonedConvOp->getResult(0));
-    auto stablehloDeQuantizeOp =
-        rewriter.create<stablehlo::UniformDequantizeOp>(
-            op.getLoc(), op.getType(),
-            /*input=*/stablehloQuantizeOp.getResult());
+    auto stablehloQuantizeOp = stablehlo::UniformQuantizeOp::create(
+        rewriter, op.getLoc(), quantizedType, clonedConvOp->getResult(0));
+    auto stablehloDeQuantizeOp = stablehlo::UniformDequantizeOp::create(
+        rewriter, op.getLoc(), op.getType(),
+        /*input=*/stablehloQuantizeOp.getResult());
     rewriter.replaceAllUsesWith(op, stablehloDeQuantizeOp.getResult());
     return success();
   }
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_dynamism.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_dynamism.cpp
index 73af52c7f31bde..a4dc72397919c1 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_dynamism.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_dynamism.cpp
@@ -63,9 +63,9 @@ struct CanonicalizeDynamicReduceWindowOpPattern
                                          "expected static window_dilations");
     if (failed(hlo::matchInts(op.getPadding(), padding)))
       return rewriter.notifyMatchFailure(op, "expected static padding");
-    auto newOp = rewriter.create<stablehlo::ReduceWindowOp>(
-        op->getLoc(), op->getResultTypes(), op.getInputs(), op.getInitValues(),
-        rewriter.getDenseI64ArrayAttr(windowDimensions),
+    auto newOp = stablehlo::ReduceWindowOp::create(
+        rewriter, op->getLoc(), op->getResultTypes(), op.getInputs(),
+        op.getInitValues(), rewriter.getDenseI64ArrayAttr(windowDimensions),
         rewriter.getDenseI64ArrayAttr(windowStrides),
         rewriter.getDenseI64ArrayAttr(baseDilations),
         rewriter.getDenseI64ArrayAttr(windowDilations),
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_from_hlo_import.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_from_hlo_import.cpp
index 1b2e3cf15eaa97..8f5ba8def72a0f 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_from_hlo_import.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_canonicalize_from_hlo_import.cpp
@@ -95,8 +95,8 @@ LogicalResult expandTupledTensorInReturnOp(func::FuncOp func) {
       // Construct a new tuple and rewire it.
       OpBuilder builder(func.getBody());
       builder.setInsertionPointToStart(&func.getBody().front());
-      auto newTuple =
-          builder.create<stablehlo::TupleOp>(loc, tupleType, flattenedOperands);
+      auto newTuple = stablehlo::TupleOp::create(builder, loc, tupleType,
+                                                 flattenedOperands);
       func.getArgument(originalArgumentIndex).replaceAllUsesWith(newTuple);
 
       // Now the original argument has been rewired, we should be able to
@@ -130,8 +130,8 @@ LogicalResult expandTupledTensorInReturnOp(func::FuncOp func) {
 
   if (returnOp.getOperands() == expandedReturnOperands) return success();
 
-  builder.create<mlir::func::ReturnOp>(returnOp.getLoc(),
-                                       expandedReturnOperands);
+  mlir::func::ReturnOp::create(builder, returnOp.getLoc(),
+                               expandedReturnOperands);
   returnOp.erase();
   auto newFuncType = FunctionType::get(oldFuncType.getContext(),
                                        expandedInputTypes, expandedResultTypes);
@@ -174,7 +174,7 @@ Value createTupleValue(OpBuilder &builder, Location loc,
         createTupleValue(builder, loc, flattenValues, childType));
   }
 
-  return builder.create<mlir::stablehlo::TupleOp>(loc, flattenedSubValues)
+  return mlir::stablehlo::TupleOp::create(builder, loc, flattenedSubValues)
       .getResult();
 }
 
@@ -187,8 +187,9 @@ void flattenTupleValue(OpBuilder &builder, Location loc, Value value,
   }
   int flattenIdx = 0;
   for (auto innerType : tupleType.getTypes()) {
-    auto innerValue = builder.create<stablehlo::GetTupleElementOp>(
-        loc, innerType, value, builder.getI32IntegerAttr(flattenIdx++));
+    auto innerValue = stablehlo::GetTupleElementOp::create(
+        builder, loc, innerType, value,
+        builder.getI32IntegerAttr(flattenIdx++));
     flattenTupleValue(builder, loc, innerValue, flattenedValues);
   }
 }
@@ -220,8 +221,9 @@ struct FlattenCustomCallOp : public OpRewritePattern<stablehlo::CustomCallOp> {
                                   op->result_type_end());
     }
 
-    auto flattenedCall = rewriter.create<stablehlo::CustomCallOp>(
-        op->getLoc(), flattenedResultTypes, flattenedOperands, op->getAttrs());
+    auto flattenedCall = stablehlo::CustomCallOp::create(
+        rewriter, op->getLoc(), flattenedResultTypes, flattenedOperands,
+        op->getAttrs());
 
     if (flattenResult) {
       ValueRange flattenedResultsRef(flattenedCall.getResults());
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_legalize_quant_composite.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_legalize_quant_composite.cpp
index d2c8485cf77d50..3f5b026fb940f5 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_legalize_quant_composite.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_legalize_quant_composite.cpp
@@ -309,8 +309,8 @@ class RewriteFakeQuantCompositeOp
         quantizedDimension, storageTypeMin, storageTypeMax);
     RankedTensorType quantizedType = RankedTensorType::get(
         llvm::cast<ShapedType>(op.getType(0)).getShape(), quantizedElementType);
-    auto stablehloQuantizeOp = rewriter.create<stablehlo::UniformQuantizeOp>(
-        op.getLoc(), quantizedType, /*input=*/op.getOperand(0));
+    auto stablehloQuantizeOp = stablehlo::UniformQuantizeOp::create(
+        rewriter, op.getLoc(), quantizedType, /*input=*/op.getOperand(0));
     rewriter.replaceOpWithNewOp<stablehlo::UniformDequantizeOp>(
         op, op.getType(0),
         /*input=*/stablehloQuantizeOp.getResult());
diff --git a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp
index 5712ad045279b4..2450fc2393598d 100644
--- a/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp
+++ b/third_party/xla/xla/mlir_hlo/stablehlo_ext/transforms/stablehlo_prepare_for_hlo_export.cpp
@@ -72,13 +72,13 @@ static void prepareConstantOp(Operation *op, SplatElementsAttr attr) {
     assert(mlir::isa<FloatType>(complexTy.getElementType()) &&
            "unexpected int complex in StableHLO");
     auto complexVal = attr.getSplatValue<std::complex<APFloat>>();
-    cst = b.create<stablehlo::ConstantOp>(
-        DenseElementsAttr::get(tensorType, complexVal));
+    cst = stablehlo::ConstantOp::create(
+        b, DenseElementsAttr::get(tensorType, complexVal));
   } else {
-    cst = b.create<stablehlo::ConstantOp>(attr.getSplatValue<Attribute>());
+    cst = stablehlo::ConstantOp::create(b, attr.getSplatValue<Attribute>());
   }
-  auto broadcast = b.create<stablehlo::BroadcastInDimOp>(
-      returnType, cst, b.getDenseI64ArrayAttr({}));
+  auto broadcast = stablehlo::BroadcastInDimOp::create(
+      b, returnType, cst, b.getDenseI64ArrayAttr({}));
   if (auto sharding = op->getAttrOfType<mlir::StringAttr>(kShardingAttr)) {
     // The added broadcast inherits the kShardingAttr from op.
     broadcast->setAttr(kShardingAttr, sharding);
@@ -103,8 +103,8 @@ static void prepareBroadcastInDim(stablehlo::BroadcastInDimOp bcast) {
   llvm::sort(transposedDim,
              [&](int64_t lhs, int64_t rhs) { return dims[lhs] < dims[rhs]; });
   OpBuilder builder(bcast);
-  bcast.setOperand(builder.create<stablehlo::TransposeOp>(
-      bcast.getLoc(), bcast.getOperand(),
+  bcast.setOperand(stablehlo::TransposeOp::create(
+      builder, bcast.getLoc(), bcast.getOperand(),
       mlir::DenseI64ArrayAttr::get(builder.getContext(), transposedDim)));
   // Now reuse the original broadcast_dimensions and sort it.
   transposedDim.assign(dims.begin(), dims.end());
diff --git a/third_party/xla/xla/mlir_hlo/transforms/alloc_to_arg_pass.cc b/third_party/xla/xla/mlir_hlo/transforms/alloc_to_arg_pass.cc
index 6022dc31e64a1d..24288b5613acd6 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/alloc_to_arg_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/alloc_to_arg_pass.cc
@@ -92,8 +92,8 @@ void AllocToArgPass::runOnOperation() {
         // buffer.
         rewriter.setInsertionPoint(allocOp);
         Value arg = funcOp.getArguments().back();
-        Value collapsedArg = rewriter.create<memref::CollapseShapeOp>(
-            loc, arg, expandOp.getReassociationIndices());
+        Value collapsedArg = memref::CollapseShapeOp::create(
+            rewriter, loc, arg, expandOp.getReassociationIndices());
 
         // Replace alloc and its expansion.
         rewriter.replaceOp(allocOp, collapsedArg);
diff --git a/third_party/xla/xla/mlir_hlo/transforms/bufferize.cc b/third_party/xla/xla/mlir_hlo/transforms/bufferize.cc
index ba72dda3746b97..73956da6a25874 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/bufferize.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/bufferize.cc
@@ -55,21 +55,21 @@ struct BufferizeConstantOp : public OpConversionPattern<arith::ConstantOp> {
     // TODO(kramerb): Should this use materializeConstant instead?
     auto makeConstant = [&](Attribute attr, Type type) -> Value {
       if (complex::ConstantOp::isBuildableWith(attr, type))
-        return rewriter.create<complex::ConstantOp>(
-            loc, type, mlir::cast<ArrayAttr>(attr));
-      return rewriter.create<arith::ConstantOp>(loc, cast<TypedAttr>(attr));
+        return complex::ConstantOp::create(rewriter, loc, type,
+                                           mlir::cast<ArrayAttr>(attr));
+      return arith::ConstantOp::create(rewriter, loc, cast<TypedAttr>(attr));
     };
 
     if (resultRank == 0) {
-      Value buffer = rewriter.create<memref::AllocOp>(loc, memrefType);
+      Value buffer = memref::AllocOp::create(rewriter, loc, memrefType);
       Value constant =
           makeConstant(elementsAttr.getValues<Attribute>()[0], elementType);
-      rewriter.create<memref::StoreOp>(loc, constant, buffer);
+      memref::StoreOp::create(rewriter, loc, constant, buffer);
       rewriter.replaceOp(op, {buffer});
       return success();
     }
 
-    Value buffer = rewriter.create<memref::AllocaOp>(loc, memrefType);
+    Value buffer = memref::AllocaOp::create(rewriter, loc, memrefType);
 
     bool allSameElems = elementsAttr.isSplat();
     Value value;
@@ -79,8 +79,8 @@ struct BufferizeConstantOp : public OpConversionPattern<arith::ConstantOp> {
     for (const auto &en :
          llvm::enumerate(elementsAttr.getValues<Attribute>())) {
       if (!allSameElems) value = makeConstant(en.value(), elementType);
-      Value index = rewriter.create<arith::ConstantIndexOp>(loc, en.index());
-      rewriter.create<memref::StoreOp>(loc, value, buffer, index);
+      Value index = arith::ConstantIndexOp::create(rewriter, loc, en.index());
+      memref::StoreOp::create(rewriter, loc, value, buffer, index);
     }
     rewriter.replaceOp(op, {buffer});
     return success();
@@ -97,7 +97,7 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
       ConversionPatternRewriter &rewriter) const override {
     auto loc = broadcastShapesOp.getLoc();
     ImplicitLocOpBuilder lb(loc, rewriter);
-    Value zero = lb.create<arith::ConstantIndexOp>(0);
+    Value zero = arith::ConstantIndexOp::create(lb, 0);
     SmallVector<Value> shapes = adaptor.getShapes();
     size_t k = shapes.size();
     SmallVector<Value> ranks;
@@ -106,12 +106,12 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
     // Determine the maximum rank of the operands.
     Value maxRank;
     for (size_t i = 0; i < k; ++i) {
-      Value rank = lb.create<memref::DimOp>(loc, shapes[i], zero);
+      Value rank = memref::DimOp::create(lb, loc, shapes[i], zero);
       ranks.push_back(rank);
       if (i) {
-        Value rankIsGreater = lb.create<arith::CmpIOp>(
-            arith::CmpIPredicate::ugt, ranks[i], maxRank);
-        maxRank = lb.create<arith::SelectOp>(rankIsGreater, ranks[i], maxRank);
+        Value rankIsGreater = arith::CmpIOp::create(
+            lb, arith::CmpIPredicate::ugt, ranks[i], maxRank);
+        maxRank = arith::SelectOp::create(lb, rankIsGreater, ranks[i], maxRank);
       } else {
         maxRank = ranks[0];
       }
@@ -122,17 +122,17 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
     resultShapes.reserve(k);
     auto resultType =
         MemRefType::get({ShapedType::kDynamic}, lb.getIndexType());
-    Value one = lb.create<arith::ConstantIndexOp>(1);
+    Value one = arith::ConstantIndexOp::create(lb, 1);
     for (size_t i = 0; i < k; ++i) {
       // We assume the buffer will be small, so we allocate it on the stack.
       // TODO(b/181654096): Replace AllocaOp with AllocOp.
-      auto result = lb.create<memref::AllocaOp>(resultType, ranks[i]);
-      lb.create<scf::ForOp>(zero, ranks[i], one, mlir::ValueRange(),
-                            [&one, &result](OpBuilder& b, Location l, Value idx,
-                                            ValueRange /*vr*/) {
-                              b.create<memref::StoreOp>(l, one, result, idx);
-                              b.create<scf::YieldOp>(l, mlir::ValueRange());
-                            });
+      auto result = memref::AllocaOp::create(lb, resultType, ranks[i]);
+      scf::ForOp::create(lb, zero, ranks[i], one, mlir::ValueRange(),
+                         [&one, &result](OpBuilder& b, Location l, Value idx,
+                                         ValueRange /*vr*/) {
+                           memref::StoreOp::create(b, l, one, result, idx);
+                           scf::YieldOp::create(b, l, mlir::ValueRange());
+                         });
       resultShapes.push_back(result);
     }
 
@@ -143,10 +143,10 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
     // backward, because the broadcasting semantics mean that the last
     // dimensions of each shape (the least significant ones) are matched
     // together.
-    Value two = lb.create<arith::ConstantIndexOp>(2);
-    Value maxRankPlusTwo = lb.create<arith::AddIOp>(loc, maxRank, two);
+    Value two = arith::ConstantIndexOp::create(lb, 2);
+    Value maxRankPlusTwo = arith::AddIOp::create(lb, loc, maxRank, two);
     Value constantFalse =
-        lb.create<arith::ConstantOp>(lb.getI1Type(), lb.getBoolAttr(false));
+        arith::ConstantOp::create(lb, lb.getI1Type(), lb.getBoolAttr(false));
     SmallVector<Value> initValues;
     initValues.reserve(k + 3);
     // Initially, all values are marked as not broadcasted.
@@ -164,9 +164,9 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
     // used as an offset from the end of each shape vector. We iterate until
     // max_rank + 1 to handle the case that we have a running_product > 1 left
     // when we have processed all dimensions of the largest shape.
-    auto mainLoop = lb.create<scf::ForOp>(
-        one, maxRankPlusTwo, one, initValues,
-        [&](OpBuilder &b, Location l, Value v, ValueRange vr) {
+    auto mainLoop = scf::ForOp::create(
+        lb, one, maxRankPlusTwo, one, initValues,
+        [&](OpBuilder& b, Location l, Value v, ValueRange vr) {
           // 'same_size' should track what the size of the dimension is to which
           // the 1-sized dimensions are broadcasted. If all of the dimensions
           // are 1, it will stay 1.
@@ -192,41 +192,41 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
           for (size_t i = 0; i < k; ++i) {
             // Determine the size of the current dimension. If the dimension is
             // out of bounds, we choose the value 'one'.
-            Value isOutOfBounds = b.create<arith::CmpIOp>(
-                l, arith::CmpIPredicate::ult, ranks[i], v);
-            Value dimension = b.create<arith::SubIOp>(l, ranks[i], v);
+            Value isOutOfBounds = arith::CmpIOp::create(
+                b, l, arith::CmpIPredicate::ult, ranks[i], v);
+            Value dimension = arith::SubIOp::create(b, l, ranks[i], v);
             resultDimensions.push_back(dimension);
-            Value currentSize =
-                b.create<scf::IfOp>(
-                     l, isOutOfBounds,
-                     [&](OpBuilder &b, Location l) {
-                       b.create<scf::YieldOp>(l, one);
-                     },
-                     [&](OpBuilder &b, Location l) {
-                       // Using IfOp instead of SelectOp makes sure that we
-                       // don't try to load if the dimension is out of bounds.
-                       Value size =
-                           b.create<memref::LoadOp>(l, shapes[i], dimension);
-                       b.create<scf::YieldOp>(l, size);
-                     })
-                    .getResult(0);
+            Value currentSize = scf::IfOp::create(
+                                    b, l, isOutOfBounds,
+                                    [&](OpBuilder& b, Location l) {
+                                      scf::YieldOp::create(b, l, one);
+                                    },
+                                    [&](OpBuilder& b, Location l) {
+                                      // Using IfOp instead of SelectOp makes
+                                      // sure that we don't try to load if the
+                                      // dimension is out of bounds.
+                                      Value size = memref::LoadOp::create(
+                                          b, l, shapes[i], dimension);
+                                      scf::YieldOp::create(b, l, size);
+                                    })
+                                    .getResult(0);
             // Compute whether the current dimension does require broadcasting.
-            Value currentSizeIsNotOne = b.create<arith::CmpIOp>(
-                l, arith::CmpIPredicate::ne, currentSize, one);
+            Value currentSizeIsNotOne = arith::CmpIOp::create(
+                b, l, arith::CmpIPredicate::ne, currentSize, one);
             noBroadcasting.push_back(currentSizeIsNotOne);
-            Value newSameSize = b.create<arith::SelectOp>(
-                l, currentSizeIsNotOne, currentSize, sameSize);
-            Value sameSizeWasNotOne = b.create<arith::CmpIOp>(
-                l, arith::CmpIPredicate::ne, sameSize, one);
-            Value isDifferentSize = b.create<arith::CmpIOp>(
-                l, arith::CmpIPredicate::ne, sameSize, newSameSize);
+            Value newSameSize = arith::SelectOp::create(
+                b, l, currentSizeIsNotOne, currentSize, sameSize);
+            Value sameSizeWasNotOne = arith::CmpIOp::create(
+                b, l, arith::CmpIPredicate::ne, sameSize, one);
+            Value isDifferentSize = arith::CmpIOp::create(
+                b, l, arith::CmpIPredicate::ne, sameSize, newSameSize);
             // The broadcast is invalid if the size of the current dimension
             // is not equal to the expected size, unless the expected size was
             // still the initial value 1.
             Value isInvalid =
-                b.create<arith::AndIOp>(l, sameSizeWasNotOne, isDifferentSize);
-            currentDimensionHasInvalidBroadcast = b.create<arith::OrIOp>(
-                l, currentDimensionHasInvalidBroadcast, isInvalid);
+                arith::AndIOp::create(b, l, sameSizeWasNotOne, isDifferentSize);
+            currentDimensionHasInvalidBroadcast = arith::OrIOp::create(
+                b, l, currentDimensionHasInvalidBroadcast, isInvalid);
             sameSize = newSameSize;
           }
 
@@ -234,22 +234,22 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
           // status regarding whether it needs broadcasting at the current
           // dimension versus whether it needs broadcasting at the previous
           // dimension.
-          Value sameSizeIsOne = b.create<arith::CmpIOp>(
-              l, arith::CmpIPredicate::eq, sameSize, one);
+          Value sameSizeIsOne = arith::CmpIOp::create(
+              b, l, arith::CmpIPredicate::eq, sameSize, one);
           Value differentBroadcastingSet = constantFalse;
           for (size_t i = 0; i < k; ++i) {
             // If all dimensions are 1, we preserve the status whether a shape
             // needs broadcasting or not, because in that case the dimension can
             // just be ignored.
-            noBroadcasting[i] = b.create<arith::SelectOp>(
-                l, sameSizeIsOne, prevNoBroadcasting[i], noBroadcasting[i]);
+            noBroadcasting[i] = arith::SelectOp::create(
+                b, l, sameSizeIsOne, prevNoBroadcasting[i], noBroadcasting[i]);
             // Compare whether the current shape changes its status regarding
             // whether it needs broadcasting at the current dimension.
-            Value broadcastingIsDifferent = b.create<arith::CmpIOp>(
-                l, arith::CmpIPredicate::ne, prevNoBroadcasting[i],
-                noBroadcasting[i]);
-            differentBroadcastingSet = b.create<arith::OrIOp>(
-                l, differentBroadcastingSet, broadcastingIsDifferent);
+            Value broadcastingIsDifferent =
+                arith::CmpIOp::create(b, l, arith::CmpIPredicate::ne,
+                                      prevNoBroadcasting[i], noBroadcasting[i]);
+            differentBroadcastingSet = arith::OrIOp::create(
+                b, l, differentBroadcastingSet, broadcastingIsDifferent);
           }
           Value runningProduct = vr[k];
           Value currentDimensionOffset = vr[k + 1];
@@ -257,83 +257,82 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
           // We need to stop combining dimensions if the set of shapes which
           // need broadcasting at the current dimension changes compared to the
           // set of shapes needing broadcasting at the previous dimension.
-          Value isLastIteration =
-              b.create<arith::CmpIOp>(l, arith::CmpIPredicate::sgt, v, maxRank);
-          Value stopCombiningDimensions = b.create<arith::OrIOp>(
-              l, isLastIteration, differentBroadcastingSet);
-          auto ifStopCombiningDimensions = b.create<scf::IfOp>(
-              l, stopCombiningDimensions,
-              [&](OpBuilder &b, Location l) {
+          Value isLastIteration = arith::CmpIOp::create(
+              b, l, arith::CmpIPredicate::sgt, v, maxRank);
+          Value stopCombiningDimensions = arith::OrIOp::create(
+              b, l, isLastIteration, differentBroadcastingSet);
+          auto ifStopCombiningDimensions = scf::IfOp::create(
+              b, l, stopCombiningDimensions,
+              [&](OpBuilder& b, Location l) {
                 // If the running product is not 1, add one dimension of size
                 // 'running_product' to each shape that didn't need
                 // broadcasting, otherwise add a 1 dimension if it was
                 // previously indexed in-bounds.
-                Value runningProductNotOne = b.create<arith::CmpIOp>(
-                    l, arith::CmpIPredicate::ne, runningProduct, one);
+                Value runningProductNotOne = arith::CmpIOp::create(
+                    b, l, arith::CmpIPredicate::ne, runningProduct, one);
                 Value newDimensionOffset =
-                    b.create<scf::IfOp>(
-                         l, runningProductNotOne,
-                         [&](OpBuilder &b, Location l) {
-                           Value newDimensionOffset = b.create<arith::AddIOp>(
-                               l, currentDimensionOffset, one);
-                           Value minusOne =
-                               lb.create<arith::ConstantIndexOp>(-1);
-                           for (size_t i = 0; i < k; ++i) {
-                             Value wasInBounds = b.create<arith::CmpIOp>(
-                                 l, arith::CmpIPredicate::sge,
-                                 resultDimensions[i], minusOne);
-                             Value shouldStoreDimension =
-                                 b.create<arith::OrIOp>(l, wasInBounds,
-                                                        prevNoBroadcasting[i]);
-                             b.create<scf::IfOp>(
-                                 l, shouldStoreDimension,
-                                 [&](OpBuilder &b, Location l) {
-                                   Value outputDimension =
-                                       b.create<arith::SubIOp>(
-                                           l, ranks[i], newDimensionOffset);
-                                   // If the shape needed broadcasting at the
-                                   // previous dimension, we set the output size
-                                   // to 1, otherwise to 'running_product'.
-                                   Value outputSize = b.create<arith::SelectOp>(
-                                       l, prevNoBroadcasting[i], runningProduct,
-                                       one);
-                                   b.create<memref::StoreOp>(l, outputSize,
-                                                             resultShapes[i],
-                                                             outputDimension);
-                                   b.create<scf::YieldOp>(l,
-                                                          mlir::ValueRange());
-                                 });
-                           }
-                           b.create<scf::YieldOp>(l, newDimensionOffset);
-                         },
-                         [&](OpBuilder &b, Location l) {
-                           b.create<scf::YieldOp>(l, currentDimensionOffset);
-                         })
+                    scf::IfOp::create(
+                        b, l, runningProductNotOne,
+                        [&](OpBuilder& b, Location l) {
+                          Value newDimensionOffset = arith::AddIOp::create(
+                              b, l, currentDimensionOffset, one);
+                          Value minusOne =
+                              arith::ConstantIndexOp::create(lb, -1);
+                          for (size_t i = 0; i < k; ++i) {
+                            Value wasInBounds = arith::CmpIOp::create(
+                                b, l, arith::CmpIPredicate::sge,
+                                resultDimensions[i], minusOne);
+                            Value shouldStoreDimension = arith::OrIOp::create(
+                                b, l, wasInBounds, prevNoBroadcasting[i]);
+                            scf::IfOp::create(
+                                b, l, shouldStoreDimension,
+                                [&](OpBuilder& b, Location l) {
+                                  Value outputDimension = arith::SubIOp::create(
+                                      b, l, ranks[i], newDimensionOffset);
+                                  // If the shape needed broadcasting at the
+                                  // previous dimension, we set the output size
+                                  // to 1, otherwise to 'running_product'.
+                                  Value outputSize = arith::SelectOp::create(
+                                      b, l, prevNoBroadcasting[i],
+                                      runningProduct, one);
+                                  memref::StoreOp::create(b, l, outputSize,
+                                                          resultShapes[i],
+                                                          outputDimension);
+                                  scf::YieldOp::create(b, l,
+                                                       mlir::ValueRange());
+                                });
+                          }
+                          scf::YieldOp::create(b, l, newDimensionOffset);
+                        },
+                        [&](OpBuilder& b, Location l) {
+                          scf::YieldOp::create(b, l, currentDimensionOffset);
+                        })
                         .getResult(0);
-                b.create<scf::YieldOp>(
-                    l, ValueRange{sameSize, newDimensionOffset});
+                scf::YieldOp::create(b, l,
+                                     ValueRange{sameSize, newDimensionOffset});
               },
-              [&](OpBuilder &b, Location l) {
+              [&](OpBuilder& b, Location l) {
                 Value newRunningProduct =
-                    b.create<arith::MulIOp>(l, runningProduct, sameSize);
-                b.create<scf::YieldOp>(
-                    l, ValueRange{newRunningProduct, currentDimensionOffset});
+                    arith::MulIOp::create(b, l, runningProduct, sameSize);
+                scf::YieldOp::create(
+                    b, l,
+                    ValueRange{newRunningProduct, currentDimensionOffset});
               });
           // Add the remaining results.
           noBroadcasting.push_back(ifStopCombiningDimensions.getResult(0));
           noBroadcasting.push_back(ifStopCombiningDimensions.getResult(1));
           Value isInvalid = vr.back();
-          isInvalid = b.create<arith::OrIOp>(
-              l, isInvalid, currentDimensionHasInvalidBroadcast);
+          isInvalid = arith::OrIOp::create(b, l, isInvalid,
+                                           currentDimensionHasInvalidBroadcast);
           noBroadcasting.push_back(isInvalid);
-          b.create<scf::YieldOp>(l, noBroadcasting);
+          scf::YieldOp::create(b, l, noBroadcasting);
         });
     Value isInvalid = mainLoop.getResults().back();
     for (size_t i = 0; i < k; ++i) {
       resultShapes[i] =
           removeLeadingOnesFrom1DMemref(lb, resultShapes[i], ranks[i]);
       resultShapes[i] =
-          lb.create<arith::SelectOp>(isInvalid, shapes[i], resultShapes[i]);
+          arith::SelectOp::create(lb, isInvalid, shapes[i], resultShapes[i]);
     }
     rewriter.replaceOp(broadcastShapesOp, resultShapes);
     return success();
@@ -346,20 +345,20 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
     // boolean flag for whether every size so far was 1, one with the number of
     // leading 1's.
     Value constantTrue =
-        lb.create<arith::ConstantOp>(lb.getI1Type(), lb.getBoolAttr(true));
-    Value zero = lb.create<arith::ConstantIndexOp>(0);
-    Value one = lb.create<arith::ConstantIndexOp>(1);
-    auto leadingOnesLoop = lb.create<scf::ForOp>(
-        zero, rank, one, ValueRange{constantTrue, zero},
-        [&](OpBuilder &b, Location l, Value idx, ValueRange vr) {
-          auto size = b.create<memref::LoadOp>(l, extentMemref, idx);
+        arith::ConstantOp::create(lb, lb.getI1Type(), lb.getBoolAttr(true));
+    Value zero = arith::ConstantIndexOp::create(lb, 0);
+    Value one = arith::ConstantIndexOp::create(lb, 1);
+    auto leadingOnesLoop = scf::ForOp::create(
+        lb, zero, rank, one, ValueRange{constantTrue, zero},
+        [&](OpBuilder& b, Location l, Value idx, ValueRange vr) {
+          auto size = memref::LoadOp::create(b, l, extentMemref, idx);
           auto isEqualToOne =
-              b.create<arith::CmpIOp>(l, arith::CmpIPredicate::eq, size, one);
-          auto allOnes = b.create<arith::AndIOp>(l, vr.front(), isEqualToOne);
-          auto increasedValue = b.create<arith::AddIOp>(l, vr.back(), one);
+              arith::CmpIOp::create(b, l, arith::CmpIPredicate::eq, size, one);
+          auto allOnes = arith::AndIOp::create(b, l, vr.front(), isEqualToOne);
+          auto increasedValue = arith::AddIOp::create(b, l, vr.back(), one);
           auto numberOfLeadingOnes =
-              b.create<arith::SelectOp>(l, allOnes, increasedValue, vr.back());
-          b.create<scf::YieldOp>(l, ValueRange{allOnes, numberOfLeadingOnes});
+              arith::SelectOp::create(b, l, allOnes, increasedValue, vr.back());
+          scf::YieldOp::create(b, l, ValueRange{allOnes, numberOfLeadingOnes});
         });
     return leadingOnesLoop.getResults()[1];
   }
@@ -367,7 +366,7 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
   Value removeLeadingOnesFrom1DMemref(ImplicitLocOpBuilder &lb,
                                       Value extentMemref, Value rank) const {
     Value leadingOnes = countLeadingOnes(lb, extentMemref, rank);
-    Value newRank = lb.create<arith::SubIOp>(rank, leadingOnes);
+    Value newRank = arith::SubIOp::create(lb, rank, leadingOnes);
     auto resultType =
         MemRefType::get({ShapedType::kDynamic}, lb.getIndexType());
     // We cannot use SubView here to return a MemRef with 'leading_ones' as
@@ -377,16 +376,16 @@ struct BufferizeAndConvertMinimumBroadcastShapesOp
     // another buffer of the desired size and copy the elements over. We assume
     // the buffer will be small, so we allocate it on the stack.
     // TODO(b/181654096): Replace AllocaOp with AllocOp.
-    Value result = lb.create<memref::AllocaOp>(resultType, newRank);
-    Value zero = lb.create<arith::ConstantIndexOp>(0);
-    Value one = lb.create<arith::ConstantIndexOp>(1);
-    lb.create<scf::ForOp>(
-        zero, newRank, one, mlir::ValueRange(),
+    Value result = memref::AllocaOp::create(lb, resultType, newRank);
+    Value zero = arith::ConstantIndexOp::create(lb, 0);
+    Value one = arith::ConstantIndexOp::create(lb, 1);
+    scf::ForOp::create(
+        lb, zero, newRank, one, mlir::ValueRange(),
         [&](OpBuilder& b, Location l, Value idx, ValueRange /*vr*/) {
-          Value idxWithOffset = b.create<arith::AddIOp>(l, idx, leadingOnes);
-          auto size = b.create<memref::LoadOp>(l, extentMemref, idxWithOffset);
-          b.create<memref::StoreOp>(l, size, result, idx);
-          b.create<scf::YieldOp>(l, mlir::ValueRange());
+          Value idxWithOffset = arith::AddIOp::create(b, l, idx, leadingOnes);
+          auto size = memref::LoadOp::create(b, l, extentMemref, idxWithOffset);
+          memref::StoreOp::create(b, l, size, result, idx);
+          scf::YieldOp::create(b, l, mlir::ValueRange());
         });
     return result;
   }
diff --git a/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc b/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc
index 2e9ea7a8a59254..fa572362080eaa 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/bufferize_pass.cc
@@ -93,7 +93,7 @@ static Value materializeToTensor(OpBuilder& builder, TensorType type,
                                  ValueRange inputs, Location loc) {
   assert(inputs.size() == 1);
   assert(mlir::isa<BaseMemRefType>(inputs[0].getType()));
-  return builder.create<bufferization::ToTensorOp>(loc, type, inputs[0]);
+  return bufferization::ToTensorOp::create(builder, loc, type, inputs[0]);
 }
 
 // TODO(pifon): Remove as soon as https://reviews.llvm.org/D93126 is landed.
@@ -129,7 +129,7 @@ class CustomBufferizeTypeConverter : public mlir::TypeConverter {
       }
       if (isa<TensorType>(inputs[0].getType())) {
         // Tensor to MemRef cast.
-        return builder.create<bufferization::ToBufferOp>(loc, type, inputs[0]);
+        return bufferization::ToBufferOp::create(builder, loc, type, inputs[0]);
       }
       llvm_unreachable("only tensor/memref input types supported");
     });
@@ -146,7 +146,7 @@ class CustomBufferizeTypeConverter : public mlir::TypeConverter {
         return inputs[0];
       }
       assert(mlir::isa<TensorType>(inputs[0].getType()));
-      return builder.create<bufferization::ToBufferOp>(loc, type, inputs[0]);
+      return bufferization::ToBufferOp::create(builder, loc, type, inputs[0]);
     });
   }
 };
diff --git a/third_party/xla/xla/mlir_hlo/transforms/detensorize_scf_ops.cc b/third_party/xla/xla/mlir_hlo/transforms/detensorize_scf_ops.cc
index 12d8b3814646e7..402fd7b112c432 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/detensorize_scf_ops.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/detensorize_scf_ops.cc
@@ -62,7 +62,7 @@ struct RegionOpPattern : public OpRewritePattern<T> {
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     b.setInsertionPoint(result);
     for (auto [index, operand] : unitTensors(result->getOperands())) {
-      result->setOperand(index, b.create<tensor::ExtractOp>(operand));
+      result->setOperand(index, tensor::ExtractOp::create(b, operand));
     }
 
     // Fix any block arguments in the op. We're detensorizing all arguments that
@@ -76,8 +76,8 @@ struct RegionOpPattern : public OpRewritePattern<T> {
           // Change the argument type to a scalar, but repack it into a tensor.
           arg.setType(
               mlir::cast<RankedTensorType>(arg.getType()).getElementType());
-          auto converted = b.create<tensor::FromElementsOp>(
-              RankedTensorType::get({}, arg.getType()), arg);
+          auto converted = tensor::FromElementsOp::create(
+              b, RankedTensorType::get({}, arg.getType()), arg);
           arg.replaceAllUsesExcept(converted, converted.getOperation());
         }
 
@@ -86,7 +86,7 @@ struct RegionOpPattern : public OpRewritePattern<T> {
              unitTensors(block.getTerminator()->getOperands())) {
           b.setInsertionPoint(block.getTerminator());
           block.getTerminator()->setOperand(
-              index, b.create<tensor::ExtractOp>(operand));
+              index, tensor::ExtractOp::create(b, operand));
         }
       }
     }
@@ -99,7 +99,7 @@ struct RegionOpPattern : public OpRewritePattern<T> {
       opResult.setType(oldType.getElementType());
 
       // Convert the scalar back to a tensor in the output.
-      results[index] = b.create<tensor::FromElementsOp>(oldType, opResult);
+      results[index] = tensor::FromElementsOp::create(b, oldType, opResult);
     }
     rewriter.replaceOp(op.getOperation(), results);
     return success();
diff --git a/third_party/xla/xla/mlir_hlo/transforms/lower_index_cast_pass.cc b/third_party/xla/xla/mlir_hlo/transforms/lower_index_cast_pass.cc
index b773792e67b5c4..0022fb5dfdc213 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/lower_index_cast_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/lower_index_cast_pass.cc
@@ -48,10 +48,10 @@ struct IndexCastConverter : public OpRewritePattern<T> {
         tensor::createDynamicDimValues(rewriter, op.getLoc(), op.getIn());
     rewriter.replaceOpWithNewOp<tensor::GenerateOp>(
         op, resultTy, dynamicExtents,
-        [&](OpBuilder &b, Location loc, ValueRange args) {
-          Value extent = b.create<tensor::ExtractOp>(loc, op.getIn(), args);
-          Value cast = b.create<T>(loc, resultTy.getElementType(), extent);
-          b.create<tensor::YieldOp>(loc, cast);
+        [&](OpBuilder& b, Location loc, ValueRange args) {
+          Value extent = tensor::ExtractOp::create(b, loc, op.getIn(), args);
+          Value cast = T::create(b, loc, resultTy.getElementType(), extent);
+          tensor::YieldOp::create(b, loc, cast);
         });
     return success();
   }
diff --git a/third_party/xla/xla/mlir_hlo/transforms/tile_loops_pass.cc b/third_party/xla/xla/mlir_hlo/transforms/tile_loops_pass.cc
index d6efd72d2437c0..8d624f3afd8f31 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/tile_loops_pass.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/tile_loops_pass.cc
@@ -116,7 +116,7 @@ void TileLoopsPass::runOnOperation() {
       int64_t difference = upper[i].value() - lower[i].value();
       if (difference % (step[i].value() * unrollFactor) != 0) continue;
       ploop.getUpperBoundMutable().slice(i, 1).assign(
-          builder.create<arith::ConstantIndexOp>(loc, unrollFactor));
+          arith::ConstantIndexOp::create(builder, loc, unrollFactor));
     }
   }
 
diff --git a/third_party/xla/xla/mlir_hlo/transforms/vectorize_copy.cc b/third_party/xla/xla/mlir_hlo/transforms/vectorize_copy.cc
index 5650e83be0c2d4..28bdded6ed8376 100644
--- a/third_party/xla/xla/mlir_hlo/transforms/vectorize_copy.cc
+++ b/third_party/xla/xla/mlir_hlo/transforms/vectorize_copy.cc
@@ -88,7 +88,7 @@ struct TileCopyPattern : public OpRewritePattern<memref::CopyOp> {
                          targetType.getNumElements() <= tileSize;
 
     if (isContiguous || isSmall) {
-      rewriter.create<memref::CopyOp>(loc, src, target);
+      memref::CopyOp::create(rewriter, loc, src, target);
       return;
     }
 
@@ -99,14 +99,14 @@ struct TileCopyPattern : public OpRewritePattern<memref::CopyOp> {
     const int64_t remainderSize = dimSize % sliceSize;
     const int64_t upperBound = shape[dim] - remainderSize;
 
-    Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+    Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
     Value tileSizeValue =
-        rewriter.create<arith::ConstantIndexOp>(loc, sliceSize);
+        arith::ConstantIndexOp::create(rewriter, loc, sliceSize);
     Value upperBoundValue =
-        rewriter.create<arith::ConstantIndexOp>(loc, upperBound);
+        arith::ConstantIndexOp::create(rewriter, loc, upperBound);
 
-    auto loop = rewriter.create<scf::ForOp>(loc, zero, upperBoundValue,
-                                            tileSizeValue, target);
+    auto loop = scf::ForOp::create(rewriter, loc, zero, upperBoundValue,
+                                   tileSizeValue, target);
 
     OpBuilder::InsertionGuard g(rewriter);
     rewriter.setInsertionPointToStart(loop.getBody());
@@ -123,7 +123,7 @@ struct TileCopyPattern : public OpRewritePattern<memref::CopyOp> {
     createLoopsNest(rewriter, loc, dim + 1, srcSubview, targetSubview, shape,
                     offsets, sizes, strides);
 
-    rewriter.create<scf::YieldOp>(loc, loop.getRegionIterArgs()[0]);
+    scf::YieldOp::create(rewriter, loc, loop.getRegionIterArgs()[0]);
 
     // Remainder copy can only be created for the innermost loop, for other
     // loops remainder size is guaranteed to be 0.
@@ -138,8 +138,8 @@ struct TileCopyPattern : public OpRewritePattern<memref::CopyOp> {
       Value targetRemainderSubview =
           getSubView(rewriter, loc, target, shape, offsets, sizes, strides);
 
-      rewriter.create<memref::CopyOp>(loc, srcRemainderSubview,
-                                      targetRemainderSubview);
+      memref::CopyOp::create(rewriter, loc, srcRemainderSubview,
+                             targetRemainderSubview);
     }
   }
 
@@ -154,8 +154,8 @@ struct TileCopyPattern : public OpRewritePattern<memref::CopyOp> {
         cast<MemRefType>(memref::SubViewOp::inferRankReducedResultType(
             shape, valType, offsets, sizes, strides));
 
-    return rewriter.create<memref::SubViewOp>(loc, valSubviewType, val, offsets,
-                                              sizes, strides);
+    return memref::SubViewOp::create(rewriter, loc, valSubviewType, val,
+                                     offsets, sizes, strides);
   }
 
   int64_t tileSize;
diff --git a/third_party/xla/xla/service/spmd/shardy/round_trip_common/open_while_free_vars_sharding.cc b/third_party/xla/xla/service/spmd/shardy/round_trip_common/open_while_free_vars_sharding.cc
index dc1506711a2a1d..3faf853f49cf11 100644
--- a/third_party/xla/xla/service/spmd/shardy/round_trip_common/open_while_free_vars_sharding.cc
+++ b/third_party/xla/xla/service/spmd/shardy/round_trip_common/open_while_free_vars_sharding.cc
@@ -65,9 +65,8 @@ class OpenWhileFreeVarsShardingPass
           // a sharding constraint.
           continue;
         }
-        auto shardingConstraint =
-            rewriter.create<mlir::sdy::ShardingConstraintOp>(
-                freeVar.getLoc(), freeVar, fullyOpenSharding);
+        auto shardingConstraint = mlir::sdy::ShardingConstraintOp::create(
+            rewriter, freeVar.getLoc(), freeVar, fullyOpenSharding);
         // Only replace uses in the regions of the while op.
         rewriter.replaceUsesWithIf(
             freeVar, shardingConstraint, [op](mlir::OpOperand& use) {
diff --git a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_callback_custom_calls.cc b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_callback_custom_calls.cc
index 5943506c60909f..e2c26b649550bc 100644
--- a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_callback_custom_calls.cc
+++ b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_callback_custom_calls.cc
@@ -59,8 +59,8 @@ void replaceCallbackWithTupleVersion(CustomCallOp customCall) {
       mlir::TupleType::get(customCall->getContext(),
                            {customCall->getResultTypes()}),
       rewriter);
-  auto getTupleElement = rewriter.create<mlir::stablehlo::GetTupleElementOp>(
-      customCall.getLoc(), customCall->getResultTypes().front(),
+  auto getTupleElement = mlir::stablehlo::GetTupleElementOp::create(
+      rewriter, customCall.getLoc(), customCall->getResultTypes().front(),
       tupleCustomCall.getResult(0), rewriter.getI32IntegerAttr(0));
   getTupleElement->setAttr(kXlaShardingAttr,
                            customCall->getAttr(kXlaShardingAttr));
diff --git a/third_party/xla/xla/service/spmd/shardy/utils.cc b/third_party/xla/xla/service/spmd/shardy/utils.cc
index d3a4bd26971cef..91f44339a22565 100644
--- a/third_party/xla/xla/service/spmd/shardy/utils.cc
+++ b/third_party/xla/xla/service/spmd/shardy/utils.cc
@@ -260,12 +260,12 @@ void adjustOutputSharding(
 CustomCallOp cloneCustomCallWithNewResultTypes(CustomCallOp op,
                                                mlir::TypeRange resultTypes,
                                                mlir::IRRewriter& rewriter) {
-  auto customCallOp = rewriter.create<CustomCallOp>(
-      op.getLoc(), resultTypes, op.getOperands(), op.getCallTargetNameAttr(),
-      op.getHasSideEffectAttr(), op.getBackendConfigAttr(),
-      op.getApiVersionAttr(), op.getCalledComputations(),
-      op.getOperandLayoutsAttr(), op.getResultLayoutsAttr(),
-      op.getOutputOperandAliases());
+  auto customCallOp = CustomCallOp::create(
+      rewriter, op.getLoc(), resultTypes, op.getOperands(),
+      op.getCallTargetNameAttr(), op.getHasSideEffectAttr(),
+      op.getBackendConfigAttr(), op.getApiVersionAttr(),
+      op.getCalledComputations(), op.getOperandLayoutsAttr(),
+      op.getResultLayoutsAttr(), op.getOutputOperandAliases());
   customCallOp->setDiscardableAttrs(mlir::DictionaryAttr::get(
       op->getContext(), llvm::to_vector(op->getDiscardableAttrs())));
   return customCallOp;

From 4baf73c4e8c906cc97dceb0f399fdce56604843b Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 18 Dec 2025 06:48:13 -0800
Subject: [PATCH 527/753] Refactor: Extract AotCompilationResult to
 CompiledModule.

This change moves the definition of `AotCompilationResult` into a new header file `compiled_module.h` and renames the class to `CompiledModule`. `CompilationResult` would have been the preferred name, but it's already in-use elsewhere.

The original `AotCompilationResult` is kept as a deprecated alias.

PiperOrigin-RevId: 846246415
---
 third_party/xla/xla/service/BUILD             | 14 ++++-
 third_party/xla/xla/service/compiled_module.h | 55 +++++++++++++++++++
 third_party/xla/xla/service/compiler.h        | 33 +----------
 3 files changed, 70 insertions(+), 32 deletions(-)
 create mode 100644 third_party/xla/xla/service/compiled_module.h

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index d049724ae6fe90..640992b0283e5b 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -1586,6 +1586,18 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "compiled_module",
+    hdrs = ["compiled_module.h"],
+    deps = [
+        ":buffer_assignment",
+        "//xla/hlo/ir:hlo",
+        "//xla/stream_executor:stream_executor_h",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+    ],
+)
+
 cc_library(
     name = "compiler",
     srcs = ["compiler.cc"],
@@ -1595,8 +1607,8 @@ cc_library(
         "//xla/backends/gpu/runtime:__subpackages__",
     ]),
     deps = [
-        ":buffer_assignment",
         ":buffer_value",
+        ":compiled_module",
         ":computation_placer",
         ":executable",
         ":hlo_cost_analysis",
diff --git a/third_party/xla/xla/service/compiled_module.h b/third_party/xla/xla/service/compiled_module.h
new file mode 100644
index 00000000000000..7c4f8f6f305aaf
--- /dev/null
+++ b/third_party/xla/xla/service/compiled_module.h
@@ -0,0 +1,55 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_COMPILED_MODULE_H_
+#define XLA_SERVICE_COMPILED_MODULE_H_
+
+#include <memory>
+#include <string>
+
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/stream_executor/stream_executor.h"
+
+namespace xla {
+
+class Executable;
+
+// Abstract superclass describing the result of an ahead-of-time compilation.
+class CompiledModule {
+ public:
+  virtual ~CompiledModule() = default;
+
+  virtual absl::StatusOr<std::string> SerializeAsString() const = 0;
+
+  virtual absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
+      const stream_executor::StreamExecutor* executor) && = 0;
+
+  virtual absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
+      const {
+    return absl::UnimplementedError("buffer_assignment is not supported.");
+  }
+
+  // Returns the optimized HLO module if one was computed and the implementation
+  // supports it.
+  virtual const HloModule* optimized_module() const = 0;
+  virtual std::shared_ptr<HloModule> shared_optimized_module() = 0;
+};
+
+}  // namespace xla
+
+#endif  // XLA_SERVICE_COMPILED_MODULE_H_
diff --git a/third_party/xla/xla/service/compiler.h b/third_party/xla/xla/service/compiler.h
index ee2fac51790803..7938e3d1a50588 100644
--- a/third_party/xla/xla/service/compiler.h
+++ b/third_party/xla/xla/service/compiler.h
@@ -41,8 +41,8 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
-#include "xla/service/buffer_assignment.h"
 #include "xla/service/buffer_value.h"
+#include "xla/service/compiled_module.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/executable.h"
 #include "xla/service/hlo_cost_analysis.h"
@@ -71,36 +71,7 @@ using ObjectFileData = std::vector<char>;
 
 class AotCompilationOptions;
 
-// Abstract superclass describing the result of an ahead-of-time compilation.
-class AotCompilationResult {
- public:
-  AotCompilationResult(const AotCompilationResult&) = delete;
-  AotCompilationResult& operator=(AotCompilationResult const&) = delete;
-
-  virtual ~AotCompilationResult() = default;
-
-  virtual absl::StatusOr<std::string> SerializeAsString() const {
-    return Unimplemented("SerializeAsString unimplemented.");
-  }
-
-  virtual absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      const se::StreamExecutor* executor) && {
-    return Unimplemented("LoadExecutable unimplemented.");
-  }
-
-  virtual absl::StatusOr<std::unique_ptr<BufferAssignment>> buffer_assignment()
-      const {
-    return Unimplemented("buffer_assignment unimplemented.");
-  }
-
-  // Returns the optimized HLO module if one was computed and the implementation
-  // supports it.
-  virtual const HloModule* optimized_module() const = 0;
-  virtual std::shared_ptr<HloModule> shared_optimized_module() = 0;
-
- protected:
-  AotCompilationResult() = default;
-};
+using AotCompilationResult ABSL_DEPRECATE_AND_INLINE() = CompiledModule;
 
 // Abstract superclass describing metadata produced during ahead-of-time
 // compilation.

From 9d0d22dbaa24ce6b196c70cac0490085859e9470 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 18 Dec 2025 07:10:07 -0800
Subject: [PATCH 528/753] [PJRT] Change the two optimizations in Transpose to
 operate on Loop nests, rather than on the original dimensions.

These are simpler both to write and to think about.

No behavior changes are intended.

PiperOrigin-RevId: 846253300
---
 third_party/xla/xla/pjrt/transpose.cc      | 210 +++++++++------------
 third_party/xla/xla/pjrt/transpose.h       |  78 ++++----
 third_party/xla/xla/pjrt/transpose_test.cc | 165 +++++++++-------
 3 files changed, 214 insertions(+), 239 deletions(-)

diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index b352f02d531325..c7eb090396085c 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -548,124 +548,6 @@ static void ComputeStrides(
   }
 }
 
-void TransposePlan::RemoveTrivialDimensions(
-    absl::InlinedVector<int64_t, 4>& a_dims,
-    absl::InlinedVector<int64_t, 4>& permutation,
-    absl::InlinedVector<int64_t, 4>& lda,
-    absl::InlinedVector<int64_t, 4>& lda_tile,
-    absl::InlinedVector<int64_t, 4>& a_tiling,
-    absl::InlinedVector<int64_t, 4>& b_tiling) {
-  int ndim = a_dims.size();
-  // How many positions has the i-th dimension of 'a' been moved to the left?
-  // -1 if the dimension is to be removed.
-  std::vector<int> shift(ndim);
-  absl::InlinedVector<int64_t, 4> updated_a_dims;
-  absl::InlinedVector<int64_t, 4> updated_lda;
-  absl::InlinedVector<int64_t, 4> updated_lda_tile;
-  absl::InlinedVector<int64_t, 4> updated_a_tiling;
-  updated_a_dims.reserve(ndim);
-  updated_lda.reserve(ndim);
-  updated_lda_tile.reserve(ndim);
-  updated_a_tiling.reserve(ndim);
-  std::vector<int64_t> inv_permutation = InversePermutation(permutation);
-  for (int a_dim = 0; a_dim < ndim; ++a_dim) {
-    int b_dim = inv_permutation[a_dim];
-    // A dimension is trivial if it has size 1 and is not tiled.
-    if (a_dims[a_dim] == 1 && a_tiling[a_dim] == 1 && b_tiling[b_dim] == 1) {
-      shift[a_dim] = -1;
-    } else {
-      updated_a_dims.push_back(a_dims[a_dim]);
-      updated_lda.push_back(lda[a_dim]);
-      updated_lda_tile.push_back(lda_tile[a_dim]);
-      updated_a_tiling.push_back(a_tiling[a_dim]);
-      shift[a_dim] = a_dim + 1 - updated_a_dims.size();
-    }
-  }
-
-  // Updates the permutation and tiling of b.
-  absl::InlinedVector<int64_t, 4> updated_permutation;
-  absl::InlinedVector<int64_t, 4> updated_b_tiling;
-  updated_permutation.reserve(updated_a_dims.size());
-  updated_b_tiling.reserve(updated_a_dims.size());
-  for (int b_dim = 0; b_dim < ndim; ++b_dim) {
-    int a_dim = permutation[b_dim];
-    if (shift[a_dim] >= 0) {
-      updated_permutation.push_back(a_dim - shift[a_dim]);
-      updated_b_tiling.push_back(b_tiling[b_dim]);
-    }
-  }
-
-  DCHECK(IsPermutation(updated_permutation));
-  a_dims = std::move(updated_a_dims);
-  permutation = std::move(updated_permutation);
-  lda = std::move(updated_lda);
-  lda_tile = std::move(updated_lda_tile);
-  a_tiling = std::move(updated_a_tiling);
-  b_tiling = std::move(updated_b_tiling);
-}
-
-void TransposePlan::CoalesceDimensions(
-    absl::InlinedVector<int64_t, 4>& a_dims,
-    absl::InlinedVector<int64_t, 4>& permutation,
-    absl::InlinedVector<int64_t, 4>& lda,
-    absl::InlinedVector<int64_t, 4>& lda_tile,
-    absl::InlinedVector<int64_t, 4>& a_tiling,
-    absl::InlinedVector<int64_t, 4>& b_tiling) {
-  int ndim = a_dims.size();
-  // How many positions has the i-th dimension of 'a' been moved to the left?
-  // -1 if the dimension is to be removed.
-  std::vector<int> shift(ndim, 0);
-  absl::InlinedVector<int64_t, 4> updated_a_dims;
-  absl::InlinedVector<int64_t, 4> updated_lda;
-  absl::InlinedVector<int64_t, 4> updated_lda_tile;
-  absl::InlinedVector<int64_t, 4> updated_a_tiling;
-  updated_a_dims.reserve(ndim);
-  updated_lda.reserve(ndim);
-  updated_lda_tile.reserve(ndim);
-  updated_a_tiling.reserve(ndim);
-  std::vector<int64_t> inv_permutation = InversePermutation(permutation);
-  for (int a_dim = 0; a_dim < ndim; ++a_dim) {
-    // We can coalesce two dimensions if they appear consecutively
-    // in both the input dimensions and the output dimensions, and the stride
-    // of the outer dimension is the usual multiple of the inner dimension.
-    if (a_dim > 0 && inv_permutation[a_dim - 1] + 1 == inv_permutation[a_dim] &&
-        lda[a_dim - 1] == lda[a_dim] * a_dims[a_dim] &&
-        a_tiling[a_dim - 1] == 1 && a_tiling[a_dim] == 1 &&
-        b_tiling[inv_permutation[a_dim]] == 1 &&
-        b_tiling[inv_permutation[a_dim - 1]] == 1) {
-      updated_a_dims.back() *= a_dims[a_dim];
-      updated_lda.back() = lda[a_dim];
-      shift[a_dim] = -1;
-    } else {
-      updated_a_dims.push_back(a_dims[a_dim]);
-      updated_lda.push_back(lda[a_dim]);
-      updated_lda_tile.push_back(lda_tile[a_dim]);
-      updated_a_tiling.push_back(a_tiling[a_dim]);
-      shift[a_dim] = a_dim + 1 - updated_a_dims.size();
-    }
-  }
-
-  // Updates the permutation.
-  absl::InlinedVector<int64_t, 4> updated_permutation;
-  absl::InlinedVector<int64_t, 4> updated_b_tiling;
-  updated_permutation.reserve(updated_a_dims.size());
-  updated_b_tiling.reserve(updated_a_dims.size());
-  for (int b_dim = 0; b_dim < ndim; ++b_dim) {
-    int a_dim = permutation[b_dim];
-    if (shift[a_dim] >= 0) {
-      updated_permutation.push_back(a_dim - shift[a_dim]);
-      updated_b_tiling.push_back(b_tiling[b_dim]);
-    }
-  }
-  DCHECK(IsPermutation(updated_permutation));
-  a_dims = std::move(updated_a_dims);
-  permutation = std::move(updated_permutation);
-  lda = std::move(updated_lda);
-  lda_tile = std::move(updated_lda_tile);
-  a_tiling = std::move(updated_a_tiling);
-  b_tiling = std::move(updated_b_tiling);
-}
-
 int64_t TransposePlan::InputNumElems() const {
   int64_t size = 1;
   for (size_t i = 0; i < a_dims_.size(); ++i) {
@@ -712,6 +594,15 @@ static absl::Status ParseTilingSpecification(
   return absl::OkStatus();
 }
 
+bool TransposePlan::Loop::operator==(const Loop& other) const {
+  return dim_in_a == other.dim_in_a && tile_interior == other.tile_interior &&
+         dim_size == other.dim_size && tile_size == other.tile_size &&
+         lda == other.lda && ldb == other.ldb &&
+         is_inner_dim_in_a == other.is_inner_dim_in_a &&
+         is_inner_dim_in_b == other.is_inner_dim_in_b &&
+         parallelism == other.parallelism;
+}
+
 // Helper function that builds a plan.
 void TransposePlan::BuildPlanNodes(int thread_id,
                                    std::vector<TransposePlan::Node>& nodes) {
@@ -1063,11 +954,6 @@ void TransposePlan::Initialize() {
   if (num_elems_ == 0) {
     return;
   }
-  RemoveTrivialDimensions(a_dims_, permutation_, lda_, lda_tile_, a_tiling_,
-                          b_tiling_);
-  CoalesceDimensions(a_dims_, permutation_, lda_, lda_tile_, a_tiling_,
-                     b_tiling_);
-
   // permutation maps dimensions of b to a
   // inverse_permutation maps dimensions of a to b
   std::vector<int64_t> inverse_permutation = InversePermutation(permutation_);
@@ -1144,6 +1030,9 @@ void TransposePlan::Initialize() {
     }
   }
 
+  RemoveTrivialLoops(loop_order_);
+  CoalesceLoops(loop_order_);
+
   // Bound the block sizes so they are smaller than the stride-1 dimension
   // size.
   int64_t a_stride1_size = std::max(
@@ -1425,4 +1314,79 @@ absl::StatusOr<std::shared_ptr<TransposePlan>> TransposePlanCache::GetOrCreate(
       });
 }
 
+/*static*/ void TransposePlan::RemoveTrivialLoops(std::vector<Loop>& loops) {
+  auto it = std::remove_if(loops.begin(), loops.end(), [](const Loop& loop) {
+    // We must preserve the loop if it corresponds to the innermost dimension
+    // of the layout, because the kernels (especially TransposeConstStride1)
+    // rely on finding a node with is_inner_dim_in_a/b set to true.
+    if (loop.is_inner_dim_in_a || loop.is_inner_dim_in_b) {
+      return false;
+    }
+    if (loop.tile_interior) {
+      return loop.tile_size == 1;
+    }
+    // Exterior loop.
+    // Trivial if dim_size == tile_size (1 complete tile, no partials). This
+    // also accounts for the case where the dimension is of size 1, since in
+    // that case the tile size is also 1.
+    return loop.dim_size == loop.tile_size;
+  });
+  loops.erase(it, loops.end());
+}
+
+/*static*/ void TransposePlan::CoalesceLoops(std::vector<Loop>& loops) {
+  if (loops.empty()) {
+    return;
+  }
+
+  // Coalesce from slow-varying to fast-varying (outer to inner).
+  // loop_order_[0] is slowest.
+  int write_pos = 0;
+  for (int read_pos = 1; read_pos < loops.size(); ++read_pos) {
+    Loop& outer = loops[write_pos];
+    const Loop& inner = loops[read_pos];
+
+    int64_t inner_iter_size = inner.tile_interior
+                                  ? inner.tile_size
+                                  : (inner.dim_size / inner.tile_size);
+
+    // Two loops can be coalesced if:
+    // * they are both tile interiors or both tile exteriors
+    // * neither has a partial tile
+    // * the inner loop is a multiple of the outer loop.
+    // TODO(phawkins): I suspect this condition can be simplified. In particular
+    // the condition that we separate tile exteriors from interiors feels
+    // arbitrary.
+    bool coalescable = (outer.tile_interior == inner.tile_interior) &&
+                       (outer.dim_size % outer.tile_size == 0) &&
+                       (inner.dim_size % inner.tile_size == 0) &&
+                       (outer.lda == inner.lda * inner_iter_size) &&
+                       (outer.ldb == inner.ldb * inner_iter_size);
+    if (coalescable) {
+      if (outer.tile_interior) {
+        outer.tile_size *= inner.tile_size;
+        outer.dim_size *= inner.dim_size;
+      } else {
+        outer.dim_size *= inner_iter_size;
+      }
+
+      outer.lda = inner.lda;
+      outer.ldb = inner.ldb;
+
+      outer.is_inner_dim_in_a =
+          inner.is_inner_dim_in_a || outer.is_inner_dim_in_a;
+      outer.is_inner_dim_in_b =
+          inner.is_inner_dim_in_b || outer.is_inner_dim_in_b;
+
+      // Don't advance write_pos, so we can merge more into 'outer'.
+    } else {
+      ++write_pos;
+      if (write_pos != read_pos) {
+        loops[write_pos] = inner;
+      }
+    }
+  }
+  loops.resize(write_pos + 1);
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index c428d2df0f7f9d..aef51be791a04b 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -147,23 +147,40 @@ class TransposePlan {
  protected:
   // Methods protected so they can be accessed by tests.
 
-  // Removes any size-1 dimensions.
-  static void RemoveTrivialDimensions(
-      absl::InlinedVector<int64_t, 4>& a_dims,
-      absl::InlinedVector<int64_t, 4>& permutation,
-      absl::InlinedVector<int64_t, 4>& lda,
-      absl::InlinedVector<int64_t, 4>& lda_tile,
-      absl::InlinedVector<int64_t, 4>& a_tiling,
-      absl::InlinedVector<int64_t, 4>& b_tiling);
-
-  // Collapses together dimensions that are adjacent both in `dims` and
-  // `permutation`.
-  static void CoalesceDimensions(absl::InlinedVector<int64_t, 4>& a_dims,
-                                 absl::InlinedVector<int64_t, 4>& permutation,
-                                 absl::InlinedVector<int64_t, 4>& lda,
-                                 absl::InlinedVector<int64_t, 4>& lda_tile,
-                                 absl::InlinedVector<int64_t, 4>& a_tiling,
-                                 absl::InlinedVector<int64_t, 4>& b_tiling);
+  struct Loop {
+    // Dimension number in A from which this loop originated. This is mostly
+    // for debugging the plan.
+    int dim_in_a;
+
+    // If true, the loop iterates over the interior of a tile.
+    // For an untiled dimension, this is always false. For a tiled dimension,
+    // we will have two loops: one over the tile exteriors and one over the tile
+    // interiors.
+    bool tile_interior;
+
+    // Size of the iteration space.
+    int64_t dim_size;
+
+    // Size of the tiles, if this a tiled dimension.
+    int64_t tile_size;
+
+    int64_t lda;  // Stride in A for this loop.
+    int64_t ldb;  // Stride in B for this loop.
+
+    // Is this the innermost (stride 1) dimension in A or B? These dimensions
+    // are special for the kernels.
+    bool is_inner_dim_in_a;
+    bool is_inner_dim_in_b;
+
+    // Number of parallel threads to use for this loop.
+    int64_t parallelism;
+
+    bool operator==(const Loop& other) const;
+  };
+
+  // Exposed for testing.
+  static void RemoveTrivialLoops(std::vector<Loop>& loops);
+  static void CoalesceLoops(std::vector<Loop>& loops);
 
  private:
   // Performs plan initialization that cannot fail.
@@ -221,34 +238,7 @@ class TransposePlan {
   bool b_is_tiled_;
 
   // Order to traverse dimensions, from slowest-varying to fastest-varying.
-  struct Loop {
-    // Dimension number in A from which this loop originated. This is mostly
-    // for debugging the plan.
-    int dim_in_a;
-
-    // If true, the loop iterates over the interior of a tile.
-    // For an untiled dimension, this is always false. For a tiled dimension,
-    // we will have two loops: one over the tile exteriors and one over the tile
-    // interiors.
-    bool tile_interior;
-
-    // Size of the iteration space.
-    int64_t dim_size;
 
-    // Size of the tiles, if this a tiled dimension.
-    int64_t tile_size;
-
-    int64_t lda;  // Stride in A for this loop.
-    int64_t ldb;  // Stride in B for this loop.
-
-    // Is this the innermost (stride 1) dimension in A or B? These dimensions
-    // are special for the kernels.
-    bool is_inner_dim_in_a;
-    bool is_inner_dim_in_b;
-
-    // Number of parallel threads to use for this loop.
-    int64_t parallelism;
-  };
   std::vector<Loop> loop_order_;
 
   // Root nodes of the plan, i.e., pointing to the outermost loops in the loop
diff --git a/third_party/xla/xla/pjrt/transpose_test.cc b/third_party/xla/xla/pjrt/transpose_test.cc
index 4ba51f5bcff8f5..c136540eee1175 100644
--- a/third_party/xla/xla/pjrt/transpose_test.cc
+++ b/third_party/xla/xla/pjrt/transpose_test.cc
@@ -54,82 +54,103 @@ namespace xla {
 
 class TestTransposePlan : public TransposePlan {
  public:
-  using TransposePlan::CoalesceDimensions;
-  using TransposePlan::RemoveTrivialDimensions;
+  using Loop = TransposePlan::Loop;
+  using TransposePlan::CoalesceLoops;
+  using TransposePlan::RemoveTrivialLoops;
 };
 
-TEST(TransposeTest, RemoveTrivialDimensions) {
-  absl::InlinedVector<int64_t, 4> dims = {4, 5, 1, 3, 1, 2, 5};
-  absl::InlinedVector<int64_t, 4> perm = {0, 2, 1, 4, 3, 6, 5};
-  absl::InlinedVector<int64_t, 4> lda = {2, 5, 7, 100, 3, 0, 1};
-  absl::InlinedVector<int64_t, 4> lda_tile = {1, 1, 1, 1, 1, 1, 1};
-  absl::InlinedVector<int64_t, 4> input_tiling = {1, 1, 1, 1, 1, 1, 1};
-  absl::InlinedVector<int64_t, 4> output_tiling = {1, 1, 1, 1, 1, 1, 1};
-  TestTransposePlan::RemoveTrivialDimensions(dims, perm, lda, lda_tile,
-                                             input_tiling, output_tiling);
-  EXPECT_THAT(dims, testing::ElementsAre(4, 5, 3, 2, 5));
-  EXPECT_THAT(perm, testing::ElementsAre(0, 1, 2, 4, 3));
-
-  dims = {4, 5, 3, 2, 5};
-  perm = {4, 3, 2, 1, 0};
-  lda = {2, 5, 100, 0, 1};
-  lda_tile = {1, 1, 1, 1, 1};
-  input_tiling = {1, 1, 1, 1, 1};
-  output_tiling = {1, 1, 1, 1, 1};
-  TestTransposePlan::RemoveTrivialDimensions(dims, perm, lda, lda_tile,
-                                             input_tiling, output_tiling);
-  EXPECT_THAT(dims, testing::ElementsAre(4, 5, 3, 2, 5));
-  EXPECT_THAT(perm, testing::ElementsAre(4, 3, 2, 1, 0));
+TEST(TransposeTest, RemoveTrivialLoops) {
+  using Loop = TestTransposePlan::Loop;
+  std::vector<Loop> loops;
+  // Exterior loop, trivial (size 1)
+  loops.push_back(Loop{/*dim_in_a=*/0, /*tile_interior=*/false, /*dim_size=*/1,
+                       /*tile_size=*/1});
+  // Exterior loop, trivial (dim_size == tile_size, 1 tile)
+  loops.push_back(Loop{/*dim_in_a=*/1, /*tile_interior=*/false, /*dim_size=*/10,
+                       /*tile_size=*/10});
+  // Exterior loop, non-trivial
+  loops.push_back(Loop{/*dim_in_a=*/2, /*tile_interior=*/false, /*dim_size=*/10,
+                       /*tile_size=*/2});
+  // Interior loop, trivial (size 1)
+  loops.push_back(Loop{/*dim_in_a=*/3, /*tile_interior=*/true, /*dim_size=*/10,
+                       /*tile_size=*/1});
+  // Interior loop, non-trivial
+  loops.push_back(Loop{/*dim_in_a=*/4, /*tile_interior=*/true, /*dim_size=*/10,
+                       /*tile_size=*/10});
+  // Trivial loop (size 1) but preserved because it is inner dim
+  loops.push_back(Loop{/*dim_in_a=*/5, /*tile_interior=*/false, /*dim_size=*/1,
+                       /*tile_size=*/1, /*lda=*/1, /*ldb=*/1,
+                       /*is_inner_dim_in_a=*/true,
+                       /*is_inner_dim_in_b=*/false});
+
+  TestTransposePlan::RemoveTrivialLoops(loops);
+
+  ASSERT_EQ(loops.size(), 3);
+  // Expect loop 2 (Exterior non-trivial)
+  EXPECT_EQ(loops[0].dim_in_a, 2);
+  EXPECT_EQ(loops[0].tile_interior, false);
+  // Expect loop 4 (Interior non-trivial)
+  EXPECT_EQ(loops[1].dim_in_a, 4);
+  EXPECT_EQ(loops[1].tile_interior, true);
+  // Expect loop 5 (Trivial but preserved)
+  EXPECT_EQ(loops[2].dim_in_a, 5);
+  EXPECT_EQ(loops[2].is_inner_dim_in_a, true);
 }
 
-TEST(TransposeTest, CoalesceDimensions) {
-  absl::InlinedVector<int64_t, 4> dims = {4, 5, 1, 3, 1, 2, 5};
-  absl::InlinedVector<int64_t, 4> perm = {0, 2, 1, 4, 3, 6, 5};
-  absl::InlinedVector<int64_t, 4> lda = {50, 30, 30, 10, 10, 5, 1};
-  absl::InlinedVector<int64_t, 4> lda_tile = {1, 1, 1, 1, 1, 1, 1};
-  absl::InlinedVector<int64_t, 4> input_tiling = {1, 1, 1, 1, 1, 1, 1};
-  absl::InlinedVector<int64_t, 4> output_tiling = {1, 1, 1, 1, 1, 1, 1};
-  TestTransposePlan::CoalesceDimensions(dims, perm, lda, lda_tile, input_tiling,
-                                        output_tiling);
-  EXPECT_THAT(dims, testing::ElementsAre(4, 5, 1, 3, 1, 2, 5));
-  EXPECT_THAT(perm, testing::ElementsAre(0, 2, 1, 4, 3, 6, 5));
-  EXPECT_THAT(lda, testing::ElementsAre(50, 30, 30, 10, 10, 5, 1));
-
-  dims = {4, 5, 3, 2, 5};
-  perm = {4, 1, 2, 3, 0};
-  lda = {150, 30, 10, 5, 1};
-  lda_tile = {1, 1, 1, 1, 1};
-  input_tiling = {1, 1, 1, 1, 1};
-  output_tiling = {1, 1, 1, 1, 1};
-  TestTransposePlan::CoalesceDimensions(dims, perm, lda, lda_tile, input_tiling,
-                                        output_tiling);
-  EXPECT_THAT(dims, testing::ElementsAre(4, 30, 5));
-  EXPECT_THAT(perm, testing::ElementsAre(2, 1, 0));
-  EXPECT_THAT(lda, testing::ElementsAre(150, 5, 1));
-
-  dims = {4, 5, 3, 2, 5};
-  perm = {0, 1, 2, 3, 4};
-  lda = {150, 30, 10, 5, 1};
-  lda_tile = {1, 1, 1, 1, 1};
-  input_tiling = {1, 1, 1, 1, 1};
-  output_tiling = {1, 1, 1, 1, 1};
-  TestTransposePlan::CoalesceDimensions(dims, perm, lda, lda_tile, input_tiling,
-                                        output_tiling);
-  EXPECT_THAT(dims, testing::ElementsAre(600));
-  EXPECT_THAT(perm, testing::ElementsAre(0));
-  EXPECT_THAT(lda, testing::ElementsAre(1));
-
-  dims = {4, 5, 3, 2, 5};
-  perm = {4, 1, 2, 3, 0};
-  lda = {150, 30, 10, 7, 1};  // Non-standard stridings prevent coalescing.
-  lda_tile = {1, 1, 1, 1, 1};
-  input_tiling = {1, 1, 1, 1, 1};
-  output_tiling = {1, 1, 1, 1, 1};
-  TestTransposePlan::CoalesceDimensions(dims, perm, lda, lda_tile, input_tiling,
-                                        output_tiling);
-  EXPECT_THAT(dims, testing::ElementsAre(4, 15, 2, 5));
-  EXPECT_THAT(perm, testing::ElementsAre(3, 1, 2, 0));
-  EXPECT_THAT(lda, testing::ElementsAre(150, 10, 7, 1));
+TEST(TransposeTest, CoalesceLoops) {
+  using Loop = TestTransposePlan::Loop;
+  std::vector<Loop> loops;
+
+  // Case 1: Compatible untiled loops
+  // Outer: size 4, stride 20 (inner size 5 * inner stride 4)
+  loops.push_back(Loop{/*dim_in_a=*/0, /*tile_interior=*/false, /*dim_size=*/4,
+                       /*tile_size=*/1, /*lda=*/20, /*ldb=*/400});
+  // Inner: size 5, stride 4
+  loops.push_back(Loop{/*dim_in_a=*/1, /*tile_interior=*/false, /*dim_size=*/5,
+                       /*tile_size=*/1, /*lda=*/4, /*ldb=*/80});
+
+  TestTransposePlan::CoalesceLoops(loops);
+
+  ASSERT_EQ(loops.size(), 1);
+  EXPECT_EQ(loops[0].dim_size, 20);
+  EXPECT_EQ(loops[0].tile_size, 1);
+  EXPECT_EQ(loops[0].lda, 4);
+  EXPECT_EQ(loops[0].ldb, 80);
+
+  // Case 2: Incompatible strides
+  loops.clear();
+  loops.push_back(Loop{/*dim_in_a=*/0, /*tile_interior=*/false, /*dim_size=*/4,
+                       /*tile_size=*/1, /*lda=*/21,
+                       /*ldb=*/400});  // lda mismatch
+  loops.push_back(Loop{/*dim_in_a=*/1, /*tile_interior=*/false, /*dim_size=*/5,
+                       /*tile_size=*/1, /*lda=*/4, /*ldb=*/80});
+
+  TestTransposePlan::CoalesceLoops(loops);
+  EXPECT_EQ(loops.size(), 2);
+
+  // Case 3: Compatible tiled interior
+  loops.clear();
+  // Outer interior: tile_size 4, lda 16
+  loops.push_back(Loop{/*dim_in_a=*/0, /*tile_interior=*/true, /*dim_size=*/100,
+                       /*tile_size=*/4, /*lda=*/16, /*ldb=*/320});
+  // Inner interior: tile_size 4, lda 4
+  loops.push_back(Loop{/*dim_in_a=*/1, /*tile_interior=*/true, /*dim_size=*/100,
+                       /*tile_size=*/4, /*lda=*/4, /*ldb=*/80});
+
+  TestTransposePlan::CoalesceLoops(loops);
+  ASSERT_EQ(loops.size(), 1);
+  EXPECT_EQ(loops[0].tile_size, 16);
+  EXPECT_EQ(loops[0].tile_interior, true);
+
+  // Case 4: Mismatched tile_interior status (should not coalesce)
+  loops.clear();
+  loops.push_back(Loop{/*dim_in_a=*/0, /*tile_interior=*/false, /*dim_size=*/4,
+                       /*tile_size=*/1, /*lda=*/20, /*ldb=*/400});
+  loops.push_back(Loop{/*dim_in_a=*/1, /*tile_interior=*/true, /*dim_size=*/5,
+                       /*tile_size=*/5, /*lda=*/4, /*ldb=*/80});
+
+  TestTransposePlan::CoalesceLoops(loops);
+  EXPECT_EQ(loops.size(), 2);
 }
 
 TEST(TransposeTest, InvalidTilings) {

From a59ffc09ddd3bc77cf3dc669d92b691317b67222 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 07:23:17 -0800
Subject: [PATCH 529/753] Reverts 408bf09796590bc66233afff288bf926e2736a9d

PiperOrigin-RevId: 846257722
---
 .../codegen/triton/triton_gemm_fusion_test.cc | 21 ++---
 .../xla/xla/service/gpu/gpu_compiler.cc       |  3 +
 .../xla/xla/service/gpu/gpu_compiler_test.cc  | 28 +++++++
 .../gpu_compiler_test_autotune_db.textproto   |  4 +-
 .../xla/xla/service/gpu/gpu_fusible.cc        | 19 +----
 .../xla/xla/service/gpu/gpu_fusible_test.cc   | 44 +++-------
 .../xla/xla/service/gpu/ir_emission_utils.cc  | 42 +++++-----
 .../xla/service/gpu/ir_emission_utils_test.cc | 81 +++++--------------
 .../gpu/model/coalescing_analysis_test.cc     | 18 ++---
 .../xla/xla/service/gpu/transforms/BUILD      |  2 +-
 .../transforms/cudnn_norm_rewriter_test.cc    | 30 ++++---
 .../transforms/fusion_block_level_rewriter.cc | 27 +++----
 .../fusion_block_level_rewriter_test.cc       | 32 --------
 .../gpu/transforms/layout_assignment_a100.hlo |  7 +-
 .../gpu/transforms/layout_assignment_h100.hlo |  7 +-
 .../gpu/transforms/layout_assignment_v100.hlo |  7 +-
 third_party/xla/xla/shape_util.cc             |  1 -
 third_party/xla/xla/shape_util_test.cc        | 15 +---
 18 files changed, 141 insertions(+), 247 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
index 127b0d60a72832..52e84704781010 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/triton_gemm_fusion_test.cc
@@ -562,10 +562,7 @@ ENTRY e {
                                ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 
-// TODO: b/422676780 - Enable the tests once the indexing maps-based tiling is
-// deprecated. The test is disabled after we remove TransposeDimensionGrouper
-// pass, because the infra currently requires grouping of adjacent dimensions.
-TEST_F(TritonGemmTest, DISABLED_SplitLhsNoncontractingTransposeRhs) {
+TEST_F(TritonGemmTest, SplitLhsNoncontractingTransposeRhs) {
   constexpr absl::string_view kHloText = R"(
 HloModule t
 
@@ -590,10 +587,7 @@ ENTRY e {
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/0, /*arel=*/0}));
 }
 
-// TODO: b/422676780 - Enable the tests once the indexing maps-based tiling is
-// deprecated. The test is disabled after we remove TransposeDimensionGrouper
-// pass, because the infra currently requires grouping of adjacent dimensions.
-TEST_F(TritonGemmTest, DISABLED_SplitLhsNoncontracting) {
+TEST_F(TritonGemmTest, SplitLhsNoncontracting) {
   constexpr absl::string_view kHloText = R"(
 ENTRY e {
   p0 = f32[72,72] parameter(0)
@@ -1782,17 +1776,12 @@ ENTRY e {
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           GetOptimizedModule(kHloText));
-  const HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_THAT(
-      root,
-      GmockMatch(
+      module->entry_computation()->root_instruction(),
+      GmockMatch(m::Bitcast(
           m::Fusion(m::Fusion(m::Parameter(), m::Parameter())
                         .WithFusionKind(HloInstruction::FusionKind::kCustom))
-              .WithFusionKind(HloInstruction::FusionKind::kInput)));
-
-  const HloFusionInstruction* root_fusion = Cast<HloFusionInstruction>(root);
-  EXPECT_EQ(root_fusion->fused_expression_root()->opcode(),
-            HloOpcode::kTranspose);
+              .WithFusionKind(HloInstruction::FusionKind::kInput))));
 
   EXPECT_TRUE(RunAndCompare(kHloText, ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 45185f41fab4d0..36471b34d7a2ac 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -1751,6 +1751,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
       // introduced the nested fusions. We also want to keep it close to the
       // gemm rewriter to avoid the possibility of new passes to rewrite the
       // transpose.
+      pipeline.AddPass<TransposeDimensionGrouper>();
       pipeline.AddPass<GemmFusion>(gpu_version);
       pipeline.AddPass<GemmFusionSwapOperands>();
     } else if (cuda_cc != nullptr &&
@@ -1778,6 +1779,8 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     // also have unsorted update_window_dims.
     pipeline.AddPass<ScatterSimplifier>();
     pipeline.AddPass<BroadcastCanonicalizer>();
+    // BroadcastCanonicalizer can create transposes.
+    pipeline.AddPass<TransposeDimensionGrouper>();
     pipeline.AddPass<ReductionDegenerateDimRemover>();
     pipeline.AddPass<ReductionLayoutNormalizer>();
     // Run Softmax fusion after layout normalization. We expect a default layout
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 33abfb08f9faec..855267bd792c9a 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -1855,6 +1855,34 @@ TEST_F(PassOrderTest, NestGemmFusionRunsAfterHoistFusedBitcasts) {
   VerifyPassOrder("hoist-fused-bitcasts", "nest_gemm_fusion");
 }
 
+TEST_F(PassOrderTest, TransposeDimensionGrouperRunsBeforeGemmRewriter) {
+  if (!get_cuda_cc().IsAtLeastAmpere()) {
+    GTEST_SKIP() << "triton-gemm-rewriter requires at least Ampere to run.";
+  }
+  if (!optimized_module_) {
+    CompileModule(GetModuleConfigForTest());
+  }
+  // DebugOptions options = GetDebugOptionsForTest();
+  // options.set_xla_gpu_enable_triton_gemm(true);
+  // SetDebugOptions(options);
+  // Verify that transpose-dimension-grouper runs immediately before
+  // triton-gemm-rewriter. We want to keep them close together to avoid the
+  // possibility of new passes to rewrite the transpose and make it
+  // not compatible with the generic triton emitter.
+  // Simple VerifyPassOrder does not work here as we want to check that passes
+  // are run next to each other, also transpose-dimension-grouper runs one more
+  // time after the gemm rewriter.
+  CHECK(optimized_module_);
+  std::string previous_pass_name;
+  for (const HloPassMetadata& pass_metadata :
+       optimized_module_->metadata().proto().pass_metadata()) {
+    if (pass_metadata.pass_name() == "triton-gemm-rewriter") {
+      EXPECT_EQ(previous_pass_name, "transpose-dimension-grouper");
+    }
+    previous_pass_name = pass_metadata.pass_name();
+  }
+}
+
 TEST_F(PassOrderTest,
        ReducePrecisionIsRemovedAfterAllCallsToSimplifyFPConversions) {
   // Because of an issue with JAX remat and `SimplifyFPConversions` (see PR:
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto b/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
index e5b7f65cab6f69..67d67b9594af5f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test_autotune_db.textproto
@@ -63,7 +63,7 @@ results {
 }
 results {
   device: "CUDA: 9.0, Cores: 132, GPU clock: 1.98 GHz, Memory bandwidth: 3352 GB/s, L2 cache: 50 MB, DNN version: 1.2.3"
-  hlo: "{\n  tmp_0 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[] constant({...})\n  tmp_2 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_1), dimensions={}\n  tmp_3 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_0, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_2)\n  tmp_4 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_3)\n  tmp_5 = bf16[4,32,1024,1024]{3,2,1,0} transpose(bf16[4,32,1024,1024]{3,2,1,0} tmp_4), dimensions={0,1,3,2}\n  tmp_6 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[4,32,1024,1024]{3,2,1,0} tmp_5)\n  tmp_7 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(1)\n  tmp_8 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_7)\n  tmp_9 = bf16[128,1024,1024]{2,1,0} dot(bf16[128,1024,1024]{2,1,0} tmp_6, bf16[128,1024,1024]{2,1,0} tmp_8), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}\n  ROOT tmp_10 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[128,1024,1024]{2,1,0} tmp_9)\n}"
+  hlo: "{\n  tmp_0 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[] constant({...})\n  tmp_2 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_1), dimensions={}\n  tmp_3 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_0, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_2)\n  tmp_4 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_3)\n  tmp_5 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[4,32,1024,1024]{3,2,1,0} tmp_4)\n  tmp_6 = bf16[128,1024,1024]{2,1,0} transpose(bf16[128,1024,1024]{2,1,0} tmp_5), dimensions={0,2,1}\n  tmp_7 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[128,1024,1024]{2,1,0} tmp_6)\n  tmp_8 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[4,32,1024,1024]{3,2,1,0} tmp_7)\n  tmp_9 = bf16[1,4,32,1024,1024]{4,3,2,1,0} parameter(1)\n  tmp_10 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_9)\n  tmp_11 = bf16[128,1024,1024]{2,1,0} dot(bf16[128,1024,1024]{2,1,0} tmp_8, bf16[128,1024,1024]{2,1,0} tmp_10), lhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_batch_dims={0}, rhs_contracting_dims={1}\n  ROOT tmp_12 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[128,1024,1024]{2,1,0} tmp_11)\n}"
   result {
     gemm {
       algorithm: -1
@@ -183,7 +183,7 @@ results {
 }
 results {
   device: "CUDA: 9.0, Cores: 132, GPU clock: 1.98 GHz, Memory bandwidth: 3352 GB/s, L2 cache: 50 MB, DNN version: 1.2.3"
-  hlo: "{\n  tmp_0 = bf16[3,32,1024,4,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[3,4,32,1024,1024]{4,3,2,1,0} transpose(bf16[3,32,1024,4,1024]{4,3,2,1,0} tmp_0), dimensions={0,3,1,2,4}\n  tmp_2 = bf16[1,3,32,1024]{3,2,1,0} parameter(1)\n  tmp_3 = bf16[3,32,1024]{2,1,0} bitcast(bf16[1,3,32,1024]{3,2,1,0} tmp_2)\n  tmp_4 = bf16[3,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[3,32,1024]{2,1,0} tmp_3), dimensions={0,2,3}\n  tmp_5 = bf16[3,4,32,1024,1024]{4,3,2,1,0} add(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_1, bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_4)\n  tmp_6 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_5), slice={[1:2], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_7 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_5), slice={[0:1], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_8 = bf16[] constant({...})\n  tmp_9 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_8), dimensions={}\n  tmp_10 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_7, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_9)\n  tmp_11 = bf16[4,32,1024,1024]{3,2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_10)\n  tmp_12 = bf16[4,32,1024,1024]{3,2,1,0} transpose(bf16[4,32,1024,1024]{3,2,1,0} tmp_11), dimensions={0,1,3,2}\n  ROOT tmp_13 = (bf16[1,4,32,1024,1024]{4,3,2,1,0}, bf16[4,32,1024,1024]{3,2,1,0}) tuple(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_6, bf16[4,32,1024,1024]{3,2,1,0} tmp_12)\n}"
+  hlo: "{\n  tmp_0 = bf16[3,32,1024,4,1024]{4,3,2,1,0} parameter(0)\n  tmp_1 = bf16[3,32768,4,1024]{3,2,1,0} bitcast(bf16[3,32,1024,4,1024]{4,3,2,1,0} tmp_0)\n  tmp_2 = bf16[3,4,32768,1024]{3,2,1,0} transpose(bf16[3,32768,4,1024]{3,2,1,0} tmp_1), dimensions={0,2,1,3}\n  tmp_3 = bf16[3,4,32,1024,1024]{4,3,2,1,0} bitcast(bf16[3,4,32768,1024]{3,2,1,0} tmp_2)\n  tmp_4 = bf16[1,3,32,1024]{3,2,1,0} parameter(1)\n  tmp_5 = bf16[3,32,1024]{2,1,0} bitcast(bf16[1,3,32,1024]{3,2,1,0} tmp_4)\n  tmp_6 = bf16[3,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[3,32,1024]{2,1,0} tmp_5), dimensions={0,2,3}\n  tmp_7 = bf16[3,4,32,1024,1024]{4,3,2,1,0} add(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_3, bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_6)\n  tmp_8 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_7), slice={[1:2], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_9 = bf16[1,4,32,1024,1024]{4,3,2,1,0} slice(bf16[3,4,32,1024,1024]{4,3,2,1,0} tmp_7), slice={[0:1], [0:4], [0:32], [0:1024], [0:1024]}\n  tmp_10 = bf16[] constant({...})\n  tmp_11 = bf16[1,4,32,1024,1024]{4,3,2,1,0} broadcast(bf16[] tmp_10), dimensions={}\n  tmp_12 = bf16[1,4,32,1024,1024]{4,3,2,1,0} multiply(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_9, bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_11)\n  tmp_13 = bf16[128,1024,1024]{2,1,0} bitcast(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_12)\n  tmp_14 = bf16[128,1024,1024]{2,1,0} transpose(bf16[128,1024,1024]{2,1,0} tmp_13), dimensions={0,2,1}\n  ROOT tmp_15 = (bf16[1,4,32,1024,1024]{4,3,2,1,0}, bf16[128,1024,1024]{2,1,0}) tuple(bf16[1,4,32,1024,1024]{4,3,2,1,0} tmp_8, bf16[128,1024,1024]{2,1,0} tmp_14)\n}"
   result {
     other {
       name: "NativeEmitter"
diff --git a/third_party/xla/xla/service/gpu/gpu_fusible.cc b/third_party/xla/xla/service/gpu/gpu_fusible.cc
index eab17134475244..5f665644c5719e 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible.cc
+++ b/third_party/xla/xla/service/gpu/gpu_fusible.cc
@@ -64,22 +64,9 @@ bool ContainsTransposeWithSmallMostMinorDim(const HloFusionAdaptor& fusion,
       return false;
     }
     const HloInstruction& transpose = instr.instruction();
-    // The kLoop emitter operates on the original transpose, but it handles the
-    // index calculation. The critical factor for performance (coalescing) is
-    // the size of the contiguous memory block being accessed in the minor
-    // dimension. Normalization reveals this true physical dimension size by
-    // merging adjacent logical dimensions. If this normalized dimension is
-    // large enough, the unrolled accesses will be coalesced, justifying the
-    // unroll factor.
-    absl::InlinedVector<int64_t, 3> permutation;
-    auto normalized_dims_or = ShapeUtil::GetNormalizedLogicalTransposeShape(
-        transpose.operand(0)->shape(), transpose.shape(),
-        transpose.dimensions(), permutation);
-    if (normalized_dims_or.ok()) {
-      return normalized_dims_or.value().back() < unroll_factor;
-    } else {
-      return transpose.shape().dimensions().back() < unroll_factor;
-    }
+    // We can assume that TransposeDimensionGrouper pass has run, so no need
+    // to try to combine adjacent dimensions.
+    return transpose.shape().dimensions().back() < unroll_factor;
   });
 }
 
diff --git a/third_party/xla/xla/service/gpu/gpu_fusible_test.cc b/third_party/xla/xla/service/gpu/gpu_fusible_test.cc
index b19b554d0f4a3a..d1ad49adfd1278 100644
--- a/third_party/xla/xla/service/gpu/gpu_fusible_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_fusible_test.cc
@@ -603,7 +603,9 @@ TEST_F(GpuFusibleTest, FusionHeroesAreCompatible_TransposeFusionNotCompatible) {
     fused_computation_1 {
       p0.1 = f32[64,32]{1,0} parameter(0)
       neg = f32[64,32]{1,0} negate(p0.1)
-      ROOT transpose = f32[32,64]{1,0} transpose(neg), dimensions={1,0}
+      bc = f32[1,64,32]{2,1,0} bitcast(neg)
+      transpose = f32[1,32,64]{2,1,0} transpose(bc), dimensions={0,2,1}
+      ROOT bc2 = f32[32,64]{1,0} bitcast(transpose)
     }
 
     fused_computation_2 {
@@ -621,10 +623,12 @@ TEST_F(GpuFusibleTest, FusionHeroesAreCompatible_TransposeFusionNotCompatible) {
   const HloInstruction* fusion_1 =
       module->entry_computation()->root_instruction();
   const HloInstruction* fusion_2 = fusion_1->operand(0);
-  EXPECT_FALSE(FusionHeroesAreCompatible(fusion_1->fused_expression_root(),
-                                         fusion_2->fused_expression_root()));
-  EXPECT_FALSE(FusionHeroesAreCompatible(fusion_2->fused_expression_root(),
-                                         fusion_1->fused_expression_root()));
+  EXPECT_FALSE(
+      FusionHeroesAreCompatible(fusion_1->fused_expression_root(),
+                                fusion_2->fused_expression_root()->operand(0)));
+  EXPECT_FALSE(
+      FusionHeroesAreCompatible(fusion_2->fused_expression_root()->operand(0),
+                                fusion_1->fused_expression_root()));
 }
 
 TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_LoopFusions) {
@@ -1306,9 +1310,9 @@ TEST_F(GpuFusibleTest, ChooseFusionKind) {
 HloModule module
 
 ENTRY computation {
-    p = f32[5000,6000]{1,0} parameter(0)
-    c = f32[6000,5000] transpose(p), dimensions={1,0}
-    ROOT r = f32[300,20,5000] reshape(c)
+    p = f32[1,5000,6000]{2,1,0} parameter(0)
+    c = f32[1,6000,5000]{2,1,0} transpose(p), dimensions={0,2,1}
+    ROOT r = f32[300,20,5000]{2,1,0} reshape(c)
 }
 )")
                     .value();
@@ -1798,30 +1802,6 @@ ENTRY main {
   EXPECT_EQ(config.unroll_factor, 8);
 }
 
-TEST_F(GpuFusibleTest,
-       ComputeLoopFusionConfigForLoopTransposeEffectiveLargerMinorDim) {
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(R"(
-HloModule m
-
-ENTRY main {
-  p0 = f16[256,2048,4,2]{3,2,1,0} parameter(0)
-  ROOT res = f16[2048,256,4,2]{3,2,1,0} transpose(p0), dimensions={1,0,2,3}
-}
-)"));
-  const HloInstruction* root = module->entry_computation()->root_instruction();
-  se::DeviceDescription device_info_h100{
-      TestGpuDeviceInfo::RTXH100SXMDeviceInfo()};
-  auto analysis = HloFusionAnalysis::Create(*root, device_info_h100);
-  auto config = ComputeLoopFusionConfig(analysis, root->shape());
-  EXPECT_EQ(config.unroll_factor, 4);
-
-  se::DeviceDescription device_info_b200{
-      TestGpuDeviceInfo::RTXB200SXMDeviceInfo()};
-  analysis = HloFusionAnalysis::Create(*root, device_info_b200);
-  config = ComputeLoopFusionConfig(analysis, root->shape());
-  EXPECT_EQ(config.unroll_factor, 8);
-}
-
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils.cc b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
index 6eca640b573d94..72c74c7b8ea8d2 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils.cc
@@ -245,23 +245,20 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
     return std::nullopt;
   }
 
-  absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_dims_or = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      hero.operand(0)->shape(), hero.shape(), hero.dimensions(), permutation);
-  if (!normalized_dims_or.ok()) {
-    return std::nullopt;
-  }
-  auto normalized_dims = normalized_dims_or.value();
-  auto normalized_operand_dims =
-      Permute(normalized_dims, InversePermutation(permutation));
+  // We can assume that TransposeDimensionGrouper pass has run, so no need to
+  // call GetNormalizedLogicalTransposeShape here.
+  absl::InlinedVector<int64_t, 3> permutation(hero.dimensions().begin(),
+                                              hero.dimensions().end());
   // A real transpose needs at least 2 transpose dimensions.
   if (permutation.size() < 2) {
     return std::nullopt;
   }
   auto bit_width = GetBitwidth(hero.shape().element_type());
-  int64_t operand_most_minor_dim = normalized_operand_dims.back();
+  absl::InlinedVector<int64_t, 3> dimensions(hero.shape().dimensions().begin(),
+                                             hero.shape().dimensions().end());
+  int64_t operand_most_minor_dim = hero.operand(0)->shape().dimensions().back();
 
-  TransposeDescription desc{&hero, normalized_dims, permutation,
+  TransposeDescription desc{&hero, dimensions, permutation,
                             /*shmem_usage=*/0};
   if (CanEmitPackedTranspose(desc)) {
     int64_t vector_size =
@@ -270,28 +267,27 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
         kNumShmemBanks * (kBankBitwidth / 8) * kNumShmemBanks * vector_size;
     return desc;
   }
-  // Minor dimension is preserved.
-  if (permutation.back() == normalized_dims.size() - 1) {
+  if (permutation.back() == dimensions.size() - 1) {
     operand_most_minor_dim =
-        normalized_operand_dims[normalized_dims.size() - 2];
-    if (bit_width * normalized_dims.back() <= kMaxBitsInMostMinorDimension &&
-        bit_width * normalized_dims.back() *
+        hero.operand(0)->shape().dimensions(dimensions.size() - 2);
+    if (bit_width * dimensions.back() <= kMaxBitsInMostMinorDimension &&
+        bit_width * dimensions.back() *
                 std::min(operand_most_minor_dim,
-                         normalized_dims[normalized_dims.size() - 2]) >=
+                         dimensions[dimensions.size() - 2]) >=
             8 * kMinDimensionToTransposeTiled) {
       // Tile size for transposition.
       int64_t shmem_usage_bytes =
           CeilOfRatio(kNumShmemBanks * (kNumShmemBanks + 1LL) * bit_width *
-                          normalized_dims.back(),
+                          dimensions.back(),
                       8LL);
-      return TransposeDescription{&hero, normalized_dims, permutation,
+      return TransposeDescription{&hero, dimensions, permutation,
                                   shmem_usage_bytes};
     }
   } else if ((operand_most_minor_dim >= kMinDimensionToTransposeTiled &&
-              normalized_dims.back() >= kMinDimensionToTransposeTiled) ||
+              dimensions.back() >= kMinDimensionToTransposeTiled) ||
              (operand_most_minor_dim >= kMinDimensionToTransposeTiled2 &&
-              normalized_dims.back() >= kMinDimensionToTransposeTiled2 &&
-              operand_most_minor_dim * normalized_dims.back() >=
+              dimensions.back() >= kMinDimensionToTransposeTiled2 &&
+              operand_most_minor_dim * dimensions.back() >=
                   kMinTotalDimensionsToTransposeTiled)) {
     // TODO(b/415741994): TransposeEmitter is regressing for S4 when the last
     // dimension is being transposed. The issue seems to be related to bank
@@ -301,7 +297,7 @@ std::optional<TransposeDescription> GetDescriptionForTiledTransposeEmitter(
     }
     int64_t shmem_usage_bytes =
         CeilOfRatio(kNumShmemBanks * (kNumShmemBanks + 1LL) * bit_width, 8LL);
-    return TransposeDescription{&hero, normalized_dims, permutation,
+    return TransposeDescription{&hero, dimensions, permutation,
                                 shmem_usage_bytes};
   }
   return std::nullopt;
diff --git a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
index dea30196de473e..f14ff91e330107 100644
--- a/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
+++ b/third_party/xla/xla/service/gpu/ir_emission_utils_test.cc
@@ -82,13 +82,12 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogicalTranspose) {
 HloModule module
 
 ENTRY entry {
-  p = f32[32,48,64]{2,1,0} parameter(0)
-  ROOT t = f32[64,32,48]{2,1,0} transpose(p), dimensions={2,0,1}
+  p = f32[1536,64]{1,0} parameter(0)
+  ROOT t = f32[64,1536]{1,0} transpose(p), dimensions={1,0}
 }
 )";
-  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                       ParseAndReturnVerifiedModule(hlo));
-
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo));
   HloInstruction* tr = module->entry_computation()->root_instruction();
 
   auto result = GetDescriptionForTiledTransposeEmitter(*tr);
@@ -103,12 +102,12 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogical102Transpose) {
 HloModule module
 
 ENTRY entry {
-  p = f32[32,48,1,2]{3,2,1,0} parameter(0)
-  ROOT t = f32[48,32,1,2]{3,2,1,0} transpose(p), dimensions={1,0,2,3}
+  p = f32[32,48,2]{2,1,0} parameter(0)
+  ROOT t = f32[48,32,2]{2,1,0} transpose(p), dimensions={1,0,2}
 }
 )";
-  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                       ParseAndReturnVerifiedModule(hlo));
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo));
   HloInstruction* tr = module->entry_computation()->root_instruction();
 
   auto result = GetDescriptionForTiledTransposeEmitter(*tr);
@@ -413,8 +412,10 @@ fusion {
   p = f32[32,48,64]{2,1,0} parameter(0)
   p2 = f32[48,32,64]{2,1,0} parameter(1)
   t = f32[64,48,32]{2,1,0} transpose(p), dimensions={2,1,0}
-  t2 = f32[64,48,32]{2,1,0} transpose(p2), dimensions={2,0,1}
-  ROOT add = f32[64,48,32]{2,1,0} add(t, t2)
+  bc = f32[1,1536,64]{2,1,0} bitcast(p2)
+  t2 = f32[1,64,1536]{2,1,0} transpose(bc), dimensions={0,2,1}
+  bc2 = f32[64,48,32]{2,1,0} bitcast(t2)
+  ROOT add = f32[64,48,32]{2,1,0} add(t, bc2)
 }
 
 ENTRY main {
@@ -433,26 +434,6 @@ ENTRY main {
   EXPECT_EQ(&FindNonTrivialHero(*r), r);
 }
 
-TEST_F(IrEmissionUtilsTest, FindTiledLogicalTransposeWithGrouping) {
-  const char* hlo = R"(
-HloModule module
-
-ENTRY entry {
-  p = f32[32,32,64]{2,1,0} parameter(0)
-  ROOT t = f32[64,32,32]{2,1,0} transpose(p), dimensions={2,0,1}
-}
-)";
-  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                       ParseAndReturnVerifiedModule(hlo));
-  HloInstruction* tr = module->entry_computation()->root_instruction();
-
-  auto result = GetDescriptionForTiledTransposeEmitter(*tr);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result->instr, tr);
-  EXPECT_EQ(result->dimensions, InlinedVector({64, 1024}));
-  EXPECT_EQ(result->permutation, InlinedVector({1, 0}));
-}
-
 TEST_F(IrEmissionUtilsTest, FindNonTrivialHeroOutsideFusion) {
   const char* hlo = R"(
 HloModule module
@@ -552,13 +533,13 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogicalTransposeOneSwapDimIsSmall) {
 HloModule module
 
 fusion {
-  p = f32[100,11,12,8]{3,2,1,0} parameter(0)
-  ROOT t = f32[8,12,100,11]{3,2,1,0} transpose(p), dimensions={3,2,0,1}
+  p = f32[1100,12,8]{2,1,0} parameter(0)
+  ROOT t = f32[8,12,1100]{2,1,0} transpose(p), dimensions={2,1,0}
 }
 
 ENTRY main {
-  param = f32[100,11,12,8]{3,2,1,0} parameter(0)
-  ROOT fusion = f32[8,12,100,11]{3,2,1,0} fusion(param), kind=kInput, calls=fusion
+  param = f32[1100,12,8]{2,1,0} parameter(0)
+  ROOT fusion = f32[8,12,1100]{2,1,0} fusion(param), kind=kInput, calls=fusion
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -578,13 +559,13 @@ TEST_F(IrEmissionUtilsTest, FindTiledLogicalTransposeOtherSwapDimIsSmall) {
 HloModule module
 
 fusion {
-  p = f32[8,12,100,11]{3,2,1,0} parameter(0)
-  ROOT t = f32[100,11,12,8]{3,2,1,0} transpose(p), dimensions={2,3,1,0}
+  p = f32[8,12,1100]{2,1,0} parameter(0)
+  ROOT t = f32[1100,12,8]{2,1,0} transpose(p), dimensions={2,1,0}
 }
 
 ENTRY main {
-  param = f32[8,12,100,11]{3,2,1,0} parameter(0)
-  ROOT fusion = f32[100,11,12,8]{3,2,1,0} fusion(param), kind=kInput, calls=fusion
+  param = f32[8,12,1100]{2,1,0} parameter(0)
+  ROOT fusion = f32[1100,12,8]{2,1,0} fusion(param), kind=kInput, calls=fusion
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -599,28 +580,6 @@ ENTRY main {
   EXPECT_EQ(result->permutation, InlinedVector({2, 1, 0}));
 }
 
-TEST_F(IrEmissionUtilsTest,
-       FindTiledLogicalTransposeWithSize1DimensionInRawShape) {
-  const char* hlo = R"(
-HloModule module
-
-ENTRY entry {
-  p = f32[32,1,16,2]{3,2,1,0} parameter(0)
-  ROOT t = f32[16,1,32,2]{3,2,1,0} transpose(p), dimensions={2,1,0,3}
-}
-)";
-  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                       ParseAndReturnVerifiedModule(hlo));
-
-  HloInstruction* tr = module->entry_computation()->root_instruction();
-
-  auto result = GetDescriptionForTiledTransposeEmitter(*tr);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result->instr, tr);
-  EXPECT_EQ(result->dimensions, InlinedVector({16, 32, 2}));
-  EXPECT_EQ(result->permutation, InlinedVector({1, 0, 2}));
-}
-
 TEST_F(IrEmissionUtilsTest, IsContiguousSlice) {
   const char* hlo = R"(
 HloModule module
diff --git a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
index 098b8f94fd04c0..6d81b8c935ec17 100644
--- a/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
+++ b/third_party/xla/xla/service/gpu/model/coalescing_analysis_test.cc
@@ -177,13 +177,13 @@ TEST_F(CoalescingTest, Transpose) {
     HloModule module
 
     fusion {
-      %input = f32[100, 64, 32] parameter(0)
-      ROOT transpose = f32[32, 100, 64] transpose(%input), dimensions={2, 0, 1}
+      %input = f32[1, 6400, 32] parameter(0)
+      ROOT transpose = f32[1, 32, 6400] transpose(%input), dimensions={0, 2, 1}
     }
 
     ENTRY entry {
-      %input = f32[100, 64, 32] parameter(0)
-      ROOT %fusion = f32[32, 100, 64] fusion(%input), kind=kLoop, calls=fusion
+      %input = f32[1, 6400, 32] parameter(0)
+      ROOT %fusion = f32[1, 32, 6400] fusion(%input), kind=kLoop, calls=fusion
   })";
   // thread_x to linearized input mapping for thread_x in [0, 31]:
   // Operand 1:  (thread_x)[s0] -> (thread_x + s0 * 128) for s0 in [0, 7]
@@ -258,15 +258,15 @@ TEST_F(CoalescingTest, TransposeOfBroadcastHeuristic) {
     HloModule module
 
     fusion {
-      input = f32[32, 100, 64] parameter(0)
-      ROOT slice = f32[32, 100, 1] slice(input), slice={[0:32:1], [0:100:1], [0:1:1]}
+      input = f32[1, 32, 6400] parameter(0)
+      ROOT slice = f32[1, 32, 100] slice(input), slice={[0:1:1], [0:32:1], [0:6400:64]}
     }
 
     ENTRY entry {
       p0 = f32[32] parameter(0)
-      broadcast = f32[100, 64, 32] broadcast(p0), dimensions={2}
-      transpose = f32[32, 100, 64] transpose(broadcast), dimensions={2, 0, 1}
-      ROOT %fusion = f32[32, 100, 1] fusion(transpose), kind=kLoop, calls=fusion
+      broadcast = f32[1, 6400, 32] broadcast(p0), dimensions={2}
+      transpose = f32[1, 32, 6400] transpose(broadcast), dimensions={0, 2, 1}
+      ROOT %fusion = f32[1, 32, 100] fusion(transpose), kind=kLoop, calls=fusion
   })";
   EXPECT_TRUE(IsReadCoalescedHeuristic(ir));
 }
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 08a13b2bb05b9b..0b702ad9af2acb 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -210,11 +210,11 @@ cc_library(
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
     ],
diff --git a/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
index 2073fc9f90b858..24a469ff6c3d8f 100644
--- a/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/cudnn_norm_rewriter_test.cc
@@ -287,7 +287,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[6], {{.*}}: f32[6]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,4,8,6]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[8,8,6]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,6,1,1]{3,2,1,0} bitcast([[P1]])
@@ -299,7 +299,8 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[8,6,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} bitcast([[FUSION]])
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -347,7 +348,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2Degenerate1) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,1,6,8], {{.*}}: f32[6], {{.*}}: f32[6]) -> f32[2,1,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,1,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,6]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,6]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,6,1,1]{3,2,1,0} bitcast([[P1]])
@@ -359,7 +360,8 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D2Degenerate1) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,6,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,1,6,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[2,6,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,1,6,8]{3,2,1,0} bitcast([[FUSION]])
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -407,7 +409,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6]) -> f32[2,4,6,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,24]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,6]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,6,1]{3,2,1,0} bitcast([[P1]])
@@ -419,7 +421,8 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[2,24,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,6,8]{3,2,1,0} bitcast([[FUSION]])
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -467,7 +470,7 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12Degenerate2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1]) -> f32[2,4,1,8] {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,1,1]{3,2,1,0} bitcast([[P1]])
@@ -479,7 +482,8 @@ TEST_F(CudnnNormRewriterTest, LayerNorm4D12Degenerate2) {
 ; CHECK-DAG:         "epsilon":0.001
 ; CHECK:           }
 ; CHECK-NEXT:    [[GTE:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} get-tuple-element([[CC]]), index=0
-; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,1,8]{3,2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:    [[FUSION:%[^ ]+]] = f32[2,4,8]{2,1,0} fusion([[GTE]]), kind={{.*}}, calls=[[FUSED_COMPUTATION:%[^ ]+]]
+; CHECK-NEXT:  ROOT {{.*}} = f32[2,4,1,8]{3,2,1,0} bitcast([[FUSION]])
   )";
 
   TestNorm(hlo_text, optimized_hlo);
@@ -821,7 +825,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6]) -> (f32[2,4,6,8], f32[2,8], f32[2,8], f32[2,8]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,24]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,6]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,6,1]{3,2,1,0} bitcast([[P1]])
@@ -881,7 +885,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrain4D12Degenerate2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1]) -> (f32[2,4,1,8], f32[2,8], f32[2,8], f32[2,8]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE:%[^ ]+]] = f32[2,8,4]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,1,1]{3,2,1,0} bitcast([[P1]])
@@ -1177,7 +1181,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[6], {{.*}}: f32[6], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[6], f32[6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,4,8,6]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[8,8,6]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[64,6,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[6]{0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,6,1,1]{3,2,1,0} bitcast([[P1]])
@@ -1270,7 +1274,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,6,8], {{.*}}: f32[4,6], {{.*}}: f32[4,6], {{.*}}: f32[2,4,6,8]) -> (f32[2,4,6,8], f32[2,4,6,8], f32[4,6], f32[4,6]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,6,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,4,6]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,24]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,6,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,6]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,6,1]{3,2,1,0} bitcast([[P1]])
@@ -1363,7 +1367,7 @@ TEST_F(CudnnNormRewriterTest, LayerNormTrainBackward4D12Degenerate2) {
 
 ; CHECK-LABEL: ENTRY %test ({{.*}}: f32[2,4,1,8], {{.*}}: f32[4,1], {{.*}}: f32[4,1], {{.*}}: f32[2,4,1,8]) -> (f32[2,4,1,8], f32[2,4,1,8], f32[4,1], f32[4,1]) {
 ; CHECK-NEXT:    [[P0:%[^ ]+]] = f32[2,4,1,8]{3,2,1,0} parameter(0)
-; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[1,2,8,4]{3,2,1,0} fusion([[P0]])
+; CHECK-NEXT:    [[TRANSPOSE0:%[^ ]+]] = f32[2,8,4]{2,1,0} fusion([[P0]])
 ; CHECK-NEXT:    [[P0_BITCAST:%[^ ]+]] = f32[16,4,1,1]{3,2,1,0} bitcast([[TRANSPOSE0]])
 ; CHECK-NEXT:    [[P1:%[^ ]+]] = f32[4,1]{1,0} parameter(1)
 ; CHECK-NEXT:    [[P1_BITCAST:%[^ ]+]] = f32[1,4,1,1]{3,2,1,0} bitcast([[P1]])
diff --git a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc
index 06d92020417493..3f510f05645814 100644
--- a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter.cc
@@ -20,13 +20,13 @@ limitations under the License.
 #include <variant>
 
 #include "absl/container/flat_hash_set.h"
-#include "absl/container/inlined_vector.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "llvm/Support/MathExtras.h"
 #include "xla/backends/gpu/codegen/triton/support.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -44,7 +44,6 @@ limitations under the License.
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/instruction_fusion.h"
 #include "xla/service/pattern_matcher.h"
-#include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -79,28 +78,20 @@ bool ShouldRewriteLoopTransposeFusion(
   // is neither the minormost nor the second minormost dimension in the output,
   // and the output minormost dimension is swapped with the new minormost
   // dimension.
+  int64_t rank = root->shape().dimensions().size();
 
-  // We use the normalized logical transpose shape so it should be enough to
-  // check that the minormost dimension's index within the result is smaller
-  // than rank - 2, and that the new minormost dimension is swapped with it.
-  absl::InlinedVector<int64_t, 3> permutation;
-  auto normalized_dims_or = ShapeUtil::GetNormalizedLogicalTransposeShape(
-      root->operand(0)->shape(), root->shape(), root->dimensions(),
-      permutation);
-  if (!normalized_dims_or.ok()) {
-    return false;
-  }
-  auto normalized_dims = normalized_dims_or.value();
-  int64_t rank = normalized_dims.size();
-
+  // The transpose dimension grouper has run, so it should be enough to check
+  // that the minormost dimension's index within the result is smaller than
+  // rank - 2, and that the new minormost dimension is swapped with it.
   // This only triggers for transposes with major-to-minor layout.
   bool has_major_to_minor_layout =
       LayoutUtil::IsMonotonicWithDim0Major(root->shape().layout());
-  int64_t result_minormost_dim_in_operand = permutation.back();
+  absl::Span<int64_t const> transpose_dimensions = root->dimensions();
+  int64_t result_minormost_dim_in_operand = transpose_dimensions.back();
 
   if (!(has_major_to_minor_layout &&
-        permutation[result_minormost_dim_in_operand] == rank - 1 &&
-        permutation[rank - 1] < rank - 2)) {
+        transpose_dimensions[result_minormost_dim_in_operand] == rank - 1 &&
+        transpose_dimensions[rank - 1] < rank - 2)) {
     return false;
   }
 
diff --git a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc
index 8fc1960a1a628d..dfe161f86da219 100644
--- a/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/fusion_block_level_rewriter_test.cc
@@ -211,38 +211,6 @@ ENTRY entry  {
   EXPECT_TRUE(HasTritonBlockLevelFusionConfig(root));
 }
 
-TEST_F(FusionBlockLevelRewriterTest,
-       RewritesLoopTransposeFusionWithSplitDimensions) {
-  // This test checks if the rewriter can handle a transpose where dimensions
-  // are split in the HLO but logically contiguous.
-  // Logical shape: [100, 200, 300] -> [300, 200, 100] (Swap dim 0 and 2).
-  // Physical shape: [100, 200, 10, 30] -> [10, 30, 200, 100].
-  // The normalized logical transpose shape should recover the logical swap.
-  const absl::string_view hlo_text = R"(
-fusion_computation {
-  p0 = f32[100,200,10,30] parameter(0)
-  ROOT transpose = f32[10,30,200,100] transpose(p0), dimensions={2,3,1,0}
-}
-
-ENTRY entry {
-  p0 = f32[100,200,10,30] parameter(0)
-  ROOT fusion = f32[10,30,200,100] fusion(p0), kind=kLoop,
-    calls=fusion_computation
-})";
-  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                       ParseAndReturnVerifiedModule(hlo_text));
-
-  EXPECT_THAT(
-      FusionBlockLevelRewriter(device_info_, HloCostAnalysis::DefaultShapeSize,
-                               &mlir_context_)
-          .Run(module.get()),
-      absl_testing::IsOkAndHolds(true));
-  const HloInstruction* root = module->entry_computation()->root_instruction();
-  EXPECT_EQ(root->opcode(), HloOpcode::kFusion);
-  EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kCustom);
-  EXPECT_TRUE(HasTritonBlockLevelFusionConfig(root));
-}
-
 }  // namespace
 }  // namespace gpu
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
index 3b5c92a19388d2..0281e68b03e4ba 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_a100.hlo
@@ -1,9 +1,10 @@
 // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/a100_pcie_80.txtpb --split-input-file | FileCheck %s
 
-// CHECK: %wrapped_transpose_computation
+// CHECK: fused_transpose
 // CHECK-NEXT: bf16[3,3,16,32]{3,2,1,0} parameter(0)
-// CHECK-NEXT: bf16[32,3,3,16]{3,2,1,0} transpose
-// CHECK-SAME: dimensions={3,0,1,2}
+// CHECK-NEXT: bf16[144,32]{1,0} bitcast
+// CHECK-NEXT: bf16[32,144]{1,0} transpose
+// CHECK-SAME: dimensions={1,0}
 // CHECK: (bf16[1,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call
 // CHECK-SAME: window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_o01i->b01f, custom_call_target="__cudnn$convForward
 
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
index 1b82bb55c80b2b..10cc948cf6a288 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_h100.hlo
@@ -1,9 +1,10 @@
 // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/h100_sxm.txtpb --split-input-file | FileCheck %s
 
-// CHECK: %wrapped_transpose_computation
+// CHECK: fused_transpose
 // CHECK-NEXT: f8e4m3fn[3,3,16,32]{3,2,1,0} parameter(0)
-// CHECK-NEXT: f8e4m3fn[32,3,3,16]{3,2,1,0} transpose
-// CHECK-SAME: dimensions={3,0,1,2}
+// CHECK-NEXT: f8e4m3fn[144,32]{1,0} bitcast
+// CHECK-NEXT: f8e4m3fn[32,144]{1,0} transpose
+// CHECK-SAME: dimensions={1,0}
 // CHECK: (f8e4m3fn[1,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call
 // CHECK-SAME: window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_o01i->b01f, custom_call_target="__cudnn$convForward
 
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
index d5baeb8a42af7d..5ae06c318a1cf9 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_v100.hlo
@@ -1,9 +1,10 @@
 // RUN: hlo-opt %s --platform=gpu --stage=hlo --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/v100.txtpb --split-input-file | FileCheck %s
 
-// CHECK: %wrapped_transpose_computation
+// CHECK: fused_transpose
 // CHECK-NEXT: f16[3,3,16,32]{3,2,1,0} parameter(0)
-// CHECK-NEXT: f16[32,3,3,16]{3,2,1,0} transpose
-// CHECK-SAME: dimensions={3,0,1,2}
+// CHECK-NEXT: f16[144,32]{1,0} bitcast
+// CHECK-NEXT: f16[32,144]{1,0} transpose
+// CHECK-SAME: dimensions={1,0}
 // CHECK: (f16[1,64,64,32]{3,2,1,0}, u8[0]{0}) custom-call
 // CHECK-SAME: window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_o01i->b01f, custom_call_target="__cudnn$convForward
 
diff --git a/third_party/xla/xla/shape_util.cc b/third_party/xla/xla/shape_util.cc
index 5d29d0c2b76fb3..a9e0f6dee030c2 100644
--- a/third_party/xla/xla/shape_util.cc
+++ b/third_party/xla/xla/shape_util.cc
@@ -2379,7 +2379,6 @@ absl::InlinedVector<int64_t, 3> GetNormalizedTransposeShapeHelper(
       normalized_shape.dimensions().begin(),
       normalized_shape.dimensions().end());
   if (segments.size() == 1) {
-    permutation.push_back(0);
     return normalized_dims;
   }
   // Derive the permutation from the segments.
diff --git a/third_party/xla/xla/shape_util_test.cc b/third_party/xla/xla/shape_util_test.cc
index 5af96d0805c84d..8d3cedb6d1d21a 100644
--- a/third_party/xla/xla/shape_util_test.cc
+++ b/third_party/xla/xla/shape_util_test.cc
@@ -1815,20 +1815,7 @@ TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_NoTranspose) {
                            input_shape, output_shape, dimensions, permutation));
 
   EXPECT_THAT(normalized_shape, ElementsAre(8192));
-  EXPECT_THAT(permutation, ElementsAre(0));
-}
-
-TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_IdentityWithMerges) {
-  Shape output_shape = ShapeUtil::MakeShape(F32, {10, 20});
-  Shape input_shape = ShapeUtil::MakeShape(F32, {20, 10});
-  // Identity transpose that allows merging dimensions.
-  absl::InlinedVector<int64_t, 3> dimensions = {0, 1};
-  absl::InlinedVector<int64_t, 3> permutation;
-  ASSERT_OK_AND_ASSIGN(auto normalized_shape,
-                       ShapeUtil::GetNormalizedLogicalTransposeShape(
-                           input_shape, output_shape, dimensions, permutation));
-  EXPECT_THAT(normalized_shape, ElementsAre(200));
-  EXPECT_THAT(permutation, ElementsAre(0));
+  EXPECT_THAT(permutation, IsEmpty());
 }
 
 TEST(ShapeUtilTest, GetNormalizedLogicalTransposeShape_Simple2D) {

From 5a0f4aee019fbee9e2b7cb857fae91db2a02aabe Mon Sep 17 00:00:00 2001
From: Harsha H S <hsharsha@users.noreply.github.com>
Date: Thu, 18 Dec 2025 07:48:53 -0800
Subject: [PATCH 530/753] PR #35510: [ROCm] Initialze collectives to nullptr to
 force its allocation later
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35510

📝 Summary of Changes
Initialize collectives pointer to nullptr

🎯 Justification

Gpu runtime options are initialized in TF and transferred to XLA to execute thunks. Since the memory is not cleared collectives point to an uninitialized memory resulting in segfault during nccl collective initialization and operation.

🚀 Kind of Contribution
Please remove what does not apply: 🐛 Bug Fix,

Copybara import of the project:

--
2bfc6fbddbf2f9a926dd504169c56be45d2f1a0a by Harsha HS <Harsha.HavanurShamsundara@amd.com>:

[ROCm] Initialze collectives to nullptr to force its allocation later

Merging this change closes #35510

PiperOrigin-RevId: 846266642
---
 third_party/xla/xla/service/gpu/gpu_executable_run_options.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/gpu_executable_run_options.h b/third_party/xla/xla/service/gpu/gpu_executable_run_options.h
index b97dfdef2334e6..b7fa471dd05519 100644
--- a/third_party/xla/xla/service/gpu/gpu_executable_run_options.h
+++ b/third_party/xla/xla/service/gpu/gpu_executable_run_options.h
@@ -92,7 +92,7 @@ class GpuExecutableRunOptions {
   bool enable_mock_collectives_ = false;
   std::optional<DeviceIdMap> gpu_global_device_ids_;
   CliqueIdCallback clique_id_callback_;
-  GpuCollectives* collectives_;
+  GpuCollectives* collectives_ = nullptr;
   std::optional<absl::flat_hash_map<GlobalDeviceId, IncarnationId>>
       incarnations_;
 };

From 434dd85854d671c3ca94b16cf428a13d7540e9d4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 07:54:31 -0800
Subject: [PATCH 531/753] Apply llvm-use-new-mlir-op-builder fixes

This migrates `builder.create<Op>()` => `Op::create()`

PiperOrigin-RevId: 846268375
---
 .../quantization_lib/quantization_utils.h     |   4 +-
 .../mhlo/transforms/map_mhlo_to_scalar_op.h   | 303 +++++++++---------
 2 files changed, 156 insertions(+), 151 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h
index 66d307dd2fbd86..6559ad29d1f788 100644
--- a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h
+++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h
@@ -645,8 +645,8 @@ class QuantizationPattern : public RewritePattern {
             if (!matchPattern(q.getOperand(), m_Constant(&attr))) {
               continue;
             }
-            auto cst = rewriter.create<arith::ConstantOp>(
-                quantized_op->getLoc(), attr);
+            auto cst = arith::ConstantOp::create(rewriter,
+                                                 quantized_op->getLoc(), attr);
             quantizing_op->setOperand(i, cst.getResult());
           }
         }
diff --git a/third_party/xla/xla/mlir_hlo/mhlo/transforms/map_mhlo_to_scalar_op.h b/third_party/xla/xla/mlir_hlo/mhlo/transforms/map_mhlo_to_scalar_op.h
index a39b2639705942..cc5e9139c7d976 100644
--- a/third_party/xla/xla/mlir_hlo/mhlo/transforms/map_mhlo_to_scalar_op.h
+++ b/third_party/xla/xla/mlir_hlo/mhlo/transforms/map_mhlo_to_scalar_op.h
@@ -352,7 +352,7 @@ inline Value getConstantOrSplat(OpBuilder* b, Location loc, Type t,
   if (ShapedType shapedType = mlir::dyn_cast<ShapedType>(t)) {
     v = SplatElementsAttr::get(shapedType, v);
   }
-  return b->create<arith::ConstantOp>(loc, t, cast<TypedAttr>(v));
+  return arith::ConstantOp::create(*b, loc, t, cast<TypedAttr>(v));
 }
 
 template <typename PredicateType>
@@ -401,20 +401,20 @@ inline Value cmpComplex(Location loc, Value lhs, Value rhs,
   auto complexType = mlir::cast<ComplexType>(lhs.getType());
   if (mlir::isa<FloatType>(complexType.getElementType())) {
     if (comparisonDirection == ComparisonDirection::EQ) {
-      return b->create<complex::EqualOp>(loc, lhs, rhs);
+      return complex::EqualOp::create(*b, loc, lhs, rhs);
     }
     if (comparisonDirection == ComparisonDirection::NE) {
-      return b->create<complex::NotEqualOp>(loc, lhs, rhs);
+      return complex::NotEqualOp::create(*b, loc, lhs, rhs);
     }
 
     // Perform a lexicographical comparison for the (real, imaginary) pair.
     Type complexFloatTy = complexType.getElementType();
 
-    Value lhsReal = b->create<complex::ReOp>(loc, complexFloatTy, lhs);
-    Value rhsReal = b->create<complex::ReOp>(loc, complexFloatTy, rhs);
+    Value lhsReal = complex::ReOp::create(*b, loc, complexFloatTy, lhs);
+    Value rhsReal = complex::ReOp::create(*b, loc, complexFloatTy, rhs);
 
-    Value lhsImag = b->create<complex::ImOp>(loc, complexFloatTy, lhs);
-    Value rhsImag = b->create<complex::ImOp>(loc, complexFloatTy, rhs);
+    Value lhsImag = complex::ImOp::create(*b, loc, complexFloatTy, lhs);
+    Value rhsImag = complex::ImOp::create(*b, loc, complexFloatTy, rhs);
 
     auto predicate = getCmpPredicate<arith::CmpFPredicate>(comparisonDirection,
                                                            /*is_signed=*/true);
@@ -422,15 +422,15 @@ inline Value cmpComplex(Location loc, Value lhs, Value rhs,
 
     //   if (lhsReal == rhsReal && lhsImag `predicate` rhsImag ||
     //       lhsReal `predicate` rhsReal)
-    Value realsAreEq = b->create<arith::CmpFOp>(loc, arith::CmpFPredicate::OEQ,
-                                                lhsReal, rhsReal);
+    Value realsAreEq = arith::CmpFOp::create(*b, loc, arith::CmpFPredicate::OEQ,
+                                             lhsReal, rhsReal);
     Value imagsAreOrdered =
-        b->create<arith::CmpFOp>(loc, *predicate, lhsImag, rhsImag);
+        arith::CmpFOp::create(*b, loc, *predicate, lhsImag, rhsImag);
     Value realsAreOrdered =
-        b->create<arith::CmpFOp>(loc, *predicate, lhsReal, rhsReal);
+        arith::CmpFOp::create(*b, loc, *predicate, lhsReal, rhsReal);
 
-    Value orLhs = b->create<arith::AndIOp>(loc, realsAreEq, imagsAreOrdered);
-    return b->create<arith::OrIOp>(loc, orLhs, realsAreOrdered);
+    Value orLhs = arith::AndIOp::create(*b, loc, realsAreEq, imagsAreOrdered);
+    return arith::OrIOp::create(*b, loc, orLhs, realsAreOrdered);
   }
   return nullptr;
 }
@@ -459,9 +459,9 @@ inline Value mapMhloOpToStdScalarOp<mhlo::CompareOp>(
       // -NaN < -Inf < -Finite < -0 < +0 < +Finite < +Inf < +NaN
       auto intType = b->getIntegerType(floatType.getWidth());
       auto zero =
-          b->create<arith::ConstantOp>(loc, intType, b->getZeroAttr(intType));
-      auto max = b->create<arith::ConstantOp>(
-          loc, intType,
+          arith::ConstantOp::create(*b, loc, intType, b->getZeroAttr(intType));
+      auto max = arith::ConstantOp::create(
+          *b, loc, intType,
           b->getIntegerAttr(intType,
                             APInt::getSignedMaxValue(floatType.getWidth())));
       // Switch from a floating point value to a integer value in such a way
@@ -475,11 +475,11 @@ inline Value mapMhloOpToStdScalarOp<mhlo::CompareOp>(
       // obvious order, -0 is ordered before 0, and -NaN and NaN appear at the
       // beginning and end of the ordering.
       auto toIntegral = [&](Value v) {
-        auto x = b->create<arith::BitcastOp>(loc, intType, v);
+        auto x = arith::BitcastOp::create(*b, loc, intType, v);
         auto cmp =
-            b->create<arith::CmpIOp>(loc, arith::CmpIPredicate::slt, x, zero);
-        auto sub = b->create<arith::SubIOp>(loc, max, x);
-        return b->create<arith::SelectOp>(loc, cmp, sub, x);
+            arith::CmpIOp::create(*b, loc, arith::CmpIPredicate::slt, x, zero);
+        auto sub = arith::SubIOp::create(*b, loc, max, x);
+        return arith::SelectOp::create(*b, loc, cmp, sub, x);
       };
       auto lhsInt = toIntegral(lhs);
       auto rhsInt = toIntegral(rhs);
@@ -487,7 +487,7 @@ inline Value mapMhloOpToStdScalarOp<mhlo::CompareOp>(
           getCmpPredicate<arith::CmpIPredicate>(comparisonDirection,
                                                 /*is_signed=*/true);
       assert(predicate.has_value() && "expected valid comparison direction");
-      return b->create<arith::CmpIOp>(loc, *predicate, lhsInt, rhsInt);
+      return arith::CmpIOp::create(*b, loc, *predicate, lhsInt, rhsInt);
     }
     std::optional<arith::CmpFPredicate> predicate =
         getCmpPredicate<arith::CmpFPredicate>(comparisonDirection,
@@ -574,7 +574,7 @@ inline Value mapMhloOpToStdScalarOp<mhlo::MaxOp>(
   // 'max' performs a lexicographical comparison for the (real, imaginary) pair.
   Value cond = cmpComplex(loc, lhs, rhs, ComparisonDirection::GE, b);
 
-  return b->create<arith::SelectOp>(loc, cond, lhs, rhs).getResult();
+  return arith::SelectOp::create(*b, loc, cond, lhs, rhs).getResult();
 }
 
 template <>
@@ -599,7 +599,7 @@ inline Value mapMhloOpToStdScalarOp<mhlo::MinOp>(
   // 'min' performs a lexicographical comparison for the (real, imaginary) pair.
   Value cond = cmpComplex(loc, lhs, rhs, ComparisonDirection::LE, b);
 
-  return b->create<arith::SelectOp>(loc, cond, lhs, rhs).getResult();
+  return arith::SelectOp::create(*b, loc, cond, lhs, rhs).getResult();
 }
 
 template <>
@@ -619,8 +619,8 @@ inline Value mapMhloOpToStdScalarOp<mhlo::ImagOp>(
     mhlo::ImagOp::Adaptor adaptor, ArrayRef<NamedAttribute> attributes,
     OpBuilder* b) {
   if (!mlir::isa<ComplexType>(adaptor.getOperand().getType()))
-    return b->create<arith::ConstantOp>(
-        loc, b->getZeroAttr(adaptor.getOperand().getType()));
+    return arith::ConstantOp::create(
+        *b, loc, b->getZeroAttr(adaptor.getOperand().getType()));
   return MapMhloOpToScalarOpImpl<complex::ImOp>{}(
       loc, resultTypes, argTypes, adaptor.getOperands(), attributes, b);
 }
@@ -646,10 +646,12 @@ inline Value mapConvertOpToStdScalarOp(Location loc, ArrayRef<Type> targetTypes,
   if (IsUnsignedIntegerType{}(sourceType) &&
       mlir::arith::UIToFPOp::areCastCompatible(convertedSourceType,
                                                targetType)) {
-    return b->create<mlir::arith::UIToFPOp>(loc, resultTypes, args, attributes);
+    return mlir::arith::UIToFPOp::create(*b, loc, resultTypes, args,
+                                         attributes);
   }
   if (mlir::arith::SIToFPOp::areCastCompatible(sourceType, targetType)) {
-    return b->create<mlir::arith::SIToFPOp>(loc, resultTypes, args, attributes);
+    return mlir::arith::SIToFPOp::create(*b, loc, resultTypes, args,
+                                         attributes);
   }
   if (mlir::isa<FloatType>(sourceType) && mlir::isa<FloatType>(targetType)) {
     if (sourceType == targetType) {
@@ -662,30 +664,30 @@ inline Value mapConvertOpToStdScalarOp(Location loc, ArrayRef<Type> targetTypes,
       // There are no ops for conversions between floats of equal width, so we
       // go through the next-larger standard type.
       sourceType = dst.getWidth() == 8 ? b->getF16Type() : b->getF32Type();
-      src = b->create<mlir::arith::ExtFOp>(loc, sourceType, src).getResult();
+      src = mlir::arith::ExtFOp::create(*b, loc, sourceType, src).getResult();
     }
     assert(sourceType.getIntOrFloatBitWidth() != dst.getWidth());
 
     if (sourceType.getIntOrFloatBitWidth() > dst.getWidth()) {
-      return b->create<mlir::arith::TruncFOp>(loc, resultTypes, src,
-                                              attributes);
+      return mlir::arith::TruncFOp::create(*b, loc, resultTypes, src,
+                                           attributes);
     }
-    return b->create<mlir::arith::ExtFOp>(loc, resultTypes, src, attributes);
+    return mlir::arith::ExtFOp::create(*b, loc, resultTypes, src, attributes);
   }
   if (targetType.isInteger(/*width=*/1)) {
     // When casting to bool, we need to compare whether the value is equal to
     // zero.
     if (sourceType.isSignlessInteger() || sourceType.isUnsignedInteger()) {
-      Value zeroIntval = b->create<arith::ConstantOp>(
-          loc, b->getZeroAttr(args.front().getType()));
-      return b->create<mlir::arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
-                                            args.front(), zeroIntval);
+      Value zeroIntval = arith::ConstantOp::create(
+          *b, loc, b->getZeroAttr(args.front().getType()));
+      return mlir::arith::CmpIOp::create(*b, loc, arith::CmpIPredicate::ne,
+                                         args.front(), zeroIntval);
     }
     if (mlir::isa<FloatType>(sourceType)) {
-      Value zero = b->create<arith::ConstantOp>(
-          loc, b->getZeroAttr(args.front().getType()));
-      return b->create<mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::UNE,
-                                            args.front(), zero);
+      Value zero = arith::ConstantOp::create(
+          *b, loc, b->getZeroAttr(args.front().getType()));
+      return mlir::arith::CmpFOp::create(*b, loc, arith::CmpFPredicate::UNE,
+                                         args.front(), zero);
     }
   }
   if (mlir::isa<IntegerType>(sourceType) &&
@@ -693,17 +695,17 @@ inline Value mapConvertOpToStdScalarOp(Location loc, ArrayRef<Type> targetTypes,
     auto src = mlir::cast<IntegerType>(sourceType);
     auto res = mlir::cast<IntegerType>(targetType);
     if (src.getWidth() > res.getWidth()) {
-      return b->create<mlir::arith::TruncIOp>(loc, resultTypes, args,
-                                              attributes);
+      return mlir::arith::TruncIOp::create(*b, loc, resultTypes, args,
+                                           attributes);
     }
     if (src.getWidth() < res.getWidth()) {
       // Special case boolean values, so they get casted to `1` instead of `-1`.
       if (IsUnsignedIntegerType{}(src)) {
-        return b->create<mlir::arith::ExtUIOp>(loc, resultTypes, args,
-                                               attributes);
+        return mlir::arith::ExtUIOp::create(*b, loc, resultTypes, args,
+                                            attributes);
       }
-      return b->create<mlir::arith::ExtSIOp>(loc, resultTypes, args,
-                                             attributes);
+      return mlir::arith::ExtSIOp::create(*b, loc, resultTypes, args,
+                                          attributes);
     }
     // No conversion is needed for the same width integers
     return args.front();
@@ -711,11 +713,13 @@ inline Value mapConvertOpToStdScalarOp(Location loc, ArrayRef<Type> targetTypes,
   if (targetType.isUnsignedInteger() &&
       mlir::arith::FPToUIOp::areCastCompatible(convertedSourceType,
                                                targetType)) {
-    return b->create<mlir::arith::FPToUIOp>(loc, resultTypes, args, attributes);
+    return mlir::arith::FPToUIOp::create(*b, loc, resultTypes, args,
+                                         attributes);
   }
   if (mlir::arith::FPToSIOp::areCastCompatible(convertedSourceType,
                                                targetType)) {
-    return b->create<mlir::arith::FPToSIOp>(loc, resultTypes, args, attributes);
+    return mlir::arith::FPToSIOp::create(*b, loc, resultTypes, args,
+                                         attributes);
   }
   if (mlir::isa<ComplexType>(targetType)) {
     Type targetElementType =
@@ -732,12 +736,12 @@ inline Value mapConvertOpToStdScalarOp(Location loc, ArrayRef<Type> targetTypes,
       assert(!mlir::isa<ComplexType>(sourceElementType) &&
              "elements of complex numbers should not be complex");
       Value sourceReal =
-          b->create<mlir::complex::ReOp>(loc, sourceElementType, args.front());
+          mlir::complex::ReOp::create(*b, loc, sourceElementType, args.front());
       targetReal = mapConvertOpToStdScalarOp(
           loc, targetElementType, targetElementType, sourceElementType,
           sourceReal, attributes, b);
       Value sourceImag =
-          b->create<mlir::complex::ImOp>(loc, sourceElementType, args.front());
+          mlir::complex::ImOp::create(*b, loc, sourceElementType, args.front());
       targetImag = mapConvertOpToStdScalarOp(
           loc, targetElementType, targetElementType, sourceElementType,
           sourceImag, attributes, b);
@@ -747,18 +751,18 @@ inline Value mapConvertOpToStdScalarOp(Location loc, ArrayRef<Type> targetTypes,
       targetReal =
           mapConvertOpToStdScalarOp(loc, targetElementType, targetElementType,
                                     argTypes, args, attributes, b);
-      targetImag = b->create<mlir::arith::ConstantOp>(
-          loc, b->getFloatAttr(targetElementType, 0.0));
+      targetImag = mlir::arith::ConstantOp::create(
+          *b, loc, b->getFloatAttr(targetElementType, 0.0));
     }
-    return b->create<mlir::complex::CreateOp>(loc, targetType, targetReal,
-                                              targetImag);
+    return mlir::complex::CreateOp::create(*b, loc, targetType, targetReal,
+                                           targetImag);
   }
   if (auto sourceComplexType = mlir::dyn_cast<ComplexType>(sourceType)) {
     auto sourceElementType = sourceComplexType.getElementType();
     // When converting from complex to a non-complex type, we take just the real
     // part of the complex number.
     Value sourceReal =
-        b->create<mlir::complex::ReOp>(loc, sourceElementType, args.front());
+        mlir::complex::ReOp::create(*b, loc, sourceElementType, args.front());
     return mapConvertOpToStdScalarOp(loc, targetTypes, resultTypes,
                                      sourceElementType, sourceReal, attributes,
                                      b);
@@ -780,8 +784,8 @@ inline Value mapMhloOpToStdScalarOp<mhlo::BitcastConvertOp>(
   if (resultType.getIntOrFloatBitWidth() != argType.getIntOrFloatBitWidth())
     return nullptr;
 
-  return b->create<mlir::arith::BitcastOp>(loc, resultTypes,
-                                           adaptor.getOperands(), attributes);
+  return mlir::arith::BitcastOp::create(*b, loc, resultTypes,
+                                        adaptor.getOperands(), attributes);
 }
 
 template <>
@@ -819,11 +823,11 @@ inline Value mapMhloOpToStdScalarOp<mhlo::IsFiniteOp>(
   if (mlir::isa<FloatType>(adaptor.getX().getType())) {
     auto posInf = APFloat::getInf(
         mlir::cast<FloatType>(adaptor.getX().getType()).getFloatSemantics());
-    auto constPosInf = b->create<arith::ConstantOp>(
-        loc, b->getFloatAttr(adaptor.getX().getType(), posInf));
-    Value absX = b->create<::mlir::math::AbsFOp>(loc, adaptor.getX());
-    return b->create<::mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::ONE,
-                                            absX, constPosInf);
+    auto constPosInf = arith::ConstantOp::create(
+        *b, loc, b->getFloatAttr(adaptor.getX().getType(), posInf));
+    Value absX = ::mlir::math::AbsFOp::create(*b, loc, adaptor.getX());
+    return ::mlir::arith::CmpFOp::create(*b, loc, arith::CmpFPredicate::ONE,
+                                         absX, constPosInf);
   }
   return nullptr;
 }
@@ -867,13 +871,13 @@ inline Value mhloAlwaysPropagateNaN(Value v, ValueRange args, Location loc,
                                     OpBuilder* b) {
   Type elementType = getElementTypeOrSelf(args.front().getType());
   if (auto floatType = mlir::dyn_cast<FloatType>(elementType)) {
-    Value isnan = b->create<mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
-                                                 args[0], args[1]);
+    Value isnan = mlir::arith::CmpFOp::create(
+        *b, loc, arith::CmpFPredicate::UNO, args[0], args[1]);
 
     auto nanApfloat = APFloat::getQNaN(floatType.getFloatSemantics());
     Value nan = getConstantOrSplat(b, loc, args[0].getType(),
                                    b->getFloatAttr(floatType, nanApfloat));
-    v = b->create<mlir::arith::SelectOp>(loc, isnan, nan, v);
+    v = mlir::arith::SelectOp::create(*b, loc, isnan, nan, v);
   }
   return v;
 }
@@ -898,36 +902,36 @@ inline Value makeSafeIntDiv(ImplicitLocOpBuilder& lb, bool isUnsigned,
                             Value returnedOnSignedOverflow) {
   Type type = lhs.getType();
   auto elementType = mlir::cast<IntegerType>(getElementTypeOrSelf(type));
-  Value zero = lb.create<arith::ConstantOp>(lb.getZeroAttr(type));
+  Value zero = arith::ConstantOp::create(lb, lb.getZeroAttr(type));
   auto makeConstant = [&](const APInt& i) {
     return getConstantOrSplat(&lb, lb.getLoc(), type,
                               lb.getIntegerAttr(elementType, i));
   };
   Value one = makeConstant(APInt(elementType.getWidth(), 1));
   Value rhsIsZero =
-      lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq, rhs, zero);
+      arith::CmpIOp::create(lb, arith::CmpIPredicate::eq, rhs, zero);
 
   // For unsigned just set the divisor to 1 when it would be 0.
   if (isUnsigned) {
-    Value safeRhs = lb.create<arith::SelectOp>(rhsIsZero, one, rhs);
-    Value safeDiv = lb.create<U>(lhs, safeRhs);
-    return lb.create<arith::SelectOp>(rhsIsZero, returnedOnZero, safeDiv);
+    Value safeRhs = arith::SelectOp::create(lb, rhsIsZero, one, rhs);
+    Value safeDiv = U::create(lb, lhs, safeRhs);
+    return arith::SelectOp::create(lb, rhsIsZero, returnedOnZero, safeDiv);
   }
 
   // For signed also check for INT_MIN / -1.
   Value smin = makeConstant(APInt::getSignedMinValue(elementType.getWidth()));
   Value lhsIsSmin =
-      lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq, lhs, smin);
+      arith::CmpIOp::create(lb, arith::CmpIPredicate::eq, lhs, smin);
   Value minusOne = makeConstant(APInt::getAllOnes(elementType.getWidth()));
   Value rhsIsMinusOne =
-      lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq, rhs, minusOne);
-  Value hasIntMinOverflow = lb.create<arith::AndIOp>(lhsIsSmin, rhsIsMinusOne);
-  Value rhsIsUnsafe = lb.create<arith::OrIOp>(rhsIsZero, hasIntMinOverflow);
-  Value safeRhs = lb.create<arith::SelectOp>(rhsIsUnsafe, one, rhs);
-  Value safeDiv = lb.create<S>(lhs, safeRhs);
-  Value safeSmin = lb.create<arith::SelectOp>(
-      hasIntMinOverflow, returnedOnSignedOverflow, safeDiv);
-  return lb.create<arith::SelectOp>(rhsIsZero, returnedOnZero, safeSmin);
+      arith::CmpIOp::create(lb, arith::CmpIPredicate::eq, rhs, minusOne);
+  Value hasIntMinOverflow = arith::AndIOp::create(lb, lhsIsSmin, rhsIsMinusOne);
+  Value rhsIsUnsafe = arith::OrIOp::create(lb, rhsIsZero, hasIntMinOverflow);
+  Value safeRhs = arith::SelectOp::create(lb, rhsIsUnsafe, one, rhs);
+  Value safeDiv = S::create(lb, lhs, safeRhs);
+  Value safeSmin = arith::SelectOp::create(lb, hasIntMinOverflow,
+                                           returnedOnSignedOverflow, safeDiv);
+  return arith::SelectOp::create(lb, rhsIsZero, returnedOnZero, safeSmin);
 }
 
 template <>
@@ -978,7 +982,7 @@ inline Value mapMhloOpToStdScalarOp<mhlo::RemOp>(
   // INT_SMIN %s -1 = 0
   ImplicitLocOpBuilder lb(loc, *b);
   Type type = adaptor.getLhs().getType();
-  Value zero = lb.create<arith::ConstantOp>(lb.getZeroAttr(type));
+  Value zero = arith::ConstantOp::create(lb, lb.getZeroAttr(type));
   return makeSafeIntDiv<arith::RemUIOp, arith::RemSIOp>(
       lb, originalType.isUnsignedInteger(), adaptor.getLhs(), adaptor.getRhs(),
       /*returnedOnZero=*/adaptor.getLhs(),
@@ -1000,7 +1004,7 @@ inline Value mapMhloOpToStdScalarOp<mhlo::NegOp>(
     // lmhlo.neg(x, result) -> result = sub(0, x)
     Value lhs = adaptor.getOperand();
     Value zeroIntval =
-        b->create<arith::ConstantOp>(loc, b->getZeroAttr(lhs.getType()));
+        arith::ConstantOp::create(*b, loc, b->getZeroAttr(lhs.getType()));
     return b->create<ScalarIOp<mhlo::SubtractOp>>(loc, zeroIntval, lhs);
   }
   return nullptr;
@@ -1018,7 +1022,8 @@ inline Value mapMhloOpToStdScalarOp<mhlo::NotOp>(
         b, loc, adaptor.getOperand().getType(),
         b->getIntegerAttr(integerType,
                           APInt::getAllOnes(integerType.getWidth())));
-    return b->create<::mlir::arith::XOrIOp>(loc, allOnes, adaptor.getOperand());
+    return ::mlir::arith::XOrIOp::create(*b, loc, allOnes,
+                                         adaptor.getOperand());
   }
   return nullptr;
 }
@@ -1037,7 +1042,7 @@ inline Value mapMhloOpToStdScalarOp<mhlo::LogisticOp>(
   Type type = getElementTypeOrSelf(resultTypes[0]);
   Value oneFloat =
       mlir::isa<ComplexType>(type)
-          ? b->create<arith::ConstantOp>(loc, b->getF32FloatAttr(1.0))
+          ? arith::ConstantOp::create(*b, loc, b->getF32FloatAttr(1.0))
           : getConstantOrSplat(b, loc, resultTypes[0],
                                FloatAttr::get(type, 1.0f));
   Value one = mapConvertOpToStdScalarOp(loc, resultTypes, resultTypes,
@@ -1067,51 +1072,51 @@ inline Value mapMhloOpToStdScalarOp<mhlo::PowOp>(
   // Exponentiation by squaring:
   // https://en.wikipedia.org/wiki/Exponentiation_by_squaring;
   Value negOne =
-      lb.create<arith::ConstantOp>(lb.getIntegerAttr(resultType, -1));
-  Value zero = lb.create<arith::ConstantOp>(lb.getIntegerAttr(resultType, 0));
-  Value one = lb.create<arith::ConstantOp>(lb.getIntegerAttr(resultType, 1));
-  Value two = lb.create<arith::ConstantOp>(lb.getIntegerAttr(resultType, 2));
-  Value step = lb.create<arith::ConstantIndexOp>(1);
-  Value lowerBound = lb.create<arith::ConstantIndexOp>(0);
+      arith::ConstantOp::create(lb, lb.getIntegerAttr(resultType, -1));
+  Value zero = arith::ConstantOp::create(lb, lb.getIntegerAttr(resultType, 0));
+  Value one = arith::ConstantOp::create(lb, lb.getIntegerAttr(resultType, 1));
+  Value two = arith::ConstantOp::create(lb, lb.getIntegerAttr(resultType, 2));
+  Value step = arith::ConstantIndexOp::create(lb, 1);
+  Value lowerBound = arith::ConstantIndexOp::create(lb, 0);
   // Everything else would overflow for any exponent > 1, as 2^64
   // is the larget possible exponent for a 64-bit integer, and
   // that's 1 << 6.
-  Value upperBound = lb.create<arith::ConstantIndexOp>(6);
+  Value upperBound = arith::ConstantIndexOp::create(lb, 6);
   auto originalBase = adaptor.getLhs();
   auto originalExponent = adaptor.getRhs();
 
   Value accum =
-      lb.create<scf::ForOp>(
-            lowerBound, upperBound, step,
-            SmallVector<Value>({one, originalBase, originalExponent}),
-            [&](OpBuilder& b, Location, Value /*v*/, ValueRange iters) {
-              Value accum = iters[0];
-              Value base = iters[1];
-              Value exponent = iters[2];
-
-              Value condition = b.create<arith::CmpIOp>(
-                  loc, arith::CmpIPredicate::eq,
-                  b.create<::mlir::arith::AndIOp>(loc, exponent, one), one);
-              Value multiplied =
-                  b.create<::mlir::arith::MulIOp>(loc, accum, base);
-              accum = b.create<::mlir::arith::SelectOp>(loc, condition,
-                                                        multiplied, accum);
-              base = b.create<::mlir::arith::MulIOp>(loc, base, base);
-              exponent = b.create<::mlir::arith::ShRUIOp>(loc, exponent, one);
-              b.create<scf::YieldOp>(
-                  loc, SmallVector<Value>({accum, base, exponent}));
-            })
+      scf::ForOp::create(
+          lb, lowerBound, upperBound, step,
+          SmallVector<Value>({one, originalBase, originalExponent}),
+          [&](OpBuilder& b, Location, Value /*v*/, ValueRange iters) {
+            Value accum = iters[0];
+            Value base = iters[1];
+            Value exponent = iters[2];
+
+            Value condition = arith::CmpIOp::create(
+                b, loc, arith::CmpIPredicate::eq,
+                ::mlir::arith::AndIOp::create(b, loc, exponent, one), one);
+            Value multiplied =
+                ::mlir::arith::MulIOp::create(b, loc, accum, base);
+            accum = ::mlir::arith::SelectOp::create(b, loc, condition,
+                                                    multiplied, accum);
+            base = ::mlir::arith::MulIOp::create(b, loc, base, base);
+            exponent = ::mlir::arith::ShRUIOp::create(b, loc, exponent, one);
+            scf::YieldOp::create(b, loc,
+                                 SmallVector<Value>({accum, base, exponent}));
+          })
           .getResult(0);
 
-  Value rhsIsEven = lb.create<arith::CmpIOp>(
-      arith::CmpIPredicate::eq,
-      lb.create<arith::RemSIOp>(adaptor.getRhs(), two), zero);
-  Value rhsIsNegative = lb.create<arith::CmpIOp>(arith::CmpIPredicate::slt,
-                                                 adaptor.getRhs(), zero);
-  Value lhsIsOne =
-      lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq, adaptor.getLhs(), one);
-  Value lhsIsNegOne = lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
-                                               adaptor.getLhs(), negOne);
+  Value rhsIsEven = arith::CmpIOp::create(
+      lb, arith::CmpIPredicate::eq,
+      arith::RemSIOp::create(lb, adaptor.getRhs(), two), zero);
+  Value rhsIsNegative = arith::CmpIOp::create(lb, arith::CmpIPredicate::slt,
+                                              adaptor.getRhs(), zero);
+  Value lhsIsOne = arith::CmpIOp::create(lb, arith::CmpIPredicate::eq,
+                                         adaptor.getLhs(), one);
+  Value lhsIsNegOne = arith::CmpIOp::create(lb, arith::CmpIPredicate::eq,
+                                            adaptor.getLhs(), negOne);
 
   // The accum is correct when the rhs is non-negative. When rhs is
   // negative, we return 0 for integer, with the exception of lhs values of 1
@@ -1122,12 +1127,12 @@ inline Value mapMhloOpToStdScalarOp<mhlo::PowOp>(
   // - Return 1 or -1 depending on the parity of rhs when the lhs is -1.
   // - Return 1 if lhs is 1.
   // - Else return 0.
-  Value ifLhsIsOne = lb.create<::mlir::arith::SelectOp>(lhsIsOne, one, zero);
-  Value ifLhsIsNegOne = lb.create<::mlir::arith::SelectOp>(
-      lhsIsNegOne, lb.create<::mlir::arith::SelectOp>(rhsIsEven, one, negOne),
-      ifLhsIsOne);
-  return lb.create<::mlir::arith::SelectOp>(rhsIsNegative, ifLhsIsNegOne,
-                                            accum);
+  Value ifLhsIsOne = ::mlir::arith::SelectOp::create(lb, lhsIsOne, one, zero);
+  Value ifLhsIsNegOne = ::mlir::arith::SelectOp::create(
+      lb, lhsIsNegOne,
+      ::mlir::arith::SelectOp::create(lb, rhsIsEven, one, negOne), ifLhsIsOne);
+  return ::mlir::arith::SelectOp::create(lb, rhsIsNegative, ifLhsIsNegOne,
+                                         accum);
 }
 
 template <>
@@ -1148,35 +1153,35 @@ inline Value mapMhloOpToStdScalarOp<mhlo::SignOp>(
   Type elementType = getElementTypeOrSelf(operand.getType());
   if (auto floatType = mlir::dyn_cast<FloatType>(elementType)) {
     Value zero =
-        b->create<arith::ConstantOp>(loc, b->getZeroAttr(operand.getType()));
-    Value ne0I1 = b->create<::mlir::arith::CmpFOp>(
-        loc, arith::CmpFPredicate::ONE, operand, zero);
+        arith::ConstantOp::create(*b, loc, b->getZeroAttr(operand.getType()));
+    Value ne0I1 = ::mlir::arith::CmpFOp::create(
+        *b, loc, arith::CmpFPredicate::ONE, operand, zero);
     Value ne0Float =
-        b->create<::mlir::arith::UIToFPOp>(loc, zero.getType(), ne0I1);
-    Value copySign = b->create<::mlir::math::CopySignOp>(loc, resultTypes,
-                                                         ne0Float, operand);
-    auto isNan = b->create<::mlir::arith::CmpFOp>(
-        loc, arith::CmpFPredicate::UNO, operand, operand);
-    return b->create<::mlir::arith::SelectOp>(loc, isNan, operand, copySign);
+        ::mlir::arith::UIToFPOp::create(*b, loc, zero.getType(), ne0I1);
+    Value copySign = ::mlir::math::CopySignOp::create(*b, loc, resultTypes,
+                                                      ne0Float, operand);
+    auto isNan = ::mlir::arith::CmpFOp::create(
+        *b, loc, arith::CmpFPredicate::UNO, operand, operand);
+    return ::mlir::arith::SelectOp::create(*b, loc, isNan, operand, copySign);
   }
   if (auto integerType = mlir::dyn_cast<IntegerType>(elementType)) {
     // sign(x) = x == 0 ? 0 : ((x s>> 31) | 1)
     Value zero =
-        b->create<arith::ConstantOp>(loc, b->getZeroAttr(operand.getType()));
+        arith::ConstantOp::create(*b, loc, b->getZeroAttr(operand.getType()));
     Value bitwidthMinusOne = getConstantOrSplat(
         b, loc, operand.getType(),
         b->getIntegerAttr(integerType, integerType.getWidth() - 1));
     Value one = getConstantOrSplat(b, loc, operand.getType(),
                                    b->getIntegerAttr(integerType, 1));
-    Value cmp = b->create<::mlir::arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
-                                                 operand, zero);
+    Value cmp = ::mlir::arith::CmpIOp::create(*b, loc, arith::CmpIPredicate::eq,
+                                              operand, zero);
     Value ashr =
-        b->create<::mlir::arith::ShRSIOp>(loc, operand, bitwidthMinusOne);
-    Value orOp = b->create<::mlir::arith::OrIOp>(loc, ashr, one);
-    return b->create<::mlir::arith::SelectOp>(loc, cmp, zero, orOp);
+        ::mlir::arith::ShRSIOp::create(*b, loc, operand, bitwidthMinusOne);
+    Value orOp = ::mlir::arith::OrIOp::create(*b, loc, ashr, one);
+    return ::mlir::arith::SelectOp::create(*b, loc, cmp, zero, orOp);
   }
   if (mlir::isa<ComplexType>(elementType)) {
-    return b->create<::mlir::complex::SignOp>(loc, elementType, operand);
+    return ::mlir::complex::SignOp::create(*b, loc, elementType, operand);
   }
   return nullptr;
 }
@@ -1192,9 +1197,9 @@ inline Value selectShiftedOrSaturated(ImplicitLocOpBuilder& lb, Value rhs,
   auto bitWidthInt = etype.getIntOrFloatBitWidth();
   Value bitWidth = getConstantOrSplat(&lb, lb.getLoc(), type,
                                       lb.getIntegerAttr(etype, bitWidthInt));
-  Value cmp = lb.create<mlir::arith::CmpIOp>(mlir::arith::CmpIPredicate::ugt,
-                                             bitWidth, rhs);
-  return lb.create<mlir::arith::SelectOp>(cmp, shifted, saturated);
+  Value cmp = mlir::arith::CmpIOp::create(lb, mlir::arith::CmpIPredicate::ugt,
+                                          bitWidth, rhs);
+  return mlir::arith::SelectOp::create(lb, cmp, shifted, saturated);
 }
 
 template <>
@@ -1208,8 +1213,8 @@ inline Value mapMhloOpToStdScalarOp<mhlo::ShiftLeftOp>(
   Type type = lhs.getType();
 
   // "Saturate" if the shift is greater than the bitwidth of the type
-  Value zero = lb.create<arith::ConstantOp>(lb.getZeroAttr(type));
-  Value shifted = lb.create<mlir::arith::ShLIOp>(lhs, rhs);
+  Value zero = arith::ConstantOp::create(lb, lb.getZeroAttr(type));
+  Value shifted = mlir::arith::ShLIOp::create(lb, lhs, rhs);
 
   return selectShiftedOrSaturated(lb, rhs, shifted, zero, type);
 }
@@ -1225,8 +1230,8 @@ inline Value mapMhloOpToStdScalarOp<mhlo::ShiftRightLogicalOp>(
   Type type = lhs.getType();
 
   // "Saturate" if the shift is greater than the bitwidth of the type
-  Value zero = lb.create<arith::ConstantOp>(b->getZeroAttr(type));
-  Value shifted = lb.create<mlir::arith::ShRUIOp>(lhs, rhs);
+  Value zero = arith::ConstantOp::create(lb, b->getZeroAttr(type));
+  Value shifted = mlir::arith::ShRUIOp::create(lb, lhs, rhs);
 
   return selectShiftedOrSaturated(lb, rhs, shifted, zero, type);
 }
@@ -1248,8 +1253,8 @@ inline Value mapMhloOpToStdScalarOp<mhlo::ShiftRightArithmeticOp>(
   // "Saturate" if the shift is greater than the bitwidth of the type
   Value maxShift = getConstantOrSplat(
       b, loc, type, lb.getIntegerAttr(etype, bitWidthInt - 1));
-  Value saturatedShifted = lb.create<mlir::arith::ShRSIOp>(lhs, maxShift);
-  Value shifted = lb.create<mlir::arith::ShRSIOp>(lhs, rhs);
+  Value saturatedShifted = mlir::arith::ShRSIOp::create(lb, lhs, maxShift);
+  Value shifted = mlir::arith::ShRSIOp::create(lb, lhs, rhs);
 
   return selectShiftedOrSaturated(lb, rhs, shifted, saturatedShifted, type);
 }

From fbfba09a1b7a82e920a3f7348cc9b02d367a5f0b Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 18 Dec 2025 07:57:11 -0800
Subject: [PATCH 532/753] Remove unnecessary local_defines and add missing
 includes in gpu_executor_test.

The local_defines for CUDA/ROCM are not required for this test. Added explicit includes for headers used in gpu_executor_test.cc.

PiperOrigin-RevId: 846269233
---
 third_party/xla/xla/stream_executor/gpu/BUILD              | 7 ++++---
 .../xla/xla/stream_executor/gpu/gpu_executor_test.cc       | 4 ++++
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 471b043735dcf8..17f895e12c143a 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -651,11 +651,12 @@ xla_test(
     size = "small",
     srcs = ["gpu_executor_test.cc"],
     backends = ["gpu"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
-        "TENSORFLOW_USE_ROCM=1",
-    ]),
     deps = [
         "//xla/service:platform_util",
+        "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:memory_allocation",
+        "//xla/stream_executor:memory_space",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
         "//xla/stream_executor:stream_executor_h",
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
index b038ecc139d921..47f59c064bc088 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_executor_test.cc
@@ -20,6 +20,10 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/ascii.h"
 #include "xla/service/platform_util.h"
+#include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_description.h"
+#include "xla/stream_executor/memory_allocation.h"
+#include "xla/stream_executor/memory_space.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform_manager.h"
 #include "xla/stream_executor/stream_executor.h"

From f644aa87f76850e916bf50a8eb4f64caf2c6cb77 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 18 Dec 2025 07:57:35 -0800
Subject: [PATCH 533/753] PR #35482: Correctly generate compile_commands.json

Imported from GitHub PR https://github.com/openxla/xla/pull/35482

Sometime json incorrectly parse compile commands from bazel, and we end up passing them as

```
"-isystem path/to/includes"
```

to `clangd`, and these flags parsed incorrectly
Copybara import of the project:

--
adf291e21b098d79fa3be4065ee02fafdf5c660a by Eugene Zhulenev <ezhulenev@google.com>:

Correctly generate compile_commands.json

Merging this change closes #35482

PiperOrigin-RevId: 846269357
---
 .../xla/build_tools/lint/generate_compile_commands.py       | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/build_tools/lint/generate_compile_commands.py b/third_party/xla/build_tools/lint/generate_compile_commands.py
index ec9d6fe0d2037b..1c7e6f930931ed 100644
--- a/third_party/xla/build_tools/lint/generate_compile_commands.py
+++ b/third_party/xla/build_tools/lint/generate_compile_commands.py
@@ -67,7 +67,11 @@ def from_args_list(cls, args_list: list[str]) -> "CompileCommand":
       if arg.endswith(".cc"):
         cc_file = arg
 
-      filtered_args.append(arg)
+      # Split generated commands, because otherwise they get wrapped
+      # into "command with spaces" when passed to clangd, and clangd
+      # can't parse them correctly.
+      for s in arg.split(" "):
+        filtered_args.append(s)
 
     return cls(cc_file, filtered_args)
 

From f4c5fe5509cfc4c35db7d92c94fe5fba3f865341 Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Thu, 18 Dec 2025 08:14:19 -0800
Subject: [PATCH 534/753] In `FileDescriptor` tests, improve temporary file
 path generation.

Depending on the compiler, `testing::TempDir() + __FUNCTION__` may generate and
invalid file name.

PiperOrigin-RevId: 846275995
---
 .../lite/delegates/xnnpack/file_util_test.cc  | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tensorflow/lite/delegates/xnnpack/file_util_test.cc b/tensorflow/lite/delegates/xnnpack/file_util_test.cc
index c7f204befd4776..9a1ce5e50aa5f2 100644
--- a/tensorflow/lite/delegates/xnnpack/file_util_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/file_util_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <fcntl.h>
 
+#include <atomic>
 #include <string>
 #include <type_traits>
 #include <utility>
@@ -25,6 +26,14 @@ limitations under the License.
 namespace tflite::xnnpack {
 namespace {
 
+// Returns a path for a temporary file.
+//
+// Each call will return a new path.
+std::string NewTempFilePath() {
+  static std::atomic<int> i = 0;
+  return testing::TempDir() + "test_file_" + std::to_string(i++);
+}
+
 TEST(FileDescriptorTest, DefaultConstructedIsInvalid) {
   FileDescriptor fd;
   EXPECT_FALSE(fd.IsValid());
@@ -54,7 +63,7 @@ TEST(FileDescriptorTest, OpenNullFileFails) {
 }
 
 TEST(FileDescriptorTest, OpenWriteRewindAndReadWorks) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   FileDescriptor fd =
       FileDescriptor::Open(tmp_file.c_str(), O_CREAT | O_TRUNC | O_RDWR, 0644);
   ASSERT_TRUE(fd.IsValid());
@@ -67,7 +76,7 @@ TEST(FileDescriptorTest, OpenWriteRewindAndReadWorks) {
 }
 
 TEST(FileDescriptorTest, WriteFailureReturnsFalse) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
                                            O_CREAT | O_TRUNC | O_RDONLY, 0644);
   ASSERT_TRUE(fd.IsValid());
@@ -76,7 +85,7 @@ TEST(FileDescriptorTest, WriteFailureReturnsFalse) {
 }
 
 TEST(FileDescriptorTest, ReadFailureReturnsFalse) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
                                            O_CREAT | O_TRUNC | O_WRONLY, 0644);
   ASSERT_TRUE(fd.IsValid());
@@ -85,7 +94,7 @@ TEST(FileDescriptorTest, ReadFailureReturnsFalse) {
 }
 
 TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnEmptyFileThatExists) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
                                            O_CREAT | O_TRUNC | O_WRONLY, 0644);
   fd.Close();
@@ -93,20 +102,20 @@ TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnEmptyFileThatExists) {
 }
 
 TEST(FileDescriptorTest, IsFileEmptyReturnTrueForAnNonExistingFile) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   EXPECT_TRUE(IsFileEmpty(tmp_file.c_str(), FileDescriptor()));
 }
 
 TEST(FileDescriptorTest,
      IsFileEmptyReturnTrueForAnNonExistingFileWithFileDescriptor) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
                                            O_CREAT | O_TRUNC | O_WRONLY, 0644);
   EXPECT_TRUE(IsFileEmpty("asdfasdf", FileDescriptor()));
 }
 
 TEST(FileDescriptorTest, IsFileEmptyReturnFalseForAFileThatHasContents) {
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
+  const std::string tmp_file = NewTempFilePath();
   FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
                                            O_CREAT | O_TRUNC | O_WRONLY, 0644);
   const std::string src_data = "The quick brown fox jumps over the lazy dog.";
@@ -117,8 +126,8 @@ TEST(FileDescriptorTest, IsFileEmptyReturnFalseForAFileThatHasContents) {
 TEST(FileDescriptorTest, IsFileEmptyPrioritizesTheFileDescriptor) {
   // We open 2 files, put some data only in one and then pass the file name of
   // the one that has data and the file descriptor of the empty one.
-  const std::string tmp_file = testing::TempDir() + __FUNCTION__;
-  const std::string tmp_file2 = testing::TempDir() + __FUNCTION__ + "2";
+  const std::string tmp_file = NewTempFilePath();
+  const std::string tmp_file2 = NewTempFilePath();
   FileDescriptor fd = FileDescriptor::Open(tmp_file.c_str(),
                                            O_CREAT | O_TRUNC | O_WRONLY, 0644);
   FileDescriptor fd2 = FileDescriptor::Open(tmp_file2.c_str(),

From 91088251a0bc57cc47255836b8d320330f7a3331 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Thu, 18 Dec 2025 08:19:37 -0800
Subject: [PATCH 535/753] PR #35463: [xla:gpu] Support ncclAlltoall directly
 for contiguous send/recv buffers

Imported from GitHub PR https://github.com/openxla/xla/pull/35463

With latest NCCL we can use `ncclAlltoall` API directly without having to launch grouped send and recv operations.
Copybara import of the project:

--
0630f4d48049b211442dcb1754e521a4b1f37f7b by Eugene Zhulenev <ezv@amazon.com>:

[xla:gpu] Support ncclAlltoall directly for contiguous send/recv buffers

Merging this change closes #35463

PiperOrigin-RevId: 846277559
---
 .../gpu/collectives/nccl_communicator.cc      | 45 ++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc
index 3c4b043ad1287d..1924b34a02320b 100644
--- a/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc
+++ b/third_party/xla/xla/backends/gpu/collectives/nccl_communicator.cc
@@ -640,6 +640,32 @@ absl::Status NcclCommunicator::LaunchAllGather(
   return absl::OkStatus();
 }
 
+// If all buffers are contiguous returns a device address range that covers all
+// of them, otherwise returns an empty optional.
+static std::optional<se::DeviceAddressBase> IsContinguous(
+    absl::Span<const se::DeviceAddressBase> buffers) {
+  if (buffers.empty()) {
+    return std::nullopt;
+  }
+
+  if (buffers.size() == 1) {
+    return buffers[0];
+  }
+
+  size_t total_size = buffers[0].size();
+  for (size_t i = 1; i < buffers.size(); ++i) {
+    se::DeviceAddress<uint8_t> a(buffers[i - 1]);
+    se::DeviceAddress<uint8_t> b(buffers[i]);
+    total_size += b.size();
+
+    if (a.base() + a.size() != b.base()) {
+      return std::nullopt;
+    }
+  }
+
+  return se::DeviceAddressBase(buffers[0].opaque(), total_size);
+}
+
 absl::Status NcclCommunicator::LaunchAllToAll(
     absl::InlinedVector<se::DeviceAddressBase, 4> send_buffers,
     absl::InlinedVector<se::DeviceAddressBase, 4> recv_buffers,
@@ -653,12 +679,18 @@ absl::Status NcclCommunicator::LaunchAllToAll(
     absl::StrAppendFormat(out, "%p", buffer.opaque());
   };
 
+  auto send_contiguous = IsContinguous(send_buffers);
+  auto recv_contiguous = IsContinguous(recv_buffers);
+
   VLOG(3) << absl::StreamFormat(
       "[%d] Launch NCCL AllToAll operation; send_buffers=[%s]; "
-      "recv_buffers=[%s]; dtype=%s; count=%d; comm=%p; stream=%p",
+      "send_contiguous=%v; recv_buffers=[%s]; recv_contiguous=%v; dtype=%s; "
+      "count=%d; comm=%p; stream=%p",
       stream->parent()->device_ordinal(),
       absl::StrJoin(send_buffers, ", ", buffer_formatter),
+      send_contiguous.has_value(),
       absl::StrJoin(recv_buffers, ", ", buffer_formatter),
+      recv_contiguous.has_value(),
       primitive_util::LowercasePrimitiveTypeName(dtype), count, comm_, stream);
 
   if (send_buffers.size() != recv_buffers.size()) {
@@ -678,6 +710,17 @@ absl::Status NcclCommunicator::LaunchAllToAll(
 
   TF_ASSIGN_OR_RETURN(ncclDataType_t nccl_dtype, ToNcclDataType(dtype, false));
 
+#if NCCL_VERSION_CODE >= 22800
+  // If send and receive buffers are contiguous we can use all-to-all API from
+  // NCCL directly without launching individual send/recv operations.
+  if (send_contiguous && recv_contiguous) {
+    XLA_NCCL_RETURN_IF_ERROR(ncclAlltoAll(
+        send_contiguous->opaque(), recv_contiguous->opaque(),
+        ToNcclCount(dtype, count), nccl_dtype, comm_, AsCudaStream(stream)));
+    return absl::OkStatus();
+  }
+#endif
+
   TF_RETURN_IF_ERROR(GroupStart());
   for (size_t i = 0; i < send_buffers.size(); ++i) {
     se::DeviceAddressBase send_buffer = send_buffers[i];

From 5db58f8f58a6ff49cabfacc7bc1c938099da4854 Mon Sep 17 00:00:00 2001
From: Penporn Koanantakool <penporn@google.com>
Date: Thu, 18 Dec 2025 09:22:30 -0800
Subject: [PATCH 536/753] [xla:cpu] Do not expand convolution feature group if
 the convolution is supported by libraries.

PiperOrigin-RevId: 846299624
---
 .../xla/xla/backends/cpu/transforms/BUILD     | 11 ++++
 .../cpu/transforms/ynn_matcher_test.cc        | 54 +++++++++++++++++++
 .../xla/xla/service/cpu/cpu_compiler.cc       | 28 ++++++----
 3 files changed, 83 insertions(+), 10 deletions(-)
 create mode 100644 third_party/xla/xla/backends/cpu/transforms/ynn_matcher_test.cc

diff --git a/third_party/xla/xla/backends/cpu/transforms/BUILD b/third_party/xla/xla/backends/cpu/transforms/BUILD
index 67603bc203cdac..ff400cc58cf198 100644
--- a/third_party/xla/xla/backends/cpu/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/transforms/BUILD
@@ -113,3 +113,14 @@ cc_library(
         "@local_tsl//tsl/platform:protobuf",
     ],
 )
+
+xla_cc_test(
+    name = "ynn_matcher_test",
+    srcs = ["ynn_matcher_test.cc"],
+    deps = [
+        "//xla:xla_proto_cc",
+        "//xla/service:cpu_plugin",
+        "//xla/tests:hlo_test_base",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/third_party/xla/xla/backends/cpu/transforms/ynn_matcher_test.cc b/third_party/xla/xla/backends/cpu/transforms/ynn_matcher_test.cc
new file mode 100644
index 00000000000000..d03d56c5a68f36
--- /dev/null
+++ b/third_party/xla/xla/backends/cpu/transforms/ynn_matcher_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gtest/gtest.h>
+#include "xla/tests/hlo_test_base.h"
+#include "xla/xla.pb.h"
+
+namespace xla::cpu {
+namespace {
+
+class YnnE2eTest : public HloTestBase {
+ protected:
+  DebugOptions GetDebugOptionsForTest() const override {
+    DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest();
+    debug_options.add_xla_cpu_experimental_ynn_fusion_type(
+        DebugOptions::LIBRARY_FUSION_TYPE_INDIVIDUAL_CONVOLUTION);
+    debug_options.clear_xla_cpu_experimental_ynn_fusion_type();
+    return debug_options;
+  }
+};
+
+TEST_F(YnnE2eTest, DoNotDegroupConvolutionFeatures) {
+  const char* matmul_module_str = R"(
+  HloModule convolution
+
+  ENTRY %main {
+    %lhs = f32[1,7,8,9] parameter(0)
+    %rhs = f32[1,5,3,9] parameter(1)
+    ROOT %conv = f32[1,4,8,9] convolution(%lhs, %rhs),
+        window={size=1x5 stride=2x1 pad=0_0x2_2}, dim_labels=b01f_01io->b01f,
+        feature_group_count=3
+  })";
+
+  // If the convolution feature group is de-grouped, the shape will change to:
+  //   f32[1,4,8,3,3]{4,3,2,1,0}
+  // This convolution is supported by YNNPACK, so the shape should not change.
+  MatchOptimizedHlo(matmul_module_str,
+                    "CHECK: f32[1,4,8,9]{3,2,1,0} convolution");
+}
+
+}  // namespace
+}  // namespace xla::cpu
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 540a2a6f1550fa..8675446af52fc6 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -781,17 +781,25 @@ absl::Status CpuCompiler::RunHloPassesThroughLayoutAssn(
     return false;
   };
   pipeline.AddPass<ConvolutionGroupConverter>(
-      /*should_expand=*/[](HloInstruction* conv) { return true; }, cost_model,
+      /*should_expand=*/
+      [&library_supports_convolution](HloInstruction* conv) {
+        return !library_supports_convolution(*conv);
+      },
+      cost_model,
       /*convert_batch_groups_only=*/true);
-  auto feature_group_should_expand = [](HloInstruction* conv) {
-    switch (conv->shape().element_type()) {
-      case F16:
-      case F32:
-        return false;
-      default:
-        return true;
-    }
-  };
+  auto feature_group_should_expand =
+      [&library_supports_convolution](HloInstruction* conv) {
+        if (library_supports_convolution(*conv)) {
+          return false;
+        }
+        switch (conv->shape().element_type()) {
+          case F16:
+          case F32:
+            return false;
+          default:
+            return true;
+        }
+      };
   pipeline.AddPass<ConvolutionGroupConverter>(
       feature_group_should_expand, cost_model,
       /*convert_batch_groups_only=*/false);

From c549ee47f87dc9083b4891b7f6dafa2063ddf12a Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Thu, 18 Dec 2025 09:59:09 -0800
Subject: [PATCH 537/753] [XLA:GPU] Use StreamState as rendezvous value.

We can add output pointer to StreamState and it will have all the information for rendezvour. No need to have a separate RendezvousValue struct.

PiperOrigin-RevId: 846313928
---
 .../xla/xla/backends/gpu/runtime/BUILD        |   1 +
 .../gpu/runtime/ragged_all_to_all_thunk.cc    | 116 +++++++-----------
 .../gpu/runtime/ragged_all_to_all_thunk.h     |  18 +++
 3 files changed, 65 insertions(+), 70 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index eff15f7bd6646c..d6c17f5e6bd4bf 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1504,6 +1504,7 @@ cc_library(
         "//xla/stream_executor:memory_allocation",
         "//xla/stream_executor:stream",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
index f6d65a9a110d13..f89a39e29bc3e9 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -215,99 +216,75 @@ absl::Status RunRaggedAllToAll(
   return future.Await();
 }
 
-// Contains the values that are passed between host threads with rendezvous.
-struct RendezvousValue {
-  RankId rank;
-  se::DeviceAddressBase output_buffer;
-  se::Event* start_event;
-  se::Event* end_event;
+}  // namespace
 
-  bool operator<(const RendezvousValue& other) const {
-    return rank < other.rank;
-  }
-};
-
-// Executes the rendezvous before the kernel start.
-// Inserts CUDA events into the stream to ensure that all devices have reached
-// the start event before the kernel starts.
-absl::StatusOr<std::shared_ptr<std::vector<RendezvousValue>>>
-RendezvousBeforeKernelStart(absl::string_view name,
-                            const GpuCliqueKey& clique_key, RankId rank,
-                            int64_t num_ranks,
-                            const se::DeviceAddressBase& output_buffer,
-                            se::Stream& stream, se::Event* start_event,
-                            se::Event* end_event) {
-  RendezvousValue rendezvous_value;
-  rendezvous_value.rank = rank;
-  rendezvous_value.output_buffer = output_buffer;
-  rendezvous_value.start_event = start_event;
-  rendezvous_value.end_event = end_event;
+absl::StatusOr<
+    std::shared_ptr<std::vector<const RaggedAllToAllStartThunk::StreamState*>>>
+RaggedAllToAllStartThunk::RendezvousBeforeKernelStart(
+    const GpuCliqueKey& clique_key, se::Stream& stream,
+    const StreamState& state) {
+  int64_t num_ranks = clique_key.num_local_participants();
 
   // Record that this device has started the memcpy ragged-all-to-all. We do
   // this before the rendezvous to make sure that RecordEvent is called before
   // WaitFor on another stream.
-  TF_RETURN_IF_ERROR(stream.RecordEvent(start_event));
+  RETURN_IF_ERROR(stream.RecordEvent(state.start_event.get()));
 
-  auto rendezvous_fn = [](absl::Span<const RendezvousValue* const> values) {
-    std::vector<RendezvousValue> values_copy;
+  auto rendezvous_fn = [](absl::Span<const StreamState* const> values) {
+    std::vector<const StreamState*> values_copy;
     for (const auto& value : values) {
-      values_copy.push_back(*value);
+      values_copy.push_back(value);
     }
     // Sort to make sure that values are in the same order as the devices are
     // ordered in the communicator.
-    absl::c_sort(values_copy);
+    absl::c_sort(values_copy, [](const StreamState* a, const StreamState* b) {
+      return a->rank < b->rank;
+    });
     return values_copy;
   };
 
   std::string start_rendezvous_key =
-      absl::StrFormat("start %s ragged-all-to-all for rank %d, clique %s", name,
-                      rank.value(), clique_key.ToString());
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
-      Rendezvous<std::vector<RendezvousValue>>(
-          /*name=*/
-          start_rendezvous_key, /*key=*/clique_key,
-          /*value=*/rendezvous_value, /*num_threads=*/num_ranks,
-          rendezvous_fn));
+      absl::StrFormat("start one-shot ragged-all-to-all for rank %d, clique %s",
+                      state.rank.value(), clique_key.ToString());
+  ASSIGN_OR_RETURN(
+      std::shared_ptr<std::vector<const StreamState*>> rendezvous_values,
+      Rendezvous<std::vector<const StreamState*>>(
+          start_rendezvous_key, clique_key, state, num_ranks, rendezvous_fn));
 
   // Wait for all devices to reach the start event. This indicates that all
   // output buffers are ready for transfer.
-  for (auto& value : *rendezvous_values) {
-    TF_RETURN_IF_ERROR(stream.WaitFor(value.start_event));
+  for (const StreamState* remote_stream_state : *rendezvous_values) {
+    RETURN_IF_ERROR(stream.WaitFor(remote_stream_state->start_event.get()));
   }
 
   return rendezvous_values;
 }
 
-// Executes the rendezvous after the kernel finish. Waits for all devices to
-// reach the end event.
-absl::Status RendezvousAfterKernelFinish(
-    absl::string_view name, const GpuCliqueKey& clique_key, RankId rank,
-    int64_t num_ranks, se::Stream& stream, se::Event* end_event,
-    const std::shared_ptr<std::vector<RendezvousValue>>& rendezvous_values) {
+absl::Status RaggedAllToAllStartThunk::RendezvousAfterKernelFinish(
+    const GpuCliqueKey& clique_key, se::Stream& stream,
+    const StreamState& state,
+    absl::Span<const StreamState* const> remote_stream_states) {
+  int64_t num_ranks = clique_key.num_local_participants();
+
   // Record that this device has finished the memcpy ragged-all-to-all.
-  TF_RETURN_IF_ERROR(stream.RecordEvent(end_event));
+  RETURN_IF_ERROR(stream.RecordEvent(state.end_event.get()));
 
   // Do another rendezvous to make sure that we call RecordEvent for end_event
   // before WaitFor on another stream.
-  std::string finish_rendezvous_key =
-      absl::StrFormat("finish %s ragged-all-to-all for rank %d, clique %s",
-                      name, rank.value(), clique_key.ToString());
-  TF_RETURN_IF_ERROR(Rendezvous(/*name=*/finish_rendezvous_key,
-                                /*key=*/clique_key,
-                                /*num_threads=*/num_ranks));
+  std::string finish_rendezvous_key = absl::StrFormat(
+      "finish one-shot ragged-all-to-all for rank %d, clique %s",
+      state.rank.value(), clique_key.ToString());
+  RETURN_IF_ERROR(Rendezvous(finish_rendezvous_key, clique_key, num_ranks));
 
   // Wait for all devices to reach the end event. This indicates that all
   // updates from other devices have arrived.
-  for (auto& value : *rendezvous_values) {
-    TF_RETURN_IF_ERROR(stream.WaitFor(value.end_event));
+  for (const StreamState* remote_stream_state : remote_stream_states) {
+    RETURN_IF_ERROR(stream.WaitFor(remote_stream_state->end_event.get()));
   }
 
   return absl::OkStatus();
 }
 
-}  // namespace
-
 absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
     const GpuCliqueKey& clique_key, se::Stream& stream,
     const StreamState& state, absl::Span<DeviceBufferPair const> buffers) {
@@ -322,19 +299,16 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
   PrimitiveType element_type = buffers[0].element_type;
 
   se::DeviceAddressBase input_buffer = buffers[0].source_buffer;
-  se::DeviceAddressBase output_buffer = buffers[1].destination_buffer;
 
   TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
-      RendezvousBeforeKernelStart(
-          /*name=*/"one-shot", clique_key, rank, num_ranks, output_buffer,
-          stream, state.start_event.get(), state.end_event.get()));
+      std::shared_ptr<std::vector<const StreamState*>> remote_stream_states,
+      RendezvousBeforeKernelStart(clique_key, stream, state));
 
   const int64_t num_updates_per_replica = config_.num_total_updates / num_ranks;
 
   absl::InlinedVector<se::DeviceAddressBase, 4> output_ptrs;
-  for (auto& value : *rendezvous_values) {
-    output_ptrs.push_back(value.output_buffer);
+  for (const StreamState* remote_stream_state : *remote_stream_states) {
+    output_ptrs.push_back(remote_stream_state->local_output_buffer);
   }
 
   TF_RETURN_IF_ERROR(RunRaggedAllToAllKernel(
@@ -343,9 +317,8 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
       buffers[4].source_buffer, num_ranks, num_updates_per_replica,
       config_.num_input_rows, config_.num_row_elements));
 
-  return RendezvousAfterKernelFinish(
-      /*name=*/"one-shot", clique_key, rank, num_ranks, stream,
-      state.end_event.get(), rendezvous_values);
+  return RendezvousAfterKernelFinish(clique_key, stream, state,
+                                     *remote_stream_states);
 }
 
 RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
@@ -440,6 +413,9 @@ absl::Status RaggedAllToAllStartThunk::Initialize(
     return absl::InternalError("Failed to allocate output offsets buffer.");
   }
 
+  state->local_output_buffer = params.buffer_allocations->GetDeviceAddress(
+      buffers_[1].destination_buffer);
+
   if (is_local()) {
     TF_ASSIGN_OR_RETURN(state->start_event, executor->CreateEvent());
     TF_ASSIGN_OR_RETURN(state->end_event, executor->CreateEvent());
@@ -493,7 +469,7 @@ absl::StatusOr<bool> RaggedAllToAllStartThunk::RunCollective(
                                       device_buffers[0].element_type);
 
   if (should_use_one_shot_kernel) {
-    TF_RETURN_IF_ERROR(
+    RETURN_IF_ERROR(
         RunOneShotRaggedAllToAll(clique_key, stream, *state, device_buffers));
     return false;
   }
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
index 6a48a5fac956b0..b3354b814fda60 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_handle.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/memory_allocation.h"
@@ -91,6 +92,9 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
     // Device memory buffer for output offsets.
     se::DeviceAddressHandle output_offsets_device_buffer;
 
+    // Device memory buffer for local output.
+    se::DeviceAddressBase local_output_buffer;
+
     // Event to synchronize streams on different devices at the start of the
     // kernel.
     std::unique_ptr<se::Event> start_event;
@@ -103,6 +107,20 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
         : device_ordinal(device_ordinal), rank(rank) {}
   };
 
+  // Executes the rendezvous before the kernel start.
+  // Inserts CUDA events into the stream to ensure that all devices have reached
+  // the start event before the kernel starts.
+  absl::StatusOr<std::shared_ptr<std::vector<const StreamState*>>>
+  RendezvousBeforeKernelStart(const GpuCliqueKey& clique_key,
+                              se::Stream& stream, const StreamState& state);
+
+  // Executes the rendezvous after the kernel finish. Waits for all devices to
+  // reach the end event.
+  absl::Status RendezvousAfterKernelFinish(
+      const GpuCliqueKey& clique_key, se::Stream& stream,
+      const StreamState& state,
+      absl::Span<const StreamState* const> remote_stream_states);
+
   absl::Status RunOneShotRaggedAllToAll(
       const GpuCliqueKey& clique_key, se::Stream& stream,
       const StreamState& state, absl::Span<DeviceBufferPair const> buffers);

From 1a5402d93fa5e31c19ba7113468aa970c2a2036a Mon Sep 17 00:00:00 2001
From: Mikhail Goncharov <goncharov@google.com>
Date: Thu, 18 Dec 2025 10:00:19 -0800
Subject: [PATCH 538/753] [XLA:GPU] don't stop traversal when sinking bitcasts

For example if we have a fusion

```
dot
bitcast1
...
bad_op
...
bitcast2
...
ROOT root = ...
```

we can still benefit from sinking bitcast2 even though instructions between dot and bad_op will not change.

PiperOrigin-RevId: 846314341
---
 .../xla/xla/service/gpu/transforms/BUILD      |  6 ---
 .../gpu/transforms/hoist_fused_bitcasts.cc    | 48 ++++++++++++-------
 .../transforms/hoist_fused_bitcasts_test.cc   | 38 ++++++++-------
 3 files changed, 52 insertions(+), 40 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 0b702ad9af2acb..d48d3ca7ad9156 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1888,14 +1888,11 @@ cc_library(
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/utils:hlo_query",
         "//xla/service:call_graph",
-        "//xla/service:matmul_indexing_utils",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu/model:block_level_parameters",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
@@ -1920,14 +1917,11 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:filecheck",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
-        "//xla/hlo/testlib:pattern_matcher_gmock",
         "//xla/hlo/testlib:verified_hlo_module",
-        "//xla/service:pattern_matcher",
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:gpu_device_info_for_tests",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status:status_matchers",
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
index fbc791ee7ad58c..8ca5c54ce1db0d 100644
--- a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
@@ -638,7 +638,9 @@ PlanHoistBitcastUpwardsToCallers(const HloInstruction* bitcast) {
   return result;
 }
 
-// Returns the shape of the root instruction after hoisting all bitcasts.
+// Returns the shape of the root instruction after hoisting bitcasts away from
+// the dot instruction. If traversal encounters an instruction we cannot hoist
+// bitcasts past we try to sink the bitcast starting from that instruction.
 //
 // For example, given:
 //
@@ -700,28 +702,35 @@ absl::StatusOr<Shape> ComputeRootShapeAfterHoistingBitcasts(
     TF_ASSIGN_OR_RETURN(Shape result_shape, [&]() -> absl::StatusOr<Shape> {
       switch (instruction->opcode()) {
         case HloOpcode::kBroadcast: {
-          TF_ASSIGN_OR_RETURN(
-              BitcastParams params,
-              CalculateBroadcastOfBitcast(
-                  Cast<HloBroadcastInstruction>(instruction), operand_shape));
-          return params.new_shape;
+          auto paramsOr = CalculateBroadcastOfBitcast(
+              Cast<HloBroadcastInstruction>(instruction), operand_shape);
+          if (paramsOr.ok()) {
+            return paramsOr->new_shape;
+          }
+          VLOG(2) << "Failed to calculate broadcast of bitcast: "
+                  << paramsOr.status();
+          return instruction->shape();
         }
         case HloOpcode::kTranspose: {
-          TF_ASSIGN_OR_RETURN(
-              BitcastParams params,
-              CalculateTransposeOfBitcast(
-                  Cast<HloTransposeInstruction>(instruction), operand_shape));
-          return params.new_shape;
-        }
-        default:
-          if (!instruction->IsElementwise()) {
-            return absl::FailedPreconditionError(absl::StrCat(
-                "Cannot hoist bitcast past ", instruction->ToString()));
+          auto paramsOr = CalculateTransposeOfBitcast(
+              Cast<HloTransposeInstruction>(instruction), operand_shape);
+          if (paramsOr.ok()) {
+            return paramsOr->new_shape;
           }
-          [[fallthrough]];
-        case HloOpcode::kReshape:  // Reshape is a bitcast.
+          VLOG(2) << "Failed to calculate transpose of bitcast: "
+                  << paramsOr.status();
+          return instruction->shape();
+        }
+        case HloOpcode::kReshape:
         case HloOpcode::kBitcast:
           return operand_shape;
+        default:
+          if (instruction->IsElementwise()) {
+            return operand_shape;
+          }
+          // TODO(b/467421789): we can probably allow sinking from this op down.
+          return absl::FailedPreconditionError(absl::StrCat(
+              "Cannot hoist bitcast past ", instruction->ToString()));
       }
     }());
     if (instruction->IsRoot()) {
@@ -822,6 +831,9 @@ absl::Status MaybeInsertRootBitcast(HloInstruction* dot,
 // transposes and broadcasts. This is not reported as an error.
 absl::Status TryHoistBitcastsInComputationToCallers(HloInstruction* dot,
                                                     CallGraph* call_graph) {
+  // Instead of implementing a logic to hoist bitcast upwards and downwards
+  // we insert a bitcast at the root that and always hoist bitcasts upwards.
+  // That significantly simplifies the implementation.
   VLOG(2) << "Before hoisting bitcasts: " << dot->parent()->ToString();
 
   auto callers = call_graph->GetComputationCallers(dot->parent());
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
index e14c6bba099ca5..9077028ea76963 100644
--- a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
@@ -231,27 +231,25 @@ CHECK: f16[3,11]{1,0} fusion(
       IsOkAndHolds(true));
 }
 
-// We cannot hoist bitcasts past transposes, but we don't need to hoist
-// because the bitcast is not rank-expanding and symbolic tile analysis
-// works fine.
-TEST_P(HoistFusedBitcastsReshapeTest, BitcastsCannotBeHoistedPastTransposes) {
+TEST_P(HoistFusedBitcastsReshapeTest,
+       ResumeBitcastSinkingAfterIncompatibleOps) {
+  // Even though we cannot hoist the bitcast1 past the transpose we still can
+  // remove bitcast2.
   HloOpcode opcode = GetParam();
   absl::string_view hlo = R"(
 dot {
-  p0 = f32[72,36,2] parameter(0)
-  transpose0 = f32[72,2,36] transpose(p0), dimensions={0,2,1}
-  bitcast0 = f32[144,36] $0(transpose0)
-  p1 = f32[36,3] parameter(1)
-  dot = f32[144,3] dot(bitcast0, p1),
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}
-  bitcast1 = f32[144,3] $0(dot)
-  ROOT transpose1 = f32[3,144] transpose(bitcast1), dimensions={1,0}
+  p0 = f32[2,32] parameter(0)
+  p1 = f32[64,32] parameter(1)
+  dot = f32[2,64] dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={1}
+  bitcast1 = f32[2,32,2] $0(dot)
+  transpose1 = f32[2,2,32] transpose(bitcast1), dimensions={0,2,1}
+  ROOT bitcast2 = f32[1,2,1,2,32] $0(transpose1)
 }
 
 ENTRY entry {
-  p0 = f32[72,36,2] parameter(0)
-  p1 = f32[36,3] parameter(1)
-  ROOT fusion = f32[3,144] fusion(p0, p1),
+  p0 = f32[2,32] parameter(0)
+  p1 = f32[64,32] parameter(1)
+  ROOT fusion = f32[1,2,1,2,32] fusion(p0, p1),
     kind=kCustom, calls=dot, backend_config={
       "fusion_backend_config":{
         "kind":"__triton_gemm","triton_gemm_config":{
@@ -261,7 +259,15 @@ ENTRY entry {
       }
     }
 })";
-  RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  auto module =
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+  EXPECT_THAT(
+      RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
+  CHECK: ROOT {{.*}} = f32[2,2,32]{2,1,0} transpose
+  CHECK: ENTRY
+  CHECK: ROOT {{.*}} = f32[1,2,1,2,32]{4,3,2,1,0} bitcast
+)"),
+      IsOkAndHolds(true));
 }
 
 TEST_P(HoistFusedBitcastsReshapeTest, BitcastsKeepElementSizeInBits) {

From e9aa89e2efb2f3c3017246cdf31248b677f67887 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 18 Dec 2025 10:29:56 -0800
Subject: [PATCH 539/753] Add basic replicated execution support to
 ClientLibraryTestRunnerMixin<T>.

PiperOrigin-RevId: 846326488
---
 third_party/xla/xla/tests/BUILD               |  2 ++
 .../tests/client_library_test_runner_mixin.h  | 23 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index e3f85c05b090a6..5a26f9cb53eaab 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -369,12 +369,14 @@ cc_library(
         "//xla/hlo/builder:xla_builder",
         "//xla/hlo/builder:xla_computation",
         "//xla/hlo/ir:hlo",
+        "//xla/service:computation_placer_hdr",
         "//xla/service:hlo_module_config",
         "//xla/service:hlo_module_util",
         "//xla/tsl/lib/core:bitmap",
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/strings:string_view",
diff --git a/third_party/xla/xla/tests/client_library_test_runner_mixin.h b/third_party/xla/xla/tests/client_library_test_runner_mixin.h
index 159ef5318b421f..a5ffb096084524 100644
--- a/third_party/xla/xla/tests/client_library_test_runner_mixin.h
+++ b/third_party/xla/xla/tests/client_library_test_runner_mixin.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef XLA_TESTS_CLIENT_LIBRARY_TEST_RUNNER_MIXIN_H_
 #define XLA_TESTS_CLIENT_LIBRARY_TEST_RUNNER_MIXIN_H_
 
+#include <algorithm>
 #include <cstdint>
 #include <memory>
 #include <optional>
@@ -32,9 +33,12 @@ limitations under the License.
 #include "xla/execution_options_util.h"
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/hlo/builder/xla_computation.h"
+#include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/literal.h"
 #include "xla/literal_util.h"
+#include "xla/service/computation_placer.h"
 #include "xla/service/hlo_module_config.h"
 #include "xla/service/hlo_module_util.h"
 #include "xla/shape.h"
@@ -51,6 +55,7 @@ limitations under the License.
 #include "xla/types.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 
@@ -113,6 +118,24 @@ class ClientLibraryTestRunnerMixin : public T {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                         BuildAndVerifyHloModule(computation, argument_shapes,
                                                 &execution_options));
+    const int64_t num_partitions =
+        std::max(1, execution_options.num_partitions());
+    if (const int64_t num_devices =
+            execution_options.num_replicas() * num_partitions;
+        num_devices > 1) {
+      std::optional<DeviceAssignment> device_assignment;
+      DeviceAssignment* device_assignment_ptr = nullptr;
+      if (execution_options.has_device_assignment()) {
+        device_assignment = module->config().static_device_assignment();
+        device_assignment_ptr = &*device_assignment;
+      }
+      ASSIGN_OR_RETURN(std::vector<Literal> results,
+                       this->ExecuteReplicated(
+                           std::move(module), arguments, num_devices,
+                           device_assignment_ptr, /*run_hlo_passes=*/true,
+                           /*use_threads=*/true));
+      return std::move(results.front());
+    }
     return this->Execute(std::move(module), arguments);
   }
 

From c5342da85738e5168dd804e655429a47cb9787a8 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 18 Dec 2025 10:55:21 -0800
Subject: [PATCH 540/753] Expose execution_options in
 client_library_test_runner_mixin.

PiperOrigin-RevId: 846336689
---
 third_party/xla/xla/tests/client_library_test_runner_mixin.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/xla/xla/tests/client_library_test_runner_mixin.h b/third_party/xla/xla/tests/client_library_test_runner_mixin.h
index a5ffb096084524..15bd5d2ec1a7ef 100644
--- a/third_party/xla/xla/tests/client_library_test_runner_mixin.h
+++ b/third_party/xla/xla/tests/client_library_test_runner_mixin.h
@@ -407,6 +407,10 @@ class ClientLibraryTestRunnerMixin : public T {
   DebugOptions* mutable_debug_options() {
     return execution_options_.mutable_debug_options();
   }
+  const ExecutionOptions& execution_options() const {
+    return execution_options_;
+  }
+  ExecutionOptions* mutable_execution_options() { return &execution_options_; }
 
  private:
   absl::StatusOr<std::unique_ptr<HloModule>> BuildAndVerifyHloModule(

From 73b620f0c2b33ddf54cba2259a0e9228af3e007a Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Thu, 18 Dec 2025 11:03:55 -0800
Subject: [PATCH 541/753] Remove unneeded
 `CoordinationServiceAgent::ShutdownInternal` method.

PiperOrigin-RevId: 846340273
---
 .../distributed/coordination/coordination_service_agent.cc    | 4 +---
 .../distributed/coordination/coordination_service_agent.h     | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index bc6788cb7ecabc..06a39be55b820d 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -402,9 +402,7 @@ CoordinationServiceAgent::WatchJobState(absl::string_view job_name,
   return response;
 }
 
-absl::Status CoordinationServiceAgent::Shutdown() { return ShutdownInternal(); }
-
-absl::Status CoordinationServiceAgent::ShutdownInternal() {
+absl::Status CoordinationServiceAgent::Shutdown() {
   absl::Status status = absl::OkStatus();
   bool is_connected = false;
   {
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
index cbec2894d15793..c10aaa356eeea6 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
@@ -109,7 +109,7 @@ class CoordinationServiceAgent {
   CoordinationServiceAgent() = default;
 
   virtual ~CoordinationServiceAgent() {
-    absl::Status s = ShutdownInternal();
+    absl::Status s = Shutdown();
     VLOG(3) << "Coordination agent dtor failed with status: " << s;
   }
 
@@ -378,7 +378,6 @@ class CoordinationServiceAgent {
  private:
   friend class CoordinationServiceRpcHandler;
 
-  absl::Status ShutdownInternal();
   // Starts sending heartbeats to the coordination service.
   void StartSendingHeartbeats();
   // Use long polling to get error from the coordination service.

From 649b760aad9fb769a9d728b0e99341ba5aa0a834 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Thu, 18 Dec 2025 11:08:50 -0800
Subject: [PATCH 542/753] Add proto serialization for AllToAllStartThunk

PiperOrigin-RevId: 846342263
---
 .../xla/xla/backends/gpu/runtime/BUILD        | 18 +++++
 .../backends/gpu/runtime/all_to_all_thunk.cc  | 76 +++++++++++++++++--
 .../backends/gpu/runtime/all_to_all_thunk.h   | 17 ++++-
 .../gpu/runtime/all_to_all_thunk_test.cc      | 75 ++++++++++++++++++
 .../xla/xla/backends/gpu/runtime/thunk.proto  | 10 +++
 .../runtime/thunk_proto_deserialization.cc    |  5 ++
 6 files changed, 195 insertions(+), 6 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk_test.cc

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index d6c17f5e6bd4bf..78274594b54b8c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1455,6 +1455,7 @@ cc_library(
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
         "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
         "//xla/service:rendezvous",
         "//xla/service/gpu/transforms/collectives:collective_ops_utils",
         "//xla/stream_executor:device_address",
@@ -1463,6 +1464,7 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:logging",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -1477,6 +1479,21 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "all_to_all_thunk_test",
+    srcs = ["all_to_all_thunk_test.cc"],
+    deps = [
+        ":all_to_all_thunk",
+        ":collective_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla/service:buffer_assignment",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "ragged_all_to_all_thunk",
     srcs = ["ragged_all_to_all_thunk.cc"],
@@ -2886,6 +2903,7 @@ cc_library(
     deps = [
         ":all_gather_thunk",
         ":all_reduce_thunk",
+        ":all_to_all_thunk",
         ":collective_thunk",
         ":conditional_thunk",
         ":convolution_reorder_thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc
index c88e31a35b64a7..ddc596e4ec9eb5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/transforms/collectives/collective_ops_utils.h"
 #include "xla/service/rendezvous.h"
 #include "xla/shape.h"
@@ -56,6 +57,7 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -77,17 +79,27 @@ struct BufferRendezvousValue {
 }  // namespace
 
 AllToAllStartThunk::AllToAllStartThunk(
-    ThunkInfo thunk_info, const HloAllToAllInstruction* instr,
-    std::vector<CollectiveThunk::Buffer> buffers, bool p2p_memcpy_enabled)
-    : CollectiveThunk(Thunk::kAllToAllStart, thunk_info,
-                      IsGPUSyncCollective(*instr),
+    ThunkInfo thunk_info, std::shared_ptr<AsyncEvents> async_events,
+    const AllToAllConfig& config, std::vector<CollectiveThunk::Buffer> buffers,
+    bool p2p_memcpy_enabled)
+    : CollectiveThunk(Thunk::kAllToAllStart, thunk_info, async_events,
                       AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE),
-      config_(GetAllToAllConfig(instr)),
+      config_(config),
       buffers_(std::move(buffers)),
       p2p_memcpy_enabled_(p2p_memcpy_enabled) {
   CHECK_EQ(config_.config.operand_element_type.size(), buffers_.size());
 }
 
+AllToAllStartThunk::AllToAllStartThunk(
+    ThunkInfo thunk_info, const HloAllToAllInstruction* instr,
+    std::vector<CollectiveThunk::Buffer> buffers, bool p2p_memcpy_enabled)
+    : AllToAllStartThunk(std::move(thunk_info),
+                         IsGPUSyncCollective(*instr)
+                             ? nullptr
+                             : std::make_shared<CollectiveThunk::AsyncEvents>(),
+                         GetAllToAllConfig(instr), std::move(buffers),
+                         p2p_memcpy_enabled) {}
+
 /*static*/ absl::Status AllToAllStartThunk::CheckImplementable(
     const HloAllToAllInstruction* instr, int64_t replica_count,
     int64_t partition_count) {
@@ -271,6 +283,60 @@ bool AllToAllStartThunk::is_local() const {
   return true;
 }
 
+absl::StatusOr<std::unique_ptr<AllToAllStartThunk>>
+AllToAllStartThunk::FromProto(
+    ThunkInfo thunk_info, const AllToAllStartThunkProto& thunk_proto,
+    absl::Span<const BufferAllocation> buffer_allocations,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::vector<CollectiveThunk::Buffer> buffers;
+  buffers.reserve(thunk_proto.buffers_size());
+  for (const CollectiveBufferProto& proto : thunk_proto.buffers()) {
+    ASSIGN_OR_RETURN(
+        CollectiveThunk::Buffer buffer,
+        CollectiveThunk::Buffer::FromProto(proto, buffer_allocations));
+    buffers.push_back(buffer);
+  }
+
+  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
+      async_events_map[AsyncEventsUniqueId{
+          thunk_proto.async_events_unique_id()}];
+  if (!async_events) {
+    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  }
+
+  CollectiveConfig config =
+      CollectiveConfig::FromProto(thunk_proto.collective_config());
+
+  return std::make_unique<AllToAllStartThunk>(
+      std::move(thunk_info), async_events,
+      AllToAllConfig{config, thunk_proto.has_split_dimension()}, buffers,
+      thunk_proto.p2p_memcpy_enabled());
+}
+
+absl::StatusOr<ThunkProto> AllToAllStartThunk::ToProto() const {
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  AllToAllStartThunkProto* thunk_proto = proto.mutable_all_to_all_start_thunk();
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (!async_events_id.has_value()) {
+    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  }
+  thunk_proto->set_async_events_unique_id(async_events_id->value());
+
+  for (const Buffer& buffer : buffers_) {
+    ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
+  }
+
+  *thunk_proto->mutable_collective_config() = config_.config.ToProto();
+
+  thunk_proto->set_has_split_dimension(has_split_dimension());
+  thunk_proto->set_p2p_memcpy_enabled(p2p_memcpy_enabled_);
+
+  return proto;
+}
+
 absl::Status RunAllToAll(bool has_split_dimension,
                          std::vector<DeviceBufferPair>& buffers,
                          se::Stream& stream, Communicator& comm,
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.h b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.h
index 4ced40fd1bff65..b35f1340688605 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
@@ -31,6 +32,7 @@ limitations under the License.
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/memory_allocation.h"
 #include "xla/stream_executor/stream.h"
@@ -49,6 +51,12 @@ class AllToAllStartThunk : public CollectiveThunk {
   AllToAllStartThunk(ThunkInfo thunk_info, const HloAllToAllInstruction* instr,
                      std::vector<Buffer> buffers, bool p2p_memcpy_enabled);
 
+  AllToAllStartThunk(ThunkInfo thunk_info,
+                     std::shared_ptr<AsyncEvents> async_events,
+                     const AllToAllConfig& config,
+                     std::vector<CollectiveThunk::Buffer> buffers,
+                     bool p2p_memcpy_enabled);
+
   // Returns whether the given instruction can be lowered to an all-to-all
   // call.
   static absl::Status CheckImplementable(const HloAllToAllInstruction* instr,
@@ -57,11 +65,18 @@ class AllToAllStartThunk : public CollectiveThunk {
 
   absl::Status Initialize(const InitializeParams& params) override;
 
-  static const char* GetHloOpName() { return "all-to-all-start"; }
+  static absl::string_view GetHloOpName() { return "all-to-all-start"; }
 
   static CollectiveOpGroupMode GetGroupMode(
       const HloAllToAllInstruction* instr);
 
+  static absl::StatusOr<std::unique_ptr<AllToAllStartThunk>> FromProto(
+      ThunkInfo thunk_info, const AllToAllStartThunkProto& thunk_proto,
+      absl::Span<const BufferAllocation> buffer_allocations,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
   const CollectiveConfig& config() const override { return config_.config; }
   bool has_split_dimension() const { return config_.has_split_dimension; }
   absl::Span<const Buffer> buffers() const { return buffers_; }
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk_test.cc
new file mode 100644
index 00000000000000..521151a2ee9df3
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/all_to_all_thunk.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "xla/backends/gpu/runtime/collective_thunk.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        all_to_all_start_thunk {
+          async_events_unique_id: 3
+          collective_config {}
+          has_split_dimension: false
+          p2p_memcpy_enabled: true
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<AllToAllStartThunk> thunk,
+      AllToAllStartThunk::FromProto(thunk_info, proto.all_to_all_start_thunk(),
+                                    buffer_allocations, async_events_map));
+  ASSERT_NE(thunk->async_events(), nullptr);
+
+  ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_all_to_all_start_thunk()->set_async_events_unique_id(
+      round_trip_proto.all_to_all_start_thunk().async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index a4d08af3e7d4b3..ae3ae5022909ae 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -426,6 +426,15 @@ message AllReduceStartThunkProto {
   bool is_async = 9;
 }
 
+message AllToAllStartThunkProto {
+  uint64 async_events_unique_id = 1;
+  CollectiveConfigProto collective_config = 2;
+  repeated CollectiveBufferProto buffers = 3;
+
+  bool has_split_dimension = 4;
+  bool p2p_memcpy_enabled = 5;
+}
+
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
@@ -473,6 +482,7 @@ message ThunkProto {
     CollectiveDoneThunkProto collective_done_thunk = 37;
     AllGatherStartThunkProto all_gather_start_thunk = 38;
     AllReduceStartThunkProto all_reduce_start_thunk = 39;
+    AllToAllStartThunkProto all_to_all_start_thunk = 40;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index a4c8bbdd7e9e51..3700f70f033d1c 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "google/protobuf/message.h"
 #include "xla/backends/gpu/runtime/all_gather_thunk.h"
 #include "xla/backends/gpu/runtime/all_reduce_thunk.h"
+#include "xla/backends/gpu/runtime/all_to_all_thunk.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_reorder_thunk.h"
@@ -252,6 +253,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return AllReduceStartThunk::FromProto(
           std::move(thunk_info), thunk_proto.all_reduce_start_thunk(),
           buffer_allocations, collective_async_events_map);
+    case ThunkProto::kAllToAllStartThunk:
+      return AllToAllStartThunk::FromProto(
+          std::move(thunk_info), thunk_proto.all_to_all_start_thunk(),
+          buffer_allocations, collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);

From 6b581e1304f8567f3dfa2510f145645901597e2e Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Thu, 18 Dec 2025 11:40:19 -0800
Subject: [PATCH 543/753] [XLA:CPU] Enable i8xi8->i32 grouped convolutions with
 ynnpack.

PiperOrigin-RevId: 846354265
---
 third_party/xla/xla/backends/cpu/ynn_support.cc | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index 1c7f427934a622..7054d92f615ced 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -313,23 +313,12 @@ bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
   // Stores tuple of allowed (input, output) dtypes.
   static const absl::NoDestructor<absl::flat_hash_set<
       std::tuple<PrimitiveType, PrimitiveType, PrimitiveType>>>
-      kAllowedTypesNonGrouped(
-          {{F32, F32, F32}, {BF16, BF16, F32}, {S8, S8, S32}});
-
-  static const absl::NoDestructor<absl::flat_hash_set<
-      std::tuple<PrimitiveType, PrimitiveType, PrimitiveType>>>
-      kAllowedTypesGrouped({{F32, F32, F32}, {BF16, BF16, F32}});
+      kAllowedTypes({{F32, F32, F32}, {BF16, BF16, F32}, {S8, S8, S32}});
 
   PrimitiveType lhs_dtype = conv->operand(0)->shape().element_type();
   PrimitiveType rhs_dtype = conv->operand(1)->shape().element_type();
   PrimitiveType out_dtype = conv->shape().element_type();
-  if (conv->feature_group_count() == 1 &&
-      !kAllowedTypesNonGrouped->contains({lhs_dtype, rhs_dtype, out_dtype})) {
-    return false;
-  }
-
-  if (conv->feature_group_count() > 1 &&
-      !kAllowedTypesGrouped->contains({lhs_dtype, rhs_dtype, out_dtype})) {
+  if (!kAllowedTypes->contains({lhs_dtype, rhs_dtype, out_dtype})) {
     return false;
   }
 

From b4a23c24642c8736e3971271438f16b9663b2974 Mon Sep 17 00:00:00 2001
From: Emily Fertig <emilyaf@google.com>
Date: Thu, 18 Dec 2025 11:43:06 -0800
Subject: [PATCH 544/753] [PJRT C API] Implement
 `xla::PjRtCrossHostSendCancelNotifier`.

This is part of the `MakeCrossHostReceiveBuffers`/`CopyToRemoteDevice` PJRT cross-host transfer API.

PiperOrigin-RevId: 846355464
---
 ...rt_c_api_cross_host_transfers_extension.cc | 178 +++++++++++++-----
 ...jrt_c_api_cross_host_transfers_extension.h |  16 +-
 2 files changed, 150 insertions(+), 44 deletions(-)

diff --git a/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.cc b/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.cc
index 5b70b483d15dcd..50af005112244e 100644
--- a/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.cc
+++ b/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.cc
@@ -36,6 +36,97 @@ limitations under the License.
 #include "xla/shape.h"
 
 namespace pjrt {
+namespace {
+
+// Nested callback functions for the C API version of
+// xla::PjRtClient::MakeCrossHostReceiveBuffers.
+using CrossHostRecvNotifierFunction = std::function<void(
+    PJRT_Error* error, const char** serialized_descriptors,
+    size_t* descriptors_sizes, size_t num_descriptors,
+    PJRT_Transfers_CrossHostSendCancelNotifier cancel_notifier,
+    void* cancel_notifier_user_arg)>;
+using CrossHostSendCancelNotifierFunction = std::function<void(
+    const char* serialized_descriptor, size_t serialized_descriptor_size,
+    PJRT_Error_Code error_code, const char* error_message,
+    size_t error_message_size,
+    PJRT_Transfers_CrossHostOnCanceledCallback on_canceled,
+    void* on_canceled_user_arg)>;
+using CrossHostOnCanceledCallbackFunction =
+    std::function<void(PJRT_Error* error)>;
+
+// Callback function for the C API version of
+// xla::PjRtBuffer::CopyToRemoteDevice.
+using RemoteSendCallbackFunction =
+    std::function<void(PJRT_Error* error, bool sends_were_enqueued)>;
+
+xla::PjRtCrossHostRecvNotifier CCrossHostRecvNotifierToCpp(
+    const PJRT_Transfers_CrossHostRecvNotifierInfo& c_notifier) {
+  return [user_arg = c_notifier.user_arg, notifier = c_notifier.notifier](
+             absl::StatusOr<xla::PjRtCrossHostRecvState> recv_state) {
+    // Define the function to pass as `cancel_notifier_user_arg` to
+    // `notifier`.
+    auto cancel_notifier_function = new CrossHostSendCancelNotifierFunction(
+        [cpp_cancel_notifier = std::move(recv_state->cancel_notifier)](
+            const char* serialized_descriptor,
+            size_t serialized_descriptor_size, PJRT_Error_Code error_code,
+            const char* error_message, size_t error_message_size,
+            PJRT_Transfers_CrossHostOnCanceledCallback on_canceled,
+            void* on_canceled_user_arg) {
+          std::string serialized_descriptor_str(serialized_descriptor,
+                                                serialized_descriptor_size);
+          std::string error_message_str(error_message, error_message_size);
+          absl::Status state(pjrt::PjrtErrorCodeToStatusCode(error_code),
+                             error_message_str);
+          auto cpp_on_canceled = [user_arg = on_canceled_user_arg,
+                                  on_canceled =
+                                      on_canceled](absl::Status status) {
+            auto error = new PJRT_Error{status};
+            on_canceled(error, user_arg);
+            delete error;
+          };
+          return cpp_cancel_notifier(std::move(serialized_descriptor_str),
+                                     std::move(state),
+                                     std::move(cpp_on_canceled));
+        });
+    PJRT_Transfers_CrossHostSendCancelNotifier cancel_notifier =
+        [](const char* serialized_descriptor, size_t serialized_descriptor_size,
+           PJRT_Error_Code error, const char* error_message,
+           size_t error_message_size,
+           PJRT_Transfers_CrossHostOnCanceledCallback on_canceled,
+           void* on_canceled_user_arg, void* user_arg) {
+          CrossHostSendCancelNotifierFunction* cancel_notifier_fn =
+              reinterpret_cast<CrossHostSendCancelNotifierFunction*>(user_arg);
+          (*cancel_notifier_fn)(serialized_descriptor,
+                                serialized_descriptor_size, error,
+                                error_message, error_message_size, on_canceled,
+                                on_canceled_user_arg);
+        };
+    if (!recv_state.ok()) {
+      auto error = new PJRT_Error{recv_state.status()};
+      notifier(error, nullptr, nullptr, 0, user_arg, cancel_notifier,
+               cancel_notifier_function);
+      delete error;
+      return;
+    }
+    // Convert serialized descriptors to char*.
+    std::vector<xla::PjRtCrossHostRecvDescriptors>& descriptors =
+        recv_state->descriptors;
+    std::vector<size_t> descriptors_sizes;
+    descriptors_sizes.reserve(descriptors.size());
+    std::vector<const char*> serialized_descriptors;
+    serialized_descriptors.reserve(descriptors.size());
+    for (int i = 0; i < descriptors.size(); ++i) {
+      serialized_descriptors.push_back(
+          descriptors[i].serialized_descriptors.front().c_str());
+      descriptors_sizes.push_back(
+          descriptors[i].serialized_descriptors.front().size());
+    }
+    notifier(nullptr, serialized_descriptors.data(), descriptors_sizes.data(),
+             descriptors.size(), user_arg, cancel_notifier,
+             cancel_notifier_function);
+  };
+}
+}  // namespace
 
 PJRT_Error* PJRT_Transfers_PJRT_Client_CrossHostReceiveBuffers(
     PJRT_Transfers_PJRT_Client_CrossHostReceiveBuffers_Args* args) {
@@ -106,41 +197,44 @@ PJRT_Error* PJRT_Transfers_PJRT_Client_CrossHostSendBuffers(
   return nullptr;
 }
 
-namespace {
-static xla::PjRtCrossHostRecvNotifier CCrossHostRecvNotifierToCpp(
-    const PJRT_Transfers_CrossHostRecvNotifierInfo& c_notifier) {
-  return [user_arg = c_notifier.user_arg, notifier = c_notifier.notifier](
-             absl::StatusOr<xla::PjRtCrossHostRecvState> recv_state) {
-    if (!recv_state.ok()) {
-      auto error = new PJRT_Error{recv_state.status()};
-      notifier(error, nullptr, nullptr, 0, user_arg);
-      return;
-    }
-    auto& descriptors = recv_state->descriptors;
-    std::vector<size_t> descriptors_sizes;
-    descriptors_sizes.reserve(descriptors.size());
-    std::vector<const char*> serialized_descriptors;
-    serialized_descriptors.reserve(descriptors.size());
-    for (int i = 0; i < descriptors.size(); ++i) {
-      serialized_descriptors.push_back(
-          descriptors[i].serialized_descriptors.front().c_str());
-      descriptors_sizes.push_back(
-          descriptors[i].serialized_descriptors.front().size());
-    }
-    notifier(nullptr, serialized_descriptors.data(), descriptors_sizes.data(),
-             descriptors.size(), user_arg);
-  };
-}
-}  // namespace
-
 PJRT_Transfers_CrossHostRecvNotifierInfo CppCrossHostRecvNotifierToC(
     const PJRT_Api* c_api, xla::PjRtCrossHostRecvNotifier cpp_notifier) {
-  using CrossHostRecvNotifierFunction =
-      std::function<void(PJRT_Error*, const char**, size_t*, size_t)>;
   auto notifier_function = new CrossHostRecvNotifierFunction(
       [cpp_notifier = std::move(cpp_notifier), c_api](
           PJRT_Error* error, const char** serialized_descriptors,
-          size_t* descriptors_sizes, size_t num_descriptors) {
+          size_t* descriptors_sizes, size_t num_descriptors,
+          PJRT_Transfers_CrossHostSendCancelNotifier cancel_notifier,
+          void* cancel_notifier_user_arg) {
+        xla::PjRtCrossHostSendCancelNotifier cpp_cancel_notifier =
+            [user_arg = cancel_notifier_user_arg, notifier = cancel_notifier,
+             c_api](absl::string_view serialized_descriptor,
+                    absl::Status reason,
+                    std::function<void(absl::Status)> on_canceled) {
+              PJRT_Error_Code error_code =
+                  pjrt::StatusCodeToPjrtErrorCode(reason.code());
+              // Define the function to pass as `on_canceled_user_arg` to
+              // the cancel notifier.
+              auto on_canceled_function =
+                  new CrossHostOnCanceledCallbackFunction(
+                      [cpp_on_canceled = std::move(on_canceled),
+                       c_api](PJRT_Error* error) {
+                        absl::Status status =
+                            ::pjrt::PjrtErrorToStatus(error, c_api);
+                        cpp_on_canceled(status);
+                      });
+              PJRT_Transfers_CrossHostOnCanceledCallback on_canceled_callback =
+                  [](PJRT_Error* error, void* user_arg) {
+                    CrossHostOnCanceledCallbackFunction* on_canceled_fn =
+                        reinterpret_cast<CrossHostOnCanceledCallbackFunction*>(
+                            user_arg);
+                    (*on_canceled_fn)(error);
+                    delete on_canceled_fn;
+                  };
+              notifier(serialized_descriptor.data(),
+                       serialized_descriptor.size(), error_code,
+                       reason.message().data(), reason.message().size(),
+                       on_canceled_callback, on_canceled_function, user_arg);
+            };
         if (error != nullptr) {
           absl::Status state = ::pjrt::PjrtErrorToStatus(error, c_api);
           return cpp_notifier(std::move(state));
@@ -154,34 +248,34 @@ PJRT_Transfers_CrossHostRecvNotifierInfo CppCrossHostRecvNotifierToC(
           state.descriptors.push_back(std::move(descriptors));
         }
 
-        // TODO(emilyaf): Support cancellation.
-        xla::PjRtCrossHostSendCancelNotifier cancel_notifier =
-            [](absl::string_view, absl::Status,
-               std::function<void(absl::Status)>) {
-              LOG(FATAL) << "MakeCrossHostReceiveBuffers: Cancellation is not "
-                            "supported in PJRT C API.";
-            };
-        state.cancel_notifier = cancel_notifier;
+        state.cancel_notifier = cpp_cancel_notifier;
         return cpp_notifier(std::move(state));
       });
   return PJRT_Transfers_CrossHostRecvNotifierInfo{
       /*user_arg=*/notifier_function,
       /*notifier=*/
       [](PJRT_Error* error, const char** serialized_descriptors,
-         size_t* descriptors_sizes, size_t num_descriptors, void* user_arg) {
+         size_t* descriptors_sizes, size_t num_descriptors, void* user_arg,
+         PJRT_Transfers_CrossHostSendCancelNotifier cancel_notifier,
+         void* cancel_notifier_user_arg) {
         CrossHostRecvNotifierFunction* notifier_fn =
             reinterpret_cast<CrossHostRecvNotifierFunction*>(user_arg);
         (*notifier_fn)(error, serialized_descriptors, descriptors_sizes,
-                       num_descriptors);
+                       num_descriptors, cancel_notifier,
+                       cancel_notifier_user_arg);
         delete notifier_fn;
+        // The cancellation callback isn't always called, so instead of freeing
+        // it after usage, we free it here after the notifier is called.
+        CrossHostSendCancelNotifierFunction* cancel_notifier_fn =
+            reinterpret_cast<CrossHostSendCancelNotifierFunction*>(
+                cancel_notifier_user_arg);
+        delete cancel_notifier_fn;
       }};
 }
 
 PJRT_Transfers_CrossHostRemoteSendCallbackInfo
 CppCrossHostRemoteSendCallbackToC(
     const PJRT_Api* c_api, xla::PjRtBuffer::RemoteSendCallback cpp_callback) {
-  using RemoteSendCallbackFunction =
-      std::function<void(PJRT_Error * error, bool sends_were_enqueued)>;
   auto on_done_function = new RemoteSendCallbackFunction(
       [cpp_callback = std::move(cpp_callback), c_api](
           PJRT_Error* error, bool sends_were_enqueued) {
diff --git a/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.h b/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.h
index 160a5f4771498b..98dff8fe57ebba 100644
--- a/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.h
+++ b/third_party/xla/xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.h
@@ -35,7 +35,7 @@ extern "C" {
 // CrossHostSendBuffers and CrossHostReceiveBuffers. These methods allow PjRt
 // clients to implement various optimizations for cross-host transfers.
 
-#define PJRT_API_CROSS_HOST_TRANSFERS_EXTENSION_VERSION 3
+#define PJRT_API_CROSS_HOST_TRANSFERS_EXTENSION_VERSION 4
 
 // ---------------------------------- Methods ----------------------------------
 
@@ -83,9 +83,21 @@ typedef PJRT_Error* PJRT_Transfers_PJRT_Client_CrossHostReceiveBuffers(
 
 // The structs and methods below correspond to the original cross-host transfers
 // API.
+typedef void (*PJRT_Transfers_CrossHostOnCanceledCallback)(PJRT_Error* error,
+                                                           void* user_arg);
+
+typedef void (*PJRT_Transfers_CrossHostSendCancelNotifier)(
+    const char* serialized_descriptor, size_t serialized_descriptor_size,
+    PJRT_Error_Code reason, const char* error_message,
+    size_t error_message_size,
+    PJRT_Transfers_CrossHostOnCanceledCallback on_canceled,
+    void* on_canceled_user_arg, void* user_arg);
+
 typedef void (*PJRT_Transfers_CrossHostRecvNotifier)(
     PJRT_Error* error, const char** serialized_descriptors,
-    size_t* descriptors_sizes, size_t num_descriptors, void* user_arg);
+    size_t* descriptors_sizes, size_t num_descriptors, void* user_arg,
+    PJRT_Transfers_CrossHostSendCancelNotifier cancel_notifier,
+    void* cancel_notifier_user_arg);
 
 struct PJRT_Transfers_CrossHostRecvNotifierInfo {
   void* user_arg;

From f43a3f21dba69f2a13f93ebb4b4980ee20b1da27 Mon Sep 17 00:00:00 2001
From: Jiya Zhang <jiyaz@google.com>
Date: Thu, 18 Dec 2025 11:48:06 -0800
Subject: [PATCH 545/753] Expose XProf Collection Walltime

PiperOrigin-RevId: 846357376
---
 .../xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc b/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
index 306598ec942a14..37090027c4301d 100644
--- a/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
+++ b/third_party/xla/xla/tools/multihost_hlo_runner/functional_hlo_runner.cc
@@ -610,6 +610,7 @@ absl::StatusOr<PerDeviceLiteralVecType> RunInternal(
           running_options.recreate_profiler_session_between_repeats ||
           is_last_repeat;
       if (has_active_profiler_session && upload_active_profiler_session) {
+        XLA_SCOPED_LOGGING_TIMER("FunctionalHloRunner::XProfUpload");
         running_options.profiler->UploadSession();
         has_active_profiler_session = false;
       }

From 389c77fa7a92ba7e7cc40e57ce9df88e45dad48d Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 18 Dec 2025 13:19:48 -0800
Subject: [PATCH 546/753] Enabling indexing analysis directly on StableHLO.

This is adding an additional helper that can operate directly on StableHLO. It tries to reuse/extract indexing utils as much as possible excluding where the cost of reuse negates value.

Only one direction of support is implemented as size was rather large already. The support for runtime variables ended up being a bit more invasive.

PiperOrigin-RevId: 846390948
---
 .../codegen/tiling/symbolic_tile_analysis.cc  |    2 +-
 third_party/xla/xla/hlo/analysis/BUILD        |   32 +-
 .../xla/xla/hlo/analysis/indexing_analysis.cc |  545 ++++-----
 .../xla/xla/hlo/analysis/indexing_analysis.h  |   39 +-
 .../hlo/analysis/indexing_analysis_test.cc    |  308 +++--
 .../hlo/analysis/indexing_analysis_utils.cc   |  340 ++++++
 .../hlo/analysis/indexing_analysis_utils.h    |  102 ++
 .../xla/xla/hlo/analysis/indexing_map_test.cc |    6 +-
 .../xla/hlo/analysis/indexing_test_utils.h    |   18 +-
 .../analysis/stablehlo_indexing_analysis.cc   | 1009 +++++++++++++++++
 .../analysis/stablehlo_indexing_analysis.h    |   29 +
 11 files changed, 1993 insertions(+), 437 deletions(-)
 create mode 100644 third_party/xla/xla/hlo/analysis/indexing_analysis_utils.cc
 create mode 100644 third_party/xla/xla/hlo/analysis/indexing_analysis_utils.h
 create mode 100644 third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.cc
 create mode 100644 third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.h

diff --git a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc
index 0ac831f03114a3..2684e6378f5545 100644
--- a/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc
+++ b/third_party/xla/xla/codegen/tiling/symbolic_tile_analysis.cc
@@ -1147,7 +1147,7 @@ ComposeIndexingResult ComposeInstructionIndexing(
     IndexingMap rt_map =
         ComposeIndexingMaps(tiled_hlo_instruction->indexing_map(), rt_var.map);
     HloInstructionAdaptor hlo_adaptor =
-        instruction_adaptor.parent().GetInstruction(rt_var.hlo);
+        instruction_adaptor.parent().GetInstruction(rt_var.hlo());
     auto tiled_runtime_var = std::make_unique<SymbolicTiledHloInstruction>(
         &hlo_adaptor.instruction(), rt_map,
         tiled_hlo_instruction->runtime_variables());
diff --git a/third_party/xla/xla/hlo/analysis/BUILD b/third_party/xla/xla/hlo/analysis/BUILD
index 627179d0643995..1494f6de2d76e8 100644
--- a/third_party/xla/xla/hlo/analysis/BUILD
+++ b/third_party/xla/xla/hlo/analysis/BUILD
@@ -594,11 +594,13 @@ cc_library(
     name = "indexing_analysis",
     srcs = [
         "indexing_analysis.cc",
+        "indexing_analysis_utils.cc",
         "indexing_map.cc",
         "indexing_map_serialization.cc",
     ],
     hdrs = [
         "indexing_analysis.h",
+        "indexing_analysis_utils.h",
         "indexing_map.h",
         "indexing_map_serialization.h",
     ],
@@ -626,6 +628,7 @@ cc_library(
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
         "@local_tsl//tsl/platform:logging",
+        "@stablehlo//:stablehlo_ops",
     ],
 )
 
@@ -644,8 +647,6 @@ xla_cc_test(
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest",
         "@llvm-project//mlir:IR",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test",
     ],
 )
 
@@ -698,12 +699,18 @@ xla_cc_test(
     deps = [
         ":indexing_analysis",
         ":indexing_test_utils",
+        ":stablehlo_indexing_analysis",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/translate:stablehlo",
         "//xla/hlo/utils:hlo_traversal",
         "//xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest",
-        "@local_tsl//tsl/platform:test",
+        "@llvm-project//mlir:FuncDialect",
+        "@llvm-project//mlir:IR",
+        "@stablehlo//:stablehlo_ops",
     ],
 )
 
@@ -796,3 +803,22 @@ xla_cc_test(
         "@llvm-project//mlir:Support",
     ],
 )
+
+cc_library(
+    name = "stablehlo_indexing_analysis",
+    srcs = ["stablehlo_indexing_analysis.cc"],
+    hdrs = ["stablehlo_indexing_analysis.h"],
+    deps = [
+        ":indexing_analysis",
+        ":interval",
+        "//xla:permutation_util",
+        "//xla:shape_util",
+        "//xla/mlir_hlo",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/types:span",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Support",
+        "@stablehlo//:stablehlo_ops",
+    ],
+)
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis.cc b/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
index 7f686baea60ada..7ce3f66bd2c36b 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis.cc
@@ -42,8 +42,13 @@ limitations under the License.
 #include "llvm/ADT/SmallVector.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/Support/LLVM.h"
+#include "stablehlo/dialect/StablehloOps.h"
+#include "xla/hlo/analysis/indexing_analysis_utils.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
@@ -93,7 +98,7 @@ HloInstructionIndexing CreateUnknownIndexing(int64_t count = 1) {
 //   into `idx`.
 struct HLORTVar {
   Interval feasible_values;
-  const HloInstruction* hlo;
+  InstructionRef hlo;
   mlir::AffineMap map;
   DimensionVector dim_upper_bounds;
 };
@@ -107,33 +112,80 @@ inline bool operator!=(const HLORTVar& lhs, const HLORTVar& rhs) {
   return !(lhs == rhs);
 }
 
-// Optimizes runtime variable if it's possible to replace it with a constant.
-//
-// Note: we had a more complex logic here that handled more instruction types
-// but was removed due to previous version not updating value ranges
-// (b/419279949).
-std::optional<AffineExpr> OptimizeRTVar(HLORTVar rt_var,
-                                        MLIRContext* mlir_context) {
-  if (auto constant_expr = DynCast<HloConstantInstruction>(rt_var.hlo)) {
-    if (rt_var.map.isConstant()) {
-      const auto idx = rt_var.map.getConstantResults();
+std::optional<int64_t> GetIntOrSplatIntValue(mlir::Attribute attr) {
+  if (auto int_attr = mlir::dyn_cast<mlir::IntegerAttr>(attr)) {
+    return int_attr.getInt();
+  }
+  if (auto splat = mlir::dyn_cast<mlir::SplatElementsAttr>(attr)) {
+    if (auto element_attr = mlir::dyn_cast_or_null<mlir::IntegerAttr>(
+            splat.getSplatValue<mlir::Attribute>())) {
+      return element_attr.getInt();
+    }
+  }
+  return std::nullopt;
+}
+
+}  // namespace
+
+std::optional<AffineExpr> OptimizeHloRTVar(const HloInstruction* hlo,
+                                           const RuntimeVarIndexing& rt_var,
+                                           const Interval& feasible_values,
+                                           MLIRContext* mlir_context) {
+  if (auto constant_expr = DynCast<HloConstantInstruction>(hlo)) {
+    if (rt_var.map.GetAffineMap().isConstant()) {
+      const auto idx = rt_var.map.GetAffineMap().getConstantResults();
       auto const_value = constant_expr->literal().GetIntegralAsS64(idx).value();
-      if (!rt_var.feasible_values.Contains(const_value)) {
-        // Constant is outside of the feasible values, keep the symbol to let
-        // the runtime to handle that.
+      if (!feasible_values.Contains(const_value)) {
         return std::nullopt;
       }
       return getAffineConstantExpr(const_value, mlir_context);
     }
   }
-  if (auto iota_expr = DynCast<HloIotaInstruction>(rt_var.hlo)) {
+  if (auto iota_expr = DynCast<HloIotaInstruction>(hlo)) {
     auto iota_dimension = iota_expr->iota_dimension();
-    CHECK(iota_dimension < rt_var.map.getNumResults());
-    return rt_var.map.getResults()[iota_dimension];
+    CHECK(iota_dimension < rt_var.map.GetAffineMap().getNumResults());
+    return rt_var.map.GetAffineMap().getResults()[iota_dimension];
+  }
+  return std::nullopt;
+}
+
+std::optional<AffineExpr> OptimizeMlirRTVar(mlir::Operation* op,
+                                            const RuntimeVarIndexing& rt_var,
+                                            const Interval& feasible_values,
+                                            MLIRContext* mlir_context) {
+  mlir::Attribute attr;
+  if (mlir::matchPattern(op, mlir::m_Constant(&attr))) {
+    auto int_val = GetIntOrSplatIntValue(attr);
+    if (int_val.has_value()) {
+      if (!feasible_values.Contains(*int_val)) {
+        return std::nullopt;
+      }
+      return getAffineConstantExpr(*int_val, mlir_context);
+    }
+  }
+  if (auto iota_op = llvm::dyn_cast<mlir::stablehlo::IotaOp>(op)) {
+    int64_t iota_dim = iota_op.getIotaDimension();
+    if (iota_dim < rt_var.map.GetAffineMap().getNumResults()) {
+      return rt_var.map.GetAffineMap().getResults()[iota_dim];
+    }
   }
   return std::nullopt;
 }
 
+std::optional<AffineExpr> OptimizeRTVar(const RuntimeVarIndexing& rt_var,
+                                        const Interval& feasible_values,
+                                        MLIRContext* mlir_context) {
+  if (const HloInstruction* hlo = rt_var.hlo()) {
+    return OptimizeHloRTVar(hlo, rt_var, feasible_values, mlir_context);
+  }
+  if (auto* op = rt_var.mlir_op()) {
+    return OptimizeMlirRTVar(op, rt_var, feasible_values, mlir_context);
+  }
+  return std::nullopt;
+}
+
+namespace {
+
 std::vector<IndexingMap::Variable> ConvertHLORTVarsToRTVars(
     const std::vector<HLORTVar>& hlo_rt_vars) {
   std::vector<IndexingMap::Variable> rt_vars;
@@ -154,7 +206,11 @@ IndexingMap FoldRTVarsAndConstructIndexingMap(
   CHECK_EQ(affine_map.getNumSymbols(), hlo_rt_vars.size());
   for (auto idx = 0; idx < affine_map.getNumSymbols(); ++idx) {
     auto& rt_var = hlo_rt_vars[idx];
-    std::optional<AffineExpr> result = OptimizeRTVar(rt_var, mlir_context);
+    std::optional<AffineExpr> result = OptimizeRTVar(
+        RuntimeVarIndexing{rt_var.hlo, IndexingMap::FromTensorSizes(
+                                           rt_var.map, rt_var.dim_upper_bounds,
+                                           /*symbol_upper_bounds=*/{})},
+        rt_var.feasible_values, mlir_context);
     if (!result) {
       continue;
     }
@@ -183,29 +239,24 @@ OperandIndexing CreateOperandIndexingWithRTVars(
   IndexingMap update_map_ops = FoldRTVarsAndConstructIndexingMap(
       operand_map, dim_vars, std::move(rt_vars));
 
-  return OperandIndexing(update_map_ops, rt_indexing);
+  OperandIndexing indexing(update_map_ops, rt_indexing);
+  indexing.RemoveUnusedSymbols();
+  return indexing;
 }
 
 HloInstructionIndexing ComputeOutputToInputCwiseOpIndexing(
     const HloInstruction* instr, MLIRContext* mlir_context) {
-  IndexingMap identity_map = CreateIdentityMap(instr->shape(), mlir_context);
-  IndexingMap unit_map(
-      mlir::AffineMap::get(identity_map.GetAffineMap().getNumDims(),
-                           /*symbolCount=*/0, mlir_context),
-      identity_map.GetDimVars(), /*range_vars=*/{}, /*rt_vars=*/{});
-
-  HloInstructionIndexing instr_indexing;
-  instr_indexing.indexing_maps.resize(instr->operand_count());
-  int64_t operand_count = instr->operand_count();
-  for (int64_t operand_id = 0; operand_id < operand_count; ++operand_id) {
+  HloInstructionIndexing instr_indexing = CreateElementwiseIndexing(
+      instr->operand_count(), instr->shape(), mlir_context);
+  for (int64_t operand_id = 0; operand_id < instr->operand_count();
+       ++operand_id) {
     // Select allows implicit broadcasting in the predicate. We just handle it
     // generically here.
-    auto* operand = instr->operand(operand_id);
-    if (operand->shape().dimensions().size() == 0 &&
-        instr->shape().dimensions().size() > 0) {
-      instr_indexing.indexing_maps[operand_id].emplace(unit_map);
-    } else {
-      instr_indexing.indexing_maps[operand_id].emplace(identity_map);
+    if (instr->operand(operand_id)->shape().dimensions().empty() &&
+        !instr->shape().dimensions().empty()) {
+      instr_indexing.indexing_maps[operand_id].clear();
+      instr_indexing.indexing_maps[operand_id].emplace(
+          CreateScalarIndexingMap(instr->shape(), mlir_context));
     }
   }
   return instr_indexing;
@@ -217,19 +268,14 @@ HloInstructionIndexing ComputeInputToOutputCwiseOpIndexing(
   return HloInstructionIndexing::FromIndexingMaps({identity_map});
 }
 
+}  // namespace
+
+namespace {
+
 HloInstructionIndexing ComputeOutputToInputBroadcastOpIndexing(
     const HloBroadcastInstruction* bcast, MLIRContext* mlir_context) {
-  auto output_dims = bcast->shape().dimensions();
-
-  std::vector<AffineExpr> exprs;
-  exprs.reserve(bcast->dimensions().size());
-  for (int64_t bcast_dim : bcast->dimensions()) {
-    exprs.push_back(getAffineDimExpr(bcast_dim, mlir_context));
-  }
-  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_dims.size(), /*symbolCount=*/0, exprs,
-                     mlir_context),
-      output_dims, {});
+  IndexingMap indexing_map = ComputeBroadcastIndexingMap(
+      bcast->shape().dimensions(), bcast->dimensions(), mlir_context);
   return HloInstructionIndexing::FromIndexingMaps({indexing_map});
 }
 
@@ -266,31 +312,15 @@ HloInstructionIndexing ComputeInputToOutputBroadcastOpIndexing(
 
 HloInstructionIndexing ComputeOutputToInputConcatenateOpIndexing(
     const HloConcatenateInstruction* concat, MLIRContext* mlir_context) {
-  const auto& operand_0_dims = concat->operand(0)->shape().dimensions();
-
-  // Initialize affine map and domain. Only concat_dim elements of both have to
-  // be adjusted for a particular operand_id.
-  mlir::MutableAffineMap affine_map =
-      AffineMap::getMultiDimIdentityMap(operand_0_dims.size(), mlir_context);
-  std::vector<IndexingMap::Variable> dim_vars =
-      DimVarsFromTensorSizes(operand_0_dims);
-
-  HloInstructionIndexing concat_indexing;
-  concat_indexing.indexing_maps.resize(concat->operand_count());
   int64_t concat_dim = concat->concatenate_dimension();
-  AffineExpr concat_dim_expr = getAffineDimExpr(concat_dim, mlir_context);
-  int64_t offset = 0;
-  for (const auto [operand_id, operand] : llvm::enumerate(concat->operands())) {
-    affine_map.setResult(concat_dim, concat_dim_expr - offset);
-    int64_t operand_concat_dim = operand->shape().dimensions()[concat_dim];
-    dim_vars[concat_dim] =
-        IndexingMap::Variable{{offset, offset + operand_concat_dim - 1}};
-    concat_indexing.indexing_maps[operand_id].insert(
-        OperandIndexing(IndexingMap(affine_map.getAffineMap(), dim_vars,
-                                    /*range_vars=*/{}, /*rt_vars=*/{})));
-    offset += operand_concat_dim;
+  std::vector<int64_t> operand_concat_dim_sizes;
+  operand_concat_dim_sizes.reserve(concat->operand_count());
+  for (const auto* operand : concat->operands()) {
+    operand_concat_dim_sizes.push_back(operand->shape().dimensions(concat_dim));
   }
-  return concat_indexing;
+  return ComputeConcatenateIndexing(concat->shape().dimensions().size(),
+                                    concat_dim, concat->shape().dimensions(),
+                                    operand_concat_dim_sizes, mlir_context);
 }
 
 HloInstructionIndexing ComputeInputToOutputConcatenateOpIndexing(
@@ -335,74 +365,11 @@ HloInstructionIndexing ComputeOutputToInputFusionOpIndexing(
 std::pair<IndexingMap, IndexingMap> ComputeDotOperandsIndexingImpl(
     const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape,
     const DotDimensionNumbers& dim_numbers, MLIRContext* mlir_context) {
-  absl::Span<const int64_t> lhs_contracting_dims(
-      dim_numbers.lhs_contracting_dimensions());
-  absl::Span<const int64_t> rhs_contracting_dims =
-      dim_numbers.rhs_contracting_dimensions();
-
-  absl::Span<const int64_t> lhs_batch_dims = dim_numbers.lhs_batch_dimensions();
-  absl::Span<const int64_t> rhs_batch_dims = dim_numbers.rhs_batch_dimensions();
-
-  // According to the StableHLO specification, the dimensions of the output
-  // shape are ordered as follows:
-  //   lhs_batch_dims | lhs_non_contracting_dims | rhs_non_contracting_dims
-  SmallVector<AffineExpr> lhs_exprs(lhs_shape.dimensions().size());
-  SmallVector<AffineExpr> rhs_exprs(rhs_shape.dimensions().size());
-  int64_t output_dim_id = 0;
-
-  // lhs_batch_dims
-  for (auto [lhs_batch_dim, rhs_batch_dim] :
-       llvm::zip(lhs_batch_dims, rhs_batch_dims)) {
-    AffineExpr output_dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
-    lhs_exprs[lhs_batch_dim] = output_dim_expr;
-    rhs_exprs[rhs_batch_dim] = output_dim_expr;
-    ++output_dim_id;
-  }
-
-  // lhs_non_contracting_dims
-  auto lhs_non_contracting_dims =
-      GetNonContractingDims(lhs_shape, lhs_batch_dims, lhs_contracting_dims);
-  assert(lhs_non_contracting_dims.ok());
-
-  for (int64_t lhs_non_contracting_dim : lhs_non_contracting_dims.value()) {
-    lhs_exprs[lhs_non_contracting_dim] =
-        getAffineDimExpr(output_dim_id++, mlir_context);
-  }
-
-  // rhs_non_contracting_dims
-  auto rhs_non_contracting_dims =
-      GetNonContractingDims(rhs_shape, rhs_batch_dims, rhs_contracting_dims);
-  assert(rhs_non_contracting_dims.ok());
-  for (int64_t rhs_non_contracting_dim : rhs_non_contracting_dims.value()) {
-    rhs_exprs[rhs_non_contracting_dim] =
-        getAffineDimExpr(output_dim_id++, mlir_context);
-  }
-
-  int64_t input_dim_id = 0;
-  std::vector<int64_t> input_dim_sizes;
-  input_dim_sizes.reserve(lhs_contracting_dims.size());
-
-  for (auto [lhs_contracting_dim, rhs_contracting_dim] :
-       llvm::zip(lhs_contracting_dims, rhs_contracting_dims)) {
-    AffineExpr input_dim_expr = getAffineSymbolExpr(input_dim_id, mlir_context);
-    lhs_exprs[lhs_contracting_dim] = input_dim_expr;
-    rhs_exprs[rhs_contracting_dim] = input_dim_expr;
-    ++input_dim_id;
-
-    // LHS and RHS contracting dimensions must match pairwise, and we therefore
-    // need only populate a single input_dim_sizes vector.
-    input_dim_sizes.push_back(lhs_shape.dimensions(lhs_contracting_dim));
-  }
-
-  int64_t output_rank = output_shape.dimensions().size();
-  return std::make_pair(IndexingMap::FromTensorSizes(
-                            AffineMap::get(output_rank, input_dim_sizes.size(),
-                                           lhs_exprs, mlir_context),
-                            output_shape.dimensions(), input_dim_sizes),
-                        IndexingMap::FromTensorSizes(
-                            AffineMap::get(output_rank, input_dim_sizes.size(),
-                                           rhs_exprs, mlir_context),
-                            output_shape.dimensions(), input_dim_sizes));
+  return ComputeDotOperandsIndexing(
+      lhs_shape.dimensions(), rhs_shape.dimensions(), output_shape.dimensions(),
+      dim_numbers.lhs_batch_dimensions(), dim_numbers.rhs_batch_dimensions(),
+      dim_numbers.lhs_contracting_dimensions(),
+      dim_numbers.rhs_contracting_dimensions(), mlir_context);
 }
 
 // Returns the new map with the results scaled by (operand_shape / scale_shape).
@@ -472,29 +439,45 @@ HloInstructionIndexing ComputeOutputToInputDynamicSliceOpIndexing(
       << "b/118437727: Old form, not supported.";
   // A map from tensor iteration space to (), because index operands are 0d
   // tensors.
+  IndexingMap start_indices_map =
+      CreateScalarIndexingMap(output_shape, mlir_context);
+
   AffineMap empty_results_affine_map = AffineMap::get(
       /*dimCount=*/rank, /*symbolCount=*/0, /*results=*/{}, mlir_context);
-  IndexingMap start_indices_map = IndexingMap::FromTensorSizes(
-      empty_results_affine_map, output_shape.dimensions(), {});
-
   std::vector<HLORTVar> offsets_rt_vars;
   offsets_rt_vars.reserve(rank);
   std::vector<AffineExpr> exprs;
   exprs.reserve(rank);
+
   for (auto [dim, slice_size] :
        llvm::enumerate(dynamic_slice->dynamic_slice_sizes())) {
-    exprs.push_back(getAffineDimExpr(dim, mlir_context) +
-                    getAffineSymbolExpr(dim, mlir_context));
-    offsets_rt_vars.push_back(HLORTVar{
-        Interval{0, input_shape.dimensions(dim) - slice_size},
-        dynamic_slice->operand(dim + first_index_num), empty_results_affine_map,
-        ShapeUtil::CreateDimensionVectorFromShape(output_shape)});
+    AffineExpr dim_expr = getAffineDimExpr(dim, mlir_context);
+    const HloInstruction* offset_op =
+        dynamic_slice->operand(dim + first_index_num);
+    int64_t max_index = input_shape.dimensions(dim) - slice_size;
+
+    // Construct temp objects for optimization
+    RuntimeVarIndexing rt_indexing{offset_op, start_indices_map};
+    Interval feasible_values{0, max_index};
+
+    auto simplified_expr =
+        OptimizeRTVar(rt_indexing, feasible_values, mlir_context);
+    if (simplified_expr) {
+      exprs.push_back(dim_expr + *simplified_expr);
+    } else {
+      exprs.push_back(
+          dim_expr + getAffineSymbolExpr(offsets_rt_vars.size(), mlir_context));
+      offsets_rt_vars.push_back(
+          HLORTVar{feasible_values, offset_op, empty_results_affine_map,
+                   ShapeUtil::CreateDimensionVectorFromShape(output_shape)});
+    }
   }
   std::vector<OperandIndexing> indexing_maps(
       dynamic_slice->operand_count(), OperandIndexing(start_indices_map));
 
+  int symbol_count = offsets_rt_vars.size();
   indexing_maps[0] = CreateOperandIndexingWithRTVars(
-      AffineMap::get(/*dimCount=*/rank, /*symbolCount=*/rank, exprs,
+      AffineMap::get(/*dimCount=*/rank, /*symbolCount=*/symbol_count, exprs,
                      mlir_context),
       start_indices_map.GetDimVars(), std::move(offsets_rt_vars));
   HloInstructionIndexing result =
@@ -520,11 +503,11 @@ HloInstructionIndexing ComputeOutputToInputDynamicUpdateSliceOpIndexing(
       output_shape.dimensions(), {});
 
   // start_indices: (d0, ... d{N-1}) -> ()
+  IndexingMap start_indices_map =
+      CreateScalarIndexingMap(output_shape, mlir_context);
+
   AffineMap empty_results_affine_map = AffineMap::get(
       /*dimCount=*/rank, /*symbolCount=*/0, /*results=*/{}, mlir_context);
-  IndexingMap start_indices_map = IndexingMap::FromTensorSizes(
-      empty_results_affine_map, output_shape.dimensions(), {});
-
   // update: (d0 - rt0, ..., d{N-1} - rt{N-1})
   std::vector<AffineExpr> exprs;
   exprs.reserve(rank);
@@ -618,40 +601,9 @@ HloInstructionIndexing ComputeOutputToInputGatherOpIndexing(
       {operand_indexing, OperandIndexing(indices_map)});
 }
 
-IndexingMap ComputeOutputToInputPadOpIndexingImpl(
-    absl::Span<const int64_t> output_dims,
-    absl::Span<const int64_t> padding_low,
-    absl::Span<const int64_t> padding_high,
-    absl::Span<const int64_t> padding_interior, MLIRContext* mlir_context) {
-  int64_t output_rank = output_dims.size();
+}  // namespace
 
-  std::vector<AffineExpr> exprs;
-  std::vector<std::pair<AffineExpr, Interval>> constraints;
-  std::vector<IndexingMap::Variable> dim_vars;
-  exprs.reserve(output_rank);
-  constraints.reserve(output_rank);
-  int64_t output_dim_id = 0;
-  for (const auto [output_dim, pad_low, pad_high, pad_interior] :
-       llvm::zip(output_dims, padding_low, padding_high, padding_interior)) {
-    AffineExpr dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
-    dim_vars.push_back({IndexingMap::Variable{
-        std::max(int64_t{0}, pad_low),
-        std::min(output_dim - 1, output_dim - 1 - pad_high)}});
-    if (pad_interior == 0) {
-      exprs.push_back(dim_expr - pad_low);
-    } else {
-      exprs.push_back((dim_expr - pad_low).floorDiv(pad_interior + 1));
-      constraints.push_back(
-          {(dim_expr - pad_low) % (pad_interior + 1), Interval{0, 0}});
-    }
-    ++output_dim_id;
-  }
-  return IndexingMap{
-      AffineMap::get(output_rank, /*symbolCount=*/0, exprs, mlir_context),
-      std::move(dim_vars),
-      /*range_vars = */ {},
-      /*rt_vars = */ {}, absl::MakeSpan(constraints)};
-}
+namespace {
 
 HloInstructionIndexing ComputeOutputToInputPadOpIndexing(
     const HloPadInstruction* pad, MLIRContext* mlir_context) {
@@ -666,47 +618,25 @@ HloInstructionIndexing ComputeOutputToInputPadOpIndexing(
     padding_high.push_back(dim_config.edge_padding_high());
     padding_interior.push_back(dim_config.interior_padding());
   }
-  IndexingMap input_indexing_map = ComputeOutputToInputPadOpIndexingImpl(
-      output_shape.dimensions(), padding_low, padding_high, padding_interior,
-      mlir_context);
-  IndexingMap padding_value_indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_shape.dimensions().size(), /*symbolCount=*/0, {},
-                     mlir_context),
-      output_shape.dimensions(), /*symbol_upper_bounds=*/{});
+  IndexingMap input_indexing_map =
+      ComputePadIndexingMap(output_shape.dimensions(), padding_low,
+                            padding_high, padding_interior, mlir_context);
+  IndexingMap padding_value_indexing_map =
+      CreateScalarIndexingMap(output_shape, mlir_context);
   return HloInstructionIndexing::FromIndexingMaps(
       {input_indexing_map, padding_value_indexing_map});
 }
 
 HloInstructionIndexing ComputeOutputToInputReduceOpIndexing(
     const HloReduceInstruction* reduce, MLIRContext* mlir_context) {
-  absl::flat_hash_set<int64_t> reduce_dims_ids(reduce->dimensions().begin(),
-                                               reduce->dimensions().end());
-
   const Shape& input_shape = reduce->operand(0)->shape();
   const Shape& output_shape = GetOutputShape(reduce, 0);
 
-  std::vector<int64_t> parallel_dims_sizes;
-  int64_t output_dim_id = 0;
-  std::vector<AffineExpr> exprs;
-  exprs.reserve(input_shape.dimensions().size());
-  for (auto [input_dim_id, input_dim] :
-       llvm::enumerate(input_shape.dimensions())) {
-    if (reduce_dims_ids.contains(input_dim_id)) {
-      exprs.push_back(
-          getAffineSymbolExpr(parallel_dims_sizes.size(), mlir_context));
-      parallel_dims_sizes.push_back(input_dim);
-      continue;
-    }
-    exprs.push_back(getAffineDimExpr(output_dim_id++, mlir_context));
-  }
-  IndexingMap inputs_indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_shape.dimensions().size(), reduce_dims_ids.size(),
-                     exprs, mlir_context),
-      output_shape.dimensions(), parallel_dims_sizes);
-  IndexingMap inits_indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_shape.dimensions().size(), /*symbolCount=*/0, {},
-                     mlir_context),
-      output_shape.dimensions(), {});
+  IndexingMap inputs_indexing_map = ComputeReduceInputIndexingMap(
+      input_shape.dimensions(), output_shape.dimensions(), reduce->dimensions(),
+      mlir_context);
+  IndexingMap inits_indexing_map =
+      CreateScalarIndexingMap(output_shape, mlir_context);
 
   HloInstructionIndexing instr_indexing;
   instr_indexing.indexing_maps.resize(reduce->operand_count());
@@ -775,56 +705,24 @@ IndexingMap ComposeIndexingMapsForWindow(
     absl::Span<const int64_t> output_dimensions, const Window& window,
     MLIRContext* mlir_context) {
   size_t rank = input_dimensions.size();
-
-  // Compute shape of the padded input and the indexing map of pad op required
-  // to pad the input.
-  SmallVector<int64_t> padding_low, padding_high, padding_interior,
-      padded_input_dimensions;
-  padding_low.reserve(rank);
-  padding_high.reserve(rank);
-  padding_interior.reserve(rank);
-  padded_input_dimensions.reserve(rank);
-  SmallVector<AffineExpr, 4> exprs;
-  std::vector<IndexingMap::Variable> dim_vars;
-  std::vector<IndexingMap::Variable> range_vars;
-  exprs.reserve(rank);
-  dim_vars.reserve(rank);
-  range_vars.reserve(rank);
-  for (const auto& [dim_id, window_config] :
-       llvm::enumerate(window.dimensions())) {
-    padding_low.push_back(window_config.padding_low());
-    padding_high.push_back(window_config.padding_high());
-    // For some reason interior_padding in HLO pad is offset from base_dilations
-    // in HLO reduce-window by 1.
-    padding_interior.push_back(window_config.base_dilation() - 1);
-    padded_input_dimensions.push_back(
-        input_dimensions[dim_id] + window_config.padding_low() +
-        window_config.padding_high() +
-        (input_dimensions[dim_id] - 1) * (window_config.base_dilation() - 1));
-    AffineExpr dim_expr = getAffineDimExpr(dim_id, mlir_context);
-    AffineExpr symbol_expr = getAffineSymbolExpr(dim_id, mlir_context);
-
-    exprs.push_back(symbol_expr * window_config.window_dilation() +
-                    window_config.stride() * dim_expr);
-    dim_vars.push_back(
-        {IndexingMap::Variable{0, output_dimensions[dim_id] - 1}});
-    range_vars.push_back({IndexingMap::Variable{0, window_config.size() - 1}});
-  }
-  // Indexing map for pad op that pads the input.
-  IndexingMap padded_input_indexing = ComputeOutputToInputPadOpIndexingImpl(
-      padded_input_dimensions, padding_low, padding_high, padding_interior,
-      mlir_context);
-  // Indexing map for reduce-window, that does not do any padding.
-  IndexingMap input_indexing_no_padding(
-      AffineMap::get(rank, rank, exprs, mlir_context), dim_vars, range_vars,
-      /*rt_vars=*/{});
-
-  // Composed indexing.
-  IndexingMap result =
-      ComposeIndexingMaps(input_indexing_no_padding, padded_input_indexing);
-  result.Simplify();
-  result.RemoveUnusedSymbols();
-  return result;
+  SmallVector<int64_t> window_dims, window_strides, window_dilations,
+      base_dilations, padding;
+  window_dims.reserve(rank);
+  window_strides.reserve(rank);
+  window_dilations.reserve(rank);
+  base_dilations.reserve(rank);
+  padding.reserve(rank * 2);
+  for (const auto& dim : window.dimensions()) {
+    window_dims.push_back(dim.size());
+    window_strides.push_back(dim.stride());
+    window_dilations.push_back(dim.window_dilation());
+    base_dilations.push_back(dim.base_dilation());
+    padding.push_back(dim.padding_low());
+    padding.push_back(dim.padding_high());
+  }
+  return ComposeWindowIndexingMap(input_dimensions, output_dimensions,
+                                  window_dims, window_strides, window_dilations,
+                                  base_dilations, padding, mlir_context);
 }
 
 // Indexing for reduce-window with dilations and non-trivial padding can be
@@ -842,10 +740,8 @@ HloInstructionIndexing ComputeOutputToInputReduceWindowOpIndexing(
       reduce_window->window(), mlir_context);
 
   // Indexing map for the init value.
-  IndexingMap inits_indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_shape.dimensions().size(), /*symbolCount=*/0, {},
-                     mlir_context),
-      output_shape.dimensions(), /*symbol_upper_bounds=*/{});
+  IndexingMap inits_indexing_map =
+      CreateScalarIndexingMap(output_shape, mlir_context);
 
   HloInstructionIndexing instr_indexing;
   instr_indexing.indexing_maps.resize(reduce_window->operand_count());
@@ -1167,43 +1063,16 @@ HloInstructionIndexing ComputeInputToOutputReshapeOpIndexing(
 
 HloInstructionIndexing ComputeReverseOpIndexing(
     const HloReverseInstruction* reverse, MLIRContext* mlir_context) {
-  absl::flat_hash_set<int64_t> reverse_dims(reverse->dimensions().begin(),
-                                            reverse->dimensions().end());
-  auto output_dims = reverse->shape().dimensions();
-
-  std::vector<AffineExpr> exprs;
-  exprs.reserve(output_dims.size());
-  for (auto [output_dim_id, output_dim] : llvm::enumerate(output_dims)) {
-    auto dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
-    if (!reverse_dims.contains(output_dim_id)) {
-      exprs.push_back(dim_expr);
-      continue;
-    }
-    exprs.push_back(-dim_expr + output_dim - 1);
-  }
-
-  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_dims.size(), /*symbolCount=*/0, exprs,
-                     mlir_context),
-      output_dims, {});
-
+  IndexingMap indexing_map = ComputeReverseIndexingMap(
+      reverse->shape().dimensions(), reverse->dimensions(), mlir_context);
   return HloInstructionIndexing::FromIndexingMaps({indexing_map});
 }
 
 HloInstructionIndexing ComputeOutputToInputSliceOpIndexing(
     const HloSliceInstruction* slice, MLIRContext* mlir_context) {
-  auto output_rank = slice->shape().dimensions().size();
-
-  std::vector<AffineExpr> exprs;
-  exprs.reserve(output_rank);
-  for (int64_t dim = 0; dim < output_rank; ++dim) {
-    AffineExpr dim_expr = getAffineDimExpr(dim, mlir_context);
-    exprs.push_back(dim_expr * slice->slice_strides()[dim] +
-                    slice->slice_starts()[dim]);
-  }
-  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
-      AffineMap::get(output_rank, /*symbolCount=*/0, exprs, mlir_context),
-      slice->shape().dimensions(), {});
+  IndexingMap indexing_map = ComputeSliceIndexingMap(
+      slice->shape().dimensions(), slice->slice_starts(),
+      slice->slice_strides(), mlir_context);
   return HloInstructionIndexing::FromIndexingMaps({indexing_map});
 }
 
@@ -1236,13 +1105,6 @@ HloInstructionIndexing ComputeInputToOutputSliceOpIndexing(
   return HloInstructionIndexing::FromIndexingMaps({std::move(indexing_map)});
 }
 
-AffineMap ComputeTransposeIndexingMap(absl::Span<const int64_t> permutation,
-                                      MLIRContext* mlir_context) {
-  return AffineMap::getPermutationMap(
-      std::vector<unsigned>(permutation.begin(), permutation.end()),
-      mlir_context);
-}
-
 HloInstructionIndexing ComputeOutputToInputTransposeOpIndexing(
     const HloTransposeInstruction* transpose, MLIRContext* mlir_context) {
   AffineMap inverse_permutation = ComputeTransposeIndexingMap(
@@ -1946,7 +1808,7 @@ bool operator==(const OperandIndexing& lhs, const OperandIndexing& rhs) {
 }
 
 bool operator==(const RuntimeVarIndexing& lhs, const RuntimeVarIndexing& rhs) {
-  return lhs.map == rhs.map && lhs.hlo == rhs.hlo;
+  return lhs.map == rhs.map && lhs.instruction_ref == rhs.instruction_ref;
 }
 
 OperandIndexing ComposeOperandIndexing(const OperandIndexing& first,
@@ -1960,7 +1822,8 @@ OperandIndexing ComposeOperandIndexing(const OperandIndexing& first,
                           first.runtime_variables().end());
   for (const auto& rt_var : second.runtime_variables()) {
     IndexingMap combined_map = ComposeIndexingMaps(first.map(), rt_var.map);
-    combined_runtime.push_back(RuntimeVarIndexing{rt_var.hlo, combined_map});
+    combined_runtime.push_back(
+        RuntimeVarIndexing{rt_var.instruction_ref, combined_map});
   }
 
   std::optional<IndexingMap> replica_id_map;
@@ -1981,7 +1844,67 @@ OperandIndexing ComposeOperandIndexing(const OperandIndexing& first,
 }
 
 std::string RuntimeVarIndexing::ToString() const {
-  return absl::StrCat(hlo->ToString(), "; ", xla::ToString(map));
+  // Handle both HLO and MLIR operations producing a unified enough format to
+  // avoid duplication in tests.
+  std::string instruction_str;
+  if (auto* hlo = std::get_if<const HloInstruction*>(&instruction_ref)) {
+    if (*hlo) {
+      // For HLO, print simplified format for parameter and constant
+      if ((*hlo)->opcode() == HloOpcode::kParameter) {
+        instruction_str =
+            absl::StrCat("parameter(", (*hlo)->parameter_number(), ")");
+      } else if ((*hlo)->opcode() == HloOpcode::kConstant) {
+        instruction_str = "constant";
+        // Print constant value for scalar constants
+        const xla::Literal& literal = (*hlo)->literal();
+        if (xla::ShapeUtil::IsScalar(literal.shape())) {
+          instruction_str =
+              absl::StrCat("constant(", literal.ToStringWithoutShape(), ")");
+        }
+      } else {
+        instruction_str = (*hlo)->name();
+      }
+    } else {
+      instruction_str = "<null hlo>";
+    }
+  } else if (auto* val = std::get_if<mlir::Value>(&instruction_ref)) {
+    if (*val) {
+      if (auto* op = val->getDefiningOp()) {
+        // Try to extract constant value for stablehlo/mhlo constant ops
+        llvm::StringRef op_name = op->getName().getStringRef();
+        if (op_name == "stablehlo.constant" || op_name == "mhlo.constant") {
+          instruction_str = "constant";
+          if (auto attr = op->getAttrOfType<mlir::DenseElementsAttr>("value")) {
+            if (attr.isSplat() && attr.getNumElements() == 1) {
+              // Scalar constant - print the value
+              auto elem_type = attr.getElementType();
+              if (elem_type.isSignlessInteger()) {
+                instruction_str = absl::StrCat(
+                    "constant(",
+                    attr.getSplatValue<llvm::APInt>().getSExtValue(), ")");
+              } else if (elem_type.isF32()) {
+                instruction_str =
+                    absl::StrCat("constant(", attr.getSplatValue<float>(), ")");
+              } else if (elem_type.isF64()) {
+                instruction_str = absl::StrCat(
+                    "constant(", attr.getSplatValue<double>(), ")");
+              }
+            }
+          }
+        } else {
+          instruction_str = op_name.str();
+        }
+      } else {
+        // Block argument is print as "parameter(N)" to match HLO format.
+        auto block_arg = llvm::cast<mlir::BlockArgument>(*val);
+        instruction_str =
+            absl::StrCat("parameter(", block_arg.getArgNumber(), ")");
+      }
+    } else {
+      instruction_str = "<null value>";
+    }
+  }
+  return absl::StrCat(instruction_str, "; ", xla::ToString(map));
 }
 
 std::ostream& operator<<(std::ostream& os, const RuntimeVarIndexing& var) {
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis.h b/third_party/xla/xla/hlo/analysis/indexing_analysis.h
index 56e51d9b5804cc..8295d0133f8d89 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_analysis.h
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis.h
@@ -21,6 +21,8 @@ limitations under the License.
 #include <optional>
 #include <ostream>
 #include <string>
+#include <utility>
+#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
@@ -31,7 +33,9 @@ limitations under the License.
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Value.h"
 #include "xla/hlo/analysis/indexing_map.h"
+#include "xla/hlo/analysis/interval.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/shape.h"
@@ -108,15 +112,37 @@ IndexingMap ComputeEpilogueInputToOutputIndexing(
     HloInstructionAdaptor epilogue_parent, HloInstructionAdaptor epilogue_root,
     mlir::MLIRContext* mlir_context);
 
-// Indexing of the runtime variable of the HLO instruction.
+// Type for referencing either an HloInstruction or MLIR Value
+using InstructionRef = std::variant<const HloInstruction*, mlir::Value>;
+
+// Indexing of the runtime variable of the HLO instruction or MLIR operation.
 struct RuntimeVarIndexing {
-  // Instruction of the runtime variable. Note that while in trivial cases it
+  // Instruction reference. Can be either HloInstruction* (for XLA HLO) or
+  // mlir::Value (for StableHLO/MLIR). Note that while in trivial cases it
   // points to one of the operands of the instruction, with multiple
   // instructions and fusions it may point to an arbitrary instruction in the
   // computation.
-  const HloInstruction* hlo;
-  // Output-to-input indexing map from the instruction to the output of `hlo`.
+  InstructionRef instruction_ref;
+
+  // Output-to-input indexing map from the instruction to the output.
   IndexingMap map;
+
+  // Accessor for HloInstruction*
+  const HloInstruction* hlo() const {
+    if (auto* hlo = std::get_if<const HloInstruction*>(&instruction_ref)) {
+      return *hlo;
+    }
+    return nullptr;
+  }
+
+  // Accessor for MLIR operations
+  mlir::Operation* mlir_op() const {
+    if (auto* val = std::get_if<mlir::Value>(&instruction_ref)) {
+      return val->getDefiningOp();
+    }
+    return nullptr;
+  }
+
   std::string ToString() const;
 };
 
@@ -213,6 +239,11 @@ llvm::SmallVector<IndexingMap, 4> MapLogicalToLinearizedPhysicalShape(
     absl::Span<const HloInstruction* const> operands,
     mlir::MLIRContext* mlir_context);
 
+// Optimizes a runtime variable if it's possible to replace it with a constant.
+std::optional<mlir::AffineExpr> OptimizeRTVar(const RuntimeVarIndexing& rt_var,
+                                              const Interval& feasible_values,
+                                              mlir::MLIRContext* mlir_context);
+
 // Computes the indexing map from logical to linearized physical shape for each
 // operand and adds them to `result`. `result` may be non-empty when this
 // function is called and can be used to accumulate results from several calls
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis_test.cc b/third_party/xla/xla/hlo/analysis/indexing_analysis_test.cc
index 74aaf30a25216f..8db318e73f6b8b 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_analysis_test.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis_test.cc
@@ -15,20 +15,33 @@ limitations under the License.
 
 #include "xla/hlo/analysis/indexing_analysis.h"
 
+#include <utility>
+#include <vector>
+
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "mlir/IR/Visitors.h"
+#include "stablehlo/dialect/StablehloOps.h"
 #include "xla/hlo/analysis/indexing_map.h"
 #include "xla/hlo/analysis/indexing_map_serialization.h"
 #include "xla/hlo/analysis/indexing_test_utils.h"
+#include "xla/hlo/analysis/stablehlo_indexing_analysis.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/translate/stablehlo.h"
 #include "xla/hlo/utils/hlo_traversal.h"
-#include "tsl/platform/test.h"
 
 namespace xla {
 namespace gpu {
 namespace {
 
+using ::llvm::dyn_cast;
 using ::testing::ElementsAre;
 using ::testing::Eq;
 using ::testing::ExplainMatchResult;
@@ -42,9 +55,84 @@ MATCHER_P2(MatchInstrIndexing, operand_id, indexing_map_matchers, "") {
                             result_listener);
 }
 
-using IndexingAnalysisTest = IndexingTestBase;
+class IndexingAnalysisTest : public IndexingTestBase,
+                             public ::testing::WithParamInterface<bool> {
+ public:
+  using IndexingTestBase::GetInputToOutputIndexing;
+  using IndexingTestBase::GetOutputToInputIndexing;
+
+  HloInstructionIndexing GetOutputToInputIndexing(
+      const HloInstruction* instr, int output_id,
+      bool use_physical_layout) override {
+    if (GetParam()) {
+      // StableHLO mode
+      auto module_ref =
+          xla::ConvertHloToStablehlo(mlir_context_, instr->GetModule());
+      if (!module_ref.ok()) {
+        ADD_FAILURE() << "HLO to StableHLO conversion failed: "
+                      << module_ref.status();
+        return HloInstructionIndexing::FromIndexingMaps({});
+      }
+      stablehlo_modules_.push_back(std::move(module_ref.value()));
+      auto module_op = *stablehlo_modules_.back();
+      mlir::Operation* op = nullptr;
+      module_op->walk([&](mlir::Operation* nested_op) {
+        if (auto name_loc = dyn_cast<mlir::NameLoc>(nested_op->getLoc())) {
+          if (name_loc.getName() == instr->name()) {
+            op = nested_op;
+            return mlir::WalkResult::interrupt();
+          }
+        }
+        return mlir::WalkResult::advance();
+      });
+      if (!op) {
+        ADD_FAILURE() << "Could not find corresponding StableHLO op for "
+                      << instr->name();
+        return HloInstructionIndexing::FromIndexingMaps({});
+      }
+      return ComputeOutputToInputIndexing(op, output_id);
+    }
+    return IndexingTestBase::GetOutputToInputIndexing(instr, output_id,
+                                                      use_physical_layout);
+  }
+
+  void SetUp() override {
+    IndexingTestBase::SetUp();
+    mlir_context_.loadDialect<mlir::stablehlo::StablehloDialect,
+                              mlir::func::FuncDialect>();
+
+    static const auto* unsupported_tests =
+        new absl::flat_hash_set<absl::string_view>{
+            // StableHLO indexing analysis does not support physical layout /
+            // permutations yet.
+            "PhysicalLayoutTestInputPermutation/1",
+            "PhysicalLayoutTestOutputPermutation/1",
+            "PhysicalLayoutTestInputAndOutputPermutation/1",
+
+            // Custom call / MHLO unknown handling.
+            "ScaledDotOp/1",
+        };
+
+    if (GetParam()) {
+      const testing::TestInfo* test_info =
+          testing::UnitTest::GetInstance()->current_test_info();
+      absl::string_view test_name = test_info->name();
+      // Here we rely on exact match of "TestName/1" which corresponds to
+      // GetParam() == true.
+      if (unsupported_tests->contains(test_name)) {
+        GTEST_SKIP() << "Skipping unsupported StableHLO test: " << test_name;
+      }
+    }
+  }
+
+ private:
+  std::vector<mlir::OwningOpRef<mlir::ModuleOp>> stablehlo_modules_;
+};
+
+INSTANTIATE_TEST_SUITE_P(StablehloIndexingAnalysis, IndexingAnalysisTest,
+                         ::testing::Values(false, true));
 
-TEST_F(IndexingAnalysisTest, GroupIndexingMapsByProducers) {
+TEST_P(IndexingAnalysisTest, GroupIndexingMapsByProducers) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -75,7 +163,7 @@ TEST_F(IndexingAnalysisTest, GroupIndexingMapsByProducers) {
                   )")))));
 }
 
-TEST_F(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing) {
+TEST_P(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -119,7 +207,7 @@ TEST_F(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing) {
                       )")))));
 }
 
-TEST_F(IndexingAnalysisTest,
+TEST_P(IndexingAnalysisTest,
        ComputeGroupedOutputToInputIndexing_VariadicReduce) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
@@ -180,7 +268,7 @@ TEST_F(IndexingAnalysisTest,
                   )")))));
 }
 
-TEST_F(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing_SingleOp) {
+TEST_P(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing_SingleOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -211,7 +299,7 @@ TEST_F(IndexingAnalysisTest, ComputeGroupedOutputToInputIndexing_SingleOp) {
                                                    )")))));
 }
 
-TEST_F(IndexingAnalysisTest,
+TEST_P(IndexingAnalysisTest,
        ComputeGroupedOutputToInputIndexing_StartNotAtRoot) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
@@ -263,7 +351,7 @@ TEST_F(IndexingAnalysisTest,
           )")))));
 }
 
-TEST_F(IndexingAnalysisTest, PhysicalLayoutTestOutputPermutation) {
+TEST_P(IndexingAnalysisTest, PhysicalLayoutTestOutputPermutation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -294,7 +382,7 @@ TEST_F(IndexingAnalysisTest, PhysicalLayoutTestOutputPermutation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, CopyNothing) {
+TEST_P(IndexingAnalysisTest, CopyNothing) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -313,7 +401,7 @@ TEST_F(IndexingAnalysisTest, CopyNothing) {
               MatchIndexingString("operand id = 0 KNOWN EMPTY"));
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeNothing) {
+TEST_P(IndexingAnalysisTest, ReshapeNothing) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -340,7 +428,7 @@ TEST_F(IndexingAnalysisTest, ReshapeNothing) {
             1);
 }
 
-TEST_F(IndexingAnalysisTest, PhysicalLayoutTestInputPermutation) {
+TEST_P(IndexingAnalysisTest, PhysicalLayoutTestInputPermutation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -371,7 +459,7 @@ TEST_F(IndexingAnalysisTest, PhysicalLayoutTestInputPermutation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, PhysicalLayoutTestInputAndOutputPermutation) {
+TEST_P(IndexingAnalysisTest, PhysicalLayoutTestInputAndOutputPermutation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -402,7 +490,7 @@ TEST_F(IndexingAnalysisTest, PhysicalLayoutTestInputAndOutputPermutation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ElementwiseOp) {
+TEST_P(IndexingAnalysisTest, ElementwiseOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -444,7 +532,7 @@ TEST_F(IndexingAnalysisTest, ElementwiseOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, Map) {
+TEST_P(IndexingAnalysisTest, Map) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     mapper {
@@ -491,7 +579,7 @@ TEST_F(IndexingAnalysisTest, Map) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, BitcastIsReshape) {
+TEST_P(IndexingAnalysisTest, BitcastIsReshape) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -509,7 +597,7 @@ TEST_F(IndexingAnalysisTest, BitcastIsReshape) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, BitcastIsTranspose) {
+TEST_P(IndexingAnalysisTest, BitcastIsTranspose) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -528,7 +616,7 @@ TEST_F(IndexingAnalysisTest, BitcastIsTranspose) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, BitcastIsTransposeReshapeTranspose) {
+TEST_P(IndexingAnalysisTest, BitcastIsTransposeReshapeTranspose) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -555,7 +643,7 @@ TEST_F(IndexingAnalysisTest, BitcastIsTransposeReshapeTranspose) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, BroadcastOp) {
+TEST_P(IndexingAnalysisTest, BroadcastOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -583,7 +671,7 @@ TEST_F(IndexingAnalysisTest, BroadcastOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConstantOp) {
+TEST_P(IndexingAnalysisTest, ConstantOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -594,7 +682,7 @@ TEST_F(IndexingAnalysisTest, ConstantOp) {
   EXPECT_THAT(input_indexing.ToString(), IsEmpty());
 }
 
-TEST_F(IndexingAnalysisTest, ConcatenateOp) {
+TEST_P(IndexingAnalysisTest, ConcatenateOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -658,7 +746,7 @@ TEST_F(IndexingAnalysisTest, ConcatenateOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, DynamicSliceOp) {
+TEST_P(IndexingAnalysisTest, DynamicSliceOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -682,11 +770,11 @@ TEST_F(IndexingAnalysisTest, DynamicSliceOp) {
         rt1 in [0, 0],
         rt2 in [0, 226]
       runtime variables:
-        rt0: %of1 = s32[] parameter(1); (d0, d1, d2) -> (),
+        rt0: parameter(1); (d0, d1, d2) -> (),
           domain: d0 in [0, 0], d1 in [0, 1], d2 in [0, 31]
-        rt1: %of2 = s32[] parameter(2); (d0, d1, d2) -> (),
+        rt1: parameter(2); (d0, d1, d2) -> (),
           domain: d0 in [0, 0], d1 in [0, 1], d2 in [0, 31]
-        rt2: %of3 = s32[] parameter(3); (d0, d1, d2) -> (),
+        rt2: parameter(3); (d0, d1, d2) -> (),
           domain: d0 in [0, 0], d1 in [0, 1], d2 in [0, 31]
     operand id = 1
       (d0, d1, d2) -> (),
@@ -709,7 +797,7 @@ TEST_F(IndexingAnalysisTest, DynamicSliceOp) {
     )"));
 }
 
-TEST_F(IndexingAnalysisTest, DynamicUpdateSliceOp) {
+TEST_P(IndexingAnalysisTest, DynamicUpdateSliceOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -735,9 +823,9 @@ TEST_F(IndexingAnalysisTest, DynamicUpdateSliceOp) {
         rt0 in [0, 15],
         rt1 in [0, 20]
       runtime variables:
-        rt0: %of1 = s32[] parameter(2); (d0, d1) -> (),
+        rt0: parameter(2); (d0, d1) -> (),
           domain: d0 in [0, 19], d1 in [0, 29]
-        rt1: %of2 = s32[] parameter(3); (d0, d1) -> (),
+        rt1: parameter(3); (d0, d1) -> (),
           domain: d0 in [0, 19], d1 in [0, 29]
     operand id = 2
       (d0, d1) -> (),
@@ -752,7 +840,7 @@ TEST_F(IndexingAnalysisTest, DynamicUpdateSliceOp) {
       )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithSingleBinaryOp) {
+TEST_P(IndexingAnalysisTest, FusionOpWithSingleBinaryOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -778,7 +866,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithSingleBinaryOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithDot) {
+TEST_P(IndexingAnalysisTest, FusionOpWithDot) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     f {
       p0 = s8[3,12288,6,128]{3,2,1,0} parameter(0)
@@ -900,7 +988,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithDot) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithSoftmax) {
+TEST_P(IndexingAnalysisTest, FusionOpWithSoftmax) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     add_computation {
       p0 = f32[] parameter(0)
@@ -964,7 +1052,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithSoftmax) {
                           )"))));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpTensorPlusTransposedTensor) {
+TEST_P(IndexingAnalysisTest, FusionOpTensorPlusTransposedTensor) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -992,7 +1080,7 @@ TEST_F(IndexingAnalysisTest, FusionOpTensorPlusTransposedTensor) {
                           )"))));
 }
 
-TEST_F(IndexingAnalysisTest, FusionExponentialDuplication) {
+TEST_P(IndexingAnalysisTest, FusionExponentialDuplication) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule test_module
 
@@ -1047,7 +1135,7 @@ TEST_F(IndexingAnalysisTest, FusionExponentialDuplication) {
                           )"))));
 }
 
-TEST_F(IndexingAnalysisTest, GatherOp) {
+TEST_P(IndexingAnalysisTest, GatherOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY main {
@@ -1069,9 +1157,9 @@ TEST_F(IndexingAnalysisTest, GatherOp) {
         rt0 in [0, 26],
         rt1 in [0, 68]
       runtime variables:
-        rt0: %indices = s32[1806,2]{1,0} parameter(1); (d0, d1, d2, d3) -> (d0, 0),
+        rt0: parameter(1); (d0, d1, d2, d3) -> (d0, 0),
           domain: d0 in [0, 1805], d1 in [0, 6], d2 in [0, 7], d3 in [0, 3]
-        rt1: %indices = s32[1806,2]{1,0} parameter(1); (d0, d1, d2, d3) -> (d0, 1),
+        rt1: parameter(1); (d0, d1, d2, d3) -> (d0, 1),
           domain: d0 in [0, 1805], d1 in [0, 6], d2 in [0, 7], d3 in [0, 3]
     operand id = 1
       (d0, d1, d2, d3)[s0] -> (d0, s0),
@@ -1084,7 +1172,7 @@ TEST_F(IndexingAnalysisTest, GatherOp) {
     )"));
 }
 
-TEST_F(IndexingAnalysisTest, GatherOpWithShuffledStartIndexMap) {
+TEST_P(IndexingAnalysisTest, GatherOpWithShuffledStartIndexMap) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY main {
@@ -1106,9 +1194,9 @@ TEST_F(IndexingAnalysisTest, GatherOpWithShuffledStartIndexMap) {
         rt0 in [0, 26],
         rt1 in [0, 68]
       runtime variables:
-        rt0: %indices = s32[1806,2]{1,0} parameter(1); (d0, d1, d2, d3) -> (d0, 1),
+        rt0: parameter(1); (d0, d1, d2, d3) -> (d0, 1),
           domain: d0 in [0, 1805], d1 in [0, 6], d2 in [0, 7], d3 in [0, 3]
-        rt1: %indices = s32[1806,2]{1,0} parameter(1); (d0, d1, d2, d3) -> (d0, 0),
+        rt1: parameter(1); (d0, d1, d2, d3) -> (d0, 0),
           domain: d0 in [0, 1805], d1 in [0, 6], d2 in [0, 7], d3 in [0, 3]
     operand id = 1
       (d0, d1, d2, d3)[s0] -> (d0, s0),
@@ -1121,7 +1209,7 @@ TEST_F(IndexingAnalysisTest, GatherOpWithShuffledStartIndexMap) {
     )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithReduceOfReduce) {
+TEST_P(IndexingAnalysisTest, FusionOpWithReduceOfReduce) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1158,7 +1246,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithReduceOfReduce) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithReduceOfBroadcast) {
+TEST_P(IndexingAnalysisTest, FusionOpWithReduceOfBroadcast) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1195,7 +1283,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithReduceOfBroadcast) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithTransposeOfTranspose) {
+TEST_P(IndexingAnalysisTest, FusionOpWithTransposeOfTranspose) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1230,7 +1318,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithTransposeOfTranspose) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithReducedSlice) {
+TEST_P(IndexingAnalysisTest, FusionOpWithReducedSlice) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1266,7 +1354,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithReducedSlice) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithReshape_CollapseOfExpand) {
+TEST_P(IndexingAnalysisTest, FusionOpWithReshape_CollapseOfExpand) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1287,7 +1375,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithReshape_CollapseOfExpand) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithReshape_ExpandOfCollapse) {
+TEST_P(IndexingAnalysisTest, FusionOpWithReshape_ExpandOfCollapse) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1309,7 +1397,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithReshape_ExpandOfCollapse) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithReshape_ChainedGenericReshapes) {
+TEST_P(IndexingAnalysisTest, FusionOpWithReshape_ChainedGenericReshapes) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1332,7 +1420,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithReshape_ChainedGenericReshapes) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithSliceOfSlice) {
+TEST_P(IndexingAnalysisTest, FusionOpWithSliceOfSlice) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1357,7 +1445,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithSliceOfSlice) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithDynSliceOfDynSlice) {
+TEST_P(IndexingAnalysisTest, FusionOpWithDynSliceOfDynSlice) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1394,11 +1482,11 @@ TEST_F(IndexingAnalysisTest, FusionOpWithDynSliceOfDynSlice) {
         rt2 in [0, 25],
         rt3 in [0, 16]
       runtime variables:
-        rt0: %of21 = s32[] parameter(3); (d0, d1) -> (),
+        rt0: parameter(3); (d0, d1) -> (),
           domain: d0 in [0, 24], d1 in [0, 15]
-        rt1: %of22 = s32[] parameter(4); (d0, d1) -> (),
+        rt1: parameter(4); (d0, d1) -> (),
           domain: d0 in [0, 24], d1 in [0, 15]
-        rt2: %of11 = s32[] parameter(1); (d0, d1){rt0, rt1} -> (),
+        rt2: parameter(1); (d0, d1){rt0, rt1} -> (),
           domain:
             d0 in [0, 24],
             d1 in [0, 15],
@@ -1406,7 +1494,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithDynSliceOfDynSlice) {
             rt1 in [0, 16],
             d0 + rt0 in [0, 49],
             d1 + rt1 in [0, 31]
-        rt3: %of12 = s32[] parameter(2); (d0, d1){rt0, rt1} -> (),
+        rt3: parameter(2); (d0, d1){rt0, rt1} -> (),
           domain:
             d0 in [0, 24], d1 in [0, 15],
             rt0 in [0, 25], rt1 in [0, 16],
@@ -1435,7 +1523,7 @@ TEST_F(IndexingAnalysisTest, FusionOpWithDynSliceOfDynSlice) {
     )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpSliceOfAllConcatenateOpInputs) {
+TEST_P(IndexingAnalysisTest, FusionOpSliceOfAllConcatenateOpInputs) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1476,7 +1564,7 @@ TEST_F(IndexingAnalysisTest, FusionOpSliceOfAllConcatenateOpInputs) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpSliceOfOneOfConcatenateOpInputs) {
+TEST_P(IndexingAnalysisTest, FusionOpSliceOfOneOfConcatenateOpInputs) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1509,7 +1597,7 @@ TEST_F(IndexingAnalysisTest, FusionOpSliceOfOneOfConcatenateOpInputs) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpReshapeOfConcat) {
+TEST_P(IndexingAnalysisTest, FusionOpReshapeOfConcat) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     f {
@@ -1540,7 +1628,7 @@ TEST_F(IndexingAnalysisTest, FusionOpReshapeOfConcat) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, IotaOp) {
+TEST_P(IndexingAnalysisTest, IotaOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1551,7 +1639,7 @@ TEST_F(IndexingAnalysisTest, IotaOp) {
   EXPECT_THAT(input_indexing.indexing_maps, IsEmpty());
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeOpCollapseShape) {
+TEST_P(IndexingAnalysisTest, ReshapeOpCollapseShape) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1567,7 +1655,7 @@ TEST_F(IndexingAnalysisTest, ReshapeOpCollapseShape) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeOpExpandShape) {
+TEST_P(IndexingAnalysisTest, ReshapeOpExpandShape) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1584,7 +1672,7 @@ TEST_F(IndexingAnalysisTest, ReshapeOpExpandShape) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeOpExpandAndCollapseShape) {
+TEST_P(IndexingAnalysisTest, ReshapeOpExpandAndCollapseShape) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1613,7 +1701,7 @@ TEST_F(IndexingAnalysisTest, ReshapeOpExpandAndCollapseShape) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeOpExpandSubshapeOnly) {
+TEST_P(IndexingAnalysisTest, ReshapeOpExpandSubshapeOnly) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1631,7 +1719,7 @@ TEST_F(IndexingAnalysisTest, ReshapeOpExpandSubshapeOnly) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeOpGenericReshape2DTo3D) {
+TEST_P(IndexingAnalysisTest, ReshapeOpGenericReshape2DTo3D) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1649,7 +1737,7 @@ TEST_F(IndexingAnalysisTest, ReshapeOpGenericReshape2DTo3D) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReshapeOpGenericReshape3DTo2D) {
+TEST_P(IndexingAnalysisTest, ReshapeOpGenericReshape3DTo2D) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1668,7 +1756,7 @@ TEST_F(IndexingAnalysisTest, ReshapeOpGenericReshape3DTo2D) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, PadOp) {
+TEST_P(IndexingAnalysisTest, PadOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1692,7 +1780,7 @@ TEST_F(IndexingAnalysisTest, PadOp) {
                                 )"));
 }
 
-TEST_F(IndexingAnalysisTest, PadOpNoInterior) {
+TEST_P(IndexingAnalysisTest, PadOpNoInterior) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -1715,7 +1803,7 @@ TEST_F(IndexingAnalysisTest, PadOpNoInterior) {
                                 )"));
 }
 
-TEST_F(IndexingAnalysisTest, PadOpNegativePadding) {
+TEST_P(IndexingAnalysisTest, PadOpNegativePadding) {
   // The interior padding is applied first (even with negative padding), so we
   // get a size of 5 (7 + 6 - 8).
   // in:     0 1 2 3 4 5 6
@@ -1742,7 +1830,7 @@ TEST_F(IndexingAnalysisTest, PadOpNegativePadding) {
                                 )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceOp) {
+TEST_P(IndexingAnalysisTest, ReduceOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1793,7 +1881,7 @@ TEST_F(IndexingAnalysisTest, ReduceOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, VariadicReduceOp) {
+TEST_P(IndexingAnalysisTest, VariadicReduceOp) {
   HloInstruction* root = ParseAndGetRoot(R"(
     HloModule m
     min {
@@ -1895,7 +1983,7 @@ TEST_F(IndexingAnalysisTest, VariadicReduceOp) {
                   ElementsAre(MatchOperandIndexing(kInitToOutputIndexing))));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_NoPadding) {
+TEST_P(IndexingAnalysisTest, ReduceWindowOp_NoPadding) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1926,7 +2014,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_NoPadding) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_4DWithTrivalDims_NoPadding) {
+TEST_P(IndexingAnalysisTest, ReduceWindowOp_4DWithTrivalDims_NoPadding) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1961,7 +2049,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_4DWithTrivalDims_NoPadding) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_PaddingAndWindowStride) {
+TEST_P(IndexingAnalysisTest, ReduceWindowOp_PaddingAndWindowStride) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -1995,7 +2083,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_PaddingAndWindowStride) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_BaseDilation) {
+TEST_P(IndexingAnalysisTest, ReduceWindowOp_BaseDilation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -2027,7 +2115,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_BaseDilation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_WindowDilation) {
+TEST_P(IndexingAnalysisTest, ReduceWindowOp_WindowDilation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     max {
@@ -2058,7 +2146,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_WindowDilation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReduceWindowOp_Variadic) {
+TEST_P(IndexingAnalysisTest, ReduceWindowOp_Variadic) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     combiner {
@@ -2136,7 +2224,7 @@ TEST_F(IndexingAnalysisTest, ReduceWindowOp_Variadic) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_NoPadding) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_NoPadding) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2171,7 +2259,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_NoPadding) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_4DWithTrivialDims_NoPadding) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_4DWithTrivialDims_NoPadding) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2208,7 +2296,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_4DWithTrivialDims_NoPadding) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_PaddingAndWindowStride) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_PaddingAndWindowStride) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2245,7 +2333,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_PaddingAndWindowStride) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_LhsDilation) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_LhsDilation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2282,7 +2370,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_LhsDilation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_RhsDilation) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_RhsDilation) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2317,7 +2405,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_RhsDilation) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_FeatureGroups) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_FeatureGroups) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2352,7 +2440,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_FeatureGroups) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ConvolutionOp_BatchGroups) {
+TEST_P(IndexingAnalysisTest, ConvolutionOp_BatchGroups) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2388,7 +2476,7 @@ TEST_F(IndexingAnalysisTest, ConvolutionOp_BatchGroups) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReverseOp) {
+TEST_P(IndexingAnalysisTest, ReverseOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2419,7 +2507,7 @@ TEST_F(IndexingAnalysisTest, ReverseOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, ReverseReshape) {
+TEST_P(IndexingAnalysisTest, ReverseReshape) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     fused_computation {
@@ -2444,7 +2532,7 @@ TEST_F(IndexingAnalysisTest, ReverseReshape) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, SliceOp) {
+TEST_P(IndexingAnalysisTest, SliceOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2479,7 +2567,7 @@ TEST_F(IndexingAnalysisTest, SliceOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, TransposeOp) {
+TEST_P(IndexingAnalysisTest, TransposeOp) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2508,7 +2596,7 @@ TEST_F(IndexingAnalysisTest, TransposeOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, TransposeOp4D) {
+TEST_P(IndexingAnalysisTest, TransposeOp4D) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2527,7 +2615,7 @@ TEST_F(IndexingAnalysisTest, TransposeOp4D) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, DotOp) {
+TEST_P(IndexingAnalysisTest, DotOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2564,7 +2652,7 @@ TEST_F(IndexingAnalysisTest, DotOp) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, ScaledDotOp) {
+TEST_P(IndexingAnalysisTest, ScaledDotOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2605,7 +2693,7 @@ TEST_F(IndexingAnalysisTest, ScaledDotOp) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, UnsupportedOps) {
+TEST_P(IndexingAnalysisTest, UnsupportedOps) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2633,7 +2721,7 @@ TEST_F(IndexingAnalysisTest, UnsupportedOps) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionWithUnsupportedOp) {
+TEST_P(IndexingAnalysisTest, FusionWithUnsupportedOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     fused_computation {
@@ -2670,7 +2758,7 @@ TEST_F(IndexingAnalysisTest, FusionWithUnsupportedOp) {
                           )"));
 }
 
-TEST_F(IndexingAnalysisTest, EpilogueIndexing) {
+TEST_P(IndexingAnalysisTest, EpilogueIndexing) {
   auto module = ParseAndReturnVerifiedModule(R"(
     HloModule m
     fused_computation {
@@ -2703,7 +2791,7 @@ TEST_F(IndexingAnalysisTest, EpilogueIndexing) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, EpilogueIndexing_NoEpilogue) {
+TEST_P(IndexingAnalysisTest, EpilogueIndexing_NoEpilogue) {
   auto module = ParseAndReturnVerifiedModule(R"(
     HloModule m
     fused_computation {
@@ -2732,7 +2820,7 @@ TEST_F(IndexingAnalysisTest, EpilogueIndexing_NoEpilogue) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, BroadcastingElementwise) {
+TEST_P(IndexingAnalysisTest, BroadcastingElementwise) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m
     ENTRY e {
@@ -2758,7 +2846,7 @@ TEST_F(IndexingAnalysisTest, BroadcastingElementwise) {
               )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionWithRTVarsSimplification_ScalarConstant) {
+TEST_P(IndexingAnalysisTest, FusionWithRTVarsSimplification_ScalarConstant) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"hlo(
       HloModule m
       fused_computation {
@@ -2781,7 +2869,7 @@ TEST_F(IndexingAnalysisTest, FusionWithRTVarsSimplification_ScalarConstant) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest,
+TEST_P(IndexingAnalysisTest,
        FusionWithRTVarsSimplification_ScalarConstantOutsideOfRangeIsKept) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"hlo(
       HloModule m
@@ -2804,12 +2892,12 @@ TEST_F(IndexingAnalysisTest,
         d0 in [0, 9],
         rt0 in [0, 90]
       runtime variables:
-        rt0: %offset = s64[] constant(99); (d0) -> (),
+        rt0: constant(99); (d0) -> (),
           domain: d0 in [0, 9]
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionWithRTVarsSimplification_Iota) {
+TEST_P(IndexingAnalysisTest, FusionWithRTVarsSimplification_Iota) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"hlo(
       HloModule m
       fused_computation {
@@ -2837,7 +2925,7 @@ TEST_F(IndexingAnalysisTest, FusionWithRTVarsSimplification_Iota) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionWithRTVarsSimplification_IotaAsConstant) {
+TEST_P(IndexingAnalysisTest, FusionWithRTVarsSimplification_IotaAsConstant) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"hlo(
       HloModule m
       fused_computation {
@@ -2865,7 +2953,7 @@ TEST_F(IndexingAnalysisTest, FusionWithRTVarsSimplification_IotaAsConstant) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, FusionOpWithPadAndDynamicSlice) {
+TEST_P(IndexingAnalysisTest, FusionOpWithPadAndDynamicSlice) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"hlo(
       HloModule m
       fused_computation {
@@ -2884,7 +2972,10 @@ TEST_F(IndexingAnalysisTest, FusionOpWithPadAndDynamicSlice) {
           calls=fused_computation
       }
     )hlo"));
-  EXPECT_THAT(input_indexing.ToString(), MatchIndexingString(R"(
+  // HLO uses instruction name, StableHLO uses op name
+  EXPECT_THAT(input_indexing.ToString(),
+              MatchIndexingString(
+                  absl::StrFormat(R"(
     operand id = 0
       (d0, d1){rt0} -> (0, d1 + rt0 - 4096),
       domain:
@@ -2893,17 +2984,18 @@ TEST_F(IndexingAnalysisTest, FusionOpWithPadAndDynamicSlice) {
         rt0 in [0, 4096],
         d1 + rt0 in [4096, 8191]
       runtime variables:
-        rt0: %bitcast.4 = s32[] bitcast(%slice); (d0, d1) -> (),
+        rt0: %s; (d0, d1) -> (),
           domain: d0 in [0, 0], d1 in [0, 4095]
     operand id = 1
       (d0, d1) -> (0),
       domain:
         d0 in [0, 0],
         d1 in [0, 4095]
-  )"));
+    )",
+                                  GetParam() ? "mhlo.bitcast" : "bitcast.4")));
 }
 
-TEST_F(IndexingAnalysisTest, NestedDotFusionWithDynamicUpdateSlice) {
+TEST_P(IndexingAnalysisTest, NestedDotFusionWithDynamicUpdateSlice) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule t
 
@@ -2946,7 +3038,7 @@ TEST_F(IndexingAnalysisTest, NestedDotFusionWithDynamicUpdateSlice) {
     operand id = 1 (d0, d1)[s0]{rt0} -> (rt0, d1, s0),
       domain: d0 in [0, 3], d1 in [0, 4], s0 in [0, 1], rt0 in [0, 3]
     runtime variables:
-      rt0: %p1 = s32[] parameter(1);
+      rt0: parameter(1);
         (d0, d1)[s0] -> (), domain: d0 in [0, 3], d1 in [0, 4], s0 in [0, 1]
     operand id = 2 (d0, d1) -> (),
       domain: d0 in [0, 3], d1 in [0, 4]
@@ -2957,7 +3049,7 @@ TEST_F(IndexingAnalysisTest, NestedDotFusionWithDynamicUpdateSlice) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, AllGatherOp) {
+TEST_P(IndexingAnalysisTest, AllGatherOp) {
   auto input_indexing = GetOutputToInputIndexing(ParseAndGetRoot(R"(
     HloModule m, replica_count=4
     ENTRY e {
@@ -2980,7 +3072,7 @@ TEST_F(IndexingAnalysisTest, AllGatherOp) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, AllGatherFusionWithTranspose) {
+TEST_P(IndexingAnalysisTest, AllGatherFusionWithTranspose) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
 
@@ -3019,7 +3111,7 @@ TEST_F(IndexingAnalysisTest, AllGatherFusionWithTranspose) {
   )"));
 }
 
-TEST_F(IndexingAnalysisTest, AllGatherFusionWithReshape) {
+TEST_P(IndexingAnalysisTest, AllGatherFusionWithReshape) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
 
@@ -3063,7 +3155,7 @@ TEST_F(IndexingAnalysisTest, AllGatherFusionWithReshape) {
   )")));
 }
 
-TEST_F(IndexingAnalysisTest, ChainedAllGatherFusion) {
+TEST_P(IndexingAnalysisTest, ChainedAllGatherFusion) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
 
@@ -3086,7 +3178,7 @@ TEST_F(IndexingAnalysisTest, ChainedAllGatherFusion) {
               ElementsAre(UndefinedOperandIndexing()));
 }
 
-TEST_F(IndexingAnalysisTest, AllGatherDotFusion_GatherNonContractingDim) {
+TEST_P(IndexingAnalysisTest, AllGatherDotFusion_GatherNonContractingDim) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
 
@@ -3121,7 +3213,7 @@ TEST_F(IndexingAnalysisTest, AllGatherDotFusion_GatherNonContractingDim) {
   )")));
 }
 
-TEST_F(IndexingAnalysisTest, AllGatherDotFusion_GatherContractingDim) {
+TEST_P(IndexingAnalysisTest, AllGatherDotFusion_GatherContractingDim) {
   auto root = ParseAndGetRoot(R"(
     HloModule m
 
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis_utils.cc b/third_party/xla/xla/hlo/analysis/indexing_analysis_utils.cc
new file mode 100644
index 00000000000000..ba37bf74e778e7
--- /dev/null
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis_utils.cc
@@ -0,0 +1,340 @@
+/* Copyright 2023 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/analysis/indexing_analysis_utils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/types/span.h"
+#include "llvm/ADT/SmallVector.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/MLIRContext.h"
+#include "xla/hlo/analysis/indexing_analysis.h"
+#include "xla/hlo/analysis/indexing_map.h"
+#include "xla/hlo/analysis/interval.h"
+#include "xla/shape.h"
+
+namespace xla {
+
+using llvm::SmallVector;
+using mlir::AffineExpr;
+using mlir::AffineMap;
+using mlir::getAffineConstantExpr;
+using mlir::getAffineDimExpr;
+using mlir::getAffineSymbolExpr;
+using mlir::MLIRContext;
+
+IndexingMap ComputeBroadcastIndexingMap(
+    absl::Span<const int64_t> output_dims,
+    absl::Span<const int64_t> broadcast_dims, MLIRContext* mlir_context) {
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(broadcast_dims.size());
+  for (int64_t bcast_dim : broadcast_dims) {
+    exprs.push_back(getAffineDimExpr(bcast_dim, mlir_context));
+  }
+  return IndexingMap::FromTensorSizes(
+      AffineMap::get(output_dims.size(), /*symbolCount=*/0, exprs,
+                     mlir_context),
+      output_dims, {});
+}
+
+IndexingMap ComputeSliceIndexingMap(absl::Span<const int64_t> output_shape_dims,
+                                    absl::Span<const int64_t> slice_starts,
+                                    absl::Span<const int64_t> slice_strides,
+                                    mlir::MLIRContext* mlir_context) {
+  auto rank = output_shape_dims.size();
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(rank);
+  for (int64_t dim = 0; dim < rank; ++dim) {
+    AffineExpr dim_expr = getAffineDimExpr(dim, mlir_context);
+    exprs.push_back(dim_expr * slice_strides[dim] + slice_starts[dim]);
+  }
+  return IndexingMap::FromTensorSizes(
+      AffineMap::get(rank, /*symbolCount=*/0, exprs, mlir_context),
+      output_shape_dims, {});
+}
+
+IndexingMap ComputeReverseIndexingMap(
+    absl::Span<const int64_t> output_shape_dims,
+    absl::Span<const int64_t> reverse_dims, mlir::MLIRContext* mlir_context) {
+  absl::flat_hash_set<int64_t> reverse_dims_set(reverse_dims.begin(),
+                                                reverse_dims.end());
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(output_shape_dims.size());
+  for (auto [output_dim_id, output_dim] : llvm::enumerate(output_shape_dims)) {
+    auto dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
+    exprs.push_back(reverse_dims_set.contains(output_dim_id)
+                        ? -dim_expr + output_dim - 1
+                        : dim_expr);
+  }
+  return IndexingMap::FromTensorSizes(
+      AffineMap::get(output_shape_dims.size(), /*symbolCount=*/0, exprs,
+                     mlir_context),
+      output_shape_dims, {});
+}
+
+HloInstructionIndexing ComputeConcatenateIndexing(
+    int64_t rank, int64_t concat_dim, absl::Span<const int64_t> output_dims,
+    const std::vector<int64_t>& operand_concat_dim_sizes,
+    mlir::MLIRContext* mlir_context) {
+  mlir::MutableAffineMap affine_map =
+      AffineMap::getMultiDimIdentityMap(rank, mlir_context);
+  std::vector<IndexingMap::Variable> dim_vars =
+      DimVarsFromTensorSizes(output_dims);
+
+  HloInstructionIndexing concat_indexing;
+  concat_indexing.indexing_maps.resize(operand_concat_dim_sizes.size());
+  AffineExpr concat_dim_expr = getAffineDimExpr(concat_dim, mlir_context);
+  int64_t offset = 0;
+  for (const auto [operand_id, operand_concat_dim] :
+       llvm::enumerate(operand_concat_dim_sizes)) {
+    affine_map.setResult(concat_dim, concat_dim_expr - offset);
+    dim_vars[concat_dim] =
+        IndexingMap::Variable{{offset, offset + operand_concat_dim - 1}};
+    concat_indexing.indexing_maps[operand_id].insert(
+        OperandIndexing(IndexingMap(affine_map.getAffineMap(), dim_vars,
+                                    /*range_vars=*/{}, /*rt_vars=*/{})));
+    offset += operand_concat_dim;
+  }
+  return concat_indexing;
+}
+
+std::pair<IndexingMap, IndexingMap> ComputeDotOperandsIndexing(
+    absl::Span<const int64_t> lhs_dims, absl::Span<const int64_t> rhs_dims,
+    absl::Span<const int64_t> output_dims,
+    absl::Span<const int64_t> lhs_batch_dims,
+    absl::Span<const int64_t> rhs_batch_dims,
+    absl::Span<const int64_t> lhs_contracting_dims,
+    absl::Span<const int64_t> rhs_contracting_dims, MLIRContext* mlir_context) {
+  SmallVector<AffineExpr> lhs_exprs(lhs_dims.size());
+  SmallVector<AffineExpr> rhs_exprs(rhs_dims.size());
+  int64_t output_dim_id = 0;
+
+  // Batch dimensions
+  for (auto [lhs_batch_dim, rhs_batch_dim] :
+       llvm::zip(lhs_batch_dims, rhs_batch_dims)) {
+    AffineExpr output_dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
+    lhs_exprs[lhs_batch_dim] = output_dim_expr;
+    rhs_exprs[rhs_batch_dim] = output_dim_expr;
+    ++output_dim_id;
+  }
+
+  // LHS non-contracting dims
+  absl::flat_hash_set<int64_t> lhs_batch_set(lhs_batch_dims.begin(),
+                                             lhs_batch_dims.end());
+  absl::flat_hash_set<int64_t> lhs_contracting_set(lhs_contracting_dims.begin(),
+                                                   lhs_contracting_dims.end());
+  for (int64_t i = 0; i < lhs_dims.size(); ++i) {
+    if (!lhs_batch_set.contains(i) && !lhs_contracting_set.contains(i)) {
+      lhs_exprs[i] = getAffineDimExpr(output_dim_id++, mlir_context);
+    }
+  }
+
+  // RHS non-contracting dims
+  absl::flat_hash_set<int64_t> rhs_batch_set(rhs_batch_dims.begin(),
+                                             rhs_batch_dims.end());
+  absl::flat_hash_set<int64_t> rhs_contracting_set(rhs_contracting_dims.begin(),
+                                                   rhs_contracting_dims.end());
+  for (int64_t i = 0; i < rhs_dims.size(); ++i) {
+    if (!rhs_batch_set.contains(i) && !rhs_contracting_set.contains(i)) {
+      rhs_exprs[i] = getAffineDimExpr(output_dim_id++, mlir_context);
+    }
+  }
+
+  // Contracting dimensions (as symbols)
+  int64_t symbol_id = 0;
+  std::vector<int64_t> symbol_sizes;
+  symbol_sizes.reserve(lhs_contracting_dims.size());
+  for (auto [lhs_contract, rhs_contract] :
+       llvm::zip(lhs_contracting_dims, rhs_contracting_dims)) {
+    AffineExpr symbol_expr = getAffineSymbolExpr(symbol_id, mlir_context);
+    lhs_exprs[lhs_contract] = symbol_expr;
+    rhs_exprs[rhs_contract] = symbol_expr;
+    symbol_sizes.push_back(lhs_dims[lhs_contract]);
+    ++symbol_id;
+  }
+
+  int64_t output_rank = output_dims.size();
+  return std::make_pair(IndexingMap::FromTensorSizes(
+                            AffineMap::get(output_rank, symbol_sizes.size(),
+                                           lhs_exprs, mlir_context),
+                            output_dims, symbol_sizes),
+                        IndexingMap::FromTensorSizes(
+                            AffineMap::get(output_rank, symbol_sizes.size(),
+                                           rhs_exprs, mlir_context),
+                            output_dims, symbol_sizes));
+}
+
+IndexingMap ComputeReduceInputIndexingMap(absl::Span<const int64_t> input_dims,
+                                          absl::Span<const int64_t> output_dims,
+                                          absl::Span<const int64_t> reduce_dims,
+                                          MLIRContext* mlir_context) {
+  absl::flat_hash_set<int64_t> reduce_dims_set(reduce_dims.begin(),
+                                               reduce_dims.end());
+  std::vector<int64_t> parallel_dims_sizes;
+  int64_t output_dim_id = 0;
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(input_dims.size());
+
+  for (auto [input_dim_id, input_dim] : llvm::enumerate(input_dims)) {
+    if (reduce_dims_set.contains(input_dim_id)) {
+      exprs.push_back(
+          getAffineSymbolExpr(parallel_dims_sizes.size(), mlir_context));
+      parallel_dims_sizes.push_back(input_dim);
+      continue;
+    }
+    exprs.push_back(getAffineDimExpr(output_dim_id++, mlir_context));
+  }
+
+  return IndexingMap::FromTensorSizes(
+      AffineMap::get(output_dims.size(), reduce_dims_set.size(), exprs,
+                     mlir_context),
+      output_dims, parallel_dims_sizes);
+}
+
+IndexingMap ComputePadIndexingMap(absl::Span<const int64_t> output_dims,
+                                  absl::Span<const int64_t> padding_low,
+                                  absl::Span<const int64_t> padding_high,
+                                  absl::Span<const int64_t> padding_interior,
+                                  MLIRContext* mlir_context) {
+  int64_t output_rank = output_dims.size();
+
+  std::vector<AffineExpr> exprs;
+  std::vector<std::pair<AffineExpr, Interval>> constraints;
+  std::vector<IndexingMap::Variable> dim_vars;
+  exprs.reserve(output_rank);
+  constraints.reserve(output_rank);
+  int64_t output_dim_id = 0;
+  for (const auto [output_dim, pad_low, pad_high, pad_interior] :
+       llvm::zip(output_dims, padding_low, padding_high, padding_interior)) {
+    AffineExpr dim_expr = getAffineDimExpr(output_dim_id, mlir_context);
+    dim_vars.push_back({IndexingMap::Variable{
+        std::max(int64_t{0}, pad_low),
+        std::min(output_dim - 1, output_dim - 1 - pad_high)}});
+    if (pad_interior == 0) {
+      exprs.push_back(dim_expr - pad_low);
+    } else {
+      exprs.push_back((dim_expr - pad_low).floorDiv(pad_interior + 1));
+      constraints.push_back(
+          {(dim_expr - pad_low) % (pad_interior + 1), Interval{0, 0}});
+    }
+    ++output_dim_id;
+  }
+  return IndexingMap{
+      AffineMap::get(output_rank, /*symbolCount=*/0, exprs, mlir_context),
+      std::move(dim_vars),
+      /*range_vars = */ {},
+      /*rt_vars = */ {}, absl::MakeSpan(constraints)};
+}
+
+IndexingMap ComposeWindowIndexingMap(absl::Span<const int64_t> input_dims,
+                                     absl::Span<const int64_t> output_dims,
+                                     absl::Span<const int64_t> window_dims,
+                                     absl::Span<const int64_t> window_strides,
+                                     absl::Span<const int64_t> window_dilations,
+                                     absl::Span<const int64_t> base_dilations,
+                                     absl::Span<const int64_t> padding,
+                                     MLIRContext* mlir_context) {
+  size_t rank = input_dims.size();
+
+  // Compute shape of the padded input and the indexing map of pad op required
+  // to pad the input.
+  SmallVector<int64_t> padding_low, padding_high, padding_interior,
+      padded_input_dimensions;
+  SmallVector<AffineExpr, 4> exprs;
+  std::vector<IndexingMap::Variable> dim_vars;
+  std::vector<IndexingMap::Variable> range_vars;
+  exprs.reserve(rank);
+  dim_vars.reserve(rank);
+  range_vars.reserve(rank);
+
+  for (size_t dim_id = 0; dim_id < rank; ++dim_id) {
+    int64_t pad_low = padding[dim_id * 2];
+    int64_t pad_high = padding[dim_id * 2 + 1];
+    int64_t base_dilation = base_dilations[dim_id];
+    int64_t window_dilation = window_dilations[dim_id];
+    int64_t window_stride = window_strides[dim_id];
+    int64_t output_dim = output_dims[dim_id];
+    int64_t window_dim = window_dims[dim_id];
+    int64_t input_dim_size = input_dims[dim_id];
+
+    padding_low.push_back(pad_low);
+    padding_high.push_back(pad_high);
+    // For some reason interior_padding in HLO pad is offset from base_dilations
+    // in HLO reduce-window by 1.
+    padding_interior.push_back(base_dilation - 1);
+    padded_input_dimensions.push_back(input_dim_size + pad_low + pad_high +
+                                      (input_dim_size - 1) *
+                                          (base_dilation - 1));
+    AffineExpr dim_expr = getAffineDimExpr(dim_id, mlir_context);
+    AffineExpr symbol_expr = getAffineSymbolExpr(dim_id, mlir_context);
+
+    exprs.push_back(symbol_expr * window_dilation + window_stride * dim_expr);
+    dim_vars.push_back({IndexingMap::Variable{0, output_dim - 1}});
+    range_vars.push_back({IndexingMap::Variable{0, window_dim - 1}});
+  }
+  // Indexing map for pad op that pads the input.
+  IndexingMap padded_input_indexing =
+      ComputePadIndexingMap(padded_input_dimensions, padding_low, padding_high,
+                            padding_interior, mlir_context);
+  // Indexing map for reduce-window, that does not do any padding.
+  IndexingMap input_indexing_no_padding(
+      AffineMap::get(rank, rank, exprs, mlir_context), dim_vars, range_vars,
+      /*rt_vars=*/{});
+
+  // Composed indexing.
+  IndexingMap result =
+      ComposeIndexingMaps(input_indexing_no_padding, padded_input_indexing);
+  result.Simplify();
+  result.RemoveUnusedSymbols();
+  return result;
+}
+
+HloInstructionIndexing CreateElementwiseIndexing(int64_t num_operands,
+                                                 const Shape& output_shape,
+                                                 MLIRContext* mlir_context) {
+  IndexingMap identity_map = IndexingMap::FromTensorSizes(
+      AffineMap::getMultiDimIdentityMap(output_shape.dimensions().size(),
+                                        mlir_context),
+      output_shape.dimensions(), {});
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(num_operands);
+  for (int64_t i = 0; i < num_operands; ++i) {
+    indexing.indexing_maps[i].insert(OperandIndexing{identity_map});
+  }
+  return indexing;
+}
+
+IndexingMap CreateScalarIndexingMap(const Shape& output_shape,
+                                    MLIRContext* mlir_context) {
+  return IndexingMap::FromTensorSizes(
+      AffineMap::get(output_shape.dimensions().size(), /*symbolCount=*/0, {},
+                     mlir_context),
+      output_shape.dimensions(), /*symbol_upper_bounds=*/{});
+}
+
+AffineMap ComputeTransposeIndexingMap(absl::Span<const int64_t> permutation,
+                                      MLIRContext* mlir_context) {
+  return AffineMap::getPermutationMap(
+      std::vector<unsigned>(permutation.begin(), permutation.end()),
+      mlir_context);
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/analysis/indexing_analysis_utils.h b/third_party/xla/xla/hlo/analysis/indexing_analysis_utils.h
new file mode 100644
index 00000000000000..d6b84d0d0b82f9
--- /dev/null
+++ b/third_party/xla/xla/hlo/analysis/indexing_analysis_utils.h
@@ -0,0 +1,102 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_HLO_ANALYSIS_INDEXING_ANALYSIS_UTILS_H_
+#define XLA_HLO_ANALYSIS_INDEXING_ANALYSIS_UTILS_H_
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/MLIRContext.h"
+#include "xla/hlo/analysis/indexing_map.h"
+#include "xla/shape.h"
+
+namespace xla {
+
+struct HloInstructionIndexing;
+
+// Computes the indexing map for a Pad operation.
+IndexingMap ComputePadIndexingMap(absl::Span<const int64_t> output_dims,
+                                  absl::Span<const int64_t> padding_low,
+                                  absl::Span<const int64_t> padding_high,
+                                  absl::Span<const int64_t> padding_interior,
+                                  mlir::MLIRContext* mlir_context);
+
+// Computes the indexing map for a window-based operation (e.g. ReduceWindow,
+// Convolution).
+IndexingMap ComposeWindowIndexingMap(absl::Span<const int64_t> input_dims,
+                                     absl::Span<const int64_t> output_dims,
+                                     absl::Span<const int64_t> window_dims,
+                                     absl::Span<const int64_t> window_strides,
+                                     absl::Span<const int64_t> window_dilations,
+                                     absl::Span<const int64_t> base_dilations,
+                                     absl::Span<const int64_t> padding,
+                                     mlir::MLIRContext* mlir_context);
+
+// Creates an elementwise indexing for num_operands operands with the given
+// output shape. All operands use an identity mapping.
+HloInstructionIndexing CreateElementwiseIndexing(
+    int64_t num_operands, const Shape& output_shape,
+    mlir::MLIRContext* mlir_context);
+
+// Creates a scalar (empty) indexing map for the given output shape.
+// Used for scalar operands like init values or padding values.
+IndexingMap CreateScalarIndexingMap(const Shape& output_shape,
+                                    mlir::MLIRContext* mlir_context);
+
+IndexingMap ComputeBroadcastIndexingMap(
+    absl::Span<const int64_t> output_dims,
+    absl::Span<const int64_t> broadcast_dims, mlir::MLIRContext* mlir_context);
+
+IndexingMap ComputeSliceIndexingMap(absl::Span<const int64_t> output_shape_dims,
+                                    absl::Span<const int64_t> slice_starts,
+                                    absl::Span<const int64_t> slice_strides,
+                                    mlir::MLIRContext* mlir_context);
+
+IndexingMap ComputeReverseIndexingMap(
+    absl::Span<const int64_t> output_shape_dims,
+    absl::Span<const int64_t> reverse_dims, mlir::MLIRContext* mlir_context);
+
+mlir::AffineMap ComputeTransposeIndexingMap(
+    absl::Span<const int64_t> permutation, mlir::MLIRContext* mlir_context);
+
+HloInstructionIndexing ComputeConcatenateIndexing(
+    int64_t rank, int64_t concat_dim, absl::Span<const int64_t> output_dims,
+    const std::vector<int64_t>& operand_concat_dim_sizes,
+    mlir::MLIRContext* mlir_context);
+
+// Computes indexing maps for DotGeneral operands.
+// Returns a pair of (lhs_indexing_map, rhs_indexing_map).
+std::pair<IndexingMap, IndexingMap> ComputeDotOperandsIndexing(
+    absl::Span<const int64_t> lhs_dims, absl::Span<const int64_t> rhs_dims,
+    absl::Span<const int64_t> output_dims,
+    absl::Span<const int64_t> lhs_batch_dims,
+    absl::Span<const int64_t> rhs_batch_dims,
+    absl::Span<const int64_t> lhs_contracting_dims,
+    absl::Span<const int64_t> rhs_contracting_dims,
+    mlir::MLIRContext* mlir_context);
+
+// Computes indexing map for reduce input operands.
+IndexingMap ComputeReduceInputIndexingMap(absl::Span<const int64_t> input_dims,
+                                          absl::Span<const int64_t> output_dims,
+                                          absl::Span<const int64_t> reduce_dims,
+                                          mlir::MLIRContext* mlir_context);
+
+}  // namespace xla
+
+#endif  // XLA_HLO_ANALYSIS_INDEXING_ANALYSIS_UTILS_H_
diff --git a/third_party/xla/xla/hlo/analysis/indexing_map_test.cc b/third_party/xla/xla/hlo/analysis/indexing_map_test.cc
index 98283eb8d7fe44..23e519deac9c01 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_map_test.cc
+++ b/third_party/xla/xla/hlo/analysis/indexing_map_test.cc
@@ -35,8 +35,6 @@ limitations under the License.
 #include "xla/hlo/analysis/interval.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/verified_hlo_module.h"
-#include "tsl/platform/statusor.h"
-#include "tsl/platform/test.h"
 
 namespace xla {
 namespace {
@@ -1456,8 +1454,8 @@ TEST_F(IndexingMapTest, RangeVarSupportsAbslHashAndEqAndNe) {
 }
 
 TEST_F(IndexingMapTest, RTVarSupportsAbslHashAndEqAndNe) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
-                          ParseAndReturnVerifiedModule(R"(
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> hlo_module,
+                       ParseAndReturnVerifiedModule(R"(
                             HloModule m
                             ENTRY e {
                               ROOT %constant = s64[] constant(42)
diff --git a/third_party/xla/xla/hlo/analysis/indexing_test_utils.h b/third_party/xla/xla/hlo/analysis/indexing_test_utils.h
index dc53be5c8ce0db..d14e6d621ce6e8 100644
--- a/third_party/xla/xla/hlo/analysis/indexing_test_utils.h
+++ b/third_party/xla/xla/hlo/analysis/indexing_test_utils.h
@@ -70,13 +70,19 @@ class IndexingTestBase : public HloHardwareIndependentTestBase {
  public:
   HloInstruction* ParseAndGetRoot(absl::string_view hlo_string);
 
-  HloInstructionIndexing GetOutputToInputIndexing(
-      const HloInstruction* instr, int output_id = 0,
-      bool use_physical_layout = false);
+  virtual HloInstructionIndexing GetOutputToInputIndexing(
+      const HloInstruction* instr, int output_id, bool use_physical_layout);
+  HloInstructionIndexing GetOutputToInputIndexing(const HloInstruction* instr,
+                                                  int output_id = 0) {
+    return GetOutputToInputIndexing(instr, output_id, false);
+  }
 
-  HloInstructionIndexing GetInputToOutputIndexing(
-      const HloInstruction* instr, int input_id = 0,
-      bool use_physical_layout = false);
+  virtual HloInstructionIndexing GetInputToOutputIndexing(
+      const HloInstruction* instr, int input_id, bool use_physical_layout);
+  HloInstructionIndexing GetInputToOutputIndexing(const HloInstruction* instr,
+                                                  int input_id = 0) {
+    return GetInputToOutputIndexing(instr, input_id, false);
+  }
 
   mlir::MLIRContext mlir_context_;
   std::unique_ptr<VerifiedHloModule> module_;
diff --git a/third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.cc b/third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.cc
new file mode 100644
index 00000000000000..ee547bb52b8ea9
--- /dev/null
+++ b/third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.cc
@@ -0,0 +1,1009 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/analysis/stablehlo_indexing_analysis.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/types/span.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Casting.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Support/LLVM.h"
+#include "stablehlo/dialect/StablehloOps.h"  // IWYU pragma: keep
+#include "xla/hlo/analysis/indexing_analysis.h"
+#include "xla/hlo/analysis/indexing_analysis_utils.h"
+#include "xla/hlo/analysis/indexing_map.h"
+#include "xla/hlo/analysis/interval.h"
+#include "xla/layout_util.h"
+#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"  // IWYU pragma: keep
+#include "xla/permutation_util.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+
+namespace xla {
+namespace {
+
+using namespace ::mlir::stablehlo;  // NOLINT
+namespace mhlo = ::mlir::mhlo;
+
+using ::llvm::ArrayRef;
+using ::llvm::enumerate;
+using ::mlir::AffineExpr;
+using ::mlir::AffineMap;
+using ::mlir::BlockArgument;
+using ::mlir::DenseIntElementsAttr;
+using ::mlir::dyn_cast;
+using ::mlir::MLIRContext;
+using ::mlir::Operation;
+using ::mlir::RankedTensorType;
+using ::mlir::SmallVector;
+using ::mlir::Value;
+
+HloInstructionIndexing CreateUnknownIndexing(int64_t count) {
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(count);
+  for (int64_t i = 0; i < count; ++i) {
+    indexing.indexing_maps[i].insert(
+        OperandIndexing{IndexingMap::GetUndefined()});
+  }
+  return indexing;
+}
+
+Shape GetShape(Value value) {
+  auto shaped_type = dyn_cast<RankedTensorType>(value.getType());
+  if (!shaped_type) {
+    return Shape();
+  }
+  std::vector<int64_t> dimensions(shaped_type.getShape().begin(),
+                                  shaped_type.getShape().end());
+  return ShapeUtil::MakeShape(F32, dimensions);
+}
+
+// Operation-specific helper implementations
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    AllGatherOp all_gather, int output_id) {
+  MLIRContext* context = all_gather.getContext();
+  int64_t all_gather_dim = all_gather.getAllGatherDim();
+  auto output_shape = GetShape(all_gather.getResult(0));
+  int64_t output_rank = output_shape.dimensions().size();
+
+  // Input shape for the first operand
+  auto input_shape = GetShape(all_gather.getOperand(0));
+  int64_t all_gather_input_dim_size = input_shape.dimensions(all_gather_dim);
+
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(output_rank);
+
+  for (int64_t i = 0; i < output_rank; ++i) {
+    auto dim = mlir::getAffineDimExpr(i, context);
+    exprs.push_back(i == all_gather_dim ? dim % all_gather_input_dim_size
+                                        : dim);
+  }
+
+  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
+      AffineMap::get(output_rank, 0, exprs, context), output_shape.dimensions(),
+      {});
+
+  AffineExpr replica_id_expr = mlir::getAffineDimExpr(all_gather_dim, context)
+                                   .floorDiv(all_gather_input_dim_size);
+
+  IndexingMap replica_id_map = IndexingMap::FromTensorSizes(
+      AffineMap::get(output_rank, 0, replica_id_expr, context),
+      output_shape.dimensions(), {});
+
+  OperandIndexing operand_indexing(indexing_map, {}, replica_id_map);
+
+  HloInstructionIndexing indexing;
+  // HLO implementation only returns indexing for the first operand.
+  // We mirror this behavior for consistency, although StableHLO ops might be
+  // variadic.
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(operand_indexing);
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    BitcastConvertOp bitcast, int output_id) {
+  MLIRContext* context = bitcast.getContext();
+  auto input_shape = GetShape(bitcast.getOperand());
+  auto output_shape = GetShape(bitcast.getResult());
+  IndexingMap indexing_map = GetBitcastMap(output_shape, input_shape, context);
+  indexing_map.Simplify();
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    BroadcastInDimOp bcast, int output_id) {
+  MLIRContext* context = bcast.getContext();
+  // Check if result has RankedTensorType
+  if (!dyn_cast<RankedTensorType>(bcast.getResult().getType())) {
+    return CreateUnknownIndexing(1);
+  }
+  auto output_shape = GetShape(bcast.getResult());
+  IndexingMap indexing_map = ComputeBroadcastIndexingMap(
+      output_shape.dimensions(), bcast.getBroadcastDimensions(), context);
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    ConcatenateOp concat, int output_id) {
+  MLIRContext* context = concat.getContext();
+  int64_t concat_dim = concat.getDimension();
+  auto output_shape = GetShape(concat.getResult());
+  std::vector<int64_t> operand_concat_dim_sizes;
+  operand_concat_dim_sizes.reserve(concat.getInputs().size());
+  for (Value operand : concat.getInputs()) {
+    operand_concat_dim_sizes.push_back(
+        GetShape(operand).dimensions(concat_dim));
+  }
+  return ComputeConcatenateIndexing(output_shape.dimensions().size(),
+                                    concat_dim, output_shape.dimensions(),
+                                    operand_concat_dim_sizes, context);
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    ConvolutionOp conv, int output_id) {
+  MLIRContext* context = conv.getContext();
+  auto input_shape = GetShape(conv.getLhs());
+  auto kernel_shape = GetShape(conv.getRhs());
+  auto output_shape = GetShape(conv.getResult());
+  auto dnums = conv.getDimensionNumbers();
+  size_t rank = output_shape.dimensions().size();
+
+  // Collect sizes for input/output spatial dimensions.
+  size_t spatial_rank = dnums.getInputSpatialDimensions().size();
+  std::vector<int64_t> input_spatial_sizes(spatial_rank);
+  std::vector<int64_t> kernel_spatial_sizes(spatial_rank);
+  std::vector<int64_t> output_spatial_sizes(spatial_rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    input_spatial_sizes[i] =
+        input_shape.dimensions(dnums.getInputSpatialDimensions()[i]);
+    kernel_spatial_sizes[i] =
+        kernel_shape.dimensions(dnums.getKernelSpatialDimensions()[i]);
+    output_spatial_sizes[i] =
+        output_shape.dimensions(dnums.getOutputSpatialDimensions()[i]);
+  }
+
+  SmallVector<int64_t> ones(spatial_rank, 1);
+  auto strides = conv.getWindowStrides().value_or(ones);
+  auto lhs_dilation = conv.getLhsDilation().value_or(ones);
+  auto rhs_dilation = conv.getRhsDilation().value_or(ones);
+  SmallVector<int64_t> padding_flat;
+  if (conv.getPadding()) {
+    for (auto val : conv.getPadding()->getValues<int64_t>()) {
+      padding_flat.push_back(val);
+    }
+  } else {
+    padding_flat.assign(spatial_rank * 2, 0);
+  }
+
+  // Indexing map for the input value (spatial dimensions only).
+  // The dimension numbers in the resulting affine expressions have to be
+  // remapped to correspond to the correct output dimensions.
+  IndexingMap input_spatial_indexing = ComposeWindowIndexingMap(
+      input_spatial_sizes, output_spatial_sizes, kernel_spatial_sizes, strides,
+      rhs_dilation, lhs_dilation, padding_flat, context);
+  std::vector<AffineExpr> replacement_dims(spatial_rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    replacement_dims[i] =
+        mlir::getAffineDimExpr(dnums.getOutputSpatialDimensions()[i], context);
+  }
+
+  // Build affine expressions and constraints for input spatial dimensions.
+  std::vector<AffineExpr> input_exprs(rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    input_exprs[dnums.getInputSpatialDimensions()[i]] =
+        input_spatial_indexing.GetAffineMap().getResult(i).replaceDims(
+            replacement_dims);
+  }
+  llvm::MapVector<AffineExpr, Interval> input_constraints;
+  for (const auto& [key, val] : input_spatial_indexing.GetConstraints()) {
+    input_constraints[key.replaceDims(replacement_dims)] = val;
+  }
+
+  // Build affine expressions for kernel spatial and output dimensions.
+  std::vector<AffineExpr> kernel_exprs(rank);
+  for (int i = 0; i < spatial_rank; ++i) {
+    kernel_exprs[dnums.getKernelSpatialDimensions()[i]] =
+        mlir::getAffineSymbolExpr(i, context);
+  }
+  AffineExpr dim_expr =
+      mlir::getAffineDimExpr(dnums.getOutputFeatureDimension(), context);
+  kernel_exprs[dnums.getKernelOutputFeatureDimension()] = dim_expr;
+
+  // Build initial symbol ranges.
+  std::vector<IndexingMap::Variable> input_symbols =
+      input_spatial_indexing.GetRangeVars();
+  std::vector<IndexingMap::Variable> kernel_symbols =
+      RangeVarsFromTensorSizes(kernel_spatial_sizes);
+
+  // Add symbol for input feature dimension.
+  input_exprs[dnums.getInputFeatureDimension()] =
+      mlir::getAffineSymbolExpr(input_symbols.size(), context);
+  kernel_exprs[dnums.getKernelInputFeatureDimension()] =
+      mlir::getAffineSymbolExpr(kernel_symbols.size(), context);
+
+  int64_t input_group_size =
+      kernel_shape.dimensions(dnums.getKernelInputFeatureDimension());
+  Interval input_feature_range{0, input_group_size - 1};
+  input_symbols.push_back(IndexingMap::Variable{input_feature_range});
+  kernel_symbols.push_back(IndexingMap::Variable{input_feature_range});
+
+  // With multiple feature groups, the input feature dimension is equally split.
+  if (conv.getFeatureGroupCount() > 1) {
+    AffineExpr& input_feature = input_exprs[dnums.getInputFeatureDimension()];
+    int64_t output_group_size =
+        output_shape.dimensions(dnums.getOutputFeatureDimension());
+    int64_t feature_group_size =
+        output_group_size / conv.getFeatureGroupCount();
+    input_feature = dim_expr.floorDiv(feature_group_size) * input_group_size +
+                    input_feature;
+  }
+
+  // With multiple batch groups, the input batch dimension is equally split.
+  AffineExpr batch_dim_expr =
+      mlir::getAffineDimExpr(dnums.getOutputBatchDimension(), context);
+  if (conv.getBatchGroupCount() > 1) {
+    int64_t batch_group_size =
+        output_shape.dimensions(dnums.getOutputBatchDimension());
+    AffineExpr batch_group_expr =
+        mlir::getAffineSymbolExpr(input_symbols.size(), context);
+    input_symbols.push_back(IndexingMap::Variable{
+        {0, static_cast<int64_t>(conv.getBatchGroupCount()) - 1}});
+    input_exprs[dnums.getInputBatchDimension()] =
+        batch_group_expr * batch_group_size + batch_dim_expr;
+  } else {
+    input_exprs[dnums.getInputBatchDimension()] = batch_dim_expr;
+  }
+
+  // Indexing map for the input value.
+  IndexingMap inputs_indexing(
+      AffineMap::get(rank, input_symbols.size(), input_exprs, context),
+      DimVarsFromTensorSizes(output_shape.dimensions()), input_symbols,
+      /*rt_vars=*/{}, input_constraints);
+  // We may need to simplify and remove unused symbols again, as the input
+  // feature dimension size may be trivial.
+  inputs_indexing.Simplify();
+  inputs_indexing.RemoveUnusedSymbols();
+
+  // Indexing map for the kernel value.
+  IndexingMap kernel_indexing(
+      AffineMap::get(rank, kernel_symbols.size(), kernel_exprs, context),
+      DimVarsFromTensorSizes(output_shape.dimensions()), kernel_symbols,
+      /*rt_vars=*/{});
+  kernel_indexing.Simplify();
+  kernel_indexing.RemoveUnusedSymbols();
+
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(2);
+  indexing.indexing_maps[0].insert(OperandIndexing{inputs_indexing});
+  indexing.indexing_maps[1].insert(OperandIndexing{kernel_indexing});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    DotGeneralOp dot_general, int output_id) {
+  MLIRContext* context = dot_general.getContext();
+  auto lhs_shape = GetShape(dot_general.getLhs());
+  auto rhs_shape = GetShape(dot_general.getRhs());
+  auto output_shape = GetShape(dot_general.getResult());
+  auto dim_numbers = dot_general.getDotDimensionNumbers();
+
+  auto lhs_batch_dims = dim_numbers.getLhsBatchingDimensions();
+  auto rhs_batch_dims = dim_numbers.getRhsBatchingDimensions();
+  auto lhs_contracting_dims = dim_numbers.getLhsContractingDimensions();
+  auto rhs_contracting_dims = dim_numbers.getRhsContractingDimensions();
+
+  auto [lhs_map, rhs_map] = ComputeDotOperandsIndexing(
+      lhs_shape.dimensions(), rhs_shape.dimensions(), output_shape.dimensions(),
+      lhs_batch_dims, rhs_batch_dims, lhs_contracting_dims,
+      rhs_contracting_dims, context);
+
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(2);
+  indexing.indexing_maps[0].insert(OperandIndexing{lhs_map});
+  indexing.indexing_maps[1].insert(OperandIndexing{rhs_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    DotOp dot, int output_id) {
+  MLIRContext* context = dot.getContext();
+  auto lhs_shape = GetShape(dot.getLhs());
+  auto rhs_shape = GetShape(dot.getRhs());
+  auto output_shape = GetShape(dot.getResult());
+
+  // Following XLA's DotOp pattern:
+  // For dot product: lhs[..., k] * rhs[k, ...] -> output[..., ...]
+  // LHS: batch_dims + k (contracting)
+  // RHS: k (contracting) + non_contracting
+  int64_t lhs_rank = lhs_shape.dimensions().size();
+  int64_t rhs_rank = rhs_shape.dimensions().size();
+  int64_t output_rank = output_shape.dimensions().size();
+
+  llvm::SmallVector<AffineExpr> lhs_exprs(lhs_rank);
+  llvm::SmallVector<AffineExpr> rhs_exprs(rhs_rank);
+  // LHS non-contracting dimensions map to output dims [0, output_rank-1)
+  // For vector-matrix or matrix-vector: this is either batch dims or empty
+  for (int64_t i = 0; i < lhs_rank - 1; ++i) {
+    lhs_exprs[i] = mlir::getAffineDimExpr(i, context);
+  }
+  // RHS non-contracting dimensions map to output dims starting after LHS
+  // For matrix-vector: output_rank may be < rhs_rank-1 (vector result)
+  for (int64_t i = 0; i < rhs_rank - 1; ++i) {
+    int64_t output_dim = (lhs_rank - 1) + i;
+    if (output_dim < output_rank) {
+      rhs_exprs[i + 1] = mlir::getAffineDimExpr(output_dim, context);
+    } else {
+      // Matrix-vector case: result is vector, extra RHS dims are implicit
+      rhs_exprs[i + 1] = mlir::getAffineConstantExpr(0, context);
+    }
+  }
+
+  // Contracting dimension (k): symbol for both LHS and RHS
+  int64_t k_dim = lhs_shape.dimensions()[lhs_rank - 1];
+  AffineExpr k_expr = mlir::getAffineSymbolExpr(0, context);
+  lhs_exprs[lhs_rank - 1] = k_expr;
+  rhs_exprs[0] = k_expr;
+  IndexingMap lhs_map = IndexingMap::FromTensorSizes(
+      AffineMap::get(output_rank, 1, lhs_exprs, context),
+      std::vector<int64_t>(output_shape.dimensions().begin(),
+                           output_shape.dimensions().end()),
+      {k_dim});
+  IndexingMap rhs_map = IndexingMap::FromTensorSizes(
+      AffineMap::get(output_rank, 1, rhs_exprs, context),
+      std::vector<int64_t>(output_shape.dimensions().begin(),
+                           output_shape.dimensions().end()),
+      {k_dim});
+
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(2);
+  indexing.indexing_maps[0].insert(OperandIndexing{lhs_map});
+  indexing.indexing_maps[1].insert(OperandIndexing{rhs_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    DynamicSliceOp dynamic_slice, int output_id) {
+  MLIRContext* context = dynamic_slice.getContext();
+  auto input_shape = GetShape(dynamic_slice.getOperand());
+  auto output_shape = GetShape(dynamic_slice.getResult());
+  int64_t rank = output_shape.dimensions().size();
+
+  std::vector<int64_t> dim_sizes(output_shape.dimensions().begin(),
+                                 output_shape.dimensions().end());
+  std::vector<IndexingMap::Variable> dim_vars;
+  dim_vars.reserve(dim_sizes.size());
+  for (auto size : dim_sizes) {
+    dim_vars.push_back(IndexingMap::Variable{{0, size - 1}});
+  }
+
+  std::vector<AffineExpr> exprs;
+  exprs.reserve(rank);
+  std::vector<IndexingMap::Variable> rt_vars;
+  std::vector<RuntimeVarIndexing> runtime_vars;
+
+  // An empty affine map for scalar runtime variables.
+  // Needed for indices_map construction below
+  AffineMap empty_map = AffineMap::get(rank, 0, {}, context);
+
+  for (auto [dim, slice_size] :
+       llvm::enumerate(dynamic_slice.getSliceSizes())) {
+    AffineExpr dim_expr = getAffineDimExpr(dim, context);
+    Value rt_var_val = dynamic_slice.getStartIndices()[dim];
+    int64_t max_index = input_shape.dimensions(dim) - slice_size;
+
+    // Construct indexing map for the start index (scalar map keyed by output
+    // dimensions) We reuse the scalar map logic: (d0...dN) -> ()
+    IndexingMap rt_index_map = CreateScalarIndexingMap(output_shape, context);
+
+    // Attempt constant folding/optimization
+    RuntimeVarIndexing rt_indexing{rt_var_val, rt_index_map};
+    Interval feasible_values{0, max_index};
+
+    auto simplified_expr = OptimizeRTVar(rt_indexing, feasible_values, context);
+
+    if (simplified_expr) {
+      exprs.push_back(dim_expr + *simplified_expr);
+    } else {
+      exprs.push_back(dim_expr + getAffineSymbolExpr(rt_vars.size(), context));
+      rt_vars.push_back(IndexingMap::Variable{{0, max_index}});
+      runtime_vars.push_back(RuntimeVarIndexing{rt_var_val, rt_index_map});
+    }
+  }
+
+  IndexingMap input_map{AffineMap::get(rank, rt_vars.size(), exprs, context),
+                        dim_vars,
+                        {},
+                        rt_vars};
+
+  OperandIndexing operand_indexing{input_map, runtime_vars};
+
+  IndexingMap indices_map =
+      IndexingMap::FromTensorSizes(empty_map, dim_sizes, {});
+
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(dynamic_slice.getNumOperands());
+  indexing.indexing_maps[0].insert(operand_indexing);
+  for (size_t i = 1; i < dynamic_slice.getNumOperands(); ++i) {
+    indexing.indexing_maps[i].insert(OperandIndexing{indices_map});
+  }
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    DynamicUpdateSliceOp dus, int output_id) {
+  MLIRContext* context = dus.getContext();
+  auto operand_shape = GetShape(dus.getOperand());
+  auto update_shape = GetShape(dus.getUpdate());
+  auto output_shape = GetShape(dus.getResult());
+  int64_t rank = output_shape.dimensions().size();
+
+  // Operand (input): identity mapping
+  std::vector<AffineExpr> identity;
+  identity.reserve(rank);
+  for (int64_t dim = 0; dim < rank; ++dim) {
+    identity.push_back(getAffineDimExpr(dim, context));
+  }
+  std::vector<int64_t> dim_sizes(output_shape.dimensions().begin(),
+                                 output_shape.dimensions().end());
+  IndexingMap operand_map = IndexingMap::FromTensorSizes(
+      AffineMap::get(rank, 0, identity, context), dim_sizes, {});
+
+  // Update: (d0 - rt0, ..., d{N-1} - rt{N-1}) with runtime variables
+  std::vector<AffineExpr> update_exprs;
+  std::vector<IndexingMap::Variable> rt_vars;
+  update_exprs.reserve(rank);
+  rt_vars.reserve(rank);
+
+  for (int64_t dim = 0; dim < rank; ++dim) {
+    update_exprs.push_back(getAffineDimExpr(dim, context) -
+                           getAffineSymbolExpr(dim, context));
+    rt_vars.push_back(IndexingMap::Variable{
+        {0, operand_shape.dimensions(dim) - update_shape.dimensions(dim)}});
+  }
+
+  std::vector<IndexingMap::Variable> dim_vars;
+  dim_vars.reserve(dim_sizes.size());
+  for (auto size : dim_sizes) {
+    dim_vars.push_back(IndexingMap::Variable{{0, size - 1}});
+  }
+
+  IndexingMap update_map{
+      AffineMap::get(rank, rank, update_exprs, context), dim_vars, {}, rt_vars};
+
+  // Create RuntimeVarIndexing for offset operands
+  std::vector<RuntimeVarIndexing> runtime_vars;
+  runtime_vars.reserve(rank);
+  AffineMap empty_map = AffineMap::get(rank, 0, {}, context);
+  IndexingMap rt_index_map =
+      IndexingMap::FromTensorSizes(empty_map, dim_sizes, {});
+
+  for (auto offset_value : dus.getStartIndices()) {
+    runtime_vars.push_back(RuntimeVarIndexing{offset_value, rt_index_map});
+  }
+
+  OperandIndexing update_indexing{update_map, runtime_vars};
+
+  // Start indices: empty map
+  IndexingMap indices_map =
+      IndexingMap::FromTensorSizes(empty_map, dim_sizes, {});
+
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(dus.getNumOperands());
+  indexing.indexing_maps[0].insert(OperandIndexing{operand_map});
+  indexing.indexing_maps[1].insert(update_indexing);
+  for (size_t i = 2; i < dus.getNumOperands(); ++i) {
+    indexing.indexing_maps[i].insert(OperandIndexing{indices_map});
+  }
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    GatherOp gather, int output_id) {
+  MLIRContext* context = gather.getContext();
+  auto operand_shape = GetShape(gather.getOperand());
+  auto start_indices_shape = GetShape(gather.getStartIndices());
+  auto output_shape = GetShape(gather.getResult());
+  int64_t output_rank = output_shape.dimensions().size();
+
+  auto dimension_numbers = gather.getDimensionNumbers();
+  int64_t index_vector_dim = dimension_numbers.getIndexVectorDim();
+  int64_t index_vector_length =
+      start_indices_shape.dimensions(index_vector_dim);
+
+  // Map for indices operand: (d0, ..., d{rank-1}) -> (d0, s0)
+  // where s0 ranges over index vector dimension
+  AffineExpr indices_id_dim = getAffineDimExpr(0, context);
+  std::vector<int64_t> dim_sizes(output_shape.dimensions().begin(),
+                                 output_shape.dimensions().end());
+  std::vector<IndexingMap::Variable> dim_vars;
+  dim_vars.reserve(dim_sizes.size());
+  for (auto size : dim_sizes) {
+    dim_vars.push_back(IndexingMap::Variable{{0, size - 1}});
+  }
+
+  IndexingMap indices_map{
+      AffineMap::get(output_rank, 1,
+                     {indices_id_dim, getAffineSymbolExpr(0, context)},
+                     context),
+      dim_vars,
+      {IndexingMap::Variable{{0, index_vector_length - 1}}},
+      /*rt_vars=*/{}};
+
+  // Map for operand with runtime variables
+  std::vector<AffineExpr> exprs;
+  std::vector<RuntimeVarIndexing> runtime_vars;
+  std::vector<IndexingMap::Variable> rt_vars;
+  auto slice_sizes = gather.getSliceSizes();
+  auto offset_dims = dimension_numbers.getOffsetDims();
+  auto start_index_map = dimension_numbers.getStartIndexMap();
+
+  exprs.reserve(operand_shape.dimensions().size());
+
+  for (auto [operand_dim_id, slice_size] : enumerate(slice_sizes)) {
+    int64_t output_dim_id = offset_dims[operand_dim_id];
+    exprs.push_back(mlir::getAffineDimExpr(output_dim_id, context));
+
+    // Check if this dimension is indexed by start_indices
+    auto it = absl::c_find(start_index_map, operand_dim_id);
+    if (it == start_index_map.end()) {
+      continue;
+    }
+
+    int64_t start_index_map_idx = it - start_index_map.begin();
+
+    // Create runtime variable for this index
+    AffineMap rt_var_map = AffineMap::get(
+        output_rank, 0,
+        {indices_id_dim,
+         mlir::getAffineConstantExpr(start_index_map_idx, context)},
+        context);
+
+    IndexingMap rt_index_map =
+        IndexingMap::FromTensorSizes(rt_var_map, dim_sizes, {});
+
+    int64_t upper_bound = operand_shape.dimensions(operand_dim_id) - slice_size;
+
+    RuntimeVarIndexing rt_indexing{gather.getStartIndices(), rt_index_map};
+    Interval feasible_values{0, upper_bound};
+
+    if (auto simplified =
+            OptimizeRTVar(rt_indexing, feasible_values, context)) {
+      exprs.back() = exprs.back() + *simplified;
+      continue;
+    }
+
+    runtime_vars.push_back(rt_indexing);
+    rt_vars.push_back(IndexingMap::Variable{{0, upper_bound}});
+
+    // Add runtime variable to expression
+    exprs.back() = exprs.back() +
+                   mlir::getAffineSymbolExpr(runtime_vars.size() - 1, context);
+  }
+
+  IndexingMap operand_map{
+      AffineMap::get(output_rank, runtime_vars.size(), exprs, context),
+      dim_vars,
+      {},
+      rt_vars};
+
+  OperandIndexing operand_indexing{operand_map, runtime_vars};
+
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(2);
+  indexing.indexing_maps[0].insert(operand_indexing);
+  indexing.indexing_maps[1].insert(OperandIndexing{indices_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    GetTupleElementOp gte, int output_id) {
+  if (!dyn_cast<RankedTensorType>(gte.getResult().getType())) {
+    return CreateUnknownIndexing(1);
+  }
+  auto output_shape = GetShape(gte.getResult());
+  IndexingMap identity_map = IndexingMap::FromTensorSizes(
+      AffineMap::getMultiDimIdentityMap(output_shape.dimensions().size(),
+                                        gte.getContext()),
+      std::vector<int64_t>(output_shape.dimensions().begin(),
+                           output_shape.dimensions().end()),
+      {});
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{identity_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    PadOp pad, int output_id) {
+  MLIRContext* context = pad.getContext();
+  auto output_shape = GetShape(pad.getResult());
+  auto edge_padding_low = pad.getEdgePaddingLow();
+  auto edge_padding_high = pad.getEdgePaddingHigh();
+  auto interior_padding = pad.getInteriorPadding();
+  IndexingMap input_indexing_map =
+      ComputePadIndexingMap(output_shape.dimensions(), edge_padding_low,
+                            edge_padding_high, interior_padding, context);
+  IndexingMap padding_value_indexing_map =
+      CreateScalarIndexingMap(output_shape, context);
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(2);
+  indexing.indexing_maps[0].insert(OperandIndexing{input_indexing_map});
+  indexing.indexing_maps[1].insert(OperandIndexing{padding_value_indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    ReduceOp reduce, int output_id) {
+  MLIRContext* context = reduce.getContext();
+
+  auto input_shape = GetShape(reduce.getInputs()[0]);
+  auto output_shape = GetShape(reduce.getResults()[0]);
+
+  IndexingMap inputs_indexing_map = ComputeReduceInputIndexingMap(
+      input_shape.dimensions(), output_shape.dimensions(),
+      reduce.getDimensions(), context);
+
+  IndexingMap inits_indexing_map =
+      CreateScalarIndexingMap(output_shape, context);
+
+  HloInstructionIndexing indexing;
+  int64_t num_inputs = reduce.getInputs().size();
+  int64_t num_operands = num_inputs + reduce.getInitValues().size();
+  indexing.indexing_maps.resize(num_operands);
+
+  for (int64_t id = 0; id < num_inputs; ++id) {
+    indexing.indexing_maps[id].insert(OperandIndexing(inputs_indexing_map));
+  }
+  for (int64_t id = num_inputs; id < num_operands; ++id) {
+    indexing.indexing_maps[id].insert(OperandIndexing(inits_indexing_map));
+  }
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    ReduceWindowOp reduce_window, int output_id) {
+  MLIRContext* context = reduce_window.getContext();
+
+  // Following XLA's ReduceWindowOp pattern:
+  // Indexing for reduce-window with dilations and non-trivial padding
+  // is represented as a composition using ComposeWindowIndexingMap
+
+  auto input_shape = GetShape(reduce_window.getInputs()[0]);
+  auto output_shape = GetShape(reduce_window.getResults()[0]);
+
+  SmallVector<int64_t> default_dilations(input_shape.dimensions().size(), 1);
+  SmallVector<int64_t> default_padding(input_shape.dimensions().size() * 2, 0);
+
+  ArrayRef<int64_t> window_dilations =
+      reduce_window.getWindowDilations()
+          ? ArrayRef<int64_t>(*reduce_window.getWindowDilations())
+          : ArrayRef(default_dilations);
+  ArrayRef<int64_t> base_dilations =
+      reduce_window.getBaseDilations()
+          ? ArrayRef<int64_t>(*reduce_window.getBaseDilations())
+          : ArrayRef(default_dilations);
+
+  SmallVector<int64_t> padding_flat;
+  if (reduce_window.getPadding()) {
+    auto padding_attr = reduce_window.getPadding().value();
+    for (auto val : padding_attr.getValues<int64_t>()) {
+      padding_flat.push_back(val);
+    }
+  } else {
+    padding_flat = default_padding;
+  }
+
+  // Indexing map for the input value
+  IndexingMap inputs_indexing = ComposeWindowIndexingMap(
+      input_shape.dimensions(), output_shape.dimensions(),
+      reduce_window.getWindowDimensions(),
+      reduce_window.getWindowStrides().value_or(
+          reduce_window.getWindowDimensions()),
+      window_dilations, base_dilations, padding_flat, context);
+
+  // Indexing map for the init value
+  IndexingMap inits_indexing_map =
+      CreateScalarIndexingMap(output_shape, context);
+
+  HloInstructionIndexing indexing;
+  int64_t num_inputs = reduce_window.getInputs().size();
+  int64_t num_operands = num_inputs + reduce_window.getInitValues().size();
+  indexing.indexing_maps.resize(num_operands);
+
+  for (int64_t id = 0; id < num_inputs; ++id) {
+    indexing.indexing_maps[id].insert(OperandIndexing(inputs_indexing));
+  }
+  for (int64_t id = num_inputs; id < num_operands; ++id) {
+    indexing.indexing_maps[id].insert(OperandIndexing(inits_indexing_map));
+  }
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    ReshapeOp reshape, int output_id) {
+  MLIRContext* context = reshape.getContext();
+  auto input_shape = GetShape(reshape.getOperand());
+  auto output_shape = GetShape(reshape.getResult());
+  IndexingMap indexing_map = GetBitcastMap(output_shape, input_shape, context);
+  indexing_map.Simplify();
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    ReverseOp reverse, int output_id) {
+  MLIRContext* context = reverse.getContext();
+  auto output_shape = GetShape(reverse.getResult());
+  IndexingMap indexing_map = ComputeReverseIndexingMap(
+      output_shape.dimensions(), reverse.getDimensions(), context);
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    SliceOp slice, int output_id) {
+  MLIRContext* context = slice.getContext();
+  auto output_shape = GetShape(slice.getResult());
+  IndexingMap indexing_map = ComputeSliceIndexingMap(
+      output_shape.dimensions(), slice.getStartIndices(), slice.getStrides(),
+      context);
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    TransposeOp transpose, int output_id) {
+  MLIRContext* context = transpose.getContext();
+  auto output_shape = GetShape(transpose.getResult());
+  auto permutation = std::vector<int64_t>(transpose.getPermutation().begin(),
+                                          transpose.getPermutation().end());
+  IndexingMap indexing_map = IndexingMap::FromTensorSizes(
+      ComputeTransposeIndexingMap(InversePermutation(permutation), context),
+      output_shape.dimensions(), {});
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    TupleOp tuple_op, int output_id) {
+  MLIRContext* context = tuple_op.getContext();
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(tuple_op->getNumOperands());
+  for (auto [i, operand] : enumerate(tuple_op->getOperands())) {
+    if (!dyn_cast<RankedTensorType>(operand.getType())) {
+      continue;
+    }
+    auto operand_shape = GetShape(operand);
+    IndexingMap identity_map = IndexingMap::FromTensorSizes(
+        AffineMap::getMultiDimIdentityMap(operand_shape.dimensions().size(),
+                                          context),
+        std::vector<int64_t>(operand_shape.dimensions().begin(),
+                             operand_shape.dimensions().end()),
+        {});
+    indexing.indexing_maps[i].insert(OperandIndexing{identity_map});
+  }
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    mlir::mhlo::BitcastOp op, int output_id) {
+  Shape input_shape = GetShape(op.getOperand());
+  if (auto attr = op->getAttrOfType<DenseIntElementsAttr>("source_layout")) {
+    std::vector<int64_t> layout;
+    for (const auto& val : attr.getValues<int64_t>()) {
+      layout.push_back(val);
+    }
+    *input_shape.mutable_layout() = LayoutUtil::MakeLayout(layout);
+  }
+
+  Shape output_shape = GetShape(op.getResult());
+  if (auto attr = op->getAttrOfType<DenseIntElementsAttr>("result_layout")) {
+    std::vector<int64_t> layout;
+    for (const auto& val : attr.getValues<int64_t>()) {
+      layout.push_back(val);
+    }
+    *output_shape.mutable_layout() = LayoutUtil::MakeLayout(layout);
+  }
+  IndexingMap indexing_map =
+      GetBitcastMap(output_shape, input_shape, op.getContext());
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{indexing_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    mhlo::CopyOp op, int output_id) {
+  auto output_shape = GetShape(op.getResult());
+  IndexingMap identity_map = IndexingMap::FromTensorSizes(
+      AffineMap::getMultiDimIdentityMap(output_shape.dimensions().size(),
+                                        op.getContext()),
+      std::vector<int64_t>(output_shape.dimensions().begin(),
+                           output_shape.dimensions().end()),
+      {});
+  HloInstructionIndexing indexing;
+  indexing.indexing_maps.resize(1);
+  indexing.indexing_maps[0].insert(OperandIndexing{identity_map});
+  return indexing;
+}
+
+[[maybe_unused]] HloInstructionIndexing ComputeOutputToInputIndexingImpl(
+    mhlo::FusionOp op, int output_id) {
+  auto& region = op.getRegion();
+  if (region.empty()) {
+    return CreateUnknownIndexing(op.getNumOperands());
+  }
+
+  auto& block = region.front();
+  auto terminator = block.getTerminator();
+  if (output_id >= terminator->getNumOperands()) {
+    return CreateUnknownIndexing(op.getNumOperands());
+  }
+
+  HloInstructionIndexing fusion_indexing;
+  fusion_indexing.indexing_maps.resize(op.getNumOperands());
+
+  struct WorkItem {
+    Value value;
+    OperandIndexing indexing;
+  };
+  std::vector<WorkItem> worklist;
+
+  // Start with the result of the fusion corresponding to output_id
+  Value root_val = terminator->getOperand(output_id);
+  Shape root_shape = GetShape(root_val);
+  int64_t rank = root_shape.dimensions().size();
+
+  IndexingMap identity_map = IndexingMap::FromTensorSizes(
+      AffineMap::getMultiDimIdentityMap(rank, op.getContext()),
+      std::vector<int64_t>(root_shape.dimensions().begin(),
+                           root_shape.dimensions().end()),
+      {});
+  worklist.push_back({root_val, OperandIndexing{identity_map}});
+
+  while (!worklist.empty()) {
+    auto [val, current_indexing] = worklist.back();
+    worklist.pop_back();
+
+    if (current_indexing.IsUndefined()) {
+      // Propagate undefined?
+    }
+
+    if (auto block_arg = dyn_cast<BlockArgument>(val)) {
+      if (block_arg.getOwner() == &block) {
+        int arg_idx = block_arg.getArgNumber();
+        if (arg_idx < fusion_indexing.indexing_maps.size()) {
+          fusion_indexing.indexing_maps[arg_idx].insert(current_indexing);
+        }
+      }
+      continue;
+    }
+
+    Operation* producer = val.getDefiningOp();
+    if (!producer) {
+      continue;
+    }
+
+    // Recursive call to handle internal op
+    int producer_result_idx = llvm::cast<mlir::OpResult>(val).getResultNumber();
+    auto producer_indexing =
+        ComputeOutputToInputIndexing(producer, producer_result_idx);
+
+    for (size_t i = 0; i < producer->getNumOperands(); ++i) {
+      Value operand = producer->getOperand(i);
+      for (const auto& operand_indexing : producer_indexing.indexing_maps[i]) {
+        if (operand_indexing.IsUndefined() || current_indexing.IsUndefined()) {
+          worklist.push_back(
+              {operand, OperandIndexing{IndexingMap::GetUndefined()}});
+          continue;
+        }
+        // Note: ComposeOperandIndexing order is (Inner, Outer) aka (Consumer,
+        // Producer) to compute Outer(Inner(x)).
+        OperandIndexing composed =
+            ComposeOperandIndexing(current_indexing, operand_indexing);
+        if (!composed.IsUndefined()) {
+          composed.Simplify();
+          composed.RemoveUnusedSymbols();
+        }
+        worklist.push_back({operand, composed});
+      }
+    }
+  }
+  return fusion_indexing;
+}
+
+}  // namespace
+
+HloInstructionIndexing ComputeOutputToInputIndexing(Operation* op,
+                                                    int output_id) {
+  MLIRContext* context = op->getContext();
+  HloInstructionIndexing indexing =
+      llvm::TypeSwitch<Operation*, HloInstructionIndexing>(op)
+          // Operations with extracted helpers.
+          .Case<AllGatherOp, BitcastConvertOp, BroadcastInDimOp, ConcatenateOp,
+                ConvolutionOp, DotOp, DotGeneralOp, DynamicSliceOp,
+                DynamicUpdateSliceOp, GatherOp, GetTupleElementOp, PadOp,
+                ReduceOp, ReduceWindowOp, ReshapeOp, ReverseOp, SliceOp,
+                TransposeOp, TupleOp,
+                // MHLO ops.
+                mhlo::BitcastOp, mhlo::CopyOp, mhlo::FusionOp>(
+              [&](auto typed_op) {
+                return ComputeOutputToInputIndexingImpl(typed_op, output_id);
+              })
+
+          // Elementwise identity operations, all operands use identity mapping.
+          .Case<AddOp, SubtractOp, MulOp, DivOp, RemOp, MaxOp, MinOp, AndOp,
+                OrOp, XorOp, AbsOp, NegOp, SignOp, CosineOp, SineOp, TanhOp,
+                SqrtOp, RsqrtOp, ExpOp, Expm1Op, LogOp, Log1pOp, FloorOp,
+                CeilOp, ConvertOp, SelectOp, ClampOp, CompareOp,
+                PopulationCountOp, NotOp, IsFiniteOp, RoundNearestEvenOp,
+                OptimizationBarrierOp, MapOp, SortOp>([&](Operation* op) {
+            if (!dyn_cast<RankedTensorType>(op->getResult(0).getType())) {
+              return CreateUnknownIndexing(op->getNumOperands());
+            }
+            auto output_shape = GetShape(op->getResult(0));
+            HloInstructionIndexing indexing = CreateElementwiseIndexing(
+                op->getNumOperands(), output_shape, context);
+            // Handle scalar broadcast for operands with no dimensions
+            for (auto [i, operand] : llvm::enumerate(op->getOperands())) {
+              if (GetShape(operand).dimensions().empty()) {
+                indexing.indexing_maps[i].clear();
+                indexing.indexing_maps[i].insert(OperandIndexing{
+                    CreateScalarIndexingMap(output_shape, context)});
+              }
+            }
+            return indexing;
+          })
+
+          // Default:
+          //  - IotaOp, ConstantOp, CreateTokenOp, AfterAllOp
+          //  - unknown indexing for unsupported operations
+          .Default([&](Operation* op) {
+            return CreateUnknownIndexing(op->getNumOperands());
+          });
+  return indexing;
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.h b/third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.h
new file mode 100644
index 00000000000000..b5fca80c1bc70e
--- /dev/null
+++ b/third_party/xla/xla/hlo/analysis/stablehlo_indexing_analysis.h
@@ -0,0 +1,29 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_HLO_ANALYSIS_STABLEHLO_INDEXING_ANALYSIS_H_
+#define XLA_HLO_ANALYSIS_STABLEHLO_INDEXING_ANALYSIS_H_
+
+#include "mlir/IR/Operation.h"
+
+namespace xla {
+struct HloInstructionIndexing;
+
+HloInstructionIndexing ComputeOutputToInputIndexing(mlir::Operation* op,
+                                                    int output_id);
+
+}  // namespace xla
+
+#endif  // XLA_HLO_ANALYSIS_STABLEHLO_INDEXING_ANALYSIS_H_

From 3f4f38e2e1c29538d79e942e22c2e160b3a0ea8e Mon Sep 17 00:00:00 2001
From: Quoc Truong <quoct@google.com>
Date: Thu, 18 Dec 2025 13:42:13 -0800
Subject: [PATCH 547/753] Update bazelisk, buildifier, buildozer and bats-core
 to the latest version in ml_build Dockerfile

PiperOrigin-RevId: 846398411
---
 ci/official/containers/ml_build/Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/official/containers/ml_build/Dockerfile b/ci/official/containers/ml_build/Dockerfile
index a4fb0cd9b1640a..ba090e65c95b33 100644
--- a/ci/official/containers/ml_build/Dockerfile
+++ b/ci/official/containers/ml_build/Dockerfile
@@ -58,10 +58,10 @@ RUN if [ -e "/usr/local/cuda/compat/libcuda.so.1" ]; then ln -s /usr/local/cuda/
 # - buildozer: clean bazel build deps
 # - gcloud SDK: communicate with Google Cloud Platform (GCP) for RBE, CI
 # - patchelf: Utility tool to modify existing ELF executables and libraries
-RUN git clone --branch v1.11.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core
-RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.21.0/bazelisk-linux-amd64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
-RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-amd64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
-RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildozer-linux-amd64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
+RUN git clone --branch v1.13.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core
+RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.27.0/bazelisk-linux-amd64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
+RUN wget https://github.com/bazelbuild/buildtools/releases/download/v8.2.1/buildifier-linux-amd64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
+RUN wget https://github.com/bazelbuild/buildtools/releases/download/v8.2.1/buildozer-linux-amd64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
 
 RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-x86_64.tar.gz | \
     tar zxf - google-cloud-sdk && \

From f58771eed6fe4a222334ae02d4a08d893740467d Mon Sep 17 00:00:00 2001
From: Cong Liu <congliu@google.com>
Date: Thu, 18 Dec 2025 14:01:26 -0800
Subject: [PATCH 548/753] [XLA] Add DeadDynamicUpdateSliceElimination pass.

Reverts 7dc32beee3397698fe9b5eb81e5132714395d835

PiperOrigin-RevId: 846405574
---
 .../xla/xla/hlo/transforms/simplifiers/BUILD  |  35 ++++
 .../dead_dynamic_update_slice_elimination.cc  | 176 ++++++++++++++++++
 .../dead_dynamic_update_slice_elimination.h   |  52 ++++++
 ...d_dynamic_update_slice_elimination_test.cc | 140 ++++++++++++++
 4 files changed, 403 insertions(+)
 create mode 100644 third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.cc
 create mode 100644 third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.h
 create mode 100644 third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination_test.cc

diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/BUILD b/third_party/xla/xla/hlo/transforms/simplifiers/BUILD
index 85d50f4c030665..c699420212b568 100644
--- a/third_party/xla/xla/hlo/transforms/simplifiers/BUILD
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/BUILD
@@ -1851,6 +1851,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dead_dynamic_update_slice_elimination",
+    srcs = ["dead_dynamic_update_slice_elimination.cc"],
+    hdrs = ["dead_dynamic_update_slice_elimination.h"],
+    deps = [
+        "//xla:shape_util",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/pass:hlo_pass",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_cc_test(
+    name = "dead_dynamic_update_slice_elimination_test",
+    srcs = ["dead_dynamic_update_slice_elimination_test.cc"],
+    deps = [
+        ":dead_dynamic_update_slice_elimination",
+        ":hlo_dce",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/hlo/testlib:pattern_matcher_gmock",
+        "//xla/service:pattern_matcher",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 xla_cc_test(
     name = "call_parameter_cleanup_test",
     srcs = ["call_parameter_cleanup_test.cc"],
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.cc b/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.cc
new file mode 100644
index 00000000000000..ef7058daa766f3
--- /dev/null
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.cc
@@ -0,0 +1,176 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.h"
+
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/primitive_util.h"
+#include "xla/shape.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla {
+namespace {
+
+std::optional<int64_t> GetConstantAsInt64(const HloInstruction* inst) {
+  if (!inst->IsConstant() || !ShapeUtil::IsScalar(inst->shape())) {
+    return std::nullopt;
+  }
+  return primitive_util::PrimitiveTypeSwitch<std::optional<int64_t>>(
+      [&](auto primitive_type_constant) -> std::optional<int64_t> {
+        if constexpr (primitive_util::IsIntegralType(primitive_type_constant)) {
+          using NativeT = primitive_util::NativeTypeOf<primitive_type_constant>;
+          return static_cast<int64_t>(
+              inst->literal().GetFirstElement<NativeT>());
+        }
+        return std::nullopt;
+      },
+      inst->shape().element_type());
+}
+
+std::optional<std::vector<int64_t>> GetStartIndices(const HloInstruction* dus) {
+  absl::Span<HloInstruction* const> start_indices_operands =
+      absl::MakeSpan(dus->operands())
+          .subspan(xla::Cast<HloDynamicUpdateSliceInstruction>(dus)
+                       ->first_index_operand_number());
+  std::vector<int64_t> start_indices;
+  for (HloInstruction* operand : start_indices_operands) {
+    std::optional<int64_t> start_index = GetConstantAsInt64(operand);
+    if (!start_index.has_value()) {
+      return std::nullopt;
+    }
+    start_indices.push_back(*start_index);
+  }
+  return start_indices;
+}
+
+// Checks if the ranges [start1, end1) and [start2, end2) overlap.
+//
+// Example:
+// RangesOverlap(0, 10, 5, 15) -> true
+// RangesOverlap(0, 10, 10, 20) -> false
+// RangesOverlap(0, 10, 15, 20) -> false
+bool RangesOverlap(int64_t start1, int64_t end1, int64_t start2, int64_t end2) {
+  return start1 < end2 && start2 < end1;
+}
+
+// If true, the updated elements of the dynamic-update-slice is not accessed
+// by the slice user.
+bool IsDusUpdateUnused(const std::vector<int64_t>& dus_starts,
+                       const Shape& update_shape,
+                       const HloInstruction* slice_user) {
+  if (slice_user->opcode() != HloOpcode::kSlice) {
+    return false;
+  }
+  // Get Slice ranges
+  const std::vector<int64_t>& slice_starts = slice_user->slice_starts();
+  const std::vector<int64_t>& slice_limits = slice_user->slice_limits();
+
+  // The slice accesses the updated part IFF there is an overlap in *ALL*
+  // dimensions. If there is no overlap in any dimension, the slice is safe,
+  // i.e., it doesn't access the updated elements.
+  for (int dim = 0; dim < update_shape.dimensions().size(); ++dim) {
+    int64_t dus_start = dus_starts[dim];
+    int64_t dus_limit = dus_start + update_shape.dimensions(dim);
+    int64_t slice_start = slice_starts[dim];
+    int64_t slice_limit = slice_limits[dim];
+    if (RangesOverlap(dus_start, dus_limit, slice_start, slice_limit)) {
+      continue;
+    }
+    // Disjoint in this dimension, so slice does not overlap with update.
+    return true;
+  }
+  // Overlap in all dimensions, so slice reads updated values.
+  return false;
+}
+
+// Helper function to process a single DynamicUpdateSlice instruction.
+// Returns true if the module was changed.
+absl::StatusOr<bool> ProcessDynamicUpdateSlice(HloInstruction* dus,
+                                               HloComputation* comp) {
+  const std::optional<std::vector<int64_t>> dus_starts = GetStartIndices(dus);
+  if (!dus_starts.has_value()) {
+    // Not a constant start index, cannot simplify.
+    return false;
+  }
+  const std::vector<int64_t>& dus_starts_vec = *dus_starts;
+  HloInstruction* update_operand = dus->mutable_operand(1);
+  if (dus_starts_vec.size() != update_operand->shape().dimensions().size()) {
+    // DUS start indices size does not match update operand shape dimensions
+    // size.
+    VLOG(1) << "DUS start indices size does not match update operand shape "
+               "dimensions size: "
+            << dus->ToString();
+    return false;
+  }
+
+  bool is_dus_update_unused =
+      dus->user_count() > 0 &&
+      absl::c_all_of(dus->users(), [&](HloInstruction* user) {
+        return IsDusUpdateUnused(dus_starts_vec, update_operand->shape(), user);
+      });
+  VLOG(2) << "  is_dus_update_unused: " << is_dus_update_unused;
+  if (is_dus_update_unused) {
+    TF_RETURN_IF_ERROR(dus->ReplaceAllUsesWith(dus->mutable_operand(0)));
+    TF_RETURN_IF_ERROR(comp->RemoveInstruction(dus));
+    return true;  // Changed
+  }
+  return false;  // Not changed
+}
+
+}  // namespace
+
+absl::StatusOr<bool> DeadDynamicUpdateSliceElimination::RunImpl(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  bool changed = false;
+  auto computations_range = module->computations(execution_threads);
+  std::vector<HloComputation*> computations(computations_range.begin(),
+                                            computations_range.end());
+  for (HloComputation* computation : computations) {
+    std::vector<HloInstruction*> post_order_instructions =
+        computation->MakeInstructionPostOrder();
+    for (auto it = post_order_instructions.rbegin();
+         it != post_order_instructions.rend(); ++it) {
+      HloInstruction* instruction = *it;
+      if (instruction->opcode() != HloOpcode::kDynamicUpdateSlice) {
+        continue;
+      }
+      VLOG(2) << "Processing DUS: " << instruction->ToString();
+      TF_ASSIGN_OR_RETURN(bool dus_changed,
+                          ProcessDynamicUpdateSlice(instruction, computation));
+      if (dus_changed) {
+        changed = true;
+      }
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.h b/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.h
new file mode 100644
index 00000000000000..5f17372119f5ea
--- /dev/null
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.h
@@ -0,0 +1,52 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_HLO_TRANSFORMS_SIMPLIFIERS_DEAD_DYNAMIC_UPDATE_SLICE_ELIMINATION_H_
+#define XLA_HLO_TRANSFORMS_SIMPLIFIERS_DEAD_DYNAMIC_UPDATE_SLICE_ELIMINATION_H_
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/pass/hlo_pass_interface.h"
+
+namespace xla {
+
+// HLO pass that removes dynamic-update-slice (DUS) instructions if the region
+// they modify is never accessed by any downstream operations.
+//
+// This optimization applies if all users of a DUS are slice instructions, its
+// indices are constant, and none of its slice users read from the region
+// updated by the DUS. If these conditions are met, the pass replaces all uses
+// of the DUS with its input operand and removes the DUS instruction. The
+// optimization is applied from root to top, so that if any DUS is removed that
+// makes certain upstream DUSs removable, those will also be removed in the same
+// pass.
+class DeadDynamicUpdateSliceElimination : public HloModulePass {
+ public:
+  DeadDynamicUpdateSliceElimination() = default;
+  ~DeadDynamicUpdateSliceElimination() override = default;
+  absl::string_view name() const override { return "dead-dus-elimination"; }
+
+  // Run the pass on the given module. Returns whether the module was changed
+  // (instructions were removed).
+  absl::StatusOr<bool> RunImpl(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+};
+
+}  // namespace xla
+
+#endif  // XLA_HLO_TRANSFORMS_SIMPLIFIERS_DEAD_DYNAMIC_UPDATE_SLICE_ELIMINATION_H_
diff --git a/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination_test.cc b/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination_test.cc
new file mode 100644
index 00000000000000..9a96ddff60554a
--- /dev/null
+++ b/third_party/xla/xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination_test.cc
@@ -0,0 +1,140 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/transforms/simplifiers/dead_dynamic_update_slice_elimination.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
+#include "xla/hlo/testlib/pattern_matcher_gmock.h"
+#include "xla/hlo/transforms/simplifiers/hlo_dce.h"
+#include "xla/service/pattern_matcher.h"
+#include "xla/tsl/platform/statusor.h"
+
+namespace xla {
+namespace {
+
+namespace m = ::xla::match;
+
+class DeadDynamicUpdateSliceEliminationTest
+    : public HloHardwareIndependentTestBase {};
+
+TEST_F(DeadDynamicUpdateSliceEliminationTest, NoDeadDUS) {
+  const absl::string_view kHlo = R"(
+HloModule module
+
+ENTRY main {
+  %constant.0 = bf16[] constant(0)
+  %idx.1806 = s32[] constant(1806)
+  %idx.0 = s32[] constant(0)
+  %param.0 = bf16[2408,16] parameter(0)
+  %update_block = bf16[301,16] broadcast(%constant.0), dimensions={}
+  %dus = bf16[2408,16] dynamic-update-slice(%param.0, %update_block, %idx.1806, %idx.0)
+  ROOT %slice = bf16[602,16] slice(%dus), slice={[1505:2107], [0:16]}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHlo));
+  DeadDynamicUpdateSliceElimination dds;
+  EXPECT_FALSE(dds.Run(module.get()).value());
+}
+
+TEST_F(DeadDynamicUpdateSliceEliminationTest, MultiUsersNoDeadDUS) {
+  const absl::string_view kHlo = R"(
+HloModule module
+
+ENTRY main {
+  %constant.0 = bf16[] constant(0)
+  %idx.1806 = s32[] constant(1806)
+  %idx.0 = s32[] constant(0)
+  %param.0 = bf16[2408,16] parameter(0)
+  %update_block = bf16[301,16] broadcast(%constant.0), dimensions={}
+  %dus = bf16[2408,16] dynamic-update-slice(%param.0, %update_block, %idx.1806, %idx.0)
+  %slice.0 = bf16[301,16] slice(%dus), slice={[1505:1806], [0:16]}
+  %slice.1 = bf16[301,16] slice(%dus), slice={[1806:2107], [0:16]}
+  ROOT %tuple = (bf16[301,16], bf16[301,16]) tuple(%slice.0, %slice.1)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHlo));
+  DeadDynamicUpdateSliceElimination dds;
+  EXPECT_FALSE(dds.Run(module.get()).value());
+}
+
+TEST_F(DeadDynamicUpdateSliceEliminationTest, RemoveDeadDUS) {
+  const absl::string_view kHlo = R"(
+HloModule module
+
+ENTRY main {
+  %constant.0 = bf16[] constant(0)
+  %idx.1806 = s32[] constant(1806)
+  %idx.0 = s32[] constant(0)
+  %param.0 = bf16[2408,16] parameter(0)
+  %update_block = bf16[301,16] broadcast(%constant.0), dimensions={}
+  %dus = bf16[2408,16] dynamic-update-slice(%param.0, %update_block, %idx.1806, %idx.0)
+  ROOT %slice = bf16[301,16] slice(%dus), slice={[1505:1806], [0:16]}
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHlo));
+  DeadDynamicUpdateSliceElimination dds;
+  EXPECT_TRUE(dds.Run(module.get()).value());
+  HloDCE dce;
+  EXPECT_TRUE(dce.Run(module.get()).value());
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, GmockMatch(m::Slice(m::Parameter(0))));
+}
+
+TEST_F(DeadDynamicUpdateSliceEliminationTest, RemoveDeadDUSChain) {
+  const absl::string_view kHlo = R"(
+HloModule module
+
+ENTRY main {
+  %param.0 = bf16[256,2408,1,16,256] parameter(0)
+  %constant.bf16.0 = bf16[] constant(0)
+  %broadcast.12717 = bf16[256,301,1,16,256] broadcast(%constant.bf16.0), dimensions={}
+  %constant.6347 = s32[] constant(0)
+  %constant.6386 = s32[] constant(2107)
+  %constant.6387 = s32[] constant(1806)
+  %constant.6388 = s32[] constant(1505)
+  %constant.6389 = s32[] constant(1204)
+  %constant.6390 = s32[] constant(903)
+  %constant.6391 = s32[] constant(602)
+  %constant.6392 = s32[] constant(301)
+  %dynamic-update-slice.643 = bf16[256,2408,1,16,256] dynamic-update-slice(%param.0, %broadcast.12717, %constant.6347, %constant.6386, %constant.6347, %constant.6347, %constant.6347)
+  %gather.214 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.643), slice={[0:256], [1806:2107], [0:1], [0:16], [0:256]}
+  %dynamic-update-slice.644 = bf16[256,2408,1,16,256] dynamic-update-slice(%dynamic-update-slice.643, %broadcast.12717, %constant.6347, %constant.6387, %constant.6347, %constant.6347, %constant.6347)
+  %gather.215 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.644), slice={[0:256], [1505:1806], [0:1], [0:16], [0:256]}
+  %dynamic-update-slice.645 = bf16[256,2408,1,16,256] dynamic-update-slice(%dynamic-update-slice.644, %broadcast.12717, %constant.6347, %constant.6388, %constant.6347, %constant.6347, %constant.6347)
+  %gather.216 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.645), slice={[0:256], [1204:1505], [0:1], [0:16], [0:256]}
+  %dynamic-update-slice.646 = bf16[256,2408,1,16,256] dynamic-update-slice(%dynamic-update-slice.645, %broadcast.12717, %constant.6347, %constant.6389, %constant.6347, %constant.6347, %constant.6347)
+  %gather.217 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.646), slice={[0:256], [903:1204], [0:1], [0:16], [0:256]}
+  %dynamic-update-slice.647 = bf16[256,2408,1,16,256] dynamic-update-slice(%dynamic-update-slice.646, %broadcast.12717, %constant.6347, %constant.6390, %constant.6347, %constant.6347, %constant.6347)
+  %gather.218 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.647), slice={[0:256], [602:903], [0:1], [0:16], [0:256]}
+  %dynamic-update-slice.648 = bf16[256,2408,1,16,256] dynamic-update-slice(%dynamic-update-slice.647, %broadcast.12717, %constant.6347, %constant.6391, %constant.6347, %constant.6347, %constant.6347)
+  %gather.219 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.648), slice={[0:256], [301:602], [0:1], [0:16], [0:256]}
+  %dynamic-update-slice.649 = bf16[256,2408,1,16,256] dynamic-update-slice(%dynamic-update-slice.648, %broadcast.12717, %constant.6347, %constant.6392, %constant.6347, %constant.6347, %constant.6347)
+  %gather.220 = bf16[256,301,1,16,256] slice(%dynamic-update-slice.649), slice={[0:256], [0:301], [0:1], [0:16], [0:256]}
+  ROOT %result = (bf16[256,301,1,16,256], bf16[256,301,1,16,256], bf16[256,301,1,16,256], bf16[256,301,1,16,256], bf16[256,301,1,16,256], bf16[256,301,1,16,256], bf16[256,301,1,16,256]) tuple(%gather.214, %gather.215, %gather.216, %gather.217, %gather.218, %gather.219, %gather.220)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(kHlo));
+  DeadDynamicUpdateSliceElimination dds;
+  EXPECT_TRUE(dds.Run(module.get()).value());
+  for (HloInstruction* instruction :
+       module->entry_computation()->instructions()) {
+    EXPECT_NE(instruction->opcode(), HloOpcode::kDynamicUpdateSlice);
+  }
+}
+
+}  // namespace
+}  // namespace xla

From f043b5833d659bef7d3f7bd66c3bd421228b7c66 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 14:19:11 -0800
Subject: [PATCH 549/753] add low_priority queue batch attr to mlrt
 batch_kernel

PiperOrigin-RevId: 846412287
---
 .../core/tfrt/mlrt/kernel/batch_kernel.cc     | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
index 17243e2e3b0bc6..442ef61d093a5b 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
@@ -475,6 +475,43 @@ REGISTER_OP(kMlrtBatchFunctionName)
     .Attr("container: string = ''")
     .Attr("shared_name: string = ''")
     .Attr("batching_queue: string = ''")
+    // A separate set of batch options for the low priority requests, which is
+    // used for priority queue batching.
+    .Attr("low_priority_max_batch_size: int = 0")
+    .Attr("low_priority_batch_timeout_micros: int = 0")
+    .Attr("low_priority_allowed_batch_sizes: list(int) = []")
+    .Attr("low_priority_max_enqueued_batches: int = 0")
+    // Policy that determines the mixed priority batching behavior when low
+    // priority batch parameters are present.
+    //
+    // low_priority_padding_with_next_allowed_batch_size: If high priority
+    // batches time out without reaching the max batch size, low priority inputs
+    // pad the high priority batches up to the next allowed batch size. A low
+    // priority only batch gets schedule only when the low priority input times
+    // out or reaches the max batch size while there is no high priority input
+    // waiting to be processed.
+    // low_priority_padding_with_max_batch_size: Same as above but pad up to the
+    // max batch size.
+    // priority_isolation: High priority and low priority inputs never share the
+    // same batch, i.e., no low priority input padding high priority batches.
+    // Low priority inputs get scheduled only as part of low priority only
+    // batches as described above.
+    // priority_merge: High and low priority inputs are queued separately but
+    // when a batch needs to be scheduled, the two queues are treated as one
+    // merged flat list of inputs with high priority inputs at the front of the
+    // list of tasks to use for the next batch. If all inputs are of the same
+    // priority, the behavior is the same as disabling prioritization.
+    .Attr(
+        "mixed_priority_policy: "
+        "{'low_priority_padding_with_max_batch_size', "
+        "'low_priority_padding_with_next_allowed_batch_size', "
+        "'priority_isolation', 'priority_merge'} = "
+        "'low_priority_padding_with_max_batch_size'")
+    // See the description of the batch_padding_policy attribute of
+    // BatchFunction in core/ops/batch_ops.cc.
+    .Attr(
+        "batch_padding_policy: "
+        "{'PAD_UP', 'BATCH_DOWN', 'MINIMIZE_TPU_COST_PER_REQUEST'} = 'PAD_UP'")
     .Attr("Tin: list(type)")
     .Attr("Tcaptured: list(type) >= 0")
     .Attr("Tout: list(type)")

From a430578a6bb92fadf3df00d52d00a3d8e93732a5 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 18 Dec 2025 14:22:14 -0800
Subject: [PATCH 550/753] Clean up device assignment logic in HloRunnerPjRt.

This change reduces the number of places where we generate device assignments,
and attempts to use static device assignments whenever they are available. As a
fallback, the client's default device assigments are used.

PiperOrigin-RevId: 846413387
---
 .../xla/xla/service/hlo_runner_pjrt.cc        | 95 ++++++++++++-------
 third_party/xla/xla/service/hlo_runner_pjrt.h |  3 +-
 2 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index ab2d8958683683..8e30e148182937 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -248,6 +248,26 @@ class HloRunnerPjRtExecutable : public OpaqueExecutable {
   std::unique_ptr<PjRtLoadedExecutable> loaded_executable_;
 };
 
+// Obtains the best device assignment for the given executable.
+// If the executable was compiled with a device assignment, that assignment is
+// returned. Otherwise, the static device assignment is pulled from the module
+// and returned instead. If that does not exist either, the default device
+// assignment is computed and returned.
+absl::StatusOr<DeviceAssignment> GetBestDeviceAssignment(
+    HloRunnerPjRtExecutable* const executable, PjRtClient& client) {
+  ASSIGN_OR_RETURN(CompileOptions compile_options,
+                   executable->executable()->GetCompileOptions());
+  if (compile_options.executable_build_options.has_device_assignment()) {
+    return compile_options.executable_build_options.device_assignment();
+  }
+
+  TF_ASSIGN_OR_RETURN(std::vector<std::shared_ptr<HloModule>> hlo_modules,
+                      executable->executable()->GetHloModules());
+  TF_RET_CHECK(hlo_modules.size() == 1);
+  return GetStaticDeviceAssignmentOrComputeDefault(*hlo_modules.front(),
+                                                   client);
+}
+
 }  // namespace
 
 HloRunnerPjRt::HloRunnerPjRt(std::unique_ptr<PjRtClient> pjrt_client)
@@ -539,10 +559,8 @@ HloRunnerPjRt::DeserializeExecutable(const absl::string_view serialized) const {
 absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
     const HloRunnerInterface::ReplicatedExecuteOptions& options) {
-  TF_ASSIGN_OR_RETURN(
-      DeviceAssignment device_assignment,
-      GetStaticDeviceAssignmentOrComputeDefault(*module, *pjrt_client_));
-  return ExecuteReplicated(std::move(module), options, &device_assignment);
+  return ExecuteReplicated(std::move(module), options,
+                           /*device_assignment=*/nullptr);
 }
 
 absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
@@ -560,13 +578,8 @@ absl::StatusOr<std::vector<Literal>>
 HloRunnerPjRt::ExecuteReplicatedWithExecutable(
     OpaqueExecutable* const absl_nonnull executable,
     const ReplicatedExecuteOptions& options) {
-  ASSIGN_OR_RETURN(const HloModule* const module,
-                   HloModuleFromWrapped(executable));
-  ASSIGN_OR_RETURN(
-      DeviceAssignment device_assignment,
-      GetStaticDeviceAssignmentOrComputeDefault(*module, *pjrt_client_));
   return ExecuteReplicatedWithExecutable(executable, options,
-                                         &device_assignment);
+                                         /*device_assignment=*/nullptr);
 }
 
 absl::StatusOr<std::vector<Literal>>
@@ -574,21 +587,26 @@ HloRunnerPjRt::ExecuteReplicatedWithExecutable(
     OpaqueExecutable* const absl_nonnull executable,
     const HloRunnerInterface::ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
-  std::optional<DeviceAssignment> default_device_assignment = std::nullopt;
+  ASSIGN_OR_RETURN(HloRunnerPjRtExecutable* const wrapped_executable,
+                   HloRunnerPjRtExecutable::TryUnwrap(*this, executable));
+
+  // If a device assignment is provided, use it. Otherwise, use the one from the
+  // executable, or if that is not available, generate a default one.
+  std::optional<DeviceAssignment> device_assignment_storage = std::nullopt;
   if (device_assignment == nullptr) {
-    ASSIGN_OR_RETURN(default_device_assignment,
-                     GetDefaultDeviceAssignment(options.num_devices, 1));
-    device_assignment = &*default_device_assignment;
+    ASSIGN_OR_RETURN(
+        device_assignment_storage,
+        GetBestDeviceAssignment(wrapped_executable, *pjrt_client_));
+    device_assignment = &*device_assignment_storage;
   }
   CHECK_NE(device_assignment, nullptr);
-  TF_ASSIGN_OR_RETURN(HloRunnerPjRtExecutable* const wrapped_executable,
-                      HloRunnerPjRtExecutable::TryUnwrap(*this, executable));
 
   xla::ExecuteOptions execute_options;
   return ExecuteReplicatedImpl(
       [&](absl::Span<const std::vector<PjRtBuffer*>> argument_buffer_slices,
           absl::AnyInvocable<OpaqueExecutable*(int64_t)>
-              executable_provider_arg)
+              executable_provider_arg,
+          absl::Span<PjRtDevice* const>)
           -> absl::StatusOr<
               std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> {
         TF_ASSIGN_OR_RETURN(
@@ -609,17 +627,29 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
     absl::AnyInvocable<const Literal*(int64_t, int64_t)> argument_provider,
     const HloRunnerInterface::ReplicatedExecuteOptions& options,
     DeviceAssignment* device_assignment) {
-  std::optional<DeviceAssignment> default_device_assignment = std::nullopt;
+  CHECK_GT(options.num_devices, 0);
+  ASSIGN_OR_RETURN(
+      HloRunnerPjRtExecutable* const wrapped_executable_device0,
+      HloRunnerPjRtExecutable::TryUnwrap(*this, executable_provider(0)));
+
+  // NB: we assume all executables have the same device assignments.  If a
+  // device assignment is provided, use it. Otherwise, use the one from the
+  // first device's executable, or if that is not available, generate a default
+  // one.
+  std::optional<DeviceAssignment> device_assignment_storage = std::nullopt;
   if (device_assignment == nullptr) {
-    TF_ASSIGN_OR_RETURN(default_device_assignment,
-                        GetDefaultDeviceAssignment(options.num_devices, 1));
-    device_assignment = &*default_device_assignment;
+    ASSIGN_OR_RETURN(
+        device_assignment_storage,
+        GetBestDeviceAssignment(wrapped_executable_device0, *pjrt_client_));
+    device_assignment = &*device_assignment_storage;
   }
   CHECK_NE(device_assignment, nullptr);
+
   return ExecuteReplicatedImpl(
       [&](absl::Span<const std::vector<PjRtBuffer*>> argument_buffer_slices,
           absl::AnyInvocable<OpaqueExecutable*(int64_t)>
-              executable_provider_arg)
+              executable_provider_arg,
+          absl::Span<PjRtDevice* const> id_to_device_ptr)
           -> absl::StatusOr<
               std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> {
         TF_RET_CHECK(options.use_threads);
@@ -645,12 +675,9 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicated(
             TF_ASSIGN_OR_RETURN(
                 PjRtLoadedExecutable * pjrt_executable,
                 executable->GetOrLoadExecutable(pjrt_client_.get()));
-            TF_ASSIGN_OR_RETURN(
-                PjRtDevice * device_ptr,
-                pjrt_client_->LookupDevice(
-                    DeviceIdForInvocation(*device_assignment, i)));
             pool.Schedule([&per_replica_results, i, pjrt_executable,
-                           args = argument_buffer_slices[i], device_ptr]() {
+                           args = argument_buffer_slices[i],
+                           device_ptr = id_to_device_ptr[i]]() {
               std::optional<Future<>> returned_future = {};
               xla::ExecuteOptions options;
               per_replica_results[i] = pjrt_executable->ExecuteSharded(
@@ -686,7 +713,8 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
     absl::AnyInvocable<
         absl::StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>>(
             absl::Span<const std::vector<PjRtBuffer*>>,
-            absl::AnyInvocable<OpaqueExecutable*(int64_t)>)>
+            absl::AnyInvocable<OpaqueExecutable*(int64_t)>,
+            absl::Span<PjRtDevice* const>)>
         execution_helper,
     absl::AnyInvocable<OpaqueExecutable*(int64_t)> executable_provider,
     absl::AnyInvocable<int64_t(int64_t)> argument_count_provider,
@@ -695,8 +723,9 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
     DeviceAssignment* device_assignment) {
   TF_RET_CHECK(options.infeed_values.empty() ||
                options.infeed_values.size() == options.num_devices);
+  TF_RET_CHECK(device_assignment != nullptr);
 
-  std::vector<PjRtDevice*> replica_devices(options.num_devices, nullptr);
+  std::vector<PjRtDevice*> id_to_device_ptr(options.num_devices, nullptr);
   std::vector<std::vector<std::unique_ptr<PjRtBuffer>>> argument_buffer_slices;
   argument_buffer_slices.reserve(options.num_devices);
   std::vector<bool> is_tuple_result(options.num_devices, false);
@@ -705,7 +734,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
     TF_ASSIGN_OR_RETURN(PjRtDevice* const device_ptr,
                         pjrt_client_->LookupDevice(
                             DeviceIdForInvocation(*device_assignment, i)));
-    replica_devices[i] = device_ptr;
+    id_to_device_ptr[i] = device_ptr;
 
     // Get the entry layout.
     OpaqueExecutable* const wrapped_executable = executable_provider(i);
@@ -750,7 +779,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
   if (has_infeed) {
     for (int64_t i = 0; i < options.num_devices; ++i) {
       pool->Schedule(
-          [device = replica_devices[i],
+          [device = id_to_device_ptr[i],
            &infeed_literal = *ABSL_DIE_IF_NULL(options.infeed_values[i]),
            infeed_steps = options.infeed_steps, &infeed_outfeed_status_mu,
            &infeed_outfeed_status]() {
@@ -773,7 +802,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
       options.outfeed_values->resize(options.num_devices);
     }
     for (int64_t i = 0; i < options.num_devices; ++i) {
-      pool->Schedule([i, device = replica_devices[i],
+      pool->Schedule([i, device = id_to_device_ptr[i],
                       outfeed_values = options.outfeed_values,
                       outfeed_shape = options.outfeed_shape,
                       infeed_steps = options.infeed_steps,
@@ -802,7 +831,7 @@ absl::StatusOr<std::vector<Literal>> HloRunnerPjRt::ExecuteReplicatedImpl(
       const std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>
           result_buffers,
       execution_helper(BufferMatToPointerMat(argument_buffer_slices),
-                       std::move(executable_provider)));
+                       std::move(executable_provider), id_to_device_ptr));
   VLOG(1) << "Replicated execution terminated";
 
   // Get the result from execution.
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.h b/third_party/xla/xla/service/hlo_runner_pjrt.h
index 08b4f4b12fa227..e5e1da94988b27 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.h
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.h
@@ -145,7 +145,8 @@ class HloRunnerPjRt : public HloRunnerInterface {
       absl::AnyInvocable<
           absl::StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>>(
               absl::Span<const std::vector<PjRtBuffer*>>,
-              absl::AnyInvocable<OpaqueExecutable*(int64_t)>)>
+              absl::AnyInvocable<OpaqueExecutable*(int64_t)>,
+              absl::Span<PjRtDevice* const>)>
           execution_helper,
       absl::AnyInvocable<OpaqueExecutable*(int64_t)> executable_provider,
       absl::AnyInvocable<int64_t(int64_t)> argument_count_provider,

From 57c1006923cf9868201512c0211c7ad1fd409cfd Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Thu, 18 Dec 2025 14:23:01 -0800
Subject: [PATCH 551/753] Add ScopedAsyncTrackingEvent to PJRT C API.

PiperOrigin-RevId: 846413643
---
 third_party/xla/xla/pjrt/c/BUILD              |  1 +
 third_party/xla/xla/pjrt/c/CHANGELOG.md       |  6 +++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 37 ++++++++++++++-
 third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc | 10 +++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 32 +++++++++++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h  |  5 +++
 third_party/xla/xla/pjrt/c_api_client/BUILD   |  3 ++
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 45 +++++++++++++++++++
 .../xla/pjrt/c_api_client/pjrt_c_api_client.h | 24 +++++++---
 9 files changed, 154 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index 203269226200e8..babc266cfdb341 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -276,6 +276,7 @@ cc_library(
         "//xla/pjrt:pjrt_layout",
         "//xla/pjrt:pjrt_stream_executor_client",
         "//xla/pjrt:raw_buffer",
+        "//xla/pjrt:scoped_async_tracking_event",
         "//xla/pjrt/distributed:key_value_store_interface",
         "//xla/pjrt/proto:compile_options_proto_cc",
         "//xla/pjrt/proto:topology_description_proto_cc",
diff --git a/third_party/xla/xla/pjrt/c/CHANGELOG.md b/third_party/xla/xla/pjrt/c/CHANGELOG.md
index 49e8c858d4d5a8..1c142fa18afc15 100644
--- a/third_party/xla/xla/pjrt/c/CHANGELOG.md
+++ b/third_party/xla/xla/pjrt/c/CHANGELOG.md
@@ -1,5 +1,11 @@
 # PJRT C API changelog
 
+## 0.86
+
+* Add `PJRT_Device_CreateAsyncTrackingEvent`.
+
+* Add `PJRT_AsyncTrackingEvent_Destroy`.
+
 ## 0.85
 
 * Add `PJRT_Device_PoisonExecution`.
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index eaeb72ce4164c3..ae8caf9d724246 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -104,7 +104,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 85
+#define PJRT_API_MINOR 86
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -1437,6 +1437,36 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Device_PoisonExecution_Args, poisoned);
 typedef PJRT_Error* PJRT_Device_PoisonExecution(
     PJRT_Device_PoisonExecution_Args* args);
 
+// --------------------------- AsyncTrackingEvent ------------------------------
+
+typedef struct PJRT_AsyncTrackingEvent PJRT_AsyncTrackingEvent;
+
+struct PJRT_Device_CreateAsyncTrackingEvent_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Device* device;
+  const char* description;
+  size_t description_size;
+  PJRT_AsyncTrackingEvent* event;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Device_CreateAsyncTrackingEvent_Args, event);
+
+// Creates an async tracking event. The caller is responsible for destroying the
+// event.
+typedef PJRT_Error* PJRT_Device_CreateAsyncTrackingEvent(
+    PJRT_Device_CreateAsyncTrackingEvent_Args* args);
+
+struct PJRT_AsyncTrackingEvent_Destroy_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_AsyncTrackingEvent* event;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_AsyncTrackingEvent_Destroy_Args, event);
+
+// Destroys the async tracking event.
+typedef PJRT_Error* PJRT_AsyncTrackingEvent_Destroy(
+    PJRT_AsyncTrackingEvent_Destroy_Args* args);
+
 //-------------------------------- Memory --------------------------------------
 
 struct PJRT_Memory_Id_Args {
@@ -2794,10 +2824,13 @@ typedef struct PJRT_Api {
   _PJRT_API_STRUCT_FIELD(PJRT_AsyncHostToDeviceTransferManager_TransferLiteral);
   _PJRT_API_STRUCT_FIELD(PJRT_Buffer_CopyRawToHostFuture);
   _PJRT_API_STRUCT_FIELD(PJRT_Device_PoisonExecution);
+  _PJRT_API_STRUCT_FIELD(PJRT_Device_CreateAsyncTrackingEvent);
+  _PJRT_API_STRUCT_FIELD(PJRT_AsyncTrackingEvent_Destroy);
 } PJRT_Api;
 
 enum {
-  PJRT_Api_STRUCT_SIZE = PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Device_PoisonExecution)
+  PJRT_Api_STRUCT_SIZE =
+      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_AsyncTrackingEvent_Destroy)
 };
 
 #undef _PJRT_API_STRUCT_FIELD
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
index fa74c035be033a..19e9c0678a0160 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
@@ -956,6 +956,10 @@ FieldOffsetsAndSizesForVersion(int major_version, int minor_version) {
     if (minor_version >= 85) {
       add_field("PJRT_Device_PoisonExecution", kFnPtrSize);
     }
+    if (minor_version >= 86) {
+      add_field("PJRT_Device_CreateAsyncTrackingEvent", kFnPtrSize);
+      add_field("PJRT_AsyncTrackingEvent_Destroy", kFnPtrSize);
+    }
     return version_offsets_and_sizes;
   }
   LOG(FATAL) << "Unsupported API version: " << major_version << "."
@@ -1361,6 +1365,12 @@ TEST_F(PjrtCAbiTestBase, FieldOffsetsAndSizes) {
           {"PJRT_Device_PoisonExecution",
            {offsetof(PJRT_Api, PJRT_Device_PoisonExecution),
             sizeof(PJRT_Api::PJRT_Device_PoisonExecution)}},
+          {"PJRT_Device_CreateAsyncTrackingEvent",
+           {offsetof(PJRT_Api, PJRT_Device_CreateAsyncTrackingEvent),
+            sizeof(PJRT_Api::PJRT_Device_CreateAsyncTrackingEvent)}},
+          {"PJRT_AsyncTrackingEvent_Destroy",
+           {offsetof(PJRT_Api, PJRT_AsyncTrackingEvent_Destroy),
+            sizeof(PJRT_Api::PJRT_AsyncTrackingEvent_Destroy)}},
       };
   ASSERT_EQ(api_->pjrt_api_version.major_version, PJRT_API_MAJOR);
   ASSERT_EQ(api_->pjrt_api_version.minor_version, PJRT_API_MINOR);
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 223f6b0148db4e..23c788b97ad7bd 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -63,6 +63,7 @@ limitations under the License.
 #include "xla/pjrt/proto/compile_options.pb.h"
 #include "xla/pjrt/proto/topology_description.pb.h"
 #include "xla/pjrt/raw_buffer.h"
+#include "xla/pjrt/scoped_async_tracking_event.h"
 #include "xla/runtime/device_id.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo.pb.h"
@@ -789,6 +790,33 @@ PJRT_Error* PJRT_Device_PoisonExecution(
   return nullptr;
 }
 
+PJRT_Error* PJRT_Device_CreateAsyncTrackingEvent(
+    PJRT_Device_CreateAsyncTrackingEvent_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Device_CreateAsyncTrackingEvent_Args",
+      PJRT_Device_CreateAsyncTrackingEvent_Args_STRUCT_SIZE,
+      args->struct_size));
+
+  absl::string_view description(args->description, args->description_size);
+  std::unique_ptr<xla::ScopedAsyncTrackingEvent> event =
+      args->device->device->CreateAsyncTrackingEvent(description);
+  if (event == nullptr) {
+    args->event = nullptr;
+  } else {
+    args->event = new PJRT_AsyncTrackingEvent{std::move(event)};
+  }
+  return nullptr;
+}
+
+PJRT_Error* PJRT_AsyncTrackingEvent_Destroy(
+    PJRT_AsyncTrackingEvent_Destroy_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_AsyncTrackingEvent_Destroy_Args",
+      PJRT_AsyncTrackingEvent_Destroy_Args_STRUCT_SIZE, args->struct_size));
+  delete args->event;
+  return nullptr;
+}
+
 PJRT_Error* PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer(
     PJRT_AsyncHostToDeviceTransferManager_RetrieveBuffer_Args* args) {
   PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
@@ -3243,6 +3271,10 @@ PJRT_Api CreatePjrtApi(PJRT_Client_Create* create_fn,
       /*PJRT_Buffer_CopyRawToHostFuture=*/
       pjrt::PJRT_Buffer_CopyRawToHostFuture,
       /*PJRT_Device_PoisonExecution=*/pjrt::PJRT_Device_PoisonExecution,
+      /*PJRT_Device_CreateAsyncTrackingEvent=*/
+      pjrt::PJRT_Device_CreateAsyncTrackingEvent,
+      /*PJRT_AsyncTrackingEvent_Destroy=*/
+      pjrt::PJRT_AsyncTrackingEvent_Destroy,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
index 387c8642694872..e5381380430651 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
@@ -42,6 +42,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_device_description.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
+#include "xla/pjrt/scoped_async_tracking_event.h"
 #include "xla/shape.h"
 
 struct PJRT_Error {
@@ -259,6 +260,10 @@ struct PJRT_PhaseCompiler {
       : compiler(phase_compiler), owned_compiler(nullptr) {}
 };
 
+struct PJRT_AsyncTrackingEvent {
+  std::unique_ptr<xla::ScopedAsyncTrackingEvent> event;
+};
+
 namespace pjrt {
 // C API definitions
 
diff --git a/third_party/xla/xla/pjrt/c_api_client/BUILD b/third_party/xla/xla/pjrt/c_api_client/BUILD
index 263de8a8c3b868..5f06f8d3ed1fda 100644
--- a/third_party/xla/xla/pjrt/c_api_client/BUILD
+++ b/third_party/xla/xla/pjrt/c_api_client/BUILD
@@ -57,6 +57,7 @@ cc_library(
         "//xla/pjrt:pjrt_device_dimensions",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:pjrt_layout",
+        "//xla/pjrt:scoped_async_tracking_event",
         "//xla/pjrt:string_utils",
         "//xla/pjrt/c:pjrt_c_api_ffi_extension_hdrs",
         "//xla/pjrt/c:pjrt_c_api_hdrs",
@@ -77,6 +78,8 @@ cc_library(
         "//xla/service:computation_placer_hdr",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_proto_cc",
+        "//xla/tsl/concurrency:async_value",
+        "//xla/tsl/concurrency:ref_count",
         "//xla/tsl/framework:allocator",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 0e727c473edbc6..4d519828fbf143 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -77,10 +77,13 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
 #include "xla/pjrt/proto/compile_options.pb.h"
+#include "xla/pjrt/scoped_async_tracking_event.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
+#include "xla/tsl/concurrency/async_value.h"
+#include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
@@ -1742,6 +1745,48 @@ absl::StatusOr<bool> PjRtCApiDevice::PoisonExecution(int32_t launch_id,
   return args.poisoned;
 }
 
+std::unique_ptr<ScopedAsyncTrackingEvent>
+PjRtCApiDevice::CreateAsyncTrackingEvent(absl::string_view description) const {
+  if (client_->pjrt_c_api()->pjrt_api_version.major_version == 0 &&
+      client_->pjrt_c_api()->pjrt_api_version.minor_version < 86) {
+    return nullptr;
+  }
+  PJRT_Device_CreateAsyncTrackingEvent_Args args;
+  args.struct_size = PJRT_Device_CreateAsyncTrackingEvent_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.device = c_device();
+  args.description = description.data();
+  args.description_size = description.size();
+  args.event = nullptr;
+
+  const PJRT_Api* api = client_->pjrt_c_api();
+  pjrt::LogFatalIfPjrtError(api->PJRT_Device_CreateAsyncTrackingEvent(&args),
+                            api);
+
+  if (args.event == nullptr) {
+    return nullptr;
+  }
+  return std::make_unique<PjRtCApiAsyncTrackingEvent>(api, args.event);
+}
+
+PjRtCApiAsyncTrackingEvent::PjRtCApiAsyncTrackingEvent(
+    const PJRT_Api* c_api, PJRT_AsyncTrackingEvent* event)
+    : c_api_(c_api), event_(event) {}
+
+PjRtCApiAsyncTrackingEvent::~PjRtCApiAsyncTrackingEvent() {
+  PJRT_AsyncTrackingEvent_Destroy_Args args;
+  args.struct_size = PJRT_AsyncTrackingEvent_Destroy_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.event = event_;
+  pjrt::LogFatalIfPjrtError(c_api_->PJRT_AsyncTrackingEvent_Destroy(&args),
+                            c_api_);
+}
+
+void PjRtCApiAsyncTrackingEvent::AddDependency(
+    tsl::RCReference<tsl::AsyncValue> dependency) {
+  LOG(FATAL) << "AddDependency is not supported in C API yet.";
+}
+
 // ------------------------------- Memory --------------------------------------
 
 const PJRT_Api* PjRtCApiMemorySpace::pjrt_c_api() const {
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
index d61987f8b8c23c..30e99cd55d74a7 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
@@ -54,9 +54,12 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/pjrt_layout.h"
 #include "xla/pjrt/proto/topology_description.pb.h"
+#include "xla/pjrt/scoped_async_tracking_event.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/shape.h"
+#include "xla/tsl/concurrency/async_value.h"
+#include "xla/tsl/concurrency/ref_count.h"
 #include "xla/tsl/framework/allocator.h"
 #include "xla/tsl/protobuf/coordination_service.pb.h"
 #include "xla/util.h"
@@ -171,13 +174,7 @@ class PjRtCApiDevice : public PjRtDevice {
       absl::string_view kind) const override;
 
   std::unique_ptr<ScopedAsyncTrackingEvent> CreateAsyncTrackingEvent(
-      absl::string_view description) const override {
-    LOG(FATAL)
-        << "PJRT C API does not support CreateAsyncTrackingEvent. Please "
-           "report an issue at https://github.com/google/jax/issues if you "
-           "need this feature.";
-    return nullptr;
-  }
+      absl::string_view description) const override;
 
   absl::StatusOr<bool> PoisonExecution(int32_t launch_id,
                                        absl::Status error) override;
@@ -324,6 +321,19 @@ class PjRtCApiTopologyDescription : public PjRtTopologyDescription {
   void InitAttributes();
 };
 
+class PjRtCApiAsyncTrackingEvent : public ScopedAsyncTrackingEvent {
+ public:
+  PjRtCApiAsyncTrackingEvent(const PJRT_Api* c_api,
+                             PJRT_AsyncTrackingEvent* event);
+  ~PjRtCApiAsyncTrackingEvent() override;
+
+  void AddDependency(tsl::RCReference<tsl::AsyncValue> dependency) override;
+
+ private:
+  const PJRT_Api* c_api_;
+  PJRT_AsyncTrackingEvent* event_;
+};
+
 class PjRtCApiClient : public PjRtClient {
  public:
   PjRtCApiClient(

From 9f210d0ef3c52f314966eac3196747af598a5626 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Thu, 18 Dec 2025 14:54:51 -0800
Subject: [PATCH 552/753] Use a deterministic device assignment for split
 compilation.

The big gap in split comp adoption so far has been the inability to run
multi-device tests (ExecuteReplicated). This was in part caused our inconsistent
treatment of device assignments (fixed in #35535). The other reason (addressed
here) is that the default device assignments returned by PjRt may not be
deterministic. Device assignments obtained during compilation and execution can
differ, which matters e.g. if the test overrides the static device assignments
manually with the default. With this change we always return an iota assignment
for split compilation.

PiperOrigin-RevId: 846425397
---
 .../xla/xla/service/hlo_runner_pjrt.cc        | 20 +++++++++++++++++++
 third_party/xla/xla/service/hlo_runner_pjrt.h |  6 ++++++
 2 files changed, 26 insertions(+)

diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.cc b/third_party/xla/xla/service/hlo_runner_pjrt.cc
index 8e30e148182937..dd569941286ce7 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.cc
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.cc
@@ -980,6 +980,14 @@ std::string MakeFilename(const HloModule& module, const bool run_hlo_passes) {
                                                  fingerprint_bytes.size());
   return absl::StrCat(absl::BytesToHexString(fingerprint_bytes_view), ".bin");
 }
+
+inline absl::StatusOr<DeviceAssignment> SplitPhaseDefaultDeviceAssignment(
+    int num_replicas, int num_partitions) {
+  DeviceAssignment device_assignment(num_replicas, num_partitions);
+  device_assignment.FillIota(0);
+  return device_assignment;
+}
+
 }  // namespace
 
 absl::StatusOr<std::unique_ptr<OpaqueExecutable>>
@@ -1001,6 +1009,12 @@ CompilePhaseHloRunnerPjRt::CreateExecutable(std::unique_ptr<HloModule> module,
   return wrapped_executable;
 }
 
+absl::StatusOr<DeviceAssignment>
+CompilePhaseHloRunnerPjRt::GetDefaultDeviceAssignment(
+    int num_replicas, int num_partitions) const {
+  return SplitPhaseDefaultDeviceAssignment(num_replicas, num_partitions);
+}
+
 absl::StatusOr<std::unique_ptr<OpaqueExecutable>>
 ExecutePhaseHloRunnerPjRt::CreateExecutable(std::unique_ptr<HloModule> module,
                                             const bool run_hlo_passes) {
@@ -1045,4 +1059,10 @@ ExecutePhaseHloRunnerPjRt::CreateExecutable(std::unique_ptr<HloModule> module,
   return DeserializeExecutable(serialized_executable);
 }
 
+absl::StatusOr<DeviceAssignment>
+ExecutePhaseHloRunnerPjRt::GetDefaultDeviceAssignment(
+    int num_replicas, int num_partitions) const {
+  return SplitPhaseDefaultDeviceAssignment(num_replicas, num_partitions);
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/service/hlo_runner_pjrt.h b/third_party/xla/xla/service/hlo_runner_pjrt.h
index e5e1da94988b27..bb698a64bbe786 100644
--- a/third_party/xla/xla/service/hlo_runner_pjrt.h
+++ b/third_party/xla/xla/service/hlo_runner_pjrt.h
@@ -192,6 +192,9 @@ class CompilePhaseHloRunnerPjRt : public HloRunnerPjRt {
         "expected.");
   }
 
+  absl::StatusOr<DeviceAssignment> GetDefaultDeviceAssignment(
+      int num_replicas, int num_partitions) const override;
+
  private:
   std::string artifact_dir_;
 };
@@ -221,6 +224,9 @@ class ExecutePhaseHloRunnerPjRt : public HloRunnerPjRt {
   absl::StatusOr<std::unique_ptr<OpaqueExecutable>> CreateExecutable(
       std::unique_ptr<HloModule> module, bool run_hlo_passes) override;
 
+  absl::StatusOr<DeviceAssignment> GetDefaultDeviceAssignment(
+      int num_replicas, int num_partitions) const override;
+
  private:
   std::string artifact_dir_;
   bool compile_if_not_found_;

From 6383e3632c91bdc8eccd458f699b317f03968b84 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 18 Dec 2025 15:15:38 -0800
Subject: [PATCH 553/753] [PJRT] Change transpose code to chunk nodes on the
 Loop nest representation, not the Node representation.

It is simpler to partition the Loops, which directly represent an iteration space, rather than the Nodes, which have to deal with a bunch of annoying details about tiling and partial tiles.

Refactoring, no behavior changes intended.

PiperOrigin-RevId: 846433128
---
 third_party/xla/xla/pjrt/BUILD             |   1 +
 third_party/xla/xla/pjrt/transpose.cc      | 202 ++++++++++++---------
 third_party/xla/xla/pjrt/transpose.h       |  25 ++-
 third_party/xla/xla/pjrt/transpose_test.cc |   5 +-
 4 files changed, 141 insertions(+), 92 deletions(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index a0bdf45320c23e..f266bac15df8e0 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -972,6 +972,7 @@ cc_library(
 xla_cc_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
+    shard_count = 10,
     deps = [
         ":transpose",
         "//xla:array",
diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index c7eb090396085c..5deddb6c0dee60 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -600,47 +600,37 @@ bool TransposePlan::Loop::operator==(const Loop& other) const {
          lda == other.lda && ldb == other.ldb &&
          is_inner_dim_in_a == other.is_inner_dim_in_a &&
          is_inner_dim_in_b == other.is_inner_dim_in_b &&
-         parallelism == other.parallelism;
+         parallelism == other.parallelism && start == other.start &&
+         end == other.end;
 }
 
 // Helper function that builds a plan.
-void TransposePlan::BuildPlanNodes(int thread_id,
+void TransposePlan::BuildPlanNodes(int chunk_id,
                                    std::vector<TransposePlan::Node>& nodes) {
   VLOG(8) << "Before plan build: " << ToString();
   const int ndim = a_dims_.size();
   DCHECK_GT(ndim, 0);
 
+  // Use the pre-computed chunk loops which have start/end bounds already set.
+  absl::Span<const Loop> chunk_loops = chunk_loops_[chunk_id];
+
   // We build plans in a depth-first order, visiting loops from outermost to
   // innermost. We use a stack (depth-first) order to handle trailing partial
   // tiles, which we "come back to" after handling the non-trailing case.
   struct Agendum {
-    // The ID of the loop to visit in loop_order_.
+    // The ID of the loop to visit in chunk_loops.
     int loop_id;
     // The parent node ID whose trailing tile should be made to point to this
     // node.
     int parent_node_id;
 
-    // The number of parallel tasks available to run this loop and its
-    // successors.
-    int num_tasks_at_loop;
-
-    // The ID number of the current thread in the tasks at this loop.
-    int task_id_at_loop;
-
     // For which dimensions of `a` are we to visit the partial trailing tile
     // a loop that visits that tile's interior?
     absl::InlinedVector<bool, 4> partial_tiles;
   };
   std::stack<Agendum> agenda;
 
-  int total_tasks = 1;
-  for (const Loop& loop : loop_order_) {
-    total_tasks *= loop.parallelism;
-  }
-
   agenda.push(Agendum{/*loop_id=*/0, /*parent_node_id=*/-1,
-                      /*num_tasks_at_loop=*/total_tasks,
-                      /*task_id_at_loop=*/thread_id,
                       absl::InlinedVector<bool, 4>(ndim, false)});
 
   auto loop_has_trivial_iteration_space = [](const Node& node) {
@@ -659,9 +649,8 @@ void TransposePlan::BuildPlanNodes(int thread_id,
           node_id - agendum.parent_node_id;
     }
 
-    if (agendum.loop_id == loop_order_.size()) {
+    if (agendum.loop_id == chunk_loops.size()) {
       // We've reached the end of the loop nest.
-      DCHECK_EQ(agendum.num_tasks_at_loop, 1);
       // Transpose loops have a sentinel node, indicated by a negative `inc`
       // value, that describes the striding of the inner transpose kernel.
       if (!inner_kernel_is_memcpy_) {
@@ -675,14 +664,9 @@ void TransposePlan::BuildPlanNodes(int thread_id,
       continue;
     }
 
-    const Loop& loop = loop_order_[agendum.loop_id];
+    const Loop& loop = chunk_loops[agendum.loop_id];
     int a_dim = loop.dim_in_a;
 
-    // Compute the number of tasks for the next loop iteration.
-    int task_id_at_loop = agendum.task_id_at_loop;
-    int num_tasks_at_loop = agendum.num_tasks_at_loop / loop.parallelism;
-    int task_id_at_next_loop = task_id_at_loop % num_tasks_at_loop;
-
     Node node;
     node.lda = loop.lda;
     node.ldb = loop.ldb;
@@ -695,19 +679,18 @@ void TransposePlan::BuildPlanNodes(int thread_id,
       node.inc = inner_block_elems_ * outer_block_elems_b_;
     }
 
-    int task_id = task_id_at_loop / num_tasks_at_loop;
-
     if (loop.tile_interior) {
       // We are visiting the tile interior of a tiled dimension.
       bool partial = agendum.partial_tiles[a_dim];
 
       int64_t size = partial ? loop.dim_size % loop.tile_size : loop.tile_size;
-      int64_t num_iterations = CeilOfRatio(size, node.inc);
-      int64_t num_iterations_per_task =
-          CeilOfRatio<int64_t>(num_iterations, loop.parallelism);
-      node.start = std::min(size, task_id * num_iterations_per_task * node.inc);
-      node.end =
-          std::min(size, (task_id + 1) * num_iterations_per_task * node.inc);
+      // loop.start and loop.end are in element units.
+      // Verify alignment to block boundaries.
+      CHECK(loop.start % node.inc == 0)
+          << "loop.start=" << loop.start
+          << " must be aligned to node.inc=" << node.inc;
+      node.start = loop.start;
+      node.end = std::min<int64_t>(size, loop.end);
 
       if (node.is_inner_dim_in_a && inner_kernel_is_memcpy_) {
         node.end = (node.end - node.start) * elem_size_in_bytes_;
@@ -720,8 +703,6 @@ void TransposePlan::BuildPlanNodes(int thread_id,
       Agendum new_agendum;
       new_agendum.loop_id = agendum.loop_id + 1;
       new_agendum.parent_node_id = -1;
-      new_agendum.task_id_at_loop = task_id_at_next_loop;
-      new_agendum.num_tasks_at_loop = num_tasks_at_loop;
       new_agendum.partial_tiles = agendum.partial_tiles;
       agenda.push(std::move(new_agendum));
     } else {
@@ -732,14 +713,16 @@ void TransposePlan::BuildPlanNodes(int thread_id,
 
       // If there is a trailing partial tile as well as complete tiles, handle
       // it as a trailer on the loop over complete tiles.
+      // A chunk is responsible for the trailing tile if its loop.end covers
+      // the full dimension.
+      int64_t full_size = CeilOfRatio(loop.dim_size, loop.tile_size);
+      bool handles_trailing =
+          loop.end >= full_size && loop.start <= num_complete_tiles;
       bool has_trailing_plan_node = false;
-      if (num_complete_tiles > 0 && has_partial_tile &&
-          task_id == loop.parallelism - 1) {
+      if (num_complete_tiles > 0 && has_partial_tile && handles_trailing) {
         Agendum new_agendum;
         new_agendum.loop_id = agendum.loop_id + 1;
         new_agendum.parent_node_id = node_id;
-        new_agendum.task_id_at_loop = task_id_at_next_loop;
-        new_agendum.num_tasks_at_loop = num_tasks_at_loop;
         new_agendum.partial_tiles = agendum.partial_tiles;
         new_agendum.partial_tiles[a_dim] = true;
         agenda.push(std::move(new_agendum));
@@ -751,15 +734,10 @@ void TransposePlan::BuildPlanNodes(int thread_id,
       // path to handle the trailing tile.
       bool partial = num_complete_tiles == 0 && has_partial_tile;
 
-      // Evenly divide the loop iterations amongst the threads.
+      // loop.start and loop.end are in tile units.
       int64_t num_tiles = partial ? 1 : num_complete_tiles;
-      int64_t num_iterations = CeilOfRatio(num_tiles, node.inc);
-      int64_t num_iterations_per_task =
-          CeilOfRatio<int64_t>(num_iterations, loop.parallelism);
-      node.start =
-          std::min(num_tiles, task_id * num_iterations_per_task * node.inc);
-      node.end = std::min(num_tiles,
-                          (task_id + 1) * num_iterations_per_task * node.inc);
+      node.start = loop.start;
+      node.end = std::min<int64_t>(num_tiles, loop.end);
 
       if (node.is_inner_dim_in_a && inner_kernel_is_memcpy_) {
         node.end = (node.end - node.start) * elem_size_in_bytes_;
@@ -774,8 +752,6 @@ void TransposePlan::BuildPlanNodes(int thread_id,
       Agendum new_agendum;
       new_agendum.loop_id = agendum.loop_id + 1;
       new_agendum.parent_node_id = -1;
-      new_agendum.task_id_at_loop = task_id_at_next_loop;
-      new_agendum.num_tasks_at_loop = num_tasks_at_loop;
       new_agendum.partial_tiles = agendum.partial_tiles;
       new_agendum.partial_tiles[a_dim] = partial;
       agenda.push(std::move(new_agendum));
@@ -999,7 +975,10 @@ void TransposePlan::Initialize() {
                         : ldb_[pos_stride1a_in_b];
   }
 
-  loop_order_.reserve(ndim);
+  // Order to traverse dimensions, from slowest-varying to fastest-varying.
+  std::vector<Loop> loop_order;
+
+  loop_order.reserve(ndim);
   for (int i = 0; i < ndim; ++i) {
     Loop loop;
     loop.dim_in_a = i;
@@ -1017,7 +996,7 @@ void TransposePlan::Initialize() {
     }
     loop.is_inner_dim_in_a = (loop.tile_size == 1) && (i == pos_stride1a);
     loop.is_inner_dim_in_b = (loop.tile_size == 1) && (i == pos_stride1b_in_a);
-    loop_order_.push_back(loop);
+    loop_order.push_back(loop);
 
     if (loop.tile_size > 1) {
       loop.tile_interior = true;
@@ -1026,12 +1005,12 @@ void TransposePlan::Initialize() {
                              : ldb_[inverse_permutation[i]];
       loop.is_inner_dim_in_a = (i == pos_stride1a);
       loop.is_inner_dim_in_b = (i == pos_stride1b_in_a);
-      loop_order_.push_back(loop);
+      loop_order.push_back(loop);
     }
   }
 
-  RemoveTrivialLoops(loop_order_);
-  CoalesceLoops(loop_order_);
+  RemoveTrivialLoops(loop_order);
+  CoalesceLoops(loop_order);
 
   // Bound the block sizes so they are smaller than the stride-1 dimension
   // size.
@@ -1118,7 +1097,7 @@ void TransposePlan::Initialize() {
                            inner_kernel_is_memcpy_ && l.tile_interior,
                            -std::min<double>(a_stride * penalty, b_stride));
   };
-  absl::c_stable_sort(loop_order_, [&](const Loop& a, const Loop& b) {
+  absl::c_stable_sort(loop_order, [&](const Loop& a, const Loop& b) {
     return cost(a) < cost(b);
   });
   // It is a required invariant of the loop order that tile interiors always
@@ -1127,13 +1106,14 @@ void TransposePlan::Initialize() {
   // both input and output.
 
   // The stride-1 loop must be innermost for a memcpy loop.
-  DCHECK(!inner_kernel_is_memcpy_ || loop_order_.back().is_inner_dim_in_a)
+  DCHECK(!inner_kernel_is_memcpy_ || loop_order.back().is_inner_dim_in_a)
       << ToString();
 
-  int num_threads = ChooseParallelizationStrategy();
-  nodes_.resize(num_threads);
-  for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
-    BuildPlanNodes(thread_id, nodes_[thread_id]);
+  int num_chunks = ChooseParallelizationStrategy(loop_order);
+  chunk_loops_ = PartitionLoops(num_chunks, loop_order);
+  nodes_.resize(num_chunks);
+  for (int chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
+    BuildPlanNodes(chunk_id, nodes_[chunk_id]);
   }
 
   switch (transformation_) {
@@ -1148,7 +1128,8 @@ void TransposePlan::Initialize() {
   }
 }
 
-int TransposePlan::ChooseParallelizationStrategy() {
+int TransposePlan::ChooseParallelizationStrategy(
+    std::vector<Loop>& loop_order) {
   int available_parallelism = num_threads_requested_;
 
   // Compute the number of iterations in `loop`.
@@ -1170,14 +1151,14 @@ int TransposePlan::ChooseParallelizationStrategy() {
   };
 
   // Estimate the number of bytes each iteration of each loop processes.
-  absl::InlinedVector<int64_t, 4> work_in_bytes(loop_order_.size());
+  absl::InlinedVector<int64_t, 4> work_in_bytes(loop_order.size());
   int64_t acc = elem_size_in_bytes_;
   if (!inner_kernel_is_memcpy_) {
     acc *= inner_block_elems_ * inner_block_elems_ * outer_block_elems_a_ *
            outer_block_elems_b_;
   }
   auto work_it = work_in_bytes.rbegin();
-  for (auto it = loop_order_.rbegin(); it != loop_order_.rend(); ++it) {
+  for (auto it = loop_order.rbegin(); it != loop_order.rend(); ++it) {
     *work_it++ = acc;
     acc *= loop_iterations(*it);
   }
@@ -1186,11 +1167,17 @@ int TransposePlan::ChooseParallelizationStrategy() {
 
   // Heuristic that attempts to parallelize the outermost loops, down to a
   // minimum per-thread number of bytes processed.
-  int num_threads = 1;
-  for (size_t i = 0; i < loop_order_.size(); ++i) {
-    Loop& loop = loop_order_[i];
+  int num_chunks = 1;
+  for (size_t i = 0; i < loop_order.size(); ++i) {
+    Loop& loop = loop_order[i];
     CHECK_GE(available_parallelism, 1);
     int64_t iterations = loop_iterations(loop);
+
+    // Initialize loop iteration bounds to full range in element units.
+    loop.start = 0;
+    loop.end = loop.tile_interior ? loop.tile_size
+                                  : CeilOfRatio(loop.dim_size, loop.tile_size);
+
     int kMinBytesPerThread = inner_kernel_is_memcpy_ ? (1 << 20) : (1 << 26);
     int64_t min_iterations_per_thread =
         CeilOfRatio<int64_t>(kMinBytesPerThread, work_in_bytes[i]);
@@ -1198,16 +1185,58 @@ int TransposePlan::ChooseParallelizationStrategy() {
 
     VLOG(8) << "iterations=" << iterations << " parallel_work=" << parallel_work
             << " available_parallelism=" << available_parallelism;
-    if (parallel_work >= available_parallelism) {
-      loop.parallelism = available_parallelism;
-      available_parallelism = 1;
-    } else {
-      loop.parallelism = parallel_work;
-      available_parallelism /= parallel_work;
+    int parallelism = std::min<int64_t>(available_parallelism, parallel_work);
+    if (parallelism > 1) {
+      // If we use CeilOfRatio(iterations, parallelism) as the chunk size, we
+      // might end up with fewer chunks than parallelism if the chunk size is
+      // large. For example, if iterations=17 and parallelism=16,
+      // chunk_size=2. Then useful_tasks=9. We should reduce parallelism to 9.
+      int64_t chunk_size =
+          CeilOfRatio(iterations, static_cast<int64_t>(parallelism));
+      int64_t useful_tasks = CeilOfRatio(iterations, chunk_size);
+      parallelism = useful_tasks;
+    }
+    loop.parallelism = parallelism;
+    available_parallelism /= parallelism;
+    num_chunks *= loop.parallelism;
+  }
+  return num_chunks;
+}
+
+std::vector<std::vector<TransposePlan::Loop>> TransposePlan::PartitionLoops(
+    int num_chunks, const std::vector<Loop>& loop_order) {
+  std::vector<std::vector<Loop>> result(num_chunks);
+  for (int chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
+    // Copy the base loop order for this chunk.
+    result[chunk_id] = loop_order;
+
+    // For each loop, narrow the start/end bounds to this chunk's portion.
+    int task_id_remaining = chunk_id;
+    int num_tasks_remaining = num_chunks;
+
+    for (size_t i = 0; i < loop_order.size(); ++i) {
+      Loop& chunk_loop = result[chunk_id][i];
+      const Loop& base_loop = loop_order[i];
+
+      num_tasks_remaining /= base_loop.parallelism;
+      int task_id = task_id_remaining / num_tasks_remaining;
+      task_id_remaining = task_id_remaining % num_tasks_remaining;
+
+      // Divide this loop's iterations (in element units) among parallelism
+      // tasks.
+      int64_t iterations = base_loop.end - base_loop.start;
+      int64_t iterations_per_task =
+          CeilOfRatio<int64_t>(iterations, base_loop.parallelism);
+
+      chunk_loop.start =
+          base_loop.start + std::min(iterations, task_id * iterations_per_task);
+      chunk_loop.end =
+          base_loop.start +
+          std::min(iterations, (task_id + 1) * iterations_per_task);
     }
-    num_threads *= loop.parallelism;
   }
-  return num_threads;
+
+  return result;
 }
 
 std::string TransposePlan::ToString() const {
@@ -1228,11 +1257,16 @@ std::string TransposePlan::ToString() const {
                       node.is_inner_dim_in_b ? "y" : "n");
                 }));
       });
-  auto format_loop_order = [](std::string* out, const Loop& loop) {
-    return absl::StrAppend(out, loop.dim_in_a,
-                           loop.tile_interior ? "[tile]" : "", "(",
-                           loop.parallelism, ")");
+  auto format_loop = [](std::string* out, const Loop& loop) {
+    absl::StrAppendFormat(out, "%d%s[%d,%d](%d)", loop.dim_in_a,
+                          loop.tile_interior ? "[tile]" : "", loop.start,
+                          loop.end, loop.parallelism);
   };
+  std::string chunk_loops_str = absl::StrJoin(
+      chunk_loops_, "\n",
+      [&](std::string* out, const std::vector<Loop>& loops) {
+        absl::StrAppend(out, "    ", absl::StrJoin(loops, ", ", format_loop));
+      });
   std::string transformation_str;
   switch (transformation_) {
     case Transformation::kNone:
@@ -1244,19 +1278,19 @@ std::string TransposePlan::ToString() const {
   }
   return absl::StrFormat(
       "elem_size=%d a_dims=%s b_dims=%s permutation=%s a_tiling=%s b_tiling=%s "
-      "lda=%s lda_tile=%s ldb=%s ldb_tile=%s loop_order=%s "
+      "lda=%s lda_tile=%s ldb=%s ldb_tile=%s "
       "outer_bs=[%d,%d] inner_bs=%d "
       "transformation=%s scratch_size=%d\n"
+      "chunk_loops:\n%s\n"
       "nodes:\n%s",
       elem_size_in_bytes_, absl::StrJoin(a_dims_, ","),
       absl::StrJoin(Permute(a_dims_, permutation_), ","),
       absl::StrJoin(permutation_, ","), absl::StrJoin(a_tiling_, ","),
       absl::StrJoin(b_tiling_, ","), absl::StrJoin(lda_, ","),
       absl::StrJoin(lda_tile_, ","), absl::StrJoin(ldb_, ","),
-      absl::StrJoin(ldb_tile_, ","),
-      absl::StrJoin(loop_order_, ",", format_loop_order), outer_block_elems_a_,
-      outer_block_elems_b_, inner_block_elems_, transformation_str,
-      scratch_size_, nodes_str);
+      absl::StrJoin(ldb_tile_, ","), outer_block_elems_a_, outer_block_elems_b_,
+      inner_block_elems_, transformation_str, scratch_size_, chunk_loops_str,
+      nodes_str);
 }
 
 bool TransposePlanCacheKey::operator==(
@@ -1340,7 +1374,7 @@ absl::StatusOr<std::shared_ptr<TransposePlan>> TransposePlanCache::GetOrCreate(
   }
 
   // Coalesce from slow-varying to fast-varying (outer to inner).
-  // loop_order_[0] is slowest.
+  // loops[0] is slowest.
   int write_pos = 0;
   for (int read_pos = 1; read_pos < loops.size(); ++read_pos) {
     Loop& outer = loops[write_pos];
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index aef51be791a04b..b0eccc5d37132b 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -175,6 +175,11 @@ class TransposePlan {
     // Number of parallel threads to use for this loop.
     int64_t parallelism;
 
+    // Iteration bounds for this chunk. Initially [0, full_iterations).
+    // After chunk splitting, each chunk's loops have narrowed bounds.
+    int64_t start = 0;  // Inclusive start of iteration range
+    int64_t end = 0;    // Exclusive end of iteration range
+
     bool operator==(const Loop& other) const;
   };
 
@@ -186,11 +191,17 @@ class TransposePlan {
   // Performs plan initialization that cannot fail.
   void Initialize();
 
-  void BuildPlanNodes(int thread_id, std::vector<Node>& output_nodes);
+  void BuildPlanNodes(int chunk_id, std::vector<Node>& nodes);
+
+  // Chooses a parallelism for each loop. Returns the number of separate chunks
+  // in the plan, and populates the `parallelism` field of each loop.
+  int ChooseParallelizationStrategy(std::vector<Loop>& loop_order);
 
-  // Chooses a parallelism for each loop. Returns the total number of parallel
-  // work units.
-  int ChooseParallelizationStrategy();
+  // Creates per-chunk loop vectors by splitting loop_order_ into per-chunk
+  // loops. Returns a vector of loop vectors, one per chunk. Each chunk's
+  // loops have their start/end bounds narrowed to represent that chunk's work.
+  std::vector<std::vector<Loop>> PartitionLoops(
+      int num_chunks, const std::vector<Loop>& loop_order);
 
   // The signature of ExecuteTyped uses char* pointers because we perform
   // address calculations with strides in bytes; the strides need not be
@@ -237,9 +248,9 @@ class TransposePlan {
   bool a_is_tiled_;
   bool b_is_tiled_;
 
-  // Order to traverse dimensions, from slowest-varying to fastest-varying.
-
-  std::vector<Loop> loop_order_;
+  // Per-chunk loop nests. Each loop nest has its own start/end bounds
+  // representing one chunk of the work.
+  std::vector<std::vector<Loop>> chunk_loops_;
 
   // Root nodes of the plan, i.e., pointing to the outermost loops in the loop
   // nest. The outer vector is indexed on the thread ID.
diff --git a/third_party/xla/xla/pjrt/transpose_test.cc b/third_party/xla/xla/pjrt/transpose_test.cc
index c136540eee1175..716f5d3bdff220 100644
--- a/third_party/xla/xla/pjrt/transpose_test.cc
+++ b/third_party/xla/xla/pjrt/transpose_test.cc
@@ -462,7 +462,10 @@ std::vector<TransposeTestCase> GetTransposeTestCases() {
                         /*permutation=*/{3, 1, 2, 0},
                         /*input_tiling=*/{},
                         /*output_tiling=*/{8, 128}),
-  };
+      TransposeTestCase{/*dims=*/{129, 1234567},
+                        /*permutation=*/{0, 1},
+                        /*input_tiling=*/{},
+                        /*output_tiling=*/{8, 128}}};
   return cases;
 }
 

From e32237efd3c4458cfab8f4b08462ac1715c51771 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Thu, 18 Dec 2025 15:24:36 -0800
Subject: [PATCH 554/753] [XLA:GPU] Split SortRewriter into two passes.

1. SortRewriter: Actually rewrites the sort and puts 1 elem scratch size
2. EstimateCubScratchSize: Talks to the runner to understand how much memory is needed for the scratch space.

PiperOrigin-RevId: 846436222
---
 third_party/xla/xla/service/gpu/BUILD         |  10 +-
 .../xla/xla/service/gpu/cublas_cudnn.cc       |   8 +
 .../xla/xla/service/gpu/cublas_cudnn.h        |   6 +
 .../xla/xla/service/gpu/gpu_compiler.cc       | 177 +++++-----
 .../xla/xla/service/gpu/gpu_compiler_test.cc  |   2 +-
 .../xla/xla/service/gpu/transforms/BUILD      |  46 ++-
 .../transforms/estimate_cub_scratch_size.cc   | 123 +++++++
 .../transforms/estimate_cub_scratch_size.h    |  57 ++++
 .../estimate_cub_scratch_size_test.cc         | 317 ++++++++++++++++++
 .../gpu/transforms/layout_assignment.cc       |   2 +-
 .../gpu/transforms/layout_assignment_test.cc  |   2 +-
 .../service/gpu/transforms/sort_rewriter.cc   |  78 +++--
 .../gpu/transforms/sort_rewriter_test.cc      |  35 +-
 13 files changed, 719 insertions(+), 144 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.cc
 create mode 100644 third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.h
 create mode 100644 third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size_test.cc

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 6c9258ca18a55c..5125bbc4b1fb92 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1565,12 +1565,8 @@ cc_library(
 
 cc_library(
     name = "gpu_compiler",
-    srcs = [
-        "gpu_compiler.cc",
-    ],
-    hdrs = [
-        "gpu_compiler.h",
-    ],
+    srcs = ["gpu_compiler.cc"],
+    hdrs = ["gpu_compiler.h"],
     tags = ["gpu"],
     deps = [
         ":alias_info",
@@ -1772,6 +1768,7 @@ cc_library(
         "//xla/service/gpu/transforms:dot_strength_reduction",
         "//xla/service/gpu/transforms:double_buffer_loop_unrolling",
         "//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter",
+        "//xla/service/gpu/transforms:estimate_cub_scratch_size",
         "//xla/service/gpu/transforms:explicit_collectives_group_async_wrapper",
         "//xla/service/gpu/transforms:explicit_stream_annotation_async_wrapper",
         "//xla/service/gpu/transforms:fusion_block_level_rewriter",
@@ -1848,6 +1845,7 @@ cc_library(
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:status",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base",
diff --git a/third_party/xla/xla/service/gpu/cublas_cudnn.cc b/third_party/xla/xla/service/gpu/cublas_cudnn.cc
index f0976d24bd3b27..ecaf28349a6b07 100644
--- a/third_party/xla/xla/service/gpu/cublas_cudnn.cc
+++ b/third_party/xla/xla/service/gpu/cublas_cudnn.cc
@@ -99,6 +99,8 @@ const absl::string_view kCudnnfMHASoftmaxDropoutBackwardCallTarget =
     "__cudnn$fmhaSoftmaxDropoutBackward";
 
 const absl::string_view kCubDeviceRadixSortTarget = "__cub$DeviceRadixSort";
+const absl::string_view kCubDeviceRadixSortUnassignedScratchSizeTarget =
+    "__cub$DeviceRadixSortUnassignedScratchSize";
 
 bool IsCustomCallToDnnConvolution(const HloInstruction& hlo) {
   if (hlo.opcode() != HloOpcode::kCustomCall) {
@@ -186,6 +188,12 @@ bool IsCubDeviceRadixSort(const HloInstruction& hlo) {
          hlo.custom_call_target() == kCubDeviceRadixSortTarget;
 }
 
+bool IsCubDeviceRadixSortNoScratchSize(const HloInstruction& hlo) {
+  return hlo.opcode() == HloOpcode::kCustomCall &&
+         hlo.custom_call_target() ==
+             kCubDeviceRadixSortUnassignedScratchSizeTarget;
+}
+
 absl::StatusOr<CudnnConvKind> GetCudnnConvKind(
     const HloCustomCallInstruction* instr) {
   absl::string_view target = instr->custom_call_target();
diff --git a/third_party/xla/xla/service/gpu/cublas_cudnn.h b/third_party/xla/xla/service/gpu/cublas_cudnn.h
index 02d712b21c473b..034ec33c9dc983 100644
--- a/third_party/xla/xla/service/gpu/cublas_cudnn.h
+++ b/third_party/xla/xla/service/gpu/cublas_cudnn.h
@@ -219,7 +219,13 @@ bool IsCustomCallToBlockScaledDot(const HloInstruction& hlo);
 // Reference: https://nvlabs.github.io/cub/
 extern const absl::string_view kCubDeviceRadixSortTarget;
 
+// CUB library call that allows to not specify the scratch size.
+// EstimateCubScratchSizePass will assign the correct scratch size.
+extern const absl::string_view kCubDeviceRadixSortUnassignedScratchSizeTarget;
+
 bool IsCubDeviceRadixSort(const HloInstruction& hlo);
+bool IsCubDeviceRadixSortNoScratchSize(const HloInstruction& hlo);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler.cc b/third_party/xla/xla/service/gpu/gpu_compiler.cc
index 36471b34d7a2ac..73e15a18b29f4b 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler.cc
@@ -232,6 +232,7 @@ limitations under the License.
 #include "xla/service/gpu/transforms/dot_strength_reduction.h"
 #include "xla/service/gpu/transforms/double_buffer_loop_unrolling.h"
 #include "xla/service/gpu/transforms/dynamic_slice_fusion_rewriter.h"
+#include "xla/service/gpu/transforms/estimate_cub_scratch_size.h"
 #include "xla/service/gpu/transforms/explicit_collectives_group_async_wrapper.h"
 #include "xla/service/gpu/transforms/explicit_stream_annotation_async_wrapper.h"
 #include "xla/service/gpu/transforms/fusion_wrapper.h"
@@ -321,6 +322,7 @@ limitations under the License.
 #include "tsl/platform/protobuf.h"  // IWYU pragma: keep
 #include "tsl/profiler/lib/scoped_annotation.h"
 #include "tsl/profiler/lib/traceme.h"
+#include "xla/tsl/platform/status_macros.h"
 
 #ifdef PLATFORM_GOOGLE
 #include "xla/hlo/experimental/auto_sharding/auto_sharding.h"
@@ -532,7 +534,7 @@ absl::Status SetHostDeviceType(HloInstruction* instr) {
   TF_ASSIGN_OR_RETURN(auto backend_config,
                       instr->backend_config<GpuBackendConfig>());
   backend_config.set_device_type(DEVICE_TYPE_HOST);
-  TF_RETURN_IF_ERROR(instr->set_backend_config(backend_config));
+  RETURN_IF_ERROR(instr->set_backend_config(backend_config));
   return absl::OkStatus();
 }
 
@@ -557,10 +559,9 @@ bool BackendConfigDeviceTypeIsHost(HloInstruction* instr) {
 }  // namespace
 
 absl::Status RunOptimizationPasses(
-    HloModule* hlo_module, stream_executor::StreamExecutor* stream_exec,
-    const Compiler::GpuTargetConfig& gpu_target_config,
+    HloModule* hlo_module, const Compiler::GpuTargetConfig& gpu_target_config,
     const AlgebraicSimplifierOptions& layout_insensitive_algsimp_opts,
-    absl::string_view platform_name) {
+    absl::string_view platform_name, bool enable_sort_rewriter) {
   const DebugOptions& debug_options = hlo_module->config().debug_options();
   se::GpuComputeCapability gpu_version =
       gpu_target_config.device_description.gpu_compute_capability();
@@ -617,19 +618,10 @@ absl::Status RunOptimizationPasses(
   // would do.
   pipeline.AddPass<PermutationSortExpander>();
 
-  // SortRewriter needs to ask the device how much scratch space is needed,
-  // which isn't feasible if we don't have a device.
-  if (hlo_module->config().debug_options().xla_gpu_enable_cub_radix_sort()) {
-    if (stream_exec != nullptr) {
-      pipeline.AddPass<SortRewriter>(gpu_target_config.device_description,
-                                     std::string{platform_name});
-    } else {
-      LOG(WARNING) << "Using fallback sort algorithm rather than SortRewriter, "
-                      "which will be slower at runtime. To avoid this, "
-                      "compile with a GPU present.";
-    }
+  if (enable_sort_rewriter) {
+    pipeline.AddPass<SortRewriter>(gpu_target_config.device_description,
+                                   std::string{platform_name});
   }
-
   // Comparison total order expander
   pipeline.AddPass<ComparisonExpander>(std::array{std::make_pair(BF16, F32)});
 
@@ -720,17 +712,10 @@ absl::Status RunOptimizationPasses(
   // DynamicPadder creates a stable KeyValue sort for dynamic reshapes.
   pipeline.AddPass<DynamicPadder>(dynamic_padder_options);
   // SortRewriter needs to run before StableSortExpander.
-  if (debug_options.xla_gpu_enable_cub_radix_sort()) {
-    if (stream_exec != nullptr) {
-      pipeline.AddPass<SortRewriter>(gpu_target_config.device_description,
-                                     gpu_target_config.platform_name);
-    } else {
-      LOG(WARNING) << "Using fallback sort algorithm rather than SortRewriter, "
-                      "which will be slower at runtime. To avoid this, "
-                      "compile with a GPU present.";
-    }
+  if (enable_sort_rewriter) {
+    pipeline.AddPass<SortRewriter>(gpu_target_config.device_description,
+                                   gpu_target_config.platform_name);
   }
-
   // Expand the sort op to support stable sorting if required.
   pipeline.AddPass<StableSortExpander>();
 
@@ -1146,17 +1131,17 @@ absl::Status RunFusionPasses(HloModule* hlo_module,
 
   HloPassPipeline pre_fusion("pre-fusion");
   pre_fusion.AddPass<AddTrackingSuffixToInstructionNames>();
-  TF_RETURN_IF_ERROR(pre_fusion.Run(hlo_module).status());
+  RETURN_IF_ERROR(pre_fusion.Run(hlo_module).status());
 
-  TF_RETURN_IF_ERROR(
-      FusionPipeline(hlo_module->config().debug_options(), shape_size_fn,
-                     alias_info, thread_pool, gpu_device_info, mlir_context)
-          .Run(hlo_module, {HloInstruction::kMainExecutionThread})
-          .status());
+  RETURN_IF_ERROR(FusionPipeline(hlo_module->config().debug_options(),
+                                 shape_size_fn, alias_info, thread_pool,
+                                 gpu_device_info, mlir_context)
+                      .Run(hlo_module, {HloInstruction::kMainExecutionThread})
+                      .status());
 
   if (VLOG_IS_ON(2)) {
     HloFusionStatsVisitor stats;
-    TF_RETURN_IF_ERROR(hlo_module->entry_computation()->Accept(&stats));
+    RETURN_IF_ERROR(hlo_module->entry_computation()->Accept(&stats));
     VLOG(2) << stats.ToString();
   }
 
@@ -1384,7 +1369,7 @@ absl::Status RunDynamicSliceFusionPasses(HloModule* hlo_module,
           });
       return hero_op.has_value();
     });
-    TF_RETURN_IF_ERROR(
+    RETURN_IF_ERROR(
         pipeline.Run(hlo_module, {HloInstruction::kMainExecutionThread})
             .status());
   }
@@ -1497,14 +1482,14 @@ absl::Status GpuCompiler::OptimizeHloModule(
         ClearBackendConfigDeviceType;
     pipeline.AddPass<HloHostDeviceTypeCallWrapper>(
         hlo_host_device_type_call_wrapper_options);
-    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+    RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
-  TF_RETURN_IF_ERROR(RunPreSPMDPartitionerPasses(hlo_module));
+  RETURN_IF_ERROR(RunPreSPMDPartitionerPasses(hlo_module));
   // Set max_windowed_einsum_iteration to slice_size, as there will be
   // significant overhead when scaled beyond the maximum size of the
   // fast-interconnect domain.
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       RunSPMDPasses(hlo_module, gpu_target_config, alias_info,
                     layout_insensitive_algsimp_opts,
                     /*max_windowed_einsum_iteration=*/options.slice_size));
@@ -1514,7 +1499,7 @@ absl::Status GpuCompiler::OptimizeHloModule(
     pipeline.AddPass<HostComputeAsyncifier>(BackendConfigDeviceTypeIsHost);
     pipeline.AddPass<HostOffloadingPrepare>(
         HostOffloadingPrepare::Rewrite::kConvertToCustomCall);
-    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+    RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
   // Dump the HLO module after SPMD partitioning. There should be no more Python
@@ -1523,12 +1508,23 @@ absl::Status GpuCompiler::OptimizeHloModule(
   TF_ASSIGN_OR_RETURN(
       const stream_executor::Platform* platform,
       stream_executor::PlatformManager::PlatformWithId(PlatformId()));
-  TF_RETURN_IF_ERROR(
-      RunOptimizationPasses(hlo_module, stream_exec, gpu_target_config,
-                            layout_insensitive_algsimp_opts, platform->Name()));
+
+  // SortRewriter needs to ask the device how much scratch space is needed,
+  // which isn't feasible if we don't have a device.
+  bool enable_sort_rewriter =
+      hlo_module->config().debug_options().xla_gpu_enable_cub_radix_sort();
+  if (stream_exec == nullptr && !options.early_exit_with_layouts) {
+    LOG(WARNING) << "Using fallback sort algorithm rather than SortRewriter, "
+                    "which will be slower at runtime. To avoid this, "
+                    "compile with a GPU present.";
+    enable_sort_rewriter = false;
+  }
+  RETURN_IF_ERROR(RunOptimizationPasses(
+      hlo_module, gpu_target_config, layout_insensitive_algsimp_opts,
+      platform->Name(), enable_sort_rewriter));
   se::GpuComputeCapability gpu_version =
       device_description.gpu_compute_capability();
-  TF_RETURN_IF_ERROR(RunCollectiveOptimizationPasses(
+  RETURN_IF_ERROR(RunCollectiveOptimizationPasses(
       hlo_module, options, layout_insensitive_algsimp_opts, gpu_version,
       platform->VisibleDeviceCount(), pointer_size_));
 
@@ -1540,17 +1536,17 @@ absl::Status GpuCompiler::OptimizeHloModule(
     TF_ASSIGN_OR_RETURN(dnn_version, GetDnnVersionInfo(stream_exec));
   }
 
-  TF_RETURN_IF_ERROR(OptimizeHloConvolutionCanonicalization(
+  RETURN_IF_ERROR(OptimizeHloConvolutionCanonicalization(
       hlo_module, gpu_version, dnn_version,
       device_description.runtime_version()));
 
-  TF_RETURN_IF_ERROR(RunLayoutAssignmentPasses(
-      hlo_module, gpu_version, dnn_version, device_description));
+  RETURN_IF_ERROR(RunLayoutAssignmentPasses(hlo_module, gpu_version,
+                                            dnn_version, device_description));
   if (options.early_exit_with_layouts) {
     return absl::OkStatus();
   }
 
-  TF_RETURN_IF_ERROR(RunLayoutNormalizationPasses(
+  RETURN_IF_ERROR(RunLayoutNormalizationPasses(
       hlo_module,
       GetAlgebraicSimplifierOptions(
           AlgebraicSimplifierMode::kLayoutNormalization,
@@ -1559,22 +1555,22 @@ absl::Status GpuCompiler::OptimizeHloModule(
       gpu_version));
 
   // Run target-specific HLO optimization passes after layout assignment.
-  TF_RETURN_IF_ERROR(OptimizeHloPostLayoutAssignment(
+  RETURN_IF_ERROR(OptimizeHloPostLayoutAssignment(
       hlo_module, stream_exec, options, gpu_target_config, alias_info,
       thread_pool.get_mutable()));
 
   // This is a "low effort, high impact" fusion that should be run first.
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       RunDynamicSliceFusionPasses(hlo_module, /*platform_id=*/PlatformId()));
 
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       RunFusionPasses(hlo_module, gpu_target_config, thread_pool.get_mutable(),
                       ShapeSizeBytesFunction(), alias_info, &mlir_context_));
-  TF_RETURN_IF_ERROR(RunPostFusionPasses(hlo_module, device_description,
-                                         alias_info, pointer_size_, options,
-                                         &mlir_context_));
-  TF_RETURN_IF_ERROR(RunAsyncCollectivesConversionPasses(hlo_module));
-  TF_RETURN_IF_ERROR(RunPostFusionSimplificationPasses(
+  RETURN_IF_ERROR(RunPostFusionPasses(hlo_module, device_description,
+                                      alias_info, pointer_size_, options,
+                                      &mlir_context_));
+  RETURN_IF_ERROR(RunAsyncCollectivesConversionPasses(hlo_module));
+  RETURN_IF_ERROR(RunPostFusionSimplificationPasses(
       hlo_module,
       GetAlgebraicSimplifierOptions(
           AlgebraicSimplifierMode::kPostFusionSimplification,
@@ -1582,10 +1578,10 @@ absl::Status GpuCompiler::OptimizeHloModule(
           gpu_target_config.platform_name == "ROCM"),
       gpu_version, gpu_target_config));
 
-  TF_RETURN_IF_ERROR(RunPostFusionVerificationPasses(
+  RETURN_IF_ERROR(RunPostFusionVerificationPasses(
       hlo_module, stream_exec, options, gpu_target_config, &mlir_context_));
 
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       RunCollectiveScheduleLinearizerPasses(hlo_module, stream_exec));
 
   {
@@ -1596,17 +1592,17 @@ absl::Status GpuCompiler::OptimizeHloModule(
         DebugOptions::DETECTION_MODE_NONE) {
       pipeline.AddPass<UnstableReductionDetector>();
     }
-    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+    RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
-  TF_RETURN_IF_ERROR(RunAsyncDotPasses(hlo_module));
+  RETURN_IF_ERROR(RunAsyncDotPasses(hlo_module));
   {
     HloPassPipeline pipeline("autotune-fusion-emitters");
     pipeline.AddPass<FusionWrapper>(gpu_target_config.device_description);
-    TF_RETURN_IF_ERROR(AddFusionAutotuningPass(
+    RETURN_IF_ERROR(AddFusionAutotuningPass(
         &pipeline, hlo_module, options, thread_pool.get_mutable(), stream_exec,
         &gpu_target_config, ShapeSizeBytesFunction()));
-    TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
+    RETURN_IF_ERROR(pipeline.Run(hlo_module).status());
   }
 
   return absl::OkStatus();
@@ -1811,7 +1807,8 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
     // annotations, this pass will add the annotations.
     pipeline.AddPass<SubByteNormalization>(
         SubByteNormalization::SET_ELEMENT_SIZE);
-    TF_RETURN_IF_ERROR(
+    pipeline.AddPass<EstimateCubScratchSize>(gpu_target_config.platform_name);
+    RETURN_IF_ERROR(
         pipeline.Run(hlo_module, {HloInstruction::kMainExecutionThread})
             .status());
   }
@@ -1832,7 +1829,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   // f32).
   add_float_normalization(pipeline);
 
-  TF_RETURN_IF_ERROR(AddGemmFusionAutotuningPasses(
+  RETURN_IF_ERROR(AddGemmFusionAutotuningPasses(
       &pipeline, hlo_module, autotune_config, thread_pool,
       options.key_value_store,
       gpu_target_config.device_description.runtime_version(), stream_exec));
@@ -1848,7 +1845,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
   AddGemmRewriterPasses(pipeline, debug_options, gpu_version,
                         gpu_target_config.device_description.runtime_version());
 
-  TF_RETURN_IF_ERROR(AddConvAndGemmAutotuningPasses(
+  RETURN_IF_ERROR(AddConvAndGemmAutotuningPasses(
       &pipeline, gpu_version, options, hlo_module, autotune_config, thread_pool,
       stream_exec, &gpu_target_config));
 
@@ -1930,7 +1927,7 @@ absl::Status GpuCompiler::OptimizeHloPostLayoutAssignment(
       "end-of-post-layout_assignment");
 #endif  // NDEBUG
 
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       pipeline.Run(hlo_module, {HloInstruction::kMainExecutionThread})
           .status());
   return absl::OkStatus();
@@ -1947,7 +1944,7 @@ GpuCompiler::GetTargetConfig(const Compiler::CompileOptions& options,
   }
   if (!debug_opts.xla_gpu_target_config_filename().empty()) {
     std::string gpu_target_config_string;
-    TF_RETURN_IF_ERROR(tsl::ReadFileToString(
+    RETURN_IF_ERROR(tsl::ReadFileToString(
         tsl::Env::Default(), debug_opts.xla_gpu_target_config_filename(),
         &gpu_target_config_string));
     stream_executor::GpuTargetConfigProto gpu_target_config_proto;
@@ -1990,7 +1987,7 @@ absl::StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
   }
 
   const DebugOptions debug_opts = module->config().debug_options();
-  TF_RETURN_IF_ERROR(LoadAutotuneResultsFromFile(debug_opts));
+  RETURN_IF_ERROR(LoadAutotuneResultsFromFile(debug_opts));
   bool is_deviceless = options.gpu_target_config.has_value() ||
                        !debug_opts.xla_gpu_target_config_filename().empty();
 
@@ -2012,15 +2009,15 @@ absl::StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
   const se::DeviceDescription& device_description =
       gpu_target_config.device_description;
   std::unique_ptr<GpuAliasInfo> alias_info = GetAliasInfo(device_description);
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       OptimizeHloModule(module.get(), is_deviceless ? nullptr : stream_exec,
                         options, gpu_target_config, alias_info.get()));
   if (options.early_exit_with_layouts) {
     return std::move(module);
   }
 
-  TF_RETURN_IF_ERROR(RunPreSchedulingCopyInsertion(*module, device_description,
-                                                   alias_info.get()));
+  RETURN_IF_ERROR(RunPreSchedulingCopyInsertion(*module, device_description,
+                                                alias_info.get()));
 
   uint64_t end_usecs = tsl::Env::Default()->NowMicros();
 
@@ -2037,9 +2034,8 @@ absl::StatusOr<std::unique_ptr<HloModule>> GpuCompiler::RunHloPasses(
       AutotuneConfig autotune_config,
       AutotuneConfig::FromDebugOptions(device_config, debug_opts));
   if (!is_deviceless) {
-    TF_RETURN_IF_ERROR(
-        AutotunerUtil::SerializeAutotuneResults(&autotune_results));
-    TF_RETURN_IF_ERROR(SerializeAutotuneResultsToFile(debug_opts));
+    RETURN_IF_ERROR(AutotunerUtil::SerializeAutotuneResults(&autotune_results));
+    RETURN_IF_ERROR(SerializeAutotuneResultsToFile(debug_opts));
   }
   const std::optional<std::string> optimized_fingerprint =
       MaybeUploadOptimizedGpuSymbols(module.get(), autotune_results);
@@ -2100,7 +2096,7 @@ absl::Status RunPostSchedulingCopyInsertion(HloModule* module,
           ? kRegionBasedLiveRangeAnalysisLimit
           : 0;
   CopyInsertion copy_insertion(alias_info, kUseRegionBasedLiveRangeAnalysis);
-  TF_RETURN_IF_ERROR(copy_insertion.RemoveUnnecessaryCopies(module));
+  RETURN_IF_ERROR(copy_insertion.RemoveUnnecessaryCopies(module));
 
   // Stash away the schedule during copy insertion, to avoid validation failures
   // while the module is in flux.
@@ -2111,10 +2107,10 @@ absl::Status RunPostSchedulingCopyInsertion(HloModule* module,
   // whether it is legal to remove a copy. However, copies in the graph may be
   // necessary for other reason such as preventing a constant from being live
   // out of the graph. So run AddSpecialCaseCopies to re-insert these copies.
-  TF_RETURN_IF_ERROR(copy_insertion.CopyInsertion::AddSpecialCaseCopies(
+  RETURN_IF_ERROR(copy_insertion.CopyInsertion::AddSpecialCaseCopies(
       module, /*execution_threads=*/{}, ShouldAddCopyForCollectiveMemorySpace));
 
-  TF_RETURN_IF_ERROR(HloDCE().Run(module).status());
+  RETURN_IF_ERROR(HloDCE().Run(module).status());
 
   // The passes above can add and remove copies, update the schedule to
   // account for these transformations. Newly added instructions will be
@@ -2123,8 +2119,8 @@ absl::Status RunPostSchedulingCopyInsertion(HloModule* module,
   // Update and restore the schedule. The saved schedule has a reference to the
   // updated HLO module. The saved schedule needs to be updated before restoring
   // it to the module to avoid validation failures.
-  TF_RETURN_IF_ERROR(saved_schedule.Update());
-  TF_RETURN_IF_ERROR(module->set_schedule(std::move(saved_schedule)));
+  RETURN_IF_ERROR(saved_schedule.Update());
+  RETURN_IF_ERROR(module->set_schedule(std::move(saved_schedule)));
 
   return absl::OkStatus();
 }
@@ -2469,9 +2465,9 @@ absl::StatusOr<GpuCompiler::BackendCompileResult> GpuCompiler::CompileAndLink(
               << current_cache.entries_size() << " cached kernels.";
     }
     if (!binaries_to_cache.empty()) {
-      TF_RETURN_IF_ERROR(
-          UpdateDiskKernelCache(resolved_path, /*do_append=*/cache_file_exists,
-                                current_cache, binaries_to_cache));
+      RETURN_IF_ERROR(UpdateDiskKernelCache(resolved_path,
+                                            /*do_append=*/cache_file_exists,
+                                            current_cache, binaries_to_cache));
     }
   }
 
@@ -2513,15 +2509,15 @@ GpuCompiler::CompileToBackendResult(
     const se::DeviceDescription& gpu_device_info) {
   tsl::profiler::TraceMe traceme("CompileToBackendResult");
   std::unique_ptr<GpuAliasInfo> alias_info = GetAliasInfo(gpu_device_info);
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       RunPreSchedulingPasses(module, gpu_device_info, alias_info.get()));
   TF_ASSIGN_OR_RETURN(ScheduleMetadata schedule_metadata,
                       ScheduleGpuModule(module, pointer_size_, gpu_device_info,
                                         &mlir_context_, alias_info.get()));
   HloPassPipeline pipeline("scheduled-gpu-module");
   AddHloVerifier(&pipeline);
-  TF_RETURN_IF_ERROR(pipeline.Run(module).status());
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(pipeline.Run(module).status());
+  RETURN_IF_ERROR(
       RunPostSchedulingPipelines(module, schedule_metadata.scheduler_mem_limit,
                                  gpu_device_info, alias_info.get()));
 
@@ -2640,8 +2636,8 @@ absl::StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
 
   BinaryMap dnn_compiled_graphs;
   if (stream_exec) {
-    TF_RETURN_IF_ERROR(RunCudnnCompilerPasses(module.get(), stream_exec,
-                                              &dnn_compiled_graphs));
+    RETURN_IF_ERROR(RunCudnnCompilerPasses(module.get(), stream_exec,
+                                           &dnn_compiled_graphs));
   }
 
   const DebugOptions& debug_opts = module->config().debug_options();
@@ -2671,7 +2667,7 @@ absl::StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
     cost_analysis_options.set_bytes_per_second(
         gpu_device_info.memory_bandwidth());
     GpuHloCostAnalysis cost_analysis(cost_analysis_options, gpu_device_info);
-    TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
+    RETURN_IF_ERROR(module->entry_computation()->Accept(&cost_analysis));
     VLOG(1) << absl::StrFormat(
         "#module=%s,program_id=%d# estimated memory r+w %s", module->name(),
         module->unique_id(),
@@ -2995,7 +2991,7 @@ absl::Status GpuCompiler::RunPostSchedulingPipelines(
     const se::DeviceDescription& gpu_device_info,
     const GpuAliasInfo* alias_info) {
   tsl::profiler::TraceMe traceme("RunPostSchedulingPipelines");
-  TF_RETURN_IF_ERROR(RunPostSchedulingCopyInsertion(module, alias_info));
+  RETURN_IF_ERROR(RunPostSchedulingCopyInsertion(module, alias_info));
   HloPassPipeline main_pipeline("post-scheduling-passes");
 
   // Pipeline for async -> sync conversion on for non-overlapped async ops.
@@ -3080,7 +3076,7 @@ absl::Status GpuCompiler::LoadAutotuneResultsFromFile(
     absl::call_once(once, [&file_path, &status] {
       status = AutotunerUtil::LoadAutotuneResultsFromFile(file_path);
     });
-    TF_RETURN_IF_ERROR(status);
+    RETURN_IF_ERROR(status);
   }
   return absl::OkStatus();
 }
@@ -3093,8 +3089,7 @@ absl::Status GpuCompiler::SerializeAutotuneResultsToFile(
       !file_path.empty()) {
     // Warning: This writes the autotune results at every compilation,
     // possibly multiple times per process.
-    TF_RETURN_IF_ERROR(
-        AutotunerUtil::SerializeAutotuneResultsToFile(file_path));
+    RETURN_IF_ERROR(AutotunerUtil::SerializeAutotuneResultsToFile(file_path));
   }
   return absl::OkStatus();
 }
@@ -3167,7 +3162,7 @@ GpuCompiler::LoadExecutableFromAotResult(
       hlo_module->config()
           .debug_options()
           .xla_gpu_enable_llvm_module_compilation_parallelism()) {
-    TF_RETURN_IF_ERROR(LoadCache(ir_emitter_context, cache_file_path));
+    RETURN_IF_ERROR(LoadCache(ir_emitter_context, cache_file_path));
   }
 
   ThunkEmitter thunk_emitter(&ir_emitter_context);
diff --git a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
index 855267bd792c9a..9e9b1289e1ca2f 100644
--- a/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
+++ b/third_party/xla/xla/service/gpu/gpu_compiler_test.cc
@@ -2108,7 +2108,7 @@ ENTRY %main {
   EXPECT_CALL(mock_log,
               Log(absl::LogSeverity::kWarning, EndsWith("/gpu_compiler.cc"),
                   StartsWith("Using fallback sort algorithm")))
-      .Times(2);
+      .Times(1);
 
   // StartCapturingLogs has to be called even if we expect not to capture any
   // logs.
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index d48d3ca7ad9156..8b3006db1b7a11 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -2602,12 +2602,10 @@ cc_library(
     srcs = ["sort_rewriter.cc"],
     hdrs = ["sort_rewriter.h"],
     deps = [
-        "//xla:comparison_util",
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/backends/gpu/runtime:cub_sort_thunk",
         "//xla/hlo/ir:hlo",
         "//xla/hlo/pass:hlo_pass",
         "//xla/service:pattern_matcher",
@@ -2635,6 +2633,7 @@ xla_test(
         "test_migrated_to_hlo_runner_pjrt",
     ],
     deps = [
+        ":estimate_cub_scratch_size",
         ":sort_rewriter",
         "//xla:error_spec",
         "//xla:shape_util",
@@ -2655,6 +2654,49 @@ xla_test(
     ],
 )
 
+cc_library(
+    name = "estimate_cub_scratch_size",
+    srcs = ["estimate_cub_scratch_size.cc"],
+    hdrs = ["estimate_cub_scratch_size.h"],
+    deps = [
+        "//xla:shape_util",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla/backends/gpu/runtime:cub_sort_thunk",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/pass:hlo_pass",
+        "//xla/service/gpu:cublas_cudnn",
+        "//xla/tsl/platform:status_macros",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_test(
+    name = "estimate_cub_scratch_size_test",
+    srcs = ["estimate_cub_scratch_size_test.cc"],
+    backends = ["h100"],
+    tags = [
+        "cuda-only",
+        "test_migrated_to_hlo_runner_pjrt",
+    ],
+    deps = [
+        ":estimate_cub_scratch_size",
+        "//xla:xla_data_proto_cc",
+        "//xla/service:platform_util",
+        "//xla/stream_executor:platform",
+        "//xla/tests:hlo_pjrt_interpreter_reference_mixin",
+        "//xla/tests:hlo_pjrt_test_base",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "stream_attribute_annotator",
     srcs = ["stream_attribute_annotator.cc"],
diff --git a/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.cc b/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.cc
new file mode 100644
index 00000000000000..63dd80dc641cae
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.cc
@@ -0,0 +1,123 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/estimate_cub_scratch_size.h"
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/cub_sort_thunk.h"
+#include "xla/hlo/ir/hlo_casting_utils.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/gpu/cublas_cudnn.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/util.h"
+#include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
+
+namespace xla::gpu {
+
+// Rewrites a single sort instruction with a custom call.
+absl::StatusOr<bool> EstimateCubScratchSize::RunOnInstruction(
+    HloCustomCallInstruction* custom_call) {
+  CHECK_EQ(custom_call->custom_call_target(),
+           kCubDeviceRadixSortUnassignedScratchSizeTarget);
+  const Shape& key_shape = custom_call->operand(0)->shape();
+  PrimitiveType key_type = key_shape.element_type();
+  std::optional<PrimitiveType> value_type;
+  if (custom_call->operand_count() == 2) {
+    value_type = custom_call->operand(1)->shape().element_type();
+  }
+
+  ASSIGN_OR_RETURN(
+      std::unique_ptr<CubSortRunnerInterface> runner,
+      CubSortRunnerInterface::Create(key_type, value_type, platform_name_));
+
+  int64_t num_elements = Product(key_shape.dimensions());
+  // It is assumed that the sorting happens on the innermost dimension.
+  int64_t batch_size = num_elements / key_shape.dimensions().back();
+
+  ASSIGN_OR_RETURN(int64_t scratch_size,
+                   runner->GetScratchSize(num_elements, batch_size));
+
+  // Align and increase scratch size to fit the offsets.
+  if (batch_size > 1) {
+    scratch_size += sizeof(int) - scratch_size % sizeof(int);
+    scratch_size += (batch_size + 1) * sizeof(int);
+  }
+
+  // Update the custom call.
+  Shape new_shape = custom_call->shape();
+  new_shape.mutable_tuple_shapes()->back() =
+      ShapeUtil::MakeShape(U8, {scratch_size});
+  HloInstruction* new_custom_call =
+      custom_call->AddInstruction(HloInstruction::CreateCustomCall(
+          new_shape, absl::MakeSpan(custom_call->operands()),
+          kCubDeviceRadixSortTarget));
+  new_custom_call->SetupDerivedInstruction(custom_call);
+  RETURN_IF_ERROR(custom_call->parent()->ReplaceInstructionWithDifferentShape(
+      custom_call, new_custom_call));
+  return true;
+}
+
+// Rewrites the sorts in the given computation into calls to CUB.
+absl::StatusOr<bool> EstimateCubScratchSize::RunOnComputation(
+    HloComputation* computation) {
+  std::vector<HloCustomCallInstruction*> custom_calls;
+  for (auto* inst : computation->instructions()) {
+    if (auto custom_call = DynCast<HloCustomCallInstruction>(inst)) {
+      if (custom_call->custom_call_target() ==
+          kCubDeviceRadixSortUnassignedScratchSizeTarget) {
+        custom_calls.push_back(custom_call);
+      }
+    }
+  }
+  bool changed = false;
+  for (auto* call : custom_calls) {
+    ASSIGN_OR_RETURN(bool result, RunOnInstruction(call));
+    changed |= result;
+  }
+  return changed;
+}
+
+// Replace compatible sort operations with custom calls.
+absl::StatusOr<bool> EstimateCubScratchSize::RunImpl(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  XLA_VLOG_LINES(
+      3, "EstimateCubScratchSize::RunImpl(), before:\n" + module->ToString());
+  bool changed = false;
+  for (HloComputation* computation :
+       module->MakeNonfusionComputations(execution_threads)) {
+    ASSIGN_OR_RETURN(bool result, RunOnComputation(computation));
+    changed |= result;
+  }
+  XLA_VLOG_LINES(
+      3, "EstimateCubScratchSize::RunImpl(), after:\n" + module->ToString());
+  return changed;
+}
+
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.h b/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.h
new file mode 100644
index 00000000000000..a8207dbef54736
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size.h
@@ -0,0 +1,57 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_SERVICE_GPU_TRANSFORMS_ESTIMATE_CUB_SCRATCH_SIZE_H_
+#define XLA_SERVICE_GPU_TRANSFORMS_ESTIMATE_CUB_SCRATCH_SIZE_H_
+
+#include <string>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/pass/hlo_pass_interface.h"
+
+namespace xla::gpu {
+
+// Updates the scratch size of CUB sort custom calls to match the actual
+// scratch size. Also changes the custom call target from
+// kCubDeviceRadixSortUnassignedScratchSizeTarget to kCubDeviceRadixSortTarget.
+class EstimateCubScratchSize : public HloModulePass {
+ public:
+  explicit EstimateCubScratchSize(std::string platform_name)
+      : platform_name_(platform_name) {}
+
+  absl::string_view name() const override {
+    return "estimate-cub-scratch-size";
+  }
+
+ protected:
+  absl::StatusOr<bool> RunOnInstruction(HloCustomCallInstruction* custom_call);
+  absl::StatusOr<bool> RunOnComputation(HloComputation* computation);
+
+  absl::StatusOr<bool> RunImpl(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+
+ private:
+  std::string platform_name_;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_SERVICE_GPU_TRANSFORMS_ESTIMATE_CUB_SCRATCH_SIZE_H_
diff --git a/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size_test.cc b/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size_test.cc
new file mode 100644
index 00000000000000..499121a92b4c35
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/estimate_cub_scratch_size_test.cc
@@ -0,0 +1,317 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/estimate_cub_scratch_size.h"
+
+#include <string>
+
+#include <gtest/gtest.h>
+#include "absl/strings/string_view.h"
+#include "xla/service/platform_util.h"
+#include "xla/stream_executor/platform.h"
+#include "xla/tests/hlo_pjrt_interpreter_reference_mixin.h"
+#include "xla/tests/hlo_pjrt_test_base.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
+
+namespace xla::gpu {
+namespace {
+
+class EstimateCubScratchSizeTest
+    : public HloPjRtInterpreterReferenceMixin<HloPjRtTestBase> {
+ public:
+  void SetUp() override {
+    HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>::SetUp();
+    ASSERT_OK_AND_ASSIGN(test_platform_, PlatformUtil::GetPlatform("gpu"));
+  }
+
+  void RunAndCheck(absl::string_view hlo, absl::string_view expected) {
+    RunAndFilecheckHloRewrite(
+        hlo, EstimateCubScratchSize(GetTestPlatform()->Name()), expected);
+  }
+
+  const stream_executor::Platform* GetTestPlatform() const {
+    return test_platform_;
+  }
+
+ private:
+  stream_executor::Platform* test_platform_ = nullptr;
+};
+
+// Basic sort: ascending.
+TEST_F(EstimateCubScratchSizeTest, U32_F32) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u32[1000] parameter(0)
+      %values = f32[1000] parameter(1)
+      %custom-call = (u32[1000]{0}, f32[1000]{0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = u32[1000]{0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u32[1000]{0}, f32[1000]{0}, u8[1]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, F32) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = f32[1000] parameter(0)
+      %custom-call = (f32[1000]{0}, u8[1]{0})
+        custom-call(%keys),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = f32[1000]{0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (f32[1000]{0}, u8[1]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, S32_S32) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = s32[1000] parameter(0)
+      %values = s32[1000] parameter(1)
+      %custom-call = (s32[1000]{0}, s32[1000]{0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = s32[1000]{0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (s32[1000]{0}, s32[1000]{0}, u8[1]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, F32_Descending) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = f32[1000] parameter(0)
+      %custom-call = (f32[1000]{0}, u8[1]{0})
+        custom-call(%keys),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":true}
+      ROOT %t = f32[1000]{0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (f32[1000]{0}, u8[1]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":true}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, F32_Rank3) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = f32[10,10,10] parameter(0)
+      %custom-call = (f32[10,10,10]{2,1,0}, u8[1]{0})
+        custom-call(%keys),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = f32[10,10,10]{2,1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (f32[10,10,10]{2,1,0}, u8[4756]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, F32_Rank2) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = f32[10,100] parameter(0)
+      %custom-call = (f32[10,100]{1,0}, u8[1]{0})
+        custom-call(%keys),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = f32[10,100]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (f32[10,100]{1,0}, u8[4396]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U16_F16_Descending) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u16[16,128] parameter(0)
+      %values = f16[16,128] parameter(1)
+      %custom-call = (u16[16,128]{1,0}, f16[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":true}
+      ROOT %t = u16[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u16[16,128]{1,0}, f16[16,128]{1,0}, u8[8516]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":true}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U32_F32_Rank2) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u32[16,128] parameter(0)
+      %values = f32[16,128] parameter(1)
+      %custom-call = (u32[16,128]{1,0}, f32[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = u32[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u32[16,128]{1,0}, f32[16,128]{1,0}, u8[16708]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U64_F64_Descending) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u64[16,128] parameter(0)
+      %values = f64[16,128] parameter(1)
+      %custom-call = (u64[16,128]{1,0}, f64[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":true}
+      ROOT %t = u64[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u64[16,128]{1,0}, f64[16,128]{1,0}, u8[33092]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":true}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U16_BF16) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u16[16,128] parameter(0)
+      %values = bf16[16,128] parameter(1)
+      %custom-call = (u16[16,128]{1,0}, bf16[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = u16[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u16[16,128]{1,0}, bf16[16,128]{1,0}, u8[8516]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U16_BF16_Descending) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u16[16,128] parameter(0)
+      %values = bf16[16,128] parameter(1)
+      %custom-call = (u16[16,128]{1,0}, bf16[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":true}
+      ROOT %t = u16[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u16[16,128]{1,0}, bf16[16,128]{1,0}, u8[8516]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":true}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U16_F16) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u16[16,128] parameter(0)
+      %values = f16[16,128] parameter(1)
+      %custom-call = (u16[16,128]{1,0}, f16[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = u16[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u16[16,128]{1,0}, f16[16,128]{1,0}, u8[8516]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U32_F32_Rank2_Descending) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u32[16,128] parameter(0)
+      %values = f32[16,128] parameter(1)
+      %custom-call = (u32[16,128]{1,0}, f32[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":true}
+      ROOT %t = u32[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u32[16,128]{1,0}, f32[16,128]{1,0}, u8[16708]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":true}
+  )");
+}
+
+TEST_F(EstimateCubScratchSizeTest, U64_F64) {
+  const char hlo[] = R"(
+    HloModule m
+    ENTRY main {
+      %keys = u64[16,128] parameter(0)
+      %values = f64[16,128] parameter(1)
+      %custom-call = (u64[16,128]{1,0}, f64[16,128]{1,0}, u8[1]{0})
+        custom-call(%keys, %values),
+        custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize",
+        backend_config={"descending":false}
+      ROOT %t = u64[16,128]{1,0} get-tuple-element(%custom-call), index=0
+  })";
+  RunAndCheck(hlo, R"(
+    // CHECK: (u64[16,128]{1,0}, f64[16,128]{1,0}, u8[33092]{0}) custom-call
+    // CHECK-SAME: custom_call_target="__cub$DeviceRadixSort",
+    // CHECK-SAME: backend_config={"descending":false}
+  )");
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment.cc b/third_party/xla/xla/service/gpu/transforms/layout_assignment.cc
index 93e8b37be9a8c5..87a09a8dd4ab89 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment.cc
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment.cc
@@ -523,7 +523,7 @@ absl::Status GpuLayoutAssignment::AddBackendConstraints(
       TF_RETURN_IF_ERROR(SetOperandLayout(op0_shape, instruction, 0));
       TF_RETURN_IF_ERROR(SetInstructionLayout(output_shape, instruction));
     } else if ((HloPredicateIsOp<HloOpcode::kSort>(instruction) ||
-                IsCubDeviceRadixSort(*instruction)) &&
+                IsCubDeviceRadixSortNoScratchSize(*instruction)) &&
                instruction->operand(0)->shape().dimensions().size() > 1) {
       // Make sure that all the operands and the output(s) have the same layout.
       Shape keys_shape = instruction->operand(0)->shape();
diff --git a/third_party/xla/xla/service/gpu/transforms/layout_assignment_test.cc b/third_party/xla/xla/service/gpu/transforms/layout_assignment_test.cc
index 161ac37dcc644a..d13d61754ed6dd 100644
--- a/third_party/xla/xla/service/gpu/transforms/layout_assignment_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/layout_assignment_test.cc
@@ -417,7 +417,7 @@ TEST_F(LayoutAssignmentTest,
     values = f32[2,3]{1,0} parameter(0)
     transpose = f32[3,2]{1,0} transpose(values), dimensions={1,0}
     ROOT sort = (f32[3,2]{1,0}, f32[3,2]{1,0}, u8[128]{0})
-        custom-call(keys, transpose), custom_call_target="__cub$DeviceRadixSort"
+        custom-call(keys, transpose), custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize"
   })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/third_party/xla/xla/service/gpu/transforms/sort_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/sort_rewriter.cc
index c4399f3964123f..4ed5d493ab093b 100644
--- a/third_party/xla/xla/service/gpu/transforms/sort_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/transforms/sort_rewriter.cc
@@ -28,8 +28,6 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
-#include "xla/backends/gpu/runtime/cub_sort_thunk.h"
-#include "xla/comparison_util.h"
 #include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
@@ -49,8 +47,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 
-namespace xla {
-namespace gpu {
+namespace xla::gpu {
 namespace {
 
 namespace m = match;
@@ -227,12 +224,44 @@ std::optional<SortComputationAnalysis> AnalyzeSortOp(
       sort_analysis->sort_order, sort_key_type, sort_value_type};
 }
 
-// Create runner for CUB sort operation.
-absl::StatusOr<std::unique_ptr<CubSortRunnerInterface>> CreateRunner(
-    const SortComputationAnalysis& sort_analysis,
-    absl::string_view platform_name) {
-  return CubSortRunnerInterface::Create(
-      sort_analysis.key_type, sort_analysis.value_type, platform_name);
+// Returns whether the sort operation is supported by CUB.
+bool AreOperandTypesSupportedByCub(
+    const SortComputationAnalysis& sort_analysis) {
+  PrimitiveType key_type = sort_analysis.key_type;
+  std::optional<PrimitiveType> value_type = sort_analysis.value_type;
+  if (!value_type.has_value()) {
+    switch (key_type) {
+      case BF16:
+      case F16:
+      case F32:
+      case F64:
+      case S8:
+      case S16:
+      case S32:
+      case S64:
+      case U8:
+      case U16:
+      case U32:
+      case U64:
+        return true;
+      default:
+        return false;
+    }
+  }
+  auto value_bitwidth = primitive_util::BitWidth(*value_type);
+  switch (key_type) {
+    case U8:
+    case U16:
+    case U32:
+    case U64:
+    case F32:
+      return value_bitwidth == 16 || value_bitwidth == 32 ||
+             value_bitwidth == 64;
+    case S32:
+      return value_bitwidth == 32;
+    default:
+      return false;
+  }
 }
 
 // Restore the result shape after sorting a pair of tensors.
@@ -456,7 +485,7 @@ bool IsCubCompatibleSort(const se::DeviceDescription& device_description,
     VLOG(2) << "Only simple compare computations are supported";
     return false;
   }
-  if (!CreateRunner(*sort_analysis, platform_name).ok()) {
+  if (!AreOperandTypesSupportedByCub(*sort_analysis)) {
     VLOG(2) << "Unsupported operand types (no compiled CUB kernels): "
             << PrimitiveType_Name(sort_analysis->key_type) << " "
             << (sort_analysis->value_type.has_value()
@@ -476,22 +505,6 @@ absl::StatusOr<bool> SortRewriter::RunOnInstruction(
   // Get the sort tensor index and direction.
   SortComputationAnalysis sort_analysis = AnalyzeSortOp(*sort_op).value();
 
-  // Get scratch size requirements from CUB.
-  const Shape& operand_shape = sort_op->operand(0)->shape();
-  int64_t batch_size = Product(operand_shape.dimensions()) /
-                       operand_shape.dimensions(sort_op->sort_dimension());
-
-  TF_ASSIGN_OR_RETURN(auto runner, CreateRunner(sort_analysis, platform_name_));
-  TF_ASSIGN_OR_RETURN(
-      int64_t scratch_size,
-      runner->GetScratchSize(Product(operand_shape.dimensions()), batch_size));
-
-  // Align and increase scratch size to fit the offsets.
-  if (batch_size > 1) {
-    scratch_size += sizeof(int) - scratch_size % sizeof(int);
-    scratch_size += (batch_size + 1) * sizeof(int);
-  }
-
   // Values are only present if sorting a pair of tensors.
   HloInstruction* keys;
   HloInstruction* values = nullptr;
@@ -519,13 +532,17 @@ absl::StatusOr<bool> SortRewriter::RunOnInstruction(
     shapes.push_back(values->shape());
     operands.push_back(values);
   }
-  shapes.push_back(ShapeUtil::MakeShape(U8, {scratch_size}));
+  // The last shape corresponds to the scratch buffer. In this pass we put 1 as
+  // the scratch size, but later the actual size will be set by the
+  // AssignCubScratchSize pass.
+  shapes.push_back(ShapeUtil::MakeShape(U8, {/*scratch_size=*/1}));
   Shape call_shape = ShapeUtil::MakeTupleShape(absl::MakeSpan(shapes));
 
   // Build the custom call instruction.
   HloInstruction* custom_call =
       sort_op->AddInstruction(HloInstruction::CreateCustomCall(
-          call_shape, absl::MakeSpan(operands), kCubDeviceRadixSortTarget));
+          call_shape, absl::MakeSpan(operands),
+          kCubDeviceRadixSortUnassignedScratchSizeTarget));
 
   xla::SortOptions backend_config;
   backend_config.set_descending(sort_analysis.descending);
@@ -586,5 +603,4 @@ absl::StatusOr<bool> SortRewriter::RunImpl(
   return changed;
 }
 
-}  // namespace gpu
-}  // namespace xla
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc
index b2b7382e5d8fa8..cc6dc67fa79bfb 100644
--- a/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/primitive_util.h"
 #include "xla/service/gpu/cublas_cudnn.h"
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
+#include "xla/service/gpu/transforms/estimate_cub_scratch_size.h"
 #include "xla/service/pattern_matcher.h"
 #include "xla/service/platform_util.h"
 #include "xla/stream_executor/platform.h"
@@ -58,10 +59,11 @@ class SortRewriterTest
 
   bool RunModuleAndPass(HloModule* module) {
     auto cloned = module->Clone();
-    bool changed = SortRewriter(TestGpuDeviceInfo::CudaOrRocmDeviceInfo(),
-                                GetTestPlatform()->Name())
-                       .Run(module)
-                       .value();
+    const std::string& platform_name = GetTestPlatform()->Name();
+    bool changed =
+        SortRewriter(TestGpuDeviceInfo::CudaOrRocmDeviceInfo(), platform_name)
+            .Run(module)
+            .value();
     if (changed) {
       // Here we run an end to end test to make sure that SortRewriter does
       // not introduce an incorrect rewrite. To do this, we need to clone the
@@ -106,7 +108,9 @@ ENTRY %main {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       GmockMatch(m::GetTupleElement(
-          m::CustomCall({kCubDeviceRadixSortTarget}, m::Parameter()), 0)));
+          m::CustomCall({kCubDeviceRadixSortUnassignedScratchSizeTarget},
+                        m::Parameter()),
+          0)));
   ExpectDirection(module->entry_computation()->root_instruction()->operand(0),
                   /*descending=*/false);
 }
@@ -132,7 +136,9 @@ ENTRY %main {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       GmockMatch(m::GetTupleElement(
-          m::CustomCall({kCubDeviceRadixSortTarget}, m::Parameter()), 0)));
+          m::CustomCall({kCubDeviceRadixSortUnassignedScratchSizeTarget},
+                        m::Parameter()),
+          0)));
   ExpectDirection(module->entry_computation()->root_instruction()->operand(0),
                   /*descending=*/true);
 }
@@ -158,7 +164,9 @@ ENTRY %main {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       GmockMatch(m::GetTupleElement(
-          m::CustomCall({kCubDeviceRadixSortTarget}, m::Parameter()), 0)));
+          m::CustomCall({kCubDeviceRadixSortUnassignedScratchSizeTarget},
+                        m::Parameter()),
+          0)));
   ExpectDirection(module->entry_computation()->root_instruction()->operand(0),
                   /*descending=*/false);
 }
@@ -512,7 +520,9 @@ ENTRY %main {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       GmockMatch(m::GetTupleElement(
-          m::CustomCall({kCubDeviceRadixSortTarget}, m::Parameter()), 0)));
+          m::CustomCall({kCubDeviceRadixSortUnassignedScratchSizeTarget},
+                        m::Parameter()),
+          0)));
   ExpectDirection(module->entry_computation()->root_instruction()->operand(0),
                   /*descending=*/false);
 }
@@ -538,7 +548,9 @@ ENTRY %main {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       GmockMatch(m::GetTupleElement(
-          m::CustomCall({kCubDeviceRadixSortTarget}, m::Parameter()), 0)));
+          m::CustomCall({kCubDeviceRadixSortUnassignedScratchSizeTarget},
+                        m::Parameter()),
+          0)));
   ExpectDirection(module->entry_computation()->root_instruction()->operand(0),
                   /*descending=*/false);
 }
@@ -559,7 +571,7 @@ ENTRY %main {
       dimensions={0}, to_apply=%compare, metadata={op_type="sort" op_name="sort" source_file="path/to/test.cc" source_line=68}
 })";
   constexpr char kExpectedPattern[] = R"(
-    // CHECK: %[[CC:.*]] = (u16[1000]{0}, u8[1]{0}) custom-call({{.*}}), custom_call_target="__cub$DeviceRadixSort", metadata={op_type="sort" op_name="sort" source_file="path/to/test.cc" source_line=68}, backend_config={"descending":true}
+    // CHECK: %[[CC:.*]] = (u16[1000]{0}, u8[1]{0}) custom-call({{.*}}), custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize", metadata={op_type="sort" op_name="sort" source_file="path/to/test.cc" source_line=68}, backend_config={"descending":true}
   )";
   for (const auto& [device_description, platform_name] :
        {std::tuple{TestGpuDeviceInfo::RTXA6000DeviceInfo(), "CUDA"},
@@ -602,7 +614,8 @@ ENTRY main {
   EXPECT_THAT(
       module->entry_computation()->root_instruction(),
       GmockMatch(m::GetTupleElement(
-          m::CustomCall({kCubDeviceRadixSortTarget}, m::Op(), m::Parameter()),
+          m::CustomCall({kCubDeviceRadixSortUnassignedScratchSizeTarget},
+                        m::Op(), m::Parameter()),
           1)))
       << module->ToString();
 }

From decbcf45d973916d1e0f8101eab3b1f3f105e837 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Thu, 18 Dec 2025 15:58:59 -0800
Subject: [PATCH 555/753] switch from deprecated TF_CHECK_OK

PiperOrigin-RevId: 846448345
---
 tensorflow/compiler/mlir/lite/BUILD           |   2 +-
 .../compiler/mlir/lite/flatbuffer_import.cc   |  10 +-
 .../quantization/tensorflow/calibrator/BUILD  |   2 +-
 .../calibration_statistics_saver_op_test.cc   | 112 +++++++++---------
 .../mlir/quantization/tensorflow/cc/BUILD     |   2 +-
 .../tensorflow/cc/save_variables_test.cc      |  36 +++---
 tensorflow/compiler/mlir/tensorflow/BUILD     |   9 +-
 .../mlir/tensorflow/utils/dump_graph_test.cc  |  17 ++-
 .../mlir/tensorflow/utils/fake_session.cc     |  14 ++-
 tensorflow/compiler/mlir/tf2xla/api/v2/BUILD  |   4 +-
 .../tf2xla/api/v2/graph_to_tf_executor.cc     |   9 +-
 .../compiler/mlir/tf2xla/internal/BUILD       |   2 +-
 .../graph_to_tf_executor_util_test.cc         |  10 +-
 tensorflow/compiler/mlir/tools/optimize/BUILD |   4 +-
 .../mlir/tools/optimize/quantization_utils.cc |  10 +-
 tensorflow/compiler/tests/BUILD               |   3 +-
 tensorflow/compiler/tests/randomized_tests.cc |   6 +-
 .../tests/unary_ops_composition_test.cc       |  10 +-
 tensorflow/compiler/tf2xla/BUILD              |  60 +++++++---
 .../compiler/tf2xla/functionalize_cond.cc     |  43 +++++--
 tensorflow/compiler/tf2xla/kernels/BUILD      |   5 +-
 .../tf2xla/kernels/reduction_ops_common.cc    |   6 +-
 .../tf2xla/kernels/xla_custom_call_op.cc      |   7 +-
 ...ht_outside_compilation_kernels_for_test.cc |  11 +-
 tensorflow/compiler/tf2xla/resource_util.cc   |  18 ++-
 .../compiler/tf2xla/side_effect_util.cc       |  18 ++-
 tensorflow/compiler/tf2xla/tf2xla_util.cc     |  38 ++++--
 .../compiler/tf2xla/tf2xla_util_test.cc       |  27 +++--
 .../xla_compiled_cpu_function_thunks.cc       |   7 +-
 tensorflow/compiler/tf2xla/xla_context.cc     |  36 +++---
 tensorflow/compiler/tf2xla/xla_helpers.cc     |  38 ++++--
 tensorflow/compiler/tf2xla/xla_op_registry.cc |  16 +--
 32 files changed, 364 insertions(+), 228 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD
index ab6c5abeca86f0..b48a8ef6411711 100644
--- a/tensorflow/compiler/mlir/lite/BUILD
+++ b/tensorflow/compiler/mlir/lite/BUILD
@@ -1699,6 +1699,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -1714,7 +1715,6 @@ cc_library(
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TranslateLib",
         "@local_xla//xla/tsl/platform:errors",
-        "@local_xla//xla/tsl/platform:status",
         "@local_xla//xla/tsl/platform:statusor",
         "@stablehlo//:stablehlo_ops",
         "@stablehlo//:vhlo_ops",
diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
index 19aae278c33178..ab7d782dba8d33 100644
--- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
+++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc
@@ -29,6 +29,7 @@ limitations under the License.
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
@@ -101,7 +102,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/status.h"
 #include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -974,8 +974,8 @@ StatusOr<Operation*> ConvertOp(
   if (op_name == "tfl.lstm") {
     // TODO(b/147587779): add the right region if region is empty.
     op_state.addRegion();
-    TF_CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc,
-                                          builder));
+    CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc,
+                                       builder));
   }
   if (op_name == "tfl.while") {
     // Adds two empty regions for "tfl.while". We will fill the regions after
@@ -986,8 +986,8 @@ StatusOr<Operation*> ConvertOp(
     op_state.addRegion();
   }
   if (op_name == "tfl.unidirectional_sequence_lstm") {
-    TF_CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc,
-                                          builder));
+    CHECK_OK(AddOpIntermediatesForLstm(op, intermediate_types, op_state, loc,
+                                       builder));
   }
   if (op_name == "tfl.reshape") {
     // Flattens reshape ops when more than one dimension shape operand is given.
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD
index 4ec998d18bcfa3..b06568589dadf2 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD
@@ -221,9 +221,9 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_googletest//:gtest",
         "@local_xla//xla/tsl/platform:errors",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc
index fd3acb188656a1..7f8f2623b7bfa0 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics_saver_op_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "absl/log/check.h"
 #include "absl/status/status_matchers.h"
 #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h"
 #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibration_statistics.pb.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/status.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -55,11 +55,11 @@ TEST_F(CalibrationStatisticsSaverTest, MissingOutputPath) {
   inputs.emplace_back("min", 0, DT_FLOAT);
   inputs.emplace_back("max", 0, DT_FLOAT);
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Finalize(node_def()));
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Finalize(node_def()));
   ASSERT_THAT(InitOp(),
               absl_testing::StatusIs(
                   tsl::error::INVALID_ARGUMENT,
@@ -75,12 +75,12 @@ TEST_F(CalibrationStatisticsSaverTest, WrongNumInputs) {
   inputs.emplace_back("min", 0, DT_FLOAT);
   inputs.emplace_back("max", 0, DT_FLOAT);
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Attr("output_file_path", "/tmp/statistics.pbtxt")
-                  .Finalize(node_def()));
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Attr("output_file_path", "/tmp/statistics.pbtxt")
+               .Finalize(node_def()));
   ASSERT_THAT(InitOp(),
               absl_testing::StatusIs(
                   tsl::error::ABORTED,
@@ -98,12 +98,12 @@ TEST_F(CalibrationStatisticsSaverTest, WrongInputTypes) {
   inputs.emplace_back("max", 0, DT_FLOAT);
   inputs.emplace_back("histogram", 0, DT_FLOAT);
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Attr("output_file_path", "/tmp/statistics.pbtxt")
-                  .Finalize(node_def()));
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Attr("output_file_path", "/tmp/statistics.pbtxt")
+               .Finalize(node_def()));
   ASSERT_THAT(InitOp(),
               absl_testing::StatusIs(
                   tsl::error::ABORTED,
@@ -123,24 +123,23 @@ TEST_F(CalibrationStatisticsSaverTest, SimpleMinMax) {
   const std::string dir = testing::TmpDir();
   const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt");
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Attr("output_file_path", output_file_path)
-                  .Finalize(node_def()));
-  TF_CHECK_OK(InitOp());
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Attr("output_file_path", output_file_path)
+               .Finalize(node_def()));
+  CHECK_OK(InitOp());
 
   AddInputFromArray<float>(TensorShape({}), {1.f});
   AddInputFromArray<float>(TensorShape({}), {5.f});
   AddInputFromArray<int64_t>(TensorShape({0}), {});
 
-  TF_CHECK_OK(RunOpKernel());
+  CHECK_OK(RunOpKernel());
   kernel_.reset();
 
   CalibrationStatisticsMap statistics_map;
-  TF_CHECK_OK(
-      ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
+  CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
   ASSERT_THAT(statistics_map.statistics(), SizeIs(1));
   ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1")));
 
@@ -163,24 +162,23 @@ TEST_F(CalibrationStatisticsSaverTest, SimpleAverageMinMax) {
   const std::string dir = testing::TmpDir();
   const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt");
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Attr("output_file_path", output_file_path)
-                  .Finalize(node_def()));
-  TF_CHECK_OK(InitOp());
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Attr("output_file_path", output_file_path)
+               .Finalize(node_def()));
+  CHECK_OK(InitOp());
 
   AddInputFromArray<float>(TensorShape({}), {1.f});
   AddInputFromArray<float>(TensorShape({}), {5.f});
   AddInputFromArray<int64_t>(TensorShape({0}), {});
 
-  TF_CHECK_OK(RunOpKernel());
+  CHECK_OK(RunOpKernel());
   kernel_.reset();
 
   CalibrationStatisticsMap statistics_map;
-  TF_CHECK_OK(
-      ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
+  CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
   ASSERT_THAT(statistics_map.statistics(), SizeIs(1));
   ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1")));
 
@@ -204,24 +202,23 @@ TEST_F(CalibrationStatisticsSaverTest, SimpleHistogram) {
   const std::string dir = testing::TmpDir();
   const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt");
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Attr("output_file_path", output_file_path)
-                  .Finalize(node_def()));
-  TF_CHECK_OK(InitOp());
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Attr("output_file_path", output_file_path)
+               .Finalize(node_def()));
+  CHECK_OK(InitOp());
 
   AddInputFromArray<float>(TensorShape({}), {1.f});
   AddInputFromArray<float>(TensorShape({}), {5.f});
   AddInputFromArray<int64_t>(TensorShape({8}), {1, 4, 6, 7, 3, 2, 1, 0});
 
-  TF_CHECK_OK(RunOpKernel());
+  CHECK_OK(RunOpKernel());
   kernel_.reset();
 
   CalibrationStatisticsMap statistics_map;
-  TF_CHECK_OK(
-      ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
+  CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
   ASSERT_THAT(statistics_map.statistics(), SizeIs(1));
   ASSERT_THAT(statistics_map.statistics(), ElementsAre(Key("1")));
 
@@ -250,13 +247,13 @@ TEST_F(CalibrationStatisticsSaverTest, MultipleStats) {
   const std::string dir = testing::TmpDir();
   const std::string output_file_path = io::JoinPath(dir, "statistics.pbtxt");
 
-  TF_CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
-                  .Input(inputs)
-                  .Attr("ids", ids)
-                  .Attr("calibration_methods", calibration_methods)
-                  .Attr("output_file_path", output_file_path)
-                  .Finalize(node_def()));
-  TF_CHECK_OK(InitOp());
+  CHECK_OK(NodeDefBuilder("op", "CalibrationStatisticsSaver")
+               .Input(inputs)
+               .Attr("ids", ids)
+               .Attr("calibration_methods", calibration_methods)
+               .Attr("output_file_path", output_file_path)
+               .Finalize(node_def()));
+  CHECK_OK(InitOp());
 
   AddInputFromArray<float>(TensorShape({}), {1.f});
   AddInputFromArray<float>(TensorShape({}), {5.f});
@@ -265,12 +262,11 @@ TEST_F(CalibrationStatisticsSaverTest, MultipleStats) {
   AddInputFromArray<float>(TensorShape({}), {5.f});
   AddInputFromArray<int64_t>(TensorShape({8}), {1, 4, 6, 7, 3, 2, 1, 0});
 
-  TF_CHECK_OK(RunOpKernel());
+  CHECK_OK(RunOpKernel());
   kernel_.reset();
 
   CalibrationStatisticsMap statistics_map;
-  TF_CHECK_OK(
-      ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
+  CHECK_OK(ReadBinaryProto(Env::Default(), output_file_path, &statistics_map));
   ASSERT_THAT(statistics_map.statistics(), SizeIs(2));
   ASSERT_THAT(statistics_map.statistics(), Contains(Key("1")));
   ASSERT_THAT(statistics_map.statistics(), Contains(Key("2")));
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
index 6fe5bd285f8f50..99f2d2a52e1a92 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD
@@ -54,6 +54,7 @@ tf_cc_test(
         "//tensorflow/core/framework:tensor_testutil",
         "//tensorflow/core/util/tensor_bundle",
         "@com_google_absl//absl/cleanup",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/status:statusor",
@@ -62,7 +63,6 @@ tf_cc_test(
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Parser",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc
index 2fca9426c9d59f..3c5d085d7655aa 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include "absl/cleanup/cleanup.h"
+#include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "absl/status/status_matchers.h"
 #include "absl/status/statusor.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "mlir/Parser/Parser.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h"
-#include "xla/tsl/platform/status.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/framework/types.pb.h"
@@ -63,7 +63,7 @@ class SaveVariablesToCheckpointTest : public ::testing::Test {
       return absl::InternalError("Failed to create temp file.");
     }
 
-    TF_CHECK_OK(env_->CreateDir(tmp_dir));
+    CHECK_OK(env_->CreateDir(tmp_dir));
     return tmp_dir;
   }
 
@@ -103,8 +103,8 @@ TEST_F(SaveVariablesToCheckpointTest, VariableSavedToCheckpoint) {
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   const absl::StatusOr<std::vector<std::string>> variable_shared_names =
@@ -149,8 +149,8 @@ TEST_F(SaveVariablesToCheckpointTest, MultipleVariablesSavedToCheckpoint) {
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   const absl::StatusOr<std::vector<std::string>> variable_shared_names =
@@ -186,8 +186,8 @@ TEST_F(SaveVariablesToCheckpointTest,
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   const absl::StatusOr<std::vector<std::string>> variable_shared_names =
@@ -218,8 +218,8 @@ TEST_F(SaveVariablesToCheckpointTest,
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   EXPECT_TRUE(
@@ -253,8 +253,8 @@ TEST_F(SaveVariablesToCheckpointTest,
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   const absl::StatusOr<std::vector<std::string>> variable_shared_names =
@@ -293,8 +293,8 @@ TEST_F(SaveVariablesToCheckpointTest, MutableVariablesNotSaved) {
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   const absl::StatusOr<std::vector<std::string>> variable_shared_names =
@@ -330,8 +330,8 @@ TEST_F(SaveVariablesToCheckpointTest,
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   const absl::StatusOr<std::vector<std::string>> variable_shared_names =
@@ -371,8 +371,8 @@ TEST_F(SaveVariablesToCheckpointTest, FailsWhenDuplicateSharedName) {
 
   const absl::Cleanup checkpoint_prefix_cleanup = [this, &checkpoint_prefix]() {
     int64_t undeleted_files, undeleted_dirs;
-    TF_CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
-                                        &undeleted_dirs));
+    CHECK_OK(env_->DeleteRecursively(*checkpoint_prefix, &undeleted_files,
+                                     &undeleted_dirs));
   };
 
   EXPECT_FALSE(
diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 588bb1caa75e8b..494c23f1344e9b 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -574,10 +574,11 @@ cc_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
-        "//tensorflow/core/common_runtime:threadpool_device",
         "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:status",
         "//tensorflow/core/platform:threadpool_options",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
     ],
@@ -1311,8 +1312,10 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/platform:test",
-        "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:IR",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc
index 9d9780d231523f..3fea8e64e85ca3 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc
@@ -15,11 +15,22 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h"
 
+#include <cstdint>
+#include <string>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
+#include "absl/strings/match.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "xla/tsl/lib/core/status_test_util.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/dump_graph.h"
 
@@ -68,7 +79,7 @@ class StringWritableFile : public WritableFile {
 TEST(Dump, TextualIrToFileSuccess) {
   Graph graph(OpRegistry::Global());
   Node* node;
-  TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
+  CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
 
   setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1);
   UseMlirForGraphDump(MlirDumpConfig());
@@ -98,7 +109,7 @@ TEST(Dump, TextualIrWithOptions) {
 TEST(Dump, DumpToTFG) {
   Graph graph(OpRegistry::Global());
   Node* node;
-  TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
+  CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
 
   std::string actual;
   StringWritableFile file(&actual);
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc
index 2ee95c1337aa52..cb406a2d0e3fc9 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc
@@ -19,17 +19,21 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/log/check.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "llvm/Support/CommandLine.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
-#include "tensorflow/core/common_runtime/threadpool_device.h"
-#include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/device_factory.h"
+#include "tensorflow/core/framework/resource_handle.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/resource_var.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/graph/types.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/platform/threadpool_options.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/core/public/session_options.h"
@@ -81,9 +85,9 @@ void FakeSession::InitVariables() {
   auto container = device->resource_manager()->default_container();
 
   // Create 2 resources and initialize them with dummy values.
-  TF_CHECK_OK(device->resource_manager()->Create(
+  CHECK_OK(device->resource_manager()->Create(
       container, "var1", new tensorflow::Var(tensorflow::DataType::DT_FLOAT)));
-  TF_CHECK_OK(device->resource_manager()->Create(
+  CHECK_OK(device->resource_manager()->Create(
       container, "var2", new tensorflow::Var(tensorflow::DataType::DT_FLOAT)));
 }
 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
index 746bca0cdb79b7..da75a97030412d 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD
@@ -323,6 +323,7 @@ cc_library(
     ],
     visibility = ["//visibility:public"],
     deps = [
+        ":mlir_roundtrip_flags",
         "//tensorflow/compiler/jit:shape_inference_helpers",
         "//tensorflow/compiler/mlir:op_or_arg_name_mapper",
         "//tensorflow/compiler/mlir/tensorflow",
@@ -349,13 +350,13 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/common_runtime:function_body",
-        "//tensorflow/core/platform:crash_analysis",
         "//tensorflow/core/platform:types",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
@@ -365,7 +366,6 @@ cc_library(
         "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:Support",
         "@local_xla//xla:status_macros",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
 
diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc
index edf726134f66bd..cb48ff03def75d 100644
--- a/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc
+++ b/tensorflow/compiler/mlir/tf2xla/api/v2/graph_to_tf_executor.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/match.h"
@@ -74,7 +75,6 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
-#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h"
@@ -83,14 +83,15 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h"
+#include "tensorflow/compiler/mlir/tf2xla/api/v2/mlir_roundtrip_flags.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util.h"
 #include "tensorflow/compiler/mlir/tf2xla/internal/node_order.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_defs.h"
 #include "xla/status_macros.h"
+#include "xla/tsl/platform/crash_analysis.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/status.h"
 #include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/common_runtime/function_body.h"
 #include "tensorflow/core/common_runtime/function_def_utils.h"
@@ -120,11 +121,9 @@ limitations under the License.
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/crash_analysis.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stack_frame.h"
-#include "tensorflow/core/platform/stringpiece.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/saved_object_graph.pb.h"
@@ -1889,7 +1888,7 @@ mlir::Operation* ImporterBase::CreateOperation(
     NameRangeMap input_ranges, output_ranges;
     // This will fail only if the OpDef is syntactically invalid.
     // TODO(jpienaar): Convert this CHECK into a properly propagated error.
-    TF_CHECK_OK(
+    CHECK_OK(
         NameRangesForNode(node, node.op_def(), &input_ranges, &output_ranges));
     if (inner_op->hasTrait<mlir::OpTrait::AttrSizedOperandSegments>()) {
       // Add derived "operand_segment_sizes" attr to the created operation.
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/BUILD
index 4e7d058c3c6c6c..f292b270f855e8 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/BUILD
+++ b/tensorflow/compiler/mlir/tf2xla/internal/BUILD
@@ -419,10 +419,10 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/framework:tensor_testutil",
         "//tensorflow/core/platform:enable_tf2_utils",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
         "@local_xla//xla/tsl/lib/core:status_test_util",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc
index 8ffe558029ad8b..cb332fe4fb997b 100644
--- a/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/internal/graph_to_tf_executor_util_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "absl/log/check.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
@@ -31,7 +32,6 @@ limitations under the License.
 #include "tensorflow/cc/ops/tpu_functional_ops.h"
 #include "tensorflow/cc/ops/tpu_replication_ops.h"
 #include "xla/tsl/lib/core/status_test_util.h"
-#include "xla/tsl/platform/status.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
@@ -124,11 +124,11 @@ Node* FromNodeDef(absl::string_view name, absl::string_view node_type,
   }
 
   NodeDef node_def;
-  TF_CHECK_OK(builder.Finalize(&node_def));
+  CHECK_OK(builder.Finalize(&node_def));
 
   absl::Status s;
   Node* node = graph.AddNode(node_def, &s);
-  TF_CHECK_OK(s);
+  CHECK_OK(s);
   return node;
 }
 
@@ -547,12 +547,12 @@ TEST(UnsupportedOpTest,
   builder.Attr("dtypes", DT_FLOAT);
   builder.Attr("shapes", 1);
   NodeDef node_def;
-  TF_CHECK_OK(builder.Finalize(&node_def));
+  CHECK_OK(builder.Finalize(&node_def));
   absl::Status s;
   Node* node_InfeedDequeueTuple = (*root.graph()).AddNode(node_def, &s);
   node_InfeedDequeueTuple->set_requested_device(
       "/device:TPU_REPLICATED_CORE:0");
-  TF_CHECK_OK(s);
+  CHECK_OK(s);
   ASSERT_NE(node_InfeedDequeueTuple, nullptr);
 
   Graph graph(OpRegistry::Global());
diff --git a/tensorflow/compiler/mlir/tools/optimize/BUILD b/tensorflow/compiler/mlir/tools/optimize/BUILD
index 6a3cc301bc24ca..d7bece21567fdf 100644
--- a/tensorflow/compiler/mlir/tools/optimize/BUILD
+++ b/tensorflow/compiler/mlir/tools/optimize/BUILD
@@ -17,9 +17,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/mlir/tools:safe_cast",
         "//tensorflow/core/framework:tensor_shape",
-        "//tensorflow/core/platform:logging",
-        "//tensorflow/core/platform:macros",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/types:span",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
diff --git a/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc b/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc
index 09a5b928a2622f..5e0ec1ccac3b9f 100644
--- a/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc
+++ b/tensorflow/compiler/mlir/tools/optimize/quantization_utils.cc
@@ -21,9 +21,9 @@ limitations under the License.
 #include <limits>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/mlir/tools/safe_cast.h"
-#include "xla/tsl/platform/status.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tflite_migration {
@@ -92,13 +92,13 @@ void SymmetricPerChannelQuantizeValues(const float* const input,
   // Quantize the values.
   int indices[kPerChannelMaxDim];
   tensorflow::TensorShape unextended_shape;
-  TF_CHECK_OK(tensorflow::TensorShapeUtils::MakeShape(absl::MakeSpan(dimension),
-                                                      &unextended_shape));
+  CHECK_OK(tensorflow::TensorShapeUtils::MakeShape(absl::MakeSpan(dimension),
+                                                   &unextended_shape));
   tensorflow::TensorShape shape;
   for (int i = 0; i < kPerChannelMaxDim - unextended_shape.dims(); ++i) {
-    TF_CHECK_OK(shape.AddDimWithStatus(1));
+    CHECK_OK(shape.AddDimWithStatus(1));
   }
-  TF_CHECK_OK(shape.AppendShapeWithStatus(unextended_shape));
+  CHECK_OK(shape.AppendShapeWithStatus(unextended_shape));
   channel_dim_index += kPerChannelMaxDim - unextended_shape.dims();
 
   for (indices[0] = 0; indices[0] < shape.dim_size(0); indices[0]++) {
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 995ae2b5740ae7..22e471c768e377 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -2903,13 +2903,12 @@ tf_cuda_cc_test(
         "//tensorflow/compiler/jit:xla_kernel_creator",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core/kernels:ops_testutil",
-        "@local_xla//xla/tsl/platform:status",
+        "@com_google_absl//absl/log:check",
     ],
 )
 
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index 88b379331b32ef..b23164ac9aeb18 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -90,11 +90,9 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/bfloat16.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/public/session.h"
 #include "tensorflow/core/public/session_options.h"
@@ -4931,8 +4929,8 @@ int main(int argc, char** argv) {
   // XLA devices register kernels at construction time; create all known devices
   // to make sure the kernels are registered.
   std::vector<std::unique_ptr<tensorflow::Device>> devices;
-  TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
-      tensorflow::SessionOptions(), "", &devices));
+  CHECK_OK(tensorflow::DeviceFactory::AddDevices(tensorflow::SessionOptions(),
+                                                 "", &devices));
   tensorflow::StaticDeviceMgr device_mgr(std::move(devices));
 
   tensorflow::Device* ignored;
diff --git a/tensorflow/compiler/tests/unary_ops_composition_test.cc b/tensorflow/compiler/tests/unary_ops_composition_test.cc
index c27b8070bbb450..00fd0ea67041b9 100644
--- a/tensorflow/compiler/tests/unary_ops_composition_test.cc
+++ b/tensorflow/compiler/tests/unary_ops_composition_test.cc
@@ -16,12 +16,13 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 #include <memory>
+#include <string>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "xla/tsl/lib/core/status_test_util.h"
-#include "xla/tsl/platform/status.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/device_factory.h"
@@ -33,7 +34,6 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
-#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/port.h"
 
@@ -84,8 +84,8 @@ class UnaryOpsCompositionTest : public OpsTestBase {
     DeviceContext* device_context =
         device_->tensorflow_accelerator_device_info()->default_context;
 
-    TF_CHECK_OK(device_context->CopyCPUTensorToDeviceSync(&input_on_host,
-                                                          device_, input));
+    CHECK_OK(device_context->CopyCPUTensorToDeviceSync(&input_on_host, device_,
+                                                       input));
 
     TF_ASSERT_OK(RunOpKernel());
 
@@ -95,7 +95,7 @@ class UnaryOpsCompositionTest : public OpsTestBase {
     Tensor* output = GetOutput(0);
     Tensor output_on_host(cpu_allocator, output->dtype(), output->shape());
 
-    TF_CHECK_OK(device_context->CopyDeviceTensorToCPUSync(
+    CHECK_OK(device_context->CopyDeviceTensorToCPUSync(
         output, "output 0", device_, &output_on_host));
 
     test::ExpectClose(expected_tensor, output_on_host, /*atol=*/1e-5,
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index e5545445817ec2..9c2dfc073afccb 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -490,7 +490,6 @@ cc_library(
         "@local_xla//xla/service/cpu:executable_proto_cc",
         "@local_xla//xla/tsl/concurrency:async_value",
         "@local_xla//xla/tsl/platform:env",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
 
@@ -721,12 +720,15 @@ cc_library(
         ":common",
         ":xla_expression",
         ":xla_helpers",
+        ":xla_resource",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/common_runtime:core_cpu_internal",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/types:span",
-        "@local_xla//xla:literal",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:status_macros",
         "@local_xla//xla:xla_data_proto_cc",
@@ -767,7 +769,6 @@ cc_library(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:status",
         "@local_xla//xla:util",
         "@local_xla//xla/client:client_library",
     ],
@@ -846,18 +847,25 @@ cc_library(
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core/platform:refcount",
+        "@com_google_absl//absl/container:btree",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
+        "@com_google_absl//absl/time",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
         "@local_xla//xla:executable_run_options",
-        "@local_xla//xla:types",
-        "@local_xla//xla/backends/gpu/collectives:gpu_clique_key",
+        "@local_xla//xla:literal",
+        "@local_xla//xla:literal_util",
+        "@local_xla//xla:shape_util",
         "@local_xla//xla/core/collectives:clique_id",
         "@local_xla//xla/core/collectives:clique_key",
         "@local_xla//xla/hlo/builder:xla_builder",
         "@local_xla//xla/hlo/builder:xla_computation",
-        "@local_xla//xla/hlo/builder/lib:arithmetic",
-        "@local_xla//xla/hlo/builder/lib:constants",
         "@local_xla//xla/hlo/ir:hlo",
         "@local_xla//xla/hlo/translate/mhlo_to_hlo:layout_util",
         "@local_xla//xla/service:computation_placer_hdr",
@@ -1046,6 +1054,11 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@local_xla//xla:status_macros",
@@ -1063,6 +1076,7 @@ tf_cc_test(
         "//tensorflow/cc:function_ops",
         "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
+        "//tensorflow/cc:scope",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -1071,6 +1085,8 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -1253,10 +1269,9 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
         "@local_xla//xla/stream_executor:device_memory",
         "@local_xla//xla/stream_executor:stream",
+        "@local_xla//xla/tsl/platform:statusor",
     ],
     alwayslink = 1,
 )
@@ -1327,21 +1342,21 @@ cc_library(
         "functionalize_cond.h",
     ],
     deps = [
-        ":frontend_attributes_util",
         ":functionalize_control_flow_util",
-        ":tf2xla_util",
+        ":tf2xla_defs",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/memory",
+        "//tensorflow/core/platform:hash",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/types:optional",
         "@local_xla//xla:status_macros",
-        "@local_xla//xla:union_find",
     ],
 )
 
@@ -1607,7 +1622,14 @@ cc_library(
     visibility = [":friends"],
     deps = [
         "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/platform:errors",
+        "//tensorflow/core/platform:str_util",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
+        "@local_xla//xla/tsl/platform:errors",
     ],
 )
 
@@ -1673,12 +1695,18 @@ cc_library(
     deps = [
         ":resource_operation_table",
         "//tensorflow/core:core_cpu",
-        "//tensorflow/core:graph",
+        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/common_runtime:function_body",
+        "//tensorflow/core/common_runtime:function_utils",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/hash",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@local_xla//xla:status_macros",
     ],
diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc
index 2adc83512c6617..b5426bc35c58df 100644
--- a/tensorflow/compiler/tf2xla/functionalize_cond.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc
@@ -16,30 +16,49 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/functionalize_cond.h"
 
 #include <algorithm>
+#include <array>
+#include <cstddef>
+#include <cstdint>
 #include <deque>
-#include <stack>
+#include <functional>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <unordered_map>
 #include <unordered_set>
+#include <utility>
 #include <vector>
 
-#include "absl/memory/memory.h"
-#include "absl/strings/match.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
-#include "absl/types/optional.h"
-#include "tensorflow/compiler/tf2xla/frontend_attributes_util.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h"
-#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
-#include "xla/union_find.h"
-#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_defs.h"
+#include "xla/status_macros.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_node_util.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/hash.h"
 #include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
@@ -1138,7 +1157,7 @@ StateMap::CondId FunctionalizeCond::StateAlongEdge(const Edge* e) {
     StateMap::CondState state;
     if (id != nullptr) state = *id;
     OutputTensor predicate;
-    TF_CHECK_OK(GetSwitchPredicate(*src, &predicate));
+    CHECK_OK(GetSwitchPredicate(*src, &predicate));
     if (e->IsControlEdge()) {
       // In gradients of tf.cond(), in each branch, we have a NoOp node as
       // control pivot. These NoOp nodes have control dependency from Switch
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index bb50d530484b10..281da5c23c54e8 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -1893,18 +1893,17 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_resource",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/core:framework",
-        "//tensorflow/core:portable_gif_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/platform:errors",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@local_xla//xla:literal",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:xla_data_proto_cc",
         "@local_xla//xla/hlo/builder:xla_builder",
         "@local_xla//xla/hlo/builder:xla_computation",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
 
@@ -3019,10 +3018,10 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/log:check",
         "@local_xla//xla:shape_util",
         "@local_xla//xla:xla_data_proto_cc",
         "@local_xla//xla/hlo/builder:xla_builder",
-        "@local_xla//xla/tsl/platform:status",
     ],
 )
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 3bfe9e384405b2..6e8417e2d25ff2 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -16,9 +16,11 @@ limitations under the License.
 // XLA-specific reduction Ops.
 
 #include <cstdint>
+#include <string>
 #include <vector>
 
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
@@ -30,14 +32,12 @@ limitations under the License.
 #include "xla/hlo/builder/xla_computation.h"
 #include "xla/literal.h"
 #include "xla/shape_util.h"
-#include "xla/tsl/platform/status.h"
 #include "xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
@@ -125,7 +125,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
   // Construct the builder for the reduction lambda.
   xla::XlaBuilder r(absl::StrCat(desc, "-reduction"));
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type));
+  CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type));
 
   auto data = xla::ConvertElementType(ctx->Input(0), type);
   // Call virtual method to get the initial value.
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc
index 99a0ec6d9e38dd..e9d0314780ca54 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc
@@ -13,20 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/shape.h"
-#include "xla/tsl/platform/status.h"
 #include "xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/op_requires.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 namespace {
@@ -47,8 +47,7 @@ class XlaCustomCallOp : public XlaOpKernel {
     }
 
     xla::Shape output_shape;
-    TF_CHECK_OK(
-        TensorShapeToXLAShape(output_type_, output_shape_, &output_shape));
+    CHECK_OK(TensorShapeToXLAShape(output_type_, output_shape_, &output_shape));
     xla::XlaOp output = xla::CustomCall(ctx->builder(), target_name_, inputs,
                                         output_shape, backend_config_);
     ctx->SetOutput(0, output);
diff --git a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc
index 993b98e61dc0ed..93444bfeb49125 100644
--- a/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc
+++ b/tensorflow/compiler/tf2xla/light_outside_compilation_kernels_for_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include <algorithm>
-#include <string>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
 
 #include "absl/log/check.h"
 #include "absl/status/status.h"
@@ -24,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -36,8 +39,6 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/types.h"
-#include "tsl/platform/status.h"
-#include "tsl/platform/statusor.h"
 
 // Sample kernels for the light outside compilation test.
 
@@ -302,8 +303,8 @@ class TestTfMustBeConstantOp : public OpKernel {
     AllocatorAttributes pinned_alloc_attrs;
     pinned_alloc_attrs.set_on_host(true);
     pinned_alloc_attrs.set_gpu_compatible(true);
-    TF_CHECK_OK(ctx->allocate_temp(input.dtype(), input.shape(), &tmp,
-                                   pinned_alloc_attrs));
+    CHECK_OK(ctx->allocate_temp(input.dtype(), input.shape(), &tmp,
+                                pinned_alloc_attrs));
 
     OP_REQUIRES_OK(ctx, stream->Memcpy(tmp.data(),
                                        stream_executor::DeviceAddressBase{
diff --git a/tensorflow/compiler/tf2xla/resource_util.cc b/tensorflow/compiler/tf2xla/resource_util.cc
index e78828df4e13a4..50990e0bb2858d 100644
--- a/tensorflow/compiler/tf2xla/resource_util.cc
+++ b/tensorflow/compiler/tf2xla/resource_util.cc
@@ -15,13 +15,27 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/resource_util.h"
 
+#include <optional>
 #include <string>
+#include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/tf2xla/resource_operation_table.h"
 #include "xla/status_macros.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "tensorflow/core/common_runtime/function_body.h"
+#include "tensorflow/core/common_runtime/function_utils.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -204,8 +218,8 @@ absl::Status PropagateThroughCallOp(
   // Instantiate associated function to get function body.
   FunctionLibraryRuntime::Handle handle;
   TF_RETURN_IF_ERROR(InstantiateFunctionCall(n.def(), lib_runtime, &handle));
-  auto release_handle_on_return = gtl::MakeCleanup(
-      [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
+  auto release_handle_on_return =
+      gtl::MakeCleanup([&] { CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
   const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
 
   // Recursively analyze called function for resource sources and users.
diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc
index e8b2a56cdf64d2..a28d6ac8b1554f 100644
--- a/tensorflow/compiler/tf2xla/side_effect_util.cc
+++ b/tensorflow/compiler/tf2xla/side_effect_util.cc
@@ -15,8 +15,21 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
 
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
 #include "absl/strings/numbers.h"
+#include "absl/types/span.h"
+#include "xla/tsl/platform/errors.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/str_util.h"
 
 namespace tensorflow {
 
@@ -98,9 +111,8 @@ std::set<std::string> CalculateTokenInputsForOutputToken(const Graph& g) {
 
         first_side_effecting_node_on_path = n;
         std::string original_node_name;
-        TF_CHECK_OK(GetNodeAttr(n->def(),
-                                kXlaOriginalOutsideCompilationNodeName,
-                                &original_node_name));
+        CHECK_OK(GetNodeAttr(n->def(), kXlaOriginalOutsideCompilationNodeName,
+                             &original_node_name));
         results.insert(original_node_name);
       },
       [&](Node* n) {
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc
index 042b572c234355..5884cddba62b3d 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc
@@ -15,19 +15,36 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 
-#include <functional>
+#include <atomic>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <map>
+#include <memory>
+#include <optional>
 #include <queue>
 #include <random>
 #include <set>
+#include <string>
 #include <unordered_map>
+#include <utility>
+#include <vector>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/xla_data.pb.h"
-#include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/function_body.h"
+#include "tensorflow/core/common_runtime/function_def_utils.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/graph_def_util.h"
@@ -35,13 +52,16 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/graph_debug_info_builder.h"
+#include "tensorflow/core/graph/graph_node_util.h"
 #include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/errors.h"
 
 namespace tensorflow {
@@ -1025,7 +1045,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g,
     // Look into forward While body function and check if TensorListPushBack op
     // has a Const input.
     NameAttrList fwd_body_attr;
-    TF_CHECK_OK(GetNodeAttr(fwd_while->def(), "body", &fwd_body_attr));
+    CHECK_OK(GetNodeAttr(fwd_while->def(), "body", &fwd_body_attr));
     const FunctionDef* fwd_body = fld->Find(fwd_body_attr.name());
     if (!fwd_body) {
       return errors::InvalidArgument("Cannot find function ",
@@ -1033,7 +1053,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g,
                                      fwd_while->DebugString());
     }
     std::unique_ptr<FunctionBody> fwd_fbody;
-    TF_CHECK_OK(FunctionDefToBodyHelper(
+    CHECK_OK(FunctionDefToBodyHelper(
         *fwd_body, AttrSlice(&fwd_body_attr.attr()), fld, &fwd_fbody));
 
     // Find the TensorListPushBack node; it's one of fwd_arg's successors.
@@ -1051,7 +1071,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g,
 
     // Get input for the TensorListPushBack node.
     Node* input_node;
-    TF_CHECK_OK(tl_push_nodes[0]->input_node(1, &input_node));
+    CHECK_OK(tl_push_nodes[0]->input_node(1, &input_node));
     if (input_node->type_string() != "Const") {
       // Input for the TensorList is not Const node.
       continue;
@@ -1062,7 +1082,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g,
     // Rewrite backward While body function, replace usages of
     // TensorListPopBack with a Const node.
     NameAttrList bwd_body_attr;
-    TF_CHECK_OK(GetNodeAttr(bwd_while->def(), "body", &bwd_body_attr));
+    CHECK_OK(GetNodeAttr(bwd_while->def(), "body", &bwd_body_attr));
     const FunctionDef* bwd_body = fld->Find(bwd_body_attr.name());
     if (!bwd_body) {
       return errors::InvalidArgument("Cannot find function ",
@@ -1070,7 +1090,7 @@ absl::Status RewriteTensorListWithConstElement(Graph* g,
                                      bwd_while->DebugString());
     }
     std::unique_ptr<FunctionBody> bwd_fbody;
-    TF_CHECK_OK(FunctionDefToBodyHelper(
+    CHECK_OK(FunctionDefToBodyHelper(
         *bwd_body, AttrSlice(&bwd_body_attr.attr()), fld, &bwd_fbody));
 
     // Find the TensorListPopBack node; it's one of bwd_arg's successors.
diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
index ef64b82f50e5be..1d81f778232523 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc
@@ -15,25 +15,39 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/ops/data_flow_ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/list_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/cc/ops/no_op.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
+#include "xla/tsl/lib/core/status_test_util.h"
 #include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/graph_optimizer.h"
+#include "tensorflow/core/common_runtime/function_def_utils.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/version.h"
 
@@ -492,8 +506,7 @@ TEST(PropagateConstIntoFunctionalNodes, RewriteTensorListWithConstMember) {
   const FunctionDef* bwd_body = fld.Find("bwd_body_tl_rewrite_0");
   ASSERT_NE(bwd_body, nullptr);
   std::unique_ptr<FunctionBody> bwd_fbody;
-  TF_CHECK_OK(
-      FunctionDefToBodyHelper(*bwd_body, AttrSlice(), &fld, &bwd_fbody));
+  CHECK_OK(FunctionDefToBodyHelper(*bwd_body, AttrSlice(), &fld, &bwd_fbody));
   auto node_name_index = bwd_fbody->graph->BuildNodeNameIndex();
   const Node* identity = node_name_index.at("identity");
   ASSERT_NE(identity, nullptr);
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc
index 68c4d7f90b204c..f4b7ed44ff41d5 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function_thunks.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "xla/backends/cpu/runtime/function_library.h"
 #include "xla/service/cpu/cpu_aot_compilation_result.h"
 #include "xla/tsl/concurrency/async_value_ref.h"
-#include "xla/tsl/platform/status.h"
 
 namespace tensorflow {
 
@@ -47,17 +46,17 @@ XlaCompiledCpuFunctionThunks::XlaCompiledCpuFunctionThunks(
       std::move(function_library));
 
   // To load a CPU executable we don't need a compiler or a stream executor.
-  TF_CHECK_OK(aot_compilation_result.status());
+  CHECK_OK(aot_compilation_result.status());
   // NO_CDC: aot_compilation_result is checked to be OK above.
   auto cpu_executable = std::move(*aot_compilation_result.value())
                             .LoadExecutable(/*stream_exec=*/nullptr);
 
-  TF_CHECK_OK(cpu_executable.status());
+  CHECK_OK(cpu_executable.status());
   auto executable_or_err =
       // NO_CDC: cpu_executable is checked to be OK above.
       xla::cpu::NanoRtExecutable::Create(std::move(cpu_executable.value()));
 
-  TF_CHECK_OK(executable_or_err.status());
+  CHECK_OK(executable_or_err.status());
   // NO_CDC: executable_or_err is checked to be OK above.
   executable_ = std::move(executable_or_err.value());
 }
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index fad607b1ae1333..16289828892460 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -15,23 +15,29 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 
+#include <cstdint>
+#include <functional>
 #include <memory>
+#include <string>
 #include <utility>
 #include <vector>
 
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/types/span.h"
-#include "tensorflow/compiler/tf2xla/literal_util.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
+#include "tensorflow/compiler/tf2xla/xla_expression.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "xla/client/client_library.h"
+#include "tensorflow/compiler/tf2xla/xla_resource.h"
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/hlo/builder/xla_computation.h"
-#include "xla/layout_util.h"
-#include "xla/literal.h"
-#include "tensorflow/core/common_runtime/dma_helper.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/statusor.h"
+#include "xla/shape_util.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/errors.h"
 
 namespace tensorflow {
 
@@ -44,8 +50,8 @@ const char XlaContext::kXlaContextResourceName[] = "_xla_context";
   // per-step context is looked up in the resource manager. The
   // JIT will prepopulate the JITContext.
   XlaContext* context;
-  TF_CHECK_OK(ctx->step_container()->Lookup(ctx->resource_manager(),
-                                            kXlaContextResourceName, &context));
+  CHECK_OK(ctx->step_container()->Lookup(ctx->resource_manager(),
+                                         kXlaContextResourceName, &context));
   // The resource manager handed us a fresh reference to 'context', but retains
   // a reference itself so the context won't be freed. The resource manager will
   // outlive the JIT compilation.
@@ -88,7 +94,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
     VLOG(1) << "Building Max() for " << type_string;
     xla::XlaBuilder b("max<" + type_string + ">");
     xla::PrimitiveType xla_type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x =
         xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
     auto y =
@@ -104,7 +110,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) {
     VLOG(1) << "Building Min() for " << type_string;
     xla::XlaBuilder b("min<" + type_string + ">");
     xla::PrimitiveType xla_type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x =
         xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
     auto y =
@@ -120,7 +126,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) {
     VLOG(1) << "Building Add() for " << type_string;
     xla::XlaBuilder b("add<" + type_string + ">");
     xla::PrimitiveType xla_type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x =
         xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
     auto y =
@@ -137,7 +143,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateLogAddExp(
     VLOG(1) << "Building LogAddExp() for " << type_string;
     xla::XlaBuilder b("log_add_exp<" + type_string + ">");
     xla::PrimitiveType xla_type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x =
         xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
     auto y =
@@ -158,7 +164,7 @@ const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) {
     VLOG(1) << "Building Mul() for " << type_string;
     xla::XlaBuilder b("mul<" + type_string + ">");
     xla::PrimitiveType xla_type;
-    TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
+    CHECK_OK(DataTypeToPrimitiveType(type, &xla_type));
     auto x =
         xla::Parameter(&b, 0, xla::ShapeUtil::MakeShape(xla_type, {}), "x");
     auto y =
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 45814517342abc..0250d423296ede 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -17,58 +17,74 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 
-#include <map>
+#include <cstdint>
+#include <numeric>
 #include <string>
 #include <utility>
+#include <vector>
 
+#include "absl/container/btree_map.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/synchronization/notification.h"
+#include "absl/time/time.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/tf2xla/literal_util.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
-#include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/core/collectives/clique_id.h"
 #include "xla/core/collectives/clique_key.h"
-#include "xla/hlo/builder/lib/arithmetic.h"
-#include "xla/hlo/builder/lib/constants.h"
+#include "xla/executable_run_options.h"
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/hlo/builder/xla_computation.h"
+#include "xla/literal.h"
+#include "xla/literal_util.h"
+#include "xla/service/computation_placer.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/types.h"
+#include "xla/tsl/platform/errors.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/device.h"
+#include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/refcount.h"
 
 namespace tensorflow {
 
 xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) {
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
+  CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   return xla::ConstantLiteral(b, xla::LiteralUtil::Zero(type));
 }
 
 xla::XlaOp XlaHelpers::One(xla::XlaBuilder* b, DataType data_type) {
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
+  CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   return xla::ConstantLiteral(b, xla::LiteralUtil::One(type));
 }
 
 xla::XlaOp XlaHelpers::IntegerLiteral(xla::XlaBuilder* b, DataType data_type,
                                       int64_t value) {
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
+  CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   return ::tensorflow::IntegerLiteral(b, type, value);
 }
 
 xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type,
                                     double value) {
   xla::PrimitiveType type;
-  TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
+  CHECK_OK(DataTypeToPrimitiveType(data_type, &type));
   return ::tensorflow::FloatLiteral(b, type, value);
 }
 
@@ -139,7 +155,7 @@ DataType XlaHelpers::SumAccumulationType(const DataType& dtype) {
 xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp operand,
                                           const DataType new_element_type) {
   xla::PrimitiveType convert_to;
-  TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to));
+  CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to));
   return xla::ConvertElementType(operand, convert_to);
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index c74db865769229..f8e85ba81f677a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -15,9 +15,14 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 
-#include <functional>
+#include <algorithm>
+#include <iterator>
 #include <memory>
+#include <set>
 #include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
 
 #include "absl/algorithm/container.h"
 #include "absl/log/check.h"
@@ -28,6 +33,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "xla/tsl/platform/errors.h"
 #include "xla/util.h"
 #include "tensorflow/core/common_runtime/next_pluggable_device/next_pluggable_device_factory.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -42,11 +48,7 @@ limitations under the License.
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/status.h"
-#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/tfrt/common/pjrt_util.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/status.h"
 
 namespace tensorflow {
 
@@ -265,7 +267,7 @@ void XlaOpRegistry::RegisterCompilationKernels() {
             "Ops registered: \n" +
                 dynamic_cast<OpRegistry*>(op_registry)->DebugString(true));
       }
-      TF_CHECK_OK(lookup_status);
+      CHECK_OK(lookup_status);
 
       std::unordered_set<std::string> type_attrs;
       for (const OpDef::AttrDef& attr_def : op_def->attr()) {
@@ -475,7 +477,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const std::string& op) {
       }
     } else {
       int start, stop;
-      TF_CHECK_OK(op_kernel->InputRange(input, &start, &stop));
+      CHECK_OK(op_kernel->InputRange(input, &start, &stop));
       for (int i = start; i < stop; ++i) {
         result->push_back(i);
       }

From 226de359e2cd86d561dfdd87169ebd4fc8a8a02f Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Thu, 18 Dec 2025 16:21:42 -0800
Subject: [PATCH 556/753] Simplify `CoordinationServiceAgent` creation.

Previously, there was a `CreateCoordinationServiceAgent` function and three
public `Initialize` methods. Now, there is a single
`CoordinationServiceAgent::Create` method that does the creation and
initialization.

PiperOrigin-RevId: 846456258
---
 .../xla/xla/pjrt/distributed/client.cc        |  9 ++-
 .../xla/pjrt/distributed/coordination/BUILD   |  2 +
 .../coordination/client_server_test.cc        | 11 +--
 .../coordination_service_agent.cc             | 79 +++++--------------
 .../coordination/coordination_service_agent.h | 52 ++++++------
 .../coordination_service_agent_test.cc        | 26 +++---
 .../preemption_sync_manager_test.cc           | 22 +++---
 7 files changed, 78 insertions(+), 123 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/client.cc b/third_party/xla/xla/pjrt/distributed/client.cc
index 09bbb1060763cc..e8f57c5d4e36ec 100644
--- a/third_party/xla/xla/pjrt/distributed/client.cc
+++ b/third_party/xla/xla/pjrt/distributed/client.cc
@@ -98,13 +98,14 @@ DistributedRuntimeCoordinationServiceClient::
 
   std::unique_ptr<CoordinationClient> leader_client;
   leader_client.reset(NewGrpcCoordinationClient(channel));
-  coord_agent_ = CreateCoordinationServiceAgent();
-  const absl::Status status = coord_agent_->Initialize(
+  auto agent = CoordinationServiceAgent::Create(
       options.env, "jax_worker", options.node_id, config,
       std::move(leader_client), options.missed_heartbeat_callback,
       options.recoverable);
-  if (!status.ok()) {
-    LOG(ERROR) << "Coordination agent failed to initialize: " << status;
+  if (!agent.ok()) {
+    LOG(ERROR) << "Coordination agent failed to initialize: " << agent.status();
+  } else {
+    coord_agent_ = *std::move(agent);
   }
   task_id_ = options.node_id;
   config_ = config;
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/BUILD b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
index a24a4726930aca..d50eae2b53bdd1 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/coordination/BUILD
@@ -142,6 +142,7 @@ cc_library(
         "@com_google_absl//absl/functional:bind_front",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
@@ -163,6 +164,7 @@ xla_cc_test(
         "//xla/tsl/lib/core:status_test_util",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:status",
+        "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "//xla/tsl/protobuf:coordination_config_proto_cc_impl",
         "//xla/tsl/protobuf:coordination_service_proto_cc_impl",
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
index 783efd39fa4010..b071e9508a7f1a 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/client_server_test.cc
@@ -130,16 +130,17 @@ class ClientServerTest : public ::testing::Test {
     std::unique_ptr<CoordinationClient> leader_client;
     leader_client.reset(NewGrpcCoordinationClient(channel));
 
-    auto coord_agent = CreateCoordinationServiceAgent();
     CoordinationServiceAgent::Config config =
         GetConfig(init_and_shutdown_timeout, shutdown_on_destruction);
-    const absl::Status status = coord_agent->Initialize(
+    auto coord_agent = CoordinationServiceAgent::Create(
         tsl::Env::Default(), "agent", node_id, config, std::move(leader_client),
         std::move(error_fn), recoverable);
-    if (!status.ok()) {
-      LOG(ERROR) << "Coordination agent failed to initialize: " << status;
+    if (!coord_agent.ok()) {
+      LOG(ERROR) << "Coordination agent failed to initialize: "
+                 << coord_agent.status();
+      return nullptr;
     }
-    return coord_agent;
+    return *std::move(coord_agent);
   }
 
   void StartService(int num_nodes,
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
index 06a39be55b820d..8e3fa8497577ea 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "absl/functional/bind_front.h"
 #include "absl/log/check.h"
 #include "absl/log/log.h"
+#include "absl/memory/memory.h"
 #include "absl/status/status.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
@@ -55,7 +56,6 @@ limitations under the License.
 namespace xla {
 using tensorflow::CoordinatedTask;
 using tensorflow::CoordinatedTaskState;
-using tensorflow::CoordinatedTaskStateInfo;
 using tensorflow::DeviceInfo;
 using tensorflow::KeyValueEntry;
 
@@ -69,64 +69,40 @@ constexpr char kHeartbeatThread[] = "CoordinationServiceHeartbeatLoop";
 
 }  // namespace
 
-absl::Status CoordinationServiceAgent::Initialize(
-    tsl::Env* env, absl::string_view job_name, int task_id,
-    const Config& config, std::unique_ptr<CoordinationClient> leader_client,
-    tsl::StatusCallback error_fn) {
-  return Initialize(env, job_name, task_id, config, std::move(leader_client),
-                    error_fn,
-                    /*recoverable=*/false);
-}
-
-absl::Status CoordinationServiceAgent::Initialize(
+/*static*/ absl::StatusOr<std::unique_ptr<CoordinationServiceAgent>>
+CoordinationServiceAgent::Create(
     tsl::Env* env, absl::string_view job_name, int task_id,
     const Config& config, std::unique_ptr<CoordinationClient> leader_client,
     tsl::StatusCallback error_fn, bool recoverable) {
-  CoordinatedTask task;
-  task.set_job_name(std::string(job_name));
-  task.set_task_id(task_id);
+  // Validate arguments.
+  if (config.service_leader.empty()) {
+    return MakeCoordinationError(absl::InvalidArgumentError(
+        "CoordinationServiceAgent must be initialized with a valid leader."));
+  }
+  if (leader_client == nullptr) {
+    return MakeCoordinationError(absl::InvalidArgumentError(
+        "CoordinationServiceAgent must have a valid leader client."));
+  }
   if (recoverable) {
     LOG(WARNING)
         << "Using experimental recoverable task feature. The default shutdown "
            "barrier will only block non-recoverable tasks. If a synchronized "
            "shutdown is desired, the user / library should invoke "
            "`WaitAtBarrier` explicitly at the end of the program.";
-    task.set_recoverable(true);
   }
-  return Initialize(env, task, config, std::move(leader_client), error_fn);
-}
 
-absl::Status CoordinationServiceAgent::Initialize(
-    tsl::Env* env, const CoordinatedTask& task, const Config& config,
-    std::unique_ptr<CoordinationClient> leader_client,
-    tsl::StatusCallback error_fn) {
+  // Record coordination service agent metric.
   enabled_usage_metric->GetCell()->Set(true);
-  absl::MutexLock l(state_mu_);
-  if (state_ != CoordinatedTaskState::TASKSTATE_UNINITIALIZED) {
-    return MakeCoordinationError(absl::FailedPreconditionError(
-        "Coordination service agent has already been initialized."));
-  }
 
-  env_ = env;
-  task_ = task;
-  config_ = config;
-  if (config_.service_leader.empty()) {
-    return MakeCoordinationError(absl::InvalidArgumentError(
-        "CoordinationServiceAgent must be initialized with a valid leader."));
-  }
-  leader_client_ = std::move(leader_client);
-  if (leader_client_ == nullptr) {
-    return MakeCoordinationError(absl::InvalidArgumentError(
-        "CoordinationServiceAgent must have a valid leader client."));
-  }
-  error_fn_ = error_fn;
-  state_ = CoordinatedTaskState::TASKSTATE_DISCONNECTED;
-  return absl::OkStatus();
-}
+  CoordinatedTask task;
+  task.set_job_name(std::string(job_name));
+  task.set_task_id(task_id);
+  task.set_recoverable(recoverable);
 
-bool CoordinationServiceAgent::IsInitialized() {
-  absl::MutexLock l(state_mu_);
-  return state_ != CoordinatedTaskState::TASKSTATE_UNINITIALIZED;
+  // The CoordinationServiceAgent constructor is private, so we can't call
+  // std::make_unique.
+  return absl::WrapUnique(new CoordinationServiceAgent(
+      env, task, config, error_fn, std::move(leader_client)));
 }
 
 bool CoordinationServiceAgent::IsConnected() {
@@ -354,11 +330,6 @@ const DeviceInfo& CoordinationServiceAgent::GetClusterDeviceInfo() {
 }
 
 absl::StatusOr<CoordinatedTask> CoordinationServiceAgent::GetOwnTask() {
-  if (!IsInitialized()) {
-    return MakeCoordinationError(absl::FailedPreconditionError(
-        "Agent has not been initialized; we do not "
-        "know the associated task yet."));
-  }
   return task_;
 }
 
@@ -964,10 +935,6 @@ absl::Status CoordinationServiceAgent::ValidateRunningAgent(
 }
 
 absl::StatusOr<tsl::Env*> CoordinationServiceAgent::GetEnv() {
-  if (!IsInitialized()) {
-    return MakeCoordinationError(absl::FailedPreconditionError(
-        "Coordination service agent has not been initialized."));
-  }
   if (env_ == nullptr) {
     return MakeCoordinationError(absl::FailedPreconditionError(
         "Coordination service agent was not "
@@ -976,8 +943,4 @@ absl::StatusOr<tsl::Env*> CoordinationServiceAgent::GetEnv() {
   return env_;
 }
 
-std::unique_ptr<CoordinationServiceAgent> CreateCoordinationServiceAgent() {
-  return std::make_unique<CoordinationServiceAgent>();
-}
-
 }  // namespace xla
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
index c10aaa356eeea6..429bdec124f9e9 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent.h
@@ -106,30 +106,16 @@ class CoordinationServiceAgent {
   using ChangedKeyValuesCallback =
       std::function<void(const std::map<std::string, std::string>&)>;
 
-  CoordinationServiceAgent() = default;
+  static absl::StatusOr<std::unique_ptr<CoordinationServiceAgent>> Create(
+      tsl::Env* env, absl::string_view job_name, int task_id,
+      const Config& config, std::unique_ptr<CoordinationClient> leader_client,
+      tsl::StatusCallback error_fn, bool recoverable = false);
 
   virtual ~CoordinationServiceAgent() {
     absl::Status s = Shutdown();
     VLOG(3) << "Coordination agent dtor failed with status: " << s;
   }
 
-  absl::Status Initialize(tsl::Env* env, absl::string_view job_name,
-                          int task_id, const Config& config,
-                          std::unique_ptr<CoordinationClient> leader_client,
-                          tsl::StatusCallback error_fn, bool recoverable);
-  absl::Status Initialize(tsl::Env* env, absl::string_view job_name,
-                          int task_id, const Config& config,
-                          std::unique_ptr<CoordinationClient> leader_client,
-                          tsl::StatusCallback error_fn);
-  absl::Status Initialize(tsl::Env* env,
-                          const tensorflow::CoordinatedTask& task,
-                          const Config& config,
-                          std::unique_ptr<CoordinationClient> leader_client,
-                          tsl::StatusCallback error_fn);
-
-  // Return true if the coordination service agent has been initialized.
-  bool IsInitialized();
-
   // Return true if the coordination service agent has successfully connected
   // with the Coordination Service
   bool IsConnected();
@@ -154,11 +140,11 @@ class CoordinationServiceAgent {
 
   // State transition in coordination service agent:
   //
-  //                 Init              Connect           SetError
-  //   UNINITIALIZED ---> DISCONNECTED ------> CONNECTED -------> ERROR
-  //                           ^                                  |
-  //                           |__________________________________|
-  //                                         Reset
+  //               Connect           SetError
+  //  DISCONNECTED ------> CONNECTED -------> ERROR
+  //       ^                                  |
+  //       |__________________________________|
+  //                     Reset
 
   // Get task associated with this agent.
   absl::StatusOr<tensorflow::CoordinatedTask> GetOwnTask();
@@ -179,7 +165,7 @@ class CoordinationServiceAgent {
   // distinguish user-specified errors from internal service or RPC failures.
   // Possible service errors:
   //   - Internal: Coordination service has shut down.
-  //   - FailedPrecondition: Uninitialized/disconnected/already in error state.
+  //   - FailedPrecondition: disconnected/already in error state.
   //   - InvalidArgument: Unexpected task request
   absl::Status ReportError(const absl::Status& error);
 
@@ -290,8 +276,8 @@ class CoordinationServiceAgent {
   //       for the same barrier, (2) one of the participating tasks is not in
   //       the cluster, or (3) task making the request is not included in the
   //       list of participating tasks.
-  //   - FailedPrecondition: Agent is in UNINITIALIZED or ERROR state, or the
-  //       same barrier id is still being invoked.
+  //   - FailedPrecondition: Agent is in ERROR state, or the same barrier id is
+  //       still being invoked.
   virtual absl::Status WaitAtBarrier(
       absl::string_view barrier_id, absl::Duration timeout,
       const std::vector<tensorflow::CoordinatedTask>& tasks);
@@ -378,6 +364,16 @@ class CoordinationServiceAgent {
  private:
   friend class CoordinationServiceRpcHandler;
 
+  explicit CoordinationServiceAgent(
+      tsl::Env* env, const tensorflow::CoordinatedTask& task,
+      const Config& config, tsl::StatusCallback error_fn,
+      std::unique_ptr<CoordinationClient> leader_client)
+      : env_(env),
+        task_(task),
+        config_(config),
+        error_fn_(error_fn),
+        leader_client_(std::move(leader_client)) {}
+
   // Starts sending heartbeats to the coordination service.
   void StartSendingHeartbeats();
   // Use long polling to get error from the coordination service.
@@ -398,7 +394,7 @@ class CoordinationServiceAgent {
 
   mutable absl::Mutex state_mu_;
   tensorflow::CoordinatedTaskState state_ ABSL_GUARDED_BY(state_mu_) =
-      tensorflow::CoordinatedTaskState::TASKSTATE_UNINITIALIZED;
+      tensorflow::CoordinatedTaskState::TASKSTATE_DISCONNECTED;
   absl::Status status_ ABSL_GUARDED_BY(state_mu_) = absl::OkStatus();
   // Tracks the number of times a barrier has been used, keyed by id.
   absl::flat_hash_map<std::string, int64_t> barrier_counter_
@@ -429,8 +425,6 @@ class CoordinationServiceAgent {
   void operator=(const CoordinationServiceAgent&) = delete;
 };
 
-std::unique_ptr<CoordinationServiceAgent> CreateCoordinationServiceAgent();
-
 }  // namespace xla
 
 #endif  // XLA_PJRT_DISTRIBUTED_COORDINATION_COORDINATION_SERVICE_AGENT_H_
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
index 43c59e7a611ce5..e2e983cbb8a4ed 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_agent_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/status.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/tsl/protobuf/coordination_config.pb.h"
 #include "xla/tsl/protobuf/coordination_service.pb.h"
@@ -154,12 +155,13 @@ class CoordinationServiceAgentTest : public ::testing::Test {
   // Should be called after mocking service responses, before testing the agent.
   void InitializeAgent(CoordinationServiceAgent::Config config = {}) {
     config.service_leader = "test_leader";
-    TF_ASSERT_OK(agent_->Initialize(
-        tsl::Env::Default(), /*job_name=*/"test_job",
-        /*task_id=*/0, config, std::move(client_),
-        /*error_fn=*/[](absl::Status s) {
-          LOG(ERROR) << "Coordination agent is set to error: " << s;
-        }));
+    TF_ASSERT_OK_AND_ASSIGN(
+        agent_, CoordinationServiceAgent::Create(
+                    tsl::Env::Default(), /*job_name=*/"test_job",
+                    /*task_id=*/0, config, std::move(client_),
+                    /*error_fn=*/[](absl::Status s) {
+                      LOG(ERROR) << "Coordination agent is set to error: " << s;
+                    }));
   }
 
   TestCoordinationClient* GetClient() {
@@ -170,8 +172,7 @@ class CoordinationServiceAgentTest : public ::testing::Test {
   }
 
  protected:
-  std::unique_ptr<CoordinationServiceAgent> agent_ =
-      CreateCoordinationServiceAgent();
+  std::unique_ptr<CoordinationServiceAgent> agent_;
   std::unique_ptr<TestCoordinationClient> client_ =
       std::make_unique<TestCoordinationClient>();
 };
@@ -490,18 +491,9 @@ TEST_F(CoordinationServiceAgentTest, GetOwnTask) {
   EXPECT_EQ(actual_task.task_id(), expected_task.task_id());
 }
 
-TEST_F(CoordinationServiceAgentTest, GetOwnTask_Uninitialized) {
-  auto result = agent_->GetOwnTask();
-
-  EXPECT_TRUE(absl::IsFailedPrecondition(result.status()));
-}
-
 TEST_F(CoordinationServiceAgentTest, GetEnv_SucceedsAfterInit) {
-  EXPECT_TRUE(absl::IsFailedPrecondition(agent_->GetEnv().status()));
   InitializeAgent();
-
   absl::StatusOr<tsl::Env*> result = agent_->GetEnv();
-
   TF_ASSERT_OK(result.status());
   EXPECT_EQ(*result, tsl::Env::Default());
 }
diff --git a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
index ab33fea7b2ab82..2564ccc737b75e 100644
--- a/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/preemption/preemption_sync_manager_test.cc
@@ -163,12 +163,16 @@ class PreemptionSyncManagerTest : public ::testing::Test {
     };
     CoordinationServiceAgent::Config coord_config;
     coord_config.service_leader = "test_leader";
-    CHECK_OK(coord_agent_->Initialize(tsl::Env::Default(), kJobName,
-                                      /*task_id=*/0, coord_config,
-                                      std::move(coord_client), error_fn));
-    CHECK_OK(coord_agent2_->Initialize(tsl::Env::Default(), kJobName,
-                                       /*task_id=*/1, coord_config,
-                                       std::move(coord_client2), error_fn));
+    coord_agent_ =
+        CoordinationServiceAgent::Create(tsl::Env::Default(), kJobName,
+                                         /*task_id=*/0, coord_config,
+                                         std::move(coord_client), error_fn)
+            .value();
+    coord_agent2_ =
+        CoordinationServiceAgent::Create(tsl::Env::Default(), kJobName,
+                                         /*task_id=*/1, coord_config,
+                                         std::move(coord_client2), error_fn)
+            .value();
     CHECK_OK(coord_agent_->Connect());
     CHECK_OK(coord_agent2_->Connect());
   }
@@ -180,12 +184,10 @@ class PreemptionSyncManagerTest : public ::testing::Test {
   std::unique_ptr<tsl::AsyncServiceInterface> coord_rpc_service_;
   std::unique_ptr<tsl::Thread> coord_rpc_thread_;
   // Owned by task 1.
-  std::unique_ptr<CoordinationServiceAgent> coord_agent_ =
-      CreateCoordinationServiceAgent();
+  std::unique_ptr<CoordinationServiceAgent> coord_agent_;
   FakePreemptionNotifier* preempt_notifier_;
   // Owned by task 2.
-  std::unique_ptr<CoordinationServiceAgent> coord_agent2_ =
-      CreateCoordinationServiceAgent();
+  std::unique_ptr<CoordinationServiceAgent> coord_agent2_;
   FakePreemptionNotifier* preempt_notifier2_;
 };
 

From e54cce9f2b1c52f1675d4743e487ece6bcc444f0 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Thu, 18 Dec 2025 16:35:41 -0800
Subject: [PATCH 557/753] (4/N) Add support for `NamedSharding` in existing
 `HloShardingUtil` methods. Remaining methods will be updated in follow up
 cl's.

PiperOrigin-RevId: 846460947
---
 third_party/xla/xla/hlo/ir/named_sharding.h   |  3 +
 .../xla/xla/hlo/utils/hlo_sharding_util.cc    | 61 +++++++++++++++++--
 .../xla/xla/hlo/utils/hlo_sharding_util.h     |  5 +-
 .../xla/hlo/utils/hlo_sharding_util_test.cc   | 26 ++++++++
 4 files changed, 88 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/named_sharding.h b/third_party/xla/xla/hlo/ir/named_sharding.h
index 0795df6588a397..3631512f6299dd 100644
--- a/third_party/xla/xla/hlo/ir/named_sharding.h
+++ b/third_party/xla/xla/hlo/ir/named_sharding.h
@@ -87,6 +87,9 @@ class NamedSharding {
   absl::Span<const DimensionSharding> dim_shardings() const {
     return dim_shardings_;
   }
+  const DimensionSharding& dim_sharding(int64_t dim) const {
+    return dim_shardings_[dim];
+  }
   absl::Span<const AxisRef> replicated_axes() const { return replicated_axes_; }
   absl::Span<const AxisRef> unreduced_axes() const { return unreduced_axes_; }
   absl::Span<const OpMetadata> metadata() const { return metadata_; }
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
index a2ce251f7ab382..116e26a4377d28 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.cc
@@ -789,6 +789,28 @@ HloSharding TransposeSharding(const HloSharding& sharding,
   if (sharding.IsTileMaximal() || sharding.IsManual()) {
     return sharding;
   }
+
+  if (sharding.UseNamedShardingLeaf()) {
+    // For NamedSharding, subgroup dimensions (e.g., for replication) are
+    // handled separately from data dimensions. The `dimensions` parameter here
+    // only permutes data dimensions, so its size must match the tensor rank.
+    // This differs from the tile-based HloSharding format, where subgroup
+    // dimensions are part of the tile assignment.
+    CHECK_EQ(sharding.num_dimensions(), dimensions.size());
+
+    std::vector<NamedSharding::DimensionSharding> transposed_dim_shardings(
+        sharding.num_dimensions());
+    for (int64_t i = 0; i < dimensions.size(); ++i) {
+      transposed_dim_shardings[dimensions[i]] =
+          sharding.named_sharding().dim_sharding(i);
+    }
+    return HloSharding(NamedSharding(
+        sharding.named_sharding().mesh(), transposed_dim_shardings,
+        sharding.named_sharding().replicated_axes(),
+        sharding.named_sharding().unreduced_axes(),
+        sharding.named_sharding().metadata()));
+  }
+
   std::vector<int> perm_dimensions(dimensions.begin(), dimensions.end());
   // Add subgroup dims if missing.
   if (sharding.TiledDataRank() == dimensions.size()) {
@@ -1621,10 +1643,11 @@ HloSharding RemoveShapeDimensions(const HloSharding& sharding,
   }
 
   if (sharding.UseNamedShardingLeaf()) {
-    // Check to ensure subgroup dimensions are not passed in dims_to_remove as
-    // named sharding doesn't handle them as part of dim_shardings but separate
-    // replicated, unreduced axes as opposed to tile hlo sharding format which
-    // uses tile dimensions to represent subgroup dimensions as well.
+    // For NamedSharding, subgroup dimensions (e.g., for replication) are
+    // handled separately from data dimensions. The `dimensions` parameter here
+    // only permutes data dimensions, so its size must match the tensor rank.
+    // This differs from the tile-based HloSharding format, where subgroup
+    // dimensions are part of the tile assignment.
     DCHECK(
         std::all_of(dims_to_remove.begin(), dims_to_remove.end(),
                     [&](int64_t i) { return i < sharding.num_dimensions(); }));
@@ -1669,6 +1692,36 @@ std::optional<HloSharding> TransposeShardingWithCollapsedDims(
   if (source.IsTileMaximal() || source.IsManual()) {
     return source;
   }
+
+  if (source.UseNamedShardingLeaf()) {
+    // For NamedSharding, subgroup dimensions (e.g., for replication) are
+    // handled separately from data dimensions. The `dimensions` parameter here
+    // only permutes data dimensions, so its size must match the tensor rank.
+    // This differs from the tile-based HloSharding format, where subgroup
+    // dimensions are part of the tile assignment.
+    CHECK_EQ(source.num_dimensions(), src_to_tgt.size());
+
+    for (int64_t i = 0; i < src_to_tgt.size(); ++i) {
+      if (src_to_tgt[i] < 0 && source.dimension(i) > 1) {
+        return std::nullopt;
+      }
+    }
+
+    std::vector<NamedSharding::DimensionSharding> new_dim_shardings(
+        tgt_to_src.size());
+    for (int64_t i = 0; i < tgt_to_src.size(); ++i) {
+      if (tgt_to_src[i] >= 0) {
+        new_dim_shardings[i] =
+            source.named_sharding().dim_sharding(tgt_to_src[i]);
+      }
+    }
+
+    return HloSharding(NamedSharding(
+        source.named_sharding().mesh(), new_dim_shardings,
+        source.named_sharding().replicated_axes(),
+        source.named_sharding().unreduced_axes(), source.metadata()));
+  }
+
   if (src_to_tgt.size() < source.num_dimensions()) {
     // Add missing subgroup dims.
     DimensionVector new_src_to_tgt(src_to_tgt.begin(), src_to_tgt.end());
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
index 66e60692386523..95fd3d307cd954 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util.h
@@ -126,9 +126,8 @@ HloSharding FindCommonSharding(
 HloSharding MoveAndMergeShardingTiles(const HloSharding& sharding,
                                       int64_t source_dim, int64_t target_dim);
 
-// Returns the HloSharding with the tile dimensions and tile assignment
-// transposed based on the specified dimension numbers. In case of a tile
-// maximal sharding returns the original sharding.
+// Returns the HloSharding transposed based on the specified dimension numbers.
+// In case of a tile maximal sharding returns the original sharding.
 HloSharding TransposeSharding(const HloSharding& sharding,
                               absl::Span<const int64_t> dimensions);
 
diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
index d5deceb27c1970..27b6f4ee1f9076 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
@@ -145,12 +145,28 @@ TEST(HloShardingUtilTest, MoveAndMergeShardingTilesSubGroup) {
 TEST(HloShardingUtilTest, TransposeShardingReplicated) {
   EXPECT_EQ(TransposeSharding(HloSharding::Replicate(), {0, 1, 2}),
             HloSharding::Replicate());
+
+  EXPECT_EQ(
+      TransposeSharding(HloSharding::Replicate({}, /*use_named_sharding=*/true),
+                        {0, 1, 2}),
+      HloSharding::Replicate({}, /*use_named_sharding=*/true));
 }
 
 TEST(HloShardingUtilTest, TransposeShardingTiled) {
   HloSharding input = HloSharding::IotaTile({1, 2, 1, 2});
   HloSharding output = HloSharding::IotaTile({2, 1, 2, 1}, {2, 2}, {1, 0});
   EXPECT_EQ(TransposeSharding(input, {3, 0, 1, 2}), output);
+
+  {
+    Mesh mesh({2, 2}, {"a", "b"});
+    NamedSharding input =
+        test_utils::FromAxisNames(mesh, {{}, {"a"}, {}, {"b"}});
+    NamedSharding output =
+        test_utils::FromAxisNames(mesh, {{"b"}, {}, {"a"}, {}});
+    EXPECT_EQ(
+        TransposeSharding(HloSharding(input), {3, 2, 1, 0}).named_sharding(),
+        output);
+  }
 }
 
 TEST(HloShardingUtilTest, TransposeShardingWithCollapsedDimsSubgroupManual) {
@@ -160,6 +176,16 @@ TEST(HloShardingUtilTest, TransposeShardingWithCollapsedDimsSubgroupManual) {
       HloSharding::Subgroup(TileAssignment({1, 1, 2, 4}), {OpSharding::MANUAL});
   EXPECT_EQ(TransposeShardingWithCollapsedDims(input, {-1, 2}, {-1, -1, 1}),
             output);
+
+  {
+    Mesh mesh({1, 2, 4}, {"a", "b", "c"});
+    NamedSharding input = test_utils::FromAxisNames(mesh, {{"a"}, {"b"}});
+    NamedSharding output = test_utils::FromAxisNames(mesh, {{}, {}, {"b"}});
+    EXPECT_EQ(TransposeShardingWithCollapsedDims(HloSharding(input), {-1, 2},
+                                                 {-1, -1, 1})
+                  ->named_sharding(),
+              output);
+  }
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned1) {

From c2da2414305086336f510dce6fb318c14928df0c Mon Sep 17 00:00:00 2001
From: Abhinav Gunjal <agunjal@google.com>
Date: Thu, 18 Dec 2025 16:56:52 -0800
Subject: [PATCH 558/753] Integrate StableHLO at openxla/stablehlo@d496423c

PiperOrigin-RevId: 846467466
---
 .../xla/third_party/stablehlo/temporary.patch | 1056 -----------------
 .../xla/third_party/stablehlo/workspace.bzl   |    4 +-
 2 files changed, 2 insertions(+), 1058 deletions(-)

diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index f87d4742f36878..8b137891791fe9 100755
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -1,1057 +1 @@
-diff --ruN a/stablehlo/BUILD.bazel b/stablehlo/BUILD.bazel
---- stablehlo/BUILD.bazel
-+++ stablehlo/BUILD.bazel
-@@ -1183,6 +1183,7 @@
-         ":chlo_ops",
-         ":chlo_rewriters_inc_gen",
-         ":stablehlo_aggressive_simplification_inc_gen",
-+        ":stablehlo_broadcast_lowering",
-         ":stablehlo_create_compatibility_expander_inc_gen",
-         ":stablehlo_create_complex_math_expander_inc_gen",
-         ":stablehlo_legalize_deprecated_ops_inc_gen",
-@@ -1922,6 +1923,24 @@
-     ],
- )
- 
-+cc_test(
-+    name = "chlo_builder_test",
-+    srcs = ["stablehlo/integrations/cpp/builder/ChloBuilderTest.cpp"],
-+    deps = [
-+        ":attr_type_builder_util",
-+        ":chlo_builder",
-+        ":func_builder",
-+        ":mlir_builder",
-+        ":register",
-+        ":stablehlo_builder",
-+        ":stablehlo_ops",
-+        "@llvm-project//mlir:IR",
-+        "@llvm-project//mlir:Support",
-+        "@llvm-project//third-party/unittest:gmock",
-+        "@llvm-project//third-party/unittest:gtest",
-+    ],
-+)
-+
- gentbl_cc_library(
-     name = "func_builder_inc",
-     tbl_outs = {
-diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/Base.cpp
---- stablehlo/stablehlo/dialect/Base.cpp
-+++ stablehlo/stablehlo/dialect/Base.cpp
-@@ -29,6 +29,7 @@
- #include "llvm/ADT/STLExtras.h"
- #include "llvm/ADT/Sequence.h"
- #include "llvm/ADT/SmallVector.h"
-+#include "llvm/Support/Casting.h"
- #include "llvm/Support/Debug.h"
- #include "llvm/Support/ErrorHandling.h"
- #include "mlir/Dialect/Quant/IR/QuantTypes.h"
-@@ -781,6 +782,14 @@
-           numScales == rankedType.getDimSize(quantDim));
- }
- 
-+bool isBoundedDynamic(Type type) {
-+  RankedTensorType rankedType = dyn_cast<RankedTensorType>(type);
-+  if (!rankedType) return false;
-+  auto boundedAttr =
-+      mlir::dyn_cast_if_present<BoundedAttrInterface>(rankedType.getEncoding());
-+  return boundedAttr != nullptr;
-+}
-+
- bool hasSingleBoundedDimension(Type type) {
-   RankedTensorType rankedType = dyn_cast<RankedTensorType>(type);
-   auto boundedAttr =
-diff --ruN a/stablehlo/stablehlo/dialect/Base.h b/stablehlo/stablehlo/dialect/Base.h
---- stablehlo/stablehlo/dialect/Base.h
-+++ stablehlo/stablehlo/dialect/Base.h
-@@ -101,6 +101,9 @@
- // mentioned in the StableHLO specification.
- bool isValidQuantizedDimension(Type type);
- 
-+// Returns true if the given type is a bounded dynamic tensor.
-+bool isBoundedDynamic(Type type);
-+
- // Returns true if the given type has a single bounded dimension.
- bool hasSingleBoundedDimension(Type type);
- 
-diff --ruN a/stablehlo/stablehlo/dialect/ChloOps.cpp b/stablehlo/stablehlo/dialect/ChloOps.cpp
---- stablehlo/stablehlo/dialect/ChloOps.cpp
-+++ stablehlo/stablehlo/dialect/ChloOps.cpp
-@@ -365,11 +365,14 @@
-   Type elementType = op.getValue().getType();
-   Type operandType = op.getOperand().getType();
-   if (isa<UnrankedTensorType>(operandType)) {
-+    // TODO(b/326463552): Remove unranked dynamism from CHLO.
-     inferredReturnShapes.emplace_back(elementType);
--  } else {
--    const auto& shape = cast<RankedTensorType>(operandType).getShape();
--    inferredReturnShapes.emplace_back(shape, elementType);
--  }
-+    return success();
-+  }
-+  auto rankedType = cast<RankedTensorType>(operandType);
-+  const auto& shape = rankedType.getShape();
-+  Attribute encoding = rankedType.getEncoding();
-+  inferredReturnShapes.emplace_back(shape, elementType, encoding);
-   return success();
- }
- 
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt b/stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt
---- stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt
-+++ stablehlo/stablehlo/integrations/cpp/builder/CMakeLists.txt
-@@ -137,6 +137,7 @@
-     set_target_properties(check-stablehlo-ci PROPERTIES FOLDER "Tests")
-     add_unittest(check-stablehlo-ci "unittests"
-       MlirBuilderTest.cpp
-+      ChloBuilderTest.cpp
-       StablehloBuilderTest.cpp
-       AttrTypeBuilderUtilTest.cpp
-     )
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.cpp b/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.cpp
---- stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.cpp
-+++ stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.cpp
-@@ -31,5 +31,15 @@
- 
- #include "stablehlo/integrations/cpp/builder/ChloBuilder.cpp.inc"
- 
-+/////////////////
-+// MANUAL APIs
-+/////////////////
-+
-+MlirOp ConstantLike(MlirOp input, DenseElementsAttr val) {
-+  MlirBuilder& builder = input.getBuilder();
-+  auto splat_val = val.getSplatValue<TypedAttr>();
-+  return builder.create<chlo::ConstantLikeOp>(splat_val, input.getValue());
-+}
-+
- }  // namespace chlo
- }  // namespace mlir
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.h b/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.h
---- stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.h
-+++ stablehlo/stablehlo/integrations/cpp/builder/ChloBuilder.h
-@@ -19,6 +19,7 @@
- #include <cstdint>
- 
- #include "llvm/ADT/SmallVector.h"
-+#include "mlir/IR/BuiltinAttributes.h"
- #include "stablehlo/dialect/ChloOps.h"
- #include "stablehlo/integrations/cpp/builder/MlirBuilder.h"
- 
-@@ -31,6 +32,12 @@
- 
- #include "stablehlo/integrations/cpp/builder/ChloBuilder.h.inc"
- 
-+/////////////////
-+// MANUAL APIs
-+/////////////////
-+
-+MlirOp ConstantLike(MlirOp input, DenseElementsAttr val);
-+
- }  // namespace chlo
- }  // namespace mlir
- 
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilderTest.cpp b/stablehlo/stablehlo/integrations/cpp/builder/ChloBuilderTest.cpp
---- stablehlo/stablehlo/integrations/cpp/builder/ChloBuilderTest.cpp
-+++ stablehlo/stablehlo/integrations/cpp/builder/ChloBuilderTest.cpp
-@@ -0,0 +1,141 @@
-+/* Copyright 2025 The OpenXLA Authors.
-+
-+Licensed under the Apache License, Version 2.0 (the "License");
-+you may not use this file except in compliance with the License.
-+You may obtain a copy of the License at
-+
-+    http://www.apache.org/licenses/LICENSE-2.0
-+
-+Unless required by applicable law or agreed to in writing, software
-+distributed under the License is distributed on an "AS IS" BASIS,
-+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+See the License for the specific language governing permissions and
-+limitations under the License.
-+==============================================================================*/
-+
-+#include <string>
-+
-+#include "mlir/IR/BuiltinAttributes.h"
-+#include "mlir/IR/BuiltinOps.h"
-+#include "mlir/IR/DialectRegistry.h"
-+#include "mlir/IR/MLIRContext.h"
-+#include "mlir/IR/OwningOpRef.h"
-+#include "mlir/IR/Types.h"
-+#include "mlir/IR/Verifier.h"
-+#include "mlir/Support/DebugStringHelper.h"
-+#include "mlir/Support/LLVM.h"
-+#include "stablehlo/dialect/Register.h"
-+#include "stablehlo/integrations/cpp/builder/AttrTypeBuilderUtil.h"
-+#include "stablehlo/integrations/cpp/builder/ChloBuilder.h"
-+#include "stablehlo/integrations/cpp/builder/FuncBuilder.h"
-+#include "stablehlo/integrations/cpp/builder/MlirBuilder.h"
-+#include "testing/base/public/gunit.h"
-+#include "stablehlo/integrations/cpp/builder/StablehloBuilder.h"
-+
-+namespace mlir {
-+namespace chlo {
-+
-+namespace {
-+
-+// Wrap a module builder and register the classes needed
-+class ChloModuleBuilder {
-+ public:
-+  ChloModuleBuilder()
-+      : context_(), module_builder_(context_, mlir::unknownLoc(context_)) {
-+    DialectRegistry registry;
-+    stablehlo::registerAllDialects(registry);
-+    context_.appendDialectRegistry(registry);
-+    context_.loadAllAvailableDialects();
-+  }
-+
-+  ModuleBuilder& get() { return module_builder_; }
-+  ModuleBuilder* operator->() { return &module_builder_; }
-+
-+ private:
-+  MLIRContext context_;
-+  ModuleBuilder module_builder_;
-+};
-+
-+// TODO: Make a FileCheck matcher
-+
-+}  // namespace
-+
-+TEST(ChloBuilderTest, SmokeTest) {
-+  std::string expected = R"mlir(module {
-+  func.func @main(%arg0: tensor<2xi64>) -> tensor<2xi64> {
-+    %0 = chlo.constant dense<1> : tensor<i64>
-+    %1 = chlo.broadcast_add %arg0, %0 : (tensor<2xi64>, tensor<i64>) -> tensor<2xi64>
-+    return %1 : tensor<2xi64>
-+  }
-+})mlir";
-+
-+  ChloModuleBuilder mb;
-+  {  // Build Main Func
-+    Location funcLoc = fileLineColLoc(mb->getContext(), "main.mlir", 1, 1);
-+    func::FunctionBuilder fb(mb.get(), "main", funcLoc);
-+    auto type2xi64 = makeTensorType(mb->getContext(), {2}, ElementType::I64);
-+    auto typeScalari64 = makeTensorType(mb->getContext(), {}, ElementType::I64);
-+    auto arg0 = func::Argument(fb, type2xi64);
-+    auto cst = Constant(fb, mlir::makeConstant(1L, typeScalari64));
-+    auto add = BroadcastAdd(arg0, cst);
-+    func::Return(fb, {add});
-+  }
-+
-+  OwningOpRef<ModuleOp> module = mb->build();
-+  EXPECT_TRUE(succeeded(mlir::verify(*module)));
-+  EXPECT_EQ(expected, debugString(*module));
-+}
-+
-+TEST(MlirBuilderTest, ConstantLike) {
-+  std::string expected = R"mlir(module {
-+  func.func @main(%arg0: tensor<2xi64>) -> tensor<2xi64> {
-+    %0 = "chlo.constant_like"(%arg0) <{value = 1 : i64}> : (tensor<2xi64>) -> tensor<2xi64>
-+    return %0 : tensor<2xi64>
-+  }
-+})mlir";
-+
-+  ChloModuleBuilder mb;
-+  {  // Build Main Func
-+    Location funcLoc = fileLineColLoc(mb->getContext(), "main.mlir", 1, 1);
-+    func::FunctionBuilder fb(mb.get(), "main", funcLoc);
-+    auto type2xi64 = makeTensorType(mb->getContext(), {2}, ElementType::I64);
-+    auto typeScalari64 = makeTensorType(mb->getContext(), {}, ElementType::I64);
-+    auto arg0 = func::Argument(fb, type2xi64);
-+    auto cst = ConstantLike(arg0, mlir::makeConstant(1L, typeScalari64));
-+    func::Return(fb, {cst});
-+  }
-+
-+  OwningOpRef<ModuleOp> module = mb->build();
-+  EXPECT_TRUE(succeeded(mlir::verify(*module)));
-+  EXPECT_EQ(expected, debugString(*module));
-+}
-+
-+TEST(MlirBuilderTest, ConstantLikeBounded) {
-+  std::string expected = R"mlir(module {
-+  func.func @main(%arg0: tensor<2xi64>, %arg1: tensor<i32>) -> tensor<?xi32, #stablehlo.bounds<2>> {
-+    %0 = stablehlo.set_dimension_size %arg0, %arg1, dim = 0 : (tensor<2xi64>, tensor<i32>) -> tensor<?xi64, #stablehlo.bounds<2>>
-+    %1 = "chlo.constant_like"(%0) <{value = 1 : i32}> : (tensor<?xi64, #stablehlo.bounds<2>>) -> tensor<?xi32, #stablehlo.bounds<2>>
-+    return %1 : tensor<?xi32, #stablehlo.bounds<2>>
-+  }
-+})mlir";
-+
-+  ChloModuleBuilder mb;
-+  {  // Build Main Func
-+    Location funcLoc = fileLineColLoc(mb->getContext(), "main.mlir", 1, 1);
-+    func::FunctionBuilder fb(mb.get(), "main", funcLoc);
-+    auto type2xi64 = makeTensorType(mb->getContext(), {2}, ElementType::I64);
-+    auto typei32 = makeTensorType(mb->getContext(), {}, ElementType::I32);
-+    auto arg0 = func::Argument(fb, type2xi64);
-+    auto arg1 = func::Argument(fb, typei32);
-+    auto sds = stablehlo::SetDimensionSize(arg0, arg1, 0);
-+    auto cst = ConstantLike(sds, mlir::makeConstant(1L, typei32));
-+    func::Return(fb, {cst});
-+  }
-+
-+  OwningOpRef<ModuleOp> module = mb->build();
-+  EXPECT_TRUE(succeeded(mlir::verify(*module)));
-+  EXPECT_EQ(expected, debugString(*module));
-+}
-+
-+}  // namespace chlo
-+}  // namespace mlir
-diff --ruN a/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp b/stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp
---- stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp
-+++ stablehlo/stablehlo/integrations/cpp/builder/StablehloBuilder.cpp
-@@ -67,6 +67,7 @@
-   MlirOp operand = input;
-   auto inputType = mlir::cast<RankedTensorType>(input.getType());
-   auto resultType = inputType.clone(resultElementType);
-+  if (inputType == resultType) return input;  // skip no-op convert
-   if (isa<ComplexType>(inputType.getElementType()) &&
-       !isa<ComplexType>(resultElementType)) {
-     operand = stablehlo::Real(operand);
-diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir b/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
---- stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
-+++ stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
-@@ -622,6 +622,10 @@
-   func.return %result : tensor<complex<f32>>
- }
- 
-+//////
-+// Broadcast binary elementwise ops tests are located in
-+// chlo_legalize_to_stablehlo_broadcast.mlir
-+
- // -----
- 
- // Lower statically shaped `constant_like` to constant.
-@@ -632,6 +636,24 @@
-   %result = "chlo.constant_like"(%arg) { value = 3.2 : f32 }
-       : (tensor<1x2xi64>) -> tensor<1x2xf32>
-   func.return %result : tensor<1x2xf32>
-+}
-+
-+// -----
-+
-+// Lower dynamically shaped `constant_like` to broadcasted constant.
-+// CHECK-LABEL: constant_like_bounded_dynamic_shape
-+// CHECK-SAME: (%[[ARG0:.*]]: tensor<2xi64>, %[[ARG1:.*]]: tensor<i32>)
-+func.func @constant_like_bounded_dynamic_shape(%arg0: tensor<2xi64>, %arg1: tensor<i32>) -> tensor<?xi32, #stablehlo.bounds<2>> {
-+  %0 = stablehlo.set_dimension_size %arg0, %arg1, dim = 0 : (tensor<2xi64>, tensor<i32>) -> tensor<?xi64, #stablehlo.bounds<2>>
-+  // CHECK-NOT: chlo.constant_like
-+  // CHECK: %[[ARG0_DYN:.*]] = stablehlo.set_dimension_size %[[ARG0]], %[[ARG1]], dim = 0 : (tensor<2xi64>, tensor<i32>) -> tensor<?xi64, #stablehlo.bounds<2>>
-+  // CHECK: %[[CST:.*]] = stablehlo.constant dense<1> : tensor<i32>
-+  // CHECK-NEXT: %[[BCAST:.*]] = stablehlo.broadcast_in_dim %[[CST]], dims = [] : (tensor<i32>) -> tensor<2xi32>
-+  // CHECK-NEXT: %[[GDS:.*]] = stablehlo.get_dimension_size %[[ARG0_DYN]], dim = 0 : (tensor<?xi64, #stablehlo.bounds<2>>) -> tensor<i32>
-+  // CHECK-NEXT: %[[SDS:.*]] = stablehlo.set_dimension_size %[[BCAST]], %[[GDS]], dim = 0 : (tensor<2xi32>, tensor<i32>) -> tensor<?xi32, #stablehlo.bounds<2>>
-+  // CHECK-NEXT: return %[[SDS]] : tensor<?xi32, #stablehlo.bounds<2>>
-+  %1 = "chlo.constant_like"(%0) <{value = 1 : i32}> : (tensor<?xi64, #stablehlo.bounds<2>>) -> tensor<?xi32, #stablehlo.bounds<2>>
-+  return %1 : tensor<?xi32, #stablehlo.bounds<2>>
- }
- 
- // -----
-diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo_broadcast.mlir b/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo_broadcast.mlir
---- stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo_broadcast.mlir
-+++ stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo_broadcast.mlir
-@@ -3,8 +3,8 @@
- // Check the non-broadcast case for each registered op, then just check a
- // representative op for detailed broadcast semantics.
- 
--// CHECK-LABEL: @addWithoutBroadcast
--func.func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @add_no_broadcast
-+func.func @add_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.add %arg0, %arg1
-   %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -12,8 +12,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @addStaticBroadcastExpanding
--func.func @addStaticBroadcastExpanding(%arg0: tensor<4xf32>, %arg1: tensor<f32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @add_static_broadcast_expanding
-+func.func @add_static_broadcast_expanding(%arg0: tensor<4xf32>, %arg1: tensor<f32>) -> tensor<4xf32> {
-   // CHECK:      %[[BROADCAST:.+]] = stablehlo.broadcast_in_dim %arg1, dims = [] : (tensor<f32>) -> tensor<4xf32>
-   // CHECK-NEXT: stablehlo.add %arg0, %[[BROADCAST]]
-   // CHECK-NOT: shape
-@@ -23,8 +23,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @addStaticBroadcastSameRank
--func.func @addStaticBroadcastSameRank(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
-+// CHECK-LABEL: @add_static_broadcast_same_rank
-+func.func @add_static_broadcast_same_rank(%arg0: tensor<1x4xf32>, %arg1: tensor<4x1xf32>) -> tensor<4x4xf32> {
-   // CHECK:      %[[ARG0_B:.+]] = stablehlo.broadcast_in_dim %arg0, dims = [0, 1] : (tensor<1x4xf32>) -> tensor<4x4xf32>
-   // CHECK-NEXT: %[[ARG1_B:.+]] = stablehlo.broadcast_in_dim %arg1, dims = [0, 1] : (tensor<4x1xf32>) -> tensor<4x4xf32>
-   // CHECK-NEXT: stablehlo.add %[[ARG0_B]], %[[ARG1_B]] : tensor<4x4xf32>
-@@ -35,11 +35,33 @@
- 
- // -----
- 
--
--// CHECK-LABEL: @dynamicBroadcast
-+// [<=10] x [<=10] => [<=10]
-+// CHECK-LABEL: func @add_bounded_dynamic_no_broadcast
-+func.func @add_bounded_dynamic_no_broadcast(%arg0: tensor<?xf64, #stablehlo.bounds<10>>, %arg1: tensor<?xf64, #stablehlo.bounds<10>>) -> tensor<?xf64, #stablehlo.bounds<10>> {
-+  // CHECK-NEXT: stablehlo.add %arg0, %arg1
-+  %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<?xf64, #stablehlo.bounds<10>>, tensor<?xf64, #stablehlo.bounds<10>>) -> tensor<?xf64, #stablehlo.bounds<10>>
-+  return %0 : tensor<?xf64, #stablehlo.bounds<10>>
-+}
-+
-+// -----
-+
-+// [<=10] x [] => [<=10]
-+// CHECK-LABEL: func @add_bounded_dynamic_expanding
-+func.func @add_bounded_dynamic_expanding(%arg0: tensor<?xf64, #stablehlo.bounds<10>>, %arg1: tensor<f64>) -> tensor<?xf64, #stablehlo.bounds<10>> {
-+  // CHECK: %[[RHS_BCAST:.+]] = stablehlo.broadcast_in_dim %arg1, dims = [] : (tensor<f64>) -> tensor<10xf64>
-+  // CHECK: %[[DIM_SIZE:.+]] = stablehlo.get_dimension_size %arg0, dim = 0
-+  // CHECK: %[[RHS_BCAST_DYN:.+]] = stablehlo.set_dimension_size %[[RHS_BCAST]], %[[DIM_SIZE]], dim = 0
-+  // CHECK-NEXT: stablehlo.add %arg0, %[[RHS_BCAST_DYN]]
-+  %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<?xf64, #stablehlo.bounds<10>>, tensor<f64>) -> tensor<?xf64, #stablehlo.bounds<10>>
-+  return %0 : tensor<?xf64, #stablehlo.bounds<10>>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: @add_dynamic_broadcast
- // CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>
- // CHECK-SAME: %[[ARG1:.+]]: tensor<?x?xf32>
--func.func @dynamicBroadcast(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
-+func.func @add_dynamic_broadcast(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
-   // CHECK-DAG:  %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]]
-   // CHECK-DAG:  %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]]
-   // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]]
-@@ -57,10 +79,10 @@
- 
- // -----
- 
--// CHECK-LABEL: @dynamicBroadcastComplex
-+// CHECK-LABEL: @dynamic_broadcast_complex
- // CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>
- // CHECK-SAME: %[[ARG1:.+]]: tensor<?x?xf32>
--func.func @dynamicBroadcastComplex(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xcomplex<f32>> {
-+func.func @dynamic_broadcast_complex(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xcomplex<f32>> {
-   // CHECK-DAG:  %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]]
-   // CHECK-DAG:  %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]]
-   // CHECK-NEXT: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]]
-@@ -78,10 +100,10 @@
- 
- // -----
- 
--// CHECK-LABEL: @dynamicBroadcastCompare
-+// CHECK-LABEL: @compare_dynamic_broadcast
- // CHECK-SAME: %[[ARG0:.+]]: tensor<?xf32>
- // CHECK-SAME: %[[ARG1:.+]]: tensor<?x?xf32>
--func.func @dynamicBroadcastCompare(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xi1> {
-+func.func @compare_dynamic_broadcast(%arg0: tensor<?xf32>, %arg1: tensor<?x?xf32>) -> tensor<?x?xi1> {
-   // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]]
-   // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]]
-   // CHECK: %[[WITNESS:.+]] = shape.cstr_broadcastable %[[ARG0_S]], %[[ARG1_S]]
-@@ -191,8 +213,8 @@
- // -----
- 
- // Verifies that broadcast_dimensions validity checks are valid.
--// CHECK-LABEL: @dynamicNonScalarBroadcastDimensions
--func.func @dynamicNonScalarBroadcastDimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-+// CHECK-LABEL: @dynamic_non_scalar_broadcast_dimensions
-+func.func @dynamic_non_scalar_broadcast_dimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-   // CHECK: stablehlo.add
-   %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions =  array<i64: 1> } : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
-   func.return %0 : tensor<1x4xf32>
-@@ -201,8 +223,8 @@
- // -----
- 
- // Verifies that broadcast_dimensions validity checks are valid.
--// CHECK-LABEL: @dynamicNonScalarByScalarBroadcastDimensions
--func.func @dynamicNonScalarByScalarBroadcastDimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<f32>) -> tensor<1x4xf32> {
-+// CHECK-LABEL: @dynamic_non_scalar_by_scalar_broadcast_dimensions
-+func.func @dynamic_non_scalar_by_scalar_broadcast_dimensions(%arg0: tensor<1x4xf32>, %arg1: tensor<f32>) -> tensor<1x4xf32> {
-   // CHECK: stablehlo.add
-   %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<1x4xf32>, tensor<f32>) -> tensor<1x4xf32>
-   func.return %0 : tensor<1x4xf32>
-@@ -211,7 +233,7 @@
- // -----
- 
- // Verifies that invalid broadcast dimensions are rejected.
--func.func @dynamicNonScalarBroadcastDimensionsSizeMismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-+func.func @dynamic_non_scalar_broadcast_dimensions_size_mismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-   // expected-warning @+2 {{unsupported non prefix-padded dynamic rank broadcast_dimensions}}
-   // expected-error @+1 {{failed to legalize operation}}
-   %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = array<i64: 1, 2>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
-@@ -221,7 +243,7 @@
- // -----
- 
- // Verifies that invalid broadcast dimensions are rejected.
--func.func @dynamicNonScalarBroadcastDimensionsMismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-+func.func @dynamic_non_scalar_broadcast_dimensions_mismatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tensor<1x4xf32> {
-   // expected-warning @+2 {{unsupported non prefix-padded dynamic rank broadcast_dimensions}}
-   // expected-error @+1 {{failed to legalize operation}}
-   %0 = chlo.broadcast_add %arg0, %arg1 {broadcast_dimensions = array<i64: 2>} : (tensor<1x4xf32>, tensor<4xf32>) -> tensor<1x4xf32>
-@@ -232,8 +254,8 @@
- // Note that broadcast_add is used as a proxy for all of the template
- // expansions. Tests below merely verify that the op has an expansion.
- 
--// CHECK-LABEL: @andWithoutBroadcast
--func.func @andWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-+// CHECK-LABEL: @and_no_broadcast
-+func.func @and_no_broadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-   // CHECK: stablehlo.and %arg0, %arg1
-   %0 = chlo.broadcast_and %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1>
-   func.return %0 : tensor<4xi1>
-@@ -241,8 +263,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @atan2WithoutBroadcast
--func.func @atan2WithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @atan2_no_broadcast
-+func.func @atan2_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.atan2 %arg0, %arg1
-   %0 = chlo.broadcast_atan2 %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -250,8 +272,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @compareWithoutBroadcast
--func.func @compareWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xi1> {
-+// CHECK-LABEL: @compare_no_broadcast
-+func.func @compare_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xi1> {
-   // CHECK: stablehlo.compare EQ, %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1>
-   %0 = chlo.broadcast_compare %arg0, %arg1 {comparison_direction = #chlo<comparison_direction EQ>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1>
-   func.return %0 : tensor<4xi1>
-@@ -259,8 +281,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @complexWithoutBroadcast
--func.func @complexWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xcomplex<f32>> {
-+// CHECK-LABEL: @complex_no_broadcast
-+func.func @complex_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xcomplex<f32>> {
-   // CHECK: stablehlo.complex %arg0, %arg1 : tensor<4xcomplex<f32>>
-   %0 = chlo.broadcast_complex %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xcomplex<f32>>
-   func.return %0 : tensor<4xcomplex<f32>>
-@@ -268,8 +290,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @divideWithoutBroadcast
--func.func @divideWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @divide_no_broadcast
-+func.func @divide_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.divide %arg0, %arg1
-   %0 = chlo.broadcast_divide %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -277,8 +299,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @maximumWithoutBroadcast
--func.func @maximumWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @maximum_no_broadcast
-+func.func @maximum_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.maximum %arg0, %arg1
-   %0 = chlo.broadcast_maximum %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -286,8 +308,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @minimumWithoutBroadcast
--func.func @minimumWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @minimum_no_broadcast
-+func.func @minimum_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.minimum %arg0, %arg1
-   %0 = chlo.broadcast_minimum %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -295,8 +317,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @multiplyWithoutBroadcast
--func.func @multiplyWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @multiply_no_broadcast
-+func.func @multiply_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.multiply %arg0, %arg1
-   %0 = chlo.broadcast_multiply %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -304,8 +326,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @orWithoutBroadcast
--func.func @orWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-+// CHECK-LABEL: @or_no_broadcast
-+func.func @or_no_broadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-   // CHECK: stablehlo.or %arg0, %arg1
-   %0 = chlo.broadcast_or %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1>
-   func.return %0 : tensor<4xi1>
-@@ -313,8 +335,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @powerWithoutBroadcast
--func.func @powerWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @power_no_broadcast
-+func.func @power_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.power %arg0, %arg1
-   %0 = chlo.broadcast_power %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -322,8 +344,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @remainderWithoutBroadcast
--func.func @remainderWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @remainder_no_broadcast
-+func.func @remainder_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.remainder %arg0, %arg1
-   %0 = chlo.broadcast_remainder %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -331,8 +353,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @shift_leftWithoutBroadcast
--func.func @shift_leftWithoutBroadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-+// CHECK-LABEL: @shift_left_no_broadcast
-+func.func @shift_left_no_broadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-   // CHECK: stablehlo.shift_left %arg0, %arg1
-   %0 = chlo.broadcast_shift_left %arg0, %arg1 : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32>
-   func.return %0 : tensor<4xi32>
-@@ -340,8 +362,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @shift_right_arithmeticWithoutBroadcast
--func.func @shift_right_arithmeticWithoutBroadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-+// CHECK-LABEL: @shift_right_arithmetic_no_broadcast
-+func.func @shift_right_arithmetic_no_broadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-   // CHECK: stablehlo.shift_right_arithmetic %arg0, %arg1
-   %0 = chlo.broadcast_shift_right_arithmetic %arg0, %arg1 : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32>
-   func.return %0 : tensor<4xi32>
-@@ -349,8 +371,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @shift_right_logicalWithoutBroadcast
--func.func @shift_right_logicalWithoutBroadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-+// CHECK-LABEL: @shift_right_logical_no_broadcast
-+func.func @shift_right_logical_no_broadcast(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> {
-   // CHECK: stablehlo.shift_right_logical %arg0, %arg1
-   %0 = chlo.broadcast_shift_right_logical %arg0, %arg1 : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32>
-   func.return %0 : tensor<4xi32>
-@@ -358,8 +380,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @subWithoutBroadcast
--func.func @subWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-+// CHECK-LABEL: @sub_no_broadcast
-+func.func @sub_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> {
-   // CHECK: stablehlo.subtract %arg0, %arg1
-   %0 = chlo.broadcast_subtract %arg0, %arg1 : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
-   func.return %0 : tensor<4xf32>
-@@ -367,16 +389,16 @@
- 
- // -----
- 
--// CHECK-LABEL: @xorWithoutBroadcast
--func.func @xorWithoutBroadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-+// CHECK-LABEL: @xor_no_broadcast
-+func.func @xor_no_broadcast(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> {
-   // CHECK: stablehlo.xor %arg0, %arg1
-   %0 = chlo.broadcast_xor %arg0, %arg1 : (tensor<4xi1>, tensor<4xi1>) -> tensor<4xi1>
-   func.return %0 : tensor<4xi1>
- }
- 
- // -----
--// CHECK-LABEL: @NextAfterWithoutBroadcast
--func.func @NextAfterWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-+// CHECK-LABEL: @next_after_no_broadcast
-+func.func @next_after_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-     -> tensor<4xf32> {
-   // CHECK-NOT: chlo.broadcast_next_after
-   %0 = chlo.broadcast_next_after %arg0, %arg1
-@@ -386,8 +408,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @PolygammaWithoutBroadcast
--func.func @PolygammaWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-+// CHECK-LABEL: @Polygamma_no_broadcast
-+func.func @Polygamma_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-     -> tensor<4xf32> {
-   // CHECK-NOT: chlo.broadcast_polygamma
-   // CHECK-NOT: chlo.polygamma
-@@ -398,8 +420,8 @@
- 
- // -----
- 
--// CHECK-LABEL: @ZetaWithoutBroadcast
--func.func @ZetaWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-+// CHECK-LABEL: @Zeta_no_broadcast
-+func.func @Zeta_no_broadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>)
-     -> tensor<4xf32> {
-   // CHECK-NOT: chlo.broadcast_zeta
-   // CHECK-NOT: chlo.zeta
-diff --ruN a/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir b/stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
---- stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
-+++ stablehlo/stablehlo/tests/ops_stablehlo_roundtrip.mlir
-@@ -316,7 +316,7 @@
- // Serialized string:
- //   "\08\03\1A\02\01\02\22\02\00\01"
- func.func @test_custom_call2(%arg0: tensor<16x16xf32>) -> tensor<16x16xf32> {
--  %0 = "stablehlo.custom_call"(%arg0) {backend_config = "", call_target_name = "Sharding", stablehlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"} : (tensor<16x16xf32>) -> tensor<16x16xf32>
-+  %0 = "stablehlo.custom_call"(%arg0) {backend_config = "", call_target_name = "Sharding", mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"} : (tensor<16x16xf32>) -> tensor<16x16xf32>
-   func.return %0 : tensor<16x16xf32>
- }
- 
-diff --ruN a/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir b/stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
---- stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
-+++ stablehlo/stablehlo/tests/transforms/stablehlo_aggressive_folder.mlir
-@@ -218,6 +218,21 @@
-   // CHECK-NEXT: return [[TRUE]], [[FALSE]], [[TRUE]], [[TRUE]], [[TRUE]], [[FALSE]], [[TRUE]], [[FALSE]]
-   return %0, %1, %2, %3, %4, %5, %6, %7 :
-          tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>, tensor<i1>
-+}
-+
-+// -----
-+
-+// CHECK-LABEL: func.func @compare_fold_with_implicit_comparison_type
-+func.func @compare_fold_with_implicit_comparison_type() -> (tensor<3xi1>, tensor<3xi1>) {
-+  %c_0 = stablehlo.constant dense<0> : tensor<3xi64>
-+  %c = stablehlo.constant dense<[-1, 0, 1]> : tensor<3xi64>
-+  %c_1 = stablehlo.constant dense<0.0> : tensor<3xf64>
-+  %c_2 = stablehlo.constant dense<[-1.0, 0.0, 1.0]> : tensor<3xf64>
-+  %0 = stablehlo.compare GE, %c, %c_0 : (tensor<3xi64>, tensor<3xi64>) -> tensor<3xi1>
-+  %1 = stablehlo.compare GE, %c_2, %c_1 : (tensor<3xf64>, tensor<3xf64>) -> tensor<3xi1>
-+  // CHECK-DAG:  [[RES:%.+]] = stablehlo.constant dense<[false, true, true]> : tensor<3xi1>
-+  // CHECK-NEXT: return [[RES]], [[RES]] : tensor<3xi1>, tensor<3xi1>
-+  return %0, %1 : tensor<3xi1>, tensor<3xi1>
- }
- 
- // -----
-diff --ruN a/stablehlo/stablehlo/transforms/CMakeLists.txt b/stablehlo/stablehlo/transforms/CMakeLists.txt
---- stablehlo/stablehlo/transforms/CMakeLists.txt
-+++ stablehlo/stablehlo/transforms/CMakeLists.txt
-@@ -113,6 +113,7 @@
-   MLIRTransformUtils
-   StablehloBase
-   StablehloBroadcastUtils
-+  StablehloBroadcastLowering
-   StablehloLinalgTransforms
-   StablehloOps
-   StablehloOptimizationPasses
-diff --ruN a/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp b/stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
---- stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
-+++ stablehlo/stablehlo/transforms/ChloLegalizeToStablehlo.cpp
-@@ -35,7 +35,6 @@
- #include "mlir/IR/BuiltinAttributes.h"
- #include "mlir/IR/BuiltinTypeInterfaces.h"
- #include "mlir/IR/BuiltinTypes.h"
--#include "mlir/IR/ImplicitLocOpBuilder.h"
- #include "mlir/IR/MLIRContext.h"
- #include "mlir/IR/PatternMatch.h"
- #include "mlir/IR/TypeUtilities.h"
-@@ -51,6 +50,7 @@
- #include "stablehlo/transforms/ChloDecompositionUtils.h"
- #include "stablehlo/transforms/PassUtils.h"
- #include "stablehlo/transforms/Passes.h"
-+#include "stablehlo/transforms/StablehloBroadcastLowering.h"
- 
- // This must precede all other headers, otherwise during Windows cross
- // compilation, M_PI will not be defined.
-@@ -201,34 +201,13 @@
-       val);
- }
- 
--// Broadcast using numpy-style broadcasting semantics.
--// This is only valid if the CHLO op has static shaped operands, and no
--// explicitly specified broadcast_dimensions.
--//
--// Asserts that input is ranked tensor type.
--Value numpyBroadcastIfNeeded(Value op, RankedTensorType opResultType,
--                             PatternRewriter& rewriter) {
--  RankedTensorType inputType = cast<RankedTensorType>(op.getType());
--  RankedTensorType broadcastedResultType =
--      opResultType.clone(inputType.getElementType());
--
--  // No broadcasting needed if input type matches broadcasted result type.
--  if (inputType == broadcastedResultType) return op;
--
--  // broadcast dims are the last dims for numpy style broadcasting.
--  int64_t inputRank = inputType.getRank();
--  int64_t resultRank = opResultType.getRank();
--  auto broadcastDimensions =
--      llvm::to_vector(llvm::seq<int64_t>(resultRank - inputRank, resultRank));
--  return stablehlo::BroadcastInDimOp::create(rewriter, op.getLoc(),
--                                             broadcastedResultType, op,
--                                             broadcastDimensions)
--      .getResult();
--}
--
- //===----------------------------------------------------------------------===//
- // Broadcasting Patterns.
- //===----------------------------------------------------------------------===//
-+
-+bool isStaticOrBoundedDynamicTensor(RankedTensorType type) {
-+  return type.hasStaticShape() || hlo::isBoundedDynamic(type);
-+}
- 
- // Converts binary ops that statically are determined to not broadcast directly
- // to the corresponding stablehlo non-broadcasting op.
-@@ -243,12 +222,14 @@
-     // Only rewrite for statically determinable non-broadcasting cases.
-     auto lhsType = dyn_cast<RankedTensorType>(adaptor.getLhs().getType());
-     auto rhsType = dyn_cast<RankedTensorType>(adaptor.getRhs().getType());
--    if (!lhsType || !rhsType || lhsType.getShape() != rhsType.getShape() ||
--        !lhsType.hasStaticShape() || !rhsType.hasStaticShape())
-+    if (!lhsType || !rhsType || !isStaticOrBoundedDynamicTensor(lhsType) ||
-+        !isStaticOrBoundedDynamicTensor(rhsType) ||
-+        lhsType.getShape() != rhsType.getShape() ||
-+        lhsType.getEncoding() != rhsType.getEncoding())
-       return rewriter.notifyMatchFailure(
-           op,
-           "expected LHS and RHS to be ranked tensors with matching shapes that "
--          "are all static");
-+          "are all static or bounded dynamic");
- 
-     rewriter.replaceOp(
-         op, ValueRange{Adaptor::createOp(op, op.getType(),
-@@ -270,41 +251,46 @@
-     // Only rewrite for statically determinable non-broadcasting cases.
-     auto lhsType = dyn_cast<RankedTensorType>(adaptor.getLhs().getType());
-     auto rhsType = dyn_cast<RankedTensorType>(adaptor.getRhs().getType());
--    if (!lhsType || !rhsType || !lhsType.hasStaticShape() ||
--        !rhsType.hasStaticShape())
-+    if (!lhsType || !rhsType || !isStaticOrBoundedDynamicTensor(lhsType) ||
-+        !isStaticOrBoundedDynamicTensor(rhsType))
-       return rewriter.notifyMatchFailure(
-           op,
--          "expected LHS and RHS to be ranked tensor types with static "
--          "shape");
-+          "expected LHS and RHS to be ranked tensor types with static or "
-+          "bounded dynamic shape");
- 
-     // Rely on CHLO type inference to figure out the proper broadcasted shape.
-     auto resultType = dyn_cast<RankedTensorType>(op.getResult().getType());
--    if (!resultType || !resultType.hasStaticShape())
-+    if (!resultType || !isStaticOrBoundedDynamicTensor(resultType))
-       return rewriter.notifyMatchFailure(
--          op, "expected result to be a ranked tensor type with static shape");
-+          op,
-+          "expected result to be a ranked tensor type with static or bounded "
-+          "dynamic shape");
- 
-     auto lhs = adaptor.getLhs();
-     auto rhs = adaptor.getRhs();
-     auto broadcastDimensions = adaptor.getBroadcastDimensions();
-     if (broadcastDimensions &&
--        !hlo::isLegalNumpyRankedBroadcast(lhs, rhs, *broadcastDimensions))
-+        !hlo::isLegalNumpyRankedBroadcast(lhs, rhs, *broadcastDimensions)) {
-       return rewriter.notifyMatchFailure(
-           op,
-           "expected implicit broadcast_dimensions or numpy-style broadcasting");
-+    }
- 
-     LLVM_DEBUG(llvm::dbgs()
-                << "CHLO Decomposing " << op->getName() << " with broadcast "
-                << lhsType << " x " << rhsType << " -> " << resultType << "\n");
- 
--    // If operands are static directly create stablehlo broadcasting ops.
--    // Use numpy-style broadcasting with using StableHLO broadcast ops,
--    // when user didn't specify broadcast_dimensions.
--    auto lhsBroadcast =
--        numpyBroadcastIfNeeded(adaptor.getLhs(), resultType, rewriter);
--    auto rhsBroadcast =
--        numpyBroadcastIfNeeded(adaptor.getRhs(), resultType, rewriter);
--    auto result = Adaptor::createOp(op, resultType,
--                                    {lhsBroadcast, rhsBroadcast}, rewriter);
-+    // If operands are static or bounded dynamic, directly create stablehlo
-+    // broadcasting ops. Use numpy-style broadcasting with using StableHLO
-+    // broadcast ops. Can leave off broadcast_dimensions since the above
-+    // logic verifies that they are the default for numpy-style broadcasting.
-+    mlir::SmallVector<Value> broadcastOperands = {lhs, rhs};
-+    auto broadcasted_values =
-+        stablehlo::numpyBroadcastIfNeeded(rewriter, broadcastOperands);
-+    if (failed(broadcasted_values)) return failure();
-+
-+    auto result =
-+        Adaptor::createOp(op, resultType, *broadcasted_values, rewriter);
-     rewriter.replaceOp(op, {result.getResult()});
-     return success();
-   }
-@@ -425,7 +411,21 @@
-       return success();
-     }
- 
--    // Lower to broadcasted constant.
-+    // Lower to cst -> broadcast -> set_dimension_size if bounded dynamic.
-+    if (hlo::isBoundedDynamic(resultTy)) {
-+      Value constant = mlir::stablehlo::ConstantOp::create(
-+          rewriter, op.getLoc(), op.getValue());
-+      mlir::FailureOr<stablehlo::Dimensions> operandDims =
-+          getDimensions(adaptor.getOperand());
-+      if (failed(operandDims)) return failure();
-+      mlir::FailureOr<Value> broadcast =
-+          stablehlo::numpyBroadcastIfNeeded(rewriter, constant, *operandDims);
-+      if (failed(broadcast)) return failure();
-+      rewriter.replaceOp(op, *broadcast);
-+      return success();
-+    }
-+
-+    // Lower unbounded dynamic to broadcasted constant.
-     Location loc = op.getLoc();
-     Value constant =
-         mlir::stablehlo::ConstantOp::create(rewriter, loc, op.getValue());
-diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp b/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
---- stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
-+++ stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
-@@ -59,26 +59,6 @@
-   };
- }
- 
--FailureOr<Dimensions> getDimensions(Value op) {
--  // Get tensor type
--  mlir::RankedTensorType tensor_type = dyn_cast<RankedTensorType>(op.getType());
--  if (!tensor_type)
--    return emitError(op.getLoc(),
--                     "expected ranked tensor type for broadcast inputs");
--
--  auto encoding =
--      mlir::dyn_cast_if_present<mlir::stablehlo::TypeExtensionsAttr>(
--          tensor_type.getEncoding());
--
--  Dimensions dimensions;
--  dimensions.reserve(tensor_type.getRank());
--  for (int64_t idx = 0; idx < tensor_type.getRank(); ++idx) {
--    auto dimInfo = getDimensionInfo(op, tensor_type, encoding, idx);
--    dimensions.push_back(dimInfo);
--  }
--  return dimensions;
--}
--
- FailureOr<Dimensions> getNumpyBroadcastShapeWithBounds(Value op,
-                                                        const Dimensions& a,
-                                                        const Dimensions& b) {
-@@ -130,6 +110,28 @@
-   LLVM_DEBUG(llvm::dbgs() << "[getNumpyBroadcastShapeWithBounds] result: "
-                           << toString(result) << "\n");
-   return result;
-+}
-+
-+}  // namespace
-+
-+FailureOr<Dimensions> getDimensions(Value op) {
-+  // Get tensor type
-+  mlir::RankedTensorType tensor_type = dyn_cast<RankedTensorType>(op.getType());
-+  if (!tensor_type)
-+    return emitError(op.getLoc(),
-+                     "expected ranked tensor type for broadcast inputs");
-+
-+  auto encoding =
-+      mlir::dyn_cast_if_present<mlir::stablehlo::TypeExtensionsAttr>(
-+          tensor_type.getEncoding());
-+
-+  Dimensions dimensions;
-+  dimensions.reserve(tensor_type.getRank());
-+  for (int64_t idx = 0; idx < tensor_type.getRank(); ++idx) {
-+    auto dimInfo = getDimensionInfo(op, tensor_type, encoding, idx);
-+    dimensions.push_back(dimInfo);
-+  }
-+  return dimensions;
- }
- 
- mlir::RankedTensorType getRankedTensorType(const Dimensions& dims,
-@@ -155,7 +157,6 @@
-   return mlir::RankedTensorType::get(shape, element_type, encoding);
- }
- 
--}  // namespace
- 
- FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
-                                              ArrayRef<Value> ops) {
-diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h b/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
---- stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
-+++ stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
-@@ -47,6 +47,14 @@
- using Dimensions = SmallVector<DimensionInfo>;
- std::string toString(const Dimensions& dims);
- 
-+// Returns the dimensions of the given op, or failure if the op's type is not a
-+// ranked tensor.
-+FailureOr<Dimensions> getDimensions(Value op);
-+
-+// Returns the ranked tensor type with the given dimensions and element type.
-+mlir::RankedTensorType getRankedTensorType(const Dimensions& dims,
-+                                           mlir::Type element_type);
-+
- // Returns the common shape these ops would broadcast to, or an error if the
- // ops are not broadcastable.
- FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
-diff --ruN a/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp b/stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
---- stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
-+++ stablehlo/stablehlo/transforms/optimization/StablehloAggressiveFolder.cpp
-@@ -701,22 +701,50 @@
-     if (failed(validateShapeFoldDtype(rewriter, op, resultType)))
-       return failure();
- 
-+    ComparisonType comparisonType = getComparisonType(op);
-+    if (comparisonType == ComparisonType::NOTYPE)
-+      return rewriter.notifyMatchFailure(
-+          op, "Could not determine comparison type.");
-+
-+    LLVM_DEBUG(llvm::dbgs() << "comparisonType: " << comparisonType << "\n");
-+
-     auto res = foldBinaryOpIntOrFloat<FoldCompare, IntegerAttr, IntegerAttr>(
--        rewriter, op,
--        FoldCompare(op.getComparisonDirection(), op.getCompareType()));
-+        rewriter, op, FoldCompare(op.getComparisonDirection(), comparisonType));
-     if (failed(res)) return failure();
-     rewriter.replaceOpWithNewOp<mlir::stablehlo::ConstantOp>(op, res.value());
-     return success();
-   }
- 
-+  // Return the comparison type if set, else return the assumed comparison type
-+  // according to the StableHLO spec.
-+  ComparisonType getComparisonType(CompareOp op) const {
-+    auto compareType = op.getCompareType();
-+    if (compareType.has_value() &&
-+        compareType.value() != ComparisonType::NOTYPE)
-+      return *compareType;
-+
-+    Type elementType = op.getLhs().getType().getElementType();
-+    if (elementType.isUnsignedInteger() || elementType.isSignlessInteger(1))
-+      return ComparisonType::UNSIGNED;
-+    if (elementType.isSignlessInteger())
-+      return ComparisonType::SIGNED;
-+    else if (elementType.isFloat() || mlir::isa<ComplexType>(elementType))
-+      return ComparisonType::FLOAT;
-+    else
-+      return ComparisonType::NOTYPE;
-+  }
-+
-   struct FoldCompare {
-     FoldCompare(ComparisonDirection direction,
--                std::optional<ComparisonType> kind)
-+                ComparisonType kind)
-         : direction(direction), kind(kind) {}
-     ComparisonDirection direction;
--    std::optional<ComparisonType> kind;
-+    ComparisonType kind;
- 
-     APInt operator()(APFloat lhs, APFloat rhs) {
-+      if (kind != ComparisonType::FLOAT && kind != ComparisonType::TOTALORDER)
-+        llvm::report_fatal_error("invalid float comparison");
-+
-       bool result = false;
-       switch (direction) {
-         case ComparisonDirection::EQ:
-@@ -741,6 +769,9 @@
-       return APInt(/*bitwidth=*/1, result);
-     }
-     APInt operator()(APInt lhs, APInt rhs) {
-+      if (kind != ComparisonType::UNSIGNED && kind != ComparisonType::SIGNED)
-+        llvm::report_fatal_error("invalid integer comparison");
-+
-       bool result = false;
-       switch (direction) {
-         case ComparisonDirection::EQ:
 
diff --git a/third_party/xla/third_party/stablehlo/workspace.bzl b/third_party/xla/third_party/stablehlo/workspace.bzl
index 48e631619a6888..ed5215b42b4c30 100644
--- a/third_party/xla/third_party/stablehlo/workspace.bzl
+++ b/third_party/xla/third_party/stablehlo/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
     # LINT.IfChange
-    STABLEHLO_COMMIT = "1ef9e390b5295e676d2b864fe1924bc2f3f4cf0f"
-    STABLEHLO_SHA256 = "818c951ad0ba0ac6c26d3ed01fed8f9a0e5ca93f5aed35005f75f0faf11bdfb0"
+    STABLEHLO_COMMIT = "d496423cdb7f7d5272f14d517681202a0b9cbe41"
+    STABLEHLO_SHA256 = "eac3bd19f6c0b86ed3216b63d871d7c34a1aa679ca4a34975fe70fd043b34b85"
     # LINT.ThenChange(Google-internal path)
 
     tf_http_archive(

From 728c0ee8e4d8f4e63c2861da38c417cb7702d441 Mon Sep 17 00:00:00 2001
From: Michael Whittaker <mwhittaker@google.com>
Date: Thu, 18 Dec 2025 17:11:14 -0800
Subject: [PATCH 559/753] Remove unused
 `CoordinationService::Config::recoverable_jobs` field.

PiperOrigin-RevId: 846472174
---
 .../distributed/coordination/coordination_service.cc     | 9 +--------
 .../pjrt/distributed/coordination/coordination_service.h | 6 ------
 .../coordination/coordination_service_test.cc            | 6 ++++--
 3 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
index 0326b3c5cbb270..2c676d3b6be4fb 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.cc
@@ -889,8 +889,7 @@ absl::Status CoordinationService::RecordHeartbeat(const CoordinatedTask& task,
 bool CoordinationService::AllTasksAreRecoverable(
     const std::vector<CoordinatedTask>& tasks) {
   for (const auto& task : tasks) {
-    if (!cluster_state_[GetTaskName(task)]->IsRecoverable() &&
-        !isRecoverableJob(task.job_name())) {
+    if (!cluster_state_[GetTaskName(task)]->IsRecoverable()) {
       return false;
     }
   }
@@ -1757,12 +1756,6 @@ void CoordinationService::CompleteShutdownAfterBarrier(
   }
 }
 
-bool CoordinationService::isRecoverableJob(
-    const absl::string_view task_name) const {
-  return config_.recoverable_jobs.find(task_name) !=
-         config_.recoverable_jobs.end();
-}
-
 void CoordinationService::SendErrorPollingResponseOrFailAllTasks(
     const absl::Status& error) {
   CHECK(!error.ok()) << "SendErrorPollingResponseOrFailAllTasks called with OK "
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
index 1c302da245c90a..6c2ddb1b4cda1b 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service.h
@@ -89,12 +89,6 @@ class CoordinationService {
     // worker can disconnect individually.
     absl::Duration shutdown_barrier_timeout = absl::ZeroDuration();
 
-    // The list of jobs which are recoverable. If a task in this list fails,
-    // it will not propagate error to other tasks.
-    // If empty, no jobs will be recoverable and every task failure will cause
-    // error propagation to other tasks.
-    absl::flat_hash_set<std::string> recoverable_jobs;
-
     // If a task restarts with a new incarnation, we may allow it to reconnect
     // silently. This is useful when we know that a task can immediately resume
     // work upon re-connecting to the service.
diff --git a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
index ec75076d4c6208..ee224bc5bebb44 100644
--- a/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
+++ b/third_party/xla/xla/pjrt/distributed/coordination/coordination_service_test.cc
@@ -224,7 +224,8 @@ class CoordinateTwoTasksTest : public ::testing::Test {
         GetCoordinationServiceConfig(/*num_tasks=*/2);
     config.heartbeat_timeout = kHeartbeatTimeout;
     if (set_worker_job_recoverable) {
-      config.recoverable_jobs.insert("worker");
+      task_0_.set_recoverable(true);
+      task_1_.set_recoverable(true);
     }
     if (enable_shutdown_barrier) {
       config.shutdown_barrier_timeout = kShutdownBarrierTimeout;
@@ -1912,16 +1913,17 @@ TEST_F(CoordinateTwoTasksTest,
 TEST(CoordinationServiceTest, RecoverableAndNonRecoverableTasks) {
   CoordinationService::Config config;
   // Workers are recoverable, chief is not.
-  config.recoverable_jobs.insert("worker");
   CoordinatedTask chief;
   chief.set_job_name("chief");
   chief.set_task_id(0);
   CoordinatedTask task_0;
   task_0.set_job_name("worker");
   task_0.set_task_id(0);
+  task_0.set_recoverable(true);
   CoordinatedTask task_1;
   task_1.set_job_name("worker");
   task_1.set_task_id(1);
+  task_1.set_recoverable(true);
   CoordinatedJob chief_job;
   chief_job.set_name("chief");
   chief_job.set_num_tasks(1);

From 46f4c9ab9f8d71faa9f9f86b69868236c65d5c6a Mon Sep 17 00:00:00 2001
From: Hyeontaek Lim <hyeontaek@google.com>
Date: Thu, 18 Dec 2025 17:11:46 -0800
Subject: [PATCH 560/753] [JAX] Refresh a custom layout if a buffer is copied
 across clients or memories

This change retrieves a new custom layout if a buffer is copied across PjRt clients or memory spaces because the detail of the buffer layout often changes (e.g., tiling is added if a buffer is copied from a CPU client to a TPU client).

Without this change, the newly added test would fail with an error: `AssertionError: Layou[14 chars]or=(1, 0), tiling=(), sub_byte_element_size_in_bits=0) != Layou[14 chars]or=(1, 0), tiling=((8, 128),), sub_byte_element_size_in_bits=0)`

PiperOrigin-RevId: 846472304
---
 .../xla/xla/python/pjrt_ifrt/pjrt_array.cc     | 18 +++++++++---------
 third_party/xla/xla/python/version.h           |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
index 05be2489eaafee..ac9bc1ace446ad 100644
--- a/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
+++ b/third_party/xla/xla/python/pjrt_ifrt/pjrt_array.cc
@@ -550,15 +550,15 @@ absl::StatusOr<ArrayRef> PjRtArray::Copy(
   if (new_client == nullptr) {
     new_client = client_;
   }
-  std::shared_ptr<const xla::PjRtLayout> layout;
-  static MemoryKind kUnpinnedHostMemoryKind(UnpinnedHostMemorySpace::kKind);
-  // Unpinned host supports default layouts only; a custom layout would be
-  // ignored.
-  // TODO(hyeontaek): This behavior should be informed by the underlying PjRt
-  // client instead of following a convention.
-  if (layout_ != nullptr &&
-      canonicalized_sharding_memory_kind != kUnpinnedHostMemoryKind) {
-    layout = layout_;
+  std::shared_ptr<const xla::PjRtLayout> layout = layout_;
+  // If a copy has happened across clients or across different memory spaces,
+  // the layout of a new buffer may be different from that of the original
+  // buffer. Refreshing the custom layout using the new buffer layout makes sure
+  // that `PjRtArray` tracks a valid custom layout.
+  if (layout != nullptr &&
+      (client_ != new_client ||
+       sharding_->memory_kind() != canonicalized_sharding_memory_kind)) {
+    layout = buffers.front()->layout();
   }
   return std::visit(
       [this, new_client, &new_sharding, &buffers,
diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 6460623314ce63..96eadc39b1eeb2 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -19,6 +19,6 @@ limitations under the License.
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
 #define JAX_IFRT_VERSION_NUMBER \
-  44  // xla::ifrt::Device has a new PlatformName() API.
+  45  // Refresh custom layouts when copying an array across clients.
 
 #endif  // XLA_PYTHON_VERSION_H_

From 818d234c94c32a5c193e6283ece0c3acaa6bb8c6 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Thu, 18 Dec 2025 17:39:51 -0800
Subject: [PATCH 561/753] Add `testonly` parameter to `py_import` macros to
 allow testonly dependencies.

PiperOrigin-RevId: 846479655
---
 third_party/xla/third_party/py/py_import.bzl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/third_party/py/py_import.bzl b/third_party/xla/third_party/py/py_import.bzl
index 08aa56f1b42c20..7ab46f8bfa00ef 100644
--- a/third_party/xla/third_party/py/py_import.bzl
+++ b/third_party/xla/third_party/py/py_import.bzl
@@ -49,13 +49,15 @@ def py_import(
         wheel,
         deps = [],
         wheel_deps = [],
-        zip_deps = []):
+        zip_deps = [],
+        testonly = False):
     unpacked_wheel_name = name + "_unpacked_wheel"
     _unpacked_wheel(
         name = unpacked_wheel_name,
         wheel = wheel,
         wheel_deps = wheel_deps,
         zip_deps = zip_deps,
+        testonly = testonly,
     )
     py_library(
         name = name,
@@ -63,6 +65,7 @@ def py_import(
         imports = [unpacked_wheel_name],
         deps = deps,
         visibility = ["//visibility:public"],
+        testonly = testonly,
     )
 
 """Unpacks the wheel and uses its content as a py_library.

From 968bb7de374fc0a279e6d460c85f6b04a78053dc Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 18 Dec 2025 18:00:28 -0800
Subject: [PATCH 562/753] [PJRT] Remove the .start field on transpose plan
 Nodes.

Instead, accumulate a per-chunk offset and apply that once at the start of execution.

This is also intended as a refactoring, but it may have small impacts on transpose performance since we have essentially moved some scalar math out of the inner loops.

PiperOrigin-RevId: 846484673
---
 third_party/xla/xla/pjrt/transpose.cc | 118 ++++++++++++--------------
 third_party/xla/xla/pjrt/transpose.h  |  11 ++-
 2 files changed, 63 insertions(+), 66 deletions(-)

diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index 5deddb6c0dee60..6ad578d667866e 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -120,9 +120,8 @@ static constexpr int kMaxInnerBlockSizeBytes = 16;
 // A plan is a data structure that describes a loop nest.
 // TODO(phawkins): consider shrinking Node so it fits in a cache line.
 struct TransposePlan::Node {
-  // The loop should iterate over the index space range(start, end, inc).
+  // The loop should iterate over the index space range(0, end, inc).
   // These fields are ignored by the macrokernel.
-  int64_t start;
   int64_t end;  // For the inner loop of a memcpy loop nest, this is the size of
                 // the transfer.
   int64_t inc;  // The transpose sentinel node has inc < 0.
@@ -203,7 +202,6 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
   DVLOG(10) << "Transpose " << outer_bs_a << " " << outer_bs_b;
   DCHECK_GT(outer_bs_a, 0);
   DCHECK_GT(outer_bs_b, 0);
-  const int64_t start = node->start;
   const int64_t end = node->end;
   const int64_t stop = node->end - (node->inc - 1);
   const int64_t lda = node->lda;
@@ -217,7 +215,7 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
     const int64_t lda_block = next_node->lda;
     const int64_t ldb_block = next_node->ldb;
     int64_t i;
-    for (i = start; i < stop; i += inc) {
+    for (i = 0; i < stop; i += inc) {
       MacroKernel<T, inner_bs, transformation>(a + i * lda, lda_block,
                                                outer_bs_a, b + i * ldb,
                                                ldb_block, outer_bs_b, scratch);
@@ -281,7 +279,7 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
     // inner loops. Structurally this code is identical to the previous case,
     // but we call Transpose() recursively instead of MacroKernel().
     int64_t i;
-    for (i = start; i < stop; i += inc) {
+    for (i = 0; i < stop; i += inc) {
       Transpose<T, inner_bs, transformation>(
           a + i * lda, outer_bs_a, b + i * ldb, outer_bs_b, next_node, scratch);
     }
@@ -335,59 +333,44 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
 
 void TransposeConstStride1(const char* __restrict a, char* __restrict b,
                            TransposePlan::Node const* __restrict node) {
-  a += node[0].start * node[0].lda;
-  b += node[0].start * node[0].ldb;
   if (node[0].is_inner_dim_in_a) {
     int64_t num_bytes = node->end;
     std::memcpy(b, a, num_bytes);
   } else if (node[1].is_inner_dim_in_a) {
-    int64_t offset_a = node[1].start * node[1].lda;
-    int64_t offset_b = node[1].start * node[1].ldb;
     int64_t num_bytes = node[1].end;
-    a += offset_a;
-    b += offset_b;
-    for (int64_t i = node[0].start; i < node[0].end; ++i) {
+    for (int64_t i = 0; i < node[0].end; ++i) {
       std::memcpy(b, a, num_bytes);
       a += node[0].lda;
       b += node[0].ldb;
     }
     if (node[0].trailing_tile_next_node_inc) {
-      TransposeConstStride1(a - offset_a, b - offset_b,
-                            node + node[0].trailing_tile_next_node_inc);
+      TransposeConstStride1(a, b, node + node[0].trailing_tile_next_node_inc);
     }
   } else if (node[2].is_inner_dim_in_a) {
     int64_t num_bytes = node[2].end;
-    int64_t offset_a1 = node[1].start * node[1].lda;
-    int64_t offset_b1 = node[1].start * node[1].ldb;
-    int64_t offset_a2 = node[2].start * node[2].lda;
-    int64_t offset_b2 = node[2].start * node[2].ldb;
-    a += offset_a1 + offset_a2;
-    b += offset_b1 + offset_b2;
-    for (int64_t i = node[0].start; i < node[0].end; ++i) {
+    for (int64_t i = 0; i < node[0].end; ++i) {
       const char* a1 = a;
       char* b1 = b;
-      for (int64_t j = node[1].start; j < node[1].end; ++j) {
+      for (int64_t j = 0; j < node[1].end; ++j) {
         std::memcpy(b1, a1, num_bytes);
         a1 += node[1].lda;
         b1 += node[1].ldb;
       }
       if (node[1].trailing_tile_next_node_inc) {
-        TransposeConstStride1(a1 - offset_a2, b1 - offset_b2,
+        TransposeConstStride1(a1, b1,
                               &node[1] + node[1].trailing_tile_next_node_inc);
       }
       a += node[0].lda;
       b += node[0].ldb;
     }
     if (node[0].trailing_tile_next_node_inc) {
-      TransposeConstStride1(a - offset_a1 - offset_a2,
-                            b - offset_b1 - offset_b2,
-                            node + node[0].trailing_tile_next_node_inc);
+      TransposeConstStride1(a, b, node + node[0].trailing_tile_next_node_inc);
     }
   } else {
-    for (int64_t i = node[0].start; i < node[0].end; ++i) {
-      const char* a1 = a + node[1].start * node[1].lda;
-      char* b1 = b + node[1].start * node[1].ldb;
-      for (int64_t j = node[1].start; j < node[1].end; ++j) {
+    for (int64_t i = 0; i < node[0].end; ++i) {
+      const char* a1 = a;
+      char* b1 = b;
+      for (int64_t j = 0; j < node[1].end; ++j) {
         TransposeConstStride1(a1, b1, node + 2);
         a1 += node[1].lda;
         b1 += node[1].ldb;
@@ -468,10 +451,12 @@ void TransposePlan::Execute(
   }
   tsl::profiler::TraceMe traceme("Transpose::Execute", /*level=*/2);
 
-  const char* ac = static_cast<const char*>(a);
-  char* bc = static_cast<char*>(b);
+  auto execute_by_type = [&](int chunk_id) {
+    const char* ac =
+        static_cast<const char*>(a) + input_offset_bytes_[chunk_id];
+    char* bc = static_cast<char*>(b) + output_offset_bytes_[chunk_id];
 
-  auto execute_by_type = [&](absl::Span<Node const> nodes) {
+    absl::Span<Node const> nodes = nodes_[chunk_id];
     if (inner_kernel_is_memcpy_) {
       DCHECK(transformation_ == Transformation::kNone);
       // Memcpy-based plans all assume element size 1 (i.e., bytes).
@@ -506,20 +491,19 @@ void TransposePlan::Execute(
   };
 
   if (!schedule_work || nodes_.size() <= 1) {
-    for (const auto& nodes : nodes_) {
-      execute_by_type(nodes);
+    for (int i = 0; i < nodes_.size(); ++i) {
+      execute_by_type(i);
     }
   } else {
     absl::BlockingCounter counter(nodes_.size() - 1);
-    for (size_t i = 1; i < nodes_.size(); ++i) {
-      absl::Span<Node const> nodes = nodes_[i];
-      (*schedule_work)([&, nodes]() {
-        execute_by_type(nodes);
+    for (int i = 1; i < nodes_.size(); ++i) {
+      (*schedule_work)([&, i]() {
+        execute_by_type(i);
         counter.DecrementCount();
       });
     }
     // Run the first chunk inline in this thread.
-    execute_by_type(nodes_[0]);
+    execute_by_type(0);
     counter.Wait();
   }
 }
@@ -634,7 +618,7 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
                       absl::InlinedVector<bool, 4>(ndim, false)});
 
   auto loop_has_trivial_iteration_space = [](const Node& node) {
-    return node.start == 0 && node.start + node.inc == node.end;
+    return node.inc == node.end;
   };
 
   while (!agenda.empty()) {
@@ -655,7 +639,7 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       // value, that describes the striding of the inner transpose kernel.
       if (!inner_kernel_is_memcpy_) {
         Node node;
-        node.start = node.end = node.inc = -1;
+        node.end = node.inc = -1;
         node.lda = sentinel_lda_;
         node.ldb = sentinel_ldb_;
         nodes.push_back(node);
@@ -689,11 +673,10 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       CHECK(loop.start % node.inc == 0)
           << "loop.start=" << loop.start
           << " must be aligned to node.inc=" << node.inc;
-      node.start = loop.start;
-      node.end = std::min<int64_t>(size, loop.end);
+      node.end = std::min<int64_t>(size, loop.end) - loop.start;
 
       if (node.is_inner_dim_in_a && inner_kernel_is_memcpy_) {
-        node.end = (node.end - node.start) * elem_size_in_bytes_;
+        node.end *= elem_size_in_bytes_;
       }
 
       if (!loop_has_trivial_iteration_space(node) ||
@@ -736,11 +719,10 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
 
       // loop.start and loop.end are in tile units.
       int64_t num_tiles = partial ? 1 : num_complete_tiles;
-      node.start = loop.start;
-      node.end = std::min<int64_t>(num_tiles, loop.end);
+      node.end = std::min<int64_t>(num_tiles, loop.end) - loop.start;
 
       if (node.is_inner_dim_in_a && inner_kernel_is_memcpy_) {
-        node.end = (node.end - node.start) * elem_size_in_bytes_;
+        node.end *= elem_size_in_bytes_;
       }
 
       // If this loop has a trivial iteration space, drop it.
@@ -1110,7 +1092,8 @@ void TransposePlan::Initialize() {
       << ToString();
 
   int num_chunks = ChooseParallelizationStrategy(loop_order);
-  chunk_loops_ = PartitionLoops(num_chunks, loop_order);
+  PartitionLoops(num_chunks, loop_order, chunk_loops_, input_offset_bytes_,
+                 output_offset_bytes_);
   nodes_.resize(num_chunks);
   for (int chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
     BuildPlanNodes(chunk_id, nodes_[chunk_id]);
@@ -1203,13 +1186,16 @@ int TransposePlan::ChooseParallelizationStrategy(
   return num_chunks;
 }
 
-std::vector<std::vector<TransposePlan::Loop>> TransposePlan::PartitionLoops(
-    int num_chunks, const std::vector<Loop>& loop_order) {
-  std::vector<std::vector<Loop>> result(num_chunks);
+/*static*/ void TransposePlan::PartitionLoops(
+    int num_chunks, const std::vector<Loop>& loop_order,
+    std::vector<std::vector<TransposePlan::Loop>>& result,
+    std::vector<int64_t>& input_offset_bytes,
+    std::vector<int64_t>& output_offset_bytes) {
+  // Copy the base loop order for each chunk.
+  result.resize(num_chunks, loop_order);
+  input_offset_bytes.resize(num_chunks);
+  output_offset_bytes.resize(num_chunks);
   for (int chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
-    // Copy the base loop order for this chunk.
-    result[chunk_id] = loop_order;
-
     // For each loop, narrow the start/end bounds to this chunk's portion.
     int task_id_remaining = chunk_id;
     int num_tasks_remaining = num_chunks;
@@ -1233,10 +1219,10 @@ std::vector<std::vector<TransposePlan::Loop>> TransposePlan::PartitionLoops(
       chunk_loop.end =
           base_loop.start +
           std::min(iterations, (task_id + 1) * iterations_per_task);
+      input_offset_bytes[chunk_id] += chunk_loop.start * chunk_loop.lda;
+      output_offset_bytes[chunk_id] += chunk_loop.start * chunk_loop.ldb;
     }
   }
-
-  return result;
 }
 
 std::string TransposePlan::ToString() const {
@@ -1249,9 +1235,9 @@ std::string TransposePlan::ToString() const {
                   absl::StrAppendFormat(
                       out,
                       "    "
-                      "Node(start=%d,end=%d,inc=%d,lda=%"
+                      "Node(end=%d,inc=%d,lda=%"
                       "d,ldb=%d,next_trailing=%d,inner_a=%s,inner_b=%s)",
-                      node.start, node.end, node.inc, node.lda, node.ldb,
+                      node.end, node.inc, node.lda, node.ldb,
                       node.trailing_tile_next_node_inc,
                       node.is_inner_dim_in_a ? "y" : "n",
                       node.is_inner_dim_in_b ? "y" : "n");
@@ -1262,11 +1248,15 @@ std::string TransposePlan::ToString() const {
                           loop.tile_interior ? "[tile]" : "", loop.start,
                           loop.end, loop.parallelism);
   };
-  std::string chunk_loops_str = absl::StrJoin(
-      chunk_loops_, "\n",
-      [&](std::string* out, const std::vector<Loop>& loops) {
-        absl::StrAppend(out, "    ", absl::StrJoin(loops, ", ", format_loop));
-      });
+  std::vector<std::string> chunk_strings;
+  chunk_strings.reserve(chunk_loops_.size());
+  for (int i = 0; i < chunk_loops_.size(); ++i) {
+    chunk_strings.push_back(absl::StrFormat(
+        "    chunk %d: input_offset=%d output_offset=%d loops=%s", i,
+        input_offset_bytes_[i], output_offset_bytes_[i],
+        absl::StrJoin(chunk_loops_[i], ", ", format_loop)));
+  }
+  std::string chunk_loops_str = absl::StrJoin(chunk_strings, "\n");
   std::string transformation_str;
   switch (transformation_) {
     case Transformation::kNone:
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index b0eccc5d37132b..44926cfe82ae62 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -200,8 +200,11 @@ class TransposePlan {
   // Creates per-chunk loop vectors by splitting loop_order_ into per-chunk
   // loops. Returns a vector of loop vectors, one per chunk. Each chunk's
   // loops have their start/end bounds narrowed to represent that chunk's work.
-  std::vector<std::vector<Loop>> PartitionLoops(
-      int num_chunks, const std::vector<Loop>& loop_order);
+  static void PartitionLoops(
+      int num_chunks, const std::vector<Loop>& loop_order,
+      std::vector<std::vector<TransposePlan::Loop>>& result,
+      std::vector<int64_t>& input_offset_bytes,
+      std::vector<int64_t>& output_offset_bytes);
 
   // The signature of ExecuteTyped uses char* pointers because we perform
   // address calculations with strides in bytes; the strides need not be
@@ -252,6 +255,10 @@ class TransposePlan {
   // representing one chunk of the work.
   std::vector<std::vector<Loop>> chunk_loops_;
 
+  // Per-chunk byte offsets into the input and output arrays.
+  std::vector<int64_t> input_offset_bytes_;
+  std::vector<int64_t> output_offset_bytes_;
+
   // Root nodes of the plan, i.e., pointing to the outermost loops in the loop
   // nest. The outer vector is indexed on the thread ID.
   absl::InlinedVector<std::vector<Node>, 1> nodes_;

From 72155b4767b39022db1cd31807f51e3689384fae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 20:14:18 -0800
Subject: [PATCH 563/753] Automated Code Change

PiperOrigin-RevId: 846523012
---
 tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
index 442ef61d093a5b..e8e367c48a568a 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
@@ -244,7 +244,7 @@ class MlrtBatchResource : public tensorflow::serving::BatchResourceBase {
     return absl::OkStatus();
   }
 
-  string DebugString() const final { return "MlrtBatchResource"; }
+  std::string DebugString() const final { return "MlrtBatchResource"; }
 
   mlrt::bc::Function batch_function() const { return batch_function_; }
 

From 4b820b349f4b3d802b1fc7543ebfefeb67bac263 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 20:19:31 -0800
Subject: [PATCH 564/753] Automated Code Change

PiperOrigin-RevId: 846524556
---
 .../core/tfrt/tfrt_session/tfrt_session.cc    | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc b/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc
index 0fc8f06b2b5e53..9956e74011d7ed 100644
--- a/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc
+++ b/tensorflow/core/tfrt/tfrt_session/tfrt_session.cc
@@ -173,7 +173,7 @@ class TfrtSession : public tensorflow::Session {
   }
 
   absl::Status Create(GraphDef&& graph) override {
-    absl::MutexLock lock(&session_state_lock_);
+    absl::MutexLock lock(session_state_lock_);
     return CreateLocked(std::move(graph));
   }
 
@@ -279,7 +279,7 @@ class TfrtSession : public tensorflow::Session {
   }
 
   absl::Status Extend(GraphDef&& graph) override {
-    absl::MutexLock lock(&session_state_lock_);
+    absl::MutexLock lock(session_state_lock_);
     return ExtendLocked(std::move(graph));
   }
 
@@ -299,7 +299,7 @@ class TfrtSession : public tensorflow::Session {
       std::vector<Tensor>* outputs,
       const thread::ThreadPoolOptions& thread_pool_options) {
     {
-      absl::MutexLock lock(&session_state_lock_);
+      absl::MutexLock lock(session_state_lock_);
       if (session_state_ == SessionState::kInitialized) {
         return errors::Unavailable("Session not created yet.");
       }
@@ -401,7 +401,7 @@ class TfrtSession : public tensorflow::Session {
   // NOTE: This API is still experimental and may change.
   absl::Status MakeCallable(const CallableOptions& callable_options,
                             CallableHandle* out_handle) override {
-    absl::MutexLock lock(&callables_lock_);
+    absl::MutexLock lock(callables_lock_);
     *out_handle = next_callable_handle_++;
     assert(callables_.find(*out_handle) == callables_.end());
     callables_[*out_handle] = {callable_options};
@@ -436,7 +436,7 @@ class TfrtSession : public tensorflow::Session {
       const thread::ThreadPoolOptions& thread_pool_options) override {
     Callable callable;
     {
-      absl::MutexLock lock(&callables_lock_);
+      absl::MutexLock lock(callables_lock_);
       auto it = callables_.find(handle);
       if (it == callables_.end())
         return errors::InvalidArgument("No such callable handle: ", handle);
@@ -466,7 +466,7 @@ class TfrtSession : public tensorflow::Session {
   /// session.
   /// NOTE: This API is still experimental and may change.
   absl::Status ReleaseCallable(CallableHandle handle) override {
-    absl::MutexLock lock(&callables_lock_);
+    absl::MutexLock lock(callables_lock_);
     auto it = callables_.find(handle);
     if (it == callables_.end())
       return errors::InvalidArgument("No such callable handle: ", handle);
@@ -475,7 +475,7 @@ class TfrtSession : public tensorflow::Session {
   }
 
   absl::Status Close() override {
-    absl::MutexLock lock(&session_state_lock_);
+    absl::MutexLock lock(session_state_lock_);
     session_state_ = SessionState::kClosed;
     return absl::OkStatus();
   }
@@ -721,7 +721,7 @@ class TfrtSessionFactory::ThreadPoolManager {
           "TFRT session does not yet support session local thread pool");
     }
 
-    absl::MutexLock lock(&mutex_);
+    absl::MutexLock lock(mutex_);
 
     auto it = named_thread_pools_.find(name);
     // The thread pool with the given name already exists.
@@ -842,7 +842,7 @@ absl::Status TfrtSessionFactory::NewSession(const SessionOptions& options,
 
   *out_session = nullptr;
 
-  absl::MutexLock lock(&mutex_);
+  absl::MutexLock lock(mutex_);
   std::vector<std::unique_ptr<Device>> devices;
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
@@ -873,13 +873,13 @@ static TfrtSessionFactory* session_factory = nullptr;
 
 tfrt_stub::Runtime* TfrtSessionFactory::GetRuntime() {
   DCHECK(session_factory != nullptr);
-  absl::MutexLock lock(&session_factory->mutex_);
+  absl::MutexLock lock(session_factory->mutex_);
   return session_factory->runtime_;
 }
 
 absl::Status InitializeTfrtSession(const TfrtSessionOptions& options) {
   DCHECK(session_factory != nullptr);
-  absl::MutexLock lock(&session_factory->mutex_);
+  absl::MutexLock lock(session_factory->mutex_);
   DCHECK(!session_factory->IsInitialized());
   return UpdateTfrtSessionOptionsLocked(options);
 }

From 5a8f8d649101fc9d48b91d6a88ba5fc611be4d5e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 20:56:14 -0800
Subject: [PATCH 565/753] Automated Code Change

PiperOrigin-RevId: 846534754
---
 tensorflow/core/tfrt/fallback/fallback_state.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/tfrt/fallback/fallback_state.cc b/tensorflow/core/tfrt/fallback/fallback_state.cc
index c7f12aed50daa3..c500f862e1e706 100644
--- a/tensorflow/core/tfrt/fallback/fallback_state.cc
+++ b/tensorflow/core/tfrt/fallback/fallback_state.cc
@@ -51,8 +51,9 @@ namespace tfrt_stub {
 
 namespace {
 
-string DeviceName(absl::string_view name_prefix, absl::string_view device_type,
-                  int32_t task_id, size_t device_id) {
+std::string DeviceName(absl::string_view name_prefix,
+                       absl::string_view device_type, int32_t task_id,
+                       size_t device_id) {
   return strings::StrCat(absl::StripSuffix(name_prefix, "0"), task_id,
                          "/device:", device_type, ":", device_id);
 }

From f1953049fbd9e320013bd272378f0339fedf6795 Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Thu, 18 Dec 2025 23:00:15 -0800
Subject: [PATCH 566/753] Fix struct size check in
 PJRT_Executable_GetCompiledMemoryStats.

PiperOrigin-RevId: 846575527
---
 third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 23c788b97ad7bd..8b117e432673b9 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -2071,8 +2071,9 @@ PJRT_Error* PJRT_Executable_Serialize(PJRT_Executable_Serialize_Args* args) {
 PJRT_Error* PJRT_Executable_GetCompiledMemoryStats(
     PJRT_Executable_GetCompiledMemoryStats_Args* args) {
   PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
-      "PJRT_Executable_Serialize_Args",
-      PJRT_Executable_Serialize_Args_STRUCT_SIZE, args->struct_size));
+      "PJRT_Executable_GetCompiledMemoryStats_Args",
+      PJRT_Executable_GetCompiledMemoryStats_Args_STRUCT_SIZE,
+      args->struct_size));
   PJRT_ASSIGN_OR_RETURN(auto memory_stats,
                         args->executable->executable->GetCompiledMemoryStats());
   args->generated_code_size_in_bytes =

From c04f83c07d41829b087690729e637bcc24a2c9dd Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Thu, 18 Dec 2025 23:07:29 -0800
Subject: [PATCH 567/753] Update symbols in nccl stub for NCCL 2.28.9.

Follow-up after https://github.com/openxla/xla/pull/35463.

PiperOrigin-RevId: 846577791
---
 third_party/xla/xla/tsl/cuda/nccl.symbols | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/tsl/cuda/nccl.symbols b/third_party/xla/xla/tsl/cuda/nccl.symbols
index 48524f43e189e7..43f799578f82ef 100644
--- a/third_party/xla/xla/tsl/cuda/nccl.symbols
+++ b/third_party/xla/xla/tsl/cuda/nccl.symbols
@@ -1,5 +1,6 @@
 ncclAllGather
 ncclAllReduce
+ncclAlltoAll
 ncclBcast
 ncclBroadcast
 ncclCommAbort

From e8508f09ebe97c09695df2ecd5b172d14e68f99c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 18 Dec 2025 23:14:10 -0800
Subject: [PATCH 568/753] Automated Code Change

PiperOrigin-RevId: 846579789
---
 tensorflow/python/util/BUILD              | 2 ++
 tensorflow/python/util/kernel_registry.cc | 2 ++
 tensorflow/python/util/nest.cc            | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/tensorflow/python/util/BUILD b/tensorflow/python/util/BUILD
index 5875bf0e16668d..0851f88e67434a 100644
--- a/tensorflow/python/util/BUILD
+++ b/tensorflow/python/util/BUILD
@@ -133,6 +133,8 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/platform:stringpiece",
         "//tensorflow/python/lib/core:safe_pyobject_ptr",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
         "@local_xla//third_party/python_runtime:headers",
     ],
     alwayslink = 1,
diff --git a/tensorflow/python/util/kernel_registry.cc b/tensorflow/python/util/kernel_registry.cc
index 6a78c6668d9643..8d6a68dd7397f6 100644
--- a/tensorflow/python/util/kernel_registry.cc
+++ b/tensorflow/python/util/kernel_registry.cc
@@ -14,6 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/python/util/kernel_registry.h"
 
+#include <string>
+
 #include "absl/log/log.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
diff --git a/tensorflow/python/util/nest.cc b/tensorflow/python/util/nest.cc
index d7df8c42dde196..467359cbb9cf5e 100644
--- a/tensorflow/python/util/nest.cc
+++ b/tensorflow/python/util/nest.cc
@@ -19,6 +19,8 @@ limitations under the License.
 #include <cstddef>
 #include <string>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/stringpiece.h"
 #include "tensorflow/python/lib/core/safe_pyobject_ptr.h"

From 5451b22b40d7ef3c9881040edc6647df05d11bf0 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Thu, 18 Dec 2025 23:43:46 -0800
Subject: [PATCH 569/753] Remove unused dependencies and conditional
 compilation from XLA GPU emitter transforms.

This change prunes unnecessary dependencies from the `passes` target in //third_party/tensorflow/compiler/xla/backends/gpu/codegen/emitters/transforms/BUILD. It also removes conditional compilation flags and dependencies related to CUDA and ROCM, as they are not required for the current functionality. Additionally, it updates convert_float_amd.cc to use proto2::TextFormat directly.

PiperOrigin-RevId: 846589876
---
 .../gpu/codegen/emitters/transforms/BUILD     | 27 +------------------
 .../emitters/transforms/convert_float_amd.cc  |  6 ++---
 2 files changed, 4 insertions(+), 29 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/BUILD b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/BUILD
index b96ff357233680..2446234eacb089 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/BUILD
@@ -1,10 +1,5 @@
 load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library")
-load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
 load("@rules_cc//cc:cc_library.bzl", "cc_library")
-load(
-    "//xla/tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -43,37 +38,22 @@ cc_library(
         "recover_exp2.cc",
     ],
     hdrs = ["passes.h"],
-    copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]),
     deps = [
         ":passes_inc_gen",
-        "//xla:shape_util",
-        "//xla:util",
         "//xla/backends/gpu/codegen/emitters/ir:xla_gpu",
         "//xla/codegen/emitters/ir:xla",
-        "//xla/codegen/emitters/ir:xla_dialect_inc_gen",
-        "//xla/codegen/emitters/transforms:atomic_rmw_utils",
         "//xla/hlo/analysis:indexing_analysis",
-        "//xla/mlir_hlo",
-        "//xla/mlir_hlo:map_mhlo_to_scalar_op",
         "//xla/service/gpu:gpu_fusible",
-        "//xla/service/gpu:ir_emission_utils",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:device_description_proto_cc",
-        "//xla/stream_executor:semantic_version",
         "//xla/stream_executor/rocm:rocm_compute_capability",
-        "//xla/tsl/platform:status",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_protobuf//:protobuf",
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:AffineDialect",
-        "@llvm-project//mlir:AffineUtils",
         "@llvm-project//mlir:ArithDialect",
-        "@llvm-project//mlir:CallOpInterfaces",
-        "@llvm-project//mlir:ComplexDialect",
         "@llvm-project//mlir:DataLayoutInterfaces",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:GPUDialect",
@@ -88,10 +68,5 @@ cc_library(
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:TransformUtils",
         "@llvm-project//mlir:VectorDialect",
-        "@local_tsl//tsl/platform:protobuf",
-    ] + if_cuda_is_configured([
-        "//xla/service/gpu/llvm_gpu_backend:nvptx_backend",
-    ]) + if_rocm_is_configured([
-        "//xla/service/gpu/llvm_gpu_backend:amdgpu_backend",
-    ]),
+    ],
 )
diff --git a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
index da448aa7ae0b76..1e7d6464ad1d0f 100644
--- a/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/emitters/transforms/convert_float_amd.cc
@@ -43,11 +43,11 @@ limitations under the License.
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "google/protobuf/text_format.h"
 #include "xla/backends/gpu/codegen/emitters/transforms/passes.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/stream_executor/rocm/rocm_compute_capability.h"
-#include "tsl/platform/protobuf.h"
 
 namespace xla {
 namespace gpu {
@@ -561,8 +561,8 @@ class ConvertFloatAMDPass
   void runOnOperation() override {
     if (!gpu_device_info_.empty()) {
       se::GpuDeviceInfoProto device_info;
-      CHECK(tsl::protobuf::TextFormat::ParseFromString(gpu_device_info_,
-                                                       &device_info));
+      CHECK(
+          google::protobuf::TextFormat::ParseFromString(gpu_device_info_, &device_info));
       absl::StatusOr<se::DeviceDescription> device_description =
           se::DeviceDescription::FromProto(device_info);
       CHECK_OK(device_description.status());

From be8b991bd8b07ab6bd7456247d897ddc201cc8e0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 00:01:55 -0800
Subject: [PATCH 570/753] Automated Code Change

PiperOrigin-RevId: 846594715
---
 third_party/xla/xla/backends/gpu/runtime/BUILD               | 5 +++++
 third_party/xla/xla/backends/gpu/runtime/norm_thunk.cc       | 1 +
 third_party/xla/xla/backends/gpu/runtime/norm_thunk.h        | 1 +
 .../xla/xla/backends/gpu/runtime/nvshmem_all_reduce_thunk.h  | 1 +
 .../backends/gpu/runtime/nvshmem_collective_permute_thunk.h  | 1 +
 .../xla/xla/backends/gpu/runtime/nvshmem_recv_thunk.cc       | 1 +
 .../xla/xla/backends/gpu/runtime/nvshmem_send_thunk.cc       | 1 +
 third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.cc    | 2 ++
 third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.h     | 1 +
 third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.h  | 1 +
 third_party/xla/xla/backends/gpu/runtime/thunk.cc            | 1 -
 11 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 78274594b54b8c..de324ce85f41f5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -2014,6 +2014,7 @@ cc_library(
     hdrs = ["norm_thunk.h"],
     deps = [
         ":thunk",
+        ":thunk_proto_cc",
         "//xla:util",
         "//xla:xla_data_proto_cc",
         "//xla/runtime:buffer_use",
@@ -2056,7 +2057,9 @@ cc_library(
     hdrs = ["outfeed_thunk.h"],
     deps = [
         ":shaped_slice",
+        ":shaped_slice_proto_cc",
         ":thunk",
+        ":thunk_proto_cc",
         "//xla:shape_tree",
         "//xla:shape_util",
         "//xla:status_macros",
@@ -3060,6 +3063,7 @@ cc_library(
         ":p2p_thunk_common",
         ":thunk",
         "//xla:status_macros",
+        "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
@@ -3091,6 +3095,7 @@ cc_library(
         ":p2p_thunk_common",
         ":thunk",
         "//xla:status_macros",
+        "//xla:xla_data_proto_cc",
         "//xla/backends/gpu/collectives:gpu_collectives",
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
diff --git a/third_party/xla/xla/backends/gpu/runtime/norm_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/norm_thunk.cc
index 153d4f6e9e7048..0387a6e36b0f1e 100644
--- a/third_party/xla/xla/backends/gpu/runtime/norm_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/norm_thunk.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_norm_runner.h"
diff --git a/third_party/xla/xla/backends/gpu/runtime/norm_thunk.h b/third_party/xla/xla/backends/gpu/runtime/norm_thunk.h
index a0b392033f7fae..d92f23098bdb8b 100644
--- a/third_party/xla/xla/backends/gpu/runtime/norm_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/norm_thunk.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/gpu_norm_runner.h"
 #include "xla/stream_executor/stream.h"
diff --git a/third_party/xla/xla/backends/gpu/runtime/nvshmem_all_reduce_thunk.h b/third_party/xla/xla/backends/gpu/runtime/nvshmem_all_reduce_thunk.h
index d8ad220cbe0cec..a6c74ab97a05f4 100644
--- a/third_party/xla/xla/backends/gpu/runtime/nvshmem_all_reduce_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/nvshmem_all_reduce_thunk.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/service/collective_ops_utils.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/runtime/nvshmem_collective_permute_thunk.h b/third_party/xla/xla/backends/gpu/runtime/nvshmem_collective_permute_thunk.h
index 17c945f85e3149..2167e9c5bd1b5e 100644
--- a/third_party/xla/xla/backends/gpu/runtime/nvshmem_collective_permute_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/nvshmem_collective_permute_thunk.h
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/p2p_thunk_common.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/stream_executor/stream.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/runtime/nvshmem_recv_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/nvshmem_recv_thunk.cc
index e310a931c8790b..dba1927c63560e 100644
--- a/third_party/xla/xla/backends/gpu/runtime/nvshmem_recv_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/nvshmem_recv_thunk.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/runtime/nvshmem_send_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/nvshmem_send_thunk.cc
index 7c30a373e730c1..ad76d81acace6d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/nvshmem_send_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/nvshmem_send_thunk.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/stream_executor/stream.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.cc
index bb9259d19a7cf6..e03a176051e11f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.cc
@@ -23,7 +23,9 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.pb.h"
 #include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/gpu_transfer_manager.h"
diff --git a/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.h b/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.h
index de0904648b7a7b..2a07b027f4207b 100644
--- a/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/outfeed_thunk.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.h b/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.h
index a3aadbacfc7bd0..f8875d3037fc5b 100644
--- a/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.h
+++ b/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/xla_data.pb.h"
 
 namespace xla {
 namespace gpu {
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.cc b/third_party/xla/xla/backends/gpu/runtime/thunk.cc
index 5e174cf83177b0..623f4fd376c49d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <string>
 #include <utility>
 
-#include "absl/base/nullability.h"
 #include "absl/container/flat_hash_map.h"
 #include "absl/functional/function_ref.h"
 #include "absl/status/status.h"

From 45c0013db6d0b5c8dadca5ca1f9e80cb4b0b60fa Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Fri, 19 Dec 2025 00:22:22 -0800
Subject: [PATCH 571/753] Add GetCompilationOptions to PjRt C API

PiperOrigin-RevId: 846601597
---
 third_party/xla/xla/pjrt/c/CHANGELOG.md       |  4 +++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 30 +++++++++++++++++--
 third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc |  6 ++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 24 +++++++++++++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h  |  6 ++++
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 26 ++++++++++++++++
 .../xla/pjrt/c_api_client/pjrt_c_api_client.h |  6 ++++
 .../c_api_client/pjrt_c_api_client_test.cc    | 23 ++++++++++++++
 8 files changed, 123 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/CHANGELOG.md b/third_party/xla/xla/pjrt/c/CHANGELOG.md
index 1c142fa18afc15..c1c7c8cce99139 100644
--- a/third_party/xla/xla/pjrt/c/CHANGELOG.md
+++ b/third_party/xla/xla/pjrt/c/CHANGELOG.md
@@ -1,5 +1,9 @@
 # PJRT C API changelog
 
+## 0.87
+
+* Add `PJRT_Executable_GetCompileOptions`.
+
 ## 0.86
 
 * Add `PJRT_Device_CreateAsyncTrackingEvent`.
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index ae8caf9d724246..a89d6a4c93bc8c 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -104,7 +104,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 86
+#define PJRT_API_MINOR 87
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -2031,6 +2031,8 @@ typedef PJRT_Error* PJRT_Executable_OutputMemoryKinds(
 
 typedef struct PJRT_SerializedExecutable PJRT_SerializedExecutable;
 
+typedef struct PJRT_SerializedCompileOptions PJRT_SerializedCompileOptions;
+
 struct PJRT_Executable_Serialize_Args {
   size_t struct_size;
   PJRT_Extension_Base* extension_start;
@@ -2054,6 +2056,29 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Executable_Serialize_Args,
 typedef PJRT_Error* PJRT_Executable_Serialize(
     PJRT_Executable_Serialize_Args* args);
 
+struct PJRT_Executable_GetCompileOptions_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Executable* executable;
+
+  // Lives only as long as serialized_compile_options
+  const char* serialized_bytes;  // out
+  size_t serialized_bytes_size;  // out
+
+  PJRT_SerializedCompileOptions*
+      serialized_compile_options;  // backs serialized_bytes.
+  // cleanup fn must be called to free the backing memory for serialized_bytes.
+  // Should only be called once on serialized_compile_options.
+  void (*serialized_compile_options_deleter)(
+      PJRT_SerializedCompileOptions* options);  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Executable_GetCompileOptions_Args,
+                          serialized_compile_options_deleter);
+
+// Returns the CompileOptions that were used to compile this executable.
+typedef PJRT_Error* PJRT_Executable_GetCompileOptions(
+    PJRT_Executable_GetCompileOptions_Args* args);
+
 struct PJRT_Executable_DeserializeAndLoad_Args {
   size_t struct_size;
   PJRT_Extension_Base* extension_start;
@@ -2826,11 +2851,12 @@ typedef struct PJRT_Api {
   _PJRT_API_STRUCT_FIELD(PJRT_Device_PoisonExecution);
   _PJRT_API_STRUCT_FIELD(PJRT_Device_CreateAsyncTrackingEvent);
   _PJRT_API_STRUCT_FIELD(PJRT_AsyncTrackingEvent_Destroy);
+  _PJRT_API_STRUCT_FIELD(PJRT_Executable_GetCompileOptions);
 } PJRT_Api;
 
 enum {
   PJRT_Api_STRUCT_SIZE =
-      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_AsyncTrackingEvent_Destroy)
+      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Executable_GetCompileOptions)
 };
 
 #undef _PJRT_API_STRUCT_FIELD
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
index 19e9c0678a0160..246e746bf0ddf6 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
@@ -960,6 +960,9 @@ FieldOffsetsAndSizesForVersion(int major_version, int minor_version) {
       add_field("PJRT_Device_CreateAsyncTrackingEvent", kFnPtrSize);
       add_field("PJRT_AsyncTrackingEvent_Destroy", kFnPtrSize);
     }
+    if (minor_version >= 87) {
+      add_field("PJRT_Executable_GetCompileOptions", kFnPtrSize);
+    }
     return version_offsets_and_sizes;
   }
   LOG(FATAL) << "Unsupported API version: " << major_version << "."
@@ -1371,6 +1374,9 @@ TEST_F(PjrtCAbiTestBase, FieldOffsetsAndSizes) {
           {"PJRT_AsyncTrackingEvent_Destroy",
            {offsetof(PJRT_Api, PJRT_AsyncTrackingEvent_Destroy),
             sizeof(PJRT_Api::PJRT_AsyncTrackingEvent_Destroy)}},
+          {"PJRT_Executable_GetCompileOptions",
+           {offsetof(PJRT_Api, PJRT_Executable_GetCompileOptions),
+            sizeof(PJRT_Api::PJRT_Executable_GetCompileOptions)}},
       };
   ASSERT_EQ(api_->pjrt_api_version.major_version, PJRT_API_MAJOR);
   ASSERT_EQ(api_->pjrt_api_version.minor_version, PJRT_API_MINOR);
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 8b117e432673b9..7c95b2c80e7d1e 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -2068,6 +2068,28 @@ PJRT_Error* PJRT_Executable_Serialize(PJRT_Executable_Serialize_Args* args) {
   return nullptr;
 }
 
+PJRT_Error* PJRT_Executable_GetCompileOptions(
+    PJRT_Executable_GetCompileOptions_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Executable_GetCompileOptions_Args",
+      PJRT_Executable_GetCompileOptions_Args_STRUCT_SIZE, args->struct_size));
+  PJRT_ASSIGN_OR_RETURN(xla::CompileOptions options,
+                        args->executable->executable->GetCompileOptions());
+  PJRT_ASSIGN_OR_RETURN(xla::CompileOptionsProto options_proto,
+                        options.ToProto());
+  std::string serialized = options_proto.SerializeAsString();
+
+  PJRT_SerializedCompileOptions* serialized_options =
+      new PJRT_SerializedCompileOptions;
+  serialized_options->serialized = std::move(serialized);
+  args->serialized_compile_options = serialized_options;
+  args->serialized_bytes = serialized_options->serialized.data();
+  args->serialized_bytes_size = serialized_options->serialized.size();
+  args->serialized_compile_options_deleter =
+      +[](PJRT_SerializedCompileOptions* options) { delete options; };
+  return nullptr;
+}
+
 PJRT_Error* PJRT_Executable_GetCompiledMemoryStats(
     PJRT_Executable_GetCompiledMemoryStats_Args* args) {
   PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
@@ -3276,6 +3298,8 @@ PJRT_Api CreatePjrtApi(PJRT_Client_Create* create_fn,
       pjrt::PJRT_Device_CreateAsyncTrackingEvent,
       /*PJRT_AsyncTrackingEvent_Destroy=*/
       pjrt::PJRT_AsyncTrackingEvent_Destroy,
+      /*PJRT_Executable_GetCompileOptions=*/
+      pjrt::PJRT_Executable_GetCompileOptions,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
index e5381380430651..a0da7731d17837 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
@@ -215,6 +215,10 @@ struct PJRT_SerializedExecutable {
   std::string serialized;
 };
 
+struct PJRT_SerializedCompileOptions {
+  std::string serialized;
+};
+
 struct PJRT_DeviceAssignmentSerialized {
   std::string serialized;
 };
@@ -378,6 +382,8 @@ PJRT_Error* PJRT_Executable_OutputMemoryKinds(
 PJRT_Error* PJRT_Executable_OptimizedProgram(
     PJRT_Executable_OptimizedProgram_Args* args);
 PJRT_Error* PJRT_Executable_Serialize(PJRT_Executable_Serialize_Args* args);
+PJRT_Error* PJRT_Executable_GetCompileOptions(
+    PJRT_Executable_GetCompileOptions_Args* args);
 PJRT_Error* PJRT_Executable_GetCompiledMemoryStats(
     PJRT_Executable_GetCompiledMemoryStats_Args* args);
 
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 4d519828fbf143..1e4823dc2f80b8 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -2210,6 +2210,32 @@ absl::StatusOr<std::string> PjRtCApiExecutable::FingerprintExecutable() const {
                      args.executable_fingerprint_size);
 }
 
+absl::StatusOr<CompileOptions> PjRtCApiExecutable::GetCompileOptions() const {
+  if (c_api_->pjrt_api_version.major_version == 0 &&
+      c_api_->pjrt_api_version.minor_version < 87) {
+    return absl::UnimplementedError(
+        "PJRT_Executable_GetCompileOptions not implemented in this PJRT "
+        "plugin.");
+  }
+  PJRT_Executable_GetCompileOptions_Args args;
+  args.struct_size = PJRT_Executable_GetCompileOptions_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.executable = c_executable();
+  RETURN_STATUS_IF_PJRT_ERROR(c_api_->PJRT_Executable_GetCompileOptions(&args),
+                              c_api_);
+  absl::Cleanup cleanup = [&args] {
+    args.serialized_compile_options_deleter(args.serialized_compile_options);
+  };
+  CompileOptionsProto proto;
+  if (!proto.ParseFromString(
+          std::string(args.serialized_bytes, args.serialized_bytes_size))) {
+    return absl::InternalError(
+        "PjRtCApiExecutable::GetCompileOptions: Failed to parse "
+        "CompileOptionsProto");
+  }
+  return CompileOptions::FromProto(proto);
+}
+
 // ------------------------ Loaded Executables ---------------------------------
 
 PjRtCApiLoadedExecutable::PjRtCApiLoadedExecutable(
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
index 30e99cd55d74a7..84d71e9740ec7f 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
@@ -680,6 +680,8 @@ class PjRtCApiExecutable : public PjRtExecutable {
   // TODO(b/438000615): Move this to PjRtLoadedExecutable.
   absl::StatusOr<std::string> GetSerializedExecutableMetadata() const;
 
+  absl::StatusOr<CompileOptions> GetCompileOptions() const override;
+
  private:
   const PJRT_Api* c_api_;
   std::unique_ptr<PJRT_Executable, ::pjrt::PJRT_ExecutableDeleter> executable_;
@@ -755,6 +757,10 @@ class PjRtCApiLoadedExecutable : public PjRtLoadedExecutable {
     return executable_->GetOutputMemoryKinds();
   }
 
+  absl::StatusOr<CompileOptions> GetCompileOptions() const override {
+    return executable_->GetCompileOptions();
+  }
+
   absl::StatusOr<std::vector<std::vector<std::unique_ptr<PjRtBuffer>>>> Execute(
       absl::Span<const std::vector<PjRtBuffer*>> argument_handles,
       const ExecuteOptions& options,
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
index caf4bdc409b6f9..f45151debe82db 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client_test.cc
@@ -338,6 +338,29 @@ TEST(PjRtCApiClientTest, NonEmptyExecutableFingerprint) {
   }
 }
 
+TEST(PjRtCApiClientTest, GetCompileOptions) {
+  SetUpCpuPjRtApi();
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,
+                       GetCApiClient("cpu"));
+  Shape shape = ShapeUtil::MakeShapeWithType<float>({4});
+  XlaBuilder builder("sum");
+  auto inp_0 = Parameter(&builder, 0, shape, "input0");
+  auto inp_1 = Parameter(&builder, 1, shape, "input1");
+  auto sum = Add(inp_0, inp_1);
+  builder.SetUpAlias({}, 0, {});
+  auto computation = builder.Build(sum).value();
+
+  CompileOptions options;
+  options.compile_portable_executable = !options.compile_portable_executable;
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtLoadedExecutable> executable,
+                       client->CompileAndLoad(computation, options));
+
+  ASSERT_OK_AND_ASSIGN(CompileOptions retrieved_options,
+                       executable->GetCompileOptions());
+  EXPECT_EQ(retrieved_options.compile_portable_executable,
+            options.compile_portable_executable);
+}
+
 TEST(PjRtCApiClientTest, CreateBuffersForAsyncHostToDeviceWithShape) {
   SetUpCpuPjRtApi();
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<PjRtClient> client,

From af68cfc4bdf43b60d30e15bdb930c8179a3127d1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 00:43:02 -0800
Subject: [PATCH 572/753] Automated Code Change

PiperOrigin-RevId: 846608754
---
 .../compiler/mlir/tfr/integration/tfr_decompose_ctx.cc     | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc
index e2f1bdbfb0a0de..2672a90f93cdd3 100644
--- a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc
+++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc
@@ -81,8 +81,9 @@ absl::StatusOr<std::unique_ptr<TFRDecomposeContext>> TFRDecomposeContext::Get(
   std::string tfr_lib_dir;
   TF_RETURN_IF_ERROR(ReadStringFromEnvVar(
       kTFRLibEnv, "tensorflow/compiler/mlir/tfr/resources", &tfr_lib_dir));
-  string composite_mlir_dir = io::JoinPath(env->GetRunfilesDir(), tfr_lib_dir);
-  std::vector<string> files;
+  std::string composite_mlir_dir =
+      io::JoinPath(env->GetRunfilesDir(), tfr_lib_dir);
+  std::vector<std::string> files;
   TF_RETURN_IF_ERROR(env->GetChildren(composite_mlir_dir, &files));
   if (files.empty()) {
     return errors::Internal(absl::StrCat(
@@ -90,7 +91,7 @@ absl::StatusOr<std::unique_ptr<TFRDecomposeContext>> TFRDecomposeContext::Get(
   }
   std::string tfr_raw_text;
   for (const auto& file : files) {
-    string fullpath = io::JoinPath(composite_mlir_dir, file);
+    std::string fullpath = io::JoinPath(composite_mlir_dir, file);
     if (env->MatchPath(fullpath, io::JoinPath(composite_mlir_dir, "*.mlir"))) {
       std::string text;
       TF_RETURN_IF_ERROR(ReadFileToString(env, fullpath, &text));

From 1299abfd8f0aa9e64cb64d54d75685ac3de9a5d9 Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Fri, 19 Dec 2025 00:53:56 -0800
Subject: [PATCH 573/753] [XLA:GPU] Add
 xla_gpu_default_to_alg_dot_bf16_bf16_f32 flag to XLA.

This flag allows using the dot precision algorithm ALG_DOT_BF16_BF16_F32 for f32 dot ops by default on GPU, which can improve performance at the expense of numerical accuracy.

This change just adds the flag, the implementation will follow.

PiperOrigin-RevId: 846612133
---
 third_party/xla/xla/debug_options_flags.cc |  8 ++++++++
 third_party/xla/xla/xla.proto              | 10 +++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index a8cbbe96789a02..5ad2bb235fd49c 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -370,6 +370,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_experimental_enable_fusion_block_level_rewriter(false);
 
   opts.set_xla_gpu_enable_llvm_module_compilation_parallelism(false);
+  opts.set_xla_gpu_default_to_alg_dot_bf16_bf16_f32(false);
   opts.set_xla_gpu_enable_libnvptxcompiler(
       stream_executor::IsLibNvPtxCompilerSupported());
   opts.set_xla_gpu_libnvjitlink_mode(DebugOptions::LIB_NV_JIT_LINK_MODE_AUTO);
@@ -1533,6 +1534,13 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
       "--xla_gpu_force_compilation_parallelism flag and the thread pool "
       "supplied to GpuCompiler."));
 
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_default_to_alg_dot_bf16_bf16_f32",
+      bool_setter_for(
+          &DebugOptions::set_xla_gpu_default_to_alg_dot_bf16_bf16_f32),
+      debug_options->xla_gpu_default_to_alg_dot_bf16_bf16_f32(),
+      "Use the dot precision algorithm `ALG_DOT_BF16_BF16_F32 by default for "
+      "f32 dots."));
   flag_list->push_back(
       tsl::Flag("xla_gpu_deterministic_ops",
                 bool_setter_for(&DebugOptions::set_xla_gpu_deterministic_ops),
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index 6575b4803029e2..e0d8356b32869b 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -457,6 +457,14 @@ message DebugOptions {
   // but potentially higher the performance.
   optional int32 xla_gpu_cudnn_gemm_max_plans = 318;
 
+  // Allows using the dot precision algorithm `ALG_DOT_BF16_BF16_F32 for f32 dot
+  // ops by default. This is expected to improve performance at the expense of
+  // numerical accuracy.
+  //
+  // At this point, XLA may still choose a higher precision dot algorithm, but
+  // we expect this to change at a later point.
+  optional bool xla_gpu_default_to_alg_dot_bf16_bf16_f32 = 441;
+
   // Guarantees run-to-run determinism.
   // This flag implies --xla_gpu_exclude_nondeterministic_ops and in addition
   // disables autotuning.
@@ -1333,7 +1341,7 @@ message DebugOptions {
   // Note: when adding a new flag, please add it to one of the hardware-specific
   // or hardware-agnostic sections at the top of this proto message.
 
-  // Next id: 441
+  // Next id: 442
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From 3d23902120ec594464feddd8ddbc2561b530e542 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 01:04:04 -0800
Subject: [PATCH 574/753] compat: Update forward compatibility horizon to
 2025-12-19

PiperOrigin-RevId: 846615186
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index a84c2af8863d2c..a7fa8eab2a0c08 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 18)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 19)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 35dc0c4552a96f79deb762297aee0392c8adb16c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 01:04:11 -0800
Subject: [PATCH 575/753] Update GraphDef version to 2446.

PiperOrigin-RevId: 846615217
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index d7d8fd441463ee..198b5a6e175699 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2445  // Updated: 2025/12/18
+#define TF_GRAPH_DEF_VERSION 2446  // Updated: 2025/12/19
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 458fe6aa5a38bbd47e76c5e7328a2bdbfa009f1e Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Fri, 19 Dec 2025 01:11:22 -0800
Subject: [PATCH 576/753] Refactor: Pass GpuComputeCapability to
 GetBlasComputationType

This change modifies `gpu::GetBlasComputationType` to accept a `GpuComputeCapability`. This allows the function to make platform-specific decisions, such as enabling TF32, without relying on preprocessor macros like `GOOGLE_CUDA`. Call sites are updated to pass the appropriate compute capability.

PiperOrigin-RevId: 846617600
---
 .../xla/xla/backends/gpu/autotuner/cublas.cc  |  3 ++-
 third_party/xla/xla/service/gpu/BUILD         |  4 ++--
 .../xla/xla/service/gpu/matmul_utils.cc       | 19 +++++++++++--------
 .../xla/xla/service/gpu/matmul_utils.h        |  4 +++-
 .../service/gpu/transforms/gemm_rewriter.cc   | 10 +++++-----
 .../xla/stream_executor/cuda/cuda_blas_lt.cc  | 10 ++++++----
 third_party/xla/xla/stream_executor/gpu/BUILD |  7 ++++---
 .../xla/stream_executor/gpu/gpu_blas_lt.cc    | 10 ++++------
 .../xla/xla/stream_executor/gpu/gpu_blas_lt.h |  4 +++-
 .../xla/stream_executor/rocm/hip_blas_lt.cc   | 12 ++++++------
 10 files changed, 46 insertions(+), 37 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/cublas.cc b/third_party/xla/xla/backends/gpu/autotuner/cublas.cc
index 4f80eff3317db6..751c5c75502d58 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/cublas.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/cublas.cc
@@ -104,7 +104,8 @@ CublasBackend::GetSupportedConfigs(const HloInstruction& instr) {
       out_desc.compute_type,
       se::gpu::GetBlasComputationType(
           gemm_config.precision_algorithm, gemm_config.lhs_layout.dtype,
-          gemm_config.output_layout.dtype, gemm_config.compute_precision));
+          gemm_config.output_layout.dtype, gemm_config.compute_precision,
+          target_config().device_description.gpu_compute_capability()));
 
   se::blas::BlasSupport* blas = stream_executor()->AsBlas();
   if (blas == nullptr) {
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 5125bbc4b1fb92..a95d9dd9862b8a 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1121,6 +1121,8 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
         "//xla/stream_executor/gpu:gpu_blas_lt",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
@@ -1128,8 +1130,6 @@ cc_library(
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/matmul_utils.cc b/third_party/xla/xla/service/gpu/matmul_utils.cc
index 6c9dea647fcf37..464df1087303f3 100644
--- a/third_party/xla/xla/service/gpu/matmul_utils.cc
+++ b/third_party/xla/xla/service/gpu/matmul_utils.cc
@@ -31,7 +31,6 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/types/span.h"
 #include "xla/autotuning.pb.h"
-#include "xla/hlo/ir/hlo_casting_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
@@ -50,11 +49,11 @@ limitations under the License.
 #include "xla/stream_executor/gpu/gpu_blas_lt.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/types.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/statusor.h"
 
 namespace xla {
 namespace gpu {
@@ -479,7 +478,8 @@ bool IsTf32Allowed(PrecisionConfig::Algorithm algorithm,
 
 absl::StatusOr<GemmConfig::DescriptorsTuple> GemmConfig::GetMatrixDescriptors(
     se::DeviceAddressBase lhs_buf, se::DeviceAddressBase rhs_buf,
-    se::DeviceAddressBase out_buf) const {
+    se::DeviceAddressBase out_buf,
+    const se::GpuComputeCapability& gpu_version) const {
   auto create_matrix_desc = [](const se::gpu::MatrixLayout& layout,
                                se::DeviceAddressBase data)
       -> absl::StatusOr<se::gpu::MatrixDescriptor> {
@@ -512,7 +512,7 @@ absl::StatusOr<GemmConfig::DescriptorsTuple> GemmConfig::GetMatrixDescriptors(
   TF_ASSIGN_OR_RETURN(out_desc.compute_type,
                       se::gpu::GetBlasComputationType(
                           PrecisionConfig::ALG_UNSET, lhs.dtype, out.dtype,
-                          se::blas::kDefaultComputePrecision));
+                          se::blas::kDefaultComputePrecision, gpu_version));
 
   TF_ASSIGN_OR_RETURN(se::gpu::MatrixDescriptor lhs_desc,
                       create_matrix_desc(lhs, lhs_buf));
@@ -541,8 +541,9 @@ absl::Status DoGemmWithAlgorithm(const se::gpu::MatrixDescriptor& lhs,
   PrimitiveType output_type = primitive_util::NativeToPrimitiveType<Output>();
   TF_ASSIGN_OR_RETURN(
       se::blas::ComputationType computation_type,
-      se::gpu::GetBlasComputationType(precision_algorithm, lhs_type,
-                                      output_type, compute_precision));
+      se::gpu::GetBlasComputationType(
+          precision_algorithm, lhs_type, output_type, compute_precision,
+          stream->parent()->GetDeviceDescription().gpu_compute_capability()));
   se::DeviceAddress<Output> output_data(output.data);
 
   // Set a workspace for all Blas operations launched below.
@@ -626,7 +627,9 @@ absl::Status RunGemm(const GemmConfig& config, se::DeviceAddressBase lhs_buffer,
 
   TF_ASSIGN_OR_RETURN(
       GemmConfig::DescriptorsTuple desc,
-      config.GetMatrixDescriptors(lhs_buffer, rhs_buffer, output_buffer));
+      config.GetMatrixDescriptors(
+          lhs_buffer, rhs_buffer, output_buffer,
+          stream->parent()->GetDeviceDescription().gpu_compute_capability()));
 
   se::EngineOptions engine_options{
       deterministic_ops,
diff --git a/third_party/xla/xla/service/gpu/matmul_utils.h b/third_party/xla/xla/service/gpu/matmul_utils.h
index 8204e4e68c4f67..4c0a4de4dfdf9b 100644
--- a/third_party/xla/xla/service/gpu/matmul_utils.h
+++ b/third_party/xla/xla/service/gpu/matmul_utils.h
@@ -35,6 +35,7 @@ limitations under the License.
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/gpu_blas_lt.h"
+#include "xla/stream_executor/stream.h"
 #include "xla/xla_data.pb.h"
 
 namespace xla {
@@ -146,7 +147,8 @@ struct GemmConfig : public se::gpu::GemmConfig {
   };
   absl::StatusOr<DescriptorsTuple> GetMatrixDescriptors(
       se::DeviceAddressBase lhs_buf, se::DeviceAddressBase rhs_buf,
-      se::DeviceAddressBase out_buf) const;
+      se::DeviceAddressBase out_buf,
+      const se::GpuComputeCapability& gpu_version) const;
 };
 
 // Run the given GEMM instruction `gemm` subject to the configuration
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc
index 1481eebc29a044..e5ff188990e778 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter.cc
@@ -2095,7 +2095,7 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
         const se::blas::ComputationType compute_type,
         se::gpu::GetBlasComputationType(
             instr.precision_config().algorithm(), a_dtype, output_type,
-            stream_executor::blas::kDefaultComputePrecision));
+            stream_executor::blas::kDefaultComputePrecision, gpu_version_));
     se::blas::DataType scale_type =
         se::gpu::GetScaleType(output_dtype, compute_type);
 
@@ -2193,10 +2193,10 @@ class GemmRewriterVisitor : public DfsHloRewriteVisitor {
       return false;
     }
 
-    TF_ASSIGN_OR_RETURN(
-        const se::blas::ComputationType compute_type,
-        se::gpu::GetBlasComputationType(
-            algorithm, a_dtype, instr.shape().element_type(), max_precision));
+    TF_ASSIGN_OR_RETURN(const se::blas::ComputationType compute_type,
+                        se::gpu::GetBlasComputationType(
+                            algorithm, a_dtype, instr.shape().element_type(),
+                            max_precision, gpu_version_));
     se::blas::DataType scale_type =
         se::gpu::GetScaleType(output_dtype, compute_type);
 
diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_blas_lt.cc b/third_party/xla/xla/stream_executor/cuda/cuda_blas_lt.cc
index 831a7424404b93..581abc2ca0966e 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_blas_lt.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_blas_lt.cc
@@ -318,10 +318,12 @@ auto BlasLt::GetMatmulPlan(const gpu::GemmConfig& cfg,
 
   auto compute_type = cfg.compute_type;
   if (!compute_type) {  // obtain compute_type unless provided by the user
-    TF_ASSIGN_OR_RETURN(compute_type,
-                        gpu::GetBlasComputationType(
-                            cfg.precision_algorithm, lhs_layout.dtype,
-                            output_layout.dtype, cfg.compute_precision));
+    TF_ASSIGN_OR_RETURN(
+        compute_type,
+        gpu::GetBlasComputationType(
+            cfg.precision_algorithm, lhs_layout.dtype, output_layout.dtype,
+            cfg.compute_precision,
+            parent_->GetDeviceDescription().gpu_compute_capability()));
   }
 
   // FP8 matmuls have a fast accumulation mode that is less precise than the
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 17f895e12c143a..707e8fbced5cde 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -409,7 +409,6 @@ cc_library(
     name = "gpu_blas_lt",
     srcs = ["gpu_blas_lt.cc"],
     hdrs = ["gpu_blas_lt.h"],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
     deps = [
         ":gpu_blas_lt_proto_cc",
         "//xla:shape_util",
@@ -420,8 +419,11 @@ cc_library(
         "//xla/service:algorithm_util",
         "//xla/stream_executor:blas",
         "//xla/stream_executor:device_address",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:platform",
         "//xla/stream_executor:stream",
         "//xla/stream_executor:stream_executor_h",
+        "//xla/stream_executor/cuda:cuda_platform_id",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/protobuf:dnn_proto_cc",
         "@com_google_absl//absl/algorithm:container",
@@ -435,9 +437,8 @@ cc_library(
         "@com_google_absl//absl/synchronization",
         "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform:statusor",
-    ] + if_cuda_is_configured([
         "@local_tsl//tsl/platform:tensor_float_32_hdr_lib",
-    ]) + if_static([
+    ] + if_static([
         "@local_tsl//tsl/platform:tensor_float_32_utils",
     ]),
 )
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.cc b/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.cc
index 2df49a47abe288..e024a9ddcb87e0 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "xla/primitive_util.h"
 #include "xla/service/algorithm_util.h"
 #include "xla/stream_executor/blas.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/gpu_blas_lt.pb.h"
 #include "xla/stream_executor/stream.h"
 #include "xla/stream_executor/stream_executor.h"
@@ -37,9 +38,7 @@ limitations under the License.
 #include "xla/tsl/protobuf/dnn.pb.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
-#if GOOGLE_CUDA
 #include "tsl/platform/tensor_float_32_utils.h"
-#endif
 
 namespace stream_executor {
 
@@ -205,7 +204,8 @@ xla::GemmConfigProto::MatrixLayout MatrixLayout::ToProto() const {
 
 absl::StatusOr<ComputationType> GetBlasComputationType(
     xla::PrecisionConfig::Algorithm algorithm, xla::PrimitiveType lhs_dtype,
-    xla::PrimitiveType output_dtype, int64_t compute_precision) {
+    xla::PrimitiveType output_dtype, int64_t compute_precision,
+    const GpuComputeCapability& cc) {
   if (algorithm == xla::PrecisionConfig::ALG_UNSET) {
     switch (output_dtype) {
       case PrimitiveType::F8E5M2:      // fall-through
@@ -222,14 +222,12 @@ absl::StatusOr<ComputationType> GetBlasComputationType(
         return ComputationType::kF32;
       case PrimitiveType::F32:  // fall-through
       case PrimitiveType::C64:
-#if GOOGLE_CUDA
-        if (tsl::tensor_float_32_execution_enabled() &&
+        if (cc.IsCuda() && tsl::tensor_float_32_execution_enabled() &&
             compute_precision <= 1 && lhs_dtype == output_dtype) {
           // CublasLt requires compute type to be F32 for F8 matmul.
           // TF32 should only be chosen for FP32 or C64 gemm
           return ComputationType::kTF32AsF32;
         }
-#endif
         return ComputationType::kF32;
       case PrimitiveType::F64:  // fall-through
       case PrimitiveType::C128:
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h b/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h
index 8b6043b8bf8b64..963c28b3b9550d 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_blas_lt.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/synchronization/mutex.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_address.h"
+#include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/gpu/gpu_blas_lt.pb.h"
 #include "xla/types.h"
 #include "xla/xla_data.pb.h"
@@ -44,7 +45,8 @@ absl::StatusOr<xla::PrimitiveType> AsXlaPrimitiveType(blas::DataType dtype);
 
 absl::StatusOr<blas::ComputationType> GetBlasComputationType(
     xla::PrecisionConfig::Algorithm algorithm, xla::PrimitiveType lhs_dtype,
-    xla::PrimitiveType output_dtype, int64_t compute_precision);
+    xla::PrimitiveType output_dtype, int64_t compute_precision,
+    const GpuComputeCapability& cc);
 
 // Returns the type for the alpha and beta scalars.
 blas::DataType GetScaleType(blas::DataType c_type,
diff --git a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
index a304edaecf9b24..ba22feb58ac690 100644
--- a/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
+++ b/third_party/xla/xla/stream_executor/rocm/hip_blas_lt.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
-#include "Eigen/Core"
 #include "rocm/include/hip/library_types.h"
 #include "rocm/include/hipblas/hipblas.h"
 #include "rocm/include/hipblaslt/hipblaslt.h"
@@ -55,7 +54,6 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/types.h"
 #include "xla/util.h"
-#include "tsl/platform/ml_dtypes.h"
 
 #define SET_ATTR(setter, handle, attr, value) \
   ToStatus(setter(handle, attr, &value, sizeof(decltype(value))), #setter)
@@ -326,10 +324,12 @@ auto BlasLt::GetMatmulPlan(const gpu::GemmConfig& cfg, Epilogue epilogue) const
 
   auto compute_type = cfg.compute_type;
   if (!compute_type) {  // obtain compute_type unless provided by the user
-    TF_ASSIGN_OR_RETURN(compute_type,
-                        gpu::GetBlasComputationType(
-                            cfg.precision_algorithm, lhs_layout.dtype,
-                            output_layout.dtype, cfg.compute_precision));
+    TF_ASSIGN_OR_RETURN(
+        compute_type,
+        gpu::GetBlasComputationType(
+            cfg.precision_algorithm, lhs_layout.dtype, output_layout.dtype,
+            cfg.compute_precision,
+            parent_->GetDeviceDescription().gpu_compute_capability()));
   }
 
   if (lhs_layout.order == gpu::MatrixLayout::Order::kRowMajor) {

From b9853f94c32cd1a83c2ce7416ba50abdc2aeba96 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Fri, 19 Dec 2025 01:46:15 -0800
Subject: [PATCH 577/753] Add more logs to the CuDNN autotuner backend.

PiperOrigin-RevId: 846629554
---
 third_party/xla/xla/backends/gpu/autotuner/cudnn.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc b/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc
index f2d79b13a17413..f311ea3e0deb0e 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/cudnn.cc
@@ -239,6 +239,7 @@ GetCudnnFusionConfigs(const HloInstruction& instr,
   std::vector<std::unique_ptr<BackendConfig>> configs;
   int plan_count = CuDnnFusionCompiler::GetAvailablePlanCount(
       *stream_executor, *DynCast<HloFusionInstruction>(&instr));
+  VLOG(2) << "Found " << plan_count << " plans for cudnn fusion.";
   configs.reserve(plan_count);
   for (int plan_id = 0; plan_id < plan_count; ++plan_id) {
     CudnnBackendConfig config;

From 5ffce0d7c979638e68efaec00d22cc4c1519fd59 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Fri, 19 Dec 2025 02:10:49 -0800
Subject: [PATCH 578/753] [XLA:GPU] Fix bugs when sorting two elements with a
 fused iota.

The first bug was that we did not pass the right HloInstruction to EmitIota().
The second bug happens if the comparison determines that the two elements don't
have to be swapped. Then we would not write the computed iota into the output
buffer, as we only write elements if they need to be swapped.

PiperOrigin-RevId: 846637328
---
 .../backends/gpu/codegen/llvm/sort_util.cc    | 40 +++++++++++++++----
 .../xla/xla/service/gpu/tests/sorting_test.cc | 30 ++++++++++++++
 2 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
index 40d88f803841db..167eefc4256b7a 100644
--- a/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/llvm/sort_util.cc
@@ -73,7 +73,7 @@ absl::Status EmitCompareLoopBody(
     std::function<void(int64_t operand, llvm::Value* index, llvm::Value* value)>
         write_element,
     const EmitCallToNestedComputationCallback& emit_compare_callback,
-    llvm::IRBuilderBase* b, bool needs_bounds_checks = true) {
+    llvm::IRBuilderBase* b, bool force_write, bool needs_bounds_checks = true) {
   auto index_typed_constant = [&](int64_t value) {
     return llvm::ConstantInt::get(index_type, value);
   };
@@ -211,7 +211,18 @@ absl::Status EmitCompareLoopBody(
           llvm::Value* is_smaller_than = b->CreateICmpNE(
               result, llvm::ConstantInt::get(result->getType(), 0),
               "boolean_predicate");
-          ksl.If("is_smaller_than", is_smaller_than, [&]() {
+          auto write_original_order = [&]() {
+            for (int64_t i = 0; i < num_values; ++i) {
+              // Don't swap the values.
+              auto value1 = b->CreateLoad(values_to_compare_types[i * 2],
+                                          values_to_compare[i * 2]);
+              auto value2 = b->CreateLoad(values_to_compare_types[i * 2 + 1],
+                                          values_to_compare[i * 2 + 1]);
+              write_element(i, current_keys_index, value2);
+              write_element(i, compare_keys_index, value1);
+            }
+          };
+          auto write_swapped_order = [&]() {
             for (int64_t i = 0; i < num_values; ++i) {
               // Swap the values.
               auto value1 = b->CreateLoad(values_to_compare_types[i * 2],
@@ -221,7 +232,18 @@ absl::Status EmitCompareLoopBody(
               write_element(i, current_keys_index, value1);
               write_element(i, compare_keys_index, value2);
             }
-          });
+          };
+          if (force_write) {
+            // If we don't use shared memory, we have to make sure that values
+            // that were emitted as part of the first iteration get written to
+            // global memory, even if the comparison determined that no swap is
+            // necessary.
+            ksl.If("is_smaller_than", is_smaller_than, write_swapped_order,
+                   write_original_order);
+          } else {
+            ksl.If("is_smaller_than", is_smaller_than, write_swapped_order);
+          }
+
           return absl::OkStatus();
         }));
   }
@@ -359,14 +381,14 @@ absl::Status EmitTiledCompareLoop(
                 unroll_factor / 2, params.size(), element_pair_index, xor_mask,
                 tiled_keys_index.GetType(), element_address,
                 element_address_pointee_type, write_element,
-                emit_compare_callback, b);
+                emit_compare_callback, b, /*force_write=*/false);
           },
           [&]() {
             return EmitCompareLoopBody(
                 tile_size, num_threads, unroll_factor / 2, params.size(),
                 element_pair_index, xor_mask, tiled_keys_index.GetType(),
                 element_address, element_address_pointee_type, write_element,
-                emit_compare_callback, b,
+                emit_compare_callback, b, /*force_write=*/false,
                 /*needs_bounds_checks=*/false);
           }));
     } else {
@@ -374,7 +396,7 @@ absl::Status EmitTiledCompareLoop(
           tile_size, num_threads, unroll_factor / 2, params.size(),
           element_pair_index, xor_mask, tiled_keys_index.GetType(),
           element_address, element_address_pointee_type, write_element,
-          emit_compare_callback, b,
+          emit_compare_callback, b, /*force_write=*/false,
           /*needs_bounds_checks=*/false));
     }
     // Wait until all comparisons have happened.
@@ -506,7 +528,8 @@ absl::Status EmitSortInPlace(
         llvm::Value* element;
         if (emit_iota_operands &&
             HloPredicateIsOp<HloOpcode::kIota>(sort->operand(operand))) {
-          ASSIGN_OR_RETURN(element, EmitIota(sort, keys_index, module, b));
+          ASSIGN_OR_RETURN(
+              element, EmitIota(sort->operand(operand), keys_index, module, b));
         } else {
           if (!primitive_util::IsSubByteNonPredType(element_type)) {
             return values_arrays[operand].EmitArrayElementAddress(keys_index,
@@ -536,7 +559,8 @@ absl::Status EmitSortInPlace(
           dimension_to_sort_bound, /*num_threads=*/1, unroll_factor / 2,
           values_arrays.size(), tiles_index[rank - 1], xor_masks[0],
           tiles_index.GetType(), element_address, element_address_pointee_type,
-          write_element, emit_compare_callback, b));
+          write_element, emit_compare_callback, b,
+          /*force_write=*/emit_iota_operands));
     }
     return absl::OkStatus();
   };
diff --git a/third_party/xla/xla/service/gpu/tests/sorting_test.cc b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
index e8b6fad5b6d45c..130a1eb950b0b5 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
@@ -176,6 +176,36 @@ TEST_F(SortingTest, SortFusionWithIotaOperand) {
   EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5}));
 }
 
+TEST_F(SortingTest, SortFusionWithIotaOperandTinySortDim) {
+  const char* hlo_text = R"(
+    HloModule module
+
+    sorting_computation {
+      %lhs_key = s32[] parameter(0)
+      %rhs_key = s32[] parameter(1)
+      %lhs_index = s32[] parameter(2)
+      %rhs_index = s32[] parameter(3)
+      %lt_key = pred[] compare(%lhs_key, %rhs_key), direction=LT
+      %gt_key = pred[] compare(%rhs_key, %lhs_key), direction=LT
+      %eq_key = pred[] compare(%lt_key, %gt_key), direction=EQ
+      %lt_index = pred[] compare(%lhs_index, %rhs_index), direction=LT
+      ROOT res = pred[] select(%eq_key, %lt_index, %lt_key)
+    }
+
+    sort_fusion {
+      p0 = s32[2]{0} parameter(0)
+      iota = s32[2]{0} iota(), iota_dimension=0
+      ROOT sort = (s32[2]{0}, s32[2]{0}) sort(p0, iota), dimensions={0}, is_stable=true, to_apply=sorting_computation
+    }
+
+    ENTRY main {
+      p = s32[2]{0} parameter(0)
+      ROOT fusion = (s32[2]{0}, s32[2]{0}) fusion(p), kind=kInput, calls=sort_fusion
+    }
+  )";
+  EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5}));
+}
+
 // Test that verifies the IgnoreMemorySpace option works correctly
 TEST_F(SortingTest, LayoutsInShapesEqualWithIgnoreMemorySpace) {
   const char* hlo_text = R"(

From bf95b538bfbf1254f652e98407235fddf9f41b01 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Fri, 19 Dec 2025 02:11:06 -0800
Subject: [PATCH 579/753] Make filecheck prefixes a parameter.

`CHECK-PTX` and `CHECK-GCN` have been considered valid prefixes only if the build is configured as a ROCM or a CUDA build. This change makes this a parameter to the RunFilecheck functions and allows individual tests to decide which prefixes they wanna use.

PiperOrigin-RevId: 846637426
---
 third_party/xla/xla/hlo/testlib/BUILD         | 13 ++------
 third_party/xla/xla/hlo/testlib/filecheck.cc  | 33 +++++++++----------
 third_party/xla/xla/hlo/testlib/filecheck.h   | 10 +++---
 .../hlo_hardware_independent_test_base.cc     |  6 ++--
 .../hlo_hardware_independent_test_base.h      |  4 +--
 .../gpu/transforms/gemm_rewriter_fp8_test.cc  | 12 ++++++-
 6 files changed, 40 insertions(+), 38 deletions(-)

diff --git a/third_party/xla/xla/hlo/testlib/BUILD b/third_party/xla/xla/hlo/testlib/BUILD
index dc5d2a2da5cf65..afd25005125feb 100644
--- a/third_party/xla/xla/hlo/testlib/BUILD
+++ b/third_party/xla/xla/hlo/testlib/BUILD
@@ -1,17 +1,9 @@
 # Description:
 #   Base testing infrastructure for XLA.
 
-load(
-    "@local_config_rocm//rocm:build_defs.bzl",
-    "if_rocm_is_configured",
-)
 load("//xla/tsl:tsl.bzl", "internal_visibility")
 load("//xla/tsl:tsl.default.bzl", "filegroup")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
-load(
-    "//xla/tsl/platform/default:cuda_build_defs.bzl",
-    "if_cuda_is_configured",
-)
 
 package(
     # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"],
@@ -65,7 +57,6 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla:xla_proto_cc",
         "//xla/hlo/ir:hlo",
-        "//xla/hlo/ir:hlo_module_group",
         "//xla/hlo/parser:hlo_parser",
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/utils:hlo_query",
@@ -99,9 +90,7 @@ cc_library(
     data = [
         "@llvm-project//llvm:FileCheck",
     ],
-    local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     deps = [
-        "//xla:types",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:resource_loader",
@@ -109,7 +98,9 @@ cc_library(
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
         "@local_tsl//tsl/platform",
         "@local_tsl//tsl/platform:path",
     ],
diff --git a/third_party/xla/xla/hlo/testlib/filecheck.cc b/third_party/xla/xla/hlo/testlib/filecheck.cc
index f85e742f2933ab..7f351633f231df 100644
--- a/third_party/xla/xla/hlo/testlib/filecheck.cc
+++ b/third_party/xla/xla/hlo/testlib/filecheck.cc
@@ -20,7 +20,9 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/resource_loader.h"
@@ -30,8 +32,9 @@ limitations under the License.
 
 namespace xla {
 
-absl::StatusOr<bool> RunFileCheck(const std::string& input,
-                                  absl::string_view pattern) {
+absl::StatusOr<bool> RunFileCheck(
+    const std::string& input, absl::string_view pattern,
+    absl::Span<const absl::string_view> additional_check_prefixes) {
   // Generate an input file for the FileCheck pattern.
   std::string pattern_path;
   auto env = tsl::Env::Default();
@@ -40,11 +43,13 @@ absl::StatusOr<bool> RunFileCheck(const std::string& input,
   }
   TF_RETURN_IF_ERROR(tsl::WriteStringToFile(env, pattern_path, pattern));
   VLOG(3) << "input: " << input;
-  return RunFileCheckWithPatternFile(input, pattern_path);
+  return RunFileCheckWithPatternFile(input, pattern_path,
+                                     additional_check_prefixes);
 }
 
 absl::StatusOr<bool> RunFileCheckWithPatternFile(
-    const std::string& input, const std::string& pattern_file) {
+    const std::string& input, const std::string& pattern_file,
+    absl::Span<const absl::string_view> additional_check_prefixes) {
   // Invoke FileCheck to check whether input matches `pattern`.
   std::string binary_name = "FileCheck";
   tsl::io::AppendDotExeIfWindows(binary_name);
@@ -53,24 +58,16 @@ absl::StatusOr<bool> RunFileCheckWithPatternFile(
           ? tsl::io::JoinPath("external", "llvm-project", "llvm", binary_name)
           : tsl::io::JoinPath("llvm", "llvm-project", "llvm", binary_name));
 
+  std::string check_prefixes = "--check-prefixes=CHECK";
+  for (const absl::string_view& prefix : additional_check_prefixes) {
+    absl::StrAppend(&check_prefixes, ",", prefix);
+  }
+
   tsl::SubProcess file_check_process;
-#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-  std::string file_check_prefixes;
-#if GOOGLE_CUDA
-  file_check_prefixes = "--check-prefixes=CHECK,CHECK-PTX";
-#endif  // GOOGLE_CUDA
-#if TENSORFLOW_USE_ROCM
-  file_check_prefixes = "--check-prefixes=CHECK,CHECK-GCN";
-#endif  // TENSORFLOW_USE_ROCM
   file_check_process.SetProgram(
       file_check_path,
       {file_check_path, "-v", "-dump-input=fail", "--dump-input-filter=all",
-       file_check_prefixes, "--allow-unused-prefixes", pattern_file});
-#else  // !(GOOGLE_CUDA || TENSORFLOW_USE_ROCM)
-  file_check_process.SetProgram(file_check_path,
-                                {file_check_path, "-v", "-dump-input=fail",
-                                 "--dump-input-filter=all", pattern_file});
-#endif
+       check_prefixes, "--allow-unused-prefixes", pattern_file});
   file_check_process.SetChannelAction(tsl::CHAN_STDIN, tsl::ACTION_PIPE);
   file_check_process.SetChannelAction(tsl::CHAN_STDERR, tsl::ACTION_PIPE);
   if (!file_check_process.Start()) {
diff --git a/third_party/xla/xla/hlo/testlib/filecheck.h b/third_party/xla/xla/hlo/testlib/filecheck.h
index 3ea8de22f60fe8..2d2ec9712bd082 100644
--- a/third_party/xla/xla/hlo/testlib/filecheck.h
+++ b/third_party/xla/xla/hlo/testlib/filecheck.h
@@ -20,21 +20,23 @@ limitations under the License.
 
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
-#include "xla/types.h"
+#include "absl/types/span.h"
 
 namespace xla {
 
 // Runs FileCheck with the given pattern over given input string. Provided that
 // FileCheck can execute, returns true if and only if FileCheck succeeded in
 // matching the input.
-absl::StatusOr<bool> RunFileCheck(const std::string& input,
-                                  absl::string_view pattern);
+absl::StatusOr<bool> RunFileCheck(
+    const std::string& input, absl::string_view pattern,
+    absl::Span<const absl::string_view> additional_check_prefixes = {});
 
 // Runs FileCheck with the given pattern file over given input string. Provided
 // that FileCheck can execute, returns true if and only if FileCheck succeeded
 // in matching the input.
 absl::StatusOr<bool> RunFileCheckWithPatternFile(
-    const std::string& input, const std::string& pattern_file);
+    const std::string& input, const std::string& pattern_file,
+    absl::Span<const absl::string_view> additional_check_prefixes = {});
 
 }  // namespace xla
 
diff --git a/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.cc b/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.cc
index b950513875436e..8ac3ca2ef35c89 100644
--- a/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.cc
+++ b/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "absl/strings/str_replace.h"
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
 #include "xla/debug_options_flags.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -222,7 +223,8 @@ void HloHardwareIndependentTestBase::RunAndFilecheckHloRewrite(
     absl::string_view hlo, HloPassInterface&& hlo_pass,
     std::optional<absl::string_view> expected,
     std::function<void(HloModule*)> after_pass_checks,
-    const HloModuleConfig* config) const {
+    const HloModuleConfig* config,
+    absl::Span<const absl::string_view> additional_check_prefixes) const {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
                           config ? ParseAndReturnVerifiedModule(hlo, *config)
                                  : ParseAndReturnVerifiedModule(hlo));
@@ -233,7 +235,7 @@ void HloHardwareIndependentTestBase::RunAndFilecheckHloRewrite(
         bool filecheck_matches,
         RunFileCheck(
             module->ToString(HloPrintOptions().set_print_large_constants(true)),
-            *expected));
+            *expected, additional_check_prefixes));
     EXPECT_TRUE(filecheck_matches) << module->ToString();
     if (after_pass_checks) {
       after_pass_checks(module.get());
diff --git a/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.h b/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.h
index d9a0973ce1b0bf..3d02c90b35c375 100644
--- a/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.h
+++ b/third_party/xla/xla/hlo/testlib/hlo_hardware_independent_test_base.h
@@ -35,7 +35,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
-#include "xla/hlo/ir/hlo_module_group.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/hlo/pass/hlo_pass_interface.h"
@@ -167,7 +166,8 @@ class HloHardwareIndependentTestBase : public ::testing::Test {
       absl::string_view hlo_with_filecheck_lines, HloPassInterface&& hlo_pass,
       std::optional<absl::string_view> expected,
       std::function<void(HloModule*)> after_pass_checks = nullptr,
-      const HloModuleConfig* config = nullptr) const;
+      const HloModuleConfig* config = nullptr,
+      absl::Span<const absl::string_view> additional_check_prefixes = {}) const;
 
   using FixedMapping =
       std::initializer_list<std::pair<absl::string_view, absl::string_view>>;
diff --git a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_fp8_test.cc b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_fp8_test.cc
index c76b340ce86906..04f1cbd82df4da 100644
--- a/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_fp8_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/gemm_rewriter_fp8_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
+#include <vector>
 
 #include <gtest/gtest.h>
 #include "absl/container/flat_hash_map.h"
@@ -136,9 +137,18 @@ class ParameterizedFp8GemmRewriteTest
     if (expected.has_value()) {
       std::string replaced_pattern =
           absl::StrReplaceAll(expected.value(), replacements_);
+      std::vector<absl::string_view> additional_check_prefixes;
+      if (IsCuda()) {
+        additional_check_prefixes.push_back("CHECK-PTX");
+      }
+      if (IsRocm()) {
+        additional_check_prefixes.push_back("CHECK-GCN");
+      }
+
       GemmRewriteTestBase::RunAndFilecheckHloRewrite(
           absl::StrReplaceAll(hlo, replacements_), std::move(hlo_pass),
-          replaced_pattern, after_pass_checks, config);
+          replaced_pattern, after_pass_checks, config,
+          additional_check_prefixes);
     }
   }
 

From aa1c37b22c9967dcb74eb914d73f9adaf22fba18 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 02:11:35 -0800
Subject: [PATCH 580/753] Automated Code Change

PiperOrigin-RevId: 846637593
---
 .../xla/backends/gpu/codegen/triton/transforms/int4_passes.cc    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc
index a5adf3a2bdd05d..7ac2dd35a193d8 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 #include <algorithm>
 #include <cstdint>
-#include <functional>
 #include <iterator>
 #include <memory>
 #include <optional>

From d68f58209eb731c3e298c2d487a3f3df5cd9ce6b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 02:33:44 -0800
Subject: [PATCH 581/753] Automated Code Change

PiperOrigin-RevId: 846644862
---
 .../codegen/triton/compilation_pipeline_rocm.cc    | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
index 2786b61fc4fd73..591bdb8a6ff33c 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc
@@ -86,28 +86,30 @@ static void MakeTTGIR(mlir::OpPassManager* pm,
     pm->addPass(mlir::createTritonAMDGPUScheduleLoops({num_stages}));
     pm->addPass(mlir::createTritonAMDGPUPipeline(
         {/*useAsyncCopy=*/false, /*usePingpong=*/false}));
-    if (/*use_async_copy=*/false) {  // Not enabled by default.
+    if (/*use_async_copy=*//* DISABLES CODE */ (
+        false)) {  // Not enabled by default.
       pm->addPass(mlir::createTritonAMDGPUCoalesceAsyncCopy());
     }
     pm->addPass(mlir::createCanonicalizerPass());
   }
-  if (/*(instruction_sched_variant=="none") == */ false) {
+  if (/*(instruction_sched_variant=="none") == */ /* DISABLES CODE */ (false)) {
     pm->addPass(mt::createTritonAMDGPUInsertInstructionSchedHintsPass("none"));
   }
   pm->addPass(mt::gpu::createTritonGPUOptimizeDotOperands({true}));
   pm->addPass(mt::gpu::createTritonGPURemoveLayoutConversions());
   pm->addPass(mt::gpu::createTritonGPUReduceDataDuplication());
-  if (/*(instruction_sched_variant=="none") == */ false) {
+  if (/*(instruction_sched_variant=="none") == */ /* DISABLES CODE */ (false)) {
     pm->addPass(mlir::createTritonAMDGPUInThreadTranspose());
     pm->addPass(mt::gpu::createTritonGPURemoveLayoutConversions());
   }
   if (rocm_cc.has_amd_matrix_instr()) {
     pm->addPass(mt::gpu::createTritonGPUReorderInstructions());
   }
-  if (/*use_block_pingpong=*/false) {
+  if (/*use_block_pingpong=*//* DISABLES CODE */ (false)) {
     pm->addPass(mlir::createTritonAMDGPUBlockPingpong({num_stages}));
   }
-  if (/*use_buffer_ops=*/false) {  // Not enabled by default.
+  if (/*use_buffer_ops=*//* DISABLES CODE */ (
+      false)) {  // Not enabled by default.
     pm->addPass(mlir::createTritonAMDGPUCanonicalizePointers());
     pm->addPass(mlir::createCanonicalizerPass());
     pm->addPass(mlir::createTritonAMDGPUConvertToBufferOps({arch_name}));
@@ -140,7 +142,7 @@ static void MakeLLIR(mlir::OpPassManager* pm,
   pm->addPass(mlir::createCanonicalizerPass());
   pm->addPass(mlir::createCSEPass());
   pm->addPass(mlir::createSymbolDCEPass());
-  if (/*(instruction_sched_variant=="none") == */ false) {
+  if (/*(instruction_sched_variant=="none") == */ /* DISABLES CODE */ (false)) {
     pm->addPass(mt::createTritonAMDGPULowerInstructionSchedHintsPass(
         rocm_cc.gfx_version(), num_stages));
   }

From 8bb1b4215c096c0800f99b19567856948b0ab332 Mon Sep 17 00:00:00 2001
From: pemeliya <141146080+pemeliya@users.noreply.github.com>
Date: Fri, 19 Dec 2025 05:22:20 -0800
Subject: [PATCH 582/753] PR #35354: [ROCM] bug-fixing SortRewriter and fixing
 self_adjoint_test on ROCM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Imported from GitHub PR https://github.com/openxla/xla/pull/35354

📝 Summary of Changes
- This reenables SortRewriter which was erroneously disabled on ROCM (wrong platform).
- SortRewriter is called after EighExpander to make sure sort op inserted by EighExpander gets properly rewritten
- Reenables sort_rewriter_test on ROCM which was marked 'cuda-only'

This shall also improve the perf of EighExpander on CUDA since [this sort op](https://github.com/ROCm/xla/blob/855c22fafe82b400429380621fa8596a523a2014/xla/hlo/transforms/expanders/eigh_expander.cc#L375) gets now rewritten to CUB custom-call

🚀 Kind of Contribution
🐛 Bug Fix, ⚡️ Performance Improvement,

🧪 Unit Tests:
The existing self_adjoint_test was failing on ROCM due to disabled SortRewriter.
Also sort_rewriter_test is now enabled on ROCM too

@xla-rotation could you have a look please?
Copybara import of the project:

--
2ef0f6553a3255d3f66535fd01abfd0aa3f68ef4 by Pavel Emeliyanenko <pavel.emeliyanenko@amd.com>:

Fixing SortRewriter and fixing self_adjoint_test on ROCM

Moved SortRewriter back to its original place

Merging this change closes #35354

PiperOrigin-RevId: 846691031
---
 .../xla/xla/service/gpu/transforms/BUILD      |  1 -
 .../gpu/transforms/sort_rewriter_test.cc      | 28 +++++++++++++++----
 .../rocm/cub_sort_kernel_rocm.cu.cc           |  4 +--
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 8b3006db1b7a11..1f2b9204105c19 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -2629,7 +2629,6 @@ xla_test(
         "gpu",
     ],
     tags = [
-        "cuda-only",
         "test_migrated_to_hlo_runner_pjrt",
     ],
     deps = [
diff --git a/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc b/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc
index cc6dc67fa79bfb..471facbe522d88 100644
--- a/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/sort_rewriter_test.cc
@@ -439,6 +439,9 @@ ENTRY %main {
   ROOT %sort = f32[$0,100000] sort(%input), dimensions={1}, to_apply=%compare
 })";
 
+  if (xla::PlatformUtil::CanonicalPlatformName("gpu").value() == "rocm") {
+    GTEST_SKIP() << "Skipping CUDA-specific test";
+  }
   auto pass = SortRewriter(TestGpuDeviceInfo::RTXH100SXMDeviceInfo(), "CUDA");
 
   // Batch 1
@@ -477,6 +480,9 @@ ENTRY %main {
   ROOT %sort = f32[$0,100000] sort(%input), dimensions={1}, to_apply=%compare
 })";
 
+  if (xla::PlatformUtil::CanonicalPlatformName("gpu").value() == "rocm") {
+    GTEST_SKIP() << "Skipping CUDA-specific test";
+  }
   auto pass = SortRewriter(TestGpuDeviceInfo::RTXA6000DeviceInfo(), "CUDA");
 
   // Batch 1
@@ -571,13 +577,23 @@ ENTRY %main {
       dimensions={0}, to_apply=%compare, metadata={op_type="sort" op_name="sort" source_file="path/to/test.cc" source_line=68}
 })";
   constexpr char kExpectedPattern[] = R"(
-    // CHECK: %[[CC:.*]] = (u16[1000]{0}, u8[1]{0}) custom-call({{.*}}), custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize", metadata={op_type="sort" op_name="sort" source_file="path/to/test.cc" source_line=68}, backend_config={"descending":true}
+    // CHECK: %[[CC:.*]] = (u16[1000]{0}, u8[{{[0-9]+}}]{0}) custom-call({{.*}}), custom_call_target="__cub$DeviceRadixSortUnassignedScratchSize", metadata={op_type="sort" op_name="sort" source_file="path/to/test.cc" source_line=68}, backend_config={"descending":true}
   )";
-  for (const auto& [device_description, platform_name] :
-       {std::tuple{TestGpuDeviceInfo::RTXA6000DeviceInfo(), "CUDA"},
-        std::tuple{TestGpuDeviceInfo::RTXH100SXMDeviceInfo(), "CUDA"}}) {
-    RunAndFilecheckHloRewrite(kHlo,
-                              SortRewriter(device_description, platform_name),
+
+  auto platform_name = absl::AsciiStrToUpper(
+      xla::PlatformUtil::CanonicalPlatformName("gpu").value());
+  auto device_list = [platform_name]() -> std::vector<se::DeviceDescription> {
+    if (platform_name == "CUDA") {
+      return {TestGpuDeviceInfo::RTXA6000DeviceInfo(),
+              TestGpuDeviceInfo::RTXH100SXMDeviceInfo()};
+    } else {
+      return {TestGpuDeviceInfo::AMDMI210DeviceInfo(),
+              TestGpuDeviceInfo::AMDRX7900DeviceInfo()};
+    }
+  };
+
+  for (const auto& device_desc : device_list()) {
+    RunAndFilecheckHloRewrite(kHlo, SortRewriter(device_desc, platform_name),
                               kExpectedPattern);
   }
 }
diff --git a/third_party/xla/xla/stream_executor/rocm/cub_sort_kernel_rocm.cu.cc b/third_party/xla/xla/stream_executor/rocm/cub_sort_kernel_rocm.cu.cc
index 7b7d43ab3e460d..d6dcaade5543da 100644
--- a/third_party/xla/xla/stream_executor/rocm/cub_sort_kernel_rocm.cu.cc
+++ b/third_party/xla/xla/stream_executor/rocm/cub_sort_kernel_rocm.cu.cc
@@ -252,7 +252,7 @@ static absl::Status CubSortPairsGetScratchSize(size_t* temp_bytes,
           .Attr<size_t>("num_items")                                          \
           .Attr<size_t>("batch_size"));                                       \
   XLA_FFI_REGISTER_HANDLER(                                                   \
-      xla::ffi::GetXlaFfiApi(), "xla.gpu.ext.cub_sort_keys_" #suffix, "CUDA", \
+      xla::ffi::GetXlaFfiApi(), "xla.gpu.ext.cub_sort_keys_" #suffix, "ROCM", \
       {/* .instantiate = */ nullptr, /* .prepare = */ nullptr,                \
        /* .initialize = */ kCubSortKeysInitialize_##suffix,                   \
        /* .execute = */ kCubSortKeysExecute_##suffix});
@@ -278,7 +278,7 @@ static absl::Status CubSortPairsGetScratchSize(size_t* temp_bytes,
           .Attr<size_t>("num_items")                                           \
           .Attr<size_t>("batch_size"));                                        \
   XLA_FFI_REGISTER_HANDLER(                                                    \
-      xla::ffi::GetXlaFfiApi(), "xla.gpu.ext.cub_sort_pairs_" #suffix, "CUDA", \
+      xla::ffi::GetXlaFfiApi(), "xla.gpu.ext.cub_sort_pairs_" #suffix, "ROCM", \
       {/* .instantiate = */ nullptr, /* .prepare = */ nullptr,                 \
        /* .initialize = */ kCubSortPairsInitialize_##suffix,                   \
        /* .execute = */ kCubSortPairsExecute_##suffix});

From 6df9bab4d35808b4faa6d79b4671e61417629290 Mon Sep 17 00:00:00 2001
From: Theotime Combes <tcombes@google.com>
Date: Fri, 19 Dec 2025 07:05:39 -0800
Subject: [PATCH 583/753] [XLA:GPU] Only mark HoistFusedBitcasts as changed
 when modifications occur.

PiperOrigin-RevId: 846719569
---
 .../xla/xla/service/gpu/transforms/BUILD      |  1 +
 .../gpu/transforms/hoist_fused_bitcasts.cc    | 36 +++++++++++--------
 .../transforms/hoist_fused_bitcasts_test.cc   | 17 +++++----
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 1f2b9204105c19..1f3979a00abb01 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1891,6 +1891,7 @@ cc_library(
         "//xla/service/gpu:backend_configs_cc",
         "//xla/service/gpu:matmul_utils",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_set",
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
index 8ca5c54ce1db0d..ca1a24b835ec14 100644
--- a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts.cc
@@ -56,6 +56,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla::gpu {
 namespace {
@@ -799,15 +800,15 @@ absl::Status HoistBitcastUpwardsToCallers(HloInstruction* bitcast,
 // shape. The bitcast is chosen so that it cancels out bitcasts and reshapes
 // along the way up to the dot. Updates the callers of the dot to expect the new
 // root shape.
-absl::Status MaybeInsertRootBitcast(HloInstruction* dot,
-                                    absl::Span<HloInstruction*> callers) {
+absl::StatusOr<bool> MaybeInsertRootBitcast(
+    HloInstruction* dot, absl::Span<HloInstruction*> callers) {
   TF_ASSIGN_OR_RETURN(Shape root_shape,
                       ComputeRootShapeAfterHoistingBitcasts(dot));
 
   HloComputation* computation = dot->parent();
   HloInstruction* root = computation->root_instruction();
   if (root->shape() == root_shape) {
-    return absl::OkStatus();
+    return false;
   }
 
   // Insert a new bitcast at the root.
@@ -822,24 +823,28 @@ absl::Status MaybeInsertRootBitcast(HloInstruction* dot,
     *caller->mutable_shape() = root_shape;
   }
 
-  return absl::OkStatus();
+  return true;
 }
 
 // Try hoisting bitcasts and reshapes in the computation away from 'dot' to the
 // callers of the computation. Some bitcasts or reshapes may remain in the
 // computation, because they cannot be hoisted across all ops, e.g. across some
 // transposes and broadcasts. This is not reported as an error.
-absl::Status TryHoistBitcastsInComputationToCallers(HloInstruction* dot,
-                                                    CallGraph* call_graph) {
+absl::StatusOr<bool> TryHoistBitcastsInComputationToCallers(
+    HloInstruction* dot, CallGraph* call_graph) {
+  bool changed = false;
   // Instead of implementing a logic to hoist bitcast upwards and downwards
   // we insert a bitcast at the root that and always hoist bitcasts upwards.
   // That significantly simplifies the implementation.
   VLOG(2) << "Before hoisting bitcasts: " << dot->parent()->ToString();
 
   auto callers = call_graph->GetComputationCallers(dot->parent());
-  if (auto status = MaybeInsertRootBitcast(dot, absl::MakeSpan(callers));
-      !status.ok()) {
-    VLOG(2) << "Failed to insert root bitcast: " << status;
+  absl::StatusOr<bool> inserted =
+      MaybeInsertRootBitcast(dot, absl::MakeSpan(callers));
+  if (!inserted.ok()) {
+    VLOG(2) << "Failed to insert root bitcast: " << inserted.status();
+  } else {
+    changed |= *inserted;
   }
   VLOG(2) << "After inserting root bitcast: " << dot->parent()->ToString();
 
@@ -856,11 +861,13 @@ absl::Status TryHoistBitcastsInComputationToCallers(HloInstruction* dot,
     if (!status.ok()) {
       VLOG(2) << "Failed to hoist " << instruction->ToString()
               << " upwards: " << status;
+    } else {
+      changed = true;
     }
   }
 
   VLOG(2) << "After hoisting bitcasts: " << dot->parent()->ToString();
-  return absl::OkStatus();
+  return changed;
 }
 
 class HoistFusedBitcastsVisitor : public DfsHloRewriteVisitor {
@@ -884,10 +891,11 @@ class HoistFusedBitcastsVisitor : public DfsHloRewriteVisitor {
       }
     }
 
-    TF_RETURN_IF_ERROR(
-        TryHoistBitcastsInComputationToCallers(instr, call_graph));
-    // TODO(b/446827313): don't mark as changed if no changes were made.
-    MarkAsChanged();
+    ASSIGN_OR_RETURN(bool changed,
+                     TryHoistBitcastsInComputationToCallers(instr, call_graph));
+    if (changed) {
+      MarkAsChanged();
+    }
     return absl::OkStatus();
   }
 
diff --git a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
index 9077028ea76963..f6cde67faa95d6 100644
--- a/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/hoist_fused_bitcasts_test.cc
@@ -535,7 +535,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
   std::unique_ptr<VerifiedHloModule> module =
-      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)),
+                            /*expect_change=*/false);
   // Cos should not be rewritten as we cannot hoist bitcast.
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -571,7 +572,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":4,"num_ctas":1}}}}
 )";
   std::unique_ptr<VerifiedHloModule> module =
-      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)),
+                            /*expect_change=*/false);
   // Cos should not be rewritten as we cannot hoist bitcast.
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
@@ -834,7 +836,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
   std::unique_ptr<VerifiedHloModule> module =
-      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)),
+                            /*expect_change=*/false);
   EXPECT_THAT(
       RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()), R"(
 CHECK:      transpose
@@ -866,7 +869,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
   std::unique_ptr<VerifiedHloModule> module =
-      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)),
+                            /*expect_change=*/false);
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
 CHECK:      f32[2,3,5]{2,1,0} $0
@@ -1010,7 +1014,7 @@ CHECK-SAME: dimensions={0,1}
 }
 
 TEST_P(HoistFusedBitcastsReshapeTest,
-       BitcastsAreHoistedDownThroughBroadcastsWithNonDefaultLayout) {
+       BitcastsAreNotHoistedDownThroughBroadcastsWithNonDefaultLayout) {
   HloOpcode opcode = GetParam();
   absl::string_view hlo = R"(
 triton_dot {
@@ -1031,7 +1035,8 @@ ENTRY e {
     "split_k":1,"num_stages":1,"num_warps":1,"num_ctas":1}}}}
 )";
   std::unique_ptr<VerifiedHloModule> module =
-      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)));
+      RunHoistFusedBitcasts(absl::Substitute(hlo, HloOpcodeString(opcode)),
+                            /*expect_change=*/false);
   EXPECT_THAT(RunFileCheck(module->ToString(HloPrintOptions::ShortParsable()),
                            absl::Substitute(R"(
 CHECK:      f32[2,3,5]{2,1,0} $0(dot)

From 162efc7b6cf6eae703db2a06cf2989d752654cb8 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 19 Dec 2025 08:25:36 -0800
Subject: [PATCH 584/753] [PJRT] Add a ExecuteChunk method that allows the user
 to execute a single chunk of the transpose only.

The current Execute() method executes all the chunks at one time. This refactoring is in preparation for software pipelining transposes with DMAs. Execute() now calls ExecuteChunk() for each chunk.

PiperOrigin-RevId: 846743899
---
 third_party/xla/xla/pjrt/transpose.cc | 97 ++++++++++++++-------------
 third_party/xla/xla/pjrt/transpose.h  |  6 ++
 2 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index 6ad578d667866e..91aa71119141ce 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -442,6 +442,49 @@ struct uint128 {
 };
 static_assert(sizeof(uint128) == 16, "uint128 should be 16 bytes in size");
 
+void TransposePlan::ExecuteChunk(int chunk_id, const void* a, void* b) const {
+  if (num_elems_ == 0) {
+    return;
+  }
+  tsl::profiler::TraceMe traceme("Transpose::ExecuteChunk", /*level=*/2);
+
+  absl::Span<Node const> nodes = nodes_[chunk_id];
+  const char* ac = static_cast<const char*>(a) + input_offset_bytes_[chunk_id];
+  char* bc = static_cast<char*>(b) + output_offset_bytes_[chunk_id];
+
+  if (inner_kernel_is_memcpy_) {
+    DCHECK(transformation_ == Transformation::kNone);
+    // Memcpy-based plans all assume element size 1 (i.e., bytes).
+    TransposeConstStride1(ac, bc, nodes.data());
+    return;
+  }
+
+  switch (elem_size_in_bytes_) {
+    case 1:
+      ExecuteTyped<uint8_t, Transformation::kNone>(ac, bc, nodes);
+      break;
+    case 2:
+      ExecuteTyped<uint16_t, Transformation::kNone>(ac, bc, nodes);
+      break;
+    case 4:
+      if (transformation_ == Transformation::kNone) {
+        ExecuteTyped<uint32_t, Transformation::kNone>(ac, bc, nodes);
+      } else {
+        DCHECK(transformation_ == Transformation::kF64ToEf57);
+        ExecuteTyped<uint32_t, Transformation::kF64ToEf57>(ac, bc, nodes);
+      }
+      break;
+    case 8:
+      ExecuteTyped<uint64_t, Transformation::kNone>(ac, bc, nodes);
+      break;
+    case 16:
+      ExecuteTyped<uint128, Transformation::kNone>(ac, bc, nodes);
+      break;
+    default:
+      LOG(FATAL) << "Unimplemented element size " << elem_size_in_bytes_;
+  }
+}
+
 void TransposePlan::Execute(
     const void* a, void* b,
     std::optional<absl::FunctionRef<void(std::function<void(void)>)>>
@@ -451,59 +494,19 @@ void TransposePlan::Execute(
   }
   tsl::profiler::TraceMe traceme("Transpose::Execute", /*level=*/2);
 
-  auto execute_by_type = [&](int chunk_id) {
-    const char* ac =
-        static_cast<const char*>(a) + input_offset_bytes_[chunk_id];
-    char* bc = static_cast<char*>(b) + output_offset_bytes_[chunk_id];
-
-    absl::Span<Node const> nodes = nodes_[chunk_id];
-    if (inner_kernel_is_memcpy_) {
-      DCHECK(transformation_ == Transformation::kNone);
-      // Memcpy-based plans all assume element size 1 (i.e., bytes).
-      TransposeConstStride1(ac, bc, nodes.data());
-      return;
-    }
-
-    switch (elem_size_in_bytes_) {
-      case 1:
-        ExecuteTyped<uint8_t, Transformation::kNone>(ac, bc, nodes);
-        break;
-      case 2:
-        ExecuteTyped<uint16_t, Transformation::kNone>(ac, bc, nodes);
-        break;
-      case 4:
-        if (transformation_ == Transformation::kNone) {
-          ExecuteTyped<uint32_t, Transformation::kNone>(ac, bc, nodes);
-        } else {
-          DCHECK(transformation_ == Transformation::kF64ToEf57);
-          ExecuteTyped<uint32_t, Transformation::kF64ToEf57>(ac, bc, nodes);
-        }
-        break;
-      case 8:
-        ExecuteTyped<uint64_t, Transformation::kNone>(ac, bc, nodes);
-        break;
-      case 16:
-        ExecuteTyped<uint128, Transformation::kNone>(ac, bc, nodes);
-        break;
-      default:
-        LOG(FATAL) << "Unimplemented element size " << elem_size_in_bytes_;
-    }
-  };
-
-  if (!schedule_work || nodes_.size() <= 1) {
-    for (int i = 0; i < nodes_.size(); ++i) {
-      execute_by_type(i);
+  if (!schedule_work || Parallelism() <= 1) {
+    for (int i = 0; i < Parallelism(); ++i) {
+      ExecuteChunk(i, a, b);
     }
   } else {
-    absl::BlockingCounter counter(nodes_.size() - 1);
-    for (int i = 1; i < nodes_.size(); ++i) {
+    absl::BlockingCounter counter(Parallelism() - 1);
+    for (size_t i = 1; i < nodes_.size(); ++i) {
       (*schedule_work)([&, i]() {
-        execute_by_type(i);
+        ExecuteChunk(i, a, b);
         counter.DecrementCount();
       });
     }
-    // Run the first chunk inline in this thread.
-    execute_by_type(0);
+    ExecuteChunk(0, a, b);
     counter.Wait();
   }
 }
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index 44926cfe82ae62..d6dea32e6c97c6 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -124,6 +124,12 @@ class TransposePlan {
                std::optional<absl::FunctionRef<void(std::function<void(void)>)>>
                    schedule_work = std::nullopt) const;
 
+  // Executes a single chunk of the transposition. To perform a complete
+  // transposition, call ExecuteChunk for each chunk ID from 0 to Parallelism()
+  // - 1. It is legal to call ExecuteChunk for independent chunks in parallel.
+  // This is useful for callers that want to manage their own threading.
+  void ExecuteChunk(int chunk_id, const void* a, void* b) const;
+
   // Returns a human-readable description of the plan.
   std::string ToString() const;
 

From 6af9ca2d9c88467688a5a5fed179c3574b99d3e1 Mon Sep 17 00:00:00 2001
From: Deqiang Chen <deqiangc@google.com>
Date: Fri, 19 Dec 2025 09:47:30 -0800
Subject: [PATCH 585/753] Add LINT.IfChange to keep batch function op
 registrations in sync.

PiperOrigin-RevId: 846770644
---
 .../runtime/runtime_fallback_batch_tf_opkernels.cc             | 2 ++
 tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc               | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc b/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc
index 9e7e9678635db5..016ccf6b1bf55c 100644
--- a/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc
+++ b/tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc
@@ -407,6 +407,7 @@ REGISTER_KERNEL_BUILDER(
 
 // Identical to BatchFunction except it has 2 extra TFRT attributes and it does
 // not have `f` attribute. Users will not invoke this op directly.
+// LINT.IfChange
 REGISTER_OP("_BatchFunctionFallback")
     .Input("in_tensors: Tin")
     .Input("captured_tensors: Tcaptured")
@@ -467,6 +468,7 @@ REGISTER_OP("_BatchFunctionFallback")
     .Attr("opaque_function_handle: int")
     .SetShapeFn(shape_inference::UnknownShape);
 
+// LINT.ThenChange(//tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc)
 }  // namespace
 }  // namespace tfrt_stub
 }  // namespace tensorflow
diff --git a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
index e8e367c48a568a..b260fc6a492833 100644
--- a/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
+++ b/tensorflow/core/tfrt/mlrt/kernel/batch_kernel.cc
@@ -461,6 +461,7 @@ REGISTER_KERNEL_BUILDER(
     Name(kMlrtBatchFunctionName).Device(DEVICE_GPU),
     tfrt_stub::BatchFunctionFallbackKernel<MlrtBatchResource>);
 
+// LINT.IfChange
 // Identical to BatchFunction except it has 2 extra TFRT attributes and it does
 // not have `f` attribute. Users will not invoke this op directly.
 REGISTER_OP(kMlrtBatchFunctionName)
@@ -522,6 +523,8 @@ REGISTER_OP(kMlrtBatchFunctionName)
     .Attr("opaque_function_handle: int")
     .SetShapeFn(shape_inference::UnknownShape);
 
+// LINT.ThenChange(//tensorflow/core/runtime_fallback/runtime/runtime_fallback_batch_tf_opkernels.cc)
+
 }  // namespace
 
 // TODO(rohitju, chky): This additional Register is not ideal but unavoidable

From 843206ca817f964c128e4a810f9d95cde46d3833 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Fri, 19 Dec 2025 10:39:43 -0800
Subject: [PATCH 586/753] Enable using custom hermetic NCCL version.

The NCCL version can be chosen via `HERMETIC_NCCL_VERSION` env var.

See docs [here](https://github.com/google-ml-infra/rules_ml_toolchain/blob/main/gpu/README.md#environment-variables-controlling-the-hermetic-cudacudnnnvshmem-versions).

PiperOrigin-RevId: 846788706
---
 .bazelrc                                   | 3 ++-
 WORKSPACE                                  | 6 +++---
 ci/official/utilities/code_check_full.bats | 2 ++
 tensorflow/workspace0.bzl                  | 6 +++---
 4 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 414253387ff77e..35e5b9454a68c0 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -291,10 +291,11 @@ common:mkl_aarch64_threadpool -c opt
 # This is an alias for the mkl_aarch64_threadpool build.
 common:mkl_aarch64 --config=mkl_aarch64_threadpool
 
-# Default CUDA, CUDNN and NVSHMEM versions.
+# Default CUDA, CUDNN, NCCL and NVSHMEM versions.
 common:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.5.1"
 common:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.3.0"
 common:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
+common:cuda_version --repo_env=HERMETIC_NCCL_VERSION="2.27.7"
 
 # CUDA: This config refers to building CUDA op kernels with nvcc.
 common:cuda --repo_env TF_NEED_CUDA=1
diff --git a/WORKSPACE b/WORKSPACE
index c7944b7b28c0db..0c4c70e21101bc 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -21,10 +21,10 @@ tf_http_archive(
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
-    strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
+    sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4",
+    strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3",
     urls = tf_mirror_urls(
-        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz",
     ),
 )
 
diff --git a/ci/official/utilities/code_check_full.bats b/ci/official/utilities/code_check_full.bats
index 50488bb0a35f3f..22c315dc3c29bb 100644
--- a/ci/official/utilities/code_check_full.bats
+++ b/ci/official/utilities/code_check_full.bats
@@ -216,6 +216,7 @@ EOF
     --@local_config_cuda//cuda:include_cuda_libs=false \
     --repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
     --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
+    --repo_env=HERMETIC_NCCL_VERSION="2.27.7" \
     "somepath(//tensorflow/tools/pip_package:wheel, " \
     "@local_config_cuda//cuda:cudart + "\
     "@local_config_cuda//cuda:cudart + "\
@@ -240,6 +241,7 @@ EOF
     --@local_config_cuda//cuda:include_cuda_libs=false \
     --repo_env=HERMETIC_CUDA_VERSION="12.3.2" \
     --repo_env=HERMETIC_CUDNN_VERSION="8.9.7.29" \
+    --repo_env=HERMETIC_NCCL_VERSION="2.27.7" \
     --define framework_shared_object=false \
     "somepath(//tensorflow/tools/pip_package:wheel, " \
     "@local_config_cuda//cuda:cudart + "\
diff --git a/tensorflow/workspace0.bzl b/tensorflow/workspace0.bzl
index 3ac7ef7b409c34..005d8552b79300 100644
--- a/tensorflow/workspace0.bzl
+++ b/tensorflow/workspace0.bzl
@@ -108,10 +108,10 @@ def workspace():
     # Details: https://github.com/google-ml-infra/rules_ml_toolchain
     tf_http_archive(
         name = "rules_ml_toolchain",
-        sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
-        strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
+        sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4",
+        strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3",
         urls = tf_mirror_urls(
-            "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
+            "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz",
         ),
     )
 

From ea362136d94765afd0e7321a05da76721565e528 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Fri, 19 Dec 2025 10:43:37 -0800
Subject: [PATCH 587/753] Allow serialization for sync collectives

PiperOrigin-RevId: 846790089
---
 .../backends/gpu/runtime/all_gather_thunk.cc  | 19 +++++++-----
 .../backends/gpu/runtime/all_reduce_thunk.cc  | 19 +++++++-----
 .../gpu/runtime/all_reduce_thunk_test.cc      | 30 +++++++++++++++++++
 .../backends/gpu/runtime/all_to_all_thunk.cc  | 19 +++++++-----
 .../backends/gpu/runtime/collective_thunk.cc  | 18 ++++++-----
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  8 ++---
 6 files changed, 78 insertions(+), 35 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc
index f4d8617abaf50f..c2dfc808d40e07 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_gather_thunk.cc
@@ -121,11 +121,15 @@ AllGatherStartThunk::FromProto(
     buffers.push_back(buffer);
   }
 
-  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
-      async_events_map[AsyncEventsUniqueId{
-          thunk_proto.async_events_unique_id()}];
-  if (!async_events) {
-    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  std::shared_ptr<CollectiveThunk::AsyncEvents> async_events;
+  if (thunk_proto.has_async_events_unique_id()) {
+    std::shared_ptr<CollectiveThunk::AsyncEvents>& events =
+        async_events_map[AsyncEventsUniqueId{
+            thunk_proto.async_events_unique_id()}];
+    if (!events) {
+      events = std::make_shared<CollectiveThunk::AsyncEvents>();
+    }
+    async_events = events;
   }
 
   return std::make_unique<AllGatherStartThunk>(
@@ -142,10 +146,9 @@ absl::StatusOr<ThunkProto> AllGatherStartThunk::ToProto() const {
       proto.mutable_all_gather_start_thunk();
 
   std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
-  if (!async_events_id.has_value()) {
-    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  if (async_events_id.has_value()) {
+    thunk_proto->set_async_events_unique_id(async_events_id->value());
   }
-  thunk_proto->set_async_events_unique_id(async_events_id->value());
 
   for (const Buffer& buffer : buffers_) {
     ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
index 4347670507e220..33f9802818b17f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk.cc
@@ -259,11 +259,15 @@ AllReduceStartThunk::FromProto(
     buffers.push_back(buffer);
   }
 
-  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
-      async_events_map[AsyncEventsUniqueId{
-          thunk_proto.async_events_unique_id()}];
-  if (!async_events) {
-    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  std::shared_ptr<CollectiveThunk::AsyncEvents> async_events;
+  if (thunk_proto.has_async_events_unique_id()) {
+    std::shared_ptr<CollectiveThunk::AsyncEvents>& events =
+        async_events_map[AsyncEventsUniqueId{
+            thunk_proto.async_events_unique_id()}];
+    if (!events) {
+      events = std::make_shared<CollectiveThunk::AsyncEvents>();
+    }
+    async_events = events;
   }
 
   CollectiveConfig config =
@@ -290,10 +294,9 @@ absl::StatusOr<ThunkProto> AllReduceStartThunk::ToProto() const {
       proto.mutable_all_reduce_start_thunk();
 
   std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
-  if (!async_events_id.has_value()) {
-    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  if (async_events_id.has_value()) {
+    thunk_proto->set_async_events_unique_id(async_events_id->value());
   }
-  thunk_proto->set_async_events_unique_id(async_events_id->value());
 
   for (const Buffer& buffer : buffers_) {
     ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc
index 96c258b30d7623..2aa2cc91c0e3f8 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_reduce_thunk_test.cc
@@ -70,5 +70,35 @@ TEST(CollectiveThunkTest, ProtoRoundTrip) {
   EXPECT_THAT(round_trip_proto, EqualsProto(proto));
 }
 
+TEST(CollectiveThunkTest, SyncCollective) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        all_reduce_start_thunk {
+          collective_config {}
+          reduction_kind: 1
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<AllReduceStartThunk> thunk,
+      AllReduceStartThunk::FromProto(thunk_info, proto.all_reduce_start_thunk(),
+                                     buffer_allocations, async_events_map));
+  ASSERT_EQ(thunk->async_events(), nullptr);
+}
+
 }  // namespace
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc
index ddc596e4ec9eb5..8d68160cc8b2be 100644
--- a/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/all_to_all_thunk.cc
@@ -297,11 +297,15 @@ AllToAllStartThunk::FromProto(
     buffers.push_back(buffer);
   }
 
-  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
-      async_events_map[AsyncEventsUniqueId{
-          thunk_proto.async_events_unique_id()}];
-  if (!async_events) {
-    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  std::shared_ptr<CollectiveThunk::AsyncEvents> async_events;
+  if (thunk_proto.has_async_events_unique_id()) {
+    std::shared_ptr<CollectiveThunk::AsyncEvents>& events =
+        async_events_map[AsyncEventsUniqueId{
+            thunk_proto.async_events_unique_id()}];
+    if (!events) {
+      events = std::make_shared<CollectiveThunk::AsyncEvents>();
+    }
+    async_events = events;
   }
 
   CollectiveConfig config =
@@ -320,10 +324,9 @@ absl::StatusOr<ThunkProto> AllToAllStartThunk::ToProto() const {
   AllToAllStartThunkProto* thunk_proto = proto.mutable_all_to_all_start_thunk();
 
   std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
-  if (!async_events_id.has_value()) {
-    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  if (async_events_id.has_value()) {
+    thunk_proto->set_async_events_unique_id(async_events_id->value());
   }
-  thunk_proto->set_async_events_unique_id(async_events_id->value());
 
   for (const Buffer& buffer : buffers_) {
     ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
index c421e0323641d8..8ddff57556d6b8 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
@@ -539,8 +539,8 @@ absl::StatusOr<ThunkProto> CollectiveDoneThunk::ToProto() const {
   thunk_proto->set_async_stream_kind(stream_kind_);
 
   std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
-  if (!async_events_id.has_value()) {
-    return absl::FailedPreconditionError("AsyncEvents is not set.");
+  if (async_events_id.has_value()) {
+    thunk_proto->set_async_events_unique_id(async_events_id->value());
   }
   thunk_proto->set_async_events_unique_id(async_events_id->value());
   thunk_proto->set_thunk_kind(Thunk::KindToProto(kind()));
@@ -551,11 +551,15 @@ absl::StatusOr<std::unique_ptr<CollectiveDoneThunk>>
 CollectiveDoneThunk::FromProto(
     ThunkInfo thunk_info, const CollectiveDoneThunkProto& thunk_proto,
     CollectiveThunk::AsyncEventsMap& async_events_map) {
-  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
-      async_events_map[AsyncEventsUniqueId{
-          thunk_proto.async_events_unique_id()}];
-  if (!async_events) {
-    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  std::shared_ptr<CollectiveThunk::AsyncEvents> async_events;
+  if (thunk_proto.has_async_events_unique_id()) {
+    std::shared_ptr<CollectiveThunk::AsyncEvents>& events =
+        async_events_map[AsyncEventsUniqueId{
+            thunk_proto.async_events_unique_id()}];
+    if (!events) {
+      events = std::make_shared<CollectiveThunk::AsyncEvents>();
+    }
+    async_events = events;
   }
 
   ASSIGN_OR_RETURN(Thunk::Kind kind,
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index ae3ae5022909ae..d7af07c98214dd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -400,7 +400,7 @@ message CollectiveThunkProto {
 }
 
 message AllGatherStartThunkProto {
-  uint64 async_events_unique_id = 1;
+  optional uint64 async_events_unique_id = 1;
   CollectiveConfigProto collective_config = 2;
   repeated CollectiveBufferProto buffers = 3;
 }
@@ -414,7 +414,7 @@ enum ReductionKindProto {
 }
 
 message AllReduceStartThunkProto {
-  uint64 async_events_unique_id = 1;
+  optional uint64 async_events_unique_id = 1;
   CollectiveConfigProto collective_config = 2;
   repeated CollectiveBufferProto buffers = 3;
 
@@ -427,7 +427,7 @@ message AllReduceStartThunkProto {
 }
 
 message AllToAllStartThunkProto {
-  uint64 async_events_unique_id = 1;
+  optional uint64 async_events_unique_id = 1;
   CollectiveConfigProto collective_config = 2;
   repeated CollectiveBufferProto buffers = 3;
 
@@ -438,7 +438,7 @@ message AllToAllStartThunkProto {
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
-  uint64 async_events_unique_id = 3;
+  optional uint64 async_events_unique_id = 3;
 }
 
 message ThunkProto {

From a3fad5d03e2b1d6738a171f98f5b69e1103f05a6 Mon Sep 17 00:00:00 2001
From: Yulia Baturina <ybaturina@google.com>
Date: Fri, 19 Dec 2025 10:49:24 -0800
Subject: [PATCH 588/753] Enable using custom hermetic NCCL version.

The NCCL version can be chosen via `HERMETIC_NCCL_VERSION` env var.

See docs [here](https://github.com/google-ml-infra/rules_ml_toolchain/blob/main/gpu/README.md#environment-variables-controlling-the-hermetic-cudacudnnnvshmem-versions).

PiperOrigin-RevId: 846792626
---
 third_party/xla/MODULE.bazel       | 6 +++---
 third_party/xla/WORKSPACE          | 6 +++---
 third_party/xla/tensorflow.bazelrc | 3 ++-
 third_party/xla/workspace0.bzl     | 6 +++---
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/third_party/xla/MODULE.bazel b/third_party/xla/MODULE.bazel
index defeb37f2e1b10..114f1dd36f5315 100644
--- a/third_party/xla/MODULE.bazel
+++ b/third_party/xla/MODULE.bazel
@@ -45,9 +45,9 @@ bazel_dep(name = "rules_ml_toolchain")
 # echo "sha256-${HASH}"
 archive_override(
     module_name = "rules_ml_toolchain",
-    integrity = "sha256-U5Be3lDj7rx4ImbiDpuawdcWbvaLh3vqWT02ANz+A+Y=",
-    strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
-    urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz"],
+    integrity = "sha256-HCxTCgVOnos8gR7CHtimh/yGW+w6u8j/Zb64KbHWeuQ=",
+    strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3",
+    urls = ["https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz"],
 )
 
 # TODO: Upstream the patch?
diff --git a/third_party/xla/WORKSPACE b/third_party/xla/WORKSPACE
index e5e460105a164c..29e65b3afcc430 100644
--- a/third_party/xla/WORKSPACE
+++ b/third_party/xla/WORKSPACE
@@ -9,10 +9,10 @@ load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 # Details: https://github.com/google-ml-infra/rules_ml_toolchain
 tf_http_archive(
     name = "rules_ml_toolchain",
-    sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
-    strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
+    sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4",
+    strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3",
     urls = tf_mirror_urls(
-        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
+        "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz",
     ),
 )
 
diff --git a/third_party/xla/tensorflow.bazelrc b/third_party/xla/tensorflow.bazelrc
index c933ef4fb1072e..38a7ef005f800d 100644
--- a/third_party/xla/tensorflow.bazelrc
+++ b/third_party/xla/tensorflow.bazelrc
@@ -189,10 +189,11 @@ common:mkl_aarch64_threadpool -c opt
 # This is an alias for the mkl_aarch64_threadpool build.
 common:mkl_aarch64 --config=mkl_aarch64_threadpool
 
-# Default CUDA, CUDNN and NVSHMEM versions.
+# Default CUDA, CUDNN, NCCL and NVSHMEM versions.
 common:cuda_version --repo_env=HERMETIC_CUDA_VERSION="12.9.1"
 common:cuda_version --repo_env=HERMETIC_CUDNN_VERSION="9.8.0"
 common:cuda_version --repo_env=HERMETIC_NVSHMEM_VERSION="3.2.5"
+common:cuda_version --repo_env=HERMETIC_NCCL_VERSION="2.27.7"
 
 # CUDA: This config refers to building CUDA op kernels with nvcc.
 common:cuda --repo_env TF_NEED_CUDA=1
diff --git a/third_party/xla/workspace0.bzl b/third_party/xla/workspace0.bzl
index 52e509da92e800..fa0212dfcea2eb 100644
--- a/third_party/xla/workspace0.bzl
+++ b/third_party/xla/workspace0.bzl
@@ -140,10 +140,10 @@ def workspace():
     if "rules_ml_toolchain" not in native.existing_rules():
         tf_http_archive(
             name = "rules_ml_toolchain",
-            sha256 = "53905ede50e3eebc782266e20e9b9ac1d7166ef68b877bea593d3600dcfe03e6",
-            strip_prefix = "rules_ml_toolchain-a1ff84835e407b41eef5fd1a865a23748c294db6",
+            sha256 = "1c2c530a054e9e8b3c811ec21ed8a687fc865bec3abbc8ff65beb829b1d67ae4",
+            strip_prefix = "rules_ml_toolchain-6734d2a174bf29e731d3f473743d1cc1a86100c3",
             urls = tf_mirror_urls(
-                "https://github.com/google-ml-infra/rules_ml_toolchain/archive/a1ff84835e407b41eef5fd1a865a23748c294db6.tar.gz",
+                "https://github.com/google-ml-infra/rules_ml_toolchain/archive/6734d2a174bf29e731d3f473743d1cc1a86100c3.tar.gz",
             ),
         )
 

From c698eecb5ab1f33bab10598b8a07211c80cca5ff Mon Sep 17 00:00:00 2001
From: Kevin Gleason <gleasonk@google.com>
Date: Fri, 19 Dec 2025 10:54:33 -0800
Subject: [PATCH 589/753] [StableHLO] Update CHLO broadcast ops to use
 StableHLO bounded-dynamism-aware broadcasting for type inference.

PiperOrigin-RevId: 846795035
---
 .../xla/third_party/stablehlo/temporary.patch | 560 ++++++++++++++++++
 1 file changed, 560 insertions(+)

diff --git a/third_party/xla/third_party/stablehlo/temporary.patch b/third_party/xla/third_party/stablehlo/temporary.patch
index 8b137891791fe9..71f4c67bb1848b 100755
--- a/third_party/xla/third_party/stablehlo/temporary.patch
+++ b/third_party/xla/third_party/stablehlo/temporary.patch
@@ -1 +1,561 @@
+diff --ruN a/stablehlo/BUILD.bazel b/stablehlo/BUILD.bazel
+--- stablehlo/BUILD.bazel
++++ stablehlo/BUILD.bazel
+@@ -257,6 +257,7 @@
+         ":chlo_enums_inc_gen",
+         ":chlo_ops_inc_gen",
+         ":stablehlo_assembly_format",
++        ":stablehlo_broadcast_lowering",
+         ":stablehlo_type_inference",
+         "@llvm-project//llvm:Support",
+         "@llvm-project//mlir:BytecodeOpInterface",
+diff --ruN a/stablehlo/stablehlo/dialect/Base.cpp b/stablehlo/stablehlo/dialect/Base.cpp
+--- stablehlo/stablehlo/dialect/Base.cpp
++++ stablehlo/stablehlo/dialect/Base.cpp
+@@ -25,7 +25,6 @@
+ #include <utility>
+ 
+ #include "llvm/ADT/APInt.h"
+-#include "llvm/ADT/Hashing.h"
+ #include "llvm/ADT/STLExtras.h"
+ #include "llvm/ADT/Sequence.h"
+ #include "llvm/ADT/SmallVector.h"
+@@ -47,7 +46,6 @@
+ #include "mlir/Interfaces/SideEffectInterfaces.h"
+ #include "mlir/Support/LLVM.h"
+ #include "mlir/Support/LogicalResult.h"
+-#include "mlir/Support/TypeID.h"
+ 
+ // Include order matters
+ #include "stablehlo/dialect/BaseAttrInterfaces.cpp.inc"
+@@ -246,7 +244,7 @@
+   if (boundsLen != rank)
+     return emitError() << "Bounds length is " << boundsLen
+                        << ", expected to be equal to rank(" << rank
+-                       << ") of the tensor";
++                       << ") of the tensor " << type;
+ 
+   for (int64_t dim = 0; dim < rank; ++dim) {
+     int64_t bound = bounds[dim];
+@@ -254,7 +252,8 @@
+     if (bound != ShapedType::kDynamic && dimSize != ShapedType::kDynamic)
+       return emitError() << "Static dimension " << dim
+                          << " cannot have a bound, use ShapedType::kDynamic to "
+-                            "indicate a missing bound";
++                            "indicate a missing bound in tensor "
++                         << type;
+   }
+ 
+   return success();
+diff --ruN a/stablehlo/stablehlo/dialect/Base.h b/stablehlo/stablehlo/dialect/Base.h
+--- stablehlo/stablehlo/dialect/Base.h
++++ stablehlo/stablehlo/dialect/Base.h
+@@ -486,9 +486,12 @@
+                                 inferredReturnTypes)))
+       return failure();
+     if (inferredReturnTypes.size() != 1) return failure();
+-    auto inferredReturnType = dyn_cast<ShapedType>(inferredReturnTypes[0]);
++    auto inferredReturnType =
++        dyn_cast<RankedTensorType>(inferredReturnTypes[0]);
+     if (!inferredReturnType) return failure();
+-    inferredReturnShapes.push_back(inferredReturnType);
++    inferredReturnShapes.emplace_back(inferredReturnType.getShape(),
++                                      inferredReturnType.getElementType(),
++                                      inferredReturnType.getEncoding());
+     return success();
+   }
+ };
+diff --ruN a/stablehlo/stablehlo/dialect/ChloOps.cpp b/stablehlo/stablehlo/dialect/ChloOps.cpp
+--- stablehlo/stablehlo/dialect/ChloOps.cpp
++++ stablehlo/stablehlo/dialect/ChloOps.cpp
+@@ -19,14 +19,14 @@
+ #include <algorithm>
+ #include <cassert>
+ #include <cstdint>
+-#include <iostream>
+ #include <iterator>
+ #include <optional>
+ #include <string>
+ 
+ #include "llvm/ADT/STLExtras.h"
+ #include "llvm/ADT/SmallVector.h"
+-#include "llvm/ADT/TypeSwitch.h"
++#include "llvm/ADT/TypeSwitch.h"  // IWYU pragma: keep
++#include "llvm/Support/ErrorHandling.h"
+ #include "mlir/Dialect/Complex/IR/Complex.h"
+ #include "mlir/Dialect/Traits.h"
+ #include "mlir/IR/Attributes.h"
+@@ -51,6 +51,7 @@
+ #include "stablehlo/dialect/BroadcastUtils.h"
+ #include "stablehlo/dialect/ChloBytecode.h"
+ #include "stablehlo/dialect/TypeInference.h"
++#include "stablehlo/transforms/StablehloBroadcastLowering.h"
+ 
+ // Include order matters
+ #include "stablehlo/dialect/ChloEnums.cpp.inc"
+@@ -104,54 +105,95 @@
+ //===----------------------------------------------------------------------===//
+ 
+ namespace {
++
++bool isStaticOrBoundedDynamicTensor(RankedTensorType type) {
++  return type.hasStaticShape() || hlo::isBoundedDynamic(type);
++}
++
+ // Gets the resulting type from a broadcast between two types.
+-ShapedTypeComponents getBroadcastType(
+-    Type x, Type y, Type elementType,
+-    std::optional<ArrayRef<int64_t>> broadcastDimensionsAttr) {
+-  auto xRanked = dyn_cast<RankedTensorType>(x);
+-  auto yRanked = dyn_cast<RankedTensorType>(y);
+-  if (!xRanked || !yRanked) return {elementType};
+-
+-  auto shapeX = xRanked.getShape();
+-  auto shapeY = yRanked.getShape();
+-
+-  // If no broadcast dimensions, assume "numpy" broadcasting.
+-  if (shapeX.size() == shapeY.size() || !broadcastDimensionsAttr.has_value()) {
+-    llvm::SmallVector<int64_t, 4> outShape;
+-    if (!mlir::OpTrait::util::getBroadcastedShape(shapeX, shapeY, outShape)) {
++ShapedTypeComponents getNumpyBroadcastType(ArrayRef<Value> operands,
++                                           Type elementType) {
++  if (operands.empty())
++    llvm::report_fatal_error("Called getNumpyBroadcastType with no operands");
++
++  // Handle unranked tensors
++  if (llvm::any_of(operands,
++                   [](Value v) { return !isa<RankedTensorType>(v.getType()); }))
++    return {elementType};
++
++  // All static or bounded, use bounded dynamic aware broadcasting.
++  bool allStaticOrBounded = llvm::all_of(operands, [](Value v) {
++    return isStaticOrBoundedDynamicTensor(cast<RankedTensorType>(v.getType()));
++  });
++  if (allStaticOrBounded) {
++    Location errorLoc = operands[0].getLoc();
++    FailureOr<stablehlo::Dimensions> outShape =
++        stablehlo::getNumpyBroadcastShape(errorLoc, operands);
++    if (failed(outShape)) {
+       // Signal illegal broadcast_dimensions as unranked.
+       return {elementType};
+     }
+-    return {outShape, elementType};
+-  }
+-
+-  auto shapeLarge = shapeX.size() > shapeY.size() ? shapeX : shapeY;
+-  auto shapeSmall = shapeX.size() <= shapeY.size() ? shapeX : shapeY;
++    RankedTensorType outType =
++        stablehlo::getRankedTensorType(*outShape, elementType);
++    return {outType.getShape(), outType.getElementType(),
++            outType.getEncoding()};
++  }
++
++  // Fall back to non-bounded dynamic aware broadcasting
++  // Will pick more lenient output shapes `x . ? => ?`
++  llvm::SmallVector<int64_t, 4> outShape =
++      llvm::to_vector(cast<RankedTensorType>(operands[0].getType()).getShape());
++  for (Value operand : operands) {
++    // Make a copy of current shape since `getBroadcastedShape` will modify it.
++    llvm::SmallVector<int64_t, 4> currentShape = outShape;
++    auto operandShape = cast<RankedTensorType>(operand.getType()).getShape();
++    if (!mlir::OpTrait::util::getBroadcastedShape(currentShape, operandShape,
++                                                  outShape)) {
++      return {elementType};
++    }
++  }
++  return {outShape, elementType};
++}
++
++ShapedTypeComponents getBroadcastTypeWithBroadcastDimensions(
++    Value x, Value y, Type elementType,
++    std::optional<ArrayRef<int64_t>> broadcastDimensionsAttr) {
++  if (!broadcastDimensionsAttr.has_value())
++    return getNumpyBroadcastType({x, y}, elementType);
++
++  // Only support two operands if broadcast_dimensions is specified.
++  auto shapeX = dyn_cast<RankedTensorType>(x.getType());
++  auto shapeY = dyn_cast<RankedTensorType>(y.getType());
++
++  // Handle unranked tensors
++  if (!shapeX || !shapeY) return {elementType};
++
++  auto shapeLarge = shapeX.getRank() > shapeY.getRank() ? shapeX : shapeY;
++  auto shapeSmall = shapeX.getRank() <= shapeY.getRank() ? shapeX : shapeY;
+ 
+   auto broadcastDimensions = broadcastDimensionsAttr.value();
+-  if (broadcastDimensions.size() != shapeSmall.size()) {
++  if (broadcastDimensions.size() != shapeSmall.getRank()) {
+     // Signal illegal broadcast_dimensions as unranked.
+     return {elementType};
+   }
+   llvm::SmallVector<int64_t, 4> shapeLargeFiltered;
+-  shapeLargeFiltered.reserve(shapeSmall.size());
++  shapeLargeFiltered.reserve(shapeSmall.getRank());
+   for (const auto& dim : broadcastDimensions) {
+-    if (dim >= static_cast<int64_t>(shapeLarge.size())) return {elementType};
+-    shapeLargeFiltered.push_back(shapeLarge[dim]);
++    if (dim >= static_cast<int64_t>(shapeLarge.getRank())) return {elementType};
++    shapeLargeFiltered.push_back(shapeLarge.getDimSize(dim));
+   }
+   llvm::SmallVector<int64_t, 4> outShapeFiltered;
+-  if (!mlir::OpTrait::util::getBroadcastedShape(shapeSmall, shapeLargeFiltered,
+-                                                outShapeFiltered))
++  if (!mlir::OpTrait::util::getBroadcastedShape(
++          shapeSmall.getShape(), shapeLargeFiltered, outShapeFiltered))
+     // Signal illegal broadcast_dimensions as unranked.
+     return {elementType};
+ 
+   // Update according to the broadcast dimensions.
+-  llvm::SmallVector<int64_t, 4> outShape(shapeLarge.begin(), shapeLarge.end());
++  llvm::SmallVector<int64_t, 4> outShape(shapeLarge.getShape());
+   for (const auto& indexPair : llvm::enumerate(broadcastDimensions)) {
+     auto newValue = outShapeFiltered[indexPair.index()];
+     outShape[indexPair.value()] = newValue;
+   }
+-
+   return {outShape, elementType};
+ }
+ 
+@@ -160,6 +202,7 @@
+     DictionaryAttr attributes, OpaqueProperties properties,
+     std::optional<ArrayRef<int64_t>> broadcastDimensions, Type elementType,
+     SmallVectorImpl<ShapedTypeComponents>& inferredReturnShapes) {
++  // Handle unranked.
+   ShapedType lhsType = cast<ShapedType>(operands[0].getType());
+   ShapedType rhsType = cast<ShapedType>(operands[1].getType());
+   if (!lhsType || !rhsType ||
+@@ -167,8 +210,8 @@
+           lhsType.getElementType(), rhsType.getElementType()))
+     return emitOptionalError(location, "mismatched operand types");
+   if (!elementType) elementType = lhsType.getElementType();
+-  inferredReturnShapes.push_back(
+-      getBroadcastType(lhsType, rhsType, elementType, broadcastDimensions));
++  inferredReturnShapes.push_back(getBroadcastTypeWithBroadcastDimensions(
++      operands[0], operands[1], elementType, broadcastDimensions));
+   return success();
+ }
+ 
+@@ -397,7 +440,6 @@
+     DictionaryAttr, OpaqueProperties, RegionRange,
+     SmallVectorImpl<ShapedTypeComponents>& inferredReturnShapes) {
+   BroadcastSelectOp::Adaptor op(operands.getValues());
+-  auto predType = cast<ShapedType>(op.getPred().getType());
+   auto onTrueType = cast<ShapedType>(op.getOnTrue().getType());
+   auto onFalseType = cast<ShapedType>(op.getOnFalse().getType());
+ 
+@@ -407,12 +449,8 @@
+   Type elementType = onTrueType.getElementType();
+ 
+   // Compute the result shape as two binary broadcasts.
+-  ShapedTypeComponents& components = inferredReturnShapes.emplace_back(
+-      getBroadcastType(onTrueType, onFalseType, elementType, std::nullopt));
+-  if (components.hasRank())
+-    components = getBroadcastType(
+-        RankedTensorType::get(components.getDims(), elementType), predType,
+-        elementType, std::nullopt);
++  inferredReturnShapes.emplace_back(
++      getNumpyBroadcastType(llvm::to_vector(op.getOperands()), elementType));
+   return success();
+ }
+ 
+diff --ruN a/stablehlo/stablehlo/dialect/StablehloOps.cpp b/stablehlo/stablehlo/dialect/StablehloOps.cpp
+--- stablehlo/stablehlo/dialect/StablehloOps.cpp
++++ stablehlo/stablehlo/dialect/StablehloOps.cpp
+@@ -1569,7 +1569,8 @@
+ void ConvertOp::build(OpBuilder& builder, OperationState& result, Value operand,
+                       Type resultElementTy) {
+   auto rankedTy = cast<RankedTensorType>(operand.getType());
+-  auto resultTy = RankedTensorType::get(rankedTy.getShape(), resultElementTy);
++  auto resultTy = RankedTensorType::get(rankedTy.getShape(), resultElementTy,
++                                        rankedTy.getEncoding());
+   build(builder, result, resultTy, operand);
+ }
+ 
+diff --ruN a/stablehlo/stablehlo/dialect/TypeInference.cpp b/stablehlo/stablehlo/dialect/TypeInference.cpp
+--- stablehlo/stablehlo/dialect/TypeInference.cpp
++++ stablehlo/stablehlo/dialect/TypeInference.cpp
+@@ -2013,12 +2013,12 @@
+     MLIRContext* context, std::optional<Location>, Value lhs,
+     SmallVectorImpl<ShapedTypeComponents>& inferredReturnShapes) {
+   // compare_c1
+-  ShapedTypeComponents& components =
+-      inferredReturnShapes.emplace_back(IntegerType::get(context, /*width=*/1));
+-  auto argTy = cast<ShapedType>(lhs.getType());
++  ShapedTypeComponents& components = inferredReturnShapes.emplace_back();
++  auto argTy = cast<RankedTensorType>(lhs.getType());
++  auto resElementTy = IntegerType::get(context, /*width=*/1);
+   // compare_c2
+   components =
+-      ShapedTypeComponents(argTy.getShape(), components.getElementType());
++      ShapedTypeComponents(argTy.getShape(), resElementTy, argTy.getEncoding());
+   return success();
+ }
+ 
+@@ -2119,9 +2119,10 @@
+ LogicalResult inferConvertOp(
+     std::optional<Location> location, Value operand,
+     SmallVectorImpl<ShapedTypeComponents>& inferredReturnShapes) {
+-  auto operandType = cast<ShapedType>(operand.getType());
++  auto operandType = cast<RankedTensorType>(operand.getType());
+   // convert_c1
+-  inferredReturnShapes.emplace_back(operandType.getShape());
++  inferredReturnShapes.emplace_back(operandType.getShape(), nullptr,
++                                    operandType.getEncoding());
+   return success();
+ }
+ 
+diff --ruN a/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir b/stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
+--- stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
++++ stablehlo/stablehlo/tests/chlo/chlo_legalize_to_stablehlo.mlir
+@@ -3913,6 +3913,149 @@
+ 
+ // -----
+ 
++!bounded_type = tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK-LABEL:   func.func @erf_inv_bounded(
++// CHECK-SAME:      %[[ARG0:.*]]: tensor<?x16xf32, #stablehlo.bounds<16, ?>>) {
++// CHECK:           %[[NEGATE_0:.*]] = stablehlo.negate %[[ARG0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_0:.*]] = stablehlo.multiply %[[ARG0]], %[[NEGATE_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[LOG_PLUS_ONE_0:.*]] = stablehlo.log_plus_one %[[MULTIPLY_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[NEGATE_1:.*]] = stablehlo.negate %[[LOG_PLUS_ONE_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_0:.*]] = stablehlo.constant dense<5.000000e+00> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_0:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_0]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_0:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_0:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_0]], %[[GET_DIMENSION_SIZE_0]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[COMPARE_0:.*]] = stablehlo.compare  LT, %[[NEGATE_1]], %[[SET_DIMENSION_SIZE_0]] : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<?x16xi1, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_1:.*]] = stablehlo.constant dense<2.500000e+00> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_1:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_1]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_1:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_1:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_1]], %[[GET_DIMENSION_SIZE_1]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SUBTRACT_0:.*]] = stablehlo.subtract %[[NEGATE_1]], %[[SET_DIMENSION_SIZE_1]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SQRT_0:.*]] = stablehlo.sqrt %[[NEGATE_1]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_2:.*]] = stablehlo.constant dense<3.000000e+00> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_2:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_2]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_2:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_2:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_2]], %[[GET_DIMENSION_SIZE_2]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SUBTRACT_1:.*]] = stablehlo.subtract %[[SQRT_0]], %[[SET_DIMENSION_SIZE_2]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_0:.*]] = stablehlo.select %[[COMPARE_0]], %[[SUBTRACT_0]], %[[SUBTRACT_1]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_3:.*]] = stablehlo.constant dense<2.81022636E-8> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_3:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_3]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_3:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_3:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_3]], %[[GET_DIMENSION_SIZE_3]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_4:.*]] = stablehlo.constant dense<-2.00214257E-4> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_4:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_4]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_4:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_4:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_4]], %[[GET_DIMENSION_SIZE_4]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_1:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_3]], %[[SET_DIMENSION_SIZE_4]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_5:.*]] = stablehlo.constant dense<3.43273939E-7> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_5:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_5]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_5:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_5:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_5]], %[[GET_DIMENSION_SIZE_5]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_6:.*]] = stablehlo.constant dense<1.00950558E-4> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_6:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_6]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_6:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_6:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_6]], %[[GET_DIMENSION_SIZE_6]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_2:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_5]], %[[SET_DIMENSION_SIZE_6]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_1:.*]] = stablehlo.multiply %[[SELECT_1]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_0:.*]] = stablehlo.add %[[SELECT_2]], %[[MULTIPLY_1]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_7:.*]] = stablehlo.constant dense<-3.5233877E-6> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_7:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_7]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_7:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_7:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_7]], %[[GET_DIMENSION_SIZE_7]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_8:.*]] = stablehlo.constant dense<0.00134934322> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_8:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_8]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_8:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_8:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_8]], %[[GET_DIMENSION_SIZE_8]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_3:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_7]], %[[SET_DIMENSION_SIZE_8]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_2:.*]] = stablehlo.multiply %[[ADD_0]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_1:.*]] = stablehlo.add %[[SELECT_3]], %[[MULTIPLY_2]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_9:.*]] = stablehlo.constant dense<-4.39150654E-6> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_9:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_9]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_9:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_9:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_9]], %[[GET_DIMENSION_SIZE_9]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_10:.*]] = stablehlo.constant dense<-0.00367342844> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_10:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_10]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_10:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_10:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_10]], %[[GET_DIMENSION_SIZE_10]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_4:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_9]], %[[SET_DIMENSION_SIZE_10]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_3:.*]] = stablehlo.multiply %[[ADD_1]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_2:.*]] = stablehlo.add %[[SELECT_4]], %[[MULTIPLY_3]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_11:.*]] = stablehlo.constant dense<2.1858087E-4> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_11:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_11]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_11:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_11:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_11]], %[[GET_DIMENSION_SIZE_11]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_12:.*]] = stablehlo.constant dense<0.00573950773> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_12:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_12]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_12:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_12:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_12]], %[[GET_DIMENSION_SIZE_12]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_5:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_11]], %[[SET_DIMENSION_SIZE_12]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_4:.*]] = stablehlo.multiply %[[ADD_2]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_3:.*]] = stablehlo.add %[[SELECT_5]], %[[MULTIPLY_4]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_13:.*]] = stablehlo.constant dense<-0.00125372503> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_13:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_13]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_13:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_13:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_13]], %[[GET_DIMENSION_SIZE_13]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_14:.*]] = stablehlo.constant dense<-0.0076224613> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_14:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_14]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_14:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_14:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_14]], %[[GET_DIMENSION_SIZE_14]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_6:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_13]], %[[SET_DIMENSION_SIZE_14]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_5:.*]] = stablehlo.multiply %[[ADD_3]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_4:.*]] = stablehlo.add %[[SELECT_6]], %[[MULTIPLY_5]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_15:.*]] = stablehlo.constant dense<-0.00417768164> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_15:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_15]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_15:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_15:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_15]], %[[GET_DIMENSION_SIZE_15]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_16:.*]] = stablehlo.constant dense<0.00943887047> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_16:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_16]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_16:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_16:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_16]], %[[GET_DIMENSION_SIZE_16]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_7:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_15]], %[[SET_DIMENSION_SIZE_16]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_6:.*]] = stablehlo.multiply %[[ADD_4]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_5:.*]] = stablehlo.add %[[SELECT_7]], %[[MULTIPLY_6]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_17:.*]] = stablehlo.constant dense<0.246640727> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_17:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_17]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_17:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_17:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_17]], %[[GET_DIMENSION_SIZE_17]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_18:.*]] = stablehlo.constant dense<1.00167406> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_18:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_18]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_18:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_18:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_18]], %[[GET_DIMENSION_SIZE_18]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_8:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_17]], %[[SET_DIMENSION_SIZE_18]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_7:.*]] = stablehlo.multiply %[[ADD_5]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_6:.*]] = stablehlo.add %[[SELECT_8]], %[[MULTIPLY_7]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_19:.*]] = stablehlo.constant dense<1.50140941> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_19:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_19]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_19:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_19:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_19]], %[[GET_DIMENSION_SIZE_19]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_20:.*]] = stablehlo.constant dense<2.83297682> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_20:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_20]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_20:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_20:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_20]], %[[GET_DIMENSION_SIZE_20]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_9:.*]] = stablehlo.select %[[COMPARE_0]], %[[SET_DIMENSION_SIZE_19]], %[[SET_DIMENSION_SIZE_20]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_8:.*]] = stablehlo.multiply %[[ADD_6]], %[[SELECT_0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ADD_7:.*]] = stablehlo.add %[[SELECT_9]], %[[MULTIPLY_8]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_9:.*]] = stablehlo.multiply %[[ADD_7]], %[[ARG0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[ABS_0:.*]] = stablehlo.abs %[[ARG0]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_21:.*]] = stablehlo.constant dense<1.000000e+00> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_21:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_21]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_21:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_21:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_21]], %[[GET_DIMENSION_SIZE_21]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[COMPARE_1:.*]] = stablehlo.compare  EQ, %[[ABS_0]], %[[SET_DIMENSION_SIZE_21]] : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<?x16xi1, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[CONSTANT_22:.*]] = stablehlo.constant dense<0x7F800000> : tensor<f32>
++// CHECK:           %[[BROADCAST_IN_DIM_22:.*]] = stablehlo.broadcast_in_dim %[[CONSTANT_22]], dims = [] : (tensor<f32>) -> tensor<16x16xf32>
++// CHECK:           %[[GET_DIMENSION_SIZE_22:.*]] = stablehlo.get_dimension_size %[[ARG0]], dim = 0 : (tensor<?x16xf32, #stablehlo.bounds<16, ?>>) -> tensor<i32>
++// CHECK:           %[[SET_DIMENSION_SIZE_22:.*]] = stablehlo.set_dimension_size %[[BROADCAST_IN_DIM_22]], %[[GET_DIMENSION_SIZE_22]], dim = 0 : (tensor<16x16xf32>, tensor<i32>) -> tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[MULTIPLY_10:.*]] = stablehlo.multiply %[[ARG0]], %[[SET_DIMENSION_SIZE_22]] : tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           %[[SELECT_10:.*]] = stablehlo.select %[[COMPARE_1]], %[[MULTIPLY_10]], %[[MULTIPLY_9]] : tensor<?x16xi1, #stablehlo.bounds<16, ?>>, tensor<?x16xf32, #stablehlo.bounds<16, ?>>
++// CHECK:           return
++// CHECK:         }
++func.func @erf_inv_bounded(%arg0 : !bounded_type) {
++  %0 = chlo.erf_inv %arg0 : !bounded_type -> !bounded_type
++  return
++}
++
++// -----
++
+ // CHECK-LABEL:   @square_complex_f32(
+ // CHECK-SAME:                                  %[[VAL_0:.*]]: tensor<complex<f32>>) -> tensor<complex<f32>> {
+ // CHECK:           %[[VAL_1:.*]] = stablehlo.real %[[VAL_0]] : (tensor<complex<f32>>) -> tensor<f32>
+diff --ruN a/stablehlo/stablehlo/tests/infer_chlo.mlir b/stablehlo/stablehlo/tests/infer_chlo.mlir
+--- stablehlo/stablehlo/tests/infer_chlo.mlir
++++ stablehlo/stablehlo/tests/infer_chlo.mlir
+@@ -239,3 +239,41 @@
+   %r17 = "hlo_test_infer.get_return_types"(%17) : (tensor<2xf32>) -> tensor<2xf32>
+   func.return %r17 : tensor<2xf32>
+ }
++
++// -----
++
++/////
++// Bounded dynamic
++
++// [<=10] x [1] => [<=10]
++// CHECK-LABEL: @bounded_dynamic_broadcast_scalar
++func.func @bounded_dynamic_broadcast_scalar(%arg0: tensor<?xf64, #stablehlo.bounds<10>>, %arg1: tensor<f64>) -> tensor<?xf64, #stablehlo.bounds<10>> {
++  %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<?xf64, #stablehlo.bounds<10>>, tensor<f64>) -> tensor<?xf64, #stablehlo.bounds<10>>
++  // CHECK: types0 = tensor<?xf64, #stablehlo.bounds<10>>
++  %1 = "hlo_test_infer.get_return_types"(%0) : (tensor<?xf64, #stablehlo.bounds<10>>) -> tensor<?xf64, #stablehlo.bounds<10>>
++  return %1 : tensor<?xf64, #stablehlo.bounds<10>>
++}
++
++// -----
++
++// [<=10] x [?] => [?]
++// CHECK-LABEL: @bounded_dynamic_broadcast_unbounded
++!bounded_type = tensor<?xf64, #stablehlo.bounds<10>>
++!unbounded_type = tensor<?xf64>
++func.func @bounded_dynamic_broadcast_unbounded(%arg0: !bounded_type, %arg1: !unbounded_type) -> !unbounded_type {
++  %0 = chlo.broadcast_add %arg0, %arg1 : (!bounded_type, !unbounded_type) -> !unbounded_type
++  // CHECK: types0 = tensor<?xf64>
++  %1 = "hlo_test_infer.get_return_types"(%0) : (!unbounded_type) -> !unbounded_type
++  return %1 : !unbounded_type
++}
++
++// -----
++
++// CHECK-LABEL: @broadcast_select_types_bounded
++!bounded_type = tensor<?xf64, #stablehlo.bounds<10>>
++func.func @broadcast_select_types_bounded(%arg0: tensor<i1>, %arg1: !bounded_type, %arg2: !bounded_type) -> !bounded_type {
++  %0 = "chlo.broadcast_select"(%arg0, %arg1, %arg2) : (tensor<i1>, !bounded_type, !bounded_type) -> !bounded_type
++  // CHECK: types0 = tensor<?xf64, #stablehlo.bounds<10>>
++  %1 = "hlo_test_infer.get_return_types"(%0) : (!bounded_type) -> !bounded_type
++  return %1: !bounded_type
++}
+diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp b/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
+--- stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
++++ stablehlo/stablehlo/transforms/StablehloBroadcastLowering.cpp
+@@ -92,7 +92,6 @@
+ 
+     // If both LHS and RHS are not 1, dim size must match.
+     if (dim_a.size != dim_b.size) {
+-      // FIXME
+       return emitError(op.getLoc(), "incompatible shapes for broadcasting ")
+              << dim_a.size << " and " << dim_b.size;
+     }
+@@ -157,11 +156,10 @@
+   return mlir::RankedTensorType::get(shape, element_type, encoding);
+ }
+ 
+-FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
++FailureOr<Dimensions> getNumpyBroadcastShape(Location loc,
+                                              ArrayRef<Value> ops) {
+   if (ops.empty())
+-    return emitError(builder.getInsertionPoint()->getLoc(),
+-                     "requires at least one operand to broadcast");
++    return emitError(loc, "requires at least one operand to broadcast");
+ 
+   Value first = ops[0];
+   auto bcastShapeOrFail = getDimensions(first);
+@@ -197,7 +195,8 @@
+ FailureOr<SmallVector<Value>> numpyBroadcastIfNeeded(OpBuilder& builder,
+                                                      ArrayRef<Value> operands) {
+   // Figure out the broadcast shape
+-  auto bcastShapeOrFail = getNumpyBroadcastShape(builder, operands);
++  auto errLoc = builder.getInsertionPoint()->getLoc();
++  auto bcastShapeOrFail = getNumpyBroadcastShape(errLoc, operands);
+   if (failed(bcastShapeOrFail)) return failure();
+   Dimensions bcastShape = std::move(*bcastShapeOrFail);
+ 
+diff --ruN a/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h b/stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
+--- stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
++++ stablehlo/stablehlo/transforms/StablehloBroadcastLowering.h
+@@ -22,6 +22,7 @@
+ #include <string>
+ 
+ #include "mlir/IR/Builders.h"
++#include "mlir/IR/Location.h"
+ #include "mlir/IR/Value.h"
+ #include "mlir/Support/LLVM.h"
+ 
+@@ -57,8 +58,7 @@
+ 
+ // Returns the common shape these ops would broadcast to, or an error if the
+ // ops are not broadcastable.
+-FailureOr<Dimensions> getNumpyBroadcastShape(OpBuilder& builder,
+-                                             ArrayRef<Value> ops);
++FailureOr<Dimensions> getNumpyBroadcastShape(Location loc, ArrayRef<Value> ops);
+ 
+ // Apply numpy broadcasting to the given operands, returning an error if any
+ // operands are not broadcastable.
 

From aa95cad5da857d0e0ff88f9e544715502be51ff0 Mon Sep 17 00:00:00 2001
From: Jian Cai <jiancai@google.com>
Date: Fri, 19 Dec 2025 11:15:41 -0800
Subject: [PATCH 590/753] [XLA][Numerics][HLO Value Tracking] Support HLO
 original values in ConditionalSimplifier pass

This updates the original value of a conditional op when unused tuple elements are removed.

PiperOrigin-RevId: 846802773
---
 third_party/xla/xla/hlo/ir/BUILD              |  2 +
 third_party/xla/xla/hlo/ir/hlo_instruction.cc |  1 +
 third_party/xla/xla/hlo/ir/hlo_module.cc      |  2 +-
 .../xla/xla/hlo/ir/hlo_original_value.cc      | 53 -------------
 .../xla/xla/hlo/ir/hlo_original_value.h       | 12 ---
 .../xla/xla/hlo/ir/hlo_original_value_test.cc |  1 +
 .../xla/xla/hlo/ir/hlo_original_value_util.cc | 78 +++++++++++++++++++
 .../xla/xla/hlo/ir/hlo_original_value_util.h  | 69 ++++++++++++++++
 third_party/xla/xla/hlo/parser/hlo_parser.cc  |  1 +
 third_party/xla/xla/service/BUILD             |  4 +
 .../xla/xla/service/conditional_simplifier.cc |  2 +
 .../service/conditional_simplifier_test.cc    | 52 +++++++++++++
 .../xla/xla/service/while_loop_simplifier.cc  | 36 +--------
 third_party/xla/xla/tuple_tree.h              |  2 +
 14 files changed, 216 insertions(+), 99 deletions(-)
 create mode 100644 third_party/xla/xla/hlo/ir/hlo_original_value_util.cc
 create mode 100644 third_party/xla/xla/hlo/ir/hlo_original_value_util.h

diff --git a/third_party/xla/xla/hlo/ir/BUILD b/third_party/xla/xla/hlo/ir/BUILD
index 07b2e615d62479..26157ab983ec1f 100644
--- a/third_party/xla/xla/hlo/ir/BUILD
+++ b/third_party/xla/xla/hlo/ir/BUILD
@@ -31,6 +31,7 @@ cc_library(
         "hlo_module_metadata.cc",
         "hlo_opcode.cc",
         "hlo_original_value.cc",
+        "hlo_original_value_util.cc",
         "hlo_schedule.cc",
         "hlo_sharding_metadata.cc",
         "replica_group.cc",
@@ -51,6 +52,7 @@ cc_library(
         "hlo_op_metadata.h",
         "hlo_opcode.h",
         "hlo_original_value.h",
+        "hlo_original_value_util.h",
         "hlo_print_options.h",
         "hlo_schedule.h",
         "hlo_sharding.h",
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.cc b/third_party/xla/xla/hlo/ir/hlo_instruction.cc
index 946668fcbe5a5f..c370122c43179c 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instruction.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instruction.cc
@@ -60,6 +60,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_op_metadata.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_original_value.h"
+#include "xla/hlo/ir/hlo_original_value_util.h"
 #include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/hlo/ir/hlo_sharding_metadata.h"
diff --git a/third_party/xla/xla/hlo/ir/hlo_module.cc b/third_party/xla/xla/hlo/ir/hlo_module.cc
index aa64124e7d6bdb..dfb08c3b780477 100644
--- a/third_party/xla/xla/hlo/ir/hlo_module.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_module.cc
@@ -55,7 +55,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_input_output_alias_config.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/hlo/ir/hlo_original_value.h"
+#include "xla/hlo/ir/hlo_original_value_util.h"
 #include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/hlo/ir/hlo_sharding.h"
diff --git a/third_party/xla/xla/hlo/ir/hlo_original_value.cc b/third_party/xla/xla/hlo/ir/hlo_original_value.cc
index 9ca6b693639a7f..51eb648979a1cc 100644
--- a/third_party/xla/xla/hlo/ir/hlo_original_value.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_original_value.cc
@@ -24,17 +24,13 @@ limitations under the License.
 #include <vector>
 
 #include "absl/base/no_destructor.h"
-#include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
-#include "absl/log/log.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/hlo/ir/hlo_instruction.h"
-#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/hlo/utils/pointer_utils.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/tuple_tree.h"
@@ -247,55 +243,6 @@ std::shared_ptr<OriginalValue> OriginalValue::CreateFromInstruction(
   return original_value;
 }
 
-void CopyOriginalValue(const HloInstruction* src_instruction,
-                       HloInstruction* dest_instruction, bool clone,
-                       bool issue_warning) {
-  if (!src_instruction || !dest_instruction ||
-      !ShapeUtil::Compatible(src_instruction->shape(),
-                             dest_instruction->shape())) {
-    if (issue_warning) {
-      LOG(WARNING)
-          << "Expect the new instruction to have the same shape with the old "
-             "instruction when moving over original_value";
-    }
-    return;
-  }
-
-  std::shared_ptr<OriginalValue> original_value =
-      src_instruction->original_value();
-  if (!original_value) {
-    return;
-  }
-
-  if (!clone || original_value->is_synthetic_call()) {
-    dest_instruction->set_original_value(original_value);
-    return;
-  }
-
-  // Deep clone the tree.
-  auto cloned_tree = std::make_shared<OriginalValue>(original_value->tree());
-  dest_instruction->set_original_value(cloned_tree);
-}
-
-void DeduplicateOriginalValues(HloModule* module) {
-  absl::flat_hash_set<std::shared_ptr<OriginalValue>,
-                      PointeeHash<OriginalValue>, PointeeEqual<OriginalValue>>
-      unique_original_values;
-  for (HloComputation* computation : module->computations()) {
-    for (HloInstruction* instruction : computation->instructions()) {
-      if (std::shared_ptr<OriginalValue> original_value =
-              instruction->original_value()) {
-        auto p = unique_original_values.insert(original_value);
-        if (!p.second) {
-          // Reassign the pointer with the existing identical object and release
-          // the duplicate.
-          instruction->set_original_value(*p.first);
-        }
-      }
-    }
-  }
-}
-
 /* static */
 TupleTree<std::optional<OriginalArray>>&
 OriginalValue::EmptyOriginalValueTupleTree() {
diff --git a/third_party/xla/xla/hlo/ir/hlo_original_value.h b/third_party/xla/xla/hlo/ir/hlo_original_value.h
index 5ccfebc6839308..92eb1fab1fd465 100644
--- a/third_party/xla/xla/hlo/ir/hlo_original_value.h
+++ b/third_party/xla/xla/hlo/ir/hlo_original_value.h
@@ -167,17 +167,5 @@ class OriginalValue {
       data_;
 };
 
-// Copies the original value of the source to the destination instruction if the
-// shapes of the source and destination are compatible. This performs a deep
-// copy if clone is set to true. Otherwise, it performs a shallow copy. Print a
-// warning if the shapes are not compatible and issue_warning is set to true.
-void CopyOriginalValue(const HloInstruction* src_instruction,
-                       HloInstruction* dest_instruction, bool clone,
-                       bool issue_warning);
-
-// Removes duplicates of original value objects referenced in the module to save
-// memory storage.
-void DeduplicateOriginalValues(HloModule* module);
 }  // namespace xla
-
 #endif  // XLA_HLO_IR_HLO_ORIGINAL_VALUE_H_
diff --git a/third_party/xla/xla/hlo/ir/hlo_original_value_test.cc b/third_party/xla/xla/hlo/ir/hlo_original_value_test.cc
index b70a8f1576587e..c3a8d55425f2bb 100644
--- a/third_party/xla/xla/hlo/ir/hlo_original_value_test.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_original_value_test.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/hash/hash_testing.h"
 #include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_original_value_util.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/hlo/ir/hlo_original_value_util.cc b/third_party/xla/xla/hlo/ir/hlo_original_value_util.cc
new file mode 100644
index 00000000000000..2c8fa9d56a64ce
--- /dev/null
+++ b/third_party/xla/xla/hlo/ir/hlo_original_value_util.cc
@@ -0,0 +1,78 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/hlo/ir/hlo_original_value_util.h"
+
+#include <memory>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_original_value.h"
+#include "xla/hlo/utils/pointer_utils.h"
+#include "xla/shape_util.h"
+
+namespace xla {
+
+void CopyOriginalValue(const HloInstruction* src_instruction,
+                       HloInstruction* dest_instruction, bool clone,
+                       bool issue_warning) {
+  if (!src_instruction || !dest_instruction ||
+      !ShapeUtil::Compatible(src_instruction->shape(),
+                             dest_instruction->shape())) {
+    if (issue_warning) {
+      LOG(WARNING)
+          << "Expect the new instruction to have the same shape with the old "
+             "instruction when moving over original_value";
+    }
+    return;
+  }
+
+  std::shared_ptr<OriginalValue> original_value =
+      src_instruction->original_value();
+  if (!original_value) {
+    return;
+  }
+
+  if (!clone || original_value->is_synthetic_call()) {
+    dest_instruction->set_original_value(original_value);
+    return;
+  }
+
+  // Deep clone the tree.
+  auto cloned_tree = std::make_shared<OriginalValue>(original_value->tree());
+  dest_instruction->set_original_value(cloned_tree);
+}
+
+void DeduplicateOriginalValues(HloModule* module) {
+  absl::flat_hash_set<std::shared_ptr<OriginalValue>,
+                      PointeeHash<OriginalValue>, PointeeEqual<OriginalValue>>
+      unique_original_values;
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (std::shared_ptr<OriginalValue> original_value =
+              instruction->original_value()) {
+        auto p = unique_original_values.insert(original_value);
+        if (!p.second) {
+          // Reassign the pointer with the existing identical object and release
+          // the duplicate.
+          instruction->set_original_value(*p.first);
+        }
+      }
+    }
+  }
+}
+}  // namespace xla
diff --git a/third_party/xla/xla/hlo/ir/hlo_original_value_util.h b/third_party/xla/xla/hlo/ir/hlo_original_value_util.h
new file mode 100644
index 00000000000000..94ca6f5debdc16
--- /dev/null
+++ b/third_party/xla/xla/hlo/ir/hlo_original_value_util.h
@@ -0,0 +1,69 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef XLA_HLO_IR_HLO_ORIGINAL_VALUE_UTIL_H_
+#define XLA_HLO_IR_HLO_ORIGINAL_VALUE_UTIL_H_
+
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "absl/container/flat_hash_map.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_original_value.h"
+
+namespace xla {
+
+// Copies the original value of the source to the destination instruction.
+// Original arrays in the source original value are rearranged in the new
+// original value according to the given mapping of old to new tuple indices.
+template <typename T>
+std::enable_if_t<std::is_integral_v<T>> CopyOriginalValue(
+    const HloInstruction* src_instruction, HloInstruction* dest_instruction,
+    const absl::flat_hash_map<T, T>& old_to_new_tuple_idx) {
+  std::shared_ptr<OriginalValue> old_original_value =
+      src_instruction->original_value();
+  if (!old_original_value) {
+    return;
+  }
+  const int64_t src_tuple_size = old_original_value->tree().num_leaves();
+  const int64_t dest_tuple_size = old_to_new_tuple_idx.size();
+  std::shared_ptr<xla::OriginalValue> new_original_value =
+      std::make_shared<xla::OriginalValue>(dest_instruction->shape());
+  for (const auto& [old_idx, new_idx] : old_to_new_tuple_idx) {
+    if (old_idx < 0 || old_idx >= src_tuple_size || new_idx < 0 ||
+        new_idx >= dest_tuple_size) {
+      return;
+    }
+    new_original_value->mutable_tree()->CopySubtreeFrom(
+        old_original_value->tree(), {old_idx}, {new_idx});
+  }
+  dest_instruction->set_original_value(new_original_value);
+}
+
+// Copies the original value of the source to the destination instruction if the
+// shapes of the source and destination are compatible. This performs a deep
+// copy if clone is set to true. Otherwise, it performs a shallow copy. Print a
+// warning if the shapes are not compatible and issue_warning is set to true.
+void CopyOriginalValue(const HloInstruction* src_instruction,
+                       HloInstruction* dest_instruction, bool clone,
+                       bool issue_warning);
+
+// Removes duplicates of original value objects referenced in the module to save
+// memory storage.
+void DeduplicateOriginalValues(HloModule* module);
+}  // namespace xla
+
+#endif  // XLA_HLO_IR_HLO_ORIGINAL_VALUE_UTIL_H_
diff --git a/third_party/xla/xla/hlo/parser/hlo_parser.cc b/third_party/xla/xla/hlo/parser/hlo_parser.cc
index 57d934e6437ea0..626fc80f49839d 100644
--- a/third_party/xla/xla/hlo/parser/hlo_parser.cc
+++ b/third_party/xla/xla/hlo/parser/hlo_parser.cc
@@ -58,6 +58,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_original_value.h"
+#include "xla/hlo/ir/hlo_original_value_util.h"
 #include "xla/hlo/ir/hlo_schedule.h"
 #include "xla/hlo/ir/hlo_sharding.h"
 #include "xla/hlo/ir/hlo_sharding_metadata.h"
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 640992b0283e5b..085c27f698c40e 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -2638,6 +2638,7 @@ xla_cc_test(
     srcs = ["conditional_simplifier_test.cc"],
     deps = [
         ":conditional_simplifier",
+        ":hlo_verifier",
         "//xla:literal_util",
         "//xla:shape_util",
         "//xla:types",
@@ -2645,9 +2646,12 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:test",
+        "//xla/hlo/testlib:verified_hlo_module",
         "//xla/hlo/utils:hlo_matchers",
         "//xla/tests:xla_internal_test_main",
         "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/strings:string_view",
         "@local_tsl//tsl/platform:status",
     ],
 )
diff --git a/third_party/xla/xla/service/conditional_simplifier.cc b/third_party/xla/xla/service/conditional_simplifier.cc
index 0308e66d7d3bfc..7b9ea64becffbe 100644
--- a/third_party/xla/xla/service/conditional_simplifier.cc
+++ b/third_party/xla/xla/service/conditional_simplifier.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_original_value_util.h"
 #include "xla/literal.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/call_inliner.h"
@@ -300,6 +301,7 @@ bool RemoveUnusedTupleElements(HloInstruction* conditional_op) {
 
   // Replace the conditional instruction itself.
   *conditional_op->mutable_shape() = new_shape;
+  CopyOriginalValue(conditional_op, conditional_op, old_to_new_mapping);
 
   // Reroute all user GTE instructions to new tuple indices.
   for (HloInstruction* user : conditional_op->users()) {
diff --git a/third_party/xla/xla/service/conditional_simplifier_test.cc b/third_party/xla/xla/service/conditional_simplifier_test.cc
index 6baae0030ee21c..c300276556642d 100644
--- a/third_party/xla/xla/service/conditional_simplifier_test.cc
+++ b/third_party/xla/xla/service/conditional_simplifier_test.cc
@@ -15,18 +15,23 @@ limitations under the License.
 
 #include "xla/service/conditional_simplifier.h"
 
+#include <memory>
 #include <string>
 #include <utility>
 
+#include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/test.h"
+#include "xla/hlo/testlib/verified_hlo_module.h"
 #include "xla/hlo/utils/hlo_matchers.h"
 #include "xla/literal_util.h"
+#include "xla/service/hlo_verifier.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/statusor.h"
 #include "xla/types.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/status.h"
@@ -512,6 +517,53 @@ ENTRY entry {
                   op::GetTupleElement(op::Tuple(op::AfterAll()), 0))));
 }
 
+TEST_F(ConditionalSimplifierTest, RemoveUnusedTupleElementsWithOriginalValue) {
+  absl::string_view hlo_string =
+      R"(
+HloModule FirstTupleElementUnusedAndRemoved
+
+on_true {
+  arg_tuple.7 = (f32[10,10]{1,0}) parameter(0)
+  get-tuple-element.9 = f32[10,10]{1,0} get-tuple-element(arg_tuple.7), index=0
+  copy = f32[10,10]{1,0} copy(get-tuple-element.9)
+  ROOT tuple.6 = (f32[10,10]{1,0}, f32[10,10]{1,0}) tuple(copy, get-tuple-element.9), origin={({"tuple.6" {0}}, {"tuple.6" {1}})}
+}
+
+on_false {
+  constant.17 = f32[] constant(0)
+  constant.18 = f32[] constant(1)
+  rng.19 = f32[10,10]{1,0} rng(constant.17, constant.18), distribution=rng_uniform
+  arg_tuple.14 = (f32[10,10]{1,0}) parameter(0)
+  get-tuple-element.16 = f32[10,10]{1,0} get-tuple-element(arg_tuple.14), index=0
+  ROOT tuple.7 = (f32[10,10]{1,0}, f32[10,10]{1,0}) tuple(rng.19, get-tuple-element.16), origin={({"tuple.7" {0}}, {"tuple.7" {1}})}
+}
+
+ENTRY main {
+  constant.38 = pred[] constant(true)
+  arg_tuple.30 = (s32[], f32[10,10]{1,0}) parameter(0)
+  get-tuple-element.21 = f32[10,10]{1,0} get-tuple-element(arg_tuple.30), index=1
+  tuple.1 = (f32[10,10]{1,0}) tuple(get-tuple-element.21)
+  conditional = (f32[10,10]{1,0}, f32[10,10]{1,0}) conditional(constant.38, tuple.1, tuple.1), true_computation=on_true, false_computation=on_false, origin={({"cond" {0}}, {"cond" {1}})}
+  get-second-index = f32[10,10]{1,0} get-tuple-element(conditional), index=1
+  ROOT result = (f32[10,10]{1,0}) tuple(get-second-index)
+}
+)";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(hlo_string));
+
+  ASSERT_OK_AND_ASSIGN(bool changed, ConditionalSimplifier().Run(module.get()));
+  EXPECT_TRUE(changed);
+  HloVerifier v(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false);
+  TF_ASSERT_OK(v.Run(module.get()));
+  const HloInstruction* conditional =
+      FindInstruction(module.get(), "conditional");
+  // The first element of "conditional" result tuple (f32[10,10], f32[10,10])
+  // should be removed since it is not referenced by any GTE instructions (see
+  // "get-second-index" instruction in hlo_string).
+  EXPECT_EQ(ShapeUtil::TupleElementCount(conditional->shape()), 1);
+  EXPECT_EQ(conditional->original_value()->ToString(), R"(({"cond" {1}}))");
+}
+
 }  // namespace
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/while_loop_simplifier.cc b/third_party/xla/xla/service/while_loop_simplifier.cc
index 2bf42ac92b32a2..a1aab5b744e8f8 100644
--- a/third_party/xla/xla/service/while_loop_simplifier.cc
+++ b/third_party/xla/xla/service/while_loop_simplifier.cc
@@ -37,7 +37,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/hlo/ir/hlo_opcode.h"
-#include "xla/hlo/ir/hlo_original_value.h"
+#include "xla/hlo/ir/hlo_original_value_util.h"
 #include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/hlo/transforms/simplifiers/hlo_dce.h"
 #include "xla/hlo/utils/hlo_query.h"
@@ -152,35 +152,6 @@ static absl::StatusOr<HloInstruction*> RemoveDeadTupleIndices(
     HloInstruction* while_op, absl::flat_hash_set<int64_t>& used_tuple_indices,
     std::optional<absl::flat_hash_map<int32_t, int32_t>>
         dead_to_surviving_index = std::nullopt) {
-  auto copy_remaining_original_arrays =
-      [&](const HloInstruction* src_instruction,
-          HloInstruction* dest_instruction,
-          const absl::flat_hash_map<int64_t, int64_t>& old_to_new_tuple_idx) {
-        std::shared_ptr<OriginalValue> original_value =
-            src_instruction->original_value();
-        if (!original_value) {
-          return;
-        }
-
-        const int64_t src_tuple_size =
-                          src_instruction->shape().tuple_shapes().size(),
-                      dest_tuple_size =
-                          dest_instruction->shape().tuple_shapes().size();
-        std::shared_ptr<OriginalValue> old_original_value =
-            src_instruction->original_value();
-        std::shared_ptr<xla::OriginalValue> new_original_value =
-            std::make_shared<xla::OriginalValue>(dest_instruction->shape());
-        for (const auto& [old_idx, new_idx] : old_to_new_tuple_idx) {
-          if (old_idx < 0 || old_idx >= src_tuple_size || new_idx < 0 ||
-              new_idx >= dest_tuple_size) {
-            return;
-          }
-          new_original_value->mutable_tree()->CopySubtreeFrom(
-              old_original_value->tree(), {old_idx}, {new_idx});
-        }
-        dest_instruction->set_original_value(new_original_value);
-      };
-
   // Build up maps from the old/new to the new/old tuple indices.
   std::vector<int64_t> new_to_old_tuple_idx(used_tuple_indices.begin(),
                                             used_tuple_indices.end());
@@ -306,9 +277,8 @@ static absl::StatusOr<HloInstruction*> RemoveDeadTupleIndices(
   CopyFrontendAttributes(while_op, new_while_op);
   CopyMetadata(while_op, new_while_op);
 
-  copy_remaining_original_arrays(while_init, new_while_init,
-                                 old_to_new_tuple_idx);
-  copy_remaining_original_arrays(while_op, new_while_op, old_to_new_tuple_idx);
+  CopyOriginalValue(while_init, new_while_init, old_to_new_tuple_idx);
+  CopyOriginalValue(while_op, new_while_op, old_to_new_tuple_idx);
 
   // Create a tuple op that recreates the output of the old while op.  That is,
   // we transform to
diff --git a/third_party/xla/xla/tuple_tree.h b/third_party/xla/xla/tuple_tree.h
index 018fcb56031960..92952b7a66fe39 100644
--- a/third_party/xla/xla/tuple_tree.h
+++ b/third_party/xla/xla/tuple_tree.h
@@ -502,6 +502,8 @@ class TupleTree {
     return const_reverse_leaf_iterator(leaf_begin());
   }
 
+  size_t num_leaves() const { return std::distance(leaf_begin(), leaf_end()); }
+
   // Returns an iterator pointing to the node at the given ShapeIndex.
   // Returns end() if the index is not found.
   iterator find(ShapeIndexView index) {

From 068c5bfe9599738011d094d48db431dd5aff97f5 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Fri, 19 Dec 2025 11:44:27 -0800
Subject: [PATCH 591/753] Simplify TrackedCpuDeviceBuffer.

Update CpuRawBuffer to always have the size available and remove AfterAll implementation in TrackedCpuDeviceBuffer.

PiperOrigin-RevId: 846812637
---
 .../xla/xla/pjrt/common_pjrt_client.cc        |  8 ++-
 third_party/xla/xla/pjrt/common_pjrt_client.h |  3 +-
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    | 29 +++++-----
 third_party/xla/xla/pjrt/cpu/cpu_client.h     |  3 +-
 third_party/xla/xla/pjrt/cpu/raw_buffer.cc    | 30 +++++++++--
 third_party/xla/xla/pjrt/cpu/raw_buffer.h     | 10 +++-
 .../xla/pjrt/cpu/tracked_cpu_device_buffer.cc | 53 ++-----------------
 .../xla/pjrt/cpu/tracked_cpu_device_buffer.h  | 28 ----------
 .../cpu/tracked_cpu_device_buffer_test.cc     |  5 +-
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  4 +-
 .../xla/pjrt/pjrt_stream_executor_client.h    |  3 +-
 11 files changed, 65 insertions(+), 111 deletions(-)

diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.cc b/third_party/xla/xla/pjrt/common_pjrt_client.cc
index 8c187f326821f1..63ff52c1c3ba63 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.cc
@@ -205,8 +205,12 @@ CommonPjRtClient::CreateAliasBuffer(const Shape& shape,
   tsl::RCReference<CommonPjRtRawBuffer> raw_buffer;
   PjRtFulfillAliasRawBufferCallback buffer_promise;
 
-  TF_ASSIGN_OR_RETURN(std::tie(raw_buffer, buffer_promise),
-                      CreateRawBufferChannel(memory_space));
+  TF_ASSIGN_OR_RETURN(int64_t on_device_bytes_count,
+                      GetOnDeviceBytesCount(memory_space, shape));
+
+  TF_ASSIGN_OR_RETURN(
+      std::tie(raw_buffer, buffer_promise),
+      CreateRawBufferChannel(memory_space, on_device_bytes_count));
 
   tsl::RCReference<xla::PjRtDeviceEventPromise> definition_event_promise;
   tsl::RCReference<xla::PjRtDeviceEvent> definition_event;
diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.h b/third_party/xla/xla/pjrt/common_pjrt_client.h
index 27084fbc94e1b6..7557aa785500d6 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.h
@@ -199,7 +199,8 @@ class CommonPjRtClient : public PjRtClient {
       absl::StatusOr<tsl::RCReference<CommonPjRtRawBuffer>>) &&>;
   virtual absl::StatusOr<std::pair<tsl::RCReference<CommonPjRtRawBuffer>,
                                    PjRtFulfillAliasRawBufferCallback>>
-  CreateRawBufferChannel(PjRtMemorySpace* memory_space) {
+  CreateRawBufferChannel(PjRtMemorySpace* memory_space,
+                         size_t on_device_bytes_count) {
     return absl::UnimplementedError("CreateRawBufferChannel is not supported");
   }
 
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index 1e56ee3735163c..f0a3fa1a083eb5 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -911,9 +911,8 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCpuClient::CreateErrorBuffer(
       shape,
       std::make_unique<TrackedCpuDeviceBuffer>(
           /*owns_buffers=*/true, std::move(raw_buffer),
-          absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4>{
-              tsl::AsyncValueRef<CpuEvent>(
-                  tsl::MakeErrorAsyncValueRef(std::move(error)))}),
+          tsl::AsyncValueRef<CpuEvent>(
+              tsl::MakeErrorAsyncValueRef(std::move(error)))),
       memory_space);
 }
 
@@ -1004,16 +1003,11 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCpuClient::DefineBuffer(
                         raw_buffer->memory_space()->DebugString(),
                         memory_space->DebugString()));
   }
-  absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events;
-  for (auto& ev : definition_device_events) {
-    definition_events.push_back(
-        tsl::down_cast<CpuTrackedDeviceEvent*>(ev.get())->event());
-  }
   return std::unique_ptr<PjRtBuffer>(std::make_unique<CommonPjRtBufferImpl>(
       on_device_shape,
       std::make_unique<TrackedCpuDeviceBuffer>(
           /*owns_buffers=*/raw_buffer_is_mutable, std::move(raw_buffer),
-          ShapeUtil::ByteSizeOf(on_device_shape), std::move(definition_events)),
+          CpuTrackedDeviceEvent::AfterAll(definition_device_events)),
       memory_space));
 }
 
@@ -1030,10 +1024,12 @@ PjRtCpuClient::AllocateRawBuffer(PjRtMemorySpace* memory_space,
 
 absl::StatusOr<std::pair<tsl::RCReference<CommonPjRtRawBuffer>,
                          CommonPjRtClient::PjRtFulfillAliasRawBufferCallback>>
-PjRtCpuClient::CreateRawBufferChannel(PjRtMemorySpace* memory_space) {
+PjRtCpuClient::CreateRawBufferChannel(PjRtMemorySpace* memory_space,
+                                      size_t on_device_bytes_count) {
   auto buffer_promise = tsl::MakeIndirectAsyncValue();
   auto raw_buffer = tsl::MakeRef<CpuRawBuffer>(
-      memory_space, tsl::AsyncValueRef<CpuDeviceMemory>(buffer_promise));
+      memory_space, tsl::AsyncValueRef<CpuDeviceMemory>(buffer_promise),
+      on_device_bytes_count);
 
   auto buffer_promise_cb =
       [buffer_promise = std::move(buffer_promise), memory_space](
@@ -1858,14 +1854,13 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
     res.reserve(result_buffers_info.size());
     for (int i = 0; i < result_buffers_info.size(); ++i) {
       // Program execution writes to output buffers so it's a definition event.
-      absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events;
-      definition_events.push_back(execute_event.CopyRef());
       auto leaf_tracked_device_buffer =
           std::make_unique<TrackedCpuDeviceBuffer>(
               result_buffers_info[i].owns_buffer,
               tsl::MakeRef<CpuRawBuffer>(
-                  memory_space, std::move(result_buffers_info[i].buffer)),
-              result_buffers_info[i].buffer_size, std::move(definition_events));
+                  memory_space, std::move(result_buffers_info[i].buffer),
+                  result_buffers_info[i].buffer_size),
+              execute_event.CopyRef());
       auto leaf_buffer = std::make_unique<CommonPjRtBufferImpl>(
           result_shape.tuple_shapes(i), std::move(leaf_tracked_device_buffer),
           memory_space);
@@ -1877,8 +1872,8 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
     auto tracked_device_buffer = std::make_unique<TrackedCpuDeviceBuffer>(
         result_buffers_info[0].owns_buffer,
         tsl::MakeRef<CpuRawBuffer>(memory_space,
-                                   std::move(result_buffers_info[0].buffer)),
-        result_buffers_info[0].buffer_size,
+                                   std::move(result_buffers_info[0].buffer),
+                                   result_buffers_info[0].buffer_size),
         /*definition_event=*/execute_event);
     auto output_buffer = std::make_unique<CommonPjRtBufferImpl>(
         result_shape, std::move(tracked_device_buffer), memory_space);
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.h b/third_party/xla/xla/pjrt/cpu/cpu_client.h
index e2e29df765002a..548cac91cb8096 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.h
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.h
@@ -218,7 +218,8 @@ class PjRtCpuClient final : public CommonPjRtClient {
 
   absl::StatusOr<std::pair<tsl::RCReference<CommonPjRtRawBuffer>,
                            PjRtFulfillAliasRawBufferCallback>>
-  CreateRawBufferChannel(PjRtMemorySpace* memory_space) override;
+  CreateRawBufferChannel(PjRtMemorySpace* memory_space,
+                         size_t on_device_bytes_count) override;
 
   absl::StatusOr<tsl::RCReference<CommonPjRtRawBuffer>> AllocateRawBuffer(
       PjRtMemorySpace* memory_space, size_t on_device_bytes_count,
diff --git a/third_party/xla/xla/pjrt/cpu/raw_buffer.cc b/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
index 04a00dc9b7d776..bf4a5bf2dc4e8b 100644
--- a/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
@@ -105,12 +105,33 @@ Future<> CpuTrackedDeviceEvent::GetReadyFuture() {
       });
 }
 
+/*static*/ tsl::AsyncValueRef<CpuEvent> CpuTrackedDeviceEvent::AfterAll(
+    absl::Span<const tsl::RCReference<PjRtDeviceEvent>> events) {
+  tsl::AsyncValueRef<CpuEvent> definition_event;
+  if (events.empty()) {
+    return tsl::MakeAvailableAsyncValueRef<CpuEvent>();
+  }
+  if (events.size() == 1) {
+    return tsl::down_cast<CpuTrackedDeviceEvent*>(events[0].get())->event();
+  }
+
+  tsl::CountDownAsyncValueRef<CpuEvent> after_all(events.size());
+  for (auto& ev : events) {
+    tsl::down_cast<CpuTrackedDeviceEvent*>(ev.get())->event().AndThen(
+        [after_all](absl::Status status) mutable {
+          after_all.CountDown(std::move(status));
+        });
+  }
+  return std::move(after_all).AsRef();
+}
+
 /*static*/ absl::StatusOr<tsl::RCReference<CpuRawBuffer>>
 CpuRawBuffer::Allocate(PjRtMemorySpace* memory_space, size_t size_bytes,
                        const CpuDeviceMemory::Allocator& allocator) {
   TF_ASSIGN_OR_RETURN(auto memory,
                       CpuDeviceMemory::Allocate(size_bytes, allocator));
-  return tsl::MakeRef<CpuRawBuffer>(memory_space, std::move(memory));
+  return tsl::MakeRef<CpuRawBuffer>(memory_space, std::move(memory),
+                                    size_bytes);
 }
 
 /*static*/ absl::StatusOr<tsl::RCReference<CpuRawBuffer>>
@@ -126,12 +147,11 @@ CpuRawBuffer::ImportForeignMemory(
   return tsl::MakeRef<CpuRawBuffer>(
       memory_space,
       CpuDeviceMemory::CreateForeignMemory(data, on_device_bytes_count,
-                                           std::move(on_delete_callback)));
+                                           std::move(on_delete_callback)),
+      on_device_bytes_count);
 }
 
-size_t CpuRawBuffer::GetOnDeviceSizeInBytes() const {
-  return buffer_->size_bytes();
-}
+size_t CpuRawBuffer::GetOnDeviceSizeInBytes() const { return buffer_size_; }
 
 void* CpuRawBuffer::GetHostPointer() const { return buffer_->untyped_data(); }
 
diff --git a/third_party/xla/xla/pjrt/cpu/raw_buffer.h b/third_party/xla/xla/pjrt/cpu/raw_buffer.h
index 0d5f91fc08f74a..e7b1dd3a2013f7 100644
--- a/third_party/xla/xla/pjrt/cpu/raw_buffer.h
+++ b/third_party/xla/xla/pjrt/cpu/raw_buffer.h
@@ -83,6 +83,9 @@ class CpuTrackedDeviceEvent : public PjRtDeviceEvent {
 
   Future<> GetReadyFuture() override;
 
+  static tsl::AsyncValueRef<CpuEvent> AfterAll(
+      absl::Span<const tsl::RCReference<PjRtDeviceEvent>> events);
+
  private:
   tsl::AsyncValueRef<CpuEvent> event_;
   const char* callee_type_;
@@ -92,8 +95,10 @@ class CpuTrackedDeviceEvent : public PjRtDeviceEvent {
 class CpuRawBuffer : public CommonPjRtRawBuffer {
  public:
   CpuRawBuffer(PjRtMemorySpace* memory_space,
-               tsl::AsyncValueRef<CpuDeviceMemory> buffer)
-      : memory_space_(memory_space), buffer_(std::move(buffer)) {}
+               tsl::AsyncValueRef<CpuDeviceMemory> buffer, size_t buffer_size)
+      : memory_space_(memory_space),
+        buffer_(std::move(buffer)),
+        buffer_size_(buffer_size) {}
 
   absl::Status ValidateSlice(int64_t offset, int64_t slice_size);
 
@@ -169,6 +174,7 @@ class CpuRawBuffer : public CommonPjRtRawBuffer {
  private:
   PjRtMemorySpace* const memory_space_;
   tsl::AsyncValueRef<CpuDeviceMemory> buffer_;
+  size_t buffer_size_;
 };
 
 absl::StatusOr<xla::Shape> MakeDefaultCpuBufferShape(xla::Shape shape,
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
index 3e2f8aafc0000b..6b1868725c8422 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
@@ -49,28 +49,6 @@ limitations under the License.
 namespace xla {
 namespace {
 
-// Returns an AsyncValueRef<CpuEvent> that will be ready after all the async
-// values in `events` are ready. If errors occurs, one of the errors will be
-// propagated through the returned async value.
-tsl::AsyncValueRef<CpuEvent> AfterAll(
-    absl::Span<const tsl::AsyncValueRef<CpuEvent>> events) {
-  if (events.empty()) {
-    return tsl::MakeAvailableAsyncValueRef<CpuEvent>();
-  }
-  if (events.size() == 1) {
-    return events.front();
-  }
-
-  tsl::CountDownAsyncValueRef<CpuEvent> after_all(events.size());
-  for (auto& event : events) {
-    event.AndThen([after_all](absl::Status status) mutable {
-      after_all.CountDown(std::move(status));
-    });
-  }
-
-  return std::move(after_all).AsRef();
-}
-
 //===----------------------------------------------------------------------===//
 // Default CpuDeviceMemory::RawMemory allocator.
 //===----------------------------------------------------------------------===//
@@ -210,19 +188,6 @@ absl::Status CpuDeviceMemory::AllocateInto(
 // TrackedCpuDeviceBuffer.
 //===----------------------------------------------------------------------===//
 
-TrackedCpuDeviceBuffer::TrackedCpuDeviceBuffer(
-    bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-    absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events)
-    : TrackedCpuDeviceBuffer(owns_buffers, std::move(raw_buffer),
-                             AfterAll(definition_events)) {}
-
-TrackedCpuDeviceBuffer::TrackedCpuDeviceBuffer(
-    bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-    size_t buffer_size,
-    absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events)
-    : TrackedCpuDeviceBuffer(owns_buffers, std::move(raw_buffer), buffer_size,
-                             AfterAll(definition_events)) {}
-
 TrackedCpuDeviceBuffer::TrackedCpuDeviceBuffer(
     bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
     tsl::AsyncValueRef<CpuEvent> definition_event)
@@ -230,20 +195,6 @@ TrackedCpuDeviceBuffer::TrackedCpuDeviceBuffer(
       owns_buffers_(owns_buffers),
       definition_event_(std::move(definition_event)) {
   DCHECK(definition_event_);
-  CHECK(tensorflow::down_cast<CpuRawBuffer*>(this->raw_buffer().get())
-            ->buffer()
-            .IsConcrete());
-  buffer_size_ = this->raw_buffer()->GetOnDeviceSizeInBytes();
-}
-
-TrackedCpuDeviceBuffer::TrackedCpuDeviceBuffer(
-    bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-    size_t buffer_size, tsl::AsyncValueRef<CpuEvent> definition_event)
-    : AbstractTrackedDeviceBuffer(std::move(raw_buffer)),
-      owns_buffers_(owns_buffers),
-      buffer_size_(buffer_size),
-      definition_event_(std::move(definition_event)) {
-  DCHECK(definition_event_);
 }
 
 TrackedCpuDeviceBuffer::~TrackedCpuDeviceBuffer() = default;
@@ -257,7 +208,9 @@ const tsl::AsyncValueRef<CpuDeviceMemory>& TrackedCpuDeviceBuffer::buffer() {
   return *missing_buffer;
 }
 
-size_t TrackedCpuDeviceBuffer::BufferSize() { return buffer_size_; }
+size_t TrackedCpuDeviceBuffer::BufferSize() {
+  return raw_buffer() ? raw_buffer()->GetOnDeviceSizeInBytes() : 0;
+}
 
 void TrackedCpuDeviceBuffer::AddUsageEvents(
     absl::Span<tsl::AsyncValueRef<CpuEvent>> events) {
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h
index 580846b00078c5..907e7045595e1e 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h
@@ -140,35 +140,9 @@ class CpuDeviceMemory {
 // memory. This class is thread-compatible.
 class TrackedCpuDeviceBuffer : public AbstractTrackedDeviceBuffer {
  public:
-  // For non-tuple, takes a single buffer.
-  // For tuple, takes the leaf buffers. Tuple index table created internally.
-  // Nested tuple is not supported.
-
-  // Constructor for allocated cpu memory, i.e., `buffer` should have concrete
-  // states. Definition event is after the list of `definition_events`.
-  TrackedCpuDeviceBuffer(
-      bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-      absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events);
-
-  // Variant with single definition event.
-  TrackedCpuDeviceBuffer(bool owns_buffers,
-                         tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-                         tsl::AsyncValueRef<CpuEvent> definition_event);
-
-  // Constructor for unallocated cpu memory, i.e., `buffer` will have
-  // unconstructed states, and we also need to provide `buffer_size` which will
-  // be the size of the `buffer` after allocation. Definition event is after the
-  // list of `definition_events`. Callers need to ensure cpu memory is allocated
-  // before the definition event is ready.
-  TrackedCpuDeviceBuffer(
-      bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-      size_t buffer_size,
-      absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events);
-
   // Variant with single definition event.
   TrackedCpuDeviceBuffer(bool owns_buffers,
                          tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
-                         size_t buffer_size,
                          tsl::AsyncValueRef<CpuEvent> definition_event);
 
   TrackedCpuDeviceBuffer(TrackedCpuDeviceBuffer&&) noexcept = default;
@@ -218,8 +192,6 @@ class TrackedCpuDeviceBuffer : public AbstractTrackedDeviceBuffer {
 
   bool owns_buffers_;
 
-  // Should equal raw_buffer()->GetOnDeviceSizeInBytes();
-  size_t buffer_size_;
   // The definition event are associated with CPU operations that write to the
   // buffers.
   tsl::AsyncValueRef<CpuEvent> definition_event_;
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc
index 4d54ae1dacb9a4..3986adc0abf7a2 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc
@@ -108,8 +108,9 @@ TEST(TrackedCpuDeviceBufferTest, DelayedAllocation) {
 
   auto definition_event = MakeConstructedAsyncValueRef<CpuEvent>();
   TrackedCpuDeviceBuffer tracked_buffer(
-      /*owns_buffers=*/true, tsl::MakeRef<CpuRawBuffer>(memory_space, buffer),
-      expected.size(), definition_event);
+      /*owns_buffers=*/true,
+      tsl::MakeRef<CpuRawBuffer>(memory_space, buffer, expected.size()),
+      definition_event);
   auto result = tracked_buffer.buffer();
   ASSERT_FALSE(result.IsAvailable());
   ASSERT_EQ(tracked_buffer.BufferSize(), expected.size());
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index 56f631b46a9d0e..6b000d5bcd0599 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -540,8 +540,8 @@ PjRtStreamExecutorClient::DefineBuffer(
 
 absl::StatusOr<std::pair<tsl::RCReference<CommonPjRtRawBuffer>,
                          CommonPjRtClient::PjRtFulfillAliasRawBufferCallback>>
-PjRtStreamExecutorClient::CreateRawBufferChannel(
-    PjRtMemorySpace* memory_space) {
+PjRtStreamExecutorClient::CreateRawBufferChannel(PjRtMemorySpace* memory_space,
+                                                 size_t on_device_bytes_count) {
   auto buffer_promise = tsl::MakeIndirectAsyncValue();
   auto* device = tensorflow::down_cast<PjRtStreamExecutorDevice*>(
       memory_space->devices()[0]);
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index c00be14ba84295..40125ea7ff1b2e 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -401,7 +401,8 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
 
   absl::StatusOr<std::pair<tsl::RCReference<CommonPjRtRawBuffer>,
                            PjRtFulfillAliasRawBufferCallback>>
-  CreateRawBufferChannel(PjRtMemorySpace* memory_space) override;
+  CreateRawBufferChannel(PjRtMemorySpace* memory_space,
+                         size_t on_device_bytes_count) override;
 
   absl::StatusOr<tsl::RCReference<PjRtDeviceEvent>> LinearizeInto(
       const LiteralSlice& literal, const xla::Shape& device_shape,

From 07acbd560e5d2ee59f8df8d4d0a7797d9cf03f9a Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Fri, 19 Dec 2025 12:19:51 -0800
Subject: [PATCH 592/753] [ReplicaGroupV3][Refactor][3/n] Use
 CollectiveDeviceListBase for polymorphic device list support.

PiperOrigin-RevId: 846825118
---
 third_party/xla/xla/hlo/ir/hlo_instruction.cc | 59 +++++++-------
 third_party/xla/xla/hlo/ir/hlo_instruction.h  | 18 ++---
 .../xla/xla/hlo/ir/hlo_instructions.cc        | 25 +++---
 third_party/xla/xla/hlo/ir/hlo_instructions.h | 18 ++---
 third_party/xla/xla/hlo/ir/replica_group.cc   | 27 +++++++
 third_party/xla/xla/hlo/ir/replica_group.h    | 39 ++++++++-
 .../xla/xla/service/collective_ops_utils.cc   | 40 ++++-----
 .../xla/xla/service/collective_ops_utils.h    | 24 +++---
 .../xla/service/collective_ops_utils_test.cc  | 81 +++++++++----------
 .../gpu/model/collective_interpolator.cc      | 29 ++++---
 .../gpu/model/sol_latency_estimator_test.cc   |  2 +-
 .../xla/xla/service/spmd/spmd_partitioner.h   |  4 +-
 .../xla/service/spmd/spmd_partitioner_test.cc | 47 +++++------
 13 files changed, 247 insertions(+), 166 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.cc b/third_party/xla/xla/hlo/ir/hlo_instruction.cc
index c370122c43179c..eb0c9fd9b969f2 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instruction.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instruction.cc
@@ -1692,7 +1692,7 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAllGather(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    int64_t all_gather_dimension, const CollectiveDeviceList& device_list,
+    int64_t all_gather_dimension, const CollectiveDeviceListBase& device_list,
     bool constrain_layout, const std::optional<int64_t>& channel_id,
     bool use_global_device_ids) {
   return std::make_unique<HloAllGatherInstruction>(
@@ -1711,13 +1711,11 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateAllGatherStart(const Shape& shape,
-                                     absl::Span<HloInstruction* const> operands,
-                                     int64_t all_gather_dimension,
-                                     const CollectiveDeviceList& device_list,
-                                     bool constrain_layout,
-                                     const std::optional<int64_t>& channel_id,
-                                     bool use_global_device_ids) {
+HloInstruction::CreateAllGatherStart(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    int64_t all_gather_dimension, const CollectiveDeviceListBase& device_list,
+    bool constrain_layout, const std::optional<int64_t>& channel_id,
+    bool use_global_device_ids) {
   return std::make_unique<HloAllGatherInstruction>(
       HloOpcode::kAllGatherStart, shape, operands, all_gather_dimension,
       device_list, constrain_layout, channel_id, use_global_device_ids);
@@ -1737,9 +1735,9 @@ HloInstruction::CreateAllGatherStart(
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAllReduce(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    HloComputation* reduce_computation, const CollectiveDeviceList& device_list,
-    bool constrain_layout, const std::optional<int64_t>& channel_id,
-    bool use_global_device_ids) {
+    HloComputation* reduce_computation,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
+    const std::optional<int64_t>& channel_id, bool use_global_device_ids) {
   return std::make_unique<HloAllReduceInstruction>(
       HloOpcode::kAllReduce, shape, operands, reduce_computation, device_list,
       constrain_layout, channel_id, use_global_device_ids);
@@ -1756,11 +1754,14 @@ HloInstruction::CreateAllGatherStart(
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateReduceScatter(
-    const Shape& shape, absl::Span<HloInstruction* const> operands,
-    HloComputation* reduce_computation, const CollectiveDeviceList& device_list,
-    bool constrain_layout, const std::optional<int64_t>& channel_id,
-    bool use_global_device_ids, int64_t scatter_dimension) {
+HloInstruction::CreateReduceScatter(const Shape& shape,
+                                    absl::Span<HloInstruction* const> operands,
+                                    HloComputation* reduce_computation,
+                                    const CollectiveDeviceListBase& device_list,
+                                    bool constrain_layout,
+                                    const std::optional<int64_t>& channel_id,
+                                    bool use_global_device_ids,
+                                    int64_t scatter_dimension) {
   return std::make_unique<HloReduceScatterInstruction>(
       shape, operands, reduce_computation, device_list, constrain_layout,
       channel_id, use_global_device_ids, scatter_dimension);
@@ -1779,13 +1780,11 @@ HloInstruction::CreateReduceScatter(
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateAllReduceStart(const Shape& shape,
-                                     absl::Span<HloInstruction* const> operands,
-                                     HloComputation* reduce_computation,
-                                     const CollectiveDeviceList& device_list,
-                                     bool constrain_layout,
-                                     const std::optional<int64_t>& channel_id,
-                                     bool use_global_device_ids) {
+HloInstruction::CreateAllReduceStart(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    HloComputation* reduce_computation,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
+    const std::optional<int64_t>& channel_id, bool use_global_device_ids) {
   return std::make_unique<HloAllReduceInstruction>(
       HloOpcode::kAllReduceStart, shape, operands, reduce_computation,
       device_list, constrain_layout, channel_id, use_global_device_ids);
@@ -1804,7 +1803,7 @@ HloInstruction::CreateAllReduceStart(
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateAllToAll(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceList& device_list, bool constrain_layout,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id,
     const std::optional<int64_t>& split_dimension) {
   return std::make_unique<HloAllToAllInstruction>(shape, operands, device_list,
@@ -1822,10 +1821,10 @@ HloInstruction::CreateAllReduceStart(
 }
 
 /* static */ std::unique_ptr<HloInstruction>
-HloInstruction::CreateRaggedAllToAll(const Shape& shape,
-                                     absl::Span<HloInstruction* const> operands,
-                                     const CollectiveDeviceList& device_list,
-                                     const std::optional<int64_t>& channel_id) {
+HloInstruction::CreateRaggedAllToAll(
+    const Shape& shape, absl::Span<HloInstruction* const> operands,
+    const CollectiveDeviceListBase& device_list,
+    const std::optional<int64_t>& channel_id) {
   return std::make_unique<HloRaggedAllToAllInstruction>(
       shape, operands, device_list, channel_id);
 }
@@ -1842,7 +1841,7 @@ HloInstruction::CreateRaggedAllToAll(
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateCollectiveBroadcast(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceList& device_list, bool constrain_layout,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id) {
   return std::make_unique<HloCollectiveBroadcastInstruction>(
       HloOpcode::kCollectiveBroadcast, shape, operands, device_list,
@@ -5814,7 +5813,7 @@ const std::vector<ReplicaGroup>& HloInstruction::replica_groups() const {
   return Cast<HloCollectiveInstruction>(this)->replica_groups();
 }
 
-const CollectiveDeviceList& HloInstruction::device_list() const {
+const CollectiveDeviceListBase& HloInstruction::device_list() const {
   return Cast<HloCollectiveInstruction>(this)->device_list();
 }
 
diff --git a/third_party/xla/xla/hlo/ir/hlo_instruction.h b/third_party/xla/xla/hlo/ir/hlo_instruction.h
index d62cb20233986d..40a178fa26958e 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instruction.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instruction.h
@@ -496,7 +496,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   // order of inputs from different participants.
   static std::unique_ptr<HloInstruction> CreateAllGather(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      int64_t all_gather_dimension, const CollectiveDeviceList& device_list,
+      int64_t all_gather_dimension, const CollectiveDeviceListBase& device_list,
       bool constrain_layout, const std::optional<int64_t>& channel_id,
       bool use_global_device_ids);
 
@@ -516,7 +516,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   // conjunction of a AllGatherDone op that synchronizes and returns the result.
   static std::unique_ptr<HloInstruction> CreateAllGatherStart(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      int64_t all_gather_dimension, const CollectiveDeviceList& device_list,
+      int64_t all_gather_dimension, const CollectiveDeviceListBase& device_list,
       bool constrain_layout, const std::optional<int64_t>& channel_id,
       bool use_global_device_ids);
 
@@ -543,7 +543,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   static std::unique_ptr<HloInstruction> CreateAllReduce(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id, bool use_global_device_ids);
 
   ABSL_DEPRECATED("Use CollectiveDeviceList instead of list of ReplicaGroup.")
@@ -559,7 +559,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   static std::unique_ptr<HloInstruction> CreateReduceScatter(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id, bool use_global_device_ids,
       int64_t scatter_dimension);
 
@@ -587,7 +587,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   static std::unique_ptr<HloInstruction> CreateAllReduceStart(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id, bool use_global_device_ids);
 
   ABSL_DEPRECATED("Use CollectiveDeviceList instead of list of ReplicaGroup.")
@@ -625,7 +625,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   // performs AllToAll and then concatenates the results into a single array.
   static std::unique_ptr<HloInstruction> CreateAllToAll(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id,
       const std::optional<int64_t>& split_dimension = std::nullopt);
 
@@ -733,7 +733,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   //
   static std::unique_ptr<HloInstruction> CreateRaggedAllToAll(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceList& device_list,
+      const CollectiveDeviceListBase& device_list,
       const std::optional<int64_t>& channel_id);
 
   ABSL_DEPRECATED("Use CollectiveDeviceList instead of list of ReplicaGroup.")
@@ -748,7 +748,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   // on that replica is a tensor consists of 0(s) in `shape`.
   static std::unique_ptr<HloInstruction> CreateCollectiveBroadcast(
       const Shape& shape, absl::Span<HloInstruction* const> operand,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id);
 
   ABSL_DEPRECATED("Use CollectiveDeviceList instead of list of ReplicaGroup.")
@@ -2308,7 +2308,7 @@ class alignas(kInstructionTypeMask + 1) HloInstruction {
   const std::vector<ReplicaGroup>& replica_groups() const;
 
   // Delegates to HloCollectiveInstruction::device_list.
-  const CollectiveDeviceList& device_list() const;
+  const CollectiveDeviceListBase& device_list() const;
 
   // Delegates to HloCollectivePermuteInstruction::source_target_pairs.
   const std::vector<std::pair<int64_t, int64_t>>& source_target_pairs() const;
diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.cc b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
index c8a09b07ce03c1..cc8459830044e9 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.cc
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.cc
@@ -918,10 +918,10 @@ HloRecvDoneInstruction::CloneWithNewOperandsImpl(
 HloCollectiveInstruction::HloCollectiveInstruction(
     HloOpcode opcode, const Shape& shape,
     absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceList& device_list, bool constrain_layout,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id)
     : HloChannelInstruction(opcode, shape, channel_id),
-      device_list_(std::make_shared<CollectiveDeviceList>(device_list)),
+      device_list_(device_list.Clone()),
       constrain_layout_(constrain_layout) {
   for (auto operand : operands) {
     AppendOperand(operand);
@@ -985,7 +985,7 @@ bool HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues(
 HloAllGatherInstruction::HloAllGatherInstruction(
     HloOpcode opcode, const Shape& shape,
     absl::Span<HloInstruction* const> operands, int64_t all_gather_dimension,
-    const CollectiveDeviceList& device_list, bool constrain_layout,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id, bool use_global_device_ids)
     : HloCollectiveInstruction(opcode, shape, operands, device_list,
                                constrain_layout, channel_id),
@@ -1045,9 +1045,9 @@ bool HloAllGatherInstruction::IdenticalSlowPathIgnoringChannelIdValues(
 HloAllReduceInstructionBase::HloAllReduceInstructionBase(
     HloOpcode opcode, const Shape& shape,
     absl::Span<HloInstruction* const> operands,
-    HloComputation* reduce_computation, const CollectiveDeviceList& device_list,
-    bool constrain_layout, const std::optional<int64_t>& channel_id,
-    bool use_global_device_ids)
+    HloComputation* reduce_computation,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
+    const std::optional<int64_t>& channel_id, bool use_global_device_ids)
     : HloCollectiveInstruction(opcode, shape, operands, device_list,
                                constrain_layout, channel_id),
       use_global_device_ids_(use_global_device_ids) {
@@ -1106,9 +1106,10 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl(
 
 HloReduceScatterInstruction::HloReduceScatterInstruction(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    HloComputation* reduce_computation, const CollectiveDeviceList& device_list,
-    bool constrain_layout, const std::optional<int64_t>& channel_id,
-    bool use_global_device_ids, int64_t scatter_dimension)
+    HloComputation* reduce_computation,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
+    const std::optional<int64_t>& channel_id, bool use_global_device_ids,
+    int64_t scatter_dimension)
     : HloAllReduceInstructionBase(
           HloOpcode::kReduceScatter, shape, operands, reduce_computation,
           device_list, constrain_layout, channel_id, use_global_device_ids),
@@ -1161,7 +1162,7 @@ HloReduceScatterInstruction::CloneWithNewOperandsImpl(
 
 HloAllToAllInstruction::HloAllToAllInstruction(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceList& device_list, bool constrain_layout,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id,
     const std::optional<int64_t>& split_dimension)
     : HloCollectiveInstruction(HloOpcode::kAllToAll, shape, operands,
@@ -1216,7 +1217,7 @@ bool HloAllToAllInstruction::IdenticalSlowPathIgnoringChannelIdValues(
 
 HloRaggedAllToAllInstruction::HloRaggedAllToAllInstruction(
     const Shape& shape, absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceList& device_list,
+    const CollectiveDeviceListBase& device_list,
     const std::optional<int64_t>& channel_id)
     : HloCollectiveInstruction(HloOpcode::kRaggedAllToAll, shape, operands,
                                device_list,
@@ -1251,7 +1252,7 @@ void HloRaggedAllToAllInstruction::PrintExtraAttributesImpl(
 HloCollectiveBroadcastInstruction::HloCollectiveBroadcastInstruction(
     HloOpcode opcode, const Shape& shape,
     absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceList& device_list, bool constrain_layout,
+    const CollectiveDeviceListBase& device_list, bool constrain_layout,
     const std::optional<int64_t>& channel_id)
     : HloCollectiveInstruction(opcode, shape, operands, device_list,
                                constrain_layout, channel_id) {}
diff --git a/third_party/xla/xla/hlo/ir/hlo_instructions.h b/third_party/xla/xla/hlo/ir/hlo_instructions.h
index 5b87d2636778be..88f902c9b093f8 100644
--- a/third_party/xla/xla/hlo/ir/hlo_instructions.h
+++ b/third_party/xla/xla/hlo/ir/hlo_instructions.h
@@ -651,7 +651,7 @@ class HloCollectiveInstruction : public HloChannelInstruction {
     return device_list_->replica_groups();
   }
 
-  const CollectiveDeviceList& device_list() const {
+  const CollectiveDeviceListBase& device_list() const {
     const CollectiveDeviceList* device_list_v1 =
         dynamic_cast<const CollectiveDeviceList*>(device_list_.get());
     // TODO(b/468442352): After XLA codebase is genericized to utilize
@@ -683,8 +683,8 @@ class HloCollectiveInstruction : public HloChannelInstruction {
   explicit HloCollectiveInstruction(
       HloOpcode opcode, const Shape& shape,
       absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceList& collective_device_list, bool constrain_layout,
-      const std::optional<int64_t>& channel_id);
+      const CollectiveDeviceListBase& collective_device_list,
+      bool constrain_layout, const std::optional<int64_t>& channel_id);
 
   HloInstructionProto ToProto() const override;
 
@@ -704,7 +704,7 @@ class HloAllGatherInstruction : public HloCollectiveInstruction {
   explicit HloAllGatherInstruction(HloOpcode opcode, const Shape& shape,
                                    absl::Span<HloInstruction* const> operands,
                                    int64_t all_gather_dimension,
-                                   const CollectiveDeviceList& device_list,
+                                   const CollectiveDeviceListBase& device_list,
                                    bool constrain_layout,
                                    const std::optional<int64_t>& channel_id,
                                    bool use_global_device_ids);
@@ -760,7 +760,7 @@ class HloAllReduceInstructionBase : public HloCollectiveInstruction {
       HloOpcode opcode, const Shape& shape,
       absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id, bool use_global_device_ids);
 
   // Returns true if the ids in the ReplicaGroup config represent a global id of
@@ -817,7 +817,7 @@ class HloReduceScatterInstruction : public HloAllReduceInstructionBase {
   explicit HloReduceScatterInstruction(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
       HloComputation* reduce_computation,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id, bool use_global_device_ids,
       int64_t scatter_dimension);
 
@@ -862,7 +862,7 @@ class HloAllToAllInstruction : public HloCollectiveInstruction {
  public:
   explicit HloAllToAllInstruction(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id,
       const std::optional<int64_t>& split_dimension);
 
@@ -910,7 +910,7 @@ class HloRaggedAllToAllInstruction : public HloCollectiveInstruction {
  public:
   explicit HloRaggedAllToAllInstruction(
       const Shape& shape, absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceList& device_list,
+      const CollectiveDeviceListBase& device_list,
       const std::optional<int64_t>& channel_id);
 
   ABSL_DEPRECATED("Use CollectiveDeviceList instead of list of ReplicaGroup.")
@@ -941,7 +941,7 @@ class HloCollectiveBroadcastInstruction : public HloCollectiveInstruction {
   explicit HloCollectiveBroadcastInstruction(
       HloOpcode opcode, const Shape& shape,
       absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceList& device_list, bool constrain_layout,
+      const CollectiveDeviceListBase& device_list, bool constrain_layout,
       const std::optional<int64_t>& channel_id);
 
   ABSL_DEPRECATED("Use CollectiveDeviceList instead of list of ReplicaGroup.")
diff --git a/third_party/xla/xla/hlo/ir/replica_group.cc b/third_party/xla/xla/hlo/ir/replica_group.cc
index c7a8ecc2eb8f25..d0c67c7afd6387 100644
--- a/third_party/xla/xla/hlo/ir/replica_group.cc
+++ b/third_party/xla/xla/hlo/ir/replica_group.cc
@@ -489,4 +489,31 @@ CollectiveDeviceList CollectiveDeviceList::FromProto(
   return FromProto(proto.collective_device_list());
 }
 
+CollectiveDeviceList ConvertToV1CollectiveDeviceList(
+    const CollectiveDeviceListBase& device_list) {
+  switch (device_list.version()) {
+    case CollectiveDeviceListVersion::kListOfLists: {
+      return dynamic_cast<const CollectiveDeviceList&>(device_list);
+    }
+    case CollectiveDeviceListVersion::kIota: {
+      if (const auto* v2 =
+              dynamic_cast<const IotaReplicaGroupList*>(&device_list)) {
+        return CollectiveDeviceList(*v2);
+      }
+      const auto* v1 = dynamic_cast<const CollectiveDeviceList*>(&device_list);
+      CHECK(v1 != nullptr) << "Failed to convert kIota to V1 list.";
+      return *v1;
+    }
+    case CollectiveDeviceListVersion::kMeshAxes: {
+      const auto* v3 =
+          dynamic_cast<const MeshAxesReplicaGroupList*>(&device_list);
+      CHECK(v3 != nullptr) << "Failed to convert kMeshAxes to V1 list.";
+      return v3->ToCollectiveDeviceList();
+    }
+    default:
+      LOG(FATAL) << "Unknown CollectiveDeviceListVersion: "
+                 << static_cast<int>(device_list.version());
+  }
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/hlo/ir/replica_group.h b/third_party/xla/xla/hlo/ir/replica_group.h
index 92e23d1310dde5..c55e5802a2d4bf 100644
--- a/third_party/xla/xla/hlo/ir/replica_group.h
+++ b/third_party/xla/xla/hlo/ir/replica_group.h
@@ -53,6 +53,19 @@ class CollectiveDeviceListBase {
   CollectiveDeviceListBase(CollectiveDeviceListBase&&) = default;
   CollectiveDeviceListBase& operator=(CollectiveDeviceListBase&&) = default;
 
+  // This is strict equality, which means that two different types
+  // can't be compared for functional equality (i.e. even though an
+  // IotaReplicaGroup and a CollectiveDeviceList may correspond to the same
+  // underlying set of device groups, they will compare as unequal).
+  friend bool operator==(const CollectiveDeviceListBase& lhs,
+                         const CollectiveDeviceListBase& rhs) {
+    if (typeid(lhs) != typeid(rhs)) {
+      return false;
+    }
+    // If types are the same, delegate to the derived implementation
+    return lhs.isEqual(rhs);
+  }
+
   virtual int64_t num_replica_groups() const = 0;
   virtual int64_t num_devices_per_group() const = 0;
   int64_t num_total_devices() const {
@@ -89,6 +102,9 @@ class CollectiveDeviceListBase {
 
   // shared_ptr for fast copy.
   mutable std::shared_ptr<std::vector<ReplicaGroup>> replica_groups_ = nullptr;
+
+ protected:
+  virtual bool isEqual(const CollectiveDeviceListBase& other) const = 0;
 };
 
 class MeshAxesReplicaGroupList : public CollectiveDeviceListBase {
@@ -129,6 +145,13 @@ class MeshAxesReplicaGroupList : public CollectiveDeviceListBase {
   IotaReplicaGroupList ToIotaReplicaGroupList() const;
   CollectiveDeviceList ToCollectiveDeviceList() const;
 
+ protected:
+  bool isEqual(const CollectiveDeviceListBase& other) const override {
+    const MeshAxesReplicaGroupList& rhs =
+        static_cast<const MeshAxesReplicaGroupList&>(other);
+    return *this == rhs;
+  }
+
  private:
   absl::flat_hash_map<int64_t, ReshapeAndAggregateAxes>
   GetDimToReshapeAndAggregateAxes() const;
@@ -203,6 +226,13 @@ class IotaReplicaGroupList : public CollectiveDeviceListBase {
 
   static IotaReplicaGroupList FromProto(const IotaReplicaGroupListProto& proto);
 
+ protected:
+  bool isEqual(const CollectiveDeviceListBase& other) const override {
+    const IotaReplicaGroupList& rhs =
+        static_cast<const IotaReplicaGroupList&>(other);
+    return *this == rhs;
+  }
+
  private:
   IotaTileAssignment iota_tile_assignment_;
   int64_t num_replica_groups_ = -1;
@@ -306,6 +336,13 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
     return std::make_unique<CollectiveDeviceList>(*this);
   };
 
+ protected:
+  bool isEqual(const CollectiveDeviceListBase& other) const override {
+    const CollectiveDeviceList& rhs =
+        static_cast<const CollectiveDeviceList&>(other);
+    return *this == rhs;
+  }
+
  private:
   // Construct collective device list from protobuf replica group start and end
   // iterators.
@@ -333,7 +370,7 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
   std::optional<IotaReplicaGroupList> iota_replica_group_list_;
 };
 
-std::optional<CollectiveDeviceList> ConvertToV1CollectiveDeviceList(
+CollectiveDeviceList ConvertToV1CollectiveDeviceList(
     const CollectiveDeviceListBase& device_list);
 
 }  // namespace xla
diff --git a/third_party/xla/xla/service/collective_ops_utils.cc b/third_party/xla/xla/service/collective_ops_utils.cc
index 45ee3a096c9708..b8ff16b756eec9 100644
--- a/third_party/xla/xla/service/collective_ops_utils.cc
+++ b/third_party/xla/xla/service/collective_ops_utils.cc
@@ -238,7 +238,8 @@ absl::StatusOr<CollectiveOpGroupMode> GetCollectiveOpGroupMode(
   return Internal("Unexpected instruction type.");
 }
 
-const CollectiveDeviceList& GetCollectiveDeviceList(const HloInstruction* hlo) {
+const CollectiveDeviceListBase& GetCollectiveDeviceList(
+    const HloInstruction* hlo) {
   return Cast<HloCollectiveInstruction>(hlo)->device_list();
 }
 
@@ -375,21 +376,23 @@ GetParticipatingDevicesGroups(const HloInstruction* collective) {
       device_assignment, GetCollectiveReplicaGroups(collective), mode);
 }
 
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(
     const DeviceAssignment& device_assignment,
-    const CollectiveDeviceList& collective_device_list,
+    const CollectiveDeviceListBase& collective_device_list,
     CollectiveOpGroupMode group_mode) {
   return GetParticipatingFlattenedIdGroups(
       collective_device_list, group_mode, device_assignment.replica_count(),
       device_assignment.computation_count());
 }
 
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
-    const CollectiveDeviceList& collective_device_list,
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(
+    const CollectiveDeviceListBase& collective_device_list,
     CollectiveOpGroupMode group_mode, int replica_count, int partition_count) {
   if (group_mode ==
       CollectiveOpGroupMode::COLLECTIVE_OP_GROUP_MODE_FLATTENED_ID) {
-    return collective_device_list;
+    return collective_device_list.Clone();
   }
   std::vector<ReplicaGroup> filled_empty_replica_group;
   absl::Span<const ReplicaGroup> original_replica_groups =
@@ -456,27 +459,29 @@ absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
       }
     }
   }
-  return CollectiveDeviceList(flattened_replica_groups);
+  return std::make_unique<CollectiveDeviceList>(flattened_replica_groups);
 }
 
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
-    const HloInstruction* hlo, const DeviceAssignment& device_assignment) {
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(const HloInstruction* hlo,
+                                  const DeviceAssignment& device_assignment) {
   TF_ASSIGN_OR_RETURN(CollectiveOpGroupMode mode,
                       GetCollectiveOpGroupMode(hlo));
   TF_ASSIGN_OR_RETURN(
-      CollectiveDeviceList collective_device_list,
+      std::unique_ptr<CollectiveDeviceListBase> collective_device_list,
       GetParticipatingFlattenedIdGroups(device_assignment,
                                         GetCollectiveDeviceList(hlo), mode));
   return collective_device_list;
 }
 
 // Same as above, used for cases where static_device_assignment is not present.
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
-    const HloInstruction* hlo, int replica_count, int partition_count) {
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(const HloInstruction* hlo, int replica_count,
+                                  int partition_count) {
   TF_ASSIGN_OR_RETURN(CollectiveOpGroupMode mode,
                       GetCollectiveOpGroupMode(hlo));
   TF_ASSIGN_OR_RETURN(
-      CollectiveDeviceList collective_device_list,
+      std::unique_ptr<CollectiveDeviceListBase> collective_device_list,
       GetParticipatingFlattenedIdGroups(GetCollectiveDeviceList(hlo), mode,
                                         replica_count, partition_count));
   return collective_device_list;
@@ -664,13 +669,12 @@ absl::StatusOr<std::vector<int64_t>> GetPariticipantCountsForReplicaGroups(
 
 absl::StatusOr<std::optional<std::pair<int64_t, int64_t>>>
 GetReplicaGroupCountAndSize(const HloInstruction* hlo) {
-  const CollectiveDeviceList& device_list = GetCollectiveDeviceList(hlo);
+  const CollectiveDeviceListBase& device_list = GetCollectiveDeviceList(hlo);
   auto config = hlo->GetModule()->config();
 
-  if (device_list.iota_replica_group_list().has_value()) {
-    return std::make_pair(
-        device_list.iota_replica_group_list()->num_replica_groups(),
-        device_list.iota_replica_group_list()->num_devices_per_group());
+  if (device_list.version() == CollectiveDeviceListVersion::kIota) {
+    return std::make_pair(device_list.num_replica_groups(),
+                          device_list.num_devices_per_group());
   }
   TF_ASSIGN_OR_RETURN(CollectiveOpGroupMode group_mode,
                       GetCollectiveOpGroupMode(hlo));
diff --git a/third_party/xla/xla/service/collective_ops_utils.h b/third_party/xla/xla/service/collective_ops_utils.h
index adaa3acc37ac49..bdb81ed62cae5b 100644
--- a/third_party/xla/xla/service/collective_ops_utils.h
+++ b/third_party/xla/xla/service/collective_ops_utils.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define XLA_SERVICE_COLLECTIVE_OPS_UTILS_H_
 
 #include <cstdint>
+#include <memory>
 #include <optional>
 #include <string>
 #include <utility>
@@ -80,7 +81,8 @@ absl::StatusOr<std::vector<int>> GetParticipatingIDs(
 absl::StatusOr<std::vector<std::vector<int64_t>>> GetAsyncReplicaGroups(
     const HloInstruction* instruction);
 
-const CollectiveDeviceList& GetCollectiveDeviceList(const HloInstruction* hlo);
+const CollectiveDeviceListBase& GetCollectiveDeviceList(
+    const HloInstruction* hlo);
 
 const std::vector<ReplicaGroup>& GetCollectiveReplicaGroups(
     const HloInstruction* hlo);
@@ -129,24 +131,28 @@ GetParticipatingDevicesGroups(const HloInstruction* collective);
 
 // Same as above, except that it returns the flattened id in the replica groups
 // instead of device id.
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(
     const DeviceAssignment& device_assignment,
-    const CollectiveDeviceList& collective_device_list,
+    const CollectiveDeviceListBase& collective_device_list,
     CollectiveOpGroupMode group_mode);
 
 // Same as above, but take replica/partition count instead of device assignment.
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
-    const CollectiveDeviceList& collective_device_list,
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(
+    const CollectiveDeviceListBase& collective_device_list,
     CollectiveOpGroupMode group_mode, int replica_count, int partition_count);
 
 // Same as above, with collective group mode determined by the collective
 // instruction.
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
-    const HloInstruction* hlo, const DeviceAssignment& device_assignment);
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(const HloInstruction* hlo,
+                                  const DeviceAssignment& device_assignment);
 
 // Same as above, used for cases where static_device_assignment is not present.
-absl::StatusOr<CollectiveDeviceList> GetParticipatingFlattenedIdGroups(
-    const HloInstruction* hlo, int replica_count, int partition_count);
+absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+GetParticipatingFlattenedIdGroups(const HloInstruction* hlo, int replica_count,
+                                  int partition_count);
 
 // Figures out which devices are participating in the collective subgroup.
 absl::StatusOr<std::vector<GlobalDeviceId>> GetParticipatingDevices(
diff --git a/third_party/xla/xla/service/collective_ops_utils_test.cc b/third_party/xla/xla/service/collective_ops_utils_test.cc
index d7e9c91ee9c438..6d762d9b13b7df 100644
--- a/third_party/xla/xla/service/collective_ops_utils_test.cc
+++ b/third_party/xla/xla/service/collective_ops_utils_test.cc
@@ -57,11 +57,11 @@ using CycleType = collective_permute_cycle::CycleType;
 
 // Creates a container of ReplicaGroups.
 std::vector<ReplicaGroup> CreateReplicaGroups(
-    const std::vector<std::vector<int64_t>> &replica_groups) {
+    const std::vector<std::vector<int64_t>>& replica_groups) {
   std::vector<ReplicaGroup> result;
   result.reserve(replica_groups.size());
-  for (const auto &replica_group : replica_groups) {
-    ReplicaGroup &group = result.emplace_back();
+  for (const auto& replica_group : replica_groups) {
+    ReplicaGroup& group = result.emplace_back();
     for (auto id : replica_group) {
       group.add_replica_ids(id);
     }
@@ -116,7 +116,7 @@ TEST(CollectiveOpsUtilsTest, CollectiveWithChannelId) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnUnverifiedModule(hlo_string));
 
-  HloInstruction *all_gather =
+  HloInstruction* all_gather =
       module->entry_computation()->GetInstructionWithName("all-gather");
 
   EXPECT_EQ(IsOrHasCollectiveWithChannelId(all_gather), all_gather);
@@ -138,10 +138,10 @@ TEST(CollectiveOpsUtilsTest, IsNonFusionCollectiveSendRecv) {
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnUnverifiedModule(hlo_string));
 
-  HloInstruction *recv_ctx =
+  HloInstruction* recv_ctx =
       module->entry_computation()->GetInstructionWithName("recv_ctx");
   ASSERT_NE(recv_ctx, nullptr);
-  HloInstruction *send_ctx =
+  HloInstruction* send_ctx =
       module->entry_computation()->GetInstructionWithName("send_ctx");
   ASSERT_NE(send_ctx, nullptr);
 
@@ -160,7 +160,7 @@ TEST(CollectiveOpsUtilsTest, CollectiveWithChannelId2) {
       HloInstruction * param_0,
       builder.AddParameter(HloInstruction::CreateParameter(
           0, ShapeUtil::MakeShape(BF16, {1, 512, 4096}), "p0")));
-  HloInstruction *instr =
+  HloInstruction* instr =
       builder.AddInstruction(HloInstruction::CreateAllGather(
           ShapeUtil::MakeShape(BF16, {1, 4096, 4096}), {param_0}, 1,
           CollectiveDeviceList(std::vector<ReplicaGroup>({group})), true, 231,
@@ -178,7 +178,7 @@ TEST(CollectiveOpsUtilsTest, CollectiveWithChannelId2) {
       HloInstruction * param_1,
       builder2.AddParameter(HloInstruction::CreateParameter(
           0, ShapeUtil::MakeShape(BF16, {1, 512, 4096}), "p1")));
-  HloInstruction *instr_without_channel_id =
+  HloInstruction* instr_without_channel_id =
       builder2.AddInstruction(HloInstruction::CreateAllGather(
           ShapeUtil::MakeShape(BF16, {1, 4096, 4096}), {param_1}, 1, {group},
           true, std::nullopt, true));
@@ -191,7 +191,6 @@ TEST(CollectiveOpsUtilsTest, CollectiveWithChannelId2) {
   EXPECT_EQ(IsOrHasCollectiveWithChannelId(fusion2.get()), nullptr);
 }
 
-
 TEST(IsExclusivelyCrossModuleTest, CrossReplicaNoChannelSet) {
   int64_t num_replicas = 4;
   int64_t num_partitions = 2;
@@ -280,14 +279,14 @@ TEST(CollectiveOpsUtilsTest, GetReplicaGroups) {
   // Set up a collective permute start instruction
   auto builder = HloComputation::Builder("GetReplicaGroupsTest");
   auto param_shape = ShapeUtil::MakeShape(F32, {4, 4});
-  HloInstruction *param_0 = builder.AddInstruction(
+  HloInstruction* param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "p0"));
 
   // Test for CollectivePermuteStart
   std::vector<std::pair<int64_t, int64_t>> source_target_pairs = {
       {0, 1}, {1, 2}, {2, 3}, {3, 0}};
 
-  HloInstruction *permute_start =
+  HloInstruction* permute_start =
       builder.AddInstruction(HloInstruction::CreateCollectivePermuteStart(
           param_shape, param_0, source_target_pairs, /*channel_id=*/1));
 
@@ -303,7 +302,7 @@ TEST(CollectiveOpsUtilsTest, GetReplicaGroups) {
   // Test for AllGatherStart
   std::vector<ReplicaGroup> replica_groups =
       CreateReplicaGroups({{0, 1}, {2, 3}});
-  HloInstruction *all_gather_start =
+  HloInstruction* all_gather_start =
       builder.AddInstruction(HloInstruction::CreateAllGatherStart(
           ShapeUtil::MakeTupleShape({param_shape, param_shape}), {param_0},
           /*all_gather_dimension=*/0, replica_groups,
@@ -326,10 +325,10 @@ TEST(CollectiveOpsUtilsTest, GetReplicaGroups) {
   reducer_builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeScalarShape(F32), HloOpcode::kAdd, reducer_x, reducer_y));
 
-  HloComputation *add_computation =
+  HloComputation* add_computation =
       module.AddEmbeddedComputation(reducer_builder.Build());
 
-  HloInstruction *all_reduce_start =
+  HloInstruction* all_reduce_start =
       builder.AddInstruction(HloInstruction::CreateAllReduceStart(
           ShapeUtil::MakeTupleShape({param_shape, param_shape}), {param_0},
           add_computation, replica_groups, /*constrain_layout=*/false,
@@ -347,14 +346,14 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   HloModule module("test_module", HloModuleConfig());
   auto builder = HloComputation::Builder("IsAsyncCollectiveTest");
   auto param_shape = ShapeUtil::MakeShape(F32, {4, 4});
-  HloInstruction *param_0 = builder.AddInstruction(
+  HloInstruction* param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, param_shape, "p0"));
 
   // Test for CollectivePermuteStart and CollectivePermuteDone
   std::vector<std::pair<int64_t, int64_t>> source_target_pairs = {
       {0, 1}, {1, 2}, {2, 3}, {3, 0}};
 
-  HloInstruction *permute_start =
+  HloInstruction* permute_start =
       builder.AddInstruction(HloInstruction::CreateCollectivePermuteStart(
           param_shape, param_0, source_target_pairs, /*channel_id=*/1));
 
@@ -362,7 +361,7 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   EXPECT_TRUE(is_async_status.ok());
   EXPECT_TRUE(is_async_status.value());
 
-  HloInstruction *permute_done =
+  HloInstruction* permute_done =
       builder.AddInstruction(HloInstruction::CreateUnary(
           param_shape, HloOpcode::kCollectivePermuteDone, permute_start));
 
@@ -374,7 +373,7 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   std::vector<ReplicaGroup> replica_groups =
       CreateReplicaGroups({{0, 1}, {2, 3}});
 
-  HloInstruction *all_gather_start =
+  HloInstruction* all_gather_start =
       builder.AddInstruction(HloInstruction::CreateAllGatherStart(
           ShapeUtil::MakeTupleShape(
               {ShapeUtil::MakeShape(F32, {8, 4}), param_shape}),
@@ -386,7 +385,7 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   EXPECT_TRUE(is_async_status.ok());
   EXPECT_TRUE(is_async_status.value());
 
-  HloInstruction *all_gather_done = builder.AddInstruction(
+  HloInstruction* all_gather_done = builder.AddInstruction(
       HloInstruction::CreateUnary(ShapeUtil::MakeShape(F32, {8, 4}),
                                   HloOpcode::kAllGatherDone, all_gather_start));
 
@@ -397,17 +396,17 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   // Test for AllReduceStart and AllReduceDone
   // First create a reduction computation
   HloComputation::Builder reducer_builder("add");
-  HloInstruction *reducer_x = reducer_builder.AddInstruction(
+  HloInstruction* reducer_x = reducer_builder.AddInstruction(
       HloInstruction::CreateParameter(0, ShapeUtil::MakeScalarShape(F32), "x"));
-  HloInstruction *reducer_y = reducer_builder.AddInstruction(
+  HloInstruction* reducer_y = reducer_builder.AddInstruction(
       HloInstruction::CreateParameter(1, ShapeUtil::MakeScalarShape(F32), "y"));
   reducer_builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeScalarShape(F32), HloOpcode::kAdd, reducer_x, reducer_y));
 
-  HloComputation *add_computation =
+  HloComputation* add_computation =
       module.AddEmbeddedComputation(reducer_builder.Build());
 
-  HloInstruction *all_reduce_start =
+  HloInstruction* all_reduce_start =
       builder.AddInstruction(HloInstruction::CreateAllReduceStart(
           ShapeUtil::MakeTupleShape({param_shape, param_shape}), {param_0},
           add_computation, replica_groups, /*constrain_layout=*/false,
@@ -417,7 +416,7 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   EXPECT_TRUE(is_async_status.ok());
   EXPECT_TRUE(is_async_status.value());
 
-  HloInstruction *all_reduce_done =
+  HloInstruction* all_reduce_done =
       builder.AddInstruction(HloInstruction::CreateUnary(
           param_shape, HloOpcode::kAllReduceDone, all_reduce_start));
 
@@ -426,7 +425,7 @@ TEST(CollectiveOpsUtilsTest, IsAsyncCollective) {
   EXPECT_TRUE(is_async_status.value());
 
   // Test for regular CollectivePermute (non-async)
-  HloInstruction *permute =
+  HloInstruction* permute =
       builder.AddInstruction(HloInstruction::CreateCollectivePermute(
           param_shape, param_0, source_target_pairs, /*channel_id=*/1));
 
@@ -612,7 +611,7 @@ std::vector<TestCase> GetTestCases() {
 class GetCollectOpGroupModeTest : public testing::TestWithParam<TestCase> {};
 
 TEST_P(GetCollectOpGroupModeTest, Test) {
-  const TestCase &tc = GetParam();
+  const TestCase& tc = GetParam();
   absl::StatusOr<CollectiveOpGroupMode> actual =
       GetCollectiveOpGroupMode(tc.has_channel_id, tc.use_global_device_ids);
   if (tc.expected) {
@@ -681,13 +680,13 @@ absl::StatusOr<std::unique_ptr<HloComputation>> CreateMaxComputation() {
   TF_ASSIGN_OR_RETURN(HloInstruction * b,
                       builder_max.AddParameter(
                           HloInstruction::CreateParameter(1, scalar, "b")));
-  HloInstruction *max = builder_max.AddInstruction(
+  HloInstruction* max = builder_max.AddInstruction(
       HloInstruction::CreateBinary(scalar, HloOpcode::kMaximum, a, b), "max");
   return builder_max.Build(max);
 }
 
 TEST_P(GetCollectOpGroupModeTestForInstruction, Test) {
-  const TestCaseForInstruction &test_case = GetParam();
+  const TestCaseForInstruction& test_case = GetParam();
   ReplicaGroup group;
   for (int k = 0; k < 4; ++k) {
     group.add_replica_ids(k);
@@ -712,7 +711,7 @@ TEST_P(GetCollectOpGroupModeTestForInstruction, Test) {
                           builder.AddParameter(HloInstruction::CreateParameter(
                               0, two_elements, "parameter")));
 
-  HloInstruction *collective;
+  HloInstruction* collective;
   switch (test_case.op_code) {
     case HloOpcode::kAllGather:
       collective = builder.AddInstruction(HloInstruction::CreateAllGather(
@@ -823,7 +822,7 @@ std::string TestCase::ToString() const {
   return s.str();
 }
 
-std::ostream &operator<<(std::ostream &os, const TestCase &tc) {
+std::ostream& operator<<(std::ostream& os, const TestCase& tc) {
   os << tc.ToString();
   return os;
 }
@@ -1077,7 +1076,7 @@ std::vector<TestCase> GetTestCases() {
 class GetParticipatingTest : public testing::TestWithParam<TestCase> {};
 
 TEST_P(GetParticipatingTest, Test) {
-  const TestCase &tc = GetParam();
+  const TestCase& tc = GetParam();
 
   int64_t num_replicas = tc.device_assignment.n1();
   int64_t num_partitions = tc.device_assignment.n2();
@@ -1103,7 +1102,7 @@ TEST_P(GetParticipatingTest, Test) {
   }
 
   // Test GetParticipatingDevices.
-  for (const TestCase::CurrentIdAndOutput &subtest : tc.subtests) {
+  for (const TestCase::CurrentIdAndOutput& subtest : tc.subtests) {
     absl::StatusOr<std::vector<GlobalDeviceId>> actual =
         GetParticipatingDevices(GlobalDeviceId(subtest.current_id),
                                 device_assignment, replica_groups, *group_mode);
@@ -1144,15 +1143,15 @@ TEST_P(GetParticipatingTest, Test) {
               testing::UnorderedElementsAreArray(expect_device_groups));
 
   // Test GetParticipatingFlattenedIdGroups.
-  absl::StatusOr<CollectiveDeviceList> collective_device_list =
-      GetParticipatingFlattenedIdGroups(
+  absl::StatusOr<std::unique_ptr<CollectiveDeviceListBase>>
+      collective_device_list = GetParticipatingFlattenedIdGroups(
           device_assignment, CollectiveDeviceList(replica_groups), *group_mode);
   if (!collective_device_list.ok()) {
     EXPECT_TRUE(tc.expected_failure);
     return;
   }
-  const std::vector<ReplicaGroup> &actual_flattened_id_groups =
-      collective_device_list.value().replica_groups();
+  const std::vector<ReplicaGroup>& actual_flattened_id_groups =
+      collective_device_list.value()->replica_groups();
 
   std::vector<std::vector<int64_t>> actual_flattened_id_groups_int;
   actual_flattened_id_groups_int.reserve(actual_flattened_id_groups.size());
@@ -1192,16 +1191,16 @@ TEST_P(GetParticipatingTest, Test) {
       /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y"));
   sum_builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y));
-  HloComputation *reduction =
+  HloComputation* reduction =
       hlo_module.AddEmbeddedComputation(sum_builder.Build());
   HloComputation::Builder entry_builder("test_entry");
-  HloInstruction *operand = entry_builder.AddInstruction(
+  HloInstruction* operand = entry_builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0f)));
   std::optional<int64_t> channel_id = std::nullopt;
   if (tc.has_channel_id) {
     channel_id = 0;
   }
-  HloInstruction *ar =
+  HloInstruction* ar =
       entry_builder.AddInstruction(HloInstruction::CreateAllReduce(
           operand->shape(), {operand}, reduction, replica_groups,
           /*constrain_layout=*/false,
@@ -1241,7 +1240,7 @@ class GetPariticipantCountsForReplicaGroupsTest
     : public testing::TestWithParam<TestCase> {};
 
 TEST_P(GetPariticipantCountsForReplicaGroupsTest, Test) {
-  const TestCase &tc = GetParam();
+  const TestCase& tc = GetParam();
 
   std::vector<ReplicaGroup> replica_groups =
       CreateReplicaGroups(tc.replica_groups);
@@ -1294,7 +1293,7 @@ INSTANTIATE_TEST_SUITE_P(
     GetPariticipantCountsForReplicaGroupsTest,
     testing::ValuesIn(GetTestCases()),
     [](const testing::TestParamInfo<
-        GetPariticipantCountsForReplicaGroupsTest::ParamType> &info) {
+        GetPariticipantCountsForReplicaGroupsTest::ParamType>& info) {
       return info.param.test_name;
     });
 
diff --git a/third_party/xla/xla/service/gpu/model/collective_interpolator.cc b/third_party/xla/xla/service/gpu/model/collective_interpolator.cc
index 5663de4e143406..c2408e4e709ac1 100644
--- a/third_party/xla/xla/service/gpu/model/collective_interpolator.cc
+++ b/third_party/xla/xla/service/gpu/model/collective_interpolator.cc
@@ -101,13 +101,12 @@ struct InterpolationSpecification {
 // Returns number of participating devices in an input `device_list`. Supports
 // only `iota_replica_group_list`.
 absl::StatusOr<int> GetNumParticipatingDevices(
-    const CollectiveDeviceList& device_list) {
-  auto iota = device_list.iota_replica_group_list();
-  if (!iota.has_value()) {
+    const CollectiveDeviceListBase& device_list) {
+  if (device_list.version() != CollectiveDeviceListVersion::kIota) {
     return absl::FailedPreconditionError(
         "Only iota device assignment is supported.");
   }
-  return iota->num_devices_per_group();
+  return device_list.num_devices_per_group();
 }
 
 absl::StatusOr<InterpolationSpecification> Spec(
@@ -147,13 +146,16 @@ absl::StatusOr<InterpolationSpecification> Spec(
   TF_ASSIGN_OR_RETURN(int num_devices,
                       GetNumParticipatingDevices(collective->device_list()));
 
+  CollectiveDeviceList list_of_devices =
+      ConvertToV1CollectiveDeviceList(collective->device_list());
+
   return InterpolationSpecification{
       /*opcode=*/collective->opcode(),
       /*num_devices=*/num_devices,
       /*transfer_size=*/bytes_transferred,
       /*data_type=*/collective->shape().element_type(),
       /*collective_params=*/
-      CollectiveOpSpecInfo{collective->device_list(), comm}};
+      CollectiveOpSpecInfo{list_of_devices, comm}};
 }
 
 std::unique_ptr<HloModule> AllReduceModule(
@@ -348,11 +350,13 @@ std::unique_ptr<HloModule> CollectivePermuteModule(
   return module;
 }
 
-std::optional<CollectiveDeviceList> CanonicalDeviceList(
+std::optional<std::unique_ptr<CollectiveDeviceListBase>> CanonicalDeviceList(
     const HloCollectiveInstruction& instr) {
-  if (instr.device_list().iota_replica_group_list().has_value()) {
-    return instr.device_list();
+  const CollectiveDeviceListBase& device_list = instr.device_list();
+  if (device_list.version() == CollectiveDeviceListVersion::kIota) {
+    return device_list.Clone();
   }
+
   auto num_groups_and_devices = GetReplicaGroupCountAndSize(&instr);
   if (!num_groups_and_devices.ok() || !num_groups_and_devices->has_value()) {
     VLOG(1) << "Failed to determine a number of devices participating in "
@@ -363,7 +367,7 @@ std::optional<CollectiveDeviceList> CanonicalDeviceList(
 
   IotaReplicaGroupList iota((*num_groups_and_devices)->first,
                             (*num_groups_and_devices)->second);
-  return CollectiveDeviceList(iota);
+  return std::make_unique<CollectiveDeviceList>(iota);
 }
 
 HloOpcode AsyncToSyncOpcode(const HloCollectiveInstruction& instr) {
@@ -720,12 +724,15 @@ absl::StatusOr<absl::Duration> CollectiveInterpolator::EstimatedRuntime(
         absl::StrCat("Cannot find key for instr: ", instr.ToString()));
   }
   auto* collective = Cast<HloCollectiveInstruction>(&instr);
-  std::optional<CollectiveDeviceList> devices =
+  std::optional<std::unique_ptr<CollectiveDeviceListBase>> devices =
       CanonicalDeviceList(*collective);
   if (devices.has_value()) {
+    CollectiveDeviceList list_of_devices =
+        ConvertToV1CollectiveDeviceList(*devices.value());
+
     ExactInterpolatorKey exact_key{
         /*opcode=*/instr.opcode(),
-        /*collective_params=*/*devices,
+        /*collective_params=*/list_of_devices,
         /*data_type=*/
         RequiresAccumulation(instr.opcode())
             ? std::make_optional(instr.shape().element_type())
diff --git a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
index b18d54daa25df2..daec75b8f1c563 100644
--- a/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
+++ b/third_party/xla/xla/service/gpu/model/sol_latency_estimator_test.cc
@@ -739,7 +739,7 @@ class IsSolLatencyEstimatorEnabledTest : public HloTestBase {
         module->AddEmbeddedComputation(wrapped_computation.Build());
     entry->AddInstruction(HloInstruction::CreateAllReduce(
         shape, {dummy_operand}, subcomp,
-        /*replica_groups=*/{}, /*constrain_layout=*/false,
+        /*device_list=*/CollectiveDeviceList(), /*constrain_layout=*/false,
         /*channel_id=*/std::nullopt, /*use_global_device_ids=*/false));
   }
 
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.h b/third_party/xla/xla/service/spmd/spmd_partitioner.h
index c430d29b65036c..8aae8502e73d9b 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.h
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.h
@@ -824,12 +824,12 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault {
   }
 
   virtual double GetCommunicationTimeInMilliSec(
-      int64_t bytes, const CollectiveDeviceList& collective_device_list) {
+      int64_t bytes, const CollectiveDeviceListBase& collective_device_list) {
     return 0.0;
   }
 
   virtual int GetCommunicationMultiplier(
-      const CollectiveDeviceList& collective_device_list) {
+      const CollectiveDeviceListBase& collective_device_list) {
     return 1;
   }
 
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
index 9e2118ef7d8e16..e824fafc006dac 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_test.cc
@@ -532,13 +532,12 @@ ENTRY entry {
   EXPECT_NE(all_gather, nullptr);
 
   // Verify all-gather instruction contains ReplicaGroupV2.
-  EXPECT_TRUE(all_gather->device_list().iota_replica_group_list().has_value());
-  IotaReplicaGroupList list =
-      all_gather->device_list().iota_replica_group_list().value();
-  EXPECT_EQ(list.num_replica_groups(), 1);
-  EXPECT_EQ(list.num_devices_per_group(), 4);
-  EXPECT_THAT(list.reshape_dims(), ::testing::ElementsAre(4));
-  EXPECT_THAT(list.transpose_perm(), ::testing::ElementsAre(0));
+  EXPECT_TRUE(all_gather->device_list().version() ==
+              CollectiveDeviceListVersion::kIota);
+  EXPECT_EQ(all_gather->device_list(),
+            CollectiveDeviceList(IotaReplicaGroupList(
+                /*num_replica_groups=*/1, /*num_devices_per_group=*/4,
+                /*reshape_dims=*/{4}, /*transpose_perm=*/{0})));
 }
 
 TEST_P(SpmdPartitioningTest, TiledToSingleDevice) {
@@ -598,8 +597,10 @@ ENTRY entry {
   EXPECT_EQ(all_to_all->replica_groups().size(), 1);
   EXPECT_EQ(all_to_all->replica_groups()[0].replica_ids_size(), 8);
   if (GetParam() == ShardingFormatPicker::ShardingType::kBestEffortV2) {
-    EXPECT_EQ(all_to_all->device_list().iota_replica_group_list(),
-              IotaReplicaGroupList(1, 8, {4, 2}, {1, 0}));
+    EXPECT_EQ(all_to_all->device_list(),
+              CollectiveDeviceList(IotaReplicaGroupList(
+                  /*num_replica_groups=*/1, /*num_devices_per_group=*/8,
+                  /*reshape_dims=*/{4, 2}, /*transpose_perm=*/{1, 0})));
   } else {
     std::vector<std::vector<int64_t>> expected_replica_groups = {
         {0, 2, 4, 6, 1, 3, 5, 7}};
@@ -2007,18 +2008,12 @@ ENTRY entry {
             module->entry_computation()->instructions().end());
 
   // Verify all-reduce instruction contains ReplicaGroupV2.
-  EXPECT_TRUE((*all_reduce_instruction)
-                  ->device_list()
-                  .iota_replica_group_list()
-                  .has_value());
-  IotaReplicaGroupList list = (*all_reduce_instruction)
-                                  ->device_list()
-                                  .iota_replica_group_list()
-                                  .value();
-  EXPECT_EQ(list.num_replica_groups(), 1);
-  EXPECT_EQ(list.num_devices_per_group(), 8);
-  EXPECT_THAT(list.reshape_dims(), ::testing::ElementsAre(8));
-  EXPECT_THAT(list.transpose_perm(), ::testing::ElementsAre(0));
+  EXPECT_EQ((*all_reduce_instruction)->device_list().version(),
+            CollectiveDeviceListVersion::kIota);
+  EXPECT_EQ((*all_reduce_instruction)->device_list(),
+            CollectiveDeviceList(IotaReplicaGroupList(
+                /*num_replica_groups=*/1, /*num_devices_per_group=*/8,
+                /*reshape_dims=*/{8}, /*transpose_perm=*/{0})));
 }
 
 TEST_P(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowReversal) {
@@ -12070,8 +12065,14 @@ ENTRY %module {
   EXPECT_TRUE(all_to_all != nullptr);
   if (GetParam() ==
       test_only::ShardingFormatPicker::ShardingType::kBestEffortV2) {
-    EXPECT_EQ(all_to_all->device_list().iota_replica_group_list().value(),
-              IotaReplicaGroupList(4, 2, {2, 2, 2}, {0, 2, 1}));
+    EXPECT_EQ(all_to_all->device_list().version(),
+              CollectiveDeviceListVersion::kIota);
+    EXPECT_EQ(all_to_all->device_list(),
+              CollectiveDeviceList(IotaReplicaGroupList(
+                  /*num_replica_groups=*/4, /*num_devices_per_group=*/2,
+                  /*reshape_dims=*/{2, 2, 2},
+                  /*transpose_perm=*/{0, 2, 1})));
+
   } else {
     std::vector<std::vector<int64_t>> expected_replica_groups = {
         {0, 2}, {1, 3}, {4, 6}, {5, 7}};

From 453b5bbfdeecf05f3a5bc60af1ef770bca7d3c3a Mon Sep 17 00:00:00 2001
From: Fengwu Yao <fengwuyao@google.com>
Date: Fri, 19 Dec 2025 12:54:13 -0800
Subject: [PATCH 593/753] Internal changes only.

PiperOrigin-RevId: 846835877
---
 tensorflow/lite/types/fp16.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tensorflow/lite/types/fp16.h b/tensorflow/lite/types/fp16.h
index cc63fe7d21fbd8..94484350f68bcd 100644
--- a/tensorflow/lite/types/fp16.h
+++ b/tensorflow/lite/types/fp16.h
@@ -27,6 +27,13 @@ limitations under the License.
 // - https://github.com/google/XNNPACK/issues/6989
 // We also don't need a lot of the functionality in the upstream library.
 
+// If building with a library that uses //third_party/FP16, that library
+// provides its own fp16 conversion functions. Avoid redefining them here to
+// prevent build errors.
+// FP16_H and FP16_BITCASTS_H are defined by //third_party/FP16/fp16.h and
+// //third_party/FP16/bitcasts.h respectively.
+#if !defined(FP16_H) && !defined(FP16_BITCASTS_H)
+
 static inline float fp32_from_bits(uint32_t w) {
   union {
     uint32_t as_bits;
@@ -216,4 +223,6 @@ static inline uint16_t fp16_ieee_from_fp32_value(float f) {
          (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 
+#endif  // !defined(FP16_H) && !defined(FP16_BITCASTS_H)
+
 #endif  // TENSORFLOW_LITE_TYPES_FP16_H_

From e961b236974e99cca6434b14f8a0d80ecbce803f Mon Sep 17 00:00:00 2001
From: Vlad Sytchenko <vsytch@google.com>
Date: Fri, 19 Dec 2025 13:03:01 -0800
Subject: [PATCH 594/753] [XLA] Continue trying to unroll pipelined loops after
 failure

PiperOrigin-RevId: 846839467
---
 third_party/xla/xla/service/BUILD             |   5 +-
 .../service/while_loop_pipeline_unroller.cc   |  28 +++--
 .../while_loop_pipeline_unroller_test.cc      | 105 +++++++++++++++++-
 3 files changed, 120 insertions(+), 18 deletions(-)

diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index 085c27f698c40e..d42050eb0f5a2a 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -6100,14 +6100,14 @@ cc_library(
         "//xla/hlo/pass:hlo_pass",
         "//xla/hlo/transforms/simplifiers:flatten_call_graph",
         "//xla/hlo/transforms/simplifiers:hlo_dce",
+        "//xla/tsl/platform:status_macros",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
-        "@local_tsl//tsl/platform:errors",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
@@ -6122,7 +6122,6 @@ xla_cc_test(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/hlo/testlib:test_helpers",
-        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/container:inlined_vector",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_googletest//:gtest_main",
diff --git a/third_party/xla/xla/service/while_loop_pipeline_unroller.cc b/third_party/xla/xla/service/while_loop_pipeline_unroller.cc
index 2f8838dfc7dd1b..537220e28e543e 100644
--- a/third_party/xla/xla/service/while_loop_pipeline_unroller.cc
+++ b/third_party/xla/xla/service/while_loop_pipeline_unroller.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
@@ -34,8 +35,7 @@ limitations under the License.
 #include "xla/hlo/transforms/simplifiers/flatten_call_graph.h"
 #include "xla/hlo/transforms/simplifiers/hlo_dce.h"
 #include "xla/service/while_util.h"
-#include "tsl/platform/errors.h"
-#include "tsl/platform/statusor.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 /*static*/
@@ -125,8 +125,9 @@ absl::StatusOr<bool> WhileLoopPipelineUnroller::RunImpl(
     }
   }
 
-  std::vector<HloInstruction*> original_roots;
   for (auto&& [while_instruction, unroll_factor] : while_instructions) {
+    VLOG(1) << "Unrolling: " << while_instruction->name()
+            << " unroll_factor: " << unroll_factor;
     HloComputation* body = while_instruction->while_body();
     HloComputation* condition = while_instruction->while_condition();
 
@@ -138,16 +139,15 @@ absl::StatusOr<bool> WhileLoopPipelineUnroller::RunImpl(
         b.AddInstruction(HloInstruction::CreateParameter(
             0, while_instruction->shape(), "input_tuple"));
     HloComputation* unrolled_body = module->AddEmbeddedComputation(b.Build());
+    HloInstruction* unrolled_root = input_tuple;
     for (int64_t step = 0; step < unroll_factor; ++step) {
       HloComputation* loop_step = module->AddEmbeddedComputation(body->Clone(
           absl::StrFormat("unrolled_%dx_step_%d", unroll_factor, step)));
       input_tuple = unrolled_body->AddInstruction(HloInstruction::CreateCall(
           while_instruction->shape(), {input_tuple}, loop_step));
-      original_roots.push_back(input_tuple);
+      unrolled_root = input_tuple;
     }
     // The final original root is now the root of the unrolled loop.
-    HloInstruction* unrolled_root = original_roots.back();
-    original_roots.pop_back();
     unrolled_body->set_root_instruction(unrolled_root);
 
     // We need the unrolled loop and the remainder (original) loop to execute
@@ -167,12 +167,16 @@ absl::StatusOr<bool> WhileLoopPipelineUnroller::RunImpl(
         while_instruction->parent()->AddInstruction(HloInstruction::CreateWhile(
             while_instruction->shape(), unrolled_condition, body,
             while_instruction->mutable_operand(0)));
-    TF_RETURN_IF_ERROR(WhileUtil::IncrementWhileLoopTripCount(
-        *unrolled_while_instruction, -(unroll_factor - 1)));
+    absl::Status status = WhileUtil::IncrementWhileLoopTripCount(
+        *unrolled_while_instruction, -(unroll_factor - 1));
     unrolled_while_instruction->set_while_body(unrolled_body);
 
-    TF_RETURN_IF_ERROR(
-        while_instruction->ReplaceOperandWith(0, unrolled_while_instruction));
+    if (status.ok()) {
+      RETURN_IF_ERROR(
+          while_instruction->ReplaceOperandWith(0, unrolled_while_instruction));
+    } else {
+      VLOG(1) << "Failed to unroll: " << while_instruction->name();
+    }
   }
 
   const bool changed = !while_instructions.empty();
@@ -181,9 +185,9 @@ absl::StatusOr<bool> WhileLoopPipelineUnroller::RunImpl(
     // recursively clone all the nested computations. FCG will take care of this
     // for us.
     FlattenCallGraph fcg;
-    TF_RETURN_IF_ERROR(fcg.Run(module).status());
+    RETURN_IF_ERROR(fcg.Run(module).status());
     HloDCE dce;
-    TF_RETURN_IF_ERROR(dce.Run(module).status());
+    RETURN_IF_ERROR(dce.Run(module).status());
   }
 
   return changed;
diff --git a/third_party/xla/xla/service/while_loop_pipeline_unroller_test.cc b/third_party/xla/xla/service/while_loop_pipeline_unroller_test.cc
index d5a6cc011343fc..b1ce4e66e25fe9 100644
--- a/third_party/xla/xla/service/while_loop_pipeline_unroller_test.cc
+++ b/third_party/xla/xla/service/while_loop_pipeline_unroller_test.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <cstdint>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/container/inlined_vector.h"
 #include "absl/strings/string_view.h"
@@ -27,7 +28,6 @@ limitations under the License.
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/hlo/testlib/test_helpers.h"
 #include "xla/service/copy_insertion.h"
-#include "xla/tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
@@ -78,7 +78,7 @@ ENTRY main {
   ROOT while.0 = (s32[], s32[], s32[], s32[]) while(while_tuple.0), body=body, condition=condition
 }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
   WhileLoopPipelineUnroller wlpu;
   ASSERT_IS_OK(wlpu.Run(module.get()).status());
   AliasInfo alias_info;
@@ -147,7 +147,7 @@ ENTRY main {
   ROOT root.0 = get-tuple-element(while.0), index=0
 }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
   WhileLoopPipelineUnroller wlpu;
   ASSERT_IS_OK(wlpu.Run(module.get()).status());
   AliasInfo alias_info;
@@ -186,5 +186,104 @@ ENTRY main {
   }
 }
 
+TEST_F(WhileLoopPipelineUnrollerTest, FailureRecovery) {
+  constexpr absl::string_view hlo = R"hlo(
+HloModule main
+
+body {
+  input_tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+  arg.0 = get-tuple-element(input_tuple.0), index=0
+  arg.1 = get-tuple-element(input_tuple.0), index=1
+  arg.2 = get-tuple-element(input_tuple.0), index=2
+  arg.3 = get-tuple-element(input_tuple.0), index=3
+
+  one.0 = s32[] constant(1)
+  out.0 = add(arg.0, one.0)
+
+  add.0 = add(arg.3, one.0)
+  ROOT output_tuple.0 = tuple(arg.1, arg.2, out.0, add.0)
+}
+
+condition {
+  input_tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+  arg.3 = get-tuple-element(input_tuple.0), index=3
+  three.0 = s32[] constant(3)
+  ROOT pred.0 = compare(arg.3, three.0), direction=LT
+}
+
+ENTRY main {
+  tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+  while-pass.0 = (s32[], s32[], s32[], s32[]) while(tuple.0), body={
+    tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+    arg.0 = get-tuple-element(tuple.0), index=0
+    arg.1 = get-tuple-element(tuple.0), index=1
+    arg.2 = get-tuple-element(tuple.0), index=2
+    arg.3 = get-tuple-element(tuple.0), index=3
+
+    one.0 = s32[] constant(1)
+    add.0 = add(arg.0, one.0)
+
+    add.1 = add(arg.3, one.0)
+    ROOT output_tuple.0 = tuple(arg.1, arg.2, add.0, add.1)
+  }, condition={
+    tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+    arg.3 = get-tuple-element(tuple.0), index=3
+    three.0 = s32[] constant(3)
+    ROOT pred.0 = compare(arg.3, three.0), direction=LT
+  }
+  ROOT while-fail.0 = (s32[], s32[], s32[], s32[]) while(while-pass.0), body={
+    tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+    arg.0 = get-tuple-element(tuple.0), index=0
+    arg.1 = get-tuple-element(tuple.0), index=1
+    arg.2 = get-tuple-element(tuple.0), index=2
+    arg.3 = get-tuple-element(tuple.0), index=3
+
+    one.0 = s32[] constant(1)
+    add.0 = add(arg.0, one.0)
+
+    add.1 = add(arg.3, one.0)
+    ROOT output_tuple.0 = tuple(arg.1, arg.2, add.0, add.1)
+  }, condition={
+    tuple.0 = (s32[], s32[], s32[], s32[]) parameter(0)
+    arg.3 = get-tuple-element(tuple.0), index=3
+    three.0 = s32[] constant(3)
+    pred.0 = compare(arg.3, three.0), direction=LT
+    true.0 = pred[] constant(true)
+    ROOT and.0 = and(pred.0, true.0)
+  }
+}
+  )hlo";
+  ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo));
+  WhileLoopPipelineUnroller wlpu;
+  ASSERT_IS_OK(wlpu.Run(module.get()).status());
+  AliasInfo alias_info;
+  CopyInsertion copy_insertion(&alias_info,
+                               /*use_region_based_live_range_analysis=*/-1);
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
+
+  const HloInstruction* pass_original_loop =
+      FindInstruction(module.get(), "while-pass.0");
+  // The rolled passing loop should have 3 copies.
+  // arg.1 moves to index 0.
+  // arg.2 moves to index 1.
+  // out.0 moves to index 2.
+  EXPECT_EQ(Count(HloOpcode::kCopy, *pass_original_loop->while_body()), 3);
+
+  const HloInstruction* unrolled_loop = pass_original_loop->operand(0);
+  ASSERT_EQ(unrolled_loop->opcode(), HloOpcode::kWhile);
+  // There should be no copies inserted into the unrolled loop.
+  EXPECT_EQ(Count(HloOpcode::kCopy, *unrolled_loop->while_body()), 0);
+
+  const HloInstruction* fail_loop =
+      FindInstruction(module.get(), "while-fail.0");
+  // The rolled failing loop should have 3 copies.
+  // arg.1 moves to index 0.
+  // arg.2 moves to index 1.
+  // out.0 moves to index 2.
+  EXPECT_EQ(Count(HloOpcode::kCopy, *fail_loop->while_body()), 3);
+  // The failing loop should not have been unrolled.
+  EXPECT_EQ(fail_loop->users().size(), 0);
+}
+
 }  // namespace
 }  // namespace xla

From d11a803d83c3040078ebcb854d99d9f93377a39d Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Fri, 19 Dec 2025 13:07:58 -0800
Subject: [PATCH 595/753] [XLA:GPU] Allow to fuse iotas into sort.

Also adjust alias info logic to allow sharing the sort fusion operands with the
right sort fusion outputs.

PiperOrigin-RevId: 846841000
---
 third_party/xla/xla/service/gpu/BUILD         |   2 +-
 third_party/xla/xla/service/gpu/alias_info.cc |   9 +
 .../xla/xla/service/gpu/alias_info_test.cc    | 169 ++++++++++++------
 .../xla/xla/service/gpu/fusion_pipeline.cc    |   2 +
 .../xla/xla/service/gpu/tests/sorting_test.cc |   4 +-
 .../xla/xla/service/gpu/transforms/BUILD      |  34 ++++
 .../service/gpu/transforms/priority_fusion.cc |   2 +-
 .../gpu/transforms/sort_iota_fusion.cc        |  70 ++++++++
 .../service/gpu/transforms/sort_iota_fusion.h |  39 ++++
 .../gpu/transforms/sort_iota_fusion_test.cc   |  71 ++++++++
 10 files changed, 347 insertions(+), 55 deletions(-)
 create mode 100644 third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.cc
 create mode 100644 third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.h
 create mode 100644 third_party/xla/xla/service/gpu/transforms/sort_iota_fusion_test.cc

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index a95d9dd9862b8a..20378ae18cce67 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1532,6 +1532,7 @@ cc_library(
         "//xla/service/gpu/model:gpu_hlo_cost_analysis",
         "//xla/service/gpu/transforms:multi_output_fusion",
         "//xla/service/gpu/transforms:priority_fusion",
+        "//xla/service/gpu/transforms:sort_iota_fusion",
         "//xla/service/gpu/transforms:variadic_op_splitter",
         "//xla/stream_executor:device_description",
         "//xla/tsl/platform:env",
@@ -2822,7 +2823,6 @@ xla_cc_test(
         "//xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/log:check",
-        "@local_tsl//tsl/platform:statusor",
     ],
 )
 
diff --git a/third_party/xla/xla/service/gpu/alias_info.cc b/third_party/xla/xla/service/gpu/alias_info.cc
index 1d42d7cd120777..0db594dc5cd8cd 100644
--- a/third_party/xla/xla/service/gpu/alias_info.cc
+++ b/third_party/xla/xla/service/gpu/alias_info.cc
@@ -128,6 +128,15 @@ std::optional<bool> FusionCanShareBufferHint(
           continue;
         }
       }
+      // For sort, we can share the buffer if the operand appears only once. We
+      // can share it with that output buffer that corresponds to the operand.
+      if (hlo == non_bitcast_root && hlo->opcode() == HloOpcode::kSort &&
+          absl::c_count(hlo->operands(), hlo_operand) == 1) {
+        if (user_index != ShapeIndex{hlo->operand_index(hlo_operand)}) {
+          return false;
+        }
+        continue;
+      }
       if (non_bitcast_root->opcode() == HloOpcode::kDynamicUpdateSlice &&
           hlo->opcode() == HloOpcode::kDynamicSlice &&
           non_bitcast_root->operand(0) == hlo->operand(0) &&
diff --git a/third_party/xla/xla/service/gpu/alias_info_test.cc b/third_party/xla/xla/service/gpu/alias_info_test.cc
index cc56b0764c8d5f..d64c4ac75a46e2 100644
--- a/third_party/xla/xla/service/gpu/alias_info_test.cc
+++ b/third_party/xla/xla/service/gpu/alias_info_test.cc
@@ -32,7 +32,6 @@ limitations under the License.
 #include "xla/service/gpu/gpu_device_info_for_tests.h"
 #include "xla/shape_util.h"
 #include "xla/stream_executor/device_description.h"
-#include "tsl/platform/statusor.h"
 
 namespace xla::gpu {
 namespace {
@@ -79,8 +78,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -102,8 +101,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -125,8 +124,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -151,8 +150,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {0}));
   // The second operand cannot share the buffer with the second fusion output,
@@ -182,8 +181,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {0}));
   // The first operand cannot share the buffer with the second fusion output,
@@ -222,8 +221,8 @@ ENTRY %main {
       kind=kLoop, calls=%fused_computation
 })";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {1}));
 }
@@ -265,8 +264,8 @@ TEST_F(AliasInfoTest, BufferCannotBeSharedScatterMultiOutputFusion) {
     }
     )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   // We expect that no buffer can be shared, because when Scatter is involved,
   // the only buffer we can potentially share is the first operand of scatter,
@@ -312,8 +311,8 @@ TEST_F(AliasInfoTest, BufferCanBeSharedScatterFusion) {
     }
     )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {}));
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(1), {}));
@@ -353,8 +352,8 @@ TEST_F(AliasInfoTest, BufferCannotBeSharedScatterFusion) {
     }
     )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(1), {}));
@@ -401,8 +400,8 @@ TEST_F(AliasInfoTest, BufferCanBeSharedVariadicScatterFusion) {
     }
     )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {0}));
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(1), {1}));
@@ -441,8 +440,8 @@ TEST_F(AliasInfoTest,
     }
     )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(1), {}));
@@ -486,13 +485,81 @@ TEST_F(AliasInfoTest, BufferCannotBeSharedVariadicScatterFusion) {
     }
     )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {0}));
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(1), {1}));
 }
 
+TEST_F(AliasInfoTest, BufferCanBeSharedSortFusion) {
+  const char* const kModuleString = R"(
+    HloModule module
+
+    sorting_computation {
+      %lhs_key = s32[] parameter(0)
+      %rhs_key = s32[] parameter(1)
+      %lhs_update_0 = s32[] parameter(2)
+      %rhs_update_0 = s32[] parameter(3)
+      %lhs_permutation = s32[] parameter(4)
+      %rhs_permutation = s32[] parameter(5)
+      ROOT %compare = pred[] compare(%lhs_key, %rhs_key), direction=LT
+    }
+
+    sort_fusion {
+      p0 = s32[16384]{0} parameter(0)
+      iota = s32[16384]{0} iota(), iota_dimension=0
+      ROOT sort = (s32[16384]{0}, s32[16384]{0}, s32[16384]{0}) sort(p0, iota, iota), dimensions={0}, is_stable=true, to_apply=sorting_computation
+    }
+
+    ENTRY main {
+      p = s32[16384]{0} parameter(0)
+      ROOT fusion = (s32[16384]{0}, s32[16384]{0}, s32[16384]{0}) fusion(p), kind=kInput, calls=sort_fusion
+    }
+    )";
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
+  HloInstruction* fusion = module->entry_computation()->root_instruction();
+  ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {0}));
+  ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {1}));
+  ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {2}));
+}
+
+TEST_F(AliasInfoTest, BufferCannotBeSharedSortFusionDuplicateSortOperand) {
+  const char* const kModuleString = R"(
+    HloModule module
+
+    sorting_computation {
+      %lhs_key = s32[] parameter(0)
+      %rhs_key = s32[] parameter(1)
+      %lhs_update_0 = s32[] parameter(2)
+      %rhs_update_0 = s32[] parameter(3)
+      %lhs_permutation = s32[] parameter(4)
+      %rhs_permutation = s32[] parameter(5)
+      ROOT %compare = pred[] compare(%lhs_key, %rhs_key), direction=LT
+    }
+
+    sort_fusion {
+      p0 = s32[16384]{0} parameter(0)
+      iota = s32[16384]{0} iota(), iota_dimension=0
+      ROOT sort = (s32[16384]{0}, s32[16384]{0}, s32[16384]{0}) sort(p0, iota, p0), dimensions={0}, is_stable=true, to_apply=sorting_computation
+    }
+
+    ENTRY main {
+      p = s32[16384]{0} parameter(0)
+      ROOT fusion = (s32[16384]{0}, s32[16384]{0}, s32[16384]{0}) fusion(p), kind=kInput, calls=sort_fusion
+    }
+    )";
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
+  HloInstruction* fusion = module->entry_computation()->root_instruction();
+  ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {0}));
+  ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {1}));
+  ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {2}));
+}
+
 TEST_F(AliasInfoTest, BufferCannotBeSharedConvertedShapeDifferentByteWidth) {
   const char* const kModuleString = R"(
 HloModule fusion
@@ -510,8 +577,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -533,8 +600,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -555,8 +622,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -577,8 +644,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -606,8 +673,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -636,8 +703,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -664,8 +731,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {0}));
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {1}));
@@ -702,8 +769,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {0}));
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {1}));
@@ -738,8 +805,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {0}));
 }
@@ -768,8 +835,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalTrue(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -799,8 +866,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -830,8 +897,8 @@ ENTRY main {
 }
 )";
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(0), {}));
 }
@@ -862,8 +929,8 @@ ENTRY main {
   ROOT %address_computation = (f32[8]{0}, (f32[128]{0}, f32[256]{0})) fusion(p0, p1, p2), kind=kCustom, calls=%dynamic-slice-fusion, backend_config={"fusion_backend_config":{"kind":"__custom_fusion","custom_fusion_config":{"name":"address_computation","kernel_index":0}},"force_earliest_schedule":false,"reification_cost":[]}
 }
 )";
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
-                          ParseAndReturnVerifiedModule(kModuleString));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<xla::HloModule> module,
+                       ParseAndReturnVerifiedModule(kModuleString));
   HloInstruction* fusion = module->entry_computation()->root_instruction();
   ExpectOptionalFalse(MayAlias(fusion, fusion->operand(1), {1, 0}));
 }
diff --git a/third_party/xla/xla/service/gpu/fusion_pipeline.cc b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
index 2758ac1fe479f5..f8101d4126e4a4 100644
--- a/third_party/xla/xla/service/gpu/fusion_pipeline.cc
+++ b/third_party/xla/xla/service/gpu/fusion_pipeline.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "xla/service/gpu/model/gpu_hlo_cost_analysis.h"
 #include "xla/service/gpu/transforms/multi_output_fusion.h"
 #include "xla/service/gpu/transforms/priority_fusion.h"
+#include "xla/service/gpu/transforms/sort_iota_fusion.h"
 #include "xla/service/gpu/transforms/variadic_op_splitter.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/hlo_cse.h"
@@ -58,6 +59,7 @@ HloPassPipeline FusionPipeline(
       std::make_unique<CpuGpuVerifierMetadata>(std::move(opts)),
       "hlo verifier (debug)");
 
+  fusion.AddPass<SortIotaFusion>();
   GpuHloCostAnalysis::Options cost_analysis_options{
       shape_size_bytes_function,
       /*per_second_rates=*/{},
diff --git a/third_party/xla/xla/service/gpu/tests/sorting_test.cc b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
index 130a1eb950b0b5..58d7222b990469 100644
--- a/third_party/xla/xla/service/gpu/tests/sorting_test.cc
+++ b/third_party/xla/xla/service/gpu/tests/sorting_test.cc
@@ -170,7 +170,7 @@ TEST_F(SortingTest, SortFusionWithIotaOperand) {
 
     ENTRY main {
       p = s32[16384]{0} parameter(0)
-      ROOT fusion = (s32[16384]{0}, s32[16384]{0}) fusion(p), kind=kInput, calls=sort_fusion
+      ROOT fusion = (s32[16384]{0}, s32[16384]{0}) fusion(p), kind=kCustom, calls=sort_fusion
     }
   )";
   EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5}));
@@ -200,7 +200,7 @@ TEST_F(SortingTest, SortFusionWithIotaOperandTinySortDim) {
 
     ENTRY main {
       p = s32[2]{0} parameter(0)
-      ROOT fusion = (s32[2]{0}, s32[2]{0}) fusion(p), kind=kInput, calls=sort_fusion
+      ROOT fusion = (s32[2]{0}, s32[2]{0}) fusion(p), kind=kCustom, calls=sort_fusion
     }
   )";
   EXPECT_TRUE(RunAndCompareNoHloPasses(hlo_text, ErrorSpec{1e-5, 1e-5}));
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 1f3979a00abb01..07a5b9ac565259 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -2598,6 +2598,40 @@ xla_cc_test(
     ],
 )
 
+cc_library(
+    name = "sort_iota_fusion",
+    srcs = ["sort_iota_fusion.cc"],
+    hdrs = ["sort_iota_fusion.h"],
+    deps = [
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/pass:hlo_pass",
+        "//xla/tsl/platform:status_macros",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
+xla_cc_test(
+    name = "sort_iota_fusion_test",
+    srcs = [
+        "sort_iota_fusion_test.cc",
+    ],
+    deps = [
+        ":sort_iota_fusion",
+        "//xla/hlo/ir:hlo",
+        "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/hlo/testlib:pattern_matcher_gmock",
+        "//xla/service:pattern_matcher",
+        "@com_google_absl//absl/status:status_matchers",
+        "@com_google_absl//absl/strings:string_view",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "sort_rewriter",
     srcs = ["sort_rewriter.cc"],
diff --git a/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc b/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc
index 3630635a713d2e..1a4e128f41679a 100644
--- a/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/priority_fusion.cc
@@ -1295,12 +1295,12 @@ HloInstruction::FusionKind PriorityFusion::ChooseKind(
     case HloFusionAnalysis::EmitterFusionKind::kTriton:
     case HloFusionAnalysis::EmitterFusionKind::kCustomFusion:
     case HloFusionAnalysis::EmitterFusionKind::kCuDnn:
+    case HloFusionAnalysis::EmitterFusionKind::kSort:
       return HloInstruction::FusionKind::kCustom;
     case HloFusionAnalysis::EmitterFusionKind::kConcatenate:
     case HloFusionAnalysis::EmitterFusionKind::kReduction:
     case HloFusionAnalysis::EmitterFusionKind::kTranspose:
     case HloFusionAnalysis::EmitterFusionKind::kScatter:
-    case HloFusionAnalysis::EmitterFusionKind::kSort:
       return HloInstruction::FusionKind::kInput;
   }
 }
diff --git a/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.cc b/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.cc
new file mode 100644
index 00000000000000..cba65682acee38
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.cc
@@ -0,0 +1,70 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/sort_iota_fusion.h"
+
+#include <vector>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/dfs_hlo_visitor_with_default.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/tsl/platform/status_macros.h"
+
+namespace xla::gpu {
+namespace {
+
+class SortIotaFusionGroupVisitor : public DfsHloRewriteVisitor {
+ public:
+  absl::Status HandleSort(HloInstruction* sort) override {
+    VLOG(4) << "Input: " << sort->ToString();
+    std::vector<HloInstruction*> iota_operands;
+    absl::flat_hash_set<HloInstruction*> different_iotas;
+    for (HloInstruction* operand : sort->mutable_operands()) {
+      if (HloPredicateIsOp<HloOpcode::kIota>(operand)) {
+        if (different_iotas.insert(operand).second) {
+          iota_operands.push_back(operand);
+        }
+      }
+    }
+    if (iota_operands.empty()) {
+      return absl::OkStatus();
+    }
+    HloInstruction* fusion =
+        sort->parent()->AddInstruction(HloInstruction::CreateFusion(
+            sort->shape(), HloInstruction::FusionKind::kCustom, sort));
+    for (HloInstruction* iota : iota_operands) {
+      fusion->FuseInstruction(iota);
+    }
+    VLOG(5) << "Generated fusion: " << fusion->ToString();
+    return ReplaceInstruction(sort, fusion);
+  }
+};
+}  // namespace
+
+absl::StatusOr<bool> SortIotaFusion::RunImpl(
+    HloModule* module,
+    const absl::flat_hash_set<absl::string_view>& execution_threads) {
+  ASSIGN_OR_RETURN(bool changed, SortIotaFusionGroupVisitor().RunOnModule(
+                                     module, execution_threads));
+  return changed;
+}
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.h b/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.h
new file mode 100644
index 00000000000000..dacb103910c363
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion.h
@@ -0,0 +1,39 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_SERVICE_GPU_TRANSFORMS_SORT_IOTA_FUSION_H_
+#define XLA_SERVICE_GPU_TRANSFORMS_SORT_IOTA_FUSION_H_
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_module.h"
+#include "xla/hlo/pass/hlo_pass_interface.h"
+
+namespace xla::gpu {
+
+// Fuses iotas into sort.
+class SortIotaFusion : public HloModulePass {
+ public:
+  absl::string_view name() const override { return "sort-iota-fusion"; }
+
+ protected:
+  absl::StatusOr<bool> RunImpl(
+      HloModule* module,
+      const absl::flat_hash_set<absl::string_view>& execution_threads) override;
+};
+
+}  // namespace xla::gpu
+
+#endif  // XLA_SERVICE_GPU_TRANSFORMS_SORT_IOTA_FUSION_H_
diff --git a/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion_test.cc
new file mode 100644
index 00000000000000..34ba36597c0b75
--- /dev/null
+++ b/third_party/xla/xla/service/gpu/transforms/sort_iota_fusion_test.cc
@@ -0,0 +1,71 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/service/gpu/transforms/sort_iota_fusion.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/status/status_matchers.h"
+#include "absl/strings/string_view.h"
+#include "xla/hlo/ir/hlo_instruction.h"
+#include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
+#include "xla/hlo/testlib/pattern_matcher_gmock.h"
+#include "xla/service/pattern_matcher.h"
+
+namespace m = ::xla::match;
+
+namespace xla::gpu {
+namespace {
+
+using SortIotaFusionTest = HloHardwareIndependentTestBase;
+
+TEST_F(SortIotaFusionTest, FuseIota) {
+  auto module = *ParseAndReturnVerifiedModule(R"(
+    HloModule module
+
+    sorting_computation {
+      %lhs_key = s32[] parameter(0)
+      %rhs_key = s32[] parameter(1)
+      %lhs_index = s32[] parameter(2)
+      %rhs_index = s32[] parameter(3)
+      %lhs_index2 = s32[] parameter(4)
+      %rhs_index2 = s32[] parameter(5)
+      %lt_key = pred[] compare(%lhs_key, %rhs_key), direction=LT
+      %gt_key = pred[] compare(%rhs_key, %lhs_key), direction=LT
+      %eq_key = pred[] compare(%lt_key, %gt_key), direction=EQ
+      %lt_index = pred[] compare(%lhs_index, %rhs_index), direction=LT
+      ROOT res = pred[] select(%eq_key, %lt_index, %lt_key)
+    }
+
+    ENTRY main {
+      p0 = s32[16384]{0} parameter(0)
+      neg = s32[16384]{0} negate(p0)
+      iota = s32[16384]{0} iota(), iota_dimension=0
+      ROOT sort = (s32[16384]{0}, s32[16384]{0}, s32[16384]{0}) sort(neg, iota, iota), dimensions={0}, is_stable=true, to_apply=sorting_computation
+    }
+  )");
+  EXPECT_THAT(SortIotaFusion().Run(module.get()),
+              absl_testing::IsOkAndHolds(true));
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  const HloInstruction* fusion = nullptr;
+  ASSERT_THAT(root, GmockMatch(m::Fusion(&fusion, m::Negate())));
+  EXPECT_EQ(fusion->fusion_kind(), HloInstruction::FusionKind::kCustom);
+  EXPECT_THAT(fusion->fused_expression_root(),
+              GmockMatch(m::Sort(m::Parameter(), m::Iota(), m::Iota())));
+}
+
+}  // namespace
+}  // namespace xla::gpu

From 5e685fb6e1e645a3af69c1b462d2329abdac7357 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 13:53:16 -0800
Subject: [PATCH 596/753] Apply llvm-use-new-mlir-op-builder fixes

This migrates `builder.create<Op>()` => `Op::create()`

PiperOrigin-RevId: 846854812
---
 .../compiler/mlir/lite/debug/debug_test.cc    |  13 +-
 .../experimental/common/outline_operations.cc |   8 +-
 .../tac/transforms/device_transform.cc        |   9 +-
 .../transforms/device_transform_patterns.cc   |  18 +-
 .../tac/transforms/pick_subgraphs.cc          |   4 +-
 .../quantization_lib/quantization_utils.h     |   8 +-
 .../quantization/tensorflow/tf_to_quant.cc    |   8 +-
 .../compose_uniform_quantized_type_pass.cc    |  96 ++--
 .../lite/stablehlo/transforms/legalize_hlo.cc | 520 +++++++++---------
 .../legalize_hlo_conversions/conv_util.cc     |   8 +-
 .../legalize_hlo_conversions/custom_call.cc   |   6 +-
 .../legalize_hlo_conversions/dot_general.cc   | 113 ++--
 .../legalize_hlo_conversions/fft.cc           |  50 +-
 .../legalize_hlo_conversions/gelu.cc          |   6 +-
 .../transforms/legalize_hlo_conversions/if.cc |   4 +-
 .../legalize_hlo_conversions/reduce.cc        |  37 +-
 .../legalize_hlo_conversions/reduce_window.cc |  26 +-
 .../legalize_hlo_conversions/scatter.cc       |  22 +-
 .../legalize_hlo_conversions/slice.cc         |  42 +-
 .../legalize_hlo_conversions/util.cc          |  16 +-
 .../legalize_hlo_conversions/util.h           |   2 +-
 .../legalize_hlo_conversions/while.cc         |   7 +-
 ...lize_stablehlo_custom_call_to_composite.cc |   6 +-
 .../transforms/legalize_stablehlo_to_vhlo.cc  |   2 +-
 ...ze_tf_xla_call_module_to_stablehlo_pass.cc |  16 +-
 .../lite/stablehlo/transforms/optimize.cc     |  21 +-
 .../transforms/smuggle_disallowed_ops.cc      |   4 +-
 .../stablehlo_fuse_convolution_pass.cc        |  20 +-
 .../transforms/tflite_legalize_hlo.cc         |  10 +-
 .../transforms/unfold_splat_constant_pass.cc  |  11 +-
 .../decompose_hybrid_quantization.cc          |  11 +-
 .../mlir/lite/transforms/if_outline.cc        |  10 +-
 .../lite/transforms/insert_call_once_op.cc    |   2 +-
 .../transforms/optimize_batch_matmul_pass.cc  |  54 +-
 .../optimize_broadcast_like_pass.cc           |  13 +-
 .../mlir/lite/transforms/optimize_pass.cc     | 267 ++++-----
 .../transforms/pin_ops_with_side_effects.cc   |   8 +-
 .../mlir/lite/transforms/post_quantize.cc     |  29 +-
 .../mlir/lite/transforms/prepare_tf.cc        | 151 ++---
 .../push_transpose_through_ewise_pass.cc      |  16 +-
 .../compiler/mlir/lite/transforms/quantize.cc |   9 +-
 .../lite/transforms/quantize_variables.cc     |  43 +-
 .../mlir/lite/transforms/raise_custom_ops.cc  |   6 +-
 .../lite/transforms/reduce_type_precision.cc  |  12 +-
 .../unfold_large_splat_constants_pass.cc      |  32 +-
 .../mlir/lite/utils/fake_quant_utils.h        |   8 +-
 .../compiler/mlir/lite/utils/lstm_utils.cc    |  55 +-
 .../compiler/mlir/lite/utils/nms_utils.cc     |  16 +-
 .../mlir/lite/utils/perception_ops_utils.cc   |  20 +-
 .../mlir/lite/utils/region_isolation_test.cc  |   2 +-
 .../compiler/mlir/lite/utils/tftext_utils.cc  |  24 +-
 .../common/attrs_and_constraints.h            |   4 +-
 .../quantization_lib/quantization_utils.h     |   8 +-
 .../convert_tf_quant_to_mhlo_int_test.cc      |   4 +-
 .../passes/bridge/convert_tf_quant_types.cc   |   8 +-
 .../passes/convert_func_to_bfloat16.cc        |  10 +-
 .../convert_xla_call_module_op_to_bfloat16.cc |   6 +-
 .../passes/defer_activation_transpose.cc      |  39 +-
 .../passes/fold_constant_transpose.cc         |   4 +-
 .../insert_calibration_statistics_saver.cc    |   6 +-
 .../passes/merge_fusion_with_dequantize.cc    |  25 +-
 .../passes/nchw_convolution_to_nhwc.cc        |  19 +-
 .../stablehlo/passes/prepare_quantize.cc      |   4 +-
 .../stablehlo/passes/quantize_weight.cc       |  10 +-
 .../passes/unwrap_xla_call_module_op.cc       |   4 +-
 .../tensorflow/cc/constant_fold.cc            |   2 +-
 .../tensorflow/passes/cast_bf16_ops_to_f32.cc |   8 +-
 .../tensorflow/passes/prepare_quantize_drq.cc |   8 +-
 .../utils/tf_to_xla_attribute_utils.cc        |  75 +--
 69 files changed, 1097 insertions(+), 1048 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/debug/debug_test.cc b/tensorflow/compiler/mlir/lite/debug/debug_test.cc
index b82d5725182745..a9337c0c84f944 100644
--- a/tensorflow/compiler/mlir/lite/debug/debug_test.cc
+++ b/tensorflow/compiler/mlir/lite/debug/debug_test.cc
@@ -103,20 +103,21 @@ class InitPassManagerTest : public testing::Test {
     context_.loadAllAvailableDialects();
 
     mlir::OpBuilder builder(&context_);
-    module_ = builder.create<mlir::ModuleOp>(builder.getUnknownLoc());
+    module_ = mlir::ModuleOp::create(builder, builder.getUnknownLoc());
 
     builder.setInsertionPointToStart(module_->getBody());
-    auto func = builder.create<mlir::func::FuncOp>(  //
-        builder.getUnknownLoc(), "main", builder.getFunctionType({}, {}));
+    auto func = mlir::func::FuncOp::create(builder,  //
+                                           builder.getUnknownLoc(), "main",
+                                           builder.getFunctionType({}, {}));
     func->setAttr("tfl.func", builder.getUnitAttr());
     builder.setInsertionPointToStart(func.addEntryBlock());
     llvm::SmallVector<int> shape{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-    builder.create<mlir::arith::ConstantOp>(
-        builder.getUnknownLoc(),
+    mlir::arith::ConstantOp::create(
+        builder, builder.getUnknownLoc(),
         mlir::DenseIntElementsAttr::get(
             mlir::RankedTensorType::get(shape.size(), builder.getI32Type()),
             shape));
-    builder.create<mlir::func::ReturnOp>(builder.getUnknownLoc());
+    mlir::func::ReturnOp::create(builder, builder.getUnknownLoc());
   }
 
   absl::Status GetDumpDir(std::string* dump_dir) {
diff --git a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc
index 533a69bdfd9efa..614f9738356019 100644
--- a/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/common/outline_operations.cc
@@ -163,8 +163,8 @@ func::FuncOp BuildFuncOp(const Subgraph& subgraph, OpBuilder& builder,
     Value cloned_output = values_in_scope.lookup(result);
     return_operands.push_back(cloned_output);
   }
-  function_builder.create<mlir::func::ReturnOp>(new_func.getLoc(),
-                                                return_operands);
+  mlir::func::ReturnOp::create(function_builder, new_func.getLoc(),
+                               return_operands);
   ops_added.func_op = new_func;
   module.push_back(new_func);
   return new_func;
@@ -179,8 +179,8 @@ void ExtractSubgraphToFunc(const Subgraph& subgraph, OpBuilder& builder,
   Operation* last_output = subgraph.partition_ops_.back();
 
   builder.setInsertionPoint(last_output);
-  auto call_op = builder.create<func::CallOp>(last_output->getLoc(), func,
-                                              subgraph.FuncArguments());
+  auto call_op = func::CallOp::create(builder, last_output->getLoc(), func,
+                                      subgraph.FuncArguments());
   ops_added.call_op = call_op;
   // FuncOutputs refer to the original `Values` in input module which are now
   // invalid after pulling out the defining ops. The values in
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc
index 787190318b63ad..c5c8c040c2bb28 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform.cc
@@ -89,8 +89,8 @@ void ConvertQuantizedOpToFloat(mlir::func::FuncOp func, OpBuilder* builder) {
         auto dequantized_input_type =
             mlir::quant::QuantizedType::castToExpressedType(input_type);
         builder->setInsertionPoint(op);
-        auto dequantize_op = builder->create<TFL::DequantizeOp>(
-            op->getLoc(), dequantized_input_type, input.get());
+        auto dequantize_op = TFL::DequantizeOp::create(
+            *builder, op->getLoc(), dequantized_input_type, input.get());
         dequantized_inputs.push_back(dequantize_op);
       } else {
         dequantized_inputs.push_back(input.get());
@@ -126,8 +126,9 @@ void ConvertQuantizedOpToFloat(mlir::func::FuncOp func, OpBuilder* builder) {
       Value new_result = new_op->getResult(i);
       if (IsQI8Type(result_type) || IsQUI8Type(result_type)) {
         builder->setInsertionPoint(op);
-        TFL::QuantizeOp quant_op = builder->create<TFL::QuantizeOp>(
-            op->getLoc(), result_type, new_result, TypeAttr::get(result_type));
+        TFL::QuantizeOp quant_op =
+            TFL::QuantizeOp::create(*builder, op->getLoc(), result_type,
+                                    new_result, TypeAttr::get(result_type));
         new_result = quant_op.getResult();
       }
 
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc
index d701254f333322..e6d7c6425abafe 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc
@@ -85,11 +85,11 @@ TFL::ReshapeOp InsertReshapeOp(Location loc, Value input, Type element_type,
   auto new_shape_attr =
       mlir::DenseIntElementsAttr::get(reshape_shape_type, new_shape_array_i32);
 
-  auto new_shape = builder->create<TFL::ConstOp>(loc, new_shape_attr);
+  auto new_shape = TFL::ConstOp::create(*builder, loc, new_shape_attr);
 
   auto reshape_out_type = RankedTensorType::get(new_shape_array, element_type);
-  return builder->create<TFL::ReshapeOp>(loc, reshape_out_type, input,
-                                         new_shape);
+  return TFL::ReshapeOp::create(*builder, loc, reshape_out_type, input,
+                                new_shape);
 }
 
 LogicalResult EnsureBias(Operation* op, int bias_idx,
@@ -148,7 +148,7 @@ TF::ConstOp PadConstValues(Operation* input_op, int value_to_pad,
   auto new_value_i32_attr =
       mlir::DenseIntElementsAttr::get(value_shape_type, value_i32);
 
-  return builder->create<TF::ConstOp>(loc, new_value_i32_attr);
+  return TF::ConstOp::create(*builder, loc, new_value_i32_attr);
 }
 
 SmallVector<Value, 4> SliceOutputs(Operation* split_op, Value input,
@@ -186,13 +186,13 @@ SmallVector<Value, 4> SliceOutputs(Operation* split_op, Value input,
         mlir::DenseIntElementsAttr::get(slice_type, slice_size);
 
     auto slice_begin_const =
-        rewriter->create<TFL::ConstOp>(split_op->getLoc(), slice_begin_attr);
+        TFL::ConstOp::create(*rewriter, split_op->getLoc(), slice_begin_attr);
     auto slice_size_const =
-        rewriter->create<TFL::ConstOp>(split_op->getLoc(), slice_size_attr);
+        TFL::ConstOp::create(*rewriter, split_op->getLoc(), slice_size_attr);
 
-    auto slice_op = rewriter->create<TFL::SliceOp>(
-        split_op->getLoc(), current_output_type, input, slice_begin_const,
-        slice_size_const);
+    auto slice_op =
+        TFL::SliceOp::create(*rewriter, split_op->getLoc(), current_output_type,
+                             input, slice_begin_const, slice_size_const);
 
     // Rewire output.
     slice_outputs.push_back(slice_op.getResult());
diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc
index 58940205edf1ab..300daee0f9a40d 100644
--- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc
+++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/pick_subgraphs.cc
@@ -430,8 +430,8 @@ void PickSubgraphsPass::RewireSubgraphs(
         if (call.getCallee() != impl.getName()) {
           // We need to rebuild the call op. :(
           builder->setInsertionPoint(call);
-          auto new_call = builder->create<func::CallOp>(call.getLoc(), impl,
-                                                        call.getOperands());
+          auto new_call = func::CallOp::create(*builder, call.getLoc(), impl,
+                                               call.getOperands());
 
           // Set interface_name & target to the call_op as well.
           new_call->setAttr(kInterfaceNameAttr,
diff --git a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h
index 6559ad29d1f788..1da38c2c9f466e 100644
--- a/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h
+++ b/tensorflow/compiler/mlir/lite/quantization/common/quantization_lib/quantization_utils.h
@@ -346,10 +346,10 @@ void CreateVerifier(mlir::Operation* quantizing_op,
   BoolAttr log =
       rewriter.getBoolAttr(quant_params.numeric_verify_spec.log_if_failed_flag);
   // Verify the quantized value by sending the result to the verifier.
-  rewriter.create<VerifierT>(
-      quantizing_op->getLoc(), quantized_op->getResult(result_idx).getType(),
-      quantized_op->getResult(result_idx), quantizing_op->getResult(result_idx),
-      tolerance, log);
+  VerifierT::create(rewriter, quantizing_op->getLoc(),
+                    quantized_op->getResult(result_idx).getType(),
+                    quantized_op->getResult(result_idx),
+                    quantizing_op->getResult(result_idx), tolerance, log);
 }
 
 template <>
diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc
index 6c43167a78cbae..529b5d2161be32 100644
--- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc
@@ -150,10 +150,10 @@ struct InsertQuantOpsAfterTFFakeQuantOp
     // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp
     // and its users.
     Value value = tf_op.getOutputs();
-    auto quantize = rewriter.create<quantfork::QuantizeCastOp>(
-        tf_op.getLoc(), qtype.getValue(), value);
-    auto dequantize = rewriter.create<quantfork::DequantizeCastOp>(
-        tf_op.getLoc(), res_type, quantize.getResult());
+    auto quantize = quantfork::QuantizeCastOp::create(rewriter, tf_op.getLoc(),
+                                                      qtype.getValue(), value);
+    auto dequantize = quantfork::DequantizeCastOp::create(
+        rewriter, tf_op.getLoc(), res_type, quantize.getResult());
     value.replaceAllUsesWith(dequantize);
     quantize.getOperation()->replaceUsesOfWith(dequantize, value);
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc
index 4107859b7412af..0dd7e1f3b97a1c 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc
@@ -677,13 +677,12 @@ class ComposeUniformQuantizedConvolutionOp
         CreateI8F32UniformQuantizedType(
             uniform_quantize_call_op.getLoc(), *rewriter.getContext(),
             input_scale_value, input_zero_point_value);
-    auto input_uniform_quantize_op =
-        rewriter.create<stablehlo::UniformQuantizeOp>(
-            uniform_quantize_call_op.getLoc(),
-            /*result=*/
-            mlir::cast<TensorType>(input_value.getType())
-                .clone(input_quantized_element_type),
-            /*operand=*/input_value);
+    auto input_uniform_quantize_op = stablehlo::UniformQuantizeOp::create(
+        rewriter, uniform_quantize_call_op.getLoc(),
+        /*result=*/
+        mlir::cast<TensorType>(input_value.getType())
+            .clone(input_quantized_element_type),
+        /*operand=*/input_value);
 
     rewriter.replaceAllUsesWith(input_i8_to_f32_convert_op.getResult(),
                                 input_uniform_quantize_op.getResult());
@@ -754,8 +753,8 @@ class ComposeUniformQuantizedConvolutionOp
             /*quantization_dimension=*/3);
 
     // Create a new constant op for the filter in i8.
-    auto quantized_filter_constant_op = rewriter.create<stablehlo::ConstantOp>(
-        filter_op->getLoc(),
+    auto quantized_filter_constant_op = stablehlo::ConstantOp::create(
+        rewriter, filter_op->getLoc(),
         /*output=*/
         filter_i8_value_attr.getType().clone(filter_quantized_element_type),
         /*value=*/filter_i8_value_attr);
@@ -797,18 +796,16 @@ class ComposeUniformQuantizedConvolutionOp
 
     SmallVector<Type> new_conv_output_types = {
         output_uniform_quantized_tensor_type};
-    auto new_conv_op_with_output_type =
-        rewriter.create<stablehlo::ConvolutionOp>(
-            op.getLoc(), new_conv_output_types, op.getOperands(),
-            op->getAttrs());
+    auto new_conv_op_with_output_type = stablehlo::ConvolutionOp::create(
+        rewriter, op.getLoc(), new_conv_output_types, op.getOperands(),
+        op->getAttrs());
 
     rewriter.replaceAllUsesWith(op.getResult(),
                                 new_conv_op_with_output_type.getResult());
 
-    auto new_output_dequant_op =
-        rewriter.create<stablehlo::UniformDequantizeOp>(
-            rewriter.getUnknownLoc(),
-            /*operand=*/new_conv_op_with_output_type);
+    auto new_output_dequant_op = stablehlo::UniformDequantizeOp::create(
+        rewriter, rewriter.getUnknownLoc(),
+        /*operand=*/new_conv_op_with_output_type);
 
     auto output_uniform_dequantize_call_op = cast<func::CallOp>(
         *output_uniform_quantize_call_op.getResult(0).user_begin());
@@ -1035,13 +1032,12 @@ class ComposeUniformQuantizedDotGeneralOp
             input_scale_value, input_zero_point_value);
 
     Value input_value = input_uniform_quantize_call_pattern->GetInputValue();
-    auto input_uniform_quantize_op =
-        rewriter.create<stablehlo::UniformQuantizeOp>(
-            input_i8_to_f32_convert_op.getLoc(),
-            /*result=*/
-            mlir::cast<TensorType>(input_value.getType())
-                .clone(input_uniform_quantized_type),
-            /*operand=*/input_value);
+    auto input_uniform_quantize_op = stablehlo::UniformQuantizeOp::create(
+        rewriter, input_i8_to_f32_convert_op.getLoc(),
+        /*result=*/
+        mlir::cast<TensorType>(input_value.getType())
+            .clone(input_uniform_quantized_type),
+        /*operand=*/input_value);
 
     rewriter.replaceAllUsesWith(input_i8_to_f32_convert_op.getResult(),
                                 input_uniform_quantize_op.getResult());
@@ -1116,8 +1112,8 @@ class ComposeUniformQuantizedDotGeneralOp
             quantization_dimension);
 
     // Create a new constant op for the filter in i8.
-    auto quantized_filter_constant_op = rewriter.create<stablehlo::ConstantOp>(
-        filter_constant_op.getLoc(),
+    auto quantized_filter_constant_op = stablehlo::ConstantOp::create(
+        rewriter, filter_constant_op.getLoc(),
         /*output=*/
         mlir::cast<TensorType>(filter_constant_op.getResult().getType())
             .clone(filter_uniform_quantized_type),
@@ -1157,8 +1153,8 @@ class ComposeUniformQuantizedDotGeneralOp
             output_uniform_quantize_call_op.getLoc(), *rewriter.getContext(),
             output_scale_value, output_zero_point_value);
 
-    auto new_dot_general_op = rewriter.create<stablehlo::DotGeneralOp>(
-        op.getLoc(), /*resultType0=*/
+    auto new_dot_general_op = stablehlo::DotGeneralOp::create(
+        rewriter, op.getLoc(), /*resultType0=*/
         mlir::cast<TensorType>(op.getResult().getType())
             .clone(output_uniform_quantized_type),
         /*lhs=*/op.getLhs(), /*rhs=*/op.getRhs(),
@@ -1168,10 +1164,9 @@ class ComposeUniformQuantizedDotGeneralOp
 
     rewriter.replaceAllUsesWith(op.getResult(), new_dot_general_op.getResult());
 
-    auto new_output_dequant_op =
-        rewriter.create<stablehlo::UniformDequantizeOp>(
-            output_uniform_dequantize_call_op.getLoc(),
-            /*operand=*/new_dot_general_op);
+    auto new_output_dequant_op = stablehlo::UniformDequantizeOp::create(
+        rewriter, output_uniform_dequantize_call_op.getLoc(),
+        /*operand=*/new_dot_general_op);
 
     rewriter.replaceAllUsesWith(output_uniform_dequantize_call_op.getResult(0),
                                 new_output_dequant_op.getResult());
@@ -1423,13 +1418,12 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations
             input1_scale_value, input1_zero_point_value);
 
     Value input1_value = input1_uniform_quantize_call_pattern->GetInputValue();
-    auto input1_uniform_quantize_op =
-        rewriter.create<stablehlo::UniformQuantizeOp>(
-            input1_uniform_quantize_call_op.getLoc(),
-            /*result=*/
-            mlir::cast<TensorType>(input1_value.getType())
-                .clone(input1_uniform_quantized_type),
-            /*operand=*/input1_value);
+    auto input1_uniform_quantize_op = stablehlo::UniformQuantizeOp::create(
+        rewriter, input1_uniform_quantize_call_op.getLoc(),
+        /*result=*/
+        mlir::cast<TensorType>(input1_value.getType())
+            .clone(input1_uniform_quantized_type),
+        /*operand=*/input1_value);
 
     rewriter.replaceAllUsesWith(input1_zero_point_subtract_op.getResult(),
                                 input1_uniform_quantize_op.getResult());
@@ -1462,13 +1456,12 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations
             input2_scale_value, input2_zero_point_value);
 
     Value input2_value = input2_uniform_quantize_call_pattern->GetInputValue();
-    auto input2_uniform_quantize_op =
-        rewriter.create<stablehlo::UniformQuantizeOp>(
-            input2_uniform_quantize_call_op.getLoc(),
-            /*result=*/
-            mlir::cast<TensorType>(input2_value.getType())
-                .clone(input2_uniform_quantized_type),
-            /*operand=*/input2_value);
+    auto input2_uniform_quantize_op = stablehlo::UniformQuantizeOp::create(
+        rewriter, input2_uniform_quantize_call_op.getLoc(),
+        /*result=*/
+        mlir::cast<TensorType>(input2_value.getType())
+            .clone(input2_uniform_quantized_type),
+        /*operand=*/input2_value);
 
     rewriter.replaceAllUsesWith(input2_zero_point_subtract_op.getResult(),
                                 input2_uniform_quantize_op.getResult());
@@ -1512,8 +1505,8 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations
             output_uniform_quantize_call_op.getLoc(), *rewriter.getContext(),
             output_scale_value, output_zero_point_value);
 
-    auto new_dot_general_op = rewriter.create<stablehlo::DotGeneralOp>(
-        op.getLoc(), /*resultType0=*/
+    auto new_dot_general_op = stablehlo::DotGeneralOp::create(
+        rewriter, op.getLoc(), /*resultType0=*/
         mlir::cast<TensorType>(op.getResult().getType())
             .clone(output_uniform_quantized_type),
         /*lhs=*/op.getLhs(), /*rhs=*/op.getRhs(),
@@ -1523,10 +1516,9 @@ class ComposeUniformQuantizedDotGeneralOpWithTwoQuantizedActivations
 
     rewriter.replaceAllUsesWith(op.getResult(), new_dot_general_op.getResult());
 
-    auto new_output_dequant_op =
-        rewriter.create<stablehlo::UniformDequantizeOp>(
-            output_uniform_dequantize_call_op.getLoc(),
-            /*operand=*/new_dot_general_op);
+    auto new_output_dequant_op = stablehlo::UniformDequantizeOp::create(
+        rewriter, output_uniform_dequantize_call_op.getLoc(),
+        /*operand=*/new_dot_general_op);
 
     rewriter.replaceAllUsesWith(output_uniform_dequantize_call_op.getResult(0),
                                 new_output_dequant_op.getResult());
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
index 7608ff985f1eb9..0d8688b2c8855a 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc
@@ -328,22 +328,22 @@ class ConvertNdConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
       size.push_back(input_shape[i] - pre_slice - post_slice);
     }
 
-    auto start_attr = rewriter.create<TF::ConstOp>(
-        value.getLoc(),
+    auto start_attr = TF::ConstOp::create(
+        rewriter, value.getLoc(),
         DenseIntElementsAttr::get(
             RankedTensorType::get({static_cast<int64_t>(start.size())},
                                   rewriter.getI64Type()),
             start));
-    auto size_attr = rewriter.create<TF::ConstOp>(
-        value.getLoc(),
+    auto size_attr = TF::ConstOp::create(
+        rewriter, value.getLoc(),
         DenseIntElementsAttr::get(
             RankedTensorType::get({static_cast<int64_t>(size.size())},
                                   rewriter.getI64Type()),
             size));
     auto output_type = RankedTensorType::get(size, input_type.getElementType());
 
-    return rewriter.create<TF::SliceOp>(value.getLoc(), output_type, value,
-                                        start_attr, size_attr);
+    return TF::SliceOp::create(rewriter, value.getLoc(), output_type, value,
+                               start_attr, size_attr);
   }
 
   void CreateConvOp(mhlo::ConvolutionOp conv_op, ArrayRef<int64_t> strides,
@@ -381,14 +381,15 @@ class ConvertNdConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
           mlir::dyn_cast<RankedTensorType>(conv_op.getLhs().getType());
       RankedTensorType padding_attr_type = mlir::RankedTensorType::get(
           {lhs_type.getRank(), 2}, rewriter.getIntegerType(64));
-      auto padding_const = rewriter.create<TF::ConstOp>(
-          conv_op->getLoc(),
+      auto padding_const = TF::ConstOp::create(
+          rewriter, conv_op->getLoc(),
           mlir::DenseElementsAttr::get(padding_attr_type,
                                        ArrayRef<int64_t>(new_padding)));
       // Add Pad op.
       auto pad_output_type = UnrankedTensorType::get(lhs_type.getElementType());
-      sliced_lhs = rewriter.create<TF::PadOp>(
-          conv_op->getLoc(), pad_output_type, sliced_lhs, padding_const);
+      sliced_lhs =
+          TF::PadOp::create(rewriter, conv_op->getLoc(), pad_output_type,
+                            sliced_lhs, padding_const);
       padding = "VALID";
     }
 
@@ -422,28 +423,28 @@ class ConvertNdConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
                                                     hlo_filter_shape.end());
       tf_filter_shape[2] = input_channels;
       tf_filter_shape[3] = hlo_filter_shape.back() / input_channels;
-      auto reshaped_filter = rewriter.create<mhlo::ReshapeOp>(
-          rhs.getLoc(),
+      auto reshaped_filter = mhlo::ReshapeOp::create(
+          rewriter, rhs.getLoc(),
           RankedTensorType::get(tf_filter_shape, filter_type.getElementType()),
           rhs);
 
-      output = rewriter.create<TF::DepthwiseConv2dNativeOp>(
-          conv_op.getLoc(), conv_output_type, sliced_lhs, reshaped_filter,
-          rewriter.getI64ArrayAttr(strides),
+      output = TF::DepthwiseConv2dNativeOp::create(
+          rewriter, conv_op.getLoc(), conv_output_type, sliced_lhs,
+          reshaped_filter, rewriter.getI64ArrayAttr(strides),
           /*padding=*/rewriter.getStringAttr(padding),
           /*explicit_paddings=*/rewriter.getI64ArrayAttr(new_padding),
           /*data_format=*/rewriter.getStringAttr("NHWC"),
           /*dilations=*/rewriter.getI64ArrayAttr(dilation));
     } else if (num_spatial_dims == 3) {
-      output = rewriter.create<TF::Conv3DOp>(
-          conv_op.getLoc(), conv_output_type, sliced_lhs, rhs,
+      output = TF::Conv3DOp::create(
+          rewriter, conv_op.getLoc(), conv_output_type, sliced_lhs, rhs,
           rewriter.getI64ArrayAttr(strides),
           /*padding=*/rewriter.getStringAttr(padding),
           /*data_format=*/rewriter.getStringAttr("NDHWC"),
           /*dilations=*/rewriter.getI64ArrayAttr(dilation));
     } else {
-      output = rewriter.create<TF::Conv2DOp>(
-          conv_op.getLoc(), conv_output_type, sliced_lhs, rhs,
+      output = TF::Conv2DOp::create(
+          rewriter, conv_op.getLoc(), conv_output_type, sliced_lhs, rhs,
           rewriter.getI64ArrayAttr(strides),
           /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true),
           /*padding=*/rewriter.getStringAttr(padding),
@@ -462,8 +463,8 @@ class ConvertNdConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
               dnums.getOutputFeatureDimension(),
               *dnums.getOutputSpatialDimensions().begin(), num_spatial_dims,
               conv_output_type, rewriter);
-      output = rewriter.create<mhlo::TransposeOp>(
-          conv_op.getLoc(), conv_op.getType(), output, permutation);
+      output = mhlo::TransposeOp::create(
+          rewriter, conv_op.getLoc(), conv_op.getType(), output, permutation);
     }
     rewriter.replaceOp(conv_op, {output});
   }
@@ -513,8 +514,8 @@ class Convert1DConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
     auto image_2d_type =
         RankedTensorType::get(image_2d_shape, image_type.getElementType());
     auto loc = conv_op.getLoc();
-    auto image_2d_op = rewriter.create<mhlo::ReshapeOp>(
-        conv_op.getLoc(), image_2d_type, conv_op.getLhs());
+    auto image_2d_op = mhlo::ReshapeOp::create(rewriter, conv_op.getLoc(),
+                                               image_2d_type, conv_op.getLhs());
 
     // Transpose image to get it into NWHC form (where H is the added dim).
     SmallVector<int64_t, 4> image_permutation = {
@@ -523,9 +524,9 @@ class Convert1DConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
         dnums.getInputFeatureDimension()};
     auto image_permutation_and_shape = GetPermutationAndTransposedShape(
         image_permutation, image_2d_type, rewriter);
-    auto transposed_image_2d_op = rewriter.create<mhlo::TransposeOp>(
-        loc, image_permutation_and_shape.shape, image_2d_op->getResult(0),
-        image_permutation_and_shape.permutation);
+    auto transposed_image_2d_op = mhlo::TransposeOp::create(
+        rewriter, loc, image_permutation_and_shape.shape,
+        image_2d_op->getResult(0), image_permutation_and_shape.permutation);
 
     // Reshape kernel to add a new spatial dimension.
     auto kernel_type = mlir::cast<ShapedType>(conv_op.getRhs().getType());
@@ -536,8 +537,8 @@ class Convert1DConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
     kernel_2d_shape.push_back(1);
     auto kernel_2d_type =
         RankedTensorType::get(kernel_2d_shape, kernel_type.getElementType());
-    auto kernel_2d_op =
-        rewriter.create<mhlo::ReshapeOp>(loc, kernel_2d_type, conv_op.getRhs());
+    auto kernel_2d_op = mhlo::ReshapeOp::create(rewriter, loc, kernel_2d_type,
+                                                conv_op.getRhs());
 
     // Transpose kernel to get it into WHIO form (where H is the added dim).
     SmallVector<int64_t, 4> kernel_permutation = {
@@ -547,9 +548,9 @@ class Convert1DConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
         dnums.getKernelOutputFeatureDimension()};
     auto kernel_permutation_and_shape = GetPermutationAndTransposedShape(
         kernel_permutation, kernel_2d_type, rewriter);
-    auto transposed_kernel_2d_op = rewriter.create<mhlo::TransposeOp>(
-        loc, kernel_permutation_and_shape.shape, kernel_2d_op->getResult(0),
-        kernel_permutation_and_shape.permutation);
+    auto transposed_kernel_2d_op = mhlo::TransposeOp::create(
+        rewriter, loc, kernel_permutation_and_shape.shape,
+        kernel_2d_op->getResult(0), kernel_permutation_and_shape.permutation);
 
     //
     // Create 2d equivalents for 1d convolution attributes.
@@ -638,12 +639,12 @@ class Convert1DConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
                                          rewriter)
             .shape;
 
-    auto conv2d_op = rewriter.create<mhlo::ConvolutionOp>(
-        loc, transposed_output_2d_shape, transposed_image_2d_op.getResult(),
-        transposed_kernel_2d_op.getResult(), window_strides_2d, padding_2d,
-        lhs_dilation_2d, rhs_dilation_2d, window_reversal_2d, dnums_2d,
-        conv_op.getFeatureGroupCount(), conv_op.getBatchGroupCount(),
-        conv_op.getPrecisionConfigAttr());
+    auto conv2d_op = mhlo::ConvolutionOp::create(
+        rewriter, loc, transposed_output_2d_shape,
+        transposed_image_2d_op.getResult(), transposed_kernel_2d_op.getResult(),
+        window_strides_2d, padding_2d, lhs_dilation_2d, rhs_dilation_2d,
+        window_reversal_2d, dnums_2d, conv_op.getFeatureGroupCount(),
+        conv_op.getBatchGroupCount(), conv_op.getPrecisionConfigAttr());
 
     OpResult conv2d_output = conv2d_op->getResult(0);
     auto conv2d_output_type = mlir::cast<ShapedType>(conv2d_output.getType());
@@ -656,8 +657,8 @@ class Convert1DConvOp : public OpConversionPattern<mhlo::ConvolutionOp> {
     // affectively applied.
     auto output_permutation_and_shape = GetInversePermutationAndShape(
         output_permutation, conv2d_output_type, rewriter);
-    auto transposed_output_2d_op = rewriter.create<mhlo::TransposeOp>(
-        loc, output_permutation_and_shape.shape, conv2d_output,
+    auto transposed_output_2d_op = mhlo::TransposeOp::create(
+        rewriter, loc, output_permutation_and_shape.shape, conv2d_output,
         output_permutation_and_shape.permutation);
 
     // Drop the trailing spatial dimension from the output.
@@ -804,11 +805,10 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp
       } else {
         limit_indices[channel_idx] = depth_idx + 1;
       }
-      return rewriter.create<mhlo::SliceOp>(
-          conv_op.getLoc(), tensor,
-          GetI64ElementsAttr(start_indices, &rewriter),
-          GetI64ElementsAttr(limit_indices, &rewriter),
-          GetI64ElementsAttr(strides, &rewriter));
+      return mhlo::SliceOp::create(rewriter, conv_op.getLoc(), tensor,
+                                   GetI64ElementsAttr(start_indices, &rewriter),
+                                   GetI64ElementsAttr(limit_indices, &rewriter),
+                                   GetI64ElementsAttr(strides, &rewriter));
     };
 
     // Storage for smaller convolution results
@@ -832,18 +832,19 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp
           RankedTensorType::get(new_output_shape, output_type.getElementType());
 
       // Create a Smaller Convolution (Ensure compatibility)
-      auto conv_result = rewriter.create<mhlo::ConvolutionOp>(
-          conv_op.getLoc(), new_output_type, sliced_input, sliced_kernel,
-          conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(),
-          conv_op.getLhsDilationAttr(), conv_op.getRhsDilationAttr(),
-          conv_op.getWindowReversalAttr(), conv_op.getDimensionNumbers(), 1, 1,
+      auto conv_result = mhlo::ConvolutionOp::create(
+          rewriter, conv_op.getLoc(), new_output_type, sliced_input,
+          sliced_kernel, conv_op.getWindowStridesAttr(),
+          conv_op.getPaddingAttr(), conv_op.getLhsDilationAttr(),
+          conv_op.getRhsDilationAttr(), conv_op.getWindowReversalAttr(),
+          conv_op.getDimensionNumbers(), 1, 1,
           conv_op.getPrecisionConfigAttr());
 
       conv_results.push_back(conv_result);
     }
 
-    auto final_output = rewriter.create<mhlo::ConcatenateOp>(
-        conv_op.getLoc(), conv_results,
+    auto final_output = mhlo::ConcatenateOp::create(
+        rewriter, conv_op.getLoc(), conv_results,
         rewriter.getI64IntegerAttr(dnums.getOutputFeatureDimension()));
     rewriter.replaceOp(conv_op, final_output.getResult());
     return mlir::success();
@@ -854,8 +855,8 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp
                               llvm::ArrayRef<int32_t> output_sizes,
                               bool align_corners,
                               ConversionPatternRewriter& rewriter) const {
-    Value output_sizes_attr = rewriter.create<TF::ConstOp>(
-        conv_op.getLoc(),
+    Value output_sizes_attr = TF::ConstOp::create(
+        rewriter, conv_op.getLoc(),
         DenseIntElementsAttr::get(
             RankedTensorType::get({static_cast<int64_t>(output_sizes.size())},
                                   rewriter.getI32Type()),
@@ -863,8 +864,8 @@ class ConvertToResizeBilinearOpOrDepthwiseTransposedConvOp
     // The value of half_pixel_centers couldn't be inferred from the IR and XLA
     // only support half_pixel_centers=True as in 01/11/2022. Here
     // half_pixel_centers=False is hardcoded.
-    Value output = rewriter.create<TF::ResizeBilinearOp>(
-        conv_op.getLoc(), conv_op.getType(), conv_op.getLhs(),
+    Value output = TF::ResizeBilinearOp::create(
+        rewriter, conv_op.getLoc(), conv_op.getType(), conv_op.getLhs(),
         output_sizes_attr,
         /*align_corners=*/rewriter.getBoolAttr(align_corners),
         /*half_pixel_centers=*/rewriter.getBoolAttr(false));
@@ -1071,8 +1072,8 @@ class ConvertNonTrivialConvOp
       permutation.push_back(dnums.getKernelOutputFeatureDimension());
       permutation.push_back(dnums.getKernelInputFeatureDimension());
 
-      auto filter_transposed = rewriter.create<mhlo::TransposeOp>(
-          conv_op.getLoc(), conv_op.getRhs(),
+      auto filter_transposed = mhlo::TransposeOp::create(
+          rewriter, conv_op.getLoc(), conv_op.getRhs(),
           DenseIntElementsAttr::get(
               RankedTensorType::get({static_cast<int64_t>(permutation.size())},
                                     rewriter.getI64Type()),
@@ -1082,8 +1083,9 @@ class ConvertNonTrivialConvOp
 
     // Lets hard-code the reverse indexes to be {0, 1} as the expectation is
     // that the kernel is always in HWOI format, with the above code.
-    mhlo::ReverseOp filter = rewriter.create<mhlo::ReverseOp>(
-        conv_op.getLoc(), reverse_filter_in, rewriter.getI64TensorAttr({0, 1}));
+    mhlo::ReverseOp filter =
+        mhlo::ReverseOp::create(rewriter, conv_op.getLoc(), reverse_filter_in,
+                                rewriter.getI64TensorAttr({0, 1}));
 
     // if output is not in [b, 0, 1, f] format, insert transpose to go back
     if (dnums.getOutputBatchDimension() != 0 ||
@@ -1112,23 +1114,23 @@ class ConvertNonTrivialConvOp
       auto output_type = RankedTensorType::get(
           transposed_output_shape,
           mlir::cast<ShapedType>(conv_op.getRhs().getType()).getElementType());
-      auto output_sizes = rewriter.create<TF::ConstOp>(
-          conv_op.getLoc(),
+      auto output_sizes = TF::ConstOp::create(
+          rewriter, conv_op.getLoc(),
           DenseIntElementsAttr::get(
               RankedTensorType::get(
                   {static_cast<int64_t>(transposed_output_shape_i32.size())},
                   rewriter.getI32Type()),
               transposed_output_shape_i32));
-      auto new_conv = rewriter.create<TF::Conv2DBackpropInputOp>(
-          conv_op.getLoc(), output_type, output_sizes, filter, conv_input,
-          rewriter.getI64ArrayAttr(strides),
+      auto new_conv = TF::Conv2DBackpropInputOp::create(
+          rewriter, conv_op.getLoc(), output_type, output_sizes, filter,
+          conv_input, rewriter.getI64ArrayAttr(strides),
           /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true),
           /*padding=*/rewriter.getStringAttr(padding),
           /*explicit_paddings=*/rewriter.getI64ArrayAttr({}),
           /*data_format=*/rewriter.getStringAttr("NHWC"),
           /*dilations=*/rewriter.getI64ArrayAttr(dilation));
-      auto output_transpose = rewriter.create<mhlo::TransposeOp>(
-          conv_op.getLoc(), new_conv.getResult(),
+      auto output_transpose = mhlo::TransposeOp::create(
+          rewriter, conv_op.getLoc(), new_conv.getResult(),
           rewriter.getI64TensorAttr(transpose_order));
       conv_op->replaceAllUsesWith(output_transpose);
       rewriter.eraseOp(conv_op);
@@ -1139,8 +1141,8 @@ class ConvertNonTrivialConvOp
                .getShape()) {
         output_shape_i32.push_back(dim);
       }
-      auto output_sizes = rewriter.create<TF::ConstOp>(
-          conv_op.getLoc(),
+      auto output_sizes = TF::ConstOp::create(
+          rewriter, conv_op.getLoc(),
           DenseIntElementsAttr::get(
               RankedTensorType::get(
                   {static_cast<int64_t>(output_shape_i32.size())},
@@ -1255,12 +1257,12 @@ class ConvertSliceOp : public OpConversionPattern<mhlo::SliceOp> {
   LogicalResult matchAndRewrite(
       mhlo::SliceOp slice_op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    auto begin = rewriter.create<TF::ConstOp>(slice_op.getLoc(),
-                                              slice_op.getStartIndices());
-    auto end = rewriter.create<TF::ConstOp>(slice_op.getLoc(),
-                                            slice_op.getLimitIndices());
+    auto begin = TF::ConstOp::create(rewriter, slice_op.getLoc(),
+                                     slice_op.getStartIndices());
+    auto end = TF::ConstOp::create(rewriter, slice_op.getLoc(),
+                                   slice_op.getLimitIndices());
     auto strides =
-        rewriter.create<TF::ConstOp>(slice_op.getLoc(), slice_op.getStrides());
+        TF::ConstOp::create(rewriter, slice_op.getLoc(), slice_op.getStrides());
     rewriter.replaceOpWithNewOp<TF::StridedSliceOp>(
         slice_op, slice_op.getType(), slice_op.getOperand(), begin, end,
         strides);
@@ -1294,22 +1296,24 @@ class ConvertDynamicSliceOp : public OpConversionPattern<mhlo::DynamicSliceOp> {
     // Clamp indices to [0, input_size - output_size]
     llvm::SmallVector<Value, 4> start_indices_vector;
     start_indices_vector.reserve(op.getStartIndices().size());
-    Value clamp_min = rewriter.create<TF::ConstOp>(
-        op.getLoc(),
+    Value clamp_min = TF::ConstOp::create(
+        rewriter, op.getLoc(),
         rewriter.getIntegerAttr(signed_start_indices_element_type, 0));
     for (uint64_t i = 0, e = op.getStartIndices().size(); i < e; ++i) {
       // Always put a cast there.
       auto start = op.getStartIndices()[i];
       auto cast_type = mlir::cast<ShapedType>(start.getType())
                            .clone(signed_start_indices_element_type);
-      auto cast_op = rewriter.create<TF::CastOp>(op.getLoc(), cast_type, start);
-      Value clamp_max = rewriter.create<TF::ConstOp>(
-          op.getLoc(), rewriter.getIntegerAttr(
-                           signed_start_indices_element_type,
-                           input_type.getShape()[i] -
-                               op.getSliceSizes().getValues<int64_t>()[i]));
-      Value clamped_index = rewriter.create<mhlo::ClampOp>(
-          op.getLoc(), cast_type, clamp_min, cast_op, clamp_max);
+      auto cast_op =
+          TF::CastOp::create(rewriter, op.getLoc(), cast_type, start);
+      Value clamp_max = TF::ConstOp::create(
+          rewriter, op.getLoc(),
+          rewriter.getIntegerAttr(
+              signed_start_indices_element_type,
+              input_type.getShape()[i] -
+                  op.getSliceSizes().getValues<int64_t>()[i]));
+      Value clamped_index = mhlo::ClampOp::create(
+          rewriter, op.getLoc(), cast_type, clamp_min, cast_op, clamp_max);
       start_indices_vector.push_back(clamped_index);
     }
 
@@ -1317,11 +1321,12 @@ class ConvertDynamicSliceOp : public OpConversionPattern<mhlo::DynamicSliceOp> {
     Type start_indices_type = RankedTensorType::get(
         {static_cast<int64_t>(start_indices_vector.size())},
         signed_start_indices_element_type);
-    Value start_indices_op = rewriter.create<TF::PackOp>(
-        op.getLoc(), start_indices_type, ValueRange(start_indices_vector));
+    Value start_indices_op =
+        TF::PackOp::create(rewriter, op.getLoc(), start_indices_type,
+                           ValueRange(start_indices_vector));
 
     Value slice_sices_op =
-        rewriter.create<TF::ConstOp>(op.getLoc(), op.getSliceSizes());
+        TF::ConstOp::create(rewriter, op.getLoc(), op.getSliceSizes());
     rewriter.replaceOpWithNewOp<TF::SliceOp>(op, op.getType(), op.getOperand(),
                                              start_indices_op, slice_sices_op);
     return success();
@@ -1378,8 +1383,8 @@ Value BuildReshapeOp(ImplicitLocOpBuilder& builder,
                      ArrayRef<int64_t> shape, Type idx_type,
                      Type element_type) {
   Value shape_cst = BuildIntArrayConstOp(builder, rewriter, shape, idx_type);
-  Value reshaped_input = builder.create<TF::ReshapeOp>(
-      RankedTensorType::get(shape, element_type), input, shape_cst);
+  Value reshaped_input = TF::ReshapeOp::create(
+      builder, RankedTensorType::get(shape, element_type), input, shape_cst);
   return reshaped_input;
 }
 
@@ -1389,8 +1394,9 @@ Value BuildSliceOp(ImplicitLocOpBuilder& builder,
                    Value begin, ArrayRef<int64_t> shape, Type idx_type,
                    Type element_type) {
   Value shape_cst = BuildIntArrayConstOp(builder, rewriter, shape, idx_type);
-  Value slice_result = builder.create<TF::SliceOp>(
-      RankedTensorType::get(shape, element_type), input, begin, shape_cst);
+  Value slice_result =
+      TF::SliceOp::create(builder, RankedTensorType::get(shape, element_type),
+                          input, begin, shape_cst);
   return slice_result;
 }
 
@@ -1416,8 +1422,8 @@ class ConvertDynamicUpdateSliceOp
     llvm::SmallVector<Value> start_indices_vector;
     Append(start_indices_vector, op.getStartIndices());
     auto shape_tensor_type = RankedTensorType::get({shape_dim}, idx_type);
-    Value start_indices_tensor = rewriter.create<TF::PackOp>(
-        op.getLoc(), shape_tensor_type, start_indices_vector);
+    Value start_indices_tensor = TF::PackOp::create(
+        rewriter, op.getLoc(), shape_tensor_type, start_indices_vector);
     rewriter.replaceOpWithNewOp<TF::XlaDynamicUpdateSliceOp>(
         op, op.getType(), op.getOperand(), op.getUpdate(),
         start_indices_tensor);
@@ -1584,7 +1590,7 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
                                       bool is_lhs) {
   auto operand_type = mlir::cast<ShapedType>(operand.getType());
   BoolAttr true_attr = builder.getBoolAttr(true);
-  auto operand_shape = builder.create<TF::ShapeOp>(operand, true_attr);
+  auto operand_shape = TF::ShapeOp::create(builder, operand, true_attr);
   const int64_t operand_rank = operand_type.getRank();
   // Compute flattened out dimension and contracting dimension using
   // TF::UnsortedSegmentProdOp.
@@ -1600,26 +1606,28 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
   }
   auto seg_prod_result_type =
       RankedTensorType::get(static_cast<int32_t>(1), builder.getI32Type());
-  auto out_segids_cst = builder.create<TF::ConstOp>(
-      builder.getI32TensorAttr(flattened_out_segids));
-  auto contracting_segids_cst = builder.create<TF::ConstOp>(
-      builder.getI32TensorAttr(flattened_contracting_segids));
+  auto out_segids_cst = TF::ConstOp::create(
+      builder, builder.getI32TensorAttr(flattened_out_segids));
+  auto contracting_segids_cst = TF::ConstOp::create(
+      builder, builder.getI32TensorAttr(flattened_contracting_segids));
   auto num_segids_tensor =
-      builder.create<TF::ConstOp>(builder.getI32IntegerAttr(1));
-  auto flattened_out_dims = builder.create<TF::UnsortedSegmentProdOp>(
-      seg_prod_result_type, operand_shape, out_segids_cst, num_segids_tensor);
-  auto flattened_contracting_dims = builder.create<TF::UnsortedSegmentProdOp>(
-      seg_prod_result_type, operand_shape, contracting_segids_cst,
+      TF::ConstOp::create(builder, builder.getI32IntegerAttr(1));
+  auto flattened_out_dims = TF::UnsortedSegmentProdOp::create(
+      builder, seg_prod_result_type, operand_shape, out_segids_cst,
+      num_segids_tensor);
+  auto flattened_contracting_dims = TF::UnsortedSegmentProdOp::create(
+      builder, seg_prod_result_type, operand_shape, contracting_segids_cst,
       num_segids_tensor);
   llvm::SmallVector<Value, 3> flattend_shape_values;
   // Gather the batch dimensions.
   if (!dot_dimensions_info.batch_dimensions().AxesArray().empty()) {
     if (ShapedType::isDynamicShape(
             dot_dimensions_info.batch_dimensions().SizesArray())) {
-      auto batch_axes_tensor =
-          builder.create<TF::ConstOp>(builder.getI64TensorAttr(
-              dot_dimensions_info.batch_dimensions().AxesArray()));
-      auto batch_dims = builder.create<TF::GatherOp>(
+      auto batch_axes_tensor = TF::ConstOp::create(
+          builder, builder.getI64TensorAttr(
+                       dot_dimensions_info.batch_dimensions().AxesArray()));
+      auto batch_dims = TF::GatherOp::create(
+          builder,
           RankedTensorType::get(
               {static_cast<int>(
                   dot_dimensions_info.batch_dimensions().AxesArray().size())},
@@ -1633,7 +1641,7 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
         batch_i32_vec.push_back(static_cast<int32_t>(element));
       }
       auto batch_dims =
-          builder.create<TF::ConstOp>(builder.getI32TensorAttr(batch_i32_vec));
+          TF::ConstOp::create(builder, builder.getI32TensorAttr(batch_i32_vec));
       flattend_shape_values.push_back(batch_dims);
     }
   }
@@ -1649,9 +1657,9 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
       builder.getIntegerType(32));
   // Concatenate the batch dimensions, flattened out dimension and flattened
   // contracting dimension.
-  return builder.create<TF::ConcatOp>(
-      concat_result_type,
-      builder.create<TF::ConstOp>(builder.getI32IntegerAttr(0)),
+  return TF::ConcatOp::create(
+      builder, concat_result_type,
+      TF::ConstOp::create(builder, builder.getI32IntegerAttr(0)),
       flattend_shape_values);
 }
 
@@ -1682,8 +1690,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
       lhs_dot_dimensions_info.batch_dimensions().SizesArray(),
       lhs_dot_dimensions_info.out_dimensions().SizesArray(),
       lhs_dot_dimensions_info.contracting_dimensions().SizesArray());
-  auto lhs_transposed = rewriter.create<mhlo::TransposeOp>(
-      loc,
+  auto lhs_transposed = mhlo::TransposeOp::create(
+      rewriter, loc,
       RankedTensorType::get(lhs_transposed_shape, lhs_type.getElementType()),
       lhs,
       DenseIntElementsAttr::get(
@@ -1700,8 +1708,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
       rhs_dot_dimensions_info.batch_dimensions().SizesArray(),
       rhs_dot_dimensions_info.contracting_dimensions().SizesArray(),
       rhs_dot_dimensions_info.out_dimensions().SizesArray());
-  auto rhs_transposed = rewriter.create<mhlo::TransposeOp>(
-      loc,
+  auto rhs_transposed = mhlo::TransposeOp::create(
+      rewriter, loc,
       RankedTensorType::get(rhs_transposed_shape, rhs_type.getElementType()),
       rhs,
       DenseIntElementsAttr::get(
@@ -1717,15 +1725,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
           lhs_dot_dimensions_info.FlattenedContractingDimensionSize()});
   Value lhs_flattend;
   if (lhs_type.hasStaticShape()) {
-    lhs_flattend = rewriter.create<mhlo::ReshapeOp>(
-        loc,
+    lhs_flattend = mhlo::ReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()),
         lhs_transposed.getResult());
   } else {
     auto lhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp(
         lhs, lhs_dot_dimensions_info, builder, /*is_lhs=*/true);
-    lhs_flattend = rewriter.create<mhlo::DynamicReshapeOp>(
-        loc,
+    lhs_flattend = mhlo::DynamicReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()),
         lhs_transposed, lhs_flattend_shape_op);
   }
@@ -1739,15 +1747,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
           rhs_dot_dimensions_info.FlattenedOutDimensionSize()});
   Value rhs_flattend;
   if (rhs_type.hasStaticShape()) {
-    rhs_flattend = rewriter.create<mhlo::ReshapeOp>(
-        loc,
+    rhs_flattend = mhlo::ReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()),
         rhs_transposed.getResult());
   } else {
     auto rhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp(
         rhs, rhs_dot_dimensions_info, builder, /*is_lhs=*/false);
-    rhs_flattend = rewriter.create<mhlo::DynamicReshapeOp>(
-        loc,
+    rhs_flattend = mhlo::DynamicReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()),
         rhs_transposed, rhs_flattend_shape_op);
   }
@@ -1759,36 +1767,38 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
                           lhs_dot_dimensions_info.FlattenedOutDimensionSize()},
                       llvm::ArrayRef<int64_t>{
                           rhs_dot_dimensions_info.FlattenedOutDimensionSize()});
-  auto matmul = rewriter.create<TF::BatchMatMulV3Op>(
-      loc, RankedTensorType::get(matmul_shape, result_type.getElementType()),
+  auto matmul = TF::BatchMatMulV3Op::create(
+      rewriter, loc,
+      RankedTensorType::get(matmul_shape, result_type.getElementType()),
       lhs_flattend, rhs_flattend);
 
   if (result_type.hasStaticShape()) {
     auto reshaped =
-        rewriter.create<mhlo::ReshapeOp>(loc, result_type, matmul.getResult());
+        mhlo::ReshapeOp::create(rewriter, loc, result_type, matmul.getResult());
     return reshaped.getResult();
   }
 
   // Reshape for dynamic shaped operands. The result shape is
   // [lhs_batch_dimensions, lhs_out_dimensions, rhs_out_dimensions].
   BoolAttr true_attr = rewriter.getBoolAttr(true);
-  auto lhs_shape = rewriter.create<TF::ShapeOp>(loc, lhs, true_attr);
-  auto rhs_shape = rewriter.create<TF::ShapeOp>(loc, rhs, true_attr);
+  auto lhs_shape = TF::ShapeOp::create(rewriter, loc, lhs, true_attr);
+  auto rhs_shape = TF::ShapeOp::create(rewriter, loc, rhs, true_attr);
   llvm::SmallVector<int64_t, 4> lhs_batch_and_out =
       Concat<int64_t>(lhs_dot_dimensions_info.batch_dimensions().AxesArray(),
                       lhs_dot_dimensions_info.out_dimensions().AxesArray());
-  auto lhs_batch_and_out_cst = rewriter.create<TF::ConstOp>(
-      loc, rewriter.getI64TensorAttr(lhs_batch_and_out));
-  auto lhs_batch_and_out_dims = rewriter.create<TF::GatherOp>(
-      loc,
+  auto lhs_batch_and_out_cst = TF::ConstOp::create(
+      rewriter, loc, rewriter.getI64TensorAttr(lhs_batch_and_out));
+  auto lhs_batch_and_out_dims = TF::GatherOp::create(
+      rewriter, loc,
       RankedTensorType::get({static_cast<int>(lhs_batch_and_out.size())},
                             rewriter.getIntegerType(32)),
       lhs_shape, lhs_batch_and_out_cst, true_attr);
-  auto rhs_out_cst = rewriter.create<TF::ConstOp>(
-      loc, rewriter.getI64TensorAttr(
-               rhs_dot_dimensions_info.out_dimensions().AxesArray()));
-  auto rhs_out_dims = rewriter.create<TF::GatherOp>(
-      loc,
+  auto rhs_out_cst = TF::ConstOp::create(
+      rewriter, loc,
+      rewriter.getI64TensorAttr(
+          rhs_dot_dimensions_info.out_dimensions().AxesArray()));
+  auto rhs_out_dims = TF::GatherOp::create(
+      rewriter, loc,
       RankedTensorType::get(
           {static_cast<int32_t>(
               rhs_dot_dimensions_info.out_dimensions().AxesArray().size())},
@@ -1800,13 +1810,13 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
           lhs_dot_dimensions_info.out_dimensions().AxesArray().size() +
           rhs_dot_dimensions_info.out_dimensions().AxesArray().size())},
       rewriter.getIntegerType(32));
-  auto result_shape = rewriter.create<TF::ConcatOp>(
-      loc, result_shape_type,
-      rewriter.create<TF::ConstOp>(loc, rewriter.getI32IntegerAttr(0)),
+  auto result_shape = TF::ConcatOp::create(
+      rewriter, loc, result_shape_type,
+      TF::ConstOp::create(rewriter, loc, rewriter.getI32IntegerAttr(0)),
       ValueRange{lhs_batch_and_out_dims, rhs_out_dims});
 
-  auto reshaped = rewriter.create<mhlo::DynamicReshapeOp>(
-      loc, result_type, matmul.getResult(), result_shape);
+  auto reshaped = mhlo::DynamicReshapeOp::create(
+      rewriter, loc, result_type, matmul.getResult(), result_shape);
   return reshaped.getResult();
 }
 
@@ -1844,9 +1854,10 @@ template <typename TfReduceOp, typename TfBinOp>
 LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input,
                                        TF::ConstOp reduction_indices,
                                        ConversionPatternRewriter& rewriter) {
-  Value reduce_result = rewriter.create<TfReduceOp>(
-      reduce_op.getLoc(), reduce_op.getType(0), input, reduction_indices,
-      /*keep_dim=*/rewriter.getBoolAttr(false));
+  Value reduce_result =
+      TfReduceOp::create(rewriter, reduce_op.getLoc(), reduce_op.getType(0),
+                         input, reduction_indices,
+                         /*keep_dim=*/rewriter.getBoolAttr(false));
   rewriter.replaceOpWithNewOp<TfBinOp>(reduce_op, reduce_op.getType(0),
                                        reduce_result,
                                        reduce_op.getInitValues()[0]);
@@ -1902,8 +1913,9 @@ class ConvertReduceOpToTfOp : public OpConversionPattern<mhlo::ReduceOp> {
     }
     auto dim_type = RankedTensorType::get(
         {static_cast<int64_t>(reduce_dims.size())}, rewriter.getI64Type());
-    auto reduction_indices = rewriter.create<TF::ConstOp>(
-        reduce_op.getLoc(), dim_type, rewriter.getI64TensorAttr(reduce_dims));
+    auto reduction_indices =
+        TF::ConstOp::create(rewriter, reduce_op.getLoc(), dim_type,
+                            rewriter.getI64TensorAttr(reduce_dims));
 
     // In `MatchReduceOpOperand` function, we already match that the
     // "mhlo::ReduceOp" only has one operand, one init_value and one result.
@@ -2103,25 +2115,26 @@ class ConvertIotaOpToTfRange : public OpConversionPattern<mhlo::IotaOp> {
 
     auto range_type =
         RankedTensorType::get({type.getShape()[dimension]}, element_type);
-    Value start_op = rewriter.create<TF::ConstOp>(iota_op.getLoc(), start);
-    Value limit_op = rewriter.create<TF::ConstOp>(iota_op.getLoc(), limit);
-    Value delta_op = rewriter.create<TF::ConstOp>(iota_op.getLoc(), delta);
-    Value result = rewriter.create<TF::RangeOp>(iota_op.getLoc(), range_type,
-                                                start_op, limit_op, delta_op);
+    Value start_op = TF::ConstOp::create(rewriter, iota_op.getLoc(), start);
+    Value limit_op = TF::ConstOp::create(rewriter, iota_op.getLoc(), limit);
+    Value delta_op = TF::ConstOp::create(rewriter, iota_op.getLoc(), delta);
+    Value result = TF::RangeOp::create(rewriter, iota_op.getLoc(), range_type,
+                                       start_op, limit_op, delta_op);
 
     if (type.getRank() > 1) {
       std::vector<int64_t> reshape_shape(type.getRank(), 1);
       reshape_shape[iota_op.getIotaDimension()] = type.getShape()[dimension];
       auto reshape_type = RankedTensorType::get(reshape_shape, element_type);
-      Value reshape_shape_op = rewriter.create<TF::ConstOp>(
-          iota_op.getLoc(), rewriter.getI64TensorAttr(reshape_shape));
-      result = rewriter.create<TF::ReshapeOp>(iota_op.getLoc(), reshape_type,
-                                              result, reshape_shape_op);
+      Value reshape_shape_op = TF::ConstOp::create(
+          rewriter, iota_op.getLoc(), rewriter.getI64TensorAttr(reshape_shape));
+      result = TF::ReshapeOp::create(rewriter, iota_op.getLoc(), reshape_type,
+                                     result, reshape_shape_op);
 
-      Value broadcast_shape_op = rewriter.create<TF::ConstOp>(
-          iota_op.getLoc(), rewriter.getI64TensorAttr(type.getShape()));
-      result = rewriter.create<TF::BroadcastToOp>(iota_op.getLoc(), type,
-                                                  result, broadcast_shape_op);
+      Value broadcast_shape_op =
+          TF::ConstOp::create(rewriter, iota_op.getLoc(),
+                              rewriter.getI64TensorAttr(type.getShape()));
+      result = TF::BroadcastToOp::create(rewriter, iota_op.getLoc(), type,
+                                         result, broadcast_shape_op);
     }
 
     rewriter.replaceOp(iota_op, result);
@@ -2314,8 +2327,8 @@ class ConvertLoweredCumOp : public OpConversionPattern<mhlo::ReduceWindowOp> {
       if (right_padding != 0) return failure();
     }
 
-    auto axis = rewriter.create<TF::ConstOp>(
-        rw->getLoc(),
+    auto axis = TF::ConstOp::create(
+        rewriter, rw->getLoc(),
         rewriter.getIntegerAttr(rewriter.getIntegerType(64), cumulative_axis));
 
     rewriter.replaceOpWithNewOp<TfCumOp>(rw, rw.getType(0), rw.getInputs()[0],
@@ -2585,7 +2598,7 @@ arith::ConstantOp ShapeToConst(PatternRewriter& rewriter, Value value) {
   auto attr_type = RankedTensorType::get({static_cast<int64_t>(shape.size())},
                                          rewriter.getIntegerType(64));
   auto attr = DenseElementsAttr::get(attr_type, shape);
-  return rewriter.create<arith::ConstantOp>(value.getLoc(), attr_type, attr);
+  return arith::ConstantOp::create(rewriter, value.getLoc(), attr_type, attr);
 }
 
 bool IsSign(APInt a, APInt sign) {
@@ -2841,8 +2854,8 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
 
     TF::CastOp cast_op = nullptr;
     if (canonical_start_indices_type.getElementType().isUnsignedInteger(32)) {
-      cast_op = rewriter.create<TF::CastOp>(
-          gather_op->getLoc(),
+      cast_op = TF::CastOp::create(
+          rewriter, gather_op->getLoc(),
           RankedTensorType::get(canonical_start_indices_type.getShape(),
                                 rewriter.getI64Type()),
           canonical_start_indices);
@@ -2861,8 +2874,8 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
 
     auto canonical_result_type = RankedTensorType::get(
         canonical_result_shape, result_type.getElementType());
-    auto canonical_result = rewriter.create<TF::GatherNdOp>(
-        gather_op->getLoc(), canonical_result_type, canonical_operand,
+    auto canonical_result = TF::GatherNdOp::create(
+        rewriter, gather_op->getLoc(), canonical_result_type, canonical_operand,
         cast_op ? cast_op.getResult() : canonical_start_indices);
 
     auto offset_dims = gather_op.getDimensionNumbers().getOffsetDims();
@@ -2968,24 +2981,24 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
     auto min_start_indices = BuildIntArrayConstOp(
         builder, rewriter, llvm::SmallVector<int64_t>({0, 0}),
         start_indices_type.getElementType());
-    auto start_indices_max_op = rewriter.create<TF::MaximumOp>(
-        gather_op.getLoc(), start_indices, min_start_indices);
-    auto clamped_start_indices_op = rewriter.create<TF::MinimumOp>(
-        gather_op.getLoc(), start_indices_max_op, max_start_indices);
+    auto start_indices_max_op = TF::MaximumOp::create(
+        rewriter, gather_op.getLoc(), start_indices, min_start_indices);
+    auto clamped_start_indices_op = TF::MinimumOp::create(
+        rewriter, gather_op.getLoc(), start_indices_max_op, max_start_indices);
 
     int64_t batch_size = start_indices_type.getDimSize(batch_dim);
     auto slice_size = BuildIntArrayConstOp(
         builder, rewriter, slice_sizes_vector, rewriter.getI32Type());
     if (batch_size == 1) {
-      auto squeeze_op = rewriter.create<TF::SqueezeOp>(
-          gather_op.getLoc(),
+      auto squeeze_op = TF::SqueezeOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get({rank_two},
                                 start_indices_type.getElementType()),
           clamped_start_indices_op,
           rewriter.getI64ArrayAttr(llvm::ArrayRef<int64_t>({batch_dim})));
       auto slice_op =
-          rewriter.create<TF::SliceOp>(gather_op.getLoc(), gather_op.getType(),
-                                       operand, squeeze_op, slice_size);
+          TF::SliceOp::create(rewriter, gather_op.getLoc(), gather_op.getType(),
+                              operand, squeeze_op, slice_size);
       rewriter.replaceOp(gather_op, slice_op);
       return mlir::success();
     }
@@ -2999,29 +3012,29 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
       auto two = BuildIntArrayConstOp(builder, rewriter,
                                       llvm::SmallVector<int64_t>({1, 2}),
                                       rewriter.getI32Type());
-      auto begin = rewriter.create<TF::SliceOp>(
-          gather_op.getLoc(),
+      auto begin = TF::SliceOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get({1, 2}, start_indices_type.getElementType()),
           clamped_start_indices_op, zero, two);
-      auto squeeze_op = rewriter.create<TF::SqueezeOp>(
-          gather_op.getLoc(),
+      auto squeeze_op = TF::SqueezeOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get({rank_two},
                                 start_indices_type.getElementType()),
           begin,
           rewriter.getI64ArrayAttr(llvm::ArrayRef<int64_t>({batch_dim})));
-      auto slice_op = rewriter.create<TF::SliceOp>(
-          gather_op.getLoc(),
+      auto slice_op = TF::SliceOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get({1, slice_sizes_vector[1]},
                                 operand_type.getElementType()),
           operand, squeeze_op, slice_size);
       slices.push_back(slice_op);
     }
     auto scalar_type = RankedTensorType::get({}, rewriter.getI32Type());
-    auto zero_scalar = rewriter.create<TF::ConstOp>(
-        gather_op.getLoc(),
+    auto zero_scalar = TF::ConstOp::create(
+        rewriter, gather_op.getLoc(),
         DenseIntElementsAttr::get(scalar_type, static_cast<int32_t>(0)));
-    auto concat_op = rewriter.create<TF::ConcatV2Op>(
-        gather_op.getLoc(), result_type, slices, zero_scalar);
+    auto concat_op = TF::ConcatV2Op::create(rewriter, gather_op.getLoc(),
+                                            result_type, slices, zero_scalar);
     rewriter.replaceOp(gather_op, concat_op);
     return mlir::success();
   }
@@ -3116,12 +3129,13 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
     if (canonical_result_type.hasStaticShape()) {
       auto unflattened_result_type = RankedTensorType::get(
           unflattened_shape, original_result_type.getElementType());
-      canonical_result = rewriter.create<mhlo::ReshapeOp>(
-          gather_op.getLoc(), unflattened_result_type, canonical_result);
+      canonical_result =
+          mhlo::ReshapeOp::create(rewriter, gather_op.getLoc(),
+                                  unflattened_result_type, canonical_result);
     }
     // Transpose back to the original result shape.
-    return rewriter.create<mhlo::TransposeOp>(
-        gather_op.getLoc(), original_result_type, canonical_result,
+    return mhlo::TransposeOp::create(
+        rewriter, gather_op.getLoc(), original_result_type, canonical_result,
         rewriter.getI64TensorAttr(
             GetInversePermutationArray(permutation_to_canonical)));
   }
@@ -3168,13 +3182,13 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
     // Transpose the dimensions and flatten the batching dimensions.
     RankedTensorType transposed_type =
         RankedTensorType::get(transposed_shape, operand_type.getElementType());
-    auto transposed_operand = rewriter.create<mhlo::TransposeOp>(
-        gather_op.getLoc(), transposed_type, operand,
+    auto transposed_operand = mhlo::TransposeOp::create(
+        rewriter, gather_op.getLoc(), transposed_type, operand,
         rewriter.getI64TensorAttr(permutation));
     auto flattened_type =
         RankedTensorType::get(flattened_shape, operand_type.getElementType());
-    auto flattened_operand = rewriter.create<mhlo::ReshapeOp>(
-        gather_op.getLoc(), flattened_type, transposed_operand);
+    auto flattened_operand = mhlo::ReshapeOp::create(
+        rewriter, gather_op.getLoc(), flattened_type, transposed_operand);
     return flattened_operand;
   }
 
@@ -3233,13 +3247,13 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
     reshaped_shape.push_back(index_vector_size);
 
     // Transpose the dimensions and flatten the batching dimensions.
-    auto transposed_start_indices = rewriter.create<mhlo::TransposeOp>(
-        gather_op.getLoc(),
+    auto transposed_start_indices = mhlo::TransposeOp::create(
+        rewriter, gather_op.getLoc(),
         RankedTensorType::get(transposed_shape,
                               start_indices_type.getElementType()),
         start_indices, rewriter.getI64TensorAttr(permutation));
-    start_indices = rewriter.create<mhlo::ReshapeOp>(
-        gather_op.getLoc(),
+    start_indices = mhlo::ReshapeOp::create(
+        rewriter, gather_op.getLoc(),
         RankedTensorType::get(reshaped_shape,
                               start_indices_type.getElementType()),
         transposed_start_indices);
@@ -3275,32 +3289,33 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
       llvm::SmallVector<int64_t> offsets_shape(start_indices_shape.size(), 1);
       offsets_shape[non_trivial_sliced_dim] = slice_sizes[operand_dim];
       start_indices_shape[non_trivial_sliced_dim] = slice_sizes[operand_dim];
-      auto offsets = rewriter.create<mhlo::IotaOp>(
-          gather_op.getLoc(),
+      auto offsets = mhlo::IotaOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get(offsets_shape,
                                 start_indices_type.getElementType()),
           rewriter.getI64IntegerAttr(non_trivial_sliced_dim));
       non_trivial_sliced_dim++;
 
       // Pad with 0s on the other operand dimensions.
-      Value zero = rewriter.create<arith::ConstantOp>(
-          gather_op.getLoc(), rewriter.getZeroAttr(RankedTensorType::get(
-                                  {}, start_indices_type.getElementType())));
+      Value zero = arith::ConstantOp::create(
+          rewriter, gather_op.getLoc(),
+          rewriter.getZeroAttr(
+              RankedTensorType::get({}, start_indices_type.getElementType())));
       int rank = offsets_shape.size();
       llvm::SmallVector<int64_t> padding_low(rank, 0);
       llvm::SmallVector<int64_t> padding_high(rank, 0);
       llvm::SmallVector<int64_t> padding_interior(rank, 0);
       padding_low.back() = i;
       padding_high.back() = start_indices_shape.back() - i - 1;
-      auto padded_offsets = rewriter.create<mhlo::PadOp>(
-          gather_op.getLoc(), offsets, zero,
-          GetI64ElementsAttr(padding_low, &rewriter),
-          GetI64ElementsAttr(padding_high, &rewriter),
-          GetI64ElementsAttr(padding_interior, &rewriter));
+      auto padded_offsets =
+          mhlo::PadOp::create(rewriter, gather_op.getLoc(), offsets, zero,
+                              GetI64ElementsAttr(padding_low, &rewriter),
+                              GetI64ElementsAttr(padding_high, &rewriter),
+                              GetI64ElementsAttr(padding_interior, &rewriter));
 
       // Add the padded offsets to the start indices (with broadcasting).
-      start_indices = rewriter.create<TF::AddOp>(gather_op.getLoc(),
-                                                 start_indices, padded_offsets);
+      start_indices = TF::AddOp::create(rewriter, gather_op.getLoc(),
+                                        start_indices, padded_offsets);
     }
 
     if (!start_indices_batching_dims.empty()) {
@@ -3308,15 +3323,15 @@ class ConvertGatherOp : public OpConversionPattern<mhlo::GatherOp> {
       // operand.
       llvm::SmallVector<int64_t> offsets_shape = start_indices_shape;
       offsets_shape.back() = 1;
-      auto offsets = rewriter.create<mhlo::IotaOp>(
-          gather_op.getLoc(),
+      auto offsets = mhlo::IotaOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get(offsets_shape,
                                 start_indices_type.getElementType()),
           rewriter.getI64IntegerAttr(0));
 
       start_indices_shape.back()++;
-      start_indices = rewriter.create<mhlo::ConcatenateOp>(
-          gather_op.getLoc(),
+      start_indices = mhlo::ConcatenateOp::create(
+          rewriter, gather_op.getLoc(),
           RankedTensorType::get(start_indices_shape,
                                 start_indices_type.getElementType()),
           ValueRange{offsets, start_indices},
@@ -3345,8 +3360,9 @@ class ConvertWhileOp : public OpConversionPattern<mhlo::WhileOp> {
     // Creates a TF::WhileRegionOp to replace the mhlo::WhileOp. HLO WhileOp
     // currently doesn't support stateless and shape invariant, so these
     // parameters are set to the default values.
-    auto new_while = rewriter.create<TF::WhileRegionOp>(
-        while_op.getLoc(), while_op->getResultTypes(), while_op->getOperands(),
+    auto new_while = TF::WhileRegionOp::create(
+        rewriter, while_op.getLoc(), while_op->getResultTypes(),
+        while_op->getOperands(),
         /*parallel_iterations=*/10,
         /*is_stateless=*/false, /*shape_invariant=*/false);
     new_while.getCond().takeBody(while_op.getCond());
@@ -3366,8 +3382,8 @@ class ConvertIfOp : public OpConversionPattern<mhlo::IfOp> {
       mhlo::IfOp op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
     // HLO IfOp currently doesn't support stateless
-    auto new_op = rewriter.create<TF::IfRegionOp>(
-        op.getLoc(), op->getResultTypes(), op.getPred(),
+    auto new_op = TF::IfRegionOp::create(
+        rewriter, op.getLoc(), op->getResultTypes(), op.getPred(),
         /*is_stateless=*/false, /*_then_func_name=*/nullptr,
         /*_else_func_name=*/nullptr);
     new_op.getThenBranch().takeBody(op.getTrueBranch());
@@ -3427,10 +3443,10 @@ Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) {
       {pad_op.getEdgePaddingLow().size(), 2}, rewriter.getI64Type());
   auto padding_attr = DenseIntElementsAttr::get(padding_attr_type, padding);
   auto padding_amount_const_op =
-      rewriter.create<arith::ConstantOp>(loc, padding_attr_type, padding_attr);
-  auto new_pad_op = rewriter.create<TF::PadV2Op>(
-      loc, pad_op.getType().clone(pad_output_shape), pad_op.getOperand(),
-      padding_amount_const_op, pad_op.getPaddingValue());
+      arith::ConstantOp::create(rewriter, loc, padding_attr_type, padding_attr);
+  auto new_pad_op = TF::PadV2Op::create(
+      rewriter, loc, pad_op.getType().clone(pad_output_shape),
+      pad_op.getOperand(), padding_amount_const_op, pad_op.getPaddingValue());
   if (!has_negative_padding_amount) {
     return new_pad_op;
   }
@@ -3438,15 +3454,14 @@ Value ConvertPadOp(PatternRewriter& rewriter, Operation* old_op) {
   // Convert negative padding amount into slice.
   auto slice_attr_type = RankedTensorType::get(
       {pad_op.getEdgePaddingLow().size()}, rewriter.getI64Type());
-  auto slice_begins_const_op = rewriter.create<arith::ConstantOp>(
-      loc, slice_attr_type,
+  auto slice_begins_const_op = arith::ConstantOp::create(
+      rewriter, loc, slice_attr_type,
       DenseIntElementsAttr::get(slice_attr_type, slice_begins));
-  auto slice_sizes_const_op = rewriter.create<arith::ConstantOp>(
-      loc, slice_attr_type,
+  auto slice_sizes_const_op = arith::ConstantOp::create(
+      rewriter, loc, slice_attr_type,
       DenseIntElementsAttr::get(slice_attr_type, slice_sizes));
-  return rewriter.create<TF::SliceOp>(loc, pad_op.getType(), new_pad_op,
-                                      slice_begins_const_op,
-                                      slice_sizes_const_op);
+  return TF::SliceOp::create(rewriter, loc, pad_op.getType(), new_pad_op,
+                             slice_begins_const_op, slice_sizes_const_op);
 }
 
 class ConvertPopulationCountOp
@@ -3459,8 +3474,8 @@ class ConvertPopulationCountOp
       ConversionPatternRewriter& rewriter) const final {
     auto output_type = op.getType().clone(
         rewriter.getIntegerType(/*width=*/8, /*isSigned=*/false));
-    auto pop_cnt = rewriter.create<TF::PopulationCountOp>(
-        op.getLoc(), output_type, op.getOperand());
+    auto pop_cnt = TF::PopulationCountOp::create(rewriter, op.getLoc(),
+                                                 output_type, op.getOperand());
     auto cast_or_pop_cnt =
         rewriter.createOrFold<TF::CastOp>(op.getLoc(), op.getType(), pop_cnt);
     rewriter.replaceOp(op, {cast_or_pop_cnt});
@@ -3608,9 +3623,9 @@ class ConvertCustomCallWithApproxTopK
     }
     auto is_max_k = rewriter.getBoolAttr(true);
 
-    auto approx_top_k = rewriter.create<TF::ApproxTopKOp>(
-        op.getLoc(), op->getResultTypes(), op.getInputs()[0], top_k_attr,
-        reduction_dim_attr, recall_target_attr, is_max_k,
+    auto approx_top_k = TF::ApproxTopKOp::create(
+        rewriter, op.getLoc(), op->getResultTypes(), op.getInputs()[0],
+        top_k_attr, reduction_dim_attr, recall_target_attr, is_max_k,
         reduction_input_size_override_attr, aggregate_to_topk_attr);
 
     rewriter.replaceOp(op, approx_top_k.getResults());
@@ -3661,8 +3676,8 @@ class ConvertGetDimensionSizeOp
       mhlo::GetDimensionSizeOp op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
     ImplicitLocOpBuilder builder(op.getLoc(), rewriter);
-    Value shape_op = rewriter.create<TF::ShapeOp>(op.getLoc(), op.getOperand(),
-                                                  rewriter.getBoolAttr(true));
+    Value shape_op = TF::ShapeOp::create(rewriter, op.getLoc(), op.getOperand(),
+                                         rewriter.getBoolAttr(true));
     Value size =
         BuildIntArrayConstOp(builder, rewriter, llvm::SmallVector<int64_t>({1}),
                              rewriter.getI32Type());
@@ -3670,13 +3685,13 @@ class ConvertGetDimensionSizeOp
         builder, rewriter,
         llvm::SmallVector<int64_t>({static_cast<int64_t>(op.getDimension())}),
         rewriter.getI64Type());
-    Value slice_op = rewriter.create<TF::SliceOp>(
-        op.getLoc(),
+    Value slice_op = TF::SliceOp::create(
+        rewriter, op.getLoc(),
         RankedTensorType::get({static_cast<int64_t>(1)},
                               op.getType().getElementType()),
         shape_op, begin, size);
-    Value squeeze_op = rewriter.create<TF::SqueezeOp>(
-        op.getLoc(), op.getType(), slice_op,
+    Value squeeze_op = TF::SqueezeOp::create(
+        rewriter, op.getLoc(), op.getType(), slice_op,
         rewriter.getI64ArrayAttr(llvm::ArrayRef<int64_t>({0})));
     rewriter.replaceOp(op, {squeeze_op});
     return success();
@@ -3749,25 +3764,26 @@ class ConvertDynamicIotaOp : public OpConversionPattern<mhlo::DynamicIotaOp> {
     if (mlir::isa<FloatType>(element_type)) {
       auto cast_type =
           mlir::cast<ShapedType>(output_shape.getType()).clone(element_type);
-      output_shape = rewriter.create<TF::CastOp>(dynamic_iota_op.getLoc(),
-                                                 cast_type, output_shape);
+      output_shape = TF::CastOp::create(rewriter, dynamic_iota_op.getLoc(),
+                                        cast_type, output_shape);
     }
     DenseIntElementsAttr scalar_attr = DenseIntElementsAttr::get(
         RankedTensorType::get({0}, rewriter.getI32Type()),
         llvm::ArrayRef<int32_t>({}));
     auto scalar_shape =
-        rewriter.create<TF::ConstOp>(dynamic_iota_op.getLoc(), scalar_attr);
-    auto limit_scalar = rewriter.create<TF::ReshapeOp>(
-        dynamic_iota_op.getLoc(), RankedTensorType::get({}, element_type),
-        output_shape, scalar_shape);
+        TF::ConstOp::create(rewriter, dynamic_iota_op.getLoc(), scalar_attr);
+    auto limit_scalar = TF::ReshapeOp::create(
+        rewriter, dynamic_iota_op.getLoc(),
+        RankedTensorType::get({}, element_type), output_shape, scalar_shape);
     auto range_type =
         RankedTensorType::get({type.getShape()[dimension]}, element_type);
     Value start_op =
-        rewriter.create<TF::ConstOp>(dynamic_iota_op.getLoc(), start);
+        TF::ConstOp::create(rewriter, dynamic_iota_op.getLoc(), start);
     Value delta_op =
-        rewriter.create<TF::ConstOp>(dynamic_iota_op.getLoc(), delta);
-    Value range_op = rewriter.create<TF::RangeOp>(
-        dynamic_iota_op.getLoc(), range_type, start_op, limit_scalar, delta_op);
+        TF::ConstOp::create(rewriter, dynamic_iota_op.getLoc(), delta);
+    Value range_op =
+        TF::RangeOp::create(rewriter, dynamic_iota_op.getLoc(), range_type,
+                            start_op, limit_scalar, delta_op);
     rewriter.replaceOp(dynamic_iota_op, range_op);
     return success();
   }
@@ -3820,7 +3836,7 @@ arith::ConstantOp ExpandedShape(PatternRewriter& rewriter, Value input,
       RankedTensorType::get({static_cast<int64_t>(expanded_shape.size())},
                             rewriter.getIntegerType(64));
   auto attr = DenseElementsAttr::get(attr_type, expanded_shape);
-  return rewriter.create<arith::ConstantOp>(output.getLoc(), attr_type, attr);
+  return arith::ConstantOp::create(rewriter, output.getLoc(), attr_type, attr);
 }
 
 Value ExpandedDynamicShape(PatternRewriter& rewriter, Value input,
@@ -3843,9 +3859,9 @@ Value ExpandedDynamicShape(PatternRewriter& rewriter, Value input,
   for (int64_t i : expanded_dimensions) {
     auto index_attr = DenseIntElementsAttr::get(
         RankedTensorType::get({}, rewriter.getI64Type()), {i});
-    Value index = rewriter.create<TF::ConstOp>(output.getLoc(), index_attr);
-    expanded_input = rewriter.create<TF::ExpandDimsOp>(output.getLoc(),
-                                                       expanded_input, index);
+    Value index = TF::ConstOp::create(rewriter, output.getLoc(), index_attr);
+    expanded_input = TF::ExpandDimsOp::create(rewriter, output.getLoc(),
+                                              expanded_input, index);
   }
   return expanded_input;
 }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc
index e5ea3d2ebc5e93..096de88c16055f 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/conv_util.cc
@@ -111,11 +111,11 @@ Value CreatePadOpFromConvPadding(OpBuilder& b, mhlo::ConvolutionOp op) {
   auto padding_value_type = RankedTensorType::get({}, data.ElementType());
   auto padding_value_attr = b.getZeroAttr(padding_value_type);
   auto padding_value_op =
-      b.create<arith::ConstantOp>(op->getLoc(), padding_value_attr);
+      arith::ConstantOp::create(b, op->getLoc(), padding_value_attr);
 
-  auto pad_op = b.create<mhlo::PadOp>(padding_value_op->getLoc(), op.getLhs(),
-                                      padding_value_op, lo_padding_attr,
-                                      hi_padding_attr, interior_padding_attr);
+  auto pad_op = mhlo::PadOp::create(b, padding_value_op->getLoc(), op.getLhs(),
+                                    padding_value_op, lo_padding_attr,
+                                    hi_padding_attr, interior_padding_attr);
 
   return pad_op;
 }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc
index f89f8acd446315..18d9b10d677259 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/custom_call.cc
@@ -82,9 +82,9 @@ LogicalResult ConvertCustomCallOp::matchAndRewrite(
   if (!call_target_name.starts_with("custom_call.")) {
     return failure();
   }
-  auto tfl_custom = rewriter.create<TFL::CustomOp>(
-      mhlo_custom_call.getLoc(), mhlo_custom_call.getResultTypes(),
-      mhlo_custom_call.getInputs());
+  auto tfl_custom = TFL::CustomOp::create(rewriter, mhlo_custom_call.getLoc(),
+                                          mhlo_custom_call.getResultTypes(),
+                                          mhlo_custom_call.getInputs());
   tfl_custom.setCustomCodeAttr(rewriter.getStringAttr(call_target_name));
 
   if (auto bc = mhlo_custom_call.getBackendConfig()) {
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc
index 940c75256b9e75..347817d3cc6d59 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/dot_general.cc
@@ -178,7 +178,8 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
                                       ImplicitLocOpBuilder& builder,
                                       bool is_lhs) {
   auto operand_type = mlir::cast<ShapedType>(operand.getType());
-  auto operand_shape = builder.create<TFL::ShapeOp>(
+  auto operand_shape = TFL::ShapeOp::create(
+      builder,
       RankedTensorType::get(static_cast<int32_t>(operand_type.getRank()),
                             builder.getIntegerType(32)),
       operand);
@@ -197,27 +198,29 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
   }
   auto seg_prod_result_type =
       RankedTensorType::get(static_cast<int32_t>(1), builder.getI32Type());
-  auto out_segids_cst = builder.create<TFL::ConstOp>(
-      builder.getI32TensorAttr(flattened_out_segids));
-  auto contracting_segids_cst = builder.create<TFL::ConstOp>(
-      builder.getI32TensorAttr(flattened_contracting_segids));
-  auto num_segids_tensor =
-      builder.create<TFL::ConstOp>(DenseIntElementsAttr::get(
-          RankedTensorType::get({}, builder.getIntegerType(32)), 1));
-  auto flattened_out_dims = builder.create<TFL::UnsortedSegmentProdOp>(
-      seg_prod_result_type, operand_shape, out_segids_cst, num_segids_tensor);
-  auto flattened_contracting_dims = builder.create<TFL::UnsortedSegmentProdOp>(
-      seg_prod_result_type, operand_shape, contracting_segids_cst,
+  auto out_segids_cst = TFL::ConstOp::create(
+      builder, builder.getI32TensorAttr(flattened_out_segids));
+  auto contracting_segids_cst = TFL::ConstOp::create(
+      builder, builder.getI32TensorAttr(flattened_contracting_segids));
+  auto num_segids_tensor = TFL::ConstOp::create(
+      builder, DenseIntElementsAttr::get(
+                   RankedTensorType::get({}, builder.getIntegerType(32)), 1));
+  auto flattened_out_dims = TFL::UnsortedSegmentProdOp::create(
+      builder, seg_prod_result_type, operand_shape, out_segids_cst,
+      num_segids_tensor);
+  auto flattened_contracting_dims = TFL::UnsortedSegmentProdOp::create(
+      builder, seg_prod_result_type, operand_shape, contracting_segids_cst,
       num_segids_tensor);
   llvm::SmallVector<Value, 3> flattend_shape_values;
   // Gather the batch dimensions.
   if (!dot_dimensions_info.batch_dimensions().AxesArray().empty()) {
     if (ShapedType::isDynamicShape(
             dot_dimensions_info.batch_dimensions().SizesArray())) {
-      auto batch_axes_tensor =
-          builder.create<TFL::ConstOp>(builder.getI64TensorAttr(
-              dot_dimensions_info.batch_dimensions().AxesArray()));
-      auto batch_dims = builder.create<TFL::GatherOp>(
+      auto batch_axes_tensor = TFL::ConstOp::create(
+          builder, builder.getI64TensorAttr(
+                       dot_dimensions_info.batch_dimensions().AxesArray()));
+      auto batch_dims = TFL::GatherOp::create(
+          builder,
           RankedTensorType::get(
               {static_cast<int>(
                   dot_dimensions_info.batch_dimensions().AxesArray().size())},
@@ -230,8 +233,8 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
            dot_dimensions_info.batch_dimensions().SizesArray()) {
         batch_i32_vec.push_back(static_cast<int32_t>(element));
       }
-      auto batch_dims =
-          builder.create<TFL::ConstOp>(builder.getI32TensorAttr(batch_i32_vec));
+      auto batch_dims = TFL::ConstOp::create(
+          builder, builder.getI32TensorAttr(batch_i32_vec));
       flattend_shape_values.push_back(batch_dims);
     }
   }
@@ -247,9 +250,9 @@ Value BuildDotOperandFlattenedShapeOp(Value operand,
       builder.getIntegerType(32));
   // Concatenate the batch dimensions, flattened out dimension and flattened
   // contracting dimension.
-  return builder.create<TFL::ConcatenationOp>(
-      concat_result_type, flattend_shape_values, /*axis*/ 0,
-      /*fused_activation_function*/ "NONE");
+  return TFL::ConcatenationOp::create(builder, concat_result_type,
+                                      flattend_shape_values, /*axis*/ 0,
+                                      /*fused_activation_function*/ "NONE");
 }
 }  // namespace
 
@@ -280,8 +283,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
       lhs_dot_dimensions_info.batch_dimensions().SizesArray(),
       lhs_dot_dimensions_info.out_dimensions().SizesArray(),
       lhs_dot_dimensions_info.contracting_dimensions().SizesArray());
-  auto lhs_transposed = rewriter.create<mhlo::TransposeOp>(
-      loc,
+  auto lhs_transposed = mhlo::TransposeOp::create(
+      rewriter, loc,
       RankedTensorType::get(lhs_transposed_shape, lhs_type.getElementType()),
       lhs,
       DenseIntElementsAttr::get(
@@ -298,8 +301,8 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
       rhs_dot_dimensions_info.batch_dimensions().SizesArray(),
       rhs_dot_dimensions_info.contracting_dimensions().SizesArray(),
       rhs_dot_dimensions_info.out_dimensions().SizesArray());
-  auto rhs_transposed = rewriter.create<mhlo::TransposeOp>(
-      loc,
+  auto rhs_transposed = mhlo::TransposeOp::create(
+      rewriter, loc,
       RankedTensorType::get(rhs_transposed_shape, rhs_type.getElementType()),
       rhs,
       DenseIntElementsAttr::get(
@@ -314,15 +317,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
           lhs_dot_dimensions_info.FlattenedContractingDimensionSize()});
   Value lhs_flattend;
   if (lhs_type.hasStaticShape()) {
-    lhs_flattend = rewriter.create<mhlo::ReshapeOp>(
-        loc,
+    lhs_flattend = mhlo::ReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()),
         lhs_transposed.getResult());
   } else {
     auto lhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp(
         lhs, lhs_dot_dimensions_info, builder, /*is_lhs=*/true);
-    lhs_flattend = rewriter.create<mhlo::DynamicReshapeOp>(
-        loc,
+    lhs_flattend = mhlo::DynamicReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(lhs_flattened_shape, lhs_type.getElementType()),
         lhs_transposed, lhs_flattend_shape_op);
   }
@@ -336,15 +339,15 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
           rhs_dot_dimensions_info.FlattenedOutDimensionSize()});
   Value rhs_flattend;
   if (rhs_type.hasStaticShape()) {
-    rhs_flattend = rewriter.create<mhlo::ReshapeOp>(
-        loc,
+    rhs_flattend = mhlo::ReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()),
         rhs_transposed.getResult());
   } else {
     auto rhs_flattend_shape_op = BuildDotOperandFlattenedShapeOp(
         rhs, rhs_dot_dimensions_info, builder, /*is_lhs=*/false);
-    rhs_flattend = rewriter.create<mhlo::DynamicReshapeOp>(
-        loc,
+    rhs_flattend = mhlo::DynamicReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get(rhs_flattened_shape, rhs_type.getElementType()),
         rhs_transposed, rhs_flattend_shape_op);
   }
@@ -357,44 +360,46 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
                       llvm::ArrayRef<int64_t>{
                           rhs_dot_dimensions_info.FlattenedOutDimensionSize()});
   BoolAttr false_attr = rewriter.getBoolAttr(false);
-  auto matmul = rewriter.create<TFL::BatchMatMulOp>(
-      loc, RankedTensorType::get(matmul_shape, result_type.getElementType()),
+  auto matmul = TFL::BatchMatMulOp::create(
+      rewriter, loc,
+      RankedTensorType::get(matmul_shape, result_type.getElementType()),
       lhs_flattend, rhs_flattend, /*adj_x*/ false_attr, /*adj_y*/ false_attr,
       /*asym_quant_input*/ false_attr);
   if (result_type.hasStaticShape()) {
     auto reshaped =
-        rewriter.create<mhlo::ReshapeOp>(loc, result_type, matmul.getResult());
+        mhlo::ReshapeOp::create(rewriter, loc, result_type, matmul.getResult());
     return reshaped.getResult();
   }
 
   // Reshape for dynamic shaped operands. The result shape is
   // [lhs_batch_dimensions, lhs_out_dimensions, rhs_out_dimensions].
-  auto lhs_shape = rewriter.create<TFL::ShapeOp>(
-      loc,
+  auto lhs_shape = TFL::ShapeOp::create(
+      rewriter, loc,
       RankedTensorType::get(static_cast<int32_t>(lhs_type.getRank()),
                             builder.getIntegerType(32)),
       lhs);
-  auto rhs_shape = rewriter.create<TFL::ShapeOp>(
-      loc,
+  auto rhs_shape = TFL::ShapeOp::create(
+      rewriter, loc,
       RankedTensorType::get(static_cast<int32_t>(rhs_type.getRank()),
                             builder.getIntegerType(32)),
       rhs);
   llvm::SmallVector<int64_t, 4> lhs_batch_and_out =
       Concat<int64_t>(lhs_dot_dimensions_info.batch_dimensions().AxesArray(),
                       lhs_dot_dimensions_info.out_dimensions().AxesArray());
-  auto lhs_batch_and_out_cst = rewriter.create<TFL::ConstOp>(
-      loc, rewriter.getI64TensorAttr(lhs_batch_and_out));
-  auto lhs_batch_and_out_dims = rewriter.create<TFL::GatherOp>(
-      loc,
+  auto lhs_batch_and_out_cst = TFL::ConstOp::create(
+      rewriter, loc, rewriter.getI64TensorAttr(lhs_batch_and_out));
+  auto lhs_batch_and_out_dims = TFL::GatherOp::create(
+      rewriter, loc,
       RankedTensorType::get({static_cast<int>(lhs_batch_and_out.size())},
                             rewriter.getIntegerType(32)),
       lhs_shape, lhs_batch_and_out_cst,
       /*axis*/ 0, /*batch_dims*/ 0);
-  auto rhs_out_cst = rewriter.create<TFL::ConstOp>(
-      loc, rewriter.getI64TensorAttr(
-               rhs_dot_dimensions_info.out_dimensions().AxesArray()));
-  auto rhs_out_dims = rewriter.create<TFL::GatherOp>(
-      loc,
+  auto rhs_out_cst = TFL::ConstOp::create(
+      rewriter, loc,
+      rewriter.getI64TensorAttr(
+          rhs_dot_dimensions_info.out_dimensions().AxesArray()));
+  auto rhs_out_dims = TFL::GatherOp::create(
+      rewriter, loc,
       RankedTensorType::get(
           {static_cast<int32_t>(
               rhs_dot_dimensions_info.out_dimensions().AxesArray().size())},
@@ -407,12 +412,12 @@ Value ConvertDot(PatternRewriter& rewriter, Value lhs, Value rhs,
           lhs_dot_dimensions_info.out_dimensions().AxesArray().size() +
           rhs_dot_dimensions_info.out_dimensions().AxesArray().size())},
       rewriter.getIntegerType(32));
-  auto result_shape = rewriter.create<TFL::ConcatenationOp>(
-      loc, result_shape_type, ValueRange{lhs_batch_and_out_dims, rhs_out_dims},
-      0, "NONE");
+  auto result_shape = TFL::ConcatenationOp::create(
+      rewriter, loc, result_shape_type,
+      ValueRange{lhs_batch_and_out_dims, rhs_out_dims}, 0, "NONE");
 
-  auto reshaped = rewriter.create<mhlo::DynamicReshapeOp>(
-      loc, result_type, matmul.getResult(), result_shape);
+  auto reshaped = mhlo::DynamicReshapeOp::create(
+      rewriter, loc, result_type, matmul.getResult(), result_shape);
   return reshaped.getResult();
 }
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc
index f2d29774c31c89..34b1b60fd1b825 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/fft.cc
@@ -74,9 +74,11 @@ bool IsSupportedRfftOp(mhlo::FftOp fft_op) {
 // concatenate with other dimension sizes.
 Value GetDimensionSizeTensor(OpBuilder& rewriter, Location loc, Value input,
                              int64_t dim) {
-  auto size_scalar = rewriter.create<mhlo::GetDimensionSizeOp>(loc, input, dim);
-  return rewriter.create<mhlo::ReshapeOp>(
-      loc, RankedTensorType::get({1}, rewriter.getI32Type()), size_scalar);
+  auto size_scalar =
+      mhlo::GetDimensionSizeOp::create(rewriter, loc, input, dim);
+  return mhlo::ReshapeOp::create(
+      rewriter, loc, RankedTensorType::get({1}, rewriter.getI32Type()),
+      size_scalar);
 }
 
 // Convert rfft to rfft2d.
@@ -154,13 +156,13 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern<mhlo::FftOp> {
           expanded_input_shape_values.push_back(GetDimensionSizeTensor(
               rewriter, fft_op.getLoc(), fft_operand, i));
         }
-        expanded_input_shape_values.push_back(rewriter.create<mhlo::ConstantOp>(
-            fft_op.getLoc(), rewriter.getI32TensorAttr({1})));
+        expanded_input_shape_values.push_back(mhlo::ConstantOp::create(
+            rewriter, fft_op.getLoc(), rewriter.getI32TensorAttr({1})));
         expanded_input_shape_values.push_back(GetDimensionSizeTensor(
             rewriter, fft_op.getLoc(), fft_operand, input_shape.size() - 1));
 
-        auto expanded_input_shape_tensor = rewriter.create<mhlo::ConcatenateOp>(
-            fft_op.getLoc(),
+        auto expanded_input_shape_tensor = mhlo::ConcatenateOp::create(
+            rewriter, fft_op.getLoc(),
             RankedTensorType::get(
                 {static_cast<int64_t>(expanded_input_shape_values.size())},
                 rewriter.getI32Type()),
@@ -168,12 +170,12 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern<mhlo::FftOp> {
 
         // Create a new mhlo.dynamic_reshape op with the expanded input and
         // expanded input shape. SHAPE tensor is created in the previous step.
-        fft_operand = rewriter.create<mhlo::DynamicReshapeOp>(
-            fft_op.getLoc(), expanded_input_type, fft_operand,
+        fft_operand = mhlo::DynamicReshapeOp::create(
+            rewriter, fft_op.getLoc(), expanded_input_type, fft_operand,
             expanded_input_shape_tensor);
       } else {
-        fft_operand = rewriter.create<mhlo::ReshapeOp>(
-            fft_op.getLoc(), expanded_input_type, fft_operand);
+        fft_operand = mhlo::ReshapeOp::create(rewriter, fft_op.getLoc(),
+                                              expanded_input_type, fft_operand);
       }
 
       SmallVector<int64_t, 6> new_output_shape = {output_shape.begin(),
@@ -186,8 +188,8 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern<mhlo::FftOp> {
     }
 
     auto new_fft =
-        rewriter.create<mhlo::FftOp>(fft_op.getLoc(), output_type, fft_operand,
-                                     fft_op.getFftType(), new_fft_lengths_attr);
+        mhlo::FftOp::create(rewriter, fft_op.getLoc(), output_type, fft_operand,
+                            fft_op.getFftType(), new_fft_lengths_attr);
 
     if (input_shape[input_shape.size() - 2] != 1) {
       // Squeeze the output dimensions back to 2D.
@@ -202,19 +204,20 @@ class ConvertNDFftTo2DFftOp : public OpRewritePattern<mhlo::FftOp> {
             rewriter, fft_op.getLoc(), new_fft.getResult(),
             new_fft.getResult().getType().getShape().size() - 1));
 
-        auto shape_tensor = rewriter.create<mhlo::ConcatenateOp>(
-            fft_op.getLoc(),
+        auto shape_tensor = mhlo::ConcatenateOp::create(
+            rewriter, fft_op.getLoc(),
             RankedTensorType::get(
                 {static_cast<int64_t>(output_shape_values.size())},
                 rewriter.getI32Type()),
             output_shape_values, 0);
-        auto squeeze_op = rewriter.create<mhlo::DynamicReshapeOp>(
-            fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult(),
-            shape_tensor);
+        auto squeeze_op = mhlo::DynamicReshapeOp::create(
+            rewriter, fft_op.getLoc(), fft_op.getResult().getType(),
+            new_fft.getResult(), shape_tensor);
         rewriter.replaceOp(fft_op, squeeze_op.getResult());
       } else {
-        auto squeeze_op = rewriter.create<mhlo::ReshapeOp>(
-            fft_op.getLoc(), fft_op.getResult().getType(), new_fft.getResult());
+        auto squeeze_op = mhlo::ReshapeOp::create(rewriter, fft_op.getLoc(),
+                                                  fft_op.getResult().getType(),
+                                                  new_fft.getResult());
         rewriter.replaceOp(fft_op, squeeze_op.getResult());
       }
     } else {
@@ -256,9 +259,10 @@ class LegalizeRfftOp : public OpConversionPattern<mhlo::FftOp> {
 
     auto output_type = mlir::cast<ShapedType>(fft_op.getResult().getType());
     auto fft_len_const =
-        rewriter.create<arith::ConstantOp>(fft_op.getLoc(), fft_len_f32_attr);
-    auto tfl_rfft2d = rewriter.create<TFL::RFFT2dOp>(
-        fft_op.getLoc(), output_type, fft_op.getOperand(), fft_len_const);
+        arith::ConstantOp::create(rewriter, fft_op.getLoc(), fft_len_f32_attr);
+    auto tfl_rfft2d =
+        TFL::RFFT2dOp::create(rewriter, fft_op.getLoc(), output_type,
+                              fft_op.getOperand(), fft_len_const);
 
     rewriter.replaceOp(fft_op, tfl_rfft2d.getResult());
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc
index 539a9934f75e5a..9833b3415f3059 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/gelu.cc
@@ -119,9 +119,9 @@ LogicalResult LowerGELU::matchAndRewrite(Operation* op,
   if (!HasSplatArg(rhs_mul, kOneOverRoot2, 1)) return failure();
 
   auto is_approx_attr = rewriter.getBoolAttr(false);
-  auto gelu = rewriter.create<TFL::GeluOp>(
-      output_mul.getLoc(), output_mul.getResult().getType(),
-      erf_input->getOperand(0), is_approx_attr);
+  auto gelu = TFL::GeluOp::create(rewriter, output_mul.getLoc(),
+                                  output_mul.getResult().getType(),
+                                  erf_input->getOperand(0), is_approx_attr);
   rewriter.replaceAllOpUsesWith(output_mul, gelu);
   // Note these must be erased in reverse topo order to avoid
   // failing in debug mode.
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc
index e43f342aec2cdc..6b377c0eee933c 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/if.cc
@@ -33,8 +33,8 @@ class LegalizeIfOp : public OpConversionPattern<mhlo::IfOp> {
   LogicalResult matchAndRewrite(
       mhlo::IfOp if_op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    auto new_op = rewriter.create<TFL::IfOp>(
-        if_op.getLoc(), if_op.getResultTypes(), if_op.getPred());
+    auto new_op = TFL::IfOp::create(rewriter, if_op.getLoc(),
+                                    if_op.getResultTypes(), if_op.getPred());
 
     new_op.getThenRegion().takeBody(if_op.getTrueBranch());
     new_op.getElseRegion().takeBody(if_op.getFalseBranch());
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc
index f237a7168e5660..5b5368ac1f5522 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce.cc
@@ -248,8 +248,8 @@ LogicalResult ConvertReduceOpToArgMinMax<
   int64_t axis = reduce_op.getDimensions().getValues<int64_t>()[0];
 
   auto dim_type = RankedTensorType::get({1}, rewriter.getI32Type());
-  auto reduction_indices = rewriter.create<arith::ConstantOp>(
-      reduce_op.getLoc(), dim_type,
+  auto reduction_indices = arith::ConstantOp::create(
+      rewriter, reduce_op.getLoc(), dim_type,
       rewriter.getI32TensorAttr({static_cast<int32_t>(axis)}));
 
   // Generate a Max and an ArgMax of as the mhlo op returns both while in TF
@@ -260,24 +260,24 @@ LogicalResult ConvertReduceOpToArgMinMax<
   if (operand_type.getElementType().isInteger(1)) {
     // TF does not support min or max on boolean (int1) arguments.
     // Use AnyOp for MaxOp and AllOp for MinOp.
-    auto tf_reduce_op = rewriter.create<BooleanReduce>(
-        reduce_op.getLoc(), reduce_op->getResult(0).getType(), operand,
-        reduction_indices,
+    auto tf_reduce_op = BooleanReduce::create(
+        rewriter, reduce_op.getLoc(), reduce_op->getResult(0).getType(),
+        operand, reduction_indices,
         /*keep_dim=*/rewriter.getBoolAttr(false));
-    auto tf_argreduce_op = rewriter.create<ArgReduce>(
-        reduce_op.getLoc(), reduce_op->getResult(1).getType(), operand,
-        reduction_indices);
+    auto tf_argreduce_op = ArgReduce::create(rewriter, reduce_op.getLoc(),
+                                             reduce_op->getResult(1).getType(),
+                                             operand, reduction_indices);
 
     rewriter.replaceOp(reduce_op, {tf_reduce_op, tf_argreduce_op});
   } else {
-    auto tf_reduce_op = rewriter.create<Reduce>(
-        reduce_op.getLoc(), reduce_op->getResult(0).getType(), operand,
-        reduction_indices,
+    auto tf_reduce_op = Reduce::create(
+        rewriter, reduce_op.getLoc(), reduce_op->getResult(0).getType(),
+        operand, reduction_indices,
         /*keep_dim=*/rewriter.getBoolAttr(false));
 
-    auto tf_argreduce_op = rewriter.create<ArgReduce>(
-        reduce_op.getLoc(), reduce_op->getResult(1).getType(), operand,
-        reduction_indices);
+    auto tf_argreduce_op = ArgReduce::create(rewriter, reduce_op.getLoc(),
+                                             reduce_op->getResult(1).getType(),
+                                             operand, reduction_indices);
 
     rewriter.replaceOp(reduce_op, {tf_reduce_op, tf_argreduce_op});
   }
@@ -366,9 +366,10 @@ template <typename ReduceOp, typename BinaryOp, bool BuilderHasFAF = false>
 LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input,
                                        arith::ConstantOp reduction_indices,
                                        ConversionPatternRewriter& rewriter) {
-  Value reduce_result = rewriter.create<ReduceOp>(
-      reduce_op.getLoc(), reduce_op.getType(0), input, reduction_indices,
-      /*keep_dim=*/rewriter.getBoolAttr(false));
+  Value reduce_result =
+      ReduceOp::create(rewriter, reduce_op.getLoc(), reduce_op.getType(0),
+                       input, reduction_indices,
+                       /*keep_dim=*/rewriter.getBoolAttr(false));
 
   if constexpr (BuilderHasFAF) {
     rewriter.replaceOpWithNewOp<BinaryOp>(reduce_op, reduce_result,
@@ -455,7 +456,7 @@ class ConvertReduce : public OpConversionPattern<mhlo::ReduceOp> {
 
     auto tfl_dims = GetDimsAsI32Elements(rewriter, reduce_op);
     auto tfl_dims_op =
-        rewriter.create<arith::ConstantOp>(reduce_op.getLoc(), tfl_dims);
+        arith::ConstantOp::create(rewriter, reduce_op.getLoc(), tfl_dims);
 
     //
     // replace with new reduce op, chaining binary op if needed.
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc
index 4382a9864cac02..c4a3dc62fd58f0 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/reduce_window.cc
@@ -126,7 +126,7 @@ Value TransposeTensor(OpBuilder& b, Value tensor,
   const int64_t perm_size = perm.size();
   auto perm_attr_type = RankedTensorType::get({perm_size}, b.getI64Type());
   auto perm_attr = DenseIntElementsAttr::get(perm_attr_type, perm);
-  return b.create<mhlo::TransposeOp>(tensor.getLoc(), tensor, perm_attr);
+  return mhlo::TransposeOp::create(b, tensor.getLoc(), tensor, perm_attr);
 }
 
 DenseIntElementsAttr BuildDenseI64(OpBuilder& b, ArrayRef<int64_t> shape,
@@ -289,9 +289,10 @@ LogicalResult RelayoutReduceWindow::matchAndRewrite(
 
   // transpose input and build new reduce_window
   auto new_input = TransposeTensor(rewriter, input, perm_for_inputs);
-  auto new_rw = rewriter.create<mhlo::ReduceWindowOp>(
-      op.getLoc(), new_out_type, new_input, init_val, new_window_dims_attr,
-      new_window_strides_attr, BuildDenseI64(rewriter, view.BaseDilations()),
+  auto new_rw = mhlo::ReduceWindowOp::create(
+      rewriter, op.getLoc(), new_out_type, new_input, init_val,
+      new_window_dims_attr, new_window_strides_attr,
+      BuildDenseI64(rewriter, view.BaseDilations()),
       BuildDenseI64(rewriter, view.WindowDilations()), new_paddings_attr);
   IRMapping ir_map;
   op.getBody().cloneInto(&new_rw.getBody(), ir_map);
@@ -412,7 +413,7 @@ LogicalResult LegalizeCumSum::matchAndRewrite(
       RankedTensorType::get({}, rewriter.getI32Type()),
       static_cast<int32_t>(axis));
   auto axis_cst =
-      rewriter.create<arith::ConstantOp>(op->getLoc(), axis_cst_attr);
+      arith::ConstantOp::create(rewriter, op->getLoc(), axis_cst_attr);
 
   auto tfl_exclusive_attr = rewriter.getBoolAttr(false);
   auto tfl_reverse_attr = rewriter.getBoolAttr(false);
@@ -476,7 +477,7 @@ TFL::PadV2Op LegalizeMaxPool::BuildExplicitPadOp(
       llvm::ArrayRef<int64_t>(padding_values));
 
   auto padding_values_op =
-      rewriter.create<arith::ConstantOp>(op.getLoc(), padding_dense_attr);
+      arith::ConstantOp::create(rewriter, op.getLoc(), padding_dense_attr);
 
   llvm::SmallVector<int64_t, 4> pad_output_shape_vector;
   pad_output_shape_vector.push_back(input_type.getDimSize(0));
@@ -489,8 +490,8 @@ TFL::PadV2Op LegalizeMaxPool::BuildExplicitPadOp(
   pad_output_shape_vector.push_back(input_type.getDimSize(3));
   auto pad_output_type = mlir::RankedTensorType::get(
       pad_output_shape_vector, output_type.getElementType());
-  return rewriter.create<TFL::PadV2Op>(op.getLoc(), pad_output_type, input,
-                                       padding_values_op, init);
+  return TFL::PadV2Op::create(rewriter, op.getLoc(), pad_output_type, input,
+                              padding_values_op, init);
 }
 
 LogicalResult LegalizeMaxPool::matchAndRewrite(
@@ -575,13 +576,12 @@ void ReplaceWithAvgPool(mhlo::DivOp op, Value rw_lhs_input,
 
   auto [fh, fw, sh, sw, p, faf] =
       BuildTFLPoolAttrs(rewriter, lhs_view, padding);
-  Value final_op = rewriter.create<TFL::AveragePool2DOp>(
-      op->getLoc(), out_type, rw_lhs_input, fh, fw, p, sh, sw, faf);
+  Value final_op = TFL::AveragePool2DOp::create(
+      rewriter, op->getLoc(), out_type, rw_lhs_input, fh, fw, p, sh, sw, faf);
 
   if (opt_final_tpose) {
-    final_op = rewriter
-                   .create<mhlo::TransposeOp>(final_op.getLoc(), final_op,
-                                              opt_final_tpose.getPermutation())
+    final_op = mhlo::TransposeOp::create(rewriter, final_op.getLoc(), final_op,
+                                         opt_final_tpose.getPermutation())
                    .getResult();
   }
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc
index 87bf7770a20ddf..303c446d536b47 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/scatter.cc
@@ -69,8 +69,8 @@ LogicalResult CanonicalizeScatterUpdates(
   auto permutation_and_shape = GetPermutationAndTransposedShape(
       permutation_array, updates_type, rewriter);
 
-  auto transposed_updates = rewriter.create<mhlo::TransposeOp>(
-      scatter_op->getLoc(), permutation_and_shape.shape, updates,
+  auto transposed_updates = mhlo::TransposeOp::create(
+      rewriter, scatter_op->getLoc(), permutation_and_shape.shape, updates,
       permutation_and_shape.permutation);
 
   updates = transposed_updates;
@@ -163,9 +163,9 @@ LogicalResult ConvertScatterOp<BinaryOp, TfOp>::matchAndRewrite(
       permutation_array, operand_type, rewriter);
 
   Location loc = scatter_op.getLoc();
-  auto transposed_operand = rewriter.create<mhlo::TransposeOp>(
-      loc, permutation_and_shape.shape, operands[0],
-      permutation_and_shape.permutation);
+  auto transposed_operand =
+      mhlo::TransposeOp::create(rewriter, loc, permutation_and_shape.shape,
+                                operands[0], permutation_and_shape.permutation);
 
   Value new_indices = indices;
   int64_t index_depth =
@@ -181,8 +181,8 @@ LogicalResult ConvertScatterOp<BinaryOp, TfOp>::matchAndRewrite(
         builder, rewriter,
         llvm::SmallVector<int64_t>({num_updates, index_depth}),
         rewriter.getI32Type());
-    new_indices = rewriter.create<TF::ReshapeOp>(
-        loc,
+    new_indices = TF::ReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get({num_updates, index_depth},
                               indices_type.getElementType()),
         indices, indices_shape);
@@ -190,8 +190,8 @@ LogicalResult ConvertScatterOp<BinaryOp, TfOp>::matchAndRewrite(
         builder, rewriter,
         llvm::SmallVector<int64_t>({num_updates, updates_type.getDimSize(0)}),
         rewriter.getI32Type());
-    new_updates = rewriter.create<TF::ReshapeOp>(
-        loc,
+    new_updates = TF::ReshapeOp::create(
+        rewriter, loc,
         RankedTensorType::get({1, updates_type.getDimSize(0)},
                               updates_type.getElementType()),
         new_updates, updates_shape);
@@ -200,8 +200,8 @@ LogicalResult ConvertScatterOp<BinaryOp, TfOp>::matchAndRewrite(
   // Apply TF scatter to update the trailing dimensions of the
   // transposed operand.
   auto tf_scatter_op =
-      rewriter.create<TfOp>(loc, permutation_and_shape.shape,
-                            transposed_operand, new_indices, new_updates);
+      TfOp::create(rewriter, loc, permutation_and_shape.shape,
+                   transposed_operand, new_indices, new_updates);
 
   // Reverse the earlier transpose.
   auto inverse_permutation = GetInversePermutation(permutation_array, rewriter);
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc
index e43c0c665ff9db..548951c1ae43e0 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/slice.cc
@@ -46,8 +46,8 @@ Value PackScalarIndices(mlir::ValueRange indices, OpBuilder& b) {
   auto values_count_attr = b.getI32IntegerAttr(num_indices);
   auto pack_axis_attr = b.getI32IntegerAttr(0);
 
-  return b.create<TFL::PackOp>(indices.back().getLoc(), packed_indices_type,
-                               indices, values_count_attr, pack_axis_attr);
+  return TFL::PackOp::create(b, indices.back().getLoc(), packed_indices_type,
+                             indices, values_count_attr, pack_axis_attr);
 }
 
 //===----------------------------------------------------------------------===//
@@ -56,8 +56,8 @@ Value PackScalarIndices(mlir::ValueRange indices, OpBuilder& b) {
 
 // Cast the value to i32.
 Value BuildTFLCastOp(OpBuilder& b, Value value) {
-  return b.create<TFL::CastOp>(
-      value.getLoc(),
+  return TFL::CastOp::create(
+      b, value.getLoc(),
       RankedTensorType::get(llvm::cast<ShapedType>(value.getType()).getShape(),
                             b.getI32Type()),
       value);
@@ -70,12 +70,12 @@ class LegalizeSliceOp : public OpConversionPattern<mhlo::SliceOp> {
   LogicalResult matchAndRewrite(
       mhlo::SliceOp slice_op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    auto begin = rewriter.create<arith::ConstantOp>(slice_op.getLoc(),
-                                                    slice_op.getStartIndices());
-    auto end = rewriter.create<arith::ConstantOp>(slice_op.getLoc(),
-                                                  slice_op.getLimitIndices());
-    auto strides = rewriter.create<arith::ConstantOp>(slice_op.getLoc(),
-                                                      slice_op.getStrides());
+    auto begin = arith::ConstantOp::create(rewriter, slice_op.getLoc(),
+                                           slice_op.getStartIndices());
+    auto end = arith::ConstantOp::create(rewriter, slice_op.getLoc(),
+                                         slice_op.getLimitIndices());
+    auto strides = arith::ConstantOp::create(rewriter, slice_op.getLoc(),
+                                             slice_op.getStrides());
     auto zero = rewriter.getIntegerAttr(rewriter.getI32Type(), 0);
     auto no_offset = rewriter.getBoolAttr(false);
 
@@ -116,8 +116,8 @@ LogicalResult CastSliceIndicesToSignless::matchAndRewrite(
 
   llvm::SmallVector<Value> casted_start_inds;
   for (auto start_ind_opr : op.getStartIndices()) {
-    auto casted_start_ind_opr = rewriter.create<mhlo::ConvertOp>(
-        start_ind_opr.getLoc(), start_ind_opr, new_start_e_type);
+    auto casted_start_ind_opr = mhlo::ConvertOp::create(
+        rewriter, start_ind_opr.getLoc(), start_ind_opr, new_start_e_type);
     casted_start_inds.push_back(casted_start_ind_opr.getResult());
   }
 
@@ -161,8 +161,8 @@ LogicalResult LegalizeDynamicSliceOp::matchAndRewrite(
   // clamp start indices between zero and shape(operand) - slice_sizes
   //=-----
 
-  Value clamp_left_cst = rewriter.create<arith::ConstantOp>(
-      op->getLoc(), rewriter.getZeroAttr(start_type));
+  Value clamp_left_cst = arith::ConstantOp::create(
+      rewriter, op->getLoc(), rewriter.getZeroAttr(start_type));
 
   llvm::SmallVector<Value> new_start_indices;
   const auto stride_sizes = UnrollI64Splat(op.getSliceSizes());
@@ -170,15 +170,15 @@ LogicalResult LegalizeDynamicSliceOp::matchAndRewrite(
   for (auto [dim_size, start_ind_opr, stride_size] :
        llvm::zip(input_type.getShape(), op.getStartIndices(), stride_sizes)) {
     const int64_t clamp_right_val = dim_size - stride_size;
-    auto clamp_right_cst = rewriter.create<arith::ConstantOp>(
-        op->getLoc(),
+    auto clamp_right_cst = arith::ConstantOp::create(
+        rewriter, op->getLoc(),
         DenseElementsAttr::get(start_type, rewriter.getIntegerAttr(
                                                start_e_type, clamp_right_val)));
 
-    Value new_start_ind = rewriter.create<TFL::MaximumOp>(
-        op->getLoc(), start_type, clamp_left_cst, start_ind_opr);
-    new_start_ind = rewriter.create<TFL::MinimumOp>(
-        op->getLoc(), start_type, clamp_right_cst, new_start_ind);
+    Value new_start_ind = TFL::MaximumOp::create(
+        rewriter, op->getLoc(), start_type, clamp_left_cst, start_ind_opr);
+    new_start_ind = TFL::MinimumOp::create(rewriter, op->getLoc(), start_type,
+                                           clamp_right_cst, new_start_ind);
 
     new_start_indices.push_back(new_start_ind);
   }
@@ -190,7 +190,7 @@ LogicalResult LegalizeDynamicSliceOp::matchAndRewrite(
   auto packed_indices = PackScalarIndices(new_start_indices, rewriter);
 
   auto slice_sizes_cst =
-      rewriter.create<arith::ConstantOp>(op->getLoc(), op.getSliceSizes());
+      arith::ConstantOp::create(rewriter, op->getLoc(), op.getSliceSizes());
 
   rewriter.replaceOpWithNewOp<TFL::SliceOp>(op, op.getType(), op.getOperand(),
                                             packed_indices, slice_sizes_cst);
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
index a64012415729e4..6dcf03b1600244 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.cc
@@ -70,7 +70,7 @@ Value BuildIntConstOp(ImplicitLocOpBuilder& builder,
                       ConversionPatternRewriter& rewriter, int64_t const_value,
                       Type type) {
   Value result_const =
-      builder.create<TF::ConstOp>(rewriter.getIntegerAttr(type, const_value));
+      TF::ConstOp::create(builder, rewriter.getIntegerAttr(type, const_value));
   return result_const;
 }
 
@@ -115,8 +115,8 @@ LogicalResult NormalizeIndexVector(Operation* parent_op, Value& indices,
     new_start_indices_shape.push_back(1);
     indices_type = RankedTensorType::get(new_start_indices_shape,
                                          indices_type.getElementType());
-    indices = rewriter.create<mhlo::ReshapeOp>(parent_op->getLoc(),
-                                               indices_type, indices);
+    indices = mhlo::ReshapeOp::create(rewriter, parent_op->getLoc(),
+                                      indices_type, indices);
   } else if (index_vector_dim != indices_type.getRank() - 1) {
     // If index_vector_dim isn't the last dimension in indices then it isn't
     // supported yet.
@@ -197,8 +197,8 @@ Value InsertTranspose(Value value, int batch_dim, int feature_dim,
                                     default_batch_dim, default_feature_dim,
                                     default_spatial_dim_start, num_spatial_dims,
                                     type, rewriter);
-  return rewriter.create<mhlo::TransposeOp>(value.getLoc(), type, value,
-                                            permutation);
+  return mhlo::TransposeOp::create(rewriter, value.getLoc(), type, value,
+                                   permutation);
 }
 
 Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) {
@@ -206,10 +206,10 @@ Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) {
   if (auto shaped_type = mlir::dyn_cast<RankedTensorType>(val.getType())) {
     ShapedType new_type =
         RankedTensorType::get(shaped_type.getShape(), new_ele_type);
-    return rewriter.create<TFL::CastOp>(loc, new_type, val);
+    return TFL::CastOp::create(rewriter, loc, new_type, val);
   }
-  return rewriter.create<TFL::CastOp>(
-      loc, UnrankedTensorType::get(new_ele_type), val);
+  return TFL::CastOp::create(rewriter, loc,
+                             UnrankedTensorType::get(new_ele_type), val);
 }
 
 // Replaces `region`'s terminator to TFL::Yield.
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
index c72fce3ffc6a84..1bf33c1d0d993e 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/util.h
@@ -75,7 +75,7 @@ Value BuildIntArrayConstOp(ImplicitLocOpBuilder& builder,
     }
     const_value_raw = rewriter.getI32TensorAttr(const_i32_vec);
   }
-  Value result_const = builder.create<ConstOpT>(const_value_raw);
+  Value result_const = ConstOpT::create(builder, const_value_raw);
   return result_const;
 }
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc
index c2323b63b9370c..0de2ccafedbe16 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_conversions/while.cc
@@ -51,9 +51,10 @@ class LeagalizeWhileOp : public OpConversionPattern<mhlo::WhileOp> {
     // currently doesn't support stateless, so this
     // parameters are set to the default values.
     auto is_stateless = rewriter.getBoolAttr(false);
-    auto new_while = rewriter.create<TFL::WhileOp>(
-        while_op.getLoc(), while_op->getResultTypes(), while_op->getOperands(),
-        /*is_stateless=*/is_stateless);
+    auto new_while = TFL::WhileOp::create(rewriter, while_op.getLoc(),
+                                          while_op->getResultTypes(),
+                                          while_op->getOperands(),
+                                          /*is_stateless=*/is_stateless);
     new_while.getCond().takeBody(while_op.getCond());
     new_while.getBody().takeBody(while_op.getBody());
     TFLReplaceReturnOp(new_while.getCond(), rewriter);
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc
index 113293596536c9..c7f88bb2ebeebc 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_custom_call_to_composite.cc
@@ -69,9 +69,9 @@ struct ReplaceCustomCallWithComposite final
 
     auto decomposition = mlir::cast<FlatSymbolRefAttr>(calledComputations[0]);
 
-    auto composite = rewriter.create<mlir::stablehlo::CompositeOp>(
-        op.getLoc(), op.getResultTypes(), op.getOperands(), name.str(), attrs,
-        decomposition.getValue());
+    auto composite = mlir::stablehlo::CompositeOp::create(
+        rewriter, op.getLoc(), op.getResultTypes(), op.getOperands(),
+        name.str(), attrs, decomposition.getValue());
     rewriter.replaceOp(op, composite.getResults());
     return success();
   }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc
index 704dbf37d680dd..836598d19a7516 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_stablehlo_to_vhlo.cc
@@ -59,7 +59,7 @@ namespace {
 
 Value MaterializeIllegalCast(OpBuilder &builder, Type type,
                                             ValueRange inputs, Location loc) {
-  return builder.create<UnrealizedConversionCastOp>(loc, type, inputs)
+  return UnrealizedConversionCastOp::create(builder, loc, type, inputs)
       ->getResult(0);
 }
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc
index 78da8b153f47fc..614bd070748267 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc
@@ -177,8 +177,8 @@ class ConvertTFXlaCallModuleOp : public OpRewritePattern<TF::XlaCallModuleOp> {
 
     SmallVector<Value, 4> call_op_operands(op.getOperands());
     if (ContainsPlatformIndexArg(op)) {
-      Value dummy_const = rewriter.create<TF::ConstOp>(
-          op.getLoc(),
+      Value dummy_const = TF::ConstOp::create(
+          rewriter, op.getLoc(),
           DenseIntElementsAttr::get(
               RankedTensorType::get({}, rewriter.getIntegerType(32)), {0}));
       call_op_operands.insert(call_op_operands.begin(), dummy_const);
@@ -196,16 +196,16 @@ class ConvertTFXlaCallModuleOp : public OpRewritePattern<TF::XlaCallModuleOp> {
       Value operand = std::get<0>(operand_and_type);
       Type expected_type = std::get<1>(operand_and_type);
       if (operand.getType() != expected_type) {
-        operand = rewriter.create<TF::CastOp>(
-            op.getLoc(), expected_type, operand,
-            /*Truncate=*/rewriter.getBoolAttr(false));
+        operand =
+            TF::CastOp::create(rewriter, op.getLoc(), expected_type, operand,
+                               /*Truncate=*/rewriter.getBoolAttr(false));
       }
       casted_operands.push_back(operand);
     }
 
-    auto call = rewriter.create<func::CallOp>(
-        op->getLoc(), main_fn.getSymName(), main_fn.getResultTypes(),
-        casted_operands);
+    auto call =
+        func::CallOp::create(rewriter, op->getLoc(), main_fn.getSymName(),
+                             main_fn.getResultTypes(), casted_operands);
     rewriter.replaceOp(op, call->getResults());
 
     return success();
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc
index 282c44a958c27f..1effffd9aa00e3 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/optimize.cc
@@ -269,9 +269,9 @@ LogicalResult LiftDotConcatLHS(mhlo::ConcatenateOp concat,
         mlir::dyn_cast<ShapedType>(v.getType()).getShape()[new_concat_dim];
   }
 
-  auto new_concat = rewriter.create<mhlo::ConcatenateOp>(
-      concat->getLoc(), concat.getType().clone(new_concat_shape), all_dot_lhs,
-      rewriter.getI64IntegerAttr(new_concat_dim));
+  auto new_concat = mhlo::ConcatenateOp::create(
+      rewriter, concat->getLoc(), concat.getType().clone(new_concat_shape),
+      all_dot_lhs, rewriter.getI64IntegerAttr(new_concat_dim));
   rewriter.replaceOpWithNewOp<mhlo::DotGeneralOp>(
       concat, concat.getType(), new_concat, first_dot.getRhs(),
       first_dot.getDotDimensionNumbers(), first_dot.getPrecisionConfigAttr(),
@@ -368,11 +368,11 @@ LogicalResult LiftDotConcatLHSAndRHS(mhlo::ConcatenateOp concat,
         mlir::dyn_cast<ShapedType>(v.getType()).getShape()[rhs_batch_dim];
   }
 
-  auto lhs_new_concat = rewriter.create<mhlo::ConcatenateOp>(
-      concat->getLoc(), concat.getType().clone(lhs_new_concat_shape),
+  auto lhs_new_concat = mhlo::ConcatenateOp::create(
+      rewriter, concat->getLoc(), concat.getType().clone(lhs_new_concat_shape),
       all_dot_lhs, rewriter.getI64IntegerAttr(lhs_batch_dim));
-  auto rhs_new_concat = rewriter.create<mhlo::ConcatenateOp>(
-      concat->getLoc(), concat.getType().clone(rhs_new_concat_shape),
+  auto rhs_new_concat = mhlo::ConcatenateOp::create(
+      rewriter, concat->getLoc(), concat.getType().clone(rhs_new_concat_shape),
       all_dot_rhs, rewriter.getI64IntegerAttr(rhs_batch_dim));
   rewriter.replaceOpWithNewOp<mhlo::DotGeneralOp>(
       concat, concat.getType(), lhs_new_concat, rhs_new_concat,
@@ -439,7 +439,8 @@ LogicalResult FuseSliceConcat(mhlo::ConcatenateOp concat,
     new_slice_shape.push_back(second_limit - first_start);
   }
 
-  auto new_slice = rewriter.create<mhlo::SliceOp>(
+  auto new_slice = mhlo::SliceOp::create(
+      rewriter,
       FusedLoc::get(first->getContext(), {first.getLoc(), second.getLoc()}),
       first.getType().clone(new_slice_shape), first.getOperand(),
       /*start_indices=*/rewriter.getI64TensorAttr(new_start),
@@ -730,8 +731,8 @@ class SimplifyBroadcastInDimsReshape
 
     auto new_broadcast_input_type = RankedTensorType::get(
         new_broadcast_input_shape, broadcast_type.getElementType());
-    auto new_broadcast_input = rewriter.create<mhlo::ReshapeOp>(
-        op->getLoc(), new_broadcast_input_type, op.getOperand());
+    auto new_broadcast_input = mhlo::ReshapeOp::create(
+        rewriter, op->getLoc(), new_broadcast_input_type, op.getOperand());
     auto new_broadcast_dims_attr =
         rewriter.getI64TensorAttr(new_broadcast_dims);
 
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc
index 249a1018e091f4..13f981c8714f46 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/smuggle_disallowed_ops.cc
@@ -44,8 +44,8 @@ LogicalResult SmuggleOp(Operation* op, PatternRewriter& rewriter) {
       rewriter.getNamedAttr("call_target_name", op->getName().getIdentifier());
   SmallVector<NamedAttribute> attrs{op->getAttrs()};
   attrs.push_back(call_target);
-  auto custom_call = rewriter.create<mlir::stablehlo::CustomCallOp>(
-      op->getLoc(), op->getResultTypes(), op->getOperands(), attrs);
+  auto custom_call = mlir::stablehlo::CustomCallOp::create(
+      rewriter, op->getLoc(), op->getResultTypes(), op->getOperands(), attrs);
   rewriter.replaceOp(op, custom_call.getResults());
   return success();
 }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc
index fcecd557aeab1c..557b721bfaf35f 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_fuse_convolution_pass.cc
@@ -143,13 +143,15 @@ class FuseStablehloMulAndConvolutionPattern
       broadcast_dims =
           DenseI64ArrayAttr::get(rewriter.getContext(), {filter_rank - 1});
     }
-    Value broadcast_multiplier = rewriter.create<stablehlo::BroadcastInDimOp>(
-        mul_op.getLoc(), filter.getType(), multiplier, broadcast_dims);
-    Value new_filter = rewriter.create<stablehlo::MulOp>(
-        mul_op.getLoc(), filter.getType(), filter, broadcast_multiplier);
-    Value new_conv = rewriter.create<stablehlo::ConvolutionOp>(
-        mul_op.getLoc(), conv_op.getType(), conv_op.getLhs(), new_filter,
-        conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(),
+    Value broadcast_multiplier = stablehlo::BroadcastInDimOp::create(
+        rewriter, mul_op.getLoc(), filter.getType(), multiplier,
+        broadcast_dims);
+    Value new_filter =
+        stablehlo::MulOp::create(rewriter, mul_op.getLoc(), filter.getType(),
+                                 filter, broadcast_multiplier);
+    Value new_conv = stablehlo::ConvolutionOp::create(
+        rewriter, mul_op.getLoc(), conv_op.getType(), conv_op.getLhs(),
+        new_filter, conv_op.getWindowStridesAttr(), conv_op.getPaddingAttr(),
         conv_op.getLhsDilationAttr(), conv_op.getRhsDilationAttr(),
         conv_op.getWindowReversalAttr(), conv_op.getDimensionNumbers(),
         conv_op.getFeatureGroupCount(), conv_op.getBatchGroupCount(),
@@ -169,8 +171,8 @@ class FuseStablehloMulAndConvolutionPattern
               conv_op) {
         return failure();
       }
-      Value new_shape_of = rewriter.create<shape::ShapeOfOp>(
-          mul_op.getLoc(), shape_of_op.getType(), new_conv);
+      Value new_shape_of = shape::ShapeOfOp::create(
+          rewriter, mul_op.getLoc(), shape_of_op.getType(), new_conv);
       shape_of_op.replaceAllUsesWith(new_shape_of);
       rewriter.replaceOp(mul_op, {new_conv});
     }
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc
index 0c43a5c4047a64..b283dea3098232 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tflite_legalize_hlo.cc
@@ -69,7 +69,7 @@ arith::ConstantOp ShapeToConst(PatternRewriter& rewriter, Value value) {
   auto attr_type = RankedTensorType::get({static_cast<int64_t>(shape.size())},
                                          rewriter.getIntegerType(64));
   auto attr = DenseElementsAttr::get(attr_type, shape);
-  return rewriter.create<arith::ConstantOp>(value.getLoc(), attr_type, attr);
+  return arith::ConstantOp::create(rewriter, value.getLoc(), attr_type, attr);
 }
 
 // Returns true if broadcast_dimensions obey Tensorflow convention, as in new
@@ -107,7 +107,7 @@ arith::ConstantOp ExpandedShape(OpBuilder& b, Value input,
   auto attr_type = RankedTensorType::get(
       {static_cast<int64_t>(expanded_shape.size())}, b.getIntegerType(32));
   auto attr = DenseElementsAttr::get(attr_type, expanded_shape);
-  return b.create<arith::ConstantOp>(output.getLoc(), attr_type, attr);
+  return arith::ConstantOp::create(b, output.getLoc(), attr_type, attr);
 }
 
 Value ExpandedDynamicShape(OpBuilder& b, Value input,
@@ -132,7 +132,7 @@ Value ExpandedDynamicShape(OpBuilder& b, Value input,
   for (int64_t i : expanded_dimensions) {
     auto index_attr = DenseIntElementsAttr::get(
         RankedTensorType::get({}, b.getI64Type()), {i});
-    Value index = b.create<arith::ConstantOp>(output.getLoc(), index_attr);
+    Value index = arith::ConstantOp::create(b, output.getLoc(), index_attr);
 
     auto cur_type = llvm::cast<ShapedType>(expanded_input.getType());
     auto cur_shape = cur_type.getShape();
@@ -145,8 +145,8 @@ Value ExpandedDynamicShape(OpBuilder& b, Value input,
 
     auto new_type = RankedTensorType::get(new_shape, cur_type.getElementType());
 
-    expanded_input = b.create<TFL::ExpandDimsOp>(output.getLoc(), new_type,
-                                                 expanded_input, index);
+    expanded_input = TFL::ExpandDimsOp::create(b, output.getLoc(), new_type,
+                                               expanded_input, index);
   }
 
   return expanded_input;
diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc
index b0a023494f1ca4..b5aded528cdc25 100644
--- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc
+++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/unfold_splat_constant_pass.cc
@@ -74,18 +74,17 @@ class UnfoldSplatConstantPass
       return;
     }
     op_builder->setInsertionPoint(const_op);
-    Value scalar = op_builder->create<mhlo::ConstantOp>(
-        const_op->getLoc(),
+    Value scalar = mhlo::ConstantOp::create(
+        *op_builder, const_op->getLoc(),
         DenseElementsAttr::get(
             RankedTensorType::get(/*shape=*/{}, element_type),
             splat_elements_attr.getSplatValue<Attribute>()));
     auto broadcast_dims = DenseIntElementsAttr::get(
         RankedTensorType::get(/*shape=*/{0}, op_builder->getI64Type()),
         llvm::SmallVector<int64_t>{});
-    mhlo::BroadcastInDimOp broadcast_in_dim_op =
-        op_builder->create<mhlo::BroadcastInDimOp>(
-            const_op->getLoc(), splat_elements_attr.getType(), scalar,
-            broadcast_dims);
+    mhlo::BroadcastInDimOp broadcast_in_dim_op = mhlo::BroadcastInDimOp::create(
+        *op_builder, const_op->getLoc(), splat_elements_attr.getType(), scalar,
+        broadcast_dims);
     const_op->replaceAllUsesWith(broadcast_in_dim_op);
     const_op->erase();
   }
diff --git a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc
index 6b92b5f63ee66f..e04be6148b7b1d 100644
--- a/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/decompose_hybrid_quantization.cc
@@ -93,7 +93,7 @@ class DequantizeConverter : public OpRewritePattern<SrcOp> {
       if (QuantizedType::getQuantizedElementType(operand.getType())) {
         auto newTy = QuantizedType::castToExpressedType(operand.getType());
         newOperands.push_back(
-            rewriter.create<TFL::DequantizeOp>(loc, newTy, operand));
+            TFL::DequantizeOp::create(rewriter, loc, newTy, operand));
         continue;
       }
 
@@ -109,9 +109,8 @@ class DequantizeConverter : public OpRewritePattern<SrcOp> {
       newResultTys.push_back(resultTy);
     }
 
-    auto newResults = rewriter
-                          .create<SrcOp>(loc, newResultTys, newOperands,
-                                         op->getAttrDictionary().getValue())
+    auto newResults = SrcOp::create(rewriter, loc, newResultTys, newOperands,
+                                    op->getAttrDictionary().getValue())
                           .getOperation()
                           ->getResults();
 
@@ -120,8 +119,8 @@ class DequantizeConverter : public OpRewritePattern<SrcOp> {
       Value result = newResults[i];
       Type resultTy = op->getOpResult(i).getType();
       if (QuantizedType::getQuantizedElementType(resultTy)) {
-        replaceResults.push_back(rewriter.create<TFL::QuantizeOp>(
-            loc, resultTy, result, TypeAttr::get(resultTy)));
+        replaceResults.push_back(TFL::QuantizeOp::create(
+            rewriter, loc, resultTy, result, TypeAttr::get(resultTy)));
         continue;
       }
 
diff --git a/tensorflow/compiler/mlir/lite/transforms/if_outline.cc b/tensorflow/compiler/mlir/lite/transforms/if_outline.cc
index 0e7c03dd32b35f..c45d5f74b8988d 100644
--- a/tensorflow/compiler/mlir/lite/transforms/if_outline.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/if_outline.cc
@@ -84,7 +84,7 @@ func::FuncOp CreateOutlineFuncAndEraseRegion(
   type = FunctionType::get(context, types, result_types);
 
   // Create outlined function and move region body to it.
-  auto outlined_func = func_builder.create<func::FuncOp>(loc, name, type);
+  auto outlined_func = func::FuncOp::create(func_builder, loc, name, type);
   outlined_func.getBody().takeBody(region);
   Region& func_region = outlined_func.getBody();
 
@@ -97,8 +97,8 @@ func::FuncOp CreateOutlineFuncAndEraseRegion(
   // Replace yield op with return.
   Operation* yield_op = outlined_func.getBody().front().getTerminator();
   OpBuilder return_builder(yield_op);
-  return_builder.create<func::ReturnOp>(yield_op->getLoc(),
-                                        yield_op->getOperands());
+  func::ReturnOp::create(return_builder, yield_op->getLoc(),
+                         yield_op->getOperands());
   yield_op->erase();
 
   SymbolTable(region.getParentOfType<ModuleOp>()).insert(outlined_func);
@@ -121,8 +121,8 @@ void ReplaceRegionWithCall(StringRef name, Region& region,
     new_operands.push_back(block->addArgument(t, loc));
   }
   new_operands.append(extern_values.begin(), extern_values.end());
-  auto call = b.create<func::CallOp>(loc, func, new_operands);
-  b.create<YieldOp>(loc, call.getResults());
+  auto call = func::CallOp::create(b, loc, func, new_operands);
+  YieldOp::create(b, loc, call.getResults());
 }
 
 void IfOutlinePass::OutlineIf(IfOp if_op) {
diff --git a/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc b/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc
index 4e1fe8e012211a..7a85d60b51d6eb 100644
--- a/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/insert_call_once_op.cc
@@ -51,7 +51,7 @@ void InsertCallOnceOpFromSessionInitializerPass::runOnOperation() {
 
       OpBuilder builder(func.getContext());
       builder.setInsertionPointToStart(&func.getBlocks().front());
-      builder.create<TFL::CallOnceOp>(func.getLoc(), init_func_op.getName());
+      TFL::CallOnceOp::create(builder, func.getLoc(), init_func_op.getName());
     }
   }
 }
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc
index bfeea6d6e6373a..668493eca931e7 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize_batch_matmul_pass.cc
@@ -135,8 +135,8 @@ struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs
       // mapped to X and Z dimension.
       std::iter_swap(permute.begin() + input_rank - 1,
                      permute.begin() + input_rank - 2);
-      auto permutation_tensor_op = rewriter.create<arith::ConstantOp>(
-          bmm_op->getLoc(), permuation_tensor_type,
+      auto permutation_tensor_op = arith::ConstantOp::create(
+          rewriter, bmm_op->getLoc(), permuation_tensor_type,
           DenseElementsAttr::get(permuation_tensor_type, permute));
 
       auto input_shape = input_type.getShape();
@@ -181,9 +181,8 @@ struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs
             RankedTensorType::get(permuted_shape, input_type.getElementType());
       }
 
-      return rewriter.create<TFL::TransposeOp>(
-          bmm_op->getLoc(), output_type, input,
-          permutation_tensor_op.getResult());
+      return TFL::TransposeOp::create(rewriter, bmm_op->getLoc(), output_type,
+                                      input, permutation_tensor_op.getResult());
     };
 
     Value input_lhs = bmm_op.getX();
@@ -198,10 +197,11 @@ struct ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs
         !bmm_op.getAdjY() ? create_z_x_transpose_op(input_rhs) : input_rhs;
 
     Type output_type = bmm_op.getResult().getType();
-    auto no_input = rewriter.create<TFL::NoValueOp>(
-        bmm_op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr());
-    auto fc_op = rewriter.create<TFL::FullyConnectedOp>(
-        bmm_op->getLoc(), ArrayRef<Type>{output_type},
+    auto no_input =
+        TFL::NoValueOp::create(rewriter, bmm_op->getLoc(),
+                               rewriter.getNoneType(), rewriter.getUnitAttr());
+    auto fc_op = TFL::FullyConnectedOp::create(
+        rewriter, bmm_op->getLoc(), ArrayRef<Type>{output_type},
         /*input=*/output_lhs, /*filter=*/output_rhs, /*bias=*/no_input,
         /*fused_activation_function=*/rewriter.getStringAttr("NONE"),
         /*weights_format=*/rewriter.getStringAttr("DEFAULT"),
@@ -257,13 +257,14 @@ struct ConvertBatchMatMulOpToReduceSum
       cY = rhs_shape.size() - 1;
     }
 
-    auto reduce_dim_op = rewriter.create<TFL::ConstOp>(
-        bmm_op->getLoc(),
+    auto reduce_dim_op = TFL::ConstOp::create(
+        rewriter, bmm_op->getLoc(),
         DenseIntElementsAttr::get(
             RankedTensorType::get({1}, rewriter.getI32Type()), {cY}));
-    auto sum_op = rewriter.create<TFL::SumOp>(
-        bmm_op->getLoc(), bmm_op.getType(), bmm_op.getY(), reduce_dim_op,
-        /*keep_dims=*/rewriter.getBoolAttr(true));
+    auto sum_op =
+        TFL::SumOp::create(rewriter, bmm_op->getLoc(), bmm_op.getType(),
+                           bmm_op.getY(), reduce_dim_op,
+                           /*keep_dims=*/rewriter.getBoolAttr(true));
     rewriter.replaceOp(bmm_op, sum_op);
     return success();
   };
@@ -368,19 +369,21 @@ struct FuseRhsTransposeIntoBatchMatMulOp
     new_reshape_input_shape.push_back(
         rhs_contracting_dimensions.SizesArray().front());
 
-    Value new_reshape_shape_value = rewriter.create<arith::ConstantOp>(
-        bmm_op->getLoc(),
+    Value new_reshape_shape_value = arith::ConstantOp::create(
+        rewriter, bmm_op->getLoc(),
         GetI32ElementsAttr(new_reshape_input_shape, &rewriter));
-    auto new_reshape_value = rewriter.create<TFL::ReshapeOp>(
-        bmm_op->getLoc(), transpose_op.getInput(), new_reshape_shape_value);
+    auto new_reshape_value = TFL::ReshapeOp::create(rewriter, bmm_op->getLoc(),
+                                                    transpose_op.getInput(),
+                                                    new_reshape_shape_value);
 
     // Replace the BatchMatMulOp with a FullyConnectedOp, if the RHS of BMM has
     // no broadcasting dimensions. I.e. RHS of BMM is of Rank 2.
     if (rhs_dimensions_info.batch_dimensions().AxesArray().empty()) {
-      auto no_input = rewriter.create<TFL::NoValueOp>(
-          bmm_op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr());
-      auto fc_op = rewriter.create<TFL::FullyConnectedOp>(
-          bmm_op->getLoc(), ArrayRef<Type>{bmm_op.getType()},
+      auto no_input = TFL::NoValueOp::create(rewriter, bmm_op->getLoc(),
+                                             rewriter.getNoneType(),
+                                             rewriter.getUnitAttr());
+      auto fc_op = TFL::FullyConnectedOp::create(
+          rewriter, bmm_op->getLoc(), ArrayRef<Type>{bmm_op.getType()},
           /*input=*/bmm_op.getX(), /*filter=*/new_reshape_value,
           /*bias=*/no_input,
           /*fused_activation_function=*/rewriter.getStringAttr("NONE"),
@@ -391,9 +394,10 @@ struct FuseRhsTransposeIntoBatchMatMulOp
     } else {
       // Replace the BatchMatMulOp with a BatchMatMulOp with adj_y = true and
       // transpose fused into RHS.
-      auto bmm_op_with_adj_y = rewriter.create<TFL::BatchMatMulOp>(
-          bmm_op->getLoc(), bmm_op.getType(), bmm_op.getX(), new_reshape_value,
-          bmm_op.getAdjX(), /*adj_y=*/true, mlir::BoolAttr());
+      auto bmm_op_with_adj_y = TFL::BatchMatMulOp::create(
+          rewriter, bmm_op->getLoc(), bmm_op.getType(), bmm_op.getX(),
+          new_reshape_value, bmm_op.getAdjX(), /*adj_y=*/true,
+          mlir::BoolAttr());
       rewriter.replaceOp(bmm_op, {bmm_op_with_adj_y.getResult()});
     }
 
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc
index aed2946db17ba3..21b1963998d0d5 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize_broadcast_like_pass.cc
@@ -131,8 +131,9 @@ Value PrepareBroadcastLikeOpInput(Operation* op, PatternRewriter& rewriter) {
         RankedTensorType::get({}, elements_attr.getType().getElementType()),
         elements_attr.getSplatValue<mlir::Attribute>());
 
-    return rewriter.create<arith::ConstantOp>(
-        op->getLoc(), scalar_elements_attr.getType(), scalar_elements_attr);
+    return arith::ConstantOp::create(rewriter, op->getLoc(),
+                                     scalar_elements_attr.getType(),
+                                     scalar_elements_attr);
   }
   return nullptr;
 }
@@ -380,10 +381,10 @@ LogicalResult ReorderBroadcastToCast::matchAndRewrite(
           : static_cast<TensorType>(UnrankedTensorType::get(
                 old_cast_op_output_type.getElementType()));
 
-  auto new_cast_op = rewriter.create<TFL::CastOp>(
-      fused_loc, new_cast_op_output_type, input_value);
-  auto new_broadcast_to_op = rewriter.create<TFL::BroadcastToOp>(
-      fused_loc, old_cast_op_output_type, new_cast_op.getOutput(),
+  auto new_cast_op = TFL::CastOp::create(rewriter, fused_loc,
+                                         new_cast_op_output_type, input_value);
+  auto new_broadcast_to_op = TFL::BroadcastToOp::create(
+      rewriter, fused_loc, old_cast_op_output_type, new_cast_op.getOutput(),
       broadcast_to_op.getShape());
 
   rewriter.replaceOp(cast_op, new_broadcast_to_op.getOutput());
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc
index ec6e2b5902503f..062d9c1e712de2 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize_pass.cc
@@ -502,8 +502,8 @@ Value GetBiasMultiplier(OpBuilder& builder, Value binary_op,
       (llvm::isa<mlir::TFL::AddOp>(binary_op.getDefiningOp()) ? 1.0 : -1.0);
   Attribute constant_attr = FloatAttr::get(element_type, multiplier);
 
-  return builder.create<arith::ConstantOp>(
-      binary_op.getLoc(),
+  return arith::ConstantOp::create(
+      builder, binary_op.getLoc(),
       DenseFPElementsAttr::get(RankedTensorType::get({}, element_type),
                                constant_attr));
 }
@@ -677,10 +677,10 @@ Value ReshapeValueDroppingLastDim(OpBuilder& builder, Value value) {
   } else {
     new_shape.push_back(-1);
   }
-  return builder.create<ReshapeOp>(
-      value.getLoc(), value,
-      builder.create<arith::ConstantOp>(
-          value.getLoc(),
+  return ReshapeOp::create(
+      builder, value.getLoc(), value,
+      arith::ConstantOp::create(
+          builder, value.getLoc(),
           DenseIntElementsAttr::get(
               RankedTensorType::get(type.getRank() - 1, builder.getI32Type()),
               new_shape)));
@@ -754,9 +754,8 @@ Value Get1DShapeValue(OpBuilder& builder, Value value) {
   }
   auto output_type = RankedTensorType::get({1}, builder.getI32Type());
   const int num_elements = type.getNumElements();
-  return builder.create<ConstOp>(
-      value.getLoc(), output_type,
-      DenseIntElementsAttr::get(output_type, num_elements));
+  return ConstOp::create(builder, value.getLoc(), output_type,
+                         DenseIntElementsAttr::get(output_type, num_elements));
 }
 
 Type GetEmbeddingLookupShape(Value lookup, Value value) {
@@ -780,8 +779,8 @@ mlir::Value GetFcOutput(OpBuilder* builder,
                         StringAttr fused_activation_function,
                         StringAttr weights_format, BoolAttr keep_num_dims,
                         BoolAttr asymmetric_quantize_inputs) {
-  auto fc_op = builder->create<FullyConnectedOp>(
-      result[0].getLoc(), result.getTypes(), input, filter, bias,
+  auto fc_op = FullyConnectedOp::create(
+      *builder, result[0].getLoc(), result.getTypes(), input, filter, bias,
       fused_activation_function, weights_format, keep_num_dims,
       asymmetric_quantize_inputs);
   return fc_op->getResult(0);
@@ -973,13 +972,13 @@ struct SqueezeReshapesAroundBroadcastOp
             .drop_back(num_trailing_broadcast_dims)
             .drop_front(num_leading_broadcast_dims)};
 
-    Value new_reshape_shape_value = rewriter.create<arith::ConstantOp>(
-        inner_reshape_op->getLoc(),
+    Value new_reshape_shape_value = arith::ConstantOp::create(
+        rewriter, inner_reshape_op->getLoc(),
         GetI32ElementsAttr(new_reshape_shape_i32, &rewriter));
 
-    auto new_inner_reshape_op = rewriter.create<TFL::ReshapeOp>(
-        inner_reshape_op->getLoc(), inner_reshape_input,
-        new_reshape_shape_value);
+    auto new_inner_reshape_op =
+        TFL::ReshapeOp::create(rewriter, inner_reshape_op->getLoc(),
+                               inner_reshape_input, new_reshape_shape_value);
 
     // Create a new reshape_op to replace the old inner reshape_op.
     rewriter.replaceOp(inner_reshape_op, new_inner_reshape_op.getResult());
@@ -990,11 +989,12 @@ struct SqueezeReshapesAroundBroadcastOp
             .drop_back(num_trailing_broadcast_dims)
             .drop_front(num_leading_broadcast_dims)};
 
-    Value new_broadcast_shape_value = rewriter.create<arith::ConstantOp>(
-        loc, GetI64ElementsAttr(new_broadcast_shape, &rewriter));
+    Value new_broadcast_shape_value = arith::ConstantOp::create(
+        rewriter, loc, GetI64ElementsAttr(new_broadcast_shape, &rewriter));
 
-    auto new_broadcast_to_op = rewriter.create<TFL::BroadcastToOp>(
-        loc, RankedTensorType::get(new_broadcast_shape, rewriter.getF32Type()),
+    auto new_broadcast_to_op = TFL::BroadcastToOp::create(
+        rewriter, loc,
+        RankedTensorType::get(new_broadcast_shape, rewriter.getF32Type()),
         new_inner_reshape_op.getOutput(), new_broadcast_shape_value);
 
     // Create a new broadcast_op to replace the old broadcast_op.
@@ -1055,18 +1055,19 @@ struct FuseAddAndStridedSlice : public OpRewritePattern<TFL::StridedSliceOp> {
         added_value.reshape(RankedTensorType::get(
             {num_dims},
             mlir::cast<ShapedType>(added_value.getType()).getElementType()));
-    ::mlir::arith::ConstantOp new_end = rewriter.create<arith::ConstantOp>(
-        strided_slice_op.getEnd().getLoc(), new_added_value);
+    ::mlir::arith::ConstantOp new_end = arith::ConstantOp::create(
+        rewriter, strided_slice_op.getEnd().getLoc(), new_added_value);
 
     if (strided_slice_op.getBeginMask() != 0) return failure();
     if (strided_slice_op.getEndMask() != 0) return failure();
     if (strided_slice_op.getEllipsisMask() != 0) return failure();
     mlir::TFL::StridedSliceOp new_strided_slice_op =
-        rewriter.create<TFL::StridedSliceOp>(
-            strided_slice_op.getLoc(), strided_slice_op.getOutput().getType(),
-            strided_slice_op.getInput(), strided_slice_op.getBegin(), new_end,
-            strided_slice_op.getStrides(), strided_slice_op.getBeginMask(),
-            strided_slice_op.getEndMask(), strided_slice_op.getEllipsisMask(),
+        TFL::StridedSliceOp::create(
+            rewriter, strided_slice_op.getLoc(),
+            strided_slice_op.getOutput().getType(), strided_slice_op.getInput(),
+            strided_slice_op.getBegin(), new_end, strided_slice_op.getStrides(),
+            strided_slice_op.getBeginMask(), strided_slice_op.getEndMask(),
+            strided_slice_op.getEllipsisMask(),
             strided_slice_op.getNewAxisMask(),
             strided_slice_op.getShrinkAxisMask(),
             /*offset=*/true);
@@ -1186,24 +1187,26 @@ struct Convert2DUpscalingToResizeNearestNeighor
     SmallVector<int64_t, 4> reshape_shape_in_int64(
         {1, image_size, image_size, feature_size});
 
-    auto reshape_shape_const_op = rewriter.create<TFL::ConstOp>(
-        gather_nd_first->getLoc(),
-        GetI32ElementsAttr(reshape_shape, &rewriter));
+    auto reshape_shape_const_op =
+        TFL::ConstOp::create(rewriter, gather_nd_first->getLoc(),
+                             GetI32ElementsAttr(reshape_shape, &rewriter));
 
-    auto reshape_op = rewriter.create<TFL::ReshapeOp>(
-        gather_nd_first->getLoc(),
+    auto reshape_op = TFL::ReshapeOp::create(
+        rewriter, gather_nd_first->getLoc(),
         tensorflow::GetTypeFromTFTensorShape(reshape_shape_in_int64,
                                              result_type.getElementType()),
         params_value, reshape_shape_const_op.getResult());
 
     // Add TFL::resize_nearest_neighor op for 2x upscaling.
     SmallVector<int32_t, 2> size_vec = {image_size * 2, image_size * 2};
-    auto size_const_op = rewriter.create<TFL::ConstOp>(
-        gather_nd_first->getLoc(), GetI32ElementsAttr(size_vec, &rewriter));
+    auto size_const_op =
+        TFL::ConstOp::create(rewriter, gather_nd_first->getLoc(),
+                             GetI32ElementsAttr(size_vec, &rewriter));
 
-    auto resize = rewriter.create<TFL::ResizeNearestNeighborOp>(
-        gather_nd_first->getLoc(), transpose_second.getResult().getType(),
-        reshape_op.getResult(), size_const_op.getResult(), false, false);
+    auto resize = TFL::ResizeNearestNeighborOp::create(
+        rewriter, gather_nd_first->getLoc(),
+        transpose_second.getResult().getType(), reshape_op.getResult(),
+        size_const_op.getResult(), false, false);
 
     rewriter.replaceOp(transpose_second, resize.getResult());
     return success();
@@ -1233,13 +1236,13 @@ static std::optional<Value> GetAs1DValue(PatternRewriter& rewriter, Value value,
           RankedTensorType::get({num_channels}, type.getElementType());
       auto splat_attr =
           DenseElementsAttr::get(splat_type, attr.getSplatValue<Attribute>());
-      return rewriter.create<arith::ConstantOp>(value.getLoc(), splat_attr);
+      return arith::ConstantOp::create(rewriter, value.getLoc(), splat_attr);
     }
 
     if (HasOneTailUnitDimension(attr) &&
         attr.getNumElements() == num_channels) {
       auto flattened = FlattenTo1D(attr);
-      return rewriter.create<arith::ConstantOp>(value.getLoc(), flattened);
+      return arith::ConstantOp::create(rewriter, value.getLoc(), flattened);
     }
   }
 
@@ -1259,7 +1262,7 @@ static std::optional<Value> GetBiasIn1D(PatternRewriter& rewriter, Value bias,
     RankedTensorType type =
         RankedTensorType::get({num_channels}, fallback_element_type);
     auto attr = rewriter.getZeroAttr(type);
-    return rewriter.create<arith::ConstantOp>(bias.getLoc(), type, attr);
+    return arith::ConstantOp::create(rewriter, bias.getLoc(), type, attr);
   }
 
   auto bias_type = mlir::dyn_cast<RankedTensorType>(bias.getType());
@@ -1377,34 +1380,34 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern<TFL::AddOp> {
     }
 
     auto new_bias =
-        rewriter
-            .create<AddOp>(add_op.getLoc(), bias_1d.value(), add_rhs_1d.value(),
-                           rewriter.getStringAttr("NONE"))
+        AddOp::create(rewriter, add_op.getLoc(), bias_1d.value(),
+                      add_rhs_1d.value(), rewriter.getStringAttr("NONE"))
             .getOutput();
     mlir::Value out =
-        rewriter
-            .create<TFL::FullyConnectedOp>(
-                mlir::FusedLoc::get(fc_op.getContext(),
-                                    {fc_op.getLoc(), add_op.getLoc()}),
-                fc_output_type,
-                /*input=*/fc_op.getInput(),
-                /*filter=*/filter,
-                /*bias=*/new_bias,
-                /*fused_activation_function=*/
-                rewriter.getStringAttr(add_op.getFusedActivationFunction()),
-                /*weights_format=*/
-                rewriter.getStringAttr(fc_op.getWeightsFormat()),
-                /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.getKeepNumDims()),
-                /*asymmetric_quantize_inputs=*/
-                fc_op.getAsymmetricQuantizeInputsAttr())
+        TFL::FullyConnectedOp::create(
+            rewriter,
+            mlir::FusedLoc::get(fc_op.getContext(),
+                                {fc_op.getLoc(), add_op.getLoc()}),
+            fc_output_type,
+            /*input=*/fc_op.getInput(),
+            /*filter=*/filter,
+            /*bias=*/new_bias,
+            /*fused_activation_function=*/
+            rewriter.getStringAttr(add_op.getFusedActivationFunction()),
+            /*weights_format=*/
+            rewriter.getStringAttr(fc_op.getWeightsFormat()),
+            /*keep_num_dims=*/rewriter.getBoolAttr(fc_op.getKeepNumDims()),
+            /*asymmetric_quantize_inputs=*/
+            fc_op.getAsymmetricQuantizeInputsAttr())
             .getOutput()[0];
 
     if (fc_output_type.getShape() != add_output_type.getShape()) {
-      auto target_shape = rewriter.create<arith::ConstantOp>(
-          add_op.getLoc(), rewriter.getI32TensorAttr(llvm::SmallVector<int32_t>(
-                               add_output_type.getShape())));
-      out = rewriter.create<ReshapeOp>(add_op.getLoc(), add_output_type, out,
-                                       target_shape);
+      auto target_shape = arith::ConstantOp::create(
+          rewriter, add_op.getLoc(),
+          rewriter.getI32TensorAttr(
+              llvm::SmallVector<int32_t>(add_output_type.getShape())));
+      out = ReshapeOp::create(rewriter, add_op.getLoc(), add_output_type, out,
+                              target_shape);
     }
     rewriter.replaceOp(add_op, out);
 
@@ -1471,8 +1474,8 @@ struct FuseAddAndFullyConnected
       return failure();
     }
 
-    auto new_bias = rewriter.create<TFL::FullyConnectedOp>(
-        fc_op.getLoc(), old_bias.getType(),
+    auto new_bias = TFL::FullyConnectedOp::create(
+        rewriter, fc_op.getLoc(), old_bias.getType(),
         /*input=*/add_op.getRhs(),
         /*filter=*/fc_op.getFilter(),
         /*bias=*/old_bias,
@@ -1482,7 +1485,8 @@ struct FuseAddAndFullyConnected
         /*asymmetric_quantize_inputs=*/fc_op.getAsymmetricQuantizeInputsAttr());
 
     // Create the updated FC.
-    auto new_fc = rewriter.create<TFL::FullyConnectedOp>(
+    auto new_fc = TFL::FullyConnectedOp::create(
+        rewriter,
         FusedLoc::get(add_op.getContext(), {add_op.getLoc(), fc_op.getLoc()}),
         fc_op.getOutput().getTypes(),
         /*input=*/add_op.getLhs(),
@@ -1557,14 +1561,14 @@ struct FuseMulAndFullyConnected
     auto location =
         FusedLoc::get(mul_op.getContext(), {mul_op.getLoc(), fc_op.getLoc()});
 
-    auto new_filter = rewriter.create<TFL::MulOp>(
-        location,
+    auto new_filter = TFL::MulOp::create(
+        rewriter, location,
         /*lhs=*/fc_op.getFilter(),
         /*rhs=*/mul_op.getRhs(),
         /*fused_activation_function=*/rewriter.getStringAttr("NONE"));
     // Create the updated FC.
-    auto new_fc = rewriter.create<TFL::FullyConnectedOp>(
-        location, fc_op.getOutput().getTypes(),
+    auto new_fc = TFL::FullyConnectedOp::create(
+        rewriter, location, fc_op.getOutput().getTypes(),
         /*input=*/mul_op.getLhs(),
         /*filter=*/new_filter,
         /*bias=*/fc_op.getBias(),
@@ -1597,7 +1601,8 @@ struct FuseFullyConnectedAndReluX : public OpRewritePattern<ReluXOp> {
         rewriter.getStringAttr(fully_connected_op.getWeightsFormat());
     auto new_keep_num_dims =
         rewriter.getBoolAttr(fully_connected_op.getKeepNumDims());
-    auto fc = rewriter.create<FullyConnectedOp>(
+    auto fc = FullyConnectedOp::create(
+        rewriter,
         FusedLoc::get(relu_op.getContext(),
                       {fully_connected_op.getLoc(), relu_op.getLoc()}),
         relu_op.getType(), /*input=*/fully_connected_op.getInput(),
@@ -1674,7 +1679,7 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern<TFL::MulOp> {
     }
 
     auto new_op =
-        rewriter.create<arith::ConstantOp>(mul_op.getLoc(), new_type, new_cst);
+        arith::ConstantOp::create(rewriter, mul_op.getLoc(), new_type, new_cst);
     Value new_const_val = new_op.getResult();
 
     // Rewrite. Since the folder of TFL::MulOp couldn't broadcast the operands,
@@ -1689,15 +1694,16 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern<TFL::MulOp> {
       if (size > (1 << 30)) return failure();
     }
     auto new_filter =
-        rewriter.create<TF::MulOp>(mul_op.getLoc(), filter, new_const_val)
+        TF::MulOp::create(rewriter, mul_op.getLoc(), filter, new_const_val)
             .getZ();
     // If bias isn't None, it needs to be multiplied as well.
     if (!mlir::isa<NoneType>(bias.getType())) {
-      bias = rewriter.create<TF::MulOp>(mul_op.getLoc(), bias, constant_val)
+      bias = TF::MulOp::create(rewriter, mul_op.getLoc(), bias, constant_val)
                  .getZ();
     }
 
-    auto fc = rewriter.create<TFL::FullyConnectedOp>(
+    auto fc = TFL::FullyConnectedOp::create(
+        rewriter,
         FusedLoc::get(fc_op.getContext(), {fc_op.getLoc(), mul_op.getLoc()}),
         mul_op.getType(),
         /*input=*/fc_op.getInput(),
@@ -1848,13 +1854,13 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern<TFL::MulOp> {
     DenseElementsAttr broadcasted_gamma_attr =
         ExpandTo4DForConv(gamma_cst, filter_output_dim);
     auto broadcasted_gamma =
-        rewriter.create<ConstOp>(loc, broadcasted_gamma_attr);
+        ConstOp::create(rewriter, loc, broadcasted_gamma_attr);
 
     // Inject a mul between the filter constant and the quantize op.
-    auto new_filter = rewriter
-                          .create<TFL::MulOp>(loc, filter, broadcasted_gamma,
-                                              rewriter.getStringAttr("NONE"))
-                          .getResult();
+    auto new_filter =
+        TFL::MulOp::create(rewriter, loc, filter, broadcasted_gamma,
+                           rewriter.getStringAttr("NONE"))
+            .getResult();
     // Update the scale in the quantize op.
     auto new_qtype = RescaleQtype(q_op.getQtype(), gamma_cst);
     if (!new_qtype) {
@@ -1869,11 +1875,11 @@ struct FuseAffinOpAndMulWithQDQs : public OpRewritePattern<TFL::MulOp> {
 
       auto squeezed_gamma = FlattenTo1D(gamma_cst);
       auto squeezed_gamma_type = squeezed_gamma.getType();
-      auto squeezed_gamma_op = rewriter.create<arith::ConstantOp>(
-          affine_op.getLoc(), squeezed_gamma_type, squeezed_gamma);
+      auto squeezed_gamma_op = arith::ConstantOp::create(
+          rewriter, affine_op.getLoc(), squeezed_gamma_type, squeezed_gamma);
 
-      auto new_bias = rewriter.create<TFL::MulOp>(
-          loc, bias, squeezed_gamma_op, rewriter.getStringAttr("NONE"));
+      auto new_bias = TFL::MulOp::create(rewriter, loc, bias, squeezed_gamma_op,
+                                         rewriter.getStringAttr("NONE"));
       affine_op.getOperation()->replaceUsesOfWith(bias, new_bias);
     }
 
@@ -1977,7 +1983,7 @@ struct FuseBinaryOpToFollowingAffineOp : public OpRewritePattern<AffineOpType> {
       }
       auto new_bias = DenseFPElementsAttr::get(new_bias_type, new_bias_values);
       auto new_bias_op =
-          rewriter.create<ConstOp>(fc_op.getLoc(), new_bias_type, new_bias);
+          ConstOp::create(rewriter, fc_op.getLoc(), new_bias_type, new_bias);
       fc_op.setOperand(0, binary_op->getOperand(0));
       fc_op.setOperand(2, new_bias_op);
     } else if (llvm::isa<MulOp, DivOp>(binary_op)) {
@@ -1992,8 +1998,8 @@ struct FuseBinaryOpToFollowingAffineOp : public OpRewritePattern<AffineOpType> {
           });
       // We recreate the constant op in case it is shared by the other ops. This
       // might increase the model size.
-      auto new_filter_op = rewriter.create<ConstOp>(
-          fc_op.getLoc(), filter.getType(), new_filter);
+      auto new_filter_op = ConstOp::create(rewriter, fc_op.getLoc(),
+                                           filter.getType(), new_filter);
       fc_op.setOperand(0, binary_op->getOperand(0));
       if (fc_op.getFilter() != filter) {
         // This filter goes through quantize and dequantize ops. Then we just
@@ -2186,8 +2192,9 @@ struct FuseUnpackAndConcatToReshape
       new_shape_array_i32.push_back(
           ShapedType::isDynamic(size) ? -1 : static_cast<int32_t>(size));
     }
-    auto new_shape = rewriter.create<TFL::ConstOp>(
-        concat_op.getLoc(), GetI32ElementsAttr(new_shape_array_i32, &rewriter));
+    auto new_shape = TFL::ConstOp::create(
+        rewriter, concat_op.getLoc(),
+        GetI32ElementsAttr(new_shape_array_i32, &rewriter));
 
     rewriter.replaceOpWithNewOp<TFL::ReshapeOp>(
         concat_op, output_type, unpack_op.getInput(), new_shape);
@@ -2273,8 +2280,8 @@ struct OptimizeTopK : public OpRewritePattern<TFL::TopKV2Op> {
     auto k = !values.use_empty() ? k_values : k_indices;
     // Build scalar tensor k.
     auto k_ty = mlir::RankedTensorType::get({}, rewriter.getIntegerType(32));
-    Value k_cst = rewriter.create<TFL::ConstOp>(
-        op.getLoc(), DenseElementsAttr::get(k_ty, k));
+    Value k_cst = TFL::ConstOp::create(rewriter, op.getLoc(),
+                                       DenseElementsAttr::get(k_ty, k));
     // Compute new result types.
     auto values_ty = mlir::dyn_cast<ShapedType>(values.getType());
     auto indices_ty = mlir::dyn_cast<ShapedType>(indices.getType());
@@ -2287,8 +2294,9 @@ struct OptimizeTopK : public OpRewritePattern<TFL::TopKV2Op> {
         mlir::RankedTensorType::get(shape, values_ty.getElementType());
     auto new_indices_ty =
         mlir::RankedTensorType::get(shape, indices_ty.getElementType());
-    TFL::TopKV2Op top_k_op = rewriter.create<TFL::TopKV2Op>(
-        op.getLoc(), new_values_ty, new_indices_ty, op->getOperand(0), k_cst);
+    TFL::TopKV2Op top_k_op =
+        TFL::TopKV2Op::create(rewriter, op.getLoc(), new_values_ty,
+                              new_indices_ty, op->getOperand(0), k_cst);
 
     // Remove original ops (topk, Slice, Slice).
     if (!values.use_empty()) {
@@ -2376,10 +2384,12 @@ struct FuseReshapeAndTransposeAroundBatchMatmul
         static_cast<int>(std::accumulate(
             transpose_input.getType().getShape().begin() + 2,
             transpose_input.getType().getShape().end(), 1, std::multiplies()))};
-    auto shape_constant = rewriter.create<ConstOp>(
-        batch_matmul.getLoc(), GetI32ElementsAttr(new_shape, &rewriter));
-    auto reshaped_input = rewriter.create<ReshapeOp>(
-        batch_matmul.getLoc(), transpose_op.getInput(), shape_constant);
+    auto shape_constant =
+        ConstOp::create(rewriter, batch_matmul.getLoc(),
+                        GetI32ElementsAttr(new_shape, &rewriter));
+    auto reshaped_input =
+        ReshapeOp::create(rewriter, batch_matmul.getLoc(),
+                          transpose_op.getInput(), shape_constant);
     rewriter.replaceOpWithNewOp<BatchMatMulOp>(
         op, op.getType(), reshaped_input, batch_matmul.getX(),
         /*adj_x=*/false, /*adj_y=*/!batch_matmul.getAdjX(),
@@ -2438,10 +2448,10 @@ struct FuseTransposeReshapeIntoBatchMatmul
         reshape_op.getType().getShape().drop_front().begin(),
         reshape_op.getType().getShape().drop_front().end());
     new_shape.push_back(reshape_op.getType().getDimSize(0));
-    auto shape_constant = rewriter.create<ConstOp>(
-        op.getLoc(), GetI32ElementsAttr(new_shape, &rewriter));
-    auto new_reshape = rewriter.create<ReshapeOp>(
-        op.getLoc(), transpose_op.getInput(), shape_constant);
+    auto shape_constant = ConstOp::create(
+        rewriter, op.getLoc(), GetI32ElementsAttr(new_shape, &rewriter));
+    auto new_reshape = ReshapeOp::create(
+        rewriter, op.getLoc(), transpose_op.getInput(), shape_constant);
     rewriter.replaceOpWithNewOp<BatchMatMulOp>(
         op, op.getType(), op.getX(), new_reshape, op.getAdjX(), !op.getAdjY(),
         op.getAsymmetricQuantizeInputsAttr());
@@ -2647,8 +2657,8 @@ struct UndoBroadcastFullyConnectedBiasAddWithQDQs
 
     auto new_bias = FlattenTo1D(bias_op.getValueAttr());
     auto new_bias_type = new_bias.getType();
-    auto new_bias_op = rewriter.create<arith::ConstantOp>(
-        bias_op.getLoc(), new_bias_type, new_bias);
+    auto new_bias_op = arith::ConstantOp::create(rewriter, bias_op.getLoc(),
+                                                 new_bias_type, new_bias);
 
     // Update QuantizeOp with the new bias and its output shape
     q_op.setOperand(new_bias_op);
@@ -2717,10 +2727,11 @@ struct MoveReshapeAfterFullyConnected
     new_input_shape.pop_back();
     new_input_shape.push_back(input_ty.getShape().back());
 
-    auto reshape_before = rewriter.create<TFL::ReshapeOp>(
-        fc.getLoc(), fc.getInput(),
-        rewriter.create<arith::ConstantOp>(
-            fc->getLoc(), GetI32ElementsAttr(new_input_shape, &rewriter)));
+    auto reshape_before = TFL::ReshapeOp::create(
+        rewriter, fc.getLoc(), fc.getInput(),
+        arith::ConstantOp::create(
+            rewriter, fc->getLoc(),
+            GetI32ElementsAttr(new_input_shape, &rewriter)));
 
     rewriter.replaceOpWithNewOp<TFL::FullyConnectedOp>(
         reshape,
@@ -2864,16 +2875,16 @@ struct PushTransposeThroughSqueeze : public RewritePattern {
             transpose.getInput().getType().getDimSize(i));
       }
     }
-    auto new_squeeze = rewriter.create<TFL::SqueezeOp>(
-        squeeze->getLoc(),
+    auto new_squeeze = TFL::SqueezeOp::create(
+        rewriter, squeeze->getLoc(),
         mlir::RankedTensorType::get(new_squeeze_shape,
                                     squeeze.getType().getElementType()),
         transpose.getInput(), rewriter.getI32ArrayAttr(new_squeeze_dims));
 
-    auto new_transpose = rewriter.create<TFL::TransposeOp>(
-        squeeze->getLoc(), squeeze.getType(), new_squeeze,
-        rewriter.create<arith::ConstantOp>(
-            squeeze->getLoc(), GetI32ElementsAttr(new_perm, &rewriter)));
+    auto new_transpose = TFL::TransposeOp::create(
+        rewriter, squeeze->getLoc(), squeeze.getType(), new_squeeze,
+        arith::ConstantOp::create(rewriter, squeeze->getLoc(),
+                                  GetI32ElementsAttr(new_perm, &rewriter)));
 
     rewriter.replaceOp(squeeze, new_transpose);
     return success();
@@ -3000,17 +3011,18 @@ struct ReorderTransposeReshapeTranspose
         mlir::dyn_cast_or_null<RankedTensorType>(reshape.getType());
     if (!reshape_type) return failure();
 
-    auto new_reshape_shape_const = rewriter.create<arith::ConstantOp>(
-        reshape.getLoc(), GetI32ElementsAttr(new_reshape_shape, &rewriter));
+    auto new_reshape_shape_const = arith::ConstantOp::create(
+        rewriter, reshape.getLoc(),
+        GetI32ElementsAttr(new_reshape_shape, &rewriter));
 
-    auto new_inner_reshape = rewriter.create<TFL::ReshapeOp>(
-        reshape.getLoc(),
+    auto new_inner_reshape = TFL::ReshapeOp::create(
+        rewriter, reshape.getLoc(),
         RankedTensorType::get(new_reshape_shape, reshape_type.getElementType()),
         input, new_reshape_shape_const.getResult());
-    auto new_inner_tpose = rewriter.create<TFL::TransposeOp>(
-        inner_tpose.getLoc(), reshape_type, new_inner_reshape,
-        rewriter.create<arith::ConstantOp>(
-            inner_tpose.getLoc(),
+    auto new_inner_tpose = TFL::TransposeOp::create(
+        rewriter, inner_tpose.getLoc(), reshape_type, new_inner_reshape,
+        arith::ConstantOp::create(
+            rewriter, inner_tpose.getLoc(),
             GetI32ElementsAttr(new_inner_perm, &rewriter)));
 
     rewriter.replaceOp(reshape, new_inner_tpose);
@@ -3079,8 +3091,8 @@ struct FullyConnectedSwapOperandsWhenLHSIsConst
     RankedTensorType intermediate_type =
         RankedTensorType::get({O, B}, element_type);
 
-    auto new_fc = rewriter.create<TFL::FullyConnectedOp>(
-        loc,
+    auto new_fc = TFL::FullyConnectedOp::create(
+        rewriter, loc,
         /*resultTypes=*/intermediate_type,
         /*input=*/filter,  // Original Filter V[O, I]
         /*filter=*/input,  // Original Input C[B, I]
@@ -3096,10 +3108,11 @@ struct FullyConnectedSwapOperandsWhenLHSIsConst
     RankedTensorType final_shape_type =
         RankedTensorType::get({B, O}, element_type);
 
-    Value transposed_result = rewriter.create<TFL::TransposeOp>(
-        loc, final_shape_type, new_fc.getResult(0),
-        rewriter.create<arith::ConstantOp>(
-            loc, GetI32ElementsAttr(ArrayRef<int32_t>({1, 0}), &rewriter)));
+    Value transposed_result = TFL::TransposeOp::create(
+        rewriter, loc, final_shape_type, new_fc.getResult(0),
+        arith::ConstantOp::create(
+            rewriter, loc,
+            GetI32ElementsAttr(ArrayRef<int32_t>({1, 0}), &rewriter)));
 
     rewriter.replaceOp(fc, transposed_result);
 
diff --git a/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc b/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc
index 7baa0136f1c33c..b93422d3812f6c 100644
--- a/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/pin_ops_with_side_effects.cc
@@ -140,15 +140,15 @@ void PinOpsWithSideEffectsPass::runOnOperation() {
     // Wrap all side-effect producing/dependent operations in a ControlNodeOp.
     builder.setInsertionPoint(op);
     Location loc = op->getLoc();
-    auto outer_op = builder.create<ControlNodeOp>(
-        loc, op->getResultTypes(), ControlType::get(op->getContext()),
-        control_tokens);
+    auto outer_op = ControlNodeOp::create(builder, loc, op->getResultTypes(),
+                                          ControlType::get(op->getContext()),
+                                          control_tokens);
     Region region;
     Block *new_block = new Block;
     region.push_back(new_block);
     builder.setInsertionPointToEnd(&region.front());
     Operation *inner_op = builder.clone(*op);
-    builder.create<YieldOp>(loc, inner_op->getResults());
+    YieldOp::create(builder, loc, inner_op->getResults());
     outer_op.getBody().takeBody(region);
     // Careful: We can't use outer_op.getResults(), because that also includes
     // the control token.
diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc
index 2538cc423cdf1e..0cf34df94faf6c 100644
--- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc
@@ -183,7 +183,7 @@ std::optional<mlir::Value> GetConstTensor(PatternRewriter& rewriter,
   auto const_attr = DenseElementsAttr::get(const_type, vec);
 
   auto const_op =
-      rewriter.create<arith::ConstantOp>(loc, const_type, const_attr);
+      arith::ConstantOp::create(rewriter, loc, const_type, const_attr);
   return const_op.getResult();
 }
 
@@ -207,8 +207,8 @@ std::optional<mlir::Value> ConvertDequantizeOp(
     auto const_attr =
         DenseElementsAttr::get(const_type, static_cast<float>(zeropoint[0]));
 
-    auto const_op = rewriter.create<arith::ConstantOp>(op->getLoc(), const_type,
-                                                       const_attr);
+    auto const_op = arith::ConstantOp::create(rewriter, op->getLoc(),
+                                              const_type, const_attr);
     zp_val = const_op.getResult();
   } else {
     SmallVector<int64_t> shape;
@@ -224,8 +224,8 @@ std::optional<mlir::Value> ConvertDequantizeOp(
     auto const_attr =
         DenseElementsAttr::get(const_type, static_cast<float>(scale[0]));
 
-    auto const_op = rewriter.create<arith::ConstantOp>(op->getLoc(), const_type,
-                                                       const_attr);
+    auto const_op = arith::ConstantOp::create(rewriter, op->getLoc(),
+                                              const_type, const_attr);
     scale_val = const_op.getResult();
   } else {
     SmallVector<int64_t> shape;
@@ -237,16 +237,17 @@ std::optional<mlir::Value> ConvertDequantizeOp(
   if (!zp_val || !scale_val) return std::nullopt;
 
   auto op1_cast_in =
-      rewriter.create<TFL::CastOp>(op->getLoc(), output_type, input_value);
+      TFL::CastOp::create(rewriter, op->getLoc(), output_type, input_value);
 
-  auto op2_sub_op1 = rewriter.create<TFL::SubOp>(
-      op->getLoc(), output_type, op1_cast_in.getResult(), zp_val.value(),
+  auto op2_sub_op1 = TFL::SubOp::create(
+      rewriter, op->getLoc(), output_type, op1_cast_in.getResult(),
+      zp_val.value(),
       /*fused_activation_function=*/rewriter.getStringAttr("NONE"));
 
-  return rewriter
-      .create<TFL::MulOp>(
-          op->getLoc(), output_type, op2_sub_op1.getResult(), scale_val.value(),
-          /*fused_activation_function=*/rewriter.getStringAttr("NONE"))
+  return TFL::MulOp::create(
+             rewriter, op->getLoc(), output_type, op2_sub_op1.getResult(),
+             scale_val.value(),
+             /*fused_activation_function=*/rewriter.getStringAttr("NONE"))
       .getResult();
 }
 
@@ -313,8 +314,8 @@ struct RemoveVolatileOps : public OpRewritePattern<DequantizeOp> {
 
       auto const_type = tensorflow::GetTypeFromTFTensorShape(
           output_type.getShape(), qtype.getStorageType());
-      auto const_op = rewriter.create<arith::ConstantOp>(
-          op->getLoc(), const_type, qconst_op.getValue());
+      auto const_op = arith::ConstantOp::create(
+          rewriter, op->getLoc(), const_type, qconst_op.getValue());
 
       auto new_value =
           ConvertDequantizeOp(rewriter, op, output_type, const_op.getResult(),
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
index 957d243e72774d..899e4e9e088312 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
@@ -102,7 +102,7 @@ static Value CreateTFCastOpI32(OpBuilder *builder, Location loc, Value x,
   auto x_type = mlir::dyn_cast_or_null<ShapedType>(x.getType());
   if (!x_type) llvm_unreachable("unsupported type");
   Type type = x_type.clone(builder->getI32Type());
-  return builder->create<TF::CastOp>(loc, type, x, truncate);
+  return TF::CastOp::create(*builder, loc, type, x, truncate);
 }
 }  // namespace
 
@@ -253,7 +253,7 @@ class ConvertTFConvOp : public RewritePattern {
         tensorflow::GetTypeFromTFTensorShape({bias_dim}, elem_type);
     auto bias_attr = rewriter.getZeroAttr(bias_type);
     auto bias =
-        rewriter.create<TF::ConstOp>(op->getLoc(), bias_type, bias_attr);
+        TF::ConstOp::create(rewriter, op->getLoc(), bias_type, bias_attr);
 
     if (op->getAttrOfType<StringAttr>("padding").getValue() == "EXPLICIT") {
       // Add Const op for padding value.
@@ -276,12 +276,12 @@ class ConvertTFConvOp : public RewritePattern {
           mlir::DenseIntElementsAttr::get(padding_attr_type, padding_values);
 
       auto padding_const =
-          rewriter.create<TF::ConstOp>(op->getLoc(), padding_attr);
+          TF::ConstOp::create(rewriter, op->getLoc(), padding_attr);
 
       // Add Pad op.
       auto pad_output_type = UnrankedTensorType::get(elem_type);
-      input = rewriter.create<TF::PadOp>(op->getLoc(), pad_output_type, input,
-                                         padding_const);
+      input = TF::PadOp::create(rewriter, op->getLoc(), pad_output_type, input,
+                                padding_const);
 
       // Set Conv padding to `VALID` since padding has been handled by Pad op.
       state.padding = rewriter.getStringAttr("VALID");
@@ -315,8 +315,8 @@ class ConvertTFConv2D : public ConvertTFConvOp<ConvertTFConv2D, TF::Conv2DOp> {
                             Type result_type, Value input, Value filter,
                             Value bias) const {
     filter = legalizeFilter(rewriter, loc, filter);
-    return rewriter.create<TFL::Conv2DOp>(
-        loc, result_type, input, filter, bias,
+    return TFL::Conv2DOp::create(
+        rewriter, loc, result_type, input, filter, bias,
         /*dilation_h_factor=*/state->dilation_height_factor,
         /*dilation_w_factor=*/state->dilation_width_factor,
         /*fused_activation_function=*/rewriter.getStringAttr("NONE"),
@@ -338,7 +338,7 @@ class ConvertTFConv2D : public ConvertTFConvOp<ConvertTFConv2D, TF::Conv2DOp> {
         {static_cast<int>(perm.size())}, rewriter.getIntegerType(32));
     auto perm_attr =
         DenseElementsAttr::get(perm_type, llvm::ArrayRef<int>(perm));
-    auto perm_op = rewriter.create<TF::ConstOp>(loc, perm_type, perm_attr);
+    auto perm_op = TF::ConstOp::create(rewriter, loc, perm_type, perm_attr);
 
     // Create tensor type for the transpose result.
     auto filter_type = mlir::cast<RankedTensorType>(filter.getType());
@@ -350,7 +350,7 @@ class ConvertTFConv2D : public ConvertTFConvOp<ConvertTFConv2D, TF::Conv2DOp> {
     auto result_type =
         tensorflow::GetTypeFromTFTensorShape(result_shape, elem_type);
 
-    return rewriter.create<TF::TransposeOp>(loc, result_type, filter, perm_op);
+    return TF::TransposeOp::create(rewriter, loc, result_type, filter, perm_op);
   }
 };
 
@@ -382,8 +382,8 @@ class ConvertTFDepthwiseConv2dNative
         mlir::cast<RankedTensorType>(filter.getType()).getDimSize(3);
 
     filter = legalizeFilter(rewriter, loc, filter);
-    return rewriter.create<TFL::DepthwiseConv2DOp>(
-        loc, result_type, input, filter, bias,
+    return TFL::DepthwiseConv2DOp::create(
+        rewriter, loc, result_type, input, filter, bias,
         /*dilation_h_factor=*/state->dilation_height_factor,
         /*dilation_w_factor=*/state->dilation_width_factor,
         /*fused_activation_function=*/rewriter.getStringAttr("NONE"),
@@ -420,9 +420,9 @@ class ConvertTFDepthwiseConv2dNative
           rewriter.getI32IntegerAttr(ConvertToTfliteSize(size));
     }
     auto shape_attr = DenseElementsAttr::get(shape_type, result_shape_data);
-    auto shape = rewriter.create<TF::ConstOp>(loc, shape_type, shape_attr);
+    auto shape = TF::ConstOp::create(rewriter, loc, shape_type, shape_attr);
 
-    return rewriter.create<TF::ReshapeOp>(loc, result_type, filter, shape);
+    return TF::ReshapeOp::create(rewriter, loc, result_type, filter, shape);
   }
 };
 
@@ -495,11 +495,11 @@ struct ConvertTFStridedSlice : public RewritePattern {
 
     auto shape_attr = DenseElementsAttr::get(shape_type, result_shape_data);
     auto shape =
-        rewriter.create<arith::ConstantOp>(loc, shape_type, shape_attr);
+        arith::ConstantOp::create(rewriter, loc, shape_type, shape_attr);
     auto revised_output_type = tensorflow::GetTypeFromTFTensorShape(
         revised_shape, original_input_type.getElementType());
-    TF::ReshapeOp reshape = rewriter.create<TF::ReshapeOp>(
-        loc, revised_output_type, original_input, shape);
+    TF::ReshapeOp reshape = TF::ReshapeOp::create(
+        rewriter, loc, revised_output_type, original_input, shape);
 
     // Replace the original strided_slice.
     uint64_t revised_begin_mask = strided_slice_op.getBeginMask();
@@ -656,13 +656,13 @@ struct ConvertTFStridedSlice : public RewritePattern {
 
     auto begin_attr = DenseElementsAttr::get<int32_t>(type, padded_begin);
     auto begin_op =
-        rewriter.create<arith::ConstantOp>(op->getLoc(), type, begin_attr);
+        arith::ConstantOp::create(rewriter, op->getLoc(), type, begin_attr);
     auto end_attr = DenseElementsAttr::get<int32_t>(type, padded_end);
     auto end_op =
-        rewriter.create<arith::ConstantOp>(op->getLoc(), type, end_attr);
+        arith::ConstantOp::create(rewriter, op->getLoc(), type, end_attr);
     auto stride_attr = DenseElementsAttr::get<int32_t>(type, padded_stride);
     auto stride_op =
-        rewriter.create<arith::ConstantOp>(op->getLoc(), type, stride_attr);
+        arith::ConstantOp::create(rewriter, op->getLoc(), type, stride_attr);
 
     rewriter.replaceOpWithNewOp<TF::StridedSliceOp>(
         op, strided_slice_op.getType(), input, begin_op.getResult(),
@@ -767,17 +767,17 @@ struct ConvertTFStridedSlice : public RewritePattern {
 
     auto begin_end_type = tensorflow::GetTypeFromTFTensorShape(
         {num_input_dims}, rewriter.getIntegerType(32));
-    auto new_begin_attr = rewriter.create<arith::ConstantOp>(
-        op->getLoc(), begin_end_type,
+    auto new_begin_attr = arith::ConstantOp::create(
+        rewriter, op->getLoc(), begin_end_type,
         DenseElementsAttr::get<int32_t>(begin_end_type, padded_begin));
-    auto new_end_attr = rewriter.create<arith::ConstantOp>(
-        op->getLoc(), begin_end_type,
+    auto new_end_attr = arith::ConstantOp::create(
+        rewriter, op->getLoc(), begin_end_type,
         DenseElementsAttr::get<int32_t>(begin_end_type, padded_end));
     auto strides_type = tensorflow::GetTypeFromTFTensorShape(
         {static_cast<int64_t>(padded_strides.size())},
         rewriter.getIntegerType(32));
-    auto new_strides_attr = rewriter.create<arith::ConstantOp>(
-        op->getLoc(), strides_type,
+    auto new_strides_attr = arith::ConstantOp::create(
+        rewriter, op->getLoc(), strides_type,
         DenseElementsAttr::get<int32_t>(strides_type, padded_strides));
 
     auto attribute_type = rewriter.getIntegerType(64);
@@ -1043,9 +1043,10 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern {
         auto reduce_dim_type = tensorflow::GetTypeFromTFTensorShape(
             {3}, rewriter.getIntegerType(32));
         ::mlir::SmallVector<int32_t, 3> reduce_dim_values = {0, 1, 2};
-        reduce_dim_op = rewriter.create<TF::ConstOp>(
-            odsLoc, ::mlir::DenseIntElementsAttr::get(reduce_dim_type,
-                                                      reduce_dim_values));
+        reduce_dim_op =
+            TF::ConstOp::create(rewriter, odsLoc,
+                                ::mlir::DenseIntElementsAttr::get(
+                                    reduce_dim_type, reduce_dim_values));
       }
 
       auto new_mean_type = tensorflow::GetTypeFromTFTensorShape(
@@ -1053,8 +1054,8 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern {
       ::mlir::TF::MeanOp mean_op_1;
       {
         ::mlir::Value x_value = (*x.begin());
-        mean_op_1 = rewriter.create<TF::MeanOp>(
-            odsLoc, new_mean_type, x_value, reduce_dim_op,
+        mean_op_1 = TF::MeanOp::create(
+            rewriter, odsLoc, new_mean_type, x_value, reduce_dim_op,
             /*keep_dims=*/rewriter.getBoolAttr(false));
       }
 
@@ -1064,15 +1065,15 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern {
         ::mlir::Value tblgen_value_1 = (*mean_op_1.getODSResults(0).begin());
         // If x has shape of [b, h, w, c], the result of mean_op_1 will have
         // shape of [c]. Therefore, their shapes are always compatible.
-        square_diff_op = rewriter.create<::mlir::TF::SquaredDifferenceOp>(
-            odsLoc, tblgen_value_0, tblgen_value_1);
+        square_diff_op = ::mlir::TF::SquaredDifferenceOp::create(
+            rewriter, odsLoc, tblgen_value_0, tblgen_value_1);
       }
 
       ::mlir::TF::MeanOp mean_op_2;
       {
         ::mlir::Value input_value = (*square_diff_op.getODSResults(0).begin());
-        mean_op_2 = rewriter.create<TF::MeanOp>(
-            odsLoc, new_mean_type, input_value, reduce_dim_op,
+        mean_op_2 = TF::MeanOp::create(
+            rewriter, odsLoc, new_mean_type, input_value, reduce_dim_op,
             /*keep_dims=*/rewriter.getBoolAttr(false));
       }
 
@@ -1083,57 +1084,56 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern {
     ::llvm::SmallVector<::mlir::Value, 4> replace_values;
     ::mlir::TF::ConstOp epsilon_const_op;
     {
-      epsilon_const_op =
-          rewriter.create<::mlir::TF::ConstOp>(odsLoc,
-                                               /*value=*/epsilon);
+      epsilon_const_op = ::mlir::TF::ConstOp::create(rewriter, odsLoc,
+                                                     /*value=*/epsilon);
     }
     ::mlir::TF::AddOp add_op_1;
     {
       ::mlir::Value epsilon_value =
           (*epsilon_const_op.getODSResults(0).begin());
       // Multiplying with a constant, no need to check broadcastibility.
-      add_op_1 = rewriter.create<::mlir::TF::AddOp>(odsLoc,
-                                                    /*x=*/variance_value,
-                                                    /*y=*/epsilon_value);
+      add_op_1 = ::mlir::TF::AddOp::create(rewriter, odsLoc,
+                                           /*x=*/variance_value,
+                                           /*y=*/epsilon_value);
     }
     ::mlir::TF::RsqrtOp rsqrt_op;
     {
       ::mlir::SmallVector<::mlir::Value, 4> tblgen_values;
       ::mlir::SmallVector<::mlir::NamedAttribute, 4> tblgen_attrs;
       tblgen_values.push_back((*add_op_1.getODSResults(0).begin()));
-      rsqrt_op = rewriter.create<::mlir::TF::RsqrtOp>(odsLoc, tblgen_values,
-                                                      tblgen_attrs);
+      rsqrt_op = ::mlir::TF::RsqrtOp::create(rewriter, odsLoc, tblgen_values,
+                                             tblgen_attrs);
     }
     ::mlir::TF::MulOp multiplier;
     {
       ::mlir::Value tblgen_value_0 = (*scale.begin());
       ::mlir::Value tblgen_value_1 = (*rsqrt_op.getODSResults(0).begin());
-      multiplier = rewriter.create<::mlir::TF::MulOp>(odsLoc,
-                                                      /*x=*/tblgen_value_0,
-                                                      /*y=*/tblgen_value_1);
+      multiplier = ::mlir::TF::MulOp::create(rewriter, odsLoc,
+                                             /*x=*/tblgen_value_0,
+                                             /*y=*/tblgen_value_1);
     }
     ::mlir::TF::MulOp mul_op_1;
     {
       ::mlir::Value tblgen_value_0 = (*x.begin());
       ::mlir::Value tblgen_value_1 = (*multiplier.getODSResults(0).begin());
-      mul_op_1 = rewriter.create<::mlir::TF::MulOp>(odsLoc,
-                                                    /*x=*/tblgen_value_0,
-                                                    /*y=*/tblgen_value_1);
+      mul_op_1 = ::mlir::TF::MulOp::create(rewriter, odsLoc,
+                                           /*x=*/tblgen_value_0,
+                                           /*y=*/tblgen_value_1);
     }
     ::mlir::TF::MulOp mul_op_2;
     {
       ::mlir::Value multiplier_value = (*multiplier.getODSResults(0).begin());
-      mul_op_2 = rewriter.create<::mlir::TF::MulOp>(odsLoc,
-                                                    /*x=*/mean_value,
-                                                    /*y=*/multiplier_value);
+      mul_op_2 = ::mlir::TF::MulOp::create(rewriter, odsLoc,
+                                           /*x=*/mean_value,
+                                           /*y=*/multiplier_value);
     }
     ::mlir::TF::SubOp sub_op;
     {
       ::mlir::Value tblgen_value_0 = (*offset.begin());
       ::mlir::Value tblgen_value_1 = (*mul_op_2.getODSResults(0).begin());
-      sub_op = rewriter.create<::mlir::TF::SubOp>(odsLoc,
-                                                  /*x=*/tblgen_value_0,
-                                                  /*y=*/tblgen_value_1);
+      sub_op = ::mlir::TF::SubOp::create(rewriter, odsLoc,
+                                         /*x=*/tblgen_value_0,
+                                         /*y=*/tblgen_value_1);
     }
     ::mlir::TF::AddOp add_op_2;
     {
@@ -1145,8 +1145,8 @@ struct FusedBatchNormV3Pat : public ::mlir::RewritePattern {
       for (auto v : fused_batch_norm_op.getODSResults(0)) {
         tblgen_types.push_back(v.getType());
       }
-      add_op_2 = rewriter.create<::mlir::TF::AddOp>(
-          odsLoc, tblgen_types, tblgen_values, tblgen_attrs);
+      add_op_2 = ::mlir::TF::AddOp::create(rewriter, odsLoc, tblgen_types,
+                                           tblgen_values, tblgen_attrs);
     }
     for (auto v :
          ::llvm::SmallVector<::mlir::Value, 4>{add_op_2.getODSResults(0)}) {
@@ -1261,9 +1261,9 @@ struct ReorderFakeQuantPattern : public RewritePattern {
                                 ReorderOp &new_reorder_op) const {
     Value tensor_value = (*input.begin());
     Value shape_value = (*shape.begin());
-    new_reorder_op = rewriter.create<ReorderOp>(ods_loc,
-                                                /*tensor=*/tensor_value,
-                                                /*shape=*/shape_value);
+    new_reorder_op = ReorderOp::create(rewriter, ods_loc,
+                                       /*tensor=*/tensor_value,
+                                       /*shape=*/shape_value);
     return success();
   }
 
@@ -1289,8 +1289,8 @@ struct ReorderFakeQuantPattern : public RewritePattern {
     for (auto v : casted_op.getODSResults(0)) {
       target_types.push_back(v.getType());
     }
-    fakequant_op = rewriter.create<TF::FakeQuantWithMinMaxVarsOp>(
-        ods_loc, target_types, target_values, target_attrs);
+    fakequant_op = TF::FakeQuantWithMinMaxVarsOp::create(
+        rewriter, ods_loc, target_types, target_values, target_attrs);
     return success();
   }
 
@@ -1442,13 +1442,14 @@ struct ConvertRfftToRfft2d : public RewritePattern {
 
     auto expaned_input_type = tensorflow::GetTypeFromTFTensorShape(
         expanded_input_shape, input_type.getElementType());
-    TF::ExpandDimsOp expanded_input = rewriter.create<TF::ExpandDimsOp>(
-        rfft_op.getLoc(), expaned_input_type, input, minus_two->getResult());
+    TF::ExpandDimsOp expanded_input =
+        TF::ExpandDimsOp::create(rewriter, rfft_op.getLoc(), expaned_input_type,
+                                 input, minus_two->getResult());
 
     // Expanded fft_len.
     auto one_attr = mlir::DenseIntElementsAttr::get(one_ele_type, {1});
 
-    auto one = rewriter.create<TF::ConstOp>(rfft_op.getLoc(), one_attr);
+    auto one = TF::ConstOp::create(rewriter, rfft_op.getLoc(), one_attr);
 
     auto zero = CreateConstOpWithSingleValue(&rewriter, rfft_op.getLoc(),
                                              one_ele_type, 0);
@@ -1456,21 +1457,22 @@ struct ConvertRfftToRfft2d : public RewritePattern {
     auto expanded_fft_len_type = tensorflow::GetTypeFromTFTensorShape(
         {2}, fft_len_type.getElementType());
 
-    TF::ConcatV2Op expanded_fft_len = rewriter.create<TF::ConcatV2Op>(
-        rfft_op.getLoc(), expanded_fft_len_type,
+    TF::ConcatV2Op expanded_fft_len = TF::ConcatV2Op::create(
+        rewriter, rfft_op.getLoc(), expanded_fft_len_type,
         SmallVector<Value, 2>({one.getResult(), fft_len}), zero->getResult());
 
     // Insert the rfft_2d.
     auto rfft2d_out_type = tensorflow::GetTypeFromTFTensorShape(
         expanded_output_shape, output_type.getElementType());
-    TF::RFFT2DOp rfft2d = rewriter.create<TF::RFFT2DOp>(
-        rfft_op.getLoc(), rfft2d_out_type, expanded_input.getResult(),
+    TF::RFFT2DOp rfft2d = TF::RFFT2DOp::create(
+        rewriter, rfft_op.getLoc(), rfft2d_out_type, expanded_input.getResult(),
         expanded_fft_len.getResult());
 
     // Insert the squeeze op.
     auto squeeze_dim = rewriter.getI64ArrayAttr({-2});
-    TF::SqueezeOp squeeze = rewriter.create<TF::SqueezeOp>(
-        rfft_op.getLoc(), output_type, rfft2d.getResult(), squeeze_dim);
+    TF::SqueezeOp squeeze =
+        TF::SqueezeOp::create(rewriter, rfft_op.getLoc(), output_type,
+                              rfft2d.getResult(), squeeze_dim);
 
     rewriter.replaceOp(op, squeeze.getResult());
 
@@ -1614,8 +1616,8 @@ class QuantizeConcatResult : public OpRewritePattern<TF::ConcatV2Op> {
     llvm::SmallVector<Value, 4> inputs{concat_result, min_v, max_v};
 
     rewriter.setInsertionPointAfter(concat.getOperation());
-    auto new_fake_quant_op = rewriter.create<TF::FakeQuantWithMinMaxVarsOp>(
-        concat.getLoc(), concat->getResultTypes(), inputs,
+    auto new_fake_quant_op = TF::FakeQuantWithMinMaxVarsOp::create(
+        rewriter, concat.getLoc(), concat->getResultTypes(), inputs,
         (*fake_quant_ops.begin())->getAttrs());
 
     for (OpOperand *use : uses) {
@@ -1673,8 +1675,9 @@ class QuantizeMeanResult : public OpRewritePattern<TF::MeanOp> {
     llvm::SmallVector<Value, 4> inputs{mean_result, fq.getMin(), fq.getMax()};
 
     rewriter.setInsertionPointAfter(mean.getOperation());
-    auto new_fake_quant_op = rewriter.create<TF::FakeQuantWithMinMaxVarsOp>(
-        mean.getLoc(), mean->getResultTypes(), inputs, fq->getAttrs());
+    auto new_fake_quant_op = TF::FakeQuantWithMinMaxVarsOp::create(
+        rewriter, mean.getLoc(), mean->getResultTypes(), inputs,
+        fq->getAttrs());
 
     for (OpOperand *use : uses) {
       use->assign(new_fake_quant_op);
diff --git a/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc b/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc
index 4dc0b4bf668043..82803f6de927cb 100644
--- a/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/push_transpose_through_ewise_pass.cc
@@ -173,8 +173,8 @@ class CommuteBothInputsTransposedWithEwiseOps : public RewritePattern {
                         new_out_type, op->getAttrs());
 
     // Apply original tranpose to output of ewise op.
-    auto out_tpose_op = rewriter.create<TFL::TransposeOp>(
-        new_ewise_op->getLoc(), op->getResult(0).getType(),
+    auto out_tpose_op = TFL::TransposeOp::create(
+        rewriter, new_ewise_op->getLoc(), op->getResult(0).getType(),
         new_ewise_op->getResults()[0], perm1);
     rewriter.replaceOp(op, out_tpose_op.getOperation());
     return success();
@@ -273,7 +273,7 @@ class CommuteTransposeWithEwiseOps : public RewritePattern {
           RankedTensorType::get(inverse_perm.size(), rewriter.getI32Type()),
           inverse_perm);
       auto inverse_perm_op =
-          rewriter.create<arith::ConstantOp>(perm.getLoc(), inverse_perm_attr);
+          arith::ConstantOp::create(rewriter, perm.getLoc(), inverse_perm_attr);
 
       // Transpose the input constant.
       auto in_rtt =
@@ -283,9 +283,9 @@ class CommuteTransposeWithEwiseOps : public RewritePattern {
           RankedTensorType::get(PermuteShape(in_rtt.getShape(), inverse_perm),
                                 in_rtt.getElementType());
 
-      tposed_const = rewriter.create<TFL::TransposeOp>(
-          cst_arg->getLoc(), inverse_type, cst_arg->getResult(0),
-          inverse_perm_op);
+      tposed_const =
+          TFL::TransposeOp::create(rewriter, cst_arg->getLoc(), inverse_type,
+                                   cst_arg->getResult(0), inverse_perm_op);
     }
 
     auto current_out_type =
@@ -301,8 +301,8 @@ class CommuteTransposeWithEwiseOps : public RewritePattern {
                         new_out_type, op->getAttrs());
 
     // Apply original tranpose to output of ewise op.
-    auto out_tpose_op = rewriter.create<TFL::TransposeOp>(
-        new_ewise_op->getLoc(), op->getResult(0).getType(),
+    auto out_tpose_op = TFL::TransposeOp::create(
+        rewriter, new_ewise_op->getLoc(), op->getResult(0).getType(),
         new_ewise_op->getResults()[0], perm);
     rewriter.replaceOp(op, out_tpose_op.getOperation());
     return success();
diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc
index d0c143d73914c9..c213c1ee498250 100644
--- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc
@@ -207,8 +207,9 @@ class PushForwardDrqFQ : public OpRewritePattern<stablehlo::CompositeOp> {
     Value float_input = drq_fq_op.getOperand(drq_fq_op.getNumOperands() - 1);
 
     // Create a new pad op.
-    auto new_pad_op = rewriter.create<TFL::PadOp>(
-        pad_op.getLoc(), pad_op.getType(), float_input, pad_op.getPadding());
+    auto new_pad_op =
+        TFL::PadOp::create(rewriter, pad_op.getLoc(), pad_op.getType(),
+                           float_input, pad_op.getPadding());
 
     // Create a new drq fake quant op.
     // Operands are the same, except for the last one.
@@ -218,8 +219,8 @@ class PushForwardDrqFQ : public OpRewritePattern<stablehlo::CompositeOp> {
     }
     new_drq_operands.push_back(new_pad_op.getResult());
 
-    auto new_drq_fq_op = rewriter.create<stablehlo::CompositeOp>(
-        drq_fq_op.getLoc(), pad_op.getType(), new_drq_operands,
+    auto new_drq_fq_op = stablehlo::CompositeOp::create(
+        rewriter, drq_fq_op.getLoc(), pad_op.getType(), new_drq_operands,
         drq_fq_op->getAttrs());
 
     rewriter.replaceOp(pad_op, new_drq_fq_op.getResult(0));
diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc
index 7453ed54975a5a..d6e18dc4158508 100644
--- a/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/quantize_variables.cc
@@ -114,11 +114,12 @@ void QuantizeVariablesPass::QuantizeVariable(
       // Add dequantize.
       builder.setInsertionPointAfter(read_variable_op);
       auto new_read_variable_op =
-          builder.create<ReadVariableOp>(read_variable_op.getLoc(), ref_qtype,
-                                         read_variable_op.getResourceId());
-      auto new_dq_op = builder.create<DequantizeOp>(
-          read_variable_op.getLoc(), read_variable_op.getResult().getType(),
-          new_read_variable_op.getResult());
+          ReadVariableOp::create(builder, read_variable_op.getLoc(), ref_qtype,
+                                 read_variable_op.getResourceId());
+      auto new_dq_op =
+          DequantizeOp::create(builder, read_variable_op.getLoc(),
+                               read_variable_op.getResult().getType(),
+                               new_read_variable_op.getResult());
       read_variable_op->replaceAllUsesWith(new_dq_op);
       read_variable_op.erase();
     }
@@ -135,19 +136,19 @@ void QuantizeVariablesPass::QuantizeVariable(
         if (qtype == quant::QuantizedType::getQuantizedElementType(ref_qtype)) {
           // Same quantization parameters, remove it.
           builder.setInsertionPoint(assign_variable_op);
-          auto new_assign_variable_op = builder.create<AssignVariableOp>(
-              assign_variable_op.getLoc(), assign_variable_op.getResourceId(),
-              dq_op.getInput());
+          auto new_assign_variable_op = AssignVariableOp::create(
+              builder, assign_variable_op.getLoc(),
+              assign_variable_op.getResourceId(), dq_op.getInput());
           assign_variable_op->replaceAllUsesWith(new_assign_variable_op);
         } else {
           // Otherwise, apply re-quantization.
           builder.setInsertionPoint(assign_variable_op);
-          auto new_q_op = builder.create<QuantizeOp>(
-              assign_variable_op.getLoc(), ref_qtype, dq_op.getInput(),
+          auto new_q_op = QuantizeOp::create(
+              builder, assign_variable_op.getLoc(), ref_qtype, dq_op.getInput(),
               TypeAttr::get(ref_qtype));
-          auto new_assign_variable_op = builder.create<AssignVariableOp>(
-              assign_variable_op.getLoc(), assign_variable_op.getResourceId(),
-              new_q_op.getResult());
+          auto new_assign_variable_op = AssignVariableOp::create(
+              builder, assign_variable_op.getLoc(),
+              assign_variable_op.getResourceId(), new_q_op.getResult());
           assign_variable_op->replaceAllUsesWith(new_assign_variable_op);
         }
         assign_variable_op.erase();
@@ -155,12 +156,12 @@ void QuantizeVariablesPass::QuantizeVariable(
       } else {
         // Add quantize op.
         builder.setInsertionPoint(assign_variable_op);
-        auto new_q_op = builder.create<QuantizeOp>(
-            assign_variable_op.getLoc(), ref_qtype,
+        auto new_q_op = QuantizeOp::create(
+            builder, assign_variable_op.getLoc(), ref_qtype,
             assign_variable_op.getValue(), TypeAttr::get(ref_qtype));
-        auto new_assign_variable_op = builder.create<AssignVariableOp>(
-            assign_variable_op.getLoc(), assign_variable_op.getResourceId(),
-            new_q_op.getResult());
+        auto new_assign_variable_op = AssignVariableOp::create(
+            builder, assign_variable_op.getLoc(),
+            assign_variable_op.getResourceId(), new_q_op.getResult());
         assign_variable_op->replaceAllUsesWith(new_assign_variable_op);
         assign_variable_op.erase();
       }
@@ -171,9 +172,9 @@ void QuantizeVariablesPass::QuantizeVariable(
     builder.setInsertionPoint(var_handle_op);
     auto output_type = UnrankedTensorType::get(TF::ResourceType::get(
         {mlir::cast<TensorType>(ref_qtype)}, builder.getContext()));
-    auto new_var_handle_op = builder.create<VarHandleOp>(
-        var_handle_op.getLoc(), output_type, var_handle_op.getContainer(),
-        var_handle_op.getSharedName());
+    auto new_var_handle_op = VarHandleOp::create(
+        builder, var_handle_op.getLoc(), output_type,
+        var_handle_op.getContainer(), var_handle_op.getSharedName());
     var_handle_op->replaceAllUsesWith(new_var_handle_op);
     var_handle_op.erase();
   }
diff --git a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc
index 80e0986209e8d0..58fff203b9fb3e 100644
--- a/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/raise_custom_ops.cc
@@ -80,8 +80,8 @@ void RaiseCustomOpsPass::runOnOperation() {
   for (auto *op : custom_ops) {
     builder.setInsertionPoint(op);
     Location loc = op->getLoc();
-    auto custom_op = builder.create<CustomTfOp>(loc, op->getResultTypes(),
-                                                op->getOperands());
+    auto custom_op = CustomTfOp::create(builder, loc, op->getResultTypes(),
+                                        op->getOperands());
     Region region;
     Block *new_block = new Block;
     region.push_back(new_block);
@@ -95,7 +95,7 @@ void RaiseCustomOpsPass::runOnOperation() {
       inner_op->setOperand(idx_args.index(), idx_args.value());
     }
     custom_op->setAttrs(inner_op->getAttrs());
-    builder.create<YieldOp>(loc, inner_op->getResults());
+    YieldOp::create(builder, loc, inner_op->getResults());
     custom_op.getBody().takeBody(region);
 
     op->replaceAllUsesWith(custom_op);
diff --git a/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc b/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc
index e964f76b30efbe..a0a6df9cf4feef 100644
--- a/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/reduce_type_precision.cc
@@ -106,12 +106,12 @@ class SanitizeGatherOpOutputToI4 : public OpRewritePattern<TFL::GatherOp> {
     }
 
     Builder builder(op.getContext());
-    auto new_gather_op = rewriter.create<TFL::GatherOp>(
-        op.getLoc(),
-        /*result=*/
-        mlir::cast<TensorType>(op.getResult().getType())
-            .clone(builder.getI4Type()),
-        /*operand=*/op.getOperands(), op->getAttrs());
+    auto new_gather_op =
+        TFL::GatherOp::create(rewriter, op.getLoc(),
+                              /*result=*/
+                              mlir::cast<TensorType>(op.getResult().getType())
+                                  .clone(builder.getI4Type()),
+                              /*operand=*/op.getOperands(), op->getAttrs());
     rewriter.replaceAllUsesWith(op.getResult(), new_gather_op.getResult());
 
     return success();
diff --git a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc
index 2b03557121652f..6f476ded0a1a62 100644
--- a/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/tflite_passes/unfold_large_splat_constants_pass.cc
@@ -60,23 +60,21 @@ void MaybeUnfoldLargeSplatConstant(mlir::OpBuilder* op_builder,
   }
 
   op_builder->setInsertionPoint(const_op);
-  mlir::arith::ConstantOp fill_shape =
-      op_builder->create<mlir::arith::ConstantOp>(
-          const_op->getLoc(), DenseIntElementsAttr::get(
-                                  tensorflow::GetTypeFromTFTensorShape(
-                                      {splat_elements_attr.getType().getRank()},
-                                      op_builder->getI64Type()),
-                                  splat_elements_attr.getType().getShape()));
-  mlir::arith::ConstantOp fill_value =
-      op_builder->create<mlir::arith::ConstantOp>(
-          const_op->getLoc(),
-          DenseElementsAttr::get(
-              tensorflow::GetTypeFromTFTensorShape(
-                  {}, splat_elements_attr.getType().getElementType()),
-              splat_elements_attr.getSplatValue<Attribute>()));
-  TFL::FillOp fill = op_builder->create<TFL::FillOp>(
-      const_op->getLoc(), splat_elements_attr.getType(), fill_shape,
-      fill_value);
+  mlir::arith::ConstantOp fill_shape = mlir::arith::ConstantOp::create(
+      *op_builder, const_op->getLoc(),
+      DenseIntElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
+                                    {splat_elements_attr.getType().getRank()},
+                                    op_builder->getI64Type()),
+                                splat_elements_attr.getType().getShape()));
+  mlir::arith::ConstantOp fill_value = mlir::arith::ConstantOp::create(
+      *op_builder, const_op->getLoc(),
+      DenseElementsAttr::get(
+          tensorflow::GetTypeFromTFTensorShape(
+              {}, splat_elements_attr.getType().getElementType()),
+          splat_elements_attr.getSplatValue<Attribute>()));
+  TFL::FillOp fill = TFL::FillOp::create(*op_builder, const_op->getLoc(),
+                                         splat_elements_attr.getType(),
+                                         fill_shape, fill_value);
   const_op->replaceAllUsesWith(fill);
   const_op->erase();
 }
diff --git a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h
index 4e0fb068c8b9e8..d9cab52085ef5b 100644
--- a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h
+++ b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h
@@ -150,10 +150,10 @@ class InsertTFLQuantOpsAfterTFFakeQuantOp {
     // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp
     // and its users.
     Value value = tf_op.getOutputs();
-    auto quantize = rewriter.create<TFL::QuantizeOp>(
-        tf_op.getLoc(), qtype.getValue(), value, qtype);
-    auto dequantize = rewriter.create<TFL::DequantizeOp>(
-        tf_op.getLoc(), res_type, quantize.getOutput());
+    auto quantize = TFL::QuantizeOp::create(rewriter, tf_op.getLoc(),
+                                            qtype.getValue(), value, qtype);
+    auto dequantize = TFL::DequantizeOp::create(rewriter, tf_op.getLoc(),
+                                                res_type, quantize.getOutput());
     value.replaceAllUsesWith(dequantize);
     quantize.getOperation()->replaceUsesOfWith(dequantize, value);
 
diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc
index a2023742140fce..f94cad6b5eabe7 100644
--- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc
@@ -52,14 +52,14 @@ Value CreateI32SplatConst(OpBuilder* builder, ArrayRef<int64_t> shape,
                           int32_t val, mlir::Location location) {
   auto type = RankedTensorType::get(shape, builder->getIntegerType(32));
   auto attr = DenseElementsAttr::get(type, val);
-  return builder->create<arith::ConstantOp>(location, type, attr);
+  return arith::ConstantOp::create(*builder, location, type, attr);
 }
 
 Value CreateF32SplatConst(OpBuilder* builder, ArrayRef<int64_t> shape,
                           float val, mlir::Location location) {
   auto type = RankedTensorType::get(shape, builder->getF32Type());
   auto attr = DenseElementsAttr::get(type, val);
-  return builder->create<arith::ConstantOp>(location, type, attr);
+  return arith::ConstantOp::create(*builder, location, type, attr);
 }
 
 Value CreatTfF32ConstOp(OpBuilder* builder, ArrayRef<int64_t> shape, float val,
@@ -67,7 +67,7 @@ Value CreatTfF32ConstOp(OpBuilder* builder, ArrayRef<int64_t> shape, float val,
   auto type = RankedTensorType::get(shape, builder->getF32Type());
   auto ele_type = RankedTensorType::get({1}, builder->getF32Type());
   auto attr = DenseElementsAttr::get(ele_type, val);
-  return builder->create<TF::ConstOp>(location, type, attr);
+  return TF::ConstOp::create(*builder, location, type, attr);
 }
 
 Value CreateI64DenseConst(OpBuilder* builder, ArrayRef<int64_t> shape,
@@ -75,7 +75,7 @@ Value CreateI64DenseConst(OpBuilder* builder, ArrayRef<int64_t> shape,
   auto type = RankedTensorType::get(static_cast<int>(shape.size()),
                                     builder->getIntegerType(64));
   auto attr = DenseElementsAttr::get(type, values);
-  return builder->create<arith::ConstantOp>(location, type, attr);
+  return arith::ConstantOp::create(*builder, location, type, attr);
 }
 
 Value CreateI32DenseConst(OpBuilder* builder, ArrayRef<int32_t> values,
@@ -83,12 +83,12 @@ Value CreateI32DenseConst(OpBuilder* builder, ArrayRef<int32_t> values,
   auto type = RankedTensorType::get(static_cast<int>(values.size()),
                                     builder->getIntegerType(32));
   auto attr = DenseElementsAttr::get(type, values);
-  return builder->create<arith::ConstantOp>(location, type, attr);
+  return arith::ConstantOp::create(*builder, location, type, attr);
 }
 
 Value CreateNoneValue(OpBuilder* builder, mlir::Location location) {
-  return builder->create<TFL::NoValueOp>(location, builder->getNoneType(),
-                                         builder->getUnitAttr());
+  return TFL::NoValueOp::create(*builder, location, builder->getNoneType(),
+                                builder->getUnitAttr());
 }
 
 Value Transpose(OpBuilder* builder, Value value_to_transpose,
@@ -106,8 +106,8 @@ Value Transpose(OpBuilder* builder, Value value_to_transpose,
   auto elem_type = transpose_type.getElementType();
   auto result_type = RankedTensorType::get(transpose_shape, elem_type);
 
-  return builder->create<TF::TransposeOp>(location, result_type,
-                                          value_to_transpose, perm_op);
+  return TF::TransposeOp::create(*builder, location, result_type,
+                                 value_to_transpose, perm_op);
 }
 
 Value Transpose2D(OpBuilder* builder, Value value_to_transpose,
@@ -121,8 +121,8 @@ Value Reverse(OpBuilder* builder, Value value_to_reverse, int axis,
               RankedTensorType type, mlir::Location location) {
   auto axis_op = CreateI32SplatConst(builder, {1}, axis, location);
   // The result type will be the same as the input.
-  return builder->create<TF::ReverseV2Op>(location, type, value_to_reverse,
-                                          axis_op);
+  return TF::ReverseV2Op::create(*builder, location, type, value_to_reverse,
+                                 axis_op);
 }
 
 ArrayRef<int64_t> GetRankedTensorShape(Value value) {
@@ -154,8 +154,8 @@ Value SliceRankedTensor(OpBuilder* builder, Value input,
   auto slice_i2c_size =
       CreateI64DenseConst(builder, size_shape, size_values, location);
 
-  return builder->create<TF::SliceOp>(
-      location,
+  return TF::SliceOp::create(
+      *builder, location,
       RankedTensorType::get(
           size_values,
           mlir::cast<RankedTensorType>(input.getType()).getElementType()),
@@ -175,9 +175,9 @@ Value CreateStridedSliceOp(mlir::Location loc, ArrayRef<int64_t> output_shape,
   auto end_tensor = CreateI32DenseConst(builder, end, loc);
   auto strides_tensor = CreateI32DenseConst(builder, strides, loc);
 
-  return builder->create<TF::StridedSliceOp>(
-      loc, output_type, input, begin_tensor, end_tensor, strides_tensor,
-      builder->getI64IntegerAttr(begin_mask),
+  return TF::StridedSliceOp::create(
+      *builder, loc, output_type, input, begin_tensor, end_tensor,
+      strides_tensor, builder->getI64IntegerAttr(begin_mask),
       builder->getI64IntegerAttr(end_mask),
       builder->getI64IntegerAttr(ellipsis_mask),
       builder->getI64IntegerAttr(new_axis_mask),
@@ -590,21 +590,20 @@ TF::ConstOp Create1DConstantOp(const std::vector<int>& value, Location loc,
   auto type =
       mlir::RankedTensorType::get(value.size(), builder->getIntegerType(32));
   auto dense_values = mlir::DenseIntElementsAttr::get(type, value);
-  return builder->create<TF::ConstOp>(loc, dense_values);
+  return TF::ConstOp::create(*builder, loc, dense_values);
 }
 
 TF::ConstOp CreateScalarConstantOp(int value, Location loc,
                                    OpBuilder* builder) {
-  return builder->create<TF::ConstOp>(loc, builder->getI32IntegerAttr(value));
+  return TF::ConstOp::create(*builder, loc, builder->getI32IntegerAttr(value));
 }
 
 TF::ReshapeOp CreateFlattenOP(const Value& input, Location loc,
                               OpBuilder* builder) {
   auto output_shape = Create1DConstantOp({-1}, loc, builder);
-  return builder->create<mlir::TF::ReshapeOp>(
-      loc,
-      /*tensor=*/input,
-      /*shape=*/output_shape.getResult());
+  return mlir::TF::ReshapeOp::create(*builder, loc,
+                                     /*tensor=*/input,
+                                     /*shape=*/output_shape.getResult());
 }
 
 LogicalResult CreateEqualSizeSplitVOp(Value input, int axis, int splits,
@@ -637,9 +636,9 @@ LogicalResult CreateEqualSizeSplitVOp(Value input, int axis, int splits,
       builder);
 
   auto axis_op = CreateScalarConstantOp(axis, loc, builder);
-  *result = builder->create<TF::SplitVOp>(loc, output_types, input,
-                                          size_of_splits_op.getResult(),
-                                          axis_op.getResult());
+  *result =
+      TF::SplitVOp::create(*builder, loc, output_types, input,
+                           size_of_splits_op.getResult(), axis_op.getResult());
   return success();
 }
 
@@ -771,8 +770,8 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op,
       mlir::cast<RankedTensorType>(final_inputs.getType()).getElementType());
 
   Value none = CreateNoneValue(builder, func_op.getLoc());
-  auto lstm = builder->create<mlir::TFL::UnidirectionalSequenceLSTMOp>(
-      func_op.getLoc(), result_type, /*input=*/final_inputs,
+  auto lstm = mlir::TFL::UnidirectionalSequenceLSTMOp::create(
+      *builder, func_op.getLoc(), result_type, /*input=*/final_inputs,
       /*input_to_input_weights=*/weights_array->getResult(0),
       /*input_to_forget_weights=*/weights_array->getResult(1),
       /*input_to_cell_weights=*/weights_array->getResult(2),
@@ -881,7 +880,7 @@ LogicalResult ConvertKerasLSTMLayer(mlir::func::FuncOp func_op,
                                           func_op.getFunctionType().getInputs(),
                                           output_types));
 
-  builder->create<mlir::func::ReturnOp>(func_op.getLoc(), outputs);
+  mlir::func::ReturnOp::create(*builder, func_op.getLoc(), outputs);
   return success();
 }
 
diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc
index 211336de124075..59c3f883411221 100644
--- a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc
@@ -57,11 +57,11 @@ void ConvertNMSPaddedFunc::RewriteFunc() {
   auto output_type1 = func_.getFunctionType().getResult(1);
 
   OpBuilder builder(func_.getBody());
-  auto op = builder.create<mlir::TFL::NonMaxSuppressionV4Op>(
-      func_.getLoc(), output_type0, output_type1, boxes, scores,
+  auto op = mlir::TFL::NonMaxSuppressionV4Op::create(
+      builder, func_.getLoc(), output_type0, output_type1, boxes, scores,
       max_output_size, iou_threshold, score_threshold);
 
-  builder.create<mlir::func::ReturnOp>(func_.getLoc(), op.getResults());
+  mlir::func::ReturnOp::create(builder, func_.getLoc(), op.getResults());
 }
 
 LogicalResult ConvertNMSPaddedFunc::VerifySignature() {
@@ -102,11 +102,11 @@ LogicalResult ConvertSSDPostProcessFunc::RewriteFunc() {
                                     custom_option_buffer))) {
     return failure();
   }
-  auto op = builder.create<CustomOp>(
-      func_.getLoc(), func_.getFunctionType().getResults(),
-      func_.getArguments(), kCustomSSDPostprocessing,
-      CustomOption(&builder, custom_option_buffer));
-  builder.create<func::ReturnOp>(func_.getLoc(), op.getResults());
+  auto op = CustomOp::create(builder, func_.getLoc(),
+                             func_.getFunctionType().getResults(),
+                             func_.getArguments(), kCustomSSDPostprocessing,
+                             CustomOption(&builder, custom_option_buffer));
+  func::ReturnOp::create(builder, func_.getLoc(), op.getResults());
 
   return success();
 }
diff --git a/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc b/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc
index 5f680c7db9be58..4bcf4b86e0ea17 100644
--- a/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/perception_ops_utils.cc
@@ -104,11 +104,11 @@ LogicalResult ConvertMaxUnpoolingFunc::RewriteFunc() {
   if (failed(CreateCustomOptions(custom_option_buffer))) {
     return failure();
   }
-  auto op = builder.create<CustomOp>(
-      func_.getLoc(), func_.getFunctionType().getResults(),
-      func_.getArguments(), kMaxUnpooling,
-      CustomOption(&builder, custom_option_buffer));
-  builder.create<func::ReturnOp>(func_.getLoc(), op.getResults());
+  auto op = CustomOp::create(builder, func_.getLoc(),
+                             func_.getFunctionType().getResults(),
+                             func_.getArguments(), kMaxUnpooling,
+                             CustomOption(&builder, custom_option_buffer));
+  func::ReturnOp::create(builder, func_.getLoc(), op.getResults());
 
   return success();
 }
@@ -205,11 +205,11 @@ LogicalResult ConvertDenseImageWarpFunc::RewriteFunc() {
                  StringAttr::get(func_.getContext(), kImageWarping));
 
   OpBuilder builder(func_.getBody());
-  auto op = builder.create<CustomOp>(func_.getLoc(),
-                                     func_.getFunctionType().getResults(),
-                                     func_.getArguments(), kImageWarping,
-                                     CustomOption(&builder, /*content=*/""));
-  builder.create<func::ReturnOp>(func_.getLoc(), op.getResults());
+  auto op = CustomOp::create(builder, func_.getLoc(),
+                             func_.getFunctionType().getResults(),
+                             func_.getArguments(), kImageWarping,
+                             CustomOption(&builder, /*content=*/""));
+  func::ReturnOp::create(builder, func_.getLoc(), op.getResults());
 
   return success();
 }
diff --git a/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc b/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc
index 3c136be98ef071..f3917e32d91126 100644
--- a/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc
+++ b/tensorflow/compiler/mlir/lite/utils/region_isolation_test.cc
@@ -85,7 +85,7 @@ TEST(RegionIsolationTest, CaseOp) {
 
   OpBuilder b(&ctx);
 
-  OwningOpRef<ModuleOp> root(b.create<ModuleOp>(b.getUnknownLoc()));
+  OwningOpRef<ModuleOp> root(ModuleOp::create(b, b.getUnknownLoc()));
 
   {
     auto& block = root->getBodyRegion().front();
diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc
index fa191c6c69d984..a402deb4bc230e 100644
--- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc
+++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc
@@ -136,10 +136,10 @@ LogicalResult ConvertWhitespaceTokenizer(func::FuncOp func, llvm::StringRef api,
   func->setAttr(kTFImplements, attr);
   OpBuilder builder(func.getBody());
   std::string empty_option_buffer;
-  auto op = builder.create<CustomOp>(
-      func.getLoc(), func.getFunctionType().getResults(), func.getArguments(),
-      api, CustomOption(&builder, empty_option_buffer));
-  builder.create<func::ReturnOp>(func.getLoc(), op.getResults());
+  auto op = CustomOp::create(
+      builder, func.getLoc(), func.getFunctionType().getResults(),
+      func.getArguments(), api, CustomOption(&builder, empty_option_buffer));
+  func::ReturnOp::create(builder, func.getLoc(), op.getResults());
   return success();
 }
 
@@ -267,10 +267,10 @@ LogicalResult ConvertNgrams(func::FuncOp func, llvm::StringRef api,
                                       custom_option_buffer))) {
     return failure();
   }
-  auto op = builder.create<CustomOp>(
-      func.getLoc(), func.getFunctionType().getResults(), func.getArguments(),
-      api, CustomOption(&builder, custom_option_buffer));
-  builder.create<func::ReturnOp>(func.getLoc(), op.getResults());
+  auto op = CustomOp::create(
+      builder, func.getLoc(), func.getFunctionType().getResults(),
+      func.getArguments(), api, CustomOption(&builder, custom_option_buffer));
+  func::ReturnOp::create(builder, func.getLoc(), op.getResults());
   return success();
 }
 
@@ -350,10 +350,10 @@ LogicalResult ConvertSgnnProjection(func::FuncOp func, llvm::StringRef api,
                                               custom_option_buffer))) {
     return failure();
   }
-  auto op = builder.create<CustomOp>(
-      func.getLoc(), func.getFunctionType().getResults(), func.getArguments(),
-      api, CustomOption(&builder, custom_option_buffer));
-  builder.create<func::ReturnOp>(func.getLoc(), op.getResults());
+  auto op = CustomOp::create(
+      builder, func.getLoc(), func.getFunctionType().getResults(),
+      func.getArguments(), api, CustomOption(&builder, custom_option_buffer));
+  func::ReturnOp::create(builder, func.getLoc(), op.getResults());
   return success();
 }
 }  // namespace
diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h
index e94f9359d6fad2..89896d69079c28 100644
--- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h
+++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h
@@ -89,12 +89,12 @@ Value CreateConstValue(OpBuilder& builder, const Location loc,
         RankedTensorType::get(shape, builder.getIntegerType(sizeof(T) * 8));
 
     const auto attr = DenseIntElementsAttr::get(shape_type, values);
-    return builder.create<TF::ConstOp>(loc, attr);
+    return TF::ConstOp::create(builder, loc, attr);
   }
 
   const auto type = RankedTensorType::get(shape, builder.getF32Type());
   const auto value_attr = DenseFPElementsAttr::get(type, values);
-  return builder.create<TF::ConstOp>(loc, value_attr);
+  return TF::ConstOp::create(builder, loc, value_attr);
 }
 
 // Creates a 1D array with integer/float type.
diff --git a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h
index 324b70c8fbe573..5f43083540831f 100644
--- a/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h
+++ b/tensorflow/compiler/mlir/quantization/common/quantization_lib/quantization_utils.h
@@ -288,10 +288,10 @@ struct ConvertStatsToQDQs
     rewriter.setInsertionPointAfter(op.getOperation());
     Type result_type = quant_type.castFromExpressedType(op.getType());
     auto q =
-        rewriter.create<QuantizeOpT>(op.getLoc(), result_type, op.getArg());
+        QuantizeOpT::create(rewriter, op.getLoc(), result_type, op.getArg());
     q->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr());
 
-    auto dq = rewriter.create<DequantizeOpT>(op.getLoc(), op.getType(), q);
+    auto dq = DequantizeOpT::create(rewriter, op.getLoc(), op.getType(), q);
     op.getResult().replaceAllUsesWith(dq);
     q.getOperation()->replaceUsesOfWith(dq, op.getArg());
     op.erase();
@@ -644,8 +644,8 @@ class QuantizationPattern : public RewritePattern {
             if (!matchPattern(q.getOperand(), m_Constant(&attr))) {
               continue;
             }
-            auto cst = rewriter.create<arith::ConstantOp>(
-                quantized_op->getLoc(), attr);
+            auto cst = arith::ConstantOp::create(rewriter,
+                                                 quantized_op->getLoc(), attr);
             quantizing_op->setOperand(i, cst.getResult());
           }
         }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
index 5ce7217927771b..4203d7824844f9 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_to_mhlo_int_test.cc
@@ -125,9 +125,9 @@ class ConvertTfQuantToMhloIntTest : public Test {
       // can't lower tf.Const.
       Value cst;
       if (use_mhlo_const) {
-        cst = builder.create<mhlo::ConstantOp>(func_op->getLoc(), attrs);
+        cst = mhlo::ConstantOp::create(builder, func_op->getLoc(), attrs);
       } else {
-        cst = builder.create<TF::ConstOp>(func_op->getLoc(), attrs);
+        cst = TF::ConstOp::create(builder, func_op->getLoc(), attrs);
       }
       func_op.getArgument(i).replaceAllUsesWith(cst);
     }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
index 46d408b06d05e4..cc63c246434934 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_types.cc
@@ -241,8 +241,8 @@ class TFUniformQuantizedOpsPattern : public ConversionPattern {
       Type orig_op_type = op->getOperandTypes()[i];
       if (IsIllegalType(orig_op_type) &&
           !IsQintValueDefinedByIntToQintCast(op->getOperand(i))) {
-        new_operands.push_back(rewriter.create<TF::CastOp>(
-            op->getLoc(), orig_op_type, operands[i]));
+        new_operands.push_back(TF::CastOp::create(rewriter, op->getLoc(),
+                                                  orig_op_type, operands[i]));
       } else {
         new_operands.push_back(operands[i]);
       }
@@ -261,8 +261,8 @@ class TFUniformQuantizedOpsPattern : public ConversionPattern {
       Value &result = new_results[i];
       if (IsIllegalType(result.getType()) &&
           !IsQintValueQintToIntCast(op->getResult(i))) {
-        result = rewriter.create<TF::CastOp>(
-            op->getLoc(), ToLegalType(result.getType()), result);
+        result = TF::CastOp::create(rewriter, op->getLoc(),
+                                    ToLegalType(result.getType()), result);
       }
       // If the result is already consumed by qint->int CastOp, manually replace
       // its use by the new UQ op. This is because such CastOp is already legal,
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc
index b7903b433757b5..1dd93a9b2c165e 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc
@@ -177,15 +177,17 @@ class BitcastConvertOpPattern
       return failure();
     } else if (is_input_legal) {
       // output is f32, we bitcast_convert to f32 and then convert to bf16.
-      const Value output = rewriter.create<mlir::stablehlo::BitcastConvertOp>(
-          op->getLoc(), op.getResult().getType(), adaptor.getOperand());
+      const Value output = mlir::stablehlo::BitcastConvertOp::create(
+          rewriter, op->getLoc(), op.getResult().getType(),
+          adaptor.getOperand());
       rewriter.replaceOpWithNewOp<mlir::stablehlo::ConvertOp>(
           op, getTypeConverter()->convertType(op.getResult().getType()),
           output);
     } else if (is_output_legal) {
       // input is f32, we convert from bf16 and then bitcast_convert.
-      const Value output = rewriter.create<mlir::stablehlo::ConvertOp>(
-          op->getLoc(), op.getOperand().getType(), adaptor.getOperand());
+      const Value output = mlir::stablehlo::ConvertOp::create(
+          rewriter, op->getLoc(), op.getOperand().getType(),
+          adaptor.getOperand());
       rewriter.replaceOpWithNewOp<mlir::stablehlo::BitcastConvertOp>(
           op, op.getResult().getType(), output);
     } else {
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc
index 7ee6bbd98f61e6..a63ffb1504bd85 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc
@@ -123,8 +123,8 @@ void ConvertXlaCallModuleOpToBfloat16Pass::runOnOperation() {
     builder.setInsertionPoint(op);
     for (auto& op_operand : op->getOpOperands()) {
       if (quant::stablehlo::IsLargeFloatType(op_operand.get().getType())) {
-        op_operand.set(builder.create<TF::CastOp>(
-            op->getLoc(),
+        op_operand.set(TF::CastOp::create(
+            builder, op->getLoc(),
             quant::stablehlo::ToBfloat16Type(op_operand.get().getType()),
             op_operand.get()));
       }
@@ -135,7 +135,7 @@ void ConvertXlaCallModuleOpToBfloat16Pass::runOnOperation() {
         const Type original_type = op_result.getType();
         op_result.setType(quant::stablehlo::ToBfloat16Type(original_type));
         const Value cast =
-            builder.create<TF::CastOp>(op->getLoc(), original_type, op_result);
+            TF::CastOp::create(builder, op->getLoc(), original_type, op_result);
         op_result.replaceAllUsesExcept(cast, cast.getDefiningOp());
       }
     }
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc
index 0fdefd7342624c..08befa7708297c 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/defer_activation_transpose.cc
@@ -58,8 +58,8 @@ LogicalResult IsTransposeOpWithPermuation(Operation* absl_nullable op,
 // The Location is set as `input`'s loc.
 TransposeOp CreateTransposeOp(Value input, const ArrayRef<int64_t> permutation,
                               PatternRewriter& rewriter) {
-  return rewriter.create<TransposeOp>(
-      input.getLoc(), input, rewriter.getDenseI64ArrayAttr(permutation));
+  return TransposeOp::create(rewriter, input.getLoc(), input,
+                             rewriter.getDenseI64ArrayAttr(permutation));
 }
 
 // Defers the transpose of the left-hand side (LHS) to the right-hand side and
@@ -77,7 +77,7 @@ void DeferRhsTransposeForBinaryOp(OpT op, PatternRewriter& rewriter) {
       /*input=*/rhs, kNchwToNhwcPermutation, rewriter);
 
   auto new_binary_op =
-      rewriter.create<OpT>(op.getLoc(), lhs_pre_transpose, rhs_transpose_op);
+      OpT::create(rewriter, op.getLoc(), lhs_pre_transpose, rhs_transpose_op);
 
   // NHWC -> NCHW for the output, to match the shapes of `op`'s users.
   TransposeOp output_transpose_op = CreateTransposeOp(
@@ -166,23 +166,22 @@ class DeferActivationTransposeForMaxPoolReduceWindowOp
 
     // Create a new `stablehlo.reduce_window` with all relevant attributes
     // permutated to match the new operand & result type.
-    auto new_reduce_window_op =
-        rewriter.create<mlir::stablehlo::ReduceWindowOp>(
-            op.getLoc(), new_result_type, transpose_op.getOperand(),
-            /*init_value=*/op.getOperand(1),
-            /*window_dimensions=*/
-            PermuteI64ArrayAttr(rewriter, op.getWindowDimensions(),
-                                kNchwToNhwcPermutation),
-            /*window_strides=*/
-            PermuteI64ArrayAttr(rewriter, op.getWindowStrides(),
-                                kNchwToNhwcPermutation),
-            /*base_dilations=*/
-            PermuteI64ArrayAttr(rewriter, op.getBaseDilations(),
-                                kNchwToNhwcPermutation),
-            /*window_dilations=*/
-            PermuteI64ArrayAttr(rewriter, op.getWindowDilations(),
-                                kNchwToNhwcPermutation),
-            /*padding=*/DenseIntElementsAttr(nullptr));
+    auto new_reduce_window_op = mlir::stablehlo::ReduceWindowOp::create(
+        rewriter, op.getLoc(), new_result_type, transpose_op.getOperand(),
+        /*init_value=*/op.getOperand(1),
+        /*window_dimensions=*/
+        PermuteI64ArrayAttr(rewriter, op.getWindowDimensions(),
+                            kNchwToNhwcPermutation),
+        /*window_strides=*/
+        PermuteI64ArrayAttr(rewriter, op.getWindowStrides(),
+                            kNchwToNhwcPermutation),
+        /*base_dilations=*/
+        PermuteI64ArrayAttr(rewriter, op.getBaseDilations(),
+                            kNchwToNhwcPermutation),
+        /*window_dilations=*/
+        PermuteI64ArrayAttr(rewriter, op.getWindowDilations(),
+                            kNchwToNhwcPermutation),
+        /*padding=*/DenseIntElementsAttr(nullptr));
 
     // Clone the reduce body. It is not affected by the permutation.
     IRMapping mapping;
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
index 699f157e3d1b39..f4648f9a0a0362 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/fold_constant_transpose.cc
@@ -160,8 +160,8 @@ class FoldTransposedConstantOp
                                      /*elementType=*/rewriter.getF32Type());
     auto new_value_attr =
         DenseFPElementsAttr::get(new_value_type, std::move(transposed_values));
-    auto new_const_op = rewriter.create<mlir::stablehlo::ConstantOp>(
-        combined_loc, new_value_attr);
+    auto new_const_op = mlir::stablehlo::ConstantOp::create(
+        rewriter, combined_loc, new_value_attr);
 
     rewriter.replaceAllUsesWith(op, new_const_op);
     return success();
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc
index e855c51749e6d5..05a826b14b010a 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/insert_calibration_statistics_saver.cc
@@ -86,9 +86,9 @@ LogicalResult InsertCalibrationStatisticsSaverOp(
   ArrayAttr ids_attr = builder.getStrArrayAttr(ids);
   ArrayAttr calibration_methods_attr =
       builder.getI32ArrayAttr(calibration_methods);
-  builder.create<TF::CalibrationStatisticsSaverOp>(
-      region.getLoc(), statistics_outputs, output_file_path_attr, ids_attr,
-      calibration_methods_attr);
+  TF::CalibrationStatisticsSaverOp::create(
+      builder, region.getLoc(), statistics_outputs, output_file_path_attr,
+      ids_attr, calibration_methods_attr);
   return success();
 }
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc
index c72879c2e04a4d..71a5b35e351495 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/merge_fusion_with_dequantize.cc
@@ -114,20 +114,21 @@ class MergeFusionWithUniformDequantizePattern
 
     // Modify the quantized fused function to do dequantize+relu(6).
     rewriter.setInsertionPoint(req_op);
-    Value new_result = rewriter.create<mlir::stablehlo::UniformDequantizeOp>(
-        req_op.getLoc(), func_op.getResultTypes()[0], req_op.getOperand());
+    Value new_result = mlir::stablehlo::UniformDequantizeOp::create(
+        rewriter, req_op.getLoc(), func_op.getResultTypes()[0],
+        req_op.getOperand());
     if (func_name.contains("_relu6_")) {
-      auto min = rewriter.create<mlir::stablehlo::ConstantOp>(
-          req_op.getLoc(), rewriter.getF32FloatAttr(0));
-      auto max = rewriter.create<mlir::stablehlo::ConstantOp>(
-          req_op.getLoc(), rewriter.getF32FloatAttr(6));
-      new_result = rewriter.create<mlir::stablehlo::ClampOp>(
-          req_op.getLoc(), min, new_result, max);
+      auto min = mlir::stablehlo::ConstantOp::create(
+          rewriter, req_op.getLoc(), rewriter.getF32FloatAttr(0));
+      auto max = mlir::stablehlo::ConstantOp::create(
+          rewriter, req_op.getLoc(), rewriter.getF32FloatAttr(6));
+      new_result = mlir::stablehlo::ClampOp::create(rewriter, req_op.getLoc(),
+                                                    min, new_result, max);
     } else if (func_name.contains("_relu_")) {
-      auto min = rewriter.create<mlir::stablehlo::ConstantOp>(
-          req_op.getLoc(), rewriter.getF32FloatAttr(0));
-      new_result = rewriter.create<mlir::chlo::BroadcastMaxOp>(
-          req_op.getLoc(), min, new_result, nullptr);
+      auto min = mlir::stablehlo::ConstantOp::create(
+          rewriter, req_op.getLoc(), rewriter.getF32FloatAttr(0));
+      new_result = mlir::chlo::BroadcastMaxOp::create(rewriter, req_op.getLoc(),
+                                                      min, new_result, nullptr);
     }
     return_op->setOperand(0, new_result);
     rewriter.eraseOp(req_op);
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
index 51950c5513c5df..1c425487799962 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/nchw_convolution_to_nhwc.cc
@@ -76,8 +76,9 @@ class RewriteNchwConvolutionToNhwc
     const TensorType new_input_tensor_type = GetTransposedTensorType(
         mlir::cast<TensorType>(input.getType()), kNchwToNhwcPermutation);
 
-    auto input_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
-        op.getLoc(), /*resultType0=*/new_input_tensor_type, /*operand=*/input,
+    auto input_transpose_op = mlir::stablehlo::TransposeOp::create(
+        rewriter, op.getLoc(), /*resultType0=*/new_input_tensor_type,
+        /*operand=*/input,
         rewriter.getDenseI64ArrayAttr(kNchwToNhwcPermutation));
 
     // Transpose the filter tensor: [o, i, 0, 1] => [0, 1, i, o]
@@ -85,8 +86,9 @@ class RewriteNchwConvolutionToNhwc
     const TensorType new_filter_tensor_type = GetTransposedTensorType(
         mlir::cast<TensorType>(filter.getType()), kOihwToHwioPermutation);
 
-    auto filter_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
-        op.getLoc(), /*resultType0=*/new_filter_tensor_type, /*operand=*/filter,
+    auto filter_transpose_op = mlir::stablehlo::TransposeOp::create(
+        rewriter, op.getLoc(), /*resultType0=*/new_filter_tensor_type,
+        /*operand=*/filter,
         rewriter.getDenseI64ArrayAttr(kOihwToHwioPermutation));
 
     // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]
@@ -108,8 +110,8 @@ class RewriteNchwConvolutionToNhwc
     // reused without modification because the ordering of spatial dimensions
     // is not modified (i.e. before: [b, f, 0, 1], after: [b, 0, 1, f] => the
     // spatial dimension is still ordered as {0, 1}).
-    auto new_convolution_op = rewriter.create<mlir::stablehlo::ConvolutionOp>(
-        op.getLoc(), /*resultType0=*/new_conv_output_tensor_type,
+    auto new_convolution_op = mlir::stablehlo::ConvolutionOp::create(
+        rewriter, op.getLoc(), /*resultType0=*/new_conv_output_tensor_type,
         /*lhs=*/input_transpose_op,
         /*rhs=*/filter_transpose_op,
         /*window_strides=*/op.getWindowStridesAttr(),
@@ -125,8 +127,9 @@ class RewriteNchwConvolutionToNhwc
     // Transpose the output of the `ConvolutionOp` back to the original op's
     // output shape so that users' shapes match.
     // [b, 0, 1, f] => [b, f, 0, 1]
-    auto output_transpose_op = rewriter.create<mlir::stablehlo::TransposeOp>(
-        new_convolution_op.getLoc(), /*resultType0=*/output_tensor_type,
+    auto output_transpose_op = mlir::stablehlo::TransposeOp::create(
+        rewriter, new_convolution_op.getLoc(),
+        /*resultType0=*/output_tensor_type,
         /*operand=*/new_convolution_op,
         rewriter.getDenseI64ArrayAttr(kNhwcToNchwPermutation));
 
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc
index 2102e64f223d55..4dff113b6427c9 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc
@@ -95,8 +95,8 @@ class MergeConsecutiveQuantizeCast
         q_op.getArg().getDefiningOp<mlir::quant::ir::QuantizeCastOp>();
     if (!preceding_qcast) return failure();
 
-    auto new_qcast = rewriter.create<mlir::quant::ir::QuantizeCastOp>(
-        q_op.getLoc(), q_op.getType(), preceding_qcast.getArg());
+    auto new_qcast = mlir::quant::ir::QuantizeCastOp::create(
+        rewriter, q_op.getLoc(), q_op.getType(), preceding_qcast.getArg());
     new_qcast->setAttr(kVolatileOpAttrName, rewriter.getUnitAttr());
     q_op->replaceAllUsesWith(new_qcast);
     return success();
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc
index 9d0a978bdb8efc..e65d5423458f50 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc
@@ -163,8 +163,8 @@ class QuantizeWeight : public OpRewritePattern<ConstantOp> {
       }
     }
     rewriter.setInsertionPointAfter(op);
-    ConvertOp new_convert_op = rewriter.create<ConvertOp>(
-        op->getLoc(), new_result_type, op.getResult());
+    ConvertOp new_convert_op = ConvertOp::create(
+        rewriter, op->getLoc(), new_result_type, op.getResult());
     quantizable_op->setOperand(quantize_operand_num,
                                new_convert_op.getResult());
   }
@@ -203,10 +203,10 @@ class QuantizeWeight : public OpRewritePattern<ConstantOp> {
       // of its number of users.
       rewriter.setInsertionPointAfter(op);
       // create new F16 constant op in that location
-      ConstantOp new_const = rewriter.create<ConstantOp>(
-          op->getLoc(), new_result_type, new_value_attr);
+      ConstantOp new_const = ConstantOp::create(
+          rewriter, op->getLoc(), new_result_type, new_value_attr);
       ConvertOp dcast =
-          rewriter.create<ConvertOp>(op->getLoc(), old_result_type, new_const);
+          ConvertOp::create(rewriter, op->getLoc(), old_result_type, new_const);
       // replace all convert ops with dq op.
       convert_op->replaceAllUsesWith(dcast);
       // Return without scanning for the next ConvertOp as only one ConvertOp is
diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc
index ac1f5e8d705d49..46da2a3f25b82c 100644
--- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc
+++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/unwrap_xla_call_module_op.cc
@@ -82,8 +82,8 @@ void UnwrapXlaCallModuleOp(TF::XlaCallModuleOp call_op,
   // TODO: b/310291615 - find a better way for multi-platform support.
   if (call_op_has_platform_index_arg) {
     arg_mapper.map(func_op.getArgument(0),
-                   builder.create<mhlo::ConstantOp>(
-                       func_op.getLoc(), builder.getI16IntegerAttr(0)));
+                   mhlo::ConstantOp::create(builder, func_op.getLoc(),
+                                            builder.getI16IntegerAttr(0)));
   }
   for (auto [func_arg, operand] : llvm::zip_equal(
            func_op.getArguments().take_back(call_op.getNumOperands()),
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc
index 5cef40a8e77c0e..42bf32a27e7bee 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.cc
@@ -67,7 +67,7 @@ LogicalResult FoldOperation(OpBuilder& builder, Operation* op,
   results.clear();
   builder.setInsertionPointAfter(op);
   for (const auto& result_value : result_values) {
-    results.push_back(builder.create<TF::ConstOp>(op->getLoc(), result_value));
+    results.push_back(TF::ConstOp::create(builder, op->getLoc(), result_value));
   }
   return success();
 }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc
index 430056668af8ae..c2339fe044edd7 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/cast_bf16_ops_to_f32.cc
@@ -91,8 +91,8 @@ class CastBf16OpsToF32 : public RewritePattern {
     for (int i = 0; i < op->getNumOperands(); i++) {
       Value input = op->getOperand(i);
       if (getElementTypeOrSelf(input).isBF16()) {
-        Value f32_cast = rewriter.create<TF::CastOp>(
-            op->getLoc(),
+        Value f32_cast = TF::CastOp::create(
+            rewriter, op->getLoc(),
             CloneTypeWithNewElementType(input.getType(), rewriter.getF32Type()),
             input);
         op->setOperand(i, f32_cast);
@@ -108,8 +108,8 @@ class CastBf16OpsToF32 : public RewritePattern {
         for (Operation* user : op->getUsers()) {
           for (int i = 0; i < user->getNumOperands(); i++) {
             if (user->getOperand(i) == value) {
-              Value bf16_cast = rewriter.create<TF::CastOp>(
-                  user->getLoc(),
+              Value bf16_cast = TF::CastOp::create(
+                  rewriter, user->getLoc(),
                   CloneTypeWithNewElementType(value.getType(),
                                               rewriter.getBF16Type()),
                   value);
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc
index bfbb8b45c2d80c..2ae814880fc2ff 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize_drq.cc
@@ -241,10 +241,10 @@ class PrepareDRQQuantizableOp : public OpRewritePattern<arith::ConstantOp> {
       }
     }
     rewriter.setInsertionPointAfter(op);
-    auto q = rewriter.create<mlir::quant::ir::QuantizeCastOp>(
-        op->getLoc(), cast_type, op.getResult());
-    auto dq = rewriter.create<mlir::quant::ir::DequantizeCastOp>(
-        op->getLoc(), expressed_type, q);
+    auto q = mlir::quant::ir::QuantizeCastOp::create(rewriter, op->getLoc(),
+                                                     cast_type, op.getResult());
+    auto dq = mlir::quant::ir::DequantizeCastOp::create(rewriter, op->getLoc(),
+                                                        expressed_type, q);
     quantize_op->setOperand(quantize_operand_num, dq.getResult());
     return true;
   }
diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc
index 2ef9eeecc7bc2d..0c42b760557c51 100644
--- a/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc
+++ b/tensorflow/compiler/mlir/quantization/tensorflow/utils/tf_to_xla_attribute_utils.cc
@@ -32,8 +32,8 @@ namespace {
 Value GetDimValue(OpBuilder &builder, Location loc, Value shape_value,
                   int32_t dim) {
   Type attribute_type = builder.getI64Type();
-  return builder.create<TF::StridedSliceOp>(
-      loc,
+  return TF::StridedSliceOp::create(
+      builder, loc,
       RankedTensorType::get(
           {}, mlir::cast<ShapedType>(shape_value.getType()).getElementType()),
       /*input=*/shape_value,
@@ -60,16 +60,16 @@ void GetSamePaddingValues(OpBuilder &builder, Location loc, Value input_size,
   Type int32_scalar_type = zero.getType();
 
   auto scalar_add = [&](Value lhs, Value rhs) {
-    return builder.create<TF::AddOp>(loc, int32_scalar_type, lhs, rhs);
+    return TF::AddOp::create(builder, loc, int32_scalar_type, lhs, rhs);
   };
   auto scalar_mul = [&](Value lhs, Value rhs) {
-    return builder.create<TF::MulOp>(loc, int32_scalar_type, lhs, rhs);
+    return TF::MulOp::create(builder, loc, int32_scalar_type, lhs, rhs);
   };
   auto scalar_sub = [&](Value lhs, Value rhs) {
-    return builder.create<TF::SubOp>(loc, int32_scalar_type, lhs, rhs);
+    return TF::SubOp::create(builder, loc, int32_scalar_type, lhs, rhs);
   };
   auto scalar_div = [&](Value lhs, Value rhs) {
-    return builder.create<TF::DivOp>(loc, int32_scalar_type, lhs, rhs);
+    return TF::DivOp::create(builder, loc, int32_scalar_type, lhs, rhs);
   };
 
   // effective_filter_size = (filter_size - 1) * dilation_rate + 1
@@ -90,7 +90,7 @@ void GetSamePaddingValues(OpBuilder &builder, Location loc, Value input_size,
       scalar_add(effective_filter_size_op,
                  scalar_mul(stride_value, scalar_sub(output_size, one))),
       input_size);
-  padding_needed = builder.create<TF::MaximumOp>(loc, padding_needed, zero);
+  padding_needed = TF::MaximumOp::create(builder, loc, padding_needed, zero);
   padding_low = scalar_div(padding_needed, two);
   padding_high = scalar_sub(padding_needed, padding_low);
 }
@@ -104,14 +104,15 @@ Value PadForDynamicShapedInputSamePadding(
 
   auto reshape_op = [&](Value value, const SmallVector<int64_t> &shape) {
     const int64_t rank = shape.size();
-    return builder.create<TF::ReshapeOp>(
-        loc, RankedTensorType::get(shape, builder.getI32Type()), value,
+    return TF::ReshapeOp::create(
+        builder, loc, RankedTensorType::get(shape, builder.getI32Type()), value,
         CreateConstValue<int64_t>(builder, loc, {rank}, shape));
   };
 
   ShapedType filter_shape = mlir::cast<ShapedType>(filter.getType());
-  Value input_shape_value = builder.create<TF::ShapeOp>(
-      loc, RankedTensorType::get({num_dims}, builder.getI32Type()), input);
+  Value input_shape_value = TF::ShapeOp::create(
+      builder, loc, RankedTensorType::get({num_dims}, builder.getI32Type()),
+      input);
   auto scalar_to_rank1 = [&](Value value) { return reshape_op(value, {1}); };
   for (int i : llvm::seq<int>(1, num_dims - 1)) {
     Value input_size_i = GetDimValue(builder, loc, input_shape_value, i);
@@ -131,12 +132,12 @@ Value PadForDynamicShapedInputSamePadding(
       builder, loc, /*shape=*/{num_dims - 2, 2},
       /*values=*/SmallVector<int32_t>(2 * (num_dims - 2), 0));
   Value zero = CreateScalarConstValue(builder, loc, 0);
-  Value temp_padding_rank1 = builder.create<TF::ConcatOp>(
-      loc, RankedTensorType::get({2 * num_dims}, builder.getI32Type()), zero,
-      temp_padding_values);
+  Value temp_padding_rank1 = TF::ConcatOp::create(
+      builder, loc, RankedTensorType::get({2 * num_dims}, builder.getI32Type()),
+      zero, temp_padding_values);
   Value temp_padding = reshape_op(temp_padding_rank1, {num_dims, 2});
-  return builder.create<TF::PadV2Op>(
-      loc, input.getType(), input, temp_padding,
+  return TF::PadV2Op::create(
+      builder, loc, input.getType(), input, temp_padding,
       CreateScalarConstValue<int8_t>(builder, loc, input_zp_value));
 }
 
@@ -224,9 +225,9 @@ Value CalculatePaddingAndPadIfNeeded(OpBuilder &builder, Location loc,
     output_shape[i] += padding_values[2 * i] + padding_values[2 * i + 1];
   }
 
-  return builder.create<TF::PadV2Op>(
-      loc, RankedTensorType::get(output_shape, builder.getI8Type()), input,
-      temp_padding,
+  return TF::PadV2Op::create(
+      builder, loc, RankedTensorType::get(output_shape, builder.getI8Type()),
+      input, temp_padding,
       CreateScalarConstValue<int8_t>(builder, loc, input_zp_value));
 }
 
@@ -254,7 +255,7 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) {
                                     value_type.getShape().end());
   RankedTensorType shape_type =
       RankedTensorType::get({rank}, builder.getI64Type());
-  Value shape_value = builder.create<TF::ShapeOp>(loc, shape_type, value);
+  Value shape_value = TF::ShapeOp::create(builder, loc, shape_type, value);
 
   // It is guaranteed that packed_shape[pack_dim] is known.
   if (packed_shape[pack_dim] % 2 != 0) {
@@ -263,14 +264,14 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) {
     padding[pack_dim * 2 + 1] = 1;
     Value padding_value =
         CreateConstValue<int32_t>(builder, loc, {rank, 2}, padding);
-    value = builder.create<TF::PadV2Op>(
-        loc, RankedTensorType::get(packed_shape, builder.getI8Type()), value,
-        padding_value, CreateScalarConstValue<int8_t>(builder, loc, 0));
+    value = TF::PadV2Op::create(
+        builder, loc, RankedTensorType::get(packed_shape, builder.getI8Type()),
+        value, padding_value, CreateScalarConstValue<int8_t>(builder, loc, 0));
 
     SmallVector<int64_t> shape_add(rank, 0);
     shape_add[pack_dim] = 1;
-    shape_value = builder.create<TF::AddOp>(
-        loc, shape_type, shape_value,
+    shape_value = TF::AddOp::create(
+        builder, loc, shape_type, shape_value,
         CreateConstValue<int64_t>(builder, loc, {rank}, shape_add));
   }
   packed_shape[pack_dim] /= 2;
@@ -279,17 +280,17 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) {
 
   RankedTensorType packed_output_type =
       RankedTensorType::get(packed_shape, builder.getI8Type());
-  Value packed_shape_value = builder.create<TF::DivOp>(
-      loc, shape_type, shape_value,
+  Value packed_shape_value = TF::DivOp::create(
+      builder, loc, shape_type, shape_value,
       CreateConstValue<int64_t>(builder, loc, {rank}, divisor));
 
   Value packed_low_begin_value = CreateConstValue<int64_t>(
       builder, loc, {rank}, SmallVector<int64_t>(rank, 0));
   Value packed_low_value =
-      builder.create<TF::SliceOp>(loc, packed_output_type, value,
-                                  packed_low_begin_value, packed_shape_value);
-  packed_low_value = builder.create<TF::BitwiseAndOp>(
-      loc, packed_output_type, packed_low_value,
+      TF::SliceOp::create(builder, loc, packed_output_type, value,
+                          packed_low_begin_value, packed_shape_value);
+  packed_low_value = TF::BitwiseAndOp::create(
+      builder, loc, packed_output_type, packed_low_value,
       CreateScalarConstValue<int8_t>(builder, loc, 0x0F));
 
   SmallVector<int64_t> packed_high_begin(rank, 0);
@@ -297,14 +298,14 @@ Value PackOperand(OpBuilder &builder, Location loc, Value value, int pack_dim) {
   Value packed_high_begin_value =
       CreateConstValue<int64_t>(builder, loc, {rank}, packed_high_begin);
   Value packed_high_value =
-      builder.create<TF::SliceOp>(loc, packed_output_type, value,
-                                  packed_high_begin_value, packed_shape_value);
-  packed_high_value = builder.create<TF::LeftShiftOp>(
-      loc, packed_output_type, packed_high_value,
+      TF::SliceOp::create(builder, loc, packed_output_type, value,
+                          packed_high_begin_value, packed_shape_value);
+  packed_high_value = TF::LeftShiftOp::create(
+      builder, loc, packed_output_type, packed_high_value,
       CreateScalarConstValue<int8_t>(builder, loc, 4));
 
-  Operation *packed = builder.create<TF::BitwiseOrOp>(
-      loc, packed_output_type, packed_low_value, packed_high_value);
+  Operation* packed = TF::BitwiseOrOp::create(
+      builder, loc, packed_output_type, packed_low_value, packed_high_value);
   return ConstantFoldOpIfPossible(packed).front();
 }
 

From d6a407c9f5e0774e26b2f85b420381ff72d7ea67 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 14:02:02 -0800
Subject: [PATCH 597/753] Integrate LLVM at llvm/llvm-project@7d381f2a5634

Updates LLVM usage to match
[7d381f2a5634](https://github.com/llvm/llvm-project/commit/7d381f2a5634)

PiperOrigin-RevId: 846858892
---
 .../xla/third_party/llvm/generated.patch      | 151 ++++++
 .../xla/third_party/llvm/workspace.bzl        |   4 +-
 .../xla/third_party/shardy/temporary.patch    | 471 ++++++++++++++++++
 .../xla/third_party/shardy/workspace.bzl      |   4 +-
 4 files changed, 626 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/third_party/llvm/generated.patch b/third_party/xla/third_party/llvm/generated.patch
index 509398da979e83..f82404ca1cbe14 100644
--- a/third_party/xla/third_party/llvm/generated.patch
+++ b/third_party/xla/third_party/llvm/generated.patch
@@ -1 +1,152 @@
 Auto generated patch. Do not edit or delete it, even if empty.
+diff -ruN --strip-trailing-cr a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
+--- a/clang/lib/Serialization/ASTReaderDecl.cpp
++++ b/clang/lib/Serialization/ASTReaderDecl.cpp
+@@ -2107,8 +2107,9 @@
+     auto *Def = DD.Definition;
+     DD = std::move(MergeDD);
+     DD.Definition = Def;
+-    for (auto *D : Def->redecls())
+-      cast<CXXRecordDecl>(D)->DefinitionData = &DD;
++    for (auto *R = Reader.getMostRecentExistingDecl(Def); R;
++         R = R->getPreviousDecl())
++      cast<CXXRecordDecl>(R)->DefinitionData = &DD;
+     return;
+   }
+ 
+diff -ruN --strip-trailing-cr a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
+--- a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
++++ b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
+@@ -61,14 +61,14 @@
+ LIBC_INLINE static void write_mxcsr(uint32_t w) { _mm_setcsr(w); }
+ 
+ LIBC_INLINE static void clear_except(uint16_t excepts) {
+-  uint32_t mxcsr = _MM_GET_EXCEPTION_STATE();
++  uint32_t mxcsr = get_mxcsr();
+   mxcsr &= ~static_cast<uint32_t>(excepts);
+-  _MM_SET_EXCEPTION_STATE(mxcsr);
++  write_mxcsr(mxcsr);
+ }
+ 
+ LIBC_INLINE static uint16_t test_except(uint16_t excepts) {
+   uint32_t mxcsr = get_mxcsr();
+-  return static_cast<uint16_t>(excepts & mxcsr);
++  return static_cast<uint16_t>(excepts & ExceptionFlags::ALL_F & mxcsr);
+ }
+ 
+ LIBC_INLINE static uint16_t get_except() {
+@@ -83,9 +83,9 @@
+ }
+ 
+ LIBC_INLINE static void raise_except(uint16_t excepts) {
+-  uint32_t mxcsr = _MM_GET_EXCEPTION_STATE();
+-  mxcsr |= excepts;
+-  _MM_SET_EXCEPTION_STATE(mxcsr);
++  uint32_t mxcsr = get_mxcsr();
++  mxcsr |= excepts & ExceptionFlags::ALL_F;
++  write_mxcsr(mxcsr);
+ #ifdef LIBC_TRAP_ON_RAISE_FP_EXCEPT
+   // We will try to trigger the SIGFPE if floating point exceptions are not
+   // masked.  Since we already set all the floating point exception flags, we
+diff -ruN --strip-trailing-cr a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
+--- a/libcxx/include/__flat_map/flat_map.h
++++ b/libcxx/include/__flat_map/flat_map.h
+@@ -465,13 +465,13 @@
+   }
+ 
+   // [flat.map.access], element access
+-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
+     requires is_constructible_v<mapped_type>
+   {
+     return try_emplace(__x).first->second;
+   }
+ 
+-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
+     requires is_constructible_v<mapped_type>
+   {
+     return try_emplace(std::move(__x)).first->second;
+@@ -480,7 +480,7 @@
+   template <class _Kp>
+     requires(__is_compare_transparent && is_constructible_v<key_type, _Kp> && is_constructible_v<mapped_type> &&
+              !is_convertible_v<_Kp &&, const_iterator> && !is_convertible_v<_Kp &&, iterator>)
+-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
+     return try_emplace(std::forward<_Kp>(__x)).first->second;
+   }
+ 
+diff -ruN --strip-trailing-cr a/libcxx/include/map b/libcxx/include/map
+--- a/libcxx/include/map
++++ b/libcxx/include/map
+@@ -1092,9 +1092,9 @@
+   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __tree_.size(); }
+   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return __tree_.max_size(); }
+ 
+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
+ #  ifndef _LIBCPP_CXX03_LANG
+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
+ #  endif
+ 
+   template <class _Arg,
+diff -ruN --strip-trailing-cr a/libcxx/include/unordered_map b/libcxx/include/unordered_map
+--- a/libcxx/include/unordered_map
++++ b/libcxx/include/unordered_map
+@@ -1262,9 +1262,9 @@
+   }
+ #  endif // _LIBCPP_STD_VER >= 20
+ 
+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
+ #  ifndef _LIBCPP_CXX03_LANG
+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
+ #  endif
+ 
+   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& at(const key_type& __k);
+diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
+--- a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
++++ b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
+@@ -66,9 +66,9 @@
+   TransparentKey<int> tkey;
+ 
+   std::flat_map<int, int> nfm;
+-  nfm[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+-  fm[std::move(key)];  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+-  fm[std::move(tkey)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++  nfm[key];            // no-warning
++  fm[std::move(key)];  // no-warning
++  fm[std::move(tkey)]; // no-warning
+ 
+   fm.at(key);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+   cfm.at(key);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
+--- a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
++++ b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
+@@ -55,8 +55,8 @@
+ 
+   int key = 0;
+ 
+-  m[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+-  m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++  m[key];            // no-warning
++  m[std::move(key)]; // no-warning
+ 
+ #if TEST_STD_VER >= 14
+   std::map<std::string, int, std::less<>> strMap;
+diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
+--- a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
++++ b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
+@@ -81,8 +81,8 @@
+   ctm.equal_range(tkey); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+ #endif
+ 
+-  m[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+-  m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++  m[key];            // no-warning
++  m[std::move(key)]; // no-warning
+ 
+   m.at(key);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+   cm.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
diff --git a/third_party/xla/third_party/llvm/workspace.bzl b/third_party/xla/third_party/llvm/workspace.bzl
index f2c3289a046872..29af0ffbd8c12c 100644
--- a/third_party/xla/third_party/llvm/workspace.bzl
+++ b/third_party/xla/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "8f264586d7521b0e305ca7bb78825aa3382ffef7"
-    LLVM_SHA256 = "5784c4af94caba66bc8c460e07e222f751e4f4c9db9c45b3a68ff55379cf587d"
+    LLVM_COMMIT = "7d381f2a5634d1e41b61299839d652cc4a021898"
+    LLVM_SHA256 = "f1641918fd3f5e1667d39afb9c261da39ed9f74e30f1c2f98031d6d609a8de15"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index e69de29bb2d1d6..3e0e0520e60482 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -0,0 +1,471 @@
+diff --git a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc
+index aceb4d7..8752484 100644
+--- a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc
++++ b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc
+@@ -908,8 +908,8 @@ void insertAllReducesForReductionFactors(
+   }
+ }
+ 
+-bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
+-                                          const SymbolTable& symbolTable) {
++bool convertReshardToShardedToUnreduced(Operation* op, IRRewriter& rewriter,
++                                        const SymbolTable& symbolTable) {
+   ReshardOp reshardOp = dyn_cast<ReshardOp>(op);
+   if (!reshardOp) {
+     return false;
+@@ -934,12 +934,7 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
+       << "Reshard op has different meshes for input and output. The result has "
+          "non-empty unreduced axes.";
+ 
+-  // The relationship of the unreduced axes is "out = in + r2u + s2u", where
+-  // "r2u" is the replicated-to-unreduced axes and "s2u" is the
+-  // sharded-to-unreduced axes.
+-  SmallVector<AxisRefAttr> r2uAnds2uAxes =
+-      getAxisSetDiff(outUnreducedAxes, inUnreducedAxes, inMesh);
+-  if (r2uAnds2uAxes.empty()) {
++  if (getAxisSetDiff(outUnreducedAxes, inUnreducedAxes, inMesh).empty()) {
+     return false;
+   }
+ 
+@@ -950,7 +945,7 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
+       << "Input of sharded-to-unreduced reshard must be a block argument or a "
+          "reshard op.";
+ 
+-  SmallVector<AxisRefAttr> s2uAxes;
++  SmallVector<AxisRefAttr> newUnreducedAxes = llvm::to_vector(inUnreducedAxes);
+   SmallVector<AxisRefListAttr> axesPerDim(inSharding.getRank());
+   for (auto [inDimSharding, outDimSharding, axes] :
+        llvm::zip_equal(inSharding.getDimShardings(),
+@@ -971,7 +966,7 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
+       }
+       diff.append(inAxes.begin() + outAxes.size(), inAxes.end());
+       axes = AxisRefListAttr::get(rewriter.getContext(), diff);
+-      s2uAxes.append(diff);
++      newUnreducedAxes.append(diff);
+     } else {
+       SDY_LOG(FATAL)
+           << "The reshard op needs to be decomposed to a sharded-to-unreduced "
+@@ -979,27 +974,17 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
+     }
+   }
+ 
++  sortAndMergeAxes(newUnreducedAxes, inMesh);
++
+   rewriter.setInsertionPoint(reshardOp);
+-  Value result = input;
+-
+-  SmallVector<AxisRefAttr> r2uAxes =
+-      getAxisSetDiff(r2uAnds2uAxes, s2uAxes, inMesh);
+-  if (!r2uAxes.empty()) {
+-    SmallVector<AxisRefAttr> inPlusR2uAxes = llvm::to_vector(inUnreducedAxes);
+-    inPlusR2uAxes.append(r2uAxes.begin(), r2uAxes.end());
+-    sortAndMergeAxes(inPlusR2uAxes, inMesh);
+-    TensorShardingAttr r2uSharding =
+-        TensorShardingAttr::get(rewriter.getContext(), inSharding.getMeshName(),
+-                                inSharding.getDimShardings(),
+-                                outSharding.getReplicatedAxes(), inPlusR2uAxes);
+-    result = ReplicatedToUnreducedOp::create(rewriter, reshardOp.getLoc(),
+-                                             result, r2uAxes, r2uSharding);
+-  }
+-  if (!s2uAxes.empty()) {
+-    result = ShardedToUnreducedOp::create(rewriter, reshardOp.getLoc(), result,
+-                                          axesPerDim, outSharding);
++  Operation* result = ShardedToUnreducedOp::create(
++      rewriter, reshardOp.getLoc(), input, axesPerDim,
++      outSharding.replaceUnreducedAxes(newUnreducedAxes));
++  if (newUnreducedAxes != outUnreducedAxes) {
++    SDY_LOG(WARNING) << "need repliaced-to-unreduced";
++    result = ReshardOp::create(rewriter, reshardOp.getLoc(),
++                               result->getResult(0), outSharding);
+   }
+-
+   rewriter.replaceOp(reshardOp, result);
+   return true;
+ }
+diff --git a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h
+index 0a5563f..c183216 100644
+--- a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h
++++ b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h
+@@ -164,19 +164,15 @@ AxesPerFactor findCommonAxes(const ShardingProjection& shardingProjection,
+                              OpShardingRuleAttr shardingRule,
+                              ArrayRef<int64_t> tensorSizes, const Mesh& mesh);
+ 
+-// Converts a `sdy.reshard` op to an `sdy.replicated-to-unreduced` op and/or an
+-// `sdy.sharded-to-unreduced` op. Returns true if the conversion is successful.
+-//
+-// `r2u` keeps the sharded size, while `s2u` increases the sharded size. Hence,
+-// we do `r2u` first and then `s2u`.
++// Converts a `sdy.reshard` op to an `sdy.sharded-to-unreduced` op. Returns true
++// if the conversion is successful.
+ //
+ // The requirements are:
+ // 1. `op` is a `sdy.reshard` op.
+-// 2. The input and output shardings have the same mesh.
+-// 3. The input of `op` is another `sdy.reshard` op or a block argument.
+-// 4. The input unreduced axes is a strict subset of the output unreduced axes.
+-bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
+-                                          const SymbolTable& symbolTable);
++// 2. The input of `op` is another `sdy.reshard` op or a block argument.
++// 3. The `op` can be converted to a single `sdy.sharded-to-unreduced` op.
++bool convertReshardToShardedToUnreduced(Operation* op, IRRewriter& rewriter,
++                                        const SymbolTable& symbolTable);
+ 
+ }  // namespace sdy
+ }  // namespace mlir
+diff --git a/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc b/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc
+index 85d048e..7f96c9b 100644
+--- a/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc
++++ b/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc
+@@ -486,7 +486,7 @@ struct InsertExplicitReshardsPass
+         return;
+       }
+ 
+-      if (convertReshardToUnreducedCollectives(op, rewriter, symbolTable)) {
++      if (convertReshardToShardedToUnreduced(op, rewriter, symbolTable)) {
+         return;
+       }
+ 
+diff --git a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir
+index f30109e..f3868a9 100644
+--- a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir
++++ b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir
+@@ -2,7 +2,6 @@
+ 
+ sdy.mesh @mesh = <["x"=2, "y"=2, "z"=4]>
+ sdy.mesh @other_mesh = <["x"=2, "y"=2]>
+-sdy.mesh @mesh_x16 = <["x"=16]>
+ sdy.mesh @mesh_abcd = <["a"=2, "b"=2, "c"=2, "d"=2]>
+ 
+ //===----------------------------------------------------------------------===//
+@@ -521,17 +520,17 @@ func.func @different_arguments_to_multiple_named_computations_with_same_input_ou
+ }
+ 
+ //===----------------------------------------------------------------------===//
+-// Replicated and sharded to unreduced tests
++// Sharded to unreduced tests
+ //===----------------------------------------------------------------------===//
+ 
+-// CHECK-LABEL: func @sharded_to_unreduced
+-func.func @sharded_to_unreduced(
+-    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
+-    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
++// CHECK-LABEL: func @sharded_to_unreduced_1
++func.func @sharded_to_unreduced_1(
++    %arg0 : tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
++    -> (tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
+   // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x"}>
+   // CHECK-NEXT: return %0
+-  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<16x8xf32>
+-  return %0 : tensor<16x8xf32>
++  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<24x8xf32>
++  return %0 : tensor<24x8xf32>
+ }
+ 
+ // CHECK-LABEL: func @sharded_to_unreduced_single_axis
+@@ -574,44 +573,13 @@ func.func @sharded_to_unreduced_with_subaxis(
+  return %0 : tensor<16x8xf32>
+ }
+ 
+-// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_full_axis
+-func.func @implicitly_and_explicitly_replicated_to_unreduced_full_axis(
+-    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], replicated={"z"}, unreduced={"y"}>})
+-    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x", "z"} %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}>
+-  // CHECK-NEXT: return %0
+-  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<16x8xf32>
+-  return %0 : tensor<16x8xf32>
+-}
+-
+-// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis
+-func.func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis(
+-    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], replicated={"x":(8)2}, unreduced={"x":(4)2}>})
+-    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x":(2)2, "x":(8)2} %arg0 out_sharding=<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>
+-  // CHECK-NEXT: return %0
+-  %0 = sdy.reshard %arg0 <@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}> : tensor<16x8xf32>
+-  return %0 : tensor<16x8xf32>
+-}
+-
+-// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_full_axis
+-func.func @replicated_and_sharded_to_unreduced_full_axis(
++// CHECK-LABEL: func @sharded_to_unreduced_and_replicated_to_unreduced
++func.func @sharded_to_unreduced_and_replicated_to_unreduced(
+     %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y"}>})
+     -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z"} %arg0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<16x8xf32>
+-  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{"x"}, {}] %0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<16x8xf32>
++  // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y"}>
++  // CHECK-NEXT: %1 = sdy.reshard %0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}>
+   // CHECK-NEXT: return %1
+  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> :  tensor<16x8xf32>
+  return %0 : tensor<16x8xf32>
+ }
+-
+-// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_sub_axis
+-func.func @replicated_and_sharded_to_unreduced_sub_axis(
+-    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y"}>})
+-    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y", "z"}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z":(2)2} %arg0 out_sharding=<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y", "z":(2)2}> : tensor<16x8xf32>
+-  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{}, {"z":(1)2}] %0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<16x8xf32>
+-  // CHECK-NEXT: return %1
+- %0 = sdy.reshard %arg0 <@mesh, [{"x"}, {}], unreduced={"y", "z"}> :  tensor<16x8xf32>
+- return %0 : tensor<16x8xf32>
+-}
+diff --git a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir
+index 5b1973a..5dea360 100644
+--- a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir
++++ b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir
+@@ -1,7 +1,6 @@
+ // RUN: sdy_opt %s -sdy-insert-explicit-reshards='enable-full-version=true' | FileCheck %s
+ 
+ sdy.mesh @mesh = <["x"=4, "y"=2, "z"=4]>
+-sdy.mesh @mesh_x16 = <["x"=16]>
+ 
+ // CHECK-LABEL: func @all_reduce_on_func_input
+ func.func @all_reduce_on_func_input(%arg0: tensor<4x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"y"}>}, %arg1: tensor<4x8xf32>) -> tensor<4x8xf32> {
+@@ -307,17 +306,17 @@ func.func @all_reduce_source_and_target_fully_replicated_shardings_and_different
+ }
+ 
+ //===----------------------------------------------------------------------===//
+-// Replicated and sharded to unreduced tests
++// Sharded to unreduced tests
+ //===----------------------------------------------------------------------===//
+ 
+-// CHECK-LABEL: func @sharded_to_unreduced
+-func.func @sharded_to_unreduced(
+-    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
+-    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
++// CHECK-LABEL: func @sharded_to_unreduced_1
++func.func @sharded_to_unreduced_1(
++    %arg0 : tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
++    -> (tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
+   // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x"}>
+   // CHECK-NEXT: return %0
+-  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<32x32xf32>
+-  return %0 : tensor<32x32xf32>
++  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<24x8xf32>
++  return %0 : tensor<24x8xf32>
+ }
+ 
+ // CHECK-LABEL: func @sharded_to_unreduced_single_axis
+@@ -360,44 +359,13 @@ func.func @sharded_to_unreduced_with_subaxis(
+  return %0 : tensor<32x32xf32>
+ }
+ 
+-// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_full_axis
+-func.func @implicitly_and_explicitly_replicated_to_unreduced_full_axis(
+-    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], replicated={"z"}, unreduced={"y"}>})
+-    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x", "z"} %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}>
+-  // CHECK-NEXT: return %0
+-  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<32x32xf32>
+-  return %0 : tensor<32x32xf32>
+-}
+-
+-// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis
+-func.func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis(
+-    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], replicated={"x":(8)2}, unreduced={"x":(4)2}>})
+-    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x":(2)2, "x":(8)2} %arg0 out_sharding=<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>
+-  // CHECK-NEXT: return %0
+-  %0 = sdy.reshard %arg0 <@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}> : tensor<32x32xf32>
+-  return %0 : tensor<32x32xf32>
+-}
+-
+-// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_full_axis
+-func.func @replicated_and_sharded_to_unreduced_full_axis(
++// CHECK-LABEL: func @sharded_to_unreduced_and_replicated_to_unreduced
++func.func @sharded_to_unreduced_and_replicated_to_unreduced(
+     %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y"}>})
+     -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z"} %arg0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<32x32xf32>
+-  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{"x"}, {}] %0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<32x32xf32>
++  // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y"}>
++  // CHECK-NEXT: %1 = sdy.reshard %0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}>
+   // CHECK-NEXT: return %1
+  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> :  tensor<32x32xf32>
+  return %0 : tensor<32x32xf32>
+ }
+-
+-// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_sub_axis
+-func.func @replicated_and_sharded_to_unreduced_sub_axis(
+-    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y"}>})
+-    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y", "z"}>}) {
+-  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z":(2)2} %arg0 out_sharding=<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y", "z":(2)2}> : tensor<32x32xf32>
+-  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{}, {"z":(1)2}] %0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<32x32xf32>
+-  // CHECK-NEXT: return %1
+- %0 = sdy.reshard %arg0 <@mesh, [{"x"}, {}], unreduced={"y", "z"}> :  tensor<32x32xf32>
+- return %0 : tensor<32x32xf32>
+-}
+diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
+index 509398d..f82404c 100644
+--- a/third_party/llvm/generated.patch
++++ b/third_party/llvm/generated.patch
+@@ -1 +1,152 @@
+ Auto generated patch. Do not edit or delete it, even if empty.
++diff -ruN --strip-trailing-cr a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
++--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++++ b/clang/lib/Serialization/ASTReaderDecl.cpp
++@@ -2107,8 +2107,9 @@
++     auto *Def = DD.Definition;
++     DD = std::move(MergeDD);
++     DD.Definition = Def;
++-    for (auto *D : Def->redecls())
++-      cast<CXXRecordDecl>(D)->DefinitionData = &DD;
+++    for (auto *R = Reader.getMostRecentExistingDecl(Def); R;
+++         R = R->getPreviousDecl())
+++      cast<CXXRecordDecl>(R)->DefinitionData = &DD;
++     return;
++   }
++ 
++diff -ruN --strip-trailing-cr a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
++--- a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
+++++ b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
++@@ -61,14 +61,14 @@
++ LIBC_INLINE static void write_mxcsr(uint32_t w) { _mm_setcsr(w); }
++ 
++ LIBC_INLINE static void clear_except(uint16_t excepts) {
++-  uint32_t mxcsr = _MM_GET_EXCEPTION_STATE();
+++  uint32_t mxcsr = get_mxcsr();
++   mxcsr &= ~static_cast<uint32_t>(excepts);
++-  _MM_SET_EXCEPTION_STATE(mxcsr);
+++  write_mxcsr(mxcsr);
++ }
++ 
++ LIBC_INLINE static uint16_t test_except(uint16_t excepts) {
++   uint32_t mxcsr = get_mxcsr();
++-  return static_cast<uint16_t>(excepts & mxcsr);
+++  return static_cast<uint16_t>(excepts & ExceptionFlags::ALL_F & mxcsr);
++ }
++ 
++ LIBC_INLINE static uint16_t get_except() {
++@@ -83,9 +83,9 @@
++ }
++ 
++ LIBC_INLINE static void raise_except(uint16_t excepts) {
++-  uint32_t mxcsr = _MM_GET_EXCEPTION_STATE();
++-  mxcsr |= excepts;
++-  _MM_SET_EXCEPTION_STATE(mxcsr);
+++  uint32_t mxcsr = get_mxcsr();
+++  mxcsr |= excepts & ExceptionFlags::ALL_F;
+++  write_mxcsr(mxcsr);
++ #ifdef LIBC_TRAP_ON_RAISE_FP_EXCEPT
++   // We will try to trigger the SIGFPE if floating point exceptions are not
++   // masked.  Since we already set all the floating point exception flags, we
++diff -ruN --strip-trailing-cr a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
++--- a/libcxx/include/__flat_map/flat_map.h
+++++ b/libcxx/include/__flat_map/flat_map.h
++@@ -465,13 +465,13 @@
++   }
++ 
++   // [flat.map.access], element access
++-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
+++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
++     requires is_constructible_v<mapped_type>
++   {
++     return try_emplace(__x).first->second;
++   }
++ 
++-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
+++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
++     requires is_constructible_v<mapped_type>
++   {
++     return try_emplace(std::move(__x)).first->second;
++@@ -480,7 +480,7 @@
++   template <class _Kp>
++     requires(__is_compare_transparent && is_constructible_v<key_type, _Kp> && is_constructible_v<mapped_type> &&
++              !is_convertible_v<_Kp &&, const_iterator> && !is_convertible_v<_Kp &&, iterator>)
++-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
+++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
++     return try_emplace(std::forward<_Kp>(__x)).first->second;
++   }
++ 
++diff -ruN --strip-trailing-cr a/libcxx/include/map b/libcxx/include/map
++--- a/libcxx/include/map
+++++ b/libcxx/include/map
++@@ -1092,9 +1092,9 @@
++   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __tree_.size(); }
++   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return __tree_.max_size(); }
++ 
++-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
+++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
++ #  ifndef _LIBCPP_CXX03_LANG
++-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
+++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
++ #  endif
++ 
++   template <class _Arg,
++diff -ruN --strip-trailing-cr a/libcxx/include/unordered_map b/libcxx/include/unordered_map
++--- a/libcxx/include/unordered_map
+++++ b/libcxx/include/unordered_map
++@@ -1262,9 +1262,9 @@
++   }
++ #  endif // _LIBCPP_STD_VER >= 20
++ 
++-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
+++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
++ #  ifndef _LIBCPP_CXX03_LANG
++-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
+++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
++ #  endif
++ 
++   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& at(const key_type& __k);
++diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
++--- a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
+++++ b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
++@@ -66,9 +66,9 @@
++   TransparentKey<int> tkey;
++ 
++   std::flat_map<int, int> nfm;
++-  nfm[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++-  fm[std::move(key)];  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++-  fm[std::move(tkey)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+++  nfm[key];            // no-warning
+++  fm[std::move(key)];  // no-warning
+++  fm[std::move(tkey)]; // no-warning
++ 
++   fm.at(key);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++   cfm.at(key);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
++--- a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
+++++ b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
++@@ -55,8 +55,8 @@
++ 
++   int key = 0;
++ 
++-  m[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++-  m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+++  m[key];            // no-warning
+++  m[std::move(key)]; // no-warning
++ 
++ #if TEST_STD_VER >= 14
++   std::map<std::string, int, std::less<>> strMap;
++diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
++--- a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
+++++ b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
++@@ -81,8 +81,8 @@
++   ctm.equal_range(tkey); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++ #endif
++ 
++-  m[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++-  m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+++  m[key];            // no-warning
+++  m[std::move(key)]; // no-warning
++ 
++   m.at(key);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
++   cm.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
+index f2c3289..29af0ff 100644
+--- a/third_party/llvm/workspace.bzl
++++ b/third_party/llvm/workspace.bzl
+@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
+ 
+ def repo(name):
+     """Imports LLVM."""
+-    LLVM_COMMIT = "8f264586d7521b0e305ca7bb78825aa3382ffef7"
+-    LLVM_SHA256 = "5784c4af94caba66bc8c460e07e222f751e4f4c9db9c45b3a68ff55379cf587d"
++    LLVM_COMMIT = "7d381f2a5634d1e41b61299839d652cc4a021898"
++    LLVM_SHA256 = "f1641918fd3f5e1667d39afb9c261da39ed9f74e30f1c2f98031d6d609a8de15"
+ 
+     tf_http_archive(
+         name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index 971064ea06b0c9..03bd1efd1ba577 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "e74939f4948986b2b5fe0e04cefb0afc2300672b"
-    SHARDY_SHA256 = "04243cb1d585b5d43cf0d8bd8e611bc732090859a0ab1370bc93dcec0efe8e9e"
+    SHARDY_COMMIT = "05276b9c4469f2331e326f614d712da7b907f7df"
+    SHARDY_SHA256 = "f76bef82a597c4d72505dc1c5f8559cf77e720bdeacf976845578970e03265ea"
 
     tf_http_archive(
         name = "shardy",

From b35e4ed192d1e9cefcf6be5e5657e4256d778787 Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Fri, 19 Dec 2025 14:05:31 -0800
Subject: [PATCH 598/753] [ReplicaGroupV3][Refactor][4/n] Update replica group
 classes to hide implementation details (via private/protected) to external
 callers.

PiperOrigin-RevId: 846860541
---
 third_party/xla/xla/hlo/ir/replica_group.h | 75 ++++++++++------------
 1 file changed, 34 insertions(+), 41 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/replica_group.h b/third_party/xla/xla/hlo/ir/replica_group.h
index c55e5802a2d4bf..a06b50fd4f1eb4 100644
--- a/third_party/xla/xla/hlo/ir/replica_group.h
+++ b/third_party/xla/xla/hlo/ir/replica_group.h
@@ -43,6 +43,8 @@ class CollectiveDeviceList;
 
 enum class CollectiveDeviceListVersion { kListOfLists, kIota, kMeshAxes };
 
+// Base class providing the interface for all collective device list
+// representations.
 class CollectiveDeviceListBase {
  public:
   virtual ~CollectiveDeviceListBase() = default;
@@ -73,6 +75,7 @@ class CollectiveDeviceListBase {
   }
   virtual std::vector<std::vector<int64_t>> flattened_replica_groups()
       const = 0;
+
   virtual const std::vector<ReplicaGroup>& replica_groups() const {
     if (replica_groups_ != nullptr) {
       return *replica_groups_;
@@ -86,33 +89,30 @@ class CollectiveDeviceListBase {
     }
     return *replica_groups_;
   };
+
   virtual void Print(Printer* printer) const = 0;
   virtual void Print(Printer* printer,
                      bool print_full_replica_group_list) const {
-    return Print(printer);
-  };
+    Print(printer);
+  }
   virtual std::string ToString() const = 0;
   virtual std::string ToString(bool print_full_replica_group_list) const {
     return ToString();
   };
 
   virtual std::unique_ptr<CollectiveDeviceListBase> Clone() const = 0;
-
   virtual CollectiveDeviceListVersion version() const = 0;
 
-  // shared_ptr for fast copy.
-  mutable std::shared_ptr<std::vector<ReplicaGroup>> replica_groups_ = nullptr;
-
  protected:
+  // Used by operator== to check equality of derived types.
   virtual bool isEqual(const CollectiveDeviceListBase& other) const = 0;
+
+  // shared_ptr for fast copy and lazy materialization.
+  mutable std::shared_ptr<std::vector<ReplicaGroup>> replica_groups_ = nullptr;
 };
 
+// Compact representation using Mesh and Axis indices.
 class MeshAxesReplicaGroupList : public CollectiveDeviceListBase {
-  struct ReshapeAndAggregateAxes {
-    std::vector<int64_t> reshape_dims;
-    std::vector<int64_t> aggregate_axes;
-  };
-
  public:
   explicit MeshAxesReplicaGroupList(Mesh mesh, std::vector<AxisRef> axes);
 
@@ -125,49 +125,49 @@ class MeshAxesReplicaGroupList : public CollectiveDeviceListBase {
     return H::combine(std::move(h), c.mesh_, c.axes_);
   }
 
+  // Overrides
   int64_t num_replica_groups() const override;
   int64_t num_devices_per_group() const override;
   std::vector<std::vector<int64_t>> flattened_replica_groups() const override;
   void Print(Printer* printer) const override;
   std::string ToString() const override;
   MeshAxesReplicaGroupListProto ToProto() const;
+
   std::unique_ptr<CollectiveDeviceListBase> Clone() const override {
     return std::make_unique<MeshAxesReplicaGroupList>(*this);
-  };
+  }
   CollectiveDeviceListVersion version() const override {
     return CollectiveDeviceListVersion::kMeshAxes;
   }
 
+  // Conversion and Serialization
   static MeshAxesReplicaGroupList FromProto(
       const MeshAxesReplicaGroupListProto& proto);
-
-  // Methods for converting to V2 and V1 representations.
   IotaReplicaGroupList ToIotaReplicaGroupList() const;
   CollectiveDeviceList ToCollectiveDeviceList() const;
 
  protected:
   bool isEqual(const CollectiveDeviceListBase& other) const override {
-    const MeshAxesReplicaGroupList& rhs =
-        static_cast<const MeshAxesReplicaGroupList&>(other);
-    return *this == rhs;
+    return *this == static_cast<const MeshAxesReplicaGroupList&>(other);
   }
 
  private:
+  struct ReshapeAndAggregateAxes {
+    std::vector<int64_t> reshape_dims;
+    std::vector<int64_t> aggregate_axes;
+  };
+
+  // Internal helpers for computing device groups.
   absl::flat_hash_map<int64_t, ReshapeAndAggregateAxes>
   GetDimToReshapeAndAggregateAxes() const;
   std::pair<std::vector<int64_t>, std::vector<int64_t>> ComputeReindexedAxes()
       const;
+
   Mesh mesh_;
   std::vector<AxisRef> axes_;
 };
 
-std::string ReplicaGroupsToString(
-    absl::Span<const ReplicaGroup> replica_groups);
-
-// Represents a list of replica groups (a list of list of devices) with
-// reshaping and transposing an iota array (iota tile assignment). Can be used
-// to represent certain common patterns of device lists in a compact, scalable
-// format.
+// Representation using Iota patterns (reshaping/transposing linear ranges).
 class IotaReplicaGroupList : public CollectiveDeviceListBase {
  public:
   explicit IotaReplicaGroupList(int64_t num_replica_groups,
@@ -213,24 +213,21 @@ class IotaReplicaGroupList : public CollectiveDeviceListBase {
   std::vector<std::vector<int64_t>> flattened_replica_groups() const override;
 
   void Print(Printer* printer) const override;
-
   std::string ToString() const override;
+
   std::unique_ptr<CollectiveDeviceListBase> Clone() const override {
     return std::make_unique<IotaReplicaGroupList>(*this);
-  };
+  }
   CollectiveDeviceListVersion version() const override {
     return CollectiveDeviceListVersion::kIota;
   }
 
   IotaReplicaGroupListProto ToProto() const;
-
   static IotaReplicaGroupList FromProto(const IotaReplicaGroupListProto& proto);
 
  protected:
   bool isEqual(const CollectiveDeviceListBase& other) const override {
-    const IotaReplicaGroupList& rhs =
-        static_cast<const IotaReplicaGroupList&>(other);
-    return *this == rhs;
+    return *this == static_cast<const IotaReplicaGroupList&>(other);
   }
 
  private:
@@ -239,9 +236,7 @@ class IotaReplicaGroupList : public CollectiveDeviceListBase {
   int64_t num_devices_per_group_ = -1;
 };
 
-// Represents a series of devices participating in a collective operation
-// (all-gather, all-reduce, etc.). While this directly translates to a list of
-// replica groups, it may be used to represent these lists in compact forms.
+// Legacy/Explicit representation using an explicit list of ReplicaGroups.
 class CollectiveDeviceList : public CollectiveDeviceListBase {
  public:
   explicit CollectiveDeviceList() {
@@ -263,7 +258,6 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
     replica_groups_ = ToReplicaGroupVector(replica_groups);
   };
 
-  // Replica groups are materialized lazily upon first access.
   explicit CollectiveDeviceList(
       const IotaReplicaGroupList& iota_replica_group_list)
       : iota_replica_group_list_(iota_replica_group_list) {}
@@ -298,7 +292,7 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
     return h;
   }
 
-  // Lazyly explands iota if applicable.
+  // Overrides
   const std::vector<ReplicaGroup>& replica_groups() const override;
   std::vector<std::vector<int64_t>> flattened_replica_groups() const override;
   const std::optional<IotaReplicaGroupList>& iota_replica_group_list() const {
@@ -322,6 +316,7 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
              bool print_full_replica_group_list) const override;
   std::string ToString() const override;
   std::string ToString(bool print_full_replica_group_list) const override;
+
   CollectiveDeviceListVersion version() const override {
     if (iota_replica_group_list_.has_value()) {
       return CollectiveDeviceListVersion::kIota;
@@ -332,20 +327,17 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
   CollectiveDeviceListProto ToProto() const;
   static CollectiveDeviceList FromProto(const CollectiveDeviceListProto& proto);
   static CollectiveDeviceList FromProto(const HloInstructionProto& proto);
+
   std::unique_ptr<CollectiveDeviceListBase> Clone() const override {
     return std::make_unique<CollectiveDeviceList>(*this);
   };
 
  protected:
   bool isEqual(const CollectiveDeviceListBase& other) const override {
-    const CollectiveDeviceList& rhs =
-        static_cast<const CollectiveDeviceList&>(other);
-    return *this == rhs;
+    return *this == static_cast<const CollectiveDeviceList&>(other);
   }
 
  private:
-  // Construct collective device list from protobuf replica group start and end
-  // iterators.
   CollectiveDeviceList(
       tsl::protobuf::RepeatedPtrField<ReplicaGroup>::const_iterator start,
       tsl::protobuf::RepeatedPtrField<ReplicaGroup>::const_iterator end) {
@@ -364,12 +356,13 @@ class CollectiveDeviceList : public CollectiveDeviceListBase {
     return result;
   }
 
-  // Load replica groups from iota tile assignment if not already done so.
   void MaybeMaterializeFullReplicaGroupList() const;
 
   std::optional<IotaReplicaGroupList> iota_replica_group_list_;
 };
 
+std::string ReplicaGroupsToString(
+    absl::Span<const ReplicaGroup> replica_groups);
 CollectiveDeviceList ConvertToV1CollectiveDeviceList(
     const CollectiveDeviceListBase& device_list);
 

From 84ad581652911f91120f53bb07bcaeed812f34a3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 14:10:03 -0800
Subject: [PATCH 599/753] Apply llvm-use-new-mlir-op-builder fixes

This migrates `builder.create<Op>()` => `Op::create()`

PiperOrigin-RevId: 846862419
---
 .../tf2xla/kernels/xla_call_module_op.cc      |   6 +-
 tensorflow/core/function/testing/test_pass.h  |   3 +-
 .../eliminate_passthrough_iter_args/pass.cc   |   8 +-
 .../transforms/func_to_graph/func_to_graph.cc |   2 +-
 .../transforms/functional_to_region/impl.cc   |  16 +-
 .../transforms/graph_to_func/graph_to_func.cc |   6 +-
 .../transforms/region_to_functional/impl.cc   |  42 ++---
 tensorflow/dtensor/cc/save_restore_util.cc    |  13 +-
 .../mlir/cluster_function_conversion.cc       |   4 +-
 tensorflow/dtensor/mlir/collectives.cc        | 108 +++++------
 .../mlir/device_mesh_cluster_coarsening.cc    |   8 +-
 .../dtensor_allreduce_combine_optimization.cc |  32 ++--
 .../dtensor_allreduce_scatter_optimization.cc |   8 +-
 .../dtensor_allreduce_sum_optimization.cc     |   8 +-
 .../mlir/dtensor_layout_to_xla_sharding_op.cc |   4 +-
 .../mlir/dtensor_mixed_precision_reduce.cc    |   8 +-
 .../dtensor_replace_relayout_with_identity.cc |   6 +-
 tensorflow/dtensor/mlir/dtensor_send_recv.cc  | 132 ++++++-------
 .../mlir/expansions/argmax_spmd_expander.cc   |   6 +-
 .../expansions/dataparallel_spmd_expander.cc  |   4 +-
 .../mlir/expansions/io_op_spmd_expander.cc    |  18 +-
 .../mlir/expansions/iterator_spmd_expander.cc |   4 +-
 .../mlir/expansions/meta_spmd_expander.cc     |   2 +-
 .../mlir/expansions/optional_spmd_expander.cc |   4 +-
 .../expansions/random_op_spmd_expander.cc     |  41 ++--
 .../expansions/replicated_spmd_expander.cc    |   4 +-
 .../expansions/segmentation_spmd_expander.cc  |   6 +-
 .../mlir/expansions/slice_spmd_expander.cc    |   9 +-
 .../mlir/expansions/softmax_spmd_expander.cc  | 100 +++++-----
 .../tensorlist_reserve_spmd_expander.cc       |   7 +-
 .../mlir/handle_cross_cluster_dependencies.cc |  18 +-
 .../dtensor/mlir/layout_propagation_v2.cc     |  34 ++--
 tensorflow/dtensor/mlir/lower_send_recv.cc    |   4 +-
 tensorflow/dtensor/mlir/merge_clusters.cc     |  40 ++--
 .../dtensor/mlir/move_compilation_to_host.cc  |  36 ++--
 .../dtensor/mlir/op_to_device_cluster.cc      |   6 +-
 tensorflow/dtensor/mlir/op_utils.cc           |   4 +-
 .../dtensor/mlir/propagate_default_layout.cc  |   4 +-
 .../propagate_device_id_to_function_args.cc   |   8 +-
 .../dtensor/mlir/restore_shape_inference.cc   |  13 +-
 .../dynamic_enqueue_sparse_expander.cc        |  29 ++-
 .../matmul_sparse_expander.cc                 |   4 +-
 .../dtensor/mlir/spmd_expander_common.cc      |   9 +-
 tensorflow/dtensor/mlir/tpu_integration.cc    |   8 +-
 .../dtensor/mlir/utils/collective_lowering.cc | 175 +++++++++---------
 .../dtensor/mlir/utils/update_tpu_metadata.cc |   4 +-
 tensorflow/dtensor/mlir/value_utils.cc        |  88 ++++-----
 .../codegen/emitters/cpu_fusion_emitter.cc    |   4 +-
 .../codegen/emitters/cpu_scatter_emitter.cc   |  10 +-
 .../transforms/peel_workgroup_loop.cc         |  28 +--
 .../transforms/xla_cpu_rewrite_patterns.cc    |  72 +++----
 .../tiled/transforms/lower_xtile_entry.cc     |  55 +++---
 .../tiled/transforms/lowering_utils.cc        |  11 +-
 .../tiled/transforms/shlo_to_vector.cc        |  23 +--
 .../transforms/vectorized_reduce_emitter.cc   |   8 +-
 .../codegen/triton/transforms/int4_passes.cc  |   8 +-
 .../emitters/transforms/expand_float_ops.cc   | 118 ++++++------
 .../emitters/transforms/flatten_tensors.cc    |  38 ++--
 .../transforms/lower_xla_intrinsic_lib.cc     |  14 +-
 .../emitters/transforms/lower_xla_to_scf.cc   |  25 +--
 .../emitters/transforms/unswitch_loops.cc     |  21 ++-
 .../transforms/vectorize_loads_stores.cc      |  38 ++--
 .../xla/xla/codegen/intrinsic/intrinsic.h     |   2 +-
 .../auto_sharding_stablehlo_pass.cc           |   2 +-
 .../translate/hlo_to_mhlo/async_importer.cc   |  42 ++---
 .../hlo_to_mhlo/custom_call_importer.cc       |  33 ++--
 .../hlo/translate/hlo_to_mhlo/hlo_utils.cc    |   3 +-
 .../transforms/outline_with_xla_framework.cc  |  20 +-
 .../transforms/xla_framework_to_llvm_pass.cc  |  40 ++--
 .../xla/xla/mlir_hlo/utils/hlo_utils.h        |   2 +-
 .../ir/transforms/ifrt_merge_reshards_pass.cc |  14 +-
 71 files changed, 879 insertions(+), 855 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
index e06c0b09ba9938..c2aee328a9cd23 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc
@@ -519,7 +519,7 @@ class XlaCallModuleOp : public XlaOpKernel {
         } else if (options.add_token_input_output) {
           // Add a dummy token if the inner computation takes a token but the
           // custom call doesn't have a token argument.
-          args.push_back(builder.create<mlir::stablehlo::CreateTokenOp>(loc));
+          args.push_back(mlir::stablehlo::CreateTokenOp::create(builder, loc));
         }
 
         input_args.reserve(result.input_mapping.size());
@@ -530,7 +530,7 @@ class XlaCallModuleOp : public XlaOpKernel {
 
       // Call the lowered function.
       auto call =
-          builder.create<mlir::func::CallOp>(loc, main_func, input_args);
+          mlir::func::CallOp::create(builder, loc, main_func, input_args);
 
       // Unpack the result tuple (`options.always_return_tuple` is true). If
       // `has_tuple_input_output` is true, the first result is a token type.
@@ -548,7 +548,7 @@ class XlaCallModuleOp : public XlaOpKernel {
             mlir::Value token = results.back();
             if (!token.use_empty()) {
               token.replaceAllUsesWith(
-                  builder.create<mlir::stablehlo::CreateTokenOp>(loc));
+                  mlir::stablehlo::CreateTokenOp::create(builder, loc));
             }
             results.pop_back();
           }
diff --git a/tensorflow/core/function/testing/test_pass.h b/tensorflow/core/function/testing/test_pass.h
index 93c2116f5ad996..c3bee77403884c 100644
--- a/tensorflow/core/function/testing/test_pass.h
+++ b/tensorflow/core/function/testing/test_pass.h
@@ -101,7 +101,8 @@ struct TestPassTfDialect
     DCHECK(target != nullptr);
 
     builder.setInsertionPoint(target);
-    auto replacement = builder.create<mlir::TF::AddV2Op>(
+    auto replacement = mlir::TF::AddV2Op::create(
+        builder,
         mlir::NameLoc::get(
             mlir::StringAttr::get(builder.getContext(), "x_plus_y")),
         target->getResultTypes(), target->getOperand(0), target->getOperand(1));
diff --git a/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc b/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc
index c7a943533ba8ec..e58e58aec7f9c0 100644
--- a/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc
+++ b/tensorflow/core/transforms/eliminate_passthrough_iter_args/pass.cc
@@ -133,8 +133,8 @@ struct EliminateForPassthroughIterArgs
   static ForRegionOp RebuildOp(const llvm::BitVector &indices, ForRegionOp op,
                                IRRewriter &rewriter) {
     rewriter.setInsertionPoint(op);
-    auto new_op = rewriter.create<ForRegionOp>(
-        op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices),
+    auto new_op = ForRegionOp::create(
+        rewriter, op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices),
         op.getCtl().getType(), op.getStart(), op.getLimit(), op.getDelta(),
         FilterByIndex(op.getInit(), indices), op.getCtls(),
         op.getBodyAttrsAttr(), op.getRegionAttrsAttr());
@@ -163,8 +163,8 @@ struct EliminateWhileLikePassthroughIterArgs
                                      WhileLikeRegionOp op,
                                      IRRewriter &rewriter) {
     rewriter.setInsertionPoint(op);
-    auto new_op = rewriter.create<WhileLikeRegionOp>(
-        op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices),
+    auto new_op = WhileLikeRegionOp::create(
+        rewriter, op.getLoc(), FilterByIndex(op.getOuts().getTypes(), indices),
         op.getCtl().getType(), FilterByIndex(op.getInit(), indices),
         op.getCtls(), op.getParallelIterationsAttr(), op.getCondAttrsAttr(),
         op.getBodyAttrsAttr(), op.getCondRegionAttrsAttr(),
diff --git a/tensorflow/core/transforms/func_to_graph/func_to_graph.cc b/tensorflow/core/transforms/func_to_graph/func_to_graph.cc
index 1c2941bd8da120..5be91dbb286d92 100644
--- a/tensorflow/core/transforms/func_to_graph/func_to_graph.cc
+++ b/tensorflow/core/transforms/func_to_graph/func_to_graph.cc
@@ -102,7 +102,7 @@ absl::Status FuncToGraph(GraphFuncOp func) {
   }
 
   OpBuilder builder(func);
-  auto graph = builder.create<GraphOp>(func.getLoc(), version);
+  auto graph = GraphOp::create(builder, func.getLoc(), version);
 
   // Remove the terminator.
   func.SingleBlock::getBody()->getTerminator()->erase();
diff --git a/tensorflow/core/transforms/functional_to_region/impl.cc b/tensorflow/core/transforms/functional_to_region/impl.cc
index f5bdd163ed1007..aaf67332ae2d48 100644
--- a/tensorflow/core/transforms/functional_to_region/impl.cc
+++ b/tensorflow/core/transforms/functional_to_region/impl.cc
@@ -322,8 +322,8 @@ LogicalResult ConvertIfLikeOp<IfLikeOp, IfLikeRegionOp>::matchAndRewrite(
   // Create the region-based op, passing in the required attributes.
   ValueRange args, ctls;
   std::tie(args, ctls) = this->SplitControl(op.getArgs());
-  auto region_op = rewriter.create<IfLikeRegionOp>(
-      op.getLoc(), op.getResultTypes(), op.getCond(), ctls,
+  auto region_op = IfLikeRegionOp::create(
+      rewriter, op.getLoc(), op.getResultTypes(), op.getCond(), ctls,
       op.getThenBranch().getAttrs(), op.getElseBranch().getAttrs(),
       PreserveAttributes(then_func, /*drop_args=*/true),
       PreserveAttributes(else_func, /*drop_args=*/true));
@@ -390,8 +390,8 @@ LogicalResult ConvertCaseLikeOp<CaseLikeOp, CaseLikeRegionOp>::matchAndRewrite(
   // Create the region-based op, passing in the required attributes.
   ValueRange args, ctls;
   std::tie(args, ctls) = this->SplitControl(op.getArgs());
-  auto region_op = rewriter.create<CaseLikeRegionOp>(
-      op.getLoc(), op.getResultTypes(), op.getBranchIndex(), ctls,
+  auto region_op = CaseLikeRegionOp::create(
+      rewriter, op.getLoc(), op.getResultTypes(), op.getBranchIndex(), ctls,
       rewriter.getArrayAttr(branch_attrs), region_attrs,
       op.getBranches().size());
   util::ForwardNonIntrinsicAttributes(op, region_op);
@@ -440,8 +440,8 @@ ConvertWhileLikeOp<WhileLikeOp, WhileLikeRegionOp>::matchAndRewrite(
   // TODO(jeffniu): Change this to call the infer return types builder.
   ValueRange init, ctls;
   std::tie(init, ctls) = this->SplitControl(op.getArgs());
-  auto region_op = rewriter.create<WhileLikeRegionOp>(
-      op.getLoc(), op.getResultTypes(), init, ctls,
+  auto region_op = WhileLikeRegionOp::create(
+      rewriter, op.getLoc(), op.getResultTypes(), init, ctls,
       op.getParallelIterationsAttr(), op.getCond().getAttrs(),
       op.getBody().getAttrs(), PreserveAttributes(cond_func),
       PreserveAttributes(body_func));
@@ -482,8 +482,8 @@ LogicalResult ConvertForOp::matchAndRewrite(tfg::ForOp op,
   // `ForRegion` does. We will need to insert casts.
   ValueRange init, ctls;
   std::tie(init, ctls) = SplitControl(op.getArgs());
-  auto region_op = rewriter.create<ForRegionOp>(
-      op.getLoc(), op.getResultTypes(), op.getStart(), op.getLimit(),
+  auto region_op = ForRegionOp::create(
+      rewriter, op.getLoc(), op.getResultTypes(), op.getStart(), op.getLimit(),
       op.getDelta(), init, ctls, op.getBody().getAttrs(),
       PreserveAttributes(body_func));
   util::ForwardNonIntrinsicAttributes(op, region_op);
diff --git a/tensorflow/core/transforms/graph_to_func/graph_to_func.cc b/tensorflow/core/transforms/graph_to_func/graph_to_func.cc
index d3769db8bcdf00..ae9e8d48c6a17a 100644
--- a/tensorflow/core/transforms/graph_to_func/graph_to_func.cc
+++ b/tensorflow/core/transforms/graph_to_func/graph_to_func.cc
@@ -66,8 +66,8 @@ absl::Status GraphToFunc(GraphOp graph, ArrayRef<Value> feeds,
 
   FunctionType func_type = builder.getFunctionType(arg_types, ret_types);
   auto loc = graph.getLoc();
-  auto func_op = builder.create<GraphFuncOp>(loc, func_name, func_type,
-                                             /*generic=*/false);
+  auto func_op = GraphFuncOp::create(builder, loc, func_name, func_type,
+                                     /*generic=*/false);
   func_op->setAttr("tfg.lifted_graph_version", graph.getVersion());
   func_op.getRegion().takeBody(graph.getRegion());
 
@@ -75,7 +75,7 @@ absl::Status GraphToFunc(GraphOp graph, ArrayRef<Value> feeds,
   // fetches, the fetch value will be replaced with feed argument.
   OpBuilder body_builder =
       OpBuilder::atBlockEnd(func_op.SingleBlock::getBody());
-  body_builder.create<ReturnOp>(loc, fetches, control_rets);
+  ReturnOp::create(body_builder, loc, fetches, control_rets);
 
   StringAttr tfg_name = dialect->getTfgNameAttrIdentifier();
   StringAttr lifted_value_name = builder.getStringAttr("tfg.lifted_value_attr");
diff --git a/tensorflow/core/transforms/region_to_functional/impl.cc b/tensorflow/core/transforms/region_to_functional/impl.cc
index 6a494c083394ee..9fce62a74a1173 100644
--- a/tensorflow/core/transforms/region_to_functional/impl.cc
+++ b/tensorflow/core/transforms/region_to_functional/impl.cc
@@ -232,8 +232,8 @@ struct ConvertIfLikeRegionOpToExplicitCapture
 
   IfLikeRegionOp RebuildWith(IfLikeRegionOp op, ValueRange added,
                              PatternRewriter &rewriter) const override {
-    return rewriter.create<IfLikeRegionOp>(
-        op.getLoc(), op.getResultTypes(), op.getCond(), op.getCtls(),
+    return IfLikeRegionOp::create(
+        rewriter, op.getLoc(), op.getResultTypes(), op.getCond(), op.getCtls(),
         op.getThenAttrsAttr(), op.getElseAttrsAttr(),
         op.getThenRegionAttrsAttr(), op.getElseRegionAttrsAttr());
   }
@@ -246,9 +246,9 @@ struct ConvertCaseLikeRegionOpToExplicitCapture
 
   CaseLikeRegionOp RebuildWith(CaseLikeRegionOp op, ValueRange added,
                                PatternRewriter &rewriter) const override {
-    return rewriter.create<CaseLikeRegionOp>(
-        op.getLoc(), op.getResultTypes(), op.getBranchIndex(), op.getCtls(),
-        op.getBranchAttrsAttr(), op.getRegionAttrsAttr(),
+    return CaseLikeRegionOp::create(
+        rewriter, op.getLoc(), op.getResultTypes(), op.getBranchIndex(),
+        op.getCtls(), op.getBranchAttrsAttr(), op.getRegionAttrsAttr(),
         op.getBranches().size());
   }
 };
@@ -294,9 +294,9 @@ struct ConvertWhileLikeRegionOpToExplicitCapture
     util::LoopRegionResultAdded(op.getBodyRegion(), added.size());
 
     rewriter.setInsertionPoint(op);
-    return rewriter.create<WhileLikeRegionOp>(
-        op.getLoc(), results, op.getCtl().getType(), operands, op.getCtls(),
-        op.getParallelIterationsAttr(), op.getCondAttrsAttr(),
+    return WhileLikeRegionOp::create(
+        rewriter, op.getLoc(), results, op.getCtl().getType(), operands,
+        op.getCtls(), op.getParallelIterationsAttr(), op.getCondAttrsAttr(),
         op.getBodyAttrsAttr(), op.getCondRegionAttrsAttr(),
         op.getBodyRegionAttrsAttr());
   }
@@ -323,8 +323,8 @@ struct ConvertForRegionOpToExplicitCapture
     util::LoopRegionResultAdded(op.getBodyRegion(), added.size());
 
     rewriter.setInsertionPoint(op);
-    return rewriter.create<ForRegionOp>(
-        op.getLoc(), results, op.getCtl().getType(), op.getStart(),
+    return ForRegionOp::create(
+        rewriter, op.getLoc(), results, op.getCtl().getType(), op.getStart(),
         op.getLimit(), op.getDelta(), operands, op.getCtls(),
         op.getBodyAttrsAttr(), op.getRegionAttrsAttr());
   }
@@ -869,8 +869,8 @@ LogicalResult ConvertIfLikeOp<IfLikeRegionOp, IfLikeOp>::matchAndRewrite(
 
   rewriter.setInsertionPoint(op);
   auto func_op =
-      rewriter.create<IfLikeOp>(op.getLoc(), op.getResultTypes(), op.getCond(),
-                                operands, branches[0], branches[1]);
+      IfLikeOp::create(rewriter, op.getLoc(), op.getResultTypes(), op.getCond(),
+                       operands, branches[0], branches[1]);
   util::ForwardNonIntrinsicAttributes(op, func_op);
   rewriter.replaceOp(op, func_op.getResults());
   return success();
@@ -923,9 +923,9 @@ LogicalResult ConvertCaseLikeOp<CaseLikeRegionOp, CaseLikeOp>::matchAndRewrite(
   llvm::append_range(operands, op.getCtls());
 
   rewriter.setInsertionPoint(op);
-  auto func_op = rewriter.create<CaseLikeOp>(op.getLoc(), op.getResultTypes(),
-                                             op.getBranchIndex(), operands,
-                                             rewriter.getArrayAttr(branches));
+  auto func_op = CaseLikeOp::create(rewriter, op.getLoc(), op.getResultTypes(),
+                                    op.getBranchIndex(), operands,
+                                    rewriter.getArrayAttr(branches));
   util::ForwardNonIntrinsicAttributes(op, func_op);
   rewriter.replaceOp(op, func_op.getResults());
   return success();
@@ -999,9 +999,9 @@ ConvertWhileLikeOp<WhileLikeRegionOp, WhileLikeOp>::matchAndRewrite(
   llvm::append_range(operands, op.getCtls());
 
   rewriter.setInsertionPoint(op);
-  auto func_op = rewriter.create<WhileLikeOp>(op.getLoc(), op.getResultTypes(),
-                                              operands, cond_ref, body_ref,
-                                              op.getParallelIterationsAttr());
+  auto func_op =
+      WhileLikeOp::create(rewriter, op.getLoc(), op.getResultTypes(), operands,
+                          cond_ref, body_ref, op.getParallelIterationsAttr());
   util::ForwardNonIntrinsicAttributes(op, func_op);
   rewriter.replaceOp(op, func_op.getResults());
   return success();
@@ -1037,9 +1037,9 @@ LogicalResult ConvertForOp::matchAndRewrite(ForRegionOp op,
   llvm::append_range(operands, op.getCtls());
 
   rewriter.setInsertionPoint(op);
-  auto func_op = rewriter.create<tfg::ForOp>(
-      op.getLoc(), op.getResultTypes(), op.getStart(), op.getLimit(),
-      op.getDelta(), operands, body_ref[0]);
+  auto func_op = tfg::ForOp::create(rewriter, op.getLoc(), op.getResultTypes(),
+                                    op.getStart(), op.getLimit(), op.getDelta(),
+                                    operands, body_ref[0]);
   util::ForwardNonIntrinsicAttributes(op, func_op);
   rewriter.replaceOp(op, func_op.getResults());
   return success();
diff --git a/tensorflow/dtensor/cc/save_restore_util.cc b/tensorflow/dtensor/cc/save_restore_util.cc
index dcaf41baf5f1e6..2f8d75cca43fa9 100644
--- a/tensorflow/dtensor/cc/save_restore_util.cc
+++ b/tensorflow/dtensor/cc/save_restore_util.cc
@@ -156,13 +156,12 @@ SaveOpSpecs BuildPerDeviceSave(
         shape_and_slice_specs.push_back({});
 
         mlir::Value new_prefix =
-            builder
-                .create<mlir::TF::AddOp>(
-                    prefix.getLoc(),
-                    mlir::dyn_cast<mlir::RankedTensorType>(prefix.getType()),
-                    prefix,
-                    StringScalarConst(builder, prefix.getLoc(),
-                                      DeviceSuffix(device_id, total_devices)))
+            mlir::TF::AddOp::create(
+                builder, prefix.getLoc(),
+                mlir::dyn_cast<mlir::RankedTensorType>(prefix.getType()),
+                prefix,
+                StringScalarConst(builder, prefix.getLoc(),
+                                  DeviceSuffix(device_id, total_devices)))
                 .getZ();
         // Generate new prefix based on device_id and save op index, only when
         // we need a new save_op.
diff --git a/tensorflow/dtensor/mlir/cluster_function_conversion.cc b/tensorflow/dtensor/mlir/cluster_function_conversion.cc
index 2f725e3e9a383f..51107b7adf544c 100644
--- a/tensorflow/dtensor/mlir/cluster_function_conversion.cc
+++ b/tensorflow/dtensor/mlir/cluster_function_conversion.cc
@@ -142,8 +142,8 @@ mlir::LogicalResult ReplaceClusterWithPartitionCallOp(
   llvm::StringRef function_name = cluster_func.getFunc();
 
   builder->setInsertionPoint(cluster_func);
-  auto call_op = builder->create<mlir::TF::StatefulPartitionedCallOp>(
-      cluster_func.getLoc(), output_types, cluster_func.getOperands(),
+  auto call_op = mlir::TF::StatefulPartitionedCallOp::create(
+      *builder, cluster_func.getLoc(), output_types, cluster_func.getOperands(),
       /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, function_name, mesh_attr,
       /*config_proto=*/builder->getStringAttr(""),
       /*executor_type=*/builder->getStringAttr(""));
diff --git a/tensorflow/dtensor/mlir/collectives.cc b/tensorflow/dtensor/mlir/collectives.cc
index b82304c6fd1749..ca4f5b6e8febda 100644
--- a/tensorflow/dtensor/mlir/collectives.cc
+++ b/tensorflow/dtensor/mlir/collectives.cc
@@ -105,8 +105,8 @@ StatusOr<mlir::Value> EmitAllGather(
 
   mlir::Location loc = DT_LOC2(input.getLoc(), "DTensorAllGatherOp");
   mlir::TF::DTensorAllGatherOp all_gather =
-      builder.create<mlir::TF::DTensorAllGatherOp>(
-          loc, output_type, input,
+      mlir::TF::DTensorAllGatherOp::create(
+          builder, loc, output_type, input,
           mlir::dtensor::LayoutAttr::get(builder.getContext(), src_layout),
           mlir::dtensor::LayoutAttr::get(builder.getContext(), tgt_layout));
   SetSingleLayoutOnOp(all_gather, tgt_layout);
@@ -153,8 +153,8 @@ StatusOr<const mlir::Value> EmitAllScatter(
 
   mlir::Location loc = DT_LOC2(original_value.getLoc(), "DTensorAllScatterOp");
   mlir::TF::DTensorAllScatterOp all_scatter =
-      builder.create<mlir::TF::DTensorAllScatterOp>(
-          loc, output_type, original_value,
+      mlir::TF::DTensorAllScatterOp::create(
+          builder, loc, output_type, original_value,
           mlir::dtensor::LayoutAttr::get(builder.getContext(), original_layout),
           mlir::dtensor::LayoutAttr::get(builder.getContext(), desired_layout));
   SetSingleLayoutOnOp(all_scatter, desired_layout);
@@ -224,11 +224,10 @@ StatusOr<mlir::Value> EmitAllToAll(
                       LocalTypeFromGlobalType(tgt_layout, global_type));
 
   mlir::Location loc = DT_LOC2(input.getLoc(), "DTensorAllToAllOp");
-  mlir::TF::DTensorAllToAllOp all_to_all =
-      builder.create<mlir::TF::DTensorAllToAllOp>(
-          loc, output_type, input,
-          mlir::dtensor::LayoutAttr::get(builder.getContext(), src_layout),
-          mlir::dtensor::LayoutAttr::get(builder.getContext(), tgt_layout));
+  mlir::TF::DTensorAllToAllOp all_to_all = mlir::TF::DTensorAllToAllOp::create(
+      builder, loc, output_type, input,
+      mlir::dtensor::LayoutAttr::get(builder.getContext(), src_layout),
+      mlir::dtensor::LayoutAttr::get(builder.getContext(), tgt_layout));
   SetSingleLayoutOnOp(all_to_all, tgt_layout);
 
   if (newly_created_ops != nullptr) newly_created_ops->insert(all_to_all);
@@ -247,20 +246,21 @@ StatusOr<mlir::Value> EmitDenseToSparseToDense(
   // values tensor = tf.gather_nd(input, indices)
   // shape tensor = tf.shape(input)
   mlir::TF::ZerosLikeOp zeros_like =
-      builder.create<mlir::TF::ZerosLikeOp>(input.getLoc(), input);
-  mlir::TF::NotEqualOp not_equal = builder.create<mlir::TF::NotEqualOp>(
-      zeros_like.getLoc(), input, zeros_like, builder.getBoolAttr(false));
+      mlir::TF::ZerosLikeOp::create(builder, input.getLoc(), input);
+  mlir::TF::NotEqualOp not_equal =
+      mlir::TF::NotEqualOp::create(builder, zeros_like.getLoc(), input,
+                                   zeros_like, builder.getBoolAttr(false));
 
-  mlir::TF::WhereOp indices = builder.create<mlir::TF::WhereOp>(
-      not_equal.getLoc(),
+  mlir::TF::WhereOp indices = mlir::TF::WhereOp::create(
+      builder, not_equal.getLoc(),
       mlir::RankedTensorType::get(GetShapeOfValue(not_equal).value(),
                                   builder.getI64Type()),
       not_equal);
 
-  mlir::TF::GatherNdOp values = builder.create<mlir::TF::GatherNdOp>(
-      input.getLoc(), input.getType(), input, indices);
-  auto shape = builder.create<mlir::TF::ShapeOp>(input.getLoc(), input,
-                                                 builder.getBoolAttr(false));
+  mlir::TF::GatherNdOp values = mlir::TF::GatherNdOp::create(
+      builder, input.getLoc(), input.getType(), input, indices);
+  auto shape = mlir::TF::ShapeOp::create(builder, input.getLoc(), input,
+                                         builder.getBoolAttr(false));
 
   // Emit a SparseToDenseOp and replace the SparseTensor with the result of
   // this new op.
@@ -270,8 +270,8 @@ StatusOr<mlir::Value> EmitDenseToSparseToDense(
           builder, input.getLoc(),
           mlir::cast<mlir::TensorType>(input.getType()).getElementType()));
 
-  auto dense = builder.create<mlir::TF::SparseToDenseOp>(
-      input.getLoc(), input.getType(),
+  auto dense = mlir::TF::SparseToDenseOp::create(
+      builder, input.getLoc(), input.getType(),
       mlir::ValueRange({indices, shape, values, zero_scalar}));
 
   if (newly_created_ops != nullptr) {
@@ -310,8 +310,8 @@ StatusOr<mlir::Value> EmitRelayout(
   // If two layouts are the same, or the only difference is layout type, then
   // there is no need to actually relayout data.
   if (src_layout.IsEquivalentIgnoringType(tgt_layout)) {
-    mlir::TF::IdentityOp op = builder.create<mlir::TF::IdentityOp>(
-        input.getLoc(), input.getType(), input);
+    mlir::TF::IdentityOp op = mlir::TF::IdentityOp::create(
+        builder, input.getLoc(), input.getType(), input);
     if (newly_created_ops != nullptr) newly_created_ops->insert(op);
     return op.getOutput();
   }
@@ -405,7 +405,7 @@ mlir::Operation* EmitTransposeOp(mlir::OpBuilder& builder,
 
   auto constant_attr = builder.getI64TensorAttr(perm_arr);
   auto perm_op =
-      builder.create<mlir::TF::ConstOp>(loc, perm_type, constant_attr);
+      mlir::TF::ConstOp::create(builder, loc, perm_type, constant_attr);
 
   std::vector<int64_t> transposed_shape(shape.begin(), shape.end());
   for (int i = 0; i < shape.size(); i++) {
@@ -414,8 +414,8 @@ mlir::Operation* EmitTransposeOp(mlir::OpBuilder& builder,
   auto transposed_type = mlir::RankedTensorType::get(
       transposed_shape, tr_input_type.getElementType());
 
-  return builder.create<mlir::TF::TransposeOp>(loc, transposed_type, input,
-                                               perm_op);
+  return mlir::TF::TransposeOp::create(builder, loc, transposed_type, input,
+                                       perm_op);
 }
 
 StatusOr<mlir::Operation*> EmitBarrierWithConstValue(mlir::OpBuilder& builder,
@@ -470,10 +470,10 @@ StatusOr<mlir::Operation*> EmitAllReduce(
                       DeviceTypeFromMesh(output_layout.mesh()));
 
   mlir::Location loc = DT_LOC2(input->getLoc(), "DTensorAllReduceOp");
-  auto all_reduce = builder.create<mlir::TF::DTensorAllReduceOp>(
-      loc, input->getResultTypes()[0], input->getOpResult(0),
-      builder.create<mlir::TF::ConstOp>(DT_LOC2(loc, "group_assignment"),
-                                        group_assignment),
+  auto all_reduce = mlir::TF::DTensorAllReduceOp::create(
+      builder, loc, input->getResultTypes()[0], input->getOpResult(0),
+      mlir::TF::ConstOp::create(builder, DT_LOC2(loc, "group_assignment"),
+                                group_assignment),
       builder.getStringAttr(std::string(reduce_op)),
       builder.getStringAttr(device_type));
   SetSingleLayoutOnOp(all_reduce, output_layout);
@@ -575,7 +575,7 @@ StatusOr<mlir::Value> CreateConstSrcTargetPair(const Mesh& mesh,
   auto src_target_attr =
       mlir::DenseIntElementsAttr::get(shaped_type, src_target_pair_flat);
   mlir::Value src_target_pair_tensor =
-      builder.create<mlir::TF::ConstOp>(location, src_target_attr);
+      mlir::TF::ConstOp::create(builder, location, src_target_attr);
   return src_target_pair_tensor;
 }
 
@@ -636,13 +636,14 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
   //
   // For example, if mesh dimension splits the input tensor by its height
   // dimension, then `left` actually means tensor to pad on the top side.
-  mlir::Value is_on_left_edge = builder.create<mlir::TF::EqualOp>(
-      location, CreateIntScalarConst(0, builder, location, /*use_int64=*/false),
+  mlir::Value is_on_left_edge = mlir::TF::EqualOp::create(
+      builder, location,
+      CreateIntScalarConst(0, builder, location, /*use_int64=*/false),
       scalar_mesh_coordinate, builder.getBoolAttr(true));
 
   TF_ASSIGN_OR_RETURN(const int mesh_dim_size, mesh.dim_size(mesh_dim));
-  mlir::Value is_on_right_edge = builder.create<mlir::TF::EqualOp>(
-      location,
+  mlir::Value is_on_right_edge = mlir::TF::EqualOp::create(
+      builder, location,
       CreateIntScalarConst(mesh_dim_size - 1, builder, location,
                            /*use_int64=*/false),
       scalar_mesh_coordinate, builder.getBoolAttr(true));
@@ -663,7 +664,7 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
   }
 
   mlir::Value ghost_tensor_left =
-      builder.create<mlir::TF::ConstOp>(location, const_attr).getResult();
+      mlir::TF::ConstOp::create(builder, location, const_attr).getResult();
 
   // Get the right side slice of the input tensor to pad on left side.
   llvm::SmallVector<int64_t, 4> begin_left(layout.rank(), 0);
@@ -676,11 +677,13 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
   size[split_dim_index] = halo_size;
 
   mlir::Value size_tensor_left = ops_util::GetR1Const(size, builder, location);
-  mlir::Value sliced_tensor_left = builder.create<mlir::TF::SliceOp>(
-      location, halo_type, tensor, begin_tensor_left, size_tensor_left);
+  mlir::Value sliced_tensor_left =
+      mlir::TF::SliceOp::create(builder, location, halo_type, tensor,
+                                begin_tensor_left, size_tensor_left);
 
-  mlir::Value halo_tensor_left = builder.create<mlir::TF::SelectV2Op>(
-      location, is_on_right_edge, ghost_tensor_left, sliced_tensor_left);
+  mlir::Value halo_tensor_left =
+      mlir::TF::SelectV2Op::create(builder, location, is_on_right_edge,
+                                   ghost_tensor_left, sliced_tensor_left);
 
   // Invoke collective permute to receive the tensor from neighboring processor.
   // Halo slices from the left neighbor are received on each processor (they
@@ -690,12 +693,12 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
       CreateConstSrcTargetPair(mesh, mesh_dim, /*shift_left=*/false, location,
                                builder));
 
-  mlir::Value left_concat_value = builder.create<mlir::TF::CollectivePermuteOp>(
-      location, sliced_tensor_left.getType(), halo_tensor_left,
+  mlir::Value left_concat_value = mlir::TF::CollectivePermuteOp::create(
+      builder, location, sliced_tensor_left.getType(), halo_tensor_left,
       src_target_pair_left);
 
   mlir::Value ghost_tensor_right =
-      builder.create<mlir::TF::ConstOp>(location, const_attr).getResult();
+      mlir::TF::ConstOp::create(builder, location, const_attr).getResult();
 
   // Else, values to pad is tensor from different processor. We use collective
   // permute to access tensor slice from another device.
@@ -704,13 +707,15 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
   mlir::Value begin_tensor_right =
       ops_util::GetR1Const(begin_right, builder, location);
   mlir::Value size_tensor_right = ops_util::GetR1Const(size, builder, location);
-  mlir::Value sliced_tensor_right = builder.create<mlir::TF::SliceOp>(
-      location, halo_type, tensor, begin_tensor_right, size_tensor_right);
+  mlir::Value sliced_tensor_right =
+      mlir::TF::SliceOp::create(builder, location, halo_type, tensor,
+                                begin_tensor_right, size_tensor_right);
 
   // Find the halo tensor value to pad on the `right` side.
   // If input block is on the right edge, we use zero ghost tensor instead.
-  mlir::Value halo_tensor_right = builder.create<mlir::TF::SelectV2Op>(
-      location, is_on_left_edge, ghost_tensor_right, sliced_tensor_right);
+  mlir::Value halo_tensor_right =
+      mlir::TF::SelectV2Op::create(builder, location, is_on_left_edge,
+                                   ghost_tensor_right, sliced_tensor_right);
 
   // Invoke collective permute to receive the tensor from neighboring processor.
   // Halo slices from the right neighbor are received on each processor (they
@@ -719,10 +724,9 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
       mlir::Value src_target_pair_right,
       CreateConstSrcTargetPair(mesh, mesh_dim, /*shift_left=*/true, location,
                                builder));
-  mlir::Value right_concat_value =
-      builder.create<mlir::TF::CollectivePermuteOp>(
-          location, sliced_tensor_right.getType(), halo_tensor_right,
-          src_target_pair_right);
+  mlir::Value right_concat_value = mlir::TF::CollectivePermuteOp::create(
+      builder, location, sliced_tensor_right.getType(), halo_tensor_right,
+      src_target_pair_right);
 
   // Final halo exchanged value is concatenated value of left_concat_value,
   // tensor, and right_concat_value in the mesh_dimension.
@@ -734,8 +738,8 @@ StatusOr<mlir::Value> EmitHaloExchange(mlir::OpBuilder& builder, int halo_size,
       final_shape, input_tensor_type.getElementType());
   mlir::Value concat_axis =
       CreateIntScalarConst(split_dim_index, builder, location);
-  mlir::Value final_value = builder.create<mlir::TF::ConcatV2Op>(
-      location, final_type,
+  mlir::Value final_value = mlir::TF::ConcatV2Op::create(
+      builder, location, final_type,
       llvm::SmallVector<mlir::Value, 4>{left_concat_value, tensor,
                                         right_concat_value},
       concat_axis);
diff --git a/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc b/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc
index 4c45da0110c7b0..3b0e959ee32979 100644
--- a/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc
+++ b/tensorflow/dtensor/mlir/device_mesh_cluster_coarsening.cc
@@ -194,8 +194,8 @@ mlir::LogicalResult CreateMergedMeshCluster(
     output_values_to_replace.emplace_back(std::get<1>(cluster_return_value));
   }
 
-  *merged_cluster = builder->create<mlir::tf_device::ClusterOp>(
-      current_cluster.getLoc(), merged_cluster_output_types);
+  *merged_cluster = mlir::tf_device::ClusterOp::create(
+      *builder, current_cluster.getLoc(), merged_cluster_output_types);
   auto mesh_attr = current_cluster->getAttrOfType<mlir::StringAttr>(kMeshAttr);
   if (!mesh_attr)
     return current_cluster.emitOpError(kMissingMeshAttributeErrorMessage);
@@ -206,8 +206,8 @@ mlir::LogicalResult CreateMergedMeshCluster(
   // `current_cluster` and `merging_cluster`.
   merged_cluster->getBody().push_back(new mlir::Block);
   builder->setInsertionPointToEnd(&merged_cluster->GetBody());
-  builder->create<mlir::tf_device::ReturnOp>(merged_cluster->getLoc(),
-                                             merged_cluster_output_values);
+  mlir::tf_device::ReturnOp::create(*builder, merged_cluster->getLoc(),
+                                    merged_cluster_output_values);
 
   // Make sure to replace usages of tf_device.cluster ops to be merged-away with
   // newly created tf_device.cluster op.
diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc
index e4cea2348f3d09..09b53ae4b72895 100644
--- a/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc
+++ b/tensorflow/dtensor/mlir/dtensor_allreduce_combine_optimization.cc
@@ -151,10 +151,10 @@ mlir::LogicalResult MergeAllReduceGroup(
   mlir::Location loc = all_reduce_group[0].getLoc();
   mlir::Type elem_type = all_reduce_group[0].getType().getElementType();
   auto zero_scalar = ops_util::CreateScalarConst(0, builder, loc);
-  auto zero_scalar_elem_type = builder.create<mlir::TF::CastOp>(
-      loc, mlir::RankedTensorType::get({}, elem_type), zero_scalar);
-  auto merged = builder.create<mlir::TF::FillOp>(
-      loc, ops_util::GetR1Const({total_num_elements}, builder, loc),
+  auto zero_scalar_elem_type = mlir::TF::CastOp::create(
+      builder, loc, mlir::RankedTensorType::get({}, elem_type), zero_scalar);
+  auto merged = mlir::TF::FillOp::create(
+      builder, loc, ops_util::GetR1Const({total_num_elements}, builder, loc),
       zero_scalar_elem_type);
 
   // Store every all-reduce's input at an offset location in the merged tensor,
@@ -175,23 +175,23 @@ mlir::LogicalResult MergeAllReduceGroup(
     }
 
     int num_elements = all_reduce_ranked_type.getNumElements();
-    auto flattened = builder.create<mlir::TF::ReshapeOp>(
-        DT_LOC2(loc, "CombinedReduceFlatten"), all_reduce.getInput(),
+    auto flattened = mlir::TF::ReshapeOp::create(
+        builder, DT_LOC2(loc, "CombinedReduceFlatten"), all_reduce.getInput(),
         ops_util::GetR1Const({num_elements}, builder, loc));
     flattened_types.push_back(flattened.getType());
     auto indices = ops_util::GetR1Const({offset_num_elements}, builder, loc);
 
     if (all_reduce.getDeviceType().contains("TPU")) {
-      updated = builder.create<mlir::TF::XlaDynamicUpdateSliceOp>(
-          DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(),
+      updated = mlir::TF::XlaDynamicUpdateSliceOp::create(
+          builder, DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(),
           /*input=*/i == 0 ? merged.getResult() : updated,
           /*update=*/flattened, indices);
     } else {
       auto end = ops_util::GetR1Const({offset_num_elements + num_elements},
                                       builder, loc);
       auto strides = ops_util::GetR1Const({1}, builder, loc);
-      updated = builder.create<mlir::TF::TensorStridedSliceUpdateOp>(
-          DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(),
+      updated = mlir::TF::TensorStridedSliceUpdateOp::create(
+          builder, DT_LOC2(loc, "CombinedReduceUpdateSlice"), merged.getType(),
           /*input=*/i == 0 ? merged.getResult() : updated, indices, end,
           strides,
           /*value=*/flattened);
@@ -200,8 +200,8 @@ mlir::LogicalResult MergeAllReduceGroup(
   }
 
   // All-reduce the updated merged tensor.
-  auto merged_all_reduce = builder.create<mlir::TF::DTensorAllReduceOp>(
-      all_reduce_group[0].getLoc(), updated.getType(), updated,
+  auto merged_all_reduce = mlir::TF::DTensorAllReduceOp::create(
+      builder, all_reduce_group[0].getLoc(), updated.getType(), updated,
       all_reduce_group[0].getGroupAssignment(),
       all_reduce_group[0].getReduceOp(), all_reduce_group[0].getDeviceType());
   SetSingleLayoutOnOp(
@@ -223,13 +223,13 @@ mlir::LogicalResult MergeAllReduceGroup(
           all_reduce_ranked_type));
     }
     int num_elements = all_reduce_ranked_type.getNumElements();
-    auto slice = builder.create<mlir::TF::SliceOp>(
-        DT_LOC2(loc, "PostCombinedReduceSlice"), flattened_types[i],
+    auto slice = mlir::TF::SliceOp::create(
+        builder, DT_LOC2(loc, "PostCombinedReduceSlice"), flattened_types[i],
         /*input=*/merged_all_reduce,
         /*begin=*/ops_util::GetR1Const({offset_num_elements}, builder, loc),
         /*size=*/ops_util::GetR1Const({num_elements}, builder, loc));
-    auto replacement = builder.create<mlir::TF::ReshapeOp>(
-        DT_LOC2(loc, "PostCombinedReduceReshape"), slice.getResult(),
+    auto replacement = mlir::TF::ReshapeOp::create(
+        builder, DT_LOC2(loc, "PostCombinedReduceReshape"), slice.getResult(),
         ops_util::GetR1Const(all_reduce_shapes[i], builder, loc));
     replacements.push_back(replacement);
     offset_num_elements += num_elements;
diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc
index b16eeb8230f860..5721d03ce2c343 100644
--- a/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc
+++ b/tensorflow/dtensor/mlir/dtensor_allreduce_scatter_optimization.cc
@@ -137,14 +137,14 @@ mlir::LogicalResult ApplyOptimization(mlir::func::FuncOp function) {
         VLOG(2) << "Fuse reduce scatter with scatter_dim: " << scatter_dim;
 
         mlir::OpBuilder builder(all_reduce);
-        auto scatter_dim_const_op = builder.create<mlir::TF::ConstOp>(
-            all_reduce.getLoc(),
+        auto scatter_dim_const_op = mlir::TF::ConstOp::create(
+            builder, all_reduce.getLoc(),
             mlir::DenseIntElementsAttr::get(
                 mlir::RankedTensorType::get({}, builder.getI32Type()),
                 {scatter_dim}));
 
-        auto reduce_scatter = builder.create<mlir::TF::DTensorReduceScatterOp>(
-            all_reduce.getLoc(), all_scatter->getResultTypes(),
+        auto reduce_scatter = mlir::TF::DTensorReduceScatterOp::create(
+            builder, all_reduce.getLoc(), all_scatter->getResultTypes(),
             all_reduce.getOperand(0), all_reduce.getGroupAssignment(),
             scatter_dim_const_op, all_reduce.getReduceOp(),
             all_reduce.getDeviceType());
diff --git a/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc b/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc
index 0a7a232290b8a7..e8a2fde042ae62 100644
--- a/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc
+++ b/tensorflow/dtensor/mlir/dtensor_allreduce_sum_optimization.cc
@@ -160,8 +160,8 @@ mlir::LogicalResult OptimizeAllReduceAndSum(mlir::Operation* op,
   mlir::OpBuilder builder(op);
   builder.setInsertionPointAfterValue(op->getResult(0));
   mlir::TF::DTensorAllReduceOp all_reduce =
-      builder.create<mlir::TF::DTensorAllReduceOp>(
-          op->getLoc(), op->getResult(0).getType(), op->getResult(0),
+      mlir::TF::DTensorAllReduceOp::create(
+          builder, op->getLoc(), op->getResult(0).getType(), op->getResult(0),
           group_assignment, builder.getStringAttr(std::string(kReduceOpAdd)),
           builder.getStringAttr(first_reduction_op.getDeviceType()));
 
@@ -394,8 +394,8 @@ mlir::LogicalResult ExtractAllReduceFromWhileOp(
 
   // Create a singe reduction operation that reduces the result of the locally
   // added tensor.
-  auto new_all_reduce = builder.create<mlir::TF::DTensorAllReduceOp>(
-      all_reduce.getLoc(), while_output.getType(), while_output,
+  auto new_all_reduce = mlir::TF::DTensorAllReduceOp::create(
+      builder, all_reduce.getLoc(), while_output.getType(), while_output,
       cloned_group_assignment->getResult(0),
       builder.getStringAttr(std::string(kReduceOpAdd)),
       builder.getStringAttr(all_reduce.getDeviceType()));
diff --git a/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc b/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc
index 6cc0a14cb1eefd..457cec03a0e1ca 100644
--- a/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc
+++ b/tensorflow/dtensor/mlir/dtensor_layout_to_xla_sharding_op.cc
@@ -112,8 +112,8 @@ void DTensorLayoutToXlaShardingOpPass::runOnOperation() {
         // the V1 sharding attr, so set V2 sharding to "" here. It may be better
         // to set the V2 sharding attr here and then removed it when V1 is
         // removed.
-        auto sharding_op = builder.create<mlir::TF::XlaShardingOp>(
-            layout_op.getLoc(), layout_op.getOutput().getType(),
+        auto sharding_op = mlir::TF::XlaShardingOp::create(
+            builder, layout_op.getLoc(), layout_op.getOutput().getType(),
             layout_op.getInput(),
             /*sharding=*/builder.getStringAttr(""),  // Not used by tf2xla.
             /*_xlaSharding=*/sharding_attr,
diff --git a/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc b/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc
index b722e1bba45e0d..c0f066483451fe 100644
--- a/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc
+++ b/tensorflow/dtensor/mlir/dtensor_mixed_precision_reduce.cc
@@ -98,16 +98,16 @@ mlir::LogicalResult MaybeUpcastForReduction(ReduceOpType reduce_op,
   const mlir::RankedTensorType& output_type =
       mlir::dyn_cast<mlir::RankedTensorType>(reduce_op.getOutput().getType());
 
-  mlir::TF::CastOp upcast = builder.create<mlir::TF::CastOp>(
-      loc,
+  mlir::TF::CastOp upcast = mlir::TF::CastOp::create(
+      builder, loc,
       mlir::RankedTensorType::get(input_type.getShape(), builder.getF32Type()),
       reduce_op.getInput());
   reduce_op->setOperand(0, upcast.getY());
   reduce_op.getOutput().setType(upcast.getY().getType());
 
   builder.setInsertionPointAfter(reduce_op);
-  mlir::TF::CastOp downcast = builder.create<mlir::TF::CastOp>(
-      loc,
+  mlir::TF::CastOp downcast = mlir::TF::CastOp::create(
+      builder, loc,
       mlir::RankedTensorType::get(output_type.getShape(),
                                   output_type.getElementType()),
       reduce_op);
diff --git a/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc b/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc
index d5e957e19050d2..1b320bcfc100ab 100644
--- a/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc
+++ b/tensorflow/dtensor/mlir/dtensor_replace_relayout_with_identity.cc
@@ -37,9 +37,9 @@ class DTensorReplaceRelayoutWithIdentityPass
       mlir::OpBuilder builder(relayout_op);
       // Inserts an IdentityOp at the position of the relayout_op with the same
       // attributes as the relayout_op.
-      auto new_identity = builder.create<mlir::TF::IdentityOp>(
-          relayout_op->getLoc(), relayout_op.getType(), relayout_op.getInput(),
-          relayout_op->getAttrs());
+      auto new_identity = mlir::TF::IdentityOp::create(
+          builder, relayout_op->getLoc(), relayout_op.getType(),
+          relayout_op.getInput(), relayout_op->getAttrs());
       relayout_op.getOutput().replaceAllUsesWith(new_identity.getOutput());
       relayout_op.erase();
     });
diff --git a/tensorflow/dtensor/mlir/dtensor_send_recv.cc b/tensorflow/dtensor/mlir/dtensor_send_recv.cc
index c728725dbaf073..fa6d2bd041189f 100644
--- a/tensorflow/dtensor/mlir/dtensor_send_recv.cc
+++ b/tensorflow/dtensor/mlir/dtensor_send_recv.cc
@@ -85,8 +85,8 @@ mlir::Value GetOrCreateCompilationKey(mlir::Operation* op) {
   auto result_type =
       mlir::RankedTensorType::get({3}, builder.getType<mlir::TF::StringType>());
   auto new_compilation_key =
-      builder.create<mlir::TF::_XlaCompileMlirPlaceholderProgramKeyOp>(
-          cluster.getLoc(), /*program=*/result_type,
+      mlir::TF::_XlaCompileMlirPlaceholderProgramKeyOp::create(
+          builder, cluster.getLoc(), /*program=*/result_type,
           llvm::ArrayRef<mlir::Value>{});
   return new_compilation_key.getProgram();
 }
@@ -107,8 +107,8 @@ StatusOr<mlir::Value> GetDeviceOrdinal(const Mesh& mesh,
   }
   // Slice out the device ordinal using the device ID as index.
   TF_ASSIGN_OR_RETURN(mlir::Value device_id, DeviceId(function));
-  mlir::TF::SliceOp device_ordinal = builder->create<mlir::TF::SliceOp>(
-      loc,
+  mlir::TF::SliceOp device_ordinal = mlir::TF::SliceOp::create(
+      *builder, loc,
       /*output=*/EffectivelyScalarR1Type(builder->getIntegerType(32)),
       /*input=*/IntConst(*builder, loc, device_id_to_ordinal),
       /*begin=*/
@@ -118,8 +118,8 @@ StatusOr<mlir::Value> GetDeviceOrdinal(const Mesh& mesh,
   mlir::Value device_ordinal_scalar =
       ReshapeSizeTypeToScalar(*builder, loc, device_ordinal);
   if (return_int64_type) {
-    device_ordinal_scalar = builder->create<mlir::TF::CastOp>(
-        loc, mlir::RankedTensorType::get({}, builder->getI64Type()),
+    device_ordinal_scalar = mlir::TF::CastOp::create(
+        *builder, loc, mlir::RankedTensorType::get({}, builder->getI64Type()),
         device_ordinal_scalar);
   }
   return device_ordinal_scalar;
@@ -138,8 +138,8 @@ StatusOr<mlir::Operation*> LowerDTensorSendToTFOp(
   absl::Span<const std::string> receiving_devices = target_mesh.local_devices();
 
   mlir::Operation* lowered_send_op;
-  lowered_send_op = builder.create<mlir::TF::_HostSendOp>(
-      send_input.getLoc(), send_input, tensor_name, sending_devices[0],
+  lowered_send_op = mlir::TF::_HostSendOp::create(
+      builder, send_input.getLoc(), send_input, tensor_name, sending_devices[0],
       /*send_device_incarnation=*/0, receiving_devices[0],
       /*client_terminated=*/false);
 
@@ -184,12 +184,13 @@ StatusOr<mlir::Operation*> LowerDTensorSendToXlaOp(
           GetDeviceOrdinal(send_input_layout.mesh(), loc, send_func, &builder));
     }
     // Create XlaSendFromHostV2 op
-    lowered_send_op = builder.create<mlir::TF::_XlaSendFromHostV2Op>(
-        loc, value_to_send, program_key, device_ordinal, dtensor_send.getKey());
+    lowered_send_op = mlir::TF::_XlaSendFromHostV2Op::create(
+        builder, loc, value_to_send, program_key, device_ordinal,
+        dtensor_send.getKey());
   } else {
     // Note that for ops running in XLA/TPU, device ordinal input is not needed.
-    lowered_send_op = builder.create<mlir::TF::XlaSendToHostOp>(
-        loc, send_input, dtensor_send.getKey());
+    lowered_send_op = mlir::TF::XlaSendToHostOp::create(
+        builder, loc, send_input, dtensor_send.getKey());
   }
 
   dtensor_send.erase();
@@ -246,16 +247,16 @@ StatusOr<mlir::Operation*> LowerDTensorRecvToXlaOp(
 
     auto program_key = GetOrCreateCompilationKey(dtensor_recv);
     builder.setInsertionPoint(dtensor_recv);
-    recv_xla_op = builder.create<mlir::TF::_XlaRecvAtHostV2Op>(
-        dtensor_recv.getLoc(), output_types,
+    recv_xla_op = mlir::TF::_XlaRecvAtHostV2Op::create(
+        builder, dtensor_recv.getLoc(), output_types,
         /*dynamic_key=*/program_key, device_ordinal, dtensor_recv.getKeyAttr());
   } else {
     TF_ASSIGN_OR_RETURN(auto local_shape_attr,
                         GetDTensorRecvLocalShapeAttr(dtensor_recv));
 
     // Create XlaRecvFromHost op.
-    recv_xla_op = builder.create<mlir::TF::XlaRecvFromHostOp>(
-        dtensor_recv.getLoc(), output_type, local_shape_attr,
+    recv_xla_op = mlir::TF::XlaRecvFromHostOp::create(
+        builder, dtensor_recv.getLoc(), output_type, local_shape_attr,
         dtensor_recv.getKeyAttr());
   }
 
@@ -299,8 +300,8 @@ StatusOr<mlir::Operation*> LowerDTensorSendFromCPUToTFOp(
 
   mlir::Operation* lowered_send_op;
   for (size_t i = 0; i < receiving_devices.size(); ++i)
-    lowered_send_op = builder.create<mlir::TF::_HostSendOp>(
-        send_input.getLoc(), dtensor_send.getInput(), tensor_name,
+    lowered_send_op = mlir::TF::_HostSendOp::create(
+        builder, send_input.getLoc(), dtensor_send.getInput(), tensor_name,
         sending_devices[0],
         /*send_device_incarnation=*/0, receiving_devices[i]);
 
@@ -326,8 +327,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecvFromCPUToTFOp(
   mlir::Operation* lowered_recv_op;
   mlir::Location loc = dtensor_recv.getLoc();
   for (size_t i = 0; i < receiving_devices.size(); ++i)
-    lowered_recv_op = builder.create<mlir::TF::_HostRecvOp>(
-        loc, dtensor_recv.getType(), tensor_name, sending_devices[0],
+    lowered_recv_op = mlir::TF::_HostRecvOp::create(
+        builder, loc, dtensor_recv.getType(), tensor_name, sending_devices[0],
         /*send_device_incarnation=*/0, receiving_devices[i]);
 
   // Replace dtensor_recv with newly created recv op and remove DTensorRecv op.
@@ -351,8 +352,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecvToTFOp(
   absl::Span<const std::string> receiving_devices = recv_mesh.local_devices();
 
   mlir::Location loc = dtensor_recv.getLoc();
-  mlir::Operation* lowered_recv_op = builder.create<mlir::TF::_HostRecvOp>(
-      loc, output_type, tensor_name, sending_devices[0],
+  mlir::Operation* lowered_recv_op = mlir::TF::_HostRecvOp::create(
+      builder, loc, output_type, tensor_name, sending_devices[0],
       /*send_device_incarnation=*/0, receiving_devices[0]);
 
   return lowered_recv_op;
@@ -385,7 +386,7 @@ llvm::SmallVector<mlir::Attribute, 4> GenerateBranches(
                                   ? func_op.getArgument(0)
                                   : mlir::BlockArgument{};
     auto branch_op = fn(fn_builder, location, arg, it.value());
-    fn_builder.create<mlir::func::ReturnOp>(location, branch_op->getResults());
+    mlir::func::ReturnOp::create(fn_builder, location, branch_op->getResults());
 
     branches.push_back(mlir::SymbolRefAttr::get(func_op));
   }
@@ -429,25 +430,24 @@ StatusOr<mlir::Operation*> LowerOneToOneDTensorSendToTFHostSend(
         mlir::Value val = arg;
         if (i32_copy) {
           auto val_type = mlir::cast<mlir::TensorType>(val.getType());
-          val = op_builder
-                    .create<mlir::TF::CastOp>(
-                        loc,
-                        mlir::RankedTensorType::get(
-                            val_type.getShape(), op_builder.getIntegerType(64)),
-                        val)
+          val = mlir::TF::CastOp::create(
+                    op_builder, loc,
+                    mlir::RankedTensorType::get(val_type.getShape(),
+                                                op_builder.getIntegerType(64)),
+                    val)
                     ->getResult(0);
         }
-        return op_builder.create<mlir::TF::_HostSendOp>(
-            loc, val, tensor_name, std::get<0>(device_pair),
+        return mlir::TF::_HostSendOp::create(
+            op_builder, loc, val, tensor_name, std::get<0>(device_pair),
             /*send_device_incarnation=*/0, std::get<1>(device_pair));
       });
-  mlir::Operation* case_op = builder.create<mlir::TF::CaseOp>(
-      dtensor_send.getLoc(),
-      /*output=*/llvm::ArrayRef<mlir::Type>{},
-      /*branch_index=*/device_ordinal,
-      /*input=*/dtensor_send->getOperands(),
-      /*branches=*/builder.getArrayAttr(branches),
-      /*is_stateless=*/builder.getBoolAttr(false));
+  mlir::Operation* case_op =
+      mlir::TF::CaseOp::create(builder, dtensor_send.getLoc(),
+                               /*output=*/llvm::ArrayRef<mlir::Type>{},
+                               /*branch_index=*/device_ordinal,
+                               /*input=*/dtensor_send->getOperands(),
+                               /*branches=*/builder.getArrayAttr(branches),
+                               /*is_stateless=*/builder.getBoolAttr(false));
 
   // erase the send op here iff targeting a gpu
   // otherwise there will be 'op not within cluster' error(s)
@@ -494,14 +494,15 @@ StatusOr<mlir::Operation*> LowerOneToOneDTensorRecvToTFHostRecv(
       "{0}_receive_{1}_{2}", device_pairs,
       [&](mlir::OpBuilder& op_builder, auto& loc, auto _,
           auto device_pair) -> mlir::Operation* {
-        auto recv_op = op_builder.create<mlir::TF::_HostRecvOp>(
-            loc, local_output_type, tensor_name, std::get<0>(device_pair),
+        auto recv_op = mlir::TF::_HostRecvOp::create(
+            op_builder, loc, local_output_type, tensor_name,
+            std::get<0>(device_pair),
             /*send_device_incarnation=*/0, std::get<1>(device_pair));
         SetSingleLayoutOnOp(recv_op, recv_layout);
         return recv_op;
       });
-  mlir::Operation* case_op = builder.create<mlir::TF::CaseOp>(
-      dtensor_recv.getLoc(),
+  mlir::Operation* case_op = mlir::TF::CaseOp::create(
+      builder, dtensor_recv.getLoc(),
       /*output=*/llvm::ArrayRef<mlir::Type>{local_output_type},
       /*branch_index=*/device_ordinal,
       /*input=*/dtensor_recv->getOperands(),
@@ -510,8 +511,8 @@ StatusOr<mlir::Operation*> LowerOneToOneDTensorRecvToTFHostRecv(
 
   mlir::Operation* lowered_recv;
   if (i32_copy) {
-    lowered_recv = builder.create<mlir::TF::CastOp>(
-        dtensor_recv.getLoc(), local_recv_type, case_op->getResult(0));
+    lowered_recv = mlir::TF::CastOp::create(
+        builder, dtensor_recv.getLoc(), local_recv_type, case_op->getResult(0));
   } else {
     lowered_recv = case_op;
   }
@@ -639,12 +640,12 @@ StatusOr<mlir::Operation*> LowerDTensorSend(mlir::Operation* send_op,
         GetDeviceOrdinal(*mesh, loc,
                          send_cluster->getParentOfType<mlir::func::FuncOp>(),
                          &builder));
-    mlir::Value predicate = builder.create<mlir::TF::EqualOp>(
-        loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
+    mlir::Value predicate = mlir::TF::EqualOp::create(
+        builder, loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
         /*incompatible_shape_error=*/builder.getBoolAttr(true));
 
-    auto send_if = builder.create<mlir::TF::IfRegionOp>(
-        loc, llvm::SmallVector<mlir::Type, 4>{}, predicate,
+    auto send_if = mlir::TF::IfRegionOp::create(
+        builder, loc, llvm::SmallVector<mlir::Type, 4>{}, predicate,
         /*is_stateless=*/builder.getBoolAttr(true),
         GetUniqueControlflowFnName("copy_to_mesh_send_if_then", builder),
         GetUniqueControlflowFnName("copy_to_mesh_send_if_else", builder));
@@ -653,16 +654,15 @@ StatusOr<mlir::Operation*> LowerDTensorSend(mlir::Operation* send_op,
     auto& else_branch = send_if.getElseBranch();
     else_branch.push_back(new mlir::Block);
     builder.setInsertionPointToEnd(&else_branch.front());
-    builder.create<mlir::TF::YieldOp>(
-        loc,
-        /*operands=*/llvm::ArrayRef<mlir::Value>{});
+    mlir::TF::YieldOp::create(builder, loc,
+                              /*operands=*/llvm::ArrayRef<mlir::Value>{});
 
     // Create then branch region with DTensorSend op.
     auto& then_branch = send_if.getThenBranch();
     then_branch.push_back(new mlir::Block);
     builder.setInsertionPointToEnd(&then_branch.front());
-    auto yield = builder.create<mlir::TF::YieldOp>(
-        loc, /*operands=*/llvm::ArrayRef<mlir::Value>{});
+    auto yield = mlir::TF::YieldOp::create(
+        builder, loc, /*operands=*/llvm::ArrayRef<mlir::Value>{});
     dtensor_send->moveBefore(yield);
 
     // Lower DTensorSend op to actual TF op.
@@ -684,8 +684,8 @@ StatusOr<mlir::Operation*> LowerDTensorSend(mlir::Operation* send_op,
       if (!recv_mesh.is_cpu_mesh() &&
           send_type.getElementType().isInteger(32)) {
         builder.setInsertionPointAfter(send_input.getDefiningOp());
-        auto cast_to_int64 = builder.create<mlir::TF::CastOp>(
-            send_input.getLoc(),
+        auto cast_to_int64 = mlir::TF::CastOp::create(
+            builder, send_input.getLoc(),
             mlir::RankedTensorType::get(send_type.getShape(),
                                         builder.getIntegerType(64)),
             send_input);
@@ -781,8 +781,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
         GetDeviceOrdinal(recv_mesh, loc,
                          recv_cluster->getParentOfType<mlir::func::FuncOp>(),
                          &builder));
-    mlir::Value predicate = builder.create<mlir::TF::EqualOp>(
-        loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
+    mlir::Value predicate = mlir::TF::EqualOp::create(
+        builder, loc, device_ordinal, CreateIntScalarConst(0, builder, loc),
         /*incompatible_shape_error=*/builder.getBoolAttr(true));
 
     mlir::TensorType recv_type = dtensor_recv.getType();
@@ -795,8 +795,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
                                           builder.getIntegerType(64))
             : recv_type;
 
-    auto recv_if = builder.create<mlir::TF::IfRegionOp>(
-        loc, llvm::SmallVector<mlir::Type, 4>{output_type}, predicate,
+    auto recv_if = mlir::TF::IfRegionOp::create(
+        builder, loc, llvm::SmallVector<mlir::Type, 4>{output_type}, predicate,
         /*is_stateless=*/builder.getBoolAttr(true),
         GetUniqueControlflowFnName("copy_to_mesh_recv_if_then", builder),
         GetUniqueControlflowFnName("copy_to_mesh_recv_if_else", builder));
@@ -831,9 +831,9 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
       return absl::InvalidArgumentError("unsupported output type");
     }
 
-    mlir::Value zeros = builder.create<mlir::TF::ConstOp>(loc, const_attr);
-    builder.create<mlir::TF::YieldOp>(
-        loc, /*operands=*/llvm::ArrayRef<mlir::Value>{zeros});
+    mlir::Value zeros = mlir::TF::ConstOp::create(builder, loc, const_attr);
+    mlir::TF::YieldOp::create(builder, loc,
+                              /*operands=*/llvm::ArrayRef<mlir::Value>{zeros});
 
     // Create then branch region with DTensorRecv op.
     auto& then_branch = recv_if.getThenBranch();
@@ -843,8 +843,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
 
     TF_ASSIGN_OR_RETURN(mlir::Operation * xla_recv,
                         lower_fn(send_mesh, dtensor_recv, output_type));
-    builder.create<mlir::TF::YieldOp>(
-        loc,
+    mlir::TF::YieldOp::create(
+        builder, loc,
         /*operands=*/llvm::ArrayRef<mlir::Value>{xla_recv->getResult(0)});
 
     // Broadcast the received output to all GPU/TPU devices.
@@ -859,8 +859,8 @@ StatusOr<mlir::Operation*> LowerDTensorRecv(mlir::Operation* send_op,
                                     kReduceOpAdd));
 
     if (need_i32_to_i64_upcast) {
-      lowered_recv = builder.create<mlir::TF::CastOp>(
-          loc, recv_type, lowered_recv->getResult(0));
+      lowered_recv = mlir::TF::CastOp::create(builder, loc, recv_type,
+                                              lowered_recv->getResult(0));
     }
 
     // Replaces usages of DTensorRecv op with the broadcasted value.
diff --git a/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc
index e0bbc25792cd66..10b6296d5638b6 100644
--- a/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/argmax_spmd_expander.cc
@@ -106,9 +106,9 @@ StatusOr<mlir::Operation*> ArgMaxSPMDExpander::ExpandOp(mlir::Operation* op) {
     }
   }
 
-  auto new_argmax = builder.create<mlir::TF::ArgMaxOp>(
-      argmax_op.getLoc(), argmax_op.getResult().getType(), input,
-      argmax_op.getDimension());
+  auto new_argmax = mlir::TF::ArgMaxOp::create(builder, argmax_op.getLoc(),
+                                               argmax_op.getResult().getType(),
+                                               input, argmax_op.getDimension());
   op->getResult(0).replaceAllUsesWith(new_argmax.getOutput());
   op->erase();
 
diff --git a/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc
index f53f3b2a188945..6fb9cb790910ed 100644
--- a/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/dataparallel_spmd_expander.cc
@@ -257,8 +257,8 @@ StatusOr<mlir::Operation*> DataparallelSPMDExpander::RelayoutOperandsAndOutputs(
   builder.setInsertionPointAfter(last_op_after_splitting);
 
   // Tie all outputs together with identity_n
-  auto identity_op = builder.create<mlir::TF::IdentityNOp>(
-      op->getLoc(), generated_types, generated_outputs);
+  auto identity_op = mlir::TF::IdentityNOp::create(
+      builder, op->getLoc(), generated_types, generated_outputs);
   newly_created_ops.insert(identity_op);
   for (int i = 0; i < output_layouts.size(); ++i) {
     op->getOpResult(i).replaceAllUsesExcept(identity_op.getResult(i),
diff --git a/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc
index 0242ebbb0544e3..7de31a8bb7e5f1 100644
--- a/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/io_op_spmd_expander.cc
@@ -83,8 +83,8 @@ StatusOr<mlir::Operation*> Expand(mlir::Operation* op) {
   mlir::Block* then_fn_block = then_func.addEntryBlock();
   mlir::OpBuilder then_fn_builder =
       mlir::OpBuilder::atBlockBegin(then_fn_block);
-  then_fn_builder.create<mlir::TF::NoOp>(location);
-  then_fn_builder.create<mlir::func::ReturnOp>(location);
+  mlir::TF::NoOp::create(then_fn_builder, location);
+  mlir::func::ReturnOp::create(then_fn_builder, location);
 
   // Build else_func that is the branch of device_id == 0.
   // The else func is just the original op.
@@ -100,9 +100,9 @@ StatusOr<mlir::Operation*> Expand(mlir::Operation* op) {
   mlir::OpBuilder else_fn_builder =
       mlir::OpBuilder::atBlockBegin(else_fn_block);
 
-  else_fn_builder.create<T>(location, op->getResultTypes(),
-                            else_fn_block->getArguments());
-  else_fn_builder.create<mlir::func::ReturnOp>(location);
+  T::create(else_fn_builder, location, op->getResultTypes(),
+            else_fn_block->getArguments());
+  mlir::func::ReturnOp::create(else_fn_builder, location);
 
   symbol_table.insert(then_func);
   symbol_table.insert(else_func);
@@ -115,12 +115,12 @@ StatusOr<mlir::Operation*> Expand(mlir::Operation* op) {
           builder, location,
           mlir::cast<mlir::TensorType>(device_id.getType()).getElementType()));
 
-  mlir::TF::NotEqualOp not_equal = builder.create<mlir::TF::NotEqualOp>(
-      location, device_id, zero_scalar,
+  mlir::TF::NotEqualOp not_equal = mlir::TF::NotEqualOp::create(
+      builder, location, device_id, zero_scalar,
       /*incompatible_shape_error=*/builder.getBoolAttr(false));
 
-  mlir::Operation* if_op = builder.create<mlir::TF::IfOp>(
-      location, then_func.getFunctionType().getResults(),
+  mlir::Operation* if_op = mlir::TF::IfOp::create(
+      builder, location, then_func.getFunctionType().getResults(),
       /*cond=*/not_equal.getResult(),
       /*input=*/op->getOperands(),
       /*then_branch=*/then_func.getSymName(),
diff --git a/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc
index 8a442d727aa19f..0bd4da477d2205 100644
--- a/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/iterator_spmd_expander.cc
@@ -62,8 +62,8 @@ StatusOr<mlir::Operation*> IteratorGetNextSPMDExpander::ExpandOp(
         local_shape, global_output_type.getElementType());
   }
 
-  auto new_op = builder.create<mlir::TF::IteratorGetNextOp>(
-      DT_LOC(op->getLoc()), local_types, original_op->getOperand(0));
+  auto new_op = mlir::TF::IteratorGetNextOp::create(
+      builder, DT_LOC(op->getLoc()), local_types, original_op->getOperand(0));
 
   for (int i = 0; i < original_op->getNumResults(); ++i) {
     original_op.getResult(i).replaceAllUsesWith(new_op.getResult(i));
diff --git a/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc
index e18d3edf44d913..b2d6ca37777281 100644
--- a/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/meta_spmd_expander.cc
@@ -791,7 +791,7 @@ StatusOr<mlir::Operation*> ReshapeSPMDExpander::ExpandOp(mlir::Operation* op) {
   auto const_attr =
       mlir::DenseIntElementsAttr::get(new_shape, local_reshape_const);
   auto new_reshape_const_op =
-      builder.create<mlir::TF::ConstOp>(DT_LOC(op), const_attr);
+      mlir::TF::ConstOp::create(builder, DT_LOC(op), const_attr);
   mlir::TF::ReshapeOp new_reshape_op = mlir::TF::ReshapeOp::create(
       builder, op->getLoc(), new_input, new_reshape_const_op);
 
diff --git a/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc
index 3c146a06a48558..a45a2df40a32e4 100644
--- a/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/optional_spmd_expander.cc
@@ -54,8 +54,8 @@ StatusOr<mlir::Operation*> OptionalGetValueSPMDExpander::ExpandOp(
     local_types[i] = local_type;
   }
 
-  auto new_op = builder.create<mlir::TF::OptionalGetValueOp>(
-      DT_LOC(op->getLoc()), local_types, original_op->getOperand(0));
+  auto new_op = mlir::TF::OptionalGetValueOp::create(
+      builder, DT_LOC(op->getLoc()), local_types, original_op->getOperand(0));
 
   for (int i = 0; i < original_op->getNumResults(); ++i) {
     original_op.getResult(i).replaceAllUsesWith(new_op.getResult(i));
diff --git a/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc
index b6e1c316cef6f2..6175e133710f7a 100644
--- a/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/random_op_spmd_expander.cc
@@ -151,26 +151,26 @@ StatusOr<mlir::Value> GetDeviceSeed(const Layout& layout, mlir::Operation* op) {
   mlir::Attribute const_attr =
       mlir::DenseIntElementsAttr::get(const_type, multipliers);
   mlir::Value multiplier =
-      builder.create<mlir::TF::ConstOp>(cluster.getLoc(), const_attr)
+      mlir::TF::ConstOp::create(builder, cluster.getLoc(), const_attr)
           .getOutput();
 
   const mlir::RankedTensorType one_by_one =
       mlir::RankedTensorType::get({1, 1}, builder.getIntegerType(32));
 
-  mlir::Value seed = builder.create<mlir::TF::MatMulOp>(
-      cluster.getLoc(), one_by_one, mesh_coordinates, multiplier);
+  mlir::Value seed = mlir::TF::MatMulOp::create(
+      builder, cluster.getLoc(), one_by_one, mesh_coordinates, multiplier);
 
   // Largest prime in 16 bits.
   mlir::Value prime = CreateIntScalarConst(
       /*value=*/65521, builder, cluster.getLoc(), /*use_int64=*/false);
 
   mlir::Value seed_plus_prime =
-      builder
-          .create<mlir::TF::AddV2Op>(cluster.getLoc(), one_by_one, seed, prime)
+      mlir::TF::AddV2Op::create(builder, cluster.getLoc(), one_by_one, seed,
+                                prime)
           .getZ();
 
-  mlir::TF::SqueezeOp squeeze = builder.create<mlir::TF::SqueezeOp>(
-      cluster.getLoc(),
+  mlir::TF::SqueezeOp squeeze = mlir::TF::SqueezeOp::create(
+      builder, cluster.getLoc(),
       mlir::RankedTensorType::get({}, builder.getIntegerType(32)),
       seed_plus_prime, builder.getI64ArrayAttr({0, 1}));
 
@@ -207,11 +207,12 @@ StatusOr<mlir::Value> ComputeNewSeed(mlir::OpBuilder& builder,
   mlir::Type seed_type =
       mlir::cast<mlir::TensorType>(op_seed.getType()).getElementType();
 
-  device_id_seed = builder.create<mlir::TF::CastOp>(
-      location, mlir::RankedTensorType::get({}, seed_type), device_id_seed);
+  device_id_seed = mlir::TF::CastOp::create(
+      builder, location, mlir::RankedTensorType::get({}, seed_type),
+      device_id_seed);
 
-  mlir::Value seed_xor =
-      builder.create<mlir::TF::BitwiseXorOp>(location, op_seed, device_id_seed);
+  mlir::Value seed_xor = mlir::TF::BitwiseXorOp::create(
+      builder, location, op_seed, device_id_seed);
   return seed_xor;
 }
 
@@ -240,8 +241,8 @@ StatusOr<mlir::Operation*> CreatedShardedLocalRandomOpV1(const Layout& layout,
 
   auto new_shape_value = Int64Const(builder, location, new_random_shape);
   // TODO(zhonglinhan) : check different input for StatelessRandomUniformInt
-  auto local_random = builder.create<RandomOp>(location, new_random_type,
-                                               new_shape_value, seed_xor);
+  auto local_random = RandomOp::create(builder, location, new_random_type,
+                                       new_shape_value, seed_xor);
   op->getResult(0).replaceAllUsesWith(local_random.getOutput());
   op->erase();
   return local_random.getOperation();
@@ -272,9 +273,9 @@ StatusOr<mlir::Operation*> CreatedShardedLocalRandomOpV2(const Layout& layout,
 
   auto new_shape_value = Int64Const(builder, location, new_random_shape);
 
-  auto local_random = builder.create<RandomOp>(
-      location, new_random_type, new_shape_value, seed_xor,
-      random_op.getCounter(), random_op.getAlg());
+  auto local_random =
+      RandomOp::create(builder, location, new_random_type, new_shape_value,
+                       seed_xor, random_op.getCounter(), random_op.getAlg());
   op->getResult(0).replaceAllUsesWith(local_random.getOutput());
   op->erase();
   return local_random.getOperation();
@@ -305,10 +306,10 @@ StatusOr<mlir::Operation*> CreatedShardedLocalRandomOpV2Range(
 
   auto new_shape_value = Int64Const(builder, location, new_random_shape);
 
-  auto local_random = builder.create<RandomOp>(
-      location, new_random_type, new_shape_value, seed_xor,
-      random_op.getCounter(), random_op.getAlg(), random_op.getMinval(),
-      random_op.getMaxval());
+  auto local_random =
+      RandomOp::create(builder, location, new_random_type, new_shape_value,
+                       seed_xor, random_op.getCounter(), random_op.getAlg(),
+                       random_op.getMinval(), random_op.getMaxval());
   op->getResult(0).replaceAllUsesWith(local_random.getOutput());
   op->erase();
   return local_random.getOperation();
diff --git a/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc
index feb5b9eda74a01..f55d62efa81501 100644
--- a/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/replicated_spmd_expander.cc
@@ -82,8 +82,8 @@ ReplicatedOpSPMDExpander::ReplicatedRelayoutOperandsAndOutputs(
   builder.setInsertionPointAfter(last_op_after_splitting);
 
   // Tie all outputs together with identity_n
-  auto identity_op = builder.create<mlir::TF::IdentityNOp>(
-      op->getLoc(), generated_types, generated_outputs);
+  auto identity_op = mlir::TF::IdentityNOp::create(
+      builder, op->getLoc(), generated_types, generated_outputs);
   newly_created_ops.insert(identity_op);
   for (int i = 0; i < output_layouts.size(); ++i) {
     op->getOpResult(i).replaceAllUsesExcept(identity_op.getResult(i),
diff --git a/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc
index c0aa768d9a5d03..c2fc958965ec33 100644
--- a/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/segmentation_spmd_expander.cc
@@ -122,9 +122,9 @@ StatusOr<mlir::Operation*> UnsortedSegmentSumSPMDExpander::ExpandOp(
       EmitRelayout(segment_ids, segment_ids_layout, new_segment_ids_layout));
 
   mlir::OpBuilder builder(op);
-  mlir::Operation* new_sum_op = builder.create<mlir::TF::UnsortedSegmentSumOp>(
-      op->getLoc(), sum_op.getOutput().getType(), data, new_segment_ids,
-      sum_op.getNumSegments());
+  mlir::Operation* new_sum_op = mlir::TF::UnsortedSegmentSumOp::create(
+      builder, op->getLoc(), sum_op.getOutput().getType(), data,
+      new_segment_ids, sum_op.getNumSegments());
 
   InferSPMDExpandedLocalShape(new_sum_op);
 
diff --git a/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc
index fc082290109260..4cf10413879cbf 100644
--- a/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/slice_spmd_expander.cc
@@ -187,11 +187,10 @@ StatusOr<mlir::Operation*> SliceSPMDExpander::ExpandOp(mlir::Operation* op) {
   else
     new_size = Int64Const(builder, loc, sizes);
 
-  auto new_op = builder
-                    .create<mlir::TF::SliceOp>(
-                        loc, slice_op.getOutput().getType(), relayout_input,
-                        slice_op.getBegin(), new_size)
-                    .getOperation();
+  auto new_op =
+      mlir::TF::SliceOp::create(builder, loc, slice_op.getOutput().getType(),
+                                relayout_input, slice_op.getBegin(), new_size)
+          .getOperation();
   new_op = InferSPMDExpandedLocalShape(new_op);
 
   TF_ASSIGN_OR_RETURN(auto relayout_output,
diff --git a/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc
index 196e3702b1c843..62fc9413e78307 100644
--- a/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/softmax_spmd_expander.cc
@@ -70,12 +70,12 @@ StatusOr<mlir::Value> ComputeGlobalReduce(
 
   // First compute a local reduce
   if (reduce_op == kReduceOpAdd) {
-    local_reduce = builder.create<mlir::TF::SumOp>(
-        input.getLoc(), input, reduction_indices,
+    local_reduce = mlir::TF::SumOp::create(
+        builder, input.getLoc(), input, reduction_indices,
         /*keep_dims=*/builder.getBoolAttr(true));
   } else if (reduce_op == kReduceOpMax) {
-    local_reduce = builder.create<mlir::TF::MaxOp>(
-        input.getLoc(), input, reduction_indices,
+    local_reduce = mlir::TF::MaxOp::create(
+        builder, input.getLoc(), input, reduction_indices,
         /*keep_dims=*/builder.getBoolAttr(true));
   } else {
     return errors::Unimplemented("reduction ", reduce_op, " not implemented");
@@ -107,8 +107,8 @@ StatusOr<mlir::Value> ComputeGlobalReduce(
     // dimension attribute type. Everything else is OK with int32_t dimensions.
     std::vector<int64_t> reduce_dim_array_64(reduced_dims.begin(),
                                              reduced_dims.end());
-    global_reduce = builder.create<mlir::TF::SqueezeOp>(
-        input.getLoc(), new_type, global_reduce->getResult(0),
+    global_reduce = mlir::TF::SqueezeOp::create(
+        builder, input.getLoc(), new_type, global_reduce->getResult(0),
         builder.getI64ArrayAttr(reduce_dim_array_64));
   }
   return global_reduce->getResult(0);
@@ -143,9 +143,9 @@ absl::Status ComputeExpAndSum(mlir::OpBuilder& builder,
 
   // Subtract max from local copy of logits.
   shifted_logits =
-      builder.create<mlir::TF::SubOp>(loc, logits, max_logits).getResult();
+      mlir::TF::SubOp::create(builder, loc, logits, max_logits).getResult();
   exp_of_shifted_logits =
-      builder.create<mlir::TF::ExpOp>(loc, shifted_logits).getResult();
+      mlir::TF::ExpOp::create(builder, loc, shifted_logits).getResult();
 
   // Sum the exponential.
   TF_ASSIGN_OR_RETURN(
@@ -162,8 +162,9 @@ mlir::Value ComputeSoftmax(mlir::OpBuilder& builder,
                            const mlir::Value& exp_of_shifted_logits,
                            const mlir::Value& sum_of_exp) {
   // For Softmax, we compute exp(shifted_logits)/sum(exp(shifted_logits))
-  auto softmax = builder.create<mlir::TF::DivOp>(
-      exp_of_shifted_logits.getLoc(), exp_of_shifted_logits, sum_of_exp);
+  auto softmax =
+      mlir::TF::DivOp::create(builder, exp_of_shifted_logits.getLoc(),
+                              exp_of_shifted_logits, sum_of_exp);
   return softmax.getResult();
 }
 
@@ -174,9 +175,9 @@ mlir::Value ComputeLogSoftmax(mlir::OpBuilder& builder,
                               const mlir::Value& sum_of_exp) {
   // For LogSoftmax, we compute shifted_logs - log(sum(exp(shifted_logits)))
   auto log_of_sum =
-      builder.create<mlir::TF::LogOp>(shifted_logits.getLoc(), sum_of_exp);
-  auto log_softmax = builder.create<mlir::TF::SubOp>(
-      shifted_logits.getLoc(), shifted_logits, log_of_sum.getResult());
+      mlir::TF::LogOp::create(builder, shifted_logits.getLoc(), sum_of_exp);
+  auto log_softmax = mlir::TF::SubOp::create(
+      builder, shifted_logits.getLoc(), shifted_logits, log_of_sum.getResult());
   return log_softmax.getResult();
 }
 
@@ -223,12 +224,11 @@ StatusOr<mlir::Value> GetFPConstOfType(mlir::OpBuilder& builder,
                                        const mlir::Value& input, float value) {
   if (mlir::TensorType type =
           mlir::dyn_cast<mlir::TensorType>(input.getType())) {
-    return builder
-        .create<mlir::TF::ConstOp>(
-            input.getLoc(),
-            mlir::DenseFPElementsAttr::get<float>(
-                mlir::RankedTensorType::get({}, type.getElementType()),
-                {value}))
+    return mlir::TF::ConstOp::create(
+               builder, input.getLoc(),
+               mlir::DenseFPElementsAttr::get<float>(
+                   mlir::RankedTensorType::get({}, type.getElementType()),
+                   {value}))
         .getOutput();
   } else {
     return errors::Unimplemented("non tensor type for labels is not supported");
@@ -290,23 +290,23 @@ StatusOr<mlir::Value> ComputeOneHot(mlir::OpBuilder& builder,
 
   // Slice out the [1,1] for mesh_dim_index.
   mlir::Value shard_id =
-      builder
-          .create<mlir::TF::SliceOp>(
-              loc, mlir::RankedTensorType::get({1, 1}, builder.getI32Type()),
-              mesh_coordinates,
-              IntConst(builder, input.getLoc(), {0, mesh_dim_index}),
-              IntConst(builder, input.getLoc(), {1, 1}))
+      mlir::TF::SliceOp::create(
+          builder, loc,
+          mlir::RankedTensorType::get({1, 1}, builder.getI32Type()),
+          mesh_coordinates,
+          IntConst(builder, input.getLoc(), {0, mesh_dim_index}),
+          IntConst(builder, input.getLoc(), {1, 1}))
           .getOutput();
 
-  shard_id = builder
-                 .create<mlir::TF::SqueezeOp>(
-                     loc, mlir::RankedTensorType::get({}, builder.getI32Type()),
-                     shard_id, builder.getI64ArrayAttr({0, 1}))
-                 .getOutput();
+  shard_id =
+      mlir::TF::SqueezeOp::create(
+          builder, loc, mlir::RankedTensorType::get({}, builder.getI32Type()),
+          shard_id, builder.getI64ArrayAttr({0, 1}))
+          .getOutput();
 
   // `new_indices` = `input` - `shard_id` * (classes/num_shards)
   mlir::Value id_offset =
-      builder.create<mlir::TF::MulOp>(loc, shard_id, depth).getZ();
+      mlir::TF::MulOp::create(builder, loc, shard_id, depth).getZ();
 
   // Note that the type of id_offset (int32) may not match the type of input.
   // So we insert a cast in this case.
@@ -314,25 +314,23 @@ StatusOr<mlir::Value> ComputeOneHot(mlir::OpBuilder& builder,
       mlir::dyn_cast<mlir::TensorType>(input.getType());
   if (!input_type) return errors::InvalidArgument("input is not a TensorType");
   if (!input_type.getElementType().isInteger(32))
-    id_offset =
-        builder
-            .create<mlir::TF::CastOp>(
-                loc,
-                mlir::RankedTensorType::get({}, input_type.getElementType()),
-                id_offset)
-            .getY();
+    id_offset = mlir::TF::CastOp::create(builder, loc,
+                                         mlir::RankedTensorType::get(
+                                             {}, input_type.getElementType()),
+                                         id_offset)
+                    .getY();
 
   mlir::Value indices =
-      builder.create<mlir::TF::SubOp>(loc, input, id_offset).getZ();
+      mlir::TF::SubOp::create(builder, loc, input, id_offset).getZ();
 
   TF_ASSIGN_OR_RETURN(mlir::Value on_value,
                       GetFPConstOfType(builder, features, 1.0));
   TF_ASSIGN_OR_RETURN(mlir::Value off_value,
                       GetFPConstOfType(builder, features, 0.0));
 
-  return builder
-      .create<mlir::TF::OneHotOp>(input.getLoc(), indices, depth, on_value,
-                                  off_value, builder.getI64IntegerAttr(1))
+  return mlir::TF::OneHotOp::create(builder, input.getLoc(), indices, depth,
+                                    on_value, off_value,
+                                    builder.getI64IntegerAttr(1))
       .getOutput();
 }
 
@@ -530,7 +528,7 @@ StatusOr<mlir::Operation*> SoftmaxLossOpSPMDExpander::MaybeRelayoutOutputs(
   llvm::SmallVector<mlir::Value, 4> values = {new_loss, new_backprop};
 
   mlir::TF::IdentityNOp identity_op =
-      builder.create<mlir::TF::IdentityNOp>(loss.getLoc(), types, values);
+      mlir::TF::IdentityNOp::create(builder, loss.getLoc(), types, values);
 
   newly_created_ops.insert(identity_op);
 
@@ -627,17 +625,15 @@ StatusOr<mlir::Operation*> SoftmaxLossOpSPMDExpander::ExpandOp(
                       GetFPConstOfType(builder, labels, 0.0));
 
   const mlir::Value is_labels_zero =
-      builder
-          .create<mlir::TF::EqualOp>(op->getLoc(), labels, labels_zero,
-                                     builder.getBoolAttr(true))
+      mlir::TF::EqualOp::create(builder, op->getLoc(), labels, labels_zero,
+                                builder.getBoolAttr(true))
           .getZ();
   const mlir::Value safe_softmax =
-      builder
-          .create<mlir::TF::SelectV2Op>(op->getLoc(), is_labels_zero,
-                                        features_zero, log_softmax)
+      mlir::TF::SelectV2Op::create(builder, op->getLoc(), is_labels_zero,
+                                   features_zero, log_softmax)
           .getOutput();
   const mlir::Value prod =
-      builder.create<mlir::TF::MulOp>(op->getLoc(), labels, safe_softmax)
+      mlir::TF::MulOp::create(builder, op->getLoc(), labels, safe_softmax)
           .getZ();
 
   // Compute the reduce sum
@@ -648,10 +644,10 @@ StatusOr<mlir::Operation*> SoftmaxLossOpSPMDExpander::ExpandOp(
 
   builder.setInsertionPointAfterValue(positive_loss);
   mlir::Value loss =
-      builder.create<mlir::TF::NegOp>(op->getLoc(), positive_loss).getY();
+      mlir::TF::NegOp::create(builder, op->getLoc(), positive_loss).getY();
 
   mlir::Value backprop =
-      builder.create<mlir::TF::SubOp>(op->getLoc(), softmax, labels);
+      mlir::TF::SubOp::create(builder, op->getLoc(), softmax, labels);
 
   return MaybeRelayoutOutputs(op, loss, backprop, internal_layout,
                               output_layouts[0], output_layouts[1]);
diff --git a/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc
index 7dc4ae56d0ed71..8e2fa02dcc9f44 100644
--- a/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/tensorlist_reserve_spmd_expander.cc
@@ -58,10 +58,9 @@ StatusOr<mlir::Operation*> TensorListReserveSPMDExpander::ExpandOp(
               mlir::RankedTensorType::get(local_shape, element_type),
               builder.getContext()));
   mlir::Value new_shape_value = Int64Const(builder, DT_LOC(op), local_shape);
-  mlir::TF::TensorListReserveOp new_op =
-      builder.create<mlir::TF::TensorListReserveOp>(
-          DT_LOC(op), new_output_type, new_shape_value,
-          tensorlist_op.getNumElements());
+  mlir::TF::TensorListReserveOp new_op = mlir::TF::TensorListReserveOp::create(
+      builder, DT_LOC(op), new_output_type, new_shape_value,
+      tensorlist_op.getNumElements());
 
   op->getResult(0).replaceAllUsesWith(new_op.getResult());
   op->erase();
diff --git a/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc b/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc
index a0c137cb83dc4d..f1e3a60f8a2d21 100644
--- a/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc
+++ b/tensorflow/dtensor/mlir/handle_cross_cluster_dependencies.cc
@@ -206,8 +206,8 @@ mlir::LogicalResult HandleCopyToMeshWithinCluster(
       }
     }
     mlir::OpBuilder builder(op);
-    auto identity_op = builder.create<mlir::TF::IdentityOp>(
-        op.getLoc(), input.getType(), input);
+    auto identity_op = mlir::TF::IdentityOp::create(builder, op.getLoc(),
+                                                    input.getType(), input);
     op->getResult(0).replaceAllUsesWith(identity_op.getOutput());
     op->erase();
     return mlir::WalkResult::advance();
@@ -246,8 +246,9 @@ mlir::LogicalResult LowerToSendRecv(mlir::TF::CopyToMeshOp copy_to_mesh,
 
   // Create send op that sends data from input cluster to target cluster.
   const Mesh& target_mesh = mesh_or_status.value();
-  builder.create<mlir::TF::DTensorSend>(
-      copy_to_mesh.getLoc(), value_to_send, builder.getStringAttr(op_key),
+  mlir::TF::DTensorSend::create(
+      builder, copy_to_mesh.getLoc(), value_to_send,
+      builder.getStringAttr(op_key),
       mlir::dtensor::MeshAttr::get(context, target_mesh));
 
   // Create recv op that recvs data from send op.
@@ -258,8 +259,8 @@ mlir::LogicalResult LowerToSendRecv(mlir::TF::CopyToMeshOp copy_to_mesh,
         "CopyToMesh op must have static shape.");
 
   builder.setInsertionPoint(copy_to_mesh);
-  auto recv_op = builder.create<mlir::TF::DTensorRecv>(
-      copy_to_mesh.getLoc(), value_to_send.getType(),
+  auto recv_op = mlir::TF::DTensorRecv::create(
+      builder, copy_to_mesh.getLoc(), value_to_send.getType(),
       builder.getStringAttr(op_key),
       mlir::TF::ShapeAttr::get(context, tensor_type),
       mlir::dtensor::MeshAttr::get(context, target_mesh));
@@ -396,8 +397,9 @@ mlir::LogicalResult InsertCopyToMesh(mlir::tf_device::ClusterOp cluster) {
     if (input_mesh == mesh) continue;
     mlir::OpBuilder builder(op);
 
-    auto new_op = builder.create<mlir::TF::CopyToMeshOp>(
-        op->getLoc(), op->getResult(0).getType(), input, mesh.ToString());
+    auto new_op = mlir::TF::CopyToMeshOp::create(builder, op->getLoc(),
+                                                 op->getResult(0).getType(),
+                                                 input, mesh.ToString());
     op->replaceUsesOfWith(input, new_op.getResult());
   }
   return mlir::success();
diff --git a/tensorflow/dtensor/mlir/layout_propagation_v2.cc b/tensorflow/dtensor/mlir/layout_propagation_v2.cc
index c8dd29135e96ca..49ede9025b4310 100644
--- a/tensorflow/dtensor/mlir/layout_propagation_v2.cc
+++ b/tensorflow/dtensor/mlir/layout_propagation_v2.cc
@@ -733,9 +733,9 @@ mlir::LogicalResult InsertDTensorLayoutOps(
     mlir::Type value_type = GetSubtypeOrSelf(merged_layout.first);
 
     if (auto type = mlir::dyn_cast<mlir::TensorType>(value_type)) {
-      auto layout_op = builder.create<mlir::TF::DTensorLayout>(
-          merged_layout.first.getLoc(), merged_layout.first, layout_attr,
-          mlir::TF::ShapeAttr::get(builder.getContext(), type));
+      auto layout_op = mlir::TF::DTensorLayout::create(
+          builder, merged_layout.first.getLoc(), merged_layout.first,
+          layout_attr, mlir::TF::ShapeAttr::get(builder.getContext(), type));
       llvm::SmallPtrSet<mlir::Operation*, 4> exception{layout_op};
       merged_layout.first.replaceAllUsesExcept(layout_op.getOutput(),
                                                exception);
@@ -1234,30 +1234,26 @@ mlir::LogicalResult InsertRelayoutForWhileLoops(
       mlir::TF::ShapeAttr global_shape = mlir::TF::ShapeAttr::get(
           builder.getContext(),
           mlir::cast<mlir::TensorType>(yield_op->getOperand(i).getType()));
-      mlir::TF::RelayoutOp first_relayout =
-          builder.create<mlir::TF::RelayoutOp>(
-              op.getLoc(), yield_op->getOperand(i).getType(),
-              yield_op->getOperand(i), input_layout.ToString());
-      mlir::TF::DTensorLayout first_layout_op =
-          builder.create<mlir::TF::DTensorLayout>(
-              op.getLoc(), first_relayout.getOutput(),
-              mlir::dtensor::LayoutAttr::get(builder.getContext(),
-                                             input_layout),
-              global_shape);
+      mlir::TF::RelayoutOp first_relayout = mlir::TF::RelayoutOp::create(
+          builder, op.getLoc(), yield_op->getOperand(i).getType(),
+          yield_op->getOperand(i), input_layout.ToString());
+      mlir::TF::DTensorLayout first_layout_op = mlir::TF::DTensorLayout::create(
+          builder, op.getLoc(), first_relayout.getOutput(),
+          mlir::dtensor::LayoutAttr::get(builder.getContext(), input_layout),
+          global_shape);
       yield_op->setOperand(i, first_layout_op.getOutput());
 
       // Insert the second relayout op after the loop itself.
       builder.setInsertionPointAfter(op);
       mlir::TF::DTensorLayout second_layout_op =
-          builder.create<mlir::TF::DTensorLayout>(
-              op.getLoc(), op->getResult(i),
+          mlir::TF::DTensorLayout::create(
+              builder, op.getLoc(), op->getResult(i),
               mlir::dtensor::LayoutAttr::get(builder.getContext(),
                                              input_layout),
               global_shape);
-      mlir::TF::RelayoutOp second_relayout =
-          builder.create<mlir::TF::RelayoutOp>(
-              op.getLoc(), second_layout_op.getOutput().getType(),
-              second_layout_op.getOutput(), output_layout.ToString());
+      mlir::TF::RelayoutOp second_relayout = mlir::TF::RelayoutOp::create(
+          builder, op.getLoc(), second_layout_op.getOutput().getType(),
+          second_layout_op.getOutput(), output_layout.ToString());
       op->getResult(i).replaceAllUsesExcept(
           second_relayout.getOutput(), llvm::SmallPtrSet<mlir::Operation*, 1>{
                                            second_layout_op.getOperation()});
diff --git a/tensorflow/dtensor/mlir/lower_send_recv.cc b/tensorflow/dtensor/mlir/lower_send_recv.cc
index 0cbcdd61abd7c4..142932afbee7da 100644
--- a/tensorflow/dtensor/mlir/lower_send_recv.cc
+++ b/tensorflow/dtensor/mlir/lower_send_recv.cc
@@ -90,8 +90,8 @@ void PropagateDeviceIdToClusters(mlir::ModuleOp module) {
 
   module.walk([&](mlir::tf_device::ClusterOp op) {
     mlir::OpBuilder builder(&op.GetBody().front());
-    builder.create<mlir::TF::IdentityOp>(main_func.getLoc(),
-                                         device_id->getType(), *device_id);
+    mlir::TF::IdentityOp::create(builder, main_func.getLoc(),
+                                 device_id->getType(), *device_id);
   });
 }
 
diff --git a/tensorflow/dtensor/mlir/merge_clusters.cc b/tensorflow/dtensor/mlir/merge_clusters.cc
index 0e88ca55057a26..81a856aa1a0c9c 100644
--- a/tensorflow/dtensor/mlir/merge_clusters.cc
+++ b/tensorflow/dtensor/mlir/merge_clusters.cc
@@ -288,31 +288,31 @@ void CloneEmptyIfWithPredicate(mlir::TF::IfRegionOp if_region, const Mesh& mesh,
       absl::StrCat(kSendRecvKeyPrefix, *num_send_recvs);
   *num_send_recvs += 1;
 
-  builder.create<mlir::TF::DTensorSend>(
-      if_region.getLoc(), if_region.getCond(),
-      builder.getStringAttr(send_recv_key),
-      mlir::dtensor::MeshAttr::get(context, mesh));
+  mlir::TF::DTensorSend::create(builder, if_region.getLoc(),
+                                if_region.getCond(),
+                                builder.getStringAttr(send_recv_key),
+                                mlir::dtensor::MeshAttr::get(context, mesh));
 
   // Create new cluster op that contains cloned if operation.
-  auto new_cluster = builder.create<mlir::tf_device::ClusterOp>(
-      if_region.getLoc(), llvm::SmallVector<mlir::Type, 4>{});
+  auto new_cluster = mlir::tf_device::ClusterOp::create(
+      builder, if_region.getLoc(), llvm::SmallVector<mlir::Type, 4>{});
   new_cluster.getBody().push_back(new mlir::Block);
   builder.setInsertionPointToEnd(&new_cluster.GetBody());
-  auto return_op = builder.create<mlir::tf_device::ReturnOp>(
-      if_region.getLoc(), llvm::SmallVector<mlir::Value, 4>{});
+  auto return_op = mlir::tf_device::ReturnOp::create(
+      builder, if_region.getLoc(), llvm::SmallVector<mlir::Value, 4>{});
 
   // Add DTensorRecv op inside new cluster that receives the cluster.
   builder.setInsertionPoint(return_op);
-  auto recv_op = builder.create<mlir::TF::DTensorRecv>(
-      if_region.getLoc(), predicate_tensor_type,
+  auto recv_op = mlir::TF::DTensorRecv::create(
+      builder, if_region.getLoc(), predicate_tensor_type,
       builder.getStringAttr(send_recv_key),
       mlir::TF::ShapeAttr::get(context, predicate_tensor_type),
       mlir::dtensor::MeshAttr::get(context, mesh));
 
   // Clone tf.IfRegion op inside newly created cluster and make sure
   // that the predicate tensor is from DTensorRecv op created above.
-  auto host_side_if = builder.create<mlir::TF::IfRegionOp>(
-      if_region.getLoc(), llvm::SmallVector<mlir::Type, 4>{},
+  auto host_side_if = mlir::TF::IfRegionOp::create(
+      builder, if_region.getLoc(), llvm::SmallVector<mlir::Type, 4>{},
       recv_op.getOutput(), if_region.getIsStateless(),
       GetUniqueControlflowFnName("cloned_if_then", builder),
       GetUniqueControlflowFnName("cloned_if_else", builder));
@@ -322,15 +322,15 @@ void CloneEmptyIfWithPredicate(mlir::TF::IfRegionOp if_region, const Mesh& mesh,
   auto& then_branch = host_side_if.getThenBranch();
   then_branch.push_back(new mlir::Block);
   builder.setInsertionPointToEnd(&then_branch.front());
-  builder.create<mlir::TF::YieldOp>(if_region.getLoc(),
-                                    /*operands=*/llvm::ArrayRef<mlir::Value>{});
+  mlir::TF::YieldOp::create(builder, if_region.getLoc(),
+                            /*operands=*/llvm::ArrayRef<mlir::Value>{});
 
   // Create empty else branch region.
   auto& else_branch = host_side_if.getElseBranch();
   else_branch.push_back(new mlir::Block);
   builder.setInsertionPointToEnd(&else_branch.front());
-  builder.create<mlir::TF::YieldOp>(if_region.getLoc(),
-                                    /*operands=*/llvm::ArrayRef<mlir::Value>{});
+  mlir::TF::YieldOp::create(builder, if_region.getLoc(),
+                            /*operands=*/llvm::ArrayRef<mlir::Value>{});
   new_cluster->setAttr(kMeshAttr, builder.getStringAttr(mesh.ToString()));
 }
 
@@ -550,8 +550,8 @@ mlir::LogicalResult MergeClusters(mlir::ModuleOp module) {
 
     // Create a single cluster op contains merged computations for `mesh`.
     builder.setInsertionPoint(&func_block.front());
-    auto new_cluster = builder.create<mlir::tf_device::ClusterOp>(
-        module.getLoc(), merged_return_types);
+    auto new_cluster = mlir::tf_device::ClusterOp::create(
+        builder, module.getLoc(), merged_return_types);
     new_cluster.getBody().push_back(new mlir::Block);
     new_cluster->setAttr(kMeshAttr, builder.getStringAttr(mesh.ToString()));
 
@@ -578,8 +578,8 @@ mlir::LogicalResult MergeClusters(mlir::ModuleOp module) {
     }
 
     builder.setInsertionPointToEnd(&new_cluster.GetBody());
-    builder.create<mlir::tf_device::ReturnOp>(new_cluster.getLoc(),
-                                              merged_return_values);
+    mlir::tf_device::ReturnOp::create(builder, new_cluster.getLoc(),
+                                      merged_return_values);
 
     // Replace return value usages.
     for (auto it :
diff --git a/tensorflow/dtensor/mlir/move_compilation_to_host.cc b/tensorflow/dtensor/mlir/move_compilation_to_host.cc
index 053913f4844606..894b1bacbe72ee 100644
--- a/tensorflow/dtensor/mlir/move_compilation_to_host.cc
+++ b/tensorflow/dtensor/mlir/move_compilation_to_host.cc
@@ -117,8 +117,8 @@ mlir::LogicalResult CreateSendRecvOpsToTransferProgramKey(
   builder.setInsertionPointAfter(compile_op);
   for (int i = 0; i < num_tpu_devices; ++i) {
     const std::string& tensor_name = device_key_map[i];
-    auto send = builder.create<mlir::TF::_HostSendOp>(
-        compile_op->getLoc(), compilation_key, tensor_name,
+    auto send = mlir::TF::_HostSendOp::create(
+        builder, compile_op->getLoc(), compilation_key, tensor_name,
         compile_op_launch.getDevice(),
         /*send_device_incarnation=*/0, local_devices[i]);
     send->setAttr("device", compile_op_launch.getDeviceAttr());
@@ -148,15 +148,15 @@ mlir::LogicalResult CreateSendRecvOpsToTransferProgramKey(
 
     mlir::Block* fn_block = recv_select_fn.addEntryBlock();
     mlir::OpBuilder fn_builder = mlir::OpBuilder::atBlockEnd(fn_block);
-    auto recv = fn_builder.create<mlir::TF::_HostRecvOp>(
-        compile_op->getLoc(),
+    auto recv = mlir::TF::_HostRecvOp::create(
+        fn_builder, compile_op->getLoc(),
         mlir::cast<mlir::TensorType>(compilation_key.getType()),
         device_key_map[i], compile_op_launch.getDevice(),
         /*send_device_incarnation=*/0, local_devices[i]);
     recv->setAttr("device", builder.getStringAttr(local_devices[i]));
 
-    fn_builder.create<mlir::func::ReturnOp>(recv_select_fn.getLoc(),
-                                            recv.getTensor());
+    mlir::func::ReturnOp::create(fn_builder, recv_select_fn.getLoc(),
+                                 recv.getTensor());
 
     compilation_key_functions.emplace_back(recv_select_fn);
   }
@@ -172,8 +172,8 @@ mlir::LogicalResult CreateSendRecvOpsToTransferProgramKey(
     symbols.push_back(mlir::SymbolRefAttr::get(func));
 
   // Create a TF::Case op that selects `values` based on `id`.
-  auto program_key = builder.create<mlir::TF::CaseOp>(
-      compile_op.getLoc(),
+  auto program_key = mlir::TF::CaseOp::create(
+      builder, compile_op.getLoc(),
       /*output=*/llvm::SmallVector<mlir::Type, 4>{compilation_key.getType()},
       /*branch_index=*/*device_id,
       /*input=*/llvm::ArrayRef<mlir::Value>{},
@@ -288,15 +288,16 @@ mlir::LogicalResult HandleCompilationOps(
           llvm::formatv("error while creating TPU compilation logic. {0}",
                         device_ordinal_host.status().message()));
 
-    mlir::Value predicate_host = builder.create<mlir::TF::EqualOp>(
-        compile_op.getLoc(), *device_ordinal_host,
+    mlir::Value predicate_host = mlir::TF::EqualOp::create(
+        builder, compile_op.getLoc(), *device_ordinal_host,
         CreateIntScalarConst(0, builder, compile_op.getLoc()),
         /*incompatible_shape_error=*/builder.getBoolAttr(true));
 
     // If op here contains send/recv and TPUCompile op that should not be pruned
     // away. Therefore, we explicitly set the op to be stateful.
-    auto if_host = builder.create<mlir::TF::IfRegionOp>(
-        compile_op.getLoc(), llvm::SmallVector<mlir::Type, 4>{}, predicate_host,
+    auto if_host = mlir::TF::IfRegionOp::create(
+        builder, compile_op.getLoc(), llvm::SmallVector<mlir::Type, 4>{},
+        predicate_host,
         /*is_stateless=*/builder.getBoolAttr(false),
         GetUniqueControlflowFnName("compilation_host_then", builder),
         GetUniqueControlflowFnName("compilation_host_else", builder));
@@ -305,18 +306,17 @@ mlir::LogicalResult HandleCompilationOps(
     auto& host_else_branch = if_host.getElseBranch();
     host_else_branch.push_back(new mlir::Block);
     builder.setInsertionPointToEnd(&host_else_branch.front());
-    builder.create<mlir::TF::YieldOp>(
-        compile_op.getLoc(),
-        /*operands=*/llvm::ArrayRef<mlir::Value>{});
+    mlir::TF::YieldOp::create(builder, compile_op.getLoc(),
+                              /*operands=*/llvm::ArrayRef<mlir::Value>{});
 
     // Create then branch region with logic to compile TPU program and send
     // program key to all TPU devices.
     auto& host_then_branch = if_host.getThenBranch();
     host_then_branch.push_back(new mlir::Block);
     builder.setInsertionPointToEnd(&host_then_branch.front());
-    auto yield = builder.create<mlir::TF::YieldOp>(
-        compile_op.getLoc(),
-        /*operands=*/llvm::ArrayRef<mlir::Value>{});
+    auto yield =
+        mlir::TF::YieldOp::create(builder, compile_op.getLoc(),
+                                  /*operands=*/llvm::ArrayRef<mlir::Value>{});
     compilation_move_before = yield;
 
     builder.setInsertionPointAfter(if_host);
diff --git a/tensorflow/dtensor/mlir/op_to_device_cluster.cc b/tensorflow/dtensor/mlir/op_to_device_cluster.cc
index df3aa89dd9bd35..89c351b0f71ccc 100644
--- a/tensorflow/dtensor/mlir/op_to_device_cluster.cc
+++ b/tensorflow/dtensor/mlir/op_to_device_cluster.cc
@@ -51,8 +51,8 @@ mlir::LogicalResult WrapDeviceCluster(mlir::OpBuilder *builder,
                                       mlir::Operation *op) {
   // Create new tf_device.cluster op wrapping a single operation.
   builder->setInsertionPoint(op);
-  auto cluster = builder->create<mlir::tf_device::ClusterOp>(
-      op->getLoc(), op->getResultTypes());
+  auto cluster = mlir::tf_device::ClusterOp::create(*builder, op->getLoc(),
+                                                    op->getResultTypes());
   if (auto layout_op = llvm::dyn_cast<mlir::TF::DTensorLayout>(op)) {
     cluster->setAttr(kMeshAttr, builder->getStringAttr(
                                     layout_op.getLayout().mesh().ToString()));
@@ -89,7 +89,7 @@ mlir::LogicalResult WrapDeviceCluster(mlir::OpBuilder *builder,
   cluster.getBody().push_back(new mlir::Block);
 
   builder->setInsertionPointToEnd(&cluster.GetBody());
-  builder->create<mlir::tf_device::ReturnOp>(op->getLoc(), op->getResults());
+  mlir::tf_device::ReturnOp::create(*builder, op->getLoc(), op->getResults());
 
   // Move `op` inside newly created `ClusterOp`.
   op->moveBefore(cluster.GetBody().getTerminator());
diff --git a/tensorflow/dtensor/mlir/op_utils.cc b/tensorflow/dtensor/mlir/op_utils.cc
index 08aa8f95612104..4b7a776ea2cd2c 100644
--- a/tensorflow/dtensor/mlir/op_utils.cc
+++ b/tensorflow/dtensor/mlir/op_utils.cc
@@ -116,8 +116,8 @@ mlir::LogicalResult ReplaceAuxiliaryDTensorLayoutOpsWithIdentity(
 
       // Replace DTensorLayout op with identity op.
       mlir::OpBuilder builder(input_layout_op);
-      auto new_identity = builder.create<mlir::TF::IdentityOp>(
-          input_layout_op->getLoc(), input_layout_op.getType(),
+      auto new_identity = mlir::TF::IdentityOp::create(
+          builder, input_layout_op->getLoc(), input_layout_op.getType(),
           input_layout_op.getInput());
       input_layout_op.getOutput().replaceAllUsesWith(new_identity.getOutput());
       input_layout_op.erase();
diff --git a/tensorflow/dtensor/mlir/propagate_default_layout.cc b/tensorflow/dtensor/mlir/propagate_default_layout.cc
index 6b0b35283fdca5..7be77a3f624ff4 100644
--- a/tensorflow/dtensor/mlir/propagate_default_layout.cc
+++ b/tensorflow/dtensor/mlir/propagate_default_layout.cc
@@ -53,8 +53,8 @@ void CreateDTensorLayoutOp(const Layout& layout, mlir::Value input,
                            mlir::MLIRContext* context) {
   if (layout.IsEmpty()) return;
 
-  auto layout_op = builder->create<mlir::TF::DTensorLayout>(
-      loc, input, mlir::dtensor::LayoutAttr::get(context, layout),
+  auto layout_op = mlir::TF::DTensorLayout::create(
+      *builder, loc, input, mlir::dtensor::LayoutAttr::get(context, layout),
       mlir::TF::ShapeAttr::get(context, type));
   if (arg_index != nullptr) {
     layout_op->setAttr(kFromArgIndex, arg_index);
diff --git a/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc b/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc
index 0417e392b4b28d..7381e3628e25d9 100644
--- a/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc
+++ b/tensorflow/dtensor/mlir/propagate_device_id_to_function_args.cc
@@ -125,8 +125,8 @@ mlir::LogicalResult PrependDeviceIdToCallsites(mlir::OpBuilder* builder,
   mlir::Operation* new_call = nullptr;
   if (auto stateful_partitioned_call =
           llvm::dyn_cast<mlir::TF::StatefulPartitionedCallOp>(op)) {
-    new_call = builder->create<mlir::TF::StatefulPartitionedCallOp>(
-        op->getLoc(), op->getResultTypes(), new_operands,
+    new_call = mlir::TF::StatefulPartitionedCallOp::create(
+        *builder, op->getLoc(), op->getResultTypes(), new_operands,
         /*args_attrs=*/nullptr,
         /*res_attrs=*/nullptr, stateful_partitioned_call.getF(),
         stateful_partitioned_call.getConfig(),
@@ -134,8 +134,8 @@ mlir::LogicalResult PrependDeviceIdToCallsites(mlir::OpBuilder* builder,
         stateful_partitioned_call.getExecutorType());
   } else {
     auto partitioned_call = llvm::cast<mlir::TF::PartitionedCallOp>(op);
-    new_call = builder->create<mlir::TF::PartitionedCallOp>(
-        op->getLoc(), op->getResultTypes(), new_operands,
+    new_call = mlir::TF::PartitionedCallOp::create(
+        *builder, op->getLoc(), op->getResultTypes(), new_operands,
         /*args_attrs=*/nullptr,
         /*res_attrs=*/nullptr, partitioned_call.getF(),
         partitioned_call.getConfig(), partitioned_call.getConfigProto(),
diff --git a/tensorflow/dtensor/mlir/restore_shape_inference.cc b/tensorflow/dtensor/mlir/restore_shape_inference.cc
index ab327153634786..3be8637314be97 100644
--- a/tensorflow/dtensor/mlir/restore_shape_inference.cc
+++ b/tensorflow/dtensor/mlir/restore_shape_inference.cc
@@ -85,8 +85,8 @@ mlir::LogicalResult BackwardShapeInferenceToRestoreOp(mlir::ModuleOp module,
     // O(N).
     value.setType(type);
   } else if (auto cast_op = llvm::dyn_cast_or_null<mlir::TF::CastOp>(op)) {
-    auto new_cast_op = builder->create<mlir::TF::CastOp>(cast_op.getLoc(), type,
-                                                         cast_op.getOperand());
+    auto new_cast_op = mlir::TF::CastOp::create(*builder, cast_op.getLoc(),
+                                                type, cast_op.getOperand());
     cast_op.replaceAllUsesWith(new_cast_op.getResult());
     cast_op.erase();
 
@@ -103,8 +103,8 @@ mlir::LogicalResult BackwardShapeInferenceToRestoreOp(mlir::ModuleOp module,
         module, builder, new_cast_op.getOperand(), new_type);
   } else if (auto identity_op =
                  llvm::dyn_cast_or_null<mlir::TF::IdentityOp>(op)) {
-    auto new_identity_op = builder->create<mlir::TF::IdentityOp>(
-        identity_op.getLoc(), type, identity_op.getInput());
+    auto new_identity_op = mlir::TF::IdentityOp::create(
+        *builder, identity_op.getLoc(), type, identity_op.getInput());
     identity_op.getOutput().replaceAllUsesWith(new_identity_op.getOutput());
     identity_op.erase();
 
@@ -128,8 +128,9 @@ mlir::LogicalResult BackwardShapeInferenceToRestoreOp(mlir::ModuleOp module,
     // RestoreV2Op we want to fix is on the mesh of the corresponding
     // DTensorSend. Set shape of this DTensorRecv first and go to the
     // corresponding DTensorSend.
-    auto new_recv_op = builder->create<mlir::TF::DTensorRecv>(
-        recv_op.getLoc(), type, builder->getStringAttr(recv_op.getKey()),
+    auto new_recv_op = mlir::TF::DTensorRecv::create(
+        *builder, recv_op.getLoc(), type,
+        builder->getStringAttr(recv_op.getKey()),
         mlir::TF::ShapeAttr::get(builder->getContext(),
                                  mlir::dyn_cast<mlir::TensorType>(type)),
         mlir::dtensor::MeshAttr::get(builder->getContext(), recv_op.getMesh()));
diff --git a/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc b/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc
index f08908eff9395e..e695320769ecc4 100644
--- a/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc
+++ b/tensorflow/dtensor/mlir/sparse_expansions/dynamic_enqueue_sparse_expander.cc
@@ -53,14 +53,14 @@ StatusOr<mlir::Value> ExpandIndices(mlir::OpBuilder& builder,
           .getElementType());
   // Little trick to make a rank-2 tensor of [[0,0], [0,1]] using rank 1
   // constants.
-  mlir::Value indices_padding = builder.create<mlir::TF::ReshapeOp>(
-      loc,
+  mlir::Value indices_padding = mlir::TF::ReshapeOp::create(
+      builder, loc,
       mlir::TF::collection_ops_util::GetR1Const({0, 0, 0, 1}, builder, loc),
       mlir::TF::collection_ops_util::GetR1Const({2, 2}, builder, loc));
   mlir::Value indices_padded =
-      builder.create<mlir::TF::PadOp>(loc, indices_padded_type,
-                                      /*input=*/indices,
-                                      /*paddings=*/indices_padding);
+      mlir::TF::PadOp::create(builder, loc, indices_padded_type,
+                              /*input=*/indices,
+                              /*paddings=*/indices_padding);
   return indices_padded;
 }
 
@@ -98,16 +98,15 @@ StatusOr<mlir::Operation*> DynamicEnqueueSparseExpander::ExpandOp(
   // This op does not have a return value so we do not need to replace any
   // consumers.
   mlir::Operation* sparse_enqueue_op =
-      builder
-          .create<mlir::TF::DynamicEnqueueTPUEmbeddingArbitraryTensorBatchOp>(
-              location,
-              /*sample_indices_or_row_splits_list=*/indices,
-              /*embedding_indices=*/values,
-              /*aggregation_weights=*/dense_enqueue_op.getAggregationWeights(),
-              /*mode_override=*/
-              dense_enqueue_op.getModeOverride(),
-              /*device_ordinal=*/dense_enqueue_op.getDeviceOrdinal(),
-              /*combiners=*/dense_enqueue_op.getCombiners());
+      mlir::TF::DynamicEnqueueTPUEmbeddingArbitraryTensorBatchOp::create(
+          builder, location,
+          /*sample_indices_or_row_splits_list=*/indices,
+          /*embedding_indices=*/values,
+          /*aggregation_weights=*/dense_enqueue_op.getAggregationWeights(),
+          /*mode_override=*/
+          dense_enqueue_op.getModeOverride(),
+          /*device_ordinal=*/dense_enqueue_op.getDeviceOrdinal(),
+          /*combiners=*/dense_enqueue_op.getCombiners());
   dense_enqueue_op.erase();
   return sparse_enqueue_op;
 }
diff --git a/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc b/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc
index 7ed10e42dfe186..5056b89ca9ae32 100644
--- a/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc
+++ b/tensorflow/dtensor/mlir/sparse_expansions/matmul_sparse_expander.cc
@@ -38,8 +38,8 @@ StatusOr<mlir::Operation*> MatMulSparseExpander::ExpandOp(mlir::Operation* op) {
     // Since operand 0 is a SparseValue, we don't need to check that
     // the indices, values, and dense_shapes exist.
     mlir::TF::SparseTensorDenseMatMulOp new_op =
-        builder.create<mlir::TF::SparseTensorDenseMatMulOp>(
-            op->getLoc(), op->getResultTypes(),
+        mlir::TF::SparseTensorDenseMatMulOp::create(
+            builder, op->getLoc(), op->getResultTypes(),
             mlir::ValueRange{
                 GetIndicesFromSparseTensor(op->getOperand(0)).value(),
                 GetValuesFromSparseTensor(op->getOperand(0)).value(),
diff --git a/tensorflow/dtensor/mlir/spmd_expander_common.cc b/tensorflow/dtensor/mlir/spmd_expander_common.cc
index 91eab6f8438dc2..9fd3af1af33c07 100644
--- a/tensorflow/dtensor/mlir/spmd_expander_common.cc
+++ b/tensorflow/dtensor/mlir/spmd_expander_common.cc
@@ -125,8 +125,8 @@ absl::Status CreateSplitOp(const int num_split, const int split_dimension,
       mlir::RankedTensorType::get({}, builder->getIntegerType(32));
   auto split_dimension_attr =
       mlir::DenseElementsAttr::get(split_dim_type, split_dimension);
-  auto split_dimension_op = builder->create<mlir::TF::ConstOp>(
-      location, split_dim_type, split_dimension_attr);
+  auto split_dimension_op = mlir::TF::ConstOp::create(
+      *builder, location, split_dim_type, split_dimension_attr);
 
   // Correctly set output shapes of split op output if input shape is statically
   // known.
@@ -157,8 +157,9 @@ absl::Status CreateSplitOp(const int num_split, const int split_dimension,
 
   // Creates a split op that splits |src_input| along |split_dimension|.
   llvm::SmallVector<mlir::Type, 4> output_types(num_split, output_type);
-  *split_op = builder->create<mlir::TF::SplitOp>(
-      location, output_types, split_dimension_op.getOutput(), src_input);
+  *split_op =
+      mlir::TF::SplitOp::create(*builder, location, output_types,
+                                split_dimension_op.getOutput(), src_input);
   return absl::OkStatus();
 }
 
diff --git a/tensorflow/dtensor/mlir/tpu_integration.cc b/tensorflow/dtensor/mlir/tpu_integration.cc
index 67c6e0d9fbed23..e7fffe44a1f520 100644
--- a/tensorflow/dtensor/mlir/tpu_integration.cc
+++ b/tensorflow/dtensor/mlir/tpu_integration.cc
@@ -110,8 +110,8 @@ mlir::LogicalResult CreateTPUCluster(
   auto& function_block = function->getCallableRegion()->front();
   builder->setInsertionPointToStart(&function_block);
 
-  auto cluster = builder->create<mlir::tf_device::ClusterOp>(
-      tpu_call.getLoc(), function->getResultTypes());
+  auto cluster = mlir::tf_device::ClusterOp::create(*builder, tpu_call.getLoc(),
+                                                    function->getResultTypes());
   cluster.getBody().push_back(new mlir::Block);
 
   auto& function_body = function_block.getOperations();
@@ -121,8 +121,8 @@ mlir::LogicalResult CreateTPUCluster(
 
   builder->setInsertionPointToEnd(&cluster.GetBody());
   mlir::Operation* function_block_terminator = function_block.getTerminator();
-  builder->create<mlir::tf_device::ReturnOp>(
-      tpu_call.getLoc(), function_block_terminator->getOperands());
+  mlir::tf_device::ReturnOp::create(*builder, tpu_call.getLoc(),
+                                    function_block_terminator->getOperands());
 
   function_block_terminator->setOperands(cluster.getResults());
 
diff --git a/tensorflow/dtensor/mlir/utils/collective_lowering.cc b/tensorflow/dtensor/mlir/utils/collective_lowering.cc
index df52a5ddde934b..7858b3430d33ef 100644
--- a/tensorflow/dtensor/mlir/utils/collective_lowering.cc
+++ b/tensorflow/dtensor/mlir/utils/collective_lowering.cc
@@ -122,8 +122,8 @@ mlir::LogicalResult EmitAllReduceForXla(
   constexpr char kCrossReplica[] = "CrossReplica";
 
   // For TPUs, lower to XlaAllReduce straightforwardly.
-  *final_op = builder.create<mlir::TF::XlaAllReduceOp>(
-      all_reduce.getLoc(), all_reduce.getResult().getType(),
+  *final_op = mlir::TF::XlaAllReduceOp::create(
+      builder, all_reduce.getLoc(), all_reduce.getResult().getType(),
       all_reduce.getInput(), all_reduce.getGroupAssignment(),
       all_reduce.getReduceOpAttr(), builder.getStringAttr(kCrossReplica));
   return mlir::success();
@@ -198,7 +198,7 @@ mlir::Value GetRelativeDeviceId(mlir::Operation* op,
       ops_util::ReshapeScalarToSizeType(builder, DeviceId(op).value(), loc);
   mlir::Value start_device_id = ops_util::GetR1Const(
       {output_layout.mesh().min_global_device_id()}, builder, loc);
-  return builder.create<mlir::TF::SubOp>(loc, device_id, start_device_id);
+  return mlir::TF::SubOp::create(builder, loc, device_id, start_device_id);
 }
 
 void CreateGroupAndInstanceKey(
@@ -219,13 +219,14 @@ void CreateGroupAndInstanceKey(
   // Create a scalar group key by slicing device_id_to_group_key with
   // device_id.
   auto group_key_loc = DT_LOC2(loc, "group_key");
-  auto group_key_slice = builder.create<mlir::TF::SliceOp>(
-      group_key_loc, EffectivelyScalarR1Type(builder.getIntegerType(32)),
+  auto group_key_slice = mlir::TF::SliceOp::create(
+      builder, group_key_loc,
+      EffectivelyScalarR1Type(builder.getIntegerType(32)),
       /*input=*/IntConst(builder, loc, device_id_to_group_key),
       /*begin=*/device_id,
       /*size=*/IntConst(builder, loc, {1}));
-  auto group_key_reshape = builder.create<mlir::TF::ReshapeOp>(
-      group_key_loc, /*tensor=*/group_key_slice.getResult(),
+  auto group_key_reshape = mlir::TF::ReshapeOp::create(
+      builder, group_key_loc, /*tensor=*/group_key_slice.getResult(),
       /*shape=*/ops_util::GetR1Const({}, builder, loc));
   *group_key_scalar = group_key_reshape.getResult();
 
@@ -257,8 +258,8 @@ mlir::Operation* EmitCollectiveReduce(
   const bool is_mean_op = reduce_op_str == kReduceOpMean;
   mlir::Value group_size_scalar = ops_util::CreateScalarConst(
       host_group_size, builder, DT_LOC2(loc, "group_size"));
-  auto collective_reduce = builder.create<mlir::TF::CollectiveReduceV2Op>(
-      loc, /*output_type=*/input.getType(), input, group_size_scalar,
+  auto collective_reduce = mlir::TF::CollectiveReduceV2Op::create(
+      builder, loc, /*output_type=*/input.getType(), input, group_size_scalar,
       group_key_scalar, instance_key_scalar,
       /*ordering_token=*/mlir::ValueRange({}),
       /*merge_op=*/builder.getStringAttr(is_mean_op ? "Add" : reduce_op_str),
@@ -312,19 +313,21 @@ mlir::Operation* EmitCollectiveReduceScatter(
   const bool is_mean_op = reduce_op_str == kReduceOpMean;
   mlir::Value group_size_scalar = ops_util::CreateScalarConst(
       host_group_size, builder, DT_LOC2(loc, "group_size"));
-  auto collective_reduce_scatter = builder.create<
-      mlir::TF::CollectiveReduceScatterV2Op>(
-      loc, output_type, input, group_size_scalar, group_key_scalar,
-      instance_key_scalar,
-      /*ordering_token=*/mlir::ValueRange({}),
-      /*merge_op=*/builder.getStringAttr(is_mean_op ? "Add" : reduce_op_str),
-      /*final_op=*/builder.getStringAttr(is_mean_op ? "Div" : "Id"),
-      /*communication_hint=*/builder.getStringAttr("nccl"),  // TODO(tmorris):
-                                                             // this shouldn't
-                                                             // be needed
-      /*timeout_seconds=*/builder.getF32FloatAttr(0.),
-      /*is_stateless=*/builder.getBoolAttr(false),
-      /*max_subdivs_per_device=*/builder.getI64IntegerAttr(16));
+  auto collective_reduce_scatter =
+      mlir::TF::CollectiveReduceScatterV2Op::create(
+          builder, loc, output_type, input, group_size_scalar, group_key_scalar,
+          instance_key_scalar,
+          /*ordering_token=*/mlir::ValueRange({}),
+          /*merge_op=*/
+          builder.getStringAttr(is_mean_op ? "Add" : reduce_op_str),
+          /*final_op=*/builder.getStringAttr(is_mean_op ? "Div" : "Id"),
+          /*communication_hint=*/
+          builder.getStringAttr("nccl"),  // TODO(tmorris):
+                                          // this shouldn't
+                                          // be needed
+          /*timeout_seconds=*/builder.getF32FloatAttr(0.),
+          /*is_stateless=*/builder.getBoolAttr(false),
+          /*max_subdivs_per_device=*/builder.getI64IntegerAttr(16));
   SetSingleLayoutOnOp(collective_reduce_scatter, Layout::Empty());
   if (need_transpose) {
     return EmitTransposeOp(builder, loc,
@@ -394,8 +397,8 @@ mlir::Operation* EmitCollectiveAllToAll(
         new_shape.push_back(input_shape[i]);
       }
     }
-    auto reshape_op = builder.create<mlir::TF::ReshapeOp>(
-        loc, data, ops_util::GetR1Const(new_shape, builder, loc));
+    auto reshape_op = mlir::TF::ReshapeOp::create(
+        builder, loc, data, ops_util::GetR1Const(new_shape, builder, loc));
 
     std::vector<int64> perm_for_permute_transpose;
     perm_for_permute_transpose.reserve(input_shape.size() + 1);
@@ -420,8 +423,8 @@ mlir::Operation* EmitCollectiveAllToAll(
                                        1LL, std::multiplies<int64>());
   std::vector<int64> flatten_shape = {host_group_size,
                                       num_elements / host_group_size};
-  auto flatten_reshape_op = builder.create<mlir::TF::ReshapeOp>(
-      loc, input, ops_util::GetR1Const(flatten_shape, builder, loc));
+  auto flatten_reshape_op = mlir::TF::ReshapeOp::create(
+      builder, loc, input, ops_util::GetR1Const(flatten_shape, builder, loc));
   mlir::TensorType output_type =
       mlir::RankedTensorType::get(flatten_shape, input_type.getElementType());
 
@@ -432,9 +435,10 @@ mlir::Operation* EmitCollectiveAllToAll(
                             &group_key_scalar, &instance_key_scalar);
   mlir::Value group_size_scalar =
       ops_util::CreateScalarConst(host_group_size, builder, loc);
-  auto collective_alltoall = builder.create<mlir::TF::CollectiveAllToAllV2Op>(
-      loc, /*output_type=*/output_type, flatten_reshape_op->getResult(0),
-      group_size_scalar, group_key_scalar, instance_key_scalar,
+  auto collective_alltoall = mlir::TF::CollectiveAllToAllV2Op::create(
+      builder, loc, /*output_type=*/output_type,
+      flatten_reshape_op->getResult(0), group_size_scalar, group_key_scalar,
+      instance_key_scalar,
       /*ordering_token=*/mlir::ValueRange({}),
       /*communication_hint=*/builder.getStringAttr(""),
       /*timeout_seconds=*/builder.getF32FloatAttr(0.),
@@ -444,8 +448,9 @@ mlir::Operation* EmitCollectiveAllToAll(
 
   if (requires_transpose) {
     // Unflatten after all-to-all.
-    auto reshape_op = builder.create<mlir::TF::ReshapeOp>(
-        loc, prev_op, ops_util::GetR1Const(transposed_shape, builder, loc));
+    auto reshape_op = mlir::TF::ReshapeOp::create(
+        builder, loc, prev_op,
+        ops_util::GetR1Const(transposed_shape, builder, loc));
     // Undo earlier transpose which moved split or concat dim to rank 0.
     std::vector<int64> perm_for_transpose;
     perm_for_transpose.reserve(input_shape.size());
@@ -473,8 +478,8 @@ mlir::Operation* EmitCollectiveAllToAll(
   std::vector<int64> output_shape(input_shape.begin(), input_shape.end());
   output_shape[concat_dimension] *= host_group_size;
   output_shape[split_dimension] /= host_group_size;
-  auto post_reshape_op = builder.create<mlir::TF::ReshapeOp>(
-      loc, prev_op, ops_util::GetR1Const(output_shape, builder, loc));
+  auto post_reshape_op = mlir::TF::ReshapeOp::create(
+      builder, loc, prev_op, ops_util::GetR1Const(output_shape, builder, loc));
 
   return post_reshape_op;
 }
@@ -503,8 +508,8 @@ mlir::Operation* EmitCollectiveGather(
 
   mlir::Value group_size_scalar =
       ops_util::CreateScalarConst(host_group_size, builder, loc);
-  auto collective_gather = builder.create<mlir::TF::CollectiveGatherV2Op>(
-      loc, /*output_type=*/input.getType(), input, group_size_scalar,
+  auto collective_gather = mlir::TF::CollectiveGatherV2Op::create(
+      builder, loc, /*output_type=*/input.getType(), input, group_size_scalar,
       group_key_scalar, instance_key_scalar,
       /*ordering_token=*/mlir::ValueRange({}),
       /*communication_hint=*/builder.getStringAttr(""),
@@ -606,12 +611,10 @@ mlir::LogicalResult LowerReduceScatterOp(
   mlir::OpBuilder builder(reduce_scatter);
   if (reduce_scatter.getDeviceType().ends_with("TPU")) {
     // For TPUs, lower to XlaReduceScatter straightforwardly.
-    mlir::Operation* xla_reduce_scatter =
-        builder.create<mlir::TF::XlaReduceScatterOp>(
-            loc, reduce_scatter.getResult().getType(),
-            reduce_scatter.getInput(), reduce_scatter.getGroupAssignment(),
-            reduce_scatter.getScatterDimension(),
-            reduce_scatter.getReduceOpAttr());
+    mlir::Operation* xla_reduce_scatter = mlir::TF::XlaReduceScatterOp::create(
+        builder, loc, reduce_scatter.getResult().getType(),
+        reduce_scatter.getInput(), reduce_scatter.getGroupAssignment(),
+        reduce_scatter.getScatterDimension(), reduce_scatter.getReduceOpAttr());
     SetSingleLayoutOnOp(xla_reduce_scatter, *output_layout);
     reduce_scatter.replaceAllUsesWith(xla_reduce_scatter);
   } else if (reduce_scatter.getDeviceType().ends_with("GPU") &&
@@ -653,16 +656,17 @@ mlir::LogicalResult LowerReduceScatterOp(
       return reduce_scatter.emitOpError(input_layout.status().message());
     }
 
-    auto dtensor_allreduce = builder.create<mlir::TF::DTensorAllReduceOp>(
-        reduce_scatter.getLoc(), reduce_scatter.getOperand(0).getType(),
-        reduce_scatter.getOperand(0), reduce_scatter.getGroupAssignment(),
-        reduce_scatter.getReduceOp(), reduce_scatter.getDeviceType());
+    auto dtensor_allreduce = mlir::TF::DTensorAllReduceOp::create(
+        builder, reduce_scatter.getLoc(),
+        reduce_scatter.getOperand(0).getType(), reduce_scatter.getOperand(0),
+        reduce_scatter.getGroupAssignment(), reduce_scatter.getReduceOp(),
+        reduce_scatter.getDeviceType());
     SetSingleLayoutOnOp(dtensor_allreduce, *input_layout);
 
     mlir::Operation* dtensor_all_scatter =
-        builder.create<mlir::TF::DTensorAllScatterOp>(
-            reduce_scatter.getLoc(), reduce_scatter.getResult().getType(),
-            dtensor_allreduce.getResult(),
+        mlir::TF::DTensorAllScatterOp::create(
+            builder, reduce_scatter.getLoc(),
+            reduce_scatter.getResult().getType(), dtensor_allreduce.getResult(),
             mlir::dtensor::LayoutAttr::get(builder.getContext(), *input_layout),
             mlir::dtensor::LayoutAttr::get(builder.getContext(),
                                            *output_layout));
@@ -676,8 +680,9 @@ mlir::LogicalResult LowerReduceScatterOp(
 mlir::Value CreateZeroScalar(mlir::OpBuilder& builder, mlir::Location loc,
                              mlir::RankedTensorType type) {
   const mlir::Value zero_scalar = ops_util::CreateScalarConst(0, builder, loc);
-  return builder.create<mlir::TF::CastOp>(
-      loc, mlir::RankedTensorType::get({}, type.getElementType()), zero_scalar);
+  return mlir::TF::CastOp::create(
+      builder, loc, mlir::RankedTensorType::get({}, type.getElementType()),
+      zero_scalar);
 }
 
 // device_id is the relative device_id in a mesh (device id - mesh's 1st device
@@ -691,15 +696,15 @@ mlir::Value SelectElementsBasedOnId(
       ops_util::GetR1Const(candidates_flat, builder, loc);
   const mlir::Value candidates_shape =
       ops_util::GetR1Const({num_devices, output_shape_size}, builder, loc);
-  const mlir::Value candidates = builder.create<mlir::TF::ReshapeOp>(
-      loc, candidates_flat_const, candidates_shape);
+  const mlir::Value candidates = mlir::TF::ReshapeOp::create(
+      builder, loc, candidates_flat_const, candidates_shape);
 
   // Add a zero after the only value in the 1x1 device_id tensor.
-  const mlir::Value device_id_paddings = builder.create<mlir::TF::ReshapeOp>(
-      loc, ops_util::GetR1Const({0, 1}, builder, loc),
+  const mlir::Value device_id_paddings = mlir::TF::ReshapeOp::create(
+      builder, loc, ops_util::GetR1Const({0, 1}, builder, loc),
       ops_util::GetR1Const({1, 2}, builder, loc));
-  const mlir::Value device_id_padded = builder.create<mlir::TF::PadOp>(
-      loc, candidates_shape.getType(), /*input=*/device_id,
+  const mlir::Value device_id_padded = mlir::TF::PadOp::create(
+      builder, loc, candidates_shape.getType(), /*input=*/device_id,
       /*paddings=*/device_id_paddings);
 
   // Slice a vertical vector out of the 2D candidates matrix.
@@ -707,13 +712,15 @@ mlir::Value SelectElementsBasedOnId(
       {1, output_shape_size}, builder.getIntegerType(32));
   const mlir::Value chosen_shape_const =
       ops_util::GetR1Const(chosen_shape_type.getShape(), builder, loc);
-  const mlir::Value chosen = builder.create<mlir::TF::SliceOp>(
-      loc, chosen_shape_type, /*input=*/candidates, /*begin=*/device_id_padded,
+  const mlir::Value chosen = mlir::TF::SliceOp::create(
+      builder, loc, chosen_shape_type, /*input=*/candidates,
+      /*begin=*/device_id_padded,
       /*size=*/chosen_shape_const);
 
   // Remove the leading dimension of size 1 before returning the result.
-  return builder.create<mlir::TF::ReshapeOp>(
-      loc, chosen, ops_util::GetR1Const({output_shape_size}, builder, loc));
+  return mlir::TF::ReshapeOp::create(
+      builder, loc, chosen,
+      ops_util::GetR1Const({output_shape_size}, builder, loc));
 }
 
 StatusOr<const mlir::DenseIntElementsAttr> GetGroupAssignment(
@@ -841,8 +848,8 @@ mlir::LogicalResult LowerAllGatherOpToCollective(
       new_shape.push_back(input_shape_after_tr[j]);
     }
 
-    auto reshape_op = builder.create<mlir::TF::ReshapeOp>(
-        loc, /*tensor=*/collective_op->getResult(0),
+    auto reshape_op = mlir::TF::ReshapeOp::create(
+        builder, loc, /*tensor=*/collective_op->getResult(0),
         /*shape=*/ops_util::GetR1Const(new_shape, builder, loc));
 
     prev_op_result = reshape_op->getResult(0);
@@ -877,8 +884,8 @@ mlir::LogicalResult LowerAllGatherOpToCollective(
     prev_op_result = post_transpose_op->getResult(0);
   }
 
-  auto output_reshape_op = builder.create<mlir::TF::ReshapeOp>(
-      loc, /*tensor=*/prev_op_result,
+  auto output_reshape_op = mlir::TF::ReshapeOp::create(
+      builder, loc, /*tensor=*/prev_op_result,
       /*shape=*/ops_util::GetR1Const(output_shape, builder, loc));
   SetSingleLayoutOnOp(output_reshape_op, tgt_layout);
   all_gather.replaceAllUsesWith(output_reshape_op->getResult(0));
@@ -900,8 +907,8 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) {
   builder.setInsertionPointAfter(all_gather);
 
   if (concat_dims.empty()) {
-    mlir::TF::IdentityOp identity = builder.create<mlir::TF::IdentityOp>(
-        all_gather.getLoc(), all_gather.getInput().getType(),
+    mlir::TF::IdentityOp identity = mlir::TF::IdentityOp::create(
+        builder, all_gather.getLoc(), all_gather.getInput().getType(),
         all_gather.getInput());
     SetSingleLayoutOnOp(identity, tgt_layout);
 
@@ -942,7 +949,7 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) {
   const mlir::Value output_shape_const = Int64Const(builder, loc, output_shape);
   const mlir::Value zero_scalar = CreateZeroScalar(builder, loc, input_type);
   const mlir::Value zeros =
-      builder.create<mlir::TF::FillOp>(loc, output_shape_const, zero_scalar);
+      mlir::TF::FillOp::create(builder, loc, output_shape_const, zero_scalar);
 
   // For every possible device ID, generate its strided slice ranges. Store all
   // ranges---num_devices * output_shape_size * (begin, end, stride)---as three
@@ -1001,12 +1008,12 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) {
     if (!tgt_layout.mesh().is_tpu_mesh())
       return all_gather.emitOpError()
              << "source and target layout are not both on tpu";
-    update_result = builder.create<mlir::TF::XlaDynamicUpdateSliceOp>(
-        loc, zeros.getType(), /*input=*/zeros,
+    update_result = mlir::TF::XlaDynamicUpdateSliceOp::create(
+        builder, loc, zeros.getType(), /*input=*/zeros,
         /*update=*/all_gather.getInput(), /*indices=*/begin);
   } else {
-    update_result = builder.create<mlir::TF::TensorStridedSliceUpdateOp>(
-        loc, zeros.getType(),
+    update_result = mlir::TF::TensorStridedSliceUpdateOp::create(
+        builder, loc, zeros.getType(),
         /*input=*/zeros, begin, end, strides,
         /*value=*/all_gather.getInput());
   }
@@ -1062,9 +1069,9 @@ mlir::LogicalResult LowerAllGatherOp(mlir::TF::DTensorAllGatherOp all_gather) {
   absl::string_view reduce_type = kReduceOpAdd;
   if (type && type.getElementType().isInteger(1)) reduce_type = kReduceOpAny;
   mlir::TF::DTensorAllReduceOp all_reduce =
-      builder.create<mlir::TF::DTensorAllReduceOp>(
-          loc, update_result.getType(), update_result,
-          builder.create<mlir::TF::ConstOp>(loc, group_assignment),
+      mlir::TF::DTensorAllReduceOp::create(
+          builder, loc, update_result.getType(), update_result,
+          mlir::TF::ConstOp::create(builder, loc, group_assignment),
           builder.getStringAttr(std::string(reduce_type)),
           builder.getStringAttr(device_type));
   SetSingleLayoutOnOp(all_reduce, tgt_layout);
@@ -1146,12 +1153,12 @@ mlir::LogicalResult LowerAllScatterOp(
   mlir::Attribute matrix_attr =
       mlir::DenseIntElementsAttr::get(matrix_type, matrix);
   mlir::Value matrix_value =
-      builder.create<mlir::TF::ConstOp>(all_scatter.getLoc(), matrix_attr)
+      mlir::TF::ConstOp::create(builder, all_scatter.getLoc(), matrix_attr)
           .getResult();
 
   // Compute the offset from mult_matrix_value and mesh_coordinates.
-  mlir::TF::MatMulOp offset = builder.create<mlir::TF::MatMulOp>(
-      all_scatter.getLoc(),
+  mlir::TF::MatMulOp offset = mlir::TF::MatMulOp::create(
+      builder, all_scatter.getLoc(),
       mlir::RankedTensorType::get({1, original_layout.rank()},
                                   builder.getIntegerType(32)),
       mesh_coordinates, matrix_value);
@@ -1164,14 +1171,14 @@ mlir::LogicalResult LowerAllScatterOp(
   }
 
   // Input to slice needs to be rank 1, so we need to squeeze it.
-  mlir::TF::SqueezeOp offset_squeezed = builder.create<mlir::TF::SqueezeOp>(
-      all_scatter.getLoc(),
+  mlir::TF::SqueezeOp offset_squeezed = mlir::TF::SqueezeOp::create(
+      builder, all_scatter.getLoc(),
       mlir::RankedTensorType::get({original_layout.rank()},
                                   builder.getIntegerType(32)),
       offset.getProduct(), builder.getI64ArrayAttr({0}));
 
-  auto result = builder.create<mlir::TF::SliceOp>(
-      all_scatter.getLoc(), output_type, all_scatter.getInput(),
+  auto result = mlir::TF::SliceOp::create(
+      builder, all_scatter.getLoc(), output_type, all_scatter.getInput(),
       offset_squeezed.getOutput(), slice_shape_value);
 
   SetSingleLayoutOnOp(result, desired_layout);
@@ -1231,9 +1238,9 @@ mlir::LogicalResult LowerAllToAllOp(mlir::TF::DTensorAllToAllOp all_to_all) {
 
   if (mlir::StringRef(device_type).ends_with("TPU")) {
     // For TPUs, lower to XlaAllToAll.
-    mlir::Operation* xla_all_to_all = builder.create<mlir::TF::AllToAllOp>(
-        loc, all_to_all.getResult().getType(), all_to_all.getInput(),
-        builder.create<mlir::TF::ConstOp>(loc, group_assignment),
+    mlir::Operation* xla_all_to_all = mlir::TF::AllToAllOp::create(
+        builder, loc, all_to_all.getResult().getType(), all_to_all.getInput(),
+        mlir::TF::ConstOp::create(builder, loc, group_assignment),
         concat_dimension, split_dimension, group_size);
     SetSingleLayoutOnOp(xla_all_to_all, tgt_layout);
     all_to_all.replaceAllUsesWith(xla_all_to_all);
diff --git a/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc b/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc
index 6eaeacee29f611..d8a7bcd9705521 100644
--- a/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc
+++ b/tensorflow/dtensor/mlir/utils/update_tpu_metadata.cc
@@ -254,8 +254,8 @@ mlir::LogicalResult UpdateTPUCompileMetadata(const Mesh& mesh_config,
     if (mesh_config.use_xla_spmd()) {
       // Create a new compile op with the appropriate new number of operands.
       builder->setInsertionPointAfter(compile);
-      auto new_compile_op = builder->create<mlir::TF::_TPUCompileMlirOp>(
-          compile.getLoc(), compile.getCompilationStatus().getType(),
+      auto new_compile_op = mlir::TF::_TPUCompileMlirOp::create(
+          *builder, compile.getLoc(), compile.getCompilationStatus().getType(),
           /*program=*/
           llvm::SmallVector<mlir::Type, 8>(
               mesh_config.num_devices(),
diff --git a/tensorflow/dtensor/mlir/value_utils.cc b/tensorflow/dtensor/mlir/value_utils.cc
index e9240996904fd0..edc6afb95a67ab 100644
--- a/tensorflow/dtensor/mlir/value_utils.cc
+++ b/tensorflow/dtensor/mlir/value_utils.cc
@@ -103,8 +103,8 @@ mlir::Value ReshapeSizeTypeToScalar(mlir::OpBuilder builder, mlir::Location loc,
       mlir::RankedTensorType::get({}, builder.getIntegerType(32));
   mlir::Value scalar_shape =
       ops_util::GetR1Const(scalar_type.getShape(), builder, loc);
-  return builder.create<mlir::TF::ReshapeOp>(
-      loc, mlir::ArrayRef<mlir::Type>{scalar_type},
+  return mlir::TF::ReshapeOp::create(
+      builder, loc, mlir::ArrayRef<mlir::Type>{scalar_type},
       mlir::ArrayRef<mlir::Value>{tensor, scalar_shape});
 }
 
@@ -114,7 +114,7 @@ mlir::Value IntConst(mlir::OpBuilder& builder, mlir::Location loc,
       {static_cast<int64_t>(values.size())}, builder.getIntegerType(32));
   mlir::Attribute const_attr =
       mlir::DenseIntElementsAttr::get(const_type, values);
-  return builder.create<mlir::TF::ConstOp>(loc, const_attr).getResult();
+  return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult();
 }
 
 StatusOr<llvm::SmallVector<int64_t>> GetTFShapeFromType(mlir::Type type) {
@@ -133,7 +133,7 @@ mlir::Value Int64Const(mlir::OpBuilder& builder, mlir::Location loc,
       {static_cast<int64_t>(values.size())}, builder.getIntegerType(64));
   mlir::Attribute const_attr =
       mlir::DenseIntElementsAttr::get(const_type, values);
-  return builder.create<mlir::TF::ConstOp>(loc, const_attr).getResult();
+  return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult();
 }
 
 mlir::Value FloatConst(mlir::OpBuilder& builder, mlir::Location loc,
@@ -142,16 +142,17 @@ mlir::Value FloatConst(mlir::OpBuilder& builder, mlir::Location loc,
       {static_cast<int64_t>(values.size())}, builder.getF32Type());
   mlir::Attribute const_attr =
       mlir::DenseFPElementsAttr::get(const_type, values);
-  return builder.create<mlir::TF::ConstOp>(loc, const_attr).getResult();
+  return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult();
 }
 
 mlir::Value StringScalarConst(mlir::OpBuilder& builder, mlir::Location loc,
                               llvm::StringRef value) {
-  return builder.create<mlir::TF::ConstOp>(
-      loc, mlir::DenseStringElementsAttr::get(
-               mlir::RankedTensorType::get(
-                   {}, builder.getType<mlir::TF::StringType>()),
-               value));
+  return mlir::TF::ConstOp::create(
+      builder, loc,
+      mlir::DenseStringElementsAttr::get(
+          mlir::RankedTensorType::get({},
+                                      builder.getType<mlir::TF::StringType>()),
+          value));
 }
 
 mlir::Value StringConst(mlir::OpBuilder& builder, mlir::Location loc,
@@ -161,7 +162,7 @@ mlir::Value StringConst(mlir::OpBuilder& builder, mlir::Location loc,
                                   builder.getType<mlir::TF::StringType>());
   mlir::Attribute const_attr =
       mlir::DenseStringElementsAttr::get(const_type, values);
-  return builder.create<mlir::TF::ConstOp>(loc, const_attr).getResult();
+  return mlir::TF::ConstOp::create(builder, loc, const_attr).getResult();
 }
 
 mlir::Value IntConstWithMatchingType(mlir::OpBuilder& builder,
@@ -213,14 +214,16 @@ absl::Status ExtractConstVectorFromValue(
 mlir::Value CreateIntScalarConst(const int64_t value, mlir::OpBuilder builder,
                                  mlir::Location loc, bool use_int64) {
   if (use_int64) {
-    return builder.create<mlir::TF::ConstOp>(
-        loc, mlir::DenseIntElementsAttr::get(
-                 mlir::RankedTensorType::get({}, builder.getI64Type()), value));
+    return mlir::TF::ConstOp::create(
+        builder, loc,
+        mlir::DenseIntElementsAttr::get(
+            mlir::RankedTensorType::get({}, builder.getI64Type()), value));
   } else {
-    return builder.create<mlir::TF::ConstOp>(
-        loc, mlir::DenseIntElementsAttr::get(
-                 mlir::RankedTensorType::get({}, builder.getI32Type()),
-                 static_cast<int32_t>(value)));
+    return mlir::TF::ConstOp::create(
+        builder, loc,
+        mlir::DenseIntElementsAttr::get(
+            mlir::RankedTensorType::get({}, builder.getI32Type()),
+            static_cast<int32_t>(value)));
   }
 }
 
@@ -228,32 +231,32 @@ StatusOr<mlir::Value> CreateZeroScalarConst(mlir::OpBuilder& builder,
                                             mlir::Location loc,
                                             mlir::Type type) {
   if (type.isF64()) {
-    return builder
-        .create<mlir::TF::ConstOp>(
-            loc, mlir::DenseFPElementsAttr::get(
-                     mlir::RankedTensorType::get({}, builder.getF64Type()),
-                     static_cast<double>(0.)))
+    return mlir::TF::ConstOp::create(
+               builder, loc,
+               mlir::DenseFPElementsAttr::get(
+                   mlir::RankedTensorType::get({}, builder.getF64Type()),
+                   static_cast<double>(0.)))
         .getResult();
   } else if (type.isF32()) {
-    return builder
-        .create<mlir::TF::ConstOp>(
-            loc, mlir::DenseFPElementsAttr::get(
-                     mlir::RankedTensorType::get({}, builder.getF32Type()),
-                     static_cast<float>(0.f)))
+    return mlir::TF::ConstOp::create(
+               builder, loc,
+               mlir::DenseFPElementsAttr::get(
+                   mlir::RankedTensorType::get({}, builder.getF32Type()),
+                   static_cast<float>(0.f)))
         .getResult();
   } else if (type.isInteger(32)) {
-    return builder
-        .create<mlir::TF::ConstOp>(
-            loc, mlir::DenseIntElementsAttr::get(
-                     mlir::RankedTensorType::get({}, builder.getI32Type()),
-                     static_cast<int32_t>(0)))
+    return mlir::TF::ConstOp::create(
+               builder, loc,
+               mlir::DenseIntElementsAttr::get(
+                   mlir::RankedTensorType::get({}, builder.getI32Type()),
+                   static_cast<int32_t>(0)))
         .getResult();
   } else if (type.isInteger(64)) {
-    return builder
-        .create<mlir::TF::ConstOp>(
-            loc, mlir::DenseIntElementsAttr::get(
-                     mlir::RankedTensorType::get({}, builder.getI64Type()),
-                     static_cast<int64_t>(0)))
+    return mlir::TF::ConstOp::create(
+               builder, loc,
+               mlir::DenseIntElementsAttr::get(
+                   mlir::RankedTensorType::get({}, builder.getI64Type()),
+                   static_cast<int64_t>(0)))
         .getResult();
   } else {
     return errors::InvalidArgument(
@@ -270,8 +273,9 @@ StatusOr<mlir::Value> SelectScalarValueFromArray(mlir::OpBuilder& builder,
     return errors::InvalidArgument("Input array must have shape [1, N].");
   }
 
-  mlir::TF::SliceOp sliced_value = builder.create<mlir::TF::SliceOp>(
-      location, mlir::RankedTensorType::get({1, 1}, arrayType.getElementType()),
+  mlir::TF::SliceOp sliced_value = mlir::TF::SliceOp::create(
+      builder, location,
+      mlir::RankedTensorType::get({1, 1}, arrayType.getElementType()),
       /*input=*/array,
       /*begin=*/IntConst(builder, location, {0, index}),
       /*size=*/IntConst(builder, location, {1, 1}));
@@ -281,8 +285,8 @@ StatusOr<mlir::Value> SelectScalarValueFromArray(mlir::OpBuilder& builder,
       mlir::RankedTensorType::get({}, builder.getIntegerType(32));
   mlir::Value scalar_shape = mlir::TF::collection_ops_util::GetR1Const(
       scalar_size_type.getShape(), builder, location);
-  mlir::Value scalar_sliced_value = builder.create<mlir::TF::ReshapeOp>(
-      location, mlir::ArrayRef<mlir::Type>{scalar_size_type},
+  mlir::Value scalar_sliced_value = mlir::TF::ReshapeOp::create(
+      builder, location, mlir::ArrayRef<mlir::Type>{scalar_size_type},
       mlir::ArrayRef<mlir::Value>{sliced_value.getOutput(), scalar_shape},
       mlir::ArrayRef<mlir::NamedAttribute>{});
   return scalar_sliced_value;
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
index e7fc61a4c0e774..bbbd45800e3f5b 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_fusion_emitter.cc
@@ -227,8 +227,8 @@ absl::StatusOr<mlir::func::FuncOp> EmitEntryFunctionApi(
   }
 
   builder.setInsertionPointToStart(fusion_module.getBody());
-  auto entry_func = builder.create<FuncOp>(
-      loc, entry_function_name,
+  auto entry_func = FuncOp::create(
+      builder, loc, entry_function_name,
       mlir::FunctionType::get(context, param_types, result_types),
       /*sym_visibility=*/mlir::StringAttr{},
       mlir::ArrayAttr::get(context, arg_attrs),
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
index 711bdb28709ed0..218f246803de9b 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/cpu_scatter_emitter.cc
@@ -165,19 +165,19 @@ SmallVector<Value> EmitScatterComputation(
     ret.reserve(reduced_values.size());
     for (const auto& [reduced_value, output_tensor] :
          llvm::zip(reduced_values, output_tensors)) {
-      ret.push_back(b.create<mlir::tensor::InsertOp>(reduced_value,
-                                                     output_tensor, indices));
+      ret.push_back(mlir::tensor::InsertOp::create(b, reduced_value,
+                                                   output_tensor, indices));
     }
     return ret;
   }
   Value output_tensor = output_tensors.front();
   Value update_elem = update_elems.front();
-  auto atomic_rmw = b.create<AtomicRMWOp>(output_tensor, indices);
+  auto atomic_rmw = AtomicRMWOp::create(b, output_tensor, indices);
   mlir::OpBuilder body_builder = atomic_rmw.getBodyBuilder();
   auto reduced_val =
       emitters::InlineBlock(body_builder, reducer.getBody().front(),
                             {atomic_rmw.getCurrentValue(), update_elem})[0];
-  body_builder.create<xla::YieldOp>(reducer->getLoc(), reduced_val);
+  xla::YieldOp::create(body_builder, reducer->getLoc(), reduced_val);
   return {atomic_rmw->getResult(0)};
 }
 
@@ -444,7 +444,7 @@ absl::Status CpuScatterFusion::EmitEntryFunction(
                           updated_outputs);
                     },
                     [&](mlir::OpBuilder& else_b, mlir::Location else_loc) {
-                      else_b.create<scf::YieldOp>(else_loc, output_tensors);
+                      scf::YieldOp::create(else_b, else_loc, output_tensors);
                     })
                 .getResults();
         return predicated_updates;
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/peel_workgroup_loop.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/peel_workgroup_loop.cc
index 57b529747f90fa..231889025496a5 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/peel_workgroup_loop.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/peel_workgroup_loop.cc
@@ -114,9 +114,9 @@ struct PeelWorkgroupLoopPattern : public mlir::OpRewritePattern<xla::LoopOp> {
       }
 
       mlir::ImplicitLocOpBuilder builder(loop_op.getLoc(), rewriter);
-      auto cmp_op = builder.create<mlir::arith::CmpIOp>(
-          mlir::arith::CmpIPredicate::sle, work_group_dim.operand,
-          builder.create<mlir::arith::ConstantIndexOp>(query_dimension_upper));
+      auto cmp_op = mlir::arith::CmpIOp::create(
+          builder, mlir::arith::CmpIPredicate::sle, work_group_dim.operand,
+          mlir::arith::ConstantIndexOp::create(builder, query_dimension_upper));
 
       auto loop_body_cloner = GetLoopBodyCloner(loop_op);
 
@@ -128,11 +128,11 @@ struct PeelWorkgroupLoopPattern : public mlir::OpRewritePattern<xla::LoopOp> {
             query_dimension_upper;
         peeled_map.Simplify();
 
-        auto peeled_loop =
-            then_builder.create<LoopOp>(then_loc, peeled_map, loop_op.getDims(),
-                                        loop_op.getInits(), loop_body_cloner);
-        then_builder.create<mlir::scf::YieldOp>(then_loc,
-                                                peeled_loop.getResults());
+        auto peeled_loop = LoopOp::create(then_builder, then_loc, peeled_map,
+                                          loop_op.getDims(), loop_op.getInits(),
+                                          loop_body_cloner);
+        mlir::scf::YieldOp::create(then_builder, then_loc,
+                                   peeled_loop.getResults());
       };
       auto else_body_builder = [&](mlir::OpBuilder& else_builder,
                                    mlir::Location else_loc) -> void {
@@ -142,14 +142,14 @@ struct PeelWorkgroupLoopPattern : public mlir::OpRewritePattern<xla::LoopOp> {
         tail_map.Simplify();
 
         auto tail_loop =
-            else_builder.create<LoopOp>(else_loc, tail_map, loop_op.getDims(),
-                                        loop_op.getInits(), loop_body_cloner);
-        else_builder.create<mlir::scf::YieldOp>(else_loc,
-                                                tail_loop.getResults());
+            LoopOp::create(else_builder, else_loc, tail_map, loop_op.getDims(),
+                           loop_op.getInits(), loop_body_cloner);
+        mlir::scf::YieldOp::create(else_builder, else_loc,
+                                   tail_loop.getResults());
       };
 
-      auto if_op = builder.create<mlir::scf::IfOp>(cmp_op, then_body_builder,
-                                                   else_body_builder);
+      auto if_op = mlir::scf::IfOp::create(builder, cmp_op, then_body_builder,
+                                           else_body_builder);
 
       rewriter.replaceOp(loop_op, if_op.getResults());
       return mlir::success();
diff --git a/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/xla_cpu_rewrite_patterns.cc b/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/xla_cpu_rewrite_patterns.cc
index fd71b9ee8d36f0..8eee5850bbdde4 100644
--- a/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/xla_cpu_rewrite_patterns.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/emitters/transforms/xla_cpu_rewrite_patterns.cc
@@ -90,24 +90,24 @@ struct LowerLoadOp : public mlir::OpRewritePattern<LoadOp> {
     auto kernel_arg = KernelArgType(b.getContext());
 
     // Get a pointer to the first `KernelArg` struct.
-    auto cast = b.create<mlir::UnrealizedConversionCastOp>(op.getLoc(), ptr,
-                                                           op.getCallFrame())
+    auto cast = mlir::UnrealizedConversionCastOp::create(b, op.getLoc(), ptr,
+                                                         op.getCallFrame())
                     .getResult(0);
-    auto args_gep = b.create<mlir::LLVM::GEPOp>(
-        ptr, kernel_call_frame, cast,
+    auto args_gep = mlir::LLVM::GEPOp::create(
+        b, ptr, kernel_call_frame, cast,
         llvm::SmallVector<mlir::LLVM::GEPArg, 2>{mlir::LLVM::GEPArg(0),
                                                  mlir::LLVM::GEPArg(3)},
         mlir::LLVM::GEPNoWrapFlags::inbounds);
-    auto args_ptr = b.create<mlir::LLVM::LoadOp>(ptr, args_gep);
+    auto args_ptr = mlir::LLVM::LoadOp::create(b, ptr, args_gep);
     args_ptr.setInvariant(true);
 
     // Get a pointer to the `KernelArg` at the given index.
-    auto arg_gep = b.create<mlir::LLVM::GEPOp>(
-        ptr, kernel_arg, args_ptr,
+    auto arg_gep = mlir::LLVM::GEPOp::create(
+        b, ptr, kernel_arg, args_ptr,
         llvm::SmallVector<mlir::LLVM::GEPArg, 2>{
             mlir::LLVM::GEPArg(op.getIndex()), mlir::LLVM::GEPArg(0)},
         mlir::LLVM::GEPNoWrapFlags::inbounds);
-    auto arg_ptr = b.create<mlir::LLVM::LoadOp>(ptr, arg_gep);
+    auto arg_ptr = mlir::LLVM::LoadOp::create(b, ptr, arg_gep);
     arg_ptr.setInvariant(true);
 
     if (auto dereferenceable = op->getAttrOfType<mlir::IntegerAttr>(
@@ -121,12 +121,12 @@ struct LowerLoadOp : public mlir::OpRewritePattern<LoadOp> {
       mlir::LLVMTypeConverter converter(rewriter.getContext());
       mlir::Value memref_desc = mlir::MemRefDescriptor::fromStaticShape(
           b, op.getLoc(), converter, memref_type, arg_ptr);
-      auto memref_cast = b.create<mlir::UnrealizedConversionCastOp>(
-          op.getLoc(), op.getResult().getType(), memref_desc);
+      auto memref_cast = mlir::UnrealizedConversionCastOp::create(
+          b, op.getLoc(), op.getResult().getType(), memref_desc);
       rewriter.replaceOp(op, memref_cast);
     } else {
-      auto arg_ptr_cast = b.create<mlir::UnrealizedConversionCastOp>(
-          op.getLoc(), op.getResult().getType(), arg_ptr.getResult());
+      auto arg_ptr_cast = mlir::UnrealizedConversionCastOp::create(
+          b, op.getLoc(), op.getResult().getType(), arg_ptr.getResult());
       rewriter.replaceOp(op, arg_ptr_cast.getResult(0));
     }
     return mlir::success();
@@ -149,26 +149,25 @@ struct LowerExtractWorkgroupIdOp
     auto i64_ty = builder.getI64Type();
 
     // Get a pointer to the `WorkGroupThread` struct.
-    auto cast = builder
-                    .create<mlir::UnrealizedConversionCastOp>(ptr_type,
-                                                              op.getCallFrame())
+    auto cast = mlir::UnrealizedConversionCastOp::create(builder, ptr_type,
+                                                         op.getCallFrame())
                     .getResult(0);
-    auto workgroup_gep = builder.create<mlir::LLVM::GEPOp>(
-        ptr_type, kernel_call_frame, cast,
+    auto workgroup_gep = mlir::LLVM::GEPOp::create(
+        builder, ptr_type, kernel_call_frame, cast,
         mlir::ArrayRef<mlir::LLVM::GEPArg>{mlir::LLVM::GEPArg(0),
                                            mlir::LLVM::GEPArg(1)},
         mlir::LLVM::GEPNoWrapFlags::inbounds);
     auto workgroup_ptr =
-        builder.create<mlir::LLVM::LoadOp>(ptr_type, workgroup_gep);
+        mlir::LLVM::LoadOp::create(builder, ptr_type, workgroup_gep);
 
     int32_t workgroup_dim_idx = static_cast<int32_t>(op.getDimension());
-    auto workgroup_dim_gep = builder.create<mlir::LLVM::GEPOp>(
-        ptr_type, kernel_dim, workgroup_ptr,
+    auto workgroup_dim_gep = mlir::LLVM::GEPOp::create(
+        builder, ptr_type, kernel_dim, workgroup_ptr,
         mlir::ArrayRef<mlir::LLVM::GEPArg>{
             mlir::LLVM::GEPArg(0), mlir::LLVM::GEPArg(workgroup_dim_idx)},
         mlir::LLVM::GEPNoWrapFlags::inbounds);
     auto workgroup_dim_load =
-        builder.create<mlir::LLVM::LoadOp>(i64_ty, workgroup_dim_gep);
+        mlir::LLVM::LoadOp::create(builder, i64_ty, workgroup_dim_gep);
     workgroup_dim_load.setInvariant(true);
 
     mlir::Value workgroup_dim = workgroup_dim_load.getResult();
@@ -176,11 +175,12 @@ struct LowerExtractWorkgroupIdOp
         mlir::DataLayout::closest(builder.getInsertionBlock()->getParentOp())
             .getTypeSizeInBits(mlir::IndexType::get(context)));
     if (index_ty != i64_ty) {
-      workgroup_dim = builder.create<mlir::LLVM::TruncOp>(
-          index_ty, workgroup_dim, mlir::LLVM::IntegerOverflowFlags::nsw);
+      workgroup_dim =
+          mlir::LLVM::TruncOp::create(builder, index_ty, workgroup_dim,
+                                      mlir::LLVM::IntegerOverflowFlags::nsw);
     }
-    auto workgroup_dim_cast = builder.create<mlir::UnrealizedConversionCastOp>(
-        mlir::IndexType::get(context), workgroup_dim);
+    auto workgroup_dim_cast = mlir::UnrealizedConversionCastOp::create(
+        builder, mlir::IndexType::get(context), workgroup_dim);
 
     rewriter.replaceOp(op, workgroup_dim_cast.getResult(0));
 
@@ -252,8 +252,8 @@ struct RewriteFunctionSignatures : mlir::OpRewritePattern<mlir::func::FuncOp> {
     llvm::SmallVector<mlir::Type> new_operands{ptr};
     rewriter.setInsertionPointToStart(&op.getBody().front());
 
-    auto cast = rewriter.create<mlir::UnrealizedConversionCastOp>(
-        op.getLoc(), func_type.getInput(0), op.getArgument(0));
+    auto cast = mlir::UnrealizedConversionCastOp::create(
+        rewriter, op.getLoc(), func_type.getInput(0), op.getArgument(0));
     op.getArgument(0).replaceAllUsesExcept(cast.getResult(0), cast);
     op.setFunctionType(rewriter.getFunctionType(new_operands, {ptr}));
     auto& entry = op->getRegion(0).front();
@@ -301,8 +301,9 @@ class WrapEntryWithCallFrame
 
     auto call_frame_type = CallFrameType::get(context);
     auto error_type = ErrorType::get(context);
-    mlir::func::FuncOp kernel_func = builder.create<mlir::func::FuncOp>(
-        kernel_name, rewriter.getFunctionType({call_frame_type}, {error_type}));
+    mlir::func::FuncOp kernel_func = mlir::func::FuncOp::create(
+        builder, kernel_name,
+        rewriter.getFunctionType({call_frame_type}, {error_type}));
 
     builder.setInsertionPointToStart(kernel_func.addEntryBlock());
 
@@ -316,7 +317,7 @@ class WrapEntryWithCallFrame
       mlir::DictionaryAttr arg_attr =
           arg_attrs ? mlir::dyn_cast<mlir::DictionaryAttr>(arg_attrs[idx])
                     : nullptr;
-      LoadOp load = builder.create<LoadOp>(arg.getType(), call_frame_arg, idx);
+      LoadOp load = LoadOp::create(builder, arg.getType(), call_frame_arg, idx);
       if (arg_attr) {
         load->setAttrs(arg_attr);
       }
@@ -325,16 +326,17 @@ class WrapEntryWithCallFrame
 
     for (auto workgroup_id : {WorkGroupDimension::x, WorkGroupDimension::y,
                               WorkGroupDimension::z}) {
-      call_args.push_back(builder.create<ExtractWorkgroupIdOp>(
-          mlir::IndexType::get(context), call_frame_arg, workgroup_id));
+      call_args.push_back(
+          ExtractWorkgroupIdOp::create(builder, mlir::IndexType::get(context),
+                                       call_frame_arg, workgroup_id));
     }
 
     // Use func::call here rather than pure call to avoid the entry function
     // being DCEd.
-    builder.create<mlir::func::CallOp>(op, call_args);
+    mlir::func::CallOp::create(builder, op, call_args);
 
-    auto error = builder.create<cpu::SuccessOp>(error_type);
-    builder.create<mlir::func::ReturnOp>(error.getResult());
+    auto error = cpu::SuccessOp::create(builder, error_type);
+    mlir::func::ReturnOp::create(builder, error.getResult());
 
     op->setAttr("xla.cpu.is_wrapped", builder.getUnitAttr());
     op.setPrivate();
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lower_xtile_entry.cc b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lower_xtile_entry.cc
index 6ae31ae3594c00..b82d03d47c699e 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lower_xtile_entry.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lower_xtile_entry.cc
@@ -76,8 +76,9 @@ struct LowerXtileEntry : mlir::OpRewritePattern<xtile::EntryFuncOp> {
       }
     }
 
-    auto new_func_op = rewriter.create<mlir::func::FuncOp>(
-        op->getLoc(), op.getSymName(), op.getFunctionType(), filtered_attrs);
+    auto new_func_op =
+        mlir::func::FuncOp::create(rewriter, op->getLoc(), op.getSymName(),
+                                   op.getFunctionType(), filtered_attrs);
     new_func_op.setArgAttrsAttr(op.getArgAttrsAttr());
 
     // Move the region from the old function to the new one.
@@ -99,7 +100,8 @@ struct LowerXTileEntryReturn
   mlir::LogicalResult matchAndRewrite(
       xtile::EntryFuncReturnOp op,
       mlir::PatternRewriter& rewriter) const override {
-    rewriter.replaceOp(op, rewriter.create<mlir::func::ReturnOp>(op->getLoc()));
+    rewriter.replaceOp(op,
+                       mlir::func::ReturnOp::create(rewriter, op->getLoc()));
     return mlir::success();
   }
 };
@@ -151,8 +153,8 @@ class LowerXTileEntryPass
       auto call_frame_type = CallFrameType::get(context);
       auto error_type = ErrorType::get(context);
       builder.setInsertionPointToStart(module.getBody());
-      mlir::func::FuncOp kernel_func = builder.create<mlir::func::FuncOp>(
-          kernel_name,
+      mlir::func::FuncOp kernel_func = mlir::func::FuncOp::create(
+          builder, kernel_name,
           builder.getFunctionType({call_frame_type}, {error_type}));
 
       builder.setInsertionPointToStart(kernel_func.addEntryBlock());
@@ -162,7 +164,7 @@ class LowerXTileEntryPass
       llvm::SmallVector<mlir::Value> call_args;
       for (const auto& [idx, arg] :
            llvm::enumerate(entry_func.getBufferArgs())) {
-        LoadOp load = builder.create<LoadOp>(arg.getType(), call_frame, idx);
+        LoadOp load = LoadOp::create(builder, arg.getType(), call_frame, idx);
         call_args.push_back(load);
       }
 
@@ -177,11 +179,11 @@ class LowerXTileEntryPass
       int32_t tiles_per_workgroup = tile_info.getTilesPerWorkgroup();
 
       mlir::Value tile_count_value =
-          builder.create<mlir::arith::ConstantIndexOp>(tile_count);
+          mlir::arith::ConstantIndexOp::create(builder, tile_count);
       mlir::Value tiles_per_workgroup_value =
-          builder.create<mlir::arith::ConstantIndexOp>(tiles_per_workgroup);
-      mlir::Value workgroup_id = builder.create<ExtractWorkgroupIdOp>(
-          builder.getIndexType(), call_frame, WorkGroupDimension::x);
+          mlir::arith::ConstantIndexOp::create(builder, tiles_per_workgroup);
+      mlir::Value workgroup_id = ExtractWorkgroupIdOp::create(
+          builder, builder.getIndexType(), call_frame, WorkGroupDimension::x);
 
       auto flags = mlir::arith::IntegerOverflowFlags::nsw |
                    mlir::arith::IntegerOverflowFlags::nuw;
@@ -189,23 +191,24 @@ class LowerXTileEntryPass
       // This isn't needed for correctness as the workgroup id passed from the
       // runtime will always be in bounds but it constrains the range which LLVM
       // can then take advantage of.
-      mlir::Value bounded_workgroup_id = builder.create<mlir::arith::MaxSIOp>(
-          workgroup_id, builder.create<mlir::arith::ConstantIndexOp>(0));
+      mlir::Value bounded_workgroup_id = mlir::arith::MaxSIOp::create(
+          builder, workgroup_id,
+          mlir::arith::ConstantIndexOp::create(builder, 0));
 
-      mlir::Value start_tile_id = builder.create<mlir::arith::MulIOp>(
-          bounded_workgroup_id, tiles_per_workgroup_value, flags);
-      mlir::Value bounded_start_tile_id =
-          builder.create<mlir::arith::MinSIOp>(start_tile_id, tile_count_value);
+      mlir::Value start_tile_id = mlir::arith::MulIOp::create(
+          builder, bounded_workgroup_id, tiles_per_workgroup_value, flags);
+      mlir::Value bounded_start_tile_id = mlir::arith::MinSIOp::create(
+          builder, start_tile_id, tile_count_value);
 
-      mlir::Value end_tile_id = builder.create<mlir::arith::AddIOp>(
-          start_tile_id, tiles_per_workgroup_value, flags);
+      mlir::Value end_tile_id = mlir::arith::AddIOp::create(
+          builder, start_tile_id, tiles_per_workgroup_value, flags);
       mlir::Value bounded_end_tile_id =
-          builder.create<mlir::arith::MinSIOp>(end_tile_id, tile_count_value);
+          mlir::arith::MinSIOp::create(builder, end_tile_id, tile_count_value);
 
-      mlir::Value step = builder.create<mlir::arith::ConstantIndexOp>(1);
+      mlir::Value step = mlir::arith::ConstantIndexOp::create(builder, 1);
 
-      auto for_op = builder.create<mlir::scf::ForOp>(bounded_start_tile_id,
-                                                     bounded_end_tile_id, step);
+      auto for_op = mlir::scf::ForOp::create(builder, bounded_start_tile_id,
+                                             bounded_end_tile_id, step);
       {
         mlir::ImplicitLocOpBuilder body_builder(entry_func->getLoc(),
                                                 entry_func);
@@ -213,12 +216,12 @@ class LowerXTileEntryPass
 
         call_args.push_back(for_op.getInductionVar());
 
-        body_builder.create<mlir::func::CallOp>(kernel_impl_name,
-                                                mlir::TypeRange(), call_args);
+        mlir::func::CallOp::create(body_builder, kernel_impl_name,
+                                   mlir::TypeRange(), call_args);
       }
 
-      auto error = builder.create<cpu::SuccessOp>(error_type);
-      builder.create<mlir::func::ReturnOp>(error.getResult());
+      auto error = cpu::SuccessOp::create(builder, error_type);
+      mlir::func::ReturnOp::create(builder, error.getResult());
     }
 
     return mlir::success();
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lowering_utils.cc b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lowering_utils.cc
index c88fd34f242e40..0eae2b15ec6e66 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lowering_utils.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/lowering_utils.cc
@@ -44,8 +44,9 @@ mlir::VectorType GetVectorType(mlir::ShapedType type) {
 mlir::TypedValue<mlir::VectorType> ReadTensorToVector(mlir::OpBuilder& builder,
                                                       mlir::Value input) {
   if (input.getType().isIntOrFloat()) {
-    return builder.create<mlir::vector::FromElementsOp>(
-        input.getLoc(), mlir::VectorType::get({}, input.getType()), input);
+    return mlir::vector::FromElementsOp::create(
+        builder, input.getLoc(), mlir::VectorType::get({}, input.getType()),
+        input);
   }
 
   auto input_tensor =
@@ -65,9 +66,9 @@ mlir::RankedTensorType GetTensorType(mlir::ShapedType type) {
 mlir::TypedValue<mlir::RankedTensorType> WriteVectorToTensor(
     mlir::OpBuilder& builder, mlir::Value input) {
   if (input.getType().isIntOrFloat()) {
-    return builder.create<mlir::tensor::FromElementsOp>(
-        input.getLoc(), mlir::RankedTensorType::get({}, input.getType()),
-        input);
+    return mlir::tensor::FromElementsOp::create(
+        builder, input.getLoc(),
+        mlir::RankedTensorType::get({}, input.getType()), input);
   }
 
   auto input_vector = mlir::cast<mlir::TypedValue<mlir::VectorType>>(input);
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/shlo_to_vector.cc b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/shlo_to_vector.cc
index 257bef3ecf366a..913ff9426f5cfd 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/shlo_to_vector.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/shlo_to_vector.cc
@@ -180,9 +180,9 @@ struct LowerDotGeneral : mlir::OpRewritePattern<mlir::stablehlo::DotGeneralOp> {
     mlir::ArrayAttr iterator_types = GetIteratorTypes(
         rewriter, iterator_count, lhs_batch.size(), lhs_contracting.size());
 
-    mlir::Value result = rewriter.create<mlir::vector::ContractionOp>(
-        op->getLoc(), lhs_vector, rhs_vector, accumulator, indexing_maps,
-        iterator_types);
+    mlir::Value result = mlir::vector::ContractionOp::create(
+        rewriter, op->getLoc(), lhs_vector, rhs_vector, accumulator,
+        indexing_maps, iterator_types);
 
     rewriter.replaceOp(op, WriteVectorToTensor(rewriter, result));
 
@@ -193,16 +193,16 @@ struct LowerDotGeneral : mlir::OpRewritePattern<mlir::stablehlo::DotGeneralOp> {
   mlir::Value GetAccumulator(mlir::OpBuilder& builder, mlir::Location loc,
                              mlir::RankedTensorType result_type) const {
     mlir::Type element_type = result_type.getElementType();
-    auto zero_const = builder.create<mlir::arith::ConstantOp>(
-        loc, element_type, builder.getZeroAttr(element_type));
+    auto zero_const = mlir::arith::ConstantOp::create(
+        builder, loc, element_type, builder.getZeroAttr(element_type));
 
     if (result_type.getRank() == 0) {
       return zero_const;
     }
 
     auto result_vector_type = GetVectorType(result_type);
-    return builder.create<mlir::vector::BroadcastOp>(loc, result_vector_type,
-                                                     zero_const);
+    return mlir::vector::BroadcastOp::create(builder, loc, result_vector_type,
+                                             zero_const);
   }
 };
 
@@ -215,8 +215,8 @@ struct LowerTranspose : mlir::OpRewritePattern<mlir::stablehlo::TransposeOp> {
     mlir::Value source_vector = ReadTensorToVector(rewriter, op.getOperand());
 
     mlir::TypedValue<mlir::VectorType> dest_vector =
-        rewriter.create<mlir::vector::TransposeOp>(op->getLoc(), source_vector,
-                                                   op.getPermutation());
+        mlir::vector::TransposeOp::create(rewriter, op->getLoc(), source_vector,
+                                          op.getPermutation());
 
     mlir::Value dest_tensor = WriteVectorToTensor(rewriter, dest_vector);
 
@@ -242,8 +242,9 @@ struct LowerReduce : mlir::OpRewritePattern<mlir::stablehlo::ReduceOp> {
     auto result_type =
         mlir::cast<mlir::RankedTensorType>(result_tensor.getType());
 
-    mlir::Value init_value = rewriter.create<mlir::tensor::ExtractOp>(
-        op->getLoc(), result_type.getElementType(), op.getInitValues().front());
+    mlir::Value init_value = mlir::tensor::ExtractOp::create(
+        rewriter, op->getLoc(), result_type.getElementType(),
+        op.getInitValues().front());
 
     // Ensure the reduction dimensions are sorted so we can easily check if the
     // minor dimension is reduced.
diff --git a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/vectorized_reduce_emitter.cc b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/vectorized_reduce_emitter.cc
index 4f7354d0cc669c..b0711dba1037b2 100644
--- a/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/vectorized_reduce_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/tiled/transforms/vectorized_reduce_emitter.cc
@@ -73,7 +73,7 @@ static void InsertValue(mlir::OpBuilder& builder, mlir::Location loc,
   llvm::SmallVector<mlir::Value> padded_indices(indices);
   while (padded_indices.size() < buffer.getType().getRank()) {
     padded_indices.push_back(
-        builder.create<mlir::arith::ConstantIndexOp>(loc, 0));
+        mlir::arith::ConstantIndexOp::create(builder, loc, 0));
   }
 
   if (mlir::isa<mlir::VectorType>(value.getType())) {
@@ -105,14 +105,14 @@ static std::array<llvm::SmallVector<mlir::Value>, 3> GetLoopBounds(
     llvm::ArrayRef<int64_t> upper_bounds, int64_t lower_bound = 0) {
   llvm::SmallVector<mlir::Value> lbs(
       upper_bounds.size(),
-      builder.create<mlir::arith::ConstantIndexOp>(loc, lower_bound));
+      mlir::arith::ConstantIndexOp::create(builder, loc, lower_bound));
   llvm::SmallVector<mlir::Value> ubs =
       llvm::map_to_vector(upper_bounds, [&](int64_t size) -> mlir::Value {
-        return builder.create<mlir::arith::ConstantIndexOp>(loc, size);
+        return mlir::arith::ConstantIndexOp::create(builder, loc, size);
       });
   llvm::SmallVector<mlir::Value> step(
       upper_bounds.size(),
-      builder.create<mlir::arith::ConstantIndexOp>(loc, 1));
+      mlir::arith::ConstantIndexOp::create(builder, loc, 1));
   return {lbs, ubs, step};
 }
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc
index 7ac2dd35a193d8..1d7e637a0eaa90 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/transforms/int4_passes.cc
@@ -172,15 +172,15 @@ class I4ToI8Converter : public TypeConverter {
 // Divides a value by an integer constant.
 Value div(ConversionPatternRewriter& r, Value value, int64_t constant) {
   auto const_attr = r.getIntegerAttr(value.getType(), constant);
-  auto const_op = r.template create<ma::ConstantOp>(value.getLoc(), const_attr);
-  return r.template create<ma::DivSIOp>(value.getLoc(), value, const_op);
+  auto const_op = ma::ConstantOp::create(r, value.getLoc(), const_attr);
+  return ma::DivSIOp::create(r, value.getLoc(), value, const_op);
 }
 
 // Divides a value by an integer constant.
 Value ceilDiv(ConversionPatternRewriter& r, Value value, int64_t constant) {
   auto const_attr = r.getIntegerAttr(value.getType(), constant);
-  auto const_op = r.template create<ma::ConstantOp>(value.getLoc(), const_attr);
-  return r.template create<ma::CeilDivSIOp>(value.getLoc(), value, const_op);
+  auto const_op = ma::ConstantOp::create(r, value.getLoc(), const_attr);
+  return ma::CeilDivSIOp::create(r, value.getLoc(), value, const_op);
 }
 
 // Returns the integer value of a constant op.
diff --git a/third_party/xla/xla/codegen/emitters/transforms/expand_float_ops.cc b/third_party/xla/xla/codegen/emitters/transforms/expand_float_ops.cc
index d2283480a1b372..22d70efcf3f33b 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/expand_float_ops.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/expand_float_ops.cc
@@ -88,25 +88,25 @@ struct RewriteErf32Pattern : public mlir::OpRewritePattern<mlir::math::ErfOp> {
 
     mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     auto c = [&](float v) -> Value {
-      return b.create<ma::ConstantFloatOp>(rewriter.getF32Type(),
-                                           llvm::APFloat(v));
+      return ma::ConstantFloatOp::create(b, rewriter.getF32Type(),
+                                         llvm::APFloat(v));
     };
 
     auto poly = [&](auto x, auto coefficients) -> Value {
       auto r = c(coefficients[0]);
       for (int i = 1; i < coefficients.size(); ++i) {
-        r = b.create<mlir::math::FmaOp>(r, x, c(coefficients[i]));
+        r = mlir::math::FmaOp::create(b, r, x, c(coefficients[i]));
       }
       return r;
     };
 
     Value x = op.getOperand();
-    x = b.create<ma::MaximumFOp>(x, c(-kErfInvOneMinusHalfULP));
-    x = b.create<ma::MinimumFOp>(x, c(kErfInvOneMinusHalfULP));
-    Value x2 = b.create<ma::MulFOp>(x, x);
+    x = ma::MaximumFOp::create(b, x, c(-kErfInvOneMinusHalfULP));
+    x = ma::MinimumFOp::create(b, x, c(kErfInvOneMinusHalfULP));
+    Value x2 = ma::MulFOp::create(b, x, x);
 
     rewriter.replaceOpWithNewOp<ma::DivFOp>(
-        op, b.create<ma::MulFOp>(x, poly(x2, kAlpha)), poly(x2, kBeta));
+        op, ma::MulFOp::create(b, x, poly(x2, kAlpha)), poly(x2, kBeta));
 
     return mlir::success();
   }
@@ -129,39 +129,39 @@ bool IsFNUZ(mlir::FloatType ty) {
 Value IsInf(Value value, mlir::ImplicitLocOpBuilder& b) {
   auto ty = mlir::cast<mlir::FloatType>(value.getType());
   if (mlir::LLVM::isCompatibleOuterType(ty)) {
-    value = b.create<mlir::math::AbsFOp>(value);
-    Value inf = b.create<ma::ConstantFloatOp>(
-        ty, llvm::APFloat::getInf(ty.getFloatSemantics()));
-    return b.create<ma::CmpFOp>(ma::CmpFPredicate::OEQ, value, inf);
+    value = mlir::math::AbsFOp::create(b, value);
+    Value inf = ma::ConstantFloatOp::create(
+        b, ty, llvm::APFloat::getInf(ty.getFloatSemantics()));
+    return ma::CmpFOp::create(b, ma::CmpFPredicate::OEQ, value, inf);
   }
 
   assert(ty.getIntOrFloatBitWidth() <= 8);
   // F8E5M2, F8E4M3, F8E3M4 are the only 8 bit float with infinities.
   if (llvm::isa<mlir::Float8E5M2Type>(ty)) {
-    Val bits{b.create<ma::BitcastOp>(b.getI8Type(), value), &b};
+    Val bits{ma::BitcastOp::create(b, b.getI8Type(), value), &b};
     return (bits & 0x7F) == 0x7C;
   } else if (llvm::isa<mlir::Float8E4M3Type>(ty)) {
-    Val bits{b.create<ma::BitcastOp>(b.getI8Type(), value), &b};
+    Val bits{ma::BitcastOp::create(b, b.getI8Type(), value), &b};
     return (bits & 0x7F) == 0x78;
   } else if (llvm::isa<mlir::Float8E3M4Type>(ty)) {
-    Val bits{b.create<ma::BitcastOp>(b.getI8Type(), value), &b};
+    Val bits{ma::BitcastOp::create(b, b.getI8Type(), value), &b};
     return (bits & 0x7F) == 0x70;
   } else {
-    return b.create<ma::ConstantIntOp>(b.getI1Type(), false);
+    return ma::ConstantIntOp::create(b, b.getI1Type(), false);
   }
 }
 
 Value IsNaN(Value value, mlir::ImplicitLocOpBuilder& b) {
   auto ty = value.getType();
   if (mlir::LLVM::isCompatibleOuterType(ty)) {
-    return b.create<ma::CmpFOp>(ma::CmpFPredicate::UNO, value, value);
+    return ma::CmpFOp::create(b, ma::CmpFPredicate::UNO, value, value);
   }
   if (llvm::isa<mlir::Float4E2M1FNType>(ty)) {
-    return b.create<ma::ConstantIntOp>(b.getI1Type(), false);
+    return ma::ConstantIntOp::create(b, b.getI1Type(), false);
   }
 
   assert(ty.getIntOrFloatBitWidth() == 8);
-  Val bits{b.create<ma::BitcastOp>(b.getI8Type(), value), &b};
+  Val bits{ma::BitcastOp::create(b, b.getI8Type(), value), &b};
   if (llvm::isa<mlir::Float8E5M2Type>(ty)) {
     return (bits & 0b0111'1111).cmp(ma::CmpIPredicate::ugt, 0b0111'1100);
   } else if (llvm::isa<mlir::Float8E4M3Type>(ty)) {
@@ -189,21 +189,21 @@ Value EmitReducePrecision(Value value, int exponent_bits, int mantissa_bits,
 }
 
 Value EmitF16ToF8e5m2(Value in, mlir::ImplicitLocOpBuilder& b) {
-  Val in_bits{b.create<ma::BitcastOp>(b.getI16Type(), in), &b};
+  Val in_bits{ma::BitcastOp::create(b, b.getI16Type(), in), &b};
   // Use this method of checking for NaN because it's the same as what's used
   // in the reduce precision lowering.
   Value is_nan = (in_bits & 32767).cmp(ma::CmpIPredicate::ugt, 31744);
 
   Value value = EmitReducePrecision(in, 5, 2, b);
-  value = b.create<ma::BitcastOp>(b.getI16Type(), value);
-  value = b.create<ma::ShRUIOp>(value,
-                                b.create<ma::ConstantIntOp>(b.getI16Type(), 8));
-  value = b.create<ma::TruncIOp>(b.getI8Type(), value);
+  value = ma::BitcastOp::create(b, b.getI16Type(), value);
+  value = ma::ShRUIOp::create(b, value,
+                              ma::ConstantIntOp::create(b, b.getI16Type(), 8));
+  value = ma::TruncIOp::create(b, b.getI8Type(), value);
   // When the input is NaN, just truncating can turn a NaN into an inf if the
   // mantissa becomes 0.
-  value = b.create<ma::SelectOp>(
-      is_nan, b.create<ma::ConstantIntOp>(value.getType(), 0x7F), value);
-  return b.create<ma::BitcastOp>(b.getType<mlir::Float8E5M2Type>(), value);
+  value = ma::SelectOp::create(
+      b, is_nan, ma::ConstantIntOp::create(b, value.getType(), 0x7F), value);
+  return ma::BitcastOp::create(b, b.getType<mlir::Float8E5M2Type>(), value);
 }
 
 Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
@@ -220,8 +220,8 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
     // Going through f32 and f16 is significantly faster than the fallback code
     // below.
     return EmitF16ToF8e5m2(
-        b.create<ma::TruncFOp>(b.getF16Type(),
-                               b.create<ma::ExtFOp>(b.getF32Type(), value)),
+        ma::TruncFOp::create(b, b.getF16Type(),
+                             ma::ExtFOp::create(b, b.getF32Type(), value)),
         b);
   }
 
@@ -265,23 +265,23 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
       return {v, &b};
     }
     if (ty.getIntOrFloatBitWidth() < v.getType().getIntOrFloatBitWidth()) {
-      return {b.create<ma::TruncIOp>(ty, v), &b};
+      return {ma::TruncIOp::create(b, ty, v), &b};
     }
-    return {b.create<ma::ExtUIOp>(ty, v), &b};
+    return {ma::ExtUIOp::create(b, ty, v), &b};
   };
 
   int64_t exp_offset = to_bias - from_bias;
   int digit_shift = to_mantissa - from_mantissa;
 
   int from_width = value.getType().getIntOrFloatBitWidth();
-  Val from_bits{b.create<ma::BitcastOp>(b.getIntegerType(from_width), value),
+  Val from_bits{ma::BitcastOp::create(b, b.getIntegerType(from_width), value),
                 &b};
   if (from_width < 8) {
     from_bits = convert_int(b.getIntegerType(8), from_bits);
   }
 
   auto cst = [&](mlir::Type ty, int64_t n) -> Val {
-    return {b.create<ma::ConstantIntOp>(ty, n), &b};
+    return {ma::ConstantIntOp::create(b, ty, n), &b};
   };
 
   // Shift bits to destination type, without sign bit.
@@ -368,8 +368,8 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
     // `From` supports larger values than `To`, we may overflow.
     if (std::make_pair(to_max_exp, to_mantissa) <
         std::make_pair(from_max_exp, from_mantissa)) {
-      result = b.create<SelectOp>(
-          rounded_from_bits.cmp(CmpIPredicate::ugt, aligned_highest), to_inf,
+      result = SelectOp::create(
+          b, rounded_from_bits.cmp(CmpIPredicate::ugt, aligned_highest), to_inf,
           result);
     }
   }
@@ -386,7 +386,7 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
 
     // Determine exponent in target type.
     Value clz = convert_int(
-        i32_ty, b.create<mlir::math::CountLeadingZerosOp>(from_bits));
+        i32_ty, mlir::math::CountLeadingZerosOp::create(b, from_bits));
     Value msb = cst(i32_ty, std::max(from_width, 8) - 1) - clz;
     Value normalization_factor = cst(i32_ty, from_mantissa) - msb;
 
@@ -408,7 +408,7 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
 
     Value biased_exp_sle_zero = biased_exponent.cmp(CmpIPredicate::sle, 0);
     bits = Val(
-        b.create<SelectOp>(biased_exp_sle_zero, subnormal_bits, normal_bits),
+        SelectOp::create(b, biased_exp_sle_zero, subnormal_bits, normal_bits),
         &b);
     if (digit_shift >= 0) {
       bits = bits.shl(digit_shift);
@@ -420,7 +420,7 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
     }
     bits = convert_int(to_int_ty, bits);
 
-    result = b.create<SelectOp>(biased_from_exp == 0, bits, result);
+    result = SelectOp::create(b, biased_from_exp == 0, bits, result);
   } else if (to_min_exp > from_min_exp) {
     // `To` supports fewer exponents near zero which means that some values in
     // `From` may become subnormal.
@@ -451,19 +451,19 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
             .shrui(exponent_shift_from_ty));
     // To avoid UB, limit rounding and shifting to the full mantissa plus
     // leading 1.
-    positive_bits =
-        Val(b.create<SelectOp>(
-                exponent_shift_i32.cmp(CmpIPredicate::sle, from_mantissa + 1),
-                positive_bits, to_zero),
-            &b);
+    positive_bits = Val(
+        SelectOp::create(
+            b, exponent_shift_i32.cmp(CmpIPredicate::sle, from_mantissa + 1),
+            positive_bits, to_zero),
+        &b);
 
     Val negative_bits = convert_int(to_int_ty, rounded_from_bits)
                             .shl(to_zero - exponent_shift_to_ty);
     Value bits =
-        b.create<SelectOp>(exponent_shift_i32.cmp(CmpIPredicate::sgt, 0),
-                           positive_bits, negative_bits);
-    result = b.create<SelectOp>(biased_to_exp.cmp(CmpIPredicate::sle, 0), bits,
-                                result);
+        SelectOp::create(b, exponent_shift_i32.cmp(CmpIPredicate::sgt, 0),
+                         positive_bits, negative_bits);
+    result = SelectOp::create(b, biased_to_exp.cmp(CmpIPredicate::sle, 0), bits,
+                              result);
   }
 
   Value result_is_inf = IsInf(value, b);
@@ -485,17 +485,17 @@ Value EmitFloatConversion(Value value, mlir::FloatType to_ty,
   }
 
   if (!llvm::isa<mlir::Float8E8M0FNUType>(from_ty)) {
-    result = b.create<SelectOp>(from_bits == 0, to_zero, result);
+    result = SelectOp::create(b, from_bits == 0, to_zero, result);
   }
-  result = b.create<SelectOp>(result_is_inf, to_inf, result);
-  result = b.create<SelectOp>(input_is_nan, to_nan, result);
+  result = SelectOp::create(b, result_is_inf, to_inf, result);
+  result = SelectOp::create(b, input_is_nan, to_nan, result);
 
   // Insert sign bit.
   if (!llvm::isa<mlir::Float8E8M0FNUType>(from_ty)) {
     Value neg_result = Val{result, &b} | (1ll << (to_int_ty.getWidth() - 1));
-    result = b.create<SelectOp>(from_sign_bit, neg_result, result);
+    result = SelectOp::create(b, from_sign_bit, neg_result, result);
   }
-  result = b.create<ma::BitcastOp>(to_ty, result);
+  result = ma::BitcastOp::create(b, to_ty, result);
   return result;
 }
 
@@ -569,7 +569,7 @@ struct RewriteF8Cst : public mlir::OpRewritePattern<ma::CmpFOp> {
     if (op.getPredicate() == ma::CmpFPredicate::UNE &&
         mlir::matchPattern(rhs, mlir::m_ConstantFloat(&rhs_cst))) {
       mlir::Type int_ty = rewriter.getIntegerType(lhs.getType().getWidth());
-      Val int_value{b.create<ma::BitcastOp>(int_ty, lhs), &b};
+      Val int_value{ma::BitcastOp::create(b, int_ty, lhs), &b};
       int64_t constant = rhs_cst.bitcastToAPInt().getZExtValue();
       // If we're comparing to +-0, compare the absolute values.
       if (rhs_cst.isZero() && !IsFNUZ(lhs.getType())) {
@@ -577,14 +577,14 @@ struct RewriteF8Cst : public mlir::OpRewritePattern<ma::CmpFOp> {
         int_value = int_value & mask;
         constant &= mask;
       }
-      auto cst = b.create<ma::ConstantIntOp>(int_ty, constant);
+      auto cst = ma::ConstantIntOp::create(b, int_ty, constant);
       rewriter.replaceOpWithNewOp<ma::CmpIOp>(op, ma::CmpIPredicate::ne,
                                               int_value, cst);
       return mlir::success();
     }
 
-    auto lhs_ext = b.create<ma::ExtFOp>(b.getF32Type(), lhs);
-    auto rhs_ext = b.create<ma::ExtFOp>(b.getF32Type(), rhs);
+    auto lhs_ext = ma::ExtFOp::create(b, b.getF32Type(), lhs);
+    auto rhs_ext = ma::ExtFOp::create(b, b.getF32Type(), rhs);
     rewriter.replaceOpWithNewOp<ma::CmpFOp>(op, op->getResultTypes(),
                                             mlir::ValueRange{lhs_ext, rhs_ext},
                                             op->getAttrs());
@@ -618,7 +618,7 @@ struct RewriteAbsFPattern : public mlir::OpRewritePattern<mlir::math::AbsFOp> {
 
     mlir::ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     mlir::Type i_ty = rewriter.getIntegerType(src.getType().getWidth());
-    Val value{b.create<ma::BitcastOp>(i_ty, src), &b};
+    Val value{ma::BitcastOp::create(b, i_ty, src), &b};
     int64_t mask = (1ull << (src.getType().getWidth() - 1)) - 1;
     value = value & mask;
     rewriter.replaceOpWithNewOp<ma::BitcastOp>(op, src.getType(), value);
@@ -636,7 +636,7 @@ struct RewriteIToFpPattern : public mlir::OpRewritePattern<Op> {
       return rewriter.notifyMatchFailure(op, "not an f8 (or less) itofp");
     }
     Value to_float =
-        rewriter.create<Op>(op.getLoc(), rewriter.getF32Type(), op.getIn());
+        Op::create(rewriter, op.getLoc(), rewriter.getF32Type(), op.getIn());
     rewriter.replaceOpWithNewOp<ma::TruncFOp>(op, op.getType(), to_float);
     return mlir::success();
   }
@@ -652,8 +652,8 @@ struct RewriteFpToIPattern : public mlir::OpRewritePattern<Op> {
         op.getIn().getType().getIntOrFloatBitWidth() > 8) {
       return rewriter.notifyMatchFailure(op, "not an f8 (or less) fptoi");
     }
-    Value to_f32 = rewriter.create<ma::ExtFOp>(
-        op.getLoc(), rewriter.getF32Type(), op.getIn());
+    Value to_f32 = ma::ExtFOp::create(rewriter, op.getLoc(),
+                                      rewriter.getF32Type(), op.getIn());
     rewriter.replaceOpWithNewOp<Op>(op, op.getType(), to_f32);
     return mlir::success();
   }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/flatten_tensors.cc b/third_party/xla/xla/codegen/emitters/transforms/flatten_tensors.cc
index f603a2d89a7b25..7f937e13a9fa82 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/flatten_tensors.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/flatten_tensors.cc
@@ -385,13 +385,13 @@ struct RewriteTensorInsert : OpRewritePattern<InsertOp> {
     auto linear_index = LinearizeIndex(loc, tensor_type, op.getIndices(),
                                        rewriter, tensor_type.getEncoding());
     mlir::ImplicitLocOpBuilder b(loc, rewriter);
-    auto tensor_1D = b.create<UnrealizedConversionCastOp>(
-                          GetFlattenedType(tensor_type), tensor)
+    auto tensor_1D = UnrealizedConversionCastOp::create(
+                         b, GetFlattenedType(tensor_type), tensor)
                          .getResult(0);
     auto new_insert =
-        b.create<InsertOp>(op.getScalar(), tensor_1D, linear_index);
-    auto cast_to_orig_type = b.create<UnrealizedConversionCastOp>(
-        tensor_type, new_insert.getResult());
+        InsertOp::create(b, op.getScalar(), tensor_1D, linear_index);
+    auto cast_to_orig_type = UnrealizedConversionCastOp::create(
+        b, tensor_type, new_insert.getResult());
     rewriter.replaceOp(op, cast_to_orig_type.getResult(0));
     return mlir::success();
   }
@@ -411,13 +411,13 @@ struct RewriteVectorInsert : OpRewritePattern<mv::InsertOp> {
     auto indices = mv::getAsValues(rewriter, loc, op.getMixedPosition());
     auto linear_index = LinearizeIndex(loc, vector_type, indices, rewriter);
     mlir::ImplicitLocOpBuilder b(loc, rewriter);
-    auto vector_1D = b.create<UnrealizedConversionCastOp>(
-                          GetFlattenedType(vector_type), vector)
+    auto vector_1D = UnrealizedConversionCastOp::create(
+                         b, GetFlattenedType(vector_type), vector)
                          .getResult(0);
     auto new_insert =
-        b.create<mv::InsertOp>(op.getValueToStore(), vector_1D, linear_index);
-    auto cast_to_orig_type = b.create<UnrealizedConversionCastOp>(
-        vector_type, new_insert.getResult());
+        mv::InsertOp::create(b, op.getValueToStore(), vector_1D, linear_index);
+    auto cast_to_orig_type = UnrealizedConversionCastOp::create(
+        b, vector_type, new_insert.getResult());
     rewriter.replaceOp(op, cast_to_orig_type.getResult(0));
     return mlir::success();
   }
@@ -435,10 +435,10 @@ struct RewriteVectorFromElements : OpRewritePattern<mv::FromElementsOp> {
     }
     auto loc = op.getLoc();
     mlir::ImplicitLocOpBuilder b(loc, rewriter);
-    auto new_from_elements = b.create<mv::FromElementsOp>(
-        GetFlattenedType(vector_type), op.getElements());
-    auto cast_to_orig_type = b.create<UnrealizedConversionCastOp>(
-        vector_type, new_from_elements.getResult());
+    auto new_from_elements = mv::FromElementsOp::create(
+        b, GetFlattenedType(vector_type), op.getElements());
+    auto cast_to_orig_type = UnrealizedConversionCastOp::create(
+        b, vector_type, new_from_elements.getResult());
     rewriter.replaceOp(op, cast_to_orig_type.getResult(0));
     return mlir::success();
   }
@@ -458,14 +458,14 @@ struct RewriteAtomicRMW : OpRewritePattern<AtomicRMWOp> {
     auto linear_index = LinearizeIndex(loc, tensor_type, op.getIndices(),
                                        rewriter, tensor_type.getEncoding());
     mlir::ImplicitLocOpBuilder b(loc, rewriter);
-    auto tensor_1D = b.create<UnrealizedConversionCastOp>(
-                          GetFlattenedType(tensor_type), tensor)
+    auto tensor_1D = UnrealizedConversionCastOp::create(
+                         b, GetFlattenedType(tensor_type), tensor)
                          .getResult(0);
-    auto new_atomic_rmw = b.create<AtomicRMWOp>(tensor_1D, linear_index);
+    auto new_atomic_rmw = AtomicRMWOp::create(b, tensor_1D, linear_index);
     rewriter.inlineRegionBefore(op.getRegion(),
                                 &new_atomic_rmw.getRegion().front());
-    auto cast_to_orig_type = b.create<UnrealizedConversionCastOp>(
-        tensor_type, new_atomic_rmw.getResult());
+    auto cast_to_orig_type = UnrealizedConversionCastOp::create(
+        b, tensor_type, new_atomic_rmw.getResult());
     rewriter.replaceOp(op, cast_to_orig_type.getResult(0));
     return mlir::success();
   }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc
index a42af497a0d317..261ad5326095c0 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_xla_intrinsic_lib.cc
@@ -74,7 +74,7 @@ mlir::func::FuncOp GetOrInsertDeclaration(mlir::PatternRewriter& rewriter,
   rewriter.setInsertionPointToStart(module_op.getBody());
 
   auto func_decl =
-      rewriter.create<mlir::func::FuncOp>(module_op.getLoc(), name, func_type);
+      mlir::func::FuncOp::create(rewriter, module_op.getLoc(), name, func_type);
   func_decl.setPrivate();
   return func_decl;
 }
@@ -110,14 +110,14 @@ class LowerErfPattern : public mlir::OpRewritePattern<mlir::math::ErfOp> {
       mlir::Type f32_type = get_vector_type(b.getF32Type());
 
       mlir::Value input_value =
-          b.create<mlir::arith::ExtFOp>(f32_type, op.getOperand());
+          mlir::arith::ExtFOp::create(b, f32_type, op.getOperand());
 
       auto erf_decl = codegen::intrinsics::Erf::GetOrInsertDeclaration(
           rewriter, module_op_, Type::TypeFromIrType(f32_type));
-      auto call_op = b.create<mlir::func::CallOp>(erf_decl, input_value);
+      auto call_op = mlir::func::CallOp::create(b, erf_decl, input_value);
 
       mlir::Value f32_result = call_op.getResult(0);
-      mlir::Value result = b.create<mlir::arith::TruncFOp>(type, f32_result);
+      mlir::Value result = mlir::arith::TruncFOp::create(b, type, f32_result);
 
       rewriter.replaceOp(op, result);
       return mlir::success();
@@ -129,7 +129,7 @@ class LowerErfPattern : public mlir::OpRewritePattern<mlir::math::ErfOp> {
       auto erf_decl = GetErf64Declaration(rewriter);
 
       if (!maybe_vector_type) {
-        auto call_op = b.create<mlir::func::CallOp>(erf_decl, op.getOperand());
+        auto call_op = mlir::func::CallOp::create(b, erf_decl, op.getOperand());
         rewriter.replaceOp(op, call_op->getResults());
         return mlir::success();
       }
@@ -139,7 +139,7 @@ class LowerErfPattern : public mlir::OpRewritePattern<mlir::math::ErfOp> {
         mlir::Value extracted = mlir::vector::ExtractOp::create(
             rewriter, op.getLoc(), op.getOperand(), idx);
         mlir::Value scalar_erf =
-            b.create<mlir::func::CallOp>(erf_decl, extracted).getResult(0);
+            mlir::func::CallOp::create(b, erf_decl, extracted).getResult(0);
         scalar_erf_results.push_back(scalar_erf);
       }
       rewriter.replaceOpWithNewOp<mlir::vector::FromElementsOp>(
@@ -196,7 +196,7 @@ class LowerTruncF32BF16FPattern
         codegen::intrinsics::FpTrunc::GetOrInsertDeclaration(
             rewriter, module_op_, src_type, dst_type);
     auto call_op =
-        b.create<mlir::func::CallOp>(f32_to_bf16_decl, op.getOperand());
+        mlir::func::CallOp::create(b, f32_to_bf16_decl, op.getOperand());
     rewriter.replaceOp(op, call_op->getResults());
     return mlir::success();
   }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/lower_xla_to_scf.cc b/third_party/xla/xla/codegen/emitters/transforms/lower_xla_to_scf.cc
index 4c1fcd127de205..2d91bf339d1ec7 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/lower_xla_to_scf.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/lower_xla_to_scf.cc
@@ -77,13 +77,14 @@ struct RewritePredicatedInsert : mlir::OpRewritePattern<PredicatedInsertOp> {
     rewriter.replaceOpWithNewOp<mlir::scf::IfOp>(
         op, op.getCondition(),
         [&](mlir::OpBuilder& b, mlir::Location loc) {
-          b.create<mlir::scf::YieldOp>(
-              loc, b.create<mlir::tensor::InsertOp>(
-                        loc, op.getValue(), op.getDest(), op.getIndices())
-                       .getResult());
+          mlir::scf::YieldOp::create(
+              b, loc,
+              mlir::tensor::InsertOp::create(b, loc, op.getValue(),
+                                             op.getDest(), op.getIndices())
+                  .getResult());
         },
         [&](mlir::OpBuilder& b, mlir::Location loc) {
-          b.create<mlir::scf::YieldOp>(loc, op.getDest());
+          mlir::scf::YieldOp::create(b, loc, op.getDest());
         });
     return success();
   }
@@ -99,13 +100,13 @@ struct RewritePredicatedExtract : mlir::OpRewritePattern<PredicatedExtractOp> {
     rewriter.replaceOpWithNewOp<mlir::scf::IfOp>(
         op, op.getCondition(),
         [&](mlir::OpBuilder& b, mlir::Location loc) {
-          b.create<mlir::scf::YieldOp>(
-              loc, b.create<mlir::tensor::ExtractOp>(loc, op.getSrc(),
-                                                     op.getIndices())
-                       .getResult());
+          mlir::scf::YieldOp::create(b, loc,
+                                     mlir::tensor::ExtractOp::create(
+                                         b, loc, op.getSrc(), op.getIndices())
+                                         .getResult());
         },
         [&](mlir::OpBuilder& b, mlir::Location loc) {
-          b.create<mlir::scf::YieldOp>(loc, op.getFallback());
+          mlir::scf::YieldOp::create(b, loc, op.getFallback());
         });
     return success();
   }
@@ -222,8 +223,8 @@ struct RewriteXlaLoop : mlir::OpRewritePattern<LoopOp> {
           mlir::ImplicitLocOpBuilder nested_b(loc, nested_builder);
           auto is_in_bounds = emitters::CheckConstraints(
               indexing_map, op.getDims(), symbol_values, nested_b);
-          auto if_op = nested_b.create<mlir::scf::IfOp>(
-              is_in_bounds,
+          auto if_op = mlir::scf::IfOp::create(
+              nested_b, is_in_bounds,
               [&](OpBuilder& then_builder, Location then_loc) -> void {
                 ImplicitLocOpBuilder then_b(then_loc, then_builder);
                 mlir::IRMapping mapping;
diff --git a/third_party/xla/xla/codegen/emitters/transforms/unswitch_loops.cc b/third_party/xla/xla/codegen/emitters/transforms/unswitch_loops.cc
index 08ab06e65a3c13..f151bce51ba1c8 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/unswitch_loops.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/unswitch_loops.cc
@@ -57,28 +57,31 @@ struct UnswitchLoop : mlir::OpRewritePattern<mlir::scf::ForOp> {
       return rewriter.notifyMatchFailure(op, "condition is a constant");
     }
 
-    auto true_cst = rewriter.create<mlir::arith::ConstantOp>(
-        op.getLoc(), rewriter.getIntegerAttr(rewriter.getI1Type(), 1));
-    auto false_cst = rewriter.create<mlir::arith::ConstantOp>(
-        op.getLoc(), rewriter.getIntegerAttr(rewriter.getI1Type(), 0));
+    auto true_cst = mlir::arith::ConstantOp::create(
+        rewriter, op.getLoc(),
+        rewriter.getIntegerAttr(rewriter.getI1Type(), 1));
+    auto false_cst = mlir::arith::ConstantOp::create(
+        rewriter, op.getLoc(),
+        rewriter.getIntegerAttr(rewriter.getI1Type(), 0));
     rewriter.setInsertionPoint(op);
     mlir::IRMapping mapping;
     mapping.map(if_op.getCondition(), false_cst);
     auto false_branch_loop = op->clone(mapping);
-    auto new_if = rewriter.create<mlir::scf::IfOp>(
-        op.getLoc(), op.getResultTypes(), if_op.getCondition(), true, true);
+    auto new_if =
+        mlir::scf::IfOp::create(rewriter, op.getLoc(), op.getResultTypes(),
+                                if_op.getCondition(), true, true);
     rewriter.replaceAllUsesWith(op.getResults(), new_if.getResults());
 
     auto then_builder = new_if.getThenBodyBuilder(rewriter.getListener());
     auto then_yield =
-        then_builder.create<mlir::scf::YieldOp>(op.getLoc(), op.getResults());
+        mlir::scf::YieldOp::create(then_builder, op.getLoc(), op.getResults());
     rewriter.moveOpBefore(op, then_yield);
     rewriter.modifyOpInPlace(if_op, [&]() { if_op->setOperand(0, true_cst); });
 
     auto else_builder = new_if.getElseBodyBuilder(rewriter.getListener());
     else_builder.insert(false_branch_loop);
-    else_builder.create<mlir::scf::YieldOp>(op.getLoc(),
-                                            false_branch_loop->getResults());
+    mlir::scf::YieldOp::create(else_builder, op.getLoc(),
+                               false_branch_loop->getResults());
 
     return mlir::success();
   }
diff --git a/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc b/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
index cb8ad0580d740d..63bb6361e55939 100644
--- a/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
+++ b/third_party/xla/xla/codegen/emitters/transforms/vectorize_loads_stores.cc
@@ -198,7 +198,7 @@ std::optional<Value> GetVectorBaseIndices(Value index, scf::ForOp loop,
                                           mlir::ImplicitLocOpBuilder& b) {
   Value induction_var = loop.getInductionVar();
   if (index == induction_var) {
-    return b.create<arith::ConstantIndexOp>(0);
+    return arith::ConstantIndexOp::create(b, 0);
   }
 
   auto apply_indexing =
@@ -248,9 +248,9 @@ std::optional<Value> GetVectorBaseIndices(Value index, scf::ForOp loop,
   }
 
   auto operands = llvm::to_vector(apply_indexing.getOperands());
-  operands[induction_var_operand_index] = b.create<arith::ConstantIndexOp>(0);
+  operands[induction_var_operand_index] = arith::ConstantIndexOp::create(b, 0);
 
-  return b.create<ApplyIndexingOp>(operands, apply_indexing.getIndexingMap())
+  return ApplyIndexingOp::create(b, operands, apply_indexing.getIndexingMap())
       ->getResult(0);
 }
 
@@ -287,8 +287,8 @@ struct VectorizeLoad : mlir::OpRewritePattern<mlir::tensor::ExtractOp> {
       return rewriter.notifyMatchFailure(
           op, "the instruction does not access contiguous elements");
     }
-    auto loaded_vector = b.create<mlir::vector::TransferReadOp>(
-        vector_type, op.getTensor(), *vector_index, /*padding=*/std::nullopt,
+    auto loaded_vector = mlir::vector::TransferReadOp::create(
+        b, vector_type, op.getTensor(), *vector_index, /*padding=*/std::nullopt,
         llvm::ArrayRef<bool>{true});
     rewriter.replaceOpWithNewOp<mlir::vector::ExtractOp>(
         op, loaded_vector, loop.getInductionVar());
@@ -356,15 +356,15 @@ class VectorizeAtomicRMW : public mlir::OpRewritePattern<AtomicRMWOp> {
     }
 
     auto init =
-        b.create<arith::ConstantOp>(b.getZeroAttr(vector_type)).getResult();
+        arith::ConstantOp::create(b, b.getZeroAttr(vector_type)).getResult();
 
     auto yield_fn = [&](mlir::OpBuilder& yield_b, mlir::Location yield_loc,
                         llvm::ArrayRef<mlir::BlockArgument> bbarg) {
       auto induction_var =
           mlir::cast<scf::ForOp>(bbarg.front().getOwner()->getParentOp())
               .getInductionVar();
-      auto insert_op = yield_b.create<mlir::vector::InsertOp>(
-          yield_loc, atomic_modifier_parameters->first, bbarg.front(),
+      auto insert_op = mlir::vector::InsertOp::create(
+          yield_b, yield_loc, atomic_modifier_parameters->first, bbarg.front(),
           induction_var);
       return llvm::SmallVector<Value>{insert_op.getResult()};
     };
@@ -377,14 +377,14 @@ class VectorizeAtomicRMW : public mlir::OpRewritePattern<AtomicRMWOp> {
     rewriter.replaceOp(op, op->getOpOperand(0).get());
 
     auto filled_vector = new_for->getResults().back();
-    auto new_atomic_rmw = b.create<AtomicRMWOp>(
-        new_for.getInits()[result_index], *vector_index, vector_type);
+    auto new_atomic_rmw = AtomicRMWOp::create(
+        b, new_for.getInits()[result_index], *vector_index, vector_type);
     mlir::ImplicitLocOpBuilder body_builder(new_atomic_rmw.getLoc(),
                                             new_atomic_rmw.getBodyBuilder());
-    auto addf_op = body_builder.create<arith::AddFOp>(
-        body_builder.getLoc(), vector_type, new_atomic_rmw.getCurrentValue(),
-        filled_vector);
-    body_builder.create<xla::YieldOp>(addf_op.getResult());
+    auto addf_op =
+        arith::AddFOp::create(body_builder, body_builder.getLoc(), vector_type,
+                              new_atomic_rmw.getCurrentValue(), filled_vector);
+    xla::YieldOp::create(body_builder, addf_op.getResult());
     new_for->getResult(result_index)
         .replaceAllUsesWith(new_atomic_rmw.getResult());
 
@@ -422,15 +422,15 @@ struct VectorizeStore : mlir::OpRewritePattern<mlir::tensor::InsertOp> {
     }
 
     auto init =
-        b.create<arith::ConstantOp>(b.getZeroAttr(vector_type)).getResult();
+        arith::ConstantOp::create(b, b.getZeroAttr(vector_type)).getResult();
 
     auto yield_fn = [&](mlir::OpBuilder& yield_b, mlir::Location yield_loc,
                         llvm::ArrayRef<mlir::BlockArgument> bbarg) {
       auto induction_var =
           mlir::cast<scf::ForOp>(bbarg.front().getOwner()->getParentOp())
               .getInductionVar();
-      auto insert_op = yield_b.create<mlir::vector::InsertOp>(
-          yield_loc, op.getScalar(), bbarg.front(), induction_var);
+      auto insert_op = mlir::vector::InsertOp::create(
+          yield_b, yield_loc, op.getScalar(), bbarg.front(), induction_var);
       return llvm::SmallVector<Value>{insert_op.getResult()};
     };
     int result_index = op->use_begin()->getOperandNumber();
@@ -442,8 +442,8 @@ struct VectorizeStore : mlir::OpRewritePattern<mlir::tensor::InsertOp> {
     rewriter.replaceOp(op, op.getDest());
 
     auto filled_vector = new_for->getResults().back();
-    auto written = b.create<mlir::vector::TransferWriteOp>(
-        filled_vector, new_for.getInits()[result_index], *vector_index,
+    auto written = mlir::vector::TransferWriteOp::create(
+        b, filled_vector, new_for.getInits()[result_index], *vector_index,
         llvm::ArrayRef<bool>{true});
     new_for->getResult(result_index).replaceAllUsesWith(written.getResult());
 
diff --git a/third_party/xla/xla/codegen/intrinsic/intrinsic.h b/third_party/xla/xla/codegen/intrinsic/intrinsic.h
index 30d6e31088a581..27e673ac05c711 100644
--- a/third_party/xla/xla/codegen/intrinsic/intrinsic.h
+++ b/third_party/xla/xla/codegen/intrinsic/intrinsic.h
@@ -137,7 +137,7 @@ class Intrinsic {
     mlir::OpBuilder::InsertionGuard guard(b);
     b.setInsertionPointToStart(module.getBody());
 
-    auto decl = b.create<mlir::func::FuncOp>(module.getLoc(), name, type);
+    auto decl = mlir::func::FuncOp::create(b, module.getLoc(), name, type);
     decl.setPrivate();
     return decl;
   }
diff --git a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_stablehlo_pass.cc b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_stablehlo_pass.cc
index 03333a03f75c41..8054b819c4a179 100644
--- a/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_stablehlo_pass.cc
+++ b/third_party/xla/xla/hlo/experimental/auto_sharding/auto_sharding_stablehlo_pass.cc
@@ -156,7 +156,7 @@ class AutoShardingWrapperPass
     mlir::OpBuilder builder(context);
 
     auto original_mesh_op =
-        builder.create<sdy::MeshOp>(module_op.getLoc(), "mesh", sdy_mesh);
+        sdy::MeshOp::create(builder, module_op.getLoc(), "mesh", sdy_mesh);
     symbol_table.insert(original_mesh_op, module_op.getBody()->begin());
     mlir::PassManager dedup_pm(context);
     dedup_pm.addPass(xla::sdy::createSdyRoundTripDedupMeshesPass());
diff --git a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/async_importer.cc b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/async_importer.cc
index 97c728901b6c07..c091d6e9d645fc 100644
--- a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/async_importer.cc
+++ b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/async_importer.cc
@@ -127,18 +127,18 @@ absl::StatusOr<mlir::Operation*> ImportOldStyleAsyncStart(
       async_builder
           .createBlock(&function.getBody(), {}, Untuple(result_types[0]), locs)
           ->getArguments();
-  auto sync_operation = async_builder.create<sync_op>(
-      loc, Untuple(result_types[1]), sync_operand, attributes);
-  async_builder.create<mlir::func::ReturnOp>(loc, sync_operation->getResults());
+  auto sync_operation = sync_op::create(
+      async_builder, loc, Untuple(result_types[1]), sync_operand, attributes);
+  mlir::func::ReturnOp::create(async_builder, loc,
+                               sync_operation->getResults());
   TF_RETURN_IF_ERROR(mutate_op(sync_operation));
 
   function->setAttr(kExecutionThread, builder->getStringAttr("main"));
 
   auto bundle_result_type =
       mlir::mhlo::AsyncBundleType::get(context, result_types);
-  return builder
-      ->create<mlir::mhlo::AsyncStartOp>(loc, bundle_result_type, operands,
-                                         async_attributes)
+  return mlir::mhlo::AsyncStartOp::create(*builder, loc, bundle_result_type,
+                                          operands, async_attributes)
       .getOperation();
 }
 
@@ -166,13 +166,13 @@ absl::StatusOr<mlir::Operation*> ImportOldStyleAsyncDone(
   auto start_tuple =
       llvm::dyn_cast<mlir::TupleType>(async_bundle.getTypes()[1]);
   if (start_tuple && llvm::isa<mlir::TupleType>(start_tuple.getType(0))) {
-    auto op = builder->create<mlir::mhlo::AsyncDoneOp>(loc, result_type,
-                                                       operands, attributes);
+    auto op = mlir::mhlo::AsyncDoneOp::create(*builder, loc, result_type,
+                                              operands, attributes);
     return {op};
   }
   if (useBundleResult) result_type = async_bundle.getTypes()[1];
-  auto op = builder->create<mlir::mhlo::AsyncDoneOp>(loc, Untuple(result_type),
-                                                     operands, attributes);
+  auto op = mlir::mhlo::AsyncDoneOp::create(*builder, loc, Untuple(result_type),
+                                            operands, attributes);
   return CreateTupleFromOpResults(builder, loc, op.getOperation(), result_type);
 }
 
@@ -233,9 +233,8 @@ absl::StatusOr<mlir::Operation*> ImportSend(
   if (args.size() == 2 && IsEmptyTuple(args[0].getType())) {
     args = args.drop_front(1);
   }
-  auto send = builder
-                  ->create<mlir::stablehlo::SendOp>(loc, token.getType(), args,
-                                                    attributes)
+  auto send = mlir::stablehlo::SendOp::create(*builder, loc, token.getType(),
+                                              args, attributes)
                   .getOperation();
   if (instruction->has_sharding()) {
     const HloSharding& sharding = instruction->sharding();
@@ -305,8 +304,9 @@ absl::StatusOr<mlir::Operation*> ImportRecv(
 
   // Return recv op for non-pipelined send, skip empty tuple result type
   if (!IsEmptyTuple(result_types[0])) {
-    auto recv = builder->create<mlir::stablehlo::RecvOp>(
-        loc, llvm::SmallVector<mlir::Type>{result_types[0], result_types[2]},
+    auto recv = mlir::stablehlo::RecvOp::create(
+        *builder, loc,
+        llvm::SmallVector<mlir::Type>{result_types[0], result_types[2]},
         operands, attributes);
     if (instruction->has_sharding()) {
       const HloSharding& sharding = instruction->sharding();
@@ -328,14 +328,14 @@ absl::StatusOr<mlir::Operation*> ImportRecv(
 
   // Recv with no result, only token.
   // To keep parity, if op only returns token, wrap in tuple<tuple<>, token>
-  auto recv = builder->create<mlir::stablehlo::RecvOp>(
-      loc, llvm::SmallVector<mlir::Type>{result_types[2]}, operands,
+  auto recv = mlir::stablehlo::RecvOp::create(
+      *builder, loc, llvm::SmallVector<mlir::Type>{result_types[2]}, operands,
       attributes);
-  auto empty_tuple = builder->create<mlir::stablehlo::TupleOp>(
-      loc, llvm::ArrayRef<mlir::Value>{});
+  auto empty_tuple = mlir::stablehlo::TupleOp::create(
+      *builder, loc, llvm::ArrayRef<mlir::Value>{});
 
-  return builder->create<mlir::stablehlo::TupleOp>(
-      loc,
+  return mlir::stablehlo::TupleOp::create(
+      *builder, loc,
       llvm::ArrayRef<mlir::Value>{empty_tuple.getResult(), recv.getResult(0)});
 }
 
diff --git a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/custom_call_importer.cc b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/custom_call_importer.cc
index f71b9561b49551..f3a7f691b703fd 100644
--- a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/custom_call_importer.cc
+++ b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/custom_call_importer.cc
@@ -65,10 +65,9 @@ absl::StatusOr<mlir::Operation*> ImportDynamicBroadcastInDimOp(
         mlir::cast<mlir::IntegerAttr>(broadcast_dimension).getInt();
   }
 
-  return builder
-      ->create<mlir::stablehlo::DynamicBroadcastInDimOp>(
-          loc, result_type, operands[0], operands[1],
-          builder->getDenseI64ArrayAttr(broadcast_dimensions))
+  return mlir::stablehlo::DynamicBroadcastInDimOp::create(
+             *builder, loc, result_type, operands[0], operands[1],
+             builder->getDenseI64ArrayAttr(broadcast_dimensions))
       .getOperation();
 }
 
@@ -78,8 +77,8 @@ absl::StatusOr<mlir::Operation*> ImportDynamicReshapeOp(
   if (!backend_config.empty()) {
     return Internal("backend_config attribute must be empty.");
   }
-  return builder
-      ->create<mlir::stablehlo::DynamicReshapeOp>(loc, result_type, operands)
+  return mlir::stablehlo::DynamicReshapeOp::create(*builder, loc, result_type,
+                                                   operands)
       .getOperation();
 }
 
@@ -89,8 +88,8 @@ absl::StatusOr<mlir::Operation*> ImportRealDynamicSliceOp(
   if (!backend_config.empty()) {
     return Internal("backend_config attribute must be empty.");
   }
-  return builder
-      ->create<mlir::stablehlo::RealDynamicSliceOp>(loc, result_type, operands)
+  return mlir::stablehlo::RealDynamicSliceOp::create(*builder, loc, result_type,
+                                                     operands)
       .getOperation();
 }
 
@@ -185,20 +184,18 @@ absl::StatusOr<mlir::Operation*> ImportCustomCallAsOp(
   }
 
   if (custom_call_target == "mhlo.uniform_quantize") {
-    return builder
-        ->create<mlir::stablehlo::UniformQuantizeOp>(
-            loc,
-            mlir::RankedTensorType::get(
-                mlir::cast<mlir::RankedTensorType>(result_type).getShape(),
-                getQuantizedType(backend_config)),
-            operands)
+    return mlir::stablehlo::UniformQuantizeOp::create(
+               *builder, loc,
+               mlir::RankedTensorType::get(
+                   mlir::cast<mlir::RankedTensorType>(result_type).getShape(),
+                   getQuantizedType(backend_config)),
+               operands)
         .getOperation();
   }
 
   if (custom_call_target == "mhlo.uniform_dequantize") {
-    return builder
-        ->create<mlir::stablehlo::UniformDequantizeOp>(loc, result_type,
-                                                       operands)
+    return mlir::stablehlo::UniformDequantizeOp::create(*builder, loc,
+                                                        result_type, operands)
         .getOperation();
   }
   return InvalidArgument("Unsupported MHLO op custom_call %s",
diff --git a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/hlo_utils.cc b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/hlo_utils.cc
index 2f65724b80aaae..380d72b54d9c6a 100644
--- a/third_party/xla/xla/hlo/translate/hlo_to_mhlo/hlo_utils.cc
+++ b/third_party/xla/xla/hlo/translate/hlo_to_mhlo/hlo_utils.cc
@@ -203,7 +203,8 @@ mlir::Value CreateTupleValue(mlir::OpBuilder* func_builder, mlir::Location loc,
         CreateTupleValue(func_builder, loc, flatten_values, child_type));
   }
 
-  return func_builder->create<mlir::stablehlo::TupleOp>(loc, flatten_sub_values)
+  return mlir::stablehlo::TupleOp::create(*func_builder, loc,
+                                          flatten_sub_values)
       .getResult();
 }
 
diff --git a/third_party/xla/xla/mlir/framework/transforms/outline_with_xla_framework.cc b/third_party/xla/xla/mlir/framework/transforms/outline_with_xla_framework.cc
index 7d9b8fc700767a..913635ba3d209b 100644
--- a/third_party/xla/xla/mlir/framework/transforms/outline_with_xla_framework.cc
+++ b/third_party/xla/xla/mlir/framework/transforms/outline_with_xla_framework.cc
@@ -103,9 +103,9 @@ struct OutlineXLAFunc : public RewritePattern {
 
     // The wrapper function will have the same name but with _xla_framework
     // appended and will be annotated with the attribute "xla_entry".
-    auto outline_func = rewriter.create<func::FuncOp>(
-        loc, func.getSymName().str() + "_xla_framework", func_type, attrs,
-        arg_attrs);
+    auto outline_func = func::FuncOp::create(
+        rewriter, loc, func.getSymName().str() + "_xla_framework", func_type,
+        attrs, arg_attrs);
     outline_func->setAttr("outlined", BoolAttr::get(ctx, true));
     outline_func->setAttr("xla_entry", BoolAttr::get(ctx, true));
     auto *b = rewriter.createBlock(&outline_func.getBody(), {},
@@ -114,20 +114,20 @@ struct OutlineXLAFunc : public RewritePattern {
     // Unwrap arguments
     SmallVector<Value> args;
     for (const auto &t : llvm::enumerate(func.getFunctionType().getInputs())) {
-      args.push_back(rewriter.create<xla_framework::XLABufferToMemOp>(
-          loc, t.value(), b->getArgument(t.index())));
+      args.push_back(xla_framework::XLABufferToMemOp::create(
+          rewriter, loc, t.value(), b->getArgument(t.index())));
     }
 
-    auto call = rewriter.create<func::CallOp>(
-        loc, func.getSymName(), func.getFunctionType().getResults(), args);
+    auto call = func::CallOp::create(rewriter, loc, func.getSymName(),
+                                     func.getFunctionType().getResults(), args);
     // Wrap results
     SmallVector<Value> results;
     for (auto t : call.getResults()) {
-      results.push_back(rewriter.create<xla_framework::MemToXLABufferOp>(
-          loc, ::mlir::xla_framework::BufferType::get(ctx), t));
+      results.push_back(xla_framework::MemToXLABufferOp::create(
+          rewriter, loc, ::mlir::xla_framework::BufferType::get(ctx), t));
     }
 
-    rewriter.create<func::ReturnOp>(loc, results);
+    func::ReturnOp::create(rewriter, loc, results);
 
     // Finally, mark the called function as private to prevent users from
     // accidentally trying to use it.
diff --git a/third_party/xla/xla/mlir/framework/transforms/xla_framework_to_llvm_pass.cc b/third_party/xla/xla/mlir/framework/transforms/xla_framework_to_llvm_pass.cc
index c40a7ad1b9aa46..cabf3d31fb707c 100644
--- a/third_party/xla/xla/mlir/framework/transforms/xla_framework_to_llvm_pass.cc
+++ b/third_party/xla/xla/mlir/framework/transforms/xla_framework_to_llvm_pass.cc
@@ -79,8 +79,9 @@ struct BarePtrFuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
   Value LoadValue(ConversionPatternRewriter &rewriter, Location loc,
                   Value pointer, Value index) const {
     auto ptr = LLVM::LLVMPointerType::get(rewriter.getContext());
-    return rewriter.create<LLVM::LoadOp>(
-        loc, ptr, rewriter.create<LLVM::GEPOp>(loc, ptr, ptr, pointer, index));
+    return LLVM::LoadOp::create(
+        rewriter, loc, ptr,
+        LLVM::GEPOp::create(rewriter, loc, ptr, ptr, pointer, index));
   }
 
   mlir::func::FuncOp convertFuncOpToLLVMFuncOp(
@@ -101,8 +102,9 @@ struct BarePtrFuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
     if (!llvm_type) return nullptr;
 
     rewriter.setInsertionPoint(funcOp);
-    auto new_func_op = rewriter.create<mlir::func::FuncOp>(
-        loc, funcOp.getName(), llvm_type, llvm::SmallVector<NamedAttribute>());
+    auto new_func_op =
+        mlir::func::FuncOp::create(rewriter, loc, funcOp.getName(), llvm_type,
+                                   llvm::SmallVector<NamedAttribute>());
     auto locs = llvm::SmallVector<mlir::Location>(arg_types.size(), loc);
     Block *new_entry =
         rewriter.createBlock(&new_func_op.getBody(), {}, arg_types, locs);
@@ -118,16 +120,18 @@ struct BarePtrFuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
     auto result_index = 0;
     for (unsigned i = 0; i < num_refs; ++i) {
       if (funcOp.getArgAttr(i, "xla_framework.input_mapping")) {
-        Value index = rewriter.create<LLVM::ConstantOp>(
-            loc, typeConverter->convertType(rewriter.getIntegerType(32)),
+        Value index = LLVM::ConstantOp::create(
+            rewriter, loc,
+            typeConverter->convertType(rewriter.getIntegerType(32)),
             funcOp.getArgAttrOfType<mlir::IntegerAttr>(
                 i, "xla_framework.input_mapping"));
 
         Value ptr = LoadValue(rewriter, loc, new_entry->getArgument(3), index);
         mapping.map(funcOp.front().getArgument(i), ptr);
       } else {
-        Value index = rewriter.create<LLVM::ConstantOp>(
-            loc, typeConverter->convertType(rewriter.getIntegerType(32)),
+        Value index = LLVM::ConstantOp::create(
+            rewriter, loc,
+            typeConverter->convertType(rewriter.getIntegerType(32)),
             funcOp->getAttrOfType<mlir::IntegerAttr>(
                 "xla_framework.result_mapping"));
         Value first_load =
@@ -136,8 +140,9 @@ struct BarePtrFuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
         // Handle multi-value results which are wrapped in a tuple.
         if (funcOp->hasAttr("xla_framework.result_inner_mapping")) {
           auto current_index = result_index++;
-          Value inner_index = rewriter.create<LLVM::ConstantOp>(
-              loc, typeConverter->convertType(rewriter.getIntegerType(32)),
+          Value inner_index = LLVM::ConstantOp::create(
+              rewriter, loc,
+              typeConverter->convertType(rewriter.getIntegerType(32)),
               rewriter.getI32IntegerAttr(static_cast<int32_t>(
                   mlir::cast<mlir::IntegerAttr>(
                       funcOp
@@ -152,13 +157,14 @@ struct BarePtrFuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
           mapping.map(funcOp.front().getArgument(i), ptr);
 
           auto ptr_type = LLVM::LLVMPointerType::get(rewriter.getContext());
-          Value second_index = rewriter.create<LLVM::ConstantOp>(
-              loc, typeConverter->convertType(rewriter.getIntegerType(32)),
+          Value second_index = LLVM::ConstantOp::create(
+              rewriter, loc,
+              typeConverter->convertType(rewriter.getIntegerType(32)),
               rewriter.getI32IntegerAttr(current_index));
-          rewriter.create<LLVM::StoreOp>(
-              loc, ptr,
-              rewriter.create<LLVM::GEPOp>(loc, ptr_type, ptr_type, first_load,
-                                           llvm::ArrayRef(second_index)));
+          LLVM::StoreOp::create(
+              rewriter, loc, ptr,
+              LLVM::GEPOp::create(rewriter, loc, ptr_type, ptr_type, first_load,
+                                  llvm::ArrayRef(second_index)));
 
         } else {
           // Non tuple outputs can be simply mapped to the first load op.
@@ -171,7 +177,7 @@ struct BarePtrFuncOpConversion : public ConvertOpToLLVMPattern<func::FuncOp> {
     // return values now.
     for (auto &op : funcOp.front()) {
       if (isa<mlir::func::ReturnOp>(op)) {
-        rewriter.create<mlir::func::ReturnOp>(loc, ValueRange());
+        mlir::func::ReturnOp::create(rewriter, loc, ValueRange());
       } else {
         rewriter.clone(op, mapping);
       }
diff --git a/third_party/xla/xla/mlir_hlo/utils/hlo_utils.h b/third_party/xla/xla/mlir_hlo/utils/hlo_utils.h
index 74dfa37326213e..617c529add6407 100644
--- a/third_party/xla/xla/mlir_hlo/utils/hlo_utils.h
+++ b/third_party/xla/xla/mlir_hlo/utils/hlo_utils.h
@@ -126,7 +126,7 @@ static Value getConstantLike(OpBuilder& b, Location loc, T constant,
       return complex::NumberAttr::get(complexTy, constant, 0);
     llvm_unreachable("unhandled element type");
   };
-  return b.create<ConstantLikeOp>(loc, cast<TypedAttr>(getAttr()), val);
+  return ConstantLikeOp::create(b, loc, cast<TypedAttr>(getAttr()), val);
 }
 
 Value getConstantLike(OpBuilder& b, Location loc, const APFloat& constant,
diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc
index 635987f32077e8..f085752125d0ed 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/ifrt_merge_reshards_pass.cc
@@ -133,13 +133,13 @@ bool MergeReshardsIgnoringControlDependencies(mlir::func::FuncOp func_op) {
     // order after the merge.
     rewriter.setInsertionPoint(reshards.back());
     auto merged_reshard =
-        rewriter.create<ReshardOp>(rewriter.getFusedLoc(locs),
-                                   /*outputs=*/output_types,
-                                   /*control_output=*/
-                                   IfrtControlType::get(rewriter.getContext()),
-                                   /*inputs=*/inputs,
-                                   /*donated=*/reshards.front().getDonated(),
-                                   /*control_inputs=*/mlir::ValueRange());
+        ReshardOp::create(rewriter, rewriter.getFusedLoc(locs),
+                          /*outputs=*/output_types,
+                          /*control_output=*/
+                          IfrtControlType::get(rewriter.getContext()),
+                          /*inputs=*/inputs,
+                          /*donated=*/reshards.front().getDonated(),
+                          /*control_inputs=*/mlir::ValueRange());
 
     // Replace the original reshards with the new merged reshard.
     for (auto [index, reshard] : llvm::enumerate(reshards)) {

From af8c7d0e2e006be6e69b0034d3d2d9fb1e744179 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 14:17:57 -0800
Subject: [PATCH 600/753] Apply llvm-use-new-mlir-op-builder fixes

This migrates `builder.create<Op>()` => `Op::create()`

PiperOrigin-RevId: 846865415
---
 .../mlir/stablehlo/transforms/utils.cc        |    8 +-
 .../transforms/cluster_formation.cc           |    8 +-
 .../transforms/cluster_ops_by_policy.cc       |    4 +-
 .../convert_control_to_data_outputs.cc        |   24 +-
 .../transforms/decompose_resource_ops.cc      |   35 +-
 .../mlir/tensorflow/transforms/einsum.cc      |   62 +-
 .../extract_tpu_copy_with_dynamic_shape_op.cc |   15 +-
 .../transforms/fused_kernel_matcher.cc        |    4 +-
 ...ist_replicate_invariant_resource_writes.cc |    6 +-
 .../tpu_merge_variables_with_execute.cc       |   20 +-
 .../mlir/tensorflow/transforms/lower_tf.cc    |  613 ++++----
 .../prepare_tpu_computation_for_tf_export.cc  |   20 +-
 .../replicate_invariant_op_hoisting.cc        |    4 +-
 .../transforms/rewrite_tpu_embedding_ops.cc   |    8 +-
 .../sparsecore/embedding_program_key.cc       |    8 +-
 .../transforms/tf_data_optimization.cc        |    9 +-
 ...pu_parallel_execute_sink_resource_write.cc |    4 +-
 .../transforms/tpu_resource_read_for_write.cc |   11 +-
 .../transforms/unroll_batch_matmul.cc         |   28 +-
 .../tensorflow/utils/parallel_execute_util.cc |    8 +-
 .../utils/tpu_rewrite_device_util_test.cc     |   56 +-
 .../mlir/tensorflow/utils/xla_rewrite_util.cc |   10 +-
 .../tensorflow/utils/xla_sharding_util.cc     |   60 +-
 .../mlir/tf2xla/transforms/legalize_tf.cc     | 1363 +++++++++--------
 .../transforms/legalize_tf_collective.cc      |   20 +-
 .../transforms/legalize_tf_communication.cc   |   44 +-
 .../split_into_island_per_op_pass.cc          |   16 +-
 .../split_into_island_per_op_pass_test.cc     |   17 +-
 .../tfxla_device_specific_transforms.cc       |    5 +-
 .../compiler/mlir/tf2xla/transforms/utils.cc  |    4 +-
 .../compiler/mlir/tf2xla/transforms/utils.h   |    4 +-
 .../transforms/xla_legalize_targets_test.cc   |   12 +-
 tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc    |  130 +-
 .../compiler/mlir/tfr/passes/canonicalize.cc  |    8 +-
 .../compiler/mlir/tfr/passes/raise_to_tf.cc   |   10 +-
 .../transforms/deduplicate_if_result_pass.cc  |   12 +-
 .../fuse_tpu_compile_and_execute_ops.cc       |   11 +-
 .../ifrt/rewrite_cluster_to_ifrt_call.cc      |   12 +-
 .../transforms/ifrt/tf_restore_merging.cc     |   12 +-
 .../transforms/ifrt/tf_restore_splitting.cc   |   12 +-
 .../mlir/tfrt/transforms/lower_saved_model.cc |   34 +-
 .../mlir/tfrt/transforms/merge_tf_if_ops.cc   |   15 +-
 .../transforms/broadcast_propagation_pass.cc  |    4 +-
 .../transforms/embed_tf_framework.cc          |    8 +-
 .../transforms/func_to_jit_invocations.cc     |   66 +-
 .../transforms/merge_assuming_ops_pass.cc     |   25 +-
 .../transforms/rewrite_tf_framework_assert.cc |    8 +-
 .../tensorflow_abi_knowledge_propagation.cc   |   10 +-
 .../tf_framework_legalize_to_llvm.cc          |  123 +-
 .../transforms/tf_kernel_to_llvm_pass.cc      |   45 +-
 .../mlir/tools/kernel_gen/transforms/utils.cc |   11 +-
 51 files changed, 1581 insertions(+), 1485 deletions(-)

diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc b/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc
index d440f20e6d9779..f963d7a9c8dcb1 100644
--- a/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc
+++ b/tensorflow/compiler/mlir/stablehlo/transforms/utils.cc
@@ -27,14 +27,14 @@ namespace odml {
 
 mhlo::ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value,
                                       OpBuilder* builder) {
-  return builder->create<mhlo::ConstantOp>(loc,
-                                           hlo::getScalarOfType(ty, raw_value));
+  return mhlo::ConstantOp::create(*builder, loc,
+                                  hlo::getScalarOfType(ty, raw_value));
 }
 
 mhlo::ConstantOp GetScalarNegZeroOfType(Type ty, Location loc,
                                         OpBuilder* builder) {
-  return builder->create<mhlo::ConstantOp>(loc,
-                                           hlo::getScalarNegZeroOfType(ty));
+  return mhlo::ConstantOp::create(*builder, loc,
+                                  hlo::getScalarNegZeroOfType(ty));
 }
 
 DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr) {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc
index 93d31b884732c1..2beec1bcd87944 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc
@@ -107,8 +107,8 @@ void BuildLaunchForCluster(const TF::Cluster& c, OpBuilder* builder) {
   // as operand.
   OpBuilder return_builder(builder->getContext());
   return_builder.setInsertionPointToEnd(block);
-  return_builder.create<tf_device::ReturnOp>(return_builder.getUnknownLoc(),
-                                             live_outs);
+  tf_device::ReturnOp::create(return_builder, return_builder.getUnknownLoc(),
+                              live_outs);
 
   llvm::SmallVector<Type, 4> live_out_types;
   live_out_types.reserve(live_outs.size());
@@ -116,8 +116,8 @@ void BuildLaunchForCluster(const TF::Cluster& c, OpBuilder* builder) {
     live_out_types.emplace_back(v.getType());
   }
 
-  tf_device::LaunchOp launch_op = builder->create<tf_device::LaunchOp>(
-      builder->getUnknownLoc(), builder->getStringAttr(c.target),
+  tf_device::LaunchOp launch_op = tf_device::LaunchOp::create(
+      *builder, builder->getUnknownLoc(), builder->getStringAttr(c.target),
       live_out_types);
 
   // Attach the region to launch_op.
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc
index beee1afb1a129e..9158ecc6f7fcd7 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_ops_by_policy.cc
@@ -575,7 +575,7 @@ tf_device::ClusterOp CreateClusterOp(Cluster &cluster, StringAttr policy) {
   OpBuilder builder(back);
 
   auto cluster_op =
-      builder.create<tf_device::ClusterOp>(loc, return_types, policy);
+      tf_device::ClusterOp::create(builder, loc, return_types, policy);
 
   // Create block in cluster_op's region and move 'cluster.operations' into
   // it.
@@ -585,7 +585,7 @@ tf_device::ClusterOp CreateClusterOp(Cluster &cluster, StringAttr policy) {
 
   // Add 'tf_device::ReturnOp' at the end of the block.
   builder.setInsertionPointToEnd(block);
-  builder.create<tf_device::ReturnOp>(loc, return_values.getArrayRef());
+  tf_device::ReturnOp::create(builder, loc, return_values.getArrayRef());
 
   // Set device attribute
   if (auto device = back->getAttr(kDeviceAttr))
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc
index d63ace094451a6..ea7dce395d84d9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc
@@ -370,7 +370,7 @@ void AppendFunctionResults(func::FuncOp func, int num_resources,
   // function.
   OpBuilder builder(graph_op);
   auto new_graph_op =
-      builder.create<GraphOp>(graph_op.getLoc(), new_result_types);
+      GraphOp::create(builder, graph_op.getLoc(), new_result_types);
   new_graph_op.getRegion().takeBody(graph_op.getRegion());
   graph_op->replaceAllUsesWith(
       new_graph_op->getResults().drop_back(num_resources));
@@ -388,14 +388,15 @@ IslandOp CreateIsland(Operation* sub_op, ValueRange control_inputs,
                       OpBuilder builder) {
   assert(sub_op);
   auto control_type = ControlType::get(builder.getContext());
-  auto island = builder.create<IslandOp>(
-      sub_op->getLoc(), sub_op->getResultTypes(), control_type, control_inputs);
+  auto island =
+      IslandOp::create(builder, sub_op->getLoc(), sub_op->getResultTypes(),
+                       control_type, control_inputs);
   island.getBody().push_back(new Block);
   Block* block = &island.getBody().back();
   builder.setInsertionPointToEnd(block);
   sub_op->replaceAllUsesWith(island.getOutputs());
   sub_op->moveBefore(block, block->begin());
-  builder.create<YieldOp>(sub_op->getLoc(), sub_op->getResults());
+  YieldOp::create(builder, sub_op->getLoc(), sub_op->getResults());
   return island;
 }
 
@@ -429,12 +430,12 @@ void ChainResourceOps(
     // Create chain source and sink identity islands for current equivalence
     // class.
     auto chain_arg = func.getArgument(chain_index++);
-    auto src_identity = builder_chain_src.create<TF::IdentityOp>(
-        chain_arg.getLoc(), chain_arg.getType(), chain_arg);
+    auto src_identity = TF::IdentityOp::create(
+        builder_chain_src, chain_arg.getLoc(), chain_arg.getType(), chain_arg);
     auto chain_src_island = CreateIsland(src_identity, {}, builder_chain_src);
 
-    auto sink_identity = builder_chain_sink.create<TF::IdentityOp>(
-        chain_arg.getLoc(), chain_arg.getType(), chain_arg);
+    auto sink_identity = TF::IdentityOp::create(
+        builder_chain_sink, chain_arg.getLoc(), chain_arg.getType(), chain_arg);
     auto chain_sink_island =
         CreateIsland(sink_identity, {}, builder_chain_sink);
 
@@ -477,7 +478,7 @@ void ChainResourceOps(
 IslandOp GetDummyConstant(OpBuilder builder, ShapedType const_type,
                           Location loc) {
   DenseIntElementsAttr val = DenseIntElementsAttr::get(const_type, 1);
-  auto const_op = builder.create<TF::ConstOp>(loc, val);
+  auto const_op = TF::ConstOp::create(builder, loc, val);
   auto const_island = CreateIsland(const_op, {}, builder);
   return const_island;
 }
@@ -506,8 +507,9 @@ TF::WhileOp RewriteWhileOp(TF::WhileOp while_op, int num_resource_inputs,
   }
 
   // Replace old while op with new while op.
-  auto new_while_op = builder.create<TF::WhileOp>(
-      while_op.getLoc(), new_result_types, new_operands, while_op->getAttrs());
+  auto new_while_op =
+      TF::WhileOp::create(builder, while_op.getLoc(), new_result_types,
+                          new_operands, while_op->getAttrs());
   auto new_while_wrapper =
       CreateIsland(new_while_op, while_wrapper.getControlInputs(), builder);
   for (auto result : while_wrapper.getOutputs()) {
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc
index 144bdb44018649..cda422d0d9938e 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc
@@ -137,12 +137,12 @@ class DecomposeRngReadAndSkipOp : public RewritePattern {
 
     // Read the state value from the resource.
     Value state =
-        rewriter.create<ReadVariableOp>(loc, res_type, rng_op.getResource());
+        ReadVariableOp::create(rewriter, loc, res_type, rng_op.getResource());
 
     // Extract the key and counter from the state.
     RankedTensorType word_type = RankedTensorType::get({}, state_element_type);
-    auto unpacked = rewriter.create<UnpackOp>(
-        loc, SmallVector<Type, 4>(state_size, word_type), state, 0);
+    auto unpacked = UnpackOp::create(
+        rewriter, loc, SmallVector<Type, 4>(state_size, word_type), state, 0);
     Value key = unpacked.getResult(counter_size);
 
     SmallVector<Value, 4> counter;
@@ -153,39 +153,40 @@ class DecomposeRngReadAndSkipOp : public RewritePattern {
     // Set the increment to 256 * delta.
     Type u64 = rewriter.getIntegerType(64, /*isSigned=*/false);
     RankedTensorType u64_scalar = RankedTensorType::get({}, u64);
-    Value step_size = rewriter.create<ConstOp>(loc, GetScalarOfType(u64, 256));
+    Value step_size = ConstOp::create(rewriter, loc, GetScalarOfType(u64, 256));
     Value increment =
-        rewriter.create<MulOp>(loc, u64_scalar, step_size, rng_op.getDelta());
+        MulOp::create(rewriter, loc, u64_scalar, step_size, rng_op.getDelta());
 
     // Increment the counter.
     SmallVector<Value, 4> pack_args;
     RankedTensorType word_u64_type = RankedTensorType::get({}, u64);
-    Value zero_u64 = rewriter.create<ConstOp>(loc, GetScalarOfType(u64, 0));
-    Value one_u64 = rewriter.create<ConstOp>(loc, GetScalarOfType(u64, 1));
+    Value zero_u64 = ConstOp::create(rewriter, loc, GetScalarOfType(u64, 0));
+    Value one_u64 = ConstOp::create(rewriter, loc, GetScalarOfType(u64, 1));
     for (int i = 0; i < counter_size; ++i) {
       Value word = counter[i];
-      Value word_u64 = rewriter.create<CastOp>(loc, word_u64_type, word);
-      Value new_word_u64 = rewriter.create<AddV2Op>(loc, word_u64, increment);
-      Value new_word = rewriter.create<CastOp>(loc, word_type, new_word_u64);
+      Value word_u64 = CastOp::create(rewriter, loc, word_u64_type, word);
+      Value new_word_u64 = AddV2Op::create(rewriter, loc, word_u64, increment);
+      Value new_word = CastOp::create(rewriter, loc, word_type, new_word_u64);
       pack_args.push_back(new_word);
 
-      Value overflow = rewriter.create<LessOp>(loc, new_word_u64, word_u64);
-      increment = rewriter.create<SelectV2Op>(loc, overflow, one_u64, zero_u64);
+      Value overflow = LessOp::create(rewriter, loc, new_word_u64, word_u64);
+      increment =
+          SelectV2Op::create(rewriter, loc, overflow, one_u64, zero_u64);
     }
 
     // Save the new state value to the resource.
     pack_args.push_back(key);
-    Value new_state = rewriter.create<PackOp>(loc, res_type, pack_args);
-    rewriter.create<AssignVariableOp>(loc, rng_op.getResource(), new_state);
+    Value new_state = PackOp::create(rewriter, loc, res_type, pack_args);
+    AssignVariableOp::create(rewriter, loc, rng_op.getResource(), new_state);
 
     // Pad the original state as necessary to fill the output shape.
     int pad = tensorflow::RNG_MAX_COUNTER_SIZE - counter_size;
     Type i64 = rewriter.getI64Type();
     RankedTensorType paddings_ty = RankedTensorType::get({1, 2}, i64);
     std::vector<int64_t> paddings_values = {0, pad};
-    Value paddings = rewriter.create<ConstOp>(
-        loc, DenseIntElementsAttr::get(paddings_ty, paddings_values));
-    Value output = rewriter.create<PadOp>(loc, op_type, state, paddings);
+    Value paddings = ConstOp::create(
+        rewriter, loc, DenseIntElementsAttr::get(paddings_ty, paddings_values));
+    Value output = PadOp::create(rewriter, loc, op_type, state, paddings);
 
     rewriter.replaceOp(op, output);
     return success();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc
index 954c318b416150..73dc7802c7d56d 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc
@@ -61,7 +61,7 @@ namespace {
 ConstOp createI32ConstOp(int32_t value, Location loc,
                          PatternRewriter* rewriter) {
   auto int_attr = IntegerAttr::get(rewriter->getIntegerType(32), value);
-  return rewriter->create<ConstOp>(loc, int_attr);
+  return ConstOp::create(*rewriter, loc, int_attr);
 }
 
 // Creates ConstantOp for array of int32_t.
@@ -70,7 +70,7 @@ arith::ConstantOp createI32ConstantOp(llvm::ArrayRef<int32_t> values,
   auto values_type = RankedTensorType::get(
       {static_cast<int32_t>(values.size())}, rewriter->getIntegerType(32));
   auto constant_attr = rewriter->getI32TensorAttr(values);
-  return rewriter->create<arith::ConstantOp>(loc, values_type, constant_attr);
+  return arith::ConstantOp::create(*rewriter, loc, values_type, constant_attr);
 }
 
 // Creates ConstantOp for array of int64_t.
@@ -79,7 +79,7 @@ arith::ConstantOp createI64ConstantOp(llvm::ArrayRef<int64_t> values,
   auto values_type = RankedTensorType::get(
       {static_cast<int64_t>(values.size())}, rewriter->getIntegerType(64));
   auto constant_attr = rewriter->getI64TensorAttr(values);
-  return rewriter->create<arith::ConstantOp>(loc, values_type, constant_attr);
+  return arith::ConstantOp::create(*rewriter, loc, values_type, constant_attr);
 }
 
 // Function to create a tf.SumOp to sum the element in 'value' reduced along the
@@ -98,8 +98,9 @@ TF::SumOp createSumOp(Value value, Location loc,
       sum_shape.push_back(shape[i]);
     }
   }
-  return rewriter->create<TF::SumOp>(
-      loc, RankedTensorType::get(sum_shape, value_type.getElementType()), value,
+  return TF::SumOp::create(
+      *rewriter, loc,
+      RankedTensorType::get(sum_shape, value_type.getElementType()), value,
       redux_op);
 }
 
@@ -115,8 +116,8 @@ TF::TransposeOp createTransposeOp(Value value, Location loc,
   }
   auto transposed_type =
       RankedTensorType::get(transposed_shape, value_type.getElementType());
-  return rewriter->create<TF::TransposeOp>(loc, transposed_type, value,
-                                           perm_op);
+  return TF::TransposeOp::create(*rewriter, loc, transposed_type, value,
+                                 perm_op);
 }
 
 TF::ReshapeOp createReshapeOp(Value value, ArrayRef<int64_t> shape,
@@ -125,8 +126,8 @@ TF::ReshapeOp createReshapeOp(Value value, ArrayRef<int64_t> shape,
   auto shape_tensor = createI64ConstantOp(
       tensorflow::ConvertMlirShapeToTF(shape), loc, rewriter);
   Type resultType = RankedTensorType::get(shape, element_type);
-  return rewriter->create<TF::ReshapeOp>(loc, resultType, /*tensor=*/value,
-                                         /*shape=*/shape_tensor);
+  return TF::ReshapeOp::create(*rewriter, loc, resultType, /*tensor=*/value,
+                               /*shape=*/shape_tensor);
 }
 
 // Creates ReshapeOp with runtime calcuation of required shape to support
@@ -140,7 +141,7 @@ TF::ReshapeOp createReshapeOpForDynamic(Value value, ArrayRef<int64_t> shape,
                                         PatternRewriter* rewriter) {
   // Build ShapeOp
   auto input_shape =
-      rewriter->create<TF::ShapeOp>(loc, value, rewriter->getBoolAttr(true));
+      TF::ShapeOp::create(*rewriter, loc, value, rewriter->getBoolAttr(true));
 
   // Build UnsortedSegmentProdOp
   Type segProdresultType =
@@ -148,16 +149,16 @@ TF::ReshapeOp createReshapeOpForDynamic(Value value, ArrayRef<int64_t> shape,
   auto segids_tensor = createI32ConstantOp(reshape_segids, loc, rewriter);
   auto num_reshape_segids_tensor =
       createI32ConstOp(num_reshape_segids, loc, rewriter);
-  auto segprod = rewriter->create<TF::UnsortedSegmentProdOp>(
-      loc, segProdresultType, input_shape->getResults()[0], segids_tensor,
-      num_reshape_segids_tensor);
+  auto segprod = TF::UnsortedSegmentProdOp::create(
+      *rewriter, loc, segProdresultType, input_shape->getResults()[0],
+      segids_tensor, num_reshape_segids_tensor);
 
   // Build ReshapeOp with the result of UnsortedSegmentProdOp.
   Type out_tensor_type =
       RankedTensorType::get(shape, getElementTypeOrSelf(value.getType()));
-  return rewriter->create<TF::ReshapeOp>(loc, out_tensor_type,
-                                         /*tensor=*/value,
-                                         /*shape=*/segprod->getResults()[0]);
+  return TF::ReshapeOp::create(*rewriter, loc, out_tensor_type,
+                               /*tensor=*/value,
+                               /*shape=*/segprod->getResults()[0]);
 }
 
 struct EinsumDimensionNumbers {
@@ -178,8 +179,8 @@ TF::ReshapeOp createOutputReshapeOpForDynamic(
     EinsumDimensionNumbers& dnums, Location loc, PatternRewriter* rewriter) {
   BoolAttr true_attr = rewriter->getBoolAttr(true);
   // Build ShapeOp
-  auto shape_lhs = rewriter->create<TF::ShapeOp>(loc, org_lhs, true_attr);
-  auto shape_rhs = rewriter->create<TF::ShapeOp>(loc, org_rhs, true_attr);
+  auto shape_lhs = TF::ShapeOp::create(*rewriter, loc, org_lhs, true_attr);
+  auto shape_rhs = TF::ShapeOp::create(*rewriter, loc, org_rhs, true_attr);
 
   std::vector<int32_t> bl_index;  // Indexes of B0,...,Bn and L0,...,Ln
   bl_index.reserve(dnums.lhs_rhs_out.size() + dnums.lhs_out.size());
@@ -196,20 +197,20 @@ TF::ReshapeOp createOutputReshapeOpForDynamic(
   }
 
   auto lhs_index_tensor = createI32ConstantOp(bl_index, loc, rewriter);
-  auto gather_lhs = rewriter->create<TF::GatherOp>(
-      loc,
+  auto gather_lhs = TF::GatherOp::create(
+      *rewriter, loc,
       RankedTensorType::get({static_cast<int>(bl_index.size())},
                             rewriter->getIntegerType(32)),
       shape_lhs->getResults()[0], lhs_index_tensor->getResults()[0], true_attr);
   auto rhs_index_tensor = createI32ConstantOp(r_index, loc, rewriter);
-  auto gather_rhs = rewriter->create<TF::GatherOp>(
-      loc,
+  auto gather_rhs = TF::GatherOp::create(
+      *rewriter, loc,
       RankedTensorType::get({static_cast<int>(r_index.size())},
                             rewriter->getIntegerType(32)),
       shape_rhs->getResults()[0], rhs_index_tensor->getResults()[0], true_attr);
   Value zero_value = createI32ConstOp(0, loc, rewriter);
-  auto concat_out_shape = rewriter->create<TF::ConcatOp>(
-      loc,
+  auto concat_out_shape = TF::ConcatOp::create(
+      *rewriter, loc,
       RankedTensorType::get({static_cast<int>(bl_index.size()) +
                              static_cast<int>(r_index.size())},
                             rewriter->getIntegerType(32)),
@@ -220,10 +221,9 @@ TF::ReshapeOp createOutputReshapeOpForDynamic(
   // Build ReshapeOp with the calculated output shape.
   Type out_type =
       RankedTensorType::get(shape, getElementTypeOrSelf(value.getType()));
-  return rewriter->create<TF::ReshapeOp>(
-      loc, out_type,
-      /*tensor=*/value,
-      /*shape=*/concat_out_shape->getResults()[0]);
+  return TF::ReshapeOp::create(*rewriter, loc, out_type,
+                               /*tensor=*/value,
+                               /*shape=*/concat_out_shape->getResults()[0]);
 }
 
 std::optional<llvm::SmallDenseMap<char, int64_t>> EquationToMap(
@@ -793,9 +793,9 @@ LogicalResult rewriteToBatchMatmul(TF::EinsumOp op,
 
   auto matmul_type =
       RankedTensorType::get(matmul_shape, original_type.getElementType());
-  Value out = rewriter.create<TF::BatchMatMulV2Op>(
-      op.getLoc(), matmul_type, lhs, rhs, rewriter.getBoolAttr(false),
-      rewriter.getBoolAttr(false));
+  Value out = TF::BatchMatMulV2Op::create(rewriter, op.getLoc(), matmul_type,
+                                          lhs, rhs, rewriter.getBoolAttr(false),
+                                          rewriter.getBoolAttr(false));
 
   bool out_reshape_need = (reshape_shape.size() != matmul_shape.size() ||
                            original_type.getRank() != matmul_shape.size());
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc
index 18480fbd772fa9..883da73f2fb378 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_tpu_copy_with_dynamic_shape_op.cc
@@ -127,8 +127,8 @@ tf_device::LaunchOp CreateNewHostLaunchOpWithNewResult(
   for (Value result : new_launch_op_results)
     new_launch_op_results_types.push_back(result.getType());
 
-  auto new_launch_op = builder.create<tf_device::LaunchOp>(
-      old_launch_op->getLoc(), old_launch_op->getDeviceAttr(),
+  auto new_launch_op = tf_device::LaunchOp::create(
+      builder, old_launch_op->getLoc(), old_launch_op->getDeviceAttr(),
       /*result_types=*/new_launch_op_results_types);
 
   new_launch_op.getBody().takeBody(old_launch_op->getBody());
@@ -154,17 +154,16 @@ LogicalResult CreateNewDeviceLaunchOp(
     return failure();
   }
 
-  new_device_launch_op = builder.create<tf_device::LaunchOp>(
-      tpu_copy_with_dynamic_shape_op->getLoc(),
+  new_device_launch_op = tf_device::LaunchOp::create(
+      builder, tpu_copy_with_dynamic_shape_op->getLoc(),
       builder.getStringAttr(device_str),
       /*result_types=*/tpu_copy_with_dynamic_shape_op->getResultTypes());
 
   new_device_launch_op.getBody().push_back(new Block);
   builder.setInsertionPointToEnd(&new_device_launch_op.GetBody());
-  auto* return_op = builder
-                        .create<tf_device::ReturnOp>(
-                            tpu_copy_with_dynamic_shape_op->getLoc(),
-                            tpu_copy_with_dynamic_shape_op->getResults())
+  auto* return_op = tf_device::ReturnOp::create(
+                        builder, tpu_copy_with_dynamic_shape_op->getLoc(),
+                        tpu_copy_with_dynamic_shape_op->getResults())
                         .getOperation();
   tpu_copy_with_dynamic_shape_op->moveBefore(return_op);
   return success();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc
index e73d76fbc5907d..b2ab71fa5129cb 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc
@@ -230,8 +230,8 @@ class FuseContractionWithBiasAdd : public OpRewritePattern<SrcOpT> {
     auto *bias_add_op = bias_add.getOperation();
     if (bias_add_op) rewriter.setInsertionPoint(bias_add_op);
 
-    Value fused_op = rewriter.create<FusedOpT>(fused_loc, result_type,
-                                               ValueRange(operands), attrs);
+    Value fused_op = FusedOpT::create(rewriter, fused_loc, result_type,
+                                      ValueRange(operands), attrs);
     auto op_to_replace = fuse_activation ? activation : bias_add;
     rewriter.replaceOp(op_to_replace, ValueRange({fused_op}));
     return success();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc
index 2c70a078fbb13a..18fc8fc1cb58cc 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/hoist_replicate_invariant_resource_writes.cc
@@ -89,9 +89,9 @@ void MoveTailWritesAfterReplicate(
 
   OpBuilder builder(replicate_op);
   // Clone this old replicate op but with new result types.
-  auto new_replicate_op = builder.create<tf_device::ReplicateOp>(
-      replicate_op->getLoc(), new_result_types, replicate_op->getOperands(),
-      replicate_op->getAttrs());
+  auto new_replicate_op = tf_device::ReplicateOp::create(
+      builder, replicate_op->getLoc(), new_result_types,
+      replicate_op->getOperands(), replicate_op->getAttrs());
 
   // Move region to the new op.
   new_replicate_op.getRegion().takeBody(replicate_op.getRegion());
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc
index 9492c007b07ca5..7806967d7dcfe9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/host_runtime/tpu_merge_variables_with_execute.cc
@@ -413,8 +413,8 @@ void ReplaceParallelExecute(
       &output_types, parallel_execute, region_index + 1, num_regions);
 
   builder->setInsertionPoint(parallel_execute);
-  auto new_parallel_execute = builder->create<tf_device::ParallelExecuteOp>(
-      parallel_execute.getLoc(), num_regions, output_types);
+  auto new_parallel_execute = tf_device::ParallelExecuteOp::create(
+      *builder, parallel_execute.getLoc(), num_regions, output_types);
 
   // Replace the uses of the original parallel_execute before region containing
   // merged execute.
@@ -449,8 +449,8 @@ void ReplaceParallelExecute(
   // execute results.
   Operation* old_terminator = execute_region->front().getTerminator();
   builder->setInsertionPointToEnd(&execute_region->front());
-  builder->create<tf_device::ReturnOp>(old_terminator->getLoc(),
-                                       merged_execute_launch.getResults());
+  tf_device::ReturnOp::create(*builder, old_terminator->getLoc(),
+                              merged_execute_launch.getResults());
   old_terminator->erase();
 
   // Remove the original TPUExecute op.
@@ -532,8 +532,8 @@ LogicalResult MergeForOneTPUExecute(
   }
 
   // Create the merged execute and update variables op.
-  auto merged_execute = builder->create<TF::TPUExecuteAndUpdateVariablesOp>(
-      execute_launch.getLoc(), new_output_types,
+  auto merged_execute = TF::TPUExecuteAndUpdateVariablesOp::create(
+      *builder, execute_launch.getLoc(), new_output_types,
       var_access_info.new_operand_values,
       llvm::ArrayRef<NamedAttribute>{
           builder->getNamedAttr(
@@ -544,14 +544,14 @@ LogicalResult MergeForOneTPUExecute(
               builder->getI64ArrayAttr(device_var_updates_indices))});
 
   // Wrap in launch for device assignment.
-  auto merged_execute_launch = builder->create<tf_device::LaunchOp>(
-      merged_execute.getLoc(), execute_launch.getDeviceAttr(),
+  auto merged_execute_launch = tf_device::LaunchOp::create(
+      *builder, merged_execute.getLoc(), execute_launch.getDeviceAttr(),
       merged_execute.getResultTypes());
   merged_execute_launch.getBody().push_back(new Block);
 
   builder->setInsertionPointToEnd(&merged_execute_launch.GetBody());
-  builder->create<tf_device::ReturnOp>(merged_execute.getLoc(),
-                                       merged_execute.getResults());
+  tf_device::ReturnOp::create(*builder, merged_execute.getLoc(),
+                              merged_execute.getResults());
 
   merged_execute.getOperation()->moveBefore(
       merged_execute_launch.GetBody().getTerminator());
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc
index 85b61d16355077..0b5976b619ea26 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc
@@ -90,7 +90,7 @@ static Value CreateTFCastOpF32(OpBuilder *builder, Location loc, Value x,
   auto x_type = mlir::dyn_cast_or_null<ShapedType>(x.getType());
   if (!x_type) llvm_unreachable("unsupported type");
   Type type = x_type.clone(builder->getF32Type());
-  return builder->create<CastOp>(loc, type, x, truncate);
+  return CastOp::create(*builder, loc, type, x, truncate);
 }
 
 // Returns a TF_CastOp to I32. This function is used for CastOps that are
@@ -103,7 +103,7 @@ static Value CreateTFCastOpI32(OpBuilder *builder, Location loc, Value x,
   auto x_type = mlir::dyn_cast_or_null<ShapedType>(x.getType());
   if (!x_type) llvm_unreachable("unsupported type");
   Type type = x_type.clone(builder->getI32Type());
-  return builder->create<CastOp>(loc, type, x, truncate);
+  return CastOp::create(*builder, loc, type, x, truncate);
 }
 
 static APFloat ConvertToAPFloat(double val, Type type) {
@@ -125,22 +125,24 @@ static Value GetDimensionSize(OpBuilder *builder, Location loc, Value input,
     }
     // Return a ConstOp if it's static dimension.
     if (!ranked_ty.isDynamicDim(idx)) {
-      return builder->create<TF::ConstOp>(
-          loc, GetScalarOfType(
-                   builder->getIntegerType(use_32bit.getValue() ? 32 : 64),
-                   ranked_ty.getDimSize(idx)));
+      return TF::ConstOp::create(
+          *builder, loc,
+          GetScalarOfType(
+              builder->getIntegerType(use_32bit.getValue() ? 32 : 64),
+              ranked_ty.getDimSize(idx)));
     }
   }
 
-  auto shape = builder->create<TF::ShapeOp>(loc, input, use_32bit);
-  return builder->create<TF::StridedSliceOp>(
-      loc, mlir::RankedTensorType::get({}, getElementTypeOrSelf(shape)), shape,
+  auto shape = TF::ShapeOp::create(*builder, loc, input, use_32bit);
+  return TF::StridedSliceOp::create(
+      *builder, loc,
+      mlir::RankedTensorType::get({}, getElementTypeOrSelf(shape)), shape,
       /*begin=*/
-      builder->create<TF::ConstOp>(loc, builder->getI32TensorAttr({idx})),
+      TF::ConstOp::create(*builder, loc, builder->getI32TensorAttr({idx})),
       /*end=*/
-      builder->create<TF::ConstOp>(loc, builder->getI32TensorAttr({idx + 1})),
+      TF::ConstOp::create(*builder, loc, builder->getI32TensorAttr({idx + 1})),
       /*strides=*/
-      builder->create<TF::ConstOp>(loc, builder->getI32TensorAttr({1})),
+      TF::ConstOp::create(*builder, loc, builder->getI32TensorAttr({1})),
       /*begin_mask=*/0, /*end_mask=*/0, /*ellipsis_mask=*/0,
       /*new_axis_mask=*/0, /*shrink_axis_mask=*/1);
 }
@@ -211,9 +213,9 @@ Value ValuesToRank1(PatternRewriter &rewriter, Location loc, Type dtype,
                     ArrayRef<Value> vals) {
   int64_t length = vals.size();
   auto type = tensorflow::GetTypeFromTFTensorShape({length}, dtype);
-  auto axis = rewriter.create<ConstOp>(
-      loc, GetScalarOfType(rewriter.getIntegerType(64), 0));
-  return rewriter.create<ConcatV2Op>(loc, type, ValueRange(vals), axis);
+  auto axis = ConstOp::create(rewriter, loc,
+                              GetScalarOfType(rewriter.getIntegerType(64), 0));
+  return ConcatV2Op::create(rewriter, loc, type, ValueRange(vals), axis);
 }
 
 // Lowers AddN op to a sequence of AddV2 ops to accumulate operands.
@@ -277,10 +279,10 @@ class LowerAddNOp : public RewritePattern {
     while (n > 1) {
       for (int64_t i = 0; i < n; i += 2) {
         // Add two adjacent operands if applicable.
-        operands[i / 2] =
-            (i + 1 < n) ? rewriter.create<AddV2Op>(addn_op.getLoc(),
-                                                   operands[i], operands[i + 1])
-                        : operands[i];
+        operands[i / 2] = (i + 1 < n)
+                              ? AddV2Op::create(rewriter, addn_op.getLoc(),
+                                                operands[i], operands[i + 1])
+                              : operands[i];
       }
       n = (n + 1) / 2;
     }
@@ -363,8 +365,8 @@ class LowerDynamicStitchOp : public RewritePattern {
     packed_shape.push_back(-1);
     packed_shape.append(item_shape.begin(), item_shape.end());
     Location loc = op.getLoc();
-    auto packed_shape_val = rewriter.create<ConstOp>(
-        loc, GetI64ElementsAttr(packed_shape, &rewriter));
+    auto packed_shape_val = ConstOp::create(
+        rewriter, loc, GetI64ElementsAttr(packed_shape, &rewriter));
 
     // Prepare each of the output item by unpacking data and then putting it to
     // the specified index.
@@ -374,12 +376,13 @@ class LowerDynamicStitchOp : public RewritePattern {
       Value data = std::get<1>(it);
 
       auto reshaped_data =
-          rewriter.create<ReshapeOp>(loc, data, packed_shape_val);
+          ReshapeOp::create(rewriter, loc, data, packed_shape_val);
       auto num_items =
           mlir::cast<RankedTensorType>(reshaped_data.getType()).getShape()[0];
-      auto items = rewriter.create<UnpackOp>(
-          loc, SmallVector<Type, 4>(num_items, item_ty), reshaped_data,
-          /*axis=*/0);
+      auto items = UnpackOp::create(rewriter, loc,
+                                    SmallVector<Type, 4>(num_items, item_ty),
+                                    reshaped_data,
+                                    /*axis=*/0);
       for (auto index_item : llvm::zip(index_attr, items.getResults())) {
         int64_t output_index = std::get<0>(index_item).getSExtValue();
         Value item = std::get<1>(index_item);
@@ -426,80 +429,84 @@ class ConvertFakeQuantWithMinMaxVarsOp : public RewritePattern {
     auto float_min = op.getMin();
     auto float_max = op.getMax();
 
-    auto float_diff = rewriter.create<SubOp>(op.getLoc(), float_max, float_min);
+    auto float_diff =
+        SubOp::create(rewriter, op.getLoc(), float_max, float_min);
 
     // Compute the range when quantized.
-    auto quant_min = rewriter.create<ConstOp>(
-        op.getLoc(), DenseElementsAttr::get(
-                         scalar_ty, ConvertToAPFloat(bits_min, element_ty)));
-
-    auto quant_max = rewriter.create<ConstOp>(
-        op.getLoc(), DenseElementsAttr::get(
-                         scalar_ty, ConvertToAPFloat(bits_max, element_ty)));
-
-    auto quant_diff = rewriter.create<ConstOp>(
-        op.getLoc(),
+    auto quant_min =
+        ConstOp::create(rewriter, op.getLoc(),
+                        DenseElementsAttr::get(
+                            scalar_ty, ConvertToAPFloat(bits_min, element_ty)));
+
+    auto quant_max =
+        ConstOp::create(rewriter, op.getLoc(),
+                        DenseElementsAttr::get(
+                            scalar_ty, ConvertToAPFloat(bits_max, element_ty)));
+
+    auto quant_diff = ConstOp::create(
+        rewriter, op.getLoc(),
         DenseElementsAttr::get(
             scalar_ty, ConvertToAPFloat(bits_max - bits_min, element_ty)));
 
     auto quant_to_float =
-        rewriter.create<DivOp>(op.getLoc(), float_diff, quant_diff);
+        DivOp::create(rewriter, op.getLoc(), float_diff, quant_diff);
 
     auto float_to_quant =
-        rewriter.create<DivOp>(op.getLoc(), quant_diff, float_diff);
+        DivOp::create(rewriter, op.getLoc(), quant_diff, float_diff);
 
     // During quantization, the quantized min/max values may not line up
     // perfectly with the specified min/max. Nudge them into the right range.
     auto min_scaled =
-        rewriter.create<DivOp>(op.getLoc(), float_min, quant_to_float);
+        DivOp::create(rewriter, op.getLoc(), float_min, quant_to_float);
     auto min_scaled_sub =
-        rewriter.create<SubOp>(op.getLoc(), quant_min, min_scaled);
+        SubOp::create(rewriter, op.getLoc(), quant_min, min_scaled);
 
     auto mid_rounded =
-        rewriter.create<RoundOp>(op.getLoc(), scalar_ty, min_scaled_sub);
+        RoundOp::create(rewriter, op.getLoc(), scalar_ty, min_scaled_sub);
 
-    auto nudged_zero_point_val = rewriter.create<ClipByValueOp>(
-        op.getLoc(), scalar_ty, mid_rounded, quant_min, quant_max);
+    auto nudged_zero_point_val = ClipByValueOp::create(
+        rewriter, op.getLoc(), scalar_ty, mid_rounded, quant_min, quant_max);
 
     auto quant_min_sub =
-        rewriter.create<SubOp>(op.getLoc(), quant_min, nudged_zero_point_val);
+        SubOp::create(rewriter, op.getLoc(), quant_min, nudged_zero_point_val);
     auto quant_max_sub =
-        rewriter.create<SubOp>(op.getLoc(), quant_max, nudged_zero_point_val);
+        SubOp::create(rewriter, op.getLoc(), quant_max, nudged_zero_point_val);
 
     auto nudged_float_min =
-        rewriter.create<MulOp>(op.getLoc(), quant_min_sub, quant_to_float);
+        MulOp::create(rewriter, op.getLoc(), quant_min_sub, quant_to_float);
 
     auto nudged_float_max =
-        rewriter.create<MulOp>(op.getLoc(), quant_max_sub, quant_to_float);
+        MulOp::create(rewriter, op.getLoc(), quant_max_sub, quant_to_float);
 
     // Now quantize the input value with the approximated min/max values.
 
     // Move the input value into quantized space
-    Value quantized_input = rewriter.create<ClipByValueOp>(
-        op.getLoc(), input_ty, input, nudged_float_min, nudged_float_max);
+    Value quantized_input =
+        ClipByValueOp::create(rewriter, op.getLoc(), input_ty, input,
+                              nudged_float_min, nudged_float_max);
 
-    quantized_input = rewriter.create<SubOp>(op.getLoc(), input_ty,
-                                             quantized_input, nudged_float_min);
+    quantized_input = SubOp::create(rewriter, op.getLoc(), input_ty,
+                                    quantized_input, nudged_float_min);
 
-    quantized_input = rewriter.create<MulOp>(op.getLoc(), input_ty,
-                                             quantized_input, float_to_quant);
+    quantized_input = MulOp::create(rewriter, op.getLoc(), input_ty,
+                                    quantized_input, float_to_quant);
 
     // Round the quantized input always to the positive direction.
-    auto half_val = rewriter.create<ConstOp>(
-        op.getLoc(),
+    auto half_val = ConstOp::create(
+        rewriter, op.getLoc(),
         DenseElementsAttr::get(scalar_ty, ConvertToAPFloat(0.5, element_ty)));
 
-    quantized_input = rewriter.create<AddV2Op>(op.getLoc(), input_ty,
-                                               quantized_input, half_val);
+    quantized_input = AddV2Op::create(rewriter, op.getLoc(), input_ty,
+                                      quantized_input, half_val);
 
-    quantized_input = rewriter.create<FloorOp>(op.getLoc(), quantized_input);
+    quantized_input = FloorOp::create(rewriter, op.getLoc(), quantized_input);
 
     // Convert back into floating point spae.
-    Value output = rewriter.create<MulOp>(op.getLoc(), input_ty,
-                                          quantized_input, quant_to_float);
+    Value output = MulOp::create(rewriter, op.getLoc(), input_ty,
+                                 quantized_input, quant_to_float);
 
-    output = rewriter.create<AddV2Op>(op.getLoc(), input_ty, output,
-                                      nudged_float_min);
+    output = AddV2Op::create(rewriter, op.getLoc(), input_ty, output,
+                             nudged_float_min);
 
     rewriter.replaceOp(op, {output});
     return success();
@@ -549,20 +556,21 @@ class LowerInvertPermutationOp : public RewritePattern {
     Type int_type = x_type.getElementType();  // Could be i32 or i64.
 
     auto result_type = x_type;
-    auto start = rewriter.create<ConstOp>(loc, GetScalarOfType(int_type, 0));
-    Value limit = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(int_type, x_type.getShape()[0]));
-    auto delta = rewriter.create<ConstOp>(loc, GetScalarOfType(int_type, 1));
+    auto start = ConstOp::create(rewriter, loc, GetScalarOfType(int_type, 0));
+    Value limit = ConstOp::create(
+        rewriter, loc, GetScalarOfType(int_type, x_type.getShape()[0]));
+    auto delta = ConstOp::create(rewriter, loc, GetScalarOfType(int_type, 1));
     // Construct a sequence of numbers [0, 1, ... len(x)-1].
     auto updates =
-        rewriter.create<RangeOp>(loc, result_type, start, limit, delta);
+        RangeOp::create(rewriter, loc, result_type, start, limit, delta);
 
     auto shape_type =
         tensorflow::GetTypeFromTFTensorShape({2}, rewriter.getIntegerType(32));
-    auto shape = rewriter.create<ConstOp>(
-        loc, DenseElementsAttr::get(
-                 shape_type, {static_cast<int>(x_type.getDimSize(0)), 1}));
-    auto indices = rewriter.create<ReshapeOp>(loc, op.getX(), shape);
+    auto shape = ConstOp::create(
+        rewriter, loc,
+        DenseElementsAttr::get(shape_type,
+                               {static_cast<int>(x_type.getDimSize(0)), 1}));
+    auto indices = ReshapeOp::create(rewriter, loc, op.getX(), shape);
 
     rewriter.replaceOpWithNewOp<TensorScatterUpdateOp>(
         op, result_type, op.getX(), indices, updates);
@@ -641,16 +649,17 @@ class LowerLgammaOp : public RewritePattern {
       } else {
         tensor_type = UnrankedTensorType::get(float_type);
       }
-      input = rewriter.create<CastOp>(loc, tensor_type, input);
+      input = CastOp::create(rewriter, loc, tensor_type, input);
     }
 
     // Helper lambda function for creating a ConstOp for a tensor filled with
     // the given constant float value.
     auto create_const_op = [&rewriter, loc, tensor_type,
                             float_type](double value) {
-      return rewriter.create<ConstOp>(
-          loc, DenseElementsAttr::get(tensor_type,
-                                      FloatAttr::get(float_type, value)));
+      return ConstOp::create(
+          rewriter, loc,
+          DenseElementsAttr::get(tensor_type,
+                                 FloatAttr::get(float_type, value)));
     };
 
     Value one_half = create_const_op(0.5);
@@ -664,26 +673,26 @@ class LowerLgammaOp : public RewritePattern {
         create_const_op(std::log(kLanczosGamma + 0.5));
     Value base_lanczos_coeff = create_const_op(kBaseLanczosCoeff);
 
-    Value minus_input = rewriter.create<NegOp>(loc, input);
-    Value input_minus_one = rewriter.create<SubOp>(loc, input, one);
+    Value minus_input = NegOp::create(rewriter, loc, input);
+    Value input_minus_one = SubOp::create(rewriter, loc, input, one);
 
     // If the input is less than 0.5 use Euler's reflection formula:
     // gamma(x) = pi / (sin(pi * x) * gamma(1 - x))
-    Value need_to_reflect = rewriter.create<LessOp>(loc, input, one_half);
+    Value need_to_reflect = LessOp::create(rewriter, loc, input, one_half);
     Type tensor_bool_type = need_to_reflect.getType();
-    Value z = rewriter.create<SelectV2Op>(loc, need_to_reflect, minus_input,
-                                          input_minus_one);
+    Value z = SelectV2Op::create(rewriter, loc, need_to_reflect, minus_input,
+                                 input_minus_one);
 
     Value x = base_lanczos_coeff;
     for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) {
       Value lanczos_coefficient = create_const_op(kLanczosCoefficients[i]);
       Value index = create_const_op(static_cast<double>(i));
-      Value z_plus_index = rewriter.create<AddV2Op>(loc, z, index);
+      Value z_plus_index = AddV2Op::create(rewriter, loc, z, index);
       Value z_plus_index_plus_one =
-          rewriter.create<AddV2Op>(loc, z_plus_index, one);
-      Value incr = rewriter.create<DivOp>(loc, lanczos_coefficient,
-                                          z_plus_index_plus_one);
-      x = rewriter.create<AddV2Op>(loc, x, incr);
+          AddV2Op::create(rewriter, loc, z_plus_index, one);
+      Value incr = DivOp::create(rewriter, loc, lanczos_coefficient,
+                                 z_plus_index_plus_one);
+      x = AddV2Op::create(rewriter, loc, x, incr);
     }
 
     // To improve accuracy on platforms with less-precise log implementations,
@@ -691,14 +700,14 @@ class LowerLgammaOp : public RewritePattern {
     // the device.
     // log(t) = log(kLanczosGamma + 0.5 + z)
     //        = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5))
-    Value t = rewriter.create<AddV2Op>(loc, lanczos_gamma_plus_one_half, z);
+    Value t = AddV2Op::create(rewriter, loc, lanczos_gamma_plus_one_half, z);
     Value z_div_lanczos_gamma_plus_one_half =
-        rewriter.create<DivOp>(loc, z, lanczos_gamma_plus_one_half);
+        DivOp::create(rewriter, loc, z, lanczos_gamma_plus_one_half);
     Value log1p_z_div_lanczos_gamma_plus_one_half =
-        rewriter.create<Log1pOp>(loc, z_div_lanczos_gamma_plus_one_half);
+        Log1pOp::create(rewriter, loc, z_div_lanczos_gamma_plus_one_half);
     Value log_t =
-        rewriter.create<AddV2Op>(loc, log_lanczos_gamma_plus_one_half,
-                                 log1p_z_div_lanczos_gamma_plus_one_half);
+        AddV2Op::create(rewriter, loc, log_lanczos_gamma_plus_one_half,
+                        log1p_z_div_lanczos_gamma_plus_one_half);
 
     // Compute the final result (modulo reflection).  t(z) may be large, and we
     // need to be careful not to overflow to infinity in the first term of
@@ -710,17 +719,17 @@ class LowerLgammaOp : public RewritePattern {
     //   (z + 1/2 - t(z) / log(t(z))) * log(t(z)).
     //
     // log_y = log_sqrt_two_pi + (z + one_half - t / log_t) * log_t + Log(x);
-    Value t_div_log_t = rewriter.create<DivOp>(loc, t, log_t);
+    Value t_div_log_t = DivOp::create(rewriter, loc, t, log_t);
     Value one_half_minus_t_div_log_t =
-        rewriter.create<SubOp>(loc, one_half, t_div_log_t);
+        SubOp::create(rewriter, loc, one_half, t_div_log_t);
     Value z_plus_one_half_minus_t_div_log_t =
-        rewriter.create<AddV2Op>(loc, z, one_half_minus_t_div_log_t);
+        AddV2Op::create(rewriter, loc, z, one_half_minus_t_div_log_t);
     Value z_plus_one_half_minus_t_div_log_t_mul_log_t =
-        rewriter.create<MulOp>(loc, z_plus_one_half_minus_t_div_log_t, log_t);
-    Value log_x = rewriter.create<LogOp>(loc, x);
-    Value log_y_rhs = rewriter.create<AddV2Op>(
-        loc, z_plus_one_half_minus_t_div_log_t_mul_log_t, log_x);
-    Value log_y = rewriter.create<AddV2Op>(loc, log_sqrt_two_pi, log_y_rhs);
+        MulOp::create(rewriter, loc, z_plus_one_half_minus_t_div_log_t, log_t);
+    Value log_x = LogOp::create(rewriter, loc, x);
+    Value log_y_rhs = AddV2Op::create(
+        rewriter, loc, z_plus_one_half_minus_t_div_log_t_mul_log_t, log_x);
+    Value log_y = AddV2Op::create(rewriter, loc, log_sqrt_two_pi, log_y_rhs);
 
     // Compute the reflected value, used when x < 0.5:
     //
@@ -747,48 +756,48 @@ class LowerLgammaOp : public RewritePattern {
     // Furthermore, pi * abs(frac(x)) loses precision when abs(frac(x)) is close
     // to 1.  To remedy this, we can use the fact that sin(pi * x) in the domain
     // [0, 1] is symmetric across the line Y=0.5.
-    Value abs_input = rewriter.create<AbsOp>(loc, input);
-    Value abs_input_floor = rewriter.create<FloorOp>(loc, abs_input);
+    Value abs_input = AbsOp::create(rewriter, loc, input);
+    Value abs_input_floor = FloorOp::create(rewriter, loc, abs_input);
     Value abs_frac_input =
-        rewriter.create<SubOp>(loc, abs_input, abs_input_floor);
+        SubOp::create(rewriter, loc, abs_input, abs_input_floor);
 
     // Convert values of abs_frac_input > 0.5 to (1 - frac_input) to improve
     // precision of pi * abs_frac_input for values of abs_frac_input close to 1.
     Value one_minus_abs_frac_input =
-        rewriter.create<SubOp>(loc, one, abs_frac_input);
+        SubOp::create(rewriter, loc, one, abs_frac_input);
     Value abs_frac_input_gt_one_half =
-        rewriter.create<GreaterOp>(loc, abs_frac_input, one_half);
+        GreaterOp::create(rewriter, loc, abs_frac_input, one_half);
     Value reduced_frac_input =
-        rewriter.create<SelectV2Op>(loc, abs_frac_input_gt_one_half,
-                                    one_minus_abs_frac_input, abs_frac_input);
+        SelectV2Op::create(rewriter, loc, abs_frac_input_gt_one_half,
+                           one_minus_abs_frac_input, abs_frac_input);
     Value pi_mul_reduced_frac_input =
-        rewriter.create<MulOp>(loc, pi, reduced_frac_input);
+        MulOp::create(rewriter, loc, pi, reduced_frac_input);
     Value sin_pi_mul_reduced_frac_input =
-        rewriter.create<SinOp>(loc, pi_mul_reduced_frac_input);
+        SinOp::create(rewriter, loc, pi_mul_reduced_frac_input);
     Value reflection_denom =
-        rewriter.create<LogOp>(loc, sin_pi_mul_reduced_frac_input);
+        LogOp::create(rewriter, loc, sin_pi_mul_reduced_frac_input);
 
     // Avoid computing -inf - inf, which is nan.  If reflection_denom is +/-inf,
     // then it "wins" and the result is +/-inf.
     Value is_finite =
-        rewriter.create<IsFiniteOp>(loc, tensor_bool_type, reflection_denom);
-    Value neg_reflection_denom = rewriter.create<NegOp>(loc, reflection_denom);
+        IsFiniteOp::create(rewriter, loc, tensor_bool_type, reflection_denom);
+    Value neg_reflection_denom = NegOp::create(rewriter, loc, reflection_denom);
     Value log_pi_minus_reflection_denom =
-        rewriter.create<SubOp>(loc, log_pi, reflection_denom);
+        SubOp::create(rewriter, loc, log_pi, reflection_denom);
     Value reflection_if_finite =
-        rewriter.create<SubOp>(loc, log_pi_minus_reflection_denom, log_y);
-    Value reflection = rewriter.create<SelectV2Op>(
-        loc, is_finite, reflection_if_finite, neg_reflection_denom);
+        SubOp::create(rewriter, loc, log_pi_minus_reflection_denom, log_y);
+    Value reflection = SelectV2Op::create(
+        rewriter, loc, is_finite, reflection_if_finite, neg_reflection_denom);
 
     Value result =
-        rewriter.create<SelectV2Op>(loc, need_to_reflect, reflection, log_y);
+        SelectV2Op::create(rewriter, loc, need_to_reflect, reflection, log_y);
 
     // lgamma(+/-inf) = +inf.
-    Value is_inf = rewriter.create<IsInfOp>(loc, tensor_bool_type, input);
-    result = rewriter.create<SelectV2Op>(loc, is_inf, infinity, result);
+    Value is_inf = IsInfOp::create(rewriter, loc, tensor_bool_type, input);
+    result = SelectV2Op::create(rewriter, loc, is_inf, infinity, result);
 
     if (needs_cast) {
-      result = rewriter.create<CastOp>(loc, original_tensor_type, result);
+      result = CastOp::create(rewriter, loc, original_tensor_type, result);
     }
 
     rewriter.replaceOp(op, result);
@@ -819,10 +828,11 @@ class LowerPackOp : public RewritePattern {
     auto op = cast<PackOp>(src_op);
 
     Location loc = op.getLoc();
-    auto axis_value = rewriter.create<ConstOp>(
-        loc, DenseElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
-                                        {}, rewriter.getIntegerType(64)),
-                                    op.getAxis()));
+    auto axis_value = ConstOp::create(
+        rewriter, loc,
+        DenseElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
+                                   {}, rewriter.getIntegerType(64)),
+                               op.getAxis()));
     int64_t axis = op.getAxis();
 
     Type prev_input_ty, inferred_ty;
@@ -838,7 +848,7 @@ class LowerPackOp : public RewritePattern {
         prev_input_ty = input_ty;
       }
       expanded_inputs.push_back(
-          rewriter.create<ExpandDimsOp>(loc, inferred_ty, input, axis_value));
+          ExpandDimsOp::create(rewriter, loc, inferred_ty, input, axis_value));
     }
 
     rewriter.replaceOpWithNewOp<ConcatV2Op>(op, op.getType(), expanded_inputs,
@@ -922,28 +932,28 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     auto block_shape_i64_type = tensorflow::GetTypeFromTFTensorShape(
         block_shape_type.getShape(), rewriter.getIntegerType(64));
     auto block_shape_i64 =
-        rewriter.create<CastOp>(loc, block_shape_i64_type, op.getBlockShape());
+        CastOp::create(rewriter, loc, block_shape_i64_type, op.getBlockShape());
 
     auto paddings_i64_type = tensorflow::GetTypeFromTFTensorShape(
         paddings_type.getShape(), rewriter.getIntegerType(64));
     auto paddings_i64 =
-        rewriter.create<CastOp>(loc, paddings_i64_type, op.getPaddings());
+        CastOp::create(rewriter, loc, paddings_i64_type, op.getPaddings());
 
-    auto pad00 = rewriter.create<ConstOp>(
-        loc, DenseElementsAttr::get<int64_t>(
-                 tensorflow::GetTypeFromTFTensorShape(
-                     {1, 2}, rewriter.getIntegerType(64)),
-                 {0, 0}));
+    auto pad00 = ConstOp::create(rewriter, loc,
+                                 DenseElementsAttr::get<int64_t>(
+                                     tensorflow::GetTypeFromTFTensorShape(
+                                         {1, 2}, rewriter.getIntegerType(64)),
+                                     {0, 0}));
     SmallVector<Value, 4> full_paddings_list{pad00, paddings_i64};
     full_paddings_list.append(remaining_rank, pad00);
     auto full_paddings_type = tensorflow::GetTypeFromTFTensorShape(
         {input_rank, 2}, rewriter.getIntegerType(64));
-    auto zero_i64 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getIntegerType(64), 0));
+    auto zero_i64 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getIntegerType(64), 0));
     // Extends paddings to all dimensions of input by adding 0s to non-block
     // dimensions.
-    auto full_paddings = rewriter.create<ConcatV2Op>(
-        loc, full_paddings_type, full_paddings_list, zero_i64);
+    auto full_paddings = ConcatV2Op::create(rewriter, loc, full_paddings_type,
+                                            full_paddings_list, zero_i64);
 
     // Compute the result type here instead of using shape inference because the
     // full_paddings won't be available as a constant for shape inference.
@@ -973,45 +983,44 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
         tensorflow::GetTypeFromTFTensorShape(padded_shape, element_type);
     // padded = pad(input, full_paddings)
     auto padded =
-        rewriter.create<PadOp>(loc, padded_type, op.getInput(), full_paddings);
+        PadOp::create(rewriter, loc, padded_type, op.getInput(), full_paddings);
 
     auto paddings_sum_type = tensorflow::GetTypeFromTFTensorShape(
         {input_rank}, rewriter.getIntegerType(64));
     // paddings_sum = paddings[*,0] + paddings[*,1]
-    auto paddings_split = rewriter.create<UnpackOp>(
-        loc, TypeRange({paddings_sum_type, paddings_sum_type}), full_paddings,
-        rewriter.getI64IntegerAttr(1));
-    auto paddings_sum = rewriter.create<AddV2Op>(
-        loc, paddings_split.getResult(0), paddings_split.getResult(1));
-
-    auto input_shape_tensor = rewriter.create<ConstOp>(
-        loc,
+    auto paddings_split = UnpackOp::create(
+        rewriter, loc, TypeRange({paddings_sum_type, paddings_sum_type}),
+        full_paddings, rewriter.getI64IntegerAttr(1));
+    auto paddings_sum =
+        AddV2Op::create(rewriter, loc, paddings_split.getResult(0),
+                        paddings_split.getResult(1));
+
+    auto input_shape_tensor = ConstOp::create(
+        rewriter, loc,
         DenseElementsAttr::get(tensorflow::GetTypeFromTFTensorShape(
                                    {input_rank}, rewriter.getIntegerType(64)),
                                input_shape));
 
     // padded_shape_tensor is the shape of padded.
     auto padded_shape_tensor =
-        rewriter.create<AddV2Op>(loc, paddings_sum, input_shape_tensor);
+        AddV2Op::create(rewriter, loc, paddings_sum, input_shape_tensor);
 
-    auto zero_i32 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getIntegerType(32), 0));
+    auto zero_i32 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getIntegerType(32), 0));
     SmallVector<Type, 4> padded_shape_splits_types(
         input_rank,
         tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getIntegerType(64)));
     SmallVector<Value, 4> padded_shape_splits(
-        rewriter
-            .create<SplitOp>(loc, padded_shape_splits_types, zero_i32,
-                             padded_shape_tensor)
+        SplitOp::create(rewriter, loc, padded_shape_splits_types, zero_i32,
+                        padded_shape_tensor)
             .getOutput());
 
     SmallVector<Type, 4> block_shape_splits_types(
         block_rank,
         tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getIntegerType(64)));
     SmallVector<Value, 4> block_shape_splits(
-        rewriter
-            .create<SplitOp>(loc, block_shape_splits_types, zero_i32,
-                             block_shape_i64)
+        SplitOp::create(rewriter, loc, block_shape_splits_types, zero_i32,
+                        block_shape_i64)
             .getOutput());
 
     SmallVector<int64_t, 4> outer_shape_ints;
@@ -1019,8 +1028,8 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     for (int64_t i = 0; i < block_rank; ++i) {
       // TODO(b/157475606): Insert tf.Assert that the following division has
       // remainder 0.
-      outer_shape_vals.push_back(rewriter.create<DivOp>(
-          loc, padded_shape_splits[1 + i], block_shape_splits[i]));
+      outer_shape_vals.push_back(DivOp::create(
+          rewriter, loc, padded_shape_splits[1 + i], block_shape_splits[i]));
 
       auto padded_shape_i = padded_shape[1 + i];
       auto block_shape_ints_i = block_shape_ints[i];
@@ -1049,8 +1058,8 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     auto reshaped_shape = ValuesToRank1(
         rewriter, loc, rewriter.getIntegerType(64), reshaped_shape_vals);
 
-    auto reshaped = rewriter.create<ReshapeOp>(
-        loc,
+    auto reshaped = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(reshaped_shape_ints, element_type),
         padded, reshaped_shape);
 
@@ -1065,14 +1074,14 @@ class LowerSpaceToBatchNDOp : public RewritePattern {
     for (int64_t i = 1 + block_rank; i < input_rank; ++i) {
       permutation_vals.push_back(block_rank + i);
     }
-    auto permutation = rewriter.create<ConstOp>(
-        loc, GetI64ElementsAttr(permutation_vals, &rewriter));
+    auto permutation = ConstOp::create(
+        rewriter, loc, GetI64ElementsAttr(permutation_vals, &rewriter));
 
-    auto permuted = rewriter.create<TransposeOp>(loc, reshaped, permutation);
+    auto permuted = TransposeOp::create(rewriter, loc, reshaped, permutation);
     auto output_batch = padded_shape_splits[0];
     for (int64_t i = 0; i < block_rank; ++i) {
       output_batch =
-          rewriter.create<MulOp>(loc, output_batch, block_shape_splits[i]);
+          MulOp::create(rewriter, loc, output_batch, block_shape_splits[i]);
     }
     SmallVector<Value, 4> output_shape_vals{output_batch};
     for (int64_t i = 0; i < block_rank; ++i) {
@@ -1163,11 +1172,11 @@ class LowerBatchToSpaceND : public RewritePattern {
     std::copy(input_shape.begin() + 1, input_shape.end(),
               reshaped_shape.begin() + block_rank + 1);
 
-    auto reshaped = rewriter.create<TF::ReshapeOp>(
-        op.getLoc(),
+    auto reshaped = TF::ReshapeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(reshaped_shape, element_ty), input,
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(reshaped_shape)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(reshaped_shape)));
 
     // 2. Permute dimensions of `reshaped` to produce `permuted` of shape
     //      [batch / prod(block_shape),
@@ -1191,12 +1200,12 @@ class LowerBatchToSpaceND : public RewritePattern {
       transpose_shape[it.index()] = reshaped_shape[it.value()];
     }
 
-    auto permuted = rewriter.create<TF::TransposeOp>(
-        op.getLoc(),
+    auto permuted = TF::TransposeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(transpose_shape, element_ty),
         reshaped,
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(permutation)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(permutation)));
 
     // 3. Reshape `permuted` to produce `reshaped_permuted` of shape
     //      [batch / prod(block_shape),
@@ -1219,13 +1228,13 @@ class LowerBatchToSpaceND : public RewritePattern {
     std::copy(remainder_shape.begin(), remainder_shape.end(),
               reshaped_permuted_shape.begin() + 1 + block_rank);
 
-    auto reshaped_permuted = rewriter.create<TF::ReshapeOp>(
-        op.getLoc(),
+    auto reshaped_permuted = TF::ReshapeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(reshaped_permuted_shape,
                                              element_ty),
         permuted,
-        rewriter.create<ConstOp>(
-            op.getLoc(), rewriter.getI64TensorAttr(reshaped_permuted_shape)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(reshaped_permuted_shape)));
 
     // 4. Crop the start and end of dimensions `[1, ..., M]` of
     //    `reshaped_permuted` according to `crops` to produce the output of
@@ -1263,10 +1272,10 @@ class LowerBatchToSpaceND : public RewritePattern {
     rewriter.replaceOpWithNewOp<TF::SliceOp>(
         op, tensorflow::GetTypeFromTFTensorShape(slice_sizes, element_ty),
         reshaped_permuted,
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(start_indices)),
-        rewriter.create<ConstOp>(op.getLoc(),
-                                 rewriter.getI64TensorAttr(slice_sizes)));
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(start_indices)),
+        ConstOp::create(rewriter, op.getLoc(),
+                        rewriter.getI64TensorAttr(slice_sizes)));
     return success();
   }
 };
@@ -1310,11 +1319,11 @@ class LowerSparseMatMulOp : public RewritePattern {
         tensor_type_f32 = UnrankedTensorType::get(Float32Type::get(context));
       }
       // Add cast to f32 to conform with element type of result.
-      operand = rewriter.create<CastOp>(op.getLoc(), tensor_type_f32, operand);
+      operand = CastOp::create(rewriter, op.getLoc(), tensor_type_f32, operand);
     }
-    Value result = rewriter.create<MatMulOp>(
-        op.getLoc(), op.getProduct().getType(), operands[0], operands[1],
-        op.getTransposeA(), op.getTransposeB());
+    Value result = MatMulOp::create(
+        rewriter, op.getLoc(), op.getProduct().getType(), operands[0],
+        operands[1], op.getTransposeA(), op.getTransposeB());
 
     rewriter.replaceOp(op, {result});
     return success();
@@ -1441,20 +1450,22 @@ class LowerResizeNearestNeighbor : public RewritePattern {
     }
 
     auto one =
-        rewriter.create<ConstOp>(loc, GetScalarOfType(out_size_element_ty, 1));
+        ConstOp::create(rewriter, loc, GetScalarOfType(out_size_element_ty, 1));
 
     // Extract the image shape.
-    Value input_shape = rewriter.create<ShapeOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({4}, rewriter.getI64Type()),
+    Value input_shape = ShapeOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({4}, rewriter.getI64Type()),
         input);
-    input_shape = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
+    input_shape = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
         input_shape);
 
     auto scalar_dim_ty =
         tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty);
-    auto split_image_shape = rewriter.create<UnpackOp>(
-        loc,
+    auto split_image_shape = UnpackOp::create(
+        rewriter, loc,
         TypeRange({scalar_dim_ty, scalar_dim_ty, scalar_dim_ty, scalar_dim_ty}),
         input_shape);
 
@@ -1464,151 +1475,156 @@ class LowerResizeNearestNeighbor : public RewritePattern {
     auto in_x = split_image_shape.getResult(2);
     auto channels = split_image_shape.getResult(3);
 
-    auto in_count = rewriter.create<MulOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty),
-        in_y, in_x);
+    auto in_count = MulOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty), in_y,
+        in_x);
 
     // Unpack and separate the out width/height.
-    auto split_out_size = rewriter.create<UnpackOp>(
-        loc, TypeRange({scalar_dim_ty, scalar_dim_ty}), out_size);
+    auto split_out_size = UnpackOp::create(
+        rewriter, loc, TypeRange({scalar_dim_ty, scalar_dim_ty}), out_size);
 
     auto out_y = split_out_size.getResult(0);
     auto out_x = split_out_size.getResult(1);
 
-    auto out_count = rewriter.create<MulOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty),
-        out_y, out_x);
+    auto out_count = MulOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, out_size_element_ty), out_y,
+        out_x);
 
     // Generate what the final output shape will look like.
-    auto out_shape = rewriter.create<PackOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
+    auto out_shape = PackOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({4}, out_size_element_ty),
         ValueRange({batch, out_y, out_x, channels}));
 
     // Compute the indices along the vertical dimension.
-    auto in_y_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        in_y);
-    auto out_w_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        out_y);
-
-    Value y_scale = rewriter.create<DivOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
+    auto in_y_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), in_y);
+    auto out_w_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), out_y);
+
+    Value y_scale = DivOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
         in_y_f32, out_w_f32);
 
-    Value zero_f32 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getF32Type(), 0.0));
-    Value one_f32 = rewriter.create<ConstOp>(
-        loc, GetScalarOfType(rewriter.getF32Type(), 1.0));
-
-    Value y_range = rewriter.create<RangeOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_height_constant},
-                                             rewriter.getF32Type()),
-        zero_f32, out_w_f32, one_f32);
-
-    y_range = rewriter.create<MulOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_height_constant},
-                                             rewriter.getF32Type()),
-        y_range, y_scale);
-
-    y_range =
-        rewriter.create<CastOp>(loc,
-                                tensorflow::GetTypeFromTFTensorShape(
-                                    {out_height_constant}, out_size_element_ty),
-                                y_range);
-
-    y_range = rewriter.create<ReshapeOp>(
-        loc,
+    Value zero_f32 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getF32Type(), 0.0));
+    Value one_f32 = ConstOp::create(
+        rewriter, loc, GetScalarOfType(rewriter.getF32Type(), 1.0));
+
+    Value y_range =
+        RangeOp::create(rewriter, loc,
+                        tensorflow::GetTypeFromTFTensorShape(
+                            {out_height_constant}, rewriter.getF32Type()),
+                        zero_f32, out_w_f32, one_f32);
+
+    y_range = MulOp::create(rewriter, loc,
+                            tensorflow::GetTypeFromTFTensorShape(
+                                {out_height_constant}, rewriter.getF32Type()),
+                            y_range, y_scale);
+
+    y_range = CastOp::create(rewriter, loc,
+                             tensorflow::GetTypeFromTFTensorShape(
+                                 {out_height_constant}, out_size_element_ty),
+                             y_range);
+
+    y_range = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape({out_height_constant, 1},
                                              out_size_element_ty),
         y_range,
-        rewriter.create<PackOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
+        PackOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
             ValueRange({out_y, one})));
 
-    Value y_indices = rewriter.create<MulOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_height_constant, 1},
-                                             out_size_element_ty),
-        y_range, in_x);
+    Value y_indices =
+        MulOp::create(rewriter, loc,
+                      tensorflow::GetTypeFromTFTensorShape(
+                          {out_height_constant, 1}, out_size_element_ty),
+                      y_range, in_x);
 
     // Compute the indices for the nearest neighbour lookup across the width
     // dim.
-    auto in_x_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        in_x);
-    auto out_h_f32 = rewriter.create<CastOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
-        out_x);
-
-    Value x_scale = rewriter.create<DivOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
+    auto in_x_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), in_x);
+    auto out_h_f32 = CastOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()), out_x);
+
+    Value x_scale = DivOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({}, rewriter.getF32Type()),
         in_x_f32, out_h_f32);
 
-    Value x_range = rewriter.create<RangeOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({out_width_constant},
-                                             rewriter.getF32Type()),
-        zero_f32, out_h_f32, one_f32);
-
-    x_range =
-        rewriter.create<MulOp>(loc,
-                               tensorflow::GetTypeFromTFTensorShape(
-                                   {out_width_constant}, rewriter.getF32Type()),
-                               x_range, x_scale);
-
-    x_range =
-        rewriter.create<CastOp>(loc,
-                                tensorflow::GetTypeFromTFTensorShape(
-                                    {out_width_constant}, out_size_element_ty),
-                                x_range);
-
-    Value x_indices = rewriter.create<ReshapeOp>(
-        loc,
+    Value x_range =
+        RangeOp::create(rewriter, loc,
+                        tensorflow::GetTypeFromTFTensorShape(
+                            {out_width_constant}, rewriter.getF32Type()),
+                        zero_f32, out_h_f32, one_f32);
+
+    x_range = MulOp::create(rewriter, loc,
+                            tensorflow::GetTypeFromTFTensorShape(
+                                {out_width_constant}, rewriter.getF32Type()),
+                            x_range, x_scale);
+
+    x_range = CastOp::create(rewriter, loc,
+                             tensorflow::GetTypeFromTFTensorShape(
+                                 {out_width_constant}, out_size_element_ty),
+                             x_range);
+
+    Value x_indices = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape({1, out_width_constant},
                                              out_size_element_ty),
         x_range,
-        rewriter.create<PackOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
+        PackOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({2}, out_size_element_ty),
             ValueRange({one, out_x})));
 
     // Generate the combined index array, reshape to be 1-D.
-    Value indices = rewriter.create<AddV2Op>(
-        loc,
+    Value indices = AddV2Op::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {out_height_constant, out_width_constant}, out_size_element_ty),
         y_indices, x_indices);
 
-    indices = rewriter.create<ReshapeOp>(
-        loc,
+    indices = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape({out_spatial_cst},
                                              out_size_element_ty),
         indices,
-        rewriter.create<ReshapeOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({1}, out_size_element_ty),
+        ReshapeOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({1}, out_size_element_ty),
             out_count,
-            rewriter.create<ConstOp>(loc, rewriter.getI64TensorAttr({1}))));
+            ConstOp::create(rewriter, loc, rewriter.getI64TensorAttr({1}))));
 
     // Group the spatial indices and gather along that combined index.
-    Value input_collapsed_spatial = rewriter.create<ReshapeOp>(
-        loc,
+    Value input_collapsed_spatial = ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {batch_cst, in_spatial_cst, channels_cst}, input_element_ty),
         input,
-        rewriter.create<PackOp>(
-            loc, tensorflow::GetTypeFromTFTensorShape({3}, out_size_element_ty),
+        PackOp::create(
+            rewriter, loc,
+            tensorflow::GetTypeFromTFTensorShape({3}, out_size_element_ty),
             ValueRange({batch, in_count, channels})));
 
-    Value gathered_values = rewriter.create<GatherV2Op>(
-        loc,
+    Value gathered_values = GatherV2Op::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {batch_cst, out_spatial_cst, channels_cst}, input_element_ty),
         input_collapsed_spatial, indices, /*axis=*/one);
 
     gathered_values =
-        rewriter.create<ReshapeOp>(loc, result_ty, gathered_values, out_shape);
+        ReshapeOp::create(rewriter, loc, result_ty, gathered_values, out_shape);
 
     rewriter.replaceOp(op, gathered_values);
     return success();
@@ -1681,18 +1697,18 @@ struct LowerRollOp : public RewritePattern {
       begin_values[axis_i] = begin_i;
       auto begin_attr = DenseIntElementsAttr::get(axis_type, begin_values);
       auto begin =
-          rewriter.create<ConstOp>(op->getLoc(), axis_type, begin_attr);
+          ConstOp::create(rewriter, op->getLoc(), axis_type, begin_attr);
 
       SmallVector<int64_t, 4> output_shape;
       output_shape.append(input_shape.begin(), input_shape.end());
       output_shape[axis_i] = size_i;
       auto size_attr = DenseIntElementsAttr::get(axis_type, output_shape);
-      auto size = rewriter.create<ConstOp>(op->getLoc(), axis_type, size_attr);
+      auto size = ConstOp::create(rewriter, op->getLoc(), axis_type, size_attr);
 
       auto slice_op_ty = tensorflow::GetTypeFromTFTensorShape(
           output_shape, input_ty.getElementType());
-      return rewriter.create<SliceOp>(op->getLoc(), slice_op_ty, input, begin,
-                                      size);
+      return SliceOp::create(rewriter, op->getLoc(), slice_op_ty, input, begin,
+                             size);
     };
 
     auto result = tf_roll_op.getInput();
@@ -1708,9 +1724,9 @@ struct LowerRollOp : public RewritePattern {
 
       auto dim_attr = DenseIntElementsAttr::get(scalar_type, {axis_i});
       auto concat_dim =
-          rewriter.create<ConstOp>(op->getLoc(), scalar_type, dim_attr);
-      auto concat_op = rewriter.create<ConcatV2Op>(
-          op->getLoc(), input_ty,
+          ConstOp::create(rewriter, op->getLoc(), scalar_type, dim_attr);
+      auto concat_op = ConcatV2Op::create(
+          rewriter, op->getLoc(), input_ty,
           ArrayRef<Value>({slice_op_1.getOutput(), slice_op_2.getOutput()}),
           concat_dim);
       result = concat_op.getResult();
@@ -1741,7 +1757,7 @@ class LowerSoftmaxOp : public OpRewritePattern<OpTy> {
     // Note that the TensorFlow Softmax op verifies that the input rank is
     // greater than or equal to one so the following sequence is valid.
     auto reduce_dim =
-        rewriter.create<TF::ConstOp>(loc, GetI64ElementsAttr({-1}, &rewriter));
+        TF::ConstOp::create(rewriter, loc, GetI64ElementsAttr({-1}, &rewriter));
 
     // Exponential of input values and then their sum can be very large here.
     // Division with large denominator is numerically unstable. To improve
@@ -1750,20 +1766,19 @@ class LowerSoftmaxOp : public OpRewritePattern<OpTy> {
     // after adding or subtracting all inputs in a batch using a common value
     // gives mathematically equivalent result.
     auto max_logits =
-        rewriter.create<TF::MaxOp>(loc, logits, reduce_dim,
-                                   /*keep_dims=*/rewriter.getBoolAttr(true));
-    auto shifted_logits = rewriter.create<TF::SubOp>(loc, logits, max_logits);
+        TF::MaxOp::create(rewriter, loc, logits, reduce_dim,
+                          /*keep_dims=*/rewriter.getBoolAttr(true));
+    auto shifted_logits = TF::SubOp::create(rewriter, loc, logits, max_logits);
 
     // Exponentiate the inputs.
-    Value exp = rewriter.create<TF::ExpOp>(loc, shifted_logits);
+    Value exp = TF::ExpOp::create(rewriter, loc, shifted_logits);
 
     // Compute summation of the exponentials.
-    Value sum =
-        rewriter.create<TF::SumOp>(loc, exp, reduce_dim,
-                                   /*keep_dims=*/rewriter.getBoolAttr(true));
+    Value sum = TF::SumOp::create(rewriter, loc, exp, reduce_dim,
+                                  /*keep_dims=*/rewriter.getBoolAttr(true));
 
     if (use_log) {
-      Value log = rewriter.create<TF::LogOp>(loc, sum);
+      Value log = TF::LogOp::create(rewriter, loc, sum);
       rewriter.replaceOpWithNewOp<TF::SubOp>(op, shifted_logits, log);
     } else {
       rewriter.replaceOpWithNewOp<TF::DivOp>(op, exp, sum);
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
index 4ddd3577957163..bd8ae6260ce259 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc
@@ -105,13 +105,13 @@ class RewriteXlaHostComputeMlir
       rewriter.setInsertionPointToStart(&cloned_func.getBody().front());
       auto result_type =
           RankedTensorType::get({3}, rewriter.getType<TF::StringType>());
-      auto dynamic_key =
-          rewriter.create<TF::_XlaCompileMlirPlaceholderProgramKeyOp>(
-              func.getLoc(), /*program=*/result_type, llvm::ArrayRef<Value>{});
+      auto dynamic_key = TF::_XlaCompileMlirPlaceholderProgramKeyOp::create(
+          rewriter, func.getLoc(), /*program=*/result_type,
+          llvm::ArrayRef<Value>{});
 
-      auto recv_at_host = rewriter.create<TF::_XlaRecvAtHostOp>(
-          func.getLoc(), op.getOperandTypes(), /*dynamic_key=*/dynamic_key,
-          op.getSendKeyAttr(),
+      auto recv_at_host = TF::_XlaRecvAtHostOp::create(
+          rewriter, func.getLoc(), op.getOperandTypes(),
+          /*dynamic_key=*/dynamic_key, op.getSendKeyAttr(),
           /*device_ordinal=*/rewriter.getI64IntegerAttr(0),
           rewriter.getStringAttr("TPU"));
       for (auto result :
@@ -120,8 +120,8 @@ class RewriteXlaHostComputeMlir
       }
 
       rewriter.setInsertionPoint(cloned_func.getBody().front().getTerminator());
-      rewriter.create<TF::_XlaSendFromHostOp>(
-          func.getLoc(),
+      TF::_XlaSendFromHostOp::create(
+          rewriter, func.getLoc(),
           cloned_func.getBody().front().getTerminator()->getOperands(),
           /*dynamic_key=*/dynamic_key, op.getRecvKeyAttr(),
           /*device_ordinal=*/rewriter.getI64IntegerAttr(0),
@@ -157,8 +157,8 @@ void UpdateArgAttributes(mlir::func::FuncOp func) {
         // 'sharding' attribute.
         // TODO(b/414807890): Not sure whether we need to pass a V2 sharding to
         // the _XlaShardingV2, do this when we actually have a use case.
-        auto updated_arg = builder.create<TF::XlaShardingOp>(
-            func.getLoc(), arg.getType(), arg, /*sharding=*/sharding,
+        auto updated_arg = TF::XlaShardingOp::create(
+            builder, func.getLoc(), arg.getType(), arg, /*sharding=*/sharding,
             /*_XlaSharding=*/sharding, /*_XlaShardingV2=*/mlir::StringAttr());
         func.getArgument(i).replaceAllUsesExcept(
             updated_arg, llvm::SmallPtrSet<Operation*, 1>({updated_arg}));
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
index 803f135af624d7..656f87deb0b79f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc
@@ -118,8 +118,8 @@ void MakeShapeOpInvariant(tf_device::ReplicateOp replicate_op, int num_replicas,
     if (block_arg.getOwner() != replicate_block) return;
 
     OpBuilder builder(shape_op);
-    auto new_shape_op = builder.create<TF::VariableShapeOp>(
-        shape_op.getLoc(), shape_op.getType(),
+    auto new_shape_op = TF::VariableShapeOp::create(
+        builder, shape_op.getLoc(), shape_op.getType(),
         replicate_op.GetReplicaOperandForBlockArgument(block_arg,
                                                        /*replica=*/0));
     shape_op.replaceAllUsesWith(new_shape_op.getOperation());
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
index deef690b4d9636..1945aa6d811c19 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/rewrite_tpu_embedding_ops.cc
@@ -45,8 +45,8 @@ OpT AddOperandAndRewriteAs(Operation* op, Value operand, NamedAttrList attr,
   builder->setInsertionPoint(op);
   auto operands = llvm::to_vector<4>(op->getOperands());
   operands.push_back(operand);
-  auto new_op = builder->create<OpT>(op->getLoc(), op->getResultTypes(),
-                                     operands, attr.getAttrs());
+  auto new_op = OpT::create(*builder, op->getLoc(), op->getResultTypes(),
+                            operands, attr.getAttrs());
   op->replaceAllUsesWith(new_op.getOperation()->getResults());
   op->erase();
   return new_op;
@@ -82,8 +82,8 @@ LogicalResult RunOnRegion(Region* region) {
   OpBuilder builder(region);
   auto output_ty =
       RankedTensorType::get({}, VariantType::get(region->getContext()));
-  auto dedup_op = builder.create<XlaRecvTPUEmbeddingDeduplicationDataOp>(
-      loc, output_ty, config);
+  auto dedup_op = XlaRecvTPUEmbeddingDeduplicationDataOp::create(
+      builder, loc, output_ty, config);
 
   // Rewrite RecvTPUEmbeddingActivations op to the corresponding internal op.
   if (recv_op)
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
index 1e7958660fd8c4..ce3b6bb5dd5070 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/sparsecore/embedding_program_key.cc
@@ -213,13 +213,13 @@ tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder,
   }
 
   builder->setInsertionPointAfter(before_op);
-  auto launch = builder->create<tf_device::LaunchOp>(
-      before_op->getLoc(), builder->getStringAttr(host_device),
-      launch_result_types);
+  auto launch = tf_device::LaunchOp::create(*builder, before_op->getLoc(),
+                                            builder->getStringAttr(host_device),
+                                            launch_result_types);
   launch.getBody().push_back(launch_block);
 
   builder->setInsertionPointToEnd(&launch.GetBody());
-  builder->create<tf_device::ReturnOp>(before_op->getLoc(), launch_results);
+  tf_device::ReturnOp::create(*builder, before_op->getLoc(), launch_results);
 
   return launch;
 }
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc
index 72302903b37fa5..d57390cbc919ad 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc
@@ -37,16 +37,17 @@ struct FuseParallelMapAndBatch : public OpRewritePattern<BatchDatasetV2Op> {
 
     // The type of the `num_parallel_calls` argument in ParallelMapDataset
     // and MapAndBatchDataset is different (int32 and int64 respectively)
-    auto num_parallel_calls_op = rewriter.create<CastOp>(
-        op.getLoc(), UnrankedTensorType::get(rewriter.getIntegerType(64)),
+    auto num_parallel_calls_op = CastOp::create(
+        rewriter, op.getLoc(),
+        UnrankedTensorType::get(rewriter.getIntegerType(64)),
         batchInputOp.getNumParallelCalls(), rewriter.getBoolAttr(false));
 
     if (op.getMetadata() != batchInputOp.getMetadata()) {
       return failure();
     }
 
-    auto fused_op = rewriter.create<MapAndBatchDatasetOp>(
-        op.getLoc(), op.getType(), batchInputOp.getInputDataset(),
+    auto fused_op = MapAndBatchDatasetOp::create(
+        rewriter, op.getLoc(), op.getType(), batchInputOp.getInputDataset(),
         batchInputOp.getOtherArguments(), op.getBatchSize(),
         num_parallel_calls_op.getY(), op.getDropRemainder(),
         batchInputOp.getF(), op.getOutputTypes(), op.getOutputShapes(),
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc
index bb4c951065f771..2ee19787c7552f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc
@@ -131,8 +131,8 @@ void SinkResourceWritesIntoParallelExecute(
     new_result_types.push_back(old_result.getType());
 
   OpBuilder builder(parallel_execute);
-  auto new_parallel_execute = builder.create<tf_device::ParallelExecuteOp>(
-      parallel_execute.getLoc(), num_regions, new_result_types);
+  auto new_parallel_execute = tf_device::ParallelExecuteOp::create(
+      builder, parallel_execute.getLoc(), num_regions, new_result_types);
 
   for (auto region : llvm::zip(new_parallel_execute.getRegions(),
                                parallel_execute.getRegions()))
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc
index 5f708ce0ee1a74..8cd90d0a96e9e9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc
@@ -106,9 +106,9 @@ void TPUResourceReadForWritePass::runOnOperation() {
       if (!resource_and_type.resource) continue;
       if (ClusterFuncHasResourceRead(cluster_func, resource_and_type.resource))
         continue;
-      auto new_read = builder.create<TF::ReadVariableOp>(
-          resource_and_type.resource.getLoc(), resource_and_type.subtype,
-          resource_and_type.resource);
+      auto new_read = TF::ReadVariableOp::create(
+          builder, resource_and_type.resource.getLoc(),
+          resource_and_type.subtype, resource_and_type.resource);
       read_operands.push_back(new_read.getValue());
     }
 
@@ -119,8 +119,9 @@ void TPUResourceReadForWritePass::runOnOperation() {
     operands.append(read_operands.begin(), read_operands.end());
 
     auto loc = cluster_func.getLoc();
-    auto new_cluster_func = builder.create<tf_device::ClusterFuncOp>(
-        loc, cluster_func.getResultTypes(), operands, cluster_func->getAttrs());
+    auto new_cluster_func = tf_device::ClusterFuncOp::create(
+        builder, loc, cluster_func.getResultTypes(), operands,
+        cluster_func->getAttrs());
     cluster_func.replaceAllUsesWith(new_cluster_func);
     func::FuncOp func = cluster_func.getFuncOp();
     Block& block = func.front();
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc
index 03618d23464b0a..85db75ea51a543 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/unroll_batch_matmul.cc
@@ -89,9 +89,9 @@ TF::ReshapeOp ConvertTFBatchMatMulOp<BatchMatMulOpType>::createReshapeOp(
   Type resultType = RankedTensorType::get(shape, element_type);
   auto constant_attr = DenseElementsAttr::get(shape_spec_type, shape);
   auto shape_tensor =
-      rewriter.create<TF::ConstOp>(loc, shape_spec_type, constant_attr);
-  return rewriter.create<TF::ReshapeOp>(loc, resultType, /*tensor=*/value,
-                                        /*shape=*/shape_tensor);
+      TF::ConstOp::create(rewriter, loc, shape_spec_type, constant_attr);
+  return TF::ReshapeOp::create(rewriter, loc, resultType, /*tensor=*/value,
+                               /*shape=*/shape_tensor);
 }
 
 template <typename BatchMatMulOpType>
@@ -122,16 +122,16 @@ std::vector<Value> ConvertTFBatchMatMulOp<BatchMatMulOpType>::sliceInput(
     auto split_dimension_type =
         RankedTensorType::get({}, rewriter.getIntegerType(32));
     auto split_dimension_attr = DenseElementsAttr::get(split_dimension_type, 0);
-    auto split_dimension_op = rewriter.create<TF::ConstOp>(
-        loc, split_dimension_type, split_dimension_attr);
+    auto split_dimension_op = TF::ConstOp::create(
+        rewriter, loc, split_dimension_type, split_dimension_attr);
 
     // Split along each batch.
     SmallVector<int64_t, 3> slice_size = {1, num_rows, num_cols};
     Type slice_result_type = RankedTensorType::get(slice_size, element_type);
     llvm::SmallVector<Type, 4> output_types(batch_size, slice_result_type);
-    auto split_op = rewriter.create<TF::SplitOp>(loc, output_types,
-                                                 split_dimension_op.getOutput(),
-                                                 reshape_op.getOutput());
+    auto split_op = TF::SplitOp::create(rewriter, loc, output_types,
+                                        split_dimension_op.getOutput(),
+                                        reshape_op.getOutput());
 
     // Squeeze each batch, i.e. reshape
     // [1, num_rows, num_cols] -> [num_rows, num_cols]
@@ -259,11 +259,11 @@ LogicalResult ConvertTFBatchMatMulOp<BatchMatMulOpType>::matchAndRewrite(
       lhs_batch_idx = batch_idx;
       rhs_batch_idx = batch_idx;
     }
-    auto matmul = rewriter.create<TF::MatMulOp>(loc, matmul_type,
-                                                /*a=*/sliced_lhs[lhs_batch_idx],
-                                                /*b=*/sliced_rhs[rhs_batch_idx],
-                                                /*transpose_a=*/op.getAdjX(),
-                                                /*transpose_b=*/op.getAdjY());
+    auto matmul = TF::MatMulOp::create(rewriter, loc, matmul_type,
+                                       /*a=*/sliced_lhs[lhs_batch_idx],
+                                       /*b=*/sliced_rhs[rhs_batch_idx],
+                                       /*transpose_a=*/op.getAdjX(),
+                                       /*transpose_b=*/op.getAdjY());
     matmuls.emplace_back(matmul.getProduct());
   }
 
@@ -272,7 +272,7 @@ LogicalResult ConvertTFBatchMatMulOp<BatchMatMulOpType>::matchAndRewrite(
       {bcast.output_batch_size(), rows, cols}, element_type);
   const auto axis = rewriter.getI64IntegerAttr(0);
   auto pack_op =
-      rewriter.create<TF::PackOp>(loc, packed_type, /*values=*/matmuls, axis);
+      TF::PackOp::create(rewriter, loc, packed_type, /*values=*/matmuls, axis);
 
   // Reshape the rank-3 tensor into the correct output shape.
   const auto& result_batch_shape = bcast.output_batch_shape().dim_sizes();
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc
index 4bca511ca252b5..52d1bfc8ffde3a 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/parallel_execute_util.cc
@@ -25,15 +25,15 @@ tf_device::ParallelExecuteOp BuildParallelExecuteOp(
     tf_device::ClusterFuncOp cluster_func, OpBuilder* builder) {
   const auto output_types = cluster_func.getResultTypes();
   builder->setInsertionPoint(cluster_func);
-  auto parallel_execute = builder->create<tf_device::ParallelExecuteOp>(
-      cluster_func.getLoc(), 1, output_types);
+  auto parallel_execute = tf_device::ParallelExecuteOp::create(
+      *builder, cluster_func.getLoc(), 1, output_types);
   cluster_func->remove();
   auto& block = parallel_execute.GetRegionBlockWithIndex(0);
   builder->setInsertionPointToEnd(&block);
   builder->insert(cluster_func);
   cluster_func.replaceAllUsesWith(parallel_execute);
-  builder->create<tf_device::ReturnOp>(block.getParent()->getLoc(),
-                                       cluster_func.getResults());
+  tf_device::ReturnOp::create(*builder, block.getParent()->getLoc(),
+                              cluster_func.getResults());
   return parallel_execute;
 }
 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
index 932c941c6f5f7a..a7b676d8541909 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc
@@ -793,8 +793,8 @@ TEST(TPURewriteDeviceUtilTest, TestHasModelParallelismFalse) {
   mlir::OpBuilder builder(module_ref->getBodyRegion());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -811,8 +811,8 @@ TEST(TPURewriteDeviceUtilTest, TestHasModelParallelismTrue) {
   mlir::OpBuilder builder(module_ref->getBodyRegion());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 5));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -830,8 +830,8 @@ TEST(TPURewriteDeviceUtilTest,
   mlir::OpBuilder builder(module_ref->getBodyRegion());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -848,8 +848,8 @@ TEST(TPURewriteDeviceUtilTest,
       mlir::ModuleOp::create(mlir::UnknownLoc::get(&context));
   mlir::OpBuilder builder(module_ref->getBodyRegion());
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({}));
 
   mlir::TF::RuntimeDevices devices;
@@ -865,8 +865,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostFailDeviceMissingAttributes) {
       mlir::ModuleOp::create(mlir::UnknownLoc::get(&context));
   mlir::OpBuilder builder(module_ref->getBodyRegion());
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
 
@@ -884,8 +884,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingTopology) {
   mlir::OpBuilder builder(module_ref->getBodyRegion());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kDeviceAssignmentAttr, builder.getArrayAttr({}));
@@ -904,8 +904,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailMissingDeviceAssignment) {
   mlir::OpBuilder builder(module_ref->getBodyRegion());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -924,8 +924,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceAssignment) {
   mlir::OpBuilder builder(module_ref->getBodyRegion());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -951,8 +951,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceFailBadDeviceName) {
                     llvm::ArrayRef<llvm::StringRef>({"bad_device_name"})));
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -974,16 +974,16 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) {
 
   llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<llvm::StringRef, 4>>
       devices;
-  auto replicate = builder.create<mlir::tf_device::ReplicateOp>(
-      mlir::UnknownLoc::get(&context), /*num_replicas=*/2, devices,
+  auto replicate = mlir::tf_device::ReplicateOp::create(
+      builder, mlir::UnknownLoc::get(&context), /*num_replicas=*/2, devices,
       llvm::ArrayRef<std::pair<mlir::ValueRange, mlir::Type>>{},
       mlir::ValueRange{}, mlir::TypeRange{});
   builder.setInsertionPoint(&replicate.getBody().front(),
                             replicate.getBody().front().begin());
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
 
   mlir::TF::RuntimeDevices runtime_devices;
   std::string host_device;
@@ -1007,8 +1007,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceNotReplicated) {
                      "/job:worker/replica:0/task:0/device:CPU:0"})));
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
   cluster->setAttr(kNumCoresPerReplicaAttr,
                    builder.getIntegerAttr(builder.getIntegerType(64), 1));
   cluster->setAttr(kTopologyAttr, builder.getStringAttr(""));
@@ -1034,8 +1034,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceInGenericPipeline) {
                     {"/job:localhost/replica:0/task:0/device:CPU:0"})));
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
 
   mlir::TF::RuntimeDevices runtime_devices;
   (void)GetDevicesFromOp(*module_ref, &runtime_devices);
@@ -1060,8 +1060,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceInGenericPipelineMultiCPUs) {
                      "/job:worker/replica:0/task:2/device:CPU:0"})));
 
   llvm::SmallVector<mlir::Type, 8> result_types;
-  auto cluster = builder.create<mlir::tf_device::ClusterOp>(
-      mlir::UnknownLoc::get(&context), result_types);
+  auto cluster = mlir::tf_device::ClusterOp::create(
+      builder, mlir::UnknownLoc::get(&context), result_types);
 
   mlir::TF::RuntimeDevices runtime_devices;
   (void)GetDevicesFromOp(*module_ref, &runtime_devices);
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc
index ba4d1b71a857cd..82b7202d6d78e9 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_rewrite_util.cc
@@ -83,8 +83,8 @@ int MovePreservedParallelExecuteChildren(
   // `num_moved_children` is the number of children that will be preserved.
   const size_t num_moved_children =
       old_parallel_execute.getRegions().size() - 1;
-  *new_parallel_execute = builder->create<mlir::tf_device::ParallelExecuteOp>(
-      old_parallel_execute->getLoc(),
+  *new_parallel_execute = mlir::tf_device::ParallelExecuteOp::create(
+      *builder, old_parallel_execute->getLoc(),
       num_moved_children + num_cores_per_replica, concatenated_output_types);
 
   // `cluster_idx` is the index of the child with the `ClusterFuncOp`, which
@@ -118,12 +118,12 @@ mlir::tf_device::LaunchOp WrapOpInLaunch(mlir::OpBuilder* builder,
                                          llvm::StringRef device) {
   mlir::OpBuilder::InsertPoint insert_point = builder->saveInsertionPoint();
 
-  auto launch = builder->create<mlir::tf_device::LaunchOp>(
-      loc, builder->getStringAttr(device), op->getResultTypes());
+  auto launch = mlir::tf_device::LaunchOp::create(
+      *builder, loc, builder->getStringAttr(device), op->getResultTypes());
   launch.getBody().push_back(new mlir::Block);
 
   builder->setInsertionPointToEnd(&launch.GetBody());
-  builder->create<mlir::tf_device::ReturnOp>(loc, op->getResults());
+  mlir::tf_device::ReturnOp::create(*builder, loc, op->getResults());
 
   // Move op inside cluster.
   op->moveBefore(launch.GetBody().getTerminator());
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
index 89e00e9b4d628c..3bca701131151f 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc
@@ -94,22 +94,23 @@ mlir::TF::SliceOp CreateSliceOp(mlir::OpBuilder* builder,
   auto start_position_type =
       mlir::RankedTensorType::get(shape.dims(), builder->getIntegerType(64));
 
-  auto start_position_op = builder->create<mlir::TF::ConstOp>(
-      input.getLoc(), mlir::DenseIntElementsAttr::get(start_position_type,
-                                                      slice_start_position));
-
-  auto slice_size_op = builder->create<mlir::TF::ConstOp>(
-      input.getLoc(), mlir::DenseIntElementsAttr::get(
-                          mlir::RankedTensorType::get(
-                              shape.dims(), builder->getIntegerType(64)),
-                          slice_size));
+  auto start_position_op =
+      mlir::TF::ConstOp::create(*builder, input.getLoc(),
+                                mlir::DenseIntElementsAttr::get(
+                                    start_position_type, slice_start_position));
+
+  auto slice_size_op = mlir::TF::ConstOp::create(
+      *builder, input.getLoc(),
+      mlir::DenseIntElementsAttr::get(
+          mlir::RankedTensorType::get(shape.dims(),
+                                      builder->getIntegerType(64)),
+          slice_size));
 
   auto slice_result_type =
       mlir::RankedTensorType::get(slice_size, getElementTypeOrSelf(input));
 
-  return builder->create<mlir::TF::SliceOp>(input.getLoc(), slice_result_type,
-                                            input, start_position_op,
-                                            slice_size_op);
+  return mlir::TF::SliceOp::create(*builder, input.getLoc(), slice_result_type,
+                                   input, start_position_op, slice_size_op);
 }
 
 mlir::TF::PadOp CreatePadOp(mlir::OpBuilder* builder,
@@ -135,15 +136,15 @@ mlir::TF::PadOp CreatePadOp(mlir::OpBuilder* builder,
   auto padding_type =
       mlir::RankedTensorType::get({num_dims, 2}, builder->getIntegerType(64));
   auto paddings = mlir::DenseIntElementsAttr::get(padding_type, padding_values);
-  auto paddings_value = builder->create<mlir::TF::ConstOp>(location, paddings);
+  auto paddings_value = mlir::TF::ConstOp::create(*builder, location, paddings);
   mlir::SmallVector<int64_t, 4> expand_shape(padded_shape.begin(),
                                              padded_shape.end());
 
   auto expand_result_type =
       mlir::RankedTensorType::get(expand_shape, input_type.getElementType());
 
-  return builder->create<mlir::TF::PadOp>(location, expand_result_type,
-                                          src_input, paddings_value);
+  return mlir::TF::PadOp::create(*builder, location, expand_result_type,
+                                 src_input, paddings_value);
 }
 
 // Creates a tf::SplitOp that splits 'src_input' into 'num_splits' ways
@@ -198,8 +199,8 @@ mlir::LogicalResult CreateSplitOp(
     output_type = input_type;
   }
 
-  auto split_dimension_op = builder->create<mlir::TF::ConstOp>(
-      location, split_dim_type, split_dimension_attr);
+  auto split_dimension_op = mlir::TF::ConstOp::create(
+      *builder, location, split_dim_type, split_dimension_attr);
   if (is_ici_weight_dist_spmd) {
     split_dimension_op->setAttr(kICIWeightDistributionMlirBridgeMarker,
                                 builder->getBoolAttr(true));
@@ -207,8 +208,9 @@ mlir::LogicalResult CreateSplitOp(
 
   // Creates a split op that splits |src_input| along |split_dimension|.
   llvm::SmallVector<mlir::Type, 4> output_types(num_split, output_type);
-  *split_op = builder->create<mlir::TF::SplitOp>(
-      location, output_types, split_dimension_op.getOutput(), src_input);
+  *split_op =
+      mlir::TF::SplitOp::create(*builder, location, output_types,
+                                split_dimension_op.getOutput(), src_input);
   (*split_op)->setAttr(
       kNumSplitAttr,
       builder->getIntegerAttr(builder->getIntegerType(32), num_split));
@@ -230,8 +232,8 @@ mlir::TF::ConcatOp CreateConcatOp(const int concat_dimension,
       mlir::RankedTensorType::get({}, builder->getIntegerType(32));
   auto concat_dimension_attr =
       mlir::DenseElementsAttr::get(concat_dim_type, concat_dimension);
-  auto concat_dimension_op = builder->create<mlir::TF::ConstOp>(
-      location, concat_dim_type, concat_dimension_attr);
+  auto concat_dimension_op = mlir::TF::ConstOp::create(
+      *builder, location, concat_dim_type, concat_dimension_attr);
 
   // Correctly set output shapes of concat op output if output shape is
   // statically known. Since the shape of TPUExecute op must be the same
@@ -253,8 +255,8 @@ mlir::TF::ConcatOp CreateConcatOp(const int concat_dimension,
     output_type = input_type;
   }
 
-  return builder->create<mlir::TF::ConcatOp>(
-      location, output_type, concat_dimension_op.getOutput(), inputs);
+  return mlir::TF::ConcatOp::create(*builder, location, output_type,
+                                    concat_dimension_op.getOutput(), inputs);
 }
 
 mlir::TF::XlaConcatNDOp CreateXlaConcatNDOp(
@@ -292,9 +294,9 @@ mlir::TF::XlaConcatNDOp CreateXlaConcatNDOp(
     output_type = input_slice_type;
   }
 
-  auto op = builder.create<mlir::TF::XlaConcatNDOp>(
-      location, output_type, inputs, builder.getI64ArrayAttr(num_concats),
-      builder.getI64ArrayAttr(paddings));
+  auto op = mlir::TF::XlaConcatNDOp::create(
+      builder, location, output_type, inputs,
+      builder.getI64ArrayAttr(num_concats), builder.getI64ArrayAttr(paddings));
   return op;
 }
 
@@ -338,9 +340,9 @@ mlir::LogicalResult CreateXlaSplitNDOp(const mlir::Location& location,
           << absl::StrJoin(input_shape, ",")
           << ", Padding: " << absl::StrJoin(paddings, ",");
 
-  *xla_split_op = builder->create<mlir::TF::XlaSplitNDOp>(
-      location, output_types, src_input, builder->getI64ArrayAttr(num_splits),
-      builder->getI64ArrayAttr(paddings));
+  *xla_split_op = mlir::TF::XlaSplitNDOp::create(
+      *builder, location, output_types, src_input,
+      builder->getI64ArrayAttr(num_splits), builder->getI64ArrayAttr(paddings));
   if (is_ici_weight_dist_spmd) {
     (*xla_split_op)
         ->setAttr(kICIWeightDistributionMlirBridgeMarker,
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc
index 2ab0c3c619b292..e4fe30755c2eb7 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc
@@ -229,8 +229,8 @@ static std::optional<int64_t> GetIntegerHLOAxisFromTFAxis(Value value,
 /// the shape of the input value.
 static stablehlo::ConvertOp CastValueToI64(Location loc, Value value,
                                            PatternRewriter *rewriter) {
-  return rewriter->create<stablehlo::ConvertOp>(loc, value,
-                                                rewriter->getIntegerType(64));
+  return stablehlo::ConvertOp::create(*rewriter, loc, value,
+                                      rewriter->getIntegerType(64));
 }
 
 // Creates an unpack op along the 0th dimension of the tensor. The `value` input
@@ -242,9 +242,9 @@ static TF::UnpackOp UnpackTensorAlongZeroDim(Location loc, Value value,
   SmallVector<Type, 2> unpacked_indices_type(
       num_outputs,
       tensorflow::GetTypeFromTFTensorShape({}, indices_type.getElementType()));
-  auto unpacked_indices = rewriter->create<TF::UnpackOp>(
-      loc, unpacked_indices_type, value,
-      IntegerAttr::get(rewriter->getIntegerType(64), 0));
+  auto unpacked_indices =
+      TF::UnpackOp::create(*rewriter, loc, unpacked_indices_type, value,
+                           IntegerAttr::get(rewriter->getIntegerType(64), 0));
   return unpacked_indices;
 }
 
@@ -277,8 +277,8 @@ tensorflow::TensorShape ToTensorShape(
 static stablehlo::ConstantOp GetScalarLimitConstOfType(Type ty, Location loc,
                                                        hlo::ScalarLimit limit,
                                                        OpBuilder *builder) {
-  return builder->create<stablehlo::ConstantOp>(
-      loc, hlo::getScalarLimitOfType(ty, limit));
+  return stablehlo::ConstantOp::create(*builder, loc,
+                                       hlo::getScalarLimitOfType(ty, limit));
 }
 
 // Deprecated: This is maintained to aid in porting old code that is not yet
@@ -396,12 +396,12 @@ static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to,
                                      OpBuilder &builder) {
   auto broadcast_dims = GetI64ArrayAttr({feature_dim}, &builder);
   auto to_type = mlir::cast<RankedTensorType>(broadcast_to.getType());
-  auto result_shape = builder.create<shape::ShapeOfOp>(loc, broadcast_to);
+  auto result_shape = shape::ShapeOfOp::create(builder, loc, broadcast_to);
   auto result_extents_type = GetExtentsTensorTypeFor(to_type);
-  auto result_extents = builder.create<shape::ToExtentTensorOp>(
-      loc, result_extents_type, result_shape);
-  return builder.create<stablehlo::DynamicBroadcastInDimOp>(
-      loc, to_type, broadcast_from, result_extents, broadcast_dims);
+  auto result_extents = shape::ToExtentTensorOp::create(
+      builder, loc, result_extents_type, result_shape);
+  return stablehlo::DynamicBroadcastInDimOp::create(
+      builder, loc, to_type, broadcast_from, result_extents, broadcast_dims);
 }
 
 // Broadcasts `input` to the shape of `broadcast_to` value following
@@ -413,15 +413,15 @@ static Value Broadcast1DToFeatureDim(Location loc, Value broadcast_to,
 // supports unranked inputs in the lowering.
 static Value BroadcastToShapeOf(Location loc, Value input, Value broadcast_to,
                                 OpBuilder &builder) {
-  auto result_shape = builder.create<shape::ShapeOfOp>(loc, broadcast_to);
+  auto result_shape = shape::ShapeOfOp::create(builder, loc, broadcast_to);
   auto to_type = mlir::cast<TensorType>(broadcast_to.getType());
   auto result_extents_type = GetExtentsTensorTypeFor(to_type);
-  auto result_extents = builder.create<shape::ToExtentTensorOp>(
-      loc, result_extents_type, result_shape);
+  auto result_extents = shape::ToExtentTensorOp::create(
+      builder, loc, result_extents_type, result_shape);
   int64_t rank = mlir::cast<RankedTensorType>(input.getType()).getRank();
   auto broadcast_dims = GetI64ArrayAttrForSeq(0, rank, &builder);
-  return builder.create<stablehlo::DynamicBroadcastInDimOp>(
-      loc, to_type, input, result_extents, broadcast_dims);
+  return stablehlo::DynamicBroadcastInDimOp::create(
+      builder, loc, to_type, input, result_extents, broadcast_dims);
 }
 
 // Builds a set of operations for applying reduction on the input value. A
@@ -430,9 +430,9 @@ static Value ApplyReduction(Location loc, Value input,
                             DenseIntElementsAttr reduce_dims,
                             OpBuilder *builder) {
   auto reduce_dims_op =
-      builder->create<stablehlo::ConstantOp>(loc, reduce_dims);
-  return builder->create<TF::SumOp>(loc, input, reduce_dims_op,
-                                    builder->getBoolAttr(false));
+      stablehlo::ConstantOp::create(*builder, loc, reduce_dims);
+  return TF::SumOp::create(*builder, loc, input, reduce_dims_op,
+                           builder->getBoolAttr(false));
 }
 
 // Creates a stablehlo.rng_uniform op with `builder` to generate `num_elements`
@@ -440,17 +440,16 @@ static Value ApplyReduction(Location loc, Value input,
 static stablehlo::RngOp CreateRngUniform32(Location loc, int num_elements,
                                            int lower_limit, int upper_limit,
                                            OpBuilder *builder) {
-  auto shape_tensor = builder->create<stablehlo::ConstantOp>(
-      loc, GetI64ElementsAttr({num_elements}, builder));
+  auto shape_tensor = stablehlo::ConstantOp::create(
+      *builder, loc, GetI64ElementsAttr({num_elements}, builder));
 
-  auto lower = builder->create<stablehlo::ConstantOp>(
-      loc, builder->getI32IntegerAttr(lower_limit));
-  auto upper = builder->create<stablehlo::ConstantOp>(
-      loc, builder->getI32IntegerAttr(upper_limit));
+  auto lower = stablehlo::ConstantOp::create(
+      *builder, loc, builder->getI32IntegerAttr(lower_limit));
+  auto upper = stablehlo::ConstantOp::create(
+      *builder, loc, builder->getI32IntegerAttr(upper_limit));
 
-  return builder->create<stablehlo::RngOp>(
-      loc, lower, upper, shape_tensor,
-      ::mlir::stablehlo::RngDistribution::UNIFORM);
+  return stablehlo::RngOp::create(*builder, loc, lower, upper, shape_tensor,
+                                  ::mlir::stablehlo::RngDistribution::UNIFORM);
 }
 
 using WhileBodyFnType = llvm::function_ref<void(
@@ -489,8 +488,8 @@ static void CreateWhile32(Location loc, int num_iterations,
   init_types_with_loop_iv.reserve(value_count);
 
   // The initial value for the loop induction variable is 0.
-  init_values_with_loop_iv.push_back(builder->create<stablehlo::ConstantOp>(
-      loc, builder->getI32IntegerAttr(0)));
+  init_values_with_loop_iv.push_back(stablehlo::ConstantOp::create(
+      *builder, loc, builder->getI32IntegerAttr(0)));
   init_values_with_loop_iv.append(init_values.begin(), init_values.end());
 
   // Accumulate types of all the init values.
@@ -498,8 +497,8 @@ static void CreateWhile32(Location loc, int num_iterations,
     init_types_with_loop_iv.push_back(init_value_with_loop_iv.getType());
 
   // Create the while op.
-  auto while_op = builder->create<stablehlo::WhileOp>(
-      loc, init_types_with_loop_iv, init_values_with_loop_iv);
+  auto while_op = stablehlo::WhileOp::create(
+      *builder, loc, init_types_with_loop_iv, init_values_with_loop_iv);
   auto ivs_count = init_types_with_loop_iv.size();
 
   {
@@ -513,12 +512,13 @@ static void CreateWhile32(Location loc, int num_iterations,
 
     // Get the loop induction variable and compare it against the upper limit.
     auto loop_iv = block->getArgument(0);
-    auto upper_limit = builder->create<stablehlo::ConstantOp>(
-        loc, builder->getI32IntegerAttr(num_iterations));
-    Value compare = builder->create<stablehlo::CompareOp>(
-        loc, loop_iv, upper_limit, stablehlo::ComparisonDirection::LT);
+    auto upper_limit = stablehlo::ConstantOp::create(
+        *builder, loc, builder->getI32IntegerAttr(num_iterations));
+    Value compare =
+        stablehlo::CompareOp::create(*builder, loc, loop_iv, upper_limit,
+                                     stablehlo::ComparisonDirection::LT);
 
-    builder->create<stablehlo::ReturnOp>(loc, compare);
+    stablehlo::ReturnOp::create(*builder, loc, compare);
   }
 
   {
@@ -540,15 +540,15 @@ static void CreateWhile32(Location loc, int num_iterations,
             &new_values, builder);
 
     // Increment the loop induction variable by one.
-    auto one = builder->create<stablehlo::ConstantOp>(
-        loc, builder->getI32IntegerAttr(1));
+    auto one = stablehlo::ConstantOp::create(*builder, loc,
+                                             builder->getI32IntegerAttr(1));
     auto scalar_broadcast_dims = builder->getDenseI64ArrayAttr({});
-    auto plus_one = builder->create<chlo::BroadcastAddOp>(
-        loc, block->getArgument(0), one, scalar_broadcast_dims);
+    auto plus_one = chlo::BroadcastAddOp::create(
+        *builder, loc, block->getArgument(0), one, scalar_broadcast_dims);
     // Prepend with the updated loop induction variable.
     new_values.insert(new_values.begin(), plus_one);
 
-    builder->create<stablehlo::ReturnOp>(loc, new_values);
+    stablehlo::ReturnOp::create(*builder, loc, new_values);
   }
 
   // TODO(jpienaar): Support multi-operand while op.
@@ -748,20 +748,20 @@ static void BuildArgMinMaxReductionBody(
 
   ImplicitLocOpBuilder b(loc, *builder);
   Value compare_dt =
-      b.create<stablehlo::CompareOp>(lhs_val, rhs_val, direction);
+      stablehlo::CompareOp::create(b, lhs_val, rhs_val, direction);
   Value selected_input =
-      b.create<stablehlo::SelectOp>(input_type, compare_dt, lhs_val, rhs_val);
+      stablehlo::SelectOp::create(b, input_type, compare_dt, lhs_val, rhs_val);
 
-  Value compare_eq = b.create<stablehlo::CompareOp>(
-      lhs_val, rhs_val, stablehlo::ComparisonDirection::EQ);
-  Value min_index = b.create<stablehlo::MinOp>(lhs_index, rhs_index);
-  Value min_val_index = b.create<stablehlo::SelectOp>(index_type, compare_dt,
-                                                      lhs_index, rhs_index);
-  Value selected_index = b.create<stablehlo::SelectOp>(
-      index_type, compare_eq, min_index, min_val_index);
+  Value compare_eq = stablehlo::CompareOp::create(
+      b, lhs_val, rhs_val, stablehlo::ComparisonDirection::EQ);
+  Value min_index = stablehlo::MinOp::create(b, lhs_index, rhs_index);
+  Value min_val_index = stablehlo::SelectOp::create(b, index_type, compare_dt,
+                                                    lhs_index, rhs_index);
+  Value selected_index = stablehlo::SelectOp::create(b, index_type, compare_eq,
+                                                     min_index, min_val_index);
 
   Value return_values[] = {selected_input, selected_index};
-  b.create<stablehlo::ReturnOp>(return_values);
+  stablehlo::ReturnOp::create(b, return_values);
 }
 
 //===----------------------------------------------------------------------===//
@@ -898,9 +898,9 @@ static void BuildBodyWithCall(PatternRewriter &rewriter, const Location &loc,
   Block *block = rewriter.createBlock(body);
   auto inputs = func_ty.getInputs();
   block->addArguments(inputs, SmallVector<Location>(inputs.size(), loc));
-  mlir::func::CallOp call_op = rewriter.create<mlir::func::CallOp>(
-      loc, func, func_ty.getResults(), block->getArguments());
-  rewriter.create<stablehlo::ReturnOp>(loc, call_op.getResults());
+  mlir::func::CallOp call_op = mlir::func::CallOp::create(
+      rewriter, loc, func, func_ty.getResults(), block->getArguments());
+  stablehlo::ReturnOp::create(rewriter, loc, call_op.getResults());
 }
 
 //===----------------------------------------------------------------------===//
@@ -955,9 +955,9 @@ class ConvertBiasAddOp : public OpRewritePattern<TF::BiasAddOp> {
     auto bias_broadcast = Broadcast1DToFeatureDim(
         loc, op.getValue(), op.getBias(), feature_dim, rewriter);
     Value add =
-        rewriter.create<stablehlo::AddOp>(loc, op.getValue(), bias_broadcast);
+        stablehlo::AddOp::create(rewriter, loc, op.getValue(), bias_broadcast);
     if (add.getType() != op.getType()) {
-      add = rewriter.create<tensor::CastOp>(loc, op.getType(), add);
+      add = tensor::CastOp::create(rewriter, loc, op.getType(), add);
     }
     rewriter.replaceOp(op, {add});
     return success();
@@ -986,7 +986,7 @@ class ConvertConvDynamic : public OpRewritePattern<OpT> {
     switch (padding_type) {
       case tensorflow::Padding::VALID: {
         auto zero =
-            rewriter.create<arith::ConstantIntOp>(loc, shape_scalar_type, 0);
+            arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 0);
         *padding_low = *padding_high = zero;
         break;
       }
@@ -994,48 +994,49 @@ class ConvertConvDynamic : public OpRewritePattern<OpT> {
         break;
       case tensorflow::Padding::SAME: {
         auto zero =
-            rewriter.create<arith::ConstantIntOp>(loc, shape_scalar_type, 0);
+            arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 0);
         auto one =
-            rewriter.create<arith::ConstantIntOp>(loc, shape_scalar_type, 1);
+            arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 1);
         auto two =
-            rewriter.create<arith::ConstantIntOp>(loc, shape_scalar_type, 2);
+            arith::ConstantIntOp::create(rewriter, loc, shape_scalar_type, 2);
         // See also the parallel implementation in
         // GetWindowedOutputSizeFromDimsV2. effective_filter_size = (filter_size
         // - 1) * dilation_rate + 1
-        Value stride_value = rewriter.create<arith::ConstantIntOp>(
-            loc, shape_scalar_type, stride);
-        Value dilation_rate_value = rewriter.create<arith::ConstantIntOp>(
-            loc, shape_scalar_type, dilation_rate);
-        Value effective_filter_size_op = rewriter.create<arith::AddIOp>(
-            loc, one,
-            rewriter.create<arith::MulIOp>(
-                loc, dilation_rate_value,
-                rewriter.create<arith::SubIOp>(loc, filter_size, one)));
+        Value stride_value = arith::ConstantIntOp::create(
+            rewriter, loc, shape_scalar_type, stride);
+        Value dilation_rate_value = arith::ConstantIntOp::create(
+            rewriter, loc, shape_scalar_type, dilation_rate);
+        Value effective_filter_size_op = arith::AddIOp::create(
+            rewriter, loc, one,
+            arith::MulIOp::create(
+                rewriter, loc, dilation_rate_value,
+                arith::SubIOp::create(rewriter, loc, filter_size, one)));
         // output_size = (input_size + stride - 1) / stride;
-        Value output_size = rewriter.create<arith::DivUIOp>(
-            loc,
-            rewriter.create<arith::AddIOp>(
-                loc, input_size,
-                rewriter.create<arith::SubIOp>(loc, stride_value, one)),
+        Value output_size = arith::DivUIOp::create(
+            rewriter, loc,
+            arith::AddIOp::create(
+                rewriter, loc, input_size,
+                arith::SubIOp::create(rewriter, loc, stride_value, one)),
             stride_value);
         // std::max(int64{0}, (output_size - 1) * stride +
         //     effective_filter_size - input_size);
-        Value padding_needed = rewriter.create<arith::SubIOp>(
-            loc,
-            rewriter.create<arith::AddIOp>(
-                loc, effective_filter_size_op,
-                rewriter.create<arith::MulIOp>(
-                    loc, stride_value,
-                    rewriter.create<arith::SubIOp>(loc, output_size, one))),
+        Value padding_needed = arith::SubIOp::create(
+            rewriter, loc,
+            arith::AddIOp::create(
+                rewriter, loc, effective_filter_size_op,
+                arith::MulIOp::create(
+                    rewriter, loc, stride_value,
+                    arith::SubIOp::create(rewriter, loc, output_size, one))),
             input_size);
-        Value cond = rewriter.create<mlir::arith::CmpIOp>(
-            loc, arith::CmpIPredicate::sge, padding_needed, zero);
-        padding_needed = rewriter.create<mlir::arith::SelectOp>(
-            loc, padding_needed.getType(), cond, padding_needed, zero);
+        Value cond = mlir::arith::CmpIOp::create(
+            rewriter, loc, arith::CmpIPredicate::sge, padding_needed, zero);
+        padding_needed = mlir::arith::SelectOp::create(
+            rewriter, loc, padding_needed.getType(), cond, padding_needed,
+            zero);
         *padding_low =
-            rewriter.create<arith::DivUIOp>(loc, padding_needed, two);
+            arith::DivUIOp::create(rewriter, loc, padding_needed, two);
         *padding_high =
-            rewriter.create<arith::SubIOp>(loc, padding_needed, *padding_low);
+            arith::SubIOp::create(rewriter, loc, padding_needed, *padding_low);
         break;
       }
     }
@@ -1086,13 +1087,13 @@ class ConvertConvDynamic : public OpRewritePattern<OpT> {
     auto shape_scalar_type = rewriter.getIntegerType(32);
 
     auto get_const = [&](int64_t val) {
-      return rewriter.create<mlir::arith::ConstantIntOp>(loc, shape_scalar_type,
-                                                         val);
+      return mlir::arith::ConstantIntOp::create(rewriter, loc,
+                                                shape_scalar_type, val);
     };
     auto get_dim_value = [&](Value val, int64_t dim) {
-      Value dim_value = rewriter.create<tensor::DimOp>(loc, val, dim);
-      return rewriter.create<arith::IndexCastOp>(loc, shape_scalar_type,
-                                                 dim_value);
+      Value dim_value = tensor::DimOp::create(rewriter, loc, val, dim);
+      return arith::IndexCastOp::create(rewriter, loc, shape_scalar_type,
+                                        dim_value);
     };
 
     for (auto i : llvm::seq<int>(0, num_spatial_dims)) {
@@ -1149,8 +1150,8 @@ class ConvertConvDynamic : public OpRewritePattern<OpT> {
     auto precision_config_attr = rewriter.getNamedAttr(
         "precision_config", GetPrecisionConfig(&rewriter));
 
-    Value paddings_op = rewriter.create<tensor::FromElementsOp>(
-        op.getLoc(),
+    Value paddings_op = tensor::FromElementsOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape(2 * num_spatial_dims,
                                              rewriter.getI32Type()),
         paddings);
@@ -1166,8 +1167,8 @@ class ConvertConvDynamic : public OpRewritePattern<OpT> {
       new_shape.push_back(1);
       new_shape.push_back(filter_shape[num_spatial_dims] *
                           filter_shape[num_spatial_dims + 1]);
-      operands[1] = rewriter.create<stablehlo::ReshapeOp>(
-          op.getLoc(),
+      operands[1] = stablehlo::ReshapeOp::create(
+          rewriter, op.getLoc(),
           tensorflow::GetTypeFromTFTensorShape(new_shape,
                                                filter_ty.getElementType()),
           operands[1]);
@@ -1324,8 +1325,8 @@ class ConvertConvOp : public OpRewritePattern<OpTy> {
       new_shape.push_back(1);
       new_shape.push_back(filter_shape[num_spatial_dims] *
                           filter_shape[num_spatial_dims + 1]);
-      operands[1] = rewriter.create<stablehlo::ReshapeOp>(
-          op.getLoc(),
+      operands[1] = stablehlo::ReshapeOp::create(
+          rewriter, op.getLoc(),
           tensorflow::GetTypeFromTFTensorShape(new_shape,
                                                filter_ty.getElementType()),
           operands[1]);
@@ -1373,35 +1374,35 @@ class ConvertPadOpDynamic : public OpRewritePattern<TF::PadV2Op> {
     auto interior_attr = GetI64ElementsAttr(interior_values, &rewriter);
 
     Value interior_padding_tensor =
-        rewriter.create<stablehlo::ConstantOp>(loc, interior_attr);
+        stablehlo::ConstantOp::create(rewriter, loc, interior_attr);
     Type paddings_elem_ty = paddings_type.getElementType();
     if (!paddings_elem_ty.isInteger(64)) {
-      interior_padding_tensor = rewriter.create<stablehlo::ConvertOp>(
-          loc, interior_padding_tensor, paddings_elem_ty);
+      interior_padding_tensor = stablehlo::ConvertOp::create(
+          rewriter, loc, interior_padding_tensor, paddings_elem_ty);
     }
     llvm::SmallVector<int64_t, 2> transposed_shape = {2, input_rank};
     auto transpose_attr = GetI64ArrayAttr({1, 0}, &rewriter);
     Value transposed_paddings =
-        rewriter.create<stablehlo::TransposeOp>(loc, paddings, transpose_attr);
-    Value reshaped_paddings = rewriter.create<stablehlo::ReshapeOp>(
-        loc,
-        tensorflow::GetTypeFromTFTensorShape({input_rank * 2},
-                                             paddings_elem_ty),
-        transposed_paddings);
+        stablehlo::TransposeOp::create(rewriter, loc, paddings, transpose_attr);
+    Value reshaped_paddings =
+        stablehlo::ReshapeOp::create(rewriter, loc,
+                                     tensorflow::GetTypeFromTFTensorShape(
+                                         {input_rank * 2}, paddings_elem_ty),
+                                     transposed_paddings);
 
     auto left_padding_start_attr = GetI64ArrayAttr({0}, &rewriter);
     auto left_padding_limit_attr = GetI64ArrayAttr({input_rank}, &rewriter);
     auto left_padding_stride_attr = GetI64ArrayAttr({1}, &rewriter);
-    Value left_padding_tensor = rewriter.create<stablehlo::SliceOp>(
-        loc, reshaped_paddings, left_padding_start_attr,
+    Value left_padding_tensor = stablehlo::SliceOp::create(
+        rewriter, loc, reshaped_paddings, left_padding_start_attr,
         left_padding_limit_attr, left_padding_stride_attr);
 
     auto right_padding_start_attr = GetI64ArrayAttr({input_rank}, &rewriter);
     auto right_padding_limit_attr =
         GetI64ArrayAttr({2 * input_rank}, &rewriter);
     auto right_padding_stride_attr = GetI64ArrayAttr({1}, &rewriter);
-    Value right_padding_tensor = rewriter.create<stablehlo::SliceOp>(
-        loc, reshaped_paddings, right_padding_start_attr,
+    Value right_padding_tensor = stablehlo::SliceOp::create(
+        rewriter, loc, reshaped_paddings, right_padding_start_attr,
         right_padding_limit_attr, right_padding_stride_attr);
 
     rewriter.replaceOpWithNewOp<stablehlo::DynamicPadOp>(
@@ -1450,23 +1451,24 @@ class ConvertGatherNdOpDynamic : public OpRewritePattern<TF::GatherNdOp> {
     Value slice_sizes_value = nullptr;
     for (int64_t i = 0; i < params_rank; ++i) {
       if (i < num_index_dims) {
-        slice_sizes_vals.push_back(rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getIntegerAttr(indices_ty.getElementType(), 1)));
+        slice_sizes_vals.push_back(arith::ConstantOp::create(
+            rewriter, loc,
+            rewriter.getIntegerAttr(indices_ty.getElementType(), 1)));
       } else {
         int64_t dim_size = params_ty.getDimSize(i);
         if (dim_size != ShapedType::kDynamic) {
-          slice_sizes_vals.push_back(rewriter.create<arith::ConstantOp>(
-              loc,
+          slice_sizes_vals.push_back(arith::ConstantOp::create(
+              rewriter, loc,
               rewriter.getIntegerAttr(indices_ty.getElementType(), dim_size)));
         } else {
-          slice_sizes_vals.push_back(rewriter.create<arith::IndexCastOp>(
-              loc, indices_ty.getElementType(),
-              rewriter.create<tensor::DimOp>(loc, params, i)));
+          slice_sizes_vals.push_back(arith::IndexCastOp::create(
+              rewriter, loc, indices_ty.getElementType(),
+              tensor::DimOp::create(rewriter, loc, params, i)));
         }
       }
     }
     slice_sizes_value =
-        rewriter.create<tensor::FromElementsOp>(loc, slice_sizes_vals);
+        tensor::FromElementsOp::create(rewriter, loc, slice_sizes_vals);
 
     // collapsed_slice_dims
     SmallVector<int64_t, 4> collapsed_slice_dims;
@@ -1535,18 +1537,18 @@ class ConvertBF16FloorDivOp : public OpRewritePattern<TF::FloorDivOp> {
 
     auto out_type = op.getZ().getType();
 
-    l = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), l,
-                                              rewriter.getF32Type());
-    r = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), r,
-                                              rewriter.getF32Type());
+    l = stablehlo::ConvertOp::create(rewriter, op.getLoc(), l,
+                                     rewriter.getF32Type());
+    r = stablehlo::ConvertOp::create(rewriter, op.getLoc(), r,
+                                     rewriter.getF32Type());
 
-    auto intermediate = rewriter.create<TF::FloorDivOp>(
-        op.getLoc(),
+    auto intermediate = TF::FloorDivOp::create(
+        rewriter, op.getLoc(),
         ChangeTensorElementType(&rewriter, out_type, rewriter.getF32Type()), l,
         r);
 
-    auto floor_op = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), out_type,
-                                                          intermediate);
+    auto floor_op = stablehlo::ConvertOp::create(rewriter, op.getLoc(),
+                                                 out_type, intermediate);
     rewriter.replaceOp(op, floor_op.getResult());
     return success();
   }
@@ -1615,24 +1617,26 @@ class ConvertRollOp : public OpRewritePattern<TF::RollOp> {
     // offset = ((offset % axis_size) + axis_size) % axis_size
     ImplicitLocOpBuilder b(op.getLoc(), rewriter);
     Value offset = op.getShift();
-    auto axis_size = b.create<stablehlo::ConstantOp>(b.getIntegerAttr(
-        getElementTypeOrSelf(offset.getType()), input_shape[axis]));
-    offset = b.create<stablehlo::RemOp>(
-        b.create<stablehlo::AddOp>(
-            b.create<stablehlo::RemOp>(offset, axis_size), axis_size),
+    auto axis_size = stablehlo::ConstantOp::create(
+        b, b.getIntegerAttr(getElementTypeOrSelf(offset.getType()),
+                            input_shape[axis]));
+    offset = stablehlo::RemOp::create(
+        b,
+        stablehlo::AddOp::create(
+            b, stablehlo::RemOp::create(b, offset, axis_size), axis_size),
         axis_size);
 
     // Stack two copies of the dimension, then slice from the calculated
     // offset. This also works if shift is not constant.
     // DynamicSliceOp requires the sizes being integer, and we can get the
     // information from input shape.
-    auto concat = b.create<stablehlo::ConcatenateOp>(
-        ValueRange{op.getInput(), op.getInput()}, b.getI64IntegerAttr(axis));
-    Value zero = b.create<stablehlo::ConstantOp>(
-        b.getIntegerAttr(getElementTypeOrSelf(offset.getType()), 0));
+    auto concat = stablehlo::ConcatenateOp::create(
+        b, ValueRange{op.getInput(), op.getInput()}, b.getI64IntegerAttr(axis));
+    Value zero = stablehlo::ConstantOp::create(
+        b, b.getIntegerAttr(getElementTypeOrSelf(offset.getType()), 0));
     SmallVector<Value> slice_begin_indices(input_rank, zero);
     slice_begin_indices[axis] =
-        b.create<stablehlo::SubtractOp>(axis_size, offset);
+        stablehlo::SubtractOp::create(b, axis_size, offset);
     rewriter.replaceOpWithNewOp<stablehlo::DynamicSliceOp>(
         op, input_ty, concat, slice_begin_indices,
         GetI64ArrayAttr(input_shape, &rewriter));
@@ -1656,10 +1660,10 @@ class ConvertLeakyReluOp : public OpRewritePattern<TF::LeakyReluOp> {
     Value zeroVal = chlo::getConstantLike(rewriter, loc, 0.0, features);
 
     Value leakyActivationVal =
-        rewriter.create<stablehlo::MulOp>(loc, features, alphaVal);
+        stablehlo::MulOp::create(rewriter, loc, features, alphaVal);
 
-    Value compareGtZero = rewriter.create<stablehlo::CompareOp>(
-        loc, features, zeroVal, stablehlo::ComparisonDirection::GT);
+    Value compareGtZero = stablehlo::CompareOp::create(
+        rewriter, loc, features, zeroVal, stablehlo::ComparisonDirection::GT);
 
     rewriter.replaceOpWithNewOp<stablehlo::SelectOp>(
         op, compareGtZero, features, leakyActivationVal);
@@ -1686,10 +1690,10 @@ class ConvertLeakyReluGradOp : public OpRewritePattern<TF::LeakyReluGradOp> {
     Value zeroVal = chlo::getConstantLike(rewriter, loc, 0.0, features);
 
     Value leakyGradientVal =
-        rewriter.create<stablehlo::MulOp>(loc, gradients, alphaVal);
+        stablehlo::MulOp::create(rewriter, loc, gradients, alphaVal);
 
-    Value compareGtZero = rewriter.create<stablehlo::CompareOp>(
-        loc, features, zeroVal, stablehlo::ComparisonDirection::GT);
+    Value compareGtZero = stablehlo::CompareOp::create(
+        rewriter, loc, features, zeroVal, stablehlo::ComparisonDirection::GT);
 
     rewriter.replaceOpWithNewOp<stablehlo::SelectOp>(
         op, featureType, compareGtZero, gradients, leakyGradientVal);
@@ -1733,29 +1737,30 @@ class ConvertDiagPartOp : public OpRewritePattern<TF::DiagPartOp> {
       new_size *= input_type.getDimSize(i);
       new_dims.push_back(input_type.getDimSize(i));
     }
-    Value reshaped_input = rewriter.create<stablehlo::ReshapeOp>(
-        op.getLoc(),
+    Value reshaped_input = stablehlo::ReshapeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape({new_size, new_size},
                                              input_type.getElementType()),
         op.getInput());
     auto iota_type = tensorflow::GetTypeFromTFTensorShape(
         {new_size, new_size}, rewriter.getIntegerType(32));
-    auto iota0 = rewriter.create<stablehlo::IotaOp>(
-        op.getLoc(), iota_type, rewriter.getI64IntegerAttr(0));
-    auto iota1 = rewriter.create<stablehlo::IotaOp>(
-        op.getLoc(), iota_type, rewriter.getI64IntegerAttr(1));
-    Value compare = rewriter.create<stablehlo::CompareOp>(
-        op.getLoc(), iota0, iota1, stablehlo::ComparisonDirection::EQ);
+    auto iota0 = stablehlo::IotaOp::create(rewriter, op.getLoc(), iota_type,
+                                           rewriter.getI64IntegerAttr(0));
+    auto iota1 = stablehlo::IotaOp::create(rewriter, op.getLoc(), iota_type,
+                                           rewriter.getI64IntegerAttr(1));
+    Value compare =
+        stablehlo::CompareOp::create(rewriter, op.getLoc(), iota0, iota1,
+                                     stablehlo::ComparisonDirection::EQ);
     Value zero = GetScalarConstOfType(input_type.getElementType(), op.getLoc(),
                                       0, &rewriter);
-    Value zero_matrix = rewriter.create<stablehlo::BroadcastOp>(
-        op.getLoc(), reshaped_input.getType(), zero,
+    Value zero_matrix = stablehlo::BroadcastOp::create(
+        rewriter, op.getLoc(), reshaped_input.getType(), zero,
         GetI64ArrayAttr({new_size, new_size}, &rewriter));
-    Value masked = rewriter.create<stablehlo::SelectOp>(
-        op.getLoc(), reshaped_input.getType(), compare, reshaped_input,
-        zero_matrix);
-    auto reduce = rewriter.create<stablehlo::ReduceOp>(
-        op.getLoc(), masked, zero, GetI64ArrayAttr({0}, &rewriter),
+    Value masked = stablehlo::SelectOp::create(
+        rewriter, op.getLoc(), reshaped_input.getType(), compare,
+        reshaped_input, zero_matrix);
+    auto reduce = stablehlo::ReduceOp::create(
+        rewriter, op.getLoc(), masked, zero, GetI64ArrayAttr({0}, &rewriter),
         input_type.getElementType());
     assert(!input_type.getElementType().isInteger(1) &&
            "data type should not be i1");
@@ -1802,8 +1807,8 @@ class ConvertMatrixDiagPartV3Op
   stablehlo::BroadcastOp BroadcastConstant(Location loc, Shape shape,
                                            int32_t constant, int int_size,
                                            PatternRewriter &rewriter) const {
-    return rewriter.create<stablehlo::BroadcastOp>(
-        loc,
+    return stablehlo::BroadcastOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(shape,
                                              rewriter.getIntegerType(int_size)),
         GetScalarConstOfType(rewriter.getIntegerType(int_size), loc, constant,
@@ -1878,10 +1883,10 @@ class ConvertMatrixDiagPartV3Op
 
     RankedTensorType iota_type = tensorflow::GetTypeFromTFTensorShape(
         indices_shape, rewriter.getIntegerType(32));
-    Value iotaM = rewriter.create<stablehlo::IotaOp>(
-        loc, iota_type, rewriter.getI64IntegerAttr(1));
-    Value iotaN = rewriter.create<stablehlo::IotaOp>(
-        loc, iota_type, rewriter.getI64IntegerAttr(2));
+    Value iotaM = stablehlo::IotaOp::create(rewriter, loc, iota_type,
+                                            rewriter.getI64IntegerAttr(1));
+    Value iotaN = stablehlo::IotaOp::create(rewriter, loc, iota_type,
+                                            rewriter.getI64IntegerAttr(2));
 
     // Boradcasted constants, of the same shape as iotaM and iotaN.
     Value b_zero = BroadcastConstant(loc, indices_shape, 0, 32, rewriter);
@@ -1898,17 +1903,19 @@ class ConvertMatrixDiagPartV3Op
     //  subtract m here. This means we start with the superdiagonals and
     //  move downwards towards the subdiagonals. So the start indices will
     //  be decreasing.)
-    Value d = rewriter.create<stablehlo::SubtractOp>(loc, b_k1, iotaM);
-    Value neg_d = rewriter.create<stablehlo::NegOp>(loc, d);
+    Value d = stablehlo::SubtractOp::create(rewriter, loc, b_k1, iotaM);
+    Value neg_d = stablehlo::NegOp::create(rewriter, loc, d);
 
     // diag_len_d = min(rows + min(d, 0), cols - max(d, 0))
     // (Length of a diagonal for a given d. Same as max_diag_len for m = 0.)
-    Value diag_len_d = rewriter.create<stablehlo::MinOp>(
-        loc,
-        rewriter.create<stablehlo::AddOp>(
-            loc, b_rows, rewriter.create<stablehlo::MinOp>(loc, d, b_zero)),
-        rewriter.create<stablehlo::SubtractOp>(
-            loc, b_cols, rewriter.create<stablehlo::MaxOp>(loc, d, b_zero)));
+    Value diag_len_d = stablehlo::MinOp::create(
+        rewriter, loc,
+        stablehlo::AddOp::create(
+            rewriter, loc, b_rows,
+            stablehlo::MinOp::create(rewriter, loc, d, b_zero)),
+        stablehlo::SubtractOp::create(
+            rewriter, loc, b_cols,
+            stablehlo::MaxOp::create(rewriter, loc, d, b_zero)));
 
     // offset is max_diag_len - diag_len_d if we're padding, 0 otherwise.
     Value cmp;
@@ -1916,10 +1923,10 @@ class ConvertMatrixDiagPartV3Op
       cmp = b_true;
     } else if (superdiagonal_align == kRight) {
       // offset = d>=0 ? max_diag_len - diag_len_d : 0
-      cmp = rewriter.create<TF::GreaterEqualOp>(loc, d, b_zero);
+      cmp = TF::GreaterEqualOp::create(rewriter, loc, d, b_zero);
     } else if (subdiagonal_align == kRight) {
       // offset = d<=0 ? max_diag_len - diag_len_d : 0
-      cmp = rewriter.create<TF::LessEqualOp>(loc, d, b_zero);
+      cmp = TF::LessEqualOp::create(rewriter, loc, d, b_zero);
     } else {
       // offset = 0
       cmp = b_false;
@@ -1927,45 +1934,48 @@ class ConvertMatrixDiagPartV3Op
 
     // This offset shifts the diagonals to the "left" or "right", depending
     // on alignment.
-    Value offset = rewriter.create<stablehlo::SelectOp>(
-        loc, b_zero.getType(), cmp,
-        rewriter.create<stablehlo::SubtractOp>(loc, b_max_diag_len, diag_len_d),
+    Value offset = stablehlo::SelectOp::create(
+        rewriter, loc, b_zero.getType(), cmp,
+        stablehlo::SubtractOp::create(rewriter, loc, b_max_diag_len,
+                                      diag_len_d),
         b_zero);
 
     // x = max(d, 0) - offset
     // y = max(-d, 0) - offset
-    Value x = rewriter.create<stablehlo::SubtractOp>(
-        loc, rewriter.create<stablehlo::MaxOp>(loc, d, b_zero), offset);
-    Value y = rewriter.create<stablehlo::SubtractOp>(
-        loc, rewriter.create<stablehlo::MaxOp>(loc, neg_d, b_zero), offset);
+    Value x = stablehlo::SubtractOp::create(
+        rewriter, loc, stablehlo::MaxOp::create(rewriter, loc, d, b_zero),
+        offset);
+    Value y = stablehlo::SubtractOp::create(
+        rewriter, loc, stablehlo::MaxOp::create(rewriter, loc, neg_d, b_zero),
+        offset);
 
-    Value n_plus_x = rewriter.create<stablehlo::AddOp>(loc, iotaN, x);
-    Value n_plus_y = rewriter.create<stablehlo::AddOp>(loc, iotaN, y);
+    Value n_plus_x = stablehlo::AddOp::create(rewriter, loc, iotaN, x);
+    Value n_plus_y = stablehlo::AddOp::create(rewriter, loc, iotaN, y);
 
     // GatherOp is happy about letting us index out of bounds values, but those
     // values will be undefined. So we mask them later. Set up the boolean
     // expression that tells us which entries, in the output shape, are out of
     // bounds and thus become the padding_value.
-    Value x_in_bounds = rewriter.create<stablehlo::AndOp>(
-        loc,
-        rewriter.create<TF::GreaterEqualOp>(loc, b_false.getType(), n_plus_x,
-                                            b_zero),
-        rewriter.create<TF::LessOp>(loc, b_false.getType(), n_plus_x, b_cols));
-    Value y_in_bounds = rewriter.create<stablehlo::AndOp>(
-        loc,
-        rewriter.create<TF::GreaterEqualOp>(loc, b_false.getType(), n_plus_y,
-                                            b_zero),
-        rewriter.create<TF::LessOp>(loc, b_false.getType(), n_plus_y, b_rows));
-    Value in_bounds = rewriter.create<stablehlo::ReshapeOp>(
-        loc,
+    Value x_in_bounds = stablehlo::AndOp::create(
+        rewriter, loc,
+        TF::GreaterEqualOp::create(rewriter, loc, b_false.getType(), n_plus_x,
+                                   b_zero),
+        TF::LessOp::create(rewriter, loc, b_false.getType(), n_plus_x, b_cols));
+    Value y_in_bounds = stablehlo::AndOp::create(
+        rewriter, loc,
+        TF::GreaterEqualOp::create(rewriter, loc, b_false.getType(), n_plus_y,
+                                   b_zero),
+        TF::LessOp::create(rewriter, loc, b_false.getType(), n_plus_y, b_rows));
+    Value in_bounds = stablehlo::ReshapeOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(Shape({num_diags, max_diag_len}),
                                              rewriter.getIntegerType(1)),
-        rewriter.create<stablehlo::AndOp>(loc, x_in_bounds, y_in_bounds));
+        stablehlo::AndOp::create(rewriter, loc, x_in_bounds, y_in_bounds));
 
     // Now combine x and y into the index data structure needed for gather.
     Shape concat_shape({2, num_diags, max_diag_len});
-    Value start_indices = rewriter.create<stablehlo::ConcatenateOp>(
-        loc,
+    Value start_indices = stablehlo::ConcatenateOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(concat_shape,
                                              rewriter.getIntegerType(32)),
         mlir::ValueRange({n_plus_y, n_plus_x}),
@@ -2009,8 +2019,8 @@ class ConvertMatrixDiagPartV3Op
         /*operandBatchingDims=*/{},
         /*startIndicesBatchingDims=*/{}, start_index_map,
         /*indexVectorDim=*/0);
-    Value gather = rewriter.create<stablehlo::GatherOp>(
-        loc, op.getInput(), start_indices, dims_attr,
+    Value gather = stablehlo::GatherOp::create(
+        rewriter, loc, op.getInput(), start_indices, dims_attr,
         GetI64ArrayAttr(slice_sizes, &rewriter));
 
     // We now need to broadcast the "in_bounds" boolean expression, as well as
@@ -2019,22 +2029,24 @@ class ConvertMatrixDiagPartV3Op
     for (int i = 0; i < output_shape.size() - 2; i++) {
       broadcast_bounds.push_back(output_shape[i]);
     }
-    Value b_in_bounds = rewriter.create<stablehlo::BroadcastOp>(
-        loc,
+    Value b_in_bounds = stablehlo::BroadcastOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(output_shape,
                                              rewriter.getIntegerType(1)),
         in_bounds, GetI64ArrayAttr(broadcast_bounds, &rewriter));
-    Value b_padding = rewriter.create<stablehlo::BroadcastOp>(
-        loc, op.getPaddingValue(), GetI64ArrayAttr(output_shape, &rewriter));
+    Value b_padding = stablehlo::BroadcastOp::create(
+        rewriter, loc, op.getPaddingValue(),
+        GetI64ArrayAttr(output_shape, &rewriter));
 
     // Replace all out-of-bounds values in the result with padding_value.
-    Value result = rewriter.create<stablehlo::SelectOp>(loc, b_in_bounds,
-                                                        gather, b_padding);
+    Value result = stablehlo::SelectOp::create(rewriter, loc, b_in_bounds,
+                                               gather, b_padding);
 
     if (num_diags == 1) {
       // matrix_diag_part folds away the 1-sized band dimension if we only
       // extract a single diagonal.
-      result = rewriter.create<stablehlo::ReshapeOp>(loc, op.getType(), result);
+      result =
+          stablehlo::ReshapeOp::create(rewriter, loc, op.getType(), result);
     }
 
     rewriter.replaceOp(op, result);
@@ -2057,9 +2069,10 @@ class ConvertEinsumOp : public OpRewritePattern<TF::EinsumOp> {
     // creates a scalar constant 1.0 for first operand.
     if (op.getN() == 1) {
       equation_str = "," + equation_str;
-      inputs.push_back(rewriter.create<stablehlo::ConstantOp>(
-          op.getLoc(), hlo::getScalarOfType(
-                           mlir::getElementTypeOrSelf(op.getOperand(0)), 1)));
+      inputs.push_back(stablehlo::ConstantOp::create(
+          rewriter, op.getLoc(),
+          hlo::getScalarOfType(mlir::getElementTypeOrSelf(op.getOperand(0)),
+                               1)));
     }
     // Insert remaining operands into inputs, TF op verifier requires there be
     // 0 or 1 operands.
@@ -2129,8 +2142,8 @@ class ConvertFFTOp : public OpRewritePattern<OpTy> {
 
     // Last dim larger than expected_dim, slice the input
     if (input_shape.back() > expected_dim) {
-      reshaped = rewriter.create<stablehlo::SliceOp>(
-          op.getLoc(),
+      reshaped = stablehlo::SliceOp::create(
+          rewriter, op.getLoc(),
           tensorflow::GetTypeFromTFTensorShape(expected_shape,
                                                input_ty.getElementType()),
           op.getInput(), GetI64ArrayAttr(begin_indices, &rewriter),
@@ -2144,8 +2157,8 @@ class ConvertFFTOp : public OpRewritePattern<OpTy> {
       padding.push_back(expected_dim - input_shape.back());
       Value zero =
           GetScalarConstOfType(input_ty.getElementType(), loc, 0, &rewriter);
-      reshaped = rewriter.create<stablehlo::PadOp>(
-          loc,
+      reshaped = stablehlo::PadOp::create(
+          rewriter, loc,
           tensorflow::GetTypeFromTFTensorShape(expected_shape,
                                                input_ty.getElementType()),
           op.getInput(), zero, GetI64ArrayAttr(no_padding, &rewriter),
@@ -2193,8 +2206,8 @@ class ConvertFusedBatchNormGradBase
     // To support mixed precision, the statistics type, which maybe more
     // precise than the input types, are used for this op.
     Type kernel_type = mlir::cast<TensorType>(scale.getType()).getElementType();
-    grad = rewriter.create<stablehlo::ConvertOp>(loc, grad, kernel_type);
-    act = rewriter.create<stablehlo::ConvertOp>(loc, act, kernel_type);
+    grad = stablehlo::ConvertOp::create(rewriter, loc, grad, kernel_type);
+    act = stablehlo::ConvertOp::create(rewriter, loc, act, kernel_type);
 
     tensorflow::TensorFormat data_format;
     if (!FormatFromString(op.getDataFormat().str(), &data_format))
@@ -2213,9 +2226,9 @@ class ConvertFusedBatchNormGradBase
 
       SmallVector<Type, 3> operand_types = {act.getType(), feature_type,
                                             feature_type};
-      auto training_op = rewriter.create<stablehlo::BatchNormGradOp>(
-          loc, operand_types, act, scale, mean, var, grad, op.getEpsilon(),
-          feature_dim);
+      auto training_op = stablehlo::BatchNormGradOp::create(
+          rewriter, loc, operand_types, act, scale, mean, var, grad,
+          op.getEpsilon(), feature_dim);
 
       x_backprop = training_op.getResult(0);
 
@@ -2234,52 +2247,55 @@ class ConvertFusedBatchNormGradBase
       // scratch1 = rsqrt(var + epsilon)
       RankedTensorType scalar_float =
           tensorflow::GetTypeFromTFTensorShape({}, kernel_type);
-      auto epsilon = rewriter.create<stablehlo::ConstantOp>(
-          loc, DenseFPElementsAttr::get(scalar_float, {op.getEpsilon()}));
-      auto add_op = rewriter.create<chlo::BroadcastAddOp>(
-          loc, var, epsilon.getResult(), scalar_broadcast_dims);
+      auto epsilon = stablehlo::ConstantOp::create(
+          rewriter, loc,
+          DenseFPElementsAttr::get(scalar_float, {op.getEpsilon()}));
+      auto add_op = chlo::BroadcastAddOp::create(
+          rewriter, loc, var, epsilon.getResult(), scalar_broadcast_dims);
 
-      Value scratch1 = rewriter.create<stablehlo::RsqrtOp>(loc, add_op);
+      Value scratch1 = stablehlo::RsqrtOp::create(rewriter, loc, add_op);
 
       // scratch2 = sum(y_backprop * (x - mean))
-      auto sub_op = rewriter.create<stablehlo::SubtractOp>(
-          loc, act,
+      auto sub_op = stablehlo::SubtractOp::create(
+          rewriter, loc, act,
           Broadcast1DToFeatureDim(loc, act, mean, feature_dim, rewriter));
-      auto weighted_grad = rewriter.create<stablehlo::MulOp>(loc, grad, sub_op);
+      auto weighted_grad =
+          stablehlo::MulOp::create(rewriter, loc, grad, sub_op);
       Value scratch2 =
           ApplyReduction(loc, weighted_grad, reduce_dims, &rewriter);
 
       // x_backprop = y_backprop * (scale * scratch1)
       auto scaled_grad =
-          rewriter.create<stablehlo::MulOp>(loc, op.getScale(), scratch1);
-      x_backprop = rewriter.create<stablehlo::MulOp>(
-          loc, grad,
+          stablehlo::MulOp::create(rewriter, loc, op.getScale(), scratch1);
+      x_backprop = stablehlo::MulOp::create(
+          rewriter, loc, grad,
           Broadcast1DToFeatureDim(loc, act, scaled_grad, feature_dim,
                                   rewriter));
 
       // scale_backprop = scratch2 * scratch1
       scale_backprop =
-          rewriter.create<stablehlo::MulOp>(loc, scratch1, scratch2);
+          stablehlo::MulOp::create(rewriter, loc, scratch1, scratch2);
 
       // offset_backprop = sum(y_backprop)
       offset_backprop = ApplyReduction(loc, grad, reduce_dims, &rewriter);
     }
 
     x_backprop =
-        rewriter.create<stablehlo::ConvertOp>(loc, x_backprop, act_ele_type);
+        stablehlo::ConvertOp::create(rewriter, loc, x_backprop, act_ele_type);
     Value last_val[2];
     if (op.getResult(3).use_empty() && op.getResult(4).use_empty()) {
       // It doesn't matter what values we provide for the last 2 results.
       last_val[0] = last_val[1] = op.getX();
     } else {
-      auto const_val = rewriter.create<stablehlo::ConstantOp>(
-          op.getLoc(), DenseElementsAttr::get<float>(
-                           tensorflow::GetTypeFromTFTensorShape(
-                               {0}, getElementTypeOrSelf(op.getResult(3))),
-                           0.0));
+      auto const_val = stablehlo::ConstantOp::create(
+          rewriter, op.getLoc(),
+          DenseElementsAttr::get<float>(
+              tensorflow::GetTypeFromTFTensorShape(
+                  {0}, getElementTypeOrSelf(op.getResult(3))),
+              0.0));
       auto maybe_cast = [&](Value val, Type t) -> Value {
         if (val.getType() == t) return val;
-        return rewriter.create<tensor::CastOp>(op.getLoc(), t, val);
+        return tensor::CastOp::create(rewriter, op.getLoc(), t, val);
       };
       last_val[0] = maybe_cast(const_val, op.getResult(3).getType());
       last_val[1] = maybe_cast(const_val, op.getResult(4).getType());
@@ -2333,8 +2349,8 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
     // TODO(b/69928690): Support mixed precision in the XLA batch
     // normalization operators. As a workaround, create a new x with the same
     // element type as scale (which may be more precise than the input type).
-    Value bn_train_input = rewriter.create<stablehlo::ConvertOp>(
-        op.getLoc(), op.getX(), scale_element_type);
+    Value bn_train_input = stablehlo::ConvertOp::create(
+        rewriter, op.getLoc(), op.getX(), scale_element_type);
     TensorType bn_train_input_type_tensor =
         mlir::cast<TensorType>(bn_train_input.getType());
 
@@ -2351,8 +2367,8 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
       // batch_mean, and batch_var.
       SmallVector<Type, 3> operand_types = {bn_train_input_type_tensor,
                                             mean_var_type, mean_var_type};
-      auto bn_train_op = rewriter.create<stablehlo::BatchNormTrainingOp>(
-          op.getLoc(), operand_types, bn_train_input, op.getScale(),
+      auto bn_train_op = stablehlo::BatchNormTrainingOp::create(
+          rewriter, op.getLoc(), operand_types, bn_train_input, op.getScale(),
           op.getOffset(), op.getEpsilon(), feature_dim.getInt());
       // HLO op outputs a tuple of tensors. Extract those results.
       Value y_out = bn_train_op.getResult(0);
@@ -2368,48 +2384,53 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
       int sample_size_minus_one = std::max(1, sample_size - 1);
       double factor = static_cast<double>(sample_size) /
                       static_cast<double>(sample_size_minus_one);
-      auto factor_const_op = rewriter.create<stablehlo::ConstantOp>(
-          op.getLoc(), rewriter.getFloatAttr(scale_element_type, factor));
+      auto factor_const_op = stablehlo::ConstantOp::create(
+          rewriter, op.getLoc(),
+          rewriter.getFloatAttr(scale_element_type, factor));
 
-      Value corrected_variance = rewriter.create<chlo::BroadcastMulOp>(
-          op.getLoc(), batch_variance.getType(), batch_variance,
+      Value corrected_variance = chlo::BroadcastMulOp::create(
+          rewriter, op.getLoc(), batch_variance.getType(), batch_variance,
           factor_const_op, /*broadcast_dimensions=*/DenseI64ArrayAttr());
 
       // Convert back to input type to stay aligned with expected output type
       // for TF op.
-      y_out = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), y_out,
-                                                    input_element_type);
+      y_out = stablehlo::ConvertOp::create(rewriter, op.getLoc(), y_out,
+                                           input_element_type);
 
       float exponential_avg_factor =
           op.getExponentialAvgFactor().convertToFloat();
       if (exponential_avg_factor != 1.0f) {
-        auto alpha = rewriter.create<stablehlo::ConstantOp>(
-            op.getLoc(), rewriter.getFloatAttr(mean_element_type,
-                                               1.0f - exponential_avg_factor));
-        auto beta = rewriter.create<stablehlo::ConstantOp>(
-            op.getLoc(),
+        auto alpha = stablehlo::ConstantOp::create(
+            rewriter, op.getLoc(),
+            rewriter.getFloatAttr(mean_element_type,
+                                  1.0f - exponential_avg_factor));
+        auto beta = stablehlo::ConstantOp::create(
+            rewriter, op.getLoc(),
             rewriter.getFloatAttr(mean_element_type, exponential_avg_factor));
 
         // new_running_mean = alpha * old_mean + beta * batch_mean.
-        auto alpha_mul_old_mean = rewriter.create<chlo::BroadcastMulOp>(
-            op.getLoc(), op.getMean().getType(), alpha, op.getMean(),
+        auto alpha_mul_old_mean = chlo::BroadcastMulOp::create(
+            rewriter, op.getLoc(), op.getMean().getType(), alpha, op.getMean(),
             /*broadcast_dimensions=*/DenseI64ArrayAttr());
-        auto beta_mul_batch_mean = rewriter.create<chlo::BroadcastMulOp>(
-            op.getLoc(), batch_mean.getType(), beta, batch_mean,
+        auto beta_mul_batch_mean = chlo::BroadcastMulOp::create(
+            rewriter, op.getLoc(), batch_mean.getType(), beta, batch_mean,
             /*broadcast_dimensions=*/DenseI64ArrayAttr());
-        batch_mean = rewriter.create<chlo::BroadcastAddOp>(
-            op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean,
+        batch_mean = chlo::BroadcastAddOp::create(
+            rewriter, op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean,
             /*broadcast_dimensions=*/DenseI64ArrayAttr());
 
         // new_running_variance = alpha * old_variance + beta * batch_variance.
-        auto alpha_mul_old_variance = rewriter.create<chlo::BroadcastMulOp>(
-            op.getLoc(), op.getVariance().getType(), alpha, op.getVariance(),
+        auto alpha_mul_old_variance = chlo::BroadcastMulOp::create(
+            rewriter, op.getLoc(), op.getVariance().getType(), alpha,
+            op.getVariance(),
             /*broadcast_dimensions=*/DenseI64ArrayAttr());
-        auto beta_mul_batch_variance = rewriter.create<chlo::BroadcastMulOp>(
-            op.getLoc(), corrected_variance.getType(), beta, corrected_variance,
+        auto beta_mul_batch_variance = chlo::BroadcastMulOp::create(
+            rewriter, op.getLoc(), corrected_variance.getType(), beta,
+            corrected_variance,
             /*broadcast_dimensions=*/DenseI64ArrayAttr());
-        corrected_variance = rewriter.create<chlo::BroadcastAddOp>(
-            op.getLoc(), alpha_mul_old_variance, beta_mul_batch_variance,
+        corrected_variance = chlo::BroadcastAddOp::create(
+            rewriter, op.getLoc(), alpha_mul_old_variance,
+            beta_mul_batch_variance,
             /*broadcast_dimensions=*/DenseI64ArrayAttr());
       }
 
@@ -2433,11 +2454,12 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
                                : 0;
         auto const_attr_type = tensorflow::GetTypeFromTFTensorShape(
             {num_elements}, getElementTypeOrSelf(reserve_space_3_type));
-        Value dummy_const = rewriter.create<stablehlo::ConstantOp>(
-            op.getLoc(), DenseElementsAttr::get<float>(const_attr_type, 0.0));
+        Value dummy_const = stablehlo::ConstantOp::create(
+            rewriter, op.getLoc(),
+            DenseElementsAttr::get<float>(const_attr_type, 0.0));
         if (const_attr_type != reserve_space_3_type)
-          dummy_const = rewriter.create<tensor::CastOp>(
-              op.getLoc(), reserve_space_3_type, dummy_const);
+          dummy_const = tensor::CastOp::create(
+              rewriter, op.getLoc(), reserve_space_3_type, dummy_const);
         rewriter.replaceOp(op, {y_out, /*batch_mean=*/batch_mean,
                                 /*batch_variance=*/corrected_variance,
                                 /*reserve_space_1=*/reserve_space_1,
@@ -2445,16 +2467,16 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
                                 /*reserve_space_3=*/dummy_const});
       }
     } else {  // Inference case.
-      auto bn_train_op = rewriter.create<stablehlo::BatchNormInferenceOp>(
-          op.getLoc(),
+      auto bn_train_op = stablehlo::BatchNormInferenceOp::create(
+          rewriter, op.getLoc(),
           /*result_type=*/bn_train_input_type_tensor, bn_train_input,
           op.getScale(), op.getOffset(), op.getMean(), op.getVariance(),
           op.getEpsilon(), feature_dim.getInt());
 
       // Convert back to input type to stay aligned with expected output type
       // for TF op.
-      auto y_out = rewriter.create<stablehlo::ConvertOp>(
-          op.getLoc(), bn_train_op, input_element_type);
+      auto y_out = stablehlo::ConvertOp::create(
+          rewriter, op.getLoc(), bn_train_op, input_element_type);
 
       // The mean, variance, and reserved space outputs of the batch norm op are
       // not used for inference. It doesn't matter what values we provide for
@@ -2477,11 +2499,12 @@ class ConvertFusedBatchNormBase : public OpRewritePattern<FusedBatchNormOpT> {
                                : 0;
         auto const_attr_type = tensorflow::GetTypeFromTFTensorShape(
             {num_elements}, getElementTypeOrSelf(reserve_space_3_type));
-        Value dummy_const = rewriter.create<stablehlo::ConstantOp>(
-            op.getLoc(), DenseElementsAttr::get<float>(const_attr_type, 0.0));
+        Value dummy_const = stablehlo::ConstantOp::create(
+            rewriter, op.getLoc(),
+            DenseElementsAttr::get<float>(const_attr_type, 0.0));
         if (const_attr_type != reserve_space_3_type)
-          dummy_const = rewriter.create<tensor::CastOp>(
-              op.getLoc(), reserve_space_3_type, dummy_const);
+          dummy_const = tensor::CastOp::create(
+              rewriter, op.getLoc(), reserve_space_3_type, dummy_const);
         rewriter.replaceOp(op, {/*y=*/y_out,
                                 /*batch_mean=*/op.getMean(),
                                 /*batch_variance=*/op.getVariance(),
@@ -2580,8 +2603,8 @@ Operation *AvgPoolDivideByCount(
     Value divisor =
         GetScalarConstOfType(element_type, loc, window_count, &rewriter);
     auto scalar_broadcast_dims = rewriter.getDenseI64ArrayAttr({});
-    result = rewriter.create<chlo::BroadcastDivOp>(
-        loc, pooled_type, pooled, divisor, scalar_broadcast_dims);
+    result = chlo::BroadcastDivOp::create(rewriter, loc, pooled_type, pooled,
+                                          divisor, scalar_broadcast_dims);
   } else {
     assert(op.getPadding() == "SAME");
     // For SAME padding, only original entries that contributed to a window
@@ -2589,7 +2612,7 @@ Operation *AvgPoolDivideByCount(
 
     // Build all-ones tensor of same shape as the original input.
     ElementsAttr splat = hlo::getSplat(&rewriter, orig_input_type, 1);
-    auto all_ones_tensor = rewriter.create<stablehlo::ConstantOp>(loc, splat);
+    auto all_ones_tensor = stablehlo::ConstantOp::create(rewriter, loc, splat);
 
     // Get padding for the input.
     DenseIntElementsAttr input_padding_attr =
@@ -2599,8 +2622,8 @@ Operation *AvgPoolDivideByCount(
 
     // Count the 1's in each window, using the same padding as for the input,
     // which gives us the window counts by which `pooled` needs to be divided.
-    auto divisor = rewriter.create<stablehlo::ReduceWindowOp>(
-        loc, pooled_type,
+    auto divisor = stablehlo::ReduceWindowOp::create(
+        rewriter, loc, pooled_type,
         /*operand=*/all_ones_tensor,
         /*init_value=*/zero,
         /*window_dimensions=*/
@@ -2614,8 +2637,8 @@ Operation *AvgPoolDivideByCount(
                                       &rewriter);
 
     // Divide `pooled` by window counts.
-    result = rewriter.create<stablehlo::DivOp>(loc, pooled_type, pooled,
-                                               divisor.getResult(0));
+    result = stablehlo::DivOp::create(rewriter, loc, pooled_type, pooled,
+                                      divisor.getResult(0));
   }
   return result;
 }
@@ -2651,8 +2674,8 @@ class ConvertAvgPoolOp : public OpRewritePattern<OpTy> {
 
     // Convert if we need enlarge the element type's bitwidth.
     if (input_element_type != sum_element_type)
-      input_value = rewriter.create<stablehlo::ConvertOp>(
-          op.getLoc(), input_value, sum_element_type);
+      input_value = stablehlo::ConvertOp::create(rewriter, op.getLoc(),
+                                                 input_value, sum_element_type);
 
     // Create the ReduceWindow op.
     Value init =
@@ -2660,8 +2683,8 @@ class ConvertAvgPoolOp : public OpRewritePattern<OpTy> {
     DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr<num_dims>(
         input_type.getShape(), op.getKsize(), op.getStrides(), op.getPadding(),
         &rewriter);
-    auto reduce = rewriter.create<stablehlo::ReduceWindowOp>(
-        op.getLoc(), result_type, input_value, init,
+    auto reduce = stablehlo::ReduceWindowOp::create(
+        rewriter, op.getLoc(), result_type, input_value, init,
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter),
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter),
         /*base_dilations=*/DenseI64ArrayAttr(),
@@ -2683,8 +2706,8 @@ class ConvertAvgPoolOp : public OpRewritePattern<OpTy> {
     // Convert back if we enlarged the element type's bitwidth.
     Value result = result_op->getOpResult(0);
     if (input_element_type != sum_element_type)
-      result = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), result,
-                                                     input_element_type);
+      result = stablehlo::ConvertOp::create(rewriter, op.getLoc(), result,
+                                            input_element_type);
 
     rewriter.replaceOp(op, result);
     return success();
@@ -2825,8 +2848,9 @@ class ConvertAvgPoolGradOp : public OpRewritePattern<OpTy> {
       out_grad_shape[dim] = low_padding[dim] + high_padding[dim] +
                             (out_grad_shape[dim] - 1) * strides[dim] + 1;
     }
-    Value reduce_window_input = rewriter.create<stablehlo::PadOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape(out_grad_shape, element_type),
+    Value reduce_window_input = stablehlo::PadOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape(out_grad_shape, element_type),
         /*operand=*/out_grad_divided->getOpResult(0),
         /*padding_value=*/zero,
         /*edge_padding_low=*/GetI64ArrayAttr(low_padding, &rewriter),
@@ -2839,13 +2863,13 @@ class ConvertAvgPoolGradOp : public OpRewritePattern<OpTy> {
     Type sum_element_type = GetSumAccumulationType(element_type);
     if (element_type != sum_element_type) {
       // Convert to appropriate sum accumulation type to avoid precision loss.
-      reduce_window_input = rewriter.create<stablehlo::ConvertOp>(
-          loc, reduce_window_input, sum_element_type);
+      reduce_window_input = stablehlo::ConvertOp::create(
+          rewriter, loc, reduce_window_input, sum_element_type);
       zero = GetScalarConstOfType(sum_element_type, loc, 0, &rewriter);
     }
     auto ones = GetI64ArrayAttr(DimVector(num_dims, 1), &rewriter);
-    auto reduce_window_op = rewriter.create<stablehlo::ReduceWindowOp>(
-        loc,
+    auto reduce_window_op = stablehlo::ReduceWindowOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(orig_input_shape,
                                              sum_element_type),
         /*operand=*/reduce_window_input,
@@ -2862,8 +2886,8 @@ class ConvertAvgPoolGradOp : public OpRewritePattern<OpTy> {
 
     if (element_type != sum_element_type) {
       // Convert back to original element type.
-      result = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), result,
-                                                     element_type);
+      result = stablehlo::ConvertOp::create(rewriter, op.getLoc(), result,
+                                            element_type);
     }
     rewriter.replaceOp(op, {result});
     return success();
@@ -2909,8 +2933,8 @@ class ConvertMaxPoolOp : public OpRewritePattern<OpTy> {
     DenseIntElementsAttr paddings_attr = GetReduceWindowPaddingAsAttr<num_dims>(
         input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(),
         &rewriter);
-    auto reduce = rewriter.create<stablehlo::ReduceWindowOp>(
-        loc, op.getType(), op.getInput(), init,
+    auto reduce = stablehlo::ReduceWindowOp::create(
+        rewriter, loc, op.getType(), op.getInput(), init,
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter),
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter),
         /*base_dilations=*/DenseI64ArrayAttr(),
@@ -2958,7 +2982,7 @@ class ConvertSelectOp : public OpRewritePattern<TF::SelectOp> {
     bool needs_broadcast = cond_type.getRank() == 1 && then_type.getRank() != 1;
     Value then_shape_split = then_shape;
     if (needs_broadcast) {
-      Value const_one = b.create<arith::ConstantIndexOp>(1);
+      Value const_one = arith::ConstantIndexOp::create(b, 1);
       Type extent_first = shape::getExtentTensorType(b.getContext(), 1);
       Type extent_second =
           shape::getExtentTensorType(b.getContext(), then_type.getRank() - 1);
@@ -2978,7 +3002,7 @@ class ConvertSelectOp : public OpRewritePattern<TF::SelectOp> {
     }
     auto result_type = mlir::cast<TensorType>(op.getResult().getType());
     auto assuming_op =
-        b.create<shape::AssumingOp>(ArrayRef<Type>{result_type}, assumption);
+        shape::AssumingOp::create(b, ArrayRef<Type>{result_type}, assumption);
 
     OpBuilder::InsertionGuard guard(b);
     b.createBlock(&assuming_op.getDoRegion());
@@ -2986,17 +3010,18 @@ class ConvertSelectOp : public OpRewritePattern<TF::SelectOp> {
     // Broadcast the cond if necessary.
     Value cond = op.getCondition();
     if (needs_broadcast) {
-      Value result_extents = b.create<shape::ToExtentTensorOp>(
-          GetExtentsTensorTypeFor(result_type), then_shape);
-      cond = b.create<stablehlo::DynamicBroadcastInDimOp>(
+      Value result_extents = shape::ToExtentTensorOp::create(
+          b, GetExtentsTensorTypeFor(result_type), then_shape);
+      cond = stablehlo::DynamicBroadcastInDimOp::create(
+          b,
           tensorflow::GetTypeFromTFTensorShape(result_type.getShape(),
                                                b.getI1Type()),
           cond, result_extents,
           GetI64ArrayAttrForSeq(0, cond_type.getRank(), &b));
     }
-    Value select = b.create<stablehlo::SelectOp>(
-        result_type, cond, op.getThenValue(), op.getElseValue());
-    b.create<shape::AssumingYieldOp>(select);
+    Value select = stablehlo::SelectOp::create(
+        b, result_type, cond, op.getThenValue(), op.getElseValue());
+    shape::AssumingYieldOp::create(b, select);
     rewriter.replaceOp(op, {assuming_op.getResult(0)});
     return success();
   }
@@ -3034,57 +3059,58 @@ class ConvertSliceOpDynamic : public OpRewritePattern<TF::SliceOp> {
 
     int rank = begin_type.getDimSize(0);
     auto shape_scalar_type = begin_type.getElementType();
-    Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+    Value one = arith::ConstantIndexOp::create(rewriter, loc, 1);
     SmallVector<Value, 4> stride_values(rank, one);
     SmallVector<Value, 4> end_values;
     SmallVector<Value, 4> begin_values;
     end_values.reserve(rank);
     for (int i = 0; i < rank; ++i) {
       SmallVector<Value, 4> indices;
-      indices.push_back(rewriter.create<arith::ConstantIndexOp>(loc, i));
+      indices.push_back(arith::ConstantIndexOp::create(rewriter, loc, i));
       auto begin_value =
-          rewriter.create<tensor::ExtractOp>(loc, begin_indices, indices);
-      auto size_value = rewriter.create<tensor::ExtractOp>(loc, sizes, indices);
-      Value minus_one = rewriter.create<arith::IndexCastOp>(
-          loc, shape_scalar_type,
-          rewriter.create<arith::ConstantIndexOp>(loc, -1));
-      auto is_minus_one = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::eq, size_value, minus_one);
+          tensor::ExtractOp::create(rewriter, loc, begin_indices, indices);
+      auto size_value =
+          tensor::ExtractOp::create(rewriter, loc, sizes, indices);
+      Value minus_one = arith::IndexCastOp::create(
+          rewriter, loc, shape_scalar_type,
+          arith::ConstantIndexOp::create(rewriter, loc, -1));
+      auto is_minus_one = arith::CmpIOp::create(
+          rewriter, loc, arith::CmpIPredicate::eq, size_value, minus_one);
       Value end_value =
-          rewriter.create<arith::AddIOp>(loc, begin_value, size_value);
-      auto dim_value = rewriter.create<arith::IndexCastOp>(
-          loc, shape_scalar_type,
-          rewriter.create<tensor::DimOp>(loc, input, i));
-      end_value = rewriter.create<mlir::arith::SelectOp>(loc, is_minus_one,
-                                                         dim_value, end_value);
-      auto end_value_casted = rewriter.create<arith::IndexCastOp>(
-          loc, rewriter.getIndexType(), end_value);
+          arith::AddIOp::create(rewriter, loc, begin_value, size_value);
+      auto dim_value = arith::IndexCastOp::create(
+          rewriter, loc, shape_scalar_type,
+          tensor::DimOp::create(rewriter, loc, input, i));
+      end_value = mlir::arith::SelectOp::create(rewriter, loc, is_minus_one,
+                                                dim_value, end_value);
+      auto end_value_casted = arith::IndexCastOp::create(
+          rewriter, loc, rewriter.getIndexType(), end_value);
       end_values.push_back(end_value_casted);
 
-      auto begin_value_casted = rewriter.create<arith::IndexCastOp>(
-          loc, rewriter.getIndexType(), begin_value);
+      auto begin_value_casted = arith::IndexCastOp::create(
+          rewriter, loc, rewriter.getIndexType(), begin_value);
       begin_values.push_back(begin_value_casted);
     }
     auto index_ty = rewriter.getIndexType();
-    auto start_indices = rewriter.create<tensor::FromElementsOp>(
-        loc,
+    auto start_indices = tensor::FromElementsOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {static_cast<int64_t>(begin_values.size())}, index_ty),
         begin_values);
-    auto end_indices = rewriter.create<tensor::FromElementsOp>(
-        loc,
+    auto end_indices = tensor::FromElementsOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {static_cast<int64_t>(end_values.size())}, index_ty),
         end_values);
-    auto stride_indices = rewriter.create<tensor::FromElementsOp>(
-        loc,
+    auto stride_indices = tensor::FromElementsOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {static_cast<int64_t>(stride_values.size())}, index_ty),
         stride_values);
 
-    auto d_slice = rewriter.create<stablehlo::RealDynamicSliceOp>(
-        loc, op.getOperation()->getResult(0).getType(), input, start_indices,
-        end_indices, stride_indices);
+    auto d_slice = stablehlo::RealDynamicSliceOp::create(
+        rewriter, loc, op.getOperation()->getResult(0).getType(), input,
+        start_indices, end_indices, stride_indices);
     rewriter.replaceOp(op, d_slice.getOperation()->getResults());
     return success();
   }
@@ -3110,15 +3136,15 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc,
 
   // TODO(silvasean): Reduce duplication across reified shape calculations and
   // the static computation of output types needed to create ops.
-  Value lhs_shape = rewriter->create<shape::ShapeOfOp>(loc, lhs);
-  Value rhs_shape = rewriter->create<shape::ShapeOfOp>(loc, rhs);
+  Value lhs_shape = shape::ShapeOfOp::create(*rewriter, loc, lhs);
+  Value rhs_shape = shape::ShapeOfOp::create(*rewriter, loc, rhs);
   Value const_neg2 =
-      rewriter->create<arith::ConstantOp>(loc, rewriter->getIndexAttr(-2));
+      arith::ConstantOp::create(*rewriter, loc, rewriter->getIndexAttr(-2));
   auto shape_type = shape::ShapeType::get(rewriter->getContext());
-  auto lhs_splitted = rewriter->create<shape::SplitAtOp>(
-      loc, TypeRange{shape_type, shape_type}, lhs_shape, const_neg2);
-  auto rhs_splitted = rewriter->create<shape::SplitAtOp>(
-      loc, TypeRange{shape_type, shape_type}, rhs_shape, const_neg2);
+  auto lhs_splitted = shape::SplitAtOp::create(
+      *rewriter, loc, TypeRange{shape_type, shape_type}, lhs_shape, const_neg2);
+  auto rhs_splitted = shape::SplitAtOp::create(
+      *rewriter, loc, TypeRange{shape_type, shape_type}, rhs_shape, const_neg2);
   auto lhs_type = mlir::cast<RankedTensorType>(lhs.getType());
   auto rhs_type = mlir::cast<RankedTensorType>(rhs.getType());
   // The last two dimensions are the matrix row/col dimensions. Don't broadcast
@@ -3127,9 +3153,10 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc,
   mlir::OpTrait::util::getBroadcastedShape(
       lhs_type.getShape().drop_back(2), rhs_type.getShape().drop_back(2),
       result_batch_shape_compile_time_extents);
-  auto result_batch_shape = rewriter->create<shape::BroadcastOp>(
-      loc, shape_type, lhs_splitted.getHead(), rhs_splitted.getHead(),
-      /*error=*/nullptr);
+  auto result_batch_shape =
+      shape::BroadcastOp::create(*rewriter, loc, shape_type,
+                                 lhs_splitted.getHead(), rhs_splitted.getHead(),
+                                 /*error=*/nullptr);
   // Lambda which handles the broadcasting of one side to the common
   // leading-batch dimensions.
   auto broadcast_one_side = [&](Value side, RankedTensorType type,
@@ -3139,16 +3166,16 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc,
     result_shape.append(matrix_dims.begin(), matrix_dims.end());
     auto result_type = tensorflow::GetTypeFromTFTensorShape(
         result_shape, type.getElementType());
-    auto shape = rewriter->create<shape::ConcatOp>(
-        loc, shape_type, result_batch_shape, tail_shape);
-    auto shape_tensor = rewriter->create<shape::ToExtentTensorOp>(
-        loc,
+    auto shape = shape::ConcatOp::create(*rewriter, loc, shape_type,
+                                         result_batch_shape, tail_shape);
+    auto shape_tensor = shape::ToExtentTensorOp::create(
+        *rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {static_cast<int64_t>(result_shape.size())},
             rewriter->getIndexType()),
         shape);
-    *out_side = rewriter->create<TF::BroadcastToOp>(loc, result_type, side,
-                                                    shape_tensor);
+    *out_side = TF::BroadcastToOp::create(*rewriter, loc, result_type, side,
+                                          shape_tensor);
   };
   broadcast_one_side(lhs, lhs_type, lhs_splitted.getTail(), out_lhs);
   broadcast_one_side(rhs, rhs_type, rhs_splitted.getTail(), out_rhs);
@@ -3177,10 +3204,10 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern<TF::BatchMatMulV2Op> {
     auto rhs_type = mlir::dyn_cast<RankedTensorType>(rhs.getType());
     if (!lhs_type || !rhs_type) return failure();
     if (mlir::isa<ComplexType>(lhs_type.getElementType()) && op.getAdjX()) {
-      lhs = rewriter.create<TF::ConjOp>(op.getLoc(), lhs_type, lhs);
+      lhs = TF::ConjOp::create(rewriter, op.getLoc(), lhs_type, lhs);
     }
     if (mlir::isa<ComplexType>(rhs_type.getElementType()) && op.getAdjY()) {
-      rhs = rewriter.create<TF::ConjOp>(op.getLoc(), rhs_type, rhs);
+      rhs = TF::ConjOp::create(rewriter, op.getLoc(), rhs_type, rhs);
     }
 
     // Broadcast both operands.
@@ -3288,8 +3315,8 @@ class ConvertSplitOp : public OpRewritePattern<TF::SplitOp> {
     for (int i = 0; i < num_splits; ++i) {
       begin_indices[dim_index] = i * slice_size;
       end_indices[dim_index] = (i + 1) * slice_size;
-      slices.push_back(rewriter.create<stablehlo::SliceOp>(
-          op.getLoc(), slice_type, op.getValue(),
+      slices.push_back(stablehlo::SliceOp::create(
+          rewriter, op.getLoc(), slice_type, op.getValue(),
           GetI64ArrayAttr(begin_indices, &rewriter),
           GetI64ArrayAttr(end_indices, &rewriter),
           GetI64ArrayAttr(strides, &rewriter)));
@@ -3332,23 +3359,23 @@ class ConvertSplitOpDynamic : public OpRewritePattern<TF::SplitOp> {
     if (dim_index < 0) dim_index += input_rank;
 
     Value input_dim_size =
-        rewriter.create<tensor::DimOp>(loc, input, dim_index);
+        tensor::DimOp::create(rewriter, loc, input, dim_index);
     // Calculate the dimension size for each slice along the split dimension.
     int num_splits = op.getNumResults();
-    Value num_splits_value = rewriter.create<arith::ConstantOp>(
-        loc, rewriter.getIndexAttr(num_splits));
+    Value num_splits_value = arith::ConstantOp::create(
+        rewriter, loc, rewriter.getIndexAttr(num_splits));
     Value slice_size =
-        rewriter.create<arith::DivSIOp>(loc, input_dim_size, num_splits_value);
+        arith::DivSIOp::create(rewriter, loc, input_dim_size, num_splits_value);
 
-    Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-    Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+    Value zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
+    Value one = arith::ConstantIndexOp::create(rewriter, loc, 1);
 
     SmallVector<Value, 4> begin_indices(input_rank, zero);
     SmallVector<Value, 4> end_indices;
     end_indices.reserve(input_rank);
     SmallVector<Value, 4> strides(input_rank, one);
     for (int i = 0; i < input_rank; ++i) {
-      end_indices.push_back(rewriter.create<tensor::DimOp>(loc, input, i));
+      end_indices.push_back(tensor::DimOp::create(rewriter, loc, input, i));
     }
 
     // All HLO d_slice results used to replace the original tf.Split op.
@@ -3356,30 +3383,32 @@ class ConvertSplitOpDynamic : public OpRewritePattern<TF::SplitOp> {
     slices.reserve(num_splits);
 
     for (int i = 0; i < num_splits; ++i) {
-      begin_indices[dim_index] = rewriter.create<arith::MulIOp>(
-          loc, slice_size, rewriter.create<arith::ConstantIndexOp>(loc, i));
-      end_indices[dim_index] = rewriter.create<arith::MulIOp>(
-          loc, slice_size, rewriter.create<arith::ConstantIndexOp>(loc, i + 1));
+      begin_indices[dim_index] = arith::MulIOp::create(
+          rewriter, loc, slice_size,
+          arith::ConstantIndexOp::create(rewriter, loc, i));
+      end_indices[dim_index] = arith::MulIOp::create(
+          rewriter, loc, slice_size,
+          arith::ConstantIndexOp::create(rewriter, loc, i + 1));
 
       Type index_ty = rewriter.getIndexType();
-      auto begin_value = rewriter.create<tensor::FromElementsOp>(
-          loc,
+      auto begin_value = tensor::FromElementsOp::create(
+          rewriter, loc,
           tensorflow::GetTypeFromTFTensorShape(
               {static_cast<int64_t>(begin_indices.size())}, index_ty),
           begin_indices);
-      auto end_value = rewriter.create<tensor::FromElementsOp>(
-          loc,
+      auto end_value = tensor::FromElementsOp::create(
+          rewriter, loc,
           tensorflow::GetTypeFromTFTensorShape(
               {static_cast<int64_t>(end_indices.size())}, index_ty),
           end_indices);
-      auto stride_value = rewriter.create<tensor::FromElementsOp>(
-          loc,
+      auto stride_value = tensor::FromElementsOp::create(
+          rewriter, loc,
           tensorflow::GetTypeFromTFTensorShape(
               {static_cast<int64_t>(strides.size())}, index_ty),
           strides);
-      slices.push_back(rewriter.create<stablehlo::RealDynamicSliceOp>(
-          loc, op.getOperation()->getResult(i).getType(), input, begin_value,
-          end_value, stride_value));
+      slices.push_back(stablehlo::RealDynamicSliceOp::create(
+          rewriter, loc, op.getOperation()->getResult(i).getType(), input,
+          begin_value, end_value, stride_value));
     }
 
     rewriter.replaceOp(op, slices);
@@ -3484,10 +3513,11 @@ class ConvertSplitVOp : public OpRewritePattern<TF::SplitVOp> {
 
     for (int i = 0, end = op.getNumResults(); i < end; ++i) {
       end_indices[dim_index] = begin_indices[dim_index] + split_sizes[i];
-      slices.push_back(rewriter.create<stablehlo::SliceOp>(
-          op.getLoc(), op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter),
-          GetI64ArrayAttr(end_indices, &rewriter),
-          GetI64ArrayAttr(strides, &rewriter)));
+      slices.push_back(
+          stablehlo::SliceOp::create(rewriter, op.getLoc(), op.getValue(),
+                                     GetI64ArrayAttr(begin_indices, &rewriter),
+                                     GetI64ArrayAttr(end_indices, &rewriter),
+                                     GetI64ArrayAttr(strides, &rewriter)));
       // Prepare the begin indice for the next slice.
       begin_indices[dim_index] = end_indices[dim_index];
     }
@@ -3568,11 +3598,11 @@ class ConvertStridedSliceOp : public OpRewritePattern<TF::StridedSliceOp> {
     Location loc = op.getLoc();
     Value input = op.getInput();
     if (!dims_to_reverse.empty())
-      input = rewriter.create<stablehlo::ReverseOp>(
-          loc, input_ty, op.getInput(),
+      input = stablehlo::ReverseOp::create(
+          rewriter, loc, input_ty, op.getInput(),
           GetI64ArrayAttr(dims_to_reverse, &rewriter));
-    auto sliced = rewriter.create<stablehlo::SliceOp>(
-        loc, input, GetI64ArrayAttr(hlo_begin_indices, &rewriter),
+    auto sliced = stablehlo::SliceOp::create(
+        rewriter, loc, input, GetI64ArrayAttr(hlo_begin_indices, &rewriter),
         GetI64ArrayAttr(hlo_end_indices, &rewriter),
         GetI64ArrayAttr(hlo_strides, &rewriter));
 
@@ -3663,21 +3693,21 @@ class ConvertStridedSliceOp : public OpRewritePattern<TF::StridedSliceOp> {
         continue;
       }
 
-      auto index = rewriter.create<stablehlo::SliceOp>(
-          loc, op.getBegin(), GetI64ArrayAttr({d}, &rewriter),
+      auto index = stablehlo::SliceOp::create(
+          rewriter, loc, op.getBegin(), GetI64ArrayAttr({d}, &rewriter),
           GetI64ArrayAttr({d + 1}, &rewriter), GetI64ArrayAttr({1}, &rewriter));
       // Convert index to scalar.
       auto reshaped_index =
-          rewriter.create<stablehlo::ReshapeOp>(loc, type, index);
+          stablehlo::ReshapeOp::create(rewriter, loc, type, index);
       // If the index is negative, wrap it around with dimension size.
       auto index_negative =
-          rewriter.create<TF::LessOp>(loc, reshaped_index, zero);
+          TF::LessOp::create(rewriter, loc, reshaped_index, zero);
       auto input_val = GetScalarConstOfType(begin_element_ty, loc,
                                             input_shape[d], &rewriter);
       auto wrapped_index =
-          rewriter.create<TF::AddV2Op>(loc, input_val, reshaped_index);
-      auto final_index = rewriter.create<stablehlo::SelectOp>(
-          loc, type, index_negative, wrapped_index, reshaped_index);
+          TF::AddV2Op::create(rewriter, loc, input_val, reshaped_index);
+      auto final_index = stablehlo::SelectOp::create(
+          rewriter, loc, type, index_negative, wrapped_index, reshaped_index);
       slice_begin_indices.push_back(final_index);
       slice_sizes.push_back(1);
     }
@@ -3687,8 +3717,9 @@ class ConvertStridedSliceOp : public OpRewritePattern<TF::StridedSliceOp> {
         slice_sizes, op.getType().getElementType());
     // This must be an xla DynamicSlice op due to the inputs that aren't
     // constant.
-    auto sliced = rewriter.create<stablehlo::DynamicSliceOp>(
-        loc, sliced_type, op.getInput(), slice_begin_indices, slice_sizes_attr);
+    auto sliced = stablehlo::DynamicSliceOp::create(
+        rewriter, loc, sliced_type, op.getInput(), slice_begin_indices,
+        slice_sizes_attr);
 
     // Reshape slice result so that the shape is updated depending on
     // 'new_axis_mask' or 'shrink_axis_mask' attributes.
@@ -3760,9 +3791,9 @@ class ConvertStridedSliceGradOp
     Type element_type = mlir::cast<ShapedType>(grad.getType()).getElementType();
 
     // Perform reshape to undo any new/shrink axes done by strided slice.
-    grad = rewriter.create<stablehlo::ReshapeOp>(
-        op.getLoc(), tensorflow::GetTypeFromTFTensorShape(shape, element_type),
-        grad);
+    grad = stablehlo::ReshapeOp::create(
+        rewriter, op.getLoc(),
+        tensorflow::GetTypeFromTFTensorShape(shape, element_type), grad);
 
     SmallVector<int64_t, 4> padding_low, padding_high, padding_interm;
     SmallVector<int64_t, 4> dims_to_reverse;
@@ -3797,8 +3828,8 @@ class ConvertStridedSliceGradOp
     }
 
     if (!dims_to_reverse.empty()) {
-      grad = rewriter.create<stablehlo::ReverseOp>(
-          op.getLoc(), grad.getType(), grad,
+      grad = stablehlo::ReverseOp::create(
+          rewriter, op.getLoc(), grad.getType(), grad,
           GetI64ArrayAttr(dims_to_reverse, &rewriter));
     }
 
@@ -3840,10 +3871,10 @@ class ConvertRangeOp : public OpRewritePattern<TF::RangeOp> {
       return failure();
     }
 
-    auto iota = rewriter.create<stablehlo::IotaOp>(
-        op.getLoc(), result_type, rewriter.getI64IntegerAttr(0));
-    auto scaled = rewriter.create<chlo::BroadcastMulOp>(
-        op.getLoc(), result_type, iota, op.getDelta(),
+    auto iota = stablehlo::IotaOp::create(rewriter, op.getLoc(), result_type,
+                                          rewriter.getI64IntegerAttr(0));
+    auto scaled = chlo::BroadcastMulOp::create(
+        rewriter, op.getLoc(), result_type, iota, op.getDelta(),
         hlo::getBroadcastDimensionsAttr(&rewriter, iota, op.getDelta()));
     rewriter.replaceOpWithNewOp<chlo::BroadcastAddOp>(
         op, result_type, scaled, op.getStart(),
@@ -3893,25 +3924,25 @@ class ConvertDynamicRangeOp : public OpRewritePattern<TF::RangeOp> {
     //
     // %size = ceil(abs((%limit - %start) / %delta))
     auto range =
-        rewriter.create<stablehlo::SubtractOp>(op.getLoc(), limit, start);
-    auto abs = rewriter.create<stablehlo::AbsOp>(op.getLoc(), range);
+        stablehlo::SubtractOp::create(rewriter, op.getLoc(), limit, start);
+    auto abs = stablehlo::AbsOp::create(rewriter, op.getLoc(), range);
 
     // Delta is not necessarily the same type as start and limit.
     auto abs_cast =
-        rewriter.create<stablehlo::ConvertOp>(op.getLoc(), compute_type, abs);
-    auto delta_cast =
-        rewriter.create<stablehlo::ConvertOp>(op.getLoc(), compute_type, delta);
+        stablehlo::ConvertOp::create(rewriter, op.getLoc(), compute_type, abs);
+    auto delta_cast = stablehlo::ConvertOp::create(rewriter, op.getLoc(),
+                                                   compute_type, delta);
 
     // Compute the total number of integer steps and convert to the HLO
     // dimension tensor.
     auto normalized =
-        rewriter.create<stablehlo::DivOp>(op.getLoc(), abs_cast, delta_cast);
-    auto ceil = rewriter.create<stablehlo::CeilOp>(op.getLoc(), normalized);
-    auto steps = rewriter.create<stablehlo::ConvertOp>(
-        op.getLoc(),
+        stablehlo::DivOp::create(rewriter, op.getLoc(), abs_cast, delta_cast);
+    auto ceil = stablehlo::CeilOp::create(rewriter, op.getLoc(), normalized);
+    auto steps = stablehlo::ConvertOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()), ceil);
-    auto reshape = rewriter.create<stablehlo::ReshapeOp>(
-        op.getLoc(),
+    auto reshape = stablehlo::ReshapeOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getI64Type()),
         steps);
 
@@ -3920,15 +3951,16 @@ class ConvertDynamicRangeOp : public OpRewritePattern<TF::RangeOp> {
     // %range = %start + %delta * iota(%size)
     auto out_scalar_type = tensorflow::GetTypeFromTFTensorShape(
         {}, getElementTypeOrSelf(result_type));
-    auto start_out_cast = rewriter.create<stablehlo::ConvertOp>(
-        op.getLoc(), out_scalar_type, start);
-    auto delta_out_cast = rewriter.create<stablehlo::ConvertOp>(
-        op.getLoc(), out_scalar_type, delta);
-
-    auto iota = rewriter.create<stablehlo::DynamicIotaOp>(
-        op.getLoc(), result_type, reshape, rewriter.getI64IntegerAttr(0));
-    auto scaled = rewriter.create<chlo::BroadcastMulOp>(
-        op.getLoc(), result_type, iota, delta_out_cast,
+    auto start_out_cast = stablehlo::ConvertOp::create(rewriter, op.getLoc(),
+                                                       out_scalar_type, start);
+    auto delta_out_cast = stablehlo::ConvertOp::create(rewriter, op.getLoc(),
+                                                       out_scalar_type, delta);
+
+    auto iota = stablehlo::DynamicIotaOp::create(rewriter, op.getLoc(),
+                                                 result_type, reshape,
+                                                 rewriter.getI64IntegerAttr(0));
+    auto scaled = chlo::BroadcastMulOp::create(
+        rewriter, op.getLoc(), result_type, iota, delta_out_cast,
         hlo::getBroadcastDimensionsAttr(&rewriter, iota, delta_cast));
     rewriter.replaceOpWithNewOp<chlo::BroadcastAddOp>(
         op, result_type, scaled, start_out_cast,
@@ -3979,29 +4011,32 @@ class ConvertLinSpaceOp : public OpRewritePattern<TF::LinSpaceOp> {
     int64_t num = (*num_attr.begin()).getSExtValue();
 
     // Calculate the scaling that needs to be applied to the iota.
-    auto step_numerator = rewriter.create<chlo::BroadcastSubOp>(
-        op.getLoc(), op.getStart().getType(), op.getStop(), op.getStart(),
+    auto step_numerator = chlo::BroadcastSubOp::create(
+        rewriter, op.getLoc(), op.getStart().getType(), op.getStop(),
+        op.getStart(),
         hlo::getBroadcastDimensionsAttr(&rewriter, op.getStop(),
                                         op.getStart()));
-    Value step_denominator = rewriter.create<stablehlo::ConvertOp>(
-        op.getLoc(), op.getNum(), result_type.getElementType());
+    Value step_denominator = stablehlo::ConvertOp::create(
+        rewriter, op.getLoc(), op.getNum(), result_type.getElementType());
     if (num > 1) {
       Value one = GetScalarConstOfType(result_type.getElementType(),
                                        op.getLoc(), 1, &rewriter);
-      step_denominator = rewriter.create<chlo::BroadcastSubOp>(
-          op.getLoc(), step_denominator.getType(), step_denominator, one,
+      step_denominator = chlo::BroadcastSubOp::create(
+          rewriter, op.getLoc(), step_denominator.getType(), step_denominator,
+          one,
           hlo::getBroadcastDimensionsAttr(&rewriter, step_denominator, one));
     }
-    auto step = rewriter.create<chlo::BroadcastDivOp>(
-        op.getLoc(), step_numerator.getType(), step_numerator, step_denominator,
+    auto step = chlo::BroadcastDivOp::create(
+        rewriter, op.getLoc(), step_numerator.getType(), step_numerator,
+        step_denominator,
         hlo::getBroadcastDimensionsAttr(&rewriter, step_numerator,
                                         step_denominator));
 
     // Scale the iota and add the offset.
-    auto iota = rewriter.create<stablehlo::IotaOp>(
-        op.getLoc(), result_type, rewriter.getI64IntegerAttr(0));
-    auto scaled = rewriter.create<chlo::BroadcastMulOp>(
-        op.getLoc(), result_type, iota, step,
+    auto iota = stablehlo::IotaOp::create(rewriter, op.getLoc(), result_type,
+                                          rewriter.getI64IntegerAttr(0));
+    auto scaled = chlo::BroadcastMulOp::create(
+        rewriter, op.getLoc(), result_type, iota, step,
         hlo::getBroadcastDimensionsAttr(&rewriter, iota, step));
     rewriter.replaceOpWithNewOp<chlo::BroadcastAddOp>(
         op, result_type, scaled, op.getStart(),
@@ -4068,14 +4103,14 @@ class GenericConvertReductionOp : public OpRewritePattern<OpTy> {
     // repeated arithmetic operations.
     Type reduce_element_type =
         is_accumulation ? GetAccumulationType(element_type) : element_type;
-    auto casted_input = rewriter.create<stablehlo::ConvertOp>(
-        loc, op.getInput(), reduce_element_type);
+    auto casted_input = stablehlo::ConvertOp::create(
+        rewriter, loc, op.getInput(), reduce_element_type);
 
     // Each reduction op can have a different initial value.
     Value init = Derived::GetInitialValue(reduce_element_type, loc, &rewriter);
 
-    auto reduction = rewriter.create<stablehlo::ReduceOp>(
-        loc, casted_input.getResult(), init,
+    auto reduction = stablehlo::ReduceOp::create(
+        rewriter, loc, casted_input.getResult(), init,
         GetI64ArrayAttr(xla_dimensions, &rewriter), reduce_element_type);
     BuildReduceBody<ReductionOp>(reduce_element_type, &reduction.getBody(),
                                  &rewriter);
@@ -4083,32 +4118,34 @@ class GenericConvertReductionOp : public OpRewritePattern<OpTy> {
 
     // The mean op needs to divide by the product of the reduced dimensions.
     if (std::is_same<OpTy, TF::MeanOp>::value) {
-      Value in_shape = rewriter.create<shape::ShapeOfOp>(loc, op.getInput());
-      Value divisor_count = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+      Value in_shape = shape::ShapeOfOp::create(rewriter, loc, op.getInput());
+      Value divisor_count = arith::ConstantIndexOp::create(rewriter, loc, 1);
       for (size_t i = 0; i < input_shape.size(); ++i) {
         if (reduced_dimensions_bitmap[i]) {
-          Value index = rewriter.create<arith::ConstantIndexOp>(loc, i);
-          auto dim = rewriter.create<tensor::ExtractOp>(loc, in_shape, index);
+          Value index = arith::ConstantIndexOp::create(rewriter, loc, i);
+          auto dim = tensor::ExtractOp::create(rewriter, loc, in_shape, index);
           divisor_count =
-              rewriter.create<arith::MulIOp>(loc, divisor_count, dim);
+              arith::MulIOp::create(rewriter, loc, divisor_count, dim);
         }
       }
       // HLO ops are only defined on tensors, so we cast the divisor from
       // index -> i64 -> tensor<1xi64> -> tensor<i64> -> tensor<reduction type>
-      Value divisor_casted = rewriter.create<arith::IndexCastOp>(
-          loc, rewriter.getI64Type(), divisor_count);
-      Value divisor_tensor = rewriter.create<tensor::FromElementsOp>(
-          loc, tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()),
+      Value divisor_casted = arith::IndexCastOp::create(
+          rewriter, loc, rewriter.getI64Type(), divisor_count);
+      Value divisor_tensor = tensor::FromElementsOp::create(
+          rewriter, loc,
+          tensorflow::GetTypeFromTFTensorShape({}, rewriter.getI64Type()),
           divisor_casted);
-      Value divisor = rewriter.create<stablehlo::ConvertOp>(
-          loc, tensorflow::GetTypeFromTFTensorShape({}, reduce_element_type),
+      Value divisor = stablehlo::ConvertOp::create(
+          rewriter, loc,
+          tensorflow::GetTypeFromTFTensorShape({}, reduce_element_type),
           divisor_tensor);
       auto broadcast_dims = rewriter.getDenseI64ArrayAttr({});
-      result = rewriter.create<chlo::BroadcastDivOp>(loc, result, divisor,
-                                                     broadcast_dims);
+      result = chlo::BroadcastDivOp::create(rewriter, loc, result, divisor,
+                                            broadcast_dims);
     }
 
-    result = rewriter.create<stablehlo::ConvertOp>(loc, result, element_type);
+    result = stablehlo::ConvertOp::create(rewriter, loc, result, element_type);
 
     // Need to reshape back after the reduction if we're keeping the reduced
     // dimensions. Note that we do this through successive (nominally 1)
@@ -4122,8 +4159,8 @@ class GenericConvertReductionOp : public OpRewritePattern<OpTy> {
         if (dim_is_reduced.value()) {
           auto index_attr = GetI32ElementsAttr(
               {static_cast<int>(dim_is_reduced.index())}, &rewriter);
-          Value index = rewriter.create<arith::ConstantOp>(loc, index_attr);
-          result = rewriter.create<TF::ExpandDimsOp>(loc, result, index);
+          Value index = arith::ConstantOp::create(rewriter, loc, index_attr);
+          result = TF::ExpandDimsOp::create(rewriter, loc, result, index);
         }
       }
     }
@@ -4300,15 +4337,15 @@ class ConvertArgMinMaxOp : public OpRewritePattern<OpTy> {
 
     IntegerAttr iota_dimension =
         IntegerAttr::get(rewriter.getIntegerType(64), axis);
-    Value input_shape = rewriter.create<shape::ShapeOfOp>(loc, op.getInput());
-    Value index_values = rewriter.create<stablehlo::DynamicIotaOp>(
-        loc, index_type, input_shape, iota_dimension);
+    Value input_shape = shape::ShapeOfOp::create(rewriter, loc, op.getInput());
+    Value index_values = stablehlo::DynamicIotaOp::create(
+        rewriter, loc, index_type, input_shape, iota_dimension);
 
     Value operands[] = {op.getInput(), index_values};
     Value init_values[] = {init_value, index_init_value};
 
-    auto reduction = rewriter.create<stablehlo::ReduceOp>(
-        loc, llvm::ArrayRef<Value>(operands),
+    auto reduction = stablehlo::ReduceOp::create(
+        rewriter, loc, llvm::ArrayRef<Value>(operands),
         llvm::ArrayRef<Value>(init_values), GetI64ArrayAttr({axis}, &rewriter),
         TypeRange({input_element_type, index_element_type}));
     auto direction = Derived::GetDirection();
@@ -4426,14 +4463,14 @@ class ConvertTensorScatterOp : public OpRewritePattern<OpTy> {
       auto const_attr = GetI64ElementsAttr(expected_update_shape, &rewriter);
 
       auto const_op =
-          rewriter.create<TF::ConstOp>(op->getLoc(), const_type, const_attr);
+          TF::ConstOp::create(rewriter, op->getLoc(), const_type, const_attr);
 
       auto broadcast_to_type = tensorflow::GetTypeFromTFTensorShape(
           llvm::ArrayRef<int64_t>(expected_update_shape),
           updates_ty.getElementType());
 
-      updates = rewriter.create<TF::BroadcastToOp>(
-          op->getLoc(), broadcast_to_type, op.getUpdates(), const_op);
+      updates = TF::BroadcastToOp::create(
+          rewriter, op->getLoc(), broadcast_to_type, op.getUpdates(), const_op);
 
       updates_ty = mlir::dyn_cast<RankedTensorType>(updates.getType());
     }
@@ -4455,9 +4492,9 @@ class ConvertTensorScatterOp : public OpRewritePattern<OpTy> {
         indices_rank - 1);
 
     Location loc = op.getLoc();
-    auto scatter = rewriter.create<stablehlo::ScatterOp>(
-        loc, op.getType(), ValueRange(Value(op.getTensor())), op.getIndices(),
-        updates, dims_attr);
+    auto scatter = stablehlo::ScatterOp::create(
+        rewriter, loc, op.getType(), ValueRange(Value(op.getTensor())),
+        op.getIndices(), updates, dims_attr);
     Derived::BuildScatterBody(tensor_ty.getElementType(),
                               &scatter.getUpdateComputation(), loc, rewriter);
 
@@ -4479,7 +4516,7 @@ class ConvertTensorScatterUpdateOp
     Type type =
         tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type);
     block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
-    builder.create<stablehlo::ReturnOp>(loc, block->getArgument(1));
+    stablehlo::ReturnOp::create(builder, loc, block->getArgument(1));
   }
 };
 
@@ -4496,9 +4533,9 @@ class ConvertTensorScatterAddOp
     Type type =
         tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type);
     block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
-    auto add_op = builder.create<stablehlo::AddOp>(loc, block->getArgument(0),
-                                                   block->getArgument(1));
-    builder.create<stablehlo::ReturnOp>(loc, add_op.getResult());
+    auto add_op = stablehlo::AddOp::create(builder, loc, block->getArgument(0),
+                                           block->getArgument(1));
+    stablehlo::ReturnOp::create(builder, loc, add_op.getResult());
   }
 };
 
@@ -4515,9 +4552,9 @@ class ConvertTensorScatterSubOp
     Type type =
         tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type);
     block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
-    auto sub_op = builder.create<stablehlo::SubtractOp>(
-        loc, block->getArgument(0), block->getArgument(1));
-    builder.create<stablehlo::ReturnOp>(loc, sub_op.getResult());
+    auto sub_op = stablehlo::SubtractOp::create(
+        builder, loc, block->getArgument(0), block->getArgument(1));
+    stablehlo::ReturnOp::create(builder, loc, sub_op.getResult());
   }
 };
 
@@ -4534,9 +4571,9 @@ class ConvertTensorScatterMinOp
     Type type =
         tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type);
     block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
-    auto min_op = builder.create<stablehlo::MinOp>(loc, block->getArgument(0),
-                                                   block->getArgument(1));
-    builder.create<stablehlo::ReturnOp>(loc, min_op.getResult());
+    auto min_op = stablehlo::MinOp::create(builder, loc, block->getArgument(0),
+                                           block->getArgument(1));
+    stablehlo::ReturnOp::create(builder, loc, min_op.getResult());
   }
 };
 
@@ -4553,9 +4590,9 @@ class ConvertTensorScatterMaxOp
     Type type =
         tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type);
     block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
-    auto max_op = builder.create<stablehlo::MaxOp>(loc, block->getArgument(0),
-                                                   block->getArgument(1));
-    builder.create<stablehlo::ReturnOp>(loc, max_op.getResult());
+    auto max_op = stablehlo::MaxOp::create(builder, loc, block->getArgument(0),
+                                           block->getArgument(1));
+    stablehlo::ReturnOp::create(builder, loc, max_op.getResult());
   }
 };
 
@@ -4670,10 +4707,10 @@ class ConvertTileOpDynamic : public OpRewritePattern<TF::TileOp> {
       auto dim_size = input_ty.getDimSize(i);
       if (dim_size == ShapedType::kDynamic) {
         input_shape_values.push_back(
-            rewriter.create<tensor::DimOp>(loc, input, i));
+            tensor::DimOp::create(rewriter, loc, input, i));
       } else {
-        input_shape_values.push_back(rewriter.create<arith::ConstantOp>(
-            loc, rewriter.getIndexAttr(dim_size)));
+        input_shape_values.push_back(arith::ConstantOp::create(
+            rewriter, loc, rewriter.getIndexAttr(dim_size)));
       }
     }
 
@@ -4691,12 +4728,12 @@ class ConvertTileOpDynamic : public OpRewritePattern<TF::TileOp> {
     SmallVector<Value, 4> out_dim_size;
     out_dim_size.reserve(input_rank * 2);
     for (int64_t dim_idx = 0; dim_idx < input_rank; ++dim_idx) {
-      Value index = rewriter.create<arith::ConstantOp>(
-          loc, rewriter.getIndexAttr(dim_idx));
-      Value multiples_size =
-          rewriter.create<tensor::ExtractOp>(loc, multiples, ValueRange{index});
+      Value index = arith::ConstantOp::create(rewriter, loc,
+                                              rewriter.getIndexAttr(dim_idx));
+      Value multiples_size = tensor::ExtractOp::create(rewriter, loc, multiples,
+                                                       ValueRange{index});
       Value multiples_size_casted =
-          rewriter.create<arith::IndexCastOp>(loc, index_ty, multiples_size);
+          arith::IndexCastOp::create(rewriter, loc, index_ty, multiples_size);
       out_dim_size.push_back(multiples_size_casted);
       out_dim_size.push_back(input_shape_values[dim_idx]);
     }
@@ -4707,8 +4744,8 @@ class ConvertTileOpDynamic : public OpRewritePattern<TF::TileOp> {
     }
     auto broadcast_dims_attr = GetI64ArrayAttr(broadcast_dimensions, &rewriter);
 
-    Value out_dim_size_tensor = rewriter.create<tensor::FromElementsOp>(
-        loc,
+    Value out_dim_size_tensor = tensor::FromElementsOp::create(
+        rewriter, loc,
         tensorflow::GetTypeFromTFTensorShape(
             {static_cast<int64_t>(out_dim_size.size())}, index_ty),
         out_dim_size);
@@ -4716,19 +4753,21 @@ class ConvertTileOpDynamic : public OpRewritePattern<TF::TileOp> {
                                             ShapedType::kDynamic);
     RankedTensorType broadcast_type =
         tensorflow::GetTypeFromTFTensorShape(broadcast_shape, element_type);
-    Value broadcast = rewriter.create<stablehlo::DynamicBroadcastInDimOp>(
-        loc, broadcast_type, input, out_dim_size_tensor, broadcast_dims_attr);
+    Value broadcast = stablehlo::DynamicBroadcastInDimOp::create(
+        rewriter, loc, broadcast_type, input, out_dim_size_tensor,
+        broadcast_dims_attr);
 
     // %shape = [MS1, MS2]
     SmallVector<Value, 4> shape_values;
     shape_values.reserve(input_rank);
     for (int64_t i = 0; i < input_rank; ++i) {
-      Value dim_size_value = rewriter.create<mlir::arith::MulIOp>(
-          loc, out_dim_size[2 * i], out_dim_size[2 * i + 1]);
+      Value dim_size_value = mlir::arith::MulIOp::create(
+          rewriter, loc, out_dim_size[2 * i], out_dim_size[2 * i + 1]);
       shape_values.push_back(dim_size_value);
     }
-    Value shape = rewriter.create<tensor::FromElementsOp>(
-        loc, tensorflow::GetTypeFromTFTensorShape({input_rank}, index_ty),
+    Value shape = tensor::FromElementsOp::create(
+        rewriter, loc,
+        tensorflow::GetTypeFromTFTensorShape({input_rank}, index_ty),
         shape_values);
     rewriter.replaceOpWithNewOp<stablehlo::DynamicReshapeOp>(op, op.getType(),
                                                              broadcast, shape);
@@ -4758,8 +4797,8 @@ class ConvertMaxPoolGradOp : public OpRewritePattern<OpTy> {
         input_ty.getShape(), op.getKsize(), op.getStrides(), op.getPadding(),
         &rewriter);
 
-    auto result = rewriter.create<stablehlo::SelectAndScatterOp>(
-        loc, op.getType(), op.getOrigInput(), op.getGrad(),
+    auto result = stablehlo::SelectAndScatterOp::create(
+        rewriter, loc, op.getType(), op.getOrigInput(), op.getGrad(),
         GetScalarConstOfType(element_type, loc, 0, &rewriter),
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getKsize()), &rewriter),
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getStrides()), &rewriter),
@@ -4776,10 +4815,10 @@ class ConvertMaxPoolGradOp : public OpRewritePattern<OpTy> {
           tensorflow::GetTypeFromTFTensorShape(/*shape=*/{}, element_type);
       block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
 
-      auto reducer = rewriter.create<stablehlo::CompareOp>(
-          loc, block->getArgument(0), block->getArgument(1),
+      auto reducer = stablehlo::CompareOp::create(
+          rewriter, loc, block->getArgument(0), block->getArgument(1),
           stablehlo::ComparisonDirection::GE);
-      rewriter.create<stablehlo::ReturnOp>(loc, reducer.getResult());
+      stablehlo::ReturnOp::create(rewriter, loc, reducer.getResult());
     }
 
     rewriter.replaceOp(op, result);
@@ -4955,7 +4994,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern<OpTy> {
       Type filter_element_ty = filter_ty.getElementType();
       auto ty =
           tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty);
-      filter = rewriter.create<stablehlo::ReshapeOp>(op.getLoc(), ty, filter);
+      filter = stablehlo::ReshapeOp::create(rewriter, op.getLoc(), ty, filter);
 
       // 2. Transpose to [H, W, ..., G, filter_in_depth, out_depth / G].
       llvm::SmallVector<int64_t, 6> perm(num_dims + 1);
@@ -4963,15 +5002,15 @@ class ConvertConvBackpropInputOp : public OpRewritePattern<OpTy> {
       std::swap(perm[num_spatial_dims], perm[num_spatial_dims + 1]);
       std::swap(new_shape[num_spatial_dims], new_shape[num_spatial_dims + 1]);
       ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty);
-      filter = rewriter.create<stablehlo::TransposeOp>(
-          op.getLoc(), ty, filter, GetI64ArrayAttr(perm, &rewriter));
+      filter = stablehlo::TransposeOp::create(rewriter, op.getLoc(), ty, filter,
+                                              GetI64ArrayAttr(perm, &rewriter));
 
       // 3. Reshape to [H, W, ..., in_depth, out_depth / G].
       new_shape[num_spatial_dims] *= new_shape[num_spatial_dims + 1];
       new_shape[num_spatial_dims + 1] = new_shape.back();
       new_shape.pop_back();
       ty = tensorflow::GetTypeFromTFTensorShape(new_shape, filter_element_ty);
-      filter = rewriter.create<stablehlo::ReshapeOp>(op.getLoc(), ty, filter);
+      filter = stablehlo::ReshapeOp::create(rewriter, op.getLoc(), ty, filter);
     }
 
     SmallVector<int64_t, 4> kernel_spatial_dims;
@@ -4979,13 +5018,14 @@ class ConvertConvBackpropInputOp : public OpRewritePattern<OpTy> {
     std::iota(kernel_spatial_dims.begin(), kernel_spatial_dims.end(), 0);
 
     // Mirror the filter in the spatial dimensions.
-    filter = rewriter.create<stablehlo::ReverseOp>(
-        op.getLoc(), filter, GetI64ArrayAttr(kernel_spatial_dims, &rewriter));
+    filter = stablehlo::ReverseOp::create(
+        rewriter, op.getLoc(), filter,
+        GetI64ArrayAttr(kernel_spatial_dims, &rewriter));
 
     // activation gradients
     //   = gradients (with padding and dilation) <conv> mirrored_weights
-    Value result = rewriter.create<stablehlo::ConvolutionOp>(
-        op.getLoc(), op.getType(), op.getOutBackprop(), filter,
+    Value result = stablehlo::ConvolutionOp::create(
+        rewriter, op.getLoc(), op.getType(), op.getOutBackprop(), filter,
         /*window_strides=*/
         GetI64ArrayAttrForValue(/*size=*/num_spatial_dims, /*val=*/1,
                                 &rewriter),
@@ -5191,8 +5231,8 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern<OpTy> {
     const int batch_dim =
         tensorflow::GetTensorBatchDimIndex(num_dims, data_format);
 
-    Value result = rewriter.create<stablehlo::ConvolutionOp>(
-        op.getLoc(), op.getType(), op.getInput(), op.getOutBackprop(),
+    Value result = stablehlo::ConvolutionOp::create(
+        rewriter, op.getLoc(), op.getType(), op.getInput(), op.getOutBackprop(),
         /*window_strides=*/GetI64ArrayAttr(window_strides, &rewriter),
         /*padding=*/paddings_attr, /*lhs_dilation=*/
         GetI64ArrayAttrForValue(/*size=*/num_spatial_dims, /*val=*/1,
@@ -5331,14 +5371,15 @@ class ConvertInfeedDequeueTupleOp
 
     // Infeed takes a single token operand. Generate the token using
     // create_token op to pass to the infeed op.
-    auto token = rewriter.create<stablehlo::CreateTokenOp>(
-        op.getLoc(), stablehlo::TokenType::get(rewriter.getContext()));
+    auto token = stablehlo::CreateTokenOp::create(
+        rewriter, op.getLoc(),
+        stablehlo::TokenType::get(rewriter.getContext()));
 
     result_types.push_back(token.getType());
 
     ArrayAttr layout;  // filled in during the xla-adjust-layout pass
-    auto data_and_token = rewriter.create<stablehlo::InfeedOp>(
-        op.getLoc(), result_types, token,
+    auto data_and_token = stablehlo::InfeedOp::create(
+        rewriter, op.getLoc(), result_types, token,
         /*infeed_config=*/rewriter.getStringAttr(""),
         /*layout=*/layout);
 
@@ -5409,11 +5450,11 @@ class ConvertOutfeedEnqueueTupleOp
                                 PatternRewriter &rewriter) const override {
     auto token_type = stablehlo::TokenType::get(rewriter.getContext());
     auto token =
-        rewriter.create<stablehlo::CreateTokenOp>(op.getLoc(), token_type);
+        stablehlo::CreateTokenOp::create(rewriter, op.getLoc(), token_type);
 
-    rewriter.create<stablehlo::OutfeedOp>(
-        op.getLoc(), token_type, op.getInputs(), token,
-        /*outfeed_config=*/rewriter.getStringAttr(""));
+    stablehlo::OutfeedOp::create(rewriter, op.getLoc(), token_type,
+                                 op.getInputs(), token,
+                                 /*outfeed_config=*/rewriter.getStringAttr(""));
     rewriter.eraseOp(op);
     return success();
   }
@@ -5475,14 +5516,15 @@ class ConvertUnpackOp : public OpRewritePattern<TF::UnpackOp> {
       begin_indices[axis] = i;
       end_indices[axis] = i + 1;
 
-      auto slice_op = rewriter.create<stablehlo::SliceOp>(
-          op.getLoc(), op.getValue(), GetI64ArrayAttr(begin_indices, &rewriter),
-          GetI64ArrayAttr(end_indices, &rewriter),
-          GetI64ArrayAttr(strides, &rewriter));
+      auto slice_op =
+          stablehlo::SliceOp::create(rewriter, op.getLoc(), op.getValue(),
+                                     GetI64ArrayAttr(begin_indices, &rewriter),
+                                     GetI64ArrayAttr(end_indices, &rewriter),
+                                     GetI64ArrayAttr(strides, &rewriter));
       // Reshape to drop the axis dimension.
-      auto result = rewriter.create<TF::SqueezeOp>(
-          op.getLoc(), op.getType(i), slice_op,
-          rewriter.getI64ArrayAttr(op.getAxis()));
+      auto result =
+          TF::SqueezeOp::create(rewriter, op.getLoc(), op.getType(i), slice_op,
+                                rewriter.getI64ArrayAttr(op.getAxis()));
       results.push_back(result);
     }
 
@@ -5525,16 +5567,16 @@ class ConvertUnpackOpDynamic : public OpRewritePattern<TF::UnpackOp> {
     for (int64_t dim_idx = 0; dim_idx < value_rank; ++dim_idx) {
       int64_t dim_size = value_type.getDimSize(dim_idx);
       if (dim_size == ShapedType::kDynamic) {
-        Value dim_i = rewriter.create<arith::IndexCastOp>(
-            loc, shape_scalar_type,
-            rewriter.create<tensor::DimOp>(loc, op.getOperand(), dim_idx));
+        Value dim_i = arith::IndexCastOp::create(
+            rewriter, loc, shape_scalar_type,
+            tensor::DimOp::create(rewriter, loc, op.getOperand(), dim_idx));
         end_indices.push_back(dim_i);
         if (dim_idx != axis) {
           shape_values.push_back(dim_i);
         }
       } else {
-        Value dim_i = rewriter.create<arith::ConstantOp>(
-            loc, shape_scalar_type,
+        Value dim_i = arith::ConstantOp::create(
+            rewriter, loc, shape_scalar_type,
             rewriter.getIntegerAttr(shape_scalar_type, dim_size));
         end_indices.push_back(dim_i);
         if (dim_idx != axis) {
@@ -5545,44 +5587,45 @@ class ConvertUnpackOpDynamic : public OpRewritePattern<TF::UnpackOp> {
         }
       }
       begin_indices.push_back(
-          rewriter.create<arith::ConstantIntOp>(loc, 0, 32));
-      strides.push_back(rewriter.create<arith::ConstantIntOp>(loc, 1, 32));
+          arith::ConstantIntOp::create(rewriter, loc, 0, 32));
+      strides.push_back(arith::ConstantIntOp::create(rewriter, loc, 1, 32));
     }
 
     SmallVector<Value, 4> results;
     results.reserve(op.getNumResults());
     Type i32_ty = rewriter.getI32Type();
     for (int64_t i = 0; i < op.getNumResults(); ++i) {
-      begin_indices[axis] = rewriter.create<arith::ConstantIntOp>(loc, i, 32);
-      end_indices[axis] = rewriter.create<arith::ConstantIntOp>(loc, i + 1, 32);
-      Value slice_op = rewriter.create<stablehlo::RealDynamicSliceOp>(
-          loc,
+      begin_indices[axis] = arith::ConstantIntOp::create(rewriter, loc, i, 32);
+      end_indices[axis] =
+          arith::ConstantIntOp::create(rewriter, loc, i + 1, 32);
+      Value slice_op = stablehlo::RealDynamicSliceOp::create(
+          rewriter, loc,
           tensorflow::GetTypeFromTFTensorShape(slice_shape,
                                                value_type.getElementType()),
           op.getValue(),
-          rewriter.create<tensor::FromElementsOp>(
-              loc,
+          tensor::FromElementsOp::create(
+              rewriter, loc,
               tensorflow::GetTypeFromTFTensorShape(
                   {static_cast<int64_t>(begin_indices.size())}, i32_ty),
               begin_indices),
-          rewriter.create<tensor::FromElementsOp>(
-              loc,
+          tensor::FromElementsOp::create(
+              rewriter, loc,
               tensorflow::GetTypeFromTFTensorShape(
                   {static_cast<int64_t>(end_indices.size())}, i32_ty),
               end_indices),
-          rewriter.create<tensor::FromElementsOp>(
-              loc,
+          tensor::FromElementsOp::create(
+              rewriter, loc,
               tensorflow::GetTypeFromTFTensorShape(
                   {static_cast<int64_t>(strides.size())}, i32_ty),
               strides));
       // Reshape to drop the axis dimension.
-      Value new_shape = rewriter.create<tensor::FromElementsOp>(
-          loc,
+      Value new_shape = tensor::FromElementsOp::create(
+          rewriter, loc,
           tensorflow::GetTypeFromTFTensorShape(
               {static_cast<int64_t>(shape_values.size())}, i32_ty),
           shape_values);
-      Value reshape_op = rewriter.create<stablehlo::DynamicReshapeOp>(
-          loc, op.getType(i), slice_op, new_shape);
+      Value reshape_op = stablehlo::DynamicReshapeOp::create(
+          rewriter, loc, op.getType(i), slice_op, new_shape);
       results.push_back(reshape_op);
     }
 
@@ -5619,16 +5662,20 @@ class ConvertSigmoidGradOpDynamic : public OpRewritePattern<TF::SigmoidGradOp> {
       assert(mlir::isa<FloatType>(elem_tp));
       attr = rewriter.getFloatAttr(elem_tp, 1);
     }
-    Value one = rewriter.create<stablehlo::ConstantOp>(
-        loc, DenseElementsAttr::get(
-                 tensorflow::GetTypeFromTFTensorShape({}, elem_tp), attr));
-
-    auto v0 = rewriter.create<chlo::BroadcastMulOp>(
-        loc, dy, y, hlo::getBroadcastDimensionsAttr(&rewriter, dy, y));
-    auto v1 = rewriter.create<chlo::BroadcastSubOp>(
-        loc, one, y, hlo::getBroadcastDimensionsAttr(&rewriter, one, y));
-    auto result = rewriter.create<chlo::BroadcastMulOp>(
-        loc, v0, v1, hlo::getBroadcastDimensionsAttr(&rewriter, v0, v1));
+    Value one = stablehlo::ConstantOp::create(
+        rewriter, loc,
+        DenseElementsAttr::get(
+            tensorflow::GetTypeFromTFTensorShape({}, elem_tp), attr));
+
+    auto v0 = chlo::BroadcastMulOp::create(
+        rewriter, loc, dy, y,
+        hlo::getBroadcastDimensionsAttr(&rewriter, dy, y));
+    auto v1 = chlo::BroadcastSubOp::create(
+        rewriter, loc, one, y,
+        hlo::getBroadcastDimensionsAttr(&rewriter, one, y));
+    auto result = chlo::BroadcastMulOp::create(
+        rewriter, loc, v0, v1,
+        hlo::getBroadcastDimensionsAttr(&rewriter, v0, v1));
 
     rewriter.replaceOp(op, result.getOperation()->getResults());
     return success();
@@ -5684,8 +5731,8 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern<OpTy> {
     // 'operand' parameter to scatter to for the final scatter op.
     Value init = ConcreteClass::GetInitialValue(data_type.getElementType(),
                                                 op.getLoc(), &rewriter);
-    auto broadcasted_init = rewriter.create<stablehlo::BroadcastOp>(
-        op.getLoc(), output_type, init,
+    auto broadcasted_init = stablehlo::BroadcastOp::create(
+        rewriter, op.getLoc(), output_type, init,
         GetI64ArrayAttr(output_shape, &rewriter));
 
     // Parameters for the generated scatter op.
@@ -5702,9 +5749,10 @@ class GenericConvertUnsortedSegmentReductionOp : public OpRewritePattern<OpTy> {
         /*scatterIndicesBatchingDims=*/{}, scatter_dims_to_operand_dims,
         index_vector_dim);
 
-    auto scatter = rewriter.create<stablehlo::ScatterOp>(
-        op.getLoc(), op.getType(), ValueRange(Value(broadcasted_init)),
-        op.getSegmentIds(), op.getData(), dims_attr);
+    auto scatter = stablehlo::ScatterOp::create(
+        rewriter, op.getLoc(), op.getType(),
+        ValueRange(Value(broadcasted_init)), op.getSegmentIds(), op.getData(),
+        dims_attr);
     BuildReduceBody<ReductionOp>(data_type.getElementType(),
                                  &scatter.getUpdateComputation(), &rewriter);
 
@@ -5868,8 +5916,8 @@ class ConvertRandomShuffleOp : public OpRewritePattern<TF::RandomShuffleOp> {
     // Generate range(n) as the initial value for the indices to be swapped.
     auto indices_type = tensorflow::GetTypeFromTFTensorShape(
         {first_dim_size}, rewriter.getIntegerType(32));
-    Value indices = rewriter.create<stablehlo::IotaOp>(
-        op.getLoc(), indices_type, rewriter.getI64IntegerAttr(0));
+    Value indices = stablehlo::IotaOp::create(
+        rewriter, op.getLoc(), indices_type, rewriter.getI64IntegerAttr(0));
 
     // Generate random numbers to be used as swaps for the indices.
     Value swaps = CreateRngUniform32(op.getLoc(), first_dim_size, 0,
@@ -5889,22 +5937,23 @@ class ConvertRandomShuffleOp : public OpRewritePattern<TF::RandomShuffleOp> {
 
       // We need to swap the indices[i] with indices[swaps[i]]. First get
       // these index values.
-      Value source_index = builder->create<stablehlo::DynamicSliceOp>(
-          loc, indices, i, scalar_one);
-      Value swap_index = builder->create<stablehlo::ReshapeOp>(
-          loc, scalar_i32_type,
-          builder->create<stablehlo::DynamicSliceOp>(loc, swaps, i,
-                                                     scalar_one));
-      Value target_index = builder->create<stablehlo::DynamicSliceOp>(
-          loc, indices, swap_index, scalar_one);
+      Value source_index = stablehlo::DynamicSliceOp::create(
+          *builder, loc, indices, i, scalar_one);
+      Value swap_index = stablehlo::ReshapeOp::create(
+          *builder, loc, scalar_i32_type,
+          stablehlo::DynamicSliceOp::create(*builder, loc, swaps, i,
+                                            scalar_one));
+      Value target_index = stablehlo::DynamicSliceOp::create(
+          *builder, loc, indices, swap_index, scalar_one);
 
       // Then perform the swap.
       // indices[i] <- indices[swaps[i]]
-      indices = builder->create<stablehlo::DynamicUpdateSliceOp>(
-          loc, indices.getType(), indices, target_index, llvm::ArrayRef(i));
+      indices = stablehlo::DynamicUpdateSliceOp::create(
+          *builder, loc, indices.getType(), indices, target_index,
+          llvm::ArrayRef(i));
       // indices[swaps[i]] <- indices[i]
-      indices = builder->create<stablehlo::DynamicUpdateSliceOp>(
-          loc, indices.getType(), indices, source_index,
+      indices = stablehlo::DynamicUpdateSliceOp::create(
+          *builder, loc, indices.getType(), indices, source_index,
           llvm::ArrayRef(swap_index));
 
       // Update new values.
@@ -5932,25 +5981,27 @@ class ConvertRandomShuffleOp : public OpRewritePattern<TF::RandomShuffleOp> {
     SmallVector<Value> slice_sizes_values;
     for (auto i = 0; i < slice_sizes.size(); ++i) {
       if (slice_sizes[i] == tensorflow::kTFDynamicSize) {
-        Value i_const = rewriter.create<arith::ConstantOp>(
-            op.getLoc(), rewriter.getIndexAttr(i));
+        Value i_const = arith::ConstantOp::create(rewriter, op.getLoc(),
+                                                  rewriter.getIndexAttr(i));
         Value slice_size_index =
-            rewriter.create<shape::DimOp>(op.getLoc(), op.getValue(), i_const);
-        Value index_to_i64 = rewriter.create<arith::IndexCastOp>(
-            op.getLoc(), rewriter.getI64Type(), slice_size_index);
-        Value i64_to_tensor = rewriter.create<tensor::FromElementsOp>(
-            op.getLoc(),
+            shape::DimOp::create(rewriter, op.getLoc(), op.getValue(), i_const);
+        Value index_to_i64 = arith::IndexCastOp::create(
+            rewriter, op.getLoc(), rewriter.getI64Type(), slice_size_index);
+        Value i64_to_tensor = tensor::FromElementsOp::create(
+            rewriter, op.getLoc(),
             tensorflow::GetTypeFromTFTensorShape({1}, rewriter.getI64Type()),
             index_to_i64);
         slice_sizes_values.push_back(i64_to_tensor);
       } else {
-        slice_sizes_values.push_back(rewriter.create<stablehlo::ConstantOp>(
-            op.getLoc(), GetI64ElementsAttr({slice_sizes[i]}, &rewriter)));
+        slice_sizes_values.push_back(stablehlo::ConstantOp::create(
+            rewriter, op.getLoc(),
+            GetI64ElementsAttr({slice_sizes[i]}, &rewriter)));
       }
     }
 
-    auto slice_sizes_concat = rewriter.create<stablehlo::ConcatenateOp>(
-        op.getLoc(), slice_sizes_values, rewriter.getI64IntegerAttr(0));
+    auto slice_sizes_concat = stablehlo::ConcatenateOp::create(
+        rewriter, op.getLoc(), slice_sizes_values,
+        rewriter.getI64IntegerAttr(0));
     rewriter.replaceOpWithNewOp<stablehlo::DynamicGatherOp>(
         op, op.getType(), op.getValue(), swaped_indices, slice_sizes_concat,
         dims_attr);
@@ -5981,8 +6032,8 @@ class ConvertXlaShardingOp : public OpRewritePattern<TF::XlaShardingOp> {
     NamedAttribute call_target_name = rewriter.getNamedAttr(
         "call_target_name", rewriter.getStringAttr("Sharding"));
 
-    auto custom_call = rewriter.create<stablehlo::CustomCallOp>(
-        op.getLoc(), op.getType(), op.getInput(),
+    auto custom_call = stablehlo::CustomCallOp::create(
+        rewriter, op.getLoc(), op.getType(), op.getInput(),
         ArrayRef<NamedAttribute>{call_target_name});
     custom_call->setAttr(kShardingAttr, *sharding);
     rewriter.replaceOp(op, custom_call.getResult(0));
@@ -6023,8 +6074,8 @@ class ConvertInplaceUpdateOp : public OpRewritePattern<TF::InplaceUpdateOp> {
     // subsequent ones are constructed based on zero_attr. Thus the type
     // for zero_attr needs to be i32 as well.
     auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(32), 0);
-    auto unpacked_indices = rewriter.create<TF::UnpackOp>(
-        op.getLoc(), unpacked_indices_type, indices, zero_attr);
+    auto unpacked_indices = TF::UnpackOp::create(
+        rewriter, op.getLoc(), unpacked_indices_type, indices, zero_attr);
 
     SmallVector<int64_t, 4> split_updates_shape;
     split_updates_shape.append(updates_type.getShape().begin(),
@@ -6036,10 +6087,10 @@ class ConvertInplaceUpdateOp : public OpRewritePattern<TF::InplaceUpdateOp> {
         tensorflow::GetTypeFromTFTensorShape(split_updates_shape,
                                              updates_type.getElementType()));
 
-    auto cst = rewriter.create<stablehlo::ConstantOp>(op.getLoc(), zero_attr)
+    auto cst = stablehlo::ConstantOp::create(rewriter, op.getLoc(), zero_attr)
                    .getResult();
-    auto split_updates = rewriter.create<TF::SplitOp>(
-        op.getLoc(), split_updates_type, cst, updates);
+    auto split_updates = TF::SplitOp::create(rewriter, op.getLoc(),
+                                             split_updates_type, cst, updates);
 
     SmallVector<Value, 6> input_indices;
     input_indices.resize(input_type.getRank(), cst);
@@ -6047,8 +6098,9 @@ class ConvertInplaceUpdateOp : public OpRewritePattern<TF::InplaceUpdateOp> {
     for (auto pair :
          llvm::zip(unpacked_indices.getOutput(), split_updates.getOutput())) {
       input_indices.front() = std::get<0>(pair);
-      input = rewriter.create<stablehlo::DynamicUpdateSliceOp>(
-          op.getLoc(), op.getType(), input, std::get<1>(pair), input_indices);
+      input = stablehlo::DynamicUpdateSliceOp::create(
+          rewriter, op.getLoc(), op.getType(), input, std::get<1>(pair),
+          input_indices);
     }
 
     rewriter.replaceOp(op, input);
@@ -6073,8 +6125,8 @@ class ConvertXlaDynamicUpdateSliceOp
     SmallVector<Type, 4> unpacked_indices_type(
         indices_type.getDimSize(0), tensorflow::GetTypeFromTFTensorShape(
                                         {}, indices_type.getElementType()));
-    auto unpacked_indices = rewriter.create<TF::UnpackOp>(
-        op.getLoc(), unpacked_indices_type, op.getIndices(),
+    auto unpacked_indices = TF::UnpackOp::create(
+        rewriter, op.getLoc(), unpacked_indices_type, op.getIndices(),
         IntegerAttr::get(rewriter.getIntegerType(64), 0));
     rewriter.replaceOpWithNewOp<stablehlo::DynamicUpdateSliceOp>(
         op, op.getType(), op.getInput(), op.getUpdate(),
@@ -6106,8 +6158,8 @@ class ConvertXlaReduceScatterOp
     Location loc = op.getLoc();
     Type element_type = getElementTypeOrSelf(op.getInput().getType());
 
-    auto reduce_scatter = rewriter.create<stablehlo::ReduceScatterOp>(
-        loc, op.getType(), op.getInput(),
+    auto reduce_scatter = stablehlo::ReduceScatterOp::create(
+        rewriter, loc, op.getType(), op.getInput(),
         rewriter.getIntegerAttr(rewriter.getIntegerType(64),
                                 scatter_dimension.getSExtValue()),
         replica_groups, stablehlo::ChannelHandleAttr());
@@ -6140,8 +6192,8 @@ class ConvertXlaReduceScatterOp
       auto divisor = GetScalarConstOfType(element_type, loc, replica_group_size,
                                           &rewriter);
       auto broadcast_dims = rewriter.getDenseI64ArrayAttr({});
-      result = rewriter.create<chlo::BroadcastDivOp>(
-          loc, result, divisor.getResult(), broadcast_dims);
+      result = chlo::BroadcastDivOp::create(
+          rewriter, loc, result, divisor.getResult(), broadcast_dims);
     }
 
     rewriter.replaceOp(op, {result});
@@ -6171,8 +6223,8 @@ class ConvertXlaReduceWindowOp
 
     SmallVector<Type> result_types{op.getResult().getType()};
     // Create the stablehlo.SelectAndScatter op.
-    auto reduce_window_op = rewriter.create<stablehlo::ReduceWindowOp>(
-        loc, result_types, op.getInput(), op.getInitValue(),
+    auto reduce_window_op = stablehlo::ReduceWindowOp::create(
+        rewriter, loc, result_types, op.getInput(), op.getInitValue(),
         ToDenseI64ArrayAttr(window_dimensions, &rewriter),
         ToDenseI64ArrayAttr(window_strides, &rewriter),
         ToDenseI64ArrayAttr(base_dilations, &rewriter),
@@ -6213,20 +6265,20 @@ class ConvertClipByValueOp : public OpRewritePattern<TF::ClipByValueOp> {
       return failure();
     }
 
-    auto shape = rewriter.create<TF::ShapeOp>(
-        op.getLoc(),
-        tensorflow::GetTypeFromTFTensorShape({input_ty.getRank()},
-                                             rewriter.getI32Type()),
-        input);
+    auto shape =
+        TF::ShapeOp::create(rewriter, op.getLoc(),
+                            tensorflow::GetTypeFromTFTensorShape(
+                                {input_ty.getRank()}, rewriter.getI32Type()),
+                            input);
 
     if (min_ty != input_ty) {
-      min =
-          rewriter.create<TF::BroadcastToOp>(op.getLoc(), input_ty, min, shape);
+      min = TF::BroadcastToOp::create(rewriter, op.getLoc(), input_ty, min,
+                                      shape);
     }
 
     if (max_ty != input_ty) {
-      max =
-          rewriter.create<TF::BroadcastToOp>(op.getLoc(), input_ty, max, shape);
+      max = TF::BroadcastToOp::create(rewriter, op.getLoc(), input_ty, max,
+                                      shape);
     }
 
     rewriter.replaceOpWithNewOp<stablehlo::ClampOp>(op, input_ty, min, input,
@@ -6250,9 +6302,9 @@ class ConvertConstOp : public OpRewritePattern<TF::ConstOp> {
       return failure();
 
     Location loc = op.getLoc();
-    Value result = rewriter.create<stablehlo::ConstantOp>(loc, op.getValue());
+    Value result = stablehlo::ConstantOp::create(rewriter, loc, op.getValue());
     if (result.getType() != op.getType())
-      result = rewriter.create<tensor::CastOp>(loc, op.getType(), result);
+      result = tensor::CastOp::create(rewriter, loc, op.getType(), result);
     rewriter.replaceOp(op, result);
     return success();
   }
@@ -6298,8 +6350,9 @@ class ConvertCumOp : public OpRewritePattern<OpT> {
     // the input and then later reverse the output.
     if (op.getReverse()) {
       llvm::SmallVector<int64_t, 4> dims_to_reverse({axis});
-      input = rewriter.create<stablehlo::ReverseOp>(
-          op.getLoc(), input, GetI64ArrayAttr(dims_to_reverse, &rewriter));
+      input = stablehlo::ReverseOp::create(
+          rewriter, op.getLoc(), input,
+          GetI64ArrayAttr(dims_to_reverse, &rewriter));
     }
 
     // Convert if we need to enlarge the element type's bitwidth to avoid
@@ -6313,8 +6366,8 @@ class ConvertCumOp : public OpRewritePattern<OpT> {
     }
 
     Type sum_element_type = GetSumAccumulationType(input_element_type);
-    input = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), input,
-                                                  sum_element_type);
+    input = stablehlo::ConvertOp::create(rewriter, op.getLoc(), input,
+                                         sum_element_type);
 
     SmallVector<int64_t, 4> window_dims(rank, 1);
     SmallVector<int64_t, 4> window_strides(rank, 1);
@@ -6333,8 +6386,8 @@ class ConvertCumOp : public OpRewritePattern<OpT> {
     Value init = GetScalarConstOfType(sum_element_type, op.getLoc(), init_value,
                                       &rewriter);
 
-    auto reduce = rewriter.create<stablehlo::ReduceWindowOp>(
-        op.getLoc(), input.getType(), input, init,
+    auto reduce = stablehlo::ReduceWindowOp::create(
+        rewriter, op.getLoc(), input.getType(), input, init,
         GetI64ArrayAttr(window_dims, &rewriter),
         GetI64ArrayAttr(window_strides, &rewriter),
         /*base_dilations=*/DenseI64ArrayAttr(),
@@ -6353,20 +6406,22 @@ class ConvertCumOp : public OpRewritePattern<OpT> {
       llvm::SmallVector<int64_t, 4> interior_padding(rank, 0);
       low_padding[axis] = 1;
       high_padding[axis] = -1;
-      result = rewriter.create<stablehlo::PadOp>(
-          op.getLoc(), result, init, GetI64ArrayAttr(low_padding, &rewriter),
+      result = stablehlo::PadOp::create(
+          rewriter, op.getLoc(), result, init,
+          GetI64ArrayAttr(low_padding, &rewriter),
           GetI64ArrayAttr(high_padding, &rewriter),
           GetI64ArrayAttr(interior_padding, &rewriter));
     }
 
     // Convert back if we enlarged the element type's bitwidth.
-    result = rewriter.create<stablehlo::ConvertOp>(op.getLoc(), result,
-                                                   input_element_type);
+    result = stablehlo::ConvertOp::create(rewriter, op.getLoc(), result,
+                                          input_element_type);
 
     if (op.getReverse()) {
       llvm::SmallVector<int64_t, 4> dims_to_reverse({axis});
-      result = rewriter.create<stablehlo::ReverseOp>(
-          op.getLoc(), result, GetI64ArrayAttr(dims_to_reverse, &rewriter));
+      result = stablehlo::ReverseOp::create(
+          rewriter, op.getLoc(), result,
+          GetI64ArrayAttr(dims_to_reverse, &rewriter));
     }
 
     rewriter.replaceOp(op, result);
@@ -6397,7 +6452,7 @@ class ConvertShapeOp : public OpRewritePattern<TF::ShapeOp> {
     auto index_tensor = tensorflow::GetTypeFromTFTensorShape(
         result_ty.getShape(), rewriter.getIndexType());
     auto shape_op =
-        rewriter.create<shape::ShapeOfOp>(op.getLoc(), index_tensor, input);
+        shape::ShapeOfOp::create(rewriter, op.getLoc(), index_tensor, input);
     rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, result_ty, shape_op);
     return success();
   }
@@ -6422,8 +6477,8 @@ class ConvertDynamicExpandDimsOp : public OpRewritePattern<TF::ExpandDimsOp> {
       return failure();
     }
 
-    auto shape = rewriter.create<shape::ShapeOfOp>(
-        op.getLoc(),
+    auto shape = shape::ShapeOfOp::create(
+        rewriter, op.getLoc(),
         tensorflow::GetTypeFromTFTensorShape({input_ty.getRank()},
                                              rewriter.getIndexType()),
         input);
@@ -6444,17 +6499,18 @@ class ConvertDynamicExpandDimsOp : public OpRewritePattern<TF::ExpandDimsOp> {
     }
 
     dims[inserted_dim] =
-        rewriter.create<arith::ConstantIndexOp>(op.getLoc(), 1);
+        arith::ConstantIndexOp::create(rewriter, op.getLoc(), 1);
 
     for (int i = 0; i < dims.size() - 1; i++) {
       // Add the extracted dim.
-      Value index = rewriter.create<arith::ConstantIndexOp>(op.getLoc(), i);
-      Value dim = rewriter.create<tensor::ExtractOp>(op.getLoc(), shape, index);
+      Value index = arith::ConstantIndexOp::create(rewriter, op.getLoc(), i);
+      Value dim =
+          tensor::ExtractOp::create(rewriter, op.getLoc(), shape, index);
       dims[i >= inserted_dim ? i + 1 : i] = dim;
     }
 
     auto from_extents =
-        rewriter.create<tensor::FromElementsOp>(op.getLoc(), dims);
+        tensor::FromElementsOp::create(rewriter, op.getLoc(), dims);
     rewriter.replaceOpWithNewOp<stablehlo::DynamicReshapeOp>(
         op, result_ty, input, from_extents);
     return success();
@@ -6497,11 +6553,11 @@ class ConvertDynamicSqueezeOp : public OpRewritePattern<TF::SqueezeOp> {
     llvm::SmallVector<Value> dims;
     for (int64_t i = 0; i != input_rank; ++i) {
       if (llvm::is_contained(squeeze_dims, i)) continue;
-      dims.push_back(rewriter.create<tensor::DimOp>(op.getLoc(), input, i));
+      dims.push_back(tensor::DimOp::create(rewriter, op.getLoc(), input, i));
     }
 
     auto from_extents =
-        rewriter.create<tensor::FromElementsOp>(op.getLoc(), dims);
+        tensor::FromElementsOp::create(rewriter, op.getLoc(), dims);
     rewriter.replaceOpWithNewOp<stablehlo::DynamicReshapeOp>(
         op, result_ty, input, from_extents);
     return success();
@@ -6592,9 +6648,9 @@ class ConvertXlaSelectAndScatterOp
 
     SmallVector<Type> result_types{op.getResult().getType()};
     // Create the stablehlo.SelectAndScatter op.
-    auto select_and_scatter_op = rewriter.create<stablehlo::SelectAndScatterOp>(
-        loc, result_types, op.getOperand(), op.getSource(), op.getInitValue(),
-        ToDenseI64ArrayAttr(window_dimensions, &rewriter),
+    auto select_and_scatter_op = stablehlo::SelectAndScatterOp::create(
+        rewriter, loc, result_types, op.getOperand(), op.getSource(),
+        op.getInitValue(), ToDenseI64ArrayAttr(window_dimensions, &rewriter),
         ToDenseI64ArrayAttr(window_strides, &rewriter),
         mlir::cast<DenseIntElementsAttr>(
             hlo::convertElementsAttr(padding, rewriter.getIntegerType(64))));
@@ -6672,8 +6728,9 @@ class ConvertXlaRngBitGeneratorOp
     auto algorithm_attr = mlir::stablehlo::RngAlgorithmAttr::get(
         rewriter.getContext(),
         *mlir::stablehlo::symbolizeRngAlgorithm(xla_alg.value()));
-    auto rng_bit_generator_op = rewriter.create<stablehlo::RngBitGeneratorOp>(
-        loc, op.getResultTypes(), algorithm_attr, op.getInitialState());
+    auto rng_bit_generator_op = stablehlo::RngBitGeneratorOp::create(
+        rewriter, loc, op.getResultTypes(), algorithm_attr,
+        op.getInitialState());
 
     rewriter.replaceOp(op, rng_bit_generator_op.getResults());
 
@@ -6700,8 +6757,8 @@ class ConvertXlaVariadicReduceV2Op
         [](Type ty) { return mlir::cast<ShapedType>(ty).getElementType(); })};
 
     // Create the stablehlo.reduce op.
-    auto reduce_op = rewriter.create<stablehlo::ReduceOp>(
-        loc, op.getInputs(), op.getInitValues(),
+    auto reduce_op = stablehlo::ReduceOp::create(
+        rewriter, loc, op.getInputs(), op.getInitValues(),
         ToDenseI64ArrayAttr(GetI64ElementsAttr(op.getDimensionsToReduce()),
                             &rewriter),
         elementTypes);
@@ -6727,9 +6784,9 @@ class ConvertXlaVariadicSortOp
     ElementsAttr dimension;
     matchPattern(op.getDimension(), m_Constant(&dimension));
     // Create the stablehlo.sort op.
-    auto sort_op = rewriter.create<stablehlo::SortOp>(
-        loc, op.getInputs(), dimension.getValues<IntegerAttr>()[0].getInt(),
-        op.getIsStable());
+    auto sort_op = stablehlo::SortOp::create(
+        rewriter, loc, op.getInputs(),
+        dimension.getValues<IntegerAttr>()[0].getInt(), op.getIsStable());
     mlir::SymbolRefAttr func = op.getComparator();
     auto func_op = cast<mlir::func::FuncOp>(SymbolTable::lookupSymbolIn(
         op->getParentOfType<mlir::ModuleOp>(), func));
@@ -6816,9 +6873,9 @@ class LowerControlFlowOp : public OpConversionPattern<SrcOpT> {
     if constexpr (std::is_same<DstOpT, stablehlo::CaseOp>::value) {
       // Explicitly handle the Case op because it has variadic regions and takes
       // the number of regions as an input along with the operands.
-      stablehlo_op = rewriter.create<DstOpT>(loc, op.getResultTypes(),
-                                             adaptor.getBranchIndex(),
-                                             op.getBranches().size());
+      stablehlo_op =
+          DstOpT::create(rewriter, loc, op.getResultTypes(),
+                         adaptor.getBranchIndex(), op.getBranches().size());
     } else if constexpr (std::is_same<DstOpT, stablehlo::WhileOp>::value) {
       llvm::SmallVector<Type, 4> while_result_types;
       while_result_types.reserve(num_results);
@@ -6827,11 +6884,11 @@ class LowerControlFlowOp : public OpConversionPattern<SrcOpT> {
         while_result_types.push_back(ty);
       }
 
-      stablehlo_op = rewriter.create<DstOpT>(loc, TypeRange(while_result_types),
-                                             adaptor.getOperands());
+      stablehlo_op = DstOpT::create(
+          rewriter, loc, TypeRange(while_result_types), adaptor.getOperands());
     } else {
-      stablehlo_op = rewriter.create<DstOpT>(loc, op.getResultTypes(),
-                                             adaptor.getOperands());
+      stablehlo_op = DstOpT::create(rewriter, loc, op.getResultTypes(),
+                                    adaptor.getOperands());
     }
 
     int64_t num_regions = op.getNumRegions();
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc
index 7061aaa4a5657b..abfcc0d26acc65 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc
@@ -143,8 +143,9 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id,
   ChannelHandleAttr channel_handle = ConvertChannel(builder, channel_id, mode);
   Location loc = op->getLoc();
   Type element_type = getElementTypeOrSelf(input.getType());
-  auto all_reduce = builder.create<AllReduceOp>(
-      loc, result_type, input, replica_groups, channel_handle, nullptr);
+  auto all_reduce =
+      AllReduceOp::create(builder, loc, result_type, input, replica_groups,
+                          channel_handle, nullptr);
 
   if (all_reduce.getNumResults() != 1) {
     return op->emitOpError()
@@ -178,8 +179,8 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id,
     auto divisor =
         GetScalarConstOfType(element_type, loc, replica_group_size, &builder);
     auto broadcast_dims = builder.getDenseI64ArrayAttr({});
-    result = builder.create<chlo::BroadcastDivOp>(
-        loc, all_reduce.getResult(0), divisor.getResult(), broadcast_dims);
+    result = chlo::BroadcastDivOp::create(builder, loc, all_reduce.getResult(0),
+                                          divisor.getResult(), broadcast_dims);
   } else if (final_op != "Id") {
     return op->emitOpError()
            << "invalid final_op " << final_op << ", want one of [Id, Div]";
@@ -373,11 +374,12 @@ class ConvertCollectiveAssignGroupV2
     IntegerAttr group_size = rewriter.getI32IntegerAttr(replica_groups.size());
     IntegerAttr group_key = rewriter.getI32IntegerAttr(0);
 
-    auto const_group_size = rewriter.create<TF::ConstOp>(
-        assign_group->getLoc(), assign_group.getResult(0).getType(),
-        group_size);
-    auto const_group_key = rewriter.create<TF::ConstOp>(
-        assign_group->getLoc(), assign_group.getResult(1).getType(), group_key);
+    auto const_group_size =
+        TF::ConstOp::create(rewriter, assign_group->getLoc(),
+                            assign_group.getResult(0).getType(), group_size);
+    auto const_group_key =
+        TF::ConstOp::create(rewriter, assign_group->getLoc(),
+                            assign_group.getResult(1).getType(), group_key);
     rewriter.replaceAllUsesWith(assign_group.getResult(0), const_group_size);
     rewriter.replaceAllUsesWith(assign_group.getResult(1), const_group_key);
     rewriter.eraseOp(assign_group);
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc
index 7e653188857283..b1105d1a4e4000 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc
@@ -289,10 +289,10 @@ Value CreateSendOp(OpBuilder& builder, Location loc, Value operand,
                                                /*handle=*/GetNextChannelId(),
                                                /*type=*/2);
   auto empty_source_target_pairs = builder.getI64TensorAttr({});
-  auto send = builder.create<SendOp>(
-      loc, token.getType(), operand, token, channel_handle,
-      /*is_host_transfer=*/builder.getBoolAttr(true),
-      /*source_target_pairs=*/empty_source_target_pairs);
+  auto send = SendOp::create(builder, loc, token.getType(), operand, token,
+                             channel_handle,
+                             /*is_host_transfer=*/builder.getBoolAttr(true),
+                             /*source_target_pairs=*/empty_source_target_pairs);
   SetFrontendAttributes(send, index, key, operand.getType(),
                         /*device_to_host=*/true, host_handler_name);
 
@@ -311,10 +311,10 @@ Value CreateRecvOp(OpBuilder& builder, Location loc, Value result,
                                                /*type=*/3);
   auto result_type = result.getType();
   SmallVector<Type, 2> recv_result_type = {result_type, token.getType()};
-  auto recv = builder.create<RecvOp>(
-      loc, recv_result_type, token, channel_handle,
-      /*is_host_transfer=*/builder.getBoolAttr(true),
-      /*source_target_pairs=*/builder.getI64TensorAttr({}));
+  auto recv =
+      RecvOp::create(builder, loc, recv_result_type, token, channel_handle,
+                     /*is_host_transfer=*/builder.getBoolAttr(true),
+                     /*source_target_pairs=*/builder.getI64TensorAttr({}));
 
   SetFrontendAttributes(recv, index, key, result_type,
                         /*device_to_host=*/false, host_handler_name);
@@ -336,7 +336,7 @@ Value CreateSinkToken(OpBuilder& builder, Location loc, ArrayRef<Value> tokens,
   } else if (llvm::hasSingleElement(tokens)) {
     return tokens[0];
   } else {
-    return builder.create<AfterAllOp>(loc, original_token.getType(), tokens)
+    return AfterAllOp::create(builder, loc, original_token.getType(), tokens)
         .getResult();
   }
 }
@@ -413,8 +413,8 @@ Value RewriteCallOp(OpBuilder& builder, func::CallOp call,
   new_operands.push_back(token);
   auto new_result_types = llvm::to_vector(call.getResultTypes());
   new_result_types.push_back(token.getType());
-  auto new_call = builder.create<func::CallOp>(
-      call.getLoc(), new_result_types,
+  auto new_call = func::CallOp::create(
+      builder, call.getLoc(), new_result_types,
       new_symbol ? *new_symbol : call.getCallee(), new_operands);
 
   for (auto results : llvm::zip(call.getResults(), new_call.getResults()))
@@ -435,7 +435,7 @@ struct OpVisitorState {
 
 // Creates a tuple from a sequence of values.
 Value CreateTuple(OpBuilder& builder, Location loc, ArrayRef<Value> operands) {
-  return builder.create<TupleOp>(loc, operands).getResult();
+  return TupleOp::create(builder, loc, operands).getResult();
 }
 
 // Extends `values` with the value `token` attached. If `flatten_tuple` is
@@ -480,7 +480,7 @@ SmallVector<Value> GetValueWithToken(
   SmallVector<Value, 4> tuple_operands;
   for (auto idx : llvm::seq<int32_t>(0, tuple_type.getTypes().size()))
     tuple_operands.push_back(
-        builder.create<GetTupleElementOp>(value.getLoc(), value, idx)
+        GetTupleElementOp::create(builder, value.getLoc(), value, idx)
             .getResult());
 
   tuple_operands.push_back(token);
@@ -518,7 +518,7 @@ Value CreateSubTuple(OpBuilder& builder, Value value, size_t end) {
   SmallVector<Value, 4> tuple_operands;
   for (auto idx : llvm::seq<int32_t>(0, end))
     tuple_operands.push_back(
-        builder.create<GetTupleElementOp>(value.getLoc(), value, idx)
+        GetTupleElementOp::create(builder, value.getLoc(), value, idx)
             .getResult());
 
   return CreateTuple(builder, value.getLoc(), tuple_operands);
@@ -543,8 +543,8 @@ void ReplaceWithTupleResult(OpBuilder& builder, ValueRange values,
   auto tuple_type = mlir::dyn_cast<TupleType>(value.getType());
   if (!tuple_type) {
     if (!value.use_empty()) {
-      auto new_element = builder.create<GetTupleElementOp>(replacement.getLoc(),
-                                                           replacement, 0);
+      auto new_element = GetTupleElementOp::create(
+          builder, replacement.getLoc(), replacement, 0);
       value.replaceAllUsesWith(new_element.getResult());
     }
     return;
@@ -620,8 +620,8 @@ void RewriteRegionIfOp(OpBuilder& builder, IfOp region_if,
                        /*flatten_tuple=*/true);
 
   // Create new `mhlo.if` op with extra token operands and result.
-  auto new_if = builder.create<IfOp>(region_if.getLoc(), new_result_types,
-                                     region_if.getPred());
+  auto new_if = IfOp::create(builder, region_if.getLoc(), new_result_types,
+                             region_if.getPred());
 
   // Move all regions from the old `mhlo.if` op to its replacement.
   new_if.getTrueBranch().takeBody(region_if.getTrueBranch());
@@ -745,8 +745,8 @@ void RewriteRegionWhileOp(OpBuilder& builder, WhileOp region_while,
                        /*flatten_tuple*/ true);
 
   // Create new `mhlo.while` op with extra token operand and result.
-  auto new_while = builder.create<WhileOp>(region_while.getLoc(),
-                                           new_result_types, new_val_operands);
+  auto new_while = WhileOp::create(builder, region_while.getLoc(),
+                                   new_result_types, new_val_operands);
 
   // Move all regions from the old `mhlo.while` op to its replacement.
   new_while.getCond().takeBody(region_while.getCond());
@@ -815,7 +815,7 @@ void RewriteFunctionTerminator(OpBuilder& builder,
   auto new_results = llvm::to_vector(terminator.getOperands());
   new_results.push_back(token);
   builder.setInsertionPoint(terminator);
-  builder.create<mlir::func::ReturnOp>(terminator.getLoc(), new_results);
+  mlir::func::ReturnOp::create(builder, terminator.getLoc(), new_results);
   terminator.erase();
 }
 
@@ -844,7 +844,7 @@ LogicalResult RewriteFunction(
   // a token will be created. Otherwise a token block argument is inserted.
   Value init_token =
       rewrite_block ? func_body.addArgument(token_type, func.getLoc())
-                    : builder.create<CreateTokenOp>(func.getLoc(), token_type)
+                    : CreateTokenOp::create(builder, func.getLoc(), token_type)
                           .getResult();
 
   // Stack to keep track of region based control flow op nesting and current
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc
index ecf3aea5f65d48..0b0e68548032a9 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass.cc
@@ -108,15 +108,15 @@ void PopulateEmptyIsland(tf_executor::IslandOp island) {
   OpBuilder builder(&island.GetBody(), island.GetBody().begin());
   tf_executor::YieldOp yield = island.GetYield();
   if (yield.getNumOperands() == 0) {
-    builder.create<TF::NoOp>(island.getLoc(), TypeRange{}, ValueRange{});
+    TF::NoOp::create(builder, island.getLoc(), TypeRange{}, ValueRange{});
   } else if (yield.getNumOperands() == 1) {
     Value operand = yield.getOperand(0);
-    auto identity = builder.create<TF::IdentityOp>(island.getLoc(),
-                                                   operand.getType(), operand);
+    auto identity = TF::IdentityOp::create(builder, island.getLoc(),
+                                           operand.getType(), operand);
     yield.setOperand(0, identity.getOutput());
   } else {
-    auto identity_n = builder.create<TF::IdentityNOp>(
-        island.getLoc(), yield.getOperandTypes(), yield.getOperands());
+    auto identity_n = TF::IdentityNOp::create(
+        builder, island.getLoc(), yield.getOperandTypes(), yield.getOperands());
     for (const auto& it : llvm::enumerate(identity_n.getResults()))
       yield.setOperand(it.index(), it.value());
   }
@@ -128,15 +128,15 @@ tf_executor::IslandOp CreateIsland(TypeRange result_types,
                                    const Location& loc, Operation& sub_op,
                                    tf_executor::IslandOp original_island) {
   OpBuilder builder(original_island);
-  auto island = builder.create<tf_executor::IslandOp>(
-      loc, result_types, control_type, mlir::ValueRange{});
+  auto island = tf_executor::IslandOp::create(builder, loc, result_types,
+                                              control_type, mlir::ValueRange{});
   island.getBody().push_back(new Block);
   Block* block = &island.getBody().back();
   OpBuilder island_builder(original_island);
   island_builder.setInsertionPointToEnd(block);
   sub_op.replaceAllUsesWith(island.getOutputs());
   sub_op.moveBefore(block, block->begin());
-  island_builder.create<tf_executor::YieldOp>(loc, sub_op.getResults());
+  tf_executor::YieldOp::create(island_builder, loc, sub_op.getResults());
   return island;
 }
 
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc
index d888b0c12588c9..2e5e4764f63d34 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/split_into_island_per_op_pass_test.cc
@@ -53,12 +53,11 @@ class SplitIntoIslandPerOpPass : public ::testing::Test {
     llvm::SmallVector<mlir::Type, 1> island_result_types;
     island_result_types.push_back(op_builder_.getF64Type());
 
-    mlir::Operation* yield_op = op_builder_.create<mlir::tf_executor::YieldOp>(
-        op_state.location, mlir::ValueRange{});
-    mlir::tf_executor::IslandOp island_op =
-        op_builder_.create<mlir::tf_executor::IslandOp>(
-            op_state.location, island_result_types, mlir::ValueRange{},
-            mlir::ArrayRef<mlir::NamedAttribute>{});
+    mlir::Operation* yield_op = mlir::tf_executor::YieldOp::create(
+        op_builder_, op_state.location, mlir::ValueRange{});
+    mlir::tf_executor::IslandOp island_op = mlir::tf_executor::IslandOp::create(
+        op_builder_, op_state.location, island_result_types, mlir::ValueRange{},
+        mlir::ArrayRef<mlir::NamedAttribute>{});
     island_op.getBody().push_back(new mlir::Block);
     island_op.getBody().back().push_back(yield_op);
     return island_op;
@@ -126,13 +125,13 @@ TEST_F(SplitIntoIslandPerOpPass, IslandOpTwoOpsSplitsIntoTwoIslands) {
   islandOp.getBody().back().push_front(inner_op_2);
   // Code relies on a parent with a fetch op containing the island op.
   mlir::tf_executor::GraphOp parent_graph_op =
-      op_builder_.create<mlir::tf_executor::GraphOp>(
-          mlir::UnknownLoc::get(&context_),
+      mlir::tf_executor::GraphOp::create(
+          op_builder_, mlir::UnknownLoc::get(&context_),
           mlir::TypeRange{op_builder_.getF64Type()});
   parent_graph_op.getRegion().push_back(new mlir::Block);
   parent_graph_op.push_back(islandOp);
   mlir::tf_executor::FetchOp fetch_op =
-      op_builder_.create<mlir::tf_executor::FetchOp>(parent_graph_op.getLoc());
+      mlir::tf_executor::FetchOp::create(op_builder_, parent_graph_op.getLoc());
   parent_graph_op.GetBody().push_back(fetch_op);
 
   SplitIsland(islandOp, control_type);
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc
index a7e9726e7575a3..2f7089edacbe31 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/tfxla_device_specific_transforms.cc
@@ -57,8 +57,9 @@ LogicalResult TFXLADeviceSpecificTransforms::ConvertGetAlgOp(
 
   OpBuilder opbuilder(get_alg_op);
 
-  auto tf_const = opbuilder.create<TF::ConstOp>(
-      get_alg_op->getLoc(), opbuilder.getI32IntegerAttr((int)tensorflow_rng));
+  auto tf_const =
+      TF::ConstOp::create(opbuilder, get_alg_op->getLoc(),
+                          opbuilder.getI32IntegerAttr((int)tensorflow_rng));
 
   get_alg_op->replaceAllUsesWith(tf_const);
   get_alg_op->erase();
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc b/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc
index 0152cd1d1a7363..61c8e8e161425d 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/utils.cc
@@ -24,11 +24,11 @@ namespace mhlo {
 
 ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value,
                                 OpBuilder* builder) {
-  return builder->create<ConstantOp>(loc, hlo::getScalarOfType(ty, raw_value));
+  return ConstantOp::create(*builder, loc, hlo::getScalarOfType(ty, raw_value));
 }
 
 ConstantOp GetScalarNegZeroOfType(Type ty, Location loc, OpBuilder* builder) {
-  return builder->create<ConstantOp>(loc, hlo::getScalarNegZeroOfType(ty));
+  return ConstantOp::create(*builder, loc, hlo::getScalarNegZeroOfType(ty));
 }
 
 DenseIntElementsAttr GetI64ElementsAttr(ArrayAttr attr) {
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/utils.h b/tensorflow/compiler/mlir/tf2xla/transforms/utils.h
index 5dba4a4dcf894c..a6b848ae2fc27b 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/utils.h
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/utils.h
@@ -42,8 +42,8 @@ void BuildReduceBody(Type element_type, Region* body, OpBuilder* builder) {
   block->addArguments({type, type}, SmallVector<Location, 2>(2, loc));
 
   auto reducer =
-      builder->create<Op>(loc, block->getArgument(0), block->getArgument(1));
-  builder->create<ReturnOp>(loc, reducer.getResult());
+      Op::create(*builder, loc, block->getArgument(0), block->getArgument(1));
+  ReturnOp::create(*builder, loc, reducer.getResult());
 }
 
 ConstantOp GetScalarConstOfType(Type ty, Location loc, int64_t raw_value,
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc
index 6572aef984b043..71dce38198c96a 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets_test.cc
@@ -63,8 +63,8 @@ class XlaLegalizeTargetsTest : public testing::Test {
 };
 
 TEST_F(XlaLegalizeTargetsTest, CreatesConversionTargets) {
-  auto const_int = builder_.create<mlir::arith::ConstantIntOp>(
-      builder_.getUnknownLoc(), builder_.getI32Type(), /*value=*/10);
+  auto const_int = mlir::arith::ConstantIntOp::create(
+      builder_, builder_.getUnknownLoc(), builder_.getI32Type(), /*value=*/10);
 
   ConversionTarget target =
       GetDefaultLegalConversionTargets(context_, /*legalize_chlo=*/false);
@@ -72,8 +72,8 @@ TEST_F(XlaLegalizeTargetsTest, CreatesConversionTargets) {
 }
 
 TEST_F(XlaLegalizeTargetsTest, AllowsCHLODialect) {
-  auto const_int = builder_.create<chlo::ConstantOp>(
-      builder_.getUnknownLoc(), builder_.getI32TensorAttr({42}));
+  auto const_int = chlo::ConstantOp::create(builder_, builder_.getUnknownLoc(),
+                                            builder_.getI32TensorAttr({42}));
 
   ConversionTarget target =
       GetDefaultLegalConversionTargets(context_, /*legalize_chlo=*/true);
@@ -82,8 +82,8 @@ TEST_F(XlaLegalizeTargetsTest, AllowsCHLODialect) {
 }
 
 TEST_F(XlaLegalizeTargetsTest, DontAllowCHLODialect) {
-  auto const_int = builder_.create<chlo::ConstantOp>(
-      builder_.getUnknownLoc(), builder_.getI32TensorAttr({42}));
+  auto const_int = chlo::ConstantOp::create(builder_, builder_.getUnknownLoc(),
+                                            builder_.getI32TensorAttr({42}));
 
   ConversionTarget target =
       GetDefaultLegalConversionTargets(context_, /*legalize_chlo=*/false);
diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc
index d44e65f029ada3..66b5167839731b 100644
--- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc
+++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc
@@ -118,10 +118,11 @@ class TFRInlinerInterface : public DialectInlinerInterface {
     auto result_itype = llvm::cast<IntegerType>(result_type);
     if (input_itype.getWidth() == result_itype.getWidth()) return nullptr;
     if (input_itype.getWidth() > result_itype.getWidth()) {
-      return builder.create<arith::TruncIOp>(conversion_loc, result_type,
-                                             input);
+      return arith::TruncIOp::create(builder, conversion_loc, result_type,
+                                     input);
     } else {
-      return builder.create<arith::ExtSIOp>(conversion_loc, result_type, input);
+      return arith::ExtSIOp::create(builder, conversion_loc, result_type,
+                                    input);
     }
   }
 };
@@ -148,11 +149,11 @@ TFRDialect::TFRDialect(MLIRContext *context)
 Operation *TFRDialect::materializeConstant(OpBuilder &builder, Attribute value,
                                            Type type, Location loc) {
   if (arith::ConstantOp::isBuildableWith(value, type))
-    return builder.create<arith::ConstantOp>(loc, type,
-                                             llvm::cast<TypedAttr>(value));
+    return arith::ConstantOp::create(builder, loc, type,
+                                     llvm::cast<TypedAttr>(value));
   if (func::ConstantOp::isBuildableWith(value, type))
-    return builder.create<func::ConstantOp>(
-        loc, type, llvm::cast<FlatSymbolRefAttr>(value));
+    return func::ConstantOp::create(builder, loc, type,
+                                    llvm::cast<FlatSymbolRefAttr>(value));
   return nullptr;
 }
 
@@ -421,9 +422,10 @@ class ConvertConstToTensorConst : public OpRewritePattern<ConstantTensorOp> {
           {static_cast<int64_t>(array.size())}, *all_types.begin());
       DenseElementsAttr attr =
           DenseElementsAttr::get(new_out_type, array.getValue());
-      new_cst = rewriter.create<TF::ConstOp>(loc, new_out_type, attr);
+      new_cst = TF::ConstOp::create(rewriter, loc, new_out_type, attr);
       if (isa<TFRTensorType>(out_type)) {
-        new_cst = rewriter.create<CastOp>(loc, out_type, new_cst->getResult(0));
+        new_cst =
+            CastOp::create(rewriter, loc, out_type, new_cst->getResult(0));
       }
       rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0));
       return success();
@@ -432,9 +434,10 @@ class ConvertConstToTensorConst : public OpRewritePattern<ConstantTensorOp> {
     TypedAttr scalar;
     if (matchPattern(cst_tensor_op.getArg(), m_Constant(&scalar))) {
       Type new_out_type = RankedTensorType::get({}, scalar.getType());
-      new_cst = rewriter.create<TF::ConstOp>(loc, new_out_type, scalar);
+      new_cst = TF::ConstOp::create(rewriter, loc, new_out_type, scalar);
       if (isa<TFRTensorType>(out_type)) {
-        new_cst = rewriter.create<CastOp>(loc, out_type, new_cst->getResult(0));
+        new_cst =
+            CastOp::create(rewriter, loc, out_type, new_cst->getResult(0));
       }
       rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0));
       return success();
@@ -481,8 +484,8 @@ class RemoveRedundantCast : public OpRewritePattern<CastOp> {
     if ((input_tensor_type.getElementType() !=
          output_tensor_type.getElementType()) &&
         !isQuantizedType(input_type) && !isQuantizedType(output_type)) {
-      auto new_tfr_cast = rewriter.create<TFR::CastOp>(
-          cast_op.getLoc(),
+      auto new_tfr_cast = TFR::CastOp::create(
+          rewriter, cast_op.getLoc(),
           output_tensor_type.clone(input_tensor_type.getElementType()),
           cast_op.getArg());
       rewriter.replaceOpWithNewOp<TF::CastOp>(cast_op, output_type,
@@ -652,8 +655,9 @@ class RemoveRawDataOp : public OpRewritePattern<TFRQuantRawDataOp> {
       new_list_values.push_back(redundant_cast.getArg());
     }
 
-    auto new_list = rewriter.create<BuildListOp>(
-        raw_data_op.getLoc(), preceding_list.getType(), new_list_values);
+    auto new_list =
+        BuildListOp::create(rewriter, raw_data_op.getLoc(),
+                            preceding_list.getType(), new_list_values);
     raw_data_op.getOutput().replaceAllUsesWith(new_list.getOut());
     return success();
   }
@@ -679,11 +683,11 @@ class RemoveQParamsOp : public OpRewritePattern<TFRQuantQParamsOp> {
     rewriter.setInsertionPoint(qparams_op);
     Location loc = qparams_op->getLoc();
     if (auto qtype = llvm::dyn_cast<quant::UniformQuantizedType>(cast_qtype)) {
-      scale_op = rewriter.create<TF::ConstOp>(
-          loc, RankedTensorType::get({}, rewriter.getF32Type()),
+      scale_op = TF::ConstOp::create(
+          rewriter, loc, RankedTensorType::get({}, rewriter.getF32Type()),
           rewriter.getF32FloatAttr(qtype.getScale()));
-      zp_op = rewriter.create<TF::ConstOp>(
-          loc, RankedTensorType::get({}, rewriter.getI32Type()),
+      zp_op = TF::ConstOp::create(
+          rewriter, loc, RankedTensorType::get({}, rewriter.getI32Type()),
           rewriter.getI32IntegerAttr(qtype.getZeroPoint()));
     } else if (auto qtype = llvm::dyn_cast<quant::UniformQuantizedPerAxisType>(
                    cast_qtype)) {
@@ -697,20 +701,20 @@ class RemoveQParamsOp : public OpRewritePattern<TFRQuantQParamsOp> {
           {static_cast<int64_t>(num_channels)}, rewriter.getF32Type());
       auto scales_attr =
           DenseElementsAttr::get(scales_type, llvm::ArrayRef(scales));
-      scale_op = rewriter.create<TF::ConstOp>(loc, scales_attr);
+      scale_op = TF::ConstOp::create(rewriter, loc, scales_attr);
 
       auto zps_type = RankedTensorType::get(
           {static_cast<int64_t>(num_channels)}, rewriter.getI32Type());
       auto zps_attr = DenseElementsAttr::get(zps_type, llvm::ArrayRef(zps));
-      zp_op = rewriter.create<TF::ConstOp>(loc, zps_attr);
+      zp_op = TF::ConstOp::create(rewriter, loc, zps_attr);
     }
     if (!scale_op || !zp_op) {
       return failure();
     }
-    auto scale_cast = rewriter.create<CastOp>(
-        loc, qparams_op.getScale().getType(), scale_op.getOutput());
-    auto zp_cast = rewriter.create<CastOp>(loc, qparams_op.getZp().getType(),
-                                           zp_op.getOutput());
+    auto scale_cast = CastOp::create(
+        rewriter, loc, qparams_op.getScale().getType(), scale_op.getOutput());
+    auto zp_cast = CastOp::create(rewriter, loc, qparams_op.getZp().getType(),
+                                  zp_op.getOutput());
 
     qparams_op.getScale().replaceAllUsesWith(scale_cast.getOut());
     qparams_op.getZp().replaceAllUsesWith(zp_cast.getOut());
@@ -787,10 +791,11 @@ class RemoveScaleFactorOp : public OpRewritePattern<TFRQuantScaleFactorOp> {
     }
     rewriter.setInsertionPoint(scale_factor_op);
     const Location loc = scale_factor_op->getLoc();
-    auto result_scale_op = rewriter.create<TF::ConstOp>(
-        loc, DenseElementsAttr::get(scale_type, llvm::ArrayRef(scale_factors)));
-    auto result_scale_cast_op = rewriter.create<CastOp>(
-        loc, scale_factor_op.getType(), result_scale_op.getOutput());
+    auto result_scale_op = TF::ConstOp::create(
+        rewriter, loc,
+        DenseElementsAttr::get(scale_type, llvm::ArrayRef(scale_factors)));
+    auto result_scale_cast_op = CastOp::create(
+        rewriter, loc, scale_factor_op.getType(), result_scale_op.getOutput());
     scale_factor_op.getScaleFactor().replaceAllUsesWith(
         result_scale_cast_op.getOut());
     return success();
@@ -812,50 +817,55 @@ class RemoveRescaleOp : public OpRewritePattern<TFRQuantRescaleOp> {
     const Location loc = rescale_op->getLoc();
     const auto result_types = rescale_op->getResultTypes();
     auto c_false =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getBoolAttr(false));
+        arith::ConstantOp::create(rewriter, loc, rewriter.getBoolAttr(false));
     TypeAttr f32_attr = TypeAttr::get(rewriter.getF32Type());
     TFRAttrType output_type = TFRAttrType::get(rewriter.getContext());
-    auto constant_f32_op = rewriter.create<ConstOp>(loc, output_type, f32_attr);
+    auto constant_f32_op =
+        ConstOp::create(rewriter, loc, output_type, f32_attr);
     TypeAttr i32_attr = TypeAttr::get(rewriter.getI32Type());
-    auto constant_i32_op = rewriter.create<ConstOp>(loc, output_type, i32_attr);
+    auto constant_i32_op =
+        ConstOp::create(rewriter, loc, output_type, i32_attr);
 
     IntegerAttr zp_attr;
     if (!matchPattern(zp, m_Constant(&zp_attr))) {
       return failure();
     }
     rewriter.setInsertionPoint(zp.getDefiningOp());
-    auto zp_tensor = rewriter.create<TF::ConstOp>(
-        loc, RankedTensorType::get({}, zp.getType()), zp_attr);
-    auto zp_cast = rewriter.create<CastOp>(
-        loc, rewriter.getType<TFRTensorType>(), zp_tensor.getOutput());
+    auto zp_tensor = TF::ConstOp::create(
+        rewriter, loc, RankedTensorType::get({}, zp.getType()), zp_attr);
+    auto zp_cast =
+        CastOp::create(rewriter, loc, rewriter.getType<TFRTensorType>(),
+                       zp_tensor.getOutput());
 
     rewriter.setInsertionPoint(rescale_op);
-    auto cast_input_to_float_op = rewriter.create<CallOp>(
-        loc, result_types,
-        SymbolRefAttr::get(rewriter.getContext(), "tf__cast"),
-        ArrayRef<Value>{input, constant_f32_op, c_false},
-        /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
-    auto input_x_scale_op = rewriter.create<CallOp>(
-        loc, result_types, SymbolRefAttr::get(rewriter.getContext(), "tf__mul"),
+    auto cast_input_to_float_op =
+        CallOp::create(rewriter, loc, result_types,
+                       SymbolRefAttr::get(rewriter.getContext(), "tf__cast"),
+                       ArrayRef<Value>{input, constant_f32_op, c_false},
+                       /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
+    auto input_x_scale_op = CallOp::create(
+        rewriter, loc, result_types,
+        SymbolRefAttr::get(rewriter.getContext(), "tf__mul"),
         ArrayRef<Value>{cast_input_to_float_op.getResult(0), scale},
         /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
-    auto round_rescaled_op = rewriter.create<CallOp>(
-        loc, result_types,
-        SymbolRefAttr::get(rewriter.getContext(), "tf__round"),
-        ArrayRef<Value>{input_x_scale_op->getResult(0)},
-        /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
-    auto cast_zp_to_float_op = rewriter.create<CallOp>(
-        loc, result_types,
-        SymbolRefAttr::get(rewriter.getContext(), "tf__cast"),
-        ArrayRef<Value>{zp_cast, constant_f32_op, c_false},
-        /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
-    auto recentered_op = rewriter.create<CallOp>(
-        loc, result_types, SymbolRefAttr::get(rewriter.getContext(), "tf__add"),
-        ArrayRef<Value>{round_rescaled_op->getResult(0),
-                        cast_zp_to_float_op->getResult(0)},
-        /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
-    auto cast_output_to_i32 = rewriter.create<CallOp>(
-        loc, result_types,
+    auto round_rescaled_op =
+        CallOp::create(rewriter, loc, result_types,
+                       SymbolRefAttr::get(rewriter.getContext(), "tf__round"),
+                       ArrayRef<Value>{input_x_scale_op->getResult(0)},
+                       /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
+    auto cast_zp_to_float_op =
+        CallOp::create(rewriter, loc, result_types,
+                       SymbolRefAttr::get(rewriter.getContext(), "tf__cast"),
+                       ArrayRef<Value>{zp_cast, constant_f32_op, c_false},
+                       /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
+    auto recentered_op =
+        CallOp::create(rewriter, loc, result_types,
+                       SymbolRefAttr::get(rewriter.getContext(), "tf__add"),
+                       ArrayRef<Value>{round_rescaled_op->getResult(0),
+                                       cast_zp_to_float_op->getResult(0)},
+                       /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
+    auto cast_output_to_i32 = CallOp::create(
+        rewriter, loc, result_types,
         SymbolRefAttr::get(rewriter.getContext(), "tf__cast"),
         ArrayRef<Value>{recentered_op->getResult(0), constant_i32_op, c_false},
         /*args_attrs=*/nullptr, /*res_attrs=*/nullptr);
diff --git a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc
index fb0640536d4fe5..7a03a46972371c 100644
--- a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc
+++ b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc
@@ -75,12 +75,12 @@ class UnrollSCFForOp : public OpRewritePattern<scf::ForOp> {
     for (auto i = 0; i < trip_count; ++i) {
       if (!iv.use_empty()) {
         // iv' = iv + step * i;
-        Value iter = rewriter.create<arith::ConstantIndexOp>(loc, i);
+        Value iter = arith::ConstantIndexOp::create(rewriter, loc, i);
         Value step_cst =
-            rewriter.create<arith::ConstantIndexOp>(loc, step.getSExtValue());
-        Value stride = rewriter.create<arith::MulIOp>(loc, step_cst, iter);
+            arith::ConstantIndexOp::create(rewriter, loc, step.getSExtValue());
+        Value stride = arith::MulIOp::create(rewriter, loc, step_cst, iter);
         Value iv_unroll =
-            rewriter.create<arith::AddIOp>(loc, mapping.lookup(iv), stride);
+            arith::AddIOp::create(rewriter, loc, mapping.lookup(iv), stride);
         mapping.map(iv, iv_unroll);
       }
 
diff --git a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc
index 94a84cc3072ea6..5dd6a22f90c972 100644
--- a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc
+++ b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc
@@ -148,7 +148,7 @@ class RewriteTFRCallOp : public OpRewritePattern<CallOp> {
         mlir::cast<TypeAttr>(cast_op.getInputElementType()).getValue();
     if (result_elt_type != original_input_type) {
       UnrankedTensorType result_type = UnrankedTensorType::get(result_elt_type);
-      return rewriter.create<TF::CastOp>(loc, result_type, cast_op.getArg());
+      return TF::CastOp::create(rewriter, loc, result_type, cast_op.getArg());
     }
     return cast_op.getArg();
   }
@@ -167,7 +167,7 @@ class RewriteTFRCallOp : public OpRewritePattern<CallOp> {
       Type current_input_type = mlir::cast<TypeAttr>(input_types[i]).getValue();
       if (current_input_type != target_input_type) {
         input_values[i] =
-            rewriter.create<TF::CastOp>(loc, result_type, input_values[i]);
+            TF::CastOp::create(rewriter, loc, result_type, input_values[i]);
       }
     }
   }
@@ -397,7 +397,7 @@ LogicalResult RewriteTFRCallOp::CreateAndReplaceOp(
     Type res_type = res.value();
     if (mlir::dyn_cast<TFRTensorType>(res_type)) {
       Value new_res = new_op->getResult(res.index());
-      auto casted = rewriter.create<CastOp>(loc, res_type, new_res);
+      auto casted = CastOp::create(rewriter, loc, res_type, new_res);
       new_results.push_back(casted.getOut());
     } else if (auto list_type =
                    mlir::dyn_cast<TFRTensorListType>(res.value())) {
@@ -405,10 +405,10 @@ LogicalResult RewriteTFRCallOp::CreateAndReplaceOp(
       for (int i = res.index(); i < new_op->getNumResults(); i++) {
         Value new_res = new_op->getResult(i);
         auto casted =
-            rewriter.create<CastOp>(loc, unconstrainted_type, new_res);
+            CastOp::create(rewriter, loc, unconstrainted_type, new_res);
         tensor_list.push_back(casted.getOut());
       }
-      auto list_op = rewriter.create<BuildListOp>(loc, res_type, tensor_list);
+      auto list_op = BuildListOp::create(rewriter, loc, res_type, tensor_list);
       new_results.push_back(list_op.getOut());
     }
   }
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc b/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc
index a42d2f5d2ad7d1..aafd3d958f826b 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/deduplicate_if_result_pass.cc
@@ -94,7 +94,7 @@ mlir::func::FuncOp CreateBranchFunctionWithDeduplicatedResults(
   auto new_func_type = mlir::FunctionType::get(builder.getContext(), arg_types,
                                                new_result_types);
 
-  auto new_func = builder.create<mlir::func::FuncOp>(loc, name, new_func_type);
+  auto new_func = mlir::func::FuncOp::create(builder, loc, name, new_func_type);
   new_func.setVisibility(mlir::func::FuncOp::Visibility::Private);
 
   mlir::OpBuilder::InsertionGuard guard(builder);
@@ -110,8 +110,8 @@ mlir::func::FuncOp CreateBranchFunctionWithDeduplicatedResults(
 
   // Create the call op to the original func. The arguments are simply
   // the arguments from the wrapper function.
-  auto call_op = builder.create<mlir::TF::PartitionedCallOp>(
-      loc, result_types, block->getArguments(), /*args_attrs=*/nullptr,
+  auto call_op = mlir::TF::PartitionedCallOp::create(
+      builder, loc, result_types, block->getArguments(), /*args_attrs=*/nullptr,
       /*res_attrs=*/nullptr,
       mlir::FlatSymbolRefAttr::get(func.getSymNameAttr()), empty_string_attr,
       empty_string_attr, empty_string_attr);
@@ -120,7 +120,7 @@ mlir::func::FuncOp CreateBranchFunctionWithDeduplicatedResults(
     results.push_back(call_op.getResult(i));
   }
 
-  builder.create<mlir::func::ReturnOp>(loc, results);
+  mlir::func::ReturnOp::create(builder, loc, results);
 
   return new_func;
 }
@@ -183,8 +183,8 @@ void DeduplicateIfOps(mlir::ModuleOp module) {
         new_result_types.push_back(op->getResult(i).getType());
       }
 
-      auto new_if_op = builder.create<mlir::TF::IfOp>(
-          op.getLoc(), new_result_types, op.getCond(), op.getInput(),
+      auto new_if_op = mlir::TF::IfOp::create(
+          builder, op.getLoc(), new_result_types, op.getCond(), op.getInput(),
           new_then_func.getSymName(), new_else_func.getSymName(),
           op.getIsStateless());
 
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
index 77de1e0eb48669..73d5836fa895a6 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
@@ -159,12 +159,11 @@ void FuseCompileAndExecuteOps(
   auto producer_name =
       used_exec_op->getAttrOfType<mlir::StringAttr>("_producer_name");
   if (!producer_name) producer_name = mlir::StringAttr::get(context, "default");
-  auto compile_and_execute_op =
-      builder.create<mlir::TF::TPUCompileMlirAndExecuteOp>(
-          used_exec_op.getLoc(), output_types, exec_op_args,
-          static_shape_tensors,
-          builder.getI32ArrayAttr(static_shaped_operand_indices_attr),
-          compile_op.getMlirModule(), compile_op.getMetadata(), producer_name);
+  auto compile_and_execute_op = mlir::TF::TPUCompileMlirAndExecuteOp::create(
+      builder, used_exec_op.getLoc(), output_types, exec_op_args,
+      static_shape_tensors,
+      builder.getI32ArrayAttr(static_shaped_operand_indices_attr),
+      compile_op.getMlirModule(), compile_op.getMetadata(), producer_name);
 
   for (auto exec_op : exec_op_in_group) {
     exec_op.replaceAllUsesWith(compile_and_execute_op.getResults());
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc
index 2fc2c173fed8ba..1e2231f1c59584 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc
@@ -151,8 +151,8 @@ class RewriteClusterToIfrtCallPass
       // ifrt program already exists
       builder.setInsertionPoint(cluster_func);
 
-      mlir::TF::IfrtCallOp ifrt_call_op = builder.create<mlir::TF::IfrtCallOp>(
-          cluster_func->getLoc(), cluster_func.getResultTypes(),
+      mlir::TF::IfrtCallOp ifrt_call_op = mlir::TF::IfrtCallOp::create(
+          builder, cluster_func->getLoc(), cluster_func.getResultTypes(),
           cluster_func->getOperands());
 
       int64_t program_id;
@@ -189,8 +189,8 @@ class RewriteClusterToIfrtCallPass
     mlir::OpBuilder::InsertionGuard insertion_guard(builder);
     builder.setInsertionPoint(callee_func);
 
-    mlir::func::FuncOp cloned_ifrt_program = builder.create<mlir::func::FuncOp>(
-        callee_func->getLoc(), ifrt_program_name,
+    mlir::func::FuncOp cloned_ifrt_program = mlir::func::FuncOp::create(
+        builder, callee_func->getLoc(), ifrt_program_name,
         callee_func.getFunctionType());
     mlir::IRMapping mapper;
     callee_func.cloneInto(cloned_ifrt_program, mapper);
@@ -226,8 +226,8 @@ class RewriteClusterToIfrtCallPass
 
     builder.setInsertionPoint(cluster_func);
 
-    mlir::TF::IfrtCallOp ifrt_call_op = builder.create<mlir::TF::IfrtCallOp>(
-        cluster_func->getLoc(), cluster_func.getResultTypes(),
+    mlir::TF::IfrtCallOp ifrt_call_op = mlir::TF::IfrtCallOp::create(
+        builder, cluster_func->getLoc(), cluster_func.getResultTypes(),
         cluster_func->getOperands());
 
     // TODO(b/304839793): populate variable names after adding a variable
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc
index 5220824d3f716a..d0c8f03bf7f9c2 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc
@@ -130,15 +130,15 @@ class TfRestoreMergingPass
     // merged in order to keep the dominance property.
     mlir::OpBuilder builder(restores_to_merge.front());
 
-    auto new_tensor_names = builder.create<mlir::TF::ConstOp>(
-        builder.getFusedLoc(tensor_names_locs),
+    auto new_tensor_names = mlir::TF::ConstOp::create(
+        builder, builder.getFusedLoc(tensor_names_locs),
         GetStringTensorAttr(merged_tensor_names));
-    auto new_shape_and_slices = builder.create<mlir::TF::ConstOp>(
-        builder.getFusedLoc(shape_and_slices_locs),
+    auto new_shape_and_slices = mlir::TF::ConstOp::create(
+        builder, builder.getFusedLoc(shape_and_slices_locs),
         GetStringTensorAttr(merged_shape_and_slices));
 
-    auto new_restore = builder.create<mlir::TF::RestoreV2Op>(
-        builder.getFusedLoc(restore_locs),
+    auto new_restore = mlir::TF::RestoreV2Op::create(
+        builder, builder.getFusedLoc(restore_locs),
         mlir::TypeRange(mlir::ValueRange(values_to_replace)), prefix,
         new_tensor_names, new_shape_and_slices);
     for (auto [old_value, new_value] :
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc
index 130ca0a2e90b74..cb5b3e7afdcc13 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc
@@ -93,15 +93,15 @@ class TfRestoreSplittingPass
                    shape_and_slices.getValues<llvm::StringRef>(),
                    restore.getTensors())) {
       auto new_tensor_names =
-          builder.create<mlir::TF::ConstOp>(restore.getTensorNames().getLoc(),
-                                            GetStringTensorAttr({tensor_name}));
+          mlir::TF::ConstOp::create(builder, restore.getTensorNames().getLoc(),
+                                    GetStringTensorAttr({tensor_name}));
 
-      auto new_shape_and_slices = builder.create<mlir::TF::ConstOp>(
-          restore.getShapeAndSlices().getLoc(),
+      auto new_shape_and_slices = mlir::TF::ConstOp::create(
+          builder, restore.getShapeAndSlices().getLoc(),
           GetStringTensorAttr({shape_and_slice}));
 
-      auto new_restore = builder.create<mlir::TF::RestoreV2Op>(
-          restore.getLoc(), mlir::TypeRange({result.getType()}),
+      auto new_restore = mlir::TF::RestoreV2Op::create(
+          builder, restore.getLoc(), mlir::TypeRange({result.getType()}),
           restore.getPrefix(), new_tensor_names, new_shape_and_slices);
       result.replaceAllUsesWith(new_restore.getTensors()[0]);
     }
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
index 34b37eeefe7843..916b41620ad33e 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/lower_saved_model.cc
@@ -440,8 +440,8 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) {
   // "_tfrt_resource_init" is the special function that executes all invariant
   // ops (eg. read-only variables) used in the model. This function should be
   // executed after user-specified initialization.
-  auto init_func_op = builder.create<mlir::func::FuncOp>(
-      module.getLoc(), "_tfrt_resource_init",
+  auto init_func_op = mlir::func::FuncOp::create(
+      builder, module.getLoc(), "_tfrt_resource_init",
       mlir::FunctionType::get(module.getContext(), /*inputs=*/{},
                               /*results=*/{}));
   auto *block = init_func_op.addEntryBlock();
@@ -481,8 +481,8 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) {
     auto *new_op = new_value.getDefiningOp();
     assert(new_op);
     builder.setInsertionPointAfter(new_op);
-    auto set_resource_op = builder.create<mlir::TF::_TfrtSetResourceOp>(
-        new_op->getLoc(), new_value, index);
+    auto set_resource_op = mlir::TF::_TfrtSetResourceOp::create(
+        builder, new_op->getLoc(), new_value, index);
 
     // Preserve the device attribute.
     llvm::StringRef device = kCpuDeviceName;
@@ -494,7 +494,7 @@ void LowerTFSavedModelPass::HoistInvariantOps(mlir::ModuleOp module) {
 
   builder.setInsertionPointToEnd(block);
   // Finish building the init function by inserting an return op.
-  builder.create<mlir::func::ReturnOp>(init_func_op.getLoc());
+  mlir::func::ReturnOp::create(builder, init_func_op.getLoc());
 
   // Now that we have the index for each value that will be replaced, we can
   // create the tf._TfrtGetResource op in each function using these indices.
@@ -568,8 +568,8 @@ void LowerTFSavedModelPass::ReplaceHoistedValues(
       llvm::SmallVector<mlir::Value> new_values;
 
       if (fuse_get_resource_ops_) {
-        auto get_resource_op = builder.create<mlir::TF::_TfrtGetResourceOp>(
-            block->getParentOp()->getLoc(), old_values.getTypes(),
+        auto get_resource_op = mlir::TF::_TfrtGetResourceOp::create(
+            builder, block->getParentOp()->getLoc(), old_values.getTypes(),
             builder.getI64ArrayAttr(indices),
             builder.getStrArrayAttr(shared_name_arr),
             builder.getStrArrayAttr(container_arr));
@@ -577,8 +577,8 @@ void LowerTFSavedModelPass::ReplaceHoistedValues(
         new_values = get_resource_op.getResults();
       } else {
         for (int i = 0; i < old_values.size(); ++i) {
-          auto get_resource_op = builder.create<mlir::TF::_TfrtGetResourceOp>(
-              block->getParentOp()->getLoc(),
+          auto get_resource_op = mlir::TF::_TfrtGetResourceOp::create(
+              builder, block->getParentOp()->getLoc(),
               mlir::TypeRange(old_values[i].getType()),
               builder.getI64ArrayAttr(indices[i]),
               builder.getStrArrayAttr(shared_name_arr[i]),
@@ -670,8 +670,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable(
 
   mlir::OpBuilder builder(var_op);
 
-  auto var_handle_op = builder.create<mlir::TF::VarHandleOp>(
-      var_op.getLoc(),
+  auto var_handle_op = mlir::TF::VarHandleOp::create(
+      builder, var_op.getLoc(),
       mlir::RankedTensorType::get(
           {}, mlir::TF::ResourceType::get(
                   llvm::ArrayRef<mlir::TensorType>{tensor_type},
@@ -682,8 +682,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable(
     // Set insertion point to this identity_op so that the side-effect
     // visibility is preserved.
     builder.setInsertionPoint(op);
-    auto read_var_op = builder.create<mlir::TF::ReadVariableOp>(
-        op.getLoc(), op.getType(), var_handle_op);
+    auto read_var_op = mlir::TF::ReadVariableOp::create(
+        builder, op.getLoc(), op.getType(), var_handle_op);
     op.replaceAllUsesWith(read_var_op.getValue());
     op.erase();
   }
@@ -692,8 +692,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable(
     // Set the insertion point after the assign op so that all operands are
     // dominating the newly created op.
     builder.setInsertionPoint(op);
-    builder.create<mlir::TF::AssignVariableOp>(op.getLoc(), var_handle_op,
-                                               op.getValue());
+    mlir::TF::AssignVariableOp::create(builder, op.getLoc(), var_handle_op,
+                                       op.getValue());
     op.erase();
   }
 
@@ -704,8 +704,8 @@ mlir::LogicalResult ConvertReferenceVariableToResourceVariable(
     // the newly created op.
     builder.setInsertionPoint(op);
     // Create a new read variable op, so that the side-effects are preserved.
-    auto read_var_op = builder.create<mlir::TF::ReadVariableOp>(
-        op->getLoc(), tensor_type, var_handle_op);
+    auto read_var_op = mlir::TF::ReadVariableOp::create(
+        builder, op->getLoc(), tensor_type, var_handle_op);
     op->setOperand(idx, read_var_op.getValue());
   }
 
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc
index 59f602c0991faf..38737e22d1c588 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/merge_tf_if_ops.cc
@@ -225,8 +225,8 @@ class MergeTfIfOpsPass
                     [](mlir::TF::IfOp op) { return op.getIsStateless(); });
 
     // Create the merged tf.If op using the new branches.
-    auto new_if_op = builder.create<mlir::TF::IfOp>(
-        loc, new_result_types, if_ops.front().getCond(),
+    auto new_if_op = mlir::TF::IfOp::create(
+        builder, loc, new_result_types, if_ops.front().getCond(),
         if_ops.front().getInput(), then_branch_name, else_branch_name,
         is_stateless);
 
@@ -249,8 +249,8 @@ class MergeTfIfOpsPass
       llvm::ArrayRef<mlir::TF::IfOp> if_ops,
       llvm::function_ref<mlir::FlatSymbolRefAttr(mlir::TF::IfOp)> get_branch) {
     std::string branch_name = absl::StrCat(branch_prefix, branch_suffix);
-    auto branch = builder.create<mlir::func::FuncOp>(loc, branch_name,
-                                                     branch_function_type);
+    auto branch = mlir::func::FuncOp::create(builder, loc, branch_name,
+                                             branch_function_type);
     branch.setVisibility(mlir::func::FuncOp::Visibility::Private);
 
     mlir::OpBuilder::InsertionGuard guard(builder);
@@ -267,8 +267,9 @@ class MergeTfIfOpsPass
     for (auto if_op : if_ops) {
       // Create the call op to the original branch. The arguments are simply
       // the arguments from the wrapper function.
-      auto call_op = builder.create<mlir::TF::PartitionedCallOp>(
-          if_op.getLoc(), if_op.getResultTypes(), block->getArguments(),
+      auto call_op = mlir::TF::PartitionedCallOp::create(
+          builder, if_op.getLoc(), if_op.getResultTypes(),
+          block->getArguments(),
           /*args_attrs=*/nullptr, /*res_attrs=*/nullptr, get_branch(if_op),
           empty_string_attr, empty_string_attr, empty_string_attr);
 
@@ -276,7 +277,7 @@ class MergeTfIfOpsPass
       results.append(call_op.getOutput().begin(), call_op.getOutput().end());
     }
 
-    builder.create<mlir::func::ReturnOp>(loc, results);
+    mlir::func::ReturnOp::create(builder, loc, results);
 
     return branch.getSymName();
   }
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc
index 159e630fb8fb16..b0ad89b6b55d24 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/broadcast_propagation_pass.cc
@@ -280,8 +280,8 @@ DenseMap<BroadcastIntent, Value> realizeBroadcastIntents(
       setInsertionPointToEarliestPointWithAllValuesAvailable(
           rewriter, parentBlock,
           ValueRange{it.targetValue, it.outputDimensions});
-      realizations[it] = rewriter.create<DynamicBroadcastInDimOp>(
-          it.targetValue.getLoc(), it.resultType, it.targetValue,
+      realizations[it] = DynamicBroadcastInDimOp::create(
+          rewriter, it.targetValue.getLoc(), it.resultType, it.targetValue,
           it.outputDimensions,
           mlir::cast<DenseIntElementsAttr>(it.broadcastDimensions));
       continue;
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc
index 200f09c33021b1..18459a9e4e13a8 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework.cc
@@ -95,10 +95,10 @@ struct AllocOpConverter : public OpConversionPattern<memref::AllocOp> {
         alloc, alloc.getType(), *ctx, adaptor.getOperands(),
         reuse_input_candidates, reuse_output_index);
     Location loc = buffer.getLoc();
-    Value cond = rewriter.create<IsValidMemRefOp>(
-        loc, rewriter.getIntegerType(1), buffer);
-    rewriter.create<TFAssertOp>(loc, *ctx, cond, ErrorCode::RESOURCE_EXHAUSTED,
-                                "failed to allocate memory");
+    Value cond = IsValidMemRefOp::create(rewriter, loc,
+                                         rewriter.getIntegerType(1), buffer);
+    TFAssertOp::create(rewriter, loc, *ctx, cond, ErrorCode::RESOURCE_EXHAUSTED,
+                       "failed to allocate memory");
     return success();
   }
 };
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc
index 89d946516f6b9b..59792ae7297ce2 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc
@@ -65,8 +65,8 @@ LogicalResult RewriteToFullJit(func::FuncOp op) {
                                          old_body->getArgumentTypes(), locs);
 
   // Create the JIT compile op.
-  auto jit_compile_op = rewriter.create<tf_framework::JITCompileOp>(
-      loc, rewriter.getType<tf_framework::JITCallableType>(),
+  auto jit_compile_op = tf_framework::JITCompileOp::create(
+      rewriter, loc, rewriter.getType<tf_framework::JITCallableType>(),
       /*ctx=*/mlir::Value());
 
   // Move the original functions operations into the body.
@@ -80,18 +80,18 @@ LogicalResult RewriteToFullJit(func::FuncOp op) {
 
     Operation *terminator = jit_block->getTerminator();
     rewriter.setInsertionPointAfter(terminator);
-    rewriter.create<tf_framework::JITCompileYieldOp>(
-        loc, terminator->getOperands().front());
+    tf_framework::JITCompileYieldOp::create(rewriter, loc,
+                                            terminator->getOperands().front());
     terminator->erase();
   }
 
   // Create JIT execute op.
-  auto execute = rewriter.create<tf_framework::JITExecuteOp>(
-      loc, op.getResultTypes().front(), /*ctx=*/Value(),
+  auto execute = tf_framework::JITExecuteOp::create(
+      rewriter, loc, op.getResultTypes().front(), /*ctx=*/Value(),
       jit_compile_op.getResult(), new_body->getArguments());
 
   // Create a return.
-  rewriter.create<func::ReturnOp>(loc, execute.getResult());
+  func::ReturnOp::create(rewriter, loc, execute.getResult());
   return success();
 }
 
@@ -111,28 +111,28 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) {
 
   // Create large argument condition.
   auto arg_1 = new_body->getArgument(0);
-  auto shape_1 = rewriter.create<shape::ShapeOfOp>(loc, arg_1);
-  auto num_elems_1 = rewriter.create<shape::NumElementsOp>(loc, shape_1);
-  Value cst_i32_limit = rewriter.create<arith::ConstantIndexOp>(loc, i32Limit);
-  Value large_tensor_predicate = rewriter.create<arith::CmpIOp>(
-      loc, arith::CmpIPredicate::sgt, num_elems_1, cst_i32_limit);
+  auto shape_1 = shape::ShapeOfOp::create(rewriter, loc, arg_1);
+  auto num_elems_1 = shape::NumElementsOp::create(rewriter, loc, shape_1);
+  Value cst_i32_limit = arith::ConstantIndexOp::create(rewriter, loc, i32Limit);
+  Value large_tensor_predicate = arith::CmpIOp::create(
+      rewriter, loc, arith::CmpIPredicate::sgt, num_elems_1, cst_i32_limit);
   if (new_body->getNumArguments() > 1) {
     auto arg_2 = new_body->getArgument(1);
-    auto shape_2 = rewriter.create<shape::ShapeOfOp>(loc, arg_2);
-    auto num_elems_2 = rewriter.create<shape::NumElementsOp>(loc, shape_2);
-    large_tensor_predicate = rewriter.create<arith::OrIOp>(
-        loc, large_tensor_predicate,
+    auto shape_2 = shape::ShapeOfOp::create(rewriter, loc, arg_2);
+    auto num_elems_2 = shape::NumElementsOp::create(rewriter, loc, shape_2);
+    large_tensor_predicate = arith::OrIOp::create(
+        rewriter, loc, large_tensor_predicate,
         // Compare op to check size of the second op
-        rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt,
-                                       num_elems_2, cst_i32_limit));
+        arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::sgt,
+                              num_elems_2, cst_i32_limit));
   }
 
   // Create dispatch code.
   auto jit_body_builder_fn = [&](OpBuilder &b, Location loc) {
     // Create JIT compile op.
     auto callable_ty = b.getType<tf_framework::JITCallableType>();
-    auto jit_compile_op =
-        b.create<tf_framework::JITCompileOp>(loc, callable_ty, /*ctx=*/Value());
+    auto jit_compile_op = tf_framework::JITCompileOp::create(
+        b, loc, callable_ty, /*ctx=*/Value());
     {
       OpBuilder::InsertionGuard g(b);
       Block *block = b.createBlock(
@@ -144,15 +144,15 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) {
       for (auto &op : old_body->without_terminator()) {
         b.clone(op, bvm);
       }
-      b.create<tf_framework::JITCompileYieldOp>(
-          loc, block->back().getResults().front());
+      tf_framework::JITCompileYieldOp::create(
+          b, loc, block->back().getResults().front());
     }
 
     // Create JIT execute op.
-    auto jit_execute_op = b.create<tf_framework::JITExecuteOp>(
-        loc, op.getResultTypes().front(), /*ctx=*/Value(),
+    auto jit_execute_op = tf_framework::JITExecuteOp::create(
+        b, loc, op.getResultTypes().front(), /*ctx=*/Value(),
         jit_compile_op.getResult(), new_body->getArguments());
-    b.create<scf::YieldOp>(loc, jit_execute_op.getResult());
+    scf::YieldOp::create(b, loc, jit_execute_op.getResult());
   };
   auto aot_body_builder_fn = [&](OpBuilder &b, Location loc) {
     IRMapping bvm;
@@ -161,13 +161,13 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) {
     for (auto &op : old_body->without_terminator()) {
       last_clone = b.clone(op, bvm);
     }
-    b.create<scf::YieldOp>(loc, last_clone->getResults().front());
+    scf::YieldOp::create(b, loc, last_clone->getResults().front());
   };
 
   // Create the conditional and return operation.
-  auto ifOp = rewriter.create<scf::IfOp>(
-      loc, large_tensor_predicate, jit_body_builder_fn, aot_body_builder_fn);
-  rewriter.create<func::ReturnOp>(loc, ifOp.getResults().front());
+  auto ifOp = scf::IfOp::create(rewriter, loc, large_tensor_predicate,
+                                jit_body_builder_fn, aot_body_builder_fn);
+  func::ReturnOp::create(rewriter, loc, ifOp.getResults().front());
 
   // Remove the old body.
   rewriter.eraseBlock(old_body);
@@ -186,19 +186,19 @@ void PackJITCompileOp(tf_framework::JITCompileOp op,
   // Temporarily, build the module that would be JIT-compiled. This is only to
   // obtain the serialized code attribute.
   auto loc = op->getLoc();
-  auto jit_module = rewriter.create<ModuleOp>(loc);
+  auto jit_module = ModuleOp::create(rewriter, loc);
   {
     OpBuilder::InsertionGuard g(rewriter);
     rewriter.setInsertionPointToStart(jit_module.SingleBlock::getBody());
-    auto jit_function = rewriter.create<func::FuncOp>(
-        loc, tf_framework::JITCompileFromStrOp::kJITEntryFunctionName,
+    auto jit_function = func::FuncOp::create(
+        rewriter, loc, tf_framework::JITCompileFromStrOp::kJITEntryFunctionName,
         rewriter.getFunctionType(body->getArgumentTypes(),
                                  yield_op->getOperandTypes()));
     jit_function->setAttr(tf_framework::TFFrameworkDialect::kTFEntryAttrName,
                           rewriter.getUnitAttr());
     jit_function.getBody().takeBody(op.getBodyRegion());
     rewriter.setInsertionPointToEnd(&jit_function.getBody().front());
-    rewriter.create<func::ReturnOp>(loc, yield_op.getResult());
+    func::ReturnOp::create(rewriter, loc, yield_op.getResult());
     rewriter.eraseOp(yield_op);
   }
 
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc
index 4b1d10ca8dd372..66a455ca71c745 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/merge_assuming_ops_pass.cc
@@ -73,8 +73,8 @@ struct ShapeReificationPattern : public OpRewritePattern<shape::ShapeOfOp> {
 
     // Insert cast if needed.
     if (reifiedShape.getType() != op.getType()) {
-      reifiedShape = rewriter.create<tensor::CastOp>(op.getLoc(), op.getType(),
-                                                     reifiedShape);
+      reifiedShape = tensor::CastOp::create(rewriter, op.getLoc(), op.getType(),
+                                            reifiedShape);
     }
 
     rewriter.replaceOp(op, reifiedShape);
@@ -148,9 +148,9 @@ LogicalResult moveUpIntoAssumingOpMatchAndRewrite(Operation *op,
   // Insert the rewritten assuming op right before the old one.
   OpBuilder::InsertionGuard guard(rewriter);
   rewriter.setInsertionPoint(assumingOp);
-  auto newAssumingOp = rewriter.create<shape::AssumingOp>(
-      assumingOp.getLoc(), assumingOp.getWitness(),
-      [&](OpBuilder &b, Location) {
+  auto newAssumingOp = shape::AssumingOp::create(
+      rewriter, assumingOp.getLoc(), assumingOp.getWitness(),
+      [&](OpBuilder& b, Location) {
         // Copy body.
         IRMapping mapping;
         for (auto &nested : body->without_terminator())
@@ -304,9 +304,9 @@ struct MoveUpOutOfAssumingOpPattern : public OpRewritePattern<OpTy> {
     // explicitly as they are assumed to be independent. The assuming op is
     // rewritten accordingly.
     SmallVector<Value, 2> replacementValues;
-    auto newAssumingOp = rewriter.create<shape::AssumingOp>(
-        assumingOp.getLoc(), assumingOp.getWitness(),
-        [&](OpBuilder &b, Location) {
+    auto newAssumingOp = shape::AssumingOp::create(
+        rewriter, assumingOp.getLoc(), assumingOp.getWitness(),
+        [&](OpBuilder& b, Location) {
           // Copy body.
           IRMapping mapping;
           for (Operation &nested : body->without_terminator()) {
@@ -354,15 +354,16 @@ struct MergeAssumingOpsPattern : public OpRewritePattern<shape::AssumingOp> {
     // Merge witnesses.
     OpBuilder::InsertionGuard guard(rewriter);
     rewriter.setInsertionPoint(precedingOp);
-    Value newWitness = rewriter.create<shape::AssumingAllOp>(
-        op.getWitness().getDefiningOp()->getLoc(),
+    Value newWitness = shape::AssumingAllOp::create(
+        rewriter, op.getWitness().getDefiningOp()->getLoc(),
         ValueRange{precedingOp.getWitness(), op.getWitness()});
 
     // Merge assuming ops.
     Block *body_a = precedingOp.getBody();
     Block *body_b = op.getBody();
-    auto newAssumingOp = rewriter.create<shape::AssumingOp>(
-        precedingOp.getLoc(), newWitness, [&](OpBuilder &b, Location) {
+    auto newAssumingOp = shape::AssumingOp::create(
+        rewriter, precedingOp.getLoc(), newWitness,
+        [&](OpBuilder& b, Location) {
           // Copy preceding op's body.
           IRMapping mapping;
           for (auto &nested : body_a->without_terminator()) {
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc
index ceda47565bf999..959c56a87982ec 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc
@@ -49,14 +49,14 @@ class TFAssertOpConverter : public OpConversionPattern<TFAssertOp> {
     auto func = op->getParentOfType<func::FuncOp>();
     Block *error_reporting_block =
         rewriter.createBlock(&func.getRegion(), {}, {});
-    rewriter.create<ReportErrorOp>(loc, adaptor.getCtx(),
-                                   adaptor.getErrorCode(), adaptor.getMsg());
+    ReportErrorOp::create(rewriter, loc, adaptor.getCtx(),
+                          adaptor.getErrorCode(), adaptor.getMsg());
 
     SmallVector<Value, 2> null_memrefs;
     for (auto type : func.getFunctionType().getResults()) {
-      null_memrefs.push_back(rewriter.create<NullMemRefOp>(loc, type));
+      null_memrefs.push_back(NullMemRefOp::create(rewriter, loc, type));
     }
-    rewriter.create<func::ReturnOp>(loc, null_memrefs);
+    func::ReturnOp::create(rewriter, loc, null_memrefs);
 
     rewriter.restoreInsertionPoint(ip);
     rewriter.replaceOpWithNewOp<cf::CondBranchOp>(
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc
index a7d26813239571..2fd419972f4289 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tensorflow_abi_knowledge_propagation.cc
@@ -116,8 +116,8 @@ struct PropagateTfAbiKnowledgeToKernelsPass
           Value offset = kernel.getArgument(kernel_p + 2);
           Value &zero = constants[0];
           if (!zero) {
-            zero = b.create<LLVM::ConstantOp>(loc, offset.getType(),
-                                              b.getIndexAttr(0));
+            zero = LLVM::ConstantOp::create(b, loc, offset.getType(),
+                                            b.getIndexAttr(0));
           }
           offset.replaceAllUsesWith(zero);
         }
@@ -128,9 +128,9 @@ struct PropagateTfAbiKnowledgeToKernelsPass
               kernel.getArgument(kernel_p + 2 + memref.getRank() * 2);
           Value &stride_val = constants[const_stride->second];
           if (!stride_val) {
-            stride_val = b.create<LLVM::ConstantOp>(
-                loc, inner_stride.getType(),
-                b.getIndexAttr(const_stride->second));
+            stride_val =
+                LLVM::ConstantOp::create(b, loc, inner_stride.getType(),
+                                         b.getIndexAttr(const_stride->second));
           }
           inner_stride.replaceAllUsesWith(stride_val);
         }
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc
index 4cbe21b73f62c3..21d477b30547c1 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc
@@ -70,25 +70,27 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern<OpTy> {
     // If the attribute is missing or empty, set the element count to 0 and
     // return NULL.
     if (!attr.has_value() || attr.value().empty()) {
-      Value zero = rewriter->create<LLVM::ConstantOp>(
-          loc, size_ty, rewriter->getIntegerAttr(size_ty, 0));
-      Value null_ptr = rewriter->create<LLVM::ZeroOp>(loc, ptr_ty);
+      Value zero = LLVM::ConstantOp::create(
+          *rewriter, loc, size_ty, rewriter->getIntegerAttr(size_ty, 0));
+      Value null_ptr = LLVM::ZeroOp::create(*rewriter, loc, ptr_ty);
       return std::make_pair(zero, null_ptr);
     }
 
     // Allocate array to store the elements.
     auto &array_attr = attr.value();
-    Value array_size = rewriter->create<LLVM::ConstantOp>(
-        loc, size_ty, rewriter->getIntegerAttr(size_ty, array_attr.size()));
-    Value array_ptr = rewriter->create<LLVM::AllocaOp>(
-        loc, ptr_ty, element_ty, array_size, /*alignment=*/0);
+    Value array_size = LLVM::ConstantOp::create(
+        *rewriter, loc, size_ty,
+        rewriter->getIntegerAttr(size_ty, array_attr.size()));
+    Value array_ptr = LLVM::AllocaOp::create(*rewriter, loc, ptr_ty, element_ty,
+                                             array_size, /*alignment=*/0);
     for (const auto &e : llvm::enumerate(array_attr)) {
-      Value index = rewriter->create<LLVM::ConstantOp>(
-          loc, size_ty, rewriter->getIntegerAttr(size_ty, e.index()));
-      Value element_ptr = rewriter->create<LLVM::GEPOp>(loc, ptr_ty, element_ty,
-                                                        array_ptr, index);
+      Value index = LLVM::ConstantOp::create(
+          *rewriter, loc, size_ty,
+          rewriter->getIntegerAttr(size_ty, e.index()));
+      Value element_ptr = LLVM::GEPOp::create(*rewriter, loc, ptr_ty,
+                                              element_ty, array_ptr, index);
       Value element = create_element(e.value());
-      rewriter->create<LLVM::StoreOp>(loc, element, element_ptr);
+      LLVM::StoreOp::create(*rewriter, loc, element, element_ptr);
     }
     return std::make_pair(array_size, array_ptr);
   }
@@ -101,8 +103,8 @@ class ConvertToLLVMCallOpPattern : public ConvertOpToLLVMPattern<OpTy> {
     assert(mlir::isa<IntegerType>(element_ty) && "expect integer element type");
     return ConvertArrayAttrToStackAllocatedArray(
         loc, size_ty, element_ty, attr, rewriter, [&](Attribute attr) {
-          return rewriter->create<LLVM::ConstantOp>(
-              loc, element_ty,
+          return LLVM::ConstantOp::create(
+              *rewriter, loc, element_ty,
               rewriter->getIntegerAttr(element_ty,
                                        mlir::cast<IntegerAttr>(attr).getInt()));
         });
@@ -136,8 +138,8 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern<TFAllocOp> {
 
     // Convert `output_index` or set it to -1 if the attribute is missing.
     Type llvmInt32Type = IntegerType::get(rewriter.getContext(), 32);
-    Value output_index = rewriter.create<LLVM::ConstantOp>(
-        loc, llvmInt32Type,
+    Value output_index = LLVM::ConstantOp::create(
+        rewriter, loc, llvmInt32Type,
         rewriter.getI32IntegerAttr(tf_alloc_op.getOutputIndex().has_value()
                                        ? tf_alloc_op.getOutputIndex().value()
                                        : -1));
@@ -152,12 +154,11 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern<TFAllocOp> {
     FlatSymbolRefAttr tf_func_ref =
         GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter);
     Value allocated_byte_ptr =
-        rewriter
-            .create<LLVM::CallOp>(
-                loc, getVoidPtrType(), tf_func_ref,
-                llvm::ArrayRef({adaptor.getCtx(), num_elements, element_size,
-                                output_index, candidates_count_and_ptr.first,
-                                candidates_count_and_ptr.second}))
+        LLVM::CallOp::create(
+            rewriter, loc, getVoidPtrType(), tf_func_ref,
+            llvm::ArrayRef({adaptor.getCtx(), num_elements, element_size,
+                            output_index, candidates_count_and_ptr.first,
+                            candidates_count_and_ptr.second}))
             .getResult();
 
     MemRefDescriptor memRefDescriptor = CreateMemRefDescriptor(
@@ -213,7 +214,7 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern<TFAllocOp> {
       // Update stride
       if (pos > 0) {
         stride_carried =
-            rewriter.create<LLVM::MulOp>(loc, stride_carried, size);
+            LLVM::MulOp::create(rewriter, loc, stride_carried, size);
       }
     }
     return memref_desc;
@@ -272,12 +273,12 @@ class JITCompileFromStrOpConverter
         ConvertIntegerArrayAttrToStackAllocatedArray(
             loc, rewriter.getI64Type(), rewriter.getI64Type(),
             op.getUnrollFactors(), &rewriter);
-    Value enable_ftz = rewriter.create<LLVM::ConstantOp>(
-        loc, rewriter.getI1Type(), op.getEnableFtzAttr());
-    Value index_64bit = rewriter.create<LLVM::ConstantOp>(
-        loc, rewriter.getI1Type(), op.getIndex64BitAttr());
-    Value cpu_codegen = rewriter.create<LLVM::ConstantOp>(
-        loc, rewriter.getI1Type(), op.getCpuCodegenAttr());
+    Value enable_ftz = LLVM::ConstantOp::create(
+        rewriter, loc, rewriter.getI1Type(), op.getEnableFtzAttr());
+    Value index_64bit = LLVM::ConstantOp::create(
+        rewriter, loc, rewriter.getI1Type(), op.getIndex64BitAttr());
+    Value cpu_codegen = LLVM::ConstantOp::create(
+        rewriter, loc, rewriter.getI1Type(), op.getCpuCodegenAttr());
     FlatSymbolRefAttr tf_func_ref =
         GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter);
     rewriter.replaceOpWithNewOp<LLVM::CallOp>(
@@ -327,40 +328,39 @@ class JITExecuteOpConverter : public ConvertToLLVMCallOpPattern<JITExecuteOp> {
         getTypeConverter()->convertType(op->getResultTypes().front());
     Type ptr_ty = LLVM::LLVMPointerType::get(getContext());
     Type i64_ty = rewriter.getI64Type();
-    Value one = rewriter.create<LLVM::ConstantOp>(
-        loc, i64_ty, rewriter.getI64IntegerAttr(1));
+    Value one = LLVM::ConstantOp::create(rewriter, loc, i64_ty,
+                                         rewriter.getI64IntegerAttr(1));
     auto result_ptr =
-        rewriter.create<LLVM::AllocaOp>(loc, ptr_ty, result_ty, one);
+        LLVM::AllocaOp::create(rewriter, loc, ptr_ty, result_ty, one);
 
     // Pass the buffer arguments as a stack-allocated array.
     Type args_elem_ty = adaptor.getInputs().front().getType();
-    Value num_args = rewriter.create<LLVM::ConstantOp>(
-        loc, i64_ty,
+    Value num_args = LLVM::ConstantOp::create(
+        rewriter, loc, i64_ty,
         rewriter.getI64IntegerAttr(
             static_cast<int64_t>(adaptor.getInputs().size())));
     Value args_ptr =
-        rewriter.create<LLVM::AllocaOp>(loc, ptr_ty, args_elem_ty, num_args,
-                                        /*alignment=*/0);
+        LLVM::AllocaOp::create(rewriter, loc, ptr_ty, args_elem_ty, num_args,
+                               /*alignment=*/0);
     for (const auto &it : llvm::enumerate(adaptor.getInputs())) {
-      Value index = rewriter.create<LLVM::ConstantOp>(
-          loc, i64_ty, rewriter.getI64IntegerAttr(it.index()));
-      Value element_ptr = rewriter.create<LLVM::GEPOp>(
-          loc, ptr_ty, args_elem_ty, args_ptr, index);
-      rewriter.create<LLVM::StoreOp>(loc, it.value(), element_ptr);
+      Value index = LLVM::ConstantOp::create(
+          rewriter, loc, i64_ty, rewriter.getI64IntegerAttr(it.index()));
+      Value element_ptr = LLVM::GEPOp::create(rewriter, loc, ptr_ty,
+                                              args_elem_ty, args_ptr, index);
+      LLVM::StoreOp::create(rewriter, loc, it.value(), element_ptr);
     }
 
     // Materialize runtime call.
     FlatSymbolRefAttr tf_func_ref =
         GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter);
-    rewriter.create<LLVM::CallOp>(
-        loc, mlir::TypeRange(), tf_func_ref,
-        ValueRange{adaptor.getCtx(), adaptor.getCallable(), result_ptr,
-                   num_args, args_ptr});
+    LLVM::CallOp::create(rewriter, loc, mlir::TypeRange(), tf_func_ref,
+                         ValueRange{adaptor.getCtx(), adaptor.getCallable(),
+                                    result_ptr, num_args, args_ptr});
 
     // Copy result (including the descriptor) to a stack-allocated buffer and
     // free the old descriptor.
     llvm::SmallVector<Value, 1> final_result = {
-        rewriter.create<LLVM::LoadOp>(loc, result_ty, result_ptr)};
+        LLVM::LoadOp::create(rewriter, loc, result_ty, result_ptr)};
     if (failed(copyUnrankedDescriptors(rewriter, loc, op->getResultTypes(),
                                        final_result,
                                        /*toDynamic=*/false))) {
@@ -402,8 +402,8 @@ class ReportErrorOpConverter
     // Insert function call.
     FlatSymbolRefAttr tf_func_ref =
         GetOrInsertLLVMFunction(GetFuncName(), GetFuncType(), op, &rewriter);
-    Value error_code = rewriter.create<LLVM::ConstantOp>(
-        loc, typeConverter->convertType(rewriter.getI32Type()),
+    Value error_code = LLVM::ConstantOp::create(
+        rewriter, loc, typeConverter->convertType(rewriter.getI32Type()),
         adaptor.getErrorCodeAttr());
     rewriter.replaceOpWithNewOp<LLVM::CallOp>(
         op, mlir::TypeRange(), tf_func_ref,
@@ -489,7 +489,7 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern<NullMemRefOp> {
 
       // Prepare packed args [allocatedPtr, alignedPtr, offset, sizes, strides]
       // to create a memref descriptor.
-      Value null = rewriter.create<LLVM::ZeroOp>(loc, llvm_ptr_type);
+      Value null = LLVM::ZeroOp::create(rewriter, loc, llvm_ptr_type);
       SmallVector<Value, 12> packed_values{null, null, zero};
       packed_values.append(sizes);
       packed_values.append(strides);
@@ -518,11 +518,12 @@ class NullMemRefOpConverter : public ConvertOpToLLVMPattern<NullMemRefOp> {
     // setting its pointer to NULL.
     Value alloca_size = UnrankedMemRefDescriptor::computeSize(
         rewriter, loc, *getTypeConverter(), desc, addressSpace);
-    Value underlying_desc_ptr = rewriter.create<LLVM::AllocaOp>(
-        loc, getVoidPtrType(), IntegerType::get(getContext(), 8), alloca_size);
+    Value underlying_desc_ptr =
+        LLVM::AllocaOp::create(rewriter, loc, getVoidPtrType(),
+                               IntegerType::get(getContext(), 8), alloca_size);
 
     // Populate underlying ranked descriptor.
-    Value null = rewriter.create<LLVM::ZeroOp>(loc, llvm_ptr_type);
+    Value null = LLVM::ZeroOp::create(rewriter, loc, llvm_ptr_type);
     UnrankedMemRefDescriptor::setAllocatedPtr(
         rewriter, loc, underlying_desc_ptr, llvm_ptr_type, null);
     UnrankedMemRefDescriptor::setAlignedPtr(rewriter, loc, *getTypeConverter(),
@@ -551,21 +552,23 @@ class IsValidMemRefOpConverter
 
     // Compare every size in the descriptor to 0 to check num_elements == 0.
     int64_t rank = mlir::cast<MemRefType>(op.getArg().getType()).getRank();
-    Value is_empty_shape = rewriter.create<LLVM::ConstantOp>(
-        loc, rewriter.getI1Type(), rewriter.getBoolAttr(false));
+    Value is_empty_shape = LLVM::ConstantOp::create(
+        rewriter, loc, rewriter.getI1Type(), rewriter.getBoolAttr(false));
     Value zero = createIndexAttrConstant(rewriter, loc, getIndexType(), 0);
     for (int i = 0; i < rank; ++i) {
       Value size = desc.size(rewriter, loc, i);
-      Value is_zero_size = rewriter.create<LLVM::ICmpOp>(
-          loc, rewriter.getI1Type(), LLVM::ICmpPredicate::eq, size, zero);
+      Value is_zero_size =
+          LLVM::ICmpOp::create(rewriter, loc, rewriter.getI1Type(),
+                               LLVM::ICmpPredicate::eq, size, zero);
       is_empty_shape =
-          rewriter.create<LLVM::OrOp>(loc, is_empty_shape, is_zero_size);
+          LLVM::OrOp::create(rewriter, loc, is_empty_shape, is_zero_size);
     }
 
     Value ptr = desc.allocatedPtr(rewriter, loc);
-    Value null = rewriter.create<LLVM::ZeroOp>(loc, getVoidPtrType());
-    Value is_not_nullptr = rewriter.create<LLVM::ICmpOp>(
-        loc, rewriter.getI1Type(), LLVM::ICmpPredicate::ne, ptr, null);
+    Value null = LLVM::ZeroOp::create(rewriter, loc, getVoidPtrType());
+    Value is_not_nullptr =
+        LLVM::ICmpOp::create(rewriter, loc, rewriter.getI1Type(),
+                             LLVM::ICmpPredicate::ne, ptr, null);
 
     // Valid memref = ptr != NULL || num_elements == 0;
     rewriter.replaceOpWithNewOp<LLVM::OrOp>(op, is_not_nullptr, is_empty_shape);
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc
index ff19510805fe50..e51a397363e01e 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc
@@ -115,27 +115,28 @@ Value ConvertLaunchFuncOpToTfRuntimeCallPattern::generateParamsArray(
   for (auto argument : arguments) argument_types.push_back(argument.getType());
   auto struct_type = LLVM::LLVMStructType::getNewIdentified(
       context_, StringRef(), argument_types);
-  auto one = builder.create<LLVM::ConstantOp>(loc, llvm_int32_type_,
-                                              builder.getI32IntegerAttr(1));
-  auto struct_ptr = builder.create<LLVM::AllocaOp>(
-      loc, llvm_pointer_type_, struct_type, one, /*alignment=*/0);
-  auto array_size = builder.create<LLVM::ConstantOp>(
-      loc, llvm_int32_type_, builder.getI32IntegerAttr(num_arguments));
-  auto array_ptr = builder.create<LLVM::AllocaOp>(
-      loc, llvm_pointer_type_, llvm_pointer_type_, array_size, /*alignment=*/0);
-  auto zero = builder.create<LLVM::ConstantOp>(loc, llvm_int32_type_,
-                                               builder.getI32IntegerAttr(0));
+  auto one = LLVM::ConstantOp::create(builder, loc, llvm_int32_type_,
+                                      builder.getI32IntegerAttr(1));
+  auto struct_ptr = LLVM::AllocaOp::create(builder, loc, llvm_pointer_type_,
+                                           struct_type, one, /*alignment=*/0);
+  auto array_size = LLVM::ConstantOp::create(
+      builder, loc, llvm_int32_type_, builder.getI32IntegerAttr(num_arguments));
+  auto array_ptr =
+      LLVM::AllocaOp::create(builder, loc, llvm_pointer_type_,
+                             llvm_pointer_type_, array_size, /*alignment=*/0);
+  auto zero = LLVM::ConstantOp::create(builder, loc, llvm_int32_type_,
+                                       builder.getI32IntegerAttr(0));
   for (auto en : llvm::enumerate(arguments)) {
-    auto index = builder.create<LLVM::ConstantOp>(
-        loc, llvm_int32_type_, builder.getI32IntegerAttr(en.index()));
-    auto field_ptr = builder.create<LLVM::GEPOp>(
-        loc, llvm_pointer_type_, struct_type, struct_ptr,
+    auto index = LLVM::ConstantOp::create(
+        builder, loc, llvm_int32_type_, builder.getI32IntegerAttr(en.index()));
+    auto field_ptr = LLVM::GEPOp::create(
+        builder, loc, llvm_pointer_type_, struct_type, struct_ptr,
         ArrayRef<Value>{zero, index.getResult()});
-    builder.create<LLVM::StoreOp>(loc, en.value(), field_ptr);
+    LLVM::StoreOp::create(builder, loc, en.value(), field_ptr);
     auto element_ptr =
-        builder.create<LLVM::GEPOp>(loc, llvm_pointer_type_, llvm_pointer_type_,
-                                    array_ptr, index.getResult());
-    builder.create<LLVM::StoreOp>(loc, field_ptr, element_ptr);
+        LLVM::GEPOp::create(builder, loc, llvm_pointer_type_,
+                            llvm_pointer_type_, array_ptr, index.getResult());
+    LLVM::StoreOp::create(builder, loc, field_ptr, element_ptr);
   }
   return array_ptr;
 }
@@ -220,11 +221,11 @@ LogicalResult ConvertLaunchFuncOpToTfRuntimeCallPattern::matchAndRewrite(
                          });
     rewriter.setInsertionPointToStart(
         launch_op->getParentOfType<ModuleOp>().getBody());
-    function = rewriter.create<LLVM::LLVMFuncOp>(
-        loc, kTfWrapperLibaryLaunchHelperName, function_type);
+    function = LLVM::LLVMFuncOp::create(
+        rewriter, loc, kTfWrapperLibaryLaunchHelperName, function_type);
   }
-  rewriter.create<LLVM::CallOp>(
-      loc, TypeRange(), mlir::SymbolRefAttr::get(function),
+  LLVM::CallOp::create(
+      rewriter, loc, TypeRange(), mlir::SymbolRefAttr::get(function),
 
       ArrayRef<Value>{context_arg, module_blob, kernel_name_global,
                       adaptor.getGridSizeX(), adaptor.getGridSizeY(),
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc
index b3cb73b78baf20..a6ee71bfed73b8 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/utils.cc
@@ -35,7 +35,7 @@ FlatSymbolRefAttr GetOrInsertLLVMFunction(StringRef func_name, Type func_type,
   if (!tf_func) {
     OpBuilder::InsertionGuard guard(*b);
     b->setInsertionPointToStart(module.getBody());
-    tf_func = b->create<LLVMFuncOp>(b->getUnknownLoc(), func_name, func_type);
+    tf_func = LLVMFuncOp::create(*b, b->getUnknownLoc(), func_name, func_type);
   }
   return SymbolRefAttr::get(b->getContext(), func_name);
 }
@@ -55,11 +55,12 @@ Value CreateOrFindGlobalStringConstant(Location loc, StringRef global_name,
     StringRef symbol_name = global_op.getName();
     Type symbol_type = global_op.getType();
     Type ptr_type = LLVM::LLVMPointerType::get(b->getContext());
-    Value global_ptr = b->create<LLVM::AddressOfOp>(loc, ptr_type, symbol_name);
+    Value global_ptr =
+        LLVM::AddressOfOp::create(*b, loc, ptr_type, symbol_name);
     Value c0 =
-        b->create<LLVM::ConstantOp>(loc, b->getI64Type(), b->getIndexAttr(0));
-    return b->create<LLVM::GEPOp>(loc, ptr_type, symbol_type, global_ptr,
-                                  ValueRange{c0, c0});
+        LLVM::ConstantOp::create(*b, loc, b->getI64Type(), b->getIndexAttr(0));
+    return LLVM::GEPOp::create(*b, loc, ptr_type, symbol_type, global_ptr,
+                               ValueRange{c0, c0});
   }
   return LLVM::createGlobalString(loc, *b, global_name, content,
                                   LLVM::Linkage::Internal);

From 3feec3590c4d9d8d00e54954f4da05c7c3e72bcc Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Fri, 19 Dec 2025 15:23:06 -0800
Subject: [PATCH 601/753] Migrate reshape_motion_test to PjRt.

PiperOrigin-RevId: 846886339
---
 third_party/xla/xla/tests/BUILD               | 17 ++++---------
 .../xla/xla/tests/reshape_motion_test.cc      | 25 ++++++-------------
 2 files changed, 12 insertions(+), 30 deletions(-)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 5a26f9cb53eaab..ae3c29fcdd5e72 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -3595,22 +3595,15 @@ xla_test(
 xla_test(
     name = "reshape_motion_test",
     srcs = ["reshape_motion_test.cc"],
+    tags = ["test_migrated_to_hlo_runner_pjrt"],
     deps = [
-        ":client_library_test_base",
-        ":literal_test_util",
+        ":client_library_test_runner_mixin",
+        ":hlo_pjrt_interpreter_reference_mixin",
+        ":hlo_pjrt_test_base",
         ":xla_internal_test_main",
-        "//xla:array2d",
-        "//xla:array4d",
-        "//xla:literal",
-        "//xla:reference_util",
-        "//xla:shape_util",
-        "//xla:status_macros",
-        "//xla/client:local_client",
         "//xla/hlo/builder:xla_builder",
-        "//xla/hlo/testlib:test_helpers",
-        "@com_google_absl//absl/status:statusor",
+        "//xla/tsl/platform:test",
         "@com_google_absl//absl/types:span",
-        "@local_tsl//tsl/platform:test",
     ],
 )
 
diff --git a/third_party/xla/xla/tests/reshape_motion_test.cc b/third_party/xla/xla/tests/reshape_motion_test.cc
index 54c63a5e64ed26..b46765e5c7ebc6 100644
--- a/third_party/xla/xla/tests/reshape_motion_test.cc
+++ b/third_party/xla/xla/tests/reshape_motion_test.cc
@@ -13,31 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <memory>
-#include <numeric>
-#include <random>
-#include <vector>
+#include <cstdint>
 
-#include "absl/status/statusor.h"
 #include "absl/types/span.h"
-#include "xla/array2d.h"
-#include "xla/array4d.h"
-#include "xla/client/local_client.h"
 #include "xla/hlo/builder/xla_builder.h"
-#include "xla/hlo/testlib/test_helpers.h"
-#include "xla/layout_util.h"
-#include "xla/literal.h"
-#include "xla/reference_util.h"
-#include "xla/shape_util.h"
-#include "xla/status_macros.h"
-#include "xla/tests/client_library_test_base.h"
-#include "xla/tests/literal_test_util.h"
-#include "tsl/platform/test.h"
+#include "xla/tests/client_library_test_runner_mixin.h"
+#include "xla/tests/hlo_pjrt_interpreter_reference_mixin.h"
+#include "xla/tests/hlo_pjrt_test_base.h"
+#include "xla/tsl/platform/test.h"
 
 namespace xla {
 namespace {
 
-using ReshapeMotionTest = ClientLibraryTestBase;
+using ReshapeMotionTest = ClientLibraryTestRunnerMixin<
+    HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>>;
 
 TEST_F(ReshapeMotionTest, ElementwiseOfReshapesWithNonSameInputShapes) {
   XlaBuilder builder(TestName());

From cb2c66e7150d69af0473ff2ceffab7980111e727 Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Fri, 19 Dec 2025 15:55:14 -0800
Subject: [PATCH 602/753] [XLA:CPU] Enable window dilation support in XLA CPU
 YNN convolution.

PiperOrigin-RevId: 846895590
---
 third_party/xla/xla/backends/cpu/ynn_emitter.cc | 2 +-
 third_party/xla/xla/backends/cpu/ynn_support.cc | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.cc b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
index fb5f1abbb33e3b..ec55a93dccb034 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
@@ -725,7 +725,7 @@ static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
     stencil_axes[i] = conv_dimensions.input_spatial_dimensions(i);
     stencil_dims[i] = conv_window.dimensions(i).size();
     stencil_strides[i] = conv_window.dimensions(i).stride();
-    stencil_dilations[i] = 1;
+    stencil_dilations[i] = conv_window.dimensions(i).window_dilation();
     padding_lows[i] = conv_window.dimensions(i).padding_low();
     padding_highs[i] = conv_window.dimensions(i).padding_high();
   }
diff --git a/third_party/xla/xla/backends/cpu/ynn_support.cc b/third_party/xla/xla/backends/cpu/ynn_support.cc
index 7054d92f615ced..d3d260371e75aa 100644
--- a/third_party/xla/xla/backends/cpu/ynn_support.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_support.cc
@@ -348,10 +348,8 @@ bool IsConvolutionOpSupportedByYnn(const HloInstruction* instr) {
     return false;
   }
 
-  // No dilation for now.
-  if ((window.dimensions(0).window_dilation() != 1) ||
-      (window.dimensions(1).window_dilation() != 1) ||
-      (window.dimensions(0).base_dilation() != 1) ||
+  // No base dilation for now.
+  if ((window.dimensions(0).base_dilation() != 1) ||
       (window.dimensions(1).base_dilation() != 1)) {
     return false;
   }

From 49fda87ad762c8210eeac05926ce0281230a7d36 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 16:14:19 -0800
Subject: [PATCH 603/753] Update usage of llvm::PointerType::getUnqual to use
 non-deprecated version

PiperOrigin-RevId: 846902320
---
 .../xla/xla/backends/cpu/codegen/vector_ir_builder.cc       | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/codegen/vector_ir_builder.cc b/third_party/xla/xla/backends/cpu/codegen/vector_ir_builder.cc
index 2a5d61871b4dfa..96dcbde59ec089 100644
--- a/third_party/xla/xla/backends/cpu/codegen/vector_ir_builder.cc
+++ b/third_party/xla/xla/backends/cpu/codegen/vector_ir_builder.cc
@@ -54,9 +54,11 @@ VectorIrBuilder::VectorIrBuilder(PrimitiveType primitive_type,
       name_(std::move(name)) {
   scalar_type_ =
       llvm_ir::PrimitiveTypeToIrType(primitive_type, b_->getContext());
-  scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
+  scalar_pointer_type_ =
+      llvm::PointerType::getUnqual(scalar_type_->getContext());
   vector_type_ = llvm::VectorType::get(scalar_type_, vector_size, false);
-  vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
+  vector_pointer_type_ =
+      llvm::PointerType::getUnqual(vector_type_->getContext());
 }
 
 void VectorIrBuilder::AssertCorrectTypes(

From 911ce60c2902b58cd892ca05f2297d72ef624e5f Mon Sep 17 00:00:00 2001
From: Quentin Khan <qkhan@google.com>
Date: Fri, 19 Dec 2025 16:58:16 -0800
Subject: [PATCH 604/753] Use the XNNPack packing fingerprints to invalidate
 the weight cache.

PiperOrigin-RevId: 846914182
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  35 +++
 .../channelwise_quantized_conv_2d_test.cc     |  91 ++------
 ...elwise_quantized_depthwise_conv_2d_test.cc | 116 +++-------
 .../lite/delegates/xnnpack/conv_2d_test.cc    | 159 +++-----------
 .../xnnpack/depthwise_conv_2d_test.cc         | 170 +++------------
 .../dynamically_quantized_conv_2d_test.cc     | 179 +++-------------
 ...amically_quantized_fully_connected_test.cc | 113 +---------
 ...namically_quantized_transpose_conv_test.cc |  60 ++----
 ...mically_quantized_transpose_conv_tester.cc |  10 +-
 .../xnnpack/fingerprint_test_helpers.h        | 112 ++++++++++
 .../delegates/xnnpack/fully_connected_test.cc | 152 +++----------
 .../xnnpack/signed_quantized_conv_2d_test.cc  |  97 ++-------
 ...signed_quantized_depthwise_conv_2d_test.cc | 122 +++--------
 .../signed_quantized_fully_connected_test.cc  |  91 ++------
 .../signed_quantized_transpose_conv_test.cc   | 127 +++--------
 .../delegates/xnnpack/transpose_conv_test.cc  | 199 ++++--------------
 .../unsigned_quantized_conv_2d_test.cc        |  97 ++-------
 ...signed_quantized_depthwise_conv_2d_test.cc | 121 +++--------
 ...unsigned_quantized_fully_connected_test.cc |  85 ++------
 .../unsigned_quantized_transpose_conv_test.cc | 127 +++--------
 .../lite/delegates/xnnpack/weight_cache.cc    |  85 ++++++--
 .../lite/delegates/xnnpack/weight_cache.h     |   7 +-
 .../delegates/xnnpack/weight_cache_schema.fbs |   5 +-
 .../delegates/xnnpack/weight_cache_test.cc    | 192 ++++++++++++-----
 24 files changed, 765 insertions(+), 1787 deletions(-)
 create mode 100644 tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 227537a79f1454..02d51f21d4fa4e 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -392,6 +392,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "fingerprint_test_helpers",
+    testonly = True,
+    hdrs = ["fingerprint_test_helpers.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        ":weight_cache",
+        ":weight_cache_test_helpers",
+        ":xnnpack_delegate_hdrs_only",
+        "//tensorflow/lite/c:common",
+        "@XNNPACK",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "mmap_handle",
     srcs = ["mmap_handle.cc"],
@@ -1347,6 +1362,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1363,6 +1379,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1397,6 +1414,7 @@ cc_test(
     }),
     deps = [
         ":conv_2d_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1446,6 +1464,7 @@ cc_test(
     }),
     deps = [
         ":depthwise_conv_2d_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1465,6 +1484,7 @@ cc_test(
     tags = ["notap"],
     deps = [
         ":dynamically_quantized_fully_connected_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1481,6 +1501,7 @@ cc_test(
     }),
     deps = [
         ":dynamically_quantized_conv_2d_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1497,6 +1518,7 @@ cc_test(
     }),
     deps = [
         ":dynamically_quantized_transpose_conv_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1513,10 +1535,14 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":fully_connected_tester",
         ":test_main",
+        ":weight_cache",
+        ":weight_cache_test_helpers",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
+        "@XNNPACK",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1864,6 +1890,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1880,6 +1907,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1930,6 +1958,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_fully_connected_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2163,6 +2192,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_transpose_conv_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2307,6 +2337,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":test_main",
         ":transpose_conv_tester",
         ":xnnpack_delegate_test_mode",
@@ -2386,6 +2417,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2401,6 +2433,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2431,6 +2464,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_fully_connected_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2641,6 +2675,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_transpose_conv_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
index 92293e08227593..d195d4f25435e8 100644
--- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
@@ -24,17 +24,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(ChannelwiseQuantizedConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct ChannelwiseQuantizedConv2D : DelegateTest {};
 
+TEST_F(ChannelwiseQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -71,11 +70,7 @@ TEST(ChannelwiseQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -112,11 +107,7 @@ TEST(ChannelwiseQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -155,11 +146,7 @@ TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -198,11 +185,7 @@ TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -241,11 +224,7 @@ TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -288,11 +267,7 @@ TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -335,11 +310,7 @@ TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -382,11 +353,7 @@ TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -429,11 +396,7 @@ TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -476,11 +439,7 @@ TEST(ChannelwiseQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -523,11 +482,7 @@ TEST(ChannelwiseQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -570,13 +525,11 @@ TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, MultiThreading) {
+TEST_F(ChannelwiseQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -619,7 +572,7 @@ TEST(ChannelwiseQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
+TEST_F(ChannelwiseQuantizedConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -627,9 +580,7 @@ TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -673,15 +624,13 @@ TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) {
+TEST_F(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
index 25dada01896c34..0c6de84e9a8d2f 100644
--- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
@@ -23,18 +23,16 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct ChannelwiseQuantizedDepthwiseConv2D : DelegateTest {};
 
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -66,11 +64,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -103,11 +97,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -140,11 +130,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -179,11 +165,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -216,11 +198,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -255,11 +233,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -297,11 +271,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -339,11 +309,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -385,11 +351,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -431,11 +393,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -477,11 +435,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -523,11 +477,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -573,11 +523,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -619,11 +565,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -665,11 +607,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -711,13 +649,11 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -759,7 +695,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -767,9 +703,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -812,15 +746,13 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
index 25090bbaf2b5cf..e1b5a674946b73 100644
--- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/conv_2d_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(Conv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct Conv2D : DelegateTest {};
 
+TEST_F(Conv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -52,11 +50,7 @@ TEST(Conv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -78,11 +72,7 @@ TEST(Conv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -106,11 +96,7 @@ TEST(Conv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -136,11 +122,7 @@ TEST(Conv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -164,11 +146,7 @@ TEST(Conv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -192,11 +170,7 @@ TEST(Conv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -224,11 +198,7 @@ TEST(Conv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -256,11 +226,7 @@ TEST(Conv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -288,11 +254,7 @@ TEST(Conv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -320,11 +282,7 @@ TEST(Conv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -352,11 +310,7 @@ TEST(Conv2D, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -384,11 +338,7 @@ TEST(Conv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -416,11 +366,7 @@ TEST(Conv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -448,11 +394,7 @@ TEST(Conv2D, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseFP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseFP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -481,11 +423,7 @@ TEST(Conv2D, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -514,11 +452,7 @@ TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -547,11 +481,7 @@ TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -579,11 +509,7 @@ TEST(Conv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -611,11 +537,7 @@ TEST(Conv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -643,11 +565,7 @@ TEST(Conv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DISABLED_TanhActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DISABLED_TanhActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -675,11 +593,7 @@ TEST(Conv2D, DISABLED_TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DISABLED_SignBitActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DISABLED_SignBitActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -707,13 +621,11 @@ TEST(Conv2D, DISABLED_SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, MultiThreading) {
+TEST_F(Conv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -741,7 +653,7 @@ TEST(Conv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, WeightsCache) {
+TEST_F(Conv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -749,10 +661,7 @@ TEST(Conv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -781,15 +690,13 @@ TEST(Conv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(Conv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
index e894bcdc2bc46a..931fff88178dfb 100644
--- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(DepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct DepthwiseConv2D : DelegateTest {};
 
+TEST_F(DepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -47,11 +45,7 @@ TEST(DepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -69,11 +63,7 @@ TEST(DepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -91,11 +81,7 @@ TEST(DepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -115,11 +101,7 @@ TEST(DepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -137,11 +119,7 @@ TEST(DepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -161,11 +139,7 @@ TEST(DepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -188,11 +162,7 @@ TEST(DepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -215,11 +185,7 @@ TEST(DepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -246,11 +212,7 @@ TEST(DepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -277,11 +239,7 @@ TEST(DepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -308,11 +266,7 @@ TEST(DepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -339,11 +293,7 @@ TEST(DepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -372,11 +322,7 @@ TEST(DepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -403,11 +349,7 @@ TEST(DepthwiseConv2D, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -434,11 +376,7 @@ TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -465,11 +403,7 @@ TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -496,11 +430,7 @@ TEST(DepthwiseConv2D, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseFP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseFP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -528,11 +458,7 @@ TEST(DepthwiseConv2D, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -560,11 +486,7 @@ TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -592,11 +514,7 @@ TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -623,11 +541,7 @@ TEST(DepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -654,11 +568,7 @@ TEST(DepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -685,11 +595,7 @@ TEST(DepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DISABLED_TanhActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DISABLED_TanhActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -716,11 +622,7 @@ TEST(DepthwiseConv2D, DISABLED_TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DISABLED_SignBitActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DISABLED_SignBitActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -747,13 +649,11 @@ TEST(DepthwiseConv2D, DISABLED_SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, MultiThreading) {
+TEST_F(DepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -780,7 +680,7 @@ TEST(DepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, WeightsCache) {
+TEST_F(DepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -788,9 +688,7 @@ TEST(DepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -818,15 +716,13 @@ TEST(DepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, TransientIndirectionBuffer) {
+TEST_F(DepthwiseConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
index 59507269580cbd..52e8333db4fd04 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
@@ -19,22 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(DynamicallyQuantizedConv2D, 3x3) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+struct DynamicallyQuantizedConv2D : DelegateTest {};
 
+TEST_F(DynamicallyQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -56,15 +50,7 @@ TEST(DynamicallyQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, 3x3Stride2) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -88,15 +74,7 @@ TEST(DynamicallyQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, Grouped) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -122,15 +100,7 @@ TEST(DynamicallyQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -154,15 +124,7 @@ TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -186,14 +148,7 @@ TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+TEST_F(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -221,15 +176,7 @@ TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -257,15 +204,7 @@ TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -293,15 +232,7 @@ TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -329,15 +260,7 @@ TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -364,15 +287,7 @@ TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -399,15 +314,7 @@ TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, ReluActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -435,15 +342,7 @@ TEST(DynamicallyQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, Relu6Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -471,15 +370,7 @@ TEST(DynamicallyQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -507,15 +398,7 @@ TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, TanhActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, TanhActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -543,15 +426,7 @@ TEST(DynamicallyQuantizedConv2D, TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, SignBitActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, SignBitActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -579,15 +454,13 @@ TEST(DynamicallyQuantizedConv2D, SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, MultiThreading) {
+TEST_F(DynamicallyQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
   delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -615,7 +488,7 @@ TEST(DynamicallyQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, WeightsCache) {
+TEST_F(DynamicallyQuantizedConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -625,9 +498,7 @@ TEST(DynamicallyQuantizedConv2D, WeightsCache) {
   delegate_options.weights_cache = weights_cache.get();
   delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -656,16 +527,14 @@ TEST(DynamicallyQuantizedConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) {
+TEST_F(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
   xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
index 2f198a95195f11..2d2febcb21ab66 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
@@ -30,9 +30,10 @@ namespace xnnpack {
 
 // Dummy class to use with parameterized test.
 class DynamicallyQuantizedFullyConnectedTest
-    : public testing::TestWithParam<WeightsType> {};
+    : public testing::WithParamInterface<WeightsType>,
+      public DelegateTest {};
 
-int GenInputChannels(const std::function<int()> &rng,
+int GenInputChannels(const std::function<int()>& rng,
                      WeightsType weights_type) {
   switch (weights_type) {
     case WeightsType::kChannelWiseQuantizedInt8:
@@ -45,14 +46,6 @@ int GenInputChannels(const std::function<int()> &rng,
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -71,14 +64,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -99,14 +84,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -128,13 +105,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -156,14 +126,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -184,14 +146,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -214,14 +168,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -244,14 +190,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -275,14 +213,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -304,14 +234,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -333,14 +255,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -362,14 +276,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -393,13 +299,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
 TEST_P(DynamicallyQuantizedFullyConnectedTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+  UseCustomDelegate(delegate_options);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -429,9 +330,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
index de863e4f1e2125..4a40e56852b56c 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct DynamicallyQuantizedTransposeConvTest : DelegateTest {};
 
+TEST_F(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -51,10 +49,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+TEST_F(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -75,11 +70,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -100,11 +91,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -125,11 +112,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -153,10 +136,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -180,11 +160,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -212,11 +188,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -244,13 +216,11 @@ TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
+TEST_F(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -279,7 +249,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
+TEST_F(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -287,9 +257,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
index 3bdcd343373bac..abfd76c12a14f9 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
@@ -55,10 +55,12 @@ void DynamicallyQuantizedTransposeConvTester::Test(
   const Model* model = GetModel(buffer.data());
 
   std::unique_ptr<Interpreter> delegate_interpreter;
-  ASSERT_EQ(InterpreterBuilder(
-                model, ::tflite::ops::builtin::BuiltinOpResolverWithXNNPACK())(
-                &delegate_interpreter),
-            kTfLiteOk);
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &delegate_interpreter),
+      kTfLiteOk);
   std::unique_ptr<Interpreter> default_interpreter;
   ASSERT_EQ(
       InterpreterBuilder(
diff --git a/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h
new file mode 100644
index 00000000000000..29edbe5a35c841
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h
@@ -0,0 +1,112 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "experimental.h"  // from @XNNPACK
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/delegates/xnnpack/weight_cache.h"
+#include "tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite::xnnpack {
+
+struct TfLiteDelegateDeleter {
+  void operator()(TfLiteDelegate* delegate) {
+    TfLiteXNNPackDelegateDelete(delegate);
+  }
+};
+
+using TfLiteDelegatePtr =
+    std::unique_ptr<TfLiteDelegate, TfLiteDelegateDeleter>;
+
+struct DelegateTest : public virtual testing::Test {
+  void SetUp() override {
+    TfLiteXNNPackDelegateOptions delegate_options =
+        TfLiteXNNPackDelegateOptionsDefault();
+
+    // By default, we try to setup a file weight cache to also check fingerprint
+    // generation. If the test system doesn't support a file system, then the
+    // cache file will be invalid.
+    if (cache_file.IsValid()) {
+      xnn_clear_fingerprints();
+      delegate_options.weight_cache_file_path = cache_file.GetCPath();
+      delegate_options.weight_cache_file_descriptor =
+          cache_file.Duplicate().Release();
+      delegate_options.flags |=
+          TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+      check_for_cache_fingerprints = true;
+    }
+
+    xnnpack_delegate =
+        TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options));
+    ASSERT_THAT(xnnpack_delegate, testing::NotNull());
+  }
+
+  void TearDown() override {
+    if (check_for_cache_fingerprints) {
+      ASSERT_TRUE(cache_file.IsValid());
+      EXPECT_TRUE(IsCompatibleCacheFile(cache_file));
+      if (AlterXNNPackFingerprints()) {
+        EXPECT_FALSE(IsCompatibleCacheFile(cache_file));
+      }
+    }
+  }
+
+  // Artificially change fingerprint values.
+  //
+  // This allows us to check that changing a fingerprint value will make the
+  // cache file incompatible.
+  //
+  // Returns the current number of fingerprints.
+  int AlterXNNPackFingerprints() {
+    int i = 0;
+    int modified = 0;
+    for (const xnn_fingerprint* fingerprint = xnn_get_fingerprint_by_idx(i);
+         fingerprint != nullptr;
+         fingerprint = xnn_get_fingerprint_by_idx(++i)) {
+      xnn_fingerprint new_fingerprint = *fingerprint;
+      ++new_fingerprint.value;
+      xnn_set_fingerprint(new_fingerprint);
+      ++modified;
+    }
+    return modified;
+  }
+
+  // Replaces the xnnpack delegate with a custom one.
+  void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& delegate_options) {
+    check_for_cache_fingerprints = false;
+    xnnpack_delegate =
+        TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options));
+    ASSERT_THAT(xnnpack_delegate, testing::NotNull());
+  }
+
+  // Replaces the xnnpack delegate with one that sets up a file backed weight
+  // cache.
+  void UseDelegateWithFileWeightCache() {}
+
+  // The default delegate is created in a generic way.
+  TfLiteDelegatePtr xnnpack_delegate;
+  tflite::xnnpack::TempFileDesc cache_file;
+  bool check_for_cache_fingerprints = false;
+};
+
+}  // namespace tflite::xnnpack
+
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
index 92a6074c464f85..6701d0bc1c8f59 100644
--- a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(FullyConnected, 1D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct FullyConnectedTest : public DelegateTest {};
 
+TEST_F(FullyConnectedTest, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -45,11 +43,7 @@ TEST(FullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 1DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 1DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -65,11 +59,7 @@ TEST(FullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 2D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 2D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -87,11 +77,7 @@ TEST(FullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 2DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 2DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -110,11 +96,7 @@ TEST(FullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 3D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 3D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -133,11 +115,7 @@ TEST(FullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 3DReshape) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 3DReshape) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -156,11 +134,7 @@ TEST(FullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 3DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 3DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -180,11 +154,7 @@ TEST(FullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 4D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 4D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -204,11 +174,7 @@ TEST(FullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 4DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 4DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -229,11 +195,7 @@ TEST(FullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -252,11 +214,7 @@ TEST(FullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -275,11 +233,7 @@ TEST(FullyConnected, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, FP16WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, FP16WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -299,11 +253,7 @@ TEST(FullyConnected, FP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -322,11 +272,7 @@ TEST(FullyConnected, DynamicWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -346,11 +292,7 @@ TEST(FullyConnected, DynamicWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -369,11 +311,7 @@ TEST(FullyConnected, DynamicBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicWeightsAndBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicWeightsAndBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -393,11 +331,7 @@ TEST(FullyConnected, DynamicWeightsAndBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -416,11 +350,7 @@ TEST(FullyConnected, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -440,11 +370,7 @@ TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -463,11 +389,7 @@ TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -487,11 +409,7 @@ TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -510,11 +428,7 @@ TEST(FullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -533,11 +447,7 @@ TEST(FullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -556,13 +466,11 @@ TEST(FullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, MultiThreading) {
+TEST_F(FullyConnectedTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -581,7 +489,7 @@ TEST(FullyConnected, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, WeightsCache) {
+TEST_F(FullyConnectedTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -589,9 +497,7 @@ TEST(FullyConnected, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
index f67ba714b01cc8..06daba0d9bada7 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
@@ -21,17 +21,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedConv2D : DelegateTest {};
 
+TEST_F(SignedQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -59,11 +58,7 @@ TEST(SignedQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -91,11 +86,7 @@ TEST(SignedQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -125,11 +116,7 @@ TEST(SignedQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -163,11 +150,7 @@ TEST(SignedQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -197,11 +180,7 @@ TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -231,11 +210,7 @@ TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -269,11 +244,7 @@ TEST(SignedQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -307,11 +278,7 @@ TEST(SignedQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -345,11 +312,7 @@ TEST(SignedQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -383,11 +346,7 @@ TEST(SignedQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -421,11 +380,7 @@ TEST(SignedQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -459,11 +414,7 @@ TEST(SignedQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -497,13 +448,11 @@ TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, MultiThreading) {
+TEST_F(SignedQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -537,15 +486,13 @@ TEST(SignedQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(SignedQuantizedConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
index 3acfbaaf34778e..c409b18002ef51 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
@@ -20,18 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedDepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedDepthwiseConv2D : DelegateTest {};
 
+TEST_F(SignedQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -54,11 +52,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -82,11 +76,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -110,11 +100,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -140,11 +126,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -168,11 +150,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -198,11 +176,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -231,11 +205,7 @@ TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -264,11 +234,7 @@ TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -301,11 +267,7 @@ TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -338,11 +300,7 @@ TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -375,11 +333,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -412,11 +366,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -451,11 +401,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -488,11 +434,7 @@ TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -525,11 +467,7 @@ TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -562,13 +500,11 @@ TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) {
+TEST_F(SignedQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -601,7 +537,7 @@ TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
+TEST_F(SignedQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -609,9 +545,7 @@ TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -645,15 +579,13 @@ TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
index 3097d314a3a6ab..5a7a9dfd77b24e 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
@@ -21,17 +21,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedFullyConnected, 1D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedFullyConnected : DelegateTest {};
 
+TEST_F(SignedQuantizedFullyConnected, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -52,11 +51,7 @@ TEST(SignedQuantizedFullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 1DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 1DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -78,11 +73,7 @@ TEST(SignedQuantizedFullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 2D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 2D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -106,11 +97,7 @@ TEST(SignedQuantizedFullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 2DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 2DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -135,11 +122,7 @@ TEST(SignedQuantizedFullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 3D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 3D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -164,11 +147,7 @@ TEST(SignedQuantizedFullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 3DReshape) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 3DReshape) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -193,11 +172,7 @@ TEST(SignedQuantizedFullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 3DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 3DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -223,11 +198,7 @@ TEST(SignedQuantizedFullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 4D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 4D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -253,11 +224,7 @@ TEST(SignedQuantizedFullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 4DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 4DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -284,11 +251,7 @@ TEST(SignedQuantizedFullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -313,11 +276,7 @@ TEST(SignedQuantizedFullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -342,11 +301,7 @@ TEST(SignedQuantizedFullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -371,11 +326,7 @@ TEST(SignedQuantizedFullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -400,13 +351,11 @@ TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, MultiThreading) {
+TEST_F(SignedQuantizedFullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -431,7 +380,7 @@ TEST(SignedQuantizedFullyConnected, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, WeightsCache) {
+TEST_F(SignedQuantizedFullyConnected, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -439,9 +388,7 @@ TEST(SignedQuantizedFullyConnected, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
index 7daae13ebdea16..d4dceb9077ff26 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedTransposeConvTest : DelegateTest {};
 
+TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -52,11 +51,7 @@ TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -79,11 +74,7 @@ TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -105,11 +96,7 @@ TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -132,11 +119,7 @@ TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -158,11 +141,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -185,11 +164,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -211,11 +186,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -238,11 +209,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -267,11 +234,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -297,11 +260,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -326,11 +285,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -356,11 +311,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -389,11 +340,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -423,11 +370,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -456,11 +399,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -490,11 +429,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -524,11 +459,7 @@ TEST(SignedQuantizedTransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -559,13 +490,11 @@ TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, MultiThreading) {
+TEST_F(SignedQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -595,13 +524,11 @@ TEST(SignedQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
+TEST_F(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -632,7 +559,7 @@ TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, WeightsCache) {
+TEST_F(SignedQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -640,9 +567,7 @@ TEST(SignedQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
index 260fd87e282a63..d37317c34f545a 100644
--- a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
@@ -19,17 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(TransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct TransposeConvTest : DelegateTest {};
 
+TEST_F(TransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -50,11 +49,7 @@ TEST(TransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 2x2Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 2x2Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -76,11 +71,7 @@ TEST(TransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -101,11 +92,7 @@ TEST(TransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 3x3Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 3x3Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -127,11 +114,7 @@ TEST(TransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -152,11 +135,7 @@ TEST(TransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -178,11 +157,7 @@ TEST(TransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -203,11 +178,7 @@ TEST(TransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride4NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride4NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -229,11 +200,7 @@ TEST(TransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -257,11 +224,7 @@ TEST(TransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -286,11 +249,7 @@ TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -314,11 +273,7 @@ TEST(TransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -343,11 +298,7 @@ TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -375,11 +326,7 @@ TEST(TransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -408,11 +355,7 @@ TEST(TransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -440,11 +383,7 @@ TEST(TransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -473,11 +412,7 @@ TEST(TransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -506,11 +441,7 @@ TEST(TransposeConvTest, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, FP16WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, FP16WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -540,11 +471,7 @@ TEST(TransposeConvTest, FP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -573,11 +500,7 @@ TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -607,11 +530,7 @@ TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -640,11 +559,7 @@ TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -674,11 +589,7 @@ TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -707,11 +618,7 @@ TEST(TransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -741,11 +648,7 @@ TEST(TransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseFP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseFP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -775,11 +678,7 @@ TEST(TransposeConvTest, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseFP16WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseFP16WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -810,11 +709,7 @@ TEST(TransposeConvTest, SparseFP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -844,11 +739,7 @@ TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -879,11 +770,7 @@ TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -913,11 +800,7 @@ TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -948,13 +831,11 @@ TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, MultiThreading) {
+TEST_F(TransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -983,13 +864,11 @@ TEST(TransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, MultiThreadingNoBias) {
+TEST_F(TransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -1019,7 +898,7 @@ TEST(TransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, WeightsCache) {
+TEST_F(TransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -1027,9 +906,7 @@ TEST(TransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
index 6660fc5af75ebe..b8c9d48f4f05a2 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedConv2D : DelegateTest {};
 
+TEST_F(UnsignedQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -61,11 +60,7 @@ TEST(UnsignedQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -96,11 +91,7 @@ TEST(UnsignedQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -137,11 +128,7 @@ TEST(UnsignedQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -174,11 +161,7 @@ TEST(UnsignedQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -211,11 +194,7 @@ TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -248,11 +227,7 @@ TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -289,11 +264,7 @@ TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -330,11 +301,7 @@ TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -371,11 +338,7 @@ TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -412,11 +375,7 @@ TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -453,11 +412,7 @@ TEST(UnsignedQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -494,11 +449,7 @@ TEST(UnsignedQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -535,13 +486,11 @@ TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, MultiThreading) {
+TEST_F(UnsignedQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -578,15 +527,13 @@ TEST(UnsignedQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(UnsignedQuantizedConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
index 7facb9787338c7..a269343dafc512 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedDepthwiseConv2D : DelegateTest {};
 
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -56,11 +55,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -87,11 +82,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -118,11 +109,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -151,11 +138,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -182,11 +165,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -215,11 +194,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -251,11 +226,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -287,11 +258,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -327,11 +294,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -367,11 +330,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -407,11 +366,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -447,11 +402,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -489,11 +440,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -529,11 +476,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -569,11 +512,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -609,13 +548,11 @@ TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
+TEST_F(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -651,7 +588,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
+TEST_F(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -659,9 +596,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -698,15 +633,13 @@ TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
index 90df47c884d042..25aabd2a559413 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedFullyConnected, 1D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedFullyConnected : DelegateTest {};
 
+TEST_F(UnsignedQuantizedFullyConnected, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -54,11 +53,7 @@ TEST(UnsignedQuantizedFullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 1DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -83,11 +78,7 @@ TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 2D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 2D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -114,11 +105,7 @@ TEST(UnsignedQuantizedFullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 2DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -146,11 +133,7 @@ TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 3D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 3D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -178,11 +161,7 @@ TEST(UnsignedQuantizedFullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 3DReshape) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 3DReshape) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -210,11 +189,7 @@ TEST(UnsignedQuantizedFullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 3DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -243,11 +218,7 @@ TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 4D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 4D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -276,11 +247,7 @@ TEST(UnsignedQuantizedFullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 4DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -310,11 +277,7 @@ TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -342,11 +305,7 @@ TEST(UnsignedQuantizedFullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -374,11 +333,7 @@ TEST(UnsignedQuantizedFullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -406,11 +361,7 @@ TEST(UnsignedQuantizedFullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -438,13 +389,11 @@ TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, MultiThreading) {
+TEST_F(UnsignedQuantizedFullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
index 8e6a779a1979f9..5167d18443ac30 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
@@ -19,17 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedTransposeConvTest : DelegateTest {};
 
+TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -51,11 +50,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -78,11 +73,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -104,11 +95,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -131,11 +118,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -157,11 +140,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -184,11 +163,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -210,11 +185,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -237,11 +208,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -266,11 +233,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -296,11 +259,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -325,11 +284,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -355,11 +310,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -388,11 +339,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -422,11 +369,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -455,11 +398,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -489,11 +428,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -523,11 +458,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -558,13 +489,11 @@ TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) {
+TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -594,13 +523,11 @@ TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
+TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -631,7 +558,7 @@ TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, WeightsCache) {
+TEST_F(UnsignedQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -639,9 +566,7 @@ TEST(UnsignedQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index a8c86ff5a25529..9aaf497700f87f 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include <unistd.h>
 #endif
 
+#include <algorithm>
 #include <cerrno>  // IWYU pragma: keep
 #include <cinttypes>
 #include <cstddef>
@@ -37,6 +38,7 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "experimental.h"  // from @XNNPACK
 #include "xnnpack.h"  // from @XNNPACK
 #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
 #include "flatbuffers/verifier.h"  // from @flatbuffers
@@ -78,6 +80,20 @@ bool FileExists(const char* path) {
   return access(path, F_OK) != -1;
 }
 
+bool CheckFingerprints(const cache::schema::BufferList* buffer_list) {
+  if (buffer_list->fingerprints()) {
+    for (uint64_t cache_fingerprint : *buffer_list->fingerprints()) {
+      xnn_fingerprint fingerprint;
+      static_assert(sizeof(fingerprint) == sizeof(cache_fingerprint));
+      std::memcpy(&fingerprint, &cache_fingerprint, sizeof(fingerprint));
+      XNNPACK_RETURN_CHECK(
+          xnn_check_fingerprint(fingerprint) == xnn_status_success,
+          "fingerprint (id: 0x%x) could not be matched", fingerprint.id);
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
 #define XNN_MOVE_CONSTRUCT_MEMBER(x) x(std::move(other.x))
@@ -182,7 +198,8 @@ void* WeightCacheBuilder::Reserve(size_t size) {
 }
 
 BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id,
-                                          const void* data, uint64_t size) {
+                                          const void* data, uint64_t size,
+                                          int32_t fingerprint_id) {
   XNNPACK_ABORT_CHECK(is_build_step_,
                       "cannot append data to an unstarted builder.");
   // Add some padding so that the cache file can be mmaped and the buffer
@@ -201,6 +218,34 @@ BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id,
   buffer.size = loc.size;
   schema_.buffers.push_back(std::make_unique<cache::schema::BufferT>(buffer));
 
+  // Not passing a fingerprint id is a logic error on XNNPack's side. If we
+  // don't have a fingerprint for an operation, we have no way of ensuring that
+  // the generation of the cached data hasn't changed when reloading the cache.
+  //
+  // If we just log this and continue on with the work. This run will build a
+  // cache with cached data that can't be checked in the future. This will lead,
+  // in future runs that reuse the cache, to crashes that are impossible to
+  // debug or outputs that are nonsensical without any chance of linking this
+  // back to this error.
+  //
+  // We abort because we have no way of making that failure bubble up to the
+  // calling code to handle it gracefully...
+  XNNPACK_ABORT_CHECK(fingerprint_id != 0,
+                      "XNNPack weight cache: no fingerprint identifier was set "
+                      "when appending a buffer to the cache file.");
+  const xnn_fingerprint* fingerprint = xnn_get_fingerprint(fingerprint_id);
+  XNNPACK_ABORT_CHECK(fingerprint,
+                      "XNNPack weight cache: could not find a fingerprint with "
+                      "id 0x%x when appending a buffer to the cache file.",
+                      fingerprint_id);
+  uint64_t fingerprint_value;
+  static_assert(sizeof(fingerprint_value) == sizeof(*fingerprint));
+  std::memcpy(&fingerprint_value, fingerprint, sizeof(*fingerprint));
+  if (std::find(schema_.fingerprints.begin(), schema_.fingerprints.end(),
+                fingerprint_value) == schema_.fingerprints.end()) {
+    schema_.fingerprints.push_back(fingerprint_value);
+  }
+
   if (!fd_.Write(data, size)) {
     TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR,
                     "XNNPack weight cache: cannot append buffer to cache file");
@@ -233,16 +278,7 @@ bool WeightCacheBuilder::StopBuildStep() {
   XNNPACK_RETURN_CHECK(fd_.SetPos(layout_offset) != -1,
                        "could not move in the file: %s", strerror(errno));
 
-  XNNPACK_RETURN_CHECK(
-      sizeof(XNNPackCacheHeader::xnnpack_build_identifier) ==
-          xnn_experimental_get_build_identifier_size(),
-      "cache file ('%s') header cannot hold XNNPack's build identifier: %s.",
-      file_path_.c_str(), strerror(errno));
-
   XNNPackCacheHeader header{XNNPackCacheHeader::kVersion};
-  memcpy(header.xnnpack_build_identifier,
-         xnn_experimental_get_build_identifier_data(),
-         xnn_experimental_get_build_identifier_size());
   header.buffer_list_offset = fd_.GetPos();
   header.buffer_list_size = builder.GetSize();
 
@@ -405,12 +441,6 @@ bool MMapWeightCacheProvider::Load() {
                        ", expected %" PRIu64 ". Cache needs to be built again.",
                        header.version, XNNPackCacheHeader::kVersion);
 
-  XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
-                           header.xnnpack_build_identifier,
-                           sizeof(header.xnnpack_build_identifier)),
-                       "XNNPack weight cache: incompatible XNNPack version. "
-                       "Cache needs to be built again.");
-
   XNNPACK_RETURN_CHECK(header.buffer_list_offset < mmap_handle.size(),
                        "invalid offset for buffer list descriptor.");
 
@@ -430,6 +460,8 @@ bool MMapWeightCacheProvider::Load() {
   XNNPACK_RETURN_CHECK(buffer_list,
                        "could not get packed weights from flatbuffer.");
 
+  XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list));
+
   mmap_buffer_base_offset_ = buffer_list->base_offset();
   if (const auto buffers = buffer_list->buffers(); buffers) {
     for (auto* buffer : *buffers) {
@@ -584,7 +616,8 @@ size_t MMapWeightCacheProvider::LookUpOrInsert(
     return offset_it->second.offset;
   }
 
-  const BufferLocation location = builder_.Append(pack_id, ptr, size);
+  const BufferLocation location =
+      builder_.Append(pack_id, ptr, size, cache_key->fingerprint_id);
   XNNPACK_ABORT_CHECK(!location.IsInvalid(),
                       "Inserting data in the cache failed.");
   cache_key_to_offset_.emplace(pack_id, location);
@@ -693,10 +726,20 @@ bool IsCompatibleCacheFile(FileDescriptorView fd) {
                        "Cache header version is incompatible. Expected %" PRIu64
                        ", got %" PRIu64 ".",
                        XNNPackCacheHeader::kVersion, header.version);
-  XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
-                           header.xnnpack_build_identifier,
-                           sizeof(header.xnnpack_build_identifier)),
-                       "Cache header build identifier is different.");
+
+  fd.SetPos(header.buffer_list_offset);
+  auto buffer = std::make_unique<uint8_t[]>(header.buffer_list_size);
+  XNNPACK_RETURN_CHECK(fd.Read(buffer.get(), header.buffer_list_size));
+
+  flatbuffers::Verifier verifier(buffer.get(), header.buffer_list_size);
+  XNNPACK_RETURN_CHECK(cache::schema::VerifyBufferListBuffer(verifier),
+                       "buffer list validation failed.");
+
+  const cache::schema::BufferList* buffer_list =
+      cache::schema::GetBufferList(buffer.get());
+  XNNPACK_RETURN_CHECK(buffer_list,
+                       "could not get packed weights from flatbuffer.");
+  XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list));
   return true;
 }
 
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.h b/tensorflow/lite/delegates/xnnpack/weight_cache.h
index a7c8654df4f7ec..781422b4bec662 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.h
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.h
@@ -56,9 +56,8 @@ inline constexpr char kInMemoryCachePath[] = ":memory";
 // When reading a cache file, the cache should be rejected if `version`
 // doesn't match `kVersion`.
 struct XNNPackCacheHeader {
-  enum : uint64_t { kInvalidHeader = 0, kVersion = 1 };
+  enum : uint64_t { kInvalidHeader = 0, kVersion = 2 };
   uint64_t version;
-  uint8_t xnnpack_build_identifier[32];
   uint64_t buffer_list_offset;
   uint64_t buffer_list_size;
 };
@@ -161,8 +160,8 @@ class WeightCacheBuilder {
   // The buffer space must have been reserved before using `Reserve`. If not, a
   // new call to `Reserve` will be done and the data will be copied over.
   [[nodiscard /*The location to the appended data should be saved.*/]]
-  BufferLocation Append(PackIdentifier pack_id, const void* data,
-                        uint64_t size);
+  BufferLocation Append(PackIdentifier pack_id, const void* data, uint64_t size,
+                        int fingerprint_id);
 
   // Writes the flatbuffer to disk.
   [[nodiscard /*Writing the weight cache can fail.*/]]
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
index 33566b8be2208a..37f19612010709 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
@@ -32,11 +32,14 @@ table Buffer {
 }
 
 table BufferList {
+  /// A list of packing fingerprints. All of these need to be checked when
+  /// loading the cache to ensure that it is compatible.
+  fingerprints: [uint64];
   /// A list of buffers.
   buffers: [Buffer];
   /// Defines the base offset for the data in the file. That offset
   /// may be needed to guarantee data alignment.
-  base_offset:uint64;
+  base_offset: uint64;
 }
 
 root_type BufferList;
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
index dd3093b2736517..c1e4071ff4a353 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "experimental.h"  // from @XNNPACK
 #include "xnnpack.h"  // from @XNNPACK
 #include "flatbuffers/verifier.h"  // from @flatbuffers
 #include "tensorflow/lite/c/common.h"
@@ -56,7 +57,13 @@ namespace {
 
 using testing::ElementsAreArray;
 
-TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
+static xnn_fingerprint kDefaultFingerprint{/*id=*/0xf00d, /*value=*/0xb33f};
+
+struct WeightCacheBuilderTest : testing::Test {
+  void SetUp() override { xnn_set_fingerprint(kDefaultFingerprint); }
+};
+
+TEST_F(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   using std::size;
 
   const std::string payload = "This is some data in the file.";
@@ -72,7 +79,8 @@ TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   const size_t payload_size = size(payload);
   void* buffer = builder.Reserve(payload_size);
   std::memcpy(buffer, payload.c_str(), payload_size);
-  auto loc = builder.Append(dummy_id, buffer, payload_size);
+  auto loc =
+      builder.Append(dummy_id, buffer, payload_size, kDefaultFingerprint.id);
 
   EXPECT_EQ(loc.size, payload_size);
   EXPECT_GE(builder.capacity(), payload_size);
@@ -123,7 +131,7 @@ TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   EXPECT_THAT(cache_data, ElementsAreArray(payload));
 }
 
-TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
+TEST_F(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   using std::size;
 
   const std::string payload = "This is some data in the file.";
@@ -137,7 +145,8 @@ TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   ASSERT_TRUE(builder.StartBuildStep());
 
   const size_t payload_size = size(payload);
-  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size);
+  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size,
+                            kDefaultFingerprint.id);
 
   EXPECT_EQ(loc.size, payload_size);
 
@@ -186,7 +195,7 @@ TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   EXPECT_THAT(cache_data, ElementsAreArray(payload));
 }
 
-TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
+TEST_F(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   const std::string cache_path = testing::TempDir() + "/cache";
   const std::string payload = "This is some data in the file.";
   const PackIdentifier dummy_id{1, 2, 3};
@@ -198,7 +207,8 @@ TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   ASSERT_TRUE(builder.StartBuildStep());
 
   const size_t payload_size = size(payload);
-  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size);
+  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size,
+                            kDefaultFingerprint.id);
   EXPECT_EQ(loc.size, payload_size);
   ASSERT_TRUE(builder.StopBuildStep());
 
@@ -218,13 +228,13 @@ TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   EXPECT_FALSE(builder.StartBuildStep());
 }
 
-TEST(WeightCacheBuilderTest, InvalidFileDescriptorFails) {
+TEST_F(WeightCacheBuilderTest, InvalidFileDescriptorFails) {
   WeightCacheBuilder builder;
   EXPECT_FALSE(builder.Start("", FileDescriptor()));
   EXPECT_FALSE(builder.Start("/seldf/sedsft", FileDescriptor()));
 }
 
-TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
+TEST_F(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
   if (!TfLiteXNNPackDelegateCanUseInMemoryWeightCacheProvider()) {
     GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                     "isn't supported by the current system, skipping test.";
@@ -239,7 +249,7 @@ TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
   EXPECT_EQ(errno, ENOENT);
 }
 
-TEST(WeightCacheBuilderTest, MultipleStepBuild) {
+TEST_F(WeightCacheBuilderTest, MultipleStepBuild) {
   using std::size;
 
   const std::string payload1 = "This is some data in the file.";
@@ -262,7 +272,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload1);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload1.c_str(), payload_size);
-    const auto loc = builder.Append(dummy_id1, buffer, payload_size);
+    const auto loc =
+        builder.Append(dummy_id1, buffer, payload_size, kDefaultFingerprint.id);
     EXPECT_EQ(loc.size, payload_size);
     EXPECT_GE(builder.capacity(), payload_size);
   }
@@ -270,7 +281,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload3);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload3.c_str(), payload_size);
-    const auto loc = builder.Append(dummy_id3, buffer, payload_size);
+    const auto loc =
+        builder.Append(dummy_id3, buffer, payload_size, kDefaultFingerprint.id);
     (void)loc;
   }
 
@@ -284,7 +296,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload2);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload2.c_str(), payload_size);
-    const auto loc = builder.Append(dummy_id2, buffer, payload_size);
+    const auto loc =
+        builder.Append(dummy_id2, buffer, payload_size, kDefaultFingerprint.id);
     EXPECT_EQ(loc.size, payload_size);
     EXPECT_GE(builder.capacity(), payload_size);
   }
@@ -389,7 +402,8 @@ struct FakeContext {
                                           const int weights_index) const {
     return {.seed = algorithm_seed,
             .kernel = buffers[weights_index].data(),
-            .bias = nullptr};
+            .bias = nullptr,
+            .fingerprint_id = kDefaultFingerprint.id};
   }
 
   // Creates a look up key for the XNNPack weight provider C interface.
@@ -398,7 +412,8 @@ struct FakeContext {
                                           const int bias_index) const {
     return {.seed = algorithm_seed,
             .kernel = buffers[weights_index].data(),
-            .bias = buffers[bias_index].data()};
+            .bias = buffers[bias_index].data(),
+            .fingerprint_id = kDefaultFingerprint.id};
   }
 
   // Helps creating fake packed data.
@@ -505,6 +520,7 @@ struct BuildMMapWeightCacheProviderTest : testing::TestWithParam<TestVariant> {
       GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                       "isn't supported by the current system, skipping test.";
     }
+    xnn_set_fingerprint(kDefaultFingerprint);
     AddTensors();
     EndSetup();
   }
@@ -723,6 +739,7 @@ struct MMapWeightCacheProviderTest : testing::TestWithParam<TestVariant> {
       GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                       "isn't supported by the current system, skipping test.";
     }
+    xnn_set_fingerprint(kDefaultFingerprint);
   }
   bool use_explicit_fd = GetParam().use_explicit_fd;
   const char* const explicit_fd_path = GetParam().explicit_fd_path;
@@ -783,12 +800,14 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_1{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[0].data.data,
-        .bias = tensors[1].data.data};
+        .bias = tensors[1].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const xnn_weights_cache_look_up_key look_up_key_3{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[3].data.data,
-        .bias = tensors[4].data.data};
+        .bias = tensors[4].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     // Lookup non-packed tensor.
     ASSERT_EQ(cache->look_up(cache, &look_up_key_1), SIZE_MAX);
@@ -829,7 +848,8 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_2{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[2].data.data,
-        .bias = tensors[3].data.data};
+        .bias = tensors[3].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const size_t build_offset_2 = cache->look_up_or_insert(
         cache, &look_up_key_2, (void*)packed_data_ref_2,
@@ -904,17 +924,20 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_1{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[0].data.data,
-        .bias = tensors[1].data.data};
+        .bias = tensors[1].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const xnn_weights_cache_look_up_key look_up_key_2{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[2].data.data,
-        .bias = tensors[3].data.data};
+        .bias = tensors[3].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const xnn_weights_cache_look_up_key look_up_key_3{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[3].data.data,
-        .bias = tensors[4].data.data};
+        .bias = tensors[4].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     ASSERT_TRUE(cache->is_finalized(cache));
 
@@ -945,30 +968,59 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
   }
 }
 
-TEST_P(MMapWeightCacheProviderTest, XnnpackRebuildOnVersionMismatch) {
+TEST_P(MMapWeightCacheProviderTest, CacheIsRebuiltOnFingerprintMismatch) {
+  if (use_in_memory_cache) {
+    GTEST_SUCCEED() << "In-memory cache is never reloaded.";
+    return;
+  }
   TempFileDesc temp_fd;
   const char* temp_fd_cpath = explicit_fd_path;
-  FileDescriptor temp_fd_value = temp_fd.Duplicate();
 
-  {  // Set bad build identifier
-    XNNPackCacheHeader header{.version = XNNPackCacheHeader::kVersion};
-    header.xnnpack_build_identifier[0] += 1;
-    ASSERT_TRUE(temp_fd_value.Write(&header, sizeof(header)));
+  xnn_fingerprint test_fingeprint{0x7357, 0xF33D};
+  {  // Build a cache file with a specific fingerprint.
+    // Clear fingerprints and add a test fingerprint to XNNPack.
+    xnn_clear_fingerprints();
+    xnn_set_fingerprint(test_fingeprint);
+
+    // Build a cache file.
+    MMapWeightCacheProvider cache_provider;
+
+    const char kernel[] = "Fake data.";
+    TfLiteTensor tensor;
+    tensor.data.data = (void*)kernel;
+    cache_provider.MapTensorIdentifiers(
+        &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}});
+    ASSERT_TRUE(
+        cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate()));
+    ASSERT_TRUE(cache_provider.StartBuildStep());
+    const xnn_weights_cache_look_up_key look_up_key_1{
+        .seed = 1234,
+        .kernel = kernel,
+        .bias = nullptr,
+        .fingerprint_id = test_fingeprint.id};
+    xnn_weights_cache_t cache = &cache_provider.GetCacheProvider();
+    const size_t build_offset_1 = cache->look_up_or_insert(
+        cache, &look_up_key_1,
+        const_cast<void*>(reinterpret_cast<const void*>(kernel)),
+        sizeof(kernel));
+    (void)build_offset_1;
+    ASSERT_TRUE(cache_provider.StopBuildStep());
   }
 
   if (!use_explicit_fd) {
     temp_fd.Close();
     temp_fd_cpath = temp_fd.GetCPath();
-    temp_fd_value.Close();
-    if (use_in_memory_cache) {
-      temp_fd_cpath = kInMemoryCachePath;
-    }
   }
 
+  // Change the test fingerprint value.
+  test_fingeprint.value = 0xdeadb33f;
+  xnn_set_fingerprint(test_fingeprint);
+
+  // Reload the file.
   auto build_cache_provider = std::make_unique<MMapWeightCacheProvider>();
   MMapWeightCacheProvider& cache_provider = *build_cache_provider;
-  ASSERT_TRUE(cache_provider.LoadOrStartBuild(temp_fd_cpath,
-                                              temp_fd_value.Duplicate()));
+  ASSERT_TRUE(
+      cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate()));
   ASSERT_TRUE(cache_provider.StartBuildStep());
 }
 
@@ -980,29 +1032,53 @@ class IsCompatibleCacheFileTest
   using Param = IsCompatibleCacheFileTestOverload;
 
   void SetUp() override {
-    header_.version = XNNPackCacheHeader::kVersion;
-    memcpy(header_.xnnpack_build_identifier,
-           xnn_experimental_get_build_identifier_data(),
-           xnn_experimental_get_build_identifier_size());
+    xnn_clear_fingerprints();
+    xnn_set_fingerprint(kDefaultFingerprint);
+
+    // Build a cache file.
+    MMapWeightCacheProvider cache_provider;
+
+    const char kernel[] = "Fake data.";
+    TfLiteTensor tensor;
+    tensor.data.data = (void*)kernel;
+    cache_provider.MapTensorIdentifiers(
+        &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}});
+    ASSERT_TRUE(
+        cache_provider.LoadOrStartBuild(fd_.GetCPath(), fd_.Duplicate()));
+    ASSERT_TRUE(cache_provider.StartBuildStep());
+    const xnn_weights_cache_look_up_key look_up_key_1{
+        .seed = 1234,
+        .kernel = kernel,
+        .bias = nullptr,
+        .fingerprint_id = kDefaultFingerprint.id};
+    xnn_weights_cache_t cache = &cache_provider.GetCacheProvider();
+    const size_t build_offset_1 = cache->look_up_or_insert(
+        cache, &look_up_key_1,
+        const_cast<void*>(reinterpret_cast<const void*>(kernel)),
+        sizeof(kernel));
+    (void)build_offset_1;
+    ASSERT_TRUE(cache_provider.StopBuildStep());
   }
 
-  bool WriteHeaderAndReturnIsCompatibleCacheFile() {
-    if (!fd_.Write(&header_, sizeof(header_))) {
-      return false;
-    }
-    if (GetParam() == Param::kPath) {
-      fd_.Close();
-      return IsCompatibleCacheFile(fd_.GetCPath());
-    } else {
-      const FileDescriptor::Offset pos = fd_.GetPos();
-      EXPECT_NE(pos, 0);  // Ensure that we are testing with a non 0 position.
-      const bool compatible = IsCompatibleCacheFile(fd_);
-      EXPECT_EQ(pos, fd_.GetPos());
-      return compatible;
+  void ChangeRuntimeFingerprintValue() {
+    xnn_set_fingerprint(
+        {kDefaultFingerprint.id, kDefaultFingerprint.value + 1});
+  }
+
+  bool CallIsCompatibleCacheFile() {
+    switch (GetParam()) {
+      case Param::kPath:
+        fd_.Close();
+        return IsCompatibleCacheFile(fd_.GetCPath());
+      case Param::kDescriptor: {
+        const auto pos = fd_.GetPos();
+        EXPECT_NE(pos, 0);  // We test with a non zero position.
+        return IsCompatibleCacheFile(fd_);
+        EXPECT_EQ(fd_.GetPos(), pos);
+      }
     }
   }
 
-  XNNPackCacheHeader header_{};
   TempFileDesc fd_;
 };
 
@@ -1016,18 +1092,18 @@ std::string Name(
   }
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsTrueForACorrectHeader) {
-  EXPECT_TRUE(WriteHeaderAndReturnIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsTrueWhenFingerprintMatches) {
+  EXPECT_TRUE(CallIsCompatibleCacheFile());
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongHeaderVersion) {
-  header_.version += 1;
-  EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintMismatches) {
+  ChangeRuntimeFingerprintValue();
+  EXPECT_FALSE(CallIsCompatibleCacheFile());
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongBuildIdentifier) {
-  header_.xnnpack_build_identifier[0] += 1;
-  EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintIsNotFound) {
+  xnn_clear_fingerprints();
+  EXPECT_FALSE(CallIsCompatibleCacheFile());
 }
 
 INSTANTIATE_TEST_SUITE_P(

From f4a923fa821e2c25d9670ae33e443742f4fc1201 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Fri, 19 Dec 2025 17:05:56 -0800
Subject: [PATCH 605/753] Add a walk to convert unreduced `sdy.constant` to
 replicated `sdy.constant` and `sdy.replicated_to_unreduced`.

PiperOrigin-RevId: 846916791
---
 .../export_manual_reduction_collectives.cc    | 20 ++++++++++++
 ...o_export_manual_reduction_collectives.mlir | 31 +++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
index edb5c3b54457df..876cc3de21b4f8 100644
--- a/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
+++ b/third_party/xla/xla/service/spmd/shardy/stablehlo_round_trip/export_manual_reduction_collectives.cc
@@ -437,6 +437,26 @@ class StablehloExportManualReductionCollectivesPass
     ModuleOp moduleOp = getOperation();
     mlir::IRRewriter rewriter(moduleOp.getContext());
 
+    moduleOp.walk([&](mlir::Operation* op) {
+      if (auto constant = mlir::dyn_cast<sdy::ConstantOp>(op)) {
+        TensorShardingAttr oldSharding = sdy::getSharding(constant);
+        if (!oldSharding || oldSharding.getUnreducedAxes().empty()) {
+          return;
+        }
+
+        TensorShardingAttr newSharding = oldSharding.replaceUnreducedAxes({});
+        sdy::setSharding(constant, newSharding);
+
+        rewriter.setInsertionPointAfter(constant);
+        sdy::ReplicatedToUnreducedOp replicatedToUnreduced =
+            sdy::ReplicatedToUnreducedOp::create(
+                rewriter, constant.getLoc(), constant,
+                oldSharding.getUnreducedAxes(), oldSharding);
+        rewriter.replaceAllUsesExcept(constant, replicatedToUnreduced,
+                                      replicatedToUnreduced);
+      }
+    });
+
     // Do very restricted backward propagation of unreduced axes along specific
     // ops that don't modify the data.
     moduleOp.walk<mlir::WalkOrder::PostOrder, mlir::ReverseIterator>(
diff --git a/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir b/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
index be4e2a5243a255..6b3b04dda62b93 100644
--- a/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
+++ b/third_party/xla/xla/service/spmd/shardy/test/stablehlo_export_manual_reduction_collectives.mlir
@@ -406,3 +406,34 @@ func.func @replicated_to_unreduced(%arg0: tensor<16x16xf32> {sdy.sharding = #sdy
   %0 = sdy.replicated_to_unreduced {"x", "z"} %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<16x16xf32>
   return %0 : tensor<16x16xf32>
 }
+
+// -----
+
+sdy.mesh @mesh = <["x"=2, "y"=2]>
+
+// CHECK-LABEL: func @unreduced_constant
+func.func @unreduced_constant() -> tensor<2x2xf32> {
+  // CHECK-NEXT: %[[CONST:.*]] = sdy.constant {sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{"x"}, {}]>]>} dense<{{\[\[}}0.000000e+00, 1.000000e+00], [2.000000e+00, 3.000000e+00]]> : tensor<2x2xf32>
+  // CHECK-NEXT: %[[MANUAL_COMP:.*]] = sdy.manual_computation(%[[CONST]])
+  // CHECK-SAME:     in_shardings=[<@mesh, [{"x"}, {}]>]
+  // CHECK-SAME:     out_shardings=[<@mesh, [{"x"}, {}], unreduced={"y"}>]
+  // CHECK-SAME:     manual_axes={"x", "y"} (%arg0: tensor<1x2xf32>) {
+  // CHECK-NEXT:   %[[PID:.*]] = stablehlo.partition_id : tensor<ui32>
+  // CHECK-NEXT:   %[[PID_I32:.*]] = stablehlo.convert %[[PID]] : (tensor<ui32>) -> tensor<i32>
+  // CHECK-NEXT:   %[[C2:.*]] = stablehlo.constant dense<2> : tensor<i32>
+  // CHECK-NEXT:   %[[REM:.*]] = stablehlo.remainder %[[PID_I32]], %[[C2]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV:.*]] = stablehlo.divide %[[PID_I32]], %[[C2]] : tensor<i32>
+  // CHECK-NEXT:   %[[C2_0:.*]] = stablehlo.constant dense<2> : tensor<i32>
+  // CHECK-NEXT:   %[[REM_0:.*]] = stablehlo.remainder %[[DIV]], %[[C2_0]] : tensor<i32>
+  // CHECK-NEXT:   %[[DIV_0:.*]] = stablehlo.divide %[[DIV]], %[[C2_0]] : tensor<i32>
+  // CHECK-NEXT:   %[[C0:.*]] = stablehlo.constant dense<0> : tensor<i32>
+  // CHECK-NEXT:   %[[CMP:.*]] = stablehlo.compare  EQ, %[[REM]], %[[C0]] : (tensor<i32>, tensor<i32>) -> tensor<i1>
+  // CHECK-NEXT:   %[[ZERO:.*]] = stablehlo.constant dense<0.000000e+00> : tensor<f32>
+  // CHECK-NEXT:   %[[ZERO_BCAST:.*]] = stablehlo.broadcast %[[ZERO]], sizes = [1, 2] : (tensor<f32>) -> tensor<1x2xf32>
+  // CHECK-NEXT:   %[[SELECT:.*]] = stablehlo.select %[[CMP]], %arg0, %[[ZERO_BCAST]] : tensor<i1>, tensor<1x2xf32>
+  // CHECK-NEXT:   sdy.return %[[SELECT]] : tensor<1x2xf32>
+  // CHECK-NEXT: } : (tensor<2x2xf32>) -> tensor<2x2xf32>
+  // CHECK-NEXT: return %[[MANUAL_COMP]] : tensor<2x2xf32>
+  %0 = sdy.constant {sdy.sharding = #sdy.sharding_per_value<[<@mesh, [{"x"}, {}], unreduced={"y"}>]>} dense<[[0.0, 1.0], [2.0, 3.0]]> : tensor<2x2xf32>
+  return %0 : tensor<2x2xf32>
+}

From 9d833374f9caee892d2e915623bbf67fca7c4b26 Mon Sep 17 00:00:00 2001
From: Parker Schuh <parkers@google.com>
Date: Fri, 19 Dec 2025 17:06:23 -0800
Subject: [PATCH 606/753] Move mutability tracking from Tracked buffers to
 RawBuffers as this is only used for importing foreign memory and clutters the
 API.

PiperOrigin-RevId: 846916941
---
 .../xla/xla/pjrt/common_pjrt_client.cc        | 41 ++++++++-----------
 third_party/xla/xla/pjrt/common_pjrt_client.h |  5 +--
 third_party/xla/xla/pjrt/cpu/cpu_client.cc    | 34 ++++++++-------
 third_party/xla/xla/pjrt/cpu/cpu_client.h     |  6 +--
 third_party/xla/xla/pjrt/cpu/raw_buffer.cc    |  9 ++--
 third_party/xla/xla/pjrt/cpu/raw_buffer.h     | 12 ++++--
 .../xla/pjrt/cpu/tracked_cpu_device_buffer.cc |  3 +-
 .../xla/pjrt/cpu/tracked_cpu_device_buffer.h  |  7 +---
 .../cpu/tracked_cpu_device_buffer_test.cc     | 10 ++---
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  6 +--
 .../xla/pjrt/gpu/se_gpu_pjrt_client_test.cc   |  3 +-
 .../pjrt/host_to_device_transfer_manager.cc   |  3 +-
 .../xla/pjrt/pjrt_stream_executor_client.cc   |  3 +-
 .../xla/pjrt/pjrt_stream_executor_client.h    |  3 +-
 14 files changed, 68 insertions(+), 77 deletions(-)

diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.cc b/third_party/xla/xla/pjrt/common_pjrt_client.cc
index 63ff52c1c3ba63..1eca83fcb8a3d8 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.cc
@@ -158,8 +158,7 @@ CommonPjRtClient::BufferFromHostLiteral(const LiteralSlice& literal,
                     HostBufferSemantics::kImmutableUntilTransferCompletes,
                     raw_buffer));
   return DefineBuffer(device_shape, memory_space, std::move(raw_buffer),
-                      {std::move(definition_event)},
-                      /*raw_buffer_is_mutable=*/true);
+                      {std::move(definition_event)});
 }
 
 absl::StatusOr<std::unique_ptr<PjRtBuffer>>
@@ -193,8 +192,7 @@ CommonPjRtClient::CreateUninitializedBuffer(const Shape& shape,
                       raw_buffer->MakeAllocationReadyEvent());
   TF_ASSIGN_OR_RETURN(auto output_buffer,
                       DefineBuffer(device_shape, memory_space, raw_buffer,
-                                   {std::move(definition_event)},
-                                   /*raw_buffer_is_mutable=*/true));
+                                   {std::move(definition_event)}));
   return output_buffer;
 }
 
@@ -270,8 +268,7 @@ CommonPjRtClient::CreateAliasBuffer(const Shape& shape,
 
   TF_ASSIGN_OR_RETURN(auto result_buffer,
                       DefineBuffer(shape, memory_space, std::move(raw_buffer),
-                                   {std::move(definition_event)},
-                                   /*raw_buffer_is_mutable=*/true));
+                                   {std::move(definition_event)}));
 
   return std::make_pair(std::move(result_buffer), std::move(fulfill_cb));
 }
@@ -302,14 +299,14 @@ CommonPjRtClient::BufferFromHostBuffer(
           ImportForeignMemory(
               const_cast<void*>(data),  // CONST_CAST_OK=flag controlled.
               std::move(on_done_with_host_buffer), on_device_bytes_count,
-              memory_space));
+              memory_space,
+              host_buffer_semantics ==
+                  PjRtClient::HostBufferSemantics::kMutableZeroCopy));
       TF_ASSIGN_OR_RETURN(
           auto output_buffer,
           DefineBuffer(
               device_shape, memory_space, raw_buffer,
-              absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>{},
-              /*raw_buffer_is_mutable=*/host_buffer_semantics ==
-                  PjRtClient::HostBufferSemantics::kMutableZeroCopy));
+              absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>{}));
       return output_buffer;
     }
   }
@@ -327,8 +324,7 @@ CommonPjRtClient::BufferFromHostBuffer(
           std::move(on_done_with_host_buffer), device_shape, raw_buffer));
   TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtBuffer> output_buffer,
                       DefineBuffer(device_shape, memory_space, raw_buffer,
-                                   {std::move(definition_event)},
-                                   /*raw_buffer_is_mutable=*/true));
+                                   {std::move(definition_event)}));
   return output_buffer;
 }
 
@@ -351,12 +347,13 @@ CommonPjRtClient::CreateViewOfDeviceBuffer(
   TF_ASSIGN_OR_RETURN(
       auto raw_buffer,
       ImportForeignMemory(device_ptr, std::move(on_delete_callback),
-                          on_device_bytes_count, memory_space));
+                          on_device_bytes_count, memory_space,
+                          /*is_mutable=*/false));
   TF_ASSIGN_OR_RETURN(
       auto output_buffer,
-      DefineBuffer(device_shape, memory_space, raw_buffer,
-                   absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>{},
-                   /*raw_buffer_is_mutable=*/false));
+      DefineBuffer(
+          device_shape, memory_space, raw_buffer,
+          absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>{}));
   return output_buffer;
 }
 
@@ -705,9 +702,9 @@ static std::unique_ptr<PjRtBuffer> CreateOutputLeafBuffer(
     CHECK(memory_space) << "No memory space found for device: "
                         << device->DebugString() << " kind: " << kind_id;
   }
-  auto buffer_or = client->DefineBuffer(
-      output_leaf_shape, memory_space, std::move(leaf_buffer),
-      {definition_event}, /*raw_buffer_is_mutable=*/true);
+  auto buffer_or =
+      client->DefineBuffer(output_leaf_shape, memory_space,
+                           std::move(leaf_buffer), {definition_event});
   CHECK_OK(buffer_or);
   return *std::move(buffer_or);
 }
@@ -1154,8 +1151,7 @@ CommonPjRtBufferImpl::CopyToCpuMemorySpace(const xla::Shape& dst_shape,
   TF_ASSIGN_OR_RETURN(
       auto buffer,
       dst_client->DefineBuffer(dst_shape, dst_memory_space, dst_raw_buffer,
-                               {std::move(definition_event)},
-                               /*raw_buffer_is_mutable=*/true));
+                               {std::move(definition_event)}));
   auto* base_ptr = dst_raw_buffer->GetHostPointer();
   std::unique_ptr<MutableLiteralBase> literal;
   bool needs_second_copy = false;
@@ -1265,8 +1261,7 @@ static absl::Status CommonCopyToMemorySpace(
     TF_ASSIGN_OR_RETURN(
         dst_buffer,
         dst_client->DefineBuffer(dst_shape, dst_memory_space, dst_raw_buffer,
-                                 {std::move(definition_event)},
-                                 /*raw_buffer_is_mutable=*/true));
+                                 {std::move(definition_event)}));
     TF_RETURN_IF_ERROR(src_buffer->AcquireScopedRawBuffer(
         [&](tsl::RCReference<CommonPjRtRawBuffer> buf_raw_buffer,
             std::vector<tsl::RCReference<tsl::AsyncValue>>
diff --git a/third_party/xla/xla/pjrt/common_pjrt_client.h b/third_party/xla/xla/pjrt/common_pjrt_client.h
index 7557aa785500d6..d03af18392ec5d 100644
--- a/third_party/xla/xla/pjrt/common_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/common_pjrt_client.h
@@ -87,7 +87,7 @@ class CommonPjRtClient : public PjRtClient {
   ImportForeignMemory(void* device_ptr,
                       absl::AnyInvocable<void() &&> on_delete_callback,
                       size_t on_device_bytes_count,
-                      PjRtMemorySpace* memory_space) {
+                      PjRtMemorySpace* memory_space, bool is_mutable) {
     return absl::UnimplementedError("ImportForeignMemory is not supported");
   }
 
@@ -105,8 +105,7 @@ class CommonPjRtClient : public PjRtClient {
       const Shape& on_device_shape, PjRtMemorySpace* memory_space,
       tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
       absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>
-          definition_device_events,
-      bool raw_buffer_is_mutable) {
+          definition_device_events) {
     return absl::UnimplementedError("DefineBuffer is not supported");
   }
 
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.cc b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
index f0a3fa1a083eb5..3ad33cd715e612 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.cc
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.cc
@@ -888,10 +888,11 @@ static bool IsAlignedData(void* ptr) {
 absl::StatusOr<tsl::RCReference<CommonPjRtRawBuffer>>
 PjRtCpuClient::ImportForeignMemory(
     void* device_ptr, absl::AnyInvocable<void() &&> on_delete_callback,
-    size_t on_device_bytes_count, PjRtMemorySpace* memory_space) {
-  return CpuRawBuffer::ImportForeignMemory(device_ptr,
-                                           std::move(on_delete_callback),
-                                           on_device_bytes_count, memory_space);
+    size_t on_device_bytes_count, PjRtMemorySpace* memory_space,
+    bool is_mutable) {
+  return CpuRawBuffer::ImportForeignMemory(
+      device_ptr, std::move(on_delete_callback), on_device_bytes_count,
+      memory_space, is_mutable);
 }
 
 absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCpuClient::CreateErrorBuffer(
@@ -910,7 +911,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCpuClient::CreateErrorBuffer(
   return std::make_unique<CommonPjRtBufferImpl>(
       shape,
       std::make_unique<TrackedCpuDeviceBuffer>(
-          /*owns_buffers=*/true, std::move(raw_buffer),
+          std::move(raw_buffer),
           tsl::AsyncValueRef<CpuEvent>(
               tsl::MakeErrorAsyncValueRef(std::move(error)))),
       memory_space);
@@ -995,8 +996,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCpuClient::DefineBuffer(
     const Shape& on_device_shape, PjRtMemorySpace* memory_space,
     tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
     absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>
-        definition_device_events,
-    bool raw_buffer_is_mutable) {
+        definition_device_events) {
   if (raw_buffer && raw_buffer->memory_space() != memory_space) {
     return absl::InvalidArgumentError(
         absl::StrFormat("DefineBuffer: Mismatch in memory spaces: %s vs %s",
@@ -1006,7 +1006,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> PjRtCpuClient::DefineBuffer(
   return std::unique_ptr<PjRtBuffer>(std::make_unique<CommonPjRtBufferImpl>(
       on_device_shape,
       std::make_unique<TrackedCpuDeviceBuffer>(
-          /*owns_buffers=*/raw_buffer_is_mutable, std::move(raw_buffer),
+          std::move(raw_buffer),
           CpuTrackedDeviceEvent::AfterAll(definition_device_events)),
       memory_space));
 }
@@ -1029,7 +1029,7 @@ PjRtCpuClient::CreateRawBufferChannel(PjRtMemorySpace* memory_space,
   auto buffer_promise = tsl::MakeIndirectAsyncValue();
   auto raw_buffer = tsl::MakeRef<CpuRawBuffer>(
       memory_space, tsl::AsyncValueRef<CpuDeviceMemory>(buffer_promise),
-      on_device_bytes_count);
+      on_device_bytes_count, /*is_mutable=*/true);
 
   auto buffer_promise_cb =
       [buffer_promise = std::move(buffer_promise), memory_space](
@@ -1250,7 +1250,9 @@ static absl::StatusOr<BufferInfo> MemoryForAllocation(
     // If we don't own the buffer, we can't overwrite it or donate it. For
     // example we might be pointing to a buffer owned by the client whose
     // lifetime will not extend past the lifetime of the donated input buffer.
-    if ((!can_donate || (arg && !arg->owns_buffers())) &&
+    if ((!can_donate ||
+         (arg && !tensorflow::down_cast<CpuRawBuffer*>(arg->raw_buffer().get())
+                      ->is_mutable())) &&
         !allocation.is_readonly()) {
       auto copy = CpuDeviceMemory::CreateDelayedMemory();
 
@@ -1265,7 +1267,9 @@ static absl::StatusOr<BufferInfo> MemoryForAllocation(
     }
 
     buffer_info.buffer = out.CopyRef();
-    buffer_info.owns_buffer = !arg || arg->owns_buffers();
+    buffer_info.owns_buffer =
+        !arg || tensorflow::down_cast<CpuRawBuffer*>(arg->raw_buffer().get())
+                    ->is_mutable();
     buffer_info.buffer_size = buffer_size;
     return buffer_info;
 
@@ -1856,10 +1860,10 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
       // Program execution writes to output buffers so it's a definition event.
       auto leaf_tracked_device_buffer =
           std::make_unique<TrackedCpuDeviceBuffer>(
-              result_buffers_info[i].owns_buffer,
               tsl::MakeRef<CpuRawBuffer>(
                   memory_space, std::move(result_buffers_info[i].buffer),
-                  result_buffers_info[i].buffer_size),
+                  result_buffers_info[i].buffer_size,
+                  result_buffers_info[i].owns_buffer),
               execute_event.CopyRef());
       auto leaf_buffer = std::make_unique<CommonPjRtBufferImpl>(
           result_shape.tuple_shapes(i), std::move(leaf_tracked_device_buffer),
@@ -1870,10 +1874,10 @@ absl::StatusOr<PjRtLoadedExecutable::Result> PjRtCpuExecutable::ExecuteHelper(
     CHECK_EQ(result_buffers_info.size(), 1);
     // Program execution writes to output buffers so it's a definition event.
     auto tracked_device_buffer = std::make_unique<TrackedCpuDeviceBuffer>(
-        result_buffers_info[0].owns_buffer,
         tsl::MakeRef<CpuRawBuffer>(memory_space,
                                    std::move(result_buffers_info[0].buffer),
-                                   result_buffers_info[0].buffer_size),
+                                   result_buffers_info[0].buffer_size,
+                                   result_buffers_info[0].owns_buffer),
         /*definition_event=*/execute_event);
     auto output_buffer = std::make_unique<CommonPjRtBufferImpl>(
         result_shape, std::move(tracked_device_buffer), memory_space);
diff --git a/third_party/xla/xla/pjrt/cpu/cpu_client.h b/third_party/xla/xla/pjrt/cpu/cpu_client.h
index 548cac91cb8096..813834bfcfe504 100644
--- a/third_party/xla/xla/pjrt/cpu/cpu_client.h
+++ b/third_party/xla/xla/pjrt/cpu/cpu_client.h
@@ -174,7 +174,8 @@ class PjRtCpuClient final : public CommonPjRtClient {
 
   absl::StatusOr<tsl::RCReference<CommonPjRtRawBuffer>> ImportForeignMemory(
       void* device_ptr, absl::AnyInvocable<void() &&> on_delete_callback,
-      size_t on_device_bytes_count, PjRtMemorySpace* memory_space) override;
+      size_t on_device_bytes_count, PjRtMemorySpace* memory_space,
+      bool is_mutable) override;
 
   tsl::thread::ThreadPool* pjrt_client_thread_pool() const {
     return pjrt_client_thread_pool_.get();
@@ -234,8 +235,7 @@ class PjRtCpuClient final : public CommonPjRtClient {
       const Shape& on_device_shape, PjRtMemorySpace* memory_space,
       tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
       absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>
-          definition_device_events,
-      bool raw_buffer_is_mutable) override;
+          definition_device_events) override;
 
   absl::StatusOr<int64_t> GetOnDeviceBytesCount(
       PjRtMemorySpace* memory_space, const xla::Shape& shape) const override;
diff --git a/third_party/xla/xla/pjrt/cpu/raw_buffer.cc b/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
index bf4a5bf2dc4e8b..57963004deb68f 100644
--- a/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/raw_buffer.cc
@@ -130,14 +130,15 @@ CpuRawBuffer::Allocate(PjRtMemorySpace* memory_space, size_t size_bytes,
                        const CpuDeviceMemory::Allocator& allocator) {
   TF_ASSIGN_OR_RETURN(auto memory,
                       CpuDeviceMemory::Allocate(size_bytes, allocator));
-  return tsl::MakeRef<CpuRawBuffer>(memory_space, std::move(memory),
-                                    size_bytes);
+  return tsl::MakeRef<CpuRawBuffer>(memory_space, std::move(memory), size_bytes,
+                                    /*is_mutable=*/true);
 }
 
 /*static*/ absl::StatusOr<tsl::RCReference<CpuRawBuffer>>
 CpuRawBuffer::ImportForeignMemory(
     void* data, absl::AnyInvocable<void() &&> on_delete_callback,
-    size_t on_device_bytes_count, PjRtMemorySpace* memory_space) {
+    size_t on_device_bytes_count, PjRtMemorySpace* memory_space,
+    bool is_mutable) {
   if ((absl::bit_cast<std::uintptr_t>(data) & (cpu::MinAlign() - 1)) != 0) {
     return InvalidArgument(
         "Can't create a view of buffer with unaligned data, ptr: %#x is not "
@@ -148,7 +149,7 @@ CpuRawBuffer::ImportForeignMemory(
       memory_space,
       CpuDeviceMemory::CreateForeignMemory(data, on_device_bytes_count,
                                            std::move(on_delete_callback)),
-      on_device_bytes_count);
+      on_device_bytes_count, is_mutable);
 }
 
 size_t CpuRawBuffer::GetOnDeviceSizeInBytes() const { return buffer_size_; }
diff --git a/third_party/xla/xla/pjrt/cpu/raw_buffer.h b/third_party/xla/xla/pjrt/cpu/raw_buffer.h
index e7b1dd3a2013f7..03227be74217fa 100644
--- a/third_party/xla/xla/pjrt/cpu/raw_buffer.h
+++ b/third_party/xla/xla/pjrt/cpu/raw_buffer.h
@@ -95,10 +95,12 @@ class CpuTrackedDeviceEvent : public PjRtDeviceEvent {
 class CpuRawBuffer : public CommonPjRtRawBuffer {
  public:
   CpuRawBuffer(PjRtMemorySpace* memory_space,
-               tsl::AsyncValueRef<CpuDeviceMemory> buffer, size_t buffer_size)
+               tsl::AsyncValueRef<CpuDeviceMemory> buffer, size_t buffer_size,
+               bool is_mutable)
       : memory_space_(memory_space),
         buffer_(std::move(buffer)),
-        buffer_size_(buffer_size) {}
+        buffer_size_(buffer_size),
+        is_mutable_(is_mutable) {}
 
   absl::Status ValidateSlice(int64_t offset, int64_t slice_size);
 
@@ -111,7 +113,8 @@ class CpuRawBuffer : public CommonPjRtRawBuffer {
   // Imports foreign memory.
   static absl::StatusOr<tsl::RCReference<CpuRawBuffer>> ImportForeignMemory(
       void* data, absl::AnyInvocable<void() &&> on_delete_callback,
-      size_t on_device_bytes_count, PjRtMemorySpace* memory_space);
+      size_t on_device_bytes_count, PjRtMemorySpace* memory_space,
+      bool is_mutable);
 
   size_t GetOnDeviceSizeInBytes() const override;
 
@@ -129,6 +132,8 @@ class CpuRawBuffer : public CommonPjRtRawBuffer {
 
   PjRtMemorySpace* memory_space() const override { return memory_space_; }
 
+  bool is_mutable() const { return is_mutable_; }
+
   absl::StatusOr<tsl::RCReference<PjRtDeviceEvent>>
   CopyRawHostToDeviceAndReturnEvent(const void* src, int64_t offset,
                                     int64_t transfer_size) override;
@@ -175,6 +180,7 @@ class CpuRawBuffer : public CommonPjRtRawBuffer {
   PjRtMemorySpace* const memory_space_;
   tsl::AsyncValueRef<CpuDeviceMemory> buffer_;
   size_t buffer_size_;
+  bool is_mutable_;
 };
 
 absl::StatusOr<xla::Shape> MakeDefaultCpuBufferShape(xla::Shape shape,
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
index 6b1868725c8422..a172969dccc501 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.cc
@@ -189,10 +189,9 @@ absl::Status CpuDeviceMemory::AllocateInto(
 //===----------------------------------------------------------------------===//
 
 TrackedCpuDeviceBuffer::TrackedCpuDeviceBuffer(
-    bool owns_buffers, tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
+    tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
     tsl::AsyncValueRef<CpuEvent> definition_event)
     : AbstractTrackedDeviceBuffer(std::move(raw_buffer)),
-      owns_buffers_(owns_buffers),
       definition_event_(std::move(definition_event)) {
   DCHECK(definition_event_);
 }
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h
index 907e7045595e1e..a1ca122405e24b 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer.h
@@ -141,8 +141,7 @@ class CpuDeviceMemory {
 class TrackedCpuDeviceBuffer : public AbstractTrackedDeviceBuffer {
  public:
   // Variant with single definition event.
-  TrackedCpuDeviceBuffer(bool owns_buffers,
-                         tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
+  TrackedCpuDeviceBuffer(tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
                          tsl::AsyncValueRef<CpuEvent> definition_event);
 
   TrackedCpuDeviceBuffer(TrackedCpuDeviceBuffer&&) noexcept = default;
@@ -170,8 +169,6 @@ class TrackedCpuDeviceBuffer : public AbstractTrackedDeviceBuffer {
   absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4>
   LockUseAndTransferUsageEvents();
 
-  bool owns_buffers() const { return owns_buffers_; }
-
   std::vector<tsl::RCReference<tsl::AsyncValue>> GetAsyncValueDefinitionEvents()
       override;
 
@@ -190,8 +187,6 @@ class TrackedCpuDeviceBuffer : public AbstractTrackedDeviceBuffer {
  private:
   void ConfirmDonation() override;
 
-  bool owns_buffers_;
-
   // The definition event are associated with CPU operations that write to the
   // buffers.
   tsl::AsyncValueRef<CpuEvent> definition_event_;
diff --git a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc
index 3986adc0abf7a2..7579fef3c0ee3d 100644
--- a/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc
+++ b/third_party/xla/xla/pjrt/cpu/tracked_cpu_device_buffer_test.cc
@@ -57,8 +57,7 @@ TEST(TrackedCpuDeviceBufferTest, Basic) {
     definition_event.SetStateConcrete();
   });
 
-  TrackedCpuDeviceBuffer tracked_buffer(
-      /*owns_buffers=*/true, buffer, definition_event);
+  TrackedCpuDeviceBuffer tracked_buffer(buffer, definition_event);
 
   BlockUntilReady(tracked_buffer.definition_event().GetAsyncValue());
 
@@ -85,8 +84,7 @@ TEST(TrackedCpuDeviceBufferTest, BasicError) {
         Internal("tracked_cpu_device_buffer_test error."));
   });
 
-  TrackedCpuDeviceBuffer tracked_buffer(
-      /*owns_buffers=*/true, buffer, definition_event);
+  TrackedCpuDeviceBuffer tracked_buffer(buffer, definition_event);
 
   BlockUntilReady(tracked_buffer.definition_event().GetAsyncValue());
 
@@ -108,8 +106,8 @@ TEST(TrackedCpuDeviceBufferTest, DelayedAllocation) {
 
   auto definition_event = MakeConstructedAsyncValueRef<CpuEvent>();
   TrackedCpuDeviceBuffer tracked_buffer(
-      /*owns_buffers=*/true,
-      tsl::MakeRef<CpuRawBuffer>(memory_space, buffer, expected.size()),
+      tsl::MakeRef<CpuRawBuffer>(memory_space, buffer, expected.size(),
+                                 /*is_mutable=*/true),
       definition_event);
   auto result = tracked_buffer.buffer();
   ASSERT_FALSE(result.IsAvailable());
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index 172806fbcb2128..d072e94c364fdd 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -611,8 +611,7 @@ absl::StatusOr<PreparedReceive> PrepareReceive(
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<PjRtBuffer> buffer,
                       client->DefineBuffer(on_device_shape, memory_space,
-                                           raw_buffer, {definition_event},
-                                           /*raw_buffer_is_mutable=*/true));
+                                           raw_buffer, {definition_event}));
   definition_event->AndThen([raw_buffer]() {});
 
   return PreparedReceive(client, std::move(clique_key), std::move(buffer),
@@ -917,8 +916,7 @@ StreamExecutorGpuClient::PrepareReceiveBuffer(PjRtDevice* device, Shape shape) {
       auto buffer,
       DefineBuffer(
           on_device_shape, memory_space, raw_buffer,
-          {tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(definition_event)},
-          /*raw_buffer_is_mutable=*/true));
+          {tsl::MakeRef<PjRtStreamExecutorDeviceEvent>(definition_event)}));
 
   return PrepareReceiveBufferResult{std::move(buffer), std::move(raw_buffer),
                                     local_device, stream,
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
index 787f43b0691a21..a75c9f3900af80 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -2875,8 +2875,7 @@ TEST(StreamExecutorGpuClientTest, LinkedEventPromise) {
                           client->CreateLinkedEventPromise(memory_space, ""));
   TF_ASSERT_OK_AND_ASSIGN(
       auto buffer, client->DefineBuffer(device_shape, memory_space, raw_buffer,
-                                        {std::move(event)},
-                                        /*raw_buffer_is_mutable=*/true));
+                                        {std::move(event)}));
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto definition_event,
diff --git a/third_party/xla/xla/pjrt/host_to_device_transfer_manager.cc b/third_party/xla/xla/pjrt/host_to_device_transfer_manager.cc
index b01c428036110d..aab104323a24a6 100644
--- a/third_party/xla/xla/pjrt/host_to_device_transfer_manager.cc
+++ b/third_party/xla/xla/pjrt/host_to_device_transfer_manager.cc
@@ -154,8 +154,7 @@ class CommonAsyncHostToDeviceTransferManager
       TF_ASSIGN_OR_RETURN(
           auto buffer,
           client->DefineBuffer(device_shape, memory_space, raw_buffer,
-                               {std::move(definition_event)},
-                               /*raw_buffer_is_mutable=*/true));
+                               {std::move(definition_event)}));
       device_shapes.push_back(std::move(device_shape));
       buffers.push_back(std::move(buffer));
       undispatched_buffer_refs.push_back(raw_buffer);
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
index 6b000d5bcd0599..3a28a3e0441676 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.cc
@@ -512,8 +512,7 @@ PjRtStreamExecutorClient::DefineBuffer(
     const Shape& on_device_shape, PjRtMemorySpace* memory_space,
     tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
     absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>
-        definition_device_events,
-    bool raw_buffer_is_mutable) {
+        definition_device_events) {
   if (raw_buffer && raw_buffer->memory_space() != memory_space) {
     return absl::InvalidArgumentError(
         absl::StrFormat("DefineBuffer: Mismatch in memory spaces: %s vs %s",
diff --git a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
index 40125ea7ff1b2e..a67ee1895cbf65 100644
--- a/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
+++ b/third_party/xla/xla/pjrt/pjrt_stream_executor_client.h
@@ -396,8 +396,7 @@ class PjRtStreamExecutorClient : public CommonPjRtClient {
       const Shape& on_device_shape, PjRtMemorySpace* memory_space,
       tsl::RCReference<CommonPjRtRawBuffer> raw_buffer,
       absl::InlinedVector<tsl::RCReference<PjRtDeviceEvent>, 4>
-          definition_device_events,
-      bool raw_buffer_is_mutable) override;
+          definition_device_events) override;
 
   absl::StatusOr<std::pair<tsl::RCReference<CommonPjRtRawBuffer>,
                            PjRtFulfillAliasRawBufferCallback>>

From 8f784b11c3dc5106065bc9b9089eb80f004ea747 Mon Sep 17 00:00:00 2001
From: Jian Cai <jiancai@google.com>
Date: Fri, 19 Dec 2025 17:55:17 -0800
Subject: [PATCH 607/753] [XLA][Numerics][HLO Value Tracking] Support HLO
 original value in CopyFusion pass

This updates the HLO orignal value of a fusion accordingly if its shape is updated in the pass.

PiperOrigin-RevId: 846927675
---
 .../xla/service/gpu/transforms/copy_fusion.cc |  9 +++++++
 .../gpu/transforms/copy_fusion_test.cc        | 27 +++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/transforms/copy_fusion.cc b/third_party/xla/xla/service/gpu/transforms/copy_fusion.cc
index 434a33236b76d4..ce9bf49de7ae16 100644
--- a/third_party/xla/xla/service/gpu/transforms/copy_fusion.cc
+++ b/third_party/xla/xla/service/gpu/transforms/copy_fusion.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "xla/codegen/ir_emission_utils.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
+#include "xla/hlo/ir/hlo_original_value.h"
 #include "xla/hlo/utils/hlo_traversal.h"
 #include "xla/service/call_graph.h"
 #include "xla/service/gpu/gpu_fusible.h"
@@ -176,6 +177,14 @@ absl::StatusOr<bool> CopyFusion::DoCopyFusion(
         HloInstruction::CreateTuple(tuple_elements));
     fused_computation->set_root_instruction(new_root,
                                             /*accept_different_shape=*/true);
+    // Creates a new original value for the fusion instruction and the new root
+    // of the fused computation.
+    if (hlo->original_value() != nullptr) {
+      std::shared_ptr<xla::OriginalValue> new_original_value =
+          xla::OriginalValue::CreateFromInstruction(new_root);
+      new_root->set_original_value(new_original_value);
+      hlo->set_original_value(new_original_value);
+    }
     *hlo->mutable_shape() = new_root->shape();
     for (HloInstruction* caller :
          call_graph->GetComputationCallers(fused_computation)) {
diff --git a/third_party/xla/xla/service/gpu/transforms/copy_fusion_test.cc b/third_party/xla/xla/service/gpu/transforms/copy_fusion_test.cc
index 1bf5e2237dbbca..fc38dc4920b9bf 100644
--- a/third_party/xla/xla/service/gpu/transforms/copy_fusion_test.cc
+++ b/third_party/xla/xla/service/gpu/transforms/copy_fusion_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "xla/service/gpu/transforms/copy_fusion.h"
 
 #include <cstdint>
+#include <memory>
 #include <vector>
 
 #include <gmock/gmock.h>
@@ -649,5 +650,31 @@ TEST_F(CopyFusionTest, CopyFusionWithMoreThanMaxCopies) {
   EXPECT_FALSE(CreateFusionWithNumCopies(max_copies));
 }
 
+TEST_F(CopyFusionTest, PropagateOriginalValue) {
+  ASSERT_OK_AND_ASSIGN(
+      auto module, ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"(
+    fused_computation {
+      two = f32[] constant(2.0)
+      broadcast = f32[16,32]{1,0} broadcast(two), dimensions={}
+      s.1 = f32[16,32]{1,0} sqrt(broadcast)
+      ROOT c.1 = f32[32,16]{1,0} transpose(s.1), dimensions={1,0}, origin={{"transpose"}}
+    }
+
+    ENTRY main {
+      fusion = f32[32,16]{1,0} fusion(), kind=kInput, calls=fused_computation, origin={{"transpose"}}
+      copy.1 = f32[32,16]{1,0} copy(fusion)
+      copy.2 = f32[32,16]{1,0} copy(fusion)
+      ROOT t = (f32[32,16]{1,0}, f32[32,16]{1,0}) tuple(copy.2, copy.1)
+    })")));
+
+  ASSERT_OK_AND_ASSIGN(auto changed, cf_.Run(module.get()));
+  ASSERT_TRUE(changed);
+  const HloInstruction* fusion =
+      *module->entry_computation()->instructions().begin();
+
+  EXPECT_TRUE(fusion != nullptr);
+  EXPECT_EQ(fusion->original_value()->ToString(), R"(({"transpose"}, {}, {}))");
+}
+
 }  // namespace gpu
 }  // namespace xla

From 4baf6a3d215f91e4a58c3fa0cb1fd890248bebbb Mon Sep 17 00:00:00 2001
From: Yash Katariya <yashkatariya@google.com>
Date: Fri, 19 Dec 2025 18:06:03 -0800
Subject: [PATCH 608/753] Add replicated -> unreduced test coverage

PiperOrigin-RevId: 846930629
---
 third_party/xla/xla/python/version.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/third_party/xla/xla/python/version.h b/third_party/xla/xla/python/version.h
index 96eadc39b1eeb2..a192d8d98efcc5 100644
--- a/third_party/xla/xla/python/version.h
+++ b/third_party/xla/xla/python/version.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 // An increasing version number to protect jax code against breaking changes.
 // In JAX, reference this via jax._src.lib.ifrt_version.
-#define JAX_IFRT_VERSION_NUMBER \
-  45  // Refresh custom layouts when copying an array across clients.
+#define JAX_IFRT_VERSION_NUMBER 46  // Shardy replicated -> unreduced
 
 #endif  // XLA_PYTHON_VERSION_H_

From 269ed39897b61083b8e87160a5ddac7b68669eec Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 20:57:56 -0800
Subject: [PATCH 609/753] Automated Code Change

PiperOrigin-RevId: 846974498
---
 tensorflow/core/summary/BUILD                       | 4 ++++
 tensorflow/core/summary/loader.cc                   | 2 ++
 tensorflow/core/summary/schema.cc                   | 1 +
 tensorflow/core/summary/summary_converter.cc        | 2 ++
 tensorflow/core/summary/summary_db_writer.cc        | 3 +++
 tensorflow/core/summary/summary_db_writer_test.cc   | 2 ++
 tensorflow/core/summary/summary_file_writer.cc      | 1 +
 tensorflow/core/summary/summary_file_writer_test.cc | 2 ++
 tensorflow/core/summary/vacuum.cc                   | 1 +
 9 files changed, 18 insertions(+)

diff --git a/tensorflow/core/summary/BUILD b/tensorflow/core/summary/BUILD
index 7b0981742dd5b4..8af924c1b40dfe 100644
--- a/tensorflow/core/summary/BUILD
+++ b/tensorflow/core/summary/BUILD
@@ -24,6 +24,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/lib/db:sqlite",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -55,6 +56,7 @@ cc_library(
         "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
         "@local_xla//xla/tsl/protobuf:error_codes_proto_impl_cc",
         "@local_xla//xla/tsl/protobuf:histogram_proto_cc",
     ],
@@ -74,6 +76,7 @@ tf_cc_test(
         "//tensorflow/core/lib/db:sqlite",
         "@com_google_absl//absl/log",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_xla//xla/tsl/protobuf:histogram_proto_cc",
     ],
 )
@@ -128,6 +131,7 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/lib/png:png_io",
         "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/core/summary/loader.cc b/tensorflow/core/summary/loader.cc
index 1443cffc4c6e6a..08e4ea469b106b 100644
--- a/tensorflow/core/summary/loader.cc
+++ b/tensorflow/core/summary/loader.cc
@@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
+#include <cstdint>
 #include <iostream>
 #include <memory>
+#include <string>
 #include <type_traits>
 #include <utility>
 #include <vector>
diff --git a/tensorflow/core/summary/schema.cc b/tensorflow/core/summary/schema.cc
index 2cd421afc59bff..3ba5db4037e419 100644
--- a/tensorflow/core/summary/schema.cc
+++ b/tensorflow/core/summary/schema.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/summary/schema.h"
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/summary/summary_converter.cc b/tensorflow/core/summary/summary_converter.cc
index a5e3695e420103..449f851c74669f 100644
--- a/tensorflow/core/summary/summary_converter.cc
+++ b/tensorflow/core/summary/summary_converter.cc
@@ -21,8 +21,10 @@ limitations under the License.
 #include <cstdlib>
 #include <functional>
 #include <limits>
+#include <string>
 
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/summary.pb.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/core/summary/summary_db_writer.cc b/tensorflow/core/summary/summary_db_writer.cc
index 849fc9a6954c7e..2cc0a6b36a1863 100644
--- a/tensorflow/core/summary/summary_db_writer.cc
+++ b/tensorflow/core/summary/summary_db_writer.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <deque>
 #include <limits>
 #include <memory>
+#include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
@@ -28,6 +29,8 @@ limitations under the License.
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/numbers.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "xla/tsl/protobuf/error_codes.pb.h"
 #include "xla/tsl/protobuf/histogram.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
diff --git a/tensorflow/core/summary/summary_db_writer_test.cc b/tensorflow/core/summary/summary_db_writer_test.cc
index 8c25da1823f057..b65349e935aa15 100644
--- a/tensorflow/core/summary/summary_db_writer_test.cc
+++ b/tensorflow/core/summary/summary_db_writer_test.cc
@@ -17,10 +17,12 @@ limitations under the License.
 #include <cstdint>
 #include <limits>
 #include <memory>
+#include <string>
 #include <utility>
 
 #include "absl/log/log.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
 #include "xla/tsl/protobuf/histogram.pb.h"
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
diff --git a/tensorflow/core/summary/summary_file_writer.cc b/tensorflow/core/summary/summary_file_writer.cc
index dfb1bba4aecbe5..a77641f7e912e5 100644
--- a/tensorflow/core/summary/summary_file_writer.cc
+++ b/tensorflow/core/summary/summary_file_writer.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <atomic>
 #include <cstdint>
 #include <memory>
+#include <string>
 #include <utility>
 #include <vector>
 
diff --git a/tensorflow/core/summary/summary_file_writer_test.cc b/tensorflow/core/summary/summary_file_writer_test.cc
index 94ca029774f40d..c0ef770435f05c 100644
--- a/tensorflow/core/summary/summary_file_writer_test.cc
+++ b/tensorflow/core/summary/summary_file_writer_test.cc
@@ -15,9 +15,11 @@ limitations under the License.
 #include "tensorflow/core/summary/summary_file_writer.h"
 
 #include <algorithm>
+#include <cstdint>
 #include <functional>
 #include <memory>
 #include <set>
+#include <string>
 #include <utility>
 #include <vector>
 
diff --git a/tensorflow/core/summary/vacuum.cc b/tensorflow/core/summary/vacuum.cc
index 29c459cca89f13..7db3633b4c21c0 100644
--- a/tensorflow/core/summary/vacuum.cc
+++ b/tensorflow/core/summary/vacuum.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <iostream>
+#include <string>
 
 #include "absl/log/log.h"
 #include "tensorflow/core/lib/db/sqlite.h"

From 2c61085107355be636389a00c56bd8c76cd7be2e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 19 Dec 2025 21:54:05 -0800
Subject: [PATCH 610/753] Automated Code Change

PiperOrigin-RevId: 846987657
---
 tensorflow/core/config/BUILD    | 6 +++++-
 tensorflow/core/config/flags.cc | 1 +
 tensorflow/core/config/flags.h  | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/config/BUILD b/tensorflow/core/config/BUILD
index f41dc9f2d94a79..52217b6f7891a9 100644
--- a/tensorflow/core/config/BUILD
+++ b/tensorflow/core/config/BUILD
@@ -21,7 +21,10 @@ cc_library(
         "flags.h",
     ],
     visibility = ["//tensorflow:internal"],
-    deps = ["//tensorflow/core/platform:stringpiece"],
+    deps = [
+        "//tensorflow/core/platform:stringpiece",
+        "@com_google_absl//absl/strings:string_view",
+    ],
 )
 
 filegroup(
@@ -63,6 +66,7 @@ cc_library(
         "//tensorflow/core/platform:stringpiece",
         "//tensorflow/core/util:env_var",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:string_view",
     ],
 )
 
diff --git a/tensorflow/core/config/flags.cc b/tensorflow/core/config/flags.cc
index d2d1ea502dfe9e..faf53293eb82d2 100644
--- a/tensorflow/core/config/flags.cc
+++ b/tensorflow/core/config/flags.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/config/flags.h"
 
 #include "absl/strings/ascii.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/stringpiece.h"
 #include "tensorflow/core/util/env_var.h"
 
diff --git a/tensorflow/core/config/flags.h b/tensorflow/core/config/flags.h
index c882cd3939f4af..df4379e6ddb4b9 100644
--- a/tensorflow/core/config/flags.h
+++ b/tensorflow/core/config/flags.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_CONFIG_FLAGS_H_
 #define TENSORFLOW_CORE_CONFIG_FLAGS_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/stringpiece.h"
 
 namespace tensorflow {

From 9f75c9a8e30225184a7ba3f555759f4190e803b7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 01:04:11 -0800
Subject: [PATCH 611/753] Update GraphDef version to 2447.

PiperOrigin-RevId: 847036679
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 198b5a6e175699..ec50d4b11648d9 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2446  // Updated: 2025/12/19
+#define TF_GRAPH_DEF_VERSION 2447  // Updated: 2025/12/20
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 39ada7f15caf0e9cc7084fc759cc7860fe38a9b9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 01:04:12 -0800
Subject: [PATCH 612/753] compat: Update forward compatibility horizon to
 2025-12-20

PiperOrigin-RevId: 847036681
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index a7fa8eab2a0c08..d99963a6b2858b 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 19)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 20)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From f654c43ac211f0cc391d78ba35351311c12a97c2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 03:06:54 -0800
Subject: [PATCH 613/753] Automated Code Change

PiperOrigin-RevId: 847065080
---
 .../core/common_runtime/gpu/gpu_bfc_allocator_test.cc     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
index 8fd3dc450c98a6..80ba5156327af4 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc
@@ -434,7 +434,7 @@ class GPUBFCAllocatorPrivateMethodsTest
 
     std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins> bin_infos;
     {
-      absl::MutexLock l(&a.mutex_);
+      absl::MutexLock l(a.mutex_);
       bin_infos = a.get_bin_debug_info();
     }
 
@@ -486,7 +486,7 @@ class GPUBFCAllocatorPrivateMethodsTest
       initial_ptrs[i] = nullptr;
     }
     {
-      absl::MutexLock l(&a.mutex_);
+      absl::MutexLock l(a.mutex_);
       bin_infos = a.get_bin_debug_info();
     }
     for (int i = 0; i < BFCAllocator::kNumBins; i++) {
@@ -610,7 +610,7 @@ class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
     }
 
     {
-      absl::MutexLock l(&a.mutex_);
+      absl::MutexLock l(a.mutex_);
       // Make sure there are more than 1 regions in preparation for the test.
       EXPECT_LT(1, a.region_manager_.regions().size());
     }
@@ -623,7 +623,7 @@ class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific
     // Deallocate free regions and there shall be only one region left.
     EXPECT_EQ(true, a.DeallocateFreeRegions(/*rounded_bytes=*/0));
     {
-      absl::MutexLock l(&a.mutex_);
+      absl::MutexLock l(a.mutex_);
       EXPECT_EQ(1, a.region_manager_.regions().size());
     }
 

From cd19aba91fd8bdedbd055bbfb6fbee4efd392bfe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 03:39:28 -0800
Subject: [PATCH 614/753] Automated Code Change

PiperOrigin-RevId: 847070976
---
 tensorflow/core/kernels/matmul_op_fused.cc    | 24 +++---
 tensorflow/core/kernels/matmul_op_impl.h      | 42 +++++-----
 tensorflow/core/kernels/matmul_util.cc        | 45 ++++++-----
 tensorflow/core/kernels/matmul_util.h         | 14 ++--
 .../core/kernels/maxpooling_op_gpu.cu.cc      | 34 ++++----
 tensorflow/core/kernels/maxpooling_op_gpu.h   |  4 +-
 .../core/kernels/multinomial_op_gpu.cu.cc     |  6 +-
 tensorflow/core/kernels/nccl_ops.cc           | 18 ++---
 tensorflow/core/kernels/one_hot_op_test.cc    |  4 +-
 tensorflow/core/kernels/pack_op.cc            |  4 +-
 ...arameterized_truncated_normal_op_gpu.cu.cc | 28 +++----
 .../core/kernels/pooling_ops_common_gpu.h     |  8 +-
 .../kernels/population_count_op_gpu.cu.cc     | 20 ++---
 tensorflow/core/kernels/queue_base.cc         |  9 ++-
 tensorflow/core/kernels/ragged_cross_op.cc    | 38 ++++-----
 .../core/kernels/ragged_gather_op_test.cc     | 26 +++---
 tensorflow/core/kernels/random_op_gpu.h       | 56 ++++++-------
 tensorflow/core/kernels/relu_op_gpu.cu.cc     | 38 ++++-----
 .../core/kernels/reshape_util_gpu.cu.cc       | 22 ++---
 tensorflow/core/kernels/restore_v2_op_test.cc | 34 ++++----
 tensorflow/core/kernels/roll_op_gpu.cu.cc     | 39 ++++-----
 tensorflow/core/kernels/scan_ops.cc           |  6 +-
 .../core/kernels/scatter_functor_gpu.cu.h     |  2 +-
 tensorflow/core/kernels/scatter_nd_op.cc      | 22 ++---
 .../core/kernels/scatter_nd_op_gpu.cu.cc      |  4 +-
 .../core/kernels/searchsorted_op_gpu.cu.cc    | 30 +++----
 .../kernels/segment_reduction_ops_gpu.cu.h    | 80 ++++++++++---------
 .../kernels/spacetobatch_functor_gpu.cu.cc    | 48 +++++------
 .../core/kernels/spacetodepth_op_gpu.cu.cc    |  6 +-
 29 files changed, 364 insertions(+), 347 deletions(-)

diff --git a/tensorflow/core/kernels/matmul_op_fused.cc b/tensorflow/core/kernels/matmul_op_fused.cc
index 4e6a8d5266608d..343eba3db82f97 100644
--- a/tensorflow/core/kernels/matmul_op_fused.cc
+++ b/tensorflow/core/kernels/matmul_op_fused.cc
@@ -199,7 +199,7 @@ struct LaunchFusedMatMulOp<CPUDevice, T> {
 namespace {
 
 #if GOOGLE_CUDA || TF_HIPBLASLT
-StatusOr<se::gpu::BlasLt::Epilogue> GetBlasLtEpilogOp(
+absl::StatusOr<stream_executor::gpu::BlasLt::Epilogue> GetBlasLtEpilogOp(
     FusedComputationType fusion) {
   if (fusion == FusedComputationType::kBiasAdd) {
     return se::gpu::BlasLt::Epilogue::kBias;
@@ -235,7 +235,7 @@ se::blas::AlgorithmConfig AutotuneMatmul(
       // scratch space is deallocated between runs.
       BlasScratchAllocator scratch_allocator(context);
 
-      Status cublaslt_launch =
+      absl::Status cublaslt_launch =
           launch_func(scratch_allocator, i, &profile_result);
 
       VLOG(4) << "  Autotune algorithm " << i
@@ -265,7 +265,7 @@ se::blas::AlgorithmConfig AutotuneMatmul(
 #endif
 
 template <typename LaunchFunc, typename Sig>
-StatusOr<std::vector<xla::AutotuneResult>> AutotuneMatMulImpl(
+absl::StatusOr<std::vector<xla::AutotuneResult>> AutotuneMatMulImpl(
     OpKernelContext* ctx,
     std::vector<std::unique_ptr<const se::dnn::OpRunner<Sig>>>& runners,
     bool actually_do_autotune, const LaunchFunc& launch_func,
@@ -292,10 +292,10 @@ StatusOr<std::vector<xla::AutotuneResult>> AutotuneMatMulImpl(
 
     TF_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc());
     se::dnn::ProfileResult profile_result;
-    Status cudnn_launch_status =
+    absl::Status cudnn_launch_status =
         actually_do_autotune
             ? launch_func(allocator_used, runner, &profile_result)
-            : OkStatus();
+            : absl::OkStatus();
     if (!actually_do_autotune) {
       // Make the result valid according to `is_valid`.
       profile_result.set_algorithm(desc);
@@ -329,7 +329,7 @@ StatusOr<std::vector<xla::AutotuneResult>> AutotuneMatMulImpl(
 }
 
 struct FusedMatmulAutotuneGroup {
-  static string name() { return "FusedMatmul"; }
+  static std::string name() { return "FusedMatmul"; }
 };
 
 typedef AutotuneSingleton<FusedMatmulAutotuneGroup, MatmulParameters,
@@ -337,7 +337,8 @@ typedef AutotuneSingleton<FusedMatmulAutotuneGroup, MatmulParameters,
     FusedMatmulAutotuneMap;
 
 template <typename T>
-StatusOr<AutotuneEntry<se::dnn::FusedMatmulOp>> AutotuneFusedMatmul(
+absl::StatusOr<AutotuneEntry<stream_executor::dnn::FusedMatmulOp>>
+AutotuneFusedMatmul(
     bool cudnn_use_autotune,
     AutotuneMap<MatmulParameters, AutotuneEntry<se::dnn::FusedMatmulOp>>*
         autotune_map,
@@ -350,7 +351,7 @@ StatusOr<AutotuneEntry<se::dnn::FusedMatmulOp>> AutotuneFusedMatmul(
   AutotuneEntry<se::dnn::FusedMatmulOp> autotune_entry;
   auto* stream = ctx->op_device_context()->stream();
   if (!autotune_map->Find(params, &autotune_entry)) {
-    profiler::ScopedAnnotation trace("cudnn_autotuning");
+    tsl::profiler::ScopedAnnotation trace("cudnn_autotuning");
 
     se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                 stream);
@@ -371,7 +372,7 @@ StatusOr<AutotuneEntry<se::dnn::FusedMatmulOp>> AutotuneFusedMatmul(
     auto launch_func =
         [&](se::ScratchAllocator* allocator_used,
             const std::unique_ptr<const se::dnn::FusedMatmulRunner>& runner,
-            se::dnn::ProfileResult* profile_result) -> Status {
+            se::dnn::ProfileResult* profile_result) -> absl::Status {
       TF_ASSIGN_OR_RETURN(auto scratch, allocator_used->AllocateBytes(
                                             runner->GetWorkspaceSize()));
       return (*runner)(stream, profile_result, scratch, a_ptr, b_ptr, bias_ptr,
@@ -562,8 +563,9 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
       auto runner_and_scratch = std::move(runner_and_scratch_or).value();
       auto& runner =
           *std::get<const se::dnn::FusedMatmulRunner*>(runner_and_scratch);
-      Status cudnn_launch_status = runner(
-          stream, nullptr, std::get<se::DeviceMemoryBase>(runner_and_scratch),
+      absl::Status cudnn_launch_status = runner(
+          stream, nullptr,
+          std::get<stream_executor::DeviceAddressBase>(runner_and_scratch),
           a_ptr, b_ptr, bias_ptr, c_ptr);
       OP_REQUIRES_OK(context, cudnn_launch_status);
       return;
diff --git a/tensorflow/core/kernels/matmul_op_impl.h b/tensorflow/core/kernels/matmul_op_impl.h
index f4991bc1fe252a..628e6d8dabceb2 100644
--- a/tensorflow/core/kernels/matmul_op_impl.h
+++ b/tensorflow/core/kernels/matmul_op_impl.h
@@ -477,7 +477,7 @@ struct LaunchBatchMatMul<CPUDevice, Scalar> {
 namespace {
 // A dummy type to group matmul autotune results together.
 struct BlasLtMatmulAutoTuneGroup {
-  static string name() { return "MatmulLt"; }
+  static std::string name() { return "MatmulLt"; }
 };
 
 typedef AutotuneSingleton<BlasLtMatmulAutoTuneGroup, BlasLtMatmulPlanParams,
@@ -493,7 +493,7 @@ typedef AutotuneSingleton<BlasLtMatmulAutoTuneGroup, BlasLtMatmulPlanParams,
 class BlasScratchAllocator : public se::ScratchAllocator {
  public:
   using Stream = se::Stream;
-  using DeviceMemoryBytes = se::DeviceMemory<uint8>;
+  using DeviceMemoryBytes = stream_executor::DeviceAddress<uint8>;
 
   BlasScratchAllocator(OpKernelContext* context)
       : memory_limit_(0), total_byte_size_(0), context_(context) {}
@@ -503,21 +503,22 @@ class BlasScratchAllocator : public se::ScratchAllocator {
 
   int64_t GetMemoryLimitInBytes() override { return memory_limit_; }
 
-  tsl::StatusOr<DeviceMemoryBytes> AllocateBytes(int64_t byte_size) override {
+  absl::StatusOr<BlasScratchAllocator::DeviceMemoryBytes> AllocateBytes(
+      int64_t byte_size) override {
     Tensor temporary_memory;
 
     if (memory_limit_ > 0 && byte_size > memory_limit_) {
-      return tsl::Status{
+      return absl::Status{
           absl::StatusCode::kUnavailable,
           absl::StrCat("Requested memory size (", byte_size,
                        ") exceeds the memory limit (", memory_limit_, ").")};
     }
     AllocationAttributes allocation_attr;
     allocation_attr.retry_on_failure = false;
-    Status allocation_status(context_->allocate_temp(
+    absl::Status allocation_status(context_->allocate_temp(
         DT_UINT8, TensorShape({byte_size}), &temporary_memory));
     if (!allocation_status.ok()) {
-      return tsl::Status{
+      return absl::Status{
           absl::StatusCode::kUnavailable,
           absl::StrCat("Failed to allocate requested memory of (", byte_size,
                        ").")};
@@ -526,11 +527,12 @@ class BlasScratchAllocator : public se::ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return tsl::StatusOr<DeviceMemoryBytes>(DeviceMemoryBytes::MakeFromByteSize(
-        temporary_memory.flat<uint8>().data(),
-        temporary_memory.flat<uint8>().size()));
+    return absl::StatusOr<BlasScratchAllocator::DeviceMemoryBytes>(
+        DeviceMemoryBytes::MakeFromByteSize(
+            temporary_memory.flat<uint8_t>().data(),
+            temporary_memory.flat<uint8_t>().size()));
   }
-  int64 TotalByteSize() { return total_byte_size_; }
+  int64_t TotalByteSize() { return total_byte_size_; }
 
  private:
   int64_t memory_limit_;
@@ -548,9 +550,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
     se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose,
                                    se::blas::Transpose::kTranspose,
                                    se::blas::Transpose::kConjugateTranspose};
-    const uint64 m = in_x.dim_size(adj_x || trans_x ? 2 : 1);
-    const uint64 k = in_x.dim_size(adj_x || trans_x ? 1 : 2);
-    const uint64 n = in_y.dim_size(adj_y || trans_y ? 1 : 2);
+    const uint64_t m = in_x.dim_size(adj_x || trans_x ? 2 : 1);
+    const uint64_t k = in_x.dim_size(adj_x || trans_x ? 1 : 2);
+    const uint64_t n = in_y.dim_size(adj_y || trans_y ? 1 : 2);
     const int64_t batch_size = bcast.output_batch_size();
     auto blas_transpose_a = trans[adj_x ? 2 : (trans_x ? 1 : 0)];
     auto blas_transpose_b = trans[adj_y ? 2 : (trans_y ? 1 : 0)];
@@ -574,9 +576,9 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
     auto* a_base_ptr = in_x.template flat<Scalar>().data();
     auto* b_base_ptr = in_y.template flat<Scalar>().data();
     auto* c_base_ptr = out->template flat<Scalar>().data();
-    uint64 a_stride;
-    uint64 b_stride;
-    uint64 c_stride;
+    uint64_t a_stride;
+    uint64_t b_stride;
+    uint64_t c_stride;
 
     bool is_full_broadcast =
         std::min(bcast.x_batch_size(), bcast.y_batch_size()) == 1;
@@ -658,9 +660,11 @@ struct LaunchBatchMatMul<GPUDevice, Scalar> {
             // Create a new scratch allocator with every autotuning run so that
             // scratch space is deallocated between runs.
             BlasScratchAllocator scratch_allocator(context, max_scratch_size);
-            Status cublas_launch_status = plan_and_algorithms->ExecuteOnStream(
-                stream, *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], i,
-                scratch_allocator, se::DeviceMemoryBase{}, &profile_result);
+            absl::Status cublas_launch_status =
+                plan_and_algorithms->ExecuteOnStream(
+                    stream, *a_ptrs[0], *b_ptrs[0], *c_ptrs[0], i,
+                    scratch_allocator, stream_executor::DeviceAddressBase{},
+                    &profile_result);
 
             VLOG(4) << "  Autotune algorithm " << i
                     << " result: " << profile_result.elapsed_time_in_ms()
diff --git a/tensorflow/core/kernels/matmul_util.cc b/tensorflow/core/kernels/matmul_util.cc
index 3675018709dfc0..cd3a950f8f5c69 100644
--- a/tensorflow/core/kernels/matmul_util.cc
+++ b/tensorflow/core/kernels/matmul_util.cc
@@ -36,8 +36,7 @@ int64_t GetWorkspaceLimit(int64_t default_value_in_bytes) {
   if (workspace_limit_in_mb_str != nullptr &&
       strcmp(workspace_limit_in_mb_str, "") != 0) {
     int64_t scratch_limit_in_mb = -1;
-    if (strings::safe_strto64(workspace_limit_in_mb_str,
-                              &scratch_limit_in_mb)) {
+    if (absl::SimpleAtoi(workspace_limit_in_mb_str, &scratch_limit_in_mb)) {
       return scratch_limit_in_mb * (1 << 20);
     } else {
       LOG(WARNING) << "Invalid value for TF_CUBLAS_WORKSPACE_LIMIT_IN_MB: "
@@ -77,7 +76,7 @@ struct BlasLtMatmulPlanMap {
 
 int MatmulMaxAutotuneAlgorithmCount() {
   int64_t value;
-  Status status =
+  absl::Status status =
       ReadInt64FromEnvVar("TF_MATMUL_AUTOTUNE_MAX_ALGORITHMS", 10, &value);
   if (!status.ok()) {
     LOG(ERROR) << status.message();
@@ -90,7 +89,7 @@ int MatmulMaxAutotuneAlgorithmCount() {
   return value;
 }
 
-StatusOr<se::blas::ComputationType> GetBlasComputationType(
+absl::StatusOr<stream_executor::blas::ComputationType> GetBlasComputationType(
     se::blas::DataType dtype) {
   using se::blas::ComputationType;
   static bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input();
@@ -114,9 +113,11 @@ StatusOr<se::blas::ComputationType> GetBlasComputationType(
 
 }  // namespace
 
-/* static */ StatusOr<const PlanAndAlgorithms*> PlanAndAlgorithms::GetOrCreate(
-    se::Stream* stream, const BlasLtMatmulPlanParams& params,
-    absl::Mutex** ppmu, std::optional<int> max_algorithm_count) {
+/* static */ absl::StatusOr<const PlanAndAlgorithms*>
+PlanAndAlgorithms::GetOrCreate(se::Stream* stream,
+                               const BlasLtMatmulPlanParams& params,
+                               absl::Mutex** ppmu,
+                               std::optional<int> max_algorithm_count) {
   static const int64_t max_scratch_size =
       GetWorkspaceLimit(1LL << 32);  // 4GB by default
   static const int64_t max_autotune_algorithm_count =
@@ -189,25 +190,27 @@ StatusOr<se::blas::ComputationType> GetBlasComputationType(
   return ptr->second.get();
 }
 
-Status PlanAndAlgorithms::ExecuteOnStream(
-    se::Stream* stream, const se::DeviceMemoryBase& a,
-    const se::DeviceMemoryBase& b, se::DeviceMemoryBase& c,
-    size_t algorithm_idx, se::ScratchAllocator& scratch_allocator,
-    const se::DeviceMemoryBase& bias,
+absl::Status PlanAndAlgorithms::ExecuteOnStream(
+    se::Stream* stream, const stream_executor::DeviceAddressBase& a,
+    const stream_executor::DeviceAddressBase& b,
+    stream_executor::DeviceAddressBase& c, size_t algorithm_idx,
+    se::ScratchAllocator& scratch_allocator,
+    const stream_executor::DeviceAddressBase& bias,
     se::blas::ProfileResult* profile_result) const {
   if (!plan || algorithm_idx >= algorithms.size()) {
     return errors::Internal("MatmulPlan or algorithms are not initialized!");
   }
   TF_RETURN_IF_ERROR(plan->SetAlgorithm(algorithms[algorithm_idx]));
-  return plan->ExecuteOnStream(stream, a, b, c, c,
-                               bias,                    // bias_buffer
-                               se::DeviceMemoryBase{},  // aux_buffer
-                               se::DeviceMemoryBase{},  // a_scale_buffer
-                               se::DeviceMemoryBase{},  // b_scale_buffer
-                               se::DeviceMemoryBase{},  // c_scale_buffer
-                               se::DeviceMemoryBase{},  // d_scale_buffer
-                               se::DeviceMemoryBase{},  // d_amax_buffer
-                               scratch_allocator, profile_result);
+  return plan->ExecuteOnStream(
+      stream, a, b, c, c,
+      bias,                                  // bias_buffer
+      stream_executor::DeviceAddressBase{},  // aux_buffer
+      stream_executor::DeviceAddressBase{},  // a_scale_buffer
+      stream_executor::DeviceAddressBase{},  // b_scale_buffer
+      stream_executor::DeviceAddressBase{},  // c_scale_buffer
+      stream_executor::DeviceAddressBase{},  // d_scale_buffer
+      stream_executor::DeviceAddressBase{},  // d_amax_buffer
+      scratch_allocator, profile_result);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/matmul_util.h b/tensorflow/core/kernels/matmul_util.h
index 0bf7f8acb48cf1..abcbe0ad1bea44 100644
--- a/tensorflow/core/kernels/matmul_util.h
+++ b/tensorflow/core/kernels/matmul_util.h
@@ -51,15 +51,17 @@ struct BlasLtMatmulPlanParams {
 };
 
 struct PlanAndAlgorithms {
-  static StatusOr<const PlanAndAlgorithms*> GetOrCreate(
+  static absl::StatusOr<const PlanAndAlgorithms*> GetOrCreate(
       se::Stream* stream, const BlasLtMatmulPlanParams& params,
       absl::Mutex** pmu, std::optional<int> max_algorithm_count = std::nullopt);
 
-  Status ExecuteOnStream(
-      se::Stream* stream, const se::DeviceMemoryBase& a,
-      const se::DeviceMemoryBase& b, se::DeviceMemoryBase& c,
-      size_t algorithm_idx, se::ScratchAllocator& scratch_allocator,
-      const se::DeviceMemoryBase& bias = se::DeviceMemoryBase{},
+  absl::Status ExecuteOnStream(
+      se::Stream* stream, const stream_executor::DeviceAddressBase& a,
+      const stream_executor::DeviceAddressBase& b,
+      stream_executor::DeviceAddressBase& c, size_t algorithm_idx,
+      se::ScratchAllocator& scratch_allocator,
+      const stream_executor::DeviceAddressBase& bias =
+          stream_executor::DeviceAddressBase{},
       se::blas::ProfileResult* profile_result = nullptr) const;
 
   se::gpu::BlasLt::MatmulPlanPtr plan;
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 759811dd74ec47..e7799161eba16c 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -70,7 +70,7 @@ __global__ void MaxPoolForwardNCHW(
     const int channels, const int height, const int width,
     const int pooled_height, const int pooled_width, const int kernel_h,
     const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
-    const int pad_l, dtype* __restrict__ top_data, int64* __restrict__ mask,
+    const int pad_l, dtype* __restrict__ top_data, int64_t* __restrict__ mask,
     const bool include_batch_in_index) {
   GPU_1D_KERNEL_LOOP(index, nthreads) {
     int pw = index % pooled_width;
@@ -110,13 +110,13 @@ __global__ void MaxPoolForwardNCHW(
 // the same X, y coordinate.
 // (so channels = outer_channels, output_size = real output size / 4).
 __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
-    const int nthreads, const int32* __restrict__ bottom_data, const int height,
-    const int width, const int channels, const int pooled_height,
-    const int pooled_width, const int kernel_h, const int kernel_w,
-    const int stride_h, const int stride_w, const int pad_t, const int pad_l,
-    int32* __restrict__ top_data) {
+    const int nthreads, const int32_t* __restrict__ bottom_data,
+    const int height, const int width, const int channels,
+    const int pooled_height, const int pooled_width, const int kernel_h,
+    const int kernel_w, const int stride_h, const int stride_w, const int pad_t,
+    const int pad_l, int32_t* __restrict__ top_data) {
   // TODO(pauldonnelly): Implement a better optimized version of this kernel.
-  const int32 kMinINT8X4 = 0x80808080;
+  const int32_t kMinINT8X4 = 0x80808080;
   GPU_1D_KERNEL_LOOP(index, nthreads) {
     int pw = index % pooled_width;
     int ph = (index / pooled_width) % pooled_height;
@@ -128,8 +128,8 @@ __global__ void MaxPoolForwardNoMaskKernel_NCHW_VECT_C(
     int wend = min(wstart + kernel_w, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    int32 maxval = kMinINT8X4;
-    const int32* bottom_data_n = bottom_data + n * channels * height * width;
+    int32_t maxval = kMinINT8X4;
+    const int32_t* bottom_data_n = bottom_data + n * channels * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
         int idx = (c * height + h) * width + w;
@@ -147,7 +147,7 @@ __global__ void MaxPoolForwardNHWC(
     const int width, const int channels, const int pooled_height,
     const int pooled_width, const int kernel_h, const int kernel_w,
     const int stride_h, const int stride_w, const int pad_t, const int pad_l,
-    dtype* __restrict__ top_data, int64* __restrict__ mask,
+    dtype* __restrict__ top_data, int64_t* __restrict__ mask,
     const bool include_batch_in_index) {
   GPU_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
@@ -203,7 +203,7 @@ __global__ void MaxPoolForwardNHWC(
 template <typename dtype>
 __global__ void MaxPoolBackward(const int nthreads,
                                 const dtype* __restrict__ top_diff,
-                                const int64* __restrict__ mask,
+                                const int64_t* __restrict__ mask,
                                 const int top_offset, const int bottom_offset,
                                 dtype* __restrict__ bottom_diff,
                                 const bool include_batch_in_index) {
@@ -332,7 +332,7 @@ __global__ void MaxPoolGradBackwardNoMaskNHWC(
 template <typename dtype>
 __global__ void MaxPoolGradBackward(const int nthreads,
                                     const dtype* __restrict__ top_diff,
-                                    const int64* __restrict__ mask,
+                                    const int64_t* __restrict__ mask,
                                     const int top_offset,
                                     const int bottom_offset,
                                     dtype* __restrict__ bottom_diff,
@@ -353,11 +353,11 @@ namespace functor {
 // Note: channels is the outer channels (dim 1) which has already been
 // divided by 4.
 bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
-    const int32* bottom_data, const int batch, const int height,
+    const int32_t* bottom_data, const int batch, const int height,
     const int width, int channels, const int pooled_height,
     const int pooled_width, const int kernel_h, const int kernel_w,
     const int stride_h, const int stride_w, const int pad_t, const int pad_l,
-    int32* top_data, const Eigen::GpuDevice& d) {
+    int32_t* top_data, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
   if (output_size == 0) return true;
@@ -377,7 +377,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
     const int channels, const int pooled_height, const int pooled_width,
     const int kernel_h, const int kernel_w, const int stride_h,
     const int stride_w, const int pad_t, const int pad_l, T* top_data,
-    int64* mask, const Eigen::GpuDevice& d, bool propagate_nans,
+    int64_t* mask, const Eigen::GpuDevice& d, bool propagate_nans,
     const bool include_batch_in_index) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
@@ -405,7 +405,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
 template <typename T>
 bool MaxPoolBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
-    const int64* mask, const int top_offset, const int bottom_offset,
+    const int64_t* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d,
     const bool include_batch_in_index) {
   const int kThreadsPerBlock = 1024;
@@ -454,7 +454,7 @@ bool MaxPoolGradBackwardNoMask<T>::operator()(
 template <typename T>
 bool MaxPoolGradBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
-    const int64* mask, const int top_offset, const int bottom_offset,
+    const int64_t* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d,
     const bool include_batch_in_index) {
   if (input_size == 0) return true;
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.h b/tensorflow/core/kernels/maxpooling_op_gpu.h
index 650a01e3ff0dc1..3e8ba784d9714e 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.h
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.h
@@ -44,11 +44,11 @@ struct MaxPoolForwardWithOptionalArgmax {
 };
 
 struct MaxPoolForwardNoMask_NCHW_VECT_C {
-  bool operator()(const int32* bottom_data, const int batch, const int height,
+  bool operator()(const int32_t* bottom_data, const int batch, const int height,
                   const int width, int channels, const int pooled_height,
                   const int pooled_width, const int kernel_h,
                   const int kernel_w, const int stride_h, const int stride_w,
-                  const int pad_t, const int pad_l, int32* top_data,
+                  const int pad_t, const int pad_l, int32_t* top_data,
                   const Eigen::GpuDevice& d);
 };
 
diff --git a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
index 6136177effa4f3..9a76c85aba09c7 100644
--- a/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/multinomial_op_gpu.cu.cc
@@ -39,8 +39,8 @@ using GPUDevice = Eigen::GpuDevice;
 // Kernel for Multinomial op.  Data is interpreted to have the following shapes:
 //   scores: [B, S, C];  maxima: [B, S];  output: [B, S].
 template <typename OutputType>
-__global__ void MultinomialKernel(int32 nthreads, const int32 num_classes,
-                                  const int32 num_samples,
+__global__ void MultinomialKernel(int32_t nthreads, const int32_t num_classes,
+                                  const int32_t num_samples,
                                   const float* __restrict__ scores,
                                   const float* __restrict__ maxima,
                                   OutputType* __restrict__ output) {
@@ -113,7 +113,7 @@ struct MultinomialFunctor<GPUDevice, T, OutputType> {
     // Necessary for atomicMax() inside the kernel.
     output.device(d) = output.constant(0LL);
 
-    const int32 work_items = batch_size * num_samples * num_classes;
+    const int32_t work_items = batch_size * num_samples * num_classes;
     GpuLaunchConfig config = GetGpuLaunchConfig(work_items, d);
     TF_CHECK_OK(GpuLaunchKernel(
         MultinomialKernel<OutputType>, config.block_count,
diff --git a/tensorflow/core/kernels/nccl_ops.cc b/tensorflow/core/kernels/nccl_ops.cc
index 00242596140499..77eb070e628576 100644
--- a/tensorflow/core/kernels/nccl_ops.cc
+++ b/tensorflow/core/kernels/nccl_ops.cc
@@ -52,7 +52,7 @@ class NcclAsyncOpBase : public AsyncOpKernel {
     OP_REQUIRES_OK(c, c->GetAttr("shared_name", &collective_prefix_));
   }
 
-  string GetCollectiveKey(OpKernelContext* c) {
+  std::string GetCollectiveKey(OpKernelContext* c) {
     return strings::StrCat(collective_prefix_, ";", c->step_id(), ";",
                            c->frame_iter().frame_id, ":",
                            c->frame_iter().iter_id);
@@ -62,7 +62,7 @@ class NcclAsyncOpBase : public AsyncOpKernel {
 
  private:
   int num_devices_;
-  string collective_prefix_;
+  std::string collective_prefix_;
 
   NcclAsyncOpBase(const NcclAsyncOpBase&) = delete;
   void operator=(const NcclAsyncOpBase&) = delete;
@@ -71,7 +71,7 @@ class NcclAsyncOpBase : public AsyncOpKernel {
 class NcclReduceOpBase : public NcclAsyncOpBase {
  public:
   explicit NcclReduceOpBase(OpKernelConstruction* c) : NcclAsyncOpBase(c) {
-    string reduction;
+    std::string reduction;
     OP_REQUIRES_OK(c, c->GetAttr("reduction", &reduction));
     if (reduction == "min") {
       reduction_op_ = ncclMin;
@@ -106,7 +106,7 @@ class NcclAllReduceOpKernel : public NcclReduceOpBase {
     OP_REQUIRES_OK_ASYNC(
         c, c->forward_input_or_allocate_output({0}, 0, input->shape(), &output),
         done);
-    auto actual_done = [c, done](Status s) {
+    auto actual_done = [c, done](absl::Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
     };
@@ -137,7 +137,7 @@ class NcclReduceSendKernel : public NcclReduceOpBase {
       : NcclReduceOpBase(c) {}
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    auto actual_done = [c, done](Status s) {
+    auto actual_done = [c, done](absl::Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
     };
@@ -173,7 +173,7 @@ class NcclReduceRecvKernel : public NcclReduceOpBase {
     OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, input->shape(), &output),
                          done);
 
-    auto actual_done = [c, done](Status s) {
+    auto actual_done = [c, done](absl::Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
     };
@@ -207,7 +207,7 @@ class NcclBroadcastSendKernel : public NcclAsyncOpBase {
       : NcclAsyncOpBase(c) {}
 
   void ComputeAsync(OpKernelContext* c, DoneCallback done) override {
-    auto actual_done = [c, done](Status s) {
+    auto actual_done = [c, done](absl::Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
     };
@@ -239,11 +239,11 @@ class NcclBroadcastRecvKernel : public NcclAsyncOpBase {
     const Tensor& shape_t = c->input(0);
     TensorShape shape;
     OP_REQUIRES_OK_ASYNC(
-        c, TensorShapeUtils::MakeShape(shape_t.vec<int32>(), &shape), done);
+        c, TensorShapeUtils::MakeShape(shape_t.vec<int32_t>(), &shape), done);
     Tensor* output;
     OP_REQUIRES_OK_ASYNC(c, c->allocate_output(0, shape, &output), done);
 
-    auto actual_done = [c, done](Status s) {
+    auto actual_done = [c, done](absl::Status s) {
       OP_REQUIRES_OK_ASYNC(c, s, done);
       done();
     };
diff --git a/tensorflow/core/kernels/one_hot_op_test.cc b/tensorflow/core/kernels/one_hot_op_test.cc
index 6801b29e2509f7..09cb9b8d9388ea 100644
--- a/tensorflow/core/kernels/one_hot_op_test.cc
+++ b/tensorflow/core/kernels/one_hot_op_test.cc
@@ -30,13 +30,13 @@ static Graph* OneHot(int batch_size, int num_classes, int axis) {
   std::mt19937 gen(rd());
   std::uniform_int_distribution<> dist(0, num_classes - 1);
 
-  auto indices_t = indices.flat<int32>();
+  auto indices_t = indices.flat<int32_t>();
   for (int i = 0; i < batch_size; ++i) {
     indices_t(i) = dist(gen);
   }
 
   Tensor depth(DT_INT32, TensorShape({}));
-  depth.scalar<int32>()() = num_classes;
+  depth.scalar<int32_t>()() = num_classes;
 
   Tensor on_value(DT_FLOAT, TensorShape({}));
   on_value.scalar<float>()() = 1.0f;
diff --git a/tensorflow/core/kernels/pack_op.cc b/tensorflow/core/kernels/pack_op.cc
index 0e60b84dc9ff25..f4c1db06bad961 100644
--- a/tensorflow/core/kernels/pack_op.cc
+++ b/tensorflow/core/kernels/pack_op.cc
@@ -168,8 +168,8 @@ REGISTER_KERNEL_BUILDER(Name("Pack")
                             .Device(DEVICE_GPU)
                             .HostMemory("values")
                             .HostMemory("output")
-                            .TypeConstraint<int32>("T"),
-                        PackOp<CPUDevice, int32>);
+                            .TypeConstraint<int32_t>("T"),
+                        PackOp<CPUDevice, int32_t>);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
index e7b76653dc329e..0fbb33816c8b14 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc
@@ -51,16 +51,16 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename T>
 
 __global__ void __launch_bounds__(1024)
-    TruncatedNormalKernel(random::PhiloxRandom gen, T* data, int64 num_batches,
-                          int64 samples_per_batch, int64 num_elements,
-                          const T* __restrict__ means, bool single_mean,
-                          const T* __restrict__ stddevs, bool single_stddev,
-                          const T* __restrict__ minvals, bool single_minval,
-                          const T* __restrict__ maxvals, bool single_maxval,
-                          int64 kMaxIterations) {
-  const int32 max_samples_per_item = 2 * kMaxIterations;
+    TruncatedNormalKernel(random::PhiloxRandom gen, T* data,
+                          int64_t num_batches, int64_t samples_per_batch,
+                          int64_t num_elements, const T* __restrict__ means,
+                          bool single_mean, const T* __restrict__ stddevs,
+                          bool single_stddev, const T* __restrict__ minvals,
+                          bool single_minval, const T* __restrict__ maxvals,
+                          bool single_maxval, int64_t kMaxIterations) {
+  const int32_t max_samples_per_item = 2 * kMaxIterations;
   // Initial offset as given by GPU_1D_KERNEL_LOOP.
-  const int32 initial_offset = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t initial_offset = blockIdx.x * blockDim.x + threadIdx.x;
   gen.Skip(max_samples_per_item * initial_offset);
   typedef random::UniformDistribution<random::PhiloxRandom, T> Uniform;
   typedef random::NormalDistribution<random::PhiloxRandom, T> Normal;
@@ -82,15 +82,15 @@ __global__ void __launch_bounds__(1024)
   // skips max_samples_per_item in the generator. Then after generating this
   // item, we need to skip the samples for one element for every thread to get
   // to the next element that we actually process.
-  const int32 samples_between_processed_elements =
+  const int32_t samples_between_processed_elements =
       max_samples_per_item * (gridDim.x * blockDim.x);
 
   GPU_1D_KERNEL_LOOP(offset, num_elements) {
     // Track how many more samples we need to skip before we process the next
     // element.
-    int32 remaining_samples = samples_between_processed_elements;
+    int32_t remaining_samples = samples_between_processed_elements;
 
-    const int64 batch_id = offset / samples_per_batch;
+    const int64_t batch_id = offset / samples_per_batch;
     T mean = means[single_mean ? 0 : batch_id];
     const T input_stddev = stddevs[single_stddev ? 0 : batch_id];
     T minval = minvals[single_minval ? 0 : batch_id];
@@ -231,8 +231,8 @@ __global__ void __launch_bounds__(1024)
 // Partial specialization for GPU
 template <typename T>
 struct TruncatedNormalFunctor<GPUDevice, T> {
-  void operator()(OpKernelContext* ctx, const GPUDevice& d, int64 num_batches,
-                  int64 samples_per_batch, int64 num_elements,
+  void operator()(OpKernelContext* ctx, const GPUDevice& d, int64_t num_batches,
+                  int64_t samples_per_batch, int64_t num_elements,
                   typename TTypes<T>::ConstFlat means,
                   typename TTypes<T>::ConstFlat stddevs,
                   typename TTypes<T>::ConstFlat minvals,
diff --git a/tensorflow/core/kernels/pooling_ops_common_gpu.h b/tensorflow/core/kernels/pooling_ops_common_gpu.h
index c5d51e5935677a..7a891ddd63f2b3 100644
--- a/tensorflow/core/kernels/pooling_ops_common_gpu.h
+++ b/tensorflow/core/kernels/pooling_ops_common_gpu.h
@@ -41,8 +41,8 @@ class DnnPoolingOp {
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
                       se::dnn::PoolingMode pooling_mode,
-                      const std::vector<int32>& size,
-                      const std::vector<int32>& stride, Padding padding,
+                      const std::vector<int32_t>& size,
+                      const std::vector<int32_t>& stride, Padding padding,
                       std::vector<int64_t> explicit_paddings,
                       TensorFormat data_format, const Tensor& tensor_in,
                       const TensorShape& tensor_out_shape, bool propagate_nans);
@@ -57,8 +57,8 @@ class DnnPoolingGradOp {
   typedef GPUDevice Device;
   static void Compute(OpKernelContext* context,
                       se::dnn::PoolingMode pooling_mode,
-                      const std::vector<int32>& size,
-                      const std::vector<int32>& stride, Padding padding,
+                      const std::vector<int32_t>& size,
+                      const std::vector<int32_t>& stride, Padding padding,
                       std::vector<int64_t> explicit_paddings,
                       TensorFormat data_format, const Tensor* tensor_in,
                       const Tensor* tensor_out, const Tensor& out_backprop,
diff --git a/tensorflow/core/kernels/population_count_op_gpu.cu.cc b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
index 5f2f14cfba0fb7..7df72b3a8f0b84 100644
--- a/tensorflow/core/kernels/population_count_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/population_count_op_gpu.cu.cc
@@ -35,34 +35,34 @@ namespace functor {
 template <typename T>
 __global__ void PopulationCountKernel(const int size,
                                       const T* __restrict__ input,
-                                      uint8* __restrict__ output) {
+                                      uint8_t* __restrict__ output) {
   GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popc(ldg(input + i)); }
 }
 
 template <>
 __global__ void PopulationCountKernel(const int size,
-                                      const int8* __restrict__ input,
-                                      uint8* __restrict__ output) {
+                                      const int8_t* __restrict__ input,
+                                      uint8_t* __restrict__ output) {
   // For some reason, __popc on a negative int8 gets confused.
   GPU_1D_KERNEL_LOOP(i, size) {
-    output[i] = __popc(ldg(reinterpret_cast<const uint8*>(input + i)));
+    output[i] = __popc(ldg(reinterpret_cast<const uint8_t*>(input + i)));
   }
 }
 
 template <>
 __global__ void PopulationCountKernel(const int size,
-                                      const int16* __restrict__ input,
-                                      uint8* __restrict__ output) {
+                                      const int16_t* __restrict__ input,
+                                      uint8_t* __restrict__ output) {
   // For some reason, __popc on a negative int16 gets confused.
   GPU_1D_KERNEL_LOOP(i, size) {
-    output[i] = __popc(ldg(reinterpret_cast<const uint16*>(input + i)));
+    output[i] = __popc(ldg(reinterpret_cast<const uint16_t*>(input + i)));
   }
 }
 
 template <>
-__global__ void PopulationCountKernel<int64_t>(const int size,
-                                               const int64* __restrict__ input,
-                                               uint8* __restrict__ output) {
+__global__ void PopulationCountKernel<int64_t>(
+    const int size, const int64_t* __restrict__ input,
+    uint8_t* __restrict__ output) {
   GPU_1D_KERNEL_LOOP(i, size) { output[i] = __popcll(ldg(input + i)); }
 }
 
diff --git a/tensorflow/core/kernels/queue_base.cc b/tensorflow/core/kernels/queue_base.cc
index 4274c775bd1557..e62b4cdf2db9d6 100644
--- a/tensorflow/core/kernels/queue_base.cc
+++ b/tensorflow/core/kernels/queue_base.cc
@@ -51,7 +51,7 @@ absl::Status HandleSliceToElement(const Tensor& parent, Tensor* element,
 
 QueueBase::QueueBase(int32_t capacity, const DataTypeVector& component_dtypes,
                      const std::vector<TensorShape>& component_shapes,
-                     const string& name)
+                     const std::string& name)
     : capacity_(capacity),
       component_dtypes_(component_dtypes),
       component_shapes_(component_shapes),
@@ -78,8 +78,9 @@ absl::Status QueueBase::ValidateTupleCommon(const Tuple& tuple) const {
 }
 
 // static
-string QueueBase::ShapeListString(const absl::Span<const TensorShape>& shapes) {
-  string result = "[";
+std::string QueueBase::ShapeListString(
+    const absl::Span<const TensorShape>& shapes) {
+  std::string result = "[";
   bool first = true;
   for (const TensorShape& shape : shapes) {
     absl::StrAppend(&result, first ? "" : ", ", shape.DebugString());
@@ -90,7 +91,7 @@ string QueueBase::ShapeListString(const absl::Span<const TensorShape>& shapes) {
 }
 
 absl::Status QueueBase::MatchesNodeDefOp(const NodeDef& node_def,
-                                         const string& op) const {
+                                         const std::string& op) const {
   if (node_def.op() != op) {
     return errors::InvalidArgument("Shared queue '", name_, "' has type '", op,
                                    "' that does not match type of Node '",
diff --git a/tensorflow/core/kernels/ragged_cross_op.cc b/tensorflow/core/kernels/ragged_cross_op.cc
index 9e7d0d52ac2ae7..9612e6bcdbabfb 100644
--- a/tensorflow/core/kernels/ragged_cross_op.cc
+++ b/tensorflow/core/kernels/ragged_cross_op.cc
@@ -51,7 +51,7 @@ class FeatureReader {
   virtual int64_t FeatureCount(int64_t batch) const = 0;
 
   // Copies the value for the specified feature to `out`.
-  virtual void ReadValue(int64_t batch, int64_t n, uint64* out) const = 0;
+  virtual void ReadValue(int64_t batch, int64_t n, uint64_t* out) const = 0;
   virtual void ReadValue(int64_t batch, int64_t n, tstring* out) const = 0;
 
   virtual ~FeatureReader() {}
@@ -70,10 +70,10 @@ void CopyToString(const tstring& src, tstring* dst) {
 void CopyToString(int64_t src, tstring* dst) { *dst = std::to_string(src); }
 
 // Copies a feature value `src` to an int64 fingerprint `dst`.
-void CopyToFingerprint(const tstring& feature, uint64* dst) {
+void CopyToFingerprint(const tstring& feature, uint64_t* dst) {
   *dst = Fingerprint64(feature);
 }
-void CopyToFingerprint(int64_t feature, uint64* dst) { *dst = feature; }
+void CopyToFingerprint(int64_t feature, uint64_t* dst) { *dst = feature; }
 
 // A FeatureReader that is backed by a ragged tensor.
 template <typename ValuesType, typename SplitsType>
@@ -87,7 +87,7 @@ class RaggedFeatureReader : public FeatureReader {
     return row_splits_(batch + 1) - row_splits_(batch);
   }
 
-  void ReadValue(int64_t batch, int64_t n, uint64* out) const override {
+  void ReadValue(int64_t batch, int64_t n, uint64_t* out) const override {
     CopyToFingerprint(values_(row_splits_(batch) + n), out);
   }
 
@@ -110,7 +110,7 @@ class DenseFeatureReader : public FeatureReader {
 
   int64_t FeatureCount(int64_t batch) const override { return feature_count_; }
 
-  void ReadValue(int64_t batch, int64_t n, uint64* out) const override {
+  void ReadValue(int64_t batch, int64_t n, uint64_t* out) const override {
     CopyToFingerprint(values_(batch, n), out);
   }
 
@@ -145,7 +145,7 @@ class SparseFeatureReader : public FeatureReader {
     return row_splits_[batch + 1] - row_splits_[batch];
   }
 
-  void ReadValue(int64_t batch, int64_t n, uint64* out) const override {
+  void ReadValue(int64_t batch, int64_t n, uint64_t* out) const override {
     CopyToFingerprint(values_(row_splits_[batch] + n), out);
   }
 
@@ -179,7 +179,7 @@ class OutputWriterImpl : public OutputWriter {
   using FlatSplits = typename TTypes<SplitsType>::ConstFlat;
 
   OutputWriterImpl(const FeatureReaders& features, int64_t num_buckets,
-                   uint64 hash_key, const Tensor* splits_out,
+                   uint64_t hash_key, const Tensor* splits_out,
                    Tensor* values_out)
       : features_(features),
         num_buckets_(num_buckets),
@@ -220,9 +220,9 @@ class OutputWriterImpl : public OutputWriter {
   void WriteCombination(int64_t batch_index,
                         const std::vector<int>& combination, int64_t* out) {
     // Do the fingerprint concatenation on uint64.
-    uint64 hashed_output = hash_key_;
+    uint64_t hashed_output = hash_key_;
     for (size_t i = 0; i < combination.size(); ++i) {
-      uint64 hash_i;
+      uint64_t hash_i;
       features_[i]->ReadValue(batch_index, combination[i], &hash_i);
       hashed_output = FingerprintCat64(hashed_output, hash_i);
     }
@@ -254,7 +254,7 @@ class OutputWriterImpl : public OutputWriter {
 
   const FeatureReaders& features_;
   const int64_t num_buckets_;
-  const uint64 hash_key_;
+  const uint64_t hash_key_;
   FlatSplits splits_out_;
   FlatValues values_out_;
 };
@@ -263,7 +263,7 @@ class OutputWriterImpl : public OutputWriter {
 // given tensors.
 std::unique_ptr<OutputWriter> MakeOutputWriter(const FeatureReaders& features,
                                                int64_t num_buckets,
-                                               uint64 hash_key,
+                                               uint64_t hash_key,
                                                const Tensor* splits_out,
                                                Tensor* values_out) {
   if (values_out->dtype() == DT_INT64) {
@@ -271,7 +271,7 @@ std::unique_ptr<OutputWriter> MakeOutputWriter(const FeatureReaders& features,
       return std::make_unique<OutputWriterImpl<int64_t, int64_t>>(
           features, num_buckets, hash_key, splits_out, values_out);
     } else {
-      return std::make_unique<OutputWriterImpl<int64_t, int32>>(
+      return std::make_unique<OutputWriterImpl<int64_t, int32_t>>(
           features, num_buckets, hash_key, splits_out, values_out);
     }
   } else {
@@ -279,7 +279,7 @@ std::unique_ptr<OutputWriter> MakeOutputWriter(const FeatureReaders& features,
       return std::make_unique<OutputWriterImpl<tstring, int64_t>>(
           features, num_buckets, hash_key, splits_out, values_out);
     } else {
-      return std::make_unique<OutputWriterImpl<tstring, int32>>(
+      return std::make_unique<OutputWriterImpl<tstring, int32_t>>(
           features, num_buckets, hash_key, splits_out, values_out);
     }
   }
@@ -298,7 +298,7 @@ class RaggedCrossOp : public OpKernel {
     // supported by REGISTER_OP.
     int64_t signed_hash_key_;
     OP_REQUIRES_OK(context, context->GetAttr("hash_key", &signed_hash_key_));
-    hash_key_ = static_cast<uint64>(signed_hash_key_);
+    hash_key_ = static_cast<uint64_t>(signed_hash_key_);
 
     int num_sparse;
     OP_REQUIRES_OK(context, context->GetAttr("Nsparse", &num_sparse));
@@ -542,7 +542,7 @@ class RaggedCrossOp : public OpKernel {
             new RaggedFeatureReader<int64_t, int64_t>(values, splits));
       } else {
         features->emplace_back(
-            new RaggedFeatureReader<int64_t, int32>(values, splits));
+            new RaggedFeatureReader<int64_t, int32_t>(values, splits));
       }
     } else {
       if (splits.dtype() == DT_INT64) {
@@ -550,7 +550,7 @@ class RaggedCrossOp : public OpKernel {
             new RaggedFeatureReader<tstring, int64_t>(values, splits));
       } else {
         features->emplace_back(
-            new RaggedFeatureReader<tstring, int32>(values, splits));
+            new RaggedFeatureReader<tstring, int32_t>(values, splits));
       }
     }
     return absl::OkStatus();
@@ -632,7 +632,7 @@ class RaggedCrossOp : public OpKernel {
   }
 
   int64_t num_buckets_;
-  uint64 hash_key_;
+  uint64_t hash_key_;
   std::vector<DataType> ragged_values_types_;
   std::vector<DataType> ragged_splits_types_;
   std::vector<DataType> sparse_values_types_;
@@ -642,8 +642,8 @@ class RaggedCrossOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("RaggedCross")
                             .Device(DEVICE_CPU)
-                            .TypeConstraint<int32>("out_row_splits_type"),
-                        RaggedCrossOp<int32>);
+                            .TypeConstraint<int32_t>("out_row_splits_type"),
+                        RaggedCrossOp<int32_t>);
 REGISTER_KERNEL_BUILDER(Name("RaggedCross")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<int64_t>("out_row_splits_type"),
diff --git a/tensorflow/core/kernels/ragged_gather_op_test.cc b/tensorflow/core/kernels/ragged_gather_op_test.cc
index ca070524a62acc..cebccdd360f2d4 100644
--- a/tensorflow/core/kernels/ragged_gather_op_test.cc
+++ b/tensorflow/core/kernels/ragged_gather_op_test.cc
@@ -65,7 +65,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather) {
   // indices = [2, 1, 0, 3]
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
   // params.shape = [4, None]
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({4}),                     // indices.shape
       {2, 1, 0, 3},                         // indices
       {{0, 3, 3, 7, 9}},                    // params_nested_splits
@@ -87,7 +87,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_3DParams) {
   // indices = [2, 1, 0, 2, 3]
   // params = [[[]], [[.1, 2], [.3]], [], [[.4, .5], [.6, .7, .8]], [[.9]]]
   // params.shape = [5, None, None]
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({5}),                             // indices.shape
       {2, 1, 0, 2, 3},                              // indices
       {{0, 1, 3, 3, 5, 6}, {0, 0, 2, 3, 5, 8, 9}},  // params_nested_splits
@@ -111,7 +111,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_4DParams) {
   // indices = [2, 1, 0, 2]
   // params = [[[]], [[[1, 2], [3, 4], [5, 6]], [[7, 8]]], []]
   // params.shape = [4, None, None, 2]
-  BuildRaggedGatherGraph<int32, int32>(
+  BuildRaggedGatherGraph<int32_t, int32_t>(
       TensorShape({4}),              // indices.shape
       {2, 1, 0, 2},                  // indices
       {{0, 1, 3, 3}, {0, 0, 3, 4}},  // params_nested_splits
@@ -129,15 +129,15 @@ TEST_F(RaggedGatherOpTest, RaggedGather_4DParams) {
                                    test::AsTensor<int64_t>({0, 0, 2, 3, 3}));
   test::ExpectTensorEqual<int64_t>(*GetOutput(1),
                                    test::AsTensor<int64_t>({0, 3, 4, 4}));
-  test::ExpectTensorEqual<int32>(
+  test::ExpectTensorEqual<int32_t>(
       *GetOutput(2),
-      test::AsTensor<int32>({1, 2, 3, 4, 5, 6, 7, 8}, TensorShape({4, 2})));
+      test::AsTensor<int32_t>({1, 2, 3, 4, 5, 6, 7, 8}, TensorShape({4, 2})));
 }
 
 TEST_F(RaggedGatherOpTest, RaggedGather_2DIndices) {
   // indices = [[2, 1], [0, 3]]
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({2, 2}),                  // indices.shape
       {2, 1, 0, 3},                         // indices
       {{0, 3, 3, 7, 9}},                    // params_nested_splits
@@ -161,7 +161,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_2DIndices) {
 TEST_F(RaggedGatherOpTest, RaggedGather_ScalarIndices) {
   // indices = 2
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({}),                      // indices.shape
       {2},                                  // indices
       {{0, 3, 3, 7, 9}},                    // params_nested_splits
@@ -178,7 +178,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_ScalarIndices) {
 TEST_F(RaggedGatherOpTest, RaggedGather_OutOfBounds) {
   // indices = [2, 10]
   // params = [[.1, .2, .3], [], [.4, .5, .6, .7], [.8, .9]]
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({2}),                     // indices.shape
       {2, 10},                              // indices
       {{0, 3, 3, 7, 9}},                    // params_nested_splits
@@ -189,7 +189,7 @@ TEST_F(RaggedGatherOpTest, RaggedGather_OutOfBounds) {
 }
 
 TEST_F(RaggedGatherOpTest, InvalidSplitsNotSorted) {
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({2}),                     // indices.shape
       {0, 2},                               // indices
       {{0, 3, 5, 2, 9}},                    // params_nested_splits
@@ -200,7 +200,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsNotSorted) {
 }
 
 TEST_F(RaggedGatherOpTest, InvalidSplitsNegative) {
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({2}),                     // indices.shape
       {0, 2},                               // indices
       {{-1, 3, 2, 7, 9}},                   // params_nested_splits
@@ -211,7 +211,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsNegative) {
 }
 
 TEST_F(RaggedGatherOpTest, InvalidSplitsEmpty) {
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({0}),  // indices.shape
       {},                // indices
       {{}},              // params_nested_splits
@@ -222,7 +222,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsEmpty) {
 }
 
 TEST_F(RaggedGatherOpTest, InvalidSplitsTooBig) {
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({2}),                     // indices.shape
       {0, 2},                               // indices
       {{0, 20, 40, 80, 100}},               // params_nested_splits
@@ -234,7 +234,7 @@ TEST_F(RaggedGatherOpTest, InvalidSplitsTooBig) {
 }
 
 TEST_F(RaggedGatherOpTest, BadValuesShape) {
-  BuildRaggedGatherGraph<float, int32>(
+  BuildRaggedGatherGraph<float, int32_t>(
       TensorShape({0}),  // indices.shape
       {},                // indices
       {{0}},             // params_nested_splits
diff --git a/tensorflow/core/kernels/random_op_gpu.h b/tensorflow/core/kernels/random_op_gpu.h
index f8efa21daba8ff..dbb66c2148397d 100644
--- a/tensorflow/core/kernels/random_op_gpu.h
+++ b/tensorflow/core/kernels/random_op_gpu.h
@@ -34,17 +34,17 @@ struct FillPhiloxRandomKernel;
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, false> {
   typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_INLINE void Run(const uint64* key, const uint64* counter,
-                                random::PhiloxRandom gen, T* data, int64 size,
+  PHILOX_DEVICE_INLINE void Run(const uint64_t* key, const uint64_t* counter,
+                                random::PhiloxRandom gen, T* data, int64_t size,
                                 Distribution dist);
 };
 
 template <class Distribution>
 struct FillPhiloxRandomKernel<Distribution, true> {
   typedef typename Distribution::ResultElementType T;
-  PHILOX_DEVICE_INLINE void Run(const uint64* key, const uint64* counter,
+  PHILOX_DEVICE_INLINE void Run(const uint64_t* key, const uint64_t* counter,
                                 random::PhiloxRandom base_gen, T* data,
-                                int64 size, Distribution dist);
+                                int64_t size, Distribution dist);
 };
 
 template <typename T, int ElementCount>
@@ -83,14 +83,14 @@ class SampleCopier<float, 4> {
 };
 
 template <>
-class SampleCopier<int32, 4> {
+class SampleCopier<int32_t, 4> {
  public:
   // Copies the elements from the array to buf. buf must be 128-bit aligned,
   // which is true for tensor data, and all offsets that are a multiple of the
   // vector size (because the vectors are 128 bits long).
   inline __device__ void operator()(
-      int32* __restrict__ buf,
-      const tensorflow::random::Array<int32, 4>& array) const {
+      int32_t* __restrict__ buf,
+      const tensorflow::random::Array<int32_t, 4>& array) const {
     ::int4 vec;
     vec.x = array[0];
     vec.y = array[1];
@@ -119,14 +119,14 @@ class SampleCopier<double, 2> {
 };
 
 template <>
-class SampleCopier<int64, 2> {
+class SampleCopier<int64_t, 2> {
  public:
   // Copies the elements from the array to buf. buf must be 128-bit aligned,
   // which is true for tensor data, and all offsets that are a multiple of the
   // vector size (because the vectors are 128 bits long).
   inline __device__ void operator()(
-      int64* __restrict__ buf,
-      const tensorflow::random::Array<int64, 2>& array) const {
+      int64_t* __restrict__ buf,
+      const tensorflow::random::Array<int64_t, 2>& array) const {
     longlong2 vec;
     vec.x = array[0];
     vec.y = array[1];
@@ -139,13 +139,13 @@ class SampleCopier<int64, 2> {
 // distribution. Each output takes a fixed number of samples.
 template <class Distribution>
 PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel<Distribution, false>::Run(
-    const uint64* key, const uint64* counter, random::PhiloxRandom gen, T* data,
-    int64 size, Distribution dist) {
+    const uint64_t* key, const uint64_t* counter, random::PhiloxRandom gen,
+    T* data, int64_t size, Distribution dist) {
   const int kGroupSize = Distribution::kResultElementCount;
 
-  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const int32 total_thread_count = gridDim.x * blockDim.x;
-  int64 offset = thread_id * kGroupSize;
+  const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t total_thread_count = gridDim.x * blockDim.x;
+  int64_t offset = thread_id * kGroupSize;
   if (key != nullptr && counter != nullptr) {
     gen = GetPhiloxRandomFromCounterKeyMem(counter, key);
   }
@@ -174,8 +174,8 @@ PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel<Distribution, false>::Run(
 // distribution. Each output takes a variable number of samples.
 template <class Distribution>
 PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel<Distribution, true>::Run(
-    const uint64* key, const uint64* counter, random::PhiloxRandom base_gen,
-    T* data, int64 size, Distribution dist) {
+    const uint64_t* key, const uint64_t* counter, random::PhiloxRandom base_gen,
+    T* data, int64_t size, Distribution dist) {
   if (key != nullptr && counter != nullptr) {
     base_gen = GetPhiloxRandomFromCounterKeyMem(counter, key);
   }
@@ -189,10 +189,10 @@ PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel<Distribution, true>::Run(
                                            kReservedSamplesPerOutput /
                                            PhiloxRandom::kResultElementCount;
 
-  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const int32 total_thread_count = gridDim.x * blockDim.x;
-  int64 group_index = thread_id;
-  int64 offset = group_index * kGroupSize;
+  const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t total_thread_count = gridDim.x * blockDim.x;
+  int64_t group_index = thread_id;
+  int64_t offset = group_index * kGroupSize;
 
   while (offset < size) {
     // Since each output takes a variable number of samples, we need to
@@ -219,10 +219,10 @@ PHILOX_DEVICE_INLINE void FillPhiloxRandomKernel<Distribution, true>::Run(
 // A simple launch pad to call the correct function templates to fill the data
 template <class Distribution>
 __global__ void __launch_bounds__(1024)
-    FillPhiloxRandomKernelLaunch(const uint64* key, const uint64* counter,
+    FillPhiloxRandomKernelLaunch(const uint64_t* key, const uint64_t* counter,
                                  random::PhiloxRandom base_gen,
                                  typename Distribution::ResultElementType* data,
-                                 int64 size, Distribution dist) {
+                                 int64_t size, Distribution dist) {
   FillPhiloxRandomKernel<Distribution,
                          Distribution::kVariableSamplesPerOutput>()
       .Run(key, counter, base_gen, data, size, dist);
@@ -231,13 +231,13 @@ __global__ void __launch_bounds__(1024)
 // Partial specialization for GPU
 template <class Distribution>
 void FillPhiloxRandom<GPUDevice, Distribution>::operator()(
-    OpKernelContext*, const GPUDevice& d, const uint64* key,
-    const uint64* counter, random::PhiloxRandom gen,
-    typename Distribution::ResultElementType* data, int64 size,
+    OpKernelContext*, const GPUDevice& d, const uint64_t* key,
+    const uint64_t* counter, random::PhiloxRandom gen,
+    typename Distribution::ResultElementType* data, int64_t size,
     Distribution dist) {
   if (size == 0) return;
-  const int32 block_size = d.maxGpuThreadsPerBlock();
-  const int32 num_blocks =
+  const int32_t block_size = d.maxGpuThreadsPerBlock();
+  const int32_t num_blocks =
       std::min<int64_t>(
           d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor(),
           size + block_size - 1) /
diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc
index dab0b9b07ad84b..b6a1ee9e57515a 100644
--- a/tensorflow/core/kernels/relu_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc
@@ -44,10 +44,10 @@ namespace functor {
 __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient,
                                    const Eigen::half* __restrict__ feature,
                                    Eigen::half* __restrict__ backprop,
-                                   int32 count) {
-  int32 half2_count = count >> 1;
-  int32 index = blockIdx.x * blockDim.x + threadIdx.x;
-  const int32 total_device_threads = gridDim.x * blockDim.x;
+                                   int32_t count) {
+  int32_t half2_count = count >> 1;
+  int32_t index = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t total_device_threads = gridDim.x * blockDim.x;
 
   while (index < half2_count) {
     // The fast branch.
@@ -97,9 +97,9 @@ __global__ void ReluGradHalfKernel(const Eigen::half* __restrict__ gradient,
 __global__ void ReluGradHalfKernelVector(
     const Eigen::half* __restrict__ gradient,
     const Eigen::half* __restrict__ feature, Eigen::half* __restrict__ backprop,
-    int32 count) {
-  int32 half8_count = count / VectorSizeElements;
-  int32 index = blockIdx.x * blockDim.x + threadIdx.x;
+    int32_t count) {
+  int32_t half8_count = count / VectorSizeElements;
+  int32_t index = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (index < half8_count) {
     // Cast to xx_h8 for vector load and store.
@@ -174,17 +174,17 @@ struct ReluGrad<Device, Eigen::half> {
     auto backprop_ptr = reinterpret_cast<uintptr_t>(backprop.data());
     bool aligned = gradient_ptr % 16 == 0 && feature_ptr % 16 == 0 &&
                    backprop_ptr % 16 == 0;
-    int32 count = gradient.size();
-    constexpr int32 kThreadInBlock = 512;
+    int32_t count = gradient.size();
+    constexpr int32_t kThreadInBlock = 512;
     if (count == 0) return;
     if (aligned) {
-      int32 half8_count = Eigen::divup(count, VectorSizeElements);
-      int32 kBlock = Eigen::divup(half8_count, kThreadInBlock);
+      int32_t half8_count = Eigen::divup(count, VectorSizeElements);
+      int32_t kBlock = Eigen::divup(half8_count, kThreadInBlock);
       TF_CHECK_OK(GpuLaunchKernel(
           ReluGradHalfKernelVector, kBlock, kThreadInBlock, 0, d.stream(),
           gradient.data(), feature.data(), backprop.data(), count));
     } else {
-      int32 half2_count = Eigen::divup(count, 2);
+      int32_t half2_count = Eigen::divup(count, 2);
       GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
           half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock);
       TF_CHECK_OK(GpuLaunchKernel(
@@ -195,8 +195,8 @@ struct ReluGrad<Device, Eigen::half> {
 };
 
 __global__ void Relu_int8x4_kernel(int vect_count,
-                                   const int32* __restrict__ input,
-                                   int32* __restrict__ output) {
+                                   const int32_t* __restrict__ input,
+                                   int32_t* __restrict__ output) {
   CUDA_1D_KERNEL_LOOP(index, vect_count) {
 #if GOOGLE_CUDA
     output[index] = __vmaxs4(input[index], 0);
@@ -221,17 +221,17 @@ struct Relu<Device, qint8> {
   // 'output' should have the same size as 'input'.
   void operator()(const Device& d, typename TTypes<qint8>::ConstTensor input,
                   typename TTypes<qint8>::Tensor output) {
-    int32 count = input.size();
+    int32_t count = input.size();
     if (count == 0) return;
 
-    int32 vect_count = Eigen::divup(count, 4);
-    constexpr int32 kThreadInBlock = 512;
+    int32_t vect_count = Eigen::divup(count, 4);
+    constexpr int32_t kThreadInBlock = 512;
     GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize(
         vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock);
     TF_CHECK_OK(GpuLaunchKernel(
         Relu_int8x4_kernel, config.block_count, config.thread_per_block, 0,
-        d.stream(), vect_count, reinterpret_cast<const int32*>(input.data()),
-        reinterpret_cast<int32*>(output.data())));
+        d.stream(), vect_count, reinterpret_cast<const int32_t*>(input.data()),
+        reinterpret_cast<int32_t*>(output.data())));
   }
 };
 
diff --git a/tensorflow/core/kernels/reshape_util_gpu.cu.cc b/tensorflow/core/kernels/reshape_util_gpu.cu.cc
index 22f09a0ee92aa8..f3c48ef42c9ae4 100644
--- a/tensorflow/core/kernels/reshape_util_gpu.cu.cc
+++ b/tensorflow/core/kernels/reshape_util_gpu.cu.cc
@@ -36,7 +36,7 @@ __global__ void ReshapeSparseTensorKernel(
   GPU_1D_KERNEL_LOOP(sparse_index, nnz) {
     const Tindex* input_index = &input_indices[sparse_index * input_rank];
     Tindex* output_index = &output_indices[sparse_index * output_rank];
-    int64 dense_index = 0;  // int64 to avoid overflow if Tindex is int32
+    int64_t dense_index = 0;  // int64 to avoid overflow if Tindex is int32
     // Flatten input index from slowest- to fastest-changing dimension.
     for (int i = 0; i < input_rank; ++i) {
       dense_index = dense_index * input_shape[i] + input_index[i];
@@ -55,14 +55,14 @@ __global__ void ReshapeSparseTensorKernel(
 namespace functor {
 
 template <>
-Status ReshapeSparseTensorFunctor<GPUDevice>::operator()(
+absl::Status ReshapeSparseTensorFunctor<GPUDevice>::operator()(
     OpKernelContext* context, const TensorShape& input_shape,
     const TensorShape& output_shape,
     typename TTypes<int64_t>::ConstMatrix input_indices,
     typename TTypes<int64_t>::Matrix output_indices) const {
-  const int64 input_rank = input_shape.dims();
-  const int64 output_rank = output_shape.dims();
-  const int64 nnz = input_indices.dimension(0);
+  const int64_t input_rank = input_shape.dims();
+  const int64_t output_rank = output_shape.dims();
+  const int64_t nnz = input_indices.dimension(0);
   // We copy input_shape and output_shape to the GPU and then launch a kernel
   // to compute output_indices.
   Tensor input_shape_gpu_t;
@@ -75,16 +75,16 @@ Status ReshapeSparseTensorFunctor<GPUDevice>::operator()(
   auto output_shape_gpu = output_shape_gpu_t.flat<int64_t>();
   se::Stream* stream = context->op_device_context()->stream();
   if (!stream) return errors::Internal("No GPU stream available.");
-  se::DeviceMemoryBase input_shape_gpu_mem(input_shape_gpu.data(),
-                                           input_rank * sizeof(int64));
+  stream_executor::DeviceAddressBase input_shape_gpu_mem(
+      input_shape_gpu.data(), input_rank * sizeof(int64_t));
   TF_RETURN_IF_ERROR(stream->Memcpy(&input_shape_gpu_mem,
                                     input_shape.dim_sizes().data(),
-                                    input_rank * sizeof(int64)));
-  se::DeviceMemoryBase output_shape_gpu_mem(output_shape_gpu.data(),
-                                            output_rank * sizeof(int64));
+                                    input_rank * sizeof(int64_t)));
+  stream_executor::DeviceAddressBase output_shape_gpu_mem(
+      output_shape_gpu.data(), output_rank * sizeof(int64_t));
   TF_RETURN_IF_ERROR(stream->Memcpy(&output_shape_gpu_mem,
                                     output_shape.dim_sizes().data(),
-                                    output_rank * sizeof(int64)));
+                                    output_rank * sizeof(int64_t)));
   const GPUDevice& device = context->template eigen_device<GPUDevice>();
   auto config = GetGpuLaunchConfig(nnz, device);
   return GpuLaunchKernel(ReshapeSparseTensorKernel<int64_t>, config.block_count,
diff --git a/tensorflow/core/kernels/restore_v2_op_test.cc b/tensorflow/core/kernels/restore_v2_op_test.cc
index c102cc42e2063f..0a66a0f31d4366 100644
--- a/tensorflow/core/kernels/restore_v2_op_test.cc
+++ b/tensorflow/core/kernels/restore_v2_op_test.cc
@@ -61,9 +61,9 @@ class RestoreV2OpTest : public OpsTestBase {
   }
 
   void RunTest(absl::string_view save_op_to_use) {
-    const string filename =
+    const std::string filename =
         io::JoinPath(testing::TmpDir(), "tensor_simple-", save_op_to_use);
-    const std::vector<string> tensor_names = {
+    const std::vector<std::string> tensor_names = {
         "tensor_bool",  "tensor_int",    "tensor_float",     "tensor_double",
         "tensor_qint8", "tensor_qint32", "tensor_uint8",     "tensor_int8",
         "tensor_int16", "tensor_int64",  "tensor_complex64", "tensor_half"};
@@ -114,12 +114,12 @@ class RestoreV2OpTest : public OpsTestBase {
       // Input #1 is the tensor names
       Tensor input_1 = MakeInput<tstring>(
           TensorShape({static_cast<int>(tensor_names.size())}),
-          [&tensor_names](int x) -> string { return tensor_names[x]; });
+          [&tensor_names](int x) -> std::string { return tensor_names[x]; });
       inputs.push_back({nullptr, &input_1});
 
       Tensor shape_and_slices = MakeInput<tstring>(
           TensorShape({static_cast<int>(tensor_names.size())}),
-          [](int x) -> string { return "" /* saves in full */; });
+          [](int x) -> std::string { return "" /* saves in full */; });
       if (save_op_to_use != "Save") {
         inputs.push_back({nullptr, &shape_and_slices});
       }
@@ -129,8 +129,8 @@ class RestoreV2OpTest : public OpsTestBase {
                                        [](int x) -> bool { return x != 0; });
       inputs.push_back({nullptr, &input_2});
       // Input #3 is a 1-d integer tensor
-      Tensor input_3 = MakeInput<int32>(TensorShape({10}),
-                                        [](int x) -> int32 { return x + 1; });
+      Tensor input_3 = MakeInput<int32_t>(
+          TensorShape({10}), [](int x) -> int32_t { return x + 1; });
       inputs.push_back({nullptr, &input_3});
       // Input #4 is a 2-d float tensor
       Tensor input_4 = MakeInput<float>(
@@ -154,20 +154,20 @@ class RestoreV2OpTest : public OpsTestBase {
           });
       inputs.push_back({nullptr, &input_7});
       // Input #8 is a 1-d uint8 tensor
-      Tensor input_8 = MakeInput<uint8>(TensorShape({11}),
-                                        [](int x) -> uint8 { return x + 1; });
+      Tensor input_8 = MakeInput<uint8_t>(
+          TensorShape({11}), [](int x) -> uint8_t { return x + 1; });
       inputs.push_back({nullptr, &input_8});
       // Input #9 is a 1-d int8 tensor
-      Tensor input_9 = MakeInput<int8>(TensorShape({7}),
-                                       [](int x) -> int8 { return x - 7; });
+      Tensor input_9 = MakeInput<int8_t>(TensorShape({7}),
+                                         [](int x) -> int8_t { return x - 7; });
       inputs.push_back({nullptr, &input_9});
       // Input #10 is a 1-d int16 tensor
-      Tensor input_10 = MakeInput<int16>(TensorShape({7}),
-                                         [](int x) -> int16 { return x - 8; });
+      Tensor input_10 = MakeInput<int16_t>(
+          TensorShape({7}), [](int x) -> int16_t { return x - 8; });
       inputs.push_back({nullptr, &input_10});
       // Input #11 is a 1-d int64 tensor
       Tensor input_11 = MakeInput<int64_t>(
-          TensorShape({9}), [](int x) -> int64 { return x - 9; });
+          TensorShape({9}), [](int x) -> int64_t { return x - 9; });
       inputs.push_back({nullptr, &input_11});
       // Input #12 is a 1-d complex64 tensor
       Tensor input_13 = MakeInput<complex64>(
@@ -222,7 +222,7 @@ class RestoreV2OpTest : public OpsTestBase {
       TensorShape expected({10});
       EXPECT_TRUE(output->shape().IsSameSize(expected));
       for (int i = 0; i < 10; ++i) {
-        EXPECT_EQ(i + 1, output->flat<int32>()(i));
+        EXPECT_EQ(i + 1, output->flat<int32_t>()(i));
       }
     }
     // The 2-d float tensor
@@ -283,7 +283,7 @@ class RestoreV2OpTest : public OpsTestBase {
       TensorShape expected({11});
       EXPECT_TRUE(output->shape().IsSameSize(expected));
       for (int i = 0; i < 11; ++i) {
-        EXPECT_EQ(i + 1, output->flat<uint8>()(i));
+        EXPECT_EQ(i + 1, output->flat<uint8_t>()(i));
       }
     }
     // The 1-d int8 tensor
@@ -295,7 +295,7 @@ class RestoreV2OpTest : public OpsTestBase {
       TensorShape expected({7});
       EXPECT_TRUE(output->shape().IsSameSize(expected));
       for (int i = 0; i < 7; ++i) {
-        EXPECT_EQ(i - 7, output->flat<int8>()(i));
+        EXPECT_EQ(i - 7, output->flat<int8_t>()(i));
       }
     }
     // The 1-d int16 tensor
@@ -307,7 +307,7 @@ class RestoreV2OpTest : public OpsTestBase {
       TensorShape expected({7});
       EXPECT_TRUE(output->shape().IsSameSize(expected));
       for (int i = 0; i < 7; ++i) {
-        EXPECT_EQ(i - 8, output->flat<int16>()(i));
+        EXPECT_EQ(i - 8, output->flat<int16_t>()(i));
       }
     }
     // The 1-d int64 tensor
diff --git a/tensorflow/core/kernels/roll_op_gpu.cu.cc b/tensorflow/core/kernels/roll_op_gpu.cu.cc
index dca487fc060003..130bdd206b67fd 100644
--- a/tensorflow/core/kernels/roll_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/roll_op_gpu.cu.cc
@@ -30,15 +30,15 @@ typedef Eigen::GpuDevice GPUDevice;
 namespace {
 
 template <typename T>
-__global__ void RollKernel(const int32 nthreads, const int32 num_dims,
+__global__ void RollKernel(const int32_t nthreads, const int32_t num_dims,
                            const T* __restrict__ input, T* __restrict__ output,
-                           const int32* __restrict__ dim_size,
-                           const int32* __restrict__ threshold,
-                           const int64* __restrict__ dim_range) {
+                           const int32_t* __restrict__ dim_size,
+                           const int32_t* __restrict__ threshold,
+                           const int64_t* __restrict__ dim_range) {
   CUDA_1D_KERNEL_LOOP(out_idx, nthreads) {
-    int64 offset = 0;
+    int64_t offset = 0;
     for (int i = 0; i < num_dims; i++) {
-      const int64 stride = dim_range[i] / dim_size[i];
+      const int64_t stride = dim_range[i] / dim_size[i];
       const int shift = dim_size[i] - threshold[i];
       const int indx = (out_idx / stride) % dim_size[i];
       const int shifted_indx = (indx + shift) % dim_size[i];
@@ -53,21 +53,22 @@ namespace functor {
 
 template <typename T>
 struct Roll<GPUDevice, T> {
-  void operator()(const OpKernelContext* context, const int64 num_elements,
-                  const int num_dims, const gtl::ArraySlice<int32> dim_size,
+  void operator()(const OpKernelContext* context, const int64_t num_elements,
+                  const int num_dims, const absl::Span<const int32> dim_size,
                   const T* input, T* output,
-                  const gtl::ArraySlice<int32> threshold,
-                  const gtl::ArraySlice<int64_t> dim_range, const int64 isd) {
+                  const absl::Span<const int32> threshold,
+                  const absl::Span<const int64_t> dim_range,
+                  const int64_t isd) {
     if (!num_elements) return;
     const GPUDevice& d = context->eigen_device<GPUDevice>();
 
-    auto dim_bytes = sizeof(int32) * dim_size.size();
+    auto dim_bytes = sizeof(int32_t) * dim_size.size();
     auto dim_buf = d.allocate(dim_bytes);
 
-    auto thres_bytes = sizeof(int32) * threshold.size();
+    auto thres_bytes = sizeof(int32_t) * threshold.size();
     auto thres_buf = d.allocate(thres_bytes);
 
-    auto range_bytes = sizeof(int64) * dim_range.size();
+    auto range_bytes = sizeof(int64_t) * dim_range.size();
     auto range_buf = d.allocate(range_bytes);
 
     d.memcpyHostToDevice(dim_buf, dim_size.data(), dim_bytes);
@@ -76,12 +77,12 @@ struct Roll<GPUDevice, T> {
 
     GpuLaunchConfig cfg = GetGpuLaunchConfig(num_elements, d);
 
-    TF_CHECK_OK(GpuLaunchKernel(RollKernel<T>, cfg.block_count,
-                                cfg.thread_per_block, 0, d.stream(),
-                                cfg.virtual_thread_count, num_dims, input,
-                                output, reinterpret_cast<const int32*>(dim_buf),
-                                reinterpret_cast<const int32*>(thres_buf),
-                                reinterpret_cast<const int64*>(range_buf)));
+    TF_CHECK_OK(
+        GpuLaunchKernel(RollKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
+                        d.stream(), cfg.virtual_thread_count, num_dims, input,
+                        output, reinterpret_cast<const int32_t*>(dim_buf),
+                        reinterpret_cast<const int32_t*>(thres_buf),
+                        reinterpret_cast<const int64_t*>(range_buf)));
 
     d.deallocate(dim_buf);
     d.deallocate(thres_buf);
diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc
index 7e9054f997172d..f9dac8363f8f37 100644
--- a/tensorflow/core/kernels/scan_ops.cc
+++ b/tensorflow/core/kernels/scan_ops.cc
@@ -104,7 +104,7 @@ namespace functor {
   DECLARE(Eigen::internal::ProdReducer<T>, T);
 
 TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_ALL_REDUCERS);
-DECLARE_FOR_ALL_REDUCERS(int32);
+DECLARE_FOR_ALL_REDUCERS(int32_t);
 DECLARE_FOR_ALL_REDUCERS(int64_t);
 #undef DECLARE_FOR_ALL_REDUCERS
 
@@ -151,7 +151,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .HostMemory("axis"),                                           \
       ScanOp<GPUDevice, type, Eigen::internal::SumReducer<type>, int64>)
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
-REGISTER_GPU_KERNELS(int32);
+REGISTER_GPU_KERNELS(int32_t);
 REGISTER_GPU_KERNELS(int64_t);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -190,7 +190,7 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .HostMemory("axis"),                                            \
       ScanOp<GPUDevice, type, Eigen::internal::ProdReducer<type>, int64>)
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
-REGISTER_GPU_KERNELS(int32);
+REGISTER_GPU_KERNELS(int32_t);
 REGISTER_GPU_KERNELS(int64_t);
 #undef REGISTER_GPU_KERNELS
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/scatter_functor_gpu.cu.h b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
index 61868b7853e400..e4f43d51b46075 100644
--- a/tensorflow/core/kernels/scatter_functor_gpu.cu.h
+++ b/tensorflow/core/kernels/scatter_functor_gpu.cu.h
@@ -97,7 +97,7 @@ __global__ void ScatterOpCustomKernel(T* __restrict__ params,
       // Ignore indices that are out of range.
       continue;
     }
-    int64 params_i = param_first_index * update_block + (i % update_block);
+    int64_t params_i = param_first_index * update_block + (i % update_block);
     body(&params[params_i], ldg(updates + updates_i));
   }
 }
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 7d61e1aa2f257e..d5e3b2ad9eb0a9 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -1040,10 +1040,10 @@ absl::Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
 // and the GPU implementation is not. Tensor inputs to this function must be on
 // the GPU.
 template <typename T, typename Index, scatter_nd_op::UpdateOp Op>
-Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
-                        const Tensor& updates, const TensorShape& shape,
-                        Tensor* out, bool allocate,
-                        BadIndicesPolicy bad_indices_policy) {
+absl::Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
+                              const Tensor& updates, const TensorShape& shape,
+                              Tensor* out, bool allocate,
+                              BadIndicesPolicy bad_indices_policy) {
   AllocatorAttributes alloc_attr;
   alloc_attr.set_on_host(true);
   alloc_attr.set_gpu_compatible(true);
@@ -1053,7 +1053,7 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
   Tensor host_indices;
   TF_RETURN_IF_ERROR(c->allocate_temp(indices.dtype(), indices.shape(),
                                       &host_indices, alloc_attr));
-  se::DeviceMemoryBase indices_ptr(
+  stream_executor::DeviceAddressBase indices_ptr(
       const_cast<Tensor&>(indices).flat<Index>().data(),
       indices.flat<Index>().size() * sizeof(Index));
   TF_RETURN_IF_ERROR(stream->Memcpy(host_indices.flat<Index>().data(),
@@ -1063,7 +1063,7 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
   Tensor host_updates;
   TF_RETURN_IF_ERROR(c->allocate_temp(updates.dtype(), updates.shape(),
                                       &host_updates, alloc_attr));
-  se::DeviceMemoryBase updates_ptr(
+  stream_executor::DeviceAddressBase updates_ptr(
       const_cast<Tensor&>(updates).flat<T>().data(),
       updates.flat<T>().size() * sizeof(T));
   TF_RETURN_IF_ERROR(stream->Memcpy(host_updates.flat<T>().data(), updates_ptr,
@@ -1078,8 +1078,8 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
     fill(c->eigen_device<CPUDevice>(), host_out.flat<T>());
   } else {
     CHECK_NOTNULL(out);  // Crash OK
-    se::DeviceMemoryBase out_ptr(out->flat<T>().data(),
-                                 out->flat<T>().size() * sizeof(T));
+    stream_executor::DeviceAddressBase out_ptr(
+        out->flat<T>().data(), out->flat<T>().size() * sizeof(T));
     TF_RETURN_IF_ERROR(stream->Memcpy(host_out.flat<T>().data(), out_ptr,
                                       host_out.NumElements() * sizeof(T)));
   }
@@ -1090,13 +1090,13 @@ Status DoScatterNdOnCpu(OpKernelContext* c, const Tensor& indices,
       bad_indices_policy));
 
   // Copy 'host_out' to device.
-  se::DeviceMemoryBase out_ptr(out->flat<T>().data(),
-                               out->flat<T>().size() * sizeof(T));
+  stream_executor::DeviceAddressBase out_ptr(out->flat<T>().data(),
+                                             out->flat<T>().size() * sizeof(T));
   TF_RETURN_IF_ERROR(stream->Memcpy(&out_ptr, host_out.flat<T>().data(),
                                     host_out.NumElements() * sizeof(T)));
   // Block host, since 'host_out' cannot be destructed until the copy is done.
   TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
index fd1d4747c40982..ae2402b2a228e1 100644
--- a/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_gpu.cu.cc
@@ -98,7 +98,7 @@ template <typename T, typename Index, scatter_nd_op::UpdateOp op, int IXDIM>
 __global__ void ScatterNdOpKernel(
     const Index* indices, const T* updates, T* out,
     const Eigen::array<Eigen::DenseIndex, IXDIM> output_shape_prefix,
-    const Eigen::array<int64, IXDIM> batch_strides, const int64 num_indices,
+    const Eigen::array<int64_t, IXDIM> batch_strides, const int64_t num_indices,
     const Index slice_size) {
   auto update = LeftUpdate<T, op>();
 
@@ -141,7 +141,7 @@ struct ScatterNdFunctor<GPUDevice, T, Index, op, IXDIM> {
     const Eigen::DenseIndex batch_size = Tindices.dimension(0);
 
     // Index batch_strides[IXDIM];
-    Eigen::array<int64, IXDIM> batch_strides;
+    Eigen::array<int64_t, IXDIM> batch_strides;
     if (IXDIM > 0) {
       batch_strides[IXDIM - 1] = 1;
     }
diff --git a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc
index 67602db6164561..10448882a9296d 100644
--- a/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/searchsorted_op_gpu.cu.cc
@@ -60,15 +60,16 @@ __global__ void LowerBoundKernel(const T* __restrict__ sorted_inputs,
 namespace functor {
 template <typename T, typename OutType>
 struct UpperBoundFunctor<GPUDevice, T, OutType> {
-  static Status Compute(OpKernelContext* context,
-                        const typename TTypes<T, 1>::ConstTensor& sorted_inputs,
-                        const typename TTypes<T, 1>::ConstTensor& values,
-                        int batch_size, int num_inputs, int num_values,
-                        typename TTypes<OutType, 1>::Tensor* output) {
+  static absl::Status Compute(
+      OpKernelContext* context,
+      const typename TTypes<T, 1>::ConstTensor& sorted_inputs,
+      const typename TTypes<T, 1>::ConstTensor& values, int batch_size,
+      int num_inputs, int num_values,
+      typename TTypes<OutType, 1>::Tensor* output) {
     const GPUDevice& device = context->eigen_device<GPUDevice>();
     if (values.size() == 0) {
       // GetGpuLaunchConfig requires work_element_count > 0
-      return OkStatus();
+      return absl::OkStatus();
     }
     GpuLaunchConfig config = GetGpuLaunchConfig(values.size(), device);
 
@@ -77,21 +78,22 @@ struct UpperBoundFunctor<GPUDevice, T, OutType> {
         config.thread_per_block, 0, device.stream(), sorted_inputs.data(),
         batch_size, num_inputs, num_values, values.data(), output->data()));
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 
 template <typename T, typename OutType>
 struct LowerBoundFunctor<GPUDevice, T, OutType> {
-  static Status Compute(OpKernelContext* context,
-                        const typename TTypes<T, 1>::ConstTensor& sorted_inputs,
-                        const typename TTypes<T, 1>::ConstTensor& values,
-                        int batch_size, int num_inputs, int num_values,
-                        typename TTypes<OutType, 1>::Tensor* output) {
+  static absl::Status Compute(
+      OpKernelContext* context,
+      const typename TTypes<T, 1>::ConstTensor& sorted_inputs,
+      const typename TTypes<T, 1>::ConstTensor& values, int batch_size,
+      int num_inputs, int num_values,
+      typename TTypes<OutType, 1>::Tensor* output) {
     const GPUDevice& device = context->eigen_device<GPUDevice>();
     if (values.size() == 0) {
       // GetGpuLaunchConfig requires work_element_count > 0
-      return OkStatus();
+      return absl::OkStatus();
     }
     GpuLaunchConfig config = GetGpuLaunchConfig(values.size(), device);
 
@@ -100,7 +102,7 @@ struct LowerBoundFunctor<GPUDevice, T, OutType> {
         config.thread_per_block, 0, device.stream(), sorted_inputs.data(),
         batch_size, num_inputs, num_values, values.data(), output->data()));
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 }  // namespace functor
diff --git a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
index f0ba0ce2c27572..dc63e6c5602956 100644
--- a/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
+++ b/tensorflow/core/kernels/segment_reduction_ops_gpu.cu.h
@@ -165,7 +165,7 @@ __global__ void SegmentMeanNormalizeKernel(
 }
 
 template <typename SegmentId, typename Index, typename T>
-Status LaunchSegmentMeanNormalizeKernel(
+absl::Status LaunchSegmentMeanNormalizeKernel(
     const GPUDevice& d, SegmentId nsegments, Index ninner,
     const Index* __restrict__ segment_offsets,  // [nsegments + 1]
     T* __restrict__ output) {                   // [nsegments, ninner]
@@ -195,7 +195,7 @@ __global__ void SegmentSetEmptyKernel(
 }
 
 template <typename SegmentId, typename Index, typename T>
-Status LaunchSegmentSetEmptyKernel(
+absl::Status LaunchSegmentSetEmptyKernel(
     const GPUDevice& d, SegmentId nsegments, Index ninner,
     const Index* __restrict__ segment_offsets,  // [nsegments + 1]
     const T empty_value,
@@ -263,7 +263,7 @@ __global__ void SegmentOffsetsKernel(
 // value at segment_offsets[nsegments] is set to the end index of the last valid
 // ID (e.g., nsegments if all IDs are valid).
 template <typename Toffsets, typename Tsegmentids>
-Status LaunchSegmentOffsetsKernel(
+absl::Status LaunchSegmentOffsetsKernel(
     const GPUDevice& d, Toffsets size, Tsegmentids nsegments,
     const Tsegmentids* segment_ids,  // [size]
     Toffsets* segment_offsets) {     // [nsegments + 1]
@@ -397,7 +397,7 @@ __global__ void SegmentReduceVectorKernel(
 template <typename Treducevec, typename Tvec, typename Toffsets,
           typename Tindices, typename Tsegmentids, typename ReduceOp,
           typename Tinit, typename Tweights>
-Status LaunchSegmentReduceVectorKernel(
+absl::Status LaunchSegmentReduceVectorKernel(
     const GPUDevice& d, Toffsets nouter, Toffsets ninner_vec,
     Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value,
     Tinit empty_segment_value, bool is_mean, bool is_sqrtn,
@@ -467,7 +467,7 @@ __global__ void SegmentReduceEpilogueKernel(
 // be a higher-precision type than the output type Tvec (e.g., float vs. half).
 template <typename Tvec, typename Treducevec, typename Toffsets,
           typename Tsegmentids, typename Tinit>
-Status LaunchSegmentReduceEpilogueKernel(
+absl::Status LaunchSegmentReduceEpilogueKernel(
     const GPUDevice& d, Tsegmentids nsegments, Tinit empty_segment_value,
     bool is_mean, bool is_sqrtn,
     const Treducevec* output_raw,     // [nsegments]
@@ -542,7 +542,7 @@ MakeLookupAndScaleAndCastInputsIterator(const Tvec* input_vec,
 template <typename Treducevec, typename Tvec, typename Toffsets,
           typename Tindices, typename Tsegmentids, typename ReduceOp,
           typename Tinit, typename Tweights>
-Status SegmentReduceGPUImplNoInnerDim(
+absl::Status SegmentReduceGPUImplNoInnerDim(
     OpKernelContext* ctx, Toffsets nouter, Tsegmentids nsegments,
     ReduceOp reduce_op, Tinit initial_value, Tinit empty_segment_value,
     bool is_mean, bool is_sqrtn,
@@ -568,7 +568,7 @@ Status SegmentReduceGPUImplNoInnerDim(
         TensorShape({static_cast<int64_t>(nsegments * sizeof(Treducevec))}),
         &output_raw));
     output_raw_ptr =
-        reinterpret_cast<Treducevec*>(output_raw.flat<int8>().data());
+        reinterpret_cast<Treducevec*>(output_raw.flat<int8_t>().data());
   }
   auto input_iter =
       MakeLookupAndScaleAndCastInputsIterator<Treducevec, Toffsets>(
@@ -586,13 +586,13 @@ Status SegmentReduceGPUImplNoInnerDim(
         device, nsegments, empty_segment_value, is_mean, is_sqrtn,
         output_raw_ptr, segment_offsets, output_vec));
   }
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 template <typename Treducevec, typename Tvec, typename Toffsets,
           typename Tindices, typename Tsegmentids, typename ReduceOp,
           typename Tinit, typename Tweights>
-Status SegmentReduceGPUImpl(
+absl::Status SegmentReduceGPUImpl(
     OpKernelContext* ctx, Toffsets nouter, Toffsets ninner_vec,
     Tsegmentids nsegments, ReduceOp reduce_op, Tinit initial_value,
     Tinit empty_segment_value, bool is_mean, bool is_sqrtn,
@@ -648,12 +648,13 @@ struct SegmentReduceGPUVectorized {
   struct Impl {
     template <typename T, typename Toffsets, typename Tindices,
               typename Tsegmentids, typename ReduceOp, typename Tweights>
-    Status operator()(OpKernelContext* ctx, Toffsets nouter, Toffsets ninner,
-                      Tsegmentids nsegments, ReduceOp reduce_op,
-                      T initial_value, T empty_segment_value, bool is_mean,
-                      bool is_sqrtn, const T* input,
-                      const Tsegmentids* segment_ids, const Tindices* indices,
-                      const Tweights* weights, T* output) {
+    absl::Status operator()(OpKernelContext* ctx, Toffsets nouter,
+                            Toffsets ninner, Tsegmentids nsegments,
+                            ReduceOp reduce_op, T initial_value,
+                            T empty_segment_value, bool is_mean, bool is_sqrtn,
+                            const T* input, const Tsegmentids* segment_ids,
+                            const Tindices* indices, const Tweights* weights,
+                            T* output) {
       DCHECK_EQ(ninner % vec_size, 0);
       DCHECK_EQ(reinterpret_cast<std::uintptr_t>(input) % vec_size, 0);
       DCHECK_EQ(reinterpret_cast<std::uintptr_t>(output) % vec_size, 0);
@@ -682,16 +683,16 @@ struct SegmentReduceGPUVectorized {
 // Note: Treduce is to allow reducing in higher precision than T.
 template <typename Treduce, typename T, typename Toffsets, typename Tindices,
           typename Tsegmentids, typename ReduceOp, typename Tweights>
-Status SegmentReduceGPU(OpKernelContext* ctx, Toffsets nouter, Toffsets ninner,
-                        Tsegmentids nsegments, ReduceOp reduce_op,
-                        T initial_value, T empty_segment_value, bool is_mean,
-                        bool is_sqrtn,
-                        const T* input,  // [nouter or any, ninner]
-                        const Tsegmentids* segment_ids,  // [nouter]
-                        const Tindices* indices,         // [nouter] (optional)
-                        const Tweights* weights,  // [nouter or any] (optional)
-                        T* output) {              // [nsegments, ninner]
-  if (ninner == 0 || nsegments == 0) return OkStatus();
+absl::Status SegmentReduceGPU(
+    OpKernelContext* ctx, Toffsets nouter, Toffsets ninner,
+    Tsegmentids nsegments, ReduceOp reduce_op, T initial_value,
+    T empty_segment_value, bool is_mean, bool is_sqrtn,
+    const T* input,                  // [nouter or any, ninner]
+    const Tsegmentids* segment_ids,  // [nouter]
+    const Tindices* indices,         // [nouter] (optional)
+    const Tweights* weights,         // [nouter or any] (optional)
+    T* output) {                     // [nsegments, ninner]
+  if (ninner == 0 || nsegments == 0) return absl::OkStatus();
   return DispatchToVectorized<
       T, SegmentReduceGPUVectorized<Treduce>::template Impl>(
       MinAlignmentOf(input, output, ninner), ctx, nouter, ninner, nsegments,
@@ -716,7 +717,7 @@ __global__ void SegmentWeightsKernel(
 }
 
 template <typename SegmentId, typename Index, typename Tweights>
-Status LaunchSegmentWeightsKernel(
+absl::Status LaunchSegmentWeightsKernel(
     const GPUDevice& d, SegmentId nsegments,
     SparseSegmentReductionOperation operation,
     const Index* segment_offsets,  // [nsegments + 1]
@@ -945,7 +946,7 @@ struct UnsortedSegmentFunctor<GPUDevice, T, Index, InitialValueF, ReductionF> {
 };
 
 template <typename T, typename Index, typename SegmentId>
-Status SparseSegmentReductionFunctor<T, Index, SegmentId>::operator()(
+absl::Status SparseSegmentReductionFunctor<T, Index, SegmentId>::operator()(
     OpKernelContext* context, bool is_mean, bool is_sqrtn, T default_value,
     typename TTypes<T, 2>::ConstTensor input,
     typename TTypes<Index>::ConstVec indices,
@@ -1087,7 +1088,7 @@ __global__ void ScatterUniqueIndicesKernel(
 
 template <typename Toffsets, typename EdgeIndicatorIter,
           typename TindicesCompact, typename Tindices>
-Status LaunchScatterUniqueIndicesKernel(
+absl::Status LaunchScatterUniqueIndicesKernel(
     const GPUDevice& d, Toffsets nouter,
     EdgeIndicatorIter sorted_indices_edge_indicator,     // [nouter]
     const TindicesCompact* __restrict__ sorted_indices,  // [nouter]
@@ -1122,7 +1123,7 @@ struct SparseSegmentGradV2Functor<GPUDevice, T, Tindices, Tsegmentids> {
     const int64_t nouter64 = indices_vec.dimension(0);
     // Note: nouter and ninner are not expected to be huge, so we use int32 to
     // save memory bandwidth.
-    using Toffsets = int32;
+    using Toffsets = int32_t;
     OP_REQUIRES_ASYNC(context, nouter64 <= std::numeric_limits<Toffsets>::max(),
                       absl::InvalidArgumentError(
                           absl::StrCat("Indices vector of length ", nouter64,
@@ -1140,7 +1141,7 @@ struct SparseSegmentGradV2Functor<GPUDevice, T, Tindices, Tsegmentids> {
     // worth it because the vector is used multiple times).
     // Note that we can currently assume int32 is safe because the op's dense
     // output_dim0 input is always int32.
-    using TindicesCompact = int32;
+    using TindicesCompact = int32_t;
     Tensor tmp_indices_internal;
     const TindicesCompact* indices_internal_ptr;
     if constexpr (std::is_same<Tindices, TindicesCompact>::value) {
@@ -1163,9 +1164,9 @@ struct SparseSegmentGradV2Functor<GPUDevice, T, Tindices, Tsegmentids> {
           context, operation, nouter, ninner, nsegments, input_flat.data(),
           tmp_indices_internal, indices_internal_ptr, segment_vec,
           dense_output_shape, done);
-    } else if (sizeof(Tsegmentids) > sizeof(int32) &&
-               nsegments <= std::numeric_limits<int32>::max()) {
-      CastSegmentIdsThenImpl<Toffsets, TindicesCompact, int32>(
+    } else if (sizeof(Tsegmentids) > sizeof(int32_t) &&
+               nsegments <= std::numeric_limits<int32_t>::max()) {
+      CastSegmentIdsThenImpl<Toffsets, TindicesCompact, int32_t>(
           context, operation, nouter, ninner, nsegments, input_flat.data(),
           tmp_indices_internal, indices_internal_ptr, segment_vec,
           dense_output_shape, done);
@@ -1295,12 +1296,13 @@ struct SparseSegmentGradV2Functor<GPUDevice, T, Tindices, Tsegmentids> {
     ScratchSpace<Toffsets> last_idx_host(context, 1, /*on_host=*/true);
     OP_REQUIRES_OK_ASYNC(
         context,
-        stream->Memcpy(last_idx_host.mutable_data(),
-                       se::DeviceMemoryBase(const_cast<Toffsets*>(
-                                                sorted_indices_unique_ids_ptr) +
-                                                (nouter - 1),
-                                            sizeof(*last_idx_host.data())),
-                       sizeof(*last_idx_host.data())),
+        stream->Memcpy(
+            last_idx_host.mutable_data(),
+            stream_executor::DeviceAddressBase(
+                const_cast<Toffsets*>(sorted_indices_unique_ids_ptr) +
+                    (nouter - 1),
+                sizeof(*last_idx_host.data())),
+            sizeof(*last_idx_host.data())),
         done);
 
     auto async_finish_computation =
diff --git a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
index bdeb782dc47e49..4bc9c22b33bb00 100644
--- a/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetobatch_functor_gpu.cu.cc
@@ -31,11 +31,11 @@ typedef Eigen::GpuDevice GPUDevice;
 // GPU kernel.
 template <int NUM_BLOCK_DIMS>
 struct S2BParameters {
-  int32 space_tensor_batch;
-  int32 batch_tensor_shape[NUM_BLOCK_DIMS + 2];
-  int32 space_tensor_spatial_shape[NUM_BLOCK_DIMS];
-  int32 pad_start[NUM_BLOCK_DIMS];
-  int32 block_shape[NUM_BLOCK_DIMS];
+  int32_t space_tensor_batch;
+  int32_t batch_tensor_shape[NUM_BLOCK_DIMS + 2];
+  int32_t space_tensor_spatial_shape[NUM_BLOCK_DIMS];
+  int32_t pad_start[NUM_BLOCK_DIMS];
+  int32_t block_shape[NUM_BLOCK_DIMS];
 };
 
 // GPU kernel for space-to-batch (if B2S = false) and batch-to-space conversion
@@ -44,13 +44,13 @@ struct S2BParameters {
 // To simplify template implementation given lack of constexpr if, both the
 // input and output pointers are non-const.
 template <typename T, int NUM_BLOCK_DIMS, bool B2S>
-__global__ void S2B(const int32 nthreads, T* __restrict__ space_tensor_ptr,
+__global__ void S2B(const int32_t nthreads, T* __restrict__ space_tensor_ptr,
                     S2BParameters<NUM_BLOCK_DIMS> args,
                     T* __restrict__ batch_tensor_ptr) {
   GPU_1D_KERNEL_LOOP(batch_tensor_idx, nthreads) {
-    int32 remaining_batch_tensor_idx = batch_tensor_idx;
+    int32_t remaining_batch_tensor_idx = batch_tensor_idx;
 
-    int32 batch_tensor_pos[NUM_BLOCK_DIMS + 2];
+    int32_t batch_tensor_pos[NUM_BLOCK_DIMS + 2];
 
     for (int dim = NUM_BLOCK_DIMS + 1; dim >= 1; --dim) {
       batch_tensor_pos[dim] =
@@ -59,17 +59,17 @@ __global__ void S2B(const int32 nthreads, T* __restrict__ space_tensor_ptr,
     }
     batch_tensor_pos[0] = remaining_batch_tensor_idx;
 
-    int32 remaining_block_idx = batch_tensor_pos[0] / args.space_tensor_batch;
-    int32 space_tensor_idx = batch_tensor_pos[NUM_BLOCK_DIMS + 1];
-    int32 space_tensor_stride = args.batch_tensor_shape[NUM_BLOCK_DIMS + 1];
-    const int32 space_tensor_batch_pos =
+    int32_t remaining_block_idx = batch_tensor_pos[0] / args.space_tensor_batch;
+    int32_t space_tensor_idx = batch_tensor_pos[NUM_BLOCK_DIMS + 1];
+    int32_t space_tensor_stride = args.batch_tensor_shape[NUM_BLOCK_DIMS + 1];
+    const int32_t space_tensor_batch_pos =
         batch_tensor_pos[0] % args.space_tensor_batch;
     for (int block_dim = NUM_BLOCK_DIMS - 1; block_dim >= 0; --block_dim) {
-      int32 offset = remaining_block_idx;
+      int32_t offset = remaining_block_idx;
       if (block_dim > 0) {
         offset %= args.block_shape[block_dim];
       }
-      int32 space_tensor_pos =
+      int32_t space_tensor_pos =
           batch_tensor_pos[block_dim + 1] * args.block_shape[block_dim] +
           offset - args.pad_start[block_dim];
       if (space_tensor_pos < 0 ||
@@ -102,45 +102,45 @@ template <typename T, int NUM_BLOCK_DIMS, bool B2S>
 struct SpaceToBatchFunctor<GPUDevice, T, NUM_BLOCK_DIMS, B2S> {
   using SpaceT = typename std::conditional<B2S, T, const T>::type;
   using BatchT = typename std::conditional<B2S, const T, T>::type;
-  Status operator()(
+  absl::Status operator()(
       const GPUDevice& d,
       typename TTypes<SpaceT, NUM_BLOCK_DIMS + 2>::Tensor space_tensor,
-      const int64 block_shape[NUM_BLOCK_DIMS],
-      const int64 paddings[NUM_BLOCK_DIMS * 2],
+      const int64_t block_shape[NUM_BLOCK_DIMS],
+      const int64_t paddings[NUM_BLOCK_DIMS * 2],
       typename TTypes<BatchT, NUM_BLOCK_DIMS + 2>::Tensor batch_tensor) {
     // Kernel execution fails if number of elements is zero.
     if (batch_tensor.size() == 0) {
-      return OkStatus();
+      return absl::OkStatus();
     }
     S2BParameters<NUM_BLOCK_DIMS> args;
     args.space_tensor_batch = space_tensor.dimension(0);
     for (int block_dim = 0; block_dim < NUM_BLOCK_DIMS; ++block_dim) {
-      if (block_shape[block_dim] > std::numeric_limits<int32>::max()) {
+      if (block_shape[block_dim] > std::numeric_limits<int32_t>::max()) {
         return errors::InvalidArgument("block_shape value exceeds 2^32-1");
       }
       args.block_shape[block_dim] = block_shape[block_dim];
       if (space_tensor.dimension(block_dim + 1) >
-          std::numeric_limits<int32>::max()) {
+          std::numeric_limits<int32_t>::max()) {
         return errors::InvalidArgument("space_tensor dimension exceeds 2^32-1");
       }
       args.space_tensor_spatial_shape[block_dim] =
           space_tensor.dimension(block_dim + 1);
-      if (paddings[block_dim * 2] > std::numeric_limits<int32>::max()) {
+      if (paddings[block_dim * 2] > std::numeric_limits<int32_t>::max()) {
         return errors::InvalidArgument("paddings/crops value exceeds 2^32-1");
       }
       args.pad_start[block_dim] = paddings[block_dim * 2];
     }
-    int64 total_count = 1;
+    int64_t total_count = 1;
     for (int dim = 0; dim < NUM_BLOCK_DIMS + 2; ++dim) {
       args.batch_tensor_shape[dim] = batch_tensor.dimension(dim);
       total_count *= args.batch_tensor_shape[dim];
     }
-    if (total_count > std::numeric_limits<int32>::max()) {
+    if (total_count > std::numeric_limits<int32_t>::max()) {
       return errors::InvalidArgument(
           "number of batch_tensor elements exceeds 2^32-1");
     }
     GpuLaunchConfig config =
-        GetGpuLaunchConfig(static_cast<int32>(total_count), d);
+        GetGpuLaunchConfig(static_cast<int32_t>(total_count), d);
     return GpuLaunchKernel(S2B<T, NUM_BLOCK_DIMS, B2S>, config.block_count,
                            config.thread_per_block, 0, d.stream(),
                            config.virtual_thread_count,
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index 8bb9474ca9b524..97acca5442890d 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -29,7 +29,7 @@ typedef Eigen::GpuDevice GPUDevice;
 // Space2Depth kernel for FORMAT_NHWC.
 // See 'spacetodepth_op.h' for a more detailed description.
 template <typename dtype>
-__global__ void S2D_NHWC(const int32 nthreads,
+__global__ void S2D_NHWC(const int32_t nthreads,
                          const dtype* __restrict__ input_ptr,
                          const int block_size, const int batch_size,
                          const int input_height, const int input_width,
@@ -61,7 +61,7 @@ __global__ void S2D_NHWC(const int32 nthreads,
 // Space2Depth kernel for FORMAT_NCHW.
 // See 'spacetodepth_op.h' for a more detailed description.
 template <typename dtype>
-__global__ void S2D_NCHW(const int32 nthreads,
+__global__ void S2D_NCHW(const int32_t nthreads,
                          const dtype* __restrict__ input_ptr,
                          const int block_size, const int output_width,
                          const int input_depth_by_output_height,
@@ -99,7 +99,7 @@ __global__ void S2D_NCHW(const int32 nthreads,
 // Space2Depth kernel for FORMAT_NCHW using a loop over block area.
 // See 'spacetodepth_op.h' for functional specification.
 template <typename dtype, int block_size>
-__global__ void S2D_NCHW_LOOP(const int32 nthreads,
+__global__ void S2D_NCHW_LOOP(const int32_t nthreads,
                               const dtype* __restrict__ input,
                               const int output_width, const int input_width,
                               const int input_depth_by_output_area,

From e5a57dc540b368b2fe9b939af70a15534b343a21 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 04:36:57 -0800
Subject: [PATCH 615/753] Automated Code Change

PiperOrigin-RevId: 847082744
---
 .../core/util/autotune_maps/conv_autotune_maps.h     |  4 ++--
 .../core/util/autotune_maps/conv_parameters.cc       |  8 ++++----
 tensorflow/core/util/autotune_maps/conv_parameters.h | 12 ++++++------
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/util/autotune_maps/conv_autotune_maps.h b/tensorflow/core/util/autotune_maps/conv_autotune_maps.h
index 7c00348adfe1ba..ebf542b2afbd75 100644
--- a/tensorflow/core/util/autotune_maps/conv_autotune_maps.h
+++ b/tensorflow/core/util/autotune_maps/conv_autotune_maps.h
@@ -39,7 +39,7 @@ namespace tensorflow {
 
 // A dummy type to group forward convolution autotune results together.
 struct ConvAutotuneGroup {
-  static string name() { return "Conv"; }
+  static std::string name() { return "Conv"; }
 };
 
 using ConvAutotuneMap = AutotuneSingleton<ConvAutotuneGroup, ConvParameters,
@@ -47,7 +47,7 @@ using ConvAutotuneMap = AutotuneSingleton<ConvAutotuneGroup, ConvParameters,
 
 // A dummy type to group fused convolution autotune results together.
 struct ConvFusedAutotuneGroup {
-  static string name() { return "FusedConv"; }
+  static std::string name() { return "FusedConv"; }
 };
 
 using FusedConvAutotuneMap =
diff --git a/tensorflow/core/util/autotune_maps/conv_parameters.cc b/tensorflow/core/util/autotune_maps/conv_parameters.cc
index 3ef5626eeb8d61..be47f880c299be 100644
--- a/tensorflow/core/util/autotune_maps/conv_parameters.cc
+++ b/tensorflow/core/util/autotune_maps/conv_parameters.cc
@@ -31,11 +31,11 @@ namespace tensorflow {
 namespace {
 using ::tsl::protobuf::util::MessageDifferencer;
 
-uint64 ComputeHash(int device_id, const ConvParametersProto& proto) {
+uint64_t ComputeHash(int device_id, const ConvParametersProto& proto) {
   return Hash64Combine(device_id, tsl::DeterministicProtoHash64(proto));
 }
 
-uint64 ComputeHash(int device_id, const MatmulParametersProto& proto) {
+uint64_t ComputeHash(int device_id, const MatmulParametersProto& proto) {
   return Hash64Combine(device_id, tsl::DeterministicProtoHash64(proto));
 }
 }  // namespace
@@ -99,7 +99,7 @@ bool ConvParameters::operator==(const ConvParameters& other) const {
          MessageDifferencer::Equals(this->proto_, other.proto_);
 }
 
-string ConvParameters::ToString() const { return proto_.DebugString(); }
+std::string ConvParameters::ToString() const { return proto_.DebugString(); }
 
 MatmulParameters::MatmulParameters(
     se::StreamExecutor* stream_exec, DataType ab_dtype, DataType c_dtype,
@@ -137,7 +137,7 @@ bool MatmulParameters::operator==(const MatmulParameters& other) const {
          MessageDifferencer::Equals(this->proto_, other.proto_);
 }
 
-string MatmulParameters::ToString() const { return proto_.DebugString(); }
+std::string MatmulParameters::ToString() const { return proto_.DebugString(); }
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/autotune_maps/conv_parameters.h b/tensorflow/core/util/autotune_maps/conv_parameters.h
index b213dba9298dd3..12da493f4a59f4 100644
--- a/tensorflow/core/util/autotune_maps/conv_parameters.h
+++ b/tensorflow/core/util/autotune_maps/conv_parameters.h
@@ -90,16 +90,16 @@ class ConvParameters {
   bool operator!=(const ConvParameters& other) const {
     return !(*this == other);
   }
-  uint64 hash() const { return hash_code_; }
+  uint64_t hash() const { return hash_code_; }
 
-  string ToString() const;
+  std::string ToString() const;
 
   const ConvParametersProto& proto() const { return proto_; }
 
  private:
   int device_id_;
   ConvParametersProto proto_;
-  uint64 hash_code_;
+  uint64_t hash_code_;
 };
 
 class MatmulParameters {
@@ -127,16 +127,16 @@ class MatmulParameters {
   bool operator!=(const MatmulParameters& other) const {
     return !(*this == other);
   }
-  uint64 hash() const { return hash_code_; }
+  uint64_t hash() const { return hash_code_; }
 
-  string ToString() const;
+  std::string ToString() const;
 
   const MatmulParametersProto& proto() const { return proto_; }
 
  private:
   int device_id_;
   MatmulParametersProto proto_;
-  uint64 hash_code_;
+  uint64_t hash_code_;
 };
 
 }  // namespace tensorflow

From 4c4373fe645c803b5651e7fe6ad141839acc272f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 05:08:08 -0800
Subject: [PATCH 616/753] Automated Code Change

PiperOrigin-RevId: 847088166
---
 tensorflow/core/util/tensor_bundle/naming.cc  |  10 +-
 tensorflow/core/util/tensor_bundle/naming.h   |   6 +-
 .../util/tensor_bundle/tensor_bundle_test.cc  | 147 +++++++++---------
 3 files changed, 83 insertions(+), 80 deletions(-)

diff --git a/tensorflow/core/util/tensor_bundle/naming.cc b/tensorflow/core/util/tensor_bundle/naming.cc
index d59f12cd856148..fc5ab0b624754e 100644
--- a/tensorflow/core/util/tensor_bundle/naming.cc
+++ b/tensorflow/core/util/tensor_bundle/naming.cc
@@ -24,16 +24,16 @@ limitations under the License.
 
 namespace tensorflow {
 
-string MetaFilename(absl::string_view prefix) {
-  return strings::Printf("%.*s.index", static_cast<int>(prefix.size()),
+std::string MetaFilename(absl::string_view prefix) {
+  return absl::StrFormat("%.*s.index", static_cast<int>(prefix.size()),
                          prefix.data());
 }
 
-string DataFilename(absl::string_view prefix, int32_t shard_id,
-                    int32_t num_shards) {
+std::string DataFilename(absl::string_view prefix, int32_t shard_id,
+                         int32_t num_shards) {
   DCHECK_GT(num_shards, 0);
   DCHECK_LT(shard_id, num_shards);
-  return strings::Printf("%.*s.data-%05d-of-%05d",
+  return absl::StrFormat("%.*s.data-%05d-of-%05d",
                          static_cast<int>(prefix.size()), prefix.data(),
                          shard_id, num_shards);
 }
diff --git a/tensorflow/core/util/tensor_bundle/naming.h b/tensorflow/core/util/tensor_bundle/naming.h
index c98abac755102a..3acd5dcdd9bbe8 100644
--- a/tensorflow/core/util/tensor_bundle/naming.h
+++ b/tensorflow/core/util/tensor_bundle/naming.h
@@ -40,9 +40,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-string MetaFilename(absl::string_view prefix);
-string DataFilename(absl::string_view prefix, int32_t shard_id,
-                    int32_t num_shards);
+std::string MetaFilename(absl::string_view prefix);
+std::string DataFilename(absl::string_view prefix, int32_t shard_id,
+                         int32_t num_shards);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
index d25c6018e5beb9..592583c1acb2de 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle_test.cc
@@ -50,13 +50,13 @@ using ::testing::ElementsAre;
 namespace {
 
 // Prepend the current test case's working temporary directory to <prefix>
-string Prefix(const string& prefix) {
+std::string Prefix(const std::string& prefix) {
   return absl::StrCat(testing::TmpDir(), "/", prefix);
 }
 
 // Construct a data input directory by prepending the test data root
 // directory to <prefix>
-string TestdataPrefix(const string& prefix) {
+std::string TestdataPrefix(const std::string& prefix) {
   return absl::StrCat(testing::TensorFlowSrcRoot(),
                       "/core/util/tensor_bundle/testdata/", prefix);
 }
@@ -87,7 +87,7 @@ Tensor ByteSwap(Tensor t) {
 // Assert that <reader> has a tensor under <key> matching <expected_val> in
 // terms of both shape, dtype, and value
 template <typename T>
-void Expect(BundleReader* reader, const string& key,
+void Expect(BundleReader* reader, const std::string& key,
             const Tensor& expected_val) {
   // Tests for Contains().
   EXPECT_TRUE(reader->Contains(key));
@@ -104,7 +104,7 @@ void Expect(BundleReader* reader, const string& key,
 }
 
 template <class T>
-void ExpectVariant(BundleReader* reader, const string& key,
+void ExpectVariant(BundleReader* reader, const std::string& key,
                    const Tensor& expected_t) {
   // Tests for Contains().
   EXPECT_TRUE(reader->Contains(key));
@@ -137,8 +137,8 @@ void ExpectNext(BundleReader* reader, const Tensor& expected_val) {
   test::ExpectTensorEqual<T>(val, expected_val);
 }
 
-std::vector<string> AllTensorKeys(BundleReader* reader) {
-  std::vector<string> ret;
+std::vector<std::string> AllTensorKeys(BundleReader* reader) {
+  std::vector<std::string> ret;
   reader->Seek(kHeaderEntryKey);
   reader->Next();
   for (; reader->Valid(); reader->Next()) {
@@ -149,9 +149,9 @@ std::vector<string> AllTensorKeys(BundleReader* reader) {
 
 // Writes out the metadata file of a bundle again, with the endianness marker
 // bit flipped.
-absl::Status FlipEndiannessBit(const string& prefix) {
+absl::Status FlipEndiannessBit(const std::string& prefix) {
   Env* env = Env::Default();
-  const string metadata_tmp_path = Prefix("some_tmp_path");
+  const std::string metadata_tmp_path = Prefix("some_tmp_path");
   std::unique_ptr<WritableFile> metadata_file;
   TF_RETURN_IF_ERROR(env->NewWritableFile(metadata_tmp_path, &metadata_file));
   // We create the builder lazily in case we run into an exception earlier, in
@@ -161,8 +161,8 @@ absl::Status FlipEndiannessBit(const string& prefix) {
 
   // Reads the existing metadata file, and fills the builder.
   {
-    const string filename = MetaFilename(prefix);
-    uint64 file_size;
+    const std::string filename = MetaFilename(prefix);
+    uint64_t file_size;
     TF_RETURN_IF_ERROR(env->GetFileSize(filename, &file_size));
     std::unique_ptr<RandomAccessFile> file;
     TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
@@ -213,7 +213,7 @@ void TestBasic() {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
+        std::vector<std::string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
     Expect<T>(&reader, "foo_000", Constant_2x3(T(0)));
     Expect<T>(&reader, "foo_001", Constant_2x3(T(1)));
     Expect<T>(&reader, "foo_002", Constant_2x3(T(2)));
@@ -243,7 +243,7 @@ void TestBasic() {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"bar_000", "bar_001", "bar_002", "bar_003"}));
+        std::vector<std::string>({"bar_000", "bar_001", "bar_002", "bar_003"}));
     Expect<T>(&reader, "bar_003", Constant_2x3(T(3)));
     Expect<T>(&reader, "bar_002", Constant_2x3(T(2)));
     Expect<T>(&reader, "bar_001", Constant_2x3(T(1)));
@@ -267,8 +267,8 @@ void TestBasic() {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"bar_000", "bar_001", "bar_002", "bar_003",
-                             "foo_000", "foo_001", "foo_002", "foo_003"}));
+        std::vector<std::string>({"bar_000", "bar_001", "bar_002", "bar_003",
+                                  "foo_000", "foo_001", "foo_002", "foo_003"}));
     Expect<T>(&reader, "bar_000", Constant_2x3(T(0)));
     Expect<T>(&reader, "bar_001", Constant_2x3(T(1)));
     Expect<T>(&reader, "bar_002", Constant_2x3(T(2)));
@@ -361,8 +361,8 @@ TEST(TensorBundleTest, SwapBytes) {
 
   // 64-bit types
   // Cast to uint64*/int64* to make DataTypeToEnum<T> happy
-  TestByteSwap(reinterpret_cast<const uint64*>(forward_64),
-               reinterpret_cast<const uint64*>(swapped_64), arr_len_64);
+  TestByteSwap(reinterpret_cast<const uint64_t*>(forward_64),
+               reinterpret_cast<const uint64_t*>(swapped_64), arr_len_64);
   TestByteSwap(reinterpret_cast<const int64_t*>(forward_64),
                reinterpret_cast<const int64_t*>(swapped_64), arr_len_64);
   TestByteSwap(reinterpret_cast<const double*>(forward_64),
@@ -413,7 +413,7 @@ void TestEndianness() {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
+        std::vector<std::string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
     Expect<T>(&reader, "foo_000", Constant_2x3<T>(T(0)));
     Expect<T>(&reader, "foo_001", Constant_2x3<T>(T(1)));
     Expect<T>(&reader, "foo_002", Constant_2x3<T>(T(2)));
@@ -444,7 +444,7 @@ void TestEndianness() {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"bar_000", "bar_001", "bar_002", "bar_003"}));
+        std::vector<std::string>({"bar_000", "bar_001", "bar_002", "bar_003"}));
     Expect<T>(&reader, "bar_003", Constant_2x3<T>(T(3)));
     Expect<T>(&reader, "bar_002", Constant_2x3<T>(T(2)));
     Expect<T>(&reader, "bar_001", Constant_2x3<T>(T(1)));
@@ -468,8 +468,8 @@ void TestEndianness() {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"bar_000", "bar_001", "bar_002", "bar_003",
-                             "foo_000", "foo_001", "foo_002", "foo_003"}));
+        std::vector<std::string>({"bar_000", "bar_001", "bar_002", "bar_003",
+                                  "foo_000", "foo_001", "foo_002", "foo_003"}));
     Expect<T>(&reader, "bar_000", Constant_2x3<T>(T(0)));
     Expect<T>(&reader, "bar_001", Constant_2x3<T>(T(1)));
     Expect<T>(&reader, "bar_002", Constant_2x3<T>(T(2)));
@@ -519,7 +519,7 @@ void TestNonStandardShapes() {
 
 // Writes a bundle to disk with a bad "version"; checks for "expected_error".
 void VersionTest(const VersionDef& version, absl::string_view expected_error) {
-  const string path = Prefix("version_test");
+  const std::string path = Prefix("version_test");
   {
     // Prepare an empty bundle with the given version information.
     BundleHeaderProto header;
@@ -543,10 +543,10 @@ void VersionTest(const VersionDef& version, absl::string_view expected_error) {
 TEST(TensorBundleTest, Basic) {
   TestBasic<float>();
   TestBasic<double>();
-  TestBasic<int32>();
-  TestBasic<uint8>();
-  TestBasic<int16>();
-  TestBasic<int8>();
+  TestBasic<int32_t>();
+  TestBasic<uint8_t>();
+  TestBasic<int16_t>();
+  TestBasic<int8_t>();
   TestBasic<complex64>();
   TestBasic<complex128>();
   TestBasic<int64_t>();
@@ -560,10 +560,10 @@ TEST(TensorBundleTest, Basic) {
 TEST(TensorBundleTest, Endianness) {
   TestEndianness<float>();
   TestEndianness<double>();
-  TestEndianness<int32>();
-  TestEndianness<uint8>();
-  TestEndianness<int16>();
-  TestEndianness<int8>();
+  TestEndianness<int32_t>();
+  TestEndianness<uint8_t>();
+  TestEndianness<int16_t>();
+  TestEndianness<int8_t>();
   TestEndianness<complex64>();
   TestEndianness<complex128>();
   TestEndianness<int64_t>();
@@ -704,10 +704,10 @@ TEST(TensorBundleTest, EquivalentSliceTest) {
 TEST(TensorBundleTest, NonStandardShapes) {
   TestNonStandardShapes<float>();
   TestNonStandardShapes<double>();
-  TestNonStandardShapes<int32>();
-  TestNonStandardShapes<uint8>();
-  TestNonStandardShapes<int16>();
-  TestNonStandardShapes<int8>();
+  TestNonStandardShapes<int32_t>();
+  TestNonStandardShapes<uint8_t>();
+  TestNonStandardShapes<int16_t>();
+  TestNonStandardShapes<int8_t>();
   TestNonStandardShapes<complex64>();
   TestNonStandardShapes<complex128>();
   TestNonStandardShapes<int64_t>();
@@ -723,15 +723,16 @@ TEST(TensorBundleTest, StringTensorsOldFormat) {
   // varint32s to store string lengths (we now use varint64s).
   BundleReader reader(Env::Default(), TestdataPrefix("old_string_tensors/foo"));
   TF_ASSERT_OK(reader.status());
-  EXPECT_EQ(AllTensorKeys(&reader),
-            std::vector<string>({"floats", "scalar", "string_tensor", "strs"}));
+  EXPECT_EQ(
+      AllTensorKeys(&reader),
+      std::vector<std::string>({"floats", "scalar", "string_tensor", "strs"}));
 
   Expect<tstring>(&reader, "string_tensor",
                   Tensor(DT_STRING, TensorShape({1})));
   Expect<tstring>(&reader, "scalar", test::AsTensor<tstring>({"hello"}));
   Expect<tstring>(
       &reader, "strs",
-      test::AsTensor<tstring>({"hello", "", "x01", string(1 << 10, 'c')}));
+      test::AsTensor<tstring>({"hello", "", "x01", std::string(1 << 10, 'c')}));
   Expect<float>(&reader, "floats", Constant_2x3<float>(16.18));
 }
 
@@ -758,8 +759,8 @@ TEST(TensorBundleTest, StringTensors) {
                             Tensor(DT_STRING, TensorShape({1}))));  // Empty.
     TF_EXPECT_OK(writer.Add("scalar", test::AsTensor<tstring>({"hello"})));
     TF_EXPECT_OK(writer.Add(
-        "strs",
-        test::AsTensor<tstring>({"hello", "", "x01", string(1 << 25, 'c')})));
+        "strs", test::AsTensor<tstring>(
+                    {"hello", "", "x01", std::string(1 << 25, 'c')})));
 
     // Requires a 64-bit length.
     tstring* backing_string = long_string_tensor.flat<tstring>().data();
@@ -775,15 +776,15 @@ TEST(TensorBundleTest, StringTensors) {
     BundleReader reader(Env::Default(), Prefix("foo"));
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(AllTensorKeys(&reader),
-              std::vector<string>({"floats", "long_scalar", "scalar",
-                                   "string_tensor", "strs"}));
+              std::vector<std::string>({"floats", "long_scalar", "scalar",
+                                        "string_tensor", "strs"}));
 
     Expect<tstring>(&reader, "string_tensor",
                     Tensor(DT_STRING, TensorShape({1})));
     Expect<tstring>(&reader, "scalar", test::AsTensor<tstring>({"hello"}));
-    Expect<tstring>(
-        &reader, "strs",
-        test::AsTensor<tstring>({"hello", "", "x01", string(1 << 25, 'c')}));
+    Expect<tstring>(&reader, "strs",
+                    test::AsTensor<tstring>(
+                        {"hello", "", "x01", std::string(1 << 25, 'c')}));
 
     Expect<float>(&reader, "floats", Constant_2x3<float>(16.18));
 
@@ -825,10 +826,10 @@ TEST(TensorBundleTest, StringTensors) {
 class VariantObject {
  public:
   VariantObject() {}
-  VariantObject(const string& metadata, int64_t value)
+  VariantObject(const std::string& metadata, int64_t value)
       : metadata_(metadata), value_(value) {}
 
-  string TypeName() const { return "TEST VariantObject"; }
+  std::string TypeName() const { return "TEST VariantObject"; }
   void Encode(VariantTensorData* data) const {
     data->set_type_name(TypeName());
     data->set_metadata(metadata_);
@@ -846,7 +847,7 @@ class VariantObject {
   bool operator==(const VariantObject other) const {
     return metadata_ == other.metadata_ && value_ == other.value_;
   }
-  string metadata_;
+  std::string metadata_;
   int64_t value_;
 };
 
@@ -874,8 +875,8 @@ TEST(TensorBundleTest, VariantTensors) {
 TEST(TensorBundleTest, DirectoryStructure) {
   Env* env = Env::Default();
   // Writes two bundles.
-  const std::vector<string> kBundlePrefixes = {Prefix("worker0"),
-                                               Prefix("worker1")};
+  const std::vector<std::string> kBundlePrefixes = {Prefix("worker0"),
+                                                    Prefix("worker1")};
   for (int i = 0; i < 2; ++i) {
     BundleWriter writer(env, kBundlePrefixes[i]);
     TF_EXPECT_OK(
@@ -884,10 +885,10 @@ TEST(TensorBundleTest, DirectoryStructure) {
   }
 
   // Ensures we have the expected files.
-  auto CheckDirFiles = [env](const string& bundle_prefix,
-                             absl::Span<const string> expected_files) {
+  auto CheckDirFiles = [env](const std::string& bundle_prefix,
+                             absl::Span<const std::string> expected_files) {
     absl::string_view dir = io::Dirname(bundle_prefix);
-    for (const string& expected_file : expected_files) {
+    for (const std::string& expected_file : expected_files) {
       TF_EXPECT_OK(env->FileExists(io::JoinPath(dir, expected_file)));
     }
   };
@@ -901,7 +902,7 @@ TEST(TensorBundleTest, DirectoryStructure) {
                 {"worker1.index", "worker1.data-00000-of-00001"});
 
   // Trivially "merge" one bundle to some other location (i.e., a renaming).
-  const string kAnotherPrefix = Prefix("another");
+  const std::string kAnotherPrefix = Prefix("another");
   TF_ASSERT_OK(MergeBundles(env, {kBundlePrefixes[0]}, kAnotherPrefix));
   CheckDirFiles(kAnotherPrefix,
                 {"another.index", "another.data-00000-of-00001"});
@@ -910,7 +911,7 @@ TEST(TensorBundleTest, DirectoryStructure) {
   //   merged.index
   //   merged.data-00000-of-00002
   //   merged.data-00001-of-00002
-  const string kMerged = Prefix("merged");
+  const std::string kMerged = Prefix("merged");
   TF_ASSERT_OK(
       MergeBundles(env, {kAnotherPrefix, kBundlePrefixes[1]}, kMerged));
   CheckDirFiles(kMerged, {"merged.index", "merged.data-00000-of-00002",
@@ -919,8 +920,8 @@ TEST(TensorBundleTest, DirectoryStructure) {
 
 TEST(TensorBundleTest, SortForSequentialAccess) {
   Env* env = Env::Default();
-  const std::vector<string> kBundlePrefixes = {Prefix("worker0"),
-                                               Prefix("worker1")};
+  const std::vector<std::string> kBundlePrefixes = {Prefix("worker0"),
+                                                    Prefix("worker1")};
   BundleWriter writer0(env, kBundlePrefixes[0]);
   for (int i = 0; i < 3; ++i) {
     TF_EXPECT_OK(
@@ -935,7 +936,7 @@ TEST(TensorBundleTest, SortForSequentialAccess) {
   }
   TF_ASSERT_OK(writer1.Finish());
 
-  const string kMerged = Prefix("merged");
+  const std::string kMerged = Prefix("merged");
   TF_ASSERT_OK(
       MergeBundles(env, {kBundlePrefixes[0], kBundlePrefixes[1]}, kMerged));
 
@@ -945,10 +946,11 @@ TEST(TensorBundleTest, SortForSequentialAccess) {
 
   BundleReader reader(env, kMerged);
   TF_ASSERT_OK(reader.status());
-  std::vector<string> tensor_names = {"tensor-1-0", "tensor-0-1", "tensor-1-2",
-                                      "tensor-0-0", "tensor-1-1", "tensor-0-2"};
-  TF_ASSERT_OK(reader.SortForSequentialAccess<string>(
-      tensor_names, [](const string& element) { return element; }));
+  std::vector<std::string> tensor_names = {"tensor-1-0", "tensor-0-1",
+                                           "tensor-1-2", "tensor-0-0",
+                                           "tensor-1-1", "tensor-0-2"};
+  TF_ASSERT_OK(reader.SortForSequentialAccess<std::string>(
+      tensor_names, [](const std::string& element) { return element; }));
   EXPECT_THAT(tensor_names,
               ElementsAre("tensor-0-0", "tensor-0-1", "tensor-0-2",
                           "tensor-1-2", "tensor-1-1", "tensor-1-0"));
@@ -976,11 +978,11 @@ TEST(TensorBundleTest, Error) {
 TEST(TensorBundleTest, Checksum) {
   // Randomly flips a byte in [pos_lhs, end of data file), or exactly byte
   // pos_lhs if exact_pos == True.
-  auto FlipByte = [](const string& prefix, int pos_lhs,
+  auto FlipByte = [](const std::string& prefix, int pos_lhs,
                      bool exact_pos = false) {
     DCHECK_GE(pos_lhs, 0);
-    const string& datafile = DataFilename(Prefix(prefix), 0, 1);
-    string data;
+    const std::string& datafile = DataFilename(Prefix(prefix), 0, 1);
+    std::string data;
     TF_ASSERT_OK(ReadFileToString(Env::Default(), datafile, &data));
 
     int byte_pos = 0;
@@ -995,8 +997,8 @@ TEST(TensorBundleTest, Checksum) {
     TF_ASSERT_OK(WriteStringToFile(Env::Default(), datafile, data));
   };
   // The lookup should fail with a checksum-related message.
-  auto ExpectLookupFails = [](const string& prefix, const string& key,
-                              const string& expected_msg, Tensor& val) {
+  auto ExpectLookupFails = [](const std::string& prefix, const std::string& key,
+                              const std::string& expected_msg, Tensor& val) {
     BundleReader reader(Env::Default(), Prefix(prefix));
     absl::Status status = reader.Lookup(key, &val);
     EXPECT_TRUE(absl::IsDataLoss(status));
@@ -1048,8 +1050,8 @@ TEST(TensorBundleTest, TruncatedTensorContents) {
   TF_ASSERT_OK(writer.Finish());
 
   // Truncates the data file by one byte, so that we hit EOF.
-  const string datafile = DataFilename(Prefix("end"), 0, 1);
-  string data;
+  const std::string datafile = DataFilename(Prefix("end"), 0, 1);
+  std::string data;
   TF_ASSERT_OK(ReadFileToString(env, datafile, &data));
   ASSERT_TRUE(!data.empty());
   TF_ASSERT_OK(WriteStringToFile(
@@ -1143,7 +1145,7 @@ TEST(TensorBundleTest, LargeVariableLoadingTest) {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
+        std::vector<std::string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
     Expect<float>(&reader, "foo_000", Constant_100x100<float>(0));
     Expect<float>(&reader, "foo_001", Constant_100x100<float>(1));
     Expect<float>(&reader, "foo_002", Constant_100x100<float>(2));
@@ -1220,7 +1222,8 @@ TEST(BundleCacheTest, ConcurrentGetFile) {
 class TensorBundleAlignmentTest : public ::testing::Test {
  protected:
   template <typename T>
-  void ExpectAlignment(BundleReader* reader, const string& key, int alignment) {
+  void ExpectAlignment(BundleReader* reader, const std::string& key,
+                       int alignment) {
     BundleEntryProto full_tensor_entry;
     TF_ASSERT_OK(reader->GetBundleEntryProto(key, &full_tensor_entry));
     EXPECT_EQ(0, full_tensor_entry.offset() % alignment);
@@ -1243,7 +1246,7 @@ TEST_F(TensorBundleAlignmentTest, AlignmentTest) {
     TF_ASSERT_OK(reader.status());
     EXPECT_EQ(
         AllTensorKeys(&reader),
-        std::vector<string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
+        std::vector<std::string>({"foo_000", "foo_001", "foo_002", "foo_003"}));
     Expect<float>(&reader, "foo_000", Constant_2x3<float>(0));
     Expect<float>(&reader, "foo_001", Constant_2x3<float>(1));
     Expect<float>(&reader, "foo_002", Constant_2x3<float>(2));
@@ -1298,7 +1301,7 @@ BENCHMARK(BM_BundleAlignment)->ArgPair(4096, 1048576);
 
 static void BM_BundleWriterSmallTensor(::testing::benchmark::State& state) {
   const int64_t bytes = state.range(0);
-  Tensor t = Constant(static_cast<int8>('a'), TensorShape{bytes});
+  Tensor t = Constant(static_cast<int8_t>('a'), TensorShape{bytes});
   BundleWriter writer(Env::Default(), Prefix("foo"));
   int suffix = 0;
   for (auto s : state) {
@@ -1311,7 +1314,7 @@ BENCHMARK(BM_BundleWriterSmallTensor)->Range(1, 1 << 20);
 static void BM_BundleWriterLargeTensor(::testing::benchmark::State& state) {
   const int mb = state.range(0);
   const int64_t bytes = static_cast<int64_t>(mb) * (1 << 20);
-  Tensor t = Constant(static_cast<int8>('a'), TensorShape{bytes});
+  Tensor t = Constant(static_cast<int8_t>('a'), TensorShape{bytes});
   for (auto s : state) {
     BundleWriter writer(Env::Default(), Prefix("foo"));
     TF_CHECK_OK(writer.Add("big", t));

From 44a702d0be1fcba7641fbdfa4bac4d2ea6103653 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 11:20:21 -0800
Subject: [PATCH 617/753] Reverts 6383e3632c91bdc8eccd458f699b317f03968b84

PiperOrigin-RevId: 847161450
---
 third_party/xla/xla/pjrt/BUILD             |   1 -
 third_party/xla/xla/pjrt/transpose.cc      | 359 ++++++++++-----------
 third_party/xla/xla/pjrt/transpose.h       |  36 +--
 third_party/xla/xla/pjrt/transpose_test.cc |   5 +-
 4 files changed, 173 insertions(+), 228 deletions(-)

diff --git a/third_party/xla/xla/pjrt/BUILD b/third_party/xla/xla/pjrt/BUILD
index f266bac15df8e0..a0bdf45320c23e 100644
--- a/third_party/xla/xla/pjrt/BUILD
+++ b/third_party/xla/xla/pjrt/BUILD
@@ -972,7 +972,6 @@ cc_library(
 xla_cc_test(
     name = "transpose_test",
     srcs = ["transpose_test.cc"],
-    shard_count = 10,
     deps = [
         ":transpose",
         "//xla:array",
diff --git a/third_party/xla/xla/pjrt/transpose.cc b/third_party/xla/xla/pjrt/transpose.cc
index 91aa71119141ce..c7eb090396085c 100644
--- a/third_party/xla/xla/pjrt/transpose.cc
+++ b/third_party/xla/xla/pjrt/transpose.cc
@@ -120,8 +120,9 @@ static constexpr int kMaxInnerBlockSizeBytes = 16;
 // A plan is a data structure that describes a loop nest.
 // TODO(phawkins): consider shrinking Node so it fits in a cache line.
 struct TransposePlan::Node {
-  // The loop should iterate over the index space range(0, end, inc).
+  // The loop should iterate over the index space range(start, end, inc).
   // These fields are ignored by the macrokernel.
+  int64_t start;
   int64_t end;  // For the inner loop of a memcpy loop nest, this is the size of
                 // the transfer.
   int64_t inc;  // The transpose sentinel node has inc < 0.
@@ -202,6 +203,7 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
   DVLOG(10) << "Transpose " << outer_bs_a << " " << outer_bs_b;
   DCHECK_GT(outer_bs_a, 0);
   DCHECK_GT(outer_bs_b, 0);
+  const int64_t start = node->start;
   const int64_t end = node->end;
   const int64_t stop = node->end - (node->inc - 1);
   const int64_t lda = node->lda;
@@ -215,7 +217,7 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
     const int64_t lda_block = next_node->lda;
     const int64_t ldb_block = next_node->ldb;
     int64_t i;
-    for (i = 0; i < stop; i += inc) {
+    for (i = start; i < stop; i += inc) {
       MacroKernel<T, inner_bs, transformation>(a + i * lda, lda_block,
                                                outer_bs_a, b + i * ldb,
                                                ldb_block, outer_bs_b, scratch);
@@ -279,7 +281,7 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
     // inner loops. Structurally this code is identical to the previous case,
     // but we call Transpose() recursively instead of MacroKernel().
     int64_t i;
-    for (i = 0; i < stop; i += inc) {
+    for (i = start; i < stop; i += inc) {
       Transpose<T, inner_bs, transformation>(
           a + i * lda, outer_bs_a, b + i * ldb, outer_bs_b, next_node, scratch);
     }
@@ -333,44 +335,59 @@ void Transpose(const char* __restrict a, int outer_bs_a, char* __restrict b,
 
 void TransposeConstStride1(const char* __restrict a, char* __restrict b,
                            TransposePlan::Node const* __restrict node) {
+  a += node[0].start * node[0].lda;
+  b += node[0].start * node[0].ldb;
   if (node[0].is_inner_dim_in_a) {
     int64_t num_bytes = node->end;
     std::memcpy(b, a, num_bytes);
   } else if (node[1].is_inner_dim_in_a) {
+    int64_t offset_a = node[1].start * node[1].lda;
+    int64_t offset_b = node[1].start * node[1].ldb;
     int64_t num_bytes = node[1].end;
-    for (int64_t i = 0; i < node[0].end; ++i) {
+    a += offset_a;
+    b += offset_b;
+    for (int64_t i = node[0].start; i < node[0].end; ++i) {
       std::memcpy(b, a, num_bytes);
       a += node[0].lda;
       b += node[0].ldb;
     }
     if (node[0].trailing_tile_next_node_inc) {
-      TransposeConstStride1(a, b, node + node[0].trailing_tile_next_node_inc);
+      TransposeConstStride1(a - offset_a, b - offset_b,
+                            node + node[0].trailing_tile_next_node_inc);
     }
   } else if (node[2].is_inner_dim_in_a) {
     int64_t num_bytes = node[2].end;
-    for (int64_t i = 0; i < node[0].end; ++i) {
+    int64_t offset_a1 = node[1].start * node[1].lda;
+    int64_t offset_b1 = node[1].start * node[1].ldb;
+    int64_t offset_a2 = node[2].start * node[2].lda;
+    int64_t offset_b2 = node[2].start * node[2].ldb;
+    a += offset_a1 + offset_a2;
+    b += offset_b1 + offset_b2;
+    for (int64_t i = node[0].start; i < node[0].end; ++i) {
       const char* a1 = a;
       char* b1 = b;
-      for (int64_t j = 0; j < node[1].end; ++j) {
+      for (int64_t j = node[1].start; j < node[1].end; ++j) {
         std::memcpy(b1, a1, num_bytes);
         a1 += node[1].lda;
         b1 += node[1].ldb;
       }
       if (node[1].trailing_tile_next_node_inc) {
-        TransposeConstStride1(a1, b1,
+        TransposeConstStride1(a1 - offset_a2, b1 - offset_b2,
                               &node[1] + node[1].trailing_tile_next_node_inc);
       }
       a += node[0].lda;
       b += node[0].ldb;
     }
     if (node[0].trailing_tile_next_node_inc) {
-      TransposeConstStride1(a, b, node + node[0].trailing_tile_next_node_inc);
+      TransposeConstStride1(a - offset_a1 - offset_a2,
+                            b - offset_b1 - offset_b2,
+                            node + node[0].trailing_tile_next_node_inc);
     }
   } else {
-    for (int64_t i = 0; i < node[0].end; ++i) {
-      const char* a1 = a;
-      char* b1 = b;
-      for (int64_t j = 0; j < node[1].end; ++j) {
+    for (int64_t i = node[0].start; i < node[0].end; ++i) {
+      const char* a1 = a + node[1].start * node[1].lda;
+      char* b1 = b + node[1].start * node[1].ldb;
+      for (int64_t j = node[1].start; j < node[1].end; ++j) {
         TransposeConstStride1(a1, b1, node + 2);
         a1 += node[1].lda;
         b1 += node[1].ldb;
@@ -442,49 +459,6 @@ struct uint128 {
 };
 static_assert(sizeof(uint128) == 16, "uint128 should be 16 bytes in size");
 
-void TransposePlan::ExecuteChunk(int chunk_id, const void* a, void* b) const {
-  if (num_elems_ == 0) {
-    return;
-  }
-  tsl::profiler::TraceMe traceme("Transpose::ExecuteChunk", /*level=*/2);
-
-  absl::Span<Node const> nodes = nodes_[chunk_id];
-  const char* ac = static_cast<const char*>(a) + input_offset_bytes_[chunk_id];
-  char* bc = static_cast<char*>(b) + output_offset_bytes_[chunk_id];
-
-  if (inner_kernel_is_memcpy_) {
-    DCHECK(transformation_ == Transformation::kNone);
-    // Memcpy-based plans all assume element size 1 (i.e., bytes).
-    TransposeConstStride1(ac, bc, nodes.data());
-    return;
-  }
-
-  switch (elem_size_in_bytes_) {
-    case 1:
-      ExecuteTyped<uint8_t, Transformation::kNone>(ac, bc, nodes);
-      break;
-    case 2:
-      ExecuteTyped<uint16_t, Transformation::kNone>(ac, bc, nodes);
-      break;
-    case 4:
-      if (transformation_ == Transformation::kNone) {
-        ExecuteTyped<uint32_t, Transformation::kNone>(ac, bc, nodes);
-      } else {
-        DCHECK(transformation_ == Transformation::kF64ToEf57);
-        ExecuteTyped<uint32_t, Transformation::kF64ToEf57>(ac, bc, nodes);
-      }
-      break;
-    case 8:
-      ExecuteTyped<uint64_t, Transformation::kNone>(ac, bc, nodes);
-      break;
-    case 16:
-      ExecuteTyped<uint128, Transformation::kNone>(ac, bc, nodes);
-      break;
-    default:
-      LOG(FATAL) << "Unimplemented element size " << elem_size_in_bytes_;
-  }
-}
-
 void TransposePlan::Execute(
     const void* a, void* b,
     std::optional<absl::FunctionRef<void(std::function<void(void)>)>>
@@ -494,19 +468,58 @@ void TransposePlan::Execute(
   }
   tsl::profiler::TraceMe traceme("Transpose::Execute", /*level=*/2);
 
-  if (!schedule_work || Parallelism() <= 1) {
-    for (int i = 0; i < Parallelism(); ++i) {
-      ExecuteChunk(i, a, b);
+  const char* ac = static_cast<const char*>(a);
+  char* bc = static_cast<char*>(b);
+
+  auto execute_by_type = [&](absl::Span<Node const> nodes) {
+    if (inner_kernel_is_memcpy_) {
+      DCHECK(transformation_ == Transformation::kNone);
+      // Memcpy-based plans all assume element size 1 (i.e., bytes).
+      TransposeConstStride1(ac, bc, nodes.data());
+      return;
+    }
+
+    switch (elem_size_in_bytes_) {
+      case 1:
+        ExecuteTyped<uint8_t, Transformation::kNone>(ac, bc, nodes);
+        break;
+      case 2:
+        ExecuteTyped<uint16_t, Transformation::kNone>(ac, bc, nodes);
+        break;
+      case 4:
+        if (transformation_ == Transformation::kNone) {
+          ExecuteTyped<uint32_t, Transformation::kNone>(ac, bc, nodes);
+        } else {
+          DCHECK(transformation_ == Transformation::kF64ToEf57);
+          ExecuteTyped<uint32_t, Transformation::kF64ToEf57>(ac, bc, nodes);
+        }
+        break;
+      case 8:
+        ExecuteTyped<uint64_t, Transformation::kNone>(ac, bc, nodes);
+        break;
+      case 16:
+        ExecuteTyped<uint128, Transformation::kNone>(ac, bc, nodes);
+        break;
+      default:
+        LOG(FATAL) << "Unimplemented element size " << elem_size_in_bytes_;
+    }
+  };
+
+  if (!schedule_work || nodes_.size() <= 1) {
+    for (const auto& nodes : nodes_) {
+      execute_by_type(nodes);
     }
   } else {
-    absl::BlockingCounter counter(Parallelism() - 1);
+    absl::BlockingCounter counter(nodes_.size() - 1);
     for (size_t i = 1; i < nodes_.size(); ++i) {
-      (*schedule_work)([&, i]() {
-        ExecuteChunk(i, a, b);
+      absl::Span<Node const> nodes = nodes_[i];
+      (*schedule_work)([&, nodes]() {
+        execute_by_type(nodes);
         counter.DecrementCount();
       });
     }
-    ExecuteChunk(0, a, b);
+    // Run the first chunk inline in this thread.
+    execute_by_type(nodes_[0]);
     counter.Wait();
   }
 }
@@ -587,41 +600,51 @@ bool TransposePlan::Loop::operator==(const Loop& other) const {
          lda == other.lda && ldb == other.ldb &&
          is_inner_dim_in_a == other.is_inner_dim_in_a &&
          is_inner_dim_in_b == other.is_inner_dim_in_b &&
-         parallelism == other.parallelism && start == other.start &&
-         end == other.end;
+         parallelism == other.parallelism;
 }
 
 // Helper function that builds a plan.
-void TransposePlan::BuildPlanNodes(int chunk_id,
+void TransposePlan::BuildPlanNodes(int thread_id,
                                    std::vector<TransposePlan::Node>& nodes) {
   VLOG(8) << "Before plan build: " << ToString();
   const int ndim = a_dims_.size();
   DCHECK_GT(ndim, 0);
 
-  // Use the pre-computed chunk loops which have start/end bounds already set.
-  absl::Span<const Loop> chunk_loops = chunk_loops_[chunk_id];
-
   // We build plans in a depth-first order, visiting loops from outermost to
   // innermost. We use a stack (depth-first) order to handle trailing partial
   // tiles, which we "come back to" after handling the non-trailing case.
   struct Agendum {
-    // The ID of the loop to visit in chunk_loops.
+    // The ID of the loop to visit in loop_order_.
     int loop_id;
     // The parent node ID whose trailing tile should be made to point to this
     // node.
     int parent_node_id;
 
+    // The number of parallel tasks available to run this loop and its
+    // successors.
+    int num_tasks_at_loop;
+
+    // The ID number of the current thread in the tasks at this loop.
+    int task_id_at_loop;
+
     // For which dimensions of `a` are we to visit the partial trailing tile
     // a loop that visits that tile's interior?
     absl::InlinedVector<bool, 4> partial_tiles;
   };
   std::stack<Agendum> agenda;
 
+  int total_tasks = 1;
+  for (const Loop& loop : loop_order_) {
+    total_tasks *= loop.parallelism;
+  }
+
   agenda.push(Agendum{/*loop_id=*/0, /*parent_node_id=*/-1,
+                      /*num_tasks_at_loop=*/total_tasks,
+                      /*task_id_at_loop=*/thread_id,
                       absl::InlinedVector<bool, 4>(ndim, false)});
 
   auto loop_has_trivial_iteration_space = [](const Node& node) {
-    return node.inc == node.end;
+    return node.start == 0 && node.start + node.inc == node.end;
   };
 
   while (!agenda.empty()) {
@@ -636,13 +659,14 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
           node_id - agendum.parent_node_id;
     }
 
-    if (agendum.loop_id == chunk_loops.size()) {
+    if (agendum.loop_id == loop_order_.size()) {
       // We've reached the end of the loop nest.
+      DCHECK_EQ(agendum.num_tasks_at_loop, 1);
       // Transpose loops have a sentinel node, indicated by a negative `inc`
       // value, that describes the striding of the inner transpose kernel.
       if (!inner_kernel_is_memcpy_) {
         Node node;
-        node.end = node.inc = -1;
+        node.start = node.end = node.inc = -1;
         node.lda = sentinel_lda_;
         node.ldb = sentinel_ldb_;
         nodes.push_back(node);
@@ -651,9 +675,14 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       continue;
     }
 
-    const Loop& loop = chunk_loops[agendum.loop_id];
+    const Loop& loop = loop_order_[agendum.loop_id];
     int a_dim = loop.dim_in_a;
 
+    // Compute the number of tasks for the next loop iteration.
+    int task_id_at_loop = agendum.task_id_at_loop;
+    int num_tasks_at_loop = agendum.num_tasks_at_loop / loop.parallelism;
+    int task_id_at_next_loop = task_id_at_loop % num_tasks_at_loop;
+
     Node node;
     node.lda = loop.lda;
     node.ldb = loop.ldb;
@@ -666,20 +695,22 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       node.inc = inner_block_elems_ * outer_block_elems_b_;
     }
 
+    int task_id = task_id_at_loop / num_tasks_at_loop;
+
     if (loop.tile_interior) {
       // We are visiting the tile interior of a tiled dimension.
       bool partial = agendum.partial_tiles[a_dim];
 
       int64_t size = partial ? loop.dim_size % loop.tile_size : loop.tile_size;
-      // loop.start and loop.end are in element units.
-      // Verify alignment to block boundaries.
-      CHECK(loop.start % node.inc == 0)
-          << "loop.start=" << loop.start
-          << " must be aligned to node.inc=" << node.inc;
-      node.end = std::min<int64_t>(size, loop.end) - loop.start;
+      int64_t num_iterations = CeilOfRatio(size, node.inc);
+      int64_t num_iterations_per_task =
+          CeilOfRatio<int64_t>(num_iterations, loop.parallelism);
+      node.start = std::min(size, task_id * num_iterations_per_task * node.inc);
+      node.end =
+          std::min(size, (task_id + 1) * num_iterations_per_task * node.inc);
 
       if (node.is_inner_dim_in_a && inner_kernel_is_memcpy_) {
-        node.end *= elem_size_in_bytes_;
+        node.end = (node.end - node.start) * elem_size_in_bytes_;
       }
 
       if (!loop_has_trivial_iteration_space(node) ||
@@ -689,6 +720,8 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       Agendum new_agendum;
       new_agendum.loop_id = agendum.loop_id + 1;
       new_agendum.parent_node_id = -1;
+      new_agendum.task_id_at_loop = task_id_at_next_loop;
+      new_agendum.num_tasks_at_loop = num_tasks_at_loop;
       new_agendum.partial_tiles = agendum.partial_tiles;
       agenda.push(std::move(new_agendum));
     } else {
@@ -699,16 +732,14 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
 
       // If there is a trailing partial tile as well as complete tiles, handle
       // it as a trailer on the loop over complete tiles.
-      // A chunk is responsible for the trailing tile if its loop.end covers
-      // the full dimension.
-      int64_t full_size = CeilOfRatio(loop.dim_size, loop.tile_size);
-      bool handles_trailing =
-          loop.end >= full_size && loop.start <= num_complete_tiles;
       bool has_trailing_plan_node = false;
-      if (num_complete_tiles > 0 && has_partial_tile && handles_trailing) {
+      if (num_complete_tiles > 0 && has_partial_tile &&
+          task_id == loop.parallelism - 1) {
         Agendum new_agendum;
         new_agendum.loop_id = agendum.loop_id + 1;
         new_agendum.parent_node_id = node_id;
+        new_agendum.task_id_at_loop = task_id_at_next_loop;
+        new_agendum.num_tasks_at_loop = num_tasks_at_loop;
         new_agendum.partial_tiles = agendum.partial_tiles;
         new_agendum.partial_tiles[a_dim] = true;
         agenda.push(std::move(new_agendum));
@@ -720,12 +751,18 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       // path to handle the trailing tile.
       bool partial = num_complete_tiles == 0 && has_partial_tile;
 
-      // loop.start and loop.end are in tile units.
+      // Evenly divide the loop iterations amongst the threads.
       int64_t num_tiles = partial ? 1 : num_complete_tiles;
-      node.end = std::min<int64_t>(num_tiles, loop.end) - loop.start;
+      int64_t num_iterations = CeilOfRatio(num_tiles, node.inc);
+      int64_t num_iterations_per_task =
+          CeilOfRatio<int64_t>(num_iterations, loop.parallelism);
+      node.start =
+          std::min(num_tiles, task_id * num_iterations_per_task * node.inc);
+      node.end = std::min(num_tiles,
+                          (task_id + 1) * num_iterations_per_task * node.inc);
 
       if (node.is_inner_dim_in_a && inner_kernel_is_memcpy_) {
-        node.end *= elem_size_in_bytes_;
+        node.end = (node.end - node.start) * elem_size_in_bytes_;
       }
 
       // If this loop has a trivial iteration space, drop it.
@@ -737,6 +774,8 @@ void TransposePlan::BuildPlanNodes(int chunk_id,
       Agendum new_agendum;
       new_agendum.loop_id = agendum.loop_id + 1;
       new_agendum.parent_node_id = -1;
+      new_agendum.task_id_at_loop = task_id_at_next_loop;
+      new_agendum.num_tasks_at_loop = num_tasks_at_loop;
       new_agendum.partial_tiles = agendum.partial_tiles;
       new_agendum.partial_tiles[a_dim] = partial;
       agenda.push(std::move(new_agendum));
@@ -960,10 +999,7 @@ void TransposePlan::Initialize() {
                         : ldb_[pos_stride1a_in_b];
   }
 
-  // Order to traverse dimensions, from slowest-varying to fastest-varying.
-  std::vector<Loop> loop_order;
-
-  loop_order.reserve(ndim);
+  loop_order_.reserve(ndim);
   for (int i = 0; i < ndim; ++i) {
     Loop loop;
     loop.dim_in_a = i;
@@ -981,7 +1017,7 @@ void TransposePlan::Initialize() {
     }
     loop.is_inner_dim_in_a = (loop.tile_size == 1) && (i == pos_stride1a);
     loop.is_inner_dim_in_b = (loop.tile_size == 1) && (i == pos_stride1b_in_a);
-    loop_order.push_back(loop);
+    loop_order_.push_back(loop);
 
     if (loop.tile_size > 1) {
       loop.tile_interior = true;
@@ -990,12 +1026,12 @@ void TransposePlan::Initialize() {
                              : ldb_[inverse_permutation[i]];
       loop.is_inner_dim_in_a = (i == pos_stride1a);
       loop.is_inner_dim_in_b = (i == pos_stride1b_in_a);
-      loop_order.push_back(loop);
+      loop_order_.push_back(loop);
     }
   }
 
-  RemoveTrivialLoops(loop_order);
-  CoalesceLoops(loop_order);
+  RemoveTrivialLoops(loop_order_);
+  CoalesceLoops(loop_order_);
 
   // Bound the block sizes so they are smaller than the stride-1 dimension
   // size.
@@ -1082,7 +1118,7 @@ void TransposePlan::Initialize() {
                            inner_kernel_is_memcpy_ && l.tile_interior,
                            -std::min<double>(a_stride * penalty, b_stride));
   };
-  absl::c_stable_sort(loop_order, [&](const Loop& a, const Loop& b) {
+  absl::c_stable_sort(loop_order_, [&](const Loop& a, const Loop& b) {
     return cost(a) < cost(b);
   });
   // It is a required invariant of the loop order that tile interiors always
@@ -1091,15 +1127,13 @@ void TransposePlan::Initialize() {
   // both input and output.
 
   // The stride-1 loop must be innermost for a memcpy loop.
-  DCHECK(!inner_kernel_is_memcpy_ || loop_order.back().is_inner_dim_in_a)
+  DCHECK(!inner_kernel_is_memcpy_ || loop_order_.back().is_inner_dim_in_a)
       << ToString();
 
-  int num_chunks = ChooseParallelizationStrategy(loop_order);
-  PartitionLoops(num_chunks, loop_order, chunk_loops_, input_offset_bytes_,
-                 output_offset_bytes_);
-  nodes_.resize(num_chunks);
-  for (int chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
-    BuildPlanNodes(chunk_id, nodes_[chunk_id]);
+  int num_threads = ChooseParallelizationStrategy();
+  nodes_.resize(num_threads);
+  for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
+    BuildPlanNodes(thread_id, nodes_[thread_id]);
   }
 
   switch (transformation_) {
@@ -1114,8 +1148,7 @@ void TransposePlan::Initialize() {
   }
 }
 
-int TransposePlan::ChooseParallelizationStrategy(
-    std::vector<Loop>& loop_order) {
+int TransposePlan::ChooseParallelizationStrategy() {
   int available_parallelism = num_threads_requested_;
 
   // Compute the number of iterations in `loop`.
@@ -1137,14 +1170,14 @@ int TransposePlan::ChooseParallelizationStrategy(
   };
 
   // Estimate the number of bytes each iteration of each loop processes.
-  absl::InlinedVector<int64_t, 4> work_in_bytes(loop_order.size());
+  absl::InlinedVector<int64_t, 4> work_in_bytes(loop_order_.size());
   int64_t acc = elem_size_in_bytes_;
   if (!inner_kernel_is_memcpy_) {
     acc *= inner_block_elems_ * inner_block_elems_ * outer_block_elems_a_ *
            outer_block_elems_b_;
   }
   auto work_it = work_in_bytes.rbegin();
-  for (auto it = loop_order.rbegin(); it != loop_order.rend(); ++it) {
+  for (auto it = loop_order_.rbegin(); it != loop_order_.rend(); ++it) {
     *work_it++ = acc;
     acc *= loop_iterations(*it);
   }
@@ -1153,17 +1186,11 @@ int TransposePlan::ChooseParallelizationStrategy(
 
   // Heuristic that attempts to parallelize the outermost loops, down to a
   // minimum per-thread number of bytes processed.
-  int num_chunks = 1;
-  for (size_t i = 0; i < loop_order.size(); ++i) {
-    Loop& loop = loop_order[i];
+  int num_threads = 1;
+  for (size_t i = 0; i < loop_order_.size(); ++i) {
+    Loop& loop = loop_order_[i];
     CHECK_GE(available_parallelism, 1);
     int64_t iterations = loop_iterations(loop);
-
-    // Initialize loop iteration bounds to full range in element units.
-    loop.start = 0;
-    loop.end = loop.tile_interior ? loop.tile_size
-                                  : CeilOfRatio(loop.dim_size, loop.tile_size);
-
     int kMinBytesPerThread = inner_kernel_is_memcpy_ ? (1 << 20) : (1 << 26);
     int64_t min_iterations_per_thread =
         CeilOfRatio<int64_t>(kMinBytesPerThread, work_in_bytes[i]);
@@ -1171,61 +1198,16 @@ int TransposePlan::ChooseParallelizationStrategy(
 
     VLOG(8) << "iterations=" << iterations << " parallel_work=" << parallel_work
             << " available_parallelism=" << available_parallelism;
-    int parallelism = std::min<int64_t>(available_parallelism, parallel_work);
-    if (parallelism > 1) {
-      // If we use CeilOfRatio(iterations, parallelism) as the chunk size, we
-      // might end up with fewer chunks than parallelism if the chunk size is
-      // large. For example, if iterations=17 and parallelism=16,
-      // chunk_size=2. Then useful_tasks=9. We should reduce parallelism to 9.
-      int64_t chunk_size =
-          CeilOfRatio(iterations, static_cast<int64_t>(parallelism));
-      int64_t useful_tasks = CeilOfRatio(iterations, chunk_size);
-      parallelism = useful_tasks;
-    }
-    loop.parallelism = parallelism;
-    available_parallelism /= parallelism;
-    num_chunks *= loop.parallelism;
-  }
-  return num_chunks;
-}
-
-/*static*/ void TransposePlan::PartitionLoops(
-    int num_chunks, const std::vector<Loop>& loop_order,
-    std::vector<std::vector<TransposePlan::Loop>>& result,
-    std::vector<int64_t>& input_offset_bytes,
-    std::vector<int64_t>& output_offset_bytes) {
-  // Copy the base loop order for each chunk.
-  result.resize(num_chunks, loop_order);
-  input_offset_bytes.resize(num_chunks);
-  output_offset_bytes.resize(num_chunks);
-  for (int chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
-    // For each loop, narrow the start/end bounds to this chunk's portion.
-    int task_id_remaining = chunk_id;
-    int num_tasks_remaining = num_chunks;
-
-    for (size_t i = 0; i < loop_order.size(); ++i) {
-      Loop& chunk_loop = result[chunk_id][i];
-      const Loop& base_loop = loop_order[i];
-
-      num_tasks_remaining /= base_loop.parallelism;
-      int task_id = task_id_remaining / num_tasks_remaining;
-      task_id_remaining = task_id_remaining % num_tasks_remaining;
-
-      // Divide this loop's iterations (in element units) among parallelism
-      // tasks.
-      int64_t iterations = base_loop.end - base_loop.start;
-      int64_t iterations_per_task =
-          CeilOfRatio<int64_t>(iterations, base_loop.parallelism);
-
-      chunk_loop.start =
-          base_loop.start + std::min(iterations, task_id * iterations_per_task);
-      chunk_loop.end =
-          base_loop.start +
-          std::min(iterations, (task_id + 1) * iterations_per_task);
-      input_offset_bytes[chunk_id] += chunk_loop.start * chunk_loop.lda;
-      output_offset_bytes[chunk_id] += chunk_loop.start * chunk_loop.ldb;
+    if (parallel_work >= available_parallelism) {
+      loop.parallelism = available_parallelism;
+      available_parallelism = 1;
+    } else {
+      loop.parallelism = parallel_work;
+      available_parallelism /= parallel_work;
     }
+    num_threads *= loop.parallelism;
   }
+  return num_threads;
 }
 
 std::string TransposePlan::ToString() const {
@@ -1238,28 +1220,19 @@ std::string TransposePlan::ToString() const {
                   absl::StrAppendFormat(
                       out,
                       "    "
-                      "Node(end=%d,inc=%d,lda=%"
+                      "Node(start=%d,end=%d,inc=%d,lda=%"
                       "d,ldb=%d,next_trailing=%d,inner_a=%s,inner_b=%s)",
-                      node.end, node.inc, node.lda, node.ldb,
+                      node.start, node.end, node.inc, node.lda, node.ldb,
                       node.trailing_tile_next_node_inc,
                       node.is_inner_dim_in_a ? "y" : "n",
                       node.is_inner_dim_in_b ? "y" : "n");
                 }));
       });
-  auto format_loop = [](std::string* out, const Loop& loop) {
-    absl::StrAppendFormat(out, "%d%s[%d,%d](%d)", loop.dim_in_a,
-                          loop.tile_interior ? "[tile]" : "", loop.start,
-                          loop.end, loop.parallelism);
+  auto format_loop_order = [](std::string* out, const Loop& loop) {
+    return absl::StrAppend(out, loop.dim_in_a,
+                           loop.tile_interior ? "[tile]" : "", "(",
+                           loop.parallelism, ")");
   };
-  std::vector<std::string> chunk_strings;
-  chunk_strings.reserve(chunk_loops_.size());
-  for (int i = 0; i < chunk_loops_.size(); ++i) {
-    chunk_strings.push_back(absl::StrFormat(
-        "    chunk %d: input_offset=%d output_offset=%d loops=%s", i,
-        input_offset_bytes_[i], output_offset_bytes_[i],
-        absl::StrJoin(chunk_loops_[i], ", ", format_loop)));
-  }
-  std::string chunk_loops_str = absl::StrJoin(chunk_strings, "\n");
   std::string transformation_str;
   switch (transformation_) {
     case Transformation::kNone:
@@ -1271,19 +1244,19 @@ std::string TransposePlan::ToString() const {
   }
   return absl::StrFormat(
       "elem_size=%d a_dims=%s b_dims=%s permutation=%s a_tiling=%s b_tiling=%s "
-      "lda=%s lda_tile=%s ldb=%s ldb_tile=%s "
+      "lda=%s lda_tile=%s ldb=%s ldb_tile=%s loop_order=%s "
       "outer_bs=[%d,%d] inner_bs=%d "
       "transformation=%s scratch_size=%d\n"
-      "chunk_loops:\n%s\n"
       "nodes:\n%s",
       elem_size_in_bytes_, absl::StrJoin(a_dims_, ","),
       absl::StrJoin(Permute(a_dims_, permutation_), ","),
       absl::StrJoin(permutation_, ","), absl::StrJoin(a_tiling_, ","),
       absl::StrJoin(b_tiling_, ","), absl::StrJoin(lda_, ","),
       absl::StrJoin(lda_tile_, ","), absl::StrJoin(ldb_, ","),
-      absl::StrJoin(ldb_tile_, ","), outer_block_elems_a_, outer_block_elems_b_,
-      inner_block_elems_, transformation_str, scratch_size_, chunk_loops_str,
-      nodes_str);
+      absl::StrJoin(ldb_tile_, ","),
+      absl::StrJoin(loop_order_, ",", format_loop_order), outer_block_elems_a_,
+      outer_block_elems_b_, inner_block_elems_, transformation_str,
+      scratch_size_, nodes_str);
 }
 
 bool TransposePlanCacheKey::operator==(
@@ -1367,7 +1340,7 @@ absl::StatusOr<std::shared_ptr<TransposePlan>> TransposePlanCache::GetOrCreate(
   }
 
   // Coalesce from slow-varying to fast-varying (outer to inner).
-  // loops[0] is slowest.
+  // loop_order_[0] is slowest.
   int write_pos = 0;
   for (int read_pos = 1; read_pos < loops.size(); ++read_pos) {
     Loop& outer = loops[write_pos];
diff --git a/third_party/xla/xla/pjrt/transpose.h b/third_party/xla/xla/pjrt/transpose.h
index d6dea32e6c97c6..aef51be791a04b 100644
--- a/third_party/xla/xla/pjrt/transpose.h
+++ b/third_party/xla/xla/pjrt/transpose.h
@@ -124,12 +124,6 @@ class TransposePlan {
                std::optional<absl::FunctionRef<void(std::function<void(void)>)>>
                    schedule_work = std::nullopt) const;
 
-  // Executes a single chunk of the transposition. To perform a complete
-  // transposition, call ExecuteChunk for each chunk ID from 0 to Parallelism()
-  // - 1. It is legal to call ExecuteChunk for independent chunks in parallel.
-  // This is useful for callers that want to manage their own threading.
-  void ExecuteChunk(int chunk_id, const void* a, void* b) const;
-
   // Returns a human-readable description of the plan.
   std::string ToString() const;
 
@@ -181,11 +175,6 @@ class TransposePlan {
     // Number of parallel threads to use for this loop.
     int64_t parallelism;
 
-    // Iteration bounds for this chunk. Initially [0, full_iterations).
-    // After chunk splitting, each chunk's loops have narrowed bounds.
-    int64_t start = 0;  // Inclusive start of iteration range
-    int64_t end = 0;    // Exclusive end of iteration range
-
     bool operator==(const Loop& other) const;
   };
 
@@ -197,20 +186,11 @@ class TransposePlan {
   // Performs plan initialization that cannot fail.
   void Initialize();
 
-  void BuildPlanNodes(int chunk_id, std::vector<Node>& nodes);
-
-  // Chooses a parallelism for each loop. Returns the number of separate chunks
-  // in the plan, and populates the `parallelism` field of each loop.
-  int ChooseParallelizationStrategy(std::vector<Loop>& loop_order);
+  void BuildPlanNodes(int thread_id, std::vector<Node>& output_nodes);
 
-  // Creates per-chunk loop vectors by splitting loop_order_ into per-chunk
-  // loops. Returns a vector of loop vectors, one per chunk. Each chunk's
-  // loops have their start/end bounds narrowed to represent that chunk's work.
-  static void PartitionLoops(
-      int num_chunks, const std::vector<Loop>& loop_order,
-      std::vector<std::vector<TransposePlan::Loop>>& result,
-      std::vector<int64_t>& input_offset_bytes,
-      std::vector<int64_t>& output_offset_bytes);
+  // Chooses a parallelism for each loop. Returns the total number of parallel
+  // work units.
+  int ChooseParallelizationStrategy();
 
   // The signature of ExecuteTyped uses char* pointers because we perform
   // address calculations with strides in bytes; the strides need not be
@@ -257,13 +237,9 @@ class TransposePlan {
   bool a_is_tiled_;
   bool b_is_tiled_;
 
-  // Per-chunk loop nests. Each loop nest has its own start/end bounds
-  // representing one chunk of the work.
-  std::vector<std::vector<Loop>> chunk_loops_;
+  // Order to traverse dimensions, from slowest-varying to fastest-varying.
 
-  // Per-chunk byte offsets into the input and output arrays.
-  std::vector<int64_t> input_offset_bytes_;
-  std::vector<int64_t> output_offset_bytes_;
+  std::vector<Loop> loop_order_;
 
   // Root nodes of the plan, i.e., pointing to the outermost loops in the loop
   // nest. The outer vector is indexed on the thread ID.
diff --git a/third_party/xla/xla/pjrt/transpose_test.cc b/third_party/xla/xla/pjrt/transpose_test.cc
index 716f5d3bdff220..c136540eee1175 100644
--- a/third_party/xla/xla/pjrt/transpose_test.cc
+++ b/third_party/xla/xla/pjrt/transpose_test.cc
@@ -462,10 +462,7 @@ std::vector<TransposeTestCase> GetTransposeTestCases() {
                         /*permutation=*/{3, 1, 2, 0},
                         /*input_tiling=*/{},
                         /*output_tiling=*/{8, 128}),
-      TransposeTestCase{/*dims=*/{129, 1234567},
-                        /*permutation=*/{0, 1},
-                        /*input_tiling=*/{},
-                        /*output_tiling=*/{8, 128}}};
+  };
   return cases;
 }
 

From 68a2b2890091c8cca9cf223ec17d504e6049d9b6 Mon Sep 17 00:00:00 2001
From: Zixuan Jiang <zixuanjiang@google.com>
Date: Sat, 20 Dec 2025 13:03:15 -0800
Subject: [PATCH 618/753] Run
 `third_party/openxla/shardy/google/integrate_latest.sh` for the recent
 unreduced axes.

PiperOrigin-RevId: 847181792
---
 .../xla/third_party/shardy/temporary.patch    | 471 ------------------
 .../xla/third_party/shardy/workspace.bzl      |   4 +-
 2 files changed, 2 insertions(+), 473 deletions(-)

diff --git a/third_party/xla/third_party/shardy/temporary.patch b/third_party/xla/third_party/shardy/temporary.patch
index 3e0e0520e60482..e69de29bb2d1d6 100644
--- a/third_party/xla/third_party/shardy/temporary.patch
+++ b/third_party/xla/third_party/shardy/temporary.patch
@@ -1,471 +0,0 @@
-diff --git a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc
-index aceb4d7..8752484 100644
---- a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc
-+++ b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.cc
-@@ -908,8 +908,8 @@ void insertAllReducesForReductionFactors(
-   }
- }
- 
--bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
--                                          const SymbolTable& symbolTable) {
-+bool convertReshardToShardedToUnreduced(Operation* op, IRRewriter& rewriter,
-+                                        const SymbolTable& symbolTable) {
-   ReshardOp reshardOp = dyn_cast<ReshardOp>(op);
-   if (!reshardOp) {
-     return false;
-@@ -934,12 +934,7 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
-       << "Reshard op has different meshes for input and output. The result has "
-          "non-empty unreduced axes.";
- 
--  // The relationship of the unreduced axes is "out = in + r2u + s2u", where
--  // "r2u" is the replicated-to-unreduced axes and "s2u" is the
--  // sharded-to-unreduced axes.
--  SmallVector<AxisRefAttr> r2uAnds2uAxes =
--      getAxisSetDiff(outUnreducedAxes, inUnreducedAxes, inMesh);
--  if (r2uAnds2uAxes.empty()) {
-+  if (getAxisSetDiff(outUnreducedAxes, inUnreducedAxes, inMesh).empty()) {
-     return false;
-   }
- 
-@@ -950,7 +945,7 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
-       << "Input of sharded-to-unreduced reshard must be a block argument or a "
-          "reshard op.";
- 
--  SmallVector<AxisRefAttr> s2uAxes;
-+  SmallVector<AxisRefAttr> newUnreducedAxes = llvm::to_vector(inUnreducedAxes);
-   SmallVector<AxisRefListAttr> axesPerDim(inSharding.getRank());
-   for (auto [inDimSharding, outDimSharding, axes] :
-        llvm::zip_equal(inSharding.getDimShardings(),
-@@ -971,7 +966,7 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
-       }
-       diff.append(inAxes.begin() + outAxes.size(), inAxes.end());
-       axes = AxisRefListAttr::get(rewriter.getContext(), diff);
--      s2uAxes.append(diff);
-+      newUnreducedAxes.append(diff);
-     } else {
-       SDY_LOG(FATAL)
-           << "The reshard op needs to be decomposed to a sharded-to-unreduced "
-@@ -979,27 +974,17 @@ bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
-     }
-   }
- 
-+  sortAndMergeAxes(newUnreducedAxes, inMesh);
-+
-   rewriter.setInsertionPoint(reshardOp);
--  Value result = input;
--
--  SmallVector<AxisRefAttr> r2uAxes =
--      getAxisSetDiff(r2uAnds2uAxes, s2uAxes, inMesh);
--  if (!r2uAxes.empty()) {
--    SmallVector<AxisRefAttr> inPlusR2uAxes = llvm::to_vector(inUnreducedAxes);
--    inPlusR2uAxes.append(r2uAxes.begin(), r2uAxes.end());
--    sortAndMergeAxes(inPlusR2uAxes, inMesh);
--    TensorShardingAttr r2uSharding =
--        TensorShardingAttr::get(rewriter.getContext(), inSharding.getMeshName(),
--                                inSharding.getDimShardings(),
--                                outSharding.getReplicatedAxes(), inPlusR2uAxes);
--    result = ReplicatedToUnreducedOp::create(rewriter, reshardOp.getLoc(),
--                                             result, r2uAxes, r2uSharding);
--  }
--  if (!s2uAxes.empty()) {
--    result = ShardedToUnreducedOp::create(rewriter, reshardOp.getLoc(), result,
--                                          axesPerDim, outSharding);
-+  Operation* result = ShardedToUnreducedOp::create(
-+      rewriter, reshardOp.getLoc(), input, axesPerDim,
-+      outSharding.replaceUnreducedAxes(newUnreducedAxes));
-+  if (newUnreducedAxes != outUnreducedAxes) {
-+    SDY_LOG(WARNING) << "need repliaced-to-unreduced";
-+    result = ReshardOp::create(rewriter, reshardOp.getLoc(),
-+                               result->getResult(0), outSharding);
-   }
--
-   rewriter.replaceOp(reshardOp, result);
-   return true;
- }
-diff --git a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h
-index 0a5563f..c183216 100644
---- a/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h
-+++ b/shardy/dialect/sdy/transforms/export/explicit_reshards_util.h
-@@ -164,19 +164,15 @@ AxesPerFactor findCommonAxes(const ShardingProjection& shardingProjection,
-                              OpShardingRuleAttr shardingRule,
-                              ArrayRef<int64_t> tensorSizes, const Mesh& mesh);
- 
--// Converts a `sdy.reshard` op to an `sdy.replicated-to-unreduced` op and/or an
--// `sdy.sharded-to-unreduced` op. Returns true if the conversion is successful.
--//
--// `r2u` keeps the sharded size, while `s2u` increases the sharded size. Hence,
--// we do `r2u` first and then `s2u`.
-+// Converts a `sdy.reshard` op to an `sdy.sharded-to-unreduced` op. Returns true
-+// if the conversion is successful.
- //
- // The requirements are:
- // 1. `op` is a `sdy.reshard` op.
--// 2. The input and output shardings have the same mesh.
--// 3. The input of `op` is another `sdy.reshard` op or a block argument.
--// 4. The input unreduced axes is a strict subset of the output unreduced axes.
--bool convertReshardToUnreducedCollectives(Operation* op, IRRewriter& rewriter,
--                                          const SymbolTable& symbolTable);
-+// 2. The input of `op` is another `sdy.reshard` op or a block argument.
-+// 3. The `op` can be converted to a single `sdy.sharded-to-unreduced` op.
-+bool convertReshardToShardedToUnreduced(Operation* op, IRRewriter& rewriter,
-+                                        const SymbolTable& symbolTable);
- 
- }  // namespace sdy
- }  // namespace mlir
-diff --git a/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc b/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc
-index 85d048e..7f96c9b 100644
---- a/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc
-+++ b/shardy/dialect/sdy/transforms/export/insert_explicit_reshards.cc
-@@ -486,7 +486,7 @@ struct InsertExplicitReshardsPass
-         return;
-       }
- 
--      if (convertReshardToUnreducedCollectives(op, rewriter, symbolTable)) {
-+      if (convertReshardToShardedToUnreduced(op, rewriter, symbolTable)) {
-         return;
-       }
- 
-diff --git a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir
-index f30109e..f3868a9 100644
---- a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir
-+++ b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards.mlir
-@@ -2,7 +2,6 @@
- 
- sdy.mesh @mesh = <["x"=2, "y"=2, "z"=4]>
- sdy.mesh @other_mesh = <["x"=2, "y"=2]>
--sdy.mesh @mesh_x16 = <["x"=16]>
- sdy.mesh @mesh_abcd = <["a"=2, "b"=2, "c"=2, "d"=2]>
- 
- //===----------------------------------------------------------------------===//
-@@ -521,17 +520,17 @@ func.func @different_arguments_to_multiple_named_computations_with_same_input_ou
- }
- 
- //===----------------------------------------------------------------------===//
--// Replicated and sharded to unreduced tests
-+// Sharded to unreduced tests
- //===----------------------------------------------------------------------===//
- 
--// CHECK-LABEL: func @sharded_to_unreduced
--func.func @sharded_to_unreduced(
--    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
--    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
-+// CHECK-LABEL: func @sharded_to_unreduced_1
-+func.func @sharded_to_unreduced_1(
-+    %arg0 : tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
-+    -> (tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
-   // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x"}>
-   // CHECK-NEXT: return %0
--  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<16x8xf32>
--  return %0 : tensor<16x8xf32>
-+  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<24x8xf32>
-+  return %0 : tensor<24x8xf32>
- }
- 
- // CHECK-LABEL: func @sharded_to_unreduced_single_axis
-@@ -574,44 +573,13 @@ func.func @sharded_to_unreduced_with_subaxis(
-  return %0 : tensor<16x8xf32>
- }
- 
--// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_full_axis
--func.func @implicitly_and_explicitly_replicated_to_unreduced_full_axis(
--    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], replicated={"z"}, unreduced={"y"}>})
--    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x", "z"} %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}>
--  // CHECK-NEXT: return %0
--  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<16x8xf32>
--  return %0 : tensor<16x8xf32>
--}
--
--// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis
--func.func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis(
--    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], replicated={"x":(8)2}, unreduced={"x":(4)2}>})
--    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x":(2)2, "x":(8)2} %arg0 out_sharding=<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>
--  // CHECK-NEXT: return %0
--  %0 = sdy.reshard %arg0 <@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}> : tensor<16x8xf32>
--  return %0 : tensor<16x8xf32>
--}
--
--// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_full_axis
--func.func @replicated_and_sharded_to_unreduced_full_axis(
-+// CHECK-LABEL: func @sharded_to_unreduced_and_replicated_to_unreduced
-+func.func @sharded_to_unreduced_and_replicated_to_unreduced(
-     %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y"}>})
-     -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z"} %arg0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<16x8xf32>
--  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{"x"}, {}] %0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<16x8xf32>
-+  // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y"}>
-+  // CHECK-NEXT: %1 = sdy.reshard %0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}>
-   // CHECK-NEXT: return %1
-  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> :  tensor<16x8xf32>
-  return %0 : tensor<16x8xf32>
- }
--
--// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_sub_axis
--func.func @replicated_and_sharded_to_unreduced_sub_axis(
--    %arg0 : tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y"}>})
--    -> (tensor<16x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y", "z"}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z":(2)2} %arg0 out_sharding=<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y", "z":(2)2}> : tensor<16x8xf32>
--  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{}, {"z":(1)2}] %0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<16x8xf32>
--  // CHECK-NEXT: return %1
-- %0 = sdy.reshard %arg0 <@mesh, [{"x"}, {}], unreduced={"y", "z"}> :  tensor<16x8xf32>
-- return %0 : tensor<16x8xf32>
--}
-diff --git a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir
-index 5b1973a..5dea360 100644
---- a/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir
-+++ b/shardy/dialect/sdy/transforms/export/test/insert_explicit_reshards/unreduced.mlir
-@@ -1,7 +1,6 @@
- // RUN: sdy_opt %s -sdy-insert-explicit-reshards='enable-full-version=true' | FileCheck %s
- 
- sdy.mesh @mesh = <["x"=4, "y"=2, "z"=4]>
--sdy.mesh @mesh_x16 = <["x"=16]>
- 
- // CHECK-LABEL: func @all_reduce_on_func_input
- func.func @all_reduce_on_func_input(%arg0: tensor<4x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"y"}>}, %arg1: tensor<4x8xf32>) -> tensor<4x8xf32> {
-@@ -307,17 +306,17 @@ func.func @all_reduce_source_and_target_fully_replicated_shardings_and_different
- }
- 
- //===----------------------------------------------------------------------===//
--// Replicated and sharded to unreduced tests
-+// Sharded to unreduced tests
- //===----------------------------------------------------------------------===//
- 
--// CHECK-LABEL: func @sharded_to_unreduced
--func.func @sharded_to_unreduced(
--    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
--    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
-+// CHECK-LABEL: func @sharded_to_unreduced_1
-+func.func @sharded_to_unreduced_1(
-+    %arg0 : tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}]>})
-+    -> (tensor<24x8xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x"}>}) {
-   // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x"}>
-   // CHECK-NEXT: return %0
--  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<32x32xf32>
--  return %0 : tensor<32x32xf32>
-+  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x"}> : tensor<24x8xf32>
-+  return %0 : tensor<24x8xf32>
- }
- 
- // CHECK-LABEL: func @sharded_to_unreduced_single_axis
-@@ -360,44 +359,13 @@ func.func @sharded_to_unreduced_with_subaxis(
-  return %0 : tensor<32x32xf32>
- }
- 
--// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_full_axis
--func.func @implicitly_and_explicitly_replicated_to_unreduced_full_axis(
--    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], replicated={"z"}, unreduced={"y"}>})
--    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x", "z"} %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}>
--  // CHECK-NEXT: return %0
--  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<32x32xf32>
--  return %0 : tensor<32x32xf32>
--}
--
--// CHECK-LABEL: func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis
--func.func @implicitly_and_explicitly_replicated_to_unreduced_sub_axis(
--    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], replicated={"x":(8)2}, unreduced={"x":(4)2}>})
--    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"x":(2)2, "x":(8)2} %arg0 out_sharding=<@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}>
--  // CHECK-NEXT: return %0
--  %0 = sdy.reshard %arg0 <@mesh_x16, [{"x":(1)2}, {}], unreduced={"x":(2)8}> : tensor<32x32xf32>
--  return %0 : tensor<32x32xf32>
--}
--
--// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_full_axis
--func.func @replicated_and_sharded_to_unreduced_full_axis(
-+// CHECK-LABEL: func @sharded_to_unreduced_and_replicated_to_unreduced
-+func.func @sharded_to_unreduced_and_replicated_to_unreduced(
-     %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y"}>})
-     -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{}, {}], unreduced={"x", "y", "z"}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z"} %arg0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<32x32xf32>
--  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{"x"}, {}] %0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y", "z"}> : tensor<32x32xf32>
-+  // CHECK-NEXT: %0 = sdy.sharded_to_unreduced [{"x"}, {}] %arg0 out_sharding=<@mesh, [{}, {}], unreduced={"x", "y"}>
-+  // CHECK-NEXT: %1 = sdy.reshard %0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}>
-   // CHECK-NEXT: return %1
-  %0 = sdy.reshard %arg0 <@mesh, [{}, {}], unreduced={"x", "y", "z"}> :  tensor<32x32xf32>
-  return %0 : tensor<32x32xf32>
- }
--
--// CHECK-LABEL: func @replicated_and_sharded_to_unreduced_sub_axis
--func.func @replicated_and_sharded_to_unreduced_sub_axis(
--    %arg0 : tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y"}>})
--    -> (tensor<32x32xf32> {sdy.sharding = #sdy.sharding<@mesh, [{"x"}, {}], unreduced={"y", "z"}>}) {
--  // CHECK-NEXT: %0 = sdy.replicated_to_unreduced {"z":(2)2} %arg0 out_sharding=<@mesh, [{"x"}, {"z":(1)2}], unreduced={"y", "z":(2)2}> : tensor<32x32xf32>
--  // CHECK-NEXT: %1 = sdy.sharded_to_unreduced [{}, {"z":(1)2}] %0 out_sharding=<@mesh, [{"x"}, {}], unreduced={"y", "z"}> : tensor<32x32xf32>
--  // CHECK-NEXT: return %1
-- %0 = sdy.reshard %arg0 <@mesh, [{"x"}, {}], unreduced={"y", "z"}> :  tensor<32x32xf32>
-- return %0 : tensor<32x32xf32>
--}
-diff --git a/third_party/llvm/generated.patch b/third_party/llvm/generated.patch
-index 509398d..f82404c 100644
---- a/third_party/llvm/generated.patch
-+++ b/third_party/llvm/generated.patch
-@@ -1 +1,152 @@
- Auto generated patch. Do not edit or delete it, even if empty.
-+diff -ruN --strip-trailing-cr a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
-+--- a/clang/lib/Serialization/ASTReaderDecl.cpp
-++++ b/clang/lib/Serialization/ASTReaderDecl.cpp
-+@@ -2107,8 +2107,9 @@
-+     auto *Def = DD.Definition;
-+     DD = std::move(MergeDD);
-+     DD.Definition = Def;
-+-    for (auto *D : Def->redecls())
-+-      cast<CXXRecordDecl>(D)->DefinitionData = &DD;
-++    for (auto *R = Reader.getMostRecentExistingDecl(Def); R;
-++         R = R->getPreviousDecl())
-++      cast<CXXRecordDecl>(R)->DefinitionData = &DD;
-+     return;
-+   }
-+ 
-+diff -ruN --strip-trailing-cr a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
-+--- a/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
-++++ b/libc/src/__support/FPUtil/x86_64/fenv_mxcsr_utils.h
-+@@ -61,14 +61,14 @@
-+ LIBC_INLINE static void write_mxcsr(uint32_t w) { _mm_setcsr(w); }
-+ 
-+ LIBC_INLINE static void clear_except(uint16_t excepts) {
-+-  uint32_t mxcsr = _MM_GET_EXCEPTION_STATE();
-++  uint32_t mxcsr = get_mxcsr();
-+   mxcsr &= ~static_cast<uint32_t>(excepts);
-+-  _MM_SET_EXCEPTION_STATE(mxcsr);
-++  write_mxcsr(mxcsr);
-+ }
-+ 
-+ LIBC_INLINE static uint16_t test_except(uint16_t excepts) {
-+   uint32_t mxcsr = get_mxcsr();
-+-  return static_cast<uint16_t>(excepts & mxcsr);
-++  return static_cast<uint16_t>(excepts & ExceptionFlags::ALL_F & mxcsr);
-+ }
-+ 
-+ LIBC_INLINE static uint16_t get_except() {
-+@@ -83,9 +83,9 @@
-+ }
-+ 
-+ LIBC_INLINE static void raise_except(uint16_t excepts) {
-+-  uint32_t mxcsr = _MM_GET_EXCEPTION_STATE();
-+-  mxcsr |= excepts;
-+-  _MM_SET_EXCEPTION_STATE(mxcsr);
-++  uint32_t mxcsr = get_mxcsr();
-++  mxcsr |= excepts & ExceptionFlags::ALL_F;
-++  write_mxcsr(mxcsr);
-+ #ifdef LIBC_TRAP_ON_RAISE_FP_EXCEPT
-+   // We will try to trigger the SIGFPE if floating point exceptions are not
-+   // masked.  Since we already set all the floating point exception flags, we
-+diff -ruN --strip-trailing-cr a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h
-+--- a/libcxx/include/__flat_map/flat_map.h
-++++ b/libcxx/include/__flat_map/flat_map.h
-+@@ -465,13 +465,13 @@
-+   }
-+ 
-+   // [flat.map.access], element access
-+-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
-++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](const key_type& __x)
-+     requires is_constructible_v<mapped_type>
-+   {
-+     return try_emplace(__x).first->second;
-+   }
-+ 
-+-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
-++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](key_type&& __x)
-+     requires is_constructible_v<mapped_type>
-+   {
-+     return try_emplace(std::move(__x)).first->second;
-+@@ -480,7 +480,7 @@
-+   template <class _Kp>
-+     requires(__is_compare_transparent && is_constructible_v<key_type, _Kp> && is_constructible_v<mapped_type> &&
-+              !is_convertible_v<_Kp &&, const_iterator> && !is_convertible_v<_Kp &&, iterator>)
-+-  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
-++  _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 mapped_type& operator[](_Kp&& __x) {
-+     return try_emplace(std::forward<_Kp>(__x)).first->second;
-+   }
-+ 
-+diff -ruN --strip-trailing-cr a/libcxx/include/map b/libcxx/include/map
-+--- a/libcxx/include/map
-++++ b/libcxx/include/map
-+@@ -1092,9 +1092,9 @@
-+   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __tree_.size(); }
-+   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return __tree_.max_size(); }
-+ 
-+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
-++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
-+ #  ifndef _LIBCPP_CXX03_LANG
-+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
-++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
-+ #  endif
-+ 
-+   template <class _Arg,
-+diff -ruN --strip-trailing-cr a/libcxx/include/unordered_map b/libcxx/include/unordered_map
-+--- a/libcxx/include/unordered_map
-++++ b/libcxx/include/unordered_map
-+@@ -1262,9 +1262,9 @@
-+   }
-+ #  endif // _LIBCPP_STD_VER >= 20
-+ 
-+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
-++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](const key_type& __k);
-+ #  ifndef _LIBCPP_CXX03_LANG
-+-  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
-++  _LIBCPP_HIDE_FROM_ABI mapped_type& operator[](key_type&& __k);
-+ #  endif
-+ 
-+   [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI mapped_type& at(const key_type& __k);
-+diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
-+--- a/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
-++++ b/libcxx/test/libcxx/diagnostics/flat_map.nodiscard.verify.cpp
-+@@ -66,9 +66,9 @@
-+   TransparentKey<int> tkey;
-+ 
-+   std::flat_map<int, int> nfm;
-+-  nfm[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+-  fm[std::move(key)];  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+-  fm[std::move(tkey)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-++  nfm[key];            // no-warning
-++  fm[std::move(key)];  // no-warning
-++  fm[std::move(tkey)]; // no-warning
-+ 
-+   fm.at(key);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+   cfm.at(key);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
-+--- a/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
-++++ b/libcxx/test/libcxx/diagnostics/map.nodiscard.verify.cpp
-+@@ -55,8 +55,8 @@
-+ 
-+   int key = 0;
-+ 
-+-  m[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+-  m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-++  m[key];            // no-warning
-++  m[std::move(key)]; // no-warning
-+ 
-+ #if TEST_STD_VER >= 14
-+   std::map<std::string, int, std::less<>> strMap;
-+diff -ruN --strip-trailing-cr a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
-+--- a/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
-++++ b/libcxx/test/libcxx/diagnostics/unordered_map.nodiscard.verify.cpp
-+@@ -81,8 +81,8 @@
-+   ctm.equal_range(tkey); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+ #endif
-+ 
-+-  m[key];            // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+-  m[std::move(key)]; // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-++  m[key];            // no-warning
-++  m[std::move(key)]; // no-warning
-+ 
-+   m.at(key);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-+   cm.at(key); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
-index f2c3289..29af0ff 100644
---- a/third_party/llvm/workspace.bzl
-+++ b/third_party/llvm/workspace.bzl
-@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
- 
- def repo(name):
-     """Imports LLVM."""
--    LLVM_COMMIT = "8f264586d7521b0e305ca7bb78825aa3382ffef7"
--    LLVM_SHA256 = "5784c4af94caba66bc8c460e07e222f751e4f4c9db9c45b3a68ff55379cf587d"
-+    LLVM_COMMIT = "7d381f2a5634d1e41b61299839d652cc4a021898"
-+    LLVM_SHA256 = "f1641918fd3f5e1667d39afb9c261da39ed9f74e30f1c2f98031d6d609a8de15"
- 
-     tf_http_archive(
-         name = name,
diff --git a/third_party/xla/third_party/shardy/workspace.bzl b/third_party/xla/third_party/shardy/workspace.bzl
index 03bd1efd1ba577..0d75504d3d6e06 100644
--- a/third_party/xla/third_party/shardy/workspace.bzl
+++ b/third_party/xla/third_party/shardy/workspace.bzl
@@ -3,8 +3,8 @@
 load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls")
 
 def repo():
-    SHARDY_COMMIT = "05276b9c4469f2331e326f614d712da7b907f7df"
-    SHARDY_SHA256 = "f76bef82a597c4d72505dc1c5f8559cf77e720bdeacf976845578970e03265ea"
+    SHARDY_COMMIT = "89db8f8a60c810205365b1117e6c27ac99aa40f3"
+    SHARDY_SHA256 = "a5d33fa1af43f162e62a7bdff411cda7ca0a4992c6c304ac2e3344524c30e65d"
 
     tf_http_archive(
         name = "shardy",

From 6f24565d891bd53bfb2364ba56f80dc4b955d9ed Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:27:46 -0800
Subject: [PATCH 619/753] Automated Code Change

PiperOrigin-RevId: 847186391
---
 tensorflow/core/lib/strings/proto_serialization_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/lib/strings/proto_serialization_test.cc b/tensorflow/core/lib/strings/proto_serialization_test.cc
index 052a6dff016d25..fa2e2364d9f216 100644
--- a/tensorflow/core/lib/strings/proto_serialization_test.cc
+++ b/tensorflow/core/lib/strings/proto_serialization_test.cc
@@ -47,7 +47,7 @@ static void BM_ProtoSerializationToString(::testing::benchmark::State& state) {
   GraphDef graph_def = MakeGraphDef(num_nodes);
 
   for (auto i : state) {
-    string serialized;
+    std::string serialized;
     testing::DoNotOptimize(
         SerializeToStringDeterministic(graph_def, &serialized));
   }

From 6e91c3160bd67988013c9ee8b74bc4d5f86019d2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:34:17 -0800
Subject: [PATCH 620/753] Automated Code Change

PiperOrigin-RevId: 847187496
---
 tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc b/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc
index 4ff627046e47a7..b5957d99dee649 100644
--- a/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc
+++ b/tensorflow/dtensor/mlir/expansions/fft_spmd_expander.cc
@@ -281,7 +281,7 @@ StatusOr<mlir::Operation*> ExpandFFTN(mlir::Operation* fft_op,
   } else {
     TF_ASSIGN_OR_RETURN(auto fft_length_vec, ExtractFFTLengthFromOp(fft_op));
     mlir::Value fft_length = IntConst(
-        builder, location, (int32)fft_length_vec[num_transform_axes - 1]);
+        builder, location, (int32_t)fft_length_vec[num_transform_axes - 1]);
     llvm::ArrayRef<int64_t> rfft_shape =
         mlir::dyn_cast<mlir::TensorType>(intermediate.getType()).getShape();
     std::vector<int64_t> rfft_shape_vec = rfft_shape.vec();
@@ -380,7 +380,7 @@ StatusOr<mlir::Operation*> ExpandIFFTN(mlir::Operation* ifft_op,
                         ExtractFFTLengthFromOp(ifft_op));
     mlir::Value ifft_length =
         IntConst(builder, location,
-                 (int32)complex_fft_length_vec[num_transform_axes - 1]);
+                 (int32_t)complex_fft_length_vec[num_transform_axes - 1]);
     // IRFFT for the last axis.
     mlir::TF::IRFFTOp irfft_output_op = mlir::TF::IRFFTOp::create(
         builder, location, ifft_op->getResult(0).getType(), transposed_output,

From cd1558548f474241f9656bb631495d4d3a7505ac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:38:40 -0800
Subject: [PATCH 621/753] Automated Code Change

PiperOrigin-RevId: 847188276
---
 tensorflow/core/grappler/utils/scc_test.cc | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/grappler/utils/scc_test.cc b/tensorflow/core/grappler/utils/scc_test.cc
index 4fc4e7abaa4339..d4c196167fca43 100644
--- a/tensorflow/core/grappler/utils/scc_test.cc
+++ b/tensorflow/core/grappler/utils/scc_test.cc
@@ -31,7 +31,7 @@ namespace {
 class SCCTest : public ::testing::Test {
  public:
   void SetUp() override {
-    std::unordered_map<string, DeviceProperties> devices;
+    std::unordered_map<std::string, DeviceProperties> devices;
     DeviceProperties unknown_device;
     devices["MY_DEVICE"] = unknown_device;
     cluster_ = std::make_unique<VirtualCluster>(devices);
@@ -41,11 +41,11 @@ class SCCTest : public ::testing::Test {
   void TearDown() override { cluster_.reset(); }
 
  protected:
-  static NodeDef CreateNode(const string& name,
-                            absl::Span<const string> inputs) {
+  static NodeDef CreateNode(const std::string& name,
+                            absl::Span<const std::string> inputs) {
     NodeDef node;
     node.set_name(name);
-    for (const string& input : inputs) {
+    for (const std::string& input : inputs) {
       node.add_input(input);
     }
     return node;
@@ -86,7 +86,7 @@ TEST_F(SCCTest, DisjointCycleAndPath) {
   *graph.add_node() = CreateNode("h", {"g"});
 
   std::vector<const NodeDef*> nodes;
-  std::unordered_map<string, const NodeDef*> name_to_node;
+  std::unordered_map<std::string, const NodeDef*> name_to_node;
   for (const auto& n : graph.node()) {
     nodes.push_back(&n);
     name_to_node[n.name()] = &n;
@@ -149,7 +149,7 @@ TEST_F(SCCTest, WikipediaExample) {
   *graph.add_node() = CreateNode("h", {"h"});
 
   std::vector<const NodeDef*> nodes;
-  std::unordered_map<string, const NodeDef*> name_to_node;
+  std::unordered_map<std::string, const NodeDef*> name_to_node;
   for (const auto& n : graph.node()) {
     nodes.push_back(&n);
     name_to_node[n.name()] = &n;
@@ -187,7 +187,7 @@ TEST_F(SCCTest, TensorFlowLoop) {
        with open('/tmp/graph.txt', 'w') as f:
          f.write(str(tf.get_default_graph().as_graph_def()))
   */
-  const string gdef_ascii = R"EOF(
+  const std::string gdef_ascii = R"EOF(
 node {
   name: "Const"
   op: "Const"
@@ -411,7 +411,7 @@ versions {
 
 TEST_F(SCCTest, NestedLoops) {
   GrapplerItem item;
-  string filename = io::JoinPath(
+  std::string filename = io::JoinPath(
       testing::TensorFlowSrcRoot(),
       "core/grappler/costs/graph_properties_testdata/nested_loop.pbtxt");
   TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));

From bc145c0cd78fa5962257c80cb6c798d445c5c855 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:38:42 -0800
Subject: [PATCH 622/753] Automated Code Change

PiperOrigin-RevId: 847188284
---
 tensorflow/core/lib/wav/wav_io.cc | 69 +++++++++++++++++--------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc
index 41db93ae910a18..5edbfd28bdccbf 100644
--- a/tensorflow/core/lib/wav/wav_io.cc
+++ b/tensorflow/core/lib/wav/wav_io.cc
@@ -73,7 +73,7 @@ constexpr char kRiffType[] = "WAVE";
 constexpr char kFormatChunkId[] = "fmt ";
 constexpr char kDataChunkId[] = "data";
 
-inline int16 FloatToInt16Sample(float data) {
+inline int16_t FloatToInt16Sample(float data) {
   constexpr float kMultiplier = 1.0f * (1 << 15);
   return std::min<float>(std::max<float>(roundf(data * kMultiplier),
                                          std::numeric_limits<int16_t>::min()),
@@ -212,7 +212,7 @@ absl::Status EncodeAudioAsS16LEWav(const float* audio, size_t sample_rate,
   for (size_t i = 0; i < num_samples; ++i) {
     int16_t sample = FloatToInt16Sample(audio[i]);
     core::EncodeFixed16(&data[i * kBytesPerSample],
-                        static_cast<uint16>(sample));
+                        static_cast<uint16_t>(sample));
   }
   return absl::OkStatus();
 }
@@ -230,13 +230,14 @@ template Status EncodeAudioAsS16LEWav<tstring>(const float* audio,
 
 absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string,
                                           std::vector<float>* float_values,
-                                          uint32* sample_count,
-                                          uint16* channel_count,
-                                          uint32* sample_rate) {
+                                          uint32_t* sample_count,
+                                          uint16_t* channel_count,
+                                          uint32_t* sample_rate) {
   int offset = 0;
   TF_RETURN_IF_ERROR(ExpectText(wav_string, kRiffChunkId, &offset));
-  uint32 total_file_size;
-  TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &total_file_size, &offset));
+  uint32_t total_file_size;
+  TF_RETURN_IF_ERROR(
+      ReadValue<uint32_t>(wav_string, &total_file_size, &offset));
   TF_RETURN_IF_ERROR(ExpectText(wav_string, kRiffType, &offset));
   std::string found_text;
   TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &found_text, &offset));
@@ -252,57 +253,61 @@ absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string,
         found_text != "link" && found_text != "axml") {
       return errors::InvalidArgument("Unexpected field ", found_text);
     }
-    uint32 size_of_chunk;
-    TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &size_of_chunk, &offset));
+    uint32_t size_of_chunk;
+    TF_RETURN_IF_ERROR(
+        ReadValue<uint32_t>(wav_string, &size_of_chunk, &offset));
     TF_RETURN_IF_ERROR(
         IncrementOffset(offset, size_of_chunk, wav_string.size(), &offset));
     TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &found_text, &offset));
   }
-  uint32 format_chunk_size;
+  uint32_t format_chunk_size;
   TF_RETURN_IF_ERROR(
-      ReadValue<uint32>(wav_string, &format_chunk_size, &offset));
+      ReadValue<uint32_t>(wav_string, &format_chunk_size, &offset));
   if ((format_chunk_size != 16) && (format_chunk_size != 18)) {
     return errors::InvalidArgument(
         "Bad format chunk size for WAV: Expected 16 or 18, but got",
         format_chunk_size);
   }
-  uint16 audio_format;
-  TF_RETURN_IF_ERROR(ReadValue<uint16>(wav_string, &audio_format, &offset));
+  uint16_t audio_format;
+  TF_RETURN_IF_ERROR(ReadValue<uint16_t>(wav_string, &audio_format, &offset));
   if (audio_format != 1) {
     return errors::InvalidArgument(
         "Bad audio format for WAV: Expected 1 (PCM), but got", audio_format);
   }
-  TF_RETURN_IF_ERROR(ReadValue<uint16>(wav_string, channel_count, &offset));
+  TF_RETURN_IF_ERROR(ReadValue<uint16_t>(wav_string, channel_count, &offset));
   if (*channel_count < 1) {
     return errors::InvalidArgument(
         "Bad number of channels for WAV: Expected at least 1, but got ",
         *channel_count);
   }
-  TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, sample_rate, &offset));
-  uint32 bytes_per_second;
-  TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &bytes_per_second, &offset));
-  uint16 bytes_per_sample;
-  TF_RETURN_IF_ERROR(ReadValue<uint16>(wav_string, &bytes_per_sample, &offset));
+  TF_RETURN_IF_ERROR(ReadValue<uint32_t>(wav_string, sample_rate, &offset));
+  uint32_t bytes_per_second;
+  TF_RETURN_IF_ERROR(
+      ReadValue<uint32_t>(wav_string, &bytes_per_second, &offset));
+  uint16_t bytes_per_sample;
+  TF_RETURN_IF_ERROR(
+      ReadValue<uint16_t>(wav_string, &bytes_per_sample, &offset));
   // Confusingly, bits per sample is defined as holding the number of bits for
   // one channel, unlike the definition of sample used elsewhere in the WAV
   // spec. For example, bytes per sample is the memory needed for all channels
   // for one point in time.
-  uint16 bits_per_sample;
-  TF_RETURN_IF_ERROR(ReadValue<uint16>(wav_string, &bits_per_sample, &offset));
+  uint16_t bits_per_sample;
+  TF_RETURN_IF_ERROR(
+      ReadValue<uint16_t>(wav_string, &bits_per_sample, &offset));
   if (bits_per_sample != 16) {
     return errors::InvalidArgument(
         "Can only read 16-bit WAV files, but received ", bits_per_sample);
   }
-  const uint32 expected_bytes_per_sample =
+  const uint32_t expected_bytes_per_sample =
       ((bits_per_sample * *channel_count) + 7) / 8;
   if (bytes_per_sample != expected_bytes_per_sample) {
     return errors::InvalidArgument(
         "Bad bytes per sample in WAV header: Expected ",
         expected_bytes_per_sample, " but got ", bytes_per_sample);
   }
-  const uint64 expected_bytes_per_second =
-      static_cast<uint64>(bytes_per_sample) * *sample_rate;
-  if (static_cast<uint64>(bytes_per_second) != expected_bytes_per_second) {
+  const uint64_t expected_bytes_per_second =
+      static_cast<uint64_t>(bytes_per_sample) * *sample_rate;
+  if (static_cast<uint64_t>(bytes_per_second) != expected_bytes_per_second) {
     return errors::InvalidArgument(
         "Bad bytes per second in WAV header: Expected ",
         expected_bytes_per_second, " but got ", bytes_per_second,
@@ -318,12 +323,12 @@ absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string,
   while (offset < wav_string.size()) {
     std::string chunk_id;
     TF_RETURN_IF_ERROR(ReadString(wav_string, 4, &chunk_id, &offset));
-    uint32 chunk_size;
-    TF_RETURN_IF_ERROR(ReadValue<uint32>(wav_string, &chunk_size, &offset));
-    if (chunk_size > std::numeric_limits<int32>::max()) {
+    uint32_t chunk_size;
+    TF_RETURN_IF_ERROR(ReadValue<uint32_t>(wav_string, &chunk_size, &offset));
+    if (chunk_size > std::numeric_limits<int32_t>::max()) {
       return errors::InvalidArgument(
           "WAV data chunk '", chunk_id, "' is too large: ", chunk_size,
-          " bytes, but the limit is ", std::numeric_limits<int32>::max());
+          " bytes, but the limit is ", std::numeric_limits<int32_t>::max());
     }
     if (chunk_id == kDataChunkId) {
       if (was_data_found) {
@@ -331,18 +336,18 @@ absl::Status DecodeLin16WaveAsFloatVector(const std::string& wav_string,
       }
       was_data_found = true;
       *sample_count = chunk_size / bytes_per_sample;
-      const uint32 data_count = *sample_count * *channel_count;
+      const uint32_t data_count = *sample_count * *channel_count;
       int unused_new_offset = 0;
       // Validate that the data exists before allocating space for it
       // (prevent easy OOM errors).
-      TF_RETURN_IF_ERROR(IncrementOffset(offset, sizeof(int16) * data_count,
+      TF_RETURN_IF_ERROR(IncrementOffset(offset, sizeof(int16_t) * data_count,
                                          wav_string.size(),
                                          &unused_new_offset));
       float_values->resize(data_count);
       for (int i = 0; i < data_count; ++i) {
         int16_t single_channel_value = 0;
         TF_RETURN_IF_ERROR(
-            ReadValue<int16>(wav_string, &single_channel_value, &offset));
+            ReadValue<int16_t>(wav_string, &single_channel_value, &offset));
         (*float_values)[i] = Int16SampleToFloat(single_channel_value);
       }
     } else {

From c0a2b0e8b7e9b75191368fddd18bb01e7149b166 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:42:44 -0800
Subject: [PATCH 623/753] Automated Code Change

PiperOrigin-RevId: 847189104
---
 tensorflow/core/kernels/in_topk_op_gpu.cu.cc  | 18 ++---
 .../kernels/inplace_ops_functor_gpu.cu.cc     | 66 +++++++++----------
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/tensorflow/core/kernels/in_topk_op_gpu.cu.cc b/tensorflow/core/kernels/in_topk_op_gpu.cu.cc
index cd1d3e88b510bf..b011a24cb1ed1e 100644
--- a/tensorflow/core/kernels/in_topk_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/in_topk_op_gpu.cu.cc
@@ -39,7 +39,7 @@ template <typename T, typename TargetT>
 __global__ void ComputePredictionMaskKernel(
     const T* __restrict__ predictions,    // dims: [ num_targets x num_classes ]
     const TargetT* __restrict__ targets,  // dims: [ num_targets ]
-    int64* __restrict__ mask,             // dims: [ num_targets x num_classes ]
+    int64_t* __restrict__ mask,           // dims: [ num_targets x num_classes ]
     int num_targets, int num_classes) {
   GPU_1D_KERNEL_LOOP(i, num_targets * num_classes) {
     const int batch_index = i / num_classes;
@@ -67,7 +67,8 @@ __global__ void ComputePredictionMaskKernel(
 // larger than the target, or to '-1' if target class in invalid of predictions
 // in a batch have non-finite values.
 struct MaskSum {
-  __host__ __device__ int64 operator()(const int64& a, const int64& b) const {
+  __host__ __device__ int64_t operator()(const int64_t& a,
+                                         const int64_t& b) const {
     if (a < 0 || b < 0)
       return -1;
     else
@@ -77,8 +78,8 @@ struct MaskSum {
 
 namespace reduction_op_helper {
 template <>
-struct IdentityValue<int64, MaskSum> {
-  int64 operator()() { return 0; }
+struct IdentityValue<int64_t, MaskSum> {
+  int64_t operator()() { return 0; }
 };
 
 }  // namespace reduction_op_helper
@@ -138,8 +139,8 @@ struct InTopKFunctor<GPUDevice, T, TargetT> {
       auto in = predictions_mask.matrix<int64_t>();
       auto out = num_larger_prediction.flat<int64_t>();
 
-      ReduceImpl<int64, MaskSum, int64*, int64*, Dims<1>>(
-          context, (int64*)out.data(), (int64*)in.data(), in.rank(),
+      ReduceImpl<int64_t, MaskSum, int64_t*, int64_t*, Dims<1>>(
+          context, (int64_t*)out.data(), (int64_t*)in.data(), in.rank(),
           in.dimension(0), in.rank() >= 2 ? in.dimension(1) : 1,
           in.rank() >= 3 ? in.dimension(2) : 1, out.rank(), Dims<1>(1),
           MaskSum());
@@ -152,8 +153,9 @@ struct InTopKFunctor<GPUDevice, T, TargetT> {
       if (k.k_tensor->dtype() == DT_INT32) {
         output.device(d) =
             (cnt >= cnt.constant(0)) &&
-            (cnt < k.k_tensor->flat<int32>().template cast<int64_t>().broadcast(
-                       Dims<1>(num_targets)));
+            (cnt <
+             k.k_tensor->flat<int32_t>().template cast<int64_t>().broadcast(
+                 Dims<1>(num_targets)));
       } else {
         output.device(d) =
             (cnt >= cnt.constant(0)) &&
diff --git a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
index 001b6a45e35c5d..6ba369ebdb4346 100644
--- a/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/inplace_ops_functor_gpu.cu.cc
@@ -27,13 +27,13 @@ namespace functor {
 typedef Eigen::GpuDevice Device;
 
 template <typename T>
-__global__ void DoParallelConcatOpKernel(int nthreads, const int64 rows,
-                                         const int64 cols, int32 loc,
+__global__ void DoParallelConcatOpKernel(int nthreads, const int64_t rows,
+                                         const int64_t cols, int32_t loc,
                                          const T* __restrict__ src,
                                          T* __restrict__ dst) {
   GPU_1D_KERNEL_LOOP(idx, nthreads) {
-    int64 c = idx % cols;
-    int64 r = (loc % rows + rows) % rows;  // Guard index range.
+    int64_t c = idx % cols;
+    int64_t r = (loc % rows + rows) % rows;  // Guard index range.
     T* p = dst + r * cols + c;
     const T* q = src + idx;
     *p = ldg(q);
@@ -41,24 +41,24 @@ __global__ void DoParallelConcatOpKernel(int nthreads, const int64 rows,
 }
 
 template <typename T>
-Status DoParallelConcatUpdate(const Device& d, const Tensor& value, int32 loc,
-                              Tensor* output) {
-  const int64 nelem = value.NumElements();
+absl::Status DoParallelConcatUpdate(const Device& d, const Tensor& value,
+                                    int32_t loc, Tensor* output) {
+  const int64_t nelem = value.NumElements();
   GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d);
   auto Toutput = output->flat_outer_dims<T>();
-  const int64 nrows = Toutput.dimension(0);
-  const int64 ncols = Toutput.dimension(1);
+  const int64_t nrows = Toutput.dimension(0);
+  const int64_t ncols = Toutput.dimension(1);
   const T* src = value.flat<T>().data();
   T* dst = output->flat<T>().data();
   TF_CHECK_OK(GpuLaunchKernel(
       DoParallelConcatOpKernel<T>, cfg.block_count, cfg.thread_per_block, 0,
       d.stream(), cfg.virtual_thread_count, nrows, ncols, loc, src, dst));
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 template <>
-Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
-                        Tensor* output) {
+absl::Status DoParallelConcat(const Device& d, const Tensor& value, int32_t loc,
+                              Tensor* output) {
   CHECK_EQ(value.dtype(), output->dtype());
   switch (value.dtype()) {
 #define CASE(type)                                              \
@@ -77,18 +77,18 @@ Status DoParallelConcat(const Device& d, const Tensor& value, int32 loc,
       return errors::InvalidArgument("Unsupported data type: ",
                                      DataTypeString(value.dtype()));
   }
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 template <typename T, InplaceOpType op>
-__global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
-                                  const int64 cols, const int64 n,
+__global__ void DoInplaceOpKernel(int nthreads, const int64_t rows,
+                                  const int64_t cols, const int64_t n,
                                   const T* __restrict__ src,
-                                  const int32* __restrict__ rowids,
+                                  const int32_t* __restrict__ rowids,
                                   T* __restrict__ dst) {
   GPU_1D_KERNEL_LOOP(idx, nthreads) {
-    int64 r = idx / cols;
-    int64 c = idx % cols;
+    int64_t r = idx / cols;
+    int64_t c = idx % cols;
     r = (rowids[r] % rows + rows) % rows;  // Guard index range.
     T* p = dst + r * cols + c;
     const T* q = src + idx;
@@ -109,15 +109,15 @@ __global__ void DoInplaceOpKernel(int nthreads, const int64 rows,
 template <typename T>
 void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
                  const Tensor& v, Tensor* y) {
-  const int64 nelem = v.NumElements();
+  const int64_t nelem = v.NumElements();
   GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d);
   auto Ty = y->flat_outer_dims<T>();
-  const int64 nrows = Ty.dimension(0);
-  const int64 ncols = Ty.dimension(1);
-  const int64 n = i.NumElements();
+  const int64_t nrows = Ty.dimension(0);
+  const int64_t ncols = Ty.dimension(1);
+  const int64_t n = i.NumElements();
   const T* src = v.flat<T>().data();
   // TODO(sjhwang): Check that first dimension fits in int32 range.
-  const int32* rowids = i.flat<int32>().data();
+  const int32_t* rowids = i.flat<int32_t>().data();
   T* dst = y->flat<T>().data();
   switch (op) {
     case I_UPDATE:
@@ -144,15 +144,15 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
 template <bool>
 void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
                  const Tensor& v, Tensor* y) {
-  const int64 nelem = v.NumElements();
+  const int64_t nelem = v.NumElements();
   GpuLaunchConfig cfg = GetGpuLaunchConfig(nelem, d);
   auto Ty = y->flat_outer_dims<bool>();
-  const int64 nrows = Ty.dimension(0);
-  const int64 ncols = Ty.dimension(1);
-  const int64 n = i.NumElements();
+  const int64_t nrows = Ty.dimension(0);
+  const int64_t ncols = Ty.dimension(1);
+  const int64_t n = i.NumElements();
   const bool* src = v.flat<bool>().data();
   // TODO(sjhwang): Check that first dimension fits in int32 range.
-  const int32* rowids = i.flat<int32>().data();
+  const int32_t* rowids = i.flat<int32_t>().data();
   bool* dst = y->flat<bool>().data();
   if (op == I_UPDATE) {
     TF_CHECK_OK(GpuLaunchKernel(DoInplaceOpKernel<bool, I_UPDATE>,
@@ -163,8 +163,8 @@ void DoInplaceOp(const Device& d, InplaceOpType op, const Tensor& i,
 }
 
 template <>
-Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
-                 const Tensor& v, Tensor* y) {
+absl::Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
+                       const Tensor& v, Tensor* y) {
   CHECK_EQ(v.dtype(), y->dtype());
   switch (v.dtype()) {
 #define CASE(type)                     \
@@ -186,11 +186,11 @@ Status DoInplace(const Device& d, InplaceOpType op, const Tensor& i,
       return errors::InvalidArgument("Unsupported data type from DoInplace: ",
                                      DataTypeString(v.dtype()));
   }
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 template <>
-Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
+absl::Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
   CHECK_EQ(x.dtype(), y->dtype());
   switch (x.dtype()) {
 #define CASE(type)                              \
@@ -214,7 +214,7 @@ Status DoCopy(const Device& d, const Tensor& x, Tensor* y) {
       return errors::InvalidArgument("Unsupported dtype from DoCopy: ",
                                      DataTypeString(x.dtype()));
   }
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 }  // end namespace functor

From 2d5603d02e83b200537b9328d8edef9b945f6e70 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:42:44 -0800
Subject: [PATCH 624/753] Automated Code Change

PiperOrigin-RevId: 847189107
---
 .../core/kernels/sparse/kernels_gpu.cu.cc     | 68 ++++++++++---------
 tensorflow/core/kernels/sparse/mat_mul_op.h   | 26 +++----
 tensorflow/core/kernels/sparse/mul_op.cc      |  6 +-
 3 files changed, 51 insertions(+), 49 deletions(-)

diff --git a/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc b/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc
index 3427538ff98ba4..3c1c79a5f02d6b 100644
--- a/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc
+++ b/tensorflow/core/kernels/sparse/kernels_gpu.cu.cc
@@ -37,22 +37,22 @@ namespace functor {
 
 namespace {
 struct StridedDataReader {
-  StridedDataReader(const int64* begin, int stride)
+  StridedDataReader(const int64_t* begin, int stride)
       : begin_(begin), stride_(stride) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const {
     return static_cast<int>(ldg(begin_ + idx * stride_));
   }
 
-  const int64* begin_;
+  const int64_t* begin_;
   const int stride_;
 };
 }  // namespace
 
 template <>
-Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
+absl::Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
     OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
-    TTypes<int32>::Vec nnz_per_batch) {
+    TTypes<int32_t>::Vec nnz_per_batch) {
   const auto& cu_stream = GetGpuStream(c);
 
   const int total_nnz = indices.dimension(0);
@@ -96,9 +96,9 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
   TF_RETURN_IF_ERROR(c->allocate_temp(
       DT_INT8, TensorShape({static_cast<int64_t>(temp_storage_bytes)}),
       &temp_storage));
-  DCHECK_NE(temp_storage.flat<int8>().data(), nullptr);
+  DCHECK_NE(temp_storage.flat<int8_t>().data(), nullptr);
   auto second_success = gpuprim::DeviceHistogram::HistogramEven(
-      /*d_temp_storage*/ temp_storage.flat<int8>().data(),
+      /*d_temp_storage*/ temp_storage.flat<int8_t>().data(),
       /*temp_storage_bytes&*/ temp_storage_bytes,
       /*d_samples*/ indices_first_column,
       /*d_histogram*/ nnz_per_batch.data(),
@@ -116,13 +116,13 @@ Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
         temp_storage_bytes, ", status: ", GpuGetErrorString(second_success));
   }
 
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 // TODO(ebrevdo): Write a custom batch-friendly impl of this to update
 // the SparseTensor indices directly.
 template <>
-Status CSRSparseMatrixToCOOSparseMatrix<GPUDevice>::operator()(
+absl::Status CSRSparseMatrixToCOOSparseMatrix<GPUDevice>::operator()(
     OpKernelContext* c, TTypes<const int>::UnalignedVec csr_row_ptr,
     TTypes<int>::UnalignedVec coo_row_ind) {
   GpuSparse gpu_sparse(c);
@@ -133,7 +133,7 @@ Status CSRSparseMatrixToCOOSparseMatrix<GPUDevice>::operator()(
 }
 
 template <int stride>
-__global__ void SparseTensorToCOOMatrixKernel(const int64* indices,
+__global__ void SparseTensorToCOOMatrixKernel(const int64_t* indices,
                                               int* coo_rows_out,
                                               int* coo_cols_out, int size) {
   const int offset = (stride == 3) ? 1 : 0;
@@ -168,7 +168,8 @@ void SparseTensorToCOOSparseMatrix<GPUDevice>::operator()(
 
 __global__ void COOMatrixToSparseTensorKernel2D(const int* coo_rows,
                                                 const int* coo_cols,
-                                                int64* indices_out, int size) {
+                                                int64_t* indices_out,
+                                                int size) {
   GPU_1D_KERNEL_LOOP(i, size) {
     indices_out[i * 2] = static_cast<int64_t>(ldg(coo_rows + i));
     indices_out[i * 2 + 1] = static_cast<int64_t>(ldg(coo_cols + i));
@@ -191,7 +192,7 @@ __device__ inline int BinarySearchRange(int* range, int n, int x) {
 }
 
 __global__ void COOMatrixToSparseTensorKernel3D(
-    const int* coo_rows, const int* coo_cols, int64* indices_out,
+    const int* coo_rows, const int* coo_cols, int64_t* indices_out,
     GpuDeviceArrayStruct<int> batch_ptr_s, const int batch_size,
     const int size) {
   // Step 1: access the batch ptrs and copy to shared memory.
@@ -214,7 +215,7 @@ __global__ void COOMatrixToSparseTensorKernel3D(
 }
 
 template <>
-Status COOSparseMatrixToSparseTensor<GPUDevice>::operator()(
+absl::Status COOSparseMatrixToSparseTensor<GPUDevice>::operator()(
     OpKernelContext* c, TTypes<int64_t>::ConstVec host_dense_shape,
     TTypes<int>::ConstVec host_batch_ptr, TTypes<int>::Vec coo_row_ind,
     TTypes<int>::ConstVec coo_col_ind, TTypes<int64_t>::Matrix indices) {
@@ -234,7 +235,7 @@ Status COOSparseMatrixToSparseTensor<GPUDevice>::operator()(
                                 config.block_count, config.thread_per_block, 0,
                                 d.stream(), coo_row_ind.data(),
                                 coo_col_ind.data(), indices.data(), size));
-    return OkStatus();
+    return absl::OkStatus();
   } else {
     const int batch_size = host_dense_shape(0);
     GpuDeviceArrayOnHost<int> batch_ptr_copy(c, host_batch_ptr.size());
@@ -251,7 +252,7 @@ Status COOSparseMatrixToSparseTensor<GPUDevice>::operator()(
                         config.thread_per_block, shared_memory_size, d.stream(),
                         coo_row_ind.data(), coo_col_ind.data(), indices.data(),
                         batch_ptr_copy.data(), batch_size, size));
-    return OkStatus();
+    return absl::OkStatus();
   }
 }
 
@@ -281,10 +282,10 @@ __global__ void CSRSparseMatrixBatchMulVecKernel3D(
 }
 
 template <typename T>
-Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx,
-                                      const CSRSparseMatrix& a,
-                                      typename TTypes<T>::ConstFlat b,
-                                      CSRSparseMatrix* c) {
+absl::Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx,
+                                            const CSRSparseMatrix& a,
+                                            typename TTypes<T>::ConstFlat b,
+                                            CSRSparseMatrix* c) {
   DCHECK_EQ(a.dims(), 3);
   const int total_nnz = a.total_nnz();
   Tensor c_values_t;
@@ -321,7 +322,7 @@ Status CSRSparseMatrixBatchMulVecImpl(OpKernelContext* ctx,
       config.thread_per_block, shared_memory_size, d.stream(), a_values.data(),
       b.data(), c_values.data(), batch_ptr_copy.data(), batch_size, total_nnz));
 
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 #define DEFINE_SPARSE_MUL_VEC_GPU(T)                                        \
@@ -416,12 +417,12 @@ __global__ void CSRSparseMatrixSoftmaxKernel3D(
 }
 
 template <typename T>
-Status CSRSparseMatrixSoftmaxGPUImpl(OpKernelContext* ctx,
-                                     const CSRSparseMatrix& logits,
-                                     typename TTypes<T>::Vec softmax_values) {
+absl::Status CSRSparseMatrixSoftmaxGPUImpl(
+    OpKernelContext* ctx, const CSRSparseMatrix& logits,
+    typename TTypes<T>::Vec softmax_values) {
   auto host_dense_shape = logits.dense_shape().vec<int64_t>();
-  auto host_batch_ptr = logits.batch_pointers().vec<int32>();
-  auto row_ptr = logits.row_pointers().vec<int32>();
+  auto host_batch_ptr = logits.batch_pointers().vec<int32_t>();
+  auto row_ptr = logits.row_pointers().vec<int32_t>();
   auto logits_values = logits.values().vec<T>();
 
   const int ndims = host_dense_shape.size();
@@ -459,7 +460,7 @@ Status CSRSparseMatrixSoftmaxGPUImpl(OpKernelContext* ctx,
                                 logits_values.data(), softmax_values.data()));
   }
 
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 #define DEFINE_SOFTMAX_GPU(T)                                             \
@@ -604,18 +605,19 @@ __global__ void CSRSparseMatrixSoftmaxGradKernel3D(
 }
 
 template <typename T>
-Status CSRSparseMatrixSoftmaxGradGPUImpl(
+absl::Status CSRSparseMatrixSoftmaxGradGPUImpl(
     OpKernelContext* ctx, const CSRSparseMatrix& softmax,
     const CSRSparseMatrix& grad_softmax,
     typename TTypes<T>::Vec gradient_values) {
   auto host_dense_shape = softmax.dense_shape().vec<int64_t>();
-  auto softmax_host_batch_ptr = softmax.batch_pointers().vec<int32>();
-  auto softmax_row_ptr = softmax.row_pointers().vec<int32>();
-  auto softmax_col_ind = softmax.col_indices().vec<int32>();
+  auto softmax_host_batch_ptr = softmax.batch_pointers().vec<int32_t>();
+  auto softmax_row_ptr = softmax.row_pointers().vec<int32_t>();
+  auto softmax_col_ind = softmax.col_indices().vec<int32_t>();
   auto softmax_values = softmax.values().vec<T>();
-  auto grad_softmax_host_batch_ptr = grad_softmax.batch_pointers().vec<int32>();
-  auto grad_softmax_row_ptr = grad_softmax.row_pointers().vec<int32>();
-  auto grad_softmax_col_ind = grad_softmax.col_indices().vec<int32>();
+  auto grad_softmax_host_batch_ptr =
+      grad_softmax.batch_pointers().vec<int32_t>();
+  auto grad_softmax_row_ptr = grad_softmax.row_pointers().vec<int32_t>();
+  auto grad_softmax_col_ind = grad_softmax.col_indices().vec<int32_t>();
   auto grad_softmax_values = grad_softmax.values().vec<T>();
 
   const int ndims = host_dense_shape.size();
@@ -666,7 +668,7 @@ Status CSRSparseMatrixSoftmaxGradGPUImpl(
         grad_softmax_values.data(), gradient_values.data()));
   }
 
-  return OkStatus();
+  return absl::OkStatus();
 }
 
 #define DEFINE_SOFTMAX_GRAD_GPU(T)                                          \
diff --git a/tensorflow/core/kernels/sparse/mat_mul_op.h b/tensorflow/core/kernels/sparse/mat_mul_op.h
index 3e55cfbc38f201..5c9bfd8a805a54 100644
--- a/tensorflow/core/kernels/sparse/mat_mul_op.h
+++ b/tensorflow/core/kernels/sparse/mat_mul_op.h
@@ -276,7 +276,7 @@ class CSRMatMulCPUOp : public CSRMatMulOp<CPUDevice, T> {
   Eigen::Ref<const SparseMatrix> GetSparseMatrixRef(
       const CSRSparseMatrix& csr_matrix, const int batch_index,
       const int64_t row_begin, const int64_t num_shard_rows,
-      std::vector<int32>* row_ptrs) {
+      std::vector<int32_t>* row_ptrs) {
     // Compute the row pointers of the sparse sub-matrix.
     row_ptrs->resize(num_shard_rows + 1);
     const int64_t row_offset =
@@ -325,7 +325,7 @@ class CSRMatMulCPUOp : public CSRMatMulOp<CPUDevice, T> {
 
                 // Define an Eigen::SparseMatrix over the row range:
                 // [row_begin, row_end) of the CSR SparseMatrix A.
-                std::vector<int32> row_ptrs;
+                std::vector<int32_t> row_ptrs;
                 auto sparse_matrix = GetSparseMatrixRef(
                     lhs, batch_idx, row_begin, num_shard_rows, &row_ptrs);
 
@@ -396,7 +396,7 @@ class CSRMatMulCPUOp : public CSRMatMulOp<CPUDevice, T> {
 
                 // Define a new sparse sub-matrix from the row range
                 // [row_begin, row_end) of the sparse matrix A.
-                std::vector<int32> row_ptrs;
+                std::vector<int32_t> row_ptrs;
                 auto sparse_matrix = GetSparseMatrixRef(
                     lhs, batch_idx, row_begin, num_shard_rows, &row_ptrs);
 
@@ -773,9 +773,9 @@ class CSRSparseMatrixMatMul<GPUDevice, T> {
   explicit CSRSparseMatrixMatMul(const bool transpose_output)
       : transpose_output_(transpose_output) {}
 
-  Status Compute(OpKernelContext* ctx, const ConstCSRComponent<T>& a,
-                 typename TTypes<T>::UnalignedConstMatrix b,
-                 typename TTypes<T>::UnalignedMatrix c) {
+  absl::Status Compute(OpKernelContext* ctx, const ConstCSRComponent<T>& a,
+                       typename TTypes<T>::UnalignedConstMatrix b,
+                       typename TTypes<T>::UnalignedMatrix c) {
     GpuSparse cuda_sparse(ctx);
     TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
     {
@@ -859,11 +859,11 @@ class CSRSparseMatrixMatMul<GPUDevice, T> {
       Tensor buffer;
       TF_RETURN_IF_ERROR(ctx->allocate_temp(
           DT_INT8, TensorShape({static_cast<int64_t>(bufferSize)}), &buffer));
-      DCHECK(buffer.flat<int8>().data() != nullptr);
+      DCHECK(buffer.flat<int8_t>().data() != nullptr);
 
       TF_RETURN_IF_ERROR(cuda_sparse.SpMM(transA, transB, &alpha, matA, matB,
                                           &beta, matC, algo,
-                                          buffer.flat<int8>().data()));
+                                          buffer.flat<int8_t>().data()));
 
       TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroyDnMat(matB));
       TF_RETURN_IF_GPUSPARSE_ERROR(cusparseDestroyDnMat(matC));
@@ -940,7 +940,7 @@ class CSRSparseMatrixMatMul<GPUDevice, T> {
 #endif  // GOOGLE_CUDA && CUDA_VERSION >= 10020
     }
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 
  private:
@@ -954,8 +954,8 @@ class CSRSparseMatrixMatVec<GPUDevice, T> {
       : transA_(TransposeAndConjugateToGpuSparseOp(transpose_a, conjugate_a,
                                                    &status_)) {}
 
-  Status Compute(OpKernelContext* ctx, const ConstCSRComponent<T>& a,
-                 const T* x, T* y) {
+  absl::Status Compute(OpKernelContext* ctx, const ConstCSRComponent<T>& a,
+                       const T* x, T* y) {
     TF_RETURN_IF_ERROR(status_);
     GpuSparse cuda_sparse(ctx);
     TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
@@ -1001,11 +1001,11 @@ class CSRSparseMatrixMatVec<GPUDevice, T> {
 #endif
     }
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 
  private:
-  Status status_;
+  absl::Status status_;
   const gpusparseOperation_t transA_;
 };
 
diff --git a/tensorflow/core/kernels/sparse/mul_op.cc b/tensorflow/core/kernels/sparse/mul_op.cc
index 37ce9a6feb51bf..1a68bcc34e9143 100644
--- a/tensorflow/core/kernels/sparse/mul_op.cc
+++ b/tensorflow/core/kernels/sparse/mul_op.cc
@@ -125,8 +125,8 @@ class CSRSparseMatrixMulScalar<GPUDevice, T> {
  public:
   explicit CSRSparseMatrixMulScalar() {}
 
-  Status Compute(OpKernelContext* ctx, const CSRSparseMatrix& a,
-                 typename TTypes<T>::ConstScalar b, CSRSparseMatrix* c) {
+  absl::Status Compute(OpKernelContext* ctx, const CSRSparseMatrix& a,
+                       typename TTypes<T>::ConstScalar b, CSRSparseMatrix* c) {
     const int total_nnz = a.total_nnz();
     Tensor c_values_t;
     TF_RETURN_IF_ERROR(ctx->allocate_temp(
@@ -146,7 +146,7 @@ class CSRSparseMatrixMulScalar<GPUDevice, T> {
     functor::BinaryFunctor<GPUDevice, functor::mul<T>, 1>().Right(
         d, c_values, a_values, b, error_ptr);
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 

From 3c431ffe3d29c9d928e9644a15ee8f89ba385d76 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:44:41 -0800
Subject: [PATCH 625/753] Automated Code Change

PiperOrigin-RevId: 847189510
---
 .../core/kernels/data/finalize_dataset_op_test.cc     |  7 ++++---
 tensorflow/core/kernels/data/iterator_ops.cc          | 11 ++++++-----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/kernels/data/finalize_dataset_op_test.cc b/tensorflow/core/kernels/data/finalize_dataset_op_test.cc
index 2077cc28c161ec..c076e2dcc4dc77 100644
--- a/tensorflow/core/kernels/data/finalize_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/finalize_dataset_op_test.cc
@@ -31,7 +31,7 @@ class FinalizeDatasetParams : public DatasetParams {
   template <typename T>
   FinalizeDatasetParams(T input_dataset_params, DataTypeVector output_dtypes,
                         std::vector<PartialTensorShape> output_shapes,
-                        string node_name)
+                        std::string node_name)
       : DatasetParams(std::move(output_dtypes), std::move(output_shapes),
                       std::move(node_name)),
         has_captured_ref_(false) {
@@ -40,7 +40,8 @@ class FinalizeDatasetParams : public DatasetParams {
 
   std::vector<Tensor> GetInputTensors() const override { return {}; }
 
-  absl::Status GetInputNames(std::vector<string>* input_names) const override {
+  absl::Status GetInputNames(
+      std::vector<std::string>* input_names) const override {
     input_names->emplace_back(FinalizeDatasetOp::kInputDataset);
     return absl::OkStatus();
   }
@@ -52,7 +53,7 @@ class FinalizeDatasetParams : public DatasetParams {
     return absl::OkStatus();
   }
 
-  string dataset_type() const override { return "Finalize"; }
+  std::string dataset_type() const override { return "Finalize"; }
 
  private:
   bool has_captured_ref_;
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index d10513763fa726..a4a3bb4c77afeb 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -583,7 +583,7 @@ AnonymousIteratorHandleOp::AnonymousIteratorHandleOp(
   OP_REQUIRES_OK(context, context->GetAttr(kOutputShapes, &output_shapes_));
 }
 
-string AnonymousIteratorHandleOp::name() { return kAnonymousIterator; }
+std::string AnonymousIteratorHandleOp::name() { return kAnonymousIterator; }
 
 absl::Status AnonymousIteratorHandleOp::CreateResource(
     OpKernelContext* ctx, std::unique_ptr<FunctionLibraryDefinition> flib_def,
@@ -725,7 +725,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
         graph_def_version_(ctx->graph_def_version())
 
   {
-    string shared_name;
+    std::string shared_name;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &shared_name));
     OP_REQUIRES(ctx, shared_name.empty(),
                 errors::InvalidArgument("OneShotIteratorOp does not currently "
@@ -837,9 +837,10 @@ class OneShotIteratorOp : public AsyncOpKernel {
         &f_handle));
     FunctionLibraryRuntime::Options opts;
     opts.cancellation_manager = ctx->cancellation_manager();
-    ScopedStepContainer step_container(opts.step_id, [ctx](const string& name) {
-      ctx->resource_manager()->Cleanup(name).IgnoreError();
-    });
+    ScopedStepContainer step_container(
+        opts.step_id, [ctx](const std::string& name) {
+          ctx->resource_manager()->Cleanup(name).IgnoreError();
+        });
     opts.step_container = &step_container;
     opts.runner = ctx->runner();
     opts.run_all_kernels_inline = ctx->run_all_kernels_inline();

From 818e61079e341f265462be2e8634b02141ef84a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:44:45 -0800
Subject: [PATCH 626/753] Automated Code Change

PiperOrigin-RevId: 847189518
---
 .../gpu/gpu_debug_allocator_test.cc           |  25 ++--
 .../core/common_runtime/gpu/gpu_device.h      | 111 +++++++++---------
 .../common_runtime/gpu/gpu_device_factory.cc  |  30 ++---
 .../common_runtime/gpu/gpu_device_test.cc     |  48 ++++----
 .../common_runtime/gpu/gpu_process_state.cc   |  12 +-
 .../common_runtime/gpu/pool_allocator_test.cc |  20 ++--
 6 files changed, 126 insertions(+), 120 deletions(-)

diff --git a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
index 573e42fea61860..1d252f549d3803 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_debug_allocator_test.cc
@@ -57,7 +57,8 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_None) {
     memset(&cpu_array[0], 0, cpu_array.size() * sizeof(int64_t));
     int64_t* gpu_array =
         TypedAllocator::Allocate<int64_t>(&a, cpu_array.size(), {});
-    se::DeviceMemory<int64_t> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
+    stream_executor::DeviceAddress<int64_t> gpu_array_ptr{
+        stream_executor::DeviceAddressBase{gpu_array}};
     TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(
         &cpu_array[0], s * sizeof(int64_t), &gpu_array_ptr));
     EXPECT_TRUE(a.CheckHeader(gpu_array));
@@ -85,14 +86,14 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Header) {
           int64_t* gpu_array =
               TypedAllocator::Allocate<int64_t>(&a, cpu_array.size(), {});
 
-          se::DeviceMemory<int64_t> gpu_array_ptr{
-              se::DeviceMemoryBase{gpu_array}};
+          stream_executor::DeviceAddress<int64_t> gpu_array_ptr{
+              stream_executor::DeviceAddressBase{gpu_array}};
           TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(
               &cpu_array[0], cpu_array.size() * sizeof(int64_t),
               &gpu_array_ptr));
 
-          se::DeviceMemory<int64_t> gpu_hdr_ptr{
-              se::DeviceMemoryBase{gpu_array - 1}};
+          stream_executor::DeviceAddress<int64_t> gpu_hdr_ptr{
+              stream_executor::DeviceAddressBase{gpu_array - 1}};
           // Clobber first word of the header.
           float pi = 3.1417;
           TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(&pi, sizeof(float),
@@ -122,15 +123,15 @@ TEST(GPUDebugAllocatorTest, OverwriteDetection_Footer) {
           int64_t* gpu_array =
               TypedAllocator::Allocate<int64_t>(&a, cpu_array.size(), {});
 
-          se::DeviceMemory<int64_t> gpu_array_ptr{
-              se::DeviceMemoryBase{gpu_array}};
+          stream_executor::DeviceAddress<int64_t> gpu_array_ptr{
+              stream_executor::DeviceAddressBase{gpu_array}};
           TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(
               &cpu_array[0], cpu_array.size() * sizeof(int64_t),
               &gpu_array_ptr));
 
           // Clobber word of the footer.
-          se::DeviceMemory<int64_t> gpu_ftr_ptr{
-              se::DeviceMemoryBase{gpu_array + s}};
+          stream_executor::DeviceAddress<int64_t> gpu_ftr_ptr{
+              stream_executor::DeviceAddressBase{gpu_array + s}};
           float pi = 3.1417;
           TF_CHECK_OK(stream_exec->SynchronousMemcpyH2D(&pi, sizeof(float),
                                                         &gpu_ftr_ptr));
@@ -156,7 +157,8 @@ TEST(GPUDebugAllocatorTest, ResetToNan) {
 
   // Allocate 1024 floats
   float* gpu_array = TypedAllocator::Allocate<float>(&a, cpu_array.size(), {});
-  se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
+  stream_executor::DeviceAddress<float> gpu_array_ptr{
+      stream_executor::DeviceAddressBase{gpu_array}};
   TF_CHECK_OK(stream_exec->SynchronousMemcpyD2H(
       gpu_array_ptr, cpu_array.size() * sizeof(float), &cpu_array[0]));
   for (float f : cpu_array) {
@@ -200,7 +202,8 @@ TEST(GPUDebugAllocatorTest, ResetToNanWithHeaderFooter) {
 
   // Allocate 1024 floats
   float* gpu_array = TypedAllocator::Allocate<float>(&a, cpu_array.size(), {});
-  se::DeviceMemory<float> gpu_array_ptr{se::DeviceMemoryBase{gpu_array}};
+  stream_executor::DeviceAddress<float> gpu_array_ptr{
+      stream_executor::DeviceAddressBase{gpu_array}};
   TF_CHECK_OK(stream_exec->SynchronousMemcpyD2H(
       gpu_array_ptr, cpu_array.size() * sizeof(float), &cpu_array[0]));
   for (float f : cpu_array) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index d09cdc2fb2c0f4..441715bd2d22cb 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -105,28 +105,28 @@ class BaseGPUDevice : public LocalDevice {
 #endif
     se::Stream* host_to_device = nullptr;
     se::Stream* device_to_host = nullptr;
-    gtl::InlinedVector<se::Stream*, 4> device_to_device;
+    absl::InlinedVector<stream_executor::Stream*, 4UL> device_to_device;
     int priority = 0;
   };
 
   // Initialize the device and return the status of initialization.
 #ifdef TF_GPU_USE_PJRT
-  Status Init(const SessionOptions& options,
-              xla::LocalDeviceState* xla_local_device_state);
+  absl::Status Init(const SessionOptions& options,
+                    xla::LocalDeviceState* xla_local_device_state);
 #else
-  Status Init(const SessionOptions& options);
+  absl::Status Init(const SessionOptions& options);
 #endif  // TF_GPU_USE_PJRT
 
   void Compute(OpKernel* op_kernel, OpKernelContext* context) override;
 
-  Status Sync() override;
+  absl::Status Sync() override;
 
   void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                     AsyncOpKernel::DoneCallback done) override;
 
-  Status MakeTensorFromProto(const TensorProto& tensor_proto,
-                             AllocatorAttributes alloc_attrs,
-                             Tensor* tensor) override;
+  absl::Status MakeTensorFromProto(const TensorProto& tensor_proto,
+                                   AllocatorAttributes alloc_attrs,
+                                   Tensor* tensor) override;
 
   void CopyTensorInSameDevice(const Tensor* input_tensor, Tensor* output_tensor,
                               const DeviceContext* device_context,
@@ -135,9 +135,9 @@ class BaseGPUDevice : public LocalDevice {
   // The caller owns the returned device.
   PerOpGpuDevice* MakeGpuDevice() override;
 
-  Status ReinitializeGpuDevice(OpKernelContext* context, PerOpGpuDevice* device,
-                               DeviceContext* dc,
-                               Allocator* allocator) override;
+  absl::Status ReinitializeGpuDevice(OpKernelContext* context,
+                                     PerOpGpuDevice* device, DeviceContext* dc,
+                                     Allocator* allocator) override;
 
   // Returns the platform GPU id of this device within the native driver system;
   // e.g., for CUDA and ROCm this is the ordinal of the GPU within the system.
@@ -164,7 +164,7 @@ class BaseGPUDevice : public LocalDevice {
 
   // If returned value is > 0 then GPU Memory chunks freed before this count
   // are guaranteed not to be in use by any kernel pending on this device.
-  uint64 SafeAllocFrontier(uint64 old_value) override;
+  uint64_t SafeAllocFrontier(uint64_t old_value) override;
 
   // Returns the number of kernels that have been queued for execution on
   // the compute stream and are not yet known to have completed.
@@ -216,13 +216,13 @@ class BaseGPUDevice : public LocalDevice {
   EventMgr* em_ = nullptr;
   std::unique_ptr<thread::ThreadPool> thread_pool_;
   std::unique_ptr<GPUKernelTracker> kernel_tracker_;
-  int32 pending_cap_ = 0;
+  int32_t pending_cap_ = 0;
   bool timestamped_allocator_ = false;
   NodeFileWriter* node_file_writer_ = nullptr;  // not owned
   const GPUOptions::Experimental::StreamMergeOptions stream_merge_options_;
 
   // Initialize scratch buffers used by Eigen.
-  Status InitScratchBuffers();
+  absl::Status InitScratchBuffers();
 
   void ReinitializeDevice(OpKernelContext* context, PerOpGpuDevice* device,
                           int stream_id, Allocator* allocator);
@@ -235,9 +235,9 @@ class BaseGPUDevice : public LocalDevice {
   // allocate memory or if the tensor "from" is not DMA-copyable.
   // If there is no error prior to enqueueing the copy, an OK status
   // is returned.
-  Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs,
-                              const Tensor& from, Tensor* to,
-                              StatusCallback done);
+  absl::Status MaybeCopyTensorToGPU(const AllocatorAttributes& alloc_attrs,
+                                    const Tensor& from, Tensor* to,
+                                    StatusCallback done);
 
   Tensor CopyGpuTensorToHostDebugOnly(const Tensor& gpu_tensor);
   void LogInputs(OpKernel* op_kernel, OpKernelContext* context);
@@ -293,25 +293,25 @@ class GPUKernelTracker {
   // Determine whether a GPU kernel should have a recording event queued
   // immediately afterwards.  If so, advance the counter and return the new
   // counter value after enqueuing.
-  uint64 MaybeQueue(OpKernelContext* ctx);
+  uint64_t MaybeQueue(OpKernelContext* ctx);
 
   // Record that a GPU kernel has just been enqueued on the compute stream.
   // Inserts the supplied counter value in a new PendingKernel record appended
   // to the end of the ring buffer then returns that same count.
   // Caller is responsible for ensuring that RecordTerminate() is eventually
   // called with the same counter value.
-  void RecordQueued(uint64 queued_count, int weight)
+  void RecordQueued(uint64_t queued_count, int weight)
       TF_EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
   // Takes a count value returned by RecordQueued and finds the corresponding
   // PendingKernel record in the ring buffer.  Marks the kernel as completed and
   // advances the completion frontier accordingly.
-  void RecordTerminated(uint64 queued_count);
+  void RecordTerminated(uint64_t queued_count);
 
   // Returns the largest timing count such that all kernels queued no
   // later than that count are known to have terminated.
-  inline uint64 LastTerminatedCount(uint64 old_value) {
-    uint64 new_value = last_terminated_count_.load(std::memory_order_relaxed);
+  inline uint64_t LastTerminatedCount(uint64_t old_value) {
+    uint64_t new_value = last_terminated_count_.load(std::memory_order_relaxed);
     if (new_value == old_value) {
       MaybeQueueProgressEvent();
     }
@@ -344,22 +344,22 @@ class GPUKernelTracker {
   std::unique_ptr<SharedCounter> owned_counter_;
   Allocator* allocator_ = nullptr;
   EventMgr* em_ = nullptr;
-  std::atomic<uint64> last_terminated_count_ = {1};
+  std::atomic<uint64_t> last_terminated_count_ = {1};
 
   void MaybeQueueProgressEvent();
 
   // Records when a kernel was queued for execution.  Kernel launches are
   // identified by a unique count value from a per-GPU device timing counter.
   struct PendingKernel {
-    uint64 queued_count;
+    uint64_t queued_count;
     int weight;
     bool terminated;
     PendingKernel(const PendingKernel& pk) = default;
     PendingKernel() : queued_count(0), weight(0), terminated(false) {}
   };
   mutex mu_;
-  int32 mem_since_last_ TF_GUARDED_BY(mu_);
-  int32 ops_since_last_ TF_GUARDED_BY(mu_);
+  int32_t mem_since_last_ TF_GUARDED_BY(mu_);
+  int32_t ops_since_last_ TF_GUARDED_BY(mu_);
   // Ring buffer of PendingKernel records.
   std::vector<PendingKernel> pending_kernels_ TF_GUARDED_BY(mu_);
   // Next unused slot in pending_kernels_.
@@ -376,12 +376,13 @@ class GPUKernelTracker {
 
 class BaseGPUDeviceFactory : public DeviceFactory {
  public:
-  Status ListPhysicalDevices(std::vector<string>* devices) override;
-  Status CreateDevices(const SessionOptions& options,
-                       const std::string& name_prefix,
-                       std::vector<std::unique_ptr<Device>>* devices) override;
-  Status GetDeviceDetails(int device_index,
-                          std::unordered_map<string, string>* details) override;
+  absl::Status ListPhysicalDevices(std::vector<std::string>* devices) override;
+  absl::Status CreateDevices(
+      const SessionOptions& options, const std::string& name_prefix,
+      std::vector<std::unique_ptr<Device>>* devices) override;
+  absl::Status GetDeviceDetails(
+      int device_index,
+      std::unordered_map<std::string, std::string>* details) override;
 
   struct InterconnectMap {
     // Name of interconnect technology, if known.
@@ -390,7 +391,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
     // Where architecture-specific subclassing is not done that won't
     // always be possible.  The minimum expectation is that
     // faster links should have a higher value than slower links.
-    int32 strength;
+    int32_t strength;
     static const int kSameDeviceStrength;
     static const int kStreamExecutorStrength;
     std::set<std::pair<tsl::PlatformDeviceId, tsl::PlatformDeviceId>>
@@ -400,7 +401,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
  protected:
   // Populates *maps with interconnect maps for all local direct access
   // pathways between GPUs.
-  virtual Status GetInterconnectMaps(
+  virtual absl::Status GetInterconnectMaps(
       const std::vector<tsl::PlatformDeviceId>& visible_gpu_order,
       se::Platform* gpu_manager, std::vector<InterconnectMap>* maps);
 
@@ -413,7 +414,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
       LocalityMap;
   // Populates *localities with the DeviceLocality descriptor for
   // every TfDeviceId.
-  virtual Status GetDeviceLocalities(
+  virtual absl::Status GetDeviceLocalities(
       int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
       LocalityMap* localities);
 
@@ -422,29 +423,29 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // 'devices' vector. The 'gpu_allocator' is created by the caller and usually
   // preallocates a set amount of GPU memory.
 #ifdef TF_GPU_USE_PJRT
-  Status CreateGPUDevice(const SessionOptions& options,
-                         const std::string& name_prefix,
-                         tsl::TfDeviceId tf_device_id,
-                         const DeviceLocality& dev_locality,
-                         xla::LocalDeviceState* xla_local_device_state,
-                         Allocator* gpu_allocator,
-                         std::vector<std::unique_ptr<Device>>* devices);
+  absl::Status CreateGPUDevice(const SessionOptions& options,
+                               const std::string& name_prefix,
+                               tsl::TfDeviceId tf_device_id,
+                               const DeviceLocality& dev_locality,
+                               xla::LocalDeviceState* xla_local_device_state,
+                               Allocator* gpu_allocator,
+                               std::vector<std::unique_ptr<Device>>* devices);
 #else
-  Status CreateGPUDevice(const SessionOptions& options,
-                         const std::string& name_prefix,
-                         tsl::TfDeviceId tf_device_id,
-                         const DeviceLocality& dev_locality,
-                         Allocator* gpu_allocator,
-                         std::vector<std::unique_ptr<Device>>* devices);
+  absl::Status CreateGPUDevice(const SessionOptions& options,
+                               const std::string& name_prefix,
+                               tsl::TfDeviceId tf_device_id,
+                               const DeviceLocality& dev_locality,
+                               Allocator* gpu_allocator,
+                               std::vector<std::unique_ptr<Device>>* devices);
 #endif  // TF_GPU_USE_PJRT
 
   virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
-      const SessionOptions& options, const string& name, Bytes memory_limit,
-      const DeviceLocality& dev_locality, tsl::TfDeviceId tf_device_id,
-      const string& physical_device_desc, Allocator* gpu_allocator,
-      Allocator* cpu_allocator) = 0;
+      const SessionOptions& options, const std::string& name,
+      Bytes memory_limit, const DeviceLocality& dev_locality,
+      tsl::TfDeviceId tf_device_id, const std::string& physical_device_desc,
+      Allocator* gpu_allocator, Allocator* cpu_allocator) = 0;
 
-  Status EnablePeerAccess(
+  absl::Status EnablePeerAccess(
       const std::vector<tsl::PlatformDeviceId>& visible_gpu_order);
 
   // Returns into 'ids' the list of valid platform GPU ids, in the order that
@@ -452,7 +453,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // based upon 'visible_gpu_order' which was generated by parsing
   // GPUOptions::visible_device_list which is a comma-separated list of CUDA or
   // ROCm GPU ids.
-  Status GetValidDeviceIds(
+  absl::Status GetValidDeviceIds(
       const std::vector<tsl::PlatformDeviceId>& visible_gpu_order,
       std::vector<tsl::PlatformDeviceId>* ids);
 
@@ -460,7 +461,7 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   // field cached_device_ids_. Passes {0, 1, ..., num_devices-1} to
   // GetValidDeviceIds, so this should only be used in functions where all
   // devices should be treated as visible, like ListPhysicalDevices.
-  Status CacheDeviceIds();
+  absl::Status CacheDeviceIds();
 
   // visible_gpu_initialized_[platform_device_id] is true if visible GPU
   // platform_device_id has been initialized by the process.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index d5b9c127351a36..2848cf5d16d91d 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -28,10 +28,11 @@ namespace tensorflow {
 
 class GPUDevice : public BaseGPUDevice {
  public:
-  GPUDevice(const SessionOptions& options, const string& name,
+  GPUDevice(const SessionOptions& options, const std::string& name,
             Bytes memory_limit, const DeviceLocality& locality,
-            tsl::TfDeviceId tf_device_id, const string& physical_device_desc,
-            Allocator* gpu_allocator, Allocator* cpu_allocator)
+            tsl::TfDeviceId tf_device_id,
+            const std::string& physical_device_desc, Allocator* gpu_allocator,
+            Allocator* cpu_allocator)
       : BaseGPUDevice(options, name, memory_limit, locality, tf_device_id,
                       physical_device_desc, gpu_allocator, cpu_allocator,
                       false /* sync every op */),
@@ -64,10 +65,10 @@ class GPUDevice : public BaseGPUDevice {
 class GPUDeviceFactory : public BaseGPUDeviceFactory {
  private:
   std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
-      const SessionOptions& options, const string& name, Bytes memory_limit,
-      const DeviceLocality& locality, tsl::TfDeviceId tf_device_id,
-      const string& physical_device_desc, Allocator* gpu_allocator,
-      Allocator* cpu_allocator) override {
+      const SessionOptions& options, const std::string& name,
+      Bytes memory_limit, const DeviceLocality& locality,
+      tsl::TfDeviceId tf_device_id, const std::string& physical_device_desc,
+      Allocator* gpu_allocator, Allocator* cpu_allocator) override {
     return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
                                         tf_device_id, physical_device_desc,
                                         gpu_allocator, cpu_allocator);
@@ -82,7 +83,7 @@ REGISTER_LOCAL_DEVICE_FACTORY("GPU", GPUDeviceFactory, 210);
 // -----------------------------------------------------------------------------
 class GPUCompatibleCPUDevice : public ThreadPoolDevice {
  public:
-  GPUCompatibleCPUDevice(const SessionOptions& options, const string& name,
+  GPUCompatibleCPUDevice(const SessionOptions& options, const std::string& name,
                          Bytes memory_limit, const DeviceLocality& locality,
                          Allocator* allocator)
       : ThreadPoolDevice(options, name, memory_limit, locality, allocator),
@@ -114,14 +115,15 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
 // The associated factory.
 class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
  public:
-  Status ListPhysicalDevices(std::vector<string>* devices) override {
+  absl::Status ListPhysicalDevices(std::vector<std::string>* devices) override {
     devices->push_back("/physical_device:CPU:0");
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 
-  Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<std::unique_ptr<Device>>* devices) override {
+  absl::Status CreateDevices(
+      const SessionOptions& options, const std::string& name_prefix,
+      std::vector<std::unique_ptr<Device>>* devices) override {
     int n = 1;
     auto iter = options.config.device_count().find("CPU");
     if (iter != options.config.device_count().end()) {
@@ -131,7 +133,7 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
                              ? port::NUMANumNodes()
                              : 1;
     for (int i = 0; i < n; i++) {
-      string name = strings::StrCat(name_prefix, "/device:CPU:", i);
+      std::string name = absl::StrCat(name_prefix, "/device:CPU:", i);
       int numa_node = i % num_numa_nodes;
       DeviceLocality locality;
       locality.set_numa_node(numa_node);
@@ -140,7 +142,7 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
           ProcessState::singleton()->GetCPUAllocator(numa_node)));
     }
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 REGISTER_LOCAL_DEVICE_FACTORY("CPU", GPUCompatibleCPUDeviceFactory, 70);
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 3aa8fa1003fbb7..f191c8f32b082f 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -76,7 +76,7 @@ bool IsRocm() {
       .IsRocm();
 }
 
-void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) {
+void ExpectErrorMessageSubstr(const absl::Status& s, absl::string_view substr) {
   EXPECT_TRUE(absl::StrContains(s.ToString(), substr))
       << s << ", expected substring " << substr;
 }
@@ -92,12 +92,12 @@ class GPUDeviceTest : public ::testing::Test {
 
  protected:
   static SessionOptions MakeSessionOptions(
-      const string& visible_device_list = "",
+      const std::string& visible_device_list = "",
       double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1,
       const std::vector<std::vector<float>>& memory_limit_mb = {},
-      const std::vector<std::vector<int32>>& priority = {},
-      const std::vector<std::vector<int32>>& device_ordinal = {},
-      const int32 num_virtual_devices = 0,
+      const std::vector<std::vector<int32_t>>& priority = {},
+      const std::vector<std::vector<int32_t>>& device_ordinal = {},
+      const int32_t num_virtual_devices = 0,
       const bool use_cuda_malloc_async = false) {
     SessionOptions options;
     ConfigProto* config = &options.config;
@@ -178,7 +178,7 @@ TEST_F(GPUDeviceTest, CudaMallocAsync) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {}, {}, 0,
                                            /*use_cuda_malloc_async=*/true);
   std::vector<std::unique_ptr<Device>> devices;
-  Status status;
+  absl::Status status;
   int number_instantiated =
       se::GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
   {  // The new scope is to trigger the destruction of the object.
@@ -209,7 +209,7 @@ TEST_F(GPUDeviceTest, CudaMallocAsyncPreallocate) {
                                            /*use_cuda_malloc_async=*/true);
   setenv("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", "2048", 1);
   std::vector<std::unique_ptr<Device>> devices;
-  Status status;
+  absl::Status status;
 
   int number_instantiated =
       se::GpuCudaMallocAsyncAllocator::GetInstantiatedCountTestOnly();
@@ -240,7 +240,7 @@ TEST_F(GPUDeviceTest, CudaMallocAsyncPreallocate) {
 TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,abc");
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
   ExpectErrorMessageSubstr(status, "Could not parse entry");
@@ -249,7 +249,7 @@ TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
 TEST_F(GPUDeviceTest, InvalidGpuId) {
   SessionOptions opts = MakeSessionOptions("100");
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
   ExpectErrorMessageSubstr(status,
@@ -259,7 +259,7 @@ TEST_F(GPUDeviceTest, InvalidGpuId) {
 TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,0");
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
   ExpectErrorMessageSubstr(status,
@@ -269,7 +269,7 @@ TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
 TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
   SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}});
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
   ExpectErrorMessageSubstr(
@@ -281,7 +281,7 @@ TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) {
   // (empty) VirtualDevices messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}});
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
   ExpectErrorMessageSubstr(status,
@@ -293,7 +293,7 @@ TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
   // messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}});
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
   ExpectErrorMessageSubstr(status,
@@ -307,7 +307,7 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
   // messages.
   SessionOptions opts = MakeSessionOptions("0,1", 0, 8, {{}});
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
   ExpectErrorMessageSubstr(
@@ -376,7 +376,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
         MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-9999, 0}});
 #endif
     std::vector<std::unique_ptr<Device>> devices;
-    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+    absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
         opts, kDeviceNamePrefix, &devices);
     EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
 #if TENSORFLOW_USE_ROCM
@@ -399,7 +399,7 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) {
     SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}});
 #endif
     std::vector<std::unique_ptr<Device>> devices;
-    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+    absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
         opts, kDeviceNamePrefix, &devices);
     EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
 #if TENSORFLOW_USE_ROCM
@@ -457,7 +457,7 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) {
     // 0 is a valid priority value for both AMD and NVidia GPUs
     SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0}});
     std::vector<std::unique_ptr<Device>> devices;
-    Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+    absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
         opts, kDeviceNamePrefix, &devices);
     EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
     ExpectErrorMessageSubstr(
@@ -546,7 +546,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
       ->mutable_experimental()
       ->set_use_unified_memory(true);
   std::vector<std::unique_ptr<Device>> devices;
-  Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
+  absl::Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INTERNAL);
   ExpectErrorMessageSubstr(status, "does not support oversubscription.");
@@ -611,7 +611,7 @@ TEST_F(GPUDeviceTest, CopyTensorInSameDevice) {
   CopyCPUToGPU(&cpu_tensor, &input_tensor, device, device_context);
   absl::Notification note;
   device->CopyTensorInSameDevice(&input_tensor, &output_tensor, device_context,
-                                 [&note](const Status& s) {
+                                 [&note](const absl::Status& s) {
                                    TF_ASSERT_OK(s);
                                    note.Notify();
                                  });
@@ -629,11 +629,11 @@ TEST_F(GPUDeviceTest, CopyTensorInSameDevice) {
 
 TEST_F(GPUDeviceTest, DeviceDetails) {
   DeviceFactory* factory = DeviceFactory::GetFactory("GPU");
-  std::vector<string> devices;
+  std::vector<std::string> devices;
   TF_ASSERT_OK(factory->ListPhysicalDevices(&devices));
   EXPECT_GE(devices.size(), 1);
   for (int i = 0; i < devices.size(); i++) {
-    std::unordered_map<string, string> details;
+    std::unordered_map<std::string, std::string> details;
     TF_ASSERT_OK(factory->GetDeviceDetails(i, &details));
     EXPECT_NE(details["device_name"], "");
 #if TENSORFLOW_USE_ROCM
@@ -669,7 +669,7 @@ class GPUKernelTrackerTest : public ::testing::Test {
                                                nullptr));
   }
 
-  void RecordQueued(uint64 v) {
+  void RecordQueued(uint64_t v) {
     mutex_lock l(kernel_tracker_->mu_);
     kernel_tracker_->RecordQueued(v, 1);
   }
@@ -686,7 +686,7 @@ TEST_F(GPUKernelTrackerTest, CappingOnly) {
 
   std::deque<int64_t> queued_counts;
   for (int i = 0; i < 32; ++i) {
-    uint64 queued_count = timing_counter_->next();
+    uint64_t queued_count = timing_counter_->next();
     queued_counts.push_back(queued_count);
     RecordQueued(queued_count);
   }
@@ -708,7 +708,7 @@ TEST_F(GPUKernelTrackerTest, CappingOnly) {
   // to introduce gaps between last_completed_ and first_available_.
   int64_t lower_bound = timing_counter_->get();
   for (int i = 0; i < 1111; ++i) {
-    uint64 queued_count = timing_counter_->next();
+    uint64_t queued_count = timing_counter_->next();
     queued_counts.push_back(queued_count);
     RecordQueued(queued_count);
     int64_t upper_bound = timing_counter_->get();
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 3179d8858ad154..15fd92a873bea0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -122,11 +122,11 @@ static std::unique_ptr<SubAllocator> CreateSubAllocator(
                              options.experimental().use_unified_memory());
   if (use_unified_memory) {
     auto unified_memory_allocator =
-        executor->CreateMemoryAllocator(stream_executor::MemoryType::kUnified)
+        executor->CreateMemoryAllocator(stream_executor::MemorySpace::kUnified)
             .value();
     return std::make_unique<se::StreamExecutorAllocator>(
         std::move(unified_memory_allocator),
-        stream_executor::MemoryType::kUnified, platform_device_id.value(),
+        stream_executor::MemorySpace::kUnified, platform_device_id.value(),
         alloc_visitors);
   } else {
     return std::make_unique<se::DeviceMemAllocator>(
@@ -140,7 +140,7 @@ Allocator* GPUProcessState::GetGPUAllocator(
   CHECK(process_state_);
 #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
-  const string& allocator_type = options.allocator_type();
+  const std::string& allocator_type = options.allocator_type();
   mutex_lock lock(mu_);
   tsl::CheckValidTfDeviceId(
       DEVICE_GPU, se::GPUMachineManager()->VisibleDeviceCount(), tf_device_id);
@@ -172,7 +172,7 @@ Allocator* GPUProcessState::GetGPUAllocator(
 
     auto gpu_bfc_allocator = std::make_unique<GPUBFCAllocator>(
         std::move(sub_allocator), total_bytes,
-        strings::StrCat("GPU_", tf_device_id.value(), "_bfc"), [&] {
+        absl::StrCat("GPU_", tf_device_id.value(), "_bfc"), [&] {
           GPUBFCAllocator::Options o;
           o.allow_growth = options.allow_growth();
           o.allow_retry_on_failure =
@@ -366,9 +366,9 @@ Allocator* GPUProcessState::GetGpuHostAllocator(const GPUOptions& options,
       gpu_host_free_visitors_.push_back({});
     }
     auto host_memory_allocator =
-        se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value();
+        se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value();
     SubAllocator* sub_allocator = new se::StreamExecutorAllocator(
-        std::move(host_memory_allocator), stream_executor::MemoryType::kHost,
+        std::move(host_memory_allocator), stream_executor::MemorySpace::kHost,
         numa_node, gpu_host_alloc_visitors_[numa_node],
         gpu_host_free_visitors_[numa_node]);
 
diff --git a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
index fbc733ce4b85d4..954658e1111a4c 100644
--- a/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
+++ b/tensorflow/core/common_runtime/gpu/pool_allocator_test.cc
@@ -30,9 +30,9 @@ TEST(PoolAllocatorTest, ZeroSizeBuffers) {
       se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value();
   se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value();
   auto host_memory_allocator =
-      se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value();
+      se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value();
   SubAllocator* sub_allocator = new se::StreamExecutorAllocator(
-      std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0);
+      std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0);
   PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
                      sub_allocator, new NoopRounder, "pool");
 
@@ -49,9 +49,9 @@ TEST(PoolAllocatorTest, ZeroSizePool) {
       se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value();
   se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value();
   auto host_memory_allocator =
-      se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value();
+      se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value();
   SubAllocator* sub_allocator = new se::StreamExecutorAllocator(
-      std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0);
+      std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0);
   PoolAllocator pool(0 /*pool_size_limit*/, false /*auto_resize*/,
                      sub_allocator, new NoopRounder, "pool");
 
@@ -83,9 +83,9 @@ TEST(PoolAllocatorTest, Alignment) {
       se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value();
   se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value();
   auto host_memory_allocator =
-      se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value();
+      se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value();
   SubAllocator* sub_allocator = new se::StreamExecutorAllocator(
-      std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0);
+      std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0);
   PoolAllocator pool(0 /*pool_size_limit*/, false /*auto_resize*/,
                      sub_allocator, new NoopRounder, "pool");
   for (int i = 0; i < 16; ++i) {
@@ -145,9 +145,9 @@ TEST(PoolAllocatorTest, CudaHostAllocator) {
       se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value();
   se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value();
   auto host_memory_allocator =
-      se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value();
+      se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value();
   SubAllocator* sub_allocator = new se::StreamExecutorAllocator(
-      std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0,
+      std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0,
       {alloc_visitor}, {free_visitor});
   PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
                      sub_allocator, new NoopRounder, "pool");
@@ -250,9 +250,9 @@ TEST(PoolAllocatorTest, Name) {
       se::PlatformManager::PlatformWithName(se::GpuPlatformName()).value();
   se::StreamExecutor* se = platform->ExecutorForDevice(/*ordinal=*/0).value();
   auto host_memory_allocator =
-      se->CreateMemoryAllocator(stream_executor::MemoryType::kHost).value();
+      se->CreateMemoryAllocator(stream_executor::MemorySpace::kHost).value();
   SubAllocator* sub_allocator = new se::StreamExecutorAllocator(
-      std::move(host_memory_allocator), stream_executor::MemoryType::kHost, 0);
+      std::move(host_memory_allocator), stream_executor::MemorySpace::kHost, 0);
   PoolAllocator pool(2 /*pool_size_limit*/, false /*auto_resize*/,
                      sub_allocator, new NoopRounder, "pool");
   EXPECT_EQ("pool", pool.Name());

From 9e0976de360152d1490063327fbc64b70d914ec8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:44:46 -0800
Subject: [PATCH 627/753] Automated Code Change

PiperOrigin-RevId: 847189521
---
 .../optimizers/auto_mixed_precision_test.cc   |  19 +-
 .../optimizers/constant_folding_test.cc       | 187 +++++++++---------
 .../generic_layout_optimizer_transposer.cc    | 109 +++++-----
 3 files changed, 159 insertions(+), 156 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
index fe7d4eb4f33f67..1f4943889cc06b 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision_test.cc
@@ -72,7 +72,7 @@ Tensor GenerateRandomTensorInRange(const TensorShape& shape, double minval,
 
 void VerifyGraphsEquivalent(const GraphDef& original_graph,
                             const GraphDef& optimized_graph,
-                            const string& func) {
+                            const std::string& func) {
   EXPECT_EQ(original_graph.node_size(), optimized_graph.node_size()) << func;
   GraphView optimized_view(&optimized_graph);
   for (int i = 0; i < original_graph.node_size(); ++i) {
@@ -146,10 +146,10 @@ class AutoMixedPrecisionTest : public GrapplerTest {
 
   void TearDown() override { TF_CHECK_OK(virtual_cluster_->Shutdown()); }
 
-  NodeDef* AddSimpleNode(const string& name, const string& op,
-                         const std::vector<string>& inputs,
+  NodeDef* AddSimpleNode(const std::string& name, const std::string& op,
+                         const std::vector<std::string>& inputs,
                          GraphDef* graph) const {
-    std::vector<std::pair<string, AttrValue>> attributes;
+    std::vector<std::pair<std::string, AttrValue>> attributes;
     if (op == "AddN" || op == "ShapeN") {
       AttrValue num_inputs;
       num_inputs.set_i(inputs.size());
@@ -203,7 +203,8 @@ class AutoMixedPrecisionTest : public GrapplerTest {
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
     auto input_tensor = GenerateRandomTensorInRange<DT_FLOAT>(
         TensorShape({size, size}), input_min, input_max);
-    std::vector<std::pair<string, Tensor>> feed = {{"input", input_tensor}};
+    std::vector<std::pair<std::string, Tensor>> feed = {
+        {"input", input_tensor}};
     auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
 
     AutoMixedPrecision optimizer(mode_);
@@ -564,7 +565,7 @@ TEST_P(AutoMixedPrecisionParamTest, PreserveIdentityAfterVariable) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   auto var1_tensor =
       GenerateConstantTensor<DT_FLOAT>(TensorShape({32, 32}), 3.141593f);
-  std::vector<std::pair<string, Tensor>> feed = {{"var1", var1_tensor}};
+  std::vector<std::pair<std::string, Tensor>> feed = {{"var1", var1_tensor}};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, feed);
 
   AutoMixedPrecision optimizer(mode_);
@@ -1035,7 +1036,7 @@ TEST_P(AutoMixedPrecisionParamTest, TensorListThroughFunction) {
   // A separate Tensor List cluster is added to test that it is still changed to
   // DT_HALF.
   FunctionDefLibrary function_lib;
-  const Tensor kShape = test::AsTensor<int32>({32, 32});
+  const Tensor kShape = test::AsTensor<int32_t>({32, 32});
   FunctionDef func1 = FunctionDefHelper::Define(
       "Func1", {"ihandle: variant", "x: float"},
       {"ohandle: variant", "y: float"}, {},
@@ -1120,7 +1121,7 @@ int GetCudaVersion(const Cluster& cluster) {
       const auto& device_env = device_properties.environment();
       auto it = device_env.find("cuda");
       if (it != device_env.end()) {
-        string cuda_version_str = it->second;
+        std::string cuda_version_str = it->second;
         return std::stoi(cuda_version_str);
       }
     }
@@ -1407,7 +1408,7 @@ TEST_F(AutoMixedPrecisionCpuTest, MixedFanout) {
 class AutoMixedPrecisionSimulateGpuTest : public GrapplerTest {
  protected:
   void SetUp() override {
-    std::unordered_map<string, DeviceProperties> devices;
+    std::unordered_map<std::string, DeviceProperties> devices;
     DeviceProperties cpu_device;
     cpu_device.set_type("CPU");
     cpu_device.set_frequency(1000);
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index 857e33bf028c82..8f3603829ffb46 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -95,11 +95,11 @@ class ConstantFoldingTest : public GrapplerTest {
       TF_EXPECT_OK(status);
 
       EXPECT_EQ(7, output.node_size());
-      const string snapshot_or_identity =
+      const std::string snapshot_or_identity =
           use_snapshot ? "Snapshot" : "Identity";
       for (int i = 0; i < output.node_size(); ++i) {
         const NodeDef& node = output.node(i);
-        const string& name = node.name();
+        const std::string& name = node.name();
         if (name == "mul1") {
           EXPECT_EQ("Const", node.op());
           EXPECT_EQ("^x", node.input(0));
@@ -220,7 +220,7 @@ class ConstantFoldingTest : public GrapplerTest {
     EXPECT_EQ(2, found);
 
     // Check that const folded multiplication node has the expected value.
-    std::vector<string> fetch = {"mul"};
+    std::vector<std::string> fetch = {"mul"};
     Tensor value(DT_FLOAT, input_shape);
     for (int i = 0; i < value.NumElements(); ++i) {
       value.flat<float>()(i) = i;
@@ -309,7 +309,7 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   EXPECT_EQ("d", node_d.name());
   EXPECT_EQ("Const", node_d.op());
 
-  std::vector<string> fetch = {"d"};
+  std::vector<std::string> fetch = {"d"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors_expected.size());
@@ -397,7 +397,7 @@ TEST_F(ConstantFoldingTest, AddTree) {
   auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
   auto y_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
 
-  std::vector<string> fetch = {"add_parent", "mul_parent"};
+  std::vector<std::string> fetch = {"add_parent", "mul_parent"};
   auto tensor_expected =
       EvaluateNodes(item.graph, fetch, {{"x", x_t}, {"y", y_t}});
   ASSERT_EQ(fetch.size(), tensor_expected.size());
@@ -453,7 +453,7 @@ TEST_F(ConstantFoldingTest, AddSubtactTree) {
   // Check that the result nodes have the expected value.
   auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
 
-  std::vector<string> fetch = {"add_parent"};
+  std::vector<std::string> fetch = {"add_parent"};
   auto tensor_expected = EvaluateNodes(item.graph, fetch, {{"x", x_t}});
   ASSERT_EQ(fetch.size(), tensor_expected.size());
   fetch = {"add_parent"};
@@ -478,7 +478,7 @@ TEST_F(ConstantFoldingTest, ConstantPushDown) {
                                  ops::Placeholder::Shape(TensorShape({2, 2})));
 
             auto get_op = [&](bool is_commutative, bool is_left_arg_const,
-                              const string& name, const Output& const_arg,
+                              const std::string& name, const Output& const_arg,
                               const Output non_const_arg) -> Output {
               if (is_add) {
                 if (is_commutative) {
@@ -523,7 +523,7 @@ TEST_F(ConstantFoldingTest, ConstantPushDown) {
 
             // Check that the result nodes have the expected value.
             auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
-            std::vector<string> fetch = {"parent"};
+            std::vector<std::string> fetch = {"parent"};
             auto tensor_expected =
                 EvaluateNodes(item.graph, fetch, {{"x", x_t}});
             ASSERT_EQ(fetch.size(), tensor_expected.size());
@@ -600,7 +600,7 @@ TEST_F(ConstantFoldingTest, ConstantPushDownBiasAdd) {
   // Check that the result nodes have the expected value.
   auto x_mat_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
   auto x_vec_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2}));
-  std::vector<string> fetch = item.fetch;
+  std::vector<std::string> fetch = item.fetch;
   auto tensor_expected = EvaluateNodes(
       item.graph, fetch, {{"x_vec", x_vec_t}, {"x_mat", x_mat_t}});
   ASSERT_EQ(fetch.size(), tensor_expected.size());
@@ -615,10 +615,9 @@ TEST_F(ConstantFoldingTest, ConstantPushDownBiasAdd) {
 // This test fails on ROCm platform (see commit message for details)
 #ifndef TENSORFLOW_USE_ROCM
 TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) {
-  for (string data_format : {
-         "NHWC",
+  for (std::string data_format : {"NHWC",
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-             "NCHW"
+                                  "NCHW"
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
        }) {
     MulConvPushDownTest(
@@ -636,10 +635,9 @@ TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_ScalarConst) {
 // This test fails on ROCm platform (see commit message for details)
 #ifndef TENSORFLOW_USE_ROCM
 TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) {
-  for (string data_format : {
-         "NHWC",
+  for (std::string data_format : {"NHWC",
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-             "NCHW"
+                                  "NCHW"
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
        }) {
     for (auto mul_const_input_shape :
@@ -658,10 +656,9 @@ TEST_F(ConstantFoldingTest, MulConvPushDownTest_Conv2D_SingletonConst) {
 
 TEST_F(ConstantFoldingTest,
        MulConvPushDownTest_Conv2D_SingletonConst_ShapeMismatch) {
-  for (string data_format : {
-         "NHWC",
+  for (std::string data_format : {"NHWC",
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
-             "NCHW"
+                                  "NCHW"
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
        }) {
     MulConvPushDownTest(
@@ -841,18 +838,18 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
-    const string suffix =
+    const std::string suffix =
         (const_type == kConst ? "_const"
                               : (const_type == kLike ? "_like" : "_fill"));
-    const string zeros_name = strings::StrCat("zeros", suffix);
-    const string ones_name = strings::StrCat("ones", suffix);
-    const string ctrl_zeros_name = strings::StrCat("^zeros", suffix);
-    const string ctrl_ones_name = strings::StrCat("^ones", suffix);
+    const std::string zeros_name = absl::StrCat("zeros", suffix);
+    const std::string ones_name = absl::StrCat("ones", suffix);
+    const std::string ctrl_zeros_name = absl::StrCat("^zeros", suffix);
+    const std::string ctrl_ones_name = absl::StrCat("^ones", suffix);
 
     EXPECT_EQ(const_type == kFill ? 43 : 39, output.node_size());
     for (int i = 0; i < output.node_size(); ++i) {
       const NodeDef& node = output.node(i);
-      const string& name = node.name();
+      const std::string& name = node.name();
       if (name == "mul1") {
         EXPECT_EQ("Const", node.op());
         EXPECT_EQ("^x", node.input(0));
@@ -968,8 +965,8 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
         EXPECT_EQ("y", node.input(0));
         EXPECT_EQ(ctrl_zeros_name, node.input(1));
       }
-      const std::set<string> square_zero_const{"mul1", "mul2",    "mul5",
-                                               "mul6", "matmul1", "matmul2"};
+      const std::set<std::string> square_zero_const{
+          "mul1", "mul2", "mul5", "mul6", "matmul1", "matmul2"};
       if (square_zero_const.count(name) > 0) {
         TensorProto t = node.attr().at("value").tensor();
         EXPECT_EQ(1, t.float_val_size());
@@ -1029,7 +1026,7 @@ TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   EXPECT_EQ(8, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
-    const string& name = node.name();
+    const std::string& name = node.name();
     if (name == "div_i") {
       // Integer division is unchanged.
       EXPECT_EQ("Div", node.op());
@@ -1061,7 +1058,7 @@ TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   }
 
   // Check that the reciprocals have the expected value.
-  std::vector<string> fetch = {"cf_half"};
+  std::vector<std::string> fetch = {"cf_half"};
   auto tensor_expected = EvaluateNodes(item.graph, fetch);
   EXPECT_EQ(fetch.size(), tensor_expected.size());
   fetch = {"ConstantFolding/div_f_recip", "ConstantFolding/realdiv_recip"};
@@ -1090,13 +1087,13 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
   // Multiplies without any additional ops to supply the output shape.
   int count = 0;
   std::vector<Output> muls;
-  std::unordered_set<string> not_converted;
-  std::unordered_set<string> to_const;
-  std::unordered_set<string> to_identity;
+  std::unordered_set<std::string> not_converted;
+  std::unordered_set<std::string> to_const;
+  std::unordered_set<std::string> to_identity;
   for (const auto* x : {&x_known, &x_partially_known, &x_unknown}) {
     for (const auto* zeros :
          {&zeros_known, &zeros_partially_known, &zeros_unknown}) {
-      const string name = strings::StrCat("mul_", count++);
+      const std::string name = absl::StrCat("mul_", count++);
       muls.push_back(ops::Mul(s.WithOpName(name), *x, *zeros));
       if (x == &x_partially_known && zeros == &zeros_partially_known) {
         to_identity.insert(name);
@@ -1120,7 +1117,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
   EXPECT_EQ(15, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
-    const string& name = node.name();
+    const std::string& name = node.name();
     if (to_const.count(name) > 0) {
       EXPECT_EQ("Const", node.op()) << node.name();
     } else if (to_identity.count(name) > 0) {
@@ -1130,7 +1127,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
     }
   }
 
-  const std::vector<string> fetch = {"mul_0", "mul_4", "mul_8"};
+  const std::vector<std::string> fetch = {"mul_0", "mul_4", "mul_8"};
   auto x_known_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
   auto x_partially_unknown_t =
       GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
@@ -1166,11 +1163,11 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
   // will propagate the shape back to the inputs of AddN, making the
   // output shapes of all its inputs known
   std::vector<Output> muls_deduced_output_shape;
-  std::unordered_set<string> to_const;
+  std::unordered_set<std::string> to_const;
   int count = 0;
   for (const auto& x : {x_partially_known, x_unknown}) {
     for (const auto& zeros : {zeros_partially_known, zeros_unknown}) {
-      const string name = strings::StrCat("mul_", count++);
+      const std::string name = absl::StrCat("mul_", count++);
       muls_deduced_output_shape.push_back(
           ops::Mul(s.WithOpName(name), x, zeros));
       to_const.insert(name);
@@ -1193,7 +1190,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
   EXPECT_EQ(10, output.node_size());
   for (int i = 0; i < output.node_size(); ++i) {
     const NodeDef& node = output.node(i);
-    const string& name = node.name();
+    const std::string& name = node.name();
     if (to_const.count(name) > 0) {
       EXPECT_EQ("Const", node.op()) << node.name();
       EXPECT_EQ(2, node.input_size());
@@ -1201,7 +1198,7 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
       EXPECT_TRUE(IsControlInput(node.input(1)));
     }
   }
-  const std::vector<string> fetch = {"addn1"};
+  const std::vector<std::string> fetch = {"addn1"};
   auto x_partially_unknown_t =
       GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
   auto x_unknown_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
@@ -1230,10 +1227,10 @@ TEST_F(ConstantFoldingTest, CreateConstNodes) {
   MAKE_TEST_GRAPH(float);
   MAKE_TEST_GRAPH(double);
   MAKE_TEST_GRAPH(int64_t);
-  MAKE_TEST_GRAPH(int32);
-  MAKE_TEST_GRAPH(int16);
-  MAKE_TEST_GRAPH(int8);
-  MAKE_TEST_GRAPH(uint8);
+  MAKE_TEST_GRAPH(int32_t);
+  MAKE_TEST_GRAPH(int16_t);
+  MAKE_TEST_GRAPH(int8_t);
+  MAKE_TEST_GRAPH(uint8_t);
 #undef MAKE_TEST_GRAPH
 
   Output bool_const = ops::Const(s.WithOpName("bool_const"), true, {5});
@@ -1307,7 +1304,7 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   EXPECT_EQ("f", new_d.name());
   EXPECT_EQ("Const", new_d.op());
 
-  std::vector<string> fetch = {"e", "f"};
+  std::vector<std::string> fetch = {"e", "f"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(fetch.size(), tensors_expected.size());
@@ -1338,7 +1335,7 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> expected_nodes = {"dflt", "p1", "p2", "i3"};
+  std::vector<std::string> expected_nodes = {"dflt", "p1", "p2", "i3"};
   EXPECT_EQ(output.node_size(), expected_nodes.size());
   int i = 0;
   int found = 0;
@@ -1381,8 +1378,8 @@ TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> expected_nodes = {"dflt", "p1", "p2", "c",
-                                        "i1",   "i2", "e"};
+  std::vector<std::string> expected_nodes = {"dflt", "p1", "p2", "c",
+                                             "i1",   "i2", "e"};
   EXPECT_EQ(output.node_size(), expected_nodes.size());
   int i = 0;
   int found = 0;
@@ -1439,7 +1436,7 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> expected_nodes = {"dflt", "p1", "p2", "i2"};
+  std::vector<std::string> expected_nodes = {"dflt", "p1", "p2", "i2"};
   EXPECT_EQ(output.node_size(), expected_nodes.size());
   int i = 0;
   for (const auto& node : output.node()) {
@@ -1466,9 +1463,9 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
   ops::DynamicPartition part(scope.WithOpName("partition"), input, indices,
                              num_partitions);
 
-  std::vector<string> outputs;
+  std::vector<std::string> outputs;
   for (int i = 0; i < num_partitions; ++i) {
-    string part_out_name = strings::StrCat("part_out", i);
+    std::string part_out_name = absl::StrCat("part_out", i);
     ops::Identity partition_out(scope.WithOpName(part_out_name),
                                 {part.outputs[i]});
     outputs.push_back(part_out_name);
@@ -1481,7 +1478,7 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
   Tensor initial_val(DT_INT32, TensorShape({3}));
   test::FillIota<int>(&initial_val, 7);
   for (int i = 1; i < 5; ++i) {
-    TF_CHECK_OK(NodeDefBuilder(strings::StrCat("in", i), "Const")
+    TF_CHECK_OK(NodeDefBuilder(absl::StrCat("in", i), "Const")
                     .Attr("dtype", DT_INT32)
                     .Attr("value", initial_val)
                     .Finalize(item.graph.add_node()));
@@ -1502,7 +1499,7 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
                   .Finalize(item.graph.add_node()));
 
   for (int i = 0; i < 4; ++i) {
-    string concat_offset_out_name = strings::StrCat("concat_offset_out", i);
+    std::string concat_offset_out_name = absl::StrCat("concat_offset_out", i);
     TF_CHECK_OK(NodeDefBuilder(concat_offset_out_name, "Identity")
                     .Attr("T", DT_INT32)
                     .Input("concat_offsets", i, DT_INT32)
@@ -1518,8 +1515,8 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
 
   int constant_folded = 0;
   for (const auto& node : output.node()) {
-    if (node.name().find("part_out") != string::npos ||
-        node.name().find("concat_offset_out") != string::npos) {
+    if (node.name().find("part_out") != std::string::npos ||
+        node.name().find("concat_offset_out") != std::string::npos) {
       ++constant_folded;
       EXPECT_EQ("Const", node.op());
     }
@@ -1638,7 +1635,7 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
   auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3}));
   auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 7}));
   auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({11, 13}));
-  std::vector<string> fetch_nodes = {"p2"};
+  std::vector<std::string> fetch_nodes = {"p2"};
   auto tensors_expected = EvaluateNodes(
       item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
   EXPECT_EQ(1, tensors_expected.size());
@@ -1711,8 +1708,8 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
   auto v1_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 4}));
   auto v2_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({5, 6}));
   auto v3_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({4, 6}));
-  const std::vector<string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
-                                           "i2c", "i3a", "i3b"};
+  const std::vector<std::string> fetch_nodes = {"i1a", "i1b", "i2a", "i2b",
+                                                "i2c", "i3a", "i3b"};
   auto tensors_expected = EvaluateNodes(
       item.graph, fetch_nodes, {{"v1", v1_t}, {"v2", v2_t}, {"v3", v3_t}});
   EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
@@ -1814,15 +1811,16 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::set<string> present_nodes = {"v_in",     "v_ctrl",
-                                    "switch",   "i",
-                                    "p1",       "p2",
-                                    "m",        "false",
-                                    "constant", "switch2",
-                                    "i2",       "i3",
-                                    "m2",       "ConstantFoldingCtrl/switch_0",
-                                    "rank",     "size"};
-  std::set<string> not_present_nodes = {"ConstantFolding/switch2-0"};
+  std::set<std::string> present_nodes = {
+      "v_in",     "v_ctrl",
+      "switch",   "i",
+      "p1",       "p2",
+      "m",        "false",
+      "constant", "switch2",
+      "i2",       "i3",
+      "m2",       "ConstantFoldingCtrl/switch_0",
+      "rank",     "size"};
+  std::set<std::string> not_present_nodes = {"ConstantFolding/switch2-0"};
   EXPECT_EQ(present_nodes.size(), output.node_size());
   int found = 0;
   for (const auto& node : output.node()) {
@@ -1862,7 +1860,7 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   Tensor v_ctrl_t(DT_BOOL, TensorShape({}));
 
   v_ctrl_t.flat<bool>()(0) = true;
-  std::vector<string> fetch_nodes = {"m", "m2"};
+  std::vector<std::string> fetch_nodes = {"m", "m2"};
   auto tensors_expected = EvaluateNodes(
       item.graph, fetch_nodes, {{"v_in", v_in_t}, {"v_ctrl", v_ctrl_t}});
   EXPECT_EQ(2, tensors_expected.size());
@@ -1915,15 +1913,16 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
   GraphDef output;
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
-  std::set<string> present_nodes = {"v_in",     "v_ctrl",
-                                    "switch",   "i",
-                                    "p1",       "p2",
-                                    "m",        "false",
-                                    "constant", "switch2",
-                                    "i2",       "i3",
-                                    "m2",       "ConstantFoldingCtrl/switch_0"};
-  std::set<string> not_present_nodes = {"rank", "size",
-                                        "ConstantFolding/switch2-0"};
+  std::set<std::string> present_nodes = {
+      "v_in",     "v_ctrl",
+      "switch",   "i",
+      "p1",       "p2",
+      "m",        "false",
+      "constant", "switch2",
+      "i2",       "i3",
+      "m2",       "ConstantFoldingCtrl/switch_0"};
+  std::set<std::string> not_present_nodes = {"rank", "size",
+                                             "ConstantFolding/switch2-0"};
   EXPECT_EQ(present_nodes.size(), output.node_size());
 
   int found = 0;
@@ -2584,7 +2583,7 @@ TEST_F(ConstantFoldingTest, MergeConcat_PartialFolding) {
 }
 
 TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
-  PaddingWithZeroSize<int32>();
+  PaddingWithZeroSize<int32_t>();
   PaddingWithZeroSize<int64_t>();
 }
 
@@ -2770,7 +2769,7 @@ TEST_F(ConstantFoldingTest, SingleElementEmptyAxisReduction) {
       GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 1, 1}));
   auto input_var_one_dim_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
   Tensor input_var_axis_t(DT_INT32, TensorShape({1}));
-  input_var_axis_t.flat<int32>()(0) = 0;
+  input_var_axis_t.flat<int32_t>()(0) = 0;
   auto tensors_expected =
       EvaluateNodes(item.graph, item.fetch,
                     {{"input_var_three_dim", input_var_three_dim_t},
@@ -2895,7 +2894,7 @@ TEST_F(ConstantFoldingTest, Packing) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  const std::vector<string> fetch_nodes = {"i1", "i2"};
+  const std::vector<std::string> fetch_nodes = {"i1", "i2"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes);
   EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
   auto tensors = EvaluateNodes(output, fetch_nodes);
@@ -2971,7 +2970,7 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
+  std::vector<std::string> fetch_nodes = {"o1", "o2", "p1", "p2"};
   auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1, 5}));
   auto g_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({1}));
   auto tensors_expected =
@@ -3042,7 +3041,7 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  std::vector<string> fetch_nodes = {"o1", "o2"};
+  std::vector<std::string> fetch_nodes = {"o1", "o2"};
   auto a_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({2, 2}));
   auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
   EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
@@ -3331,7 +3330,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
       };
   for (bool use_add_n : {true, false}) {
     auto fun = use_add_n ? addn_fun : accumulate_fun;
-    const string op_name = use_add_n ? "AddN" : "AccumulateNV2";
+    const std::string op_name = use_add_n ? "AddN" : "AccumulateNV2";
     Scope s = Scope::NewRootScope();
     Output x = ops::Placeholder(s.WithOpName("x"), DT_FLOAT,
                                 ops::Placeholder::Shape(TensorShape({2, 2})));
@@ -3411,7 +3410,7 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
       }
     }
 
-    std::vector<string> fetch = {"acc0"};
+    std::vector<std::string> fetch = {"acc0"};
     auto tensors_expected = EvaluateNodes(item.graph, fetch);
     auto tensors = EvaluateNodes(output, fetch);
     EXPECT_EQ(1, tensors_expected.size());
@@ -3613,7 +3612,7 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   }
   EXPECT_EQ(found, 3);
 
-  std::vector<string> fetch = {"stack", "stack_no_axis"};
+  std::vector<std::string> fetch = {"stack", "stack_no_axis"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(2, tensors_expected.size());
@@ -3741,8 +3740,8 @@ TEST_F(ConstantFoldingTest, TensorArraySize) {
   auto tensors_actual = EvaluateNodes(output, {"dynamic_sz", "static_sz"});
   EXPECT_EQ(2, tensors_expected.size());
   EXPECT_EQ(2, tensors_actual.size());
-  test::ExpectTensorEqual<int32>(tensors_expected[0], tensors_actual[0]);
-  test::ExpectTensorEqual<int32>(tensors_expected[1], tensors_actual[1]);
+  test::ExpectTensorEqual<int32_t>(tensors_expected[0], tensors_actual[0]);
+  test::ExpectTensorEqual<int32_t>(tensors_expected[1], tensors_actual[1]);
 }
 
 TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) {
@@ -3770,7 +3769,7 @@ TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) {
   EXPECT_EQ("c", node_d.name());
   EXPECT_EQ("Const", node_d.op());
 
-  std::vector<string> fetch = {"c"};
+  std::vector<std::string> fetch = {"c"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors_expected.size());
@@ -3800,7 +3799,7 @@ TEST_F(ConstantFoldingTest, EvaluatingLargeConstantNoFoldingMergingLoop) {
   absl::Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::vector<string> fetch = {"result"};
+  std::vector<std::string> fetch = {"result"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors_expected.size());
@@ -3869,9 +3868,9 @@ class ConstantFoldingCastConstTest : public GrapplerTest {
     return output;
   }
 
-  void EvaluateAndCompareUnoptimized(const GraphDef& unoptimized_graph,
-                                     const GraphDef& optimized_graph,
-                                     const std::vector<string>& fetch_nodes) {
+  void EvaluateAndCompareUnoptimized(
+      const GraphDef& unoptimized_graph, const GraphDef& optimized_graph,
+      const std::vector<std::string>& fetch_nodes) {
     auto tensors_expected = EvaluateNodes(unoptimized_graph, fetch_nodes);
     auto tensors = EvaluateNodes(optimized_graph, fetch_nodes);
     ASSERT_EQ(fetch_nodes.size(), tensors_expected.size());
@@ -4093,8 +4092,8 @@ TEST_F(ConstantFoldingTest, SimplifyCase) {
     TensorShapeProto* g_shape = output_shapes.mutable_list()->add_shape();
     g_shape->set_unknown_rank(true);
 
-    const Tensor kZero = test::AsScalar<int32>(0);
-    const Tensor kOne = test::AsScalar<int32>(1);
+    const Tensor kZero = test::AsScalar<int32_t>(0);
+    const Tensor kOne = test::AsScalar<int32_t>(1);
     item.graph = test::function::GDef(
         {NDef("one", "Const", {},
               {{"value", index == 0 ? kZero : kOne}, {"dtype", DT_INT32}},
@@ -4265,8 +4264,8 @@ TEST_F(ConstantFoldingTest, SimplifySelect_BroadcastTo) {
           ASSERT_EQ(node.input_size(), 4);
           EXPECT_EQ(node.input(0), pred_val ? "then" : "else");
           EXPECT_EQ(node.input(1),
-                    strings::StrCat("ConstantFolding/select-broadcastto_shape-",
-                                    pred_val ? 1 : 2));
+                    absl::StrCat("ConstantFolding/select-broadcastto_shape-",
+                                 pred_val ? 1 : 2));
           EXPECT_EQ(node.input(2), pred_val ? "^else" : "^if");
           EXPECT_EQ(node.input(3), pred_val ? "^if" : "^then");
         }
diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc
index 2854810e3c040f..aef15c4fdf1b2e 100644
--- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc
+++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc
@@ -252,7 +252,7 @@ absl::Status TransposeContext::InitializeTransposeContext(
   TF_RETURN_IF_ERROR(status);
   context->num_nodes = context->graph.node_size();
   const auto& nodes_to_preserve = item.NodesToPreserve();
-  context->nodes_to_preserve = absl::flat_hash_set<string>(
+  context->nodes_to_preserve = absl::flat_hash_set<std::string>(
       nodes_to_preserve.begin(), nodes_to_preserve.end());
   TF_RETURN_IF_ERROR(context->frames.InferFromGraph(context->graph));
   return absl::OkStatus();
@@ -262,9 +262,9 @@ absl::Status TransposeContext::InitializeTransposeContext(
 void TransposeContext::AssignDeviceAndDataFormats(
     absl::string_view target_device, absl::string_view src_format,
     absl::string_view dst_format) {
-  this->target_device = string(target_device);
-  this->src_format = string(src_format);
-  this->dst_format = string(dst_format);
+  this->target_device = std::string(target_device);
+  this->src_format = std::string(src_format);
+  this->dst_format = std::string(dst_format);
   this->src_dim_indices = GetDimensionIndices(src_format);
   this->dst_dim_indices = GetDimensionIndices(dst_format);
   this->src_to_dst = GetPermutation(this->src_dim_indices, dst_format);
@@ -276,9 +276,9 @@ void TransposeContext::AssignDeviceAndDataFormats(
 bool Transposer::ShouldProcess(const TransposeContext& context,
                                const utils::MutableNodeView& node) const {
   const auto* node_def = node.node();
-  const string& device_name = GetDeviceName(*node_def);
-  string device;
-  string task;
+  const std::string& device_name = GetDeviceName(*node_def);
+  std::string device;
+  std::string task;
   const bool is_on_target_device =
       DeviceNameUtils::SplitDeviceName(device_name, &task, &device) &&
       absl::StrContains(absl::AsciiStrToLower(device),
@@ -306,12 +306,12 @@ absl::Status Transposer::CreateConstPermNode(
   DCHECK(!graph_view->HasNode(node_name));
 
   NodeDef node;
-  node.set_name(string(node_name));
+  node.set_name(node_name);
   node.set_op(kOpConst);
-  node.set_device(string(device));
+  node.set_device(device);
 
   if (!control_node_name.empty()) {
-    node.add_input(string(control_node_name));
+    node.add_input(std::string(control_node_name));
   }
 
   AttrValue attr_data_type;
@@ -337,8 +337,8 @@ absl::Status Transposer::CreateTransposeNode(
     const DataType& data_type, absl::string_view device,
     TensorShapeProto fanin_shape, absl::Span<const int> permutation,
     absl::string_view control_node_name, utils::MutationNewNode* added_node,
-    string* transpose_node_name) {
-  const string node_name = absl::Substitute(name_format, kOpTranspose);
+    std::string* transpose_node_name) {
+  const std::string node_name = absl::Substitute(name_format, kOpTranspose);
   auto* graph_view = context->graph_view.get();
   DCHECK(!graph_view->HasNode(node_name));
   *transpose_node_name = node_name;
@@ -346,7 +346,7 @@ absl::Status Transposer::CreateTransposeNode(
   NodeDef node;
   node.set_name(node_name);
   node.set_op(kOpTranspose);
-  node.set_device(string(device));
+  node.set_device(device);
 
   AttrValue attr_data_type;
   attr_data_type.set_type(data_type);
@@ -367,7 +367,7 @@ absl::Status Transposer::CreateTransposeNode(
 
   // Create Const Node
   utils::MutationNewNode const_perm_added_node;
-  const string const_perm_node_name =
+  const std::string const_perm_node_name =
       absl::Substitute(name_format, "PermConst");
   TF_RETURN_IF_ERROR(CreateConstPermNode(context, const_perm_node_name, device,
                                          permutation, control_node_name,
@@ -457,11 +457,11 @@ absl::Status Transposer::CreateDataFormatNode(
 
   // Create the node
   NodeDef node;
-  node.set_name(string(node_name));
+  node.set_name(node_name);
 
   // Set up parameters of node.
-  node.set_op(string(op));
-  node.set_device(string(device));
+  node.set_op(op);
+  node.set_device(device);
   AttrValue attr_data_type;
   attr_data_type.set_type(data_type);
   node.mutable_attr()->insert({"T", attr_data_type});
@@ -503,7 +503,7 @@ absl::Status Transposer::UpdateEdge(
   auto* dst_node_def = dst_node->node();
 
   // TODO(lyandy): Minimize device parsing/fetching.
-  const string device = GetDeviceName(
+  const std::string device = GetDeviceName(
       is_src_format_to_dst_format ? *dst_node_def : *src_node_def);
   DataType data_type =
       is_src_format_to_dst_format
@@ -515,7 +515,7 @@ absl::Status Transposer::UpdateEdge(
                 .dtype();
 
   utils::MutationNewNode added_node;
-  string added_node_name;
+  std::string added_node_name;
   if (op == kOpTranspose) {
     TensorShapeProto input_shape_proto;
     input_shape_proto.set_unknown_rank(true);
@@ -527,7 +527,7 @@ absl::Status Transposer::UpdateEdge(
         input_shape_proto = src_node_shape_attr->list().shape(src_port);
       }
     }
-    const string control_node_name =
+    const std::string control_node_name =
         is_in_frame ? AsControlDependency(src_node_def->name()) : "";
     const std::vector<int>& permutation =
         is_src_format_to_dst_format ? context->src_to_dst : context->dst_to_src;
@@ -540,7 +540,7 @@ absl::Status Transposer::UpdateEdge(
                                 GetDeviceName(*src_node_def), &parsed_name) &&
                             parsed_name.type != "CPU" &&
                             IsHostMemory(*src_node_def, src_port);
-    const string node_name = absl::Substitute(name_format, op);
+    const std::string node_name = absl::Substitute(name_format, op);
     TF_RETURN_IF_ERROR(CreateDataFormatNode(
         context, node_name, op, device, data_type, is_fanin_on_host,
         is_src_format_to_dst_format, &added_node));
@@ -655,40 +655,42 @@ bool Transposer::CanProcessNode(const TransposeContext& context,
          !(node.NumRegularFanouts() == 0 && node.NumControlledFanouts() == 0);
 }
 
-string Transposer::GetFaninNameFormat(absl::string_view node_name, int port,
-                                      absl::string_view src_format,
-                                      absl::string_view dst_format) {
+std::string Transposer::GetFaninNameFormat(absl::string_view node_name,
+                                           int port,
+                                           absl::string_view src_format,
+                                           absl::string_view dst_format) {
   return absl::StrCat(node_name, "-", port, "-$0", src_format, "To", dst_format,
                       "-", kOptimizedSuffix);
 }
 
-string Transposer::GetFanoutNameFormat(absl::string_view node_name, int port,
-                                       int index, absl::string_view src_format,
-                                       absl::string_view dst_format) {
+std::string Transposer::GetFanoutNameFormat(absl::string_view node_name,
+                                            int port, int index,
+                                            absl::string_view src_format,
+                                            absl::string_view dst_format) {
   return absl::StrCat(node_name, "-", port, "-", index, "-$0", dst_format, "To",
                       src_format, "-", kOptimizedSuffix);
 }
 
-string Transposer::LayoutOptimizerNode(absl::string_view node_name) {
+std::string Transposer::LayoutOptimizerNode(absl::string_view node_name) {
   return absl::StrCat(node_name, "-", kOptimizedSuffix);
 }
 
-string Transposer::GetReshapeNodeNameFormat(absl::string_view node_name,
-                                            int index,
-                                            absl::string_view src_format,
-                                            absl::string_view dst_format) {
+std::string Transposer::GetReshapeNodeNameFormat(absl::string_view node_name,
+                                                 int index,
+                                                 absl::string_view src_format,
+                                                 absl::string_view dst_format) {
   return absl::StrCat(node_name, "-", index, "-", kReshape, src_format, "To",
                       dst_format);
 }
 
-string Transposer::GetShapeConstNodeNameFormat(absl::string_view node_name,
-                                               int index) {
+std::string Transposer::GetShapeConstNodeNameFormat(absl::string_view node_name,
+                                                    int index) {
   return absl::StrCat(node_name, "-", index, "-", kReshapeConst);
 }
 
 // Layout sensitive transposer.
 
-inline string GetLayoutSensitiveNodeDataFormat(
+inline std::string GetLayoutSensitiveNodeDataFormat(
     const utils::MutableNodeView& node) {
   const auto* attr = node.GetAttr(kAttrDataFormat);
   if (attr != nullptr) {
@@ -1086,7 +1088,7 @@ inline bool IsValidConstPermTransposeNode(const utils::MutableNodeView& node,
     return false;
   }
 
-  const auto& tensor_data = tensor.unaligned_flat<int32>();
+  const auto& tensor_data = tensor.unaligned_flat<int32_t>();
   for (int i = 0; i < permutation_size; i++) {
     if (permutation[i] != tensor_data(i)) {
       return false;
@@ -1252,11 +1254,11 @@ absl::Status BinaryOpTransposer::AddNodeReshape(
     absl::string_view node_device, absl::string_view input_name,
     absl::string_view shape_const_node_name, const DataType& data_type) {
   NodeDef new_node;
-  new_node.set_name(string(node_name));
-  new_node.add_input(string(input_name));
-  new_node.add_input(string(shape_const_node_name));
+  new_node.set_name(node_name);
+  new_node.add_input(std::string(input_name));
+  new_node.add_input(std::string(shape_const_node_name));
   new_node.set_op(kReshape);
-  new_node.set_device(string(node_device));
+  new_node.set_device(node_device);
 
   AttrValue attr_type_indices;
   attr_type_indices.set_type(DT_INT32);
@@ -1276,9 +1278,9 @@ absl::Status BinaryOpTransposer::AddNodeShapeConst(
     absl::string_view node_device, bool node_in_frame, int num_channels,
     absl::string_view depended_node, int rank) {
   NodeDef new_node;
-  new_node.set_name(string(node_name));
+  new_node.set_name(node_name);
   new_node.set_op(kOpConst);
-  new_node.set_device(string(node_device));
+  new_node.set_device(node_device);
   AttrValue attr_data_type;
   attr_data_type.set_type(DT_INT32);
   new_node.mutable_attr()->insert({"dtype", attr_data_type});
@@ -1296,7 +1298,7 @@ absl::Status BinaryOpTransposer::AddNodeShapeConst(
     // This is to ensure the transpose node and the const node are in the same
     // frame.
     // TODO(halehri): Add Test that exercises this condition.
-    new_node.add_input(AsControlDependency(string(depended_node)));
+    new_node.add_input(AsControlDependency(std::string(depended_node)));
   }
 
   absl::Status status;
@@ -1313,11 +1315,12 @@ absl::Status BinaryOpTransposer::MaybeReshapeVectorFanin(
     vector_index = 0;
   }
   if (vector_index != -1) {
-    const string& node_name = node->GetName();
-    const string& node_device = node->GetDevice();
-    string reshape_node_name = LayoutOptimizerNode(GetReshapeNodeNameFormat(
-        node_name, vector_index, context->src_format, context->dst_format));
-    string shape_const_node_name = LayoutOptimizerNode(
+    const std::string& node_name = node->GetName();
+    const std::string& node_device = node->GetDevice();
+    std::string reshape_node_name =
+        LayoutOptimizerNode(GetReshapeNodeNameFormat(
+            node_name, vector_index, context->src_format, context->dst_format));
+    std::string shape_const_node_name = LayoutOptimizerNode(
         GetShapeConstNodeNameFormat(node_name, vector_index));
     const auto& fanin = node->GetRegularFanin(vector_index);
     auto* fanin_node = fanin.node_view();
@@ -1513,7 +1516,7 @@ bool ReduceTransposer::IsAlongAxis(const Tensor& tensor,
   for (int i = 0; i < axis_size; ++i) {
     int local_axis = 0;
     if (tensor.dtype() == DT_INT32) {
-      local_axis = tensor.flat<int32>()(i);
+      local_axis = tensor.flat<int32_t>()(i);
     } else {
       local_axis = tensor.flat<int64_t>()(i);
     }
@@ -2023,10 +2026,10 @@ absl::Status UnaryGradTransposer::TransposeNode(TransposeContext* context,
 
 // Utils.
 
-string GetDeviceName(const NodeDef& node) { return node.device(); }
+std::string GetDeviceName(const NodeDef& node) { return node.device(); }
 
 bool IsDefaultLayoutSensitiveOp(const NodeDef& node) {
-  static absl::flat_hash_set<string>* default_layout_sensitive_ops =
+  static absl::flat_hash_set<std::string>* default_layout_sensitive_ops =
       new absl::flat_hash_set<std::string>(
           {"AvgPool", "Conv2D", "DepthwiseConv2dNative", "DepthToSpace",
            "FusedBatchNorm", "FusedBatchNormV2", "FusedBatchNormV3",
@@ -2049,7 +2052,7 @@ bool IsLayoutSensitiveOp(const NodeDef& node) {
 }
 
 bool IsDefaultLayoutAgnosticOp(const NodeDef& node) {
-  static absl::flat_hash_set<string>* agnostic_nodes =
+  static absl::flat_hash_set<std::string>* agnostic_nodes =
       new absl::flat_hash_set<std::string>({"Abs",
                                             "Acos",
                                             "Acosh",
@@ -2253,7 +2256,7 @@ bool GetValueAttrFromConstInputNode(
 }
 
 bool IsDataFormatOp(const utils::MutableNodeView& node) {
-  const string& op = node.GetOp();
+  const std::string& op = node.GetOp();
   return op == kOpDataFormatDimMap || op == kOpDataFormatVecPermute;
 }
 

From 8633fb9dcf07ea96e327ef7af6d8958f8b5dbc6b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:44:49 -0800
Subject: [PATCH 628/753] Automated Code Change

PiperOrigin-RevId: 847189531
---
 .../parallel_interleave_dataset_op.cc         | 33 ++++++++++---------
 .../experimental/random_dataset_op_test.cc    |  9 +++--
 .../data/experimental/save_dataset_op_test.cc | 13 +++++---
 .../data/experimental/sleep_dataset_op.cc     |  6 ++--
 .../data/experimental/sql_dataset_op.cc       |  2 +-
 5 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
index 88e7f1528d4c83..3163f4e62c320a 100644
--- a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -112,9 +112,9 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
         output_shapes_(output_shapes),
         traceme_metadata_(
             {{"block_length",
-              strings::Printf("%lld", static_cast<long long>(block_length))},
+              absl::StrFormat("%lld", static_cast<long long>(block_length))},
              {"cycle_length",
-              strings::Printf("%lld", static_cast<long long>(cycle_length))},
+              absl::StrFormat("%lld", static_cast<long long>(cycle_length))},
              {"deterministic",
               deterministic.IsDeterministic() || deterministic.IsDefault()
                   ? "true"
@@ -126,7 +126,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
   ~Dataset() override { input_->Unref(); }
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
+      const std::string& prefix) const override {
     name_utils::IteratorPrefixParams params;
     params.op_version = op_version_;
     bool deterministic =
@@ -143,7 +143,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
     return output_shapes_;
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     name_utils::DatasetDebugStringParams params;
     params.op_version = op_version_;
     return name_utils::DatasetDebugString(kDatasetType, params);
@@ -949,7 +949,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
 
     absl::Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
         TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-      string iterator_name =
+      std::string iterator_name =
           strings::StrCat(prefix(), "::", kWorker, "_", index);
       TF_RETURN_IF_ERROR(writer->WriteScalar(iterator_name, kInputSize,
                                              workers_[index].input.size()));
@@ -975,7 +975,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
     absl::Status ReadWorkerStateLocked(IteratorContext* ctx,
                                        IteratorStateReader* reader, int index)
         TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-      string worker_prefix =
+      std::string worker_prefix =
           strings::StrCat(prefix(), "::", kWorker, "_", index);
       // Restore inputs.
       int64_t input_size;
@@ -1009,7 +1009,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
                                               IteratorStateWriter* writer,
                                               int index)
         TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-      string iterator_name =
+      std::string iterator_name =
           strings::StrCat(prefix(), "::", kWorkerThread, "_", index);
       if (worker_thread_states_[index].iterator != nullptr) {
         TF_RETURN_IF_ERROR(
@@ -1043,7 +1043,7 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
                                              IteratorStateReader* reader,
                                              int index,
                                              WorkerThreadState* state) {
-      string worker_prefix =
+      std::string worker_prefix =
           strings::StrCat(prefix(), "::", kWorkerThread, "_", index);
       // Restore inputs.
       int64_t input_size;
@@ -1083,8 +1083,8 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
 
     absl::Status WriteOutputElemLocked(IteratorStateWriter* writer,
                                        const OutputElem& output_elem,
-                                       const string& iterator_name,
-                                       const string& prefix)
+                                       const std::string& iterator_name,
+                                       const std::string& prefix)
         TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
       TF_RETURN_IF_ERROR(WriteStatusLocked(writer, iterator_name,
                                            absl::StrCat(prefix, "_", kStatus),
@@ -1103,8 +1103,8 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
     absl::Status ReadOutputElemLocked(IteratorContext* ctx,
                                       IteratorStateReader* reader,
                                       OutputElem* output_elem,
-                                      const string& iterator_name,
-                                      const string& prefix) {
+                                      const std::string& iterator_name,
+                                      const std::string& prefix) {
       TF_RETURN_IF_ERROR(ReadStatusLocked(reader, iterator_name,
                                           absl::StrCat(prefix, "_", kStatus),
                                           &output_elem->status));
@@ -1123,8 +1123,8 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
     }
 
     absl::Status WriteStatusLocked(IteratorStateWriter* writer,
-                                   const string& iterator_name,
-                                   const string& prefix,
+                                   const std::string& iterator_name,
+                                   const std::string& prefix,
                                    const absl::Status& status)
         TF_EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
       TF_RETURN_IF_ERROR(
@@ -1139,8 +1139,9 @@ class ParallelInterleaveDatasetOp::Dataset : public DatasetBase {
     }
 
     absl::Status ReadStatusLocked(IteratorStateReader* reader,
-                                  const string& iterator_name,
-                                  const string& prefix, absl::Status* status) {
+                                  const std::string& iterator_name,
+                                  const std::string& prefix,
+                                  absl::Status* status) {
       int64_t code_int;
       TF_RETURN_IF_ERROR(reader->ReadScalar(
           iterator_name, absl::StrCat(prefix, "_", kCode), &code_int));
diff --git a/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc
index a3e38ce4aeab90..f5d94b30bbd7ba 100644
--- a/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/experimental/random_dataset_op_test.cc
@@ -80,7 +80,7 @@ class RandomDatasetParams : public DatasetParams {
                       bool rerandomize_each_iteration,
                       DataTypeVector output_dtypes,
                       std::vector<PartialTensorShape> output_shapes,
-                      string node_name)
+                      std::string node_name)
       : DatasetParams(std::move(output_dtypes), std::move(output_shapes),
                       std::move(node_name)),
         seed_(CreateTensor<int64_t>(TensorShape({}), {seed})),
@@ -98,7 +98,8 @@ class RandomDatasetParams : public DatasetParams {
     return {seed_, seed2_, seed_generator_resource_};
   }
 
-  absl::Status GetInputNames(std::vector<string>* input_names) const override {
+  absl::Status GetInputNames(
+      std::vector<std::string>* input_names) const override {
     *input_names = {RandomDatasetOp::kSeed, RandomDatasetOp::kSeed2};
     if (op_version_ == 2) {
       input_names->emplace_back("seed_generator");
@@ -117,7 +118,9 @@ class RandomDatasetParams : public DatasetParams {
     return absl::OkStatus();
   }
 
-  string dataset_type() const override { return RandomDatasetOp::kDatasetType; }
+  std::string dataset_type() const override {
+    return RandomDatasetOp::kDatasetType;
+  }
 
  private:
   Tensor seed_;
diff --git a/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc
index fe2315e35bd6a4..01f96cb04ed82e 100644
--- a/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/experimental/save_dataset_op_test.cc
@@ -38,7 +38,7 @@ class SaveDatasetV2Params : public DatasetParams {
                       std::vector<FunctionDef> func_lib, bool use_shard_func,
                       DataTypeVector output_dtypes,
                       std::vector<PartialTensorShape> output_shapes,
-                      string node_name, DataTypeVector type_arguments)
+                      std::string node_name, DataTypeVector type_arguments)
       : DatasetParams(std::move(output_dtypes), std::move(output_shapes),
                       std::move(node_name)),
         path_(path),
@@ -59,7 +59,8 @@ class SaveDatasetV2Params : public DatasetParams {
     return input_tensors;
   }
 
-  absl::Status GetInputNames(std::vector<string>* input_names) const override {
+  absl::Status GetInputNames(
+      std::vector<std::string>* input_names) const override {
     input_names->clear();
     input_names->emplace_back(SaveDatasetV2Op::kInputDataset);
     input_names->emplace_back(SaveDatasetV2Op::kPath);
@@ -78,11 +79,13 @@ class SaveDatasetV2Params : public DatasetParams {
     return absl::OkStatus();
   }
 
-  string path() const { return path_; }
+  std::string path() const { return path_; }
 
-  string dataset_type() const override { return SaveDatasetV2Op::kDatasetType; }
+  std::string dataset_type() const override {
+    return SaveDatasetV2Op::kDatasetType;
+  }
 
-  string op_name() const override { return "SaveDatasetV2"; }
+  std::string op_name() const override { return "SaveDatasetV2"; }
 
   std::vector<FunctionDef> func_lib() const override { return func_lib_; }
 
diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
index b765f96d60e71c..ff15bd00f4e1c6 100644
--- a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
@@ -60,7 +60,7 @@ class SleepDatasetOp : public UnaryDatasetOpKernel {
     ~Dataset() override { input_->Unref(); }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
+        const std::string& prefix) const override {
       return std::make_unique<Iterator>(
           Iterator::Params{this, absl::StrCat(prefix, "::Sleep")});
     }
@@ -72,7 +72,9 @@ class SleepDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() const override { return "SleepDatasetOp::Dataset"; }
+    std::string DebugString() const override {
+      return "SleepDatasetOp::Dataset";
+    }
 
     int64_t CardinalityInternal(CardinalityOptions options) const override {
       return input_->Cardinality(options);
diff --git a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
index cab138c9903c42..3ab56ba9af36bd 100644
--- a/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
@@ -72,7 +72,7 @@ class SqlDatasetOp : public DatasetOpKernel {
     // TODO(b/64276826) Change this check when we add support for other
     // databases.
     OP_REQUIRES(ctx, driver_name == "sqlite",
-                errors::InvalidArgument(tensorflow::strings::Printf(
+                errors::InvalidArgument(absl::StrFormat(
                     "The database type, %s, is not supported by SqlDataset. "
                     "The set of supported databases is: {'sqlite'}.",
                     driver_name.c_str())));

From b1082f97e5d537191f51eba5331322a98ee5154c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:45:28 -0800
Subject: [PATCH 629/753] Automated Code Change

PiperOrigin-RevId: 847189651
---
 tensorflow/core/kernels/batching_util/concat_split_util.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/batching_util/concat_split_util.h b/tensorflow/core/kernels/batching_util/concat_split_util.h
index b5354be35c70a9..4ac0100fbdf44a 100644
--- a/tensorflow/core/kernels/batching_util/concat_split_util.h
+++ b/tensorflow/core/kernels/batching_util/concat_split_util.h
@@ -81,7 +81,7 @@ absl::Status Concat(OpKernelContext* context,
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
     if (std::is_same<Device, GPUDevice>::value) {
       ConcatGPU<T>(context, inputs_flat, output, &output_flat);
-      return OkStatus();
+      return absl::OkStatus();
     }
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
     ConcatCPU<T>(context->device(), inputs_flat, &output_flat);
@@ -198,9 +198,9 @@ absl::Status SplitCPU(OpKernelContext* context, const Tensor& input,
 
 // Handles the general case, on GPU.
 template <typename T>
-Status SplitGPU(OpKernelContext* context, const Tensor& input,
-                const gtl::ArraySlice<int64_t>& sizes,
-                std::vector<Tensor>* outputs) {
+absl::Status SplitGPU(OpKernelContext* context, const Tensor& input,
+                      const absl::Span<const int64_t>& sizes,
+                      std::vector<Tensor>* outputs) {
   // TODO(olston, apassos): Implement this.
   LOG(FATAL) << "Not yet implemented";  // Crash ok
 }

From b1b2723815eac81689c219da3c82d4092e4c5b66 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:47:29 -0800
Subject: [PATCH 630/753] Automated Code Change

PiperOrigin-RevId: 847190131
---
 tensorflow/c/eager/custom_device_testutil.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/c/eager/custom_device_testutil.cc b/tensorflow/c/eager/custom_device_testutil.cc
index f4221e765cd39b..d31bd6e4257c6c 100644
--- a/tensorflow/c/eager/custom_device_testutil.cc
+++ b/tensorflow/c/eager/custom_device_testutil.cc
@@ -27,8 +27,8 @@ limitations under the License.
 namespace {
 
 struct LoggingDevice {
-  tensorflow::string device_name;
-  tensorflow::string underlying_device;
+  std::string device_name;
+  std::string underlying_device;
   // Set to true whenever a TensorHandle is copied onto the device
   bool* arrived_flag;
   // Set to true whenever an operation is executed
@@ -59,9 +59,10 @@ void LoggedTensorDeallocator(void* data) {
   delete reinterpret_cast<LoggedTensor*>(data);
 }
 
-TFE_TensorHandle* MakeLoggedTensorHandle(
-    TFE_Context* context, const tensorflow::string& logging_device_name,
-    std::unique_ptr<LoggedTensor> t, TF_Status* status) {
+TFE_TensorHandle* MakeLoggedTensorHandle(TFE_Context* context,
+                                         const std::string& logging_device_name,
+                                         std::unique_ptr<LoggedTensor> t,
+                                         TF_Status* status) {
   auto dtype = TFE_TensorHandleDataType(t->tensor);
   TFE_CustomDeviceTensorHandleMethods handle_methods;
   handle_methods.num_dims = &LoggedTensorNumDims;

From 9799cb0a78dbd44f953377b1947cf919f2c7138e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:48:12 -0800
Subject: [PATCH 631/753] Automated Code Change

PiperOrigin-RevId: 847190272
---
 tensorflow/core/kernels/conv_ops_gpu.cc  | 4 ++--
 tensorflow/core/kernels/cudnn_rnn_ops.cc | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu.cc b/tensorflow/core/kernels/conv_ops_gpu.cc
index bf46cf15a7e4fc..8f11683fe20070 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu.cc
@@ -85,7 +85,7 @@ StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
   auto* stream = ctx->op_device_context()->stream();
 
   if (!autotune_map->Find(params, &autotune_entry)) {
-    profiler::ScopedAnnotation trace("cudnn_autotuning");
+    tsl::profiler::ScopedAnnotation trace("cudnn_autotuning");
 
     se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
                                                 stream);
@@ -250,7 +250,7 @@ StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
   auto* stream = ctx->op_device_context()->stream();
 
   if (!autotune_map->Find(conv_parameters, &autotune_entry)) {
-    profiler::ScopedAnnotation annotation("cudnn_autotuning");
+    tsl::profiler::ScopedAnnotation annotation("cudnn_autotuning");
 
 #if GOOGLE_CUDA
     se::TfAllocatorAdapter tf_allocator_adapter(ctx->device()->GetAllocator({}),
diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc
index f81c3176424843..00ce115511e76d 100644
--- a/tensorflow/core/kernels/cudnn_rnn_ops.cc
+++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc
@@ -1924,7 +1924,7 @@ class CudnnRNNForwardOpV2<GPUDevice, T>
               << algo_config->algorithm()->tensor_ops_enabled() << ").";
       return OkStatus();
     }
-    profiler::ScopedAnnotation trace("cudnn_autotuning");
+    tsl::profiler::ScopedAnnotation trace("cudnn_autotuning");
 
     // Create temp tensors when profiling backprop pass.
     auto data_type = input->dtype();

From 618abe0548934511ba3b0a5bc089494ac732f6fc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:49:22 -0800
Subject: [PATCH 632/753] Automated Code Change

PiperOrigin-RevId: 847190441
---
 .../fuzzing/example_proto_fast_parsing_fuzz.cc     |  2 +-
 tensorflow/core/kernels/fuzzing/fuzz_session.h     | 14 +++++++-------
 tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc    | 12 ++++++------
 .../core/kernels/fuzzing/parse_tensor_op_fuzz.cc   |  3 ++-
 tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc |  6 +++---
 5 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
index 73fa3c4b74e296..794f51cd1cb394 100644
--- a/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/example_proto_fast_parsing_fuzz.cc
@@ -62,7 +62,7 @@ class FuzzExampleProtoFastParsing : public FuzzSession {
     // TODO(dga):  Test the batch case also.
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<tstring>()() =
-        string(reinterpret_cast<const char*>(data), size);
+        std::string(reinterpret_cast<const char*>(data), size);
     RunInputs({{"input", input_tensor}});
   }
 };
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 09c7563d2efd17..d178208a1a35e0 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -81,7 +81,7 @@ class FuzzSession {
   // Initializes the FuzzSession.  Not safe for multithreading.
   // Separate init function because the call to virtual BuildGraphDef
   // can't be put into the constructor.
-  Status InitIfNeeded() {
+  absl::Status InitIfNeeded() {
     if (initialized_) {
       return absl::OkStatus();
     }
@@ -96,7 +96,7 @@ class FuzzSession {
     GraphDef graph_def;
     TF_CHECK_OK(root.ToGraphDef(&graph_def));
 
-    Status status = session_->Create(graph_def);
+    absl::Status status = session_->Create(graph_def);
     if (!status.ok()) {
       // This is FATAL, because this code is designed to fuzz an op
       // within a session.  Failure to create the session means we
@@ -111,20 +111,20 @@ class FuzzSession {
   // any returned output.
   // Note: We are ignoring Status from Run here since fuzzers don't need to
   // check it (as that will slow them down and printing/logging is useless).
-  void RunInputs(const std::vector<std::pair<string, Tensor> >& inputs) {
+  void RunInputs(const std::vector<std::pair<std::string, Tensor> >& inputs) {
     RunInputsWithStatus(inputs).IgnoreError();
   }
 
   // Same as RunInputs but don't ignore status
-  Status RunInputsWithStatus(
-      const std::vector<std::pair<string, Tensor> >& inputs) {
+  absl::Status RunInputsWithStatus(
+      const std::vector<std::pair<std::string, Tensor> >& inputs) {
     return session_->Run(inputs, {}, {"output"}, nullptr);
   }
 
   // Dispatches to FuzzImpl;  small amount of sugar to keep the code
   // of the per-op fuzzers tiny.
   int Fuzz(const uint8_t* data, size_t size) {
-    Status status = InitIfNeeded();
+    absl::Status status = InitIfNeeded();
     TF_CHECK_OK(status) << "Fuzzer graph initialization failed: "
                         << status.message();
     // No return value from fuzzing:  Success is defined as "did not
@@ -146,7 +146,7 @@ class FuzzStringInputOp : public FuzzSession {
   void FuzzImpl(const uint8_t* data, size_t size) final {
     Tensor input_tensor(tensorflow::DT_STRING, TensorShape({}));
     input_tensor.scalar<tstring>()() =
-        string(reinterpret_cast<const char*>(data), size);
+        std::string(reinterpret_cast<const char*>(data), size);
     RunInputs({{"input", input_tensor}});
   }
 };
diff --git a/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
index 08af574ac9ae4e..458329000ca349 100644
--- a/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/one_hot_fuzz.cc
@@ -42,7 +42,7 @@ class FuzzOneHot : public FuzzSession {
   void FuzzImpl(const uint8_t* data, size_t size) override {
     int64_t input_size;
     int32_t depth;
-    uint8 on, off;
+    uint8_t on, off;
     const uint8_t* input_data;
 
     if (size > 3) {
@@ -51,7 +51,7 @@ class FuzzOneHot : public FuzzSession {
       if (size > kMaxSize) {
         size = kMaxSize;
       }
-      depth = static_cast<int32>(data[0]);
+      depth = static_cast<int32_t>(data[0]);
       on = data[1];
       off = data[2];
       input_size = static_cast<int64_t>(size - 3);
@@ -69,13 +69,13 @@ class FuzzOneHot : public FuzzSession {
     Tensor on_tensor(tensorflow::DT_UINT8, TensorShape({}));
     Tensor off_tensor(tensorflow::DT_UINT8, TensorShape({}));
 
-    auto flat_tensor = input_tensor.flat<uint8>();
+    auto flat_tensor = input_tensor.flat<uint8_t>();
     for (size_t i = 0; i < input_size; i++) {
       flat_tensor(i) = input_data[i];
     }
-    depth_tensor.scalar<int32>()() = depth;
-    on_tensor.scalar<uint8>()() = on;
-    off_tensor.scalar<uint8>()() = off;
+    depth_tensor.scalar<int32_t>()() = depth;
+    on_tensor.scalar<uint8_t>()() = on;
+    off_tensor.scalar<uint8_t>()() = off;
 
     RunInputs({{"input", input_tensor},
                {"depth", depth_tensor},
diff --git a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
index de3ae36dc75d56..a8cc47e599ee43 100644
--- a/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/parse_tensor_op_fuzz.cc
@@ -53,7 +53,8 @@ class FuzzParseTensor : public FuzzSession {
     // detects another similar OOM.
     // After adding `-fsanitize=null` to ASAN (cl/317376103), the memory
     // footprint increased, so we lower the maximum threshold to 2^18.
-    string as_string = string(reinterpret_cast<const char*>(data), size);
+    std::string as_string =
+        std::string(reinterpret_cast<const char*>(data), size);
     TensorProto proto;
     if (!ParseProtoUnlimited(&proto, as_string)) {
       LOG(WARNING) << "Unable to parse proto of tensor\n";
diff --git a/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
index 5104711ad3048f..81f489b2080d80 100644
--- a/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/scatter_nd_fuzz.cc
@@ -67,7 +67,7 @@ class FuzzScatterNd : public FuzzSession {
 
     // Subsequent elements give the contents of the shape tensor.
     // To not get out of memory, reduce all dimensions to at most kMaxDim
-    auto flat_shape = shape_tensor.flat<int32>();
+    auto flat_shape = shape_tensor.flat<int32_t>();
     for (i = 0; i < shape_dims; i++) {
       flat_shape(i) = data[data_ix++] % kMaxDim;
     }
@@ -94,7 +94,7 @@ class FuzzScatterNd : public FuzzSession {
     Tensor indices_tensor(tensorflow::DT_INT32, TensorShape(indices_dims));
 
     // Rest of the buffer is used to fill in the indices_tensor
-    auto flat_indices = indices_tensor.flat<int32>();
+    auto flat_indices = indices_tensor.flat<int32_t>();
     for (i = 0; i < num_indices && data_ix < size; i++) {
       flat_indices(i) = data[data_ix++];
     }
@@ -118,7 +118,7 @@ class FuzzScatterNd : public FuzzSession {
     Tensor updates_tensor(tensorflow::DT_INT32, TensorShape(updates_dims));
 
     // We don't care about the values in the updates_tensor, make them all be 1
-    auto flat_updates = updates_tensor.flat<int32>();
+    auto flat_updates = updates_tensor.flat<int32_t>();
     for (i = 0; i < num_indices; i++) {
       flat_updates(i) = 1;
     }

From 9f41228dad15e62693920599487c8006748302e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:49:31 -0800
Subject: [PATCH 633/753] Automated Code Change

PiperOrigin-RevId: 847190468
---
 tensorflow/core/kernels/autotune_conv_impl.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/autotune_conv_impl.h b/tensorflow/core/kernels/autotune_conv_impl.h
index 63c6a64d1282a7..91530d7bbc269f 100644
--- a/tensorflow/core/kernels/autotune_conv_impl.h
+++ b/tensorflow/core/kernels/autotune_conv_impl.h
@@ -28,7 +28,7 @@ limitations under the License.
 namespace tensorflow::internal {
 
 template <typename LaunchFunc, typename Sig>
-StatusOr<std::vector<xla::AutotuneResult>> AutotuneConvImpl(
+absl::StatusOr<std::vector<xla::AutotuneResult>> AutotuneConvImpl(
     OpKernelContext* ctx,
     std::vector<std::unique_ptr<const se::dnn::OpRunner<Sig>>>& runners,
     bool actually_do_autotune, const LaunchFunc& launch_func,
@@ -54,10 +54,10 @@ StatusOr<std::vector<xla::AutotuneResult>> AutotuneConvImpl(
 
     TF_ASSIGN_OR_RETURN(auto desc, runner->ToAlgorithmDesc());
     se::dnn::ProfileResult profile_result;
-    Status cudnn_launch_status =
+    absl::Status cudnn_launch_status =
         actually_do_autotune
             ? launch_func(allocator_used, runner, &profile_result)
-            : OkStatus();
+            : absl::OkStatus();
     if (!actually_do_autotune) {
       // Make the result valid according to `is_valid`.
       profile_result.set_algorithm(desc);

From 580eeae4c35b992f31019389691fa60bf1483123 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 20 Dec 2025 13:49:35 -0800
Subject: [PATCH 634/753] Automated Code Change

PiperOrigin-RevId: 847190483
---
 .../device/device_event_mgr_test.cc           | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/common_runtime/device/device_event_mgr_test.cc b/tensorflow/core/common_runtime/device/device_event_mgr_test.cc
index d252b541fcddfb..e485fb8c7d31b0 100644
--- a/tensorflow/core/common_runtime/device/device_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/device/device_event_mgr_test.cc
@@ -184,12 +184,12 @@ class EMBenchmarkHelper {
   // The rest of these are one per chain.
   NodeDef add_node_def_;
   NodeDef id_node_def_;
-  gtl::InlinedVector<TensorValue, 4> add_inputs_;
+  absl::InlinedVector<TensorValue, 4UL> add_inputs_;
   std::vector<AllocatorAttributes> allocator_attrs_;
-  gtl::InlinedVector<Tensor, 4> gpu_inputs_;
-  gtl::InlinedVector<Tensor, 4> gpu_outputs_;
-  gtl::InlinedVector<Tensor, 4> host_inputs_;
-  gtl::InlinedVector<Tensor, 4> host_outputs_;
+  absl::InlinedVector<Tensor, 4UL> gpu_inputs_;
+  absl::InlinedVector<Tensor, 4UL> gpu_outputs_;
+  absl::InlinedVector<Tensor, 4UL> host_inputs_;
+  absl::InlinedVector<Tensor, 4UL> host_outputs_;
 
  public:
   // Length of tensors.  TODO(tucker): make this a variable parameter.
@@ -242,7 +242,7 @@ class EMBenchmarkHelper {
   }
 
   std::unique_ptr<OpKernel> GetOpKernel(const NodeDef& node_def,
-                                        Status* status) {
+                                        absl::Status* status) {
     return CreateOpKernel("GPU", gpu_helper_->gpu(),
                           gpu_helper_->gpu_allocator(), node_def,
                           TF_GRAPH_DEF_VERSION, status);
@@ -256,7 +256,7 @@ class EMBenchmarkHelper {
                        .Device("/job:a/replica:0/task:0/GPU:0")
                        .Finalize(&add_node_def_));
     }
-    Status status;
+    absl::Status status;
     add_kernels_.emplace_back(GetOpKernel(add_node_def_, &status));
     TF_ASSERT_OK(status);
     add_params_.push_back(new OpKernelContext::Params);
@@ -385,12 +385,12 @@ class EMBenchmarkHelper {
           gpu_helper_->h2d_stream()->WaitFor(gpu_helper_->compute_stream()));
       // Begin by copying the input values from CPU to GPU.
       const int64_t src_bytes = host_inputs_[0].TotalBytes();
-      se::DeviceMemoryBase gpu_dst_ptr0(DMAHelper::base(&gpu_inputs_[0]),
-                                        src_bytes);
+      stream_executor::DeviceAddressBase gpu_dst_ptr0(
+          DMAHelper::base(&gpu_inputs_[0]), src_bytes);
       TF_ASSERT_OK(gpu_helper_->h2d_stream()->Memcpy(
           &gpu_dst_ptr0, DMAHelper::base(&host_inputs_[0]), src_bytes));
-      se::DeviceMemoryBase gpu_dst_ptr1(DMAHelper::base(&gpu_inputs_[1]),
-                                        src_bytes);
+      stream_executor::DeviceAddressBase gpu_dst_ptr1(
+          DMAHelper::base(&gpu_inputs_[1]), src_bytes);
       TF_ASSERT_OK(gpu_helper_->h2d_stream()->Memcpy(
           &gpu_dst_ptr1, DMAHelper::base(&host_inputs_[1]), src_bytes));
       TF_ASSERT_OK(
@@ -421,8 +421,8 @@ class EMBenchmarkHelper {
       TF_ASSERT_OK(
           gpu_helper_->d2h_stream()->WaitFor(gpu_helper_->compute_stream()));
       const int64_t return_bytes = ctx->mutable_output(0)->TotalBytes();
-      se::DeviceMemoryBase gpu_src_ptr(DMAHelper::base(ctx->mutable_output(0)),
-                                       return_bytes);
+      stream_executor::DeviceAddressBase gpu_src_ptr(
+          DMAHelper::base(ctx->mutable_output(0)), return_bytes);
       TF_ASSERT_OK(gpu_helper_->d2h_stream()->Memcpy(
           DMAHelper::base(&host_outputs_[0]), gpu_src_ptr, return_bytes));
       gpu_helper_->event_mgr()->ThenExecute(gpu_helper_->d2h_stream(),

From ff7eb222c2bc3f80289f8d30f3d9a651707a2ac7 Mon Sep 17 00:00:00 2001
From: Bhupendra Dubey <bhupendradubey@google.com>
Date: Sat, 20 Dec 2025 19:50:25 -0800
Subject: [PATCH 635/753] Refactor XLA Profiler State Check to Use Low-Overhead
 C API

This CL refactors the XLA profiler's state-checking mechanism to resolve GIL deadlocks and improve performance.

Previously, the C++ profiler context would import a Python module to update the profiler's state. This operation, performed while holding the GIL, could cause deadlocks if the import failed (e.g., in a JAX-only environment).

This change replaces the fragile cross-language import with a shared C++ std::atomic<bool>. Python code now queries this state via a new, low-overhead C function (is_traceme_enabled_raw) instead of ctypes.

This approach eliminates the deadlocks, decouples the C++ profiler from Python modules, and maintains high performance for the state check. The internal C++ API was also updated to use a safer reference instead of a raw pointer.

PiperOrigin-RevId: 847261952
---
 tensorflow/compiler/jit/BUILD                 |  5 +++-
 tensorflow/python/profiler/internal/BUILD     |  1 +
 .../profiler/internal/_pywrap_traceme.pyi     |  2 ++
 .../profiler/internal/traceme_wrapper.cc      | 19 ++++++++++++
 .../python/profiler/profiler_v2_test.py       |  8 +++++
 tensorflow/python/profiler/trace.py           | 10 +++----
 .../xla/xla/python/profiler/internal/BUILD    | 16 +++++++++-
 .../python/profiler/internal/python_hooks.cc  | 16 ++--------
 .../python/profiler/internal/python_hooks.h   |  1 -
 .../python/profiler/internal/traceme_state.cc | 25 ++++++++++++++++
 .../python/profiler/internal/traceme_state.h  | 29 +++++++++++++++++++
 11 files changed, 111 insertions(+), 21 deletions(-)
 create mode 100644 third_party/xla/xla/python/profiler/internal/traceme_state.cc
 create mode 100644 third_party/xla/xla/python/profiler/internal/traceme_state.h

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 7c1772c084750c..9c2de5b39016e7 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -65,7 +65,10 @@ filegroup(
 # Please use the individual targets in the deps list as needed. See b/336889334.
 cc_library(
     name = "jit",
-    visibility = internal_visibility([":legacy_jit_users"]),
+    visibility = internal_visibility([
+        ":legacy_jit_users",
+        "//tensorflow/python/profiler:__pkg__",
+    ]),
     deps = [
         ":xla_cpu_device",
         ":xla_cpu_jit",
diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD
index 18cf1e126fd049..bcf9a42a941023 100644
--- a/tensorflow/python/profiler/internal/BUILD
+++ b/tensorflow/python/profiler/internal/BUILD
@@ -108,6 +108,7 @@ tf_python_pybind_extension(
         "//tensorflow/tools/pip_package:__subpackages__",
     ],
     deps = [
+        "@local_xla//xla/python/profiler/internal:traceme_state",
         "@local_xla//xla/python/profiler/internal:traceme_wrapper",
         "@pybind11",
     ],
diff --git a/tensorflow/python/profiler/internal/_pywrap_traceme.pyi b/tensorflow/python/profiler/internal/_pywrap_traceme.pyi
index 105e2dce09d3a7..47b8b56c94a269 100644
--- a/tensorflow/python/profiler/internal/_pywrap_traceme.pyi
+++ b/tensorflow/python/profiler/internal/_pywrap_traceme.pyi
@@ -17,3 +17,5 @@ class TraceMe:
     def __init__(self, arg0: str, **kwargs) -> None: ...
     def SetMetadata(self, **kwargs) -> None: ...
     def Stop(self) -> None: ...
+
+def traceme_enabled(*args, **kwargs): ...
diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.cc b/tensorflow/python/profiler/internal/traceme_wrapper.cc
index ba1b1a63674491..9397eb18134cf3 100644
--- a/tensorflow/python/profiler/internal/traceme_wrapper.cc
+++ b/tensorflow/python/profiler/internal/traceme_wrapper.cc
@@ -17,14 +17,33 @@ limitations under the License.
 
 #include "pybind11/attr.h"  // from @pybind11
 #include "pybind11/pybind11.h"  // from @pybind11
+#include "xla/python/profiler/internal/traceme_state.h"
 
 namespace py = ::pybind11;
 
 using ::xla::profiler::TraceMeWrapper;
 
+// Returns true if TraceMe is enabled.
+// This is a low-overhead function that can be called frequently.
+static PyObject* traceme_enabled(PyObject* self, PyObject* args) {
+  if (xla::profiler::traceme_enabled) {
+    Py_RETURN_TRUE;
+  }
+  Py_RETURN_FALSE;
+}
+
+static PyMethodDef traceme_method_def = {"traceme_enabled", traceme_enabled,
+                                         METH_NOARGS,
+                                         "Returns true if TraceMe is enabled."};
+
 PYBIND11_MODULE(_pywrap_traceme, m) {
   py::class_<TraceMeWrapper>(m, "TraceMe", py::module_local())
       .def(py::init<const py::str&, const py::kwargs&>())
       .def("SetMetadata", &TraceMeWrapper::SetMetadata)
       .def("Stop", &TraceMeWrapper::Stop);
+
+  py::object module_name = m.attr("__name__");
+  m.attr("traceme_enabled") =
+      py::reinterpret_steal<py::object>(PyCFunction_NewEx(
+          &traceme_method_def, /*self=*/nullptr, module_name.ptr()));
 };
diff --git a/tensorflow/python/profiler/profiler_v2_test.py b/tensorflow/python/profiler/profiler_v2_test.py
index b0b4ff301f6b0b..bec85cdc60bba8 100644
--- a/tensorflow/python/profiler/profiler_v2_test.py
+++ b/tensorflow/python/profiler/profiler_v2_test.py
@@ -98,6 +98,14 @@ def test_context_manager_with_options(self):
     file_list = gfile.ListDirectory(logdir)
     self.assertEqual(len(file_list), 1)
 
+  def test_callback(self):
+    logdir = self.get_temp_dir()
+    self.assertFalse(trace.enabled())
+    profiler.start(logdir)
+    self.assertTrue(trace.enabled())
+    profiler.stop()
+    self.assertFalse(trace.enabled())
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/profiler/trace.py b/tensorflow/python/profiler/trace.py
index 6b6bc7ac243a75..4c877803ef7623 100644
--- a/tensorflow/python/profiler/trace.py
+++ b/tensorflow/python/profiler/trace.py
@@ -19,9 +19,9 @@
 from tensorflow.python.profiler.internal import _pywrap_traceme
 from tensorflow.python.util.tf_export import tf_export
 
-# This variable is modified by PythonHooks::Start/Stop() in C++. Such
-# arrangement will reduce the number of calls through pybind11.
-enabled = False
+# This is a low-overhead function that directly calls C++ to check if the
+# profiler is enabled.
+enabled = _pywrap_traceme.traceme_enabled
 
 
 @tf_export('profiler.experimental.Trace', v1=[])
@@ -74,7 +74,7 @@ def __init__(self, name, **kwargs):
       The example above uses the keyword argument "step_num" to specify the
       training step being traced.
     """
-    if enabled:
+    if enabled():
       # Creating _pywrap_traceme.TraceMe starts the clock.
       self._traceme = _pywrap_traceme.TraceMe(name, **kwargs)
     else:
@@ -177,7 +177,7 @@ def inner_wrapper(func):
 
     @functools.wraps(func)
     def wrapped(*args, **kwargs):
-      if enabled:
+      if enabled():
         with Trace(trace_name, **trace_kwargs):
           return func(*args, **kwargs)
       return func(*args, **kwargs)
diff --git a/third_party/xla/xla/python/profiler/internal/BUILD b/third_party/xla/xla/python/profiler/internal/BUILD
index 8332ea79552d77..08d97088e51d8d 100644
--- a/third_party/xla/xla/python/profiler/internal/BUILD
+++ b/third_party/xla/xla/python/profiler/internal/BUILD
@@ -1,4 +1,4 @@
-load("//xla/tsl:tsl.bzl", "internal_visibility")
+load("//xla/tsl:tsl.bzl", "if_windows", "internal_visibility")
 load("//xla/tsl:tsl.default.bzl", "get_compatible_with_portable")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
 load("//xla/tsl/profiler/builds:build_config.bzl", "tf_profiler_copts")
@@ -21,6 +21,7 @@ cc_library(
         "//tensorflow/python/profiler/internal:__subpackages__",
     ]),
     deps = [
+        ":traceme_state",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:macros",
         "//xla/tsl/profiler/utils:time_utils",
@@ -40,6 +41,19 @@ cc_library(
     alwayslink = True,
 )
 
+cc_library(
+    name = "traceme_state",
+    srcs = ["traceme_state.cc"],
+    hdrs = ["traceme_state.h"],
+    copts = tf_profiler_copts() + if_windows(["/DTF_COMPILE_LIBRARY"]),
+    visibility = internal_visibility([
+        "//tensorflow/python/profiler/internal:__pkg__",
+    ]),
+    deps = [
+        "//xla/tsl/platform:macros",
+    ],
+)
+
 cc_library(
     name = "traceme_wrapper",
     hdrs = ["traceme_wrapper.h"],
diff --git a/third_party/xla/xla/python/profiler/internal/python_hooks.cc b/third_party/xla/xla/python/profiler/internal/python_hooks.cc
index 052a2a06da6021..fbe1afabf26270 100644
--- a/third_party/xla/xla/python/profiler/internal/python_hooks.cc
+++ b/third_party/xla/xla/python/profiler/internal/python_hooks.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/string_view.h"
 #include "absl/strings/strip.h"
+#include "xla/python/profiler/internal/traceme_state.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/profiler/utils/time_utils.h"
 #include "xla/tsl/profiler/utils/xplane_builder.h"
@@ -148,7 +149,7 @@ void PythonHookContext::Start(const PythonHooksOptions& options) {
   if (options_.enable_python_traceme || options_.enable_trace_python_function) {
     PyGILState_STATE gil_state = PyGILState_Ensure();
     if (options_.enable_python_traceme) {
-      EnableTraceMe(true);
+      traceme_enabled = true;
     }
     if (options_.enable_trace_python_function) {
       SetProfilerInAllThreads();
@@ -187,7 +188,7 @@ void PythonHookContext::Stop() {
       ClearProfilerInAllThreads();
     }
     if (options_.enable_python_traceme) {
-      EnableTraceMe(false);
+      traceme_enabled = false;
     }
     PyGILState_Release(gil_state);
   }
@@ -408,16 +409,5 @@ void PythonHookContext::ProfileFast(PyFrameObject* frame, int what,
   ThreadingSetProfile(py::none());
 }
 
-/*static*/ void PythonHookContext::EnableTraceMe(bool enable) {
-  const char* kModuleName =
-      "tensorflow.python.profiler.trace";
-  try {
-    auto trace_module = py::module::import(kModuleName);
-    trace_module.attr("enabled") = py::bool_(enable);
-  } catch (const py::error_already_set& e) {
-    LOG(INFO) << "Can't import " << kModuleName;
-  }
-}
-
 }  // namespace profiler
 }  // namespace xla
diff --git a/third_party/xla/xla/python/profiler/internal/python_hooks.h b/third_party/xla/xla/python/profiler/internal/python_hooks.h
index fd1ffbf4c925c3..623df122ed1cd9 100644
--- a/third_party/xla/xla/python/profiler/internal/python_hooks.h
+++ b/third_party/xla/xla/python/profiler/internal/python_hooks.h
@@ -135,7 +135,6 @@ class PythonHookContext {
   void Stop();
   void ProfileFast(PyFrameObject* frame, int what, PyObject* arg);
   void CollectData(tensorflow::profiler::XPlane* raw_plane);
-  static void EnableTraceMe(bool enable);
 
   static void SetProfilerInAllThreads();
   static void ClearProfilerInAllThreads();
diff --git a/third_party/xla/xla/python/profiler/internal/traceme_state.cc b/third_party/xla/xla/python/profiler/internal/traceme_state.cc
new file mode 100644
index 00000000000000..b4959e725ab372
--- /dev/null
+++ b/third_party/xla/xla/python/profiler/internal/traceme_state.cc
@@ -0,0 +1,25 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "xla/python/profiler/internal/traceme_state.h"
+
+#include <atomic>
+
+namespace xla {
+namespace profiler {
+
+std::atomic<bool> traceme_enabled{false};
+
+}  // namespace profiler
+}  // namespace xla
diff --git a/third_party/xla/xla/python/profiler/internal/traceme_state.h b/third_party/xla/xla/python/profiler/internal/traceme_state.h
new file mode 100644
index 00000000000000..937321772507a5
--- /dev/null
+++ b/third_party/xla/xla/python/profiler/internal/traceme_state.h
@@ -0,0 +1,29 @@
+/* Copyright 2024 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef XLA_PYTHON_PROFILER_INTERNAL_TRACEME_STATE_H_
+#define XLA_PYTHON_PROFILER_INTERNAL_TRACEME_STATE_H_
+
+#include <atomic>
+
+#include "xla/tsl/platform/macros.h"
+namespace xla {
+namespace profiler {
+
+// Indicates whether TraceMe is enabled.
+TF_EXPORT extern std::atomic<bool> traceme_enabled;
+}  // namespace profiler
+}  // namespace xla
+
+#endif  // XLA_PYTHON_PROFILER_INTERNAL_TRACEME_STATE_H_

From e60b3eb36242117511268c747824c8d543b86a8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 01:03:55 -0800
Subject: [PATCH 636/753] compat: Update forward compatibility horizon to
 2025-12-21

PiperOrigin-RevId: 847339112
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index d99963a6b2858b..0008a1f31bae3b 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 20)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 21)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From ffb02301df8e6d398c0874b6a548b3dc271d5a30 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 01:04:06 -0800
Subject: [PATCH 637/753] Update GraphDef version to 2448.

PiperOrigin-RevId: 847339150
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index ec50d4b11648d9..6091b3a6cb3d81 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2447  // Updated: 2025/12/20
+#define TF_GRAPH_DEF_VERSION 2448  // Updated: 2025/12/21
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From a27a856d1ca2f577ec835cfc8b12c6dc03515704 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 02:50:18 -0800
Subject: [PATCH 638/753] Automated Code Change

PiperOrigin-RevId: 847361371
---
 .../xla/backends/gpu/target_config/target_config_test.cc   | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc b/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc
index e94d8100f157f3..543a21730dae26 100644
--- a/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc
+++ b/third_party/xla/xla/backends/gpu/target_config/target_config_test.cc
@@ -44,12 +44,13 @@ TEST_P(GetGpuTargetConfigTest, TestProtoRetrieval) {
   auto config = GetGpuTargetConfig(test_case.gpu_model);
 
   if (test_case.expect_ok) {
-    ASSERT_THAT(config, IsOk());
+    ASSERT_THAT(config, absl_testing::IsOk());
     EXPECT_TRUE(config->has_gpu_device_info());
     EXPECT_GT(config->gpu_device_info().threads_per_block_limit(), 0);
   } else {
-    EXPECT_THAT(config, StatusIs(absl::StatusCode::kNotFound,
-                                 HasSubstr("Embedded file not found")));
+    EXPECT_THAT(config,
+                absl_testing::StatusIs(absl::StatusCode::kNotFound,
+                                       HasSubstr("Embedded file not found")));
   }
 }
 

From 5042531aa849393d2a0c9af04f56a4462081d53a Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Sun, 21 Dec 2025 04:52:21 -0800
Subject: [PATCH 639/753] Moving definitions to cpp file, match function
 definition declaration order

PiperOrigin-RevId: 847385799
---
 third_party/xla/xla/hlo/ir/mesh_and_axis.cc | 73 +++++++++++++++------
 third_party/xla/xla/hlo/ir/mesh_and_axis.h  | 37 +----------
 2 files changed, 57 insertions(+), 53 deletions(-)

diff --git a/third_party/xla/xla/hlo/ir/mesh_and_axis.cc b/third_party/xla/xla/hlo/ir/mesh_and_axis.cc
index ae638cd8d9bdc3..d0cb57442ec492 100644
--- a/third_party/xla/xla/hlo/ir/mesh_and_axis.cc
+++ b/third_party/xla/xla/hlo/ir/mesh_and_axis.cc
@@ -28,6 +28,8 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/check.h"
 #include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "llvm/ADT/STLExtras.h"
@@ -94,6 +96,28 @@ Mesh::Mesh(TileAssignment device_assignment,
   CHECK_OK(ValidateMesh());
 }
 
+std::string Mesh::ToString() const {
+  std::string mesh_str = "@mesh";
+  // Add the mesh axes names and sizes.
+  std::vector<std::string> formatted_axes_names;
+  formatted_axes_names.reserve(axes_names_.size());
+  for (int64_t i = 0; i < axes_names_.size(); ++i) {
+    formatted_axes_names.push_back(
+        absl::StrCat(axes_names_[i], "=", device_assignment_.dim(i)));
+  }
+
+  // Add the device assignment if it is not an iota case.
+  std::optional<IotaTileAssignment> iota = device_assignment_.iota();
+  std::string device_assignment_str = "";
+  if (!(iota.has_value() && iota->reshape_dims().size() == 1)) {
+    device_assignment_str =
+        absl::StrCat("(", device_assignment_.ArrayToString(), ")");
+  }
+  absl::StrAppend(&mesh_str, "<", absl::StrJoin(formatted_axes_names, ","), ">",
+                  device_assignment_str);
+  return mesh_str;
+}
+
 MeshProto Mesh::ToProto() const {
   MeshProto proto;
   int64_t num_axes = axes_names_.size();
@@ -168,26 +192,15 @@ Mesh Mesh::FromProto(const MeshProto& proto) {
   return Mesh(tile_assignment, mesh_axis_names_span);
 }
 
-absl::Status AxisRef::Validate(const Mesh& mesh) const {
-  if (mesh_axis_index_ >= mesh.axis_names().size()) {
-    return absl::InvalidArgumentError(
-        "Axis index must be less than number of axes.");
-  }
-  if (!sub_axis_info_.has_value()) {
-    return absl::OkStatus();
-  }
-
-  int64_t axis_size = mesh.axis_size(mesh_axis_index_);
-  if (axis_size % sub_axis_info_->pre_size != 0 ||
-      axis_size % sub_axis_info_->size != 0) {
-    return absl::InvalidArgumentError(
-        "Pre-size and size must divide the full axis size.");
-  }
-  if (sub_axis_info_->size >= axis_size) {
-    return absl::InvalidArgumentError(
-        "Sub-axis size must be strictly less than the full axis size.");
+std::string AxisRef::ToString(const Mesh& mesh) const {
+  CHECK_GE(mesh_axis_index_, 0);
+  CHECK_LT(mesh_axis_index_, mesh.axis_names().size());
+  std::string axis_str = mesh.axis_names()[mesh_axis_index_];
+  if (sub_axis_info_.has_value()) {
+    absl::StrAppend(&axis_str, ":(", sub_axis_info_->pre_size, ")",
+                    sub_axis_info_->size);
   }
-  return absl::OkStatus();
+  return axis_str;
 }
 
 AxisRefProto AxisRef::ToProto() const {
@@ -309,6 +322,28 @@ bool AxisRef::CanCoexistWithoutOverlap(const AxisRef& other) const {
   return max_pre_size % min_next_pre_size == 0;
 }
 
+absl::Status AxisRef::Validate(const Mesh& mesh) const {
+  if (mesh_axis_index_ >= mesh.axis_names().size()) {
+    return absl::InvalidArgumentError(
+        "Axis index must be less than number of axes.");
+  }
+  if (!sub_axis_info_.has_value()) {
+    return absl::OkStatus();
+  }
+
+  int64_t axis_size = mesh.axis_size(mesh_axis_index_);
+  if (axis_size % sub_axis_info_->pre_size != 0 ||
+      axis_size % sub_axis_info_->size != 0) {
+    return absl::InvalidArgumentError(
+        "Pre-size and size must divide the full axis size.");
+  }
+  if (sub_axis_info_->size >= axis_size) {
+    return absl::InvalidArgumentError(
+        "Sub-axis size must be strictly less than the full axis size.");
+  }
+  return absl::OkStatus();
+}
+
 int64_t AxisRef::size(const Mesh& mesh) const {
   if (sub_axis_info_.has_value()) {
     return sub_axis_info_->size;
diff --git a/third_party/xla/xla/hlo/ir/mesh_and_axis.h b/third_party/xla/xla/hlo/ir/mesh_and_axis.h
index f6190d038ff625..e83f58b7c0cb43 100644
--- a/third_party/xla/xla/hlo/ir/mesh_and_axis.h
+++ b/third_party/xla/xla/hlo/ir/mesh_and_axis.h
@@ -24,8 +24,6 @@ limitations under the License.
 #include <vector>
 
 #include "absl/log/check.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
 #include "xla/array.h"
@@ -92,32 +90,12 @@ class Mesh {
 
   bool operator!=(const Mesh& other) const { return !(*this == other); }
 
-  std::string ToString() const {
-    std::string mesh_str = "@mesh";
-    // Add the mesh axes names and sizes.
-    std::vector<std::string> formatted_axes_names;
-    formatted_axes_names.reserve(axes_names_.size());
-    for (int64_t i = 0; i < axes_names_.size(); ++i) {
-      formatted_axes_names.push_back(
-          absl::StrCat(axes_names_[i], "=", device_assignment_.dim(i)));
-    }
-
-    // Add the device assignment if it is not an iota case.
-    std::optional<IotaTileAssignment> iota = device_assignment_.iota();
-    std::string device_assignment_str = "";
-    if (!(iota.has_value() && iota->reshape_dims().size() == 1)) {
-      device_assignment_str =
-          absl::StrCat("(", device_assignment_.ArrayToString(), ")");
-    }
-    absl::StrAppend(&mesh_str, "<", absl::StrJoin(formatted_axes_names, ","),
-                    ">", device_assignment_str);
-    return mesh_str;
-  }
-
   bool DeviceAssignmentEquals(const Mesh& other) const {
     return device_assignment_ == other.device_assignment_;
   }
 
+  std::string ToString() const;
+
   MeshProto ToProto() const;
 
   static Mesh FromProto(const MeshProto& proto);
@@ -178,16 +156,7 @@ class AxisRef {
 
   bool operator!=(const xla::AxisRef& other) const { return !(*this == other); }
 
-  std::string ToString(const Mesh& mesh) const {
-    CHECK_GE(mesh_axis_index_, 0);
-    CHECK_LT(mesh_axis_index_, mesh.axis_names().size());
-    std::string axis_str = mesh.axis_names()[mesh_axis_index()];
-    if (sub_axis_info_.has_value()) {
-      absl::StrAppend(&axis_str, ":(", sub_axis_info_->pre_size, ")",
-                      sub_axis_info_->size);
-    }
-    return axis_str;
-  }
+  std::string ToString(const Mesh& mesh) const;
 
   AxisRefProto ToProto() const;
 

From f253afed706572e8e316b9c07f80d93ab87baea4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 07:16:50 -0800
Subject: [PATCH 640/753] Automated Code Change

PiperOrigin-RevId: 847412849
---
 tensorflow/core/kernels/conv_2d_gpu.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h
index 60d2e83194eefa..1afa68d87430ed 100644
--- a/tensorflow/core/kernels/conv_2d_gpu.h
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -1046,7 +1046,7 @@ template <typename T, bool conjugate>
 struct SwapDimension1And2InTensor3<GPUDevice, T, conjugate> {
   typedef GPUDevice Device;
   void operator()(const Device& d, const T* in,
-                  const gtl::ArraySlice<int64_t>& combined_dims, T* out) {
+                  const absl::Span<const int64_t>& combined_dims, T* out) {
     Dimension<3> input_dims = {static_cast<int>(combined_dims[0]),
                                static_cast<int>(combined_dims[1]),
                                static_cast<int>(combined_dims[2])};
@@ -1060,7 +1060,7 @@ template <typename T, bool conjugate>
 struct SwapDimension0And2InTensor3<GPUDevice, T, conjugate> {
   typedef GPUDevice Device;
   void operator()(const Device& d, const T* in,
-                  const gtl::ArraySlice<int64_t>& combined_dims, T* out) {
+                  const absl::Span<const int64_t>& combined_dims, T* out) {
     Dimension<3> input_dims = {static_cast<int>(combined_dims[0]),
                                static_cast<int>(combined_dims[1]),
                                static_cast<int>(combined_dims[2])};

From 630698a3af0f443cb71b5c9cf77f73a6bc48fe37 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 07:25:04 -0800
Subject: [PATCH 641/753] Automated Code Change

PiperOrigin-RevId: 847414309
---
 tensorflow/core/kernels/collective_nccl_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/collective_nccl_test.cc b/tensorflow/core/kernels/collective_nccl_test.cc
index 6244c9ffa1a40a..cd8de9525d542e 100644
--- a/tensorflow/core/kernels/collective_nccl_test.cc
+++ b/tensorflow/core/kernels/collective_nccl_test.cc
@@ -141,7 +141,7 @@ class NcclTestBase : public ::testing::Test {
       if (VLOG_IS_ON(3)) {
         string str_buf;
         for (const auto& x : expected) {
-          strings::StrAppend(&str_buf, " ", x);
+          absl::StrAppend(&str_buf, " ", x);
         }
         VLOG(3) << "Expected output " << str_buf;
       }

From f356a762f32477f212f3fe1c5c44a49b6948f083 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 07:27:16 -0800
Subject: [PATCH 642/753] Automated Code Change

PiperOrigin-RevId: 847414761
---
 tensorflow/core/kernels/conv_ops_bfloat16.cc | 10 +++---
 tensorflow/core/kernels/conv_ops_gpu.h       | 34 +++++++++++---------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops_bfloat16.cc b/tensorflow/core/kernels/conv_ops_bfloat16.cc
index 37507841647f0b..d2b9bc71b5d3a3 100644
--- a/tensorflow/core/kernels/conv_ops_bfloat16.cc
+++ b/tensorflow/core/kernels/conv_ops_bfloat16.cc
@@ -110,8 +110,8 @@ void LaunchConvOp<GPUDevice, Eigen::bfloat16>::operator()(
     Tensor* output) {
   // Get spatial dims for dilations and strides.
   int spatial_dims = input.dims() - 2;
-  gtl::InlinedVector<int64_t, 3> strides_spatial(spatial_dims);
-  gtl::InlinedVector<int64_t, 3> dilations_spatial(spatial_dims);
+  absl::InlinedVector<int64_t, 3UL> strides_spatial(spatial_dims);
+  absl::InlinedVector<int64_t, 3UL> dilations_spatial(spatial_dims);
   for (int i = 0; i < spatial_dims; ++i) {
     strides_spatial[i] =
         GetTensorDim(strides, data_format, static_cast<char>(i + '0'));
@@ -166,9 +166,9 @@ void LaunchConv2DOp<GPUDevice, Eigen::bfloat16>::operator()(
     const std::vector<int64_t>& explicit_paddings, Tensor* output,
     TensorFormat data_format) {
   // Cast strides and dilations.
-  gtl::InlinedVector<int64_t, 3> casted_strides = {row_stride, col_stride};
-  gtl::InlinedVector<int64_t, 3> casted_dilations = {row_dilation,
-                                                     col_dilation};
+  absl::InlinedVector<int64_t, 3UL> casted_strides = {row_stride, col_stride};
+  absl::InlinedVector<int64_t, 3UL> casted_dilations = {row_dilation,
+                                                        col_dilation};
 
   auto* stream = ctx->op_device_context()->stream();
   const bool cast_to_float = !IsBF16SupportedInOps(stream);
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index 8b9bd81c3d5e76..02fe25ff64aa8a 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -68,18 +68,18 @@ class DnnScratchAllocator : public se::ScratchAllocator {
   DnnScratchAllocator(int64_t memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
   int64 GetMemoryLimitInBytes() override { return memory_limit_; }
-  tsl::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+  absl::StatusOr<stream_executor::DeviceMemory<uint8>> AllocateBytes(
       int64_t byte_size) override {
     Tensor temporary_memory;
     if (byte_size < 0) {
-      return tsl::Status{absl::StatusCode::kInvalidArgument,
-                         "Requested negative byte size!"};
+      return absl::Status{absl::StatusCode::kInvalidArgument,
+                          "Requested negative byte size!"};
     }
     if (byte_size > memory_limit_) {
-      return tsl::Status{absl::StatusCode::kUnavailable,
-                         absl::StrCat("Requested memory size (", byte_size,
-                                      ") exceeds the max memory limit (",
-                                      memory_limit_, ").")};
+      return absl::Status{absl::StatusCode::kUnavailable,
+                          absl::StrCat("Requested memory size (", byte_size,
+                                       ") exceeds the max memory limit (",
+                                       memory_limit_, ").")};
     }
     AllocationAttributes allocation_attr;
     allocation_attr.retry_on_failure = false;
@@ -87,7 +87,7 @@ class DnnScratchAllocator : public se::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return tsl::Status{
+      return absl::Status{
           absl::StatusCode::kUnavailable,
           absl::StrCat("Failed to allocate the requested memory size (",
                        byte_size, ").")};
@@ -96,7 +96,7 @@ class DnnScratchAllocator : public se::ScratchAllocator {
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return tsl::StatusOr<se::DeviceMemory<uint8>>(
+    return absl::StatusOr<stream_executor::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
@@ -115,7 +115,8 @@ typedef Eigen::GpuDevice GPUDevice;
 // autotuning with a cache, or by falling back to a default if
 // 'cudnn_use_autotune' is true and cuDNN is the statically-chosen DNN backend.
 template <typename T>
-StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
+absl::StatusOr<AutotuneEntry<stream_executor::dnn::FusedConvOp>>
+AutotuneFusedConv(
     bool cudnn_use_autotune,
     AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::FusedConvOp>>*
         autotune_map,
@@ -132,7 +133,7 @@ StatusOr<AutotuneEntry<se::dnn::FusedConvOp>> AutotuneFusedConv(
     se::DeviceMemory<T> side_input_ptr, int64_t scratch_size);
 
 template <typename T>
-StatusOr<AutotuneEntry<se::dnn::ConvOp>> AutotuneUnfusedConv(
+absl::StatusOr<AutotuneEntry<stream_executor::dnn::ConvOp>> AutotuneUnfusedConv(
     bool cudnn_use_autotune,
     AutotuneMap<ConvParameters, AutotuneEntry<se::dnn::ConvOp>>* autotune_map,
     const ConvParameters& conv_parameters, OpKernelContext* ctx,
@@ -155,7 +156,7 @@ AllocateScratchOrFallback(se::ScratchAllocator* scratch_allocator,
 
   auto workspace_size = selected_runner->GetWorkspaceSize();
 
-  se::DeviceMemoryBase scratch_memory;
+  stream_executor::DeviceAddressBase scratch_memory;
   if (workspace_size > 0) {
     auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);
     if (scratch_or.ok()) {
@@ -206,9 +207,10 @@ Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
                         AllocateScratchOrFallback<se::dnn::ConvOp::Signature>(
                             scratch_allocator, primary, no_scratch_fallback));
     auto& runner = *std::get<const se::dnn::ConvRunner*>(runner_and_scratch);
-    return runner(stream, nullptr,
-                  std::get<se::DeviceMemoryBase>(runner_and_scratch), in_ptr,
-                  filter_ptr, out_ptr);
+    return runner(
+        stream, nullptr,
+        std::get<stream_executor::DeviceAddressBase>(runner_and_scratch),
+        in_ptr, filter_ptr, out_ptr);
   } else {
     auto dnn = stream->parent()->AsDnn();
     if (dnn == nullptr) {
@@ -231,7 +233,7 @@ Status LaunchAutotunedConv(const AutotuneEntry<se::dnn::ConvOp>& autotune_entry,
     std::unique_ptr<const se::dnn::ConvRunner> runner =
         std::move(runner_or).value();
 
-    se::DeviceMemoryBase scratch_memory;
+    stream_executor::DeviceAddressBase scratch_memory;
     int64_t workspace_size = runner->GetWorkspaceSize();
     if (workspace_size > 0) {
       auto scratch_or = scratch_allocator->AllocateBytes(workspace_size);

From 0cfba6c85271749a8b65a91fa9866d4a6f5cb689 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 07:27:23 -0800
Subject: [PATCH 643/753] Automated Code Change

PiperOrigin-RevId: 847414785
---
 tensorflow/core/kernels/pad_op.cc            | 20 +++++------
 tensorflow/core/kernels/queue_op.cc          |  6 ++--
 tensorflow/core/kernels/reduction_ops_sum.cc | 36 ++++++++++----------
 tensorflow/core/kernels/tensor_array_ops.cc  | 24 ++++++-------
 4 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/tensorflow/core/kernels/pad_op.cc b/tensorflow/core/kernels/pad_op.cc
index c650648147adf8..890a9954faa4a7 100644
--- a/tensorflow/core/kernels/pad_op.cc
+++ b/tensorflow/core/kernels/pad_op.cc
@@ -400,38 +400,38 @@ TF_CALL_uint8(REGISTER_GPU_KERNEL);
 // registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("Pad")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Tpaddings")
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int32_t>("Tpaddings")
                             .HostMemory("input")
                             .HostMemory("paddings")
                             .HostMemory("output"),
-                        PadOp<CPUDevice, int32, int32>);
+                        PadOp<CPUDevice, int32_t, int32_t>);
 REGISTER_KERNEL_BUILDER(Name("Pad")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32_t>("T")
                             .TypeConstraint<int64_t>("Tpaddings")
                             .HostMemory("input")
                             .HostMemory("paddings")
                             .HostMemory("output"),
-                        PadOp<CPUDevice, int32, int64>);
+                        PadOp<CPUDevice, int32_t, int64_t>);
 REGISTER_KERNEL_BUILDER(Name("PadV2")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("T")
-                            .TypeConstraint<int32>("Tpaddings")
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int32_t>("Tpaddings")
                             .HostMemory("input")
                             .HostMemory("paddings")
                             .HostMemory("constant_values")
                             .HostMemory("output"),
-                        PadOp<CPUDevice, int32, int32>);
+                        PadOp<CPUDevice, int32_t, int32_t>);
 REGISTER_KERNEL_BUILDER(Name("PadV2")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32_t>("T")
                             .TypeConstraint<int64_t>("Tpaddings")
                             .HostMemory("input")
                             .HostMemory("paddings")
                             .HostMemory("constant_values")
                             .HostMemory("output"),
-                        PadOp<CPUDevice, int32, int64>);
+                        PadOp<CPUDevice, int32_t, int64_t>);
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/queue_op.cc b/tensorflow/core/kernels/queue_op.cc
index e16c6034de4596..2f77020256080a 100644
--- a/tensorflow/core/kernels/queue_op.cc
+++ b/tensorflow/core/kernels/queue_op.cc
@@ -210,7 +210,7 @@ DequeueManyOp::DequeueManyOp(OpKernelConstruction* context)
 void DequeueManyOp::ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                                  DoneCallback callback) {
   const Tensor& Tnum_elements = ctx->input(1);
-  int32_t num_elements = Tnum_elements.flat<int32>()(0);
+  int32_t num_elements = Tnum_elements.flat<int32_t>()(0);
 
   OP_REQUIRES_ASYNC(ctx, num_elements >= 0,
                     errors::InvalidArgument("DequeueManyOp requested ",
@@ -283,7 +283,7 @@ DequeueUpToOp::DequeueUpToOp(OpKernelConstruction* context)
 void DequeueUpToOp::ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                                  DoneCallback callback) {
   const Tensor& Tnum_elements = ctx->input(1);
-  int32_t num_elements = Tnum_elements.flat<int32>()(0);
+  int32_t num_elements = Tnum_elements.flat<int32_t>()(0);
 
   OP_REQUIRES_ASYNC(ctx, num_elements >= 0,
                     errors::InvalidArgument("DequeueUpToOp requested ",
@@ -349,7 +349,7 @@ void QueueSizeOp::ComputeAsync(OpKernelContext* ctx, QueueInterface* queue,
                                DoneCallback callback) {
   Tensor* Tqueue_size = nullptr;
   OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &Tqueue_size));
-  Tqueue_size->flat<int32>().setConstant(queue->size());
+  Tqueue_size->flat<int32_t>().setConstant(queue->size());
   callback();
 }
 
diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc
index e28964905fe85c..43b921c40829ce 100644
--- a/tensorflow/core/kernels/reduction_ops_sum.cc
+++ b/tensorflow/core/kernels/reduction_ops_sum.cc
@@ -60,24 +60,24 @@ TF_CALL_COMPLEX_TYPES(REGISTER_GPU_KERNELS);
 // A special DEVICE_DEFAULT kernel for int32.
 // TODO(b/25387198): Also enable int32 in device memory. This kernel
 // registration requires all int32 inputs and outputs to be in host memory.
-REGISTER_KERNEL_BUILDER(
-    Name("Sum")
-        .Device(DEVICE_DEFAULT)
-        .TypeConstraint<int32>("T")
-        .TypeConstraint<int32>("Tidx")
-        .HostMemory("input")
-        .HostMemory("output")
-        .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, int32, int32, Eigen::internal::SumReducer<int32>>);
-REGISTER_KERNEL_BUILDER(
-    Name("Sum")
-        .Device(DEVICE_DEFAULT)
-        .TypeConstraint<int32>("T")
-        .TypeConstraint<int64_t>("Tidx")
-        .HostMemory("input")
-        .HostMemory("output")
-        .HostMemory("reduction_indices"),
-    ReductionOp<CPUDevice, int32, int64, Eigen::internal::SumReducer<int32>>);
+REGISTER_KERNEL_BUILDER(Name("Sum")
+                            .Device(DEVICE_DEFAULT)
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int32_t>("Tidx")
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .HostMemory("reduction_indices"),
+                        ReductionOp<CPUDevice, int32_t, int32_t,
+                                    Eigen::internal::SumReducer<int32_t>>);
+REGISTER_KERNEL_BUILDER(Name("Sum")
+                            .Device(DEVICE_DEFAULT)
+                            .TypeConstraint<int32_t>("T")
+                            .TypeConstraint<int64_t>("Tidx")
+                            .HostMemory("input")
+                            .HostMemory("output")
+                            .HostMemory("reduction_indices"),
+                        ReductionOp<CPUDevice, int32_t, int64_t,
+                                    Eigen::internal::SumReducer<int32_t>>);
 
 #endif
 
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index bd2956c734a1b7..7bd5f5be719565 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -836,24 +836,24 @@ TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
 REGISTER_KERNEL_BUILDER(
     Name("TensorArrayGather")
         .Device(DEVICE_GPU)
-        .TypeConstraint<int32>("dtype")
+        .TypeConstraint<int32_t>("dtype")
         .HostMemory("indices")
         .HostMemory("handle"),
-    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
+    TensorArrayPackOrGatherOp<CPUDevice, int32_t, false /* LEGACY_PACK */>);
 REGISTER_KERNEL_BUILDER(
     Name("TensorArrayGatherV2")
         .Device(DEVICE_GPU)
-        .TypeConstraint<int32>("dtype")
+        .TypeConstraint<int32_t>("dtype")
         .HostMemory("indices")
         .HostMemory("handle"),
-    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
+    TensorArrayPackOrGatherOp<CPUDevice, int32_t, false /* LEGACY_PACK */>);
 REGISTER_KERNEL_BUILDER(
     Name("TensorArrayGatherV3")
         .Device(DEVICE_GPU)
-        .TypeConstraint<int32>("dtype")
+        .TypeConstraint<int32_t>("dtype")
         .HostMemory("indices")
         .HostMemory("handle"),
-    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
+    TensorArrayPackOrGatherOp<CPUDevice, int32_t, false /* LEGACY_PACK */>);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
@@ -1050,22 +1050,22 @@ TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
 // registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("dtype")
+                            .TypeConstraint<int32_t>("dtype")
                             .HostMemory("lengths")
                             .HostMemory("handle"),
-                        TensorArrayConcatOp<CPUDevice, int32>);
+                        TensorArrayConcatOp<CPUDevice, int32_t>);
 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("dtype")
+                            .TypeConstraint<int32_t>("dtype")
                             .HostMemory("lengths")
                             .HostMemory("handle"),
-                        TensorArrayConcatOp<CPUDevice, int32>);
+                        TensorArrayConcatOp<CPUDevice, int32_t>);
 REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("dtype")
+                            .TypeConstraint<int32_t>("dtype")
                             .HostMemory("lengths")
                             .HostMemory("handle"),
-                        TensorArrayConcatOp<CPUDevice, int32>);
+                        TensorArrayConcatOp<CPUDevice, int32_t>);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 

From ce61030c67c3772a17d78bc01b53e0d0b881804f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 13:16:18 -0800
Subject: [PATCH 644/753] Automated Code Change

PiperOrigin-RevId: 847480750
---
 third_party/xla/xla/python/ifrt_proxy/client/client.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt_proxy/client/client.cc b/third_party/xla/xla/python/ifrt_proxy/client/client.cc
index e902620da7dd7c..65380233637223 100644
--- a/third_party/xla/xla/python/ifrt_proxy/client/client.cc
+++ b/third_party/xla/xla/python/ifrt_proxy/client/client.cc
@@ -20,7 +20,6 @@
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"

From 7733c4c03da55a124cd0c2f652e33f606d8fed1a Mon Sep 17 00:00:00 2001
From: Junwhan Ahn <junwhan@google.com>
Date: Sun, 21 Dec 2025 17:25:22 -0800
Subject: [PATCH 645/753] Use `StartDetachedThread` instead of `SchedClosure`
 to dispatch atom program compilation

PiperOrigin-RevId: 847528854
---
 .../ifrt/ir/transforms/multi_threaded_atom_program_compiler.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/transforms/multi_threaded_atom_program_compiler.cc b/third_party/xla/xla/python/ifrt/ir/transforms/multi_threaded_atom_program_compiler.cc
index 24785ec7e59522..ee938534b809b0 100644
--- a/third_party/xla/xla/python/ifrt/ir/transforms/multi_threaded_atom_program_compiler.cc
+++ b/third_party/xla/xla/python/ifrt/ir/transforms/multi_threaded_atom_program_compiler.cc
@@ -180,7 +180,8 @@ absl::StatusOr<CompileFuture> MultiThreadedAtomProgramCompiler::CompileXla(
       /*context=*/nullptr,  // Shares the same long-living context.
       mlir::OwningOpRef<mlir::ModuleOp>(module_op.clone()));
   auto [promise, future] = CompileFuture::MakePromise();
-  tsl::Env::Default()->SchedClosure(
+  tsl::Env::Default()->StartDetachedThread(
+      tsl::ThreadOptions(), /*name=*/"MultiThreadedAtomProgramCompiler",
       WithCurrentUserContext([this, hlo_program = std::move(hlo_program),
                               compile_options = std::move(compile_options),
                               promise = std::move(promise)]() mutable {

From e3e3bc194639d51d7c6f17594b059b3d812a5255 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 18:04:49 -0800
Subject: [PATCH 646/753] Reverts c549ee47f87dc9083b4891b7f6dafa2063ddf12a

PiperOrigin-RevId: 847535506
---
 .../xla/xla/backends/gpu/runtime/BUILD        |   1 -
 .../gpu/runtime/ragged_all_to_all_thunk.cc    | 116 +++++++++++-------
 .../gpu/runtime/ragged_all_to_all_thunk.h     |  18 ---
 3 files changed, 70 insertions(+), 65 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index de324ce85f41f5..5b67961fdae183 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1521,7 +1521,6 @@ cc_library(
         "//xla/stream_executor:memory_allocation",
         "//xla/stream_executor:stream",
         "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
index f89a39e29bc3e9..f6d65a9a110d13 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
@@ -59,7 +59,6 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
-#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -216,75 +215,99 @@ absl::Status RunRaggedAllToAll(
   return future.Await();
 }
 
-}  // namespace
+// Contains the values that are passed between host threads with rendezvous.
+struct RendezvousValue {
+  RankId rank;
+  se::DeviceAddressBase output_buffer;
+  se::Event* start_event;
+  se::Event* end_event;
 
-absl::StatusOr<
-    std::shared_ptr<std::vector<const RaggedAllToAllStartThunk::StreamState*>>>
-RaggedAllToAllStartThunk::RendezvousBeforeKernelStart(
-    const GpuCliqueKey& clique_key, se::Stream& stream,
-    const StreamState& state) {
-  int64_t num_ranks = clique_key.num_local_participants();
+  bool operator<(const RendezvousValue& other) const {
+    return rank < other.rank;
+  }
+};
+
+// Executes the rendezvous before the kernel start.
+// Inserts CUDA events into the stream to ensure that all devices have reached
+// the start event before the kernel starts.
+absl::StatusOr<std::shared_ptr<std::vector<RendezvousValue>>>
+RendezvousBeforeKernelStart(absl::string_view name,
+                            const GpuCliqueKey& clique_key, RankId rank,
+                            int64_t num_ranks,
+                            const se::DeviceAddressBase& output_buffer,
+                            se::Stream& stream, se::Event* start_event,
+                            se::Event* end_event) {
+  RendezvousValue rendezvous_value;
+  rendezvous_value.rank = rank;
+  rendezvous_value.output_buffer = output_buffer;
+  rendezvous_value.start_event = start_event;
+  rendezvous_value.end_event = end_event;
 
   // Record that this device has started the memcpy ragged-all-to-all. We do
   // this before the rendezvous to make sure that RecordEvent is called before
   // WaitFor on another stream.
-  RETURN_IF_ERROR(stream.RecordEvent(state.start_event.get()));
+  TF_RETURN_IF_ERROR(stream.RecordEvent(start_event));
 
-  auto rendezvous_fn = [](absl::Span<const StreamState* const> values) {
-    std::vector<const StreamState*> values_copy;
+  auto rendezvous_fn = [](absl::Span<const RendezvousValue* const> values) {
+    std::vector<RendezvousValue> values_copy;
     for (const auto& value : values) {
-      values_copy.push_back(value);
+      values_copy.push_back(*value);
     }
     // Sort to make sure that values are in the same order as the devices are
     // ordered in the communicator.
-    absl::c_sort(values_copy, [](const StreamState* a, const StreamState* b) {
-      return a->rank < b->rank;
-    });
+    absl::c_sort(values_copy);
     return values_copy;
   };
 
   std::string start_rendezvous_key =
-      absl::StrFormat("start one-shot ragged-all-to-all for rank %d, clique %s",
-                      state.rank.value(), clique_key.ToString());
-  ASSIGN_OR_RETURN(
-      std::shared_ptr<std::vector<const StreamState*>> rendezvous_values,
-      Rendezvous<std::vector<const StreamState*>>(
-          start_rendezvous_key, clique_key, state, num_ranks, rendezvous_fn));
+      absl::StrFormat("start %s ragged-all-to-all for rank %d, clique %s", name,
+                      rank.value(), clique_key.ToString());
+  TF_ASSIGN_OR_RETURN(
+      std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
+      Rendezvous<std::vector<RendezvousValue>>(
+          /*name=*/
+          start_rendezvous_key, /*key=*/clique_key,
+          /*value=*/rendezvous_value, /*num_threads=*/num_ranks,
+          rendezvous_fn));
 
   // Wait for all devices to reach the start event. This indicates that all
   // output buffers are ready for transfer.
-  for (const StreamState* remote_stream_state : *rendezvous_values) {
-    RETURN_IF_ERROR(stream.WaitFor(remote_stream_state->start_event.get()));
+  for (auto& value : *rendezvous_values) {
+    TF_RETURN_IF_ERROR(stream.WaitFor(value.start_event));
   }
 
   return rendezvous_values;
 }
 
-absl::Status RaggedAllToAllStartThunk::RendezvousAfterKernelFinish(
-    const GpuCliqueKey& clique_key, se::Stream& stream,
-    const StreamState& state,
-    absl::Span<const StreamState* const> remote_stream_states) {
-  int64_t num_ranks = clique_key.num_local_participants();
-
+// Executes the rendezvous after the kernel finish. Waits for all devices to
+// reach the end event.
+absl::Status RendezvousAfterKernelFinish(
+    absl::string_view name, const GpuCliqueKey& clique_key, RankId rank,
+    int64_t num_ranks, se::Stream& stream, se::Event* end_event,
+    const std::shared_ptr<std::vector<RendezvousValue>>& rendezvous_values) {
   // Record that this device has finished the memcpy ragged-all-to-all.
-  RETURN_IF_ERROR(stream.RecordEvent(state.end_event.get()));
+  TF_RETURN_IF_ERROR(stream.RecordEvent(end_event));
 
   // Do another rendezvous to make sure that we call RecordEvent for end_event
   // before WaitFor on another stream.
-  std::string finish_rendezvous_key = absl::StrFormat(
-      "finish one-shot ragged-all-to-all for rank %d, clique %s",
-      state.rank.value(), clique_key.ToString());
-  RETURN_IF_ERROR(Rendezvous(finish_rendezvous_key, clique_key, num_ranks));
+  std::string finish_rendezvous_key =
+      absl::StrFormat("finish %s ragged-all-to-all for rank %d, clique %s",
+                      name, rank.value(), clique_key.ToString());
+  TF_RETURN_IF_ERROR(Rendezvous(/*name=*/finish_rendezvous_key,
+                                /*key=*/clique_key,
+                                /*num_threads=*/num_ranks));
 
   // Wait for all devices to reach the end event. This indicates that all
   // updates from other devices have arrived.
-  for (const StreamState* remote_stream_state : remote_stream_states) {
-    RETURN_IF_ERROR(stream.WaitFor(remote_stream_state->end_event.get()));
+  for (auto& value : *rendezvous_values) {
+    TF_RETURN_IF_ERROR(stream.WaitFor(value.end_event));
   }
 
   return absl::OkStatus();
 }
 
+}  // namespace
+
 absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
     const GpuCliqueKey& clique_key, se::Stream& stream,
     const StreamState& state, absl::Span<DeviceBufferPair const> buffers) {
@@ -299,16 +322,19 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
   PrimitiveType element_type = buffers[0].element_type;
 
   se::DeviceAddressBase input_buffer = buffers[0].source_buffer;
+  se::DeviceAddressBase output_buffer = buffers[1].destination_buffer;
 
   TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<std::vector<const StreamState*>> remote_stream_states,
-      RendezvousBeforeKernelStart(clique_key, stream, state));
+      std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
+      RendezvousBeforeKernelStart(
+          /*name=*/"one-shot", clique_key, rank, num_ranks, output_buffer,
+          stream, state.start_event.get(), state.end_event.get()));
 
   const int64_t num_updates_per_replica = config_.num_total_updates / num_ranks;
 
   absl::InlinedVector<se::DeviceAddressBase, 4> output_ptrs;
-  for (const StreamState* remote_stream_state : *remote_stream_states) {
-    output_ptrs.push_back(remote_stream_state->local_output_buffer);
+  for (auto& value : *rendezvous_values) {
+    output_ptrs.push_back(value.output_buffer);
   }
 
   TF_RETURN_IF_ERROR(RunRaggedAllToAllKernel(
@@ -317,8 +343,9 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
       buffers[4].source_buffer, num_ranks, num_updates_per_replica,
       config_.num_input_rows, config_.num_row_elements));
 
-  return RendezvousAfterKernelFinish(clique_key, stream, state,
-                                     *remote_stream_states);
+  return RendezvousAfterKernelFinish(
+      /*name=*/"one-shot", clique_key, rank, num_ranks, stream,
+      state.end_event.get(), rendezvous_values);
 }
 
 RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
@@ -413,9 +440,6 @@ absl::Status RaggedAllToAllStartThunk::Initialize(
     return absl::InternalError("Failed to allocate output offsets buffer.");
   }
 
-  state->local_output_buffer = params.buffer_allocations->GetDeviceAddress(
-      buffers_[1].destination_buffer);
-
   if (is_local()) {
     TF_ASSIGN_OR_RETURN(state->start_event, executor->CreateEvent());
     TF_ASSIGN_OR_RETURN(state->end_event, executor->CreateEvent());
@@ -469,7 +493,7 @@ absl::StatusOr<bool> RaggedAllToAllStartThunk::RunCollective(
                                       device_buffers[0].element_type);
 
   if (should_use_one_shot_kernel) {
-    RETURN_IF_ERROR(
+    TF_RETURN_IF_ERROR(
         RunOneShotRaggedAllToAll(clique_key, stream, *state, device_buffers));
     return false;
   }
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
index b3354b814fda60..6a48a5fac956b0 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
@@ -32,7 +32,6 @@ limitations under the License.
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instructions.h"
-#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_handle.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/memory_allocation.h"
@@ -92,9 +91,6 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
     // Device memory buffer for output offsets.
     se::DeviceAddressHandle output_offsets_device_buffer;
 
-    // Device memory buffer for local output.
-    se::DeviceAddressBase local_output_buffer;
-
     // Event to synchronize streams on different devices at the start of the
     // kernel.
     std::unique_ptr<se::Event> start_event;
@@ -107,20 +103,6 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
         : device_ordinal(device_ordinal), rank(rank) {}
   };
 
-  // Executes the rendezvous before the kernel start.
-  // Inserts CUDA events into the stream to ensure that all devices have reached
-  // the start event before the kernel starts.
-  absl::StatusOr<std::shared_ptr<std::vector<const StreamState*>>>
-  RendezvousBeforeKernelStart(const GpuCliqueKey& clique_key,
-                              se::Stream& stream, const StreamState& state);
-
-  // Executes the rendezvous after the kernel finish. Waits for all devices to
-  // reach the end event.
-  absl::Status RendezvousAfterKernelFinish(
-      const GpuCliqueKey& clique_key, se::Stream& stream,
-      const StreamState& state,
-      absl::Span<const StreamState* const> remote_stream_states);
-
   absl::Status RunOneShotRaggedAllToAll(
       const GpuCliqueKey& clique_key, se::Stream& stream,
       const StreamState& state, absl::Span<DeviceBufferPair const> buffers);

From f4e53263b14f1f9a1ccd29d32c02bb9266b850eb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 23:16:33 -0800
Subject: [PATCH 647/753] Automated Code Change

PiperOrigin-RevId: 847622680
---
 .../xla/xla/stream_executor/tpu/tpu_on_demand_compiler.cc       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/tpu/tpu_on_demand_compiler.cc b/third_party/xla/xla/stream_executor/tpu/tpu_on_demand_compiler.cc
index d63966f02427e2..bc491734f969b6 100644
--- a/third_party/xla/xla/stream_executor/tpu/tpu_on_demand_compiler.cc
+++ b/third_party/xla/xla/stream_executor/tpu/tpu_on_demand_compiler.cc
@@ -179,7 +179,7 @@ class TpuCompiler : public Compiler {
 
   // Compiles the HLO module group for ahead-of-time execution.  This is
   // intended for use in static compilation.
-  absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>
   CompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
                      const AotCompilationOptions& options) override {
     return Unimplemented("This compiler does not support CompileAheadOfTime.");

From 2b621d61f93793baa71eedb90655a57af50d4873 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 23:17:07 -0800
Subject: [PATCH 648/753] Automated Code Change

PiperOrigin-RevId: 847622876
---
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
index 6b4d08c8553341..dd31e129319c43 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -188,7 +188,7 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
   const std::string name = hlo_module->name();
   const std::string fingerprint = hlo_module->GetFingerprint128();
   TF_ASSIGN_OR_RETURN(
-      std::vector<std::unique_ptr<AotCompilationResult>> aot_results,
+      std::vector<std::unique_ptr<CompiledModule>> aot_results,
       gpu_compiler->CompileAheadOfTime(std::move(hlo_module), aot_options));
   return std::make_unique<StreamExecutorExecutable>(
       std::move(input_options), std::move(aot_results), num_replicas,

From 6165d577f980ef2ee2845d35aff87be706b7ea3b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 23:17:28 -0800
Subject: [PATCH 649/753] Automated Code Change

PiperOrigin-RevId: 847622964
---
 third_party/xla/xla/client/compile_only_client.cc | 2 +-
 third_party/xla/xla/client/compile_only_client.h  | 2 +-
 third_party/xla/xla/client/local_client.h         | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/client/compile_only_client.cc b/third_party/xla/xla/client/compile_only_client.cc
index 3a9fe305a04948..665607f524b2f1 100644
--- a/third_party/xla/xla/client/compile_only_client.cc
+++ b/third_party/xla/xla/client/compile_only_client.cc
@@ -44,7 +44,7 @@ CompileOnlyClient::CreateModuleConfig(
                                                execution_options);
 }
 
-absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>
 CompileOnlyClient::CompileAheadOfTime(
     const AotXlaComputationInstance& computation,
     const AotCompilationOptions& options,
diff --git a/third_party/xla/xla/client/compile_only_client.h b/third_party/xla/xla/client/compile_only_client.h
index 53d17b87795a4f..87baff417d7a04 100644
--- a/third_party/xla/xla/client/compile_only_client.h
+++ b/third_party/xla/xla/client/compile_only_client.h
@@ -59,7 +59,7 @@ class CompileOnlyClient : public Client {
   // This is intended for use in static compilation. The |options|
   // parameter describes the target for which the compiler should emit
   // code. |metadata|, if provided, is populated during compilation.
-  absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>
   CompileAheadOfTime(
       const AotXlaComputationInstance& computation,
       const AotCompilationOptions& options,
diff --git a/third_party/xla/xla/client/local_client.h b/third_party/xla/xla/client/local_client.h
index 4429dc84664f6b..364852b956af56 100644
--- a/third_party/xla/xla/client/local_client.h
+++ b/third_party/xla/xla/client/local_client.h
@@ -161,7 +161,7 @@ class LocalClient : public Client {
   // Same as Compile() above, but return AotCompilationResult objects (instead
   // of LocalExecutable objects), which can be persisted to later load
   // LocalExecutable(s) using the Load() method below.
-  absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>
   CompileAheadOfTime(const XlaComputation& computation,
                      absl::Span<const Shape* const> argument_layouts,
                      const ExecutableBuildOptions& options);
@@ -174,7 +174,7 @@ class LocalClient : public Client {
 
   // Variant of `Load()` that accepts an AotCompilationResult.
   absl::StatusOr<std::unique_ptr<LocalExecutable>> Load(
-      std::unique_ptr<xla::AotCompilationResult> aot_result,
+      std::unique_ptr<CompiledModule> aot_result,
       const ExecutableBuildOptions& options);
 
   // Copy the literal data to the device with the given ordinal and return as a
@@ -249,7 +249,7 @@ class LocalClient : public Client {
   LocalService* local_service_;
 
   absl::StatusOr<std::unique_ptr<LocalExecutable>> LoadInternal(
-      std::unique_ptr<xla::AotCompilationResult> aot_result,
+      std::unique_ptr<CompiledModule> aot_result,
       const ExecutableBuildOptions& options);
 };
 

From 64337a1e3ca723020ec0c70f5cf58fc180ae4988 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 23:25:05 -0800
Subject: [PATCH 650/753] Automated Code Change

PiperOrigin-RevId: 847624939
---
 .../xla/xla/service/cpu/tests/cpu_aot_export_test.cc       | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc b/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc
index e69757ba09d095..d544c86159d194 100644
--- a/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc
+++ b/third_party/xla/xla/service/cpu/tests/cpu_aot_export_test.cc
@@ -53,15 +53,14 @@ class CpuAotCompilationTest : public HloTestBase {
         std::vector<std::unique_ptr<Executable>> executables,
         compiler->Compile(std::move(module), {stream_exec}, nullptr));
 
-    TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<AotCompilationResult> exported_aot_result,
-        compiler->Export(executables[0].get()));
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<CompiledModule> exported_aot_result,
+                            compiler->Export(executables[0].get()));
 
     // Serialize-deserialize AOT compilation result.
     TF_ASSERT_OK_AND_ASSIGN(std::string serialized_aot_result,
                             exported_aot_result->SerializeAsString());
     TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<AotCompilationResult> loaded_aot_result,
+        std::unique_ptr<CompiledModule> loaded_aot_result,
         compiler->LoadAotCompilationResult(serialized_aot_result));
 
     // Load Executable from AOT compilation result.

From 7b0d71c54e5dfdcf96c40e17fd6aba9a5ab9aa98 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 23:26:00 -0800
Subject: [PATCH 651/753] Automated Code Change

PiperOrigin-RevId: 847625245
---
 .../cpu/vectorized_reduce_with_no_vector_registers_test.cc      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc b/third_party/xla/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
index 23a21b04905d92..facd5747f004a9 100644
--- a/third_party/xla/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
+++ b/third_party/xla/xla/service/cpu/vectorized_reduce_with_no_vector_registers_test.cc
@@ -119,7 +119,7 @@ ENTRY main {
       cpu::CpuAotCompilationOptions::RelocationModel::BigPic);
 
   TF_ASSERT_OK_AND_ASSIGN(
-      std::vector<std::unique_ptr<AotCompilationResult>> aot_compilation_result,
+      std::vector<std::unique_ptr<CompiledModule>> aot_compilation_result,
       cpu_compiler.CompileAheadOfTime(std::move(hlo_module),
                                       aot_compilation_options));
   EXPECT_EQ(aot_compilation_result.size(), 1);

From b1d25385411365db38891bd8bb0fb42fdf02c994 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Sun, 21 Dec 2025 23:34:56 -0800
Subject: [PATCH 652/753] [Autotuner] Initialize random input values for buffer
 checks. If values are initialized to 0 buffer checker will fail to detect
 backends with wrong results.

PiperOrigin-RevId: 847627821
---
 .../xla/xla/service/gpu/autotuning/autotuner_pass.cc     | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
index 714f6ef7fe1c68..d053517780f951 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
@@ -80,10 +80,12 @@ AutotuneConfig GetAutotuneConfig(const DebugOptions& debug_options,
   return autotune_config;
 }
 
-ProfileOptions GetProfileOptions(const DebugOptions& debug_options) {
+ProfileOptions GetProfileOptions(const DebugOptions& debug_options,
+                                 const AutotuneConfig& autotune_config) {
   ProfileOptions profile_options;
   profile_options.redzone_padding_bytes =
       debug_options.xla_gpu_redzone_padding_bytes();
+  profile_options.should_init_buffers = autotune_config.check_buffers;
   return profile_options;
 }
 
@@ -103,8 +105,9 @@ absl::StatusOr<std::unique_ptr<AutotunerPass>> AutotunerPass::Create(
       GetAutotuneConfig(debug_options, is_deviceless, optimize_scratch_bytes);
 
   if (!is_deviceless) {
-    profiler = GpuProfiler::Create(stream_executor,
-                                   GetProfileOptions(debug_options), allocator);
+    profiler = GpuProfiler::Create(
+        stream_executor, GetProfileOptions(debug_options, autotune_config),
+        allocator);
   }
 
   std::unique_ptr<AutotunerCacheInterface> cache =

From bb8c750b2f2dca46272871181740966f092b42ee Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 21 Dec 2025 23:37:50 -0800
Subject: [PATCH 653/753] Automated Code Change

PiperOrigin-RevId: 847628658
---
 tensorflow/compiler/jit/xla_device_compiler_client.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/jit/xla_device_compiler_client.cc b/tensorflow/compiler/jit/xla_device_compiler_client.cc
index 71be1f7ec6b25d..ff565042347ae1 100644
--- a/tensorflow/compiler/jit/xla_device_compiler_client.cc
+++ b/tensorflow/compiler/jit/xla_device_compiler_client.cc
@@ -81,7 +81,7 @@ absl::StatusOr<std::string> XlaDeviceCompilerClient::BuildSerializedExecutable(
   xla::ExecutableBuildOptions build_options = GetExecutableBuildOptions(
       options, result, client_->default_device_ordinal());
   TF_ASSIGN_OR_RETURN(
-      std::vector<std::unique_ptr<xla::AotCompilationResult>> aot_results,
+      std::vector<std::unique_ptr<xla::CompiledModule>> aot_results,
       client_->CompileAheadOfTime(*result.computation, argument_layouts,
                                   build_options));
   TF_RET_CHECK(aot_results.size() == 1);

From 6e5d62bf3e76e46f8eed6380e7eed8af5333e555 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Sun, 21 Dec 2025 23:54:13 -0800
Subject: [PATCH 654/753] Increase shards for fusion_emitter_device_test to
 speed up the test.

PiperOrigin-RevId: 847632914
---
 third_party/xla/xla/backends/gpu/codegen/triton/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index 81cdd32773c6e8..ce2fae5553f1e1 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -699,7 +699,7 @@ xla_test(
         "b200",
         "amdgpu_any",
     ],
-    shard_count = 5,
+    shard_count = 10,
     tags = [
         "no_mac",
     ],

From 3c7c52e73027a32da5aabe94b54afb9b5e107076 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 00:17:58 -0800
Subject: [PATCH 655/753] Automated Code Change

PiperOrigin-RevId: 847641457
---
 .../xla/backends/gpu/autotuner/native_emitter_test.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc b/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc
index bedf1bb18be870..2a5e1742bbe6ff 100644
--- a/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc
+++ b/third_party/xla/xla/backends/gpu/autotuner/native_emitter_test.cc
@@ -203,12 +203,11 @@ class MockCompiler : public Compiler {
                std::vector<se::StreamExecutor*> stream_execs,
                const CompileOptions& options),
               (override));
-  MOCK_METHOD(
-      absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>,
-      CompileAheadOfTime,
-      (std::unique_ptr<HloModule> hlo_module,
-       const AotCompilationOptions& options),
-      (override));
+  MOCK_METHOD(absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>,
+              CompileAheadOfTime,
+              (std::unique_ptr<HloModule> hlo_module,
+               const AotCompilationOptions& options),
+              (override));
   MOCK_METHOD(HloCostAnalysis::ShapeSizeFunction, ShapeSizeBytesFunction, (),
               (const, override));
 };

From 53c2f78993469ed2b9d2e588077667c51df520b6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 00:24:57 -0800
Subject: [PATCH 656/753] Automated Code Change

PiperOrigin-RevId: 847643376
---
 third_party/xla/xla/backends/cpu/nanort/nanort_client.cc | 2 +-
 third_party/xla/xla/backends/cpu/nanort/nanort_client.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/nanort/nanort_client.cc b/third_party/xla/xla/backends/cpu/nanort/nanort_client.cc
index bec4d24483521b..cad17e33cfafa1 100644
--- a/third_party/xla/xla/backends/cpu/nanort/nanort_client.cc
+++ b/third_party/xla/xla/backends/cpu/nanort/nanort_client.cc
@@ -87,7 +87,7 @@ absl::StatusOr<std::unique_ptr<NanoRtExecutable>> NanoRtClient::Compile(
                                   optimized_hlo_program_shape);
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>> NanoRtClient::Export(
+absl::StatusOr<std::unique_ptr<CompiledModule>> NanoRtClient::Export(
     NanoRtExecutable* executable) {
   cpu::CpuCompiler compiler;
   return compiler.Export(executable->executable());
diff --git a/third_party/xla/xla/backends/cpu/nanort/nanort_client.h b/third_party/xla/xla/backends/cpu/nanort/nanort_client.h
index cad9c584a0fba4..54b890360b3e7d 100644
--- a/third_party/xla/xla/backends/cpu/nanort/nanort_client.h
+++ b/third_party/xla/xla/backends/cpu/nanort/nanort_client.h
@@ -39,7 +39,7 @@ class NanoRtClient {
           ExecutableBuildOptions());
 
   // Exports the given NanoRtExecutable to an AotCompilationResult.
-  absl::StatusOr<std::unique_ptr<AotCompilationResult>> Export(
+  absl::StatusOr<std::unique_ptr<CompiledModule>> Export(
       NanoRtExecutable* executable);
 };
 

From 2e5b1e44fc2f25c1716bf0f9acc0f628b6a847f6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 00:27:24 -0800
Subject: [PATCH 657/753] Automated Code Change

PiperOrigin-RevId: 847644164
---
 third_party/xla/xla/tests/codegen_test_base.cc | 4 ++--
 third_party/xla/xla/tests/codegen_test_base.h  | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/third_party/xla/xla/tests/codegen_test_base.cc b/third_party/xla/xla/tests/codegen_test_base.cc
index c4a56308074b75..ac768913fc01ce 100644
--- a/third_party/xla/xla/tests/codegen_test_base.cc
+++ b/third_party/xla/xla/tests/codegen_test_base.cc
@@ -33,12 +33,12 @@ CodegenTestBase::CompileToExecutable(std::unique_ptr<HloModule> hlo_module,
                                           /*device_allocator=*/nullptr);
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+absl::StatusOr<std::unique_ptr<CompiledModule>>
 CodegenTestBase::CompileToAotCompilationResult(
     std::unique_ptr<HloModule> hlo_module,
     const AotCompilationOptions& options) {
   TF_ASSIGN_OR_RETURN(
-      std::vector<std::unique_ptr<AotCompilationResult>> results,
+      std::vector<std::unique_ptr<CompiledModule>> results,
       backend().compiler()->CompileAheadOfTime(std::move(hlo_module), options));
   return std::move(results.front());
 }
diff --git a/third_party/xla/xla/tests/codegen_test_base.h b/third_party/xla/xla/tests/codegen_test_base.h
index d4f4ff7d5ad441..fffe0863c4ae79 100644
--- a/third_party/xla/xla/tests/codegen_test_base.h
+++ b/third_party/xla/xla/tests/codegen_test_base.h
@@ -34,9 +34,9 @@ class CodegenTestBase : public HloTestBaseWithMLIRContext {
       bool run_optimization_passes = true);
 
   // Compiles hlo_module with the AOT compiler.
-  absl::StatusOr<std::unique_ptr<AotCompilationResult>>
-  CompileToAotCompilationResult(std::unique_ptr<HloModule> hlo_module,
-                                const AotCompilationOptions& options);
+  absl::StatusOr<std::unique_ptr<CompiledModule>> CompileToAotCompilationResult(
+      std::unique_ptr<HloModule> hlo_module,
+      const AotCompilationOptions& options);
 };
 
 }  // namespace xla

From ec8a966f0d42855d2b061c313c1e28d173759ae6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 00:27:53 -0800
Subject: [PATCH 658/753] Automated Code Change

PiperOrigin-RevId: 847644299
---
 .../xla/xla/service/cpu/cpu_aot_compilation_result.h  |  2 +-
 .../xla/xla/service/cpu/cpu_aot_compiler_test.cc      |  4 ++--
 third_party/xla/xla/service/cpu/cpu_aot_loader.cc     |  6 +++---
 third_party/xla/xla/service/cpu/cpu_aot_loader.h      |  6 +++---
 third_party/xla/xla/service/cpu/cpu_compiler.cc       | 10 +++++-----
 third_party/xla/xla/service/cpu/cpu_compiler.h        | 11 +++++------
 6 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
index 0a2b8cf2a17378..e61d8fe61c3f28 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compilation_result.h
@@ -100,7 +100,7 @@ class CpuAotCompilationOptions : public AotCompilationOptions {
 };
 
 // This class represents the result of a CPU AOT compilation.
-class CpuAotCompilationResult : public AotCompilationResult {
+class CpuAotCompilationResult : public CompiledModule {
  public:
   static absl::StatusOr<std::unique_ptr<CpuAotCompilationResult>> Create(
       const HloModule* hlo_module, const BufferAssignment* buffer_assignment,
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc b/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc
index 114829539d4e8e..d295e230fd081a 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc
+++ b/third_party/xla/xla/service/cpu/cpu_aot_compiler_test.cc
@@ -101,13 +101,13 @@ ENTRY e {
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                             ParseAndReturnVerifiedModule(hlo));
     TF_ASSERT_OK_AND_ASSIGN(
-        std::vector<std::unique_ptr<AotCompilationResult>> aot_results,
+        std::vector<std::unique_ptr<CompiledModule>> aot_results,
         compiler->CompileAheadOfTime(std::move(module), *aot_options));
 
     TF_ASSERT_OK_AND_ASSIGN(std::string serialized_aot_result,
                             aot_results[0]->SerializeAsString());
     TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<AotCompilationResult> aot_result,
+        std::unique_ptr<CompiledModule> aot_result,
         compiler->LoadAotCompilationResult(serialized_aot_result));
 
     TF_ASSERT_OK_AND_ASSIGN(
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_loader.cc b/third_party/xla/xla/service/cpu/cpu_aot_loader.cc
index 9175ab43b33216..243771e4617563 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_loader.cc
+++ b/third_party/xla/xla/service/cpu/cpu_aot_loader.cc
@@ -147,11 +147,11 @@ absl::StatusOr<std::unique_ptr<Executable>> CpuAotLoader::LoadExecutable(
 }
 
 absl::StatusOr<std::unique_ptr<Executable>> CpuAotLoader::LoadExecutable(
-    xla::AotCompilationResult&& compilation_result) {
+    CompiledModule&& compilation_result) {
   return std::move(compilation_result).LoadExecutable(/*executor=*/nullptr);
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+absl::StatusOr<std::unique_ptr<CompiledModule>>
 CpuAotLoader::LoadAotCompilationResult(
     const std::string& serialized_aot_result) {
   xla::cpu::CompilationResultProto proto;
@@ -161,7 +161,7 @@ CpuAotLoader::LoadAotCompilationResult(
   return LoadAotCompilationResult(proto);
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+absl::StatusOr<std::unique_ptr<CompiledModule>>
 CpuAotLoader::LoadAotCompilationResult(
     const xla::cpu::CompilationResultProto& aot_result_proto) {
   TF_ASSIGN_OR_RETURN(
diff --git a/third_party/xla/xla/service/cpu/cpu_aot_loader.h b/third_party/xla/xla/service/cpu/cpu_aot_loader.h
index 1f8e8def43db40..fa0aad5bbfb0f7 100644
--- a/third_party/xla/xla/service/cpu/cpu_aot_loader.h
+++ b/third_party/xla/xla/service/cpu/cpu_aot_loader.h
@@ -53,12 +53,12 @@ class CpuAotLoader {
       const xla::cpu::CompilationResultProto& aot_result_proto);
 
   static absl::StatusOr<std::unique_ptr<Executable>> LoadExecutable(
-      xla::AotCompilationResult&& compilation_result);
+      CompiledModule&& compilation_result);
 
-  static absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+  static absl::StatusOr<std::unique_ptr<CompiledModule>>
   LoadAotCompilationResult(const std::string& serialized_aot_result);
 
-  static absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+  static absl::StatusOr<std::unique_ptr<CompiledModule>>
   LoadAotCompilationResult(
       const xla::cpu::CompilationResultProto& aot_result_proto);
 };
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.cc b/third_party/xla/xla/service/cpu/cpu_compiler.cc
index 8675446af52fc6..295b74e1acc7fb 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.cc
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.cc
@@ -2091,7 +2091,7 @@ absl::StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   return std::unique_ptr<Executable>(std::move(cpu_executable));
 }
 
-absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>
 CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
                                 const AotCompilationOptions& aot_options) {
   auto llvm_options = llvm_ir::ExtractXlaBackendExtraOptions(
@@ -2155,7 +2155,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
   std::unique_ptr<llvm::TargetMachine> target_machine =
       target_machine_builder();
 
-  std::vector<std::unique_ptr<AotCompilationResult>> results;
+  std::vector<std::unique_ptr<CompiledModule>> results;
   VLOG(1) << "Compiling ahead-of-time: " << hlo_module->name();
   if (hlo_module->has_schedule()) {
     return results;
@@ -2174,7 +2174,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
   return std::move(results);
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+absl::StatusOr<std::unique_ptr<CompiledModule>>
 CpuCompiler::CompileAheadOfTimeThunks(
     std::unique_ptr<HloModule> module,
     IrCompiler::TargetMachineBuilder target_machine_builder,
@@ -2254,7 +2254,7 @@ HloCostAnalysis::ShapeSizeFunction CpuCompiler::ShapeSizeBytesFunction() const {
   return CpuExecutable::ShapeSizeBytes;
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>> CpuCompiler::Export(
+absl::StatusOr<std::unique_ptr<CompiledModule>> CpuCompiler::Export(
     Executable* executable) {
   auto* cpu_executable = tensorflow::down_cast<CpuExecutable*>(executable);
   if (!cpu_executable)
@@ -2292,7 +2292,7 @@ absl::StatusOr<std::unique_ptr<AotCompilationResult>> CpuCompiler::Export(
       cpu_executable->target_machine_options().ToProto());
 }
 
-absl::StatusOr<std::unique_ptr<AotCompilationResult>>
+absl::StatusOr<std::unique_ptr<CompiledModule>>
 CpuCompiler::LoadAotCompilationResult(
     const std::string& serialized_aot_result) {
   return CpuAotLoader::LoadAotCompilationResult(serialized_aot_result);
diff --git a/third_party/xla/xla/service/cpu/cpu_compiler.h b/third_party/xla/xla/service/cpu/cpu_compiler.h
index a01fef46396135..b72c188ba27dea 100644
--- a/third_party/xla/xla/service/cpu/cpu_compiler.h
+++ b/third_party/xla/xla/service/cpu/cpu_compiler.h
@@ -73,7 +73,7 @@ class CpuCompiler : public LLVMCompiler {
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
       const CompileOptions& options) override;
 
-  absl::StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
+  absl::StatusOr<std::vector<std::unique_ptr<CompiledModule>>>
   CompileAheadOfTime(std::unique_ptr<HloModule> hlo_module,
                      const AotCompilationOptions& options) override;
 
@@ -81,13 +81,13 @@ class CpuCompiler : public LLVMCompiler {
 
   HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
 
-  absl::StatusOr<std::unique_ptr<AotCompilationResult>> Export(
+  absl::StatusOr<std::unique_ptr<CompiledModule>> Export(
       Executable* executable) override;
 
   // Returns a (deserialized) AotCompilationResult from a serialized
   // AotCompilationResult.
-  absl::StatusOr<std::unique_ptr<AotCompilationResult>>
-  LoadAotCompilationResult(const std::string& serialized_aot_result) override;
+  absl::StatusOr<std::unique_ptr<CompiledModule>> LoadAotCompilationResult(
+      const std::string& serialized_aot_result) override;
 
   absl::StatusOr<HloSchedule> CreateHloSchedule(
       const HloModule& hlo_module) const;
@@ -123,8 +123,7 @@ class CpuCompiler : public LLVMCompiler {
       const llvm::PICLevel::Level& pic_level = llvm::PICLevel::NotPIC,
       const llvm::PIELevel::Level& pie_level = llvm::PIELevel::Default);
 
-  absl::StatusOr<std::unique_ptr<AotCompilationResult>>
-  CompileAheadOfTimeThunks(
+  absl::StatusOr<std::unique_ptr<CompiledModule>> CompileAheadOfTimeThunks(
       std::unique_ptr<HloModule> module,
       IrCompiler::TargetMachineBuilder target_machine_builder,
       const CpuAotCompilationOptions& aot_options, const llvm::Triple& triple,

From 37da2f66584cd5b05a9ded5d13709f561866c3b5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 00:39:25 -0800
Subject: [PATCH 659/753] Automated Code Change

PiperOrigin-RevId: 847648279
---
 .../xla/pjrt/stream_executor_executable.cc    | 27 ++++++++-----------
 .../xla/xla/pjrt/stream_executor_executable.h |  4 +--
 2 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/third_party/xla/xla/pjrt/stream_executor_executable.cc b/third_party/xla/xla/pjrt/stream_executor_executable.cc
index 4245c4da9ba100..8ee76ec15f4103 100644
--- a/third_party/xla/xla/pjrt/stream_executor_executable.cc
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.cc
@@ -43,12 +43,10 @@ namespace xla {
 absl::StatusOr<std::string> StreamExecutorExecutable::SerializeExecutable()
     const {
   std::string serialized;
-  if (std::holds_alternative<
-          std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
+  if (std::holds_alternative<std::vector<std::unique_ptr<CompiledModule>>>(
           executables_)) {
     const auto& aot_executables =
-        std::get<std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
-            executables_);
+        std::get<std::vector<std::unique_ptr<CompiledModule>>>(executables_);
     if (aot_executables.empty()) {
       return absl::InternalError("No local executable");
     }
@@ -64,7 +62,7 @@ absl::StatusOr<std::string> StreamExecutorExecutable::SerializeExecutable()
     Executable* built_executable = local_executables[0]->executable();
     CHECK(local_client_ != nullptr);
     TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<AotCompilationResult> aot_result,
+        std::unique_ptr<CompiledModule> aot_result,
         local_client_->backend().compiler()->Export(built_executable));
 
     TF_ASSIGN_OR_RETURN(serialized, aot_result->SerializeAsString());
@@ -84,9 +82,9 @@ absl::StatusOr<std::string> StreamExecutorExecutable::SerializeExecutable()
 
 StreamExecutorExecutable::StreamExecutorExecutable(
     const CompileOptions& compile_options,
-    std::vector<std::unique_ptr<xla::AotCompilationResult>> executables,
-    int num_replicas, int num_partitions, absl::string_view name,
-    absl::string_view fingerprint, absl::string_view default_memory_kind)
+    std::vector<std::unique_ptr<CompiledModule>> executables, int num_replicas,
+    int num_partitions, absl::string_view name, absl::string_view fingerprint,
+    absl::string_view default_memory_kind)
     : compile_options_(compile_options),
       executables_(std::move(executables)),
       num_replicas_(num_replicas),
@@ -96,8 +94,7 @@ StreamExecutorExecutable::StreamExecutorExecutable(
       default_memory_kind_(default_memory_kind) {
   std::vector<std::shared_ptr<HloModule>> hlo_modules;
   for (const auto& executable :
-       std::get<std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
-           executables_)) {
+       std::get<std::vector<std::unique_ptr<CompiledModule>>>(executables_)) {
     hlo_modules.push_back(executable->shared_optimized_module());
   }
   hlo_modules_ = std::move(hlo_modules);
@@ -131,7 +128,7 @@ absl::StatusOr<CompiledMemoryStats>
 StreamExecutorExecutable::GetCompiledMemoryStats() const {
   CompiledMemoryStats memory_stats = CompiledMemoryStats();
   if (auto* aot_executables =
-          std::get_if<std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
+          std::get_if<std::vector<std::unique_ptr<CompiledModule>>>(
               &executables_)) {
     if (aot_executables->size() != 1) {
       return Unimplemented(
@@ -176,8 +173,7 @@ StreamExecutorExecutable::GetCompiledMemoryStats() const {
 }
 
 int64_t StreamExecutorExecutable::SizeOfGeneratedCodeInBytes() const {
-  if (std::holds_alternative<
-          std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
+  if (std::holds_alternative<std::vector<std::unique_ptr<CompiledModule>>>(
           executables_)) {
     return 0;
   }
@@ -249,10 +245,9 @@ StreamExecutorExecutable::ConsumeExecutable(
     return std::get<std::vector<std::unique_ptr<LocalExecutable>>>(
         std::move(executables_));
   } else if (std::holds_alternative<
-                 std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
-                 executables_)) {
+                 std::vector<std::unique_ptr<CompiledModule>>>(executables_)) {
     auto aot_executables =
-        std::get<std::vector<std::unique_ptr<xla::AotCompilationResult>>>(
+        std::get<std::vector<std::unique_ptr<CompiledModule>>>(
             std::move(executables_));
     std::vector<std::unique_ptr<LocalExecutable>> local_executables;
     local_executables.reserve(aot_executables.size());
diff --git a/third_party/xla/xla/pjrt/stream_executor_executable.h b/third_party/xla/xla/pjrt/stream_executor_executable.h
index a0a97daa935646..0e21789076b3a4 100644
--- a/third_party/xla/xla/pjrt/stream_executor_executable.h
+++ b/third_party/xla/xla/pjrt/stream_executor_executable.h
@@ -43,7 +43,7 @@ class StreamExecutorExecutable : public PjRtExecutable {
  public:
   StreamExecutorExecutable(
       const CompileOptions& compile_options,
-      std::vector<std::unique_ptr<xla::AotCompilationResult>> executables,
+      std::vector<std::unique_ptr<CompiledModule>> executables,
       int num_replicas, int num_partitions, absl::string_view name,
       absl::string_view fingerprint, absl::string_view default_memory_kind);
 
@@ -101,7 +101,7 @@ class StreamExecutorExecutable : public PjRtExecutable {
   // The unoptimized HLO module proto is necessary for HLO debug dumping. It is
   // not available for deserialized executables.
   std::optional<HloModuleProto> unoptimized_hlo_module_proto_;
-  std::variant<std::vector<std::unique_ptr<xla::AotCompilationResult>>,
+  std::variant<std::vector<std::unique_ptr<CompiledModule>>,
                std::vector<std::unique_ptr<LocalExecutable>>>
       executables_;
   LocalClient* local_client_ = nullptr;

From 85172d7831f2af0d20c27a8608c99235968aacb5 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Mon, 22 Dec 2025 01:02:40 -0800
Subject: [PATCH 660/753] [XLA:GPU] Shard the gpu_compiler_test. The _h100 test
 regularly causes timeouts.

PiperOrigin-RevId: 847654247
---
 third_party/xla/xla/service/gpu/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 20378ae18cce67..49cab857c01cd0 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -1900,6 +1900,7 @@ xla_test(
     },
     backends = ["gpu"],
     data = ["gpu_compiler_test_autotune_db.textproto"],
+    shard_count = 2,
     deps = [
         ":alias_info",
         ":backend_configs_cc",

From 14b51dd70078ad43af54a83666427a9aa8a50c05 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 01:04:04 -0800
Subject: [PATCH 661/753] Update GraphDef version to 2449.

PiperOrigin-RevId: 847654695
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 6091b3a6cb3d81..d59705a833dfa8 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2448  // Updated: 2025/12/21
+#define TF_GRAPH_DEF_VERSION 2449  // Updated: 2025/12/22
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From d48869043b227074cd8bd1f7851f47a0737bcaf9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 01:04:16 -0800
Subject: [PATCH 662/753] compat: Update forward compatibility horizon to
 2025-12-22

PiperOrigin-RevId: 847654748
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 0008a1f31bae3b..52dc4100f8da06 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 21)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 22)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From 79af5068fdb70c1e954fb8eed2b95c435931696c Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Mon, 22 Dec 2025 01:09:10 -0800
Subject: [PATCH 663/753] [Autotuner] Avoid compiling all configurations if we
 only return the first one. This happens when we want to select the first
 configuration that successfuly compiles. E.g. for determinism.

PiperOrigin-RevId: 847656341
---
 .../xla/xla/backends/autotuner/autotuner.cc   |  19 +++
 .../xla/backends/autotuner/autotuner_test.cc  | 157 ++++++++++++++----
 2 files changed, 143 insertions(+), 33 deletions(-)

diff --git a/third_party/xla/xla/backends/autotuner/autotuner.cc b/third_party/xla/xla/backends/autotuner/autotuner.cc
index 40a0f43608e097..554895ab1a17e1 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner.cc
@@ -444,6 +444,25 @@ std::vector<absl::StatusOr<std::unique_ptr<Executable>>> Autotuner::CompileAll(
   XLA_SCOPED_LOGGING_TIMER_LEVEL("CompileAll", 5);
   tsl::profiler::TraceMe traceme("CompileAll");
   tsl::profiler::ScopedAnnotation annotation("XlaAutotunerCompilation");
+
+  if (autotune_config_.select_first_config) {
+    std::vector<absl::StatusOr<std::unique_ptr<Executable>>> executables;
+    for (int i = 0; i < configs.size(); ++i) {
+      absl::StatusOr<std::unique_ptr<Executable>> executable =
+          configs[i].codegen_backend->Compile(*instr,
+                                              *configs[i].backend_config);
+      if (executable.ok()) {
+        std::vector<absl::StatusOr<std::unique_ptr<Executable>>> success_result;
+        success_result.push_back(std::move(executable));
+        Config selected_config = std::move(configs[i]);
+        configs.clear();
+        configs.push_back(std::move(selected_config));
+        return success_result;
+      }
+    }
+    return executables;
+  }
+
   if (thread_pool_ == nullptr) {
     std::vector<absl::StatusOr<std::unique_ptr<Executable>>> executables;
     executables.reserve(configs.size());
diff --git a/third_party/xla/xla/backends/autotuner/autotuner_test.cc b/third_party/xla/xla/backends/autotuner/autotuner_test.cc
index bd4cc84715935b..449d837c23334d 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner_test.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner_test.cc
@@ -241,7 +241,7 @@ TEST_F(AutotunerTest, AutotuneButNoSupportedConfigs) {
   auto profiler = std::make_unique<MockProfiler>();
   auto device_description = CreateDummyDeviceDescription();
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::move(cache_manager)));
@@ -269,7 +269,7 @@ TEST_F(AutotunerTest, AutotuneButNoCompiledConfigs) {
 
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::move(cache_manager)));
@@ -308,7 +308,7 @@ TEST_F(AutotunerTest, AutotuneAppliesBestConfigAndSkipsNonCompilableConfig) {
       .WillOnce(Return(ProfileResult({absl::Seconds(1)})));
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::move(cache_manager)));
@@ -346,7 +346,7 @@ TEST_F(AutotunerTest, AutotuneAppliesBestConfigUsingThreadPool) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
   tsl::thread::ThreadPool thread_pool(tsl::Env::Default(), "test", 2);
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::move(cache_manager), &thread_pool));
@@ -360,27 +360,27 @@ TEST_F(AutotunerTest, AutotuneModuleFindsNoInstructionsToAutotune) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
   auto device_description = CreateDummyDeviceDescription();
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), nullptr, config_,
                         std::make_unique<MockAutotunerCache>()));
 
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHlo));
   EXPECT_THAT(autotuner->Autotune(
                   module.get(), [](const HloInstruction& _) { return false; }),
               absl_testing::IsOk());
 }
 
 TEST_F(AutotunerTest, AutotuneModuleFollowsFilter) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHlo));
 
   auto should_autotune = [](const HloInstruction& instruction) {
     return instruction.opcode() == HloOpcode::kCopy;
   };
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<Autotuner> autotuner,
       SetupAutotunerWithExpectations(
           /*instrs_to_autotune=*/{HloOpcode::kCopy},
@@ -391,13 +391,13 @@ TEST_F(AutotunerTest, AutotuneModuleFollowsFilter) {
 }
 
 TEST_F(AutotunerTest, AutotuneModuleWithDuplicateInstructions) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHlo));
 
   auto should_autotune = [](const HloInstruction& instruction) {
     return instruction.opcode() == HloOpcode::kAdd;
   };
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<Autotuner> autotuner,
       SetupAutotunerWithExpectations(
           /*instrs_to_autotune=*/{HloOpcode::kAdd},
@@ -430,7 +430,7 @@ TEST_F(AutotunerTest, AutotuneButOneBackendFails) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(good_backend));
   backends.push_back(std::move(bad_backend));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::move(cache_manager)));
@@ -459,7 +459,7 @@ TEST_F(AutotunerTest, CacheHit) {
 
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::move(cache_manager)));
@@ -506,7 +506,7 @@ TEST_F(AutotunerTest, AutotuneWithBufferCheckFiltersWrongResults) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend_1));
   backends.push_back(std::move(backend_2));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::make_unique<MockAutotunerCache>()));
@@ -543,7 +543,7 @@ TEST_F(AutotunerTest, AutotuneSkipsBufferCheckWhenNoReferenceOutput) {
 
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::make_unique<MockAutotunerCache>()));
@@ -600,7 +600,7 @@ TEST_F(AutotunerTest, AutotuneWithScratchBytesOptimization) {
   backends.push_back(std::move(backend_1));
   config_.optimize_scratch_bytes = true;
   config_.scratch_bytes_window_size_us = 8;
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), std::move(profiler), config_,
                         std::make_unique<MockAutotunerCache>()));
@@ -620,16 +620,16 @@ TEST_F(AutotunerTest, ExpectAllInstructionsInCache) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
 
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto autotuner, Autotuner::Create(std::move(backends), nullptr, config_,
-                                        std::move(cache_manager)));
+  ASSERT_OK_AND_ASSIGN(auto autotuner,
+                       Autotuner::Create(std::move(backends), nullptr, config_,
+                                         std::move(cache_manager)));
   auto dummy_instr = HloInstruction::CreateConstant(LiteralUtil::CreateR0(1));
   EXPECT_THAT(autotuner->Autotune(dummy_instr.get()),
               StatusIs(absl::StatusCode::kNotFound));
 }
 
 TEST_F(AutotunerTest, DumpLogsToFile) {
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       tsl::testing::TemporaryDirectory temp_dir,
       tsl::testing::TemporaryDirectory::CreateForCurrentTestcase());
   config_.dump_logs_to = tsl::io::JoinPath(temp_dir.path(), "dump.log");
@@ -659,7 +659,7 @@ TEST_F(AutotunerTest, DumpLogsToFile) {
       .WillOnce(Return(ProfileResult({absl::Seconds(1)})));
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner, Autotuner::Create(std::move(backends),
                                         std::move(profiler), config_, nullptr));
   auto module = ParseAndReturnVerifiedModule(kHlo).value();
@@ -709,7 +709,7 @@ TEST_F(AutotunerTest, ExcludeCublasConfig) {
   backends.push_back(std::move(backend));
 
   auto profiler = std::make_unique<MockProfiler>();
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner, Autotuner::Create(std::move(backends),
                                         std::move(profiler), config_, nullptr));
   auto module = ParseAndReturnVerifiedModule(kHlo).value();
@@ -729,7 +729,6 @@ TEST_F(AutotunerTest, SelectFirstConfig) {
   EXPECT_CALL(*backend, GetSupportedConfigs(_))
       .WillOnce(Return(std::move(configs)));
   EXPECT_CALL(*backend, Compile(_, _))
-      .WillOnce(Return(std::unique_ptr<Executable>()))
       .WillOnce(Return(std::unique_ptr<Executable>()));
   EXPECT_CALL(*backend, ApplyConfig(_, ConfigMatcher("test_config_1")))
       .Times(1)
@@ -739,7 +738,39 @@ TEST_F(AutotunerTest, SelectFirstConfig) {
 
   auto profiler = std::make_unique<MockProfiler>();
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
+      auto autotuner, Autotuner::Create(std::move(backends),
+                                        std::move(profiler), config_, nullptr));
+  auto module = ParseAndReturnVerifiedModule(kHlo).value();
+  auto dummy_instr = module->entry_computation()->root_instruction();
+  EXPECT_THAT(autotuner->Autotune(dummy_instr), absl_testing::IsOk());
+}
+
+TEST_F(AutotunerTest, SelectFirstConfigStopsAfterFirstSuccess) {
+  config_.select_first_config = true;
+
+  std::vector<std::unique_ptr<BackendConfig>> configs;
+  configs.push_back(GetTestConfig("test_config_1"));
+  configs.push_back(GetTestConfig("test_config_2"));
+  configs.push_back(GetTestConfig("test_config_3"));
+
+  auto backend = std::make_unique<MockCodegenBackend>();
+  EXPECT_CALL(*backend, GetSupportedConfigs(_))
+      .WillOnce(Return(std::move(configs)));
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_1")))
+      .WillOnce(Return(std::unique_ptr<Executable>()));
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_2"))).Times(0);
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_3"))).Times(0);
+
+  EXPECT_CALL(*backend, ApplyConfig(_, ConfigMatcher("test_config_1")))
+      .Times(1)
+      .WillRepeatedly(Return(absl::OkStatus()));
+  std::vector<std::unique_ptr<CodegenBackend>> backends;
+  backends.push_back(std::move(backend));
+
+  auto profiler = std::make_unique<MockProfiler>();
+
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner, Autotuner::Create(std::move(backends),
                                         std::move(profiler), config_, nullptr));
   auto module = ParseAndReturnVerifiedModule(kHlo).value();
@@ -747,6 +778,66 @@ TEST_F(AutotunerTest, SelectFirstConfig) {
   EXPECT_THAT(autotuner->Autotune(dummy_instr), absl_testing::IsOk());
 }
 
+TEST_F(AutotunerTest, SelectFirstConfigFirstConfigFails) {
+  config_.select_first_config = true;
+
+  std::vector<std::unique_ptr<BackendConfig>> configs;
+  configs.push_back(GetTestConfig("test_config_1"));
+  configs.push_back(GetTestConfig("test_config_2"));
+
+  auto backend = std::make_unique<MockCodegenBackend>();
+  EXPECT_CALL(*backend, GetSupportedConfigs(_))
+      .WillOnce(Return(std::move(configs)));
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_1")))
+      .WillOnce(Return(absl::InternalError("test error")));
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_2")))
+      .WillOnce(Return(std::unique_ptr<Executable>()));
+
+  EXPECT_CALL(*backend, ApplyConfig(_, ConfigMatcher("test_config_2")))
+      .Times(1)
+      .WillRepeatedly(Return(absl::OkStatus()));
+  std::vector<std::unique_ptr<CodegenBackend>> backends;
+  backends.push_back(std::move(backend));
+
+  auto profiler = std::make_unique<MockProfiler>();
+
+  ASSERT_OK_AND_ASSIGN(
+      auto autotuner, Autotuner::Create(std::move(backends),
+                                        std::move(profiler), config_, nullptr));
+  auto module = ParseAndReturnVerifiedModule(kHlo).value();
+  auto dummy_instr = module->entry_computation()->root_instruction();
+  EXPECT_THAT(autotuner->Autotune(dummy_instr), absl_testing::IsOk());
+}
+
+TEST_F(AutotunerTest, SelectFirstConfigAllConfigsFail) {
+  config_.select_first_config = true;
+
+  std::vector<std::unique_ptr<BackendConfig>> configs;
+  configs.push_back(GetTestConfig("test_config_1"));
+  configs.push_back(GetTestConfig("test_config_2"));
+
+  auto backend = std::make_unique<MockCodegenBackend>();
+  EXPECT_CALL(*backend, GetSupportedConfigs(_))
+      .WillOnce(Return(std::move(configs)));
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_1")))
+      .WillOnce(Return(absl::InternalError("test error")));
+  EXPECT_CALL(*backend, Compile(_, ConfigMatcher("test_config_2")))
+      .WillOnce(Return(absl::InternalError("test error")));
+
+  std::vector<std::unique_ptr<CodegenBackend>> backends;
+  backends.push_back(std::move(backend));
+
+  auto profiler = std::make_unique<MockProfiler>();
+
+  ASSERT_OK_AND_ASSIGN(
+      auto autotuner, Autotuner::Create(std::move(backends),
+                                        std::move(profiler), config_, nullptr));
+  auto module = ParseAndReturnVerifiedModule(kHlo).value();
+  auto dummy_instr = module->entry_computation()->root_instruction();
+  EXPECT_THAT(autotuner->Autotune(dummy_instr),
+              StatusIs(absl::StatusCode::kInternal));
+}
+
 TEST_F(AutotunerTest, UseDefaultConfig) {
   config_.use_default_config = true;
 
@@ -760,7 +851,7 @@ TEST_F(AutotunerTest, UseDefaultConfig) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), /*profiler=*/nullptr, config_,
                         /*cache=*/nullptr));
@@ -782,7 +873,7 @@ TEST_F(AutotunerTest, UseDefaultConfigUnimplemented) {
   std::vector<std::unique_ptr<CodegenBackend>> backends;
   backends.push_back(std::move(backend));
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       auto autotuner,
       Autotuner::Create(std::move(backends), /*profiler=*/nullptr, config_,
                         /*cache=*/nullptr));
@@ -810,8 +901,8 @@ AutotunerCacheInterface::Config GetCacheConfig(absl::string_view name) {
 };
 
 TEST_F(AutotunerTest, ShardedAutotuning) {
-  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseAndReturnVerifiedModule(kHlo));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       ParseAndReturnVerifiedModule(kHlo));
   constexpr int kShardCount = 2;
   auto should_autotune = [](const HloInstruction& instruction) {
     return instruction.opcode() == HloOpcode::kAdd ||
@@ -842,7 +933,7 @@ TEST_F(AutotunerTest, ShardedAutotuning) {
   EXPECT_CALL(*cache, Lookup(InstrPtrMatcher(HloOpcode::kAdd)))
       .WillOnce(Return(GetCacheConfig("best_config")));
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<Autotuner> autotuner,
       SetupAutotunerWithExpectations(
           /*instrs_to_autotune=*/{HloOpcode::kCopy},
@@ -859,7 +950,7 @@ TEST_F(AutotunerTest, ShardedAutotuning) {
 }
 
 TEST_F(AutotunerTest, DumpHlos) {
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       tsl::testing::TemporaryDirectory dump_dir,
       tsl::testing::TemporaryDirectory::CreateForCurrentTestcase());
   auto module = ParseAndReturnVerifiedModule(kHlo).value();
@@ -870,7 +961,7 @@ TEST_F(AutotunerTest, DumpHlos) {
            instruction.opcode() == HloOpcode::kAdd;
   };
 
-  TF_ASSERT_OK_AND_ASSIGN(
+  ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<Autotuner> autotuner,
       SetupAutotunerWithExpectations(
           /*instrs_to_autotune=*/{HloOpcode::kCopy, HloOpcode::kAdd},

From dfc5b243cae343738a0ecab747bbe7ed96859afd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 01:45:07 -0800
Subject: [PATCH 664/753] Automated Code Change

PiperOrigin-RevId: 847667783
---
 tensorflow/core/graph/algorithm_test.cc       | 19 ++++-----
 .../core/graph/collective_order_test.cc       | 23 ++++++-----
 tensorflow/core/graph/graph_test.cc           | 40 +++++++++----------
 tensorflow/core/graph/optimizer_cse.cc        | 12 +++---
 tensorflow/core/graph/subgraph_test.cc        | 39 +++++++++---------
 5 files changed, 68 insertions(+), 65 deletions(-)

diff --git a/tensorflow/core/graph/algorithm_test.cc b/tensorflow/core/graph/algorithm_test.cc
index 3c6cc215e95bc5..0c560b57044cb4 100644
--- a/tensorflow/core/graph/algorithm_test.cc
+++ b/tensorflow/core/graph/algorithm_test.cc
@@ -47,11 +47,12 @@ REGISTER_OP("TestBinary")
 
 // Compares that the order of nodes in 'inputs' respects the
 // pair orders described in 'ordered_pairs'.
-bool ExpectBefore(const std::vector<std::pair<string, string>>& ordered_pairs,
-                  const std::vector<Node*>& inputs, string* error) {
-  for (const std::pair<string, string>& pair : ordered_pairs) {
-    const string& before_node = pair.first;
-    const string& after_node = pair.second;
+bool ExpectBefore(
+    const std::vector<std::pair<std::string, std::string>>& ordered_pairs,
+    const std::vector<Node*>& inputs, std::string* error) {
+  for (const std::pair<std::string, std::string>& pair : ordered_pairs) {
+    const std::string& before_node = pair.first;
+    const std::string& after_node = pair.second;
     bool seen_before = false;
     bool seen_both = false;
     for (const Node* node : inputs) {
@@ -97,10 +98,10 @@ TEST(AlgorithmTest, ReversePostOrder) {
   GetReversePostOrder(g, &order);
 
   // Check that the order respects the dependencies correctly.
-  std::vector<std::pair<string, string>> reverse_orders = {
+  std::vector<std::pair<std::string, std::string>> reverse_orders = {
       {"W1", "input"}, {"W1", "t1"},    {"W1", "t2"}, {"W1", "t3"},
       {"input", "t1"}, {"input", "t3"}, {"t1", "t2"}, {"W2", "t3"}};
-  string error;
+  std::string error;
   EXPECT_TRUE(ExpectBefore(reverse_orders, order, &error)) << error;
 
   // A false ordering should fail the check.
@@ -111,7 +112,7 @@ TEST(AlgorithmTest, ReversePostOrder) {
   GetPostOrder(g, &order);
 
   // Check that the order respects the dependencies correctly.
-  std::vector<std::pair<string, string>> orders = {
+  std::vector<std::pair<std::string, std::string>> orders = {
       {"input", "W1"}, {"t1", "W1"},    {"t2", "W1"}, {"t3", "W1"},
       {"t1", "input"}, {"t3", "input"}, {"t2", "t1"}, {"t3", "W2"}};
   EXPECT_TRUE(ExpectBefore(orders, order, &error)) << error;
@@ -131,7 +132,7 @@ TEST(AlgorithmTest, ReversePostOrderStable) {
     // raw pointer value of Node. Stable post order suppose to remove this
     // nondeterminism by enforcing an ordering based on node ids.
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
-    string error;
+    std::string error;
     Node* w1 = SourceOp("TestParams", b.opts().WithName("W1"));
     Node* input =
         SourceOp("TestInput", b.opts().WithName("input").WithControlInput(w1));
diff --git a/tensorflow/core/graph/collective_order_test.cc b/tensorflow/core/graph/collective_order_test.cc
index 46333535cbbaad..2206fc1b309d3b 100644
--- a/tensorflow/core/graph/collective_order_test.cc
+++ b/tensorflow/core/graph/collective_order_test.cc
@@ -32,11 +32,12 @@ REGISTER_OP("TestParams").Output("o: float");
 // `expected_collective_nodes`, and that the list of control edges between these
 // collective nodes matches `expected_collective_control_edges`.
 void VerifyGraph(const Graph& graph,
-                 const std::vector<string>& expected_collective_nodes,
-                 const std::vector<std::pair<string, string>>&
+                 const std::vector<std::string>& expected_collective_nodes,
+                 const std::vector<std::pair<std::string, std::string>>&
                      expected_collective_control_edges) {
-  std::vector<string> actual_collective_nodes;
-  std::vector<std::pair<string, string>> actual_collective_control_edges;
+  std::vector<std::string> actual_collective_nodes;
+  std::vector<std::pair<std::string, std::string>>
+      actual_collective_control_edges;
   for (const Node* src : graph.nodes()) {
     if (!src->IsCollective()) {
       continue;
@@ -63,13 +64,13 @@ void VerifyGraph(const Graph& graph,
 // `wait_for_map`.
 void VerifyAttrs(
     const Graph& graph,
-    const std::unordered_map<string, std::vector<int32>> wait_for_map) {
+    const std::unordered_map<std::string, std::vector<int32_t>> wait_for_map) {
   for (const Node* node : graph.nodes()) {
     if (node->IsCollective() ||
         wait_for_map.find(node->name()) == wait_for_map.end()) {
       continue;
     }
-    std::vector<int32> wait_for_actual;
+    std::vector<int32_t> wait_for_actual;
     TF_EXPECT_OK(GetNodeAttr(node->attrs(), "wait_for", &wait_for_actual));
     auto wait_for_expected = wait_for_map.at(node->name());
     EXPECT_THAT(wait_for_actual, UnorderedElementsAreArray(wait_for_expected));
@@ -77,7 +78,7 @@ void VerifyAttrs(
 }
 
 Node* CollectiveReduceNode(GraphDefBuilder* builder, Node* input,
-                           const string& name, const string& device,
+                           const std::string& name, const std::string& device,
                            int instance_key) {
   Node* collective_node =
       ops::UnaryOp("CollectiveReduce", input,
@@ -109,8 +110,8 @@ Node* CollectiveReduceNode(GraphDefBuilder* builder, Node* input,
 // inputs, `id` is identity node.
 std::unique_ptr<Graph> InitGraph() {
   GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
-  const string dev1 = "/job:localhost/replica:0/task:0/device:CPU:1";
+  const std::string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  const std::string dev1 = "/job:localhost/replica:0/task:0/device:CPU:1";
   Node* a = ops::SourceOp("TestParams",
                           builder.opts().WithName("a").WithDevice(dev0));
   Node* b = ops::SourceOp("TestParams",
@@ -165,7 +166,7 @@ TEST(CollectiveOrderTest, SimpleOrderAttr) {
 // `id` is identity node.
 std::unique_ptr<Graph> InitGraph2() {
   GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  const std::string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
   Node* a = ops::SourceOp("TestParams",
                           builder.opts().WithName("a").WithDevice(dev0));
   Node* c1 = CollectiveReduceNode(&builder, a, "c1", dev0, 1);
@@ -201,7 +202,7 @@ TEST(CollectiveOrderTest, SimpleOrder2) {
 //
 std::unique_ptr<Graph> InitGraphForPruning() {
   GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
-  const string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
+  const std::string dev0 = "/job:localhost/replica:0/task:0/device:CPU:0";
   Node* w = ops::SourceOp("TestParams",
                           builder.opts().WithName("w").WithDevice(dev0));
   Node* x = ops::SourceOp("TestParams",
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index a5b519365034f2..fb5ce07959a424 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -100,13 +100,13 @@ class GraphTest : public ::testing::Test {
     EXPECT_EQ(edges, graph_.num_edges());
   }
 
-  Node* AddNodeWithName(const string& name) {
+  Node* AddNodeWithName(const std::string& name) {
     Node* node;
     TF_CHECK_OK(NodeBuilder(name, "NoOp").Finalize(&graph_, &node));
     return node;
   }
 
-  Node* FromNodeDef(const string& name, const string& node_type,
+  Node* FromNodeDef(const std::string& name, const std::string& node_type,
                     int num_inputs) {
     auto builder = NodeDefBuilder(name, node_type);
     for (int i = 0; i < num_inputs; ++i) {
@@ -122,14 +122,14 @@ class GraphTest : public ::testing::Test {
     return node;
   }
 
-  void FromGraphDef(const string& gdef_ascii) {
+  void FromGraphDef(const std::string& gdef_ascii) {
     GraphDef gdef;
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef));
     GraphConstructorOptions opts;
     TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef, &graph_));
   }
 
-  Node* FindNode(const string& name) {
+  Node* FindNode(const std::string& name) {
     for (Node* node : graph_.nodes()) {
       if (node->name() == name) return node;
     }
@@ -158,8 +158,8 @@ class GraphTest : public ::testing::Test {
  private:
   // Convert a list of nodes to a sorted list of strings so failure messages
   // are readable.
-  static std::vector<string> Stringify(const std::vector<Node*>& nodes) {
-    std::vector<string> result;
+  static std::vector<std::string> Stringify(const std::vector<Node*>& nodes) {
+    std::vector<std::string> result;
     result.reserve(nodes.size());
     for (Node* n : nodes) {
       result.push_back(n->DebugString());
@@ -322,14 +322,14 @@ TEST_F(GraphTest, NodeIteration) {
   graph_.RemoveNode(c);
 
   // expected = set of all node DebugStrings we expect in the graph
-  std::set<string> expected;
+  std::set<std::string> expected;
   expected.insert(graph_.source_node()->DebugString());
   expected.insert(a->DebugString());
   expected.insert(d->DebugString());
   expected.insert(graph_.sink_node()->DebugString());
 
   // Verify that iterating through ids gets the same set of nodes.
-  std::set<string> actual;
+  std::set<std::string> actual;
   for (int id = 0; id < graph_.num_node_ids(); ++id) {
     Node* node = graph_.FindNodeId(id);
     if (node != nullptr) {
@@ -370,7 +370,7 @@ TEST_F(GraphTest, AddAttr) {
 
   n1->AddAttr("_a", "new_attr");
 
-  string attr;
+  std::string attr;
   EXPECT_EQ(absl::OkStatus(), GetNodeAttr(n1->attrs(), "_a", &attr));
   EXPECT_EQ("new_attr", attr);
 
@@ -389,13 +389,13 @@ TEST_F(GraphTest, AddAttr) {
 }
 
 // Convert edge iteration results into a sorted string.
-static string EdgeIter(const Graph& g) {
+static std::string EdgeIter(const Graph& g) {
   std::vector<std::pair<int, int> > edges;
   for (const Edge* e : g.edges()) {
     edges.push_back(std::make_pair(e->src()->id(), e->dst()->id()));
   }
   std::sort(edges.begin(), edges.end());
-  string result;
+  std::string result;
   for (auto& p : edges) {
     absl::StrAppend(&result, p.first, "->", p.second, ";");
   }
@@ -422,9 +422,9 @@ TEST_F(GraphTest, EdgeIteration) {
 }
 
 TEST_F(GraphTest, NewName) {
-  string a1 = graph_.NewName("A");
-  string a2 = graph_.NewName("A");
-  string b1 = graph_.NewName("B");
+  std::string a1 = graph_.NewName("A");
+  std::string a2 = graph_.NewName("A");
+  std::string b1 = graph_.NewName("B");
   EXPECT_NE(a1, a2);
   EXPECT_NE(a1, b1);
   EXPECT_NE(a2, b1);
@@ -446,19 +446,19 @@ TEST_F(GraphTest, IsValidNode) {
   // nullptr
   absl::Status s = graph_.IsValidNode(nullptr);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_EQ(string("Node is null"), s.message());
+  EXPECT_EQ(std::string("Node is null"), s.message());
 
   // node id_ is too high
   s = graph_.IsValidNode(g2_node2);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_EQ(string("node id 3 is >= than number of nodes in graph 3"),
+  EXPECT_EQ(std::string("node id 3 is >= than number of nodes in graph 3"),
             s.message());
 
   // valid id_ but different ptr
   s = graph_.IsValidNode(g2_node1);
   EXPECT_EQ(error::INVALID_ARGUMENT, s.code());
-  EXPECT_EQ(string("Node with id 2 is different from the passed in node. "
-                   "Does it belong to a different graph?"),
+  EXPECT_EQ(std::string("Node with id 2 is different from the passed in node. "
+                        "Does it belong to a different graph?"),
             s.message());
 }
 
@@ -695,8 +695,8 @@ TEST_F(GraphTest, BuildNodeNameIndex) {
   auto node_name_index = graph_.BuildNodeNameIndex();
   EXPECT_EQ(node_name_index.size(), 5);
 
-  std::vector<string> node_names{"_SOURCE", "_SINK", "A", "B", "C"};
-  for (const string& node_name : node_names) {
+  std::vector<std::string> node_names{"_SOURCE", "_SINK", "A", "B", "C"};
+  for (const std::string& node_name : node_names) {
     EXPECT_NE(node_name_index.find(node_name), node_name_index.end());
     EXPECT_EQ(node_name_index[node_name], FindNode(node_name));
   }
diff --git a/tensorflow/core/graph/optimizer_cse.cc b/tensorflow/core/graph/optimizer_cse.cc
index 39b53541081659..f18d8a3bca0f1a 100644
--- a/tensorflow/core/graph/optimizer_cse.cc
+++ b/tensorflow/core/graph/optimizer_cse.cc
@@ -93,9 +93,9 @@ static size_t kIllegalNodeHash = 0;
 
 class Hasher {
  public:
-  uint64 hash() { return h_ == kIllegalNodeHash ? kIllegalNodeHash + 1 : h_; }
+  uint64_t hash() { return h_ == kIllegalNodeHash ? kIllegalNodeHash + 1 : h_; }
 
-  void MixString(const string& s) { h_ = Hash64(s.data(), s.size(), h_); }
+  void MixString(const std::string& s) { h_ = Hash64(s.data(), s.size(), h_); }
 
   void MixInteger(size_t z) { h_ = Hash64Combine(h_, z); }
 
@@ -122,7 +122,7 @@ class Hasher {
     // This kBufSize makes sizeof(HashingOutputStream) == 256.  It's not chosen
     // for any particular reason except it's a nice even number of cache lines.
     static constexpr size_t kBufSize = 228;
-    static constexpr uint64 kDefaultSeed = 2570847921467975139ULL;
+    static constexpr uint64_t kDefaultSeed = 2570847921467975139ULL;
     bool Next(void** data, int* size) override {
       if (i_ == kBufSize) {
         // Mix the chunk in.
@@ -174,7 +174,7 @@ class Hasher {
 
     bool AllowsAliasing() const override { return true; }
 
-    uint64 hash() {
+    uint64_t hash() {
       if (i_ != 0) {
         Mix(buf_, i_);
         i_ = 0;
@@ -190,10 +190,10 @@ class Hasher {
     char buf_[kBufSize];
     int i_ = 0;
     int64_t byte_count_ = 0;
-    uint64 h_ = kDefaultSeed;
+    uint64_t h_ = kDefaultSeed;
   };
 
-  uint64 h_ = HashingOutputStream::kDefaultSeed;
+  uint64_t h_ = HashingOutputStream::kDefaultSeed;
 };
 
 size_t OptimizerCSE::NodeHash(const Node* n) {
diff --git a/tensorflow/core/graph/subgraph_test.cc b/tensorflow/core/graph/subgraph_test.cc
index 31c5cf8a3bb444..a5f4be88e8e5de 100644
--- a/tensorflow/core/graph/subgraph_test.cc
+++ b/tensorflow/core/graph/subgraph_test.cc
@@ -49,24 +49,24 @@ class SubgraphTest : public ::testing::Test {
 
   ~SubgraphTest() override {}
 
-  void ExpectOK(const string& gdef_ascii) {
+  void ExpectOK(const std::string& gdef_ascii) {
     CHECK(protobuf::TextFormat::ParseFromString(gdef_ascii, &gdef_));
     GraphConstructorOptions opts;
     TF_CHECK_OK(ConvertGraphDefToGraph(opts, gdef_, g_.get()));
   }
 
-  Node* FindNode(const string& name) {
+  Node* FindNode(const std::string& name) {
     for (Node* n : g_->nodes()) {
       if (n->name() == name) return n;
     }
     return nullptr;
   }
 
-  bool HasNode(const string& name) { return FindNode(name) != nullptr; }
+  bool HasNode(const std::string& name) { return FindNode(name) != nullptr; }
 
-  void ExpectNodes(const string& nodes) {
+  void ExpectNodes(const std::string& nodes) {
     int count = 0;
-    std::vector<string> actual_nodes;
+    std::vector<std::string> actual_nodes;
     for (Node* n : g_->nodes()) {
       if (n->IsOp()) {
         count++;
@@ -77,9 +77,9 @@ class SubgraphTest : public ::testing::Test {
 
     LOG(INFO) << "Nodes present: " << absl::StrJoin(actual_nodes, " ");
 
-    std::vector<string> expected_nodes = str_util::Split(nodes, ',');
+    std::vector<std::string> expected_nodes = str_util::Split(nodes, ',');
     std::sort(expected_nodes.begin(), expected_nodes.end());
-    for (const string& s : expected_nodes) {
+    for (const std::string& s : expected_nodes) {
       Node* n = FindNode(s);
       EXPECT_TRUE(n != nullptr) << s;
       if (n->type_string() == "_Send" || n->type_string() == "_Recv") {
@@ -92,7 +92,8 @@ class SubgraphTest : public ::testing::Test {
         << "\nExpected: " << absl::StrJoin(expected_nodes, ",");
   }
 
-  bool HasEdge(const string& src, int src_out, const string& dst, int dst_in) {
+  bool HasEdge(const std::string& src, int src_out, const std::string& dst,
+               int dst_in) {
     for (const Edge* e : g_->edges()) {
       if (e->src()->name() == src && e->src_output() == src_out &&
           e->dst()->name() == dst && e->dst_input() == dst_in)
@@ -100,20 +101,20 @@ class SubgraphTest : public ::testing::Test {
     }
     return false;
   }
-  bool HasControlEdge(const string& src, const string& dst) {
+  bool HasControlEdge(const std::string& src, const std::string& dst) {
     return HasEdge(src, Graph::kControlSlot, dst, Graph::kControlSlot);
   }
 
-  string Subgraph(const string& fed_str, const string& fetch_str,
-                  const string& targets_str,
-                  bool use_function_convention = false) {
+  std::string Subgraph(const std::string& fed_str, const std::string& fetch_str,
+                       const std::string& targets_str,
+                       bool use_function_convention = false) {
     Graph* subgraph = new Graph(OpRegistry::Global());
     CopyGraph(*g_, subgraph);
-    std::vector<string> fed =
+    std::vector<std::string> fed =
         str_util::Split(fed_str, ',', str_util::SkipEmpty());
-    std::vector<string> fetch =
+    std::vector<std::string> fetch =
         str_util::Split(fetch_str, ',', str_util::SkipEmpty());
-    std::vector<string> targets =
+    std::vector<std::string> targets =
         str_util::Split(targets_str, ',', str_util::SkipEmpty());
 
     subgraph::RewriteGraphMetadata metadata;
@@ -355,7 +356,7 @@ void BM_SubgraphHelper(::testing::benchmark::State& state,
     GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
     Node* last_node = nullptr;
     for (int i = 0; i < num_nodes; i++) {
-      string name = absl::StrCat("N", i);
+      std::string name = absl::StrCat("N", i);
       if (i > 0) {
         last_node = ops::UnaryOp("Op", last_node, b.opts().WithName(name));
       } else {
@@ -365,12 +366,12 @@ void BM_SubgraphHelper(::testing::benchmark::State& state,
     TF_CHECK_OK(GraphDefBuilderToGraph(b, &g));
   }
 
-  std::vector<string> fed;
+  std::vector<std::string> fed;
   if (num_nodes > 1000) {
     fed.push_back(absl::StrCat("N", num_nodes - 1000));
   }
-  std::vector<string> fetch;
-  std::vector<string> targets = {absl::StrCat("N", num_nodes - 1)};
+  std::vector<std::string> fetch;
+  std::vector<std::string> targets = {absl::StrCat("N", num_nodes - 1)};
 
   for (auto s : state) {
     Graph* subgraph = new Graph(OpRegistry::Global());

From 23dd865ee573dbe5b112f5e1feab048df75ff227 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Mon, 22 Dec 2025 02:22:47 -0800
Subject: [PATCH 665/753] Remove redundant TENSORFLOW_USE_ROCM define.

The `TENSORFLOW_USE_ROCM=1` local define is no longer required for the `rocm_solver_context` target.

PiperOrigin-RevId: 847677878
---
 third_party/xla/xla/stream_executor/rocm/BUILD | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/third_party/xla/xla/stream_executor/rocm/BUILD b/third_party/xla/xla/stream_executor/rocm/BUILD
index 885d88ec2812b0..5b01213aeb2a06 100644
--- a/third_party/xla/xla/stream_executor/rocm/BUILD
+++ b/third_party/xla/xla/stream_executor/rocm/BUILD
@@ -422,9 +422,6 @@ cc_library(
     name = "rocm_solver_context",
     srcs = ["rocm_solver_context.cc"],
     hdrs = ["rocm_solver_context.h"],
-    local_defines = [
-        "TENSORFLOW_USE_ROCM=1",
-    ],
     tags = [
         "gpu",
         "manual",

From f5b102299e277dfea97cee3cfc630278bcd8daa1 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Mon, 22 Dec 2025 02:41:59 -0800
Subject: [PATCH 666/753] [Autotuner] Log autotuner config in readable json
 format. When debugging the autotuner we often want to know the values of the
 AutotuneConfig.

PiperOrigin-RevId: 847683182
---
 .../xla/xla/backends/autotuner/autotuner.cc   | 27 ++++++++++++++++
 .../xla/xla/backends/autotuner/autotuner.h    |  2 ++
 .../xla/backends/autotuner/autotuner_test.cc  | 31 +++++++++++++++++++
 .../service/gpu/autotuning/autotuner_pass.cc  |  1 +
 4 files changed, 61 insertions(+)

diff --git a/third_party/xla/xla/backends/autotuner/autotuner.cc b/third_party/xla/xla/backends/autotuner/autotuner.cc
index 554895ab1a17e1..2f950b590aa8ef 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner.cc
@@ -499,6 +499,7 @@ absl::StatusOr<std::vector<Autotuner::ConfigResult>> Autotuner::ProfileAll(
 
   std::optional<ScopedShapedBuffer> reference_output;
   if (autotune_config_.check_buffers) {
+    VLOG(2) << "Checking buffers";
     reference_output = GetReferenceOutput(candidates, *input_buffers);
     if (!reference_output.has_value()) {
       LOG(WARNING) << "No reference output found even though buffer checking "
@@ -605,6 +606,8 @@ std::optional<ScopedShapedBuffer> Autotuner::GetReferenceOutput(
       continue;
     }
     if (profile_result.value().output_buffer.has_value()) {
+      VLOG(2) << "Found reference output for config: "
+              << candidate.config.ToString();
       return std::move(profile_result.value().output_buffer.value());
     }
   }
@@ -732,4 +735,28 @@ std::string Autotuner::Config::ToString() const {
                          UnpackedAnyShortDebugString(*backend_config));
 }
 
+std::string AutotuneConfig::ToString() const {
+  return absl::StrFormat(
+      "{\n"
+      "  \"check_buffers\": %s,\n"
+      "  \"relative_tolerance\": %f,\n"
+      "  \"crash_on_check_failure\": %s,\n"
+      "  \"optimize_scratch_bytes\": %s,\n"
+      "  \"scratch_bytes_window_size_us\": %d,\n"
+      "  \"expect_all_instructions_in_cache\": %s,\n"
+      "  \"dump_logs_to\": \"%s\",\n"
+      "  \"exclude_cublas_config\": %s,\n"
+      "  \"select_first_config\": %s,\n"
+      "  \"use_default_config\": %s,\n"
+      "  \"dump_hlos\": %s\n"
+      "}",
+      check_buffers ? "true" : "false", relative_tolerance,
+      crash_on_check_failure ? "true" : "false",
+      optimize_scratch_bytes ? "true" : "false", scratch_bytes_window_size_us,
+      expect_all_instructions_in_cache ? "true" : "false", dump_logs_to,
+      exclude_cublas_config ? "true" : "false",
+      select_first_config ? "true" : "false",
+      use_default_config ? "true" : "false", dump_hlos ? "true" : "false");
+}
+
 }  // namespace xla
diff --git a/third_party/xla/xla/backends/autotuner/autotuner.h b/third_party/xla/xla/backends/autotuner/autotuner.h
index eb15e516070b22..bec84c70609d33 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner.h
+++ b/third_party/xla/xla/backends/autotuner/autotuner.h
@@ -87,6 +87,8 @@ struct AutotuneConfig {
   // If true, dump the autotuned instructions to the modules's xla_dump_to or
   // to stdout if not set.
   bool dump_hlos = false;
+
+  std::string ToString() const;
 };
 
 class Autotuner {
diff --git a/third_party/xla/xla/backends/autotuner/autotuner_test.cc b/third_party/xla/xla/backends/autotuner/autotuner_test.cc
index 449d837c23334d..1ea8b49cff2ddf 100644
--- a/third_party/xla/xla/backends/autotuner/autotuner_test.cc
+++ b/third_party/xla/xla/backends/autotuner/autotuner_test.cc
@@ -986,5 +986,36 @@ TEST_F(AutotunerTest, DumpHlos) {
           MatchesRegex(".*\\.test_module\\.autotuner_1\\.add\\.before\\.txt")));
 }
 
+TEST(AutotuneConfigTest, ToString) {
+  AutotuneConfig config;
+  config.check_buffers = true;
+  config.relative_tolerance = 1e-4;
+  config.crash_on_check_failure = false;
+  config.optimize_scratch_bytes = true;
+  config.scratch_bytes_window_size_us = 10;
+  config.expect_all_instructions_in_cache = false;
+  config.dump_logs_to = "/tmp/log";
+  config.exclude_cublas_config = true;
+  config.select_first_config = false;
+  config.use_default_config = true;
+  config.dump_hlos = false;
+
+  std::string expected =
+      "{\n"
+      "  \"check_buffers\": true,\n"
+      "  \"relative_tolerance\": 0.000100,\n"
+      "  \"crash_on_check_failure\": false,\n"
+      "  \"optimize_scratch_bytes\": true,\n"
+      "  \"scratch_bytes_window_size_us\": 10,\n"
+      "  \"expect_all_instructions_in_cache\": false,\n"
+      "  \"dump_logs_to\": \"/tmp/log\",\n"
+      "  \"exclude_cublas_config\": true,\n"
+      "  \"select_first_config\": false,\n"
+      "  \"use_default_config\": true,\n"
+      "  \"dump_hlos\": false\n"
+      "}";
+  EXPECT_EQ(config.ToString(), expected);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc b/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
index d053517780f951..630899b170c5d2 100644
--- a/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
+++ b/third_party/xla/xla/service/gpu/autotuning/autotuner_pass.cc
@@ -103,6 +103,7 @@ absl::StatusOr<std::unique_ptr<AutotunerPass>> AutotunerPass::Create(
   bool is_deviceless = stream_executor == nullptr;
   AutotuneConfig autotune_config =
       GetAutotuneConfig(debug_options, is_deviceless, optimize_scratch_bytes);
+  VLOG(1) << "Autotune config: " << autotune_config.ToString();
 
   if (!is_deviceless) {
     profiler = GpuProfiler::Create(

From d0b7f40548c9fa54b9eea1d71e24c49b67933984 Mon Sep 17 00:00:00 2001
From: deeptanshusekhri <deeptanshu.sekhri@arm.com>
Date: Mon, 22 Dec 2025 12:10:39 +0000
Subject: [PATCH 667/753] [tosa] : fixing dynamic batch handling in
 FullyConnected legalization (#106638)

---
 .../mlir/tosa/tests/tfl-to-tosa-pipeline.mlir     | 15 +++++++++++++++
 .../compiler/mlir/tosa/transforms/legalize_tfl.cc |  5 ++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
index 7e4573aa5a09e4..78e616d8967bb6 100644
--- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
+++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
@@ -3282,6 +3282,21 @@ func.func @test_fullyconnected_dynamic_output(%arg0: tensor<1x2048xf32>, %arg1:
 
 // -----
 
+// CHECK-LABEL: @test_fullyconnected_dynamic_batch
+func.func @test_fullyconnected_dynamic_batch(%arg0: tensor<?x512xf32>, %arg1: tensor<256x512xf32>, %arg2: tensor<256xf32>) -> tensor<?x256xf32> {
+  // CHECK-DAG: %[[OUT_SHAPE:.*]] = tosa.const_shape  {values = dense<[-1, 256]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  // CHECK-DAG: %[[FILTER_SHAPE:.*]] = tosa.const_shape  {values = dense<[256, 1, 1, 512]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  // CHECK-DAG: %[[IN_SHAPE:.*]] = tosa.const_shape  {values = dense<[-1, 1, 1, 512]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  // CHECK: %[[RESHAPE_IN:.*]] = tosa.reshape %arg0, %[[IN_SHAPE]]
+  // CHECK: %[[RESHAPE_FILTER:.*]] = tosa.reshape %arg1, %[[FILTER_SHAPE]]
+  // CHECK: %[[CONV:.*]] = tosa.conv2d %[[RESHAPE_IN]], %[[RESHAPE_FILTER]], %arg2, {{.*}}, {{.*}}
+  // CHECK: tosa.reshape %[[CONV]], %[[OUT_SHAPE]]
+  %0 = "tfl.fully_connected"(%arg0, %arg1, %arg2) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<?x512xf32>, tensor<256x512xf32>, tensor<256xf32>) -> tensor<?x256xf32>
+  func.return %0 : tensor<?x256xf32>
+}
+
+// -----
+
 // CHECK-LABEL: @test_fullyconnected_keep_dims
 func.func @test_fullyconnected_keep_dims(%arg0: tensor<1x64x64x768x!quant.uniform<i8:f32, 0.13852123916149139:5>>, %arg1: tensor<3072x768x!quant.uniform<i8<-127:127>:f32, 0.003333511995151639>>, %arg2: tensor<3072x!quant.uniform<i32:f32, 4.6176221803762019E-4>>) -> tensor<1x64x64x3072x!quant.uniform<i8:f32, 0.1022367924451828:45>> {
     // CHECK-DAG: %[[CONST_SHAPE0:.*]] = tosa.const_shape  {values = dense<[1, 64, 64, 3072]> : tensor<4xindex>}
diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
index d9c20cb9ab67b7..b5e19e35e9d40a 100644
--- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
+++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc
@@ -2336,7 +2336,10 @@ LogicalResult ConvertTFLFullyConnectedOp::matchAndRewrite(
   // shape[1].
   if (input_type.getRank() != 2) {
     int64_t num_elems = filter_type.getShape()[1];
-    int64_t num_batch = input_type.getNumElements() / num_elems;
+    int64_t num_batch = ShapedType::kDynamic;
+    if (input_type.hasStaticShape()) {
+      num_batch = input_type.getNumElements() / num_elems;
+    }
     SmallVector<int64_t, 2> shape_vals({num_batch, num_elems});
 
     RankedTensorType reshape_type =

From 2f90852c17800308bc406ac1ea7de4cf9e78fd01 Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Mon, 22 Dec 2025 04:50:10 -0800
Subject: [PATCH 668/753] [XLA:GPU] Remove TF_ prefix from RETURN_IF_ERROR and
 ASSIGN_OR_RETURN macros.

PiperOrigin-RevId: 847716343
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  1 +
 .../gpu/runtime/ragged_all_to_all_thunk.cc    | 91 +++++++++----------
 2 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 5b67961fdae183..de324ce85f41f5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1521,6 +1521,7 @@ cc_library(
         "//xla/stream_executor:memory_allocation",
         "//xla/stream_executor:stream",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
index f6d65a9a110d13..39b735c1381572 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -91,9 +92,9 @@ absl::Status LoadRaggedTensorMetadata(
     se::Stream& stream, absl::Span<DeviceBufferPair const> buffers,
     absl::Span<int64_t* const> ragged_metadata_allocs) {
   for (int64_t i = 0; i < kNumRaggedMetadataOperands; ++i) {
-    TF_RETURN_IF_ERROR(stream.Memcpy(ragged_metadata_allocs[i],
-                                     buffers[i + 2].source_buffer,
-                                     buffers[i + 2].source_buffer.size()));
+    RETURN_IF_ERROR(stream.Memcpy(ragged_metadata_allocs[i],
+                                  buffers[i + 2].source_buffer,
+                                  buffers[i + 2].source_buffer.size()));
   }
 
   // Wait for the copies to complete.
@@ -111,7 +112,7 @@ absl::Status RunAllToAllOnIndexBuffer(
     const se::DeviceAddressBase& source_buffer, int64_t num_updates_per_replica,
     const se::DeviceAddressBase& destination_buffer, PrimitiveType element_type,
     se::Stream& stream, Communicator& comm) {
-  TF_ASSIGN_OR_RETURN(int32_t num_ranks, comm.NumRanks());
+  ASSIGN_OR_RETURN(int32_t num_ranks, comm.NumRanks());
 
   auto* gpu_comm = tsl::down_cast<GpuCommunicator*>(&comm);
   Future<> future = gpu_comm->GroupExecute(
@@ -125,18 +126,18 @@ absl::Status RunAllToAllOnIndexBuffer(
           se::DeviceAddressBase recv_slice =
               GpuCollectives::Slice(destination_buffer, element_type, offset,
                                     /*count=*/num_updates_per_replica);
-          TF_RETURN_IF_ERROR(comm->LaunchSend(send_slice, element_type,
-                                              /*count=*/num_updates_per_replica,
-                                              RankId(peer),
-                                              GpuCollectives::On(stream)));
-          TF_RETURN_IF_ERROR(comm->LaunchRecv(recv_slice, element_type,
-                                              /*count=*/num_updates_per_replica,
-                                              RankId(peer),
-                                              GpuCollectives::On(stream)));
+          RETURN_IF_ERROR(comm->LaunchSend(send_slice, element_type,
+                                           /*count=*/num_updates_per_replica,
+                                           RankId(peer),
+                                           GpuCollectives::On(stream)));
+          RETURN_IF_ERROR(comm->LaunchRecv(recv_slice, element_type,
+                                           /*count=*/num_updates_per_replica,
+                                           RankId(peer),
+                                           GpuCollectives::On(stream)));
         }
         return absl::OkStatus();
       });
-  TF_RETURN_IF_ERROR(future.Await());
+  RETURN_IF_ERROR(future.Await());
   return stream.BlockHostUntilDone();
 }
 
@@ -149,7 +150,7 @@ absl::Status RunRaggedAllToAll(
   int device_ordinal = stream.parent()->device_ordinal();
   XLA_VLOG_DEVICE(3, device_ordinal)
       << "Performing ragged-all-to-all from device ordinal: " << device_ordinal;
-  TF_ASSIGN_OR_RETURN(int32_t num_ranks, comm.NumRanks());
+  ASSIGN_OR_RETURN(int32_t num_ranks, comm.NumRanks());
 
   std::vector<DeviceBufferPair> buffers = original_buffers;
 
@@ -161,13 +162,13 @@ absl::Status RunRaggedAllToAll(
   // local output buffer. To get the correct offsets we perform an AllToAll on
   // the output_offsets buffer.
   DeviceBufferPair& output_offsets_buffer_pair = buffers[4];
-  TF_RETURN_IF_ERROR(RunAllToAllOnIndexBuffer(
+  RETURN_IF_ERROR(RunAllToAllOnIndexBuffer(
       output_offsets_buffer_pair.source_buffer, num_updates_per_replica,
       output_offsets_device_buffer, output_offsets_buffer_pair.element_type,
       stream, comm));
   output_offsets_buffer_pair.source_buffer = output_offsets_device_buffer;
 
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       LoadRaggedTensorMetadata(stream, buffers, ragged_metadata_allocs));
 
   const int64_t* input_offsets = ragged_metadata_allocs[0];
@@ -198,12 +199,12 @@ absl::Status RunRaggedAllToAll(
                 output_offsets[idx] * ragged_row_element_size,
                 recv_sizes[idx] * ragged_row_element_size);
 
-            TF_RETURN_IF_ERROR(
+            RETURN_IF_ERROR(
                 comm->LaunchSend(send_slice, element_type,
                                  send_sizes[idx] * ragged_row_element_size,
                                  RankId(peer), GpuCollectives::On(stream)));
 
-            TF_RETURN_IF_ERROR(
+            RETURN_IF_ERROR(
                 comm->LaunchRecv(recv_slice, element_type,
                                  recv_sizes[idx] * ragged_row_element_size,
                                  RankId(peer), GpuCollectives::On(stream)));
@@ -246,7 +247,7 @@ RendezvousBeforeKernelStart(absl::string_view name,
   // Record that this device has started the memcpy ragged-all-to-all. We do
   // this before the rendezvous to make sure that RecordEvent is called before
   // WaitFor on another stream.
-  TF_RETURN_IF_ERROR(stream.RecordEvent(start_event));
+  RETURN_IF_ERROR(stream.RecordEvent(start_event));
 
   auto rendezvous_fn = [](absl::Span<const RendezvousValue* const> values) {
     std::vector<RendezvousValue> values_copy;
@@ -262,7 +263,7 @@ RendezvousBeforeKernelStart(absl::string_view name,
   std::string start_rendezvous_key =
       absl::StrFormat("start %s ragged-all-to-all for rank %d, clique %s", name,
                       rank.value(), clique_key.ToString());
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
       Rendezvous<std::vector<RendezvousValue>>(
           /*name=*/
@@ -273,7 +274,7 @@ RendezvousBeforeKernelStart(absl::string_view name,
   // Wait for all devices to reach the start event. This indicates that all
   // output buffers are ready for transfer.
   for (auto& value : *rendezvous_values) {
-    TF_RETURN_IF_ERROR(stream.WaitFor(value.start_event));
+    RETURN_IF_ERROR(stream.WaitFor(value.start_event));
   }
 
   return rendezvous_values;
@@ -286,21 +287,21 @@ absl::Status RendezvousAfterKernelFinish(
     int64_t num_ranks, se::Stream& stream, se::Event* end_event,
     const std::shared_ptr<std::vector<RendezvousValue>>& rendezvous_values) {
   // Record that this device has finished the memcpy ragged-all-to-all.
-  TF_RETURN_IF_ERROR(stream.RecordEvent(end_event));
+  RETURN_IF_ERROR(stream.RecordEvent(end_event));
 
   // Do another rendezvous to make sure that we call RecordEvent for end_event
   // before WaitFor on another stream.
   std::string finish_rendezvous_key =
       absl::StrFormat("finish %s ragged-all-to-all for rank %d, clique %s",
                       name, rank.value(), clique_key.ToString());
-  TF_RETURN_IF_ERROR(Rendezvous(/*name=*/finish_rendezvous_key,
-                                /*key=*/clique_key,
-                                /*num_threads=*/num_ranks));
+  RETURN_IF_ERROR(Rendezvous(/*name=*/finish_rendezvous_key,
+                             /*key=*/clique_key,
+                             /*num_threads=*/num_ranks));
 
   // Wait for all devices to reach the end event. This indicates that all
   // updates from other devices have arrived.
   for (auto& value : *rendezvous_values) {
-    TF_RETURN_IF_ERROR(stream.WaitFor(value.end_event));
+    RETURN_IF_ERROR(stream.WaitFor(value.end_event));
   }
 
   return absl::OkStatus();
@@ -324,7 +325,7 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
   se::DeviceAddressBase input_buffer = buffers[0].source_buffer;
   se::DeviceAddressBase output_buffer = buffers[1].destination_buffer;
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
       RendezvousBeforeKernelStart(
           /*name=*/"one-shot", clique_key, rank, num_ranks, output_buffer,
@@ -337,7 +338,7 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
     output_ptrs.push_back(value.output_buffer);
   }
 
-  TF_RETURN_IF_ERROR(RunRaggedAllToAllKernel(
+  RETURN_IF_ERROR(RunRaggedAllToAllKernel(
       &stream, element_type, input_buffer, output_ptrs,
       buffers[2].source_buffer, buffers[3].source_buffer,
       buffers[4].source_buffer, num_ranks, num_updates_per_replica,
@@ -370,7 +371,7 @@ RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
   auto status = [&instr]() -> absl::Status {
     for (HloInstruction* operand : instr->operands()) {
       Shape shape = operand->shape();
-      TF_RETURN_IF_ERROR(IsValidOperand(shape, Thunk::kRaggedAllToAll));
+      RETURN_IF_ERROR(IsValidOperand(shape, Thunk::kRaggedAllToAll));
     }
 
     if (!ShapeUtil::IsEffectivelyMostMajorDimension(instr->shape(), 0)) {
@@ -399,7 +400,7 @@ RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
 
 absl::Status RaggedAllToAllStartThunk::Initialize(
     const InitializeParams& params) {
-  TF_RETURN_IF_ERROR(CollectiveThunk::Initialize(params));
+  RETURN_IF_ERROR(CollectiveThunk::Initialize(params));
   device_count_ = params.local_device_count;
 
   se::StreamExecutor* executor = params.executor;
@@ -414,7 +415,7 @@ absl::Status RaggedAllToAllStartThunk::Initialize(
     }
   }
 
-  TF_ASSIGN_OR_RETURN(
+  ASSIGN_OR_RETURN(
       const GpuCliqueKey clique_key,
       GetCollectiveGpuCliqueKey(*params.collective_params, config_.config));
   const std::optional<RankId> rank =
@@ -426,9 +427,9 @@ absl::Status RaggedAllToAllStartThunk::Initialize(
   // Allocate temp buffers in the host memory to load the sizes and offsets of
   // ragged tensors from device memory.
   for (int64_t i = 0; i < kNumRaggedMetadataOperands; ++i) {
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<se::MemoryAllocation> alloc,
-                        executor->HostMemoryAllocate(config_.num_total_updates *
-                                                     sizeof(int64_t)));
+    ASSIGN_OR_RETURN(std::unique_ptr<se::MemoryAllocation> alloc,
+                     executor->HostMemoryAllocate(config_.num_total_updates *
+                                                  sizeof(int64_t)));
     state->host_buffer_allocs.push_back(std::move(alloc));
   }
 
@@ -441,8 +442,8 @@ absl::Status RaggedAllToAllStartThunk::Initialize(
   }
 
   if (is_local()) {
-    TF_ASSIGN_OR_RETURN(state->start_event, executor->CreateEvent());
-    TF_ASSIGN_OR_RETURN(state->end_event, executor->CreateEvent());
+    ASSIGN_OR_RETURN(state->start_event, executor->CreateEvent());
+    ASSIGN_OR_RETURN(state->end_event, executor->CreateEvent());
   }
 
   {
@@ -470,16 +471,14 @@ bool RaggedAllToAllStartThunk::is_local() const {
 absl::StatusOr<bool> RaggedAllToAllStartThunk::RunCollective(
     const ExecuteParams& params, const GpuCliqueKey& clique_key,
     se::Stream& stream, Communicator& comm) {
-  TF_ASSIGN_OR_RETURN(
-      std::vector<DeviceBufferPair> device_buffers,
-      ConvertToDeviceBuffers(params, buffers_,
-                             config_.config.operand_element_type));
+  ASSIGN_OR_RETURN(std::vector<DeviceBufferPair> device_buffers,
+                   ConvertToDeviceBuffers(params, buffers_,
+                                          config_.config.operand_element_type));
 
-  TF_ASSIGN_OR_RETURN(int32_t num_ranks, comm.NumRanks());
+  ASSIGN_OR_RETURN(int32_t num_ranks, comm.NumRanks());
 
-  TF_ASSIGN_OR_RETURN(
-      bool peer_access_enabled,
-      params.collective_cliques->peer_access_enabled(clique_key));
+  ASSIGN_OR_RETURN(bool peer_access_enabled,
+                   params.collective_cliques->peer_access_enabled(clique_key));
 
   StreamState* state = nullptr;
   {
@@ -493,7 +492,7 @@ absl::StatusOr<bool> RaggedAllToAllStartThunk::RunCollective(
                                       device_buffers[0].element_type);
 
   if (should_use_one_shot_kernel) {
-    TF_RETURN_IF_ERROR(
+    RETURN_IF_ERROR(
         RunOneShotRaggedAllToAll(clique_key, stream, *state, device_buffers));
     return false;
   }
@@ -507,7 +506,7 @@ absl::StatusOr<bool> RaggedAllToAllStartThunk::RunCollective(
         reinterpret_cast<int64_t*>(state->host_buffer_allocs[i]->opaque()));
   }
 
-  TF_RETURN_IF_ERROR(
+  RETURN_IF_ERROR(
       RunRaggedAllToAll(config_.num_row_elements, config_.num_total_updates,
                         device_buffers, stream, comm, ragged_metadata_allocs,
                         state->output_offsets_device_buffer.memory(),

From 12502acbf569675fbdec61b305a1f4e169decdf9 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Mon, 22 Dec 2025 06:18:09 -0800
Subject: [PATCH 669/753] Remove unnecessary if_gpu_is_configured from Triton
 tests.

The tests in xla/backends/gpu/codegen/triton/BUILD are already configured to run only on specific GPU backends, making the if_gpu_is_configured check on the srcs redundant.

PiperOrigin-RevId: 847738574
---
 .../xla/xla/backends/gpu/codegen/triton/BUILD     | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index ce2fae5553f1e1..ea9866b1695c74 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -3,7 +3,6 @@ load("//xla:xla.default.bzl", "xla_cc_test")
 load(
     "//xla/stream_executor:build_defs.bzl",
     "if_cuda_or_rocm_is_configured",
-    "if_gpu_is_configured",
 )
 load("//xla/tests:build_defs.bzl", "xla_test")
 load("//xla/tsl:tsl.bzl", "if_google")
@@ -541,7 +540,7 @@ xla_cc_test(
 
 xla_test(
     name = "triton_gemm_fusion_test",
-    srcs = if_gpu_is_configured(["triton_gemm_fusion_test.cc"]),
+    srcs = ["triton_gemm_fusion_test.cc"],
     backends = [
         "a100",
         "h100",
@@ -595,7 +594,7 @@ xla_test(
 xla_test(
     name = "fusion_emitter_int4_device_test",
     size = "large",
-    srcs = if_gpu_is_configured(["fusion_emitter_int4_device_test.cc"]),
+    srcs = ["fusion_emitter_int4_device_test.cc"],
     backends = [
         "a100",
         "h100",
@@ -692,7 +691,7 @@ xla_test(
 
 xla_test(
     name = "fusion_emitter_device_test",
-    srcs = if_gpu_is_configured(["fusion_emitter_device_test.cc"]),
+    srcs = ["fusion_emitter_device_test.cc"],
     backends = [
         "a100",
         "h100",
@@ -799,7 +798,7 @@ cc_library(
 xla_test(
     name = "fusion_emitter_large_test",
     size = "large",
-    srcs = if_gpu_is_configured(["fusion_emitter_large_test.cc"]),
+    srcs = ["fusion_emitter_large_test.cc"],
     backends = [
         "a100",
         "h100",
@@ -828,7 +827,7 @@ xla_test(
 
 xla_test(
     name = "fusion_emitter_parametrized_test",
-    srcs = if_gpu_is_configured(["fusion_emitter_parametrized_test.cc"]),
+    srcs = ["fusion_emitter_parametrized_test.cc"],
     backends = [
         "a100",
         "h100",
@@ -857,7 +856,7 @@ xla_test(
 
 xla_cc_test(
     name = "fusion_emitter_shared_dialect_test",
-    srcs = if_gpu_is_configured(["fusion_emitter_shared_dialect_test.cc"]),
+    srcs = ["fusion_emitter_shared_dialect_test.cc"],
     # TODO(b/353912594): this test does not need to run on GPU, but it is broken on CPU in OSS.
     # Force it to run on GPU temporarily in order to get important OSS coverage.
     tags = [
@@ -948,7 +947,7 @@ xla_cc_test(
 
 xla_test(
     name = "support_legacy_test",
-    srcs = if_gpu_is_configured(["support_legacy_test.cc"]),
+    srcs = ["support_legacy_test.cc"],
     backends = [
         "a100",
         "h100",

From 4d0edd395fce495b07f1b6a38901808ebbe80e76 Mon Sep 17 00:00:00 2001
From: Kanish Anand <kanishanand@google.com>
Date: Mon, 22 Dec 2025 07:00:54 -0800
Subject: [PATCH 670/753] Refactor `std::optional` comparison in
 `ReshapeSharding` tests

PiperOrigin-RevId: 847749800
---
 .../xla/hlo/utils/hlo_sharding_util_test.cc   | 92 +++++++------------
 1 file changed, 32 insertions(+), 60 deletions(-)

diff --git a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
index 27b6f4ee1f9076..c6b95c58cf8b54 100644
--- a/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
+++ b/third_party/xla/xla/hlo/utils/hlo_sharding_util_test.cc
@@ -196,8 +196,7 @@ TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned1) {
       HloSharding::PartialTile(TileAssignment({2, 2, 3}, {3, 2, 2}, {1, 2, 0}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned2) {
@@ -208,8 +207,7 @@ TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned2) {
       HloSharding::PartialTile(TileAssignment({2, 2, 3}, {2, 3, 2}, {0, 2, 1}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned3) {
@@ -220,8 +218,7 @@ TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned3) {
       HloSharding::PartialTile(TileAssignment({4, 3}, {2, 3, 2}, {0, 2, 1}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned4) {
@@ -232,8 +229,7 @@ TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned4) {
       HloSharding::PartialTile(TileAssignment({2, 2, 3}, {3, 4}, {1, 0}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned5) {
@@ -243,8 +239,7 @@ TEST(HloShardingUtilTest, ReshapeShardingDimensionSizeOnePartitioned5) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 3, 2, 2});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingMaximal) {
@@ -253,8 +248,7 @@ TEST(HloShardingUtilTest, ReshapeShardingMaximal) {
   HloSharding sharding = HloSharding::AssignDevice(7);
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), sharding);
+  EXPECT_EQ(result, sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledInvalid) {
@@ -263,7 +257,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledInvalid) {
   HloSharding sharding = HloSharding::IotaTile({1, 2, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, sharding);
-  EXPECT_FALSE(result.has_value());
+  ASSERT_FALSE(result.has_value());
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledMerge) {
@@ -273,8 +267,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledMerge) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) {
@@ -284,8 +277,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 1, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledSplit2) {
@@ -295,8 +287,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledSplit2) {
   HloSharding output_sharding = HloSharding::IotaTile({4, 4, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledSplit3) {
@@ -307,8 +298,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledSplit3) {
       HloSharding::PartialTile(TileAssignment({2, 1, 2}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) {
@@ -318,8 +308,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 1, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledArbitraryMinorDimensions) {
@@ -328,8 +317,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledArbitraryMinorDimensions) {
   HloSharding sharding = HloSharding::IotaTile({2, 1, 1, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), sharding);
+  EXPECT_EQ(result, sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTiledTrivialDimensions) {
@@ -339,8 +327,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTiledTrivialDimensions) {
   HloSharding output_sharding = HloSharding::IotaTile({1, 2, 1, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTrivialDimensionInsertedToEnd) {
@@ -350,16 +337,14 @@ TEST(HloShardingUtilTest, ReshapeShardingTrivialDimensionInsertedToEnd) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 1, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, NoopReshapeShardingEmptyTile) {
   Shape shape = ShapeUtil::MakeShape(F32, {7, 1, 1});
   HloSharding sharding = HloSharding::IotaTile({2, 1, 1});
   std::optional<HloSharding> result = ReshapeSharding(shape, shape, sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), sharding);
+  EXPECT_EQ(result, sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingScalar) {
@@ -368,7 +353,7 @@ TEST(HloShardingUtilTest, ReshapeShardingScalar) {
   HloSharding sharding = HloSharding::IotaTile({2, 1, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, sharding);
-  EXPECT_FALSE(result.has_value());
+  ASSERT_FALSE(result.has_value());
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne1) {
@@ -379,12 +364,10 @@ TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne1) {
 
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 
   result = ReshapeSharding(output_shape, input_shape, output_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), input_sharding);
+  EXPECT_EQ(result, input_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne2) {
@@ -395,8 +378,7 @@ TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne2) {
       HloSharding::PartialTile(TileAssignment({4, 2, 8}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne3) {
@@ -406,8 +388,7 @@ TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne3) {
   HloSharding output_sharding = HloSharding::IotaTile({4, 2, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne4) {
@@ -418,8 +399,7 @@ TEST(HloShardingUtilTest, ReshapeShardingSuffixShapeSizeOne4) {
       HloSharding::PartialTile(TileAssignment({4, 2, 4}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingPrefixShapeSizeOne1) {
@@ -429,12 +409,10 @@ TEST(HloShardingUtilTest, ReshapeShardingPrefixShapeSizeOne1) {
   HloSharding output_sharding = HloSharding::IotaTile({1, 4});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 
   result = ReshapeSharding(output_shape, input_shape, output_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), input_sharding);
+  EXPECT_EQ(result, input_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingPrefixShapeSizeOne2) {
@@ -444,12 +422,10 @@ TEST(HloShardingUtilTest, ReshapeShardingPrefixShapeSizeOne2) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 1});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 
   result = ReshapeSharding(output_shape, input_shape, output_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), input_sharding);
+  EXPECT_EQ(result, input_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTranspose1) {
@@ -458,8 +434,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTranspose1) {
   HloSharding sharding = HloSharding::IotaTile({2, 1, 5});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), sharding);
+  EXPECT_EQ(result, sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTranspose2) {
@@ -469,8 +444,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTranspose2) {
   HloSharding output_sharding = HloSharding::IotaTile({2, 1, 13});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  ASSERT_TRUE(result.has_value());
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTranspose3) {
@@ -479,7 +453,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTranspose3) {
   HloSharding input_sharding = HloSharding::IotaTile({1, 1, 5});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_FALSE(result.has_value());
+  ASSERT_FALSE(result.has_value());
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingTranspose4) {
@@ -490,8 +464,7 @@ TEST(HloShardingUtilTest, ReshapeShardingTranspose4) {
       HloSharding::PartialTile(TileAssignment({1, 1, 5, 1, 1, 1, 13}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingWithPadding1) {
@@ -500,7 +473,7 @@ TEST(HloShardingUtilTest, ReshapeShardingWithPadding1) {
   HloSharding input_sharding = HloSharding::IotaTile({8});
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_FALSE(result.has_value());
+  ASSERT_FALSE(result.has_value());
 }
 
 TEST(HloShardingUtilTest, ReshapeShardingWithPadding2) {
@@ -511,8 +484,7 @@ TEST(HloShardingUtilTest, ReshapeShardingWithPadding2) {
       HloSharding::PartialTile(TileAssignment({4, 2}));
   std::optional<HloSharding> result =
       ReshapeSharding(input_shape, output_shape, input_sharding);
-  EXPECT_TRUE(result.has_value());
-  EXPECT_EQ(result.value(), output_sharding);
+  EXPECT_EQ(result, output_sharding);
 }
 
 TEST(HloShardingUtilTest, PropagateReshapeShardingTranspose1) {

From 3ea706cab38f3e0cc873e921c1ef11a39bbc7fef Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Mon, 22 Dec 2025 07:04:01 -0800
Subject: [PATCH 671/753] Add --xla_gpu_experimental_autotune_backends to allow
 for selecting backends. This change for the new autotuner. The new autotuner
 with its Triton backend competes with cuDNN fusions leading to flaky tests.
 Also some tests disable some autotuning paths via
 --xla_gpu_cudnn_gemm_fusion_level or --xla_gpu_cublas_fallback which are not
 fully compatible with the new autotuner. Other tests rely on the order of the
 backends, which would be resolved by adding a backend selection mechanism.

PiperOrigin-RevId: 847750954
---
 third_party/xla/xla/debug_options_flags.cc    | 33 +++++++++++++++++++
 .../xla/xla/debug_options_parsers_test.cc     | 32 ++++++++++++++++++
 third_party/xla/xla/xla.proto                 | 16 +++++++--
 3 files changed, 79 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/debug_options_flags.cc b/third_party/xla/xla/debug_options_flags.cc
index 5ad2bb235fd49c..8b05ff60afbc84 100644
--- a/third_party/xla/xla/debug_options_flags.cc
+++ b/third_party/xla/xla/debug_options_flags.cc
@@ -474,6 +474,15 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
   opts.set_xla_gpu_experimental_scaled_dot_with_triton(false);
   opts.set_xla_gpu_experimental_use_raft_select_k(false);
 
+  opts.add_xla_gpu_experimental_autotune_backends(
+      DebugOptions::AUTOTUNE_BACKEND_CUDNN);
+  opts.add_xla_gpu_experimental_autotune_backends(
+      DebugOptions::AUTOTUNE_BACKEND_TRITON);
+  opts.add_xla_gpu_experimental_autotune_backends(
+      DebugOptions::AUTOTUNE_BACKEND_CUBLAS);
+  opts.add_xla_gpu_experimental_autotune_backends(
+      DebugOptions::AUTOTUNE_BACKEND_CUBLASLT);
+
   opts.set_xla_cpu_collective_call_warn_stuck_seconds(20);
   opts.set_xla_cpu_collective_call_terminate_timeout_seconds(40);
   opts.set_xla_cpu_collective_timeout_seconds(30 * 60);
@@ -713,6 +722,16 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
     return absl::StrJoin(command_types, ", ", Formatter());
   };
 
+  auto autotune_backends_to_string =
+      [](google::protobuf::RepeatedField<int> backends) -> std::string {
+    struct Formatter {
+      void operator()(std::string* out, int type) const {
+        absl::StrAppend(out, DebugOptions::AutotuneBackend_Name(type));
+      }
+    };
+    return absl::StrJoin(backends, ", ", Formatter());
+  };
+
   // Custom "sub-parser" for xla_fuel.  Note that ConsumeFuel does not do any
   // locking on the fuel global variables.  This means that it's
   // illegal/undefined behavior to modify this flag value while the compiler is
@@ -2360,6 +2379,20 @@ void MakeDebugOptionsFlags(std::vector<tsl::Flag>* flag_list,
           &DebugOptions::set_xla_gpu_experimental_autotuner_cache_dir),
       debug_options->xla_gpu_experimental_autotuner_cache_dir(),
       "Experimental: Specify the directory to read/write autotuner cache to."));
+  flag_list->push_back(tsl::Flag(
+      "xla_gpu_experimental_autotune_backends",
+      SetterForRepeatedEnum<DebugOptions::AutotuneBackend>(
+          "xla_gpu_experimental_autotune_backends",
+          /*enum_prefix=*/"AUTOTUNE_BACKEND_",
+          &DebugOptions::AutotuneBackend_Parse,
+          debug_options->mutable_xla_gpu_experimental_autotune_backends()),
+      autotune_backends_to_string(
+          debug_options->xla_gpu_experimental_autotune_backends()),
+      "Backends to enable for autotuning. Comma-separated (no spaces). "
+      "Examples:\n"
+      "  'cudnn,triton' (overwrites defaults)\n"
+      "  '+cudnn,-cublas' (adds/removes from defaults)\n"
+      "Available: cudnn, triton, cublas, cublaslt."));
   flag_list->push_back(tsl::Flag(
       "xla_gpu_gemm_autotuner_override_file",
       string_setter_for(
diff --git a/third_party/xla/xla/debug_options_parsers_test.cc b/third_party/xla/xla/debug_options_parsers_test.cc
index 7bb39cc0915e9c..e09c83fbcbf57f 100644
--- a/third_party/xla/xla/debug_options_parsers_test.cc
+++ b/third_party/xla/xla/debug_options_parsers_test.cc
@@ -507,6 +507,38 @@ TEST(ParseRepeatedEnumFlagsTest, XnnFusionType) {
   TestLibraryFusionType("xnn");
 }
 
+TEST(ParseRepeatedEnumFlagsTest, AutotuneBackend) {
+  DebugOptions debug_options = DefaultDebugOptionsIgnoringFlags();
+  std::vector<tsl::Flag> flag_objects;
+  MakeDebugOptionsFlags(&flag_objects, &debug_options);
+
+  const auto& enabled_backends =
+      debug_options.xla_gpu_experimental_autotune_backends();
+
+  // Check that the default setting is populated.
+  ASSERT_THAT(enabled_backends,
+              ElementsAre(DebugOptions::AUTOTUNE_BACKEND_CUDNN,
+                          DebugOptions::AUTOTUNE_BACKEND_TRITON,
+                          DebugOptions::AUTOTUNE_BACKEND_CUBLAS,
+                          DebugOptions::AUTOTUNE_BACKEND_CUBLASLT));
+
+  // Overwriting the default setting.
+  SetXlaFlagsEnvVar("--xla_gpu_experimental_autotune_backends=cudnn,triton");
+  ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
+  EXPECT_EQ(enabled_backends.size(), 2);
+  EXPECT_THAT(enabled_backends,
+              ElementsAre(DebugOptions::AUTOTUNE_BACKEND_CUDNN,
+                          DebugOptions::AUTOTUNE_BACKEND_TRITON));
+
+  // Adding / removing options from the existing setting.
+  SetXlaFlagsEnvVar("--xla_gpu_experimental_autotune_backends=+cublas,-triton");
+  ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", flag_objects);
+  EXPECT_EQ(enabled_backends.size(), 2);
+  EXPECT_THAT(enabled_backends,
+              ElementsAre(DebugOptions::AUTOTUNE_BACKEND_CUDNN,
+                          DebugOptions::AUTOTUNE_BACKEND_CUBLAS));
+}
+
 TEST(ParseIntRangeInclusiveTest, SingleInteger) {
   IntRangeInclusive range;
   EXPECT_TRUE(ParseIntRangeInclusive("10", range));
diff --git a/third_party/xla/xla/xla.proto b/third_party/xla/xla/xla.proto
index e0d8356b32869b..bd5234e66d22d2 100644
--- a/third_party/xla/xla/xla.proto
+++ b/third_party/xla/xla/xla.proto
@@ -61,6 +61,15 @@ message ThunkBufferDebugFilter {
 // field presence is available to support merging between command-line flags and
 // stored instances. This is enforced via a unit test.
 message DebugOptions {
+  // Enum to define all backends that can be autotuned.
+  enum AutotuneBackend {
+    AUTOTUNE_BACKEND_ALL = 0;
+    AUTOTUNE_BACKEND_CUDNN = 1;
+    AUTOTUNE_BACKEND_TRITON = 2;
+    AUTOTUNE_BACKEND_CUBLAS = 3;
+    AUTOTUNE_BACKEND_CUBLASLT = 4;
+  }
+
   // Enum to define all collective ops
   // that xla supports.
   enum CollectiveOpType {
@@ -306,7 +315,7 @@ message DebugOptions {
   // XLA:GPU options.
   //--------------------------------------------------------------------------//
   // clang-format off
-  // go/keep-sorted start newline_separated=yes skip_lines=2 ignore_prefixes=["optional AutotuneCacheMode","optional bool","optional float","optional int32","optional int64","optional LibNvJitLinkMode","map<string, string>","optional PGLEStrictnessLevel","optional PipelineParallelismOptLevel","repeated CollectiveOpType","repeated CommandBufferCmdType","repeated string","optional ShapeChecks","optional string","optional WhileLoopUnrolling","repeated GenericTritonEmitterFeature","optional CommandBufferSchedulingMode"] // NOLINT
+  // go/keep-sorted start newline_separated=yes skip_lines=2 ignore_prefixes=["optional AutotuneCacheMode","optional bool","optional float","optional int32","optional int64","optional LibNvJitLinkMode","map<string, string>","optional PGLEStrictnessLevel","optional PipelineParallelismOptLevel","repeated CollectiveOpType","repeated CommandBufferCmdType","repeated string","optional ShapeChecks","optional string","optional WhileLoopUnrolling","repeated GenericTritonEmitterFeature","optional CommandBufferSchedulingMode", "repeated AutotuneBackend"] // NOLINT
   // clang-format on
 
   // Command buffer scheduling mode.
@@ -630,6 +639,9 @@ message DebugOptions {
   // up to the HLO optimization stage, before Thunk generation.
   optional bool xla_gpu_experimental_aot_compiled_thunks = 435;
 
+  // List of autotuner backends to enable. If empty, all backends are enabled.
+  repeated AutotuneBackend xla_gpu_experimental_autotune_backends = 442;
+
   // Specifies the behavior of per kernel autotuning cache.
   optional AutotuneCacheMode xla_gpu_experimental_autotune_cache_mode = 324;
 
@@ -1341,7 +1353,7 @@ message DebugOptions {
   // Note: when adding a new flag, please add it to one of the hardware-specific
   // or hardware-agnostic sections at the top of this proto message.
 
-  // Next id: 442
+  // Next id: 443
 
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.

From af38f913d0d10089033ad23bd699e2ab64c5810a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 07:27:26 -0800
Subject: [PATCH 672/753] Automated Code Change

PiperOrigin-RevId: 847756872
---
 .../compiler/mlir/tensorflow/utils/error_util_test.cc     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc
index a7ea08924aea5e..8634afe5fc1498 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util_test.cc
@@ -52,13 +52,13 @@ TEST_F(StatusScopedDiagnosticHandlerTest,
        VerifyDiagnosticsAreCapturedAsUnknownStatus) {
   StatusScopedDiagnosticHandler handler(&context_);
   emitError(loc_) << "Diagnostic message";
-  ASSERT_TRUE(tensorflow::errors::IsUnknown(handler.ConsumeStatus()));
+  ASSERT_TRUE(absl::IsUnknown(handler.ConsumeStatus()));
 }
 
 TEST_F(StatusScopedDiagnosticHandlerTest, VerifyPassedInErrorsArePropagated) {
   const Status err = tensorflow::errors::Internal("Passed in error");
-  ASSERT_TRUE(tensorflow::errors::IsInternal(
-      StatusScopedDiagnosticHandler(&context_).Combine(err)));
+  ASSERT_TRUE(
+      absl::IsInternal(StatusScopedDiagnosticHandler(&context_).Combine(err)));
 }
 
 TEST_F(StatusScopedDiagnosticHandlerTest,
@@ -68,7 +68,7 @@ TEST_F(StatusScopedDiagnosticHandlerTest,
   emitError(loc_) << "Second diagnostic message reported";
   const Status s =
       ssdh.Combine(tensorflow::errors::Internal("Passed in error"));
-  ASSERT_TRUE(tensorflow::errors::IsInternal(s));
+  ASSERT_TRUE(absl::IsInternal(s));
   EXPECT_THAT(s.message(), HasSubstr("Passed in error"));
   EXPECT_THAT(s.message(), HasSubstr("Diagnostic message reported"));
   EXPECT_THAT(s.message(), HasSubstr("Second diagnostic message reported"));

From 3cec0d7b92afcebd3b12252daea9dfa05d8b42b9 Mon Sep 17 00:00:00 2001
From: Oleg Shyshkov <shyshkov@google.com>
Date: Mon, 22 Dec 2025 08:59:00 -0800
Subject: [PATCH 673/753] [XLA:GPU] Clean up RaggedAllToAllStartThunk
 rendezvous helpers.

PiperOrigin-RevId: 847783200
---
 .../gpu/runtime/ragged_all_to_all_thunk.cc    | 78 ++++++++-----------
 .../gpu/runtime/ragged_all_to_all_thunk.h     | 28 +++++++
 2 files changed, 59 insertions(+), 47 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
index 39b735c1381572..75e3ca5dc0eb81 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
@@ -216,38 +216,29 @@ absl::Status RunRaggedAllToAll(
   return future.Await();
 }
 
-// Contains the values that are passed between host threads with rendezvous.
-struct RendezvousValue {
-  RankId rank;
-  se::DeviceAddressBase output_buffer;
-  se::Event* start_event;
-  se::Event* end_event;
-
-  bool operator<(const RendezvousValue& other) const {
-    return rank < other.rank;
-  }
-};
+}  // namespace
 
 // Executes the rendezvous before the kernel start.
 // Inserts CUDA events into the stream to ensure that all devices have reached
 // the start event before the kernel starts.
-absl::StatusOr<std::shared_ptr<std::vector<RendezvousValue>>>
-RendezvousBeforeKernelStart(absl::string_view name,
-                            const GpuCliqueKey& clique_key, RankId rank,
-                            int64_t num_ranks,
-                            const se::DeviceAddressBase& output_buffer,
-                            se::Stream& stream, se::Event* start_event,
-                            se::Event* end_event) {
+absl::StatusOr<
+    std::shared_ptr<std::vector<RaggedAllToAllStartThunk::RendezvousValue>>>
+RaggedAllToAllStartThunk::RendezvousBeforeKernelStart(
+    const GpuCliqueKey& clique_key, se::Stream& stream,
+    const StreamState& state, const se::DeviceAddressBase& output_buffer) {
+  int64_t num_ranks = clique_key.num_local_participants();
+  const RankId& rank = state.rank;
+
   RendezvousValue rendezvous_value;
   rendezvous_value.rank = rank;
   rendezvous_value.output_buffer = output_buffer;
-  rendezvous_value.start_event = start_event;
-  rendezvous_value.end_event = end_event;
+  rendezvous_value.start_event = state.start_event.get();
+  rendezvous_value.end_event = state.end_event.get();
 
   // Record that this device has started the memcpy ragged-all-to-all. We do
   // this before the rendezvous to make sure that RecordEvent is called before
   // WaitFor on another stream.
-  RETURN_IF_ERROR(stream.RecordEvent(start_event));
+  RETURN_IF_ERROR(stream.RecordEvent(state.start_event.get()));
 
   auto rendezvous_fn = [](absl::Span<const RendezvousValue* const> values) {
     std::vector<RendezvousValue> values_copy;
@@ -260,16 +251,13 @@ RendezvousBeforeKernelStart(absl::string_view name,
     return values_copy;
   };
 
-  std::string start_rendezvous_key =
-      absl::StrFormat("start %s ragged-all-to-all for rank %d, clique %s", name,
+  std::string name =
+      absl::StrFormat("start one-shot ragged-all-to-all for rank %d, clique %s",
                       rank.value(), clique_key.ToString());
   ASSIGN_OR_RETURN(
       std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
       Rendezvous<std::vector<RendezvousValue>>(
-          /*name=*/
-          start_rendezvous_key, /*key=*/clique_key,
-          /*value=*/rendezvous_value, /*num_threads=*/num_ranks,
-          rendezvous_fn));
+          name, clique_key, rendezvous_value, num_ranks, rendezvous_fn));
 
   // Wait for all devices to reach the start event. This indicates that all
   // output buffers are ready for transfer.
@@ -282,33 +270,32 @@ RendezvousBeforeKernelStart(absl::string_view name,
 
 // Executes the rendezvous after the kernel finish. Waits for all devices to
 // reach the end event.
-absl::Status RendezvousAfterKernelFinish(
-    absl::string_view name, const GpuCliqueKey& clique_key, RankId rank,
-    int64_t num_ranks, se::Stream& stream, se::Event* end_event,
-    const std::shared_ptr<std::vector<RendezvousValue>>& rendezvous_values) {
+absl::Status RaggedAllToAllStartThunk::RendezvousAfterKernelFinish(
+    const GpuCliqueKey& clique_key, se::Stream& stream,
+    const StreamState& state,
+    const std::vector<RendezvousValue>& rendezvous_values) {
+  int64_t num_ranks = clique_key.num_local_participants();
+  const RankId& rank = state.rank;
+
   // Record that this device has finished the memcpy ragged-all-to-all.
-  RETURN_IF_ERROR(stream.RecordEvent(end_event));
+  RETURN_IF_ERROR(stream.RecordEvent(state.end_event.get()));
 
   // Do another rendezvous to make sure that we call RecordEvent for end_event
   // before WaitFor on another stream.
-  std::string finish_rendezvous_key =
-      absl::StrFormat("finish %s ragged-all-to-all for rank %d, clique %s",
-                      name, rank.value(), clique_key.ToString());
-  RETURN_IF_ERROR(Rendezvous(/*name=*/finish_rendezvous_key,
-                             /*key=*/clique_key,
-                             /*num_threads=*/num_ranks));
+  std::string name = absl::StrFormat(
+      "finish one-shot ragged-all-to-all for rank %d, clique %s", rank.value(),
+      clique_key.ToString());
+  RETURN_IF_ERROR(Rendezvous(name, clique_key, num_ranks));
 
   // Wait for all devices to reach the end event. This indicates that all
   // updates from other devices have arrived.
-  for (auto& value : *rendezvous_values) {
+  for (auto& value : rendezvous_values) {
     RETURN_IF_ERROR(stream.WaitFor(value.end_event));
   }
 
   return absl::OkStatus();
 }
 
-}  // namespace
-
 absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
     const GpuCliqueKey& clique_key, se::Stream& stream,
     const StreamState& state, absl::Span<DeviceBufferPair const> buffers) {
@@ -327,9 +314,7 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
 
   ASSIGN_OR_RETURN(
       std::shared_ptr<std::vector<RendezvousValue>> rendezvous_values,
-      RendezvousBeforeKernelStart(
-          /*name=*/"one-shot", clique_key, rank, num_ranks, output_buffer,
-          stream, state.start_event.get(), state.end_event.get()));
+      RendezvousBeforeKernelStart(clique_key, stream, state, output_buffer));
 
   const int64_t num_updates_per_replica = config_.num_total_updates / num_ranks;
 
@@ -344,9 +329,8 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
       buffers[4].source_buffer, num_ranks, num_updates_per_replica,
       config_.num_input_rows, config_.num_row_elements));
 
-  return RendezvousAfterKernelFinish(
-      /*name=*/"one-shot", clique_key, rank, num_ranks, stream,
-      state.end_event.get(), rendezvous_values);
+  return RendezvousAfterKernelFinish(clique_key, stream, state,
+                                     *rendezvous_values);
 }
 
 RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
index 6a48a5fac956b0..eb46ba6ef3b12d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
@@ -32,6 +32,7 @@ limitations under the License.
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_handle.h"
 #include "xla/stream_executor/event.h"
 #include "xla/stream_executor/memory_allocation.h"
@@ -80,6 +81,18 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
                                      Communicator& comm) override;
 
  private:
+  // Contains the values that are passed between host threads with rendezvous.
+  struct RendezvousValue {
+    RankId rank;
+    se::DeviceAddressBase output_buffer;
+    se::Event* start_event = nullptr;
+    se::Event* end_event = nullptr;
+
+    bool operator<(const RendezvousValue& other) const {
+      return rank < other.rank;
+    }
+  };
+
   struct StreamState {
     int device_ordinal;
     RankId rank;
@@ -103,6 +116,21 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
         : device_ordinal(device_ordinal), rank(rank) {}
   };
 
+  // Executes the rendezvous before the kernel start.
+  // Inserts CUDA events into the stream to ensure that all devices have reached
+  // the start event before the kernel starts.
+  absl::StatusOr<std::shared_ptr<std::vector<RendezvousValue>>>
+  RendezvousBeforeKernelStart(const GpuCliqueKey& clique_key,
+                              se::Stream& stream, const StreamState& state,
+                              const se::DeviceAddressBase& output_buffer);
+
+  // Executes the rendezvous after the kernel finish. Waits for all devices to
+  // reach the end event.
+  absl::Status RendezvousAfterKernelFinish(
+      const GpuCliqueKey& clique_key, se::Stream& stream,
+      const StreamState& state,
+      const std::vector<RendezvousValue>& rendezvous_values);
+
   absl::Status RunOneShotRaggedAllToAll(
       const GpuCliqueKey& clique_key, se::Stream& stream,
       const StreamState& state, absl::Span<DeviceBufferPair const> buffers);

From 573bbe2b4163fb4349c8c662635f99cca40c8406 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 09:45:57 -0800
Subject: [PATCH 674/753] Migrates `builder.create<Op>()` => `Op::create()` in
 tablegen files

PiperOrigin-RevId: 847796796
---
 .../mlir/lite/transforms/legalize_patterns.td        |  4 ++--
 .../mlir/lite/transforms/legalize_tensorlist.td      |  2 +-
 .../mlir/lite/transforms/prepare_patterns.td         |  2 +-
 tensorflow/compiler/mlir/lite/utils/utils.td         |  2 +-
 .../stablehlo/transforms/legalize_tf_patterns.td     |  2 +-
 .../tensorflow/transforms/decompose_resource_ops.td  | 12 ++++++------
 .../compiler/mlir/tensorflow/transforms/lower_tf.td  |  6 +++---
 .../mlir/tf2xla/transforms/legalize_tf_patterns.td   |  2 +-
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td
index 36091686021e2c..26c5496ff3b08b 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td
@@ -75,14 +75,14 @@ def CreateInt32ConstOrCast : NativeCodeCall<
 
 // Creates an int32 constant op from an integer attribute $0.
 def CreateInt32ConstOpFromIntAttr
-  : NativeCodeCall<"$_builder.create<TF::ConstOp>($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast<int32_t>(llvm::cast<IntegerAttr>($0).getInt())}))">;
+  : NativeCodeCall<"TF::ConstOp::create($_builder, $_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast<int32_t>(llvm::cast<IntegerAttr>($0).getInt())}))">;
 
 //===----------------------------------------------------------------------===//
 // Nullary ops patterns.
 //===----------------------------------------------------------------------===//
 
 def createConstOp
-  : NativeCodeCall<"$_builder.create<ConstOp>($_loc, $0.getType(), $1)">;
+  : NativeCodeCall<"ConstOp::create($_builder, $_loc, $0.getType(), $1)">;
 
 def LegalizeTFConstToTFLConst: Pat<(TF_ConstOp:$res ElementsAttr:$value),
                                    (createConstOp $res, $value)>;
diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
index 9894e7df7587f9..ce9b6af564d2a4 100644
--- a/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
+++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tensorlist.td
@@ -20,7 +20,7 @@ include "mlir/IR/OpBase.td"
 include "mlir/Dialect/Arith/IR/ArithOps.td"
 
 def ConstDenseElementsI32ZeroAttr
-  : NativeCodeCall<"$_builder.create<TFL::ConstOp>($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {0}))">;
+  : NativeCodeCall<"TFL::ConstOp::create($_builder, $_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {0}))">;
 
 def Size1InputRange : NativeCodeCall<
   "SmallVector<Value, 1>{$0}">;
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td
index 235ec7d38615fc..d14ee12b7e55a3 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td
@@ -24,7 +24,7 @@ def DenseElementsAttr : ElementsAttrBase<
   "non-opaque constant tensor">;
 
 def CreateGatherNdOp : NativeCodeCall<
-    "$_builder.create<TF::GatherNdOp>($0.getLoc(), $0.getType(), $1, $2, $3)">;
+    "TF::GatherNdOp::create($_builder, $0.getLoc(), $0.getType(), $1, $2, $3)">;
 
 def CreateTFCastOpI32 : NativeCodeCall<
     "CreateTFCastOpI32(&$_builder, $_loc, $0, $1)">;
diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td
index 7583d48618f4fc..d38cf411ea9f2c 100644
--- a/tensorflow/compiler/mlir/lite/utils/utils.td
+++ b/tensorflow/compiler/mlir/lite/utils/utils.td
@@ -136,7 +136,7 @@ def HasSameStaticShapes : Constraint<
     "have the same static shape">;
 
 def CreateNoneValue : NativeCodeCall<
-  "$_builder.create<TFL::NoValueOp>($0.getLoc(), $_builder.getUnitAttr())">;
+  "TFL::NoValueOp::create($_builder, $0.getLoc(), $_builder.getUnitAttr())">;
 
 // Returns shape of a ranked tensor.
 // if called without a ranked tensor it will fail.
diff --git a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td
index 24b1d05bce9735..ce91055db9c666 100644
--- a/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td
+++ b/tensorflow/compiler/mlir/stablehlo/transforms/legalize_tf_patterns.td
@@ -40,7 +40,7 @@ def CastValueToI64: NativeCodeCall<
   "CastValueToI64($0.getLoc(), $1, &$_builder)">;
 
 def CastValueToElementType: NativeCodeCall<
-  "$_builder.create<ConvertOp>($0.getLoc(), $1, "
+  "ConvertOp::create($_builder, $0.getLoc(), $1, "
   "getElementTypeOrSelf($2.getType()))">;
 
 // Here, $0 is an ElementsAttr with exactly one element of type integer. $1 is
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td
index 1fc666da4a8d95..9130ae844bc6b9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td
@@ -27,7 +27,7 @@ def EmptyList: NativeCodeCall<"llvm::SmallVector<mlir::Value>{}">;
 // Creates a tf.ReadVariable op that reads a resource `$2` that has the same
 // element type as `$1`. The op created will use location of `$0`.
 def CreateTFReadVariableOp : NativeCodeCall<
-    "$_builder.create<TF::ReadVariableOp>("
+    "TF::ReadVariableOp::create($_builder, "
     "  $0.getLoc(),"
     "  GetResourceSubtypeOrDefault("
     "    $2, llvm::cast<TensorType>($1.getType()).getElementType()),"
@@ -39,19 +39,19 @@ def CheckHasResourceSubtype : Constraint<CPred<"HasResourceSubtype($0)">>;
 def CreateConstBoolAttrFalse : NativeCodeCall<"$_builder.getBoolAttr(false)">;
 
 def CreateTensorScatterAddOp : NativeCodeCall<
-    "$_builder.create<TF::TensorScatterAddOp>("
+    "TF::TensorScatterAddOp::create($_builder, "
     "$0.getLoc(), $0.getType(), $0, $1, $2, $_builder.getStringAttr(\"\"))">;
 
 def CreateTensorScatterUpdateOp : NativeCodeCall<
-    "$_builder.create<TF::TensorScatterUpdateOp>("
+    "TF::TensorScatterUpdateOp::create($_builder, "
     "$0.getLoc(), $0.getType(), $0, $1, $2, $_builder.getStringAttr(\"\"))">;
 
 def CreateTFReadVariableOpFromResourceHandle : NativeCodeCall<
-    "$_builder.create<TF::ReadVariableOp>("
+    "TF::ReadVariableOp::create($_builder, "
     "$0.getLoc(), GetResourceSubtype($1), $1)">;
 
 def CreateTFSelectOp: NativeCodeCall<
-    "$_builder.create<TF::SelectOp>("
+    "TF::SelectOp::create($_builder, "
     "$0.getLoc(), $3.getType(), $1, $2, $3)">;
 
 def ConstAttrIfThenElse: NativeCodeCall<
@@ -59,7 +59,7 @@ def ConstAttrIfThenElse: NativeCodeCall<
 
 // Convert clamp(lo, x, hi) to clipbyvalue(x, lo, hi).
 def Clamp: NativeCodeCall<
-    "$_builder.create<TF::ClipByValueOp>("
+    "TF::ClipByValueOp::create($_builder, "
     "  $0.getLoc(),"
     "  $2.getType(), $2, $1, $3)">;
 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td
index a9ff5a8f76268a..1061d564f51afc 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td
@@ -37,7 +37,7 @@ class GetF32Scalar<int value> :
 def TrueBoolAttr : AttrConstraint<CPred<"llvm::cast<::mlir::BoolAttr>($_self).getValue()">>;
 
 def CreateTFShapeOp : NativeCodeCall<
-    "$_builder.create<TF::ShapeOp>($0.getLoc(), $1, $2)">;
+    "TF::ShapeOp::create($_builder, $0.getLoc(), $1, $2)">;
 
 def IsI32 : NativeCodeCall<
     "$_builder.getBoolAttr(getElementTypeOrSelf($0.getType()).isInteger(32))">;
@@ -49,11 +49,11 @@ def CreateTFCastOpI32 : NativeCodeCall<
     "CreateTFCastOpI32(&$_builder, $0.getLoc(), $1, $2)">;
 
 def CreateTensorScatterNdOp : NativeCodeCall<
-    "$_builder.create<TF::ScatterNdOp>("
+    "TF::ScatterNdOp::create($_builder, "
     "$0.getLoc(), $0.getType(), $1, $2, $3, $4)">;
 
 def CreateTensorScatterUpdateOp : NativeCodeCall<
-    "$_builder.create<TF::TensorScatterUpdateOp>("
+    "TF::TensorScatterUpdateOp::create($_builder, "
     "$0.getLoc(), $0.getType(), $0, $1, $2, $3)">;
 
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td
index 5507c82bc6f479..957c4887366e16 100644
--- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td
+++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td
@@ -41,7 +41,7 @@ def CastValueToI64: NativeCodeCall<
   "CastValueToI64($0.getLoc(), $1, &$_builder)">;
 
 def CastValueToElementType: NativeCodeCall<
-  "$_builder.create<stablehlo::ConvertOp>($0.getLoc(), $1, "
+  "stablehlo::ConvertOp::create($_builder, $0.getLoc(), $1, "
   "getElementTypeOrSelf($2.getType()))">;
 
 // Here, $0 is an ElementsAttr with exactly one element of type integer. $1 is

From 9ca49fcfa5899ae8a87cfc99e41c49cb34c4f384 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Mon, 22 Dec 2025 10:08:37 -0800
Subject: [PATCH 675/753] Limit CublasDot deterministic test to Cublas
 autotuning backend.

PiperOrigin-RevId: 847803638
---
 third_party/xla/xla/service/gpu/determinism_test.cc | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/determinism_test.cc b/third_party/xla/xla/service/gpu/determinism_test.cc
index 2dc6cb2e0fcdb3..481eb3997c7a26 100644
--- a/third_party/xla/xla/service/gpu/determinism_test.cc
+++ b/third_party/xla/xla/service/gpu/determinism_test.cc
@@ -171,6 +171,10 @@ class DeterminismTest : public GpuCodegenTest {
 };
 
 TEST_F(DeterminismTest, CublasDot) {
+  // This test expects to use Cublas. Disable other backends, including Triton.
+  debug_options_.clear_xla_gpu_experimental_autotune_backends();
+  debug_options_.add_xla_gpu_experimental_autotune_backends(
+      DebugOptions::AUTOTUNE_BACKEND_CUBLAS);
   constexpr absl::string_view kHloText = R"(
 ENTRY e {
   p0 = f32[128,128] parameter(0)

From fec780d7febddc82045efa6923ed00040437de9e Mon Sep 17 00:00:00 2001
From: Byungchul Kim <byungchul@google.com>
Date: Mon, 22 Dec 2025 10:38:48 -0800
Subject: [PATCH 676/753] Set FC's keep_num_dims to false when output dims is
 different from input dims after quantization.

On gemma3n with decode batch > 1, it happens when the embedding is coupled with PLE by einsum.
The export steps are:
1) Initial: BMM([b,2048]x[2048,7680] -> [b,7680])
2) FuseInputReshape_BatchMatMulWithFlattenedRhsDims: BMM([b,2048]x[2048,7680] -> [b,7680])
3) ConvertBatchMatMulOp2FullyConnectedOp_Rank2ConstantRhs: FC([b,2048]x[2048,7680] -> [b,7680])
4) StrictQuantizationPattern(by IsDrqTensor): FC([b,1,2048]x[2048,7680] -> [b,7680])

When FC's keep_num_dims is false and it's followed by reshape op (like gemma3n), keep_num_dims will be set to true later with correct shapes by EnableFullyConnectedKeepNumDimsBeforeReshape.

PiperOrigin-RevId: 847813526
---
 .../compiler/mlir/lite/transforms/quantize.cc | 46 ++++++++++++++++---
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc
index c213c1ee498250..c50e0a26e71c48 100644
--- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc
@@ -80,13 +80,13 @@ static LogicalResult IsDrqTensor(Value value, Value& fq_input) {
   // fake quant op.
   // This is to support the case such as:
   // %2077 = "vhlo.composite_v1"(%73, %69, %2070) : (tensor<i32>, tensor<i32>,
-  //  tensor<1x?x512xf32>) -> tensor<1x?x512xf32>
+  //   tensor<1x?x512xf32>) -> tensor<1x?x512xf32>
   // %2078 = "tfl.reshape"(%2077, %99) : (tensor<1x?x512xf32>, tensor<2xi32>) ->
-  //  tensor<?x512xf32>
+  //   tensor<?x512xf32>
   // %2079 = "tfl.pseudo_qconst"() <{qtype = tensor<64x512x!quant.uniform<i8....
-  // %2080 = "tfl.dequantize"(%2079) %2081 = "tfl.fully_connected"
-  //  (%2078, %2080, %0) : (tensor<?x512xf32>, tensor<64x512xf32>, none) ->
-  //  tensor<?x64xf32>
+  // %2080 = "tfl.dequantize"(%2079)
+  // %2081 = "tfl.fully_connected"(%2078, %2080, %0) : (tensor<?x512xf32>,
+  //   tensor<64x512xf32>, none) -> tensor<?x64xf32>
   // TODO - b/422588785: Have proper support for dynamic shaped models.
   auto v = value;
   if (auto reshape_op = llvm::dyn_cast_or_null<ReshapeOp>(v.getDefiningOp())) {
@@ -228,6 +228,40 @@ class PushForwardDrqFQ : public OpRewritePattern<stablehlo::CompositeOp> {
   }
 };
 
+// Fixes keep_num_dims option of FC if output dims is different from input dims
+// though keep_num_dims is true. It happens when FC's input has changed after
+// quantization, e.g. by IsDrqTensor().
+// Sets keep_num_dims to false if that's the case. Otherwise, it's not
+// compatible with GPU. See CheckGpuDelegateCompatibility() in
+// third_party/tensorflow/lite/tools/versioning/gpu_compatibility.cc.
+// Note that if FC is followed by Reshape, the keep_num_dims will be set to true
+// with a correct shape later by EnableFullyConnectedKeepNumDimsBeforeReshape()
+// in optimize pass.
+struct FixFullyConnectedKeepNumDims
+    : public OpRewritePattern<FullyConnectedOp> {
+  explicit FixFullyConnectedKeepNumDims(MLIRContext* context)
+      : OpRewritePattern<TFL::FullyConnectedOp>(context, /*benefit=*/0) {}
+
+  LogicalResult matchAndRewrite(FullyConnectedOp fc,
+                                PatternRewriter& rewriter) const override {
+    if (!fc.getKeepNumDims()) return failure();
+
+    auto input_ty =
+        mlir::dyn_cast_or_null<RankedTensorType>(fc.getInput().getType());
+    auto fc_ty = mlir::dyn_cast_or_null<RankedTensorType>(fc.getType(0));
+    if (!input_ty || !fc_ty) return failure();
+
+    auto input_shape = input_ty.getShape();
+    auto fc_shape = fc_ty.getShape();
+    if (input_shape.size() == fc_shape.size()) {
+      return failure();
+    }
+
+    fc.setKeepNumDims(false);
+    return success();
+  }
+};
+
 class StrictQuantizationPattern : public RewritePattern {
  public:
   using BaseType = StrictQuantizationPattern;
@@ -764,7 +798,7 @@ void QuantizePass::runOnOperation() {
     patterns.add<TFLFullQuantization, TFLFullQuantizationReverse>(ctx,
                                                                   quant_params);
   }
-
+  patterns.add<FixFullyConnectedKeepNumDims>(ctx);
   (void)applyPatternsGreedily(func, std::move(patterns));
 
   // Constant quantization is a lossy transformation, so they are applied only

From 1fa15367ad8e996e777e1d688d01dcd0f1a33db6 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Mon, 22 Dec 2025 11:28:27 -0800
Subject: [PATCH 677/753] Add proto serialization for RaggedAllToAllStartThunk

PiperOrigin-RevId: 847830182
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  18 ++-
 .../gpu/runtime/ragged_all_to_all_thunk.cc    | 105 +++++++++++++++---
 .../gpu/runtime/ragged_all_to_all_thunk.h     |  16 ++-
 .../runtime/ragged_all_to_all_thunk_test.cc   |  78 +++++++++++++
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  12 ++
 .../runtime/thunk_proto_deserialization.cc    |   5 +
 6 files changed, 216 insertions(+), 18 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk_test.cc

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index de324ce85f41f5..717afa12e2e74f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1498,7 +1498,6 @@ cc_library(
     name = "ragged_all_to_all_thunk",
     srcs = ["ragged_all_to_all_thunk.cc"],
     hdrs = ["ragged_all_to_all_thunk.h"],
-    tags = ["gpu"],
     deps = [
         ":collective_thunk",
         ":ragged_all_to_all",
@@ -1513,6 +1512,7 @@ cc_library(
         "//xla/core/collectives:communicator",
         "//xla/core/collectives:rank_id",
         "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
         "//xla/service:rendezvous",
         "//xla/service/gpu/transforms/collectives:collective_ops_utils",
         "//xla/stream_executor:device_address",
@@ -1540,6 +1540,21 @@ cc_library(
     ],
 )
 
+xla_cc_test(
+    name = "ragged_all_to_all_thunk_test",
+    srcs = ["ragged_all_to_all_thunk_test.cc"],
+    deps = [
+        ":collective_thunk",
+        ":ragged_all_to_all_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla/service:buffer_assignment",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
 cc_library(
     name = "collective_broadcast_thunk",
     srcs = ["collective_broadcast_thunk.cc"],
@@ -2927,6 +2942,7 @@ cc_library(
         ":memset_thunk",
         ":norm_thunk",
         ":outfeed_thunk",
+        ":ragged_all_to_all_thunk",
         ":replica_id_thunk",
         ":sequential_thunk",
         ":thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
index 75e3ca5dc0eb81..377d64404c8f34 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.cc
@@ -45,6 +45,7 @@ limitations under the License.
 #include "xla/future.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/transforms/collectives/collective_ops_utils.h"
 #include "xla/service/rendezvous.h"
 #include "xla/shape.h"
@@ -218,6 +219,32 @@ absl::Status RunRaggedAllToAll(
 
 }  // namespace
 
+RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
+    ThunkInfo thunk_info, const HloRaggedAllToAllInstruction* instr,
+    std::vector<CollectiveThunk::Buffer> buffers, bool p2p_memcpy_enabled)
+    : RaggedAllToAllStartThunk(
+          std::move(thunk_info), GetRaggedAllToAllConfig(instr),
+          IsGPUSyncCollective(*instr)
+              ? nullptr
+              : std::make_shared<CollectiveThunk::AsyncEvents>(),
+          std::move(buffers),
+          instr->GetModule()
+              ->config()
+              .debug_options()
+              .xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel()) {}
+
+RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
+    ThunkInfo thunk_info, const RaggedAllToAllConfig& config,
+    std::shared_ptr<AsyncEvents> async_events,
+    std::vector<CollectiveThunk::Buffer> buffers, bool one_shot_kernel_enabled)
+    : CollectiveThunk(Thunk::kRaggedAllToAllStart, thunk_info, async_events,
+                      AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE),
+      config_(config),
+      buffers_(std::move(buffers)),
+      one_shot_kernel_enabled_(one_shot_kernel_enabled) {
+  CHECK_EQ(config_.config.operand_element_type.size(), buffers_.size());
+}
+
 // Executes the rendezvous before the kernel start.
 // Inserts CUDA events into the stream to ensure that all devices have reached
 // the start event before the kernel starts.
@@ -333,22 +360,6 @@ absl::Status RaggedAllToAllStartThunk::RunOneShotRaggedAllToAll(
                                      *rendezvous_values);
 }
 
-RaggedAllToAllStartThunk::RaggedAllToAllStartThunk(
-    ThunkInfo thunk_info, const HloRaggedAllToAllInstruction* instr,
-    std::vector<CollectiveThunk::Buffer> buffers, bool p2p_memcpy_enabled)
-    : CollectiveThunk(Thunk::kRaggedAllToAllStart, thunk_info,
-                      IsGPUSyncCollective(*instr),
-                      AsyncStreamKind::ASYNC_STREAM_KIND_COLLECTIVE),
-      config_(GetRaggedAllToAllConfig(instr)),
-      buffers_(std::move(buffers)),
-      one_shot_kernel_enabled_(
-          instr->GetModule()
-              ->config()
-              .debug_options()
-              .xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel()) {
-  CHECK_EQ(config_.config.operand_element_type.size(), buffers_.size());
-}
-
 /*static*/ absl::Status RaggedAllToAllStartThunk::CheckImplementable(
     const HloRaggedAllToAllInstruction* instr, int64_t replica_count,
     int64_t partition_count) {
@@ -452,6 +463,68 @@ bool RaggedAllToAllStartThunk::is_local() const {
   return true;
 }
 
+absl::StatusOr<std::unique_ptr<RaggedAllToAllStartThunk>>
+RaggedAllToAllStartThunk::FromProto(
+    ThunkInfo thunk_info, const RaggedAllToAllStartThunkProto& thunk_proto,
+    absl::Span<const BufferAllocation> buffer_allocations,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::vector<CollectiveThunk::Buffer> buffers;
+  buffers.reserve(thunk_proto.buffers_size());
+  for (const CollectiveBufferProto& proto : thunk_proto.buffers()) {
+    ASSIGN_OR_RETURN(
+        CollectiveThunk::Buffer buffer,
+        CollectiveThunk::Buffer::FromProto(proto, buffer_allocations));
+    buffers.push_back(buffer);
+  }
+
+  std::shared_ptr<CollectiveThunk::AsyncEvents> async_events;
+  if (thunk_proto.has_async_events_unique_id()) {
+    std::shared_ptr<CollectiveThunk::AsyncEvents>& events =
+        async_events_map[AsyncEventsUniqueId{
+            thunk_proto.async_events_unique_id()}];
+    if (!events) {
+      events = std::make_shared<CollectiveThunk::AsyncEvents>();
+    }
+    async_events = events;
+  }
+
+  CollectiveConfig config =
+      CollectiveConfig::FromProto(thunk_proto.collective_config());
+
+  return std::make_unique<RaggedAllToAllStartThunk>(
+      std::move(thunk_info),
+      RaggedAllToAllConfig{config, thunk_proto.num_total_updates(),
+                           thunk_proto.num_input_rows(),
+                           thunk_proto.num_row_elements()},
+      async_events, std::move(buffers), thunk_proto.one_shot_kernel_enabled());
+}
+
+absl::StatusOr<ThunkProto> RaggedAllToAllStartThunk::ToProto() const {
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  RaggedAllToAllStartThunkProto* thunk_proto =
+      proto.mutable_ragged_all_to_all_start_thunk();
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (async_events_id.has_value()) {
+    thunk_proto->set_async_events_unique_id(async_events_id->value());
+  }
+
+  for (const Buffer& buffer : buffers_) {
+    ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
+  }
+
+  *thunk_proto->mutable_collective_config() = config_.config.ToProto();
+
+  thunk_proto->set_num_total_updates(config_.num_total_updates);
+  thunk_proto->set_num_input_rows(config_.num_input_rows);
+  thunk_proto->set_num_row_elements(config_.num_row_elements);
+  thunk_proto->set_one_shot_kernel_enabled(one_shot_kernel_enabled_);
+
+  return proto;
+}
+
 absl::StatusOr<bool> RaggedAllToAllStartThunk::RunCollective(
     const ExecuteParams& params, const GpuCliqueKey& clique_key,
     se::Stream& stream, Communicator& comm) {
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
index eb46ba6ef3b12d..fc56dfdae00c5a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/container/inlined_vector.h"
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
@@ -32,6 +33,7 @@ limitations under the License.
 #include "xla/core/collectives/communicator.h"
 #include "xla/core/collectives/rank_id.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/device_address_handle.h"
 #include "xla/stream_executor/event.h"
@@ -57,6 +59,11 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
                            const HloRaggedAllToAllInstruction* instr,
                            std::vector<Buffer> buffers,
                            bool p2p_memcpy_enabled);
+  RaggedAllToAllStartThunk(ThunkInfo thunk_info,
+                           const RaggedAllToAllConfig& config,
+                           std::shared_ptr<AsyncEvents> async_events,
+                           std::vector<CollectiveThunk::Buffer> buffers,
+                           bool one_shot_kernel_enabled);
 
   // Returns whether the given instruction can be lowered to a nccl
   // ragged-all-to-all call.
@@ -66,7 +73,7 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
 
   absl::Status Initialize(const InitializeParams& params) override;
 
-  static const char* GetHloOpName() { return "ragged-all-to-all-start"; }
+  static absl::string_view GetHloOpName() { return "ragged-all-to-all-start"; }
 
   static CollectiveOpGroupMode GetGroupMode(
       const HloRaggedAllToAllInstruction* instr);
@@ -74,6 +81,13 @@ class RaggedAllToAllStartThunk : public CollectiveThunk {
   const CollectiveConfig& config() const override { return config_.config; }
   absl::Span<const Buffer> buffers() const { return buffers_; }
 
+  static absl::StatusOr<std::unique_ptr<RaggedAllToAllStartThunk>> FromProto(
+      ThunkInfo thunk_info, const RaggedAllToAllStartThunkProto& thunk_proto,
+      absl::Span<const BufferAllocation> buffer_allocations,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
  protected:
   absl::StatusOr<bool> RunCollective(const ExecuteParams& params,
                                      const GpuCliqueKey& clique_key,
diff --git a/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk_test.cc
new file mode 100644
index 00000000000000..3b476b5777834f
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/ragged_all_to_all_thunk_test.cc
@@ -0,0 +1,78 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/ragged_all_to_all_thunk.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "xla/backends/gpu/runtime/collective_thunk.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        ragged_all_to_all_start_thunk {
+          async_events_unique_id: 3
+          collective_config {}
+          num_total_updates: 10
+          num_input_rows: 2
+          num_row_elements: 5
+          one_shot_kernel_enabled: true
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<RaggedAllToAllStartThunk> thunk,
+                       RaggedAllToAllStartThunk::FromProto(
+                           thunk_info, proto.ragged_all_to_all_start_thunk(),
+                           buffer_allocations, async_events_map));
+  ASSERT_NE(thunk->async_events(), nullptr);
+
+  ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_ragged_all_to_all_start_thunk()->set_async_events_unique_id(
+      round_trip_proto.ragged_all_to_all_start_thunk()
+          .async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index d7af07c98214dd..661dd23c95dfd6 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -435,6 +435,17 @@ message AllToAllStartThunkProto {
   bool p2p_memcpy_enabled = 5;
 }
 
+message RaggedAllToAllStartThunkProto {
+  optional uint64 async_events_unique_id = 1;
+  CollectiveConfigProto collective_config = 2;
+  repeated CollectiveBufferProto buffers = 3;
+
+  int64 num_total_updates = 4;
+  int64 num_input_rows = 5;
+  int64 num_row_elements = 6;
+  bool one_shot_kernel_enabled = 7;
+}
+
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
@@ -483,6 +494,7 @@ message ThunkProto {
     AllGatherStartThunkProto all_gather_start_thunk = 38;
     AllReduceStartThunkProto all_reduce_start_thunk = 39;
     AllToAllStartThunkProto all_to_all_start_thunk = 40;
+    RaggedAllToAllStartThunkProto ragged_all_to_all_start_thunk = 41;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index 3700f70f033d1c..f2303b3ac4df9a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/memset_thunk.h"
 #include "xla/backends/gpu/runtime/norm_thunk.h"
 #include "xla/backends/gpu/runtime/outfeed_thunk.h"
+#include "xla/backends/gpu/runtime/ragged_all_to_all_thunk.h"
 #include "xla/backends/gpu/runtime/replica_id_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
@@ -257,6 +258,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return AllToAllStartThunk::FromProto(
           std::move(thunk_info), thunk_proto.all_to_all_start_thunk(),
           buffer_allocations, collective_async_events_map);
+    case ThunkProto::kRaggedAllToAllStartThunk:
+      return RaggedAllToAllStartThunk::FromProto(
+          std::move(thunk_info), thunk_proto.ragged_all_to_all_start_thunk(),
+          buffer_allocations, collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);

From 678058948b75db748ad264ad1539ddf2b4e7b817 Mon Sep 17 00:00:00 2001
From: Dirk Hornung <dirkh@google.com>
Date: Mon, 22 Dec 2025 12:05:54 -0800
Subject: [PATCH 678/753] [Autotuner] Limit CuDNN tests to CuDNN autotuner
 backend.

PiperOrigin-RevId: 847842272
---
 .../xla/xla/backends/gpu/codegen/BUILD        | 12 +++++------
 .../xla/backends/gpu/codegen/cudnn_test.cc    | 21 +++++++++++++------
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/BUILD b/third_party/xla/xla/backends/gpu/codegen/BUILD
index 702d1850efef92..806b28020c6aca 100644
--- a/third_party/xla/xla/backends/gpu/codegen/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/BUILD
@@ -95,7 +95,7 @@ xla_test(
         "//xla/hlo/testlib:pattern_matcher_gmock",
         "//xla/hlo/testlib:verified_hlo_module",
         "//xla/service:dump",
-        "//xla/service:executable",
+        "//xla/service:hlo_module_config",
         "//xla/service:pattern_matcher",
         "//xla/service/gpu:cudnn_support_utils",
         "//xla/service/gpu:ir_emission_utils",
@@ -105,16 +105,16 @@ xla_test(
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:dnn",
         "//xla/stream_executor:stream_executor_h",
-        "//xla/stream_executor:stream_executor_memory_allocator",
+        "//xla/stream_executor/cuda:cuda_compute_capability",
+        "//xla/tsl/platform:env",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "//xla/tsl/platform:test",
         "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:env",
-        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:path",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_tsl//tsl/platform:test",
     ],
 )
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/cudnn_test.cc b/third_party/xla/xla/backends/gpu/codegen/cudnn_test.cc
index 2ceef17cbdf053..8ebf435947689e 100644
--- a/third_party/xla/xla/backends/gpu/codegen/cudnn_test.cc
+++ b/third_party/xla/xla/backends/gpu/codegen/cudnn_test.cc
@@ -35,24 +35,24 @@ limitations under the License.
 #include "xla/hlo/testlib/verified_hlo_module.h"
 #include "xla/primitive_util.h"
 #include "xla/service/dump.h"
-#include "xla/service/executable.h"
 #include "xla/service/gpu/cudnn_support_utils.h"
 #include "xla/service/gpu/ir_emission_utils.h"
 #include "xla/service/gpu/stream_executor_util.h"
 #include "xla/service/gpu/tests/gpu_codegen_test.h"
 #include "xla/service/gpu/transforms/cudnn_fusion_compiler.h"
+#include "xla/service/hlo_module_config.h"
 #include "xla/service/pattern_matcher.h"
+#include "xla/stream_executor/cuda/cuda_compute_capability.h"
 #include "xla/stream_executor/device_description.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/stream_executor.h"
-#include "xla/stream_executor/stream_executor_memory_allocator.h"
+#include "xla/tsl/platform/env.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "xla/tsl/platform/test.h"
 #include "xla/xla.pb.h"
 #include "xla/xla_data.pb.h"
-#include "tsl/platform/env.h"
-#include "tsl/platform/errors.h"
 #include "tsl/platform/path.h"
-#include "tsl/platform/statusor.h"
-#include "tsl/platform/test.h"
 
 namespace xla {
 namespace gpu {
@@ -66,6 +66,10 @@ class CuDnnFusionTest : public GpuCodegenTest {
     // autotuning.
     debug_options.set_xla_gpu_autotune_level(0);
     debug_options.set_xla_gpu_cudnn_gemm_fusion_level(2);
+    // Only run the CuDNN backend.
+    debug_options.clear_xla_gpu_experimental_autotune_backends();
+    debug_options.add_xla_gpu_experimental_autotune_backends(
+        DebugOptions::AUTOTUNE_BACKEND_CUDNN);
     return debug_options;
   }
   se::CudaComputeCapability get_cuda_cc() const {
@@ -261,12 +265,17 @@ e {
                                      dnn_compiled_graphs);
   EXPECT_THAT(cudnn_compiler.Run(module.get()),
               absl_testing::IsOkAndHolds(false));
+  // Single dot is not supported by cuDNN, so Triton should be used.
+  HloModuleConfig config = GetModuleConfigForTest();
+  config.mutable_debug_options().add_xla_gpu_experimental_autotune_backends(
+      DebugOptions::AUTOTUNE_BACKEND_TRITON);
   EXPECT_TRUE(RunAndCompareTwoModules(kHloText, R"(e {
     a = f32[32,96] parameter(0)
     b = f32[96,64] parameter(1)
     d = f32[32,64] dot(a, b),
       lhs_contracting_dims={1}, rhs_contracting_dims={0}
   })",
+                                      config, config,
                                       ErrorSpec{/*aabs=*/1e-3, /*arel=*/1e-3}));
 }
 

From b7650e843bf5d4ea5596a320cf750416c11df6fa Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Mon, 22 Dec 2025 12:21:12 -0800
Subject: [PATCH 679/753] Add proto serialization for
 CollectivePermuteStartThunk

PiperOrigin-RevId: 847846872
---
 .../xla/xla/backends/gpu/runtime/BUILD        |   5 +
 .../gpu/runtime/collective_permute_thunk.cc   | 104 +++++++++++++++++-
 .../gpu/runtime/collective_permute_thunk.h    |  29 +++--
 .../runtime/collective_permute_thunk_test.cc  |  77 +++++++++++++
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  12 ++
 .../runtime/thunk_proto_deserialization.cc    |   5 +
 6 files changed, 221 insertions(+), 11 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 717afa12e2e74f..c89ff9384af678 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1634,6 +1634,7 @@ cc_library(
         "//xla/core/collectives:rank_id",
         "//xla/hlo/ir:collective_op_group_mode",
         "//xla/hlo/ir:hlo",
+        "//xla/service:buffer_assignment",
         "//xla/service:computation_placer",
         "//xla/service:rendezvous",
         "//xla/service/gpu:backend_configs_cc",
@@ -1643,6 +1644,7 @@ cc_library(
         "//xla/stream_executor:stream",
         "//xla/tsl/concurrency:async_value",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
@@ -1691,6 +1693,8 @@ xla_test(
         "//xla/tests:hlo_test_base",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
         "@com_google_googletest//:gtest_main",
         "@local_tsl//tsl/platform:casts",
     ],
@@ -2922,6 +2926,7 @@ cc_library(
         ":all_gather_thunk",
         ":all_reduce_thunk",
         ":all_to_all_thunk",
+        ":collective_permute_thunk",
         ":collective_thunk",
         ":conditional_thunk",
         ":convolution_reorder_thunk",
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.cc
index f50e40992f8292..469086398b05a6 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/synchronization/mutex.h"
 #include "absl/time/time.h"
+#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/collectives/gpu_communicator.h"
@@ -46,6 +47,7 @@ limitations under the License.
 #include "xla/hlo/ir/collective_op_group_mode.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/backend_configs.pb.h"
 #include "xla/service/gpu/transforms/collectives/collective_ops_utils.h"
 #include "xla/service/rendezvous.h"
@@ -56,6 +58,7 @@ limitations under the License.
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -90,9 +93,22 @@ CollectivePermuteStartThunk::CollectivePermuteStartThunk(
     int64_t replica_count, int64_t partition_count,
     const std::vector<Buffer>& buffers, bool p2p_memcpy_enabled,
     AsyncStreamKind stream_kind)
-    : CollectiveThunk(Thunk::kCollectivePermuteStart, thunk_info,
-                      IsGPUSyncCollective(*instr), stream_kind),
-      config_(GetP2PConfig(instr, replica_count, partition_count)),
+    : CollectivePermuteStartThunk(
+          std::move(thunk_info),
+          GetP2PConfig(instr, replica_count, partition_count),
+          IsGPUSyncCollective(*instr)
+              ? nullptr
+              : std::make_shared<CollectiveThunk::AsyncEvents>(),
+          buffers, p2p_memcpy_enabled, stream_kind) {}
+
+CollectivePermuteStartThunk::CollectivePermuteStartThunk(
+    ThunkInfo thunk_info, const P2PConfig& config,
+    std::shared_ptr<AsyncEvents> async_events,
+    const std::vector<Buffer>& buffers, bool p2p_memcpy_enabled,
+    AsyncStreamKind stream_kind)
+    : CollectiveThunk(Thunk::kCollectivePermuteStart, thunk_info, async_events,
+                      stream_kind),
+      config_(config),
       buffers_(buffers),
       p2p_memcpy_enabled_(p2p_memcpy_enabled) {}
 
@@ -230,6 +246,88 @@ bool operator==(const CallRendezvousKey& a, const CallRendezvousKey& b) {
   return a.run_id == b.run_id;
 }
 
+absl::StatusOr<std::unique_ptr<CollectivePermuteStartThunk>>
+CollectivePermuteStartThunk::FromProto(
+    ThunkInfo thunk_info, const CollectivePermuteStartThunkProto& thunk_proto,
+    absl::Span<const BufferAllocation> buffer_allocations,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::vector<CollectiveThunk::Buffer> buffers;
+  buffers.reserve(thunk_proto.buffers_size());
+  for (const CollectiveBufferProto& proto : thunk_proto.buffers()) {
+    ASSIGN_OR_RETURN(
+        CollectiveThunk::Buffer buffer,
+        CollectiveThunk::Buffer::FromProto(proto, buffer_allocations));
+    buffers.push_back(buffer);
+  }
+
+  std::shared_ptr<CollectiveThunk::AsyncEvents> async_events;
+  if (thunk_proto.has_async_events_unique_id()) {
+    std::shared_ptr<CollectiveThunk::AsyncEvents>& events =
+        async_events_map[AsyncEventsUniqueId{
+            thunk_proto.async_events_unique_id()}];
+    if (!events) {
+      events = std::make_shared<CollectiveThunk::AsyncEvents>();
+    }
+    async_events = events;
+  }
+
+  CollectiveConfig config =
+      CollectiveConfig::FromProto(thunk_proto.collective_config());
+
+  P2PConfig::IdToSourceTargetMap id_to_source_target;
+  for (const SourceTarget& source_target : thunk_proto.source_target_pairs()) {
+    id_to_source_target.insert({source_target.target(), {}})
+        .first->second.source = source_target.source();
+    id_to_source_target.insert({source_target.source(), {}})
+        .first->second.target = source_target.target();
+  }
+
+  return std::make_unique<CollectivePermuteStartThunk>(
+      std::move(thunk_info), P2PConfig{config, std::move(id_to_source_target)},
+      async_events, std::move(buffers), thunk_proto.p2p_memcpy_enabled(),
+      thunk_proto.async_stream_kind());
+}
+
+absl::StatusOr<ThunkProto> CollectivePermuteStartThunk::ToProto() const {
+  CHECK_EQ(config_.validation_kind, P2PConfig::ValidationKind::kValid);
+  CHECK(config_.source_target_to_bounds.empty());
+
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  CollectivePermuteStartThunkProto* thunk_proto =
+      proto.mutable_collective_permute_start_thunk();
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  if (async_events_id.has_value()) {
+    thunk_proto->set_async_events_unique_id(async_events_id->value());
+  }
+
+  for (const Buffer& buffer : buffers_) {
+    ASSIGN_OR_RETURN(*thunk_proto->add_buffers(), buffer.ToProto());
+  }
+
+  *thunk_proto->mutable_collective_config() = config_.config.ToProto();
+  thunk_proto->set_p2p_memcpy_enabled(p2p_memcpy_enabled_);
+
+  std::vector<SourceTarget> source_target_pairs;
+  source_target_pairs.reserve(config_.id_to_source_target.size() / 2);
+  for (const auto& [key_id, map_entry] : config_.id_to_source_target) {
+    SourceTarget pair;
+    if (!map_entry.source.has_value()) {
+      // Same pair is in the map with target/source switched.
+      continue;
+    }
+    pair.set_source(*map_entry.source);
+    pair.set_target(key_id);
+    source_target_pairs.push_back(pair);
+  }
+  thunk_proto->mutable_source_target_pairs()->Assign(
+      source_target_pairs.begin(), source_target_pairs.end());
+
+  return proto;
+}
+
 absl::StatusOr<bool> CollectivePermuteStartThunk::RunCollective(
     const ExecuteParams& params, const GpuCliqueKey& clique_key,
     se::Stream& stream, Communicator& comm) {
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.h
index 964fcd7ea7526b..c24369f2d85dbc 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk.h
@@ -1,4 +1,5 @@
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
+#include "xla/service/buffer_assignment.h"
 /* Copyright 2021 The OpenXLA Authors.
 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -92,6 +93,18 @@ class CollectivePermuteStartThunk : public CollectiveThunk {
         ABSL_GUARDED_BY(mutex_);
   };
 
+  CollectivePermuteStartThunk(ThunkInfo thunk_info,
+                              const HloCollectivePermuteInstruction* instr,
+                              int64_t replica_count, int64_t partition_count,
+                              const std::vector<Buffer>& buffers,
+                              bool p2p_memcpy_enabled,
+                              AsyncStreamKind stream_kind);
+  CollectivePermuteStartThunk(ThunkInfo thunk_info, const P2PConfig& config,
+                              std::shared_ptr<AsyncEvents> async_events,
+                              const std::vector<Buffer>& buffers,
+                              bool p2p_memcpy_enabled,
+                              AsyncStreamKind stream_kind);
+
   static P2PConfig GetP2PConfig(const HloCollectivePermuteInstruction* instr,
                                 int64_t replica_count, int64_t partition_count);
 
@@ -101,16 +114,9 @@ class CollectivePermuteStartThunk : public CollectiveThunk {
   static CollectiveOpGroupMode GetGroupMode(
       const HloCollectivePermuteInstruction* instr);
 
-  CollectivePermuteStartThunk(ThunkInfo thunk_info,
-                              const HloCollectivePermuteInstruction* instr,
-                              int64_t replica_count, int64_t partition_count,
-                              const std::vector<Buffer>& buffers,
-                              bool p2p_memcpy_enabled,
-                              AsyncStreamKind stream_kind);
-
   absl::Status Initialize(const InitializeParams& params) override;
 
-  static const char* GetHloOpName() { return "collective-permute-start"; }
+  static absl::string_view GetHloOpName() { return "collective-permute-start"; }
 
   const CollectiveConfig& config() const override { return config_.config; }
 
@@ -118,6 +124,13 @@ class CollectivePermuteStartThunk : public CollectiveThunk {
 
   const P2PConfig& p2p_config() const { return config_; }
 
+  static absl::StatusOr<std::unique_ptr<CollectivePermuteStartThunk>> FromProto(
+      ThunkInfo thunk_info, const CollectivePermuteStartThunkProto& thunk_proto,
+      absl::Span<const BufferAllocation> buffer_allocations,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
  protected:
   absl::StatusOr<bool> RunCollective(const ExecuteParams& params,
                                      const GpuCliqueKey& clique_key,
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk_test.cc
index f9e51f21be014b..e6796e68525e88 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_permute_thunk_test.cc
@@ -46,6 +46,8 @@ limitations under the License.
 #include "xla/tests/hlo_test_base.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
 #include "xla/util.h"
 #include "xla/xla_data.pb.h"
 #include "tsl/platform/casts.h"
@@ -55,6 +57,7 @@ namespace {
 
 using ::testing::ElementsAre;
 using Kind = Thunk::Kind;
+using ::tsl::proto_testing::EqualsProto;
 
 class GpuCollectivePermuteTest : public HloTestBase {};
 
@@ -212,5 +215,79 @@ ENTRY test_computation {
   EXPECT_THAT(kinds, ElementsAre(Kind::kReplicaId, Kind::kKernel,
                                  Kind::kCollectivePermuteStart));
 }
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        collective_permute_start_thunk {
+          async_events_unique_id: 3
+          collective_config {}
+          p2p_memcpy_enabled: true
+          async_stream_kind: ASYNC_STREAM_KIND_COLLECTIVE
+          source_target_pairs: { source: 1 target: 2 }
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<CollectivePermuteStartThunk> thunk,
+                       CollectivePermuteStartThunk::FromProto(
+                           thunk_info, proto.collective_permute_start_thunk(),
+                           buffer_allocations, async_events_map));
+  ASSERT_NE(thunk->async_events(), nullptr);
+
+  ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_collective_permute_start_thunk()->set_async_events_unique_id(
+      round_trip_proto.collective_permute_start_thunk()
+          .async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+TEST(CollectiveThunkTest, SyncCollective) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        collective_permute_start_thunk {
+          collective_config {}
+          p2p_memcpy_enabled: true
+          async_stream_kind: ASYNC_STREAM_KIND_COLLECTIVE
+          source_target_pairs: { source: 1 target: 2 }
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<CollectivePermuteStartThunk> thunk,
+                       CollectivePermuteStartThunk::FromProto(
+                           thunk_info, proto.collective_permute_start_thunk(),
+                           buffer_allocations, async_events_map));
+  ASSERT_EQ(thunk->async_events(), nullptr);
+}
+
 }  // namespace
 }  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 661dd23c95dfd6..8cc71ce2e0b8d5 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -446,6 +446,17 @@ message RaggedAllToAllStartThunkProto {
   bool one_shot_kernel_enabled = 7;
 }
 
+message CollectivePermuteStartThunkProto {
+  optional uint64 async_events_unique_id = 1;
+  repeated CollectiveBufferProto buffers = 2;
+
+  CollectiveConfigProto collective_config = 3;
+  repeated SourceTarget source_target_pairs = 4;
+
+  AsyncStreamKind async_stream_kind = 5;
+  bool p2p_memcpy_enabled = 6;
+}
+
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
@@ -495,6 +506,7 @@ message ThunkProto {
     AllReduceStartThunkProto all_reduce_start_thunk = 39;
     AllToAllStartThunkProto all_to_all_start_thunk = 40;
     RaggedAllToAllStartThunkProto ragged_all_to_all_start_thunk = 41;
+    CollectivePermuteStartThunkProto collective_permute_start_thunk = 42;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index f2303b3ac4df9a..86ced1a02dd418 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/all_gather_thunk.h"
 #include "xla/backends/gpu/runtime/all_reduce_thunk.h"
 #include "xla/backends/gpu/runtime/all_to_all_thunk.h"
+#include "xla/backends/gpu/runtime/collective_permute_thunk.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/conditional_thunk.h"
 #include "xla/backends/gpu/runtime/convolution_reorder_thunk.h"
@@ -262,6 +263,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return RaggedAllToAllStartThunk::FromProto(
           std::move(thunk_info), thunk_proto.ragged_all_to_all_start_thunk(),
           buffer_allocations, collective_async_events_map);
+    case ThunkProto::kCollectivePermuteStartThunk:
+      return CollectivePermuteStartThunk::FromProto(
+          std::move(thunk_info), thunk_proto.collective_permute_start_thunk(),
+          buffer_allocations, collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);

From 3480eee02b8ce5e6545d5b86664a4b0412cc6ab2 Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Mon, 22 Dec 2025 12:54:52 -0800
Subject: [PATCH 680/753] Add HloModuleFromXlaComputation to
 HloRunnerAgnosticTestBase.

Sometimes it is useful to turn an XlaComputation straight into a HloModule in a
test. This is already functionality we basically support, but until now the
computation had to be in the form of an XlaBuilder, which is not always
practical.

PiperOrigin-RevId: 847856677
---
 third_party/xla/xla/tests/BUILD                    |  2 --
 .../xla/xla/tests/hlo_runner_agnostic_test_base.cc | 14 ++++++++++----
 .../xla/xla/tests/hlo_runner_agnostic_test_base.h  |  8 +++++++-
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index ae3c29fcdd5e72..88057624154aa1 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -232,7 +232,6 @@ cc_library(
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/base:nullability",
         "@com_google_absl//absl/functional:any_invocable",
         "@com_google_absl//absl/log",
@@ -244,7 +243,6 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "@com_google_protobuf//:protobuf",
         "@llvm-project//llvm:Support",
-        "@local_tsl//tsl/platform:protobuf",
     ],
 )
 
diff --git a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
index 71f4c3b69df799..d0e54a61e4660f 100644
--- a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
+++ b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.cc
@@ -56,7 +56,6 @@ limitations under the License.
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
 #include "xla/util.h"
-#include "tsl/platform/protobuf.h"
 
 namespace xla {
 
@@ -108,9 +107,9 @@ HloRunnerAgnosticTestBase::ParseAndReturnVerifiedModule(
 }
 
 absl::StatusOr<std::unique_ptr<HloModule>>
-HloRunnerAgnosticTestBase::HloModuleFromXlaBuilder(
-    XlaBuilder* builder, const ExecutionOptions& execution_options) const {
-  TF_ASSIGN_OR_RETURN(XlaComputation computation, builder->Build());
+HloRunnerAgnosticTestBase::HloModuleFromXlaComputation(
+    const XlaComputation& computation,
+    const ExecutionOptions& execution_options) const {
   TF_ASSIGN_OR_RETURN(
       HloModuleConfig module_config,
       HloModule::CreateModuleConfigFromProto(computation.proto(),
@@ -123,6 +122,13 @@ HloRunnerAgnosticTestBase::HloModuleFromXlaBuilder(
   return module;
 }
 
+absl::StatusOr<std::unique_ptr<HloModule>>
+HloRunnerAgnosticTestBase::HloModuleFromXlaBuilder(
+    XlaBuilder* builder, const ExecutionOptions& execution_options) const {
+  TF_ASSIGN_OR_RETURN(XlaComputation computation, builder->Build());
+  return HloModuleFromXlaComputation(computation, execution_options);
+}
+
 HloComputation*
 HloRunnerAgnosticTestBase::AddEntryComputationAndUpdateEntryComputationLayout(
     HloModule* const module, std::unique_ptr<HloComputation> computation) {
diff --git a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h
index ea3ffce16ff77c..bbce2c3037becb 100644
--- a/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h
+++ b/third_party/xla/xla/tests/hlo_runner_agnostic_test_base.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "absl/base/attributes.h"
 #include "absl/base/nullability.h"
 #include "absl/functional/any_invocable.h"
 #include "absl/log/log.h"
@@ -34,6 +33,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/error_spec.h"
 #include "xla/hlo/builder/xla_builder.h"
+#include "xla/hlo/builder/xla_computation.h"
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
@@ -139,6 +139,12 @@ class HloRunnerAgnosticTestBase : public HloHardwareIndependentTestBase {
       absl::string_view hlo_text, const HloModuleConfig& config,
       const HloParserOptions& parser_options = HloParserOptions()) const;
 
+  // Builds an HLO module from the given XlaComputation using the given
+  // execution options.
+  absl::StatusOr<std::unique_ptr<HloModule>> HloModuleFromXlaComputation(
+      const XlaComputation& computation,
+      const ExecutionOptions& execution_options) const;
+
   // Builds an HLO module from the given XlaBuilder using the given
   // execution options.
   absl::StatusOr<std::unique_ptr<HloModule>> HloModuleFromXlaBuilder(

From 21d80205a696d8df9f831ffa21f402f96115dd7b Mon Sep 17 00:00:00 2001
From: Haibo Huang <hhb@google.com>
Date: Mon, 22 Dec 2025 13:32:14 -0800
Subject: [PATCH 681/753] Add PJRT_Buffer_DonateWithControlDependency to the
 PJRT C API.

PiperOrigin-RevId: 847868215
---
 third_party/xla/xla/pjrt/c/CHANGELOG.md       |  4 ++
 third_party/xla/xla/pjrt/c/pjrt_c_api.h       | 32 +++++++++++++++-
 third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc |  6 +++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc | 36 ++++++++++++++++++
 .../xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h  |  3 ++
 .../pjrt/c_api_client/pjrt_c_api_client.cc    | 37 +++++++++++++++++++
 .../xla/pjrt/c_api_client/pjrt_c_api_client.h |  3 ++
 7 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/pjrt/c/CHANGELOG.md b/third_party/xla/xla/pjrt/c/CHANGELOG.md
index c1c7c8cce99139..2818e8ea62f00f 100644
--- a/third_party/xla/xla/pjrt/c/CHANGELOG.md
+++ b/third_party/xla/xla/pjrt/c/CHANGELOG.md
@@ -1,5 +1,9 @@
 # PJRT C API changelog
 
+## 0.88
+
+* Add `PJRT_Buffer_DonateWithControlDependency`.
+
 ## 0.87
 
 * Add `PJRT_Executable_GetCompileOptions`.
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api.h b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
index a89d6a4c93bc8c..dd3bf0f8cc9a00 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api.h
@@ -104,7 +104,7 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Extension_Base, next);
 // Changes include:
 // * Adding a new field to the PJRT_Api or argument structs
 // * Renaming a method or argument (doesn't affect ABI)
-#define PJRT_API_MINOR 87
+#define PJRT_API_MINOR 88
 
 // The plugin should set the major_version and minor_version of
 // PJRT_Api.pjrt_api_version to be the `PJRT_API_MAJOR` and `PJRT_API_MINOR` in
@@ -2464,6 +2464,33 @@ PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_OpaqueDeviceMemoryDataPointer_Args,
 typedef PJRT_Error* PJRT_Buffer_OpaqueDeviceMemoryDataPointer(
     PJRT_Buffer_OpaqueDeviceMemoryDataPointer_Args* args);
 
+struct PJRT_Buffer_DonateWithControlDependency_Callback_Args {
+  size_t struct_size;
+  void* callback_data;
+  PJRT_Error_Code error_code;
+  const char* error_message;
+  size_t error_message_size;
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_DonateWithControlDependency_Callback_Args,
+                          error_message_size);
+
+struct PJRT_Buffer_DonateWithControlDependency_Args {
+  size_t struct_size;
+  PJRT_Extension_Base* extension_start;
+  PJRT_Buffer* buffer;
+
+  void* callback_data;  // out
+  void (*dependency_ready_callback)(
+      PJRT_Buffer_DonateWithControlDependency_Callback_Args* args);  // out
+
+  PJRT_Buffer* out_buffer;  // out
+};
+PJRT_DEFINE_STRUCT_TRAITS(PJRT_Buffer_DonateWithControlDependency_Args,
+                          out_buffer);
+
+typedef PJRT_Error* PJRT_Buffer_DonateWithControlDependency(
+    PJRT_Buffer_DonateWithControlDependency_Args* args);
+
 // ---------------------------- CopyToDeviceStream -----------------------------
 
 struct PJRT_CopyToDeviceStream_Destroy_Args {
@@ -2852,11 +2879,12 @@ typedef struct PJRT_Api {
   _PJRT_API_STRUCT_FIELD(PJRT_Device_CreateAsyncTrackingEvent);
   _PJRT_API_STRUCT_FIELD(PJRT_AsyncTrackingEvent_Destroy);
   _PJRT_API_STRUCT_FIELD(PJRT_Executable_GetCompileOptions);
+  _PJRT_API_STRUCT_FIELD(PJRT_Buffer_DonateWithControlDependency);
 } PJRT_Api;
 
 enum {
   PJRT_Api_STRUCT_SIZE =
-      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Executable_GetCompileOptions)
+      PJRT_STRUCT_SIZE(PJRT_Api, PJRT_Buffer_DonateWithControlDependency)
 };
 
 #undef _PJRT_API_STRUCT_FIELD
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
index 246e746bf0ddf6..a0faa37ccbf6d6 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_test.cc
@@ -963,6 +963,9 @@ FieldOffsetsAndSizesForVersion(int major_version, int minor_version) {
     if (minor_version >= 87) {
       add_field("PJRT_Executable_GetCompileOptions", kFnPtrSize);
     }
+    if (minor_version >= 88) {
+      add_field("PJRT_Buffer_DonateWithControlDependency", kFnPtrSize);
+    }
     return version_offsets_and_sizes;
   }
   LOG(FATAL) << "Unsupported API version: " << major_version << "."
@@ -1377,6 +1380,9 @@ TEST_F(PjrtCAbiTestBase, FieldOffsetsAndSizes) {
           {"PJRT_Executable_GetCompileOptions",
            {offsetof(PJRT_Api, PJRT_Executable_GetCompileOptions),
             sizeof(PJRT_Api::PJRT_Executable_GetCompileOptions)}},
+          {"PJRT_Buffer_DonateWithControlDependency",
+           {offsetof(PJRT_Api, PJRT_Buffer_DonateWithControlDependency),
+            sizeof(PJRT_Api::PJRT_Buffer_DonateWithControlDependency)}},
       };
   ASSERT_EQ(api_->pjrt_api_version.major_version, PJRT_API_MAJOR);
   ASSERT_EQ(api_->pjrt_api_version.minor_version, PJRT_API_MINOR);
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
index 7c95b2c80e7d1e..81dff12b5e6d96 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -2512,6 +2512,40 @@ PJRT_Error* PJRT_Buffer_OpaqueDeviceMemoryDataPointer(
   return nullptr;
 }
 
+PJRT_Error* PJRT_Buffer_DonateWithControlDependency(
+    PJRT_Buffer_DonateWithControlDependency_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_Buffer_DonateWithControlDependency_Args",
+      PJRT_Buffer_DonateWithControlDependency_Args_STRUCT_SIZE,
+      args->struct_size));
+
+  auto [promise, future] = xla::Future<>::MakePromise();
+  PJRT_ASSIGN_OR_RETURN(
+      std::unique_ptr<xla::PjRtBuffer> out_buffer,
+      args->buffer->buffer->DonateWithControlDependency(std::move(future)));
+
+  struct CallbackData {
+    xla::Promise<> promise;
+  };
+  args->out_buffer =
+      new PJRT_Buffer{std::move(out_buffer), args->buffer->client};
+  args->callback_data = new CallbackData{std::move(promise)};
+  args->dependency_ready_callback =
+      [](PJRT_Buffer_DonateWithControlDependency_Callback_Args* args) {
+        auto* data = static_cast<CallbackData*>(args->callback_data);
+        if (args->error_code == PJRT_Error_Code_OK) {
+          data->promise.Set();
+        } else {
+          absl::Status status(
+              pjrt::PjrtErrorCodeToStatusCode(args->error_code),
+              absl::string_view(args->error_message, args->error_message_size));
+          data->promise.Set(std::move(status));
+        }
+        delete data;
+      };
+  return nullptr;
+}
+
 // ---------------------------- CopyToDeviceStream -----------------------------
 
 PJRT_Error* PJRT_CopyToDeviceStream_Destroy(
@@ -3300,6 +3334,8 @@ PJRT_Api CreatePjrtApi(PJRT_Client_Create* create_fn,
       pjrt::PJRT_AsyncTrackingEvent_Destroy,
       /*PJRT_Executable_GetCompileOptions=*/
       pjrt::PJRT_Executable_GetCompileOptions,
+      /*PJRT_Buffer_DonateWithControlDependency=*/
+      pjrt::PJRT_Buffer_DonateWithControlDependency,
   };
 }
 
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
index a0da7731d17837..9339e05931ea8a 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.h
@@ -434,6 +434,9 @@ PJRT_Error* PJRT_Buffer_DecreaseExternalReferenceCount(
 PJRT_Error* PJRT_Buffer_OpaqueDeviceMemoryDataPointer(
     PJRT_Buffer_OpaqueDeviceMemoryDataPointer_Args* args);
 
+PJRT_Error* PJRT_Buffer_DonateWithControlDependency(
+    PJRT_Buffer_DonateWithControlDependency_Args* args);
+
 PJRT_Error* PJRT_CopyToDeviceStream_Destroy(
     PJRT_CopyToDeviceStream_Destroy_Args* args);
 PJRT_Error* PJRT_CopyToDeviceStream_AddChunk(
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
index 1e4823dc2f80b8..d09986db09030e 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.cc
@@ -3382,6 +3382,43 @@ PjRtCApiBuffer::AcquireExternalReference() {
                                                      device_memory_ptr);
 }
 
+absl::StatusOr<std::unique_ptr<PjRtBuffer>>
+PjRtCApiBuffer::DonateWithControlDependency(Future<> dependency) {
+  if (client_->pjrt_c_api()->pjrt_api_version.major_version == 0 &&
+      client_->pjrt_c_api()->pjrt_api_version.minor_version < 88) {
+    return Unimplemented(
+        "PJRT_Buffer_DonateWithControlDependency requires PJRT C API version "
+        "0.88 or higher.");
+  }
+  PJRT_Buffer_DonateWithControlDependency_Args args;
+  args.struct_size = PJRT_Buffer_DonateWithControlDependency_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.buffer = c_buffer();
+  const PJRT_Api* api = pjrt_c_api();
+  RETURN_STATUS_IF_PJRT_ERROR(
+      api->PJRT_Buffer_DonateWithControlDependency(&args), api);
+
+  dependency.OnReady([callback = args.dependency_ready_callback,
+                      data = args.callback_data](absl::Status s) {
+    PJRT_Buffer_DonateWithControlDependency_Callback_Args cb_args;
+    cb_args.struct_size =
+        PJRT_Buffer_DonateWithControlDependency_Callback_Args_STRUCT_SIZE;
+    cb_args.callback_data = data;
+    if (s.ok()) {
+      cb_args.error_code = PJRT_Error_Code_OK;
+      cb_args.error_message = nullptr;
+      cb_args.error_message_size = 0;
+    } else {
+      cb_args.error_code = pjrt::StatusCodeToPjrtErrorCode(s.code());
+      cb_args.error_message = s.message().data();
+      cb_args.error_message_size = s.message().size();
+    }
+    callback(&cb_args);
+  });
+  return std::unique_ptr<PjRtBuffer>(
+      std::make_unique<PjRtCApiBuffer>(client_, args.out_buffer));
+}
+
 void PjRtCApiBuffer::CopyToRemoteDevice(
     Future<std::string> serialized_descriptor, RemoteSendCallback on_done) {
   PJRT_CrossHostTransfers_Extension* extension =
diff --git a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
index 84d71e9740ec7f..5386b1230050b8 100644
--- a/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
+++ b/third_party/xla/xla/pjrt/c_api_client/pjrt_c_api_client.h
@@ -574,6 +574,9 @@ class PjRtCApiBuffer : public PjRtBuffer {
         "PJRT C API does not support ReleaseDeviceMemoryOwnership");
   }
 
+  absl::StatusOr<std::unique_ptr<PjRtBuffer>> DonateWithControlDependency(
+      Future<> dependency) override;
+
   bool IsDeleted() const override;
 
   absl::StatusOr<std::unique_ptr<PjRtBuffer>> CopyToMemorySpace(

From 69ea2a930865c3ed5f916a832ca602cc9808ed1a Mon Sep 17 00:00:00 2001
From: Subhankar Shah <subhankarshah@google.com>
Date: Mon, 22 Dec 2025 13:47:19 -0800
Subject: [PATCH 682/753] Allow prefetching an hlo value if its use is colored
 in alternate memory even if if loop optimizer has decided otherwise.

PiperOrigin-RevId: 847872344
---
 .../memory_space_assignment/algorithm.cc      |  16 +-
 .../memory_space_assignment_test.cc           | 161 ++++++++++++++++++
 2 files changed, 175 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/service/memory_space_assignment/algorithm.cc b/third_party/xla/xla/service/memory_space_assignment/algorithm.cc
index 6d105815e5887d..323cbe0e08c2de 100644
--- a/third_party/xla/xla/service/memory_space_assignment/algorithm.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/algorithm.cc
@@ -6415,7 +6415,8 @@ AllocationResult MsaAlgorithm::AllocateSegment(AllocationRequest& request) {
   }
 
   // Finally, try to prefetch the buffer into alternate memory.
-  if (request.allow_prefetch &&
+  if ((request.allow_prefetch ||
+       request.require_end_colored_in_alternate_memory) &&
       !request.allocation_value->requires_contiguous_allocation() &&
       !request.only_extend_existing_allocation &&
       required_memory_space_at_end != MemorySpace::kDefault &&
@@ -6490,7 +6491,18 @@ AllocationResult MsaAlgorithm::AllocateSegment(AllocationRequest& request) {
     result_mark(prefetch_result, allocation_result);
   }
 
-  CHECK(!request.require_end_colored_in_alternate_memory);
+  CHECK(!request.require_end_colored_in_alternate_memory)
+      << "Failed to allocate end in alternate memory, even though its "
+         "required. "
+         "requires_contiguous_allocation: "
+      << request.allocation_value->requires_contiguous_allocation()
+      << " only_extend_existing_allocation: "
+      << request.only_extend_existing_allocation
+      << " require_end_colored_in_default_memory: "
+      << request.require_end_colored_in_default_memory
+      << " required_memory_space_at_end: "
+      << (required_memory_space_at_end == MemorySpace::kDefault ? "default"
+                                                                : "alternate");
 
   // If the end assignment was required to be in alternate memory but that
   // wasn't possible, then this allocation is invalid.
diff --git a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
index f97b490e296afb..9f9ad74fe726dc 100644
--- a/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
+++ b/third_party/xla/xla/service/memory_space_assignment/memory_space_assignment_test.cc
@@ -14773,6 +14773,167 @@ TEST_F(MemorySpaceAssignmentTest, TestColoringMultipleOperands) {
             kAlternateMemorySpace);
 }
 
+TEST_F(MemorySpaceAssignmentTest, TestColoringWithLoopOptimization) {
+  absl::string_view hlo_string = R"hlo(
+HloModule UnrolledLoop, is_scheduled=true
+
+ENTRY %main {
+  %param.0 = (f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4],
+              f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4], f32[2,4]) parameter(0)
+
+  %gte.0 = f32[2,4] get-tuple-element(%param.0), index=0
+  %gte.1 = f32[2,4] get-tuple-element(%param.0), index=1
+  %add.common = f32[2,4] add(%gte.0, %gte.1)
+  %tanh.common = f32[2,4] tanh(%add.common)
+
+  // loop idx0
+  %neg.0.idx0 = f32[2,4] negate(%tanh.common)
+  %neg.1.idx0 = f32[2,4] negate(%neg.0.idx0)
+  %neg.2.idx0 = f32[2,4] negate(%neg.1.idx0)
+
+  %gte.0.idx0 = f32[2,4] get-tuple-element(%param.0), index=2
+  %add.0.idx0 = f32[2,4] add(%neg.2.idx0, %gte.0.idx0)
+
+  %gte.1.idx0 = f32[2,4] get-tuple-element(%param.0), index=3
+  %add.1.idx0 = f32[2,4] add(%add.0.idx0, %gte.1.idx0)
+
+  %tanh.idx0 = f32[2,4] tanh(%add.1.idx0)
+
+  // loop idx1
+  %neg.0.idx1 = f32[2,4] negate(%tanh.idx0)
+  %neg.1.idx1 = f32[2,4] negate(%neg.0.idx1)
+  %neg.2.idx1 = f32[2,4] negate(%neg.1.idx1)
+
+  %gte.0.idx1 = f32[2,4] get-tuple-element(%param.0), index=4
+  %add.0.idx1 = f32[2,4] add(%neg.2.idx1, %gte.0.idx1)
+
+  %gte.1.idx1 = f32[2,4] get-tuple-element(%param.0), index=5
+  %add.1.idx1 = f32[2,4] add(%add.0.idx1, %gte.1.idx1)
+
+  %tanh.idx1 = f32[2,4] tanh(%add.1.idx1)
+
+  // loop idx2
+  %neg.0.idx2 = f32[2,4] negate(%tanh.idx1)
+  %neg.1.idx2 = f32[2,4] negate(%neg.0.idx2)
+  %neg.2.idx2 = f32[2,4] negate(%neg.1.idx2)
+
+  %gte.0.idx2 = f32[2,4] get-tuple-element(%param.0), index=6
+  %add.0.idx2 = f32[2,4] add(%neg.2.idx2, %gte.0.idx2)
+
+  %gte.1.idx2 = f32[2,4] get-tuple-element(%param.0), index=7
+  %add.1.idx2 = f32[2,4] add(%add.0.idx2, %gte.1.idx2)
+
+  %tanh.idx2 = f32[2,4] tanh(%add.1.idx2)
+
+  // loop idx3
+  %neg.0.idx3 = f32[2,4] negate(%tanh.idx2)
+  %neg.1.idx3 = f32[2,4] negate(%neg.0.idx3)
+  %neg.2.idx3 = f32[2,4] negate(%neg.1.idx3)
+
+  %gte.0.idx3 = f32[2,4] get-tuple-element(%param.0), index=8
+  %add.0.idx3 = f32[2,4] add(%neg.2.idx3, %gte.0.idx3)
+
+  %gte.1.idx3 = f32[2,4] get-tuple-element(%param.0), index=9
+  %add.1.idx3 = f32[2,4] add(%add.0.idx3, %gte.1.idx3)
+
+  %tanh.idx3 = f32[2,4] tanh(%add.1.idx3)
+
+  // loop idx4
+  %neg.0.idx4 = f32[2,4] negate(%tanh.idx3)
+  %neg.1.idx4 = f32[2,4] negate(%neg.0.idx4)
+  %neg.2.idx4 = f32[2,4] negate(%neg.1.idx4)
+
+  %gte.0.idx4 = f32[2,4] get-tuple-element(%param.0), index=10
+  %add.0.idx4 = f32[2,4] add(%neg.2.idx4, %gte.0.idx4)
+
+  %gte.1.idx4 = f32[2,4] get-tuple-element(%param.0), index=11
+  %add.1.idx4 = f32[2,4] add(%add.0.idx4, %gte.1.idx4)
+
+  %tanh.idx4 = f32[2,4] tanh(%add.1.idx4)
+
+  // loop idx5
+  %neg.0.idx5 = f32[2,4] negate(%tanh.idx4)
+  %neg.1.idx5 = f32[2,4] negate(%neg.0.idx5)
+  %neg.2.idx5 = f32[2,4] negate(%neg.1.idx5)
+
+  %gte.0.idx5 = f32[2,4] get-tuple-element(%param.0), index=12
+  %add.0.idx5 = f32[2,4] add(%neg.2.idx5, %gte.0.idx5)
+
+  %gte.1.idx5 = f32[2,4] get-tuple-element(%param.0), index=13
+  %add.1.idx5 = f32[2,4] add(%add.0.idx5, %gte.1.idx5)
+
+  %tanh.idx5 = f32[2,4] tanh(%add.1.idx5)
+
+  // loop idx6
+  %neg.0.idx6 = f32[2,4] negate(%tanh.idx5)
+  %neg.1.idx6 = f32[2,4] negate(%neg.0.idx6)
+  %neg.2.idx6 = f32[2,4] negate(%neg.1.idx6)
+
+  %gte.0.idx6 = f32[2,4] get-tuple-element(%param.0), index=14
+  %add.0.idx6 = f32[2,4] add(%neg.2.idx6, %gte.0.idx6)
+
+  %gte.1.idx6 = f32[2,4] get-tuple-element(%param.0), index=15
+  %add.1.idx6 = f32[2,4] add(%add.0.idx6, %gte.1.idx6)
+
+  %tanh.idx6 = f32[2,4] tanh(%add.1.idx6)
+
+  ROOT %negate.common = f32[2,4] negate(%tanh.idx6)
+})hlo";
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<VerifiedHloModule> module,
+                       ParseAndReturnVerifiedModule(hlo_string));
+  Options memory_space_options = DefaultMemorySpaceOptions();
+  memory_space_options.max_size_in_bytes = 48;
+  memory_space_options.memory_bound_loop_optimizer_options.set_enabled(true);
+  memory_space_options.memory_bound_loop_optimizer_options
+      .set_desired_copy_ratio(0.1);
+  memory_space_options.memory_bound_loop_optimizer_options
+      .set_allow_unsatisfied_fully_pipelined_prefetch(true);
+  memory_space_options.memory_bound_loop_optimizer_options
+      .set_min_num_iterations(4);
+  HloUse add_0_idx0_use{FindInstruction(module.get(), "add.0.idx0"), 1, {}};
+  HloUse add_1_idx0_use{FindInstruction(module.get(), "add.1.idx0"), 1, {}};
+  HloUse add_0_idx1_use{FindInstruction(module.get(), "add.0.idx1"), 1, {}};
+  HloUse add_1_idx1_use{FindInstruction(module.get(), "add.1.idx1"), 1, {}};
+  HloUse add_0_idx2_use{FindInstruction(module.get(), "add.0.idx2"), 1, {}};
+  HloUse add_1_idx2_use{FindInstruction(module.get(), "add.1.idx2"), 1, {}};
+  HloUse add_0_idx3_use{FindInstruction(module.get(), "add.0.idx3"), 1, {}};
+  HloUse add_1_idx3_use{FindInstruction(module.get(), "add.1.idx3"), 1, {}};
+  HloUse add_0_idx4_use{FindInstruction(module.get(), "add.0.idx4"), 1, {}};
+  HloUse add_1_idx4_use{FindInstruction(module.get(), "add.1.idx4"), 1, {}};
+  HloUse add_0_idx5_use{FindInstruction(module.get(), "add.0.idx5"), 1, {}};
+  HloUse add_1_idx5_use{FindInstruction(module.get(), "add.1.idx5"), 1, {}};
+  HloUse add_0_idx6_use{FindInstruction(module.get(), "add.0.idx6"), 1, {}};
+  HloUse add_1_idx6_use{FindInstruction(module.get(), "add.1.idx6"), 1, {}};
+  memory_space_options.buffer_colorings = {
+      {add_0_idx0_use, kAlternateMemorySpace},
+      {add_1_idx0_use, kAlternateMemorySpace},
+      {add_0_idx1_use, kAlternateMemorySpace},
+      {add_1_idx1_use, kAlternateMemorySpace},
+      {add_0_idx2_use, kAlternateMemorySpace},
+      {add_1_idx2_use, kAlternateMemorySpace},
+      {add_0_idx3_use, kAlternateMemorySpace},
+      {add_1_idx3_use, kAlternateMemorySpace},
+      {add_0_idx4_use, kAlternateMemorySpace},
+      {add_1_idx4_use, kAlternateMemorySpace},
+      {add_0_idx5_use, kAlternateMemorySpace},
+      {add_1_idx5_use, kAlternateMemorySpace},
+      {add_0_idx6_use, kAlternateMemorySpace},
+      {add_1_idx6_use, kAlternateMemorySpace}};
+  XLA_VLOG_LINES(1, "Before MSA: \n" + module->ToString());
+
+  AssignMemorySpaceUsingCostAnalysis(module.get(),
+                                     std::move(memory_space_options));
+  XLA_VLOG_LINES(1, "After MSA: \n" + module->ToString());
+  std::vector<std::string> alternate_memory_uses = {
+      "add.0.idx0", "add.1.idx0", "add.0.idx1", "add.1.idx1", "add.0.idx2",
+      "add.1.idx2", "add.0.idx3", "add.1.idx3", "add.0.idx4", "add.1.idx4",
+      "add.0.idx5", "add.1.idx5", "add.0.idx6", "add.1.idx6"};
+  CheckOperandOpcodeAndMemorySpaceForInstructionNames(
+      /*module=*/module.get(), /*instruction_names=*/alternate_memory_uses,
+      /*operand_index=*/1, /*operand_opcode=*/HloOpcode::kCopyDone,
+      /*operand_memory_space=*/kAlternateMemorySpace);
+}
+
 TEST_F(MemorySpaceAssignmentTest,
        ScopedAllocationAccountingWhenInstructionsAreRemoved) {
   absl::string_view hlo_string = R"(

From dd10786acdfd2875771b3e3a37fd80f22f4a214b Mon Sep 17 00:00:00 2001
From: Niklas Vangerow <nikv@google.com>
Date: Mon, 22 Dec 2025 15:58:02 -0800
Subject: [PATCH 683/753] Migrate conditional_test to PjRt.

PiperOrigin-RevId: 847911726
---
 third_party/xla/xla/tests/BUILD               |   8 +-
 third_party/xla/xla/tests/conditional_test.cc | 364 ++++++++++--------
 2 files changed, 217 insertions(+), 155 deletions(-)

diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index 88057624154aa1..a53c743d0abd52 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -801,9 +801,12 @@ xla_test(
     name = "conditional_test",
     srcs = ["conditional_test.cc"],
     shard_count = 2,
+    tags = ["test_migrated_to_hlo_runner_pjrt"],
     deps = [
         ":client_library_test_runner_mixin",
-        ":hlo_test_base",
+        ":hlo_pjrt_interpreter_reference_mixin",
+        ":hlo_pjrt_test_base",
+        ":literal_test_util",
         ":xla_internal_test_main",  # fixdeps: keep
         "//xla:array2d",
         "//xla:error_spec",
@@ -812,11 +815,14 @@ xla_test(
         "//xla:shape_util",
         "//xla/hlo/builder:xla_builder",
         "//xla/hlo/builder:xla_computation",
+        "//xla/hlo/ir:hlo",
         "//xla/hlo/testlib:test_helpers",
+        "//xla/service:hlo_runner_interface",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:statusor",
         "//xla/tsl/platform:test",
         "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:status_matchers",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/third_party/xla/xla/tests/conditional_test.cc b/third_party/xla/xla/tests/conditional_test.cc
index 157c89c4241f4a..c293e12186e7d4 100644
--- a/third_party/xla/xla/tests/conditional_test.cc
+++ b/third_party/xla/xla/tests/conditional_test.cc
@@ -13,24 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <array>
 #include <cstdint>
+#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
 
 #include "absl/log/check.h"
+#include "absl/status/status_matchers.h"
 #include "absl/strings/str_cat.h"
 #include "xla/array2d.h"
 #include "xla/error_spec.h"
 #include "xla/hlo/builder/xla_builder.h"
 #include "xla/hlo/builder/xla_computation.h"
+#include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/testlib/test_helpers.h"
 #include "xla/literal.h"
 #include "xla/literal_util.h"
+#include "xla/service/hlo_runner_interface.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/tests/client_library_test_runner_mixin.h"
-#include "xla/tests/hlo_test_base.h"
+#include "xla/tests/hlo_pjrt_interpreter_reference_mixin.h"
+#include "xla/tests/hlo_pjrt_test_base.h"
+#include "xla/tests/literal_test_util.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/tsl/platform/test.h"
@@ -39,12 +46,17 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::absl_testing::IsOkAndHolds;
+
 constexpr ErrorSpec kErrorSpec{0.001};
 
-class ConditionalOpTest : public ClientLibraryTestRunnerMixin<HloTestBase> {
+class ConditionalOpTest
+    : public ClientLibraryTestRunnerMixin<
+          HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>> {
  protected:
   void SetUp() override {
-    ClientLibraryTestRunnerMixin<HloTestBase>::SetUp();
+    ClientLibraryTestRunnerMixin<
+        HloPjRtInterpreterReferenceMixin<HloPjRtTestBase>>::SetUp();
     mutable_debug_options()->set_xla_test_add_command_buffer_mode(true);
   }
 
@@ -212,31 +224,37 @@ TEST_F(ConditionalOpTest, Parameters0) {
 
 // Test branch computations that do not take any parameters.
 TEST_P(CaseOpTest, Parameters0) {
-  int num_branches = GetParam();
+  const int num_branches = GetParam();
+
+  XlaBuilder builder(TestName());
+  const XlaOp branch_index =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(S32, {}), "branch_index_arg");
+  auto operand = Tuple(&builder, {});
+  std::vector<XlaOp> operands(num_branches, operand);
+  std::vector<XlaComputation> branches;
+  branches.reserve(num_branches);
+  std::vector<const XlaComputation*> branches_p(num_branches);
+  for (int i = 0; i < num_branches; ++i) {
+    branches.push_back(CreateR0ConstantComputation(static_cast<float>(i) * 10));
+    branches_p[i] = &branches[i];
+  }
+  Conditional(branch_index, branches_p, operands);
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       HloModuleFromXlaBuilder(&builder, execution_options()));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<OpaqueExecutable> executable,
+                       test_runner().CreateExecutable(std::move(module),
+                                                      /*run_hlo_passes=*/true));
   for (int bi = -1; bi <= num_branches; ++bi) {
     SCOPED_TRACE(bi);
-    XlaBuilder builder(TestName());
-    XlaOp branch_index;
-    auto branch_index_arg = CreateR0Parameter<int32_t>(
-        bi, 0, "branch_index_arg", &builder, &branch_index);
-    auto operand = Tuple(&builder, {});
-
-    std::vector<XlaOp> operands(num_branches, operand);
-    std::vector<XlaComputation> branches;
-    branches.reserve(num_branches);
-    std::vector<const XlaComputation*> branches_p(num_branches);
-    for (int i = 0; i < num_branches; ++i) {
-      branches.emplace_back(
-          CreateR0ConstantComputation(static_cast<float>(i) * 10));
-      branches_p[i] = &branches[i];
-    }
-    Conditional(branch_index, branches_p, operands);
-
-    float expected = 10 * static_cast<float>((bi < 0 || bi >= num_branches)
-                                                 ? num_branches - 1
-                                                 : bi);
-    ComputeAndCompareR0<float>(&builder, expected, {&branch_index_arg},
-                               kErrorSpec);
+    const Literal expected = LiteralUtil::CreateR0<float>(
+        10 * static_cast<float>(
+                 (bi < 0 || bi >= num_branches) ? num_branches - 1 : bi));
+    const Literal branch_index_arg = LiteralUtil::CreateR0<int32_t>(bi);
+    ASSERT_OK_AND_ASSIGN(const Literal result,
+                         test_runner().ExecuteWithExecutable(
+                             executable.get(), {&branch_index_arg}));
+    EXPECT_TRUE(LiteralTestUtil::Near(expected, result, kErrorSpec));
   }
 }
 
@@ -255,40 +273,45 @@ TEST_F(ConditionalOpTest, Parameters1) {
 
 // Test branch computations that take in 1 parameter.
 TEST_P(CaseOpTest, Parameters1) {
-  int num_branches = GetParam();
+  const int num_branches = GetParam();
+
+  XlaBuilder builder(TestName());
+  const XlaOp branch_index =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(S32, {}), "branch_index_arg");
+  std::vector<XlaComputation> branches;
+  branches.reserve(num_branches);
+  std::vector<const XlaComputation*> branches_p(num_branches);
+  std::vector<XlaOp> operands;
+  operands.reserve(num_branches);
+  std::vector<Literal> expecteds(num_branches);
+  for (int i = 0; i < num_branches; ++i) {
+    std::unique_ptr<XlaBuilder> sb =
+        builder.CreateSubBuilder(absl::StrCat("branch_", i));
+    Add(ConstantR0<float>(sb.get(), static_cast<float>(i)),
+        Parameter(sb.get(), 0, r0f32_, "p0"));
+    branches.push_back(sb->BuildAndNoteError());
+    branches_p[i] = &branches[i];
+    const float fi = static_cast<float>(i);
+    operands.push_back(ConstantR0<float>(&builder, 10 * fi + 7));
+    expecteds[i] = LiteralUtil::CreateR0<float>(10 * fi + 7 + fi);
+  }
+  Conditional(branch_index, branches_p, operands);
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       HloModuleFromXlaBuilder(&builder, execution_options()));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<OpaqueExecutable> executable,
+                       test_runner().CreateExecutable(std::move(module),
+                                                      /*run_hlo_passes=*/true));
   for (int bi = -1; bi <= num_branches; ++bi) {
     SCOPED_TRACE(bi);
-    XlaBuilder builder(TestName());
-    XlaOp branch_index;
-    auto branch_index_arg = CreateR0Parameter<int32_t>(
-        bi, 0, "branch_index_arg", &builder, &branch_index);
-
-    auto make_branch = [&builder, this](int i) {
-      auto sb = builder.CreateSubBuilder(absl::StrCat("branch_", i));
-      Add(ConstantR0<float>(sb.get(), static_cast<float>(i)),
-          Parameter(sb.get(), 0, r0f32_, "p0"));
-      return sb->BuildAndNoteError();
-    };
-    std::vector<XlaComputation> branches;
-    branches.reserve(num_branches);
-    std::vector<const XlaComputation*> branches_p(num_branches);
-    std::vector<XlaOp> operands;
-    operands.reserve(num_branches);
-    std::vector<float> expecteds(num_branches);
-    for (int i = 0; i < num_branches; ++i) {
-      branches.emplace_back(make_branch(i));
-      branches_p[i] = &branches[i];
-      auto fi = static_cast<float>(i);
-      operands.emplace_back(ConstantR0<float>(&builder, 10 * fi + 7));
-      expecteds[i] = 10 * fi + 7 + fi;
-    }
-
-    Conditional(branch_index, branches_p, operands);
-    float expected = (bi < 0 || bi >= num_branches)
-                         ? expecteds[num_branches - 1]
-                         : expecteds[bi];
-    ComputeAndCompareR0<float>(&builder, expected, {&branch_index_arg},
-                               kErrorSpec);
+    const Literal& expected = (bi < 0 || bi >= num_branches)
+                                  ? expecteds[num_branches - 1]
+                                  : expecteds[bi];
+    const Literal branch_index_arg = LiteralUtil::CreateR0<int32_t>(bi);
+    ASSERT_OK_AND_ASSIGN(const Literal result,
+                         test_runner().ExecuteWithExecutable(
+                             executable.get(), {&branch_index_arg}));
+    EXPECT_TRUE(LiteralTestUtil::Near(expected, result, kErrorSpec));
   }
 }
 
@@ -428,38 +451,46 @@ TEST_F(ConditionalOpTest, Parameters2ArrayTrueBranch) {
 
 // Test branch computations that take in 2 array parameters.
 TEST_P(CaseOpTest, Parameters2Array) {
-  int num_branches = GetParam();
+  const int num_branches = GetParam();
+
+  XlaBuilder builder(TestName());
+  const XlaOp branch_index =
+      Parameter(&builder, 0, ShapeUtil::MakeShape(S32, {}), "branch_index_arg");
+  const XlaOp operand1 = ConstantR1<float>(&builder, {24.0f, 56.0f});
+  const XlaOp operand2 = ConstantR1<float>(&builder, {10.0f, 11.0f});
+  const XlaOp operands = Tuple(&builder, {operand1, operand2});
+  std::vector<XlaComputation> branches;
+  branches.reserve(num_branches);
+  std::vector<const XlaComputation*> branches_p(num_branches);
+  for (int i = 0; i < num_branches; ++i) {
+    std::unique_ptr<XlaBuilder> sb =
+        builder.CreateSubBuilder(absl::StrCat("branch_", i));
+    const XlaOp p = Parameter(sb.get(), 0, tuple_2_r1s2f32_, "p0");
+    Add(Mul(ConstantR0<float>(sb.get(), static_cast<float>(i)),
+            GetTupleElement(p, 0)),
+        GetTupleElement(p, 1));
+    branches.push_back(sb->BuildAndNoteError());
+    branches_p[i] = &branches[i];
+  }
+  Conditional(branch_index, branches_p,
+              std::vector<XlaOp>(num_branches, operands));
+
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       HloModuleFromXlaBuilder(&builder, execution_options()));
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<OpaqueExecutable> executable,
+                       test_runner().CreateExecutable(std::move(module),
+                                                      /*run_hlo_passes=*/true));
   for (int bi = -1; bi <= num_branches; ++bi) {
     SCOPED_TRACE(bi);
-    XlaBuilder builder(TestName());
-    XlaOp branch_index;
-    auto branch_index_arg =
-        CreateR0Parameter<int32_t>(bi, 0, "pred", &builder, &branch_index);
-    auto operand1 = ConstantR1<float>(&builder, {24.0f, 56.0f});
-    auto operand2 = ConstantR1<float>(&builder, {10.0f, 11.0f});
-    auto operands = Tuple(&builder, {operand1, operand2});
-    auto make_branch = [&builder, this](int i) {
-      auto sb = builder.CreateSubBuilder(absl::StrCat("branch_", i));
-      auto p = Parameter(sb.get(), 0, tuple_2_r1s2f32_, "p0");
-      Add(Mul(ConstantR0<float>(sb.get(), static_cast<float>(i)),
-              GetTupleElement(p, 0)),
-          GetTupleElement(p, 1));
-      return sb->BuildAndNoteError();
-    };
-    std::vector<XlaComputation> branches;
-    branches.reserve(num_branches);
-    std::vector<const XlaComputation*> branches_p(num_branches);
-    for (int i = 0; i < num_branches; ++i) {
-      branches.emplace_back(make_branch(i));
-      branches_p[i] = &branches[i];
-    }
-    Conditional(branch_index, branches_p,
-                std::vector<XlaOp>(num_branches, operands));
-    auto modified_bi = static_cast<float>(
+    const Literal branch_index_arg = LiteralUtil::CreateR0<int32_t>(bi);
+    const float modified_bi = static_cast<float>(
         (bi < 0 || bi >= num_branches) ? num_branches - 1 : bi);
-    ComputeAndCompareR1<float>(
-        &builder, {24.0f * modified_bi + 10, 56.0f * modified_bi + 11},
-        {&branch_index_arg}, kErrorSpec);
+    const Literal expected = LiteralUtil::CreateR1<float>(
+        {24.0f * modified_bi + 10, 56.0f * modified_bi + 11});
+    ASSERT_OK_AND_ASSIGN(const Literal result,
+                         test_runner().ExecuteWithExecutable(
+                             executable.get(), {&branch_index_arg}));
+    EXPECT_TRUE(LiteralTestUtil::Near(expected, result, kErrorSpec));
   }
 }
 
@@ -561,48 +592,53 @@ TEST_F(ConditionalOpTest, ReturnNestedTuple) {
   XlaBuilder true_builder(TestName() + ".true");
   {
     Parameter(&true_builder, 0, empty_tuple_, "tuple");
-    auto true_constant1 = ConstantR0<float>(&true_builder, 12.2f);
-    auto true_constant2 = ConstantR1<float>(&true_builder, {12.8f, 14.6f});
-    auto true_constant3 = ConstantR1<float>(&true_builder, {25.4f, 29.8f});
-    auto true_constant4 = ConstantR0<float>(&true_builder, 35.6f);
+    const XlaOp true_constant1 = ConstantR0<float>(&true_builder, 12.2f);
+    const XlaOp true_constant2 =
+        ConstantR1<float>(&true_builder, {12.8f, 14.6f});
+    const XlaOp true_constant3 =
+        ConstantR1<float>(&true_builder, {25.4f, 29.8f});
+    const XlaOp true_constant4 = ConstantR0<float>(&true_builder, 35.6f);
     Tuple(&true_builder,
           {Tuple(&true_builder, {true_constant1, true_constant2}),
            Tuple(&true_builder, {true_constant3, true_constant4})});
   }
-  auto true_builder_result = true_builder.Build();
-  EXPECT_IS_OK(true_builder_result.status());
+  ASSERT_OK_AND_ASSIGN(XlaComputation true_comp, true_builder.Build());
 
   XlaBuilder false_builder(TestName() + ".false");
   {
     Parameter(&false_builder, 0, empty_tuple_, "tuple");
-    auto false_constant1 = ConstantR0<float>(&false_builder, 46.6f);
-    auto false_constant2 = ConstantR1<float>(&false_builder, {54.4f, 58.4f});
-    auto false_constant3 = ConstantR1<float>(&false_builder, {62.1f, 67.4f});
-    auto false_constant4 = ConstantR0<float>(&false_builder, 9.3f);
+    const XlaOp false_constant1 = ConstantR0<float>(&false_builder, 46.6f);
+    const XlaOp false_constant2 =
+        ConstantR1<float>(&false_builder, {54.4f, 58.4f});
+    const XlaOp false_constant3 =
+        ConstantR1<float>(&false_builder, {62.1f, 67.4f});
+    const XlaOp false_constant4 = ConstantR0<float>(&false_builder, 9.3f);
     Tuple(&false_builder,
           {Tuple(&false_builder, {false_constant1, false_constant2}),
            Tuple(&false_builder, {false_constant3, false_constant4})});
   }
-  auto false_builder_result = false_builder.Build();
-  EXPECT_IS_OK(false_builder_result.status());
+  ASSERT_OK_AND_ASSIGN(XlaComputation false_comp, false_builder.Build());
 
   XlaBuilder builder(TestName());
   XlaOp pred;
-  auto pred_arg = CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
-  auto operands = Tuple(&builder, {});
-  Conditional(pred, operands, std::move(true_builder_result).value(), operands,
-              std::move(false_builder_result).value());
+  const Literal pred_arg =
+      CreateR0Parameter<bool>(false, 0, "pred", &builder, &pred);
+  const XlaOp operands = Tuple(&builder, {});
+  const XlaOp result = Conditional(pred, operands, std::move(true_comp),
+                                   operands, std::move(false_comp));
+  // Flatten nested tuple for PjRt.
+  const XlaOp e0 = GetTupleElement(result, 0);
+  const XlaOp e1 = GetTupleElement(result, 1);
+  Tuple(&builder, {GetTupleElement(e0, 0), GetTupleElement(e0, 1),
+                   GetTupleElement(e1, 0), GetTupleElement(e1, 1)});
 
-  ComputeAndCompareLiteral(
-      &builder,
-      LiteralUtil::MakeTupleFromSlices(
-          {LiteralUtil::MakeTupleFromSlices(
-               {LiteralUtil::CreateR0<float>(46.6f),
-                LiteralUtil::CreateR1<float>({54.4f, 58.4f})}),
-           LiteralUtil::MakeTupleFromSlices(
-               {LiteralUtil::CreateR1<float>({62.1f, 67.4f}),
-                LiteralUtil::CreateR0<float>(9.3f)})}),
-      {&pred_arg}, kErrorSpec);
+  ComputeAndCompareLiteral(&builder,
+                           LiteralUtil::MakeTupleFromSlices(
+                               {LiteralUtil::CreateR0<float>(46.6f),
+                                LiteralUtil::CreateR1<float>({54.4f, 58.4f}),
+                                LiteralUtil::CreateR1<float>({62.1f, 67.4f}),
+                                LiteralUtil::CreateR0<float>(9.3f)}),
+                           {&pred_arg}, kErrorSpec);
 }
 
 // Test conditional that takes in scalar operands in the form of external
@@ -751,21 +787,31 @@ TEST_F(ConditionalOpTest, SwappedInputsInSequentialConditionals) {
     main = builder.Build().value();
   }
 
-  auto test_swap = [&](float a, float b) {
-    XlaBuilder builder(TestName());
-    XlaOp x, y;
-    auto x_arg = CreateR0Parameter<float>(a, 0, "x", &builder, &x);
-    auto y_arg = CreateR0Parameter<float>(b, 1, "y", &builder, &y);
-    auto tuple_operand = Tuple(&builder, {x, y});
-    Call(&builder, main, {tuple_operand});
-    ComputeAndCompareLiteral(
-        &builder,
-        LiteralUtil::MakeTupleFromSlices(
-            {LiteralUtil::CreateR0<float>(a), LiteralUtil::CreateR0<float>(b)}),
-        {&x_arg, &y_arg}, kErrorSpec);
+  XlaBuilder builder(TestName());
+  const XlaOp x = Parameter(&builder, 0, r0f32_, "x");
+  const XlaOp y = Parameter(&builder, 1, r0f32_, "y");
+  const XlaOp tuple_operand = Tuple(&builder, {x, y});
+  Call(&builder, main, {tuple_operand});
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       HloModuleFromXlaBuilder(&builder, execution_options()));
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<OpaqueExecutable> executable,
+      CreateExecutable(std::move(module), /*run_hlo_passes=*/true));
+
+  const auto test_swap =
+      [&, this](float a,
+                float b) -> absl::StatusOr<::testing::AssertionResult> {
+    const Literal x_arg = LiteralUtil::CreateR0<float>(a);
+    const Literal y_arg = LiteralUtil::CreateR0<float>(b);
+    const Literal expected = LiteralUtil::MakeTupleFromSlices(
+        {LiteralUtil::CreateR0<float>(a), LiteralUtil::CreateR0<float>(b)});
+    ASSIGN_OR_RETURN(const Literal result,
+                     test_runner().ExecuteWithExecutable(executable.get(),
+                                                         {&x_arg, &y_arg}));
+    return LiteralTestUtil::Near(expected, result, kErrorSpec);
   };
-  test_swap(3.11f, 9.4f);
-  test_swap(11.24f, 5.55f);
+  EXPECT_THAT(test_swap(3.11f, 9.4f), IsOkAndHolds(true));
+  EXPECT_THAT(test_swap(11.24f, 5.55f), IsOkAndHolds(true));
 }
 
 // Test conditional that duplicates tuple elements in the then and else
@@ -792,35 +838,45 @@ TEST_F(ConditionalOpTest, DuplicateElementsConditional) {
     else_comp = builder.Build().value();
   }
 
-  {
-    // Pred is true case.
-    std::vector<Literal> args;
-    args.push_back(LiteralUtil::MakeTupleFromSlices(
-        {LiteralUtil::CreateR0<int32_t>(123),
-         LiteralUtil::CreateR0<int32_t>(-42)}));
-    args.push_back(LiteralUtil::CreateR0<bool>(true));
-    XlaBuilder builder(TestName() + ".main");
-    auto p = Parameter(&builder, 0, tuple2, "p0");
-    auto p_pred = Parameter(&builder, 1, ShapeUtil::MakeShape(PRED, {}), "p1");
-    Conditional(p_pred, p, then_comp, p, else_comp);
-    ComputeAndCompare(&builder, {&args[0], &args[1]});
-  }
-  {
-    // Pred is false case.
-    std::vector<Literal> args;
-    args.push_back(LiteralUtil::MakeTupleFromSlices(
-        {LiteralUtil::CreateR0<int32_t>(123),
-         LiteralUtil::CreateR0<int32_t>(-42)}));
-    args.push_back(LiteralUtil::CreateR0<bool>(false));
-    XlaBuilder builder(TestName() + ".main");
-    auto p = Parameter(&builder, 0, tuple2, "p0");
-    auto p_pred = Parameter(&builder, 1, ShapeUtil::MakeShape(PRED, {}), "p1");
-    Conditional(p_pred, p, then_comp, p, else_comp);
-    ComputeAndCompare(&builder, {&args[0], &args[1]});
-  }
+  XlaBuilder builder(TestName() + ".main");
+  auto p0 = Parameter(&builder, 0, scalar, "p0.0");
+  auto p1 = Parameter(&builder, 1, scalar, "p0.1");
+  auto p = Tuple(&builder, {p0, p1});
+  auto p_pred = Parameter(&builder, 2, ShapeUtil::MakeShape(PRED, {}), "p1");
+  Conditional(p_pred, p, then_comp, p, else_comp);
+  ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                       HloModuleFromXlaBuilder(&builder, execution_options()));
+
+  const std::array<Literal, 4> args = {
+      LiteralUtil::CreateR0<int32_t>(123), LiteralUtil::CreateR0<int32_t>(-42),
+      LiteralUtil::CreateR0<bool>(true), LiteralUtil::CreateR0<bool>(false)};
+  const std::array<const Literal*, 3> true_args = {&args[0], &args[1],
+                                                   &args[2]};
+  const std::array<const Literal*, 3> false_args = {&args[0], &args[1],
+                                                    &args[3]};
+
+  // Compute reference values. Because this test is not parameterized, we need
+  // to manually invoke the test runner and reference runner.
+  ASSERT_OK_AND_ASSIGN(Literal true_reference,
+                       reference_runner().Execute(module->Clone(), true_args,
+                                                  /*run_hlo_passes=*/true));
+  ASSERT_OK_AND_ASSIGN(Literal false_reference,
+                       reference_runner().Execute(module->Clone(), false_args,
+                                                  /*run_hlo_passes=*/true));
+
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<OpaqueExecutable> executable,
+      CreateExecutable(std::move(module), /*run_hlo_passes=*/true));
+  ASSERT_OK_AND_ASSIGN(Literal true_result, test_runner().ExecuteWithExecutable(
+                                                executable.get(), true_args));
+  ASSERT_OK_AND_ASSIGN(
+      Literal false_result,
+      test_runner().ExecuteWithExecutable(executable.get(), false_args));
+  EXPECT_TRUE(LiteralTestUtil::Equal(true_reference, true_result));
+  EXPECT_TRUE(LiteralTestUtil::Equal(false_reference, false_result));
 }
 
-using ConditionalOpHloTest = HloTestBase;
+using ConditionalOpHloTest = HloPjRtTestBase;
 
 TEST_F(ConditionalOpHloTest, ParallelExecution) {
   // Test conditional works when an executable is executed in parallel.

From 6160600087fb23d583897b0300a461526e27e026 Mon Sep 17 00:00:00 2001
From: Quoc Truong <quoct@google.com>
Date: Mon, 22 Dec 2025 17:19:50 -0800
Subject: [PATCH 684/753] Remove ml_build_arm64 Dockerfile

PiperOrigin-RevId: 847933808
---
 .../containers/ml_build_arm64/Dockerfile      |   75 -
 .../containers/ml_build_arm64/apt.conf        |   15 -
 .../builder.devtoolset/build_devtoolset.sh    |  157 ---
 .../builder.devtoolset/fixlinks_aarch64.sh    |   28 -
 .../builder.devtoolset/gcc9-fixups.patch      |  270 ----
 .../builder.devtoolset/rpm-patch.sh           |   28 -
 .../builder.devtoolset/stringop_trunc.patch   | 1204 -----------------
 .../ml_build_arm64/builder.packages.txt       |   47 -
 .../containers/ml_build_arm64/devel.bashrc    |   26 -
 .../containers/ml_build_arm64/ld.so.conf      |   18 -
 .../ml_build_arm64/requirements.txt           |    7 -
 .../ml_build_arm64/setup.packages.sh          |   28 -
 .../containers/ml_build_arm64/setup.python.sh |  108 --
 .../ml_build_arm64/setup.sources.sh           |   45 -
 14 files changed, 2056 deletions(-)
 delete mode 100644 ci/official/containers/ml_build_arm64/Dockerfile
 delete mode 100644 ci/official/containers/ml_build_arm64/apt.conf
 delete mode 100755 ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh
 delete mode 100755 ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh
 delete mode 100644 ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch
 delete mode 100755 ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh
 delete mode 100644 ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch
 delete mode 100644 ci/official/containers/ml_build_arm64/builder.packages.txt
 delete mode 100644 ci/official/containers/ml_build_arm64/devel.bashrc
 delete mode 100644 ci/official/containers/ml_build_arm64/ld.so.conf
 delete mode 100644 ci/official/containers/ml_build_arm64/requirements.txt
 delete mode 100755 ci/official/containers/ml_build_arm64/setup.packages.sh
 delete mode 100755 ci/official/containers/ml_build_arm64/setup.python.sh
 delete mode 100755 ci/official/containers/ml_build_arm64/setup.sources.sh

diff --git a/ci/official/containers/ml_build_arm64/Dockerfile b/ci/official/containers/ml_build_arm64/Dockerfile
deleted file mode 100644
index 379162d0d1af76..00000000000000
--- a/ci/official/containers/ml_build_arm64/Dockerfile
+++ /dev/null
@@ -1,75 +0,0 @@
-################################################################################
-FROM ubuntu:20.04@sha256:8e5c4f0285ecbb4ead070431d29b576a530d3166df73ec44affc1cd27555141b as devel
-################################################################################
-
-# Install devtoolset build dependencies
-COPY setup.sources.sh /setup.sources.sh
-COPY setup.packages.sh /setup.packages.sh
-COPY builder.packages.txt /builder.packages.txt
-
-RUN /setup.sources.sh && /setup.packages.sh /builder.packages.txt
-
-RUN update-ca-certificates
-# Install devtoolset-9 in /dt10 with glibc 2.17 and libstdc++ 4.8, for building
-# manylinux2014-compatible packages.
-COPY builder.devtoolset/fixlinks_aarch64.sh /fixlinks.sh
-COPY builder.devtoolset/rpm-patch.sh /rpm-patch.sh
-COPY builder.devtoolset/build_devtoolset.sh /build_devtoolset.sh
-COPY builder.devtoolset/gcc9-fixups.patch /gcc9-fixups.patch
-COPY builder.devtoolset/stringop_trunc.patch /stringop_trunc.patch
-
-RUN /build_devtoolset.sh devtoolset-10 /dt10
-
-# Build later version of patchelf that is not so buggy
-RUN wget https://github.com/NixOS/patchelf/releases/download/0.18.0/patchelf-0.18.0-aarch64.tar.gz && tar -zxvf patchelf-0.18.0-aarch64.tar.gz -C /usr && rm -rf patchelf-0.18.0-aarch64.tar.gz
-
-RUN curl https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-cli-linux-arm.tar.gz | \
-    tar zxf - google-cloud-sdk && \
-    google-cloud-sdk/install.sh --quiet
-ENV PATH="$PATH:/google-cloud-sdk/bin/"
-
-# Install various tools.
-# - bats: bash unit testing framework
-#         NOTE: v1.6.0 seems to have a bug that made "git" in setup_file break
-# - bazelisk: always use the correct bazel version
-# - buildifier: clean bazel build depshttps://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-arm64
-# - buildozer: clean bazel build deps
-RUN git clone --branch v1.11.0 https://github.com/bats-core/bats-core.git && bats-core/install.sh /usr/local && rm -rf bats-core
-RUN wget https://github.com/bazelbuild/bazelisk/releases/download/v1.21.0/bazelisk-linux-arm64 -O /usr/local/bin/bazel && chmod +x /usr/local/bin/bazel
-RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildifier-linux-arm64 -O /usr/local/bin/buildifier && chmod +x /usr/local/bin/buildifier
-RUN wget https://github.com/bazelbuild/buildtools/releases/download/v7.3.1/buildozer-linux-arm64 -O /usr/local/bin/buildozer && chmod +x /usr/local/bin/buildozer
-
-RUN groupadd -g 1001 buildslave && useradd -m -u 1001 -g buildslave buildslave
-RUN mkdir -p /tf/venv
-RUN chown -R buildslave:buildslave /tf
-
-# All lines past this point are reset when $CACHEBUSTER is set. We need this
-# for Python specifically because we install some nightly packages which are
-# likely to change daily.
-ARG CACHEBUSTER=0
-RUN echo $CACHEBUSTER
-
-# Setup build and environment
-COPY devel.bashrc /root/.bashrc
-COPY ld.so.conf /dt10/etc/
-
-# Make sure clang is on the path
-RUN ln -s /usr/lib/llvm-18/bin/clang /usr/bin/clang
-
-# Setup JAX Python environment.
-COPY requirements.txt /requirements.txt
-COPY setup.python.sh /setup.python.sh
-RUN /setup.python.sh python3.9 requirements.txt
-RUN /setup.python.sh python3.10 requirements.txt
-RUN /setup.python.sh python3.11 requirements.txt
-RUN /setup.python.sh python3.12 requirements.txt
-RUN /setup.python.sh python3.13 requirements.txt
-# python3.13-nogil is a free-threaded build of python3.13.
-RUN /setup.python.sh python3.13-nogil requirements.txt
-RUN /setup.python.sh python3.14 requirements.txt
-RUN /setup.python.sh python3.14-nogil requirements.txt
-
-# Python commands by default run under 3.11
-RUN ln -sf /usr/bin/python3.11 /usr/bin/python3
-RUN ln -sf /usr/bin/python3.11 /usr/bin/python
-RUN ln -sf /usr/lib/python3.11 /usr/lib/tf_python
diff --git a/ci/official/containers/ml_build_arm64/apt.conf b/ci/official/containers/ml_build_arm64/apt.conf
deleted file mode 100644
index ea7b56091e5fd3..00000000000000
--- a/ci/official/containers/ml_build_arm64/apt.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-/* Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-APT::Default-Release "focal";
diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh b/ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh
deleted file mode 100755
index d59923d405a8c8..00000000000000
--- a/ci/official/containers/ml_build_arm64/builder.devtoolset/build_devtoolset.sh
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/bin/bash -eu
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Builds a devtoolset cross-compiler targeting manylinux2014 (glibc 2.17 / libstdc++ 4.8).
-
-VERSION="$1"
-TARGET="$2"
-
-case "${VERSION}" in
-devtoolset-9)
-  LIBSTDCXX_VERSION="6.0.28"
-  LIBSTDCXX_ABI="new"
-  ;;
-devtoolset-10)
-  LIBSTDCXX_VERSION="6.0.28"
-  LIBSTDCXX_ABI="new"
-  ;;
-*)
-  echo "Usage: $0 {devtoolset-9|devtoolset-10} <target-directory> <arch>"
-  echo "Use 'devtoolset-9' to build a manylinux2014 compatible toolchain"
-  exit 1
-  ;;
-esac
-
-mkdir -p "${TARGET}"
-
-mkdir -p ${TARGET}/usr/include
-
-# Put the current kernel headers from ubuntu in place.
-ln -s "/usr/include/linux" "${TARGET}/usr/include/linux"
-ln -s "/usr/include/asm-generic" "${TARGET}/usr/include/asm-generic"
-ln -s "/usr/include/aarch64-linux-gnu/asm" "${TARGET}/usr/include/asm"
-
-# Download glibc's shared and development libraries based on the value of the
-# `VERSION` parameter.
-# Note: 'Templatizing' this and the other conditional branches would require
-# defining several variables (version, os, path) making it difficult to maintain
-# and extend for future modifications.
-mkdir -p glibc-src
-mkdir -p glibc-build
-cd glibc-src
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/os/Source/SPackages/glibc-2.17-317.el7.src.rpm"
-rpm2cpio "glibc-2.17-317.el7.src.rpm" |cpio -idmv
-tar -xvzf "glibc-2.17-c758a686.tar.gz" --strip 1
-tar -xvzf "glibc-2.17-c758a686-releng.tar.gz" --strip 1
-sed -i '/patch0060/d' glibc.spec
-/rpm-patch.sh "glibc.spec"
-rm -f "glibc-2.17-317.el7.src.rpm" "glibc-2.17-c758a686.tar.gz" "glibc-2.17-c758a686-releng.tar.gz"
-patch -p1 < /gcc9-fixups.patch
-patch -p1 < /stringop_trunc.patch
-cd ../glibc-build
-../glibc-src/configure --prefix=/usr --disable-werror --enable-obsolete-rpc --disable-profile
-make -j$(nproc)
-make install DESTDIR=${TARGET}
-cd ..
-
-# Symlinks in the binary distribution are set up for installation in /usr, we
-# need to fix up all the links to stay within /${TARGET}.
-/fixlinks.sh "/${TARGET}"
-
-# Patch to allow non-glibc 2.12 compatible builds to work.
-sed -i '54i#define TCP_USER_TIMEOUT 18' "/${TARGET}/usr/include/netinet/tcp.h"
-
-# Download specific version of libstdc++ shared library based on the value of
-# the `VERSION` parameter
-  # Download binary libstdc++ 4.8 shared library release
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "http://old-releases.ubuntu.com/ubuntu/pool/main/g/gcc-4.8/libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \
-    unar "libstdc++6_4.8.1-10ubuntu8_arm64.deb" && \
-    tar -C "${TARGET}" -xvzf "libstdc++6_4.8.1-10ubuntu8_arm64/data.tar.gz" "./usr/lib/aarch64-linux-gnu/libstdc++.so.6.0.18"  && \
-    rm -rf "libstdc++6_4.8.1-10ubuntu8_arm64.deb" "libstdc++6_4.8.1-10ubuntu8_arm64"
-
-mkdir -p "${TARGET}-src"
-cd "${TARGET}-src"
-
-# Build a devtoolset cross-compiler based on our glibc 2.12/glibc 2.17 sysroot setup.
-case "${VERSION}" in
-devtoolset-9)
-  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm"
-  rpm2cpio "devtoolset-9-gcc-9.3.1-2.2.el7.src.rpm" |cpio -idmv
-  tar -xvf "gcc-9.3.1-20200408.tar.xz" --strip 1
-  ;;
-devtoolset-10)
-  wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 "https://vault.centos.org/centos/7/sclo/Source/rh/devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm"
-  rpm2cpio "devtoolset-10-gcc-10.2.1-11.2.el7.src.rpm" |cpio -idmv
-  tar -xvf "gcc-10.2.1-20210130.tar.xz" --strip 1
-  ;;
-esac
-
-# Apply the devtoolset patches to gcc.
-/rpm-patch.sh "gcc.spec"
-
-./contrib/download_prerequisites
-
-mkdir -p "${TARGET}-build"
-cd "${TARGET}-build"
-
-"${TARGET}-src/configure" \
-      --prefix="${TARGET}/usr" \
-      --with-sysroot="/${TARGET}" \
-      --disable-bootstrap \
-      --disable-libmpx \
-      --enable-libsanitizer \
-      --disable-libunwind-exceptions \
-      --disable-libunwind-exceptions \
-      --disable-lto \
-      --disable-multilib \
-      --enable-__cxa_atexit \
-      --enable-gnu-indirect-function \
-      --enable-gnu-unique-object \
-      --enable-initfini-array \
-      --enable-languages="c,c++" \
-      --enable-linker-build-id \
-      --enable-plugin \
-      --enable-shared \
-      --enable-threads=posix \
-      --with-default-libstdcxx-abi=${LIBSTDCXX_ABI} \
-      --with-gcc-major-version-only \
-      --with-linker-hash-style="gnu" \
-      && \
-      make -j$(nproc) && \
-      make install
-
-
-# Create the devtoolset libstdc++ linkerscript that links dynamically against
-# the system libstdc++ 4.4 and provides all other symbols statically.
-# Note that the installation path for libstdc++ here is ${TARGET}/usr/lib64/
-mv "${TARGET}/usr/lib64/libstdc++.so.${LIBSTDCXX_VERSION}" \
-   "${TARGET}/usr/lib64/libstdc++.so.${LIBSTDCXX_VERSION}.backup"
-echo -e "OUTPUT_FORMAT(elf64-littleaarch64)\nINPUT ( libstdc++.so.6.0.18 -lstdc++_nonshared44 )" \
-   > "${TARGET}/usr/lib64/libstdc++.so.${LIBSTDCXX_VERSION}"
-cp "./aarch64-unknown-linux-gnu/libstdc++-v3/src/.libs/libstdc++_nonshared44.a" \
-   "${TARGET}/usr/lib64"
-
-
-# Link in architecture specific includes from the system; note that we cannot
-# link in the whole aarch64-linux-gnu folder, as otherwise we're overlaying
-# system gcc paths that we do not want to find.
-# TODO(klimek): Automate linking in all non-gcc / non-kernel include
-# directories.
-mkdir -p "${TARGET}/usr/include/aarch64-linux-gnu"
-PYTHON_VERSIONS=("python3.9" "python3.10" "python3.11" "python3.12")
-for v in "${PYTHON_VERSIONS[@]}"; do
-  ln -s "/usr/local/include/${v}" "${TARGET}/usr/include/aarch64-linux-gnu/${v}"
-done
diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh b/ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh
deleted file mode 100755
index 09a5f9854d42ef..00000000000000
--- a/ci/official/containers/ml_build_arm64/builder.devtoolset/fixlinks_aarch64.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Re-direct all links in $1 that are relative to be canonical
-
-BASE="$1"
-find "${BASE}" -type l | \
-  while read l ; do
-    if [[ "$(readlink "$l")" == \.\./* ]]; then
-      CANONICAL="$(readlink "$l")";
-      rm "$l";
-      ln -s "${CANONICAL}" "$l"
-    fi
-  done
-
diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch b/ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch
deleted file mode 100644
index 7b9bbf358ada74..00000000000000
--- a/ci/official/containers/ml_build_arm64/builder.devtoolset/gcc9-fixups.patch
+++ /dev/null
@@ -1,270 +0,0 @@
-diff --git a/iconv/gconv.h b/iconv/gconv.h
-index 3f9112e..8e60197 100644
---- a/iconv/gconv.h
-+++ b/iconv/gconv.h
-@@ -174,7 +174,7 @@ typedef struct __gconv_info
- {
-   size_t __nsteps;
-   struct __gconv_step *__steps;
--  __extension__ struct __gconv_step_data __data __flexarr;
-+  __extension__ struct __gconv_step_data __data[0];
- } *__gconv_t;
- 
- #endif /* gconv.h */
-diff --git a/include/libc-symbols.h b/include/libc-symbols.h
-index c555bf2..143b26d 100644
---- a/include/libc-symbols.h
-+++ b/include/libc-symbols.h
-@@ -107,6 +107,11 @@
- # endif
- #endif
- 
-+#ifndef __attribute_copy__
-+/* Provide an empty definition when cdefs.h is not included.  */
-+# define __attribute_copy__(arg)
-+#endif
-+
- #ifndef __ASSEMBLER__
- /* GCC understands weak symbols and aliases; use its interface where
-    possible, instead of embedded assembly language.  */
-@@ -114,7 +119,8 @@
- /* Define ALIASNAME as a strong alias for NAME.  */
- # define strong_alias(name, aliasname) _strong_alias(name, aliasname)
- # define _strong_alias(name, aliasname) \
--  extern __typeof (name) aliasname __attribute__ ((alias (#name)));
-+  extern __typeof (name) aliasname __attribute__ ((alias (#name))) \
-+    __attribute_copy__ (name);
- 
- /* This comes between the return type and function name in
-    a function definition to make that definition weak.  */
-@@ -125,14 +131,16 @@
-    If weak aliases are not available, this defines a strong alias.  */
- # define weak_alias(name, aliasname) _weak_alias (name, aliasname)
- # define _weak_alias(name, aliasname) \
--  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name)));
-+  extern __typeof (name) aliasname __attribute__ ((weak, alias (#name))) \
-+    __attribute_copy__ (name);
- 
- /* Same as WEAK_ALIAS, but mark symbol as hidden.  */
- # define weak_hidden_alias(name, aliasname) \
-   _weak_hidden_alias (name, aliasname)
- # define _weak_hidden_alias(name, aliasname) \
-   extern __typeof (name) aliasname \
--    __attribute__ ((weak, alias (#name), __visibility__ ("hidden")));
-+    __attribute__ ((weak, alias (#name), __visibility__ ("hidden"))) \
-+    __attribute_copy__ (name);
- 
- /* Declare SYMBOL as weak undefined symbol (resolved to 0 if not defined).  */
- # define weak_extern(symbol) _weak_extern (weak symbol)
-@@ -528,7 +536,8 @@ for linking")
- #  define __hidden_ver1(local, internal, name) \
-   extern __typeof (name) __EI_##name __asm__(__hidden_asmname (#internal)); \
-   extern __typeof (name) __EI_##name \
--	__attribute__((alias (__hidden_asmname (#local))))
-+    __attribute__((alias (__hidden_asmname (#local))))	\
-+    __attribute_copy__ (name)
- #  define hidden_ver(local, name)	__hidden_ver1(local, __GI_##name, name);
- #  define hidden_data_ver(local, name)	hidden_ver(local, name)
- #  define hidden_def(name)		__hidden_ver1(__GI_##name, name, name);
-@@ -541,7 +550,8 @@ for linking")
- #  define __hidden_nolink1(local, internal, name, version) \
-   __hidden_nolink2 (local, internal, name, version)
- #  define __hidden_nolink2(local, internal, name, version) \
--  extern __typeof (name) internal __attribute__ ((alias (#local))); \
-+  extern __typeof (name) internal __attribute__ ((alias (#local)))	\
-+    __attribute_copy__ (name);						\
-   __hidden_nolink3 (local, internal, #name "@" #version)
- #  define __hidden_nolink3(local, internal, vername) \
-   __asm__ (".symver " #internal ", " vername);
-diff --git a/locale/weightwc.h b/locale/weightwc.h
-index e966c03..22ab790 100644
---- a/locale/weightwc.h
-+++ b/locale/weightwc.h
-@@ -79,19 +79,19 @@ findidx (const wint_t **cpp, size_t len)
- 	    if (cp[cnt] != usrc[cnt])
- 	      break;
- 
--	  if (cnt < nhere - 1)
-+	  if (cnt < nhere - 1 || cnt == len)
- 	    {
- 	      cp += 2 * nhere;
- 	      continue;
- 	    }
- 
--	  if (cp[nhere - 1] > usrc[nhere -1])
-+	  if (cp[nhere - 1] > usrc[nhere - 1])
- 	    {
- 	      cp += 2 * nhere;
- 	      continue;
- 	    }
- 
--	  if (cp[2 * nhere - 1] < usrc[nhere -1])
-+	  if (cp[2 * nhere - 1] < usrc[nhere - 1])
- 	    {
- 	      cp += 2 * nhere;
- 	      continue;
-diff --git a/locale/xlocale.h b/locale/xlocale.h
-index 98c080b..843bd45 100644
---- a/locale/xlocale.h
-+++ b/locale/xlocale.h
-@@ -20,6 +20,9 @@
- #ifndef _XLOCALE_H
- #define _XLOCALE_H	1
- 
-+#ifndef _BITS_TYPES___LOCALE_T_H
-+#define _BITS_TYPES___LOCALE_T_H 1
-+
- /* Structure for reentrant locale using functions.  This is an
-    (almost) opaque type for the user level programs.  The file and
-    this data structure is not standardized.  Don't rely on it.  It can
-@@ -41,4 +44,6 @@ typedef struct __locale_struct
- /* POSIX 2008 makes locale_t official.  */
- typedef __locale_t locale_t;
- 
-+#endif /* bits/types/__locale_t.h */
-+
- #endif /* xlocale.h */
-diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
-index d1cb3dd..30482a1 100644
---- a/misc/sys/cdefs.h
-+++ b/misc/sys/cdefs.h
-@@ -423,4 +423,14 @@
- # endif
- #endif
- 
-+/* Undefine (also defined in libc-symbols.h).  */
-+#undef __attribute_copy__
-+#if __GNUC_PREREQ (9, 0)
-+/* Copies attributes from the declaration or type referenced by
-+   the argument.  */
-+# define __attribute_copy__(arg) __attribute__ ((__copy__ (arg)))
-+#else
-+# define __attribute_copy__(arg)
-+#endif
-+
- #endif	 /* sys/cdefs.h */
-diff --git a/stdlib/setenv.c b/stdlib/setenv.c
-index 45efe2e..06bfab0 100644
---- a/stdlib/setenv.c
-+++ b/stdlib/setenv.c
-@@ -319,6 +319,7 @@ unsetenv (const char *name)
- 
-   ep = __environ;
-   if (ep != NULL)
-+  {
-     while (*ep != NULL)
-       if (!strncmp (*ep, name, len) && (*ep)[len] == '=')
- 	{
-@@ -332,6 +333,7 @@ unsetenv (const char *name)
- 	}
-       else
- 	++ep;
-+  }
- 
-   UNLOCK;
- 
-diff --git a/support/Makefile b/support/Makefile
-index a253698..2f4e2a9 100644
---- a/support/Makefile
-+++ b/support/Makefile
-@@ -167,13 +167,6 @@ CFLAGS-support_paths.c = \
- 		-DINSTDIR_PATH=\"$(prefix)\" \
- 		-DLIBDIR_PATH=\"$(libdir)\"
- 
--ifeq (,$(CXX))
--LINKS_DSO_PROGRAM = links-dso-program-c
--else
--LINKS_DSO_PROGRAM = links-dso-program
--LDLIBS-links-dso-program = -lstdc++ -lgcc -lgcc_s $(libunwind)
--endif
--
- LDLIBS-test-container = $(libsupport)
- 
- others += test-container
-@@ -182,9 +175,6 @@ others-noinstall += test-container
- others += shell-container echo-container true-container
- others-noinstall += shell-container echo-container true-container
- 
--others += $(LINKS_DSO_PROGRAM)
--others-noinstall += $(LINKS_DSO_PROGRAM)
--
- $(objpfx)test-container : $(libsupport)
- $(objpfx)shell-container : $(libsupport)
- $(objpfx)echo-container : $(libsupport)
-diff --git a/support/links-dso-program.cc b/support/links-dso-program.cc
-index 8ff3155..f9d2b77 100644
---- a/support/links-dso-program.cc
-+++ b/support/links-dso-program.cc
-@@ -3,6 +3,11 @@
-    backported.  */
- #define _ISOMAC 1
- 
-+#define __GLIBC_USE(F)	__GLIBC_USE_ ## F
-+
-+# define __attribute_alloc_size__(params) \
-+  __attribute__ ((__alloc_size__ params))
-+
- #include <iostream>
- 
- using namespace std;
-diff --git a/sysdeps/aarch64/dl-machine.h b/sysdeps/aarch64/dl-machine.h
-index 185402f..bbdeae0 100644
---- a/sysdeps/aarch64/dl-machine.h
-+++ b/sysdeps/aarch64/dl-machine.h
-@@ -49,23 +49,11 @@ elf_machine_load_address (void)
-   /* To figure out the load address we use the definition that for any symbol:
-      dynamic_addr(symbol) = static_addr(symbol) + load_addr
- 
--     The choice of symbol is arbitrary. The static address we obtain
--     by constructing a non GOT reference to the symbol, the dynamic
--     address of the symbol we compute using adrp/add to compute the
--     symbol's address relative to the PC. */
--
--  ElfW(Addr) static_addr;
--  ElfW(Addr) dynamic_addr;
--
--  asm ("					\n\
--	adrp	%1, _dl_start;			\n\
--        add	%1, %1, #:lo12:_dl_start        \n\
--        ldr	%w0, 1f				\n\
--	b	2f				\n\
--1:	.word	_dl_start			\n\
--2:						\n\
--       " : "=r" (static_addr),  "=r" (dynamic_addr));
--  return dynamic_addr - static_addr;
-+    _DYNAMIC sysmbol is used here as its link-time address stored in
-+    the special unrelocated first GOT entry.  */
-+
-+    extern ElfW(Dyn) _DYNAMIC[] attribute_hidden;
-+    return (ElfW(Addr)) &_DYNAMIC - elf_machine_dynamic ();
- }
- 
- /* Set up the loaded object described by L so its unrelocated PLT
-diff --git a/sysdeps/ieee754/dbl-64/k_rem_pio2.c b/sysdeps/ieee754/dbl-64/k_rem_pio2.c
-index fcf956a..e2c5d29 100644
---- a/sysdeps/ieee754/dbl-64/k_rem_pio2.c
-+++ b/sysdeps/ieee754/dbl-64/k_rem_pio2.c
-@@ -172,7 +172,8 @@ int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, const int3
- 
-     /* compute q[0],q[1],...q[jk] */
- 	for (i=0;i<=jk;i++) {
--	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
-+	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
-+	    q[i] = fw;
- 	}
- 
- 	jz = jk;
-diff --git a/sysdeps/ieee754/flt-32/k_rem_pio2f.c b/sysdeps/ieee754/flt-32/k_rem_pio2f.c
-index e54a067..215b0e0 100644
---- a/sysdeps/ieee754/flt-32/k_rem_pio2f.c
-+++ b/sysdeps/ieee754/flt-32/k_rem_pio2f.c
-@@ -65,7 +65,8 @@ int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const int32
- 
-     /* compute q[0],q[1],...q[jk] */
- 	for (i=0;i<=jk;i++) {
--	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
-+	    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
-+	    q[i] = fw;
- 	}
- 
- 	jz = jk;
diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh b/ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh
deleted file mode 100755
index 892ae2af86a3fa..00000000000000
--- a/ci/official/containers/ml_build_arm64/builder.devtoolset/rpm-patch.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/bash -eu
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Given an RPM spec file $1, apply its patches.
-
-SPEC="$1"
-grep '%patch' "${SPEC}" |while read cmd ; do
-  N=$(echo "${cmd}" |sed 's,%patch\([0-9]\+\).*,\1,')
-  file=$(grep "Patch$N:" "${SPEC}" |sed 's,.*: ,,')
-  parg=$(echo "${cmd}" |sed 's,.*\(-p[0-9]\).*,\1,')
-  if [[ ! "${file}" =~ doxygen && "${cmd}" != \#* ]]; then
-    echo "patch ${parg} -s < ${file}"
-    patch ${parg} -s < "${file}"
-  fi
-done
diff --git a/ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch b/ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch
deleted file mode 100644
index bd9e5533118d6c..00000000000000
--- a/ci/official/containers/ml_build_arm64/builder.devtoolset/stringop_trunc.patch
+++ /dev/null
@@ -1,1204 +0,0 @@
-diff --git a/bits/utmp.h b/bits/utmp.h
-index 775123d..bf28c6d 100644
---- a/bits/utmp.h
-+++ b/bits/utmp.h
-@@ -1,5 +1,5 @@
--/* The `struct utmp' type, describing entries in the utmp file.  Generic/BSDish
--   Copyright (C) 1993, 1996, 1997 Free Software Foundation, Inc.
-+/* The `struct utmp' type, describing entries in the utmp file.
-+   Copyright (C) 1993-2022 Free Software Foundation, Inc.
-    This file is part of the GNU C Library.
- 
-    The GNU C Library is free software; you can redistribute it and/or
-@@ -14,7 +14,7 @@
- 
-    You should have received a copy of the GNU Lesser General Public
-    License along with the GNU C Library; if not, see
--   <http://www.gnu.org/licenses/>.  */
-+   <https://www.gnu.org/licenses/>.  */
- 
- #ifndef _UTMP_H
- # error "Never include <bits/utmp.h> directly; use <utmp.h> instead."
-@@ -24,11 +24,13 @@
- #include <time.h>
- 
- 
--#define	UT_NAMESIZE	8
--#define	UT_LINESIZE	8
--#define	UT_HOSTSIZE	16
-+#define UT_LINESIZE	32
-+#define UT_NAMESIZE	32
-+#define UT_HOSTSIZE	256
- 
- 
-+/* The structure describing an entry in the database of
-+   previous logins.  */
- struct lastlog
-   {
-     time_t ll_time;
-@@ -36,12 +38,16 @@ struct lastlog
-     char ll_host[UT_HOSTSIZE];
-   };
- 
-+/* The structure describing an entry in the user accounting database.  */
- struct utmp
-   {
--    char ut_line[UT_LINESIZE];
--    char ut_user[UT_NAMESIZE];
-+    char ut_line[UT_LINESIZE]
-+      __attribute_nonstring__;	/* Devicename.  */
-+    char ut_user[UT_NAMESIZE]
-+      __attribute_nonstring__;	/* Username.  */
- #define ut_name ut_user
--    char ut_host[UT_HOSTSIZE];
-+    char ut_host[UT_HOSTSIZE]
-+      __attribute_nonstring__;	/* Hostname for remote login.  */
-     long int ut_time;
-   };
- 
-diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h
-index 30482a1..551d5fd 100644
---- a/misc/sys/cdefs.h
-+++ b/misc/sys/cdefs.h
-@@ -391,6 +391,15 @@
- 
- #include <bits/wordsize.h>
- 
-+#if __GNUC_PREREQ (8, 0)
-+/* Describes a char array whose address can safely be passed as the first
-+   argument to strncpy and strncat, as the char array is not necessarily
-+   a NUL-terminated string.  */
-+# define __attribute_nonstring__ __attribute__ ((__nonstring__))
-+#else
-+# define __attribute_nonstring__
-+#endif
-+
- #if defined __LONG_DOUBLE_MATH_OPTIONAL && defined __NO_LONG_DOUBLE_MATH
- # define __LDBL_COMPAT 1
- # ifdef __REDIRECT
-diff --git a/nis/nss_nisplus/nisplus-parser.c b/nis/nss_nisplus/nisplus-parser.c
-index a4d76fb..41600f0 100644
---- a/nis/nss_nisplus/nisplus-parser.c
-+++ b/nis/nss_nisplus/nisplus-parser.c
-@@ -82,7 +82,7 @@ _nss_nisplus_parse_pwent (nis_result *result, struct passwd *pw,
- 
-   char *numstr = NISOBJVAL (2, obj);
-   len = NISOBJLEN (2, obj);
--  if (len == 0 && numstr[len - 1] != '\0')
-+  if (len == 0 || numstr[len - 1] != '\0')
-     {
-       if (len >= room_left)
- 	goto no_more_room;
-@@ -98,7 +98,7 @@ _nss_nisplus_parse_pwent (nis_result *result, struct passwd *pw,
- 
-   numstr = NISOBJVAL (3, obj);
-   len = NISOBJLEN (3, obj);
--  if (len == 0 && numstr[len - 1] != '\0')
-+  if (len == 0 || numstr[len - 1] != '\0')
-     {
-       if (len >= room_left)
- 	goto no_more_room;
-diff --git a/string/bits/string2.h b/string/bits/string2.h
-index c9bf593..f461fc1 100644
---- a/string/bits/string2.h
-+++ b/string/bits/string2.h
-@@ -47,29 +47,7 @@
- #endif
- 
- #if _STRING_ARCH_unaligned
--/* If we can do unaligned memory accesses we must know the endianess.  */
--# include <endian.h>
- # include <bits/types.h>
--
--# if __BYTE_ORDER == __LITTLE_ENDIAN
--#  define __STRING2_SMALL_GET16(src, idx) \
--     (((const unsigned char *) (const char *) (src))[idx + 1] << 8	      \
--      | ((const unsigned char *) (const char *) (src))[idx])
--#  define __STRING2_SMALL_GET32(src, idx) \
--     (((((const unsigned char *) (const char *) (src))[idx + 3] << 8	      \
--	| ((const unsigned char *) (const char *) (src))[idx + 2]) << 8	      \
--       | ((const unsigned char *) (const char *) (src))[idx + 1]) << 8	      \
--      | ((const unsigned char *) (const char *) (src))[idx])
--# else
--#  define __STRING2_SMALL_GET16(src, idx) \
--     (((const unsigned char *) (const char *) (src))[idx] << 8		      \
--      | ((const unsigned char *) (const char *) (src))[idx + 1])
--#  define __STRING2_SMALL_GET32(src, idx) \
--     (((((const unsigned char *) (const char *) (src))[idx] << 8	      \
--	| ((const unsigned char *) (const char *) (src))[idx + 1]) << 8	      \
--       | ((const unsigned char *) (const char *) (src))[idx + 2]) << 8	      \
--      | ((const unsigned char *) (const char *) (src))[idx + 3])
--# endif
- #else
- /* These are a few types we need for the optimizations if we cannot
-    use unaligned memory accesses.  */
-@@ -94,148 +72,11 @@ __STRING2_COPY_TYPE (8);
- 
- /* Set N bytes of S to C.  */
- #if !defined _HAVE_STRING_ARCH_memset
--# if !__GNUC_PREREQ (3, 0)
--#  if _STRING_ARCH_unaligned
--#   define memset(s, c, n) \
--  (__extension__ (__builtin_constant_p (n) && (n) <= 16			      \
--		  ? ((n) == 1						      \
--		     ? __memset_1 (s, c)				      \
--		     : __memset_gc (s, c, n))				      \
--		  : (__builtin_constant_p (c) && (c) == '\0'		      \
--		     ? ({ void *__s = (s); __bzero (__s, n); __s; })	      \
--		     : memset (s, c, n))))
--
--#   define __memset_1(s, c) ({ void *__s = (s);				      \
--			    *((__uint8_t *) __s) = (__uint8_t) c; __s; })
--
--#   define __memset_gc(s, c, n) \
--  ({ void *__s = (s);							      \
--     union {								      \
--       unsigned int __ui;						      \
--       unsigned short int __usi;					      \
--       unsigned char __uc;						      \
--     } *__u = __s;							      \
--     __uint8_t __c = (__uint8_t) (c);					      \
--									      \
--     /* This `switch' statement will be removed at compile-time.  */	      \
--     switch ((unsigned int) (n))					      \
--       {								      \
--       case 15:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 11:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 7:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 3:								      \
--	 __u->__usi = (unsigned short int) __c * 0x0101;		      \
--	 __u = __extension__ ((void *) __u + 2);			      \
--	 __u->__uc = (unsigned char) __c;				      \
--	 break;								      \
--									      \
--       case 14:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 10:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 6:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 2:								      \
--	 __u->__usi = (unsigned short int) __c * 0x0101;		      \
--	 break;								      \
--									      \
--       case 13:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 9:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 5:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 1:								      \
--	 __u->__uc = (unsigned char) __c;				      \
--	 break;								      \
--									      \
--       case 16:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 12:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 8:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--	 __u = __extension__ ((void *) __u + 4);			      \
--       case 4:								      \
--	 __u->__ui = __c * 0x01010101;					      \
--       case 0:								      \
--	 break;								      \
--       }								      \
--									      \
--     __s; })
--#  else
--#   define memset(s, c, n) \
--  (__extension__ (__builtin_constant_p (c) && (c) == '\0'		      \
--		  ? ({ void *__s = (s); __bzero (__s, n); __s; })	      \
--		  : memset (s, c, n)))
--#  endif
--# endif
--
--/* GCC < 3.0 optimizes memset(s, 0, n) but not bzero(s, n).
--   The optimization is broken before EGCS 1.1.
--   GCC 3.0+ has __builtin_bzero as well, but at least till GCC 3.4
--   if it decides to call the library function, it calls memset
--   and not bzero.  */
--# if __GNUC_PREREQ (2, 91)
--#  define __bzero(s, n) __builtin_memset (s, '\0', n)
--# endif
--
-+# define __bzero(s, n) __builtin_memset (s, '\0', n)
- #endif
- 
--
--/* Copy N bytes from SRC to DEST, returning pointer to byte following the
--   last copied.  */
--#ifdef __USE_GNU
--# if !defined _HAVE_STRING_ARCH_mempcpy || defined _FORCE_INLINES
--#  ifndef _HAVE_STRING_ARCH_mempcpy
--#   if __GNUC_PREREQ (3, 4)
--#    define __mempcpy(dest, src, n) __builtin_mempcpy (dest, src, n)
--#   elif __GNUC_PREREQ (3, 0)
--#    define __mempcpy(dest, src, n) \
--  (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n)      \
--		  && __string2_1bptr_p (src) && n <= 8			      \
--		  ? __builtin_memcpy (dest, src, n) + (n)		      \
--		  : __mempcpy (dest, src, n)))
--#   else
--#    define __mempcpy(dest, src, n) \
--  (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n)      \
--		  && __string2_1bptr_p (src) && n <= 8			      \
--		  ? __mempcpy_small (dest, __mempcpy_args (src), n)	      \
--		  : __mempcpy (dest, src, n)))
--#   endif
--/* In glibc we use this function frequently but for namespace reasons
--   we have to use the name `__mempcpy'.  */
--#   define mempcpy(dest, src, n) __mempcpy (dest, src, n)
--#  endif
--
--#  if !__GNUC_PREREQ (3, 0) || defined _FORCE_INLINES
--#   if _STRING_ARCH_unaligned
--#    ifndef _FORCE_INLINES
--#     define __mempcpy_args(src) \
--     ((const char *) (src))[0], ((const char *) (src))[2],		      \
--     ((const char *) (src))[4], ((const char *) (src))[6],		      \
--     __extension__ __STRING2_SMALL_GET16 (src, 0),			      \
--     __extension__ __STRING2_SMALL_GET16 (src, 4),			      \
--     __extension__ __STRING2_SMALL_GET32 (src, 0),			      \
--     __extension__ __STRING2_SMALL_GET32 (src, 4)
--#    endif
--__STRING_INLINE void *__mempcpy_small (void *, char, char, char, char,
--				       __uint16_t, __uint16_t, __uint32_t,
--				       __uint32_t, size_t);
-+#if defined _FORCE_INLINES
-+# if _STRING_ARCH_unaligned
- __STRING_INLINE void *
- __mempcpy_small (void *__dest1,
- 		 char __src0_1, char __src2_1, char __src4_1, char __src6_1,
-@@ -298,44 +139,7 @@ __mempcpy_small (void *__dest1,
-     }
-   return (void *) __u;
- }
--#   else
--#    ifndef _FORCE_INLINES
--#     define __mempcpy_args(src) \
--     ((const char *) (src))[0],						      \
--     __extension__ ((__STRING2_COPY_ARR2)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1] } }),	      \
--     __extension__ ((__STRING2_COPY_ARR3)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2] } }),				      \
--     __extension__ ((__STRING2_COPY_ARR4)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3] } }),	      \
--     __extension__ ((__STRING2_COPY_ARR5)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4] } }),				      \
--     __extension__ ((__STRING2_COPY_ARR6)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5] } }),	      \
--     __extension__ ((__STRING2_COPY_ARR7)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5],		      \
--	  ((const char *) (src))[6] } }),				      \
--     __extension__ ((__STRING2_COPY_ARR8)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5],		      \
--	  ((const char *) (src))[6], ((const char *) (src))[7] } })
--#    endif
--__STRING_INLINE void *__mempcpy_small (void *, char, __STRING2_COPY_ARR2,
--				       __STRING2_COPY_ARR3,
--				       __STRING2_COPY_ARR4,
--				       __STRING2_COPY_ARR5,
--				       __STRING2_COPY_ARR6,
--				       __STRING2_COPY_ARR7,
--				       __STRING2_COPY_ARR8, size_t);
-+#  else
- __STRING_INLINE void *
- __mempcpy_small (void *__dest, char __src1,
- 		 __STRING2_COPY_ARR2 __src2, __STRING2_COPY_ARR3 __src3,
-@@ -382,8 +186,6 @@ __mempcpy_small (void *__dest, char __src1,
-     }
-   return __extension__ ((void *) __u + __srclen);
- }
--#   endif
--#  endif
- # endif
- #endif
- 
-@@ -391,44 +193,17 @@ __mempcpy_small (void *__dest, char __src1,
- /* Return pointer to C in S.  */
- #ifndef _HAVE_STRING_ARCH_strchr
- extern void *__rawmemchr (const void *__s, int __c);
--# if __GNUC_PREREQ (3, 2)
- #  define strchr(s, c) \
-   (__extension__ (__builtin_constant_p (c) && !__builtin_constant_p (s)	      \
- 		  && (c) == '\0'					      \
- 		  ? (char *) __rawmemchr (s, c)				      \
- 		  : __builtin_strchr (s, c)))
--# else
--#  define strchr(s, c) \
--  (__extension__ (__builtin_constant_p (c) && (c) == '\0'		      \
--		  ? (char *) __rawmemchr (s, c)				      \
--		  : strchr (s, c)))
--# endif
- #endif
- 
- 
- /* Copy SRC to DEST.  */
--#if (!defined _HAVE_STRING_ARCH_strcpy && !__GNUC_PREREQ (3, 0)) \
--    || defined _FORCE_INLINES
--# if !defined _HAVE_STRING_ARCH_strcpy && !__GNUC_PREREQ (3, 0)
--#  define strcpy(dest, src) \
--  (__extension__ (__builtin_constant_p (src)				      \
--		  ? (__string2_1bptr_p (src) && strlen (src) + 1 <= 8	      \
--		     ? __strcpy_small (dest, __strcpy_args (src),	      \
--				       strlen (src) + 1)		      \
--		     : (char *) memcpy (dest, src, strlen (src) + 1))	      \
--		  : strcpy (dest, src)))
--# endif
--
-+#if defined _FORCE_INLINES
- # if _STRING_ARCH_unaligned
--#  ifndef _FORCE_INLINES
--#   define __strcpy_args(src) \
--     __extension__ __STRING2_SMALL_GET16 (src, 0),			      \
--     __extension__ __STRING2_SMALL_GET16 (src, 4),			      \
--     __extension__ __STRING2_SMALL_GET32 (src, 0),			      \
--     __extension__ __STRING2_SMALL_GET32 (src, 4)
--#  endif
--__STRING_INLINE char *__strcpy_small (char *, __uint16_t, __uint16_t,
--				      __uint32_t, __uint32_t, size_t);
- __STRING_INLINE char *
- __strcpy_small (char *__dest,
- 		__uint16_t __src0_2, __uint16_t __src4_2,
-@@ -482,42 +257,6 @@ __strcpy_small (char *__dest,
-   return __dest;
- }
- # else
--#  ifndef _FORCE_INLINES
--#   define __strcpy_args(src) \
--     __extension__ ((__STRING2_COPY_ARR2)				      \
--      { { ((const char *) (src))[0], '\0' } }),				      \
--     __extension__ ((__STRING2_COPY_ARR3)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  '\0' } }),							      \
--     __extension__ ((__STRING2_COPY_ARR4)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], '\0' } }),				      \
--     __extension__ ((__STRING2_COPY_ARR5)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  '\0' } }),							      \
--     __extension__ ((__STRING2_COPY_ARR6)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], '\0' } }),				      \
--     __extension__ ((__STRING2_COPY_ARR7)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5],		      \
--	  '\0' } }),							      \
--     __extension__ ((__STRING2_COPY_ARR8)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5],		      \
--	  ((const char *) (src))[6], '\0' } })
--#  endif
--__STRING_INLINE char *__strcpy_small (char *, __STRING2_COPY_ARR2,
--				      __STRING2_COPY_ARR3,
--				      __STRING2_COPY_ARR4,
--				      __STRING2_COPY_ARR5,
--				      __STRING2_COPY_ARR6,
--				      __STRING2_COPY_ARR7,
--				      __STRING2_COPY_ARR8, size_t);
- __STRING_INLINE char *
- __strcpy_small (char *__dest,
- 		__STRING2_COPY_ARR2 __src2, __STRING2_COPY_ARR3 __src3,
-@@ -570,44 +309,15 @@ __strcpy_small (char *__dest,
- 
- /* Copy SRC to DEST, returning pointer to final NUL byte.  */
- #ifdef __USE_GNU
--# if !defined _HAVE_STRING_ARCH_stpcpy || defined _FORCE_INLINES
--#  ifndef _HAVE_STRING_ARCH_stpcpy
--#   if __GNUC_PREREQ (3, 4)
--#    define __stpcpy(dest, src) __builtin_stpcpy (dest, src)
--#   elif __GNUC_PREREQ (3, 0)
--#    define __stpcpy(dest, src) \
--  (__extension__ (__builtin_constant_p (src)				      \
--		  ? (__string2_1bptr_p (src) && strlen (src) + 1 <= 8	      \
--		     ? __builtin_strcpy (dest, src) + strlen (src)	      \
--		     : ((char *) (__mempcpy) (dest, src, strlen (src) + 1)    \
--			- 1))						      \
--		  : __stpcpy (dest, src)))
--#   else
--#    define __stpcpy(dest, src) \
--  (__extension__ (__builtin_constant_p (src)				      \
--		  ? (__string2_1bptr_p (src) && strlen (src) + 1 <= 8	      \
--		     ? __stpcpy_small (dest, __stpcpy_args (src),	      \
--				       strlen (src) + 1)		      \
--		     : ((char *) (__mempcpy) (dest, src, strlen (src) + 1)    \
--			- 1))						      \
--		  : __stpcpy (dest, src)))
--#   endif
-+# ifndef _HAVE_STRING_ARCH_stpcpy
-+#  define __stpcpy(dest, src) __builtin_stpcpy (dest, src)
- /* In glibc we use this function frequently but for namespace reasons
-    we have to use the name `__stpcpy'.  */
--#   define stpcpy(dest, src) __stpcpy (dest, src)
--#  endif
-+#  define stpcpy(dest, src) __stpcpy (dest, src)
-+# endif
- 
--#  if !__GNUC_PREREQ (3, 0) || defined _FORCE_INLINES
--#   if _STRING_ARCH_unaligned
--#    ifndef _FORCE_INLINES
--#     define __stpcpy_args(src) \
--     __extension__ __STRING2_SMALL_GET16 (src, 0),			      \
--     __extension__ __STRING2_SMALL_GET16 (src, 4),			      \
--     __extension__ __STRING2_SMALL_GET32 (src, 0),			      \
--     __extension__ __STRING2_SMALL_GET32 (src, 4)
--#    endif
--__STRING_INLINE char *__stpcpy_small (char *, __uint16_t, __uint16_t,
--				      __uint32_t, __uint32_t, size_t);
-+# ifndef _FORCE_INLINES
-+#  if _STRING_ARCH_unaligned
- __STRING_INLINE char *
- __stpcpy_small (char *__dest,
- 		__uint16_t __src0_2, __uint16_t __src4_2,
-@@ -665,43 +375,7 @@ __stpcpy_small (char *__dest,
-     }
-   return &__u->__c;
- }
--#   else
--#    ifndef _FORCE_INLINES
--#     define __stpcpy_args(src) \
--     __extension__ ((__STRING2_COPY_ARR2)				      \
--      { { ((const char *) (src))[0], '\0' } }),				      \
--     __extension__ ((__STRING2_COPY_ARR3)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  '\0' } }),							      \
--     __extension__ ((__STRING2_COPY_ARR4)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], '\0' } }),				      \
--     __extension__ ((__STRING2_COPY_ARR5)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  '\0' } }),							      \
--     __extension__ ((__STRING2_COPY_ARR6)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], '\0' } }),				      \
--     __extension__ ((__STRING2_COPY_ARR7)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5],		      \
--	  '\0' } }),							      \
--     __extension__ ((__STRING2_COPY_ARR8)				      \
--      { { ((const char *) (src))[0], ((const char *) (src))[1],		      \
--	  ((const char *) (src))[2], ((const char *) (src))[3],		      \
--	  ((const char *) (src))[4], ((const char *) (src))[5],		      \
--	  ((const char *) (src))[6], '\0' } })
--#    endif
--__STRING_INLINE char *__stpcpy_small (char *, __STRING2_COPY_ARR2,
--				      __STRING2_COPY_ARR3,
--				      __STRING2_COPY_ARR4,
--				      __STRING2_COPY_ARR5,
--				      __STRING2_COPY_ARR6,
--				      __STRING2_COPY_ARR7,
--				      __STRING2_COPY_ARR8, size_t);
-+#  else
- __STRING_INLINE char *
- __stpcpy_small (char *__dest,
- 		__STRING2_COPY_ARR2 __src2, __STRING2_COPY_ARR3 __src3,
-@@ -748,27 +422,11 @@ __stpcpy_small (char *__dest,
-   }
-   return __dest + __srclen - 1;
- }
--#   endif
- #  endif
- # endif
- #endif
- 
- 
--/* Copy no more than N characters of SRC to DEST.  */
--#ifndef _HAVE_STRING_ARCH_strncpy
--# if __GNUC_PREREQ (3, 2)
--#  define strncpy(dest, src, n) __builtin_strncpy (dest, src, n)
--# else
--#  define strncpy(dest, src, n) \
--  (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n)      \
--		  ? (strlen (src) + 1 >= ((size_t) (n))			      \
--		     ? (char *) memcpy (dest, src, n)			      \
--		     : strncpy (dest, src, n))				      \
--		  : strncpy (dest, src, n)))
--# endif
--#endif
--
--
- /* Append no more than N characters from SRC onto DEST.  */
- #ifndef _HAVE_STRING_ARCH_strncat
- # ifdef _USE_STRING_ARCH_strchr
-@@ -780,380 +438,29 @@ __stpcpy_small (char *__dest,
- 		       : (*((char *) __mempcpy (strchr (__dest, '\0'),	      \
- 						src, n)) = '\0', __dest))     \
- 		    : strncat (dest, src, n); }))
--# elif __GNUC_PREREQ (3, 2)
--#  define strncat(dest, src, n) __builtin_strncat (dest, src, n)
- # else
--#  define strncat(dest, src, n) \
--  (__extension__ (__builtin_constant_p (src) && __builtin_constant_p (n)      \
--		  ? (strlen (src) < ((size_t) (n))			      \
--		     ? strcat (dest, src)				      \
--		     : strncat (dest, src, n))				      \
--		  : strncat (dest, src, n)))
--# endif
--#endif
--
--
--/* Compare characters of S1 and S2.  */
--#ifndef _HAVE_STRING_ARCH_strcmp
--# if __GNUC_PREREQ (3, 2)
--#  define strcmp(s1, s2) \
--  __extension__								      \
--  ({ size_t __s1_len, __s2_len;						      \
--     (__builtin_constant_p (s1) && __builtin_constant_p (s2)		      \
--      && (__s1_len = strlen (s1), __s2_len = strlen (s2),		      \
--	  (!__string2_1bptr_p (s1) || __s1_len >= 4)			      \
--	  && (!__string2_1bptr_p (s2) || __s2_len >= 4))		      \
--      ? __builtin_strcmp (s1, s2)					      \
--      : (__builtin_constant_p (s1) && __string2_1bptr_p (s1)		      \
--	 && (__s1_len = strlen (s1), __s1_len < 4)			      \
--	 ? (__builtin_constant_p (s2) && __string2_1bptr_p (s2)		      \
--	    ? __builtin_strcmp (s1, s2)					      \
--	    : __strcmp_cg (s1, s2, __s1_len))				      \
--	 : (__builtin_constant_p (s2) && __string2_1bptr_p (s2)		      \
--	    && (__s2_len = strlen (s2), __s2_len < 4)			      \
--	    ? (__builtin_constant_p (s1) && __string2_1bptr_p (s1)	      \
--	       ? __builtin_strcmp (s1, s2)				      \
--	       : __strcmp_gc (s1, s2, __s2_len))			      \
--	    : __builtin_strcmp (s1, s2)))); })
--# else
--#  define strcmp(s1, s2) \
--  __extension__								      \
--  ({ size_t __s1_len, __s2_len;						      \
--     (__builtin_constant_p (s1) && __builtin_constant_p (s2)		      \
--      && (__s1_len = strlen (s1), __s2_len = strlen (s2),		      \
--	  (!__string2_1bptr_p (s1) || __s1_len >= 4)			      \
--	  && (!__string2_1bptr_p (s2) || __s2_len >= 4))		      \
--      ? memcmp ((const char *) (s1), (const char *) (s2),		      \
--		(__s1_len < __s2_len ? __s1_len : __s2_len) + 1)	      \
--      : (__builtin_constant_p (s1) && __string2_1bptr_p (s1)		      \
--	 && (__s1_len = strlen (s1), __s1_len < 4)			      \
--	 ? (__builtin_constant_p (s2) && __string2_1bptr_p (s2)		      \
--	    ? __strcmp_cc (s1, s2, __s1_len)				      \
--	    : __strcmp_cg (s1, s2, __s1_len))				      \
--	 : (__builtin_constant_p (s2) && __string2_1bptr_p (s2)		      \
--	    && (__s2_len = strlen (s2), __s2_len < 4)			      \
--	    ? (__builtin_constant_p (s1) && __string2_1bptr_p (s1)	      \
--	       ? __strcmp_cc (s1, s2, __s2_len)				      \
--	       : __strcmp_gc (s1, s2, __s2_len))			      \
--	    : strcmp (s1, s2)))); })
-+#  define strncat(dest, src, n) __builtin_strncat (dest, src, n)
- # endif
--
--# define __strcmp_cc(s1, s2, l) \
--  (__extension__ ({ int __result =					      \
--		      (((const unsigned char *) (const char *) (s1))[0]	      \
--		       - ((const unsigned char *) (const char *)(s2))[0]);    \
--		    if (l > 0 && __result == 0)				      \
--		      {							      \
--			__result = (((const unsigned char *)		      \
--				     (const char *) (s1))[1]		      \
--				    - ((const unsigned char *)		      \
--				       (const char *) (s2))[1]);	      \
--			if (l > 1 && __result == 0)			      \
--			  {						      \
--			    __result =					      \
--			      (((const unsigned char *)			      \
--				(const char *) (s1))[2]			      \
--			       - ((const unsigned char *)		      \
--				  (const char *) (s2))[2]);		      \
--			    if (l > 2 && __result == 0)			      \
--			      __result =				      \
--				(((const unsigned char *)		      \
--				  (const char *) (s1))[3]		      \
--				 - ((const unsigned char *)		      \
--				    (const char *) (s2))[3]);		      \
--			  }						      \
--		      }							      \
--		    __result; }))
--
--# define __strcmp_cg(s1, s2, l1) \
--  (__extension__ ({ const unsigned char *__s2 =				      \
--		      (const unsigned char *) (const char *) (s2);	      \
--		    int __result =					      \
--		      (((const unsigned char *) (const char *) (s1))[0]	      \
--		       - __s2[0]);					      \
--		    if (l1 > 0 && __result == 0)			      \
--		      {							      \
--			__result = (((const unsigned char *)		      \
--				     (const char *) (s1))[1] - __s2[1]);      \
--			if (l1 > 1 && __result == 0)			      \
--			  {						      \
--			    __result = (((const unsigned char *)	      \
--					 (const char *) (s1))[2] - __s2[2]);  \
--			    if (l1 > 2 && __result == 0)		      \
--			      __result = (((const unsigned char *)	      \
--					  (const char *)  (s1))[3]	      \
--					  - __s2[3]);			      \
--			  }						      \
--		      }							      \
--		    __result; }))
--
--# define __strcmp_gc(s1, s2, l2) \
--  (__extension__ ({ const unsigned char *__s1 =				      \
--		      (const unsigned char *) (const char *) (s1);	      \
--		    register int __result =				      \
--		      __s1[0] - ((const unsigned char *)		      \
--				 (const char *) (s2))[0];		      \
--		    if (l2 > 0 && __result == 0)			      \
--		      {							      \
--			__result = (__s1[1]				      \
--				    - ((const unsigned char *)		      \
--				       (const char *) (s2))[1]);	      \
--			if (l2 > 1 && __result == 0)			      \
--			  {						      \
--			    __result =					      \
--			      (__s1[2] - ((const unsigned char *)	      \
--					  (const char *) (s2))[2]);	      \
--			    if (l2 > 2 && __result == 0)		      \
--			      __result =				      \
--				(__s1[3]				      \
--				 - ((const unsigned char *)		      \
--				    (const char *) (s2))[3]);		      \
--			  }						      \
--		      }							      \
--		    __result; }))
--#endif
--
--
--/* Compare N characters of S1 and S2.  */
--#ifndef _HAVE_STRING_ARCH_strncmp
--# define strncmp(s1, s2, n)						      \
--  (__extension__ (__builtin_constant_p (n)				      \
--		  && ((__builtin_constant_p (s1)			      \
--		       && strlen (s1) < ((size_t) (n)))			      \
--		      || (__builtin_constant_p (s2)			      \
--			  && strlen (s2) < ((size_t) (n))))		      \
--		  ? strcmp (s1, s2) : strncmp (s1, s2, n)))
- #endif
- 
- 
- /* Return the length of the initial segment of S which
-    consists entirely of characters not in REJECT.  */
--#if !defined _HAVE_STRING_ARCH_strcspn || defined _FORCE_INLINES
--# ifndef _HAVE_STRING_ARCH_strcspn
--#  if __GNUC_PREREQ (3, 2)
--#   define strcspn(s, reject) \
--  __extension__								      \
--  ({ char __r0, __r1, __r2;						      \
--     (__builtin_constant_p (reject) && __string2_1bptr_p (reject)	      \
--      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
--	 ? __builtin_strcspn (s, reject)				      \
--	 : ((__r0 = ((const char *) (reject))[0], __r0 == '\0')		      \
--	    ? strlen (s)						      \
--	    : ((__r1 = ((const char *) (reject))[1], __r1 == '\0')	      \
--	       ? __strcspn_c1 (s, __r0)					      \
--	       : ((__r2 = ((const char *) (reject))[2], __r2 == '\0')	      \
--		  ? __strcspn_c2 (s, __r0, __r1)			      \
--		  : (((const char *) (reject))[3] == '\0'		      \
--		     ? __strcspn_c3 (s, __r0, __r1, __r2)		      \
--		     : __builtin_strcspn (s, reject))))))		      \
--      : __builtin_strcspn (s, reject)); })
--#  else
--#   define strcspn(s, reject) \
--  __extension__								      \
--  ({ char __r0, __r1, __r2;						      \
--     (__builtin_constant_p (reject) && __string2_1bptr_p (reject)	      \
--      ? ((__r0 = ((const char *) (reject))[0], __r0 == '\0')		      \
--	 ? strlen (s)							      \
--	 : ((__r1 = ((const char *) (reject))[1], __r1 == '\0')		      \
--	    ? __strcspn_c1 (s, __r0)					      \
--	    : ((__r2 = ((const char *) (reject))[2], __r2 == '\0')	      \
--	       ? __strcspn_c2 (s, __r0, __r1)				      \
--	       : (((const char *) (reject))[3] == '\0'			      \
--		  ? __strcspn_c3 (s, __r0, __r1, __r2)			      \
--		  : strcspn (s, reject)))))				      \
--      : strcspn (s, reject)); })
--#  endif
--# endif
--
--__STRING_INLINE size_t __strcspn_c1 (const char *__s, int __reject);
--__STRING_INLINE size_t
--__strcspn_c1 (const char *__s, int __reject)
--{
--  size_t __result = 0;
--  while (__s[__result] != '\0' && __s[__result] != __reject)
--    ++__result;
--  return __result;
--}
--
--__STRING_INLINE size_t __strcspn_c2 (const char *__s, int __reject1,
--				     int __reject2);
--__STRING_INLINE size_t
--__strcspn_c2 (const char *__s, int __reject1, int __reject2)
--{
--  size_t __result = 0;
--  while (__s[__result] != '\0' && __s[__result] != __reject1
--	 && __s[__result] != __reject2)
--    ++__result;
--  return __result;
--}
--
--__STRING_INLINE size_t __strcspn_c3 (const char *__s, int __reject1,
--				     int __reject2, int __reject3);
--__STRING_INLINE size_t
--__strcspn_c3 (const char *__s, int __reject1, int __reject2,
--	      int __reject3)
--{
--  size_t __result = 0;
--  while (__s[__result] != '\0' && __s[__result] != __reject1
--	 && __s[__result] != __reject2 && __s[__result] != __reject3)
--    ++__result;
--  return __result;
--}
-+#ifndef _HAVE_STRING_ARCH_strcspn
-+# define strcspn(s, reject) __builtin_strcspn (s, reject)
- #endif
- 
- 
- /* Return the length of the initial segment of S which
-    consists entirely of characters in ACCEPT.  */
--#if !defined _HAVE_STRING_ARCH_strspn || defined _FORCE_INLINES
--# ifndef _HAVE_STRING_ARCH_strspn
--#  if __GNUC_PREREQ (3, 2)
--#   define strspn(s, accept) \
--  __extension__								      \
--  ({ char __a0, __a1, __a2;						      \
--     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
--      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
--	 ? __builtin_strspn (s, accept)					      \
--	 : ((__a0 = ((const char *) (accept))[0], __a0 == '\0')		      \
--	    ? ((void) (s), (size_t) 0)					      \
--	    : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')	      \
--	       ? __strspn_c1 (s, __a0)					      \
--	       : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
--		  ? __strspn_c2 (s, __a0, __a1)				      \
--		  : (((const char *) (accept))[3] == '\0'		      \
--		     ? __strspn_c3 (s, __a0, __a1, __a2)		      \
--		     : __builtin_strspn (s, accept))))))		      \
--      : __builtin_strspn (s, accept)); })
--#  else
--#   define strspn(s, accept) \
--  __extension__								      \
--  ({ char __a0, __a1, __a2;						      \
--     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
--      ? ((__a0 = ((const char *) (accept))[0], __a0 == '\0')		      \
--	 ? ((void) (s), (size_t) 0)					      \
--	 : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')		      \
--	    ? __strspn_c1 (s, __a0)					      \
--	    : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
--	       ? __strspn_c2 (s, __a0, __a1)				      \
--	       : (((const char *) (accept))[3] == '\0'			      \
--		  ? __strspn_c3 (s, __a0, __a1, __a2)			      \
--		  : strspn (s, accept)))))				      \
--      : strspn (s, accept)); })
--#  endif
--# endif
--
--__STRING_INLINE size_t __strspn_c1 (const char *__s, int __accept);
--__STRING_INLINE size_t
--__strspn_c1 (const char *__s, int __accept)
--{
--  size_t __result = 0;
--  /* Please note that __accept never can be '\0'.  */
--  while (__s[__result] == __accept)
--    ++__result;
--  return __result;
--}
--
--__STRING_INLINE size_t __strspn_c2 (const char *__s, int __accept1,
--				    int __accept2);
--__STRING_INLINE size_t
--__strspn_c2 (const char *__s, int __accept1, int __accept2)
--{
--  size_t __result = 0;
--  /* Please note that __accept1 and __accept2 never can be '\0'.  */
--  while (__s[__result] == __accept1 || __s[__result] == __accept2)
--    ++__result;
--  return __result;
--}
--
--__STRING_INLINE size_t __strspn_c3 (const char *__s, int __accept1,
--				    int __accept2, int __accept3);
--__STRING_INLINE size_t
--__strspn_c3 (const char *__s, int __accept1, int __accept2, int __accept3)
--{
--  size_t __result = 0;
--  /* Please note that __accept1 to __accept3 never can be '\0'.  */
--  while (__s[__result] == __accept1 || __s[__result] == __accept2
--	 || __s[__result] == __accept3)
--    ++__result;
--  return __result;
--}
-+#ifndef _HAVE_STRING_ARCH_strspn
-+# define strspn(s, accept) __builtin_strspn (s, accept)
- #endif
- 
- 
- /* Find the first occurrence in S of any character in ACCEPT.  */
--#if !defined _HAVE_STRING_ARCH_strpbrk || defined _FORCE_INLINES
--# ifndef _HAVE_STRING_ARCH_strpbrk
--#  if __GNUC_PREREQ (3, 2)
--#   define strpbrk(s, accept) \
--  __extension__								      \
--  ({ char __a0, __a1, __a2;						      \
--     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
--      ? ((__builtin_constant_p (s) && __string2_1bptr_p (s))		      \
--	 ? __builtin_strpbrk (s, accept)				      \
--	 : ((__a0 = ((const char  *) (accept))[0], __a0 == '\0')	      \
--	    ? ((void) (s), (char *) NULL)				      \
--	    : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')	      \
--	       ? __builtin_strchr (s, __a0)				      \
--	       : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
--		  ? __strpbrk_c2 (s, __a0, __a1)			      \
--		  : (((const char *) (accept))[3] == '\0'		      \
--		     ? __strpbrk_c3 (s, __a0, __a1, __a2)		      \
--		     : __builtin_strpbrk (s, accept))))))		      \
--      : __builtin_strpbrk (s, accept)); })
--#  else
--#   define strpbrk(s, accept) \
--  __extension__								      \
--  ({ char __a0, __a1, __a2;						      \
--     (__builtin_constant_p (accept) && __string2_1bptr_p (accept)	      \
--      ? ((__a0 = ((const char  *) (accept))[0], __a0 == '\0')		      \
--	 ? ((void) (s), (char *) NULL)					      \
--	 : ((__a1 = ((const char *) (accept))[1], __a1 == '\0')		      \
--	    ? strchr (s, __a0)						      \
--	    : ((__a2 = ((const char *) (accept))[2], __a2 == '\0')	      \
--	       ? __strpbrk_c2 (s, __a0, __a1)				      \
--	       : (((const char *) (accept))[3] == '\0'			      \
--		  ? __strpbrk_c3 (s, __a0, __a1, __a2)			      \
--		  : strpbrk (s, accept)))))				      \
--      : strpbrk (s, accept)); })
--#  endif
--# endif
--
--__STRING_INLINE char *__strpbrk_c2 (const char *__s, int __accept1,
--				    int __accept2);
--__STRING_INLINE char *
--__strpbrk_c2 (const char *__s, int __accept1, int __accept2)
--{
--  /* Please note that __accept1 and __accept2 never can be '\0'.  */
--  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2)
--    ++__s;
--  return *__s == '\0' ? NULL : (char *) (size_t) __s;
--}
--
--__STRING_INLINE char *__strpbrk_c3 (const char *__s, int __accept1,
--				    int __accept2, int __accept3);
--__STRING_INLINE char *
--__strpbrk_c3 (const char *__s, int __accept1, int __accept2, int __accept3)
--{
--  /* Please note that __accept1 to __accept3 never can be '\0'.  */
--  while (*__s != '\0' && *__s != __accept1 && *__s != __accept2
--	 && *__s != __accept3)
--    ++__s;
--  return *__s == '\0' ? NULL : (char *) (size_t) __s;
--}
--#endif
--
--
--/* Find the first occurrence of NEEDLE in HAYSTACK.  Newer gcc versions
--   do this itself.  */
--#if !defined _HAVE_STRING_ARCH_strstr && !__GNUC_PREREQ (2, 97)
--# define strstr(haystack, needle) \
--  (__extension__ (__builtin_constant_p (needle) && __string2_1bptr_p (needle) \
--		  ? (((const char *) (needle))[0] == '\0'		      \
--		     ? (char *) (size_t) (haystack)			      \
--		     : (((const char *) (needle))[1] == '\0'		      \
--			? strchr (haystack,				      \
--				  ((const char *) (needle))[0]) 	      \
--			: strstr (haystack, needle)))			      \
--		  : strstr (haystack, needle)))
-+#ifndef _HAVE_STRING_ARCH_strpbrk
-+# define strpbrk(s, accept) __builtin_strpbrk (s, accept)
- #endif
- 
- 
-diff --git a/string/strncat.c b/string/strncat.c
-index dcfb04d..a9cb913 100644
---- a/string/strncat.c
-+++ b/string/strncat.c
-@@ -1,4 +1,4 @@
--/* Copyright (C) 1991,1997,2011 Free Software Foundation, Inc.
-+/* Copyright (C) 1991-2022 Free Software Foundation, Inc.
-    This file is part of the GNU C Library.
- 
-    The GNU C Library is free software; you can redistribute it and/or
-@@ -13,14 +13,10 @@
- 
-    You should have received a copy of the GNU Lesser General Public
-    License along with the GNU C Library; if not, see
--   <http://www.gnu.org/licenses/>.  */
-+   <https://www.gnu.org/licenses/>.  */
- 
- #include <string.h>
- 
--#ifdef _LIBC
--# include <memcopy.h>
--#endif
--
- #ifndef STRNCAT
- # undef strncat
- # define STRNCAT  strncat
-@@ -29,54 +25,16 @@
- char *
- STRNCAT (char *s1, const char *s2, size_t n)
- {
--  char c;
-   char *s = s1;
- 
-   /* Find the end of S1.  */
--  do
--    c = *s1++;
--  while (c != '\0');
--
--  /* Make S1 point before next character, so we can increment
--     it while memory is read (wins on pipelined cpus).  */
--  s1 -= 2;
-+  s1 += strlen (s1);
- 
--  if (n >= 4)
--    {
--      size_t n4 = n >> 2;
--      do
--	{
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    return s;
--	} while (--n4 > 0);
--      n &= 3;
--    }
-+  size_t ss = __strnlen (s2, n);
- 
--  while (n > 0)
--    {
--      c = *s2++;
--      *++s1 = c;
--      if (c == '\0')
--	return s;
--      n--;
--    }
--
--  if (c != '\0')
--    *++s1 = '\0';
-+  s1[ss] = '\0';
-+  memcpy (s1, s2, ss);
- 
-   return s;
- }
-+
-diff --git a/string/strncpy.c b/string/strncpy.c
-index 19d501e..83fb610 100644
---- a/string/strncpy.c
-+++ b/string/strncpy.c
-@@ -1,4 +1,4 @@
--/* Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
-+/* Copyright (C) 1991-2022 Free Software Foundation, Inc.
-    This file is part of the GNU C Library.
- 
-    The GNU C Library is free software; you can redistribute it and/or
-@@ -13,75 +13,22 @@
- 
-    You should have received a copy of the GNU Lesser General Public
-    License along with the GNU C Library; if not, see
--   <http://www.gnu.org/licenses/>.  */
-+   <https://www.gnu.org/licenses/>.  */
- 
- #include <string.h>
--#include <memcopy.h>
- 
- #undef strncpy
- 
- #ifndef STRNCPY
--#define STRNCPY strncpy
-+ #define STRNCPY strncpy
- #endif
- 
- char *
- STRNCPY (char *s1, const char *s2, size_t n)
- {
--  char c;
--  char *s = s1;
--
--  --s1;
--
--  if (n >= 4)
--    {
--      size_t n4 = n >> 2;
--
--      for (;;)
--	{
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  c = *s2++;
--	  *++s1 = c;
--	  if (c == '\0')
--	    break;
--	  if (--n4 == 0)
--	    goto last_chars;
--	}
--      n = n - (s1 - s) - 1;
--      if (n == 0)
--	return s;
--      goto zero_fill;
--    }
--
-- last_chars:
--  n &= 3;
--  if (n == 0)
--    return s;
--
--  do
--    {
--      c = *s2++;
--      *++s1 = c;
--      if (--n == 0)
--	return s;
--    }
--  while (c != '\0');
--
-- zero_fill:
--  do
--    *++s1 = '\0';
--  while (--n > 0);
--
--  return s;
-+  size_t size = __strnlen (s2, n);
-+  if (size != n)
-+    memset (s1 + size, '\0', n - size);
-+  return memcpy (s1, s2, size);
- }
- libc_hidden_builtin_def (strncpy)
-diff --git a/sysdeps/gnu/bits/utmp.h b/sysdeps/gnu/bits/utmp.h
-index aed2750..434a533 100644
---- a/sysdeps/gnu/bits/utmp.h
-+++ b/sysdeps/gnu/bits/utmp.h
-@@ -59,10 +59,14 @@ struct utmp
- {
-   short int ut_type;		/* Type of login.  */
-   pid_t ut_pid;			/* Process ID of login process.  */
--  char ut_line[UT_LINESIZE];	/* Devicename.  */
--  char ut_id[4];		/* Inittab ID.  */
--  char ut_user[UT_NAMESIZE];	/* Username.  */
--  char ut_host[UT_HOSTSIZE];	/* Hostname for remote login.  */
-+  char ut_line[UT_LINESIZE]
-+    __attribute_nonstring__;	/* Devicename.  */
-+  char ut_id[4]
-+    __attribute_nonstring__;		/* Inittab ID.  */
-+  char ut_user[UT_NAMESIZE]
-+    __attribute_nonstring__;	/* Username.  */
-+  char ut_host[UT_HOSTSIZE]
-+    __attribute_nonstring__;	/* Hostname for remote login.  */
-   struct exit_status ut_exit;	/* Exit status of a process marked
- 				   as DEAD_PROCESS.  */
- /* The ut_session and ut_tv fields must be the same size when compiled
-diff --git a/sysdeps/gnu/bits/utmpx.h b/sysdeps/gnu/bits/utmpx.h
-index f8716ca..13d84e4 100644
---- a/sysdeps/gnu/bits/utmpx.h
-+++ b/sysdeps/gnu/bits/utmpx.h
-@@ -56,10 +56,14 @@ struct utmpx
- {
-   short int ut_type;		/* Type of login.  */
-   __pid_t ut_pid;		/* Process ID of login process.  */
--  char ut_line[__UT_LINESIZE];	/* Devicename.  */
--  char ut_id[4];		/* Inittab ID. */
--  char ut_user[__UT_NAMESIZE];	/* Username.  */
--  char ut_host[__UT_HOSTSIZE];	/* Hostname for remote login.  */
-+  char ut_line[__UT_LINESIZE]
-+    __attribute_nonstring__;	/* Devicename.  */
-+  char ut_id[4]
-+    __attribute_nonstring__;		/* Inittab ID. */
-+  char ut_user[__UT_NAMESIZE]
-+    __attribute_nonstring__;	/* Username.  */
-+  char ut_host[__UT_HOSTSIZE]
-+    __attribute_nonstring__;	/* Hostname for remote login.  */
-   struct __exit_status ut_exit;	/* Exit status of a process marked
- 				   as DEAD_PROCESS.  */
- 
-diff --git a/sysdeps/unix/sysv/linux/if_index.c b/sysdeps/unix/sysv/linux/if_index.c
-index 8ba5eae..b620d21 100644
---- a/sysdeps/unix/sysv/linux/if_index.c
-+++ b/sysdeps/unix/sysv/linux/if_index.c
-@@ -38,12 +38,19 @@ __if_nametoindex (const char *ifname)
-   return 0;
- #else
-   struct ifreq ifr;
-+  if (strlen (ifname) >= IFNAMSIZ)
-+    {
-+      __set_errno (ENODEV);
-+      return 0;
-+    }
-+
-+  strncpy (ifr.ifr_name, ifname, sizeof (ifr.ifr_name));
-+
-   int fd = __opensock ();
- 
-   if (fd < 0)
-     return 0;
- 
--  strncpy (ifr.ifr_name, ifname, sizeof (ifr.ifr_name));
-   if (__ioctl (fd, SIOCGIFINDEX, &ifr) < 0)
-     {
-       int saved_errno = errno;
-diff --git a/timezone/zic.c b/timezone/zic.c
-index a5202a1..772d081 100644
---- a/timezone/zic.c
-+++ b/timezone/zic.c
-@@ -1609,7 +1609,7 @@ writezone(const char *const name, const char *const string)
- 		}
- #define DO(field)	((void) fwrite(tzh.field, sizeof tzh.field, 1, fp))
- 		tzh = tzh0;
--		(void) strncpy(tzh.tzh_magic, TZ_MAGIC, sizeof tzh.tzh_magic);
-+		memcpy(tzh.tzh_magic, TZ_MAGIC, sizeof tzh.tzh_magic);
- 		tzh.tzh_version[0] = ZIC_VERSION;
- 		convert(eitol(thistypecnt), tzh.tzh_ttisgmtcnt);
- 		convert(eitol(thistypecnt), tzh.tzh_ttisstdcnt);
diff --git a/ci/official/containers/ml_build_arm64/builder.packages.txt b/ci/official/containers/ml_build_arm64/builder.packages.txt
deleted file mode 100644
index 2be317ca4e256b..00000000000000
--- a/ci/official/containers/ml_build_arm64/builder.packages.txt
+++ /dev/null
@@ -1,47 +0,0 @@
-# Packages needed to build devtoolset
-file
-flex
-make
-build-essential
-patch
-rpm2cpio
-unar
-wget
-xz-utils
-cpio
-gawk
-texinfo
-gettext
-
-# Other build-related tools
-software-properties-common
-apt-transport-https
-autoconf
-automake
-ca-certificates
-pkg-config
-libcurl3-dev
-libcurl4-openssl-dev
-libfreetype6-dev
-libhdf5-serial-dev
-libomp-18-dev
-libssl-dev
-libtool
-libssl-dev
-libxml2-dev
-libxslt1-dev
-libzmq3-dev
-llvm-18
-clang-18
-clang-tidy-18
-lld-18
-clang-format-12
-curl
-git
-parallel
-sudo
-swig
-unzip
-zip
-openjdk-21-jdk
-vim
diff --git a/ci/official/containers/ml_build_arm64/devel.bashrc b/ci/official/containers/ml_build_arm64/devel.bashrc
deleted file mode 100644
index 755d48783b1b9a..00000000000000
--- a/ci/official/containers/ml_build_arm64/devel.bashrc
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ==============================================================================
-
-# Do not print anything if this is not being used interactively
-[ -z "$PS1" ] && return
-
-# Set up attractive prompt
-export PS1="\[\e[31m\]tf-docker\[\e[m\] \[\e[33m\]\w\[\e[m\] > "
-export TERM=xterm-256color
-alias grep="grep --color=auto"
-alias ls="ls --color=auto"
-# Fix nvidia-docker
-ldconfig 
diff --git a/ci/official/containers/ml_build_arm64/ld.so.conf b/ci/official/containers/ml_build_arm64/ld.so.conf
deleted file mode 100644
index e2aa028720ed2c..00000000000000
--- a/ci/official/containers/ml_build_arm64/ld.so.conf
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright 2023 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# Builds a devtoolset cross-compiler targeting manylinux2014 (glibc 2.17 / libstdc++ 4.8).
-
-/lib64
diff --git a/ci/official/containers/ml_build_arm64/requirements.txt b/ci/official/containers/ml_build_arm64/requirements.txt
deleted file mode 100644
index 6ae6deda141234..00000000000000
--- a/ci/official/containers/ml_build_arm64/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-portpicker==1.6.0
-# For wheel verification, and uploading
-auditwheel ~= 6.1.0
-twine ~= 6.1.0
-
-# uv is faster than pip for installing Python packages.
-uv ~= 0.5.30
\ No newline at end of file
diff --git a/ci/official/containers/ml_build_arm64/setup.packages.sh b/ci/official/containers/ml_build_arm64/setup.packages.sh
deleted file mode 100755
index 347b853e349385..00000000000000
--- a/ci/official/containers/ml_build_arm64/setup.packages.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# setup.packages.sh: Given a list of Ubuntu packages, install them and clean up.
-# Usage: setup.packages.sh <package_list.txt>
-set -e
-
-# Prevent apt install tzinfo from asking our location (assumes UTC)
-export DEBIAN_FRONTEND=noninteractive
-
-apt-get update
-# Remove commented lines and blank lines
-apt-get install -y --no-install-recommends $(sed -e '/^\s*#.*$/d' -e '/^\s*$/d' "$1" | sort -u)
-rm -rf /var/lib/apt/lists/*
\ No newline at end of file
diff --git a/ci/official/containers/ml_build_arm64/setup.python.sh b/ci/official/containers/ml_build_arm64/setup.python.sh
deleted file mode 100755
index ff5ade526536fa..00000000000000
--- a/ci/official/containers/ml_build_arm64/setup.python.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# setup.python.sh: Install a specific Python version and packages for it.
-# Usage: setup.python.sh <pyversion> <requirements.txt>
-set -xe
-
-source ~/.bashrc
-VERSION=$1
-REQUIREMENTS=$2
-
-add-apt-repository ppa:deadsnakes/ppa
-# Install Python packages for this container's version
-if [[ ${VERSION} == "python3.13-nogil" ]]; then
-  cat >pythons.txt <<EOF
-$VERSION
-EOF
-elif [[ ${VERSION} == "python3.13" ]]; then
-  cat >pythons.txt <<EOF
-$VERSION
-$VERSION-dev
-$VERSION-venv
-EOF
-else
-  cat >pythons.txt <<EOF
-$VERSION
-$VERSION-dev
-$VERSION-venv
-EOF
-fi
-
-if [[ ${VERSION} == "python3.14" ]]; then
-  if [[ ! -d Python-3.14.0rc1 ]]; then
-    apt update && apt install -y libssl-dev zlib1g-dev libbz2-dev libreadline-dev libncurses5-dev libffi-dev liblzma-dev
-    wget https://www.python.org/ftp/python/3.14.0/Python-3.14.0rc1.tar.xz
-    tar -xf Python-3.14.0rc1.tar.xz
-  fi
-  pushd Python-3.14.0rc1
-  mkdir -p /python314-0rc1
-  CC=clang-18 CXX=clang++-18 ./configure --prefix /python314-0rc1 --with-ensurepip=install
-  make -j$(nproc)
-  make install -j$(nproc)
-  ln -s /python314-0rc1/bin/python3 /usr/bin/python3.14
-  popd
-elif [[ ${VERSION} == "python3.14-nogil" ]]; then
-  if [[ ! -d Python-3.14.0rc1 ]]; then
-    apt update && apt install -y libssl-dev zlib1g-dev libbz2-dev libreadline-dev libncurses5-dev libffi-dev liblzma-dev
-    wget https://www.python.org/ftp/python/3.14.0/Python-3.14.0rc1.tar.xz
-    tar -xf Python-3.14.0rc1.tar.xz
-  fi
-  pushd Python-3.14.0rc1
-  mkdir -p /python314-0rc1-nogil
-  CC=clang-18 CXX=clang++-18 ./configure --prefix /python314-0rc1-nogil --disable-gil --with-ensurepip=install
-  make -j$(nproc)
-  make install -j$(nproc)
-  ln -s /python314-0rc1-nogil/bin/python3 /usr/bin/python3.14-nogil
-  popd
-else
-  /setup.packages.sh pythons.txt
-fi
-
-# Re-link pyconfig.h from aarch64-linux-gnu into the devtoolset directory
-# for any Python version present
-pushd /usr/include/aarch64-linux-gnu
-for f in $(ls | grep python); do
-  # set up symlink for devtoolset-10
-  rm -f /dt10/usr/include/aarch64-linux-gnu/$f
-  ln -s /usr/include/aarch64-linux-gnu/$f /dt10/usr/include/aarch64-linux-gnu/$f
-done
-popd
-
-# Python 3.10 include headers fix:
-# sysconfig.get_path('include') incorrectly points to /usr/local/include/python
-# map /usr/include/python3.10 to /usr/local/include/python3.10
-if [[ ! -f "/usr/local/include/$VERSION" ]]; then
-  ln -sf /usr/include/$VERSION /usr/local/include/$VERSION
-fi
-
-# Install pip
-
-wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 --tries=5 https://bootstrap.pypa.io/get-pip.py
-/usr/bin/$VERSION get-pip.py
-/usr/bin/$VERSION -m pip install --no-cache-dir --upgrade pip
-
-# For Python 3.13t, do not install twine as it does not have pre-built wheels
-# for this Python version and building it from source fails. We only need twine
-# to be present on the system Python which in this case is 3.12.
-if [[ ${VERSION} == "python3.13-nogil" || ${VERSION} == "python3.14" || ${VERSION} == "python3.14-nogil" ]]; then
-  grep -v "twine" $REQUIREMENTS > requirements_without_twine.txt
-  REQUIREMENTS=requirements_without_twine.txt
-fi
-
-# Disable the cache dir to save image space, and install packages
-/usr/bin/$VERSION -m pip install --no-cache-dir -r $REQUIREMENTS -U
diff --git a/ci/official/containers/ml_build_arm64/setup.sources.sh b/ci/official/containers/ml_build_arm64/setup.sources.sh
deleted file mode 100755
index f8c87d4ceade60..00000000000000
--- a/ci/official/containers/ml_build_arm64/setup.sources.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env bash
-#
-# Copyright 2022 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-#
-# setup.python.sh: Install a specific Python version and packages for it.
-# Usage: setup.python.sh <pyversion> <requirements.txt>
-
-# Sets up custom apt sources for our TF images.
-
-# Prevent apt install tzinfo from asking our location (assumes UTC)
-export DEBIAN_FRONTEND=noninteractive
-
-# Set up shared custom sources
-apt-get update
-apt-get install -y gnupg ca-certificates
-
-# Deadsnakes: https://launchpad.net/~deadsnakes/+archive/ubuntu/ppa
-apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776
-
-# LLVM/Clang: https://apt.llvm.org/
-apt-key adv --fetch-keys https://apt.llvm.org/llvm-snapshot.gpg.key
-
-# Set up custom sources
-cat >/etc/apt/sources.list.d/custom.list <<SOURCES
-# More Python versions: Deadsnakes
-deb http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main
-deb-src http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal main
-
-# LLVM/Clang 18 repository
-deb http://apt.llvm.org/focal/ llvm-toolchain-focal-18 main
-deb-src http://apt.llvm.org/focal/ llvm-toolchain-focal-18 main
-SOURCES

From 86c82562e1188f693f49cd0b369e70d07a74eb7e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 17:19:54 -0800
Subject: [PATCH 685/753] This is an internal change

Reverts 911ce60c2902b58cd892ca05f2297d72ef624e5f

PiperOrigin-RevId: 847933822
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  35 ---
 .../channelwise_quantized_conv_2d_test.cc     |  91 ++++++--
 ...elwise_quantized_depthwise_conv_2d_test.cc | 116 +++++++---
 .../lite/delegates/xnnpack/conv_2d_test.cc    | 159 +++++++++++---
 .../xnnpack/depthwise_conv_2d_test.cc         | 170 ++++++++++++---
 .../dynamically_quantized_conv_2d_test.cc     | 179 +++++++++++++---
 ...amically_quantized_fully_connected_test.cc | 113 +++++++++-
 ...namically_quantized_transpose_conv_test.cc |  60 ++++--
 ...mically_quantized_transpose_conv_tester.cc |  10 +-
 .../xnnpack/fingerprint_test_helpers.h        | 112 ----------
 .../delegates/xnnpack/fully_connected_test.cc | 152 ++++++++++---
 .../xnnpack/signed_quantized_conv_2d_test.cc  |  97 +++++++--
 ...signed_quantized_depthwise_conv_2d_test.cc | 122 ++++++++---
 .../signed_quantized_fully_connected_test.cc  |  91 ++++++--
 .../signed_quantized_transpose_conv_test.cc   | 127 ++++++++---
 .../delegates/xnnpack/transpose_conv_test.cc  | 199 ++++++++++++++----
 .../unsigned_quantized_conv_2d_test.cc        |  97 +++++++--
 ...signed_quantized_depthwise_conv_2d_test.cc | 121 ++++++++---
 ...unsigned_quantized_fully_connected_test.cc |  85 ++++++--
 .../unsigned_quantized_transpose_conv_test.cc | 127 ++++++++---
 .../lite/delegates/xnnpack/weight_cache.cc    |  85 ++------
 .../lite/delegates/xnnpack/weight_cache.h     |   7 +-
 .../delegates/xnnpack/weight_cache_schema.fbs |   5 +-
 .../delegates/xnnpack/weight_cache_test.cc    | 192 +++++------------
 24 files changed, 1787 insertions(+), 765 deletions(-)
 delete mode 100644 tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 02d51f21d4fa4e..227537a79f1454 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -392,21 +392,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "fingerprint_test_helpers",
-    testonly = True,
-    hdrs = ["fingerprint_test_helpers.h"],
-    compatible_with = get_compatible_with_portable(),
-    deps = [
-        ":weight_cache",
-        ":weight_cache_test_helpers",
-        ":xnnpack_delegate_hdrs_only",
-        "//tensorflow/lite/c:common",
-        "@XNNPACK",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
 cc_library(
     name = "mmap_handle",
     srcs = ["mmap_handle.cc"],
@@ -1362,7 +1347,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1379,7 +1363,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1414,7 +1397,6 @@ cc_test(
     }),
     deps = [
         ":conv_2d_tester",
-        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1464,7 +1446,6 @@ cc_test(
     }),
     deps = [
         ":depthwise_conv_2d_tester",
-        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1484,7 +1465,6 @@ cc_test(
     tags = ["notap"],
     deps = [
         ":dynamically_quantized_fully_connected_tester",
-        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1501,7 +1481,6 @@ cc_test(
     }),
     deps = [
         ":dynamically_quantized_conv_2d_tester",
-        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1518,7 +1497,6 @@ cc_test(
     }),
     deps = [
         ":dynamically_quantized_transpose_conv_tester",
-        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1535,14 +1513,10 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":fully_connected_tester",
         ":test_main",
-        ":weight_cache",
-        ":weight_cache_test_helpers",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
-        "@XNNPACK",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1890,7 +1864,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1907,7 +1880,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1958,7 +1930,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_fully_connected_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2192,7 +2163,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_transpose_conv_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2337,7 +2307,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":test_main",
         ":transpose_conv_tester",
         ":xnnpack_delegate_test_mode",
@@ -2417,7 +2386,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2433,7 +2401,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2464,7 +2431,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_fully_connected_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2675,7 +2641,6 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
-        ":fingerprint_test_helpers",
         ":quantized_transpose_conv_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
index d195d4f25435e8..92293e08227593 100644
--- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
@@ -24,16 +24,17 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct ChannelwiseQuantizedConv2D : DelegateTest {};
+TEST(ChannelwiseQuantizedConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(ChannelwiseQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -70,7 +71,11 @@ TEST_F(ChannelwiseQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, 3x3) {
+TEST(ChannelwiseQuantizedConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -107,7 +112,11 @@ TEST_F(ChannelwiseQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, 3x3Stride2) {
+TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -146,7 +155,11 @@ TEST_F(ChannelwiseQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
+TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -185,7 +198,11 @@ TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
+TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -224,7 +241,11 @@ TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
+TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -267,7 +288,11 @@ TEST_F(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
+TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -310,7 +335,11 @@ TEST_F(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
+TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -353,7 +382,11 @@ TEST_F(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
+TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -396,7 +429,11 @@ TEST_F(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, ReluActivation) {
+TEST(ChannelwiseQuantizedConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -439,7 +476,11 @@ TEST_F(ChannelwiseQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, Relu6Activation) {
+TEST(ChannelwiseQuantizedConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -482,7 +523,11 @@ TEST_F(ChannelwiseQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
+TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -525,11 +570,13 @@ TEST_F(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, MultiThreading) {
+TEST(ChannelwiseQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -572,7 +619,7 @@ TEST_F(ChannelwiseQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, WeightsCache) {
+TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -580,7 +627,9 @@ TEST_F(ChannelwiseQuantizedConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -624,13 +673,15 @@ TEST_F(ChannelwiseQuantizedConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) {
+TEST(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(xnnpack_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
index 0c6de84e9a8d2f..25dada01896c34 100644
--- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
@@ -23,16 +23,18 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct ChannelwiseQuantizedDepthwiseConv2D : DelegateTest {};
+TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -64,7 +66,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -97,7 +103,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -130,7 +140,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -165,7 +179,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -198,7 +216,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -233,7 +255,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -271,7 +297,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -309,7 +339,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -351,7 +385,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -393,7 +431,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -435,7 +477,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -477,7 +523,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -523,7 +573,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -565,7 +619,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -607,7 +665,11 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -649,11 +711,13 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -695,7 +759,7 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -703,7 +767,9 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -746,13 +812,15 @@ TEST_F(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+TEST(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(xnnpack_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
index e1b5a674946b73..25090bbaf2b5cf 100644
--- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
@@ -19,16 +19,18 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/conv_2d_tester.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct Conv2D : DelegateTest {};
+TEST(Conv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(Conv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -50,7 +52,11 @@ TEST_F(Conv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, 3x3) {
+TEST(Conv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -72,7 +78,11 @@ TEST_F(Conv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, 3x3Stride2) {
+TEST(Conv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -96,7 +106,11 @@ TEST_F(Conv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, Grouped) {
+TEST(Conv2D, Grouped) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -122,7 +136,11 @@ TEST_F(Conv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, SmallKernelWithSamePadding) {
+TEST(Conv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -146,7 +164,11 @@ TEST_F(Conv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, SmallKernelWithValidPadding) {
+TEST(Conv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -170,7 +192,11 @@ TEST_F(Conv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, StrideWithSamePadding) {
+TEST(Conv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -198,7 +224,11 @@ TEST_F(Conv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, StrideWithValidPadding) {
+TEST(Conv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -226,7 +256,11 @@ TEST_F(Conv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, DilationWithSamePadding) {
+TEST(Conv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -254,7 +288,11 @@ TEST_F(Conv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, DilationWithValidPadding) {
+TEST(Conv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -282,7 +320,11 @@ TEST_F(Conv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, FP16Weights) {
+TEST(Conv2D, FP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -310,7 +352,11 @@ TEST_F(Conv2D, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, TensorWiseQuantizedInt8Weights) {
+TEST(Conv2D, TensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -338,7 +384,11 @@ TEST_F(Conv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, ChannelWiseQuantizedInt8Weights) {
+TEST(Conv2D, ChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -366,7 +416,11 @@ TEST_F(Conv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, SparseWeights) {
+TEST(Conv2D, SparseWeights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -394,7 +448,11 @@ TEST_F(Conv2D, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, SparseFP16Weights) {
+TEST(Conv2D, SparseFP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -423,7 +481,11 @@ TEST_F(Conv2D, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
+TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -452,7 +514,11 @@ TEST_F(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
+TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -481,7 +547,11 @@ TEST_F(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, ReluActivation) {
+TEST(Conv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -509,7 +579,11 @@ TEST_F(Conv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, Relu6Activation) {
+TEST(Conv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -537,7 +611,11 @@ TEST_F(Conv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, ReluMinus1To1Activation) {
+TEST(Conv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -565,7 +643,11 @@ TEST_F(Conv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, DISABLED_TanhActivation) {
+TEST(Conv2D, DISABLED_TanhActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -593,7 +675,11 @@ TEST_F(Conv2D, DISABLED_TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, DISABLED_SignBitActivation) {
+TEST(Conv2D, DISABLED_SignBitActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -621,11 +707,13 @@ TEST_F(Conv2D, DISABLED_SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, MultiThreading) {
+TEST(Conv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -653,7 +741,7 @@ TEST_F(Conv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, WeightsCache) {
+TEST(Conv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -661,7 +749,10 @@ TEST_F(Conv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -690,13 +781,15 @@ TEST_F(Conv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(Conv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions delegate_options =
+TEST(Conv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.num_threads = 2;
-  delegate_options.flags |=
+  xnnpack_options.num_threads = 2;
+  xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
index 931fff88178dfb..e894bcdc2bc46a 100644
--- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
@@ -19,16 +19,18 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct DepthwiseConv2D : DelegateTest {};
+TEST(DepthwiseConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(DepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -45,7 +47,11 @@ TEST_F(DepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, 2x2) {
+TEST(DepthwiseConv2D, 2x2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -63,7 +69,11 @@ TEST_F(DepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, 3x3) {
+TEST(DepthwiseConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -81,7 +91,11 @@ TEST_F(DepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, 3x3Stride2) {
+TEST(DepthwiseConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -101,7 +115,11 @@ TEST_F(DepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, 5x5) {
+TEST(DepthwiseConv2D, 5x5) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -119,7 +137,11 @@ TEST_F(DepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, 5x5Stride2) {
+TEST(DepthwiseConv2D, 5x5Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -139,7 +161,11 @@ TEST_F(DepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, SmallKernelWithSamePadding) {
+TEST(DepthwiseConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -162,7 +188,11 @@ TEST_F(DepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, SmallKernelWithValidPadding) {
+TEST(DepthwiseConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -185,7 +215,11 @@ TEST_F(DepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, StrideWithSamePadding) {
+TEST(DepthwiseConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -212,7 +246,11 @@ TEST_F(DepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, StrideWithValidPadding) {
+TEST(DepthwiseConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -239,7 +277,11 @@ TEST_F(DepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, DilationWithSamePadding) {
+TEST(DepthwiseConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -266,7 +308,11 @@ TEST_F(DepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, DilationWithValidPadding) {
+TEST(DepthwiseConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -293,7 +339,11 @@ TEST_F(DepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, DepthMultiplier) {
+TEST(DepthwiseConv2D, DepthMultiplier) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -322,7 +372,11 @@ TEST_F(DepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, FP16Weights) {
+TEST(DepthwiseConv2D, FP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -349,7 +403,11 @@ TEST_F(DepthwiseConv2D, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
+TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -376,7 +434,11 @@ TEST_F(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
+TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -403,7 +465,11 @@ TEST_F(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, SparseWeights) {
+TEST(DepthwiseConv2D, SparseWeights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -430,7 +496,11 @@ TEST_F(DepthwiseConv2D, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, SparseFP16Weights) {
+TEST(DepthwiseConv2D, SparseFP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -458,7 +528,11 @@ TEST_F(DepthwiseConv2D, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
+TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -486,7 +560,11 @@ TEST_F(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
+TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -514,7 +592,11 @@ TEST_F(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, ReluActivation) {
+TEST(DepthwiseConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -541,7 +623,11 @@ TEST_F(DepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, Relu6Activation) {
+TEST(DepthwiseConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -568,7 +654,11 @@ TEST_F(DepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, ReluMinus1To1Activation) {
+TEST(DepthwiseConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -595,7 +685,11 @@ TEST_F(DepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, DISABLED_TanhActivation) {
+TEST(DepthwiseConv2D, DISABLED_TanhActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -622,7 +716,11 @@ TEST_F(DepthwiseConv2D, DISABLED_TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, DISABLED_SignBitActivation) {
+TEST(DepthwiseConv2D, DISABLED_SignBitActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -649,11 +747,13 @@ TEST_F(DepthwiseConv2D, DISABLED_SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, MultiThreading) {
+TEST(DepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -680,7 +780,7 @@ TEST_F(DepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, WeightsCache) {
+TEST(DepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -688,7 +788,9 @@ TEST_F(DepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -716,13 +818,15 @@ TEST_F(DepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DepthwiseConv2D, TransientIndirectionBuffer) {
+TEST(DepthwiseConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(xnnpack_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
index 52e8333db4fd04..59507269580cbd 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
@@ -19,16 +19,22 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_tester.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct DynamicallyQuantizedConv2D : DelegateTest {};
+TEST(DynamicallyQuantizedConv2D, 3x3) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(DynamicallyQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -50,7 +56,15 @@ TEST_F(DynamicallyQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, 3x3Stride2) {
+TEST(DynamicallyQuantizedConv2D, 3x3Stride2) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -74,7 +88,15 @@ TEST_F(DynamicallyQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, Grouped) {
+TEST(DynamicallyQuantizedConv2D, Grouped) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -100,7 +122,15 @@ TEST_F(DynamicallyQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
+TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -124,7 +154,15 @@ TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
+TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -148,7 +186,14 @@ TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
+TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -176,7 +221,15 @@ TEST_F(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
+TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -204,7 +257,15 @@ TEST_F(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
+TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -232,7 +293,15 @@ TEST_F(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
+TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -260,7 +329,15 @@ TEST_F(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
+TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -287,7 +364,15 @@ TEST_F(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
+TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -314,7 +399,15 @@ TEST_F(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, ReluActivation) {
+TEST(DynamicallyQuantizedConv2D, ReluActivation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -342,7 +435,15 @@ TEST_F(DynamicallyQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, Relu6Activation) {
+TEST(DynamicallyQuantizedConv2D, Relu6Activation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -370,7 +471,15 @@ TEST_F(DynamicallyQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
+TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -398,7 +507,15 @@ TEST_F(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, TanhActivation) {
+TEST(DynamicallyQuantizedConv2D, TanhActivation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -426,7 +543,15 @@ TEST_F(DynamicallyQuantizedConv2D, TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, SignBitActivation) {
+TEST(DynamicallyQuantizedConv2D, SignBitActivation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -454,13 +579,15 @@ TEST_F(DynamicallyQuantizedConv2D, SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, MultiThreading) {
+TEST(DynamicallyQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
   delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -488,7 +615,7 @@ TEST_F(DynamicallyQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, WeightsCache) {
+TEST(DynamicallyQuantizedConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -498,7 +625,9 @@ TEST_F(DynamicallyQuantizedConv2D, WeightsCache) {
   delegate_options.weights_cache = weights_cache.get();
   delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -527,14 +656,16 @@ TEST_F(DynamicallyQuantizedConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) {
+TEST(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
   xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  UseCustomDelegate(xnnpack_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
index 2d2febcb21ab66..2f198a95195f11 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
@@ -30,10 +30,9 @@ namespace xnnpack {
 
 // Dummy class to use with parameterized test.
 class DynamicallyQuantizedFullyConnectedTest
-    : public testing::WithParamInterface<WeightsType>,
-      public DelegateTest {};
+    : public testing::TestWithParam<WeightsType> {};
 
-int GenInputChannels(const std::function<int()>& rng,
+int GenInputChannels(const std::function<int()> &rng,
                      WeightsType weights_type) {
   switch (weights_type) {
     case WeightsType::kChannelWiseQuantizedInt8:
@@ -46,6 +45,14 @@ int GenInputChannels(const std::function<int()>& rng,
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -64,6 +71,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -84,6 +99,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -105,6 +128,13 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -126,6 +156,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -146,6 +184,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -168,6 +214,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -190,6 +244,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -213,6 +275,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -234,6 +304,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -255,6 +333,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -276,6 +362,14 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -299,8 +393,13 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
 TEST_P(DynamicallyQuantizedFullyConnectedTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.flags |=
+      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -330,7 +429,9 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
index 4a40e56852b56c..de863e4f1e2125 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
@@ -19,16 +19,18 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct DynamicallyQuantizedTransposeConvTest : DelegateTest {};
+TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -49,7 +51,10 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
+TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -70,7 +75,11 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
+TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -91,7 +100,11 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
+TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -112,7 +125,11 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
+TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -136,7 +153,10 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
+TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -160,7 +180,11 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
+TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -188,7 +212,11 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
+TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -216,11 +244,13 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
+TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -249,7 +279,7 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
+TEST(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -257,7 +287,9 @@ TEST_F(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
index abfd76c12a14f9..3bdcd343373bac 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
@@ -55,12 +55,10 @@ void DynamicallyQuantizedTransposeConvTester::Test(
   const Model* model = GetModel(buffer.data());
 
   std::unique_ptr<Interpreter> delegate_interpreter;
-  ASSERT_EQ(
-      InterpreterBuilder(
-          model,
-          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
-          &delegate_interpreter),
-      kTfLiteOk);
+  ASSERT_EQ(InterpreterBuilder(
+                model, ::tflite::ops::builtin::BuiltinOpResolverWithXNNPACK())(
+                &delegate_interpreter),
+            kTfLiteOk);
   std::unique_ptr<Interpreter> default_interpreter;
   ASSERT_EQ(
       InterpreterBuilder(
diff --git a/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h
deleted file mode 100644
index 29edbe5a35c841..00000000000000
--- a/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
-#define TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
-
-#include <memory>
-
-#include <gmock/gmock.h>
-#include <gtest/gtest.h>
-#include "experimental.h"  // from @XNNPACK
-#include "tensorflow/lite/c/common.h"
-#include "tensorflow/lite/delegates/xnnpack/weight_cache.h"
-#include "tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h"
-#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
-
-namespace tflite::xnnpack {
-
-struct TfLiteDelegateDeleter {
-  void operator()(TfLiteDelegate* delegate) {
-    TfLiteXNNPackDelegateDelete(delegate);
-  }
-};
-
-using TfLiteDelegatePtr =
-    std::unique_ptr<TfLiteDelegate, TfLiteDelegateDeleter>;
-
-struct DelegateTest : public virtual testing::Test {
-  void SetUp() override {
-    TfLiteXNNPackDelegateOptions delegate_options =
-        TfLiteXNNPackDelegateOptionsDefault();
-
-    // By default, we try to setup a file weight cache to also check fingerprint
-    // generation. If the test system doesn't support a file system, then the
-    // cache file will be invalid.
-    if (cache_file.IsValid()) {
-      xnn_clear_fingerprints();
-      delegate_options.weight_cache_file_path = cache_file.GetCPath();
-      delegate_options.weight_cache_file_descriptor =
-          cache_file.Duplicate().Release();
-      delegate_options.flags |=
-          TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-      check_for_cache_fingerprints = true;
-    }
-
-    xnnpack_delegate =
-        TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options));
-    ASSERT_THAT(xnnpack_delegate, testing::NotNull());
-  }
-
-  void TearDown() override {
-    if (check_for_cache_fingerprints) {
-      ASSERT_TRUE(cache_file.IsValid());
-      EXPECT_TRUE(IsCompatibleCacheFile(cache_file));
-      if (AlterXNNPackFingerprints()) {
-        EXPECT_FALSE(IsCompatibleCacheFile(cache_file));
-      }
-    }
-  }
-
-  // Artificially change fingerprint values.
-  //
-  // This allows us to check that changing a fingerprint value will make the
-  // cache file incompatible.
-  //
-  // Returns the current number of fingerprints.
-  int AlterXNNPackFingerprints() {
-    int i = 0;
-    int modified = 0;
-    for (const xnn_fingerprint* fingerprint = xnn_get_fingerprint_by_idx(i);
-         fingerprint != nullptr;
-         fingerprint = xnn_get_fingerprint_by_idx(++i)) {
-      xnn_fingerprint new_fingerprint = *fingerprint;
-      ++new_fingerprint.value;
-      xnn_set_fingerprint(new_fingerprint);
-      ++modified;
-    }
-    return modified;
-  }
-
-  // Replaces the xnnpack delegate with a custom one.
-  void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& delegate_options) {
-    check_for_cache_fingerprints = false;
-    xnnpack_delegate =
-        TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options));
-    ASSERT_THAT(xnnpack_delegate, testing::NotNull());
-  }
-
-  // Replaces the xnnpack delegate with one that sets up a file backed weight
-  // cache.
-  void UseDelegateWithFileWeightCache() {}
-
-  // The default delegate is created in a generic way.
-  TfLiteDelegatePtr xnnpack_delegate;
-  tflite::xnnpack::TempFileDesc cache_file;
-  bool check_for_cache_fingerprints = false;
-};
-
-}  // namespace tflite::xnnpack
-
-#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
index 6701d0bc1c8f59..92a6074c464f85 100644
--- a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
@@ -19,16 +19,18 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct FullyConnectedTest : public DelegateTest {};
+TEST(FullyConnected, 1D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(FullyConnectedTest, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -43,7 +45,11 @@ TEST_F(FullyConnectedTest, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 1DKeepDims) {
+TEST(FullyConnected, 1DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -59,7 +65,11 @@ TEST_F(FullyConnectedTest, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 2D) {
+TEST(FullyConnected, 2D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -77,7 +87,11 @@ TEST_F(FullyConnectedTest, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 2DKeepDims) {
+TEST(FullyConnected, 2DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -96,7 +110,11 @@ TEST_F(FullyConnectedTest, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 3D) {
+TEST(FullyConnected, 3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -115,7 +133,11 @@ TEST_F(FullyConnectedTest, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 3DReshape) {
+TEST(FullyConnected, 3DReshape) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -134,7 +156,11 @@ TEST_F(FullyConnectedTest, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 3DKeepDims) {
+TEST(FullyConnected, 3DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -154,7 +180,11 @@ TEST_F(FullyConnectedTest, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 4D) {
+TEST(FullyConnected, 4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -174,7 +204,11 @@ TEST_F(FullyConnectedTest, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, 4DKeepDims) {
+TEST(FullyConnected, 4DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -195,7 +229,11 @@ TEST_F(FullyConnectedTest, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, NoBias) {
+TEST(FullyConnected, NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -214,7 +252,11 @@ TEST_F(FullyConnectedTest, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, FP16Weights) {
+TEST(FullyConnected, FP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -233,7 +275,11 @@ TEST_F(FullyConnectedTest, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, FP16WeightsNoBias) {
+TEST(FullyConnected, FP16WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -253,7 +299,11 @@ TEST_F(FullyConnectedTest, FP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, DynamicWeights) {
+TEST(FullyConnected, DynamicWeights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -272,7 +322,11 @@ TEST_F(FullyConnectedTest, DynamicWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, DynamicWeightsNoBias) {
+TEST(FullyConnected, DynamicWeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -292,7 +346,11 @@ TEST_F(FullyConnectedTest, DynamicWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, DynamicBias) {
+TEST(FullyConnected, DynamicBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -311,7 +369,11 @@ TEST_F(FullyConnectedTest, DynamicBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, DynamicWeightsAndBias) {
+TEST(FullyConnected, DynamicWeightsAndBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -331,7 +393,11 @@ TEST_F(FullyConnectedTest, DynamicWeightsAndBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8Weights) {
+TEST(FullyConnected, TensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -350,7 +416,11 @@ TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8WeightsNoBias) {
+TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -370,7 +440,11 @@ TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8Weights) {
+TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -389,7 +463,11 @@ TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8WeightsNoBias) {
+TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -409,7 +487,11 @@ TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, ReluActivation) {
+TEST(FullyConnected, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -428,7 +510,11 @@ TEST_F(FullyConnectedTest, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, Relu6Activation) {
+TEST(FullyConnected, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -447,7 +533,11 @@ TEST_F(FullyConnectedTest, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, ReluMinus1To1Activation) {
+TEST(FullyConnected, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -466,11 +556,13 @@ TEST_F(FullyConnectedTest, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, MultiThreading) {
+TEST(FullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -489,7 +581,7 @@ TEST_F(FullyConnectedTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(FullyConnectedTest, WeightsCache) {
+TEST(FullyConnected, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -497,7 +589,9 @@ TEST_F(FullyConnectedTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
index 06daba0d9bada7..f67ba714b01cc8 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
@@ -21,16 +21,17 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct SignedQuantizedConv2D : DelegateTest {};
+TEST(SignedQuantizedConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(SignedQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -58,7 +59,11 @@ TEST_F(SignedQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, 3x3) {
+TEST(SignedQuantizedConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -86,7 +91,11 @@ TEST_F(SignedQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, 3x3Stride2) {
+TEST(SignedQuantizedConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -116,7 +125,11 @@ TEST_F(SignedQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, Grouped) {
+TEST(SignedQuantizedConv2D, Grouped) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -150,7 +163,11 @@ TEST_F(SignedQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
+TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -180,7 +197,11 @@ TEST_F(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
+TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -210,7 +231,11 @@ TEST_F(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, StrideWithSamePadding) {
+TEST(SignedQuantizedConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -244,7 +269,11 @@ TEST_F(SignedQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, StrideWithValidPadding) {
+TEST(SignedQuantizedConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -278,7 +307,11 @@ TEST_F(SignedQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, DilationWithSamePadding) {
+TEST(SignedQuantizedConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -312,7 +345,11 @@ TEST_F(SignedQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, DilationWithValidPadding) {
+TEST(SignedQuantizedConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -346,7 +383,11 @@ TEST_F(SignedQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, ReluActivation) {
+TEST(SignedQuantizedConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -380,7 +421,11 @@ TEST_F(SignedQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, Relu6Activation) {
+TEST(SignedQuantizedConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -414,7 +459,11 @@ TEST_F(SignedQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, ReluMinus1To1Activation) {
+TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -448,11 +497,13 @@ TEST_F(SignedQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, MultiThreading) {
+TEST(SignedQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -486,13 +537,15 @@ TEST_F(SignedQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions delegate_options =
+TEST(SignedQuantizedConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.num_threads = 2;
-  delegate_options.flags |=
+  xnnpack_options.num_threads = 2;
+  xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
index c409b18002ef51..3acfbaaf34778e 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
@@ -20,16 +20,18 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct SignedQuantizedDepthwiseConv2D : DelegateTest {};
+TEST(SignedQuantizedDepthwiseConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(SignedQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -52,7 +54,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, 2x2) {
+TEST(SignedQuantizedDepthwiseConv2D, 2x2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -76,7 +82,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, 3x3) {
+TEST(SignedQuantizedDepthwiseConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -100,7 +110,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
+TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -126,7 +140,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, 5x5) {
+TEST(SignedQuantizedDepthwiseConv2D, 5x5) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -150,7 +168,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
+TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -176,7 +198,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
+TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -205,7 +231,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
+TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -234,7 +264,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
+TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -267,7 +301,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
+TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -300,7 +338,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
+TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -333,7 +375,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
+TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -366,7 +412,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
+TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -401,7 +451,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, ReluActivation) {
+TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -434,7 +488,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
+TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -467,7 +525,11 @@ TEST_F(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
+TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -500,11 +562,13 @@ TEST_F(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, MultiThreading) {
+TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -537,7 +601,7 @@ TEST_F(SignedQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, WeightsCache) {
+TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -545,7 +609,9 @@ TEST_F(SignedQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -579,13 +645,15 @@ TEST_F(SignedQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions delegate_options =
+TEST(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.num_threads = 2;
-  delegate_options.flags |=
+  xnnpack_options.num_threads = 2;
+  xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
index 5a7a9dfd77b24e..3097d314a3a6ab 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
@@ -21,16 +21,17 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct SignedQuantizedFullyConnected : DelegateTest {};
+TEST(SignedQuantizedFullyConnected, 1D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(SignedQuantizedFullyConnected, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -51,7 +52,11 @@ TEST_F(SignedQuantizedFullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 1DKeepDims) {
+TEST(SignedQuantizedFullyConnected, 1DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -73,7 +78,11 @@ TEST_F(SignedQuantizedFullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 2D) {
+TEST(SignedQuantizedFullyConnected, 2D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -97,7 +106,11 @@ TEST_F(SignedQuantizedFullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 2DKeepDims) {
+TEST(SignedQuantizedFullyConnected, 2DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -122,7 +135,11 @@ TEST_F(SignedQuantizedFullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 3D) {
+TEST(SignedQuantizedFullyConnected, 3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -147,7 +164,11 @@ TEST_F(SignedQuantizedFullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 3DReshape) {
+TEST(SignedQuantizedFullyConnected, 3DReshape) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -172,7 +193,11 @@ TEST_F(SignedQuantizedFullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 3DKeepDims) {
+TEST(SignedQuantizedFullyConnected, 3DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -198,7 +223,11 @@ TEST_F(SignedQuantizedFullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 4D) {
+TEST(SignedQuantizedFullyConnected, 4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -224,7 +253,11 @@ TEST_F(SignedQuantizedFullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, 4DKeepDims) {
+TEST(SignedQuantizedFullyConnected, 4DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -251,7 +284,11 @@ TEST_F(SignedQuantizedFullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, NoBias) {
+TEST(SignedQuantizedFullyConnected, NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -276,7 +313,11 @@ TEST_F(SignedQuantizedFullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, ReluActivation) {
+TEST(SignedQuantizedFullyConnected, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -301,7 +342,11 @@ TEST_F(SignedQuantizedFullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, Relu6Activation) {
+TEST(SignedQuantizedFullyConnected, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -326,7 +371,11 @@ TEST_F(SignedQuantizedFullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
+TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -351,11 +400,13 @@ TEST_F(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, MultiThreading) {
+TEST(SignedQuantizedFullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -380,7 +431,7 @@ TEST_F(SignedQuantizedFullyConnected, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedFullyConnected, WeightsCache) {
+TEST(SignedQuantizedFullyConnected, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -388,7 +439,9 @@ TEST_F(SignedQuantizedFullyConnected, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
index d4dceb9077ff26..7daae13ebdea16 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
@@ -20,16 +20,17 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct SignedQuantizedTransposeConvTest : DelegateTest {};
+TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -51,7 +52,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
+TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -74,7 +79,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2) {
+TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -96,7 +105,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
+TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -119,7 +132,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2) {
+TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -141,7 +158,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
+TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -164,7 +185,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4) {
+TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -186,7 +211,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
+TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -209,7 +238,11 @@ TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
+TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -234,7 +267,11 @@ TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
+TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -260,7 +297,11 @@ TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
+TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -285,7 +326,11 @@ TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
+TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -311,7 +356,11 @@ TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
+TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -340,7 +389,11 @@ TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
+TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -370,7 +423,11 @@ TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
+TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -399,7 +456,11 @@ TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
+TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -429,7 +490,11 @@ TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, SparseWeights) {
+TEST(SignedQuantizedTransposeConvTest, SparseWeights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -459,7 +524,11 @@ TEST_F(SignedQuantizedTransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
+TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -490,11 +559,13 @@ TEST_F(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, MultiThreading) {
+TEST(SignedQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -524,11 +595,13 @@ TEST_F(SignedQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
+TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -559,7 +632,7 @@ TEST_F(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(SignedQuantizedTransposeConvTest, WeightsCache) {
+TEST(SignedQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -567,7 +640,9 @@ TEST_F(SignedQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
index d37317c34f545a..260fd87e282a63 100644
--- a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
@@ -19,16 +19,17 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct TransposeConvTest : DelegateTest {};
+TEST(TransposeConvTest, 2x2Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(TransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -49,7 +50,11 @@ TEST_F(TransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 2x2Stride2NoBias) {
+TEST(TransposeConvTest, 2x2Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -71,7 +76,11 @@ TEST_F(TransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 3x3Stride2) {
+TEST(TransposeConvTest, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -92,7 +101,11 @@ TEST_F(TransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 3x3Stride2NoBias) {
+TEST(TransposeConvTest, 3x3Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -114,7 +127,11 @@ TEST_F(TransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 4x4Stride2) {
+TEST(TransposeConvTest, 4x4Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -135,7 +152,11 @@ TEST_F(TransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 4x4Stride2NoBias) {
+TEST(TransposeConvTest, 4x4Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -157,7 +178,11 @@ TEST_F(TransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 4x4Stride4) {
+TEST(TransposeConvTest, 4x4Stride4) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -178,7 +203,11 @@ TEST_F(TransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, 4x4Stride4NoBias) {
+TEST(TransposeConvTest, 4x4Stride4NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -200,7 +229,11 @@ TEST_F(TransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SmallKernelWithSamePadding) {
+TEST(TransposeConvTest, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -224,7 +257,11 @@ TEST_F(TransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
+TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -249,7 +286,11 @@ TEST_F(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SmallKernelWithValidPadding) {
+TEST(TransposeConvTest, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -273,7 +314,11 @@ TEST_F(TransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
+TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -298,7 +343,11 @@ TEST_F(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, StrideWithSamePadding) {
+TEST(TransposeConvTest, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -326,7 +375,11 @@ TEST_F(TransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, StrideWithSamePaddingNoBias) {
+TEST(TransposeConvTest, StrideWithSamePaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -355,7 +408,11 @@ TEST_F(TransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, StrideWithValidPadding) {
+TEST(TransposeConvTest, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -383,7 +440,11 @@ TEST_F(TransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, StrideWithValidPaddingNoBias) {
+TEST(TransposeConvTest, StrideWithValidPaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -412,7 +473,11 @@ TEST_F(TransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, FP16Weights) {
+TEST(TransposeConvTest, FP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -441,7 +506,11 @@ TEST_F(TransposeConvTest, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, FP16WeightsNoBias) {
+TEST(TransposeConvTest, FP16WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -471,7 +540,11 @@ TEST_F(TransposeConvTest, FP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
+TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -500,7 +573,11 @@ TEST_F(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
+TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -530,7 +607,11 @@ TEST_F(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
+TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -559,7 +640,11 @@ TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
+TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -589,7 +674,11 @@ TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseWeights) {
+TEST(TransposeConvTest, SparseWeights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -618,7 +707,11 @@ TEST_F(TransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseWeightsNoBias) {
+TEST(TransposeConvTest, SparseWeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -648,7 +741,11 @@ TEST_F(TransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseFP16Weights) {
+TEST(TransposeConvTest, SparseFP16Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -678,7 +775,11 @@ TEST_F(TransposeConvTest, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseFP16WeightsNoBias) {
+TEST(TransposeConvTest, SparseFP16WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -709,7 +810,11 @@ TEST_F(TransposeConvTest, SparseFP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
+TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -739,7 +844,11 @@ TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
+TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -770,7 +879,11 @@ TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
+TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -800,7 +913,11 @@ TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
+TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -831,11 +948,13 @@ TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, MultiThreading) {
+TEST(TransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -864,11 +983,13 @@ TEST_F(TransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, MultiThreadingNoBias) {
+TEST(TransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -898,7 +1019,7 @@ TEST_F(TransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(TransposeConvTest, WeightsCache) {
+TEST(TransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -906,7 +1027,9 @@ TEST_F(TransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
index b8c9d48f4f05a2..6660fc5af75ebe 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
@@ -20,16 +20,17 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct UnsignedQuantizedConv2D : DelegateTest {};
+TEST(UnsignedQuantizedConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(UnsignedQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -60,7 +61,11 @@ TEST_F(UnsignedQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, 3x3) {
+TEST(UnsignedQuantizedConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -91,7 +96,11 @@ TEST_F(UnsignedQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, 3x3Stride2) {
+TEST(UnsignedQuantizedConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -128,7 +137,11 @@ TEST_F(UnsignedQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, Grouped) {
+TEST(UnsignedQuantizedConv2D, Grouped) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -161,7 +174,11 @@ TEST_F(UnsignedQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
+TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -194,7 +211,11 @@ TEST_F(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
+TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -227,7 +248,11 @@ TEST_F(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, StrideWithSamePadding) {
+TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -264,7 +289,11 @@ TEST_F(UnsignedQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, StrideWithValidPadding) {
+TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -301,7 +330,11 @@ TEST_F(UnsignedQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, DilationWithSamePadding) {
+TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -338,7 +371,11 @@ TEST_F(UnsignedQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, DilationWithValidPadding) {
+TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -375,7 +412,11 @@ TEST_F(UnsignedQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, ReluActivation) {
+TEST(UnsignedQuantizedConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -412,7 +453,11 @@ TEST_F(UnsignedQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, Relu6Activation) {
+TEST(UnsignedQuantizedConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -449,7 +494,11 @@ TEST_F(UnsignedQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
+TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -486,11 +535,13 @@ TEST_F(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, MultiThreading) {
+TEST(UnsignedQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -527,13 +578,15 @@ TEST_F(UnsignedQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions delegate_options =
+TEST(UnsignedQuantizedConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.num_threads = 2;
-  delegate_options.flags |=
+  xnnpack_options.num_threads = 2;
+  xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
index a269343dafc512..7facb9787338c7 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
@@ -20,16 +20,17 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct UnsignedQuantizedDepthwiseConv2D : DelegateTest {};
+TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -55,7 +56,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, 2x2) {
+TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -82,7 +87,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3) {
+TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -109,7 +118,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
+TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -138,7 +151,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5) {
+TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -165,7 +182,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
+TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -194,7 +215,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
+TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -226,7 +251,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
+TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -258,7 +287,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
+TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -294,7 +327,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
+TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -330,7 +367,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
+TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -366,7 +407,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
+TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -402,7 +447,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
+TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -440,7 +489,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
+TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -476,7 +529,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
+TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -512,7 +569,11 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
+TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -548,11 +609,13 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
+TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -588,7 +651,7 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
+TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -596,7 +659,9 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -633,13 +698,15 @@ TEST_F(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions delegate_options =
+TEST(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.num_threads = 2;
-  delegate_options.flags |=
+  xnnpack_options.num_threads = 2;
+  xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
index 25aabd2a559413..90df47c884d042 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
@@ -20,16 +20,17 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct UnsignedQuantizedFullyConnected : DelegateTest {};
+TEST(UnsignedQuantizedFullyConnected, 1D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(UnsignedQuantizedFullyConnected, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -53,7 +54,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 1DKeepDims) {
+TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -78,7 +83,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 2D) {
+TEST(UnsignedQuantizedFullyConnected, 2D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -105,7 +114,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 2DKeepDims) {
+TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -133,7 +146,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 3D) {
+TEST(UnsignedQuantizedFullyConnected, 3D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -161,7 +178,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 3DReshape) {
+TEST(UnsignedQuantizedFullyConnected, 3DReshape) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -189,7 +210,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 3DKeepDims) {
+TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -218,7 +243,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 4D) {
+TEST(UnsignedQuantizedFullyConnected, 4D) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -247,7 +276,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, 4DKeepDims) {
+TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -277,7 +310,11 @@ TEST_F(UnsignedQuantizedFullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, NoBias) {
+TEST(UnsignedQuantizedFullyConnected, NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -305,7 +342,11 @@ TEST_F(UnsignedQuantizedFullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, ReluActivation) {
+TEST(UnsignedQuantizedFullyConnected, ReluActivation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -333,7 +374,11 @@ TEST_F(UnsignedQuantizedFullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, Relu6Activation) {
+TEST(UnsignedQuantizedFullyConnected, Relu6Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -361,7 +406,11 @@ TEST_F(UnsignedQuantizedFullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
+TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -389,11 +438,13 @@ TEST_F(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedFullyConnected, MultiThreading) {
+TEST(UnsignedQuantizedFullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
index 5167d18443ac30..8e6a779a1979f9 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
@@ -19,16 +19,17 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-struct UnsignedQuantizedTransposeConvTest : DelegateTest {};
+TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -50,7 +51,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -73,7 +78,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
+TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -95,7 +104,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -118,7 +131,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
+TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -140,7 +157,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -163,7 +184,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
+TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -185,7 +210,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -208,7 +237,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
+TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -233,7 +266,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -259,7 +296,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
+TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -284,7 +325,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -310,7 +355,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
+TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -339,7 +388,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -369,7 +422,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
+TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -398,7 +455,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -428,7 +489,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeights) {
+TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -458,7 +523,11 @@ TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -489,11 +558,13 @@ TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreading) {
+TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -523,11 +594,13 @@ TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
+TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -558,7 +631,7 @@ TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST_F(UnsignedQuantizedTransposeConvTest, WeightsCache) {
+TEST(UnsignedQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -566,7 +639,9 @@ TEST_F(UnsignedQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  UseCustomDelegate(delegate_options);
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index 9aaf497700f87f..a8c86ff5a25529 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include <unistd.h>
 #endif
 
-#include <algorithm>
 #include <cerrno>  // IWYU pragma: keep
 #include <cinttypes>
 #include <cstddef>
@@ -38,7 +37,6 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
-#include "experimental.h"  // from @XNNPACK
 #include "xnnpack.h"  // from @XNNPACK
 #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
 #include "flatbuffers/verifier.h"  // from @flatbuffers
@@ -80,20 +78,6 @@ bool FileExists(const char* path) {
   return access(path, F_OK) != -1;
 }
 
-bool CheckFingerprints(const cache::schema::BufferList* buffer_list) {
-  if (buffer_list->fingerprints()) {
-    for (uint64_t cache_fingerprint : *buffer_list->fingerprints()) {
-      xnn_fingerprint fingerprint;
-      static_assert(sizeof(fingerprint) == sizeof(cache_fingerprint));
-      std::memcpy(&fingerprint, &cache_fingerprint, sizeof(fingerprint));
-      XNNPACK_RETURN_CHECK(
-          xnn_check_fingerprint(fingerprint) == xnn_status_success,
-          "fingerprint (id: 0x%x) could not be matched", fingerprint.id);
-    }
-  }
-  return true;
-}
-
 }  // namespace
 
 #define XNN_MOVE_CONSTRUCT_MEMBER(x) x(std::move(other.x))
@@ -198,8 +182,7 @@ void* WeightCacheBuilder::Reserve(size_t size) {
 }
 
 BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id,
-                                          const void* data, uint64_t size,
-                                          int32_t fingerprint_id) {
+                                          const void* data, uint64_t size) {
   XNNPACK_ABORT_CHECK(is_build_step_,
                       "cannot append data to an unstarted builder.");
   // Add some padding so that the cache file can be mmaped and the buffer
@@ -218,34 +201,6 @@ BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id,
   buffer.size = loc.size;
   schema_.buffers.push_back(std::make_unique<cache::schema::BufferT>(buffer));
 
-  // Not passing a fingerprint id is a logic error on XNNPack's side. If we
-  // don't have a fingerprint for an operation, we have no way of ensuring that
-  // the generation of the cached data hasn't changed when reloading the cache.
-  //
-  // If we just log this and continue on with the work. This run will build a
-  // cache with cached data that can't be checked in the future. This will lead,
-  // in future runs that reuse the cache, to crashes that are impossible to
-  // debug or outputs that are nonsensical without any chance of linking this
-  // back to this error.
-  //
-  // We abort because we have no way of making that failure bubble up to the
-  // calling code to handle it gracefully...
-  XNNPACK_ABORT_CHECK(fingerprint_id != 0,
-                      "XNNPack weight cache: no fingerprint identifier was set "
-                      "when appending a buffer to the cache file.");
-  const xnn_fingerprint* fingerprint = xnn_get_fingerprint(fingerprint_id);
-  XNNPACK_ABORT_CHECK(fingerprint,
-                      "XNNPack weight cache: could not find a fingerprint with "
-                      "id 0x%x when appending a buffer to the cache file.",
-                      fingerprint_id);
-  uint64_t fingerprint_value;
-  static_assert(sizeof(fingerprint_value) == sizeof(*fingerprint));
-  std::memcpy(&fingerprint_value, fingerprint, sizeof(*fingerprint));
-  if (std::find(schema_.fingerprints.begin(), schema_.fingerprints.end(),
-                fingerprint_value) == schema_.fingerprints.end()) {
-    schema_.fingerprints.push_back(fingerprint_value);
-  }
-
   if (!fd_.Write(data, size)) {
     TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR,
                     "XNNPack weight cache: cannot append buffer to cache file");
@@ -278,7 +233,16 @@ bool WeightCacheBuilder::StopBuildStep() {
   XNNPACK_RETURN_CHECK(fd_.SetPos(layout_offset) != -1,
                        "could not move in the file: %s", strerror(errno));
 
+  XNNPACK_RETURN_CHECK(
+      sizeof(XNNPackCacheHeader::xnnpack_build_identifier) ==
+          xnn_experimental_get_build_identifier_size(),
+      "cache file ('%s') header cannot hold XNNPack's build identifier: %s.",
+      file_path_.c_str(), strerror(errno));
+
   XNNPackCacheHeader header{XNNPackCacheHeader::kVersion};
+  memcpy(header.xnnpack_build_identifier,
+         xnn_experimental_get_build_identifier_data(),
+         xnn_experimental_get_build_identifier_size());
   header.buffer_list_offset = fd_.GetPos();
   header.buffer_list_size = builder.GetSize();
 
@@ -441,6 +405,12 @@ bool MMapWeightCacheProvider::Load() {
                        ", expected %" PRIu64 ". Cache needs to be built again.",
                        header.version, XNNPackCacheHeader::kVersion);
 
+  XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
+                           header.xnnpack_build_identifier,
+                           sizeof(header.xnnpack_build_identifier)),
+                       "XNNPack weight cache: incompatible XNNPack version. "
+                       "Cache needs to be built again.");
+
   XNNPACK_RETURN_CHECK(header.buffer_list_offset < mmap_handle.size(),
                        "invalid offset for buffer list descriptor.");
 
@@ -460,8 +430,6 @@ bool MMapWeightCacheProvider::Load() {
   XNNPACK_RETURN_CHECK(buffer_list,
                        "could not get packed weights from flatbuffer.");
 
-  XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list));
-
   mmap_buffer_base_offset_ = buffer_list->base_offset();
   if (const auto buffers = buffer_list->buffers(); buffers) {
     for (auto* buffer : *buffers) {
@@ -616,8 +584,7 @@ size_t MMapWeightCacheProvider::LookUpOrInsert(
     return offset_it->second.offset;
   }
 
-  const BufferLocation location =
-      builder_.Append(pack_id, ptr, size, cache_key->fingerprint_id);
+  const BufferLocation location = builder_.Append(pack_id, ptr, size);
   XNNPACK_ABORT_CHECK(!location.IsInvalid(),
                       "Inserting data in the cache failed.");
   cache_key_to_offset_.emplace(pack_id, location);
@@ -726,20 +693,10 @@ bool IsCompatibleCacheFile(FileDescriptorView fd) {
                        "Cache header version is incompatible. Expected %" PRIu64
                        ", got %" PRIu64 ".",
                        XNNPackCacheHeader::kVersion, header.version);
-
-  fd.SetPos(header.buffer_list_offset);
-  auto buffer = std::make_unique<uint8_t[]>(header.buffer_list_size);
-  XNNPACK_RETURN_CHECK(fd.Read(buffer.get(), header.buffer_list_size));
-
-  flatbuffers::Verifier verifier(buffer.get(), header.buffer_list_size);
-  XNNPACK_RETURN_CHECK(cache::schema::VerifyBufferListBuffer(verifier),
-                       "buffer list validation failed.");
-
-  const cache::schema::BufferList* buffer_list =
-      cache::schema::GetBufferList(buffer.get());
-  XNNPACK_RETURN_CHECK(buffer_list,
-                       "could not get packed weights from flatbuffer.");
-  XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list));
+  XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
+                           header.xnnpack_build_identifier,
+                           sizeof(header.xnnpack_build_identifier)),
+                       "Cache header build identifier is different.");
   return true;
 }
 
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.h b/tensorflow/lite/delegates/xnnpack/weight_cache.h
index 781422b4bec662..a7c8654df4f7ec 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.h
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.h
@@ -56,8 +56,9 @@ inline constexpr char kInMemoryCachePath[] = ":memory";
 // When reading a cache file, the cache should be rejected if `version`
 // doesn't match `kVersion`.
 struct XNNPackCacheHeader {
-  enum : uint64_t { kInvalidHeader = 0, kVersion = 2 };
+  enum : uint64_t { kInvalidHeader = 0, kVersion = 1 };
   uint64_t version;
+  uint8_t xnnpack_build_identifier[32];
   uint64_t buffer_list_offset;
   uint64_t buffer_list_size;
 };
@@ -160,8 +161,8 @@ class WeightCacheBuilder {
   // The buffer space must have been reserved before using `Reserve`. If not, a
   // new call to `Reserve` will be done and the data will be copied over.
   [[nodiscard /*The location to the appended data should be saved.*/]]
-  BufferLocation Append(PackIdentifier pack_id, const void* data, uint64_t size,
-                        int fingerprint_id);
+  BufferLocation Append(PackIdentifier pack_id, const void* data,
+                        uint64_t size);
 
   // Writes the flatbuffer to disk.
   [[nodiscard /*Writing the weight cache can fail.*/]]
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
index 37f19612010709..33566b8be2208a 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
@@ -32,14 +32,11 @@ table Buffer {
 }
 
 table BufferList {
-  /// A list of packing fingerprints. All of these need to be checked when
-  /// loading the cache to ensure that it is compatible.
-  fingerprints: [uint64];
   /// A list of buffers.
   buffers: [Buffer];
   /// Defines the base offset for the data in the file. That offset
   /// may be needed to guarantee data alignment.
-  base_offset: uint64;
+  base_offset:uint64;
 }
 
 root_type BufferList;
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
index c1e4071ff4a353..dd3093b2736517 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "experimental.h"  // from @XNNPACK
 #include "xnnpack.h"  // from @XNNPACK
 #include "flatbuffers/verifier.h"  // from @flatbuffers
 #include "tensorflow/lite/c/common.h"
@@ -57,13 +56,7 @@ namespace {
 
 using testing::ElementsAreArray;
 
-static xnn_fingerprint kDefaultFingerprint{/*id=*/0xf00d, /*value=*/0xb33f};
-
-struct WeightCacheBuilderTest : testing::Test {
-  void SetUp() override { xnn_set_fingerprint(kDefaultFingerprint); }
-};
-
-TEST_F(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
+TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   using std::size;
 
   const std::string payload = "This is some data in the file.";
@@ -79,8 +72,7 @@ TEST_F(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   const size_t payload_size = size(payload);
   void* buffer = builder.Reserve(payload_size);
   std::memcpy(buffer, payload.c_str(), payload_size);
-  auto loc =
-      builder.Append(dummy_id, buffer, payload_size, kDefaultFingerprint.id);
+  auto loc = builder.Append(dummy_id, buffer, payload_size);
 
   EXPECT_EQ(loc.size, payload_size);
   EXPECT_GE(builder.capacity(), payload_size);
@@ -131,7 +123,7 @@ TEST_F(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   EXPECT_THAT(cache_data, ElementsAreArray(payload));
 }
 
-TEST_F(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
+TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   using std::size;
 
   const std::string payload = "This is some data in the file.";
@@ -145,8 +137,7 @@ TEST_F(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   ASSERT_TRUE(builder.StartBuildStep());
 
   const size_t payload_size = size(payload);
-  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size,
-                            kDefaultFingerprint.id);
+  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size);
 
   EXPECT_EQ(loc.size, payload_size);
 
@@ -195,7 +186,7 @@ TEST_F(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   EXPECT_THAT(cache_data, ElementsAreArray(payload));
 }
 
-TEST_F(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
+TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   const std::string cache_path = testing::TempDir() + "/cache";
   const std::string payload = "This is some data in the file.";
   const PackIdentifier dummy_id{1, 2, 3};
@@ -207,8 +198,7 @@ TEST_F(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   ASSERT_TRUE(builder.StartBuildStep());
 
   const size_t payload_size = size(payload);
-  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size,
-                            kDefaultFingerprint.id);
+  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size);
   EXPECT_EQ(loc.size, payload_size);
   ASSERT_TRUE(builder.StopBuildStep());
 
@@ -228,13 +218,13 @@ TEST_F(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   EXPECT_FALSE(builder.StartBuildStep());
 }
 
-TEST_F(WeightCacheBuilderTest, InvalidFileDescriptorFails) {
+TEST(WeightCacheBuilderTest, InvalidFileDescriptorFails) {
   WeightCacheBuilder builder;
   EXPECT_FALSE(builder.Start("", FileDescriptor()));
   EXPECT_FALSE(builder.Start("/seldf/sedsft", FileDescriptor()));
 }
 
-TEST_F(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
+TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
   if (!TfLiteXNNPackDelegateCanUseInMemoryWeightCacheProvider()) {
     GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                     "isn't supported by the current system, skipping test.";
@@ -249,7 +239,7 @@ TEST_F(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
   EXPECT_EQ(errno, ENOENT);
 }
 
-TEST_F(WeightCacheBuilderTest, MultipleStepBuild) {
+TEST(WeightCacheBuilderTest, MultipleStepBuild) {
   using std::size;
 
   const std::string payload1 = "This is some data in the file.";
@@ -272,8 +262,7 @@ TEST_F(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload1);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload1.c_str(), payload_size);
-    const auto loc =
-        builder.Append(dummy_id1, buffer, payload_size, kDefaultFingerprint.id);
+    const auto loc = builder.Append(dummy_id1, buffer, payload_size);
     EXPECT_EQ(loc.size, payload_size);
     EXPECT_GE(builder.capacity(), payload_size);
   }
@@ -281,8 +270,7 @@ TEST_F(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload3);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload3.c_str(), payload_size);
-    const auto loc =
-        builder.Append(dummy_id3, buffer, payload_size, kDefaultFingerprint.id);
+    const auto loc = builder.Append(dummy_id3, buffer, payload_size);
     (void)loc;
   }
 
@@ -296,8 +284,7 @@ TEST_F(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload2);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload2.c_str(), payload_size);
-    const auto loc =
-        builder.Append(dummy_id2, buffer, payload_size, kDefaultFingerprint.id);
+    const auto loc = builder.Append(dummy_id2, buffer, payload_size);
     EXPECT_EQ(loc.size, payload_size);
     EXPECT_GE(builder.capacity(), payload_size);
   }
@@ -402,8 +389,7 @@ struct FakeContext {
                                           const int weights_index) const {
     return {.seed = algorithm_seed,
             .kernel = buffers[weights_index].data(),
-            .bias = nullptr,
-            .fingerprint_id = kDefaultFingerprint.id};
+            .bias = nullptr};
   }
 
   // Creates a look up key for the XNNPack weight provider C interface.
@@ -412,8 +398,7 @@ struct FakeContext {
                                           const int bias_index) const {
     return {.seed = algorithm_seed,
             .kernel = buffers[weights_index].data(),
-            .bias = buffers[bias_index].data(),
-            .fingerprint_id = kDefaultFingerprint.id};
+            .bias = buffers[bias_index].data()};
   }
 
   // Helps creating fake packed data.
@@ -520,7 +505,6 @@ struct BuildMMapWeightCacheProviderTest : testing::TestWithParam<TestVariant> {
       GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                       "isn't supported by the current system, skipping test.";
     }
-    xnn_set_fingerprint(kDefaultFingerprint);
     AddTensors();
     EndSetup();
   }
@@ -739,7 +723,6 @@ struct MMapWeightCacheProviderTest : testing::TestWithParam<TestVariant> {
       GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                       "isn't supported by the current system, skipping test.";
     }
-    xnn_set_fingerprint(kDefaultFingerprint);
   }
   bool use_explicit_fd = GetParam().use_explicit_fd;
   const char* const explicit_fd_path = GetParam().explicit_fd_path;
@@ -800,14 +783,12 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_1{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[0].data.data,
-        .bias = tensors[1].data.data,
-        .fingerprint_id = kDefaultFingerprint.id};
+        .bias = tensors[1].data.data};
 
     const xnn_weights_cache_look_up_key look_up_key_3{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[3].data.data,
-        .bias = tensors[4].data.data,
-        .fingerprint_id = kDefaultFingerprint.id};
+        .bias = tensors[4].data.data};
 
     // Lookup non-packed tensor.
     ASSERT_EQ(cache->look_up(cache, &look_up_key_1), SIZE_MAX);
@@ -848,8 +829,7 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_2{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[2].data.data,
-        .bias = tensors[3].data.data,
-        .fingerprint_id = kDefaultFingerprint.id};
+        .bias = tensors[3].data.data};
 
     const size_t build_offset_2 = cache->look_up_or_insert(
         cache, &look_up_key_2, (void*)packed_data_ref_2,
@@ -924,20 +904,17 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_1{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[0].data.data,
-        .bias = tensors[1].data.data,
-        .fingerprint_id = kDefaultFingerprint.id};
+        .bias = tensors[1].data.data};
 
     const xnn_weights_cache_look_up_key look_up_key_2{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[2].data.data,
-        .bias = tensors[3].data.data,
-        .fingerprint_id = kDefaultFingerprint.id};
+        .bias = tensors[3].data.data};
 
     const xnn_weights_cache_look_up_key look_up_key_3{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[3].data.data,
-        .bias = tensors[4].data.data,
-        .fingerprint_id = kDefaultFingerprint.id};
+        .bias = tensors[4].data.data};
 
     ASSERT_TRUE(cache->is_finalized(cache));
 
@@ -968,59 +945,30 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
   }
 }
 
-TEST_P(MMapWeightCacheProviderTest, CacheIsRebuiltOnFingerprintMismatch) {
-  if (use_in_memory_cache) {
-    GTEST_SUCCEED() << "In-memory cache is never reloaded.";
-    return;
-  }
+TEST_P(MMapWeightCacheProviderTest, XnnpackRebuildOnVersionMismatch) {
   TempFileDesc temp_fd;
   const char* temp_fd_cpath = explicit_fd_path;
+  FileDescriptor temp_fd_value = temp_fd.Duplicate();
 
-  xnn_fingerprint test_fingeprint{0x7357, 0xF33D};
-  {  // Build a cache file with a specific fingerprint.
-    // Clear fingerprints and add a test fingerprint to XNNPack.
-    xnn_clear_fingerprints();
-    xnn_set_fingerprint(test_fingeprint);
-
-    // Build a cache file.
-    MMapWeightCacheProvider cache_provider;
-
-    const char kernel[] = "Fake data.";
-    TfLiteTensor tensor;
-    tensor.data.data = (void*)kernel;
-    cache_provider.MapTensorIdentifiers(
-        &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}});
-    ASSERT_TRUE(
-        cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate()));
-    ASSERT_TRUE(cache_provider.StartBuildStep());
-    const xnn_weights_cache_look_up_key look_up_key_1{
-        .seed = 1234,
-        .kernel = kernel,
-        .bias = nullptr,
-        .fingerprint_id = test_fingeprint.id};
-    xnn_weights_cache_t cache = &cache_provider.GetCacheProvider();
-    const size_t build_offset_1 = cache->look_up_or_insert(
-        cache, &look_up_key_1,
-        const_cast<void*>(reinterpret_cast<const void*>(kernel)),
-        sizeof(kernel));
-    (void)build_offset_1;
-    ASSERT_TRUE(cache_provider.StopBuildStep());
+  {  // Set bad build identifier
+    XNNPackCacheHeader header{.version = XNNPackCacheHeader::kVersion};
+    header.xnnpack_build_identifier[0] += 1;
+    ASSERT_TRUE(temp_fd_value.Write(&header, sizeof(header)));
   }
 
   if (!use_explicit_fd) {
     temp_fd.Close();
     temp_fd_cpath = temp_fd.GetCPath();
+    temp_fd_value.Close();
+    if (use_in_memory_cache) {
+      temp_fd_cpath = kInMemoryCachePath;
+    }
   }
 
-  // Change the test fingerprint value.
-  test_fingeprint.value = 0xdeadb33f;
-  xnn_set_fingerprint(test_fingeprint);
-
-  // Reload the file.
   auto build_cache_provider = std::make_unique<MMapWeightCacheProvider>();
   MMapWeightCacheProvider& cache_provider = *build_cache_provider;
-  ASSERT_TRUE(
-      cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate()));
+  ASSERT_TRUE(cache_provider.LoadOrStartBuild(temp_fd_cpath,
+                                              temp_fd_value.Duplicate()));
   ASSERT_TRUE(cache_provider.StartBuildStep());
 }
 
@@ -1032,53 +980,29 @@ class IsCompatibleCacheFileTest
   using Param = IsCompatibleCacheFileTestOverload;
 
   void SetUp() override {
-    xnn_clear_fingerprints();
-    xnn_set_fingerprint(kDefaultFingerprint);
-
-    // Build a cache file.
-    MMapWeightCacheProvider cache_provider;
-
-    const char kernel[] = "Fake data.";
-    TfLiteTensor tensor;
-    tensor.data.data = (void*)kernel;
-    cache_provider.MapTensorIdentifiers(
-        &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}});
-    ASSERT_TRUE(
-        cache_provider.LoadOrStartBuild(fd_.GetCPath(), fd_.Duplicate()));
-    ASSERT_TRUE(cache_provider.StartBuildStep());
-    const xnn_weights_cache_look_up_key look_up_key_1{
-        .seed = 1234,
-        .kernel = kernel,
-        .bias = nullptr,
-        .fingerprint_id = kDefaultFingerprint.id};
-    xnn_weights_cache_t cache = &cache_provider.GetCacheProvider();
-    const size_t build_offset_1 = cache->look_up_or_insert(
-        cache, &look_up_key_1,
-        const_cast<void*>(reinterpret_cast<const void*>(kernel)),
-        sizeof(kernel));
-    (void)build_offset_1;
-    ASSERT_TRUE(cache_provider.StopBuildStep());
-  }
-
-  void ChangeRuntimeFingerprintValue() {
-    xnn_set_fingerprint(
-        {kDefaultFingerprint.id, kDefaultFingerprint.value + 1});
+    header_.version = XNNPackCacheHeader::kVersion;
+    memcpy(header_.xnnpack_build_identifier,
+           xnn_experimental_get_build_identifier_data(),
+           xnn_experimental_get_build_identifier_size());
   }
 
-  bool CallIsCompatibleCacheFile() {
-    switch (GetParam()) {
-      case Param::kPath:
-        fd_.Close();
-        return IsCompatibleCacheFile(fd_.GetCPath());
-      case Param::kDescriptor: {
-        const auto pos = fd_.GetPos();
-        EXPECT_NE(pos, 0);  // We test with a non zero position.
-        return IsCompatibleCacheFile(fd_);
-        EXPECT_EQ(fd_.GetPos(), pos);
-      }
+  bool WriteHeaderAndReturnIsCompatibleCacheFile() {
+    if (!fd_.Write(&header_, sizeof(header_))) {
+      return false;
+    }
+    if (GetParam() == Param::kPath) {
+      fd_.Close();
+      return IsCompatibleCacheFile(fd_.GetCPath());
+    } else {
+      const FileDescriptor::Offset pos = fd_.GetPos();
+      EXPECT_NE(pos, 0);  // Ensure that we are testing with a non 0 position.
+      const bool compatible = IsCompatibleCacheFile(fd_);
+      EXPECT_EQ(pos, fd_.GetPos());
+      return compatible;
     }
   }
 
+  XNNPackCacheHeader header_{};
   TempFileDesc fd_;
 };
 
@@ -1092,18 +1016,18 @@ std::string Name(
   }
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsTrueWhenFingerprintMatches) {
-  EXPECT_TRUE(CallIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsTrueForACorrectHeader) {
+  EXPECT_TRUE(WriteHeaderAndReturnIsCompatibleCacheFile());
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintMismatches) {
-  ChangeRuntimeFingerprintValue();
-  EXPECT_FALSE(CallIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongHeaderVersion) {
+  header_.version += 1;
+  EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintIsNotFound) {
-  xnn_clear_fingerprints();
-  EXPECT_FALSE(CallIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongBuildIdentifier) {
+  header_.xnnpack_build_identifier[0] += 1;
+  EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
 }
 
 INSTANTIATE_TEST_SUITE_P(

From eed068a5ea781e9ea1e8ddb0066b55e83080ed98 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 21:16:50 -0800
Subject: [PATCH 686/753] Automated Code Change

PiperOrigin-RevId: 848001978
---
 third_party/xla/xla/backends/cpu/transforms/BUILD              | 1 +
 third_party/xla/xla/backends/cpu/transforms/library_rewriter.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/third_party/xla/xla/backends/cpu/transforms/BUILD b/third_party/xla/xla/backends/cpu/transforms/BUILD
index ff400cc58cf198..66ba9db13ba559 100644
--- a/third_party/xla/xla/backends/cpu/transforms/BUILD
+++ b/third_party/xla/xla/backends/cpu/transforms/BUILD
@@ -41,6 +41,7 @@ cc_library(
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:string_view",
+        "@com_google_protobuf//:protobuf_lite",
         "@local_tsl//tsl/platform:protobuf",
     ],
 )
diff --git a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
index 5dfae6ee54a641..69dc325a5cfdb1 100644
--- a/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
+++ b/third_party/xla/xla/backends/cpu/transforms/library_rewriter.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "google/protobuf/repeated_field.h"
 #include "xla/backends/cpu/codegen/target_machine_features.h"
 #include "xla/backends/cpu/transforms/library_matcher.h"
 #include "xla/backends/cpu/transforms/ynn_matcher.h"

From 05c8cf7c132f310c8f8f2a022ba52d22b52e52c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 22:39:43 -0800
Subject: [PATCH 687/753] respect IGNORE bad indices policy

The "IGNORE" bad indices policy of tensorflow's GatherNd specifies the gathered output from bad indices to be 0.

PiperOrigin-RevId: 848025149
---
 .../compiler/tests/gather_nd_op_test.py       | 22 ++++++-
 .../compiler/tf2xla/kernels/gather_op.cc      | 59 ++++++++++++++++++-
 2 files changed, 78 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py
index 60a7949138e9c1..4f1d6f477e896b 100644
--- a/tensorflow/compiler/tests/gather_nd_op_test.py
+++ b/tensorflow/compiler/tests/gather_nd_op_test.py
@@ -24,12 +24,14 @@
 
 class GatherNdTest(xla_test.XLATestCase):
 
-  def _runGather(self, params, indices):
+  def _runGather(self, params, indices, bad_indices_policy=""):
     with self.session():
       paramsp = array_ops.placeholder(params.dtype)
       indicesp = array_ops.placeholder(indices.dtype)
       with self.test_scope():
-        gather_nd_t = array_ops.gather_nd(paramsp, indicesp)
+        gather_nd_t = array_ops.gather_nd(
+            paramsp, indicesp, bad_indices_policy=bad_indices_policy
+        )
       feed_dict = {paramsp: params, indicesp: indices}
       return gather_nd_t.eval(feed_dict=feed_dict)
 
@@ -139,6 +141,22 @@ def testHigherRankParamsAndIndices(self):
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
 
+  def testIgnoreBadIndices(self):
+    shape = (3, 4, 5)
+    params = np.arange(np.prod(shape), dtype=np.int32).reshape(shape)
+    indices = np.array([[[0, 0], [-1, 3]], [[2, 4], [2, 3]]], dtype=np.int32)
+    gather_nd_val = self._runGather(
+        params, indices, bad_indices_policy="IGNORE"
+    )
+    expected = np.array(
+        [
+            [[0, 1, 2, 3, 4], [0, 0, 0, 0, 0]],
+            [[0, 0, 0, 0, 0], [55, 56, 57, 58, 59]],
+        ],
+        dtype=np.int32,
+    )
+    self.assertAllEqual(expected, gather_nd_val)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index e94f74d1fed8ef..3d12dadcbd53e9 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cstdint>
 #include <optional>
+#include <string>
 #include <vector>
 
 #include "absl/log/check.h"
@@ -283,7 +284,13 @@ REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstantInput("axis"), GatherOp);
 
 class GatherNdOp : public XlaOpKernel {
  public:
-  explicit GatherNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
+  explicit GatherNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {
+    // Set batch_dims_ to 0 if the attribute does not exist.
+    if (context->HasAttr("bad_indices_policy")) {
+      OP_REQUIRES_OK(context, context->GetAttr("bad_indices_policy",
+                                               &bad_indices_policy_));
+    }
+  }
 
   void Compile(XlaOpKernelContext* context) override {
     DataType params_type = context->input_type(0);
@@ -312,8 +319,58 @@ class GatherNdOp : public XlaOpKernel {
                                       indices_shape, /*axis=*/0,
                                       /*indices_are_nd=*/true, params_type,
                                       indices_type, builder, &gather));
+    // By default, XLA clips OOB indices, while "IGNORE" policy demands to fill
+    // 0s to the output. The following code implements the "IGNORE" policy by
+    // masking the gather result with the valid indices mask.
+    if (bad_indices_policy_ == "IGNORE") {
+      xla::XlaOp valid_mask;
+      for (int i = 0; i < num_index_dims; ++i) {
+        xla::XlaOp i_limit = XlaHelpers::IntegerLiteral(
+            builder, indices_type, params_shape.dim_size(i));
+        xla::XlaOp i_zero = XlaHelpers::Zero(builder, indices_type);
+        xla::XlaOp indices_i =
+            xla::SliceInDim(indices, i, i + 1, 1, indices_shape.dims() - 1);
+
+        xla::XlaOp indices_i_good =
+            xla::And(xla::Ge(indices_i, i_zero), xla::Lt(indices_i, i_limit));
+        if (i == 0) {
+          valid_mask = indices_i_good;
+        } else {
+          valid_mask = xla::And(valid_mask, indices_i_good);
+        }
+      }
+      auto gather_shape = builder->GetShape(gather);
+      OP_REQUIRES_OK(context, gather_shape.status());
+
+      std::vector<int64_t> valid_mask_dims(
+          gather_shape->dimensions().begin(),
+          gather_shape->dimensions().end() - 1);
+      valid_mask = xla::Reshape(valid_mask, valid_mask_dims);
+      if (indices_shape.dims() != gather_shape->dimensions().size()) {
+        OP_REQUIRES(
+            context,
+            gather_shape->dimensions().size() == indices_shape.dims() - 1,
+            errors::InvalidArgument(
+                "Indices rank must be equal to output rank (with channel "
+                "dimension) or 1 less (w/o channel dimension)"));
+      } else {
+        std::vector<int64_t> broadcast_dims(valid_mask_dims.size(), 1);
+        for (int i = 0; i < broadcast_dims.size(); ++i) {
+          broadcast_dims[i] = i;
+        }
+        valid_mask = xla::BroadcastInDim(valid_mask, gather_shape->dimensions(),
+                                         broadcast_dims);
+      }
+
+      gather =
+          xla::Select(valid_mask, gather,
+                      xla::Broadcast(XlaHelpers::Zero(builder, params_type),
+                                     gather_shape->dimensions()));
+    }
     context->SetOutput(0, gather);
   }
+
+  std::string bad_indices_policy_;
 };
 
 REGISTER_XLA_OP(Name("GatherNd"), GatherNdOp);

From 544a1362cf69fc10ad354c3a9f6e7f36af2379b9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 22:45:50 -0800
Subject: [PATCH 688/753] Automated Code Change

PiperOrigin-RevId: 848026870
---
 third_party/xla/xla/service/cpu/cpu_executable.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/third_party/xla/xla/service/cpu/cpu_executable.cc b/third_party/xla/xla/service/cpu/cpu_executable.cc
index e04c3205a92c0e..a0597e58eb2e82 100644
--- a/third_party/xla/xla/service/cpu/cpu_executable.cc
+++ b/third_party/xla/xla/service/cpu/cpu_executable.cc
@@ -159,7 +159,7 @@ static absl::StatusOr<MaybeOwningDeviceAddress> MemoryForAllocation(
   if (allocation.is_entry_computation_parameter()) {
     se::DeviceAddressBase out = arguments[allocation.parameter_number()]
                                     .Buffer(allocation.param_shape_index())
-                                    .AsDeviceMemoryBase();
+                                    .AsDeviceAddress();
     CHECK_LE(allocation.size(), out.size())
         << "Size mismatch on param " << allocation.parameter_number()
         << " at shape index " << allocation.param_shape_index().ToString();
@@ -239,8 +239,7 @@ absl::Status CpuExecutable::ExecuteThunks(
   VLOG(3) << absl::StrFormat("  Number of buffer allocations: %u",
                              buffers.size());
   auto mem_printer = [](std::string* out, const MaybeOwningDeviceAddress& mem) {
-    absl::StrAppend(out,
-                    absl::StrFormat("%p", mem.AsDeviceMemoryBase().opaque()));
+    absl::StrAppend(out, absl::StrFormat("%p", mem.AsDeviceAddress().opaque()));
   };
   VLOG(3) << absl::StrFormat("  Buffer allocations: [%s]",
                              absl::StrJoin(buffers, ", ", mem_printer));
@@ -370,9 +369,9 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
         result_buffer = allocated_buffer.Release();
         MaybeOwningDeviceAddress& registered_buffer = buffers[buffer_index];
         CHECK_EQ(result_buffer.size(),
-                 registered_buffer.AsDeviceMemoryBase().size());
+                 registered_buffer.AsDeviceAddress().size());
         std::memcpy(/*dest=*/result_buffer.opaque(),
-                    /*src=*/registered_buffer.AsDeviceMemoryBase().opaque(),
+                    /*src=*/registered_buffer.AsDeviceAddress().opaque(),
                     /*n=*/result_buffer.size());
         registered_buffer = result_buffer;
       }
@@ -385,7 +384,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
         result_buffer = owned_buffer->Release();
         buffer = result_buffer;
       } else {
-        result_buffer = buffer.AsDeviceMemoryBase();
+        result_buffer = buffer.AsDeviceAddress();
         result.AddAliasedIndex(index);
       }
     }

From 458b16c38a6e65531a2f4c6ba5855fdb3c8902ec Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 22 Dec 2025 23:10:04 -0800
Subject: [PATCH 689/753] Automated Code Change

PiperOrigin-RevId: 848032987
---
 tensorflow/core/grappler/optimizers/auto_mixed_precision.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
index 6bbb19623cdd39..ee2ad12f2e769e 100644
--- a/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
+++ b/tensorflow/core/grappler/optimizers/auto_mixed_precision.cc
@@ -90,12 +90,12 @@ std::pair<int, int> GetDeviceGPUArch(
   }
 
   int major, minor;
-  if (!strings::safe_strto32(split_arch_str[0], &major)) {
+  if (!absl::SimpleAtoi(split_arch_str[0], &major)) {
     return {0, 0};
   }
 
   if (split_arch_str.size() > 1) {
-    if (strings::safe_strto32(split_arch_str[1], &minor)) {
+    if (absl::SimpleAtoi(split_arch_str[1], &minor)) {
       return {major, minor};
     } else {
       return {0, 0};

From c19720a4815dda7e8c58ca334ade2da8149b6704 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:03:58 -0800
Subject: [PATCH 690/753] compat: Update forward compatibility horizon to
 2025-12-23

PiperOrigin-RevId: 848066260
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 52dc4100f8da06..62b652fca06d5e 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 22)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 23)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From c313fc596d6e458192e018e4a761cd4376b6c052 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:04:06 -0800
Subject: [PATCH 691/753] Update GraphDef version to 2450.

PiperOrigin-RevId: 848066295
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index d59705a833dfa8..492e874659b100 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2449  // Updated: 2025/12/22
+#define TF_GRAPH_DEF_VERSION 2450  // Updated: 2025/12/23
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 07f79114342c0d8d9f3bd61f02355272ca735e5d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:30:26 -0800
Subject: [PATCH 692/753] Automated Code Change

PiperOrigin-RevId: 848073971
---
 tensorflow/lite/python/testdata/double_op.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/python/testdata/double_op.cc b/tensorflow/lite/python/testdata/double_op.cc
index a6f8c542cd3b19..9d227a1e83e8ea 100644
--- a/tensorflow/lite/python/testdata/double_op.cc
+++ b/tensorflow/lite/python/testdata/double_op.cc
@@ -52,8 +52,8 @@ class DoubleOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(
-    Name("Double").Device(DEVICE_CPU).TypeConstraint<int32>("T"),
-    DoubleOp<int32>);
+    Name("Double").Device(DEVICE_CPU).TypeConstraint<int32_t>("T"),
+    DoubleOp<int32_t>);
 REGISTER_KERNEL_BUILDER(
     Name("Double").Device(DEVICE_CPU).TypeConstraint<float>("T"),
     DoubleOp<float>);

From 1c22acd932974cff9664f906d887bdd0d22beea5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:30:42 -0800
Subject: [PATCH 693/753] Automated Code Change

PiperOrigin-RevId: 848074084
---
 tensorflow/core/data/service/credentials_factory.cc       | 2 +-
 tensorflow/core/data/service/data_transfer.cc             | 2 +-
 tensorflow/core/data/service/dispatcher_client.cc         | 2 +-
 tensorflow/core/data/service/grpc_dispatcher_impl_test.cc | 2 +-
 tensorflow/core/data/service/grpc_worker_impl_test.cc     | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/data/service/credentials_factory.cc b/tensorflow/core/data/service/credentials_factory.cc
index 721ce5b806e7af..4362800c525137 100644
--- a/tensorflow/core/data/service/credentials_factory.cc
+++ b/tensorflow/core/data/service/credentials_factory.cc
@@ -58,7 +58,7 @@ absl::Status CredentialsFactory::Get(absl::string_view protocol,
     return absl::OkStatus();
   }
 
-  std::vector<string> available_types;
+  std::vector<std::string> available_types;
   for (const auto& factory : credentials_factories()) {
     available_types.push_back(factory.first);
   }
diff --git a/tensorflow/core/data/service/data_transfer.cc b/tensorflow/core/data/service/data_transfer.cc
index 4f45b11d313e31..ee6a0b1c4d3daa 100644
--- a/tensorflow/core/data/service/data_transfer.cc
+++ b/tensorflow/core/data/service/data_transfer.cc
@@ -128,7 +128,7 @@ absl::Status DataTransferClient::Build(
     return it->second(config, out);
   }
 
-  std::vector<string> available_names;
+  std::vector<std::string> available_names;
   for (const auto& factory : transfer_client_factories()) {
     available_names.push_back(factory.first);
   }
diff --git a/tensorflow/core/data/service/dispatcher_client.cc b/tensorflow/core/data/service/dispatcher_client.cc
index c06acb3e332ddf..4a3c8a12a31057 100644
--- a/tensorflow/core/data/service/dispatcher_client.cc
+++ b/tensorflow/core/data/service/dispatcher_client.cc
@@ -55,7 +55,7 @@ absl::Status DataServiceDispatcherClient::Initialize() {
   TF_RETURN_IF_ERROR(
       CredentialsFactory::CreateClientCredentials(protocol_, &credentials));
   grpc::ChannelArguments args;
-  args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+  args.SetMaxReceiveMessageSize(std::numeric_limits<int32_t>::max());
   args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, true);
   auto channel = grpc::CreateCustomChannel(address_, credentials, args);
   stub_ = DispatcherService::NewStub(channel);
diff --git a/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc b/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc
index c04cdf7a718456..6882a6b23e09e3 100644
--- a/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc
+++ b/tensorflow/core/data/service/grpc_dispatcher_impl_test.cc
@@ -72,7 +72,7 @@ class GrpcDispatcherImplTest : public ::testing::Test {
     TF_RETURN_IF_ERROR(
         CredentialsFactory::CreateClientCredentials(kProtocol, &credentials));
     ChannelArguments args;
-    args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+    args.SetMaxReceiveMessageSize(std::numeric_limits<int32_t>::max());
     args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, true);
     std::shared_ptr<Channel> channel =
         ::grpc::CreateCustomChannel(GetDispatcherAddress(), credentials, args);
diff --git a/tensorflow/core/data/service/grpc_worker_impl_test.cc b/tensorflow/core/data/service/grpc_worker_impl_test.cc
index 23eb6989c8cb1a..2d7563274bc295 100644
--- a/tensorflow/core/data/service/grpc_worker_impl_test.cc
+++ b/tensorflow/core/data/service/grpc_worker_impl_test.cc
@@ -83,7 +83,7 @@ class GrpcWorkerImplTest : public ::testing::Test {
     TF_RETURN_IF_ERROR(
         CredentialsFactory::CreateClientCredentials(kProtocol, &credentials));
     ChannelArguments args;
-    args.SetMaxReceiveMessageSize(std::numeric_limits<int32>::max());
+    args.SetMaxReceiveMessageSize(std::numeric_limits<int32_t>::max());
     args.SetInt(GRPC_ARG_USE_LOCAL_SUBCHANNEL_POOL, true);
     std::shared_ptr<Channel> channel =
         ::grpc::CreateCustomChannel(GetWorkerAddress(), credentials, args);

From 2c2340a46b3a670ab9d8551c76002d4b48b165db Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:35:33 -0800
Subject: [PATCH 694/753] Automated Code Change

PiperOrigin-RevId: 848075741
---
 tensorflow/core/kernels/in_topk_op.cc  | 8 ++++----
 tensorflow/core/kernels/inplace_ops.cc | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/kernels/in_topk_op.cc b/tensorflow/core/kernels/in_topk_op.cc
index e5d03f902eb58f..20e12a56e8778f 100644
--- a/tensorflow/core/kernels/in_topk_op.cc
+++ b/tensorflow/core/kernels/in_topk_op.cc
@@ -129,18 +129,18 @@ namespace functor {
       typename TTypes<bool>::Vec output);                           \
   extern template struct InTopKFunctor<GPUDevice, T, TARGET_T>;
 
-DECLARE_GPU_SPEC(float, int32);
+DECLARE_GPU_SPEC(float, int32_t);
 DECLARE_GPU_SPEC(float, int64_t);
 
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
 REGISTER_KERNEL_BUILDER(
-    Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint<int32>("T"),
-    InTopK<GPUDevice, float, int32>);
+    Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint<int32_t>("T"),
+    InTopK<GPUDevice, float, int32_t>);
 REGISTER_KERNEL_BUILDER(
     Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint<int64_t>("T"),
-    InTopK<GPUDevice, float, int64>);
+    InTopK<GPUDevice, float, int64_t>);
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index 1380406a8a33cc..6948cd86c1f8b1 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -199,7 +199,7 @@ REGISTER_KERNEL_BUILDER(Name("_ParallelConcatUpdate")
                             .HostMemory("value")
                             .HostMemory("update")
                             .HostMemory("output")
-                            .TypeConstraint<int32>("T"),
+                            .TypeConstraint<int32_t>("T"),
                         ParallelConcatUpdate<CPUDevice>);
 #endif
 
@@ -463,7 +463,7 @@ REGISTER(uint8_t);
 REGISTER(int64_t);
 REGISTER(uint64_t);
 
-REGISTER_EMPTY(int32, GPU);
+REGISTER_EMPTY(int32_t, GPU);
 #undef REGISTER
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

From d612103dcf841138f3c23a713b1dfd4a9f185ad5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:35:35 -0800
Subject: [PATCH 695/753] Automated Code Change

PiperOrigin-RevId: 848075753
---
 tensorflow/core/lib/jpeg/jpeg_mem.cc          |  44 ++++----
 tensorflow/core/lib/jpeg/jpeg_mem.h           |  14 +--
 tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc | 101 +++++++++---------
 3 files changed, 80 insertions(+), 79 deletions(-)

diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.cc b/tensorflow/core/lib/jpeg/jpeg_mem.cc
index bb729bf99272cc..85c7ef3d268372 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.cc
@@ -58,7 +58,7 @@ class FewerArgsForCompiler {
  public:
   FewerArgsForCompiler(int datasize, const UncompressFlags& flags,
                        int64_t* nwarn,
-                       std::function<uint8*(int, int, int)> allocate_output)
+                       std::function<uint8_t*(int, int, int)> allocate_output)
       : datasize_(datasize),
         flags_(flags),
         pnwarn_(nwarn),
@@ -72,7 +72,7 @@ class FewerArgsForCompiler {
   const int datasize_;
   const UncompressFlags flags_;
   int64_t* const pnwarn_;
-  std::function<uint8*(int, int, int)> allocate_output_;
+  std::function<uint8_t*(int, int, int)> allocate_output_;
   int height_read_;  // number of scanline lines successfully read
   int height_;
   int stride_;
@@ -95,7 +95,7 @@ bool IsCropWindowValid(const UncompressFlags& flags, int input_image_width,
 void no_print(j_common_ptr cinfo) {}
 #endif
 
-uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
+uint8_t* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
   // unpack the argball
   const int datasize = argball->datasize_;
   const auto& flags = argball->flags_;
@@ -252,8 +252,8 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
                                         target_output_height, components);
   }
 #else
-  uint8* dstdata = argball->allocate_output_(target_output_width,
-                                             target_output_height, components);
+  uint8_t* dstdata = argball->allocate_output_(
+      target_output_width, target_output_height, components);
 #endif
   if (dstdata == nullptr) {
     jpeg_destroy_decompress(&cinfo);
@@ -509,12 +509,12 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
 //  associated libraries aren't good enough to guarantee that 7
 //  parameters won't get clobbered by the longjmp.  So we help
 //  it out a little.
-uint8* Uncompress(const void* srcdata, int datasize,
-                  const UncompressFlags& flags, int64_t* nwarn,
-                  std::function<uint8*(int, int, int)> allocate_output) {
+uint8_t* Uncompress(const void* srcdata, int datasize,
+                    const UncompressFlags& flags, int64_t* nwarn,
+                    std::function<uint8_t*(int, int, int)> allocate_output) {
   FewerArgsForCompiler argball(datasize, flags, nwarn,
                                std::move(allocate_output));
-  uint8* const dstdata = UncompressLow(srcdata, &argball);
+  uint8_t* const dstdata = UncompressLow(srcdata, &argball);
 
   const float fraction_read =
       argball.height_ == 0
@@ -530,7 +530,7 @@ uint8* Uncompress(const void* srcdata, int datasize,
   // set the unread pixels to black
   if (argball.height_read_ != argball.height_) {
     const int first_bad_line = argball.height_read_;
-    uint8* start = dstdata + first_bad_line * argball.stride_;
+    uint8_t* start = dstdata + first_bad_line * argball.stride_;
     const int nbytes = (argball.height_ - first_bad_line) * argball.stride_;
     memset(static_cast<void*>(start), 0, nbytes);
   }
@@ -538,17 +538,17 @@ uint8* Uncompress(const void* srcdata, int datasize,
   return dstdata;
 }
 
-uint8* Uncompress(const void* srcdata, int datasize,
-                  const UncompressFlags& flags, int* pwidth, int* pheight,
-                  int* pcomponents, int64_t* nwarn) {
-  uint8* buffer = nullptr;
-  uint8* result =
+uint8_t* Uncompress(const void* srcdata, int datasize,
+                    const UncompressFlags& flags, int* pwidth, int* pheight,
+                    int* pcomponents, int64_t* nwarn) {
+  uint8_t* buffer = nullptr;
+  uint8_t* result =
       Uncompress(srcdata, datasize, flags, nwarn,
                  [=, &buffer](int width, int height, int components) {
                    if (pwidth != nullptr) *pwidth = width;
                    if (pheight != nullptr) *pheight = height;
                    if (pcomponents != nullptr) *pcomponents = components;
-                   buffer = new uint8[height * width * components];
+                   buffer = new uint8_t[height * width * components];
                    return buffer;
                  });
   if (!result) delete[] buffer;
@@ -599,7 +599,7 @@ bool GetImageInfo(const void* srcdata, int datasize, int* width, int* height,
 // Compression
 
 namespace {
-bool CompressInternal(const uint8* srcdata, int width, int height,
+bool CompressInternal(const uint8_t* srcdata, int width, int height,
                       const CompressFlags& flags, tstring* output) {
   if (output == nullptr) {
     LOG(ERROR) << "Output buffer is null: ";
@@ -711,7 +711,7 @@ bool CompressInternal(const uint8* srcdata, int width, int height,
   if (!flags.xmp_metadata.empty()) {
     // XMP metadata is embedded in the APP1 tag of JPEG and requires this
     // namespace header string (null-terminated)
-    const string name_space = "http://ns.adobe.com/xap/1.0/";
+    const std::string name_space = "http://ns.adobe.com/xap/1.0/";
     const int name_space_length = name_space.size();
     const int metadata_length = flags.xmp_metadata.size();
     const int packet_length = metadata_length + name_space_length + 1;
@@ -736,8 +736,8 @@ bool CompressInternal(const uint8* srcdata, int width, int height,
       new JSAMPLE[width * cinfo.input_components]);
   while (cinfo.next_scanline < cinfo.image_height) {
     JSAMPROW row_pointer[1];  // pointer to JSAMPLE row[s]
-    const uint8* r = &srcdata[cinfo.next_scanline * in_stride];
-    uint8* p = static_cast<uint8*>(row_temp.get());
+    const uint8_t* r = &srcdata[cinfo.next_scanline * in_stride];
+    uint8_t* p = static_cast<uint8_t*>(row_temp.get());
     switch (flags.format) {
       case FORMAT_RGBA: {
         for (int i = 0; i < width; ++i, p += 3, r += 4) {
@@ -777,14 +777,14 @@ bool CompressInternal(const uint8* srcdata, int width, int height,
 
 bool Compress(const void* srcdata, int width, int height,
               const CompressFlags& flags, tstring* output) {
-  return CompressInternal(static_cast<const uint8*>(srcdata), width, height,
+  return CompressInternal(static_cast<const uint8_t*>(srcdata), width, height,
                           flags, output);
 }
 
 tstring Compress(const void* srcdata, int width, int height,
                  const CompressFlags& flags) {
   tstring temp;
-  CompressInternal(static_cast<const uint8*>(srcdata), width, height, flags,
+  CompressInternal(static_cast<const uint8_t*>(srcdata), width, height, flags,
                    &temp);
   // If CompressInternal fails, temp will be empty.
   return temp;
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem.h b/tensorflow/core/lib/jpeg/jpeg_mem.h
index 859c4702fd09fa..569abb6b79bf74 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem.h
+++ b/tensorflow/core/lib/jpeg/jpeg_mem.h
@@ -87,19 +87,19 @@ struct UncompressFlags {
 // The function returns a pointer to the raw uncompressed data or NULL if
 // there was an error. The caller of the function is responsible for
 // freeing the memory (using delete []).
-uint8* Uncompress(const void* srcdata, int datasize,
-                  const UncompressFlags& flags, int* width, int* height,
-                  int* components,  // Output only: useful with autodetect
-                  int64_t* nwarn);
+uint8_t* Uncompress(const void* srcdata, int datasize,
+                    const UncompressFlags& flags, int* width, int* height,
+                    int* components,  // Output only: useful with autodetect
+                    int64_t* nwarn);
 
 // Version of Uncompress that allocates memory via a callback.  The callback
 // arguments are (width, height, components).  If the size is known ahead of
 // time this function can return an existing buffer; passing a callback allows
 // the buffer to be shaped based on the JPEG header.  The caller is responsible
 // for freeing the memory *even along error paths*.
-uint8* Uncompress(const void* srcdata, int datasize,
-                  const UncompressFlags& flags, int64_t* nwarn,
-                  std::function<uint8*(int, int, int)> allocate_output);
+uint8_t* Uncompress(const void* srcdata, int datasize,
+                    const UncompressFlags& flags, int64_t* nwarn,
+                    std::function<uint8_t*(int, int, int)> allocate_output);
 
 // Read jpeg header and get image information.  Returns true on success.
 // The width, height, and components points may be null.
diff --git a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc
index 0248a453d5586f..a8c5401bf52e01 100644
--- a/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc
+++ b/tensorflow/core/lib/jpeg/jpeg_mem_unittest.cc
@@ -41,12 +41,12 @@ namespace {
 
 const char kTestData[] = "tensorflow/core/lib/jpeg/testdata/";
 
-int ComputeSumAbsoluteDifference(const uint8* a, const uint8* b, int width,
+int ComputeSumAbsoluteDifference(const uint8_t* a, const uint8_t* b, int width,
                                  int height, int a_stride, int b_stride) {
   int totalerr = 0;
   for (int i = 0; i < height; i++) {
-    const uint8* const pa = a + i * a_stride;
-    const uint8* const pb = b + i * b_stride;
+    const uint8_t* const pa = a + i * a_stride;
+    const uint8_t* const pb = b + i * b_stride;
     for (int j = 0; j < 3 * width; j++) {
       totalerr += abs(static_cast<int>(pa[j]) - static_cast<int>(pb[j]));
     }
@@ -55,20 +55,21 @@ int ComputeSumAbsoluteDifference(const uint8* a, const uint8* b, int width,
 }
 
 // Reads the contents of the file into output
-void ReadFileToStringOrDie(Env* env, const string& filename, string* output) {
+void ReadFileToStringOrDie(Env* env, const std::string& filename,
+                           std::string* output) {
   TF_CHECK_OK(ReadFileToString(env, filename, output));
 }
 
-void TestJPEG(Env* env, const string& jpegfile) {
+void TestJPEG(Env* env, const std::string& jpegfile) {
   // Read the data from the jpeg file into memory
-  string jpeg;
+  std::string jpeg;
   ReadFileToStringOrDie(env, jpegfile, &jpeg);
   const int fsize = jpeg.size();
-  const uint8* const temp = absl::bit_cast<const uint8*>(jpeg.data());
+  const uint8_t* const temp = absl::bit_cast<const uint8_t*>(jpeg.data());
 
   // Try partial decoding (half of the data)
   int w, h, c;
-  std::unique_ptr<uint8[]> imgdata;
+  std::unique_ptr<uint8_t[]> imgdata;
 
   UncompressFlags flags;
   flags.components = 3;
@@ -91,7 +92,7 @@ void TestJPEG(Env* env, const string& jpegfile) {
 
 TEST(JpegMemTest, Jpeg) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
 
   // Name of a valid jpeg file on the disk
   TestJPEG(env, data_path + "jpeg_merge_test1.jpg");
@@ -100,16 +101,16 @@ TEST(JpegMemTest, Jpeg) {
   TestJPEG(env, data_path + "jpeg_merge_test1_cmyk.jpg");
 }
 
-void TestCropAndDecodeJpeg(Env* env, const string& jpegfile,
+void TestCropAndDecodeJpeg(Env* env, const std::string& jpegfile,
                            const UncompressFlags& default_flags) {
   // Read the data from the jpeg file into memory
-  string jpeg;
+  std::string jpeg;
   ReadFileToStringOrDie(env, jpegfile, &jpeg);
   const int fsize = jpeg.size();
-  const auto* temp = absl::bit_cast<const uint8*>(jpeg.data());
+  const auto* temp = absl::bit_cast<const uint8_t*>(jpeg.data());
 
   // Decode the whole image.
-  std::unique_ptr<uint8[]> imgdata1;
+  std::unique_ptr<uint8_t[]> imgdata1;
   int w1, h1, c1;
   {
     UncompressFlags flags = default_flags;
@@ -119,13 +120,13 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile,
       // If stride is not zero, the default allocator would fail because it
       // allocate w*h*c bytes, but the actual required bytes should be stride*h.
       // Therefore, we provide a specialized allocator here.
-      uint8* buffer = nullptr;
+      uint8_t* buffer = nullptr;
       imgdata1.reset(Uncompress(temp, fsize, flags, nullptr,
                                 [&](int width, int height, int components) {
                                   w1 = width;
                                   h1 = height;
                                   c1 = components;
-                                  buffer = new uint8[flags.stride * height];
+                                  buffer = new uint8_t[flags.stride * height];
                                   return buffer;
                                 }));
     }
@@ -134,7 +135,7 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile,
 
   auto check_crop_and_decode_func = [&](int crop_x, int crop_y, int crop_width,
                                         int crop_height) {
-    std::unique_ptr<uint8[]> imgdata2;
+    std::unique_ptr<uint8_t[]> imgdata2;
     int w, h, c;
     UncompressFlags flags = default_flags;
     flags.crop = true;
@@ -145,13 +146,13 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile,
     if (flags.stride == 0) {
       imgdata2.reset(Uncompress(temp, fsize, flags, &w, &h, &c, nullptr));
     } else {
-      uint8* buffer = nullptr;
+      uint8_t* buffer = nullptr;
       imgdata2.reset(Uncompress(temp, fsize, flags, nullptr,
                                 [&](int width, int height, int components) {
                                   w = width;
                                   h = height;
                                   c = components;
-                                  buffer = new uint8[flags.stride * height];
+                                  buffer = new uint8_t[flags.stride * height];
                                   return buffer;
                                 }));
     }
@@ -164,8 +165,8 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile,
     const int stride1 = (flags.stride != 0) ? flags.stride : w1 * c;
     const int stride2 = (flags.stride != 0) ? flags.stride : w * c;
     for (int i = 0; i < crop_height; i++) {
-      const uint8* p1 = &imgdata1[(i + crop_y) * stride1 + crop_x * c];
-      const uint8* p2 = &imgdata2[i * stride2];
+      const uint8_t* p1 = &imgdata1[(i + crop_y) * stride1 + crop_x * c];
+      const uint8_t* p2 = &imgdata2[i * stride2];
 
       for (int j = 0; j < c * w; j++) {
         ASSERT_EQ(p1[j], p2[j])
@@ -185,7 +186,7 @@ void TestCropAndDecodeJpeg(Env* env, const string& jpegfile,
 
 TEST(JpegMemTest, CropAndDecodeJpeg) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
   UncompressFlags flags;
 
   // Test basic flags for jpeg and cmyk jpeg.
@@ -195,7 +196,7 @@ TEST(JpegMemTest, CropAndDecodeJpeg) {
 
 TEST(JpegMemTest, CropAndDecodeJpegWithRatio) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
   UncompressFlags flags;
   for (int ratio : {1, 2, 4, 8}) {
     flags.ratio = ratio;
@@ -205,7 +206,7 @@ TEST(JpegMemTest, CropAndDecodeJpegWithRatio) {
 
 TEST(JpegMemTest, CropAndDecodeJpegWithComponents) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
   UncompressFlags flags;
   for (const int components : {0, 1, 3}) {
     flags.components = components;
@@ -215,7 +216,7 @@ TEST(JpegMemTest, CropAndDecodeJpegWithComponents) {
 
 TEST(JpegMemTest, CropAndDecodeJpegWithUpScaling) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
   UncompressFlags flags;
   flags.fancy_upscaling = true;
   TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags);
@@ -223,13 +224,13 @@ TEST(JpegMemTest, CropAndDecodeJpegWithUpScaling) {
 
 TEST(JpegMemTest, CropAndDecodeJpegWithStride) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
 
   // Read the data from the jpeg file into memory
-  string jpeg;
+  std::string jpeg;
   ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg);
   const int fsize = jpeg.size();
-  const auto* temp = absl::bit_cast<const uint8*>(jpeg.data());
+  const auto* temp = absl::bit_cast<const uint8_t*>(jpeg.data());
 
   int w, h, c;
   ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c));
@@ -244,9 +245,9 @@ TEST(JpegMemTest, CropAndDecodeJpegWithStride) {
   TestCropAndDecodeJpeg(env, data_path + "jpeg_merge_test1.jpg", flags);
 }
 
-void CheckInvalidCropWindowFailed(const uint8* const temp, int fsize, int x,
+void CheckInvalidCropWindowFailed(const uint8_t* const temp, int fsize, int x,
                                   int y, int w, int h) {
-  std::unique_ptr<uint8[]> imgdata;
+  std::unique_ptr<uint8_t[]> imgdata;
   int ww, hh, cc;
   UncompressFlags flags;
   flags.components = 3;
@@ -261,13 +262,13 @@ void CheckInvalidCropWindowFailed(const uint8* const temp, int fsize, int x,
 
 TEST(JpegMemTest, CropAndDecodeJpegWithInvalidCropWindow) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
 
   // Read the data from the jpeg file into memory
-  string jpeg;
+  std::string jpeg;
   ReadFileToStringOrDie(env, data_path + "jpeg_merge_test1.jpg", &jpeg);
   const int fsize = jpeg.size();
-  const auto* temp = absl::bit_cast<const uint8*>(jpeg.data());
+  const auto* temp = absl::bit_cast<const uint8_t*>(jpeg.data());
 
   int w, h, c;
   ASSERT_TRUE(GetImageInfo(temp, fsize, &w, &h, &c));
@@ -293,27 +294,27 @@ TEST(JpegMemTest, Jpeg2) {
   const int in_w = 256;
   const int in_h = 256;
   const int stride1 = 3 * in_w;
-  const std::unique_ptr<uint8[]> refdata1(new uint8[stride1 * in_h]);
+  const std::unique_ptr<uint8_t[]> refdata1(new uint8_t[stride1 * in_h]);
   for (int i = 0; i < in_h; i++) {
     for (int j = 0; j < in_w; j++) {
       const int offset = i * stride1 + 3 * j;
       refdata1[offset + 0] = i;
       refdata1[offset + 1] = j;
-      refdata1[offset + 2] = static_cast<uint8>((i + j) >> 1);
+      refdata1[offset + 2] = static_cast<uint8_t>((i + j) >> 1);
     }
   }
 
   // duplicate with weird input stride
   const int stride2 = 3 * 357;
-  const std::unique_ptr<uint8[]> refdata2(new uint8[stride2 * in_h]);
+  const std::unique_ptr<uint8_t[]> refdata2(new uint8_t[stride2 * in_h]);
   for (int i = 0; i < in_h; i++) {
     memcpy(&refdata2[i * stride2], &refdata1[i * stride1], 3 * in_w);
   }
 
   // Test compression
-  string cpdata1, cpdata2;
+  std::string cpdata1, cpdata2;
   {
-    const string kXMP = "XMP_TEST_123";
+    const std::string kXMP = "XMP_TEST_123";
 
     // Compress it to JPEG
     CompressFlags flags;
@@ -327,7 +328,7 @@ TEST(JpegMemTest, Jpeg2) {
     CHECK_EQ(cpdata1, cpdata2);
 
     // Verify valid XMP.
-    CHECK_NE(string::npos, cpdata1.find(kXMP));
+    CHECK_NE(std::string::npos, cpdata1.find(kXMP));
 
     // Test the other API, where a storage string is supplied
     tstring cptest;
@@ -340,7 +341,7 @@ TEST(JpegMemTest, Jpeg2) {
   }
 
   // Uncompress twice: once with 3 components and once with autodetect.
-  std::unique_ptr<uint8[]> imgdata1;
+  std::unique_ptr<uint8_t[]> imgdata1;
   for (const int components : {0, 3}) {
     // Uncompress it
     UncompressFlags flags;
@@ -366,7 +367,7 @@ TEST(JpegMemTest, Jpeg2) {
   {
     UncompressFlags flags;
     flags.stride = 3 * 411;
-    const std::unique_ptr<uint8[]> imgdata2(new uint8[flags.stride * in_h]);
+    const std::unique_ptr<uint8_t[]> imgdata2(new uint8_t[flags.stride * in_h]);
     CHECK(imgdata2.get() == Uncompress(cpdata2.c_str(), cpdata2.length(), flags,
                                        nullptr /* nwarn */,
                                        [=, &imgdata2](int w, int h, int c) {
@@ -404,7 +405,7 @@ TEST(JpegMemTest, Jpeg2) {
 
 // Takes JPEG data and reads its headers to determine whether or not the JPEG
 // was chroma downsampled.
-bool IsChromaDownsampled(const string& jpegdata) {
+bool IsChromaDownsampled(const std::string& jpegdata) {
   // Initialize libjpeg structures to have a memory source
   // Modify the usual jpeg error manager to catch fatal errors.
   struct jpeg_decompress_struct cinfo;
@@ -447,8 +448,8 @@ bool IsChromaDownsampled(const string& jpegdata) {
 
 TEST(JpegMemTest, ChromaDownsampling) {
   // Read the data from a test jpeg file into memory
-  const string jpegfile = string(kTestData) + "jpeg_merge_test1.jpg";
-  string jpeg;
+  const std::string jpegfile = std::string(kTestData) + "jpeg_merge_test1.jpg";
+  std::string jpeg;
   ReadFileToStringOrDie(Env::Default(), jpegfile, &jpeg);
 
   // Verify that compressing the JPEG with chroma downsampling works.
@@ -458,7 +459,7 @@ TEST(JpegMemTest, ChromaDownsampling) {
   unflags.components = 3;
   int w, h, c;
   int64_t num_warnings;
-  std::unique_ptr<uint8[]> uncompressed(Uncompress(
+  std::unique_ptr<uint8_t[]> uncompressed(Uncompress(
       jpeg.c_str(), jpeg.size(), unflags, &w, &h, &c, &num_warnings));
   CHECK(uncompressed != nullptr);
   CHECK_EQ(num_warnings, 0);
@@ -476,10 +477,10 @@ TEST(JpegMemTest, ChromaDownsampling) {
   }
 }
 
-void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width,
-                 int expected_height, const string& reference_RGB_file,
+void TestBadJPEG(Env* env, const std::string& bad_jpeg_file, int expected_width,
+                 int expected_height, const std::string& reference_RGB_file,
                  const bool try_recover_truncated_jpeg) {
-  string jpeg;
+  std::string jpeg;
   ReadFileToStringOrDie(env, bad_jpeg_file, &jpeg);
 
   UncompressFlags flags;
@@ -487,7 +488,7 @@ void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width,
   flags.try_recover_truncated_jpeg = try_recover_truncated_jpeg;
 
   int width, height, components;
-  std::unique_ptr<uint8[]> imgdata;
+  std::unique_ptr<uint8_t[]> imgdata;
   imgdata.reset(Uncompress(jpeg.c_str(), jpeg.size(), flags, &width, &height,
                            &components, nullptr));
   if (expected_width > 0) {  // we expect the file to decode into 'something'
@@ -496,7 +497,7 @@ void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width,
     CHECK_EQ(components, 3);
     CHECK(imgdata.get());
     if (!reference_RGB_file.empty()) {
-      string ref;
+      std::string ref;
       ReadFileToStringOrDie(env, reference_RGB_file, &ref);
       CHECK(!memcmp(ref.data(), imgdata.get(), ref.size()));
     }
@@ -507,7 +508,7 @@ void TestBadJPEG(Env* env, const string& bad_jpeg_file, int expected_width,
 
 TEST(JpegMemTest, BadJpeg) {
   Env* env = Env::Default();
-  const string data_path = kTestData;
+  const std::string data_path = kTestData;
 
   // Test corrupt file
   TestBadJPEG(env, data_path + "bad_huffman.jpg", 1024, 768, "", false);

From 1dc6721ee55c8e8b8d7ef18c20d3e04c31c0d26e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:36:39 -0800
Subject: [PATCH 696/753] Automated Code Change

PiperOrigin-RevId: 848076032
---
 tensorflow/dtensor/mlir/value_utils.cc | 4 ++--
 tensorflow/dtensor/mlir/value_utils.h  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/dtensor/mlir/value_utils.cc b/tensorflow/dtensor/mlir/value_utils.cc
index edc6afb95a67ab..9ecdfa424ac723 100644
--- a/tensorflow/dtensor/mlir/value_utils.cc
+++ b/tensorflow/dtensor/mlir/value_utils.cc
@@ -109,7 +109,7 @@ mlir::Value ReshapeSizeTypeToScalar(mlir::OpBuilder builder, mlir::Location loc,
 }
 
 mlir::Value IntConst(mlir::OpBuilder& builder, mlir::Location loc,
-                     llvm::ArrayRef<int32> values) {
+                     llvm::ArrayRef<int32_t> values) {
   auto const_type = mlir::RankedTensorType::get(
       {static_cast<int64_t>(values.size())}, builder.getIntegerType(32));
   mlir::Attribute const_attr =
@@ -172,7 +172,7 @@ mlir::Value IntConstWithMatchingType(mlir::OpBuilder& builder,
   if (llvm::cast<mlir::RankedTensorType>(type).getElementType().isInteger(64)) {
     return Int64Const(builder, loc, values);
   } else {
-    llvm::SmallVector<int32, 4> values32(values.begin(), values.end());
+    llvm::SmallVector<int32_t, 4> values32(values.begin(), values.end());
     return IntConst(builder, loc, values32);
   }
 }
diff --git a/tensorflow/dtensor/mlir/value_utils.h b/tensorflow/dtensor/mlir/value_utils.h
index 804683bc56a2cc..9775f57c79db11 100644
--- a/tensorflow/dtensor/mlir/value_utils.h
+++ b/tensorflow/dtensor/mlir/value_utils.h
@@ -48,7 +48,7 @@ StatusOr<llvm::SmallVector<int64_t>> GetTFShapeFromType(mlir::Type type);
 
 // Return a 1-D int32 constant array with the given values.
 mlir::Value IntConst(mlir::OpBuilder& builder, mlir::Location loc,
-                     llvm::ArrayRef<int32> values);
+                     llvm::ArrayRef<int32_t> values);
 // Return a 1-D int64 constant array with the given values.
 mlir::Value Int64Const(mlir::OpBuilder& builder, mlir::Location loc,
                        llvm::ArrayRef<int64_t> values);

From 117cd46dcb304d5a9f33485e28b67b4adcb0a8e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:39:34 -0800
Subject: [PATCH 697/753] Automated Code Change

PiperOrigin-RevId: 848076776
---
 tensorflow/core/kernels/data/zip_dataset_op.cc      |  6 +++---
 tensorflow/core/kernels/data/zip_dataset_op_test.cc | 11 +++++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index b381f28def6ea4..e5ef9d1451cd69 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -84,7 +84,7 @@ class ZipDatasetOp::Dataset : public DatasetBase {
   }
 
   std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
+      const std::string& prefix) const override {
     return std::make_unique<Iterator>(Iterator::Params{
         this, name_utils::IteratorPrefix(kDatasetType, prefix)});
   }
@@ -103,7 +103,7 @@ class ZipDatasetOp::Dataset : public DatasetBase {
     return output_shapes_;
   }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     return name_utils::DatasetDebugString(kDatasetType);
   }
 
@@ -137,7 +137,7 @@ class ZipDatasetOp::Dataset : public DatasetBase {
     return absl::OkStatus();
   }
 
-  absl::Status Get(OpKernelContext* ctx, int64 index,
+  absl::Status Get(OpKernelContext* ctx, int64_t index,
                    std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
     out_tensors->reserve(output_dtypes().size());
diff --git a/tensorflow/core/kernels/data/zip_dataset_op_test.cc b/tensorflow/core/kernels/data/zip_dataset_op_test.cc
index ce62706e224c2f..bfde48fb9509d8 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op_test.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op_test.cc
@@ -42,7 +42,7 @@ class ZipDatasetParams : public DatasetParams {
   ZipDatasetParams(std::vector<T> input_dataset_params,
                    DataTypeVector output_dtypes,
                    std::vector<PartialTensorShape> output_shapes,
-                   int num_input_datasets, string node_name)
+                   int num_input_datasets, std::string node_name)
       : DatasetParams(std::move(output_dtypes), std::move(output_shapes),
                       std::move(node_name)),
         num_input_datasets_(num_input_datasets) {
@@ -57,7 +57,8 @@ class ZipDatasetParams : public DatasetParams {
 
   std::vector<Tensor> GetInputTensors() const override { return {}; }
 
-  absl::Status GetInputNames(std::vector<string>* input_names) const override {
+  absl::Status GetInputNames(
+      std::vector<std::string>* input_names) const override {
     input_names->clear();
     for (int i = 0; i < num_input_datasets_; ++i) {
       input_names->emplace_back(
@@ -75,10 +76,12 @@ class ZipDatasetParams : public DatasetParams {
     return absl::OkStatus();
   }
 
-  string dataset_type() const override { return ZipDatasetOp::kDatasetType; }
+  std::string dataset_type() const override {
+    return ZipDatasetOp::kDatasetType;
+  }
 
  private:
-  int32 num_input_datasets_;
+  int32_t num_input_datasets_;
 };
 
 class ZipDatasetOpTest : public DatasetOpsTestBase {};

From 6051ff167afe6e506c9ea6db383ee624e1638730 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:39:36 -0800
Subject: [PATCH 698/753] Automated Code Change

PiperOrigin-RevId: 848076791
---
 tensorflow/core/data/service/snapshot/snapshot_manager.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/data/service/snapshot/snapshot_manager.h b/tensorflow/core/data/service/snapshot/snapshot_manager.h
index 98861523405206..4b2f19b9ca31ac 100644
--- a/tensorflow/core/data/service/snapshot/snapshot_manager.h
+++ b/tensorflow/core/data/service/snapshot/snapshot_manager.h
@@ -330,7 +330,7 @@ class SnapshotManager {
   absl::StatusOr<std::vector<Source>> CreateSources(
       const DatasetDef& dataset_def) const;
   // Returns the total number of splits.
-  absl::StatusOr<int64> GetSplitsCardinality();
+  absl::StatusOr<int64_t> GetSplitsCardinality();
   // Resets a source when it runs out of splits, to support repetitions.
   absl::Status ResetSource(Source& source, int64_t source_index);
   int64_t num_sources() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) {

From 863879c12c82ae2bf33fb0a3099877859527d882 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:40:03 -0800
Subject: [PATCH 699/753] Automated Code Change

PiperOrigin-RevId: 848076929
---
 tensorflow/core/kernels/sparse/add_op.cc      | 28 +++++++------
 .../sparse/csr_sparse_matrix_to_dense_op.cc   | 18 ++++-----
 .../csr_sparse_matrix_to_sparse_tensor_op.cc  | 18 ++++-----
 .../sparse/dense_to_csr_sparse_matrix_op.cc   | 40 +++++++++----------
 .../core/kernels/sparse/sparse_mat_mul_op.cc  |  6 +--
 .../sparse/sparse_matrix_components_op.cc     |  2 +-
 .../sparse_tensor_to_csr_sparse_matrix_op.cc  | 40 +++++++++----------
 .../core/kernels/sparse/transpose_op.cc       |  6 +--
 8 files changed, 80 insertions(+), 78 deletions(-)

diff --git a/tensorflow/core/kernels/sparse/add_op.cc b/tensorflow/core/kernels/sparse/add_op.cc
index 24e9a8cc5fb98e..ef440aa870dfe3 100644
--- a/tensorflow/core/kernels/sparse/add_op.cc
+++ b/tensorflow/core/kernels/sparse/add_op.cc
@@ -281,17 +281,18 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
         beta_(beta),
         initialized_(false) {}
 
-  Status Initialize() {
+  absl::Status Initialize() {
     TF_RETURN_IF_ERROR(cuda_sparse_.Initialize());
     TF_RETURN_IF_ERROR(descrA_.Initialize());
     TF_RETURN_IF_ERROR(descrB_.Initialize());
     TF_RETURN_IF_ERROR(descrC_.Initialize());
     initialized_ = true;
-    return OkStatus();
+    return absl::OkStatus();
   }
 
-  Status GetWorkspaceSize(const ConstCSRComponent<T>& a,
-                          const ConstCSRComponent<T>& b, size_t* bufferSize) {
+  absl::Status GetWorkspaceSize(const ConstCSRComponent<T>& a,
+                                const ConstCSRComponent<T>& b,
+                                size_t* bufferSize) {
     DCHECK(initialized_);
 
     const int m = a.row_ptr.size() - 1;
@@ -313,13 +314,13 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
         b.row_ptr.data(), b.col_ind.data(), descrC_.descr(), null_T, null_int,
         null_int, bufferSize));
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 
-  Status GetOutputStructure(const ConstCSRComponent<T>& a,
-                            const ConstCSRComponent<T>& b,
-                            TTypes<int32>::UnalignedVec c_row_ptr,
-                            int* output_nnz, void* workspace) {
+  absl::Status GetOutputStructure(const ConstCSRComponent<T>& a,
+                                  const ConstCSRComponent<T>& b,
+                                  TTypes<int32_t>::UnalignedVec c_row_ptr,
+                                  int* output_nnz, void* workspace) {
     DCHECK(initialized_);
 
     const int m = a.row_ptr.size() - 1;
@@ -343,11 +344,12 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
       return errors::Internal(
           "CSRAdd: CsrgeamNnz returned nnzTotalDevHostPtr < 0: ", *output_nnz);
     }
-    return OkStatus();
+    return absl::OkStatus();
   }
 
-  Status Compute(const ConstCSRComponent<T>& a, const ConstCSRComponent<T>& b,
-                 CSRComponent<T>* c, void* workspace) {
+  absl::Status Compute(const ConstCSRComponent<T>& a,
+                       const ConstCSRComponent<T>& b, CSRComponent<T>* c,
+                       void* workspace) {
     DCHECK(initialized_);
 
     const int m = a.row_ptr.size() - 1;
@@ -368,7 +370,7 @@ struct CSRSparseMatrixAdd<GPUDevice, T>
         b.row_ptr.data(), b.col_ind.data(), descrC_.descr(), c->values.data(),
         c->row_ptr.data(), c->col_ind.data(), workspace));
 
-    return OkStatus();
+    return absl::OkStatus();
   }
 
  private:
diff --git a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc
index 6829145263baa5..311469571aaf9f 100644
--- a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc
+++ b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_dense_op.cc
@@ -237,20 +237,20 @@ REGISTER_GPU(complex128)
 namespace functor {
 template <>
 struct COOSparseMatrixToSparseTensor<GPUDevice> {
-  Status operator()(OpKernelContext* ctx,
-                    TTypes<int64_t>::ConstVec host_dense_shape,
-                    TTypes<int>::ConstVec host_batch_ptrs,
-                    TTypes<int>::Vec coo_row_ind,
-                    TTypes<int>::ConstVec coo_col_ind,
-                    TTypes<int64_t>::Matrix indices);
+  absl::Status operator()(OpKernelContext* ctx,
+                          TTypes<int64_t>::ConstVec host_dense_shape,
+                          TTypes<int>::ConstVec host_batch_ptrs,
+                          TTypes<int>::Vec coo_row_ind,
+                          TTypes<int>::ConstVec coo_col_ind,
+                          TTypes<int64_t>::Matrix indices);
 };
 extern template struct COOSparseMatrixToSparseTensor<GPUDevice>;
 
 template <>
 struct CSRSparseMatrixToCOOSparseMatrix<GPUDevice> {
-  Status operator()(OpKernelContext* c,
-                    TTypes<const int>::UnalignedVec csr_row_ptr,
-                    TTypes<int>::UnalignedVec coo_row_ind);
+  absl::Status operator()(OpKernelContext* c,
+                          TTypes<const int>::UnalignedVec csr_row_ptr,
+                          TTypes<int>::UnalignedVec coo_row_ind);
 };
 extern template struct CSRSparseMatrixToCOOSparseMatrix<GPUDevice>;
 
diff --git a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc
index 903d3acbc67966..07448230f398fb 100644
--- a/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc
+++ b/tensorflow/core/kernels/sparse/csr_sparse_matrix_to_sparse_tensor_op.cc
@@ -222,20 +222,20 @@ REGISTER_GPU(complex128)
 namespace functor {
 template <>
 struct COOSparseMatrixToSparseTensor<GPUDevice> {
-  Status operator()(OpKernelContext* ctx,
-                    TTypes<int64_t>::ConstVec host_dense_shape,
-                    TTypes<int>::ConstVec host_batch_ptrs,
-                    TTypes<int>::Vec coo_row_ind,
-                    TTypes<int>::ConstVec coo_col_ind,
-                    TTypes<int64_t>::Matrix indices);
+  absl::Status operator()(OpKernelContext* ctx,
+                          TTypes<int64_t>::ConstVec host_dense_shape,
+                          TTypes<int>::ConstVec host_batch_ptrs,
+                          TTypes<int>::Vec coo_row_ind,
+                          TTypes<int>::ConstVec coo_col_ind,
+                          TTypes<int64_t>::Matrix indices);
 };
 extern template struct COOSparseMatrixToSparseTensor<GPUDevice>;
 
 template <>
 struct CSRSparseMatrixToCOOSparseMatrix<GPUDevice> {
-  Status operator()(OpKernelContext* c,
-                    TTypes<const int>::UnalignedVec csr_row_ptr,
-                    TTypes<int>::UnalignedVec coo_row_ind);
+  absl::Status operator()(OpKernelContext* c,
+                          TTypes<const int>::UnalignedVec csr_row_ptr,
+                          TTypes<int>::UnalignedVec coo_row_ind);
 };
 extern template struct CSRSparseMatrixToCOOSparseMatrix<GPUDevice>;
 
diff --git a/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc b/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc
index 11601d8cf3b6ef..eda72f21e674f9 100644
--- a/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc
+++ b/tensorflow/core/kernels/sparse/dense_to_csr_sparse_matrix_op.cc
@@ -174,7 +174,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
     const int64_t rows = dense_tensor_shape.dim_size((rank == 2) ? 0 : 1);
     const int64_t cols = dense_tensor_shape.dim_size((rank == 2) ? 1 : 2);
 
-    ScratchSpace<int32> nnz_per_batch_host(c, batch_size, /*on_host*/ true);
+    ScratchSpace<int32_t> nnz_per_batch_host(c, batch_size, /*on_host*/ true);
 
     Tensor nnz_per_batch_device_t;
     if (rank == 2) {
@@ -185,7 +185,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
                            c->allocate_temp(DT_INT32, TensorShape({batch_size}),
                                             &nnz_per_batch_device_t),
                            done);
-      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32>();
+      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32_t>();
 
       functor::CalculateNNZPerBatchMatrixFromIndices<Device>
           calculate_nnz_from_indices;
@@ -194,14 +194,14 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
           c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device),
           done);
 
-      stream_executor::DeviceMemoryBase nnz_per_batch_device_ptr(
+      stream_executor::DeviceAddressBase nnz_per_batch_device_ptr(
           static_cast<void*>(nnz_per_batch_device.data()));
 
       OP_REQUIRES_OK_ASYNC(
           c,
           stream->Memcpy(nnz_per_batch_host.mutable_data() /*host_dst*/,
                          nnz_per_batch_device_ptr /*gpu_src*/,
-                         batch_size * sizeof(int32) /*size*/),
+                         batch_size * sizeof(int32_t) /*size*/),
           done);
     }
 
@@ -216,7 +216,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
       // tensor by the time we get here; we can unreference it.
       nnz_per_batch_device_ref.Unref();
 
-      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32>();
+      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32_t>();
 
       {
         // Ensure that within the callback, the proper GPU settings are
@@ -227,7 +227,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
         // Extract out the values.
         Tensor temp_values_t;
         OP_REQUIRES_OK_ASYNC(c,
-                             (functor::DoGatherNd<Device, T, int64>(
+                             (functor::DoGatherNd<Device, T, int64_t>(
                                  c, params_t, indices_t, &temp_values_t)),
                              done);
         const Tensor& values_t = const_cast<const Tensor&>(temp_values_t);
@@ -249,7 +249,7 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
 
         Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                            TensorShape({batch_size + 1}));
-        auto batch_ptr = batch_ptr_t.vec<int32>();
+        auto batch_ptr = batch_ptr_t.vec<int32_t>();
         auto indices = indices_t.matrix<int64_t>();
 
         batch_ptr(0) = 0;
@@ -286,9 +286,9 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
                              &csr_row_ptr_t),
             done);
 
-        auto coo_row_ind = coo_row_ind_t.vec<int32>();
-        auto coo_col_ind = coo_col_ind_t.vec<int32>();
-        auto csr_row_ptr = csr_row_ptr_t.vec<int32>();
+        auto coo_row_ind = coo_row_ind_t.vec<int32_t>();
+        auto coo_col_ind = coo_col_ind_t.vec<int32_t>();
+        auto csr_row_ptr = csr_row_ptr_t.vec<int32_t>();
 
         // Convert SparseTensor rep to coo row ind, coo col ind.
         if (total_nnz > 0) {
@@ -302,8 +302,8 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
         // a bug if you have empty coo rows.
         // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle
         // zero-element input coo rows.
-        functor::SetZeroFunctor<Device, int32> set_zero;
-        set_zero(d, csr_row_ptr_t.flat<int32>());
+        functor::SetZeroFunctor<Device, int32_t> set_zero;
+        set_zero(d, csr_row_ptr_t.flat<int32_t>());
 
         functor::COOSparseMatrixToCSRSparseMatrix<Device> coo_to_csr;
         for (int i = 0; i < batch_size; ++i) {
@@ -313,9 +313,9 @@ class DenseToCSRSparseMatrixGPUOp : public AsyncOpKernel {
             // handled by the SetZero above.
           } else {
             // Convert coo to csr.
-            auto coo_row_ind_i =
-                TTypes<int32>::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i);
-            auto csr_row_ptr_i = TTypes<int32>::UnalignedVec(
+            auto coo_row_ind_i = TTypes<int32_t>::UnalignedVec(
+                &coo_row_ind(batch_ptr(i)), nnz_i);
+            auto csr_row_ptr_i = TTypes<int32_t>::UnalignedVec(
                 &csr_row_ptr((rows + 1) * i), rows + 1);
             OP_REQUIRES_OK_ASYNC(
                 c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i),
@@ -368,9 +368,9 @@ REGISTER_GPU(GPU, complex128)
 namespace functor {
 
 template <>
-Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
+absl::Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
     OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
-    TTypes<int32>::Vec nnz_per_batch);
+    TTypes<int32_t>::Vec nnz_per_batch);
 extern template struct CalculateNNZPerBatchMatrixFromIndices<GPUDevice>;
 
 template <>
@@ -384,9 +384,9 @@ extern template struct SparseTensorToCOOSparseMatrix<GPUDevice>;
 
 template <>
 struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
-  Status operator()(OpKernelContext* c, const int rows, const int cols,
-                    TTypes<int>::UnalignedVec coo_row_ind,
-                    TTypes<int>::UnalignedVec csr_row_ptr) {
+  absl::Status operator()(OpKernelContext* c, const int rows, const int cols,
+                          TTypes<int>::UnalignedVec coo_row_ind,
+                          TTypes<int>::UnalignedVec csr_row_ptr) {
     GpuSparse cuda_sparse(c);
     TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
     return cuda_sparse.Coo2csr(coo_row_ind.data(),
diff --git a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
index 0455fa374538fc..be11f9d81065a6 100644
--- a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc
@@ -506,7 +506,7 @@ class CSRSparseMatMulGPUOp : public OpKernel {
                      DT_INT8, TensorShape({static_cast<int64_t>(bufferSize1)}),
                      &buffer1_t));
       }
-      void* buffer1 = buffer1_t.flat<int8>().data();
+      void* buffer1 = buffer1_t.flat<int8_t>().data();
 
       // Do workEstimation using buffer1.
       // buffer1 implicitly captured in gemmDesc for use in the compute call.
@@ -525,7 +525,7 @@ class CSRSparseMatMulGPUOp : public OpKernel {
                      DT_INT8, TensorShape({static_cast<int64_t>(bufferSize2)}),
                      &buffer2_t));
       }
-      void* buffer2 = buffer2_t.flat<int8>().data();
+      void* buffer2 = buffer2_t.flat<int8_t>().data();
 
       // Compute the gemm.
       // Note that buffer1 is implicitly consumed here and buffer2 is implicitly
@@ -552,7 +552,7 @@ class CSRSparseMatMulGPUOp : public OpKernel {
       // Copy product to final c_row_ptr and intermediate column and values
       // tensors.
       void* row_ptr = &c_row_ptr(i * (rows + 1));
-      void* col_ptr = colidx_tmp.flat<int32>().data();
+      void* col_ptr = colidx_tmp.flat<int32_t>().data();
       void* val_ptr = values_tmp.flat<T>().data();
       cusparseStatus_t cusp_status =
           cusparseCsrSetPointers(matC.get(), row_ptr, col_ptr, val_ptr);
diff --git a/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc b/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc
index 353a8ecb0aa86b..d25a86056b574b 100644
--- a/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_matrix_components_op.cc
@@ -137,7 +137,7 @@ namespace functor {
       const Eigen::DSizes<Eigen::DenseIndex, 1>& sizes);        \
   extern template struct Slice<GPUDevice, T, 1>;
 
-DECLARE_GPU_SPEC(int32);
+DECLARE_GPU_SPEC(int32_t);
 DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(double);
 DECLARE_GPU_SPEC(complex64);
diff --git a/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc b/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc
index 259e9a97cd2ff3..7d7bba8601da64 100644
--- a/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc
+++ b/tensorflow/core/kernels/sparse/sparse_tensor_to_csr_sparse_matrix_op.cc
@@ -166,7 +166,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
     const int64_t rows = dense_shape((rank == 2) ? 0 : 1);
     const int64_t cols = dense_shape((rank == 2) ? 1 : 2);
 
-    static constexpr int64_t kInt32Max = std::numeric_limits<int32>::max();
+    static constexpr int64_t kInt32Max = std::numeric_limits<int32_t>::max();
     OP_REQUIRES_ASYNC(
         c, batch_size < kInt32Max,
         errors::InvalidArgument("dense_shape batch_size must be < Int32Max,"
@@ -187,7 +187,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
                                 (rows + 1) * batch_size),
         done);
 
-    ScratchSpace<int32> nnz_per_batch_host(c, batch_size, /*on_host*/ true);
+    ScratchSpace<int32_t> nnz_per_batch_host(c, batch_size, /*on_host*/ true);
 
     Tensor nnz_per_batch_device_t;
     if (rank == 2) {
@@ -198,7 +198,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
                            c->allocate_temp(DT_INT32, TensorShape({batch_size}),
                                             &nnz_per_batch_device_t),
                            done);
-      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32>();
+      auto nnz_per_batch_device = nnz_per_batch_device_t.vec<int32_t>();
 
       functor::CalculateNNZPerBatchMatrixFromIndices<Device>
           calculate_nnz_from_indices;
@@ -207,14 +207,14 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
           c, calculate_nnz_from_indices(c, indices, nnz_per_batch_device),
           done);
 
-      stream_executor::DeviceMemoryBase nnz_per_batch_device_ptr(
+      stream_executor::DeviceAddressBase nnz_per_batch_device_ptr(
           static_cast<void*>(nnz_per_batch_device.data()));
 
       OP_REQUIRES_OK_ASYNC(
           c,
           stream->Memcpy(nnz_per_batch_host.mutable_data() /*host_dst*/,
                          nnz_per_batch_device_ptr /*gpu_src*/,
-                         batch_size * sizeof(int32) /*size*/),
+                         batch_size * sizeof(int32_t) /*size*/),
           done);
     }
 
@@ -227,7 +227,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
       // tensor by the time we get here; we can unreference it.
       nnz_per_batch_device_ref.Unref();
 
-      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32>();
+      auto nnz_per_batch = nnz_per_batch_host.tensor().vec<int32_t>();
 
       // Ensure that within the callback, the proper GPU settings are
       // configured.
@@ -237,7 +237,7 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
         Tensor batch_ptr_t(cpu_allocator(), DT_INT32,
                            TensorShape({batch_size + 1}));
 
-        auto batch_ptr = batch_ptr_t.vec<int32>();
+        auto batch_ptr = batch_ptr_t.vec<int32_t>();
         auto indices = indices_t.matrix<int64_t>();
 
         batch_ptr(0) = 0;
@@ -274,9 +274,9 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
                              &csr_row_ptr_t),
             done);
 
-        auto coo_row_ind = coo_row_ind_t.vec<int32>();
-        auto coo_col_ind = coo_col_ind_t.vec<int32>();
-        auto csr_row_ptr = csr_row_ptr_t.vec<int32>();
+        auto coo_row_ind = coo_row_ind_t.vec<int32_t>();
+        auto coo_col_ind = coo_col_ind_t.vec<int32_t>();
+        auto csr_row_ptr = csr_row_ptr_t.vec<int32_t>();
 
         // Convert SparseTensor rep to coo row ind, coo col ind.
         if (total_nnz > 0) {
@@ -290,8 +290,8 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
         // a bug if you have empty coo rows.
         // TODO(ebrevdo): File bug w/ nvidia so coo2csr can handle
         // zero-element input coo rows.
-        functor::SetZeroFunctor<Device, int32> set_zero;
-        set_zero(d, csr_row_ptr_t.flat<int32>());
+        functor::SetZeroFunctor<Device, int32_t> set_zero;
+        set_zero(d, csr_row_ptr_t.flat<int32_t>());
 
         functor::COOSparseMatrixToCSRSparseMatrix<Device> coo_to_csr;
         for (int i = 0; i < batch_size; ++i) {
@@ -301,9 +301,9 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
             // handled by the SetZero above.
           } else {
             // Convert coo to csr.
-            auto coo_row_ind_i =
-                TTypes<int32>::UnalignedVec(&coo_row_ind(batch_ptr(i)), nnz_i);
-            auto csr_row_ptr_i = TTypes<int32>::UnalignedVec(
+            auto coo_row_ind_i = TTypes<int32_t>::UnalignedVec(
+                &coo_row_ind(batch_ptr(i)), nnz_i);
+            auto csr_row_ptr_i = TTypes<int32_t>::UnalignedVec(
                 &csr_row_ptr((rows + 1) * i), rows + 1);
             OP_REQUIRES_OK_ASYNC(
                 c, coo_to_csr(c, rows, cols, coo_row_ind_i, csr_row_ptr_i),
@@ -345,9 +345,9 @@ class SparseTensorToCSRSparseMatrixGPUOp : public AsyncOpKernel {
 namespace functor {
 
 template <>
-Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
+absl::Status CalculateNNZPerBatchMatrixFromIndices<GPUDevice>::operator()(
     OpKernelContext* c, TTypes<int64_t>::ConstMatrix indices,
-    TTypes<int32>::Vec nnz_per_batch);
+    TTypes<int32_t>::Vec nnz_per_batch);
 extern template struct CalculateNNZPerBatchMatrixFromIndices<GPUDevice>;
 
 template <>
@@ -361,9 +361,9 @@ extern template struct SparseTensorToCOOSparseMatrix<GPUDevice>;
 
 template <>
 struct COOSparseMatrixToCSRSparseMatrix<GPUDevice> {
-  Status operator()(OpKernelContext* c, const int rows, const int cols,
-                    TTypes<int>::UnalignedVec coo_row_ind,
-                    TTypes<int>::UnalignedVec csr_row_ptr) {
+  absl::Status operator()(OpKernelContext* c, const int rows, const int cols,
+                          TTypes<int>::UnalignedVec coo_row_ind,
+                          TTypes<int>::UnalignedVec csr_row_ptr) {
     GpuSparse cuda_sparse(c);
     TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
     return cuda_sparse.Coo2csr(coo_row_ind.data(),
diff --git a/tensorflow/core/kernels/sparse/transpose_op.cc b/tensorflow/core/kernels/sparse/transpose_op.cc
index d81c80672b31bc..234b00e5749593 100644
--- a/tensorflow/core/kernels/sparse/transpose_op.cc
+++ b/tensorflow/core/kernels/sparse/transpose_op.cc
@@ -255,8 +255,8 @@ struct CSRSparseMatrixTransposeComponent<CPUDevice, T> {
 
 template <typename T>
 struct CSRSparseMatrixTransposeComponent<GPUDevice, T> {
-  Status operator()(OpKernelContext* ctx, const ConstCSRComponent<T>& x,
-                    CSRComponent<T>* y) {
+  absl::Status operator()(OpKernelContext* ctx, const ConstCSRComponent<T>& x,
+                          CSRComponent<T>* y) {
     TF_RETURN_IF_ERROR(ValidateTransposeInputs(x, *y));
     GpuSparse cuda_sparse(ctx);
     TF_RETURN_IF_ERROR(cuda_sparse.Initialize());
@@ -277,7 +277,7 @@ struct CSRSparseMatrixTransposeComponent<GPUDevice, T> {
         x.col_ind.data() /*csrColInd*/, y->values.data() /*cscVal*/,
         y->col_ind.data() /*cscRowInd*/, y->row_ptr.data() /*cscColPtr*/,
         copyValues);
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

From 38b33b3d64553e4a6610fd7a4088f9a1a6ca4b85 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:44:09 -0800
Subject: [PATCH 700/753] Automated Code Change

PiperOrigin-RevId: 848078219
---
 tensorflow/core/kernels/unique_op.cc           |  6 +++---
 tensorflow/core/kernels/unique_op_test.cc      |  6 +++---
 tensorflow/core/kernels/variable_ops.cc        | 14 +++++++-------
 tensorflow/core/kernels/variable_ops_test.cc   |  2 +-
 tensorflow/core/kernels/where_op.cc            | 10 +++++-----
 tensorflow/core/kernels/while_op_test.cc       |  6 ++++--
 tensorflow/core/kernels/whole_file_read_ops.cc |  8 ++++----
 7 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc
index b1207aeea7f674..b23b7a1b4d4e81 100644
--- a/tensorflow/core/kernels/unique_op.cc
+++ b/tensorflow/core/kernels/unique_op.cc
@@ -83,10 +83,10 @@ class UniqueOp : public OpKernel {
     // TODO(dga):  Make unique polymorphic for returning int32 and int64
     // vectors to support large tensors.
     OP_REQUIRES(context,
-                input.NumElements() <= std::numeric_limits<int32>::max(),
+                input.NumElements() <= std::numeric_limits<int32_t>::max(),
                 errors::InvalidArgument(
                     "unique does not support input tensors larger than ",
-                    std::numeric_limits<int32>::max(), " elements"));
+                    std::numeric_limits<int32_t>::max(), " elements"));
 
     int64_t axis = 0;
     std::vector<int64_t> new_sizes{1, input.NumElements(), 1};
@@ -115,7 +115,7 @@ class UniqueOp : public OpKernel {
                         "axis tensor should be int32 or int64, but got ",
                         DataTypeString(axis_tensor.dtype())));
         if (axis_tensor.dtype() == DT_INT32) {
-          axis = internal::SubtleMustCopy(axis_tensor.scalar<int32>()());
+          axis = internal::SubtleMustCopy(axis_tensor.scalar<int32_t>()());
         } else {
           axis = internal::SubtleMustCopy(axis_tensor.scalar<int64_t>()());
         }
diff --git a/tensorflow/core/kernels/unique_op_test.cc b/tensorflow/core/kernels/unique_op_test.cc
index e21c0bfad6ae52..b870921666bd83 100644
--- a/tensorflow/core/kernels/unique_op_test.cc
+++ b/tensorflow/core/kernels/unique_op_test.cc
@@ -84,7 +84,7 @@ void BM_Unique_INT32(::testing::benchmark::State& state) {
                   "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
       .Run(state);
   state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * dim *
-                          sizeof(int32));
+                          sizeof(int32_t));
 }
 
 void BM_Unique_INT32_Repeat(::testing::benchmark::State& state) {
@@ -108,7 +108,7 @@ void BM_Unique_INT32_Repeat(::testing::benchmark::State& state) {
                   "SINGLE_THREADED_EXECUTOR", /*old_benchmark_api*/ false)
       .Run(state);
   state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * dim * 200 *
-                          sizeof(int32));
+                          sizeof(int32_t));
 }
 
 TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {
@@ -118,7 +118,7 @@ TensorProto GetRandomStringsTensorProto(int dim, int max_str_len) {
   tensor_proto.mutable_tensor_shape()->set_unknown_rank(false);
   for (int i = 0; i < dim; ++i) {
     const int len = std::rand() % max_str_len + 1;
-    string rand_str;
+    std::string rand_str;
     rand_str.resize(len);
     for (int j = 0; j < len; ++j) {
       rand_str[j] = static_cast<char>(j % 256);
diff --git a/tensorflow/core/kernels/variable_ops.cc b/tensorflow/core/kernels/variable_ops.cc
index 25eb23de84e177..a43beaecc040be 100644
--- a/tensorflow/core/kernels/variable_ops.cc
+++ b/tensorflow/core/kernels/variable_ops.cc
@@ -29,8 +29,8 @@ namespace {
 
 // Makes a unique name for a temporary variable inside a while loop body,
 // because loop can be executed in multiple iterations in parallel.
-string TemporaryVariableName(const string& var_name,
-                             const FrameAndIter& control_frame) {
+std::string TemporaryVariableName(const std::string& var_name,
+                                  const FrameAndIter& control_frame) {
   if (control_frame.frame_id != kIllegalFrameId &&
       control_frame.iter_id != kIllegalIterId) {
     return strings::StrCat(var_name, "/frame:", control_frame.frame_id,
@@ -53,7 +53,7 @@ class LegacyVar : public ResourceBase {
   mutex* mu() { return &mu_; }
   Tensor* tensor() { return &tensor_; }
 
-  string DebugString() const override {
+  std::string DebugString() const override {
     return absl::StrCat(DataTypeString(tensor_.dtype()), "/",
                         tensor_.shape().DebugString());
   }
@@ -130,14 +130,14 @@ class TemporaryVariableOp : public OpKernel {
   struct TmpVar : public ResourceBase {
     mutex mu;
     Tensor val;
-    string name;
-    string DebugString() const override { return name; }
+    std::string name;
+    std::string DebugString() const override { return name; }
     ~TmpVar() override { VLOG(3) << "TmpVar " << name << " deleted"; }
   };
 
   TensorShape shape_;
   DataType dtype_;
-  string var_name_;
+  std::string var_name_;
 };
 
 class DestroyTemporaryVariableOp : public OpKernel {
@@ -171,7 +171,7 @@ class DestroyTemporaryVariableOp : public OpKernel {
   }
 
  private:
-  string var_name_;
+  std::string var_name_;
 };
 
 class IsVariableInitializedOp : public OpKernel {
diff --git a/tensorflow/core/kernels/variable_ops_test.cc b/tensorflow/core/kernels/variable_ops_test.cc
index 0a814aab1db9fe..6ed93a0e643f2f 100644
--- a/tensorflow/core/kernels/variable_ops_test.cc
+++ b/tensorflow/core/kernels/variable_ops_test.cc
@@ -31,7 +31,7 @@ namespace {
 void ManyManyVariablesHelper(int threads, int variables,
                              ::testing::benchmark::State& state) {
   Graph g(OpRegistry::Global());
-  std::vector<string> targets;
+  std::vector<std::string> targets;
   for (int i = 0; i < variables; ++i) {
     Node* v;
     TF_CHECK_OK(
diff --git a/tensorflow/core/kernels/where_op.cc b/tensorflow/core/kernels/where_op.cc
index 1421e24cbb0fdd..42c89f61ff3f48 100644
--- a/tensorflow/core/kernels/where_op.cc
+++ b/tensorflow/core/kernels/where_op.cc
@@ -259,8 +259,8 @@ class WhereGPUOp : public AsyncOpKernel {
     const Tensor& input = context->input(0);
     const int input_dims = input.dims();
 
-    if (input.NumElements() < std::numeric_limits<int32>::max()) {
-      ComputeAsyncType<int32>(input, input_dims, context, done);
+    if (input.NumElements() < std::numeric_limits<int32_t>::max()) {
+      ComputeAsyncType<int32_t>(input, input_dims, context, done);
     } else {
       ComputeAsyncType<int64_t>(input, input_dims, context, done);
     }
@@ -282,7 +282,7 @@ class WhereGPUOp : public AsyncOpKernel {
 
     // Push kernel to stream to get number of true elements.
     const GPUDevice& d = context->eigen_device<GPUDevice>();
-    Status s = functor::NumTrue<GPUDevice, T, Tindex>::Compute(
+    absl::Status s = functor::NumTrue<GPUDevice, T, Tindex>::Compute(
         context, d, input.flat<T>(), num_true_t);
     OP_REQUIRES_OK_ASYNC(context, s, done);
 
@@ -374,9 +374,9 @@ TF_CALL_WHERE_GPU_TYPES(REGISTER_GPU_WHERE_OP);
 
 REGISTER_KERNEL_BUILDER(Name("Where")
                             .Device(DEVICE_DEFAULT)
-                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32_t>("T")
                             .HostMemory("input")
                             .HostMemory("index"),
-                        WhereCPUOp<int32>);
+                        WhereCPUOp<int32_t>);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/while_op_test.cc b/tensorflow/core/kernels/while_op_test.cc
index b7f5af047b8186..36c68d2b14e508 100644
--- a/tensorflow/core/kernels/while_op_test.cc
+++ b/tensorflow/core/kernels/while_op_test.cc
@@ -123,7 +123,8 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
 #if EIGEN_MAX_ALIGN_BYTES == 0
     return malloc(size);
 #else
-    return tensorflow::port::AlignedMalloc(size, EIGEN_MAX_ALIGN_BYTES);
+    return tsl::port::AlignedMalloc(
+        size, static_cast<std::align_val_t>(EIGEN_MAX_ALIGN_BYTES));
 #endif
   };
   se_.host_memory_deallocate = [](const SP_Device* const device, void* mem) {
@@ -136,7 +137,8 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
 #if EIGEN_MAX_ALIGN_BYTES == 0
     mem->opaque = malloc(size);
 #else
-    mem->opaque = tensorflow::port::AlignedMalloc(size, EIGEN_MAX_ALIGN_BYTES);
+    mem->opaque = tsl::port::AlignedMalloc(
+        size, static_cast<std::align_val_t>(EIGEN_MAX_ALIGN_BYTES));
 #endif
     mem->size = size;
   };
diff --git a/tensorflow/core/kernels/whole_file_read_ops.cc b/tensorflow/core/kernels/whole_file_read_ops.cc
index c5c8e548a7592f..884c7725f43d6b 100644
--- a/tensorflow/core/kernels/whole_file_read_ops.cc
+++ b/tensorflow/core/kernels/whole_file_read_ops.cc
@@ -35,7 +35,7 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename T>
-static absl::Status ReadEntireFile(Env* env, const string& filename,
+static absl::Status ReadEntireFile(Env* env, const std::string& filename,
                                    T* contents) {
   std::unique_ptr<RandomAccessFile> file;
   TF_RETURN_IF_ERROR(env->NewRandomAccessFile(filename, &file));
@@ -47,7 +47,7 @@ static absl::Status ReadEntireFile(Env* env, const string& filename,
 
 class WholeFileReader : public ReaderBase {
  public:
-  WholeFileReader(Env* env, const string& node_name)
+  WholeFileReader(Env* env, const std::string& node_name)
       : ReaderBase(absl::StrCat("WholeFileReader '", node_name, "'")),
         env_(env) {}
 
@@ -136,8 +136,8 @@ class WriteFileOp : public OpKernel {
                 errors::InvalidArgument(
                     "Contents tensor must be scalar, but had shape: ",
                     contents_input->shape().DebugString()));
-    const string& filename = filename_input->scalar<tstring>()();
-    const string dir(io::Dirname(filename));
+    const std::string& filename = filename_input->scalar<tstring>()();
+    const std::string dir(io::Dirname(filename));
     if (!context->env()->FileExists(dir).ok()) {
       OP_REQUIRES_OK(context, context->env()->RecursivelyCreateDir(dir));
     }

From 55e5af9a8e7ec18403da7788043618a66af72b98 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:47:07 -0800
Subject: [PATCH 701/753] Automated Code Change

PiperOrigin-RevId: 848079100
---
 tensorflow/core/kernels/bias_op_gpu.cu.cc     | 89 ++++++++++---------
 tensorflow/core/kernels/bias_op_gpu.h         |  6 +-
 tensorflow/core/kernels/bincount_op_gpu.cu.cc | 38 ++++----
 tensorflow/core/kernels/broadcast_to_op.cc    |  4 +-
 .../core/kernels/bucketize_op_gpu.cu.cc       | 33 +++----
 tensorflow/core/kernels/cast_op.cc            | 70 +++++++--------
 tensorflow/core/kernels/cast_op_impl_int64.cc |  2 +-
 .../core/kernels/check_numerics_op_gpu.cu.cc  | 20 ++---
 tensorflow/core/kernels/collective_nccl.cc    |  9 +-
 tensorflow/core/kernels/collective_nccl.h     |  9 +-
 tensorflow/core/kernels/concat_lib_gpu.cc     |  9 +-
 .../core/kernels/concat_lib_gpu_impl.cu.cc    |  4 +-
 .../core/kernels/conv_grad_input_ops.cc       | 61 ++++++-------
 tensorflow/core/kernels/cudnn_pooling_gpu.h   | 14 +--
 .../core/kernels/cwise_op_clip_gpu.cu.cc      | 30 +++----
 tensorflow/core/kernels/cwise_op_select.cc    |  2 +-
 .../core/kernels/depthwise_conv_grad_op.cc    | 13 +--
 17 files changed, 210 insertions(+), 203 deletions(-)

diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index 00ebc3af93e762..04aef1c88095da 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -57,23 +57,23 @@ struct AccumulatorType<Eigen::bfloat16> {
 // Definition of the GPU implementations declared in bias_op.cc.
 
 template <typename T>
-__global__ void BiasNHWCKernel(int32 nthreads, const T* __restrict__ input,
+__global__ void BiasNHWCKernel(int32_t nthreads, const T* __restrict__ input,
                                const T* __restrict__ bias,
-                               T* __restrict__ output, int32 bias_size) {
+                               T* __restrict__ output, int32_t bias_size) {
   GPU_1D_KERNEL_LOOP(index, nthreads) {
-    int32 bias_offset = index % bias_size;
+    int32_t bias_offset = index % bias_size;
     output[index] = ldg(input + index) + ldg(bias + bias_offset);
   }
 }
 
 template <typename T>
-__global__ void BiasNCHWKernel(int32 nthreads, const T* __restrict__ input,
+__global__ void BiasNCHWKernel(int32_t nthreads, const T* __restrict__ input,
                                const T* __restrict__ bias,
-                               T* __restrict__ output, int32 bias_size,
-                               int32 image_size) {
+                               T* __restrict__ output, int32_t bias_size,
+                               int32_t image_size) {
   GPU_1D_KERNEL_LOOP(index, nthreads) {
-    int32 index2 = index / image_size;
-    int32 bias_offset = index2 % bias_size;
+    int32_t index2 = index / image_size;
+    int32_t bias_offset = index2 % bias_size;
     output[index] = ldg(input + index) + ldg(bias + bias_offset);
   }
 }
@@ -82,11 +82,12 @@ __global__ void BiasNCHWKernel(int32 nthreads, const T* __restrict__ input,
 // dimension.
 template <typename T>
 void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
-                         T* output, int32 batch, int32 height, int32 width,
-                         int depth, int32 channel, TensorFormat data_format) {
-  const int32 bias_size = channel;
-  const int32 image_size = height * width * depth;
-  const int32 total_count = batch * bias_size * image_size;
+                         T* output, int32_t batch, int32_t height,
+                         int32_t width, int depth, int32_t channel,
+                         TensorFormat data_format) {
+  const int32_t bias_size = channel;
+  const int32_t image_size = height * width * depth;
+  const int32_t total_count = batch * bias_size * image_size;
   if (total_count == 0) {
     return;
   }
@@ -109,49 +110,49 @@ void BiasGPU<T>::compute(const GPUDevice& d, const T* input, const T* bias,
 
 // A naive implementation that is functional on all cases.
 template <typename T>
-__global__ void BiasGradNHWC_Naive(int32 nthreads,
+__global__ void BiasGradNHWC_Naive(int32_t nthreads,
                                    const T* __restrict__ output_backprop,
                                    T* __restrict__ bias_backprop,
-                                   int32 bias_size) {
+                                   int32_t bias_size) {
   GPU_1D_KERNEL_LOOP(index, nthreads) {
-    int32 bias_offset = index % bias_size;
+    int32_t bias_offset = index % bias_size;
     GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
   }
 }
 
 // A naive implementation that is functional on all cases.
 template <typename T>
-__global__ void BiasGradNCHW_Naive(int32 nthreads,
+__global__ void BiasGradNCHW_Naive(int32_t nthreads,
                                    const T* __restrict__ output_backprop,
                                    T* __restrict__ bias_backprop,
-                                   int32 bias_size, int32 image_size) {
+                                   int32_t bias_size, int32_t image_size) {
   GPU_1D_KERNEL_LOOP(index, nthreads) {
-    int32 index2 = index / image_size;
-    int32 bias_offset = index2 % bias_size;
+    int32_t index2 = index / image_size;
+    int32_t bias_offset = index2 % bias_size;
     GpuAtomicAdd(bias_backprop + bias_offset, ldg(output_backprop + index));
   }
 }
 
 template <typename T>
 __global__ void BiasGradNHWC_SharedAtomics(
-    int32 nthreads, const T* __restrict__ output_backprop,
-    T* __restrict__ bias_backprop, int32 bias_size) {
+    int32_t nthreads, const T* __restrict__ output_backprop,
+    T* __restrict__ bias_backprop, int32_t bias_size) {
   typedef typename AccumulatorType<T>::type AccT;
   GPU_DYNAMIC_SHARED_MEM_DECL(8, char, s_buf);
   AccT* s_data = reinterpret_cast<AccT*>(s_buf);
-  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
+  for (int32_t index = threadIdx.x; index < bias_size; index += blockDim.x) {
     s_data[index] = AccT(0);
   }
   __syncthreads();
 
-  for (int32 index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+  for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
-    int32 bias_offset = index % bias_size;
+    int32_t bias_offset = index % bias_size;
     GpuAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));
   }
   __syncthreads();
 
-  for (int32 index = threadIdx.x; index < bias_size; index += blockDim.x) {
+  for (int32_t index = threadIdx.x; index < bias_size; index += blockDim.x) {
     GpuAtomicAdd(bias_backprop + index, T(s_data[index]));
   }
 }
@@ -159,26 +160,26 @@ __global__ void BiasGradNHWC_SharedAtomics(
 template <typename T>
 __global__ void BiasGradNCHW_SharedAtomics(
     const T* __restrict__ output_backprop, T* __restrict__ bias_backprop,
-    int32 batch, int32 bias_size, int32 image_size, int group_size) {
+    int32_t batch, int32_t bias_size, int32_t image_size, int group_size) {
   // Initialize the shared memory.
   typedef typename AccumulatorType<T>::type AccT;
-  const int32 kSDataSize = 32;
+  const int32_t kSDataSize = 32;
   __shared__ AccT s_data[kSDataSize];
-  for (int32 index = threadIdx.x; index < kSDataSize; index += blockDim.x) {
+  for (int32_t index = threadIdx.x; index < kSDataSize; index += blockDim.x) {
     s_data[index] = AccT(0);
   }
   __syncthreads();
 
   // Accumulate all the values within this thread. They all have the same bias
   // index.
-  int32 bias_index = blockIdx.x % bias_size;
-  int32 group_index = blockIdx.x / bias_size;
-  int32 total_count = batch * image_size;
+  int32_t bias_index = blockIdx.x % bias_size;
+  int32_t group_index = blockIdx.x / bias_size;
+  int32_t total_count = batch * image_size;
   AccT sum(0);
-  for (int32 index = group_index * blockDim.x + threadIdx.x;
+  for (int32_t index = group_index * blockDim.x + threadIdx.x;
        index < total_count; index += blockDim.x * group_size) {
-    int32 image_offset = index % image_size;
-    int32 batch = index / image_size;
+    int32_t image_offset = index % image_size;
+    int32_t batch = index / image_size;
     T val = ldg(output_backprop +
                 (batch * bias_size + bias_index) * image_size + image_offset);
     sum += AccT(val);
@@ -192,11 +193,11 @@ __global__ void BiasGradNCHW_SharedAtomics(
 
   // Accumulate the results in the shared memory into the first element.
   // No syncthreads is needed since this is only in the same warp.
-  int32 thread_index = threadIdx.x;
+  int32_t thread_index = threadIdx.x;
 #if GOOGLE_CUDA
   if (thread_index < 32) {
     AccT data = s_data[thread_index];
-    for (int32 delta = warpSize / 2; delta > 0; delta /= 2) {
+    for (int32_t delta = warpSize / 2; delta > 0; delta /= 2) {
       data += GpuShuffleXorSync(kCudaWarpAll, data, delta);
     }
     if (thread_index == 0) {
@@ -219,20 +220,20 @@ __global__ void BiasGradNCHW_SharedAtomics(
 
 template <typename T>
 void BiasGradGPU<T>::compute(const GPUDevice& d, const T* output_backprop,
-                             T* bias_backprop, int32 batch, int32 height,
-                             int32 width, int32 depth, int32 channel,
+                             T* bias_backprop, int32_t batch, int32_t height,
+                             int32_t width, int32_t depth, int32_t channel,
                              TensorFormat data_format) {
-  const int32 bias_size = channel;
-  const int32 image_size = height * width * depth;
-  const int32 total_count = batch * bias_size * image_size;
+  const int32_t bias_size = channel;
+  const int32_t image_size = height * width * depth;
+  const int32_t total_count = batch * bias_size * image_size;
   if (total_count == 0) {
     return;
   }
-  static constexpr int32 kWarpSize = 32;
+  static constexpr int32_t kWarpSize = 32;
   GpuLaunchConfig config = GetGpuLaunchConfig(total_count, d);
 
   const int max_shared_memory_size = d.sharedMemPerBlock() / 2;
-  int32 shared_memory_size = 0;
+  int32_t shared_memory_size = 0;
   if (data_format == FORMAT_NHWC) {
     shared_memory_size = bias_size * sizeof(typename AccumulatorType<T>::type);
   }
diff --git a/tensorflow/core/kernels/bias_op_gpu.h b/tensorflow/core/kernels/bias_op_gpu.h
index 0ece14a946cd19..60f17e6de240de 100644
--- a/tensorflow/core/kernels/bias_op_gpu.h
+++ b/tensorflow/core/kernels/bias_op_gpu.h
@@ -68,12 +68,12 @@ class BiasGradGPUProfileResult {
   }
   BiasAddGradGPUMode algorithm() const { return algorithm_; }
   void set_algorithm(BiasAddGradGPUMode val) { algorithm_ = val; }
-  uint64 elapsed_time() const { return elapsed_time_; }
-  void set_elapsed_time(uint64 val) { elapsed_time_ = val; }
+  uint64_t elapsed_time() const { return elapsed_time_; }
+  void set_elapsed_time(uint64_t val) { elapsed_time_ = val; }
 
  private:
   BiasAddGradGPUMode algorithm_ = BiasAddGradGPUMode::kInvalid;
-  uint64 elapsed_time_ = std::numeric_limits<uint64>::max();
+  uint64_t elapsed_time_ = std::numeric_limits<uint64_t>::max();
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/bincount_op_gpu.cu.cc b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
index 8389fb2a8e1180..4d309f6f6286ea 100644
--- a/tensorflow/core/kernels/bincount_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bincount_op_gpu.cu.cc
@@ -36,11 +36,11 @@ namespace functor {
 
 template <typename Tidx, typename T>
 struct BincountFunctor<GPUDevice, Tidx, T, false> {
-  static Status Compute(OpKernelContext* context,
-                        const typename TTypes<Tidx, 1>::ConstTensor& arr,
-                        const typename TTypes<T, 1>::ConstTensor& weights,
-                        typename TTypes<T, 1>::Tensor& output,
-                        const Tidx num_bins) {
+  static absl::Status Compute(OpKernelContext* context,
+                              const typename TTypes<Tidx, 1>::ConstTensor& arr,
+                              const typename TTypes<T, 1>::ConstTensor& weights,
+                              typename TTypes<T, 1>::Tensor& output,
+                              const Tidx num_bins) {
     if (weights.size() != 0) {
       return errors::Unimplemented(
           "Weights are not yet supported by the GPU implementation of Bincount."
@@ -48,7 +48,7 @@ struct BincountFunctor<GPUDevice, Tidx, T, false> {
           " tf.function(jit_compile=True).");
     }
     if (output.size() == 0) {
-      return OkStatus();
+      return absl::OkStatus();
     }
     if (tensorflow::OpDeterminismRequired()) {
       // TODO(reedwm): Is this really nondeterministic?
@@ -88,11 +88,11 @@ struct BincountFunctor<GPUDevice, Tidx, T, false> {
     }
     Tensor temp_storage;
     TF_RETURN_IF_ERROR(context->allocate_temp(
-        DataTypeToEnum<int8>::value,
+        DataTypeToEnum<int8_t>::value,
         TensorShape({static_cast<int64_t>(temp_storage_bytes)}),
         &temp_storage));
 
-    void* d_temp_storage = temp_storage.flat<int8>().data();
+    void* d_temp_storage = temp_storage.flat<int8_t>().data();
     // The second HistogramEven is to actual run with d_temp_storage
     // allocated with temp_storage_bytes.
     err = gpuprim::DeviceHistogram::HistogramEven(
@@ -109,7 +109,7 @@ struct BincountFunctor<GPUDevice, Tidx, T, false> {
       return errors::Internal(
           "Could not launch HistogramEven: ", GpuGetErrorString(err), ".");
     }
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 
@@ -126,11 +126,11 @@ __global__ void BincountReduceKernel(const Tidx* in, T* out, const int nthreads,
 
 template <typename Tidx, typename T>
 struct BincountFunctor<GPUDevice, Tidx, T, true> {
-  static Status Compute(OpKernelContext* context,
-                        const typename TTypes<Tidx, 1>::ConstTensor& arr,
-                        const typename TTypes<T, 1>::ConstTensor& weights,
-                        typename TTypes<T, 1>::Tensor& output,
-                        const Tidx num_bins) {
+  static absl::Status Compute(OpKernelContext* context,
+                              const typename TTypes<Tidx, 1>::ConstTensor& arr,
+                              const typename TTypes<T, 1>::ConstTensor& weights,
+                              typename TTypes<T, 1>::Tensor& output,
+                              const Tidx num_bins) {
     const int nthreads = arr.dimension(0);
 
     auto d = context->eigen_gpu_device();
@@ -206,11 +206,11 @@ __global__ void BincountColReduceSharedKernel(const Tidx* in, const T* weights,
 
 template <typename Tidx, typename T, bool binary_count>
 struct BincountReduceFunctor<GPUDevice, Tidx, T, binary_count> {
-  static Status Compute(OpKernelContext* context,
-                        const typename TTypes<Tidx, 2>::ConstTensor& in,
-                        const typename TTypes<T, 2>::ConstTensor& weights,
-                        typename TTypes<T, 2>::Tensor& out,
-                        const Tidx num_bins) {
+  static absl::Status Compute(OpKernelContext* context,
+                              const typename TTypes<Tidx, 2>::ConstTensor& in,
+                              const typename TTypes<T, 2>::ConstTensor& weights,
+                              typename TTypes<T, 2>::Tensor& out,
+                              const Tidx num_bins) {
     const int num_rows = in.dimension(0);
     const int num_cols = in.dimension(1);
 
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
index e354966e744549..e58902ddfccc21 100644
--- a/tensorflow/core/kernels/broadcast_to_op.cc
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -149,11 +149,11 @@ TF_CALL_float8_e4m3fn(REGISTER_KERNEL);
 // registration requires all int32 inputs and outputs to be in host memory.
 REGISTER_KERNEL_BUILDER(Name("BroadcastTo")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32_t>("T")
                             .HostMemory("input")
                             .HostMemory("shape")
                             .HostMemory("output"),
-                        BroadcastToOp<CPUDevice, int32>);
+                        BroadcastToOp<CPUDevice, int32_t>);
 #endif
 #if defined(PLUGGABLE_DEVICE_SUPPORTED_MACOS)
 REGISTER_KERNEL_BUILDER(Name("BroadcastTo")
diff --git a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
index 93df5a624e76bd..d69244d0c67cad 100644
--- a/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bucketize_op_gpu.cu.cc
@@ -34,18 +34,19 @@ typedef Eigen::GpuDevice GPUDevice;
 
 template <typename T, bool useSharedMem>
 __global__ void BucketizeCustomKernel(
-    const int32 size_in, const T* __restrict__ in, const int32 size_boundaries,
-    GpuDeviceArrayStruct<float> boundaries_array, int32* __restrict__ out) {
+    const int32_t size_in, const T* __restrict__ in,
+    const int32_t size_boundaries, GpuDeviceArrayStruct<float> boundaries_array,
+    int32_t* __restrict__ out) {
   const float* boundaries = GetGpuDeviceArrayOnDevice(&boundaries_array);
 
   GPU_DYNAMIC_SHARED_MEM_DECL(sizeof(float), unsigned char, shared_mem);
   float* shared_mem_boundaries = reinterpret_cast<float*>(shared_mem);
 
   if (useSharedMem) {
-    int32 lidx = threadIdx.y * blockDim.x + threadIdx.x;
-    int32 blockSize = blockDim.x * blockDim.y;
+    int32_t lidx = threadIdx.y * blockDim.x + threadIdx.x;
+    int32_t blockSize = blockDim.x * blockDim.y;
 
-    for (int32 i = lidx; i < size_boundaries; i += blockSize) {
+    for (int32_t i = lidx; i < size_boundaries; i += blockSize) {
       shared_mem_boundaries[i] = boundaries[i];
     }
 
@@ -56,11 +57,11 @@ __global__ void BucketizeCustomKernel(
 
   GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in[i];
-    int32 bucket = 0;
-    int32 count = size_boundaries;
+    int32_t bucket = 0;
+    int32_t count = size_boundaries;
     while (count > 0) {
-      int32 l = bucket;
-      int32 step = count / 2;
+      int32_t l = bucket;
+      int32_t step = count / 2;
       l += step;
       if (!(value < static_cast<T>(boundaries[l]))) {
         bucket = ++l;
@@ -78,10 +79,10 @@ namespace functor {
 template <typename T>
 struct BucketizeFunctor<GPUDevice, T> {
   // PRECONDITION: boundaries_vector must be sorted.
-  static Status Compute(OpKernelContext* context,
-                        const typename TTypes<T, 1>::ConstTensor& input,
-                        const std::vector<float>& boundaries_vector,
-                        typename TTypes<int32, 1>::Tensor& output) {
+  static absl::Status Compute(OpKernelContext* context,
+                              const typename TTypes<T, 1>::ConstTensor& input,
+                              const std::vector<float>& boundaries_vector,
+                              typename TTypes<int32_t, 1>::Tensor& output) {
     const GPUDevice& d = context->eigen_device<GPUDevice>();
 
     GpuDeviceArrayOnHost<float> boundaries_array(context,
@@ -93,8 +94,8 @@ struct BucketizeFunctor<GPUDevice, T> {
     TF_RETURN_IF_ERROR(boundaries_array.Finalize());
 
     GpuLaunchConfig config = GetGpuLaunchConfig(input.size(), d);
-    int32 shared_mem_size = sizeof(float) * boundaries_vector.size();
-    const int32 kMaxSharedMemBytes = 16384;
+    int32_t shared_mem_size = sizeof(float) * boundaries_vector.size();
+    const int32_t kMaxSharedMemBytes = 16384;
     if (shared_mem_size < d.sharedMemPerBlock() &&
         shared_mem_size < kMaxSharedMemBytes) {
       TF_CHECK_OK(GpuLaunchKernel(BucketizeCustomKernel<T, true>,
@@ -108,7 +109,7 @@ struct BucketizeFunctor<GPUDevice, T> {
           config.thread_per_block, 0, d.stream(), input.size(), input.data(),
           boundaries_vector.size(), boundaries_array.data(), output.data()));
     }
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 }  // namespace functor
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index 0a4e011815b80d..98e35f138363d5 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -184,10 +184,10 @@ class GpuCastOp : public CastOpBase {
   }
 
  private:
-  Status Prepare() {
+  absl::Status Prepare() {
     if (external_src_dtype_ == external_dst_dtype_) {
       work_ = nullptr;  // Identity
-      return OkStatus();
+      return absl::OkStatus();
     }
     if (src_dtype_ == DT_BOOL) {
       work_ = GetGpuCastFromBool(dst_dtype_);
@@ -228,7 +228,7 @@ class GpuCastOp : public CastOpBase {
     } else if (src_dtype_ == DT_UINT4) {
       work_ = GetGpuCastFromUint4(dst_dtype_);
     }
-    return work_ == nullptr ? Unimplemented() : OkStatus();
+    return work_ == nullptr ? Unimplemented() : absl::OkStatus();
   }
 };
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
@@ -263,14 +263,14 @@ CURRY_TYPES2(REGISTER_CAST_GPU, std::complex<float>);
 CURRY_TYPES2(REGISTER_CAST_GPU, std::complex<double>);
 #else
 REGISTER_CAST_GPU(bool, bfloat16);
-REGISTER_CAST_GPU(int8, bfloat16);
-REGISTER_CAST_GPU(int16, bfloat16);
-REGISTER_CAST_GPU(int32, bfloat16);
-REGISTER_CAST_GPU(int64, bfloat16);
-REGISTER_CAST_GPU(uint8, bfloat16);
-REGISTER_CAST_GPU(uint16, bfloat16);
-REGISTER_CAST_GPU(uint32, bfloat16);
-REGISTER_CAST_GPU(uint64, bfloat16);
+REGISTER_CAST_GPU(int8_t, bfloat16);
+REGISTER_CAST_GPU(int16_t, bfloat16);
+REGISTER_CAST_GPU(int32_t, bfloat16);
+REGISTER_CAST_GPU(int64_t, bfloat16);
+REGISTER_CAST_GPU(uint8_t, bfloat16);
+REGISTER_CAST_GPU(uint16_t, bfloat16);
+REGISTER_CAST_GPU(uint32_t, bfloat16);
+REGISTER_CAST_GPU(uint64_t, bfloat16);
 REGISTER_CAST_GPU(Eigen::half, bfloat16);
 REGISTER_CAST_GPU(float, bfloat16);
 REGISTER_CAST_GPU(double, bfloat16);
@@ -301,43 +301,43 @@ REGISTER_CAST_GPU(float8_e4m3fn, float8_e5m2);
 REGISTER_CAST_GPU(float8_e4m3fn, float8_e4m3fn);
 
 REGISTER_CAST_GPU(int4, int4);
-REGISTER_CAST_GPU(int4, int8);
-REGISTER_CAST_GPU(int4, int16);
-REGISTER_CAST_GPU(int4, int32);
+REGISTER_CAST_GPU(int4, int8_t);
+REGISTER_CAST_GPU(int4, int16_t);
+REGISTER_CAST_GPU(int4, int32_t);
 REGISTER_CAST_GPU(int4, int64_t);
 REGISTER_CAST_GPU(int4, uint4);
-REGISTER_CAST_GPU(int4, uint8);
-REGISTER_CAST_GPU(int4, uint16);
-REGISTER_CAST_GPU(int4, uint32);
+REGISTER_CAST_GPU(int4, uint8_t);
+REGISTER_CAST_GPU(int4, uint16_t);
+REGISTER_CAST_GPU(int4, uint32_t);
 REGISTER_CAST_GPU(int4, uint64_t);
 
-REGISTER_CAST_GPU(int8, int4);
-REGISTER_CAST_GPU(int16, int4);
-REGISTER_CAST_GPU(int32, int4);
+REGISTER_CAST_GPU(int8_t, int4);
+REGISTER_CAST_GPU(int16_t, int4);
+REGISTER_CAST_GPU(int32_t, int4);
 REGISTER_CAST_GPU(int64_t, int4);
 REGISTER_CAST_GPU(uint4, int4);
-REGISTER_CAST_GPU(uint8, int4);
-REGISTER_CAST_GPU(uint16, int4);
-REGISTER_CAST_GPU(uint32, int4);
+REGISTER_CAST_GPU(uint8_t, int4);
+REGISTER_CAST_GPU(uint16_t, int4);
+REGISTER_CAST_GPU(uint32_t, int4);
 REGISTER_CAST_GPU(uint64_t, int4);
 
-REGISTER_CAST_GPU(uint4, int8);
-REGISTER_CAST_GPU(uint4, int16);
-REGISTER_CAST_GPU(uint4, int32);
+REGISTER_CAST_GPU(uint4, int8_t);
+REGISTER_CAST_GPU(uint4, int16_t);
+REGISTER_CAST_GPU(uint4, int32_t);
 REGISTER_CAST_GPU(uint4, int64_t);
 REGISTER_CAST_GPU(uint4, uint4);
-REGISTER_CAST_GPU(uint4, uint8);
-REGISTER_CAST_GPU(uint4, uint16);
-REGISTER_CAST_GPU(uint4, uint32);
+REGISTER_CAST_GPU(uint4, uint8_t);
+REGISTER_CAST_GPU(uint4, uint16_t);
+REGISTER_CAST_GPU(uint4, uint32_t);
 REGISTER_CAST_GPU(uint4, uint64_t);
 
-REGISTER_CAST_GPU(int8, uint4);
-REGISTER_CAST_GPU(int16, uint4);
-REGISTER_CAST_GPU(int32, uint4);
+REGISTER_CAST_GPU(int8_t, uint4);
+REGISTER_CAST_GPU(int16_t, uint4);
+REGISTER_CAST_GPU(int32_t, uint4);
 REGISTER_CAST_GPU(int64_t, uint4);
-REGISTER_CAST_GPU(uint8, uint4);
-REGISTER_CAST_GPU(uint16, uint4);
-REGISTER_CAST_GPU(uint32, uint4);
+REGISTER_CAST_GPU(uint8_t, uint4);
+REGISTER_CAST_GPU(uint16_t, uint4);
+REGISTER_CAST_GPU(uint32_t, uint4);
 REGISTER_CAST_GPU(uint64_t, uint4);
 
 #undef REGISTER_CAST_GPU
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index 7963edda7afaca..5f5552edd519ca 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -38,7 +38,7 @@ CastFunctorType GetCpuCastFromInt64(DataType dst_dtype) {
     (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
 CastFunctorType GetGpuCastFromInt64(DataType dst_dtype) {
 #if defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
-  CAST_CASE(GPUDevice, int64, bfloat16);
+  CAST_CASE(GPUDevice, int64_t, bfloat16);
 #else
   CURRY_TYPES3(CAST_CASE, GPUDevice, int64);
 #endif
diff --git a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
index b1d2b3954aa91d..31ceecab9a84ee 100644
--- a/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/check_numerics_op_gpu.cu.cc
@@ -38,10 +38,10 @@ typedef Eigen::GpuDevice GPUDevice;
 template <typename T>
 __global__ void CheckNumericsKernel(const T* __restrict__ data, int size,
                                     int abnormal_detected[2]) {
-  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const int32 total_thread_count = gridDim.x * blockDim.x;
+  const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t total_thread_count = gridDim.x * blockDim.x;
 
-  int32 offset = thread_id;
+  int32_t offset = thread_id;
 
   while (offset < size) {
     if (isnan(data[offset])) {
@@ -61,10 +61,10 @@ __global__ void CheckNumericsKernel(const T* __restrict__ data, int size,
 template <typename T>
 __global__ void CheckNumericsKernelV2(const T* __restrict__ data, int size,
                                       int abnormal_detected[3]) {
-  const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const int32 total_thread_count = gridDim.x * blockDim.x;
+  const int32_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const int32_t total_thread_count = gridDim.x * blockDim.x;
 
-  int32 offset = thread_id;
+  int32_t offset = thread_id;
 
   while (offset < size) {
     if (isnan(data[offset])) {
@@ -85,8 +85,8 @@ template <typename T>
 struct CheckNumericsLaunch {
   void Run(const GPUDevice& d, const T* data, int size,
            int abnormal_detected[2]) {
-    const int32 block_size = d.maxGpuThreadsPerBlock();
-    const int32 num_blocks =
+    const int32_t block_size = d.maxGpuThreadsPerBlock();
+    const int32_t num_blocks =
         (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
         block_size;
 
@@ -103,8 +103,8 @@ template <typename T>
 struct CheckNumericsLaunchV2 {
   void Run(const GPUDevice& d, const T* data, int size,
            int abnormal_detected[3]) {
-    const int32 block_size = d.maxGpuThreadsPerBlock();
-    const int32 num_blocks =
+    const int32_t block_size = d.maxGpuThreadsPerBlock();
+    const int32_t num_blocks =
         (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) /
         block_size;
 
diff --git a/tensorflow/core/kernels/collective_nccl.cc b/tensorflow/core/kernels/collective_nccl.cc
index c44680b27124aa..9e69fb36115602 100644
--- a/tensorflow/core/kernels/collective_nccl.cc
+++ b/tensorflow/core/kernels/collective_nccl.cc
@@ -22,10 +22,11 @@ limitations under the License.
 
 namespace tensorflow {
 
-NcclBase::NcclBase(CollectiveType type, const string& name)
+NcclBase::NcclBase(CollectiveType type, const std::string& name)
     : type_(type), name_(name), col_ctx_(nullptr), col_params_(nullptr) {}
 
-Status NcclBase::InitializeCollectiveParams(CollectiveParams* col_params) {
+absl::Status NcclBase::InitializeCollectiveParams(
+    CollectiveParams* col_params) {
   if (type_ != col_params->instance.type) {
     return errors::Internal("Expected initialized type ", type_,
                             " to match type in CollectiveParams ",
@@ -60,10 +61,10 @@ Status NcclBase::InitializeCollectiveParams(CollectiveParams* col_params) {
                             ", expected name ", expected_name);
   }
 
-  return OkStatus();
+  return absl::OkStatus();
 }
 
-Status NcclBase::InitializeCollectiveContext(
+absl::Status NcclBase::InitializeCollectiveContext(
     std::shared_ptr<CollectiveContext> col_ctx) {
   col_ctx_ = col_ctx;
   col_params_ = col_ctx->col_params.get();
diff --git a/tensorflow/core/kernels/collective_nccl.h b/tensorflow/core/kernels/collective_nccl.h
index 4fc4bebb008e3c..26a096fa3f8bb4 100644
--- a/tensorflow/core/kernels/collective_nccl.h
+++ b/tensorflow/core/kernels/collective_nccl.h
@@ -22,19 +22,20 @@ namespace tensorflow {
 
 class NcclBase : public CollectiveImplementationInterface {
  public:
-  explicit NcclBase(CollectiveType type, const string& name);
+  explicit NcclBase(CollectiveType type, const std::string& name);
   ~NcclBase() override = default;
 
   // No-op for this collective implementation.
-  Status InitializeCollectiveParams(CollectiveParams* col_params) override;
+  absl::Status InitializeCollectiveParams(
+      CollectiveParams* col_params) override;
 
   // Initializes the device objects and device localities.
-  Status InitializeCollectiveContext(
+  absl::Status InitializeCollectiveContext(
       std::shared_ptr<CollectiveContext> col_ctx) override;
 
  protected:
   const CollectiveType type_;
-  const string name_;
+  const std::string name_;
   std::shared_ptr<CollectiveContext> col_ctx_;
   const CollectiveParams* col_params_;  // Not owned
 };
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index 4237a8a6c8b438..58cdf8afd02485 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -74,8 +74,9 @@ void ConcatGPU(
         inputs_flat,
     Tensor* output, typename TTypes<T, 2>::Tensor* output_flat) {
   if (inputs_flat.size() < 16) {
-    if (output->NumElements() < std::numeric_limits<int32>::max()) {
-      ConcatGPUSlice<T, int32>(c->eigen_gpu_device(), inputs_flat, output_flat);
+    if (output->NumElements() < std::numeric_limits<int32_t>::max()) {
+      ConcatGPUSlice<T, int32_t>(c->eigen_gpu_device(), inputs_flat,
+                                 output_flat);
     } else {
       ConcatGPUSlice<T, int64_t>(c->eigen_gpu_device(), inputs_flat,
                                  output_flat);
@@ -84,8 +85,8 @@ void ConcatGPU(
     // Switching indexing to int64 might cause performance issues.
     // Hence, we keep int32 indexing in the GPU kernel unless we need to
     // switch to int64.
-    if (output->NumElements() < std::numeric_limits<int32>::max()) {
-      ConcatGPUCall<T, int32>(c, inputs_flat, output_flat);
+    if (output->NumElements() < std::numeric_limits<int32_t>::max()) {
+      ConcatGPUCall<T, int32_t>(c, inputs_flat, output_flat);
     } else {
       ConcatGPUCall<T, int64_t>(c, inputs_flat, output_flat);
     }
diff --git a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
index a6cece16d20ddf..58b6957a120f2a 100644
--- a/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu_impl.cu.cc
@@ -126,7 +126,7 @@ void ConcatGPUSlice(
     Eigen::array<IntType, 2> size;
     size[0] = inputs_flat[i]->dimension(0);
     size[1] = inputs_flat[i]->dimension(1);
-    if (std::is_same<IntType, int32>::value) {
+    if (std::is_same<IntType, int32_t>::value) {
       To32Bit(*output).slice(offset, size).device(gpu_device) =
           To32Bit(*inputs_flat[i]);
     } else {
@@ -159,7 +159,7 @@ void ConcatGPUImpl(const Eigen::GpuDevice& gpu_device,
     // on most processors
     // possibly due to decreasing occupancy
     // 4096 inputs is a lot, most code will take the smem path
-    const int32 kMaxSmemBytesPerformance = 16384;
+    const int32_t kMaxSmemBytesPerformance = 16384;
     if (smem_usage < smem_max && smem_usage < kMaxSmemBytesPerformance) {
       TF_CHECK_OK(GpuLaunchKernel(
           concat_variable_kernel<T, IntType, true>, config.block_count,
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index ee41b2ddce4eb8..ecd815f3e7e8a2 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -45,7 +45,7 @@ template struct LaunchConv2DBackpropInputOp<CPUDevice, double>;
 
 // A dummy type to group forward backward data autotune results together.
 struct ConvBackwardDataAutotuneGroup {
-  static string name() { return "ConvBwdData"; }
+  static std::string name() { return "ConvBwdData"; }
 };
 
 typedef AutotuneSingleton<ConvBackwardDataAutotuneGroup, ConvParameters,
@@ -56,14 +56,14 @@ typedef AutotuneSingleton<ConvBackwardDataAutotuneGroup, ConvParameters,
 // Computes backprop input using Eigen::SpatialConvolutionBackwardInput on GPU
 // for int32 inputs.
 template <>
-struct LaunchConv2DBackpropInputOp<GPUDevice, int32> {
+struct LaunchConv2DBackpropInputOp<GPUDevice, int32_t> {
   void operator()(OpKernelContext* ctx, bool use_cudnn, bool cudnn_use_autotune,
                   const Tensor& out_backprop, const Tensor& filter,
                   int row_dilation, int col_dilation, int row_stride,
                   int col_stride, const Padding& padding,
                   const std::vector<int64_t>& explicit_paddings,
                   Tensor* in_backprop, TensorFormat data_format) {
-    LaunchConv2DBackpropInputOpImpl<GPUDevice, int32> launcher;
+    LaunchConv2DBackpropInputOpImpl<GPUDevice, int32_t> launcher;
     launcher(ctx, use_cudnn, cudnn_use_autotune, out_backprop, filter,
              row_dilation, col_dilation, row_stride, col_stride, padding,
              explicit_paddings, in_backprop, data_format);
@@ -82,8 +82,8 @@ void LaunchConv2DBackpropInputOpGpuImpl(
   using se::dnn::AlgorithmDesc;
   using se::dnn::ProfileResult;
 
-  std::vector<int32> strides(4, 1);
-  std::vector<int32> dilations(4, 1);
+  std::vector<int32_t> strides(4, 1);
+  std::vector<int32_t> dilations(4, 1);
   auto input_h = GetTensorDimIndex(data_format, 'H');
   auto input_w = GetTensorDimIndex(data_format, 'W');
   strides[input_h] = row_stride;
@@ -144,10 +144,10 @@ void LaunchConv2DBackpropInputOpGpuImpl(
       dims.spatial_dims[0].stride == 1 && dims.spatial_dims[1].stride == 1 &&
       data_format == FORMAT_NHWC && (padding == VALID || padding == SAME)) {
     // 1x1 filter, so call cublas directly.
-    const uint64 m = dims.batch_size * dims.spatial_dims[0].input_size *
-                     dims.spatial_dims[1].input_size;
-    const uint64 k = dims.out_depth;
-    const uint64 n = dims.in_depth;
+    const uint64_t m = dims.batch_size * dims.spatial_dims[0].input_size *
+                       dims.spatial_dims[1].input_size;
+    const uint64_t k = dims.out_depth;
+    const uint64_t n = dims.in_depth;
 
     auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                 out_backprop.template flat<T>().size());
@@ -172,10 +172,10 @@ void LaunchConv2DBackpropInputOpGpuImpl(
              data_format == FORMAT_NHWC) {
     // The input data and filter have the same height/width, and we are not
     // using grouped convolution, so call cublas directly.
-    const uint64 m = dims.batch_size;
-    const uint64 k = dims.out_depth;
-    const uint64 n = dims.spatial_dims[0].input_size *
-                     dims.spatial_dims[1].input_size * dims.in_depth;
+    const uint64_t m = dims.batch_size;
+    const uint64_t k = dims.out_depth;
+    const uint64_t n = dims.spatial_dims[0].input_size *
+                       dims.spatial_dims[1].input_size * dims.in_depth;
 
     auto a_ptr = AsDeviceMemory(out_backprop.template flat<T>().data(),
                                 out_backprop.template flat<T>().size());
@@ -279,7 +279,8 @@ void LaunchConv2DBackpropInputOpGpuImpl(
   //   (2) NHWC -> OHWI
 
   Tensor transformed_filter;
-  const auto transform_filter = [&](FilterTensorFormat dst_format) -> Status {
+  const auto transform_filter =
+      [&](FilterTensorFormat dst_format) -> absl::Status {
     VLOG(4) << "Transform filter tensor from " << ToString(FORMAT_HWIO)
             << " to " << ToString(dst_format);
 
@@ -297,7 +298,7 @@ void LaunchConv2DBackpropInputOpGpuImpl(
         To32Bit(filter.tensor<T, 4>()),
         To32Bit(transformed_filter.tensor<T, 4>()));
 
-    return OkStatus();
+    return absl::OkStatus();
   };
 
   if (compute_data_format == FORMAT_NCHW) {
@@ -391,7 +392,7 @@ void LaunchConv2DBackpropInputOpGpuImpl(
   auto autotune_entry = std::move(entry_or).value();
 
   DnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize, ctx);
-  Status cudnn_launch_status =
+  absl::Status cudnn_launch_status =
       LaunchAutotunedConv(autotune_entry, &scratch_allocator,
                           se::dnn::ConvolutionKind::BACKWARD_DATA, stream,
                           input_desc, in_backprop_ptr, filter_desc, filter_ptr,
@@ -531,23 +532,23 @@ DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 
 template <>
-void SpatialConvolutionBackwardInputFunc<GPUDevice, int32>::operator()(
-    const GPUDevice&, typename TTypes<int32, 4>::Tensor,
-    typename TTypes<int32, 4>::ConstTensor,
-    typename TTypes<int32, 4>::ConstTensor, Eigen::DenseIndex,
+void SpatialConvolutionBackwardInputFunc<GPUDevice, int32_t>::operator()(
+    const GPUDevice&, typename TTypes<int32_t, 4>::Tensor,
+    typename TTypes<int32_t, 4>::ConstTensor,
+    typename TTypes<int32_t, 4>::ConstTensor, Eigen::DenseIndex,
     Eigen::DenseIndex, Eigen::DenseIndex, Eigen::DenseIndex);
 extern template struct SpatialConvolutionBackwardInputFunc<GPUDevice, int32>;
 
 template <>
 void SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
-    GPUDevice, int32>::operator()(const GPUDevice&,
-                                  typename TTypes<int32, 4>::Tensor,
-                                  typename TTypes<int32, 4>::ConstTensor,
-                                  typename TTypes<int32, 4>::ConstTensor,
-                                  Eigen::DenseIndex, Eigen::DenseIndex,
-                                  Eigen::DenseIndex, Eigen::DenseIndex,
-                                  Eigen::DenseIndex, Eigen::DenseIndex,
-                                  Eigen::DenseIndex, Eigen::DenseIndex);
+    GPUDevice, int32_t>::operator()(const GPUDevice&,
+                                    typename TTypes<int32_t, 4>::Tensor,
+                                    typename TTypes<int32_t, 4>::ConstTensor,
+                                    typename TTypes<int32_t, 4>::ConstTensor,
+                                    Eigen::DenseIndex, Eigen::DenseIndex,
+                                    Eigen::DenseIndex, Eigen::DenseIndex,
+                                    Eigen::DenseIndex, Eigen::DenseIndex,
+                                    Eigen::DenseIndex, Eigen::DenseIndex);
 extern template struct SpatialConvolutionBackwardInputWithExplicitPaddingFunc<
     GPUDevice, int32>;
 
@@ -575,9 +576,9 @@ REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                         Conv2DBackpropInputOp<GPUDevice, Eigen::bfloat16>);
 REGISTER_KERNEL_BUILDER(Name("Conv2DBackpropInput")
                             .Device(DEVICE_GPU)
-                            .TypeConstraint<int32>("T")
+                            .TypeConstraint<int32_t>("T")
                             .HostMemory("input_sizes"),
-                        Conv2DBackpropInputOp<GPUDevice, int32>);
+                        Conv2DBackpropInputOp<GPUDevice, int32_t>);
 
 // To be used inside depthwise_conv_grad_op.cc.
 // TODO(reedwm): Move this and the definition to depthwise_conv_grad_op.cc.
diff --git a/tensorflow/core/kernels/cudnn_pooling_gpu.h b/tensorflow/core/kernels/cudnn_pooling_gpu.h
index 970eb533318bb4..d344bb09da1c39 100644
--- a/tensorflow/core/kernels/cudnn_pooling_gpu.h
+++ b/tensorflow/core/kernels/cudnn_pooling_gpu.h
@@ -39,9 +39,9 @@ class DnnPooling3dOp {
  public:
   static void Compute(OpKernelContext* context,
                       se::dnn::PoolingMode pooling_mode,
-                      const std::array<int64, 3>& size,
-                      const std::array<int64, 3>& stride,
-                      const std::array<int64, 3>& padding,
+                      const std::array<int64_t, 3>& size,
+                      const std::array<int64_t, 3>& stride,
+                      const std::array<int64_t, 3>& padding,
                       TensorFormat data_format, const Tensor& tensor_in,
                       Tensor* output);
 };
@@ -53,10 +53,10 @@ class DnnPooling3dGradOp {
  public:
   static void Compute(OpKernelContext* context,
                       se::dnn::PoolingMode pooling_mode,
-                      const std::array<int64, 3>& window,
-                      const std::array<int64, 3>& stride,
-                      const std::array<int64, 3>& padding,
-                      const std::array<int64, 3>& output_size,
+                      const std::array<int64_t, 3>& window,
+                      const std::array<int64_t, 3>& stride,
+                      const std::array<int64_t, 3>& padding,
+                      const std::array<int64_t, 3>& output_size,
                       TensorFormat data_format, const Tensor& out_backprop,
                       const TensorShape& tensor_in_shape,
                       const Tensor* tensor_in, const Tensor* tensor_out,
diff --git a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
index e84885218ea1a3..f22d3bd3db7c96 100644
--- a/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_clip_gpu.cu.cc
@@ -24,11 +24,11 @@ limitations under the License.
 namespace tensorflow {
 
 template <typename T>
-__global__ void UnaryClipCustomKernel(const int32 size_in,
-                                      const T *__restrict__ in0,
-                                      const T *__restrict__ in1,
-                                      const T *__restrict__ in2,
-                                      T *__restrict__ out) {
+__global__ void UnaryClipCustomKernel(const int32_t size_in,
+                                      const T* __restrict__ in0,
+                                      const T* __restrict__ in1,
+                                      const T* __restrict__ in2,
+                                      T* __restrict__ out) {
   GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in2[0] < in0[i] ? in2[0] : in0[i];
     out[i] = value < in1[0] ? in1[0] : value;
@@ -36,11 +36,11 @@ __global__ void UnaryClipCustomKernel(const int32 size_in,
 }
 
 template <typename T>
-__global__ void BinaryRightClipCustomKernel(const int32 size_in,
-                                            const T *__restrict__ in0,
-                                            const T *__restrict__ in1,
-                                            const T *__restrict__ in2,
-                                            T *__restrict__ out) {
+__global__ void BinaryRightClipCustomKernel(const int32_t size_in,
+                                            const T* __restrict__ in0,
+                                            const T* __restrict__ in1,
+                                            const T* __restrict__ in2,
+                                            T* __restrict__ out) {
   GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in2[i] < in0[i] ? in2[i] : in0[i];
     out[i] = value < in1[0] ? in1[0] : value;
@@ -48,11 +48,11 @@ __global__ void BinaryRightClipCustomKernel(const int32 size_in,
 }
 
 template <typename T>
-__global__ void BinaryLeftClipCustomKernel(const int32 size_in,
-                                           const T *__restrict__ in0,
-                                           const T *__restrict__ in1,
-                                           const T *__restrict__ in2,
-                                           T *__restrict__ out) {
+__global__ void BinaryLeftClipCustomKernel(const int32_t size_in,
+                                           const T* __restrict__ in0,
+                                           const T* __restrict__ in1,
+                                           const T* __restrict__ in2,
+                                           T* __restrict__ out) {
   GPU_1D_KERNEL_LOOP(i, size_in) {
     T value = in2[0] < in0[i] ? in2[0] : in0[i];
     out[i] = value < in1[i] ? in1[i] : value;
diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc
index bf572572eace3d..5ef7a4008c8728 100644
--- a/tensorflow/core/kernels/cwise_op_select.cc
+++ b/tensorflow/core/kernels/cwise_op_select.cc
@@ -289,7 +289,7 @@ REGISTER_SELECT_GPU(bool);
 REGISTER_SELECT_GPU(Eigen::half);
 REGISTER_SELECT_GPU(float);
 REGISTER_SELECT_GPU(double);
-REGISTER_SELECT_GPU(int32);
+REGISTER_SELECT_GPU(int32_t);
 REGISTER_SELECT_GPU(int64_t);
 REGISTER_SELECT_GPU(complex64);
 REGISTER_SELECT_GPU(complex128);
diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
index 78ca7948e55c0f..db7cf3f31f7849 100644
--- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc
+++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc
@@ -560,7 +560,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
 
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -619,7 +619,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
             "Conv2DBackpropInput: input_sizes input must be 1-dim, not ",
             input_sizes.dims()));
     TensorShape input_shape;
-    const int32* in_sizes_data = input_sizes.template flat<int32>().data();
+    const int32_t* in_sizes_data = input_sizes.template flat<int32_t>().data();
 
     for (int i = 0; i < input_sizes.NumElements(); ++i) {
       OP_REQUIRES(context, in_sizes_data[i] >= 0,
@@ -695,7 +695,7 @@ class DepthwiseConv2dNativeBackpropInputOp : public OpKernel {
   bool use_cudnn_grouped_conv_;
 
  private:
-  std::vector<int32> strides_;
+  std::vector<int32_t> strides_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
@@ -1071,7 +1071,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
                 errors::InvalidArgument("Sliding window strides field must "
                                         "specify 4 dimensions"));
 
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -1129,7 +1129,8 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
             "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ",
             filter_sizes.dims()));
     TensorShape filter_shape;
-    const int32* filter_sizes_data = filter_sizes.template flat<int32>().data();
+    const int32_t* filter_sizes_data =
+        filter_sizes.template flat<int32_t>().data();
     for (int i = 0; i < filter_sizes.NumElements(); ++i) {
       OP_REQUIRES(context, filter_sizes_data[i] >= 0,
                   errors::InvalidArgument("Dimension ", i,
@@ -1249,7 +1250,7 @@ class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel {
   bool use_cudnn_grouped_conv_;
 
  private:
-  std::vector<int32> strides_;
+  std::vector<int32_t> strides_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;

From 6a870779287b1787e19a6dfb615890021bc82e43 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:47:10 -0800
Subject: [PATCH 702/753] Automated Code Change

PiperOrigin-RevId: 848079114
---
 tensorflow/core/kernels/maxpooling_op.cc      | 48 ++++++++--------
 tensorflow/core/kernels/pooling_ops_3d.cc     | 28 +++++-----
 tensorflow/core/kernels/pooling_ops_common.cc | 55 +++++++++---------
 tensorflow/core/kernels/pooling_ops_common.h  |  4 +-
 tensorflow/core/kernels/random_op.h           |  4 +-
 .../core/kernels/regex_full_match_op.cc       |  6 +-
 .../core/kernels/regex_replace_op_test.cc     |  9 +--
 tensorflow/core/kernels/reshape_op.h          |  6 +-
 tensorflow/core/kernels/resource_ops_test.cc  |  4 +-
 .../core/kernels/resource_variable_ops.h      |  6 +-
 tensorflow/core/kernels/restore_op_test.cc    | 47 ++++++++--------
 tensorflow/core/kernels/reverse_op_test.cc    | 56 +++++++++----------
 .../core/kernels/reverse_sequence_op.cc       |  8 +--
 tensorflow/core/kernels/reverse_sequence_op.h |  4 +-
 14 files changed, 143 insertions(+), 142 deletions(-)

diff --git a/tensorflow/core/kernels/maxpooling_op.cc b/tensorflow/core/kernels/maxpooling_op.cc
index c20e9a957be25d..a9de19492d1aff 100644
--- a/tensorflow/core/kernels/maxpooling_op.cc
+++ b/tensorflow/core/kernels/maxpooling_op.cc
@@ -366,7 +366,7 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
   typedef Eigen::GpuDevice Device;
 
   explicit MaxPoolingGradOp(OpKernelConstruction* context) : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -412,16 +412,16 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
 
     TensorShape output_shape = tensor_in.shape();
 
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
     if (context->num_inputs() == 5) {
       const Tensor& tensor_ksize = context->input(3);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(4);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -452,8 +452,8 @@ class MaxPoolingGradOp<Eigen::GpuDevice, T> : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
@@ -698,7 +698,7 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
 
   explicit MaxPoolingGradGradOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -739,16 +739,16 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
     OP_REQUIRES_OK(context,
                    context->allocate_output(0, tensor_out.shape(), &output));
 
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
     if (context->num_inputs() == 5) {
       const Tensor& tensor_ksize = context->input(3);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(4);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -798,8 +798,8 @@ class MaxPoolingGradGradOp<Eigen::GpuDevice, T> : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   TensorFormat data_format_;
   bool use_dnn_;
@@ -1270,7 +1270,7 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
   typedef GPUDevice Device;
   explicit MaxPoolingNoMaskOp(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -1372,8 +1372,8 @@ class MaxPoolingNoMaskOp<GPUDevice, T> : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
@@ -1386,7 +1386,7 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
   typedef GPUDevice Device;
   explicit MaxPoolingNoMaskV2Op(OpKernelConstruction* context)
       : OpKernel(context) {
-    string data_format;
+    std::string data_format;
     OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
     OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                 errors::InvalidArgument("Invalid data format"));
@@ -1413,17 +1413,17 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
   void Compute(OpKernelContext* context) override {
     const Tensor& tensor_in = context->input(0);
 
-    std::vector<int32> ksize = ksize_;
-    std::vector<int32> stride = stride_;
+    std::vector<int32_t> ksize = ksize_;
+    std::vector<int32_t> stride = stride_;
 
     if (context->num_inputs() != 1) {
       const Tensor& tensor_ksize = context->input(1);
-      auto value_ksize = tensor_ksize.flat<int32>();
+      auto value_ksize = tensor_ksize.flat<int32_t>();
       ksize.resize(tensor_ksize.shape().num_elements());
       std::copy_n(&value_ksize(0), ksize.size(), ksize.begin());
 
       const Tensor& tensor_stride = context->input(2);
-      auto value_stride = tensor_stride.flat<int32>();
+      auto value_stride = tensor_stride.flat<int32_t>();
       stride.resize(tensor_stride.shape().num_elements());
       std::copy_n(&value_stride(0), stride.size(), stride.begin());
     }
@@ -1471,8 +1471,8 @@ class MaxPoolingNoMaskV2Op<GPUDevice, T> : public OpKernel {
   }
 
  private:
-  std::vector<int32> ksize_;
-  std::vector<int32> stride_;
+  std::vector<int32_t> ksize_;
+  std::vector<int32_t> stride_;
   Padding padding_;
   std::vector<int64_t> explicit_paddings_;
   TensorFormat data_format_;
diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc
index 42e00c52a8c814..a63a176032f953 100644
--- a/tensorflow/core/kernels/pooling_ops_3d.cc
+++ b/tensorflow/core/kernels/pooling_ops_3d.cc
@@ -816,9 +816,9 @@ TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
 template <typename T>
 struct LaunchPoolingOp<GPUDevice, T, AVG> {
   static void launch(OpKernelContext* context, const Tensor& tensor_in,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
     DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kAverage, window,
@@ -829,9 +829,9 @@ struct LaunchPoolingOp<GPUDevice, T, AVG> {
 template <typename T>
 struct LaunchPoolingOp<GPUDevice, T, MAX> {
   static void launch(OpKernelContext* context, const Tensor& tensor_in,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Padding padding_type,
                      Tensor* output) {
     DnnPooling3dOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum, window,
@@ -843,10 +843,10 @@ template <typename T>
 struct LaunchMaxPooling3dGradOp<GPUDevice, T> {
   static void launch(OpKernelContext* context, const Tensor& tensor_in,
                      const Tensor& tensor_out, const Tensor& out_backprop,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& out,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& out,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Tensor* input_backprop) {
     const TensorShape output_shape = tensor_in.shape();
     DnnPooling3dGradOp<T>::Compute(context, se::dnn::PoolingMode::kMaximum,
@@ -861,10 +861,10 @@ struct LaunchAvgPooling3dGradOp<GPUDevice, T> {
   static void launch(OpKernelContext* context,
                      const TensorShape& tensor_in_shape,
                      const Tensor& out_backprop,
-                     const std::array<int64, 3>& window,
-                     const std::array<int64, 3>& stride,
-                     const std::array<int64, 3>& out,
-                     const std::array<int64, 3>& padding,
+                     const std::array<int64_t, 3>& window,
+                     const std::array<int64_t, 3>& stride,
+                     const std::array<int64_t, 3>& out,
+                     const std::array<int64_t, 3>& padding,
                      TensorFormat data_format, Tensor* output) {
     DnnPooling3dGradOp<T>::Compute(
         context, se::dnn::PoolingMode::kAverage, window, stride, padding, out,
diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc
index ac0cd5df525b90..24ed53d027442e 100644
--- a/tensorflow/core/kernels/pooling_ops_common.cc
+++ b/tensorflow/core/kernels/pooling_ops_common.cc
@@ -56,27 +56,28 @@ struct RawType<qint8> {
 
 template <typename T>
 struct PadInputWithNegativeInf {
-  Status operator()(const GPUDevice& d,
-                    typename TTypes<T, 4, int>::ConstTensor in,
-                    int input_pad_top, int input_pad_bottom, int input_pad_left,
-                    int input_pad_right, typename TTypes<T, 4, int>::Tensor out,
-                    TensorFormat format) {
+  absl::Status operator()(const GPUDevice& d,
+                          typename TTypes<T, 4, int>::ConstTensor in,
+                          int input_pad_top, int input_pad_bottom,
+                          int input_pad_left, int input_pad_right,
+                          typename TTypes<T, 4, int>::Tensor out,
+                          TensorFormat format) {
     T padding_value = -std::numeric_limits<T>::infinity();
     functor::PadInput<GPUDevice, T, int, 4>()(
         d, in, {{input_pad_top, input_pad_left}},
         {{input_pad_bottom, input_pad_right}}, out, format, padding_value);
-    return OkStatus();
+    return absl::OkStatus();
   }
 };
 
 template <>
 struct PadInputWithNegativeInf<qint8> {
-  Status operator()(const GPUDevice& d,
-                    typename TTypes<qint8, 4, int>::ConstTensor in,
-                    int input_pad_top, int input_pad_bottom, int input_pad_left,
-                    int input_pad_right,
-                    typename TTypes<qint8, 4, int>::Tensor out,
-                    TensorFormat format) {
+  absl::Status operator()(const GPUDevice& d,
+                          typename TTypes<qint8, 4, int>::ConstTensor in,
+                          int input_pad_top, int input_pad_bottom,
+                          int input_pad_left, int input_pad_right,
+                          typename TTypes<qint8, 4, int>::Tensor out,
+                          TensorFormat format) {
     return errors::InvalidArgument(
         "Explicit padding not yet supported with qint8");
   }
@@ -227,8 +228,8 @@ absl::Status PoolParameters::forward_output_shape(TensorShape* shape) {
 
 template <typename T>
 void DnnPoolingImpl(OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
-                    const std::vector<int32>& size,
-                    const std::vector<int32>& stride, Padding padding,
+                    const std::vector<int32_t>& size,
+                    const std::vector<int32_t>& stride, Padding padding,
                     std::vector<int64_t> explicit_paddings,
                     TensorFormat data_format, const Tensor& tensor_in,
                     const TensorShape& tensor_out_shape, bool propagate_nans,
@@ -438,14 +439,12 @@ void DnnPoolingImpl(OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
 }
 
 template <typename T>
-void DnnPoolingOp<T>::Compute(OpKernelContext* context,
-                              se::dnn::PoolingMode pooling_mode,
-                              const std::vector<int32>& size,
-                              const std::vector<int32>& stride, Padding padding,
-                              std::vector<int64_t> explicit_paddings,
-                              TensorFormat data_format, const Tensor& tensor_in,
-                              const TensorShape& tensor_out_shape,
-                              bool propagate_nans) {
+void DnnPoolingOp<T>::Compute(
+    OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
+    const std::vector<int32_t>& size, const std::vector<int32_t>& stride,
+    Padding padding, std::vector<int64_t> explicit_paddings,
+    TensorFormat data_format, const Tensor& tensor_in,
+    const TensorShape& tensor_out_shape, bool propagate_nans) {
   Tensor* tensor_out = nullptr;
   OP_REQUIRES_OK(context,
                  context->allocate_output(0, tensor_out_shape, &tensor_out));
@@ -457,7 +456,7 @@ void DnnPoolingOp<T>::Compute(OpKernelContext* context,
 template <>
 void DnnPoolingOp<Eigen::bfloat16>::Compute(
     OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
-    const std::vector<int32>& size, const std::vector<int32>& stride,
+    const std::vector<int32_t>& size, const std::vector<int32_t>& stride,
     Padding padding, std::vector<int64_t> explicit_paddings,
     TensorFormat data_format, const Tensor& tensor_in,
     const TensorShape& tensor_out_shape, bool propagate_nans) {
@@ -511,14 +510,14 @@ DECLARE_GPU_SPEC(float);
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(Eigen::bfloat16);
 DECLARE_GPU_SPEC(double);
-DECLARE_GPU_SPEC(int32);
+DECLARE_GPU_SPEC(int32_t);
 }  // namespace functor
 
 template <typename T>
 void DnnPoolingGradImpl(OpKernelContext* context,
                         se::dnn::PoolingMode pooling_mode,
-                        const std::vector<int32>& size,
-                        const std::vector<int32>& stride, Padding padding,
+                        const std::vector<int32_t>& size,
+                        const std::vector<int32_t>& stride, Padding padding,
                         std::vector<int64_t> explicit_paddings,
                         TensorFormat data_format, const Tensor* tensor_in,
                         const Tensor* tensor_out, const Tensor& out_backprop,
@@ -856,7 +855,7 @@ void DnnPoolingGradImpl(OpKernelContext* context,
 template <typename T>
 void DnnPoolingGradOp<T>::Compute(
     OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
-    const std::vector<int32>& size, const std::vector<int32>& stride,
+    const std::vector<int32_t>& size, const std::vector<int32_t>& stride,
     Padding padding, std::vector<int64_t> explicit_paddings,
     TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out,
     const Tensor& out_backprop, const TensorShape& tensor_in_shape,
@@ -873,7 +872,7 @@ void DnnPoolingGradOp<T>::Compute(
 template <>
 void DnnPoolingGradOp<Eigen::bfloat16>::Compute(
     OpKernelContext* context, se::dnn::PoolingMode pooling_mode,
-    const std::vector<int32>& size, const std::vector<int32>& stride,
+    const std::vector<int32_t>& size, const std::vector<int32_t>& stride,
     Padding padding, std::vector<int64_t> explicit_paddings,
     TensorFormat data_format, const Tensor* tensor_in, const Tensor* tensor_out,
     const Tensor& out_backprop, const TensorShape& tensor_in_shape,
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index 71cddc32bbb3c5..cced70b25d4a39 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -314,12 +314,12 @@ struct LaunchMaxPoolingNoMask_NCHW_VECT_C<Eigen::GpuDevice> {
                      const Tensor& input, Tensor* output) {
 #if GOOGLE_CUDA
     bool status = functor::MaxPoolForwardNoMask_NCHW_VECT_C()(
-        reinterpret_cast<const int32*>(input.flat<qint8>().data()),
+        reinterpret_cast<const int32_t*>(input.flat<qint8>().data()),
         params.tensor_in_batch, params.tensor_in_rows, params.tensor_in_cols,
         params.depth, params.out_height, params.out_width, params.window_rows,
         params.window_cols, params.row_stride, params.col_stride,
         params.pad_top, params.pad_left,
-        reinterpret_cast<int32*>(output->flat<qint8>().data()),
+        reinterpret_cast<int32_t*>(output->flat<qint8>().data()),
         context->eigen_gpu_device());
     if (!status) {
       context->SetStatus(errors::Internal(
diff --git a/tensorflow/core/kernels/random_op.h b/tensorflow/core/kernels/random_op.h
index 1d6299802f21c7..cef648707d3422 100644
--- a/tensorflow/core/kernels/random_op.h
+++ b/tensorflow/core/kernels/random_op.h
@@ -51,8 +51,8 @@ typedef Eigen::GpuDevice GPUDevice;
 // Declares the partially GPU-specialized functor struct.
 template <class Distribution>
 struct FillPhiloxRandom<GPUDevice, Distribution> {
-  void operator()(OpKernelContext* ctx, const GPUDevice& d, const uint64* key,
-                  const uint64* counter, random::PhiloxRandom gen,
+  void operator()(OpKernelContext* ctx, const GPUDevice& d, const uint64_t* key,
+                  const uint64_t* counter, random::PhiloxRandom gen,
                   typename Distribution::ResultElementType* data, int64_t size,
                   Distribution dist);
 };
diff --git a/tensorflow/core/kernels/regex_full_match_op.cc b/tensorflow/core/kernels/regex_full_match_op.cc
index ddcc165cf5fd18..23be3bd76534fd 100644
--- a/tensorflow/core/kernels/regex_full_match_op.cc
+++ b/tensorflow/core/kernels/regex_full_match_op.cc
@@ -41,7 +41,7 @@ class RegexFullMatchOp : public OpKernel {
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(pattern_tensor->shape()),
                 errors::InvalidArgument("Pattern must be scalar, but received ",
                                         pattern_tensor->shape().DebugString()));
-    const string pattern = pattern_tensor->flat<tstring>()(0);
+    const std::string pattern = pattern_tensor->flat<tstring>()(0);
     std::shared_ptr<RE2> regex = CachedRE2(pattern);
     OP_REQUIRES(ctx, regex->ok(),
                 errors::InvalidArgument("Invalid pattern: ", pattern,
@@ -57,7 +57,7 @@ class RegexFullMatchOp : public OpKernel {
   }
 
  private:
-  std::shared_ptr<RE2> CachedRE2(const string& pattern) {
+  std::shared_ptr<RE2> CachedRE2(const std::string& pattern) {
     {
       tf_shared_lock l(mu_);
       if (regex_ != nullptr && regex_->pattern() == pattern) {
@@ -88,7 +88,7 @@ REGISTER_KERNEL_BUILDER(Name("RegexFullMatch").Device(DEVICE_CPU),
 class StaticRegexFullMatchOp : public OpKernel {
  public:
   explicit StaticRegexFullMatchOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    string pattern;
+    std::string pattern;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("pattern", &pattern));
     re_ = std::make_unique<RE2>(pattern);
     OP_REQUIRES(ctx, re_->ok(),
diff --git a/tensorflow/core/kernels/regex_replace_op_test.cc b/tensorflow/core/kernels/regex_replace_op_test.cc
index 73979d41222f3b..41ee85d7e4b02b 100644
--- a/tensorflow/core/kernels/regex_replace_op_test.cc
+++ b/tensorflow/core/kernels/regex_replace_op_test.cc
@@ -67,8 +67,9 @@ Tensor GetTestTensor(int batch) {
   return t;
 }
 
-Graph* SetupRegexReplaceGraph(const Tensor& input, const string& input_pattern,
-                              const string& input_rewrite) {
+Graph* SetupRegexReplaceGraph(const Tensor& input,
+                              const std::string& input_pattern,
+                              const std::string& input_rewrite) {
   Graph* g = new Graph(OpRegistry::Global());
   Tensor pattern(DT_STRING, TensorShape({}));
   pattern.flat<tstring>().setConstant(input_pattern);
@@ -103,8 +104,8 @@ BENCHMARK(BM_RegexReplace)
     ->Arg(128)
     ->Arg(256);
 
-Graph* SetupStaticGraph(const Tensor& input, const string& input_pattern,
-                        const string& rewrite) {
+Graph* SetupStaticGraph(const Tensor& input, const std::string& input_pattern,
+                        const std::string& rewrite) {
   Graph* g = new Graph(OpRegistry::Global());
 
   TF_CHECK_OK(NodeBuilder("static_regex_replace_op", "StaticRegexReplace")
diff --git a/tensorflow/core/kernels/reshape_op.h b/tensorflow/core/kernels/reshape_op.h
index 554c142b9db87a..127381c00034f7 100644
--- a/tensorflow/core/kernels/reshape_op.h
+++ b/tensorflow/core/kernels/reshape_op.h
@@ -61,8 +61,8 @@ class ReshapeOp : public OpKernel {
     switch (sizes.dtype()) {
       case DT_INT32:
         OP_REQUIRES_OK(context,
-                       ValidateSizes<int32>(sizes, &product, &unknown_index,
-                                            &shape, &sizes_has_zero_dim));
+                       ValidateSizes<int32_t>(sizes, &product, &unknown_index,
+                                              &shape, &sizes_has_zero_dim));
         break;
       case DT_INT64:
         OP_REQUIRES_OK(context,
@@ -145,7 +145,7 @@ class ReshapeOp : public OpKernel {
         *has_zero_dim = true;
       } else {
         if (MultiplyWithoutOverflow(shape->num_elements(), size) < 0) {
-          string msg;
+          std::string msg;
           for (int ii = 0; ii < num_dims; ++ii) {
             if (ii != 0) {
               absl::StrAppend(&msg, ", ");
diff --git a/tensorflow/core/kernels/resource_ops_test.cc b/tensorflow/core/kernels/resource_ops_test.cc
index ffc2815d4201d3..43df25dc056eb5 100644
--- a/tensorflow/core/kernels/resource_ops_test.cc
+++ b/tensorflow/core/kernels/resource_ops_test.cc
@@ -42,7 +42,7 @@ class MockResource : public ResourceBase {
       *alive_ = false;
     }
   }
-  string DebugString() const override { return ""; }
+  std::string DebugString() const override { return ""; }
   bool* alive_;
   int payload_;
 };
@@ -103,7 +103,7 @@ TEST_F(MockHandleCreationOpTest, RefCounting) {
   // Feed and run
   AddInputFromArray<int64_t>(TensorShape({}),
                              {reinterpret_cast<int64_t>(&alive)});
-  AddInputFromArray<int32>(TensorShape({}), {payload});
+  AddInputFromArray<int32_t>(TensorShape({}), {payload});
   TF_ASSERT_OK(RunOpKernel());
   EXPECT_TRUE(alive);
 
diff --git a/tensorflow/core/kernels/resource_variable_ops.h b/tensorflow/core/kernels/resource_variable_ops.h
index 1c8d79988a2457..53a52e6cda4303 100644
--- a/tensorflow/core/kernels/resource_variable_ops.h
+++ b/tensorflow/core/kernels/resource_variable_ops.h
@@ -32,9 +32,9 @@ class VarHandleOp : public OpKernel {
  private:
   // Same fields as in ResourceHandleOp.
   bool is_anonymous_;
-  string container_;
-  string name_;
-  string debug_name_;
+  std::string container_;
+  std::string name_;
+  std::string debug_name_;
   Tensor const_tensor_;
 
   DtypeAndPartialTensorShape dtype_and_shape_;
diff --git a/tensorflow/core/kernels/restore_op_test.cc b/tensorflow/core/kernels/restore_op_test.cc
index 15dacaf6d93c45..16bfd01ab4f335 100644
--- a/tensorflow/core/kernels/restore_op_test.cc
+++ b/tensorflow/core/kernels/restore_op_test.cc
@@ -62,8 +62,8 @@ Tensor MakeInput(const TensorShape& shape,
 }
 
 TEST_F(RestoreOpTest, RestoreSimple) {
-  const string filename = io::JoinPath(testing::TmpDir(), "tensor_simple");
-  const std::vector<string> tensor_names = {
+  const std::string filename = io::JoinPath(testing::TmpDir(), "tensor_simple");
+  const std::vector<std::string> tensor_names = {
       "tensor_bool",  "tensor_int",        "tensor_float",  "tensor_double",
       "tensor_qint8", "tensor_qint32",     "tensor_uint8",  "tensor_int8",
       "tensor_int16", "tensor_int64",      "tensor_string", "tensor_complex64",
@@ -103,7 +103,7 @@ TEST_F(RestoreOpTest, RestoreSimple) {
     // Input #1 is the tensor names
     Tensor input_1 = MakeInput<tstring>(
         TensorShape({static_cast<int>(tensor_names.size())}),
-        [&tensor_names](int x) -> string { return tensor_names[x]; });
+        [&tensor_names](int x) -> std::string { return tensor_names[x]; });
     inputs.push_back({nullptr, &input_1});
 
     // Input #2 is a 1-d bool tensor
@@ -111,8 +111,8 @@ TEST_F(RestoreOpTest, RestoreSimple) {
         MakeInput<bool>(TensorShape({2}), [](int x) -> bool { return x != 0; });
     inputs.push_back({nullptr, &input_2});
     // Input #3 is a 1-d integer tensor
-    Tensor input_3 = MakeInput<int32>(TensorShape({10}),
-                                      [](int x) -> int32 { return x + 1; });
+    Tensor input_3 = MakeInput<int32_t>(TensorShape({10}),
+                                        [](int x) -> int32_t { return x + 1; });
     inputs.push_back({nullptr, &input_3});
     // Input #4 is a 2-d float tensor
     Tensor input_4 = MakeInput<float>(TensorShape({2, 4}), [](int x) -> float {
@@ -136,24 +136,25 @@ TEST_F(RestoreOpTest, RestoreSimple) {
         });
     inputs.push_back({nullptr, &input_7});
     // Input #8 is a 1-d uint8 tensor
-    Tensor input_8 = MakeInput<uint8>(TensorShape({11}),
-                                      [](int x) -> uint8 { return x + 1; });
+    Tensor input_8 = MakeInput<uint8_t>(TensorShape({11}),
+                                        [](int x) -> uint8_t { return x + 1; });
     inputs.push_back({nullptr, &input_8});
     // Input #9 is a 1-d int8 tensor
-    Tensor input_9 =
-        MakeInput<int8>(TensorShape({7}), [](int x) -> int8 { return x - 7; });
+    Tensor input_9 = MakeInput<int8_t>(TensorShape({7}),
+                                       [](int x) -> int8_t { return x - 7; });
     inputs.push_back({nullptr, &input_9});
     // Input #10 is a 1-d int16 tensor
-    Tensor input_10 = MakeInput<int16>(TensorShape({7}),
-                                       [](int x) -> int16 { return x - 8; });
+    Tensor input_10 = MakeInput<int16_t>(
+        TensorShape({7}), [](int x) -> int16_t { return x - 8; });
     inputs.push_back({nullptr, &input_10});
     // Input #11 is a 1-d int64 tensor
-    Tensor input_11 = MakeInput<int64_t>(TensorShape({9}),
-                                         [](int x) -> int64 { return x - 9; });
+    Tensor input_11 = MakeInput<int64_t>(
+        TensorShape({9}), [](int x) -> int64_t { return x - 9; });
     inputs.push_back({nullptr, &input_11});
     // Input #12 is a 1-d string tensor
     Tensor input_12 = MakeInput<tstring>(
-        TensorShape({2}), [](int x) -> string { return x ? "yes" : "no"; });
+        TensorShape({2}),
+        [](int x) -> std::string { return x ? "yes" : "no"; });
     inputs.push_back({nullptr, &input_12});
     // Input #13 is a 1-d complex64 tensor
     Tensor input_13 = MakeInput<complex64>(
@@ -212,7 +213,7 @@ TEST_F(RestoreOpTest, RestoreSimple) {
     TensorShape expected({10});
     EXPECT_TRUE(output->shape().IsSameSize(expected));
     for (int i = 0; i < 10; ++i) {
-      EXPECT_EQ(i + 1, output->flat<int32>()(i));
+      EXPECT_EQ(i + 1, output->flat<int32_t>()(i));
     }
   }
   // The 2-d float tensor
@@ -273,7 +274,7 @@ TEST_F(RestoreOpTest, RestoreSimple) {
     TensorShape expected({11});
     EXPECT_TRUE(output->shape().IsSameSize(expected));
     for (int i = 0; i < 11; ++i) {
-      EXPECT_EQ(i + 1, output->flat<uint8>()(i));
+      EXPECT_EQ(i + 1, output->flat<uint8_t>()(i));
     }
   }
   // The 1-d int8 tensor
@@ -285,7 +286,7 @@ TEST_F(RestoreOpTest, RestoreSimple) {
     TensorShape expected({7});
     EXPECT_TRUE(output->shape().IsSameSize(expected));
     for (int i = 0; i < 7; ++i) {
-      EXPECT_EQ(i - 7, output->flat<int8>()(i));
+      EXPECT_EQ(i - 7, output->flat<int8_t>()(i));
     }
   }
   // The 1-d int16 tensor
@@ -297,7 +298,7 @@ TEST_F(RestoreOpTest, RestoreSimple) {
     TensorShape expected({7});
     EXPECT_TRUE(output->shape().IsSameSize(expected));
     for (int i = 0; i < 7; ++i) {
-      EXPECT_EQ(i - 8, output->flat<int16>()(i));
+      EXPECT_EQ(i - 8, output->flat<int16_t>()(i));
     }
   }
   // The 1-d int64 tensor
@@ -373,8 +374,8 @@ class RestoreSliceOpTest : public OpsTestBase {
 };
 
 TEST_F(RestoreSliceOpTest, RestoreInt) {
-  const string filename = io::JoinPath(testing::TmpDir(), "tensor_int");
-  const string tensor_name = "tensor_int";
+  const std::string filename = io::JoinPath(testing::TmpDir(), "tensor_int");
+  const std::string tensor_name = "tensor_int";
 
   // We first need to write a tensor using the save_op
   {
@@ -412,7 +413,7 @@ TEST_F(RestoreSliceOpTest, RestoreInt) {
     // Input #2 is a 4x16 integer tensor.
     Tensor input_2(DT_INT32, TensorShape({4, 16}));
     for (int64_t i = 0; i < input_2.NumElements(); ++i) {
-      input_2.flat<int32>()(i) = i + 1;
+      input_2.flat<int32_t>()(i) = i + 1;
     }
     inputs.push_back({nullptr, &input_2});
 
@@ -433,7 +434,7 @@ TEST_F(RestoreSliceOpTest, RestoreInt) {
 
   // Now we restore
   MakeRestoreSliceOp(DT_INT32);
-  string shape_and_slice = "4 16 0,2:-";
+  std::string shape_and_slice = "4 16 0,2:-";
   // Add a file name
   AddInput<tstring>(TensorShape({}),
                     [&filename](int x) -> tstring { return filename; });
@@ -452,7 +453,7 @@ TEST_F(RestoreSliceOpTest, RestoreInt) {
   TensorShape expected({2, 16});
   EXPECT_TRUE(output->shape().IsSameSize(expected));
   for (int64_t i = 0; i < expected.num_elements(); ++i) {
-    EXPECT_EQ(i + 1, output->flat<int32>()(i));
+    EXPECT_EQ(i + 1, output->flat<int32_t>()(i));
   }
 }
 
diff --git a/tensorflow/core/kernels/reverse_op_test.cc b/tensorflow/core/kernels/reverse_op_test.cc
index 09606abc6c61e6..632a5136db8280 100644
--- a/tensorflow/core/kernels/reverse_op_test.cc
+++ b/tensorflow/core/kernels/reverse_op_test.cc
@@ -115,17 +115,17 @@ class ReverseOpTest : public OpsTestBase {
   }
 };
 
-TEST_F(ReverseOpTest, Reverse_0_uint8) { Reverse_0<uint8>(); }
+TEST_F(ReverseOpTest, Reverse_0_uint8) { Reverse_0<uint8_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_0_int8) { Reverse_0<int8>(); }
+TEST_F(ReverseOpTest, Reverse_0_int8) { Reverse_0<int8_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_0_uint16) { Reverse_0<uint16>(); }
+TEST_F(ReverseOpTest, Reverse_0_uint16) { Reverse_0<uint16_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_0_int16) { Reverse_0<int16>(); }
+TEST_F(ReverseOpTest, Reverse_0_int16) { Reverse_0<int16_t>(); }
 
 TEST_F(ReverseOpTest, Reverse_0_float) { Reverse_0<float>(); }
 
-TEST_F(ReverseOpTest, Reverse_0_int32) { Reverse_0<int32>(); }
+TEST_F(ReverseOpTest, Reverse_0_int32) { Reverse_0<int32_t>(); }
 
 TEST_F(ReverseOpTest, Reverse_0_int64) { Reverse_0<int64_t>(); }
 
@@ -135,17 +135,17 @@ TEST_F(ReverseOpTest, Reverse_0_complex64) { Reverse_0<complex64>(); }
 
 TEST_F(ReverseOpTest, Reverse_0_complex128) { Reverse_0<complex128>(); }
 
-TEST_F(ReverseOpTest, Reverse_234_uint8) { Reverse_234<uint8>(); }
+TEST_F(ReverseOpTest, Reverse_234_uint8) { Reverse_234<uint8_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_234_int8) { Reverse_234<int8>(); }
+TEST_F(ReverseOpTest, Reverse_234_int8) { Reverse_234<int8_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_234_uint16) { Reverse_234<uint16>(); }
+TEST_F(ReverseOpTest, Reverse_234_uint16) { Reverse_234<uint16_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_234_int16) { Reverse_234<int16>(); }
+TEST_F(ReverseOpTest, Reverse_234_int16) { Reverse_234<int16_t>(); }
 
 TEST_F(ReverseOpTest, Reverse_234_float) { Reverse_234<float>(); }
 
-TEST_F(ReverseOpTest, Reverse_234_int32) { Reverse_234<int32>(); }
+TEST_F(ReverseOpTest, Reverse_234_int32) { Reverse_234<int32_t>(); }
 
 TEST_F(ReverseOpTest, Reverse_234_int64) { Reverse_234<int64_t>(); }
 
@@ -155,17 +155,17 @@ TEST_F(ReverseOpTest, Reverse_234_complex64) { Reverse_234<complex64>(); }
 
 TEST_F(ReverseOpTest, Reverse_234_complex128) { Reverse_234<complex128>(); }
 
-TEST_F(ReverseOpTest, Reverse_1234_uint8) { Reverse_1234<uint8>(); }
+TEST_F(ReverseOpTest, Reverse_1234_uint8) { Reverse_1234<uint8_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_1234_int8) { Reverse_1234<int8>(); }
+TEST_F(ReverseOpTest, Reverse_1234_int8) { Reverse_1234<int8_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_1234_uint16) { Reverse_1234<uint16>(); }
+TEST_F(ReverseOpTest, Reverse_1234_uint16) { Reverse_1234<uint16_t>(); }
 
-TEST_F(ReverseOpTest, Reverse_1234_int16) { Reverse_1234<int16>(); }
+TEST_F(ReverseOpTest, Reverse_1234_int16) { Reverse_1234<int16_t>(); }
 
 TEST_F(ReverseOpTest, Reverse_1234_float) { Reverse_1234<float>(); }
 
-TEST_F(ReverseOpTest, Reverse_1234_int32) { Reverse_1234<int32>(); }
+TEST_F(ReverseOpTest, Reverse_1234_int32) { Reverse_1234<int32_t>(); }
 
 TEST_F(ReverseOpTest, Reverse_1234_int64) { Reverse_1234<int64_t>(); }
 
@@ -190,7 +190,7 @@ static Graph* Reverse(const TensorShape& shape, int reverse_axis) {
   Tensor data(DataTypeToEnum<T>::value, shape);
   data.flat<T>().setRandom();
   Tensor axes(DT_INT32, TensorShape({1}));
-  axes.flat<int32>()(0) = reverse_axis;
+  axes.flat<int32_t>()(0) = reverse_axis;
   test::graph::Reverse(g, test::graph::Constant(g, data),
                        test::graph::Constant(g, axes));
   return g;
@@ -229,8 +229,8 @@ void BM_ReverseRowsOf1Channel_1T_uint8(::testing::benchmark::State& state) {
   const int outer_dim = state.range(0);
   const int middle_dim = state.range(1);
 
-  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
-                                 1 /* intra_threads */, 1 /* channels */);
+  RunReverseRowsBenchmark<uint8_t>(state, outer_dim, middle_dim,
+                                   1 /* intra_threads */, 1 /* channels */);
 }
 
 BENCHMARK(BM_ReverseRowsOf1Channel_1T_uint8)
@@ -257,8 +257,8 @@ void BM_ReverseRowsOf1Channel_4T_uint8(::testing::benchmark::State& state) {
   const int outer_dim = state.range(0);
   const int middle_dim = state.range(1);
 
-  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
-                                 4 /* intra_threads */, 1 /* channels */);
+  RunReverseRowsBenchmark<uint8_t>(state, outer_dim, middle_dim,
+                                   4 /* intra_threads */, 1 /* channels */);
 }
 
 BENCHMARK(BM_ReverseRowsOf1Channel_4T_uint8)
@@ -286,8 +286,8 @@ void BM_ReverseRowsOf3Channels_1T_uint8(::testing::benchmark::State& state) {
   const int outer_dim = state.range(0);
   const int middle_dim = state.range(1);
 
-  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
-                                 1 /* intra_threads */, 3 /* channels */);
+  RunReverseRowsBenchmark<uint8_t>(state, outer_dim, middle_dim,
+                                   1 /* intra_threads */, 3 /* channels */);
 }
 
 BENCHMARK(BM_ReverseRowsOf3Channels_1T_uint8)
@@ -316,8 +316,8 @@ void BM_ReverseRowsOf3Channels_4T_uint8(::testing::benchmark::State& state) {
   const int outer_dim = state.range(0);
   const int middle_dim = state.range(1);
 
-  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
-                                 4 /* intra_threads */, 3 /* channels */);
+  RunReverseRowsBenchmark<uint8_t>(state, outer_dim, middle_dim,
+                                   4 /* intra_threads */, 3 /* channels */);
 }
 BENCHMARK(BM_ReverseRowsOf3Channels_4T_uint8)
     ->UseRealTime()
@@ -344,8 +344,8 @@ void BM_ReverseRowsOf4Channels_1T_uint8(::testing::benchmark::State& state) {
   const int outer_dim = state.range(0);
   const int middle_dim = state.range(1);
 
-  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
-                                 1 /* intra_threads */, 4 /* channels */);
+  RunReverseRowsBenchmark<uint8_t>(state, outer_dim, middle_dim,
+                                   1 /* intra_threads */, 4 /* channels */);
 }
 
 BENCHMARK(BM_ReverseRowsOf4Channels_1T_uint8)
@@ -372,8 +372,8 @@ void BM_ReverseRowsOf4Channels_4T_uint8(::testing::benchmark::State& state) {
   const int outer_dim = state.range(0);
   const int middle_dim = state.range(1);
 
-  RunReverseRowsBenchmark<uint8>(state, outer_dim, middle_dim,
-                                 4 /* intra_threads */, 4 /* channels */);
+  RunReverseRowsBenchmark<uint8_t>(state, outer_dim, middle_dim,
+                                   4 /* intra_threads */, 4 /* channels */);
 }
 
 BENCHMARK(BM_ReverseRowsOf4Channels_4T_uint8)
diff --git a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc
index 139520ece5e2a0..7d33356a169ccf 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.cc
+++ b/tensorflow/core/kernels/reverse_sequence_op.cc
@@ -98,8 +98,8 @@ void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) {
 }
 
 template <>
-void CheckErrors<GPUDevice, int32>(OpKernelContext* context, int batch_dim,
-                                   int seq_dim) {
+void CheckErrors<GPUDevice, int32_t>(OpKernelContext* context, int batch_dim,
+                                     int seq_dim) {
   CheckErrorsGPU(context, batch_dim, seq_dim);
 }
 
@@ -164,8 +164,8 @@ class ReverseSequenceOp : public OpKernel {
   }
 
  private:
-  int32 batch_dim_;
-  int32 seq_dim_;
+  int32_t batch_dim_;
+  int32_t seq_dim_;
 
   ReverseSequenceOp(const ReverseSequenceOp&) = delete;
   void operator=(const ReverseSequenceOp&) = delete;
diff --git a/tensorflow/core/kernels/reverse_sequence_op.h b/tensorflow/core/kernels/reverse_sequence_op.h
index f25794f3a2ad39..7db47a4b8bbce3 100644
--- a/tensorflow/core/kernels/reverse_sequence_op.h
+++ b/tensorflow/core/kernels/reverse_sequence_op.h
@@ -49,8 +49,8 @@ class ReverseGenerator {
 
  private:
   typename TTypes<T, Dims>::ConstTensor input_;
-  int32 batch_dim_;
-  int32 seq_dim_;
+  int32_t batch_dim_;
+  int32_t seq_dim_;
   typename TTypes<Tlen>::ConstVec seq_lengths_;
 };
 

From 6ba4ee05fd279472ff5393f860d85fc089f904f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:57:08 -0800
Subject: [PATCH 703/753] Automated Code Change

PiperOrigin-RevId: 848081454
---
 .../rpc/eager/grpc_eager_client.cc                   | 12 ++++++------
 .../rpc/eager/grpc_eager_client_test.cc              |  2 +-
 .../rpc/eager/grpc_eager_service_impl.cc             |  2 +-
 .../rpc/eager/grpc_eager_service_impl.h              |  5 ++---
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index 946b245a3e8fce..154eb09ee9d5ff 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -127,7 +127,7 @@ class GrpcEagerClientThread : public core::RefCounted {
 class GrpcEagerClient : public EagerClient {
  public:
   GrpcEagerClient(const tensorflow::SharedGrpcChannelPtr& channel,
-                  GrpcEagerClientThread* thread, const string& target)
+                  GrpcEagerClientThread* thread, const std::string& target)
       : stub_(channel), thread_(thread), target_(target) {
     // Hold a reference to make sure the corresponding EagerClientThread
     // outlives the client.
@@ -266,13 +266,13 @@ class GrpcEagerClient : public EagerClient {
  private:
   ::grpc::GenericStub stub_;
   const GrpcEagerClientThread* thread_;
-  const string target_;
+  const std::string target_;
 
   ::grpc::CompletionQueue* cq_;
 
   mutable mutex mu_;
 
-  std::unordered_map<uint64, StreamingRPCDispatcher<EnqueueResponse>>
+  std::unordered_map<uint64_t, StreamingRPCDispatcher<EnqueueResponse>>
       enqueue_dispatchers_ TF_GUARDED_BY(mu_);
 
   StatusCallback callback_wrapper(StatusCallback done) {
@@ -313,7 +313,7 @@ class GrpcEagerClientCache : public EagerClientCache {
 
   ~GrpcEagerClientCache() override { threads_.clear(); }
 
-  absl::Status GetClient(const string& target,
+  absl::Status GetClient(const std::string& target,
                          core::RefCountPtr<EagerClient>* client) override {
     mutex_lock l(clients_mu_);
     auto it = clients_.find(target);
@@ -342,7 +342,7 @@ class GrpcEagerClientCache : public EagerClientCache {
       TF_GUARDED_BY(assignment_mu_);
   size_t next_round_robin_assignment_ TF_GUARDED_BY(assignment_mu_);
 
-  size_t AssignClientToThread(const string& target) {
+  size_t AssignClientToThread(const std::string& target) {
     // Round-robin target assignment, but keeps the same target on the same
     // polling thread always, as this is important for gRPC performance
     mutex_lock lock(assignment_mu_);
@@ -358,7 +358,7 @@ class GrpcEagerClientCache : public EagerClientCache {
 
   std::shared_ptr<tensorflow::GrpcChannelCache> cache_;
   mutable mutex clients_mu_;
-  std::unordered_map<string, core::RefCountPtr<EagerClient>> clients_
+  std::unordered_map<std::string, core::RefCountPtr<EagerClient>> clients_
       TF_GUARDED_BY(clients_mu_);
   std::vector<core::RefCountPtr<GrpcEagerClientThread>> threads_;
 };
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc
index 2d64e07794d41a..3a11ef95274fbc 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc
@@ -48,7 +48,7 @@ TEST(GrpcEagerClientCache, TestGetClientThreadSafety) {
 
   for (int i = 0; i < num_calls; i++) {
     Env::Default()->SchedClosure([&client_cache, i, &counter]() {
-      string target = absl::StrCat("/job:worker/replica:0/task:", i);
+      std::string target = absl::StrCat("/job:worker/replica:0/task:", i);
       core::RefCountPtr<EagerClient> eager_client;
       absl::Status s = client_cache->GetClient(target, &eager_client);
       // With 6 tasks added to the job, querying client for 0--5 should be OK,
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
index b9bea2ea437a7a..33d567c56a0a63 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
@@ -45,7 +45,7 @@ GrpcEagerServiceImpl::GrpcEagerServiceImpl(
 }
 
 absl::Status GrpcEagerServiceImpl::CreateMasterContext(
-    const tensorflow::uint64 context_id, EagerContext* context) {
+    const uint64_t context_id, EagerContext* context) {
   return local_impl_.CreateMasterContext(context_id, context);
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
index 083ad55b3f4841..62ee6e9f13a9f0 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -50,8 +50,7 @@ class GrpcEagerServiceImpl : public tsl::AsyncServiceInterface {
   virtual ~GrpcEagerServiceImpl() {}
 
   // Create a master context in eager service.
-  absl::Status CreateMasterContext(tensorflow::uint64 context_id,
-                                   EagerContext* context);
+  absl::Status CreateMasterContext(uint64_t context_id, EagerContext* context);
 
   void HandleRPCsLoop() override;
   void Shutdown() override;
@@ -136,7 +135,7 @@ class GrpcEagerServiceImpl : public tsl::AsyncServiceInterface {
       // streaming connection.
       absl::Status status = local_impl_.Enqueue(
           /*call_opts=*/nullptr, &call->request(), call->mutable_response(),
-          reinterpret_cast<uint64>(static_cast<void*>(call)));
+          reinterpret_cast<uint64_t>(static_cast<void*>(call)));
 
       if (status.ok()) {
         VLOG(1) << "local_impl_.Enqueue completed successfully";

From 6e5c77d3701af04ff5556e985d20ac3143b4595c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 01:59:33 -0800
Subject: [PATCH 704/753] Automated Code Change

PiperOrigin-RevId: 848082185
---
 third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
index 14c44cf1c53925..85f78e92444ec2 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
@@ -575,7 +575,7 @@ TfrtGpuClient::DeserializeToLocalExecutable(
   if (serialized.size() > std::numeric_limits<int>::max()) {
     return Internal("Proto is too large (>2GB)");
   }
-  if (!proto.ParseFromArray(serialized.data(), serialized.size())) {
+  if (!proto.ParseFromString(serialized)) {
     return Internal("Proto deserialization failed");
   }
   if (!proto.pjrt_client_name().empty() &&

From bc1b277391caa236590f9c969a96cd2255da965a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 02:00:37 -0800
Subject: [PATCH 705/753] Automated Code Change

PiperOrigin-RevId: 848082460
---
 .../integration_test/c_api_coordination_test.cc          | 8 ++++----
 .../integration_test/c_api_multi_client_function_test.cc | 6 +++---
 .../integration_test/c_api_multi_client_test.cc          | 2 +-
 .../coordination_test_opkernel_registration.cc           | 9 +++++----
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc
index d781cb254fa9a9..66e39b5a15ce61 100644
--- a/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc
@@ -60,7 +60,7 @@ void ConfigCoordinationService(tensorflow::ServerDef* server_def,
   coord_config->set_enable_health_check(enable_health_check);
 }
 
-string SetConfigKeyValueFn() {
+std::string SetConfigKeyValueFn() {
   FunctionDef fdef;
   tensorflow::protobuf::TextFormat::ParseFromString(
       "    signature {"
@@ -86,7 +86,7 @@ string SetConfigKeyValueFn() {
   return fdef.SerializeAsString();
 }
 
-string GetConfigKeyValueFn() {
+std::string GetConfigKeyValueFn() {
   FunctionDef fdef;
   tensorflow::protobuf::TextFormat::ParseFromString(
       "    signature {"
@@ -521,7 +521,7 @@ TEST_P(SingleClientCoordinationServiceTest, TestSetGetConfigInOp) {
   TF_DeleteTensor(t);
   TFE_DeleteOp(get_op2);
 
-  const string& set_fdef = SetConfigKeyValueFn();
+  const std::string& set_fdef = SetConfigKeyValueFn();
   TFE_ContextAddFunctionDef(ctx, set_fdef.data(), set_fdef.size(), status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_Op* set_fn = TFE_NewOp(ctx, "SetConfigKeyValueFn", status);
@@ -542,7 +542,7 @@ TEST_P(SingleClientCoordinationServiceTest, TestSetGetConfigInOp) {
   TFE_DeleteTensorHandle(set_val);
   TFE_DeleteOp(set_fn);
 
-  const string& get_fdef = GetConfigKeyValueFn();
+  const std::string& get_fdef = GetConfigKeyValueFn();
   TFE_ContextAddFunctionDef(ctx, get_fdef.data(), get_fdef.size(), status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_Op* get_fn = TFE_NewOp(ctx, "GetConfigKeyValueFn", status);
diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc
index 7d767e9a8ce42a..73db4a0bb22cee 100644
--- a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc
@@ -39,7 +39,7 @@ namespace {
 
 std::string SendFunction(const std::string& send_device,
                          const std::string& recv_device,
-                         const tensorflow::int64 send_device_incarnation) {
+                         const int64_t send_device_incarnation) {
   tensorflow::FunctionDef def;
   CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
       absl::StrCat("    signature {"
@@ -100,7 +100,7 @@ std::string SendFunction(const std::string& send_device,
 
 std::string RecvFunction(const std::string& send_device,
                          const std::string& recv_device,
-                         const tensorflow::int64 send_device_incarnation) {
+                         const int64_t send_device_incarnation) {
   tensorflow::FunctionDef def;
   CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
       absl::StrCat("    signature {"
@@ -239,7 +239,7 @@ TEST_P(MultiClientSendRecvTest, TestMultiClientSendRecv) {
 
     std::vector<tensorflow::DeviceAttributes> device_attrs;
     tensorflow::unwrap(ctx)->ListDevices(&device_attrs);
-    tensorflow::uint64 send_device_incarnation = 0;
+    uint64_t send_device_incarnation = 0;
     for (const auto& device_attr : device_attrs) {
       if (device_attr.name() == send_device) {
         send_device_incarnation = device_attr.incarnation();
diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc
index a4a1476edaab93..640dbb2a334050 100644
--- a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc
@@ -174,7 +174,7 @@ TEST(CAPI, MultiClientSendRecv) {
         tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
     context->ListDevices(&device_attrs);
 
-    tensorflow::uint64 send_device_incarnation = 0;
+    uint64_t send_device_incarnation = 0;
     for (const auto& device_attr : device_attrs) {
       if (device_attr.name() == send_device) {
         send_device_incarnation = device_attr.incarnation();
diff --git a/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc b/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc
index 893ee615659298..5c1864ec2bff3d 100644
--- a/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc
+++ b/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc
@@ -45,12 +45,12 @@ class TestSetConfigKeyValueOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()),
                 errors::InvalidArgument("Key must be scalar."));
-    const string& config_key = key_tensor->scalar<tstring>()();
+    const std::string& config_key = key_tensor->scalar<tstring>()();
     const Tensor* val_tensor;
     OP_REQUIRES_OK(ctx, ctx->input("value", &val_tensor));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()),
                 errors::InvalidArgument("Value must be scalar."));
-    const string& config_value = val_tensor->scalar<tstring>()();
+    const std::string& config_value = val_tensor->scalar<tstring>()();
     LOG(INFO) << "TestSetConfigKeyValueOp key=" << config_key
               << "value=" << config_value;
     auto* coord_agent = ctx->coordination_service_agent();
@@ -90,7 +90,7 @@ class TestGetConfigKeyValueOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()),
                 errors::InvalidArgument("Key must be scalar."));
-    const string& config_key = key_tensor->scalar<tstring>()();
+    const std::string& config_key = key_tensor->scalar<tstring>()();
     LOG(INFO) << "TestGetConfigKeyValueOp key=" << config_key;
 
     auto* coord_agent = ctx->coordination_service_agent();
@@ -142,7 +142,8 @@ class TestReportErrorToClusterOp : public OpKernel {
     OP_REQUIRES_OK(ctx, ctx->input("error_message", &error_message_tensor));
     OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(error_message_tensor->shape()),
                 errors::InvalidArgument("Error message must be scalar."));
-    const string& error_message = error_message_tensor->scalar<tstring>()();
+    const std::string& error_message =
+        error_message_tensor->scalar<tstring>()();
     LOG(INFO) << "TestReportErrorToClusterOp error_code=" << error_code
               << " error_message=" << error_message;
     auto* coord_agent = ctx->coordination_service_agent();

From 44359ffdde6f016a4aca25bceb8a90cee3780c82 Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Tue, 23 Dec 2025 02:38:35 -0800
Subject: [PATCH 706/753] [ReplicaGroupV3][Refactor][5/n] Update partitioner
 code to use CollectiveDeviceList (V1 replica group) in place of
 vector<vector<int>>.

PiperOrigin-RevId: 848093189
---
 .../xla/xla/service/spmd/dot_handler.cc       | 14 ++++----
 .../xla/xla/service/spmd/dot_handler_test.cc  | 24 ++++++-------
 .../xla/xla/service/spmd/spmd_partitioner.cc  | 34 +++++++++++--------
 .../xla/service/spmd/spmd_partitioner_util.cc |  8 ++---
 .../xla/service/spmd/spmd_partitioner_util.h  |  4 +--
 .../spmd/spmd_partitioner_util_test.cc        |  8 ++---
 6 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/dot_handler.cc b/third_party/xla/xla/service/spmd/dot_handler.cc
index e9739de81ca05d..e380f36be93052 100644
--- a/third_party/xla/xla/service/spmd/dot_handler.cc
+++ b/third_party/xla/xla/service/spmd/dot_handler.cc
@@ -293,7 +293,7 @@ DotDimensionIndexMapping ComputeDimensionIndexMapping(
                                   output_to_lhs_indices, output_to_rhs_indices};
 }
 
-std::vector<std::vector<int64_t>> GetPartitionGroupsForReplication(
+CollectiveDeviceList GetPartitionGroupsForReplication(
     const HloSharding& sharding, absl::Span<const int64_t> replication_dims) {
   int64_t group_size = 1;
   for (int64_t i : replication_dims) {
@@ -312,7 +312,7 @@ std::vector<std::vector<int64_t>> GetPartitionGroupsForReplication(
         }
         partition_groups[group_id].push_back(partition);
       });
-  return partition_groups;
+  return CollectiveDeviceList(partition_groups);
 }
 
 // Returns true iff all of the following conditions are simultaneously true:
@@ -3395,18 +3395,16 @@ bool PrioritizeContractingDimensionsPartitioning(
        other_non_contracting_dims) {
     ag_replication_dims.push_back(lhs_matching_iterations ? dim.rhs : dim.lhs);
   }
+
   auto all_gather_subgroups =
       GetPartitionGroupsForReplication(other_sharding, ag_replication_dims);
   auto reduce_scatter_subgroups = GetPartitionGroupsForReplication(
       outer_output_tmp_sharding, output_slice_dims);
   const double all_gather_time_in_ms = visitor->GetCommunicationTimeInMilliSec(
-      all_gather_bytes,
-      CollectiveDeviceList(visitor->CreateReplicaGroups(all_gather_subgroups)));
+      all_gather_bytes, all_gather_subgroups);
   const double reduce_scatter_time_in_ms =
-      visitor->GetCommunicationTimeInMilliSec(
-          reduce_scatter_bytes,
-          CollectiveDeviceList(
-              visitor->CreateReplicaGroups(reduce_scatter_subgroups)));
+      visitor->GetCommunicationTimeInMilliSec(reduce_scatter_bytes,
+                                              reduce_scatter_subgroups);
 
   Shape other_original_shape = other_hlo->shape();
   *other_hlo->mutable_shape() =
diff --git a/third_party/xla/xla/service/spmd/dot_handler_test.cc b/third_party/xla/xla/service/spmd/dot_handler_test.cc
index f59d9dde47ffd3..34c599c3ac9557 100644
--- a/third_party/xla/xla/service/spmd/dot_handler_test.cc
+++ b/third_party/xla/xla/service/spmd/dot_handler_test.cc
@@ -138,8 +138,8 @@ HloModule test
 ENTRY main {
   Arg_0 = bf16[2048,24576]{1,0} parameter(0), sharding={devices=[1,4]<=[4]}
   Arg_1 = bf16[24576,98304]{1,0} parameter(1), sharding={devices=[4,1]<=[4]}
-  ROOT dot = bf16[2048,98304]{1,0} dot(Arg_0, Arg_1), 
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}, 
+  ROOT dot = bf16[2048,98304]{1,0} dot(Arg_0, Arg_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
     sharding={devices=[1,4]<=[4]}
 }
 )";
@@ -183,9 +183,9 @@ HloModule test
 ENTRY main {
   Arg_0 = bf16[8,2048,256]{2,1,0} parameter(0), sharding={devices=[4,1,1]<=[4]}
   Arg_1 = bf16[8,256,512]{2,1,0} parameter(1), sharding={devices=[4,1,1]<=[4]}
-  ROOT dot = bf16[8,2048,512]{2,1,0} dot(Arg_0, Arg_1), 
+  ROOT dot = bf16[8,2048,512]{2,1,0} dot(Arg_0, Arg_1),
     lhs_batch_dims={0}, rhs_batch_dims={0},
-    lhs_contracting_dims={2}, rhs_contracting_dims={1}, 
+    lhs_contracting_dims={2}, rhs_contracting_dims={1},
     sharding={devices=[4,1,1]<=[4]}
 }
 )";
@@ -218,8 +218,8 @@ HloModule test
 ENTRY main {
   Arg_0 = bf16[128,256]{1,0} parameter(0), sharding={devices=[1,16]<=[16]}
   Arg_1 = bf16[256,512]{1,0} parameter(1), sharding={devices=[16,1]<=[16]}
-  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1), 
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}, 
+  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
     sharding={devices=[1,16]<=[16]}
 }
 )";
@@ -243,8 +243,8 @@ HloModule test
 ENTRY main {
   Arg_0 = bf16[128,256]{1,0} parameter(0), sharding={devices=[1,32]<=[32]}
   Arg_1 = bf16[256,512]{1,0} parameter(1), sharding={devices=[32,1]<=[32]}
-  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1), 
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}, 
+  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
     sharding={devices=[1,32]<=[32]}
 }
 )";
@@ -269,8 +269,8 @@ HloModule test
 ENTRY main {
   Arg_0 = bf16[128,256]{1,0} parameter(0), sharding={devices=[1,64]<=[64]}
   Arg_1 = bf16[256,512]{1,0} parameter(1), sharding={devices=[64,1]<=[64]}
-  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1), 
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}, 
+  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
     sharding={devices=[1,64]<=[64]}
 }
 )";
@@ -296,8 +296,8 @@ HloModule test
 ENTRY main {
   Arg_0 = bf16[128,256]{1,0} parameter(0), sharding={devices=[1,8]<=[8]}
   Arg_1 = bf16[256,512]{1,0} parameter(1), sharding={devices=[8,1]<=[8]}
-  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1), 
-    lhs_contracting_dims={1}, rhs_contracting_dims={0}, 
+  ROOT dot = bf16[128,512]{1,0} dot(Arg_0, Arg_1),
+    lhs_contracting_dims={1}, rhs_contracting_dims={0},
     sharding={devices=[1,8]<=[8]}
 }
 )";
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index 4f031265de029f..fc233f51925933 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -1750,14 +1750,13 @@ PartitionedHlo PartitionedHlo::ReshardWithAllToAll(
     VLOG(5) << "Falling back to creating all-to-all with replica groups V1 "
                "(list of vectors).";
     // The order of ids in the group must follow the temp_target sharding.
-    std::vector<std::vector<int64_t>> groups =
-        GetPartitionGroupsAcrossTargetDims(temp_target, {target_dim},
-                                           {group_size});
+    CollectiveDeviceList groups = GetPartitionGroupsAcrossTargetDims(
+        temp_target, {target_dim}, {group_size});
     // After the reshape, it is guaranteed to have at least 3 dimensions.
     all_to_all =
         state_.collective_ops_creator.create_cross_partition_all_to_all(
-            state_.b, {reshape}, groups, (*state_.next_channel_id)++,
-            target_dim);
+            state_.b, {reshape}, groups.flattened_replica_groups(),
+            (*state_.next_channel_id)++, target_dim);
   }
   CHECK_NE(all_to_all, nullptr);
 
@@ -1939,12 +1938,12 @@ PartitionedHlo PartitionedHlo::TryMultipleSourceTargetDims(
   } else {
     VLOG(5) << "Falling back to creating all-to-all with replica groups V1 "
                "(list of vectors).";
-    std::vector<std::vector<int64_t>> groups =
-        GetPartitionGroupsAcrossTargetDims(temp_target, eligible_target_dims,
-                                           group_sizes);
+    CollectiveDeviceList groups = GetPartitionGroupsAcrossTargetDims(
+        temp_target, eligible_target_dims, group_sizes);
     all_to_all =
         state_.collective_ops_creator.create_cross_partition_all_to_all(
-            state_.b, {reshape_1}, groups, (*state_.next_channel_id)++, 0);
+            state_.b, {reshape_1}, groups.flattened_replica_groups(),
+            (*state_.next_channel_id)++, 0);
   }
   // Step 3. Split sharding axes to multiple dimensions
   // 1. reshape_2 (8,16,8,16,8) -> (2,4,16,8,16,8)
@@ -5209,9 +5208,12 @@ SpmdPartitioner::AllGatherShardsInternal(
         auto partition_subgroups =
             GetPartitionGroupsForReplication(sharding, {*it});
         result_shape.set_dimensions(
-            *it, result_shape.dimensions(*it) * partition_subgroups[0].size());
+            *it, result_shape.dimensions(*it) *
+                     partition_subgroups.num_devices_per_group());
         result = collectives_creator.create_cross_partition_all_gather(
-            b, result, result_shape, partition_subgroups, (*next_channel_id)++,
+            b, result, result_shape,
+            partition_subgroups.flattened_replica_groups(),
+            (*next_channel_id)++,
             /*all_gather_dimension=*/*it);
       }
     }
@@ -5247,10 +5249,10 @@ SpmdPartitioner::AllGatherShardsInternal(
   } else {
     auto partition_subgroups =
         GetPartitionGroupsForReplication(sharding, selected_dims);
-    shape[0] *= partition_subgroups[0].size();
+    shape[0] *= partition_subgroups.num_devices_per_group();
     result = collectives_creator.create_cross_partition_all_gather(
         b, result, ShapeUtil::MakeShape(operand->shape().element_type(), shape),
-        partition_subgroups, (*next_channel_id)++,
+        partition_subgroups.flattened_replica_groups(), (*next_channel_id)++,
         /*all_gather_dimension=*/0);
   }
   ag = result;
@@ -5339,7 +5341,8 @@ HloInstruction* SpmdPartitioner::AllReduceAlongShardingDimsInternal(
     auto partition_subgroups =
         GetPartitionGroupsForReplication(sharding, selected_dims);
     return collectives_creator.create_cross_partition_all_reduce(
-        b, operand, reduction, partition_subgroups, (*next_channel_id)++);
+        b, operand, reduction, partition_subgroups.flattened_replica_groups(),
+        (*next_channel_id)++);
   }
 
   auto result = operand;
@@ -5362,7 +5365,8 @@ HloInstruction* SpmdPartitioner::AllReduceAlongShardingDimsInternal(
       auto partition_subgroups =
           GetPartitionGroupsForReplication(sharding, {*it});
       result = collectives_creator.create_cross_partition_all_reduce(
-          b, result, reduction, partition_subgroups, (*next_channel_id)++);
+          b, result, reduction, partition_subgroups.flattened_replica_groups(),
+          (*next_channel_id)++);
     }
   }
   return result;
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
index cac9a5766d88ac..c2b418440c1cc0 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
@@ -2895,7 +2895,7 @@ HloInstruction* PadDataFromWindowReshard(
   return sharded_data;
 }
 
-std::vector<std::vector<int64_t>> GetPartitionGroupsForReplication(
+CollectiveDeviceList GetPartitionGroupsForReplication(
     const HloSharding& sharding, absl::Span<const int64_t> replication_dims) {
   absl::Span<const int64_t> sharding_dims = sharding.dimensions();
   DCHECK_GE(sharding_dims.size(), replication_dims.size());
@@ -2939,10 +2939,10 @@ std::vector<std::vector<int64_t>> GetPartitionGroupsForReplication(
         DCHECK_LT(group_id, partition_groups.size());
         partition_groups[group_id].push_back(partition);
       });
-  return partition_groups;
+  return CollectiveDeviceList(partition_groups);
 }
 
-std::vector<std::vector<int64_t>> GetPartitionGroupsAcrossTargetDims(
+CollectiveDeviceList GetPartitionGroupsAcrossTargetDims(
     const HloSharding& sharding, std::vector<int64_t> target_dims,
     std::vector<int64_t> group_sizes) {
   CHECK(target_dims.size() == group_sizes.size());
@@ -2966,7 +2966,7 @@ std::vector<std::vector<int64_t>> GetPartitionGroupsAcrossTargetDims(
     }
     groups[group_id].push_back(device);
   });
-  return groups;
+  return CollectiveDeviceList(groups);
 }
 
 std::optional<IotaReplicaGroupList> GetIotaPartitionGroupsAcrossTargetDims(
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.h b/third_party/xla/xla/service/spmd/spmd_partitioner_util.h
index 01ca99b7f5edf7..ff3ffa5e80d7f6 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.h
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.h
@@ -595,13 +595,13 @@ HloInstruction* PadDataFromWindowReshard(
 
 // Generates partition groups (groups of devices that will communicate via a
 // collective) from sharding and provided replication_dims.
-std::vector<std::vector<int64_t>> GetPartitionGroupsForReplication(
+CollectiveDeviceList GetPartitionGroupsForReplication(
     const HloSharding& sharding, absl::Span<const int64_t> replication_dims);
 
 // Generates partition groups (groups of devices that will communicate via a
 // collective) across provided target dims with provided group sizes in vector
 // of vector format (legacy format).
-std::vector<std::vector<int64_t>> GetPartitionGroupsAcrossTargetDims(
+CollectiveDeviceList GetPartitionGroupsAcrossTargetDims(
     const HloSharding& sharding, std::vector<int64_t> target_dims,
     std::vector<int64_t> group_sizes);
 
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util_test.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util_test.cc
index dff93ecb7d4f98..06e90b3061a8e3 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util_test.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util_test.cc
@@ -77,21 +77,21 @@ TEST(SPMDPartitionerUtilTest, PartialReplicateReshardCompatibleSharding2) {
 
 TEST(SPMDPartitionerUtilTest, GetPartitionGroupsForReplication) {
   HloSharding sharding = HloSharding::IotaTile({2, 2, 2});
-  std::vector<std::vector<int64_t>> actual_partition_groups =
+  CollectiveDeviceList actual_partition_groups =
       GetPartitionGroupsForReplication(sharding, {1});
   std::vector<std::vector<int64_t>> expected_partition_groups = {
       {0, 2}, {1, 3}, {4, 6}, {5, 7}};
-  EXPECT_THAT(actual_partition_groups,
+  EXPECT_THAT(actual_partition_groups.flattened_replica_groups(),
               testing::ContainerEq(expected_partition_groups));
 }
 
 TEST(SPMDPartitionerUtilTest, GetPartitionGroupsForReplication2) {
   HloSharding sharding = HloSharding::IotaTile({2, 2, 2}, {2, 2, 2}, {0, 2, 1});
-  std::vector<std::vector<int64_t>> actual_partition_groups =
+  CollectiveDeviceList actual_partition_groups =
       GetPartitionGroupsForReplication(sharding, {0, 2});
   std::vector<std::vector<int64_t>> expected_partition_groups = {{0, 2, 4, 6},
                                                                  {1, 3, 5, 7}};
-  EXPECT_THAT(actual_partition_groups,
+  EXPECT_THAT(actual_partition_groups.flattened_replica_groups(),
               testing::ContainerEq(expected_partition_groups));
 }
 

From 8f7c36563c0066086f68b37370d869722ed82998 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 03:03:58 -0800
Subject: [PATCH 707/753] Automated Code Change

PiperOrigin-RevId: 848100039
---
 tensorflow/core/framework/type_index.h                | 5 +++--
 tensorflow/core/framework/variant_op_registry_test.cc | 6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h
index 22c0d608076af5..0277dd1418b524 100644
--- a/tensorflow/core/framework/type_index.h
+++ b/tensorflow/core/framework/type_index.h
@@ -57,8 +57,9 @@ class TypeIndex {
   static TypeIndex Make() {
 #ifdef PLATFORM_CLOUD_TPU
     static bool hash_bit[1];
-    return TypeIndex(static_cast<uint64>(reinterpret_cast<intptr_t>(hash_bit)),
-                     typeid(T).name());
+    return TypeIndex(
+        static_cast<uint64_t>(reinterpret_cast<intptr_t>(hash_bit)),
+        typeid(T).name());
 #endif
 #if defined(__GXX_RTTI) || defined(_CPPRTTI)
 
diff --git a/tensorflow/core/framework/variant_op_registry_test.cc b/tensorflow/core/framework/variant_op_registry_test.cc
index 2506bdd433242d..8a3563ab64322e 100644
--- a/tensorflow/core/framework/variant_op_registry_test.cc
+++ b/tensorflow/core/framework/variant_op_registry_test.cc
@@ -231,8 +231,8 @@ TEST(VariantOpUnaryOpRegistryTest, TestBasicGPU) {
   Variant v_out = VariantValue();
 
   OpKernelContext* null_context_pointer = nullptr;
-  Status s0 = UnaryOpVariant<GPUDevice>(null_context_pointer,
-                                        ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
+  absl::Status s0 = UnaryOpVariant<GPUDevice>(
+      null_context_pointer, ZEROS_LIKE_VARIANT_UNARY_OP, v, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(absl::StrContains(s0.message(), "early exit zeros_like"));
 
@@ -304,7 +304,7 @@ TEST(VariantOpAddRegistryTest, TestBasicGPU) {
   Variant v_out = VariantValue();
 
   OpKernelContext* null_context_pointer = nullptr;
-  Status s0 = BinaryOpVariants<GPUDevice>(
+  absl::Status s0 = BinaryOpVariants<GPUDevice>(
       null_context_pointer, ADD_VARIANT_BINARY_OP, v_a, v_b, &v_out);
   EXPECT_FALSE(s0.ok());
   EXPECT_TRUE(absl::StrContains(s0.message(), "early exit add"));

From 4a49b39e2b2b61089e425106809792cc6db07e5f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 03:53:31 -0800
Subject: [PATCH 708/753] Automated Code Change

PiperOrigin-RevId: 848113142
---
 third_party/xla/xla/backends/gpu/runtime/runtime_intrinsics.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/runtime_intrinsics.cc b/third_party/xla/xla/backends/gpu/runtime/runtime_intrinsics.cc
index 884157fd5f85f1..4f3814c0f61509 100644
--- a/third_party/xla/xla/backends/gpu/runtime/runtime_intrinsics.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/runtime_intrinsics.cc
@@ -172,7 +172,7 @@ absl::Status AppendToFileCustomCall(se::Stream* stream, ffi::AnyBuffer buffer,
   std::string filename(path);
 
   {
-    absl::MutexLock lock(&host_mutex);
+    absl::MutexLock lock(host_mutex);
 
     TF_RETURN_IF_ERROR(env->NewAppendableFile(filename, &file));
     tsl::io::RecordWriter writer(file.get());

From f42e8ace05c388247c9098fdde9b709e7564152b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 04:07:29 -0800
Subject: [PATCH 709/753] Automated Code Change

PiperOrigin-RevId: 848117271
---
 .../xla/xla/backends/gpu/runtime/host_execute_thunk_test.cc  | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/host_execute_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/host_execute_thunk_test.cc
index 732462ab004c75..f77d0e8aabcf95 100644
--- a/third_party/xla/xla/backends/gpu/runtime/host_execute_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/host_execute_thunk_test.cc
@@ -83,9 +83,8 @@ CreateHostExecuteStartThunk(
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::cpu::NanoRtExecutable> executable,
                       client.Compile(host_computation));
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<AotCompilationResult> aot_compilation_result,
-      client.Export(executable.get()));
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<CompiledModule> aot_compilation_result,
+                      client.Export(executable.get()));
 
   xla::cpu::CpuAotCompilationResult* cpu_aot_compilation_result =
       tsl::down_cast<xla::cpu::CpuAotCompilationResult*>(

From dcdba180c5735ad28e3c37b00ee077230bc1813c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 04:52:24 -0800
Subject: [PATCH 710/753] Automated Code Change

PiperOrigin-RevId: 848128508
---
 tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc b/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc
index 027f53cc3fc3e2..1c612a8f28a4ff 100644
--- a/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc
+++ b/tensorflow/dtensor/cc/xla_spmd/layout_to_xla_sharding.cc
@@ -133,8 +133,8 @@ StatusOr<::xla::OpSharding> ConvertLayoutToXlaOpSharding(const Layout& layout) {
   {
     // Set Tile Assignment Dimensions by handling both partially sharded and
     // fully sharded.
-    int32 product_of_sharded_dimensions = 1;
-    for (int32 dim_size : layout.num_shards()) {
+    int32_t product_of_sharded_dimensions = 1;
+    for (int32_t dim_size : layout.num_shards()) {
       product_of_sharded_dimensions *= dim_size;
       xla_sharding.add_tile_assignment_dimensions(dim_size);
     }

From 0c5029c2183b09c6ddc9562f07ebbb058410dca5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 07:11:07 -0800
Subject: [PATCH 711/753] Automated Code Change

PiperOrigin-RevId: 848165494
---
 tensorflow/core/kernels/fft_ops.cc             | 8 ++++----
 tensorflow/core/kernels/fused_batch_norm_op.cc | 8 +++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc
index bcc81f903b84f6..5743b6d6cc8cc7 100644
--- a/tensorflow/core/kernels/fft_ops.cc
+++ b/tensorflow/core/kernels/fft_ops.cc
@@ -599,11 +599,11 @@ class CufftScratchAllocator : public se::ScratchAllocator {
   CufftScratchAllocator(int64_t memory_limit, OpKernelContext* context)
       : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
   int64_t GetMemoryLimitInBytes() override { return memory_limit_; }
-  tsl::StatusOr<se::DeviceMemory<uint8>> AllocateBytes(
+  absl::StatusOr<stream_executor::DeviceMemory<uint8>> AllocateBytes(
       int64_t byte_size) override {
     Tensor temporary_memory;
     if (byte_size > memory_limit_) {
-      return tsl::StatusOr<se::DeviceMemory<uint8>>();
+      return absl::StatusOr<stream_executor::DeviceMemory<uint8>>();
     }
     AllocationAttributes allocation_attr;
     allocation_attr.retry_on_failure = false;
@@ -611,13 +611,13 @@ class CufftScratchAllocator : public se::ScratchAllocator {
         DT_UINT8, TensorShape({byte_size}), &temporary_memory,
         AllocatorAttributes(), allocation_attr));
     if (!allocation_status.ok()) {
-      return tsl::StatusOr<se::DeviceMemory<uint8>>();
+      return absl::StatusOr<stream_executor::DeviceMemory<uint8>>();
     }
     // Hold the reference of the allocated tensors until the end of the
     // allocator.
     allocated_tensors_.push_back(temporary_memory);
     total_byte_size_ += byte_size;
-    return tsl::StatusOr<se::DeviceMemory<uint8>>(
+    return absl::StatusOr<stream_executor::DeviceMemory<uint8>>(
         AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                        temporary_memory.flat<uint8>().size()));
   }
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index 3d0857fa13beb9..177751a7c284d3 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -704,7 +704,8 @@ class CudnnBatchNormAllocatorInTemp : public ScratchAllocator {
     return std::numeric_limits<int64_t>::max();
   }
 
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(int64_t byte_size) override {
+  absl::StatusOr<stream_executor::DeviceMemory<uint8>> AllocateBytes(
+      int64_t byte_size) override {
     Tensor temporary_memory;
     const DataType tf_data_type = DataTypeToEnum<T>::v();
     int64_t allocate_count =
@@ -757,7 +758,8 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator {
     return std::numeric_limits<int64_t>::max();
   }
 
-  StatusOr<DeviceMemory<uint8>> AllocateBytes(int64_t byte_size) override {
+  absl::StatusOr<stream_executor::DeviceMemory<uint8>> AllocateBytes(
+      int64_t byte_size) override {
     output_allocated = true;
     DCHECK(total_byte_size_ == 0)
         << "Reserve space allocator can only be called once";
@@ -774,7 +776,7 @@ class CudnnBatchNormAllocatorInOutput : public ScratchAllocator {
     auto memory_uint8 = DeviceMemory<uint8>::MakeFromByteSize(
         temporary_memory->template flat<T>().data(),
         temporary_memory->template flat<T>().size() * sizeof(T));
-    return StatusOr<DeviceMemory<uint8>>(memory_uint8);
+    return absl::StatusOr<stream_executor::DeviceMemory<uint8>>(memory_uint8);
   }
 
   int64_t TotalByteSize() { return total_byte_size_; }

From 2505aa5841317b1ad9e2bfa8f5d415554ddc6278 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 07:29:14 -0800
Subject: [PATCH 712/753] Automated Code Change

PiperOrigin-RevId: 848170174
---
 tensorflow/core/util/tensor_bundle/tensor_bundle.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index afa764a2e15227..1037ffd542b668 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -237,7 +237,7 @@ tstring* GetStringBackingBuffer(const Tensor& val) {
 
 absl::Status ParseEntryProto(absl::string_view key, absl::string_view value,
                              protobuf::MessageLite* out) {
-  if (!out->ParseFromArray(value.data(), value.size())) {
+  if (!out->ParseFromString(value)) {
     return errors::DataLoss("Entry for key ", key, " not parseable.");
   }
   return absl::OkStatus();
@@ -1225,7 +1225,7 @@ string BundleReader::DebugString() {
   BundleEntryProto entry;
   Seek(kHeaderEntryKey);
   for (Next(); Valid(); Next()) {
-    CHECK(entry.ParseFromArray(value().data(), value().size()));
+    CHECK(entry.ParseFromString(value()));
     if (entry.slices_size() > 0) continue;  // Slice of some partitioned var.
 
     strings::StrAppend(&shape_str, key(), " (", DataType_Name(entry.dtype()),

From 96f26c95279b4321910e3ddb607567d2674c68e4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 07:29:42 -0800
Subject: [PATCH 713/753] Automated Code Change

PiperOrigin-RevId: 848170320
---
 tensorflow/python/lib/core/ndarray_tensor.cc  |  5 ++--
 .../python/lib/core/ndarray_tensor_bridge.cc  |  4 +--
 tensorflow/python/lib/core/py_func.cc         | 10 ++++---
 tensorflow/python/lib/core/py_seq_tensor.cc   | 30 +++++++++----------
 tensorflow/python/lib/core/py_util.cc         | 12 ++++----
 tensorflow/python/lib/core/py_util.h          |  2 +-
 6 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index c33014cc3ae5b2..1d897c4b67e512 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -100,7 +100,7 @@ absl::Status PyArrayDescr_to_TF_DataType(PyArray_Descr* descr,
     if (!key_string) {
       return errors::Internal("Corrupt numpy type descriptor");
     }
-    tensorflow::string key = key_string;
+    std::string key = key_string;
     // The typenames here should match the field names in the custom struct
     // types constructed in test_util.py.
     // TODO(mrry,keveman): Investigate Numpy type registration to replace this
@@ -320,7 +320,8 @@ absl::Status EncodePyBytesArray(PyArrayObject* array, int64_t nelems,
   return absl::OkStatus();
 }
 
-absl::Status CopyTF_TensorStringsToPyArray(const TF_Tensor* src, uint64 nelems,
+absl::Status CopyTF_TensorStringsToPyArray(const TF_Tensor* src,
+                                           uint64_t nelems,
                                            PyArrayObject* dst) {
   const void* tensor_data = TF_TensorData(src);
   DCHECK(tensor_data != nullptr);
diff --git a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
index 703bab0f65a7b8..fbb1f10c855b15 100644
--- a/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor_bridge.cc
@@ -253,10 +253,10 @@ absl::Status ArrayFromMemory(int dim_size, npy_intp* dims, void* data,
   auto* np_array = reinterpret_cast<PyArrayObject*>(
       PyArray_SimpleNewFromData(dim_size, dims, type_num, data));
   if (np_array == nullptr) {
-    string shape_str = absl::StrJoin(
+    std::string shape_str = absl::StrJoin(
         absl::Span<npy_intp>{dims, static_cast<size_t>(dim_size)}, ", ");
     if (PyErr_Occurred()) {
-      string exception_str = PyExceptionFetch();
+      std::string exception_str = PyExceptionFetch();
       PyErr_Clear();
       return errors::InvalidArgument(
           "Failed to create numpy array from tensor of shape [", shape_str,
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 16ba9db74ba764..54178667bfddaa 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -63,7 +63,7 @@ PyObject* GetPyTrampoline() {
 struct PyCall {
   // Passed to python runtime to call the python function registered
   // with this "token".
-  string token;
+  std::string token;
 
   // The device on which Tensors are stored; only used for EagerPyFunc.
   Device* device = nullptr;
@@ -164,7 +164,8 @@ absl::Status ExtractTensorFromEagerTensor(const PyObject* eager_tensor,
   TF_RETURN_IF_ERROR(handle->Tensor(output_tensor));
   // actual_device may be nullptr, which implies local CPU.
   if (expected_device == actual_device) return absl::OkStatus();
-  const string& expected_device_name = expected_device->attributes().name();
+  const std::string& expected_device_name =
+      expected_device->attributes().name();
   if (actual_device == nullptr) {
     if (!IsCPUDevice(expected_device)) {
       return errors::Internal(
@@ -380,7 +381,8 @@ class PyFuncOp : public OpKernel {
       return;
     }
 
-    OP_REQUIRES(ctx, static_cast<int32>(call.out.size()) == ctx->num_outputs(),
+    OP_REQUIRES(ctx,
+                static_cast<int32_t>(call.out.size()) == ctx->num_outputs(),
                 errors::InvalidArgument(token_, " returns ", call.out.size(),
                                         " values, but expects to see ",
                                         ctx->num_outputs(), " values."));
@@ -396,7 +398,7 @@ class PyFuncOp : public OpKernel {
   }
 
  private:
-  string token_;
+  std::string token_;
 
   // True if and only if this op should execute the python function eagerly,
   // i.e., if and only if the eager attribute is set.
diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc
index 3547cd4a8ddc81..6b2b6be8cf53e7 100644
--- a/tensorflow/python/lib/core/py_seq_tensor.cc
+++ b/tensorflow/python/lib/core/py_seq_tensor.cc
@@ -360,8 +360,9 @@ struct ConverterTraits<int64_t> {
 typedef Converter<int64_t> Int64Converter;
 
 template <>
-struct ConverterTraits<uint64> {
-  static AbstractTensorInterface* CreateScalar(TFE_Context* ctx, uint64 value) {
+struct ConverterTraits<uint64_t> {
+  static AbstractTensorInterface* CreateScalar(TFE_Context* ctx,
+                                               uint64_t value) {
     return tensorflow::unwrap(ctx)->CreateUint64Scalar(value);
   }
 
@@ -370,7 +371,7 @@ struct ConverterTraits<uint64> {
     return tensorflow::unwrap(ctx)->CreateTensor(DT_UINT64, dim_sizes);
   }
 
-  static const char* ConvertScalar(PyObject* v, uint64* out) {
+  static const char* ConvertScalar(PyObject* v, uint64_t* out) {
 #if PY_MAJOR_VERSION < 3
     if (TF_PREDICT_TRUE(PyInt_Check(v))) {
       *out = PyInt_AsUnsignedLongLongMask(v);
@@ -394,10 +395,10 @@ struct ConverterTraits<uint64> {
   }
 };
 
-typedef Converter<uint64> UInt64Converter;
+typedef Converter<uint64_t> UInt64Converter;
 
 template <>
-struct ConverterTraits<int32> {
+struct ConverterTraits<int32_t> {
   static AbstractTensorInterface* CreateScalar(TFE_Context* ctx,
                                                int32_t value) {
     return tensorflow::unwrap(ctx)->CreateInt32Scalar(value);
@@ -408,7 +409,7 @@ struct ConverterTraits<int32> {
     return tensorflow::unwrap(ctx)->CreateTensor(DT_INT32, dim_sizes);
   }
 
-  static const char* ConvertScalar(PyObject* v, int32* out) {
+  static const char* ConvertScalar(PyObject* v, int32_t* out) {
     int64_t i;
 #if PY_MAJOR_VERSION < 3
     if (TF_PREDICT_TRUE(PyInt_Check(v))) {
@@ -432,14 +433,14 @@ struct ConverterTraits<int32> {
     } else {
       return ErrorMixedTypes;
     }
-    *out = static_cast<uint32>(static_cast<uint64>(i));
+    *out = static_cast<uint32_t>(static_cast<uint64_t>(i));
     // Check for 32-bit overflow.
     if (TF_PREDICT_FALSE(i != *out)) return ErrorFoundInt64;
     return nullptr;
   }
 };
 
-typedef Converter<int32> Int32Converter;
+typedef Converter<int32_t> Int32Converter;
 
 // Floating-point support
 
@@ -694,11 +695,11 @@ TFE_TensorHandle* NumpyToTFE_TensorHandle(TFE_Context* ctx, PyObject* obj) {
   absl::Status status = tensorflow::NdarrayToTensor(ctx, obj, &tf_tensor);
 
   if (TF_PREDICT_FALSE(!status.ok())) {
-    PyErr_SetString(PyExc_ValueError,
-                    tensorflow::strings::StrCat(
-                        "Failed to convert a NumPy array to a Tensor (",
-                        status.message(), ").")
-                        .c_str());
+    PyErr_SetString(
+        PyExc_ValueError,
+        absl::StrCat("Failed to convert a NumPy array to a Tensor (",
+                     status.message(), ").")
+            .c_str());
     return nullptr;
   }
 
@@ -758,8 +759,7 @@ TFE_TensorHandle* PySeqToTFE_TensorHandle(TFE_Context* ctx, PyObject* obj,
                .ok()) {
         PyErr_SetString(
             PyExc_TypeError,
-            tensorflow::strings::StrCat("Invalid dtype argument value ", dtype)
-                .c_str());
+            absl::StrCat("Invalid dtype argument value ", dtype).c_str());
         return nullptr;
       }
     }
diff --git a/tensorflow/python/lib/core/py_util.cc b/tensorflow/python/lib/core/py_util.cc
index a78f0a12f21c3f..fa1845bd782841 100644
--- a/tensorflow/python/lib/core/py_util.cc
+++ b/tensorflow/python/lib/core/py_util.cc
@@ -45,7 +45,7 @@ const char* ClassName(PyObject* py) {
 
 // Returns a PyObject containing a string, or null
 void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback,
-                        string* out) {
+                        std::string* out) {
   // The "traceback" module is assumed to be imported already by script_ops.py.
   PyObject* tb_module = PyImport_AddModule("traceback");
 
@@ -84,7 +84,7 @@ void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback,
 #if PY_MAJOR_VERSION < 3
     strings::StrAppend(out, PyString_AS_STRING(v), "\n");
 #else
-    strings::StrAppend(out, PyUnicode_AsUTF8(v), "\n");
+    absl::StrAppend(out, PyUnicode_AsUTF8(v), "\n");
 #endif
   }
 
@@ -92,7 +92,7 @@ void TryAppendTraceback(PyObject* ptype, PyObject* pvalue, PyObject* ptraceback,
   Py_DECREF(ret_val);
 }
 
-string PyExceptionFetch() {
+std::string PyExceptionFetch() {
   CHECK(PyErr_Occurred())
       << "Must only call PyExceptionFetch after an exception.";
   PyObject* ptype;
@@ -100,7 +100,7 @@ string PyExceptionFetch() {
   PyObject* ptraceback;
   PyErr_Fetch(&ptype, &pvalue, &ptraceback);
   PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
-  string err = ClassName(ptype);
+  std::string err = ClassName(ptype);
   if (pvalue) {
     PyObject* str = PyObject_Str(pvalue);
 
@@ -108,11 +108,11 @@ string PyExceptionFetch() {
 #if PY_MAJOR_VERSION < 3
       strings::StrAppend(&err, ": ", PyString_AS_STRING(str), "\n");
 #else
-      strings::StrAppend(&err, ": ", PyUnicode_AsUTF8(str), "\n");
+      absl::StrAppend(&err, ": ", PyUnicode_AsUTF8(str), "\n");
 #endif
       Py_DECREF(str);
     } else {
-      strings::StrAppend(&err, "(unknown error message)\n");
+      absl::StrAppend(&err, "(unknown error message)\n");
     }
 
     TryAppendTraceback(ptype, pvalue, ptraceback, &err);
diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h
index af1b21699e6502..d6b2b9f78ddca6 100644
--- a/tensorflow/python/lib/core/py_util.h
+++ b/tensorflow/python/lib/core/py_util.h
@@ -25,7 +25,7 @@ namespace tensorflow {
 
 // Fetch the exception message as a string. An exception must be set
 // (PyErr_Occurred() must be true).
-string PyExceptionFetch();
+std::string PyExceptionFetch();
 
 // Assert that Python GIL is held.
 inline void DCheckPyGilState() {

From ee2c4c4987d988812d02e079522681d3402befb8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 07:38:25 -0800
Subject: [PATCH 714/753] Automated Code Change

PiperOrigin-RevId: 848173489
---
 tensorflow/cc/framework/scope.cc | 76 ++++++++++++++++----------------
 1 file changed, 39 insertions(+), 37 deletions(-)

diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc
index 0185fd11d37dec..b457f602b4a5b0 100644
--- a/tensorflow/cc/framework/scope.cc
+++ b/tensorflow/cc/framework/scope.cc
@@ -79,7 +79,7 @@ Scope Scope::DisabledShapeInferenceScope() {
                         /* disable_shape_inference */ true));
 }
 
-Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name,
+Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const std::string& name,
                   bool copy_names)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
@@ -98,8 +98,8 @@ Scope::Impl::Impl(const Scope& other, Tags::ScopeName, const string& name,
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
-Scope::Impl::Impl(const Scope& other, Tags::OpName, const string& name,
-                  const string& op_name)
+Scope::Impl::Impl(const Scope& other, Tags::OpName, const std::string& name,
+                  const std::string& op_name)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
       name_map_(other.impl()->name_map_),
@@ -140,7 +140,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ControlDeps,
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
-Scope::Impl::Impl(const Scope& other, Tags::Device, const string& device)
+Scope::Impl::Impl(const Scope& other, Tags::Device, const std::string& device)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
       name_map_(other.impl()->name_map_),
@@ -158,7 +158,7 @@ Scope::Impl::Impl(const Scope& other, Tags::Device, const string& device)
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
 Scope::Impl::Impl(const Scope& other, Tags::SingleUseScope,
-                  const string& op_name)
+                  const std::string& op_name)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
       name_map_(other.impl()->name_map_),
@@ -193,7 +193,7 @@ Scope::Impl::Impl(const Scope& other, Tags::ExitOnError)
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
 Scope::Impl::Impl(const Scope& other, Tags::KernelLabel,
-                  const string& kernel_label)
+                  const std::string& kernel_label)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
       name_map_(other.impl()->name_map_),
@@ -227,12 +227,12 @@ Scope::Impl::Impl(const Scope& other, Tags::Colocate,
       xla_cluster_(other.impl()->xla_cluster_),
       colocation_constraints_(
           clear_colocations
-              ? std::unordered_set<string>()
+              ? std::unordered_set<std::string>()
               : other.impl()->GetColocationConstraints(colocate_with_op)),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
 Scope::Impl::Impl(const Scope& other, Tags::AssignedDevice,
-                  const string& assigned_device)
+                  const std::string& assigned_device)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
       name_map_(other.impl()->name_map_),
@@ -250,7 +250,7 @@ Scope::Impl::Impl(const Scope& other, Tags::AssignedDevice,
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
 Scope::Impl::Impl(const Scope& other, Tags::XlaCluster,
-                  const string& xla_cluster)
+                  const std::string& xla_cluster)
     : graph_(other.impl()->graph_),
       status_(other.impl()->status_),
       name_map_(other.impl()->name_map_),
@@ -267,13 +267,13 @@ Scope::Impl::Impl(const Scope& other, Tags::XlaCluster,
       colocation_constraints_(other.impl()->colocation_constraints_),
       disable_shape_inference_(other.impl()->disable_shape_inference_) {}
 
-std::unordered_set<string> Scope::Impl::GetColocationConstraints(
+std::unordered_set<std::string> Scope::Impl::GetColocationConstraints(
     const Operation& colocate_with_op) const {
-  std::unordered_set<string> current_constraints(colocation_constraints_);
+  std::unordered_set<std::string> current_constraints(colocation_constraints_);
   const AttrSlice attrs = colocate_with_op.node()->attrs();
-  std::vector<string> node_constraints;
+  std::vector<std::string> node_constraints;
   if (TryGetNodeAttr(attrs, kColocationAttrName, &node_constraints)) {
-    for (const string& entry : node_constraints) {
+    for (const std::string& entry : node_constraints) {
       absl::string_view s(entry);
       if (absl::ConsumePrefix(&s, kColocationGroupPrefix)) {
         current_constraints.emplace(s);
@@ -335,13 +335,14 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const {
   }
 
   if (!impl()->colocation_constraints_.empty()) {
-    std::vector<string> constraints(impl()->colocation_constraints_.begin(),
-                                    impl()->colocation_constraints_.end());
+    std::vector<std::string> constraints(
+        impl()->colocation_constraints_.begin(),
+        impl()->colocation_constraints_.end());
     // Sort the set.
     std::sort(constraints.begin(), constraints.end());
     // Add loc:@ prefix
     std::transform(constraints.begin(), constraints.end(), constraints.begin(),
-                   [](const string& s) {
+                   [](const std::string& s) {
                      return absl::StrCat(kColocationGroupPrefix, s);
                    });
     builder->Attr(kColocationAttrName, constraints);
@@ -357,8 +358,8 @@ void Scope::UpdateBuilder(NodeBuilder* builder) const {
   }
 }
 
-string Scope::Impl::GetUniqueName(const string& prefix,
-                                  bool check_single_use) const {
+std::string Scope::Impl::GetUniqueName(const std::string& prefix,
+                                       bool check_single_use) const {
   if (check_single_use && single_use_scope()) {
     if (*scope_used_) {
       *status_ =
@@ -373,7 +374,7 @@ string Scope::Impl::GetUniqueName(const string& prefix,
     name_map_->insert({prefix, 0});
     return prefix;
   }
-  string unique_name;
+  std::string unique_name;
   do {
     unique_name = absl::StrCat(prefix, kSuffixSeparator, ++entry->second);
   } while (name_map_->find(unique_name) != name_map_->end());
@@ -381,15 +382,15 @@ string Scope::Impl::GetUniqueName(const string& prefix,
   return unique_name;
 }
 
-string Scope::Impl::GetNameForOp(const string& default_name) const {
-  const string unique_name =
+std::string Scope::Impl::GetNameForOp(const std::string& default_name) const {
+  const std::string unique_name =
       GetUniqueName(default_name, true /* check_single_use */);
-  const string sep =
+  const std::string sep =
       name_.empty() || unique_name.empty() ? "" : kScopeSeparator;
   return absl::StrCat(name_, sep, unique_name);
 }
 
-string Scope::GetUniqueNameForOp(const string& default_name) const {
+std::string Scope::GetUniqueNameForOp(const std::string& default_name) const {
   if (impl()->single_use_scope()) {
     if (impl()->op_name_.empty() || *impl()->scope_used_) {
       *impl()->status_ =
@@ -403,21 +404,21 @@ string Scope::GetUniqueNameForOp(const string& default_name) const {
                                   : impl()->GetNameForOp(impl()->op_name_);
 }
 
-Scope Scope::NewSubScope(const string& child_scope_name) const {
+Scope Scope::NewSubScope(const std::string& child_scope_name) const {
   if (child_scope_name.empty()) {
     return Scope(new Impl(*this, Impl::Tags::ScopeName(), impl()->name_,
                           true /* copy_names */));
   }
-  const string unique_name =
+  const std::string unique_name =
       impl()->GetUniqueName(child_scope_name, false /* check_single_use */);
-  const string sep =
+  const std::string sep =
       impl()->name_.empty() || unique_name.empty() ? "" : kScopeSeparator;
   return Scope(new Impl(*this, Impl::Tags::ScopeName(),
                         absl::StrCat(impl()->name_, sep, unique_name),
                         false /* copy_names */));
 }
 
-Scope Scope::WithOpNameImpl(const string& op_name) const {
+Scope Scope::WithOpNameImpl(const std::string& op_name) const {
   if (impl()->single_use_scope()) {
     UpdateStatus(errors::InvalidArgument("Cannot set op name ", op_name,
                                          " on this scope"));
@@ -446,15 +447,15 @@ Scope Scope::WithNoControlDependencies() const {
                         /* clear_control_deps */ true));
 }
 
-Scope Scope::WithDevice(const string& device) const {
+Scope Scope::WithDevice(const std::string& device) const {
   return Scope(new Impl(*this, Impl::Tags::Device(), device));
 }
 
-Scope Scope::WithAssignedDevice(const string& assigned_device) const {
+Scope Scope::WithAssignedDevice(const std::string& assigned_device) const {
   return Scope(new Impl(*this, Impl::Tags::AssignedDevice(), assigned_device));
 }
 
-Scope Scope::WithXlaCluster(const string& xla_cluster) const {
+Scope Scope::WithXlaCluster(const std::string& xla_cluster) const {
   return Scope(new Impl(*this, Impl::Tags::XlaCluster(), xla_cluster));
 }
 
@@ -472,12 +473,12 @@ Scope Scope::ExitOnError() const {
   return Scope(new Impl(*this, Impl::Tags::ExitOnError()));
 }
 
-Scope Scope::WithKernelLabel(const string& kernel_label) const {
+Scope Scope::WithKernelLabel(const std::string& kernel_label) const {
   return Scope(new Impl(*this, Impl::Tags::KernelLabel(), kernel_label));
 }
 
 CompositeOpScopes Scope::GetCompositeOpScopes(
-    const string& composite_op_name) const {
+    const std::string& composite_op_name) const {
   if (impl()->op_name_.empty() && composite_op_name.empty()) {
     UpdateStatus(errors::InvalidArgument(
         "Cannot create composite op scopes with empty name"));
@@ -486,8 +487,9 @@ CompositeOpScopes Scope::GetCompositeOpScopes(
   if (!impl()->single_use_scope()) {
     Scope child = NewSubScope(impl()->op_name_.empty() ? composite_op_name
                                                        : impl()->op_name_);
-    const string child_op_sep = impl()->name_.empty() ? "" : kSuffixSeparator;
-    const string child_name =
+    const std::string child_op_sep =
+        impl()->name_.empty() ? "" : kSuffixSeparator;
+    const std::string child_name =
         absl::StrCat(impl()->name_, child_op_sep, child.impl()->name_);
     return {child,
             Scope(new Impl(child, Impl::Tags::SingleUseScope(), child_name))};
@@ -510,11 +512,11 @@ class InternalScope {
                         ShapeRefiner* refiner) {
     Scope::Impl::NameMap* name_map = new Scope::Impl::NameMap;
     for (const Node* node : graph->nodes()) {
-      const string& name = node->name();
+      const std::string& name = node->name();
       (*name_map)[name] = 0;
       // Add all name prefixes ('/' separated).
       size_t idx = -1;
-      while ((idx = name.find(kScopeSeparator, idx + 1)) != string::npos) {
+      while ((idx = name.find(kScopeSeparator, idx + 1)) != std::string::npos) {
         (*name_map)[name.substr(0, idx)] = 0;
       }
     }
@@ -533,7 +535,7 @@ Scope NewInternalScope(Graph* graph, absl::Status* status,
   return InternalScope::NewScope(graph, status, refiner);
 }
 
-absl::Status CreateOutputWithScope(string op_name,
+absl::Status CreateOutputWithScope(std::string op_name,
                                    absl::Span<const ::tensorflow::Input> inputs,
                                    const Scope& scope, Output* output) {
   TF_RETURN_IF_ERROR(scope.status());

From 2cc7d2ac894e6389568b0f358966963ca45808c1 Mon Sep 17 00:00:00 2001
From: Jing Pu <jingpu@google.com>
Date: Tue, 23 Dec 2025 09:31:53 -0800
Subject: [PATCH 715/753] Reverts 05c8cf7c132f310c8f8f2a022ba52d22b52e52c4

PiperOrigin-RevId: 848205408
---
 .../compiler/tests/gather_nd_op_test.py       | 22 +------
 .../compiler/tf2xla/kernels/gather_op.cc      | 59 +------------------
 2 files changed, 3 insertions(+), 78 deletions(-)

diff --git a/tensorflow/compiler/tests/gather_nd_op_test.py b/tensorflow/compiler/tests/gather_nd_op_test.py
index 4f1d6f477e896b..60a7949138e9c1 100644
--- a/tensorflow/compiler/tests/gather_nd_op_test.py
+++ b/tensorflow/compiler/tests/gather_nd_op_test.py
@@ -24,14 +24,12 @@
 
 class GatherNdTest(xla_test.XLATestCase):
 
-  def _runGather(self, params, indices, bad_indices_policy=""):
+  def _runGather(self, params, indices):
     with self.session():
       paramsp = array_ops.placeholder(params.dtype)
       indicesp = array_ops.placeholder(indices.dtype)
       with self.test_scope():
-        gather_nd_t = array_ops.gather_nd(
-            paramsp, indicesp, bad_indices_policy=bad_indices_policy
-        )
+        gather_nd_t = array_ops.gather_nd(paramsp, indicesp)
       feed_dict = {paramsp: params, indicesp: indices}
       return gather_nd_t.eval(feed_dict=feed_dict)
 
@@ -141,22 +139,6 @@ def testHigherRankParamsAndIndices(self):
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
 
-  def testIgnoreBadIndices(self):
-    shape = (3, 4, 5)
-    params = np.arange(np.prod(shape), dtype=np.int32).reshape(shape)
-    indices = np.array([[[0, 0], [-1, 3]], [[2, 4], [2, 3]]], dtype=np.int32)
-    gather_nd_val = self._runGather(
-        params, indices, bad_indices_policy="IGNORE"
-    )
-    expected = np.array(
-        [
-            [[0, 1, 2, 3, 4], [0, 0, 0, 0, 0]],
-            [[0, 0, 0, 0, 0], [55, 56, 57, 58, 59]],
-        ],
-        dtype=np.int32,
-    )
-    self.assertAllEqual(expected, gather_nd_val)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 3d12dadcbd53e9..e94f74d1fed8ef 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include <cstdint>
 #include <optional>
-#include <string>
 #include <vector>
 
 #include "absl/log/check.h"
@@ -284,13 +283,7 @@ REGISTER_XLA_OP(Name("GatherV2").CompileTimeConstantInput("axis"), GatherOp);
 
 class GatherNdOp : public XlaOpKernel {
  public:
-  explicit GatherNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {
-    // Set batch_dims_ to 0 if the attribute does not exist.
-    if (context->HasAttr("bad_indices_policy")) {
-      OP_REQUIRES_OK(context, context->GetAttr("bad_indices_policy",
-                                               &bad_indices_policy_));
-    }
-  }
+  explicit GatherNdOp(OpKernelConstruction* context) : XlaOpKernel(context) {}
 
   void Compile(XlaOpKernelContext* context) override {
     DataType params_type = context->input_type(0);
@@ -319,58 +312,8 @@ class GatherNdOp : public XlaOpKernel {
                                       indices_shape, /*axis=*/0,
                                       /*indices_are_nd=*/true, params_type,
                                       indices_type, builder, &gather));
-    // By default, XLA clips OOB indices, while "IGNORE" policy demands to fill
-    // 0s to the output. The following code implements the "IGNORE" policy by
-    // masking the gather result with the valid indices mask.
-    if (bad_indices_policy_ == "IGNORE") {
-      xla::XlaOp valid_mask;
-      for (int i = 0; i < num_index_dims; ++i) {
-        xla::XlaOp i_limit = XlaHelpers::IntegerLiteral(
-            builder, indices_type, params_shape.dim_size(i));
-        xla::XlaOp i_zero = XlaHelpers::Zero(builder, indices_type);
-        xla::XlaOp indices_i =
-            xla::SliceInDim(indices, i, i + 1, 1, indices_shape.dims() - 1);
-
-        xla::XlaOp indices_i_good =
-            xla::And(xla::Ge(indices_i, i_zero), xla::Lt(indices_i, i_limit));
-        if (i == 0) {
-          valid_mask = indices_i_good;
-        } else {
-          valid_mask = xla::And(valid_mask, indices_i_good);
-        }
-      }
-      auto gather_shape = builder->GetShape(gather);
-      OP_REQUIRES_OK(context, gather_shape.status());
-
-      std::vector<int64_t> valid_mask_dims(
-          gather_shape->dimensions().begin(),
-          gather_shape->dimensions().end() - 1);
-      valid_mask = xla::Reshape(valid_mask, valid_mask_dims);
-      if (indices_shape.dims() != gather_shape->dimensions().size()) {
-        OP_REQUIRES(
-            context,
-            gather_shape->dimensions().size() == indices_shape.dims() - 1,
-            errors::InvalidArgument(
-                "Indices rank must be equal to output rank (with channel "
-                "dimension) or 1 less (w/o channel dimension)"));
-      } else {
-        std::vector<int64_t> broadcast_dims(valid_mask_dims.size(), 1);
-        for (int i = 0; i < broadcast_dims.size(); ++i) {
-          broadcast_dims[i] = i;
-        }
-        valid_mask = xla::BroadcastInDim(valid_mask, gather_shape->dimensions(),
-                                         broadcast_dims);
-      }
-
-      gather =
-          xla::Select(valid_mask, gather,
-                      xla::Broadcast(XlaHelpers::Zero(builder, params_type),
-                                     gather_shape->dimensions()));
-    }
     context->SetOutput(0, gather);
   }
-
-  std::string bad_indices_policy_;
 };
 
 REGISTER_XLA_OP(Name("GatherNd"), GatherNdOp);

From b689237cba50d56f43ef7abadde86e365af067c0 Mon Sep 17 00:00:00 2001
From: Mehrdad Khani <mehrdadk@google.com>
Date: Tue, 23 Dec 2025 10:57:16 -0800
Subject: [PATCH 716/753] Adds a new test that MemorySpacePropagation currently
 fails on.

PiperOrigin-RevId: 848233727
---
 third_party/xla/xla/hlo/transforms/BUILD      |  1 +
 .../memory_space_propagation_test.cc          | 51 +++++++++++++++++++
 2 files changed, 52 insertions(+)

diff --git a/third_party/xla/xla/hlo/transforms/BUILD b/third_party/xla/xla/hlo/transforms/BUILD
index 480674c2bf4495..778942bfc200a3 100644
--- a/third_party/xla/xla/hlo/transforms/BUILD
+++ b/third_party/xla/xla/hlo/transforms/BUILD
@@ -189,6 +189,7 @@ xla_cc_test(
         "//xla/hlo/parser:hlo_parser",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
         "//xla/tsl/lib/core:status_test_util",
+        "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/hash",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:string_view",
diff --git a/third_party/xla/xla/hlo/transforms/memory_space_propagation_test.cc b/third_party/xla/xla/hlo/transforms/memory_space_propagation_test.cc
index e2a46b570a86d4..1b4558a9dfe2a6 100644
--- a/third_party/xla/xla/hlo/transforms/memory_space_propagation_test.cc
+++ b/third_party/xla/xla/hlo/transforms/memory_space_propagation_test.cc
@@ -24,9 +24,11 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/analysis/hlo_dataflow_analysis.h"
 #include "xla/hlo/ir/hlo_computation.h"
+#include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/statusor.h"
 
 namespace xla {
 namespace {
@@ -592,5 +594,54 @@ TEST_F(MemorySpacePropagationTest, RunOnComputationPropagateFromOutput) {
   EXPECT_EQ(absl::HashOf(*module), absl::HashOf(*ref));
 }
 
+// TODO (b/469840065): Re-enable this test once the memory space propagation bug
+// is fixed for nested fusions.
+TEST_F(MemorySpacePropagationTest, DISABLED_NestedFusionShapeMismatchBug) {
+  absl::string_view hlo_string =
+      R"(HloModule jit_insert.fusion.21.isolated, is_scheduled=true
+
+%copy_fusion.20.clone {
+  %input.20 = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)S(1)} parameter(0)
+  ROOT %copy.4014 = s4[8,32768,1,256]{3,1,0,2:T(8,128)(8,1)E(4)} copy(%input.20)
+}
+
+%fused_computation.434.clone {
+  %param_0.777 = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)S(1)} parameter(0)
+  %fusion.505 = s4[8,32768,1,256]{3,1,0,2:T(8,128)(8,1)E(4)S(1)} fusion(%param_0.777), kind=kLoop, output_to_operand_aliasing={{}: (0, {})}, calls=%copy_fusion.20.clone
+  %param_3.751 = pred[]{:T(512)} parameter(3)
+  %broadcast.1846 = pred[1,16384,1,256]{3,1,0,2:T(8,128)(4,1)} broadcast(%param_3.751), dimensions={}, metadata={op_name="jit(insert)/jit(main)/pjit/jit(insert)/jit(main)/jit(insert)/pjit/jit(insert)/jit(main)/jit(insert)/jit(insert)/dynamic_update_slice" stack_frame_id=146}
+  %param_2.1768 = s4[1,16384,1,256]{3,1,0,2:T(8,128)(8,1)E(4)S(1)} parameter(2)
+  %param_1.980 = s32[]{:T(128)S(6)} parameter(1)
+  %constant.9791 = s32[]{:T(128)} constant(0), metadata={op_name="jit(insert)/jit(main)/pjit/jit(insert)/jit(main)/jit(insert)/pjit"}
+  %dynamic-slice.2042 = s4[1,16384,1,256]{3,1,0,2:T(8,128)(8,1)E(4)} dynamic-slice(%fusion.505, %param_1.980, %constant.9791, %constant.9791, %constant.9791), dynamic_slice_sizes={1,16384,1,256}, metadata={op_name="jit(insert)/jit(main)/pjit/jit(insert)/jit(main)/jit(insert)/pjit/jit(insert)/jit(main)/jit(insert)/jit(insert)/dynamic_update_slice" stack_frame_id=146}, backend_config={"flag_configs":[],"scoped_memory_configs":[],"indices_config":{"index_known_bits":[{"zeroes":"0","ones":"0","bitwidth":"32"},{"zeroes":"4294967295","ones":"0","bitwidth":"32"},{"zeroes":"4294967295","ones":"0","bitwidth":"32"},{"zeroes":"4294967295","ones":"0","bitwidth":"32"}],"is_index_aligned":[true,true,true,true]},"used_scoped_memory_configs":[]}
+  %select.912 = s4[1,16384,1,256]{3,1,0,2:T(8,128)(8,1)E(4)} select(%broadcast.1846, %param_2.1768, %dynamic-slice.2042), metadata={op_name="jit(insert)/jit(main)/pjit/jit(insert)/jit(main)/jit(insert)/pjit/jit(insert)/jit(main)/jit(insert)/jit(insert)/dynamic_update_slice" stack_frame_id=146}
+  ROOT %dynamic-update-slice.455 = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)S(1)} dynamic-update-slice(%fusion.505, %select.912, %param_1.980, %constant.9791, %constant.9791, /*index=5*/%constant.9791), metadata={op_name="jit(insert)/jit(main)/pjit/jit(insert)/jit(main)/jit(insert)/pjit/jit(insert)/jit(main)/jit(insert)/jit(insert)/dynamic_update_slice" stack_frame_id=146}, backend_config={"flag_configs":[],"scoped_memory_configs":[],"indices_config":{"index_known_bits":[{"zeroes":"0","ones":"0","bitwidth":"32"},{"zeroes":"4294967295","ones":"0","bitwidth":"32"},{"zeroes":"4294967295","ones":"0","bitwidth":"32"},{"zeroes":"4294967295","ones":"0","bitwidth":"32"}],"is_index_aligned":[true,true,true,true]},"used_scoped_memory_configs":[]}
+}
+
+ENTRY %jit_insert.fusion.21.isolated.root {
+  %bitcast.1556.hbm = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)} parameter(0)
+  %select.32 = s32[]{:T(128)S(6)} parameter(1)
+  %collective-permute.56.hbm = s4[1,16384,1,256]{3,1,0,2:T(8,128)(8,1)E(4)} parameter(2)
+  %and.74 = pred[]{:T(512)} parameter(3)
+  %copy = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)S(1)} copy(%bitcast.1556.hbm)
+  %copy.1 = s4[1,16384,1,256]{3,1,0,2:T(8,128)(8,1)E(4)S(1)} copy(%collective-permute.56.hbm)
+  %fusion.21 = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)S(1)} fusion(%copy, %select.32, %copy.1, %and.74), kind=kLoop, calls=%fused_computation.434.clone, metadata={op_name="jit(insert)/jit(main)/pjit/jit(insert)/jit(main)/jit(insert)/pjit/jit(insert)/jit(main)/jit(insert)/jit(insert)/dynamic_update_slice" stack_frame_id=146}, backend_config={"flag_configs":[],"scoped_memory_configs":[],"used_scoped_memory_configs":[],"aliasing_operands":{"lists":[{"indices":["0","4"]}]}}
+  ROOT %copy.2 = s4[8,32768,1,256]{3,1,0,2:T(64,128)(8,1)E(4)} copy(%fusion.21)
+})";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnUnverifiedModule(hlo_string));
+  MemorySpacePropagation memory_space_propagation;
+  // %copy.4014 output memory space must get modified to match %fusion.505
+  // output shape.
+  EXPECT_TRUE(memory_space_propagation.Run(module.get()).value());
+  HloComputation* computation =
+      module->GetComputationWithName("copy_fusion.20.clone");
+  EXPECT_NE(computation, nullptr);
+  const HloInstruction* copy = computation->GetInstructionWithName("copy.4014");
+  EXPECT_NE(copy, nullptr);
+  EXPECT_EQ(copy->shape().layout().memory_space(), 1);
+  TF_EXPECT_OK(Verify(module.get()));
+}
+
 }  // namespace
 }  // namespace xla

From 4e413608beef6a5b0c80ccea3ef6b60c6e3d0d2f Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 23 Dec 2025 11:08:22 -0800
Subject: [PATCH 717/753] switch from deprecated TF_CHECK_OK

PiperOrigin-RevId: 848237348
---
 tensorflow/compiler/jit/BUILD                 |  78 ++++--
 .../compiler/jit/compilability_check_util.cc  |  40 ++-
 .../jit/compilability_check_util_test.cc      |  23 +-
 .../jit/encapsulate_subgraphs_pass_test.cc    |  36 ++-
 .../compiler/jit/encapsulate_util_test.cc     |  17 +-
 .../encapsulate_xla_computations_pass_test.cc |  58 +++--
 .../extract_outside_compilation_pass_test.cc  | 235 ++++++++++--------
 .../jit/mark_for_compilation_pass_test.cc     |  43 +++-
 tensorflow/compiler/jit/node_matchers.cc      |  19 +-
 .../rearrange_function_argument_pass_test.cc  |  63 ++---
 .../compiler/jit/shape_inference_test.cc      |  17 +-
 tensorflow/compiler/jit/test_util.cc          |  22 +-
 tensorflow/compiler/jit/tests/BUILD           |   2 +
 .../jit/tests/auto_clustering_test.cc         |  10 +-
 tensorflow/compiler/jit/xla_cluster_util.cc   |  37 ++-
 tensorflow/compiler/jit/xla_device_context.cc |  35 ++-
 .../compiler/jit/xla_kernel_creator_test.cc   |  25 +-
 tensorflow/compiler/jit/xla_launch_util.cc    |   6 +-
 .../compiler/jit/xla_launch_util_gpu_test.cc  |   2 +-
 .../compiler/jit/xla_launch_util_test.cc      |  27 +-
 .../compiler/jit/xla_platform_info_test.cc    |  20 +-
 21 files changed, 529 insertions(+), 286 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 9c2de5b39016e7..1ed658d73f1a4a 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -256,7 +256,6 @@ cc_library(
     hdrs = ["xla_device_context.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":xla_launch_util",
         ":xla_tensor",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:layout_util",
@@ -264,14 +263,29 @@ cc_library(
         "//tensorflow/compiler/tf2xla:xla_helpers",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
         "//tensorflow/core:lib_proto_parsing",
-        "//tensorflow/core/common_runtime:device",
         "//tensorflow/core/common_runtime:dma_helper",
         "//tensorflow/core/framework:allocator",
+        "//tensorflow/core/platform:errors",
+        "//tensorflow/core/platform:status",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/synchronization",
-        "@local_tsl//tsl/platform:statusor",
-        "@local_xla//xla:util",
+        "@local_xla//xla:literal",
+        "@local_xla//xla:shape_util",
+        "@local_xla//xla:status_macros",
         "@local_xla//xla/client:local_client",
+        "@local_xla//xla/service:stream_pool",
+        "@local_xla//xla/stream_executor:allocator_stats",
+        "@local_xla//xla/stream_executor:event",
+        "@local_xla//xla/stream_executor:stream",
+        "@local_xla//xla/stream_executor:stream_executor_h",
+        "@local_xla//xla/tsl/platform:errors",
+        "@local_xla//xla/tsl/platform:statusor",
     ],
 )
 
@@ -685,7 +699,9 @@ tf_cc_test(
     name = "xla_launch_util_test",
     srcs = ["xla_launch_util_test.cc"],
     deps = [
+        ":device_compilation_profiler",
         ":device_compiler",
+        ":device_executable_persistor",
         ":flags_headers",
         ":pjrt_device_compiler_client",
         ":variable_info",
@@ -694,25 +710,35 @@ tf_cc_test(
         ":xla_cpu_jit",
         ":xla_device_no_jit_rewrite_registration",
         ":xla_launch_util",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla:xla_op_registry",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:portable_gif_internal",
+        "//tensorflow/core:session_options",
         "//tensorflow/core/framework:fake_input",
         "//tensorflow/core/framework:tensor_testutil",
         "//tensorflow/core/kernels:ops_testutil",
         "//tensorflow/core/platform:refcount",
         "//tensorflow/core/tfrt/common:create_pjrt_client_util",
         "//tensorflow/core/tfrt/common:pjrt_util",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
-        "@local_tsl//tsl/platform:status",
-        "@local_tsl//tsl/platform:statusor",
+        "@local_xla//xla:literal",
+        "@local_xla//xla:literal_util",
         "@local_xla//xla/pjrt:pjrt_client",
         "@local_xla//xla/pjrt:pjrt_common",
+        "@local_xla//xla/pjrt:pjrt_executable",
         "@local_xla//xla/pjrt/plugin/xla_cpu:cpu_client_options",
         "@local_xla//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client",
         "@local_xla//xla/tests:literal_test_util",
         "@local_xla//xla/tsl/framework:device_id_utils",
         "@local_xla//xla/tsl/lib/core:status_test_util",
+        "@local_xla//xla/tsl/platform:statusor",
     ],
 )
 
@@ -979,8 +1005,7 @@ tf_cc_test(
         "//tensorflow/core:session_options",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
     ],
 )
@@ -1060,6 +1085,9 @@ cc_library(
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/strings",
         "@local_xla//xla:status_macros",
     ],
 )
@@ -1078,11 +1106,10 @@ tf_cc_test(
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:ops",
-        "//tensorflow/core:portable_gif_internal",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/kernels:constant_op",
-        "@local_tsl//tsl/platform:status",
+        "@com_google_absl//absl/log:check",
     ],
 )
 
@@ -1109,12 +1136,15 @@ tf_cc_test(
     deps = [
         ":encapsulate_util",
         "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/log:check",
     ],
 )
 
@@ -1219,11 +1249,16 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/common_runtime:function_body",
         "//tensorflow/core/framework:bounds_check",
+        "//tensorflow/core/platform:hash",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -1366,6 +1401,9 @@ tf_cc_test(
         "//tensorflow/core:testlib",
         "//tensorflow/core/common_runtime:device_set",
         "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
@@ -1417,6 +1455,8 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
@@ -1453,7 +1493,7 @@ cc_library(
         ":xla_activity_proto_cc",
         ":xla_cluster_util",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
-        "//tensorflow/compiler/tf2xla:tf2xla_util",
+        "//tensorflow/compiler/tf2xla:tf2xla_defs",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla:xla_op_registry",
         "//tensorflow/core:core_cpu",
@@ -1461,12 +1501,17 @@ cc_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/common_runtime:function_body",
+        "//tensorflow/core/common_runtime:function_utils",
         "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
+        "@com_google_absl//absl/types:span",
         "@local_xla//xla:union_find",
         "@local_xla//xla:util",
         "@local_xla//xla/service/graphcycles",
@@ -1477,6 +1522,7 @@ tf_cc_test(
     name = "compilability_check_util_test",
     srcs = ["compilability_check_util_test.cc"],
     deps = [
+        ":common",
         ":compilability_check_util",
         ":xla_cpu_device",
         ":xla_cpu_jit",
@@ -1485,17 +1531,17 @@ tf_cc_test(
         "//tensorflow/cc:functional_ops",
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope",
-        "//tensorflow/compiler/tf2xla:test_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/cc:xla_jit_ops",
-        "//tensorflow/compiler/tf2xla/cc:xla_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:span",
     ],
 )
 
diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc
index 6c77648817f808..8da8b2055c6c2b 100644
--- a/tensorflow/compiler/jit/compilability_check_util.cc
+++ b/tensorflow/compiler/jit/compilability_check_util.cc
@@ -15,52 +15,40 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/compilability_check_util.h"
 
-#include <algorithm>
-#include <atomic>
-#include <deque>
+#include <cstddef>
 #include <iterator>
-#include <limits>
 #include <string>
-#include <unordered_map>
-#include <unordered_set>
 #include <utility>
+#include <vector>
 
 #include "absl/algorithm/container.h"
-#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/jit/defs.h"
-#include "tensorflow/compiler/jit/device_util.h"
-#include "tensorflow/compiler/jit/flags.h"
-#include "tensorflow/compiler/jit/resource_operation_safety_analysis.h"
 #include "tensorflow/compiler/jit/xla_activity.pb.h"
 #include "tensorflow/compiler/jit/xla_activity_listener.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
-#include "tensorflow/compiler/tf2xla/resource_operation_table.h"
-#include "tensorflow/compiler/tf2xla/tf2xla_util.h"
+#include "tensorflow/compiler/tf2xla/tf2xla_defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "xla/service/graphcycles/graphcycles.h"
-#include "xla/union_find.h"
-#include "xla/util.h"
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/common_runtime/graph_constructor.h"
+#include "xla/tsl/platform/errors.h"
+#include "tensorflow/core/common_runtime/function_body.h"
+#include "tensorflow/core/common_runtime/function_utils.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
-#include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/graph/algorithm.h"
-#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/public/version.h"
-#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -341,8 +329,8 @@ bool RecursiveCompilabilityChecker::IsCompilableCall(
     return false;
   }
 
-  auto release_handle_on_return = gtl::MakeCleanup(
-      [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
+  auto release_handle_on_return =
+      gtl::MakeCleanup([&] { CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
   const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
   bool is_compilable = true;
   for (const Node* node : fbody->graph->op_nodes()) {
diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc
index ea24176bb04a4a..185afab797ee1e 100644
--- a/tensorflow/compiler/jit/compilability_check_util_test.cc
+++ b/tensorflow/compiler/jit/compilability_check_util_test.cc
@@ -15,21 +15,32 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/compilability_check_util.h"
 
-#include "absl/memory/memory.h"
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/strings/match.h"
+#include "absl/types/span.h"
+#include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/functional_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "xla/tsl/lib/core/status_test_util.h"
 #include "tensorflow/core/common_runtime/graph_def_builder_util.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
-#include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 namespace {
@@ -260,7 +271,7 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalWhileNode) {
   GraphDef graph_def;
   TF_EXPECT_OK(builder.ToGraphDef(&graph_def));
   std::unique_ptr<Graph> graph(new Graph(flib_def_.get()));
-  TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
+  CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
 
   auto while_node_it = std::find_if(
       graph->nodes().begin(), graph->nodes().end(),
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 94b136a02b99cf..776ec3915e2f73 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -15,26 +15,52 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 
+#include <algorithm>
+#include <functional>
+#include <map>
 #include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
+#include <vector>
 
+#include "absl/log/check.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
 #include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
 #include "tensorflow/cc/framework/ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/cc/ops/state_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
 #include "tensorflow/compiler/jit/test_util.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
-#include "tensorflow/core/common_runtime/device_factory.h"
-#include "tensorflow/core/common_runtime/function.h"
+#include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/errors.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/graph_constructor.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/device.h"
+#include "tensorflow/core/framework/device_factory.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 #include "tensorflow/core/util/equal_graph_def.h"
@@ -499,7 +525,7 @@ absl::Status Encapsulate(
   // Create FunctionLibraryRuntime.
   SessionOptions session_options;
   std::vector<std::unique_ptr<Device>> devices;
-  TF_CHECK_OK(DeviceFactory::AddDevices(
+  CHECK_OK(DeviceFactory::AddDevices(
       session_options, "/job:localhost/replica:0/task:0", &devices));
   OptimizerOptions opts;
   auto device_mgr = std::make_unique<StaticDeviceMgr>(std::move(devices));
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 6d1661222e3eaf..4d2b71327b3250 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -15,12 +15,19 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 
+#include <vector>
+
+#include "absl/log/check.h"
+#include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/array_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -35,16 +42,16 @@ TEST(PerformStaticShapeInferenceBeforeEncapsulationTest, Basic) {
   Output add = ops::Add(s.WithOpName("add"), const_0, const_1);
   Output identity = ops::Identity(s.WithOpName("identity"), add);
   Graph g(OpRegistry::Global());
-  TF_CHECK_OK(s.ToGraph(&g));
+  CHECK_OK(s.ToGraph(&g));
 
-  TF_CHECK_OK(PerformStaticShapeInferenceBeforeEncapsulation(&g));
+  CHECK_OK(PerformStaticShapeInferenceBeforeEncapsulation(&g));
 
   // Check that "add" node now has _xla_inferred_shapes attr.
   auto node_index = g.BuildNodeNameIndex();
   Node *add_node = node_index["add"];
   std::vector<PartialTensorShape> output_shapes;
-  TF_CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName,
-                          &output_shapes));
+  CHECK_OK(GetNodeAttr(add_node->attrs(), kXlaInferredShapesAttrName,
+                       &output_shapes));
   EXPECT_EQ(output_shapes.size(), 1);
   TensorShapeProto shape_proto;
   output_shapes[0].AsProto(&shape_proto);
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
index acd5319cf8ed16..6b0570b704e2d7 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc
@@ -15,19 +15,31 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/encapsulate_xla_computations_pass.h"
 
+#include <initializer_list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
+#include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
-#include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
 #include "tensorflow/compiler/tf2xla/test_util.h"
+#include "xla/tsl/lib/core/status_test_util.h"
 #include "tensorflow/core/common_runtime/graph_constructor.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-#include "tensorflow/core/lib/hash/hash.h"
-#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
@@ -47,23 +59,23 @@ static std::unique_ptr<Graph> MakeOuterGraph(
   auto w = ops::Placeholder(scope.WithOpName("W"), DT_RESOURCE);
 
   NodeDef def;
-  TF_CHECK_OK(NodeDefBuilder("launch0", function, &flib_def)
-                  .Input(a.node()->name(), 0, DT_INT32)
-                  .Input(b.node()->name(), 0, DT_FLOAT)
-                  .Input(c.node()->name(), 0, DT_INT32)
-                  .Input(d.node()->name(), 0, DT_FLOAT)
-                  .Input(u.node()->name(), 0, DT_RESOURCE)
-                  .Input(v.node()->name(), 0, DT_RESOURCE)
-                  .Input(w.node()->name(), 0, DT_RESOURCE)
-                  .Device("/gpu:0")
-                  .Attr(kXlaClusterIdAttr, "launch0")
-                  .Attr("_variable_start_index", 4)
-                  .Finalize(&def));
+  CHECK_OK(NodeDefBuilder("launch0", function, &flib_def)
+               .Input(a.node()->name(), 0, DT_INT32)
+               .Input(b.node()->name(), 0, DT_FLOAT)
+               .Input(c.node()->name(), 0, DT_INT32)
+               .Input(d.node()->name(), 0, DT_FLOAT)
+               .Input(u.node()->name(), 0, DT_RESOURCE)
+               .Input(v.node()->name(), 0, DT_RESOURCE)
+               .Input(w.node()->name(), 0, DT_RESOURCE)
+               .Device("/gpu:0")
+               .Attr(kXlaClusterIdAttr, "launch0")
+               .Attr("_variable_start_index", 4)
+               .Finalize(&def));
 
   absl::Status status;
   Node* launch = scope.graph()->AddNode(def, &status);
-  TF_CHECK_OK(status);
-  TF_CHECK_OK(scope.DoShapeInference(launch));
+  CHECK_OK(status);
+  CHECK_OK(scope.DoShapeInference(launch));
   scope.graph()->AddEdge(a.node(), 0, launch, 0);
   scope.graph()->AddEdge(b.node(), 0, launch, 1);
   scope.graph()->AddEdge(c.node(), 0, launch, 2);
@@ -89,7 +101,7 @@ static std::unique_ptr<Graph> MakeOuterGraph(
   auto consumer3 = ops::Identity(scope.WithOpName("consumer3"), out3);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(scope.ToGraph(graph.get()));
+  CHECK_OK(scope.ToGraph(graph.get()));
   return graph;
 }
 
@@ -135,7 +147,7 @@ static std::unique_ptr<Graph> MakeBodyGraph() {
       ops::_Retval(scope.WithOpName("readu_0_retval_RetVal"), read_u, 3);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(scope.ToGraph(graph.get()));
+  CHECK_OK(scope.ToGraph(graph.get()));
   return graph;
 }
 
@@ -160,7 +172,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) {
       };
       add_attrs(e.node());
 
-      TF_CHECK_OK(scope.ToGraph(graph.get()));
+      CHECK_OK(scope.ToGraph(graph.get()));
       auto get_node_in_graph = [&graph](Node* node) {
         return graph->FindNodeId(node->id());
       };
@@ -178,7 +190,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) {
                               get_node_in_graph(e.node()), true);
       }
     }
-    TF_CHECK_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def));
+    CHECK_OK(EncapsulateXlaComputationsPass::Encapsulate(&graph, &flib_def));
     return SerializeGraphDeterministic(*graph).value();
   };
 
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index 1a6441a80726a0..aa6ad2e4eeed8c 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -15,22 +15,39 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/extract_outside_compilation_pass.h"
 
+#include <initializer_list>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
+#include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/functional_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "xla/hlo/testlib/test.h"
-#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/device_factory.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
@@ -50,7 +67,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   auto ret0 = ops::_Retval(s.WithOpName("ret0"), add, 0);
   auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg1, 1);
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
   auto node_name_image = g->BuildNodeNameIndex();
   Node *add_node = node_name_image["add"];
   EXPECT_NE(add_node, nullptr);
@@ -61,7 +78,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   std::vector<OutputTensor> arg_source_tensors;
   NodeDef call_node_def;
   call_node_def.set_op("0");
-  TF_CHECK_OK(
+  CHECK_OK(
       rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
   node_name_image = g->BuildNodeNameIndex();
 
@@ -75,7 +92,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   Node *recv_at_host = node_name_image["outside_compilation_cluster__0_recv"];
   EXPECT_NE(recv_at_host, nullptr);
   std::vector<DataType> recv_at_host_dtypes;
-  TF_CHECK_OK(
+  CHECK_OK(
       GetNodeAttr(recv_at_host->attrs(), "Toutputs", &recv_at_host_dtypes));
   EXPECT_EQ(recv_at_host_dtypes.size(), 3);
   EXPECT_EQ(recv_at_host_dtypes[0], DT_INT32);
@@ -88,7 +105,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   Node *send_from_host = node_name_image["outside_compilation_cluster__0_send"];
   EXPECT_NE(send_from_host, nullptr);
   std::vector<DataType> send_from_host_dtypes;
-  TF_CHECK_OK(
+  CHECK_OK(
       GetNodeAttr(send_from_host->attrs(), "Tinputs", &send_from_host_dtypes));
   EXPECT_EQ(send_from_host_dtypes.size(), 2);
   EXPECT_EQ(send_from_host_dtypes[0], DT_INT32);
@@ -115,8 +132,8 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, Basic) {
   EXPECT_TRUE(has_control_edge_to_send_from_host);
   // Verify step 7: necessary attrs added to call_node_def.
   NameAttrList shape_inference_graph;
-  TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()),
-                          "shape_inference_graph", &shape_inference_graph));
+  CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()),
+                       "shape_inference_graph", &shape_inference_graph));
   EXPECT_EQ(shape_inference_graph.name(),
             "_outside_compilation_shape_inference_cluster__0");
 }
@@ -126,13 +143,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, NoSendFromHost) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output arg0 = ops::_Arg(s.WithOpName("arg0"), DT_INT32, 0);
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
 
   RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster", "");
   std::vector<OutputTensor> arg_source_tensors;
   NodeDef call_node_def;
   call_node_def.set_op("0");
-  TF_CHECK_OK(
+  CHECK_OK(
       rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
   auto node_name_image = g->BuildNodeNameIndex();
 
@@ -152,13 +169,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, NoRecvAtHost) {
   Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
   auto ret = ops::_Retval(s.WithOpName("ret"), const0, 0);
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
 
   RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster", "");
   std::vector<OutputTensor> arg_source_tensors;
   NodeDef call_node_def;
   call_node_def.set_op("0");
-  TF_CHECK_OK(
+  CHECK_OK(
       rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
   auto node_name_image = g->BuildNodeNameIndex();
 
@@ -176,13 +193,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, NoKeyPlaceholder) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
 
   RewriteOutsideCompilationSubgraphFn rewrite_fn("_xla", "_oc", "cluster", "");
   std::vector<OutputTensor> arg_source_tensors;
   NodeDef call_node_def;
   call_node_def.set_op("0");
-  TF_CHECK_OK(
+  CHECK_OK(
       rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
   auto node_name_image = g->BuildNodeNameIndex();
 
@@ -202,7 +219,7 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) {
   Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
   auto ret = ops::_Retval(s.WithOpName("ret"), const0, 0);
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
   auto node_name_image = g->BuildNodeNameIndex();
   Node *const0_node = node_name_image["const0"];
   EXPECT_NE(const0_node, nullptr);
@@ -214,13 +231,13 @@ TEST(RewriteOutsideCompilationSubgraphFnTest, ShapesInferred) {
   std::vector<OutputTensor> arg_source_tensors;
   NodeDef call_node_def;
   call_node_def.set_op("0");
-  TF_CHECK_OK(
+  CHECK_OK(
       rewrite_fn(arg_source_tensors, &g, nullptr, nullptr, &call_node_def));
   node_name_image = g->BuildNodeNameIndex();
 
   // Check "shape" attr is available in call_node_def.
   std::vector<TensorShapeProto> shapes;
-  TF_CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), "shapes", &shapes));
+  CHECK_OK(GetNodeAttr(AttrSlice(&call_node_def.attr()), "shapes", &shapes));
   EXPECT_EQ(shapes.size(), 1);
   EXPECT_EQ(shapes[0].dim_size(), 1);
 }
@@ -230,7 +247,7 @@ class ExtractOutsideCompilationForFunctionTest : public ::testing::Test {
   void SetUp() override {
     SessionOptions session_options;
     std::vector<std::unique_ptr<Device>> devices;
-    TF_CHECK_OK(DeviceFactory::AddDevices(
+    CHECK_OK(DeviceFactory::AddDevices(
         session_options, "/job:localhost/replica:0/task:0", &devices));
     device_mgr_ = std::make_unique<StaticDeviceMgr>(std::move(devices));
   }
@@ -275,7 +292,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
     Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
     Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     auto node_name_image = g->BuildNodeNameIndex();
     node_name_image["identity0"]->AddAttr("_oc", "0");
     node_name_image["identity1"]->AddAttr("_oc", "1");
@@ -284,7 +301,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -295,15 +312,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Get rewritten XLA computation function.
   std::unique_ptr<FunctionBody> xla_fbody;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld, &xla_fbody));
+  CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), AttrSlice(),
+                                   &fld, &xla_fbody));
   auto node_name_index = xla_fbody->graph->BuildNodeNameIndex();
 
   // Check XlaHostCompute nodes.
@@ -313,26 +330,26 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   EXPECT_NE(host_compute_1, nullptr);
   // Check XlaHostCompute nodes' "tpu_core" attr.
   int tpu_core;
-  TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "tpu_core", &tpu_core));
+  CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "tpu_core", &tpu_core));
   EXPECT_EQ(tpu_core, 1);
-  TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "tpu_core", &tpu_core));
+  CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "tpu_core", &tpu_core));
   EXPECT_EQ(tpu_core, 0);
   // Check XlaHostCompute nodes' "shapes" attr. "0" should not have shapes, and
   // "1" should have shapes.
   std::vector<TensorShapeProto> shapes;
-  TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shapes", &shapes));
+  CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shapes", &shapes));
   EXPECT_EQ(shapes.size(), 0);
-  TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes));
+  CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes));
   EXPECT_EQ(shapes.size(), 1);
   EXPECT_EQ(shapes[0].dim_size(), 1);
   // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have
   // empty values.
   NameAttrList shape_inference_graph;
-  TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
-                          &shape_inference_graph));
+  CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
+                       &shape_inference_graph));
   EXPECT_EQ(shape_inference_graph.name(), "");
-  TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
-                          &shape_inference_graph));
+  CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
+                       &shape_inference_graph));
   EXPECT_EQ(shape_inference_graph.name(), "");
 
   // Check `shape_inference_graphs`.
@@ -344,7 +361,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   device_ordinal_temp_value.set_i(0);
   protobuf::Map<std::string, AttrValue> host_func_attrs;
   host_func_attrs["_device_ordinal"] = device_ordinal_temp_value;
-  TF_CHECK_OK(FunctionDefToBodyHelper(
+  CHECK_OK(FunctionDefToBodyHelper(
       *fld.Find("host_graph"), AttrSlice(&host_func_attrs), &fld, &host_fbody));
   Graph *host_graph = host_fbody->graph;
   Node *key_placeholder = nullptr, *sequencer = nullptr;
@@ -377,7 +394,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, Basic) {
   EXPECT_EQ(num_recv_at_host, 1);
   for (Node *n : send_recv_nodes) {
     Node *input_node;
-    TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node));
+    CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node));
     EXPECT_EQ(input_node, key_placeholder);
 
     bool has_control_edge_to_sequencer = false;
@@ -399,10 +416,10 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
     Output const0 = ops::Const(s.WithOpName("const0"), 1, {2});
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -413,7 +430,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -435,7 +452,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
     Output identity = ops::Identity(s.WithOpName("identity_true_fn"), arg);
     ops::_Retval retval(s.WithOpName("retval"), identity, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     auto node_name_image = g->BuildNodeNameIndex();
     node_name_image["identity_true_fn"]->AddAttr("_oc", "0");
     PartialTensorShape shape({2});
@@ -443,7 +460,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *true_fn_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "true_fn", true_fn_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "true_fn", true_fn_fdef));
   }
   {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -451,7 +468,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
     Output identity = ops::Identity(s.WithOpName("identity_false_fn"), arg);
     ops::_Retval retval(s.WithOpName("retval"), identity, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     auto node_name_image = g->BuildNodeNameIndex();
     node_name_image["identity_false_fn"]->AddAttr("_oc", "0");
     PartialTensorShape shape({2});
@@ -459,7 +476,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *false_fn_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "false_fn", false_fn_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "false_fn", false_fn_fdef));
   }
   {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -474,10 +491,10 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
                          true_fn, false_fn);
     ops::_Retval retval(s.WithOpName("retval"), if_op.output[0], 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -488,7 +505,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -500,9 +517,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
     device_ordinal_temp_value.set_i(0);
     protobuf::Map<std::string, AttrValue> host_func_attrs;
     host_func_attrs["_device_ordinal"] = device_ordinal_temp_value;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"),
-                                        AttrSlice(&host_func_attrs), &fld,
-                                        &host_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"),
+                                     AttrSlice(&host_func_attrs), &fld,
+                                     &host_fbody));
     Graph *host_graph = host_fbody->graph;
     auto node_name_index = host_graph->BuildNodeNameIndex();
 
@@ -515,7 +532,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
     Node *if_oc_node = node_name_index["oc_if_if"];
     EXPECT_NE(if_oc_node, nullptr);
     Node *if_oc_node_cond_input;
-    TF_CHECK_OK(if_oc_node->input_node(0, &if_oc_node_cond_input));
+    CHECK_OK(if_oc_node->input_node(0, &if_oc_node_cond_input));
     EXPECT_EQ(if_oc_node_cond_input, recv_if_pred_node);
 
     // Check that then_branch outside compilation has node "identity_true_fn".
@@ -546,8 +563,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   // Check XLA graph.
   {
     std::unique_ptr<FunctionBody> xla_fbody;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                        AttrSlice(), &fld, &xla_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
+                                     AttrSlice(), &fld, &xla_fbody));
     Graph *xla_graph = xla_fbody->graph;
     auto node_name_index = xla_graph->BuildNodeNameIndex();
 
@@ -569,7 +586,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
     Node *if_node = node_name_index["if"];
     EXPECT_NE(if_node, nullptr);
     std::vector<std::string> token_inputs;
-    TF_CHECK_OK(
+    CHECK_OK(
         GetNodeAttr(if_node->def(), "_xla_token_input_nodes", &token_inputs));
     EXPECT_THAT(token_inputs, ::testing::ElementsAre("send_oc_if_pred_if"));
   }
@@ -586,7 +603,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
     Output identity = ops::Identity(s.WithOpName("identity_cond_fn"), arg);
     ops::_Retval retval(s.WithOpName("retval"), identity, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     auto node_name_image = g->BuildNodeNameIndex();
     node_name_image["identity_cond_fn"]->AddAttr("_oc", "0");
     PartialTensorShape shape({2});
@@ -594,7 +611,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *cond_fn_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cond_fn", cond_fn_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cond_fn", cond_fn_fdef));
   }
   {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -602,7 +619,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
     Output identity = ops::Identity(s.WithOpName("identity_body_fn"), arg);
     ops::_Retval retval(s.WithOpName("retval"), identity, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     auto node_name_image = g->BuildNodeNameIndex();
     node_name_image["identity_body_fn"]->AddAttr("_oc", "0");
     PartialTensorShape shape({2});
@@ -610,7 +627,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *body_fn_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "body_fn", body_fn_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "body_fn", body_fn_fdef));
   }
   {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -624,10 +641,10 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
                    cond_fn, body_fn);
     ops::_Retval retval(s.WithOpName("retval"), while_op.output[0], 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -638,7 +655,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -650,9 +667,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInWhile) {
     device_ordinal_temp_value.set_i(0);
     protobuf::Map<std::string, AttrValue> host_func_attrs;
     host_func_attrs["_device_ordinal"] = device_ordinal_temp_value;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"),
-                                        AttrSlice(&host_func_attrs), &fld,
-                                        &host_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"),
+                                     AttrSlice(&host_func_attrs), &fld,
+                                     &host_fbody));
     Graph *host_graph = host_fbody->graph;
     auto node_name_index = host_graph->BuildNodeNameIndex();
 
@@ -713,7 +730,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
     Output identity = ops::Identity(s.WithOpName("identity"), arg);
     ops::_Retval retval(s.WithOpName("retval"), identity, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     auto node_name_image = g->BuildNodeNameIndex();
     node_name_image["identity"]->AddAttr("_oc", "0");
     PartialTensorShape shape({2});
@@ -721,7 +738,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *true_fn_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "fn", true_fn_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
   {
@@ -736,35 +753,35 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
       tensor_proto.add_int_val(1);
     }
     NodeDef const_def;
-    TF_CHECK_OK(NodeDefBuilder("const", "Const")
-                    .Attr("dtype", DT_INT32)
-                    .Attr("value", tensor_proto)
-                    .Finalize(&const_def));
+    CHECK_OK(NodeDefBuilder("const", "Const")
+                 .Attr("dtype", DT_INT32)
+                 .Attr("value", tensor_proto)
+                 .Finalize(&const_def));
     absl::Status s;
     Node *const_node = g->AddNode(const_def, &s);
-    TF_CHECK_OK(s);
+    CHECK_OK(s);
 
     NodeDef fn_def;
-    TF_CHECK_OK(NodeDefBuilder("fn", "fn", &fld)
-                    .Input("const", 0, DT_INT32)
-                    .Finalize(&fn_def));
+    CHECK_OK(NodeDefBuilder("fn", "fn", &fld)
+                 .Input("const", 0, DT_INT32)
+                 .Finalize(&fn_def));
     Node *fn_node = g->AddNode(fn_def, &s);
-    TF_CHECK_OK(s);
+    CHECK_OK(s);
     g->AddEdge(const_node, 0, fn_node, 0);
 
     NodeDef ret_def;
-    TF_CHECK_OK(NodeDefBuilder("ret", "_Retval")
-                    .Attr("index", 0)
-                    .Attr("T", DT_INT32)
-                    .Input("fn", 0, DT_INT32)
-                    .Finalize(&ret_def));
+    CHECK_OK(NodeDefBuilder("ret", "_Retval")
+                 .Attr("index", 0)
+                 .Attr("T", DT_INT32)
+                 .Input("fn", 0, DT_INT32)
+                 .Finalize(&ret_def));
     Node *ret_node = g->AddNode(ret_def, &s);
-    TF_CHECK_OK(s);
+    CHECK_OK(s);
     g->AddEdge(fn_node, 0, ret_node, 0);
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
-    TF_CHECK_OK(fld.AddFunctionDef(*xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(fld.AddFunctionDef(*xla_fdef));
   }
 
   protobuf::Map<std::string, tensorflow::AttrValue> attrs;
@@ -774,7 +791,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
@@ -786,9 +803,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
     device_ordinal_temp_value.set_i(0);
     protobuf::Map<std::string, AttrValue> host_func_attrs;
     host_func_attrs["_device_ordinal"] = device_ordinal_temp_value;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"),
-                                        AttrSlice(&host_func_attrs), &fld,
-                                        &host_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("host_graph"),
+                                     AttrSlice(&host_func_attrs), &fld,
+                                     &host_fbody));
     Graph *host_graph = host_fbody->graph;
     auto node_name_index = host_graph->BuildNodeNameIndex();
 
@@ -797,9 +814,9 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
     EXPECT_NE(call_node, nullptr);
 
     std::unique_ptr<FunctionBody> call_fbody;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("oc_func_call_host_fn"),
-                                        AttrSlice(&host_func_attrs), &fld,
-                                        &call_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("oc_func_call_host_fn"),
+                                     AttrSlice(&host_func_attrs), &fld,
+                                     &call_fbody));
 
     // Verify we have _XlaRecvAtHost and _XlaSendFromHost nodes.
     bool has_recv = false, has_send = false;
@@ -817,8 +834,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
   // Check XLA graph.
   {
     std::unique_ptr<FunctionBody> xla_fbody;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                        AttrSlice(), &fld, &xla_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
+                                     AttrSlice(), &fld, &xla_fbody));
     Graph *xla_graph = xla_fbody->graph;
     auto node_name_index = xla_graph->BuildNodeNameIndex();
 
@@ -828,8 +845,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInFunction) {
     EXPECT_EQ(fn_node->type_string(), "fn_oc");
 
     std::unique_ptr<FunctionBody> call_fbody;
-    TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("fn_oc"), AttrSlice(), &fld,
-                                        &call_fbody));
+    CHECK_OK(FunctionDefToBodyHelper(*fld.Find("fn_oc"), AttrSlice(), &fld,
+                                     &call_fbody));
 
     // Verify we have XlaHostCompute nodes.
     bool has_hc = false;
@@ -857,7 +874,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
     Output identity1 = ops::Identity(s.WithOpName("identity1"), identity0);
     Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     std::cout << "Graph is " << (*g).ToGraphDefDebug().DebugString()
               << std::endl;
     auto node_name_image = g->BuildNodeNameIndex();
@@ -869,7 +886,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -880,15 +897,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Get rewritten XLA computation function.
   std::unique_ptr<FunctionBody> xla_fbody;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld, &xla_fbody));
+  CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), AttrSlice(),
+                                   &fld, &xla_fbody));
   auto node_name_index = xla_fbody->graph->BuildNodeNameIndex();
 
   // Check XlaHostCompute nodes.
@@ -899,8 +916,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
 
   // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr.
   std::vector<std::string> token_input_nodes;
-  TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()),
-                          "_xla_token_input_nodes", &token_input_nodes));
+  CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()),
+                       "_xla_token_input_nodes", &token_input_nodes));
 
   std::vector<std::string> expected_token_input_nodes_0(
       {"_xla_token_arg_node"});
@@ -908,8 +925,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
   token_input_nodes.clear();
   std::vector<std::string> expected_token_input_nodes_1(
       {"_xla_token_arg_node", "outside_compilation_0_host_compute"});
-  TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()),
-                          "_xla_token_input_nodes", &token_input_nodes));
+  CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()),
+                       "_xla_token_input_nodes", &token_input_nodes));
   EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1);
 
   // Check there is a control edge from host_compute_0 to host_compute_1.
@@ -940,7 +957,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
         s.WithOpName("identity1").WithControlDependencies(identity0), const0);
     Output identity2 = ops::Identity(s.WithOpName("identity2"), identity1);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     std::cout << "Graph is " << (*g).ToGraphDefDebug().DebugString()
               << std::endl;
     auto node_name_image = g->BuildNodeNameIndex();
@@ -952,7 +969,7 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
         kXlaInferredShapesAttrName, std::vector<PartialTensorShape>{shape});
 
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "cluster", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -963,15 +980,15 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
   NameAttrList name_attrs;
   name_attrs.set_name("cluster");
   *name_attrs.mutable_attr() = attrs;
-  TF_CHECK_OK(ExtractOutsideCompilationTest(
+  CHECK_OK(ExtractOutsideCompilationTest(
       "_xla", "_oc", "cluster", name_attrs, "cluster_rewritten", "host_graph",
       host_compute_core, &fld, &shape_inference_graphs,
       &has_outside_compilation));
 
   // Get rewritten XLA computation function.
   std::unique_ptr<FunctionBody> xla_fbody;
-  TF_CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"),
-                                      AttrSlice(), &fld, &xla_fbody));
+  CHECK_OK(FunctionDefToBodyHelper(*fld.Find("cluster_rewritten"), AttrSlice(),
+                                   &fld, &xla_fbody));
   auto node_name_index = xla_fbody->graph->BuildNodeNameIndex();
 
   // Check XlaHostCompute nodes.
@@ -982,8 +999,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
 
   // Check XlaHostCompute nodes' "_xla_token_input_nodes" attr.
   std::vector<std::string> token_input_nodes;
-  TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()),
-                          "_xla_token_input_nodes", &token_input_nodes));
+  CHECK_OK(GetNodeAttr(AttrSlice(host_compute_0->attrs()),
+                       "_xla_token_input_nodes", &token_input_nodes));
 
   std::vector<std::string> expected_token_input_nodes_0(
       {"_xla_token_arg_node"});
@@ -991,8 +1008,8 @@ TEST_F(ExtractOutsideCompilationForFunctionTest,
   token_input_nodes.clear();
   std::vector<std::string> expected_token_input_nodes_1(
       {"_xla_token_arg_node", "outside_compilation_0_host_compute"});
-  TF_CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()),
-                          "_xla_token_input_nodes", &token_input_nodes));
+  CHECK_OK(GetNodeAttr(AttrSlice(host_compute_1->attrs()),
+                       "_xla_token_input_nodes", &token_input_nodes));
   EXPECT_EQ(token_input_nodes, expected_token_input_nodes_1);
 
   // Check there is a control edge from host_compute_0 to host_compute_1.
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 1d4031a4ffc926..89d5ea8863151b 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/jit/mark_for_compilation_pass.h"
 
 #include <algorithm>
+#include <cstdint>
+#include <initializer_list>
 #include <memory>
 #include <set>
 #include <string>
@@ -24,33 +26,50 @@ limitations under the License.
 #include <vector>
 
 #include "absl/container/flat_hash_map.h"
-#include "absl/memory/memory.h"
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
 #include "absl/strings/match.h"
-#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "absl/strings/str_split.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "tensorflow/cc/framework/ops.h"
+#include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/control_flow_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
+#include "tensorflow/cc/ops/data_flow_ops.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/functional_ops.h"
 #include "tensorflow/cc/ops/list_ops.h"
+#include "tensorflow/cc/ops/logging_ops.h"
+#include "tensorflow/cc/ops/math_ops.h"
+#include "tensorflow/cc/ops/no_op.h"
+#include "tensorflow/cc/ops/random_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/ops/state_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.h"
 #include "tensorflow/compiler/jit/node_matchers.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
-#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/core/common_runtime/graph_constructor.h"
+#include "xla/tsl/lib/core/status_test_util.h"
 #include "tensorflow/core/common_runtime/graph_def_builder_util.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/graph/algorithm.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -503,7 +522,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopesGlobalJitOverridden) {
     ops::BinaryOp(
         "MatMul", a, b,
         builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC"));
-    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
+    CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   FunctionDefLibrary flib;
@@ -536,7 +555,7 @@ TEST(XlaCompilationTest, CyclesWithAllDifferentScopes) {
     ops::BinaryOp(
         "MatMul", a, b,
         builder.opts().WithName("C").WithAttr(kXlaScopeAttr, "ScopeC"));
-    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
+    CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(
@@ -574,7 +593,7 @@ TEST(XlaCompilationTest, CyclesWithSplittingScopes) {
                       .WithName("D")
                       .WithAttr(kXlaCompileAttr, true)
                       .WithAttr(kXlaScopeAttr, "Scope2"));
-    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
+    CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(
@@ -607,7 +626,7 @@ TEST(XlaCompilationTest, CyclesWithDifferentScopesAndBridge) {
                                .WithAttr(kXlaCompileAttr, true)
                                .WithAttr(kXlaScopeAttr, "ScopeB"));
     ops::BinaryOp("MatMul", a, b, builder.opts().WithName("C"));
-    TF_CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
+    CHECK_OK(GraphDefBuilderToGraph(builder, graph.get()));
   }
 
   TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(
@@ -797,11 +816,11 @@ TEST(XlaCompilationTest, IllegalCycle_UsefulErrorMessage) {
     auto BuildNoopNode = [](absl::string_view name, Graph* graph) {
       NodeDefBuilder builder(name, "NoOp");
       NodeDef def;
-      TF_CHECK_OK(builder.Finalize(&def));
+      CHECK_OK(builder.Finalize(&def));
 
       absl::Status status;
       Node* node = graph->AddNode(def, &status);
-      TF_CHECK_OK(status);
+      CHECK_OK(status);
       return node;
     };
 
diff --git a/tensorflow/compiler/jit/node_matchers.cc b/tensorflow/compiler/jit/node_matchers.cc
index db158fc84a0173..93c07d5539ccc2 100644
--- a/tensorflow/compiler/jit/node_matchers.cc
+++ b/tensorflow/compiler/jit/node_matchers.cc
@@ -15,16 +15,31 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/node_matchers.h"
 
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <optional>
+#include <ostream>
+#include <sstream>
+#include <string>
 #include <utility>
+#include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
-#include "absl/strings/str_replace.h"
 #include "absl/strings/str_split.h"
+#include "absl/strings/string_view.h"
+#include "absl/types/span.h"
+#include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_node_util.h"
 
 namespace tensorflow {
@@ -515,7 +530,7 @@ impl::NodeMatcherProperties impl::Attr(std::string name) {
 
 NodeMatcherProperties ConstantValue(
     const ::tensorflow::Input::Initializer& val) {
-  TF_CHECK_OK(val.status);
+  CHECK_OK(val.status);
   NodeMatcherProperties props;
   props.set_constant_value(val.tensor);
   return props;
diff --git a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc
index a833e9827c028a..6f3450f67e0e38 100644
--- a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc
+++ b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc
@@ -13,25 +13,28 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "absl/strings/match.h"
+#include <initializer_list>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
+#include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
-#include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/functional_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
-#include "tensorflow/compiler/jit/encapsulate_util.h"
 #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h"
-#include "tensorflow/core/common_runtime/device_factory.h"
+#include "xla/tsl/platform/errors.h"
 #include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
 #include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
-#include "tensorflow/core/public/session_options.h"
-#include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
 
@@ -49,9 +52,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) {
     auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0);
     auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg0, 1);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef));
   }
   {
     // Function for While's "body".
@@ -64,9 +67,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) {
     auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg0, 0);
     auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg1, 1);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef));
   }
   {
     // Function for While's "cond".
@@ -77,9 +80,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) {
     Output arg1 = ops::_Arg(s.WithOpName("arg1"), DT_BOOL, 1);
     auto ret0 = ops::_Retval(s.WithOpName("ret0"), arg1, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "f3", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "f3", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -106,11 +109,11 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) {
   auto ret2 = ops::_Retval(s.WithOpName("ret2"), while_op.output[0], 2);
   auto ret3 = ops::_Retval(s.WithOpName("ret3"), while_op.output[1], 3);
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
 
   std::vector<std::unique_ptr<FunctionBody>> fbodies;
-  TF_CHECK_OK(RearrangeFunctionArguments(
-      [&](const NameAttrList &function, const FunctionBody **fbody) {
+  CHECK_OK(RearrangeFunctionArguments(
+      [&](const NameAttrList& function, const FunctionBody** fbody) {
         std::unique_ptr<FunctionBody> new_fbody;
         TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fld.Find(function.name()),
                                                    AttrSlice(&function.attr()),
@@ -136,33 +139,33 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) {
   const Node *if_node = node_name_index.at("if");
   ASSERT_NE(if_node, nullptr);
   const Node *input_node;
-  TF_CHECK_OK(if_node->input_node(1, &input_node));
+  CHECK_OK(if_node->input_node(1, &input_node));
   EXPECT_EQ(input_node->name(), "arg1");
-  TF_CHECK_OK(if_node->input_node(2, &input_node));
+  CHECK_OK(if_node->input_node(2, &input_node));
   EXPECT_EQ(input_node->name(), "arg0");
   const Node *ret0_node = node_name_index.at("ret0");
   ASSERT_NE(ret0_node, nullptr);
-  TF_CHECK_OK(ret0_node->input_node(0, &input_node));
+  CHECK_OK(ret0_node->input_node(0, &input_node));
   EXPECT_EQ(input_node->name(), "if");
   const Node *ret1_node = node_name_index.at("ret1");
   ASSERT_NE(ret1_node, nullptr);
-  TF_CHECK_OK(ret1_node->input_node(0, &input_node));
+  CHECK_OK(ret1_node->input_node(0, &input_node));
   EXPECT_EQ(input_node->name(), "arg0");
 
   // Check node "while" input and output edges.
   const Node *while_node = node_name_index.at("while");
   ASSERT_NE(while_node, nullptr);
-  TF_CHECK_OK(while_node->input_node(0, &input_node));
+  CHECK_OK(while_node->input_node(0, &input_node));
   EXPECT_EQ(input_node->name(), "arg1");
-  TF_CHECK_OK(while_node->input_node(1, &input_node));
+  CHECK_OK(while_node->input_node(1, &input_node));
   EXPECT_EQ(input_node->name(), "arg0");
   const Node *ret2_node = node_name_index.at("ret2");
   ASSERT_NE(ret2_node, nullptr);
-  TF_CHECK_OK(ret2_node->input_node(0, &input_node));
+  CHECK_OK(ret2_node->input_node(0, &input_node));
   EXPECT_EQ(input_node->name(), "arg0");
   const Node *ret3_node = node_name_index.at("ret3");
   ASSERT_NE(ret3_node, nullptr);
-  TF_CHECK_OK(ret3_node->input_node(0, &input_node));
+  CHECK_OK(ret3_node->input_node(0, &input_node));
   EXPECT_EQ(input_node->name(), "while");
 }
 
@@ -182,9 +185,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest,
     auto ret1 = ops::_Retval(s.WithOpName("ret1"), arg0, 1);
     auto ret2 = ops::_Retval(s.WithOpName("ret2"), arg2, 2);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "f2", xla_fdef));
   }
   {
     // Function for While's "cond".
@@ -197,9 +200,9 @@ TEST(RearrangeFunctionArgumentForFunctionTest,
     Output cond = ops::Const(s.WithOpName("const"), true, TensorShape({}));
     auto ret0 = ops::_Retval(s.WithOpName("ret0"), cond, 0);
     std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-    TF_CHECK_OK(s.ToGraph(g.get()));
+    CHECK_OK(s.ToGraph(g.get()));
     FunctionDef *xla_fdef = fdl.add_function();
-    TF_CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef));
+    CHECK_OK(GraphToFunctionDef(*g, "f1", xla_fdef));
   }
   FunctionLibraryDefinition fld(OpRegistry::Global(), fdl);
 
@@ -217,7 +220,7 @@ TEST(RearrangeFunctionArgumentForFunctionTest,
                              std::initializer_list<Input>{arg0, arg1, arg2},
                              cond_fn, body_fn);
   std::unique_ptr<Graph> g(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(s.ToGraph(g.get()));
+  CHECK_OK(s.ToGraph(g.get()));
 
   std::vector<std::unique_ptr<FunctionBody>> fbodies;
   absl::Status status = RearrangeFunctionArguments(
diff --git a/tensorflow/compiler/jit/shape_inference_test.cc b/tensorflow/compiler/jit/shape_inference_test.cc
index 599d442de4b092..807505672357cb 100644
--- a/tensorflow/compiler/jit/shape_inference_test.cc
+++ b/tensorflow/compiler/jit/shape_inference_test.cc
@@ -17,27 +17,28 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/shape_inference.h"
 
+#include <cstdint>
+#include <initializer_list>
 #include <map>
 #include <memory>
+#include <string>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/array_ops.h"
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/control_flow_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
-#include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/test_util.h"
 #include "xla/tsl/lib/core/status_test_util.h"
 #include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/graph/graph.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
-#include "tensorflow/core/platform/types.h"
-#include "tsl/platform/status.h"
 
 namespace tensorflow {
 namespace {
@@ -55,7 +56,7 @@ TEST(ShapeInferenceTest, Basics) {
   auto g = ops::AddN(root.WithOpName("G"), std::initializer_list<Output>{e, f});
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(root.ToGraph(graph.get()));
+  CHECK_OK(root.ToGraph(graph.get()));
 
   GraphShapeInfo shape_info;
   TF_ASSERT_OK(InferShapes(graph.get(), /*arg_shapes=*/{},
@@ -84,7 +85,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSize) {
   b.node()->AddAttr("_index", 1);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(root.ToGraph(graph.get()));
+  CHECK_OK(root.ToGraph(graph.get()));
 
   std::map<int, InferredShape> arg_shapes;
   arg_shapes[0].shape = TensorShape({2, 3});
@@ -118,7 +119,7 @@ TEST(ShapeInferenceTest, UseArgShapesForVariableBatchSizeIncompleteUserArgs) {
   b.node()->AddAttr("_index", 0);
 
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-  TF_CHECK_OK(root.ToGraph(graph.get()));
+  CHECK_OK(root.ToGraph(graph.get()));
 
   std::map<int, InferredShape> arg_shapes;
   arg_shapes[0].shape = TensorShape({2, 3});
diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc
index 30a9ab51faf105..b72fd6e7aaa6eb 100644
--- a/tensorflow/compiler/jit/test_util.cc
+++ b/tensorflow/compiler/jit/test_util.cc
@@ -15,14 +15,28 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/test_util.h"
 
+#include <map>
 #include <memory>
 #include <optional>
 #include <string>
 #include <utility>
+#include <vector>
 
+#include "absl/log/check.h"
+#include "absl/status/status.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/compiler/jit/shape_inference.h"
 #include "xla/status_macros.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/device_factory.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/public/version.h"
 
 namespace tensorflow {
@@ -71,15 +85,15 @@ void DeviceSetup::AddDevicesAndSetUp(
   }
 
   std::vector<std::unique_ptr<Device>> devices;
-  TF_CHECK_OK(DeviceFactory::AddDevices(
-      options, "/job:localhost/replica:0/task:0", &devices));
+  CHECK_OK(DeviceFactory::AddDevices(options, "/job:localhost/replica:0/task:0",
+                                     &devices));
   device_mgr_ = std::make_unique<StaticDeviceMgr>(std::move(devices));
 
   OptimizerOptions opts;
   lib_def_ = std::make_unique<FunctionLibraryDefinition>(OpRegistry::Global(),
                                                          FunctionDefLibrary());
   if (fdef.has_value()) {
-    TF_CHECK_OK(lib_def_->AddFunctionDef(*fdef));
+    CHECK_OK(lib_def_->AddFunctionDef(*fdef));
   }
   pflr_ = std::make_unique<ProcessFunctionLibraryRuntime>(
       device_mgr_.get(), Env::Default(), /*config=*/nullptr,
@@ -96,7 +110,7 @@ Device* DeviceSetup::GetDevice(const std::string& device_name) {
   std::string full_device_name = absl::StrCat(
       "/job:localhost/replica:0/task:0/device:", device_name, ":0");
   Device* device;
-  TF_CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device));
+  CHECK_OK(device_mgr_->LookupDevice(full_device_name, &device));
   return device;
 }
 
diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD
index 40de3e19dfd6d1..4c6a59e3f682fc 100644
--- a/tensorflow/compiler/jit/tests/BUILD
+++ b/tensorflow/compiler/jit/tests/BUILD
@@ -49,6 +49,8 @@ tf_cc_test(
     deps = [
         ":auto_clustering_test_helper",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/log:check",
+        "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test.cc b/tensorflow/compiler/jit/tests/auto_clustering_test.cc
index d108bc51b5ee33..806abbeb8e6d6a 100644
--- a/tensorflow/compiler/jit/tests/auto_clustering_test.cc
+++ b/tensorflow/compiler/jit/tests/auto_clustering_test.cc
@@ -13,9 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+
+#include "absl/log/check.h"
+#include "absl/status/status.h"
 #include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/jit/tests/auto_clustering_test_helper.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
+#include "xla/tsl/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
 namespace {
@@ -85,7 +91,7 @@ absl::Status BenchmarkHelper(absl::string_view key, benchmark::State& state) {
 }
 
 void BM_MarkForCompilationPass_KerasImagenetMain(benchmark::State& state) {
-  TF_CHECK_OK(BenchmarkHelper("keras_imagenet_main", state));
+  CHECK_OK(BenchmarkHelper("keras_imagenet_main", state));
 }
 
 BENCHMARK(BM_MarkForCompilationPass_KerasImagenetMain);
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index 6d7e5518524c29..1d51d4d1ca2b90 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -15,25 +15,50 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <optional>
 #include <string>
 #include <unordered_map>
+#include <utility>
+#include <vector>
 
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
 #include "absl/strings/match.h"
-#include "absl/strings/numbers.h"
 #include "absl/strings/str_cat.h"
-#include "absl/strings/str_join.h"
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/jit/flags.h"
+#include "xla/service/graphcycles/graphcycles.h"
 #include "xla/status_macros.h"
-#include "tensorflow/core/common_runtime/function.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
+#include "tensorflow/core/common_runtime/function_body.h"
+#include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_def_builder.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/graph/edgeset.h"
+#include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/strings/proto_serialization.h"
 #include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/fingerprint.h"
+#include "tensorflow/core/platform/hash.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/xla_config_registry.h"
@@ -460,8 +485,8 @@ absl::StatusOr<bool> DoesAnyCalleeHaveRefNodes(
       return true;
     }
 
-    auto release_handle_on_return = gtl::MakeCleanup(
-        [&] { TF_CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
+    auto release_handle_on_return =
+        gtl::MakeCleanup([&] { CHECK_OK(lib_runtime->ReleaseHandle(handle)); });
 
     const FunctionBody* fbody = lib_runtime->GetFunctionBody(handle);
     TF_RETURN_IF_ERROR(GetNodesRelatedToRefVariablesInDirection(
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 8b38d79f58e415..027fd494ed8af5 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/xla_device_context.h"
 
+#include <cstddef>
 #include <functional>
 #include <memory>
 #include <optional>
@@ -22,15 +23,37 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/compiler/jit/xla_launch_util.h"
+#include "absl/log/check.h"
+#include "absl/log/log.h"
+#include "absl/status/status.h"
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "absl/synchronization/mutex.h"
+#include "tensorflow/compiler/jit/xla_tensor.h"
+#include "tensorflow/compiler/tf2xla/layout_util.h"
 #include "tensorflow/compiler/tf2xla/literal_util.h"
-#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "xla/util.h"
-#include "tensorflow/core/common_runtime/device.h"
+#include "xla/client/local_client.h"
+#include "xla/layout_util.h"
+#include "xla/literal.h"
+#include "xla/service/stream_pool.h"
+#include "xla/shape.h"
+#include "xla/shape_util.h"
+#include "xla/status_macros.h"
+#include "xla/stream_executor/allocator_stats.h"
+#include "xla/stream_executor/event.h"
+#include "xla/stream_executor/stream.h"
+#include "xla/stream_executor/stream_executor.h"
+#include "xla/tsl/platform/errors.h"
+#include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/device.h"
+#include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_reference.h"
-#include "tsl/platform/statusor.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/threadpool.h"
 
 namespace tensorflow {
 
@@ -249,7 +272,7 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor,
   // shape as it is derived from the cpu_tensor's shape using
   // shape_representation_fn_.
   xla::MutableBorrowingLiteral literal;
-  TF_CHECK_OK(HostTensorToMutableBorrowingLiteral(
+  CHECK_OK(HostTensorToMutableBorrowingLiteral(
       xla::LayoutUtil::GetWithDefaultLayout(
           xla_tensor->shaped_buffer().on_host_shape()),
       cpu_tensor, &literal));
diff --git a/tensorflow/compiler/jit/xla_kernel_creator_test.cc b/tensorflow/compiler/jit/xla_kernel_creator_test.cc
index 12ab76a7c1ce37..1804b1728c8c7f 100644
--- a/tensorflow/compiler/jit/xla_kernel_creator_test.cc
+++ b/tensorflow/compiler/jit/xla_kernel_creator_test.cc
@@ -15,16 +15,23 @@ limitations under the License.
 
 #include "tensorflow/compiler/jit/xla_kernel_creator.h"
 
-#include "absl/memory/memory.h"
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "absl/log/check.h"
 #include "absl/status/status.h"
-#include "tensorflow/core/common_runtime/device_factory.h"
-#include "tensorflow/core/common_runtime/function.h"
-#include "tensorflow/core/framework/function_testlib.h"
-#include "tensorflow/core/framework/node_def_builder.h"
-#include "tensorflow/core/framework/tensor_testutil.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/device.h"
+#include "tensorflow/core/framework/device_factory.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/node_properties.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
@@ -63,7 +70,7 @@ class XlaKernelCreatorTest : public ::testing::Test {
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 1});
     std::vector<std::unique_ptr<Device>> devices;
-    TF_CHECK_OK(DeviceFactory::AddDevices(
+    CHECK_OK(DeviceFactory::AddDevices(
         options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 8ccb236897ce39..c35a7d0457c6ff 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -66,7 +66,6 @@ limitations under the License.
 #include "xla/tsl/framework/device_id_utils.h"
 #include "xla/tsl/framework/serving_device_selector_policies.h"
 #include "xla/tsl/platform/errors.h"
-#include "xla/tsl/platform/status.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/util.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
@@ -85,6 +84,7 @@ limitations under the License.
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/status.h"
 #include "tensorflow/core/tfrt/common/async_value_tensor.h"
+#include "tsl/platform/casts.h"
 
 namespace tensorflow {
 namespace {
@@ -323,7 +323,7 @@ absl::Status SetOutputForConstant(
     }
     ctx->op_device_context()->CopyCPUTensorToDevice(
         &const_tensor, device, output_tensor,
-        [&](absl::Status status) { TF_CHECK_OK(status); });
+        [&](absl::Status status) { CHECK_OK(status); });
 
     if (device->device_type() == DEVICE_GPU) {
       // The GPUDeviceContext enqueues the host->device transfer in a
@@ -562,7 +562,7 @@ XlaComputationLaunchContext::BuildXlaCompilerArguments(
   }
 
   absl::flat_hash_map<int, const VariableInfo*> variable_info_lookup;
-  TF_CHECK_OK(CreateVariableInfoLookup(variable_args, variable_info_lookup));
+  CHECK_OK(CreateVariableInfoLookup(variable_args, variable_info_lookup));
   for (int64_t input_num = 0; input_num < inputs.size(); ++input_num) {
     const Tensor* input = inputs[input_num];
     XlaCompiler::Argument& arg = out.emplace_back();
diff --git a/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc b/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc
index 563e75c5d61b28..e3f32f8403379a 100644
--- a/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc
+++ b/tensorflow/compiler/jit/xla_launch_util_gpu_test.cc
@@ -112,7 +112,7 @@ class PjRtExecutionUtilGpuTest : public OpsTestBase {
 
     // Create the DeviceCompiler to help with compiling executables.
     auto pjrt_client_or = GetOrCreatePjRtClient(device_type_);
-    TF_CHECK_OK(pjrt_client_or.status());
+    CHECK_OK(pjrt_client_or.status());
     pjrt_client_ = pjrt_client_or.value();
     device_compiler_ = new PjRtDeviceCompiler(
         std::make_unique<PjRtDeviceExecutablePersistor>(
diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc
index d8ed5feac79f12..a2eb031da6c38c 100644
--- a/tensorflow/compiler/jit/xla_launch_util_test.cc
+++ b/tensorflow/compiler/jit/xla_launch_util_test.cc
@@ -21,33 +21,50 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
+#include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
+#include "absl/log/check.h"
+#include "absl/status/statusor.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/jit/device_compilation_profiler.h"
 #include "tensorflow/compiler/jit/device_compiler.h"
+#include "tensorflow/compiler/jit/device_executable_persistor.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/pjrt_device_compiler_client.h"
 #include "tensorflow/compiler/jit/variable_info.h"
 #include "tensorflow/compiler/jit/variable_info_util.h"
+#include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "xla/literal.h"
+#include "xla/literal_util.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
+#include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/plugin/xla_cpu/cpu_client_options.h"
 #include "xla/pjrt/plugin/xla_cpu/xla_cpu_pjrt_client.h"
 #include "xla/tests/literal_test_util.h"
 #include "xla/tsl/framework/device_id_utils.h"
 #include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/device.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/device_factory.h"
 #include "tensorflow/core/framework/fake_input.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/resource_handle.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/type_index.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/refcount.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h"
 #include "tensorflow/core/tfrt/common/pjrt_util.h"
-#include "tsl/platform/status.h"
-#include "tsl/platform/statusor.h"
 
 namespace tensorflow {
 namespace {
@@ -93,11 +110,11 @@ class PjRtExecutionUtilTest : public OpsTestBase {
     xla::CpuClientOptions options;
     options.asynchronous = true;
     options.cpu_device_count = 1;
-    TF_CHECK_OK(SetPjRtClientInTFGlobalResourceManager(
+    CHECK_OK(SetPjRtClientInTFGlobalResourceManager(
         device_type, xla::GetXlaPjrtCpuClient(options).value()));
 
     // device_context_ should be a PjRtDeviceContext.
-    TF_CHECK_OK(device_->TryGetDeviceContext(&device_context_));
+    CHECK_OK(device_->TryGetDeviceContext(&device_context_));
 
     // Get the host allocator.
     AllocatorAttributes host_alloc_attr;
@@ -111,7 +128,7 @@ class PjRtExecutionUtilTest : public OpsTestBase {
 
     // Create the DeviceCompiler to help with compiling executables.
     auto pjrt_client_or = GetOrCreatePjRtClient(device_type_);
-    TF_CHECK_OK(pjrt_client_or.status());
+    CHECK_OK(pjrt_client_or.status());
     pjrt_client_ = pjrt_client_or.value();
     device_compiler_ = new PjRtDeviceCompiler(
         std::make_unique<PjRtDeviceExecutablePersistor>(
diff --git a/tensorflow/compiler/jit/xla_platform_info_test.cc b/tensorflow/compiler/jit/xla_platform_info_test.cc
index 84fd60ef6c7e33..7b45521daf2827 100644
--- a/tensorflow/compiler/jit/xla_platform_info_test.cc
+++ b/tensorflow/compiler/jit/xla_platform_info_test.cc
@@ -18,17 +18,21 @@ limitations under the License.
 #include <memory>
 
 #include <gtest/gtest.h>
+#include "absl/log/check.h"
+#include "tensorflow/compiler/jit/device_compilation_profiler.h"
+#include "tensorflow/compiler/jit/device_compiler.h"
 #include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/test_util.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "xla/client/local_client.h"
+#include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/plugin/xla_cpu/cpu_client_options.h"
 #include "xla/pjrt/plugin/xla_cpu/xla_cpu_pjrt_client.h"
+#include "xla/tsl/lib/core/status_test_util.h"
+#include "xla/tsl/platform/statusor.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/refcount.h"
-#include "tensorflow/core/platform/status_matchers.h"
-#include "tensorflow/core/platform/statusor.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h"
 #include "tensorflow/core/tfrt/common/pjrt_util.h"
@@ -65,7 +69,7 @@ TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerXlaDeviceMetadata) {
 
   Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU);
   const XlaDevice::Metadata* metadata = nullptr;
-  TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata));
+  CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata));
   XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device);
 
   TF_ASSERT_OK_AND_ASSIGN(
@@ -91,7 +95,7 @@ TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerXlaDeviceCacheEnabled) {
 
   Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU);
   const XlaDevice::Metadata* metadata = nullptr;
-  TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata));
+  CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata));
   XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device);
 
   TF_ASSERT_OK_AND_ASSIGN(
@@ -134,7 +138,7 @@ TEST_F(XlaPlatformInfoTest, GetOrCreatePjRtDeviceCompilerAndProfilerXlaDevice) {
 
   Device* device = device_setup_.GetDevice(device_type.type());
   const XlaDevice::Metadata* metadata = nullptr;
-  TF_CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata));
+  CHECK_OK(XlaDevice::GetMetadataFromDevice(device, &metadata));
   XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device);
 
   ResourceMgr resource_mgr("");
@@ -254,7 +258,7 @@ TEST_F(XlaPlatformInfoTest,
   xla::CpuClientOptions options;
   options.asynchronous = true;
   options.cpu_device_count = 1;
-  TF_CHECK_OK(SetPjRtClientInTFGlobalResourceManager(
+  CHECK_OK(SetPjRtClientInTFGlobalResourceManager(
       device_type, xla::GetXlaPjrtCpuClient(options).value()));
   TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, GetOrCreatePjRtClient(device_type));
 

From 76210ebaa17e3bb5c7776a383263e32e65129972 Mon Sep 17 00:00:00 2001
From: Alex Pivovarov <upwind@google.com>
Date: Tue, 23 Dec 2025 11:10:05 -0800
Subject: [PATCH 718/753] [ASAN] Initialize Thunk::PrepareParams with all
 members in tests.

This change ensures that Thunk::PrepareParams is fully initialized in thunk tests, including collective-related parameters, using a designated initializer list.

It fixes asan runtime error: _Nonnull binding to null pointer of type 'CollectiveMultimemRegistry * _Nonnull'

PiperOrigin-RevId: 848237961
---
 .../xla/xla/backends/gpu/runtime/BUILD        | 14 +++++++-
 .../runtime/buffers_checksum_thunk_test.cc    | 23 +++++++++++--
 .../runtime/buffers_float_check_thunk_test.cc | 22 ++++++++++--
 .../runtime/collective_kernel_thunk_test.cc   | 12 +++----
 .../gpu/runtime/custom_call_thunk_test.cc     | 34 +++++++++++++++++--
 .../gpu/runtime/dynamic_slice_thunk_test.cc   | 17 ++++++++--
 6 files changed, 104 insertions(+), 18 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index c89ff9384af678..5b57b09dffc68d 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -280,6 +280,7 @@ xla_test(
         "gpu",
     ],
     deps = [
+        ":collective_multimem_registry",
         ":custom_call_thunk",
         ":dynamic_slice_thunk",
         ":dynamic_slice_thunk_proto_cc",
@@ -297,6 +298,7 @@ xla_test(
         "//xla/hlo/ir:hlo",
         "//xla/hlo/parser:hlo_parser",
         "//xla/hlo/testlib:hlo_hardware_independent_test_base",
+        "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
         "//xla/service:platform_util",
@@ -757,6 +759,7 @@ xla_test(
     srcs = ["custom_call_thunk_test.cc"],
     backends = ["gpu"],
     deps = [
+        ":collective_multimem_registry",
         ":custom_call_thunk",
         ":shaped_slice",
         ":thunk",
@@ -768,6 +771,7 @@ xla_test(
         "//xla/ffi:ffi_api",
         "//xla/ffi:type_registry",
         "//xla/hlo/ir:hlo",
+        "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
         "//xla/service:custom_call_status_public_headers",
         "//xla/service:custom_call_target_registry",
@@ -1337,6 +1341,7 @@ xla_test(
     },
     backends = ["h100"],
     deps = [
+        ":collective_clique_requests",
         ":collective_kernel_thunk",
         ":collective_multimem_registry",
         ":collective_params",
@@ -1345,7 +1350,6 @@ xla_test(
         "//xla:array",
         "//xla:util",
         "//xla:xla_data_proto_cc",
-        "//xla/backends/gpu/runtime:collective_clique_requests",
         "//xla/core/collectives:reduction_kind",
         "//xla/pjrt:worker_thread",
         "//xla/runtime:device_id",
@@ -3574,8 +3578,12 @@ xla_test(
         ":buffer_debug_log_entry_metadata_store",
         ":buffer_debug_log_structs",
         ":buffers_checksum_thunk",
+        ":collective_clique_requests",
+        ":collective_multimem_registry",
+        ":collective_params",
         ":thunk",
         ":thunk_id",
+        "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
         "//xla/service/gpu:buffer_allocations",
@@ -3647,9 +3655,13 @@ xla_test(
         ":buffer_debug_log_entry_metadata_store",
         ":buffer_debug_log_structs",
         ":buffers_float_check_thunk",
+        ":collective_clique_requests",
+        ":collective_multimem_registry",
+        ":collective_params",
         ":thunk",
         ":thunk_id",
         "//xla:types",
+        "//xla/runtime:device_id",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
         "//xla/service/gpu:buffer_allocations",
diff --git a/third_party/xla/xla/backends/gpu/runtime/buffers_checksum_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/buffers_checksum_thunk_test.cc
index 47208ddc437641..3eb51302f049fc 100644
--- a/third_party/xla/xla/backends/gpu/runtime/buffers_checksum_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/buffers_checksum_thunk_test.cc
@@ -28,8 +28,12 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log_entry_metadata_store.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log_structs.h"
+#include "xla/backends/gpu/runtime/collective_clique_requests.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
+#include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
+#include "xla/runtime/device_id.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/service_executable_run_options.h"
@@ -147,11 +151,26 @@ TEST_F(BuffersDebugChecksumThunkTest, CalculatesChecksums) {
   TF_ASSERT_OK(stream_->Memcpy(&inputs0_mem, zeros.data(), zeros.size()));
   zeros[123] = 56785678;  // expected checksum for inputs_mem[1]
   TF_ASSERT_OK(stream_->Memcpy(&inputs1_mem, zeros.data(), zeros.size()));
+
   // Setup parameters for Initialize/Prepare/ExecuteOnStream
   Thunk::InitializeParams init_params;
   init_params.executor = executor_;
   init_params.stream = stream_.get();
-  auto execute_params = Thunk::ExecuteParams::Create(
+
+  ServiceExecutableRunOptions run_options;
+  run_options.mutable_run_options()->set_stream(stream_.get());
+  ASSERT_OK_AND_ASSIGN(
+      CollectiveParams collective_params,
+      CollectiveParams::Create(run_options, /*async_streams=*/{},
+                               LocalDeviceId(executor_->device_ordinal())));
+  CollectiveCliqueRequests clique_requests;
+  CollectiveMultimemRegistry multimem_registry(
+      executor_, collective_params.global_device_id);
+  Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                      &multimem_registry, executor_,
+                                      &allocations};
+
+  Thunk::ExecuteParams execute_params = Thunk::ExecuteParams::Create(
       ServiceExecutableRunOptions(), allocations, stream_.get(),
       /*command_buffer_trace_stream=*/stream_.get(),
       /*collective_params=*/nullptr, /*collective_cliques=*/nullptr);
@@ -163,7 +182,7 @@ TEST_F(BuffersDebugChecksumThunkTest, CalculatesChecksums) {
       {{/*buffer_idx=*/0, inputs[0]}, {/*buffer_idx=*/1, inputs[1]}},
       /*runs_before_checked_thunk=*/true, metadata_store);
   TF_ASSERT_OK(thunk.Initialize(init_params));
-  TF_ASSERT_OK(thunk.Prepare(Thunk::PrepareParams{}));
+  TF_ASSERT_OK(thunk.Prepare(prepare_params));
   TF_ASSERT_OK(thunk.ExecuteOnStream(execute_params));
   TF_ASSERT_OK_AND_ASSIGN(std::vector<BufferDebugLogEntry> entries,
                           device_log.ReadFromDevice(*stream_));
diff --git a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc
index c56538c15a6e39..977e2500f76f68 100644
--- a/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/buffers_float_check_thunk_test.cc
@@ -28,8 +28,12 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log_entry_metadata_store.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log_structs.h"
+#include "xla/backends/gpu/runtime/collective_clique_requests.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
+#include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
+#include "xla/runtime/device_id.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/service_executable_run_options.h"
@@ -153,7 +157,21 @@ TEST_F(BuffersDebugFloatCheckThunkTest, CalculatesNanCounts) {
   Thunk::InitializeParams init_params;
   init_params.executor = executor_;
   init_params.stream = stream_.get();
-  auto execute_params = Thunk::ExecuteParams::Create(
+
+  ServiceExecutableRunOptions run_options;
+  run_options.mutable_run_options()->set_stream(stream_.get());
+  ASSERT_OK_AND_ASSIGN(
+      CollectiveParams collective_params,
+      CollectiveParams::Create(run_options, /*async_streams=*/{},
+                               LocalDeviceId(executor_->device_ordinal())));
+  CollectiveCliqueRequests clique_requests;
+  CollectiveMultimemRegistry multimem_registry(
+      executor_, collective_params.global_device_id);
+  Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                      &multimem_registry, executor_,
+                                      &allocations};
+
+  Thunk::ExecuteParams execute_params = Thunk::ExecuteParams::Create(
       ServiceExecutableRunOptions(), allocations, stream_.get(),
       /*command_buffer_trace_stream=*/stream_.get(),
       /*collective_params=*/nullptr, /*collective_cliques=*/nullptr);
@@ -166,7 +184,7 @@ TEST_F(BuffersDebugFloatCheckThunkTest, CalculatesNanCounts) {
       {{/*buffer_idx=*/0, inputs[0]}, {/*buffer_idx=*/1, inputs[1]}},
       metadata_store);
   TF_ASSERT_OK(thunk.Initialize(init_params));
-  TF_ASSERT_OK(thunk.Prepare(Thunk::PrepareParams{}));
+  TF_ASSERT_OK(thunk.Prepare(prepare_params));
   TF_ASSERT_OK(thunk.ExecuteOnStream(execute_params));
   TF_ASSERT_OK_AND_ASSIGN(std::vector<BufferDebugFloatCheckEntry> entries,
                           device_log.ReadFromDevice(*stream_));
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
index 43631845f37ad3..15f1fdd20c33cd 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_kernel_thunk_test.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/log/log.h"
 #include "absl/status/status.h"
@@ -254,7 +255,7 @@ absl::StatusOr<se::DeviceAddressBase> RunCollectiveKernelThunk(
       &gpu_options);
 
   TF_ASSIGN_OR_RETURN(
-      auto collective_params,
+      CollectiveParams collective_params,
       CollectiveParams::Create(run_options, /*async_streams=*/{},
                                LocalDeviceId(executor->device_ordinal())));
   std::vector<se::DeviceAddressBase> allocated_buffers = {
@@ -276,15 +277,12 @@ absl::StatusOr<se::DeviceAddressBase> RunCollectiveKernelThunk(
     TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
   }
 
-  Thunk::PrepareParams prepare_params;
   CollectiveMultimemRegistry multimem_registry(
       executor, collective_params.global_device_id);
   CollectiveCliqueRequests clique_requests;
-  prepare_params.executor = executor;
-  prepare_params.buffer_allocations = &buffer_allocations;
-  prepare_params.collective_params = &collective_params;
-  prepare_params.clique_requests = &clique_requests;
-  prepare_params.multimem_registry = &multimem_registry;
+  Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                      &multimem_registry, executor,
+                                      &buffer_allocations};
   TF_RETURN_IF_ERROR(metadata.thunk->Prepare(prepare_params));
 
   TF_RETURN_IF_ERROR(multimem_registry.Build());
diff --git a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc
index 897a3bda32f429..82440516d63381 100644
--- a/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/custom_call_thunk_test.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "absl/status/status_matchers.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
 #include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/executable_run_options.h"
@@ -43,6 +44,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_computation.h"
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_module.h"
+#include "xla/runtime/device_id.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_target_registry.h"
@@ -286,9 +288,22 @@ TEST(CustomCallThunkTest, CustomCallWithOwnedHandlers) {
     ++execute_calls;
     return absl::OkStatus();
   });
+
+  ServiceExecutableRunOptions run_options;
+  run_options.mutable_run_options()->set_stream(stream.get());
+  ASSERT_OK_AND_ASSIGN(
+      CollectiveParams collective_params,
+      CollectiveParams::Create(run_options, /*async_streams=*/{},
+                               LocalDeviceId(executor->device_ordinal())));
+  CollectiveCliqueRequests clique_requests;
+  CollectiveMultimemRegistry multimem_registry(
+      executor, collective_params.global_device_id);
   se::StreamExecutorMemoryAllocator allocator(executor);
-  Thunk::PrepareParams prepare_params = Thunk::PrepareParams{};
   BufferAllocations buffer_allocations({}, 0, &allocator);
+  Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                      &multimem_registry, executor,
+                                      &buffer_allocations};
+
   Thunk::InitializeParams initialize_params;
   initialize_params.stream = stream.get();
   initialize_params.buffer_allocations = &buffer_allocations;
@@ -337,10 +352,23 @@ TEST(CustomCallThunkTest, CustomCallWithOwnedHandlersWithoutOptionalOnes) {
     ++execute_calls;
     return absl::OkStatus();
   });
+
+  ServiceExecutableRunOptions run_options;
+  run_options.mutable_run_options()->set_stream(stream.get());
+  ASSERT_OK_AND_ASSIGN(
+      CollectiveParams collective_params,
+      CollectiveParams::Create(run_options, /*async_streams=*/{},
+                               LocalDeviceId(executor->device_ordinal())));
+  CollectiveCliqueRequests clique_requests;
+  CollectiveMultimemRegistry multimem_registry(
+      executor, collective_params.global_device_id);
   se::StreamExecutorMemoryAllocator allocator(executor);
-  Thunk::PrepareParams prepare_params = Thunk::PrepareParams{};
-  Thunk::InitializeParams initialize_params = Thunk::InitializeParams{};
   BufferAllocations buffer_allocations({}, 0, &allocator);
+  Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                      &multimem_registry, executor,
+                                      &buffer_allocations};
+
+  Thunk::InitializeParams initialize_params = Thunk::InitializeParams{};
   Thunk::ExecuteParams execute_params = Thunk::ExecuteParams::Create(
       ServiceExecutableRunOptions(), buffer_allocations, stream.get(),
       stream.get(), nullptr, nullptr);
diff --git a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc
index 99790239ebee45..7c8913b737f2ed 100644
--- a/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/dynamic_slice_thunk_test.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "absl/strings/ascii.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/ffi.h"
+#include "xla/backends/gpu/runtime/collective_multimem_registry.h"
 #include "xla/backends/gpu/runtime/custom_call_thunk.h"
 #include "xla/backends/gpu/runtime/dynamic_slice_thunk.pb.h"
 #include "xla/backends/gpu/runtime/gemm_thunk.h"
@@ -44,6 +45,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/hlo/parser/hlo_parser.h"
 #include "xla/hlo/testlib/hlo_hardware_independent_test_base.h"
+#include "xla/runtime/device_id.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/gpu/buffer_allocations.h"
 #include "xla/service/gpu/matmul_utils.h"
@@ -1940,12 +1942,21 @@ TEST_F(DynamicSliceThunkTest,
 
   // Preparing parameters for thunk execution.
   ServiceExecutableRunOptions run_options;
+  run_options.mutable_run_options()->set_stream(stream.get());
+  ASSERT_OK_AND_ASSIGN(
+      CollectiveParams collective_params,
+      CollectiveParams::Create(run_options, /*async_streams=*/{},
+                               LocalDeviceId(executor->device_ordinal())));
+  CollectiveCliqueRequests clique_requests;
+  CollectiveMultimemRegistry multimem_registry(
+      executor, collective_params.global_device_id);
   se::StreamExecutorMemoryAllocator allocator(executor);
   BufferAllocations allocations(/*buffers=*/{lhs, rhs, out, workspace},
-                                /*device_ordinal=*/0,
+                                /*device_ordinal=*/executor->device_ordinal(),
                                 /*memory_allocator=*/&allocator);
-
-  Thunk::PrepareParams prepare_params{};
+  Thunk::PrepareParams prepare_params{&collective_params, &clique_requests,
+                                      &multimem_registry, executor,
+                                      &allocations};
 
   Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
       run_options, /*buffer_allocations=*/allocations, stream.get(),

From 5bd50f92b6e6200f40addc29c2521620943eba5d Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Tue, 23 Dec 2025 12:03:23 -0800
Subject: [PATCH 719/753] Remove redundant replica group copy in
 CollectiveThunk.

The `config.replica_groups` was being populated twice from `proto.replica_groups`, once by `assign` and once by `absl::c_copy`. The `absl::c_copy` is redundant and causes duplicate entries.

PiperOrigin-RevId: 848255824
---
 .../backends/gpu/runtime/collective_thunk.cc  |  3 --
 .../backends/gpu/runtime/collective_thunk.h   |  1 +
 .../gpu/runtime/collective_thunk_test.cc      | 50 ++++++++++++++++++-
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
index 8ddff57556d6b8..3d339a136aa0f1 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.cc
@@ -178,9 +178,6 @@ CollectiveConfig CollectiveConfig::FromProto(
   config.replica_groups.assign(proto.replica_groups().begin(),
                                proto.replica_groups().end());
 
-  absl::c_copy(proto.replica_groups(),
-               std::back_inserter(config.replica_groups));
-
   config.group_mode = proto.group_mode();
   config.use_symmetric_buffer = proto.use_symmetric_buffer();
   return config;
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
index 702db0ab6e5cfd..3f5013bad2a47f 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/synchronization/mutex.h"
+#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/runtime/collective_params.h"
 #include "xla/backends/gpu/runtime/thunk.h"
diff --git a/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc
index 43aa470e095778..dd71d9eeb969de 100644
--- a/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/collective_thunk_test.cc
@@ -29,9 +29,11 @@ limitations under the License.
 namespace xla::gpu {
 namespace {
 
+using ::testing::ElementsAre;
 using ::tsl::proto_testing::EqualsProto;
+using ::tsl::proto_testing::ParseTextProtoOrDie;
 
-TEST(CollectiveThunkTest, ProtoRoundTrip) {
+TEST(CollectiveDoneThunkTest, ProtoRoundTrip) {
   ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
       R"pb(
         thunk_info {
@@ -65,5 +67,51 @@ TEST(CollectiveThunkTest, ProtoRoundTrip) {
   EXPECT_THAT(round_trip_proto, EqualsProto(proto));
 }
 
+TEST(CollectiveConfigTest, ToProto) {
+  CollectiveConfig config{
+      /*operand_element_type=*/{PrimitiveType::F32, PrimitiveType::BF16},
+      /*replica_groups=*/
+      {ParseTextProtoOrDie<ReplicaGroup>(
+           R"pb(replica_ids: 0 replica_ids: 1)pb"),
+       ParseTextProtoOrDie<ReplicaGroup>(
+           R"pb(replica_ids: 2 replica_ids: 3)pb")},
+      /*group_mode=*/
+      CollectiveOpGroupMode::COLLECTIVE_OP_GROUP_MODE_CROSS_PARTITION,
+      /*use_symmetric_buffer=*/true,
+  };
+
+  EXPECT_THAT(config.ToProto(), EqualsProto(R"pb(
+                operand_element_type: F32
+                operand_element_type: BF16
+                replica_groups { replica_ids: 0 replica_ids: 1 }
+                replica_groups { replica_ids: 2 replica_ids: 3 }
+                group_mode: COLLECTIVE_OP_GROUP_MODE_CROSS_PARTITION
+                use_symmetric_buffer: true
+              )pb"));
+}
+
+TEST(CollectiveConfigTest, FromProto) {
+  CollectiveConfigProto proto = ParseTextProtoOrDie<CollectiveConfigProto>(
+      R"pb(
+        operand_element_type: F32
+        operand_element_type: BF16
+        replica_groups { replica_ids: 0 replica_ids: 1 }
+        replica_groups { replica_ids: 2 replica_ids: 3 }
+        group_mode: COLLECTIVE_OP_GROUP_MODE_CROSS_PARTITION
+        use_symmetric_buffer: true
+      )pb");
+
+  CollectiveConfig config = CollectiveConfig::FromProto(proto);
+
+  EXPECT_THAT(config.operand_element_type,
+              ElementsAre(PrimitiveType::F32, PrimitiveType::BF16));
+  EXPECT_THAT(config.replica_groups,
+              ElementsAre(EqualsProto(R"pb(replica_ids: 0 replica_ids: 1)pb"),
+                          EqualsProto(R"pb(replica_ids: 2 replica_ids: 3)pb")));
+  EXPECT_EQ(config.group_mode,
+            CollectiveOpGroupMode::COLLECTIVE_OP_GROUP_MODE_CROSS_PARTITION);
+  EXPECT_TRUE(config.use_symmetric_buffer);
+}
+
 }  // namespace
 }  // namespace xla::gpu

From db900e578ccd66ca4d37566c66753418d08678aa Mon Sep 17 00:00:00 2001
From: Volodymyr Kysenko <vksnk@google.com>
Date: Tue, 23 Dec 2025 12:17:33 -0800
Subject: [PATCH 720/753] [XLA:CPU] Refactor grouped convolution handling in
 YNN emitter.

For the grouped convolutions split_dim of the input buffer is now combined into one call with stencil copy.

PiperOrigin-RevId: 848260240
---
 .../xla/xla/backends/cpu/ynn_emitter.cc       | 65 +++++++++----------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/third_party/xla/xla/backends/cpu/ynn_emitter.cc b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
index ec55a93dccb034..b2fdada31d3618 100644
--- a/third_party/xla/xla/backends/cpu/ynn_emitter.cc
+++ b/third_party/xla/xla/backends/cpu/ynn_emitter.cc
@@ -436,18 +436,14 @@ static ynn_status DefineConvolution(
     uint32_t input1_id, uint32_t input2_id, uint32_t output_id,
     const std::vector<size_t>& filter_dims, const std::vector<size_t>& out_dims,
     size_t feature_group_count, size_t input_channels,
-    size_t kernel_output_channels, const std::vector<int32_t>& stencil_axes,
-    const std::vector<int32_t>& new_axes,
-    const std::vector<size_t>& stencil_dims,
-    const std::vector<size_t>& stencil_strides,
-    const std::vector<size_t>& stencil_dilations,
+    size_t kernel_output_channels, std::vector<int32_t> stencil_axes,
+    std::vector<size_t> stencil_dims, std::vector<size_t> stencil_strides,
+    std::vector<size_t> stencil_dilations,
     const std::vector<int64_t>& padding_lows,
     const std::vector<int64_t>& padding_highs) {
+  size_t num_k_dims = stencil_dims.size() + 1;
   ynn_status status;
 
-  // Make a copy in case we need to shift these for grouped convolution.
-  std::vector<int32_t> new_axes_shifted = new_axes;
-
   // We will need to create an intermediate buffer for the output if it's
   // grouped convolution.
   uint32_t output_unfused_id =
@@ -455,18 +451,6 @@ static ynn_status DefineConvolution(
 
   if (feature_group_count != 1) {
     uint32_t split_id = YNN_INVALID_VALUE_ID;
-
-    // [n, h, w, ci] -> [n, h, w, g, 1, ci/g].
-    size_t input_split[] = {feature_group_count, 1,
-                            input_channels / feature_group_count};
-    status =
-        ynn_define_split_dim(subgraph, /*axis=*/-1, /*num_splits=*/3,
-                             input_split, input1_id, &split_id, /*flags=*/0);
-    if (status != ynn_status_success) {
-      return status;
-    }
-    input1_id = split_id;
-    split_id = YNN_INVALID_VALUE_ID;
     CHECK_EQ(filter_dims.size(), 4);
     // [kh, kw, ci/g, co] -> [kh, kw, ci/g, g, co/g].
     size_t filter_split[] = {feature_group_count,
@@ -506,11 +490,6 @@ static ynn_status DefineConvolution(
     if (status != ynn_status_success) {
       return status;
     }
-
-    // Shift new stencil axes by two.
-    for (int i = 0; i < new_axes_shifted.size(); ++i) {
-      new_axes_shifted[i] += 2;
-    }
   }
 
   // If any of paddings is not zero, define a padding value and pad the input.
@@ -543,20 +522,40 @@ static ynn_status DefineConvolution(
     padding_id = YNN_INVALID_VALUE_ID;
   }
 
+  std::vector<int32_t> new_axes;
+
+  if (feature_group_count != 1) {
+    // (n, h, w, c) -> (n, h, w, [g, 1,] kh, kw, c / g)
+    stencil_dims.push_back(feature_group_count);
+    stencil_dims.push_back(1);
+    stencil_axes.push_back(3);
+    stencil_axes.push_back(3);
+    // We need to insert stencil dimensions [kh, kw] right before the channel
+    // dimension and [g, 1] before stencil dimensions.
+    new_axes = {-3, -2, -5, -4};
+    stencil_strides.push_back(1);
+    stencil_strides.push_back(1);
+    stencil_dilations.push_back(input_channels / feature_group_count);
+    stencil_dilations.push_back(1);
+  } else {
+    // We need to insert stencil dimensions [kh, kw] right before the channel
+    // dimension.
+    new_axes = {-3, -2};
+  }
+
   uint32_t stencil_id = YNN_INVALID_VALUE_ID;
   // Make a stenciled view of the input [n, h, w, ci] -> [n, h, w, kh, kw, ci].
   status = ynn_define_stencil_copy(
       subgraph, /*num_stencils=*/stencil_dims.size(), stencil_axes.data(),
-      new_axes_shifted.data(), stencil_dims.data(), stencil_strides.data(),
+      new_axes.data(), stencil_dims.data(), stencil_strides.data(),
       stencil_dilations.data(), input1_id, YNN_INVALID_VALUE_ID, &stencil_id,
       /*flags=*/0);
   if (status != ynn_status_success) {
     return status;
   }
 
-  status = ynn_define_dot(subgraph, /*num_k_dims=*/stencil_dims.size() + 1,
-                          stencil_id, input2_id, YNN_INVALID_VALUE_ID,
-                          &output_unfused_id,
+  status = ynn_define_dot(subgraph, num_k_dims, stencil_id, input2_id,
+                          YNN_INVALID_VALUE_ID, &output_unfused_id,
                           /*flags=*/0);
 
   if (status != ynn_status_success) {
@@ -714,7 +713,6 @@ static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
       conv->convolution_dimension_numbers();
 
   std::vector<int32_t> stencil_axes(conv_window_dims_size);
-  std::vector<int32_t> new_axes(conv_window_dims_size);
   std::vector<size_t> stencil_dims(conv_window_dims_size);
   std::vector<size_t> stencil_strides(conv_window_dims_size);
   std::vector<size_t> stencil_dilations(conv_window_dims_size);
@@ -730,8 +728,6 @@ static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
     padding_highs[i] = conv_window.dimensions(i).padding_high();
   }
 
-  std::iota(new_axes.begin(), new_axes.end(), lhs_dims.size() - 1);
-
   YNN_RETURN_IF_ERROR(DefineConvolution(
       subgraph.get(), ynn_lhs_type, ynn_out_type, lhs_id, rhs_id, out_id,
       rhs_dims, out_dims, conv->feature_group_count(),
@@ -739,8 +735,9 @@ static absl::StatusOr<YnnSubgraph> EmitYnnConvolutionSubgraph(
           conv_dimensions.input_feature_dimension()),
       conv->operand(1)->shape().dimensions(
           conv_dimensions.kernel_output_feature_dimension()),
-      stencil_axes, new_axes, stencil_dims, stencil_strides, stencil_dilations,
-      padding_lows, padding_highs));
+      std::move(stencil_axes), std::move(stencil_dims),
+      std::move(stencil_strides), std::move(stencil_dilations), padding_lows,
+      padding_highs));
 
   ynn_status status = ynn_optimize_subgraph(
       subgraph.get(), /*threadpool=*/nullptr, /*flags=*/0);

From 1b61eec06ba2207565262390ef1a36b3aa8dfbd1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 12:25:40 -0800
Subject: [PATCH 721/753] This is an internal change

Reverts 86c82562e1188f693f49cd0b369e70d07a74eb7e

PiperOrigin-RevId: 848262323
---
 tensorflow/lite/delegates/xnnpack/BUILD       |  35 +++
 .../channelwise_quantized_conv_2d_test.cc     |  91 ++------
 ...elwise_quantized_depthwise_conv_2d_test.cc | 116 +++-------
 .../lite/delegates/xnnpack/conv_2d_test.cc    | 159 +++-----------
 .../xnnpack/depthwise_conv_2d_test.cc         | 170 +++------------
 .../dynamically_quantized_conv_2d_test.cc     | 179 +++-------------
 ...amically_quantized_fully_connected_test.cc | 113 +---------
 ...namically_quantized_transpose_conv_test.cc |  60 ++----
 ...mically_quantized_transpose_conv_tester.cc |  10 +-
 .../xnnpack/fingerprint_test_helpers.h        | 112 ++++++++++
 .../delegates/xnnpack/fully_connected_test.cc | 152 +++----------
 .../xnnpack/signed_quantized_conv_2d_test.cc  |  97 ++-------
 ...signed_quantized_depthwise_conv_2d_test.cc | 122 +++--------
 .../signed_quantized_fully_connected_test.cc  |  91 ++------
 .../signed_quantized_transpose_conv_test.cc   | 127 +++--------
 .../delegates/xnnpack/transpose_conv_test.cc  | 199 ++++--------------
 .../unsigned_quantized_conv_2d_test.cc        |  97 ++-------
 ...signed_quantized_depthwise_conv_2d_test.cc | 121 +++--------
 ...unsigned_quantized_fully_connected_test.cc |  85 ++------
 .../unsigned_quantized_transpose_conv_test.cc | 127 +++--------
 .../lite/delegates/xnnpack/weight_cache.cc    |  85 ++++++--
 .../lite/delegates/xnnpack/weight_cache.h     |   7 +-
 .../delegates/xnnpack/weight_cache_schema.fbs |   5 +-
 .../delegates/xnnpack/weight_cache_test.cc    | 192 ++++++++++++-----
 .../lite/tools/cmake/modules/xnnpack.cmake    |   1 +
 25 files changed, 766 insertions(+), 1787 deletions(-)
 create mode 100644 tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h

diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 227537a79f1454..02d51f21d4fa4e 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -392,6 +392,21 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "fingerprint_test_helpers",
+    testonly = True,
+    hdrs = ["fingerprint_test_helpers.h"],
+    compatible_with = get_compatible_with_portable(),
+    deps = [
+        ":weight_cache",
+        ":weight_cache_test_helpers",
+        ":xnnpack_delegate_hdrs_only",
+        "//tensorflow/lite/c:common",
+        "@XNNPACK",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_library(
     name = "mmap_handle",
     srcs = ["mmap_handle.cc"],
@@ -1347,6 +1362,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1363,6 +1379,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1397,6 +1414,7 @@ cc_test(
     }),
     deps = [
         ":conv_2d_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1446,6 +1464,7 @@ cc_test(
     }),
     deps = [
         ":depthwise_conv_2d_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1465,6 +1484,7 @@ cc_test(
     tags = ["notap"],
     deps = [
         ":dynamically_quantized_fully_connected_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1481,6 +1501,7 @@ cc_test(
     }),
     deps = [
         ":dynamically_quantized_conv_2d_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1497,6 +1518,7 @@ cc_test(
     }),
     deps = [
         ":dynamically_quantized_transpose_conv_tester",
+        ":fingerprint_test_helpers",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
@@ -1513,10 +1535,14 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":fully_connected_tester",
         ":test_main",
+        ":weight_cache",
+        ":weight_cache_test_helpers",
         ":xnnpack_delegate_test_mode",
         "//tensorflow/lite/c:c_api_types",
+        "@XNNPACK",
         "@com_google_googletest//:gtest",
     ],
 )
@@ -1864,6 +1890,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1880,6 +1907,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -1930,6 +1958,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_fully_connected_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2163,6 +2192,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_transpose_conv_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2307,6 +2337,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":test_main",
         ":transpose_conv_tester",
         ":xnnpack_delegate_test_mode",
@@ -2386,6 +2417,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2401,6 +2433,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_depthwise_conv_2d_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2431,6 +2464,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_fully_connected_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
@@ -2641,6 +2675,7 @@ cc_test(
         "//conditions:default": [],
     }),
     deps = [
+        ":fingerprint_test_helpers",
         ":quantized_transpose_conv_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
index 92293e08227593..d195d4f25435e8 100644
--- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_conv_2d_test.cc
@@ -24,17 +24,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(ChannelwiseQuantizedConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct ChannelwiseQuantizedConv2D : DelegateTest {};
 
+TEST_F(ChannelwiseQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -71,11 +70,7 @@ TEST(ChannelwiseQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -112,11 +107,7 @@ TEST(ChannelwiseQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -155,11 +146,7 @@ TEST(ChannelwiseQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -198,11 +185,7 @@ TEST(ChannelwiseQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -241,11 +224,7 @@ TEST(ChannelwiseQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -288,11 +267,7 @@ TEST(ChannelwiseQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -335,11 +310,7 @@ TEST(ChannelwiseQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -382,11 +353,7 @@ TEST(ChannelwiseQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -429,11 +396,7 @@ TEST(ChannelwiseQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -476,11 +439,7 @@ TEST(ChannelwiseQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -523,11 +482,7 @@ TEST(ChannelwiseQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -570,13 +525,11 @@ TEST(ChannelwiseQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, MultiThreading) {
+TEST_F(ChannelwiseQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -619,7 +572,7 @@ TEST(ChannelwiseQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
+TEST_F(ChannelwiseQuantizedConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -627,9 +580,7 @@ TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -673,15 +624,13 @@ TEST(ChannelwiseQuantizedConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) {
+TEST_F(ChannelwiseQuantizedConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
index 25dada01896c34..0c6de84e9a8d2f 100644
--- a/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/channelwise_quantized_depthwise_conv_2d_test.cc
@@ -23,18 +23,16 @@ limitations under the License.
 #include <vector>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct ChannelwiseQuantizedDepthwiseConv2D : DelegateTest {};
 
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -66,11 +64,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -103,11 +97,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -140,11 +130,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -179,11 +165,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -216,11 +198,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -255,11 +233,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -297,11 +271,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -339,11 +309,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -385,11 +351,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -431,11 +393,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -477,11 +435,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -523,11 +477,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -573,11 +523,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -619,11 +565,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -665,11 +607,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto scale_rng = std::bind(
@@ -711,13 +649,11 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -759,7 +695,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -767,9 +703,7 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -812,15 +746,13 @@ TEST(ChannelwiseQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+TEST_F(ChannelwiseQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
index 25090bbaf2b5cf..e1b5a674946b73 100644
--- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/conv_2d_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(Conv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct Conv2D : DelegateTest {};
 
+TEST_F(Conv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -52,11 +50,7 @@ TEST(Conv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -78,11 +72,7 @@ TEST(Conv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -106,11 +96,7 @@ TEST(Conv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -136,11 +122,7 @@ TEST(Conv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -164,11 +146,7 @@ TEST(Conv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -192,11 +170,7 @@ TEST(Conv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -224,11 +198,7 @@ TEST(Conv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -256,11 +226,7 @@ TEST(Conv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -288,11 +254,7 @@ TEST(Conv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -320,11 +282,7 @@ TEST(Conv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -352,11 +310,7 @@ TEST(Conv2D, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -384,11 +338,7 @@ TEST(Conv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -416,11 +366,7 @@ TEST(Conv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -448,11 +394,7 @@ TEST(Conv2D, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseFP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseFP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -481,11 +423,7 @@ TEST(Conv2D, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -514,11 +452,7 @@ TEST(Conv2D, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -547,11 +481,7 @@ TEST(Conv2D, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -579,11 +509,7 @@ TEST(Conv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -611,11 +537,7 @@ TEST(Conv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -643,11 +565,7 @@ TEST(Conv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DISABLED_TanhActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DISABLED_TanhActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -675,11 +593,7 @@ TEST(Conv2D, DISABLED_TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, DISABLED_SignBitActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(Conv2D, DISABLED_SignBitActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -707,13 +621,11 @@ TEST(Conv2D, DISABLED_SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, MultiThreading) {
+TEST_F(Conv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -741,7 +653,7 @@ TEST(Conv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, WeightsCache) {
+TEST_F(Conv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -749,10 +661,7 @@ TEST(Conv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -781,15 +690,13 @@ TEST(Conv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(Conv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(Conv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
index e894bcdc2bc46a..931fff88178dfb 100644
--- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(DepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct DepthwiseConv2D : DelegateTest {};
 
+TEST_F(DepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -47,11 +45,7 @@ TEST(DepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -69,11 +63,7 @@ TEST(DepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -91,11 +81,7 @@ TEST(DepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -115,11 +101,7 @@ TEST(DepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -137,11 +119,7 @@ TEST(DepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto input_rng =
@@ -161,11 +139,7 @@ TEST(DepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -188,11 +162,7 @@ TEST(DepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -215,11 +185,7 @@ TEST(DepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -246,11 +212,7 @@ TEST(DepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -277,11 +239,7 @@ TEST(DepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -308,11 +266,7 @@ TEST(DepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -339,11 +293,7 @@ TEST(DepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -372,11 +322,7 @@ TEST(DepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -403,11 +349,7 @@ TEST(DepthwiseConv2D, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -434,11 +376,7 @@ TEST(DepthwiseConv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -465,11 +403,7 @@ TEST(DepthwiseConv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -496,11 +430,7 @@ TEST(DepthwiseConv2D, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseFP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseFP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -528,11 +458,7 @@ TEST(DepthwiseConv2D, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -560,11 +486,7 @@ TEST(DepthwiseConv2D, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -592,11 +514,7 @@ TEST(DepthwiseConv2D, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -623,11 +541,7 @@ TEST(DepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -654,11 +568,7 @@ TEST(DepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -685,11 +595,7 @@ TEST(DepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DISABLED_TanhActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DISABLED_TanhActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -716,11 +622,7 @@ TEST(DepthwiseConv2D, DISABLED_TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, DISABLED_SignBitActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DepthwiseConv2D, DISABLED_SignBitActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -747,13 +649,11 @@ TEST(DepthwiseConv2D, DISABLED_SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, MultiThreading) {
+TEST_F(DepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -780,7 +680,7 @@ TEST(DepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, WeightsCache) {
+TEST_F(DepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -788,9 +688,7 @@ TEST(DepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -818,15 +716,13 @@ TEST(DepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DepthwiseConv2D, TransientIndirectionBuffer) {
+TEST_F(DepthwiseConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
index 59507269580cbd..52e8333db4fd04 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_test.cc
@@ -19,22 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_conv_2d_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(DynamicallyQuantizedConv2D, 3x3) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+struct DynamicallyQuantizedConv2D : DelegateTest {};
 
+TEST_F(DynamicallyQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -56,15 +50,7 @@ TEST(DynamicallyQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, 3x3Stride2) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -88,15 +74,7 @@ TEST(DynamicallyQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, Grouped) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -122,15 +100,7 @@ TEST(DynamicallyQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -154,15 +124,7 @@ TEST(DynamicallyQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -186,14 +148,7 @@ TEST(DynamicallyQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+TEST_F(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -221,15 +176,7 @@ TEST(DynamicallyQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -257,15 +204,7 @@ TEST(DynamicallyQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -293,15 +232,7 @@ TEST(DynamicallyQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -329,15 +260,7 @@ TEST(DynamicallyQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -364,15 +287,7 @@ TEST(DynamicallyQuantizedConv2D, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -399,15 +314,7 @@ TEST(DynamicallyQuantizedConv2D, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, ReluActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -435,15 +342,7 @@ TEST(DynamicallyQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, Relu6Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -471,15 +370,7 @@ TEST(DynamicallyQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -507,15 +398,7 @@ TEST(DynamicallyQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, TanhActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, TanhActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -543,15 +426,7 @@ TEST(DynamicallyQuantizedConv2D, TanhActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, SignBitActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedConv2D, SignBitActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -579,15 +454,13 @@ TEST(DynamicallyQuantizedConv2D, SignBitActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, MultiThreading) {
+TEST_F(DynamicallyQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
   delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -615,7 +488,7 @@ TEST(DynamicallyQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, WeightsCache) {
+TEST_F(DynamicallyQuantizedConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -625,9 +498,7 @@ TEST(DynamicallyQuantizedConv2D, WeightsCache) {
   delegate_options.weights_cache = weights_cache.get();
   delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -656,16 +527,14 @@ TEST(DynamicallyQuantizedConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) {
+TEST_F(DynamicallyQuantizedConv2D, TransientIndirectionBuffer) {
   TfLiteXNNPackDelegateOptions xnnpack_options =
       TfLiteXNNPackDelegateOptionsDefault();
   xnnpack_options.num_threads = 2;
   xnnpack_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
   xnnpack_options.flags |= TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(xnnpack_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
index 2f198a95195f11..2d2febcb21ab66 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include <string>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_fully_connected_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
@@ -30,9 +30,10 @@ namespace xnnpack {
 
 // Dummy class to use with parameterized test.
 class DynamicallyQuantizedFullyConnectedTest
-    : public testing::TestWithParam<WeightsType> {};
+    : public testing::WithParamInterface<WeightsType>,
+      public DelegateTest {};
 
-int GenInputChannels(const std::function<int()> &rng,
+int GenInputChannels(const std::function<int()>& rng,
                      WeightsType weights_type) {
   switch (weights_type) {
     case WeightsType::kChannelWiseQuantizedInt8:
@@ -45,14 +46,6 @@ int GenInputChannels(const std::function<int()> &rng,
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -71,14 +64,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 1D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -99,14 +84,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -128,13 +105,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 2DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -156,14 +126,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -184,14 +146,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DReshape) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -214,14 +168,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 3DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -244,14 +190,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4D) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -275,14 +213,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, 4DKeepDims) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -304,14 +234,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, NoBias) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -333,14 +255,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluActivation) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -362,14 +276,6 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, Relu6Activation) {
 }
 
 TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
-  TfLiteXNNPackDelegateOptions delegate_options =
-      TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -393,13 +299,8 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, ReluMinus1To1Activation) {
 TEST_P(DynamicallyQuantizedFullyConnectedTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  delegate_options.flags |=
-      TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
-
+  UseCustomDelegate(delegate_options);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -429,9 +330,7 @@ TEST_P(DynamicallyQuantizedFullyConnectedTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
index de863e4f1e2125..4a40e56852b56c 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/core/c/common.h"
 #include "tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct DynamicallyQuantizedTransposeConvTest : DelegateTest {};
 
+TEST_F(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -51,10 +49,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+TEST_F(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -75,11 +70,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -100,11 +91,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -125,11 +112,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -153,10 +136,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+TEST_F(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -180,11 +160,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -212,11 +188,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -244,13 +216,11 @@ TEST(DynamicallyQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
+TEST_F(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -279,7 +249,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
+TEST_F(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -287,9 +257,7 @@ TEST(DynamicallyQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
index 3bdcd343373bac..abfd76c12a14f9 100644
--- a/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/dynamically_quantized_transpose_conv_tester.cc
@@ -55,10 +55,12 @@ void DynamicallyQuantizedTransposeConvTester::Test(
   const Model* model = GetModel(buffer.data());
 
   std::unique_ptr<Interpreter> delegate_interpreter;
-  ASSERT_EQ(InterpreterBuilder(
-                model, ::tflite::ops::builtin::BuiltinOpResolverWithXNNPACK())(
-                &delegate_interpreter),
-            kTfLiteOk);
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &delegate_interpreter),
+      kTfLiteOk);
   std::unique_ptr<Interpreter> default_interpreter;
   ASSERT_EQ(
       InterpreterBuilder(
diff --git a/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h
new file mode 100644
index 00000000000000..29edbe5a35c841
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h
@@ -0,0 +1,112 @@
+/* Copyright 2025 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
+
+#include <memory>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "experimental.h"  // from @XNNPACK
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/delegates/xnnpack/weight_cache.h"
+#include "tensorflow/lite/delegates/xnnpack/weight_cache_test_helpers.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite::xnnpack {
+
+struct TfLiteDelegateDeleter {
+  void operator()(TfLiteDelegate* delegate) {
+    TfLiteXNNPackDelegateDelete(delegate);
+  }
+};
+
+using TfLiteDelegatePtr =
+    std::unique_ptr<TfLiteDelegate, TfLiteDelegateDeleter>;
+
+struct DelegateTest : public virtual testing::Test {
+  void SetUp() override {
+    TfLiteXNNPackDelegateOptions delegate_options =
+        TfLiteXNNPackDelegateOptionsDefault();
+
+    // By default, we try to setup a file weight cache to also check fingerprint
+    // generation. If the test system doesn't support a file system, then the
+    // cache file will be invalid.
+    if (cache_file.IsValid()) {
+      xnn_clear_fingerprints();
+      delegate_options.weight_cache_file_path = cache_file.GetCPath();
+      delegate_options.weight_cache_file_descriptor =
+          cache_file.Duplicate().Release();
+      delegate_options.flags |=
+          TFLITE_XNNPACK_DELEGATE_FLAG_ENABLE_LATEST_OPERATORS;
+      check_for_cache_fingerprints = true;
+    }
+
+    xnnpack_delegate =
+        TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options));
+    ASSERT_THAT(xnnpack_delegate, testing::NotNull());
+  }
+
+  void TearDown() override {
+    if (check_for_cache_fingerprints) {
+      ASSERT_TRUE(cache_file.IsValid());
+      EXPECT_TRUE(IsCompatibleCacheFile(cache_file));
+      if (AlterXNNPackFingerprints()) {
+        EXPECT_FALSE(IsCompatibleCacheFile(cache_file));
+      }
+    }
+  }
+
+  // Artificially change fingerprint values.
+  //
+  // This allows us to check that changing a fingerprint value will make the
+  // cache file incompatible.
+  //
+  // Returns the current number of fingerprints.
+  int AlterXNNPackFingerprints() {
+    int i = 0;
+    int modified = 0;
+    for (const xnn_fingerprint* fingerprint = xnn_get_fingerprint_by_idx(i);
+         fingerprint != nullptr;
+         fingerprint = xnn_get_fingerprint_by_idx(++i)) {
+      xnn_fingerprint new_fingerprint = *fingerprint;
+      ++new_fingerprint.value;
+      xnn_set_fingerprint(new_fingerprint);
+      ++modified;
+    }
+    return modified;
+  }
+
+  // Replaces the xnnpack delegate with a custom one.
+  void UseCustomDelegate(const TfLiteXNNPackDelegateOptions& delegate_options) {
+    check_for_cache_fingerprints = false;
+    xnnpack_delegate =
+        TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&delegate_options));
+    ASSERT_THAT(xnnpack_delegate, testing::NotNull());
+  }
+
+  // Replaces the xnnpack delegate with one that sets up a file backed weight
+  // cache.
+  void UseDelegateWithFileWeightCache() {}
+
+  // The default delegate is created in a generic way.
+  TfLiteDelegatePtr xnnpack_delegate;
+  tflite::xnnpack::TempFileDesc cache_file;
+  bool check_for_cache_fingerprints = false;
+};
+
+}  // namespace tflite::xnnpack
+
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_FINGERPRINT_TEST_HELPERS_H_
diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
index 92a6074c464f85..6701d0bc1c8f59 100644
--- a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc
@@ -19,18 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(FullyConnected, 1D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct FullyConnectedTest : public DelegateTest {};
 
+TEST_F(FullyConnectedTest, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -45,11 +43,7 @@ TEST(FullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 1DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 1DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto channels_rng =
@@ -65,11 +59,7 @@ TEST(FullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 2D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 2D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -87,11 +77,7 @@ TEST(FullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 2DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 2DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -110,11 +96,7 @@ TEST(FullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 3D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 3D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -133,11 +115,7 @@ TEST(FullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 3DReshape) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 3DReshape) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -156,11 +134,7 @@ TEST(FullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 3DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 3DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -180,11 +154,7 @@ TEST(FullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 4D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 4D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -204,11 +174,7 @@ TEST(FullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, 4DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, 4DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto shape_rng =
@@ -229,11 +195,7 @@ TEST(FullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -252,11 +214,7 @@ TEST(FullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -275,11 +233,7 @@ TEST(FullyConnected, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, FP16WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, FP16WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -299,11 +253,7 @@ TEST(FullyConnected, FP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -322,11 +272,7 @@ TEST(FullyConnected, DynamicWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -346,11 +292,7 @@ TEST(FullyConnected, DynamicWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -369,11 +311,7 @@ TEST(FullyConnected, DynamicBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, DynamicWeightsAndBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, DynamicWeightsAndBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -393,11 +331,7 @@ TEST(FullyConnected, DynamicWeightsAndBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -416,11 +350,7 @@ TEST(FullyConnected, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, TensorWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -440,11 +370,7 @@ TEST(FullyConnected, TensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -463,11 +389,7 @@ TEST(FullyConnected, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ChannelWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -487,11 +409,7 @@ TEST(FullyConnected, ChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -510,11 +428,7 @@ TEST(FullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -533,11 +447,7 @@ TEST(FullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(FullyConnectedTest, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -556,13 +466,11 @@ TEST(FullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, MultiThreading) {
+TEST_F(FullyConnectedTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -581,7 +489,7 @@ TEST(FullyConnected, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(FullyConnected, WeightsCache) {
+TEST_F(FullyConnectedTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -589,9 +497,7 @@ TEST(FullyConnected, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
index f67ba714b01cc8..06daba0d9bada7 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_conv_2d_test.cc
@@ -21,17 +21,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedConv2D : DelegateTest {};
 
+TEST_F(SignedQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -59,11 +58,7 @@ TEST(SignedQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -91,11 +86,7 @@ TEST(SignedQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -125,11 +116,7 @@ TEST(SignedQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -163,11 +150,7 @@ TEST(SignedQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -197,11 +180,7 @@ TEST(SignedQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -231,11 +210,7 @@ TEST(SignedQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -269,11 +244,7 @@ TEST(SignedQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -307,11 +278,7 @@ TEST(SignedQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -345,11 +312,7 @@ TEST(SignedQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -383,11 +346,7 @@ TEST(SignedQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -421,11 +380,7 @@ TEST(SignedQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -459,11 +414,7 @@ TEST(SignedQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -497,13 +448,11 @@ TEST(SignedQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, MultiThreading) {
+TEST_F(SignedQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -537,15 +486,13 @@ TEST(SignedQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(SignedQuantizedConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
index 3acfbaaf34778e..c409b18002ef51 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_depthwise_conv_2d_test.cc
@@ -20,18 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
-#include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedDepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedDepthwiseConv2D : DelegateTest {};
 
+TEST_F(SignedQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -54,11 +52,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -82,11 +76,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -110,11 +100,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -140,11 +126,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -168,11 +150,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -198,11 +176,7 @@ TEST(SignedQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -231,11 +205,7 @@ TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -264,11 +234,7 @@ TEST(SignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -301,11 +267,7 @@ TEST(SignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -338,11 +300,7 @@ TEST(SignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -375,11 +333,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -412,11 +366,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -451,11 +401,7 @@ TEST(SignedQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -488,11 +434,7 @@ TEST(SignedQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -525,11 +467,7 @@ TEST(SignedQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -562,13 +500,11 @@ TEST(SignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) {
+TEST_F(SignedQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -601,7 +537,7 @@ TEST(SignedQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
+TEST_F(SignedQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -609,9 +545,7 @@ TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -645,15 +579,13 @@ TEST(SignedQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(SignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
index 3097d314a3a6ab..5a7a9dfd77b24e 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_fully_connected_test.cc
@@ -21,17 +21,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedFullyConnected, 1D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedFullyConnected : DelegateTest {};
 
+TEST_F(SignedQuantizedFullyConnected, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -52,11 +51,7 @@ TEST(SignedQuantizedFullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 1DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 1DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -78,11 +73,7 @@ TEST(SignedQuantizedFullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 2D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 2D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -106,11 +97,7 @@ TEST(SignedQuantizedFullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 2DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 2DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -135,11 +122,7 @@ TEST(SignedQuantizedFullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 3D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 3D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -164,11 +147,7 @@ TEST(SignedQuantizedFullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 3DReshape) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 3DReshape) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -193,11 +172,7 @@ TEST(SignedQuantizedFullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 3DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 3DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -223,11 +198,7 @@ TEST(SignedQuantizedFullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 4D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 4D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -253,11 +224,7 @@ TEST(SignedQuantizedFullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, 4DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, 4DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -284,11 +251,7 @@ TEST(SignedQuantizedFullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -313,11 +276,7 @@ TEST(SignedQuantizedFullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -342,11 +301,7 @@ TEST(SignedQuantizedFullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -371,11 +326,7 @@ TEST(SignedQuantizedFullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -400,13 +351,11 @@ TEST(SignedQuantizedFullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, MultiThreading) {
+TEST_F(SignedQuantizedFullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -431,7 +380,7 @@ TEST(SignedQuantizedFullyConnected, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedFullyConnected, WeightsCache) {
+TEST_F(SignedQuantizedFullyConnected, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -439,9 +388,7 @@ TEST(SignedQuantizedFullyConnected, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
index 7daae13ebdea16..d4dceb9077ff26 100644
--- a/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_transpose_conv_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/c_api_types.h"
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct SignedQuantizedTransposeConvTest : DelegateTest {};
 
+TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -52,11 +51,7 @@ TEST(SignedQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -79,11 +74,7 @@ TEST(SignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -105,11 +96,7 @@ TEST(SignedQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -132,11 +119,7 @@ TEST(SignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -158,11 +141,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -185,11 +164,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -211,11 +186,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -238,11 +209,7 @@ TEST(SignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -267,11 +234,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -297,11 +260,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -326,11 +285,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -356,11 +311,7 @@ TEST(SignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -389,11 +340,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -423,11 +370,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -456,11 +399,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -490,11 +429,7 @@ TEST(SignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -524,11 +459,7 @@ TEST(SignedQuantizedTransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -559,13 +490,11 @@ TEST(SignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, MultiThreading) {
+TEST_F(SignedQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -595,13 +524,11 @@ TEST(SignedQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
+TEST_F(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -632,7 +559,7 @@ TEST(SignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(SignedQuantizedTransposeConvTest, WeightsCache) {
+TEST_F(SignedQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -640,9 +567,7 @@ TEST(SignedQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
index 260fd87e282a63..d37317c34f545a 100644
--- a/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/transpose_conv_test.cc
@@ -19,17 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(TransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct TransposeConvTest : DelegateTest {};
 
+TEST_F(TransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -50,11 +49,7 @@ TEST(TransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 2x2Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 2x2Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -76,11 +71,7 @@ TEST(TransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -101,11 +92,7 @@ TEST(TransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 3x3Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 3x3Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -127,11 +114,7 @@ TEST(TransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -152,11 +135,7 @@ TEST(TransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -178,11 +157,7 @@ TEST(TransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -203,11 +178,7 @@ TEST(TransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, 4x4Stride4NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, 4x4Stride4NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -229,11 +200,7 @@ TEST(TransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -257,11 +224,7 @@ TEST(TransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -286,11 +249,7 @@ TEST(TransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -314,11 +273,7 @@ TEST(TransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -343,11 +298,7 @@ TEST(TransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -375,11 +326,7 @@ TEST(TransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -408,11 +355,7 @@ TEST(TransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -440,11 +383,7 @@ TEST(TransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, StrideWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, StrideWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -473,11 +412,7 @@ TEST(TransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, FP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, FP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -506,11 +441,7 @@ TEST(TransposeConvTest, FP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, FP16WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, FP16WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -540,11 +471,7 @@ TEST(TransposeConvTest, FP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -573,11 +500,7 @@ TEST(TransposeConvTest, TensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -607,11 +530,7 @@ TEST(TransposeConvTest, TensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -640,11 +559,7 @@ TEST(TransposeConvTest, ChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -674,11 +589,7 @@ TEST(TransposeConvTest, ChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -707,11 +618,7 @@ TEST(TransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -741,11 +648,7 @@ TEST(TransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseFP16Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseFP16Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -775,11 +678,7 @@ TEST(TransposeConvTest, SparseFP16Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseFP16WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseFP16WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -810,11 +709,7 @@ TEST(TransposeConvTest, SparseFP16WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -844,11 +739,7 @@ TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -879,11 +770,7 @@ TEST(TransposeConvTest, SparseTensorWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -913,11 +800,7 @@ TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8Weights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -948,13 +831,11 @@ TEST(TransposeConvTest, SparseChannelWiseQuantizedInt8WeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, MultiThreading) {
+TEST_F(TransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -983,13 +864,11 @@ TEST(TransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, MultiThreadingNoBias) {
+TEST_F(TransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -1019,7 +898,7 @@ TEST(TransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(TransposeConvTest, WeightsCache) {
+TEST_F(TransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -1027,9 +906,7 @@ TEST(TransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
index 6660fc5af75ebe..b8c9d48f4f05a2 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_conv_2d_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedConv2D : DelegateTest {};
 
+TEST_F(UnsignedQuantizedConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -61,11 +60,7 @@ TEST(UnsignedQuantizedConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -96,11 +91,7 @@ TEST(UnsignedQuantizedConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -137,11 +128,7 @@ TEST(UnsignedQuantizedConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, Grouped) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, Grouped) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -174,11 +161,7 @@ TEST(UnsignedQuantizedConv2D, Grouped) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -211,11 +194,7 @@ TEST(UnsignedQuantizedConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -248,11 +227,7 @@ TEST(UnsignedQuantizedConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -289,11 +264,7 @@ TEST(UnsignedQuantizedConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -330,11 +301,7 @@ TEST(UnsignedQuantizedConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -371,11 +338,7 @@ TEST(UnsignedQuantizedConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -412,11 +375,7 @@ TEST(UnsignedQuantizedConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -453,11 +412,7 @@ TEST(UnsignedQuantizedConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -494,11 +449,7 @@ TEST(UnsignedQuantizedConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -535,13 +486,11 @@ TEST(UnsignedQuantizedConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, MultiThreading) {
+TEST_F(UnsignedQuantizedConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -578,15 +527,13 @@ TEST(UnsignedQuantizedConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(UnsignedQuantizedConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
index 7facb9787338c7..a269343dafc512 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_depthwise_conv_2d_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_depthwise_conv_2d_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedDepthwiseConv2D : DelegateTest {};
 
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 1x1) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -56,11 +55,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 1x1) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 2x2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -87,11 +82,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 2x2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -118,11 +109,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 3x3) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -151,11 +138,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -182,11 +165,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 5x5) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -215,11 +194,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, 5x5Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -251,11 +226,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -287,11 +258,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -327,11 +294,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -367,11 +330,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -407,11 +366,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -447,11 +402,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DilationWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -489,11 +440,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, DepthMultiplier) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -529,11 +476,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -569,11 +512,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -609,13 +548,11 @@ TEST(UnsignedQuantizedDepthwiseConv2D, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
+TEST_F(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -651,7 +588,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
+TEST_F(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -659,9 +596,7 @@ TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -698,15 +633,13 @@ TEST(UnsignedQuantizedDepthwiseConv2D, WeightsCache) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
-  TfLiteXNNPackDelegateOptions xnnpack_options =
+TEST_F(UnsignedQuantizedDepthwiseConv2D, TransientIndirectionBuffer) {
+  TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
-  xnnpack_options.num_threads = 2;
-  xnnpack_options.flags |=
+  delegate_options.num_threads = 2;
+  delegate_options.flags |=
       TFLITE_XNNPACK_DELEGATE_FLAG_TRANSIENT_INDIRECTION_BUFFER;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&xnnpack_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
index 90df47c884d042..25aabd2a559413 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_fully_connected_test.cc
@@ -20,17 +20,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_fully_connected_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedFullyConnected, 1D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedFullyConnected : DelegateTest {};
 
+TEST_F(UnsignedQuantizedFullyConnected, 1D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -54,11 +53,7 @@ TEST(UnsignedQuantizedFullyConnected, 1D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 1DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -83,11 +78,7 @@ TEST(UnsignedQuantizedFullyConnected, 1DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 2D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 2D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -114,11 +105,7 @@ TEST(UnsignedQuantizedFullyConnected, 2D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 2DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -146,11 +133,7 @@ TEST(UnsignedQuantizedFullyConnected, 2DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 3D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 3D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -178,11 +161,7 @@ TEST(UnsignedQuantizedFullyConnected, 3D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 3DReshape) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 3DReshape) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -210,11 +189,7 @@ TEST(UnsignedQuantizedFullyConnected, 3DReshape) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 3DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -243,11 +218,7 @@ TEST(UnsignedQuantizedFullyConnected, 3DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 4D) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 4D) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -276,11 +247,7 @@ TEST(UnsignedQuantizedFullyConnected, 4D) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, 4DKeepDims) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -310,11 +277,7 @@ TEST(UnsignedQuantizedFullyConnected, 4DKeepDims) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -342,11 +305,7 @@ TEST(UnsignedQuantizedFullyConnected, NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, ReluActivation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, ReluActivation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -374,11 +333,7 @@ TEST(UnsignedQuantizedFullyConnected, ReluActivation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, Relu6Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, Relu6Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -406,11 +361,7 @@ TEST(UnsignedQuantizedFullyConnected, Relu6Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto zero_point_rng = std::bind(std::uniform_int_distribution<int32_t>(
@@ -438,13 +389,11 @@ TEST(UnsignedQuantizedFullyConnected, ReluMinus1To1Activation) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedFullyConnected, MultiThreading) {
+TEST_F(UnsignedQuantizedFullyConnected, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
index 8e6a779a1979f9..5167d18443ac30 100644
--- a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_transpose_conv_test.cc
@@ -19,17 +19,16 @@ limitations under the License.
 #include <random>
 
 #include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/fingerprint_test_helpers.h"
 #include "tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h"
 #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
 
 namespace tflite {
 namespace xnnpack {
 
-TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
+struct UnsignedQuantizedTransposeConvTest : DelegateTest {};
 
+TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -51,11 +50,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -78,11 +73,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 2x2Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -104,11 +95,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -131,11 +118,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 3x3Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -157,11 +140,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -184,11 +163,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride2NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -210,11 +185,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto output_rng =
@@ -237,11 +208,7 @@ TEST(UnsignedQuantizedTransposeConvTest, 4x4Stride4NoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -266,11 +233,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -296,11 +259,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -325,11 +284,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -355,11 +310,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SmallKernelWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -388,11 +339,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -422,11 +369,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithSamePaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -455,11 +398,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPadding) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -489,11 +428,7 @@ TEST(UnsignedQuantizedTransposeConvTest, StrideWithValidPaddingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeights) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -523,11 +458,7 @@ TEST(UnsignedQuantizedTransposeConvTest, SparseWeights) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
-                       TfLiteXNNPackDelegateDelete);
-
+TEST_F(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
   auto batch_rng =
@@ -558,13 +489,11 @@ TEST(UnsignedQuantizedTransposeConvTest, SparseWeightsNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) {
+TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreading) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -594,13 +523,11 @@ TEST(UnsignedQuantizedTransposeConvTest, MultiThreading) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
+TEST_F(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   delegate_options.num_threads = 2;
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
@@ -631,7 +558,7 @@ TEST(UnsignedQuantizedTransposeConvTest, MultiThreadingNoBias) {
       .Test(xnnpack_delegate.get());
 }
 
-TEST(UnsignedQuantizedTransposeConvTest, WeightsCache) {
+TEST_F(UnsignedQuantizedTransposeConvTest, WeightsCache) {
   TfLiteXNNPackDelegateOptions delegate_options =
       TfLiteXNNPackDelegateOptionsDefault();
   std::unique_ptr<TfLiteXNNPackDelegateWeightsCache,
@@ -639,9 +566,7 @@ TEST(UnsignedQuantizedTransposeConvTest, WeightsCache) {
       weights_cache(TfLiteXNNPackDelegateWeightsCacheCreate(),
                     TfLiteXNNPackDelegateWeightsCacheDelete);
   delegate_options.weights_cache = weights_cache.get();
-  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
-      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
-                       TfLiteXNNPackDelegateDelete);
+  UseCustomDelegate(delegate_options);
 
   std::random_device random_device;
   auto rng = std::mt19937(random_device());
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.cc b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
index a8c86ff5a25529..9aaf497700f87f 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include <unistd.h>
 #endif
 
+#include <algorithm>
 #include <cerrno>  // IWYU pragma: keep
 #include <cinttypes>
 #include <cstddef>
@@ -37,6 +38,7 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#include "experimental.h"  // from @XNNPACK
 #include "xnnpack.h"  // from @XNNPACK
 #include "flatbuffers/flatbuffer_builder.h"  // from @flatbuffers
 #include "flatbuffers/verifier.h"  // from @flatbuffers
@@ -78,6 +80,20 @@ bool FileExists(const char* path) {
   return access(path, F_OK) != -1;
 }
 
+bool CheckFingerprints(const cache::schema::BufferList* buffer_list) {
+  if (buffer_list->fingerprints()) {
+    for (uint64_t cache_fingerprint : *buffer_list->fingerprints()) {
+      xnn_fingerprint fingerprint;
+      static_assert(sizeof(fingerprint) == sizeof(cache_fingerprint));
+      std::memcpy(&fingerprint, &cache_fingerprint, sizeof(fingerprint));
+      XNNPACK_RETURN_CHECK(
+          xnn_check_fingerprint(fingerprint) == xnn_status_success,
+          "fingerprint (id: 0x%x) could not be matched", fingerprint.id);
+    }
+  }
+  return true;
+}
+
 }  // namespace
 
 #define XNN_MOVE_CONSTRUCT_MEMBER(x) x(std::move(other.x))
@@ -182,7 +198,8 @@ void* WeightCacheBuilder::Reserve(size_t size) {
 }
 
 BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id,
-                                          const void* data, uint64_t size) {
+                                          const void* data, uint64_t size,
+                                          int32_t fingerprint_id) {
   XNNPACK_ABORT_CHECK(is_build_step_,
                       "cannot append data to an unstarted builder.");
   // Add some padding so that the cache file can be mmaped and the buffer
@@ -201,6 +218,34 @@ BufferLocation WeightCacheBuilder::Append(PackIdentifier pack_id,
   buffer.size = loc.size;
   schema_.buffers.push_back(std::make_unique<cache::schema::BufferT>(buffer));
 
+  // Not passing a fingerprint id is a logic error on XNNPack's side. If we
+  // don't have a fingerprint for an operation, we have no way of ensuring that
+  // the generation of the cached data hasn't changed when reloading the cache.
+  //
+  // If we just log this and continue on with the work. This run will build a
+  // cache with cached data that can't be checked in the future. This will lead,
+  // in future runs that reuse the cache, to crashes that are impossible to
+  // debug or outputs that are nonsensical without any chance of linking this
+  // back to this error.
+  //
+  // We abort because we have no way of making that failure bubble up to the
+  // calling code to handle it gracefully...
+  XNNPACK_ABORT_CHECK(fingerprint_id != 0,
+                      "XNNPack weight cache: no fingerprint identifier was set "
+                      "when appending a buffer to the cache file.");
+  const xnn_fingerprint* fingerprint = xnn_get_fingerprint(fingerprint_id);
+  XNNPACK_ABORT_CHECK(fingerprint,
+                      "XNNPack weight cache: could not find a fingerprint with "
+                      "id 0x%x when appending a buffer to the cache file.",
+                      fingerprint_id);
+  uint64_t fingerprint_value;
+  static_assert(sizeof(fingerprint_value) == sizeof(*fingerprint));
+  std::memcpy(&fingerprint_value, fingerprint, sizeof(*fingerprint));
+  if (std::find(schema_.fingerprints.begin(), schema_.fingerprints.end(),
+                fingerprint_value) == schema_.fingerprints.end()) {
+    schema_.fingerprints.push_back(fingerprint_value);
+  }
+
   if (!fd_.Write(data, size)) {
     TFLITE_LOG_PROD(tflite::TFLITE_LOG_ERROR,
                     "XNNPack weight cache: cannot append buffer to cache file");
@@ -233,16 +278,7 @@ bool WeightCacheBuilder::StopBuildStep() {
   XNNPACK_RETURN_CHECK(fd_.SetPos(layout_offset) != -1,
                        "could not move in the file: %s", strerror(errno));
 
-  XNNPACK_RETURN_CHECK(
-      sizeof(XNNPackCacheHeader::xnnpack_build_identifier) ==
-          xnn_experimental_get_build_identifier_size(),
-      "cache file ('%s') header cannot hold XNNPack's build identifier: %s.",
-      file_path_.c_str(), strerror(errno));
-
   XNNPackCacheHeader header{XNNPackCacheHeader::kVersion};
-  memcpy(header.xnnpack_build_identifier,
-         xnn_experimental_get_build_identifier_data(),
-         xnn_experimental_get_build_identifier_size());
   header.buffer_list_offset = fd_.GetPos();
   header.buffer_list_size = builder.GetSize();
 
@@ -405,12 +441,6 @@ bool MMapWeightCacheProvider::Load() {
                        ", expected %" PRIu64 ". Cache needs to be built again.",
                        header.version, XNNPackCacheHeader::kVersion);
 
-  XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
-                           header.xnnpack_build_identifier,
-                           sizeof(header.xnnpack_build_identifier)),
-                       "XNNPack weight cache: incompatible XNNPack version. "
-                       "Cache needs to be built again.");
-
   XNNPACK_RETURN_CHECK(header.buffer_list_offset < mmap_handle.size(),
                        "invalid offset for buffer list descriptor.");
 
@@ -430,6 +460,8 @@ bool MMapWeightCacheProvider::Load() {
   XNNPACK_RETURN_CHECK(buffer_list,
                        "could not get packed weights from flatbuffer.");
 
+  XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list));
+
   mmap_buffer_base_offset_ = buffer_list->base_offset();
   if (const auto buffers = buffer_list->buffers(); buffers) {
     for (auto* buffer : *buffers) {
@@ -584,7 +616,8 @@ size_t MMapWeightCacheProvider::LookUpOrInsert(
     return offset_it->second.offset;
   }
 
-  const BufferLocation location = builder_.Append(pack_id, ptr, size);
+  const BufferLocation location =
+      builder_.Append(pack_id, ptr, size, cache_key->fingerprint_id);
   XNNPACK_ABORT_CHECK(!location.IsInvalid(),
                       "Inserting data in the cache failed.");
   cache_key_to_offset_.emplace(pack_id, location);
@@ -693,10 +726,20 @@ bool IsCompatibleCacheFile(FileDescriptorView fd) {
                        "Cache header version is incompatible. Expected %" PRIu64
                        ", got %" PRIu64 ".",
                        XNNPackCacheHeader::kVersion, header.version);
-  XNNPACK_RETURN_CHECK(xnn_experimental_check_build_identifier(
-                           header.xnnpack_build_identifier,
-                           sizeof(header.xnnpack_build_identifier)),
-                       "Cache header build identifier is different.");
+
+  fd.SetPos(header.buffer_list_offset);
+  auto buffer = std::make_unique<uint8_t[]>(header.buffer_list_size);
+  XNNPACK_RETURN_CHECK(fd.Read(buffer.get(), header.buffer_list_size));
+
+  flatbuffers::Verifier verifier(buffer.get(), header.buffer_list_size);
+  XNNPACK_RETURN_CHECK(cache::schema::VerifyBufferListBuffer(verifier),
+                       "buffer list validation failed.");
+
+  const cache::schema::BufferList* buffer_list =
+      cache::schema::GetBufferList(buffer.get());
+  XNNPACK_RETURN_CHECK(buffer_list,
+                       "could not get packed weights from flatbuffer.");
+  XNNPACK_RETURN_CHECK(CheckFingerprints(buffer_list));
   return true;
 }
 
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache.h b/tensorflow/lite/delegates/xnnpack/weight_cache.h
index a7c8654df4f7ec..781422b4bec662 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache.h
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache.h
@@ -56,9 +56,8 @@ inline constexpr char kInMemoryCachePath[] = ":memory";
 // When reading a cache file, the cache should be rejected if `version`
 // doesn't match `kVersion`.
 struct XNNPackCacheHeader {
-  enum : uint64_t { kInvalidHeader = 0, kVersion = 1 };
+  enum : uint64_t { kInvalidHeader = 0, kVersion = 2 };
   uint64_t version;
-  uint8_t xnnpack_build_identifier[32];
   uint64_t buffer_list_offset;
   uint64_t buffer_list_size;
 };
@@ -161,8 +160,8 @@ class WeightCacheBuilder {
   // The buffer space must have been reserved before using `Reserve`. If not, a
   // new call to `Reserve` will be done and the data will be copied over.
   [[nodiscard /*The location to the appended data should be saved.*/]]
-  BufferLocation Append(PackIdentifier pack_id, const void* data,
-                        uint64_t size);
+  BufferLocation Append(PackIdentifier pack_id, const void* data, uint64_t size,
+                        int fingerprint_id);
 
   // Writes the flatbuffer to disk.
   [[nodiscard /*Writing the weight cache can fail.*/]]
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
index 33566b8be2208a..37f19612010709 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_schema.fbs
@@ -32,11 +32,14 @@ table Buffer {
 }
 
 table BufferList {
+  /// A list of packing fingerprints. All of these need to be checked when
+  /// loading the cache to ensure that it is compatible.
+  fingerprints: [uint64];
   /// A list of buffers.
   buffers: [Buffer];
   /// Defines the base offset for the data in the file. That offset
   /// may be needed to guarantee data alignment.
-  base_offset:uint64;
+  base_offset: uint64;
 }
 
 root_type BufferList;
diff --git a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
index dd3093b2736517..c1e4071ff4a353 100644
--- a/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
+++ b/tensorflow/lite/delegates/xnnpack/weight_cache_test.cc
@@ -35,6 +35,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "experimental.h"  // from @XNNPACK
 #include "xnnpack.h"  // from @XNNPACK
 #include "flatbuffers/verifier.h"  // from @flatbuffers
 #include "tensorflow/lite/c/common.h"
@@ -56,7 +57,13 @@ namespace {
 
 using testing::ElementsAreArray;
 
-TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
+static xnn_fingerprint kDefaultFingerprint{/*id=*/0xf00d, /*value=*/0xb33f};
+
+struct WeightCacheBuilderTest : testing::Test {
+  void SetUp() override { xnn_set_fingerprint(kDefaultFingerprint); }
+};
+
+TEST_F(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   using std::size;
 
   const std::string payload = "This is some data in the file.";
@@ -72,7 +79,8 @@ TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   const size_t payload_size = size(payload);
   void* buffer = builder.Reserve(payload_size);
   std::memcpy(buffer, payload.c_str(), payload_size);
-  auto loc = builder.Append(dummy_id, buffer, payload_size);
+  auto loc =
+      builder.Append(dummy_id, buffer, payload_size, kDefaultFingerprint.id);
 
   EXPECT_EQ(loc.size, payload_size);
   EXPECT_GE(builder.capacity(), payload_size);
@@ -123,7 +131,7 @@ TEST(WeightCacheBuilderTest, ReserveAppendWriteWorks) {
   EXPECT_THAT(cache_data, ElementsAreArray(payload));
 }
 
-TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
+TEST_F(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   using std::size;
 
   const std::string payload = "This is some data in the file.";
@@ -137,7 +145,8 @@ TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   ASSERT_TRUE(builder.StartBuildStep());
 
   const size_t payload_size = size(payload);
-  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size);
+  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size,
+                            kDefaultFingerprint.id);
 
   EXPECT_EQ(loc.size, payload_size);
 
@@ -186,7 +195,7 @@ TEST(WeightCacheBuilderTest, AppendWithoutReserveWriteWorks) {
   EXPECT_THAT(cache_data, ElementsAreArray(payload));
 }
 
-TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
+TEST_F(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   const std::string cache_path = testing::TempDir() + "/cache";
   const std::string payload = "This is some data in the file.";
   const PackIdentifier dummy_id{1, 2, 3};
@@ -198,7 +207,8 @@ TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   ASSERT_TRUE(builder.StartBuildStep());
 
   const size_t payload_size = size(payload);
-  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size);
+  auto loc = builder.Append(dummy_id, payload.c_str(), payload_size,
+                            kDefaultFingerprint.id);
   EXPECT_EQ(loc.size, payload_size);
   ASSERT_TRUE(builder.StopBuildStep());
 
@@ -218,13 +228,13 @@ TEST(WeightCacheBuilderTest, CorruptBufferListFailsGracefully) {
   EXPECT_FALSE(builder.StartBuildStep());
 }
 
-TEST(WeightCacheBuilderTest, InvalidFileDescriptorFails) {
+TEST_F(WeightCacheBuilderTest, InvalidFileDescriptorFails) {
   WeightCacheBuilder builder;
   EXPECT_FALSE(builder.Start("", FileDescriptor()));
   EXPECT_FALSE(builder.Start("/seldf/sedsft", FileDescriptor()));
 }
 
-TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
+TEST_F(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
   if (!TfLiteXNNPackDelegateCanUseInMemoryWeightCacheProvider()) {
     GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                     "isn't supported by the current system, skipping test.";
@@ -239,7 +249,7 @@ TEST(WeightCacheBuilderTest, InMemoryCacheCanBeBuilt) {
   EXPECT_EQ(errno, ENOENT);
 }
 
-TEST(WeightCacheBuilderTest, MultipleStepBuild) {
+TEST_F(WeightCacheBuilderTest, MultipleStepBuild) {
   using std::size;
 
   const std::string payload1 = "This is some data in the file.";
@@ -262,7 +272,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload1);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload1.c_str(), payload_size);
-    const auto loc = builder.Append(dummy_id1, buffer, payload_size);
+    const auto loc =
+        builder.Append(dummy_id1, buffer, payload_size, kDefaultFingerprint.id);
     EXPECT_EQ(loc.size, payload_size);
     EXPECT_GE(builder.capacity(), payload_size);
   }
@@ -270,7 +281,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload3);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload3.c_str(), payload_size);
-    const auto loc = builder.Append(dummy_id3, buffer, payload_size);
+    const auto loc =
+        builder.Append(dummy_id3, buffer, payload_size, kDefaultFingerprint.id);
     (void)loc;
   }
 
@@ -284,7 +296,8 @@ TEST(WeightCacheBuilderTest, MultipleStepBuild) {
     const size_t payload_size = size(payload2);
     void* buffer = builder.Reserve(payload_size);
     std::memcpy(buffer, payload2.c_str(), payload_size);
-    const auto loc = builder.Append(dummy_id2, buffer, payload_size);
+    const auto loc =
+        builder.Append(dummy_id2, buffer, payload_size, kDefaultFingerprint.id);
     EXPECT_EQ(loc.size, payload_size);
     EXPECT_GE(builder.capacity(), payload_size);
   }
@@ -389,7 +402,8 @@ struct FakeContext {
                                           const int weights_index) const {
     return {.seed = algorithm_seed,
             .kernel = buffers[weights_index].data(),
-            .bias = nullptr};
+            .bias = nullptr,
+            .fingerprint_id = kDefaultFingerprint.id};
   }
 
   // Creates a look up key for the XNNPack weight provider C interface.
@@ -398,7 +412,8 @@ struct FakeContext {
                                           const int bias_index) const {
     return {.seed = algorithm_seed,
             .kernel = buffers[weights_index].data(),
-            .bias = buffers[bias_index].data()};
+            .bias = buffers[bias_index].data(),
+            .fingerprint_id = kDefaultFingerprint.id};
   }
 
   // Helps creating fake packed data.
@@ -505,6 +520,7 @@ struct BuildMMapWeightCacheProviderTest : testing::TestWithParam<TestVariant> {
       GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                       "isn't supported by the current system, skipping test.";
     }
+    xnn_set_fingerprint(kDefaultFingerprint);
     AddTensors();
     EndSetup();
   }
@@ -723,6 +739,7 @@ struct MMapWeightCacheProviderTest : testing::TestWithParam<TestVariant> {
       GTEST_SKIP() << "In-memory weight cache isn't enabled for this build or "
                       "isn't supported by the current system, skipping test.";
     }
+    xnn_set_fingerprint(kDefaultFingerprint);
   }
   bool use_explicit_fd = GetParam().use_explicit_fd;
   const char* const explicit_fd_path = GetParam().explicit_fd_path;
@@ -783,12 +800,14 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_1{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[0].data.data,
-        .bias = tensors[1].data.data};
+        .bias = tensors[1].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const xnn_weights_cache_look_up_key look_up_key_3{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[3].data.data,
-        .bias = tensors[4].data.data};
+        .bias = tensors[4].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     // Lookup non-packed tensor.
     ASSERT_EQ(cache->look_up(cache, &look_up_key_1), SIZE_MAX);
@@ -829,7 +848,8 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_2{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[2].data.data,
-        .bias = tensors[3].data.data};
+        .bias = tensors[3].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const size_t build_offset_2 = cache->look_up_or_insert(
         cache, &look_up_key_2, (void*)packed_data_ref_2,
@@ -904,17 +924,20 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
     const xnn_weights_cache_look_up_key look_up_key_1{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[0].data.data,
-        .bias = tensors[1].data.data};
+        .bias = tensors[1].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const xnn_weights_cache_look_up_key look_up_key_2{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[2].data.data,
-        .bias = tensors[3].data.data};
+        .bias = tensors[3].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     const xnn_weights_cache_look_up_key look_up_key_3{
         .seed = fake_packing_algo_seed,
         .kernel = tensors[3].data.data,
-        .bias = tensors[4].data.data};
+        .bias = tensors[4].data.data,
+        .fingerprint_id = kDefaultFingerprint.id};
 
     ASSERT_TRUE(cache->is_finalized(cache));
 
@@ -945,30 +968,59 @@ TEST_P(MMapWeightCacheProviderTest, XnnpackCApiJourney) {
   }
 }
 
-TEST_P(MMapWeightCacheProviderTest, XnnpackRebuildOnVersionMismatch) {
+TEST_P(MMapWeightCacheProviderTest, CacheIsRebuiltOnFingerprintMismatch) {
+  if (use_in_memory_cache) {
+    GTEST_SUCCEED() << "In-memory cache is never reloaded.";
+    return;
+  }
   TempFileDesc temp_fd;
   const char* temp_fd_cpath = explicit_fd_path;
-  FileDescriptor temp_fd_value = temp_fd.Duplicate();
 
-  {  // Set bad build identifier
-    XNNPackCacheHeader header{.version = XNNPackCacheHeader::kVersion};
-    header.xnnpack_build_identifier[0] += 1;
-    ASSERT_TRUE(temp_fd_value.Write(&header, sizeof(header)));
+  xnn_fingerprint test_fingeprint{0x7357, 0xF33D};
+  {  // Build a cache file with a specific fingerprint.
+    // Clear fingerprints and add a test fingerprint to XNNPack.
+    xnn_clear_fingerprints();
+    xnn_set_fingerprint(test_fingeprint);
+
+    // Build a cache file.
+    MMapWeightCacheProvider cache_provider;
+
+    const char kernel[] = "Fake data.";
+    TfLiteTensor tensor;
+    tensor.data.data = (void*)kernel;
+    cache_provider.MapTensorIdentifiers(
+        &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}});
+    ASSERT_TRUE(
+        cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate()));
+    ASSERT_TRUE(cache_provider.StartBuildStep());
+    const xnn_weights_cache_look_up_key look_up_key_1{
+        .seed = 1234,
+        .kernel = kernel,
+        .bias = nullptr,
+        .fingerprint_id = test_fingeprint.id};
+    xnn_weights_cache_t cache = &cache_provider.GetCacheProvider();
+    const size_t build_offset_1 = cache->look_up_or_insert(
+        cache, &look_up_key_1,
+        const_cast<void*>(reinterpret_cast<const void*>(kernel)),
+        sizeof(kernel));
+    (void)build_offset_1;
+    ASSERT_TRUE(cache_provider.StopBuildStep());
   }
 
   if (!use_explicit_fd) {
     temp_fd.Close();
     temp_fd_cpath = temp_fd.GetCPath();
-    temp_fd_value.Close();
-    if (use_in_memory_cache) {
-      temp_fd_cpath = kInMemoryCachePath;
-    }
   }
 
+  // Change the test fingerprint value.
+  test_fingeprint.value = 0xdeadb33f;
+  xnn_set_fingerprint(test_fingeprint);
+
+  // Reload the file.
   auto build_cache_provider = std::make_unique<MMapWeightCacheProvider>();
   MMapWeightCacheProvider& cache_provider = *build_cache_provider;
-  ASSERT_TRUE(cache_provider.LoadOrStartBuild(temp_fd_cpath,
-                                              temp_fd_value.Duplicate()));
+  ASSERT_TRUE(
+      cache_provider.LoadOrStartBuild(temp_fd_cpath, temp_fd.Duplicate()));
   ASSERT_TRUE(cache_provider.StartBuildStep());
 }
 
@@ -980,29 +1032,53 @@ class IsCompatibleCacheFileTest
   using Param = IsCompatibleCacheFileTestOverload;
 
   void SetUp() override {
-    header_.version = XNNPackCacheHeader::kVersion;
-    memcpy(header_.xnnpack_build_identifier,
-           xnn_experimental_get_build_identifier_data(),
-           xnn_experimental_get_build_identifier_size());
+    xnn_clear_fingerprints();
+    xnn_set_fingerprint(kDefaultFingerprint);
+
+    // Build a cache file.
+    MMapWeightCacheProvider cache_provider;
+
+    const char kernel[] = "Fake data.";
+    TfLiteTensor tensor;
+    tensor.data.data = (void*)kernel;
+    cache_provider.MapTensorIdentifiers(
+        &tensor, /*size=*/1, /*tensor_index_to_identifier=*/{{0, 1}});
+    ASSERT_TRUE(
+        cache_provider.LoadOrStartBuild(fd_.GetCPath(), fd_.Duplicate()));
+    ASSERT_TRUE(cache_provider.StartBuildStep());
+    const xnn_weights_cache_look_up_key look_up_key_1{
+        .seed = 1234,
+        .kernel = kernel,
+        .bias = nullptr,
+        .fingerprint_id = kDefaultFingerprint.id};
+    xnn_weights_cache_t cache = &cache_provider.GetCacheProvider();
+    const size_t build_offset_1 = cache->look_up_or_insert(
+        cache, &look_up_key_1,
+        const_cast<void*>(reinterpret_cast<const void*>(kernel)),
+        sizeof(kernel));
+    (void)build_offset_1;
+    ASSERT_TRUE(cache_provider.StopBuildStep());
   }
 
-  bool WriteHeaderAndReturnIsCompatibleCacheFile() {
-    if (!fd_.Write(&header_, sizeof(header_))) {
-      return false;
-    }
-    if (GetParam() == Param::kPath) {
-      fd_.Close();
-      return IsCompatibleCacheFile(fd_.GetCPath());
-    } else {
-      const FileDescriptor::Offset pos = fd_.GetPos();
-      EXPECT_NE(pos, 0);  // Ensure that we are testing with a non 0 position.
-      const bool compatible = IsCompatibleCacheFile(fd_);
-      EXPECT_EQ(pos, fd_.GetPos());
-      return compatible;
+  void ChangeRuntimeFingerprintValue() {
+    xnn_set_fingerprint(
+        {kDefaultFingerprint.id, kDefaultFingerprint.value + 1});
+  }
+
+  bool CallIsCompatibleCacheFile() {
+    switch (GetParam()) {
+      case Param::kPath:
+        fd_.Close();
+        return IsCompatibleCacheFile(fd_.GetCPath());
+      case Param::kDescriptor: {
+        const auto pos = fd_.GetPos();
+        EXPECT_NE(pos, 0);  // We test with a non zero position.
+        return IsCompatibleCacheFile(fd_);
+        EXPECT_EQ(fd_.GetPos(), pos);
+      }
     }
   }
 
-  XNNPackCacheHeader header_{};
   TempFileDesc fd_;
 };
 
@@ -1016,18 +1092,18 @@ std::string Name(
   }
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsTrueForACorrectHeader) {
-  EXPECT_TRUE(WriteHeaderAndReturnIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsTrueWhenFingerprintMatches) {
+  EXPECT_TRUE(CallIsCompatibleCacheFile());
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongHeaderVersion) {
-  header_.version += 1;
-  EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintMismatches) {
+  ChangeRuntimeFingerprintValue();
+  EXPECT_FALSE(CallIsCompatibleCacheFile());
 }
 
-TEST_P(IsCompatibleCacheFileTest, ReturnsFalseForWrongBuildIdentifier) {
-  header_.xnnpack_build_identifier[0] += 1;
-  EXPECT_FALSE(WriteHeaderAndReturnIsCompatibleCacheFile());
+TEST_P(IsCompatibleCacheFileTest, ReturnsFalseWhenFingerprintIsNotFound) {
+  xnn_clear_fingerprints();
+  EXPECT_FALSE(CallIsCompatibleCacheFile());
 }
 
 INSTANTIATE_TEST_SUITE_P(
diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index aa11394dd86d9e..14e4370cbbd929 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -49,5 +49,6 @@ include_directories(
    "${PTHREADPOOL_SOURCE_DIR}/include"
    "${FP16_SOURCE_DIR}/include"
    "${XNNPACK_SOURCE_DIR}/include"
+   "${XNNPACK_SOURCE_DIR}"
    "${CPUINFO_SOURCE_DIR}/"
 )

From 69c656e60b7a2abc36e7fc3db3e95efea146d07b Mon Sep 17 00:00:00 2001
From: Praneeth Mandala <praneman@google.com>
Date: Tue, 23 Dec 2025 12:30:28 -0800
Subject: [PATCH 722/753] Add TPU performance counters to XSpace.

PiperOrigin-RevId: 848263591
---
 third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc | 3 +++
 third_party/xla/xla/tsl/profiler/utils/xplane_schema.h  | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc
index ba21b7bcdb7db9..57d68c8dfdad36 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc
@@ -66,6 +66,7 @@ const absl::string_view kKernelLaunchLineName = "Launch Stats";
 const absl::string_view kSourceLineName = "Source code";
 const absl::string_view kHostOffloadOpLineName = "Host Offload Ops";
 const absl::string_view kCounterEventsLineName = "_counters_";
+const absl::string_view kCounterValue = "counter_value";
 const absl::string_view kTensorCoreSyncFlagLineName = "Tensor Core Sync Flag";
 const absl::string_view kSparseCoreSyncsLineName = "Sparse Core Syncs";
 
@@ -221,6 +222,7 @@ const StatTypeMap& GetStatTypeMap() {
        {"queue_addr", kQueueAddr},
        {"queue_id", kQueueId},
        {"request_id", kRequestId},
+       {"global_chip_id", kGlobalChipId},
        {"run_id", kRunId},
        {"replica_id", kReplicaId},
        {"graph_type", kGraphType},
@@ -297,6 +299,7 @@ const StatTypeMap& GetStatTypeMap() {
        {"dcn_collective_info", kDcnCollectiveInfo},
        {"all_reduce_id", kAllReduceId},
        {"all_reduce_unique_id", kAllReduceUniqueId},
+       {"performance_counter_id", kPerformanceCounterId},
        // Performance counter related.
        {"Raw Value", kRawValue},
        {"Scaled Value", kScaledValue},
diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h
index b8b6b0afc4c4d0..dba787403d798f 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h
@@ -84,6 +84,7 @@ TF_CONST_INIT extern const absl::string_view kXlaAsyncOpLineName;
 TF_CONST_INIT extern const absl::string_view kKernelLaunchLineName;
 TF_CONST_INIT extern const absl::string_view kSourceLineName;
 TF_CONST_INIT extern const absl::string_view kCounterEventsLineName;
+TF_CONST_INIT extern const absl::string_view kCounterValue;
 TF_CONST_INIT extern const absl::string_view kHostOffloadOpLineName;
 TF_CONST_INIT extern const absl::string_view kTensorCoreSyncFlagLineName;
 TF_CONST_INIT extern const absl::string_view kSparseCoreSyncsLineName;
@@ -209,6 +210,7 @@ enum StatType {
   kQueueId,
   kQueueAddr,
   kRequestId,
+  kGlobalChipId,
   kRunId,
   kReplicaId,
   kGraphType,
@@ -285,6 +287,7 @@ enum StatType {
   kBytesTransferred,
   kDmaQueue,
   kDcnCollectiveInfo,
+  kPerformanceCounterId,
   // Performance counter related.
   kRawValue,
   kScaledValue,

From b39d0a9077f03953292658d500dcadead677a815 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 23 Dec 2025 13:07:55 -0800
Subject: [PATCH 723/753] disable
 //third_party/tensorflow/compiler/xla/service/gpu:determinism_test_h100 for
 now

PiperOrigin-RevId: 848274769
---
 third_party/xla/xla/service/gpu/BUILD | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 49cab857c01cd0..18e97bd54a005d 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -3181,6 +3181,11 @@ xla_test(
     name = "determinism_test",
     srcs = ["determinism_test.cc"],
     backends = ["gpu"],
+    # TODO(b/471244513) disabled because it times out.
+    tags = [
+        "manual",
+        "notap",
+    ],
     deps = [
         "//xla:literal",
         "//xla:xla_proto_cc",

From a554adf928274d0f2f2dedd95882642d8b104778 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 23 Dec 2025 13:56:20 -0800
Subject: [PATCH 724/753] add missing dependencies

PiperOrigin-RevId: 848289158
---
 tensorflow/compiler/jit/BUILD | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 1ed658d73f1a4a..91313abca45a24 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -2101,12 +2101,15 @@ tf_cuda_cc_test(
     srcs = ["xla_platform_info_test.cc"],
     tags = tf_cuda_tests_tags() + ["config-cuda-only"],
     deps = [
+        ":device_compilation_profiler",
+        ":device_compiler",
         ":flags_headers",
         ":test_util",
         ":xla_device_no_jit_rewrite_registration",
         ":xla_gpu_device",
         ":xla_gpu_jit",
         "//tensorflow/compiler/tf2xla:layout_util",
+        "//tensorflow/compiler/tf2xla:xla_op_registry",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_types_hdr",
         "//tensorflow/core:lib_proto_parsing",
@@ -2118,8 +2121,12 @@ tf_cuda_cc_test(
         "//tensorflow/core/tfrt/common:create_pjrt_client_util",
         "//tensorflow/core/tfrt/common:pjrt_util",
         "//tensorflow/core/tpu:tpu_defs",
+        "@com_google_absl//absl/log:check",
         "@com_google_googletest//:gtest_main",
+        "@local_xla//xla/client:local_client",
+        "@local_xla//xla/pjrt:pjrt_client",
         "@local_xla//xla/pjrt/plugin/xla_cpu:cpu_client_options",
         "@local_xla//xla/pjrt/plugin/xla_cpu:xla_cpu_pjrt_client",
+        "@local_xla//xla/tsl/platform:statusor",
     ],
 )

From c16ae6c9195887f62a38fa5f6e3a5dbe859685a0 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 23 Dec 2025 14:23:10 -0800
Subject: [PATCH 725/753] Add proto serialization for SendThunk

PiperOrigin-RevId: 848297480
---
 .../xla/xla/backends/gpu/runtime/BUILD        | 20 +++++
 .../backends/gpu/runtime/p2p_thunk_common.cc  |  3 +-
 .../xla/backends/gpu/runtime/send_thunk.cc    | 89 +++++++++++++++++--
 .../xla/xla/backends/gpu/runtime/send_thunk.h | 14 +++
 .../backends/gpu/runtime/send_thunk_test.cc   | 75 ++++++++++++++++
 .../xla/xla/backends/gpu/runtime/thunk.proto  | 12 +++
 .../runtime/thunk_proto_deserialization.cc    |  5 ++
 7 files changed, 211 insertions(+), 7 deletions(-)
 create mode 100644 third_party/xla/xla/backends/gpu/runtime/send_thunk_test.cc

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 5b57b09dffc68d..1545c1fd615a66 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -1993,16 +1993,35 @@ cc_library(
         "//xla/hlo/ir:collective_op_group_mode",
         "//xla/hlo/ir:hlo",
         "//xla/runtime:device_id",
+        "//xla/service:buffer_assignment",
         "//xla/service:computation_placer",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:stream",
         "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:status_macros",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_cc_test(
+    name = "send_thunk_test",
+    srcs = ["send_thunk_test.cc"],
+    deps = [
+        ":collective_thunk",
+        ":send_thunk",
+        ":thunk",
+        ":thunk_proto_cc",
+        "//xla/service:buffer_assignment",
+        "//xla/tsl/util/proto:parse_text_proto",
+        "//xla/tsl/util/proto:proto_matchers",
+        "@com_google_googletest//:gtest_main",
     ],
 )
 
@@ -2953,6 +2972,7 @@ cc_library(
         ":outfeed_thunk",
         ":ragged_all_to_all_thunk",
         ":replica_id_thunk",
+        ":send_thunk",
         ":sequential_thunk",
         ":thunk",
         ":thunk_proto_cc",
diff --git a/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc b/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc
index 52beaae671e837..035a9bb85932f1 100644
--- a/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/p2p_thunk_common.cc
@@ -127,8 +127,7 @@ P2PConfig GetP2PConfigForSendRecv(const HloSendRecvInstruction* instr,
   }
 
   std::vector<ReplicaGroup> replica_groups = statusor.value();
-  P2PConfig::ValidationKind validation_kind = P2PConfig::ValidationKind::kValid;
-  p2p_config.validation_kind = validation_kind;
+  p2p_config.validation_kind = P2PConfig::ValidationKind::kValid;
   for (const ReplicaGroup& replica_group : replica_groups) {
     int64_t source = replica_group.replica_ids(0);
     int64_t target = replica_group.replica_ids(1);
diff --git a/third_party/xla/xla/backends/gpu/runtime/send_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/send_thunk.cc
index 53d39d7d5e3093..a2144f91b4c924 100644
--- a/third_party/xla/xla/backends/gpu/runtime/send_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/send_thunk.cc
@@ -16,15 +16,18 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/send_thunk.h"
 
 #include <cstdint>
+#include <memory>
 #include <optional>
 #include <string>
 #include <utility>
 #include <vector>
 
+#include "absl/log/check.h"
 #include "absl/log/log.h"
 #include "absl/status/status.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
@@ -35,6 +38,7 @@ limitations under the License.
 #include "xla/hlo/ir/collective_op_group_mode.h"
 #include "xla/hlo/ir/hlo_instructions.h"
 #include "xla/runtime/device_id.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/service/computation_placer.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/device_address.h"
@@ -42,6 +46,7 @@ limitations under the License.
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
 #include "xla/xla_data.pb.h"
+#include "xla/tsl/platform/status_macros.h"
 
 namespace xla {
 namespace gpu {
@@ -49,16 +54,24 @@ namespace gpu {
 SendThunk::SendThunk(ThunkInfo thunk_info, const HloSendInstruction* instr,
                      int64_t replica_count, int64_t partition_count,
                      const Buffer& buffer)
-    : CollectiveThunk(Thunk::kSend, thunk_info,
-                      /*is_sync=*/false, GetStreamKindForP2P(instr)),
-      config_(GetP2PConfigForSendRecv(instr, instr->operand(0)->shape(),
-                                      replica_count, partition_count)),
+    : SendThunk(std::move(thunk_info),
+                GetP2PConfigForSendRecv(instr, instr->operand(0)->shape(),
+                                        replica_count, partition_count),
+                std::make_shared<CollectiveThunk::AsyncEvents>(),
+                GetStreamKindForP2P(instr), buffer, instr->name()) {}
+
+SendThunk::SendThunk(ThunkInfo thunk_info, const P2PConfig& config,
+                     std::shared_ptr<AsyncEvents> async_events,
+                     AsyncStreamKind stream_kind, const Buffer& buffer,
+                     absl::string_view instr_name)
+    : CollectiveThunk(Thunk::kSend, thunk_info, async_events, stream_kind),
+      config_(config),
       buffer_(buffer),
       execution_counters_(config_.validation_kind ==
                                   P2PConfig::ValidationKind::kConditional
                               ? new ExecutionCounters()
                               : nullptr),
-      hlo_name_(instr->name()) {}
+      hlo_name_(instr_name) {}
 
 absl::Status SendThunk::Initialize(const InitializeParams& params) {
   TF_RETURN_IF_ERROR(CollectiveThunk::Initialize(params));
@@ -69,6 +82,72 @@ absl::Status SendThunk::Initialize(const InitializeParams& params) {
   return absl::OkStatus();
 }
 
+absl::StatusOr<std::unique_ptr<SendThunk>> SendThunk::FromProto(
+    ThunkInfo thunk_info, const SendThunkProto& thunk_proto,
+    absl::Span<const BufferAllocation> buffer_allocations,
+    CollectiveThunk::AsyncEventsMap& async_events_map) {
+  std::shared_ptr<CollectiveThunk::AsyncEvents>& async_events =
+      async_events_map[AsyncEventsUniqueId{
+          thunk_proto.async_events_unique_id()}];
+  if (!async_events) {
+    async_events = std::make_shared<CollectiveThunk::AsyncEvents>();
+  }
+
+  ASSIGN_OR_RETURN(CollectiveThunk::Buffer buffer,
+                   CollectiveThunk::Buffer::FromProto(thunk_proto.buffer(),
+                                                      buffer_allocations));
+
+  CollectiveConfig config =
+      CollectiveConfig::FromProto(thunk_proto.collective_config());
+
+  P2PConfig::IdToSourceTargetMap id_to_source_target;
+  for (const SourceTarget& source_target : thunk_proto.source_target_pairs()) {
+    id_to_source_target.insert({source_target.target(), {}})
+        .first->second.source = source_target.source();
+    id_to_source_target.insert({source_target.source(), {}})
+        .first->second.target = source_target.target();
+  }
+
+  return std::make_unique<SendThunk>(
+      std::move(thunk_info), P2PConfig{config, std::move(id_to_source_target)},
+      async_events, thunk_proto.async_stream_kind(), buffer,
+      thunk_proto.instruction_name());
+}
+
+absl::StatusOr<ThunkProto> SendThunk::ToProto() const {
+  CHECK_EQ(config_.validation_kind, P2PConfig::ValidationKind::kValid);
+  CHECK(config_.source_target_to_bounds.empty());
+
+  ThunkProto proto;
+  *proto.mutable_thunk_info() = thunk_info().ToProto();
+
+  SendThunkProto* thunk_proto = proto.mutable_send_thunk();
+
+  std::optional<AsyncEventsUniqueId> async_events_id = GetAsyncEventsUniqueId();
+  CHECK(async_events_id.has_value());
+  thunk_proto->set_async_events_unique_id(async_events_id->value());
+
+  *thunk_proto->mutable_collective_config() = config_.config.ToProto();
+  std::vector<SourceTarget> source_target_pairs;
+  source_target_pairs.reserve(config_.id_to_source_target.size() / 2);
+  for (const auto& [key_id, map_entry] : config_.id_to_source_target) {
+    if (!map_entry.source.has_value()) {
+      // Same pair is in the map with target/source switched.
+      continue;
+    }
+    SourceTarget pair;
+    pair.set_source(*map_entry.source);
+    pair.set_target(key_id);
+    source_target_pairs.push_back(pair);
+  }
+  thunk_proto->mutable_source_target_pairs()->Assign(
+      source_target_pairs.begin(), source_target_pairs.end());
+
+  thunk_proto->set_async_stream_kind(GetAsyncStreamKind());
+  thunk_proto->set_instruction_name(hlo_name_);
+  return proto;
+}
+
 absl::StatusOr<bool> SendThunk::RunCollective(const ExecuteParams& params,
                                               const GpuCliqueKey&,
                                               se::Stream& stream,
diff --git a/third_party/xla/xla/backends/gpu/runtime/send_thunk.h b/third_party/xla/xla/backends/gpu/runtime/send_thunk.h
index e71b7b948f15b0..3651dba6c580ba 100644
--- a/third_party/xla/xla/backends/gpu/runtime/send_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/send_thunk.h
@@ -23,12 +23,14 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
+#include "absl/types/span.h"
 #include "xla/backends/gpu/collectives/gpu_clique_key.h"
 #include "xla/backends/gpu/collectives/gpu_collectives.h"
 #include "xla/backends/gpu/runtime/collective_thunk.h"
 #include "xla/backends/gpu/runtime/p2p_thunk_common.h"
 #include "xla/core/collectives/communicator.h"
 #include "xla/hlo/ir/hlo_instructions.h"
+#include "xla/service/buffer_assignment.h"
 #include "xla/stream_executor/stream.h"
 
 namespace xla {
@@ -40,8 +42,20 @@ class SendThunk : public CollectiveThunk {
   SendThunk(ThunkInfo thunk_info, const HloSendInstruction* instr,
             int64_t replica_count, int64_t partition_count,
             const Buffer& buffer);
+  SendThunk(ThunkInfo thunk_info, const P2PConfig& config,
+            std::shared_ptr<AsyncEvents> async_events,
+            AsyncStreamKind stream_kind, const Buffer& buffer,
+            absl::string_view instr_name);
+
   absl::Status Initialize(const InitializeParams& params) override;
 
+  static absl::StatusOr<std::unique_ptr<SendThunk>> FromProto(
+      ThunkInfo thunk_info, const SendThunkProto& thunk_proto,
+      absl::Span<const BufferAllocation> buffer_allocations,
+      CollectiveThunk::AsyncEventsMap& async_events_map);
+
+  absl::StatusOr<ThunkProto> ToProto() const override;
+
  protected:
   const CollectiveConfig& config() const override { return config_.config; }
   absl::StatusOr<bool> RunCollective(const ExecuteParams& params,
diff --git a/third_party/xla/xla/backends/gpu/runtime/send_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/send_thunk_test.cc
new file mode 100644
index 00000000000000..79516ce955ec6b
--- /dev/null
+++ b/third_party/xla/xla/backends/gpu/runtime/send_thunk_test.cc
@@ -0,0 +1,75 @@
+/* Copyright 2025 The OpenXLA Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "xla/backends/gpu/runtime/send_thunk.h"
+
+#include <memory>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "xla/backends/gpu/runtime/collective_thunk.h"
+#include "xla/backends/gpu/runtime/thunk.h"
+#include "xla/backends/gpu/runtime/thunk.pb.h"
+#include "xla/service/buffer_assignment.h"
+#include "xla/tsl/util/proto/parse_text_proto.h"
+#include "xla/tsl/util/proto/proto_matchers.h"
+
+namespace xla::gpu {
+namespace {
+
+using ::tsl::proto_testing::EqualsProto;
+
+TEST(CollectiveThunkTest, ProtoRoundTrip) {
+  ThunkProto proto = tsl::proto_testing::ParseTextProtoOrDie<ThunkProto>(
+      R"pb(
+        thunk_info {
+          profile_annotation: "partition_id_profile_annotation"
+          execution_stream_id: 2
+        }
+        send_thunk {
+          async_events_unique_id: 3
+          collective_config {}
+          async_stream_kind: ASYNC_STREAM_KIND_COLLECTIVE
+          source_target_pairs: { source: 1 target: 2 }
+        }
+      )pb");
+
+  Thunk::ThunkInfo thunk_info;
+  thunk_info.profile_annotation = proto.thunk_info().profile_annotation();
+  thunk_info.execution_stream_id = xla::gpu::ExecutionStreamId{
+      static_cast<xla::gpu::ExecutionStreamId::ValueType>(
+          proto.thunk_info().execution_stream_id())};
+
+  CollectiveThunk::AsyncEventsMap async_events_map;
+  std::vector<BufferAllocation> buffer_allocations = {
+      BufferAllocation(/*index=*/0, /*size=*/4, /*color=*/0)};
+
+  ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<SendThunk> thunk,
+      SendThunk::FromProto(thunk_info, proto.send_thunk(), buffer_allocations,
+                           async_events_map));
+  ASSERT_NE(thunk->async_events(), nullptr);
+
+  ASSERT_OK_AND_ASSIGN(ThunkProto round_trip_proto, thunk->ToProto());
+
+  // Ids are unique and expected to differ.
+  proto.mutable_send_thunk()->set_async_events_unique_id(
+      round_trip_proto.send_thunk().async_events_unique_id());
+  EXPECT_THAT(round_trip_proto, EqualsProto(proto));
+}
+
+}  // namespace
+}  // namespace xla::gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 8cc71ce2e0b8d5..72f9af4f5c9871 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -457,6 +457,17 @@ message CollectivePermuteStartThunkProto {
   bool p2p_memcpy_enabled = 6;
 }
 
+message SendThunkProto {
+  uint64 async_events_unique_id = 1;
+  CollectiveBufferProto buffer = 2;
+
+  CollectiveConfigProto collective_config = 3;
+  repeated SourceTarget source_target_pairs = 4;
+
+  AsyncStreamKind async_stream_kind = 5;
+  string instruction_name = 6;
+}
+
 message CollectiveDoneThunkProto {
   ThunkKindProto thunk_kind = 1;
   AsyncStreamKind async_stream_kind = 2;
@@ -507,6 +518,7 @@ message ThunkProto {
     AllToAllStartThunkProto all_to_all_start_thunk = 40;
     RaggedAllToAllStartThunkProto ragged_all_to_all_start_thunk = 41;
     CollectivePermuteStartThunkProto collective_permute_start_thunk = 42;
+    SendThunkProto send_thunk = 43;
   }
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
index 86ced1a02dd418..7ba384ee90c936 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk_proto_deserialization.cc
@@ -54,6 +54,7 @@ limitations under the License.
 #include "xla/backends/gpu/runtime/outfeed_thunk.h"
 #include "xla/backends/gpu/runtime/ragged_all_to_all_thunk.h"
 #include "xla/backends/gpu/runtime/replica_id_thunk.h"
+#include "xla/backends/gpu/runtime/send_thunk.h"
 #include "xla/backends/gpu/runtime/sequential_thunk.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
@@ -267,6 +268,10 @@ absl::StatusOr<std::unique_ptr<Thunk>> DeserializeThunkProtoImpl(
       return CollectivePermuteStartThunk::FromProto(
           std::move(thunk_info), thunk_proto.collective_permute_start_thunk(),
           buffer_allocations, collective_async_events_map);
+    case ThunkProto::kSendThunk:
+      return SendThunk::FromProto(std::move(thunk_info),
+                                  thunk_proto.send_thunk(), buffer_allocations,
+                                  collective_async_events_map);
     default:
       std::optional<absl::string_view> unsupported_thunk_type =
           GetStoredThunkTypeName(thunk_proto);

From 13f6a371ab773474c292cca5176752c35a037260 Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 23 Dec 2025 15:02:25 -0800
Subject: [PATCH 726/753] Add Shape to ConvolutionReorderThunk buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 848309350
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  4 ++
 .../runtime/convolution_filter_thunk.proto    |  5 +-
 .../gpu/runtime/convolution_reorder_thunk.cc  | 62 ++++++++++++-------
 .../gpu/runtime/convolution_reorder_thunk.h   | 28 +++++----
 .../runtime/convolution_reorder_thunk_test.cc | 36 ++++++++---
 .../xla/xla/backends/gpu/runtime/thunk.proto  |  7 +--
 .../xla/xla/service/gpu/thunk_emitter.cc      | 39 +++++-------
 7 files changed, 110 insertions(+), 71 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 1545c1fd615a66..1e8947492ff6b8 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -556,8 +556,11 @@ cc_library(
     hdrs = ["convolution_reorder_thunk.h"],
     deps = [
         ":convolution_filter_thunk_proto_cc",
+        ":shaped_slice",
         ":thunk",
         ":thunk_proto_cc",
+        "//xla:shape_util",
+        "//xla:util",
         "//xla/runtime:buffer_use",
         "//xla/service:buffer_assignment",
         "//xla/service:buffer_assignment_proto_cc",
@@ -592,6 +595,7 @@ tf_proto_library(
     name = "convolution_filter_thunk_proto",
     srcs = ["convolution_filter_thunk.proto"],
     protodeps = ["//xla/service:buffer_assignment_proto"],
+    deps = [":shaped_slice_proto"],
 )
 
 cc_library(
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_filter_thunk.proto b/third_party/xla/xla/backends/gpu/runtime/convolution_filter_thunk.proto
index f691051f611d5f..94cb9967b42bdb 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_filter_thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_filter_thunk.proto
@@ -2,6 +2,7 @@ syntax = "proto3";
 
 package xla.gpu;
 
+import "xla/backends/gpu/runtime/shaped_slice.proto";
 import "xla/service/buffer_assignment.proto";
 
 // Dimensions of the convolution filter.
@@ -20,6 +21,6 @@ message ConvolutionFilterDimensions {
 // Buffers for the bias input and output of the convolution reorder thunk.
 // Serialized version of xla::gpu::ConvolutionReorderThunk::BiasBuffers.
 message ConvolutionReorderBiasBuffers {
-  xla.buffer_assignment.BufferAllocationSliceProto bias_input = 1;
-  xla.buffer_assignment.BufferAllocationSliceProto bias_output = 2;
+  ShapedSliceProto bias_input = 1;
+  ShapedSliceProto bias_output = 2;
 }
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.cc
index 51cb69840c1197..872cce67cfda74 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.cc
@@ -24,13 +24,16 @@ limitations under the License.
 #include "absl/status/status.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/convolution_filter_thunk.pb.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/service/buffer_assignment.h"
 #include "xla/service/buffer_assignment.pb.h"
+#include "xla/shape.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/dnn.h"
 #include "xla/stream_executor/stream_executor.h"
 #include "xla/tsl/platform/statusor.h"
+#include "xla/util.h"
 
 namespace xla {
 namespace gpu {
@@ -50,7 +53,7 @@ static se::dnn::FilterDescriptor CreateFilterDescriptor(
 
 ConvolutionReorderThunk::ConvolutionReorderThunk(
     ThunkInfo thunk_info, ConvolutionFilterDimensions filter_dimensions,
-    BufferAllocation::Slice filter_input, BufferAllocation::Slice filter_output,
+    ShapedSlice filter_input, ShapedSlice filter_output,
     std::optional<BiasBuffers> biases)
     : Thunk(Kind::kConvolutionReorder, thunk_info),
       filter_dimensions_(std::move(filter_dimensions)),
@@ -59,22 +62,41 @@ ConvolutionReorderThunk::ConvolutionReorderThunk(
       filter_output_(filter_output),
       biases_(biases) {}
 
+absl::StatusOr<std::unique_ptr<ConvolutionReorderThunk>>
+ConvolutionReorderThunk::Create(ThunkInfo thunk_info, ShapedSlice filter_input,
+                                ShapedSlice filter_output,
+                                std::optional<BiasBuffers> biases) {
+  Shape shape = filter_output.shape;
+  if (shape.dimensions().size() != 5 || shape.dimensions(4) != 32) {
+    return Internal("Unexpected shape for convolution reorder: %s",
+                    shape.ToString());
+  }
+  ConvolutionFilterDimensions filter_dimensions;
+  filter_dimensions.set_output_feature_map_count(shape.dimensions(0));
+  filter_dimensions.set_input_feature_map_count(shape.dimensions(1) * 32);
+  filter_dimensions.set_input_filter_height(shape.dimensions(2));
+  filter_dimensions.set_input_filter_width(shape.dimensions(3));
+  return std::make_unique<ConvolutionReorderThunk>(
+      std::move(thunk_info), filter_dimensions, filter_input, filter_output,
+      biases);
+}
+
 absl::Status ConvolutionReorderThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   const auto& buffer_allocations = *params.buffer_allocations;
 
   auto filter_input = se::DeviceAddress<int8_t>(
-      buffer_allocations.GetDeviceAddress(filter_input_));
+      buffer_allocations.GetDeviceAddress(filter_input_.slice));
   auto filter_output = se::DeviceAddress<int8_t>(
-      buffer_allocations.GetDeviceAddress(filter_output_));
+      buffer_allocations.GetDeviceAddress(filter_output_.slice));
 
   std::optional<se::DeviceAddress<float>> bias_input;
   std::optional<se::DeviceAddress<float>> bias_output;
   if (biases_.has_value()) {
     bias_input = se::DeviceAddress<float>(
-        buffer_allocations.GetDeviceAddress(biases_->bias_input));
+        buffer_allocations.GetDeviceAddress(biases_->bias_input.slice));
     bias_output = se::DeviceAddress<float>(
-        buffer_allocations.GetDeviceAddress(biases_->bias_output));
+        buffer_allocations.GetDeviceAddress(biases_->bias_output.slice));
   }
 
   auto dnn = params.stream->parent()->AsDnn();
@@ -90,27 +112,26 @@ absl::StatusOr<std::unique_ptr<ConvolutionReorderThunk>>
 ConvolutionReorderThunk::FromProto(
     ThunkInfo thunk_info, const ConvolutionReorderThunkProto& proto,
     absl::Span<const BufferAllocation> buffer_allocations) {
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice filter_input,
-                      BufferAllocation::Slice::FromProto(proto.filter_input(),
-                                                         buffer_allocations));
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice filter_output,
-                      BufferAllocation::Slice::FromProto(proto.filter_output(),
-                                                         buffer_allocations));
+  TF_ASSIGN_OR_RETURN(
+      ShapedSlice filter_input,
+      ShapedSlice::FromProto(proto.filter_input(), buffer_allocations));
+  TF_ASSIGN_OR_RETURN(
+      ShapedSlice filter_output,
+      ShapedSlice::FromProto(proto.filter_output(), buffer_allocations));
 
   std::optional<BiasBuffers> biases;
   if (proto.has_biases()) {
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice bias_input,
-                        BufferAllocation::Slice::FromProto(
-                            proto.biases().bias_input(), buffer_allocations));
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice bias_output,
-                        BufferAllocation::Slice::FromProto(
-                            proto.biases().bias_output(), buffer_allocations));
+    TF_ASSIGN_OR_RETURN(ShapedSlice bias_input,
+                        ShapedSlice::FromProto(proto.biases().bias_input(),
+                                               buffer_allocations));
+    TF_ASSIGN_OR_RETURN(ShapedSlice bias_output,
+                        ShapedSlice::FromProto(proto.biases().bias_output(),
+                                               buffer_allocations));
     biases = {{bias_input, bias_output}};
   }
 
-  return std::make_unique<ConvolutionReorderThunk>(
-      std::move(thunk_info), proto.filter_dimensions(), filter_input,
-      filter_output, biases);
+  return ConvolutionReorderThunk::Create(std::move(thunk_info), filter_input,
+                                         filter_output, biases);
 }
 
 absl::StatusOr<ThunkProto> ConvolutionReorderThunk::ToProto() const {
@@ -119,7 +140,6 @@ absl::StatusOr<ThunkProto> ConvolutionReorderThunk::ToProto() const {
 
   ConvolutionReorderThunkProto* reorder_proto =
       thunk_proto.mutable_convolution_reorder_thunk();
-  *reorder_proto->mutable_filter_dimensions() = filter_dimensions_;
 
   TF_ASSIGN_OR_RETURN(*reorder_proto->mutable_filter_input(),
                       filter_input_.ToProto());
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.h b/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.h
index 2bd0501b08e365..d409c48819fc22 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/convolution_filter_thunk.pb.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/runtime/buffer_use.h"
@@ -36,16 +37,19 @@ namespace gpu {
 class ConvolutionReorderThunk : public Thunk {
  public:
   struct BiasBuffers {
-    BufferAllocation::Slice bias_input;
-    BufferAllocation::Slice bias_output;
+    ShapedSlice bias_input;
+    ShapedSlice bias_output;
   };
 
   ConvolutionReorderThunk(ThunkInfo thunk_info,
                           ConvolutionFilterDimensions filter_dimensions,
-                          BufferAllocation::Slice filter_input,
-                          BufferAllocation::Slice filter_output,
+                          ShapedSlice filter_input, ShapedSlice filter_output,
                           std::optional<BiasBuffers> biases);
 
+  static absl::StatusOr<std::unique_ptr<ConvolutionReorderThunk>> Create(
+      ThunkInfo thunk_info, ShapedSlice filter_input, ShapedSlice filter_output,
+      std::optional<BiasBuffers> biases);
+
   ConvolutionReorderThunk(const ConvolutionReorderThunk&) = delete;
   ConvolutionReorderThunk& operator=(const ConvolutionReorderThunk&) = delete;
 
@@ -53,12 +57,14 @@ class ConvolutionReorderThunk : public Thunk {
 
   BufferUses buffer_uses() const override {
     BufferUses res{
-        BufferUse::Read(filter_input_),
-        BufferUse::Write(filter_output_),
+        BufferUse::Read(filter_input_.slice, filter_input_.shape),
+        BufferUse::Write(filter_output_.slice, filter_output_.shape),
     };
     if (biases_.has_value()) {
-      res.push_back(BufferUse::Read(biases_->bias_input));
-      res.push_back(BufferUse::Write(biases_->bias_output));
+      res.push_back(BufferUse::Read(biases_->bias_input.slice,
+                                    biases_->bias_input.shape));
+      res.push_back(BufferUse::Write(biases_->bias_output.slice,
+                                     biases_->bias_output.shape));
     }
     return res;
   }
@@ -72,9 +78,9 @@ class ConvolutionReorderThunk : public Thunk {
  private:
   const ConvolutionFilterDimensions filter_dimensions_;
   const se::dnn::FilterDescriptor filter_descriptor_;
-  BufferAllocation::Slice filter_input_;
-  BufferAllocation::Slice filter_output_;
-  std::optional<BiasBuffers> biases_;
+  const ShapedSlice filter_input_;
+  const ShapedSlice filter_output_;
+  const std::optional<BiasBuffers> biases_;
 };
 
 }  // namespace gpu
diff --git a/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk_test.cc
index 69aaf54193541f..f85b49bd35509a 100644
--- a/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/convolution_reorder_thunk_test.cc
@@ -39,17 +39,35 @@ TEST(ConvolutionReorderThunkTest, ProtoRoundTrip) {
   auto proto = ParseTextProtoOrDie<ThunkProto>(R"pb(
     thunk_info { profile_annotation: "test" execution_stream_id: 0 }
     convolution_reorder_thunk {
-      filter_dimensions {
-        output_feature_map_count: 1
-        input_feature_map_count: 2
-        input_filter_height: 3
-        input_filter_width: 4
+      filter_input {
+        slice { buffer_allocation_index: 0 offset: 0 size: 1024 }
+        shape {}
+      }
+      filter_output {
+        slice { buffer_allocation_index: 1 offset: 0 size: 512 }
+        shape {
+          element_type: F32
+          dimensions: 1
+          dimensions: 2
+          dimensions: 3
+          dimensions: 4
+          dimensions: 32
+          is_dynamic_dimension: false
+          is_dynamic_dimension: false
+          is_dynamic_dimension: false
+          is_dynamic_dimension: false
+          is_dynamic_dimension: false
+        }
       }
-      filter_input { buffer_allocation_index: 0 offset: 0 size: 1024 }
-      filter_output { buffer_allocation_index: 1 offset: 0 size: 512 }
       biases {
-        bias_input { buffer_allocation_index: 2 offset: 0 size: 256 }
-        bias_output { buffer_allocation_index: 3 offset: 0 size: 128 }
+        bias_input {
+          slice { buffer_allocation_index: 2 offset: 0 size: 256 }
+          shape {}
+        }
+        bias_output {
+          slice { buffer_allocation_index: 3 offset: 0 size: 128 }
+          shape {}
+        }
       }
     }
   )pb");
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index 72f9af4f5c9871..a5a21c6fc89fda 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -344,10 +344,9 @@ message ConvolutionThunkProto {
 }
 
 message ConvolutionReorderThunkProto {
-  ConvolutionFilterDimensions filter_dimensions = 1;
-  xla.buffer_assignment.BufferAllocationSliceProto filter_input = 2;
-  xla.buffer_assignment.BufferAllocationSliceProto filter_output = 3;
-  optional ConvolutionReorderBiasBuffers biases = 4;
+  ShapedSliceProto filter_input = 1;
+  ShapedSliceProto filter_output = 2;
+  optional ConvolutionReorderBiasBuffers biases = 3;
 }
 
 message FftThunkProto {
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 2519719f9a727f..500ec3183e9275 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -778,38 +778,29 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitCublasLtMatmulThunkF8(
 absl::StatusOr<ThunkSequence> ThunkEmitter::EmitConvolutionReorderThunk(
     const HloCustomCallInstruction* instr) {
   bool has_bias = instr->operand_count() > 1;
-  Shape shape = has_bias ? instr->shape().tuple_shapes(0) : instr->shape();
-  if (shape.dimensions().size() != 5 || shape.dimensions(4) != 32) {
-    return Internal("Unexpected shape for convolution reorder: %s",
-                    instr->ToString());
-  }
-  ConvolutionFilterDimensions filter_dimensions;
-  filter_dimensions.set_output_feature_map_count(shape.dimensions(0));
-  filter_dimensions.set_input_feature_map_count(shape.dimensions(1) * 32);
-  filter_dimensions.set_input_filter_height(shape.dimensions(2));
-  filter_dimensions.set_input_filter_width(shape.dimensions(3));
-
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice filter_input,
-                      GetAllocationSliceForHlo(instr->operand(0)));
 
-  BufferAllocation::Slice filter_output;
+  TF_ASSIGN_OR_RETURN(ShapedSlice filter_input,
+                      GetShapedSliceForHlo(instr->operand(0)));
+
+  ShapedSlice filter_output;
   std::optional<ConvolutionReorderThunk::BiasBuffers> biases;
   if (has_bias) {
-    TF_ASSIGN_OR_RETURN(filter_output, GetAllocationSliceForHlo(instr, {0}));
+    TF_ASSIGN_OR_RETURN(filter_output, GetShapedSliceForHlo(instr, {0}));
 
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice bias_input,
-                        GetAllocationSliceForHlo(instr->operand(1)));
-    TF_ASSIGN_OR_RETURN(BufferAllocation::Slice bias_output,
-                        GetAllocationSliceForHlo(instr, {1}));
+    TF_ASSIGN_OR_RETURN(ShapedSlice bias_input,
+                        GetShapedSliceForHlo(instr->operand(1)));
+    TF_ASSIGN_OR_RETURN(ShapedSlice bias_output,
+                        GetShapedSliceForHlo(instr, {1}));
     biases = {{bias_input, bias_output}};
   } else {
-    TF_ASSIGN_OR_RETURN(filter_output, GetAllocationSliceForHlo(instr));
+    TF_ASSIGN_OR_RETURN(filter_output, GetShapedSliceForHlo(instr));
   }
 
-  auto thunk = std::make_unique<ConvolutionReorderThunk>(
-      Thunk::ThunkInfo::WithProfileAnnotation(
-          instr, ir_emitter_context_->GetNextThunkId()),
-      std::move(filter_dimensions), filter_input, filter_output, biases);
+  ASSIGN_OR_RETURN(auto thunk,
+                   ConvolutionReorderThunk::Create(
+                       Thunk::ThunkInfo::WithProfileAnnotation(
+                           instr, ir_emitter_context_->GetNextThunkId()),
+                       filter_input, filter_output, biases));
   return GetThunkSequence(std::move(thunk));
 }
 

From 989c58d0c73705ae0a23033a7f4bc0a2bdeb51a4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 15:06:05 -0800
Subject: [PATCH 727/753] Reverts 69c656e60b7a2abc36e7fc3db3e95efea146d07b

PiperOrigin-RevId: 848310259
---
 third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc | 3 ---
 third_party/xla/xla/tsl/profiler/utils/xplane_schema.h  | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc
index 57d68c8dfdad36..ba21b7bcdb7db9 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.cc
@@ -66,7 +66,6 @@ const absl::string_view kKernelLaunchLineName = "Launch Stats";
 const absl::string_view kSourceLineName = "Source code";
 const absl::string_view kHostOffloadOpLineName = "Host Offload Ops";
 const absl::string_view kCounterEventsLineName = "_counters_";
-const absl::string_view kCounterValue = "counter_value";
 const absl::string_view kTensorCoreSyncFlagLineName = "Tensor Core Sync Flag";
 const absl::string_view kSparseCoreSyncsLineName = "Sparse Core Syncs";
 
@@ -222,7 +221,6 @@ const StatTypeMap& GetStatTypeMap() {
        {"queue_addr", kQueueAddr},
        {"queue_id", kQueueId},
        {"request_id", kRequestId},
-       {"global_chip_id", kGlobalChipId},
        {"run_id", kRunId},
        {"replica_id", kReplicaId},
        {"graph_type", kGraphType},
@@ -299,7 +297,6 @@ const StatTypeMap& GetStatTypeMap() {
        {"dcn_collective_info", kDcnCollectiveInfo},
        {"all_reduce_id", kAllReduceId},
        {"all_reduce_unique_id", kAllReduceUniqueId},
-       {"performance_counter_id", kPerformanceCounterId},
        // Performance counter related.
        {"Raw Value", kRawValue},
        {"Scaled Value", kScaledValue},
diff --git a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h
index dba787403d798f..b8b6b0afc4c4d0 100644
--- a/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h
+++ b/third_party/xla/xla/tsl/profiler/utils/xplane_schema.h
@@ -84,7 +84,6 @@ TF_CONST_INIT extern const absl::string_view kXlaAsyncOpLineName;
 TF_CONST_INIT extern const absl::string_view kKernelLaunchLineName;
 TF_CONST_INIT extern const absl::string_view kSourceLineName;
 TF_CONST_INIT extern const absl::string_view kCounterEventsLineName;
-TF_CONST_INIT extern const absl::string_view kCounterValue;
 TF_CONST_INIT extern const absl::string_view kHostOffloadOpLineName;
 TF_CONST_INIT extern const absl::string_view kTensorCoreSyncFlagLineName;
 TF_CONST_INIT extern const absl::string_view kSparseCoreSyncsLineName;
@@ -210,7 +209,6 @@ enum StatType {
   kQueueId,
   kQueueAddr,
   kRequestId,
-  kGlobalChipId,
   kRunId,
   kReplicaId,
   kGraphType,
@@ -287,7 +285,6 @@ enum StatType {
   kBytesTransferred,
   kDmaQueue,
   kDcnCollectiveInfo,
-  kPerformanceCounterId,
   // Performance counter related.
   kRawValue,
   kScaledValue,

From 0cebad66545e6f271086eb4b68f37fecf3b7fc9f Mon Sep 17 00:00:00 2001
From: Maxim Ermilov <maximermilov@google.com>
Date: Tue, 23 Dec 2025 15:52:49 -0800
Subject: [PATCH 728/753] Add Shape to TriangularSolveThunk buffer_uses

Modify Thunk's serialization

PiperOrigin-RevId: 848323137
---
 .../xla/xla/backends/gpu/runtime/BUILD        |  2 +
 .../xla/xla/backends/gpu/runtime/thunk.proto  | 12 +---
 .../gpu/runtime/triangular_solve_thunk.cc     | 58 ++++++++-----------
 .../gpu/runtime/triangular_solve_thunk.h      | 44 +++++++++-----
 .../runtime/triangular_solve_thunk_test.cc    | 27 ++++++---
 .../xla/xla/service/gpu/thunk_emitter.cc      | 40 ++++---------
 6 files changed, 85 insertions(+), 98 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index 1e8947492ff6b8..d1ed9d2c469f38 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -2408,8 +2408,10 @@ cc_library(
     hdrs = ["triangular_solve_thunk.h"],
     deps = [
         ":make_batch_pointers",
+        ":shaped_slice",
         ":thunk",
         ":thunk_proto_cc",
+        "//xla:shape_util",
         "//xla:status_macros",
         "//xla:util",
         "//xla:xla_data_proto_cc",
diff --git a/third_party/xla/xla/backends/gpu/runtime/thunk.proto b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
index a5a21c6fc89fda..f9b79285bad415 100644
--- a/third_party/xla/xla/backends/gpu/runtime/thunk.proto
+++ b/third_party/xla/xla/backends/gpu/runtime/thunk.proto
@@ -188,15 +188,9 @@ message WaitForStreamsThunkProto {
 
 message TriangularSolveThunkProto {
   xla.TriangularSolveOptions options = 1;
-  xla.buffer_assignment.BufferAllocationSliceProto a_buffer = 2;
-  xla.buffer_assignment.BufferAllocationSliceProto b_buffer = 3;
-  xla.buffer_assignment.BufferAllocationSliceProto temp_buffer = 4;
-  xla.PrimitiveType type = 5;
-  int64 batch_size = 6;
-  int64 m = 7;
-  int64 n = 8;
-  int64 a_batch_stride = 9;
-  int64 b_batch_stride = 10;
+  ShapedSliceProto a_buffer = 2;
+  ShapedSliceProto b_buffer = 3;
+  ShapedSliceProto temp_buffer = 4;
 }
 
 message ReplicaIdThunkProto {
diff --git a/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.cc b/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.cc
index c6d36f675715eb..16881b25aa35ee 100644
--- a/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "absl/strings/str_format.h"
 #include "absl/types/span.h"
 #include "xla/backends/gpu/runtime/make_batch_pointers.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/service/buffer_assignment.h"
@@ -42,11 +43,8 @@ namespace gpu {
 
 TriangularSolveThunk::TriangularSolveThunk(
     ThunkInfo thunk_info, const TriangularSolveOptions& options,
-    const BufferAllocation::Slice& a_buffer,
-    const BufferAllocation::Slice& b_buffer,
-    const BufferAllocation::Slice& temp_buffer,  //
-    PrimitiveType type, int64_t batch_size, int64_t m, int64_t n,
-    int64_t a_batch_stride, int64_t b_batch_stride)
+    const ShapedSlice& a_buffer, const ShapedSlice& b_buffer,
+    const ShapedSlice& temp_buffer)
     : Thunk(Kind::kTriangularSolve, thunk_info),
       uplo_(options.lower() ? se::blas::UpperLower::kLower
                             : se::blas::UpperLower::kUpper),
@@ -57,12 +55,9 @@ TriangularSolveThunk::TriangularSolveThunk(
       a_buffer_(a_buffer),
       b_buffer_(b_buffer),
       temp_buffer_(temp_buffer),
-      type_(type),
-      batch_size_(batch_size),
-      m_(m),
-      n_(n),
-      a_batch_stride_(a_batch_stride),
-      b_batch_stride_(b_batch_stride) {
+      type_(b_buffer.shape.element_type()),
+      m_(b_buffer.shape.dimensions(b_buffer.shape.dimensions().size() - 2)),
+      n_(b_buffer.shape.dimensions(b_buffer.shape.dimensions().size() - 1)) {
   transpose_a_ = [&] {
     switch (options.transpose_a()) {
       case TriangularSolveOptions::NO_TRANSPOSE:
@@ -82,31 +77,30 @@ TriangularSolveThunk::TriangularSolveThunk(
 absl::Status TriangularSolveThunk::ExecuteOnStream(
     const ExecuteParams& params) {
   auto& buffer_allocations = *params.buffer_allocations;
-  return RunTriangularSolve(buffer_allocations.GetDeviceAddress(a_buffer_),
-                            buffer_allocations.GetDeviceAddress(b_buffer_),
-                            buffer_allocations.GetDeviceAddress(temp_buffer_),
-                            uplo_, side_, unit_diagonal_, transpose_a_, type_,
-                            batch_size_, m_, n_, a_batch_stride_,
-                            b_batch_stride_, params.stream);
+  return RunTriangularSolve(
+      buffer_allocations.GetDeviceAddress(a_buffer_.slice),
+      buffer_allocations.GetDeviceAddress(b_buffer_.slice),
+      buffer_allocations.GetDeviceAddress(temp_buffer_.slice), uplo_, side_,
+      unit_diagonal_, transpose_a_, type_, batch_size(), m_, n_,
+      a_batch_stride(), b_batch_stride(), params.stream);
 }
 
 absl::StatusOr<std::unique_ptr<TriangularSolveThunk>>
 TriangularSolveThunk::FromProto(
     ThunkInfo thunk_info, const TriangularSolveThunkProto& proto,
     absl::Span<const BufferAllocation> allocations) {
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice a_buffer,
-      BufferAllocation::Slice::FromProto(proto.a_buffer(), allocations));
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice b_buffer,
-      BufferAllocation::Slice::FromProto(proto.b_buffer(), allocations));
-  TF_ASSIGN_OR_RETURN(
-      BufferAllocation::Slice temp_buffer,
-      BufferAllocation::Slice::FromProto(proto.temp_buffer(), allocations));
+  TF_ASSIGN_OR_RETURN(ShapedSlice a_buffer,
+                      ShapedSlice::FromProto(proto.a_buffer(), allocations));
+  TF_ASSIGN_OR_RETURN(ShapedSlice b_buffer,
+                      ShapedSlice::FromProto(proto.b_buffer(), allocations));
+  TF_ASSIGN_OR_RETURN(ShapedSlice temp_buffer,
+                      ShapedSlice::FromProto(proto.temp_buffer(), allocations));
+
+  if (b_buffer.shape.dimensions().size() < 2) {
+    return absl::InvalidArgumentError("Unsupported shape for b");
+  }
   return std::make_unique<TriangularSolveThunk>(
-      thunk_info, proto.options(), a_buffer, b_buffer, temp_buffer,
-      proto.type(), proto.batch_size(), proto.m(), proto.n(),
-      proto.a_batch_stride(), proto.b_batch_stride());
+      thunk_info, proto.options(), a_buffer, b_buffer, temp_buffer);
 }
 
 absl::StatusOr<ThunkProto> TriangularSolveThunk::ToProto() const {
@@ -143,12 +137,6 @@ absl::StatusOr<ThunkProto> TriangularSolveThunk::ToProto() const {
                       b_buffer_.ToProto());
   TF_ASSIGN_OR_RETURN(*triangular_solve_thunk_proto->mutable_temp_buffer(),
                       temp_buffer_.ToProto());
-  triangular_solve_thunk_proto->set_type(type_);
-  triangular_solve_thunk_proto->set_batch_size(batch_size_);
-  triangular_solve_thunk_proto->set_m(m_);
-  triangular_solve_thunk_proto->set_n(n_);
-  triangular_solve_thunk_proto->set_a_batch_stride(a_batch_stride_);
-  triangular_solve_thunk_proto->set_b_batch_stride(b_batch_stride_);
   return proto;
 }
 
diff --git a/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.h b/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.h
index 88c05ca23a226d..29e701fb9a627e 100644
--- a/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.h
+++ b/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk.h
@@ -17,15 +17,19 @@ limitations under the License.
 #define XLA_BACKENDS_GPU_RUNTIME_TRIANGULAR_SOLVE_THUNK_H_
 
 #include <cstdint>
+#include <functional>
 #include <memory>
+#include <numeric>
 
 #include "absl/status/status.h"
 #include "absl/status/statusor.h"
 #include "absl/types/span.h"
+#include "xla/backends/gpu/runtime/shaped_slice.h"
 #include "xla/backends/gpu/runtime/thunk.h"
 #include "xla/backends/gpu/runtime/thunk.pb.h"
 #include "xla/runtime/buffer_use.h"
 #include "xla/service/buffer_assignment.h"
+#include "xla/shape_util.h"
 #include "xla/stream_executor/blas.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/stream.h"
@@ -42,12 +46,8 @@ class TriangularSolveThunk : public Thunk {
  public:
   TriangularSolveThunk(ThunkInfo thunk_info,
                        const TriangularSolveOptions& options,
-                       const BufferAllocation::Slice& a_buffer,
-                       const BufferAllocation::Slice& b_buffer,
-                       const BufferAllocation::Slice& temp_buffer,
-                       PrimitiveType type, int64_t batch_size, int64_t m,
-                       int64_t n, int64_t a_batch_stride,
-                       int64_t b_batch_stride);
+                       const ShapedSlice& a_buffer, const ShapedSlice& b_buffer,
+                       const ShapedSlice& temp_buffer);
 
   TriangularSolveThunk(const TriangularSolveThunk&) = delete;
   TriangularSolveThunk& operator=(const TriangularSolveThunk&) = delete;
@@ -56,10 +56,9 @@ class TriangularSolveThunk : public Thunk {
 
   BufferUses buffer_uses() const override {
     return {
-        BufferUse::Read(a_buffer_),
-        BufferUse::Write(b_buffer_),
-        BufferUse(temp_buffer_, BufferUse::MemoryAccess::kWrite,
-                  BufferUse::ContentValidity::kUndefined),
+        BufferUse::Read(a_buffer_.slice, a_buffer_.shape),
+        BufferUse::Write(b_buffer_.slice, b_buffer_.shape),
+        BufferUse::Scratch(temp_buffer_.slice, temp_buffer_.shape),
     };
   };
 
@@ -70,21 +69,34 @@ class TriangularSolveThunk : public Thunk {
   absl::StatusOr<ThunkProto> ToProto() const override;
 
  private:
+  int64_t batch_size() const {
+    return std::accumulate(b_buffer_.shape.dimensions().begin(),
+                           b_buffer_.shape.dimensions().end() - 2, int64_t{1},
+                           std::multiplies<int64_t>());
+  }
+
+  int64_t a_batch_stride() const {
+    int64_t elem_size = ShapeUtil::ByteSizeOfPrimitiveType(type_);
+    return side_ == se::blas::Side::kLeft ? (m_ * m_ * elem_size)
+                                          : (n_ * n_ * elem_size);
+  }
+
+  int64_t b_batch_stride() const {
+    return m_ * n_ * ShapeUtil::ByteSizeOfPrimitiveType(type_);
+  }
+
   const se::blas::UpperLower uplo_;
   const se::blas::Side side_;
   const se::blas::Diagonal unit_diagonal_;
   se::blas::Transpose transpose_a_;
 
-  const BufferAllocation::Slice a_buffer_;
-  const BufferAllocation::Slice b_buffer_;
-  const BufferAllocation::Slice temp_buffer_;
+  const ShapedSlice a_buffer_;
+  const ShapedSlice b_buffer_;
+  const ShapedSlice temp_buffer_;
 
   const PrimitiveType type_;
-  const int64_t batch_size_;
   const int64_t m_;
   const int64_t n_;
-  const int64_t a_batch_stride_;
-  const int64_t b_batch_stride_;
 };
 
 absl::Status RunTriangularSolve(se::DeviceAddressBase a_data,
diff --git a/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk_test.cc b/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk_test.cc
index 6414135ebb3f8b..641fbc89be34cb 100644
--- a/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk_test.cc
+++ b/third_party/xla/xla/backends/gpu/runtime/triangular_solve_thunk_test.cc
@@ -48,15 +48,24 @@ TEST(TriangularSolveThunkTest, ProtoRoundTrip) {
             unit_diagonal: false
             transpose_a: TRANSPOSE
           }
-          a_buffer { offset: 0 size: 256 buffer_allocation_index: 0 }
-          b_buffer { offset: 0 size: 256 buffer_allocation_index: 1 }
-          temp_buffer { offset: 0 size: 128 buffer_allocation_index: 2 }
-          type: F32
-          batch_size: 1
-          m: 32
-          n: 32
-          a_batch_stride: 0
-          b_batch_stride: 1
+          a_buffer {
+            slice { offset: 0 size: 256 buffer_allocation_index: 0 }
+            shape {}
+          }
+          b_buffer {
+            slice { offset: 0 size: 256 buffer_allocation_index: 1 }
+            shape {
+              element_type: F32
+              dimensions: 32
+              dimensions: 32
+              is_dynamic_dimension: false
+              is_dynamic_dimension: false
+            }
+          }
+          temp_buffer {
+            slice { offset: 0 size: 128 buffer_allocation_index: 2 }
+            shape {}
+          }
         }
       )pb",
       &proto));
diff --git a/third_party/xla/xla/service/gpu/thunk_emitter.cc b/third_party/xla/xla/service/gpu/thunk_emitter.cc
index 500ec3183e9275..f5acc8b63d0669 100644
--- a/third_party/xla/xla/service/gpu/thunk_emitter.cc
+++ b/third_party/xla/xla/service/gpu/thunk_emitter.cc
@@ -1129,17 +1129,10 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitTriangularSolveCustomCall(
   TF_RET_CHECK(has_fortran_layout(operands[1]->shape().layout()));
   TF_RET_CHECK(has_fortran_layout(instr->shape().tuple_shapes(0).layout()));
 
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice a_slice,
-                      GetAllocationSliceForHlo(operands[0]));
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice b_slice,
-                      GetAllocationSliceForHlo(operands[1]));
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice result_slice,
-                      GetAllocationSliceForHlo(instr, {0}));
-  TF_ASSIGN_OR_RETURN(BufferAllocation::Slice temp_slice,
-                      GetAllocationSliceForHlo(instr, {1}));
-
-  const Shape b_shape = operands[1]->shape();
-  const PrimitiveType elem_ty = b_shape.element_type();
+  ASSIGN_OR_RETURN(ShapedSlice a_slice, GetShapedSliceForHlo(operands[0]));
+  ASSIGN_OR_RETURN(ShapedSlice b_slice, GetShapedSliceForHlo(operands[1]));
+  ASSIGN_OR_RETURN(ShapedSlice result_slice, GetShapedSliceForHlo(instr, {0}));
+  ASSIGN_OR_RETURN(ShapedSlice temp_slice, GetShapedSliceForHlo(instr, {1}));
 
   TriangularSolveOptions backend_config;
   auto& backend_config_str = instr->raw_backend_config_string();
@@ -1152,30 +1145,19 @@ absl::StatusOr<ThunkSequence> ThunkEmitter::EmitTriangularSolveCustomCall(
 
   // Triangular solve is in-place on 'b', so copy 'b' to the output
   // if they aren't the same buffer.
-  if (b_slice != result_slice) {
+  if (b_slice.slice != result_slice.slice) {
     thunks.push_back(std::make_unique<DeviceToDeviceCopyThunk>(
         Thunk::ThunkInfo::WithProfileAnnotation(
             instr, ir_emitter_context_->GetNextThunkId()),
-        /*source_buffer=*/ShapedSlice{b_slice, b_shape},
-        /*destination_buffer=*/ShapedSlice{result_slice, b_shape},
-        /*mem_size=*/ShapeUtil::ByteSizeOf(b_shape)));
-  }
-
-  int64_t m = b_shape.dimensions(b_shape.dimensions().size() - 2);
-  int64_t n = b_shape.dimensions(b_shape.dimensions().size() - 1);
-  int64_t batch_size = std::accumulate(
-      b_shape.dimensions().begin(), b_shape.dimensions().end() - 2, int64_t{1},
-      [](int64_t a, int64_t b) { return a * b; });
-  int64_t elem_size = ShapeUtil::ByteSizeOfPrimitiveType(elem_ty);
-  int64_t a_batch_stride =
-      backend_config.left_side() ? m * m * elem_size : n * n * elem_size;
-  int64_t b_batch_stride = m * n * elem_size;
+        /*source_buffer=*/b_slice,
+        /*destination_buffer=*/result_slice,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(b_slice.shape)));
+  }
+
   thunks.push_back(std::make_unique<TriangularSolveThunk>(
       Thunk::ThunkInfo::WithProfileAnnotation(
           instr, ir_emitter_context_->GetNextThunkId()),
-      backend_config,
-      /*a_buffer=*/a_slice, /*b_buffer=*/result_slice, temp_slice, elem_ty,
-      batch_size, m, n, a_batch_stride, b_batch_stride));
+      backend_config, a_slice, result_slice, temp_slice));
 
   // Elide the sequential thunk if there's no copy.
   if (thunks.size() == 1) {

From 93ae7c23eacc5ca0a0820e684d0f37a4b4a41081 Mon Sep 17 00:00:00 2001
From: Bill Varcho <varcho@google.com>
Date: Tue, 23 Dec 2025 18:12:34 -0800
Subject: [PATCH 729/753] [ReplicaGroupV3][Refactor][6/n] Update rest of
 `spmd/` dir to use CollectiveDeviceListBase in place of vector<vector<int>>
 and reduce cognitive complexity in `GetDefaultCollectiveOpsCreator`.

PiperOrigin-RevId: 848356290
---
 .../xla/service/spmd/convolution_handler.cc   |   9 +-
 .../xla/xla/service/spmd/spmd_partitioner.cc  | 245 +++++++++---------
 .../xla/xla/service/spmd/spmd_partitioner.h   |  11 +-
 .../xla/service/spmd/spmd_partitioner_util.cc |  25 +-
 4 files changed, 146 insertions(+), 144 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/convolution_handler.cc b/third_party/xla/xla/service/spmd/convolution_handler.cc
index 29e7e6ab77dd05..737fbad8e03c08 100644
--- a/third_party/xla/xla/service/spmd/convolution_handler.cc
+++ b/third_party/xla/xla/service/spmd/convolution_handler.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_sharding.h"
+#include "xla/hlo/ir/replica_group.h"
 #include "xla/hlo/utils/hlo_sharding_util.h"
 #include "xla/literal_util.h"
 #include "xla/service/dot_as_convolution_util.h"
@@ -512,8 +513,8 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS(
           new_window));
 
   auto ar = collective_ops_creator.create_cross_partition_all_reduce(
-      b, conv, MakeBinaryAdd(original_hlo->shape().element_type(), module), {},
-      (*lhs.state().next_channel_id)++);
+      b, conv, MakeBinaryAdd(original_hlo->shape().element_type(), module),
+      CollectiveDeviceList(), (*lhs.state().next_channel_id)++);
   ar->set_sharding(HloSharding::Replicate());
   return PartitionedHlo(ar, output_base_shape, lhs.state())
       .Reshard(output_sharding)
@@ -739,8 +740,8 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS(
           new_window));
   auto ar =
       lhs.state().collective_ops_creator.create_cross_partition_all_reduce(
-          b, conv, MakeBinaryAdd(output_base_shape.element_type(), module), {},
-          (*lhs.state().next_channel_id)++);
+          b, conv, MakeBinaryAdd(output_base_shape.element_type(), module),
+          CollectiveDeviceList(), (*lhs.state().next_channel_id)++);
   ar->set_sharding(HloSharding::Replicate());
   return PartitionedHlo(ar, output_base_shape, lhs.state())
       .Reshard(output_sharding)
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index fc233f51925933..51bb55ed8c1cfd 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -1610,7 +1610,7 @@ PartitionedHlo PartitionedHlo::Broadcast() const {
       MakeBinaryAdd(shape.element_type(), state_.module);
 
   auto result = state_.collective_ops_creator.create_cross_partition_all_reduce(
-      state_.b, operand, reduction, {}, NewChannel());
+      state_.b, operand, reduction, CollectiveDeviceList(), NewChannel());
   result->set_sharding(HloSharding::Replicate());
   return PartitionedHlo(result, base_shape_, state_);
 }
@@ -1755,8 +1755,8 @@ PartitionedHlo PartitionedHlo::ReshardWithAllToAll(
     // After the reshape, it is guaranteed to have at least 3 dimensions.
     all_to_all =
         state_.collective_ops_creator.create_cross_partition_all_to_all(
-            state_.b, {reshape}, groups.flattened_replica_groups(),
-            (*state_.next_channel_id)++, target_dim);
+            state_.b, {reshape}, groups, (*state_.next_channel_id)++,
+            target_dim);
   }
   CHECK_NE(all_to_all, nullptr);
 
@@ -1942,8 +1942,7 @@ PartitionedHlo PartitionedHlo::TryMultipleSourceTargetDims(
         temp_target, eligible_target_dims, group_sizes);
     all_to_all =
         state_.collective_ops_creator.create_cross_partition_all_to_all(
-            state_.b, {reshape_1}, groups.flattened_replica_groups(),
-            (*state_.next_channel_id)++, 0);
+            state_.b, {reshape_1}, groups, (*state_.next_channel_id)++, 0);
   }
   // Step 3. Split sharding axes to multiple dimensions
   // 1. reshape_2 (8,16,8,16,8) -> (2,4,16,8,16,8)
@@ -4950,103 +4949,111 @@ absl::Status SpmdPartitioningVisitor::HandleRaggedDot(HloInstruction* hlo) {
   return absl::OkStatus();
 }
 
-SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
-                                                        int64_t num_replicas) {
-  auto create_all_reduce_lists_of_lists =
-      [num_replicas, num_partitions](
-          SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
-          const std::vector<std::vector<int64_t>>& partition_subgroups,
-          int64_t channel_id) {
-        std::vector<ReplicaGroup> device_groups;
-        if (partition_subgroups.size() <= 1) {
-          device_groups.reserve(num_replicas);
-          for (int64_t rid = 0; rid < num_replicas; ++rid) {
-            device_groups.emplace_back();
-            for (int64_t pid = 0; pid < num_partitions; ++pid) {
-              device_groups.back().add_replica_ids(rid * num_partitions + pid);
-            }
-          }
-        } else {
-          device_groups.reserve(partition_subgroups.size() * num_replicas);
-          for (int64_t rid = 0; rid < num_replicas; ++rid) {
-            for (const auto& pgroup : partition_subgroups) {
-              device_groups.emplace_back();
-              for (int64_t pid : pgroup) {
-                device_groups.back().add_replica_ids(rid * num_partitions +
-                                                     pid);
-              }
-            }
-          }
-        }
+HloInstruction* CreateAllReduceListsOfLists(
+    int64_t num_replicas, int64_t num_partitions, SpmdBuilder* b,
+    HloInstruction* operand, HloComputation* reduction,
+    const CollectiveDeviceListBase& device_list, int64_t channel_id) {
+  const auto& partition_subgroups = device_list.flattened_replica_groups();
 
-        HloComputation* reduction_clone =
-            reduction->parent()->AddComputationAndUnifyNamesAndIds(
-                reduction->Clone(), false);
-        HloInstruction* all_reduce =
-            b->AddInstruction(HloInstruction::CreateAllReduce(
-                operand->shape(), {operand}, reduction_clone,
-                CollectiveDeviceList(device_groups),
-                /*constrain_layout=*/false, channel_id,
-                /*use_global_device_ids=*/true));
-        return all_reduce;
-      };
-  auto create_all_to_all_list_of_lists =
-      [](SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-         const std::vector<std::vector<int64_t>>& partition_subgroups,
-         int64_t channel_id, std::optional<int64_t> split_dimension) {
-        std::vector<Shape> shapes(operands.size(), operands[0]->shape());
-        const Shape output_shape =
-            (shapes.size() == 1)
-                ? shapes[0]
-                : ShapeUtil::MakeValidatedTupleShape(shapes).value();
-        std::vector<ReplicaGroup> groups(partition_subgroups.size());
-        for (int64_t i = 0; i < groups.size(); ++i) {
-          for (int64_t id : partition_subgroups[i]) {
-            groups[i].add_replica_ids(id);
-          }
-        }
-        return b->AddInstruction(HloInstruction::CreateAllToAll(
-            output_shape, operands, CollectiveDeviceList(groups),
-            /*constrain_layout=*/false, channel_id, split_dimension));
-      };
-  auto create_all_gather_list_of_lists =
-      [num_replicas, num_partitions](
-          SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-          const std::vector<std::vector<int64_t>>& partition_subgroups,
-          int64_t channel_id, int64_t all_gather_dimension) {
-        std::vector<ReplicaGroup> device_groups;
-        device_groups.reserve(partition_subgroups.size() * num_replicas);
-        for (int64_t i = 0; i < num_replicas; ++i) {
-          for (const auto& pgroup : partition_subgroups) {
-            device_groups.emplace_back();
-            for (int64_t pid : pgroup) {
-              device_groups.back().add_replica_ids(i * num_partitions + pid);
-            }
-          }
-        }
-        return b->AddInstruction(HloInstruction::CreateAllGather(
-            ag_shape, {operand}, all_gather_dimension,
-            CollectiveDeviceList(device_groups),
-            /*constrain_layout=*/false, channel_id,
-            /*use_global_device_ids=*/true));
-      };
+  std::vector<std::vector<int64_t>> normalized_subgroups = partition_subgroups;
+  if (normalized_subgroups.size() <= 1) {
+    normalized_subgroups.assign(1, std::vector<int64_t>(num_partitions));
+    std::iota(normalized_subgroups[0].begin(), normalized_subgroups[0].end(),
+              0);
+  }
+
+  auto create_replica_group = [&](int64_t rid,
+                                  const std::vector<int64_t>& pids) {
+    ReplicaGroup group;
+    group.mutable_replica_ids()->Reserve(pids.size());
+    for (int64_t pid : pids) {
+      group.add_replica_ids(rid * num_partitions + pid);
+    }
+    return group;
+  };
+
+  std::vector<ReplicaGroup> device_groups;
+  device_groups.reserve(num_replicas * normalized_subgroups.size());
+  for (int64_t rid = 0; rid < num_replicas; ++rid) {
+    for (const auto& pgroup : normalized_subgroups) {
+      device_groups.push_back(create_replica_group(rid, pgroup));
+    }
+  }
 
+  HloComputation* reduction_clone =
+      reduction->parent()->AddComputationAndUnifyNamesAndIds(reduction->Clone(),
+                                                             false);
+  return b->AddInstruction(HloInstruction::CreateAllReduce(
+      operand->shape(), {operand}, reduction_clone,
+      CollectiveDeviceList(device_groups),
+      /*constrain_layout=*/false, channel_id,
+      /*use_global_device_ids=*/true));
+}
+
+HloInstruction* CreateAllToAllListsOfLists(
+    int64_t num_replicas, int64_t num_partitions, SpmdBuilder* b,
+    absl::Span<HloInstruction* const> operands,
+    const CollectiveDeviceListBase& device_list, int64_t channel_id,
+    std::optional<int64_t> split_dimension) {
+  const std::vector<std::vector<int64_t>>& partition_subgroups =
+      device_list.flattened_replica_groups();
+  std::vector<Shape> shapes(operands.size(), operands[0]->shape());
+  const Shape output_shape =
+      (shapes.size() == 1) ? shapes[0]
+                           : ShapeUtil::MakeValidatedTupleShape(shapes).value();
+  std::vector<ReplicaGroup> groups(partition_subgroups.size());
+  for (int64_t i = 0; i < groups.size(); ++i) {
+    for (int64_t id : partition_subgroups[i]) {
+      groups[i].add_replica_ids(id);
+    }
+  }
+  return b->AddInstruction(HloInstruction::CreateAllToAll(
+      output_shape, operands, CollectiveDeviceList(groups),
+      /*constrain_layout=*/false, channel_id, split_dimension));
+}
+
+HloInstruction* CreateAllGatherListsOfLists(
+    int64_t num_replicas, int64_t num_partitions, SpmdBuilder* b,
+    HloInstruction* operand, const Shape& ag_shape,
+    const CollectiveDeviceListBase& device_list, int64_t channel_id,
+    int64_t all_gather_dimension) {
+  const std::vector<std::vector<int64_t>>& partition_subgroups =
+      device_list.flattened_replica_groups();
+  std::vector<ReplicaGroup> device_groups;
+  device_groups.reserve(partition_subgroups.size() * num_replicas);
+  for (int64_t i = 0; i < num_replicas; ++i) {
+    for (const auto& pgroup : partition_subgroups) {
+      device_groups.emplace_back();
+      for (int64_t pid : pgroup) {
+        device_groups.back().add_replica_ids(i * num_partitions + pid);
+      }
+    }
+  }
+  return b->AddInstruction(
+      HloInstruction::CreateAllGather(ag_shape, {operand}, all_gather_dimension,
+                                      CollectiveDeviceList(device_groups),
+                                      /*constrain_layout=*/false, channel_id,
+                                      /*use_global_device_ids=*/true));
+}
+
+SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
+                                                        int64_t num_replicas) {
   SPMDCollectiveOpsCreator result = {
       .create_partition_id =
           [](SpmdBuilder* b) {
             return b->AddInstruction(HloInstruction::CreatePartitionId());
           },
       .create_cross_partition_all_reduce =
-          [create_all_reduce_lists_of_lists](
+          [num_replicas, num_partitions](
               SpmdBuilder* b, HloInstruction* operand,
               HloComputation* reduction,
-              const std::vector<std::vector<int64_t>>& partition_subgroups,
-              int64_t channel_id) {
-            return create_all_reduce_lists_of_lists(
-                b, operand, reduction, partition_subgroups, channel_id);
+              const CollectiveDeviceListBase& device_list, int64_t channel_id) {
+            return CreateAllReduceListsOfLists(num_replicas, num_partitions, b,
+                                               operand, reduction, device_list,
+                                               channel_id);
           },
       .create_cross_partition_all_reduce_with_iota_device_list =
-          [create_all_reduce_lists_of_lists, num_replicas, num_partitions](
+          [num_replicas, num_partitions](
               SpmdBuilder* b, HloInstruction* operand,
               HloComputation* reduction,
               const IotaReplicaGroupList& partition_group_list,
@@ -5054,9 +5061,9 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
             // Fallback to list of lists collective creation if the partition
             // group list does not utilize all the partitions.
             if (partition_group_list.num_total_devices() != num_partitions) {
-              return create_all_reduce_lists_of_lists(
-                  b, operand, reduction,
-                  partition_group_list.flattened_replica_groups(), channel_id);
+              return CreateAllReduceListsOfLists(
+                  num_replicas, num_partitions, b, operand, reduction,
+                  partition_group_list, channel_id);
             }
             HloComputation* reduction_clone =
                 reduction->parent()->AddComputationAndUnifyNamesAndIds(
@@ -5096,24 +5103,25 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 operand->shape(), operand, src_dst_pairs, channel_id));
           },
       .create_cross_partition_all_to_all =
-          [create_all_to_all_list_of_lists](
+          [num_replicas, num_partitions](
               SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-              const std::vector<std::vector<int64_t>>& partition_subgroups,
-              int64_t channel_id, std::optional<int64_t> split_dimension) {
-            return create_all_to_all_list_of_lists(
-                b, operands, partition_subgroups, channel_id, split_dimension);
+              const CollectiveDeviceListBase& device_list, int64_t channel_id,
+              std::optional<int64_t> split_dimension) {
+            return CreateAllToAllListsOfLists(num_replicas, num_partitions, b,
+                                              operands, device_list, channel_id,
+                                              split_dimension);
           },
       .create_cross_partition_all_to_all_with_iota_device_list =
-          [create_all_to_all_list_of_lists, num_replicas, num_partitions](
+          [num_replicas, num_partitions](
               SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
               const IotaReplicaGroupList& partition_group_list,
               int64_t channel_id, std::optional<int64_t> split_dimension) {
             // Fallback back to list of lists collective creation if the
             // partition group list does not utilize all the partitions.
             if (partition_group_list.num_total_devices() != num_partitions) {
-              return create_all_to_all_list_of_lists(
-                  b, operands, partition_group_list.flattened_replica_groups(),
-                  channel_id, split_dimension);
+              return CreateAllToAllListsOfLists(num_replicas, num_partitions, b,
+                                                operands, partition_group_list,
+                                                channel_id, split_dimension);
             }
             std::vector<Shape> shapes(operands.size(), operands[0]->shape());
             const Shape output_shape = (shapes.size() == 1)
@@ -5126,26 +5134,25 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 /*constrain_layout=*/false, channel_id, split_dimension));
           },
       .create_cross_partition_all_gather =
-          [create_all_gather_list_of_lists](
+          [num_replicas, num_partitions](
               SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-              const std::vector<std::vector<int64_t>>& partition_subgroups,
-              int64_t channel_id, int64_t all_gather_dimension) {
-            return create_all_gather_list_of_lists(
-                b, operand, ag_shape, partition_subgroups, channel_id,
-                all_gather_dimension);
+              const CollectiveDeviceListBase& device_list, int64_t channel_id,
+              int64_t all_gather_dimension) {
+            return CreateAllGatherListsOfLists(
+                num_replicas, num_partitions, b, operand, ag_shape, device_list,
+                channel_id, all_gather_dimension);
           },
       .create_cross_partition_all_gather_with_iota_device_list =
-          [create_all_gather_list_of_lists, num_replicas, num_partitions](
+          [num_replicas, num_partitions](
               SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
               const IotaReplicaGroupList& partition_group_list,
               int64_t channel_id, int64_t all_gather_dimension) {
             // Fallback to list of lists collective creation if the partition
             // group list does not utilize all the partitions.
             if (partition_group_list.num_total_devices() != num_partitions) {
-              return create_all_gather_list_of_lists(
-                  b, operand, ag_shape,
-                  partition_group_list.flattened_replica_groups(), channel_id,
-                  all_gather_dimension);
+              return CreateAllGatherListsOfLists(
+                  num_replicas, num_partitions, b, operand, ag_shape,
+                  partition_group_list, channel_id, all_gather_dimension);
             }
             return b->AddInstruction(HloInstruction::CreateAllGather(
                 ag_shape, {operand}, all_gather_dimension,
@@ -5211,9 +5218,7 @@ SpmdPartitioner::AllGatherShardsInternal(
             *it, result_shape.dimensions(*it) *
                      partition_subgroups.num_devices_per_group());
         result = collectives_creator.create_cross_partition_all_gather(
-            b, result, result_shape,
-            partition_subgroups.flattened_replica_groups(),
-            (*next_channel_id)++,
+            b, result, result_shape, partition_subgroups, (*next_channel_id)++,
             /*all_gather_dimension=*/*it);
       }
     }
@@ -5252,7 +5257,7 @@ SpmdPartitioner::AllGatherShardsInternal(
     shape[0] *= partition_subgroups.num_devices_per_group();
     result = collectives_creator.create_cross_partition_all_gather(
         b, result, ShapeUtil::MakeShape(operand->shape().element_type(), shape),
-        partition_subgroups.flattened_replica_groups(), (*next_channel_id)++,
+        partition_subgroups, (*next_channel_id)++,
         /*all_gather_dimension=*/0);
   }
   ag = result;
@@ -5341,8 +5346,7 @@ HloInstruction* SpmdPartitioner::AllReduceAlongShardingDimsInternal(
     auto partition_subgroups =
         GetPartitionGroupsForReplication(sharding, selected_dims);
     return collectives_creator.create_cross_partition_all_reduce(
-        b, operand, reduction, partition_subgroups.flattened_replica_groups(),
-        (*next_channel_id)++);
+        b, operand, reduction, partition_subgroups, (*next_channel_id)++);
   }
 
   auto result = operand;
@@ -5365,8 +5369,7 @@ HloInstruction* SpmdPartitioner::AllReduceAlongShardingDimsInternal(
       auto partition_subgroups =
           GetPartitionGroupsForReplication(sharding, {*it});
       result = collectives_creator.create_cross_partition_all_reduce(
-          b, result, reduction, partition_subgroups.flattened_replica_groups(),
-          (*next_channel_id)++);
+          b, result, reduction, partition_subgroups, (*next_channel_id)++);
     }
   }
   return result;
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.h b/third_party/xla/xla/service/spmd/spmd_partitioner.h
index 8aae8502e73d9b..8d26760a15fd98 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.h
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.h
@@ -203,8 +203,7 @@ struct SPMDCollectiveOpsCreator {
   // Function used to create a cross-partition all-reduce HLO.
   std::function<HloInstruction*(
       SpmdBuilder*, HloInstruction* operand, HloComputation* reduction,
-      const std::vector<std::vector<int64_t>>& partition_subgroups,
-      int64_t channel_id)>
+      const CollectiveDeviceListBase& partition_subgroups, int64_t channel_id)>
       create_cross_partition_all_reduce;
 
   // Function used to create a cross-partition all-reduce HLO using device list
@@ -227,8 +226,8 @@ struct SPMDCollectiveOpsCreator {
   // Function used to create a cross-partition all-to-all HLO.
   std::function<HloInstruction*(
       SpmdBuilder*, absl::Span<HloInstruction* const> operands,
-      const std::vector<std::vector<int64_t>>& partition_subgroups,
-      int64_t channel_id, std::optional<int64_t> split_dimension)>
+      const CollectiveDeviceListBase& partition_subgroups, int64_t channel_id,
+      std::optional<int64_t> split_dimension)>
       create_cross_partition_all_to_all;
 
   // Function used to create a cross-partition all-to-all HLO using device list
@@ -244,8 +243,8 @@ struct SPMDCollectiveOpsCreator {
   // if it is nullptr, the partitioner will use all-reduce instead.
   std::function<HloInstruction*(
       SpmdBuilder*, HloInstruction* operand, const Shape& ag_shape,
-      const std::vector<std::vector<int64_t>>& partition_subgroups,
-      int64_t channel_id, int64_t all_gather_dimension)>
+      const CollectiveDeviceListBase& partition_subgroups, int64_t channel_id,
+      int64_t all_gather_dimension)>
       create_cross_partition_all_gather;
 
   // Function used to create a cross-partition all-gather HLO using device list
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
index c2b418440c1cc0..f9ac602d258afc 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
@@ -461,14 +461,16 @@ bool IsIota(const Array<int64_t>& x) {
 
 // Expand the device groups, making each device group follow the format of the
 // partition group.
-std::vector<std::vector<int64_t>> ExpandDeviceGroups(
+CollectiveDeviceList ExpandDeviceGroups(
     const DeviceGroupTileAssignment& device_groups,
-    const std::vector<std::vector<int64_t>>& partition_subgroups) {
+    const CollectiveDeviceListBase& collective_device_list) {
   // Example: Given device groups of {{0,1,2,3},{4,5,6,7}} and partition
   // subgroups of {{0,2}, {1,3}} returns device groups of {{0,2}, {1,3}, {4,6},
   // {5,7}}
+  const std::vector<std::vector<int64_t>>& partition_subgroups =
+      collective_device_list.flattened_replica_groups();
   if (partition_subgroups.empty()) {
-    return device_groups.flattened_device_groups();
+    return CollectiveDeviceList(device_groups.flattened_device_groups());
   }
   std::vector<std::vector<int64_t>> result(partition_subgroups.size() *
                                            device_groups.num_groups());
@@ -482,7 +484,7 @@ std::vector<std::vector<int64_t>> ExpandDeviceGroups(
       }
     }
   }
-  return result;
+  return CollectiveDeviceList(result);
 }
 
 // Expand the device groups, making each device group follow the format of the
@@ -552,7 +554,7 @@ CreateCrossPartitionAllReduce(
     std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
   return [creator, device_groups_ptr](
              SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
-             const std::vector<std::vector<int64_t>>& partition_subgroups,
+             const CollectiveDeviceListBase& partition_subgroups,
              int64_t channel_id) {
     return creator.create_cross_partition_all_reduce(
         b, operand, reduction,
@@ -577,8 +579,7 @@ CreateCrossPartitionAllReduceWithIotaDeviceList(
     if (!expanded_iota_partition_group_list.has_value()) {
       return creator.create_cross_partition_all_reduce(
           b, operand, reduction,
-          ExpandDeviceGroups(*device_groups_ptr,
-                             partition_group_list.flattened_replica_groups()),
+          ExpandDeviceGroups(*device_groups_ptr, partition_group_list),
           channel_id);
     }
     return creator.create_cross_partition_all_reduce_with_iota_device_list(
@@ -615,7 +616,7 @@ CreateCrossPartitionAllToAll(
     std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
   return [creator, device_groups_ptr](
              SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-             const std::vector<std::vector<int64_t>>& partition_subgroups,
+             const CollectiveDeviceListBase& partition_subgroups,
              int64_t channel_id, std::optional<int64_t> split_dimension) {
     return creator.create_cross_partition_all_to_all(
         b, operands,
@@ -640,8 +641,7 @@ CreateCrossPartitionAllToAllWithIotaDeviceList(
     if (!expanded_iota_partition_group_list.has_value()) {
       return creator.create_cross_partition_all_to_all(
           b, operands,
-          ExpandDeviceGroups(*device_groups_ptr,
-                             partition_group_list.flattened_replica_groups()),
+          ExpandDeviceGroups(*device_groups_ptr, partition_group_list),
           channel_id, split_dimension);
     }
     return creator.create_cross_partition_all_to_all_with_iota_device_list(
@@ -656,7 +656,7 @@ CreateCrossPartitionAllGather(
     std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
   return [creator, device_groups_ptr](
              SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-             const std::vector<std::vector<int64_t>>& partition_subgroups,
+             const CollectiveDeviceListBase& partition_subgroups,
              int64_t channel_id, int64_t all_gather_dimension) {
     return creator.create_cross_partition_all_gather(
         b, operand, ag_shape,
@@ -682,8 +682,7 @@ CreateCrossPartitionAllGatherWithIotaDeviceList(
     if (!expanded_iota_partition_group_list.has_value()) {
       return creator.create_cross_partition_all_gather(
           b, operand, ag_shape,
-          ExpandDeviceGroups(*device_groups_ptr,
-                             partition_group_list.flattened_replica_groups()),
+          ExpandDeviceGroups(*device_groups_ptr, partition_group_list),
           channel_id, all_gather_dimension);
     }
     return creator.create_cross_partition_all_gather_with_iota_device_list(

From 53988344aa1fc52926e9638aad088fc70f4e5f36 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 19:29:49 -0800
Subject: [PATCH 730/753] Automated Code Change

PiperOrigin-RevId: 848375547
---
 third_party/xla/xla/python/ifrt/attribute_map_test.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt/attribute_map_test.cc b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
index e81a3b8448a735..3f0bad64f22583 100644
--- a/third_party/xla/xla/python/ifrt/attribute_map_test.cc
+++ b/third_party/xla/xla/python/ifrt/attribute_map_test.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
 #include "absl/status/status_matchers.h"
-#include "absl/types/span.h"
 #include "xla/python/ifrt/serdes_test_util.h"
 #include "xla/python/ifrt/serdes_version.h"
 #include "xla/tsl/lib/core/status_test_util.h"

From 7d0d53475633627ec999d904d58c6f50b352dd36 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 20:01:30 -0800
Subject: [PATCH 731/753] Automated Code Change

PiperOrigin-RevId: 848382953
---
 .../xla/xla/hlo/transforms/host_offloader.cc  | 43 +++++++++----------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/third_party/xla/xla/hlo/transforms/host_offloader.cc b/third_party/xla/xla/hlo/transforms/host_offloader.cc
index f72145b2e6392a..b31481ecbe2578 100644
--- a/third_party/xla/xla/hlo/transforms/host_offloader.cc
+++ b/third_party/xla/xla/hlo/transforms/host_offloader.cc
@@ -249,11 +249,11 @@ absl::StatusOr<bool> HostOffloader::WalkDownHostMemoryOffloadPaths(
       if (is_end_of_offload) {
         // This DynamicSlice is the end of this path of host memory offload.
         continue;
-      } else {
-        // This is not the end of host memory offload. This is treated as device
-        // compute happening on host memory, convert it to host compute.
-        need_to_wrap_instruction_as_host_compute = true;
-      }
+      }  // This is not the end of host memory offload. This is treated as
+         // device
+      // compute happening on host memory, convert it to host compute.
+      need_to_wrap_instruction_as_host_compute = true;
+
     } else if (instruction->opcode() == HloOpcode::kSlice) {
       TF_ASSIGN_OR_RETURN(bool is_end_of_offload,
                           SliceLeadsToMoveToDeviceCustomCall(instruction));
@@ -263,11 +263,11 @@ absl::StatusOr<bool> HostOffloader::WalkDownHostMemoryOffloadPaths(
         // memory.
         slices_to_dynamify.insert(instruction);
         continue;
-      } else {
-        // This is not the end of host memory offload. This is treated as device
-        // compute happening on host memory, convert it to host compute.
-        need_to_wrap_instruction_as_host_compute = true;
-      }
+      }  // This is not the end of host memory offload. This is treated as
+         // device
+      // compute happening on host memory, convert it to host compute.
+      need_to_wrap_instruction_as_host_compute = true;
+
     } else if (instruction->opcode() == HloOpcode::kCopy) {
       if (instruction->shape() == instruction->operand(0)->shape()) {
         need_to_wrap_instruction_as_host_compute = true;
@@ -343,17 +343,16 @@ absl::StatusOr<bool> HostOffloader::WalkDownHostMemoryOffloadPaths(
             "Memory offloaded starting from %s is output streamed",
             starting_instruction_and_index.ToString());
         continue;
-      } else {
-        if (VLOG_IS_ON(1)) {
-          LOG(INFO) << "Instruction trace leading to error:";
-          PrintTrace(instruction_and_shape_index, previous);
-        }
-        return absl::InvalidArgumentError(
-            absl::StrFormat("Tensor which is moved to host (starting from %s) "
-                            "is returned from the entry computation but the "
-                            "layout for this output is not set to host memory.",
-                            starting_instruction->name()));
       }
+      if (VLOG_IS_ON(1)) {
+        LOG(INFO) << "Instruction trace leading to error:";
+        PrintTrace(instruction_and_shape_index, previous);
+      }
+      return absl::InvalidArgumentError(
+          absl::StrFormat("Tensor which is moved to host (starting from %s) "
+                          "is returned from the entry computation but the "
+                          "layout for this output is not set to host memory.",
+                          starting_instruction->name()));
     }
     // Push successors onto the queue to be visited.
     TF_ASSIGN_OR_RETURN(
@@ -643,8 +642,8 @@ HostOffloader::GetStartingInstructions(
       // Found a DynamicUpdateSlice.
       result.push_back(instruction_and_shape);
       continue;
-    } else if (!InstructionIsAllowedBetweenMoveToHostAndDus(
-                   current_instruction)) {
+    }
+    if (!InstructionIsAllowedBetweenMoveToHostAndDus(current_instruction)) {
       // Found the start of "normal" memory offloading.
       result.push_back(instruction_and_shape);
       continue;

From ac8d8c396d0ca323d9584aa16a3da4c705b1b10a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 20:15:41 -0800
Subject: [PATCH 732/753] Reverts 93ae7c23eacc5ca0a0820e684d0f37a4b4a41081

PiperOrigin-RevId: 848387274
---
 .../xla/service/spmd/convolution_handler.cc   |   9 +-
 .../xla/xla/service/spmd/spmd_partitioner.cc  | 245 +++++++++---------
 .../xla/xla/service/spmd/spmd_partitioner.h   |  11 +-
 .../xla/service/spmd/spmd_partitioner_util.cc |  25 +-
 4 files changed, 144 insertions(+), 146 deletions(-)

diff --git a/third_party/xla/xla/service/spmd/convolution_handler.cc b/third_party/xla/xla/service/spmd/convolution_handler.cc
index 737fbad8e03c08..29e7e6ab77dd05 100644
--- a/third_party/xla/xla/service/spmd/convolution_handler.cc
+++ b/third_party/xla/xla/service/spmd/convolution_handler.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_instruction.h"
 #include "xla/hlo/ir/hlo_opcode.h"
 #include "xla/hlo/ir/hlo_sharding.h"
-#include "xla/hlo/ir/replica_group.h"
 #include "xla/hlo/utils/hlo_sharding_util.h"
 #include "xla/literal_util.h"
 #include "xla/service/dot_as_convolution_util.h"
@@ -513,8 +512,8 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS(
           new_window));
 
   auto ar = collective_ops_creator.create_cross_partition_all_reduce(
-      b, conv, MakeBinaryAdd(original_hlo->shape().element_type(), module),
-      CollectiveDeviceList(), (*lhs.state().next_channel_id)++);
+      b, conv, MakeBinaryAdd(original_hlo->shape().element_type(), module), {},
+      (*lhs.state().next_channel_id)++);
   ar->set_sharding(HloSharding::Replicate());
   return PartitionedHlo(ar, output_base_shape, lhs.state())
       .Reshard(output_sharding)
@@ -740,8 +739,8 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS(
           new_window));
   auto ar =
       lhs.state().collective_ops_creator.create_cross_partition_all_reduce(
-          b, conv, MakeBinaryAdd(output_base_shape.element_type(), module),
-          CollectiveDeviceList(), (*lhs.state().next_channel_id)++);
+          b, conv, MakeBinaryAdd(output_base_shape.element_type(), module), {},
+          (*lhs.state().next_channel_id)++);
   ar->set_sharding(HloSharding::Replicate());
   return PartitionedHlo(ar, output_base_shape, lhs.state())
       .Reshard(output_sharding)
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.cc b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
index 51bb55ed8c1cfd..fc233f51925933 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.cc
@@ -1610,7 +1610,7 @@ PartitionedHlo PartitionedHlo::Broadcast() const {
       MakeBinaryAdd(shape.element_type(), state_.module);
 
   auto result = state_.collective_ops_creator.create_cross_partition_all_reduce(
-      state_.b, operand, reduction, CollectiveDeviceList(), NewChannel());
+      state_.b, operand, reduction, {}, NewChannel());
   result->set_sharding(HloSharding::Replicate());
   return PartitionedHlo(result, base_shape_, state_);
 }
@@ -1755,8 +1755,8 @@ PartitionedHlo PartitionedHlo::ReshardWithAllToAll(
     // After the reshape, it is guaranteed to have at least 3 dimensions.
     all_to_all =
         state_.collective_ops_creator.create_cross_partition_all_to_all(
-            state_.b, {reshape}, groups, (*state_.next_channel_id)++,
-            target_dim);
+            state_.b, {reshape}, groups.flattened_replica_groups(),
+            (*state_.next_channel_id)++, target_dim);
   }
   CHECK_NE(all_to_all, nullptr);
 
@@ -1942,7 +1942,8 @@ PartitionedHlo PartitionedHlo::TryMultipleSourceTargetDims(
         temp_target, eligible_target_dims, group_sizes);
     all_to_all =
         state_.collective_ops_creator.create_cross_partition_all_to_all(
-            state_.b, {reshape_1}, groups, (*state_.next_channel_id)++, 0);
+            state_.b, {reshape_1}, groups.flattened_replica_groups(),
+            (*state_.next_channel_id)++, 0);
   }
   // Step 3. Split sharding axes to multiple dimensions
   // 1. reshape_2 (8,16,8,16,8) -> (2,4,16,8,16,8)
@@ -4949,111 +4950,103 @@ absl::Status SpmdPartitioningVisitor::HandleRaggedDot(HloInstruction* hlo) {
   return absl::OkStatus();
 }
 
-HloInstruction* CreateAllReduceListsOfLists(
-    int64_t num_replicas, int64_t num_partitions, SpmdBuilder* b,
-    HloInstruction* operand, HloComputation* reduction,
-    const CollectiveDeviceListBase& device_list, int64_t channel_id) {
-  const auto& partition_subgroups = device_list.flattened_replica_groups();
-
-  std::vector<std::vector<int64_t>> normalized_subgroups = partition_subgroups;
-  if (normalized_subgroups.size() <= 1) {
-    normalized_subgroups.assign(1, std::vector<int64_t>(num_partitions));
-    std::iota(normalized_subgroups[0].begin(), normalized_subgroups[0].end(),
-              0);
-  }
-
-  auto create_replica_group = [&](int64_t rid,
-                                  const std::vector<int64_t>& pids) {
-    ReplicaGroup group;
-    group.mutable_replica_ids()->Reserve(pids.size());
-    for (int64_t pid : pids) {
-      group.add_replica_ids(rid * num_partitions + pid);
-    }
-    return group;
-  };
-
-  std::vector<ReplicaGroup> device_groups;
-  device_groups.reserve(num_replicas * normalized_subgroups.size());
-  for (int64_t rid = 0; rid < num_replicas; ++rid) {
-    for (const auto& pgroup : normalized_subgroups) {
-      device_groups.push_back(create_replica_group(rid, pgroup));
-    }
-  }
-
-  HloComputation* reduction_clone =
-      reduction->parent()->AddComputationAndUnifyNamesAndIds(reduction->Clone(),
-                                                             false);
-  return b->AddInstruction(HloInstruction::CreateAllReduce(
-      operand->shape(), {operand}, reduction_clone,
-      CollectiveDeviceList(device_groups),
-      /*constrain_layout=*/false, channel_id,
-      /*use_global_device_ids=*/true));
-}
-
-HloInstruction* CreateAllToAllListsOfLists(
-    int64_t num_replicas, int64_t num_partitions, SpmdBuilder* b,
-    absl::Span<HloInstruction* const> operands,
-    const CollectiveDeviceListBase& device_list, int64_t channel_id,
-    std::optional<int64_t> split_dimension) {
-  const std::vector<std::vector<int64_t>>& partition_subgroups =
-      device_list.flattened_replica_groups();
-  std::vector<Shape> shapes(operands.size(), operands[0]->shape());
-  const Shape output_shape =
-      (shapes.size() == 1) ? shapes[0]
-                           : ShapeUtil::MakeValidatedTupleShape(shapes).value();
-  std::vector<ReplicaGroup> groups(partition_subgroups.size());
-  for (int64_t i = 0; i < groups.size(); ++i) {
-    for (int64_t id : partition_subgroups[i]) {
-      groups[i].add_replica_ids(id);
-    }
-  }
-  return b->AddInstruction(HloInstruction::CreateAllToAll(
-      output_shape, operands, CollectiveDeviceList(groups),
-      /*constrain_layout=*/false, channel_id, split_dimension));
-}
-
-HloInstruction* CreateAllGatherListsOfLists(
-    int64_t num_replicas, int64_t num_partitions, SpmdBuilder* b,
-    HloInstruction* operand, const Shape& ag_shape,
-    const CollectiveDeviceListBase& device_list, int64_t channel_id,
-    int64_t all_gather_dimension) {
-  const std::vector<std::vector<int64_t>>& partition_subgroups =
-      device_list.flattened_replica_groups();
-  std::vector<ReplicaGroup> device_groups;
-  device_groups.reserve(partition_subgroups.size() * num_replicas);
-  for (int64_t i = 0; i < num_replicas; ++i) {
-    for (const auto& pgroup : partition_subgroups) {
-      device_groups.emplace_back();
-      for (int64_t pid : pgroup) {
-        device_groups.back().add_replica_ids(i * num_partitions + pid);
-      }
-    }
-  }
-  return b->AddInstruction(
-      HloInstruction::CreateAllGather(ag_shape, {operand}, all_gather_dimension,
-                                      CollectiveDeviceList(device_groups),
-                                      /*constrain_layout=*/false, channel_id,
-                                      /*use_global_device_ids=*/true));
-}
-
 SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                                                         int64_t num_replicas) {
+  auto create_all_reduce_lists_of_lists =
+      [num_replicas, num_partitions](
+          SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
+          const std::vector<std::vector<int64_t>>& partition_subgroups,
+          int64_t channel_id) {
+        std::vector<ReplicaGroup> device_groups;
+        if (partition_subgroups.size() <= 1) {
+          device_groups.reserve(num_replicas);
+          for (int64_t rid = 0; rid < num_replicas; ++rid) {
+            device_groups.emplace_back();
+            for (int64_t pid = 0; pid < num_partitions; ++pid) {
+              device_groups.back().add_replica_ids(rid * num_partitions + pid);
+            }
+          }
+        } else {
+          device_groups.reserve(partition_subgroups.size() * num_replicas);
+          for (int64_t rid = 0; rid < num_replicas; ++rid) {
+            for (const auto& pgroup : partition_subgroups) {
+              device_groups.emplace_back();
+              for (int64_t pid : pgroup) {
+                device_groups.back().add_replica_ids(rid * num_partitions +
+                                                     pid);
+              }
+            }
+          }
+        }
+
+        HloComputation* reduction_clone =
+            reduction->parent()->AddComputationAndUnifyNamesAndIds(
+                reduction->Clone(), false);
+        HloInstruction* all_reduce =
+            b->AddInstruction(HloInstruction::CreateAllReduce(
+                operand->shape(), {operand}, reduction_clone,
+                CollectiveDeviceList(device_groups),
+                /*constrain_layout=*/false, channel_id,
+                /*use_global_device_ids=*/true));
+        return all_reduce;
+      };
+  auto create_all_to_all_list_of_lists =
+      [](SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
+         const std::vector<std::vector<int64_t>>& partition_subgroups,
+         int64_t channel_id, std::optional<int64_t> split_dimension) {
+        std::vector<Shape> shapes(operands.size(), operands[0]->shape());
+        const Shape output_shape =
+            (shapes.size() == 1)
+                ? shapes[0]
+                : ShapeUtil::MakeValidatedTupleShape(shapes).value();
+        std::vector<ReplicaGroup> groups(partition_subgroups.size());
+        for (int64_t i = 0; i < groups.size(); ++i) {
+          for (int64_t id : partition_subgroups[i]) {
+            groups[i].add_replica_ids(id);
+          }
+        }
+        return b->AddInstruction(HloInstruction::CreateAllToAll(
+            output_shape, operands, CollectiveDeviceList(groups),
+            /*constrain_layout=*/false, channel_id, split_dimension));
+      };
+  auto create_all_gather_list_of_lists =
+      [num_replicas, num_partitions](
+          SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
+          const std::vector<std::vector<int64_t>>& partition_subgroups,
+          int64_t channel_id, int64_t all_gather_dimension) {
+        std::vector<ReplicaGroup> device_groups;
+        device_groups.reserve(partition_subgroups.size() * num_replicas);
+        for (int64_t i = 0; i < num_replicas; ++i) {
+          for (const auto& pgroup : partition_subgroups) {
+            device_groups.emplace_back();
+            for (int64_t pid : pgroup) {
+              device_groups.back().add_replica_ids(i * num_partitions + pid);
+            }
+          }
+        }
+        return b->AddInstruction(HloInstruction::CreateAllGather(
+            ag_shape, {operand}, all_gather_dimension,
+            CollectiveDeviceList(device_groups),
+            /*constrain_layout=*/false, channel_id,
+            /*use_global_device_ids=*/true));
+      };
+
   SPMDCollectiveOpsCreator result = {
       .create_partition_id =
           [](SpmdBuilder* b) {
             return b->AddInstruction(HloInstruction::CreatePartitionId());
           },
       .create_cross_partition_all_reduce =
-          [num_replicas, num_partitions](
+          [create_all_reduce_lists_of_lists](
               SpmdBuilder* b, HloInstruction* operand,
               HloComputation* reduction,
-              const CollectiveDeviceListBase& device_list, int64_t channel_id) {
-            return CreateAllReduceListsOfLists(num_replicas, num_partitions, b,
-                                               operand, reduction, device_list,
-                                               channel_id);
+              const std::vector<std::vector<int64_t>>& partition_subgroups,
+              int64_t channel_id) {
+            return create_all_reduce_lists_of_lists(
+                b, operand, reduction, partition_subgroups, channel_id);
           },
       .create_cross_partition_all_reduce_with_iota_device_list =
-          [num_replicas, num_partitions](
+          [create_all_reduce_lists_of_lists, num_replicas, num_partitions](
               SpmdBuilder* b, HloInstruction* operand,
               HloComputation* reduction,
               const IotaReplicaGroupList& partition_group_list,
@@ -5061,9 +5054,9 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
             // Fallback to list of lists collective creation if the partition
             // group list does not utilize all the partitions.
             if (partition_group_list.num_total_devices() != num_partitions) {
-              return CreateAllReduceListsOfLists(
-                  num_replicas, num_partitions, b, operand, reduction,
-                  partition_group_list, channel_id);
+              return create_all_reduce_lists_of_lists(
+                  b, operand, reduction,
+                  partition_group_list.flattened_replica_groups(), channel_id);
             }
             HloComputation* reduction_clone =
                 reduction->parent()->AddComputationAndUnifyNamesAndIds(
@@ -5103,25 +5096,24 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 operand->shape(), operand, src_dst_pairs, channel_id));
           },
       .create_cross_partition_all_to_all =
-          [num_replicas, num_partitions](
+          [create_all_to_all_list_of_lists](
               SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-              const CollectiveDeviceListBase& device_list, int64_t channel_id,
-              std::optional<int64_t> split_dimension) {
-            return CreateAllToAllListsOfLists(num_replicas, num_partitions, b,
-                                              operands, device_list, channel_id,
-                                              split_dimension);
+              const std::vector<std::vector<int64_t>>& partition_subgroups,
+              int64_t channel_id, std::optional<int64_t> split_dimension) {
+            return create_all_to_all_list_of_lists(
+                b, operands, partition_subgroups, channel_id, split_dimension);
           },
       .create_cross_partition_all_to_all_with_iota_device_list =
-          [num_replicas, num_partitions](
+          [create_all_to_all_list_of_lists, num_replicas, num_partitions](
               SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
               const IotaReplicaGroupList& partition_group_list,
               int64_t channel_id, std::optional<int64_t> split_dimension) {
             // Fallback back to list of lists collective creation if the
             // partition group list does not utilize all the partitions.
             if (partition_group_list.num_total_devices() != num_partitions) {
-              return CreateAllToAllListsOfLists(num_replicas, num_partitions, b,
-                                                operands, partition_group_list,
-                                                channel_id, split_dimension);
+              return create_all_to_all_list_of_lists(
+                  b, operands, partition_group_list.flattened_replica_groups(),
+                  channel_id, split_dimension);
             }
             std::vector<Shape> shapes(operands.size(), operands[0]->shape());
             const Shape output_shape = (shapes.size() == 1)
@@ -5134,25 +5126,26 @@ SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64_t num_partitions,
                 /*constrain_layout=*/false, channel_id, split_dimension));
           },
       .create_cross_partition_all_gather =
-          [num_replicas, num_partitions](
+          [create_all_gather_list_of_lists](
               SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-              const CollectiveDeviceListBase& device_list, int64_t channel_id,
-              int64_t all_gather_dimension) {
-            return CreateAllGatherListsOfLists(
-                num_replicas, num_partitions, b, operand, ag_shape, device_list,
-                channel_id, all_gather_dimension);
+              const std::vector<std::vector<int64_t>>& partition_subgroups,
+              int64_t channel_id, int64_t all_gather_dimension) {
+            return create_all_gather_list_of_lists(
+                b, operand, ag_shape, partition_subgroups, channel_id,
+                all_gather_dimension);
           },
       .create_cross_partition_all_gather_with_iota_device_list =
-          [num_replicas, num_partitions](
+          [create_all_gather_list_of_lists, num_replicas, num_partitions](
               SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
               const IotaReplicaGroupList& partition_group_list,
               int64_t channel_id, int64_t all_gather_dimension) {
             // Fallback to list of lists collective creation if the partition
             // group list does not utilize all the partitions.
             if (partition_group_list.num_total_devices() != num_partitions) {
-              return CreateAllGatherListsOfLists(
-                  num_replicas, num_partitions, b, operand, ag_shape,
-                  partition_group_list, channel_id, all_gather_dimension);
+              return create_all_gather_list_of_lists(
+                  b, operand, ag_shape,
+                  partition_group_list.flattened_replica_groups(), channel_id,
+                  all_gather_dimension);
             }
             return b->AddInstruction(HloInstruction::CreateAllGather(
                 ag_shape, {operand}, all_gather_dimension,
@@ -5218,7 +5211,9 @@ SpmdPartitioner::AllGatherShardsInternal(
             *it, result_shape.dimensions(*it) *
                      partition_subgroups.num_devices_per_group());
         result = collectives_creator.create_cross_partition_all_gather(
-            b, result, result_shape, partition_subgroups, (*next_channel_id)++,
+            b, result, result_shape,
+            partition_subgroups.flattened_replica_groups(),
+            (*next_channel_id)++,
             /*all_gather_dimension=*/*it);
       }
     }
@@ -5257,7 +5252,7 @@ SpmdPartitioner::AllGatherShardsInternal(
     shape[0] *= partition_subgroups.num_devices_per_group();
     result = collectives_creator.create_cross_partition_all_gather(
         b, result, ShapeUtil::MakeShape(operand->shape().element_type(), shape),
-        partition_subgroups, (*next_channel_id)++,
+        partition_subgroups.flattened_replica_groups(), (*next_channel_id)++,
         /*all_gather_dimension=*/0);
   }
   ag = result;
@@ -5346,7 +5341,8 @@ HloInstruction* SpmdPartitioner::AllReduceAlongShardingDimsInternal(
     auto partition_subgroups =
         GetPartitionGroupsForReplication(sharding, selected_dims);
     return collectives_creator.create_cross_partition_all_reduce(
-        b, operand, reduction, partition_subgroups, (*next_channel_id)++);
+        b, operand, reduction, partition_subgroups.flattened_replica_groups(),
+        (*next_channel_id)++);
   }
 
   auto result = operand;
@@ -5369,7 +5365,8 @@ HloInstruction* SpmdPartitioner::AllReduceAlongShardingDimsInternal(
       auto partition_subgroups =
           GetPartitionGroupsForReplication(sharding, {*it});
       result = collectives_creator.create_cross_partition_all_reduce(
-          b, result, reduction, partition_subgroups, (*next_channel_id)++);
+          b, result, reduction, partition_subgroups.flattened_replica_groups(),
+          (*next_channel_id)++);
     }
   }
   return result;
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner.h b/third_party/xla/xla/service/spmd/spmd_partitioner.h
index 8d26760a15fd98..8aae8502e73d9b 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner.h
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner.h
@@ -203,7 +203,8 @@ struct SPMDCollectiveOpsCreator {
   // Function used to create a cross-partition all-reduce HLO.
   std::function<HloInstruction*(
       SpmdBuilder*, HloInstruction* operand, HloComputation* reduction,
-      const CollectiveDeviceListBase& partition_subgroups, int64_t channel_id)>
+      const std::vector<std::vector<int64_t>>& partition_subgroups,
+      int64_t channel_id)>
       create_cross_partition_all_reduce;
 
   // Function used to create a cross-partition all-reduce HLO using device list
@@ -226,8 +227,8 @@ struct SPMDCollectiveOpsCreator {
   // Function used to create a cross-partition all-to-all HLO.
   std::function<HloInstruction*(
       SpmdBuilder*, absl::Span<HloInstruction* const> operands,
-      const CollectiveDeviceListBase& partition_subgroups, int64_t channel_id,
-      std::optional<int64_t> split_dimension)>
+      const std::vector<std::vector<int64_t>>& partition_subgroups,
+      int64_t channel_id, std::optional<int64_t> split_dimension)>
       create_cross_partition_all_to_all;
 
   // Function used to create a cross-partition all-to-all HLO using device list
@@ -243,8 +244,8 @@ struct SPMDCollectiveOpsCreator {
   // if it is nullptr, the partitioner will use all-reduce instead.
   std::function<HloInstruction*(
       SpmdBuilder*, HloInstruction* operand, const Shape& ag_shape,
-      const CollectiveDeviceListBase& partition_subgroups, int64_t channel_id,
-      int64_t all_gather_dimension)>
+      const std::vector<std::vector<int64_t>>& partition_subgroups,
+      int64_t channel_id, int64_t all_gather_dimension)>
       create_cross_partition_all_gather;
 
   // Function used to create a cross-partition all-gather HLO using device list
diff --git a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
index f9ac602d258afc..c2b418440c1cc0 100644
--- a/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
+++ b/third_party/xla/xla/service/spmd/spmd_partitioner_util.cc
@@ -461,16 +461,14 @@ bool IsIota(const Array<int64_t>& x) {
 
 // Expand the device groups, making each device group follow the format of the
 // partition group.
-CollectiveDeviceList ExpandDeviceGroups(
+std::vector<std::vector<int64_t>> ExpandDeviceGroups(
     const DeviceGroupTileAssignment& device_groups,
-    const CollectiveDeviceListBase& collective_device_list) {
+    const std::vector<std::vector<int64_t>>& partition_subgroups) {
   // Example: Given device groups of {{0,1,2,3},{4,5,6,7}} and partition
   // subgroups of {{0,2}, {1,3}} returns device groups of {{0,2}, {1,3}, {4,6},
   // {5,7}}
-  const std::vector<std::vector<int64_t>>& partition_subgroups =
-      collective_device_list.flattened_replica_groups();
   if (partition_subgroups.empty()) {
-    return CollectiveDeviceList(device_groups.flattened_device_groups());
+    return device_groups.flattened_device_groups();
   }
   std::vector<std::vector<int64_t>> result(partition_subgroups.size() *
                                            device_groups.num_groups());
@@ -484,7 +482,7 @@ CollectiveDeviceList ExpandDeviceGroups(
       }
     }
   }
-  return CollectiveDeviceList(result);
+  return result;
 }
 
 // Expand the device groups, making each device group follow the format of the
@@ -554,7 +552,7 @@ CreateCrossPartitionAllReduce(
     std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
   return [creator, device_groups_ptr](
              SpmdBuilder* b, HloInstruction* operand, HloComputation* reduction,
-             const CollectiveDeviceListBase& partition_subgroups,
+             const std::vector<std::vector<int64_t>>& partition_subgroups,
              int64_t channel_id) {
     return creator.create_cross_partition_all_reduce(
         b, operand, reduction,
@@ -579,7 +577,8 @@ CreateCrossPartitionAllReduceWithIotaDeviceList(
     if (!expanded_iota_partition_group_list.has_value()) {
       return creator.create_cross_partition_all_reduce(
           b, operand, reduction,
-          ExpandDeviceGroups(*device_groups_ptr, partition_group_list),
+          ExpandDeviceGroups(*device_groups_ptr,
+                             partition_group_list.flattened_replica_groups()),
           channel_id);
     }
     return creator.create_cross_partition_all_reduce_with_iota_device_list(
@@ -616,7 +615,7 @@ CreateCrossPartitionAllToAll(
     std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
   return [creator, device_groups_ptr](
              SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
-             const CollectiveDeviceListBase& partition_subgroups,
+             const std::vector<std::vector<int64_t>>& partition_subgroups,
              int64_t channel_id, std::optional<int64_t> split_dimension) {
     return creator.create_cross_partition_all_to_all(
         b, operands,
@@ -641,7 +640,8 @@ CreateCrossPartitionAllToAllWithIotaDeviceList(
     if (!expanded_iota_partition_group_list.has_value()) {
       return creator.create_cross_partition_all_to_all(
           b, operands,
-          ExpandDeviceGroups(*device_groups_ptr, partition_group_list),
+          ExpandDeviceGroups(*device_groups_ptr,
+                             partition_group_list.flattened_replica_groups()),
           channel_id, split_dimension);
     }
     return creator.create_cross_partition_all_to_all_with_iota_device_list(
@@ -656,7 +656,7 @@ CreateCrossPartitionAllGather(
     std::shared_ptr<const DeviceGroupTileAssignment> device_groups_ptr) {
   return [creator, device_groups_ptr](
              SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape,
-             const CollectiveDeviceListBase& partition_subgroups,
+             const std::vector<std::vector<int64_t>>& partition_subgroups,
              int64_t channel_id, int64_t all_gather_dimension) {
     return creator.create_cross_partition_all_gather(
         b, operand, ag_shape,
@@ -682,7 +682,8 @@ CreateCrossPartitionAllGatherWithIotaDeviceList(
     if (!expanded_iota_partition_group_list.has_value()) {
       return creator.create_cross_partition_all_gather(
           b, operand, ag_shape,
-          ExpandDeviceGroups(*device_groups_ptr, partition_group_list),
+          ExpandDeviceGroups(*device_groups_ptr,
+                             partition_group_list.flattened_replica_groups()),
           channel_id, all_gather_dimension);
     }
     return creator.create_cross_partition_all_gather_with_iota_device_list(

From 5e56a937deedc72d229aff85657ee702a61f2d78 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 20:38:31 -0800
Subject: [PATCH 733/753] Automated Code Change

PiperOrigin-RevId: 848393091
---
 tensorflow/cc/experimental/libexport/load.cc      | 5 +++--
 tensorflow/cc/experimental/libexport/save_test.cc | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/cc/experimental/libexport/load.cc b/tensorflow/cc/experimental/libexport/load.cc
index fd7f2d159e6166..670fa4f51f5cc1 100644
--- a/tensorflow/cc/experimental/libexport/load.cc
+++ b/tensorflow/cc/experimental/libexport/load.cc
@@ -31,8 +31,9 @@ using protobuf::RepeatedPtrField;
 absl::StatusOr<TFPackage> TFPackage::Load(const std::string& path) {
   // Load the proto
   TFPackage tf_package;
-  const string saved_model_pb_path = io::JoinPath(path, kSavedModelFilenamePb);
-  const string saved_model_pbtxt_path =
+  const std::string saved_model_pb_path =
+      io::JoinPath(path, kSavedModelFilenamePb);
+  const std::string saved_model_pbtxt_path =
       io::JoinPath(path, kSavedModelFilenamePbTxt);
   if (Env::Default()->FileExists(saved_model_pb_path).ok()) {
     TF_RETURN_IF_ERROR(ReadBinaryProto(Env::Default(), saved_model_pb_path,
diff --git a/tensorflow/cc/experimental/libexport/save_test.cc b/tensorflow/cc/experimental/libexport/save_test.cc
index fbcc3c2e53b426..1a0ba4f0662a92 100644
--- a/tensorflow/cc/experimental/libexport/save_test.cc
+++ b/tensorflow/cc/experimental/libexport/save_test.cc
@@ -25,7 +25,7 @@ namespace libexport {
 namespace {
 
 TEST(SaveTest, TestDirectoryStructure) {
-  const string base_dir = tensorflow::io::JoinPath(
+  const std::string base_dir = tensorflow::io::JoinPath(
       tensorflow::testing::TmpDir(), "test_directory_structure");
   TF_ASSERT_OK(Save(base_dir));
   TF_ASSERT_OK(Env::Default()->IsDirectory(base_dir));

From 4272c47e39277618936790b8948e8eee8c465cab Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 22:28:14 -0800
Subject: [PATCH 734/753] Automated Code Change

PiperOrigin-RevId: 848423026
---
 third_party/xla/xla/backends/interpreter/executor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/xla/xla/backends/interpreter/executor.h b/third_party/xla/xla/backends/interpreter/executor.h
index 0ab41357230986..00952eb5003bc0 100644
--- a/third_party/xla/xla/backends/interpreter/executor.h
+++ b/third_party/xla/xla/backends/interpreter/executor.h
@@ -136,8 +136,8 @@ class XlaInterpreterExecutor : public StreamExecutorCommon {
     return std::make_unique<InterpreterStream>(this);
   }
   absl::StatusOr<std::unique_ptr<MemoryAllocator>> CreateMemoryAllocator(
-      MemoryType type) override {
-    if (type == MemoryType::kHost) {
+      MemorySpace type) override {
+    if (type == MemorySpace::kHost) {
       return std::make_unique<GenericMemoryAllocator>(
           [](uint64_t size)
               -> absl::StatusOr<std::unique_ptr<MemoryAllocation>> {

From 2b19036a2cca499817d687f803c3e1b6d26f8f59 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 22:52:36 -0800
Subject: [PATCH 735/753] Fix test when it launched on the machine with 8
 devices.

PiperOrigin-RevId: 848429925
---
 .../xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc b/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
index 6d9d181703312f..a89c134efd8829 100644
--- a/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
+++ b/third_party/xla/xla/stream_executor/cuda/cuda_executor_multigpu_test.cc
@@ -93,7 +93,7 @@ TEST(CudaExecutorMultiGpuTest, PeerAccess) {
   EXPECT_TRUE(executors[0]->CanEnablePeerAccessTo(1));
   EXPECT_TRUE(executors[1]->CanEnablePeerAccessTo(0));
   EXPECT_TRUE(executors[1]->CanEnablePeerAccessTo(1));
-  EXPECT_FALSE(executors[0]->CanEnablePeerAccessTo(3));
+  EXPECT_FALSE(executors[0]->CanEnablePeerAccessTo(100));
 }
 
 TEST(CudaExecutorMultiGpuTest, CudaMulticastMemoryResubscriptionFails) {

From 354860ee1c125d2b34457200ffbbd1e0b0aa9a07 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 23:07:51 -0800
Subject: [PATCH 736/753] Automated Code Change

PiperOrigin-RevId: 848434764
---
 third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
index 173e1017162708..ceb213b9e29eae 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "absl/memory/memory.h"
 #include "absl/status/status_matchers.h"

From 4a2a5aed202f85210175c9d73bdf145040b96f96 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 23 Dec 2025 23:33:34 -0800
Subject: [PATCH 737/753] Automated Code Change

PiperOrigin-RevId: 848441651
---
 tensorflow/tools/mlpbtxt/frommlpbtxt.cc | 10 +++++-----
 tensorflow/tools/mlpbtxt/tomlpbtxt.cc   | 14 +++++++-------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/tensorflow/tools/mlpbtxt/frommlpbtxt.cc b/tensorflow/tools/mlpbtxt/frommlpbtxt.cc
index dec8b6b542a8d0..2817d919dbc915 100644
--- a/tensorflow/tools/mlpbtxt/frommlpbtxt.cc
+++ b/tensorflow/tools/mlpbtxt/frommlpbtxt.cc
@@ -29,15 +29,15 @@ namespace tensorflow {
 namespace {
 
 int Run(int argc, char** argv) {
-  string FLAGS_in = "";
-  string FLAGS_out = "";
+  std::string FLAGS_in = "";
+  std::string FLAGS_out = "";
 
   std::vector<Flag> flag_list = {
       Flag("in", &FLAGS_in, "Input multi-line proto text (.mlpbtxt) file name"),
       Flag("out", &FLAGS_out, "Output proto text (.pbtxt) file name")};
 
   // Parse the command-line.
-  const string usage = Flags::Usage(argv[0], flag_list);
+  const std::string usage = Flags::Usage(argv[0], flag_list);
   const bool parse_ok = Flags::Parse(&argc, argv, flag_list);
   if (argc != 1 || !parse_ok) {
     printf("%s", usage.c_str());
@@ -47,7 +47,7 @@ int Run(int argc, char** argv) {
   port::InitMain(argv[0], &argc, &argv);
 
   // Read the input file --in.
-  string in_contents;
+  std::string in_contents;
   absl::Status s = ReadFileToString(Env::Default(), FLAGS_in, &in_contents);
   if (!s.ok()) {
     printf("Error reading file %s: %s\n", FLAGS_in.c_str(),
@@ -56,7 +56,7 @@ int Run(int argc, char** argv) {
   }
 
   // Write the output file --out.
-  const string out_contents = PBTxtFromMultiline(in_contents);
+  const std::string out_contents = PBTxtFromMultiline(in_contents);
   s = WriteStringToFile(Env::Default(), FLAGS_out, out_contents);
   if (!s.ok()) {
     printf("Error writing file %s: %s\n", FLAGS_out.c_str(),
diff --git a/tensorflow/tools/mlpbtxt/tomlpbtxt.cc b/tensorflow/tools/mlpbtxt/tomlpbtxt.cc
index 552d4075619cd3..8c69f5047bb384 100644
--- a/tensorflow/tools/mlpbtxt/tomlpbtxt.cc
+++ b/tensorflow/tools/mlpbtxt/tomlpbtxt.cc
@@ -30,9 +30,9 @@ namespace tensorflow {
 namespace {
 
 int Run(int argc, char** argv) {
-  string FLAGS_in = "";
-  string FLAGS_out = "";
-  string FLAGS_fields = "description";
+  std::string FLAGS_in = "";
+  std::string FLAGS_out = "";
+  std::string FLAGS_fields = "description";
 
   std::vector<Flag> flag_list = {
       Flag("in", &FLAGS_in, "Input proto text (.pbtxt) file name"),
@@ -41,7 +41,7 @@ int Run(int argc, char** argv) {
       Flag("fields", &FLAGS_fields, "Comma-separated list of field names")};
 
   // Parse the command-line.
-  const string usage = Flags::Usage(argv[0], flag_list);
+  const std::string usage = Flags::Usage(argv[0], flag_list);
   const bool parse_ok = Flags::Parse(&argc, argv, flag_list);
   if (argc != 1 || !parse_ok) {
     printf("%s", usage.c_str());
@@ -49,7 +49,7 @@ int Run(int argc, char** argv) {
   }
 
   // Parse the --fields option.
-  std::vector<string> fields =
+  std::vector<std::string> fields =
       str_util::Split(FLAGS_fields, ',', str_util::SkipEmpty());
   if (fields.empty()) {
     printf("--fields must be non-empty.\n%s", usage.c_str());
@@ -59,7 +59,7 @@ int Run(int argc, char** argv) {
   port::InitMain(argv[0], &argc, &argv);
 
   // Read the input file --in.
-  string in_contents;
+  std::string in_contents;
   absl::Status s = ReadFileToString(Env::Default(), FLAGS_in, &in_contents);
   if (!s.ok()) {
     printf("Error reading file %s: %s\n", FLAGS_in.c_str(),
@@ -68,7 +68,7 @@ int Run(int argc, char** argv) {
   }
 
   // Write the output file --out.
-  const string out_contents = PBTxtToMultiline(in_contents, fields);
+  const std::string out_contents = PBTxtToMultiline(in_contents, fields);
   s = WriteStringToFile(Env::Default(), FLAGS_out, out_contents);
   if (!s.ok()) {
     printf("Error writing file %s: %s\n", FLAGS_out.c_str(),

From 0a8d3e310bb9b8611469f77f5181f62a91652508 Mon Sep 17 00:00:00 2001
From: Henning Becker <hebecker@google.com>
Date: Tue, 23 Dec 2025 23:36:04 -0800
Subject: [PATCH 738/753] Refactor xtile_compiler build targets to separate
 implementation and stub.

The `xtile_compiler` target now acts as a selector, depending on either `xtile_compiler_impl` or `xtile_compiler_stub` based on whether CUDA or ROCm is configured. The full implementation is moved to the new `xtile_compiler_impl` target, while `xtile_compiler_stub` provides a minimal version for other configurations.

This has the advantage that build_cleaner can run on xtile_compiler_impl. (Doing that removed around 20 dependencies)

PiperOrigin-RevId: 848442213
---
 .../xla/xla/backends/gpu/codegen/triton/BUILD | 180 +++++++++++-------
 1 file changed, 109 insertions(+), 71 deletions(-)

diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index ea9866b1695c74..0ccc0e71026012 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -313,60 +313,70 @@ cc_library(
 )
 
 cc_library(
-    name = "xtile_compiler",
-    # Using if_cuda_or_rocm_is_configured guard to prevent sycl target build / link errors.
-    srcs = if_cuda_or_rocm_is_configured(
-        ["xtile_compiler.cc"],
-        ["xtile_compiler_stub.cc"],
-    ),
-    hdrs = ["xtile_compiler.h"],
-    compatible_with = get_compatible_with_portable(),
+    name = "xtile_compiler_impl",
+    srcs =
+        [
+            "xtile_compiler.cc",
+            "xtile_compiler.h",
+        ],
+    tags = [
+        "gpu",
+        "manual",
+        "no-oneapi",
+    ],
+    visibility = ["//visibility:private"],
     deps = [
+        ":collective_emitter",
+        ":compilation_pipeline",
+        ":fusion_emitter",
+        ":lowering_util",
+        ":support",
         "//xla:autotuning_proto_cc",
+        "//xla:status_macros",
+        "//xla:util",
+        "//xla:xla_data_proto_cc",
+        "//xla:xla_proto_cc",
+        "//xla/backends/gpu/codegen/emitters/ir:xla_gpu",
         "//xla/backends/gpu/codegen/triton/ir:triton_xla",
+        "//xla/backends/gpu/codegen/triton/transforms:passes",
+        "//xla/codegen/emitters/ir:xla",
+        "//xla/codegen/emitters/transforms:passes",
         "//xla/codegen/tiling:symbolic_tile_analysis",
-        "//xla/codegen/tiling:tiled_hlo_computation",
-        "//xla/codegen/tiling:tiled_hlo_fusion_instruction",
-        "//xla/codegen/tiling:tiled_hlo_instruction",
-        "//xla/codegen/tiling:tiled_hlo_schedule",
         "//xla/codegen/tiling:tiling_specification",
         "//xla/codegen/xtile/ir:xtile",
         "//xla/codegen/xtile/ir/transforms:passes",
-        "//xla/hlo/analysis:symbolic_expr",
+        "//xla/hlo/builder:xla_builder",
         "//xla/hlo/ir:hlo",
+        "//xla/hlo/translate/hlo_to_mhlo:hlo_function_importer",
+        "//xla/service:dump",
         "//xla/service:hlo_module_config",
+        "//xla/service/gpu:backend_configs_cc",
+        "//xla/service/gpu:ir_emission_utils",
+        "//xla/service/gpu/llvm_gpu_backend:nvptx_libdevice_path",
         "//xla/service/gpu/model:block_level_parameters",
+        "//xla/service/gpu/model:triton_emitter_constraints",
+        "//xla/service/llvm_ir:llvm_util",
         "//xla/stream_executor:device_description",
         "//xla/stream_executor:launch_dim",
+        "//xla/stream_executor/cuda:cuda_compute_capability",
         "//xla/stream_executor/gpu:tma_metadata",
+        "//xla/tools:hlo_decomposer_lib",
         "//xla/tsl/framework/mlir:status_scoped_diagnostic_handler",
+        "//xla/tsl/platform:errors",
+        "//xla/tsl/platform:statusor",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/log",
+        "@com_google_absl//absl/log:check",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/strings:string_view",
         "@com_google_absl//absl/types:span",
+        "@llvm-project//llvm:Linker",
         "@llvm-project//llvm:Support",
         "@llvm-project//llvm:TargetParser",
         "@llvm-project//llvm:ir_headers",
-        "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:Pass",
-        "@stablehlo//:stablehlo_ops",
-        "@triton//:TritonDialects",
-    ] + if_cuda_or_rocm_is_configured([
-        ":fusion_emitter",
-        ":lowering_util",
-        ":compilation_pipeline",
-        ":collective_emitter",
-        ":dot_algorithms",
-        ":emitter_helpers",
-        ":support",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/log",
-        "@com_google_absl//absl/log:check",
-        "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/strings:str_format",
-        "@llvm-project//llvm:Linker",
         "@llvm-project//mlir:AffineDialect",
         "@llvm-project//mlir:AffineToStandard",
         "@llvm-project//mlir:ArithDialect",
@@ -376,59 +386,87 @@ cc_library(
         "@llvm-project//mlir:ExecutionEngineUtils",
         "@llvm-project//mlir:FuncDialect",
         "@llvm-project//mlir:FuncExtensions",
-        "@llvm-project//mlir:FunctionInterfaces",
+        "@llvm-project//mlir:IR",
         "@llvm-project//mlir:IndexToLLVM",
         "@llvm-project//mlir:LLVMDialect",
         "@llvm-project//mlir:LLVMIRTransforms",
         "@llvm-project//mlir:LLVMToLLVMIRTranslation",
         "@llvm-project//mlir:NVVMDialect",
         "@llvm-project//mlir:NVVMToLLVMIRTranslation",
+        "@llvm-project//mlir:Pass",
         "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
-        "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:SCFToControlFlow",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
         "@llvm-project//mlir:ToLLVMIRTranslation",
         "@llvm-project//mlir:Transforms",
-        "//xla:permutation_util",
-        "//xla:shape_util",
-        "//xla:status_macros",
-        "//xla:util",
-        "//xla:xla_data_proto_cc",
-        "//xla:xla_proto_cc",
-        "//xla/backends/gpu/codegen/emitters/ir:xla_gpu",
-        "//xla/backends/gpu/codegen/emitters/transforms:passes",
-        "//xla/backends/gpu/codegen/triton/transforms:passes",
-        "//xla/codegen/emitters:elemental_hlo_to_mlir",
-        "//xla/codegen/emitters/ir:xla",
-        "//xla/codegen/emitters/transforms:passes",
-        "//xla/hlo/analysis:indexing_analysis",
-        "//xla/hlo/builder:xla_builder",
-        "//xla/hlo/translate/hlo_to_mhlo:hlo_function_importer",
-        "//xla/hlo/utils:hlo_traversal",
-        "//xla/mlir_hlo",
-        "//xla/service:dump",
-        "//xla/service:instruction_fusion",
-        "//xla/service/gpu:backend_configs_cc",
-        "//xla/service/gpu:ir_emission_utils",
-        "//xla/service/gpu:launch_dimensions",
-        "//xla/service/gpu/llvm_gpu_backend:nvptx_libdevice_path",
-        "//xla/service/gpu:matmul_utils",
-        "//xla/service/gpu:triton_fusion_analysis",
-        "//xla/service/gpu/model:triton_emitter_constraints",
-        "//xla/service/llvm_ir:llvm_util",
-        "//xla/stream_executor/cuda:cuda_compute_capability",
-        "//xla/tools:hlo_decomposer_lib",
-        "//xla/tsl/platform:errors",
-        "//xla/tsl/platform:rocm_rocdl_path",
-        "//xla/tsl/platform:statusor",
-        "@local_tsl//tsl/platform:errors",
         "@local_tsl//tsl/platform:path",
-        "@local_tsl//tsl/platform:statusor",
+        "@stablehlo//:stablehlo_ops",
+        "@triton//:TritonDialects",
         "@triton//:TritonTransforms",
-    ]) + if_cuda_is_configured([
+    ],
+)
+
+cc_library(
+    name = "xtile_compiler_stub",
+    srcs = [
+        "xtile_compiler.h",
+        "xtile_compiler_stub.cc",
+    ],
+    compatible_with = get_compatible_with_portable(),
+    visibility = ["//visibility:private"],
+    deps = [
+        "//xla:autotuning_proto_cc_impl",
+        "//xla/codegen/tiling:symbolic_tile_analysis",
+        "//xla/codegen/tiling:tiling_specification",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_module_config",
+        "//xla/service/gpu/model:block_level_parameters",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:launch_dim",
+        "//xla/stream_executor/gpu:tma_metadata",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:TargetParser",
+        "@llvm-project//llvm:ir_headers",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+        "@triton//:TritonDialects",
+    ],
+)
+
+cc_library(
+    name = "xtile_compiler",
+    hdrs = [
+        "xtile_compiler.h",
+    ],
+    compatible_with = get_compatible_with_portable(),
+    deps = if_cuda_or_rocm_is_configured(
+        [":xtile_compiler_impl"],
+        [":xtile_compiler_stub"],
+    ) + if_cuda_is_configured([
         "//xla/service/gpu/llvm_gpu_backend:nvptx_backend",
-    ]),
+    ]) + [
+        "//xla:autotuning_proto_cc",
+        "//xla/codegen/tiling:symbolic_tile_analysis",
+        "//xla/codegen/tiling:tiling_specification",
+        "//xla/hlo/ir:hlo",
+        "//xla/service:hlo_module_config",
+        "//xla/service/gpu/model:block_level_parameters",
+        "//xla/stream_executor:device_description",
+        "//xla/stream_executor:launch_dim",
+        "//xla/stream_executor/gpu:tma_metadata",
+        "@com_google_absl//absl/status",
+        "@com_google_absl//absl/status:statusor",
+        "@com_google_absl//absl/strings:string_view",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//llvm:TargetParser",
+        "@llvm-project//llvm:ir_headers",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Pass",
+    ],
 )
 
 cc_library(

From 1a037d6b2bb77a3998348eba48d80c5b422651b2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Dec 2025 00:18:53 -0800
Subject: [PATCH 739/753] Automated Code Change

PiperOrigin-RevId: 848455572
---
 .../core/kernels/image/non_max_suppression_op_gpu_test.cc    | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc b/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc
index bcdc406d85201a..151a956ca22fd4 100644
--- a/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc
+++ b/tensorflow/core/kernels/image/non_max_suppression_op_gpu_test.cc
@@ -195,8 +195,7 @@ TEST_F(NonMaxSuppressionV2GPUOpTest, TestInconsistentBoxAndScoreShapes) {
   Status s = RunOpKernel();
 
   ASSERT_FALSE(s.ok());
-  EXPECT_TRUE(
-      str_util::StrContains(s.ToString(), "scores has incompatible shape"))
+  EXPECT_TRUE(absl::StrContains(s.ToString(), "scores has incompatible shape"))
       << s;
 }
 
@@ -210,7 +209,7 @@ TEST_F(NonMaxSuppressionV2GPUOpTest, TestInvalidIOUThreshold) {
 
   ASSERT_FALSE(s.ok());
   EXPECT_TRUE(
-      str_util::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
+      absl::StrContains(s.ToString(), "iou_threshold must be in [0, 1]"))
       << s;
 }
 

From c8d1b4edb55bd4cf980efb22ba2f126cbaffa3cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Dec 2025 01:03:34 -0800
Subject: [PATCH 740/753] Update GraphDef version to 2451.

PiperOrigin-RevId: 848467225
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 492e874659b100..964b014a3aa3f7 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -93,7 +93,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 2450  // Updated: 2025/12/23
+#define TF_GRAPH_DEF_VERSION 2451  // Updated: 2025/12/24
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //

From 6b075d4aca1477826b7ed7baa51e8bede609691e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Dec 2025 01:03:44 -0800
Subject: [PATCH 741/753] compat: Update forward compatibility horizon to
 2025-12-24

PiperOrigin-RevId: 848467272
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 62b652fca06d5e..79c85e589caa13 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 23)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2025, 12, 24)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 

From f77105b23dedcf214f502ec56fa59cafe71104c3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Dec 2025 01:30:30 -0800
Subject: [PATCH 742/753] Automated Code Change

PiperOrigin-RevId: 848475361
---
 .../tools/graph_transforms/quantize_nodes.cc  | 144 +++++++++---------
 .../graph_transforms/remove_attribute.cc      |   4 +-
 2 files changed, 77 insertions(+), 71 deletions(-)

diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index 269137c997d447..3d8eabc8361f6b 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -32,17 +32,17 @@ namespace graph_transforms {
 // into the quantized equivalent.
 struct QuantizedOpInfo {
   // The name of the float op.
-  string float_name;
+  std::string float_name;
   // Which attributes to copy directly over.
-  std::vector<string> attrs_to_copy;
+  std::vector<std::string> attrs_to_copy;
   // Extra data type attributes we need to set.
-  std::vector<std::pair<string, DataType>> dtypes_to_set;
+  std::vector<std::pair<std::string, DataType>> dtypes_to_set;
   // What depth of inputs the op can read in.
   DataType input_bit_depth;
   // The depth of the op's quantized outputs.
   DataType output_bit_depth;
   // Which inputs (e.g. shapes) aren't involved in the quantization process.
-  std::set<int32> unquantized_inputs;
+  std::set<int32_t> unquantized_inputs;
   // How the outputs are arranged, either
   // [input0, input1, min0, max0, min1, max1] for contiguous, or
   // [input0, input1, min0, min1, max0, max1] for separate.
@@ -145,12 +145,12 @@ const std::vector<QuantizedOpInfo>& GetQuantizedOpList() {
 
 namespace {
 // Replaces invalid characters in input names to get a unique node name.
-string UniqueNodeNameFromInput(const string& input_name) {
-  string prefix;
-  string node_name;
-  string suffix;
+std::string UniqueNodeNameFromInput(const std::string& input_name) {
+  std::string prefix;
+  std::string node_name;
+  std::string suffix;
   NodeNamePartsFromInput(input_name, &prefix, &node_name, &suffix);
-  string result;
+  std::string result;
   if (prefix == "^") {
     result += "__hat__";
   }
@@ -163,9 +163,10 @@ string UniqueNodeNameFromInput(const string& input_name) {
 
 // Pulls two float values from the named parameters, with a lot of checking.
 absl::Status ExtractRangeFromParams(const TransformFuncContext& context,
-                                    const string& min_name,
-                                    const string& max_name, float* min_value,
-                                    float* max_value, bool* has_range) {
+                                    const std::string& min_name,
+                                    const std::string& max_name,
+                                    float* min_value, float* max_value,
+                                    bool* has_range) {
   // See if we've been given quantized inputs with a known range.
   const bool has_min = (context.params.count(min_name) != 0);
   const bool has_max = (context.params.count(max_name) != 0);
@@ -193,17 +194,17 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def,
                                  const TransformFuncContext& context,
                                  GraphDef* output_graph_def) {
   // Make sure we can look up inputs and outputs quickly.
-  std::set<string> input_names(context.input_names.begin(),
-                               context.input_names.end());
-  std::set<string> output_names(context.output_names.begin(),
-                                context.output_names.end());
+  std::set<std::string> input_names(context.input_names.begin(),
+                                    context.input_names.end());
+  std::set<std::string> output_names(context.output_names.begin(),
+                                     context.output_names.end());
   GraphDef current_graph_def = input_graph_def;
   // Keep running the merging until no more duplicates are found.
   bool any_duplicates_found;
   do {
     any_duplicates_found = false;
     // First arrange all of the nodes by a hash of their contents.
-    std::map<uint64, std::vector<const NodeDef*>> hashed_nodes;
+    std::map<uint64_t, std::vector<const NodeDef*>> hashed_nodes;
     for (const NodeDef& node : current_graph_def.node()) {
       NodeDef nameless_node = node;
       // The name matters if it's being used as an input or output node,
@@ -211,14 +212,14 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def,
       if (!input_names.count(node.name()) && !output_names.count(node.name())) {
         nameless_node.set_name("");
       }
-      const uint64 hash = HashNodeDef(nameless_node);
+      const uint64_t hash = HashNodeDef(nameless_node);
       hashed_nodes[hash].push_back(&node);
     }
     // If we have multiple nodes with the same hash, then we know they're
     // duplicates and can be removed, unless they're stateful.
-    std::map<string, string> inputs_to_rename;
+    std::map<std::string, std::string> inputs_to_rename;
     GraphDef merged_graph_def;
-    for (const std::pair<const uint64, std::vector<const NodeDef*>>&
+    for (const std::pair<const uint64_t, std::vector<const NodeDef*>>&
              hashed_node_info : hashed_nodes) {
       const std::vector<const NodeDef*>& hash_node_list =
           hashed_node_info.second;
@@ -229,7 +230,7 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def,
             OpRegistry::Global()->LookUpOpDef(current_node->op(), &op_def));
         const bool is_duplicate = ((!op_def->is_stateful()) && (i > 0));
         if (is_duplicate) {
-          const string original_name = hash_node_list[0]->name();
+          const std::string original_name = hash_node_list[0]->name();
           inputs_to_rename[current_node->name() + ":*"] = original_name;
           any_duplicates_found = true;
         } else {
@@ -241,7 +242,7 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def,
     // Update the graph so that any nodes that referred to removed inputs now
     // pull from the remaining duplicate.
     TF_RETURN_IF_ERROR(RenameNodeInputs(merged_graph_def, inputs_to_rename,
-                                        std::unordered_set<string>(),
+                                        std::unordered_set<std::string>(),
                                         &current_graph_def));
   } while (any_duplicates_found);
 
@@ -261,11 +262,11 @@ absl::Status MergeDuplicateNodes(const GraphDef& input_graph_def,
 absl::Status RemoveRedundantQuantizations(const GraphDef& input_graph_def,
                                           const TransformFuncContext& context,
                                           GraphDef* output_graph_def) {
-  std::set<string> graph_outputs;
-  for (const string& output_name : context.output_names) {
+  std::set<std::string> graph_outputs;
+  for (const std::string& output_name : context.output_names) {
     graph_outputs.insert(NodeNameFromInput(output_name));
   }
-  std::map<string, string> inputs_to_rename;
+  std::map<std::string, std::string> inputs_to_rename;
   GraphDef replaced_graph_def;
   TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
       input_graph_def,  // clang-format off
@@ -276,10 +277,10 @@ absl::Status RemoveRedundantQuantizations(const GraphDef& input_graph_def,
           {"Max"},
         }
       },  // clang-format on
-      [&inputs_to_rename, &graph_outputs](const NodeMatch& match,
-                                          const std::set<string>& input_nodes,
-                                          const std::set<string>& output_nodes,
-                                          std::vector<NodeDef>* new_nodes) {
+      [&inputs_to_rename, &graph_outputs](
+          const NodeMatch& match, const std::set<std::string>& input_nodes,
+          const std::set<std::string>& output_nodes,
+          std::vector<NodeDef>* new_nodes) {
         const NodeDef& quantize_node = match.node;
         const NodeDef& dequantize_node = match.inputs[0].node;
         inputs_to_rename[quantize_node.name() + ":0"] =
@@ -302,7 +303,7 @@ absl::Status RemoveRedundantQuantizations(const GraphDef& input_graph_def,
       {true}, &replaced_graph_def));
 
   return RenameNodeInputs(replaced_graph_def, inputs_to_rename,
-                          std::unordered_set<string>(), output_graph_def);
+                          std::unordered_set<std::string>(), output_graph_def);
 }
 
 // If the user has passed in the input_min and input_max args, then we need to
@@ -321,15 +322,15 @@ absl::Status QuantizePlaceholders(const GraphDef& input_graph_def,
     *output_graph_def = input_graph_def;
     return absl::OkStatus();
   }
-  std::map<string, string> inputs_to_rename_first_pass;
-  std::map<string, string> inputs_to_rename_second_pass;
+  std::map<std::string, std::string> inputs_to_rename_first_pass;
+  std::map<std::string, std::string> inputs_to_rename_second_pass;
   GraphDef placeholder_graph_def;
   placeholder_graph_def.Clear();
   for (const NodeDef& node : input_graph_def.node()) {
     if (node.op() != "Placeholder") {
       *(placeholder_graph_def.mutable_node()->Add()) = node;
     } else {
-      string namespace_prefix = node.name() + "_eightbit";
+      std::string namespace_prefix = node.name() + "_eightbit";
 
       NodeDef quantized_placeholder;
       quantized_placeholder = node;
@@ -354,7 +355,7 @@ absl::Status QuantizePlaceholders(const GraphDef& input_graph_def,
       SetNodeTensorAttr<float>("value", max_tensor, &max_node);
       *(placeholder_graph_def.mutable_node()->Add()) = max_node;
 
-      const string rename_suffix = "__RENAMED_PLACEHOLDER__";
+      const std::string rename_suffix = "__RENAMED_PLACEHOLDER__";
       NodeDef dequantize_node;
       dequantize_node.set_op("Dequantize");
       dequantize_node.set_name(namespace_prefix + "/dequantize");
@@ -375,12 +376,12 @@ absl::Status QuantizePlaceholders(const GraphDef& input_graph_def,
   }
 
   GraphDef first_pass_graph_def;
-  TF_RETURN_IF_ERROR(
-      RenameNodeInputs(placeholder_graph_def, inputs_to_rename_first_pass,
-                       std::unordered_set<string>(), &first_pass_graph_def));
+  TF_RETURN_IF_ERROR(RenameNodeInputs(
+      placeholder_graph_def, inputs_to_rename_first_pass,
+      std::unordered_set<std::string>(), &first_pass_graph_def));
   TF_RETURN_IF_ERROR(
       RenameNodeInputs(first_pass_graph_def, inputs_to_rename_second_pass,
-                       std::unordered_set<string>(), output_graph_def));
+                       std::unordered_set<std::string>(), output_graph_def));
 
   return absl::OkStatus();
 }
@@ -400,15 +401,15 @@ absl::Status ConvertFakeQuantsToRequantize(const GraphDef& input_graph_def,
           {"Const"},
         }
       },  // clang-format on
-      [](const NodeMatch& match, const std::set<string>& input_nodes,
-         const std::set<string>& output_nodes,
+      [](const NodeMatch& match, const std::set<std::string>& input_nodes,
+         const std::set<std::string>& output_nodes,
          std::vector<NodeDef>* new_nodes) {
         const NodeDef& fake_quant_node = match.node;
         const NodeDef& original_op_node = match.inputs[0].node;
         const NodeDef& fake_quant_min_node = match.inputs[1].node;
         const NodeDef& fake_quant_max_node = match.inputs[2].node;
 
-        string namespace_prefix = fake_quant_node.name() + "_eightbit";
+        std::string namespace_prefix = fake_quant_node.name() + "_eightbit";
 
         new_nodes->push_back(original_op_node);
         new_nodes->push_back(fake_quant_min_node);
@@ -494,8 +495,8 @@ absl::Status MergeAdjacentRequantizes(const GraphDef& input_graph_def,
           {"Const"},
         }
       },  // clang-format on
-      [](const NodeMatch& match, const std::set<string>& input_nodes,
-         const std::set<string>& output_nodes,
+      [](const NodeMatch& match, const std::set<std::string>& input_nodes,
+         const std::set<std::string>& output_nodes,
          std::vector<NodeDef>* new_nodes) {
         const NodeDef& fake_requantize_node = match.node;
         const NodeDef& original_op_node =
@@ -544,8 +545,9 @@ absl::Status HoistFakeQuants(const GraphDef& input_graph_def,
     GraphDef hoisted_graph_def;
     TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
         current_graph_def, pattern,
-        [depth](const NodeMatch& match, const std::set<string>& input_nodes,
-                const std::set<string>& output_nodes,
+        [depth](const NodeMatch& match,
+                const std::set<std::string>& input_nodes,
+                const std::set<std::string>& output_nodes,
                 std::vector<NodeDef>* new_nodes) {
           const NodeDef& fake_quant_node = match.node;
           const NodeDef& fake_quant_min_node = match.inputs[1].node;
@@ -633,17 +635,17 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
   // between adjacent quantized ops, but a later pass removes these where it
   // can.
 
-  std::set<string> ops_to_ignore;
+  std::set<std::string> ops_to_ignore;
   if (context.params.count("ignore_op") > 0) {
-    for (const string& name : context.params.at("ignore_op")) {
+    for (const std::string& name : context.params.at("ignore_op")) {
       ops_to_ignore.insert(name);
     }
   }
 
   const std::vector<QuantizedOpInfo>& op_list = GetQuantizedOpList();
-  string op_pattern;
+  std::string op_pattern;
   bool is_first = true;
-  std::map<string, QuantizedOpInfo> op_map;
+  std::map<std::string, QuantizedOpInfo> op_map;
   for (const QuantizedOpInfo& op_info : op_list) {
     if (ops_to_ignore.count(op_info.float_name) == 0) {
       absl::StrAppend(&op_pattern, is_first ? "" : "|", op_info.float_name);
@@ -692,8 +694,8 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
   TF_RETURN_IF_ERROR(ReplaceMatchingOpTypes(
       converted_graph_def, {op_pattern},
       [&op_map, fallback_min, fallback_max, has_fallback_range](
-          const NodeMatch& match, const std::set<string>& input_nodes,
-          const std::set<string>& output_nodes,
+          const NodeMatch& match, const std::set<std::string>& input_nodes,
+          const std::set<std::string>& output_nodes,
           std::vector<NodeDef>* new_nodes) {
         const NodeDef& float_node = match.node;
         const QuantizedOpInfo& op_info = op_map[float_node.op()];
@@ -728,18 +730,18 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
           return absl::OkStatus();
         }
 
-        string namespace_prefix = float_node.name() + "_eightbit";
+        std::string namespace_prefix = float_node.name() + "_eightbit";
 
         // Quantize all of the inputs.
-        std::vector<string> quantized_input_names;
+        std::vector<std::string> quantized_input_names;
         for (int i = 0; i < float_node.input_size(); ++i) {
           // Skip any non-float inputs.
           if (op_info.unquantized_inputs.count(i)) {
             continue;
           }
 
-          const string& input_name = float_node.input(i);
-          string unique_input_name =
+          const std::string& input_name = float_node.input(i);
+          std::string unique_input_name =
               namespace_prefix + "/" + UniqueNodeNameFromInput(input_name);
 
           // Add some common constants we need for reshaping inputs.
@@ -749,8 +751,9 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
           AddNodeInput("^" + NodeNameFromInput(input_name), &reshape_dims);
           SetNodeAttr("dtype", DT_INT32, &reshape_dims);
           Tensor reshape_dims_tensor(DT_INT32, {1});
-          reshape_dims_tensor.flat<int32>()(0) = -1;
-          SetNodeTensorAttr<int32>("value", reshape_dims_tensor, &reshape_dims);
+          reshape_dims_tensor.flat<int32_t>()(0) = -1;
+          SetNodeTensorAttr<int32_t>("value", reshape_dims_tensor,
+                                     &reshape_dims);
           new_nodes->push_back(reshape_dims);
 
           NodeDef reduction_dims;
@@ -759,9 +762,9 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
           AddNodeInput("^" + NodeNameFromInput(input_name), &reduction_dims);
           SetNodeAttr("dtype", DT_INT32, &reduction_dims);
           Tensor reduction_dims_tensor(DT_INT32, {1});
-          reduction_dims_tensor.flat<int32>()(0) = 0;
-          SetNodeTensorAttr<int32>("value", reduction_dims_tensor,
-                                   &reduction_dims);
+          reduction_dims_tensor.flat<int32_t>()(0) = 0;
+          SetNodeTensorAttr<int32_t>("value", reduction_dims_tensor,
+                                     &reduction_dims);
           new_nodes->push_back(reduction_dims);
 
           NodeDef reshape_node;
@@ -806,11 +809,11 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
         NodeDef quantized_main_node;
         quantized_main_node.set_op("Quantized" + float_node.op());
         quantized_main_node.set_name(float_node.name() + "/eightbit");
-        for (const string& attr_to_copy : op_info.attrs_to_copy) {
+        for (const std::string& attr_to_copy : op_info.attrs_to_copy) {
           CopyNodeAttr(float_node, attr_to_copy, attr_to_copy,
                        &quantized_main_node);
         }
-        for (const std::pair<string, DataType>& dtype_to_set :
+        for (const std::pair<std::string, DataType>& dtype_to_set :
              op_info.dtypes_to_set) {
           SetNodeAttr(dtype_to_set.first, dtype_to_set.second,
                       &quantized_main_node);
@@ -820,32 +823,35 @@ absl::Status QuantizeNodes(const GraphDef& input_graph_def,
           if (op_info.unquantized_inputs.count(i)) {
             AddNodeInput(float_node.input(i), &quantized_main_node);
           } else {
-            const string& quantized_input_name =
+            const std::string& quantized_input_name =
                 quantized_input_names[quantized_input_index];
             AddNodeInput(quantized_input_name + ":0", &quantized_main_node);
             ++quantized_input_index;
           }
         }
         if (op_info.min_max_order == QuantizedOpInfo::CONTIGUOUS_MIN_MAX) {
-          for (const string& quantized_input_name : quantized_input_names) {
+          for (const std::string& quantized_input_name :
+               quantized_input_names) {
             AddNodeInput(quantized_input_name + ":1", &quantized_main_node);
             AddNodeInput(quantized_input_name + ":2", &quantized_main_node);
           }
         } else {
-          for (const string& quantized_input_name : quantized_input_names) {
+          for (const std::string& quantized_input_name :
+               quantized_input_names) {
             AddNodeInput(quantized_input_name + ":1", &quantized_main_node);
           }
-          for (const string& quantized_input_name : quantized_input_names) {
+          for (const std::string& quantized_input_name :
+               quantized_input_names) {
             AddNodeInput(quantized_input_name + ":2", &quantized_main_node);
           }
         }
         new_nodes->push_back(quantized_main_node);
 
-        string eight_bit_node_name;
+        std::string eight_bit_node_name;
         if (op_info.output_bit_depth == DT_QINT32) {
           // Shrink the range of the output down from 32 bits to 8.
-          string requantize_min_input;
-          string requantize_max_input;
+          std::string requantize_min_input;
+          std::string requantize_max_input;
           if (has_fallback_range) {
             // Use constant values for the min/max range if they were given.
             NodeDef fallback_min_node;
diff --git a/tensorflow/tools/graph_transforms/remove_attribute.cc b/tensorflow/tools/graph_transforms/remove_attribute.cc
index 6fca08585fb271..128672734f7c0b 100644
--- a/tensorflow/tools/graph_transforms/remove_attribute.cc
+++ b/tensorflow/tools/graph_transforms/remove_attribute.cc
@@ -36,7 +36,7 @@ absl::Status RemoveAttribute(const GraphDef& input_graph_def,
         "argument, e.g. remove_attribute(op_name=Mul, attribute_name=foo)");
   }
 
-  string op_name;
+  std::string op_name;
   if (context.params.count("op_name")) {
     if (context.params.at("op_name").size() != 1) {
       return errors::InvalidArgument(
@@ -48,7 +48,7 @@ absl::Status RemoveAttribute(const GraphDef& input_graph_def,
     op_name = "*";
   }
 
-  const string attribute_name = context.params.at("attribute_name")[0];
+  const std::string attribute_name = context.params.at("attribute_name")[0];
   output_graph_def->Clear();
   for (const NodeDef& node : input_graph_def.node()) {
     NodeDef* new_node = output_graph_def->mutable_node()->Add();

From 8e84202c4e53eb575077d5726c6c42c099999391 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev <pifon@google.com>
Date: Wed, 24 Dec 2025 04:45:18 -0800
Subject: [PATCH 743/753] [XLA] Move xla.GpuTopology proto out of PJRT to XLA.

It has to become a part of Compiler::CompilerOptions, but CompilerOptions should not depend on PJRT. So, moving it here.

PiperOrigin-RevId: 848523186
---
 tensorflow/core/common_runtime/eager/BUILD    |  2 +-
 .../eager/context_distributed_manager.cc      |  2 +-
 third_party/xla/xla/pjrt/c/BUILD              |  2 +-
 .../xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc |  2 +-
 third_party/xla/xla/pjrt/distributed/BUILD    |  2 +-
 .../xla/xla/pjrt/distributed/topology_util.cc |  2 +-
 .../xla/xla/pjrt/distributed/topology_util.h  |  2 +-
 third_party/xla/xla/pjrt/gpu/BUILD            | 39 +++++--------------
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc    |  4 +-
 .../xla/xla/pjrt/gpu/se_gpu_pjrt_client.h     |  4 +-
 .../gpu/se_gpu_pjrt_client_nvshmem_test.cc    |  2 +-
 .../xla/pjrt/gpu/se_gpu_pjrt_client_test.cc   |  4 +-
 .../xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc |  2 +-
 .../pjrt/gpu/se_gpu_topology_description.cc   |  4 +-
 .../pjrt/gpu/se_gpu_topology_description.h    |  2 +-
 .../gpu/se_gpu_topology_description_test.cc   |  2 +-
 third_party/xla/xla/pjrt/gpu/tfrt/BUILD       |  8 ++--
 ...u_async_host_to_device_transfer_manager.cc |  2 +-
 ...pu_async_host_to_device_transfer_manager.h |  2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc  |  2 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc  |  4 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h   |  2 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc |  4 +-
 .../xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc  |  2 +-
 .../xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc  |  2 +-
 third_party/xla/xla/pjrt/gpu/tfrt/utils.cc    |  4 +-
 third_party/xla/xla/pjrt/gpu/tfrt/utils.h     |  2 +-
 third_party/xla/xla/service/BUILD             | 25 ++++++++++++
 .../xla/{pjrt/gpu => service}/gpu_topology.cc |  4 +-
 .../xla/{pjrt/gpu => service}/gpu_topology.h  |  9 +++--
 .../{pjrt/gpu => service}/gpu_topology.proto  |  3 ++
 31 files changed, 80 insertions(+), 72 deletions(-)
 rename third_party/xla/xla/{pjrt/gpu => service}/gpu_topology.cc (94%)
 rename third_party/xla/xla/{pjrt/gpu => service}/gpu_topology.h (94%)
 rename third_party/xla/xla/{pjrt/gpu => service}/gpu_topology.proto (89%)

diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index d285bb2f8740d1..ed5ab0149ecbee 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -272,7 +272,7 @@ tf_cuda_library(
                 "//tensorflow/core/framework:resource_base",
                 "@local_xla//xla/pjrt/distributed:key_value_store_interface",
                 "@local_xla//xla/pjrt:local_device_state",
-                "@local_xla//xla/pjrt/gpu:gpu_topology",
+                "@local_xla//xla/service:gpu_topology",
                 "@local_xla//xla/pjrt:pjrt_client",
                 "@local_xla//xla/pjrt:pjrt_compiler",
                 "@local_xla//xla/service/gpu:gpu_executable_run_options",
diff --git a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
index deeab20af15aea..8725479fb891da 100644
--- a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
+++ b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
@@ -80,11 +80,11 @@ limitations under the License.
 #if (defined(PLATFORM_GOOGLE) && defined(TF_PLATFORM_LINUX_X86_64))
 #define TF_GPU_USE_PJRT
 #include "xla/pjrt/distributed/key_value_store_interface.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/local_device_state.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/service/gpu_topology.h"
 #include "tensorflow/core/framework/resource_base.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/tfrt/common/global_state.h"
diff --git a/third_party/xla/xla/pjrt/c/BUILD b/third_party/xla/xla/pjrt/c/BUILD
index babc266cfdb341..a799192970ef25 100644
--- a/third_party/xla/xla/pjrt/c/BUILD
+++ b/third_party/xla/xla/pjrt/c/BUILD
@@ -454,7 +454,6 @@ cc_library(
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt/extensions/cross_host_transfers:pjrt_c_api_cross_host_transfer_extension",
         "//xla/pjrt/gpu:gpu_helpers",
-        "//xla/pjrt/gpu:gpu_topology",
         "//xla/pjrt/gpu:se_gpu_pjrt_client",
         "//xla/pjrt/gpu:se_gpu_pjrt_compiler",  # buildcleaner: keep to register GPU AOT compiler
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_pjrt_client",
@@ -464,6 +463,7 @@ cc_library(
         "//xla/python:inspect_sharding",  # To register "InspectSharding" custom partitioning handler.
         "//xla/service:compiler",
         "//xla/service:custom_call_target_registry",
+        "//xla/service:gpu_topology",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/status",
         "@com_google_absl//absl/strings:str_format",
diff --git a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
index dd5fa7400a7ccb..3575515a78fa6c 100644
--- a/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
+++ b/third_party/xla/xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -46,7 +46,6 @@ limitations under the License.
 #include "xla/pjrt/c/pjrt_c_api_wrapper_impl.h"
 #include "xla/pjrt/extensions/cross_host_transfers/pjrt_c_api_cross_host_transfers_extension.h"
 #include "xla/pjrt/gpu/gpu_helpers.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_common.h"
@@ -58,6 +57,7 @@ limitations under the License.
 #include "xla/python/custom_partition_callback.h"
 #include "xla/service/compiler.h"
 #include "xla/service/custom_call_target_registry.h"
+#include "xla/service/gpu_topology.h"
 
 #if GOOGLE_CUDA
 #include "third_party/gpus/cuda/include/cuda_runtime_api.h"
diff --git a/third_party/xla/xla/pjrt/distributed/BUILD b/third_party/xla/xla/pjrt/distributed/BUILD
index 4a2f1f754c3536..0795d46b0e4bd6 100644
--- a/third_party/xla/xla/pjrt/distributed/BUILD
+++ b/third_party/xla/xla/pjrt/distributed/BUILD
@@ -115,7 +115,7 @@ cc_library(
         ":protocol_proto_cc",
         "//xla:util",
         "//xla/pjrt:utils",
-        "//xla/pjrt/gpu:gpu_topology_proto_cc",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/tsl/platform:env",
         "//xla/tsl/platform:errors",
         "//xla/tsl/platform:statusor",
diff --git a/third_party/xla/xla/pjrt/distributed/topology_util.cc b/third_party/xla/xla/pjrt/distributed/topology_util.cc
index 09ae283e73e35e..16b3b3f9605643 100644
--- a/third_party/xla/xla/pjrt/distributed/topology_util.cc
+++ b/third_party/xla/xla/pjrt/distributed/topology_util.cc
@@ -38,8 +38,8 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/utils.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/tsl/platform/env.h"
 #include "xla/tsl/platform/errors.h"
 #include "xla/tsl/platform/statusor.h"
diff --git a/third_party/xla/xla/pjrt/distributed/topology_util.h b/third_party/xla/xla/pjrt/distributed/topology_util.h
index 55f0d69a2ef433..d586137712cad1 100644
--- a/third_party/xla/xla/pjrt/distributed/topology_util.h
+++ b/third_party/xla/xla/pjrt/distributed/topology_util.h
@@ -25,7 +25,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
+#include "xla/service/gpu_topology.pb.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/pjrt/gpu/BUILD b/third_party/xla/xla/pjrt/gpu/BUILD
index d4a9ae9085c66e..61a60fe1070437 100644
--- a/third_party/xla/xla/pjrt/gpu/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/BUILD
@@ -5,7 +5,6 @@ load("//xla/pjrt/gpu:package_groups.bzl", "xla_gpu_internal_packages")
 load("//xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm")
 load("//xla/tests:build_defs.bzl", "xla_test")
 load("//xla/tsl:tsl.bzl", "if_google", "internal_visibility")
-load("//xla/tsl/platform:build_config.bzl", "tf_proto_library")
 load("//xla/tsl/platform:rules_cc.bzl", "cc_library")
 
 # Integrate with PJRT rather than the GPU client directly.
@@ -56,8 +55,6 @@ cc_library(
     deps = [
         ":gpu_helpers",
         ":gpu_metrics",
-        ":gpu_topology",
-        ":gpu_topology_proto_cc",
         ":se_gpu_topology_description",
         "//xla:executable_run_options",
         "//xla:future",
@@ -112,6 +109,8 @@ cc_library(
         "//xla/service:compiler",
         "//xla/service:computation_placer_hdr",
         "//xla/service:executable",
+        "//xla/service:gpu_topology",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/service:platform_util",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
@@ -204,8 +203,6 @@ xla_test(
     backends = ["gpu"],
     tags = ["nofixdeps"],
     deps = [
-        ":gpu_topology",
-        ":gpu_topology_proto_cc",
         ":se_gpu_pjrt_client",
         ":se_gpu_topology_description",
         "//xla:debug_options_flags",
@@ -243,6 +240,8 @@ xla_test(
         "//xla/pjrt/profiling:device_time_measurement",
         "//xla/pjrt/profiling/test_util:mock_device_time_measurement",
         "//xla/pjrt/proto:compile_options_proto_cc",
+        "//xla/service:gpu_topology",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/service:platform_util",
         "//xla/service/gpu:gpu_memory_space_assignment",
         "//xla/stream_executor:device_address",
@@ -377,7 +376,6 @@ xla_test(
         "XLA_FLAGS": "--xla_gpu_experimental_enable_nvshmem=true",
     },
     deps = [
-        ":gpu_topology_proto_cc",
         ":se_gpu_pjrt_client",
         "//xla:shape_util",
         "//xla:util",
@@ -399,6 +397,7 @@ xla_test(
         "//xla/pjrt/distributed:in_memory_key_value_store",
         "//xla/pjrt/distributed:service",
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/service:platform_util",
         "//xla/tests:literal_test_util",
         "//xla/tsl/lib/core:status_test_util",
@@ -431,26 +430,6 @@ xla_test(
     ],
 )
 
-tf_proto_library(
-    name = "gpu_topology_proto",
-    srcs = ["gpu_topology.proto"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "gpu_topology",
-    srcs = ["gpu_topology.cc"],
-    hdrs = ["gpu_topology.h"],
-    visibility = internal_visibility([
-        "//xla/pjrt/gpu:legacy_gpu_topology_users",
-        ":__subpackages__",
-    ]),
-    deps = [
-        ":gpu_topology_proto_cc",
-        "@com_google_absl//absl/strings:string_view",
-    ],
-)
-
 cc_library(
     name = "se_gpu_pjrt_compiler_impl",
     srcs = ["se_gpu_pjrt_compiler.cc"],
@@ -566,7 +545,6 @@ xla_test(
     srcs = ["se_gpu_pjrt_compiler_test.cc"],
     backends = ["gpu"],
     deps = [
-        ":gpu_topology",
         ":se_gpu_pjrt_client",
         ":se_gpu_pjrt_compiler_impl",
         ":se_gpu_topology_description",
@@ -580,6 +558,7 @@ xla_test(
         "//xla/pjrt:pjrt_compiler",
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
+        "//xla/service:gpu_topology",
         "//xla/tests:literal_test_util",
         "//xla/tsl/platform:statusor",
         "@com_google_absl//absl/status",
@@ -636,8 +615,6 @@ cc_library(
         ":__subpackages__",
     ]),
     deps = [
-        ":gpu_topology",
-        ":gpu_topology_proto_cc",
         "//xla:shape_util",
         "//xla:xla_data_proto_cc",
         "//xla/pjrt:pjrt_common",
@@ -646,6 +623,8 @@ cc_library(
         "//xla/pjrt:pjrt_device_dimensions",
         "//xla/pjrt:pjrt_stream_executor_device_description",
         "//xla/pjrt/proto:topology_description_proto_cc",
+        "//xla/service:gpu_topology",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/stream_executor:device_description_proto_cc",
         "//xla/tsl/lib/strings:proto_serialization",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -664,13 +643,13 @@ xla_cc_test(
     name = "se_gpu_topology_description_test",
     srcs = ["se_gpu_topology_description_test.cc"],
     deps = [
-        ":gpu_topology",
         ":se_gpu_topology_description",
         "//xla/pjrt:pjrt_common",
         "//xla/pjrt:pjrt_compiler",
         "//xla/pjrt:pjrt_device_description",
         "//xla/pjrt:pjrt_device_dimensions",
         "//xla/pjrt:pjrt_stream_executor_device_description",
+        "//xla/service:gpu_topology",
         "//xla/tsl/platform:statusor",
         "@com_google_googletest//:gtest_main",
     ],
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
index d072e94c364fdd..b97b44146533e8 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -73,8 +73,6 @@ limitations under the License.
 #include "xla/pjrt/distributed/topology_util.h"
 #include "xla/pjrt/event_pool.h"
 #include "xla/pjrt/gpu/gpu_helpers.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/host_memory_spaces.h"
@@ -96,6 +94,8 @@ limitations under the License.
 #include "xla/service/compiler.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/gpu/gpu_memory_space_assignment.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
index b65f9a7f4af02a..2e36a66365d7e7 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -42,8 +42,6 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/pjrt/distributed/client.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/host_memory_allocator.h"
 #include "xla/pjrt/local_device_state.h"
@@ -57,6 +55,8 @@ limitations under the License.
 #include "xla/runtime/device_id.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/shape.h"
 #include "xla/shape_tree.h"
 #include "xla/stream_executor/device_address_allocator.h"
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_nvshmem_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_nvshmem_test.cc
index 07cbfb7ba3276f..1c171fc070861f 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_nvshmem_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_nvshmem_test.cc
@@ -39,13 +39,13 @@ limitations under the License.
 #include "xla/pjrt/distributed/distributed.h"
 #include "xla/pjrt/distributed/in_memory_key_value_store.h"
 #include "xla/pjrt/distributed/service.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h"
 #include "xla/pjrt/raw_buffer.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
index a75c9f3900af80..f995e57d579cd6 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -69,8 +69,6 @@ limitations under the License.
 #include "xla/pjrt/distributed/distributed.h"
 #include "xla/pjrt/distributed/in_memory_key_value_store.h"
 #include "xla/pjrt/distributed/service.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/host_memory_spaces.h"
 #include "xla/pjrt/local_device_state.h"
@@ -87,6 +85,8 @@ limitations under the License.
 #include "xla/pjrt/proto/compile_options.pb.h"
 #include "xla/pjrt/raw_buffer.h"
 #include "xla/service/gpu/gpu_memory_space_assignment.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
index 357c7c0a2d1483..ba467b0c8be09f 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler_test.cc
@@ -36,13 +36,13 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/se_gpu_pjrt_client.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/pjrt_client.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h"
+#include "xla/service/gpu_topology.h"
 #include "xla/tests/literal_test_util.h"
 #include "xla/tsl/platform/statusor.h"
 
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.cc
index f725d4359fe496..a966082b434edc 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.cc
@@ -27,8 +27,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/layout.h"
 #include "xla/layout_util.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_device_description.h"
@@ -36,6 +34,8 @@ limitations under the License.
 #include "xla/pjrt/pjrt_stream_executor_device_description.h"
 #include "xla/pjrt/proto/topology_description.pb.h"
 #include "xla/primitive_util.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
 #include "xla/tsl/lib/strings/proto_serialization.h"
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.h b/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.h
index 50dc6f291c4fc3..7e8a340ff4ad31 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.h
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description.h
@@ -24,13 +24,13 @@ limitations under the License.
 #include "absl/status/statusor.h"
 #include "absl/strings/string_view.h"
 #include "absl/types/span.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_device_description.h"
 #include "xla/pjrt/pjrt_device_dimensions.h"
 #include "xla/pjrt/pjrt_stream_executor_device_description.h"
 #include "xla/pjrt/proto/topology_description.pb.h"
+#include "xla/service/gpu_topology.h"
 #include "xla/stream_executor/device_description.pb.h"
 #include "xla/xla_data.pb.h"
 
diff --git a/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description_test.cc b/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description_test.cc
index 3d73431e71b2d5..3e6b300947fc73 100644
--- a/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/se_gpu_topology_description_test.cc
@@ -21,12 +21,12 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/pjrt_common.h"
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_device_description.h"
 #include "xla/pjrt/pjrt_device_dimensions.h"
 #include "xla/pjrt/pjrt_stream_executor_device_description.h"
+#include "xla/service/gpu_topology.h"
 #include "xla/tsl/platform/statusor.h"
 
 namespace xla {
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
index 04d720161c86b5..8f7af9cb74b53c 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/BUILD
@@ -85,8 +85,6 @@ cc_library(
         "//xla/pjrt/distributed:topology_util",
         "//xla/pjrt/dump",
         "//xla/pjrt/gpu:gpu_helpers",
-        "//xla/pjrt/gpu:gpu_topology",
-        "//xla/pjrt/gpu:gpu_topology_proto_cc",
         "//xla/pjrt/gpu:se_gpu_topology_description",
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_allocator_config",
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
@@ -98,6 +96,8 @@ cc_library(
         "//xla/service:computation_placer_hdr",
         "//xla/service:executable",
         "//xla/service:generic_transfer_manager",
+        "//xla/service:gpu_topology",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/service:hlo_cost_analysis",
         "//xla/service:hlo_proto_cc",
         "//xla/service:maybe_owning_device_address",
@@ -212,10 +212,10 @@ xla_test(
         "//xla/pjrt:pjrt_executable",
         "//xla/pjrt:raw_buffer",
         "//xla/pjrt/distributed:in_memory_key_value_store",
-        "//xla/pjrt/gpu:gpu_topology",
-        "//xla/pjrt/gpu:gpu_topology_proto_cc",
         "//xla/pjrt/plugin/xla_gpu:xla_gpu_client_options",
         "//xla/pjrt/proto:compile_options_proto_cc",
+        "//xla/service:gpu_topology",
+        "//xla/service:gpu_topology_proto_cc",
         "//xla/service:platform_util",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:device_description",
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
index 3f987409f0b21f..c15aa659676f74 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.cc
@@ -38,7 +38,6 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/literal.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
@@ -49,6 +48,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/proto/compile_options.pb.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.h b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.h
index 4479525d4798d3..2f5cac56f62c93 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.h
@@ -38,7 +38,6 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/literal.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
@@ -46,6 +45,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_compiler.h"
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/proto/compile_options.pb.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_description.pb.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
index 506fae3cfe85ba..6bfc7a4f27967f 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.cc
@@ -39,7 +39,6 @@ limitations under the License.
 #include "xla/layout_util.h"
 #include "xla/literal.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
@@ -52,6 +51,7 @@ limitations under the License.
 #include "xla/pjrt/proto/compile_options.pb.h"
 #include "xla/pjrt/transpose.h"
 #include "xla/primitive_util.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/service/transfer_manager.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
index 85f78e92444ec2..a80e288238ac2f 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.cc
@@ -60,8 +60,6 @@ limitations under the License.
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/dump/dump.h"
 #include "xla/pjrt/gpu/gpu_helpers.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_async_host_to_device_transfer_manager.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
@@ -87,6 +85,8 @@ limitations under the License.
 #include "xla/service/computation_placer.h"
 #include "xla/service/executable.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/service/shaped_buffer.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
index f08c7d9076c9d8..55469f351f373b 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client.h
@@ -46,7 +46,6 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/maybe_owning.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
@@ -60,6 +59,7 @@ limitations under the License.
 #include "xla/pjrt/transpose.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/service/gpu_topology.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/hlo_cost_analysis.h"
 #include "xla/shape.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
index 32a04ec6b5115a..ceeb56ce27b7ba 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_client_test.cc
@@ -57,8 +57,6 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/literal_util.h"
 #include "xla/pjrt/distributed/in_memory_key_value_store.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_device.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_executable.h"
@@ -72,6 +70,8 @@ limitations under the License.
 #include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h"
 #include "xla/pjrt/proto/compile_options.pb.h"
 #include "xla/pjrt/raw_buffer.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/platform_util.h"
 #include "xla/shape.h"
 #include "xla/shape_util.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc
index 96204308bc2384..c915187600b2d9 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_device.cc
@@ -41,7 +41,6 @@ limitations under the License.
 #include "xla/executable_run_options.h"
 #include "xla/literal.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
 #include "xla/pjrt/gpu/tfrt/utils.h"
@@ -52,6 +51,7 @@ limitations under the License.
 #include "xla/pjrt/pjrt_executable.h"
 #include "xla/pjrt/proto/compile_options.pb.h"
 #include "xla/pjrt/utils.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/status_macros.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
index 88fce7477ce884..839c1fe31ded3f 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/tfrt_gpu_executable.cc
@@ -48,7 +48,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_print_options.h"
 #include "xla/layout.h"
 #include "xla/pjrt/distributed/protocol.pb.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
 #include "xla/pjrt/gpu/tfrt/tracked_gpu_device_buffer.h"
@@ -68,6 +67,7 @@ limitations under the License.
 #include "xla/service/computation_placer.h"
 #include "xla/service/executable.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/maybe_owning_device_address.h"
 #include "xla/service/shaped_buffer.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc b/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc
index a59dd22155ddb6..7c011e83558428 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/utils.cc
@@ -60,8 +60,6 @@ limitations under the License.
 #include "xla/pjrt/distributed/protocol.pb.h"
 #include "xla/pjrt/distributed/topology_util.h"
 #include "xla/pjrt/gpu/gpu_helpers.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_client.h"
@@ -77,6 +75,8 @@ limitations under the License.
 #include "xla/runtime/device_id.h"
 #include "xla/service/compiler.h"
 #include "xla/service/computation_placer.h"
+#include "xla/service/gpu_topology.h"
+#include "xla/service/gpu_topology.pb.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/service/transfer_manager.h"
 #include "xla/shape.h"
diff --git a/third_party/xla/xla/pjrt/gpu/tfrt/utils.h b/third_party/xla/xla/pjrt/gpu/tfrt/utils.h
index c7599bd4967d97..21b1bdf4110fcd 100644
--- a/third_party/xla/xla/pjrt/gpu/tfrt/utils.h
+++ b/third_party/xla/xla/pjrt/gpu/tfrt/utils.h
@@ -38,7 +38,6 @@ limitations under the License.
 #include "xla/layout.h"
 #include "xla/maybe_owning.h"
 #include "xla/pjrt/distributed/key_value_store_interface.h"
-#include "xla/pjrt/gpu/gpu_topology.h"
 #include "xla/pjrt/gpu/se_gpu_topology_description.h"
 #include "xla/pjrt/gpu/tfrt/gpu_event.h"
 #include "xla/pjrt/gpu/tfrt/tfrt_gpu_buffer.h"
@@ -51,6 +50,7 @@ limitations under the License.
 #include "xla/pjrt/plugin/xla_gpu/xla_gpu_client_options.h"
 #include "xla/service/computation_placer.h"
 #include "xla/service/gpu/gpu_executable_run_options.h"
+#include "xla/service/gpu_topology.h"
 #include "xla/service/hlo.pb.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_address_allocator.h"
diff --git a/third_party/xla/xla/service/BUILD b/third_party/xla/xla/service/BUILD
index d42050eb0f5a2a..1fd795ebfb143a 100644
--- a/third_party/xla/xla/service/BUILD
+++ b/third_party/xla/xla/service/BUILD
@@ -6257,6 +6257,31 @@ xla_cc_test(
     ],
 )
 
+tf_proto_library(
+    name = "gpu_topology_proto",
+    srcs = ["gpu_topology.proto"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "gpu_topology",
+    srcs = ["gpu_topology.cc"],
+    hdrs = ["gpu_topology.h"],
+    visibility = internal_visibility([
+        "//xla/pjrt:__subpackages__",
+        "//third_party/pathways:__subpackages__",
+        "//learning/brain/research/pjrt:__subpackages__",
+        "//learning/brain/research/jax:__subpackages__",
+        "//learning/pathways/compilation_service:__subpackages__",
+        "//tensorflow/core/common_runtime/eager:__subpackages__",
+        ":__subpackages__",
+    ]),
+    deps = [
+        ":gpu_topology_proto_cc",
+        "@com_google_absl//absl/strings:string_view",
+    ],
+)
+
 cc_library(
     name = "matmul_indexing_utils",
     srcs = ["matmul_indexing_utils.cc"],
diff --git a/third_party/xla/xla/pjrt/gpu/gpu_topology.cc b/third_party/xla/xla/service/gpu_topology.cc
similarity index 94%
rename from third_party/xla/xla/pjrt/gpu/gpu_topology.cc
rename to third_party/xla/xla/service/gpu_topology.cc
index 9932d1a52c74d7..fd3304126ffd4c 100644
--- a/third_party/xla/xla/pjrt/gpu/gpu_topology.cc
+++ b/third_party/xla/xla/service/gpu_topology.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "xla/pjrt/gpu/gpu_topology.h"
+#include "xla/service/gpu_topology.h"
 
 #include <memory>
 
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
+#include "xla/service/gpu_topology.pb.h"
 
 namespace xla {
 
diff --git a/third_party/xla/xla/pjrt/gpu/gpu_topology.h b/third_party/xla/xla/service/gpu_topology.h
similarity index 94%
rename from third_party/xla/xla/pjrt/gpu/gpu_topology.h
rename to third_party/xla/xla/service/gpu_topology.h
index dea991f99bfeb4..9d5137e6b99f64 100644
--- a/third_party/xla/xla/pjrt/gpu/gpu_topology.h
+++ b/third_party/xla/xla/service/gpu_topology.h
@@ -13,17 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef XLA_PJRT_GPU_GPU_TOPOLOGY_H_
-#define XLA_PJRT_GPU_GPU_TOPOLOGY_H_
+#ifndef XLA_SERVICE_GPU_TOPOLOGY_H_
+#define XLA_SERVICE_GPU_TOPOLOGY_H_
 
 #include <cstdint>
 #include <memory>
 #include <string>
 
 #include "absl/strings/string_view.h"
-#include "xla/pjrt/gpu/gpu_topology.pb.h"
+#include "xla/service/gpu_topology.pb.h"
 
 namespace xla {
+
 class GpuTopology {
  public:
   explicit GpuTopology(absl::string_view platform_version,
@@ -77,4 +78,4 @@ class GpuTopology {
 
 }  // namespace xla
 
-#endif  // XLA_PJRT_GPU_GPU_TOPOLOGY_H_
+#endif  // XLA_SERVICE_GPU_TOPOLOGY_H_
diff --git a/third_party/xla/xla/pjrt/gpu/gpu_topology.proto b/third_party/xla/xla/service/gpu_topology.proto
similarity index 89%
rename from third_party/xla/xla/pjrt/gpu/gpu_topology.proto
rename to third_party/xla/xla/service/gpu_topology.proto
index 405346499901eb..7f12d68fb71b81 100644
--- a/third_party/xla/xla/pjrt/gpu/gpu_topology.proto
+++ b/third_party/xla/xla/service/gpu_topology.proto
@@ -2,6 +2,9 @@ syntax = "proto3";
 
 package xla;
 
+option java_multiple_files = true;
+option java_outer_classname = "GpuTopologyProto";
+
 // A proto used to serialize GpuTopology instances.
 message GpuTopologyProto {
   reserved 1;  // Was: device_ids

From fcc2b82ea348cb55c0818aa5bb94b850325d14b6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 24 Dec 2025 05:33:41 -0800
Subject: [PATCH 744/753] Automated Code Change

PiperOrigin-RevId: 848534440
---
 third_party/xla/xla/python/ifrt/ir/BUILD                         | 1 +
 third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc   | 1 -
 third_party/xla/xla/python/ifrt/ir/ifrt_ir_executable_version.cc | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/python/ifrt/ir/BUILD b/third_party/xla/xla/python/ifrt/ir/BUILD
index 8f35b34468e5b8..77ea0524094d7b 100644
--- a/third_party/xla/xla/python/ifrt/ir/BUILD
+++ b/third_party/xla/xla/python/ifrt/ir/BUILD
@@ -179,6 +179,7 @@ cc_library(
         "//xla/python/ifrt",
         "//xla/python/ifrt:device_proto_cc",
         "//xla/python/ifrt:serdes",
+        "//xla/python/ifrt:serdes_proto_cc",
         "//xla/python/ifrt:serdes_version",
         "//xla/python/ifrt:serdes_week_4_old_version_accessor",
         "//xla/python/pjrt_ifrt:xla_executable_version",
diff --git a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
index e1d96bff98046b..2cd2b723343cbd 100644
--- a/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
+++ b/third_party/xla/xla/python/ifrt/ir/compiled_ifrt_ir_program.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <optional>
 #include <string>
 #include <utility>
-#include <variant>
 #include <vector>
 
 #include "absl/cleanup/cleanup.h"
diff --git a/third_party/xla/xla/python/ifrt/ir/ifrt_ir_executable_version.cc b/third_party/xla/xla/python/ifrt/ir/ifrt_ir_executable_version.cc
index 76cad284e95c04..27443bda6b27f4 100644
--- a/third_party/xla/xla/python/ifrt/ir/ifrt_ir_executable_version.cc
+++ b/third_party/xla/xla/python/ifrt/ir/ifrt_ir_executable_version.cc
@@ -40,6 +40,7 @@ limitations under the License.
 #include "xla/python/ifrt/ir/ifrt_ir_executable_version.pb.h"
 #include "xla/python/ifrt/ir/version.h"
 #include "xla/python/ifrt/serdes.h"
+#include "xla/python/ifrt/serdes.pb.h"
 #include "xla/python/ifrt/serdes_version.h"
 #include "xla/python/ifrt/serdes_week_4_old_version_accessor.h"
 #include "xla/python/pjrt_ifrt/xla_executable_version.h"

From 5d50048b6f809c47041046a205a5a2b23d870268 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Wed, 24 Dec 2025 15:13:56 +0000
Subject: [PATCH 745/753] Fix merge conflicts

---
 tensorflow/core/kernels/bias_op_gpu.cu.cc     |  7 +-----
 .../xla/xla/backends/gpu/codegen/triton/BUILD |  8 ------
 .../gpu/tests/triton_calling_convention.hlo   |  4 ---
 third_party/xla/xla/stream_executor/gpu/BUILD | 25 -------------------
 .../gpu/gpu_device_info_test.cc               |  6 -----
 .../xla/xla/tools/xla_gpu_compile_lib_test.cc |  7 ------
 6 files changed, 1 insertion(+), 56 deletions(-)

diff --git a/tensorflow/core/kernels/bias_op_gpu.cu.cc b/tensorflow/core/kernels/bias_op_gpu.cu.cc
index d744703ec332f5..dac9640b1e61ff 100644
--- a/tensorflow/core/kernels/bias_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/bias_op_gpu.cu.cc
@@ -147,13 +147,8 @@ __global__ void BiasGradNHWC_SharedAtomics(
 
   for (int32_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
-<<<<<<< HEAD
-    int32 bias_offset = index % bias_size;
-    GpuAtomicAddShared(s_data + bias_offset, AccT(ldg(output_backprop + index)));
-=======
     int32_t bias_offset = index % bias_size;
-    GpuAtomicAdd(s_data + bias_offset, AccT(ldg(output_backprop + index)));
->>>>>>> upstream/master
+    GpuAtomicAddShared(s_data + bias_offset, AccT(ldg(output_backprop + index)));
   }
   __syncthreads();
 
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index 30d7933bc86be7..75366a8756f8ef 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -631,12 +631,8 @@ xla_test(
 
 xla_test(
     name = "fusion_emitter_int4_device_test",
-<<<<<<< HEAD
-    srcs = if_gpu_is_configured(["fusion_emitter_int4_device_test.cc"]),
-=======
     size = "large",
     srcs = ["fusion_emitter_int4_device_test.cc"],
->>>>>>> upstream/master
     backends = [
         "a100",
         "h100",
@@ -839,12 +835,8 @@ cc_library(
 
 xla_test(
     name = "fusion_emitter_large_test",
-<<<<<<< HEAD
-    srcs = if_gpu_is_configured(["fusion_emitter_large_test.cc"]),
-=======
     size = "large",
     srcs = ["fusion_emitter_large_test.cc"],
->>>>>>> upstream/master
     backends = [
         "a100",
         "h100",
diff --git a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
index 5f28a515e93f54..1722fd44a349ae 100644
--- a/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
+++ b/third_party/xla/xla/service/gpu/tests/triton_calling_convention.hlo
@@ -1,8 +1,4 @@
-<<<<<<< HEAD
-// RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../tools/hlo_opt/gpu_specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
-=======
 // RUN: hlo-opt %s --platform=gpu --stage=llvm-before-optimizations --xla_gpu_target_config_filename=%S/../../../backends/gpu/target_config/specs/%{GPU}.txtpb | FileCheck --check-prefixes=CHECK-%{PTX} %s
->>>>>>> upstream/master
 
 // Verify that Triton kernels have the correct calling convention:
 // - PTX_KERNEL (71) for NVIDIA targets
diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index ab1a2a79db6ad5..699589d14bdf5e 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -690,32 +690,7 @@ xla_test(
     name = "gpu_device_info_test",
     srcs = ["gpu_device_info_test.cc"],
     backends = ["gpu"],
-<<<<<<< HEAD
-    data = if_cuda_is_configured([
-        "//xla/tools/hlo_opt:gpu_specs/a100_pcie_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_40.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a100_sxm_80.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/a6000.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/b200.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/h100_pcie.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/h100_sxm.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/mi200.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/p100.txtpb",
-        "//xla/tools/hlo_opt:gpu_specs/v100.txtpb",
-    ]) + if_rocm_is_configured([
-        "//xla/tools/hlo_opt:gpu_specs/mi200.txtpb",
-    ]),
-    local_defines = if_cuda_is_configured([
-        'GPU_SPEC_FILE_NAMES=(std::string[]){\\"a100_pcie_80\\", \\"a100_sxm_40\\", \
-         \\"a100_sxm_80\\", \\"a6000\\", \\"h100_pcie\\", \\"h100_sxm\\", \\"p100\\", \\"v100\\"}',
-        'PLATFORM_NAME=\\"CUDA\\"',
-    ]) + if_rocm_is_configured([
-        'GPU_SPEC_FILE_NAMES=(std::string[]){\\"mi200\\"}',
-        'PLATFORM_NAME=\\"ROCM\\"',
-    ]),
-=======
     data = ["//xla/backends/gpu/target_config:all_gpu_specs"],
->>>>>>> upstream/master
     deps = [
         "//xla/service:platform_util",
         "//xla/stream_executor:device_description",
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
index 38b2dba40541f5..2e1ff2f437d97b 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
@@ -46,15 +46,9 @@ TEST(DeviceInfoTest, DeviceInfoMatches) {
     path = path.erase(path.length() - 4);
     TF_ASSERT_OK(tsl::ReadFileToString(
         tsl::Env::Default(),
-<<<<<<< HEAD
-        tsl::io::JoinPath(path, "external/local_xla/xla",
-                          "tools", "hlo_opt",
-                          "gpu_specs", absl::StrCat(file_name, ".txtpb")),
-=======
         tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
                           "backends/gpu/target_config/specs",
                           absl::StrCat(file_name, ".txtpb")),
->>>>>>> upstream/master
         &spec_string));
     ASSERT_TRUE(
         tsl::protobuf::TextFormat::ParseFromString(spec_string, &proto));
diff --git a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
index 0831c755db41cc..4b6fbab412cb2a 100644
--- a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
+++ b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
@@ -72,16 +72,9 @@ TEST_F(XlaCompileLibTest, CompilesForGpuWithDevice) {
 }
 
 TEST_F(XlaCompileLibTest, CompilesForGpuWithoutDevice) {
-<<<<<<< HEAD
-  auto path = tsl::testing::XlaSrcRoot();
-  path = path.erase(path.length() - 4);
-  const std::string target_config_path = tsl::io::JoinPath(
-      path, "external/local_xla/xla/tools/hlo_opt/gpu_specs", "h100_sxm.txtpb");
-=======
   const std::string target_config_path =
       tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
                         "backends/gpu/target_config/specs", "h100_sxm.txtpb");
->>>>>>> upstream/master
   stream_executor::GpuTargetConfigProto target_config;
   TF_ASSERT_OK(tsl::ReadTextProto(tsl::Env::Default(), target_config_path,
                                   &target_config));

From 14e1429cb3ae0a674141bee3f4a6a00da7c71670 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Wed, 24 Dec 2025 15:14:14 +0000
Subject: [PATCH 746/753] Revert 692e221939e502bb782b678c960261088f7dd533

---
 tensorflow/workspace2.bzl | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 7612bf4125f576..583ef31b4f61c0 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -1,6 +1,5 @@
 """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it."""
 
-load("//third_party:repo.bzl", "tf_vendored")
 load("@bazel_features//:deps.bzl", "bazel_features_deps")
 load("@bazel_skylib//lib:versions.bzl", "versions")
 load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external")
@@ -410,16 +409,8 @@ def _tf_repositories():
         },
     )
 
-    # Use XLA's googletest wrapper which provides EXPECT_OK and ASSERT_OK macros.
-    # This wrapper adds those macros to the open-source gmock/gmock.h header,
-    # matching the behavior of internal builds.
-    tf_vendored(
-        name = "com_google_googletest",
-        path = "third_party/xla/third_party/xla_googletest_wrapper",
-    )
-
     tf_http_archive(
-        name = "com_google_googletest_upstream",
+        name = "com_google_googletest",
         # Use the commit on 2025/6/09:
         # https://github.com/google/googletest/commit/28e9d1f26771c6517c3b4be10254887673c94018
         sha256 = "f253ca1a07262f8efde8328e4b2c68979e40ddfcfc001f70d1d5f612c7de2974",

From 422ffee86b34def974ec8432067007cbde51d420 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Thu, 25 Dec 2025 11:39:34 +0000
Subject: [PATCH 747/753] Remove leftover diff symbols

---
 .../xla_googletest_wrapper/include/gmock/gmock.h         | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h b/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
index e5a79543245c08..cd1c5d7891af7f 100644
--- a/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
+++ b/third_party/xla/third_party/xla_googletest_wrapper/include/gmock/gmock.h
@@ -1,5 +1,3 @@
-<<<<<<< Conflict 1 of 1
-%%%%%%% Changes from base to side #1
  /* Copyright 2025 The Abseil Authors & TensorFlow Authors. All Rights Reserved.
  
  Licensed under the Apache License, Version 2.0 (the "License");
@@ -64,8 +62,7 @@
    ASSERT_THAT(expression, ::xla_testing::internal::IsOk())
  
  #define ASSERT_OK_AND_ASSIGN(lhs, rexpr)                            \
--  TF_ASSERT_OK_AND_ASSIGN_IMPL(                                     \
-+  ASSERT_OK_AND_ASSIGN_IMPL(                                        \
+  ASSERT_OK_AND_ASSIGN_IMPL(                                        \
        XLA_STATUS_MACROS_CONCAT_NAME(_status_or_value, __COUNTER__), \
        lhs, rexpr);
  
@@ -129,6 +126,4 @@
  }  // namespace internal
  }  // namespace xla_testing
  
- #endif  // GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
-+++++++ Contents of side #2
->>>>>>> Conflict 1 of 1 ends
+ #endif  // GOOGLETEST_WRAPPER_GMOCK_GMOCK_H_
\ No newline at end of file

From 1feb80c4265f78bc4d9200d84cde22462225ee7b Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Tue, 30 Dec 2025 08:16:20 +0000
Subject: [PATCH 748/753] Fix gpu_device_info_test

---
 third_party/xla/xla/stream_executor/gpu/BUILD             | 8 ++++++++
 .../xla/xla/stream_executor/gpu/gpu_device_info_test.cc   | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/stream_executor/gpu/BUILD b/third_party/xla/xla/stream_executor/gpu/BUILD
index 699589d14bdf5e..9b11c015f54a7f 100644
--- a/third_party/xla/xla/stream_executor/gpu/BUILD
+++ b/third_party/xla/xla/stream_executor/gpu/BUILD
@@ -710,6 +710,14 @@ xla_test(
         "@local_tsl//tsl/platform:statusor",
         "@local_tsl//tsl/platform:test",
     ],
+    local_defines = if_cuda_is_configured([
+        'GPU_SPEC_FILE_NAMES=(std::string[]){\\"a100_pcie_80\\", \\"a100_sxm_40\\", \
+         \\"a100_sxm_80\\", \\"a6000\\", \\"h100_pcie\\", \\"h100_sxm\\", \\"p100\\", \\"v100\\"}',
+         'PLATFORM_NAME=\\"CUDA\\"'
+    ]) + if_rocm_is_configured([
+        'GPU_SPEC_FILE_NAMES=(std::string[]){\\"mi200\\"}',
+        'PLATFORM_NAME=\\"ROCM\\"'
+    ]),
 )
 
 tf_proto_library(
diff --git a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
index 2e1ff2f437d97b..b56153a0668dfd 100644
--- a/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
+++ b/third_party/xla/xla/stream_executor/gpu/gpu_device_info_test.cc
@@ -46,7 +46,7 @@ TEST(DeviceInfoTest, DeviceInfoMatches) {
     path = path.erase(path.length() - 4);
     TF_ASSERT_OK(tsl::ReadFileToString(
         tsl::Env::Default(),
-        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
+        tsl::io::JoinPath(path, "external/local_xla/xla",
                           "backends/gpu/target_config/specs",
                           absl::StrCat(file_name, ".txtpb")),
         &spec_string));

From cd67c4fa79426d8ca92db9330a256b12618613cb Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Tue, 30 Dec 2025 08:16:45 +0000
Subject: [PATCH 749/753] Fix amdgpu_register_spilling_test

---
 .../gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc
index 74b1c94feffa47..1edd0d62a1ca70 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_register_spilling_test.cc
@@ -50,8 +50,10 @@ class AMDGPURegisterSpillingTest
   // Helper to load IR module from test data
   std::unique_ptr<llvm::Module> LoadTestModule(llvm::LLVMContext* context,
                                                const std::string& filename) {
+    auto path = tsl::testing::XlaSrcRoot();
+    path = path.erase(path.length() - 4);                                            
     return LoadIRModule(
-        tsl::io::JoinPath(tsl::testing::XlaSrcRoot(), "service", "gpu",
+        tsl::io::JoinPath(path, "external/local_xla/xla", "service", "gpu",
                           "llvm_gpu_backend", "tests_data", filename),
         context);
   }

From 178730046a28208ae1a6b2b963fabbd5dbd9b507 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Tue, 30 Dec 2025 09:03:50 +0000
Subject: [PATCH 750/753] Use googletest status assert macros patches in tf
 workspace2.bzl too

---
 tensorflow/workspace2.bzl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 583ef31b4f61c0..067de3cdcac811 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -419,6 +419,8 @@ def _tf_repositories():
         #   - avoid dependencies on @fuchsia_sdk,
         #   - refer to re2 as @com_googlesource_code_re2,
         #   - refer to abseil as @com_google_absl.
+        #   - add status assert macros for consistency with internal gmock (see
+        #     README.add-status-macros.md).
         #
         # To update the patch, run:
         # $ cd ~
@@ -431,7 +433,11 @@ def _tf_repositories():
         # $ git diff > <client-root>/third_party/tensorflow/third_party/googletest/googletest.patch
         #
         # The patch path is relative to third_party/tensorflow.
-        patch_file = ["@local_xla//third_party/googletest:googletest.patch"],
+        patch_file = [
+            "@local_xla//third_party/googletest:googletest.patch",
+            "@local_xla//third_party/googletest:0001-Add-ASSERT_OK-EXPECT_OK-ASSERT_OK_AND_ASSIGN-macros.patch",
+            "@local_xla//third_party/googletest:0002-Rename-dependencies-for-workspace.bzl-build.patch",
+            ],
         urls = tf_mirror_urls("https://github.com/google/googletest/archive/28e9d1f26771c6517c3b4be10254887673c940189.zip"),
     )
 

From b28eff180ee9c078aafbbb63a632ad103c0881c5 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Tue, 30 Dec 2025 09:25:09 +0000
Subject: [PATCH 751/753] Remove remaining cuda-only tags and move failing
 subtests to test scripts

---
 .../tensorflow/tests/tf_saved_model/BUILD     |  1 -
 tensorflow/compiler/tests/BUILD               |  3 -
 tensorflow/core/kernels/mkl/BUILD             |  1 -
 tensorflow/core/kernels/mlir_generated/BUILD  |  1 -
 tensorflow/core/nccl/BUILD                    |  1 -
 tensorflow/core/util/autotune_maps/BUILD      |  1 -
 tensorflow/dtensor/cc/BUILD                   |  1 -
 tensorflow/dtensor/python/tests/BUILD         |  4 --
 .../python/compiler/tensorrt/test/BUILD       |  1 -
 tensorflow/python/debug/lib/BUILD             |  2 -
 tensorflow/python/distribute/BUILD            |  2 -
 tensorflow/python/feature_column/BUILD        |  2 -
 tensorflow/python/framework/BUILD             |  1 -
 .../python/kernel_tests/image_ops/BUILD       |  1 -
 tensorflow/python/kernel_tests/nn_ops/BUILD   |  2 -
 .../python/kernel_tests/sparse_ops/BUILD      |  1 -
 tensorflow/python/ops/BUILD                   | 11 ---
 tensorflow/python/ops/parallel_for/BUILD      |  1 -
 tensorflow/python/training/BUILD              |  1 -
 .../ci_build/linux/rocm/run_gpu_single.sh     | 16 ++++-
 .../tools/ci_build/linux/rocm/run_xla.sh      | 67 +++++++++++++++++++
 .../xla/xla/backends/gpu/codegen/triton/BUILD |  6 +-
 .../xla/xla/backends/gpu/profiler/BUILD       |  3 -
 .../xla/xla/backends/gpu/runtime/BUILD        |  2 -
 third_party/xla/xla/service/gpu/BUILD         |  3 -
 .../xla/xla/service/gpu/autotuning/BUILD      |  3 +-
 third_party/xla/xla/service/gpu/tests/BUILD   |  2 -
 .../xla/xla/service/gpu/transforms/BUILD      |  1 -
 third_party/xla/xla/tests/BUILD               |  6 --
 29 files changed, 85 insertions(+), 62 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD
index e052268b6ede98..162a597ef7c40e 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD
@@ -294,7 +294,6 @@ glob_lit_tests(
     default_tags = [
         "no_mac",  # TODO(b/191167848)
         "no_oss",  # TODO(b/190855110)
-        "cuda-only",
     ],
     driver = "@llvm-project//mlir:run_lit.sh",
     exclude = [
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index d65d30a0c88cbd..3989c361047566 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -1788,7 +1788,6 @@ tf_xla_py_strict_test(
     srcs = ["unary_ops_test.py"],
     shard_count = 20,
     tags = [
-        "cuda-only",
         "no_aarch64",  # TODO(b/348125886)
         "no_cuda_asan",  # times out
         "no_pip",  # TODO(b/149738646): fix pip install so these tests run on kokoro pip
@@ -2861,7 +2860,6 @@ tf_cuda_cc_test(
     tags = [
         "config-cuda-only",
         "no_pip",  # TODO(b/149738646): fix pip install so these tests run on kokoro pip
-        "cuda-only",  # ROCmSoftwarePlatform #958
         "noasan",  # TODO(b/201651800)
         "requires-gpu-nvidia",
     ] + tf_cuda_tests_tags(),
@@ -2882,7 +2880,6 @@ tf_cuda_cc_test(
     tags = [
         "config-cuda-only",
         "no_pip",  # TODO(b/149738646): fix pip install so these tests run on kokoro pip
-        "cuda-only",  # ROCmSoftwarePlatform #958
         "noasan",  # TODO(b/201651800)
         "requires-gpu-nvidia",
     ] + tf_cuda_tests_tags(),
diff --git a/tensorflow/core/kernels/mkl/BUILD b/tensorflow/core/kernels/mkl/BUILD
index 99786dee930818..702fae1c2b37ea 100644
--- a/tensorflow/core/kernels/mkl/BUILD
+++ b/tensorflow/core/kernels/mkl/BUILD
@@ -427,7 +427,6 @@ tf_cc_test_mkl(
     size = "small",
     srcs = ["mkl_fused_batch_norm_op_test.cc"],
     linkstatic = 1,
-    tags = ["cuda-only"], # fails on AMD Rome CPUs as of 2021-03-29
     deps = [
         ":mkl_conv_op",
         ":mkl_fused_batch_norm_op",
diff --git a/tensorflow/core/kernels/mlir_generated/BUILD b/tensorflow/core/kernels/mlir_generated/BUILD
index b2f1e1d16579bf..7105823f79f543 100644
--- a/tensorflow/core/kernels/mlir_generated/BUILD
+++ b/tensorflow/core/kernels/mlir_generated/BUILD
@@ -557,7 +557,6 @@ tf_cuda_cc_test(
     tags = tf_cuda_tests_tags() + [
         "no_cuda",  # TODO(b/196608406): re-enable
         "no_cuda_asan",  # TODO(b/171341759): re-enable.
-        "cuda-only",
     ],
     deps = [
         ":base_binary_ops_test",
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 9fdae56fb81d87..ec1ee113fcff2d 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -61,7 +61,6 @@ tf_cuda_cc_test(
         "multi_gpu",
         "no_oss",
         "notap",
-        "cuda-only", # flaky on CI as of 2022-05-30
     ],
     deps = [
         "//tensorflow/core:test",
diff --git a/tensorflow/core/util/autotune_maps/BUILD b/tensorflow/core/util/autotune_maps/BUILD
index 3868d4971b8035..a5de2c3ba00baa 100644
--- a/tensorflow/core/util/autotune_maps/BUILD
+++ b/tensorflow/core/util/autotune_maps/BUILD
@@ -193,7 +193,6 @@ tf_cuda_only_cc_test(
     size = "small",
     srcs = ["autotune_serialize_test.cc"],
     features = ["-layering_check"],
-    tags = ["cuda-only"],
     deps = [
         ":autotune_serialize",
         ":conv_autotune_maps",
diff --git a/tensorflow/dtensor/cc/BUILD b/tensorflow/dtensor/cc/BUILD
index 1705ba2425577c..ccdf73f79f15b1 100644
--- a/tensorflow/dtensor/cc/BUILD
+++ b/tensorflow/dtensor/cc/BUILD
@@ -225,7 +225,6 @@ tf_kernel_library(
         "dtensor_tpu_kernels.cc",
     ],
     tags = [
-        "cuda-only",
         "tpu",
     ],  # Disable building of TPU kernels on non-TPU platforms.
     deps = [
diff --git a/tensorflow/dtensor/python/tests/BUILD b/tensorflow/dtensor/python/tests/BUILD
index 9b38fcdeb48bb0..38c84bc127ef90 100644
--- a/tensorflow/dtensor/python/tests/BUILD
+++ b/tensorflow/dtensor/python/tests/BUILD
@@ -746,9 +746,6 @@ dtensor_test(
         "tpu": 10,
         TPU_V3_DONUT_BACKEND: 32,
     },
-    tags = [
-        "cuda-only",
-    ],
     deps = [
         ":test_util",
         "//tensorflow/dtensor/python:api",
@@ -802,7 +799,6 @@ dtensor_test(
     },
     tags = [
         "no_oss_py38",  # TODO(b/267017937)
-        "cuda-only",
     ],
     deps = [
         ":test_util",
diff --git a/tensorflow/python/compiler/tensorrt/test/BUILD b/tensorflow/python/compiler/tensorrt/test/BUILD
index 388140b04fac1d..26582e8aac4f51 100644
--- a/tensorflow/python/compiler/tensorrt/test/BUILD
+++ b/tensorflow/python/compiler/tensorrt/test/BUILD
@@ -74,7 +74,6 @@ filegroup(
 
 base_tags = [
     "no_cuda_on_cpu_tap",
-    "cuda-only",
     "no_windows",
     "nomac",
     # TODO(b/303453873): Re-enable tests once TensorRT has been updated
diff --git a/tensorflow/python/debug/lib/BUILD b/tensorflow/python/debug/lib/BUILD
index f30a5a8c6668ec..0b3860dbaa9934 100644
--- a/tensorflow/python/debug/lib/BUILD
+++ b/tensorflow/python/debug/lib/BUILD
@@ -331,7 +331,6 @@ cuda_py_strict_test(
     shard_count = 4,
     tags = [
         "no_windows",  # TODO(b/142475891): Enable this test on Windows.
-        "cuda-only", #TODO(ROCm) Re-enable after issue is fixed.
     ],
     xla_enable_strict_auto_jit = False,  # Node names are different with autojit
     deps = [
@@ -363,7 +362,6 @@ cuda_py_strict_test(
     python_version = "PY3",
     tags = [
         "no_windows_gpu",
-        "cuda-only", #TODO(ROCm) Re-enable after issue is fixed.
     ],
     deps = [
         ":debug_events_reader",
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 7b5f6a94506487..cf156d75a4380d 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -712,7 +712,6 @@ distribute_py_strict_test(
         "multi_and_single_gpu",
         "no_cuda_asan",  # b/213388775
         "no_oss",  # b/241013307
-	    "cuda-only",
         "notap",  # Flaky; TODO(b/289970206)
     ],
     tpu_tags = [
@@ -2502,7 +2501,6 @@ distribute_py_strict_test(
         "multi_and_single_gpu",
         "nomac",  # TODO(b/201788023): Attempt MultiProcessCluster to fix this.
         "notpu",
-        "cuda-only", #times out
     ],
     deps = [
         ":distribute_lib",
diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD
index 7e905c3d51b0c1..1c28e3b8cc706c 100644
--- a/tensorflow/python/feature_column/BUILD
+++ b/tensorflow/python/feature_column/BUILD
@@ -164,7 +164,6 @@ tf_py_strict_test(
         "no_cuda_on_cpu_tap",
         "no_oss",  # TODO(b/206860622): Broken with numpy 1.20+
         "no_pip",
-        "cuda-only",
         "no_windows",
     ],
     deps = [
@@ -209,7 +208,6 @@ tf_py_strict_test(
         "no_cuda_on_cpu_tap",
         "no_oss",  # TODO(b/206860622): Broken with numpy 1.20+
         "no_pip",
-        "cuda-only",
         "no_windows",
     ],
     deps = [":feature_column_v2_test_main_lib"],
diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD
index ba216815b3a623..e3ebab0c442106 100644
--- a/tensorflow/python/framework/BUILD
+++ b/tensorflow/python/framework/BUILD
@@ -1502,7 +1502,6 @@ cuda_py_strict_test(
     srcs = ["config_test.py"],
     tags = [
         "no_pip",  # test_ops are not available in pip
-        "cuda-only",
     ],
     deps = [
         ":config",
diff --git a/tensorflow/python/kernel_tests/image_ops/BUILD b/tensorflow/python/kernel_tests/image_ops/BUILD
index b7c79c74dae5f9..640edf8f97a629 100644
--- a/tensorflow/python/kernel_tests/image_ops/BUILD
+++ b/tensorflow/python/kernel_tests/image_ops/BUILD
@@ -141,7 +141,6 @@ cuda_py_strict_test(
     shard_count = 15,
     tags = [
         "no_oss",  # b/241024908
-        "cuda-only",
         "nomac",  # b/181799478
         "notap",  # b/31080670
     ],
diff --git a/tensorflow/python/kernel_tests/nn_ops/BUILD b/tensorflow/python/kernel_tests/nn_ops/BUILD
index 507d00c15d196c..df5b780a6e7367 100644
--- a/tensorflow/python/kernel_tests/nn_ops/BUILD
+++ b/tensorflow/python/kernel_tests/nn_ops/BUILD
@@ -296,7 +296,6 @@ cuda_py_strict_test(
     shard_count = 4,
     tags = [
         "no_mac_arm64",
-		"cuda-only",
         "optonly",  # times out
     ],
     deps = [
@@ -438,7 +437,6 @@ cuda_py_strict_test(
     size = "medium",  # http://b/30603882
     timeout = "long",
     srcs = ["depthwise_conv_op_d9m_test.py"],
-    tags = ["cuda-only"],
     shard_count = 8,
     deps = [
         ":depthwise_conv_op_base",
diff --git a/tensorflow/python/kernel_tests/sparse_ops/BUILD b/tensorflow/python/kernel_tests/sparse_ops/BUILD
index 37b8518b3c1ebe..20fe7ab1adfb98 100644
--- a/tensorflow/python/kernel_tests/sparse_ops/BUILD
+++ b/tensorflow/python/kernel_tests/sparse_ops/BUILD
@@ -108,7 +108,6 @@ cuda_py_strict_test(
     shard_count = 5,
     tags = [
         "optonly",  # b/77589990
-        "cuda-only"
     ],
     deps = [
         "//tensorflow/python/eager:def_function",
diff --git a/tensorflow/python/ops/BUILD b/tensorflow/python/ops/BUILD
index 069280d4425fb7..c624c412f3d12d 100644
--- a/tensorflow/python/ops/BUILD
+++ b/tensorflow/python/ops/BUILD
@@ -1006,9 +1006,6 @@ tf_py_strict_test(
     name = "collective_ops_test",
     size = "small",
     srcs = ["collective_ops_test.py"],
-    tags = [
-        "cuda-only",
-    ],
     deps = [
         ":array_ops",
         ":collective_ops",
@@ -1037,7 +1034,6 @@ tf_py_strict_test(
     srcs = ["collective_ops_xla_test.py"],
     tags = [
         "no_pip",
-        "cuda-only",
         "no_windows",
         "nomac",
     ],
@@ -3594,9 +3590,6 @@ cuda_py_strict_test(
     srcs = ["nn_fused_batchnorm_d9m_test.py"],
     main = "nn_fused_batchnorm_d9m_test.py",
     shard_count = 4,
-    tags = [
-        "cuda-only",
-    ],
     deps = [
         ":nn_grad",
         ":nn_impl",
@@ -3618,9 +3611,6 @@ cuda_py_strict_test(
     srcs = ["nn_fused_batchnorm_test.py"],
     main = "nn_fused_batchnorm_test.py",
     shard_count = 24,
-    tags = [
-        "cuda-only",
-    ],
     deps = [
         ":array_ops",
         ":gradient_checker",
@@ -3765,7 +3755,6 @@ cuda_py_strict_test(
     main = "special_math_ops_test.py",
     shard_count = 10,
     tags = [
-        "cuda-only",
         "no_windows_gpu",
     ],
     deps = [
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD
index c1272a552ed4e0..67bc83f6dd5445 100644
--- a/tensorflow/python/ops/parallel_for/BUILD
+++ b/tensorflow/python/ops/parallel_for/BUILD
@@ -127,7 +127,6 @@ cuda_py_strict_test(
     shard_count = 16,
     tags = [
         "no_oss",
-        "cuda-only",
     ],
     deps = [
         ":control_flow_ops",
diff --git a/tensorflow/python/training/BUILD b/tensorflow/python/training/BUILD
index 4bc1e84a600dee..3d6038075b86cf 100644
--- a/tensorflow/python/training/BUILD
+++ b/tensorflow/python/training/BUILD
@@ -1185,7 +1185,6 @@ cuda_py_strict_test(
     name = "basic_loops_test",
     size = "medium",
     srcs = ["basic_loops_test.py"],
-    tags = ["cuda-only"], #TODO(ROCm) Re-enable after issue is fixed.
     deps = [
         ":basic_loops",
         ":supervisor",
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
index 36c8dbd6c29948..e70180f6398daf 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
@@ -64,13 +64,27 @@ if [ ! -d /tf ];then
         mkdir /tf
 fi
 
-# vvv TODO (rocm) weekly-sync-20251021 excluded tests
+# vvv TODO (rocm) weekly-sync-20251224 excluded tests
 EXCLUDED_TESTS=(
     # //tensorflow/core/kernels:matmul_op_test_gpu
     Test/FusedMatMulWithBiasOpTest/1.MatMul*
 
     # //tensorflow/core/common_runtime:process_function_library_runtime_test_gpu
     ProcessFunctionLibraryRuntimeTest.MultiDevice_ResourceOutput_GPU
+
+    # //tensorflow/compiler/tests:randomized_tests_seeded
+    # //tensorflow/compiler/tests:randomized_tests_mlir_seeded
+    OpTest.ScatterNd
+
+    # //tensorflow/core/util/autotune_maps:autotune_serialize_test_gpu
+    AutotuneSerializeTest.Consistency
+    AutotuneSerializeTest.VersionControl
+
+    # //tensorflow/python/kernel_tests/nn_ops:depthwise_conv_op_d9m_test
+    DepthwiseConv2DDeterministicTest.testBackwardDeterminismGPU
+
+    # //tensorflow/python/kernel_tests/sparse_ops:sparse_ops_test
+    SparseFillEmptyRowsTest.testSparseFillEmptyRowsGradInvalidReverseIndexMap
 )
 
 # Run bazel test command. Double test timeouts to avoid flakes.
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_xla.sh b/tensorflow/tools/ci_build/linux/rocm/run_xla.sh
index ea43eddc474626..9bef5a24a46366 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_xla.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_xla.sh
@@ -138,6 +138,73 @@ EXCLUDED_TESTS=(
 
     # @local_xla//xla/tests:multioutput_fusion_test_amdgpu_any
     MultiOutputFusionTest.MultiOutputReduceFusionMajorWithExtraOutput
+
+    # vvv TODO (rocm) weekly-sync-20251224 excluded tests
+    # @local_xla//xla/service/gpu:gpu_compiler_test_amdgpu_any
+    PersistedAutotuningTest.SingleOperationGetsAutotuned
+
+    # @local_xla//xla/backends/gpu/codegen/triton:support_test
+    BitcastOrReshapeTestSuite/BitcastOrReshapeTest.IsTritonSupportedBitcastOrReshape*
+    BitcastOrReshapeTestSuite/BitcastOrReshapeTest.IsTritonSupported0DBitcastOrReshape*
+    BitcastConvertSuite/BitcastConvertTest.BitcastConvertDisguisedAsBitcast*
+    UnaryElementwiseTestSuite/UnaryElementwiseTest.IsTritonSupportedUnaryElementwise*
+    ConvertTestSuite/ConvertTest.Convert*
+    BinaryElementwiseTestSuite/BinaryElementwiseTest.IsTritonSupportedBinaryElementwise*
+    TernaryElementwiseTestSuite/TernaryElementwiseTest.IsTritonSupportedTernaryElementwise*
+    ReductionComputationTestSuite/ReductionComputationTest.DifferentBinaryOps*
+    TransposeTestSuite/TransposeTest.LoadTranspose3D*
+    SliceTestSuite/SliceTest.ContinuousSlice*
+    BroadcastTestSuite/BroadcastTest.Broadcast*
+    ParameterTestSuite/ParameterTest.Parameter*
+    ConstantTestSuite/ConstantTest.ConstantEffectiveScalar*
+    DotTestSuite/DotTypesTest.Dot*
+
+    # @local_xla//xla/backends/gpu/codegen/triton:support_legacy_test
+    DotTestTestSuite/DotTest.IsTritonSupportedExecutesCorrectlyForDot/f8e5m2_dot
+
+    # @local_xla//xla/backends/gpu/profiler:kernel_name_tracer_test
+    KernelNameTracerTest.Create
+    KernelNameTracerTest.CaptureKernelNames
+    KernelNameTracerTest.CaptureKernelNamesFromCommandBufferThunk
+
+    # @local_xla//xla/service/gpu/autotuning:gemm_fusion_autotuner_test
+    GemmFusionAutotunerTest.Int8FusedGemm256
+    GemmFusionAutotunerLevelSweep/GemmFusionAutotunerLevelTest.Deviceless/0
+
+    # @local_xla//xla/service/gpu/tests:swap_conv_operands_test
+    SwapConvOperandsTest.LargePadding
+    SwapConvOperandsTest.SmallPadding
+    SwapConvOperandsTest.DoesNotLower
+
+    # @local_xla//xla/service/gpu/tests:gpu_triton_custom_call_test
+    GpuIrEmitterUnnestedTest.CanNotEmitTritonCustomCallOnPreAmpereGpu
+
+    # @local_xla//xla/tests:convolution_autotune_disabled_test
+    Transposed2DConvHloTest/Transposed2DConvHloTest.Simple*
+    ConvolveWithAndWithoutCanonicalization_Instantiation/ConvolveWithAndWithoutCanonicalization.Convolve2D_NoSpatialDims*
+    ConvolutionHloTest.ConvolveBackwardInput
+    ConvolutionHloTest.TestConv0D
+    ConvolutionHloTest.TestConv2DF16
+    ConvolutionHloTest.SwappedOperandConvolveWithStride
+    ConvolutionHloTest.TestFusedConv3D
+    ConvolutionHloTest.SwappedOperandConvolve
+    ConvolutionHloTest.TestBooleanInput
+    ConvolutionHloTest.SwappedOperandConvolve2
+    ConvolutionTest.Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid
+    ConvolutionTest.ConvolveF32BackwardInputGroupedConvolution
+    Convolve_1x1x4x4_1x1x2x2_Valid/2.Types
+    Convolve_1x1x4x4_1x1x2x2_Valid/1.Types
+    Convolve_1x1x4x4_1x1x2x2_Same/1.Types
+    Convolve_1x1x4x4_1x1x2x2_Same/2.Types
+    Convolve_1x1x4x4_1x1x3x3_Same/1.Types
+    Convolve_1x1x4x4_1x1x3x3_Same/2.Types
+    Convolve2D*
+
+    # @local_xla//xla/tests:convolution_1d_autotune_disabled_test
+    ConvolutionTest.Convolve1D*
+    Convolve1D_1x2x5_1x2x2*
+    Convolve1D1WindowTest_Instantiation/Convolve1D1WindowTestFloat*
+    Convolve1D1WindowTest_Instantiation/Convolve1D1WindowTestHalf*
 )
 
 bazel --bazelrc=tensorflow/tools/tf_sig_build_dockerfiles/devel.usertools/rocm.bazelrc test \
diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
index 75366a8756f8ef..5b34e62ebd634e 100644
--- a/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
+++ b/third_party/xla/xla/backends/gpu/codegen/triton/BUILD
@@ -956,8 +956,7 @@ xla_cc_test(
     shard_count = 25,
     # TODO(b/353912594): this test does not need to run on GPU, but it is broken on CPU in OSS.
     # Force it to run on GPU temporarily in order to get important OSS coverage.
-    tags = ["gpu"] +
-           ["cuda-only"], # TODO(rocm) 240729
+    tags = ["gpu"],
     deps = [
         ":fusion_emitter",
         ":support",
@@ -994,8 +993,7 @@ xla_test(
         "b200",
         "amdgpu_any",
     ],
-    tags = ["no_mac",
-            "cuda-only"], # TODO(rocm) 240729
+    tags = ["no_mac",],
     deps = [
         ":fusion_emitter",
         ":support",
diff --git a/third_party/xla/xla/backends/gpu/profiler/BUILD b/third_party/xla/xla/backends/gpu/profiler/BUILD
index bbbdeb708afda8..251fc2aac50469 100644
--- a/third_party/xla/xla/backends/gpu/profiler/BUILD
+++ b/third_party/xla/xla/backends/gpu/profiler/BUILD
@@ -68,9 +68,6 @@ xla_test(
     backends = [
         "gpu",
     ],
-    tags = [
-        "cuda-only",
-    ],
     deps = [
         ":kernel_name_tracer",
         "//xla/backends/gpu/runtime:command_buffer_cmd",
diff --git a/third_party/xla/xla/backends/gpu/runtime/BUILD b/third_party/xla/xla/backends/gpu/runtime/BUILD
index a59d0cd42afa58..3952f956af78d9 100644
--- a/third_party/xla/xla/backends/gpu/runtime/BUILD
+++ b/third_party/xla/xla/backends/gpu/runtime/BUILD
@@ -3600,7 +3600,6 @@ xla_test(
     },
     backends = ["gpu"],
     tags = [
-        "cuda-only",
         "gpu",
     ],
     deps = [
@@ -3677,7 +3676,6 @@ xla_test(
     },
     backends = ["gpu"],
     tags = [
-        "cuda-only",
         "gpu",
     ],
     deps = [
diff --git a/third_party/xla/xla/service/gpu/BUILD b/third_party/xla/xla/service/gpu/BUILD
index 5abee85eee2eff..18e97bd54a005d 100644
--- a/third_party/xla/xla/service/gpu/BUILD
+++ b/third_party/xla/xla/service/gpu/BUILD
@@ -2928,9 +2928,6 @@ xla_cc_test(
 xla_test(
     name = "float_support_test",
     srcs = ["float_support_test.cc"],
-    backend_tags = {"gpu": [
-        "cuda-only"
-    ]},
     backends = [
         "a100",
         "h100",
diff --git a/third_party/xla/xla/service/gpu/autotuning/BUILD b/third_party/xla/xla/service/gpu/autotuning/BUILD
index 9029094626dcf1..a3c8ef193cb532 100644
--- a/third_party/xla/xla/service/gpu/autotuning/BUILD
+++ b/third_party/xla/xla/service/gpu/autotuning/BUILD
@@ -242,9 +242,9 @@ xla_test(
         "a100",
         "h100",
         "b200",
+        "amdgpu_any"
     ],
     tags = [
-        "cuda-only",
         "no_mac",
     ],
     deps = [
@@ -627,7 +627,6 @@ xla_test(
         "amdgpu_any",
     ],
     tags = [
-        "cuda-only",
         "noasan",
         "nomsan",
     ],
diff --git a/third_party/xla/xla/service/gpu/tests/BUILD b/third_party/xla/xla/service/gpu/tests/BUILD
index 246ba2fe0a5e51..dc9e3e9bafe18c 100644
--- a/third_party/xla/xla/service/gpu/tests/BUILD
+++ b/third_party/xla/xla/service/gpu/tests/BUILD
@@ -267,7 +267,6 @@ xla_test(
         "swap_conv_operands_test.cc",
     ],
     backends = ["gpu"],
-    tags = ["cuda-only"],
     deps = [
         ":gpu_codegen_test",
         "//xla:error_spec",
@@ -398,7 +397,6 @@ xla_test(
         "b200",
         "amdgpu_any",
     ],
-    tags = ["cuda-only"], # TODO(rocm) 240729 Test checks only for cuda capability
     deps = [
         ":gpu_codegen_test",
         "//xla:shape_util",
diff --git a/third_party/xla/xla/service/gpu/transforms/BUILD b/third_party/xla/xla/service/gpu/transforms/BUILD
index 835bea38697e89..3b7c9f8a81361d 100644
--- a/third_party/xla/xla/service/gpu/transforms/BUILD
+++ b/third_party/xla/xla/service/gpu/transforms/BUILD
@@ -1208,7 +1208,6 @@ xla_cc_test(
     name = "dynamic_slice_fusion_rewriter_test",
     srcs = ["dynamic_slice_fusion_rewriter_test.cc"],
     tags = [
-        "cuda-only",
         "gpu",
     ],
     deps = [
diff --git a/third_party/xla/xla/tests/BUILD b/third_party/xla/xla/tests/BUILD
index a61c33c1ae1eac..113a3b4d975b73 100644
--- a/third_party/xla/xla/tests/BUILD
+++ b/third_party/xla/xla/tests/BUILD
@@ -1511,7 +1511,6 @@ xla_test(
     ],
     shard_count = 40,
     tags = [
-        "cuda-only",
         "optonly",
         "test_migrated_to_hlo_runner_pjrt",
     ],
@@ -1544,7 +1543,6 @@ xla_test(
     ],
     shard_count = 50,
     tags = [
-        "cuda-only",
         "optonly",
         "test_migrated_to_hlo_runner_pjrt",
     ],
@@ -1578,7 +1576,6 @@ xla_test(
     backends = ["gpu"],
     shard_count = 40,
     tags = [
-        "cuda-only",
         "optonly",
         "test_migrated_to_hlo_runner_pjrt",
     ],
@@ -1616,7 +1613,6 @@ xla_test(
     backends = ["gpu"],
     shard_count = 40,
     tags = [
-        "cuda-only",
         "optonly",
         "test_migrated_to_hlo_runner_pjrt",
     ],
@@ -1681,7 +1677,6 @@ xla_test(
     backends = ["gpu"],
     shard_count = 25,
     tags = [
-        "cuda-only",
         "test_migrated_to_hlo_runner_pjrt",
     ],
     deps = [
@@ -1767,7 +1762,6 @@ xla_test(
     name = "convolution_cudnn_test",
     timeout = "long",
     srcs = ["convolution_cudnn_test.cc"],
-    tags = ["cuda-only"],  # No int8
     backends = [
         "v100",
         "a100",

From c72c18e9d17a61ae40d80fb7e25d316472dcca66 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Tue, 13 Jan 2026 00:01:22 +0000
Subject: [PATCH 752/753] Fix device_tracer_test build error and move failing
 subtests to test script

---
 tensorflow/core/profiler/backends/gpu/BUILD            | 7 +++----
 tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh | 4 ++++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/profiler/backends/gpu/BUILD b/tensorflow/core/profiler/backends/gpu/BUILD
index fee9ede90965fe..feb6a553313283 100644
--- a/tensorflow/core/profiler/backends/gpu/BUILD
+++ b/tensorflow/core/profiler/backends/gpu/BUILD
@@ -18,7 +18,6 @@ tf_cuda_cc_test(
     tags = tf_cuda_tests_tags() + [
         "gpu_cupti",
         "nomac",
-        "cuda-only", # flaky on CI
     ],
     deps = [
         "//tensorflow/cc:cc_ops",
@@ -43,11 +42,11 @@ tf_cuda_cc_test(
         "//tensorflow/core/profiler/utils:xplane_utils",
         "//tensorflow/core/profiler/utils:xplane_visitor",
         "@com_google_absl//absl/strings",
-        "@local_xla//xla/backends/profiler/gpu:cuda_test",
-        "@local_xla//xla/backends/profiler/gpu:cupti_collector",
         "@local_xla//xla/backends/profiler/gpu:device_tracer",
         "@local_xla//xla/tsl/profiler/utils:tf_xplane_visitor",
-    ] + if_cuda_is_configured([
+        ] + if_cuda_is_configured([
+        "@local_xla//xla/backends/profiler/gpu:cupti_collector",
+        "@local_xla//xla/backends/profiler/gpu:cuda_test",
         "@local_config_cuda//cuda:cuda_headers",
         "@local_config_cuda//cuda:cupti_headers",
     ]),
diff --git a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
index e70180f6398daf..ca6e0b612d5a47 100755
--- a/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
+++ b/tensorflow/tools/ci_build/linux/rocm/run_gpu_single.sh
@@ -85,6 +85,10 @@ EXCLUDED_TESTS=(
 
     # //tensorflow/python/kernel_tests/sparse_ops:sparse_ops_test
     SparseFillEmptyRowsTest.testSparseFillEmptyRowsGradInvalidReverseIndexMap
+
+    # //tensorflow/core/profiler/backends/gpu:device_tracer_test
+    DeviceTracerTest.StartTwoTracers
+    DeviceTracerTest.TraceToXSpace
 )
 
 # Run bazel test command. Double test timeouts to avoid flakes.

From 3a69036d1e5f898febff5dbd0510dd3426c00774 Mon Sep 17 00:00:00 2001
From: Milica Makevic <Milica.Makevic@amd.com>
Date: Tue, 13 Jan 2026 00:10:21 +0000
Subject: [PATCH 753/753] Fix xla_gpu_compile_lib_test

---
 third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
index 4b6fbab412cb2a..dfb1a60da7df27 100644
--- a/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
+++ b/third_party/xla/xla/tools/xla_gpu_compile_lib_test.cc
@@ -72,8 +72,10 @@ TEST_F(XlaCompileLibTest, CompilesForGpuWithDevice) {
 }
 
 TEST_F(XlaCompileLibTest, CompilesForGpuWithoutDevice) {
+  auto path = tsl::testing::XlaSrcRoot();
+  path = path.erase(path.length() - 4);
   const std::string target_config_path =
-      tsl::io::JoinPath(tsl::testing::XlaSrcRoot(),
+      tsl::io::JoinPath(path, "external/local_xla/xla",
                         "backends/gpu/target_config/specs", "h100_sxm.txtpb");
   stream_executor::GpuTargetConfigProto target_config;
   TF_ASSERT_OK(tsl::ReadTextProto(tsl::Env::Default(), target_config_path,